upton 0.2.7 → 0.2.8

spec/spec_helper.rb ADDED

@@ -0,0 +1,20 @@
+ # This file was generated by the `rspec --init` command. Conventionally, all
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
+ # Require this file using `require "spec_helper"` to ensure that it is only
+ # loaded once.
+ #
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
+
+ require "webmock/rspec"
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+
+   # Run specs in random order to surface order dependencies. If you find an
+   # order dependency and want to debug it, you can fix the order by providing
+   # the seed, which is printed after each run.
+   #     --seed 1234
+   config.order = 'random'
+   WebMock.disable_net_connect!(:allow_localhost => true)
+ end
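For readers new to WebMock: `disable_net_connect!` makes any unstubbed HTTP call raise an error, so every request a spec triggers must be declared up front with `stub_request`. A minimal sketch of that mechanism, runnable outside RSpec; the URL and response body here are placeholders, not fixtures from this gem:

```ruby
# Sketch only: what requiring "webmock/rspec" wires up for the specs below.
require "webmock"
require "restclient"

include WebMock::API
WebMock.enable!
WebMock.disable_net_connect!(:allow_localhost => true)

stub_request(:get, "http://www.example.com").
  to_return(:body => "<html><h1>hi</h1></html>", :status => 200)

puts RestClient.get("http://www.example.com") # answered by the stub; no network I/O
# Any unstubbed request now raises WebMock::NetConnectNotAllowedError.
```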
spec/upton_downloader_spec.rb ADDED

@@ -0,0 +1,75 @@
+ require "spec_helper.rb"
+ require_relative "../lib/upton/downloader.rb"
+
+ describe Upton::Downloader do
+
+   def remove_default_cache_folder!
+     FileUtils.rm_rf(default_cache_folder)
+   end
+
+   def default_cache_folder
+     "#{Dir.tmpdir}/upton"
+   end
+
+   let(:cache) { Upton::Downloader.new("http://www.example.com") }
+   let(:uncache) { Upton::Downloader.new("http://www.example.com", cache: false) }
+
+   context "When caching is enabled" do
+
+     context "When the disk cache is unavailable" do
+       before(:each) do
+         remove_default_cache_folder!
+       end
+
+       it "should download from the resource once" do
+         stub = stub_request(:get, "http://www.example.com")
+         cache.get
+         stub.should have_been_requested.once
+       end
+
+       it "should use the cache for the second request" do
+         stub = stub_request(:get, "http://www.example.com")
+         cache.get
+         cache.get
+         stub.should have_been_requested.once
+       end
+
+     end
+
+     context "When the disk cache is available" do
+       it "should not make an HTTP request" do
+         stub = stub_request(:get, "http://www.example.com")
+         cache.get
+         stub.should_not have_been_requested
+       end
+     end
+
+     context "Different URLs should have different caches" do
+       let(:cache_one) { Upton::Downloader.new("http://www.example.com", cache: true) }
+       let(:cache_two) { Upton::Downloader.new("http://www.example.com?a=1&b=2", cache: true) }
+
+       it "should create two cached files inside the cache directory" do
+         remove_default_cache_folder!
+         stub_one = stub_request(:get, "http://www.example.com")
+         stub_two = stub_request(:get, "http://www.example.com?a=1&b=2")
+
+         cache_one.get
+         cache_two.get
+         Dir.entries(default_cache_folder).count.should eq(4)
+       end
+     end
+   end
+
+   context "When caching is disabled" do
+     context "When #get is called twice" do
+       it "should make two requests" do
+         stub = stub_request(:get, "http://www.example.com")
+         uncache.get
+         uncache.get
+         stub.should have_been_requested.twice
+       end
+     end
+   end
+ end
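These specs pin down the cache contract: the first `get` hits the network, later `get`s for the same URL are served from disk, and distinct URLs (including distinct query strings) get distinct cache files. The `eq(4)` assertion counts the two cache files plus the `.` and `..` entries that `Dir.entries` always returns. A rough sketch of such a URL-keyed disk cache; the MD5 key and file layout are illustrative assumptions, not Upton::Downloader's actual internals:

```ruby
# Hypothetical URL-keyed disk cache, showing the behavior the specs above test.
require "digest/md5"
require "fileutils"
require "tmpdir"
require "restclient"

def cached_get(url, cache_dir = "#{Dir.tmpdir}/upton")
  FileUtils.mkdir_p(cache_dir)
  path = File.join(cache_dir, Digest::MD5.hexdigest(url)) # one file per distinct URL
  if File.exist?(path)
    File.read(path)             # cache hit: no HTTP request is made
  else
    body = RestClient.get(url)  # cache miss: fetch once, then stash to disk
    File.write(path, body)
    body
  end
end
```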
spec/upton_spec.rb CHANGED

@@ -5,53 +5,29 @@ require 'thin'
  require 'nokogiri'
  require 'restclient'
  require 'fileutils'
+ require "spec_helper.rb"
+
  require './lib/upton'

+
  describe Upton do
    before :all do
-     # start the server
-     class Server
-       def call(env)
-         @root = File.expand_path(File.dirname(__FILE__))
-         path = Rack::Utils.unescape(env['PATH_INFO'])
-         path += 'index.html' if path == '/'
-         file = File.join(@root, "data", path)
-
-         params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
-
-         if File.exists?(file)
-           [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
-         else
-           [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
-         end
-       end
-     end
-
-     def start_test_server
-       @server_thread = Thread.new do
-         Rack::Handler::Thin.run ::Server.new, :Port => 9876
-       end
-       sleep(1) # wait a sec for the server to be booted
-     end
-
-     start_test_server()
-
-     @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
+     @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
                    "",
                    "A Prosecutor, a Wrongful Conviction and a Question of Justice",
                    "Six Facts Lost in the IRS Scandal"]
-     @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
-       "How the IRS’s Nonprofit Division Got So Dysfunctional",
-       "Sound, Fury and the IRS Mess",
-       "The Most Important #Muckreads on Rape in the Military",
-       "Congressmen to Hagel: Where Are the Missing War Records?",
-       "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
-       "A Prosecutor, a Wrongful Conviction and a Question of Justice",
-       "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
+     @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
+       "How the IRS’s Nonprofit Division Got So Dysfunctional",
+       "Sound, Fury and the IRS Mess",
+       "The Most Important #Muckreads on Rape in the Military",
+       "Congressmen to Hagel: Where Are the Missing War Records?",
+       "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
+       "A Prosecutor, a Wrongful Conviction and a Question of Justice",
+       "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
        "The Story Behind Our Hospital Interactive",
        "irs-test-charts-for-embedding"]]
-     @east_timor_prime_ministers = [[
-       ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
+     @east_timor_prime_ministers = [[
+       ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
         "1", "2", "3", "4",],
        [],
        ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
@@ -59,12 +35,27 @@ describe Upton do
        ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
        ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
      ]]
+     @searchResults = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
+       "A Prosecutor, a Wrongful Conviction and a Question of Justice",
+       "Six Facts Lost in the IRS Scandal"]
    end

    it "should scrape in the basic case" do
-     propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
+     stub_request(:get, "www.example.com/propublica.html").
+       to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+     stub_request(:get, "www.example.com/discussion.html").
+       to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+     stub_request(:get, "www.example.com/prosecutor.html").
+       to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+     stub_request(:get, "www.example.com/webinar.html").
+       to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+     stub_request(:get, "www.example.com/sixfacts.html").
+       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+     propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
      propubscraper.debug = true
      propubscraper.verbose = true
+     propubscraper.sleep_time_between_requests = 0

      heds = propubscraper.scrape do |article_str|
        doc = Nokogiri::HTML(article_str)
@@ -74,18 +65,31 @@ describe Upton do
      heds.should eql @headlines
    end

-   it 'should properly handle relative urls' do
+   it 'should properly handle relative urls' do
      # uses a modified page from the previous test in which the target
      # href, http://127.0.0.1:9876/prosecutors.html, has been changed
      # to a relative url
      #
-     # Note: this test is a bit quirky, because it passes on the fact that
+     # Note: this test is a bit quirky, because it passes on the fact that
      # the resolve_url creates a url identical to one that is already stashed ("prosecutors.html").
      # So it works, but because of a coupling to how Upton handles caching in the file system

-     propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
+     stub_request(:get, "www.example.com/propublica-relative.html").
+       to_return(:body => File.new('./spec/data/propublica-relative.html'), :status => 200)
+     stub_request(:get, "www.example.com/prosecutor.html").
+       to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+     stub_request(:get, "www.example.com/sixfacts.html").
+       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+     stub_request(:get, "www.example.com/webinar.html").
+       to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+     stub_request(:get, "www.example.com/discussion.html").
+       to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+
+
+     propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
      propubscraper.debug = true
      propubscraper.verbose = true
+     propubscraper.sleep_time_between_requests = 0

      heds = propubscraper.scrape do |article_str|
        doc = Nokogiri::HTML(article_str)
@@ -96,23 +100,82 @@ describe Upton do
    end

    it "should scrape a list properly with the list helper" do
-     propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
+     stub_request(:get, "www.example.com/propublica.html").
+       to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+
+     propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
      propubscraper.debug = true
      propubscraper.verbose = true
-     list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
+     propubscraper.sleep_time_between_requests = 0
+
+     list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a"))
      FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
      list.should eql @most_commented_heds
    end

    it "should scrape a table properly with the table helper" do
-     propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
+     stub_request(:get, "www.example.com/easttimor.html").
+       to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
+
+     propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
      propubscraper.debug = true
      propubscraper.verbose = true
+     propubscraper.sleep_time_between_requests = 0
+
      table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
      FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
      table.should eql @east_timor_prime_ministers
    end

-   it "should test saving files with the right encoding"
-   it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
+   it "should test saving files with the right encoding" do
+     pending "finding a site that gives funny encodings"
+   end
+
+   it "should scrape paginated pages" do
+     stub_request(:get, "www.example.com/propublica_search.html").
+       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
+     stub_request(:get, "www.example.com/propublica_search.html?p=2").
+       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
+     stub_request(:get, "www.example.com/propublica_search.html?p=3").
+       to_return(:body => '', :status => 200)
+     stub_request(:get, "www.example.com/webinar.html").
+       to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+     stub_request(:get, "www.example.com/prosecutor.html").
+       to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+     stub_request(:get, "www.example.com/sixfacts.html").
+       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+     propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
+     propubscraper.debug = true
+     propubscraper.verbose = true
+     propubscraper.paginated = true
+     propubscraper.pagination_param = 'p'
+     propubscraper.pagination_max_pages = 3
+     propubscraper.sleep_time_between_requests = 0
+
+     results = propubscraper.scrape do |article_str|
+       doc = Nokogiri::HTML(article_str)
+       hed = doc.css('h1.article-title').text
+     end
+     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+     results.should eql @searchResults
+   end
+
+   before do
+     Upton::Scraper.stub(:sleep)
+   end
+
+   it "should sleep after uncached requests" do
+     stub_request(:get, "www.example.com")
+     u = Upton::Scraper.new("http://www.example.com", '.whatever')
+     u.should_receive(:sleep)
+     stub = stub_request(:get, "http://www.example.com")
+     u.scrape
+   end
+
+   it "should be silent if verbose is false" do
+     pending
+   end
  end
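The new pagination options exercised above (`paginated`, `pagination_param`, `pagination_max_pages`) imply the URL sequence the stubs declare: page 1 is the bare index URL, and later pages append `?p=N`. A hedged sketch of that sequence; `paginated_urls` is an illustrative helper, not an Upton method:

```ruby
require "uri"

# Illustrative only: builds the page URLs the paginated spec stubs out.
def paginated_urls(index_url, param: "p", max_pages: 3)
  (1..max_pages).map do |page|
    next index_url if page == 1 # the first page is the bare index URL
    uri = URI(index_url)
    uri.query = URI.encode_www_form([[param, page.to_s]])
    uri.to_s
  end
end

paginated_urls("http://www.example.com/propublica_search.html")
# => ["http://www.example.com/propublica_search.html",
#     "http://www.example.com/propublica_search.html?p=2",
#     "http://www.example.com/propublica_search.html?p=3"]
```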
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: upton
  version: !ruby/object:Gem::Version
-   version: 0.2.7
+   version: 0.2.8
  platform: ruby
  authors:
  - Jeremy B. Merrill
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-08-11 00:00:00.000000000 Z
+ date: 2013-08-15 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rack
@@ -38,6 +38,20 @@ dependencies:
      - - '>='
        - !ruby/object:Gem::Version
          version: '0'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
  - !ruby/object:Gem::Dependency
    name: thin
    requirement: !ruby/object:Gem::Requirement
@@ -130,7 +144,9 @@ extensions: []
  extra_rdoc_files: []
  files:
  - lib/upton.rb
- - lib/utils.rb
+ - lib/upton/utils.rb
+ - lib/upton/downloader.rb
+ - spec/data/propublica_search_page_2.html
  - spec/data/webinar.html
  - spec/data/propublica-relative.html
  - spec/data/propublica.html
@@ -138,7 +154,10 @@ files:
  - spec/data/sixfacts.html
  - spec/data/discussion.html
  - spec/data/easttimor.html
+ - spec/data/propublica_search.html
  - spec/upton_spec.rb
+ - spec/spec_helper.rb
+ - spec/upton_downloader_spec.rb
  homepage: http://github.org/propublica/upton
  licenses:
  - MIT
@@ -164,6 +183,7 @@ signing_key:
  specification_version: 4
  summary: A simple web-scraping framework
  test_files:
+ - spec/data/propublica_search_page_2.html
  - spec/data/webinar.html
  - spec/data/propublica-relative.html
  - spec/data/propublica.html
@@ -171,5 +191,8 @@ test_files:
  - spec/data/sixfacts.html
  - spec/data/discussion.html
  - spec/data/easttimor.html
+ - spec/data/propublica_search.html
  - spec/upton_spec.rb
+ - spec/spec_helper.rb
+ - spec/upton_downloader_spec.rb
  has_rdoc: true
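The YAML above is generated output; in the gemspec DSL, the new webmock entry corresponds to a single `add_development_dependency` call. A sketch of a typical gemspec (not the gem's actual file), using values taken from the metadata:

```ruby
Gem::Specification.new do |s|
  s.name     = "upton"
  s.version  = "0.2.8"
  s.authors  = ["Jeremy B. Merrill"]
  s.summary  = "A simple web-scraping framework"
  s.licenses = ["MIT"]

  # Serialized in the metadata as type: :development with requirement '>= 0'
  s.add_development_dependency "webmock"
end
```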
lib/utils.rb DELETED

@@ -1,74 +0,0 @@
- # encoding: UTF-8
-
- ##
- # This module contains a collection of helpers for Upton
- ##
- module Upton
-
-   ##
-   # This module contains a collection of helpers for Upton
-   #
-   # Each method returns a Proc that (with an &) can be used as the final
-   # argument to Upton's `scrape` and `scrape_to_csv`
-   ##
-   module Utils
-
-     ##
-     # Scrapes an HTML <table> element into an Array of Arrays. The header, if
-     # present, is returned as the first row.
-     ##
-     def self.table(table_selector, selector_method=:xpath)
-       return Proc.new do |instance_html|
-         html = ::Nokogiri::HTML(instance_html)
-         output = []
-         headers = html.send(selector_method, table_selector).css("th").map &:text
-         output << headers
-
-         table = html.send(selector_method, table_selector).css("tr").each{|tr| output << tr.css("td").map(&:text) }
-         output
-       end
-     end
-
-     ##
-     # Scrapes any set of HTML elements into an Array.
-     ##
-     def self.list(list_selector, selector_method=:xpath)
-       return Proc.new do |instance_html|
-         html = ::Nokogiri::HTML(instance_html)
-         html.send(selector_method, list_selector).map{|list_element| list_element.text }
-       end
-     end
-
-     ##
-     # Takes :_href and resolves it to an absolute URL according to
-     # the supplied :_page_url. They can be either Strings or URI
-     # instances.
-     #
-     # raises ArgumentError if either href or page_url is nil
-     # raises ArgumentError if page_url is not absolute
-     #
-     # returns: a String with absolute URL
-     def self.resolve_url(_href, _page_url)
-
-       page_url = URI(_page_url).dup
-       raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
-
-       href = URI(_href).dup
-
-       # return :href if :href is already absolute
-       return href.to_s if href.absolute?
-
-       # TODO: There may be edge cases worth considering
-       # but this should handle the following non-absolute href possibilities:
-       #   //anothersite.com (keeps scheme, too!)
-       #   /root/dir
-       #   relative/dir
-       #   ?query=2
-       #   #bang
-
-       URI.join(page_url, href).to_s
-     end
-
-   end
- end
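A postscript on the deleted `resolve_url` (the relative-URL spec above still leans on its behavior): the heavy lifting is Ruby's `URI.join`, which covers the href shapes listed in the TODO comment. A few worked cases, checkable in irb:

```ruby
require "uri"

base = "http://www.example.com/root/dir/page.html"

URI.join(base, "http://other.com/x").to_s # => "http://other.com/x" (already absolute)
URI.join(base, "//other.com/x").to_s      # => "http://other.com/x" (protocol-relative keeps the scheme)
URI.join(base, "/root/other").to_s        # => "http://www.example.com/root/other"
URI.join(base, "relative/dir").to_s       # => "http://www.example.com/root/dir/relative/dir"
```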