upton 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ # This file was generated by the `rspec --init` command. Conventionally, all
2
+ # specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
3
+ # Require this file using `require "spec_helper"` to ensure that it is only
4
+ # loaded once.
5
+ #
6
+ # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
+
8
+ require "webmock/rspec"
9
+ RSpec.configure do |config|
10
+ config.treat_symbols_as_metadata_keys_with_true_values = true
11
+ config.run_all_when_everything_filtered = true
12
+ config.filter_run :focus
13
+
14
+ # Run specs in random order to surface order dependencies. If you find an
15
+ # order dependency and want to debug it, you can fix the order by providing
16
+ # the seed, which is printed after each run.
17
+ # --seed 1234
18
+ config.order = 'random'
19
+ WebMock.disable_net_connect!(:allow_localhost => true)
20
+ end
@@ -0,0 +1,75 @@
1
+ require "spec_helper.rb"
2
+ require_relative "../lib/upton/downloader.rb"
3
+
4
+ describe Upton::Downloader do
5
+
6
+ def remove_default_cache_folder!
7
+ FileUtils.rm_rf(default_cache_folder)
8
+ end
9
+
10
+ def default_cache_folder
11
+ "#{Dir.tmpdir}/upton"
12
+ end
13
+
14
+ let(:cache) { Upton::Downloader.new("http://www.example.com") }
15
+ let(:uncache) { Upton::Downloader.new("http://www.example.com", cache: false ) }
16
+
17
+ context "When caching enabled" do
18
+
19
+ context "When disk cache is unavailable" do
20
+ before(:each) do
21
+ remove_default_cache_folder!
22
+ end
23
+
24
+ it "should download from the resource once" do
25
+ stub = stub_request(:get, "http://www.example.com")
26
+ cache.get
27
+ stub.should have_been_requested.once
28
+ end
29
+
30
+ it "should use the cache from the second request" do
31
+ stub = stub_request(:get, "http://www.example.com")
32
+ cache.get
33
+ cache.get
34
+ stub.should have_been_requested.once
35
+ end
36
+
37
+ end
38
+
39
+ context "cache available" do
40
+ it "should not make a http request" do
41
+ stub = stub_request(:get, "http://www.example.com")
42
+ cache.get
43
+ stub.should_not have_been_requested
44
+ end
45
+ end
46
+
47
+
48
+ context "Different urls should have different caches" do
49
+ let(:cache_one) { Upton::Downloader.new("http://www.example.com", cache: true) }
50
+ let(:cache_two) { Upton::Downloader.new("http://www.example.com?a=1&b=2", cache: true) }
51
+
52
+ it "should create two cached files inside the cache directory" do
53
+ remove_default_cache_folder!
54
+ stub_one = stub_request(:get, "http://www.example.com")
55
+ stub_two = stub_request(:get, "http://www.example.com?a=1&b=2")
56
+
57
+ cache_one.get
58
+ cache_two.get
59
+ Dir.entries(default_cache_folder).count.should eq(4)
60
+ end
61
+
62
+ end
63
+ end
64
+
65
+ context "When caching disabled" do
66
+ context "When #download is called twice" do
67
+ it "should make two requests" do
68
+ stub = stub_request(:get, "http://www.example.com")
69
+ uncache.get
70
+ uncache.get
71
+ stub.should have_been_requested.twice
72
+ end
73
+ end
74
+ end
75
+ end
@@ -5,53 +5,29 @@ require 'thin'
5
5
  require 'nokogiri'
6
6
  require 'restclient'
7
7
  require 'fileutils'
8
+ require "spec_helper.rb"
9
+
8
10
  require './lib/upton'
9
11
 
12
+
10
13
  describe Upton do
11
14
  before :all do
12
- #start the server
13
- class Server
14
- def call(env)
15
- @root = File.expand_path(File.dirname(__FILE__))
16
- path = Rack::Utils.unescape(env['PATH_INFO'])
17
- path += 'index.html' if path == '/'
18
- file = File.join(@root, "data", path)
19
-
20
- params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
21
-
22
- if File.exists?(file)
23
- [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
24
- else
25
- [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
26
- end
27
- end
28
- end
29
-
30
- def start_test_server
31
- @server_thread = Thread.new do
32
- Rack::Handler::Thin.run ::Server.new, :Port => 9876
33
- end
34
- sleep(1) # wait a sec for the server to be booted
35
- end
36
-
37
- start_test_server()
38
-
39
- @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
15
+ @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
40
16
  "",
41
17
  "A Prosecutor, a Wrongful Conviction and a Question of Justice",
42
18
  "Six Facts Lost in the IRS Scandal"]
43
- @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
44
- "How the IRS’s Nonprofit Division Got So Dysfunctional",
45
- "Sound, Fury and the IRS Mess",
46
- "The Most Important #Muckreads on Rape in the Military",
47
- "Congressmen to Hagel: Where Are the Missing War Records?",
48
- "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
49
- "A Prosecutor, a Wrongful Conviction and a Question of Justice",
50
- "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
19
+ @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
20
+ "How the IRS’s Nonprofit Division Got So Dysfunctional",
21
+ "Sound, Fury and the IRS Mess",
22
+ "The Most Important #Muckreads on Rape in the Military",
23
+ "Congressmen to Hagel: Where Are the Missing War Records?",
24
+ "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
25
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
26
+ "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
51
27
  "The Story Behind Our Hospital Interactive",
52
28
  "irs-test-charts-for-embedding"]]
53
- @east_timor_prime_ministers = [[
54
- ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
29
+ @east_timor_prime_ministers = [[
30
+ ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
55
31
  "1", "2", "3", "4",],
56
32
  [],
57
33
  ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
@@ -59,12 +35,27 @@ describe Upton do
59
35
  ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
60
36
  ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
61
37
  ]]
38
+ @searchResults = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
39
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
40
+ "Six Facts Lost in the IRS Scandal"]
62
41
  end
63
42
 
64
43
  it "should scrape in the basic case" do
65
- propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
44
+ stub_request(:get, "www.example.com/propublica.html").
45
+ to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
46
+ stub_request(:get, "www.example.com/discussion.html").
47
+ to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
48
+ stub_request(:get, "www.example.com/prosecutor.html").
49
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
50
+ stub_request(:get, "www.example.com/webinar.html").
51
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
52
+ stub_request(:get, "www.example.com/sixfacts.html").
53
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
54
+
55
+ propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
66
56
  propubscraper.debug = true
67
57
  propubscraper.verbose = true
58
+ propubscraper.sleep_time_between_requests = 0
68
59
 
69
60
  heds = propubscraper.scrape do |article_str|
70
61
  doc = Nokogiri::HTML(article_str)
@@ -74,18 +65,31 @@ describe Upton do
74
65
  heds.should eql @headlines
75
66
  end
76
67
 
77
- it 'should properly handle relative urls' do
68
+ it 'should properly handle relative urls' do
78
69
  # uses a modified page from the previous test in which the target
79
70
  # href, http://127.0.0.1:9876/prosecutors.html, has been changed
80
71
  # to a relative url
81
72
  #
82
- # Note: this test is a bit quirky, because it passes on the fact that
73
+ # Note: this test is a bit quirky: it passes only because
83
74
  # resolve_url produces a url identical to one that is already stashed ("prosecutors.html").
84
75
  # So it works, but only through a coupling to how Upton handles caching in the file system.
85
76
 
86
- propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
77
+ stub_request(:get, "www.example.com/propublica-relative.html").
78
+ to_return(:body => File.new('./spec/data/propublica-relative.html'), :status => 200)
79
+ stub_request(:get, "www.example.com/prosecutor.html").
80
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
81
+ stub_request(:get, "www.example.com/sixfacts.html").
82
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
83
+ stub_request(:get, "www.example.com/webinar.html").
84
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
85
+ stub_request(:get, "www.example.com/discussion.html").
86
+ to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
87
+
88
+
89
+ propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
87
90
  propubscraper.debug = true
88
91
  propubscraper.verbose = true
92
+ propubscraper.sleep_time_between_requests = 0
89
93
 
90
94
  heds = propubscraper.scrape do |article_str|
91
95
  doc = Nokogiri::HTML(article_str)
@@ -96,23 +100,82 @@ describe Upton do
96
100
  end
97
101
 
98
102
  it "should scrape a list properly with the list helper" do
99
- propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
103
+ stub_request(:get, "www.example.com/propublica.html").
104
+ to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
105
+
106
+ propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
100
107
  propubscraper.debug = true
101
108
  propubscraper.verbose = true
102
- list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
109
+ propubscraper.sleep_time_between_requests = 0
110
+
111
+ list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a"))
103
112
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
104
113
  list.should eql @most_commented_heds
105
114
  end
106
115
 
107
116
  it "should scrape a table properly with the table helper" do
108
- propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
117
+ stub_request(:get, "www.example.com/easttimor.html").
118
+ to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
119
+
120
+ propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
109
121
  propubscraper.debug = true
110
122
  propubscraper.verbose = true
123
+ propubscraper.sleep_time_between_requests = 0
124
+
111
125
  table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
112
126
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
113
127
  table.should eql @east_timor_prime_ministers
114
128
  end
115
129
 
116
- it "should test saving files with the right encoding"
117
- it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
130
+ it "should test saving files with the right encoding" do
131
+ pending "finding a site that gives funny encodings"
132
+ end
133
+
134
+ it "should scrape paginated pages" do
135
+ stub_request(:get, "www.example.com/propublica_search.html").
136
+ to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
137
+ stub_request(:get, "www.example.com/propublica_search.html?p=2").
138
+ to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
139
+ stub_request(:get, "www.example.com/propublica_search.html?p=3").
140
+ to_return(:body => '', :status => 200)
141
+ stub_request(:get, "www.example.com/webinar.html").
142
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
143
+ stub_request(:get, "www.example.com/prosecutor.html").
144
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
145
+ stub_request(:get, "www.example.com/sixfacts.html").
146
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
147
+
148
+
149
+ propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
150
+ propubscraper.debug = true
151
+ propubscraper.verbose = true
152
+ propubscraper.paginated = true
153
+ propubscraper.pagination_param = 'p'
154
+ propubscraper.pagination_max_pages = 3
155
+ propubscraper.sleep_time_between_requests = 0
156
+
157
+ results = propubscraper.scrape do |article_str|
158
+ doc = Nokogiri::HTML(article_str)
159
+ hed = doc.css('h1.article-title').text
160
+ end
161
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
162
+ results.should eql @searchResults
163
+ end
164
+
165
+
166
+ before do
167
+ Upton::Scraper.stub(:sleep)
168
+ end
169
+
170
+ it "should sleep after uncached requests" do
171
+ stub_request(:get, "www.example.com")
172
+ u = Upton::Scraper.new("http://www.example.com", '.whatever')
173
+ u.should_receive(:sleep)
174
+ stub = stub_request(:get, "http://www.example.com")
175
+ u.scrape
176
+ end
177
+
178
+ it "should be silent if verbose if false" do
179
+ pending
180
+ end
118
181
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.7
4
+ version: 0.2.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-08-11 00:00:00.000000000 Z
11
+ date: 2013-08-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - '>='
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: webmock
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: thin
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -130,7 +144,9 @@ extensions: []
130
144
  extra_rdoc_files: []
131
145
  files:
132
146
  - lib/upton.rb
133
- - lib/utils.rb
147
+ - lib/upton/utils.rb
148
+ - lib/upton/downloader.rb
149
+ - spec/data/propublica_search_page_2.html
134
150
  - spec/data/webinar.html
135
151
  - spec/data/propublica-relative.html
136
152
  - spec/data/propublica.html
@@ -138,7 +154,10 @@ files:
138
154
  - spec/data/sixfacts.html
139
155
  - spec/data/discussion.html
140
156
  - spec/data/easttimor.html
157
+ - spec/data/propublica_search.html
141
158
  - spec/upton_spec.rb
159
+ - spec/spec_helper.rb
160
+ - spec/upton_downloader_spec.rb
142
161
  homepage: http://github.org/propublica/upton
143
162
  licenses:
144
163
  - MIT
@@ -164,6 +183,7 @@ signing_key:
164
183
  specification_version: 4
165
184
  summary: A simple web-scraping framework
166
185
  test_files:
186
+ - spec/data/propublica_search_page_2.html
167
187
  - spec/data/webinar.html
168
188
  - spec/data/propublica-relative.html
169
189
  - spec/data/propublica.html
@@ -171,5 +191,8 @@ test_files:
171
191
  - spec/data/sixfacts.html
172
192
  - spec/data/discussion.html
173
193
  - spec/data/easttimor.html
194
+ - spec/data/propublica_search.html
174
195
  - spec/upton_spec.rb
196
+ - spec/spec_helper.rb
197
+ - spec/upton_downloader_spec.rb
175
198
  has_rdoc: true
@@ -1,74 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- ##
4
- # This module contains a collection of helpers for Upton
5
- ##
6
- module Upton
7
-
8
- ##
9
- # This class contains a collection of helpers for Upton
10
- #
11
- # Each method returns a Proc that (with an & ) can be used as the final
12
- # argument to Upton's `scrape` and `scrape_to_csv`
13
- ##
14
- module Utils
15
-
16
- ##
17
- # Scrapes an HTML <table> element into an Array of Arrays. The header, if
18
- # present, is returned as the first row.
19
- ##
20
- def self.table(table_selector, selector_method=:xpath)
21
- return Proc.new do |instance_html|
22
- html = ::Nokogiri::HTML(instance_html)
23
- output = []
24
- headers = html.send(selector_method, table_selector).css("th").map &:text
25
- output << headers
26
-
27
- table = html.send(selector_method, table_selector).css("tr").each{|tr| output << tr.css("td").map(&:text) }
28
- output
29
- end
30
- end
31
-
32
- ##
33
- # Scrapes any set of HTML elements into an Array.
34
- ##
35
- def self.list(list_selector, selector_method=:xpath)
36
- return Proc.new do |instance_html|
37
- html = ::Nokogiri::HTML(instance_html)
38
- html.send(selector_method, list_selector).map{|list_element| list_element.text }
39
- end
40
- end
41
-
42
- ##
43
- # Takes :_href and resolves it to an absolute URL according to
44
- # the supplied :_page_url. They can be either Strings or URI
45
- # instances.
46
- #
47
- # raises ArgumentError if either href or page_url is nil
48
- # raises ArgumentError if page_url is not absolute
49
- #
50
- # returns: a String with absolute URL
51
- def self.resolve_url(_href, _page_url)
52
-
53
- page_url = URI(_page_url).dup
54
- raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
55
-
56
- href = URI(_href).dup
57
-
58
- # return :href if :href is already absolute
59
- return href.to_s if href.absolute?
60
-
61
-
62
- # TODO: There may be edge cases worth considering
63
- # but this should handle the following non-absolute href possibilities:
64
- # //anothersite.com (keeps scheme, too!)
65
- # /root/dir
66
- # relative/dir
67
- # ?query=2
68
- # #bang
69
-
70
- URI.join(page_url, href).to_s
71
- end
72
-
73
- end
74
- end