upton 0.2.11 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/upton.rb +54 -51
- data/lib/upton/downloader.rb +7 -3
- data/lib/upton/version.rb +3 -0
- data/spec/upton_spec.rb +69 -10
- metadata +57 -36
- checksums.yaml +0 -7
data/lib/upton.rb
CHANGED
@@ -35,7 +35,7 @@ module Upton
     EMPTY_STRING = ''

     attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-                  :paginated, :pagination_param, :pagination_max_pages, :readable_filenames
+                  :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames

     ##
     # This is the main user-facing method for a basic scraper.
@@ -54,21 +54,13 @@ module Upton
     # +selector+: The XPath expression or CSS selector that specifies the
     #             anchor elements within the page, if a url is specified for
     #             the previous argument.
-    # +selector_method+: Deprecated and ignored. Next breaking release will
-    #             remove this option.x
     #
     # These options are a shortcut. If you plan to override +get_index+, you
     # do not need to set them.
     # If you don't specify a selector, the first argument will be treated as a
     # list of URLs.
     ##
-
-    # DEPRECATION NOTE, re: selector_method
-    # the selector_method parameter is unneeded, as Nokogiri provides the
-    # #search method, which picks a selector depending on whether
-    # the String passed is of CSS/XPath notation
-
-    def initialize(index_url_or_array, selector="", selector_method=:deprecated)
+    def initialize(index_url_or_array, selector="")

       #if first arg is a valid URL, do already-written stuff;
       #if it's not (or if it's a list?) don't bother with get_index, etc.
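The deprecated `selector_method` argument is gone from the constructor; Nokogiri's `#search` accepts either CSS or XPath notation, so only an index URL (or an array of URLs) and a selector are passed. A minimal usage sketch against the new signature — the URLs and selector below are made-up placeholders:

    require 'upton'

    # Two-argument constructor: an index URL and a selector for its links.
    scraper = Upton::Scraper.new("http://example.com/articles.html", "section#river h1 a")

    # With no selector, the first argument is treated as a list of instance URLs.
    list_scraper = Upton::Scraper.new(["http://example.com/a.html", "http://example.com/b.html"])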
@@ -107,8 +99,9 @@ module Upton
       @pagination_param = 'page'
       # Default number of paginated pages to scrape
       @pagination_max_pages = 2
-
-
+      # Default starting number for pagination (second page is this plus 1).
+      @pagination_start_index = 1
+
       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
       if @stash_folder
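`pagination_start_index` is a new accessor, defaulting to 1, so index pagination no longer has to start counting at page 1. A configuration sketch using only the accessors declared above — the URL, selector, and parameter values are placeholders:

    scraper = Upton::Scraper.new("http://example.com/search.html", ".result a")
    scraper.paginated = true
    scraper.pagination_param = 'page'      # query parameter appended to index URLs
    scraper.pagination_max_pages = 5       # stop after this many index pages
    scraper.pagination_start_index = 0     # e.g. for sites whose first page is numbered 0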
@@ -231,18 +224,34 @@ module Upton
     ##
     # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
     # resolve_url resolves them to absolute urls.
-    # absolute_url_str must be a URL, as a string
+    # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
     ##
     def resolve_url(href_str, absolute_url_str)
-
+      if absolute_url_str.class <= URI::Generic
+        absolute_url = absolute_url_str.dup
+      else
+        begin
+          absolute_url = URI(absolute_url_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+        end
+      end
       raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
-
+      if href_str.class <= URI::Generic
+        href = href_str.dup
+      else
+        begin
+          href = URI(href_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
+        end
+      end

       # return :href if :href is already absolute
       return href.to_s if href.absolute?

       #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url, href).to_s
+      URI.join(absolute_url.to_s, href.to_s).to_s
     end

     ##
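`resolve_url` now accepts either a string or a `URI` object for both the href and the base, validates each, and raises `ArgumentError` for unparseable input or a non-absolute base. A rough illustration of the intended behavior, treating `resolve_url` as callable on the instance; the example URLs are invented:

    scraper = Upton::Scraper.new("http://example.com/index.html", "a")

    scraper.resolve_url("article.html", "http://example.com/section/")
    # => "http://example.com/section/article.html"

    scraper.resolve_url(URI("http://other.com/x.html"), "http://example.com/")
    # => "http://other.com/x.html"    (already absolute, returned as-is)

    scraper.resolve_url("article.html", "section/")
    # raises ArgumentError: the base URL must be absolute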
@@ -251,16 +260,7 @@ module Upton
     # comes from an API.
     ##
     def get_index
-
-      parse_index(get_index_pages(@index_url, 1), @index_selector)
-    end
-
-    ##
-    # Using the XPath expression or CSS selector and selector_method that
-    # uniquely identifies the links in the index, return those links as strings. ##
-    def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
-      # for now, override selector_method with :search, which will work with either CSS or XPath
-      Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
+      index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
     end

     # TODO: Not sure the best way to handle this
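The rewritten `get_index` fetches every paginated index page (starting at `@pagination_start_index`), runs each through `parse_index`, and flattens the results into one list of instance URLs; the unused `old_parse_index` helper is dropped. As the surrounding comments note, `get_index` can still be overridden when the instance URLs come from somewhere other than an index page. A hypothetical sketch of such an override:

    class ApiBackedScraper < Upton::Scraper
      # Hypothetical override: supply instance URLs from another source
      # (a JSON API, a database, a CSV) instead of scraping an index page.
      def get_index
        ["http://example.com/items/1.html", "http://example.com/items/2.html"]
      end
    end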
@@ -274,8 +274,7 @@ module Upton
     # to make sure that this method returns absolute urls
     # i.e. this method expects @index_url to always have an absolute address
     # for the lifetime of an Upton instance
-    def parse_index(text, selector
-      # for now, override selector_method with :search, which will work with either CSS or XPath
+    def parse_index(text, selector)
       Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
         href = a_element["href"]
         resolved_url = resolve_url( href, @index_url) unless href.nil?
@@ -290,18 +289,19 @@ module Upton
     # e.g. a site listing links with 2+ pages.
     ##
     def get_index_pages(url, pagination_index, options={})
-
-
-
-
-
+      resps = [self.get_page(url, @index_debug, options)]
+      prev_url = url
+      while !resps.last.empty?
+        pagination_index += 1
+        next_url = self.next_index_page_url(url, pagination_index)
         next_url = resolve_url(next_url, url)
-
-
-
-
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @index_debug, options).to_s
+        prev_url = next_url
+        resps << next_resp
       end
-
+      resps
     end

     ##
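`get_index_pages` now accumulates responses in an array: it keeps requesting whatever `next_index_page_url` returns (resolved against the original index URL) until a page comes back empty or the URL stops changing. Sites with unusual pagination can be handled by overriding `next_index_page_url`; a hypothetical sketch for path-style page URLs, in place of the query-parameter style the specs exercise (`?p=2`):

    class PathPaginatedScraper < Upton::Scraper
      # Hypothetical override: build "/page/2"-style URLs. Returning an empty
      # string ends the pagination loop.
      def next_index_page_url(url, pagination_index)
        return "" if pagination_index > @pagination_max_pages
        "#{url}/page/#{pagination_index}"
      end
    end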
@@ -313,26 +313,29 @@ module Upton
     # page, e.g. if a news article has two pages.
     ##
     def get_instance(url, pagination_index=0, options={})
-
-
-
-
-
-
-
-
-
+      resps = [self.get_page(url, @debug, options)]
+      pagination_index = pagination_index.to_i
+      prev_url = url
+      while !resps.last.empty?
+        next_url = self.next_instance_page_url(url, pagination_index + 1)
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @debug, options)
+        prev_url = next_url
+        resps << next_resp
       end
-
+      resps
     end

     # Just a helper for +scrape+.
     def scrape_from_list(list, blk)
       puts "Scraping #{list.size} instances" if @verbose
       list.each_with_index.map do |instance_url, instance_index|
-
-
-
+        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
+        instance_resps.each_with_index.map do |instance_resp, pagination_index|
+          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+        end
+      end.flatten(1)
     end

     # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
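`get_instance` likewise returns an array of page bodies, and `scrape_from_list` now yields each page of each instance separately, passing the response along with the instance URL, instance index, and pagination index. A usage sketch under that assumption — the URL and selector are placeholders:

    require 'upton'
    require 'nokogiri'

    scraper = Upton::Scraper.new("http://example.com/articles.html", "h1 a")
    headlines = scraper.scrape do |html, url, instance_index, pagination_index|
      # html is one page of one instance; multi-page articles yield once per page.
      Nokogiri::HTML(html).css("h1").text
    end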
data/lib/upton/downloader.rb
CHANGED
@@ -42,10 +42,14 @@ module Upton

     private

+    def make_request_for_resource!
+      RestClient.get(uri)
+    end
+
     def download_from_resource!
       begin
         puts "Downloading from #{uri}" if @verbose
-        resp =
+        resp = make_request_for_resource!
         puts "Downloaded #{uri}" if @verbose
       rescue RestClient::ResourceNotFound
         puts "404 error, skipping: #{uri}" if @verbose
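The actual HTTP call is now isolated in `make_request_for_resource!`, giving the downloader a single seam that tests can stub instead of hitting the network. A hypothetical RSpec sketch in the `should`-style syntax the suite already uses; the constructor argument and the `download` call are assumptions about the Downloader's public interface:

    # Hypothetical spec: stub the network seam, then exercise the cache logic.
    downloader = Upton::Downloader.new("http://example.com/page.html")
    downloader.stub(:make_request_for_resource!).and_return("<html>canned body</html>")
    downloader.download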
@@ -73,7 +77,7 @@ module Upton
             puts "Cache of #{uri} unavailable. Will download from the internet"
           end
         end
-        from_resource =
+        from_resource = true
         download_from_resource!
       end
       unless cached_file_exists?
@@ -84,7 +88,7 @@ module Upton
             puts "Writing #{uri} data to the cache"
           end
         end
-
+        open(cached_file, 'w'){|f| f << resp}
       end
       {:resp => resp, :from_resource => from_resource }
     end
data/spec/upton_spec.rb
CHANGED
@@ -54,8 +54,9 @@ describe Upton do

     propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
     propubscraper.debug = true
-    propubscraper.verbose =
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"

     heds = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -88,8 +89,9 @@ describe Upton do

     propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
     propubscraper.debug = true
-    propubscraper.verbose =
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"

     heds = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -105,8 +107,9 @@ describe Upton do

     propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
     propubscraper.debug = true
-    propubscraper.verbose =
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"

     list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a"))
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -119,10 +122,12 @@ describe Upton do

     propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
     propubscraper.debug = true
-    propubscraper.verbose =
+    propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"

     table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
+    table.map{|outer| outer.map{|row| row.map{|cell| cell.gsub!("\n", '') } }} # cope with diff nokogiri versions differing behavior.
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     table.should eql @east_timor_prime_ministers
   end
@@ -148,11 +153,12 @@ describe Upton do

     propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
     propubscraper.debug = true
-    propubscraper.verbose =
+    propubscraper.verbose = false
     propubscraper.paginated = true
     propubscraper.pagination_param = 'p'
     propubscraper.pagination_max_pages = 3
     propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"

     results = propubscraper.scrape do |article_str|
       doc = Nokogiri::HTML(article_str)
@@ -167,15 +173,57 @@ describe Upton do
     Upton::Scraper.stub(:sleep)
   end

-  it "should sleep after
+  it "should sleep after requests with caching disabled" do
     stub_request(:get, "www.example.com")
     u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.index_debug = false
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.should_receive(:sleep)
-    stub = stub_request(:get, "http://www.example.com")
     u.scrape
   end

+  it "should sleep after uncached requests when caching is enabled" do
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    stub_request(:get, "www.example.com")
+    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.index_debug = true
+    u.stash_folder = "test_stashes"
+    u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
+    u.should_receive(:sleep)
+    u.scrape
+  end
+
+  it "should sleep after paginated requests when caching is disabled" do
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    stub_request(:get, "www.example.com/propublica_search.html").
+      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
+    stub_request(:get, "www.example.com/propublica_search.html?p=2").
+      to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
+    stub_request(:get, "www.example.com/propublica_search.html?p=3").
+      to_return(:body => '', :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+
+    u = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.nonexistent')
+    u.index_debug = false
+    u.debug = false
+    u.paginated = true
+    u.pagination_param = 'p'
+    u.pagination_max_pages = 3
+    u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
+    u.stash_folder = "test_stashes"
+
+    u.should_receive(:sleep).exactly(3).times #once for each search page, so 3.
+    u.scrape
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+  end
+
+
   it "should save to the designated stash folder" do
     custom_cache_folder = "#{Dir.tmpdir}/upton/test"
     FileUtils.rm_rf(custom_cache_folder)
@@ -183,17 +231,28 @@ describe Upton do
       to_return(:body => '', :status => 200)

     u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.sleep_time_between_requests = 0.0
     u.stash_folder = custom_cache_folder
     u.debug = true
     u.scrape do
       1+1
     end
-    puts [custom_cache_folder, custom_cache_folder + "/*", Dir.glob(custom_cache_folder)].inspect
     files = Dir.glob(custom_cache_folder)
     expect(files).not_to be_empty
   end

-
-
+
+  before do
+    Upton::Scraper.stub(:puts)
   end
+
+  it "should be silent if verbose is false" do
+    stub_request(:get, "www.example.com")
+    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u.sleep_time_between_requests = 0.0
+    u.verbose = false
+    u.should_not_receive(:puts)
+    u.scrape
+  end
+
 end
metadata
CHANGED
@@ -1,102 +1,116 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.3.0
+  prerelease:
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-
+date: 2013-12-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
-        version:
+        version: 1.5.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
-        version:
+        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -104,6 +118,7 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -111,29 +126,33 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
+    none: false
     requirements:
-    - - '>='
+    - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -146,52 +165,54 @@ files:
 - lib/upton.rb
 - lib/upton/utils.rb
 - lib/upton/downloader.rb
--
+- lib/upton/version.rb
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
-metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
-      version: 1.
+      version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
-  - - '>='
+  - - ! '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 1.8.23
 signing_key:
-specification_version:
+specification_version: 3
 summary: A simple web-scraping framework
 test_files:
-- spec/data/
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
checksums.yaml
DELETED
@@ -1,7 +0,0 @@
|
|
1
|
-
---
|
2
|
-
SHA1:
|
3
|
-
metadata.gz: 2ef1916db6e2fb734cb8ea7ed33eb5edb67b37e3
|
4
|
-
data.tar.gz: 2a9da49f8a47dfc9e1feab2138045f7aa49268d6
|
5
|
-
SHA512:
|
6
|
-
metadata.gz: e94a228a8fb01c90c0e7535b106b2af4dd8983ea3e92b2813cd5d038c3985a5f55c5fbcac19ee5f16f3271ad9e390d426f0ad8ad7b0c08afdf3b9d745cff2738
|
7
|
-
data.tar.gz: f8b0475e022980cd6ca0eec6dc8512394723084ba59d0b47cd36c24c736fbfc4a58b52ce186a3f5b91c69fd1241dfaa9d57c5f71bf6867255426e0fd3f26ed0f
|