upton 0.3.6 → 1.0.0.prea
- checksums.yaml +4 -4
- data/lib/upton.rb +3 -329
- data/lib/upton/downloader.rb +1 -1
- data/lib/upton/scraper.rb +123 -76
- data/lib/upton/utils.rb +2 -4
- data/lib/upton/version.rb +1 -1
- data/spec/upton_spec.rb +98 -27
- metadata +50 -41
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c446c20e57e387b365d9c5bcda546a1b48ebbcf1
+  data.tar.gz: a29ee1aa35b18a9324d504ae8e99e2a9bafcfb27
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 11d5e990c42441d5bf599952bf3d49289754f68da72a51b31d62c86281531960fc18adffb7e52c59fe37ac5e275c35e766ce922c9a1922294f032d2a5c7cbea7
+  data.tar.gz: a6cbe33126fe3506c2248d40677e88e3d4e545ec2dc3e6613b3d623232f2288eae1305f41ee1ce63a996ff64812558596a95f1b14dffb63bca71bd89562e9fe7
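The two digests per algorithm cover the archives inside the packaged .gem (metadata.gz and data.tar.gz). As a sanity check, a downloaded copy of the new release can be verified against them; a minimal sketch, assuming the archives have already been extracted from the .gem file:

    require 'digest'

    # Hypothetical verification against the new SHA1 value listed above.
    Digest::SHA1.hexdigest(File.binread("data.tar.gz"))
    # => "a29ee1aa35b18a9324d504ae8e99e2a9bafcfb27" if the archive matches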
data/lib/upton.rb
CHANGED
@@ -1,10 +1,10 @@
 # encoding: UTF-8
 
-
-require 'uri'
-require 'restclient'
+require_relative 'upton/scraper'
 require_relative 'upton/utils'
+require_relative 'upton/version'
 require_relative 'upton/downloader'
+require_relative 'upton/scraper'
 
 ##
 # This module contains a scraper called Upton
@@ -22,332 +22,6 @@ module Upton
   # site's search page or a newspaper's homepage.
   # 2. Instance pages, which represent the goal of your scraping, e.g.
   # job listings or news articles.
-  #
-  # Upton::Scraper can be used as-is for basic use-cases by:
-  # 1. specifying the pages to be scraped in `new` as an index page
-  # or as an Array of URLs.
-  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
-  # block from Upton::Utils.
-  # For more complicated cases; subclass Upton::Scraper
-  # e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
-  class Scraper
-    EMPTY_STRING = ''
-
-    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-      :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
-      :pagination_interval
-
-    ##
-    # This is the main user-facing method for a basic scraper.
-    # Call +scrape+ with a block; this block will be called on
-    # the text of each instance page, (and optionally, its URL and its index
-    # in the list of instance URLs returned by +get_index+).
-    ##
-    def scrape(&blk)
-      self.url_array = self.get_index unless self.url_array
-      blk = Proc.new{|x| x} if blk.nil?
-      self.scrape_from_list(self.url_array, blk)
-    end
-
-    ##
-    # +index_url_or_array+: A list of string URLs, OR
-    # the URL of the page containing the list of instances.
-    # +selector+: The XPath expression or CSS selector that specifies the
-    # anchor elements within the page, if a url is specified for
-    # the previous argument.
-    #
-    # These options are a shortcut. If you plan to override +get_index+, you
-    # do not need to set them.
-    # If you don't specify a selector, the first argument will be treated as a
-    # list of URLs.
-    ##
-    def initialize(index_url_or_array, selector="")
-
-      #if first arg is a valid URL, do already-written stuff;
-      #if it's not (or if it's a list?) don't bother with get_index, etc.
-      #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
-      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if index_url_or_array.respond_to? :each_with_index
-        @url_array = index_url_or_array
-      else
-        @index_url = index_url_or_array
-        @index_selector = selector
-      end
-
-      # If true, then Upton prints information about when it gets
-      # files from the internet and when it gets them from its stash.
-      @verbose = false
-
-      # If true, then Upton fetches each instance page only once
-      # future requests for that file are responded to with the locally stashed
-      # version.
-      # You may want to set @debug to false for production (but maybe not).
-      # You can also control stashing behavior on a per-call basis with the
-      # optional second argument to get_page, if, for instance, you want to
-      # stash certain instance pages, e.g. based on their modification date.
-      @debug = true
-      # Index debug does the same, but for index pages.
-      @index_debug = false
-
-      # In order to not hammer servers, Upton waits for, by default, 30
-      # seconds between requests to the remote server.
-      @sleep_time_between_requests = 30 #seconds
-
-      # If true, then Upton will attempt to scrape paginated index pages
-      @paginated = false
-      # Default query string parameter used to specify the current page
-      @pagination_param = 'page'
-      # Default number of paginated pages to scrape
-      @pagination_max_pages = 2
-      # Default starting number for pagination (second page is this plus 1).
-      @pagination_start_index = 1
-      # Default value to increment page number by
-      @pagination_interval = 1
-
-      # Folder name for stashes, if you want them to be stored somewhere else,
-      # e.g. under /tmp.
-      if @stash_folder
-        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
-      end
-    end
-
-    ##
-    # If instance pages are paginated, <b>you must override</b>
-    # this method to return the next URL, given the current URL and its index.
-    #
-    # If instance pages aren't paginated, there's no need to override this.
-    #
-    # Recursion stops if the fetching URL returns an empty string or an error.
-    #
-    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
-    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
-    ##
-    def next_instance_page_url(url, pagination_index)
-      EMPTY_STRING
-    end
-
-    ##
-    # Return the next URL to scrape, given the current URL and its index.
-    #
-    # Recursion stops if the fetching URL returns an empty string or an error.
-    #
-    # If @paginated is not set (the default), this method returns an empty string.
-    #
-    # If @paginated is set, this method will return the next pagination URL
-    # to scrape using @pagination_param and the pagination_index.
-    #
-    # If the pagination_index is greater than @pagination_max_pages, then the
-    # method will return an empty string.
-    #
-    # Override this method to handle pagination is an alternative way
-    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
-    # ought to return "http://whatever.com/articles?page=2"
-    #
-    ##
-    def next_index_page_url(url, pagination_index)
-      return url unless @paginated
-
-      if pagination_index > @pagination_max_pages
-        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
-        EMPTY_STRING
-      else
-        uri = URI.parse(url)
-        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
-        # update the pagination query string parameter
-        query[@pagination_param] = pagination_index
-        uri.query = URI.encode_www_form(query)
-        puts "Next index pagination url is #{uri}" if @verbose
-        uri.to_s
-      end
-    end
-
-    ##
-    # Writes the scraped result to a CSV at the given filename.
-    ##
-    def scrape_to_csv filename, &blk
-      require 'csv'
-      self.url_array = self.get_index unless self.url_array
-      CSV.open filename, 'wb' do |csv|
-        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          if document[0].respond_to? :map
-            document.each{|row| csv << row }
-          else
-            csv << document
-          end
-        end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
-      end
-    end
-
-    def scrape_to_tsv filename, &blk
-      require 'csv'
-      self.url_array = self.get_index unless self.url_array
-      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
-        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          if document[0].respond_to? :map
-            document.each{|row| csv << row }
-          else
-            csv << document
-          end
-        end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
-      end
-    end
-
-    protected
-
-    ##
-    # Handles getting pages with Downlader, which handles stashing.
-    ##
-    def get_page(url, stash=false, options={})
-      return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
-      global_options = {
-        :cache => stash,
-        :verbose => @verbose
-      }
-      if @readable_filenames
-        global_options[:readable_filenames] = true
-      end
-      if @stash_folder
-        global_options[:readable_filenames] = true
-        global_options[:cache_location] = @stash_folder
-      end
-      resp_and_cache = Downloader.new(url, global_options.merge(options)).get
-      if resp_and_cache[:from_resource]
-        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
-        sleep @sleep_time_between_requests
-      end
-      resp_and_cache[:resp]
-    end
-
-
-    ##
-    # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
-    # resolve_url resolves them to absolute urls.
-    # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
-    ##
-    def resolve_url(href_str, absolute_url_str)
-      if absolute_url_str.class <= URI::Generic
-        absolute_url = absolute_url_str.dup
-      else
-        begin
-          absolute_url = URI(absolute_url_str).dup
-        rescue URI::InvalidURIError
-          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
-        end
-      end
-      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
-      if href_str.class <= URI::Generic
-        href = href_str.dup
-      else
-        begin
-          href = URI(href_str).dup
-        rescue URI::InvalidURIError
-          raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
-        end
-      end
-
-      # return :href if :href is already absolute
-      return href.to_s if href.absolute?
-
-      #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url.to_s, href.to_s).to_s
-    end
-
-    ##
-    # Return a list of URLs for the instances you want to scrape.
-    # This can optionally be overridden if, for example, the list of instances
-    # comes from an API.
-    ##
-    def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
-    end
-
-    # TODO: Not sure the best way to handle this
-    # Currently, #parse_index is called upon #get_index_pages,
-    # which itself is dependent on @index_url
-    # Does @index_url stay unaltered for the lifetime of the Upton instance?
-    # It seems to at this point, but that may be something that gets
-    # deprecated later
-    #
-    # So for now, @index_url is used in conjunction with resolve_url
-    # to make sure that this method returns absolute urls
-    # i.e. this method expects @index_url to always have an absolute address
-    # for the lifetime of an Upton instance
-    def parse_index(text, selector)
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
-        href = a_element["href"]
-        resolved_url = resolve_url( href, @index_url) unless href.nil?
-        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
-        resolved_url
-      end
-    end
-
-
-    ##
-    # Returns the concatenated output of each member of a paginated index,
-    # e.g. a site listing links with 2+ pages.
-    ##
-    def get_index_pages(original_url, pagination_index, pagination_interval, options={})
-      resps = []
-      prev_url = nil
-      while resps.empty? || !resps.last.empty?
-        next_url = self.next_index_page_url(original_url, pagination_index)
-        break if next_url.empty?
-
-        next_url = resolve_url(next_url, original_url)
-        break if next_url == prev_url
-
-        next_resp = self.get_page(next_url, @index_debug, options).to_s
-        prev_url = next_url
-        pagination_index += pagination_interval
-        resps << next_resp
-      end
-      resps
-    end
-
-    ##
-    # Returns the instance at `url`.
-    #
-    # If the page is stashed, returns that, otherwise, fetches it from the web.
-    #
-    # If an instance is paginated, returns the concatenated output of each
-    # page, e.g. if a news article has two pages.
-    ##
-    def get_instance(url, pagination_index=0, options={})
-      resps = [self.get_page(url, @debug, options)]
-      pagination_index = pagination_index.to_i
-      prev_url = url
-      while !resps.last.empty?
-        next_url = self.next_instance_page_url(url, pagination_index + 1)
-        break if next_url == prev_url || next_url.empty?
-
-        next_resp = self.get_page(next_url, @debug, options)
-        prev_url = next_url
-        resps << next_resp
-      end
-      resps
-    end
-
-    # Just a helper for +scrape+.
-    def scrape_from_list(list, blk)
-      puts "Scraping #{list.size} instances" if @verbose
-      list.each_with_index.map do |instance_url, instance_index|
-        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
-        instance_resps.each_with_index.map do |instance_resp, pagination_index|
-          blk.call(instance_resp, instance_url, instance_index, pagination_index)
-        end
-      end.flatten(1)
-    end
-
-    # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
-    def slug(url)
-      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
-    end
 
-  end
 end
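For orientation: everything removed above is the old 0.3.6 Scraper, which lived in lib/upton.rb itself; 1.0.0.prea leaves only the requires behind and defines the class in lib/upton/scraper.rb. A minimal sketch of the retired 0.3.6-style usage, reconstructed from the removed code (the URL and selector are illustrative placeholders, not taken from the diff):

    require 'upton'
    require 'nokogiri'

    # 0.3.6: the constructor took an index URL (or an Array of URLs) plus a selector.
    scraper = Upton::Scraper.new("http://example.com/articles.html", "a.article-link")
    scraper.sleep_time_between_requests = 5

    # The block received the raw page text plus positional metadata,
    # matching blk.call in the removed scrape_from_list helper above.
    headlines = scraper.scrape do |page_html, url, instance_index, pagination_index|
      Nokogiri::HTML(page_html).css("h1.article-title").text
    end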
data/lib/upton/downloader.rb
CHANGED
@@ -103,7 +103,7 @@ module Upton
       msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
       resp_html = Nokogiri::HTML(resp)
       comment = Nokogiri::XML::Comment.new(resp_html, msg)
-      if resp_html.root.nil?
+      if resp_html.root.nil?
         return resp
       elsif resp_html.root.children.empty?
         resp_html.root.add_child(comment)
data/lib/upton/scraper.rb
CHANGED
@@ -1,9 +1,10 @@
 require 'uri'
 require 'nokogiri'
 require_relative './downloader'
+require_relative './page'
 
 module Upton
-
+  # Upton::Scraper can be used as-is for basic use-cases by:
   # 1. specifying the pages to be scraped in `new` as an index page
   # or as an Array of URLs.
   # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
@@ -14,9 +15,8 @@ module Upton
   class Scraper
     EMPTY_STRING = ''
 
-    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
-
-      :pagination_interval
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
+      :stash_folder, :readable_filenames
 
     ##
     # This is the main user-facing method for a basic scraper.
@@ -25,8 +25,8 @@ module Upton
     # in the list of instance URLs returned by +get_index+).
     ##
     def scrape(&blk)
-
-      self.scrape_from_list(
+      get_indexes!
+      self.scrape_from_list(@instance_urls, blk)
     end
 
     ##
@@ -41,23 +41,10 @@ module Upton
     # If you don't specify a selector, the first argument will be treated as a
     # list of URLs.
     ##
-    def initialize(
-
-      #if first arg is a valid URL, do already-written stuff;
-      #if it's not (or if it's a list?) don't bother with get_index, etc.
-      #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
-      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if index_url_or_array.respond_to? :each_with_index
-        @url_array = index_url_or_array
-      else
-        @index_url = index_url_or_array
-        @index_selector = selector
-      end
-
+    def initialize(options={})
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
-      @verbose = false
+      @verbose = options[:verbose] || false
 
       # If true, then Upton fetches each instance page only once
       # future requests for that file are responded to with the locally stashed
@@ -66,29 +53,77 @@ module Upton
       # You can also control stashing behavior on a per-call basis with the
       # optional second argument to get_page, if, for instance, you want to
       # stash certain instance pages, e.g. based on their modification date.
-      @debug = true
+      @debug = options[:debug] || true
       # Index debug does the same, but for index pages.
-      @index_debug = false
+      @index_debug = options[:index_debug] || false
 
       # In order to not hammer servers, Upton waits for, by default, 30
       # seconds between requests to the remote server.
-      @sleep_time_between_requests = 30 #seconds
+      @sleep_time_between_requests = options[:sleep_time_between_requests] || 30 #seconds
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      if @stash_folder
+        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      end
+
+      @indexes = []
+      @instance_urls = []
+    end
+
+    def index(index_url, selector, options={})
+      # for future:
+      @indexes ||= []
 
+      ##
+      # Pagination options are per-index page
+      #
       # If true, then Upton will attempt to scrape paginated index pages
-
+      options[:paginated] ||= false
       # Default query string parameter used to specify the current page
-
+      options[:pagination_param] ||= 'page'
       # Default number of paginated pages to scrape
-
+      options[:pagination_max_pages] ||= 2
       # Default starting number for pagination (second page is this plus 1).
-
+      options[:pagination_start_index] ||= 1
       # Default value to increment page number by
-
-
-
-
-
-
+      options[:pagination_interval] ||= 1
+      ##
+
+      @indexes << [index_url, selector, options]
+      # and actually go scrape the index page, populate @instances
+      self
+    end
+
+    def self.index(index_url, selector, options={})
+      scraper = self.new
+      scraper.index(index_url, selector, options)
+      scraper
+    end
+
+    def self.instances(instances, options={})
+      s = self.new
+      s.instance_variable_set(:@instance_urls, instances)
+      s
+    end
+
+    # does
+    # def add_instances(urls)
+    #   #for future:
+    #   # @instances += urls
+    #   # @instances.uniq!
+    #   @instance_urls ||= []
+    #   @instance_urls += urls
+    #   @instance_urls.uniq!
+    # end
+
+    def instances(urls=nil)
+      if urls.nil?
+        @instance_urls
+      else
+        @instance_urls ||= []
+        @instance_urls += urls
+        self
       end
     end
 
@@ -125,21 +160,14 @@ module Upton
     # ought to return "http://whatever.com/articles?page=2"
     #
     ##
-    def next_index_page_url(url, pagination_index)
-
-
-
-
-
-
-
-      query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
-      # update the pagination query string parameter
-      query[@pagination_param] = pagination_index
-      uri.query = URI.encode_www_form(query)
-      puts "Next index pagination url is #{uri}" if @verbose
-      uri.to_s
-    end
+    def next_index_page_url(url, pagination_param, pagination_index)
+      uri = URI.parse(url)
+      query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+      # update the pagination query string parameter
+      query[pagination_param] = pagination_index
+      uri.query = URI.encode_www_form(query)
+      puts "Next index pagination url is #{uri}" if @verbose
+      uri.to_s
     end
 
     ##
@@ -147,36 +175,46 @@ module Upton
     ##
     def scrape_to_csv filename, &blk
       require 'csv'
-      self.
+      self.get_indexes!
       CSV.open filename, 'wb' do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(
+        self.scrape_from_list(@instance_urls, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
             csv << document
           end
         end
-        #self.scrape_from_list(
+        #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
       end
     end
 
     def scrape_to_tsv filename, &blk
       require 'csv'
-
+      get_indexes!
       CSV.open filename, 'wb', :col_sep => "\t" do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(
+        self.scrape_from_list(@instance_urls, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
             csv << document
           end
         end
-        #self.scrape_from_list(
+        #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
       end
     end
 
+    def +(other_scraper)
+      raise ArgumentError, "#{other_scraper.class} can't be coerced into Upton::Scraper" unless other_scraper.class <= Upton::Scraper
+      new_scraper = Scraper.new
+      new_indexes = @indexes + other_scraper.instance_variable_get(:@indexes)
+      new_instances = @instance_urls + other_scraper.instance_variable_get(:@instance_urls)
+      new_scraper.instance_variable_set(:@indexes, new_indexes)
+      new_scraper.instance_variable_set(:@instance_urls, new_instances)
+      new_scraper
+    end
+
     protected
 
     ##
@@ -217,6 +255,8 @@ module Upton
           absolute_url = URI(absolute_url_str).dup
         rescue URI::InvalidURIError
           raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+        rescue ArgumentError
+          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
         end
       end
       raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
@@ -237,15 +277,6 @@ module Upton
       URI.join(absolute_url.to_s, href.to_s).to_s
     end
 
-    ##
-    # Return a list of URLs for the instances you want to scrape.
-    # This can optionally be overridden if, for example, the list of instances
-    # comes from an API.
-    ##
-    def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
-    end
-
     # TODO: Not sure the best way to handle this
     # Currently, #parse_index is called upon #get_index_pages,
     # which itself is dependent on @index_url
@@ -253,30 +284,31 @@ module Upton
     # It seems to at this point, but that may be something that gets
     # deprecated later
     #
-    # So for now,
+    # So for now, index_url is used in conjunction with resolve_url
     # to make sure that this method returns absolute urls
-
-
-
-
-        href = a_element["href"]
-        resolved_url = resolve_url( href, @index_url) unless href.nil?
+    def parse_index(text, selector, index_url)
+      Nokogiri::HTML(text).search(selector).to_a.map do |anchor|
+        href = anchor["href"]
+        resolved_url = resolve_url( href, index_url) unless href.nil?
         puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
         resolved_url
       end
     end
 
-
     ##
     # Returns the concatenated output of each member of a paginated index,
     # e.g. a site listing links with 2+ pages.
     ##
-    def get_index_pages(url, pagination_index,
+    def get_index_pages(url, pagination_index, options={})
       resps = [self.get_page(url, @index_debug, options)]
+      return resps unless options[:paginated]
+
       prev_url = url
       while !resps.last.empty?
-        pagination_index += pagination_interval
-
+        pagination_index += options[:pagination_interval]
+        break if pagination_index > options[:pagination_max_pages]
+
+        next_url = self.next_index_page_url(url, options[:pagination_param], pagination_index)
         next_url = resolve_url(next_url, url)
         break if next_url == prev_url || next_url.empty?
 
@@ -310,13 +342,28 @@ module Upton
       resps
     end
 
+    ##
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    ##
+    def get_indexes!
+      @indexes.each do |index_url, index_selector, options|
+        #TODO: cope with pagination stuff per URL
+
+        @instance_urls += get_index_pages(index_url, options[:pagination_start_index], options).map{|page| parse_index(page, index_selector, index_url) }.flatten
+      end
+    end
+
+
     # Just a helper for +scrape+.
     def scrape_from_list(list, blk)
       puts "Scraping #{list.size} instances" if @verbose
       list.each_with_index.map do |instance_url, instance_index|
        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
        instance_resps.each_with_index.map do |instance_resp, pagination_index|
-
+          page = Page.new(instance_resp, instance_url, instance_index, pagination_index)
+          blk.call(page)
         end
       end.flatten(1)
     end
data/lib/upton/utils.rb
CHANGED
@@ -18,8 +18,7 @@ module Upton
     # present, is returned as the first row.
     ##
     def self.table(table_selector, deprecated=nil)
-      return Proc.new do |
-        html = ::Nokogiri::HTML(instance_html)
+      return Proc.new do |html|
         output = []
         headers = html.search(table_selector).css("th").map &:text
         output << headers
@@ -33,8 +32,7 @@ module Upton
     # Scrapes any set of HTML elements into an Array.
     ##
     def self.list(list_selector, deprecated=nil)
-      return Proc.new do |
-        html = ::Nokogiri::HTML(instance_html)
+      return Proc.new do |html|
         html.search(list_selector).map{|list_element| list_element.text }
       end
     end
data/lib/upton/version.rb
CHANGED
data/spec/upton_spec.rb
CHANGED
@@ -52,15 +52,14 @@ describe Upton do
     stub_request(:get, "www.example.com/sixfacts.html").
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    heds = propubscraper.scrape do |
-      doc
-      hed = doc.css('h1.article-title').text
+    heds = propubscraper.scrape do |doc|
+      doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     heds.should eql @headlines
@@ -87,14 +86,13 @@ describe Upton do
       to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
 
 
-    propubscraper = Upton::Scraper.
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica-relative.html", "section#river h1 a")
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    heds = propubscraper.scrape do |
-      doc = Nokogiri::HTML(article_str)
+    heds = propubscraper.scrape do |doc|
       hed = doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -105,7 +103,7 @@ describe Upton do
     stub_request(:get, "www.example.com/propublica.html").
       to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.
+    propubscraper = Upton::Scraper.instances(["http://www.example.com/propublica.html"])
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
@@ -120,7 +118,7 @@ describe Upton do
     stub_request(:get, "www.example.com/easttimor.html").
       to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.
+    propubscraper = Upton::Scraper.instances(["http://www.example.com/easttimor.html"])
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
@@ -139,8 +137,6 @@ describe Upton do
   it "should scrape paginated pages" do
     stub_request(:get, "www.example.com/propublica_search.html").
       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
-    stub_request(:get, "www.example.com/propublica_search.html?p=1").
-      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=2").
       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -153,17 +149,21 @@ describe Upton do
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
-    propubscraper = Upton::Scraper.
+    propubscraper = Upton::Scraper.index(
+      "http://www.example.com/propublica_search.html",
+      '.compact-list a.title-link',
+      {
+        :paginated => true,
+        :pagination_param => 'p',
+        :pagination_max_pages => 3,
+      }
+    )
     propubscraper.debug = true
     propubscraper.verbose = false
-    propubscraper.paginated = true
-    propubscraper.pagination_param = 'p'
-    propubscraper.pagination_max_pages = 3
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    results = propubscraper.scrape do |
-      doc = Nokogiri::HTML(article_str)
+    results = propubscraper.scrape do |doc|
       hed = doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -177,7 +177,7 @@ describe Upton do
 
   it "should sleep after requests with caching disabled" do
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.index_debug = false
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.should_receive(:sleep)
@@ -187,7 +187,7 @@ describe Upton do
   it "should sleep after uncached requests when caching is enabled" do
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.index_debug = true
     u.stash_folder = "test_stashes"
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
@@ -199,8 +199,6 @@ describe Upton do
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     stub_request(:get, "www.example.com/propublica_search.html").
       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
-    stub_request(:get, "www.example.com/propublica_search.html?p=1").
-      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=2").
       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -213,12 +211,15 @@ describe Upton do
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
-    u = Upton::Scraper.
+    u = Upton::Scraper.index("http://www.example.com/propublica_search.html", '.nonexistent',
+      {
+        :paginated => true,
+        :pagination_param => 'p',
+        :pagination_max_pages => 3,
+      }
+    )
     u.index_debug = false
     u.debug = false
-    u.paginated = true
-    u.pagination_param = 'p'
-    u.pagination_max_pages = 3
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.stash_folder = "test_stashes"
 
@@ -234,7 +235,7 @@ describe Upton do
     stub_request(:get, "www.example.com").
       to_return(:body => '', :status => 200)
 
-    u = Upton::Scraper.
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.sleep_time_between_requests = 0.0
     u.stash_folder = custom_cache_folder
     u.debug = true
@@ -245,6 +246,76 @@ describe Upton do
     expect(files).not_to be_empty
   end
 
+  it "should scrape in the basic case with the index method" do
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
+    propubscraper.debug = true
+    propubscraper.verbose = false
+    propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
+
+    heds = propubscraper.scrape do |doc|
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+
+  it "should allow instances to be set on a new Scraper" do
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+    propubscraper = Upton::Scraper.instances(["www.example.com/webinar.html",
+                                              "www.example.com/discussion.html",
+                                              "www.example.com/prosecutor.html",
+                                              "www.example.com/sixfacts.html"])
+
+    propubscraper.debug = true
+    propubscraper.verbose = false
+    propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
+
+    heds = propubscraper.scrape do |doc|
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+
+  it "should allow Scrapers to be added (indexes)" do
+    u = Upton::Scraper.index("http://www.example1.com", '.link')
+    w = Upton::Scraper.index("http://www.example2.com", '.link')
+    new_scraper = u + w
+    new_scraper.instance_variable_get(:@indexes).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+  end
+
+  it "should allow Scrapers to be added (instances)" do
+    pending
+    u = Upton::Scraper.instances(["http://www.example1.com"])
+    w = Upton::Scraper.instances(["http://www.example2.com"])
+    new_scraper = u + w
+    new_scraper.instance_variable_get(:@indexes).should eql []
+    new_scraper.instance_variable_get(:@instance_urls).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+  end
+
 
   before do
     Upton::Scraper.stub(:puts)
@@ -252,7 +323,7 @@ describe Upton do
 
   it "should be silent if verbose is false" do
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.sleep_time_between_requests = 0.0
     u.verbose = false
     u.should_not_receive(:puts)
metadata
CHANGED
@@ -1,119 +1,127 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 1.0.0.prea
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
-    - !ruby/object:Gem::Version
-      version: '1.6'
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.7
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
-    - !ruby/object:Gem::Version
-      version: '1.6'
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.7
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
   that's easy to use for debugging and doesn't hammer servers by default.
 email: jeremybmerrill@jeremybmerrill.com
@@ -122,22 +130,22 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
-- lib/upton/downloader.rb
 - lib/upton/scraper.rb
 - lib/upton/utils.rb
+- lib/upton/downloader.rb
 - lib/upton/version.rb
-- spec/data/
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
+- spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
-- spec/upton_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
@@ -148,30 +156,31 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
       version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>'
     - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.0.14
 signing_key:
 specification_version: 4
 summary: A simple web-scraping framework
 test_files:
 - spec/data/prosecutor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/easttimor.html
 - spec/data/discussion.html
+- spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/sixfacts.html
 - spec/data/propublica-relative.html
-- spec/data/easttimor.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
+has_rdoc: true