upton 0.3.6 → 1.0.0.prea
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/upton.rb +3 -329
- data/lib/upton/downloader.rb +1 -1
- data/lib/upton/scraper.rb +123 -76
- data/lib/upton/utils.rb +2 -4
- data/lib/upton/version.rb +1 -1
- data/spec/upton_spec.rb +98 -27
- metadata +50 -41
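
The headline change in this release is how a Scraper is built and what its block receives: 0.3.6 used `Upton::Scraper.new` and passed the block each page's raw HTML string, while 1.0.0.prea adds the class-method builders `Upton::Scraper.index` and `Upton::Scraper.instances` and passes a single parsed page object. A minimal before-and-after sketch; the URL and selector here are placeholders, not values from this diff:

    require 'upton'
    require 'nokogiri'

    # 0.3.6: the constructor takes an index URL (or an Array of URLs) and a
    # selector; the scrape block receives the raw HTML string plus its URL
    # and indexes, and must parse the HTML itself.
    scraper = Upton::Scraper.new("http://example.com/articles.html", "a.headline")
    heds = scraper.scrape do |html, url, index, pagination_index|
      Nokogiri::HTML(html).css("h1.article-title").text
    end

    # 1.0.0.prea: class methods build the scraper, and the block receives
    # one already-parsed document (see the updated specs below).
    scraper = Upton::Scraper.index("http://example.com/articles.html", "a.headline")
    heds = scraper.scrape { |doc| doc.css("h1.article-title").text }
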
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c446c20e57e387b365d9c5bcda546a1b48ebbcf1
+  data.tar.gz: a29ee1aa35b18a9324d504ae8e99e2a9bafcfb27
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 11d5e990c42441d5bf599952bf3d49289754f68da72a51b31d62c86281531960fc18adffb7e52c59fe37ac5e275c35e766ce922c9a1922294f032d2a5c7cbea7
+  data.tar.gz: a6cbe33126fe3506c2248d40677e88e3d4e545ec2dc3e6613b3d623232f2288eae1305f41ee1ce63a996ff64812558596a95f1b14dffb63bca71bd89562e9fe7
data/lib/upton.rb
CHANGED
@@ -1,10 +1,10 @@
 # encoding: UTF-8
 
-
-require 'uri'
-require 'restclient'
+require_relative 'upton/scraper'
 require_relative 'upton/utils'
+require_relative 'upton/version'
 require_relative 'upton/downloader'
+require_relative 'upton/scraper'
 
 ##
 # This module contains a scraper called Upton
@@ -22,332 +22,6 @@ module Upton
   # site's search page or a newspaper's homepage.
   # 2. Instance pages, which represent the goal of your scraping, e.g.
   # job listings or news articles.
-  #
-  # Upton::Scraper can be used as-is for basic use-cases by:
-  # 1. specifying the pages to be scraped in `new` as an index page
-  #    or as an Array of URLs.
-  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
-  #    block from Upton::Utils.
-  # For more complicated cases; subclass Upton::Scraper
-  #    e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
-  class Scraper
-    EMPTY_STRING = ''
-
-    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-                  :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
-                  :pagination_interval
-
-    ##
-    # This is the main user-facing method for a basic scraper.
-    # Call +scrape+ with a block; this block will be called on
-    # the text of each instance page, (and optionally, its URL and its index
-    # in the list of instance URLs returned by +get_index+).
-    ##
-    def scrape(&blk)
-      self.url_array = self.get_index unless self.url_array
-      blk = Proc.new{|x| x} if blk.nil?
-      self.scrape_from_list(self.url_array, blk)
-    end
-
-    ##
-    # +index_url_or_array+: A list of string URLs, OR
-    # the URL of the page containing the list of instances.
-    # +selector+: The XPath expression or CSS selector that specifies the
-    # anchor elements within the page, if a url is specified for
-    # the previous argument.
-    #
-    # These options are a shortcut. If you plan to override +get_index+, you
-    # do not need to set them.
-    # If you don't specify a selector, the first argument will be treated as a
-    # list of URLs.
-    ##
-    def initialize(index_url_or_array, selector="")
-
-      #if first arg is a valid URL, do already-written stuff;
-      #if it's not (or if it's a list?) don't bother with get_index, etc.
-      #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
-      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if index_url_or_array.respond_to? :each_with_index
-        @url_array = index_url_or_array
-      else
-        @index_url = index_url_or_array
-        @index_selector = selector
-      end
-
-      # If true, then Upton prints information about when it gets
-      # files from the internet and when it gets them from its stash.
-      @verbose = false
-
-      # If true, then Upton fetches each instance page only once
-      # future requests for that file are responded to with the locally stashed
-      # version.
-      # You may want to set @debug to false for production (but maybe not).
-      # You can also control stashing behavior on a per-call basis with the
-      # optional second argument to get_page, if, for instance, you want to
-      # stash certain instance pages, e.g. based on their modification date.
-      @debug = true
-      # Index debug does the same, but for index pages.
-      @index_debug = false
-
-      # In order to not hammer servers, Upton waits for, by default, 30
-      # seconds between requests to the remote server.
-      @sleep_time_between_requests = 30 #seconds
-
-      # If true, then Upton will attempt to scrape paginated index pages
-      @paginated = false
-      # Default query string parameter used to specify the current page
-      @pagination_param = 'page'
-      # Default number of paginated pages to scrape
-      @pagination_max_pages = 2
-      # Default starting number for pagination (second page is this plus 1).
-      @pagination_start_index = 1
-      # Default value to increment page number by
-      @pagination_interval = 1
-
-      # Folder name for stashes, if you want them to be stored somewhere else,
-      # e.g. under /tmp.
-      if @stash_folder
-        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
-      end
-    end
-
-    ##
-    # If instance pages are paginated, <b>you must override</b>
-    # this method to return the next URL, given the current URL and its index.
-    #
-    # If instance pages aren't paginated, there's no need to override this.
-    #
-    # Recursion stops if the fetching URL returns an empty string or an error.
-    #
-    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
-    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
-    ##
-    def next_instance_page_url(url, pagination_index)
-      EMPTY_STRING
-    end
-
-    ##
-    # Return the next URL to scrape, given the current URL and its index.
-    #
-    # Recursion stops if the fetching URL returns an empty string or an error.
-    #
-    # If @paginated is not set (the default), this method returns an empty string.
-    #
-    # If @paginated is set, this method will return the next pagination URL
-    # to scrape using @pagination_param and the pagination_index.
-    #
-    # If the pagination_index is greater than @pagination_max_pages, then the
-    # method will return an empty string.
-    #
-    # Override this method to handle pagination is an alternative way
-    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
-    # ought to return "http://whatever.com/articles?page=2"
-    #
-    ##
-    def next_index_page_url(url, pagination_index)
-      return url unless @paginated
-
-      if pagination_index > @pagination_max_pages
-        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
-        EMPTY_STRING
-      else
-        uri = URI.parse(url)
-        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
-        # update the pagination query string parameter
-        query[@pagination_param] = pagination_index
-        uri.query = URI.encode_www_form(query)
-        puts "Next index pagination url is #{uri}" if @verbose
-        uri.to_s
-      end
-    end
-
-    ##
-    # Writes the scraped result to a CSV at the given filename.
-    ##
-    def scrape_to_csv filename, &blk
-      require 'csv'
-      self.url_array = self.get_index unless self.url_array
-      CSV.open filename, 'wb' do |csv|
-        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          if document[0].respond_to? :map
-            document.each{|row| csv << row }
-          else
-            csv << document
-          end
-        end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
-      end
-    end
-
-    def scrape_to_tsv filename, &blk
-      require 'csv'
-      self.url_array = self.get_index unless self.url_array
-      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
-        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          if document[0].respond_to? :map
-            document.each{|row| csv << row }
-          else
-            csv << document
-          end
-        end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
-      end
-    end
-
-    protected
-
-    ##
-    # Handles getting pages with Downlader, which handles stashing.
-    ##
-    def get_page(url, stash=false, options={})
-      return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
-      global_options = {
-        :cache => stash,
-        :verbose => @verbose
-      }
-      if @readable_filenames
-        global_options[:readable_filenames] = true
-      end
-      if @stash_folder
-        global_options[:readable_filenames] = true
-        global_options[:cache_location] = @stash_folder
-      end
-      resp_and_cache = Downloader.new(url, global_options.merge(options)).get
-      if resp_and_cache[:from_resource]
-        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
-        sleep @sleep_time_between_requests
-      end
-      resp_and_cache[:resp]
-    end
-
-
-    ##
-    # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
-    # resolve_url resolves them to absolute urls.
-    # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
-    ##
-    def resolve_url(href_str, absolute_url_str)
-      if absolute_url_str.class <= URI::Generic
-        absolute_url = absolute_url_str.dup
-      else
-        begin
-          absolute_url = URI(absolute_url_str).dup
-        rescue URI::InvalidURIError
-          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
-        end
-      end
-      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
-      if href_str.class <= URI::Generic
-        href = href_str.dup
-      else
-        begin
-          href = URI(href_str).dup
-        rescue URI::InvalidURIError
-          raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
-        end
-      end
-
-      # return :href if :href is already absolute
-      return href.to_s if href.absolute?
-
-      #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url.to_s, href.to_s).to_s
-    end
-
-    ##
-    # Return a list of URLs for the instances you want to scrape.
-    # This can optionally be overridden if, for example, the list of instances
-    # comes from an API.
-    ##
-    def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
-    end
-
-    # TODO: Not sure the best way to handle this
-    # Currently, #parse_index is called upon #get_index_pages,
-    # which itself is dependent on @index_url
-    # Does @index_url stay unaltered for the lifetime of the Upton instance?
-    # It seems to at this point, but that may be something that gets
-    # deprecated later
-    #
-    # So for now, @index_url is used in conjunction with resolve_url
-    # to make sure that this method returns absolute urls
-    # i.e. this method expects @index_url to always have an absolute address
-    # for the lifetime of an Upton instance
-    def parse_index(text, selector)
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
-        href = a_element["href"]
-        resolved_url = resolve_url( href, @index_url) unless href.nil?
-        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
-        resolved_url
-      end
-    end
-
-
-    ##
-    # Returns the concatenated output of each member of a paginated index,
-    # e.g. a site listing links with 2+ pages.
-    ##
-    def get_index_pages(original_url, pagination_index, pagination_interval, options={})
-      resps = []
-      prev_url = nil
-      while resps.empty? || !resps.last.empty?
-        next_url = self.next_index_page_url(original_url, pagination_index)
-        break if next_url.empty?
-
-        next_url = resolve_url(next_url, original_url)
-        break if next_url == prev_url
-
-        next_resp = self.get_page(next_url, @index_debug, options).to_s
-        prev_url = next_url
-        pagination_index += pagination_interval
-        resps << next_resp
-      end
-      resps
-    end
-
-    ##
-    # Returns the instance at `url`.
-    #
-    # If the page is stashed, returns that, otherwise, fetches it from the web.
-    #
-    # If an instance is paginated, returns the concatenated output of each
-    # page, e.g. if a news article has two pages.
-    ##
-    def get_instance(url, pagination_index=0, options={})
-      resps = [self.get_page(url, @debug, options)]
-      pagination_index = pagination_index.to_i
-      prev_url = url
-      while !resps.last.empty?
-        next_url = self.next_instance_page_url(url, pagination_index + 1)
-        break if next_url == prev_url || next_url.empty?
-
-        next_resp = self.get_page(next_url, @debug, options)
-        prev_url = next_url
-        resps << next_resp
-      end
-      resps
-    end
-
-    # Just a helper for +scrape+.
-    def scrape_from_list(list, blk)
-      puts "Scraping #{list.size} instances" if @verbose
-      list.each_with_index.map do |instance_url, instance_index|
-        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
-        instance_resps.each_with_index.map do |instance_resp, pagination_index|
-          blk.call(instance_resp, instance_url, instance_index, pagination_index)
-        end
-      end.flatten(1)
-    end
-
-    # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
-    def slug(url)
-      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
-    end
 
-  end
 end
data/lib/upton/downloader.rb
CHANGED
@@ -103,7 +103,7 @@ module Upton
       msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
       resp_html = Nokogiri::HTML(resp)
       comment = Nokogiri::XML::Comment.new(resp_html, msg)
-      if resp_html.root.nil?
+      if resp_html.root.nil?
        return resp
       elsif resp_html.root.children.empty?
         resp_html.root.add_child(comment)
data/lib/upton/scraper.rb
CHANGED
@@ -1,9 +1,10 @@
 require 'uri'
 require 'nokogiri'
 require_relative './downloader'
+require_relative './page'
 
 module Upton
-
+  # Upton::Scraper can be used as-is for basic use-cases by:
   # 1. specifying the pages to be scraped in `new` as an index page
   # or as an Array of URLs.
   # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
@@ -14,9 +15,8 @@ module Upton
   class Scraper
     EMPTY_STRING = ''
 
-    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
-
-                  :pagination_interval
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
+                  :stash_folder, :readable_filenames
 
     ##
     # This is the main user-facing method for a basic scraper.
@@ -25,8 +25,8 @@ module Upton
     # in the list of instance URLs returned by +get_index+).
     ##
     def scrape(&blk)
-      self.url_array = self.get_index unless self.url_array
-      self.scrape_from_list(self.url_array, blk)
+      get_indexes!
+      self.scrape_from_list(@instance_urls, blk)
     end
 
     ##
@@ -41,23 +41,10 @@ module Upton
     # If you don't specify a selector, the first argument will be treated as a
     # list of URLs.
     ##
-    def initialize(index_url_or_array, selector="")
-
-      #if first arg is a valid URL, do already-written stuff;
-      #if it's not (or if it's a list?) don't bother with get_index, etc.
-      #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
-      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
-      if index_url_or_array.respond_to? :each_with_index
-        @url_array = index_url_or_array
-      else
-        @index_url = index_url_or_array
-        @index_selector = selector
-      end
-
+    def initialize(options={})
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
-      @verbose = false
+      @verbose = options[:verbose] || false
 
       # If true, then Upton fetches each instance page only once
       # future requests for that file are responded to with the locally stashed
@@ -66,29 +53,77 @@ module Upton
       # You can also control stashing behavior on a per-call basis with the
       # optional second argument to get_page, if, for instance, you want to
       # stash certain instance pages, e.g. based on their modification date.
-      @debug = true
+      @debug = options[:debug] || true
       # Index debug does the same, but for index pages.
-      @index_debug = false
+      @index_debug = options[:index_debug] || false
 
       # In order to not hammer servers, Upton waits for, by default, 30
       # seconds between requests to the remote server.
-      @sleep_time_between_requests = 30 #seconds
+      @sleep_time_between_requests = options[:sleep_time_between_requests] || 30 #seconds
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      if @stash_folder
+        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      end
+
+      @indexes = []
+      @instance_urls = []
+    end
+
+    def index(index_url, selector, options={})
+      # for future:
+      @indexes ||= []
 
+      ##
+      # Pagination options are per-index page
+      #
       # If true, then Upton will attempt to scrape paginated index pages
-      @paginated = false
+      options[:paginated] ||= false
       # Default query string parameter used to specify the current page
-      @pagination_param = 'page'
+      options[:pagination_param] ||= 'page'
       # Default number of paginated pages to scrape
-      @pagination_max_pages = 2
+      options[:pagination_max_pages] ||= 2
       # Default starting number for pagination (second page is this plus 1).
-      @pagination_start_index = 1
+      options[:pagination_start_index] ||= 1
       # Default value to increment page number by
-      @pagination_interval = 1
-
-      # Folder name for stashes, if you want them to be stored somewhere else,
-      # e.g. under /tmp.
-      if @stash_folder
-        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      options[:pagination_interval] ||= 1
+      ##
+
+      @indexes << [index_url, selector, options]
+      # and actually go scrape the index page, populate @instances
+      self
+    end
+
+    def self.index(index_url, selector, options={})
+      scraper = self.new
+      scraper.index(index_url, selector, options)
+      scraper
+    end
+
+    def self.instances(instances, options={})
+      s = self.new
+      s.instance_variable_set(:@instance_urls, instances)
+      s
+    end
+
+    # does
+    # def add_instances(urls)
+    #   #for future:
+    #   # @instances += urls
+    #   # @instances.uniq!
+    #   @instance_urls ||= []
+    #   @instance_urls += urls
+    #   @instance_urls.uniq!
+    # end
+
+    def instances(urls=nil)
+      if urls.nil?
+        @instance_urls
+      else
+        @instance_urls ||= []
+        @instance_urls += urls
+        self
       end
     end
 
@@ -125,21 +160,14 @@ module Upton
     # ought to return "http://whatever.com/articles?page=2"
     #
     ##
-    def next_index_page_url(url, pagination_index)
-      return url unless @paginated
-
-      if pagination_index > @pagination_max_pages
-        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
-        EMPTY_STRING
-      else
-        uri = URI.parse(url)
-        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
-        # update the pagination query string parameter
-        query[@pagination_param] = pagination_index
-        uri.query = URI.encode_www_form(query)
-        puts "Next index pagination url is #{uri}" if @verbose
-        uri.to_s
-      end
+    def next_index_page_url(url, pagination_param, pagination_index)
+      uri = URI.parse(url)
+      query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+      # update the pagination query string parameter
+      query[pagination_param] = pagination_index
+      uri.query = URI.encode_www_form(query)
+      puts "Next index pagination url is #{uri}" if @verbose
+      uri.to_s
     end
 
     ##
@@ -147,36 +175,46 @@ module Upton
     ##
     def scrape_to_csv filename, &blk
       require 'csv'
-      self.url_array = self.get_index unless self.url_array
+      self.get_indexes!
       CSV.open filename, 'wb' do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+        self.scrape_from_list(@instance_urls, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
             csv << document
           end
         end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+        #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
       end
     end
 
     def scrape_to_tsv filename, &blk
       require 'csv'
-      self.url_array = self.get_index unless self.url_array
+      get_indexes!
       CSV.open filename, 'wb', :col_sep => "\t" do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+        self.scrape_from_list(@instance_urls, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
             csv << document
           end
         end
-        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+        #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
       end
     end
 
+    def +(other_scraper)
+      raise ArgumentError, "#{other_scraper.class} can't be coerced into Upton::Scraper" unless other_scraper.class <= Upton::Scraper
+      new_scraper = Scraper.new
+      new_indexes = @indexes + other_scraper.instance_variable_get(:@indexes)
+      new_instances = @instance_urls + other_scraper.instance_variable_get(:@instance_urls)
+      new_scraper.instance_variable_set(:@indexes, new_indexes)
+      new_scraper.instance_variable_set(:@instance_urls, new_instances)
+      new_scraper
+    end
+
     protected
 
     ##
@@ -217,6 +255,8 @@ module Upton
           absolute_url = URI(absolute_url_str).dup
         rescue URI::InvalidURIError
           raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+        rescue ArgumentError
+          raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
         end
       end
       raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
@@ -237,15 +277,6 @@ module Upton
       URI.join(absolute_url.to_s, href.to_s).to_s
     end
 
-    ##
-    # Return a list of URLs for the instances you want to scrape.
-    # This can optionally be overridden if, for example, the list of instances
-    # comes from an API.
-    ##
-    def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
-    end
-
     # TODO: Not sure the best way to handle this
     # Currently, #parse_index is called upon #get_index_pages,
     # which itself is dependent on @index_url
@@ -253,30 +284,31 @@ module Upton
     # It seems to at this point, but that may be something that gets
     # deprecated later
     #
-    # So for now, @index_url is used in conjunction with resolve_url
+    # So for now, index_url is used in conjunction with resolve_url
     # to make sure that this method returns absolute urls
-    # i.e. this method expects @index_url to always have an absolute address
-    # for the lifetime of an Upton instance
-    def parse_index(text, selector)
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
-        href = a_element["href"]
-        resolved_url = resolve_url( href, @index_url) unless href.nil?
+    def parse_index(text, selector, index_url)
+      Nokogiri::HTML(text).search(selector).to_a.map do |anchor|
+        href = anchor["href"]
+        resolved_url = resolve_url( href, index_url) unless href.nil?
         puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
         resolved_url
       end
     end
 
-
     ##
     # Returns the concatenated output of each member of a paginated index,
     # e.g. a site listing links with 2+ pages.
     ##
-    def get_index_pages(url, pagination_index, pagination_interval, options={})
+    def get_index_pages(url, pagination_index, options={})
       resps = [self.get_page(url, @index_debug, options)]
+      return resps unless options[:paginated]
+
       prev_url = url
       while !resps.last.empty?
-        pagination_index += pagination_interval
-        next_url = self.next_index_page_url(url, pagination_index)
+        pagination_index += options[:pagination_interval]
+        break if pagination_index > options[:pagination_max_pages]
+
+        next_url = self.next_index_page_url(url, options[:pagination_param], pagination_index)
         next_url = resolve_url(next_url, url)
         break if next_url == prev_url || next_url.empty?
 
@@ -310,13 +342,28 @@ module Upton
       resps
     end
 
+    ##
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    ##
+    def get_indexes!
+      @indexes.each do |index_url, index_selector, options|
+        #TODO: cope with pagination stuff per URL
+
+        @instance_urls += get_index_pages(index_url, options[:pagination_start_index], options).map{|page| parse_index(page, index_selector, index_url) }.flatten
+      end
+    end
+
+
     # Just a helper for +scrape+.
     def scrape_from_list(list, blk)
       puts "Scraping #{list.size} instances" if @verbose
       list.each_with_index.map do |instance_url, instance_index|
         instance_resps = get_instance instance_url, nil, :instance_index => instance_index
         instance_resps.each_with_index.map do |instance_resp, pagination_index|
-          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+          page = Page.new(instance_resp, instance_url, instance_index, pagination_index)
+          blk.call(page)
         end
       end.flatten(1)
     end
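
Two parts of the new scraper.rb API above are easier to read outside diff form: pagination is now configured per index through the options hash of `index` (defaulting to the `page` parameter, a maximum of 2 pages, start index 1, interval 1), and scrapers compose with `+`, which concatenates both scrapers' index definitions and instance URL lists into a new Scraper. A sketch under those assumptions; the URLs and selectors are placeholders:

    require 'upton'

    # Per-index pagination options override the defaults listed above:
    search = Upton::Scraper.index(
      "http://example.com/search.html",
      ".results a.title-link",
      { :paginated => true, :pagination_param => "p", :pagination_max_pages => 3 }
    )

    # `+` merges @indexes and @instance_urls into a fresh Scraper:
    extras   = Upton::Scraper.instances(["http://example.com/one-off.html"])
    combined = search + extras
    combined.sleep_time_between_requests = 5
    results  = combined.scrape { |doc| doc.css("h1.article-title").text }
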
data/lib/upton/utils.rb
CHANGED
@@ -18,8 +18,7 @@ module Upton
     # present, is returned as the first row.
     ##
     def self.table(table_selector, deprecated=nil)
-      return Proc.new do |instance_html|
-        html = ::Nokogiri::HTML(instance_html)
+      return Proc.new do |html|
         output = []
         headers = html.search(table_selector).css("th").map &:text
         output << headers
@@ -33,8 +32,7 @@ module Upton
     # Scrapes any set of HTML elements into an Array.
     ##
     def self.list(list_selector, deprecated=nil)
-      return Proc.new do |instance_html|
-        html = ::Nokogiri::HTML(instance_html)
+      return Proc.new do |html|
         html.search(list_selector).map{|list_element| list_element.text }
       end
     end
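
The Utils change tracks the Scraper change: the Procs returned by `Utils.table` and `Utils.list` no longer parse the raw HTML themselves but expect an already-parsed document, so they can be handed straight to `scrape` or `scrape_to_csv` as the block. A sketch, assuming the new Page object answers Nokogiri-style `search` calls as the specs' `doc.css` usage suggests; the URL, selector, and filename are placeholders:

    require 'upton'

    scraper = Upton::Scraper.instances(["http://example.com/report.html"])
    # Utils.table returns a Proc; scrape_to_csv calls it on each page and
    # writes the resulting rows to the named CSV file.
    scraper.scrape_to_csv("report.csv", &Upton::Utils.table("table#data"))
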
data/lib/upton/version.rb
CHANGED
data/spec/upton_spec.rb
CHANGED
@@ -52,15 +52,14 @@ describe Upton do
     stub_request(:get, "www.example.com/sixfacts.html").
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    heds = propubscraper.scrape do |article_str|
-      doc = Nokogiri::HTML(article_str)
-      hed = doc.css('h1.article-title').text
+    heds = propubscraper.scrape do |doc|
+      doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     heds.should eql @headlines
@@ -87,14 +86,13 @@ describe Upton do
       to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
 
 
-    propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica-relative.html", "section#river h1 a")
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    heds = propubscraper.scrape do |article_str|
-      doc = Nokogiri::HTML(article_str)
+    heds = propubscraper.scrape do |doc|
       hed = doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -105,7 +103,7 @@ describe Upton do
     stub_request(:get, "www.example.com/propublica.html").
       to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
+    propubscraper = Upton::Scraper.instances(["http://www.example.com/propublica.html"])
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
@@ -120,7 +118,7 @@ describe Upton do
     stub_request(:get, "www.example.com/easttimor.html").
       to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
 
-    propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
+    propubscraper = Upton::Scraper.instances(["http://www.example.com/easttimor.html"])
     propubscraper.debug = true
     propubscraper.verbose = false
     propubscraper.sleep_time_between_requests = 0
@@ -139,8 +137,6 @@ describe Upton do
   it "should scrape paginated pages" do
     stub_request(:get, "www.example.com/propublica_search.html").
       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
-    stub_request(:get, "www.example.com/propublica_search.html?p=1").
-      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=2").
       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -153,17 +149,21 @@ describe Upton do
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
-    propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
+    propubscraper = Upton::Scraper.index(
+      "http://www.example.com/propublica_search.html",
+      '.compact-list a.title-link',
+      {
+        :paginated => true,
+        :pagination_param => 'p',
+        :pagination_max_pages => 3,
+      }
+    )
     propubscraper.debug = true
     propubscraper.verbose = false
-    propubscraper.paginated = true
-    propubscraper.pagination_param = 'p'
-    propubscraper.pagination_max_pages = 3
     propubscraper.sleep_time_between_requests = 0
     propubscraper.stash_folder = "test_stashes"
 
-    results = propubscraper.scrape do |article_str|
-      doc = Nokogiri::HTML(article_str)
+    results = propubscraper.scrape do |doc|
       hed = doc.css('h1.article-title').text
     end
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -177,7 +177,7 @@ describe Upton do
 
   it "should sleep after requests with caching disabled" do
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.index_debug = false
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.should_receive(:sleep)
@@ -187,7 +187,7 @@ describe Upton do
   it "should sleep after uncached requests when caching is enabled" do
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.index_debug = true
     u.stash_folder = "test_stashes"
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
@@ -199,8 +199,6 @@ describe Upton do
     FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
     stub_request(:get, "www.example.com/propublica_search.html").
       to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
-    stub_request(:get, "www.example.com/propublica_search.html?p=1").
-      to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=2").
       to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
     stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -213,12 +211,15 @@ describe Upton do
       to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
-    u = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.nonexistent')
+    u = Upton::Scraper.index("http://www.example.com/propublica_search.html", '.nonexistent',
+      {
+        :paginated => true,
+        :pagination_param => 'p',
+        :pagination_max_pages => 3,
+      }
+    )
     u.index_debug = false
     u.debug = false
-    u.paginated = true
-    u.pagination_param = 'p'
-    u.pagination_max_pages = 3
     u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
     u.stash_folder = "test_stashes"
 
@@ -234,7 +235,7 @@ describe Upton do
     stub_request(:get, "www.example.com").
       to_return(:body => '', :status => 200)
 
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.sleep_time_between_requests = 0.0
     u.stash_folder = custom_cache_folder
     u.debug = true
@@ -245,6 +246,76 @@ describe Upton do
     expect(files).not_to be_empty
   end
 
+  it "should scrape in the basic case with the index method" do
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+    propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
+    propubscraper.debug = true
+    propubscraper.verbose = false
+    propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
+
+    heds = propubscraper.scrape do |doc|
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+
+  it "should allow instances to be set on a new Scraper" do
+    stub_request(:get, "www.example.com/propublica.html").
+      to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+    stub_request(:get, "www.example.com/discussion.html").
+      to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+    stub_request(:get, "www.example.com/prosecutor.html").
+      to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+    stub_request(:get, "www.example.com/webinar.html").
+      to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+    stub_request(:get, "www.example.com/sixfacts.html").
+      to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+    propubscraper = Upton::Scraper.instances(["www.example.com/webinar.html",
+                                              "www.example.com/discussion.html",
+                                              "www.example.com/prosecutor.html",
+                                              "www.example.com/sixfacts.html"])
+
+    propubscraper.debug = true
+    propubscraper.verbose = false
+    propubscraper.sleep_time_between_requests = 0
+    propubscraper.stash_folder = "test_stashes"
+
+    heds = propubscraper.scrape do |doc|
+      hed = doc.css('h1.article-title').text
+    end
+    FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+    heds.should eql @headlines
+  end
+
+  it "should allow Scrapers to be added (indexes)" do
+    u = Upton::Scraper.index("http://www.example1.com", '.link')
+    w = Upton::Scraper.index("http://www.example2.com", '.link')
+    new_scraper = u + w
+    new_scraper.instance_variable_get(:@indexes).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+  end
+
+  it "should allow Scrapers to be added (instances)" do
+    pending
+    u = Upton::Scraper.instances(["http://www.example1.com"])
+    w = Upton::Scraper.instances(["http://www.example2.com"])
+    new_scraper = u + w
+    new_scraper.instance_variable_get(:@indexes).should eql []
+    new_scraper.instance_variable_get(:@instance_urls).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+  end
+
 
   before do
     Upton::Scraper.stub(:puts)
@@ -252,7 +323,7 @@ describe Upton do
 
   it "should be silent if verbose is false" do
     stub_request(:get, "www.example.com")
-    u = Upton::Scraper.new("http://www.example.com", '.whatever')
+    u = Upton::Scraper.index("http://www.example.com", '.whatever')
     u.sleep_time_between_requests = 0.0
     u.verbose = false
     u.should_not_receive(:puts)
metadata
CHANGED
@@ -1,119 +1,127 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.3.6
+  version: 1.0.0.prea
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.1
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
-      - !ruby/object:Gem::Version
-        version: '1.6'
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.7
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
-      - !ruby/object:Gem::Version
-        version: '1.6'
-    - - "~>"
+    - - ~>
       - !ruby/object:Gem::Version
-        version:
+        version: 1.6.7
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
-        version: '
+        version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
   that's easy to use for debugging and doesn't hammer servers by default.
 email: jeremybmerrill@jeremybmerrill.com
@@ -122,22 +130,22 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
-- lib/upton/downloader.rb
 - lib/upton/scraper.rb
 - lib/upton/utils.rb
+- lib/upton/downloader.rb
 - lib/upton/version.rb
-- spec/data/
+- spec/data/prosecutor.html
 - spec/data/easttimor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/discussion.html
 - spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/
-- spec/data/sixfacts.html
+- spec/data/propublica-relative.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
+- spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
-- spec/upton_spec.rb
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
@@ -148,30 +156,31 @@ require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
       version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - -
+  - - '>'
     - !ruby/object:Gem::Version
-      version:
+      version: 1.3.1
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.0.14
 signing_key:
 specification_version: 4
 summary: A simple web-scraping framework
 test_files:
 - spec/data/prosecutor.html
-- spec/data/
-- spec/data/propublica.html
+- spec/data/easttimor.html
 - spec/data/discussion.html
+- spec/data/propublica_search.html
 - spec/data/propublica_search_page_2.html
-- spec/data/sixfacts.html
 - spec/data/propublica-relative.html
-- spec/data/easttimor.html
 - spec/data/webinar.html
+- spec/data/propublica.html
+- spec/data/sixfacts.html
 - spec/upton_spec.rb
 - spec/spec_helper.rb
 - spec/upton_downloader_spec.rb
+has_rdoc: true