upton 0.3.6 → 1.0.0.prea

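The headline change in this release is the Scraper construction API: the positional Scraper.new(index_url, selector) constructor gives way to the chainable class methods Scraper.index and Scraper.instances, and scrape blocks receive a parsed page object rather than a raw HTML string. A minimal before/after sketch based on the spec changes below (the URL and selector here are illustrative, not from the gem):

    # 0.3.6
    scraper = Upton::Scraper.new("http://example.com/archive.html", "a.story")
    heds = scraper.scrape do |article_str|
      Nokogiri::HTML(article_str).css("h1.article-title").text
    end

    # 1.0.0.prea
    scraper = Upton::Scraper.index("http://example.com/archive.html", "a.story")
    heds = scraper.scrape { |doc| doc.css("h1.article-title").text }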
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: dc480dd21a06c69b7a337b17ed01f95fa262d33b
- data.tar.gz: f5f5f1c99471d884a585e0098c8d3ca975fe8c8e
+ metadata.gz: c446c20e57e387b365d9c5bcda546a1b48ebbcf1
+ data.tar.gz: a29ee1aa35b18a9324d504ae8e99e2a9bafcfb27
  SHA512:
- metadata.gz: 7bda1a3ee82d668b3d966b50ba7bff58133e807d9da88e988288be25bdcb26956917216a9925a91275c3978906b31eaf4ee9a5dba1694bbeb9811b6912339771
- data.tar.gz: 121c8c524f56f41c24e6f9cb9fb9584d9b0d988bf0c1c4a163b63b539dfd44f8e969ccd41efa45f96b4da41ed4c3963c87b6f5e708ef20b31c0e518f7d850dff
+ metadata.gz: 11d5e990c42441d5bf599952bf3d49289754f68da72a51b31d62c86281531960fc18adffb7e52c59fe37ac5e275c35e766ce922c9a1922294f032d2a5c7cbea7
+ data.tar.gz: a6cbe33126fe3506c2248d40677e88e3d4e545ec2dc3e6613b3d623232f2288eae1305f41ee1ce63a996ff64812558596a95f1b14dffb63bca71bd89562e9fe7
data/lib/upton.rb CHANGED
@@ -1,10 +1,10 @@
  # encoding: UTF-8
 
- require 'nokogiri'
- require 'uri'
- require 'restclient'
+ require_relative 'upton/scraper'
  require_relative 'upton/utils'
+ require_relative 'upton/version'
  require_relative 'upton/downloader'
+ require_relative 'upton/scraper'
 
  ##
  # This module contains a scraper called Upton
@@ -22,332 +22,6 @@ module Upton
  # site's search page or a newspaper's homepage.
  # 2. Instance pages, which represent the goal of your scraping, e.g.
  # job listings or news articles.
- #
- # Upton::Scraper can be used as-is for basic use-cases by:
- # 1. specifying the pages to be scraped in `new` as an index page
- # or as an Array of URLs.
- # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
- # block from Upton::Utils.
- # For more complicated cases; subclass Upton::Scraper
- # e.g. +MyScraper < Upton::Scraper+ and override various methods.
  ##
- class Scraper
- EMPTY_STRING = ''
-
- attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
- :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
- :pagination_interval
-
- ##
- # This is the main user-facing method for a basic scraper.
- # Call +scrape+ with a block; this block will be called on
- # the text of each instance page, (and optionally, its URL and its index
- # in the list of instance URLs returned by +get_index+).
- ##
- def scrape(&blk)
- self.url_array = self.get_index unless self.url_array
- blk = Proc.new{|x| x} if blk.nil?
- self.scrape_from_list(self.url_array, blk)
- end
-
- ##
- # +index_url_or_array+: A list of string URLs, OR
- # the URL of the page containing the list of instances.
- # +selector+: The XPath expression or CSS selector that specifies the
- # anchor elements within the page, if a url is specified for
- # the previous argument.
- #
- # These options are a shortcut. If you plan to override +get_index+, you
- # do not need to set them.
- # If you don't specify a selector, the first argument will be treated as a
- # list of URLs.
- ##
- def initialize(index_url_or_array, selector="")
-
- #if first arg is a valid URL, do already-written stuff;
- #if it's not (or if it's a list?) don't bother with get_index, etc.
- #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
- #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
- if index_url_or_array.respond_to? :each_with_index
- @url_array = index_url_or_array
- else
- @index_url = index_url_or_array
- @index_selector = selector
- end
-
- # If true, then Upton prints information about when it gets
- # files from the internet and when it gets them from its stash.
- @verbose = false
-
- # If true, then Upton fetches each instance page only once
- # future requests for that file are responded to with the locally stashed
- # version.
- # You may want to set @debug to false for production (but maybe not).
- # You can also control stashing behavior on a per-call basis with the
- # optional second argument to get_page, if, for instance, you want to
- # stash certain instance pages, e.g. based on their modification date.
- @debug = true
- # Index debug does the same, but for index pages.
- @index_debug = false
-
- # In order to not hammer servers, Upton waits for, by default, 30
- # seconds between requests to the remote server.
- @sleep_time_between_requests = 30 #seconds
-
- # If true, then Upton will attempt to scrape paginated index pages
- @paginated = false
- # Default query string parameter used to specify the current page
- @pagination_param = 'page'
- # Default number of paginated pages to scrape
- @pagination_max_pages = 2
- # Default starting number for pagination (second page is this plus 1).
- @pagination_start_index = 1
- # Default value to increment page number by
- @pagination_interval = 1
-
- # Folder name for stashes, if you want them to be stored somewhere else,
- # e.g. under /tmp.
- if @stash_folder
- FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
- end
- end
-
- ##
- # If instance pages are paginated, <b>you must override</b>
- # this method to return the next URL, given the current URL and its index.
- #
- # If instance pages aren't paginated, there's no need to override this.
- #
- # Recursion stops if the fetching URL returns an empty string or an error.
- #
- # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
- # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
- ##
- def next_instance_page_url(url, pagination_index)
- EMPTY_STRING
- end
-
- ##
- # Return the next URL to scrape, given the current URL and its index.
- #
- # Recursion stops if the fetching URL returns an empty string or an error.
- #
- # If @paginated is not set (the default), this method returns an empty string.
- #
- # If @paginated is set, this method will return the next pagination URL
- # to scrape using @pagination_param and the pagination_index.
- #
- # If the pagination_index is greater than @pagination_max_pages, then the
- # method will return an empty string.
- #
- # Override this method to handle pagination is an alternative way
- # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
- # ought to return "http://whatever.com/articles?page=2"
- #
- ##
- def next_index_page_url(url, pagination_index)
- return url unless @paginated
-
- if pagination_index > @pagination_max_pages
- puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
- EMPTY_STRING
- else
- uri = URI.parse(url)
- query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
- # update the pagination query string parameter
- query[@pagination_param] = pagination_index
- uri.query = URI.encode_www_form(query)
- puts "Next index pagination url is #{uri}" if @verbose
- uri.to_s
- end
- end
-
- ##
- # Writes the scraped result to a CSV at the given filename.
- ##
- def scrape_to_csv filename, &blk
- require 'csv'
- self.url_array = self.get_index unless self.url_array
- CSV.open filename, 'wb' do |csv|
- #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- if document[0].respond_to? :map
- document.each{|row| csv << row }
- else
- csv << document
- end
- end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
- end
- end
-
- def scrape_to_tsv filename, &blk
- require 'csv'
- self.url_array = self.get_index unless self.url_array
- CSV.open filename, 'wb', :col_sep => "\t" do |csv|
- #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- if document[0].respond_to? :map
- document.each{|row| csv << row }
- else
- csv << document
- end
- end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
- end
- end
-
- protected
-
- ##
- # Handles getting pages with Downlader, which handles stashing.
- ##
- def get_page(url, stash=false, options={})
- return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
- global_options = {
- :cache => stash,
- :verbose => @verbose
- }
- if @readable_filenames
- global_options[:readable_filenames] = true
- end
- if @stash_folder
- global_options[:readable_filenames] = true
- global_options[:cache_location] = @stash_folder
- end
- resp_and_cache = Downloader.new(url, global_options.merge(options)).get
- if resp_and_cache[:from_resource]
- puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
- sleep @sleep_time_between_requests
- end
- resp_and_cache[:resp]
- end
-
-
- ##
- # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
- # resolve_url resolves them to absolute urls.
- # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
- ##
- def resolve_url(href_str, absolute_url_str)
- if absolute_url_str.class <= URI::Generic
- absolute_url = absolute_url_str.dup
- else
- begin
- absolute_url = URI(absolute_url_str).dup
- rescue URI::InvalidURIError
- raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
- end
- end
- raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
- if href_str.class <= URI::Generic
- href = href_str.dup
- else
- begin
- href = URI(href_str).dup
- rescue URI::InvalidURIError
- raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
- end
- end
-
- # return :href if :href is already absolute
- return href.to_s if href.absolute?
-
- #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
- URI.join(absolute_url.to_s, href.to_s).to_s
- end
-
- ##
- # Return a list of URLs for the instances you want to scrape.
- # This can optionally be overridden if, for example, the list of instances
- # comes from an API.
- ##
- def get_index
- index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
- end
-
- # TODO: Not sure the best way to handle this
- # Currently, #parse_index is called upon #get_index_pages,
- # which itself is dependent on @index_url
- # Does @index_url stay unaltered for the lifetime of the Upton instance?
- # It seems to at this point, but that may be something that gets
- # deprecated later
- #
- # So for now, @index_url is used in conjunction with resolve_url
- # to make sure that this method returns absolute urls
- # i.e. this method expects @index_url to always have an absolute address
- # for the lifetime of an Upton instance
- def parse_index(text, selector)
- Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
- href = a_element["href"]
- resolved_url = resolve_url( href, @index_url) unless href.nil?
- puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
- resolved_url
- end
- end
-
-
- ##
- # Returns the concatenated output of each member of a paginated index,
- # e.g. a site listing links with 2+ pages.
- ##
- def get_index_pages(original_url, pagination_index, pagination_interval, options={})
- resps = []
- prev_url = nil
- while resps.empty? || !resps.last.empty?
- next_url = self.next_index_page_url(original_url, pagination_index)
- break if next_url.empty?
-
- next_url = resolve_url(next_url, original_url)
- break if next_url == prev_url
-
- next_resp = self.get_page(next_url, @index_debug, options).to_s
- prev_url = next_url
- pagination_index += pagination_interval
- resps << next_resp
- end
- resps
- end
-
- ##
- # Returns the instance at `url`.
- #
- # If the page is stashed, returns that, otherwise, fetches it from the web.
- #
- # If an instance is paginated, returns the concatenated output of each
- # page, e.g. if a news article has two pages.
- ##
- def get_instance(url, pagination_index=0, options={})
- resps = [self.get_page(url, @debug, options)]
- pagination_index = pagination_index.to_i
- prev_url = url
- while !resps.last.empty?
- next_url = self.next_instance_page_url(url, pagination_index + 1)
- break if next_url == prev_url || next_url.empty?
-
- next_resp = self.get_page(next_url, @debug, options)
- prev_url = next_url
- resps << next_resp
- end
- resps
- end
-
- # Just a helper for +scrape+.
- def scrape_from_list(list, blk)
- puts "Scraping #{list.size} instances" if @verbose
- list.each_with_index.map do |instance_url, instance_index|
- instance_resps = get_instance instance_url, nil, :instance_index => instance_index
- instance_resps.each_with_index.map do |instance_resp, pagination_index|
- blk.call(instance_resp, instance_url, instance_index, pagination_index)
- end
- end.flatten(1)
- end
-
- # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
- def slug(url)
- url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
- end
- end
 
- end
  end
data/lib/upton/downloader.rb CHANGED
@@ -103,7 +103,7 @@ module Upton
  msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
  resp_html = Nokogiri::HTML(resp)
  comment = Nokogiri::XML::Comment.new(resp_html, msg)
- if resp_html.root.nil? || !resp_html.include?("<html")
+ if resp_html.root.nil?
  return resp
  elsif resp_html.root.children.empty?
  resp_html.root.add_child(comment)
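With the `!resp_html.include?("<html")` clause gone, the raw response is returned unmodified only when Nokogiri finds no root element at all. A rough sketch of the comment construction shown above, with made-up response markup (where the comment is attached in a non-empty document is handled outside this hunk):

    require 'nokogiri'

    resp_html = Nokogiri::HTML("<html><body><p>Six facts</p></body></html>")
    msg = "Stashed file retrieved by Upton 1.0.0.prea from http://example.com at #{Time.now}"
    comment = Nokogiri::XML::Comment.new(resp_html, msg) # an HTML comment node
    resp_html.root.add_child(comment) unless resp_html.root.nil?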
data/lib/upton/scraper.rb CHANGED
@@ -1,9 +1,10 @@
  require 'uri'
  require 'nokogiri'
  require_relative './downloader'
+ require_relative './page'
 
  module Upton
- # Upton::Scraper can be used as-is for basic use-cases by:
+ # Upton::Scraper can be used as-is for basic use-cases by:
  # 1. specifying the pages to be scraped in `new` as an index page
  # or as an Array of URLs.
  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
@@ -14,9 +15,8 @@ module Upton
  class Scraper
  EMPTY_STRING = ''
 
- attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
- :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
- :pagination_interval
+ attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
+ :stash_folder, :readable_filenames
 
  ##
  # This is the main user-facing method for a basic scraper.
@@ -25,8 +25,8 @@ module Upton
  # in the list of instance URLs returned by +get_index+).
  ##
  def scrape(&blk)
- self.url_array = self.get_index unless self.url_array
- self.scrape_from_list(self.url_array, blk)
+ get_indexes!
+ self.scrape_from_list(@instance_urls, blk)
  end
 
  ##
@@ -41,23 +41,10 @@ module Upton
  # If you don't specify a selector, the first argument will be treated as a
  # list of URLs.
  ##
- def initialize(index_url_or_array, selector="")
-
- #if first arg is a valid URL, do already-written stuff;
- #if it's not (or if it's a list?) don't bother with get_index, etc.
- #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
- #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
- if index_url_or_array.respond_to? :each_with_index
- @url_array = index_url_or_array
- else
- @index_url = index_url_or_array
- @index_selector = selector
- end
-
+ def initialize(options={})
  # If true, then Upton prints information about when it gets
  # files from the internet and when it gets them from its stash.
- @verbose = false
+ @verbose = options[:verbose] || false
 
  # If true, then Upton fetches each instance page only once
  # future requests for that file are responded to with the locally stashed
@@ -66,29 +53,77 @@ module Upton
  # You can also control stashing behavior on a per-call basis with the
  # optional second argument to get_page, if, for instance, you want to
  # stash certain instance pages, e.g. based on their modification date.
- @debug = true
+ @debug = options[:debug] || true
  # Index debug does the same, but for index pages.
- @index_debug = false
+ @index_debug = options[:index_debug] || false
 
  # In order to not hammer servers, Upton waits for, by default, 30
  # seconds between requests to the remote server.
- @sleep_time_between_requests = 30 #seconds
+ @sleep_time_between_requests = options[:sleep_time_between_requests] || 30 #seconds
+
+ # Folder name for stashes, if you want them to be stored somewhere else,
+ # e.g. under /tmp.
+ if @stash_folder
+ FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+ end
+
+ @indexes = []
+ @instance_urls = []
+ end
+
+ def index(index_url, selector, options={})
+ # for future:
+ @indexes ||= []
 
+ ##
+ # Pagination options are per-index page
+ #
  # If true, then Upton will attempt to scrape paginated index pages
- @paginated = false
+ options[:paginated] ||= false
  # Default query string parameter used to specify the current page
- @pagination_param = 'page'
+ options[:pagination_param] ||= 'page'
  # Default number of paginated pages to scrape
- @pagination_max_pages = 2
+ options[:pagination_max_pages] ||= 2
  # Default starting number for pagination (second page is this plus 1).
- @pagination_start_index = 1
+ options[:pagination_start_index] ||= 1
  # Default value to increment page number by
- @pagination_interval = 1
-
- # Folder name for stashes, if you want them to be stored somewhere else,
- # e.g. under /tmp.
- if @stash_folder
- FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+ options[:pagination_interval] ||= 1
+ ##
+
+ @indexes << [index_url, selector, options]
+ # and actually go scrape the index page, populate @instances
+ self
+ end
+
+ def self.index(index_url, selector, options={})
+ scraper = self.new
+ scraper.index(index_url, selector, options)
+ scraper
+ end
+
+ def self.instances(instances, options={})
+ s = self.new
+ s.instance_variable_set(:@instance_urls, instances)
+ s
+ end
+
+ # does
+ # def add_instances(urls)
+ # #for future:
+ # # @instances += urls
+ # # @instances.uniq!
+ # @instance_urls ||= []
+ # @instance_urls += urls
+ # @instance_urls.uniq!
+ # end
+
+ def instances(urls=nil)
+ if urls.nil?
+ @instance_urls
+ else
+ @instance_urls ||= []
+ @instance_urls += urls
+ self
  end
  end
 
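Pagination settings now travel with each index definition instead of living on the scraper, and both index and instances(urls) return self, so definitions chain. A sketch with hypothetical URLs:

    scraper = Upton::Scraper.index(
      "http://example.com/search",
      "a.result-link",
      :paginated => true,
      :pagination_param => "p",
      :pagination_max_pages => 3
    ).instances(["http://example.com/one-off-article.html"])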
@@ -125,21 +160,14 @@ module Upton
  # ought to return "http://whatever.com/articles?page=2"
  #
  ##
- def next_index_page_url(url, pagination_index)
- return EMPTY_STRING unless @paginated
-
- if pagination_index > @pagination_max_pages
- puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
- EMPTY_STRING
- else
- uri = URI.parse(url)
- query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
- # update the pagination query string parameter
- query[@pagination_param] = pagination_index
- uri.query = URI.encode_www_form(query)
- puts "Next index pagination url is #{uri}" if @verbose
- uri.to_s
- end
+ def next_index_page_url(url, pagination_param, pagination_index)
+ uri = URI.parse(url)
+ query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+ # update the pagination query string parameter
+ query[pagination_param] = pagination_index
+ uri.query = URI.encode_www_form(query)
+ puts "Next index pagination url is #{uri}" if @verbose
+ uri.to_s
  end
 
  ##
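Stripped of the @paginated guard and the page limit (both checks move into get_index_pages below), next_index_page_url becomes a pure URL rewriter that takes the pagination parameter explicitly. With a hypothetical URL:

    scraper = Upton::Scraper.new
    scraper.next_index_page_url("http://example.com/articles?p=1", "p", 2)
    # => "http://example.com/articles?p=2"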
@@ -147,36 +175,46 @@ module Upton
  ##
  def scrape_to_csv filename, &blk
  require 'csv'
- self.url_array = self.get_index unless self.url_array
+ self.get_indexes!
  CSV.open filename, 'wb' do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
+ self.scrape_from_list(@instance_urls, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
  csv << document
  end
  end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+ #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
  end
  end
 
  def scrape_to_tsv filename, &blk
  require 'csv'
- self.url_array = self.get_index unless self.url_array
+ get_indexes!
  CSV.open filename, 'wb', :col_sep => "\t" do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
+ self.scrape_from_list(@instance_urls, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
  csv << document
  end
  end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+ #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
  end
  end
 
+ def +(other_scraper)
+ raise ArgumentError, "#{other_scraper.class} can't be coerced into Upton::Scraper" unless other_scraper.class <= Upton::Scraper
+ new_scraper = Scraper.new
+ new_indexes = @indexes + other_scraper.instance_variable_get(:@indexes)
+ new_instances = @instance_urls + other_scraper.instance_variable_get(:@instance_urls)
+ new_scraper.instance_variable_set(:@indexes, new_indexes)
+ new_scraper.instance_variable_set(:@instance_urls, new_instances)
+ new_scraper
+ end
+
  protected
 
  ##
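The new + operator concatenates both the pending index definitions and the instance URL lists of its operands into a fresh scraper, so crawls can be composed. With hypothetical URLs:

    front  = Upton::Scraper.index("http://example1.com", ".link")
    search = Upton::Scraper.index("http://example2.com", ".link")
    combined = front + search # one scraper covering both indexes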
@@ -217,6 +255,8 @@ module Upton
  absolute_url = URI(absolute_url_str).dup
  rescue URI::InvalidURIError
  raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+ rescue ArgumentError
+ raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
  end
  end
  raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
@@ -237,15 +277,6 @@ module Upton
  URI.join(absolute_url.to_s, href.to_s).to_s
  end
 
- ##
- # Return a list of URLs for the instances you want to scrape.
- # This can optionally be overridden if, for example, the list of instances
- # comes from an API.
- ##
- def get_index
- index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
- end
-
  # TODO: Not sure the best way to handle this
  # Currently, #parse_index is called upon #get_index_pages,
  # which itself is dependent on @index_url
@@ -253,30 +284,31 @@ module Upton
  # It seems to at this point, but that may be something that gets
  # deprecated later
  #
- # So for now, @index_url is used in conjunction with resolve_url
+ # So for now, index_url is used in conjunction with resolve_url
  # to make sure that this method returns absolute urls
- # i.e. this method expects @index_url to always have an absolute address
- # for the lifetime of an Upton instance
- def parse_index(text, selector)
- Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
- href = a_element["href"]
- resolved_url = resolve_url( href, @index_url) unless href.nil?
+ def parse_index(text, selector, index_url)
+ Nokogiri::HTML(text).search(selector).to_a.map do |anchor|
+ href = anchor["href"]
+ resolved_url = resolve_url( href, index_url) unless href.nil?
  puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
  resolved_url
  end
  end
 
-
  ##
  # Returns the concatenated output of each member of a paginated index,
  # e.g. a site listing links with 2+ pages.
  ##
- def get_index_pages(url, pagination_index, pagination_interval, options={})
+ def get_index_pages(url, pagination_index, options={})
  resps = [self.get_page(url, @index_debug, options)]
+ return resps unless options[:paginated]
+
  prev_url = url
  while !resps.last.empty?
- pagination_index += pagination_interval
- next_url = self.next_index_page_url(url, pagination_index)
+ pagination_index += options[:pagination_interval]
+ break if pagination_index > options[:pagination_max_pages]
+
+ next_url = self.next_index_page_url(url, options[:pagination_param], pagination_index)
  next_url = resolve_url(next_url, url)
  break if next_url == prev_url || next_url.empty?
 
@@ -310,13 +342,28 @@ module Upton
  resps
  end
 
+ ##
+ # Return a list of URLs for the instances you want to scrape.
+ # This can optionally be overridden if, for example, the list of instances
+ # comes from an API.
+ ##
+ def get_indexes!
+ @indexes.each do |index_url, index_selector, options|
+ #TODO: cope with pagination stuff per URL
+
+ @instance_urls += get_index_pages(index_url, options[:pagination_start_index], options).map{|page| parse_index(page, index_selector, index_url) }.flatten
+ end
+ end
+
+
  # Just a helper for +scrape+.
  def scrape_from_list(list, blk)
  puts "Scraping #{list.size} instances" if @verbose
  list.each_with_index.map do |instance_url, instance_index|
  instance_resps = get_instance instance_url, nil, :instance_index => instance_index
  instance_resps.each_with_index.map do |instance_resp, pagination_index|
- blk.call(instance_resp, instance_url, instance_index, pagination_index)
+ page = Page.new(instance_resp, instance_url, instance_index, pagination_index)
+ blk.call(page)
  end
  end.flatten(1)
  end
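The block contract changes here: instead of four positional arguments (HTML string, URL, instance index, pagination index), each response is wrapped in a Page from the newly required lib/upton/page.rb, which this diff does not include. The specs call css directly on the yielded object, which suggests Page delegates to a parsed Nokogiri document; under that assumption a scrape block now looks like:

    scraper.scrape do |page|
      page.css("h1.article-title").text
    end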
data/lib/upton/utils.rb CHANGED
@@ -18,8 +18,7 @@ module Upton
  # present, is returned as the first row.
  ##
  def self.table(table_selector, deprecated=nil)
- return Proc.new do |instance_html|
- html = ::Nokogiri::HTML(instance_html)
+ return Proc.new do |html|
  output = []
  headers = html.search(table_selector).css("th").map &:text
  output << headers
@@ -33,8 +32,7 @@ module Upton
  # Scrapes any set of HTML elements into an Array.
  ##
  def self.list(list_selector, deprecated=nil)
- return Proc.new do |instance_html|
- html = ::Nokogiri::HTML(instance_html)
+ return Proc.new do |html|
  html.search(list_selector).map{|list_element| list_element.text }
  end
  end
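Because scrape blocks now receive an already-parsed document, the pre-built Utils procs drop their own Nokogiri::HTML call and take the parsed html node directly. Call sites are unchanged, e.g. (the selector and filename are illustrative):

    scraper.scrape_to_csv("results.csv", &Upton::Utils.table("table#results"))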
data/lib/upton/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Upton # :nodoc:
- VERSION = '0.3.6'
+ VERSION = '1.0.0.prea'
  end
data/spec/upton_spec.rb CHANGED
@@ -52,15 +52,14 @@ describe Upton do
  stub_request(:get, "www.example.com/sixfacts.html").
  to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
- propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
+ propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
  propubscraper.stash_folder = "test_stashes"
 
- heds = propubscraper.scrape do |article_str|
- doc = Nokogiri::HTML(article_str)
- hed = doc.css('h1.article-title').text
+ heds = propubscraper.scrape do |doc|
+ doc.css('h1.article-title').text
  end
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
  heds.should eql @headlines
@@ -87,14 +86,13 @@ describe Upton do
  to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
 
 
- propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
+ propubscraper = Upton::Scraper.index("http://www.example.com/propublica-relative.html", "section#river h1 a")
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
  propubscraper.stash_folder = "test_stashes"
 
- heds = propubscraper.scrape do |article_str|
- doc = Nokogiri::HTML(article_str)
+ heds = propubscraper.scrape do |doc|
  hed = doc.css('h1.article-title').text
  end
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -105,7 +103,7 @@ describe Upton do
  stub_request(:get, "www.example.com/propublica.html").
  to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
 
- propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
+ propubscraper = Upton::Scraper.instances(["http://www.example.com/propublica.html"])
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
@@ -120,7 +118,7 @@ describe Upton do
  stub_request(:get, "www.example.com/easttimor.html").
  to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)
 
- propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
+ propubscraper = Upton::Scraper.instances(["http://www.example.com/easttimor.html"])
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
@@ -139,8 +137,6 @@ describe Upton do
  it "should scrape paginated pages" do
  stub_request(:get, "www.example.com/propublica_search.html").
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
- stub_request(:get, "www.example.com/propublica_search.html?p=1").
- to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -153,17 +149,21 @@ describe Upton do
  to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
- propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
+ propubscraper = Upton::Scraper.index(
+ "http://www.example.com/propublica_search.html",
+ '.compact-list a.title-link',
+ {
+ :paginated => true,
+ :pagination_param => 'p',
+ :pagination_max_pages => 3,
+ }
+ )
  propubscraper.debug = true
  propubscraper.verbose = false
- propubscraper.paginated = true
- propubscraper.pagination_param = 'p'
- propubscraper.pagination_max_pages = 3
  propubscraper.sleep_time_between_requests = 0
  propubscraper.stash_folder = "test_stashes"
 
- results = propubscraper.scrape do |article_str|
- doc = Nokogiri::HTML(article_str)
+ results = propubscraper.scrape do |doc|
  hed = doc.css('h1.article-title').text
  end
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -177,7 +177,7 @@ describe Upton do
 
  it "should sleep after requests with caching disabled" do
  stub_request(:get, "www.example.com")
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.index_debug = false
  u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
  u.should_receive(:sleep)
@@ -187,7 +187,7 @@ describe Upton do
  it "should sleep after uncached requests when caching is enabled" do
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
  stub_request(:get, "www.example.com")
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.index_debug = true
  u.stash_folder = "test_stashes"
  u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
@@ -199,8 +199,6 @@ describe Upton do
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
  stub_request(:get, "www.example.com/propublica_search.html").
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
- stub_request(:get, "www.example.com/propublica_search.html?p=1").
- to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -213,12 +211,15 @@ describe Upton do
  to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
 
 
- u = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.nonexistent')
+ u = Upton::Scraper.index("http://www.example.com/propublica_search.html", '.nonexistent',
+ {
+ :paginated => true,
+ :pagination_param => 'p',
+ :pagination_max_pages => 3,
+ }
+ )
  u.index_debug = false
  u.debug = false
- u.paginated = true
- u.pagination_param = 'p'
- u.pagination_max_pages = 3
  u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
  u.stash_folder = "test_stashes"
 
@@ -234,7 +235,7 @@ describe Upton do
  stub_request(:get, "www.example.com").
  to_return(:body => '', :status => 200)
 
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.sleep_time_between_requests = 0.0
  u.stash_folder = custom_cache_folder
  u.debug = true
@@ -245,6 +246,76 @@ describe Upton do
  expect(files).not_to be_empty
  end
 
+ it "should scrape in the basic case with the index method" do
+ stub_request(:get, "www.example.com/propublica.html").
+ to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+ stub_request(:get, "www.example.com/discussion.html").
+ to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+ stub_request(:get, "www.example.com/prosecutor.html").
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+ stub_request(:get, "www.example.com/webinar.html").
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+ stub_request(:get, "www.example.com/sixfacts.html").
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+ propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
+ propubscraper.debug = true
+ propubscraper.verbose = false
+ propubscraper.sleep_time_between_requests = 0
+ propubscraper.stash_folder = "test_stashes"
+
+ heds = propubscraper.scrape do |doc|
+ hed = doc.css('h1.article-title').text
+ end
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+ heds.should eql @headlines
+ end
+
+ it "should allow instances to be set on a new Scraper" do
+ stub_request(:get, "www.example.com/propublica.html").
+ to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+ stub_request(:get, "www.example.com/discussion.html").
+ to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+ stub_request(:get, "www.example.com/prosecutor.html").
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+ stub_request(:get, "www.example.com/webinar.html").
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+ stub_request(:get, "www.example.com/sixfacts.html").
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+ propubscraper = Upton::Scraper.instances(["www.example.com/webinar.html",
+ "www.example.com/discussion.html",
+ "www.example.com/prosecutor.html",
+ "www.example.com/sixfacts.html"])
+
+ propubscraper.debug = true
+ propubscraper.verbose = false
+ propubscraper.sleep_time_between_requests = 0
+ propubscraper.stash_folder = "test_stashes"
+
+ heds = propubscraper.scrape do |doc|
+ hed = doc.css('h1.article-title').text
+ end
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+ heds.should eql @headlines
+ end
+
+ it "should allow Scrapers to be added (indexes)" do
+ u = Upton::Scraper.index("http://www.example1.com", '.link')
+ w = Upton::Scraper.index("http://www.example2.com", '.link')
+ new_scraper = u + w
+ new_scraper.instance_variable_get(:@indexes).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+ end
+
+ it "should allow Scrapers to be added (instances)" do
+ pending
+ u = Upton::Scraper.instances(["http://www.example1.com"])
+ w = Upton::Scraper.instances(["http://www.example2.com"])
+ new_scraper = u + w
+ new_scraper.instance_variable_get(:@indexes).should eql []
+ new_scraper.instance_variable_get(:@instance_urls).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+ end
+
 
  before do
  Upton::Scraper.stub(:puts)
@@ -252,7 +323,7 @@ describe Upton do
 
  it "should be silent if verbose is false" do
  stub_request(:get, "www.example.com")
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.sleep_time_between_requests = 0.0
  u.verbose = false
  u.should_not_receive(:puts)
metadata CHANGED
@@ -1,119 +1,127 @@
  --- !ruby/object:Gem::Specification
  name: upton
  version: !ruby/object:Gem::Version
- version: 0.3.6
+ version: 1.0.0.prea
  platform: ruby
  authors:
  - Jeremy B. Merrill
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-12-25 00:00:00.000000000 Z
+ date: 2014-03-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rack
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: webmock
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: thin
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: nokogiri
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: 1.5.1
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: 1.5.1
  - !ruby/object:Gem::Dependency
  name: yard
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: rest-client
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '1.6'
- - - "~>"
+ - - ~>
  - !ruby/object:Gem::Version
- version: '2.0'
+ version: 1.6.7
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '1.6'
- - - "~>"
+ - - ~>
  - !ruby/object:Gem::Version
- version: '2.0'
+ version: 1.6.7
  - !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - "~>"
+ - - '>='
  - !ruby/object:Gem::Version
- version: '1.5'
+ version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - "~>"
+ - - '>='
  - !ruby/object:Gem::Version
- version: '1.5'
+ version: '0'
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
  that's easy to use for debugging and doesn't hammer servers by default.
  email: jeremybmerrill@jeremybmerrill.com
@@ -122,22 +130,22 @@ extensions: []
  extra_rdoc_files: []
  files:
  - lib/upton.rb
- - lib/upton/downloader.rb
  - lib/upton/scraper.rb
  - lib/upton/utils.rb
+ - lib/upton/downloader.rb
  - lib/upton/version.rb
- - spec/data/discussion.html
+ - spec/data/prosecutor.html
  - spec/data/easttimor.html
- - spec/data/propublica-relative.html
- - spec/data/propublica.html
+ - spec/data/discussion.html
  - spec/data/propublica_search.html
  - spec/data/propublica_search_page_2.html
- - spec/data/prosecutor.html
- - spec/data/sixfacts.html
+ - spec/data/propublica-relative.html
  - spec/data/webinar.html
+ - spec/data/propublica.html
+ - spec/data/sixfacts.html
+ - spec/upton_spec.rb
  - spec/spec_helper.rb
  - spec/upton_downloader_spec.rb
- - spec/upton_spec.rb
  homepage: http://github.org/propublica/upton
  licenses:
  - MIT
@@ -148,30 +156,31 @@ require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: 1.9.2
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>'
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.3.1
  requirements: []
  rubyforge_project:
- rubygems_version: 2.5.1
+ rubygems_version: 2.0.14
  signing_key:
  specification_version: 4
  summary: A simple web-scraping framework
  test_files:
  - spec/data/prosecutor.html
- - spec/data/propublica_search.html
- - spec/data/propublica.html
+ - spec/data/easttimor.html
  - spec/data/discussion.html
+ - spec/data/propublica_search.html
  - spec/data/propublica_search_page_2.html
- - spec/data/sixfacts.html
  - spec/data/propublica-relative.html
- - spec/data/easttimor.html
  - spec/data/webinar.html
+ - spec/data/propublica.html
+ - spec/data/sixfacts.html
  - spec/upton_spec.rb
  - spec/spec_helper.rb
  - spec/upton_downloader_spec.rb
+ has_rdoc: true