upton 0.3.6 → 1.0.0.prea

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: dc480dd21a06c69b7a337b17ed01f95fa262d33b
- data.tar.gz: f5f5f1c99471d884a585e0098c8d3ca975fe8c8e
+ metadata.gz: c446c20e57e387b365d9c5bcda546a1b48ebbcf1
+ data.tar.gz: a29ee1aa35b18a9324d504ae8e99e2a9bafcfb27
  SHA512:
- metadata.gz: 7bda1a3ee82d668b3d966b50ba7bff58133e807d9da88e988288be25bdcb26956917216a9925a91275c3978906b31eaf4ee9a5dba1694bbeb9811b6912339771
- data.tar.gz: 121c8c524f56f41c24e6f9cb9fb9584d9b0d988bf0c1c4a163b63b539dfd44f8e969ccd41efa45f96b4da41ed4c3963c87b6f5e708ef20b31c0e518f7d850dff
+ metadata.gz: 11d5e990c42441d5bf599952bf3d49289754f68da72a51b31d62c86281531960fc18adffb7e52c59fe37ac5e275c35e766ce922c9a1922294f032d2a5c7cbea7
+ data.tar.gz: a6cbe33126fe3506c2248d40677e88e3d4e545ec2dc3e6613b3d623232f2288eae1305f41ee1ce63a996ff64812558596a95f1b14dffb63bca71bd89562e9fe7
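A gem package is a tar archive containing exactly these two files plus the checksum manifest, so the digests above can be re-derived locally. A minimal Ruby sketch, assuming the archives have been extracted into the current directory with `tar -xf upton-1.0.0.prea.gem`:

require 'digest'

# Print SHA512 digests for comparison against the checksums.yaml above.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
end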
data/lib/upton.rb CHANGED
@@ -1,10 +1,10 @@
  # encoding: UTF-8

- require 'nokogiri'
- require 'uri'
- require 'restclient'
+ require_relative 'upton/scraper'
  require_relative 'upton/utils'
+ require_relative 'upton/version'
  require_relative 'upton/downloader'
+ require_relative 'upton/scraper'

  ##
  # This module contains a scraper called Upton
@@ -22,332 +22,6 @@ module Upton
  # site's search page or a newspaper's homepage.
  # 2. Instance pages, which represent the goal of your scraping, e.g.
  # job listings or news articles.
- #
- # Upton::Scraper can be used as-is for basic use-cases by:
- # 1. specifying the pages to be scraped in `new` as an index page
- # or as an Array of URLs.
- # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
- # block from Upton::Utils.
- # For more complicated cases; subclass Upton::Scraper
- # e.g. +MyScraper < Upton::Scraper+ and override various methods.
  ##
- class Scraper
- EMPTY_STRING = ''
-
- attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
- :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
- :pagination_interval
-
- ##
- # This is the main user-facing method for a basic scraper.
- # Call +scrape+ with a block; this block will be called on
- # the text of each instance page, (and optionally, its URL and its index
- # in the list of instance URLs returned by +get_index+).
- ##
- def scrape(&blk)
- self.url_array = self.get_index unless self.url_array
- blk = Proc.new{|x| x} if blk.nil?
- self.scrape_from_list(self.url_array, blk)
- end
-
- ##
- # +index_url_or_array+: A list of string URLs, OR
- # the URL of the page containing the list of instances.
- # +selector+: The XPath expression or CSS selector that specifies the
- # anchor elements within the page, if a url is specified for
- # the previous argument.
- #
- # These options are a shortcut. If you plan to override +get_index+, you
- # do not need to set them.
- # If you don't specify a selector, the first argument will be treated as a
- # list of URLs.
- ##
- def initialize(index_url_or_array, selector="")
-
- #if first arg is a valid URL, do already-written stuff;
- #if it's not (or if it's a list?) don't bother with get_index, etc.
- #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
- #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
- if index_url_or_array.respond_to? :each_with_index
- @url_array = index_url_or_array
- else
- @index_url = index_url_or_array
- @index_selector = selector
- end
-
- # If true, then Upton prints information about when it gets
- # files from the internet and when it gets them from its stash.
- @verbose = false
-
- # If true, then Upton fetches each instance page only once
- # future requests for that file are responded to with the locally stashed
- # version.
- # You may want to set @debug to false for production (but maybe not).
- # You can also control stashing behavior on a per-call basis with the
- # optional second argument to get_page, if, for instance, you want to
- # stash certain instance pages, e.g. based on their modification date.
- @debug = true
- # Index debug does the same, but for index pages.
- @index_debug = false
-
- # In order to not hammer servers, Upton waits for, by default, 30
- # seconds between requests to the remote server.
- @sleep_time_between_requests = 30 #seconds
-
- # If true, then Upton will attempt to scrape paginated index pages
- @paginated = false
- # Default query string parameter used to specify the current page
- @pagination_param = 'page'
- # Default number of paginated pages to scrape
- @pagination_max_pages = 2
- # Default starting number for pagination (second page is this plus 1).
- @pagination_start_index = 1
- # Default value to increment page number by
- @pagination_interval = 1
-
- # Folder name for stashes, if you want them to be stored somewhere else,
- # e.g. under /tmp.
- if @stash_folder
- FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
- end
- end
-
- ##
- # If instance pages are paginated, <b>you must override</b>
- # this method to return the next URL, given the current URL and its index.
- #
- # If instance pages aren't paginated, there's no need to override this.
- #
- # Recursion stops if the fetching URL returns an empty string or an error.
- #
- # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
- # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
- ##
- def next_instance_page_url(url, pagination_index)
- EMPTY_STRING
- end
-
- ##
- # Return the next URL to scrape, given the current URL and its index.
- #
- # Recursion stops if the fetching URL returns an empty string or an error.
- #
- # If @paginated is not set (the default), this method returns an empty string.
- #
- # If @paginated is set, this method will return the next pagination URL
- # to scrape using @pagination_param and the pagination_index.
- #
- # If the pagination_index is greater than @pagination_max_pages, then the
- # method will return an empty string.
- #
- # Override this method to handle pagination is an alternative way
- # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
- # ought to return "http://whatever.com/articles?page=2"
- #
- ##
- def next_index_page_url(url, pagination_index)
- return url unless @paginated
-
- if pagination_index > @pagination_max_pages
- puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
- EMPTY_STRING
- else
- uri = URI.parse(url)
- query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
- # update the pagination query string parameter
- query[@pagination_param] = pagination_index
- uri.query = URI.encode_www_form(query)
- puts "Next index pagination url is #{uri}" if @verbose
- uri.to_s
- end
- end
-
- ##
- # Writes the scraped result to a CSV at the given filename.
- ##
- def scrape_to_csv filename, &blk
- require 'csv'
- self.url_array = self.get_index unless self.url_array
- CSV.open filename, 'wb' do |csv|
- #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- if document[0].respond_to? :map
- document.each{|row| csv << row }
- else
- csv << document
- end
- end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
- end
- end
-
- def scrape_to_tsv filename, &blk
- require 'csv'
- self.url_array = self.get_index unless self.url_array
- CSV.open filename, 'wb', :col_sep => "\t" do |csv|
- #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- if document[0].respond_to? :map
- document.each{|row| csv << row }
- else
- csv << document
- end
- end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
- end
- end
-
- protected
-
- ##
- # Handles getting pages with Downlader, which handles stashing.
- ##
- def get_page(url, stash=false, options={})
- return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
- global_options = {
- :cache => stash,
- :verbose => @verbose
- }
- if @readable_filenames
- global_options[:readable_filenames] = true
- end
- if @stash_folder
- global_options[:readable_filenames] = true
- global_options[:cache_location] = @stash_folder
- end
- resp_and_cache = Downloader.new(url, global_options.merge(options)).get
- if resp_and_cache[:from_resource]
- puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
- sleep @sleep_time_between_requests
- end
- resp_and_cache[:resp]
- end
-
-
- ##
- # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
- # resolve_url resolves them to absolute urls.
- # absolute_url_str must be a URL, as a string that represents an absolute URL or a URI
- ##
- def resolve_url(href_str, absolute_url_str)
- if absolute_url_str.class <= URI::Generic
- absolute_url = absolute_url_str.dup
- else
- begin
- absolute_url = URI(absolute_url_str).dup
- rescue URI::InvalidURIError
- raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
- end
- end
- raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
- if href_str.class <= URI::Generic
- href = href_str.dup
- else
- begin
- href = URI(href_str).dup
- rescue URI::InvalidURIError
- raise ArgumentError, "#{href_str} must be represent a valid relative or absolute URI"
- end
- end
-
- # return :href if :href is already absolute
- return href.to_s if href.absolute?
-
- #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
- URI.join(absolute_url.to_s, href.to_s).to_s
- end
-
- ##
- # Return a list of URLs for the instances you want to scrape.
- # This can optionally be overridden if, for example, the list of instances
- # comes from an API.
- ##
- def get_index
- index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
- end
-
- # TODO: Not sure the best way to handle this
- # Currently, #parse_index is called upon #get_index_pages,
- # which itself is dependent on @index_url
- # Does @index_url stay unaltered for the lifetime of the Upton instance?
- # It seems to at this point, but that may be something that gets
- # deprecated later
- #
- # So for now, @index_url is used in conjunction with resolve_url
- # to make sure that this method returns absolute urls
- # i.e. this method expects @index_url to always have an absolute address
- # for the lifetime of an Upton instance
- def parse_index(text, selector)
- Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
- href = a_element["href"]
- resolved_url = resolve_url( href, @index_url) unless href.nil?
- puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
- resolved_url
- end
- end
-
-
- ##
- # Returns the concatenated output of each member of a paginated index,
- # e.g. a site listing links with 2+ pages.
- ##
- def get_index_pages(original_url, pagination_index, pagination_interval, options={})
- resps = []
- prev_url = nil
- while resps.empty? || !resps.last.empty?
- next_url = self.next_index_page_url(original_url, pagination_index)
- break if next_url.empty?
-
- next_url = resolve_url(next_url, original_url)
- break if next_url == prev_url
-
- next_resp = self.get_page(next_url, @index_debug, options).to_s
- prev_url = next_url
- pagination_index += pagination_interval
- resps << next_resp
- end
- resps
- end
-
- ##
- # Returns the instance at `url`.
- #
- # If the page is stashed, returns that, otherwise, fetches it from the web.
- #
- # If an instance is paginated, returns the concatenated output of each
- # page, e.g. if a news article has two pages.
- ##
- def get_instance(url, pagination_index=0, options={})
- resps = [self.get_page(url, @debug, options)]
- pagination_index = pagination_index.to_i
- prev_url = url
- while !resps.last.empty?
- next_url = self.next_instance_page_url(url, pagination_index + 1)
- break if next_url == prev_url || next_url.empty?
-
- next_resp = self.get_page(next_url, @debug, options)
- prev_url = next_url
- resps << next_resp
- end
- resps
- end
-
- # Just a helper for +scrape+.
- def scrape_from_list(list, blk)
- puts "Scraping #{list.size} instances" if @verbose
- list.each_with_index.map do |instance_url, instance_index|
- instance_resps = get_instance instance_url, nil, :instance_index => instance_index
- instance_resps.each_with_index.map do |instance_resp, pagination_index|
- blk.call(instance_resp, instance_url, instance_index, pagination_index)
- end
- end.flatten(1)
- end
-
- # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
- def slug(url)
- url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
- end

- end
  end
data/lib/upton/downloader.rb CHANGED
@@ -103,7 +103,7 @@ module Upton
  msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
  resp_html = Nokogiri::HTML(resp)
  comment = Nokogiri::XML::Comment.new(resp_html, msg)
- if resp_html.root.nil? || !resp_html.include?("<html")
+ if resp_html.root.nil?
  return resp
  elsif resp_html.root.children.empty?
  resp_html.root.add_child(comment)
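This hunk loosens the guard around the provenance marker the downloader writes into stashed pages: previously the marker was also skipped whenever the body lacked a literal `<html` substring; now the raw response is returned unmodified only when parsing yields no root node. A standalone sketch of the marker mechanism, not the gem's exact code:

require 'nokogiri'

resp = "<html><body><p>cached page</p></body></html>"
msg  = "Stashed file retrieved by Upton 1.0.0.prea at #{Time.now}"

doc = Nokogiri::HTML(resp)
comment = Nokogiri::XML::Comment.new(doc, msg)
if doc.root.nil?
  puts resp                      # unparseable body: hand it back untouched
elsif doc.root.children.empty?
  doc.root.add_child(comment)    # empty document: append the marker
  puts doc.to_html
else
  # otherwise, record provenance as a comment at the top of the document
  doc.root.children.first.add_previous_sibling(comment)
  puts doc.to_html
end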
data/lib/upton/scraper.rb CHANGED
@@ -1,9 +1,10 @@
  require 'uri'
  require 'nokogiri'
  require_relative './downloader'
+ require_relative './page'

  module Upton
- # Upton::Scraper can be used as-is for basic use-cases by:
+ # Upton::Scraper can be used as-is for basic use-cases by:
  # 1. specifying the pages to be scraped in `new` as an index page
  # or as an Array of URLs.
  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
@@ -14,9 +15,8 @@ module Upton
  class Scraper
  EMPTY_STRING = ''

- attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
- :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
- :pagination_interval
+ attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests,
+ :stash_folder, :readable_filenames

  ##
  # This is the main user-facing method for a basic scraper.
@@ -25,8 +25,8 @@ module Upton
  # in the list of instance URLs returned by +get_index+).
  ##
  def scrape(&blk)
- self.url_array = self.get_index unless self.url_array
- self.scrape_from_list(self.url_array, blk)
+ get_indexes!
+ self.scrape_from_list(@instance_urls, blk)
  end

  ##
@@ -41,23 +41,10 @@ module Upton
  # If you don't specify a selector, the first argument will be treated as a
  # list of URLs.
  ##
- def initialize(index_url_or_array, selector="")
-
- #if first arg is a valid URL, do already-written stuff;
- #if it's not (or if it's a list?) don't bother with get_index, etc.
- #e.g. Scraper.new(["http://jeremybmerrill.com"])
-
- #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
- if index_url_or_array.respond_to? :each_with_index
- @url_array = index_url_or_array
- else
- @index_url = index_url_or_array
- @index_selector = selector
- end
-
+ def initialize(options={})
  # If true, then Upton prints information about when it gets
  # files from the internet and when it gets them from its stash.
- @verbose = false
+ @verbose = options[:verbose] || false

  # If true, then Upton fetches each instance page only once
  # future requests for that file are responded to with the locally stashed
@@ -66,29 +53,77 @@ module Upton
  # You can also control stashing behavior on a per-call basis with the
  # optional second argument to get_page, if, for instance, you want to
  # stash certain instance pages, e.g. based on their modification date.
- @debug = true
+ @debug = options[:debug] || true
  # Index debug does the same, but for index pages.
- @index_debug = false
+ @index_debug = options[:index_debug] || false

  # In order to not hammer servers, Upton waits for, by default, 30
  # seconds between requests to the remote server.
- @sleep_time_between_requests = 30 #seconds
+ @sleep_time_between_requests = options[:sleep_time_between_requests] || 30 #seconds
+
+ # Folder name for stashes, if you want them to be stored somewhere else,
+ # e.g. under /tmp.
+ if @stash_folder
+ FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+ end
+
+ @indexes = []
+ @instance_urls = []
+ end
+
+ def index(index_url, selector, options={})
+ # for future:
+ @indexes ||= []

+ ##
+ # Pagination options are per-index page
+ #
  # If true, then Upton will attempt to scrape paginated index pages
- @paginated = false
+ options[:paginated] ||= false
  # Default query string parameter used to specify the current page
- @pagination_param = 'page'
+ options[:pagination_param] ||= 'page'
  # Default number of paginated pages to scrape
- @pagination_max_pages = 2
+ options[:pagination_max_pages] ||= 2
  # Default starting number for pagination (second page is this plus 1).
- @pagination_start_index = 1
+ options[:pagination_start_index] ||= 1
  # Default value to increment page number by
- @pagination_interval = 1
-
- # Folder name for stashes, if you want them to be stored somewhere else,
- # e.g. under /tmp.
- if @stash_folder
- FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+ options[:pagination_interval] ||= 1
+ ##
+
+ @indexes << [index_url, selector, options]
+ # and actually go scrape the index page, populate @instances
+ self
+ end
+
+ def self.index(index_url, selector, options={})
+ scraper = self.new
+ scraper.index(index_url, selector, options)
+ scraper
+ end
+
+ def self.instances(instances, options={})
+ s = self.new
+ s.instance_variable_set(:@instance_urls, instances)
+ s
+ end
+
+ # does
+ # def add_instances(urls)
+ # #for future:
+ # # @instances += urls
+ # # @instances.uniq!
+ # @instance_urls ||= []
+ # @instance_urls += urls
+ # @instance_urls.uniq!
+ # end
+
+ def instances(urls=nil)
+ if urls.nil?
+ @instance_urls
+ else
+ @instance_urls ||= []
+ @instance_urls += urls
+ self
  end
  end

@@ -125,21 +160,14 @@ module Upton
  # ought to return "http://whatever.com/articles?page=2"
  #
  ##
- def next_index_page_url(url, pagination_index)
- return EMPTY_STRING unless @paginated
-
- if pagination_index > @pagination_max_pages
- puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
- EMPTY_STRING
- else
- uri = URI.parse(url)
- query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
- # update the pagination query string parameter
- query[@pagination_param] = pagination_index
- uri.query = URI.encode_www_form(query)
- puts "Next index pagination url is #{uri}" if @verbose
- uri.to_s
- end
+ def next_index_page_url(url, pagination_param, pagination_index)
+ uri = URI.parse(url)
+ query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+ # update the pagination query string parameter
+ query[pagination_param] = pagination_index
+ uri.query = URI.encode_www_form(query)
+ puts "Next index pagination url is #{uri}" if @verbose
+ uri.to_s
  end

  ##
@@ -147,36 +175,46 @@ module Upton
  ##
  def scrape_to_csv filename, &blk
  require 'csv'
- self.url_array = self.get_index unless self.url_array
+ self.get_indexes!
  CSV.open filename, 'wb' do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
+ self.scrape_from_list(@instance_urls, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
  csv << document
  end
  end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+ #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
  end
  end

  def scrape_to_tsv filename, &blk
  require 'csv'
- self.url_array = self.get_index unless self.url_array
+ get_indexes!
  CSV.open filename, 'wb', :col_sep => "\t" do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
+ self.scrape_from_list(@instance_urls, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
  csv << document
  end
  end
- #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+ #self.scrape_from_list(@instance_urls, blk).compact.each{|document| csv << document }
  end
  end

+ def +(other_scraper)
+ raise ArgumentError, "#{other_scraper.class} can't be coerced into Upton::Scraper" unless other_scraper.class <= Upton::Scraper
+ new_scraper = Scraper.new
+ new_indexes = @indexes + other_scraper.instance_variable_get(:@indexes)
+ new_instances = @instance_urls + other_scraper.instance_variable_get(:@instance_urls)
+ new_scraper.instance_variable_set(:@indexes, new_indexes)
+ new_scraper.instance_variable_set(:@instance_urls, new_instances)
+ new_scraper
+ end
+
  protected

  ##
@@ -217,6 +255,8 @@ module Upton
  absolute_url = URI(absolute_url_str).dup
  rescue URI::InvalidURIError
  raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
+ rescue ArgumentError
+ raise ArgumentError, "#{absolute_url_str} must be represent a valid relative or absolute URI"
  end
  end
  raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
@@ -237,15 +277,6 @@ module Upton
  URI.join(absolute_url.to_s, href.to_s).to_s
  end

- ##
- # Return a list of URLs for the instances you want to scrape.
- # This can optionally be overridden if, for example, the list of instances
- # comes from an API.
- ##
- def get_index
- index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
- end
-
  # TODO: Not sure the best way to handle this
  # Currently, #parse_index is called upon #get_index_pages,
  # which itself is dependent on @index_url
@@ -253,30 +284,31 @@ module Upton
  # It seems to at this point, but that may be something that gets
  # deprecated later
  #
- # So for now, @index_url is used in conjunction with resolve_url
+ # So for now, index_url is used in conjunction with resolve_url
  # to make sure that this method returns absolute urls
- # i.e. this method expects @index_url to always have an absolute address
- # for the lifetime of an Upton instance
- def parse_index(text, selector)
- Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
- href = a_element["href"]
- resolved_url = resolve_url( href, @index_url) unless href.nil?
+ def parse_index(text, selector, index_url)
+ Nokogiri::HTML(text).search(selector).to_a.map do |anchor|
+ href = anchor["href"]
+ resolved_url = resolve_url( href, index_url) unless href.nil?
  puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
  resolved_url
  end
  end

-
  ##
  # Returns the concatenated output of each member of a paginated index,
  # e.g. a site listing links with 2+ pages.
  ##
- def get_index_pages(url, pagination_index, pagination_interval, options={})
+ def get_index_pages(url, pagination_index, options={})
  resps = [self.get_page(url, @index_debug, options)]
+ return resps unless options[:paginated]
+
  prev_url = url
  while !resps.last.empty?
- pagination_index += pagination_interval
- next_url = self.next_index_page_url(url, pagination_index)
+ pagination_index += options[:pagination_interval]
+ break if pagination_index > options[:pagination_max_pages]
+
+ next_url = self.next_index_page_url(url, options[:pagination_param], pagination_index)
  next_url = resolve_url(next_url, url)
  break if next_url == prev_url || next_url.empty?

@@ -310,13 +342,28 @@ module Upton
  resps
  end

+ ##
+ # Return a list of URLs for the instances you want to scrape.
+ # This can optionally be overridden if, for example, the list of instances
+ # comes from an API.
+ ##
+ def get_indexes!
+ @indexes.each do |index_url, index_selector, options|
+ #TODO: cope with pagination stuff per URL
+
+ @instance_urls += get_index_pages(index_url, options[:pagination_start_index], options).map{|page| parse_index(page, index_selector, index_url) }.flatten
+ end
+ end
+
+
  # Just a helper for +scrape+.
  def scrape_from_list(list, blk)
  puts "Scraping #{list.size} instances" if @verbose
  list.each_with_index.map do |instance_url, instance_index|
  instance_resps = get_instance instance_url, nil, :instance_index => instance_index
  instance_resps.each_with_index.map do |instance_resp, pagination_index|
- blk.call(instance_resp, instance_url, instance_index, pagination_index)
+ page = Page.new(instance_resp, instance_url, instance_index, pagination_index)
+ blk.call(page)
  end
  end.flatten(1)
  end
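Taken together, these hunks replace the 0.3.6 constructor with a builder-style API: the class methods `index` and `instances` create scrapers, pagination is configured per index via an options hash, two scrapers can be merged with `+`, and `scrape` now yields a parsed `Page` instead of a raw HTML string. A usage sketch assembled from the hunks above and the specs below (the URL and selectors are illustrative):

require 'upton'

# Build a scraper from one paginated index page.
scraper = Upton::Scraper.index(
  "http://example.com/articles.html",  # index URL
  "a.headline",                        # selector for instance links
  :paginated => true, :pagination_param => 'p', :pagination_max_pages => 3
)
scraper.sleep_time_between_requests = 5

# The block now receives a parsed Page rather than an HTML string.
headlines = scraper.scrape { |page| page.css('h1.article-title').text }

# Scrapers can be merged, and instance URLs supplied directly.
combined = scraper + Upton::Scraper.instances(["http://example.com/extra.html"])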
data/lib/upton/utils.rb CHANGED
@@ -18,8 +18,7 @@ module Upton
  # present, is returned as the first row.
  ##
  def self.table(table_selector, deprecated=nil)
- return Proc.new do |instance_html|
- html = ::Nokogiri::HTML(instance_html)
+ return Proc.new do |html|
  output = []
  headers = html.search(table_selector).css("th").map &:text
  output << headers
@@ -33,8 +32,7 @@ module Upton
  # Scrapes any set of HTML elements into an Array.
  ##
  def self.list(list_selector, deprecated=nil)
- return Proc.new do |instance_html|
- html = ::Nokogiri::HTML(instance_html)
+ return Proc.new do |html|
  html.search(list_selector).map{|list_element| list_element.text }
  end
  end
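Because `scrape` now hands its block the parsed document, these prebuilt helpers take `html` directly and drop the `::Nokogiri::HTML` re-parse. A sketch of wiring one into the new API (URL and selector are illustrative):

require 'upton'

scraper = Upton::Scraper.instances(["http://example.com/report.html"])
scraper.sleep_time_between_requests = 5

# Utils.table returns a Proc usable as the scrape block: it emits the
# <th> texts as a header row, then each table row as an Array of cells.
rows = scraper.scrape(&Upton::Utils.table("table#stats"))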
data/lib/upton/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Upton # :nodoc:
- VERSION = '0.3.6'
+ VERSION = '1.0.0.prea'
  end
data/spec/upton_spec.rb CHANGED
@@ -52,15 +52,14 @@ describe Upton do
  stub_request(:get, "www.example.com/sixfacts.html").
  to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)

- propubscraper = Upton::Scraper.new("http://www.example.com/propublica.html", "section#river section h1 a")
+ propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
  propubscraper.stash_folder = "test_stashes"

- heds = propubscraper.scrape do |article_str|
- doc = Nokogiri::HTML(article_str)
- hed = doc.css('h1.article-title').text
+ heds = propubscraper.scrape do |doc|
+ doc.css('h1.article-title').text
  end
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
  heds.should eql @headlines
@@ -87,14 +86,13 @@ describe Upton do
  to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)


- propubscraper = Upton::Scraper.new("http://www.example.com/propublica-relative.html", "section#river h1 a")
+ propubscraper = Upton::Scraper.index("http://www.example.com/propublica-relative.html", "section#river h1 a")
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
  propubscraper.stash_folder = "test_stashes"

- heds = propubscraper.scrape do |article_str|
- doc = Nokogiri::HTML(article_str)
+ heds = propubscraper.scrape do |doc|
  hed = doc.css('h1.article-title').text
  end
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -105,7 +103,7 @@ describe Upton do
  stub_request(:get, "www.example.com/propublica.html").
  to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)

- propubscraper = Upton::Scraper.new(["http://www.example.com/propublica.html"])
+ propubscraper = Upton::Scraper.instances(["http://www.example.com/propublica.html"])
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
@@ -120,7 +118,7 @@ describe Upton do
  stub_request(:get, "www.example.com/easttimor.html").
  to_return(:body => File.new('./spec/data/easttimor.html'), :status => 200)

- propubscraper = Upton::Scraper.new(["http://www.example.com/easttimor.html"])
+ propubscraper = Upton::Scraper.instances(["http://www.example.com/easttimor.html"])
  propubscraper.debug = true
  propubscraper.verbose = false
  propubscraper.sleep_time_between_requests = 0
@@ -139,8 +137,6 @@ describe Upton do
  it "should scrape paginated pages" do
  stub_request(:get, "www.example.com/propublica_search.html").
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
- stub_request(:get, "www.example.com/propublica_search.html?p=1").
- to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -153,17 +149,21 @@ describe Upton do
  to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)


- propubscraper = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.compact-list a.title-link')
+ propubscraper = Upton::Scraper.index(
+ "http://www.example.com/propublica_search.html",
+ '.compact-list a.title-link',
+ {
+ :paginated => true,
+ :pagination_param => 'p',
+ :pagination_max_pages => 3,
+ }
+ )
  propubscraper.debug = true
  propubscraper.verbose = false
- propubscraper.paginated = true
- propubscraper.pagination_param = 'p'
- propubscraper.pagination_max_pages = 3
  propubscraper.sleep_time_between_requests = 0
  propubscraper.stash_folder = "test_stashes"

- results = propubscraper.scrape do |article_str|
- doc = Nokogiri::HTML(article_str)
+ results = propubscraper.scrape do |doc|
  hed = doc.css('h1.article-title').text
  end
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
@@ -177,7 +177,7 @@ describe Upton do

  it "should sleep after requests with caching disabled" do
  stub_request(:get, "www.example.com")
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.index_debug = false
  u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
  u.should_receive(:sleep)
@@ -187,7 +187,7 @@ describe Upton do
  it "should sleep after uncached requests when caching is enabled" do
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
  stub_request(:get, "www.example.com")
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.index_debug = true
  u.stash_folder = "test_stashes"
  u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
@@ -199,8 +199,6 @@ describe Upton do
  FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
  stub_request(:get, "www.example.com/propublica_search.html").
  to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
- stub_request(:get, "www.example.com/propublica_search.html?p=1").
- to_return(:body => File.new('./spec/data/propublica_search.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=2").
  to_return(:body => File.new('./spec/data/propublica_search_page_2.html'), :status => 200)
  stub_request(:get, "www.example.com/propublica_search.html?p=3").
@@ -213,12 +211,15 @@ describe Upton do
  to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)


- u = Upton::Scraper.new("http://www.example.com/propublica_search.html", '.nonexistent')
+ u = Upton::Scraper.index("http://www.example.com/propublica_search.html", '.nonexistent',
+ {
+ :paginated => true,
+ :pagination_param => 'p',
+ :pagination_max_pages => 3,
+ }
+ )
  u.index_debug = false
  u.debug = false
- u.paginated = true
- u.pagination_param = 'p'
- u.pagination_max_pages = 3
  u.sleep_time_between_requests = 1 #don't sleep too long, that's annoying.
  u.stash_folder = "test_stashes"

@@ -234,7 +235,7 @@ describe Upton do
  stub_request(:get, "www.example.com").
  to_return(:body => '', :status => 200)

- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.sleep_time_between_requests = 0.0
  u.stash_folder = custom_cache_folder
  u.debug = true
@@ -245,6 +246,76 @@ describe Upton do
  expect(files).not_to be_empty
  end

+ it "should scrape in the basic case with the index method" do
+ stub_request(:get, "www.example.com/propublica.html").
+ to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+ stub_request(:get, "www.example.com/discussion.html").
+ to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+ stub_request(:get, "www.example.com/prosecutor.html").
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+ stub_request(:get, "www.example.com/webinar.html").
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+ stub_request(:get, "www.example.com/sixfacts.html").
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+ propubscraper = Upton::Scraper.index("http://www.example.com/propublica.html", "section#river section h1 a")
+ propubscraper.debug = true
+ propubscraper.verbose = false
+ propubscraper.sleep_time_between_requests = 0
+ propubscraper.stash_folder = "test_stashes"
+
+ heds = propubscraper.scrape do |doc|
+ hed = doc.css('h1.article-title').text
+ end
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+ heds.should eql @headlines
+ end
+
+ it "should allow instances to be set on a new Scraper" do
+ stub_request(:get, "www.example.com/propublica.html").
+ to_return(:body => File.new('./spec/data/propublica.html'), :status => 200)
+ stub_request(:get, "www.example.com/discussion.html").
+ to_return(:body => File.new('./spec/data/discussion.html'), :status => 200)
+ stub_request(:get, "www.example.com/prosecutor.html").
+ to_return(:body => File.new('./spec/data/prosecutor.html'), :status => 200)
+ stub_request(:get, "www.example.com/webinar.html").
+ to_return(:body => File.new('./spec/data/webinar.html'), :status => 200)
+ stub_request(:get, "www.example.com/sixfacts.html").
+ to_return(:body => File.new('./spec/data/sixfacts.html'), :status => 200)
+
+ propubscraper = Upton::Scraper.instances(["www.example.com/webinar.html",
+ "www.example.com/discussion.html",
+ "www.example.com/prosecutor.html",
+ "www.example.com/sixfacts.html"])
+
+ propubscraper.debug = true
+ propubscraper.verbose = false
+ propubscraper.sleep_time_between_requests = 0
+ propubscraper.stash_folder = "test_stashes"
+
+ heds = propubscraper.scrape do |doc|
+ hed = doc.css('h1.article-title').text
+ end
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
+ heds.should eql @headlines
+ end
+
+ it "should allow Scrapers to be added (indexes)" do
+ u = Upton::Scraper.index("http://www.example1.com", '.link')
+ w = Upton::Scraper.index("http://www.example2.com", '.link')
+ new_scraper = u + w
+ new_scraper.instance_variable_get(:@indexes).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+ end
+
+ it "should allow Scrapers to be added (instances)" do
+ pending
+ u = Upton::Scraper.instances(["http://www.example1.com"])
+ w = Upton::Scraper.instances(["http://www.example2.com"])
+ new_scraper = u + w
+ new_scraper.instance_variable_get(:@indexes).should eql []
+ new_scraper.instance_variable_get(:@instance_urls).map{|a| a[0]}.should eql ["http://www.example1.com", "http://www.example2.com"]
+ end
+

  before do
  Upton::Scraper.stub(:puts)
@@ -252,7 +323,7 @@ describe Upton do

  it "should be silent if verbose is false" do
  stub_request(:get, "www.example.com")
- u = Upton::Scraper.new("http://www.example.com", '.whatever')
+ u = Upton::Scraper.index("http://www.example.com", '.whatever')
  u.sleep_time_between_requests = 0.0
  u.verbose = false
  u.should_not_receive(:puts)
metadata CHANGED
@@ -1,119 +1,127 @@
  --- !ruby/object:Gem::Specification
  name: upton
  version: !ruby/object:Gem::Version
- version: 0.3.6
+ version: 1.0.0.prea
  platform: ruby
  authors:
  - Jeremy B. Merrill
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-12-25 00:00:00.000000000 Z
+ date: 2014-03-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rack
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: webmock
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: thin
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
+ - !ruby/object:Gem::Dependency
+ name: nokogiri
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: 1.5.1
+ type: :development
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '>='
+ - !ruby/object:Gem::Version
+ version: 1.5.1
  - !ruby/object:Gem::Dependency
  name: yard
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: rest-client
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '1.6'
- - - "~>"
+ - - ~>
  - !ruby/object:Gem::Version
- version: '2.0'
+ version: 1.6.7
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
- - !ruby/object:Gem::Version
- version: '1.6'
- - - "~>"
+ - - ~>
  - !ruby/object:Gem::Version
- version: '2.0'
+ version: 1.6.7
  - !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
  requirements:
- - - "~>"
+ - - '>='
  - !ruby/object:Gem::Version
- version: '1.5'
+ version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
  requirements:
- - - "~>"
+ - - '>='
  - !ruby/object:Gem::Version
- version: '1.5'
+ version: '0'
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
  that's easy to use for debugging and doesn't hammer servers by default.
  email: jeremybmerrill@jeremybmerrill.com
@@ -122,22 +130,22 @@ extensions: []
  extra_rdoc_files: []
  files:
  - lib/upton.rb
- - lib/upton/downloader.rb
  - lib/upton/scraper.rb
  - lib/upton/utils.rb
+ - lib/upton/downloader.rb
  - lib/upton/version.rb
- - spec/data/discussion.html
+ - spec/data/prosecutor.html
  - spec/data/easttimor.html
- - spec/data/propublica-relative.html
- - spec/data/propublica.html
+ - spec/data/discussion.html
  - spec/data/propublica_search.html
  - spec/data/propublica_search_page_2.html
- - spec/data/prosecutor.html
- - spec/data/sixfacts.html
+ - spec/data/propublica-relative.html
  - spec/data/webinar.html
+ - spec/data/propublica.html
+ - spec/data/sixfacts.html
+ - spec/upton_spec.rb
  - spec/spec_helper.rb
  - spec/upton_downloader_spec.rb
- - spec/upton_spec.rb
  homepage: http://github.org/propublica/upton
  licenses:
  - MIT
@@ -148,30 +156,31 @@ require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>='
  - !ruby/object:Gem::Version
  version: 1.9.2
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
- - - ">="
+ - - '>'
  - !ruby/object:Gem::Version
- version: '0'
+ version: 1.3.1
  requirements: []
  rubyforge_project:
- rubygems_version: 2.5.1
+ rubygems_version: 2.0.14
  signing_key:
  specification_version: 4
  summary: A simple web-scraping framework
  test_files:
  - spec/data/prosecutor.html
- - spec/data/propublica_search.html
- - spec/data/propublica.html
+ - spec/data/easttimor.html
  - spec/data/discussion.html
+ - spec/data/propublica_search.html
  - spec/data/propublica_search_page_2.html
- - spec/data/sixfacts.html
  - spec/data/propublica-relative.html
- - spec/data/easttimor.html
  - spec/data/webinar.html
+ - spec/data/propublica.html
+ - spec/data/sixfacts.html
  - spec/upton_spec.rb
  - spec/spec_helper.rb
  - spec/upton_downloader_spec.rb
+ has_rdoc: true
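Note the dependency shifts recorded above: rest-client moves from `>= 1.6, ~> 2.0` to a tight `~> 1.6.7` pin, while nokogiri's runtime constraint relaxes from `~> 1.5` to unconstrained, with a separate development dependency at `>= 1.5.1`. Since this is a prerelease version, it is never selected by an open-ended requirement; a hypothetical Gemfile entry must name it exactly:

# Gemfile (illustrative): prereleases like 1.0.0.prea are only resolved
# when the requirement itself includes a prerelease version.
source 'https://rubygems.org'
gem 'upton', '1.0.0.prea'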