upton 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 892592f6c890ecd94fb1bdf3b8cc500e813ebfa3
+   data.tar.gz: 95d10ea4c37aaec611c76dc98c45dd449a1ac35d
+ SHA512:
+   metadata.gz: f112a48ed90264ac5e111e48b45e6b67468793059f613385faa87bd6ab5122a7f11c532358daf7382c689e42b66db022bfce24ffd0b32ffe15619de0a026df77
+   data.tar.gz: 8fcbd1276ea6e284481d0395de5a1f73c07a2acb159edab6775c875d3dbf76e2b3c16c6b46b2aa48aec345fbd4950a5af179a6e560a1af5923e5087a8c6a648b
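These digests cover the two payloads packed inside the .gem archive. A minimal Ruby sketch for checking them locally (it assumes metadata.gz and data.tar.gz have already been extracted from upton-0.3.1.gem into the current directory):

    require 'digest'

    # Assumed local copies of the gem's two payloads.
    %w[metadata.gz data.tar.gz].each do |payload|
      puts "#{payload} SHA1:   #{Digest::SHA1.file(payload).hexdigest}"
      puts "#{payload} SHA512: #{Digest::SHA512.file(payload).hexdigest}"
    end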
data/lib/upton/downloader.rb CHANGED
@@ -2,6 +2,7 @@ require "fileutils"
  require "open-uri"
3
3
  require "tmpdir"
4
4
  require "restclient"
5
+ require_relative "./version"
5
6
 
6
7
  module Upton
7
8
 
@@ -88,11 +89,30 @@ module Upton
88
89
  puts "Writing #{uri} data to the cache"
89
90
  end
90
91
  end
91
- open(cached_file, 'w'){|f| f << resp}
92
+ commented_resp = add_comment(resp)
93
+ open(cached_file, 'w'){|f| f << commented_resp}
92
94
  end
93
95
  {:resp => resp, :from_resource => from_resource }
94
96
  end
95
97
 
98
+ def add_comment(resp)
99
+ # n = Nokogiri::HTML("<html></html>")
100
+ # c = Nokogiri::XML::Comment.new(n, "asdfasdf")
101
+ # n.root.add_child(c)
102
+ # <!----Retrieved by Upton from http://www.somesite.com on January 15 at 4:28 p.m.-->
103
+ msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
104
+ resp_html = Nokogiri::HTML(resp)
105
+ comment = Nokogiri::XML::Comment.new(resp_html, msg)
106
+ if resp_html.root.nil?
107
+ return resp
108
+ elsif resp_html.root.children.empty?
109
+ resp_html.root.add_child(comment)
110
+ else
111
+ resp_html.root.children.before(comment)
112
+ end
113
+ resp_html.to_html
114
+ end
115
+
96
116
  def cache_enabled?
97
117
  !!@cache
98
118
  end
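The add_comment change above stamps each stashed page with a provenance comment (Upton version, source URL, timestamp) before it is written to the cache. A standalone sketch of the same Nokogiri technique; the HTML, URL, and version string here are made up for illustration:

    require 'nokogiri'

    doc = Nokogiri::HTML("<html><body><p>hello</p></body></html>")
    msg = "Stashed file retrieved by Upton 0.3.1 from http://example.com at #{Time.now}"
    comment = Nokogiri::XML::Comment.new(doc, msg)

    # Mirrors the `children.before(comment)` branch: the comment becomes
    # the first child of <html>, ahead of <body>.
    doc.root.children.before(comment)
    puts doc.to_html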
data/lib/upton/scraper.rb ADDED
@@ -0,0 +1,330 @@
+ require 'uri'
+ require 'nokogiri'
+ require_relative './downloader'
+
+ module Upton
+ # Upton::Scraper can be used as-is for basic use-cases by:
+ # 1. specifying the pages to be scraped in `new` as an index page
+ # or as an Array of URLs.
+ # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
+ # block from Upton::Utils.
+ # For more complicated cases, subclass Upton::Scraper
+ # e.g. +MyScraper < Upton::Scraper+ and override various methods.
+ ##
+ class Scraper
+ EMPTY_STRING = ''
+
+ attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+ :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+ :pagination_interval
+
+ ##
+ # This is the main user-facing method for a basic scraper.
+ # Call +scrape+ with a block; this block will be called on
+ # the text of each instance page (and optionally, its URL and its index
+ # in the list of instance URLs returned by +get_index+).
+ ##
+ def scrape(&blk)
+ self.url_array = self.get_index unless self.url_array
+ self.scrape_from_list(self.url_array, blk)
+ end
+
+ ##
+ # +index_url_or_array+: A list of string URLs, OR
+ # the URL of the page containing the list of instances.
+ # +selector+: The XPath expression or CSS selector that specifies the
+ # anchor elements within the page, if a url is specified for
+ # the previous argument.
+ #
+ # These options are a shortcut. If you plan to override +get_index+, you
+ # do not need to set them.
+ # If you don't specify a selector, the first argument will be treated as a
+ # list of URLs.
+ ##
+ def initialize(index_url_or_array, selector="")
+
+ #if first arg is a valid URL, do already-written stuff;
+ #if it's not (or if it's a list?) don't bother with get_index, etc.
+ #e.g. Scraper.new(["http://jeremybmerrill.com"])
+
+ #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
+ if index_url_or_array.respond_to? :each_with_index
+ @url_array = index_url_or_array
+ else
+ @index_url = index_url_or_array
+ @index_selector = selector
+ end
+
+ # If true, then Upton prints information about when it gets
+ # files from the internet and when it gets them from its stash.
+ @verbose = false
+
+ # If true, then Upton fetches each instance page only once;
+ # future requests for that file are responded to with the locally stashed
+ # version.
+ # You may want to set @debug to false for production (but maybe not).
+ # You can also control stashing behavior on a per-call basis with the
+ # optional second argument to get_page, if, for instance, you want to
+ # stash certain instance pages, e.g. based on their modification date.
+ @debug = true
+ # Index debug does the same, but for index pages.
+ @index_debug = false
+
+ # In order to not hammer servers, Upton waits for, by default, 30
+ # seconds between requests to the remote server.
+ @sleep_time_between_requests = 30 #seconds
+
+ # If true, then Upton will attempt to scrape paginated index pages
+ @paginated = false
+ # Default query string parameter used to specify the current page
+ @pagination_param = 'page'
+ # Default number of paginated pages to scrape
+ @pagination_max_pages = 2
+ # Default starting number for pagination (second page is this plus 1).
+ @pagination_start_index = 1
+ # Default value to increment page number by
+ @pagination_interval = 1
+
+ # Folder name for stashes, if you want them to be stored somewhere else,
+ # e.g. under /tmp.
+ if @stash_folder
+ FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+ end
+ end
+
+ ##
+ # If instance pages are paginated, <b>you must override</b>
+ # this method to return the next URL, given the current URL and its index.
+ #
+ # If instance pages aren't paginated, there's no need to override this.
+ #
+ # Recursion stops if the fetching URL returns an empty string or an error.
+ #
+ # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
+ # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
+ ##
+ def next_instance_page_url(url, pagination_index)
+ EMPTY_STRING
+ end
+
+ ##
+ # Return the next URL to scrape, given the current URL and its index.
+ #
+ # Recursion stops if the fetching URL returns an empty string or an error.
+ #
+ # If @paginated is not set (the default), this method returns an empty string.
+ #
+ # If @paginated is set, this method will return the next pagination URL
+ # to scrape using @pagination_param and the pagination_index.
+ #
+ # If the pagination_index is greater than @pagination_max_pages, then the
+ # method will return an empty string.
+ #
+ # Override this method to handle pagination in an alternative way,
+ # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
+ # ought to return "http://whatever.com/articles?page=2"
+ #
+ ##
+ def next_index_page_url(url, pagination_index)
+ return EMPTY_STRING unless @paginated
+
+ if pagination_index > @pagination_max_pages
+ puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+ EMPTY_STRING
+ else
+ uri = URI.parse(url)
+ query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+ # update the pagination query string parameter
+ query[@pagination_param] = pagination_index
+ uri.query = URI.encode_www_form(query)
+ puts "Next index pagination url is #{uri}" if @verbose
+ uri.to_s
+ end
+ end
+
+ ##
+ # Writes the scraped result to a CSV at the given filename.
+ ##
+ def scrape_to_csv filename, &blk
+ require 'csv'
+ self.url_array = self.get_index unless self.url_array
+ CSV.open filename, 'wb' do |csv|
+ #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
+ if document[0].respond_to? :map
+ document.each{|row| csv << row }
+ else
+ csv << document
+ end
+ end
+ #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+ end
+ end
+
+ def scrape_to_tsv filename, &blk
+ require 'csv'
+ self.url_array = self.get_index unless self.url_array
+ CSV.open filename, 'wb', :col_sep => "\t" do |csv|
+ #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
+ if document[0].respond_to? :map
+ document.each{|row| csv << row }
+ else
+ csv << document
+ end
+ end
+ #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+ end
+ end
+
+ protected
+
+ ##
+ # Handles getting pages with Downloader, which handles stashing.
+ ##
+ def get_page(url, stash=false, options={})
+ return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
+ global_options = {
+ :cache => stash,
+ :verbose => @verbose
+ }
+ if @readable_filenames
+ global_options[:readable_filenames] = true
+ end
+ if @stash_folder
+ global_options[:readable_filenames] = true
+ global_options[:cache_location] = @stash_folder
+ end
+ resp_and_cache = Downloader.new(url, global_options.merge(options)).get
+ if resp_and_cache[:from_resource]
+ puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+ sleep @sleep_time_between_requests
+ end
+ resp_and_cache[:resp]
+ end
+
+
+ ##
+ # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
+ # resolve_url resolves them to absolute urls.
+ # absolute_url_str must be a URL, as a string or URI, that represents an absolute URL
+ ##
+ def resolve_url(href_str, absolute_url_str)
+ if absolute_url_str.class <= URI::Generic
+ absolute_url = absolute_url_str.dup
+ else
+ begin
+ absolute_url = URI(absolute_url_str).dup
+ rescue URI::InvalidURIError
+ raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
+ end
+ end
+ raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
+ if href_str.class <= URI::Generic
+ href = href_str.dup
+ else
+ begin
+ href = URI(href_str).dup
+ rescue URI::InvalidURIError
+ raise ArgumentError, "#{href_str} must represent a valid relative or absolute URI"
+ end
+ end
+
+ # return :href if :href is already absolute
+ return href.to_s if href.absolute?
+
+ #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
+ URI.join(absolute_url.to_s, href.to_s).to_s
+ end
+
+ ##
+ # Return a list of URLs for the instances you want to scrape.
+ # This can optionally be overridden if, for example, the list of instances
+ # comes from an API.
+ ##
+ def get_index
+ index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
+ end
+
+ # TODO: Not sure the best way to handle this
+ # Currently, #parse_index is called upon #get_index_pages,
+ # which itself is dependent on @index_url
+ # Does @index_url stay unaltered for the lifetime of the Upton instance?
+ # It seems to at this point, but that may be something that gets
+ # deprecated later
+ #
+ # So for now, @index_url is used in conjunction with resolve_url
+ # to make sure that this method returns absolute urls
+ # i.e. this method expects @index_url to always have an absolute address
+ # for the lifetime of an Upton instance
+ def parse_index(text, selector)
+ Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+ href = a_element["href"]
+ resolved_url = resolve_url( href, @index_url) unless href.nil?
+ puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+ resolved_url
+ end
+ end
+
+
+ ##
+ # Returns the concatenated output of each member of a paginated index,
+ # e.g. a site listing links with 2+ pages.
+ ##
+ def get_index_pages(url, pagination_index, pagination_interval, options={})
+ resps = [self.get_page(url, @index_debug, options)]
+ prev_url = url
+ while !resps.last.empty?
+ pagination_index += pagination_interval
+ next_url = self.next_index_page_url(url, pagination_index)
+ next_url = resolve_url(next_url, url)
+ break if next_url == prev_url || next_url.empty?
+
+ next_resp = self.get_page(next_url, @index_debug, options).to_s
+ prev_url = next_url
+ resps << next_resp
+ end
+ resps
+ end
+
+ ##
+ # Returns the instance at `url`.
+ #
+ # If the page is stashed, returns that, otherwise, fetches it from the web.
+ #
+ # If an instance is paginated, returns the concatenated output of each
+ # page, e.g. if a news article has two pages.
+ ##
+ def get_instance(url, pagination_index=0, options={})
+ resps = [self.get_page(url, @debug, options)]
+ pagination_index = pagination_index.to_i
+ prev_url = url
+ while !resps.last.empty?
+ next_url = self.next_instance_page_url(url, pagination_index + 1)
+ break if next_url == prev_url || next_url.empty?
+
+ next_resp = self.get_page(next_url, @debug, options)
+ prev_url = next_url
+ resps << next_resp
+ end
+ resps
+ end
+
+ # Just a helper for +scrape+.
+ def scrape_from_list(list, blk)
+ puts "Scraping #{list.size} instances" if @verbose
+ list.each_with_index.map do |instance_url, instance_index|
+ instance_resps = get_instance instance_url, nil, :instance_index => instance_index
+ instance_resps.each_with_index.map do |instance_resp, pagination_index|
+ blk.call(instance_resp, instance_url, instance_index, pagination_index)
+ end
+ end.flatten(1)
+ end
+
+ # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
+ def slug(url)
+ url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
+ end
+
+ end
+ end
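Based on the documentation comments in the class above, a typical invocation looks roughly like the sketch below; the URLs and the CSS selector are placeholders, not anything shipped with the gem:

    require 'upton'
    require 'nokogiri'

    # Index page plus a selector for the links to each instance page.
    scraper = Upton::Scraper.new("http://www.example.com/articles", "a.article-link")
    scraper.verbose = true
    scraper.sleep_time_between_requests = 5

    headlines = scraper.scrape do |html, url, index, pagination_index|
      Nokogiri::HTML(html).search("h1").text.strip
    end

    # Or bypass the index step entirely with an explicit list of URLs.
    Upton::Scraper.new(["http://www.example.com/a.html", "http://www.example.com/b.html"]).scrape do |html|
      # ...
    end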
data/lib/upton/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Upton # :nodoc:
- VERSION = '0.3.0'
+ VERSION = '0.3.1'
  end
data/lib/upton.rb CHANGED
@@ -35,7 +35,8 @@ module Upton
  EMPTY_STRING = ''
 
  attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
- :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames
+ :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+ :pagination_interval
 
  ##
  # This is the main user-facing method for a basic scraper.
@@ -101,6 +102,8 @@ module Upton
  @pagination_max_pages = 2
  # Default starting number for pagination (second page is this plus 1).
  @pagination_start_index = 1
+ # Default value to increment page number by
+ @pagination_interval = 1
 
  # Folder name for stashes, if you want them to be stored somewhere else,
  # e.g. under /tmp.
@@ -260,7 +263,7 @@ module Upton
  # comes from an API.
  ##
  def get_index
- index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
+ index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
  end
 
  # TODO: Not sure the best way to handle this
@@ -288,11 +291,11 @@ module Upton
  # Returns the concatenated output of each member of a paginated index,
  # e.g. a site listing links with 2+ pages.
  ##
- def get_index_pages(url, pagination_index, options={})
+ def get_index_pages(url, pagination_index, pagination_interval, options={})
  resps = [self.get_page(url, @index_debug, options)]
  prev_url = url
  while !resps.last.empty?
- pagination_index += 1
+ pagination_index += pagination_interval
  next_url = self.next_index_page_url(url, pagination_index)
  next_url = resolve_url(next_url, url)
  break if next_url == prev_url || next_url.empty?
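The new pagination_interval accessor, threaded through get_index into get_index_pages, lets the index walk advance by a configurable step instead of always incrementing by 1. A usage sketch under assumed conditions (placeholder URL and selector, an index whose query parameter is an offset rather than a page number):

    scraper = Upton::Scraper.new("http://www.example.com/archive?offset=0", "a.headline")
    scraper.paginated = true
    scraper.pagination_param = 'offset'
    scraper.pagination_start_index = 0
    scraper.pagination_interval = 10   # each successive index page advances the offset by 10
    scraper.pagination_max_pages = 50  # stop once the offset exceeds 50
    scraper.scrape { |html, url| puts url }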
metadata CHANGED
@@ -1,116 +1,102 @@
  --- !ruby/object:Gem::Specification
  name: upton
  version: !ruby/object:Gem::Version
- version: 0.3.0
- prerelease:
+ version: 0.3.1
  platform: ruby
  authors:
  - Jeremy B. Merrill
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2013-12-22 00:00:00.000000000 Z
+ date: 2014-02-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: rack
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: rspec
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: webmock
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: thin
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: 1.5.1
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: 1.5.1
  - !ruby/object:Gem::Dependency
  name: yard
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: rest-client
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
@@ -118,7 +104,6 @@ dependencies:
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
  - - ~>
  - !ruby/object:Gem::Version
@@ -126,33 +111,29 @@ dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  - !ruby/object:Gem::Dependency
  name: mechanize
  requirement: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -163,6 +144,7 @@ extensions: []
  extra_rdoc_files: []
  files:
  - lib/upton.rb
+ - lib/upton/scraper.rb
  - lib/upton/utils.rb
  - lib/upton/downloader.rb
  - lib/upton/version.rb
@@ -181,27 +163,26 @@ files:
  homepage: http://github.org/propublica/upton
  licenses:
  - MIT
+ metadata: {}
  post_install_message:
  rdoc_options: []
  require_paths:
  - lib
  required_ruby_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: 1.9.2
  required_rubygems_version: !ruby/object:Gem::Requirement
- none: false
  requirements:
- - - ! '>='
+ - - '>='
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 1.8.23
+ rubygems_version: 2.0.14
  signing_key:
- specification_version: 3
+ specification_version: 4
  summary: A simple web-scraping framework
  test_files:
  - spec/data/prosecutor.html