upton 0.3.0 → 0.3.1
- checksums.yaml +7 -0
- data/lib/upton/downloader.rb +21 -1
- data/lib/upton/scraper.rb +330 -0
- data/lib/upton/version.rb +1 -1
- data/lib/upton.rb +7 -4
- metadata +24 -43
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 892592f6c890ecd94fb1bdf3b8cc500e813ebfa3
+  data.tar.gz: 95d10ea4c37aaec611c76dc98c45dd449a1ac35d
+SHA512:
+  metadata.gz: f112a48ed90264ac5e111e48b45e6b67468793059f613385faa87bd6ab5122a7f11c532358daf7382c689e42b66db022bfce24ffd0b32ffe15619de0a026df77
+  data.tar.gz: 8fcbd1276ea6e284481d0395de5a1f73c07a2acb159edab6775c875d3dbf76e2b3c16c6b46b2aa48aec345fbd4950a5af179a6e560a1af5923e5087a8c6a648b
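
The checksums above let a consumer verify a downloaded copy of the gem before trusting it. A minimal sketch of that check in Ruby, assuming upton-0.3.1.gem has already been unpacked so that checksums.yaml, metadata.gz, and data.tar.gz sit in the current directory (Digest and YAML are standard library):

    # Recompute the SHA512 digests and compare them to the stashed values.
    # A sketch only; the file names assume a locally unpacked gem.
    require "digest"
    require "yaml"

    checksums = YAML.load_file("checksums.yaml")

    %w[metadata.gz data.tar.gz].each do |name|
      actual   = Digest::SHA512.file(name).hexdigest
      expected = checksums["SHA512"][name]
      puts "#{name}: #{actual == expected ? 'ok' : 'MISMATCH'}"
    end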
data/lib/upton/downloader.rb
CHANGED
@@ -2,6 +2,7 @@ require "fileutils"
 require "open-uri"
 require "tmpdir"
 require "restclient"
+require_relative "./version"

 module Upton

@@ -88,11 +89,30 @@ module Upton
             puts "Writing #{uri} data to the cache"
           end
         end
-
+        commented_resp = add_comment(resp)
+        open(cached_file, 'w'){|f| f << commented_resp}
       end
       {:resp => resp, :from_resource => from_resource }
     end

+    def add_comment(resp)
+      # n = Nokogiri::HTML("<html></html>")
+      # c = Nokogiri::XML::Comment.new(n, "asdfasdf")
+      # n.root.add_child(c)
+      # <!----Retrieved by Upton from http://www.somesite.com on January 15 at 4:28 p.m.-->
+      msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
+      resp_html = Nokogiri::HTML(resp)
+      comment = Nokogiri::XML::Comment.new(resp_html, msg)
+      if resp_html.root.nil?
+        return resp
+      elsif resp_html.root.children.empty?
+        resp_html.root.add_child(comment)
+      else
+        resp_html.root.children.before(comment)
+      end
+      resp_html.to_html
+    end
+
     def cache_enabled?
       !!@cache
     end
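
The substantive change here is the new add_comment hook: before a page is written to the stash, Upton now stamps it with an HTML comment recording the Upton version, the source URL, and the retrieval time. A standalone sketch of the same Nokogiri calls, using placeholder values rather than invoking the private method itself:

    require "nokogiri"

    html = "<html><body><p>Hello</p></body></html>"
    msg  = "Stashed file retrieved by Upton 0.3.1 from http://example.com at #{Time.now}"

    doc     = Nokogiri::HTML(html)
    comment = Nokogiri::XML::Comment.new(doc, msg)
    doc.root.children.before(comment)  # the comment becomes <html>'s first child

    puts doc.to_html  # the stamped markup, as it would be written to the stash

Note the fallbacks in add_comment: if the parsed document has no root, the response is stashed unmodified; if the root has no children, the comment is appended as its only child.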
data/lib/upton/scraper.rb
ADDED
@@ -0,0 +1,330 @@
+require 'uri'
+require 'nokogiri'
+require_relative './downloader'
+
+module Upton
+  # Upton::Scraper can be used as-is for basic use-cases by:
+  # 1. specifying the pages to be scraped in `new` as an index page
+  #    or as an Array of URLs.
+  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
+  #    block from Upton::Utils.
+  # For more complicated cases, subclass Upton::Scraper,
+  # e.g. +MyScraper < Upton::Scraper+, and override various methods.
+  ##
+  class Scraper
+    EMPTY_STRING = ''
+
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+                  :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+                  :pagination_interval
+
+    ##
+    # This is the main user-facing method for a basic scraper.
+    # Call +scrape+ with a block; this block will be called on
+    # the text of each instance page (and optionally, its URL and its index
+    # in the list of instance URLs returned by +get_index+).
+    ##
+    def scrape(&blk)
+      self.url_array = self.get_index unless self.url_array
+      self.scrape_from_list(self.url_array, blk)
+    end
+
+    ##
+    # +index_url_or_array+: A list of string URLs, OR
+    #                       the URL of the page containing the list of instances.
+    # +selector+: The XPath expression or CSS selector that specifies the
+    #             anchor elements within the page, if a url is specified for
+    #             the previous argument.
+    #
+    # These options are a shortcut. If you plan to override +get_index+, you
+    # do not need to set them.
+    # If you don't specify a selector, the first argument will be treated as a
+    # list of URLs.
+    ##
+    def initialize(index_url_or_array, selector="")
+
+      #if first arg is a valid URL, do already-written stuff;
+      #if it's not (or if it's a list?) don't bother with get_index, etc.
+      #e.g. Scraper.new(["http://jeremybmerrill.com"])
+
+      #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
+      if index_url_or_array.respond_to? :each_with_index
+        @url_array = index_url_or_array
+      else
+        @index_url = index_url_or_array
+        @index_selector = selector
+      end
+
+      # If true, then Upton prints information about when it gets
+      # files from the internet and when it gets them from its stash.
+      @verbose = false
+
+      # If true, then Upton fetches each instance page only once;
+      # future requests for that file are responded to with the locally stashed
+      # version.
+      # You may want to set @debug to false for production (but maybe not).
+      # You can also control stashing behavior on a per-call basis with the
+      # optional second argument to get_page, if, for instance, you want to
+      # stash certain instance pages, e.g. based on their modification date.
+      @debug = true
+      # Index debug does the same, but for index pages.
+      @index_debug = false
+
+      # In order to not hammer servers, Upton waits for, by default, 30
+      # seconds between requests to the remote server.
+      @sleep_time_between_requests = 30 #seconds
+
+      # If true, then Upton will attempt to scrape paginated index pages
+      @paginated = false
+      # Default query string parameter used to specify the current page
+      @pagination_param = 'page'
+      # Default number of paginated pages to scrape
+      @pagination_max_pages = 2
+      # Default starting number for pagination (second page is this plus 1).
+      @pagination_start_index = 1
+      # Default value to increment page number by
+      @pagination_interval = 1
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      if @stash_folder
+        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      end
+    end
+
+    ##
+    # If instance pages are paginated, <b>you must override</b>
+    # this method to return the next URL, given the current URL and its index.
+    #
+    # If instance pages aren't paginated, there's no need to override this.
+    #
+    # Recursion stops if fetching the URL returns an empty string or an error.
+    #
+    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
+    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
+    ##
+    def next_instance_page_url(url, pagination_index)
+      EMPTY_STRING
+    end
+
+    ##
+    # Return the next URL to scrape, given the current URL and its index.
+    #
+    # Recursion stops if fetching the URL returns an empty string or an error.
+    #
+    # If @paginated is not set (the default), this method returns an empty string.
+    #
+    # If @paginated is set, this method will return the next pagination URL
+    # to scrape using @pagination_param and the pagination_index.
+    #
+    # If the pagination_index is greater than @pagination_max_pages, then the
+    # method will return an empty string.
+    #
+    # Override this method to handle pagination in an alternative way,
+    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
+    # ought to return "http://whatever.com/articles?page=2"
+    #
+    ##
+    def next_index_page_url(url, pagination_index)
+      return EMPTY_STRING unless @paginated
+
+      if pagination_index > @pagination_max_pages
+        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+        EMPTY_STRING
+      else
+        uri = URI.parse(url)
+        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+        # update the pagination query string parameter
+        query[@pagination_param] = pagination_index
+        uri.query = URI.encode_www_form(query)
+        puts "Next index pagination url is #{uri}" if @verbose
+        uri.to_s
+      end
+    end
+
+    ##
+    # Writes the scraped result to a CSV at the given filename.
+    ##
+    def scrape_to_csv filename, &blk
+      require 'csv'
+      self.url_array = self.get_index unless self.url_array
+      CSV.open filename, 'wb' do |csv|
+        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+
+    def scrape_to_tsv filename, &blk
+      require 'csv'
+      self.url_array = self.get_index unless self.url_array
+      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
+        #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+
+    protected
+
+    ##
+    # Handles getting pages with Downloader, which handles stashing.
+    ##
+    def get_page(url, stash=false, options={})
+      return EMPTY_STRING if url.nil? || url.empty? #url is nil if the <a> lacks an `href` attribute.
+      global_options = {
+        :cache => stash,
+        :verbose => @verbose
+      }
+      if @readable_filenames
+        global_options[:readable_filenames] = true
+      end
+      if @stash_folder
+        global_options[:readable_filenames] = true
+        global_options[:cache_location] = @stash_folder
+      end
+      resp_and_cache = Downloader.new(url, global_options.merge(options)).get
+      if resp_and_cache[:from_resource]
+        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+        sleep @sleep_time_between_requests
+      end
+      resp_and_cache[:resp]
+    end
+
+
+    ##
+    # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html";
+    # resolve_url resolves them to absolute urls.
+    # absolute_url_str must be a string or URI that represents an absolute URL.
+    ##
+    def resolve_url(href_str, absolute_url_str)
+      if absolute_url_str.class <= URI::Generic
+        absolute_url = absolute_url_str.dup
+      else
+        begin
+          absolute_url = URI(absolute_url_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
+        end
+      end
+      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
+      if href_str.class <= URI::Generic
+        href = href_str.dup
+      else
+        begin
+          href = URI(href_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{href_str} must represent a valid relative or absolute URI"
+        end
+      end
+
+      # return :href if :href is already absolute
+      return href.to_s if href.absolute?
+
+      #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
+      URI.join(absolute_url.to_s, href.to_s).to_s
+    end
+
+    ##
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    ##
+    def get_index
+      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
+    end
+
+    # TODO: Not sure the best way to handle this
+    # Currently, #parse_index is called upon #get_index_pages,
+    # which itself is dependent on @index_url
+    # Does @index_url stay unaltered for the lifetime of the Upton instance?
+    # It seems to at this point, but that may be something that gets
+    # deprecated later
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
+    # to make sure that this method returns absolute urls
+    # i.e. this method expects @index_url to always have an absolute address
+    # for the lifetime of an Upton instance
+    def parse_index(text, selector)
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+        href = a_element["href"]
+        resolved_url = resolve_url(href, @index_url) unless href.nil?
+        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+        resolved_url
+      end
+    end
+
+
+    ##
+    # Returns the concatenated output of each member of a paginated index,
+    # e.g. a site listing links with 2+ pages.
+    ##
+    def get_index_pages(url, pagination_index, pagination_interval, options={})
+      resps = [self.get_page(url, @index_debug, options)]
+      prev_url = url
+      while !resps.last.empty?
+        pagination_index += pagination_interval
+        next_url = self.next_index_page_url(url, pagination_index)
+        next_url = resolve_url(next_url, url)
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @index_debug, options).to_s
+        prev_url = next_url
+        resps << next_resp
+      end
+      resps
+    end
+
+    ##
+    # Returns the instance at `url`.
+    #
+    # If the page is stashed, returns that; otherwise, fetches it from the web.
+    #
+    # If an instance is paginated, returns the concatenated output of each
+    # page, e.g. if a news article has two pages.
+    ##
+    def get_instance(url, pagination_index=0, options={})
+      resps = [self.get_page(url, @debug, options)]
+      pagination_index = pagination_index.to_i
+      prev_url = url
+      while !resps.last.empty?
+        next_url = self.next_instance_page_url(url, pagination_index + 1)
+        break if next_url == prev_url || next_url.empty?

+        next_resp = self.get_page(next_url, @debug, options)
+        prev_url = next_url
+        resps << next_resp
+      end
+      resps
+    end
+
+    # Just a helper for +scrape+.
+    def scrape_from_list(list, blk)
+      puts "Scraping #{list.size} instances" if @verbose
+      list.each_with_index.map do |instance_url, instance_index|
+        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
+        instance_resps.each_with_index.map do |instance_resp, pagination_index|
+          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+        end
+      end.flatten(1)
+    end
+
+    # it's often useful to have this slug method for uniquely (almost certainly) identifying pages.
+    def slug(url)
+      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
+    end
+
+  end
+end
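
Taken together, the new class supports the two entry points its header comment describes: an index page plus a link selector, or an explicit list of URLs. A usage sketch (the site, selector, and block here are hypothetical):

    require "upton"
    require "nokogiri"

    # Index page plus a CSS selector matching the per-article links.
    scraper = Upton::Scraper.new("http://example.com/articles", "a.article-link")
    scraper.verbose = true
    scraper.sleep_time_between_requests = 5  # default is a polite 30 seconds

    # The block is called with each instance page's HTML (plus, optionally,
    # its URL and indices); the results are collected into an array.
    headlines = scraper.scrape do |html, url, instance_index, pagination_index|
      Nokogiri::HTML(html).css("h1").text
    end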
data/lib/upton/version.rb
CHANGED
data/lib/upton.rb
CHANGED
@@ -35,7 +35,8 @@ module Upton
     EMPTY_STRING = ''

     attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-      :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames
+      :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+      :pagination_interval

     ##
     # This is the main user-facing method for a basic scraper.
@@ -101,6 +102,8 @@ module Upton
       @pagination_max_pages = 2
       # Default starting number for pagination (second page is this plus 1).
       @pagination_start_index = 1
+      # Default value to increment page number by
+      @pagination_interval = 1

       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
@@ -260,7 +263,7 @@ module Upton
     # comes from an API.
     ##
     def get_index
-      index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
+      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
     end

     # TODO: Not sure the best way to handle this
@@ -288,11 +291,11 @@ module Upton
     # Returns the concatenated output of each member of a paginated index,
     # e.g. a site listing links with 2+ pages.
     ##
-    def get_index_pages(url, pagination_index, options={})
+    def get_index_pages(url, pagination_index, pagination_interval, options={})
       resps = [self.get_page(url, @index_debug, options)]
       prev_url = url
       while !resps.last.empty?
-        pagination_index += 1
+        pagination_index += pagination_interval
         next_url = self.next_index_page_url(url, pagination_index)
         next_url = resolve_url(next_url, url)
         break if next_url == prev_url || next_url.empty?
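
The substantive change in lib/upton.rb is threading the new @pagination_interval accessor through get_index_pages, so paginated indexes whose page parameter advances by more than 1 (offset-style pagination, for example) can be walked. A sketch of how it might be used (site and selector hypothetical):

    require "upton"

    scraper = Upton::Scraper.new("http://example.com/search?p=0", "a.result")
    scraper.paginated = true
    scraper.pagination_param = "p"
    scraper.pagination_start_index = 0
    scraper.pagination_interval = 20   # requests p=20, p=40, ...
    scraper.pagination_max_pages = 100 # compared against the p value itself

    result_urls = scraper.scrape { |html, url| url }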
metadata
CHANGED
@@ -1,116 +1,102 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.3.0
-  prerelease:
+  version: 0.3.1
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: 1.5.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -118,7 +104,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -126,33 +111,29 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -163,6 +144,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
+- lib/upton/scraper.rb
 - lib/upton/utils.rb
 - lib/upton/downloader.rb
 - lib/upton/version.rb
@@ -181,27 +163,26 @@ files:
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
       version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - -
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 2.0.14
 signing_key:
-specification_version:
+specification_version: 4
 summary: A simple web-scraping framework
 test_files:
 - spec/data/prosecutor.html
|