upton 0.3.0 → 0.3.1
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +7 -0
- data/lib/upton/downloader.rb +21 -1
- data/lib/upton/scraper.rb +330 -0
- data/lib/upton/version.rb +1 -1
- data/lib/upton.rb +7 -4
- metadata +24 -43
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 892592f6c890ecd94fb1bdf3b8cc500e813ebfa3
+  data.tar.gz: 95d10ea4c37aaec611c76dc98c45dd449a1ac35d
+SHA512:
+  metadata.gz: f112a48ed90264ac5e111e48b45e6b67468793059f613385faa87bd6ab5122a7f11c532358daf7382c689e42b66db022bfce24ffd0b32ffe15619de0a026df77
+  data.tar.gz: 8fcbd1276ea6e284481d0395de5a1f73c07a2acb159edab6775c875d3dbf76e2b3c16c6b46b2aa48aec345fbd4950a5af179a6e560a1af5923e5087a8c6a648b
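The checksums.yaml file is new in this release because RubyGems 2.x records SHA1 and SHA512 digests of the gem's metadata.gz and data.tar.gz archives. A minimal sketch of reproducing such a digest locally, assuming the two archives have been extracted from upton-0.3.1.gem into the current directory (the paths are hypothetical):

    require 'digest'

    # Compute the same digests recorded in checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |archive|
      puts "SHA1   #{archive}: #{Digest::SHA1.file(archive).hexdigest}"
      puts "SHA512 #{archive}: #{Digest::SHA512.file(archive).hexdigest}"
    end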
data/lib/upton/downloader.rb
CHANGED
@@ -2,6 +2,7 @@ require "fileutils"
 require "open-uri"
 require "tmpdir"
 require "restclient"
+require_relative "./version"
 
 module Upton
 
@@ -88,11 +89,30 @@ module Upton
         puts "Writing #{uri} data to the cache"
       end
     end
-
+      commented_resp = add_comment(resp)
+      open(cached_file, 'w'){|f| f << commented_resp}
     end
     {:resp => resp, :from_resource => from_resource }
   end
 
+  def add_comment(resp)
+    # n = Nokogiri::HTML("<html></html>")
+    # c = Nokogiri::XML::Comment.new(n, "asdfasdf")
+    # n.root.add_child(c)
+    # <!----Retrieved by Upton from http://www.somesite.com on January 15 at 4:28 p.m.-->
+    msg = "Stashed file retrieved by Upton #{Upton::VERSION} from #{@uri} at #{Time.now}"
+    resp_html = Nokogiri::HTML(resp)
+    comment = Nokogiri::XML::Comment.new(resp_html, msg)
+    if resp_html.root.nil?
+      return resp
+    elsif resp_html.root.children.empty?
+      resp_html.root.add_child(comment)
+    else
+      resp_html.root.children.before(comment)
+    end
+    resp_html.to_html
+  end
+
   def cache_enabled?
     !!@cache
   end
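The new add_comment helper stamps each stashed page with a provenance comment before it is written to the cache: it parses the response with Nokogiri and inserts the comment as the first child of the document root, falling back to the raw response when there is no root. A standalone sketch of the same Nokogiri calls, with a made-up message string:

    require 'nokogiri'

    doc = Nokogiri::HTML("<html><body><p>hello</p></body></html>")
    comment = Nokogiri::XML::Comment.new(doc, " retrieved by Upton 0.3.1 ")
    doc.root.children.before(comment)  # insert before the root's first child
    puts doc.to_html  # the comment is now the first node inside <html>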
data/lib/upton/scraper.rb
ADDED
@@ -0,0 +1,330 @@
+require 'uri'
+require 'nokogiri'
+require_relative './downloader'
+
+module Upton
+  # Upton::Scraper can be used as-is for basic use-cases by:
+  # 1. specifying the pages to be scraped in `new` as an index page
+  #    or as an Array of URLs.
+  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
+  #    block from Upton::Utils.
+  # For more complicated cases, subclass Upton::Scraper,
+  # e.g. +MyScraper < Upton::Scraper+, and override various methods.
+  ##
+  class Scraper
+    EMPTY_STRING = ''
+
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+                  :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+                  :pagination_interval
+
+    ##
+    # This is the main user-facing method for a basic scraper.
+    # Call +scrape+ with a block; this block will be called on
+    # the text of each instance page (and optionally, its URL and its index
+    # in the list of instance URLs returned by +get_index+).
+    ##
+    def scrape(&blk)
+      self.url_array = self.get_index unless self.url_array
+      self.scrape_from_list(self.url_array, blk)
+    end
+
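For reference while reading the new file, this is the basic calling pattern the class supports. A minimal sketch, assuming a hypothetical index page whose article links match the CSS selector "a.article-link":

    require 'upton'
    require 'nokogiri'

    scraper = Upton::Scraper.new("http://example.com/articles", "a.article-link")
    scraper.verbose = true
    headlines = scraper.scrape do |html, url, index|
      # called once per instance page; unused trailing block args are simply dropped
      Nokogiri::HTML(html).search("h1").text.strip
    end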
+    ##
+    # +index_url_or_array+: A list of string URLs, OR
+    # the URL of the page containing the list of instances.
+    # +selector+: The XPath expression or CSS selector that specifies the
+    # anchor elements within the page, if a url is specified for
+    # the previous argument.
+    #
+    # These options are a shortcut. If you plan to override +get_index+, you
+    # do not need to set them.
+    # If you don't specify a selector, the first argument will be treated as a
+    # list of URLs.
+    ##
+    def initialize(index_url_or_array, selector="")
+
+      # if the first arg is a valid URL, do the already-written stuff;
+      # if it's not (or if it's a list?), don't bother with get_index, etc.
+      # e.g. Scraper.new(["http://jeremybmerrill.com"])
+
+      # TODO: rewrite this, because it's a little silly (i.e. there should be a more sensible division of how these arguments work)
+      if index_url_or_array.respond_to? :each_with_index
+        @url_array = index_url_or_array
+      else
+        @index_url = index_url_or_array
+        @index_selector = selector
+      end
+
+      # If true, then Upton prints information about when it gets
+      # files from the internet and when it gets them from its stash.
+      @verbose = false
+
+      # If true, then Upton fetches each instance page only once;
+      # future requests for that file are responded to with the locally stashed
+      # version.
+      # You may want to set @debug to false for production (but maybe not).
+      # You can also control stashing behavior on a per-call basis with the
+      # optional second argument to get_page if, for instance, you want to
+      # stash certain instance pages, e.g. based on their modification date.
+      @debug = true
+      # Index debug does the same, but for index pages.
+      @index_debug = false
+
+      # In order to not hammer servers, Upton waits, by default, 30
+      # seconds between requests to the remote server.
+      @sleep_time_between_requests = 30 # seconds
+
+      # If true, then Upton will attempt to scrape paginated index pages.
+      @paginated = false
+      # Default query string parameter used to specify the current page.
+      @pagination_param = 'page'
+      # Default number of paginated pages to scrape.
+      @pagination_max_pages = 2
+      # Default starting number for pagination (second page is this plus 1).
+      @pagination_start_index = 1
+      # Default value to increment the page number by.
+      @pagination_interval = 1
+
+      # Folder name for stashes, if you want them to be stored somewhere else,
+      # e.g. under /tmp.
+      if @stash_folder
+        FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
+      end
+    end
+
+    ##
+    # If instance pages are paginated, <b>you must override</b>
+    # this method to return the next URL, given the current URL and its index.
+    #
+    # If instance pages aren't paginated, there's no need to override this.
+    #
+    # Recursion stops if the fetched URL returns an empty string or an error.
+    #
+    # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
+    # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
+    ##
+    def next_instance_page_url(url, pagination_index)
+      EMPTY_STRING
+    end
+
+    ##
+    # Return the next URL to scrape, given the current URL and its index.
+    #
+    # Recursion stops if the fetched URL returns an empty string or an error.
+    #
+    # If @paginated is not set (the default), this method returns an empty string.
+    #
+    # If @paginated is set, this method will return the next pagination URL
+    # to scrape using @pagination_param and the pagination_index.
+    #
+    # If the pagination_index is greater than @pagination_max_pages, then the
+    # method will return an empty string.
+    #
+    # Override this method to handle pagination in an alternative way,
+    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
+    # ought to return "http://whatever.com/articles?page=2"
+    ##
+    def next_index_page_url(url, pagination_index)
+      return EMPTY_STRING unless @paginated
+
+      if pagination_index > @pagination_max_pages
+        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+        EMPTY_STRING
+      else
+        uri = URI.parse(url)
+        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+        # update the pagination query string parameter
+        query[@pagination_param] = pagination_index
+        uri.query = URI.encode_www_form(query)
+        puts "Next index pagination url is #{uri}" if @verbose
+        uri.to_s
+      end
+    end
+
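next_index_page_url only rewrites the query string, so its behavior is easy to check in isolation. A sketch against a hypothetical paginated index (example.com and the selector are made up):

    scraper = Upton::Scraper.new("http://example.com/articles", "a")
    scraper.paginated = true
    scraper.pagination_max_pages = 3

    scraper.next_index_page_url("http://example.com/articles?page=1", 2)
    # => "http://example.com/articles?page=2"
    scraper.next_index_page_url("http://example.com/articles?page=3", 4)
    # => "" (past pagination_max_pages, so index fetching stops)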
+    ##
+    # Writes the scraped result to a CSV at the given filename.
+    ##
+    def scrape_to_csv filename, &blk
+      require 'csv'
+      self.url_array = self.get_index unless self.url_array
+      CSV.open filename, 'wb' do |csv|
+        # this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        # self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+
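The document[0].respond_to? :map check means a block may return either one row (a flat array) or a list of rows (an array of arrays). Reusing the scraper from the earlier sketch, with a hypothetical filename, a block that emits one row per instance page:

    scraper.scrape_to_csv("headlines.csv") do |html, url|
      doc = Nokogiri::HTML(html)
      [url, doc.search("h1").text.strip]  # a flat array: written as a single CSV row
    end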
+    def scrape_to_tsv filename, &blk
+      require 'csv'
+      self.url_array = self.get_index unless self.url_array
+      CSV.open filename, 'wb', :col_sep => "\t" do |csv|
+        # this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
+          if document[0].respond_to? :map
+            document.each{|row| csv << row }
+          else
+            csv << document
+          end
+        end
+        # self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
+      end
+    end
+
+    protected
+
+    ##
+    # Handles getting pages with Downloader, which handles stashing.
+    ##
+    def get_page(url, stash=false, options={})
+      return EMPTY_STRING if url.nil? || url.empty? # url is nil if the <a> lacks an `href` attribute.
+      global_options = {
+        :cache => stash,
+        :verbose => @verbose
+      }
+      if @readable_filenames
+        global_options[:readable_filenames] = true
+      end
+      if @stash_folder
+        global_options[:readable_filenames] = true
+        global_options[:cache_location] = @stash_folder
+      end
+      resp_and_cache = Downloader.new(url, global_options.merge(options)).get
+      if resp_and_cache[:from_resource]
+        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+        sleep @sleep_time_between_requests
+      end
+      resp_and_cache[:resp]
+    end
+
+
+    ##
+    # Sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html";
+    # resolve_url resolves them to absolute urls.
+    # absolute_url_str must be a string or URI that represents an absolute URL.
+    ##
+    def resolve_url(href_str, absolute_url_str)
+      if absolute_url_str.class <= URI::Generic
+        absolute_url = absolute_url_str.dup
+      else
+        begin
+          absolute_url = URI(absolute_url_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{absolute_url_str} must represent a valid relative or absolute URI"
+        end
+      end
+      raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
+      if href_str.class <= URI::Generic
+        href = href_str.dup
+      else
+        begin
+          href = URI(href_str).dup
+        rescue URI::InvalidURIError
+          raise ArgumentError, "#{href_str} must represent a valid relative or absolute URI"
+        end
+      end
+
+      # return href if href is already absolute
+      return href.to_s if href.absolute?
+
+      # TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
+      URI.join(absolute_url.to_s, href.to_s).to_s
+    end
+
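resolve_url is protected, but its contract is simple: join a possibly relative href against a known absolute base, passing absolute hrefs through untouched. A sketch using send to reach the protected method on the same hypothetical scraper (URLs are made up):

    scraper.send(:resolve_url, "about.html", "http://example.com/articles/index.html")
    # => "http://example.com/articles/about.html"
    scraper.send(:resolve_url, "http://other.example/x", "http://example.com/")
    # => "http://other.example/x" (already absolute, returned as-is)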
+    ##
+    # Return a list of URLs for the instances you want to scrape.
+    # This can optionally be overridden if, for example, the list of instances
+    # comes from an API.
+    ##
+    def get_index
+      index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
+    end
+
+    # TODO: Not sure the best way to handle this.
+    # Currently, #parse_index is called upon #get_index_pages,
+    # which itself is dependent on @index_url.
+    # Does @index_url stay unaltered for the lifetime of the Upton instance?
+    # It seems to at this point, but that may be something that gets
+    # deprecated later.
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
+    # to make sure that this method returns absolute urls,
+    # i.e. this method expects @index_url to always have an absolute address
+    # for the lifetime of an Upton instance.
+    def parse_index(text, selector)
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+        href = a_element["href"]
+        resolved_url = resolve_url(href, @index_url) unless href.nil?
+        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+        resolved_url
+      end
+    end
+
+
+    ##
+    # Returns the concatenated output of each member of a paginated index,
+    # e.g. a site listing links with 2+ pages.
+    ##
+    def get_index_pages(url, pagination_index, pagination_interval, options={})
+      resps = [self.get_page(url, @index_debug, options)]
+      prev_url = url
+      while !resps.last.empty?
+        pagination_index += pagination_interval
+        next_url = self.next_index_page_url(url, pagination_index)
+        next_url = resolve_url(next_url, url)
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @index_debug, options).to_s
+        prev_url = next_url
+        resps << next_resp
+      end
+      resps
+    end
+
+    ##
+    # Returns the instance at `url`.
+    #
+    # If the page is stashed, returns that; otherwise, fetches it from the web.
+    #
+    # If an instance is paginated, returns the concatenated output of each
+    # page, e.g. if a news article has two pages.
+    ##
+    def get_instance(url, pagination_index=0, options={})
+      resps = [self.get_page(url, @debug, options)]
+      pagination_index = pagination_index.to_i
+      prev_url = url
+      while !resps.last.empty?
+        next_url = self.next_instance_page_url(url, pagination_index + 1)
+        break if next_url == prev_url || next_url.empty?
+
+        next_resp = self.get_page(next_url, @debug, options)
+        prev_url = next_url
+        resps << next_resp
+      end
+      resps
+    end
+
+    # Just a helper for +scrape+.
+    def scrape_from_list(list, blk)
+      puts "Scraping #{list.size} instances" if @verbose
+      list.each_with_index.map do |instance_url, instance_index|
+        instance_resps = get_instance instance_url, nil, :instance_index => instance_index
+        instance_resps.each_with_index.map do |instance_resp, pagination_index|
+          blk.call(instance_resp, instance_url, instance_index, pagination_index)
+        end
+      end.flatten(1)
+    end
+
+    # It's often useful to have this slug method for uniquely (almost certainly) identifying pages.
+    def slug(url)
+      url.split("/")[-1].gsub(/\?.*/, "").gsub(/.html.*/, "")
+    end
+
+  end
+end
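For paginated instance pages, the class expects a subclass to supply next_instance_page_url, as the comments above note. A sketch of such an override, for a hypothetical site that paginates articles with a ?page= parameter and at most three pages:

    class ArticleScraper < Upton::Scraper
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 3
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end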
data/lib/upton/version.rb
CHANGED
data/lib/upton.rb
CHANGED
@@ -35,7 +35,8 @@ module Upton
   EMPTY_STRING = ''
 
   attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
-    :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames
+    :paginated, :pagination_param, :pagination_max_pages, :pagination_start_index, :readable_filenames,
+    :pagination_interval
 
   ##
   # This is the main user-facing method for a basic scraper.
@@ -101,6 +102,8 @@ module Upton
     @pagination_max_pages = 2
     # Default starting number for pagination (second page is this plus 1).
     @pagination_start_index = 1
+    # Default value to increment page number by
+    @pagination_interval = 1
 
     # Folder name for stashes, if you want them to be stored somewhere else,
     # e.g. under /tmp.
@@ -260,7 +263,7 @@ module Upton
   # comes from an API.
   ##
   def get_index
-    index_pages = get_index_pages(@index_url, @pagination_start_index).map{|page| parse_index(page, @index_selector) }.flatten
+    index_pages = get_index_pages(@index_url, @pagination_start_index, @pagination_interval).map{|page| parse_index(page, @index_selector) }.flatten
   end
 
   # TODO: Not sure the best way to handle this
@@ -288,11 +291,11 @@ module Upton
   # Returns the concatenated output of each member of a paginated index,
   # e.g. a site listing links with 2+ pages.
   ##
-  def get_index_pages(url, pagination_index, options={})
+  def get_index_pages(url, pagination_index, pagination_interval, options={})
     resps = [self.get_page(url, @index_debug, options)]
     prev_url = url
     while !resps.last.empty?
-      pagination_index += 1
+      pagination_index += pagination_interval
       next_url = self.next_index_page_url(url, pagination_index)
       next_url = resolve_url(next_url, url)
       break if next_url == prev_url || next_url.empty?
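The new @pagination_interval exists for indexes whose pagination parameter advances by something other than 1, e.g. an offset-style ?start= parameter. A sketch under that assumption (the site, selector, and parameter name are hypothetical):

    scraper = Upton::Scraper.new("http://example.com/results", "a.result")
    scraper.paginated = true
    scraper.pagination_param = 'start'
    scraper.pagination_start_index = 0
    scraper.pagination_interval = 10   # subsequent index pages use start=10, start=20, ...
    scraper.pagination_max_pages = 20  # compared against the raw parameter value, not a page count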
metadata
CHANGED
@@ -1,116 +1,102 @@
 --- !ruby/object:Gem::Specification
 name: upton
 version: !ruby/object:Gem::Version
-  version: 0.3.0
-  prerelease:
+  version: 0.3.1
 platform: ruby
 authors:
 - Jeremy B. Merrill
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rack
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: webmock
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: thin
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.1
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: 1.5.1
 - !ruby/object:Gem::Dependency
   name: yard
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: rest-client
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -118,7 +104,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
@@ -126,33 +111,29 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: mechanize
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - -
+    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
 description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -163,6 +144,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/upton.rb
+- lib/upton/scraper.rb
 - lib/upton/utils.rb
 - lib/upton/downloader.rb
 - lib/upton/version.rb
@@ -181,27 +163,26 @@ files:
 homepage: http://github.org/propublica/upton
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - -
+  - - '>='
    - !ruby/object:Gem::Version
      version: 1.9.2
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - -
+  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version:
+rubygems_version: 2.0.14
 signing_key:
-specification_version:
+specification_version: 4
 summary: A simple web-scraping framework
 test_files:
 - spec/data/prosecutor.html