upton 0.2.6 → 0.2.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +6 -14
- data/lib/upton.rb +130 -35
- data/lib/utils.rb +32 -2
- data/{test → spec}/data/discussion.html +0 -0
- data/{test → spec}/data/easttimor.html +0 -0
- data/spec/data/propublica-relative.html +17 -0
- data/{test → spec}/data/propublica.html +0 -0
- data/{test → spec}/data/prosecutor.html +0 -0
- data/{test → spec}/data/sixfacts.html +0 -0
- data/{test → spec}/data/webinar.html +0 -0
- data/spec/upton_spec.rb +118 -0
- metadata +59 -29
- data/test/test_upton.rb +0 -141
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
MjFhYjI5OTYwZGVlYTNlMmNhYTc1OWQ5ZGJmMzBlN2FiM2U4MzllMDM4Nzhk
|
10
|
-
MjJkMTczOGZjNWUwNDMyYmFlOGRkZDlhNjFkM2RlMzM1YjFmZTgyZWQ4MTBj
|
11
|
-
MjY2ZmFiYmZlOTc5YmE2YzFjMWE1YjVjZWY2MWMyYTczZmEwNGU=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YmFjOTllZjdlNWIwNzhhMGIxODQwOTI1Y2EwY2YzMTE1YWEzOTdkMWI3NDEy
|
14
|
-
ZjA1OTE1N2Q0OGYwOWEyYjVjMDM3ZWQ1NzlhZmU3NDZlNTAxNDJmZWZjZGFm
|
15
|
-
YjUxMzc3ZThkZDg1ZDdkMjgwM2UyODMwZTZiMjdjZDAyNjAxNTQ=
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
|
4
|
+
data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
|
7
|
+
data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
|
data/lib/upton.rb
CHANGED
@@ -28,7 +28,7 @@ module Upton
|
|
28
28
|
# 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
|
29
29
|
# block from Upton::Utils.
|
30
30
|
# For more complicated cases; subclass Upton::Scraper
|
31
|
-
# e.g. +MyScraper < Upton::Scraper+ and
|
31
|
+
# e.g. +MyScraper < Upton::Scraper+ and override various methods.
|
32
32
|
##
|
33
33
|
class Scraper
|
34
34
|
|
@@ -53,28 +53,32 @@ module Upton
|
|
53
53
|
# +selector+: The XPath expression or CSS selector that specifies the
|
54
54
|
# anchor elements within the page, if a url is specified for
|
55
55
|
# the previous argument.
|
56
|
-
# +selector_method+:
|
56
|
+
# +selector_method+: Deprecated and ignored. Next breaking release will
|
57
|
+
# remove this option.
|
57
58
|
#
|
58
|
-
# These options are a shortcut. If you
|
59
|
+
# These options are a shortcut. If you plan to override +get_index+, you
|
59
60
|
# do not need to set them.
|
60
61
|
# If you don't specify a selector, the first argument will be treated as a
|
61
62
|
# list of URLs.
|
62
63
|
##
|
63
|
-
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
64
64
|
|
65
|
+
# DEPRECATION NOTE, re: selector_method
|
66
|
+
# the selector_method parameter is unneeded, as Nokogiri provides the
|
67
|
+
# #search method, which picks a selector depending on whether
|
68
|
+
# the String passed is of CSS/XPath notation
|
69
|
+
|
70
|
+
def initialize(index_url_or_array, selector="", selector_method=:deprecated)
|
71
|
+
|
65
72
|
#if first arg is a valid URL, do already-written stuff;
|
66
73
|
#if it's not (or if it's a list?) don't bother with get_index, etc.
|
67
74
|
#e.g. Scraper.new(["http://jeremybmerrill.com"])
|
68
75
|
|
69
76
|
#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
|
70
|
-
if
|
77
|
+
if index_url_or_array.respond_to? :each_with_index
|
71
78
|
@url_array = index_url_or_array
|
72
|
-
|
79
|
+
else
|
73
80
|
@index_url = index_url_or_array
|
74
81
|
@index_selector = selector
|
75
|
-
@index_selector_method = selector_method
|
76
|
-
else
|
77
|
-
raise ArgumentError
|
78
82
|
end
|
79
83
|
# If true, then Upton prints information about when it gets
|
80
84
|
# files from the internet and when it gets them from its stash.
|
@@ -97,9 +101,9 @@ module Upton
|
|
97
101
|
|
98
102
|
# Folder name for stashes, if you want them to be stored somewhere else,
|
99
103
|
# e.g. under /tmp.
|
100
|
-
@stash_folder
|
104
|
+
@stash_folder ||= "stashes"
|
101
105
|
unless Dir.exists?(@stash_folder)
|
102
|
-
|
106
|
+
FileUtils.mkdir_p(@stash_folder)
|
103
107
|
end
|
104
108
|
end
|
105
109
|
|
@@ -114,7 +118,7 @@ module Upton
|
|
114
118
|
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
|
115
119
|
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
|
116
120
|
##
|
117
|
-
def next_instance_page_url(url,
|
121
|
+
def next_instance_page_url(url, pagination_index)
|
118
122
|
""
|
119
123
|
end
|
120
124
|
|
@@ -129,7 +133,7 @@ module Upton
|
|
129
133
|
# e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
|
130
134
|
# ought to return "http://whatever.com/articles?page=2"
|
131
135
|
##
|
132
|
-
def next_index_page_url(url,
|
136
|
+
def next_index_page_url(url, pagination_index)
|
133
137
|
""
|
134
138
|
end
|
135
139
|
|
@@ -142,29 +146,64 @@ module Upton
|
|
142
146
|
self.url_array = self.get_index
|
143
147
|
end
|
144
148
|
CSV.open filename, 'wb' do |csv|
|
145
|
-
|
149
|
+
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
|
150
|
+
self.scrape_from_list(self.url_array, blk).compact.each do |document|
|
151
|
+
puts document.inspect
|
152
|
+
if document[0].respond_to? :map
|
153
|
+
document.each{|row| csv << row }
|
154
|
+
else
|
155
|
+
csv << document
|
156
|
+
end
|
157
|
+
end
|
158
|
+
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
def scrape_to_tsv filename, &blk
|
163
|
+
require 'csv'
|
164
|
+
unless self.url_array
|
165
|
+
self.url_array = self.get_index
|
166
|
+
end
|
167
|
+
CSV.open filename, 'wb', :col_sep => "\t" do |csv|
|
168
|
+
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
|
169
|
+
self.scrape_from_list(self.url_array, blk).compact.each do |document|
|
170
|
+
puts document.inspect
|
171
|
+
if document[0].respond_to? :map
|
172
|
+
document.each{|row| csv << row }
|
173
|
+
else
|
174
|
+
csv << document
|
175
|
+
end
|
176
|
+
end
|
177
|
+
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
|
146
178
|
end
|
147
179
|
end
|
148
180
|
|
149
181
|
protected
|
150
182
|
|
183
|
+
##
|
184
|
+
# Actually fetches the page
|
185
|
+
##
|
186
|
+
def fetch_page(url, options={})
|
187
|
+
RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
|
188
|
+
end
|
189
|
+
|
151
190
|
##
|
152
191
|
# Handles getting pages with RestClient or getting them from the local stash.
|
153
192
|
#
|
154
193
|
# Uses a kludge (because rest-client is outdated) to handle encoding.
|
155
194
|
##
|
156
|
-
def get_page(url, stash=false)
|
195
|
+
def get_page(url, stash=false, options={})
|
157
196
|
return "" if url.empty?
|
158
197
|
|
159
198
|
#the filename for each stashed version is a cleaned version of the URL.
|
160
|
-
if stash && File.exists?(
|
199
|
+
if stash && File.exists?( url_to_filename(url, options) )
|
161
200
|
puts "usin' a stashed copy of " + url if @verbose
|
162
|
-
resp = open(
|
201
|
+
resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
|
163
202
|
else
|
164
203
|
begin
|
165
204
|
puts "getting " + url if @verbose
|
166
205
|
sleep @sleep_time_between_requests
|
167
|
-
resp =
|
206
|
+
resp = fetch_page(url, options)
|
168
207
|
|
169
208
|
#this is silly, but rest-client needs to get on their game.
|
170
209
|
#cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
|
@@ -188,42 +227,95 @@ module Upton
|
|
188
227
|
rescue URI::InvalidURIError
|
189
228
|
puts "Invalid URI: #{url}" if @verbose
|
190
229
|
resp = ""
|
230
|
+
rescue RestClient::RequestTimeout
|
231
|
+
"Timeout: #{url}" if @verbose
|
232
|
+
retry
|
191
233
|
end
|
192
234
|
if stash
|
193
235
|
puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
|
194
|
-
open(
|
236
|
+
open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
|
195
237
|
end
|
196
238
|
end
|
197
239
|
resp
|
198
240
|
end
|
199
241
|
|
242
|
+
def url_to_filename(url, options={})
|
243
|
+
File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
|
244
|
+
end
|
245
|
+
|
246
|
+
|
247
|
+
##
|
248
|
+
# sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
|
249
|
+
# resolve_url resolves them to absolute urls.
|
250
|
+
# absolute_url_str must be a URL, as a string, that is absolute.
|
251
|
+
##
|
252
|
+
def resolve_url(href_str, absolute_url_str)
|
253
|
+
absolute_url = URI(absolute_url_str).dup
|
254
|
+
raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
|
255
|
+
href = URI(href_str).dup
|
256
|
+
|
257
|
+
# return :href if :href is already absolute
|
258
|
+
return href.to_s if href.absolute?
|
259
|
+
|
260
|
+
#TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
|
261
|
+
URI.join(absolute_url, href).to_s
|
262
|
+
end
|
263
|
+
|
200
264
|
##
|
201
265
|
# Return a list of URLs for the instances you want to scrape.
|
202
266
|
# This can optionally be overridden if, for example, the list of instances
|
203
267
|
# comes from an API.
|
204
268
|
##
|
205
269
|
def get_index
|
206
|
-
|
270
|
+
# TODO: Deprecate @index_selector_method in next minor release
|
271
|
+
parse_index(get_index_pages(@index_url, 1), @index_selector)
|
207
272
|
end
|
208
273
|
|
209
274
|
##
|
210
275
|
# Using the XPath expression or CSS selector and selector_method that
|
211
|
-
# uniquely identifies the links in the index, return those links as strings.
|
212
|
-
|
213
|
-
|
214
|
-
Nokogiri::HTML(text).
|
276
|
+
# uniquely identifies the links in the index, return those links as strings. ##
|
277
|
+
def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
|
278
|
+
# for now, override selector_method with :search, which will work with either CSS or XPath
|
279
|
+
Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
|
280
|
+
end
|
281
|
+
|
282
|
+
# TODO: Not sure the best way to handle this
|
283
|
+
# Currently, #parse_index is called upon #get_index_pages,
|
284
|
+
# which itself is dependent on @index_url
|
285
|
+
# Does @index_url stay unaltered for the lifetime of the Upton instance?
|
286
|
+
# It seems to at this point, but that may be something that gets
|
287
|
+
# deprecated later
|
288
|
+
#
|
289
|
+
# So for now, @index_url is used in conjunction with resolve_url
|
290
|
+
# to make sure that this method returns absolute urls
|
291
|
+
# i.e. this method expects @index_url to always have an absolute address
|
292
|
+
# for the lifetime of an Upton instance
|
293
|
+
def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
|
294
|
+
# for now, override selector_method with :search, which will work with either CSS or XPath
|
295
|
+
Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
|
296
|
+
href = a_element["href"]
|
297
|
+
u = resolve_url( href, @index_url) unless href.nil?
|
298
|
+
unless u == href
|
299
|
+
puts "resolved #{href} to #{u}"
|
300
|
+
end
|
301
|
+
u
|
302
|
+
end
|
215
303
|
end
|
216
304
|
|
305
|
+
|
217
306
|
##
|
218
307
|
# Returns the concatenated output of each member of a paginated index,
|
219
308
|
# e.g. a site listing links with 2+ pages.
|
220
309
|
##
|
221
|
-
def get_index_pages(url,
|
222
|
-
resp = self.get_page(url, @index_debug)
|
310
|
+
def get_index_pages(url, pagination_index, options={})
|
311
|
+
resp = self.get_page(url, @index_debug, options)
|
223
312
|
if !resp.empty?
|
224
|
-
next_url = self.next_index_page_url(url,
|
313
|
+
next_url = self.next_index_page_url(url, pagination_index + 1)
|
314
|
+
# resolve to absolute url
|
315
|
+
#
|
316
|
+
next_url = resolve_url(next_url, url)
|
225
317
|
unless next_url == url
|
226
|
-
next_resp = self.get_index_pages(next_url,
|
318
|
+
next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
|
227
319
|
resp += next_resp
|
228
320
|
end
|
229
321
|
end
|
@@ -231,19 +323,21 @@ module Upton
|
|
231
323
|
end
|
232
324
|
|
233
325
|
##
|
234
|
-
# Returns the
|
326
|
+
# Returns the instance at `url`.
|
235
327
|
#
|
236
328
|
# If the page is stashed, returns that, otherwise, fetches it from the web.
|
237
329
|
#
|
238
330
|
# If an instance is paginated, returns the concatenated output of each
|
239
331
|
# page, e.g. if a news article has two pages.
|
240
332
|
##
|
241
|
-
def get_instance(url,
|
242
|
-
resp = self.get_page(url, @debug)
|
333
|
+
def get_instance(url, pagination_index=0, options={})
|
334
|
+
resp = self.get_page(url, @debug, options)
|
243
335
|
if !resp.empty?
|
244
|
-
next_url = self.next_instance_page_url(url,
|
336
|
+
next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
|
337
|
+
|
338
|
+
# next_url = resolve_url(next_url, url)
|
245
339
|
unless next_url == url
|
246
|
-
next_resp = self.get_instance(next_url,
|
340
|
+
next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
|
247
341
|
resp += next_resp
|
248
342
|
end
|
249
343
|
end
|
@@ -253,8 +347,9 @@ module Upton
|
|
253
347
|
# Just a helper for +scrape+.
|
254
348
|
def scrape_from_list(list, blk)
|
255
349
|
puts "Scraping #{list.size} instances" if @verbose
|
256
|
-
list.each_with_index.map do |instance_url,
|
257
|
-
|
350
|
+
list.each_with_index.map do |instance_url, instance_index|
|
351
|
+
instance_resp = get_instance instance_url, nil, :instance_index => instance_index
|
352
|
+
blk.call(instance_resp, instance_url, instance_index)
|
258
353
|
end
|
259
354
|
end
|
260
355
|
|
data/lib/utils.rb
CHANGED
@@ -18,7 +18,6 @@ module Upton
|
|
18
18
|
# present, is returned as the first row.
|
19
19
|
##
|
20
20
|
def self.table(table_selector, selector_method=:xpath)
|
21
|
-
require 'csv'
|
22
21
|
return Proc.new do |instance_html|
|
23
22
|
html = ::Nokogiri::HTML(instance_html)
|
24
23
|
output = []
|
@@ -34,11 +33,42 @@ module Upton
|
|
34
33
|
# Scrapes any set of HTML elements into an Array.
|
35
34
|
##
|
36
35
|
def self.list(list_selector, selector_method=:xpath)
|
37
|
-
require 'csv'
|
38
36
|
return Proc.new do |instance_html|
|
39
37
|
html = ::Nokogiri::HTML(instance_html)
|
40
38
|
html.send(selector_method, list_selector).map{|list_element| list_element.text }
|
41
39
|
end
|
42
40
|
end
|
41
|
+
|
42
|
+
##
|
43
|
+
# Takes :_href and resolves it to an absolute URL according to
|
44
|
+
# the supplied :_page_url. They can be either Strings or URI
|
45
|
+
# instances.
|
46
|
+
#
|
47
|
+
# raises ArgumentError if either href or page_url is nil
|
48
|
+
# raises ArgumentError if page_url is not absolute
|
49
|
+
#
|
50
|
+
# returns: a String with absolute URL
|
51
|
+
def self.resolve_url(_href, _page_url)
|
52
|
+
|
53
|
+
page_url = URI(_page_url).dup
|
54
|
+
raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
|
55
|
+
|
56
|
+
href = URI(_href).dup
|
57
|
+
|
58
|
+
# return :href if :href is already absolute
|
59
|
+
return href.to_s if href.absolute?
|
60
|
+
|
61
|
+
|
62
|
+
# TODO: There may be edge cases worth considering
|
63
|
+
# but this should handle the following non-absolute href possibilities:
|
64
|
+
# //anothersite.com (keeps scheme, too!)
|
65
|
+
# /root/dir
|
66
|
+
# relative/dir
|
67
|
+
# ?query=2
|
68
|
+
# #bang
|
69
|
+
|
70
|
+
URI.join(page_url, href).to_s
|
71
|
+
end
|
72
|
+
|
43
73
|
end
|
44
74
|
end
|
File without changes
|
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
<title>Document</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<!-- refactored fixture for relative URL testing -->
|
9
|
+
|
10
|
+
<h2><a href="iamnottobeselected.html" class="title-link">An unnecessary proof of concept but just for kicks</a></h2>
|
11
|
+
|
12
|
+
<section id="river">
|
13
|
+
<h1><a href="prosecutor.html" class="title-link">A Prosecutor, a Wrongful Conviction and a Question of Justice</a></h1>
|
14
|
+
</section>
|
15
|
+
|
16
|
+
</body>
|
17
|
+
</html>
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/spec/upton_spec.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'rack'
|
4
|
+
require 'thin'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'restclient'
|
7
|
+
require 'fileutils'
|
8
|
+
require './lib/upton'
|
9
|
+
|
10
|
+
describe Upton do
|
11
|
+
before :all do
|
12
|
+
#start the server
|
13
|
+
class Server
|
14
|
+
def call(env)
|
15
|
+
@root = File.expand_path(File.dirname(__FILE__))
|
16
|
+
path = Rack::Utils.unescape(env['PATH_INFO'])
|
17
|
+
path += 'index.html' if path == '/'
|
18
|
+
file = File.join(@root, "data", path)
|
19
|
+
|
20
|
+
params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
|
21
|
+
|
22
|
+
if File.exists?(file)
|
23
|
+
[ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
|
24
|
+
else
|
25
|
+
[ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def start_test_server
|
31
|
+
@server_thread = Thread.new do
|
32
|
+
Rack::Handler::Thin.run ::Server.new, :Port => 9876
|
33
|
+
end
|
34
|
+
sleep(1) # wait a sec for the server to be booted
|
35
|
+
end
|
36
|
+
|
37
|
+
start_test_server()
|
38
|
+
|
39
|
+
@headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
|
40
|
+
"",
|
41
|
+
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
42
|
+
"Six Facts Lost in the IRS Scandal"]
|
43
|
+
@most_commented_heds = [["Six Facts Lost in the IRS Scandal",
|
44
|
+
"How the IRS’s Nonprofit Division Got So Dysfunctional",
|
45
|
+
"Sound, Fury and the IRS Mess",
|
46
|
+
"The Most Important #Muckreads on Rape in the Military",
|
47
|
+
"Congressmen to Hagel: Where Are the Missing War Records?",
|
48
|
+
"As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
|
49
|
+
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
50
|
+
"A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
|
51
|
+
"The Story Behind Our Hospital Interactive",
|
52
|
+
"irs-test-charts-for-embedding"]]
|
53
|
+
@east_timor_prime_ministers = [[
|
54
|
+
["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
|
55
|
+
"1", "2", "3", "4",],
|
56
|
+
[],
|
57
|
+
["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
|
58
|
+
["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
|
59
|
+
["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
|
60
|
+
["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
|
61
|
+
]]
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should scrape in the basic case" do
|
65
|
+
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
|
66
|
+
propubscraper.debug = true
|
67
|
+
propubscraper.verbose = true
|
68
|
+
|
69
|
+
heds = propubscraper.scrape do |article_str|
|
70
|
+
doc = Nokogiri::HTML(article_str)
|
71
|
+
hed = doc.css('h1.article-title').text
|
72
|
+
end
|
73
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
74
|
+
heds.should eql @headlines
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'should properly handle relative urls' do
|
78
|
+
# uses a modified page from the previous test in which the target
|
79
|
+
# href, http://127.0.0.1:9876/prosecutor.html, has been changed
|
80
|
+
# to a relative url
|
81
|
+
#
|
82
|
+
# Note: this test is a bit quirky, because it passes on the fact that
|
83
|
+
# the resolve_url creates a url identical to one that is already stashed ("prosecutor.html").
|
84
|
+
# So it works, but because of a coupling to how Upton handles caching in the file system
|
85
|
+
|
86
|
+
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
|
87
|
+
propubscraper.debug = true
|
88
|
+
propubscraper.verbose = true
|
89
|
+
|
90
|
+
heds = propubscraper.scrape do |article_str|
|
91
|
+
doc = Nokogiri::HTML(article_str)
|
92
|
+
hed = doc.css('h1.article-title').text
|
93
|
+
end
|
94
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
95
|
+
heds.should eql ["A Prosecutor, a Wrongful Conviction and a Question of Justice"]
|
96
|
+
end
|
97
|
+
|
98
|
+
it "should scrape a list properly with the list helper" do
|
99
|
+
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
|
100
|
+
propubscraper.debug = true
|
101
|
+
propubscraper.verbose = true
|
102
|
+
list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
|
103
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
104
|
+
list.should eql @most_commented_heds
|
105
|
+
end
|
106
|
+
|
107
|
+
it "should scrape a table properly with the table helper" do
|
108
|
+
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
|
109
|
+
propubscraper.debug = true
|
110
|
+
propubscraper.verbose = true
|
111
|
+
table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
|
112
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
113
|
+
table.should eql @east_timor_prime_ministers
|
114
|
+
end
|
115
|
+
|
116
|
+
it "should test saving files with the right encoding"
|
117
|
+
it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
|
118
|
+
end
|
metadata
CHANGED
@@ -1,69 +1,83 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rack
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '>='
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
25
39
|
- !ruby/object:Gem::Version
|
26
40
|
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: thin
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
|
-
- -
|
45
|
+
- - '>='
|
32
46
|
- !ruby/object:Gem::Version
|
33
47
|
version: '0'
|
34
48
|
type: :development
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
|
-
- -
|
52
|
+
- - '>='
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: nokogiri
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
|
-
- -
|
59
|
+
- - '>='
|
46
60
|
- !ruby/object:Gem::Version
|
47
61
|
version: '0'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
|
-
- -
|
66
|
+
- - '>='
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: yard
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
|
-
- -
|
73
|
+
- - '>='
|
60
74
|
- !ruby/object:Gem::Version
|
61
75
|
version: '0'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - '>='
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
@@ -84,14 +98,28 @@ dependencies:
|
|
84
98
|
name: nokogiri
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- -
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: mechanize
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
88
116
|
- !ruby/object:Gem::Version
|
89
117
|
version: '0'
|
90
118
|
type: :runtime
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
93
121
|
requirements:
|
94
|
-
- -
|
122
|
+
- - '>='
|
95
123
|
- !ruby/object:Gem::Version
|
96
124
|
version: '0'
|
97
125
|
description: Don't re-write web scrapers every time. Upton gives you a scraper template
|
@@ -103,13 +131,14 @@ extra_rdoc_files: []
|
|
103
131
|
files:
|
104
132
|
- lib/upton.rb
|
105
133
|
- lib/utils.rb
|
106
|
-
-
|
107
|
-
-
|
108
|
-
-
|
109
|
-
-
|
110
|
-
-
|
111
|
-
-
|
112
|
-
-
|
134
|
+
- spec/data/webinar.html
|
135
|
+
- spec/data/propublica-relative.html
|
136
|
+
- spec/data/propublica.html
|
137
|
+
- spec/data/prosecutor.html
|
138
|
+
- spec/data/sixfacts.html
|
139
|
+
- spec/data/discussion.html
|
140
|
+
- spec/data/easttimor.html
|
141
|
+
- spec/upton_spec.rb
|
113
142
|
homepage: http://github.org/propublica/upton
|
114
143
|
licenses:
|
115
144
|
- MIT
|
@@ -120,26 +149,27 @@ require_paths:
|
|
120
149
|
- lib
|
121
150
|
required_ruby_version: !ruby/object:Gem::Requirement
|
122
151
|
requirements:
|
123
|
-
- -
|
152
|
+
- - '>='
|
124
153
|
- !ruby/object:Gem::Version
|
125
154
|
version: 1.8.7
|
126
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
127
156
|
requirements:
|
128
|
-
- -
|
157
|
+
- - '>='
|
129
158
|
- !ruby/object:Gem::Version
|
130
159
|
version: '0'
|
131
160
|
requirements: []
|
132
161
|
rubyforge_project:
|
133
|
-
rubygems_version: 2.0.
|
162
|
+
rubygems_version: 2.0.2
|
134
163
|
signing_key:
|
135
164
|
specification_version: 4
|
136
165
|
summary: A simple web-scraping framework
|
137
166
|
test_files:
|
138
|
-
-
|
139
|
-
-
|
140
|
-
-
|
141
|
-
-
|
142
|
-
-
|
143
|
-
-
|
144
|
-
-
|
167
|
+
- spec/data/webinar.html
|
168
|
+
- spec/data/propublica-relative.html
|
169
|
+
- spec/data/propublica.html
|
170
|
+
- spec/data/prosecutor.html
|
171
|
+
- spec/data/sixfacts.html
|
172
|
+
- spec/data/discussion.html
|
173
|
+
- spec/data/easttimor.html
|
174
|
+
- spec/upton_spec.rb
|
145
175
|
has_rdoc: true
|
data/test/test_upton.rb
DELETED
@@ -1,141 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
require 'test/unit'
|
4
|
-
require 'rack'
|
5
|
-
require 'thin'
|
6
|
-
require 'nokogiri'
|
7
|
-
require 'restclient'
|
8
|
-
require './lib/upton'
|
9
|
-
require 'fileutils'
|
10
|
-
|
11
|
-
module Upton
|
12
|
-
module Test
|
13
|
-
|
14
|
-
# class ProPublicaScraper < Upton::Scraper
|
15
|
-
# def initialize(a, b, c)
|
16
|
-
# super
|
17
|
-
# @verbose = false
|
18
|
-
# @debug = false
|
19
|
-
# @stash_folder = "test_stashes"
|
20
|
-
# end
|
21
|
-
# end
|
22
|
-
|
23
|
-
|
24
|
-
class UptonTest < ::Test::Unit::TestCase
|
25
|
-
|
26
|
-
# def test_get_page
|
27
|
-
#TODO
|
28
|
-
# end
|
29
|
-
|
30
|
-
# def test_stash
|
31
|
-
#TODO
|
32
|
-
# end
|
33
|
-
|
34
|
-
def test_scrape
|
35
|
-
#this doesn't test stashing.
|
36
|
-
start_test_server()
|
37
|
-
|
38
|
-
headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
|
39
|
-
"",
|
40
|
-
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
41
|
-
"Six Facts Lost in the IRS Scandal"]
|
42
|
-
|
43
|
-
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
|
44
|
-
propubscraper.debug = true
|
45
|
-
propubscraper.verbose = true
|
46
|
-
|
47
|
-
heds = propubscraper.scrape do |article_str|
|
48
|
-
doc = Nokogiri::HTML(article_str)
|
49
|
-
hed = doc.css('h1.article-title').text
|
50
|
-
end
|
51
|
-
assert_equal(heds, headlines)
|
52
|
-
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
53
|
-
end
|
54
|
-
|
55
|
-
def test_encodings
|
56
|
-
skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
|
57
|
-
end
|
58
|
-
|
59
|
-
def test_stashing
|
60
|
-
skip "should test stashing, make sure we never send too many requests"
|
61
|
-
end
|
62
|
-
|
63
|
-
def test_scrape_list
|
64
|
-
#this doesn't test stashing.
|
65
|
-
#TODO: needs a website that has links to a multi-page list (or table)
|
66
|
-
start_test_server()
|
67
|
-
|
68
|
-
most_commented_heds = [["Six Facts Lost in the IRS Scandal",
|
69
|
-
"How the IRS’s Nonprofit Division Got So Dysfunctional",
|
70
|
-
"Sound, Fury and the IRS Mess",
|
71
|
-
"The Most Important #Muckreads on Rape in the Military",
|
72
|
-
"Congressmen to Hagel: Where Are the Missing War Records?",
|
73
|
-
"As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
|
74
|
-
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
75
|
-
"A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
|
76
|
-
"The Story Behind Our Hospital Interactive",
|
77
|
-
"irs-test-charts-for-embedding"]]
|
78
|
-
|
79
|
-
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
|
80
|
-
propubscraper.debug = true
|
81
|
-
propubscraper.verbose = true
|
82
|
-
list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
|
83
|
-
|
84
|
-
assert_equal(list, most_commented_heds)
|
85
|
-
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
86
|
-
end
|
87
|
-
|
88
|
-
def test_scrape_table
|
89
|
-
#this doesn't test stashing.
|
90
|
-
start_test_server()
|
91
|
-
|
92
|
-
east_timor_prime_ministers = [[
|
93
|
-
["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
|
94
|
-
"1", "2", "3", "4",],
|
95
|
-
[],
|
96
|
-
["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
|
97
|
-
["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
|
98
|
-
["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
|
99
|
-
["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
|
100
|
-
]]
|
101
|
-
|
102
|
-
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
|
103
|
-
propubscraper.debug = true
|
104
|
-
propubscraper.verbose = true
|
105
|
-
table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
|
106
|
-
assert_equal(table, east_timor_prime_ministers)
|
107
|
-
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
108
|
-
end
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
private
|
113
|
-
def start_test_server
|
114
|
-
@server_thread = Thread.new do
|
115
|
-
Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
|
116
|
-
end
|
117
|
-
sleep(1) # wait a sec for the server to be booted
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
# via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
|
124
|
-
class Server
|
125
|
-
def call(env)
|
126
|
-
@root = File.expand_path(File.dirname(__FILE__))
|
127
|
-
path = Rack::Utils.unescape(env['PATH_INFO'])
|
128
|
-
path += 'index.html' if path == '/'
|
129
|
-
file = File.join(@root, "data", path)
|
130
|
-
|
131
|
-
params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
|
132
|
-
|
133
|
-
if File.exists?(file)
|
134
|
-
[ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
|
135
|
-
else
|
136
|
-
[ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
end
|
141
|
-
end
|