upton 0.2.6 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +6 -14
- data/lib/upton.rb +130 -35
- data/lib/utils.rb +32 -2
- data/{test → spec}/data/discussion.html +0 -0
- data/{test → spec}/data/easttimor.html +0 -0
- data/spec/data/propublica-relative.html +17 -0
- data/{test → spec}/data/propublica.html +0 -0
- data/{test → spec}/data/prosecutor.html +0 -0
- data/{test → spec}/data/sixfacts.html +0 -0
- data/{test → spec}/data/webinar.html +0 -0
- data/spec/upton_spec.rb +118 -0
- metadata +59 -29
- data/test/test_upton.rb +0 -141
checksums.yaml
CHANGED
@@ -1,15 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
metadata.gz: !binary |-
|
9
|
-
MjFhYjI5OTYwZGVlYTNlMmNhYTc1OWQ5ZGJmMzBlN2FiM2U4MzllMDM4Nzhk
|
10
|
-
MjJkMTczOGZjNWUwNDMyYmFlOGRkZDlhNjFkM2RlMzM1YjFmZTgyZWQ4MTBj
|
11
|
-
MjY2ZmFiYmZlOTc5YmE2YzFjMWE1YjVjZWY2MWMyYTczZmEwNGU=
|
12
|
-
data.tar.gz: !binary |-
|
13
|
-
YmFjOTllZjdlNWIwNzhhMGIxODQwOTI1Y2EwY2YzMTE1YWEzOTdkMWI3NDEy
|
14
|
-
ZjA1OTE1N2Q0OGYwOWEyYjVjMDM3ZWQ1NzlhZmU3NDZlNTAxNDJmZWZjZGFm
|
15
|
-
YjUxMzc3ZThkZDg1ZDdkMjgwM2UyODMwZTZiMjdjZDAyNjAxNTQ=
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
|
4
|
+
data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
|
7
|
+
data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
|
data/lib/upton.rb
CHANGED
@@ -28,7 +28,7 @@ module Upton
|
|
28
28
|
# 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
|
29
29
|
# block from Upton::Utils.
|
30
30
|
# For more complicated cases; subclass Upton::Scraper
|
31
|
-
# e.g. +MyScraper < Upton::Scraper+ and
|
31
|
+
# e.g. +MyScraper < Upton::Scraper+ and override various methods.
|
32
32
|
##
|
33
33
|
class Scraper
|
34
34
|
|
@@ -53,28 +53,32 @@ module Upton
|
|
53
53
|
# +selector+: The XPath expression or CSS selector that specifies the
|
54
54
|
# anchor elements within the page, if a url is specified for
|
55
55
|
# the previous argument.
|
56
|
-
# +selector_method+:
|
56
|
+
# +selector_method+: Deprecated and ignored. Next breaking release will
|
57
|
+
# remove this option.
|
57
58
|
#
|
58
|
-
# These options are a shortcut. If you
|
59
|
+
# These options are a shortcut. If you plan to override +get_index+, you
|
59
60
|
# do not need to set them.
|
60
61
|
# If you don't specify a selector, the first argument will be treated as a
|
61
62
|
# list of URLs.
|
62
63
|
##
|
63
|
-
def initialize(index_url_or_array, selector="", selector_method=:xpath)
|
64
64
|
|
65
|
+
# DEPRECATION NOTE, re: selector_method
|
66
|
+
# the selector_method parameter is unneeded, as Nokogiri provides the
|
67
|
+
# #search method, which picks a selector depending on whether
|
68
|
+
# the String passed is of CSS/XPath notation
|
69
|
+
|
70
|
+
def initialize(index_url_or_array, selector="", selector_method=:deprecated)
|
71
|
+
|
65
72
|
#if first arg is a valid URL, do already-written stuff;
|
66
73
|
#if it's not (or if it's a list?) don't bother with get_index, etc.
|
67
74
|
#e.g. Scraper.new(["http://jeremybmerrill.com"])
|
68
75
|
|
69
76
|
#TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
|
70
|
-
if
|
77
|
+
if index_url_or_array.respond_to? :each_with_index
|
71
78
|
@url_array = index_url_or_array
|
72
|
-
|
79
|
+
else
|
73
80
|
@index_url = index_url_or_array
|
74
81
|
@index_selector = selector
|
75
|
-
@index_selector_method = selector_method
|
76
|
-
else
|
77
|
-
raise ArgumentError
|
78
82
|
end
|
79
83
|
# If true, then Upton prints information about when it gets
|
80
84
|
# files from the internet and when it gets them from its stash.
|
@@ -97,9 +101,9 @@ module Upton
|
|
97
101
|
|
98
102
|
# Folder name for stashes, if you want them to be stored somewhere else,
|
99
103
|
# e.g. under /tmp.
|
100
|
-
@stash_folder
|
104
|
+
@stash_folder ||= "stashes"
|
101
105
|
unless Dir.exists?(@stash_folder)
|
102
|
-
|
106
|
+
FileUtils.mkdir_p(@stash_folder)
|
103
107
|
end
|
104
108
|
end
|
105
109
|
|
@@ -114,7 +118,7 @@ module Upton
|
|
114
118
|
# e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
|
115
119
|
# ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
|
116
120
|
##
|
117
|
-
def next_instance_page_url(url,
|
121
|
+
def next_instance_page_url(url, pagination_index)
|
118
122
|
""
|
119
123
|
end
|
120
124
|
|
@@ -129,7 +133,7 @@ module Upton
|
|
129
133
|
# e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
|
130
134
|
# ought to return "http://whatever.com/articles?page=2"
|
131
135
|
##
|
132
|
-
def next_index_page_url(url,
|
136
|
+
def next_index_page_url(url, pagination_index)
|
133
137
|
""
|
134
138
|
end
|
135
139
|
|
@@ -142,29 +146,64 @@ module Upton
|
|
142
146
|
self.url_array = self.get_index
|
143
147
|
end
|
144
148
|
CSV.open filename, 'wb' do |csv|
|
145
|
-
|
149
|
+
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
|
150
|
+
self.scrape_from_list(self.url_array, blk).compact.each do |document|
|
151
|
+
puts document.inspect
|
152
|
+
if document[0].respond_to? :map
|
153
|
+
document.each{|row| csv << row }
|
154
|
+
else
|
155
|
+
csv << document
|
156
|
+
end
|
157
|
+
end
|
158
|
+
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
|
159
|
+
end
|
160
|
+
end
|
161
|
+
|
162
|
+
def scrape_to_tsv filename, &blk
|
163
|
+
require 'csv'
|
164
|
+
unless self.url_array
|
165
|
+
self.url_array = self.get_index
|
166
|
+
end
|
167
|
+
CSV.open filename, 'wb', :col_sep => "\t" do |csv|
|
168
|
+
#this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
|
169
|
+
self.scrape_from_list(self.url_array, blk).compact.each do |document|
|
170
|
+
puts document.inspect
|
171
|
+
if document[0].respond_to? :map
|
172
|
+
document.each{|row| csv << row }
|
173
|
+
else
|
174
|
+
csv << document
|
175
|
+
end
|
176
|
+
end
|
177
|
+
#self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
|
146
178
|
end
|
147
179
|
end
|
148
180
|
|
149
181
|
protected
|
150
182
|
|
183
|
+
##
|
184
|
+
# Actually fetches the page
|
185
|
+
##
|
186
|
+
def fetch_page(url, options={})
|
187
|
+
RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
|
188
|
+
end
|
189
|
+
|
151
190
|
##
|
152
191
|
# Handles getting pages with RestClient or getting them from the local stash.
|
153
192
|
#
|
154
193
|
# Uses a kludge (because rest-client is outdated) to handle encoding.
|
155
194
|
##
|
156
|
-
def get_page(url, stash=false)
|
195
|
+
def get_page(url, stash=false, options={})
|
157
196
|
return "" if url.empty?
|
158
197
|
|
159
198
|
#the filename for each stashed version is a cleaned version of the URL.
|
160
|
-
if stash && File.exists?(
|
199
|
+
if stash && File.exists?( url_to_filename(url, options) )
|
161
200
|
puts "usin' a stashed copy of " + url if @verbose
|
162
|
-
resp = open(
|
201
|
+
resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
|
163
202
|
else
|
164
203
|
begin
|
165
204
|
puts "getting " + url if @verbose
|
166
205
|
sleep @sleep_time_between_requests
|
167
|
-
resp =
|
206
|
+
resp = fetch_page(url, options)
|
168
207
|
|
169
208
|
#this is silly, but rest-client needs to get on their game.
|
170
209
|
#cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
|
@@ -188,42 +227,95 @@ module Upton
|
|
188
227
|
rescue URI::InvalidURIError
|
189
228
|
puts "Invalid URI: #{url}" if @verbose
|
190
229
|
resp = ""
|
230
|
+
rescue RestClient::RequestTimeout
|
231
|
+
"Timeout: #{url}" if @verbose
|
232
|
+
retry
|
191
233
|
end
|
192
234
|
if stash
|
193
235
|
puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
|
194
|
-
open(
|
236
|
+
open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
|
195
237
|
end
|
196
238
|
end
|
197
239
|
resp
|
198
240
|
end
|
199
241
|
|
242
|
+
def url_to_filename(url, options={})
|
243
|
+
File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
|
244
|
+
end
|
245
|
+
|
246
|
+
|
247
|
+
##
|
248
|
+
# sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
|
249
|
+
# resolve_url resolves them to absolute urls.
|
250
|
+
# absolute_url_str must be a URL, as a string, that is absolute.
|
251
|
+
##
|
252
|
+
def resolve_url(href_str, absolute_url_str)
|
253
|
+
absolute_url = URI(absolute_url_str).dup
|
254
|
+
raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
|
255
|
+
href = URI(href_str).dup
|
256
|
+
|
257
|
+
# return :href if :href is already absolute
|
258
|
+
return href.to_s if href.absolute?
|
259
|
+
|
260
|
+
#TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
|
261
|
+
URI.join(absolute_url, href).to_s
|
262
|
+
end
|
263
|
+
|
200
264
|
##
|
201
265
|
# Return a list of URLs for the instances you want to scrape.
|
202
266
|
# This can optionally be overridden if, for example, the list of instances
|
203
267
|
# comes from an API.
|
204
268
|
##
|
205
269
|
def get_index
|
206
|
-
|
270
|
+
# TODO: Deprecate @index_selector_method in next minor release
|
271
|
+
parse_index(get_index_pages(@index_url, 1), @index_selector)
|
207
272
|
end
|
208
273
|
|
209
274
|
##
|
210
275
|
# Using the XPath expression or CSS selector and selector_method that
|
211
|
-
# uniquely identifies the links in the index, return those links as strings.
|
212
|
-
|
213
|
-
|
214
|
-
Nokogiri::HTML(text).
|
276
|
+
# uniquely identifies the links in the index, return those links as strings. ##
|
277
|
+
def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
|
278
|
+
# for now, override selector_method with :search, which will work with either CSS or XPath
|
279
|
+
Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
|
280
|
+
end
|
281
|
+
|
282
|
+
# TODO: Not sure the best way to handle this
|
283
|
+
# Currently, #parse_index is called upon #get_index_pages,
|
284
|
+
# which itself is dependent on @index_url
|
285
|
+
# Does @index_url stay unaltered for the lifetime of the Upton instance?
|
286
|
+
# It seems to at this point, but that may be something that gets
|
287
|
+
# deprecated later
|
288
|
+
#
|
289
|
+
# So for now, @index_url is used in conjunction with resolve_url
|
290
|
+
# to make sure that this method returns absolute urls
|
291
|
+
# i.e. this method expects @index_url to always have an absolute address
|
292
|
+
# for the lifetime of an Upton instance
|
293
|
+
def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
|
294
|
+
# for now, override selector_method with :search, which will work with either CSS or XPath
|
295
|
+
Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
|
296
|
+
href = a_element["href"]
|
297
|
+
u = resolve_url( href, @index_url) unless href.nil?
|
298
|
+
unless u == href
|
299
|
+
puts "resolved #{href} to #{u}"
|
300
|
+
end
|
301
|
+
u
|
302
|
+
end
|
215
303
|
end
|
216
304
|
|
305
|
+
|
217
306
|
##
|
218
307
|
# Returns the concatenated output of each member of a paginated index,
|
219
308
|
# e.g. a site listing links with 2+ pages.
|
220
309
|
##
|
221
|
-
def get_index_pages(url,
|
222
|
-
resp = self.get_page(url, @index_debug)
|
310
|
+
def get_index_pages(url, pagination_index, options={})
|
311
|
+
resp = self.get_page(url, @index_debug, options)
|
223
312
|
if !resp.empty?
|
224
|
-
next_url = self.next_index_page_url(url,
|
313
|
+
next_url = self.next_index_page_url(url, pagination_index + 1)
|
314
|
+
# resolve to absolute url
|
315
|
+
#
|
316
|
+
next_url = resolve_url(next_url, url)
|
225
317
|
unless next_url == url
|
226
|
-
next_resp = self.get_index_pages(next_url,
|
318
|
+
next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
|
227
319
|
resp += next_resp
|
228
320
|
end
|
229
321
|
end
|
@@ -231,19 +323,21 @@ module Upton
|
|
231
323
|
end
|
232
324
|
|
233
325
|
##
|
234
|
-
# Returns the
|
326
|
+
# Returns the instance at `url`.
|
235
327
|
#
|
236
328
|
# If the page is stashed, returns that, otherwise, fetches it from the web.
|
237
329
|
#
|
238
330
|
# If an instance is paginated, returns the concatenated output of each
|
239
331
|
# page, e.g. if a news article has two pages.
|
240
332
|
##
|
241
|
-
def get_instance(url,
|
242
|
-
resp = self.get_page(url, @debug)
|
333
|
+
def get_instance(url, pagination_index=0, options={})
|
334
|
+
resp = self.get_page(url, @debug, options)
|
243
335
|
if !resp.empty?
|
244
|
-
next_url = self.next_instance_page_url(url,
|
336
|
+
next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
|
337
|
+
|
338
|
+
# next_url = resolve_url(next_url, url)
|
245
339
|
unless next_url == url
|
246
|
-
next_resp = self.get_instance(next_url,
|
340
|
+
next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
|
247
341
|
resp += next_resp
|
248
342
|
end
|
249
343
|
end
|
@@ -253,8 +347,9 @@ module Upton
|
|
253
347
|
# Just a helper for +scrape+.
|
254
348
|
def scrape_from_list(list, blk)
|
255
349
|
puts "Scraping #{list.size} instances" if @verbose
|
256
|
-
list.each_with_index.map do |instance_url,
|
257
|
-
|
350
|
+
list.each_with_index.map do |instance_url, instance_index|
|
351
|
+
instance_resp = get_instance instance_url, nil, :instance_index => instance_index
|
352
|
+
blk.call(instance_resp, instance_url, instance_index)
|
258
353
|
end
|
259
354
|
end
|
260
355
|
|
data/lib/utils.rb
CHANGED
@@ -18,7 +18,6 @@ module Upton
|
|
18
18
|
# present, is returned as the first row.
|
19
19
|
##
|
20
20
|
def self.table(table_selector, selector_method=:xpath)
|
21
|
-
require 'csv'
|
22
21
|
return Proc.new do |instance_html|
|
23
22
|
html = ::Nokogiri::HTML(instance_html)
|
24
23
|
output = []
|
@@ -34,11 +33,42 @@ module Upton
|
|
34
33
|
# Scrapes any set of HTML elements into an Array.
|
35
34
|
##
|
36
35
|
def self.list(list_selector, selector_method=:xpath)
|
37
|
-
require 'csv'
|
38
36
|
return Proc.new do |instance_html|
|
39
37
|
html = ::Nokogiri::HTML(instance_html)
|
40
38
|
html.send(selector_method, list_selector).map{|list_element| list_element.text }
|
41
39
|
end
|
42
40
|
end
|
41
|
+
|
42
|
+
##
|
43
|
+
# Takes :_href and resolves it to an absolute URL according to
|
44
|
+
# the supplied :_page_url. They can be either Strings or URI
|
45
|
+
# instances.
|
46
|
+
#
|
47
|
+
# raises ArgumentError if either href or page_url is nil
|
48
|
+
# raises ArgumentError if page_url is not absolute
|
49
|
+
#
|
50
|
+
# returns: a String with absolute URL
|
51
|
+
def self.resolve_url(_href, _page_url)
|
52
|
+
|
53
|
+
page_url = URI(_page_url).dup
|
54
|
+
raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
|
55
|
+
|
56
|
+
href = URI(_href).dup
|
57
|
+
|
58
|
+
# return :href if :href is already absolute
|
59
|
+
return href.to_s if href.absolute?
|
60
|
+
|
61
|
+
|
62
|
+
# TODO: There may be edge cases worth considering
|
63
|
+
# but this should handle the following non-absolute href possibilities:
|
64
|
+
# //anothersite.com (keeps scheme, too!)
|
65
|
+
# /root/dir
|
66
|
+
# relative/dir
|
67
|
+
# ?query=2
|
68
|
+
# #bang
|
69
|
+
|
70
|
+
URI.join(page_url, href).to_s
|
71
|
+
end
|
72
|
+
|
43
73
|
end
|
44
74
|
end
|
File without changes
|
File without changes
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<!doctype html>
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<meta charset="UTF-8">
|
5
|
+
<title>Document</title>
|
6
|
+
</head>
|
7
|
+
<body>
|
8
|
+
<!-- refactored fixture for relative URL testing -->
|
9
|
+
|
10
|
+
<h2><a href="iamnottobeselected.html" class="title-link">An unnecessary proof of concept but just for kicks</a></h2>
|
11
|
+
|
12
|
+
<section id="river">
|
13
|
+
<h1><a href="prosecutor.html" class="title-link">A Prosecutor, a Wrongful Conviction and a Question of Justice</a></h1>
|
14
|
+
</section>
|
15
|
+
|
16
|
+
</body>
|
17
|
+
</html>
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
data/spec/upton_spec.rb
ADDED
@@ -0,0 +1,118 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'rack'
|
4
|
+
require 'thin'
|
5
|
+
require 'nokogiri'
|
6
|
+
require 'restclient'
|
7
|
+
require 'fileutils'
|
8
|
+
require './lib/upton'
|
9
|
+
|
10
|
+
describe Upton do
|
11
|
+
before :all do
|
12
|
+
#start the server
|
13
|
+
class Server
|
14
|
+
def call(env)
|
15
|
+
@root = File.expand_path(File.dirname(__FILE__))
|
16
|
+
path = Rack::Utils.unescape(env['PATH_INFO'])
|
17
|
+
path += 'index.html' if path == '/'
|
18
|
+
file = File.join(@root, "data", path)
|
19
|
+
|
20
|
+
params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
|
21
|
+
|
22
|
+
if File.exists?(file)
|
23
|
+
[ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
|
24
|
+
else
|
25
|
+
[ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def start_test_server
|
31
|
+
@server_thread = Thread.new do
|
32
|
+
Rack::Handler::Thin.run ::Server.new, :Port => 9876
|
33
|
+
end
|
34
|
+
sleep(1) # wait a sec for the server to be booted
|
35
|
+
end
|
36
|
+
|
37
|
+
start_test_server()
|
38
|
+
|
39
|
+
@headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
|
40
|
+
"",
|
41
|
+
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
42
|
+
"Six Facts Lost in the IRS Scandal"]
|
43
|
+
@most_commented_heds = [["Six Facts Lost in the IRS Scandal",
|
44
|
+
"How the IRS’s Nonprofit Division Got So Dysfunctional",
|
45
|
+
"Sound, Fury and the IRS Mess",
|
46
|
+
"The Most Important #Muckreads on Rape in the Military",
|
47
|
+
"Congressmen to Hagel: Where Are the Missing War Records?",
|
48
|
+
"As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
|
49
|
+
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
50
|
+
"A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
|
51
|
+
"The Story Behind Our Hospital Interactive",
|
52
|
+
"irs-test-charts-for-embedding"]]
|
53
|
+
@east_timor_prime_ministers = [[
|
54
|
+
["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
|
55
|
+
"1", "2", "3", "4",],
|
56
|
+
[],
|
57
|
+
["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
|
58
|
+
["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
|
59
|
+
["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
|
60
|
+
["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
|
61
|
+
]]
|
62
|
+
end
|
63
|
+
|
64
|
+
it "should scrape in the basic case" do
|
65
|
+
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
|
66
|
+
propubscraper.debug = true
|
67
|
+
propubscraper.verbose = true
|
68
|
+
|
69
|
+
heds = propubscraper.scrape do |article_str|
|
70
|
+
doc = Nokogiri::HTML(article_str)
|
71
|
+
hed = doc.css('h1.article-title').text
|
72
|
+
end
|
73
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
74
|
+
heds.should eql @headlines
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'should properly handle relative urls' do
|
78
|
+
# uses a modified page from the previous test in which the target
|
79
|
+
# href, http://127.0.0.1:9876/prosecutor.html, has been changed
|
80
|
+
# to a relative url
|
81
|
+
#
|
82
|
+
# Note: this test is a bit quirky, because it passes on the fact that
|
83
|
+
# the resolve_url creates a url identical to one that is already stashed ("prosecutor.html").
|
84
|
+
# So it works, but because of a coupling to how Upton handles caching in the file system
|
85
|
+
|
86
|
+
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
|
87
|
+
propubscraper.debug = true
|
88
|
+
propubscraper.verbose = true
|
89
|
+
|
90
|
+
heds = propubscraper.scrape do |article_str|
|
91
|
+
doc = Nokogiri::HTML(article_str)
|
92
|
+
hed = doc.css('h1.article-title').text
|
93
|
+
end
|
94
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
95
|
+
heds.should eql ["A Prosecutor, a Wrongful Conviction and a Question of Justice"]
|
96
|
+
end
|
97
|
+
|
98
|
+
it "should scrape a list properly with the list helper" do
|
99
|
+
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
|
100
|
+
propubscraper.debug = true
|
101
|
+
propubscraper.verbose = true
|
102
|
+
list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
|
103
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
104
|
+
list.should eql @most_commented_heds
|
105
|
+
end
|
106
|
+
|
107
|
+
it "should scrape a table properly with the table helper" do
|
108
|
+
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
|
109
|
+
propubscraper.debug = true
|
110
|
+
propubscraper.verbose = true
|
111
|
+
table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
|
112
|
+
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
113
|
+
table.should eql @east_timor_prime_ministers
|
114
|
+
end
|
115
|
+
|
116
|
+
it "should test saving files with the right encoding"
|
117
|
+
it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
|
118
|
+
end
|
metadata
CHANGED
@@ -1,69 +1,83 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: upton
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jeremy B. Merrill
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rack
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '>='
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
25
39
|
- !ruby/object:Gem::Version
|
26
40
|
version: '0'
|
27
41
|
- !ruby/object:Gem::Dependency
|
28
42
|
name: thin
|
29
43
|
requirement: !ruby/object:Gem::Requirement
|
30
44
|
requirements:
|
31
|
-
- -
|
45
|
+
- - '>='
|
32
46
|
- !ruby/object:Gem::Version
|
33
47
|
version: '0'
|
34
48
|
type: :development
|
35
49
|
prerelease: false
|
36
50
|
version_requirements: !ruby/object:Gem::Requirement
|
37
51
|
requirements:
|
38
|
-
- -
|
52
|
+
- - '>='
|
39
53
|
- !ruby/object:Gem::Version
|
40
54
|
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: nokogiri
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
44
58
|
requirements:
|
45
|
-
- -
|
59
|
+
- - '>='
|
46
60
|
- !ruby/object:Gem::Version
|
47
61
|
version: '0'
|
48
62
|
type: :development
|
49
63
|
prerelease: false
|
50
64
|
version_requirements: !ruby/object:Gem::Requirement
|
51
65
|
requirements:
|
52
|
-
- -
|
66
|
+
- - '>='
|
53
67
|
- !ruby/object:Gem::Version
|
54
68
|
version: '0'
|
55
69
|
- !ruby/object:Gem::Dependency
|
56
70
|
name: yard
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
58
72
|
requirements:
|
59
|
-
- -
|
73
|
+
- - '>='
|
60
74
|
- !ruby/object:Gem::Version
|
61
75
|
version: '0'
|
62
76
|
type: :development
|
63
77
|
prerelease: false
|
64
78
|
version_requirements: !ruby/object:Gem::Requirement
|
65
79
|
requirements:
|
66
|
-
- -
|
80
|
+
- - '>='
|
67
81
|
- !ruby/object:Gem::Version
|
68
82
|
version: '0'
|
69
83
|
- !ruby/object:Gem::Dependency
|
@@ -84,14 +98,28 @@ dependencies:
|
|
84
98
|
name: nokogiri
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- -
|
101
|
+
- - '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - '>='
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: mechanize
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - '>='
|
88
116
|
- !ruby/object:Gem::Version
|
89
117
|
version: '0'
|
90
118
|
type: :runtime
|
91
119
|
prerelease: false
|
92
120
|
version_requirements: !ruby/object:Gem::Requirement
|
93
121
|
requirements:
|
94
|
-
- -
|
122
|
+
- - '>='
|
95
123
|
- !ruby/object:Gem::Version
|
96
124
|
version: '0'
|
97
125
|
description: Don't re-write web scrapers every time. Upton gives you a scraper template
|
@@ -103,13 +131,14 @@ extra_rdoc_files: []
|
|
103
131
|
files:
|
104
132
|
- lib/upton.rb
|
105
133
|
- lib/utils.rb
|
106
|
-
-
|
107
|
-
-
|
108
|
-
-
|
109
|
-
-
|
110
|
-
-
|
111
|
-
-
|
112
|
-
-
|
134
|
+
- spec/data/webinar.html
|
135
|
+
- spec/data/propublica-relative.html
|
136
|
+
- spec/data/propublica.html
|
137
|
+
- spec/data/prosecutor.html
|
138
|
+
- spec/data/sixfacts.html
|
139
|
+
- spec/data/discussion.html
|
140
|
+
- spec/data/easttimor.html
|
141
|
+
- spec/upton_spec.rb
|
113
142
|
homepage: http://github.org/propublica/upton
|
114
143
|
licenses:
|
115
144
|
- MIT
|
@@ -120,26 +149,27 @@ require_paths:
|
|
120
149
|
- lib
|
121
150
|
required_ruby_version: !ruby/object:Gem::Requirement
|
122
151
|
requirements:
|
123
|
-
- -
|
152
|
+
- - '>='
|
124
153
|
- !ruby/object:Gem::Version
|
125
154
|
version: 1.8.7
|
126
155
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
127
156
|
requirements:
|
128
|
-
- -
|
157
|
+
- - '>='
|
129
158
|
- !ruby/object:Gem::Version
|
130
159
|
version: '0'
|
131
160
|
requirements: []
|
132
161
|
rubyforge_project:
|
133
|
-
rubygems_version: 2.0.
|
162
|
+
rubygems_version: 2.0.2
|
134
163
|
signing_key:
|
135
164
|
specification_version: 4
|
136
165
|
summary: A simple web-scraping framework
|
137
166
|
test_files:
|
138
|
-
-
|
139
|
-
-
|
140
|
-
-
|
141
|
-
-
|
142
|
-
-
|
143
|
-
-
|
144
|
-
-
|
167
|
+
- spec/data/webinar.html
|
168
|
+
- spec/data/propublica-relative.html
|
169
|
+
- spec/data/propublica.html
|
170
|
+
- spec/data/prosecutor.html
|
171
|
+
- spec/data/sixfacts.html
|
172
|
+
- spec/data/discussion.html
|
173
|
+
- spec/data/easttimor.html
|
174
|
+
- spec/upton_spec.rb
|
145
175
|
has_rdoc: true
|
data/test/test_upton.rb
DELETED
@@ -1,141 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
|
3
|
-
require 'test/unit'
|
4
|
-
require 'rack'
|
5
|
-
require 'thin'
|
6
|
-
require 'nokogiri'
|
7
|
-
require 'restclient'
|
8
|
-
require './lib/upton'
|
9
|
-
require 'fileutils'
|
10
|
-
|
11
|
-
module Upton
|
12
|
-
module Test
|
13
|
-
|
14
|
-
# class ProPublicaScraper < Upton::Scraper
|
15
|
-
# def initialize(a, b, c)
|
16
|
-
# super
|
17
|
-
# @verbose = false
|
18
|
-
# @debug = false
|
19
|
-
# @stash_folder = "test_stashes"
|
20
|
-
# end
|
21
|
-
# end
|
22
|
-
|
23
|
-
|
24
|
-
class UptonTest < ::Test::Unit::TestCase
|
25
|
-
|
26
|
-
# def test_get_page
|
27
|
-
#TODO
|
28
|
-
# end
|
29
|
-
|
30
|
-
# def test_stash
|
31
|
-
#TODO
|
32
|
-
# end
|
33
|
-
|
34
|
-
def test_scrape
|
35
|
-
#this doesn't test stashing.
|
36
|
-
start_test_server()
|
37
|
-
|
38
|
-
headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
|
39
|
-
"",
|
40
|
-
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
41
|
-
"Six Facts Lost in the IRS Scandal"]
|
42
|
-
|
43
|
-
propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
|
44
|
-
propubscraper.debug = true
|
45
|
-
propubscraper.verbose = true
|
46
|
-
|
47
|
-
heds = propubscraper.scrape do |article_str|
|
48
|
-
doc = Nokogiri::HTML(article_str)
|
49
|
-
hed = doc.css('h1.article-title').text
|
50
|
-
end
|
51
|
-
assert_equal(heds, headlines)
|
52
|
-
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
53
|
-
end
|
54
|
-
|
55
|
-
def test_encodings
|
56
|
-
skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
|
57
|
-
end
|
58
|
-
|
59
|
-
def test_stashing
|
60
|
-
skip "should test stashing, make sure we never send too many requests"
|
61
|
-
end
|
62
|
-
|
63
|
-
def test_scrape_list
|
64
|
-
#this doesn't test stashing.
|
65
|
-
#TODO: needs a website that has links to a multi-page list (or table)
|
66
|
-
start_test_server()
|
67
|
-
|
68
|
-
most_commented_heds = [["Six Facts Lost in the IRS Scandal",
|
69
|
-
"How the IRS’s Nonprofit Division Got So Dysfunctional",
|
70
|
-
"Sound, Fury and the IRS Mess",
|
71
|
-
"The Most Important #Muckreads on Rape in the Military",
|
72
|
-
"Congressmen to Hagel: Where Are the Missing War Records?",
|
73
|
-
"As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
|
74
|
-
"A Prosecutor, a Wrongful Conviction and a Question of Justice",
|
75
|
-
"A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
|
76
|
-
"The Story Behind Our Hospital Interactive",
|
77
|
-
"irs-test-charts-for-embedding"]]
|
78
|
-
|
79
|
-
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
|
80
|
-
propubscraper.debug = true
|
81
|
-
propubscraper.verbose = true
|
82
|
-
list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
|
83
|
-
|
84
|
-
assert_equal(list, most_commented_heds)
|
85
|
-
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
86
|
-
end
|
87
|
-
|
88
|
-
def test_scrape_table
|
89
|
-
#this doesn't test stashing.
|
90
|
-
start_test_server()
|
91
|
-
|
92
|
-
east_timor_prime_ministers = [[
|
93
|
-
["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
|
94
|
-
"1", "2", "3", "4",],
|
95
|
-
[],
|
96
|
-
["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
|
97
|
-
["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
|
98
|
-
["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
|
99
|
-
["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
|
100
|
-
]]
|
101
|
-
|
102
|
-
propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
|
103
|
-
propubscraper.debug = true
|
104
|
-
propubscraper.verbose = true
|
105
|
-
table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
|
106
|
-
assert_equal(table, east_timor_prime_ministers)
|
107
|
-
FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
|
108
|
-
end
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
private
|
113
|
-
def start_test_server
|
114
|
-
@server_thread = Thread.new do
|
115
|
-
Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
|
116
|
-
end
|
117
|
-
sleep(1) # wait a sec for the server to be booted
|
118
|
-
end
|
119
|
-
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
# via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
|
124
|
-
class Server
|
125
|
-
def call(env)
|
126
|
-
@root = File.expand_path(File.dirname(__FILE__))
|
127
|
-
path = Rack::Utils.unescape(env['PATH_INFO'])
|
128
|
-
path += 'index.html' if path == '/'
|
129
|
-
file = File.join(@root, "data", path)
|
130
|
-
|
131
|
-
params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
|
132
|
-
|
133
|
-
if File.exists?(file)
|
134
|
-
[ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
|
135
|
-
else
|
136
|
-
[ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
|
137
|
-
end
|
138
|
-
end
|
139
|
-
end
|
140
|
-
end
|
141
|
-
end
|