upton 0.2.6 → 0.2.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- OWFkOGUyYjcyNzA3ZWQ2YTNmYTZmMjJjOTc3NzJiMjY0MTllMDhhOA==
5
- data.tar.gz: !binary |-
6
- YTA5YTEyMzczZjNjYjVlYjNmNmUyZWM0MTA4Zjk2NTRjYWQwZjFjMg==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MjFhYjI5OTYwZGVlYTNlMmNhYTc1OWQ5ZGJmMzBlN2FiM2U4MzllMDM4Nzhk
10
- MjJkMTczOGZjNWUwNDMyYmFlOGRkZDlhNjFkM2RlMzM1YjFmZTgyZWQ4MTBj
11
- MjY2ZmFiYmZlOTc5YmE2YzFjMWE1YjVjZWY2MWMyYTczZmEwNGU=
12
- data.tar.gz: !binary |-
13
- YmFjOTllZjdlNWIwNzhhMGIxODQwOTI1Y2EwY2YzMTE1YWEzOTdkMWI3NDEy
14
- ZjA1OTE1N2Q0OGYwOWEyYjVjMDM3ZWQ1NzlhZmU3NDZlNTAxNDJmZWZjZGFm
15
- YjUxMzc3ZThkZDg1ZDdkMjgwM2UyODMwZTZiMjdjZDAyNjAxNTQ=
2
+ SHA1:
3
+ metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
4
+ data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
5
+ SHA512:
6
+ metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
7
+ data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
data/lib/upton.rb CHANGED
@@ -28,7 +28,7 @@ module Upton
28
28
  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
29
29
  # block from Upton::Utils.
30
30
  # For more complicated cases; subclass Upton::Scraper
31
- # e.g. +MyScraper < Upton::Scraper+ and overrdie various methods.
31
+ # e.g. +MyScraper < Upton::Scraper+ and override various methods.
32
32
  ##
33
33
  class Scraper
34
34
 
@@ -53,28 +53,32 @@ module Upton
53
53
  # +selector+: The XPath expression or CSS selector that specifies the
54
54
  # anchor elements within the page, if a url is specified for
55
55
  # the previous argument.
56
- # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
56
+ # +selector_method+: Deprecated and ignored. Next breaking release will
57
+ # remove this option.
57
58
  #
58
- # These options are a shortcut. If you plant to override +get_index+, you
59
+ # These options are a shortcut. If you plan to override +get_index+, you
59
60
  # do not need to set them.
60
61
  # If you don't specify a selector, the first argument will be treated as a
61
62
  # list of URLs.
62
63
  ##
63
- def initialize(index_url_or_array, selector="", selector_method=:xpath)
64
64
 
65
+ # DEPRECATION NOTE, re: selector_method
66
+ # the selector_method parameter is unneeded, as Nokogiri provides the
67
+ # #search method, which picks a selector depending on whether
68
+ # the String passed is of CSS/XPath notation
69
+
70
+ def initialize(index_url_or_array, selector="", selector_method=:deprecated)
71
+
65
72
  #if first arg is a valid URL, do already-written stuff;
66
73
  #if it's not (or if it's a list?) don't bother with get_index, etc.
67
74
  #e.g. Scraper.new(["http://jeremybmerrill.com"])
68
75
 
69
76
  #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
70
- if selector.empty?
77
+ if index_url_or_array.respond_to? :each_with_index
71
78
  @url_array = index_url_or_array
72
- elsif index_url_or_array =~ ::URI::ABS_URI
79
+ else
73
80
  @index_url = index_url_or_array
74
81
  @index_selector = selector
75
- @index_selector_method = selector_method
76
- else
77
- raise ArgumentError
78
82
  end
79
83
  # If true, then Upton prints information about when it gets
80
84
  # files from the internet and when it gets them from its stash.
@@ -97,9 +101,9 @@ module Upton
97
101
 
98
102
  # Folder name for stashes, if you want them to be stored somewhere else,
99
103
  # e.g. under /tmp.
100
- @stash_folder = "stashes"
104
+ @stash_folder ||= "stashes"
101
105
  unless Dir.exists?(@stash_folder)
102
- Dir.mkdir(@stash_folder)
106
+ FileUtils.mkdir_p(@stash_folder)
103
107
  end
104
108
  end
105
109
 
@@ -114,7 +118,7 @@ module Upton
114
118
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
115
119
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
116
120
  ##
117
- def next_instance_page_url(url, index)
121
+ def next_instance_page_url(url, pagination_index)
118
122
  ""
119
123
  end
120
124
 
@@ -129,7 +133,7 @@ module Upton
129
133
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
130
134
  # ought to return "http://whatever.com/articles?page=2"
131
135
  ##
132
- def next_index_page_url(url, index)
136
+ def next_index_page_url(url, pagination_index)
133
137
  ""
134
138
  end
135
139
 
@@ -142,29 +146,64 @@ module Upton
142
146
  self.url_array = self.get_index
143
147
  end
144
148
  CSV.open filename, 'wb' do |csv|
145
- self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
149
+ #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
150
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
151
+ puts document.inspect
152
+ if document[0].respond_to? :map
153
+ document.each{|row| csv << row }
154
+ else
155
+ csv << document
156
+ end
157
+ end
158
+ #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
159
+ end
160
+ end
161
+
162
+ def scrape_to_tsv filename, &blk
163
+ require 'csv'
164
+ unless self.url_array
165
+ self.url_array = self.get_index
166
+ end
167
+ CSV.open filename, 'wb', :col_sep => "\t" do |csv|
168
+ #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
169
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
170
+ puts document.inspect
171
+ if document[0].respond_to? :map
172
+ document.each{|row| csv << row }
173
+ else
174
+ csv << document
175
+ end
176
+ end
177
+ #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
146
178
  end
147
179
  end
148
180
 
149
181
  protected
150
182
 
183
+ ##
184
+ # Actually fetches the page
185
+ ##
186
+ def fetch_page(url, options={})
187
+ RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
188
+ end
189
+
151
190
  ##
152
191
  # Handles getting pages with RestClient or getting them from the local stash.
153
192
  #
154
193
  # Uses a kludge (because rest-client is outdated) to handle encoding.
155
194
  ##
156
- def get_page(url, stash=false)
195
+ def get_page(url, stash=false, options={})
157
196
  return "" if url.empty?
158
197
 
159
198
  #the filename for each stashed version is a cleaned version of the URL.
160
- if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
199
+ if stash && File.exists?( url_to_filename(url, options) )
161
200
  puts "usin' a stashed copy of " + url if @verbose
162
- resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
201
+ resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
163
202
  else
164
203
  begin
165
204
  puts "getting " + url if @verbose
166
205
  sleep @sleep_time_between_requests
167
- resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
206
+ resp = fetch_page(url, options)
168
207
 
169
208
  #this is silly, but rest-client needs to get on their game.
170
209
  #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
@@ -188,42 +227,95 @@ module Upton
188
227
  rescue URI::InvalidURIError
189
228
  puts "Invalid URI: #{url}" if @verbose
190
229
  resp = ""
230
+ rescue RestClient::RequestTimeout
231
+ "Timeout: #{url}" if @verbose
232
+ retry
191
233
  end
192
234
  if stash
193
235
  puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
194
- open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
236
+ open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
195
237
  end
196
238
  end
197
239
  resp
198
240
  end
199
241
 
242
+ def url_to_filename(url, options={})
243
+ File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
244
+ end
245
+
246
+
247
+ ##
248
+ # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
249
+ # resolve_url resolves them to absolute urls.
250
+ # absolute_url_str must be a URL, as a string, that is absolute.
251
+ ##
252
+ def resolve_url(href_str, absolute_url_str)
253
+ absolute_url = URI(absolute_url_str).dup
254
+ raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
255
+ href = URI(href_str).dup
256
+
257
+ # return :href if :href is already absolute
258
+ return href.to_s if href.absolute?
259
+
260
+ #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
261
+ URI.join(absolute_url, href).to_s
262
+ end
263
+
200
264
  ##
201
265
  # Return a list of URLs for the instances you want to scrape.
202
266
  # This can optionally be overridden if, for example, the list of instances
203
267
  # comes from an API.
204
268
  ##
205
269
  def get_index
206
- parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
270
+ # TODO: Deprecate @index_selector_method in next minor release
271
+ parse_index(get_index_pages(@index_url, 1), @index_selector)
207
272
  end
208
273
 
209
274
  ##
210
275
  # Using the XPath expression or CSS selector and selector_method that
211
- # uniquely identifies the links in the index, return those links as strings.
212
- ##
213
- def parse_index(text, selector, selector_method=:xpath)
214
- Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
276
+ # uniquely identifies the links in the index, return those links as strings. ##
277
+ def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
278
+ # for now, override selector_method with :search, which will work with either CSS or XPath
279
+ Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
280
+ end
281
+
282
+ # TODO: Not sure the best way to handle this
283
+ # Currently, #parse_index is called upon #get_index_pages,
284
+ # which itself is dependent on @index_url
285
+ # Does @index_url stay unaltered for the lifetime of the Upton instance?
286
+ # It seems to at this point, but that may be something that gets
287
+ # deprecated later
288
+ #
289
+ # So for now, @index_url is used in conjunction with resolve_url
290
+ # to make sure that this method returns absolute urls
291
+ # i.e. this method expects @index_url to always have an absolute address
292
+ # for the lifetime of an Upton instance
293
+ def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
294
+ # for now, override selector_method with :search, which will work with either CSS or XPath
295
+ Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
296
+ href = a_element["href"]
297
+ u = resolve_url( href, @index_url) unless href.nil?
298
+ unless u == href
299
+ puts "resolved #{href} to #{u}"
300
+ end
301
+ u
302
+ end
215
303
  end
216
304
 
305
+
217
306
  ##
218
307
  # Returns the concatenated output of each member of a paginated index,
219
308
  # e.g. a site listing links with 2+ pages.
220
309
  ##
221
- def get_index_pages(url, index)
222
- resp = self.get_page(url, @index_debug)
310
+ def get_index_pages(url, pagination_index, options={})
311
+ resp = self.get_page(url, @index_debug, options)
223
312
  if !resp.empty?
224
- next_url = self.next_index_page_url(url, index + 1)
313
+ next_url = self.next_index_page_url(url, pagination_index + 1)
314
+ # resolve to absolute url
315
+ #
316
+ next_url = resolve_url(next_url, url)
225
317
  unless next_url == url
226
- next_resp = self.get_index_pages(next_url, index + 1).to_s
318
+ next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
227
319
  resp += next_resp
228
320
  end
229
321
  end
@@ -231,19 +323,21 @@ module Upton
231
323
  end
232
324
 
233
325
  ##
234
- # Returns the article at `url`.
326
+ # Returns the instance at `url`.
235
327
  #
236
328
  # If the page is stashed, returns that, otherwise, fetches it from the web.
237
329
  #
238
330
  # If an instance is paginated, returns the concatenated output of each
239
331
  # page, e.g. if a news article has two pages.
240
332
  ##
241
- def get_instance(url, index=0)
242
- resp = self.get_page(url, @debug)
333
+ def get_instance(url, pagination_index=0, options={})
334
+ resp = self.get_page(url, @debug, options)
243
335
  if !resp.empty?
244
- next_url = self.next_instance_page_url(url, index + 1)
336
+ next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
337
+
338
+ # next_url = resolve_url(next_url, url)
245
339
  unless next_url == url
246
- next_resp = self.get_instance(next_url, index + 1).to_s
340
+ next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
247
341
  resp += next_resp
248
342
  end
249
343
  end
@@ -253,8 +347,9 @@ module Upton
253
347
  # Just a helper for +scrape+.
254
348
  def scrape_from_list(list, blk)
255
349
  puts "Scraping #{list.size} instances" if @verbose
256
- list.each_with_index.map do |instance_url, index|
257
- blk.call(get_instance(instance_url), instance_url, index)
350
+ list.each_with_index.map do |instance_url, instance_index|
351
+ instance_resp = get_instance instance_url, nil, :instance_index => instance_index
352
+ blk.call(instance_resp, instance_url, instance_index)
258
353
  end
259
354
  end
260
355
 
data/lib/utils.rb CHANGED
@@ -18,7 +18,6 @@ module Upton
18
18
  # present, is returned as the first row.
19
19
  ##
20
20
  def self.table(table_selector, selector_method=:xpath)
21
- require 'csv'
22
21
  return Proc.new do |instance_html|
23
22
  html = ::Nokogiri::HTML(instance_html)
24
23
  output = []
@@ -34,11 +33,42 @@ module Upton
34
33
  # Scrapes any set of HTML elements into an Array.
35
34
  ##
36
35
  def self.list(list_selector, selector_method=:xpath)
37
- require 'csv'
38
36
  return Proc.new do |instance_html|
39
37
  html = ::Nokogiri::HTML(instance_html)
40
38
  html.send(selector_method, list_selector).map{|list_element| list_element.text }
41
39
  end
42
40
  end
41
+
42
+ ##
43
+ # Takes :_href and resolves it to an absolute URL according to
44
+ # the supplied :_page_url. They can be either Strings or URI
45
+ # instances.
46
+ #
47
+ # raises ArgumentError if either href or page_url is nil
48
+ # raises ArgumentError if page_url is not absolute
49
+ #
50
+ # returns: a String with absolute URL
51
+ def self.resolve_url(_href, _page_url)
52
+
53
+ page_url = URI(_page_url).dup
54
+ raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
55
+
56
+ href = URI(_href).dup
57
+
58
+ # return :href if :href is already absolute
59
+ return href.to_s if href.absolute?
60
+
61
+
62
+ # TODO: There may be edge cases worth considering
63
+ # but this should handle the following non-absolute href possibilities:
64
+ # //anothersite.com (keeps scheme, too!)
65
+ # /root/dir
66
+ # relative/dir
67
+ # ?query=2
68
+ # #bang
69
+
70
+ URI.join(page_url, href).to_s
71
+ end
72
+
43
73
  end
44
74
  end
File without changes
File without changes
@@ -0,0 +1,17 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Document</title>
6
+ </head>
7
+ <body>
8
+ <!-- refactored fixture for relative URL testing -->
9
+
10
+ <h2><a href="iamnottobeselected.html" class="title-link">An unnecessary proof of concept but just for kicks</a></h2>
11
+
12
+ <section id="river">
13
+ <h1><a href="prosecutor.html" class="title-link">A Prosecutor, a Wrongful Conviction and a Question of Justice</a></h1>
14
+ </section>
15
+
16
+ </body>
17
+ </html>
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,118 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rack'
4
+ require 'thin'
5
+ require 'nokogiri'
6
+ require 'restclient'
7
+ require 'fileutils'
8
+ require './lib/upton'
9
+
10
+ describe Upton do
11
+ before :all do
12
+ #start the server
13
+ class Server
14
+ def call(env)
15
+ @root = File.expand_path(File.dirname(__FILE__))
16
+ path = Rack::Utils.unescape(env['PATH_INFO'])
17
+ path += 'index.html' if path == '/'
18
+ file = File.join(@root, "data", path)
19
+
20
+ params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
21
+
22
+ if File.exists?(file)
23
+ [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
24
+ else
25
+ [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
26
+ end
27
+ end
28
+ end
29
+
30
+ def start_test_server
31
+ @server_thread = Thread.new do
32
+ Rack::Handler::Thin.run ::Server.new, :Port => 9876
33
+ end
34
+ sleep(1) # wait a sec for the server to be booted
35
+ end
36
+
37
+ start_test_server()
38
+
39
+ @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
40
+ "",
41
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
42
+ "Six Facts Lost in the IRS Scandal"]
43
+ @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
44
+ "How the IRS’s Nonprofit Division Got So Dysfunctional",
45
+ "Sound, Fury and the IRS Mess",
46
+ "The Most Important #Muckreads on Rape in the Military",
47
+ "Congressmen to Hagel: Where Are the Missing War Records?",
48
+ "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
49
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
50
+ "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
51
+ "The Story Behind Our Hospital Interactive",
52
+ "irs-test-charts-for-embedding"]]
53
+ @east_timor_prime_ministers = [[
54
+ ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
55
+ "1", "2", "3", "4",],
56
+ [],
57
+ ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
58
+ ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
59
+ ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
60
+ ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
61
+ ]]
62
+ end
63
+
64
+ it "should scrape in the basic case" do
65
+ propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
66
+ propubscraper.debug = true
67
+ propubscraper.verbose = true
68
+
69
+ heds = propubscraper.scrape do |article_str|
70
+ doc = Nokogiri::HTML(article_str)
71
+ hed = doc.css('h1.article-title').text
72
+ end
73
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
74
+ heds.should eql @headlines
75
+ end
76
+
77
+ it 'should properly handle relative urls' do
78
+ # uses a modified page from the previous test in which the target
79
+ # href, http://127.0.0.1:9876/prosecutors.html, has been changed
80
+ # to a relative url
81
+ #
82
+ # Note: this test is a bit quirky, because it passes on the fact that
83
+ # the resolve_url creates a url identical to one that is already stashed ("prosecutors.html").
84
+ # So it works, but because of a coupling to how Upton handles caching in the file system
85
+
86
+ propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
87
+ propubscraper.debug = true
88
+ propubscraper.verbose = true
89
+
90
+ heds = propubscraper.scrape do |article_str|
91
+ doc = Nokogiri::HTML(article_str)
92
+ hed = doc.css('h1.article-title').text
93
+ end
94
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
95
+ heds.should eql ["A Prosecutor, a Wrongful Conviction and a Question of Justice"]
96
+ end
97
+
98
+ it "should scrape a list properly with the list helper" do
99
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
100
+ propubscraper.debug = true
101
+ propubscraper.verbose = true
102
+ list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
103
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
104
+ list.should eql @most_commented_heds
105
+ end
106
+
107
+ it "should scrape a table properly with the table helper" do
108
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
109
+ propubscraper.debug = true
110
+ propubscraper.verbose = true
111
+ table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
112
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
113
+ table.should eql @east_timor_prime_ministers
114
+ end
115
+
116
+ it "should test saving files with the right encoding"
117
+ it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
118
+ end
metadata CHANGED
@@ -1,69 +1,83 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-22 00:00:00.000000000 Z
11
+ date: 2013-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - '>='
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
25
39
  - !ruby/object:Gem::Version
26
40
  version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: thin
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
- - - ! '>='
45
+ - - '>='
32
46
  - !ruby/object:Gem::Version
33
47
  version: '0'
34
48
  type: :development
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
- - - ! '>='
52
+ - - '>='
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: nokogiri
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - ! '>='
59
+ - - '>='
46
60
  - !ruby/object:Gem::Version
47
61
  version: '0'
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
- - - ! '>='
66
+ - - '>='
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: yard
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
- - - ! '>='
73
+ - - '>='
60
74
  - !ruby/object:Gem::Version
61
75
  version: '0'
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - ! '>='
80
+ - - '>='
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
@@ -84,14 +98,28 @@ dependencies:
84
98
  name: nokogiri
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - ! '>='
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: mechanize
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
88
116
  - !ruby/object:Gem::Version
89
117
  version: '0'
90
118
  type: :runtime
91
119
  prerelease: false
92
120
  version_requirements: !ruby/object:Gem::Requirement
93
121
  requirements:
94
- - - ! '>='
122
+ - - '>='
95
123
  - !ruby/object:Gem::Version
96
124
  version: '0'
97
125
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -103,13 +131,14 @@ extra_rdoc_files: []
103
131
  files:
104
132
  - lib/upton.rb
105
133
  - lib/utils.rb
106
- - test/data/discussion.html
107
- - test/data/easttimor.html
108
- - test/data/propublica.html
109
- - test/data/prosecutor.html
110
- - test/data/sixfacts.html
111
- - test/data/webinar.html
112
- - test/test_upton.rb
134
+ - spec/data/webinar.html
135
+ - spec/data/propublica-relative.html
136
+ - spec/data/propublica.html
137
+ - spec/data/prosecutor.html
138
+ - spec/data/sixfacts.html
139
+ - spec/data/discussion.html
140
+ - spec/data/easttimor.html
141
+ - spec/upton_spec.rb
113
142
  homepage: http://github.org/propublica/upton
114
143
  licenses:
115
144
  - MIT
@@ -120,26 +149,27 @@ require_paths:
120
149
  - lib
121
150
  required_ruby_version: !ruby/object:Gem::Requirement
122
151
  requirements:
123
- - - ! '>='
152
+ - - '>='
124
153
  - !ruby/object:Gem::Version
125
154
  version: 1.8.7
126
155
  required_rubygems_version: !ruby/object:Gem::Requirement
127
156
  requirements:
128
- - - ! '>='
157
+ - - '>='
129
158
  - !ruby/object:Gem::Version
130
159
  version: '0'
131
160
  requirements: []
132
161
  rubyforge_project:
133
- rubygems_version: 2.0.5
162
+ rubygems_version: 2.0.2
134
163
  signing_key:
135
164
  specification_version: 4
136
165
  summary: A simple web-scraping framework
137
166
  test_files:
138
- - test/data/discussion.html
139
- - test/data/easttimor.html
140
- - test/data/propublica.html
141
- - test/data/prosecutor.html
142
- - test/data/sixfacts.html
143
- - test/data/webinar.html
144
- - test/test_upton.rb
167
+ - spec/data/webinar.html
168
+ - spec/data/propublica-relative.html
169
+ - spec/data/propublica.html
170
+ - spec/data/prosecutor.html
171
+ - spec/data/sixfacts.html
172
+ - spec/data/discussion.html
173
+ - spec/data/easttimor.html
174
+ - spec/upton_spec.rb
145
175
  has_rdoc: true
data/test/test_upton.rb DELETED
@@ -1,141 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- require 'test/unit'
4
- require 'rack'
5
- require 'thin'
6
- require 'nokogiri'
7
- require 'restclient'
8
- require './lib/upton'
9
- require 'fileutils'
10
-
11
- module Upton
12
- module Test
13
-
14
- # class ProPublicaScraper < Upton::Scraper
15
- # def initialize(a, b, c)
16
- # super
17
- # @verbose = false
18
- # @debug = false
19
- # @stash_folder = "test_stashes"
20
- # end
21
- # end
22
-
23
-
24
- class UptonTest < ::Test::Unit::TestCase
25
-
26
- # def test_get_page
27
- #TODO
28
- # end
29
-
30
- # def test_stash
31
- #TODO
32
- # end
33
-
34
- def test_scrape
35
- #this doesn't test stashing.
36
- start_test_server()
37
-
38
- headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
39
- "",
40
- "A Prosecutor, a Wrongful Conviction and a Question of Justice",
41
- "Six Facts Lost in the IRS Scandal"]
42
-
43
- propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
44
- propubscraper.debug = true
45
- propubscraper.verbose = true
46
-
47
- heds = propubscraper.scrape do |article_str|
48
- doc = Nokogiri::HTML(article_str)
49
- hed = doc.css('h1.article-title').text
50
- end
51
- assert_equal(heds, headlines)
52
- FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
53
- end
54
-
55
- def test_encodings
56
- skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
57
- end
58
-
59
- def test_stashing
60
- skip "should test stashing, make sure we never send too many requests"
61
- end
62
-
63
- def test_scrape_list
64
- #this doesn't test stashing.
65
- #TODO: needs a website that has links to a multi-page list (or table)
66
- start_test_server()
67
-
68
- most_commented_heds = [["Six Facts Lost in the IRS Scandal",
69
- "How the IRS’s Nonprofit Division Got So Dysfunctional",
70
- "Sound, Fury and the IRS Mess",
71
- "The Most Important #Muckreads on Rape in the Military",
72
- "Congressmen to Hagel: Where Are the Missing War Records?",
73
- "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
74
- "A Prosecutor, a Wrongful Conviction and a Question of Justice",
75
- "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
76
- "The Story Behind Our Hospital Interactive",
77
- "irs-test-charts-for-embedding"]]
78
-
79
- propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
80
- propubscraper.debug = true
81
- propubscraper.verbose = true
82
- list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
83
-
84
- assert_equal(list, most_commented_heds)
85
- FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
86
- end
87
-
88
- def test_scrape_table
89
- #this doesn't test stashing.
90
- start_test_server()
91
-
92
- east_timor_prime_ministers = [[
93
- ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
94
- "1", "2", "3", "4",],
95
- [],
96
- ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
97
- ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
98
- ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
99
- ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
100
- ]]
101
-
102
- propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
103
- propubscraper.debug = true
104
- propubscraper.verbose = true
105
- table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
106
- assert_equal(table, east_timor_prime_ministers)
107
- FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
108
- end
109
-
110
-
111
-
112
- private
113
- def start_test_server
114
- @server_thread = Thread.new do
115
- Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
116
- end
117
- sleep(1) # wait a sec for the server to be booted
118
- end
119
- end
120
-
121
-
122
-
123
- # via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
124
- class Server
125
- def call(env)
126
- @root = File.expand_path(File.dirname(__FILE__))
127
- path = Rack::Utils.unescape(env['PATH_INFO'])
128
- path += 'index.html' if path == '/'
129
- file = File.join(@root, "data", path)
130
-
131
- params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
132
-
133
- if File.exists?(file)
134
- [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
135
- else
136
- [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
137
- end
138
- end
139
- end
140
- end
141
- end