upton 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,7 @@
1
1
  ---
2
- !binary "U0hBMQ==":
3
- metadata.gz: !binary |-
4
- OWFkOGUyYjcyNzA3ZWQ2YTNmYTZmMjJjOTc3NzJiMjY0MTllMDhhOA==
5
- data.tar.gz: !binary |-
6
- YTA5YTEyMzczZjNjYjVlYjNmNmUyZWM0MTA4Zjk2NTRjYWQwZjFjMg==
7
- !binary "U0hBNTEy":
8
- metadata.gz: !binary |-
9
- MjFhYjI5OTYwZGVlYTNlMmNhYTc1OWQ5ZGJmMzBlN2FiM2U4MzllMDM4Nzhk
10
- MjJkMTczOGZjNWUwNDMyYmFlOGRkZDlhNjFkM2RlMzM1YjFmZTgyZWQ4MTBj
11
- MjY2ZmFiYmZlOTc5YmE2YzFjMWE1YjVjZWY2MWMyYTczZmEwNGU=
12
- data.tar.gz: !binary |-
13
- YmFjOTllZjdlNWIwNzhhMGIxODQwOTI1Y2EwY2YzMTE1YWEzOTdkMWI3NDEy
14
- ZjA1OTE1N2Q0OGYwOWEyYjVjMDM3ZWQ1NzlhZmU3NDZlNTAxNDJmZWZjZGFm
15
- YjUxMzc3ZThkZDg1ZDdkMjgwM2UyODMwZTZiMjdjZDAyNjAxNTQ=
2
+ SHA1:
3
+ metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
4
+ data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
5
+ SHA512:
6
+ metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
7
+ data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
data/lib/upton.rb CHANGED
@@ -28,7 +28,7 @@ module Upton
28
28
  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
29
29
  # block from Upton::Utils.
30
30
  # For more complicated cases, subclass Upton::Scraper
31
- # e.g. +MyScraper < Upton::Scraper+ and overrdie various methods.
31
+ # e.g. +MyScraper < Upton::Scraper+ and override various methods.
32
32
  ##
33
33
  class Scraper
34
34
 
@@ -53,28 +53,32 @@ module Upton
53
53
  # +selector+: The XPath expression or CSS selector that specifies the
54
54
  # anchor elements within the page, if a url is specified for
55
55
  # the previous argument.
56
- # +selector_method+: +:xpath+ or +:css+. By default, +:xpath+.
56
+ # +selector_method+: Deprecated and ignored. Next breaking release will
57
+ # remove this option.
57
58
  #
58
- # These options are a shortcut. If you plant to override +get_index+, you
59
+ # These options are a shortcut. If you plan to override +get_index+, you
59
60
  # do not need to set them.
60
61
  # If you don't specify a selector, the first argument will be treated as a
61
62
  # list of URLs.
62
63
  ##
63
- def initialize(index_url_or_array, selector="", selector_method=:xpath)
64
64
 
65
+ # DEPRECATION NOTE, re: selector_method
66
+ # the selector_method parameter is unneeded, as Nokogiri provides the
67
+ # #search method, which picks a selector depending on whether
68
+ # the String passed is of CSS/XPath notation
69
+
70
+ def initialize(index_url_or_array, selector="", selector_method=:deprecated)
71
+
65
72
  #if first arg is a valid URL, do already-written stuff;
66
73
  #if it's not (or if it's a list?) don't bother with get_index, etc.
67
74
  #e.g. Scraper.new(["http://jeremybmerrill.com"])
68
75
 
69
76
  #TODO: rewrite this, because it's a little silly. (i.e. should be a more sensical division of how these arguments work)
70
- if selector.empty?
77
+ if index_url_or_array.respond_to? :each_with_index
71
78
  @url_array = index_url_or_array
72
- elsif index_url_or_array =~ ::URI::ABS_URI
79
+ else
73
80
  @index_url = index_url_or_array
74
81
  @index_selector = selector
75
- @index_selector_method = selector_method
76
- else
77
- raise ArgumentError
78
82
  end
79
83
  # If true, then Upton prints information about when it gets
80
84
  # files from the internet and when it gets them from its stash.
@@ -97,9 +101,9 @@ module Upton
97
101
 
98
102
  # Folder name for stashes, if you want them to be stored somewhere else,
99
103
  # e.g. under /tmp.
100
- @stash_folder = "stashes"
104
+ @stash_folder ||= "stashes"
101
105
  unless Dir.exists?(@stash_folder)
102
- Dir.mkdir(@stash_folder)
106
+ FileUtils.mkdir_p(@stash_folder)
103
107
  end
104
108
  end
105
109
 
@@ -114,7 +118,7 @@ module Upton
114
118
  # e.g. next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)
115
119
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
116
120
  ##
117
- def next_instance_page_url(url, index)
121
+ def next_instance_page_url(url, pagination_index)
118
122
  ""
119
123
  end
120
124
 
@@ -129,7 +133,7 @@ module Upton
129
133
  # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
130
134
  # ought to return "http://whatever.com/articles?page=2"
131
135
  ##
132
- def next_index_page_url(url, index)
136
+ def next_index_page_url(url, pagination_index)
133
137
  ""
134
138
  end
135
139
 
@@ -142,29 +146,64 @@ module Upton
142
146
  self.url_array = self.get_index
143
147
  end
144
148
  CSV.open filename, 'wb' do |csv|
145
- self.scrape_from_list(self.url_array, blk).each{|document| csv << document }
149
+ #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
150
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
151
+ puts document.inspect
152
+ if document[0].respond_to? :map
153
+ document.each{|row| csv << row }
154
+ else
155
+ csv << document
156
+ end
157
+ end
158
+ #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
159
+ end
160
+ end
161
+
162
+ def scrape_to_tsv filename, &blk
163
+ require 'csv'
164
+ unless self.url_array
165
+ self.url_array = self.get_index
166
+ end
167
+ CSV.open filename, 'wb', :col_sep => "\t" do |csv|
168
+ #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
169
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
170
+ puts document.inspect
171
+ if document[0].respond_to? :map
172
+ document.each{|row| csv << row }
173
+ else
174
+ csv << document
175
+ end
176
+ end
177
+ #self.scrape_from_list(self.url_array, blk).compact.each{|document| csv << document }
146
178
  end
147
179
  end
148
180
 
149
181
  protected
150
182
 
183
+ ##
184
+ # Actually fetches the page
185
+ ##
186
+ def fetch_page(url, options={})
187
+ RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
188
+ end
189
+
151
190
  ##
152
191
  # Handles getting pages with RestClient or getting them from the local stash.
153
192
  #
154
193
  # Uses a kludge (because rest-client is outdated) to handle encoding.
155
194
  ##
156
- def get_page(url, stash=false)
195
+ def get_page(url, stash=false, options={})
157
196
  return "" if url.empty?
158
197
 
159
198
  #the filename for each stashed version is a cleaned version of the URL.
160
- if stash && File.exists?( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ) )
199
+ if stash && File.exists?( url_to_filename(url, options) )
161
200
  puts "usin' a stashed copy of " + url if @verbose
162
- resp = open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "")), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
201
+ resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
163
202
  else
164
203
  begin
165
204
  puts "getting " + url if @verbose
166
205
  sleep @sleep_time_between_requests
167
- resp = RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
206
+ resp = fetch_page(url, options)
168
207
 
169
208
  #this is silly, but rest-client needs to get on their game.
170
209
  #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
@@ -188,42 +227,95 @@ module Upton
188
227
  rescue URI::InvalidURIError
189
228
  puts "Invalid URI: #{url}" if @verbose
190
229
  resp = ""
230
+ rescue RestClient::RequestTimeout
231
+ "Timeout: #{url}" if @verbose
232
+ retry
191
233
  end
192
234
  if stash
193
235
  puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
194
- open( File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") ), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
236
+ open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
195
237
  end
196
238
  end
197
239
  resp
198
240
  end
199
241
 
242
+ def url_to_filename(url, options={})
243
+ File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
244
+ end
245
+
246
+
247
+ ##
248
+ # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
249
+ # resolve_url resolves them to absolute urls.
250
+ # absolute_url_str must be a URL, as a string, that is absolute.
251
+ ##
252
+ def resolve_url(href_str, absolute_url_str)
253
+ absolute_url = URI(absolute_url_str).dup
254
+ raise ArgumentError, "#{absolute_url} must be absolute" unless absolute_url.absolute?
255
+ href = URI(href_str).dup
256
+
257
+ # return :href if :href is already absolute
258
+ return href.to_s if href.absolute?
259
+
260
+ #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
261
+ URI.join(absolute_url, href).to_s
262
+ end
263
+
200
264
  ##
201
265
  # Return a list of URLs for the instances you want to scrape.
202
266
  # This can optionally be overridden if, for example, the list of instances
203
267
  # comes from an API.
204
268
  ##
205
269
  def get_index
206
- parse_index(get_index_pages(@index_url, 1), @index_selector, @index_selector_method)
270
+ # TODO: Deprecate @index_selector_method in next minor release
271
+ parse_index(get_index_pages(@index_url, 1), @index_selector)
207
272
  end
208
273
 
209
274
  ##
210
275
  # Using the XPath expression or CSS selector and selector_method that
211
- # uniquely identifies the links in the index, return those links as strings.
212
- ##
213
- def parse_index(text, selector, selector_method=:xpath)
214
- Nokogiri::HTML(text).send(selector_method, selector).to_a.map{|l| l["href"] }
276
+ # uniquely identifies the links in the index, return those links as strings. ##
277
+ def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
278
+ # for now, override selector_method with :search, which will work with either CSS or XPath
279
+ Nokogiri::HTML(text).search(selector).to_a.map{|l| l["href"] }
280
+ end
281
+
282
+ # TODO: Not sure the best way to handle this
283
+ # Currently, #parse_index is called upon #get_index_pages,
284
+ # which itself is dependent on @index_url
285
+ # Does @index_url stay unaltered for the lifetime of the Upton instance?
286
+ # It seems to at this point, but that may be something that gets
287
+ # deprecated later
288
+ #
289
+ # So for now, @index_url is used in conjunction with resolve_url
290
+ # to make sure that this method returns absolute urls
291
+ # i.e. this method expects @index_url to always have an absolute address
292
+ # for the lifetime of an Upton instance
293
+ def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
294
+ # for now, override selector_method with :search, which will work with either CSS or XPath
295
+ Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
296
+ href = a_element["href"]
297
+ u = resolve_url( href, @index_url) unless href.nil?
298
+ unless u == href
299
+ puts "resolved #{href} to #{u}"
300
+ end
301
+ u
302
+ end
215
303
  end
216
304
 
305
+
217
306
  ##
218
307
  # Returns the concatenated output of each member of a paginated index,
219
308
  # e.g. a site listing links with 2+ pages.
220
309
  ##
221
- def get_index_pages(url, index)
222
- resp = self.get_page(url, @index_debug)
310
+ def get_index_pages(url, pagination_index, options={})
311
+ resp = self.get_page(url, @index_debug, options)
223
312
  if !resp.empty?
224
- next_url = self.next_index_page_url(url, index + 1)
313
+ next_url = self.next_index_page_url(url, pagination_index + 1)
314
+ # resolve to absolute url
315
+ #
316
+ next_url = resolve_url(next_url, url)
225
317
  unless next_url == url
226
- next_resp = self.get_index_pages(next_url, index + 1).to_s
318
+ next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
227
319
  resp += next_resp
228
320
  end
229
321
  end
@@ -231,19 +323,21 @@ module Upton
231
323
  end
232
324
 
233
325
  ##
234
- # Returns the article at `url`.
326
+ # Returns the instance at `url`.
235
327
  #
236
328
  # If the page is stashed, returns that, otherwise, fetches it from the web.
237
329
  #
238
330
  # If an instance is paginated, returns the concatenated output of each
239
331
  # page, e.g. if a news article has two pages.
240
332
  ##
241
- def get_instance(url, index=0)
242
- resp = self.get_page(url, @debug)
333
+ def get_instance(url, pagination_index=0, options={})
334
+ resp = self.get_page(url, @debug, options)
243
335
  if !resp.empty?
244
- next_url = self.next_instance_page_url(url, index + 1)
336
+ next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
337
+
338
+ # next_url = resolve_url(next_url, url)
245
339
  unless next_url == url
246
- next_resp = self.get_instance(next_url, index + 1).to_s
340
+ next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
247
341
  resp += next_resp
248
342
  end
249
343
  end
@@ -253,8 +347,9 @@ module Upton
253
347
  # Just a helper for +scrape+.
254
348
  def scrape_from_list(list, blk)
255
349
  puts "Scraping #{list.size} instances" if @verbose
256
- list.each_with_index.map do |instance_url, index|
257
- blk.call(get_instance(instance_url), instance_url, index)
350
+ list.each_with_index.map do |instance_url, instance_index|
351
+ instance_resp = get_instance instance_url, nil, :instance_index => instance_index
352
+ blk.call(instance_resp, instance_url, instance_index)
258
353
  end
259
354
  end
260
355
 
data/lib/utils.rb CHANGED
@@ -18,7 +18,6 @@ module Upton
18
18
  # present, is returned as the first row.
19
19
  ##
20
20
  def self.table(table_selector, selector_method=:xpath)
21
- require 'csv'
22
21
  return Proc.new do |instance_html|
23
22
  html = ::Nokogiri::HTML(instance_html)
24
23
  output = []
@@ -34,11 +33,42 @@ module Upton
34
33
  # Scrapes any set of HTML elements into an Array.
35
34
  ##
36
35
  def self.list(list_selector, selector_method=:xpath)
37
- require 'csv'
38
36
  return Proc.new do |instance_html|
39
37
  html = ::Nokogiri::HTML(instance_html)
40
38
  html.send(selector_method, list_selector).map{|list_element| list_element.text }
41
39
  end
42
40
  end
41
+
42
+ ##
43
+ # Takes :_href and resolves it to an absolute URL according to
44
+ # the supplied :_page_url. They can be either Strings or URI
45
+ # instances.
46
+ #
47
+ # raises ArgumentError if either href or page_url is nil
48
+ # raises ArgumentError if page_url is not absolute
49
+ #
50
+ # returns: a String with absolute URL
51
+ def self.resolve_url(_href, _page_url)
52
+
53
+ page_url = URI(_page_url).dup
54
+ raise ArgumentError, "#{page_url} must be absolute" unless page_url.absolute?
55
+
56
+ href = URI(_href).dup
57
+
58
+ # return :href if :href is already absolute
59
+ return href.to_s if href.absolute?
60
+
61
+
62
+ # TODO: There may be edge cases worth considering
63
+ # but this should handle the following non-absolute href possibilities:
64
+ # //anothersite.com (keeps scheme, too!)
65
+ # /root/dir
66
+ # relative/dir
67
+ # ?query=2
68
+ # #bang
69
+
70
+ URI.join(page_url, href).to_s
71
+ end
72
+
43
73
  end
44
74
  end
File without changes
File without changes
@@ -0,0 +1,17 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Document</title>
6
+ </head>
7
+ <body>
8
+ <!-- refactored fixture for relative URL testing -->
9
+
10
+ <h2><a href="iamnottobeselected.html" class="title-link">An unnecessary proof of concept but just for kicks</a></h2>
11
+
12
+ <section id="river">
13
+ <h1><a href="prosecutor.html" class="title-link">A Prosecutor, a Wrongful Conviction and a Question of Justice</a></h1>
14
+ </section>
15
+
16
+ </body>
17
+ </html>
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,118 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rack'
4
+ require 'thin'
5
+ require 'nokogiri'
6
+ require 'restclient'
7
+ require 'fileutils'
8
+ require './lib/upton'
9
+
10
+ describe Upton do
11
+ before :all do
12
+ #start the server
13
+ class Server
14
+ def call(env)
15
+ @root = File.expand_path(File.dirname(__FILE__))
16
+ path = Rack::Utils.unescape(env['PATH_INFO'])
17
+ path += 'index.html' if path == '/'
18
+ file = File.join(@root, "data", path)
19
+
20
+ params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
21
+
22
+ if File.exists?(file)
23
+ [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
24
+ else
25
+ [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
26
+ end
27
+ end
28
+ end
29
+
30
+ def start_test_server
31
+ @server_thread = Thread.new do
32
+ Rack::Handler::Thin.run ::Server.new, :Port => 9876
33
+ end
34
+ sleep(1) # wait a sec for the server to be booted
35
+ end
36
+
37
+ start_test_server()
38
+
39
+ @headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
40
+ "",
41
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
42
+ "Six Facts Lost in the IRS Scandal"]
43
+ @most_commented_heds = [["Six Facts Lost in the IRS Scandal",
44
+ "How the IRS’s Nonprofit Division Got So Dysfunctional",
45
+ "Sound, Fury and the IRS Mess",
46
+ "The Most Important #Muckreads on Rape in the Military",
47
+ "Congressmen to Hagel: Where Are the Missing War Records?",
48
+ "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
49
+ "A Prosecutor, a Wrongful Conviction and a Question of Justice",
50
+ "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
51
+ "The Story Behind Our Hospital Interactive",
52
+ "irs-test-charts-for-embedding"]]
53
+ @east_timor_prime_ministers = [[
54
+ ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
55
+ "1", "2", "3", "4",],
56
+ [],
57
+ ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
58
+ ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
59
+ ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
60
+ ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
61
+ ]]
62
+ end
63
+
64
+ it "should scrape in the basic case" do
65
+ propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
66
+ propubscraper.debug = true
67
+ propubscraper.verbose = true
68
+
69
+ heds = propubscraper.scrape do |article_str|
70
+ doc = Nokogiri::HTML(article_str)
71
+ hed = doc.css('h1.article-title').text
72
+ end
73
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
74
+ heds.should eql @headlines
75
+ end
76
+
77
+ it 'should properly handle relative urls' do
78
+ # uses a modified page from the previous test in which the target
79
+ # href, http://127.0.0.1:9876/prosecutor.html, has been changed
80
+ # to a relative url
81
+ #
82
+ # Note: this test is a bit quirky, because it passes on the fact that
83
+ # the resolve_url creates a url identical to one that is already stashed ("prosecutor.html").
84
+ # So it works, but because of a coupling to how Upton handles caching in the file system
85
+
86
+ propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica-relative.html", "section#river h1 a", :css)
87
+ propubscraper.debug = true
88
+ propubscraper.verbose = true
89
+
90
+ heds = propubscraper.scrape do |article_str|
91
+ doc = Nokogiri::HTML(article_str)
92
+ hed = doc.css('h1.article-title').text
93
+ end
94
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
95
+ heds.should eql ["A Prosecutor, a Wrongful Conviction and a Question of Justice"]
96
+ end
97
+
98
+ it "should scrape a list properly with the list helper" do
99
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
100
+ propubscraper.debug = true
101
+ propubscraper.verbose = true
102
+ list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
103
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
104
+ list.should eql @most_commented_heds
105
+ end
106
+
107
+ it "should scrape a table properly with the table helper" do
108
+ propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
109
+ propubscraper.debug = true
110
+ propubscraper.verbose = true
111
+ table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
112
+ FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
113
+ table.should eql @east_timor_prime_ministers
114
+ end
115
+
116
+ it "should test saving files with the right encoding"
117
+ it "should test stashing to make sure pages are stashed at the right times, but not at the wrong ones"
118
+ end
metadata CHANGED
@@ -1,69 +1,83 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upton
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jeremy B. Merrill
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-22 00:00:00.000000000 Z
11
+ date: 2013-08-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rack
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ! '>='
17
+ - - '>='
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ! '>='
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rspec
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
25
39
  - !ruby/object:Gem::Version
26
40
  version: '0'
27
41
  - !ruby/object:Gem::Dependency
28
42
  name: thin
29
43
  requirement: !ruby/object:Gem::Requirement
30
44
  requirements:
31
- - - ! '>='
45
+ - - '>='
32
46
  - !ruby/object:Gem::Version
33
47
  version: '0'
34
48
  type: :development
35
49
  prerelease: false
36
50
  version_requirements: !ruby/object:Gem::Requirement
37
51
  requirements:
38
- - - ! '>='
52
+ - - '>='
39
53
  - !ruby/object:Gem::Version
40
54
  version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: nokogiri
43
57
  requirement: !ruby/object:Gem::Requirement
44
58
  requirements:
45
- - - ! '>='
59
+ - - '>='
46
60
  - !ruby/object:Gem::Version
47
61
  version: '0'
48
62
  type: :development
49
63
  prerelease: false
50
64
  version_requirements: !ruby/object:Gem::Requirement
51
65
  requirements:
52
- - - ! '>='
66
+ - - '>='
53
67
  - !ruby/object:Gem::Version
54
68
  version: '0'
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: yard
57
71
  requirement: !ruby/object:Gem::Requirement
58
72
  requirements:
59
- - - ! '>='
73
+ - - '>='
60
74
  - !ruby/object:Gem::Version
61
75
  version: '0'
62
76
  type: :development
63
77
  prerelease: false
64
78
  version_requirements: !ruby/object:Gem::Requirement
65
79
  requirements:
66
- - - ! '>='
80
+ - - '>='
67
81
  - !ruby/object:Gem::Version
68
82
  version: '0'
69
83
  - !ruby/object:Gem::Dependency
@@ -84,14 +98,28 @@ dependencies:
84
98
  name: nokogiri
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - ! '>='
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: mechanize
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - '>='
88
116
  - !ruby/object:Gem::Version
89
117
  version: '0'
90
118
  type: :runtime
91
119
  prerelease: false
92
120
  version_requirements: !ruby/object:Gem::Requirement
93
121
  requirements:
94
- - - ! '>='
122
+ - - '>='
95
123
  - !ruby/object:Gem::Version
96
124
  version: '0'
97
125
  description: Don't re-write web scrapers every time. Upton gives you a scraper template
@@ -103,13 +131,14 @@ extra_rdoc_files: []
103
131
  files:
104
132
  - lib/upton.rb
105
133
  - lib/utils.rb
106
- - test/data/discussion.html
107
- - test/data/easttimor.html
108
- - test/data/propublica.html
109
- - test/data/prosecutor.html
110
- - test/data/sixfacts.html
111
- - test/data/webinar.html
112
- - test/test_upton.rb
134
+ - spec/data/webinar.html
135
+ - spec/data/propublica-relative.html
136
+ - spec/data/propublica.html
137
+ - spec/data/prosecutor.html
138
+ - spec/data/sixfacts.html
139
+ - spec/data/discussion.html
140
+ - spec/data/easttimor.html
141
+ - spec/upton_spec.rb
113
142
  homepage: http://github.org/propublica/upton
114
143
  licenses:
115
144
  - MIT
@@ -120,26 +149,27 @@ require_paths:
120
149
  - lib
121
150
  required_ruby_version: !ruby/object:Gem::Requirement
122
151
  requirements:
123
- - - ! '>='
152
+ - - '>='
124
153
  - !ruby/object:Gem::Version
125
154
  version: 1.8.7
126
155
  required_rubygems_version: !ruby/object:Gem::Requirement
127
156
  requirements:
128
- - - ! '>='
157
+ - - '>='
129
158
  - !ruby/object:Gem::Version
130
159
  version: '0'
131
160
  requirements: []
132
161
  rubyforge_project:
133
- rubygems_version: 2.0.5
162
+ rubygems_version: 2.0.2
134
163
  signing_key:
135
164
  specification_version: 4
136
165
  summary: A simple web-scraping framework
137
166
  test_files:
138
- - test/data/discussion.html
139
- - test/data/easttimor.html
140
- - test/data/propublica.html
141
- - test/data/prosecutor.html
142
- - test/data/sixfacts.html
143
- - test/data/webinar.html
144
- - test/test_upton.rb
167
+ - spec/data/webinar.html
168
+ - spec/data/propublica-relative.html
169
+ - spec/data/propublica.html
170
+ - spec/data/prosecutor.html
171
+ - spec/data/sixfacts.html
172
+ - spec/data/discussion.html
173
+ - spec/data/easttimor.html
174
+ - spec/upton_spec.rb
145
175
  has_rdoc: true
data/test/test_upton.rb DELETED
@@ -1,141 +0,0 @@
1
- # encoding: UTF-8
2
-
3
- require 'test/unit'
4
- require 'rack'
5
- require 'thin'
6
- require 'nokogiri'
7
- require 'restclient'
8
- require './lib/upton'
9
- require 'fileutils'
10
-
11
- module Upton
12
- module Test
13
-
14
- # class ProPublicaScraper < Upton::Scraper
15
- # def initialize(a, b, c)
16
- # super
17
- # @verbose = false
18
- # @debug = false
19
- # @stash_folder = "test_stashes"
20
- # end
21
- # end
22
-
23
-
24
- class UptonTest < ::Test::Unit::TestCase
25
-
26
- # def test_get_page
27
- #TODO
28
- # end
29
-
30
- # def test_stash
31
- #TODO
32
- # end
33
-
34
- def test_scrape
35
- #this doesn't test stashing.
36
- start_test_server()
37
-
38
- headlines = ["Webinar: How to Use Prescriber Checkup to Power Your Reporting",
39
- "",
40
- "A Prosecutor, a Wrongful Conviction and a Question of Justice",
41
- "Six Facts Lost in the IRS Scandal"]
42
-
43
- propubscraper = Upton::Scraper.new("http://127.0.0.1:9876/propublica.html", "section#river section h1 a", :css)
44
- propubscraper.debug = true
45
- propubscraper.verbose = true
46
-
47
- heds = propubscraper.scrape do |article_str|
48
- doc = Nokogiri::HTML(article_str)
49
- hed = doc.css('h1.article-title').text
50
- end
51
- assert_equal(heds, headlines)
52
- FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
53
- end
54
-
55
- def test_encodings
56
- skip "should test getting pages, switching their encoding to UTF-8, saving them as UTF-8, reading them as UTF-8"
57
- end
58
-
59
- def test_stashing
60
- skip "should test stashing, make sure we never send too many requests"
61
- end
62
-
63
- def test_scrape_list
64
- #this doesn't test stashing.
65
- #TODO: needs a website that has links to a multi-page list (or table)
66
- start_test_server()
67
-
68
- most_commented_heds = [["Six Facts Lost in the IRS Scandal",
69
- "How the IRS’s Nonprofit Division Got So Dysfunctional",
70
- "Sound, Fury and the IRS Mess",
71
- "The Most Important #Muckreads on Rape in the Military",
72
- "Congressmen to Hagel: Where Are the Missing War Records?",
73
- "As Need for New Flood Maps Rises, Congress and Obama Cut Funding",
74
- "A Prosecutor, a Wrongful Conviction and a Question of Justice",
75
- "A Prolonged Stay: The Reasons Behind the Slow Pace of Executions",
76
- "The Story Behind Our Hospital Interactive",
77
- "irs-test-charts-for-embedding"]]
78
-
79
- propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/propublica.html"])
80
- propubscraper.debug = true
81
- propubscraper.verbose = true
82
- list = propubscraper.scrape(&Upton::Utils.list("#jamb.wNarrow #most-commented li a", :css))
83
-
84
- assert_equal(list, most_commented_heds)
85
- FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
86
- end
87
-
88
- def test_scrape_table
89
- #this doesn't test stashing.
90
- start_test_server()
91
-
92
- east_timor_prime_ministers = [[
93
- ["#", "Portrait", "Name(Birth–Death)", "Term of Office", "Party",
94
- "1", "2", "3", "4",],
95
- [],
96
- ["", "Mari Alkatiri(b. 1949)", "20 May 2002", "26 June 2006[1]", "FRETILIN"],
97
- ["", "José Ramos-Horta(b. 1949)", "26 June 2006", "19 May 2007", "Independent"],
98
- ["", "Estanislau da Silva(b. 1952)", "19 May 2007", "8 August 2007", "FRETILIN"],
99
- ["", "Xanana Gusmão(b. 1946)", "8 August 2007", "Incumbent", "CNRT"],
100
- ]]
101
-
102
- propubscraper = Upton::Scraper.new(["http://127.0.0.1:9876/easttimor.html"])
103
- propubscraper.debug = true
104
- propubscraper.verbose = true
105
- table = propubscraper.scrape(&Upton::Utils.table('//table[contains(concat(" ", normalize-space(@class), " "), " wikitable ")][2]'))
106
- assert_equal(table, east_timor_prime_ministers)
107
- FileUtils.rm_r("test_stashes") if Dir.exists?("test_stashes")
108
- end
109
-
110
-
111
-
112
- private
113
- def start_test_server
114
- @server_thread = Thread.new do
115
- Rack::Handler::Thin.run Upton::Test::Server.new, :Port => 9876
116
- end
117
- sleep(1) # wait a sec for the server to be booted
118
- end
119
- end
120
-
121
-
122
-
123
- # via http://stackoverflow.com/questions/10166611/launching-a-web-server-inside-ruby-tests
124
- class Server
125
- def call(env)
126
- @root = File.expand_path(File.dirname(__FILE__))
127
- path = Rack::Utils.unescape(env['PATH_INFO'])
128
- path += 'index.html' if path == '/'
129
- file = File.join(@root, "data", path)
130
-
131
- params = Rack::Utils.parse_nested_query(env['QUERY_STRING'])
132
-
133
- if File.exists?(file)
134
- [ 200, {"Content-Type" => "text/html; charset=utf-8"}, File.read(file) ]
135
- else
136
- [ 404, {'Content-Type' => 'text/plain'}, 'file not found' ]
137
- end
138
- end
139
- end
140
- end
141
- end