upton 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
-  data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
+  metadata.gz: 0bc8fddf34dc974bde7491e7dd311eb09b5d393e
+  data.tar.gz: b8a8010408cd715b010406163cd14e45045af2d6
 SHA512:
-  metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
-  data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
+  metadata.gz: 0c5cdda936dcaf7a045afbc6cb317fc463191823a13d585732717f6ddfb3d4970a94c51df0324a343022c938702ca8b0fdbbf9e8b54fb0cc5fafec1dd8af8276
+  data.tar.gz: e5f2bd0c9f9ba843607b0ac7816c84df21cc6acbb0de13ec5918e3edb866fa41d7e6e9b39d4d0af7ea74c0ebf4628240ee783612edc3370842f860039ccc6465
lib/upton.rb CHANGED
@@ -3,55 +3,56 @@
 require 'nokogiri'
 require 'uri'
 require 'restclient'
-require_relative './utils'
+require_relative 'upton/utils'
+require_relative 'upton/downloader'
 
 ##
 # This module contains a scraper called Upton
 ##
 module Upton
   ##
-  # *Upton* is a framework for easy web-scraping with a useful debug mode
-  # that doesn't hammer your target's servers. It does the repetitive parts of
+  # *Upton* is a framework for easy web-scraping with a useful debug mode
+  # that doesn't hammer your target's servers. It does the repetitive parts of
   # writing scrapers, so you only have to write the unique parts for each site.
   #
   # Upton operates on the theory that, for most scraping projects, you need to
   # scrape two types of pages:
-  #
-  # 1. Index pages, which list instance pages. For example, a job search
+  #
+  # 1. Index pages, which list instance pages. For example, a job search
   #    site's search page or a newspaper's homepage.
   # 2. Instance pages, which represent the goal of your scraping, e.g.
   #    job listings or news articles.
   #
   # Upton::Scraper can be used as-is for basic use-cases by:
-  # 1. specifying the pages to be scraped in `new` as an index page
+  # 1. specifying the pages to be scraped in `new` as an index page
   #    or as an Array of URLs.
-  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
+  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-built
   #    block from Upton::Utils.
-  # For more complicated cases; subclass Upton::Scraper
+  # For more complicated cases, subclass Upton::Scraper,
   # e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
   class Scraper
+    EMPTY_STRING = ''
 
-    attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+                  :paginated, :pagination_param, :pagination_max_pages
 
     ##
     # This is the main user-facing method for a basic scraper.
-    # Call +scrape+ with a block; this block will be called on
+    # Call +scrape+ with a block; this block will be called on
     # the text of each instance page (and optionally, its URL and its index
     # in the list of instance URLs returned by +get_index+).
     ##
-    def scrape &blk
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+    def scrape(&blk)
+      self.url_array = self.get_index unless self.url_array
       self.scrape_from_list(self.url_array, blk)
     end
 
     ##
     # +index_url_or_array+: A list of string URLs, OR
     #   the URL of the page containing the list of instances.
-    # +selector+: The XPath expression or CSS selector that specifies the
-    #   anchor elements within the page, if a url is specified for
+    # +selector+: The XPath expression or CSS selector that specifies the
+    #   anchor elements within the page, if a url is specified for
     #   the previous argument.
     # +selector_method+: Deprecated and ignored. Next breaking release will
     #   remove this option.
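For orientation, here is a minimal sketch of the basic use-case the comments above describe; the index URL and CSS selector are illustrative, not taken from this diff:

    require 'upton'
    require 'nokogiri'

    # An index page listing links to articles; the selector picks the anchors.
    scraper = Upton::Scraper.new("http://www.example.com/news", "a.headline")

    # The block receives each instance page's HTML (and optionally its URL
    # and index); #scrape collects the block's return values.
    headlines = scraper.scrape do |html, url, index|
      Nokogiri::HTML(html).css("h1").text
    end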
@@ -68,7 +69,7 @@ module Upton
     # the String passed is of CSS/XPath notation
 
     def initialize(index_url_or_array, selector="", selector_method=:deprecated)
-
+
       #if first arg is a valid URL, do already-written stuff;
       #if it's not (or if it's a list?) don't bother with get_index, etc.
       #e.g. Scraper.new(["http://jeremybmerrill.com"])
@@ -80,6 +81,7 @@ module Upton
         @index_url = index_url_or_array
         @index_selector = selector
       end
+
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
       @verbose = false
@@ -89,26 +91,32 @@ module Upton
       # version.
       # You may want to set @debug to false for production (but maybe not).
       # You can also control stashing behavior on a per-call basis with the
-      # optional second argument to get_page, if, for instance, you want to
+      # optional second argument to get_page, if, for instance, you want to
       # stash certain instance pages, e.g. based on their modification date.
       @debug = true
       # Index debug does the same, but for index pages.
       @index_debug = false
 
-      # In order to not hammer servers, Upton waits for, by default, 30
+      # In order to not hammer servers, Upton waits for, by default, 30
       # seconds between requests to the remote server.
       @sleep_time_between_requests = 30 #seconds
 
+      # If true, then Upton will attempt to scrape paginated index pages
+      @paginated = false
+      # Default query string parameter used to specify the current page
+      @pagination_param = 'page'
+      # Default number of paginated pages to scrape
+      @pagination_max_pages = 2
+
+
       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
       @stash_folder ||= "stashes"
-      unless Dir.exists?(@stash_folder)
-        FileUtils.mkdir_p(@stash_folder)
-      end
+      FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
     end
 
     ##
-    # If instance pages are paginated, <b>you must override</b>
+    # If instance pages are paginated, <b>you must override</b>
     # this method to return the next URL, given the current URL and its index.
     #
     # If instance pages aren't paginated, there's no need to override this.
@@ -119,22 +127,42 @@ module Upton
     # e.g. +next_instance_page_url("http://whatever.com/article/upton-sinclairs-the-jungle?page=1", 2)+
     # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
     ##
     def next_instance_page_url(url, pagination_index)
-      ""
+      EMPTY_STRING
     end
 
     ##
-    # If index pages are paginated, <b>you must override</b>
-    # this method to return the next URL, given the current URL and its index.
-    #
-    # If index pages aren't paginated, there's no need to override this.
+    # Return the next URL to scrape, given the current URL and its index.
     #
     # Recursion stops if the fetching URL returns an empty string or an error.
     #
-    # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
+    # If @paginated is not set (the default), this method returns an empty string.
+    #
+    # If @paginated is set, this method will return the next pagination URL
+    # to scrape using @pagination_param and the pagination_index.
+    #
+    # If the pagination_index is greater than @pagination_max_pages, then the
+    # method will return an empty string.
+    #
+    # Override this method to handle pagination in an alternative way,
+    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
     # ought to return "http://whatever.com/articles?page=2"
+    #
     ##
     def next_index_page_url(url, pagination_index)
-      ""
+      return EMPTY_STRING unless @paginated
+
+      if pagination_index > @pagination_max_pages
+        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+        EMPTY_STRING
+      else
+        uri = URI.parse(url)
+        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+        # update the pagination query string parameter
+        query[@pagination_param] = pagination_index
+        uri.query = URI.encode_www_form(query)
+        puts "Next index pagination url is #{uri}" if @verbose
+        uri.to_s
+      end
     end
 
     ##
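A sketch of how the new built-in index pagination might be configured, using the accessors and defaults introduced above (the index URL and selector are hypothetical):

    scraper = Upton::Scraper.new("http://whatever.com/articles", "a.article-link")
    scraper.paginated = true            # defaults to false
    scraper.pagination_param = 'page'   # the default query-string parameter
    scraper.pagination_max_pages = 5    # the default is 2

    # With @paginated set, next_index_page_url("http://whatever.com/articles?page=1", 2)
    # returns "http://whatever.com/articles?page=2"; past page 5 it returns "".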
@@ -142,13 +170,10 @@ module Upton
     ##
     def scrape_to_csv filename, &blk
       require 'csv'
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+      self.url_array = self.get_index unless self.url_array
       CSV.open filename, 'wb' do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          puts document.inspect
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
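Per the branching above, a block that returns a list of rows writes one CSV row per inner list, while a flat list becomes a single row. A usage sketch (the filename, selector, and fields are illustrative):

    scraper.scrape_to_csv("output.csv") do |html, url, index|
      # One row per listing on the page: [[title, url], [title, url], ...]
      Nokogiri::HTML(html).css("h2.listing").map { |h| [h.text, url] }
    end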
@@ -161,13 +186,10 @@ module Upton
 
     def scrape_to_tsv filename, &blk
       require 'csv'
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+      self.url_array = self.get_index unless self.url_array
       CSV.open filename, 'wb', :col_sep => "\t" do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          puts document.inspect
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
@@ -181,70 +203,20 @@ module Upton
     protected
 
     ##
-    # Actually fetches the page
-    ##
-    def fetch_page(url, options={})
-      RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
-    end
-
-    ##
-    # Handles getting pages with RestClient or getting them from the local stash.
-    #
-    # Uses a kludge (because rest-client is outdated) to handle encoding.
+    # Handles getting pages with Downloader, which handles stashing.
     ##
     def get_page(url, stash=false, options={})
-      return "" if url.empty?
-
-      #the filename for each stashed version is a cleaned version of the URL.
-      if stash && File.exists?( url_to_filename(url, options) )
-        puts "usin' a stashed copy of " + url if @verbose
-        resp = open( url_to_filename(url, options), 'r:UTF-8').read.encode("UTF-8", :invalid => :replace, :undef => :replace )
-      else
-        begin
-          puts "getting " + url if @verbose
-          sleep @sleep_time_between_requests
-          resp = fetch_page(url, options)
-
-          #this is silly, but rest-client needs to get on their game.
-          #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
-          if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
-            charset = if set = resp.net_http_res.type_params['charset']
-              set
-            elsif content_type == 'text/xml'
-              'us-ascii'
-            elsif content_type.split('/').first == 'text'
-              'iso-8859-1'
-            end
-            resp.force_encoding(charset) if charset
-          end
-
-        rescue RestClient::ResourceNotFound
-          puts "404 error, skipping: #{url}" if @verbose
-          resp = ""
-        rescue RestClient::InternalServerError
-          puts "500 Error, skipping: #{url}" if @verbose
-          resp = ""
-        rescue URI::InvalidURIError
-          puts "Invalid URI: #{url}" if @verbose
-          resp = ""
-        rescue RestClient::RequestTimeout
-          "Timeout: #{url}" if @verbose
-          retry
-        end
-        if stash
-          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
-          open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
-        end
+      return EMPTY_STRING if url.empty?
+      resp_and_cache = Downloader.new(url, {:cache => stash, :verbose => @verbose}.merge(options)).get
+      if resp_and_cache[:from_resource]
+        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+        sleep @sleep_time_between_requests
       end
-      resp
+      resp_and_cache[:resp]
     end
 
-    def url_to_filename(url, options={})
-      File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
-    end
 
-
-    ##
+    ##
     # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
     # resolve_url resolves them to absolute urls.
     # absolute_url_str must be a URL, as a string, that is absolute.
@@ -258,7 +230,7 @@ module Upton
       return href.to_s if href.absolute?
 
       #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url, href).to_s
+      URI.join(absolute_url, href).to_s
     end
 
     ##
@@ -272,7 +244,7 @@ module Upton
     end
 
     ##
-    # Using the XPath expression or CSS selector and selector_method that
+    # Using the XPath expression or CSS selector and selector_method that
     # uniquely identifies the links in the index, return those links as strings. ##
     def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
       # for now, override selector_method with :search, which will work with either CSS or XPath
@@ -285,20 +257,18 @@ module Upton
     # Does @index_url stay unaltered for the lifetime of the Upton instance?
     # It seems to at this point, but that may be something that gets
     # deprecated later
-    #
-    # So for now, @index_url is used in conjunction with resolve_url
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
     # to make sure that this method returns absolute urls
     # i.e. this method expects @index_url to always have an absolute address
     # for the lifetime of an Upton instance
     def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
       # for now, override selector_method with :search, which will work with either CSS or XPath
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
         href = a_element["href"]
-        u = resolve_url( href, @index_url) unless href.nil?
-        unless u == href
-          puts "resolved #{href} to #{u}"
-        end
-        u
+        resolved_url = resolve_url( href, @index_url) unless href.nil?
+        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+        resolved_url
       end
     end
 
@@ -309,13 +279,13 @@ module Upton
     ##
     def get_index_pages(url, pagination_index, options={})
       resp = self.get_page(url, @index_debug, options)
-      if !resp.empty?
+      unless resp.empty?
         next_url = self.next_index_page_url(url, pagination_index + 1)
         # resolve to absolute url
         #
         next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
+          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
           resp += next_resp
         end
       end
@@ -324,20 +294,20 @@ module Upton
 
     ##
     # Returns the instance at `url`.
-    #
+    #
     # If the page is stashed, returns that, otherwise, fetches it from the web.
     #
-    # If an instance is paginated, returns the concatenated output of each
+    # If an instance is paginated, returns the concatenated output of each
     # page, e.g. if a news article has two pages.
     ##
     def get_instance(url, pagination_index=0, options={})
       resp = self.get_page(url, @debug, options)
-      if !resp.empty?
+      if !resp.empty?
         next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
-
-        # next_url = resolve_url(next_url, url)
+
+        #next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
+          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
           resp += next_resp
         end
       end
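Instance-page pagination still requires an override, as the doc comment for next_instance_page_url says; only index pagination gained built-in support in this release. A minimal sketch of such a subclass, using the hypothetical URL scheme from the comments:

    class MyScraper < Upton::Scraper
      # Follow ?page=2, ?page=3, ... and stop by returning an empty string.
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 5
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end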
lib/upton/downloader.rb ADDED
@@ -0,0 +1,127 @@
+require "fileutils"
+require "open-uri"
+require "tmpdir"
+require "restclient"
+require "digest/md5"
+
+module Upton
+
+  # This class is used internally to download and cache the webpages
+  # that are requested.
+  #
+  # By default, the cache location is the output of `Dir.tmpdir`/upton.
+  # `Dir.tmpdir` returns the operating system's temporary directory.
+  # By default, the stashed files have a non-human-readable md5-based filename.
+  # If the `:readable_filenames` option is true, they will have human-readable names.
+  class Downloader
+
+    MAX_FILENAME_LENGTH = 130 # for unixes, win xp+
+    EMPTY_STRING = ''
+
+    attr_reader :uri, :cache_location, :verbose
+    def initialize(uri, options = {})
+      @uri = uri
+      @cache = options.fetch(:cache) { true }
+      @cache_location = File.absolute_path(options[:cache_location] || "#{Dir.tmpdir}/upton")
+      @verbose = options[:verbose] || false
+      @readable_stash_filenames = options[:readable_filenames] || false
+      initialize_cache!
+    end
+
+    def get
+      if cache_enabled?
+        puts "Stashing enabled. Will try reading #{uri} data from cache." if @verbose
+        download_from_cache!
+      else
+        puts "Stashing disabled. Will download from the internet." if @verbose
+        from_resource = true
+        resp = download_from_resource!
+        {:resp => resp, :from_resource => from_resource }
+      end
+    end
+
+    private
+
+    def download_from_resource!
+      begin
+        puts "Downloading from #{uri}" if @verbose
+        resp = RestClient.get(uri)
+        puts "Downloaded #{uri}" if @verbose
+      rescue RestClient::ResourceNotFound
+        puts "404 error, skipping: #{uri}" if @verbose
+      rescue RestClient::InternalServerError
+        puts "500 Error, skipping: #{uri}" if @verbose
+      rescue URI::InvalidURIError
+        puts "Invalid URI: #{uri}" if @verbose
+      rescue RestClient::RequestTimeout
+        puts "Timeout: #{uri}" if @verbose
+        retry
+      end
+      resp ||= EMPTY_STRING
+    end
+
+    def download_from_cache!
+      resp = if cached_file_exists?
+        puts "Cache of #{uri} available" if @verbose
+        from_resource = false
+        open(cached_file).read
+      else
+        if @verbose
+          if @readable_stash_filenames
+            puts "Cache of #{uri} unavailable at #{filename_from_uri}. Will download from the internet"
+          else
+            puts "Cache of #{uri} unavailable. Will download from the internet"
+          end
+        end
+        from_resource = true # this response came over the network, so get_page can rate-limit
+        download_from_resource!
+      end
+      unless cached_file_exists?
+        if @verbose
+          if @readable_stash_filenames
+            puts "Writing #{uri} data to the cache at #{cached_file}"
+          else
+            puts "Writing #{uri} data to the cache"
+          end
+        end
+        File.write(cached_file, resp)
+      end
+      {:resp => resp, :from_resource => from_resource }
+    end
+
+    def cache_enabled?
+      !!@cache
+    end
+
+    def filename_from_uri
+      @readable_stash_filenames ? readable_filename_from_uri : hashed_filename_from_uri
+    end
+
+    def hashed_filename_from_uri
+      Digest::MD5.hexdigest(uri)
+    end
+
+    def readable_filename_from_uri
+      html = "html"
+      clean_url_max_length = MAX_FILENAME_LENGTH - html.length - cache_location.size
+      clean_url = uri.gsub(/[^A-Za-z0-9\-_]/, "")[0...clean_url_max_length]
+      "#{clean_url}.#{html}"
+    end
+
+    def cached_file
+      "#{cache_location}/#{filename_from_uri}"
+    end
+
+    def cached_file_exists?
+      File.exists?(cached_file)
+    end
+
+    def initialize_cache!
+      unless Dir.exists?(cache_location)
+        Dir.mkdir(cache_location)
+        FileUtils.chmod 0700, cache_location
+      end
+    end
+
+  end
+end
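Although Downloader is described as internal, its get method's contract is visible above: it returns a hash with :resp (the response body, or an empty string on failure) and :from_resource (whether the network was actually hit). A quick sketch of exercising it directly, with an illustrative URL:

    dl = Upton::Downloader.new("http://www.example.com/", :cache => true, :verbose => true)
    result = dl.get
    result[:resp]          # page HTML, read from the cache when present
    result[:from_resource] # true only when it was fetched over the network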