upton 0.2.7 → 0.2.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 2eb19ce88f56ef55d8c32d6c16e7c777ce3f44e6
- data.tar.gz: 1b86794f51292a30b310ceffa1f36a85144af3e5
+ metadata.gz: 0bc8fddf34dc974bde7491e7dd311eb09b5d393e
+ data.tar.gz: b8a8010408cd715b010406163cd14e45045af2d6
  SHA512:
- metadata.gz: f49d0d404cea0d07038a6b3394d9f000332045901d121fc1065c85da945dd1e372f1cb6f87f7cd8738250f5f7d35183729bd0d8cdd8c89872cdd8e1333225a6e
- data.tar.gz: 70351047072d55ac1518b40b4a9a04c3287702465631527e41c6367559ddc39880ea0384f3e8c16a785ca31cbaff6ce129ed6d0a5f00ac633f6d29c86e2e613a
+ metadata.gz: 0c5cdda936dcaf7a045afbc6cb317fc463191823a13d585732717f6ddfb3d4970a94c51df0324a343022c938702ca8b0fdbbf9e8b54fb0cc5fafec1dd8af8276
+ data.tar.gz: e5f2bd0c9f9ba843607b0ac7816c84df21cc6acbb0de13ec5918e3edb866fa41d7e6e9b39d4d0af7ea74c0ebf4628240ee783612edc3370842f860039ccc6465
lib/upton.rb CHANGED
@@ -3,55 +3,56 @@
  require 'nokogiri'
  require 'uri'
  require 'restclient'
- require_relative './utils'
+ require_relative 'upton/utils'
+ require_relative 'upton/downloader'
 
  ##
  # This module contains a scraper called Upton
  ##
  module Upton
  ##
- # *Upton* is a framework for easy web-scraping with a useful debug mode
- # that doesn't hammer your target's servers. It does the repetitive parts of
+ # *Upton* is a framework for easy web-scraping with a useful debug mode
+ # that doesn't hammer your target's servers. It does the repetitive parts of
  # writing scrapers, so you only have to write the unique parts for each site.
  #
  # Upton operates on the theory that, for most scraping projects, you need to
  # scrape two types of pages:
- #
- # 1. Index pages, which list instance pages. For example, a job search
+ #
+ # 1. Index pages, which list instance pages. For example, a job search
  # site's search page or a newspaper's homepage.
  # 2. Instance pages, which represent the goal of your scraping, e.g.
  # job listings or news articles.
  #
  # Upton::Scraper can be used as-is for basic use-cases by:
- # 1. specifying the pages to be scraped in `new` as an index page
+ # 1. specifying the pages to be scraped in `new` as an index page
  # or as an Array of URLs.
- # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
+ # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
  # block from Upton::Utils.
- # For more complicated cases; subclass Upton::Scraper
+ # For more complicated cases; subclass Upton::Scraper
  # e.g. +MyScraper < Upton::Scraper+ and override various methods.
  ##
  class Scraper
+ EMPTY_STRING = ''
 
- attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
+ attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+ :paginated, :pagination_param, :pagination_max_pages
 
  ##
  # This is the main user-facing method for a basic scraper.
- # Call +scrape+ with a block; this block will be called on
+ # Call +scrape+ with a block; this block will be called on
  # the text of each instance page, (and optionally, its URL and its index
  # in the list of instance URLs returned by +get_index+).
  ##
- def scrape &blk
- unless self.url_array
- self.url_array = self.get_index
- end
+ def scrape(&blk)
+ self.url_array = self.get_index unless self.url_array
  self.scrape_from_list(self.url_array, blk)
  end
 
  ##
  # +index_url_or_array+: A list of string URLs, OR
  # the URL of the page containing the list of instances.
- # +selector+: The XPath expression or CSS selector that specifies the
- # anchor elements within the page, if a url is specified for
+ # +selector+: The XPath expression or CSS selector that specifies the
+ # anchor elements within the page, if a url is specified for
  # the previous argument.
  # +selector_method+: Deprecated and ignored. Next breaking release will
  # remove this option.x
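For context, here is a minimal sketch of the basic usage the class comment above describes. The URLs and the selector are illustrative only, not part of the gem:

    require 'upton'
    require 'nokogiri'

    # Either point the scraper at an index page plus a CSS/XPath selector for instance links...
    scraper = Upton::Scraper.new("http://example.com/jobs", "a.job-link")
    # ...or hand it the instance URLs directly as an Array.
    # scraper = Upton::Scraper.new(["http://example.com/jobs/1", "http://example.com/jobs/2"])

    # The block receives each instance page's HTML (and, optionally, its URL and index).
    titles = scraper.scrape do |html, url, index|
      Nokogiri::HTML(html).css("h1").text
    end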
@@ -68,7 +69,7 @@ module Upton
  # the String passed is of CSS/XPath notation
 
  def initialize(index_url_or_array, selector="", selector_method=:deprecated)
-
+
  #if first arg is a valid URL, do already-written stuff;
  #if it's not (or if it's a list?) don't bother with get_index, etc.
  #e.g. Scraper.new(["http://jeremybmerrill.com"])
@@ -80,6 +81,7 @@ module Upton
  @index_url = index_url_or_array
  @index_selector = selector
  end
+
  # If true, then Upton prints information about when it gets
  # files from the internet and when it gets them from its stash.
  @verbose = false
@@ -89,26 +91,32 @@ module Upton
  # version.
  # You may want to set @debug to false for production (but maybe not).
  # You can also control stashing behavior on a per-call basis with the
- # optional second argument to get_page, if, for instance, you want to
+ # optional second argument to get_page, if, for instance, you want to
  # stash certain instance pages, e.g. based on their modification date.
  @debug = true
  # Index debug does the same, but for index pages.
  @index_debug = false
 
- # In order to not hammer servers, Upton waits for, by default, 30
+ # In order to not hammer servers, Upton waits for, by default, 30
  # seconds between requests to the remote server.
  @sleep_time_between_requests = 30 #seconds
 
+ # If true, then Upton will attempt to scrape paginated index pages
+ @paginated = false
+ # Default query string parameter used to specify the current page
+ @pagination_param = 'page'
+ # Default number of paginated pages to scrape
+ @pagination_max_pages = 2
+
+
  # Folder name for stashes, if you want them to be stored somewhere else,
  # e.g. under /tmp.
  @stash_folder ||= "stashes"
- unless Dir.exists?(@stash_folder)
- FileUtils.mkdir_p(@stash_folder)
- end
+ FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
  end
 
  ##
- # If instance pages are paginated, <b>you must override</b>
+ # If instance pages are paginated, <b>you must override</b>
  # this method to return the next URL, given the current URL and its index.
  #
  # If instance pages aren't paginated, there's no need to override this.
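The defaults set in the initializer above are all exposed as public accessors. A hypothetical tuning of a scraper instance might look like this (the URL, selector, and values are examples only):

    scraper = Upton::Scraper.new("http://example.com/listings", "a.listing")
    scraper.verbose = true                    # log fetches and stash hits
    scraper.debug = true                      # stash instance pages (the default)
    scraper.index_debug = true                # stash index pages as well
    scraper.sleep_time_between_requests = 5   # seconds between live requests (default 30)
    scraper.stash_folder = "stashes"          # where stashed pages are kept
    scraper.paginated = true                  # new in 0.2.8: walk paginated index pages
    scraper.pagination_param = "p"            # query parameter to increment (default "page")
    scraper.pagination_max_pages = 3          # stop after this many index pages (default 2)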
@@ -119,22 +127,42 @@ module Upton
  # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
  ##
  def next_instance_page_url(url, pagination_index)
- ""
+ EMPTY_STRING
  end
 
  ##
- # If index pages are paginated, <b>you must override</b>
- # this method to return the next URL, given the current URL and its index.
- #
- # If index pages aren't paginated, there's no need to override this.
+ # Return the next URL to scrape, given the current URL and its index.
  #
  # Recursion stops if the fetching URL returns an empty string or an error.
  #
- # e.g. +next_index_page_url("http://whatever.com/articles?page=1", 2)+
+ # If @paginated is not set (the default), this method returns an empty string.
+ #
+ # If @paginated is set, this method will return the next pagination URL
+ # to scrape using @pagination_param and the pagination_index.
+ #
+ # If the pagination_index is greater than @pagination_max_pages, then the
+ # method will return an empty string.
+ #
+ # Override this method to handle pagination is an alternative way
+ # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
  # ought to return "http://whatever.com/articles?page=2"
+ #
  ##
  def next_index_page_url(url, pagination_index)
- ""
+ return EMPTY_STRING unless @paginated
+
+ if pagination_index > @pagination_max_pages
+ puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+ EMPTY_STRING
+ else
+ uri = URI.parse(url)
+ query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+ # update the pagination query string parameter
+ query[@pagination_param] = pagination_index
+ uri.query = URI.encode_www_form(query)
+ puts "Next index pagination url is #{uri}" if @verbose
+ uri.to_s
+ end
  end
 
  ##
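To make the new behavior concrete, here is a sketch of what next_index_page_url returns once pagination is enabled; the URLs are made up:

    scraper = Upton::Scraper.new("http://example.com/articles?page=1", "a.article-link")
    scraper.next_index_page_url("http://example.com/articles?page=1", 2)
    # => ""  (pagination is off by default)

    scraper.paginated = true
    scraper.pagination_max_pages = 5
    scraper.next_index_page_url("http://example.com/articles?page=1", 2)
    # => "http://example.com/articles?page=2"
    scraper.next_index_page_url("http://example.com/articles?page=5", 6)
    # => ""  (past @pagination_max_pages)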
@@ -142,13 +170,10 @@ module Upton
  ##
  def scrape_to_csv filename, &blk
  require 'csv'
- unless self.url_array
- self.url_array = self.get_index
- end
+ self.url_array = self.get_index unless self.url_array
  CSV.open filename, 'wb' do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- puts document.inspect
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
@@ -161,13 +186,10 @@ module Upton
 
  def scrape_to_tsv filename, &blk
  require 'csv'
- unless self.url_array
- self.url_array = self.get_index
- end
+ self.url_array = self.get_index unless self.url_array
  CSV.open filename, 'wb', :col_sep => "\t" do |csv|
  #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
- self.scrape_from_list(self.url_array, blk).compact.each do |document|
- puts document.inspect
+ self.scrape_from_list(self.url_array, blk).compact.each do |document|
  if document[0].respond_to? :map
  document.each{|row| csv << row }
  else
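As a minimal sketch of the CSV path (the URL, selector, and filename are illustrative), each Array returned from the block becomes one row of output:

    scraper = Upton::Scraper.new("http://example.com/articles", "a.article-link")
    scraper.scrape_to_csv("articles.csv") do |html, url, index|
      doc = Nokogiri::HTML(html)
      [url, doc.css("h1").text]   # one Array per instance page becomes one CSV row
    end
    # scrape_to_tsv works the same way, writing tab-separated output instead.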
@@ -181,70 +203,20 @@ module Upton
  protected
 
  ##
- # Actually fetches the page
- ##
- def fetch_page(url, options={})
- RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
- end
-
- ##
- # Handles getting pages with RestClient or getting them from the local stash.
- #
- # Uses a kludge (because rest-client is outdated) to handle encoding.
+ # Handles getting pages with Downlader, which handles stashing.
  ##
  def get_page(url, stash=false, options={})
- return "" if url.empty?
-
- #the filename for each stashed version is a cleaned version of the URL.
- if stash && File.exists?( url_to_filename(url, options) )
- puts "usin' a stashed copy of " + url if @verbose
- resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
- else
- begin
- puts "getting " + url if @verbose
- sleep @sleep_time_between_requests
- resp = fetch_page(url, options)
-
- #this is silly, but rest-client needs to get on their game.
- #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
- if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
- charset = if set = resp.net_http_res.type_params['charset']
- set
- elsif content_type == 'text/xml'
- 'us-ascii'
- elsif content_type.split('/').first == 'text'
- 'iso-8859-1'
- end
- resp.force_encoding(charset) if charset
- end
-
- rescue RestClient::ResourceNotFound
- puts "404 error, skipping: #{url}" if @verbose
- resp = ""
- rescue RestClient::InternalServerError
- puts "500 Error, skipping: #{url}" if @verbose
- resp = ""
- rescue URI::InvalidURIError
- puts "Invalid URI: #{url}" if @verbose
- resp = ""
- rescue RestClient::RequestTimeout
- "Timeout: #{url}" if @verbose
- retry
- end
- if stash
- puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
- open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
- end
+ return EMPTY_STRING if url.empty?
+ resp_and_cache = Downloader.new(url, {:cache => stash, :verbose => @verbose}.merge(options)).get
+ if resp_and_cache[:from_resource]
+ puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+ sleep @sleep_time_between_requests
  end
- resp
+ resp_and_cache[:resp]
  end
 
- def url_to_filename(url, options={})
- File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
- end
 
-
- ##
+ ##
  # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
  # resolve_url resolves them to absolute urls.
  # absolute_url_str must be a URL, as a string, that is absolute.
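The new get_page above delegates fetching and stashing to the Downloader class added in this release. A rough sketch of the contract it relies on (the URL is illustrative):

    result = Upton::Downloader.new("http://example.com/a.html", :cache => true, :verbose => true).get
    result[:resp]           # the page body, or "" after a 404, 500, or invalid URI
    result[:from_resource]  # true only when stashing is disabled and the page was fetched live,
                            # which is the only case where get_page sleeps between requests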
@@ -258,7 +230,7 @@ module Upton
  return href.to_s if href.absolute?
 
  #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
- URI.join(absolute_url, href).to_s
+ URI.join(absolute_url, href).to_s
  end
 
  ##
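The resolution above is plain URI.join against the index URL; for example (an assumed index URL):

    URI.join("http://example.com/listings/index.html", "item-42.html").to_s
    # => "http://example.com/listings/item-42.html"
    URI.join("http://example.com/listings/index.html", "http://other.example.org/x").to_s
    # => "http://other.example.org/x"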
@@ -272,7 +244,7 @@ module Upton
  end
 
  ##
- # Using the XPath expression or CSS selector and selector_method that
+ # Using the XPath expression or CSS selector and selector_method that
  # uniquely identifies the links in the index, return those links as strings. ##
  def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
  # for now, override selector_method with :search, which will work with either CSS or XPath
@@ -285,20 +257,18 @@ module Upton
  # Does @index_url stay unaltered for the lifetime of the Upton instance?
  # It seems to at this point, but that may be something that gets
  # deprecated later
- #
- # So for now, @index_url is used in conjunction with resolve_url
+ #
+ # So for now, @index_url is used in conjunction with resolve_url
  # to make sure that this method returns absolute urls
  # i.e. this method expects @index_url to always have an absolute address
  # for the lifetime of an Upton instance
  def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
  # for now, override selector_method with :search, which will work with either CSS or XPath
- Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+ Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
  href = a_element["href"]
- u = resolve_url( href, @index_url) unless href.nil?
- unless u == href
- puts "resolved #{href} to #{u}"
- end
- u
+ resolved_url = resolve_url( href, @index_url) unless href.nil?
+ puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+ resolved_url
  end
  end
 
@@ -309,13 +279,13 @@ module Upton
  ##
  def get_index_pages(url, pagination_index, options={})
  resp = self.get_page(url, @index_debug, options)
- if !resp.empty?
+ unless resp.empty?
  next_url = self.next_index_page_url(url, pagination_index + 1)
  # resolve to absolute url
  #
  next_url = resolve_url(next_url, url)
  unless next_url == url
- next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
+ next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
  resp += next_resp
  end
  end
@@ -324,20 +294,20 @@ module Upton
 
  ##
  # Returns the instance at `url`.
- #
+ #
  # If the page is stashed, returns that, otherwise, fetches it from the web.
  #
- # If an instance is paginated, returns the concatenated output of each
+ # If an instance is paginated, returns the concatenated output of each
  # page, e.g. if a news article has two pages.
  ##
  def get_instance(url, pagination_index=0, options={})
  resp = self.get_page(url, @debug, options)
- if !resp.empty?
+ if !resp.empty?
  next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
-
- # next_url = resolve_url(next_url, url)
+
+ #next_url = resolve_url(next_url, url)
  unless next_url == url
- next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
+ next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
  resp += next_resp
  end
  end
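For paginated instance pages, the doc comments above say to override next_instance_page_url in a subclass. A hypothetical subclass (the site's ?page= scheme is assumed, not part of the gem) might look like:

    class ArticleScraper < Upton::Scraper
      # Fetch up to three pages of each article by rewriting a ?page=N parameter.
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 3        # empty string stops the recursion
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end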
lib/upton/downloader.rb ADDED
@@ -0,0 +1,126 @@
+ require "fileutils"
+ require "open-uri"
+ require "tmpdir"
+ require "restclient"
+
+ module Upton
+
+ # This class is used internally to download and cache the webpages
+ # that are requested.
+ #
+ # By default, the cache location is the output of `Dir.tmpdir`/upton.
+ # The Dir.tmpdir returns the temporary directory of the operating system.
+ # By default, the stashed files have a non-human-readable md5-based filename.
+ # If `readable_stash_filenames` is true, they will have human-readable names.
+ class Downloader
+
+ MAX_FILENAME_LENGTH = 130 #for unixes, win xp+
+ EMPTY_STRING = ''
+
+ attr_reader :uri, :cache_location, :verbose
+ def initialize(uri, options = {})
+ @uri = uri
+ @cache = options.fetch(:cache) { true }
+ @cache_location = File.absolute_path(options[:cache_location] || "#{Dir.tmpdir}/upton")
+ @verbose = options[:verbose] || false
+ @readable_stash_filenames = options[:readable_filenames] || false
+ initialize_cache!
+ end
+
+ def get
+ if cache_enabled?
+ puts "Stashing enabled. Will try reading #{uri} data from cache." if @verbose
+ download_from_cache!
+ else
+ puts "Stashing disabled. Will download from the internet." if @verbose
+ from_resource = true
+ resp = download_from_resource!
+ {:resp => resp, :from_resource => from_resource }
+ end
+ end
+
+ private
+
+ def download_from_resource!
+ begin
+ puts "Downloading from #{uri}" if @verbose
+ resp = RestClient.get(uri)
+ puts "Downloaded #{uri}" if @verbose
+ rescue RestClient::ResourceNotFound
+ puts "404 error, skipping: #{uri}" if @verbose
+ rescue RestClient::InternalServerError
+ puts "500 Error, skipping: #{uri}" if @verbose
+ rescue URI::InvalidURIError
+ puts "Invalid URI: #{uri}" if @verbose
+ rescue RestClient::RequestTimeout
+ puts "Timeout: #{uri}" if @verbose
+ retry
+ end
+ resp ||= EMPTY_STRING
+ end
+
+ def download_from_cache!
+ resp = if cached_file_exists?
+ puts "Cache of #{uri} available" if @verbose
+ from_resource = false
+ open(cached_file).read
+ else
+ if @verbose
+ if @readable_stash_filenames
+ puts "Cache of #{uri} unavailable at #{filename_from_uri}. Will download from the internet"
+ else
+ puts "Cache of #{uri} unavailable. Will download from the internet"
+ end
+ end
+ from_resource = false
+ download_from_resource!
+ end
+ unless cached_file_exists?
+ if @verbose
+ if @readable_stash_filenames
+ puts "Writing #{uri} data to the cache at #{cached_file}"
+ else
+ puts "Writing #{uri} data to the cache"
+ end
+ end
+ File.write(cached_file, resp)
+ end
+ {:resp => resp, :from_resource => from_resource }
+ end
+
+ def cache_enabled?
+ !!@cache
+ end
+
+ def filename_from_uri
+ @readable_stash_filenames ? readable_filename_from_uri : hashed_filename_from_uri
+ end
+
+ def hashed_filename_from_uri
+ Digest::MD5.hexdigest(uri)
+ end
+
+ def readable_filename_from_uri
+ html = "html"
+ clean_url_max_length = MAX_FILENAME_LENGTH - html.length - cache_location.size
+ clean_url = uri.gsub(/[^A-Za-z0-9\-_]/, "")[0...clean_url_max_length]
+ "#{clean_url}.#{html}"
+ end
+
+ def cached_file
+ "#{cache_location}/#{filename_from_uri}"
+ end
+
+ def cached_file_exists?
+ File.exists?(cached_file)
+ end
+
+ def initialize_cache!
+ unless Dir.exists?(cache_location)
+ Dir.mkdir(cache_location)
+ FileUtils.chmod 0700, cache_location
+ end
+ end
+
+ end
+ end
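Although Scraper drives it internally through get_page, the class can also be exercised directly. A sketch using the options read in initialize above (the URL and cache path are examples only):

    downloader = Upton::Downloader.new("http://example.com/articles?page=1",
                                       :cache => true,                    # the default
                                       :cache_location => "/tmp/my-upton-cache",
                                       :readable_filenames => true,       # human-readable stash names
                                       :verbose => true)
    page = downloader.get
    page[:resp]           # the HTML, read from the cache when a stashed copy exists
    page[:from_resource]  # false here, since caching is enabled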