upton 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/upton.rb +86 -116
- data/lib/upton/downloader.rb +126 -0
- data/lib/upton/utils.rb +43 -0
- data/spec/data/propublica.html +269 -269
- data/spec/data/propublica_search.html +388 -0
- data/spec/data/propublica_search_page_2.html +375 -0
- data/spec/spec_helper.rb +20 -0
- data/spec/upton_downloader_spec.rb +75 -0
- data/spec/upton_spec.rb +110 -47
- metadata +26 -3
- data/lib/utils.rb +0 -74
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0bc8fddf34dc974bde7491e7dd311eb09b5d393e
+  data.tar.gz: b8a8010408cd715b010406163cd14e45045af2d6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0c5cdda936dcaf7a045afbc6cb317fc463191823a13d585732717f6ddfb3d4970a94c51df0324a343022c938702ca8b0fdbbf9e8b54fb0cc5fafec1dd8af8276
+  data.tar.gz: e5f2bd0c9f9ba843607b0ac7816c84df21cc6acbb0de13ec5918e3edb866fa41d7e6e9b39d4d0af7ea74c0ebf4628240ee783612edc3370842f860039ccc6465
data/lib/upton.rb
CHANGED
@@ -3,55 +3,56 @@
 require 'nokogiri'
 require 'uri'
 require 'restclient'
-require_relative '
+require_relative 'upton/utils'
+require_relative 'upton/downloader'
 
 ##
 # This module contains a scraper called Upton
 ##
 module Upton
   ##
-  # *Upton* is a framework for easy web-scraping with a useful debug mode
-  # that doesn't hammer your target's servers. It does the repetitive parts of
+  # *Upton* is a framework for easy web-scraping with a useful debug mode
+  # that doesn't hammer your target's servers. It does the repetitive parts of
   # writing scrapers, so you only have to write the unique parts for each site.
   #
   # Upton operates on the theory that, for most scraping projects, you need to
   # scrape two types of pages:
-  #
-  # 1. Index pages, which list instance pages. For example, a job search
+  #
+  # 1. Index pages, which list instance pages. For example, a job search
   # site's search page or a newspaper's homepage.
   # 2. Instance pages, which represent the goal of your scraping, e.g.
   # job listings or news articles.
   #
   # Upton::Scraper can be used as-is for basic use-cases by:
-  # 1. specifying the pages to be scraped in `new` as an index page
+  # 1. specifying the pages to be scraped in `new` as an index page
   # or as an Array of URLs.
-  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
+  # 2. supplying a block to `scrape` or `scrape_to_csv` or using a pre-build
   # block from Upton::Utils.
-  # For more complicated cases; subclass Upton::Scraper
+  # For more complicated cases; subclass Upton::Scraper
   # e.g. +MyScraper < Upton::Scraper+ and override various methods.
   ##
   class Scraper
+    EMPTY_STRING = ''
 
-    attr_accessor :verbose, :debug, :sleep_time_between_requests, :stash_folder, :url_array
+    attr_accessor :verbose, :debug, :index_debug, :sleep_time_between_requests, :stash_folder, :url_array,
+                  :paginated, :pagination_param, :pagination_max_pages
 
     ##
     # This is the main user-facing method for a basic scraper.
-    # Call +scrape+ with a block; this block will be called on
+    # Call +scrape+ with a block; this block will be called on
     # the text of each instance page, (and optionally, its URL and its index
     # in the list of instance URLs returned by +get_index+).
    ##
-    def scrape
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+    def scrape(&blk)
+      self.url_array = self.get_index unless self.url_array
       self.scrape_from_list(self.url_array, blk)
     end
 
     ##
     # +index_url_or_array+: A list of string URLs, OR
     # the URL of the page containing the list of instances.
-    # +selector+: The XPath expression or CSS selector that specifies the
-    # anchor elements within the page, if a url is specified for
+    # +selector+: The XPath expression or CSS selector that specifies the
+    # anchor elements within the page, if a url is specified for
     # the previous argument.
     # +selector_method+: Deprecated and ignored. Next breaking release will
     # remove this option.x
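For orientation, the block-based API these comments describe looks like this in use. A minimal sketch; the URL and selector below are invented for illustration, not taken from the diff:

    require 'upton'

    # Index page URL plus a CSS selector matching the links to instance pages.
    scraper = Upton::Scraper.new("http://www.example.com/news", "a.headline")
    scraper.scrape do |html, url, index|
      # Called once per instance page with its text (and optionally URL and index).
      puts "#{index}: #{url} (#{html.length} bytes)"
    end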
@@ -68,7 +69,7 @@ module Upton
     # the String passed is of CSS/XPath notation
 
     def initialize(index_url_or_array, selector="", selector_method=:deprecated)
-
+
       #if first arg is a valid URL, do already-written stuff;
       #if it's not (or if it's a list?) don't bother with get_index, etc.
      #e.g. Scraper.new(["http://jeremybmerrill.com"])
@@ -80,6 +81,7 @@ module Upton
         @index_url = index_url_or_array
         @index_selector = selector
       end
+
       # If true, then Upton prints information about when it gets
       # files from the internet and when it gets them from its stash.
       @verbose = false
@@ -89,26 +91,32 @@ module Upton
       # version.
       # You may want to set @debug to false for production (but maybe not).
       # You can also control stashing behavior on a per-call basis with the
-      # optional second argument to get_page, if, for instance, you want to
+      # optional second argument to get_page, if, for instance, you want to
       # stash certain instance pages, e.g. based on their modification date.
       @debug = true
       # Index debug does the same, but for index pages.
       @index_debug = false
 
-      # In order to not hammer servers, Upton waits for, by default, 30
+      # In order to not hammer servers, Upton waits for, by default, 30
       # seconds between requests to the remote server.
       @sleep_time_between_requests = 30 #seconds
 
+      # If true, then Upton will attempt to scrape paginated index pages
+      @paginated = false
+      # Default query string parameter used to specify the current page
+      @pagination_param = 'page'
+      # Default number of paginated pages to scrape
+      @pagination_max_pages = 2
+
+
       # Folder name for stashes, if you want them to be stored somewhere else,
       # e.g. under /tmp.
       @stash_folder ||= "stashes"
-      unless Dir.exists?(@stash_folder)
-        FileUtils.mkdir_p(@stash_folder)
-      end
+      FileUtils.mkdir_p(@stash_folder) unless Dir.exists?(@stash_folder)
     end
 
     ##
-    # If instance pages are paginated, <b>you must override</b>
+    # If instance pages are paginated, <b>you must override</b>
     # this method to return the next URL, given the current URL and its index.
     #
     # If instance pages aren't paginated, there's no need to override this.
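The three new pagination defaults above are also exposed via the new attr_accessor entries, so paginated index scraping can be switched on per scraper. A hedged configuration sketch (the URL, selector, and values are invented):

    scraper = Upton::Scraper.new("http://www.example.com/articles", "a.article-link")
    scraper.paginated = true           # off by default; opt in to index pagination
    scraper.pagination_param = 'p'     # if the site uses ?p=N rather than the default ?page=N
    scraper.pagination_max_pages = 5   # scrape at most five index pages (default is 2)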
@@ -119,22 +127,42 @@ module Upton
     # ought to return "http://whatever.com/article/upton-sinclairs-the-jungle?page=2"
     ##
     def next_instance_page_url(url, pagination_index)
-
+      EMPTY_STRING
     end
 
     ##
-    #
-    # this method to return the next URL, given the current URL and its index.
-    #
-    # If index pages aren't paginated, there's no need to override this.
+    # Return the next URL to scrape, given the current URL and its index.
     #
     # Recursion stops if the fetching URL returns an empty string or an error.
     #
-    #
+    # If @paginated is not set (the default), this method returns an empty string.
+    #
+    # If @paginated is set, this method will return the next pagination URL
+    # to scrape using @pagination_param and the pagination_index.
+    #
+    # If the pagination_index is greater than @pagination_max_pages, then the
+    # method will return an empty string.
+    #
+    # Override this method to handle pagination is an alternative way
+    # e.g. next_index_page_url("http://whatever.com/articles?page=1", 2)
     # ought to return "http://whatever.com/articles?page=2"
+    #
     ##
     def next_index_page_url(url, pagination_index)
-
+      return EMPTY_STRING unless @paginated
+
+      if pagination_index > @pagination_max_pages
+        puts "Exceeded pagination limit of #{@pagination_max_pages}" if @verbose
+        EMPTY_STRING
+      else
+        uri = URI.parse(url)
+        query = uri.query ? Hash[URI.decode_www_form(uri.query)] : {}
+        # update the pagination query string parameter
+        query[@pagination_param] = pagination_index
+        uri.query = URI.encode_www_form(query)
+        puts "Next index pagination url is #{uri}" if @verbose
+        uri.to_s
+      end
     end
 
     ##
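Given that implementation, the round-trip through URI.decode_www_form and URI.encode_www_form simply rewrites one query parameter. An illustrative pair of calls with the defaults (URLs invented):

    scraper = Upton::Scraper.new("http://whatever.com/articles", "a")
    scraper.paginated = true
    scraper.next_index_page_url("http://whatever.com/articles?page=1", 2)
    #=> "http://whatever.com/articles?page=2"
    scraper.next_index_page_url("http://whatever.com/articles?page=2", 3)
    #=> "" (3 exceeds the default pagination_max_pages of 2)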
@@ -142,13 +170,10 @@ module Upton
     ##
     def scrape_to_csv filename, &blk
       require 'csv'
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+      self.url_array = self.get_index unless self.url_array
       CSV.open filename, 'wb' do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          puts document.inspect
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
@@ -161,13 +186,10 @@ module Upton
 
     def scrape_to_tsv filename, &blk
       require 'csv'
-      unless self.url_array
-        self.url_array = self.get_index
-      end
+      self.url_array = self.get_index unless self.url_array
       CSV.open filename, 'wb', :col_sep => "\t" do |csv|
         #this is a conscious choice: each document is a list of things, either single elements or rows (as lists).
-        self.scrape_from_list(self.url_array, blk).compact.each do |document|
-          puts document.inspect
+        self.scrape_from_list(self.url_array, blk).compact.each do |document|
           if document[0].respond_to? :map
             document.each{|row| csv << row }
           else
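Both writers (the stray `puts document.inspect` debug lines are gone) rely on the same convention: each block return is either a single row or a list of rows. A short sketch of the CSV path, with an invented filename and block:

    scraper.scrape_to_csv("articles.csv") do |html, url, index|
      # A single row: a String does not respond to :map, so this takes the else branch above.
      [url, html[/<title>(.*?)<\/title>/m, 1]]
    end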
@@ -181,70 +203,20 @@ module Upton
     protected
 
     ##
-    #
-    ##
-    def fetch_page(url, options={})
-      RestClient.get(url, {:accept=> "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"})
-    end
-
-    ##
-    # Handles getting pages with RestClient or getting them from the local stash.
-    #
-    # Uses a kludge (because rest-client is outdated) to handle encoding.
+    # Handles getting pages with Downlader, which handles stashing.
     ##
     def get_page(url, stash=false, options={})
-      return
-
-
-
-
-        resp = open( url_to_filename(url, options), 'r:UTF-8').read .encode("UTF-8", :invalid => :replace, :undef => :replace )
-      else
-        begin
-          puts "getting " + url if @verbose
-          sleep @sleep_time_between_requests
-          resp = fetch_page(url, options)
-
-          #this is silly, but rest-client needs to get on their game.
-          #cf https://github.com/jcoyne/rest-client/blob/fb80f2c320687943bc4fae1503ed15f9dff4ce64/lib/restclient/response.rb#L26
-          if ((200..207).include?(resp.net_http_res.code.to_i) && content_type = resp.net_http_res.content_type)
-            charset = if set = resp.net_http_res.type_params['charset']
-              set
-            elsif content_type == 'text/xml'
-              'us-ascii'
-            elsif content_type.split('/').first == 'text'
-              'iso-8859-1'
-            end
-            resp.force_encoding(charset) if charset
-          end
-
-        rescue RestClient::ResourceNotFound
-          puts "404 error, skipping: #{url}" if @verbose
-          resp = ""
-        rescue RestClient::InternalServerError
-          puts "500 Error, skipping: #{url}" if @verbose
-          resp = ""
-        rescue URI::InvalidURIError
-          puts "Invalid URI: #{url}" if @verbose
-          resp = ""
-        rescue RestClient::RequestTimeout
-          "Timeout: #{url}" if @verbose
-          retry
-        end
-        if stash
-          puts "I just stashed (#{resp.code if resp.respond_to?(:code)}): #{url}" if @verbose
-          open( url_to_filename(url, options), 'w:UTF-8'){|f| f.write(resp.encode("UTF-8", :invalid => :replace, :undef => :replace ) )}
-        end
+      return EMPTY_STRING if url.empty?
+      resp_and_cache = Downloader.new(url, {:cache => stash, :verbose => @verbose}.merge(options)).get
+      if resp_and_cache[:from_resource]
+        puts "sleeping #{@sleep_time_between_requests} secs" if @verbose
+        sleep @sleep_time_between_requests
       end
-      resp
+      resp_and_cache[:resp]
     end
 
-    def url_to_filename(url, options={})
-      File.join(@stash_folder, url.gsub(/[^A-Za-z0-9\-]/, "") )
-    end
 
-
-    ##
+    ##
     # sometimes URLs are relative, e.g. "index.html" as opposed to "http://site.com/index.html"
     # resolve_url resolves them to absolute urls.
     # absolute_url_str must be a URL, as a string, that is absolute.
@@ -258,7 +230,7 @@ module Upton
       return href.to_s if href.absolute?
 
       #TODO: edge cases, see [issue #8](https://github.com/propublica/upton/issues/8)
-      URI.join(absolute_url, href).to_s
+      URI.join(absolute_url, href).to_s
     end
 
     ##
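resolve_url itself is unchanged apart from whitespace; the behavior it wraps is essentially URI.join from the standard library. For reference (values invented):

    require 'uri'

    URI.join("http://site.com/section/", "index.html").to_s  #=> "http://site.com/section/index.html"
    URI.join("http://site.com/section/", "/about").to_s      #=> "http://site.com/about"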
@@ -272,7 +244,7 @@ module Upton
     end
 
     ##
-    # Using the XPath expression or CSS selector and selector_method that
+    # Using the XPath expression or CSS selector and selector_method that
     # uniquely identifies the links in the index, return those links as strings. ##
     def old_parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
       # for now, override selector_method with :search, which will work with either CSS or XPath
@@ -285,20 +257,18 @@ module Upton
     # Does @index_url stay unaltered for the lifetime of the Upton instance?
     # It seems to at this point, but that may be something that gets
     # deprecated later
-    #
-    # So for now, @index_url is used in conjunction with resolve_url
+    #
+    # So for now, @index_url is used in conjunction with resolve_url
     # to make sure that this method returns absolute urls
     # i.e. this method expects @index_url to always have an absolute address
     # for the lifetime of an Upton instance
     def parse_index(text, selector, selector_method=:deprecated) # TODO: Deprecate selector_method in next minor release.
       # for now, override selector_method with :search, which will work with either CSS or XPath
-      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
+      Nokogiri::HTML(text).search(selector).to_a.map do |a_element|
         href = a_element["href"]
-
-
-
-        end
-        u
+        resolved_url = resolve_url( href, @index_url) unless href.nil?
+        puts "resolved #{href} to #{resolved_url}" if @verbose && resolved_url != href
+        resolved_url
       end
     end
 
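parse_index now resolves each extracted href against @index_url and logs the resolution when verbose. The equivalent logic, sketched standalone with invented HTML and URLs (using URI.join directly in place of resolve_url):

    require 'nokogiri'
    require 'uri'

    html = '<a href="/story/1">One</a> <a href="story/2">Two</a>'
    index_url = "http://example.com/news/"
    Nokogiri::HTML(html).search("a").map do |a_element|
      href = a_element["href"]
      URI.join(index_url, href).to_s unless href.nil?
    end
    #=> ["http://example.com/story/1", "http://example.com/news/story/2"]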
@@ -309,13 +279,13 @@ module Upton
     ##
     def get_index_pages(url, pagination_index, options={})
       resp = self.get_page(url, @index_debug, options)
-
+      unless resp.empty?
         next_url = self.next_index_page_url(url, pagination_index + 1)
         # resolve to absolute url
         #
         next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
+          next_resp = self.get_index_pages(next_url, pagination_index + 1).to_s
           resp += next_resp
         end
       end
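For paginated instance pages, the comments earlier in the file still say you must override next_instance_page_url yourself. A hedged sketch of such an override, following the documented contract (the URL scheme and page limit are invented):

    class MyScraper < Upton::Scraper
      # Follow ?page=2, ?page=3 on each article, stopping after page 3.
      def next_instance_page_url(url, pagination_index)
        return "" if pagination_index > 3
        "#{url.split('?').first}?page=#{pagination_index}"
      end
    end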
@@ -324,20 +294,20 @@ module Upton
 
     ##
     # Returns the instance at `url`.
-    #
+    #
     # If the page is stashed, returns that, otherwise, fetches it from the web.
     #
-    # If an instance is paginated, returns the concatenated output of each
+    # If an instance is paginated, returns the concatenated output of each
     # page, e.g. if a news article has two pages.
     ##
     def get_instance(url, pagination_index=0, options={})
       resp = self.get_page(url, @debug, options)
-      if !resp.empty?
+      if !resp.empty?
         next_url = self.next_instance_page_url(url, pagination_index.to_i + 1)
-
-        #
+
+        #next_url = resolve_url(next_url, url)
         unless next_url == url
-          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
+          next_resp = self.get_instance(next_url, pagination_index.to_i + 1).to_s
           resp += next_resp
         end
       end
data/lib/upton/downloader.rb
ADDED
@@ -0,0 +1,126 @@
+require "fileutils"
+require "open-uri"
+require "tmpdir"
+require "restclient"
+
+module Upton
+
+  # This class is used internally to download and cache the webpages
+  # that are requested.
+  #
+  # By default, the cache location is the output of `Dir.tmpdir`/upton.
+  # The Dir.tmpdir returns the temporary directory of the operating system.
+  # By default, the stashed files have a non-human-readable md5-based filename.
+  # If `readable_stash_filenames` is true, they will have human-readable names.
+  class Downloader
+
+    MAX_FILENAME_LENGTH = 130 #for unixes, win xp+
+    EMPTY_STRING = ''
+
+    attr_reader :uri, :cache_location, :verbose
+    def initialize(uri, options = {})
+      @uri = uri
+      @cache = options.fetch(:cache) { true }
+      @cache_location = File.absolute_path(options[:cache_location] || "#{Dir.tmpdir}/upton")
+      @verbose = options[:verbose] || false
+      @readable_stash_filenames = options[:readable_filenames] || false
+      initialize_cache!
+    end
+
+    def get
+      if cache_enabled?
+        puts "Stashing enabled. Will try reading #{uri} data from cache." if @verbose
+        download_from_cache!
+      else
+        puts "Stashing disabled. Will download from the internet." if @verbose
+        from_resource = true
+        resp = download_from_resource!
+        {:resp => resp, :from_resource => from_resource }
+      end
+    end
+
+    private
+
+    def download_from_resource!
+      begin
+        puts "Downloading from #{uri}" if @verbose
+        resp = RestClient.get(uri)
+        puts "Downloaded #{uri}" if @verbose
+      rescue RestClient::ResourceNotFound
+        puts "404 error, skipping: #{uri}" if @verbose
+      rescue RestClient::InternalServerError
+        puts "500 Error, skipping: #{uri}" if @verbose
+      rescue URI::InvalidURIError
+        puts "Invalid URI: #{uri}" if @verbose
+      rescue RestClient::RequestTimeout
+        puts "Timeout: #{uri}" if @verbose
+        retry
+      end
+      resp ||= EMPTY_STRING
+    end
+
+    def download_from_cache!
+      resp = if cached_file_exists?
+        puts "Cache of #{uri} available" if @verbose
+        from_resource = false
+        open(cached_file).read
+      else
+        if @verbose
+          if @readable_stash_filenames
+            puts "Cache of #{uri} unavailable at #{filename_from_uri}. Will download from the internet"
+          else
+            puts "Cache of #{uri} unavailable. Will download from the internet"
+          end
+        end
+        from_resource = false
+        download_from_resource!
+      end
+      unless cached_file_exists?
+        if @verbose
+          if @readable_stash_filenames
+            puts "Writing #{uri} data to the cache at #{cached_file}"
+          else
+            puts "Writing #{uri} data to the cache"
+          end
+        end
+        File.write(cached_file, resp)
+      end
+      {:resp => resp, :from_resource => from_resource }
+    end
+
+    def cache_enabled?
+      !!@cache
+    end
+
+    def filename_from_uri
+      @readable_stash_filenames ? readable_filename_from_uri : hashed_filename_from_uri
+    end
+
+    def hashed_filename_from_uri
+      Digest::MD5.hexdigest(uri)
+    end
+
+    def readable_filename_from_uri
+      html = "html"
+      clean_url_max_length = MAX_FILENAME_LENGTH - html.length - cache_location.size
+      clean_url = uri.gsub(/[^A-Za-z0-9\-_]/, "")[0...clean_url_max_length]
+      "#{clean_url}.#{html}"
+    end
+
+    def cached_file
+      "#{cache_location}/#{filename_from_uri}"
+    end
+
+    def cached_file_exists?
+      File.exists?(cached_file)
+    end
+
+    def initialize_cache!
+      unless Dir.exists?(cache_location)
+        Dir.mkdir(cache_location)
+        FileUtils.chmod 0700, cache_location
+      end
+    end
+
+  end
+end
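Downloader is used internally by Scraper#get_page, but the constructor options above make it usable on its own. A hedged sketch (the URL is invented):

    require 'upton'

    downloader = Upton::Downloader.new(
      "http://example.com/page.html",
      :cache => true,                # the default: read and write the stash
      :readable_filenames => true,   # human-readable cache filenames instead of MD5 hashes
      :verbose => true
    )
    result = downloader.get
    result[:resp]           # page body, or "" if a handled error occurred
    result[:from_resource]  # per the code above, false whenever caching is enabled, even on a cache miss

Note the last point: download_from_cache! sets from_resource to false on both branches, so get_page's polite sleep between requests only triggers when stashing is disabled.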