parallel588_polipus 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
data/lib/polipus/http.rb
ADDED
@@ -0,0 +1,282 @@
|
|
1
|
+
# encoding: UTF-8
require 'net/https'
require 'polipus/page'
require 'zlib'
require 'http/cookie'

module Polipus
  # HTTP client used by the crawler.
  #
  # Responsibilities: fetch pages following redirects, decompress
  # gzip/deflate bodies, manage cookies and HTTP basic auth, route
  # through an optional proxy, and reuse keep-alive connections in a
  # per-host/per-port pool.
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5
    # Transient network/protocol failures: retried up to 3 times in
    # #get_response and converted into an error Page by #fetch_pages.
    RESCUABLE_ERRORS = [
      EOFError,
      Errno::ECONNREFUSED,
      Errno::ECONNRESET,
      Errno::EHOSTUNREACH,
      Errno::EINVAL,
      Errno::EPIPE,
      Errno::ETIMEDOUT,
      Net::HTTPBadResponse,
      Net::HTTPHeaderSyntaxError,
      Net::ProtocolError,
      SocketError,
      Timeout::Error,
      Zlib::DataError,
      Zlib::GzipFile::Error
    ].freeze

    # @param opts [Hash] client options; recognized keys include
    #   :user_agent, :redirect_limit, :proxy_host, :proxy_port,
    #   :proxy_user, :proxy_pass, :proxy_host_port, :read_timeout,
    #   :open_timeout, :accept_cookies, :cookie_jar, :http_user,
    #   :http_password, :connection_max_hits, :logger, :verbose
    def initialize(opts = {})
      @connections = {}       # { host => { port => Net::HTTP } }
      @connections_hits = {}  # { host => { port => request count } }
      @opts = opts
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects
    #
    # @return [Array<Page>] one Page per response in the redirect chain;
    #   on an unrecoverable error, a single Page carrying the exception.
    def fetch_pages(url, referer = nil, depth = nil)
      url = URI(url)
      pages = []
      get(url, referer) do |response, code, location, redirect_to, response_time|
        handle_compression response
        pages << Page.new(location, body: response.body,
                                    code: code,
                                    headers: response.to_hash,
                                    referer: referer,
                                    depth: depth,
                                    redirect_to: redirect_to,
                                    response_time: response_time,
                                    fetched_at: Time.now.to_i)
      end

      pages
    rescue *RESCUABLE_ERRORS => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end

      # Always give the caller at least one Page, carrying the error.
      [Page.new(url, error: e, referer: referer, depth: depth)]
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set.
    # When :user_agent is an array (responds to #sample), a random
    # entry is picked per request.
    #
    def user_agent
      if @opts[:user_agent].respond_to?(:sample)
        @opts[:user_agent].sample
      else
        @opts[:user_agent]
      end
    end

    #
    # The proxy address string (a callable option is invoked per request)
    #
    def proxy_host
      @opts[:proxy_host].respond_to?(:call) ? @opts[:proxy_host].call(self) : @opts[:proxy_host]
    end

    #
    # The proxy port
    #
    def proxy_port
      @opts[:proxy_port].respond_to?(:call) ? @opts[:proxy_port].call(self) : @opts[:proxy_port]
    end

    #
    # The proxy username
    #
    def proxy_user
      @opts[:proxy_user].respond_to?(:call) ? @opts[:proxy_user].call(self) : @opts[:proxy_user]
    end

    #
    # The proxy password
    #
    def proxy_pass
      @opts[:proxy_pass].respond_to?(:call) ? @opts[:proxy_pass].call(self) : @opts[:proxy_pass]
    end

    #
    # Shorthand to get proxy info with a single call
    # It returns an array of ['addr', port, 'user', 'pass']
    #
    def proxy_host_port
      @opts[:proxy_host_port].respond_to?(:call) ? @opts[:proxy_host_port].call(self) : @opts[:proxy_host_port]
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    #
    # HTTP open timeout in seconds
    #
    def open_timeout
      @opts[:open_timeout]
    end

    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    # The cookie jar shared by all requests; lazily created.
    def cookie_jar
      @opts[:cookie_jar] ||= ::HTTP::CookieJar.new
      @opts[:cookie_jar]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, URI location,
    # redirect target (or nil) and response time for each response.
    # Stops after redirect_limit hops, on a non-redirect response,
    # or when the redirect leaves the original host (see #allowed?).
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      loop do
        # if redirected to a relative url, merge it with the host of the original
        # request url
        loc = url.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
        yield response, code, loc, redirect_to, response_time
        limit -= 1
        break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
      end
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string.
    # Retries up to 2 extra times on RESCUABLE_ERRORS, refreshing the
    # pooled connection each time, then re-raises.
    #
    # @return [Array(Net::HTTPResponse, Integer)] response and elapsed ms
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
      opts['Accept-Encoding'] = 'gzip,deflate'

      retries = 0
      begin
        start = Time.now
        # format request
        req = Net::HTTP::Get.new(full_path, opts)
        # HTTP Basic authentication from the configured options
        if @opts[:http_user]
          req.basic_auth @opts[:http_user], @opts[:http_password]
        end
        # urls auth schema has higher priority: credentials embedded in
        # the URL overwrite the option-based ones set above
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now
        response_time = ((finish - start) * 1000).round
        cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies? && response['Set-Cookie']
        return response, response_time
      rescue *RESCUABLE_ERRORS => e
        puts e.inspect if verbose?
        refresh_connection(url)
        retries += 1
        if retries < 3
          retry
        else
          raise e
        end
      end
    end

    # Return a pooled keep-alive connection for url's host:port,
    # recycling it once :connection_max_hits requests have been served.
    def connection(url)
      @connections[url.host] ||= {}
      @connections_hits[url.host] ||= {}

      if @connections[url.host][url.port]
        if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
          @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
          return refresh_connection url
        end
        @connections_hits[url.host][url.port] += 1
        return @connections[url.host][url.port]
      end

      refresh_connection url
    end

    # Open (or re-open) a connection to url's host:port, applying proxy
    # settings and timeouts, and register it in the pool.
    def refresh_connection(url)
      if @opts[:logger] && proxy_host && proxy_port
        @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
      end

      # Block has higher priority
      unless @opts[:proxy_host_port].nil?
        p_host, p_port, p_user, p_pass = proxy_host_port
      else
        p_host = proxy_host
        p_port = proxy_port
        p_user = proxy_user
        p_pass = proxy_pass
      end

      http = Net::HTTP.new(url.host, url.port, p_host, p_port, p_user, p_pass)

      http.read_timeout = read_timeout if read_timeout
      http.open_timeout = open_timeout if open_timeout

      if url.scheme == 'https'
        http.use_ssl = true
        # SECURITY: certificate verification is disabled for all HTTPS
        # requests, exposing the crawler to man-in-the-middle attacks.
        # Kept for backward compatibility; consider VERIFY_PEER plus an
        # opt-out option instead.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      @connections_hits[url.host][url.port] = 1
      @connections[url.host][url.port] = http.start
    end

    # Verbose error reporting enabled?
    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    # Redirects are only followed within the original host.
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end

    # Transparently inflate gzip/deflate response bodies in place,
    # mutating response.body via String#replace.
    def handle_compression(response)
      case response['content-encoding']
      when 'gzip', 'x-gzip'
        body_io = StringIO.new(response.body)
        response.body.replace Zlib::GzipReader.new(body_io).read
      when 'deflate'
        response.body.replace Zlib::Inflate.inflate(response.body)
      end
    end
  end
end
|
data/lib/polipus/page.rb
ADDED
@@ -0,0 +1,256 @@
|
|
1
|
+
# encoding: UTF-8
require 'nokogiri'
require 'json'
require 'ostruct'
require 'set'
require 'kconv'

module Polipus
  # A single crawled page: wraps the URL, HTTP response data and the
  # parsed document, plus crawl bookkeeping (depth, referer, redirect
  # target, timing, user data) and (de)serialization to Hash/JSON.
  class Page
    # The URL of the page
    attr_reader :url
    # The raw HTTP response body of the page
    attr_reader :body
    # Headers of the HTTP response
    attr_reader :headers
    # URL of the page this one redirected to, if any
    attr_reader :redirect_to
    # Exception object, if one was raised during HTTP#fetch_page
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
    # Depth of this page from the root of the crawl.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
    # Response time of the request for this page in milliseconds
    attr_accessor :response_time
    # OpenStruct it holds users defined data
    attr_accessor :user_data

    # Alternative URLs (aka) known for this page
    attr_accessor :aliases

    # Extra hostnames treated as the same domain by #in_domain?
    attr_accessor :domain_aliases

    # Whether the current page should be stored
    # Default: true
    attr_accessor :storable

    # Unix timestamp (seconds) of when the page was fetched, or nil
    attr_accessor :fetched_at

    #
    # Create a new page
    #
    # @param url [String, URI] page location
    # @param params [Hash] :code, :headers, :aka, :referer, :depth,
    #   :redirect_to, :response_time, :body, :error, :domain_aliases,
    #   :fetched_at
    def initialize(url, params = {})
      @url = URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      # Guarantee content_type always has something to return
      @headers['content-type'] ||= ['']
      @aliases = Array(params[:aka]).compact
      @referer = params[:referer]
      @depth = params[:depth] || 0
      @redirect_to = to_absolute(params[:redirect_to])
      @response_time = params[:response_time]
      @body = params[:body]
      @error = params[:error]
      # A page counts as fetched once it carries a response code
      @fetched = !params[:code].nil?
      @user_data = OpenStruct.new
      # NOTE: '|| []' (not '||='), so the caller's params hash is no
      # longer mutated as a side effect
      @domain_aliases = params[:domain_aliases] || []
      @storable = true
      @fetched_at = params[:fetched_at]
    end

    #
    # Array of distinct A tag HREFs from the page
    # (absolute URIs, restricted to #in_domain? hosts; memoized)
    #
    def links
      return @links.to_a unless @links.nil?
      @links = Set.new
      return [] unless doc

      doc.search('//a[@href]').each do |a|
        u = a['href']
        next if u.nil? || u.empty?
        abs = to_absolute(u) rescue next
        @links << abs if in_domain?(abs)
      end
      @links.to_a
    end

    #
    # Nokogiri document for the HTML body
    # (nil when there is no body, the page is not HTML, or parsing fails)
    #
    def doc
      return @doc if @doc
      @doc = Nokogiri::HTML(@body.toutf8, nil, 'utf-8') if @body && html? rescue nil
    end

    #
    # Discard links, a next call of page.links will return an empty array
    #
    def discard_links!
      @links = []
    end

    #
    # Delete the Nokogiri document and response body to conserve memory
    #
    def discard_doc!
      links # force parsing of page links before we trash the document
      @doc = @body = nil
    end

    #
    # Was the page successfully fetched?
    # +true+ if the page was fetched with no error, +false+ otherwise.
    #
    def fetched?
      @fetched
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      headers['content-type'].first
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # \A anchors at the start of the whole string (^ would match after
      # any newline); the '+' in xhtml+xml must be escaped or the regexp
      # can never match 'application/xhtml+xml'
      content_type =~ %r{\A(text/html|application/xhtml\+xml)\b}
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300...400).include?(@code)
    end

    #
    # Returns +true+ if the page is a HTTP success, returns +false+
    # otherwise.
    #
    def success?
      (200..206).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Base URI from the HTML doc head element
    # http://www.w3.org/TR/html4/struct/links.html#edef-BASE
    #
    def base
      @base = if doc
        href = doc.search('//head/base/@href')
        URI(href.to_s) unless href.nil? rescue nil
      end unless @base

      return nil if @base && @base.to_s.empty?
      @base
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      return nil if link.nil?

      # remove anchor
      # NOTE(review): URI.encode/URI.decode are obsolete and removed in
      # Ruby 3.0+; migrating needs care to preserve current behavior
      link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))

      relative = URI(link)
      absolute = base ? base.merge(relative) : @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      @domain_aliases ||= []
      uri.host == @url.host || @domain_aliases.include?(uri.host)
    end

    # Serialize the page to a plain Hash (headers are Marshal-dumped).
    def to_hash
      {
        'url' => @url.to_s,
        'headers' => Marshal.dump(@headers),
        'body' => @body,
        'links' => links.map(&:to_s),
        'code' => @code,
        'depth' => @depth,
        'referer' => @referer.to_s,
        'redirect_to' => @redirect_to.to_s,
        'response_time' => @response_time,
        'fetched' => @fetched,
        'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
        'fetched_at' => @fetched_at,
        'error' => @error.to_s
      }
    end

    # Serialize to JSON, dropping nil/empty entries.
    # Accepts (and ignores) the generator-state argument so the page can
    # be embedded in other structures passed to JSON.generate.
    def to_json(*_args)
      th = to_hash.dup
      th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
      th.delete('headers') if content_type.empty?
      th.to_json
    end

    #
    # Returns +true+ if page is marked as storeable
    # +false+ otherwise
    # Default is +true+
    #
    def storable?
      @storable
    end

    # Has the page been fetched more than +ttl+ seconds ago?
    # Returns +false+ when fetched_at is unknown.
    def expired?(ttl)
      return false if fetched_at.nil?
      (Time.now.to_i - ttl) > fetched_at
    end

    # Rebuild a Page from a Hash produced by #to_hash.
    def self.from_hash(hash)
      page = new(URI(hash['url']))
      {
        '@headers' => hash['headers'] && !hash['headers'].empty? ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
        '@body' => hash['body'],
        '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
        '@code' => hash['code'].to_i,
        '@depth' => hash['depth'].to_i,
        '@referer' => hash['referer'],
        '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
        '@response_time' => hash['response_time'].to_i,
        '@fetched' => hash['fetched'],
        '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
        '@fetched_at' => hash['fetched_at'],
        '@error' => hash['error']
      }.each do |var, value|
        page.instance_variable_set(var, value)
      end
      page
    end

    # Rebuild a Page from a JSON string produced by #to_json.
    def self.from_json(json)
      hash = JSON.parse json
      from_hash hash
    end
  end
end
|