sutch-anemone 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
data/lib/anemone/cli/url_list.rb
ADDED
@@ -0,0 +1,41 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+begin
+  # make sure that the last option is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  anemone url-list [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL, and outputs the URL of each page
+  in the domain as they are encountered.
+
+Options:
+  -r, --relative    Output relative URLs (rather than absolute)
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+
+end
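For reference, the script above is run through the anemone executable (data/bin/anemone); a hypothetical invocation, with a placeholder URL, looks like:

  # print every in-domain URL as a relative path
  anemone url-list -r http://example.com/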
data/lib/anemone/cookie_store.rb
ADDED
@@ -0,0 +1,35 @@
+require 'delegate'
+require 'webrick/cookie'
+
+class WEBrick::Cookie
+  def expired?
+    !!expires && expires < Time.now
+  end
+end
+
+module Anemone
+  class CookieStore < DelegateClass(Hash)
+
+    def initialize(cookies = nil)
+      @cookies = {}
+      cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
+      super(@cookies)
+    end
+
+    def merge!(set_cookie_str)
+      begin
+        cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
+          hash[cookie.name] = cookie if !!cookie
+          hash
+        end
+        @cookies.merge! cookie_hash
+      rescue
+      end
+    end
+
+    def to_s
+      @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
+    end
+
+  end
+end
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,339 @@
+require 'thread'
+require 'robotex'
+require 'anemone/tentacle'
+require 'anemone/page'
+require 'anemone/resource'
+require 'anemone/exceptions'
+require 'anemone/page_store'
+require 'anemone/storage'
+require 'anemone/storage/base'
+
+module Anemone
+
+  VERSION = '0.7.2';
+
+  #
+  # Convenience method to start a crawl
+  #
+  def Anemone.crawl(urls, options = {}, &block)
+    Core.crawl(urls, options, &block)
+  end
+
+  class Core
+
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages
+    # Hash of options for the crawl
+    attr_reader :opts
+
+    DEFAULT_OPTS = {
+      # run 4 Tentacle threads to fetch pages
+      :threads => 4,
+      # disable verbose output
+      :verbose => false,
+      # don't throw away the page response body after scanning it for links
+      :discard_page_bodies => false,
+      # identify self as Anemone/VERSION
+      :user_agent => "Anemone/#{Anemone::VERSION}",
+      # no delay between requests
+      :delay => 0,
+      # don't obey the robots exclusion protocol
+      :obey_robots_txt => false,
+      # by default, don't limit the depth of the crawl
+      :depth_limit => false,
+      # number of times HTTP redirects will be followed
+      :redirect_limit => 5,
+      # limit the size of the page queue to keep memory usage low
+      :page_queue_size_limit => nil,
+      # limit the size of the link queue to keep memory usage low
+      :link_queue_size_limit => nil,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil,
+      # Hash of cookie name => value to send with HTTP requests
+      :cookies => nil,
+      # accept cookies from the server and send them back?
+      :accept_cookies => false,
+      # skip any link with a query string? e.g. http://foo.com/?u=user
+      :skip_query_strings => false,
+      # proxy server hostname
+      :proxy_host => nil,
+      # proxy server port number
+      :proxy_port => false,
+      # HTTP read timeout in seconds
+      :read_timeout => nil,
+      # parse pages using Page class
+      :page_class => Anemone::Page,
+    }
+
+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |value|
+        @opts[key.to_sym] = value
+      end
+    end
+
+    #
+    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+    # and optional *block*
+    #
+    def initialize(urls, opts = {})
+      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+      @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+      @tentacles = []
+      @on_every_page_blocks = []
+      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+      @skip_link_patterns = []
+      @after_crawl_blocks = []
+      @opts = opts
+      @stop_crawl = false
+
+      yield self if block_given?
+    end
+
+    #
+    # Convenience method to start a new crawl
+    #
+    def self.crawl(urls, opts = {})
+      self.new(urls, opts) do |core|
+        yield core if block_given?
+        core.run
+      end
+    end
+
+    #
+    # Add a block to be executed on the PageStore after the crawl
+    # is finished
+    #
+    def after_crawl(&block)
+      @after_crawl_blocks << block
+      self
+    end
+
+    #
+    # Add one or more Regex patterns for URLs which should not be
+    # followed
+    #
+    def skip_links_like(*patterns)
+      @skip_link_patterns.concat [patterns].flatten.compact
+      self
+    end
+
+    #
+    # Add a block to be executed on every Page as they are encountered
+    # during the crawl
+    #
+    def on_every_page(&block)
+      @on_every_page_blocks << block
+      self
+    end
+
+    #
+    # Add a block to be executed on Page objects with a URL matching
+    # one or more patterns
+    #
+    def on_pages_like(*patterns, &block)
+      if patterns
+        patterns.each do |pattern|
+          @on_pages_like_blocks[pattern] << block
+        end
+      end
+      self
+    end
+
+    #
+    # Specify a block which will select which links to follow on each page.
+    # The block should return an Array of URI objects.
+    #
+    def focus_crawl(&block)
+      @focus_crawl_block = block
+      self
+    end
+
+    #
+    # Signals the crawler that it should stop the crawl before visiting the
+    # next page.
+    #
+    # This method is expected to be called within a page block, and it signals
+    # the crawler that it must stop after the current page is completely
+    # processed. All pages and links currently on queue are discarded.
+    #
+    def stop_crawl
+      @stop_crawl = true
+    end
+
+    #
+    # Perform the crawl
+    #
+    def run
+      process_options
+
+      @urls.delete_if { |url| !visit_link?(url) }
+      return if @urls.empty?
+
+      link_queue = build_queue(@opts[:link_queue_size_limit])
+      page_queue = build_queue(@opts[:page_queue_size_limit])
+
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+      end
+
+      @urls.each{ |url| link_queue.enq(url) }
+
+      loop do
+        page = page_queue.deq
+        @pages.touch_key page.url
+        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+        do_page_blocks page
+        page.discard_doc! if @opts[:discard_page_bodies]
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
+        end
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
+        if @stop_crawl
+          page_queue.clear
+          link_queue.clear
+        end
+
+        # if we are done with the crawl, tell the threads to end
+        if link_queue.empty? and page_queue.empty?
+          until link_queue.num_waiting == @tentacles.size
+            Thread.pass
+          end
+          if page_queue.empty? || @stop_crawl
+            @tentacles.size.times { link_queue << :END }
+            break
+          end
+        end
+      end
+
+      @tentacles.each { |thread| thread.join }
+      do_after_crawl_blocks
+      self
+    end
+
+    private
+
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
+      @opts[:threads] = 1 if @opts[:delay] > 0
+      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+      @pages = PageStore.new(storage, @opts)
+      @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+      freeze_options
+    end
+
+    #
+    # Freeze the opts Hash so that no options can be modified
+    # once the crawl begins
+    #
+    def freeze_options
+      @opts.freeze
+      @opts.each_key { |key| @opts[key].freeze }
+      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+    end
+
+    #
+    # Execute the after_crawl blocks
+    #
+    def do_after_crawl_blocks
+      @after_crawl_blocks.each { |block| block.call(@pages) }
+    end
+
+    #
+    # Execute the on_every_page blocks for *page*
+    #
+    def do_page_blocks(page)
+      @on_every_page_blocks.each do |block|
+        block.call(page)
+      end
+
+      @on_pages_like_blocks.each do |pattern, blocks|
+        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+      end
+    end
+
+    #
+    # Return an Array of links to follow from the given page.
+    # Based on whether or not the link has already been crawled,
+    # and the block given to focus_crawl()
+    #
+    def links_to_follow(page)
+      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+    end
+
+    #
+    # Returns +true+ if *link* has not been visited already,
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
+    #
+    def visit_link?(link, from_page = nil)
+      !@pages.has_page?(link) &&
+      !skip_link?(link) &&
+      !skip_query_string?(link) &&
+      allowed(link) &&
+      !too_deep?(from_page)
+    end
+
+    #
+    # Returns +true+ if we are obeying robots.txt and the link
+    # is granted access in it. Always returns +true+ when we are
+    # not obeying robots.txt.
+    #
+    def allowed(link)
+      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+    rescue
+      false
+    end
+
+    #
+    # Returns +true+ if we are over the page depth limit.
+    # This only works when coming from a page and with the +depth_limit+ option set.
+    # When neither is the case, will always return +false+.
+    def too_deep?(from_page)
+      if from_page && @opts[:depth_limit]
+        from_page.depth >= @opts[:depth_limit]
+      else
+        false
+      end
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because
+    # it has a query string and +skip_query_strings+ is true.
+    #
+    def skip_query_string?(link)
+      @opts[:skip_query_strings] && link.query
+    end
+
+    #
+    # Returns +true+ if *link* should not be visited because
+    # its URL matches a skip_link pattern.
+    #
+    def skip_link?(link)
+      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+    end
+
+    #
+    # Creates a new queue constrained to the given maximum size,
+    # or unconstrained if +size+ is not a positive integer.
+    #
+    def build_queue(size = nil)
+      if size.is_a?(Integer) && size > 0
+        SizedQueue.new(size)
+      else
+        Queue.new
+      end
+    end
+
+  end
+end
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,187 @@
+require 'net/https'
+require 'anemone/page'
+require 'anemone/cookie_store'
+
+module Anemone
+  class HTTP
+    # Maximum number of redirects to follow on each get_response
+    REDIRECT_LIMIT = 5
+
+    # CookieStore for this HTTP client
+    attr_reader :cookie_store
+
+    def initialize(opts = {})
+      @connections = {}
+      @opts = opts
+      @cookie_store = CookieStore.new(@opts[:cookies])
+    end
+
+    #
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
+    #
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
+      begin
+        url = URI(url) unless url.is_a?(URI)
+        pages = []
+        get(url, referer) do |response, code, location, redirect_to, response_time|
+          pages << @opts[:page_class].new(location, :body => response.body.dup,
+                                                    :code => code,
+                                                    :headers => response.to_hash,
+                                                    :referer => referer,
+                                                    :depth => depth,
+                                                    :redirect_to => redirect_to,
+                                                    :response_time => response_time)
+        end
+
+        return pages
+      rescue Exception => e
+        if verbose?
+          puts e.inspect
+          puts e.backtrace
+        end
+        return [@opts[:page_class].new(url, :error => e)]
+      end
+    end
+
+    #
+    # The maximum number of redirects to follow
+    #
+    def redirect_limit
+      @opts[:redirect_limit] || REDIRECT_LIMIT
+    end
+
+    #
+    # The user-agent string which will be sent with each request,
+    # or nil if no such option is set
+    #
+    def user_agent
+      @opts[:user_agent]
+    end
+
+    #
+    # Does this HTTP client accept cookies from the server?
+    #
+    def accept_cookies?
+      @opts[:accept_cookies]
+    end
+
+    #
+    # The proxy address string
+    #
+    def proxy_host
+      @opts[:proxy_host]
+    end
+
+    #
+    # The proxy port
+    #
+    def proxy_port
+      @opts[:proxy_port]
+    end
+
+    #
+    # HTTP read timeout in seconds
+    #
+    def read_timeout
+      @opts[:read_timeout]
+    end
+
+    private
+
+    #
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
+    #
+    def get(url, referer = nil)
+      limit = redirect_limit
+      loc = url
+      begin
+        # if redirected to a relative url, merge it with the host of the original
+        # request url
+        loc = url.merge(loc) if loc.relative?
+
+        response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+        yield response, code, loc, redirect_to, response_time
+        limit -= 1
+      end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+    end
+
+    #
+    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+    #
+    def get_response(url, referer = nil)
+      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+      opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+
+      retries = 0
+      begin
+        start = Time.now()
+        # format request
+        req = Net::HTTP::Get.new(full_path, opts)
+        # HTTP Basic authentication
+        req.basic_auth url.user, url.password if url.user
+        response = connection(url).request(req)
+        finish = Time.now()
+        response_time = ((finish - start) * 1000).round
+        @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
+        return response, response_time
+      rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+        puts e.inspect if verbose?
+        refresh_connection(url)
+        retries += 1
+        retry unless retries > 3
+      end
+    end
+
+    def connection(url)
+      @connections[url.host] ||= {}
+
+      if conn = @connections[url.host][url.port]
+        return conn
+      end
+
+      refresh_connection url
+    end
+
+    def refresh_connection(url)
+      http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
+
+      http.read_timeout = read_timeout if !!read_timeout
+
+      if url.scheme == 'https'
+        http.use_ssl = true
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+      end
+
+      @connections[url.host][url.port] = http.start
+    end
+
+    def verbose?
+      @opts[:verbose]
+    end
+
+    #
+    # Allowed to connect to the requested url?
+    #
+    def allowed?(to_url, from_url)
+      to_url.host.nil? || (to_url.host == from_url.host)
+    end
+
+  end
+end
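The client above can also be used outside of a full crawl. A hedged sketch follows: the URL is a placeholder, :page_class must be passed explicitly here because DEFAULT_OPTS is only merged in by Core, and page.code / page.response_time are assumed to be readers on Page (they are set via the options passed in fetch_pages):

  require 'anemone/http'

  http = Anemone::HTTP.new(:user_agent => 'Anemone/0.7.2',
                           :page_class => Anemone::Page,
                           :read_timeout => 10)
  page = http.fetch_page(URI('http://example.com/'))

  puts page.code           # status code of the final page after redirects
  puts page.response_time  # milliseconds, as measured in get_response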