sutch-anemone 0.7.2
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
data/lib/anemone/cli/url_list.rb
ADDED
@@ -0,0 +1,41 @@
require 'anemone'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.relative = false

begin
  # make sure that the last option is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
  anemone url-list [options] <url>

Synopsis:
  Crawls a site starting at the given URL, and outputs the URL of each page
  in the domain as they are encountered.

Options:
  -r, --relative    Output relative URLs (rather than absolute)
INFO
  exit(0)
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(root, :discard_page_bodies => true) do |anemone|

  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end

end
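As a usage note: the script takes the crawl URL as its final argument and is presumably dispatched through data/bin/anemone and data/lib/anemone/cli.rb, both added in this gem. Per the Usage text above, an invocation would look like (domain illustrative):

  anemone url-list -r http://example.com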
data/lib/anemone/cookie_store.rb
ADDED
@@ -0,0 +1,35 @@
require 'delegate'
require 'webrick/cookie'

class WEBrick::Cookie
  def expired?
    !!expires && expires < Time.now
  end
end

module Anemone
  class CookieStore < DelegateClass(Hash)

    def initialize(cookies = nil)
      @cookies = {}
      cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
      super(@cookies)
    end

    def merge!(set_cookie_str)
      begin
        cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
          hash[cookie.name] = cookie if !!cookie
          hash
        end
        @cookies.merge! cookie_hash
      rescue
      end
    end

    def to_s
      @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
    end

  end
end
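A minimal sketch of how this class behaves, using only the methods above (cookie names and values are illustrative):

  store = Anemone::CookieStore.new('session' => 'abc123')
  store.merge!('lang=en; path=/')  # parsed via WEBrick::Cookie.parse_set_cookies
  store.to_s                       # => "session=abc123;lang=en" (expired cookies are dropped)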
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,339 @@
require 'thread'
require 'robotex'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/resource'
require 'anemone/exceptions'
require 'anemone/page_store'
require 'anemone/storage'
require 'anemone/storage/base'

module Anemone

  VERSION = '0.7.2'

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core

    # PageStore storing all Page objects encountered during the crawl
    attr_reader :pages
    # Hash of options for the crawl
    attr_reader :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # limit the size of the page queue to keep memory usage low
      :page_queue_size_limit => nil,
      # limit the size of the link queue to keep memory usage low
      :link_queue_size_limit => nil,
      # storage engine defaults to Hash in +process_options+ if none specified
      :storage => nil,
      # Hash of cookie name => value to send with HTTP requests
      :cookies => nil,
      # accept cookies from the server and send them back?
      :accept_cookies => false,
      # skip any link with a query string? e.g. http://foo.com/?u=user
      :skip_query_strings => false,
      # proxy server hostname
      :proxy_host => nil,
      # proxy server port number
      :proxy_port => false,
      # HTTP read timeout in seconds
      :read_timeout => nil,
      # parse pages using Page class
      :page_class => Anemone::Page,
    }

    # Create setter methods for all options to be called from the crawl block
    DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @opts[key.to_sym] = value
      end
    end

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @opts = opts
      @stop_crawl = false

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regexp patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Signals the crawler that it should stop the crawl before visiting the
    # next page.
    #
    # This method is expected to be called within a page block, and it signals
    # the crawler that it must stop after the current page is completely
    # processed. All pages and links currently in the queue are discarded.
    #
    def stop_crawl
      @stop_crawl = true
    end

    #
    # Perform the crawl
    #
    def run
      process_options

      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = build_queue(@opts[:link_queue_size_limit])
      page_queue = build_queue(@opts[:page_queue_size_limit])

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq
        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]

        links = links_to_follow page
        links.each do |link|
          link_queue << [link, page.url.dup, page.depth + 1]
        end
        @pages.touch_keys links

        @pages[page.url] = page

        if @stop_crawl
          page_queue.clear
          link_queue.clear
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
          if page_queue.empty? || @stop_crawl
            @tentacles.size.times { link_queue << :END }
            break
          end
        end
      end

      @tentacles.each { |thread| thread.join }
      do_after_crawl_blocks
      self
    end

    private

    def process_options
      @opts = DEFAULT_OPTS.merge @opts
      @opts[:threads] = 1 if @opts[:delay] > 0
      storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
      @pages = PageStore.new(storage, @opts)
      @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

      freeze_options
    end

    #
    # Freeze the opts Hash so that no options can be modified
    # once the crawl begins
    #
    def freeze_options
      @opts.freeze
      @opts.each_key { |key| @opts[key].freeze }
      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |block| block.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |block|
        block.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blocks|
        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      !@pages.has_page?(link) &&
      !skip_link?(link) &&
      !skip_query_string?(link) &&
      allowed(link) &&
      !too_deep?(from_page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed(link)
      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
    rescue
      false
    end

    #
    # Returns +true+ if we are over the page depth limit.
    # This only works when coming from a page and with the +depth_limit+ option set.
    # When neither is the case, will always return +false+.
    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
        from_page.depth >= @opts[:depth_limit]
      else
        false
      end
    end

    #
    # Returns +true+ if *link* should not be visited because
    # it has a query string and +skip_query_strings+ is true.
    #
    def skip_query_string?(link)
      @opts[:skip_query_strings] && link.query
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
    end

    #
    # Creates a new queue constrained to the given maximum size,
    # or unconstrained if +size+ is not a positive integer.
    #
    def build_queue(size = nil)
      if size.is_a?(Integer) && size > 0
        SizedQueue.new(size)
      else
        Queue.new
      end
    end

  end
end
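For orientation, the public methods above compose into the crawl DSL. A minimal sketch using only what this file defines (the URL, pattern, and option values are illustrative):

  require 'anemone'

  Anemone.crawl('http://example.com/', :threads => 2) do |anemone|
    anemone.depth_limit = 3                    # setter generated from DEFAULT_OPTS.keys
    anemone.skip_links_like(/private/)         # Regexp matched against link.path
    anemone.on_every_page { |page| puts page.url }
    anemone.after_crawl   { |pages| puts 'done' }  # receives the PageStore
  end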
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,187 @@
require 'net/https'
require 'anemone/page'
require 'anemone/cookie_store'

module Anemone
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5

    # CookieStore for this HTTP client
    attr_reader :cookie_store

    def initialize(opts = {})
      @connections = {}
      @opts = opts
      @cookie_store = CookieStore.new(@opts[:cookies])
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects
    #
    def fetch_pages(url, referer = nil, depth = nil)
      begin
        url = URI(url) unless url.is_a?(URI)
        pages = []
        get(url, referer) do |response, code, location, redirect_to, response_time|
          pages << @opts[:page_class].new(location, :body => response.body.dup,
                                                    :code => code,
                                                    :headers => response.to_hash,
                                                    :referer => referer,
                                                    :depth => depth,
                                                    :redirect_to => redirect_to,
                                                    :response_time => response_time)
        end

        return pages
      rescue Exception => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
        return [@opts[:page_class].new(url, :error => e)]
      end
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      @opts[:user_agent]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    #
    # The proxy address string
    #
    def proxy_host
      @opts[:proxy_host]
    end

    #
    # The proxy port
    #
    def proxy_port
      @opts[:proxy_port]
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, and URI location
    # for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      begin
        # if redirected to a relative url, merge it with the host of the original
        # request url
        loc = url.merge(loc) if loc.relative?

        response, response_time = get_response(loc, referer)
        code = Integer(response.code)
        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
        yield response, code, loc, redirect_to, response_time
        limit -= 1
      end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)

      retries = 0
      begin
        start = Time.now()
        # format request
        req = Net::HTTP::Get.new(full_path, opts)
        # HTTP Basic authentication
        req.basic_auth url.user, url.password if url.user
        response = connection(url).request(req)
        finish = Time.now()
        response_time = ((finish - start) * 1000).round
        @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
        return response, response_time
      rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
        puts e.inspect if verbose?
        refresh_connection(url)
        retries += 1
        retry unless retries > 3
      end
    end

    def connection(url)
      @connections[url.host] ||= {}

      if conn = @connections[url.host][url.port]
        return conn
      end

      refresh_connection url
    end

    def refresh_connection(url)
      http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)

      http.read_timeout = read_timeout if !!read_timeout

      if url.scheme == 'https'
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      @connections[url.host][url.port] = http.start
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end

  end
end
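Inside a crawl, HTTP receives the full options hash (so :page_class is always set); used standalone it must be supplied explicitly, since fetch_pages calls @opts[:page_class].new. A hedged sketch (URL illustrative; assumes Page exposes the :code it was constructed with):

  http = Anemone::HTTP.new(:page_class => Anemone::Page, :accept_cookies => true)
  page = http.fetch_page(URI('http://example.com/'))  # final page after redirects
  puts page.code                                      # HTTP status of that page
  puts http.cookie_store.to_s                         # any cookies the server set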