medusa-crawler 1.0.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- checksums.yaml.gz.sig +0 -0
- data.tar.gz.sig +0 -0
- data/CHANGELOG.md +20 -0
- data/CONTRIBUTORS.md +22 -0
- data/LICENSE.txt +20 -0
- data/README.md +48 -0
- data/Rakefile +24 -0
- data/VERSION +1 -0
- data/bin/medusa +4 -0
- data/lib/medusa.rb +2 -0
- data/lib/medusa/cli.rb +24 -0
- data/lib/medusa/cli/count.rb +22 -0
- data/lib/medusa/cli/cron.rb +90 -0
- data/lib/medusa/cli/pagedepth.rb +32 -0
- data/lib/medusa/cli/serialize.rb +35 -0
- data/lib/medusa/cli/url_list.rb +41 -0
- data/lib/medusa/cookie_store.rb +35 -0
- data/lib/medusa/core.rb +305 -0
- data/lib/medusa/exceptions.rb +5 -0
- data/lib/medusa/http.rb +202 -0
- data/lib/medusa/page.rb +229 -0
- data/lib/medusa/page_store.rb +160 -0
- data/lib/medusa/storage.rb +8 -0
- data/lib/medusa/storage/base.rb +81 -0
- data/lib/medusa/storage/exceptions.rb +15 -0
- data/lib/medusa/storage/moneta.rb +42 -0
- data/lib/medusa/tentacle.rb +39 -0
- data/lib/medusa/version.rb +3 -0
- data/spec/fakeweb_helper.rb +85 -0
- data/spec/medusa_helper.rb +5 -0
- data/spec/medusa_spec.rb +14 -0
- data/spec/spec_helper.rb +104 -0
- metadata +187 -0
- metadata.gz.sig +0 -0
data/lib/medusa/core.rb
ADDED
@@ -0,0 +1,305 @@
require 'thread'
require 'robotex'
require 'medusa/tentacle'
require 'medusa/page'
require 'medusa/exceptions'
require 'medusa/page_store'
require 'medusa/version'
require 'medusa/storage'
require 'medusa/storage/base'

module Medusa
  #
  # Convenience method to start a crawl
  #
  def Medusa.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core

    # PageStore storing all Page objects encountered during the crawl
    attr_reader :pages
    # Hash of options for the crawl
    attr_reader :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Medusa/VERSION
      :user_agent => "Medusa/#{Medusa::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # storage engine defaults to Hash in +process_options+ if none specified
      :storage => nil,
      # cleanups of the storage on every startup of the crawler
      :clear_on_startup => true,
      # Hash of cookie name => value to send with HTTP requests
      :cookies => nil,
      # accept cookies from the server and send them back?
      :accept_cookies => false,
      # skip any link with a query string? e.g. http://foo.com/?u=user
      :skip_query_strings => false,
      # proxy server hostname
      :proxy_host => nil,
      # proxy server port number
      :proxy_port => false,
      # HTTP read timeout in seconds
      :read_timeout => nil
    }.freeze

    # Create setter methods for all options to be called from the crawl block
    DEFAULT_OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @opts[key.to_sym] = value
      end
    end

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @opts = opts
      @focus_crawl_block = nil

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      process_options

      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts.dup).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq
        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]

        links = links_to_follow page
        links.each do |link|
          link_queue << [link, page.url.dup, page.depth + 1]
        end
        @pages.touch_keys links

        @pages[page.url] = page

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
          if page_queue.empty?
            @tentacles.size.times { link_queue << :END }
            break
          end
        end
      end

      @tentacles.each { |thread| thread.join }
      do_after_crawl_blocks
      self
    end

    private

    def process_options
      @opts = DEFAULT_OPTS.merge @opts
      @opts[:threads] = 1 if @opts[:delay] > 0
      storage = Storage::Base.new(@opts[:storage] || Storage.Moneta(:Memory))
      storage.clear if @opts[:clear_on_startup]
      @pages = PageStore.new(storage)
      @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]

      freeze_options
    end

    #
    # Freeze the opts Hash so that no options can be modified
    # once the crawl begins
    #
    def freeze_options
      @opts.freeze
      @opts.each_key { |key| @opts[key].freeze }
      @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |block| block.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |block|
        block.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blocks|
        blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      !@pages.has_page?(link) &&
      !skip_link?(link) &&
      !skip_query_string?(link) &&
      allowed(link) &&
      !too_deep?(from_page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed(link)
      @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
    rescue
      false
    end

    #
    # Returns +true+ if we are over the page depth limit.
    # This only works when coming from a page and with the +depth_limit+ option set.
    # When neither is the case, will always return +false+.
    def too_deep?(from_page)
      if from_page && @opts[:depth_limit]
        from_page.depth >= @opts[:depth_limit]
      else
        false
      end
    end

    #
    # Returns +true+ if *link* should not be visited because
    # it has a query string and +skip_query_strings+ is true.
    #
    def skip_query_string?(link)
      @opts[:skip_query_strings] && link.query
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
    end

  end
end
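The Core class above exposes a small block-based configuration API (skip_links_like, on_every_page, on_pages_like, focus_crawl, after_crawl) plus a generated setter for every DEFAULT_OPTS key. The following is a minimal usage sketch, not part of the gem: the start URL and patterns are placeholders, and it assumes only the API shown in core.rb (and that require 'medusa' loads the library via data/lib/medusa.rb).

require 'medusa'

Medusa.crawl('http://example.com/', :threads => 2, :depth_limit => 3) do |crawler|
  # never follow links into /private/ or to PDF files
  crawler.skip_links_like(/\/private\//, /\.pdf\z/)

  # only follow links that stay on the same host as the current page
  crawler.focus_crawl do |page|
    page.links.select { |link| link.host == page.url.host }
  end

  # print every page as it is fetched
  crawler.on_every_page do |page|
    puts page.url
  end

  # the PageStore of all crawled pages is yielded once the crawl finishes
  crawler.after_crawl do |pages|
    puts 'crawl finished'
  end
end

Because run merges DEFAULT_OPTS and then freezes the options Hash in freeze_options, options must be set via the opts argument or the generated setters inside this block, before the crawl starts.
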
data/lib/medusa/http.rb
ADDED
@@ -0,0 +1,202 @@
require 'rubygems'
require 'medusa/page'
require 'medusa/cookie_store'

module Medusa
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5
    RETRY_LIMIT = 6

    # CookieStore for this HTTP client
    attr_reader :cookie_store

    def initialize(opts = {})
      @opts = opts
      @cookie_store = CookieStore.new(@opts[:cookies])
    end

    #
    # Fetch a single Page from the response of an HTTP request to *url*.
    # Just gets the final destination page.
    #
    def fetch_page(url, referer = nil, depth = nil)
      fetch_pages(url, referer, depth).last
    end

    #
    # Create new Pages from the response of an HTTP request to *url*,
    # including redirects
    #
    def fetch_pages(url, referer = nil, depth = nil)
      begin
        url = URI(url) unless url.is_a?(URI)
        pages = []
        get(url, referer) do |response, headers, code, location, redirect_to, response_time|
          pages << Page.new(location, :body => response,
                                      :headers => headers,
                                      :code => code,
                                      :referer => referer,
                                      :depth => depth,
                                      :redirect_to => redirect_to,
                                      :response_time => response_time)
        end

        return pages
      rescue Exception => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
        pages ||= []
        return pages << Page.new(url, :error => e)
      end
    end

    #
    # The maximum number of redirects to follow
    #
    def redirect_limit
      @opts[:redirect_limit] || REDIRECT_LIMIT
    end

    #
    # The user-agent string which will be sent with each request,
    # or nil if no such option is set
    #
    def user_agent
      @opts[:user_agent]
    end

    #
    # Does this HTTP client accept cookies from the server?
    #
    def accept_cookies?
      @opts[:accept_cookies]
    end

    #
    # The http authentication options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
    # userinfo is deprecated [RFC3986]
    #
    def http_basic_authentication
      @opts[:http_basic_authentication]
    end

    #
    # The proxy authentication options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
    #
    def proxy_http_basic_authentication
      @opts[:proxy_http_basic_authentication]
    end

    #
    # The proxy options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
    #
    def proxy
      @opts[:proxy]
    end

    #
    # The proxy address string
    #
    def proxy_host
      @opts[:proxy_host]
    end

    #
    # The proxy port
    #
    def proxy_port
      @opts[:proxy_port]
    end

    #
    # HTTP read timeout in seconds
    #
    def read_timeout
      @opts[:read_timeout]
    end

    private

    #
    # Retrieve HTTP responses for *url*, including redirects.
    # Yields the response object, response code, and URI location
    # for each response.
    #
    def get(url, referer = nil)
      limit = redirect_limit
      loc = url
      begin
        # if redirected to a relative url, merge it with the host of the original
        # request url
        loc = url.merge(loc) if loc.relative?

        response, headers, response_time, response_code, redirect_to = get_response(loc, referer)

        yield response, headers, Integer(response_code), loc, redirect_to, response_time
        limit -= 1
      end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"

      opts = {}
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
      opts[:http_basic_authentication] = http_basic_authentication if http_basic_authentication
      opts[:proxy] = proxy if proxy
      opts[:proxy_http_basic_authentication] = proxy_http_basic_authentication if proxy_http_basic_authentication
      opts[:read_timeout] = read_timeout if !!read_timeout
      opts[:redirect] = false
      redirect_to = nil
      retries = 0
      begin
        start = Time.now()

        begin
          if Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
            resource = open(url, opts)
          else
            resource = URI.open(url, opts)
          end
        rescue OpenURI::HTTPRedirect => e_redirect
          resource = e_redirect.io
          redirect_to = e_redirect.uri
        rescue OpenURI::HTTPError => e_http
          resource = e_http.io
        end

        finish = Time.now()
        response_time = ((finish - start) * 1000).round
        @cookie_store.merge!(resource.meta['set-cookie']) if accept_cookies?
        return resource.read, resource.meta, response_time, resource.status.shift, redirect_to

      rescue Timeout::Error, EOFError, Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::ECONNRESET => e
        retries += 1
        puts "[medusa] Retrying ##{retries} on url #{url} because of: #{e.inspect}" if verbose?
        sleep(3 ^ retries)
        retry unless retries > RETRY_LIMIT
      ensure
        resource.close if !resource.nil? && !resource.closed?
      end
    end

    def verbose?
      @opts[:verbose]
    end

    #
    # Allowed to connect to the requested url?
    #
    def allowed?(to_url, from_url)
      to_url.host.nil? || (to_url.host == from_url.host)
    end
  end
end
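Core passes each Tentacle a copy of the options Hash, and this HTTP client is presumably what a Tentacle uses to fetch pages. Below is a minimal sketch of driving the client directly, outside of a crawl; it is not part of the gem, the URL and option values are placeholders, and it assumes that requiring 'medusa/http' (plus 'open-uri', which get_response relies on) is enough to load Page and CookieStore.

require 'open-uri'    # get_response uses open/URI.open; required here in case it is not loaded elsewhere
require 'medusa/http'

# Build a client with the same option keys Core would pass along.
http = Medusa::HTTP.new(:user_agent => 'Medusa example', :redirect_limit => 3, :verbose => true)

# fetch_pages returns one Page per HTTP response, including intermediate redirects;
# fetch_page returns only the final destination Page.
pages = http.fetch_pages('http://example.com/')
pages.each { |page| puts page.url }

Note that fetch_pages rescues all exceptions and returns a Page built with :error instead of raising, and that on connection errors (Timeout::Error, EOFError, Errno::ECONNREFUSED, and so on) get_response retries up to RETRY_LIMIT times before giving up.
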