medusa-crawler 1.0.0.pre.1

@@ -0,0 +1,305 @@
+ require 'thread'
+ require 'robotex'
+ require 'medusa/tentacle'
+ require 'medusa/page'
+ require 'medusa/exceptions'
+ require 'medusa/page_store'
+ require 'medusa/version'
+ require 'medusa/storage'
+ require 'medusa/storage/base'
+
+ module Medusa
+   #
+   # Convenience method to start a crawl
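+   #
+   # A minimal illustrative example (the URL is a placeholder):
+   #
+   #   Medusa.crawl('http://www.example.com/') do |medusa|
+   #     medusa.on_every_page { |page| puts page.url }
+   #   end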
+   #
+   def Medusa.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+
+     # PageStore storing all Page objects encountered during the crawl
+     attr_reader :pages
+     # Hash of options for the crawl
+     attr_reader :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Medusa/VERSION
+       :user_agent => "Medusa/#{Medusa::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # storage engine defaults to an in-memory Moneta store in +process_options+ if none specified
+       :storage => nil,
+       # clear the storage on every startup of the crawler
+       :clear_on_startup => true,
+       # Hash of cookie name => value to send with HTTP requests
+       :cookies => nil,
+       # accept cookies from the server and send them back?
+       :accept_cookies => false,
+       # skip any link with a query string? e.g. http://foo.com/?u=user
+       :skip_query_strings => false,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => nil
+     }.freeze
+
+     # Create setter methods for all options to be called from the crawl block
+     DEFAULT_OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @opts[key.to_sym] = value
+       end
+     end
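+
+     # Each option above also has a setter that can be called from the crawl
+     # block before the crawl starts, for example (illustrative):
+     #
+     #   Medusa.crawl('http://www.example.com/') do |medusa|
+     #     medusa.threads = 2
+     #     medusa.obey_robots_txt = true
+     #   end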
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+       @opts = opts
+       @focus_crawl_block = nil
+
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageStore after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
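+
+     # For example (illustrative), skip archive pages and PDF links:
+     #
+     #   medusa.skip_links_like(%r{/archive/}, /\.pdf$/)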
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
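+
+     # For example (illustrative):
+     #
+     #   medusa.on_pages_like(%r{/articles/}) do |page|
+     #     puts page.url
+     #   end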
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
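+
+     # For example (illustrative), follow only links that stay on the same host:
+     #
+     #   medusa.focus_crawl do |page|
+     #     page.links.select { |uri| uri.host == page.url.host }
+     #   end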
+
+     #
+     # Perform the crawl
+     #
+     def run
+       process_options
+
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts.dup).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+         @pages.touch_key page.url
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+         do_page_blocks page
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links = links_to_follow page
+         links.each do |link|
+           link_queue << [link, page.url.dup, page.depth + 1]
+         end
+         @pages.touch_keys links
+
+         @pages[page.url] = page
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+           if page_queue.empty?
+             @tentacles.size.times { link_queue << :END }
+             break
+           end
+         end
+       end
+
+       @tentacles.each { |thread| thread.join }
+       do_after_crawl_blocks
+       self
+     end
+
+     private
+
+     def process_options
+       @opts = DEFAULT_OPTS.merge @opts
+       @opts[:threads] = 1 if @opts[:delay] > 0
+       storage = Storage::Base.new(@opts[:storage] || Storage.Moneta(:Memory))
+       storage.clear if @opts[:clear_on_startup]
+       @pages = PageStore.new(storage)
+       @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+       freeze_options
+     end
+
+     #
+     # Freeze the opts Hash so that no options can be modified
+     # once the crawl begins
+     #
+     def freeze_options
+       @opts.freeze
+       @opts.each_key { |key| @opts[key].freeze }
+       @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |block| block.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page and on_pages_like blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |block|
+         block.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page,
+     # based on whether the link has already been crawled
+     # and on the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       !@pages.has_page?(link) &&
+         !skip_link?(link) &&
+         !skip_query_string?(link) &&
+         allowed(link) &&
+         !too_deep?(from_page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed(link)
+       @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+     rescue
+       false
+     end
+
+     #
+     # Returns +true+ if we are over the page depth limit.
+     # This only works when coming from a page and with the +depth_limit+ option set;
+     # otherwise it always returns +false+.
+     #
+     def too_deep?(from_page)
+       if from_page && @opts[:depth_limit]
+         from_page.depth >= @opts[:depth_limit]
+       else
+         false
+       end
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # it has a query string and +skip_query_strings+ is true.
+     #
+     def skip_query_string?(link)
+       @opts[:skip_query_strings] && link.query
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+
+   end
+ end
@@ -0,0 +1,5 @@
+ module Medusa
+   class Error < ::StandardError
+     attr_accessor :wrapped_exception
+   end
+ end
@@ -0,0 +1,202 @@
+ require 'rubygems'
+ require 'medusa/page'
+ require 'medusa/cookie_store'
+
+ module Medusa
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+     RETRY_LIMIT = 6
+
+     # CookieStore for this HTTP client
+     attr_reader :cookie_store
+
+     def initialize(opts = {})
+       @opts = opts
+       @cookie_store = CookieStore.new(@opts[:cookies])
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
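+
+     # For example (illustrative; the URL is a placeholder):
+     #
+     #   http = Medusa::HTTP.new(:user_agent => 'Medusa example')
+     #   page = http.fetch_page('http://www.example.com/')
+     #   puts page.url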
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, headers, code, location, redirect_to, response_time|
+           pages << Page.new(location, :body => response,
+                                       :headers => headers,
+                                       :code => code,
+                                       :referer => referer,
+                                       :depth => depth,
+                                       :redirect_to => redirect_to,
+                                       :response_time => response_time)
+         end
+
+         return pages
+       rescue Exception => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         pages ||= []
+         return pages << Page.new(url, :error => e)
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     #
+     # The http authentication options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
+     # userinfo is deprecated [RFC3986]
+     #
+     def http_basic_authentication
+       @opts[:http_basic_authentication]
+     end
+
+     #
+     # The proxy authentication options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
+     #
+     def proxy_http_basic_authentication
+       @opts[:proxy_http_basic_authentication]
+     end
+
+     #
+     # The proxy options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
+     #
+     def proxy
+       @opts[:proxy]
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response body, headers, response code, URI location,
+     # redirect target (if any), and response time for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, headers, response_time, response_code, redirect_to = get_response(loc, referer)
+
+         yield response, headers, Integer(response_code), loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+       opts[:http_basic_authentication] = http_basic_authentication if http_basic_authentication
+       opts[:proxy] = proxy if proxy
+       opts[:proxy_http_basic_authentication] = proxy_http_basic_authentication if proxy_http_basic_authentication
+       opts[:read_timeout] = read_timeout if !!read_timeout
+       opts[:redirect] = false
+       redirect_to = nil
+       retries = 0
+       begin
+         start = Time.now()
+
+         begin
+           if Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
+             resource = open(url, opts)
+           else
+             resource = URI.open(url, opts)
+           end
+         rescue OpenURI::HTTPRedirect => e_redirect
+           resource = e_redirect.io
+           redirect_to = e_redirect.uri
+         rescue OpenURI::HTTPError => e_http
+           resource = e_http.io
+         end
+
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         @cookie_store.merge!(resource.meta['set-cookie']) if accept_cookies?
+         return resource.read, resource.meta, response_time, resource.status.shift, redirect_to
+
+       rescue Timeout::Error, EOFError, Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::ECONNRESET => e
+         retries += 1
+         puts "[medusa] Retrying ##{retries} on url #{url} because of: #{e.inspect}" if verbose?
+         sleep(3 ^ retries)
+         retry unless retries > RETRY_LIMIT
+       ensure
+         resource.close if !resource.nil? && !resource.closed?
+       end
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+   end
+ end