medusa-crawler 1.0.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,305 @@
+ require 'thread'
+ require 'robotex'
+ require 'medusa/tentacle'
+ require 'medusa/page'
+ require 'medusa/exceptions'
+ require 'medusa/page_store'
+ require 'medusa/version'
+ require 'medusa/storage'
+ require 'medusa/storage/base'
+
+ module Medusa
+   #
+   # Convenience method to start a crawl
+   #
+   def Medusa.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+
+     # PageStore storing all Page objects encountered during the crawl
+     attr_reader :pages
+     # Hash of options for the crawl
+     attr_reader :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Medusa/VERSION
+       :user_agent => "Medusa/#{Medusa::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # storage engine defaults to an in-memory Moneta store in +process_options+ if none is specified
+       :storage => nil,
+       # clear the storage on every startup of the crawler
+       :clear_on_startup => true,
+       # Hash of cookie name => value to send with HTTP requests
+       :cookies => nil,
+       # accept cookies from the server and send them back?
+       :accept_cookies => false,
+       # skip any link with a query string? e.g. http://foo.com/?u=user
+       :skip_query_strings => false,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => nil
+     }.freeze
+
+     # Create setter methods for all options to be called from the crawl block
+     DEFAULT_OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @opts[key.to_sym] = value
+       end
+     end
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+       @opts = opts
+       @focus_crawl_block = nil
+
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageStore after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regexp patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       process_options
+
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts.dup).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+         @pages.touch_key page.url
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+         do_page_blocks page
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links = links_to_follow page
+         links.each do |link|
+           link_queue << [link, page.url.dup, page.depth + 1]
+         end
+         @pages.touch_keys links
+
+         @pages[page.url] = page
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+           if page_queue.empty?
+             @tentacles.size.times { link_queue << :END }
+             break
+           end
+         end
+       end
+
+       @tentacles.each { |thread| thread.join }
+       do_after_crawl_blocks
+       self
+     end
+
+     private
+
+     def process_options
+       @opts = DEFAULT_OPTS.merge @opts
+       @opts[:threads] = 1 if @opts[:delay] > 0
+       storage = Storage::Base.new(@opts[:storage] || Storage.Moneta(:Memory))
+       storage.clear if @opts[:clear_on_startup]
+       @pages = PageStore.new(storage)
+       @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+       freeze_options
+     end
+
+     #
+     # Freeze the opts Hash so that no options can be modified
+     # once the crawl begins
+     #
+     def freeze_options
+       @opts.freeze
+       @opts.each_key { |key| @opts[key].freeze }
+       @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |block| block.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |block|
+         block.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       !@pages.has_page?(link) &&
+       !skip_link?(link) &&
+       !skip_query_string?(link) &&
+       allowed(link) &&
+       !too_deep?(from_page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed(link)
+       @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+     rescue
+       false
+     end
+
+     #
+     # Returns +true+ if we are over the page depth limit.
+     # This only works when coming from a page and with the +depth_limit+ option set.
+     # When neither is the case, will always return +false+.
+     def too_deep?(from_page)
+       if from_page && @opts[:depth_limit]
+         from_page.depth >= @opts[:depth_limit]
+       else
+         false
+       end
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # it has a query string and +skip_query_strings+ is true.
+     #
+     def skip_query_string?(link)
+       @opts[:skip_query_strings] && link.query
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+
+   end
+ end
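
For orientation, the crawl API defined above is driven through Medusa.crawl together with the block-based configuration methods (skip_links_like, on_every_page, on_pages_like, focus_crawl, after_crawl). A minimal sketch of how they are typically wired together; the URL and patterns below are placeholders, not part of the package:

    require 'medusa'

    Medusa.crawl('http://www.example.com/', :depth_limit => 2, :obey_robots_txt => true) do |medusa|
      # option setters generated from DEFAULT_OPTS are also available inside the block
      medusa.threads = 2
      medusa.verbose = true

      # never follow links whose path matches one of these patterns
      medusa.skip_links_like(/\.pdf$/i, %r{/login})

      # called for every page pulled off the page queue
      medusa.on_every_page do |page|
        puts page.url
      end

      # called only for pages whose URL matches the given pattern
      medusa.on_pages_like(%r{/articles/}) do |page|
        # inspect the matched page here
      end

      # choose which of a page's links to enqueue; must return an Array of URIs
      medusa.focus_crawl do |page|
        page.links.reject { |uri| uri.query }
      end

      # receives the PageStore once the crawl has finished
      medusa.after_crawl do |pages|
        puts 'crawl finished'
      end
    end
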
@@ -0,0 +1,5 @@
+ module Medusa
+   class Error < ::StandardError
+     attr_accessor :wrapped_exception
+   end
+ end
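
Medusa::Error is the gem's base error class, and wrapped_exception lets the code that raises it attach the underlying lower-level exception. A caller might handle it along these lines (the handling shown is illustrative only):

    begin
      Medusa.crawl('http://www.example.com/')
    rescue Medusa::Error => e
      warn "crawl failed: #{e.message}"
      # the original lower-level error, when one was attached by the raiser
      warn "caused by: #{e.wrapped_exception.inspect}" if e.wrapped_exception
    end
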
@@ -0,0 +1,202 @@
+ require 'rubygems'
+ require 'medusa/page'
+ require 'medusa/cookie_store'
+
+ module Medusa
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+     RETRY_LIMIT = 6
+
+     # CookieStore for this HTTP client
+     attr_reader :cookie_store
+
+     def initialize(opts = {})
+       @opts = opts
+       @cookie_store = CookieStore.new(@opts[:cookies])
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, headers, code, location, redirect_to, response_time|
+           pages << Page.new(location, :body => response,
+                             :headers => headers,
+                             :code => code,
+                             :referer => referer,
+                             :depth => depth,
+                             :redirect_to => redirect_to,
+                             :response_time => response_time)
+         end
+
+         return pages
+       rescue Exception => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         pages ||= []
+         return pages << Page.new(url, :error => e)
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     #
+     # The HTTP basic authentication options, passed to open-uri as described at
+     # http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
+     # (embedding userinfo in the URL is deprecated per RFC 3986)
+     #
+     def http_basic_authentication
+       @opts[:http_basic_authentication]
+     end
+
+     #
+     # The proxy authentication options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
+     #
+     def proxy_http_basic_authentication
+       @opts[:proxy_http_basic_authentication]
+     end
+
+     #
+     # The proxy options as in http://www.ruby-doc.org/stdlib/libdoc/open-uri/rdoc/OpenURI/OpenRead.html
+     #
+     def proxy
+       @opts[:proxy]
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response body, headers, status code, URI location,
+     # redirect target, and response time for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, headers, response_time, response_code, redirect_to = get_response(loc, referer)
+
+         yield response, headers, Integer(response_code), loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Fetch the response for *url* via open-uri, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+       opts[:http_basic_authentication] = http_basic_authentication if http_basic_authentication
+       opts[:proxy] = proxy if proxy
+       opts[:proxy_http_basic_authentication] = proxy_http_basic_authentication if proxy_http_basic_authentication
+       opts[:read_timeout] = read_timeout if !!read_timeout
+       opts[:redirect] = false
+       redirect_to = nil
+       retries = 0
+       begin
+         start = Time.now()
+
+         begin
+           if Gem::Requirement.new('< 2.5').satisfied_by?(Gem::Version.new(RUBY_VERSION))
+             resource = open(url, opts)
+           else
+             resource = URI.open(url, opts)
+           end
+         rescue OpenURI::HTTPRedirect => e_redirect
+           resource = e_redirect.io
+           redirect_to = e_redirect.uri
+         rescue OpenURI::HTTPError => e_http
+           resource = e_http.io
+         end
+
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         @cookie_store.merge!(resource.meta['set-cookie']) if accept_cookies?
+         return resource.read, resource.meta, response_time, resource.status.shift, redirect_to
+
+       rescue Timeout::Error, EOFError, Errno::ECONNREFUSED, Errno::ETIMEDOUT, Errno::ECONNRESET => e
+         retries += 1
+         puts "[medusa] Retrying ##{retries} on url #{url} because of: #{e.inspect}" if verbose?
+         sleep(3 ^ retries)
+         retry unless retries > RETRY_LIMIT
+       ensure
+         resource.close if !resource.nil? && !resource.closed?
+       end
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+   end
+ end
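
The HTTP client above can also be exercised on its own, which makes it easy to see the open-uri options (user agent, cookies, redirect limit, read timeout) in isolation. A rough sketch, assuming the class has already been loaded through the gem (for example via require 'medusa') and using a placeholder URL:

    http = Medusa::HTTP.new(
      :user_agent     => 'Medusa example',
      :redirect_limit => 3,     # overrides REDIRECT_LIMIT for this client
      :read_timeout   => 10,    # seconds, passed through to open-uri
      :accept_cookies => true   # merge Set-Cookie response headers into the cookie store
    )

    page = http.fetch_page(URI('http://www.example.com/'))
    puts page.url
    puts http.cookie_store.to_s unless http.cookie_store.empty?
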