rodneyc-anemone 0.7.1.1

@@ -0,0 +1,332 @@
+ require 'thread'
+ require 'robotex'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/exceptions'
+ require 'anemone/page_store'
+ require 'anemone/storage'
+ require 'anemone/storage/base'
+
+ module Anemone
+
+   VERSION = '0.7.1';
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+
+     # PageStore storing all Page objects encountered during the crawl
+     attr_reader :pages
+     # Hash of options for the crawl
+     attr_reader :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # storage engine defaults to Hash in +process_options+ if none specified
+       :storage => nil,
+       # Hash of cookie name => value to send with HTTP requests
+       :cookies => nil,
+       # accept cookies from the server and send them back?
+       :accept_cookies => false,
+       # skip any link with a query string? e.g. http://foo.com/?u=user
+       :skip_query_strings => false,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => nil
+     }
+
+     # Create setter methods for all options to be called from the crawl block
+     DEFAULT_OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @opts[key.to_sym] = value
+       end
+     end
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @on_every_page_blocks = []
+       @before_filter_urls = nil
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @include_link_patterns = []
+       @after_crawl_blocks = []
+       @opts = opts
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageStore after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should be
+     # followed; when any are given, only URLs matching at least
+     # one pattern will be followed
+     #
+     def include_links_like(*patterns)
+       @include_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block used to filter the links collected from each page
+     # before they are queued; the block is called with the Array of
+     # URLs and the current page depth, and should return the URLs
+     # to keep
+     #
+     def filter_urls(&block)
+       @before_filter_urls = block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       process_options
+
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+         @pages.touch_key page.url
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+         do_page_blocks page
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links = links_to_follow page
+         links = do_filter_urls(links, page.depth)
+         links.each do |link|
+           link_queue << [link, page.url.dup, page.depth + 1]
+         end
+         @pages.touch_keys links
+
+         @pages[page.url] = page
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+           if page_queue.empty?
+             @tentacles.size.times { link_queue << :END }
+             break
+           end
+         end
+       end
+
+       @tentacles.each { |thread| thread.join }
+       do_after_crawl_blocks
+       self
+     end
+
+     private
+
+     def process_options
+       @opts = DEFAULT_OPTS.merge @opts
+       @opts[:threads] = 1 if @opts[:delay] > 0
+       storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+       @pages = PageStore.new(storage)
+       @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+       freeze_options
+     end
+
+     #
+     # Freeze the opts Hash so that no options can be modified
+     # once the crawl begins
+     #
+     def freeze_options
+       @opts.freeze
+       @opts.each_key { |key| @opts[key].freeze }
+       @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |block| block.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page and on_pages_like blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |block|
+         block.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Run the filter_urls block (if one was given) over the candidate
+     # links for a page; without a block, return the links unchanged
+     #
+     def do_filter_urls(urls, depth)
+       return urls unless @before_filter_urls
+       @before_filter_urls.call(urls, depth)
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       !@pages.has_page?(link) &&
+       !skip_link?(link) &&
+       include_link?(link) &&
+       !skip_query_string?(link) &&
+       allowed(link) &&
+       !too_deep?(from_page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed(link)
+       @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+     rescue
+       false
+     end
+
+     #
+     # Returns +true+ if we are over the page depth limit.
+     # This only works when coming from a page and with the +depth_limit+ option set.
+     # When neither is the case, will always return +false+.
+     #
+     def too_deep?(from_page)
+       if from_page && @opts[:depth_limit]
+         from_page.depth >= @opts[:depth_limit]
+       else
+         false
+       end
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # it has a query string and +skip_query_strings+ is true.
+     #
+     def skip_query_string?(link)
+       @opts[:skip_query_strings] && link.query
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+
+     #
+     # Returns +true+ if *link* should be visited because
+     # its URL matches an include_link pattern.
+     #
+     def include_link?(link)
+       return true if @include_link_patterns.empty?
+       @include_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+   end
+ end
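
The Core class above is normally driven through Anemone.crawl. The following is a minimal usage sketch based only on the API shown above; the start URL, the skip patterns, and the block bodies are illustrative assumptions, not part of the gem:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # option setters generated from DEFAULT_OPTS
      anemone.threads = 2
      anemone.depth_limit = 3
      anemone.verbose = true

      # skip URLs whose path matches these (illustrative) patterns
      anemone.skip_links_like(/\.pdf$/i, %r{/login})

      # run for every page as it is crawled
      anemone.on_every_page do |page|
        puts "#{page.code} #{page.url}"
      end

      # fork-specific filter_urls hook: trim the link list before it is queued
      anemone.filter_urls do |urls, depth|
        urls.reject { |u| u.to_s.include?('logout') }
      end

      # run on the PageStore once the crawl finishes
      anemone.after_crawl do |pages|
        puts "crawled #{pages.size} pages"
      end
    end

The crawl block is yielded the Core instance before run is called, which is why the DEFAULT_OPTS-derived setters and the registration methods (on_every_page, skip_links_like, filter_urls, after_crawl) can all be configured inside it.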
@@ -0,0 +1,5 @@
+ module Anemone
+   class Error < ::StandardError
+     attr_accessor :wrapped_exception
+   end
+ end
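
Anemone::Error is a thin StandardError subclass whose wrapped_exception accessor lets callers carry along the underlying error. A hedged sketch of how calling code might use it; the IOError and messages below are illustrative, not gem code:

    require 'anemone/exceptions'

    begin
      raise IOError, "connection reset by peer"
    rescue IOError => e
      wrapped = Anemone::Error.new("failed to fetch page")
      wrapped.wrapped_exception = e   # keep the original error for callers to inspect
      raise wrapped
    end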
@@ -0,0 +1,187 @@
+ require 'net/https'
+ require 'anemone/page'
+ require 'anemone/cookie_store'
+
+ module Anemone
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+
+     # CookieStore for this HTTP client
+     attr_reader :cookie_store
+
+     def initialize(opts = {})
+       @connections = {}
+       @opts = opts
+       @cookie_store = CookieStore.new(@opts[:cookies])
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, code, location, redirect_to, response_time|
+           pages << Page.new(location, :body => response.body.dup,
+                                       :code => code,
+                                       :headers => response.to_hash,
+                                       :referer => referer,
+                                       :depth => depth,
+                                       :redirect_to => redirect_to,
+                                       :response_time => response_time)
+         end
+
+         return pages
+       rescue Exception => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         return [Page.new(url, :error => e)]
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response object, response code, URI location,
+     # redirect target, and response time for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, response_time = get_response(loc, referer)
+         code = Integer(response.code)
+         redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+         yield response, code, loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+
+       retries = 0
+       begin
+         start = Time.now()
+         # format request
+         req = Net::HTTP::Get.new(full_path, opts)
+         # HTTP Basic authentication
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
+         return response, response_time
+       rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+         puts e.inspect if verbose?
+         refresh_connection(url)
+         retries += 1
+         retry unless retries > 3
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+
+       if conn = @connections[url.host][url.port]
+         return conn
+       end
+
+       refresh_connection url
+     end
+
+     def refresh_connection(url)
+       http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
+
+       http.read_timeout = read_timeout if !!read_timeout
+
+       if url.scheme == 'https'
+         http.use_ssl = true
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+
+       @connections[url.host][url.port] = http.start
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+
+   end
+ end
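
The HTTP client above is normally driven internally by Tentacle, but it can be exercised on its own. A minimal sketch, assuming a reachable URL (example.com is illustrative) and the option keys shown in DEFAULT_OPTS:

    require 'anemone/http'

    http = Anemone::HTTP.new(:user_agent => "Anemone/0.7.1",
                             :accept_cookies => true,
                             :read_timeout => 10)

    # fetch_page follows redirects (up to redirect_limit) and returns the final Page
    page = http.fetch_page(URI("http://www.example.com/"))
    puts "#{page.code} #{page.url} (#{page.response_time}ms)"
    puts "error: #{page.error.inspect}" if page.error

Because fetch_pages rescues exceptions and returns a Page constructed with :error, a failed request still yields a Page object rather than raising, which is why the example checks page.error instead of wrapping the call in a rescue.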