sutch-anemone 0.7.2

@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as it is encountered.
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
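For reference, a minimal sketch of driving the same crawl from Ruby instead of the url-list command line; the target URL is a placeholder and the gem is assumed to be installed and loadable as 'anemone':

    require 'anemone'

    # Print every in-domain URL as it is crawled, mirroring `anemone url-list -r <url>`.
    Anemone.crawl('http://example.com/', :discard_page_bodies => true) do |anemone|
      anemone.on_every_page do |page|
        puts page.url.path   # relative output; use page.url for absolute URLs
      end
    end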
@@ -0,0 +1,35 @@
+ require 'delegate'
+ require 'webrick/cookie'
+
+ class WEBrick::Cookie
+   def expired?
+     !!expires && expires < Time.now
+   end
+ end
+
+ module Anemone
+   class CookieStore < DelegateClass(Hash)
+
+     def initialize(cookies = nil)
+       @cookies = {}
+       cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
+       super(@cookies)
+     end
+
+     def merge!(set_cookie_str)
+       begin
+         cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
+           hash[cookie.name] = cookie if !!cookie
+           hash
+         end
+         @cookies.merge! cookie_hash
+       rescue
+       end
+     end
+
+     def to_s
+       @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
+     end
+
+   end
+ end
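A short sketch of how Anemone::CookieStore is meant to be used, based only on the methods above; the cookie names, values, and Set-Cookie string are made up for illustration:

    require 'anemone/cookie_store'

    # Seed the store from a name => value Hash, as Anemone::HTTP does with the :cookies option.
    store = Anemone::CookieStore.new('session' => 'abc123')

    # Fold in a server's Set-Cookie header; parse failures are silently swallowed by merge!.
    store.merge!('lang=en; path=/')

    # Serialize the unexpired cookies for a Cookie request header.
    puts store.to_s   # e.g. "session=abc123;lang=en"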
@@ -0,0 +1,339 @@
+ require 'thread'
+ require 'robotex'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/resource'
+ require 'anemone/exceptions'
+ require 'anemone/page_store'
+ require 'anemone/storage'
+ require 'anemone/storage/base'
+
+ module Anemone
+
+   VERSION = '0.7.2'
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+
+     # PageStore storing all Page objects encountered during the crawl
+     attr_reader :pages
+     # Hash of options for the crawl
+     attr_reader :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # limit the size of the page queue to keep memory usage low
+       :page_queue_size_limit => nil,
+       # limit the size of the link queue to keep memory usage low
+       :link_queue_size_limit => nil,
+       # storage engine defaults to Hash in +process_options+ if none specified
+       :storage => nil,
+       # Hash of cookie name => value to send with HTTP requests
+       :cookies => nil,
+       # accept cookies from the server and send them back?
+       :accept_cookies => false,
+       # skip any link with a query string? e.g. http://foo.com/?u=user
+       :skip_query_strings => false,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => nil,
+       # parse pages using Page class
+       :page_class => Anemone::Page,
+     }
+
+     # Create setter methods for all options to be called from the crawl block
+     DEFAULT_OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @opts[key.to_sym] = value
+       end
+     end
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+       @opts = opts
+       @stop_crawl = false
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageStore after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Signals the crawler that it should stop the crawl before visiting the
+     # next page.
+     #
+     # This method is expected to be called within a page block, and it signals
+     # the crawler that it must stop after the current page is completely
+     # processed. All pages and links currently on the queue are discarded.
+     #
+     def stop_crawl
+       @stop_crawl = true
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       process_options
+
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = build_queue(@opts[:link_queue_size_limit])
+       page_queue = build_queue(@opts[:page_queue_size_limit])
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+         @pages.touch_key page.url
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+         do_page_blocks page
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links = links_to_follow page
+         links.each do |link|
+           link_queue << [link, page.url.dup, page.depth + 1]
+         end
+         @pages.touch_keys links
+
+         @pages[page.url] = page
+
+         if @stop_crawl
+           page_queue.clear
+           link_queue.clear
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+           if page_queue.empty? || @stop_crawl
+             @tentacles.size.times { link_queue << :END }
+             break
+           end
+         end
+       end
+
+       @tentacles.each { |thread| thread.join }
+       do_after_crawl_blocks
+       self
+     end
+
+     private
+
+     def process_options
+       @opts = DEFAULT_OPTS.merge @opts
+       @opts[:threads] = 1 if @opts[:delay] > 0
+       storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+       @pages = PageStore.new(storage, @opts)
+       @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+       freeze_options
+     end
+
+     #
+     # Freeze the opts Hash so that no options can be modified
+     # once the crawl begins
+     #
+     def freeze_options
+       @opts.freeze
+       @opts.each_key { |key| @opts[key].freeze }
+       @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |block| block.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |block|
+         block.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern,
+     # and is not excluded by robots.txt,
+     # and is not deeper than the depth limit.
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       !@pages.has_page?(link) &&
+       !skip_link?(link) &&
+       !skip_query_string?(link) &&
+       allowed(link) &&
+       !too_deep?(from_page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed(link)
+       @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+     rescue
+       false
+     end
+
+     #
+     # Returns +true+ if we are over the page depth limit.
+     # This only works when coming from a page and with the +depth_limit+ option set.
+     # When neither is the case, will always return +false+.
+     #
+     def too_deep?(from_page)
+       if from_page && @opts[:depth_limit]
+         from_page.depth >= @opts[:depth_limit]
+       else
+         false
+       end
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # it has a query string and +skip_query_strings+ is true.
+     #
+     def skip_query_string?(link)
+       @opts[:skip_query_strings] && link.query
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+
+     #
+     # Creates a new queue constrained to the given maximum size,
+     # or unconstrained if +size+ is not a positive integer.
+     #
+     def build_queue(size = nil)
+       if size.is_a?(Integer) && size > 0
+         SizedQueue.new(size)
+       else
+         Queue.new
+       end
+     end
+
+   end
+ end
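To show how the options and callbacks above fit together, here is a hedged usage sketch of the Core DSL; the site URL, skip patterns, and article path are placeholders, and the option values are arbitrary:

    require 'anemone'

    Anemone.crawl('http://example.com/', :depth_limit => 3, :obey_robots_txt => true) do |anemone|
      # Setters generated from DEFAULT_OPTS may also be called inside the block.
      anemone.delay = 1   # a positive delay forces :threads to 1 in process_options

      # Never follow links whose path matches these patterns.
      anemone.skip_links_like(/logout/, /calendar/)

      # Run a callback only on pages whose URL matches a pattern.
      anemone.on_pages_like(%r{/articles/}) do |page|
        puts "article: #{page.url}"
      end

      # Restrict which links get queued from each page.
      anemone.focus_crawl { |page| page.links.first(10) }

      # The PageStore of everything crawled is handed to after_crawl blocks.
      anemone.after_crawl { |page_store| puts 'crawl finished' }
    end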
@@ -0,0 +1,5 @@
+ module Anemone
+   class Error < ::StandardError
+     attr_accessor :wrapped_exception
+   end
+ end
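Anemone::Error above is a bare StandardError subclass with a wrapped_exception accessor; the wrapping pattern below is only an illustration of how a caller could use it, not something these files do themselves:

    require 'anemone/exceptions'

    begin
      begin
        raise IOError, 'connection reset'
      rescue IOError => e
        wrapped = Anemone::Error.new('fetch failed')
        wrapped.wrapped_exception = e   # keep the original error for later inspection
        raise wrapped
      end
    rescue Anemone::Error => err
      puts "#{err.message} (caused by #{err.wrapped_exception.class})"
    end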
@@ -0,0 +1,187 @@
+ require 'net/https'
+ require 'anemone/page'
+ require 'anemone/cookie_store'
+
+ module Anemone
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+
+     # CookieStore for this HTTP client
+     attr_reader :cookie_store
+
+     def initialize(opts = {})
+       @connections = {}
+       @opts = opts
+       @cookie_store = CookieStore.new(@opts[:cookies])
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, code, location, redirect_to, response_time|
+           pages << @opts[:page_class].new(location, :body => response.body.dup,
+                                                     :code => code,
+                                                     :headers => response.to_hash,
+                                                     :referer => referer,
+                                                     :depth => depth,
+                                                     :redirect_to => redirect_to,
+                                                     :response_time => response_time)
+         end
+
+         return pages
+       rescue Exception => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         return [@opts[:page_class].new(url, :error => e)]
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response object, response code, and URI location
+     # for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, response_time = get_response(loc, referer)
+         code = Integer(response.code)
+         redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+         yield response, code, loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+
+       retries = 0
+       begin
+         start = Time.now()
+         # format request
+         req = Net::HTTP::Get.new(full_path, opts)
+         # HTTP Basic authentication
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
+         return response, response_time
+       rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+         puts e.inspect if verbose?
+         refresh_connection(url)
+         retries += 1
+         retry unless retries > 3
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+
+       if conn = @connections[url.host][url.port]
+         return conn
+       end
+
+       refresh_connection url
+     end
+
+     def refresh_connection(url)
+       http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
+
+       http.read_timeout = read_timeout if !!read_timeout
+
+       if url.scheme == 'https'
+         http.use_ssl = true
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+
+       @connections[url.host][url.port] = http.start
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+
+   end
+ end
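A sketch of exercising Anemone::HTTP on its own. During a normal crawl the :page_class option is filled in by Core's DEFAULT_OPTS, so it has to be passed explicitly here; the URL and option values are placeholders:

    require 'anemone/http'

    http = Anemone::HTTP.new(:user_agent => 'Anemone test',
                             :read_timeout => 10,
                             :accept_cookies => true,
                             :page_class => Anemone::Page)

    # fetch_page follows redirects (up to redirect_limit) and returns the final Page.
    page = http.fetch_page(URI('http://example.com/'))
    puts page.code
    puts http.cookie_store.to_s   # cookies accepted from Set-Cookie headers, if any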