sutch-anemone 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as they are encountered.
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
@@ -0,0 +1,35 @@
+ require 'delegate'
+ require 'webrick/cookie'
+
+ class WEBrick::Cookie
+   def expired?
+     !!expires && expires < Time.now
+   end
+ end
+
+ module Anemone
+   class CookieStore < DelegateClass(Hash)
+
+     def initialize(cookies = nil)
+       @cookies = {}
+       cookies.each { |name, value| @cookies[name] = WEBrick::Cookie.new(name, value) } if cookies
+       super(@cookies)
+     end
+
+     def merge!(set_cookie_str)
+       begin
+         cookie_hash = WEBrick::Cookie.parse_set_cookies(set_cookie_str).inject({}) do |hash, cookie|
+           hash[cookie.name] = cookie if !!cookie
+           hash
+         end
+         @cookies.merge! cookie_hash
+       rescue
+       end
+     end
+
+     def to_s
+       @cookies.values.reject { |cookie| cookie.expired? }.map { |cookie| "#{cookie.name}=#{cookie.value}" }.join(';')
+     end
+
+   end
+ end
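
Anemone::CookieStore above is a thin DelegateClass(Hash) wrapper around WEBrick::Cookie objects keyed by name: #merge! folds a Set-Cookie response header into the store (silently ignoring parse errors), and #to_s renders a Cookie request header while dropping expired cookies. The sketch below is illustrative only and is not part of the package; the cookie names and values are invented.

    require 'anemone/cookie_store'

    # Seed the store from a name => value Hash, as Anemone::HTTP does with
    # the :cookies crawl option.
    store = Anemone::CookieStore.new('session' => 'abc123')

    # Fold a Set-Cookie response header into the store; the already-expired
    # cookie stays in the hash but is filtered out by #to_s.
    store.merge!('tracking=xyz; path=/; expires=Thu, 01 Jan 1970 00:00:00 GMT')

    puts store.to_s   # => "session=abc123"
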
@@ -0,0 +1,339 @@
+ require 'thread'
+ require 'robotex'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/resource'
+ require 'anemone/exceptions'
+ require 'anemone/page_store'
+ require 'anemone/storage'
+ require 'anemone/storage/base'
+
+ module Anemone
+
+   VERSION = '0.7.2'
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+
+     # PageStore storing all Page objects encountered during the crawl
+     attr_reader :pages
+     # Hash of options for the crawl
+     attr_reader :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # limit the size of the page queue to keep memory usage low
+       :page_queue_size_limit => nil,
+       # limit the size of the link queue to keep memory usage low
+       :link_queue_size_limit => nil,
+       # storage engine defaults to Hash in +process_options+ if none specified
+       :storage => nil,
+       # Hash of cookie name => value to send with HTTP requests
+       :cookies => nil,
+       # accept cookies from the server and send them back?
+       :accept_cookies => false,
+       # skip any link with a query string? e.g. http://foo.com/?u=user
+       :skip_query_strings => false,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => nil,
+       # parse pages using Page class
+       :page_class => Anemone::Page,
+     }
+
+     # Create setter methods for all options to be called from the crawl block
+     DEFAULT_OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @opts[key.to_sym] = value
+       end
+     end
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each { |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+       @opts = opts
+       @stop_crawl = false
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageStore after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regexp patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Signals the crawler that it should stop the crawl before visiting the
+     # next page.
+     #
+     # This method is expected to be called from within a page block; it signals
+     # the crawler that it must stop once the current page has been completely
+     # processed. All pages and links currently in the queue are discarded.
+     #
+     def stop_crawl
+       @stop_crawl = true
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       process_options
+
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = build_queue(@opts[:link_queue_size_limit])
+       page_queue = build_queue(@opts[:page_queue_size_limit])
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each { |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+         @pages.touch_key page.url
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+         do_page_blocks page
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links = links_to_follow page
+         links.each do |link|
+           link_queue << [link, page.url.dup, page.depth + 1]
+         end
+         @pages.touch_keys links
+
+         @pages[page.url] = page
+
+         if @stop_crawl
+           page_queue.clear
+           link_queue.clear
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+           if page_queue.empty? || @stop_crawl
+             @tentacles.size.times { link_queue << :END }
+             break
+           end
+         end
+       end
+
+       @tentacles.each { |thread| thread.join }
+       do_after_crawl_blocks
+       self
+     end
+
+     private
+
+     def process_options
+       @opts = DEFAULT_OPTS.merge @opts
+       @opts[:threads] = 1 if @opts[:delay] > 0
+       storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
+       @pages = PageStore.new(storage, @opts)
+       @robots = Robotex.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+
+       freeze_options
+     end
+
+     #
+     # Freeze the opts Hash so that no options can be modified
+     # once the crawl begins
+     #
+     def freeze_options
+       @opts.freeze
+       @opts.each_key { |key| @opts[key].freeze }
+       @opts[:cookies].each_key { |key| @opts[:cookies][key].freeze } rescue nil
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each { |block| block.call(@pages) }
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |block|
+         block.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blocks|
+         blocks.each { |block| block.call(page) } if page.url.to_s =~ pattern
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       !@pages.has_page?(link) &&
+       !skip_link?(link) &&
+       !skip_query_string?(link) &&
+       allowed(link) &&
+       !too_deep?(from_page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed(link)
+       @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+     rescue
+       false
+     end
+
+     #
+     # Returns +true+ if we are over the page depth limit.
+     # This only applies when the link comes from a page and the +depth_limit+
+     # option is set; otherwise it always returns +false+.
+     def too_deep?(from_page)
+       if from_page && @opts[:depth_limit]
+         from_page.depth >= @opts[:depth_limit]
+       else
+         false
+       end
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # it has a query string and +skip_query_strings+ is true.
+     #
+     def skip_query_string?(link)
+       @opts[:skip_query_strings] && link.query
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+     end
+
+     #
+     # Creates a new queue constrained to the given maximum size,
+     # or unconstrained if +size+ is not a positive integer.
+     #
+     def build_queue(size = nil)
+       if size.is_a?(Integer) && size > 0
+         SizedQueue.new(size)
+       else
+         Queue.new
+       end
+     end
+
+   end
+ end
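
Anemone::Core above is the crawler's entry point: Anemone.crawl builds a Core, yields it so options and page callbacks can be registered, then runs the crawl on a pool of Tentacle threads. A minimal usage sketch based only on the methods defined above (the start URL and patterns are placeholders, not part of the package):

    require 'anemone'

    Anemone.crawl('http://example.com/', :depth_limit => 2, :obey_robots_txt => true) do |anemone|
      # option setters generated from DEFAULT_OPTS also work inside the block
      anemone.threads = 2

      # never follow login or logout links
      anemone.skip_links_like(/\/(login|logout)/)

      anemone.on_every_page do |page|
        puts page.url
      end

      anemone.after_crawl do |pages|
        # the PageStore of crawled pages is yielded here
        puts 'crawl finished'
      end
    end
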
@@ -0,0 +1,5 @@
+ module Anemone
+   class Error < ::StandardError
+     attr_accessor :wrapped_exception
+   end
+ end
@@ -0,0 +1,187 @@
+ require 'net/https'
+ require 'anemone/page'
+ require 'anemone/cookie_store'
+
+ module Anemone
+   class HTTP
+     # Maximum number of redirects to follow on each get_response
+     REDIRECT_LIMIT = 5
+
+     # CookieStore for this HTTP client
+     attr_reader :cookie_store
+
+     def initialize(opts = {})
+       @connections = {}
+       @opts = opts
+       @cookie_store = CookieStore.new(@opts[:cookies])
+     end
+
+     #
+     # Fetch a single Page from the response of an HTTP request to *url*.
+     # Just gets the final destination page.
+     #
+     def fetch_page(url, referer = nil, depth = nil)
+       fetch_pages(url, referer, depth).last
+     end
+
+     #
+     # Create new Pages from the response of an HTTP request to *url*,
+     # including redirects
+     #
+     def fetch_pages(url, referer = nil, depth = nil)
+       begin
+         url = URI(url) unless url.is_a?(URI)
+         pages = []
+         get(url, referer) do |response, code, location, redirect_to, response_time|
+           pages << @opts[:page_class].new(location, :body => response.body.dup,
+                                                     :code => code,
+                                                     :headers => response.to_hash,
+                                                     :referer => referer,
+                                                     :depth => depth,
+                                                     :redirect_to => redirect_to,
+                                                     :response_time => response_time)
+         end
+
+         return pages
+       rescue Exception => e
+         if verbose?
+           puts e.inspect
+           puts e.backtrace
+         end
+         return [@opts[:page_class].new(url, :error => e)]
+       end
+     end
+
+     #
+     # The maximum number of redirects to follow
+     #
+     def redirect_limit
+       @opts[:redirect_limit] || REDIRECT_LIMIT
+     end
+
+     #
+     # The user-agent string which will be sent with each request,
+     # or nil if no such option is set
+     #
+     def user_agent
+       @opts[:user_agent]
+     end
+
+     #
+     # Does this HTTP client accept cookies from the server?
+     #
+     def accept_cookies?
+       @opts[:accept_cookies]
+     end
+
+     #
+     # The proxy address string
+     #
+     def proxy_host
+       @opts[:proxy_host]
+     end
+
+     #
+     # The proxy port
+     #
+     def proxy_port
+       @opts[:proxy_port]
+     end
+
+     #
+     # HTTP read timeout in seconds
+     #
+     def read_timeout
+       @opts[:read_timeout]
+     end
+
+     private
+
+     #
+     # Retrieve HTTP responses for *url*, including redirects.
+     # Yields the response object, response code, and URI location
+     # for each response.
+     #
+     def get(url, referer = nil)
+       limit = redirect_limit
+       loc = url
+       begin
+         # if redirected to a relative url, merge it with the host of the original
+         # request url
+         loc = url.merge(loc) if loc.relative?
+
+         response, response_time = get_response(loc, referer)
+         code = Integer(response.code)
+         redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+         yield response, code, loc, redirect_to, response_time
+         limit -= 1
+       end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+     end
+
+     #
+     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
+     #
+     def get_response(url, referer = nil)
+       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+
+       opts = {}
+       opts['User-Agent'] = user_agent if user_agent
+       opts['Referer'] = referer.to_s if referer
+       opts['Cookie'] = @cookie_store.to_s unless @cookie_store.empty? || (!accept_cookies? && @opts[:cookies].nil?)
+
+       retries = 0
+       begin
+         start = Time.now()
+         # format request
+         req = Net::HTTP::Get.new(full_path, opts)
+         # HTTP Basic authentication
+         req.basic_auth url.user, url.password if url.user
+         response = connection(url).request(req)
+         finish = Time.now()
+         response_time = ((finish - start) * 1000).round
+         @cookie_store.merge!(response['Set-Cookie']) if accept_cookies?
+         return response, response_time
+       rescue Timeout::Error, Net::HTTPBadResponse, EOFError => e
+         puts e.inspect if verbose?
+         refresh_connection(url)
+         retries += 1
+         retry unless retries > 3
+       end
+     end
+
+     def connection(url)
+       @connections[url.host] ||= {}
+
+       if conn = @connections[url.host][url.port]
+         return conn
+       end
+
+       refresh_connection url
+     end
+
+     def refresh_connection(url)
+       http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
+
+       http.read_timeout = read_timeout if !!read_timeout
+
+       if url.scheme == 'https'
+         http.use_ssl = true
+         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+       end
+
+       @connections[url.host][url.port] = http.start
+     end
+
+     def verbose?
+       @opts[:verbose]
+     end
+
+     #
+     # Allowed to connect to the requested url?
+     #
+     def allowed?(to_url, from_url)
+       to_url.host.nil? || (to_url.host == from_url.host)
+     end
+
+   end
+ end
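
Anemone::HTTP above keeps one persistent Net::HTTP connection per host and port, follows redirects up to redirect_limit, and wraps every response in the configured :page_class. It is normally driven by the Tentacle threads rather than used directly; the sketch below is illustrative only (not part of the package) and passes :page_class explicitly because the client reads it straight from its options hash.

    require 'anemone/http'
    require 'anemone/page'

    http = Anemone::HTTP.new(:user_agent   => 'Anemone/0.7.2',
                             :read_timeout => 10,
                             :page_class   => Anemone::Page)

    page = http.fetch_page(URI('http://example.com/'))
    puts page.url
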