anemone 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,13 @@
+ == 0.3.0 / 2009-12-15
+
+ * Major enchancements
+
+ * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+ * Minor enhancements
+
+ * Options can be set via methods on the Core object in the crawl block
+
  == 0.2.3 / 2009-11-01
 
  * Minor enhancements
@@ -24,4 +34,4 @@
  * Minor enhancements
 
  * HTTP request response time recorded in Page.
- * Use of persistent HTTP connections.
+ * Use of persistent HTTP connections.
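To make the two headline 0.3.0 changes above concrete, here is a hedged usage sketch. The option setters and Anemone::Storage.Hash come from the core.rb diff further down; the exact PStore/TokyoCabinet constructor arguments are assumptions, not taken from official documentation. Options can be assigned via setter methods on the Core object inside the crawl block, including a persistent :storage backend:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # every DEFAULT_OPTS key gets a generated setter usable inside this block
      anemone.verbose     = true
      anemone.depth_limit = 3

      # persistent page storage during the crawl; defaults to an in-memory Hash
      anemone.storage = Anemone::Storage.PStore('crawl.pstore')
      # or, assuming the TokyoCabinet adapter: Anemone::Storage.TokyoCabinet('crawl.tch')

      anemone.on_every_page { |page| puts page.url }
    end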
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
  * HTTPS support
  * Records response time for each page
  * CLI program can list all pages in a domain, calculate page depths, and more
+ * Obey robots.txt
+ * In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
  == Examples
  See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
@@ -12,10 +12,10 @@ Usage:
 
  Synopsis:
  Crawls a site starting at the given URL and saves the resulting
- PageHash object to a file using Marshal serialization.
+ PageStore object to a file using Marshal serialization.
 
  Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ -o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
  INFO
  exit(0)
  end
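The CLI change above is only a rename in the help text (PageHash becomes PageStore); the crawl is still written out with plain Marshal, so a saved crawl can be reloaded later. A minimal, hypothetical snippet (the filename is whatever -o / --output produced; 'crawl.20091215' is made up):

    require 'anemone'

    # reload a crawl saved by the CLI; the file holds a Marshal-dumped PageStore
    pages = File.open('crawl.20091215', 'rb') { |f| Marshal.load(f) }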
@@ -2,25 +2,26 @@ require 'thread'
  require 'robots'
  require 'anemone/tentacle'
  require 'anemone/page'
- require 'anemone/page_hash'
+ require 'anemone/page_store'
+ require 'anemone/storage'
 
  module Anemone
 
- VERSION = '0.2.3';
+ VERSION = '0.3.0';
 
  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
  Core.crawl(urls, options, &block)
- end
+ end
 
  class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages
 
+ # PageStore storing all Page objects encountered during the crawl
+ attr_reader :pages
  # Hash of options for the crawl
- attr_accessor :opts
+ attr_reader :opts
 
  DEFAULT_OPTS = {
  # run 4 Tentacle threads to fetch pages
@@ -38,9 +39,18 @@ module Anemone
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
- :redirect_limit => 5
+ :redirect_limit => 5,
+ # storage engine defaults to Hash in +process_options+ if none specified
+ :storage => nil
  }
 
+ # Create setter methods for all options to be called from the crawl block
+ DEFAULT_OPTS.keys.each do |key|
+ define_method "#{key}=" do |*args|
+ @opts[key.to_sym] = *args
+ end
+ end
+
  #
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
@@ -50,17 +60,15 @@ module Anemone
  @urls.each{ |url| url.path = '/' if url.path.empty? }
 
  @tentacles = []
- @pages = PageHash.new
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
-
- process_options opts
+ @opts = opts
 
  yield self if block_given?
  end
-
+
  #
  # Convenience method to start a new crawl
  #
@@ -70,16 +78,16 @@ module Anemone
  core.run
  end
  end
-
+
  #
- # Add a block to be executed on the PageHash after the crawl
+ # Add a block to be executed on the PageStore after the crawl
  # is finished
  #
  def after_crawl(&block)
  @after_crawl_blocks << block
  self
  end
-
+
  #
  # Add one ore more Regex patterns for URLs which should not be
  # followed
@@ -88,7 +96,7 @@ module Anemone
  @skip_link_patterns.concat [patterns].flatten.compact
  self
  end
-
+
  #
  # Add a block to be executed on every Page as they are encountered
  # during the crawl
@@ -97,7 +105,7 @@ module Anemone
  @on_every_page_blocks << block
  self
  end
-
+
  #
  # Add a block to be executed on Page objects with a URL matching
  # one or more patterns
@@ -110,7 +118,7 @@ module Anemone
  end
  self
  end
-
+
  #
  # Specify a block which will select which links to follow on each page.
  # The block should return an Array of URI objects.
@@ -119,77 +127,63 @@ module Anemone
  @focus_crawl_block = block
  self
  end
-
+
  #
  # Perform the crawl
  #
  def run
+ process_options
+
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?
-
+
  link_queue = Queue.new
  page_queue = Queue.new
 
  @opts[:threads].times do
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end
-
+
  @urls.each{ |url| link_queue.enq(url) }
 
  loop do
  page = page_queue.deq
-
- @pages[page.url] = page
-
+ @pages.touch_key page.url
  puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
- # perform the on_every_page blocks for this page
- do_page_blocks(page)
-
+ do_page_blocks page
  page.discard_doc! if @opts[:discard_page_bodies]
-
- links_to_follow(page).each do |link|
- link_queue.enq([link, page])
- @pages[link] = nil
- end
-
- # create an entry in the page hash for each alias of this page,
- # i.e. all the pages that redirected to this page
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
+
+ links = links_to_follow page
+ links.each do |link|
+ link_queue << [link, page.url.dup, page.depth + 1]
  end
-
+ @pages.touch_keys links
+
+ @pages[page.url] = page
+
  # if we are done with the crawl, tell the threads to end
  if link_queue.empty? and page_queue.empty?
  until link_queue.num_waiting == @tentacles.size
  Thread.pass
  end
-
  if page_queue.empty?
- @tentacles.size.times { link_queue.enq(:END)}
+ @tentacles.size.times { link_queue << :END }
  break
  end
  end
-
  end
 
  @tentacles.each { |t| t.join }
-
- do_after_crawl_blocks()
-
+ do_after_crawl_blocks
  self
  end
-
- private
 
- def process_options(options)
- @opts = DEFAULT_OPTS.merge options
+ private
 
+ def process_options
+ @opts = DEFAULT_OPTS.merge @opts
  @opts[:threads] = 1 if @opts[:delay] > 0
-
+ @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
  end
 
@@ -197,9 +191,9 @@ module Anemone
  # Execute the after_crawl blocks
  #
  def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
+ @after_crawl_blocks.each { |b| b.call(@pages) }
  end
-
+
  #
  # Execute the on_every_page blocks for *page*
  #
@@ -207,14 +201,12 @@ module Anemone
  @on_every_page_blocks.each do |blk|
  blk.call(page)
  end
-
+
  @on_pages_like_blocks.each do |pattern, blks|
- if page.url.to_s =~ pattern
- blks.each { |blk| blk.call(page) }
- end
+ blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
  end
- end
-
+ end
+
  #
  # Return an Array of links to follow from the given page.
  # Based on whether or not the link has already been crawled,
@@ -222,9 +214,9 @@ module Anemone
  #
  def links_to_follow(page)
  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.select { |link| visit_link?(link, page) }
+ links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
  end
-
+
  #
  # Returns +true+ if *link* has not been visited already,
  # and is not excluded by a skip_link pattern...
@@ -234,16 +226,16 @@ module Anemone
  #
  def visit_link?(link, from_page = nil)
  allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
-
+
  if from_page && @opts[:depth_limit]
  too_deep = from_page.depth >= @opts[:depth_limit]
  else
  too_deep = false
  end
-
+
  !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
  end
-
+
  #
  # Returns +true+ if *link* should not be visited because
  # its URL matches a skip_link pattern.
@@ -251,6 +243,6 @@ module Anemone
  def skip_link?(link)
  @skip_link_patterns.any? { |p| link.path =~ p }
  end
-
+
  end
  end
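The run-loop rewrite above swaps direct PageHash access for PageStore calls: touch_key and touch_keys reserve a URL in the store before its page has been fetched (the role the old "@pages[link] = nil" played), and "@pages[page.url] = page" later stores the fetched Page against whatever Hash-like storage backend was configured. An illustrative sketch of that contract (a simplified stand-in, not the gem's actual PageStore class, assuming the backend only needs []=, [] and key?):

    # Simplified stand-in for Anemone::PageStore, showing the calls the run
    # loop relies on; the real class wraps Hash, PStore or TokyoCabinet.
    class MiniPageStore
      def initialize(storage = {})
        @storage = storage                         # any Hash-like backend
      end

      # reserve a URL so visit_link? treats it as already queued
      def touch_key(url)
        @storage[url.to_s] = nil unless @storage.key?(url.to_s)
      end

      def touch_keys(urls)
        urls.each { |url| touch_key(url) }
      end

      def []=(url, page)
        @storage[url.to_s] = page                  # store the fetched Page
      end

      def has_page?(url)
        @storage.key?(url.to_s)
      end
    end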
@@ -12,62 +12,73 @@ module Anemone
  end
 
  #
- # Create a new Page from the response of an HTTP request to *url*
+ # Fetch a single Page from the response of an HTTP request to *url*.
+ # Just gets the final destination page.
  #
- def fetch_page(url, from_page = nil)
+ def fetch_page(url, referer = nil, depth = nil)
+ fetch_pages(url, referer, depth).last
+ end
+
+ #
+ # Create new Pages from the response of an HTTP request to *url*,
+ # including redirects
+ #
+ def fetch_pages(url, referer = nil, depth = nil)
  begin
  url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location, response_time = get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
+ pages = []
+ get(url, referer) do |response, code, location, redirect_to, response_time|
+ pages << Page.new(location, :body => response.body.dup,
+ :code => code,
+ :headers => response.to_hash,
+ :referer => referer,
+ :depth => depth,
+ :redirect_to => redirect_to,
+ :response_time => response_time)
  end
 
- return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ return pages
  rescue => e
  if verbose?
  puts e.inspect
  puts e.backtrace
- end
- return Page.new(url)
+ end
+ return [Page.new(url, :error => e)]
  end
  end
 
  private
 
  #
- # Retrieve an HTTP response for *url*, following redirects.
- # Returns the response object, response code, and final URI location.
- #
+ # Retrieve HTTP responses for *url*, including redirects.
+ # Yields the response object, response code, and URI location
+ # for each response.
+ #
  def get(url, referer = nil)
  response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url
-
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
+
  limit = redirect_limit
  while response.is_a?(Net::HTTPRedirection) and limit > 0
- loc = URI(response['location'])
+ loc = redirect_to
  loc = url.merge(loc) if loc.relative?
  response, response_time = get_response(loc, referer)
+ code = Integer(response.code)
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
  limit -= 1
  end
-
- return response, code, loc, response_time
  end
-
+
  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
  #
  def get_response(url, referer = nil)
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-
+
  opts = {}
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
@@ -78,7 +89,7 @@ module Anemone
  response = connection(url).get(full_path, opts)
  finish = Time.now()
  response_time = ((finish - start) * 1000).round
- return response, response_time
+ return response, response_time
  rescue EOFError
  refresh_connection(url)
  retries += 1
@@ -93,7 +104,7 @@ module Anemone
  return conn
  end
 
- refresh_connection(url)
+ refresh_connection url
  end
 
  def refresh_connection(url)
@@ -102,7 +113,7 @@ module Anemone
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
- @connections[url.host][url.port] = http.start
+ @connections[url.host][url.port] = http.start
  end
 
  def redirect_limit
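The HTTP change above splits fetching in two: fetch_pages builds one Page per response in the redirect chain (each carrying its redirect_to), while fetch_page keeps only the final destination Page. A hedged usage sketch, assuming Anemone::HTTP.new accepts an options hash and that Page exposes code and redirect_to readers for the values passed to Page.new in the diff:

    require 'anemone'

    http = Anemone::HTTP.new({})

    # every hop of a redirect chain becomes its own Page
    http.fetch_pages('http://www.example.com/old-path').each do |page|
      puts "#{page.code} #{page.url} -> #{page.redirect_to || '(final)'}"
    end

    # fetch_page performs the same kind of request but returns only the last Page
    page = http.fetch_page('http://www.example.com/old-path')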