anemone 0.2.3 → 0.3.0

@@ -1,3 +1,13 @@
+ == 0.3.0 / 2009-12-15
+
+ * Major enhancements
+
+ * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+ * Minor enhancements
+
+ * Options can be set via methods on the Core object in the crawl block
+
  == 0.2.3 / 2009-11-01

  * Minor enhancements
@@ -24,4 +34,4 @@
  * Minor enhancements

  * HTTP request response time recorded in Page.
- * Use of persistent HTTP connections.
+ * Use of persistent HTTP connections.
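
The two headline changes above work together: option keys now get setter methods on the Core object, so a crawl block can configure the crawl (including the new :storage engine) without building an options hash. A minimal sketch, with illustrative values:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # generated setters exist for each DEFAULT_OPTS key, e.g. depth_limit=, verbose=
      anemone.verbose     = true
      anemone.depth_limit = 2
      anemone.on_every_page { |page| puts page.url }
    end
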
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
  * HTTPS support
  * Records response time for each page
  * CLI program can list all pages in a domain, calculate page depths, and more
+ * Obey robots.txt
+ * In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore

  == Examples
  See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
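
The storage feature listed above is selected through the new :storage option; when it is left nil the crawl falls back to Anemone::Storage.Hash (visible in the core.rb diff below). A sketch of switching to persistent storage, assuming Anemone::Storage exposes PStore and TokyoCabinet factory methods analogous to the Hash one:

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # assumed factory method; keeps crawled pages in a PStore file instead of memory
      anemone.storage = Anemone::Storage.PStore('crawl.pstore')
      # or, assuming the TokyoCabinet backend and gem are available:
      # anemone.storage = Anemone::Storage.TokyoCabinet('crawl.tch')
    end
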
@@ -12,10 +12,10 @@ Usage:

  Synopsis:
  Crawls a site starting at the given URL and saves the resulting
- PageHash object to a file using Marshal serialization.
+ PageStore object to a file using Marshal serialization.

  Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ -o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
  INFO
  exit(0)
  end
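
The renamed PageStore is still dumped with plain Marshal, so a saved crawl can be read back the same way the old PageHash was; a sketch, with a hypothetical dump file name (the CLI defaults to crawl.{Time.now}):

    require 'anemone'

    # 'crawl.dump' is a stand-in for whatever --output was set to
    page_store = File.open('crawl.dump', 'rb') { |io| Marshal.load(io) }
    puts page_store.has_page?(URI('http://www.example.com/'))
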
@@ -2,25 +2,26 @@ require 'thread'
  require 'robots'
  require 'anemone/tentacle'
  require 'anemone/page'
- require 'anemone/page_hash'
+ require 'anemone/page_store'
+ require 'anemone/storage'

  module Anemone

- VERSION = '0.2.3';
+ VERSION = '0.3.0';

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
  Core.crawl(urls, options, &block)
- end
+ end

  class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages

+ # PageStore storing all Page objects encountered during the crawl
+ attr_reader :pages
  # Hash of options for the crawl
- attr_accessor :opts
+ attr_reader :opts

  DEFAULT_OPTS = {
  # run 4 Tentacle threads to fetch pages
@@ -38,9 +39,18 @@ module Anemone
  # by default, don't limit the depth of the crawl
  :depth_limit => false,
  # number of times HTTP redirects will be followed
- :redirect_limit => 5
+ :redirect_limit => 5,
+ # storage engine defaults to Hash in +process_options+ if none specified
+ :storage => nil
  }

+ # Create setter methods for all options to be called from the crawl block
+ DEFAULT_OPTS.keys.each do |key|
+ define_method "#{key}=" do |*args|
+ @opts[key.to_sym] = *args
+ end
+ end
+
  #
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
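
The define_method loop above is shorthand for writing one setter per option key; for :redirect_limit, for example, the generated method behaves roughly like this hand-written version (a sketch of the equivalent, not extra code in the gem):

    # what the loop generates for :redirect_limit, written out by hand
    def redirect_limit=(value)
      @opts[:redirect_limit] = value
    end

Because the setters only touch @opts, they are intended to be called from the crawl block, before run merges in the defaults.
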
@@ -50,17 +60,15 @@ module Anemone
  @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
- @pages = PageHash.new
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
-
- process_options opts
+ @opts = opts

  yield self if block_given?
  end
-
+
  #
  # Convenience method to start a new crawl
  #
@@ -70,16 +78,16 @@ module Anemone
  core.run
  end
  end
-
+
  #
- # Add a block to be executed on the PageHash after the crawl
+ # Add a block to be executed on the PageStore after the crawl
  # is finished
  #
  def after_crawl(&block)
  @after_crawl_blocks << block
  self
  end
-
+
  #
  # Add one ore more Regex patterns for URLs which should not be
  # followed
@@ -88,7 +96,7 @@ module Anemone
  @skip_link_patterns.concat [patterns].flatten.compact
  self
  end
-
+
  #
  # Add a block to be executed on every Page as they are encountered
  # during the crawl
@@ -97,7 +105,7 @@ module Anemone
  @on_every_page_blocks << block
  self
  end
-
+
  #
  # Add a block to be executed on Page objects with a URL matching
  # one or more patterns
@@ -110,7 +118,7 @@ module Anemone
  end
  self
  end
-
+
  #
  # Specify a block which will select which links to follow on each page.
  # The block should return an Array of URI objects.
@@ -119,77 +127,63 @@ module Anemone
  @focus_crawl_block = block
  self
  end
-
+
  #
  # Perform the crawl
  #
  def run
+ process_options
+
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?
-
+
  link_queue = Queue.new
  page_queue = Queue.new

  @opts[:threads].times do
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end
-
+
  @urls.each{ |url| link_queue.enq(url) }

  loop do
  page = page_queue.deq
-
- @pages[page.url] = page
-
+ @pages.touch_key page.url
  puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
- # perform the on_every_page blocks for this page
- do_page_blocks(page)
-
+ do_page_blocks page
  page.discard_doc! if @opts[:discard_page_bodies]
-
- links_to_follow(page).each do |link|
- link_queue.enq([link, page])
- @pages[link] = nil
- end
-
- # create an entry in the page hash for each alias of this page,
- # i.e. all the pages that redirected to this page
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
+
+ links = links_to_follow page
+ links.each do |link|
+ link_queue << [link, page.url.dup, page.depth + 1]
  end
-
+ @pages.touch_keys links
+
+ @pages[page.url] = page
+
  # if we are done with the crawl, tell the threads to end
  if link_queue.empty? and page_queue.empty?
  until link_queue.num_waiting == @tentacles.size
  Thread.pass
  end
-
  if page_queue.empty?
- @tentacles.size.times { link_queue.enq(:END)}
+ @tentacles.size.times { link_queue << :END }
  break
  end
  end
-
  end

  @tentacles.each { |t| t.join }
-
- do_after_crawl_blocks()
-
+ do_after_crawl_blocks
  self
  end
-
- private

- def process_options(options)
- @opts = DEFAULT_OPTS.merge options
+ private

+ def process_options
+ @opts = DEFAULT_OPTS.merge @opts
  @opts[:threads] = 1 if @opts[:delay] > 0
-
+ @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
  end
  end
@@ -197,9 +191,9 @@ module Anemone
  # Execute the after_crawl blocks
  #
  def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
+ @after_crawl_blocks.each { |b| b.call(@pages) }
  end
-
+
  #
  # Execute the on_every_page blocks for *page*
  #
@@ -207,14 +201,12 @@ module Anemone
  @on_every_page_blocks.each do |blk|
  blk.call(page)
  end
-
+
  @on_pages_like_blocks.each do |pattern, blks|
- if page.url.to_s =~ pattern
- blks.each { |blk| blk.call(page) }
- end
+ blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
  end
- end
-
+ end
+
  #
  # Return an Array of links to follow from the given page.
  # Based on whether or not the link has already been crawled,
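
do_page_blocks is what fires the user-supplied callbacks, so the simplified matching above is exercised by crawl blocks like this sketch (the URL and pattern are illustrative):

    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.on_every_page do |page|
        puts page.url
      end
      # only runs for pages whose URL matches the pattern
      anemone.on_pages_like(%r{/articles/}) do |page|
        puts "article at depth #{page.depth}: #{page.url}"
      end
    end
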
@@ -222,9 +214,9 @@ module Anemone
  #
  def links_to_follow(page)
  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.select { |link| visit_link?(link, page) }
+ links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
  end
-
+
  #
  # Returns +true+ if *link* has not been visited already,
  # and is not excluded by a skip_link pattern...
@@ -234,16 +226,16 @@ module Anemone
  #
  def visit_link?(link, from_page = nil)
  allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
-
+
  if from_page && @opts[:depth_limit]
  too_deep = from_page.depth >= @opts[:depth_limit]
  else
  too_deep = false
  end
-
+
  !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
  end
-
+
  #
  # Returns +true+ if *link* should not be visited because
  # its URL matches a skip_link pattern.
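
visit_link? is where :obey_robots_txt and :depth_limit actually take effect, so turning them on is just a matter of setting the options; a sketch using the new setters, assuming :obey_robots_txt is among the DEFAULT_OPTS keys as its use in process_options suggests:

    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.obey_robots_txt = true   # consults the robots gem before queueing a link
      anemone.depth_limit     = 3      # links more than 3 hops from a start URL are skipped
    end
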
@@ -251,6 +243,6 @@ module Anemone
  def skip_link?(link)
  @skip_link_patterns.any? { |p| link.path =~ p }
  end
-
+
  end
  end
@@ -12,62 +12,73 @@ module Anemone
  end

  #
- # Create a new Page from the response of an HTTP request to *url*
+ # Fetch a single Page from the response of an HTTP request to *url*.
+ # Just gets the final destination page.
  #
- def fetch_page(url, from_page = nil)
+ def fetch_page(url, referer = nil, depth = nil)
+ fetch_pages(url, referer, depth).last
+ end
+
+ #
+ # Create new Pages from the response of an HTTP request to *url*,
+ # including redirects
+ #
+ def fetch_pages(url, referer = nil, depth = nil)
  begin
  url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location, response_time = get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
+ pages = []
+ get(url, referer) do |response, code, location, redirect_to, response_time|
+ pages << Page.new(location, :body => response.body.dup,
+ :code => code,
+ :headers => response.to_hash,
+ :referer => referer,
+ :depth => depth,
+ :redirect_to => redirect_to,
+ :response_time => response_time)
  end

- return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ return pages
  rescue => e
  if verbose?
  puts e.inspect
  puts e.backtrace
- end
- return Page.new(url)
+ end
+ return [Page.new(url, :error => e)]
  end
  end

  private

  #
- # Retrieve an HTTP response for *url*, following redirects.
- # Returns the response object, response code, and final URI location.
- #
+ # Retrieve HTTP responses for *url*, including redirects.
+ # Yields the response object, response code, and URI location
+ # for each response.
+ #
  def get(url, referer = nil)
  response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url
-
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
+
  limit = redirect_limit
  while response.is_a?(Net::HTTPRedirection) and limit > 0
- loc = URI(response['location'])
+ loc = redirect_to
  loc = url.merge(loc) if loc.relative?
  response, response_time = get_response(loc, referer)
+ code = Integer(response.code)
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
  limit -= 1
  end
-
- return response, code, loc, response_time
  end
-
+
  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
  #
  def get_response(url, referer = nil)
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-
+
  opts = {}
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
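
get now yields once per hop instead of returning only the final response, which is how fetch_pages can build a Page for every step of a redirect chain. A self-contained sketch of the same yield-per-response shape using plain Net::HTTP (not Anemone's code; relative Location handling is omitted for brevity):

    require 'net/http'
    require 'uri'

    # Follow redirects up to +limit+ hops, yielding every response along the way.
    def each_response(url, limit = 5)
      loop do
        response = Net::HTTP.get_response(URI(url))
        redirect_to = response.is_a?(Net::HTTPRedirection) ? response['location'] : nil
        yield response, url, redirect_to
        break if redirect_to.nil? || (limit -= 1) <= 0
        url = redirect_to
      end
    end

    each_response('http://www.example.com/') do |response, url, redirect_to|
      puts "#{response.code} #{url}#{redirect_to ? " -> #{redirect_to}" : ''}"
    end
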
@@ -78,7 +89,7 @@ module Anemone
  response = connection(url).get(full_path, opts)
  finish = Time.now()
  response_time = ((finish - start) * 1000).round
- return response, response_time
+ return response, response_time
  rescue EOFError
  refresh_connection(url)
  retries += 1
@@ -93,7 +104,7 @@ module Anemone
  return conn
  end

- refresh_connection(url)
+ refresh_connection url
  end

  def refresh_connection(url)
@@ -102,7 +113,7 @@ module Anemone
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
- @connections[url.host][url.port] = http.start
+ @connections[url.host][url.port] = http.start
  end

  def redirect_limit
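
The per-host, per-port connection cache is what keeps Anemone's HTTP connections persistent between requests; the idea in isolation looks roughly like this sketch (not the class's full retry logic):

    require 'net/http'
    require 'uri'

    # one open Net::HTTP connection per host/port pair, like @connections above
    connections = Hash.new { |hash, host| hash[host] = {} }

    uri  = URI('http://www.example.com/')
    http = (connections[uri.host][uri.port] ||= Net::HTTP.start(uri.host, uri.port))

    puts http.get('/').code   # later requests to the same host and port reuse this connection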