spk-anemone 0.2.4 → 0.3.0

data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,13 @@
+ == 0.3.0 / 2009-12-15
+
+ * Major enhancements
+
+ * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+ * Minor enhancements
+
+ * Options can be set via methods on the Core object in the crawl block
+
  == 0.2.4 / 2009-11-26

  * Minor enhancements
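
The two entries above can be exercised together. A minimal sketch, assuming Anemone::Storage.PStore accepts a file path (only Anemone::Storage.Hash appears in the core.rb diff below):

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # options can now be set via methods on the Core object inside the crawl block
      anemone.verbose     = true
      anemone.depth_limit = 3

      # persist crawl data instead of using the default in-memory Hash
      # (Anemone::Storage.PStore taking a file path is an assumption here)
      anemone.storage = Anemone::Storage.PStore('crawl.pstore')

      anemone.on_every_page { |page| puts page.url }
    end
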
data/README.rdoc CHANGED
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
  * HTTPS support
  * Records response time for each page
  * CLI program can list all pages in a domain, calculate page depths, and more
+ * Obey robots.txt
+ * In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore

  == Examples
  See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
@@ -12,10 +12,10 @@ Usage:

  Synopsis:
  Crawls a site starting at the given URL and saves the resulting
- PageHash object to a file using Marshal serialization.
+ PageStore object to a file using Marshal serialization.

  Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ -o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
  INFO
  exit(0)
  end
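
Since the CLI now dumps a PageStore rather than a PageHash, reading the output back is a plain Marshal load. A sketch, assuming the loaded store exposes hash-style lookup keyed by URI (the filename is whatever was passed to -o):

    require 'anemone'

    # load the PageStore that the CLI wrote with Marshal
    pages = Marshal.load(File.read('crawl.out'))

    # hash-style lookup by URI is assumed here
    root = pages[URI('http://www.example.com/')]
    puts root.code if root
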
data/lib/anemone/core.rb CHANGED
@@ -2,11 +2,12 @@ require 'thread'
  require 'robots'
  require 'anemone/tentacle'
  require 'anemone/page'
- require 'anemone/page_hash'
+ require 'anemone/page_store'
+ require 'anemone/storage'

  module Anemone

- VERSION = '0.2.4';
+ VERSION = '0.3.0';

  #
  # Convenience method to start a crawl
@@ -16,11 +17,11 @@ module Anemone
  end

  class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages

+ # PageStore storing all Page objects encountered during the crawl
+ attr_reader :pages
  # Hash of options for the crawl
- attr_accessor :opts
+ attr_reader :opts

  DEFAULT_OPTS = {
  # run 4 Tentacle threads to fetch pages
@@ -39,29 +40,33 @@ module Anemone
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
+ # storage engine defaults to Hash in +process_options+ if none specified
+ :storage => nil,
  # Authentication
  :authorization => nil,
  }

+ # Create setter methods for all options to be called from the crawl block
+ DEFAULT_OPTS.keys.each do |key|
+ define_method "#{key}=" do |*args|
+ @opts[key.to_sym] = *args
+ end
+ end
+
  #
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
  #
  def initialize(urls, opts = {})
- process_options opts
-
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
- @urls.each{ |url|
- url.path = '/' if url.path.empty?
- authorization(url) if url.user
- }
+ @urls.each{ |url| url.path = '/' if url.path.empty? }

  @tentacles = []
- @pages = PageHash.new
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
+ @opts = opts

  yield self if block_given?
  end
@@ -77,7 +82,7 @@ module Anemone
  end

  #
- # Add a block to be executed on the PageHash after the crawl
+ # Add a block to be executed on the PageStore after the crawl
  # is finished
  #
  def after_crawl(&block)
@@ -129,6 +134,8 @@ module Anemone
  # Perform the crawl
  #
  def run
+ process_options
+
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?

@@ -139,81 +146,66 @@ module Anemone
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end

- @urls.each{ |url| link_queue.enq(url) }
+ @urls.each{ |url|
+ link_queue.enq(url)
+ authorization(url) if url.user
+ }

  loop do
  page = page_queue.deq
-
- @pages[page.url] = page
-
+ @pages.touch_key page.url
  puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
- # perform the on_every_page blocks for this page
- do_page_blocks(page)
-
+ do_page_blocks page
  page.discard_doc! if @opts[:discard_page_bodies]

- links_to_follow(page).each do |link|
- link_queue.enq([link, page])
- @pages[link] = nil
+ links = links_to_follow page
+ links.each do |link|
+ link_queue << [link, page.url.dup, page.depth + 1]
  end
+ @pages.touch_keys links

- # create an entry in the page hash for each alias of this page,
- # i.e. all the pages that redirected to this page
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
- end
+ @pages[page.url] = page

  # if we are done with the crawl, tell the threads to end
  if link_queue.empty? and page_queue.empty?
  until link_queue.num_waiting == @tentacles.size
  Thread.pass
  end
-
  if page_queue.empty?
- @tentacles.size.times { link_queue.enq(:END)}
+ @tentacles.size.times { link_queue << :END }
  break
  end
  end
-
  end

  @tentacles.each { |t| t.join }
-
- do_after_crawl_blocks()
-
+ do_after_crawl_blocks
  self
  end

  private

- def process_options(options)
- @opts = DEFAULT_OPTS.merge options
-
- authorization(@opts[:authorization])
-
+ def process_options
+ @opts = DEFAULT_OPTS.merge @opts
+ authorization(@opts[:authorization]) if @opts[:authorization]
  @opts[:threads] = 1 if @opts[:delay] > 0
-
+ @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
  end

  # Generate Authorization string only if not already set
  def authorization(auth=nil)
- return if @opts[:authorization] =~ /^Basic .*/
  require 'base64'
  if auth.is_a?(String) && auth.include?(':')
- @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+ self.authorization = "Basic #{Base64.b64encode(auth)}"
  elsif auth.is_a?(Array)
  user = auth.first
  password = auth.last
- @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+ self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
  elsif auth.is_a?(URI)
  user = auth.user
  password = auth.password
- @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+ self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
  end
  end

@@ -221,7 +213,7 @@ module Anemone
  # Execute the after_crawl blocks
  #
  def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
+ @after_crawl_blocks.each { |b| b.call(@pages) }
  end

  #
@@ -233,9 +225,7 @@ module Anemone
  end

  @on_pages_like_blocks.each do |pattern, blks|
- if page.url.to_s =~ pattern
- blks.each { |blk| blk.call(page) }
- end
+ blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
  end
  end

@@ -246,7 +236,7 @@ module Anemone
  #
  def links_to_follow(page)
  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.select { |link| visit_link?(link, page) }
+ links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
  end

  #
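
The reworked authorization handling above accepts a "user:pass" String, a two-element Array, or a URI carrying userinfo. A minimal sketch of both ways of supplying credentials (host and credentials are made up):

    require 'anemone'

    # pass credentials as a crawl option...
    Anemone.crawl("http://intranet.example.com/", :authorization => "user:s3cret")

    # ...or embed them in a start URL; run() calls authorization(url)
    # for any starting URL that has userinfo
    Anemone.crawl("http://user:s3cret@intranet.example.com/")
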
data/lib/anemone/http.rb CHANGED
@@ -12,54 +12,65 @@ module Anemone
  end

  #
- # Create a new Page from the response of an HTTP request to *url*
+ # Fetch a single Page from the response of an HTTP request to *url*.
+ # Just gets the final destination page.
  #
- def fetch_page(url, from_page = nil)
+ def fetch_page(url, referer = nil, depth = nil)
+ fetch_pages(url, referer, depth).last
+ end
+
+ #
+ # Create new Pages from the response of an HTTP request to *url*,
+ # including redirects
+ #
+ def fetch_pages(url, referer = nil, depth = nil)
  begin
  url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
+ pages = []
+ get(url, referer) do |response, code, location, redirect_to, response_time|
+ pages << Page.new(location, :body => response.body.dup,
+ :code => code,
+ :headers => response.to_hash,
+ :referer => referer,
+ :depth => depth,
+ :redirect_to => redirect_to,
+ :response_time => response_time)
  end

- response, code, location, response_time = get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ return pages
  rescue => e
  if verbose?
  puts e.inspect
  puts e.backtrace
  end
- return Page.new(url)
+ return [Page.new(url, :error => e)]
  end
  end

  private

  #
- # Retrieve an HTTP response for *url*, following redirects.
- # Returns the response object, response code, and final URI location.
+ # Retrieve HTTP responses for *url*, including redirects.
+ # Yields the response object, response code, and URI location
+ # for each response.
  #
  def get(url, referer = nil)
  response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time

  limit = redirect_limit
  while response.is_a?(Net::HTTPRedirection) and limit > 0
- loc = URI(response['location'])
+ loc = redirect_to
  loc = url.merge(loc) if loc.relative?
  response, response_time = get_response(loc, referer)
+ code = Integer(response.code)
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
  limit -= 1
  end
-
- return response, code, loc, response_time
  end

  #
@@ -94,7 +105,7 @@ module Anemone
  return conn
  end

- refresh_connection(url)
+ refresh_connection url
  end

  def refresh_connection(url)
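
With get now yielding every response in a redirect chain, fetch_pages returns one Page per hop, while fetch_page keeps the old single-page behaviour by taking the last one. A sketch, assuming Anemone::HTTP.new needs no arguments (the constructor is not shown in this diff):

    require 'anemone'

    http  = Anemone::HTTP.new
    pages = http.fetch_pages("http://www.example.com/old-path")

    pages.each do |page|
      # intermediate hops carry redirect_to; the final page leaves it nil
      puts "#{page.code} #{page.url} -> #{page.redirect_to}"
    end

    final = http.fetch_page("http://www.example.com/old-path")  # last hop only
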
data/lib/anemone/page.rb CHANGED
@@ -8,21 +8,21 @@ module Anemone
  attr_reader :url
  # Headers of the HTTP response
  attr_reader :headers
+ # URL of the page this one redirected to, if any
+ attr_reader :redirect_to
+ # Exception object, if one was raised during HTTP#fetch_page
+ attr_reader :error
+ # HTML body
+ attr_reader :body

  # OpenStruct for user-stored data
  attr_accessor :data
- # HTML body
- attr_accessor :body
- # Nokogiri document for the HTML body
- attr_accessor :doc
  # Integer response code of the page
  attr_accessor :code
- # Array of redirect-aliases for the page
- attr_accessor :aliases
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+ # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
  attr_accessor :visited
  # Depth of this page from the root of the crawl. This is not necessarily the
- # shortest path; use PageHash#shortest_paths! to find that value.
+ # shortest path; use PageStore#shortest_paths! to find that value.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
@@ -32,18 +32,22 @@ module Anemone
  #
  # Create a new page
  #
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+ def initialize(url, params = {})
  @url = url
- @code = code
- @headers = headers || {}
- @headers['content-type'] ||= ['']
- @aliases = Array(aka)
  @data = OpenStruct.new
- @referer = referer
- @depth = depth || 0
- @response_time = response_time
- @body = body
- @doc = Nokogiri::HTML(body) if body && html? rescue nil
+
+ @code = params[:code]
+ @headers = params[:headers] || {}
+ @headers['content-type'] ||= ['']
+ @aliases = Array(params[:aka]).compact
+ @referer = params[:referer]
+ @depth = params[:depth] || 0
+ @redirect_to = to_absolute(params[:redirect_to])
+ @response_time = params[:response_time]
+ @body = params[:body]
+ @error = params[:error]
+
+ @fetched = !params[:code].nil?
  end

  # Array of distinct A tag HREFs from the page
@@ -62,42 +66,20 @@ module Anemone
  @links
  end

- def discard_doc!
- links # force parsing of page links before we trash the document
- @doc = nil
- end
-
- #
- # Return a new page with the same *response* and *url*, but
- # with a 200 response code
- #
- def alias_clone(url)
- p = clone
- p.add_alias!(@aka) if !@aka.nil?
- p.code = 200
- p
+ # Nokogiri document for the HTML body
+ def doc
+ return @doc if @doc
+ @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
  end

- #
- # Add a redirect-alias String *aka* to the list of the page's aliases
- #
- # Returns *self*
- #
- def add_alias!(aka)
- @aliases << aka if !@aliases.include?(aka)
- self
+ # Delete the Nokogiri document and response body to conserve memory
+ def discard_doc!
+ links # force parsing of page links before we trash the document
+ @doc = @body = nil
  end

- #
- # Returns an Array of all links from this page, and all the
- # redirect-aliases of those pages, as String objects.
- #
- # *page_hash* is a PageHash object with the results of the current crawl.
- #
- def links_and_their_aliases(page_hash)
- links.inject([]) do |results, link|
- results.concat([link].concat(page_hash[link].aliases))
- end
+ def fetched?
+ @fetched
  end

  #
@@ -136,6 +118,8 @@ module Anemone
  # location of the page
  #
  def to_absolute(link)
+ return nil if link.nil?
+
  # remove anchor
  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

@@ -154,5 +138,14 @@ module Anemone
  def in_domain?(uri)
  uri.host == @url.host
  end
+
+ def marshal_dump
+ [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+ end
+
+ def marshal_load(ary)
+ @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+ end
+
  end
  end
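
A short sketch of the reworked Page API from this diff: the params-hash constructor, lazy Nokogiri parsing, fetched?, and the custom Marshal hooks (URL and body are made up):

    require 'anemone'

    page = Anemone::Page.new(URI('http://www.example.com/'),
                             :code    => 200,
                             :headers => { 'content-type' => ['text/html'] },
                             :body    => '<html><body><a href="/about">About</a></body></html>')

    page.fetched?   # => true, because a :code was supplied
    page.links      # the Nokogiri doc is built lazily from @body on first use

    # marshal_dump/marshal_load carry the body and the parsed links,
    # but not the Nokogiri document, so stored pages stay small
    copy = Marshal.load(Marshal.dump(page))
    copy.links.map(&:to_s)   # e.g. ["http://www.example.com/about"]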