spk-anemone 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc CHANGED
@@ -1,3 +1,13 @@
+ == 0.3.0 / 2009-12-15
+
+ * Major enhancements
+
+ * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+ * Minor enhancements
+
+ * Options can be set via methods on the Core object in the crawl block
+
  == 0.2.4 / 2009-11-26
 
  * Minor enhancements
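
To make these two entries concrete, here is a minimal Ruby sketch of how they might be used together. The per-option setters come straight from the core.rb diff further down (one setter is defined for every DEFAULT_OPTS key), and Anemone::Storage.Hash appears there as the default engine; the Anemone::Storage.PStore('anemone.pstore') factory used below is an assumption inferred from this changelog entry, not something shown in the diff.

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # 0.3.0: options are plain setter calls on the Core object inside the crawl block
      anemone.verbose     = true
      anemone.depth_limit = 3
      # 0.3.0: swap the default in-memory Hash for a persistent store
      # (Anemone::Storage.PStore is assumed; only Anemone::Storage.Hash is visible in this diff)
      anemone.storage     = Anemone::Storage.PStore('anemone.pstore')
    end
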
data/README.rdoc CHANGED
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
  * HTTPS support
  * Records response time for each page
  * CLI program can list all pages in a domain, calculate page depths, and more
+ * Obey robots.txt
+ * In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
  == Examples
  See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
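
The new robots.txt line above maps to the :obey_robots_txt option that core.rb checks in process_options (it builds a Robots object from the configured user agent). A hedged usage sketch, with on_every_page standing in for the @on_every_page_blocks hook seen in core.rb; the URL is a placeholder:

    require 'anemone'

    # Skip any URL disallowed by the site's robots.txt for our user agent.
    Anemone.crawl("http://www.example.com/", :obey_robots_txt => true) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end
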
@@ -12,10 +12,10 @@ Usage:
 
  Synopsis:
  Crawls a site starting at the given URL and saves the resulting
- PageHash object to a file using Marshal serialization.
+ PageStore object to a file using Marshal serialization.
 
  Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ -o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
  INFO
  exit(0)
  end
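
Since the synopsis now saves a PageStore with Marshal serialization, reading the file back is symmetric. A hypothetical sketch: the filename stands in for the timestamped default, and the size call assumes PageStore exposes a Hash-like API (not shown in this diff):

    # Load a crawl previously written by the CLI script above.
    page_store = Marshal.load(File.read('crawl.output'))
    puts page_store.size
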
data/lib/anemone/core.rb CHANGED
@@ -2,11 +2,12 @@ require 'thread'
  require 'robots'
  require 'anemone/tentacle'
  require 'anemone/page'
- require 'anemone/page_hash'
+ require 'anemone/page_store'
+ require 'anemone/storage'
 
  module Anemone
 
- VERSION = '0.2.4';
+ VERSION = '0.3.0';
 
  #
  # Convenience method to start a crawl
@@ -16,11 +17,11 @@ module Anemone
  end
 
  class Core
- # PageHash storing all Page objects encountered during the crawl
- attr_reader :pages
 
+ # PageStore storing all Page objects encountered during the crawl
+ attr_reader :pages
  # Hash of options for the crawl
- attr_accessor :opts
+ attr_reader :opts
 
  DEFAULT_OPTS = {
  # run 4 Tentacle threads to fetch pages
@@ -39,29 +40,33 @@ module Anemone
  :depth_limit => false,
  # number of times HTTP redirects will be followed
  :redirect_limit => 5,
+ # storage engine defaults to Hash in +process_options+ if none specified
+ :storage => nil,
  # Authentication
  :authorization => nil,
  }
 
+ # Create setter methods for all options to be called from the crawl block
+ DEFAULT_OPTS.keys.each do |key|
+ define_method "#{key}=" do |*args|
+ @opts[key.to_sym] = *args
+ end
+ end
+
  #
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
  #
  def initialize(urls, opts = {})
- process_options opts
-
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
- @urls.each{ |url|
- url.path = '/' if url.path.empty?
- authorization(url) if url.user
- }
+ @urls.each{ |url| url.path = '/' if url.path.empty? }
 
  @tentacles = []
- @pages = PageHash.new
  @on_every_page_blocks = []
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
  @skip_link_patterns = []
  @after_crawl_blocks = []
+ @opts = opts
 
  yield self if block_given?
  end
@@ -77,7 +82,7 @@ module Anemone
  end
 
  #
- # Add a block to be executed on the PageHash after the crawl
+ # Add a block to be executed on the PageStore after the crawl
  # is finished
  #
  def after_crawl(&block)
@@ -129,6 +134,8 @@ module Anemone
  # Perform the crawl
  #
  def run
+ process_options
+
  @urls.delete_if { |url| !visit_link?(url) }
  return if @urls.empty?
 
@@ -139,81 +146,66 @@ module Anemone
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
  end
 
- @urls.each{ |url| link_queue.enq(url) }
+ @urls.each{ |url|
+ link_queue.enq(url)
+ authorization(url) if url.user
+ }
 
  loop do
  page = page_queue.deq
-
- @pages[page.url] = page
-
+ @pages.touch_key page.url
  puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
- # perform the on_every_page blocks for this page
- do_page_blocks(page)
-
+ do_page_blocks page
  page.discard_doc! if @opts[:discard_page_bodies]
 
- links_to_follow(page).each do |link|
- link_queue.enq([link, page])
- @pages[link] = nil
+ links = links_to_follow page
+ links.each do |link|
+ link_queue << [link, page.url.dup, page.depth + 1]
  end
+ @pages.touch_keys links
 
- # create an entry in the page hash for each alias of this page,
- # i.e. all the pages that redirected to this page
- page.aliases.each do |aka|
- if !@pages.has_key?(aka) or @pages[aka].nil?
- @pages[aka] = page.alias_clone(aka)
- end
- @pages[aka].add_alias!(page.url)
- end
+ @pages[page.url] = page
 
  # if we are done with the crawl, tell the threads to end
  if link_queue.empty? and page_queue.empty?
  until link_queue.num_waiting == @tentacles.size
  Thread.pass
  end
-
  if page_queue.empty?
- @tentacles.size.times { link_queue.enq(:END)}
+ @tentacles.size.times { link_queue << :END }
  break
  end
  end
-
  end
 
  @tentacles.each { |t| t.join }
-
- do_after_crawl_blocks()
-
+ do_after_crawl_blocks
  self
  end
 
  private
 
- def process_options(options)
- @opts = DEFAULT_OPTS.merge options
-
- authorization(@opts[:authorization])
-
+ def process_options
+ @opts = DEFAULT_OPTS.merge @opts
+ authorization(@opts[:authorization]) if @opts[:authorization]
  @opts[:threads] = 1 if @opts[:delay] > 0
-
+ @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
  @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
  end
 
  # Generate Authorization string only if not already set
  def authorization(auth=nil)
- return if @opts[:authorization] =~ /^Basic .*/
  require 'base64'
  if auth.is_a?(String) && auth.include?(':')
- @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+ self.authorization = "Basic #{Base64.b64encode(auth)}"
  elsif auth.is_a?(Array)
  user = auth.first
  password = auth.last
- @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+ self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
  elsif auth.is_a?(URI)
  user = auth.user
  password = auth.password
- @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+ self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
  end
  end
 
@@ -221,7 +213,7 @@ module Anemone
  # Execute the after_crawl blocks
  #
  def do_after_crawl_blocks
- @after_crawl_blocks.each {|b| b.call(@pages)}
+ @after_crawl_blocks.each { |b| b.call(@pages) }
  end
 
  #
@@ -233,9 +225,7 @@ module Anemone
  end
 
  @on_pages_like_blocks.each do |pattern, blks|
- if page.url.to_s =~ pattern
- blks.each { |blk| blk.call(page) }
- end
+ blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
  end
  end
 
@@ -246,7 +236,7 @@ module Anemone
  #
  def links_to_follow(page)
  links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
- links.select { |link| visit_link?(link, page) }
+ links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
  end
 
  #
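
One practical consequence of the authorization changes above: credentials can be supplied as a 'user:pass' String, as a two-element Array, or embedded in the start URL (the url.user check now happens in run). A sketch with placeholder credentials; per the authorization method in this diff, all three forms should end up as the same Basic Authorization header:

    require 'anemone'

    # Placeholder credentials; each form is normalized by Core#authorization above.
    Anemone.crawl("http://www.example.com/", :authorization => "user:secret")
    Anemone.crawl("http://www.example.com/", :authorization => ["user", "secret"])
    Anemone.crawl("http://user:secret@www.example.com/")
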
data/lib/anemone/http.rb CHANGED
@@ -12,54 +12,65 @@ module Anemone
  end
 
  #
- # Create a new Page from the response of an HTTP request to *url*
+ # Fetch a single Page from the response of an HTTP request to *url*.
+ # Just gets the final destination page.
  #
- def fetch_page(url, from_page = nil)
+ def fetch_page(url, referer = nil, depth = nil)
+ fetch_pages(url, referer, depth).last
+ end
+
+ #
+ # Create new Pages from the response of an HTTP request to *url*,
+ # including redirects
+ #
+ def fetch_pages(url, referer = nil, depth = nil)
  begin
  url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
+ pages = []
+ get(url, referer) do |response, code, location, redirect_to, response_time|
+ pages << Page.new(location, :body => response.body.dup,
+ :code => code,
+ :headers => response.to_hash,
+ :referer => referer,
+ :depth => depth,
+ :redirect_to => redirect_to,
+ :response_time => response_time)
  end
 
- response, code, location, response_time = get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ return pages
  rescue => e
  if verbose?
  puts e.inspect
  puts e.backtrace
  end
- return Page.new(url)
+ return [Page.new(url, :error => e)]
  end
  end
 
  private
 
  #
- # Retrieve an HTTP response for *url*, following redirects.
- # Returns the response object, response code, and final URI location.
+ # Retrieve HTTP responses for *url*, including redirects.
+ # Yields the response object, response code, and URI location
+ # for each response.
  #
  def get(url, referer = nil)
  response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
 
  limit = redirect_limit
  while response.is_a?(Net::HTTPRedirection) and limit > 0
- loc = URI(response['location'])
+ loc = redirect_to
  loc = url.merge(loc) if loc.relative?
  response, response_time = get_response(loc, referer)
+ code = Integer(response.code)
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+ yield response, code, loc, redirect_to, response_time
  limit -= 1
  end
-
- return response, code, loc, response_time
  end
 
  #
@@ -94,7 +105,7 @@ module Anemone
  return conn
  end
 
- refresh_connection(url)
+ refresh_connection url
  end
 
  def refresh_connection(url)
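
To illustrate the fetch_page / fetch_pages split above: with a single redirect hop, fetch_pages yields one Page per response, while fetch_page keeps only the final destination. A hedged sketch; the URLs are made up and Anemone::HTTP.new is assumed to accept an options hash as in 0.2.x:

    require 'anemone'

    http = Anemone::HTTP.new({})

    # Suppose /old 301-redirects to /new: fetch_pages returns a Page for /old
    # (with redirect_to set) followed by a Page for /new (with the real body).
    pages = http.fetch_pages("http://www.example.com/old")

    # fetch_page is now simply fetch_pages(...).last, the final destination.
    page = http.fetch_page("http://www.example.com/old")
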
data/lib/anemone/page.rb CHANGED
@@ -8,21 +8,21 @@ module Anemone
  attr_reader :url
  # Headers of the HTTP response
  attr_reader :headers
+ # URL of the page this one redirected to, if any
+ attr_reader :redirect_to
+ # Exception object, if one was raised during HTTP#fetch_page
+ attr_reader :error
+ # HTML body
+ attr_reader :body
 
  # OpenStruct for user-stored data
  attr_accessor :data
- # HTML body
- attr_accessor :body
- # Nokogiri document for the HTML body
- attr_accessor :doc
  # Integer response code of the page
  attr_accessor :code
- # Array of redirect-aliases for the page
- attr_accessor :aliases
- # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+ # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
  attr_accessor :visited
  # Depth of this page from the root of the crawl. This is not necessarily the
- # shortest path; use PageHash#shortest_paths! to find that value.
+ # shortest path; use PageStore#shortest_paths! to find that value.
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
@@ -32,18 +32,22 @@ module Anemone
  #
  # Create a new page
  #
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+ def initialize(url, params = {})
  @url = url
- @code = code
- @headers = headers || {}
- @headers['content-type'] ||= ['']
- @aliases = Array(aka)
  @data = OpenStruct.new
- @referer = referer
- @depth = depth || 0
- @response_time = response_time
- @body = body
- @doc = Nokogiri::HTML(body) if body && html? rescue nil
+
+ @code = params[:code]
+ @headers = params[:headers] || {}
+ @headers['content-type'] ||= ['']
+ @aliases = Array(params[:aka]).compact
+ @referer = params[:referer]
+ @depth = params[:depth] || 0
+ @redirect_to = to_absolute(params[:redirect_to])
+ @response_time = params[:response_time]
+ @body = params[:body]
+ @error = params[:error]
+
+ @fetched = !params[:code].nil?
  end
 
  # Array of distinct A tag HREFs from the page
@@ -62,42 +66,20 @@ module Anemone
  @links
  end
 
- def discard_doc!
- links # force parsing of page links before we trash the document
- @doc = nil
- end
-
- #
- # Return a new page with the same *response* and *url*, but
- # with a 200 response code
- #
- def alias_clone(url)
- p = clone
- p.add_alias!(@aka) if !@aka.nil?
- p.code = 200
- p
+ # Nokogiri document for the HTML body
+ def doc
+ return @doc if @doc
+ @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
  end
 
- #
- # Add a redirect-alias String *aka* to the list of the page's aliases
- #
- # Returns *self*
- #
- def add_alias!(aka)
- @aliases << aka if !@aliases.include?(aka)
- self
+ # Delete the Nokogiri document and response body to conserve memory
+ def discard_doc!
+ links # force parsing of page links before we trash the document
+ @doc = @body = nil
  end
 
- #
- # Returns an Array of all links from this page, and all the
- # redirect-aliases of those pages, as String objects.
- #
- # *page_hash* is a PageHash object with the results of the current crawl.
- #
- def links_and_their_aliases(page_hash)
- links.inject([]) do |results, link|
- results.concat([link].concat(page_hash[link].aliases))
- end
+ def fetched?
+ @fetched
  end
 
  #
@@ -136,6 +118,8 @@ module Anemone
  # location of the page
  #
  def to_absolute(link)
+ return nil if link.nil?
+
  # remove anchor
  link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
 
@@ -154,5 +138,14 @@ module Anemone
  def in_domain?(uri)
  uri.host == @url.host
  end
+
+ def marshal_dump
+ [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+ end
+
+ def marshal_load(ary)
+ @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+ end
+
  end
  end
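
The marshal_dump / marshal_load pair added above is what lets a Page survive serialization: the Nokogiri document is deliberately left out of the dump and rebuilt lazily by the new doc method. A round-trip sketch under assumed inputs; the URL, body, and headers are placeholders:

    require 'anemone'

    page = Anemone::Page.new(URI('http://www.example.com/'),
                             :code    => 200,
                             :body    => '<html><body><a href="/next">next</a></body></html>',
                             :headers => { 'content-type' => ['text/html'] })

    copy = Marshal.load(Marshal.dump(page))   # only the fields listed in marshal_dump survive
    copy.fetched?   # => true, because a :code was supplied
    copy.doc        # @doc was not dumped; it is rebuilt from @body on first access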