spk-anemone 0.2.4 → 0.3.0
This diff shows the content of publicly released package versions as published to their respective public registries, and is provided for informational purposes only.
- data/CHANGELOG.rdoc +10 -0
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +43 -53
- data/lib/anemone/http.rb +32 -21
- data/lib/anemone/page.rb +43 -50
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +10 -5
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,13 @@
+== 0.3.0 / 2009-12-15
+
+* Major enhancements
+
+  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+* Minor enhancements
+
+  * Options can be set via methods on the Core object in the crawl block
+
 == 0.2.4 / 2009-11-26
 
 * Minor enhancements
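
Taken together, the 0.3.0 additions are used like this. A minimal sketch, assuming the Anemone::Storage factory methods added in data/lib/anemone/storage.rb (listed in the file summary above; the URL and filename are illustrative):

    require 'anemone'

    Anemone.crawl("http://www.example.com") do |anemone|
      # minor enhancement: options are now plain setters inside the crawl block
      anemone.verbose = true
      # major enhancement: persist pages to disk instead of an in-memory Hash
      anemone.storage = Anemone::Storage.PStore('crawl.pstore')
    end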
data/README.rdoc
CHANGED
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more
+* Obey robots.txt
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
 == Examples
 See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
data/lib/anemone/cli/serialize.rb
CHANGED
@@ -12,10 +12,10 @@ Usage:
 
 Synopsis:
   Crawls a site starting at the given URL and saves the resulting
-  PageHash object to a file using Marshal serialization.
+  PageStore object to a file using Marshal serialization.
 
 Options:
-  -o, --output filename   Filename to save PageHash to. Defaults to crawl.{Time.now}
+  -o, --output filename   Filename to save PageStore to. Defaults to crawl.{Time.now}
 INFO
 exit(0)
 end
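
Since the serialized object is now a PageStore rather than a PageHash, a dump written by this CLI task can be read back with Marshal. A sketch (the filename is hypothetical; the default is timestamped as noted above):

    # reload a crawl saved with the serialize CLI task
    page_store = Marshal.load(File.read('crawl.dump'))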
data/lib/anemone/core.rb
CHANGED
@@ -2,11 +2,12 @@ require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
-require 'anemone/page_hash'
+require 'anemone/page_store'
+require 'anemone/storage'
 
 module Anemone
 
-  VERSION = '0.2.4';
+  VERSION = '0.3.0';
 
   #
   # Convenience method to start a crawl
@@ -16,11 +17,11 @@ module Anemone
   end
 
   class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
 
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages
     # Hash of options for the crawl
-    attr_accessor :opts
+    attr_reader :opts
 
     DEFAULT_OPTS = {
       # run 4 Tentacle threads to fetch pages
@@ -39,29 +40,33 @@ module Anemone
       :depth_limit => false,
       # number of times HTTP redirects will be followed
       :redirect_limit => 5,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil,
       # Authentication
       :authorization => nil,
     }
 
+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |*args|
+        @opts[key.to_sym] = *args
+      end
+    end
+
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
     #
     def initialize(urls, opts = {})
-      process_options opts
-
       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
-      @urls.each{ |url|
-        url.path = '/' if url.path.empty?
-        authorization(url) if url.user
-      }
+      @urls.each{ |url| url.path = '/' if url.path.empty? }
 
       @tentacles = []
-      @pages = PageHash.new
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []
+      @opts = opts
 
       yield self if block_given?
     end
@@ -77,7 +82,7 @@ module Anemone
     end
 
     #
-    # Add a block to be executed on the PageHash after the crawl
+    # Add a block to be executed on the PageStore after the crawl
     # is finished
     #
     def after_crawl(&block)
@@ -129,6 +134,8 @@ module Anemone
     # Perform the crawl
     #
     def run
+      process_options
+
       @urls.delete_if { |url| !visit_link?(url) }
       return if @urls.empty?
 
@@ -139,81 +146,66 @@ module Anemone
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
       end
 
-      @urls.each{ |url| link_queue.enq(url) }
+      @urls.each{ |url|
+        link_queue.enq(url)
+        authorization(url) if url.user
+      }
 
       loop do
         page = page_queue.deq
-
-        @pages[page.url] = page
-
+        @pages.touch_key page.url
         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
-        # perform the on_every_page blocks for this page
-        do_page_blocks(page)
-
+        do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]
 
-        links_to_follow(page).each do |link|
-          link_queue.enq([link, page])
-          @pages[link] = nil
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
         end
+        @pages.touch_keys links
 
-        # create an entry in the page hash for each alias of this page,
-        # i.e. all the pages that redirected to this page
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
-        end
+        @pages[page.url] = page
 
         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-
           if page_queue.empty?
-            @tentacles.size.times { link_queue.enq(:END) }
+            @tentacles.size.times { link_queue << :END }
             break
           end
         end
-
       end
 
       @tentacles.each { |t| t.join }
-
-      do_after_crawl_blocks()
-
+      do_after_crawl_blocks
      self
    end
 
    private
 
-    def process_options(options)
-      @opts = DEFAULT_OPTS.merge(options)
-
-      authorization(@opts[:authorization])
-
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
+      authorization(@opts[:authorization]) if @opts[:authorization]
       @opts[:threads] = 1 if @opts[:delay] > 0
-
+      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
    end
 
    # Generate Authorization string only if not already set
    def authorization(auth=nil)
-      return if @opts[:authorization] =~ /^Basic .*/
      require 'base64'
      if auth.is_a?(String) && auth.include?(':')
-        @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+        self.authorization = "Basic #{Base64.b64encode(auth)}"
      elsif auth.is_a?(Array)
        user = auth.first
        password = auth.last
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
      elsif auth.is_a?(URI)
        user = auth.user
        password = auth.password
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
      end
    end
 
@@ -221,7 +213,7 @@ module Anemone
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
+      @after_crawl_blocks.each { |b| b.call(@pages) }
    end
 
    #
@@ -233,9 +225,7 @@ module Anemone
      end
 
      @on_pages_like_blocks.each do |pattern, blks|
-        if page.url.to_s =~ pattern
-          blks.each { |blk| blk.call(page) }
-        end
+        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
      end
    end
 
@@ -246,7 +236,7 @@ module Anemone
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link, page) }
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end
 
    #
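
Because the generated setters write into @opts and process_options now runs at the start of run rather than in initialize, options assigned inside the crawl block are merged over DEFAULT_OPTS before any page is fetched. A sketch of the effect (the URL and option values are illustrative only):

    Anemone.crawl("http://www.example.com") do |anemone|
      anemone.threads = 2       # calls the setter generated from DEFAULT_OPTS.keys
      anemone.depth_limit = 3
      anemone.delay = 1         # process_options will then force threads back to 1
    end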
data/lib/anemone/http.rb
CHANGED
@@ -12,54 +12,65 @@ module Anemone
     end
 
     #
-    # Create a new Page from the response of an HTTP request to *url*
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
     #
-    def fetch_page(url, from_page = nil)
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
       begin
         url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
+        pages = []
+        get(url, referer) do |response, code, location, redirect_to, response_time|
+          pages << Page.new(location, :body => response.body.dup,
+                                      :code => code,
+                                      :headers => response.to_hash,
+                                      :referer => referer,
+                                      :depth => depth,
+                                      :redirect_to => redirect_to,
+                                      :response_time => response_time)
         end
 
-        response, code, location, response_time = get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+        return pages
       rescue => e
         if verbose?
           puts e.inspect
           puts e.backtrace
         end
-        return Page.new(url)
+        return [Page.new(url, :error => e)]
       end
     end
 
     private
 
     #
-    # Retrieve an HTTP response for *url*, following redirects.
-    # Returns the response object, response code, and final URI location.
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
     #
     def get(url, referer = nil)
       response, response_time = get_response(url, referer)
       code = Integer(response.code)
       loc = url
+      redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+      yield response, code, loc, redirect_to, response_time
 
       limit = redirect_limit
       while response.is_a?(Net::HTTPRedirection) and limit > 0
-        loc = URI(response['location'])
+        loc = redirect_to
         loc = url.merge(loc) if loc.relative?
         response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+        yield response, code, loc, redirect_to, response_time
         limit -= 1
       end
-
-      return response, code, loc, response_time
     end
 
     #
@@ -94,7 +105,7 @@ module Anemone
         return conn
       end
 
-      refresh_connection(url)
+      refresh_connection url
     end
 
     def refresh_connection(url)
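
Each redirect hop now yields its own Page, linked forward by redirect_to, and fetch_page simply returns the last element of that chain. A sketch of the resulting behavior (the URLs and the 301 response are hypothetical):

    http = Anemone::HTTP.new
    pages = http.fetch_pages('http://example.com/old')   # /old 301s to /new
    pages.map { |page| [page.code, page.url.to_s] }
    # => [[301, "http://example.com/old"], [200, "http://example.com/new"]]
    pages.first.redirect_to                              # URI for /new
    http.fetch_page('http://example.com/old').code       # => 200, the final page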
data/lib/anemone/page.rb
CHANGED
@@ -8,21 +8,21 @@ module Anemone
     attr_reader :url
     # Headers of the HTTP response
     attr_reader :headers
+    # URL of the page this one redirected to, if any
+    attr_reader :redirect_to
+    # Exception object, if one was raised during HTTP#fetch_page
+    attr_reader :error
+    # HTML body
+    attr_reader :body
 
     # OpenStruct for user-stored data
     attr_accessor :data
-    # HTML body
-    attr_accessor :body
-    # Nokogiri document for the HTML body
-    attr_accessor :doc
     # Integer response code of the page
     attr_accessor :code
-    # Array of redirect-aliases for the page
-    attr_accessor :aliases
-    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
     attr_accessor :visited
     # Depth of this page from the root of the crawl. This is not necessarily the
-    # shortest path; use PageHash#shortest_paths! to find that value.
+    # shortest path; use PageStore#shortest_paths! to find that value.
     attr_accessor :depth
     # URL of the page that brought us to this page
     attr_accessor :referer
@@ -32,18 +32,22 @@ module Anemone
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+    def initialize(url, params = {})
       @url = url
-      @code = code
-      @headers = headers || {}
-      @headers['content-type'] ||= ['']
-      @aliases = Array(aka)
       @data = OpenStruct.new
-
-      @referer = referer
-      @depth = depth || 0
-      @response_time = response_time
-      @body = body
+
+      @code = params[:code]
+      @headers = params[:headers] || {}
+      @headers['content-type'] ||= ['']
+      @aliases = Array(params[:aka]).compact
+      @referer = params[:referer]
+      @depth = params[:depth] || 0
+      @redirect_to = to_absolute(params[:redirect_to])
+      @response_time = params[:response_time]
+      @body = params[:body]
+      @error = params[:error]
+
+      @fetched = !params[:code].nil?
     end
 
     # Array of distinct A tag HREFs from the page
@@ -62,42 +66,20 @@ module Anemone
       @links
     end
 
-    def discard_doc!
-      links # force parsing of page links before we trash the document
-      @doc = nil
-    end
-
-    #
-    # Return a new page with the same *response* and *url*, but
-    # with a 200 response code
-    #
-    def alias_clone(url)
-      p = clone
-      p.add_alias!(@aka) if !@aka.nil?
-      p.code = 200
-      p
+    # Nokogiri document for the HTML body
+    def doc
+      return @doc if @doc
+      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
     end
 
-    #
-    # Add a redirect-alias String *aka* to the list of the page's aliases
-    #
-    # Returns *self*
-    #
-    def add_alias!(aka)
-      @aliases << aka if !@aliases.include?(aka)
-      self
+    # Delete the Nokogiri document and response body to conserve memory
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = @body = nil
     end
 
-    #
-    # Returns an Array of all links from this page, and all the
-    # redirect-aliases of those pages, as String objects.
-    #
-    # *page_hash* is a PageHash object with the results of the current crawl.
-    #
-    def links_and_their_aliases(page_hash)
-      links.inject([]) do |results, link|
-        results.concat([link].concat(page_hash[link].aliases))
-      end
+    def fetched?
+      @fetched
     end
 
     #
@@ -136,6 +118,8 @@ module Anemone
     # location of the page
     #
     def to_absolute(link)
+      return nil if link.nil?
+
       # remove anchor
       link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
 
@@ -154,5 +138,14 @@ module Anemone
     def in_domain?(uri)
       uri.host == @url.host
     end
+
+    def marshal_dump
+      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+    end
+
+    def marshal_load(ary)
+      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+    end
+
   end
 end
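
The explicit marshal_dump/marshal_load pair is what makes the new persistent storage engines workable: only plain state goes into the dump (notably not the memoized Nokogiri document, which cannot be marshaled), and doc is re-parsed lazily from the stored body after loading. A sketch of the round trip, assuming *page* is a Page fetched during a crawl:

    data = Marshal.dump(page)    # serializes the 12-element state array above
    copy = Marshal.load(data)
    copy.fetched?                # => true, restored from the dumped @fetched flag
    copy.doc                     # re-built from @body on first access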