spk-anemone 0.2.4 → 0.3.0
- data/CHANGELOG.rdoc +10 -0
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +43 -53
- data/lib/anemone/http.rb +32 -21
- data/lib/anemone/page.rb +43 -50
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +10 -5
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,13 @@
+== 0.3.0 / 2009-12-15
+
+* Major enhancements
+
+  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+* Minor enhancements
+
+  * Options can be set via methods on the Core object in the crawl block
+
 == 0.2.4 / 2009-11-26
 
 * Minor enhancements
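Taken together, these two entries change how a crawl is configured. Below is a minimal sketch of the new usage; Anemone::Storage.PStore is assumed to be a factory method taking a file path, by analogy with the Anemone::Storage.Hash call that appears in the core.rb diff further down (storage.rb is a new file whose body this diff does not show).

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # minor enhancement: options set via methods on the Core object
    anemone.threads     = 2
    anemone.depth_limit = 3

    # major enhancement: persistent storage of pages during the crawl
    # (assumed factory; the PStore file path is hypothetical)
    anemone.storage = Anemone::Storage.PStore("crawl.pstore")

    anemone.on_every_page { |page| puts page.url }
  end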
data/README.rdoc
CHANGED
@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more
+* Obey robots.txt
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
 == Examples
 See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
data/lib/anemone/cli/serialize.rb
CHANGED
@@ -12,10 +12,10 @@ Usage:
 
 Synopsis:
   Crawls a site starting at the given URL and saves the resulting
-  PageHash object to a file using Marshal serialization.
+  PageStore object to a file using Marshal serialization.
 
 Options:
-  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+  -o, --output filename    Filename to save PageStore to. Defaults to crawl.{Time.now}
 INFO
 exit(0)
 end
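Because the script just marshals the PageStore, a saved crawl can be read back with a plain Marshal round trip. A small sketch, with a hypothetical output filename standing in for whatever --output (or the crawl.{Time.now} default) produced, and assuming PageStore keeps the [] reader that Core uses in the diff below:

  require 'anemone'

  page_store = Marshal.load(File.read("crawl.20091215"))  # hypothetical filename
  puts page_store[URI("http://www.example.com/")].code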
data/lib/anemone/core.rb
CHANGED
@@ -2,11 +2,12 @@ require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
-require 'anemone/page_hash'
+require 'anemone/page_store'
+require 'anemone/storage'
 
 module Anemone
 
-  VERSION = '0.2.4';
+  VERSION = '0.3.0';
 
   #
   # Convenience method to start a crawl
@@ -16,11 +17,11 @@ module Anemone
   end
 
   class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
 
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages
     # Hash of options for the crawl
-    attr_accessor :opts
+    attr_reader :opts
 
     DEFAULT_OPTS = {
       # run 4 Tentacle threads to fetch pages
@@ -39,29 +40,33 @@ module Anemone
       :depth_limit => false,
       # number of times HTTP redirects will be followed
       :redirect_limit => 5,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil,
       # Authentication
       :authorization => nil,
     }
 
+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |*args|
+        @opts[key.to_sym] = *args
+      end
+    end
+
    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
-      process_options opts
-
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
-      @urls.each{ |url|
-        url.path = '/' if url.path.empty?
-        authorization(url) if url.user
-      }
+      @urls.each{ |url| url.path = '/' if url.path.empty? }
 
      @tentacles = []
-      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
+      @opts = opts
 
      yield self if block_given?
    end
@@ -77,7 +82,7 @@ module Anemone
    end
 
    #
-    # Add a block to be executed on the PageHash after the crawl
+    # Add a block to be executed on the PageStore after the crawl
    # is finished
    #
    def after_crawl(&block)
@@ -129,6 +134,8 @@ module Anemone
    # Perform the crawl
    #
    def run
+      process_options
+
      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?
 
@@ -139,81 +146,66 @@ module Anemone
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end
 
-      @urls.each{ |url| link_queue.enq(url) }
+      @urls.each{ |url|
+        link_queue.enq(url)
+        authorization(url) if url.user
+      }
 
      loop do
        page = page_queue.deq
-
-        @pages[page.url] = page
-
+        @pages.touch_key page.url
        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
-        # perform the on_every_page blocks for this page
-        do_page_blocks(page)
-
+        do_page_blocks page
        page.discard_doc! if @opts[:discard_page_bodies]
 
-        links_to_follow(page).each do |link|
-          link_queue.enq([link, page])
-          @pages[link] = nil
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
        end
+        @pages.touch_keys links
 
-        # create an entry in the page hash for each alias of this page,
-        # i.e. all the pages that redirected to this page
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
-        end
+        @pages[page.url] = page
 
        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end
-
          if page_queue.empty?
-            @tentacles.size.times { link_queue.enq(:END) }
+            @tentacles.size.times { link_queue << :END }
            break
          end
        end
-
      end
 
      @tentacles.each { |t| t.join }
-
-      do_after_crawl_blocks()
-
+      do_after_crawl_blocks
      self
    end
 
    private
 
-    def process_options(options)
-      @opts = DEFAULT_OPTS.merge(options)
-
-      authorization(@opts[:authorization])
-
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
+      authorization(@opts[:authorization]) if @opts[:authorization]
      @opts[:threads] = 1 if @opts[:delay] > 0
-
+      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
    end
 
    # Generate Authorization string only if not already set
    def authorization(auth=nil)
-      return if @opts[:authorization] =~ /^Basic .*/
      require 'base64'
      if auth.is_a?(String) && auth.include?(':')
-        @opts[:authorization] = "Basic #{Base64.b64encode(auth)}"
+        self.authorization = "Basic #{Base64.b64encode(auth)}"
      elsif auth.is_a?(Array)
        user = auth.first
        password = auth.last
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
      elsif auth.is_a?(URI)
        user = auth.user
        password = auth.password
-        @opts[:authorization] = "Basic #{Base64.b64encode(user+":"+password)}"
+        self.authorization = "Basic #{Base64.b64encode(user+":"+password)}"
      end
    end
 
@@ -221,7 +213,7 @@ module Anemone
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
+      @after_crawl_blocks.each { |b| b.call(@pages) }
    end
 
    #
@@ -233,9 +225,7 @@ module Anemone
      end
 
      @on_pages_like_blocks.each do |pattern, blks|
-        if page.url.to_s =~ pattern
-          blks.each { |blk| blk.call(page) }
-        end
+        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
      end
    end
 
@@ -246,7 +236,7 @@ module Anemone
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link, page) }
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
    end
 
    #
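The behavioral pivot in this file is that process_options moved from initialize to run: the crawl block now executes in between, so the setters generated from DEFAULT_OPTS.keys write into @opts before the defaults are merged underneath and before @pages and @robots are built from the final values. In practice the options hash and the block setters are interchangeable, as in this sketch (the user_agent value is made up for illustration):

  Anemone.crawl("http://www.example.com/", :verbose => true) do |anemone|
    anemone.verbose    = true             # same effect as the hash key above
    anemone.user_agent = "MyCrawler/1.0"  # every DEFAULT_OPTS key gets a setter
  end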
data/lib/anemone/http.rb
CHANGED
@@ -12,54 +12,65 @@ module Anemone
    end
 
    #
-    # Create a new Page from the response of an HTTP request to *url*
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
    #
-    def fetch_page(url, from_page = nil)
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
      begin
        url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
+        pages = []
+        get(url, referer) do |response, code, location, redirect_to, response_time|
+          pages << Page.new(location, :body => response.body.dup,
+                                      :code => code,
+                                      :headers => response.to_hash,
+                                      :referer => referer,
+                                      :depth => depth,
+                                      :redirect_to => redirect_to,
+                                      :response_time => response_time)
        end
 
-        response, code, location, response_time = get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+        return pages
      rescue => e
        if verbose?
          puts e.inspect
          puts e.backtrace
        end
-        return Page.new(url)
+        return [Page.new(url, :error => e)]
      end
    end
 
    private
 
    #
-    # Retrieve an HTTP response for *url*, following redirects.
-    # Returns the response object, response code, and final URI location.
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
    #
    def get(url, referer = nil)
      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url
+      redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+      yield response, code, loc, redirect_to, response_time
 
      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) and limit > 0
-        loc = URI(response['location'])
+        loc = redirect_to
        loc = url.merge(loc) if loc.relative?
        response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+        yield response, code, loc, redirect_to, response_time
        limit -= 1
      end
-
-      return response, code, loc, response_time
    end
 
    #
@@ -94,7 +105,7 @@ module Anemone
        return conn
      end
 
-      refresh_connection(url)
+      refresh_connection url
    end
 
    def refresh_connection(url)
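The refactor trades the old single-Page return and its aka alias bookkeeping for one Page per response in a redirect chain; fetch_page keeps its old shape by taking the last element. A sketch of the result for a URL that 301-redirects once (the /old and /new paths are hypothetical, and HTTP.new is assumed to be constructible with default options, which this diff does not show):

  http  = Anemone::HTTP.new
  pages = http.fetch_pages("http://www.example.com/old")

  pages.map { |p| [p.url.to_s, p.code, p.redirect_to && p.redirect_to.to_s] }
  # => [["http://www.example.com/old", 301, "http://www.example.com/new"],
  #     ["http://www.example.com/new", 200, nil]]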
data/lib/anemone/page.rb
CHANGED
@@ -8,21 +8,21 @@ module Anemone
    attr_reader :url
    # Headers of the HTTP response
    attr_reader :headers
+    # URL of the page this one redirected to, if any
+    attr_reader :redirect_to
+    # Exception object, if one was raised during HTTP#fetch_page
+    attr_reader :error
+    # HTML body
+    attr_reader :body
 
    # OpenStruct for user-stored data
    attr_accessor :data
-    # HTML body
-    attr_accessor :body
-    # Nokogiri document for the HTML body
-    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
-    # Array of redirect-aliases for the page
-    attr_accessor :aliases
-    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
+    # Boolean indicating whether or not this page has been visited in PageStore#shortest_paths!
    attr_accessor :visited
    # Depth of this page from the root of the crawl. This is not necessarily the
-    # shortest path; use PageHash#shortest_paths! to find that value.
+    # shortest path; use PageStore#shortest_paths! to find that value.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
@@ -32,18 +32,22 @@ module Anemone
    #
    # Create a new page
    #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
+    def initialize(url, params = {})
      @url = url
-      @code = code
-      @headers = headers || {}
-      @headers['content-type'] ||= ['']
-      @aliases = Array(aka)
      @data = OpenStruct.new
-
-      @referer = referer
-      @depth = depth || 0
-      @response_time = response_time
-      @doc = Nokogiri::HTML(body) if body && html? rescue nil
+
+      @code = params[:code]
+      @headers = params[:headers] || {}
+      @headers['content-type'] ||= ['']
+      @aliases = Array(params[:aka]).compact
+      @referer = params[:referer]
+      @depth = params[:depth] || 0
+      @redirect_to = to_absolute(params[:redirect_to])
+      @response_time = params[:response_time]
+      @body = params[:body]
+      @error = params[:error]
+
+      @fetched = !params[:code].nil?
    end
 
    # Array of distinct A tag HREFs from the page
@@ -62,42 +66,20 @@ module Anemone
      @links
    end
 
-    def discard_doc!
-      links # force parsing of page links before we trash the document
-      @doc = nil
-    end
-
-    #
-    # Return a new page with the same *response* and *url*, but
-    # with a 200 response code
-    #
-    def alias_clone(url)
-      p = clone
-      p.add_alias!(@aka) if !@aka.nil?
-      p.code = 200
-      p
+    # Nokogiri document for the HTML body
+    def doc
+      return @doc if @doc
+      @doc = Nokogiri::HTML(@body) if @body && html? rescue nil
    end
 
-    #
-    # Add a redirect-alias String *aka* to the list of the page's aliases
-    #
-    # Returns *self*
-    #
-    def add_alias!(aka)
-      @aliases << aka if !@aliases.include?(aka)
-      self
+    # Delete the Nokogiri document and response body to conserve memory
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = @body = nil
    end
 
-    #
-    # Return an Array of all links from this page, and all the
-    # redirect-aliases of those pages, as String objects.
-    #
-    # *page_hash* is a PageHash object with the results of the current crawl.
-    #
-    def links_and_their_aliases(page_hash)
-      links.inject([]) do |results, link|
-        results.concat([link].concat(page_hash[link].aliases))
-      end
+    def fetched?
+      @fetched
    end
 
    #
@@ -136,6 +118,8 @@ module Anemone
    # location of the page
    #
    def to_absolute(link)
+      return nil if link.nil?
+
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
 
@@ -154,5 +138,14 @@ module Anemone
    def in_domain?(uri)
      uri.host == @url.host
    end
+
+    def marshal_dump
+      [@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched]
+    end
+
+    def marshal_load(ary)
+      @url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
+    end
+
  end
end
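The marshal_dump/marshal_load pair added at the end is what makes the new PStore and TokyoCabinet back ends workable: a Page now survives serialization, and the Nokogiri document is deliberately left out of the dumped array so that the lazy doc method re-parses it from @body on the way back in. A minimal round trip (headers are supplied on the assumption that html? checks the content-type header, which this diff does not show):

  require 'anemone'

  page = Anemone::Page.new(URI("http://www.example.com/"),
                           :body    => "<html><a href='/next'>next</a></html>",
                           :code    => 200,
                           :headers => { 'content-type' => ['text/html'] })

  copy = Marshal.load(Marshal.dump(page))
  copy.fetched?  # => true; @fetched is the last slot in the dump array
  copy.doc       # @doc was not dumped; rebuilt here from @body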