anemone 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +11 -1
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +58 -66
- data/lib/anemone/http.rb +39 -28
- data/lib/anemone/page.rb +53 -59
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +9 -3
data/CHANGELOG.rdoc CHANGED

@@ -1,3 +1,13 @@
+== 0.3.0 / 2009-12-15
+
+* Major enchancements
+
+  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+* Minor enhancements
+
+  * Options can be set via methods on the Core object in the crawl block
+
 == 0.2.3 / 2009-11-01
 
 * Minor enhancements
@@ -24,4 +34,4 @@
 * Minor enhancements
 
   * HTTP request response time recorded in Page.
-  * Use of persistent HTTP connections.
+  * Use of persistent HTTP connections.
data/README.rdoc CHANGED

@@ -15,6 +15,8 @@ See http://anemone.rubyforge.org for more information.
 * HTTPS support
 * Records response time for each page
 * CLI program can list all pages in a domain, calculate page depths, and more
+* Obey robots.txt
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
 
 == Examples
 See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
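The two new README features correspond to the `:storage` option wired into core.rb below. A minimal usage sketch, assuming factory-style `Anemone::Storage.PStore` and `Anemone::Storage.TokyoCabinet` constructors inferred from the `Anemone::Storage.Hash` call in core.rb (storage.rb is added in this release, but its contents are not shown in this diff):

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # persist pages to disk during the crawl instead of holding them in memory
    anemone.storage = Anemone::Storage.PStore("crawl.pstore")
    # or, if Tokyo Cabinet is installed:
    # anemone.storage = Anemone::Storage.TokyoCabinet("crawl.tch")
  end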
data/lib/anemone/cli/serialize.rb CHANGED

@@ -12,10 +12,10 @@ Usage:
 
 Synopsis:
   Crawls a site starting at the given URL and saves the resulting
-
+  PageStore object to a file using Marshal serialization.
 
 Options:
-  -o, --output filename    Filename to save
+  -o, --output filename    Filename to save PageStore to. Defaults to crawl.{Time.now}
 INFO
 exit(0)
 end
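Because the CLI now dumps a PageStore with Marshal, a saved crawl can be reloaded later. A hedged sketch; the filename is hypothetical (it is whatever -o/--output was set to, or the crawl.{Time.now} default), and page_store.rb itself is not shown in this diff:

  require 'anemone'

  # load a previously serialized PageStore back into memory
  page_store = Marshal.load(File.read("crawl.1260864000"))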
data/lib/anemone/core.rb CHANGED

@@ -2,25 +2,26 @@ require 'thread'
 require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
-require 'anemone/
+require 'anemone/page_store'
+require 'anemone/storage'
 
 module Anemone
 
-  VERSION = '0.
+  VERSION = '0.3.0';
 
   #
   # Convenience method to start a crawl
   #
   def Anemone.crawl(urls, options = {}, &block)
     Core.crawl(urls, options, &block)
-  end
+  end
 
   class Core
-    # PageHash storing all Page objects encountered during the crawl
-    attr_reader :pages
 
+    # PageStore storing all Page objects encountered during the crawl
+    attr_reader :pages
     # Hash of options for the crawl
-
+    attr_reader :opts
 
     DEFAULT_OPTS = {
       # run 4 Tentacle threads to fetch pages
@@ -38,9 +39,18 @@ module Anemone
       # by default, don't limit the depth of the crawl
       :depth_limit => false,
       # number of times HTTP redirects will be followed
-      :redirect_limit => 5
+      :redirect_limit => 5,
+      # storage engine defaults to Hash in +process_options+ if none specified
+      :storage => nil
     }
 
+    # Create setter methods for all options to be called from the crawl block
+    DEFAULT_OPTS.keys.each do |key|
+      define_method "#{key}=" do |*args|
+        @opts[key.to_sym] = *args
+      end
+    end
+
     #
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
     # and optional *block*
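These generated setters are what the changelog's "options can be set via methods" entry refers to: each DEFAULT_OPTS key becomes a writer on Core. A usage sketch using option names visible in this diff:

  Anemone.crawl("http://www.example.com/") do |anemone|
    # instead of passing :redirect_limit => 3 in the options hash
    anemone.redirect_limit = 3
    anemone.depth_limit    = 2
    anemone.on_every_page { |page| puts page.url }
  end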
@@ -50,17 +60,15 @@ module Anemone
       @urls.each{ |url| url.path = '/' if url.path.empty? }
 
       @tentacles = []
-      @pages = PageHash.new
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
       @after_crawl_blocks = []
-
-      process_options opts
+      @opts = opts
 
       yield self if block_given?
     end
-
+
     #
     # Convenience method to start a new crawl
     #
@@ -70,16 +78,16 @@ module Anemone
         core.run
       end
     end
-
+
     #
-    # Add a block to be executed on the
+    # Add a block to be executed on the PageStore after the crawl
     # is finished
     #
     def after_crawl(&block)
       @after_crawl_blocks << block
       self
     end
-
+
     #
     # Add one ore more Regex patterns for URLs which should not be
     # followed
@@ -88,7 +96,7 @@ module Anemone
       @skip_link_patterns.concat [patterns].flatten.compact
       self
     end
-
+
     #
     # Add a block to be executed on every Page as they are encountered
     # during the crawl
@@ -97,7 +105,7 @@ module Anemone
       @on_every_page_blocks << block
       self
     end
-
+
     #
     # Add a block to be executed on Page objects with a URL matching
     # one or more patterns
@@ -110,7 +118,7 @@ module Anemone
       end
       self
     end
-
+
     #
     # Specify a block which will select which links to follow on each page.
     # The block should return an Array of URI objects.
@@ -119,77 +127,63 @@ module Anemone
       @focus_crawl_block = block
       self
     end
-
+
     #
     # Perform the crawl
     #
     def run
+      process_options
+
       @urls.delete_if { |url| !visit_link?(url) }
       return if @urls.empty?
-
+
       link_queue = Queue.new
       page_queue = Queue.new
 
       @opts[:threads].times do
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
       end
-
+
       @urls.each{ |url| link_queue.enq(url) }
 
       loop do
         page = page_queue.deq
-
-        @pages[page.url] = page
-
+        @pages.touch_key page.url
         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
-
-        # perform the on_every_page blocks for this page
-        do_page_blocks(page)
-
+        do_page_blocks page
         page.discard_doc! if @opts[:discard_page_bodies]
-
-        links_to_follow
-
-
-        end
-
-        # create an entry in the page hash for each alias of this page,
-        # i.e. all the pages that redirected to this page
-        page.aliases.each do |aka|
-          if !@pages.has_key?(aka) or @pages[aka].nil?
-            @pages[aka] = page.alias_clone(aka)
-          end
-          @pages[aka].add_alias!(page.url)
+
+        links = links_to_follow page
+        links.each do |link|
+          link_queue << [link, page.url.dup, page.depth + 1]
         end
-
+        @pages.touch_keys links
+
+        @pages[page.url] = page
+
         # if we are done with the crawl, tell the threads to end
         if link_queue.empty? and page_queue.empty?
           until link_queue.num_waiting == @tentacles.size
             Thread.pass
           end
-
           if page_queue.empty?
-            @tentacles.size.times { link_queue
+            @tentacles.size.times { link_queue << :END }
             break
           end
         end
-
       end
 
       @tentacles.each { |t| t.join }
-
-      do_after_crawl_blocks()
-
+      do_after_crawl_blocks
       self
     end
-
-    private
 
-
-      @opts = DEFAULT_OPTS.merge options
+    private
 
+    def process_options
+      @opts = DEFAULT_OPTS.merge @opts
       @opts[:threads] = 1 if @opts[:delay] > 0
-
+      @pages = PageStore.new(@opts[:storage] || Anemone::Storage.Hash)
       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
     end
 
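In the new loop, `touch_key`/`touch_keys` replace the old direct hash writes: a URL is marked as seen the moment it is dequeued or its links are enqueued, so the `has_page?` test in `visit_link?` rejects duplicates before the page body ever arrives. page_store.rb is not shown in this diff, so the following is only a plausible minimal sketch of those two methods, assuming they reserve the key with a placeholder Page:

  # Hypothetical sketch -- not necessarily the actual PageStore implementation
  class PageStore
    def touch_key(key)
      self[key] = Page.new(key)   # reserve the slot so has_page?(key) is true
    end

    def touch_keys(keys)
      keys.each { |key| touch_key(key) }
    end
  end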
@@ -197,9 +191,9 @@ module Anemone
     # Execute the after_crawl blocks
     #
     def do_after_crawl_blocks
-      @after_crawl_blocks.each {|b| b.call(@pages)}
+      @after_crawl_blocks.each { |b| b.call(@pages) }
     end
-
+
     #
     # Execute the on_every_page blocks for *page*
     #
@@ -207,14 +201,12 @@ module Anemone
       @on_every_page_blocks.each do |blk|
         blk.call(page)
       end
-
+
       @on_pages_like_blocks.each do |pattern, blks|
-        if page.url.to_s =~ pattern
-          blks.each { |blk| blk.call(page) }
-        end
+        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
       end
-    end
-
+    end
+
     #
     # Return an Array of links to follow from the given page.
     # Based on whether or not the link has already been crawled,
@@ -222,9 +214,9 @@ module Anemone
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link, page) }
+      links.select { |link| visit_link?(link, page) }.map { |link| link.dup }
     end
-
+
     #
     # Returns +true+ if *link* has not been visited already,
     # and is not excluded by a skip_link pattern...
@@ -234,16 +226,16 @@ module Anemone
     #
     def visit_link?(link, from_page = nil)
       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
-
+
       if from_page && @opts[:depth_limit]
         too_deep = from_page.depth >= @opts[:depth_limit]
       else
         too_deep = false
       end
-
+
       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
     end
-
+
     #
     # Returns +true+ if *link* should not be visited because
     # its URL matches a skip_link pattern.
@@ -251,6 +243,6 @@ module Anemone
     def skip_link?(link)
       @skip_link_patterns.any? { |p| link.path =~ p }
     end
-
+
   end
 end
data/lib/anemone/http.rb CHANGED

@@ -12,62 +12,73 @@ module Anemone
     end
 
     #
-    #
+    # Fetch a single Page from the response of an HTTP request to *url*.
+    # Just gets the final destination page.
     #
-    def fetch_page(url,
+    def fetch_page(url, referer = nil, depth = nil)
+      fetch_pages(url, referer, depth).last
+    end
+
+    #
+    # Create new Pages from the response of an HTTP request to *url*,
+    # including redirects
+    #
+    def fetch_pages(url, referer = nil, depth = nil)
       begin
         url = URI(url) unless url.is_a?(URI)
-
-
-
-
-
-
-
-
-
-        if !url.eql?(location)
-          aka = location
+        pages = []
+        get(url, referer) do |response, code, location, redirect_to, response_time|
+          pages << Page.new(location, :body => response.body.dup,
+                                      :code => code,
+                                      :headers => response.to_hash,
+                                      :referer => referer,
+                                      :depth => depth,
+                                      :redirect_to => redirect_to,
+                                      :response_time => response_time)
         end
 
-        return
+        return pages
       rescue => e
         if verbose?
           puts e.inspect
           puts e.backtrace
-        end
-        return Page.new(url)
+        end
+        return [Page.new(url, :error => e)]
       end
     end
 
     private
 
     #
-    # Retrieve
-    #
-    #
+    # Retrieve HTTP responses for *url*, including redirects.
+    # Yields the response object, response code, and URI location
+    # for each response.
+    #
     def get(url, referer = nil)
       response, response_time = get_response(url, referer)
       code = Integer(response.code)
       loc = url
-
+      redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+      yield response, code, loc, redirect_to, response_time
+
       limit = redirect_limit
       while response.is_a?(Net::HTTPRedirection) and limit > 0
-        loc =
+        loc = redirect_to
         loc = url.merge(loc) if loc.relative?
         response, response_time = get_response(loc, referer)
+        code = Integer(response.code)
+        redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
+        yield response, code, loc, redirect_to, response_time
         limit -= 1
       end
-
-      return response, code, loc, response_time
     end
-
+
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
     #
     def get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-
+
       opts = {}
       opts['User-Agent'] = user_agent if user_agent
       opts['Referer'] = referer.to_s if referer
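With this change a redirect chain yields one Page per hop rather than a single aliased Page, and `fetch_page` keeps its old single-page contract by taking the last element. A usage sketch, assuming `Anemone::HTTP.new` accepts no arguments and that Page exposes readers for the attributes passed to `Page.new` above (neither is shown in this excerpt; the URL is a placeholder):

  http = Anemone::HTTP.new

  # one Page per response in the redirect chain, final destination last
  pages = http.fetch_pages("http://www.example.com/old-path")
  pages.each { |page| puts "#{page.url} #{page.code} -> #{page.redirect_to}" }

  # equivalent to pages.last
  page = http.fetch_page("http://www.example.com/old-path")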
@@ -78,7 +89,7 @@ module Anemone
         response = connection(url).get(full_path, opts)
         finish = Time.now()
         response_time = ((finish - start) * 1000).round
-        return response, response_time
+        return response, response_time
       rescue EOFError
         refresh_connection(url)
         retries += 1
@@ -93,7 +104,7 @@ module Anemone
         return conn
       end
 
-      refresh_connection
+      refresh_connection url
     end
 
     def refresh_connection(url)
@@ -102,7 +113,7 @@ module Anemone
         http.use_ssl = true
         http.verify_mode = OpenSSL::SSL::VERIFY_NONE
       end
-      @connections[url.host][url.port] = http.start
+      @connections[url.host][url.port] = http.start
     end
 
     def redirect_limit