anemone 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +16 -0
- data/README.rdoc +12 -2
- data/Rakefile +26 -0
- data/VERSION +1 -0
- data/lib/anemone/core.rb +38 -8
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +1 -1
- data/lib/anemone/page.rb +36 -3
- data/lib/anemone/storage.rb +17 -2
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +2 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/spec/core_spec.rb +13 -0
- data/spec/page_spec.rb +16 -1
- data/spec/page_store_spec.rb +26 -2
- data/spec/storage_spec.rb +63 -16
- metadata +68 -28
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
== 0.5.0 / 2010-09-01
|
2
|
+
|
3
|
+
* Major enhancements
|
4
|
+
|
5
|
+
* Added page storage engines for MongoDB and Redis
|
6
|
+
|
7
|
+
* Minor enhancements
|
8
|
+
|
9
|
+
* Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
|
10
|
+
* Added skip_query_strings option to skip links with query strings (Joost Baaij)
|
11
|
+
|
12
|
+
* Bug fixes
|
13
|
+
|
14
|
+
* Only consider status code 300..307 a redirect (Marc Seeger)
|
15
|
+
* Canonicalize redirect links (Marc Seeger)
|
16
|
+
|
1
17
|
== 0.4.0 / 2010-04-08
|
2
18
|
|
3
19
|
* Major enhancements
|
data/README.rdoc
CHANGED
@@ -8,7 +8,7 @@ See http://anemone.rubyforge.org for more information.
|
|
8
8
|
|
9
9
|
== Features
|
10
10
|
* Multi-threaded design for high performance
|
11
|
-
* Tracks 301 HTTP redirects
|
11
|
+
* Tracks 301 HTTP redirects
|
12
12
|
* Built-in BFS algorithm for determining page depth
|
13
13
|
* Allows exclusion of URLs based on regular expressions
|
14
14
|
* Choose the links to follow on each page with focus_crawl()
|
@@ -16,7 +16,7 @@ See http://anemone.rubyforge.org for more information.
|
|
16
16
|
* Records response time for each page
|
17
17
|
* CLI program can list all pages in a domain, calculate page depths, and more
|
18
18
|
* Obey robots.txt
|
19
|
-
* In-memory or persistent storage of pages during crawl, using TokyoCabinet or PStore
|
19
|
+
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
|
20
20
|
|
21
21
|
== Examples
|
22
22
|
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
|
@@ -24,3 +24,13 @@ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of sev
|
|
24
24
|
== Requirements
|
25
25
|
* nokogiri
|
26
26
|
* robots
|
27
|
+
|
28
|
+
== Development
|
29
|
+
To test and develop this gem, additional requirements are:
|
30
|
+
* rspec
|
31
|
+
* fakeweb
|
32
|
+
* tokyocabinet
|
33
|
+
* mongo
|
34
|
+
* redis
|
35
|
+
|
36
|
+
You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
require 'spec/rake/spectask'
|
5
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
6
|
+
spec.libs << 'lib' << 'spec'
|
7
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
8
|
+
end
|
9
|
+
|
10
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
11
|
+
spec.libs << 'lib' << 'spec'
|
12
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
13
|
+
spec.rcov = true
|
14
|
+
end
|
15
|
+
|
16
|
+
task :default => :spec
|
17
|
+
|
18
|
+
require 'rake/rdoctask'
|
19
|
+
Rake::RDocTask.new do |rdoc|
|
20
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
21
|
+
|
22
|
+
rdoc.rdoc_dir = 'rdoc'
|
23
|
+
rdoc.title = "anemone #{version}"
|
24
|
+
rdoc.rdoc_files.include('README*')
|
25
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
26
|
+
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.5.0
|
data/lib/anemone/core.rb
CHANGED
@@ -2,12 +2,14 @@ require 'thread'
|
|
2
2
|
require 'robots'
|
3
3
|
require 'anemone/tentacle'
|
4
4
|
require 'anemone/page'
|
5
|
+
require 'anemone/exceptions'
|
5
6
|
require 'anemone/page_store'
|
6
7
|
require 'anemone/storage'
|
8
|
+
require 'anemone/storage/base'
|
7
9
|
|
8
10
|
module Anemone
|
9
11
|
|
10
|
-
VERSION = '0.
|
12
|
+
VERSION = '0.5.0';
|
11
13
|
|
12
14
|
#
|
13
15
|
# Convenience method to start a crawl
|
@@ -45,7 +47,9 @@ module Anemone
|
|
45
47
|
# Hash of cookie name => value to send with HTTP requests
|
46
48
|
:cookies => nil,
|
47
49
|
# accept cookies from the server and send them back?
|
48
|
-
:accept_cookies => false
|
50
|
+
:accept_cookies => false,
|
51
|
+
# skip any link with a query string? e.g. http://foo.com/?u=user
|
52
|
+
:skip_query_strings => false
|
49
53
|
}
|
50
54
|
|
51
55
|
# Create setter methods for all options to be called from the crawl block
|
@@ -187,7 +191,8 @@ module Anemone
|
|
187
191
|
def process_options
|
188
192
|
@opts = DEFAULT_OPTS.merge @opts
|
189
193
|
@opts[:threads] = 1 if @opts[:delay] > 0
|
190
|
-
|
194
|
+
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
|
195
|
+
@pages = PageStore.new(storage)
|
191
196
|
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
192
197
|
|
193
198
|
freeze_options
|
@@ -241,15 +246,40 @@ module Anemone
|
|
241
246
|
# Returns +false+ otherwise.
|
242
247
|
#
|
243
248
|
def visit_link?(link, from_page = nil)
|
244
|
-
|
249
|
+
!@pages.has_page?(link) &&
|
250
|
+
!skip_link?(link) &&
|
251
|
+
!skip_query_string?(link) &&
|
252
|
+
allowed(link) &&
|
253
|
+
!too_deep?(from_page)
|
254
|
+
end
|
255
|
+
|
256
|
+
#
|
257
|
+
# Returns +true+ if we are obeying robots.txt and the link
|
258
|
+
# is granted access in it. Always returns +true+ when we are
|
259
|
+
# not obeying robots.txt.
|
260
|
+
#
|
261
|
+
def allowed(link)
|
262
|
+
@opts[:obey_robots_txt] ? @robots.allowed?(link) : true
|
263
|
+
end
|
245
264
|
|
265
|
+
#
|
266
|
+
# Returns +true+ if we are over the page depth limit.
|
267
|
+
# This only works when coming from a page and with the +depth_limit+ option set.
|
268
|
+
# When either of those is missing, this always returns +false+.
|
269
|
+
def too_deep?(from_page)
|
246
270
|
if from_page && @opts[:depth_limit]
|
247
|
-
|
271
|
+
from_page.depth >= @opts[:depth_limit]
|
248
272
|
else
|
249
|
-
|
273
|
+
false
|
250
274
|
end
|
251
|
-
|
252
|
-
|
275
|
+
end
|
276
|
+
|
277
|
+
#
|
278
|
+
# Returns +true+ if *link* should not be visited because
|
279
|
+
# it has a query string and +skip_query_strings+ is true.
|
280
|
+
#
|
281
|
+
def skip_query_string?(link)
|
282
|
+
@opts[:skip_query_strings] && link.query
|
253
283
|
end
|
254
284
|
|
255
285
|
#
|
data/lib/anemone/http.rb
CHANGED
@@ -91,7 +91,7 @@ module Anemone
|
|
91
91
|
|
92
92
|
response, response_time = get_response(loc, referer)
|
93
93
|
code = Integer(response.code)
|
94
|
-
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
|
94
|
+
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
95
95
|
yield response, code, loc, redirect_to, response_time
|
96
96
|
limit -= 1
|
97
97
|
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
data/lib/anemone/page.rb
CHANGED
@@ -59,8 +59,8 @@ module Anemone
|
|
59
59
|
@links = []
|
60
60
|
return @links if !doc
|
61
61
|
|
62
|
-
doc.
|
63
|
-
u = a
|
62
|
+
doc.search("//a[@href]").each do |a|
|
63
|
+
u = a['href']
|
64
64
|
next if u.nil? or u.empty?
|
65
65
|
abs = to_absolute(URI(u)) rescue next
|
66
66
|
@links << abs if in_domain?(abs)
|
@@ -120,7 +120,7 @@ module Anemone
|
|
120
120
|
# otherwise.
|
121
121
|
#
|
122
122
|
def redirect?
|
123
|
-
(300..
|
123
|
+
(300..307).include?(@code)
|
124
124
|
end
|
125
125
|
|
126
126
|
#
|
@@ -165,5 +165,38 @@ module Anemone
|
|
165
165
|
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
|
166
166
|
end
|
167
167
|
|
168
|
+
def to_hash
|
169
|
+
{'url' => @url.to_s,
|
170
|
+
'headers' => Marshal.dump(@headers),
|
171
|
+
'data' => Marshal.dump(@data),
|
172
|
+
'body' => @body,
|
173
|
+
'links' => links.map(&:to_s),
|
174
|
+
'code' => @code,
|
175
|
+
'visited' => @visited,
|
176
|
+
'depth' => @depth,
|
177
|
+
'referer' => @referer.to_s,
|
178
|
+
'redirect_to' => @redirect_to.to_s,
|
179
|
+
'response_time' => @response_time,
|
180
|
+
'fetched' => @fetched}
|
181
|
+
end
|
182
|
+
|
183
|
+
def self.from_hash(hash)
|
184
|
+
page = self.new(URI(hash['url']))
|
185
|
+
{'@headers' => Marshal.load(hash['headers']),
|
186
|
+
'@data' => Marshal.load(hash['data']),
|
187
|
+
'@body' => hash['body'],
|
188
|
+
'@links' => hash['links'].map { |link| URI(link) },
|
189
|
+
'@code' => hash['code'].to_i,
|
190
|
+
'@visited' => hash['visited'],
|
191
|
+
'@depth' => hash['depth'].to_i,
|
192
|
+
'@referer' => hash['referer'],
|
193
|
+
'@redirect_to' => URI(hash['redirect_to']),
|
194
|
+
'@response_time' => hash['response_time'].to_i,
|
195
|
+
'@fetched' => hash['fetched']
|
196
|
+
}.each do |var, value|
|
197
|
+
page.instance_variable_set(var, value)
|
198
|
+
end
|
199
|
+
page
|
200
|
+
end
|
168
201
|
end
|
169
202
|
end
|
data/lib/anemone/storage.rb
CHANGED
@@ -2,7 +2,10 @@ module Anemone
|
|
2
2
|
module Storage
|
3
3
|
|
4
4
|
def self.Hash(*args)
|
5
|
-
Hash.new(*args)
|
5
|
+
hash = Hash.new(*args)
|
6
|
+
# add close method for compatibility with Storage::Base
|
7
|
+
class << hash; def close; end; end
|
8
|
+
hash
|
6
9
|
end
|
7
10
|
|
8
11
|
def self.PStore(*args)
|
@@ -10,10 +13,22 @@ module Anemone
|
|
10
13
|
self::PStore.new(*args)
|
11
14
|
end
|
12
15
|
|
13
|
-
def self.TokyoCabinet(file)
|
16
|
+
def self.TokyoCabinet(file = 'anemone.tch')
|
14
17
|
require 'anemone/storage/tokyo_cabinet'
|
15
18
|
self::TokyoCabinet.new(file)
|
16
19
|
end
|
17
20
|
|
21
|
+
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
|
22
|
+
require 'anemone/storage/mongodb'
|
23
|
+
mongo_db ||= Mongo::Connection.new.db('anemone')
|
24
|
+
raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
|
25
|
+
self::MongoDB.new(mongo_db, collection_name)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.Redis(opts = {})
|
29
|
+
require 'anemone/storage/redis'
|
30
|
+
self::Redis.new(opts)
|
31
|
+
end
|
32
|
+
|
18
33
|
end
|
19
34
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'anemone/storage/exceptions'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
module Storage
|
5
|
+
class Base
|
6
|
+
|
7
|
+
def initialize(adapter)
|
8
|
+
@adap = adapter
|
9
|
+
|
10
|
+
# verify adapter conforms to this class's methods
|
11
|
+
methods.each do |method|
|
12
|
+
if !@adap.respond_to?(method.to_sym)
|
13
|
+
raise "Storage adapter must support method #{method}"
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def [](key)
|
19
|
+
@adap[key]
|
20
|
+
rescue
|
21
|
+
puts key
|
22
|
+
raise RetrievalError, $!
|
23
|
+
end
|
24
|
+
|
25
|
+
def []=(key, value)
|
26
|
+
@adap[key] = value
|
27
|
+
rescue
|
28
|
+
raise InsertionError, $!
|
29
|
+
end
|
30
|
+
|
31
|
+
def delete(key)
|
32
|
+
@adap.delete(key)
|
33
|
+
rescue
|
34
|
+
raise DeletionError, $!
|
35
|
+
end
|
36
|
+
|
37
|
+
def each
|
38
|
+
@adap.each { |k, v| yield k, v }
|
39
|
+
rescue
|
40
|
+
raise GenericError, $!
|
41
|
+
end
|
42
|
+
|
43
|
+
def merge!(hash)
|
44
|
+
@adap.merge!(hash)
|
45
|
+
rescue
|
46
|
+
raise GenericError, $!
|
47
|
+
end
|
48
|
+
|
49
|
+
def close
|
50
|
+
@adap.close
|
51
|
+
rescue
|
52
|
+
raise CloseError, $!
|
53
|
+
end
|
54
|
+
|
55
|
+
def size
|
56
|
+
@adap.size
|
57
|
+
rescue
|
58
|
+
raise GenericError, $!
|
59
|
+
end
|
60
|
+
|
61
|
+
def keys
|
62
|
+
@adap.keys
|
63
|
+
rescue
|
64
|
+
raise GenericError, $!
|
65
|
+
end
|
66
|
+
|
67
|
+
def has_key?(key)
|
68
|
+
@adap.has_key?(key)
|
69
|
+
rescue
|
70
|
+
raise GenericError, $!
|
71
|
+
end
|
72
|
+
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
begin
|
2
|
+
require 'mongo'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the mongo gem to use Anemone::Storage::MongoDB"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
|
9
|
+
module Storage
|
10
|
+
class MongoDB
|
11
|
+
|
12
|
+
BINARY_FIELDS = %w(body headers data)
|
13
|
+
|
14
|
+
def initialize(mongo_db, collection_name)
|
15
|
+
@db = mongo_db
|
16
|
+
@collection = @db[collection_name]
|
17
|
+
@collection.remove
|
18
|
+
@collection.create_index 'url'
|
19
|
+
end
|
20
|
+
|
21
|
+
def [](url)
|
22
|
+
if value = @collection.find_one('url' => url.to_s)
|
23
|
+
load_page(value)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def []=(url, page)
|
28
|
+
hash = page.to_hash
|
29
|
+
BINARY_FIELDS.each do |field|
|
30
|
+
hash[field] = BSON::Binary.new(hash[field]) unless hash[field].nil?
|
31
|
+
end
|
32
|
+
@collection.update(
|
33
|
+
{'url' => page.url.to_s},
|
34
|
+
hash,
|
35
|
+
:upsert => true
|
36
|
+
)
|
37
|
+
end
|
38
|
+
|
39
|
+
def delete(url)
|
40
|
+
page = self[url]
|
41
|
+
@collection.remove('url' => url.to_s)
|
42
|
+
page
|
43
|
+
end
|
44
|
+
|
45
|
+
def each
|
46
|
+
@collection.find do |cursor|
|
47
|
+
cursor.each do |doc|
|
48
|
+
page = load_page(doc)
|
49
|
+
yield page.url.to_s, page
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
def merge!(hash)
|
55
|
+
hash.each { |key, value| self[key] = value }
|
56
|
+
self
|
57
|
+
end
|
58
|
+
|
59
|
+
def size
|
60
|
+
@collection.count
|
61
|
+
end
|
62
|
+
|
63
|
+
def keys
|
64
|
+
keys = []
|
65
|
+
self.each { |k, v| keys << k.to_s }
|
66
|
+
keys
|
67
|
+
end
|
68
|
+
|
69
|
+
def has_key?(url)
|
70
|
+
!!@collection.find_one('url' => url.to_s)
|
71
|
+
end
|
72
|
+
|
73
|
+
def close
|
74
|
+
@db.connection.close
|
75
|
+
end
|
76
|
+
|
77
|
+
private
|
78
|
+
|
79
|
+
def load_page(hash)
|
80
|
+
BINARY_FIELDS.each do |field|
|
81
|
+
hash[field] = hash[field].to_s
|
82
|
+
end
|
83
|
+
Page.from_hash(hash)
|
84
|
+
end
|
85
|
+
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
module Storage
|
5
|
+
class Redis
|
6
|
+
|
7
|
+
MARSHAL_FIELDS = %w(links visited fetched)
|
8
|
+
|
9
|
+
def initialize(opts = {})
|
10
|
+
@redis = ::Redis.new(opts)
|
11
|
+
@key_prefix = opts[:key_prefix] || 'anemone'
|
12
|
+
keys.each { |key| delete(key) }
|
13
|
+
end
|
14
|
+
|
15
|
+
def [](key)
|
16
|
+
rkey = "#{@key_prefix}:pages:#{key.to_s}"
|
17
|
+
rget(rkey)
|
18
|
+
end
|
19
|
+
|
20
|
+
def []=(key, value)
|
21
|
+
rkey = "#{@key_prefix}:pages:#{key.to_s}"
|
22
|
+
hash = value.to_hash
|
23
|
+
MARSHAL_FIELDS.each do |field|
|
24
|
+
hash[field] = Marshal.dump(hash[field])
|
25
|
+
end
|
26
|
+
hash.each do |field, value|
|
27
|
+
@redis.hset(rkey, field, value)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def delete(key)
|
32
|
+
rkey = "#{@key_prefix}:pages:#{key.to_s}"
|
33
|
+
page = self[key]
|
34
|
+
@redis.del(rkey)
|
35
|
+
page
|
36
|
+
end
|
37
|
+
|
38
|
+
def each
|
39
|
+
rkeys = @redis.keys("#{@key_prefix}:pages:*")
|
40
|
+
rkeys.each do |rkey|
|
41
|
+
page = rget(rkey)
|
42
|
+
yield page.url.to_s, page
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def merge!(hash)
|
47
|
+
hash.each { |key, value| self[key] = value }
|
48
|
+
self
|
49
|
+
end
|
50
|
+
|
51
|
+
def size
|
52
|
+
@redis.keys("#{@key_prefix}:pages:*").size
|
53
|
+
end
|
54
|
+
|
55
|
+
def keys
|
56
|
+
keys = []
|
57
|
+
self.each { |k, v| keys << k.to_s }
|
58
|
+
keys
|
59
|
+
end
|
60
|
+
|
61
|
+
def has_key?(key)
|
62
|
+
rkey = "#{@key_prefix}:pages:#{key.to_s}"
|
63
|
+
@redis.exists(rkey)
|
64
|
+
end
|
65
|
+
|
66
|
+
def close
|
67
|
+
@redis.quit
|
68
|
+
end
|
69
|
+
|
70
|
+
private
|
71
|
+
|
72
|
+
def load_value(hash)
|
73
|
+
MARSHAL_FIELDS.each do |field|
|
74
|
+
unless hash[field].nil? || hash[field] == ''
|
75
|
+
hash[field] = Marshal.load(hash[field])
|
76
|
+
end
|
77
|
+
end
|
78
|
+
Page.from_hash(hash)
|
79
|
+
end
|
80
|
+
|
81
|
+
def rget(rkey)
|
82
|
+
hash = @redis.hgetall(rkey)
|
83
|
+
if !!hash
|
84
|
+
load_value(hash)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
data/spec/core_spec.rb
CHANGED
@@ -72,6 +72,19 @@ module Anemone
|
|
72
72
|
core.pages.keys.should_not include(pages[2].url)
|
73
73
|
end
|
74
74
|
|
75
|
+
it "should be able to skip links with query strings" do
|
76
|
+
pages = []
|
77
|
+
pages << FakePage.new('0', :links => ['1?foo=1', '2'])
|
78
|
+
pages << FakePage.new('1?foo=1')
|
79
|
+
pages << FakePage.new('2')
|
80
|
+
|
81
|
+
core = Anemone.crawl(pages[0].url, @opts) do |a|
|
82
|
+
a.skip_query_strings = true
|
83
|
+
end
|
84
|
+
|
85
|
+
core.should have(2).pages
|
86
|
+
end
|
87
|
+
|
75
88
|
it "should be able to skip links based on a RegEx" do
|
76
89
|
pages = []
|
77
90
|
pages << FakePage.new('0', :links => ['1', '2'])
|
data/spec/page_spec.rb
CHANGED
@@ -6,7 +6,7 @@ module Anemone
|
|
6
6
|
before(:each) do
|
7
7
|
FakeWeb.clean_registry
|
8
8
|
@http = Anemone::HTTP.new
|
9
|
-
@page = @http.fetch_page(FakePage.new('home').url)
|
9
|
+
@page = @http.fetch_page(FakePage.new('home', :links => '1').url)
|
10
10
|
end
|
11
11
|
|
12
12
|
it "should indicate whether it successfully fetched via HTTP" do
|
@@ -73,5 +73,20 @@ module Anemone
|
|
73
73
|
@page.cookies.should == []
|
74
74
|
end
|
75
75
|
|
76
|
+
it "should have a to_hash method that converts the page to a hash" do
|
77
|
+
hash = @page.to_hash
|
78
|
+
hash['url'].should == @page.url.to_s
|
79
|
+
hash['referer'].should == @page.referer.to_s
|
80
|
+
hash['links'].should == @page.links.map(&:to_s)
|
81
|
+
end
|
82
|
+
|
83
|
+
it "should have a from_hash method to convert from a hash to a Page" do
|
84
|
+
page = @page.dup
|
85
|
+
page.depth = 1
|
86
|
+
converted = Page.from_hash(page.to_hash)
|
87
|
+
converted.links.should == page.links
|
88
|
+
converted.depth.should == page.depth
|
89
|
+
end
|
90
|
+
|
76
91
|
end
|
77
92
|
end
|
data/spec/page_store_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
-
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
2
|
+
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
describe PageStore do
|
@@ -9,7 +9,7 @@ module Anemone
|
|
9
9
|
end
|
10
10
|
|
11
11
|
shared_examples_for "page storage" do
|
12
|
-
it "should be able to
|
12
|
+
it "should be able to compute single-source shortest paths in-place" do
|
13
13
|
pages = []
|
14
14
|
pages << FakePage.new('0', :links => ['1', '3'])
|
15
15
|
pages << FakePage.new('1', :redirect => '2')
|
@@ -124,5 +124,29 @@ module Anemone
|
|
124
124
|
end
|
125
125
|
end
|
126
126
|
|
127
|
+
describe Storage::MongoDB do
|
128
|
+
it_should_behave_like "page storage"
|
129
|
+
|
130
|
+
before(:each) do
|
131
|
+
@opts = {:storage => @store = Storage.MongoDB}
|
132
|
+
end
|
133
|
+
|
134
|
+
after(:each) do
|
135
|
+
@store.close
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
describe Storage::Redis do
|
140
|
+
it_should_behave_like "page storage"
|
141
|
+
|
142
|
+
before(:each) do
|
143
|
+
@opts = {:storage => @store = Storage.Redis}
|
144
|
+
end
|
145
|
+
|
146
|
+
after(:each) do
|
147
|
+
@store.close
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
127
151
|
end
|
128
152
|
end
|
data/spec/storage_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
-
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
2
|
+
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
describe Storage do
|
@@ -23,21 +23,41 @@ module Anemone
|
|
23
23
|
store.close
|
24
24
|
end
|
25
25
|
|
26
|
+
it "should have a class method to produce a MongoDB" do
|
27
|
+
Anemone::Storage.should respond_to(:MongoDB)
|
28
|
+
store = Anemone::Storage.MongoDB
|
29
|
+
store.should be_an_instance_of(Anemone::Storage::MongoDB)
|
30
|
+
store.close
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should have a class method to produce a Redis" do
|
34
|
+
Anemone::Storage.should respond_to(:Redis)
|
35
|
+
store = Anemone::Storage.Redis
|
36
|
+
store.should be_an_instance_of(Anemone::Storage::Redis)
|
37
|
+
store.close
|
38
|
+
end
|
39
|
+
|
26
40
|
module Storage
|
27
41
|
shared_examples_for "storage engine" do
|
42
|
+
|
43
|
+
before(:each) do
|
44
|
+
@url = SPEC_DOMAIN
|
45
|
+
@page = Page.new(URI(@url))
|
46
|
+
end
|
47
|
+
|
28
48
|
it "should implement [] and []=" do
|
29
49
|
@store.should respond_to(:[])
|
30
50
|
@store.should respond_to(:[]=)
|
31
51
|
|
32
|
-
@store[
|
33
|
-
@store[
|
52
|
+
@store[@url] = @page
|
53
|
+
@store[@url].url.should == URI(@url)
|
34
54
|
end
|
35
55
|
|
36
56
|
it "should implement has_key?" do
|
37
57
|
@store.should respond_to(:has_key?)
|
38
58
|
|
39
|
-
@store[
|
40
|
-
@store.has_key?(
|
59
|
+
@store[@url] = @page
|
60
|
+
@store.has_key?(@url).should == true
|
41
61
|
|
42
62
|
@store.has_key?('missing').should == false
|
43
63
|
end
|
@@ -45,37 +65,41 @@ module Anemone
|
|
45
65
|
it "should implement delete" do
|
46
66
|
@store.should respond_to(:delete)
|
47
67
|
|
48
|
-
@store[
|
49
|
-
@store.delete(
|
50
|
-
@store.has_key?(
|
68
|
+
@store[@url] = @page
|
69
|
+
@store.delete(@url).url.should == @page.url
|
70
|
+
@store.has_key?(@url).should == false
|
51
71
|
end
|
52
72
|
|
53
73
|
it "should implement keys" do
|
54
74
|
@store.should respond_to(:keys)
|
55
75
|
|
56
|
-
|
57
|
-
|
76
|
+
urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another']
|
77
|
+
pages = urls.map { |url| Page.new(URI(url)) }
|
78
|
+
urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] }
|
58
79
|
|
59
|
-
@store.keys.should ==
|
80
|
+
(@store.keys - urls).should == []
|
60
81
|
end
|
61
82
|
|
62
83
|
it "should implement each" do
|
63
84
|
@store.should respond_to(:each)
|
64
85
|
|
65
|
-
|
66
|
-
|
86
|
+
urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another']
|
87
|
+
pages = urls.map { |url| Page.new(URI(url)) }
|
88
|
+
urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] }
|
67
89
|
|
68
90
|
result = {}
|
69
91
|
@store.each { |k, v| result[k] = v }
|
70
|
-
result.
|
92
|
+
(result.keys - urls).should == []
|
93
|
+
(result.values.map { |page| page.url.to_s } - urls).should == []
|
71
94
|
end
|
72
95
|
|
73
96
|
it "should implement merge!, and return self" do
|
74
97
|
@store.should respond_to(:merge!)
|
75
98
|
|
76
|
-
hash = {
|
99
|
+
hash = {SPEC_DOMAIN => Page.new(URI(SPEC_DOMAIN)),
|
100
|
+
SPEC_DOMAIN + 'test' => Page.new(URI(SPEC_DOMAIN + 'test'))}
|
77
101
|
merged = @store.merge! hash
|
78
|
-
hash.each { |key, value| @store[key].should ==
|
102
|
+
hash.each { |key, value| @store[key].url.to_s.should == key }
|
79
103
|
|
80
104
|
merged.should === @store
|
81
105
|
end
|
@@ -115,7 +139,30 @@ module Anemone
|
|
115
139
|
it "should raise an error if supplied with a file extension other than .tch" do
|
116
140
|
lambda { Anemone::Storage.TokyoCabinet('test.tmp') }.should raise_error(RuntimeError)
|
117
141
|
end
|
142
|
+
end
|
118
143
|
|
144
|
+
describe Storage::MongoDB do
|
145
|
+
it_should_behave_like "storage engine"
|
146
|
+
|
147
|
+
before(:each) do
|
148
|
+
@store = Storage.MongoDB
|
149
|
+
end
|
150
|
+
|
151
|
+
after(:each) do
|
152
|
+
@store.close
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
describe Storage::Redis do
|
157
|
+
it_should_behave_like "storage engine"
|
158
|
+
|
159
|
+
before(:each) do
|
160
|
+
@store = Storage.Redis
|
161
|
+
end
|
162
|
+
|
163
|
+
after(:each) do
|
164
|
+
@store.close
|
165
|
+
end
|
119
166
|
end
|
120
167
|
|
121
168
|
end
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 11
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 5
|
9
|
+
- 0
|
10
|
+
version: 0.5.0
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Chris Kite
|
@@ -9,29 +15,41 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date: 2010-
|
18
|
+
date: 2010-09-01 00:00:00 -05:00
|
13
19
|
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: nokogiri
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 27
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 3
|
33
|
+
- 0
|
23
34
|
version: 1.3.0
|
24
|
-
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
25
37
|
- !ruby/object:Gem::Dependency
|
26
38
|
name: robots
|
27
|
-
|
28
|
-
|
29
|
-
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
30
42
|
requirements:
|
31
43
|
- - ">="
|
32
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
- 7
|
49
|
+
- 2
|
33
50
|
version: 0.7.2
|
34
|
-
|
51
|
+
type: :runtime
|
52
|
+
version_requirements: *id002
|
35
53
|
description:
|
36
54
|
email:
|
37
55
|
executables:
|
@@ -41,26 +59,42 @@ extensions: []
|
|
41
59
|
extra_rdoc_files:
|
42
60
|
- README.rdoc
|
43
61
|
files:
|
62
|
+
- VERSION
|
44
63
|
- LICENSE.txt
|
45
64
|
- CHANGELOG.rdoc
|
46
65
|
- README.rdoc
|
47
|
-
-
|
48
|
-
- lib/anemone.rb
|
49
|
-
- lib/anemone/cookie_store.rb
|
50
|
-
- lib/anemone/core.rb
|
51
|
-
- lib/anemone/http.rb
|
52
|
-
- lib/anemone/page.rb
|
53
|
-
- lib/anemone/page_store.rb
|
54
|
-
- lib/anemone/tentacle.rb
|
55
|
-
- lib/anemone/storage.rb
|
66
|
+
- Rakefile
|
56
67
|
- lib/anemone/storage/pstore.rb
|
68
|
+
- lib/anemone/storage/mongodb.rb
|
57
69
|
- lib/anemone/storage/tokyo_cabinet.rb
|
70
|
+
- lib/anemone/storage/exceptions.rb
|
71
|
+
- lib/anemone/storage/redis.rb
|
72
|
+
- lib/anemone/storage/base.rb
|
73
|
+
- lib/anemone/page_store.rb
|
74
|
+
- lib/anemone/storage.rb
|
75
|
+
- lib/anemone/tentacle.rb
|
76
|
+
- lib/anemone/http.rb
|
58
77
|
- lib/anemone/cli.rb
|
78
|
+
- lib/anemone/page.rb
|
79
|
+
- lib/anemone/exceptions.rb
|
80
|
+
- lib/anemone/core.rb
|
59
81
|
- lib/anemone/cli/url_list.rb
|
60
|
-
- lib/anemone/cli/
|
82
|
+
- lib/anemone/cli/serialize.rb
|
61
83
|
- lib/anemone/cli/count.rb
|
84
|
+
- lib/anemone/cli/cron.rb
|
62
85
|
- lib/anemone/cli/pagedepth.rb
|
63
|
-
- lib/anemone/
|
86
|
+
- lib/anemone/cookie_store.rb
|
87
|
+
- lib/anemone.rb
|
88
|
+
- spec/fakeweb_helper.rb
|
89
|
+
- spec/page_spec.rb
|
90
|
+
- spec/anemone_spec.rb
|
91
|
+
- spec/core_spec.rb
|
92
|
+
- spec/storage_spec.rb
|
93
|
+
- spec/page_store_spec.rb
|
94
|
+
- spec/cookie_store_spec.rb
|
95
|
+
- spec/http_spec.rb
|
96
|
+
- spec/spec_helper.rb
|
97
|
+
- bin/anemone
|
64
98
|
has_rdoc: true
|
65
99
|
homepage: http://anemone.rubyforge.org
|
66
100
|
licenses: []
|
@@ -74,31 +108,37 @@ rdoc_options:
|
|
74
108
|
require_paths:
|
75
109
|
- lib
|
76
110
|
required_ruby_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
77
112
|
requirements:
|
78
113
|
- - ">="
|
79
114
|
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
80
118
|
version: "0"
|
81
|
-
version:
|
82
119
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
none: false
|
83
121
|
requirements:
|
84
122
|
- - ">="
|
85
123
|
- !ruby/object:Gem::Version
|
124
|
+
hash: 3
|
125
|
+
segments:
|
126
|
+
- 0
|
86
127
|
version: "0"
|
87
|
-
version:
|
88
128
|
requirements: []
|
89
129
|
|
90
130
|
rubyforge_project: anemone
|
91
|
-
rubygems_version: 1.3.
|
131
|
+
rubygems_version: 1.3.7
|
92
132
|
signing_key:
|
93
133
|
specification_version: 3
|
94
134
|
summary: Anemone web-spider framework
|
95
135
|
test_files:
|
136
|
+
- spec/fakeweb_helper.rb
|
137
|
+
- spec/page_spec.rb
|
96
138
|
- spec/anemone_spec.rb
|
97
|
-
- spec/cookie_store_spec.rb
|
98
139
|
- spec/core_spec.rb
|
99
|
-
- spec/
|
140
|
+
- spec/storage_spec.rb
|
100
141
|
- spec/page_store_spec.rb
|
142
|
+
- spec/cookie_store_spec.rb
|
101
143
|
- spec/http_spec.rb
|
102
|
-
- spec/storage_spec.rb
|
103
|
-
- spec/fakeweb_helper.rb
|
104
144
|
- spec/spec_helper.rb
|