anemone 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +16 -0
- data/README.rdoc +12 -2
- data/Rakefile +26 -0
- data/VERSION +1 -0
- data/lib/anemone/core.rb +38 -8
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +1 -1
- data/lib/anemone/page.rb +36 -3
- data/lib/anemone/storage.rb +17 -2
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +2 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/spec/core_spec.rb +13 -0
- data/spec/page_spec.rb +16 -1
- data/spec/page_store_spec.rb +26 -2
- data/spec/storage_spec.rb +63 -16
- metadata +68 -28
data/CHANGELOG.rdoc
CHANGED
@@ -1,3 +1,19 @@
|
|
1
|
+
== 0.5.0 / 2010-09-01
|
2
|
+
|
3
|
+
* Major enhancements
|
4
|
+
|
5
|
+
* Added page storage engines for MongoDB and Redis
|
6
|
+
|
7
|
+
* Minor enhancements
|
8
|
+
|
9
|
+
* Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
|
10
|
+
* Added skip_query_strings option to skip links with query strings (Joost Baaij)
|
11
|
+
|
12
|
+
* Bug fixes
|
13
|
+
|
14
|
+
* Only consider status code 300..307 a redirect (Marc Seeger)
|
15
|
+
* Canonicalize redirect links (Marc Seeger)
|
16
|
+
|
1
17
|
== 0.4.0 / 2010-04-08
|
2
18
|
|
3
19
|
* Major enhancements
|
data/README.rdoc
CHANGED
@@ -8,7 +8,7 @@ See http://anemone.rubyforge.org for more information.
|
|
8
8
|
|
9
9
|
== Features
|
10
10
|
* Multi-threaded design for high performance
|
11
|
-
* Tracks 301 HTTP redirects
|
11
|
+
* Tracks 301 HTTP redirects
|
12
12
|
* Built-in BFS algorithm for determining page depth
|
13
13
|
* Allows exclusion of URLs based on regular expressions
|
14
14
|
* Choose the links to follow on each page with focus_crawl()
|
@@ -16,7 +16,7 @@ See http://anemone.rubyforge.org for more information.
|
|
16
16
|
* Records response time for each page
|
17
17
|
* CLI program can list all pages in a domain, calculate page depths, and more
|
18
18
|
* Obey robots.txt
|
19
|
-
* In-memory or persistent storage of pages during crawl, using TokyoCabinet or
|
19
|
+
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
|
20
20
|
|
21
21
|
== Examples
|
22
22
|
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
|
@@ -24,3 +24,13 @@ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of sev
|
|
24
24
|
== Requirements
|
25
25
|
* nokogiri
|
26
26
|
* robots
|
27
|
+
|
28
|
+
== Development
|
29
|
+
To test and develop this gem, additional requirements are:
|
30
|
+
* rspec
|
31
|
+
* fakeweb
|
32
|
+
* tokyocabinet
|
33
|
+
* mongo
|
34
|
+
* redis
|
35
|
+
|
36
|
+
You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Rake tasks for this gem: run the specs (default), run them under rcov,
# and build the RDoc documentation.
require 'rubygems'
require 'rake'

require 'spec/rake/spectask'

# `rake spec`: run every *_spec.rb file found under spec/.
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# `rake rcov`: same spec files, with the task's rcov flag switched on.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rake/rdoctask'

# `rake rdoc`: document README* and everything under lib/ into ./rdoc.
Rake::RDocTask.new do |rdoc|
  # The VERSION file ends with a newline; strip it so the title stays on one line.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "anemone #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.5.0
|
data/lib/anemone/core.rb
CHANGED
@@ -2,12 +2,14 @@ require 'thread'
|
|
2
2
|
require 'robots'
|
3
3
|
require 'anemone/tentacle'
|
4
4
|
require 'anemone/page'
|
5
|
+
require 'anemone/exceptions'
|
5
6
|
require 'anemone/page_store'
|
6
7
|
require 'anemone/storage'
|
8
|
+
require 'anemone/storage/base'
|
7
9
|
|
8
10
|
module Anemone
|
9
11
|
|
10
|
-
VERSION = '0.
|
12
|
+
VERSION = '0.5.0';
|
11
13
|
|
12
14
|
#
|
13
15
|
# Convenience method to start a crawl
|
@@ -45,7 +47,9 @@ module Anemone
|
|
45
47
|
# Hash of cookie name => value to send with HTTP requests
|
46
48
|
:cookies => nil,
|
47
49
|
# accept cookies from the server and send them back?
|
48
|
-
:accept_cookies => false
|
50
|
+
:accept_cookies => false,
|
51
|
+
# skip any link with a query string? e.g. http://foo.com/?u=user
|
52
|
+
:skip_query_strings => false
|
49
53
|
}
|
50
54
|
|
51
55
|
# Create setter methods for all options to be called from the crawl block
|
@@ -187,7 +191,8 @@ module Anemone
|
|
187
191
|
def process_options
|
188
192
|
@opts = DEFAULT_OPTS.merge @opts
|
189
193
|
@opts[:threads] = 1 if @opts[:delay] > 0
|
190
|
-
|
194
|
+
storage = Anemone::Storage::Base.new(@opts[:storage] || Anemone::Storage.Hash)
|
195
|
+
@pages = PageStore.new(storage)
|
191
196
|
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
192
197
|
|
193
198
|
freeze_options
|
@@ -241,15 +246,40 @@ module Anemone
|
|
241
246
|
# Returns +false+ otherwise.
|
242
247
|
#
|
243
248
|
def visit_link?(link, from_page = nil)
  # Reject early: already stored, matched by skip_link?, or carrying a
  # query string while skip_query_strings is in effect.
  return false if @pages.has_page?(link) || skip_link?(link) || skip_query_string?(link)

  # What remains must pass the robots.txt check and the depth limit.
  allowed(link) && !too_deep?(from_page)
end
|
255
|
+
|
256
|
+
#
|
257
|
+
# Returns +true+ if we are obeying robots.txt and the link
|
258
|
+
# is granted access in it. Always returns +true+ when we are
|
259
|
+
# not obeying robots.txt.
|
260
|
+
#
|
261
|
+
def allowed(link)
  # Without the obey_robots_txt option every link is allowed.
  return true unless @opts[:obey_robots_txt]

  @robots.allowed?(link)
end
|
245
264
|
|
265
|
+
#
|
266
|
+
# Returns +true+ if we are over the page depth limit.
|
267
|
+
# This only works when coming from a page and with the +depth_limit+ option set.
|
268
|
+
# When neither is the case, will always return +false+.
|
269
|
+
def too_deep?(from_page)
  limit = @opts[:depth_limit]
  # No originating page or no configured limit: depth never blocks the link.
  return false unless from_page && limit

  from_page.depth >= limit
end
|
276
|
+
|
277
|
+
#
|
278
|
+
# Returns +true+ if *link* should not be visited because
|
279
|
+
# it has a query string and +skip_query_strings+ is true.
|
280
|
+
#
|
281
|
+
def skip_query_string?(link)
  # Truthy only when the option is on AND the link has a query component;
  # the value returned is the query string itself (or the falsy operand).
  skipping = @opts[:skip_query_strings]
  skipping && link.query
end
|
254
284
|
|
255
285
|
#
|
data/lib/anemone/http.rb
CHANGED
@@ -91,7 +91,7 @@ module Anemone
|
|
91
91
|
|
92
92
|
response, response_time = get_response(loc, referer)
|
93
93
|
code = Integer(response.code)
|
94
|
-
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']) : nil
|
94
|
+
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
95
95
|
yield response, code, loc, redirect_to, response_time
|
96
96
|
limit -= 1
|
97
97
|
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
data/lib/anemone/page.rb
CHANGED
@@ -59,8 +59,8 @@ module Anemone
|
|
59
59
|
@links = []
|
60
60
|
return @links if !doc
|
61
61
|
|
62
|
-
doc.
|
63
|
-
u = a
|
62
|
+
doc.search("//a[@href]").each do |a|
|
63
|
+
u = a['href']
|
64
64
|
next if u.nil? or u.empty?
|
65
65
|
abs = to_absolute(URI(u)) rescue next
|
66
66
|
@links << abs if in_domain?(abs)
|
@@ -120,7 +120,7 @@ module Anemone
|
|
120
120
|
# otherwise.
|
121
121
|
#
|
122
122
|
def redirect?
  # Only status codes in the 300..307 range are treated as redirects.
  redirect_codes = 300..307
  redirect_codes.include?(@code)
end
|
125
125
|
|
126
126
|
#
|
@@ -165,5 +165,38 @@ module Anemone
|
|
165
165
|
@url, @headers, @data, @body, @links, @code, @visited, @depth, @referer, @redirect_to, @response_time, @fetched = ary
|
166
166
|
end
|
167
167
|
|
168
|
+
def to_hash
  # Flatten the page into string-keyed fields. Headers and data are
  # marshalled; URI-valued fields become strings (nil becomes "").
  serialized = {}
  serialized['url']           = @url.to_s
  serialized['headers']       = Marshal.dump(@headers)
  serialized['data']          = Marshal.dump(@data)
  serialized['body']          = @body
  serialized['links']         = links.map(&:to_s)
  serialized['code']          = @code
  serialized['visited']       = @visited
  serialized['depth']         = @depth
  serialized['referer']       = @referer.to_s
  serialized['redirect_to']   = @redirect_to.to_s
  serialized['response_time'] = @response_time
  serialized['fetched']       = @fetched
  serialized
end
|
182
|
+
|
183
|
+
def self.from_hash(hash)
  # Rebuild a page from the string-keyed hash produced by to_hash.
  #
  # NOTE(security): 'headers' and 'data' go through Marshal.load, so this
  # must only be given hashes this library wrote itself; never feed it
  # data from an untrusted source.
  page = self.new(URI(hash['url']))

  # to_hash stores a missing redirect as "" (nil.to_s). Map blank values
  # back to nil instead of building an empty, truthy URI.
  redirect = hash['redirect_to']
  redirect_to = redirect.to_s.empty? ? nil : URI(redirect)

  {'@headers' => Marshal.load(hash['headers']),
   '@data' => Marshal.load(hash['data']),
   '@body' => hash['body'],
   '@links' => hash['links'].map { |link| URI(link) },
   '@code' => hash['code'].to_i,
   '@visited' => hash['visited'],
   '@depth' => hash['depth'].to_i,
   '@referer' => hash['referer'],
   '@redirect_to' => redirect_to,
   '@response_time' => hash['response_time'].to_i,
   '@fetched' => hash['fetched']
  }.each do |var, value|
    page.instance_variable_set(var, value)
  end
  page
end
|
168
201
|
end
|
169
202
|
end
|
data/lib/anemone/storage.rb
CHANGED
@@ -2,7 +2,10 @@ module Anemone
|
|
2
2
|
module Storage
|
3
3
|
|
4
4
|
def self.Hash(*args)
  store = Hash.new(*args)
  # Storage::Base requires adapters to respond to close; give this one
  # hash instance a no-op close so it qualifies.
  def store.close; end
  store
end
|
7
10
|
|
8
11
|
def self.PStore(*args)
|
@@ -10,10 +13,22 @@ module Anemone
|
|
10
13
|
self::PStore.new(*args)
|
11
14
|
end
|
12
15
|
|
13
|
-
def self.TokyoCabinet(file = 'anemone.tch')
  # The adapter file is required here, on first use, rather than at load time.
  require 'anemone/storage/tokyo_cabinet'
  adapter_class = self::TokyoCabinet
  adapter_class.new(file)
end
|
17
20
|
|
21
|
+
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
  # The adapter file is required here, on first use, rather than at load time.
  require 'anemone/storage/mongodb'

  # Default: the 'anemone' database on a Mongo::Connection built with no arguments.
  mongo_db = Mongo::Connection.new.db('anemone') unless mongo_db

  unless mongo_db.is_a?(Mongo::DB)
    raise "First argument must be an instance of Mongo::DB"
  end

  self::MongoDB.new(mongo_db, collection_name)
end
|
27
|
+
|
28
|
+
def self.Redis(opts = {})
  # The adapter file is required here, on first use, rather than at load time.
  require 'anemone/storage/redis'
  adapter_class = self::Redis
  adapter_class.new(opts)
end
|
32
|
+
|
18
33
|
end
|
19
34
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'anemone/storage/exceptions'
|
2
|
+
|
3
|
+
module Anemone
  module Storage
    #
    # Uniform front for a page-storage adapter. Every call is forwarded to
    # the wrapped adapter, and any StandardError it raises is re-raised as
    # one of the storage error classes (RetrievalError, InsertionError,
    # DeletionError, CloseError, GenericError).
    #
    class Base

      #
      # Wraps *adapter*. Raises a RuntimeError naming the missing method when
      # the adapter does not respond to part of the storage interface.
      #
      def initialize(adapter)
        @adap = adapter

        # verify adapter conforms to this class's storage interface only;
        # methods inherited from Object are not part of the contract
        storage_interface = self.class.public_instance_methods - Object.public_instance_methods
        storage_interface.each do |method|
          unless @adap.respond_to?(method.to_sym)
            raise "Storage adapter must support method #{method}"
          end
        end
      end

      # Fetches the value stored under *key*; adapter failures become RetrievalError.
      def [](key)
        @adap[key]
      rescue => e
        raise RetrievalError, e
      end

      # Stores *value* under *key*; adapter failures become InsertionError.
      def []=(key, value)
        @adap[key] = value
      rescue => e
        raise InsertionError, e
      end

      # Deletes *key* via the adapter; adapter failures become DeletionError.
      def delete(key)
        @adap.delete(key)
      rescue => e
        raise DeletionError, e
      end

      # Yields each key/value pair. Errors raised by the adapter or by the
      # caller's block both surface as GenericError.
      def each
        @adap.each { |k, v| yield k, v }
      rescue => e
        raise GenericError, e
      end

      # Forwards merge! to the adapter; failures become GenericError.
      def merge!(hash)
        @adap.merge!(hash)
      rescue => e
        raise GenericError, e
      end

      # Closes the adapter; failures become CloseError.
      def close
        @adap.close
      rescue => e
        raise CloseError, e
      end

      # Returns the adapter's size; failures become GenericError.
      def size
        @adap.size
      rescue => e
        raise GenericError, e
      end

      # Returns the adapter's keys; failures become GenericError.
      def keys
        @adap.keys
      rescue => e
        raise GenericError, e
      end

      # Asks the adapter whether *key* is present; failures become GenericError.
      def has_key?(key)
        @adap.has_key?(key)
      rescue => e
        raise GenericError, e
      end

    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
begin
|
2
|
+
require 'mongo'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the mongo gem to use Anemone::Storage::MongoDB"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
  module Storage
    #
    # Page-storage adapter that keeps one document per page in a MongoDB
    # collection, looked up by the page's 'url' field.
    #
    class MongoDB

      # Page hash fields wrapped in BSON::Binary on write and turned back
      # into strings on read.
      BINARY_FIELDS = %w(body headers data)

      # Binds to collection_name in mongo_db, calls remove on it with no
      # selector (presumably emptying it -- confirm against the driver docs)
      # and creates an index on 'url'.
      def initialize(mongo_db, collection_name)
        @db = mongo_db
        @collection = @db[collection_name]
        @collection.remove
        @collection.create_index 'url'
      end

      # Returns the page stored for url, or nil when no document matches.
      def [](url)
        doc = @collection.find_one('url' => url.to_s)
        load_page(doc) if doc
      end

      # Upserts page under its own URL (page.url); the url argument is not
      # consulted.
      def []=(url, page)
        doc = page.to_hash
        BINARY_FIELDS.each do |field|
          next if doc[field].nil?
          doc[field] = BSON::Binary.new(doc[field])
        end
        selector = {'url' => page.url.to_s}
        @collection.update(selector, doc, :upsert => true)
      end

      # Removes the document for url and returns the page that was stored
      # there (nil when there was none).
      def delete(url)
        removed = self[url]
        @collection.remove('url' => url.to_s)
        removed
      end

      # Yields (url string, page) for every document in the collection.
      def each
        @collection.find do |cursor|
          cursor.each do |doc|
            loaded = load_page(doc)
            yield loaded.url.to_s, loaded
          end
        end
      end

      # Stores every key/value pair of hash, then returns self.
      def merge!(hash)
        hash.each do |key, value|
          self[key] = value
        end
        self
      end

      # Number of documents, as reported by the collection's count.
      def size
        @collection.count
      end

      # URL strings of all stored pages, gathered by iterating with each.
      def keys
        urls = []
        each { |url, _page| urls << url.to_s }
        urls
      end

      # true when a document with this url exists, false otherwise.
      def has_key?(url)
        @collection.find_one('url' => url.to_s) ? true : false
      end

      # Closes the connection behind the database handle.
      def close
        @db.connection.close
      end

      private

      # Converts the binary fields of doc to strings in place, then builds
      # a Page from it.
      def load_page(doc)
        BINARY_FIELDS.each { |field| doc[field] = doc[field].to_s }
        Page.from_hash(doc)
      end

    end
  end
end
|
89
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
module Anemone
  module Storage
    #
    # Page-storage adapter that keeps each page as a Redis hash stored under
    # "<key_prefix>:pages:<key>".
    #
    class Redis

      # Page hash fields that are marshalled before being written to Redis
      # and unmarshalled when read back.
      MARSHAL_FIELDS = %w(links visited fetched)

      # Connects with ::Redis.new(opts) and removes every page hash already
      # stored under the key prefix (opts[:key_prefix], default 'anemone').
      def initialize(opts = {})
        @redis = ::Redis.new(opts)
        @key_prefix = opts[:key_prefix] || 'anemone'
        # Delete the raw keys directly: no need to deserialize each page
        # (which could fail on stale data) just to throw it away.
        @redis.keys(page_key('*')).each { |rkey| @redis.del(rkey) }
      end

      # Returns the page stored under key, or nil when there is none.
      def [](key)
        rget(page_key(key))
      end

      # Stores value (a page) under key, one hash field per page attribute.
      def []=(key, value)
        rkey = page_key(key)
        hash = value.to_hash
        MARSHAL_FIELDS.each do |field|
          hash[field] = Marshal.dump(hash[field])
        end
        hash.each do |field, field_value|
          @redis.hset(rkey, field, field_value)
        end
      end

      # Removes key and returns the page that was stored there (nil when
      # there was none).
      def delete(key)
        page = self[key]
        @redis.del(page_key(key))
        page
      end

      # Yields (url string, page) for every stored page.
      def each
        @redis.keys(page_key('*')).each do |rkey|
          page = rget(rkey)
          # the key may have been removed after it was listed
          next if page.nil?
          yield page.url.to_s, page
        end
      end

      # Stores every key/value pair of hash, then returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      # Number of keys currently matching the page-key pattern.
      def size
        @redis.keys(page_key('*')).size
      end

      # URL strings of all stored pages, gathered by iterating with each.
      def keys
        urls = []
        each { |url, _page| urls << url.to_s }
        urls
      end

      # Whether a page is stored under key.
      def has_key?(key)
        rkey = page_key(key)
        # Newer redis-rb releases provide a boolean exists?; on those,
        # plain exists may return an Integer count, and 0 is truthy in Ruby.
        if @redis.respond_to?(:exists?)
          @redis.exists?(rkey)
        else
          @redis.exists(rkey)
        end
      end

      # Ends the Redis connection with QUIT.
      def close
        @redis.quit
      end

      private

      # Full Redis key for a page key (also used with '*' to build the
      # pattern that matches every page key).
      def page_key(key)
        "#{@key_prefix}:pages:#{key.to_s}"
      end

      # Unmarshals the marshalled fields in place and builds a Page.
      # NOTE(security): Marshal.load is only safe on data this adapter
      # wrote itself; do not share the key prefix with untrusted writers.
      def load_value(hash)
        MARSHAL_FIELDS.each do |field|
          unless hash[field].nil? || hash[field] == ''
            hash[field] = Marshal.load(hash[field])
          end
        end
        Page.from_hash(hash)
      end

      # Loads the page stored at the raw Redis key rkey. hgetall yields an
      # empty (truthy) hash for a key that does not exist, so emptiness,
      # not truthiness, decides whether there is a page to load.
      def rget(rkey)
        hash = @redis.hgetall(rkey)
        return nil if hash.nil? || hash.empty?

        load_value(hash)
      end

    end
  end
end
|
data/spec/core_spec.rb
CHANGED
@@ -72,6 +72,19 @@ module Anemone
|
|
72
72
|
core.pages.keys.should_not include(pages[2].url)
|
73
73
|
end
|
74
74
|
|
75
|
+
it "should be able to skip links with query strings" do
|
76
|
+
pages = []
|
77
|
+
pages << FakePage.new('0', :links => ['1?foo=1', '2'])
|
78
|
+
pages << FakePage.new('1?foo=1')
|
79
|
+
pages << FakePage.new('2')
|
80
|
+
|
81
|
+
core = Anemone.crawl(pages[0].url, @opts) do |a|
|
82
|
+
a.skip_query_strings = true
|
83
|
+
end
|
84
|
+
|
85
|
+
core.should have(2).pages
|
86
|
+
end
|
87
|
+
|
75
88
|
it "should be able to skip links based on a RegEx" do
|
76
89
|
pages = []
|
77
90
|
pages << FakePage.new('0', :links => ['1', '2'])
|
data/spec/page_spec.rb
CHANGED
@@ -6,7 +6,7 @@ module Anemone
|
|
6
6
|
before(:each) do
|
7
7
|
FakeWeb.clean_registry
|
8
8
|
@http = Anemone::HTTP.new
|
9
|
-
@page = @http.fetch_page(FakePage.new('home').url)
|
9
|
+
@page = @http.fetch_page(FakePage.new('home', :links => '1').url)
|
10
10
|
end
|
11
11
|
|
12
12
|
it "should indicate whether it successfully fetched via HTTP" do
|
@@ -73,5 +73,20 @@ module Anemone
|
|
73
73
|
@page.cookies.should == []
|
74
74
|
end
|
75
75
|
|
76
|
+
it "should have a to_hash method that converts the page to a hash" do
|
77
|
+
hash = @page.to_hash
|
78
|
+
hash['url'].should == @page.url.to_s
|
79
|
+
hash['referer'].should == @page.referer.to_s
|
80
|
+
hash['links'].should == @page.links.map(&:to_s)
|
81
|
+
end
|
82
|
+
|
83
|
+
it "should have a from_hash method to convert from a hash to a Page" do
|
84
|
+
page = @page.dup
|
85
|
+
page.depth = 1
|
86
|
+
converted = Page.from_hash(page.to_hash)
|
87
|
+
converted.links.should == page.links
|
88
|
+
converted.depth.should == page.depth
|
89
|
+
end
|
90
|
+
|
76
91
|
end
|
77
92
|
end
|
data/spec/page_store_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
-
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
2
|
+
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
describe PageStore do
|
@@ -9,7 +9,7 @@ module Anemone
|
|
9
9
|
end
|
10
10
|
|
11
11
|
shared_examples_for "page storage" do
|
12
|
-
it "should be able to
|
12
|
+
it "should be able to compute single-source shortest paths in-place" do
|
13
13
|
pages = []
|
14
14
|
pages << FakePage.new('0', :links => ['1', '3'])
|
15
15
|
pages << FakePage.new('1', :redirect => '2')
|
@@ -124,5 +124,29 @@ module Anemone
|
|
124
124
|
end
|
125
125
|
end
|
126
126
|
|
127
|
+
describe Storage::MongoDB do
|
128
|
+
it_should_behave_like "page storage"
|
129
|
+
|
130
|
+
before(:each) do
|
131
|
+
@opts = {:storage => @store = Storage.MongoDB}
|
132
|
+
end
|
133
|
+
|
134
|
+
after(:each) do
|
135
|
+
@store.close
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
describe Storage::Redis do
|
140
|
+
it_should_behave_like "page storage"
|
141
|
+
|
142
|
+
before(:each) do
|
143
|
+
@opts = {:storage => @store = Storage.Redis}
|
144
|
+
end
|
145
|
+
|
146
|
+
after(:each) do
|
147
|
+
@store.close
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
127
151
|
end
|
128
152
|
end
|
data/spec/storage_spec.rb
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
-
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
2
|
+
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
describe Storage do
|
@@ -23,21 +23,41 @@ module Anemone
|
|
23
23
|
store.close
|
24
24
|
end
|
25
25
|
|
26
|
+
it "should have a class method to produce a MongoDB" do
|
27
|
+
Anemone::Storage.should respond_to(:MongoDB)
|
28
|
+
store = Anemone::Storage.MongoDB
|
29
|
+
store.should be_an_instance_of(Anemone::Storage::MongoDB)
|
30
|
+
store.close
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should have a class method to produce a Redis" do
|
34
|
+
Anemone::Storage.should respond_to(:Redis)
|
35
|
+
store = Anemone::Storage.Redis
|
36
|
+
store.should be_an_instance_of(Anemone::Storage::Redis)
|
37
|
+
store.close
|
38
|
+
end
|
39
|
+
|
26
40
|
module Storage
|
27
41
|
shared_examples_for "storage engine" do
|
42
|
+
|
43
|
+
before(:each) do
|
44
|
+
@url = SPEC_DOMAIN
|
45
|
+
@page = Page.new(URI(@url))
|
46
|
+
end
|
47
|
+
|
28
48
|
it "should implement [] and []=" do
|
29
49
|
@store.should respond_to(:[])
|
30
50
|
@store.should respond_to(:[]=)
|
31
51
|
|
32
|
-
@store[
|
33
|
-
@store[
|
52
|
+
@store[@url] = @page
|
53
|
+
@store[@url].url.should == URI(@url)
|
34
54
|
end
|
35
55
|
|
36
56
|
it "should implement has_key?" do
|
37
57
|
@store.should respond_to(:has_key?)
|
38
58
|
|
39
|
-
@store[
|
40
|
-
@store.has_key?(
|
59
|
+
@store[@url] = @page
|
60
|
+
@store.has_key?(@url).should == true
|
41
61
|
|
42
62
|
@store.has_key?('missing').should == false
|
43
63
|
end
|
@@ -45,37 +65,41 @@ module Anemone
|
|
45
65
|
it "should implement delete" do
|
46
66
|
@store.should respond_to(:delete)
|
47
67
|
|
48
|
-
@store[
|
49
|
-
@store.delete(
|
50
|
-
@store.has_key?(
|
68
|
+
@store[@url] = @page
|
69
|
+
@store.delete(@url).url.should == @page.url
|
70
|
+
@store.has_key?(@url).should == false
|
51
71
|
end
|
52
72
|
|
53
73
|
it "should implement keys" do
|
54
74
|
@store.should respond_to(:keys)
|
55
75
|
|
56
|
-
|
57
|
-
|
76
|
+
urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another']
|
77
|
+
pages = urls.map { |url| Page.new(URI(url)) }
|
78
|
+
urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] }
|
58
79
|
|
59
|
-
@store.keys.should ==
|
80
|
+
(@store.keys - urls).should == []
|
60
81
|
end
|
61
82
|
|
62
83
|
it "should implement each" do
|
63
84
|
@store.should respond_to(:each)
|
64
85
|
|
65
|
-
|
66
|
-
|
86
|
+
urls = [SPEC_DOMAIN, SPEC_DOMAIN + 'test', SPEC_DOMAIN + 'another']
|
87
|
+
pages = urls.map { |url| Page.new(URI(url)) }
|
88
|
+
urls.zip(pages).each { |arr| @store[arr[0]] = arr[1] }
|
67
89
|
|
68
90
|
result = {}
|
69
91
|
@store.each { |k, v| result[k] = v }
|
70
|
-
result.
|
92
|
+
(result.keys - urls).should == []
|
93
|
+
(result.values.map { |page| page.url.to_s } - urls).should == []
|
71
94
|
end
|
72
95
|
|
73
96
|
it "should implement merge!, and return self" do
|
74
97
|
@store.should respond_to(:merge!)
|
75
98
|
|
76
|
-
hash = {
|
99
|
+
hash = {SPEC_DOMAIN => Page.new(URI(SPEC_DOMAIN)),
|
100
|
+
SPEC_DOMAIN + 'test' => Page.new(URI(SPEC_DOMAIN + 'test'))}
|
77
101
|
merged = @store.merge! hash
|
78
|
-
hash.each { |key, value| @store[key].should ==
|
102
|
+
hash.each { |key, value| @store[key].url.to_s.should == key }
|
79
103
|
|
80
104
|
merged.should === @store
|
81
105
|
end
|
@@ -115,7 +139,30 @@ module Anemone
|
|
115
139
|
it "should raise an error if supplied with a file extension other than .tch" do
|
116
140
|
lambda { Anemone::Storage.TokyoCabinet('test.tmp') }.should raise_error(RuntimeError)
|
117
141
|
end
|
142
|
+
end
|
118
143
|
|
144
|
+
describe Storage::MongoDB do
|
145
|
+
it_should_behave_like "storage engine"
|
146
|
+
|
147
|
+
before(:each) do
|
148
|
+
@store = Storage.MongoDB
|
149
|
+
end
|
150
|
+
|
151
|
+
after(:each) do
|
152
|
+
@store.close
|
153
|
+
end
|
154
|
+
end
|
155
|
+
|
156
|
+
describe Storage::Redis do
|
157
|
+
it_should_behave_like "storage engine"
|
158
|
+
|
159
|
+
before(:each) do
|
160
|
+
@store = Storage.Redis
|
161
|
+
end
|
162
|
+
|
163
|
+
after(:each) do
|
164
|
+
@store.close
|
165
|
+
end
|
119
166
|
end
|
120
167
|
|
121
168
|
end
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 11
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 5
|
9
|
+
- 0
|
10
|
+
version: 0.5.0
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Chris Kite
|
@@ -9,29 +15,41 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date: 2010-
|
18
|
+
date: 2010-09-01 00:00:00 -05:00
|
13
19
|
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: nokogiri
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 27
|
30
|
+
segments:
|
31
|
+
- 1
|
32
|
+
- 3
|
33
|
+
- 0
|
23
34
|
version: 1.3.0
|
24
|
-
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: *id001
|
25
37
|
- !ruby/object:Gem::Dependency
|
26
38
|
name: robots
|
27
|
-
|
28
|
-
|
29
|
-
|
39
|
+
prerelease: false
|
40
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
30
42
|
requirements:
|
31
43
|
- - ">="
|
32
44
|
- !ruby/object:Gem::Version
|
45
|
+
hash: 7
|
46
|
+
segments:
|
47
|
+
- 0
|
48
|
+
- 7
|
49
|
+
- 2
|
33
50
|
version: 0.7.2
|
34
|
-
|
51
|
+
type: :runtime
|
52
|
+
version_requirements: *id002
|
35
53
|
description:
|
36
54
|
email:
|
37
55
|
executables:
|
@@ -41,26 +59,42 @@ extensions: []
|
|
41
59
|
extra_rdoc_files:
|
42
60
|
- README.rdoc
|
43
61
|
files:
|
62
|
+
- VERSION
|
44
63
|
- LICENSE.txt
|
45
64
|
- CHANGELOG.rdoc
|
46
65
|
- README.rdoc
|
47
|
-
-
|
48
|
-
- lib/anemone.rb
|
49
|
-
- lib/anemone/cookie_store.rb
|
50
|
-
- lib/anemone/core.rb
|
51
|
-
- lib/anemone/http.rb
|
52
|
-
- lib/anemone/page.rb
|
53
|
-
- lib/anemone/page_store.rb
|
54
|
-
- lib/anemone/tentacle.rb
|
55
|
-
- lib/anemone/storage.rb
|
66
|
+
- Rakefile
|
56
67
|
- lib/anemone/storage/pstore.rb
|
68
|
+
- lib/anemone/storage/mongodb.rb
|
57
69
|
- lib/anemone/storage/tokyo_cabinet.rb
|
70
|
+
- lib/anemone/storage/exceptions.rb
|
71
|
+
- lib/anemone/storage/redis.rb
|
72
|
+
- lib/anemone/storage/base.rb
|
73
|
+
- lib/anemone/page_store.rb
|
74
|
+
- lib/anemone/storage.rb
|
75
|
+
- lib/anemone/tentacle.rb
|
76
|
+
- lib/anemone/http.rb
|
58
77
|
- lib/anemone/cli.rb
|
78
|
+
- lib/anemone/page.rb
|
79
|
+
- lib/anemone/exceptions.rb
|
80
|
+
- lib/anemone/core.rb
|
59
81
|
- lib/anemone/cli/url_list.rb
|
60
|
-
- lib/anemone/cli/
|
82
|
+
- lib/anemone/cli/serialize.rb
|
61
83
|
- lib/anemone/cli/count.rb
|
84
|
+
- lib/anemone/cli/cron.rb
|
62
85
|
- lib/anemone/cli/pagedepth.rb
|
63
|
-
- lib/anemone/
|
86
|
+
- lib/anemone/cookie_store.rb
|
87
|
+
- lib/anemone.rb
|
88
|
+
- spec/fakeweb_helper.rb
|
89
|
+
- spec/page_spec.rb
|
90
|
+
- spec/anemone_spec.rb
|
91
|
+
- spec/core_spec.rb
|
92
|
+
- spec/storage_spec.rb
|
93
|
+
- spec/page_store_spec.rb
|
94
|
+
- spec/cookie_store_spec.rb
|
95
|
+
- spec/http_spec.rb
|
96
|
+
- spec/spec_helper.rb
|
97
|
+
- bin/anemone
|
64
98
|
has_rdoc: true
|
65
99
|
homepage: http://anemone.rubyforge.org
|
66
100
|
licenses: []
|
@@ -74,31 +108,37 @@ rdoc_options:
|
|
74
108
|
require_paths:
|
75
109
|
- lib
|
76
110
|
required_ruby_version: !ruby/object:Gem::Requirement
|
111
|
+
none: false
|
77
112
|
requirements:
|
78
113
|
- - ">="
|
79
114
|
- !ruby/object:Gem::Version
|
115
|
+
hash: 3
|
116
|
+
segments:
|
117
|
+
- 0
|
80
118
|
version: "0"
|
81
|
-
version:
|
82
119
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
120
|
+
none: false
|
83
121
|
requirements:
|
84
122
|
- - ">="
|
85
123
|
- !ruby/object:Gem::Version
|
124
|
+
hash: 3
|
125
|
+
segments:
|
126
|
+
- 0
|
86
127
|
version: "0"
|
87
|
-
version:
|
88
128
|
requirements: []
|
89
129
|
|
90
130
|
rubyforge_project: anemone
|
91
|
-
rubygems_version: 1.3.
|
131
|
+
rubygems_version: 1.3.7
|
92
132
|
signing_key:
|
93
133
|
specification_version: 3
|
94
134
|
summary: Anemone web-spider framework
|
95
135
|
test_files:
|
136
|
+
- spec/fakeweb_helper.rb
|
137
|
+
- spec/page_spec.rb
|
96
138
|
- spec/anemone_spec.rb
|
97
|
-
- spec/cookie_store_spec.rb
|
98
139
|
- spec/core_spec.rb
|
99
|
-
- spec/
|
140
|
+
- spec/storage_spec.rb
|
100
141
|
- spec/page_store_spec.rb
|
142
|
+
- spec/cookie_store_spec.rb
|
101
143
|
- spec/http_spec.rb
|
102
|
-
- spec/storage_spec.rb
|
103
|
-
- spec/fakeweb_helper.rb
|
104
144
|
- spec/spec_helper.rb
|