sutch-anemone 0.7.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'anemone/storage/exceptions'
|
2
|
+
|
3
|
+
module Anemone
  module Storage
    # Uniform front-end for a storage adapter.
    #
    # Every public method forwards to the wrapped adapter. Any StandardError
    # raised while doing so is re-raised as one of the storage error classes
    # (RetrievalError, InsertionError, DeletionError, CloseError or
    # GenericError), with the original exception passed as the message.
    class Base

      # adapter - the object all calls are forwarded to.
      #
      # Raises RuntimeError naming the first method of this object that the
      # adapter does not respond to.
      def initialize(adapter)
        @adap = adapter

        # verify adapter conforms to this class's methods
        missing = methods.find { |name| !@adap.respond_to?(name.to_sym) }
        raise "Storage adapter must support method #{missing}" if missing
      end

      # Returns the adapter's value for key.
      # Raises RetrievalError if the adapter fails.
      def [](key)
        @adap[key]
      rescue StandardError => e
        raise RetrievalError, e
      end

      # Stores value under key.
      # Raises InsertionError if the adapter fails.
      def []=(key, value)
        @adap[key] = value
      rescue StandardError => e
        raise InsertionError, e
      end

      # Returns whatever the adapter's delete returns.
      # Raises DeletionError if the adapter fails.
      def delete(key)
        @adap.delete(key)
      rescue StandardError => e
        raise DeletionError, e
      end

      # Yields each key/value pair produced by the adapter.
      # The caller's block runs inside the rescued section, so an error it
      # raises is wrapped in GenericError as well.
      def each
        @adap.each { |k, v| yield k, v }
      rescue StandardError => e
        raise GenericError, e
      end

      # Forwards hash to the adapter's merge!.
      # Raises GenericError if the adapter fails.
      def merge!(hash)
        @adap.merge!(hash)
      rescue StandardError => e
        raise GenericError, e
      end

      # Closes the adapter.
      # Raises CloseError if the adapter fails.
      def close
        @adap.close
      rescue StandardError => e
        raise CloseError, e
      end

      # Returns the adapter's size.
      # Raises GenericError if the adapter fails.
      def size
        @adap.size
      rescue StandardError => e
        raise GenericError, e
      end

      # Returns the adapter's keys.
      # Raises GenericError if the adapter fails.
      def keys
        @adap.keys
      rescue StandardError => e
        raise GenericError, e
      end

      # Returns the adapter's answer to has_key?(key).
      # Raises GenericError if the adapter fails.
      def has_key?(key)
        @adap.has_key?(key)
      rescue StandardError => e
        raise GenericError, e
      end

    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
begin
|
2
|
+
require 'kyotocabinet'
|
3
|
+
rescue LoadError
|
4
|
+
puts $!
|
5
|
+
puts "You need the kyotocabinet-ruby gem to use Anemone::Storage::KyotoCabinet"
|
6
|
+
exit
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'forwardable'
|
10
|
+
|
11
|
+
module Anemone
  module Storage
    # Key/value store kept in a Kyoto Cabinet database.
    # Values are serialized with Marshal and Base64-packed before being
    # written, and unpacked/deserialized when read back.
    class KyotoCabinet
      extend Forwardable

      # NOTE(review): confirm that the binding's DB#size reports the record
      # count rather than the database file size (DB#count may be the method
      # that was intended).
      def_delegators :@db, :close, :size

      # file - path of the database; must end in ".kch".
      #
      # Opens the database with the OWRITER and OCREATE flags and then calls
      # clear on it.
      # Raises RuntimeError when the extension is not ".kch".
      def initialize(file)
        raise "KyotoCabinet filename must have .kch extension" if File.extname(file) != '.kch'
        @db = ::KyotoCabinet::DB::new
        @db.open(file, ::KyotoCabinet::DB::OWRITER | ::KyotoCabinet::DB::OCREATE)
        @db.clear
      end

      # Returns the deserialized value stored under key, or nil when the
      # database has nothing for it.
      def [](key)
        raw = @db[key]
        load_value(raw) if raw
      end

      # Serializes value and stores it under key.
      def []=(key, value)
        @db[key] = [Marshal.dump(value)].pack("m")
      end

      # Yields every key together with its deserialized value.
      def each
        @db.each do |k, v|
          yield(k, load_value(v))
        end
      end

      # Returns true when a record exists for exactly this key.
      #
      # Uses the same nil check on @db[key] that #[] relies on instead of
      # collecting every key that shares the prefix and searching that list.
      def has_key?(key)
        !@db[key].nil?
      end

      # Returns an Array of all keys.
      def keys
        acc = []
        # each block argument is unwrapped with first; presumably each_key
        # yields the key inside a one-element array - TODO confirm
        @db.each_key { |key| acc << key.first }
        acc
      end

      # Removes key and returns the value it held (nil if it held none).
      def delete(key)
        value = self[key]
        @db.delete(key)
        value
      end

      # Stores every pair of hash and returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      private

      # Reverses the encoding applied by []=.
      # Marshal.load executes whatever the database contains; only open
      # database files from a trusted source.
      def load_value(value)
        Marshal.load(value.unpack("m")[0])
      end

    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
begin
|
2
|
+
require 'mongo'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the mongo gem to use Anemone::Storage::MongoDB"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
  module Storage
    # Page store kept in a MongoDB collection, one document per page,
    # looked up through the document's 'url' field.
    class MongoDB

      # Document fields wrapped in BSON::Binary when written and turned
      # back into strings when read.
      BINARY_FIELDS = %w(body headers data)

      # mongo_db        - database handle; indexed with collection_name to
      #                   obtain the collection.
      # collection_name - name of the collection to use.
      #
      # Calls remove on the collection with no selector and creates an
      # index on 'url'.
      def initialize(mongo_db, collection_name)
        @db = mongo_db
        @collection = @db[collection_name]
        @collection.remove
        @collection.create_index 'url'
      end

      # Returns the page whose 'url' equals url.to_s, or nil if no document
      # matches.
      def [](url)
        doc = @collection.find_one('url' => url.to_s)
        load_page(doc) if doc
      end

      # Upserts page's hash form. The document is selected by page.url, not
      # by the url argument.
      def []=(url, page)
        doc = page.to_hash
        BINARY_FIELDS.each do |name|
          raw = doc[name]
          doc[name] = BSON::Binary.new(raw) unless raw.nil?
        end
        selector = {'url' => page.url.to_s}
        @collection.update(selector, doc, :upsert => true)
      end

      # Removes the document for url and returns the page it held (nil when
      # there was none).
      def delete(url)
        removed = self[url]
        @collection.remove('url' => url.to_s)
        removed
      end

      # Yields (url string, page) for every document in the collection.
      def each
        @collection.find do |cursor|
          cursor.each do |doc|
            loaded = load_page(doc)
            yield loaded.url.to_s, loaded
          end
        end
      end

      # Stores every value of hash through []= and returns self.
      def merge!(hash)
        hash.each { |url, page| self[url] = page }
        self
      end

      # Returns the collection's count.
      def size
        @collection.count
      end

      # Returns an Array with the url string of every stored page.
      def keys
        urls = []
        each { |url, _page| urls << url.to_s }
        urls
      end

      # Returns true when a document exists for url.to_s, false otherwise.
      def has_key?(url)
        found = @collection.find_one('url' => url.to_s)
        found ? true : false
      end

      # Closes the connection of the database handle.
      def close
        @db.connection.close
      end

      private

      # Converts every binary field of the document to a String (in place;
      # a nil field becomes "") and builds the page with Page.from_hash.
      def load_page(hash)
        BINARY_FIELDS.each { |name| hash[name] = hash[name].to_s }
        Page.from_hash(hash)
      end

    end
  end
end
|
89
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'pstore'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module Anemone
  module Storage
    # Key/value store kept in a PStore file.
    # The set of stored keys is tracked in an in-memory Hash (@keys), which
    # answers has_key?, keys and size without touching the file.
    class PStore
      extend Forwardable

      def_delegators :@keys, :has_key?, :keys, :size

      # file - path of the PStore file. A file already present at that path
      #        is deleted first.
      def initialize(file)
        # File.exists? was removed in Ruby 3.2; File.exist? works everywhere
        File.delete(file) if File.exist?(file)
        @store = ::PStore.new(file)
        @keys = {}
      end

      # Returns the value stored under key (read inside a transaction).
      def [](key)
        @store.transaction { |s| s[key] }
      end

      # Records key in the key index and stores value under it.
      def []=(key,value)
        @keys[key] = nil
        @store.transaction { |s| s[key] = value }
      end

      # Drops key from the key index and from the file; returns the result
      # of the PStore delete.
      def delete(key)
        @keys.delete(key)
        @store.transaction { |s| s.delete key}
      end

      # Yields every indexed key with its stored value. Each value is read
      # in its own transaction, which has ended by the time the block runs.
      def each
        @keys.each_key do |key|
          value = nil
          @store.transaction { |s| value = s[key] }
          yield key, value
        end
      end

      # Stores all pairs of hash in a single transaction and returns self.
      def merge!(hash)
        @store.transaction do |s|
          hash.each { |key, value| s[key] = value; @keys[key] = nil }
        end
        self
      end

      # No-op: this adapter keeps nothing open between calls.
      def close; end

    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
module Anemone
  module Storage
    # Page store kept in Redis. Each page is a Redis hash stored under
    # "<key_prefix>:pages:<key>", one hash field per entry of page.to_hash.
    class Redis

      # Page fields that are Marshal-dumped before being written.
      MARSHAL_FIELDS = %w(links visited fetched)

      # opts - options handed to ::Redis.new; opts[:key_prefix] (default
      #        'anemone') namespaces the keys used by this store.
      #
      # Every page already stored under the prefix is deleted.
      def initialize(opts = {})
        @redis = ::Redis.new(opts)
        @key_prefix = opts[:key_prefix] || 'anemone'
        keys.each { |key| delete(key) }
      end

      # Returns the page stored under key, or nil when there is none.
      def [](key)
        rget(page_key(key))
      end

      # Writes every field of value.to_hash into the Redis hash for key.
      def []=(key, value)
        rkey = page_key(key)
        hash = value.to_hash
        MARSHAL_FIELDS.each do |field|
          hash[field] = Marshal.dump(hash[field])
        end
        hash.each do |field, field_value|
          @redis.hset(rkey, field, field_value)
        end
      end

      # Removes key and returns the page it held (nil when it held none).
      def delete(key)
        page = self[key]
        @redis.del(page_key(key))
        page
      end

      # Yields (url string, page) for every page stored under the prefix.
      def each
        rkeys = @redis.keys("#{@key_prefix}:pages:*")
        rkeys.each do |rkey|
          page = rget(rkey)
          # the key may have been removed after it was listed
          next if page.nil?
          yield page.url.to_s, page
        end
      end

      # Stores every value of hash through []= and returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      # Returns the number of keys stored under the prefix.
      def size
        @redis.keys("#{@key_prefix}:pages:*").size
      end

      # Returns an Array with the url string of every stored page.
      def keys
        keys = []
        self.each { |k, v| keys << k.to_s }
        keys
      end

      # Returns true when a page is stored under key, false otherwise.
      #
      # Depending on the client version, exists answers with a boolean or
      # with an Integer count; a count of 0 is truthy in Ruby, so the answer
      # is normalized here.
      def has_key?(key)
        found = @redis.exists(page_key(key))
        found.is_a?(Integer) ? found > 0 : !!found
      end

      # Sends quit to the Redis connection.
      def close
        @redis.quit
      end

      private

      # Builds the Redis key under which the page for key is stored.
      def page_key(key)
        "#{@key_prefix}:pages:#{key.to_s}"
      end

      # Restores the Marshal-dumped fields (skipping nil and empty ones) and
      # builds the page with Page.from_hash.
      # Marshal.load executes whatever the server returns; only point this
      # store at a trusted Redis instance.
      def load_value(hash)
        MARSHAL_FIELDS.each do |field|
          unless hash[field].nil? || hash[field] == ''
            hash[field] = Marshal.load(hash[field])
          end
        end
        Page.from_hash(hash)
      end

      # Loads the page stored in the Redis hash rkey.
      # Returns nil when the hash comes back nil or empty: []= writes one
      # field per page attribute, so an empty hash means no page is stored.
      def rget(rkey)
        hash = @redis.hgetall(rkey)
        return nil if hash.nil? || hash.empty?
        load_value(hash)
      end

    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
begin
|
2
|
+
require 'sqlite3'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the sqlite3 gem to use Anemone::Storage::SQLite3"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
  module Storage
    # Key/value store kept in the anemone_storage table of a SQLite
    # database. Values are stored Marshal-dumped in the data column.
    class SQLite3

      # file - database file handed to ::SQLite3::Database.new.
      #
      # Creates the table and its key index when they do not exist yet.
      def initialize(file)
        @db = ::SQLite3::Database.new(file)
        create_schema
      end

      # Returns the deserialized value stored for url.to_s, or nil when no
      # row matches.
      # Marshal.load executes whatever the row contains; only open database
      # files from a trusted source.
      def [](url)
        value = @db.get_first_value('SELECT data FROM anemone_storage WHERE key = ?', url.to_s)
        Marshal.load(value) if value
      end

      # Stores value for url.to_s: updates the row when the key already
      # exists, inserts a new row otherwise.
      def []=(url, value)
        data = Marshal.dump(value)
        if has_key?(url)
          @db.execute('UPDATE anemone_storage SET data = ? WHERE key = ?', data, url.to_s)
        else
          @db.execute('INSERT INTO anemone_storage (data, key) VALUES(?, ?)', data, url.to_s)
        end
      end

      # Deletes the row for url and returns the value it held (nil when
      # there was none).
      def delete(url)
        page = self[url]
        @db.execute('DELETE FROM anemone_storage WHERE key = ?', url.to_s)
        page
      end

      # Yields (key, deserialized value) for every row, ordered by id.
      def each
        @db.execute("SELECT key, data FROM anemone_storage ORDER BY id") do |row|
          value = Marshal.load(row[1])
          yield row[0], value
        end
      end

      # Stores every pair of hash through []= and returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      # Returns the row count of the table.
      def size
        @db.get_first_value('SELECT COUNT(id) FROM anemone_storage')
      end

      # Returns an Array of all keys, ordered by id.
      def keys
        @db.execute("SELECT key FROM anemone_storage ORDER BY id").map{|t| t[0]}
      end

      # Returns true when a row exists for url.to_s, false otherwise.
      def has_key?(url)
        !!@db.get_first_value('SELECT id FROM anemone_storage WHERE key = ?', url.to_s)
      end

      # Closes the database.
      def close
        @db.close
      end

      private

      # Creates the anemone_storage table and the index on its key column
      # unless they already exist.
      def create_schema
        @db.execute_batch <<-SQL
          create table if not exists anemone_storage (
            id INTEGER PRIMARY KEY ASC,
            key TEXT,
            data BLOB
          );
          create index if not exists anemone_key_idx on anemone_storage (key);
        SQL
      end

    end
  end
end
|
90
|
+
|