sutch-anemone 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'anemone/storage/exceptions'
|
2
|
+
|
3
|
+
module Anemone
  module Storage
    # Thin wrapper around a storage adapter.
    #
    # Every call is forwarded to the adapter; any StandardError raised while
    # doing so is re-raised as one of the storage error classes referenced
    # below (RetrievalError, InsertionError, DeletionError, CloseError,
    # GenericError -- presumably declared in anemone/storage/exceptions,
    # which this file requires).
    class Base

      # @param adapter [Object] backend that must respond to every public
      #   method this class declares ([], []=, delete, each, merge!, close,
      #   size, keys, has_key?)
      # @raise [RuntimeError] when the adapter lacks one of those methods
      def initialize(adapter)
        @adap = adapter

        # Verify the adapter conforms to this class's own storage interface.
        # Methods inherited from Object are subtracted so that minimal or
        # proxy adapters are not rejected for lacking unrelated methods.
        storage_interface = self.class.public_instance_methods - Object.public_instance_methods
        storage_interface.each do |method|
          unless @adap.respond_to?(method.to_sym)
            raise "Storage adapter must support method #{method}"
          end
        end
      end

      # Fetch the value stored under +key+.
      #
      # @raise [RetrievalError] wrapping any error raised by the adapter
      def [](key)
        @adap[key]
      rescue => e
        # Library code must not write to stdout; the failure is reported
        # through the raised error only.
        raise RetrievalError, e
      end

      # Store +value+ under +key+.
      #
      # @raise [InsertionError] wrapping any error raised by the adapter
      def []=(key, value)
        @adap[key] = value
      rescue => e
        raise InsertionError, e
      end

      # Remove +key+ and return whatever the adapter's delete returns.
      #
      # @raise [DeletionError] wrapping any error raised by the adapter
      def delete(key)
        @adap.delete(key)
      rescue => e
        raise DeletionError, e
      end

      # Yield every key/value pair the adapter yields.
      #
      # Errors raised inside the caller's block are wrapped as well, because
      # the block runs inside the rescued region.
      #
      # @raise [GenericError]
      def each
        @adap.each { |k, v| yield k, v }
      rescue => e
        raise GenericError, e
      end

      # Bulk-insert the pairs of +hash+ through the adapter.
      #
      # @raise [GenericError]
      def merge!(hash)
        @adap.merge!(hash)
      rescue => e
        raise GenericError, e
      end

      # Close the underlying adapter.
      #
      # @raise [CloseError]
      def close
        @adap.close
      rescue => e
        raise CloseError, e
      end

      # @return the adapter's size
      # @raise [GenericError]
      def size
        @adap.size
      rescue => e
        raise GenericError, e
      end

      # @return the adapter's keys
      # @raise [GenericError]
      def keys
        @adap.keys
      rescue => e
        raise GenericError, e
      end

      # @return the adapter's answer to has_key?(key)
      # @raise [GenericError]
      def has_key?(key)
        @adap.has_key?(key)
      rescue => e
        raise GenericError, e
      end

    end
  end
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
begin
|
2
|
+
require 'kyotocabinet'
|
3
|
+
rescue LoadError
|
4
|
+
puts $!
|
5
|
+
puts "You need the kyotocabinet-ruby gem to use Anemone::Storage::KyotoCabinet"
|
6
|
+
exit
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'forwardable'
|
10
|
+
|
11
|
+
module Anemone
  module Storage
    # Storage adapter backed by a Kyoto Cabinet database file.
    #
    # Values are written as base64-encoded Marshal dumps and decoded again
    # on the way out.
    class KyotoCabinet
      extend Forwardable

      # #each is implemented explicitly below (it has to decode values), so
      # only close and size are delegated straight to the database handle.
      def_delegators :@db, :close, :size

      # Open (creating if necessary) the database at +file+ and clear it.
      #
      # @param file [String] path that must end in ".kch"
      # @raise [RuntimeError] when the extension is not ".kch"
      def initialize(file)
        raise "KyotoCabinet filename must have .kch extension" if File.extname(file) != '.kch'
        @db = ::KyotoCabinet::DB::new
        @db.open(file, ::KyotoCabinet::DB::OWRITER | ::KyotoCabinet::DB::OCREATE)
        @db.clear
      end

      # @return the decoded value stored under +key+, or nil when the
      #   database returns nothing for it
      def [](key)
        raw = @db[key]
        load_value(raw) if raw
      end

      # Store +value+ under +key+ as a base64-encoded Marshal dump.
      def []=(key, value)
        @db[key] = [Marshal.dump(value)].pack("m")
      end

      # Yield each key together with its decoded value.
      def each
        @db.each do |k, v|
          yield(k, load_value(v))
        end
      end

      # @return [Boolean] whether +key+ is present
      def has_key?(key)
        # Kyoto Cabinet doesn't have a way to query whether a key exists, so hack it:
        # list the keys sharing this prefix and look for an exact match.
        matches = @db.match_prefix(key)
        !!matches && matches.include?(key)
      end

      # @return [Array] all keys in the database
      def keys
        acc = []
        # NOTE(review): each_key appears to yield a one-element array here,
        # hence #first -- confirm against the kyotocabinet binding.
        @db.each_key { |key| acc << key.first }
        acc
      end

      # Remove +key+ and return the value it held (nil when absent).
      def delete(key)
        value = self[key]
        @db.delete(key)
        value
      end

      # Store every pair of +hash+; returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      private

      # Reverse of the encoding used in #[]=: base64-decode, then unmarshal.
      # NOTE(review): Marshal.load must only ever see data this adapter wrote.
      def load_value(value)
        Marshal.load(value.unpack("m")[0])
      end

    end
  end
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
begin
|
2
|
+
require 'mongo'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the mongo gem to use Anemone::Storage::MongoDB"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
  module Storage
    # Storage adapter that keeps one document per page in a MongoDB
    # collection, looked up by the document's 'url' field.
    class MongoDB

      # Document fields wrapped in BSON::Binary on write and turned back
      # into strings on read.
      BINARY_FIELDS = %w(body headers data)

      # Bind to +collection_name+ inside +mongo_db+, empty the collection
      # and create an index on 'url'.
      def initialize(mongo_db, collection_name)
        @db = mongo_db
        @collection = @db[collection_name]
        @collection.remove
        @collection.create_index 'url'
      end

      # @return the page rebuilt from the document whose 'url' equals
      #   url.to_s, or nil when no document is found
      def [](url)
        doc = @collection.find_one('url' => url.to_s)
        load_page(doc) if doc
      end

      # Upsert +page+. The document is selected by page.url.to_s; the +url+
      # argument itself is not used by the query.
      def []=(url, page)
        doc = page.to_hash
        BINARY_FIELDS.each do |field|
          next if doc[field].nil?
          doc[field] = BSON::Binary.new(doc[field])
        end
        selector = {'url' => page.url.to_s}
        @collection.update(selector, doc, :upsert => true)
      end

      # Remove the document for +url+ and return the page it held (nil when
      # there was none).
      def delete(url)
        removed = self[url]
        @collection.remove('url' => url.to_s)
        removed
      end

      # Yield [url string, page] for every document in the collection.
      def each
        @collection.find do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield page.url.to_s, page
          end
        end
      end

      # Store every pair of +hash+ through #[]=; returns self.
      def merge!(hash)
        hash.each do |key, value|
          self[key] = value
        end
        self
      end

      # @return the collection's document count
      def size
        @collection.count
      end

      # @return [Array<String>] the URL of every stored page; walks the
      #   whole collection through #each
      def keys
        urls = []
        each { |url, _page| urls << url.to_s }
        urls
      end

      # @return [Boolean] whether a document exists for url.to_s
      def has_key?(url)
        @collection.find_one('url' => url.to_s) ? true : false
      end

      # Close the connection owned by the database handle.
      def close
        @db.connection.close
      end

      private

      # Convert the binary fields of +hash+ to strings (in place) and build
      # a Page from the result.
      def load_page(hash)
        BINARY_FIELDS.each { |field| hash[field] = hash[field].to_s }
        Page.from_hash(hash)
      end

    end
  end
end
|
89
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'pstore'
|
2
|
+
require 'forwardable'
|
3
|
+
|
4
|
+
module Anemone
  module Storage
    # Storage adapter backed by Ruby's standard-library PStore.
    #
    # The set of stored keys is mirrored in an in-memory Hash (@keys) so
    # that has_key?, keys and size never have to open a transaction.
    class PStore
      extend Forwardable

      def_delegators :@keys, :has_key?, :keys, :size

      # Start from an empty store at +file+, deleting any existing file.
      #
      # @param file [String] path of the PStore file
      def initialize(file)
        # File.exists? was removed in Ruby 3.2; File.exist? works on every
        # Ruby version.
        File.delete(file) if File.exist?(file)
        @store = ::PStore.new(file)
        @keys = {}
      end

      # @return the value stored under +key+ (nil when absent)
      def [](key)
        @store.transaction { |s| s[key] }
      end

      # Store +value+ under +key+ and record the key in the in-memory index.
      def []=(key,value)
        @keys[key] = nil
        @store.transaction { |s| s[key] = value }
      end

      # Remove +key+ from both the index and the store.
      #
      # @return the value the store held for +key+
      def delete(key)
        @keys.delete(key)
        @store.transaction { |s| s.delete key}
      end

      # Yield every indexed key with its stored value.
      def each
        @keys.each_key do |key|
          value = nil
          @store.transaction { |s| value = s[key] }
          # Yield outside the transaction: PStore raises on nested
          # transactions, and the caller's block may use this store again.
          yield key, value
        end
      end

      # Store every pair of +hash+ inside a single transaction; returns self.
      def merge!(hash)
        @store.transaction do |s|
          hash.each { |key, value| s[key] = value; @keys[key] = nil }
        end
        self
      end

      # Nothing to release: each operation opens and closes its own
      # transaction.
      def close; end

    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
require 'redis'
|
2
|
+
|
3
|
+
module Anemone
  module Storage
    # Storage adapter that keeps each page as a Redis hash stored under
    # "<key_prefix>:pages:<key>".
    class Redis

      # Page fields that are Marshal-dumped before being written to Redis.
      MARSHAL_FIELDS = %w(links visited fetched)

      # Connect to Redis and delete every page already stored under the
      # prefix.
      #
      # @param opts [Hash] passed to ::Redis.new; :key_prefix (default
      #   'anemone') namespaces the keys this adapter writes
      def initialize(opts = {})
        @redis = ::Redis.new(opts)
        @key_prefix = opts[:key_prefix] || 'anemone'
        keys.each { |key| delete(key) }
      end

      # @return the page stored under +key+, or nil when there is none
      def [](key)
        rget(page_key(key))
      end

      # Write every field of value.to_hash into the Redis hash for +key+.
      def []=(key, value)
        rkey = page_key(key)
        hash = value.to_hash
        MARSHAL_FIELDS.each do |field|
          hash[field] = Marshal.dump(hash[field])
        end
        # Block parameter deliberately not named `value`, to avoid shadowing
        # the method parameter.
        hash.each do |field, field_value|
          @redis.hset(rkey, field, field_value)
        end
      end

      # Remove +key+ and return the page it held (nil when absent).
      def delete(key)
        page = self[key]
        @redis.del(page_key(key))
        page
      end

      # Yield [url string, page] for every stored page.
      #
      # NOTE(review): this uses KEYS, which scans the whole keyspace.
      def each
        rkeys = @redis.keys("#{@key_prefix}:pages:*")
        rkeys.each do |rkey|
          page = rget(rkey)
          # The key may have disappeared between KEYS and HGETALL.
          next if page.nil?
          yield page.url.to_s, page
        end
      end

      # Store every pair of +hash+ through #[]=; returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      # @return [Integer] number of keys under the pages prefix
      def size
        @redis.keys("#{@key_prefix}:pages:*").size
      end

      # @return [Array<String>] the URL of every stored page
      def keys
        keys = []
        self.each { |k, v| keys << k.to_s }
        keys
      end

      # @return [Boolean] whether a page is stored under +key+
      def has_key?(key)
        rkey = page_key(key)
        # Newer redis-rb returns an Integer count from #exists, and 0 is
        # truthy in Ruby. Prefer #exists? when the client has it, otherwise
        # normalise whatever #exists returned.
        return @redis.exists?(rkey) if @redis.respond_to?(:exists?)
        found = @redis.exists(rkey)
        found.is_a?(Integer) ? found > 0 : !!found
      end

      # Close the Redis connection.
      def close
        @redis.quit
      end

      private

      # Redis key under which the page for +key+ is stored.
      def page_key(key)
        "#{@key_prefix}:pages:#{key.to_s}"
      end

      # Unmarshal the marshalled fields of +hash+ (in place) and build a
      # Page from it.
      # NOTE(review): Marshal.load must only ever see data this adapter wrote.
      def load_value(hash)
        MARSHAL_FIELDS.each do |field|
          unless hash[field].nil? || hash[field] == ''
            hash[field] = Marshal.load(hash[field])
          end
        end
        Page.from_hash(hash)
      end

      # Load the page stored at the raw Redis key +rkey+.
      #
      # HGETALL returns an empty hash for a missing key, so emptiness (not
      # just nil) means "no such page".
      def rget(rkey)
        hash = @redis.hgetall(rkey)
        return nil if hash.nil? || hash.empty?
        load_value(hash)
      end

    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
begin
|
2
|
+
require 'sqlite3'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the sqlite3 gem to use Anemone::Storage::SQLite3"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
  module Storage
    # Storage adapter that keeps Marshal-dumped values in a single SQLite
    # table (anemone_storage) keyed by URL string.
    class SQLite3

      # Open the database at +file+ and make sure the table and its index
      # exist.
      #
      # NOTE(review): existing rows are not cleared here -- confirm whether
      # that is intended.
      def initialize(file)
        @db = ::SQLite3::Database.new(file)
        create_schema
      end

      # @return the unmarshalled value stored under url.to_s, or nil when
      #   no row matches
      # NOTE(review): Marshal.load must only ever see data this adapter wrote.
      def [](url)
        value = @db.get_first_value('SELECT data FROM anemone_storage WHERE key = ?', url.to_s)
        if value
          Marshal.load(value)
        end
      end

      # Store +value+ under url.to_s, updating the existing row when there
      # is one and inserting otherwise.
      def []=(url, value)
        data = Marshal.dump(value)
        if has_key?(url)
          @db.execute('UPDATE anemone_storage SET data = ? WHERE key = ?', data, url.to_s)
        else
          @db.execute('INSERT INTO anemone_storage (data, key) VALUES(?, ?)', data, url.to_s)
        end
      end

      # Remove the row for +url+ and return the value it held (nil when
      # absent).
      def delete(url)
        page = self[url]
        @db.execute('DELETE FROM anemone_storage WHERE key = ?', url.to_s)
        page
      end

      # Yield [key, value] for every row, ordered by row id.
      def each
        @db.execute("SELECT key, data FROM anemone_storage ORDER BY id") do |row|
          value = Marshal.load(row[1])
          yield row[0], value
        end
      end

      # Store every pair of +hash+ through #[]=; returns self.
      def merge!(hash)
        hash.each { |key, value| self[key] = value }
        self
      end

      # @return the number of rows in the table
      def size
        @db.get_first_value('SELECT COUNT(id) FROM anemone_storage')
      end

      # @return [Array] every key, ordered by row id
      def keys
        @db.execute("SELECT key FROM anemone_storage ORDER BY id").map{|t| t[0]}
      end

      # @return [Boolean] whether a row exists for url.to_s
      def has_key?(url)
        !!@db.get_first_value('SELECT id FROM anemone_storage WHERE key = ?', url.to_s)
      end

      # Close the database handle.
      def close
        @db.close
      end

      private

      # Create the storage table and its key index when they do not exist.
      def create_schema
        @db.execute_batch <<-SQL
          create table if not exists anemone_storage (
            id INTEGER PRIMARY KEY ASC,
            key TEXT,
            data BLOB
          );
          create index if not exists anemone_key_idx on anemone_storage (key);
        SQL
      end

    end
  end
end
|
90
|
+
|