anemone 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +18 -1
- data/README.rdoc +2 -1
- data/Rakefile +9 -10
- data/VERSION +1 -1
- data/lib/anemone/core.rb +1 -1
- data/lib/anemone/http.rb +1 -1
- data/lib/anemone/page.rb +18 -3
- data/lib/anemone/storage.rb +10 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +1 -0
- data/spec/core_spec.rb +22 -1
- data/spec/fakeweb_helper.rb +6 -8
- data/spec/http_spec.rb +1 -7
- data/spec/page_spec.rb +97 -13
- data/spec/page_store_spec.rb +19 -1
- data/spec/spec_helper.rb +2 -0
- data/spec/storage_spec.rb +56 -1
- metadata +164 -85
data/CHANGELOG.rdoc
CHANGED
@@ -1,6 +1,23 @@
|
|
1
|
+
== 0.7.0 / 2012-01-19
|
2
|
+
|
3
|
+
* Major enhancements
|
4
|
+
|
5
|
+
* Added support for SQLite3 and Kyoto Cabinet storage
|
6
|
+
|
7
|
+
* Minor enhancements
|
8
|
+
|
9
|
+
* Added Page#base to use base HTML element
|
10
|
+
* Use bundler for development dependencies
|
11
|
+
|
12
|
+
* Bug fixes
|
13
|
+
|
14
|
+
* Encode characters in URLs
|
15
|
+
* Fix specs to run under rake
|
16
|
+
* Fix handling of redirect_to in storage adapters
|
17
|
+
|
1
18
|
== 0.6.1 / 2011-02-24
|
2
19
|
|
3
|
-
*Bug fixes
|
20
|
+
* Bug fixes
|
4
21
|
|
5
22
|
* Fix a bug preventing SSL connections from working
|
6
23
|
|
data/README.rdoc
CHANGED
@@ -16,7 +16,7 @@ See http://anemone.rubyforge.org for more information.
|
|
16
16
|
* Records response time for each page
|
17
17
|
* CLI program can list all pages in a domain, calculate page depths, and more
|
18
18
|
* Obey robots.txt
|
19
|
-
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, MongoDB, or Redis
|
19
|
+
* In-memory or persistent storage of pages during crawl, using TokyoCabinet, SQLite3, MongoDB, or Redis
|
20
20
|
|
21
21
|
== Examples
|
22
22
|
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
|
@@ -32,5 +32,6 @@ To test and develop this gem, additional requirements are:
|
|
32
32
|
* tokyocabinet
|
33
33
|
* mongo
|
34
34
|
* redis
|
35
|
+
* sqlite3
|
35
36
|
|
36
37
|
You will need to have {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
|
data/Rakefile
CHANGED
@@ -1,26 +1,25 @@
|
|
1
1
|
require 'rubygems'
|
2
2
|
require 'rake'
|
3
|
+
require 'rspec/core/rake_task'
|
3
4
|
|
4
|
-
|
5
|
-
|
6
|
-
spec.
|
7
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
5
|
+
desc "Run all specs"
|
6
|
+
RSpec::Core::RakeTask.new(:rspec) do |spec|
|
7
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
8
8
|
end
|
9
9
|
|
10
|
-
|
11
|
-
spec.libs << 'lib' << 'spec'
|
10
|
+
RSpec::Core::RakeTask.new(:rcov) do |spec|
|
12
11
|
spec.pattern = 'spec/**/*_spec.rb'
|
13
12
|
spec.rcov = true
|
14
13
|
end
|
15
14
|
|
16
|
-
task :default => :
|
15
|
+
task :default => :rspec
|
17
16
|
|
18
|
-
require '
|
19
|
-
|
17
|
+
require 'rdoc/task'
|
18
|
+
RDoc::Task.new do |rdoc|
|
20
19
|
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
21
20
|
|
22
21
|
rdoc.rdoc_dir = 'rdoc'
|
23
22
|
rdoc.title = "anemone #{version}"
|
24
23
|
rdoc.rdoc_files.include('README*')
|
25
24
|
rdoc.rdoc_files.include('lib/**/*.rb')
|
26
|
-
end
|
25
|
+
end
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.7.0
|
data/lib/anemone/core.rb
CHANGED
data/lib/anemone/http.rb
CHANGED
@@ -112,7 +112,7 @@ module Anemone
|
|
112
112
|
|
113
113
|
response, response_time = get_response(loc, referer)
|
114
114
|
code = Integer(response.code)
|
115
|
-
redirect_to = response.is_a?(Net::HTTPRedirection) ?
|
115
|
+
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
116
116
|
yield response, code, loc, redirect_to, response_time
|
117
117
|
limit -= 1
|
118
118
|
end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
data/lib/anemone/page.rb
CHANGED
@@ -62,7 +62,7 @@ module Anemone
|
|
62
62
|
doc.search("//a[@href]").each do |a|
|
63
63
|
u = a['href']
|
64
64
|
next if u.nil? or u.empty?
|
65
|
-
abs = to_absolute(URI(u)) rescue next
|
65
|
+
abs = to_absolute(URI(URI.escape(u))) rescue next
|
66
66
|
@links << abs if in_domain?(abs)
|
67
67
|
end
|
68
68
|
@links.uniq!
|
@@ -131,6 +131,21 @@ module Anemone
|
|
131
131
|
404 == @code
|
132
132
|
end
|
133
133
|
|
134
|
+
#
|
135
|
+
# Base URI from the HTML doc head element
|
136
|
+
# http://www.w3.org/TR/html4/struct/links.html#edef-BASE
|
137
|
+
#
|
138
|
+
def base
|
139
|
+
@base = if doc
|
140
|
+
href = doc.search('//head/base/@href')
|
141
|
+
URI(href.to_s) unless href.nil? rescue nil
|
142
|
+
end unless @base
|
143
|
+
|
144
|
+
return nil if @base && @base.to_s().empty?
|
145
|
+
@base
|
146
|
+
end
|
147
|
+
|
148
|
+
|
134
149
|
#
|
135
150
|
# Converts relative URL *link* into an absolute URL based on the
|
136
151
|
# location of the page
|
@@ -142,7 +157,7 @@ module Anemone
|
|
142
157
|
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
|
143
158
|
|
144
159
|
relative = URI(link)
|
145
|
-
absolute = @url.merge(relative)
|
160
|
+
absolute = base ? base.merge(relative) : @url.merge(relative)
|
146
161
|
|
147
162
|
absolute.path = '/' if absolute.path.empty?
|
148
163
|
|
@@ -190,7 +205,7 @@ module Anemone
|
|
190
205
|
'@visited' => hash['visited'],
|
191
206
|
'@depth' => hash['depth'].to_i,
|
192
207
|
'@referer' => hash['referer'],
|
193
|
-
'@redirect_to' => URI(hash['redirect_to']),
|
208
|
+
'@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
|
194
209
|
'@response_time' => hash['response_time'].to_i,
|
195
210
|
'@fetched' => hash['fetched']
|
196
211
|
}.each do |var, value|
|
data/lib/anemone/storage.rb
CHANGED
@@ -18,6 +18,11 @@ module Anemone
|
|
18
18
|
self::TokyoCabinet.new(file)
|
19
19
|
end
|
20
20
|
|
21
|
+
def self.KyotoCabinet(file = 'anemone.tch')
|
22
|
+
require 'anemone/storage/kyoto_cabinet'
|
23
|
+
self::KyotoCabinet.new(file)
|
24
|
+
end
|
25
|
+
|
21
26
|
def self.MongoDB(mongo_db = nil, collection_name = 'pages')
|
22
27
|
require 'anemone/storage/mongodb'
|
23
28
|
mongo_db ||= Mongo::Connection.new.db('anemone')
|
@@ -29,6 +34,11 @@ module Anemone
|
|
29
34
|
require 'anemone/storage/redis'
|
30
35
|
self::Redis.new(opts)
|
31
36
|
end
|
37
|
+
|
38
|
+
def self.SQLite3(file = 'anemone.db')
|
39
|
+
require 'anemone/storage/sqlite3'
|
40
|
+
self::SQLite3.new(file)
|
41
|
+
end
|
32
42
|
|
33
43
|
end
|
34
44
|
end
|
@@ -0,0 +1,72 @@
|
|
1
|
+
begin
|
2
|
+
require 'kyotocabinet'
|
3
|
+
rescue LoadError
|
4
|
+
puts $!
|
5
|
+
puts "You need the kyotocabinet-ruby gem to use Anemone::Storage::KyotoCabinet"
|
6
|
+
exit
|
7
|
+
end
|
8
|
+
|
9
|
+
require 'forwardable'
|
10
|
+
|
11
|
+
module Anemone
|
12
|
+
module Storage
|
13
|
+
class KyotoCabinet
|
14
|
+
extend Forwardable
|
15
|
+
|
16
|
+
def_delegators :@db, :close, :size, :each
|
17
|
+
|
18
|
+
def initialize(file)
|
19
|
+
raise "KyotoCabinet filename must have .kch extension" if File.extname(file) != '.kch'
|
20
|
+
@db = ::KyotoCabinet::DB::new
|
21
|
+
@db.open(file, ::KyotoCabinet::DB::OWRITER | ::KyotoCabinet::DB::OCREATE)
|
22
|
+
@db.clear
|
23
|
+
end
|
24
|
+
|
25
|
+
def [](key)
|
26
|
+
if value = @db[key]
|
27
|
+
load_value(value)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
def []=(key, value)
|
32
|
+
@db[key] = [Marshal.dump(value)].pack("m")
|
33
|
+
end
|
34
|
+
|
35
|
+
def each
|
36
|
+
@db.each do |k, v|
|
37
|
+
yield(k, load_value(v))
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
def has_key?(key)
|
42
|
+
# Kyoto Cabinet doesn't have a way to query whether a key exists, so hack it
|
43
|
+
keys = @db.match_prefix(key)
|
44
|
+
!!keys && keys.include?(key)
|
45
|
+
end
|
46
|
+
|
47
|
+
def keys
|
48
|
+
acc = []
|
49
|
+
@db.each_key { |key| acc << key.first }
|
50
|
+
acc
|
51
|
+
end
|
52
|
+
|
53
|
+
def delete(key)
|
54
|
+
value = self[key]
|
55
|
+
@db.delete(key)
|
56
|
+
value
|
57
|
+
end
|
58
|
+
|
59
|
+
def merge!(hash)
|
60
|
+
hash.each { |key, value| self[key] = value }
|
61
|
+
self
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def load_value(value)
|
67
|
+
Marshal.load(value.unpack("m")[0])
|
68
|
+
end
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
begin
|
2
|
+
require 'sqlite3'
|
3
|
+
rescue LoadError
|
4
|
+
puts "You need the sqlite3 gem to use Anemone::Storage::SQLite3"
|
5
|
+
exit
|
6
|
+
end
|
7
|
+
|
8
|
+
module Anemone
|
9
|
+
module Storage
|
10
|
+
class SQLite3
|
11
|
+
|
12
|
+
def initialize(file)
|
13
|
+
@db = ::SQLite3::Database.new(file)
|
14
|
+
create_schema
|
15
|
+
end
|
16
|
+
|
17
|
+
def [](url)
|
18
|
+
value = @db.get_first_value('SELECT data FROM anemone_storage WHERE key = ?', url.to_s)
|
19
|
+
if value
|
20
|
+
Marshal.load(value)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def []=(url, value)
|
25
|
+
data = Marshal.dump(value)
|
26
|
+
if has_key?(url)
|
27
|
+
@db.execute('UPDATE anemone_storage SET data = ? WHERE key = ?', data, url.to_s)
|
28
|
+
else
|
29
|
+
@db.execute('INSERT INTO anemone_storage (data, key) VALUES(?, ?)', data, url.to_s)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def delete(url)
|
34
|
+
page = self[url]
|
35
|
+
@db.execute('DELETE FROM anemone_storage WHERE key = ?', url.to_s)
|
36
|
+
page
|
37
|
+
end
|
38
|
+
|
39
|
+
def each
|
40
|
+
@db.execute("SELECT key, data FROM anemone_storage ORDER BY id") do |row|
|
41
|
+
value = Marshal.load(row[1])
|
42
|
+
yield row[0], value
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
def merge!(hash)
|
47
|
+
hash.each { |key, value| self[key] = value }
|
48
|
+
self
|
49
|
+
end
|
50
|
+
|
51
|
+
def size
|
52
|
+
@db.get_first_value('SELECT COUNT(id) FROM anemone_storage')
|
53
|
+
end
|
54
|
+
|
55
|
+
def keys
|
56
|
+
@db.execute("SELECT key FROM anemone_storage ORDER BY id").map{|t| t[0]}
|
57
|
+
end
|
58
|
+
|
59
|
+
def has_key?(url)
|
60
|
+
!!@db.get_first_value('SELECT id FROM anemone_storage WHERE key = ?', url.to_s)
|
61
|
+
end
|
62
|
+
|
63
|
+
def close
|
64
|
+
@db.close
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
def create_schema
|
70
|
+
@db.execute_batch <<SQL
|
71
|
+
create table if not exists anemone_storage (
|
72
|
+
id INTEGER PRIMARY KEY ASC,
|
73
|
+
key TEXT,
|
74
|
+
data BLOB
|
75
|
+
);
|
76
|
+
create index if not exists anemone_key_idx on anemone_storage (key);
|
77
|
+
SQL
|
78
|
+
end
|
79
|
+
|
80
|
+
def load_page(hash)
|
81
|
+
BINARY_FIELDS.each do |field|
|
82
|
+
hash[field] = hash[field].to_s
|
83
|
+
end
|
84
|
+
Page.from_hash(hash)
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
data/spec/core_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__))
|
2
2
|
require 'spec_helper'
|
3
|
-
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
|
+
%w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
|
4
4
|
|
5
5
|
module Anemone
|
6
6
|
describe Core do
|
@@ -277,6 +277,27 @@ module Anemone
|
|
277
277
|
end
|
278
278
|
end
|
279
279
|
|
280
|
+
describe Storage::SQLite3 do
|
281
|
+
it_should_behave_like "crawl"
|
282
|
+
|
283
|
+
before(:all) do
|
284
|
+
@test_file = 'test.db'
|
285
|
+
end
|
286
|
+
|
287
|
+
before(:each) do
|
288
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
289
|
+
@opts = {:storage => @store = Storage.SQLite3(@test_file)}
|
290
|
+
end
|
291
|
+
|
292
|
+
after(:each) do
|
293
|
+
@store.close
|
294
|
+
end
|
295
|
+
|
296
|
+
after(:each) do
|
297
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
298
|
+
end
|
299
|
+
end
|
300
|
+
|
280
301
|
describe "options" do
|
281
302
|
it "should accept options for the crawl" do
|
282
303
|
core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
|
data/spec/fakeweb_helper.rb
CHANGED
@@ -1,10 +1,3 @@
|
|
1
|
-
begin
|
2
|
-
require 'fakeweb'
|
3
|
-
rescue LoadError
|
4
|
-
warn "You need the 'fakeweb' gem installed to test Anemone"
|
5
|
-
exit
|
6
|
-
end
|
7
|
-
|
8
1
|
FakeWeb.allow_net_connect = false
|
9
2
|
|
10
3
|
module Anemone
|
@@ -22,6 +15,7 @@ module Anemone
|
|
22
15
|
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
23
16
|
@redirect = options[:redirect] if options.has_key?(:redirect)
|
24
17
|
@auth = options[:auth] if options.has_key?(:auth)
|
18
|
+
@base = options[:base] if options.has_key?(:base)
|
25
19
|
@content_type = options[:content_type] || "text/html"
|
26
20
|
@body = options[:body]
|
27
21
|
|
@@ -40,7 +34,11 @@ module Anemone
|
|
40
34
|
private
|
41
35
|
|
42
36
|
def create_body
|
43
|
-
@
|
37
|
+
if @base
|
38
|
+
@body = "<html><head><base href=\"#{@base}\"></head><body>"
|
39
|
+
else
|
40
|
+
@body = "<html><body>"
|
41
|
+
end
|
44
42
|
@links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
|
45
43
|
@hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
|
46
44
|
@body += "</body></html>"
|
data/spec/http_spec.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
$:.unshift(File.dirname(__FILE__))
|
2
1
|
require 'spec_helper'
|
3
2
|
|
4
3
|
module Anemone
|
@@ -10,12 +9,7 @@ module Anemone
|
|
10
9
|
end
|
11
10
|
|
12
11
|
it "should still return a Page if an exception occurs during the HTTP connection" do
|
13
|
-
|
14
|
-
def refresh_connection
|
15
|
-
raise "test exception"
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
12
|
+
HTTP.stub!(:refresh_connection).and_raise(StandardError)
|
19
13
|
http = Anemone::HTTP.new
|
20
14
|
http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
|
21
15
|
end
|
data/spec/page_spec.rb
CHANGED
@@ -74,19 +74,103 @@ module Anemone
|
|
74
74
|
@page.cookies.should == []
|
75
75
|
end
|
76
76
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
77
|
+
describe "#to_hash" do
|
78
|
+
it "converts the page to a hash" do
|
79
|
+
hash = @page.to_hash
|
80
|
+
hash['url'].should == @page.url.to_s
|
81
|
+
hash['referer'].should == @page.referer.to_s
|
82
|
+
hash['links'].should == @page.links.map(&:to_s)
|
83
|
+
end
|
84
|
+
|
85
|
+
context "when redirect_to is nil" do
|
86
|
+
it "sets 'redirect_to' to nil in the hash" do
|
87
|
+
@page.redirect_to.should be_nil
|
88
|
+
@page.to_hash[:redirect_to].should be_nil
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
context "when redirect_to is a non-nil URI" do
|
93
|
+
it "sets 'redirect_to' to the URI string" do
|
94
|
+
new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
|
95
|
+
new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
|
96
|
+
new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
|
97
|
+
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
describe "#from_hash" do
|
102
|
+
it "converts from a hash to a Page" do
|
103
|
+
page = @page.dup
|
104
|
+
page.depth = 1
|
105
|
+
converted = Page.from_hash(page.to_hash)
|
106
|
+
converted.links.should == page.links
|
107
|
+
converted.depth.should == page.depth
|
108
|
+
end
|
109
|
+
|
110
|
+
it 'handles a from_hash with a nil redirect_to' do
|
111
|
+
page_hash = @page.to_hash
|
112
|
+
page_hash['redirect_to'] = nil
|
113
|
+
lambda{Page.from_hash(page_hash)}.should_not raise_error(URI::InvalidURIError)
|
114
|
+
Page.from_hash(page_hash).redirect_to.should be_nil
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
describe "#redirect_to" do
|
119
|
+
context "when the page was a redirect" do
|
120
|
+
it "returns a URI of the page it redirects to" do
|
121
|
+
new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
|
122
|
+
redirect = new_page.redirect_to
|
123
|
+
redirect.should be_a(URI)
|
124
|
+
redirect.to_s.should == SPEC_DOMAIN + '1'
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
it "should detect, store and expose the base url for the page head" do
|
130
|
+
base = "#{SPEC_DOMAIN}path/to/base_url/"
|
131
|
+
page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
|
132
|
+
page.base.should == URI(base)
|
133
|
+
@page.base.should be_nil
|
134
|
+
end
|
135
|
+
|
136
|
+
it "should have a method to convert a relative url to an absolute one" do
|
137
|
+
@page.should respond_to(:to_absolute)
|
138
|
+
|
139
|
+
# Identity
|
140
|
+
@page.to_absolute(@page.url).should == @page.url
|
141
|
+
@page.to_absolute("").should == @page.url
|
142
|
+
|
143
|
+
# Root-ness
|
144
|
+
@page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
|
145
|
+
|
146
|
+
# Relativeness
|
147
|
+
relative_path = "a/relative/path"
|
148
|
+
@page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
|
149
|
+
|
150
|
+
deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
|
151
|
+
upward_relative_path = "../a/relative/path"
|
152
|
+
deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
|
153
|
+
|
154
|
+
# The base URL case
|
155
|
+
base_path = "path/to/base_url/"
|
156
|
+
base = "#{SPEC_DOMAIN}#{base_path}"
|
157
|
+
page = @http.fetch_page(FakePage.new('home', {:base => base}).url)
|
158
|
+
|
159
|
+
# Identity
|
160
|
+
page.to_absolute(page.url).should == page.url
|
161
|
+
# It should revert to the base url
|
162
|
+
page.to_absolute("").should_not == page.url
|
163
|
+
|
164
|
+
# Root-ness
|
165
|
+
page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
|
166
|
+
|
167
|
+
# Relativeness
|
168
|
+
relative_path = "a/relative/path"
|
169
|
+
page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")
|
170
|
+
|
171
|
+
upward_relative_path = "../a/relative/path"
|
172
|
+
upward_base = "#{SPEC_DOMAIN}path/to/"
|
173
|
+
page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")
|
90
174
|
end
|
91
175
|
|
92
176
|
end
|
data/spec/page_store_spec.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__))
|
2
2
|
require 'spec_helper'
|
3
|
-
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
3
|
+
%w[pstore tokyo_cabinet sqlite3 mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
4
4
|
|
5
5
|
module Anemone
|
6
6
|
describe PageStore do
|
@@ -125,6 +125,24 @@ module Anemone
|
|
125
125
|
end
|
126
126
|
end
|
127
127
|
|
128
|
+
describe Storage::SQLite3 do
|
129
|
+
it_should_behave_like "page storage"
|
130
|
+
|
131
|
+
before(:each) do
|
132
|
+
@test_file = 'test.db'
|
133
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
134
|
+
@opts = {:storage => @store = Storage.SQLite3(@test_file)}
|
135
|
+
end
|
136
|
+
|
137
|
+
after(:each) do
|
138
|
+
@store.close
|
139
|
+
end
|
140
|
+
|
141
|
+
after(:each) do
|
142
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
128
146
|
describe Storage::MongoDB do
|
129
147
|
it_should_behave_like "page storage"
|
130
148
|
|
data/spec/spec_helper.rb
CHANGED
data/spec/storage_spec.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
$:.unshift(File.dirname(__FILE__))
|
2
2
|
require 'spec_helper'
|
3
3
|
|
4
|
-
%w[pstore tokyo_cabinet mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
4
|
+
%w[pstore tokyo_cabinet kyoto_cabinet sqlite3 mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
5
5
|
|
6
6
|
module Anemone
|
7
7
|
describe Storage do
|
@@ -25,6 +25,14 @@ module Anemone
|
|
25
25
|
store.close
|
26
26
|
end
|
27
27
|
|
28
|
+
it "should have a class method to produce a SQLite3" do
|
29
|
+
test_file = 'test.db'
|
30
|
+
Anemone::Storage.should respond_to(:SQLite3)
|
31
|
+
store = Anemone::Storage.SQLite3(test_file)
|
32
|
+
store.should be_an_instance_of(Anemone::Storage::SQLite3)
|
33
|
+
store.close
|
34
|
+
end
|
35
|
+
|
28
36
|
it "should have a class method to produce a MongoDB" do
|
29
37
|
Anemone::Storage.should respond_to(:MongoDB)
|
30
38
|
store = Anemone::Storage.MongoDB
|
@@ -105,6 +113,12 @@ module Anemone
|
|
105
113
|
|
106
114
|
merged.should === @store
|
107
115
|
end
|
116
|
+
|
117
|
+
it "should correctly deserialize nil redirect_to when loading" do
|
118
|
+
@page.redirect_to.should be_nil
|
119
|
+
@store[@url] = @page
|
120
|
+
@store[@url].redirect_to.should be_nil
|
121
|
+
end
|
108
122
|
end
|
109
123
|
|
110
124
|
describe PStore do
|
@@ -143,6 +157,47 @@ module Anemone
|
|
143
157
|
end
|
144
158
|
end
|
145
159
|
|
160
|
+
describe KyotoCabinet do
|
161
|
+
it_should_behave_like "storage engine"
|
162
|
+
|
163
|
+
before(:each) do
|
164
|
+
@test_file = 'test.kch'
|
165
|
+
File.delete @test_file rescue nil
|
166
|
+
@store = Anemone::Storage.KyotoCabinet(@test_file)
|
167
|
+
end
|
168
|
+
|
169
|
+
after(:each) do
|
170
|
+
@store.close
|
171
|
+
end
|
172
|
+
|
173
|
+
after(:all) do
|
174
|
+
File.delete @test_file rescue nil
|
175
|
+
end
|
176
|
+
|
177
|
+
it "should raise an error if supplied with a file extension other than .kch" do
|
178
|
+
lambda { Anemone::Storage.KyotoCabinet('test.tmp') }.should raise_error(RuntimeError)
|
179
|
+
end
|
180
|
+
end
|
181
|
+
|
182
|
+
describe SQLite3 do
|
183
|
+
it_should_behave_like "storage engine"
|
184
|
+
|
185
|
+
before(:each) do
|
186
|
+
@test_file = 'test.db'
|
187
|
+
File.delete @test_file rescue nil
|
188
|
+
@store = Anemone::Storage.SQLite3(@test_file)
|
189
|
+
end
|
190
|
+
|
191
|
+
after(:each) do
|
192
|
+
@store.close
|
193
|
+
end
|
194
|
+
|
195
|
+
after(:all) do
|
196
|
+
File.delete @test_file rescue nil
|
197
|
+
end
|
198
|
+
|
199
|
+
end
|
200
|
+
|
146
201
|
describe Storage::MongoDB do
|
147
202
|
it_should_behave_like "storage engine"
|
148
203
|
|
metadata
CHANGED
@@ -1,139 +1,218 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
|
6
|
-
- 0
|
7
|
-
- 6
|
8
|
-
- 1
|
9
|
-
version: 0.6.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.7.0
|
5
|
+
prerelease:
|
10
6
|
platform: ruby
|
11
|
-
authors:
|
7
|
+
authors:
|
12
8
|
- Chris Kite
|
13
9
|
autorequire:
|
14
10
|
bindir: bin
|
15
11
|
cert_chain: []
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
dependencies:
|
20
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-01-20 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
21
15
|
name: nokogiri
|
22
|
-
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: &19111780 !ruby/object:Gem::Requirement
|
24
17
|
none: false
|
25
|
-
requirements:
|
26
|
-
- -
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
segments:
|
29
|
-
- 1
|
30
|
-
- 3
|
31
|
-
- 0
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
32
21
|
version: 1.3.0
|
33
22
|
type: :runtime
|
34
|
-
version_requirements: *id001
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: robots
|
37
23
|
prerelease: false
|
38
|
-
|
24
|
+
version_requirements: *19111780
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: robots
|
27
|
+
requirement: &19111300 !ruby/object:Gem::Requirement
|
39
28
|
none: false
|
40
|
-
requirements:
|
41
|
-
- -
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
segments:
|
44
|
-
- 0
|
45
|
-
- 7
|
46
|
-
- 2
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
47
32
|
version: 0.7.2
|
48
33
|
type: :runtime
|
49
|
-
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *19111300
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rake
|
38
|
+
requirement: &19141340 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.8.7
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *19141340
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rspec
|
49
|
+
requirement: &19140880 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.6.0
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *19140880
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: fakeweb
|
60
|
+
requirement: &19140420 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 1.3.0
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *19140420
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: redis
|
71
|
+
requirement: &19139960 !ruby/object:Gem::Requirement
|
72
|
+
none: false
|
73
|
+
requirements:
|
74
|
+
- - ! '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 2.2.0
|
77
|
+
type: :development
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: *19139960
|
80
|
+
- !ruby/object:Gem::Dependency
|
81
|
+
name: mongo
|
82
|
+
requirement: &19139500 !ruby/object:Gem::Requirement
|
83
|
+
none: false
|
84
|
+
requirements:
|
85
|
+
- - ! '>='
|
86
|
+
- !ruby/object:Gem::Version
|
87
|
+
version: 1.3.1
|
88
|
+
type: :development
|
89
|
+
prerelease: false
|
90
|
+
version_requirements: *19139500
|
91
|
+
- !ruby/object:Gem::Dependency
|
92
|
+
name: bson_ext
|
93
|
+
requirement: &19139040 !ruby/object:Gem::Requirement
|
94
|
+
none: false
|
95
|
+
requirements:
|
96
|
+
- - ! '>='
|
97
|
+
- !ruby/object:Gem::Version
|
98
|
+
version: 1.3.1
|
99
|
+
type: :development
|
100
|
+
prerelease: false
|
101
|
+
version_requirements: *19139040
|
102
|
+
- !ruby/object:Gem::Dependency
|
103
|
+
name: tokyocabinet
|
104
|
+
requirement: &19138580 !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '1.29'
|
110
|
+
type: :development
|
111
|
+
prerelease: false
|
112
|
+
version_requirements: *19138580
|
113
|
+
- !ruby/object:Gem::Dependency
|
114
|
+
name: kyotocabinet-ruby
|
115
|
+
requirement: &19138120 !ruby/object:Gem::Requirement
|
116
|
+
none: false
|
117
|
+
requirements:
|
118
|
+
- - ! '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: 1.27.1
|
121
|
+
type: :development
|
122
|
+
prerelease: false
|
123
|
+
version_requirements: *19138120
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: sqlite3
|
126
|
+
requirement: &19137660 !ruby/object:Gem::Requirement
|
127
|
+
none: false
|
128
|
+
requirements:
|
129
|
+
- - ! '>='
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: 1.3.4
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: *19137660
|
50
135
|
description:
|
51
136
|
email:
|
52
|
-
executables:
|
137
|
+
executables:
|
53
138
|
- anemone
|
54
139
|
extensions: []
|
55
|
-
|
56
|
-
extra_rdoc_files:
|
140
|
+
extra_rdoc_files:
|
57
141
|
- README.rdoc
|
58
|
-
files:
|
142
|
+
files:
|
59
143
|
- VERSION
|
60
144
|
- LICENSE.txt
|
61
145
|
- CHANGELOG.rdoc
|
62
146
|
- README.rdoc
|
63
147
|
- Rakefile
|
64
|
-
- lib/anemone.rb
|
65
|
-
- lib/anemone/cookie_store.rb
|
66
|
-
- lib/anemone/storage.rb
|
67
|
-
- lib/anemone/core.rb
|
68
|
-
- lib/anemone/cli.rb
|
69
|
-
- lib/anemone/exceptions.rb
|
70
|
-
- lib/anemone/tentacle.rb
|
71
|
-
- lib/anemone/storage/tokyo_cabinet.rb
|
72
|
-
- lib/anemone/storage/base.rb
|
73
|
-
- lib/anemone/storage/exceptions.rb
|
74
148
|
- lib/anemone/storage/pstore.rb
|
75
149
|
- lib/anemone/storage/mongodb.rb
|
150
|
+
- lib/anemone/storage/tokyo_cabinet.rb
|
151
|
+
- lib/anemone/storage/exceptions.rb
|
76
152
|
- lib/anemone/storage/redis.rb
|
77
|
-
- lib/anemone/
|
153
|
+
- lib/anemone/storage/sqlite3.rb
|
154
|
+
- lib/anemone/storage/base.rb
|
155
|
+
- lib/anemone/storage/kyoto_cabinet.rb
|
78
156
|
- lib/anemone/page_store.rb
|
79
|
-
- lib/anemone/
|
80
|
-
- lib/anemone/
|
81
|
-
- lib/anemone/
|
157
|
+
- lib/anemone/storage.rb
|
158
|
+
- lib/anemone/tentacle.rb
|
159
|
+
- lib/anemone/http.rb
|
160
|
+
- lib/anemone/cli.rb
|
161
|
+
- lib/anemone/page.rb
|
162
|
+
- lib/anemone/exceptions.rb
|
163
|
+
- lib/anemone/core.rb
|
82
164
|
- lib/anemone/cli/url_list.rb
|
83
165
|
- lib/anemone/cli/serialize.rb
|
84
|
-
- lib/anemone/
|
85
|
-
-
|
86
|
-
-
|
87
|
-
-
|
166
|
+
- lib/anemone/cli/count.rb
|
167
|
+
- lib/anemone/cli/cron.rb
|
168
|
+
- lib/anemone/cli/pagedepth.rb
|
169
|
+
- lib/anemone/cookie_store.rb
|
170
|
+
- lib/anemone.rb
|
88
171
|
- spec/fakeweb_helper.rb
|
89
172
|
- spec/page_spec.rb
|
90
|
-
- spec/cookie_store_spec.rb
|
91
173
|
- spec/anemone_spec.rb
|
92
|
-
- spec/
|
174
|
+
- spec/core_spec.rb
|
93
175
|
- spec/storage_spec.rb
|
176
|
+
- spec/page_store_spec.rb
|
177
|
+
- spec/cookie_store_spec.rb
|
178
|
+
- spec/http_spec.rb
|
179
|
+
- spec/spec_helper.rb
|
94
180
|
- bin/anemone
|
95
|
-
has_rdoc: true
|
96
181
|
homepage: http://anemone.rubyforge.org
|
97
182
|
licenses: []
|
98
|
-
|
99
183
|
post_install_message:
|
100
|
-
rdoc_options:
|
184
|
+
rdoc_options:
|
101
185
|
- -m
|
102
186
|
- README.rdoc
|
103
187
|
- -t
|
104
188
|
- Anemone
|
105
|
-
require_paths:
|
189
|
+
require_paths:
|
106
190
|
- lib
|
107
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
191
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
192
|
none: false
|
109
|
-
requirements:
|
110
|
-
- -
|
111
|
-
- !ruby/object:Gem::Version
|
112
|
-
|
113
|
-
|
114
|
-
version: "0"
|
115
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
193
|
+
requirements:
|
194
|
+
- - ! '>='
|
195
|
+
- !ruby/object:Gem::Version
|
196
|
+
version: '0'
|
197
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
116
198
|
none: false
|
117
|
-
requirements:
|
118
|
-
- -
|
119
|
-
- !ruby/object:Gem::Version
|
120
|
-
|
121
|
-
- 0
|
122
|
-
version: "0"
|
199
|
+
requirements:
|
200
|
+
- - ! '>='
|
201
|
+
- !ruby/object:Gem::Version
|
202
|
+
version: '0'
|
123
203
|
requirements: []
|
124
|
-
|
125
204
|
rubyforge_project: anemone
|
126
|
-
rubygems_version: 1.
|
205
|
+
rubygems_version: 1.8.15
|
127
206
|
signing_key:
|
128
207
|
specification_version: 3
|
129
208
|
summary: Anemone web-spider framework
|
130
|
-
test_files:
|
131
|
-
- spec/http_spec.rb
|
132
|
-
- spec/page_store_spec.rb
|
133
|
-
- spec/core_spec.rb
|
209
|
+
test_files:
|
134
210
|
- spec/fakeweb_helper.rb
|
135
211
|
- spec/page_spec.rb
|
136
|
-
- spec/cookie_store_spec.rb
|
137
212
|
- spec/anemone_spec.rb
|
138
|
-
- spec/
|
213
|
+
- spec/core_spec.rb
|
139
214
|
- spec/storage_spec.rb
|
215
|
+
- spec/page_store_spec.rb
|
216
|
+
- spec/cookie_store_spec.rb
|
217
|
+
- spec/http_spec.rb
|
218
|
+
- spec/spec_helper.rb
|