anemone 0.2.3 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +11 -1
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +58 -66
- data/lib/anemone/http.rb +39 -28
- data/lib/anemone/page.rb +53 -59
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +9 -3
data/spec/http_spec.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
describe HTTP do
|
5
|
+
|
6
|
+
describe "fetch_page" do
|
7
|
+
before(:each) do
|
8
|
+
FakeWeb.clean_registry
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should still return a Page if an exception occurs during the HTTP connection" do
|
12
|
+
class HTTP
|
13
|
+
def refresh_connection
|
14
|
+
raise "test exception"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
http = HTTP.new
|
19
|
+
http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/spec/page_spec.rb
CHANGED
@@ -3,43 +3,57 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
3
3
|
module Anemone
|
4
4
|
describe Page do
|
5
5
|
|
6
|
-
before(:
|
6
|
+
before(:each) do
|
7
|
+
FakeWeb.clean_registry
|
7
8
|
@http = Anemone::HTTP.new
|
9
|
+
@page = @http.fetch_page(FakePage.new('home').url)
|
8
10
|
end
|
9
11
|
|
10
|
-
|
11
|
-
@page
|
12
|
+
it "should indicate whether it successfully fetched via HTTP" do
|
13
|
+
@page.should respond_to(:fetched?)
|
14
|
+
@page.fetched?.should == true
|
15
|
+
|
16
|
+
fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
|
17
|
+
fail_page.fetched?.should == false
|
12
18
|
end
|
13
|
-
|
19
|
+
|
20
|
+
it "should record any error that occurs during fetch_page" do
|
21
|
+
@page.should respond_to(:error)
|
22
|
+
@page.error.should be_nil
|
23
|
+
|
24
|
+
fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
|
25
|
+
fail_page.error.should_not be_nil
|
26
|
+
end
|
27
|
+
|
14
28
|
it "should store the response headers when fetching a page" do
|
15
29
|
@page.headers.should_not be_nil
|
16
30
|
@page.headers.should have_key('content-type')
|
17
31
|
end
|
18
|
-
|
32
|
+
|
19
33
|
it "should have an OpenStruct attribute for the developer to store data in" do
|
20
34
|
@page.data.should_not be_nil
|
21
35
|
@page.data.should be_an_instance_of(OpenStruct)
|
22
|
-
|
36
|
+
|
23
37
|
@page.data.test = 'test'
|
24
38
|
@page.data.test.should == 'test'
|
25
39
|
end
|
26
|
-
|
40
|
+
|
27
41
|
it "should have a Nokogori::HTML::Document attribute for the page body" do
|
28
42
|
@page.doc.should_not be_nil
|
29
43
|
@page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
|
30
44
|
end
|
31
|
-
|
45
|
+
|
32
46
|
it "should indicate whether it was fetched after an HTTP redirect" do
|
33
47
|
@page.should respond_to(:redirect?)
|
34
|
-
|
48
|
+
|
35
49
|
@page.redirect?.should == false
|
36
|
-
|
37
|
-
@http.
|
50
|
+
|
51
|
+
@http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
|
38
52
|
end
|
39
|
-
|
53
|
+
|
40
54
|
it "should have a method to tell if a URI is in the same domain as the page" do
|
41
55
|
@page.should respond_to(:in_domain?)
|
42
|
-
|
56
|
+
|
43
57
|
@page.in_domain?(URI(FakePage.new('test').url)).should == true
|
44
58
|
@page.in_domain?(URI('http://www.other.com/')).should == false
|
45
59
|
end
|
@@ -47,6 +61,6 @@ module Anemone
|
|
47
61
|
it "should include the response time for the HTTP request" do
|
48
62
|
@page.should respond_to(:response_time)
|
49
63
|
end
|
50
|
-
|
64
|
+
|
51
65
|
end
|
52
66
|
end
|
data/spec/page_store_spec.rb
ADDED
@@ -0,0 +1,128 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
describe PageStore do
|
6
|
+
|
7
|
+
before(:all) do
|
8
|
+
FakeWeb.clean_registry
|
9
|
+
end
|
10
|
+
|
11
|
+
shared_examples_for "page storage" do
|
12
|
+
it "should be able to computer single-source shortest paths in-place" do
|
13
|
+
pages = []
|
14
|
+
pages << FakePage.new('0', :links => ['1', '3'])
|
15
|
+
pages << FakePage.new('1', :redirect => '2')
|
16
|
+
pages << FakePage.new('2', :links => ['4'])
|
17
|
+
pages << FakePage.new('3')
|
18
|
+
pages << FakePage.new('4')
|
19
|
+
|
20
|
+
# crawl, then set depths to nil
|
21
|
+
page_store = Anemone.crawl(pages.first.url, @opts) do |a|
|
22
|
+
a.after_crawl do |ps|
|
23
|
+
ps.each { |url, page| page.depth = nil; ps[url] = page }
|
24
|
+
end
|
25
|
+
end.pages
|
26
|
+
|
27
|
+
page_store.should respond_to(:shortest_paths!)
|
28
|
+
|
29
|
+
page_store.shortest_paths!(pages[0].url)
|
30
|
+
page_store[pages[0].url].depth.should == 0
|
31
|
+
page_store[pages[1].url].depth.should == 1
|
32
|
+
page_store[pages[2].url].depth.should == 1
|
33
|
+
page_store[pages[3].url].depth.should == 1
|
34
|
+
page_store[pages[4].url].depth.should == 2
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should be able to remove all redirects in-place" do
|
38
|
+
pages = []
|
39
|
+
pages << FakePage.new('0', :links => ['1'])
|
40
|
+
pages << FakePage.new('1', :redirect => '2')
|
41
|
+
pages << FakePage.new('2')
|
42
|
+
|
43
|
+
page_store = Anemone.crawl(pages[0].url, @opts).pages
|
44
|
+
|
45
|
+
page_store.should respond_to(:uniq!)
|
46
|
+
|
47
|
+
page_store.uniq!
|
48
|
+
page_store.has_key?(pages[1].url).should == false
|
49
|
+
page_store.has_key?(pages[0].url).should == true
|
50
|
+
page_store.has_key?(pages[2].url).should == true
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should be able to find pages linking to a url" do
|
54
|
+
pages = []
|
55
|
+
pages << FakePage.new('0', :links => ['1'])
|
56
|
+
pages << FakePage.new('1', :redirect => '2')
|
57
|
+
pages << FakePage.new('2')
|
58
|
+
|
59
|
+
page_store = Anemone.crawl(pages[0].url, @opts).pages
|
60
|
+
|
61
|
+
page_store.should respond_to(:pages_linking_to)
|
62
|
+
|
63
|
+
page_store.pages_linking_to(pages[2].url).size.should == 0
|
64
|
+
links_to_1 = page_store.pages_linking_to(pages[1].url)
|
65
|
+
links_to_1.size.should == 1
|
66
|
+
links_to_1.first.should be_an_instance_of(Page)
|
67
|
+
links_to_1.first.url.to_s.should == pages[0].url
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should be able to find urls linking to a url" do
|
71
|
+
pages = []
|
72
|
+
pages << FakePage.new('0', :links => ['1'])
|
73
|
+
pages << FakePage.new('1', :redirect => '2')
|
74
|
+
pages << FakePage.new('2')
|
75
|
+
|
76
|
+
page_store = Anemone.crawl(pages[0].url, @opts).pages
|
77
|
+
|
78
|
+
page_store.should respond_to(:pages_linking_to)
|
79
|
+
|
80
|
+
page_store.urls_linking_to(pages[2].url).size.should == 0
|
81
|
+
links_to_1 = page_store.urls_linking_to(pages[1].url)
|
82
|
+
links_to_1.size.should == 1
|
83
|
+
links_to_1.first.to_s.should == pages[0].url
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe Hash do
|
88
|
+
it_should_behave_like "page storage"
|
89
|
+
|
90
|
+
before(:all) do
|
91
|
+
@opts = {}
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe Storage::PStore do
|
96
|
+
it_should_behave_like "page storage"
|
97
|
+
|
98
|
+
before(:each) do
|
99
|
+
@test_file = 'test.pstore'
|
100
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
101
|
+
@opts = {:storage => Storage.PStore(@test_file)}
|
102
|
+
end
|
103
|
+
|
104
|
+
after(:all) do
|
105
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
describe Storage::TokyoCabinet do
|
110
|
+
it_should_behave_like "page storage"
|
111
|
+
|
112
|
+
before(:each) do
|
113
|
+
@test_file = 'test.tch'
|
114
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
115
|
+
@opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
|
116
|
+
end
|
117
|
+
|
118
|
+
after(:each) do
|
119
|
+
@store.close
|
120
|
+
end
|
121
|
+
|
122
|
+
after(:all) do
|
123
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
end
|
data/spec/storage_spec.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
describe Storage do
|
6
|
+
|
7
|
+
it "should have a class method to produce a Hash" do
|
8
|
+
Anemone::Storage.should respond_to(:Hash)
|
9
|
+
Anemone::Storage.Hash.should be_an_instance_of(Hash)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should have a class method to produce a PStore" do
|
13
|
+
test_file = 'test.pstore'
|
14
|
+
Anemone::Storage.should respond_to(:PStore)
|
15
|
+
Anemone::Storage.PStore(test_file).should be_an_instance_of(Anemone::Storage::PStore)
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should have a class method to produce a TokyoCabinet" do
|
19
|
+
test_file = 'test.tch'
|
20
|
+
Anemone::Storage.should respond_to(:TokyoCabinet)
|
21
|
+
store = Anemone::Storage.TokyoCabinet(test_file)
|
22
|
+
store.should be_an_instance_of(Anemone::Storage::TokyoCabinet)
|
23
|
+
store.close
|
24
|
+
end
|
25
|
+
|
26
|
+
module Storage
|
27
|
+
shared_examples_for "storage engine" do
|
28
|
+
it "should implement [] and []=" do
|
29
|
+
@store.should respond_to(:[])
|
30
|
+
@store.should respond_to(:[]=)
|
31
|
+
|
32
|
+
@store['index'] = 'test'
|
33
|
+
@store['index'].should == 'test'
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should implement has_key?" do
|
37
|
+
@store.should respond_to(:has_key?)
|
38
|
+
|
39
|
+
@store['index'] = 'test'
|
40
|
+
@store.has_key?('index').should == true
|
41
|
+
|
42
|
+
@store.has_key?('missing').should == false
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should implement delete" do
|
46
|
+
@store.should respond_to(:delete)
|
47
|
+
|
48
|
+
@store['index'] = 'test'
|
49
|
+
@store.delete('index').should == 'test'
|
50
|
+
@store.has_key?('index').should == false
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should implement keys" do
|
54
|
+
@store.should respond_to(:keys)
|
55
|
+
|
56
|
+
keys = ['a', 'b', 'c']
|
57
|
+
keys.each { |key| @store[key] = key }
|
58
|
+
|
59
|
+
@store.keys.should == keys
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should implement each" do
|
63
|
+
@store.should respond_to(:each)
|
64
|
+
|
65
|
+
keys = ['a', 'b', 'c']
|
66
|
+
keys.each { |key| @store[key] = key }
|
67
|
+
|
68
|
+
result = {}
|
69
|
+
@store.each { |k, v| result[k] = v }
|
70
|
+
result.values.should == keys
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should implement merge!, and return self" do
|
74
|
+
@store.should respond_to(:merge!)
|
75
|
+
|
76
|
+
hash = {'a' => 'a', 'b' => 'b', 'c' => 'c'}
|
77
|
+
merged = @store.merge! hash
|
78
|
+
hash.each { |key, value| @store[key].should == value }
|
79
|
+
|
80
|
+
merged.should === @store
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
describe PStore do
|
85
|
+
it_should_behave_like "storage engine"
|
86
|
+
|
87
|
+
before(:each) do
|
88
|
+
@test_file = 'test.pstore'
|
89
|
+
File.delete @test_file rescue nil
|
90
|
+
@store = Anemone::Storage.PStore(@test_file)
|
91
|
+
end
|
92
|
+
|
93
|
+
after(:all) do
|
94
|
+
File.delete @test_file rescue nil
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe TokyoCabinet do
|
99
|
+
it_should_behave_like "storage engine"
|
100
|
+
|
101
|
+
before(:each) do
|
102
|
+
@test_file = 'test.tch'
|
103
|
+
File.delete @test_file rescue nil
|
104
|
+
@store = Anemone::Storage.TokyoCabinet(@test_file)
|
105
|
+
end
|
106
|
+
|
107
|
+
after(:each) do
|
108
|
+
@store.close
|
109
|
+
end
|
110
|
+
|
111
|
+
after(:all) do
|
112
|
+
File.delete @test_file rescue nil
|
113
|
+
end
|
114
|
+
|
115
|
+
it "should raise an error if supplied with a file extension other than .tch" do
|
116
|
+
lambda { Anemone::Storage.TokyoCabinet('test.tmp') }.should raise_error(RuntimeError)
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-12-16 00:00:00 -06:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -49,8 +49,11 @@ files:
|
|
49
49
|
- lib/anemone/core.rb
|
50
50
|
- lib/anemone/http.rb
|
51
51
|
- lib/anemone/page.rb
|
52
|
-
- lib/anemone/page_hash.rb
|
52
|
+
- lib/anemone/page_store.rb
|
53
53
|
- lib/anemone/tentacle.rb
|
54
|
+
- lib/anemone/storage.rb
|
55
|
+
- lib/anemone/storage/pstore.rb
|
56
|
+
- lib/anemone/storage/tokyo_cabinet.rb
|
54
57
|
- lib/anemone/cli.rb
|
55
58
|
- lib/anemone/cli/url_list.rb
|
56
59
|
- lib/anemone/cli/cron.rb
|
@@ -92,5 +95,8 @@ test_files:
|
|
92
95
|
- spec/anemone_spec.rb
|
93
96
|
- spec/core_spec.rb
|
94
97
|
- spec/page_spec.rb
|
98
|
+
- spec/page_store_spec.rb
|
99
|
+
- spec/http_spec.rb
|
100
|
+
- spec/storage_spec.rb
|
95
101
|
- spec/fakeweb_helper.rb
|
96
102
|
- spec/spec_helper.rb
|