spk-anemone 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +10 -0
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +43 -53
- data/lib/anemone/http.rb +32 -21
- data/lib/anemone/page.rb +43 -50
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +10 -5
data/spec/http_spec.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
|
3
|
+
module Anemone
|
4
|
+
describe HTTP do
|
5
|
+
|
6
|
+
describe "fetch_page" do
|
7
|
+
before(:each) do
|
8
|
+
FakeWeb.clean_registry
|
9
|
+
end
|
10
|
+
|
11
|
+
it "should still return a Page if an exception occurs during the HTTP connection" do
|
12
|
+
class HTTP
|
13
|
+
def refresh_connection
|
14
|
+
raise "test exception"
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
http = HTTP.new
|
19
|
+
http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/spec/page_spec.rb
CHANGED
@@ -3,43 +3,57 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
3
3
|
module Anemone
|
4
4
|
describe Page do
|
5
5
|
|
6
|
-
before(:
|
6
|
+
before(:each) do
|
7
|
+
FakeWeb.clean_registry
|
7
8
|
@http = Anemone::HTTP.new
|
9
|
+
@page = @http.fetch_page(FakePage.new('home').url)
|
8
10
|
end
|
9
11
|
|
10
|
-
|
11
|
-
@page
|
12
|
+
it "should indicate whether it successfully fetched via HTTP" do
|
13
|
+
@page.should respond_to(:fetched?)
|
14
|
+
@page.fetched?.should == true
|
15
|
+
|
16
|
+
fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
|
17
|
+
fail_page.fetched?.should == false
|
12
18
|
end
|
13
|
-
|
19
|
+
|
20
|
+
it "should record any error that occurs during fetch_page" do
|
21
|
+
@page.should respond_to(:error)
|
22
|
+
@page.error.should be_nil
|
23
|
+
|
24
|
+
fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
|
25
|
+
fail_page.error.should_not be_nil
|
26
|
+
end
|
27
|
+
|
14
28
|
it "should store the response headers when fetching a page" do
|
15
29
|
@page.headers.should_not be_nil
|
16
30
|
@page.headers.should have_key('content-type')
|
17
31
|
end
|
18
|
-
|
32
|
+
|
19
33
|
it "should have an OpenStruct attribute for the developer to store data in" do
|
20
34
|
@page.data.should_not be_nil
|
21
35
|
@page.data.should be_an_instance_of(OpenStruct)
|
22
|
-
|
36
|
+
|
23
37
|
@page.data.test = 'test'
|
24
38
|
@page.data.test.should == 'test'
|
25
39
|
end
|
26
|
-
|
40
|
+
|
27
41
|
it "should have a Nokogori::HTML::Document attribute for the page body" do
|
28
42
|
@page.doc.should_not be_nil
|
29
43
|
@page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
|
30
44
|
end
|
31
|
-
|
45
|
+
|
32
46
|
it "should indicate whether it was fetched after an HTTP redirect" do
|
33
47
|
@page.should respond_to(:redirect?)
|
34
|
-
|
48
|
+
|
35
49
|
@page.redirect?.should == false
|
36
|
-
|
37
|
-
@http.
|
50
|
+
|
51
|
+
@http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
|
38
52
|
end
|
39
|
-
|
53
|
+
|
40
54
|
it "should have a method to tell if a URI is in the same domain as the page" do
|
41
55
|
@page.should respond_to(:in_domain?)
|
42
|
-
|
56
|
+
|
43
57
|
@page.in_domain?(URI(FakePage.new('test').url)).should == true
|
44
58
|
@page.in_domain?(URI('http://www.other.com/')).should == false
|
45
59
|
end
|
@@ -47,6 +61,6 @@ module Anemone
|
|
47
61
|
it "should include the response time for the HTTP request" do
|
48
62
|
@page.should respond_to(:response_time)
|
49
63
|
end
|
50
|
-
|
64
|
+
|
51
65
|
end
|
52
66
|
end
|
@@ -0,0 +1,128 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
describe PageStore do
|
6
|
+
|
7
|
+
before(:all) do
|
8
|
+
FakeWeb.clean_registry
|
9
|
+
end
|
10
|
+
|
11
|
+
shared_examples_for "page storage" do
|
12
|
+
it "should be able to computer single-source shortest paths in-place" do
|
13
|
+
pages = []
|
14
|
+
pages << FakePage.new('0', :links => ['1', '3'])
|
15
|
+
pages << FakePage.new('1', :redirect => '2')
|
16
|
+
pages << FakePage.new('2', :links => ['4'])
|
17
|
+
pages << FakePage.new('3')
|
18
|
+
pages << FakePage.new('4')
|
19
|
+
|
20
|
+
# crawl, then set depths to nil
|
21
|
+
page_store = Anemone.crawl(pages.first.url, @opts) do |a|
|
22
|
+
a.after_crawl do |ps|
|
23
|
+
ps.each { |url, page| page.depth = nil; ps[url] = page }
|
24
|
+
end
|
25
|
+
end.pages
|
26
|
+
|
27
|
+
page_store.should respond_to(:shortest_paths!)
|
28
|
+
|
29
|
+
page_store.shortest_paths!(pages[0].url)
|
30
|
+
page_store[pages[0].url].depth.should == 0
|
31
|
+
page_store[pages[1].url].depth.should == 1
|
32
|
+
page_store[pages[2].url].depth.should == 1
|
33
|
+
page_store[pages[3].url].depth.should == 1
|
34
|
+
page_store[pages[4].url].depth.should == 2
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should be able to remove all redirects in-place" do
|
38
|
+
pages = []
|
39
|
+
pages << FakePage.new('0', :links => ['1'])
|
40
|
+
pages << FakePage.new('1', :redirect => '2')
|
41
|
+
pages << FakePage.new('2')
|
42
|
+
|
43
|
+
page_store = Anemone.crawl(pages[0].url, @opts).pages
|
44
|
+
|
45
|
+
page_store.should respond_to(:uniq!)
|
46
|
+
|
47
|
+
page_store.uniq!
|
48
|
+
page_store.has_key?(pages[1].url).should == false
|
49
|
+
page_store.has_key?(pages[0].url).should == true
|
50
|
+
page_store.has_key?(pages[2].url).should == true
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should be able to find pages linking to a url" do
|
54
|
+
pages = []
|
55
|
+
pages << FakePage.new('0', :links => ['1'])
|
56
|
+
pages << FakePage.new('1', :redirect => '2')
|
57
|
+
pages << FakePage.new('2')
|
58
|
+
|
59
|
+
page_store = Anemone.crawl(pages[0].url, @opts).pages
|
60
|
+
|
61
|
+
page_store.should respond_to(:pages_linking_to)
|
62
|
+
|
63
|
+
page_store.pages_linking_to(pages[2].url).size.should == 0
|
64
|
+
links_to_1 = page_store.pages_linking_to(pages[1].url)
|
65
|
+
links_to_1.size.should == 1
|
66
|
+
links_to_1.first.should be_an_instance_of(Page)
|
67
|
+
links_to_1.first.url.to_s.should == pages[0].url
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should be able to find urls linking to a url" do
|
71
|
+
pages = []
|
72
|
+
pages << FakePage.new('0', :links => ['1'])
|
73
|
+
pages << FakePage.new('1', :redirect => '2')
|
74
|
+
pages << FakePage.new('2')
|
75
|
+
|
76
|
+
page_store = Anemone.crawl(pages[0].url, @opts).pages
|
77
|
+
|
78
|
+
page_store.should respond_to(:pages_linking_to)
|
79
|
+
|
80
|
+
page_store.urls_linking_to(pages[2].url).size.should == 0
|
81
|
+
links_to_1 = page_store.urls_linking_to(pages[1].url)
|
82
|
+
links_to_1.size.should == 1
|
83
|
+
links_to_1.first.to_s.should == pages[0].url
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe Hash do
|
88
|
+
it_should_behave_like "page storage"
|
89
|
+
|
90
|
+
before(:all) do
|
91
|
+
@opts = {}
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
describe Storage::PStore do
|
96
|
+
it_should_behave_like "page storage"
|
97
|
+
|
98
|
+
before(:each) do
|
99
|
+
@test_file = 'test.pstore'
|
100
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
101
|
+
@opts = {:storage => Storage.PStore(@test_file)}
|
102
|
+
end
|
103
|
+
|
104
|
+
after(:all) do
|
105
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
describe Storage::TokyoCabinet do
|
110
|
+
it_should_behave_like "page storage"
|
111
|
+
|
112
|
+
before(:each) do
|
113
|
+
@test_file = 'test.tch'
|
114
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
115
|
+
@opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
|
116
|
+
end
|
117
|
+
|
118
|
+
after(:each) do
|
119
|
+
@store.close
|
120
|
+
end
|
121
|
+
|
122
|
+
after(:all) do
|
123
|
+
File.delete(@test_file) if File.exists?(@test_file)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|
128
|
+
end
|
@@ -0,0 +1,123 @@
|
|
1
|
+
require File.dirname(__FILE__) + '/spec_helper'
|
2
|
+
%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
|
3
|
+
|
4
|
+
module Anemone
|
5
|
+
describe Storage do
|
6
|
+
|
7
|
+
it "should have a class method to produce a Hash" do
|
8
|
+
Anemone::Storage.should respond_to(:Hash)
|
9
|
+
Anemone::Storage.Hash.should be_an_instance_of(Hash)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should have a class method to produce a PStore" do
|
13
|
+
test_file = 'test.pstore'
|
14
|
+
Anemone::Storage.should respond_to(:PStore)
|
15
|
+
Anemone::Storage.PStore(test_file).should be_an_instance_of(Anemone::Storage::PStore)
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should have a class method to produce a TokyoCabinet" do
|
19
|
+
test_file = 'test.tch'
|
20
|
+
Anemone::Storage.should respond_to(:TokyoCabinet)
|
21
|
+
store = Anemone::Storage.TokyoCabinet(test_file)
|
22
|
+
store.should be_an_instance_of(Anemone::Storage::TokyoCabinet)
|
23
|
+
store.close
|
24
|
+
end
|
25
|
+
|
26
|
+
module Storage
|
27
|
+
shared_examples_for "storage engine" do
|
28
|
+
it "should implement [] and []=" do
|
29
|
+
@store.should respond_to(:[])
|
30
|
+
@store.should respond_to(:[]=)
|
31
|
+
|
32
|
+
@store['index'] = 'test'
|
33
|
+
@store['index'].should == 'test'
|
34
|
+
end
|
35
|
+
|
36
|
+
it "should implement has_key?" do
|
37
|
+
@store.should respond_to(:has_key?)
|
38
|
+
|
39
|
+
@store['index'] = 'test'
|
40
|
+
@store.has_key?('index').should == true
|
41
|
+
|
42
|
+
@store.has_key?('missing').should == false
|
43
|
+
end
|
44
|
+
|
45
|
+
it "should implement delete" do
|
46
|
+
@store.should respond_to(:delete)
|
47
|
+
|
48
|
+
@store['index'] = 'test'
|
49
|
+
@store.delete('index').should == 'test'
|
50
|
+
@store.has_key?('index').should == false
|
51
|
+
end
|
52
|
+
|
53
|
+
it "should implement keys" do
|
54
|
+
@store.should respond_to(:keys)
|
55
|
+
|
56
|
+
keys = ['a', 'b', 'c']
|
57
|
+
keys.each { |key| @store[key] = key }
|
58
|
+
|
59
|
+
@store.keys.should == keys
|
60
|
+
end
|
61
|
+
|
62
|
+
it "should implement each" do
|
63
|
+
@store.should respond_to(:each)
|
64
|
+
|
65
|
+
keys = ['a', 'b', 'c']
|
66
|
+
keys.each { |key| @store[key] = key }
|
67
|
+
|
68
|
+
result = {}
|
69
|
+
@store.each { |k, v| result[k] = v }
|
70
|
+
result.values.should == keys
|
71
|
+
end
|
72
|
+
|
73
|
+
it "should implement merge!, and return self" do
|
74
|
+
@store.should respond_to(:merge!)
|
75
|
+
|
76
|
+
hash = {'a' => 'a', 'b' => 'b', 'c' => 'c'}
|
77
|
+
merged = @store.merge! hash
|
78
|
+
hash.each { |key, value| @store[key].should == value }
|
79
|
+
|
80
|
+
merged.should === @store
|
81
|
+
end
|
82
|
+
end
|
83
|
+
|
84
|
+
describe PStore do
|
85
|
+
it_should_behave_like "storage engine"
|
86
|
+
|
87
|
+
before(:each) do
|
88
|
+
@test_file = 'test.pstore'
|
89
|
+
File.delete @test_file rescue nil
|
90
|
+
@store = Anemone::Storage.PStore(@test_file)
|
91
|
+
end
|
92
|
+
|
93
|
+
after(:all) do
|
94
|
+
File.delete @test_file rescue nil
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe TokyoCabinet do
|
99
|
+
it_should_behave_like "storage engine"
|
100
|
+
|
101
|
+
before(:each) do
|
102
|
+
@test_file = 'test.tch'
|
103
|
+
File.delete @test_file rescue nil
|
104
|
+
@store = Anemone::Storage.TokyoCabinet(@test_file)
|
105
|
+
end
|
106
|
+
|
107
|
+
after(:each) do
|
108
|
+
@store.close
|
109
|
+
end
|
110
|
+
|
111
|
+
after(:all) do
|
112
|
+
File.delete @test_file rescue nil
|
113
|
+
end
|
114
|
+
|
115
|
+
it "should raise an error if supplied with a file extension other than .tch" do
|
116
|
+
lambda { Anemone::Storage.TokyoCabinet('test.tmp') }.should raise_error(RuntimeError)
|
117
|
+
end
|
118
|
+
|
119
|
+
end
|
120
|
+
|
121
|
+
end
|
122
|
+
end
|
123
|
+
end
|
metadata
CHANGED
@@ -1,16 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spk-anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
8
|
-
- spk
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
11
|
|
13
|
-
date:
|
12
|
+
date: 2010-01-03 00:00:00 +01:00
|
14
13
|
default_executable:
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
@@ -21,7 +20,7 @@ dependencies:
|
|
21
20
|
requirements:
|
22
21
|
- - ">="
|
23
22
|
- !ruby/object:Gem::Version
|
24
|
-
version: 1.4.
|
23
|
+
version: 1.4.1
|
25
24
|
version:
|
26
25
|
- !ruby/object:Gem::Dependency
|
27
26
|
name: robots
|
@@ -50,8 +49,11 @@ files:
|
|
50
49
|
- lib/anemone/core.rb
|
51
50
|
- lib/anemone/http.rb
|
52
51
|
- lib/anemone/page.rb
|
53
|
-
- lib/anemone/page_hash.rb
|
52
|
+
- lib/anemone/page_store.rb
|
54
53
|
- lib/anemone/tentacle.rb
|
54
|
+
- lib/anemone/storage.rb
|
55
|
+
- lib/anemone/storage/pstore.rb
|
56
|
+
- lib/anemone/storage/tokyo_cabinet.rb
|
55
57
|
- lib/anemone/cli.rb
|
56
58
|
- lib/anemone/cli/url_list.rb
|
57
59
|
- lib/anemone/cli/cron.rb
|
@@ -93,5 +95,8 @@ test_files:
|
|
93
95
|
- spec/anemone_spec.rb
|
94
96
|
- spec/core_spec.rb
|
95
97
|
- spec/page_spec.rb
|
98
|
+
- spec/page_store_spec.rb
|
99
|
+
- spec/http_spec.rb
|
100
|
+
- spec/storage_spec.rb
|
96
101
|
- spec/fakeweb_helper.rb
|
97
102
|
- spec/spec_helper.rb
|