spk-anemone 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/spec/http_spec.rb ADDED
@@ -0,0 +1,23 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe HTTP do
5
+
6
+ describe "fetch_page" do
7
+ before(:each) do
8
+ FakeWeb.clean_registry
9
+ end
10
+
11
+ it "should still return a Page if an exception occurs during the HTTP connection" do
12
+ class HTTP
13
+ def refresh_connection
14
+ raise "test exception"
15
+ end
16
+ end
17
+
18
+ http = HTTP.new
19
+ http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
20
+ end
21
+ end
22
+ end
23
+ end
data/spec/page_spec.rb CHANGED
@@ -3,43 +3,57 @@ require File.dirname(__FILE__) + '/spec_helper'
3
3
  module Anemone
4
4
  describe Page do
5
5
 
6
- before(:all) do
6
+ before(:each) do
7
+ FakeWeb.clean_registry
7
8
  @http = Anemone::HTTP.new
9
+ @page = @http.fetch_page(FakePage.new('home').url)
8
10
  end
9
11
 
10
- before(:each) do
11
- @page = @http.fetch_page(FakePage.new('home').url)
12
+ it "should indicate whether it successfully fetched via HTTP" do
13
+ @page.should respond_to(:fetched?)
14
+ @page.fetched?.should == true
15
+
16
+ fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
17
+ fail_page.fetched?.should == false
12
18
  end
13
-
19
+
20
+ it "should record any error that occurs during fetch_page" do
21
+ @page.should respond_to(:error)
22
+ @page.error.should be_nil
23
+
24
+ fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
25
+ fail_page.error.should_not be_nil
26
+ end
27
+
14
28
  it "should store the response headers when fetching a page" do
15
29
  @page.headers.should_not be_nil
16
30
  @page.headers.should have_key('content-type')
17
31
  end
18
-
32
+
19
33
  it "should have an OpenStruct attribute for the developer to store data in" do
20
34
  @page.data.should_not be_nil
21
35
  @page.data.should be_an_instance_of(OpenStruct)
22
-
36
+
23
37
  @page.data.test = 'test'
24
38
  @page.data.test.should == 'test'
25
39
  end
26
-
40
+
27
41
  it "should have a Nokogori::HTML::Document attribute for the page body" do
28
42
  @page.doc.should_not be_nil
29
43
  @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
30
44
  end
31
-
45
+
32
46
  it "should indicate whether it was fetched after an HTTP redirect" do
33
47
  @page.should respond_to(:redirect?)
34
-
48
+
35
49
  @page.redirect?.should == false
36
-
37
- @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
50
+
51
+ @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
38
52
  end
39
-
53
+
40
54
  it "should have a method to tell if a URI is in the same domain as the page" do
41
55
  @page.should respond_to(:in_domain?)
42
-
56
+
43
57
  @page.in_domain?(URI(FakePage.new('test').url)).should == true
44
58
  @page.in_domain?(URI('http://www.other.com/')).should == false
45
59
  end
@@ -47,6 +61,6 @@ module Anemone
47
61
  it "should include the response time for the HTTP request" do
48
62
  @page.should respond_to(:response_time)
49
63
  end
50
-
64
+
51
65
  end
52
66
  end
data/spec/page_store_spec.rb ADDED
@@ -0,0 +1,128 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+ %w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
3
+
4
+ module Anemone
5
+ describe PageStore do
6
+
7
+ before(:all) do
8
+ FakeWeb.clean_registry
9
+ end
10
+
11
+ shared_examples_for "page storage" do
12
+ it "should be able to computer single-source shortest paths in-place" do
13
+ pages = []
14
+ pages << FakePage.new('0', :links => ['1', '3'])
15
+ pages << FakePage.new('1', :redirect => '2')
16
+ pages << FakePage.new('2', :links => ['4'])
17
+ pages << FakePage.new('3')
18
+ pages << FakePage.new('4')
19
+
20
+ # crawl, then set depths to nil
21
+ page_store = Anemone.crawl(pages.first.url, @opts) do |a|
22
+ a.after_crawl do |ps|
23
+ ps.each { |url, page| page.depth = nil; ps[url] = page }
24
+ end
25
+ end.pages
26
+
27
+ page_store.should respond_to(:shortest_paths!)
28
+
29
+ page_store.shortest_paths!(pages[0].url)
30
+ page_store[pages[0].url].depth.should == 0
31
+ page_store[pages[1].url].depth.should == 1
32
+ page_store[pages[2].url].depth.should == 1
33
+ page_store[pages[3].url].depth.should == 1
34
+ page_store[pages[4].url].depth.should == 2
35
+ end
36
+
37
+ it "should be able to remove all redirects in-place" do
38
+ pages = []
39
+ pages << FakePage.new('0', :links => ['1'])
40
+ pages << FakePage.new('1', :redirect => '2')
41
+ pages << FakePage.new('2')
42
+
43
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
44
+
45
+ page_store.should respond_to(:uniq!)
46
+
47
+ page_store.uniq!
48
+ page_store.has_key?(pages[1].url).should == false
49
+ page_store.has_key?(pages[0].url).should == true
50
+ page_store.has_key?(pages[2].url).should == true
51
+ end
52
+
53
+ it "should be able to find pages linking to a url" do
54
+ pages = []
55
+ pages << FakePage.new('0', :links => ['1'])
56
+ pages << FakePage.new('1', :redirect => '2')
57
+ pages << FakePage.new('2')
58
+
59
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
60
+
61
+ page_store.should respond_to(:pages_linking_to)
62
+
63
+ page_store.pages_linking_to(pages[2].url).size.should == 0
64
+ links_to_1 = page_store.pages_linking_to(pages[1].url)
65
+ links_to_1.size.should == 1
66
+ links_to_1.first.should be_an_instance_of(Page)
67
+ links_to_1.first.url.to_s.should == pages[0].url
68
+ end
69
+
70
+ it "should be able to find urls linking to a url" do
71
+ pages = []
72
+ pages << FakePage.new('0', :links => ['1'])
73
+ pages << FakePage.new('1', :redirect => '2')
74
+ pages << FakePage.new('2')
75
+
76
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
77
+
78
+ page_store.should respond_to(:pages_linking_to)
79
+
80
+ page_store.urls_linking_to(pages[2].url).size.should == 0
81
+ links_to_1 = page_store.urls_linking_to(pages[1].url)
82
+ links_to_1.size.should == 1
83
+ links_to_1.first.to_s.should == pages[0].url
84
+ end
85
+ end
86
+
87
+ describe Hash do
88
+ it_should_behave_like "page storage"
89
+
90
+ before(:all) do
91
+ @opts = {}
92
+ end
93
+ end
94
+
95
+ describe Storage::PStore do
96
+ it_should_behave_like "page storage"
97
+
98
+ before(:each) do
99
+ @test_file = 'test.pstore'
100
+ File.delete(@test_file) if File.exists?(@test_file)
101
+ @opts = {:storage => Storage.PStore(@test_file)}
102
+ end
103
+
104
+ after(:all) do
105
+ File.delete(@test_file) if File.exists?(@test_file)
106
+ end
107
+ end
108
+
109
+ describe Storage::TokyoCabinet do
110
+ it_should_behave_like "page storage"
111
+
112
+ before(:each) do
113
+ @test_file = 'test.tch'
114
+ File.delete(@test_file) if File.exists?(@test_file)
115
+ @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
116
+ end
117
+
118
+ after(:each) do
119
+ @store.close
120
+ end
121
+
122
+ after(:all) do
123
+ File.delete(@test_file) if File.exists?(@test_file)
124
+ end
125
+ end
126
+
127
+ end
128
+ end
data/spec/storage_spec.rb ADDED
@@ -0,0 +1,123 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+ %w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
3
+
4
+ module Anemone
5
+ describe Storage do
6
+
7
+ it "should have a class method to produce a Hash" do
8
+ Anemone::Storage.should respond_to(:Hash)
9
+ Anemone::Storage.Hash.should be_an_instance_of(Hash)
10
+ end
11
+
12
+ it "should have a class method to produce a PStore" do
13
+ test_file = 'test.pstore'
14
+ Anemone::Storage.should respond_to(:PStore)
15
+ Anemone::Storage.PStore(test_file).should be_an_instance_of(Anemone::Storage::PStore)
16
+ end
17
+
18
+ it "should have a class method to produce a TokyoCabinet" do
19
+ test_file = 'test.tch'
20
+ Anemone::Storage.should respond_to(:TokyoCabinet)
21
+ store = Anemone::Storage.TokyoCabinet(test_file)
22
+ store.should be_an_instance_of(Anemone::Storage::TokyoCabinet)
23
+ store.close
24
+ end
25
+
26
+ module Storage
27
+ shared_examples_for "storage engine" do
28
+ it "should implement [] and []=" do
29
+ @store.should respond_to(:[])
30
+ @store.should respond_to(:[]=)
31
+
32
+ @store['index'] = 'test'
33
+ @store['index'].should == 'test'
34
+ end
35
+
36
+ it "should implement has_key?" do
37
+ @store.should respond_to(:has_key?)
38
+
39
+ @store['index'] = 'test'
40
+ @store.has_key?('index').should == true
41
+
42
+ @store.has_key?('missing').should == false
43
+ end
44
+
45
+ it "should implement delete" do
46
+ @store.should respond_to(:delete)
47
+
48
+ @store['index'] = 'test'
49
+ @store.delete('index').should == 'test'
50
+ @store.has_key?('index').should == false
51
+ end
52
+
53
+ it "should implement keys" do
54
+ @store.should respond_to(:keys)
55
+
56
+ keys = ['a', 'b', 'c']
57
+ keys.each { |key| @store[key] = key }
58
+
59
+ @store.keys.should == keys
60
+ end
61
+
62
+ it "should implement each" do
63
+ @store.should respond_to(:each)
64
+
65
+ keys = ['a', 'b', 'c']
66
+ keys.each { |key| @store[key] = key }
67
+
68
+ result = {}
69
+ @store.each { |k, v| result[k] = v }
70
+ result.values.should == keys
71
+ end
72
+
73
+ it "should implement merge!, and return self" do
74
+ @store.should respond_to(:merge!)
75
+
76
+ hash = {'a' => 'a', 'b' => 'b', 'c' => 'c'}
77
+ merged = @store.merge! hash
78
+ hash.each { |key, value| @store[key].should == value }
79
+
80
+ merged.should === @store
81
+ end
82
+ end
83
+
84
+ describe PStore do
85
+ it_should_behave_like "storage engine"
86
+
87
+ before(:each) do
88
+ @test_file = 'test.pstore'
89
+ File.delete @test_file rescue nil
90
+ @store = Anemone::Storage.PStore(@test_file)
91
+ end
92
+
93
+ after(:all) do
94
+ File.delete @test_file rescue nil
95
+ end
96
+ end
97
+
98
+ describe TokyoCabinet do
99
+ it_should_behave_like "storage engine"
100
+
101
+ before(:each) do
102
+ @test_file = 'test.tch'
103
+ File.delete @test_file rescue nil
104
+ @store = Anemone::Storage.TokyoCabinet(@test_file)
105
+ end
106
+
107
+ after(:each) do
108
+ @store.close
109
+ end
110
+
111
+ after(:all) do
112
+ File.delete @test_file rescue nil
113
+ end
114
+
115
+ it "should raise an error if supplied with a file extension other than .tch" do
116
+ lambda { Anemone::Storage.TokyoCabinet('test.tmp') }.should raise_error(RuntimeError)
117
+ end
118
+
119
+ end
120
+
121
+ end
122
+ end
123
+ end
metadata CHANGED
@@ -1,16 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spk-anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
8
- - spk
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
11
 
13
- date: 2009-11-26 00:00:00 +01:00
12
+ date: 2010-01-03 00:00:00 +01:00
14
13
  default_executable:
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
@@ -21,7 +20,7 @@ dependencies:
21
20
  requirements:
22
21
  - - ">="
23
22
  - !ruby/object:Gem::Version
24
- version: 1.4.0
23
+ version: 1.4.1
25
24
  version:
26
25
  - !ruby/object:Gem::Dependency
27
26
  name: robots
@@ -50,8 +49,11 @@ files:
50
49
  - lib/anemone/core.rb
51
50
  - lib/anemone/http.rb
52
51
  - lib/anemone/page.rb
53
- - lib/anemone/page_hash.rb
52
+ - lib/anemone/page_store.rb
54
53
  - lib/anemone/tentacle.rb
54
+ - lib/anemone/storage.rb
55
+ - lib/anemone/storage/pstore.rb
56
+ - lib/anemone/storage/tokyo_cabinet.rb
55
57
  - lib/anemone/cli.rb
56
58
  - lib/anemone/cli/url_list.rb
57
59
  - lib/anemone/cli/cron.rb
@@ -93,5 +95,8 @@ test_files:
93
95
  - spec/anemone_spec.rb
94
96
  - spec/core_spec.rb
95
97
  - spec/page_spec.rb
98
+ - spec/page_store_spec.rb
99
+ - spec/http_spec.rb
100
+ - spec/storage_spec.rb
96
101
  - spec/fakeweb_helper.rb
97
102
  - spec/spec_helper.rb