spk-anemone 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/spec/http_spec.rb ADDED
@@ -0,0 +1,23 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+
3
+ module Anemone
4
+ describe HTTP do
5
+
6
+ describe "fetch_page" do
7
+ before(:each) do
8
+ FakeWeb.clean_registry
9
+ end
10
+
11
+ it "should still return a Page if an exception occurs during the HTTP connection" do
12
+ class HTTP
13
+ def refresh_connection
14
+ raise "test exception"
15
+ end
16
+ end
17
+
18
+ http = HTTP.new
19
+ http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
20
+ end
21
+ end
22
+ end
23
+ end
data/spec/page_spec.rb CHANGED
@@ -3,43 +3,57 @@ require File.dirname(__FILE__) + '/spec_helper'
3
3
  module Anemone
4
4
  describe Page do
5
5
 
6
- before(:all) do
6
+ before(:each) do
7
+ FakeWeb.clean_registry
7
8
  @http = Anemone::HTTP.new
9
+ @page = @http.fetch_page(FakePage.new('home').url)
8
10
  end
9
11
 
10
- before(:each) do
11
- @page = @http.fetch_page(FakePage.new('home').url)
12
+ it "should indicate whether it successfully fetched via HTTP" do
13
+ @page.should respond_to(:fetched?)
14
+ @page.fetched?.should == true
15
+
16
+ fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
17
+ fail_page.fetched?.should == false
12
18
  end
13
-
19
+
20
+ it "should record any error that occurs during fetch_page" do
21
+ @page.should respond_to(:error)
22
+ @page.error.should be_nil
23
+
24
+ fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
25
+ fail_page.error.should_not be_nil
26
+ end
27
+
14
28
  it "should store the response headers when fetching a page" do
15
29
  @page.headers.should_not be_nil
16
30
  @page.headers.should have_key('content-type')
17
31
  end
18
-
32
+
19
33
  it "should have an OpenStruct attribute for the developer to store data in" do
20
34
  @page.data.should_not be_nil
21
35
  @page.data.should be_an_instance_of(OpenStruct)
22
-
36
+
23
37
  @page.data.test = 'test'
24
38
  @page.data.test.should == 'test'
25
39
  end
26
-
40
+
27
41
  it "should have a Nokogori::HTML::Document attribute for the page body" do
28
42
  @page.doc.should_not be_nil
29
43
  @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
30
44
  end
31
-
45
+
32
46
  it "should indicate whether it was fetched after an HTTP redirect" do
33
47
  @page.should respond_to(:redirect?)
34
-
48
+
35
49
  @page.redirect?.should == false
36
-
37
- @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
50
+
51
+ @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
38
52
  end
39
-
53
+
40
54
  it "should have a method to tell if a URI is in the same domain as the page" do
41
55
  @page.should respond_to(:in_domain?)
42
-
56
+
43
57
  @page.in_domain?(URI(FakePage.new('test').url)).should == true
44
58
  @page.in_domain?(URI('http://www.other.com/')).should == false
45
59
  end
@@ -47,6 +61,6 @@ module Anemone
47
61
  it "should include the response time for the HTTP request" do
48
62
  @page.should respond_to(:response_time)
49
63
  end
50
-
64
+
51
65
  end
52
66
  end
@@ -0,0 +1,128 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+ %w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
3
+
4
+ module Anemone
5
+ describe PageStore do
6
+
7
+ before(:all) do
8
+ FakeWeb.clean_registry
9
+ end
10
+
11
+ shared_examples_for "page storage" do
12
+ it "should be able to computer single-source shortest paths in-place" do
13
+ pages = []
14
+ pages << FakePage.new('0', :links => ['1', '3'])
15
+ pages << FakePage.new('1', :redirect => '2')
16
+ pages << FakePage.new('2', :links => ['4'])
17
+ pages << FakePage.new('3')
18
+ pages << FakePage.new('4')
19
+
20
+ # crawl, then set depths to nil
21
+ page_store = Anemone.crawl(pages.first.url, @opts) do |a|
22
+ a.after_crawl do |ps|
23
+ ps.each { |url, page| page.depth = nil; ps[url] = page }
24
+ end
25
+ end.pages
26
+
27
+ page_store.should respond_to(:shortest_paths!)
28
+
29
+ page_store.shortest_paths!(pages[0].url)
30
+ page_store[pages[0].url].depth.should == 0
31
+ page_store[pages[1].url].depth.should == 1
32
+ page_store[pages[2].url].depth.should == 1
33
+ page_store[pages[3].url].depth.should == 1
34
+ page_store[pages[4].url].depth.should == 2
35
+ end
36
+
37
+ it "should be able to remove all redirects in-place" do
38
+ pages = []
39
+ pages << FakePage.new('0', :links => ['1'])
40
+ pages << FakePage.new('1', :redirect => '2')
41
+ pages << FakePage.new('2')
42
+
43
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
44
+
45
+ page_store.should respond_to(:uniq!)
46
+
47
+ page_store.uniq!
48
+ page_store.has_key?(pages[1].url).should == false
49
+ page_store.has_key?(pages[0].url).should == true
50
+ page_store.has_key?(pages[2].url).should == true
51
+ end
52
+
53
+ it "should be able to find pages linking to a url" do
54
+ pages = []
55
+ pages << FakePage.new('0', :links => ['1'])
56
+ pages << FakePage.new('1', :redirect => '2')
57
+ pages << FakePage.new('2')
58
+
59
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
60
+
61
+ page_store.should respond_to(:pages_linking_to)
62
+
63
+ page_store.pages_linking_to(pages[2].url).size.should == 0
64
+ links_to_1 = page_store.pages_linking_to(pages[1].url)
65
+ links_to_1.size.should == 1
66
+ links_to_1.first.should be_an_instance_of(Page)
67
+ links_to_1.first.url.to_s.should == pages[0].url
68
+ end
69
+
70
+ it "should be able to find urls linking to a url" do
71
+ pages = []
72
+ pages << FakePage.new('0', :links => ['1'])
73
+ pages << FakePage.new('1', :redirect => '2')
74
+ pages << FakePage.new('2')
75
+
76
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
77
+
78
+ page_store.should respond_to(:pages_linking_to)
79
+
80
+ page_store.urls_linking_to(pages[2].url).size.should == 0
81
+ links_to_1 = page_store.urls_linking_to(pages[1].url)
82
+ links_to_1.size.should == 1
83
+ links_to_1.first.to_s.should == pages[0].url
84
+ end
85
+ end
86
+
87
+ describe Hash do
88
+ it_should_behave_like "page storage"
89
+
90
+ before(:all) do
91
+ @opts = {}
92
+ end
93
+ end
94
+
95
+ describe Storage::PStore do
96
+ it_should_behave_like "page storage"
97
+
98
+ before(:each) do
99
+ @test_file = 'test.pstore'
100
+ File.delete(@test_file) if File.exists?(@test_file)
101
+ @opts = {:storage => Storage.PStore(@test_file)}
102
+ end
103
+
104
+ after(:all) do
105
+ File.delete(@test_file) if File.exists?(@test_file)
106
+ end
107
+ end
108
+
109
+ describe Storage::TokyoCabinet do
110
+ it_should_behave_like "page storage"
111
+
112
+ before(:each) do
113
+ @test_file = 'test.tch'
114
+ File.delete(@test_file) if File.exists?(@test_file)
115
+ @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
116
+ end
117
+
118
+ after(:each) do
119
+ @store.close
120
+ end
121
+
122
+ after(:all) do
123
+ File.delete(@test_file) if File.exists?(@test_file)
124
+ end
125
+ end
126
+
127
+ end
128
+ end
@@ -0,0 +1,123 @@
1
+ require File.dirname(__FILE__) + '/spec_helper'
2
+ %w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
3
+
4
+ module Anemone
5
+ describe Storage do
6
+
7
+ it "should have a class method to produce a Hash" do
8
+ Anemone::Storage.should respond_to(:Hash)
9
+ Anemone::Storage.Hash.should be_an_instance_of(Hash)
10
+ end
11
+
12
+ it "should have a class method to produce a PStore" do
13
+ test_file = 'test.pstore'
14
+ Anemone::Storage.should respond_to(:PStore)
15
+ Anemone::Storage.PStore(test_file).should be_an_instance_of(Anemone::Storage::PStore)
16
+ end
17
+
18
+ it "should have a class method to produce a TokyoCabinet" do
19
+ test_file = 'test.tch'
20
+ Anemone::Storage.should respond_to(:TokyoCabinet)
21
+ store = Anemone::Storage.TokyoCabinet(test_file)
22
+ store.should be_an_instance_of(Anemone::Storage::TokyoCabinet)
23
+ store.close
24
+ end
25
+
26
+ module Storage
27
+ shared_examples_for "storage engine" do
28
+ it "should implement [] and []=" do
29
+ @store.should respond_to(:[])
30
+ @store.should respond_to(:[]=)
31
+
32
+ @store['index'] = 'test'
33
+ @store['index'].should == 'test'
34
+ end
35
+
36
+ it "should implement has_key?" do
37
+ @store.should respond_to(:has_key?)
38
+
39
+ @store['index'] = 'test'
40
+ @store.has_key?('index').should == true
41
+
42
+ @store.has_key?('missing').should == false
43
+ end
44
+
45
+ it "should implement delete" do
46
+ @store.should respond_to(:delete)
47
+
48
+ @store['index'] = 'test'
49
+ @store.delete('index').should == 'test'
50
+ @store.has_key?('index').should == false
51
+ end
52
+
53
+ it "should implement keys" do
54
+ @store.should respond_to(:keys)
55
+
56
+ keys = ['a', 'b', 'c']
57
+ keys.each { |key| @store[key] = key }
58
+
59
+ @store.keys.should == keys
60
+ end
61
+
62
+ it "should implement each" do
63
+ @store.should respond_to(:each)
64
+
65
+ keys = ['a', 'b', 'c']
66
+ keys.each { |key| @store[key] = key }
67
+
68
+ result = {}
69
+ @store.each { |k, v| result[k] = v }
70
+ result.values.should == keys
71
+ end
72
+
73
+ it "should implement merge!, and return self" do
74
+ @store.should respond_to(:merge!)
75
+
76
+ hash = {'a' => 'a', 'b' => 'b', 'c' => 'c'}
77
+ merged = @store.merge! hash
78
+ hash.each { |key, value| @store[key].should == value }
79
+
80
+ merged.should === @store
81
+ end
82
+ end
83
+
84
+ describe PStore do
85
+ it_should_behave_like "storage engine"
86
+
87
+ before(:each) do
88
+ @test_file = 'test.pstore'
89
+ File.delete @test_file rescue nil
90
+ @store = Anemone::Storage.PStore(@test_file)
91
+ end
92
+
93
+ after(:all) do
94
+ File.delete @test_file rescue nil
95
+ end
96
+ end
97
+
98
+ describe TokyoCabinet do
99
+ it_should_behave_like "storage engine"
100
+
101
+ before(:each) do
102
+ @test_file = 'test.tch'
103
+ File.delete @test_file rescue nil
104
+ @store = Anemone::Storage.TokyoCabinet(@test_file)
105
+ end
106
+
107
+ after(:each) do
108
+ @store.close
109
+ end
110
+
111
+ after(:all) do
112
+ File.delete @test_file rescue nil
113
+ end
114
+
115
+ it "should raise an error if supplied with a file extension other than .tch" do
116
+ lambda { Anemone::Storage.TokyoCabinet('test.tmp') }.should raise_error(RuntimeError)
117
+ end
118
+
119
+ end
120
+
121
+ end
122
+ end
123
+ end
metadata CHANGED
@@ -1,16 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spk-anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
8
- - spk
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
11
 
13
- date: 2009-11-26 00:00:00 +01:00
12
+ date: 2010-01-03 00:00:00 +01:00
14
13
  default_executable:
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
@@ -21,7 +20,7 @@ dependencies:
21
20
  requirements:
22
21
  - - ">="
23
22
  - !ruby/object:Gem::Version
24
- version: 1.4.0
23
+ version: 1.4.1
25
24
  version:
26
25
  - !ruby/object:Gem::Dependency
27
26
  name: robots
@@ -50,8 +49,11 @@ files:
50
49
  - lib/anemone/core.rb
51
50
  - lib/anemone/http.rb
52
51
  - lib/anemone/page.rb
53
- - lib/anemone/page_hash.rb
52
+ - lib/anemone/page_store.rb
54
53
  - lib/anemone/tentacle.rb
54
+ - lib/anemone/storage.rb
55
+ - lib/anemone/storage/pstore.rb
56
+ - lib/anemone/storage/tokyo_cabinet.rb
55
57
  - lib/anemone/cli.rb
56
58
  - lib/anemone/cli/url_list.rb
57
59
  - lib/anemone/cli/cron.rb
@@ -93,5 +95,8 @@ test_files:
93
95
  - spec/anemone_spec.rb
94
96
  - spec/core_spec.rb
95
97
  - spec/page_spec.rb
98
+ - spec/page_store_spec.rb
99
+ - spec/http_spec.rb
100
+ - spec/storage_spec.rb
96
101
  - spec/fakeweb_helper.rb
97
102
  - spec/spec_helper.rb