sutch-anemone 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ FakeWeb.allow_net_connect = false
2
+
3
+ module Anemone
4
+ SPEC_DOMAIN = "http://www.example.com/"
5
+ AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"
6
+
7
+ class FakePage
8
+ attr_accessor :links
9
+ attr_accessor :hrefs
10
+ attr_accessor :body
11
+
12
+ def initialize(name = '', options = {})
13
+ @name = name
14
+ @links = [options[:links]].flatten if options.has_key?(:links)
15
+ @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
16
+ @redirect = options[:redirect] if options.has_key?(:redirect)
17
+ @auth = options[:auth] if options.has_key?(:auth)
18
+ @base = options[:base] if options.has_key?(:base)
19
+ @content_type = options[:content_type] || "text/html"
20
+ @body = options[:body]
21
+
22
+ create_body unless @body
23
+ add_to_fakeweb
24
+ end
25
+
26
+ def url
27
+ SPEC_DOMAIN + @name
28
+ end
29
+
30
+ def auth_url
31
+ AUTH_SPEC_DOMAIN + @name
32
+ end
33
+
34
+ private
35
+
36
+ def create_body
37
+ if @base
38
+ @body = "<html><head><base href=\"#{@base}\"></head><body>"
39
+ else
40
+ @body = "<html><body>"
41
+ end
42
+ @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
43
+ @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
44
+ @body += "</body></html>"
45
+ end
46
+
47
+ def add_to_fakeweb
48
+ options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
49
+
50
+ if @redirect
51
+ options[:status] = [301, "Permanently Moved"]
52
+
53
+ # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
54
+ redirect_url = (@redirect =~ /http/) ? @redirect : SPEC_DOMAIN + @redirect
55
+ options[:location] = redirect_url
56
+
57
+ # register the page this one redirects to
58
+ FakeWeb.register_uri(:get, redirect_url, {:body => '',
59
+ :content_type => @content_type,
60
+ :status => [200, "OK"]})
61
+ end
62
+
63
+ if @auth
64
+ unautorized_options = {
65
+ :body => "Unauthorized", :status => ["401", "Unauthorized"]
66
+ }
67
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unautorized_options)
68
+ FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
69
+ else
70
+ FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
71
+ end
72
+ end
73
+ end
74
+ end
75
+
76
+ # default root: register a page with an empty name, i.e. served at SPEC_DOMAIN itself
77
+ Anemone::FakePage.new
data/spec/http_spec.rb ADDED
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
3
+ module Anemone
4
+ describe HTTP do
5
+
6
+ describe "fetch_page" do
7
+ before(:each) do
8
+ FakeWeb.clean_registry
9
+ end
10
+
11
+ it "should still return a Page if an exception occurs during the HTTP connection" do
12
+ HTTP.stub!(:refresh_connection).and_raise(StandardError)
13
+ http = Anemone::HTTP.new(:page_class => Anemone::Page)
14
+ http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
15
+ end
16
+
17
+ end
18
+ end
19
+ end
data/spec/page_spec.rb ADDED
@@ -0,0 +1,186 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
4
+ module Anemone
5
+ describe Page do
6
+
7
+ before(:each) do
8
+ FakeWeb.clean_registry
9
+ @http = Anemone::HTTP.new(:page_class => Anemone::Page)
10
+
11
+ @page = @http.fetch_page(FakePage.new('home', :links => '1').url)
12
+ end
13
+
14
+ it "should indicate whether it successfully fetched via HTTP" do
15
+ @page.should respond_to(:fetched?)
16
+ @page.fetched?.should == true
17
+
18
+ fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
19
+ fail_page.fetched?.should == false
20
+ end
21
+
22
+ it "should store and expose the response body of the HTTP request" do
23
+ body = 'test'
24
+ page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
25
+ page.body.should == body
26
+ end
27
+
28
+ it "should record any error that occurs during fetch_page" do
29
+ @page.should respond_to(:error)
30
+ @page.error.should be_nil
31
+
32
+ fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
33
+ fail_page.error.should_not be_nil
34
+ end
35
+
36
+ it "should store the response headers when fetching a page" do
37
+ @page.headers.should_not be_nil
38
+ @page.headers.should have_key('content-type')
39
+ end
40
+
41
+ it "should have an OpenStruct attribute for the developer to store data in" do
42
+ @page.data.should_not be_nil
43
+ @page.data.should be_an_instance_of(OpenStruct)
44
+
45
+ @page.data.test = 'test'
46
+ @page.data.test.should == 'test'
47
+ end
48
+
49
+ it "should have a Nokogori::HTML::Document attribute for the page body" do
50
+ @page.doc.should_not be_nil
51
+ @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
52
+ end
53
+
54
+ it "should indicate whether it was fetched after an HTTP redirect" do
55
+ @page.should respond_to(:redirect?)
56
+
57
+ @page.redirect?.should == false
58
+
59
+ @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
60
+ end
61
+
62
+ it "should have a method to tell if a URI is in the same domain as the page" do
63
+ @page.should respond_to(:in_domain?)
64
+
65
+ @page.in_domain?(URI(FakePage.new('test').url)).should == true
66
+ @page.in_domain?(URI('http://www.other.com/')).should == false
67
+ end
68
+
69
+ it "should include the response time for the HTTP request" do
70
+ @page.should respond_to(:response_time)
71
+ end
72
+
73
+ it "should have the cookies received with the page" do
74
+ @page.should respond_to(:cookies)
75
+ @page.cookies.should == []
76
+ end
77
+
78
+ describe "#to_hash" do
79
+ it "converts the page to a hash" do
80
+ hash = @page.to_hash
81
+ hash['url'].should == @page.url.to_s
82
+ hash['referer'].should == @page.referer.to_s
83
+ hash['links'].should == @page.links.map(&:to_s)
84
+ end
85
+
86
+ context "when redirect_to is nil" do
87
+ it "sets 'redirect_to' to nil in the hash" do
88
+ @page.redirect_to.should be_nil
89
+ @page.to_hash[:redirect_to].should be_nil
90
+ end
91
+ end
92
+
93
+ context "when redirect_to is a non-nil URI" do
94
+ it "sets 'redirect_to' to the URI string" do
95
+ new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
96
+ new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
97
+ new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
98
+ end
99
+ end
100
+ end
101
+
102
+ describe "#from_hash" do
103
+ it "converts from a hash to a Page" do
104
+ page = @page.dup
105
+ page.depth = 1
106
+ converted = Page.from_hash(page.to_hash)
107
+ converted.links.should == page.links
108
+ converted.depth.should == page.depth
109
+ end
110
+
111
+ it 'handles a from_hash with a nil redirect_to' do
112
+ page_hash = @page.to_hash
113
+ page_hash['redirect_to'] = nil
114
+ lambda{Page.from_hash(page_hash)}.should_not raise_error(URI::InvalidURIError)
115
+ Page.from_hash(page_hash).redirect_to.should be_nil
116
+ end
117
+ end
118
+
119
+ describe "#redirect_to" do
120
+ context "when the page was a redirect" do
121
+ it "returns a URI of the page it redirects to" do
122
+ new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
123
+ redirect = new_page.redirect_to
124
+ redirect.should be_a(URI)
125
+ redirect.to_s.should == SPEC_DOMAIN + '1'
126
+ end
127
+ end
128
+ end
129
+
130
+ describe "#links" do
131
+ it "should not convert anchors to %23" do
132
+ page = @http.fetch_page(FakePage.new('', :body => '<a href="#top">Top</a>').url)
133
+ page.links.should have(1).link
134
+ page.links.first.to_s.should == SPEC_DOMAIN
135
+ end
136
+ end
137
+
138
+ it "should detect, store and expose the base url for the page head" do
139
+ base = "#{SPEC_DOMAIN}path/to/base_url/"
140
+ page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
141
+ page.base.should == URI(base)
142
+ @page.base.should be_nil
143
+ end
144
+
145
+ it "should have a method to convert a relative url to an absolute one" do
146
+ @page.should respond_to(:to_absolute)
147
+
148
+ # Identity
149
+ @page.to_absolute(@page.url).should == @page.url
150
+ @page.to_absolute("").should == @page.url
151
+
152
+ # Root-ness
153
+ @page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
154
+
155
+ # Relativeness
156
+ relative_path = "a/relative/path"
157
+ @page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
158
+
159
+ deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
160
+ upward_relative_path = "../a/relative/path"
161
+ deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
162
+
163
+ # The base URL case
164
+ base_path = "path/to/base_url/"
165
+ base = "#{SPEC_DOMAIN}#{base_path}"
166
+ page = @http.fetch_page(FakePage.new('home', {:base => base}).url)
167
+
168
+ # Identity
169
+ page.to_absolute(page.url).should == page.url
170
+ # It should revert to the base url
171
+ page.to_absolute("").should_not == page.url
172
+
173
+ # Root-ness
174
+ page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
175
+
176
+ # Relativeness
177
+ relative_path = "a/relative/path"
178
+ page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")
179
+
180
+ upward_relative_path = "../a/relative/path"
181
+ upward_base = "#{SPEC_DOMAIN}path/to/"
182
+ page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")
183
+ end
184
+
185
+ end
186
+ end
@@ -0,0 +1,171 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+ %w[pstore tokyo_cabinet sqlite3 mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
4
+
5
+ module Anemone
6
+ describe PageStore do
7
+
8
+ before(:all) do
9
+ FakeWeb.clean_registry
10
+ end
11
+
12
+ shared_examples_for "page storage" do
13
+ it "should be able to compute single-source shortest paths in-place" do
14
+ pages = []
15
+ pages << FakePage.new('0', :links => ['1', '3'])
16
+ pages << FakePage.new('1', :redirect => '2')
17
+ pages << FakePage.new('2', :links => ['4'])
18
+ pages << FakePage.new('3')
19
+ pages << FakePage.new('4')
20
+
21
+ # crawl, then set depths to nil
22
+ page_store = Anemone.crawl(pages.first.url, @opts) do |a|
23
+ a.after_crawl do |ps|
24
+ ps.each { |url, page| page.depth = nil; ps[url] = page }
25
+ end
26
+ end.pages
27
+
28
+ page_store.should respond_to(:shortest_paths!)
29
+
30
+ page_store.shortest_paths!(pages[0].url)
31
+ page_store[pages[0].url].depth.should == 0
32
+ page_store[pages[1].url].depth.should == 1
33
+ page_store[pages[2].url].depth.should == 1
34
+ page_store[pages[3].url].depth.should == 1
35
+ page_store[pages[4].url].depth.should == 2
36
+ end
37
+
38
+ it "should be able to remove all redirects in-place" do
39
+ pages = []
40
+ pages << FakePage.new('0', :links => ['1'])
41
+ pages << FakePage.new('1', :redirect => '2')
42
+ pages << FakePage.new('2')
43
+
44
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
45
+
46
+ page_store.should respond_to(:uniq!)
47
+
48
+ page_store.uniq!
49
+ page_store.has_key?(pages[1].url).should == false
50
+ page_store.has_key?(pages[0].url).should == true
51
+ page_store.has_key?(pages[2].url).should == true
52
+ end
53
+
54
+ it "should be able to find pages linking to a url" do
55
+ pages = []
56
+ pages << FakePage.new('0', :links => ['1'])
57
+ pages << FakePage.new('1', :redirect => '2')
58
+ pages << FakePage.new('2')
59
+
60
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
61
+
62
+ page_store.should respond_to(:pages_linking_to)
63
+
64
+ page_store.pages_linking_to(pages[2].url).size.should == 0
65
+ links_to_1 = page_store.pages_linking_to(pages[1].url)
66
+ links_to_1.size.should == 1
67
+ links_to_1.first.should be_an_instance_of(Page)
68
+ links_to_1.first.url.to_s.should == pages[0].url
69
+ end
70
+
71
+ it "should be able to find urls linking to a url" do
72
+ pages = []
73
+ pages << FakePage.new('0', :links => ['1'])
74
+ pages << FakePage.new('1', :redirect => '2')
75
+ pages << FakePage.new('2')
76
+
77
+ page_store = Anemone.crawl(pages[0].url, @opts).pages
78
+
79
+ page_store.should respond_to(:pages_linking_to)
80
+
81
+ page_store.urls_linking_to(pages[2].url).size.should == 0
82
+ links_to_1 = page_store.urls_linking_to(pages[1].url)
83
+ links_to_1.size.should == 1
84
+ links_to_1.first.to_s.should == pages[0].url
85
+ end
86
+ end
87
+
88
+ describe Hash do
89
+ it_should_behave_like "page storage"
90
+
91
+ before(:all) do
92
+ @opts = {}
93
+ end
94
+ end
95
+
96
+ describe Storage::PStore do
97
+ it_should_behave_like "page storage"
98
+
99
+ before(:each) do
100
+ @test_file = 'test.pstore'
101
+ File.delete(@test_file) if File.exists?(@test_file)
102
+ @opts = {:storage => Storage.PStore(@test_file)}
103
+ end
104
+
105
+ after(:each) do
106
+ File.delete(@test_file) if File.exists?(@test_file)
107
+ end
108
+ end
109
+
110
+ describe Storage::TokyoCabinet do
111
+ it_should_behave_like "page storage"
112
+
113
+ before(:each) do
114
+ @test_file = 'test.tch'
115
+ File.delete(@test_file) if File.exists?(@test_file)
116
+ @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
117
+ end
118
+
119
+ after(:each) do
120
+ @store.close
121
+ end
122
+
123
+ after(:each) do
124
+ File.delete(@test_file) if File.exists?(@test_file)
125
+ end
126
+ end
127
+
128
+ describe Storage::SQLite3 do
129
+ it_should_behave_like "page storage"
130
+
131
+ before(:each) do
132
+ @test_file = 'test.db'
133
+ File.delete(@test_file) if File.exists?(@test_file)
134
+ @opts = {:storage => @store = Storage.SQLite3(@test_file)}
135
+ end
136
+
137
+ after(:each) do
138
+ @store.close
139
+ end
140
+
141
+ after(:each) do
142
+ File.delete(@test_file) if File.exists?(@test_file)
143
+ end
144
+ end
145
+
146
+ describe Storage::MongoDB do
147
+ it_should_behave_like "page storage"
148
+
149
+ before(:each) do
150
+ @opts = {:storage => @store = Storage.MongoDB}
151
+ end
152
+
153
+ after(:each) do
154
+ @store.close
155
+ end
156
+ end
157
+
158
+ describe Storage::Redis do
159
+ it_should_behave_like "page storage"
160
+
161
+ before(:each) do
162
+ @opts = {:storage => @store = Storage.Redis}
163
+ end
164
+
165
+ after(:each) do
166
+ @store.close
167
+ end
168
+ end
169
+
170
+ end
171
+ end