sutch-anemone 0.7.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,77 @@
1
+ FakeWeb.allow_net_connect = false
2
+
module Anemone
  # Root URL under which every FakePage is registered.
  SPEC_DOMAIN = "http://www.example.com/"
  # Same host as SPEC_DOMAIN, with basic-auth credentials embedded in the URL.
  AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"

  # A fake web page for specs. Creating an instance builds an HTML body
  # (unless one is supplied) and registers the page with FakeWeb as a GET
  # resource under SPEC_DOMAIN.
  class FakePage
    attr_accessor :links
    attr_accessor :hrefs
    attr_accessor :body

    # name    - path of the page relative to SPEC_DOMAIN ('' is the root).
    # options - Hash; recognised keys:
    #   :links        - name or array of names, rendered as anchors whose
    #                   href is SPEC_DOMAIN + name
    #   :hrefs        - href or array of hrefs, rendered verbatim as anchors
    #   :redirect     - target of a 301 redirect; an absolute http(s) URL is
    #                   used as given, anything else is relative to SPEC_DOMAIN
    #   :auth         - when truthy, the plain URL answers 401 and the page is
    #                   registered under AUTH_SPEC_DOMAIN instead
    #   :base         - value for a <base href> element in the generated body
    #   :content_type - defaults to "text/html"
    #   :body         - explicit body; when truthy, no body is generated
    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)
      @auth = options[:auth] if options.has_key?(:auth)
      @base = options[:base] if options.has_key?(:base)
      @content_type = options[:content_type] || "text/html"
      @body = options[:body]

      create_body unless @body
      add_to_fakeweb
    end

    # URL of this page without credentials.
    def url
      SPEC_DOMAIN + @name
    end

    # URL of this page with the basic-auth credentials embedded.
    def auth_url
      AUTH_SPEC_DOMAIN + @name
    end

    private

    # Builds the HTML body: an optional <base> element followed by one empty
    # anchor per entry in @links (prefixed with SPEC_DOMAIN) and @hrefs.
    def create_body
      head = @base ? "<head><base href=\"#{@base}\"></head>" : ""

      anchors = []
      anchors.concat(@links.map { |l| "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>" }) if @links
      anchors.concat(@hrefs.map { |h| "<a href=\"#{h}\"></a>" }) if @hrefs

      @body = "<html>#{head}<body>#{anchors.join}</body></html>"
    end

    # Registers this page (and, for redirects, an empty target page) with
    # FakeWeb.
    def add_to_fakeweb
      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]

        # Only prepend SPEC_DOMAIN when the target is not an absolute http(s)
        # URL. The check is anchored to the start of the string so relative
        # targets that merely contain "http" (e.g. 'http_docs') stay relative.
        redirect_url = (@redirect =~ %r{\Ahttps?://}i) ? @redirect : SPEC_DOMAIN + @redirect
        options[:location] = redirect_url

        # register the page this one redirects to
        FakeWeb.register_uri(:get, redirect_url, {:body => '',
                                                  :content_type => @content_type,
                                                  :status => [200, "OK"]})
      end

      if @auth
        unauthorized_options = {
          :body => "Unauthorized", :status => ["401", "Unauthorized"]
        }
        # The credential-less URL answers 401; only the auth URL serves the page.
        FakeWeb.register_uri(:get, url, unauthorized_options)
        FakeWeb.register_uri(:get, auth_url, options)
      else
        FakeWeb.register_uri(:get, url, options)
      end
    end
  end
end
75
+
76
+ #default root
77
+ Anemone::FakePage.new
data/spec/http_spec.rb ADDED
@@ -0,0 +1,19 @@
1
+ require 'spec_helper'
2
+
module Anemone
  # Spec for HTTP#fetch_page behaviour when the connection step raises.
  describe HTTP do
    describe "fetch_page" do
      # Start every example from an empty FakeWeb registry.
      before(:each) { FakeWeb.clean_registry }

      it "should still return a Page if an exception occurs during the HTTP connection" do
        HTTP.stub!(:refresh_connection).and_raise(StandardError)

        fetcher = HTTP.new(:page_class => Page)
        fetched = fetcher.fetch_page(SPEC_DOMAIN)

        fetched.should be_an_instance_of(Page)
      end
    end
  end
end
data/spec/page_spec.rb ADDED
@@ -0,0 +1,186 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+
module Anemone
  # Specs for Anemone::Page, exercised mostly through pages fetched by
  # Anemone::HTTP from FakePage fixtures registered with FakeWeb.
  describe Page do

    before(:each) do
      FakeWeb.clean_registry
      @http = Anemone::HTTP.new(:page_class => Anemone::Page)

      # Shared fixture: the page 'home', whose body holds one link to '1'.
      @page = @http.fetch_page(FakePage.new('home', :links => '1').url)
    end

    it "should indicate whether it successfully fetched via HTTP" do
      @page.should respond_to(:fetched?)
      @page.fetched?.should == true

      # 'fail' is never registered with FakeWeb in this example.
      fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
      fail_page.fetched?.should == false
    end

    it "should store and expose the response body of the HTTP request" do
      body = 'test'
      page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
      page.body.should == body
    end

    it "should record any error that occurs during fetch_page" do
      @page.should respond_to(:error)
      @page.error.should be_nil

      fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
      fail_page.error.should_not be_nil
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    it "should have a Nokogiri::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

    it "should include the response time for the HTTP request" do
      @page.should respond_to(:response_time)
    end

    it "should have the cookies received with the page" do
      @page.should respond_to(:cookies)
      @page.cookies.should == []
    end

    describe "#to_hash" do
      it "converts the page to a hash" do
        hash = @page.to_hash
        hash['url'].should == @page.url.to_s
        hash['referer'].should == @page.referer.to_s
        hash['links'].should == @page.links.map(&:to_s)
      end

      context "when redirect_to is nil" do
        it "sets 'redirect_to' to nil in the hash" do
          @page.redirect_to.should be_nil
          # Use the string key, like every other to_hash access in this spec;
          # a symbol key would be nil no matter what to_hash stored.
          @page.to_hash['redirect_to'].should be_nil
        end
      end

      context "when redirect_to is a non-nil URI" do
        it "sets 'redirect_to' to the URI string" do
          new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
          new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
          new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
        end
      end
    end

    describe "#from_hash" do
      it "converts from a hash to a Page" do
        page = @page.dup
        page.depth = 1
        converted = Page.from_hash(page.to_hash)
        converted.links.should == page.links
        converted.depth.should == page.depth
      end

      it 'handles a from_hash with a nil redirect_to' do
        page_hash = @page.to_hash
        page_hash['redirect_to'] = nil
        # No exception of any kind is acceptable here (URI::InvalidURIError
        # being the one a nil redirect_to would be expected to trigger).
        lambda{Page.from_hash(page_hash)}.should_not raise_error
        Page.from_hash(page_hash).redirect_to.should be_nil
      end
    end

    describe "#redirect_to" do
      context "when the page was a redirect" do
        it "returns a URI of the page it redirects to" do
          new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
          redirect = new_page.redirect_to
          redirect.should be_a(URI)
          redirect.to_s.should == SPEC_DOMAIN + '1'
        end
      end
    end

    describe "#links" do
      it "should not convert anchors to %23" do
        page = @http.fetch_page(FakePage.new('', :body => '<a href="#top">Top</a>').url)
        page.links.should have(1).link
        page.links.first.to_s.should == SPEC_DOMAIN
      end
    end

    it "should detect, store and expose the base url for the page head" do
      base = "#{SPEC_DOMAIN}path/to/base_url/"
      page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
      page.base.should == URI(base)
      # The shared fixture was built without a :base option.
      @page.base.should be_nil
    end

    it "should have a method to convert a relative url to an absolute one" do
      @page.should respond_to(:to_absolute)

      # Identity
      @page.to_absolute(@page.url).should == @page.url
      @page.to_absolute("").should == @page.url

      # Root-ness
      @page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")

      # Relativeness
      relative_path = "a/relative/path"
      @page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")

      deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
      upward_relative_path = "../a/relative/path"
      deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")

      # The base URL case
      base_path = "path/to/base_url/"
      base = "#{SPEC_DOMAIN}#{base_path}"
      page = @http.fetch_page(FakePage.new('home', {:base => base}).url)

      # Identity
      page.to_absolute(page.url).should == page.url
      # It should revert to the base url
      page.to_absolute("").should_not == page.url

      # Root-ness
      page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")

      # Relativeness
      relative_path = "a/relative/path"
      page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")

      upward_relative_path = "../a/relative/path"
      upward_base = "#{SPEC_DOMAIN}path/to/"
      page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")
    end

  end
end
@@ -0,0 +1,171 @@
1
+ $:.unshift(File.dirname(__FILE__))
2
+ require 'spec_helper'
3
+ %w[pstore tokyo_cabinet sqlite3 mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
4
+
module Anemone
  # Specs for PageStore, run once per storage backend through the shared
  # "page storage" examples. Each backend group supplies @opts, which is
  # passed to Anemone.crawl.
  describe PageStore do

    before(:all) do
      FakeWeb.clean_registry
    end

    shared_examples_for "page storage" do
      it "should be able to compute single-source shortest paths in-place" do
        pages = []
        pages << FakePage.new('0', :links => ['1', '3'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2', :links => ['4'])
        pages << FakePage.new('3')
        pages << FakePage.new('4')

        # crawl, then set depths to nil
        page_store = Anemone.crawl(pages.first.url, @opts) do |a|
          a.after_crawl do |ps|
            ps.each { |url, page| page.depth = nil; ps[url] = page }
          end
        end.pages

        page_store.should respond_to(:shortest_paths!)

        page_store.shortest_paths!(pages[0].url)
        page_store[pages[0].url].depth.should == 0
        page_store[pages[1].url].depth.should == 1
        page_store[pages[2].url].depth.should == 1
        page_store[pages[3].url].depth.should == 1
        page_store[pages[4].url].depth.should == 2
      end

      it "should be able to remove all redirects in-place" do
        pages = []
        pages << FakePage.new('0', :links => ['1'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2')

        page_store = Anemone.crawl(pages[0].url, @opts).pages

        page_store.should respond_to(:uniq!)

        page_store.uniq!
        page_store.has_key?(pages[1].url).should == false
        page_store.has_key?(pages[0].url).should == true
        page_store.has_key?(pages[2].url).should == true
      end

      it "should be able to find pages linking to a url" do
        pages = []
        pages << FakePage.new('0', :links => ['1'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2')

        page_store = Anemone.crawl(pages[0].url, @opts).pages

        page_store.should respond_to(:pages_linking_to)

        page_store.pages_linking_to(pages[2].url).size.should == 0
        links_to_1 = page_store.pages_linking_to(pages[1].url)
        links_to_1.size.should == 1
        links_to_1.first.should be_an_instance_of(Page)
        links_to_1.first.url.to_s.should == pages[0].url
      end

      it "should be able to find urls linking to a url" do
        pages = []
        pages << FakePage.new('0', :links => ['1'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2')

        page_store = Anemone.crawl(pages[0].url, @opts).pages

        # Check the method this example actually exercises.
        page_store.should respond_to(:urls_linking_to)

        page_store.urls_linking_to(pages[2].url).size.should == 0
        links_to_1 = page_store.urls_linking_to(pages[1].url)
        links_to_1.size.should == 1
        links_to_1.first.to_s.should == pages[0].url
      end
    end

    describe Hash do
      it_should_behave_like "page storage"

      before(:all) do
        @opts = {}
      end
    end

    describe Storage::PStore do
      it_should_behave_like "page storage"

      before(:each) do
        @test_file = 'test.pstore'
        File.delete(@test_file) if File.exist?(@test_file)
        @opts = {:storage => Storage.PStore(@test_file)}
      end

      after(:each) do
        File.delete(@test_file) if File.exist?(@test_file)
      end
    end

    describe Storage::TokyoCabinet do
      it_should_behave_like "page storage"

      before(:each) do
        @test_file = 'test.tch'
        File.delete(@test_file) if File.exist?(@test_file)
        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
      end

      # One hook so the order is explicit: close the store, then remove its file.
      after(:each) do
        @store.close
        File.delete(@test_file) if File.exist?(@test_file)
      end
    end

    describe Storage::SQLite3 do
      it_should_behave_like "page storage"

      before(:each) do
        @test_file = 'test.db'
        File.delete(@test_file) if File.exist?(@test_file)
        @opts = {:storage => @store = Storage.SQLite3(@test_file)}
      end

      # One hook so the order is explicit: close the store, then remove its file.
      after(:each) do
        @store.close
        File.delete(@test_file) if File.exist?(@test_file)
      end
    end

    describe Storage::MongoDB do
      it_should_behave_like "page storage"

      before(:each) do
        @opts = {:storage => @store = Storage.MongoDB}
      end

      after(:each) do
        @store.close
      end
    end

    describe Storage::Redis do
      it_should_behave_like "page storage"

      before(:each) do
        @opts = {:storage => @store = Storage.Redis}
      end

      after(:each) do
        @store.close
      end
    end

  end
end