rodneyc-anemone 0.7.1.1

data/spec/core_spec.rb ADDED
@@ -0,0 +1,344 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+ %w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
+
+ module Anemone
+   describe Core do
+
+     before(:each) do
+       FakeWeb.clean_registry
+     end
+
+     shared_examples_for "crawl" do
+       it "should crawl all the html pages in a domain by following <a> href's" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1', :links => ['3'])
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         Anemone.crawl(pages[0].url, @opts).should have(4).pages
+       end
+
+       it "should not follow links that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should not follow redirects that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should follow http redirects" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1', :redirect => '2')
+         pages << FakePage.new('2')
+
+         Anemone.crawl(pages[0].url, @opts).should have(3).pages
+       end
+
+       it "should follow with HTTP basic authentication" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
+         pages << FakePage.new('1', :links => ['3'], :auth => true)
+
+         Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
+       end
+
+       it "should accept multiple starting URLs" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2', :links => ['3'])
+         pages << FakePage.new('3')
+
+         Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+       end
+
+       it "should include the query string when following links" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[2].url)
+       end
+
+       it "should be able to skip links with query strings" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1', '2'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_query_strings = true
+         end
+
+         core.should have(2).pages
+       end
+
+       it "should be able to skip links based on a RegEx" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_links_like(/1/, /3/)
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+         core.pages.keys.should_not include(pages[3].url)
+       end
+
+       it "should be able to call a block on every page" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         count = 0
+         Anemone.crawl(pages[0].url, @opts) do |a|
+           a.on_every_page { count += 1 }
+         end
+
+         count.should == 3
+       end
+
+       it "should not discard page bodies by default" do
+         Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+       end
+
+       it "should optionally discard page bodies to conserve memory" do
+         core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+         core.pages.values.first.doc.should be_nil
+       end
+
+       it "should provide a focus_crawl method to select the links on each page to follow" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.focus_crawl { |p| p.links.reject { |l| l.to_s =~ /1/ } }
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+       end
+
+       it "should optionally delay between page requests" do
+         delay = 0.25
+
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+
+         start = Time.now
+         Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+         finish = Time.now
+
+         (finish - start).should satisfy { |t| t > delay * 2 }
+       end
+
+       it "should optionally obey the robots exclusion protocol" do
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+         pages << FakePage.new('robots.txt',
+                               :body => "User-agent: *\nDisallow: /1",
+                               :content_type => 'text/plain')
+
+         core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+         urls = core.pages.keys
+
+         urls.should include(pages[0].url)
+         urls.should_not include(pages[1].url)
+       end
+
+       it "should be able to set cookies to send with HTTP requests" do
+         cookies = {:a => '1', :b => '2'}
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.cookies = cookies
+         end
+         core.opts[:cookies].should == cookies
+       end
+
+       it "should freeze the options once the crawl begins" do
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.threads = 4
+           anemone.on_every_page do
+             lambda { anemone.threads = 2 }.should raise_error
+           end
+         end
+         core.opts[:threads].should == 4
+       end
+
+       describe "many pages" do
+         before(:each) do
+           @pages, size = [], 5
+
+           size.times do |n|
+             # register this page with a link to the next page
+             link = (n + 1).to_s if n + 1 < size
+             @pages << FakePage.new(n.to_s, :links => Array(link))
+           end
+         end
+
+         it "should track the page depth and referer" do
+           core = Anemone.crawl(@pages[0].url, @opts)
+           previous_page = nil
+
+           @pages.each_with_index do |page, i|
+             page = core.pages[page.url]
+             page.should be
+             page.depth.should == i
+
+             if previous_page
+               page.referer.should == previous_page.url
+             else
+               page.referer.should be_nil
+             end
+             previous_page = page
+           end
+         end
+
+         it "should optionally limit the depth of the crawl" do
+           core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+           core.should have(4).pages
+         end
+       end
+
+     end
+
+     describe Hash do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @opts = {}
+       end
+     end
+
+     describe Storage::PStore do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.pstore'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => Storage.PStore(@test_file)}
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::TokyoCabinet do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.tch'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::SQLite3 do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.db'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.SQLite3(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe "options" do
+       it "should accept options for the crawl" do
+         core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                           :threads => 2,
+                                           :discard_page_bodies => true,
+                                           :user_agent => 'test',
+                                           :obey_robots_txt => true,
+                                           :depth_limit => 3)
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should accept options via setter methods in the crawl block" do
+         core = Anemone.crawl(SPEC_DOMAIN) do |a|
+           a.verbose = false
+           a.threads = 2
+           a.discard_page_bodies = true
+           a.user_agent = 'test'
+           a.obey_robots_txt = true
+           a.depth_limit = 3
+         end
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should use 1 thread if a delay is requested" do
+         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+       end
+     end
+
+   end
+ end
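
Reviewer note: for orientation, the crawl options exercised above map onto Anemone's block-style API roughly as follows. This is a minimal usage sketch, not part of the diff; the start URL and the /logout/ skip pattern are placeholders.

require 'anemone'

# Minimal sketch of the public crawl API these specs exercise.
# The start URL and the skip pattern are placeholders.
Anemone.crawl('http://www.example.com/', :depth_limit => 3) do |anemone|
  anemone.skip_links_like(/logout/)               # skip links matching any given regex
  anemone.on_every_page { |page| puts page.url }  # callback run for every fetched page
end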
@@ -0,0 +1,77 @@
+ FakeWeb.allow_net_connect = false
+
+ module Anemone
+   SPEC_DOMAIN = "http://www.example.com/"
+   AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"
+
+   class FakePage
+     attr_accessor :links
+     attr_accessor :hrefs
+     attr_accessor :body
+
+     def initialize(name = '', options = {})
+       @name = name
+       @links = [options[:links]].flatten if options.has_key?(:links)
+       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
+       @redirect = options[:redirect] if options.has_key?(:redirect)
+       @auth = options[:auth] if options.has_key?(:auth)
+       @base = options[:base] if options.has_key?(:base)
+       @content_type = options[:content_type] || "text/html"
+       @body = options[:body]
+
+       create_body unless @body
+       add_to_fakeweb
+     end
+
+     def url
+       SPEC_DOMAIN + @name
+     end
+
+     def auth_url
+       AUTH_SPEC_DOMAIN + @name
+     end
+
+     private
+
+     def create_body
+       if @base
+         @body = "<html><head><base href=\"#{@base}\"></head><body>"
+       else
+         @body = "<html><body>"
+       end
+       @links.each { |l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>" } if @links
+       @hrefs.each { |h| @body += "<a href=\"#{h}\"></a>" } if @hrefs
+       @body += "</body></html>"
+     end
+
+     def add_to_fakeweb
+       options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
+
+       if @redirect
+         options[:status] = [301, "Moved Permanently"]
+
+         # only prepend SPEC_DOMAIN if a relative url (without an http scheme) was specified
+         redirect_url = (@redirect =~ %r{^http}) ? @redirect : SPEC_DOMAIN + @redirect
+         options[:location] = redirect_url
+
+         # register the page this one redirects to
+         FakeWeb.register_uri(:get, redirect_url, {:body => '',
+                                                   :content_type => @content_type,
+                                                   :status => [200, "OK"]})
+       end
+
+       if @auth
+         unauthorized_options = {
+           :body => "Unauthorized", :status => [401, "Unauthorized"]
+         }
+         FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unauthorized_options)
+         FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
+       else
+         FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
+       end
+     end
+   end
+ end
+
+ # default root
+ Anemone::FakePage.new
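
For context: FakePage's constructor both builds an HTML body and registers the URL with FakeWeb, so a spec can fabricate a small site in two lines and crawl it without touching the network. A usage sketch (page names are arbitrary):

root = Anemone::FakePage.new('root', :links => ['child'])  # registers http://www.example.com/root
Anemone::FakePage.new('child')                             # registers the linked page
Anemone.crawl(root.url).should have(2).pages               # the crawl sees both fake pages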
data/spec/http_spec.rb ADDED
@@ -0,0 +1,19 @@
+ require 'spec_helper'
+
+ module Anemone
+   describe HTTP do
+
+     describe "fetch_page" do
+       before(:each) do
+         FakeWeb.clean_registry
+       end
+
+       it "should still return a Page if an exception occurs during the HTTP connection" do
+         HTTP.stub!(:refresh_connection).and_raise(StandardError)
+         http = Anemone::HTTP.new
+         http.fetch_page(SPEC_DOMAIN).should be_an_instance_of(Page)
+       end
+
+     end
+   end
+ end
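
Aside: because fetch_page rescues the connection error and returns a Page that records it, callers can branch on fetched? instead of wrapping the call in their own rescue. A sketch of that pattern (assumes a failure like the stub above):

page = Anemone::HTTP.new.fetch_page(SPEC_DOMAIN)
if page.fetched?
  puts page.doc.title                  # parsed Nokogiri document available on success
else
  warn "fetch failed: #{page.error}"   # the captured exception
end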
data/spec/page_spec.rb ADDED
@@ -0,0 +1,177 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+
+ module Anemone
+   describe Page do
+
+     before(:each) do
+       FakeWeb.clean_registry
+       @http = Anemone::HTTP.new
+       @page = @http.fetch_page(FakePage.new('home', :links => '1').url)
+     end
+
+     it "should indicate whether it successfully fetched via HTTP" do
+       @page.should respond_to(:fetched?)
+       @page.fetched?.should == true
+
+       fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
+       fail_page.fetched?.should == false
+     end
+
+     it "should store and expose the response body of the HTTP request" do
+       body = 'test'
+       page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
+       page.body.should == body
+     end
+
+     it "should record any error that occurs during fetch_page" do
+       @page.should respond_to(:error)
+       @page.error.should be_nil
+
+       fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
+       fail_page.error.should_not be_nil
+     end
+
+     it "should store the response headers when fetching a page" do
+       @page.headers.should_not be_nil
+       @page.headers.should have_key('content-type')
+     end
+
+     it "should have an OpenStruct attribute for the developer to store data in" do
+       @page.data.should_not be_nil
+       @page.data.should be_an_instance_of(OpenStruct)
+
+       @page.data.test = 'test'
+       @page.data.test.should == 'test'
+     end
+
+     it "should have a Nokogiri::HTML::Document attribute for the page body" do
+       @page.doc.should_not be_nil
+       @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
+     end
+
+     it "should indicate whether it was fetched after an HTTP redirect" do
+       @page.should respond_to(:redirect?)
+
+       @page.redirect?.should == false
+
+       @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
+     end
+
+     it "should have a method to tell if a URI is in the same domain as the page" do
+       @page.should respond_to(:in_domain?)
+
+       @page.in_domain?(URI(FakePage.new('test').url)).should == true
+       @page.in_domain?(URI('http://www.other.com/')).should == false
+     end
+
+     it "should include the response time for the HTTP request" do
+       @page.should respond_to(:response_time)
+     end
+
+     it "should have the cookies received with the page" do
+       @page.should respond_to(:cookies)
+       @page.cookies.should == []
+     end
+
+     describe "#to_hash" do
+       it "converts the page to a hash" do
+         hash = @page.to_hash
+         hash['url'].should == @page.url.to_s
+         hash['referer'].should == @page.referer.to_s
+         hash['links'].should == @page.links.map(&:to_s)
+       end
+
+       context "when redirect_to is nil" do
+         it "sets 'redirect_to' to nil in the hash" do
+           @page.redirect_to.should be_nil
+           @page.to_hash['redirect_to'].should be_nil
+         end
+       end
+
+       context "when redirect_to is a non-nil URI" do
+         it "sets 'redirect_to' to the URI string" do
+           new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
+           new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
+           new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
+         end
+       end
+     end
+
+     describe "#from_hash" do
+       it "converts from a hash to a Page" do
+         page = @page.dup
+         page.depth = 1
+         converted = Page.from_hash(page.to_hash)
+         converted.links.should == page.links
+         converted.depth.should == page.depth
+       end
+
+       it 'handles a from_hash with a nil redirect_to' do
+         page_hash = @page.to_hash
+         page_hash['redirect_to'] = nil
+         lambda { Page.from_hash(page_hash) }.should_not raise_error(URI::InvalidURIError)
+         Page.from_hash(page_hash).redirect_to.should be_nil
+       end
+     end
+
+     describe "#redirect_to" do
+       context "when the page was a redirect" do
+         it "returns a URI of the page it redirects to" do
+           new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
+           redirect = new_page.redirect_to
+           redirect.should be_a(URI)
+           redirect.to_s.should == SPEC_DOMAIN + '1'
+         end
+       end
+     end
+
+     it "should detect, store and expose the base url for the page head" do
+       base = "#{SPEC_DOMAIN}path/to/base_url/"
+       page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
+       page.base.should == URI(base)
+       @page.base.should be_nil
+     end
+
+     it "should have a method to convert a relative url to an absolute one" do
+       @page.should respond_to(:to_absolute)
+
+       # Identity
+       @page.to_absolute(@page.url).should == @page.url
+       @page.to_absolute("").should == @page.url
+
+       # Root-ness
+       @page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
+
+       # Relativeness
+       relative_path = "a/relative/path"
+       @page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
+
+       deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
+       upward_relative_path = "../a/relative/path"
+       deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")
+
+       # The base URL case
+       base_path = "path/to/base_url/"
+       base = "#{SPEC_DOMAIN}#{base_path}"
+       page = @http.fetch_page(FakePage.new('home', {:base => base}).url)
+
+       # Identity
+       page.to_absolute(page.url).should == page.url
+       # It should revert to the base url
+       page.to_absolute("").should_not == page.url
+
+       # Root-ness
+       page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")
+
+       # Relativeness
+       relative_path = "a/relative/path"
+       page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")
+
+       upward_relative_path = "../a/relative/path"
+       upward_base = "#{SPEC_DOMAIN}path/to/"
+       page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")
+     end
+
+   end
+ end
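
Note: to_hash/from_hash is the serialization seam that the storage back-ends in core_spec.rb rely on; per the assertions above, the hash uses string keys. A round-trip sketch based on that behavior:

page = Anemone::HTTP.new.fetch_page(Anemone::FakePage.new('home').url)
hash = page.to_hash                     # plain Hash with string keys ('url', 'links', ...)
copy = Anemone::Page.from_hash(hash)
copy.url.to_s.should == page.url.to_s   # identity survives the round trip
copy.depth.should == page.depth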