sutch-anemone 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
@@ -0,0 +1,77 @@
|
|
1
|
+
# Turn off FakeWeb's allow_net_connect switch for the whole spec run.
# NOTE(review): presumably this makes requests to URIs that were never
# registered fail instead of reaching the real network -- confirm against
# the FakeWeb documentation.
FakeWeb.allow_net_connect = false
|
2
|
+
|
3
|
+
# Spec support for Anemone: canned domains plus a FakePage helper that
# registers fake HTTP responses with FakeWeb.
module Anemone
  # Base URL every FakePage is registered under.
  SPEC_DOMAIN = "http://www.example.com/"
  # Same host as SPEC_DOMAIN, with basic-auth credentials embedded.
  AUTH_SPEC_DOMAIN = "http://user:pass@#{URI.parse(SPEC_DOMAIN).host}/"

  # A fake web page. Creating an instance registers a GET response for
  # SPEC_DOMAIN + name with FakeWeb as a side effect.
  class FakePage
    attr_accessor :links, :hrefs, :body

    # name    - path appended to SPEC_DOMAIN (defaults to the root, '').
    # options - Hash of optional settings:
    #   :links        - path(s) rendered as anchors prefixed with SPEC_DOMAIN
    #   :hrefs        - href value(s) rendered as anchors verbatim
    #   :redirect     - redirect target; the page then answers with a 301
    #   :auth         - when truthy, the plain URL answers 401 and the real
    #                   response is registered under AUTH_SPEC_DOMAIN
    #   :base         - value for a <base href> tag in the generated body
    #   :content_type - response content type (default "text/html")
    #   :body         - explicit body; suppresses body generation
    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)
      @auth = options[:auth] if options.has_key?(:auth)
      @base = options[:base] if options.has_key?(:base)
      @content_type = options[:content_type] || "text/html"
      @body = options[:body]

      create_body unless @body
      add_to_fakeweb
    end

    # Returns the page URL under SPEC_DOMAIN.
    def url
      SPEC_DOMAIN + @name
    end

    # Returns the page URL under AUTH_SPEC_DOMAIN (credentials embedded).
    def auth_url
      AUTH_SPEC_DOMAIN + @name
    end

    private

    # Matches only strings that start with an http:// or https:// scheme.
    ABSOLUTE_HTTP_URL = %r{\Ahttps?://}i

    # Builds a minimal HTML document: an optional <base> tag, one anchor per
    # entry in @links (prefixed with SPEC_DOMAIN), then one per entry in
    # @hrefs (used verbatim).
    def create_body
      head = @base ? "<head><base href=\"#{@base}\"></head>" : ""
      anchors = []
      anchors.concat(@links.map { |l| "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>" }) if @links
      anchors.concat(@hrefs.map { |h| "<a href=\"#{h}\"></a>" }) if @hrefs
      @body = "<html>#{head}<body>#{anchors.join}</body></html>"
    end

    # Registers this page, and its redirect target if any, with FakeWeb.
    def add_to_fakeweb
      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]

        # Only prepend SPEC_DOMAIN when the target is relative. The check is
        # anchored to the start of the string so that a relative path which
        # merely contains "http" (e.g. "docs/http-guide") is still relative.
        redirect_url = (@redirect =~ ABSOLUTE_HTTP_URL) ? @redirect : SPEC_DOMAIN + @redirect
        options[:location] = redirect_url

        # register the page this one redirects to
        FakeWeb.register_uri(:get, redirect_url, {:body => '',
                                                  :content_type => @content_type,
                                                  :status => [200, "OK"]})
      end

      if @auth
        # The plain URL rejects the request; only the credentialed URL
        # serves the real response.
        unauthorized_options = {
          :body => "Unauthorized", :status => ["401", "Unauthorized"]
        }
        FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, unauthorized_options)
        FakeWeb.register_uri(:get, AUTH_SPEC_DOMAIN + @name, options)
      else
        FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
      end
    end
  end
end
|
75
|
+
|
76
|
+
#default root
|
77
|
+
# With no arguments the name defaults to '', so this registers SPEC_DOMAIN
# itself with FakeWeb (FakePage#initialize ends by calling add_to_fakeweb).
Anemone::FakePage.new
|
data/spec/http_spec.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module Anemone
  # Specs for Anemone::HTTP.
  describe HTTP do

    describe "fetch_page" do
      # Start every example with an empty FakeWeb registry.
      before(:each) { FakeWeb.clean_registry }

      it "should still return a Page if an exception occurs during the HTTP connection" do
        # NOTE(review): this stubs :refresh_connection on the HTTP class object,
        # not on instances -- confirm against http.rb that this is the receiver
        # fetch_page actually uses.
        HTTP.stub!(:refresh_connection).and_raise(StandardError)

        fetcher = Anemone::HTTP.new(:page_class => Anemone::Page)
        fetched = fetcher.fetch_page(SPEC_DOMAIN)

        fetched.should be_an_instance_of(Page)
      end

    end
  end
end
|
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,186 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
|
+
require 'spec_helper'
|
3
|
+
|
4
|
+
module Anemone
  # Specs for Anemone::Page. Every example starts from a freshly fetched
  # 'home' page (@page) that links to '1', served through FakeWeb.
  describe Page do

    before(:each) do
      FakeWeb.clean_registry
      @http = Anemone::HTTP.new(:page_class => Anemone::Page)

      @page = @http.fetch_page(FakePage.new('home', :links => '1').url)
    end

    it "should indicate whether it successfully fetched via HTTP" do
      @page.should respond_to(:fetched?)
      @page.fetched?.should == true

      # 'fail' was never registered with FakeWeb, so the fetch cannot succeed
      fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
      fail_page.fetched?.should == false
    end

    it "should store and expose the response body of the HTTP request" do
      body = 'test'
      page = @http.fetch_page(FakePage.new('body_test', {:body => body}).url)
      page.body.should == body
    end

    it "should record any error that occurs during fetch_page" do
      @page.should respond_to(:error)
      @page.error.should be_nil

      fail_page = @http.fetch_page(SPEC_DOMAIN + 'fail')
      fail_page.error.should_not be_nil
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    it "should have a Nokogiri::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      @http.fetch_pages(FakePage.new('redir', :redirect => 'home').url).first.redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

    it "should include the response time for the HTTP request" do
      @page.should respond_to(:response_time)
    end

    it "should have the cookies received with the page" do
      @page.should respond_to(:cookies)
      @page.cookies.should == []
    end

    describe "#to_hash" do
      it "converts the page to a hash" do
        hash = @page.to_hash
        hash['url'].should == @page.url.to_s
        hash['referer'].should == @page.referer.to_s
        hash['links'].should == @page.links.map(&:to_s)
      end

      context "when redirect_to is nil" do
        it "sets 'redirect_to' to nil in the hash" do
          @page.redirect_to.should be_nil
          # The hash is keyed by Strings (see the assertions above); a Symbol
          # lookup would always be nil and make this check vacuous.
          @page.to_hash['redirect_to'].should be_nil
        end
      end

      context "when redirect_to is a non-nil URI" do
        it "sets 'redirect_to' to the URI string" do
          new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
          new_page.redirect_to.to_s.should == SPEC_DOMAIN + '1'
          new_page.to_hash['redirect_to'].should == SPEC_DOMAIN + '1'
        end
      end
    end

    describe "#from_hash" do
      it "converts from a hash to a Page" do
        page = @page.dup
        page.depth = 1
        converted = Page.from_hash(page.to_hash)
        converted.links.should == page.links
        converted.depth.should == page.depth
      end

      it 'handles a from_hash with a nil redirect_to' do
        page_hash = @page.to_hash
        page_hash['redirect_to'] = nil
        # No exception of any kind is acceptable here; naming one class in a
        # negative raise_error lets every other failure slip through.
        lambda{Page.from_hash(page_hash)}.should_not raise_error
        Page.from_hash(page_hash).redirect_to.should be_nil
      end
    end

    describe "#redirect_to" do
      context "when the page was a redirect" do
        it "returns a URI of the page it redirects to" do
          new_page = Page.new(URI(SPEC_DOMAIN), {:redirect_to => URI(SPEC_DOMAIN + '1')})
          redirect = new_page.redirect_to
          redirect.should be_a(URI)
          redirect.to_s.should == SPEC_DOMAIN + '1'
        end
      end
    end

    describe "#links" do
      it "should not convert anchors to %23" do
        page = @http.fetch_page(FakePage.new('', :body => '<a href="#top">Top</a>').url)
        page.links.should have(1).link
        page.links.first.to_s.should == SPEC_DOMAIN
      end
    end

    it "should detect, store and expose the base url for the page head" do
      base = "#{SPEC_DOMAIN}path/to/base_url/"
      page = @http.fetch_page(FakePage.new('body_test', {:base => base}).url)
      page.base.should == URI(base)
      @page.base.should be_nil
    end

    it "should have a method to convert a relative url to an absolute one" do
      @page.should respond_to(:to_absolute)

      # Identity
      @page.to_absolute(@page.url).should == @page.url
      @page.to_absolute("").should == @page.url

      # Root-ness
      @page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")

      # Relativeness
      relative_path = "a/relative/path"
      @page.to_absolute(relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")

      deep_page = @http.fetch_page(FakePage.new('home/deep', :links => '1').url)
      upward_relative_path = "../a/relative/path"
      deep_page.to_absolute(upward_relative_path).should == URI("#{SPEC_DOMAIN}#{relative_path}")

      # The base URL case
      base_path = "path/to/base_url/"
      base = "#{SPEC_DOMAIN}#{base_path}"
      page = @http.fetch_page(FakePage.new('home', {:base => base}).url)

      # Identity
      page.to_absolute(page.url).should == page.url
      # It should revert to the base url
      page.to_absolute("").should_not == page.url

      # Root-ness
      page.to_absolute("/").should == URI("#{SPEC_DOMAIN}")

      # Relativeness
      relative_path = "a/relative/path"
      page.to_absolute(relative_path).should == URI("#{base}#{relative_path}")

      upward_relative_path = "../a/relative/path"
      upward_base = "#{SPEC_DOMAIN}path/to/"
      page.to_absolute(upward_relative_path).should == URI("#{upward_base}#{relative_path}")
    end

  end
end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__))
|
2
|
+
require 'spec_helper'
|
3
|
+
%w[pstore tokyo_cabinet sqlite3 mongodb redis].each { |file| require "anemone/storage/#{file}.rb" }
|
4
|
+
|
5
|
+
module Anemone
  # Specs for Anemone::PageStore. The shared "page storage" examples are run
  # once per storage backend; each backend group supplies the crawl options
  # through @opts.
  describe PageStore do

    before(:all) do
      FakeWeb.clean_registry
    end

    shared_examples_for "page storage" do
      it "should be able to compute single-source shortest paths in-place" do
        pages = []
        pages << FakePage.new('0', :links => ['1', '3'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2', :links => ['4'])
        pages << FakePage.new('3')
        pages << FakePage.new('4')

        # crawl, then set depths to nil
        page_store = Anemone.crawl(pages.first.url, @opts) do |a|
          a.after_crawl do |ps|
            ps.each { |url, page| page.depth = nil; ps[url] = page }
          end
        end.pages

        page_store.should respond_to(:shortest_paths!)

        page_store.shortest_paths!(pages[0].url)
        page_store[pages[0].url].depth.should == 0
        page_store[pages[1].url].depth.should == 1
        page_store[pages[2].url].depth.should == 1
        page_store[pages[3].url].depth.should == 1
        page_store[pages[4].url].depth.should == 2
      end

      it "should be able to remove all redirects in-place" do
        pages = []
        pages << FakePage.new('0', :links => ['1'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2')

        page_store = Anemone.crawl(pages[0].url, @opts).pages

        page_store.should respond_to(:uniq!)

        page_store.uniq!
        page_store.has_key?(pages[1].url).should == false
        page_store.has_key?(pages[0].url).should == true
        page_store.has_key?(pages[2].url).should == true
      end

      it "should be able to find pages linking to a url" do
        pages = []
        pages << FakePage.new('0', :links => ['1'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2')

        page_store = Anemone.crawl(pages[0].url, @opts).pages

        page_store.should respond_to(:pages_linking_to)

        page_store.pages_linking_to(pages[2].url).size.should == 0
        links_to_1 = page_store.pages_linking_to(pages[1].url)
        links_to_1.size.should == 1
        links_to_1.first.should be_an_instance_of(Page)
        links_to_1.first.url.to_s.should == pages[0].url
      end

      it "should be able to find urls linking to a url" do
        pages = []
        pages << FakePage.new('0', :links => ['1'])
        pages << FakePage.new('1', :redirect => '2')
        pages << FakePage.new('2')

        page_store = Anemone.crawl(pages[0].url, @opts).pages

        # check the method this example actually exercises
        page_store.should respond_to(:urls_linking_to)

        page_store.urls_linking_to(pages[2].url).size.should == 0
        links_to_1 = page_store.urls_linking_to(pages[1].url)
        links_to_1.size.should == 1
        links_to_1.first.to_s.should == pages[0].url
      end
    end

    # Default storage: no :storage option is passed to the crawl.
    describe Hash do
      it_should_behave_like "page storage"

      before(:all) do
        @opts = {}
      end
    end

    describe Storage::PStore do
      it_should_behave_like "page storage"

      before(:each) do
        @test_file = 'test.pstore'
        File.delete(@test_file) if File.exist?(@test_file)
        @opts = {:storage => Storage.PStore(@test_file)}
      end

      after(:each) do
        File.delete(@test_file) if File.exist?(@test_file)
      end
    end

    describe Storage::TokyoCabinet do
      it_should_behave_like "page storage"

      before(:each) do
        @test_file = 'test.tch'
        File.delete(@test_file) if File.exist?(@test_file)
        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
      end

      # One hook so the order is explicit: close the store first, then
      # remove its backing file.
      after(:each) do
        @store.close
        File.delete(@test_file) if File.exist?(@test_file)
      end
    end

    describe Storage::SQLite3 do
      it_should_behave_like "page storage"

      before(:each) do
        @test_file = 'test.db'
        File.delete(@test_file) if File.exist?(@test_file)
        @opts = {:storage => @store = Storage.SQLite3(@test_file)}
      end

      # One hook so the order is explicit: close the store first, then
      # remove its backing file.
      after(:each) do
        @store.close
        File.delete(@test_file) if File.exist?(@test_file)
      end
    end

    describe Storage::MongoDB do
      it_should_behave_like "page storage"

      before(:each) do
        @opts = {:storage => @store = Storage.MongoDB}
      end

      after(:each) do
        @store.close
      end
    end

    describe Storage::Redis do
      it_should_behave_like "page storage"

      before(:each) do
        @opts = {:storage => @store = Storage.Redis}
      end

      after(:each) do
        @store.close
      end
    end

  end
end
|