sutch-anemone 0.7.2
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
data/lib/anemone/storage/tokyo_cabinet.rb
ADDED
@@ -0,0 +1,60 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts $!
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.keys.each do |k|
+          yield(k, self[k])
+        end
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
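A note on the adapter above: each value is Marshal-dumped and Base64-packed on write ([Marshal.dump(value)].pack("m")) and the two steps are reversed on read, so arbitrary Ruby objects round-trip through the hash-like interface. A minimal usage sketch, assuming the tokyocabinet gem is installed; the crawl.tch filename and example URL are illustrative, not from this diff:

    require 'anemone'
    require 'anemone/storage'
    require 'anemone/storage/tokyo_cabinet'

    # Standalone: values survive the Marshal/Base64 round-trip intact
    store = Anemone::Storage.TokyoCabinet('crawl.tch')   # name must end in .tch
    store['http://example.com/'] = { :code => 200, :fetched => true }
    store['http://example.com/'][:code]                  # => 200
    store.close

    # Or as the page store for a whole crawl, the way core_spec.rb below wires it up
    Anemone.crawl('http://example.com/',
                  :storage => Anemone::Storage.TokyoCabinet('crawl.tch'))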
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,39 @@
+require 'anemone/http'
+
+module Anemone
+  class Tentacle
+
+    #
+    # Create a new Tentacle
+    #
+    def initialize(link_queue, page_queue, opts = {})
+      @link_queue = link_queue
+      @page_queue = page_queue
+      @http = Anemone::HTTP.new(opts)
+      @opts = opts
+    end
+
+    #
+    # Gets links from @link_queue, and returns the fetched
+    # Page objects into @page_queue
+    #
+    def run
+      loop do
+        link, referer, depth = @link_queue.deq
+
+        break if link == :END
+
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
+
+        delay
+      end
+    end
+
+    private
+
+    def delay
+      sleep @opts[:delay] if @opts[:delay] > 0
+    end
+
+  end
+end
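Tentacle#run above is a worker loop: it blocks on the link queue for [link, referer, depth] tuples, fetches each link via Anemone::HTTP#fetch_pages, pushes the resulting Page objects onto the page queue, and exits when it dequeues the :END sentinel. A hedged sketch of driving it directly — Core does this internally; the thread count and URL are assumptions, and :delay => 0 is passed explicitly because #delay assumes the key is set:

    require 'thread'
    require 'uri'
    require 'anemone/tentacle'

    link_queue, page_queue = Queue.new, Queue.new

    # One thread per tentacle worker
    workers = Array.new(2) do
      Thread.new { Anemone::Tentacle.new(link_queue, page_queue, :delay => 0).run }
    end

    link_queue << [URI('http://example.com/'), nil, 0]  # [link, referer, depth]

    workers.size.times { link_queue << :END }           # one sentinel per worker
    workers.each(&:join)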
data/spec/anemone_spec.rb
ADDED
@@ -0,0 +1,16 @@
+$:.unshift(File.dirname(__FILE__))
+require 'spec_helper'
+
+describe Anemone do
+
+  it "should have a version" do
+    Anemone.const_defined?('VERSION').should == true
+  end
+
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
+    result = Anemone.crawl(SPEC_DOMAIN)
+    result.should be_an_instance_of(Anemone::Core)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
+  end
+
+end
data/spec/cookie_store_spec.rb
ADDED
@@ -0,0 +1,28 @@
+$:.unshift(File.dirname(__FILE__))
+require 'spec_helper'
+
+module Anemone
+  describe CookieStore do
+
+    it "should start out empty if no cookies are specified" do
+      CookieStore.new.empty?.should be true
+    end
+
+    it "should accept a Hash of cookies in the constructor" do
+      CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
+    end
+
+    it "should be able to merge an HTTP cookie string" do
+      cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
+      cs.merge! "a=A; path=/, c=C; path=/"
+      cs['a'].value.should == 'A'
+      cs['b'].value.should == 'b'
+      cs['c'].value.should == 'C'
+    end
+
+    it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
+      CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
+    end
+
+  end
+end
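For context, cookies handed to a crawl surface in opts[:cookies] (exercised by core_spec.rb below) and are carried by a CookieStore like the one specced above. A brief sketch; the URL and cookie values are illustrative:

    require 'anemone'

    Anemone.crawl('http://example.com/') do |anemone|
      anemone.cookies = { 'session' => 'abc123' }  # hypothetical cookie
    end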
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,344 @@
+$:.unshift(File.dirname(__FILE__))
+require 'spec_helper'
+%w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
+
+module Anemone
+  describe Core do
+
+    before(:each) do
+      FakeWeb.clean_registry
+    end
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not follow links that leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should not follow redirects that leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should follow with HTTP basic authentication" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
+        pages << FakePage.new('1', :links => ['3'], :auth => true)
+
+        Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links with query strings" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1', '2'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_query_strings = true
+        end
+
+        core.should have(2).pages
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values#.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+        # core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        # core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      it "should be able to set cookies to send with HTTP requests" do
+        cookies = {:a => '1', :b => '2'}
+        core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+          anemone.cookies = cookies
+        end
+        core.opts[:cookies].should == cookies
+      end
+
+      it "should freeze the options once the crawl begins" do
+        core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+          anemone.threads = 4
+          anemone.on_every_page do
+            lambda {anemone.threads = 2}.should raise_error
+          end
+        end
+        core.opts[:threads].should == 4
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
+    end
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
+    end
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @test_file = 'test.pstore'
+      end
+
+      before(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
+    end
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @test_file = 'test.tch'
+      end
+
+      before(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
+      end
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
+    end
+
+    describe Storage::SQLite3 do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @test_file = 'test.db'
+      end
+
+      before(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.SQLite3(@test_file)}
+      end
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:each) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
+    end
+
+    describe "options" do
+      it "should accept options for the crawl" do
+        core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                             :threads => 2,
+                             :discard_page_bodies => true,
+                             :user_agent => 'test',
+                             :obey_robots_txt => true,
+                             :depth_limit => 3)
+
+        core.opts[:verbose].should == false
+        core.opts[:threads].should == 2
+        core.opts[:discard_page_bodies].should == true
+        core.opts[:delay].should == 0
+        core.opts[:user_agent].should == 'test'
+        core.opts[:obey_robots_txt].should == true
+        core.opts[:depth_limit].should == 3
+      end
+
+      it "should accept options via setter methods in the crawl block" do
+        core = Anemone.crawl(SPEC_DOMAIN) do |a|
+          a.verbose = false
+          a.threads = 2
+          a.discard_page_bodies = true
+          a.user_agent = 'test'
+          a.obey_robots_txt = true
+          a.depth_limit = 3
+        end
+
+        core.opts[:verbose].should == false
+        core.opts[:threads].should == 2
+        core.opts[:discard_page_bodies].should == true
+        core.opts[:delay].should == 0
+        core.opts[:user_agent].should == 'test'
+        core.opts[:obey_robots_txt].should == true
+        core.opts[:depth_limit].should == 3
+      end
+
+      it "should use 1 thread if a delay is requested" do
+        Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+      end
+    end
+
+  end
+end
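The spec above pins down the public option surface (hash options, block setters, skip_links_like, focus_crawl, on_every_page). Combined into one illustrative crawl — the URL and filter patterns are assumptions, not from the spec:

    require 'anemone'

    Anemone.crawl('http://example.com/', :threads => 2, :depth_limit => 3) do |a|
      a.skip_links_like /logout/                          # drop matching URLs
      a.focus_crawl { |page| page.links.reject { |l| l.to_s =~ /\?/ } }
      a.on_every_page { |page| puts "#{page.url} (depth #{page.depth})" }
    end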