sutch-anemone 0.7.2

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -0,0 +1,60 @@
+ begin
+   require 'tokyocabinet'
+ rescue LoadError
+   puts $!
+   puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+   exit
+ end
+
+ require 'forwardable'
+
+ module Anemone
+   module Storage
+     class TokyoCabinet
+       extend Forwardable
+
+       def_delegators :@db, :close, :size, :keys, :has_key?
+
+       def initialize(file)
+         raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+         @db = ::TokyoCabinet::HDB::new
+         @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+         @db.clear
+       end
+
+       def [](key)
+         if value = @db[key]
+           load_value(value)
+         end
+       end
+
+       def []=(key, value)
+         @db[key] = [Marshal.dump(value)].pack("m")
+       end
+
+       def delete(key)
+         value = self[key]
+         @db.delete(key)
+         value
+       end
+
+       def each
+         @db.keys.each do |k|
+           yield(k, self[k])
+         end
+       end
+
+       def merge!(hash)
+         hash.each { |key, value| self[key] = value }
+         self
+       end
+
+       private
+
+       def load_value(value)
+         Marshal.load(value.unpack("m")[0])
+       end
+
+     end
+   end
+ end
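
For orientation, this adapter plugs into a crawl through the `:storage` option, via the `Anemone::Storage.TokyoCabinet` factory exercised in the specs further down. A minimal usage sketch (the URL and filename are illustrative, and it assumes the `tokyocabinet` gem is installed):

    require 'anemone'
    require 'anemone/storage/tokyo_cabinet'

    # Persist crawl state in a Tokyo Cabinet hash database instead of the
    # default in-memory Hash. The initializer above requires a '.tch' file.
    Anemone.crawl('http://example.com/', :storage => Anemone::Storage.TokyoCabinet('crawl.tch')) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end

Values are marshaled and base64-encoded on write and decoded on read, so anything stored through `[]=` round-trips as a Ruby object.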
@@ -0,0 +1,39 @@
+ require 'anemone/http'
+
+ module Anemone
+   class Tentacle
+
+     #
+     # Create a new Tentacle
+     #
+     def initialize(link_queue, page_queue, opts = {})
+       @link_queue = link_queue
+       @page_queue = page_queue
+       @http = Anemone::HTTP.new(opts)
+       @opts = opts
+     end
+
+     #
+     # Gets links from @link_queue, and returns the fetched
+     # Page objects into @page_queue
+     #
+     def run
+       loop do
+         link, referer, depth = @link_queue.deq
+
+         break if link == :END
+
+         @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
+
+         delay
+       end
+     end
+
+     private
+
+     def delay
+       sleep @opts[:delay] if @opts[:delay] > 0
+     end
+
+   end
+ end
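
The run loop above dequeues `[link, referer, depth]` tuples until it sees the `:END` token, pushing each fetched Page onto the page queue. A rough sketch of driving a single Tentacle by hand (inside the gem, Anemone::Core is what normally owns these queues and spawns one Tentacle per configured thread; the URL here is illustrative):

    require 'thread'
    require 'uri'
    require 'anemone/tentacle'

    link_queue = Queue.new
    page_queue = Queue.new

    # One worker thread; :delay => 0 makes the delay helper a no-op.
    worker = Thread.new { Anemone::Tentacle.new(link_queue, page_queue, :delay => 0).run }

    link_queue << [URI.parse('http://example.com/'), nil, 0]  # link, referer, depth
    link_queue << :END                                        # breaks the run loop
    worker.join

    puts "fetched #{page_queue.size} page(s)"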
@@ -0,0 +1,16 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+
+ describe Anemone do
+
+   it "should have a version" do
+     Anemone.const_defined?('VERSION').should == true
+   end
+
+   it "should return a Anemone::Core from the crawl, which has a PageStore" do
+     result = Anemone.crawl(SPEC_DOMAIN)
+     result.should be_an_instance_of(Anemone::Core)
+     result.pages.should be_an_instance_of(Anemone::PageStore)
+   end
+
+ end
@@ -0,0 +1,28 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+
+ module Anemone
+   describe CookieStore do
+
+     it "should start out empty if no cookies are specified" do
+       CookieStore.new.empty?.should be true
+     end
+
+     it "should accept a Hash of cookies in the constructor" do
+       CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
+     end
+
+     it "should be able to merge an HTTP cookie string" do
+       cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
+       cs.merge! "a=A; path=/, c=C; path=/"
+       cs['a'].value.should == 'A'
+       cs['b'].value.should == 'b'
+       cs['c'].value.should == 'C'
+     end
+
+     it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
+       CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
+     end
+
+   end
+ end
data/spec/core_spec.rb ADDED
@@ -0,0 +1,344 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+ %w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
+
+ module Anemone
+   describe Core do
+
+     before(:each) do
+       FakeWeb.clean_registry
+     end
+
+     shared_examples_for "crawl" do
+       it "should crawl all the html pages in a domain by following <a> href's" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1', :links => ['3'])
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         Anemone.crawl(pages[0].url, @opts).should have(4).pages
+       end
+
+       it "should not follow links that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should not follow redirects that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should follow http redirects" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1', :redirect => '2')
+         pages << FakePage.new('2')
+
+         Anemone.crawl(pages[0].url, @opts).should have(3).pages
+       end
+
+       it "should follow with HTTP basic authentication" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
+         pages << FakePage.new('1', :links => ['3'], :auth => true)
+
+         Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
+       end
+
+       it "should accept multiple starting URLs" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2', :links => ['3'])
+         pages << FakePage.new('3')
+
+         Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+       end
+
+       it "should include the query string when following links" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[2].url)
+       end
+
+       it "should be able to skip links with query strings" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1', '2'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_query_strings = true
+         end
+
+         core.should have(2).pages
+       end
+
+       it "should be able to skip links based on a RegEx" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_links_like /1/, /3/
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+         core.pages.keys.should_not include(pages[3].url)
+       end
+
+       it "should be able to call a block on every page" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         count = 0
+         Anemone.crawl(pages[0].url, @opts) do |a|
+           a.on_every_page { count += 1 }
+         end
+
+         count.should == 3
+       end
+
+       it "should not discard page bodies by default" do
+         Anemone.crawl(FakePage.new('0').url, @opts).pages.values#.first.doc.should_not be_nil
+       end
+
+       it "should optionally discard page bodies to conserve memory" do
+         # core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+         # core.pages.values.first.doc.should be_nil
+       end
+
+       it "should provide a focus_crawl method to select the links on each page to follow" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+       end
+
+       it "should optionally delay between page requests" do
+         delay = 0.25
+
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+
+         start = Time.now
+         Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+         finish = Time.now
+
+         (finish - start).should satisfy {|t| t > delay * 2}
+       end
+
+       it "should optionally obey the robots exclusion protocol" do
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+         pages << FakePage.new('robots.txt',
+                               :body => "User-agent: *\nDisallow: /1",
+                               :content_type => 'text/plain')
+
+         core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+         urls = core.pages.keys
+
+         urls.should include(pages[0].url)
+         urls.should_not include(pages[1].url)
+       end
+
+       it "should be able to set cookies to send with HTTP requests" do
+         cookies = {:a => '1', :b => '2'}
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.cookies = cookies
+         end
+         core.opts[:cookies].should == cookies
+       end
+
+       it "should freeze the options once the crawl begins" do
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.threads = 4
+           anemone.on_every_page do
+             lambda {anemone.threads = 2}.should raise_error
+           end
+         end
+         core.opts[:threads].should == 4
+       end
+
+       describe "many pages" do
+         before(:each) do
+           @pages, size = [], 5
+
+           size.times do |n|
+             # register this page with a link to the next page
+             link = (n + 1).to_s if n + 1 < size
+             @pages << FakePage.new(n.to_s, :links => Array(link))
+           end
+         end
+
+         it "should track the page depth and referer" do
+           core = Anemone.crawl(@pages[0].url, @opts)
+           previous_page = nil
+
+           @pages.each_with_index do |page, i|
+             page = core.pages[page.url]
+             page.should be
+             page.depth.should == i
+
+             if previous_page
+               page.referer.should == previous_page.url
+             else
+               page.referer.should be_nil
+             end
+             previous_page = page
+           end
+         end
+
+         it "should optionally limit the depth of the crawl" do
+           core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+           core.should have(4).pages
+         end
+       end
+
+     end
+
+     describe Hash do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @opts = {}
+       end
+     end
+
+     describe Storage::PStore do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.pstore'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => Storage.PStore(@test_file)}
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::TokyoCabinet do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.tch'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::SQLite3 do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.db'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.SQLite3(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe "options" do
+       it "should accept options for the crawl" do
+         core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                                           :threads => 2,
+                                           :discard_page_bodies => true,
+                                           :user_agent => 'test',
+                                           :obey_robots_txt => true,
+                                           :depth_limit => 3)
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should accept options via setter methods in the crawl block" do
+         core = Anemone.crawl(SPEC_DOMAIN) do |a|
+           a.verbose = false
+           a.threads = 2
+           a.discard_page_bodies = true
+           a.user_agent = 'test'
+           a.obey_robots_txt = true
+           a.depth_limit = 3
+         end
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should use 1 thread if a delay is requested" do
+         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+       end
+     end
+
+   end
+ end