sutch-anemone 0.7.2

data/lib/anemone/storage/tokyo_cabinet.rb ADDED
@@ -0,0 +1,60 @@
+ begin
+   require 'tokyocabinet'
+ rescue LoadError
+   puts $!
+   puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+   exit
+ end
+
+ require 'forwardable'
+
+ module Anemone
+   module Storage
+     class TokyoCabinet
+       extend Forwardable
+
+       def_delegators :@db, :close, :size, :keys, :has_key?
+
+       def initialize(file)
+         raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+         @db = ::TokyoCabinet::HDB::new
+         @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+         @db.clear
+       end
+
+       def [](key)
+         if value = @db[key]
+           load_value(value)
+         end
+       end
+
+       def []=(key, value)
+         @db[key] = [Marshal.dump(value)].pack("m")
+       end
+
+       def delete(key)
+         value = self[key]
+         @db.delete(key)
+         value
+       end
+
+       def each
+         @db.keys.each do |k|
+           yield(k, self[k])
+         end
+       end
+
+       def merge!(hash)
+         hash.each { |key, value| self[key] = value }
+         self
+       end
+
+       private
+
+       def load_value(value)
+         Marshal.load(value.unpack("m")[0])
+       end
+
+     end
+   end
+ end
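
The adapter above gives Anemone a Hash-like store backed by a Tokyo Cabinet hash database, and it is wired into a crawl through the :storage option (the same way data/spec/core_spec.rb below does via Storage.TokyoCabinet). A minimal usage sketch, assuming the tokyocabinet gem is installed; the URL and the crawl.tch filename are illustrative only:

    require 'anemone'
    require 'anemone/storage/tokyo_cabinet'

    # Persist crawled pages in crawl.tch instead of holding them all in memory.
    Anemone.crawl("http://www.example.com/",
                  :storage => Anemone::Storage.TokyoCabinet('crawl.tch')) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end
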
data/lib/anemone/tentacle.rb ADDED
@@ -0,0 +1,39 @@
+ require 'anemone/http'
+
+ module Anemone
+   class Tentacle
+
+     #
+     # Create a new Tentacle
+     #
+     def initialize(link_queue, page_queue, opts = {})
+       @link_queue = link_queue
+       @page_queue = page_queue
+       @http = Anemone::HTTP.new(opts)
+       @opts = opts
+     end
+
+     #
+     # Gets links from @link_queue, and puts the fetched
+     # Page objects into @page_queue
+     #
+     def run
+       loop do
+         link, referer, depth = @link_queue.deq
+
+         break if link == :END
+
+         @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
+
+         delay
+       end
+     end
+
+     private
+
+     def delay
+       sleep @opts[:delay] if @opts[:delay] > 0
+     end
+
+   end
+ end
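
Anemone::Tentacle is the worker loop the crawler's core runs in each thread: it pops [link, referer, depth] tuples off a link queue, fetches them with Anemone::HTTP, and pushes the resulting Page objects onto a page queue until it sees the :END sentinel. A rough sketch of that wiring, assuming Ruby's standard Queue; only the Tentacle API itself comes from the code above, and the URL is illustrative:

    require 'anemone'
    require 'thread'
    require 'uri'

    link_queue = Queue.new
    page_queue = Queue.new

    # One worker thread; the real core simply starts several Tentacles on the same queues.
    worker = Thread.new { Anemone::Tentacle.new(link_queue, page_queue, :delay => 0).run }

    link_queue << [URI('http://www.example.com/'), nil, 0]  # [link, referer, depth]
    link_queue << :END                                      # sentinel that breaks the run loop
    worker.join

    puts "fetched #{page_queue.size} page(s)"
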
data/spec/anemone_spec.rb ADDED
@@ -0,0 +1,16 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+
+ describe Anemone do
+
+   it "should have a version" do
+     Anemone.const_defined?('VERSION').should == true
+   end
+
+   it "should return an Anemone::Core from the crawl, which has a PageStore" do
+     result = Anemone.crawl(SPEC_DOMAIN)
+     result.should be_an_instance_of(Anemone::Core)
+     result.pages.should be_an_instance_of(Anemone::PageStore)
+   end
+
+ end
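
As these examples pin down, Anemone.crawl returns the Anemone::Core, and its pages are a PageStore keyed by URL. A quick sketch of inspecting that result (illustrative URL):

    core = Anemone.crawl("http://www.example.com/")
    puts core.pages.keys   # the URLs that were crawled
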
data/spec/cookie_store_spec.rb ADDED
@@ -0,0 +1,28 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+
+ module Anemone
+   describe CookieStore do
+
+     it "should start out empty if no cookies are specified" do
+       CookieStore.new.empty?.should be true
+     end
+
+     it "should accept a Hash of cookies in the constructor" do
+       CookieStore.new({'test' => 'cookie'})['test'].value.should == 'cookie'
+     end
+
+     it "should be able to merge an HTTP cookie string" do
+       cs = CookieStore.new({'a' => 'a', 'b' => 'b'})
+       cs.merge! "a=A; path=/, c=C; path=/"
+       cs['a'].value.should == 'A'
+       cs['b'].value.should == 'b'
+       cs['c'].value.should == 'C'
+     end
+
+     it "should have a to_s method to turn the cookies into a string for the HTTP Cookie header" do
+       CookieStore.new({'a' => 'a', 'b' => 'b'}).to_s.should == 'a=a;b=b'
+     end
+
+   end
+ end
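
In plain usage form, the CookieStore behavior covered above looks like this (a sketch; the cookie names and values are illustrative):

    require 'anemone'

    store = Anemone::CookieStore.new('session' => 'abc')
    store.merge! "token=xyz; path=/"   # fold in a raw Set-Cookie style string
    store['token'].value               # => "xyz"
    store.to_s                         # e.g. "session=abc;token=xyz"
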
data/spec/core_spec.rb ADDED
@@ -0,0 +1,344 @@
+ $:.unshift(File.dirname(__FILE__))
+ require 'spec_helper'
+ %w[pstore tokyo_cabinet sqlite3].each { |file| require "anemone/storage/#{file}.rb" }
+
+ module Anemone
+   describe Core do
+
+     before(:each) do
+       FakeWeb.clean_registry
+     end
+
+     shared_examples_for "crawl" do
+       it "should crawl all the html pages in a domain by following <a> href's" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1', :links => ['3'])
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         Anemone.crawl(pages[0].url, @opts).should have(4).pages
+       end
+
+       it "should not follow links that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should not follow redirects that leave the original domain" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'], :redirect => 'http://www.other.com/')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include('http://www.other.com/')
+       end
+
+       it "should follow http redirects" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1', :redirect => '2')
+         pages << FakePage.new('2')
+
+         Anemone.crawl(pages[0].url, @opts).should have(3).pages
+       end
+
+       it "should follow with HTTP basic authentication" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'], :auth => true)
+         pages << FakePage.new('1', :links => ['3'], :auth => true)
+
+         Anemone.crawl(pages.first.auth_url, @opts).should have(3).pages
+       end
+
+       it "should accept multiple starting URLs" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2', :links => ['3'])
+         pages << FakePage.new('3')
+
+         Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+       end
+
+       it "should include the query string when following links" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('1')
+
+         core = Anemone.crawl(pages[0].url, @opts)
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[2].url)
+       end
+
+       it "should be able to skip links with query strings" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1?foo=1', '2'])
+         pages << FakePage.new('1?foo=1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_query_strings = true
+         end
+
+         core.should have(2).pages
+       end
+
+       it "should be able to skip links based on a RegEx" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+         pages << FakePage.new('3')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.skip_links_like /1/, /3/
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+         core.pages.keys.should_not include(pages[3].url)
+       end
+
+       it "should be able to call a block on every page" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         count = 0
+         Anemone.crawl(pages[0].url, @opts) do |a|
+           a.on_every_page { count += 1 }
+         end
+
+         count.should == 3
+       end
+
+       it "should not discard page bodies by default" do
+         Anemone.crawl(FakePage.new('0').url, @opts).pages.values#.first.doc.should_not be_nil
+       end
+
+       it "should optionally discard page bodies to conserve memory" do
+         # core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+         # core.pages.values.first.doc.should be_nil
+       end
+
+       it "should provide a focus_crawl method to select the links on each page to follow" do
+         pages = []
+         pages << FakePage.new('0', :links => ['1', '2'])
+         pages << FakePage.new('1')
+         pages << FakePage.new('2')
+
+         core = Anemone.crawl(pages[0].url, @opts) do |a|
+           a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+         end
+
+         core.should have(2).pages
+         core.pages.keys.should_not include(pages[1].url)
+       end
+
+       it "should optionally delay between page requests" do
+         delay = 0.25
+
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+
+         start = Time.now
+         Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+         finish = Time.now
+
+         (finish - start).should satisfy {|t| t > delay * 2}
+       end
+
+       it "should optionally obey the robots exclusion protocol" do
+         pages = []
+         pages << FakePage.new('0', :links => '1')
+         pages << FakePage.new('1')
+         pages << FakePage.new('robots.txt',
+                               :body => "User-agent: *\nDisallow: /1",
+                               :content_type => 'text/plain')
+
+         core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+         urls = core.pages.keys
+
+         urls.should include(pages[0].url)
+         urls.should_not include(pages[1].url)
+       end
+
+       it "should be able to set cookies to send with HTTP requests" do
+         cookies = {:a => '1', :b => '2'}
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.cookies = cookies
+         end
+         core.opts[:cookies].should == cookies
+       end
+
+       it "should freeze the options once the crawl begins" do
+         core = Anemone.crawl(FakePage.new('0').url) do |anemone|
+           anemone.threads = 4
+           anemone.on_every_page do
+             lambda {anemone.threads = 2}.should raise_error
+           end
+         end
+         core.opts[:threads].should == 4
+       end
+
+       describe "many pages" do
+         before(:each) do
+           @pages, size = [], 5
+
+           size.times do |n|
+             # register this page with a link to the next page
+             link = (n + 1).to_s if n + 1 < size
+             @pages << FakePage.new(n.to_s, :links => Array(link))
+           end
+         end
+
+         it "should track the page depth and referer" do
+           core = Anemone.crawl(@pages[0].url, @opts)
+           previous_page = nil
+
+           @pages.each_with_index do |page, i|
+             page = core.pages[page.url]
+             page.should be
+             page.depth.should == i
+
+             if previous_page
+               page.referer.should == previous_page.url
+             else
+               page.referer.should be_nil
+             end
+             previous_page = page
+           end
+         end
+
+         it "should optionally limit the depth of the crawl" do
+           core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+           core.should have(4).pages
+         end
+       end
+
+     end
+
+     describe Hash do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @opts = {}
+       end
+     end
+
+     describe Storage::PStore do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.pstore'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => Storage.PStore(@test_file)}
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::TokyoCabinet do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.tch'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe Storage::SQLite3 do
+       it_should_behave_like "crawl"
+
+       before(:all) do
+         @test_file = 'test.db'
+       end
+
+       before(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+         @opts = {:storage => @store = Storage.SQLite3(@test_file)}
+       end
+
+       after(:each) do
+         @store.close
+       end
+
+       after(:each) do
+         File.delete(@test_file) if File.exists?(@test_file)
+       end
+     end
+
+     describe "options" do
+       it "should accept options for the crawl" do
+         core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                              :threads => 2,
+                              :discard_page_bodies => true,
+                              :user_agent => 'test',
+                              :obey_robots_txt => true,
+                              :depth_limit => 3)
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should accept options via setter methods in the crawl block" do
+         core = Anemone.crawl(SPEC_DOMAIN) do |a|
+           a.verbose = false
+           a.threads = 2
+           a.discard_page_bodies = true
+           a.user_agent = 'test'
+           a.obey_robots_txt = true
+           a.depth_limit = 3
+         end
+
+         core.opts[:verbose].should == false
+         core.opts[:threads].should == 2
+         core.opts[:discard_page_bodies].should == true
+         core.opts[:delay].should == 0
+         core.opts[:user_agent].should == 'test'
+         core.opts[:obey_robots_txt].should == true
+         core.opts[:depth_limit].should == 3
+       end
+
+       it "should use 1 thread if a delay is requested" do
+         Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+       end
+     end
+
+   end
+ end
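
Taken together, the core specs above exercise the public crawl API: options passed as a hash or set via the block, plus skip_links_like, focus_crawl, and on_every_page. A combined usage sketch with illustrative values:

    require 'anemone'

    Anemone.crawl("http://www.example.com/", :threads => 2, :depth_limit => 3) do |anemone|
      anemone.skip_links_like /login/, /logout/
      anemone.focus_crawl { |page| page.links.reject { |l| l.to_s =~ /print/ } }
      anemone.on_every_page { |page| puts "#{page.depth}  #{page.url}" }
    end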