anemone 0.2.3 → 0.3.0

@@ -0,0 +1,57 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.each { |k, v| yield k, load_value(v) }
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
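The file added above is the new Anemone::Storage::TokyoCabinet backend (required in the specs below as anemone/storage/tokyo_cabinet.rb). It marshals each stored value into a Tokyo Cabinet .tch hash database on disk instead of keeping crawled pages in an in-memory hash. A minimal usage sketch, assuming the tokyocabinet gem is installed; the URL and filename below are placeholders, while the Storage.TokyoCabinet factory and the :storage crawl option are the ones exercised by the core spec changes further down:

    require 'anemone'
    require 'anemone/storage/tokyo_cabinet'

    # Keep crawled pages in a Tokyo Cabinet hash database on disk.
    # The backend raises unless the filename ends in .tch.
    Anemone.crawl('http://www.example.com/',
                  :storage => Anemone::Storage.TokyoCabinet('crawl.tch')) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end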
@@ -2,7 +2,7 @@ require 'anemone/http'
 
 module Anemone
   class Tentacle
-
+
     #
     # Create a new Tentacle
    #
@@ -12,18 +12,18 @@ module Anemone
       @http = Anemone::HTTP.new(opts)
       @opts = opts
     end
-
+
     #
     # Gets links from @link_queue, and returns the fetched
     # Page objects into @page_queue
     #
     def run
       loop do
-        link, from_page = @link_queue.deq
-
+        link, referer, depth = @link_queue.deq
+
         break if link == :END
 
-        @page_queue << @http.fetch_page(link, from_page)
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
 
         delay
       end
@@ -32,8 +32,8 @@ module Anemone
     private
 
     def delay
-      sleep @opts[:delay] if @opts[:delay]
+      sleep @opts[:delay] if @opts[:delay] > 0
     end
 
   end
-end
+end
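The Tentacle hunk above changes what travels through the two queues: each entry popped off @link_queue is now a [link, referer, depth] triple rather than a [link, from_page] pair, and HTTP#fetch_pages can return several Page objects for one link (each of which is pushed onto @page_queue), where the old fetch_page returned exactly one. A rough sketch of the queue protocol this implies; the enqueueing side lives in Anemone::Core, which is not part of this diff, so the exact link type used there is an assumption:

    require 'thread'   # Queue

    link_queue = Queue.new
    page_queue = Queue.new

    # Core side (assumed shape): enqueue link/referer/depth triples,
    # then the :END sentinel that breaks the Tentacle loop.
    link_queue << ['http://www.example.com/', nil, 0]
    link_queue << :END

    # Tentacle side, as in the hunk above:
    #   link, referer, depth = link_queue.deq
    #   break if link == :END
    #   http.fetch_pages(link, referer, depth).each { |page| page_queue << page }

The delay guard also changes from a nil check to a `> 0` check, matching the new default of :delay => 0 asserted in the spec changes below.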
@@ -1,15 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
-
+
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
 
-  it "should return a Anemone::Core from the crawl, which has a PageHash" do
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
-    result.pages.should be_an_instance_of(Anemone::PageHash)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
   end
-
+
 end
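This spec rename reflects that Anemone::Core#pages now returns an Anemone::PageStore instead of the old Anemone::PageHash; judging by the core specs below, PageStore keeps the same hash-like interface while delegating persistence to whichever backend the :storage option supplies. A small sketch of the accessors those specs rely on (placeholder URL; the default in-memory backend is assumed):

    core = Anemone.crawl('http://www.example.com/')

    core.pages.class                  # Anemone::PageStore (was Anemone::PageHash in 0.2.3)
    core.pages.keys                   # URLs of every crawled page, as before
    page = core.pages[core.pages.keys.first]
    page.depth                        # depth of the page within the crawl
    page.referer                      # URL of the page that linked to it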
@@ -1,178 +1,222 @@
 require File.dirname(__FILE__) + '/spec_helper'
+%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
 
 module Anemone
   describe Core do
-
+
     before(:each) do
       FakeWeb.clean_registry
     end
-
-    it "should crawl all the html pages in a domain by following <a> href's" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1', :links => ['3'])
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      Anemone.crawl(pages[0].url).should have(4).pages
-    end
-
-    it "should not leave the original domain" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include('http://www.other.com/')
-    end
-
-    it "should follow http redirects" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1', :redirect => '2')
-      pages << FakePage.new('2')
-
-      Anemone.crawl(pages[0].url).should have(3).pages
-    end
-
-    it "should accept multiple starting URLs" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2', :links => ['3'])
-      pages << FakePage.new('3')
-
-      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
-    end
-
-    it "should include the query string when following links" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1?foo=1'])
-      pages << FakePage.new('1?foo=1')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[2].url)
-    end
-
-    it "should be able to skip links based on a RegEx" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/, /3/
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
-      core.pages.keys.should_not include(pages[3].url)
-    end
-
-    it "should be able to call a block on every page" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      count = 0
-      Anemone.crawl(pages[0].url) do |a|
-        a.on_every_page { count += 1 }
-      end
-
-      count.should == 3
-    end
-
-    it "should not discard page bodies by default" do
-      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
-    end
-
-    it "should optionally discard page bodies to conserve memory" do
-      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
-      core.pages.values.first.doc.should be_nil
-    end
-
-    it "should provide a focus_crawl method to select the links on each page to follow" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+        core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
     end
-
-    it "should optionally delay between page requests" do
-      delay = 0.25
-
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-
-      start = Time.now
-      Anemone.crawl(pages[0].url, :delay => delay)
-      finish = Time.now
-
-      (finish - start).should satisfy {|t| t > delay * 2}
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
     end
-
-    it "should optionally obey the robots exclusion protocol" do
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-      pages << FakePage.new('robots.txt',
-                            :body => "User-agent: *\nDisallow: /1",
-                            :content_type => 'text/plain')
-
-      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
-      urls = core.pages.keys
-
-      urls.should include(pages[0].url)
-      urls.should_not include(pages[1].url)
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:each) do
+        @test_file = 'test.pstore'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
     end
-
-    describe "many pages" do
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
       before(:each) do
-        @pages, size = [], 5
-
-        size.times do |n|
-          # register this page with a link to the next page
-          link = (n + 1).to_s if n + 1 < size
-          @pages << FakePage.new(n.to_s, :links => Array(link))
-        end
-      end
-
-      it "should track the page depth and referer" do
-        core = Anemone.crawl(@pages[0].url)
-        previous_page = nil
-
-        @pages.each_with_index do |page, i|
-          page = core.pages[page.url]
-          page.should be
-          page.depth.should == i
-
-          if previous_page
-            page.referer.should == previous_page.url
-          else
-            page.referer.should be_nil
-          end
-          previous_page = page
-        end
+        @test_file = 'test.tch'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
       end
-
-      it "should optionally limit the depth of the crawl" do
-        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
-        core.should have(4).pages
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
       end
     end
 
@@ -194,6 +238,25 @@ module Anemone
       core.opts[:depth_limit].should == 3
     end
 
+    it "should accept options via setter methods in the crawl block" do
+      core = Anemone.crawl(SPEC_DOMAIN) do |a|
+        a.verbose = false
+        a.threads = 2
+        a.discard_page_bodies = true
+        a.user_agent = 'test'
+        a.obey_robots_txt = true
+        a.depth_limit = 3
+      end
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
     it "should use 1 thread if a delay is requested" do
       Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
     end
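The new spec above pins down the second way of configuring a crawl in 0.3.0: the options it exercises (verbose, threads, discard_page_bodies, user_agent, obey_robots_txt, depth_limit) all have writer methods on the object yielded to the crawl block, as an alternative to passing an opts hash to Anemone.crawl. A condensed sketch of the block-setter style (the URL is a placeholder; SPEC_DOMAIN in the specs comes from the spec helper):

    Anemone.crawl('http://www.example.com/') do |anemone|
      anemone.verbose = false
      anemone.threads = 2
      anemone.obey_robots_txt = true
      anemone.depth_limit = 3
    end

As the final spec asserts, requesting a positive :delay between page requests forces the crawl down to a single thread, whatever :threads was set to.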