anemone 0.2.3 → 0.3.0

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,57 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.each { |k, v| yield k, load_value(v) }
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
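The specs further down in this diff select this new backend by passing a :storage option to Anemone.crawl, built with Anemone::Storage.TokyoCabinet(file). A minimal usage sketch along those lines (the top-level require 'anemone', the crawl URL, and the crawl.tch filename are placeholders/assumptions, not taken from this diff; the adapter itself requires a filename ending in .tch):

    require 'anemone'
    require 'anemone/storage/tokyo_cabinet'

    # Store crawl results in a Tokyo Cabinet hash database rather than the
    # default in-memory Hash; the filename must end in .tch.
    Anemone.crawl('http://example.com/', :storage => Anemone::Storage.TokyoCabinet('crawl.tch'))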
@@ -2,7 +2,7 @@ require 'anemone/http'
 
 module Anemone
   class Tentacle
-
+
     #
     # Create a new Tentacle
     #
@@ -12,18 +12,18 @@ module Anemone
       @http = Anemone::HTTP.new(opts)
       @opts = opts
     end
-
+
     #
     # Gets links from @link_queue, and returns the fetched
     # Page objects into @page_queue
     #
     def run
       loop do
-        link, from_page = @link_queue.deq
-
+        link, referer, depth = @link_queue.deq
+
         break if link == :END
 
-        @page_queue << @http.fetch_page(link, from_page)
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
 
         delay
       end
@@ -32,8 +32,8 @@ module Anemone
     private
 
     def delay
-      sleep @opts[:delay] if @opts[:delay]
+      sleep @opts[:delay] if @opts[:delay] > 0
     end
 
   end
-end
+end
@@ -1,15 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
-
+
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
 
-  it "should return a Anemone::Core from the crawl, which has a PageHash" do
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
-    result.pages.should be_an_instance_of(Anemone::PageHash)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
   end
-
+
 end
@@ -1,178 +1,222 @@
 require File.dirname(__FILE__) + '/spec_helper'
+%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
 
 module Anemone
   describe Core do
-
+
     before(:each) do
       FakeWeb.clean_registry
     end
-
-    it "should crawl all the html pages in a domain by following <a> href's" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1', :links => ['3'])
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      Anemone.crawl(pages[0].url).should have(4).pages
-    end
-
-    it "should not leave the original domain" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include('http://www.other.com/')
-    end
-
-    it "should follow http redirects" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1', :redirect => '2')
-      pages << FakePage.new('2')
-
-      Anemone.crawl(pages[0].url).should have(3).pages
-    end
-
-    it "should accept multiple starting URLs" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2', :links => ['3'])
-      pages << FakePage.new('3')
-
-      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
-    end
-
-    it "should include the query string when following links" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1?foo=1'])
-      pages << FakePage.new('1?foo=1')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[2].url)
-    end
-
-    it "should be able to skip links based on a RegEx" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/, /3/
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
-      core.pages.keys.should_not include(pages[3].url)
-    end
-
-    it "should be able to call a block on every page" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      count = 0
-      Anemone.crawl(pages[0].url) do |a|
-        a.on_every_page { count += 1 }
-      end
-
-      count.should == 3
-    end
-
-    it "should not discard page bodies by default" do
-      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
-    end
-
-    it "should optionally discard page bodies to conserve memory" do
-      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
-      core.pages.values.first.doc.should be_nil
-    end
-
-    it "should provide a focus_crawl method to select the links on each page to follow" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+        core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
     end
-
-    it "should optionally delay between page requests" do
-      delay = 0.25
-
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-
-      start = Time.now
-      Anemone.crawl(pages[0].url, :delay => delay)
-      finish = Time.now
-
-      (finish - start).should satisfy {|t| t > delay * 2}
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
     end
-
-    it "should optionally obey the robots exclusion protocol" do
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-      pages << FakePage.new('robots.txt',
-                            :body => "User-agent: *\nDisallow: /1",
-                            :content_type => 'text/plain')
-
-      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
-      urls = core.pages.keys
-
-      urls.should include(pages[0].url)
-      urls.should_not include(pages[1].url)
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:each) do
+        @test_file = 'test.pstore'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
     end
-
-    describe "many pages" do
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
       before(:each) do
-        @pages, size = [], 5
-
-        size.times do |n|
-          # register this page with a link to the next page
-          link = (n + 1).to_s if n + 1 < size
-          @pages << FakePage.new(n.to_s, :links => Array(link))
-        end
-      end
-
-      it "should track the page depth and referer" do
-        core = Anemone.crawl(@pages[0].url)
-        previous_page = nil
-
-        @pages.each_with_index do |page, i|
-          page = core.pages[page.url]
-          page.should be
-          page.depth.should == i
-
-          if previous_page
-            page.referer.should == previous_page.url
-          else
-            page.referer.should be_nil
-          end
-          previous_page = page
-        end
+        @test_file = 'test.tch'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
       end
-
-      it "should optionally limit the depth of the crawl" do
-        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
-        core.should have(4).pages
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
       end
     end
 
@@ -194,6 +238,25 @@ module Anemone
       core.opts[:depth_limit].should == 3
     end
 
+    it "should accept options via setter methods in the crawl block" do
+      core = Anemone.crawl(SPEC_DOMAIN) do |a|
+        a.verbose = false
+        a.threads = 2
+        a.discard_page_bodies = true
+        a.user_agent = 'test'
+        a.obey_robots_txt = true
+        a.depth_limit = 3
+      end
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
     it "should use 1 thread if a delay is requested" do
       Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
     end
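The new spec above exercises setter-style configuration inside the crawl block. A hedged sketch of the equivalent caller code, using only the setters shown in that spec (the start URL and option values are placeholders):

    # Options can be set on the block argument instead of being passed as a hash;
    # they end up under the same keys in core.opts.
    Anemone.crawl('http://example.com/') do |anemone|
      anemone.threads = 2
      anemone.depth_limit = 3
      anemone.obey_robots_txt = true
    end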