spk-anemone 0.2.4 → 0.3.0

@@ -1,21 +1,52 @@
+require 'forwardable'
+
 module Anemone
-  class PageHash < Hash
-
+  class PageStore
+    extend Forwardable
+
+    def_delegators :@storage, :keys, :values, :size, :each
+
+    def initialize(storage = {})
+      @storage = storage
+    end
+
     # We typically index the hash with a URI,
     # but convert it to a String for easier retrieval
     def [](index)
-      super(index.to_s)
+      @storage[index.to_s]
     end
-
+
     def []=(index, other)
-      super(index.to_s, other)
+      @storage[index.to_s] = other
+    end
+
+    def delete(key)
+      @storage.delete key.to_s
     end
-
+
     def has_key?(key)
-      super(key.to_s)
+      @storage.has_key? key.to_s
+    end
+
+    def each_value
+      each { |key, value| yield value }
+    end
+
+    def values
+      result = []
+      each { |key, value| result << value }
+      result
+    end
+
+    def touch_key(key)
+      self[key] = Page.new(key)
     end

-    # Does this PageHash contain the specified URL?
+    def touch_keys(keys)
+      @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
+    end
+
+    # Does this PageStore contain the specified URL?
     # HTTP and HTTPS versions of a URL are considered to be the same page.
     def has_page?(url)
       schemes = %w(http https)
@@ -24,80 +55,67 @@ module Anemone
         return schemes.any? { |s| u.scheme = s; has_key?(u) }
       end

-      has_key?(url)
+      has_key? url
     end
-
+
     #
     # Use a breadth-first search to calculate the single-source
-    # shortest paths from *root* to all pages in the PageHash
+    # shortest paths from *root* to all pages in the PageStore
     #
     def shortest_paths!(root)
       root = URI(root) if root.is_a?(String)
       raise "Root node not found" if !has_key?(root)
-
-      each_value {|p| p.visited = false if p}
-
+
       q = Queue.new
-
-      q.enq(root)
-      self[root].depth = 0
-      self[root].visited = true
-      while(!q.empty?)
-        url = q.deq
-
-        next if !has_key?(url)
-
-        page = self[url]
-
+
+      q.enq root
+      root_page = self[root]
+      root_page.depth = 0
+      root_page.visited = true
+      self[root] = root_page
+      while !q.empty?
+        page = self[q.deq]
         page.links.each do |u|
-          next if !has_key?(u) or self[u].nil?
-          link = self[u]
-          aliases = [link].concat(link.aliases.map {|a| self[a] })
-
-          aliases.each do |node|
-            if node.depth.nil? or page.depth + 1 < node.depth
-              node.depth = page.depth + 1
+          begin
+            link = self[u]
+            next if link.nil? || !link.fetched? || link.visited
+
+            q << u unless link.redirect?
+            link.visited = true
+            link.depth = page.depth + 1
+            self[u] = link
+
+            if link.redirect?
+              u = link.redirect_to
+              redo
            end
          end
-
-          q.enq(self[u].url) if !self[u].visited
-          self[u].visited = true
        end
      end
-
+
      self
    end
-
+
    #
-    # Returns a new PageHash by removing redirect-aliases for each
-    # non-redirect Page
+    # Removes all Pages from storage where redirect? is true
    #
-    def uniq
-      results = PageHash.new
-      each do |url, page|
-        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
-        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
-        if !page.redirect? and !page_added
-          results[url] = page.clone
-          results[url].aliases = []
-        end
-      end
-
-      results
+    def uniq!
+      each_value { |page| delete page.url if page.redirect? }
+      self
    end
-
+
    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
-        urls = [urls] unless urls.is_a?(Array)
+        urls = [urls]
        single = true
      end

      urls.map! do |url|
-        if url.is_a?(String)
+        unless url.is_a?(URI)
          URI(url) rescue nil
        else
          url
@@ -112,7 +130,7 @@ module Anemone
      end

      if single and !links.empty?
-        return links.first
+        return links[urls.first]
      else
        return links
      end
@@ -132,11 +150,11 @@ module Anemone
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
-        return links.first
+        return links[urls.first]
      else
        return links
-      end
+      end
    end

  end
-end
+end
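The PageStore above wraps any hash-like backend and normalizes URI keys to Strings. A minimal usage sketch, assuming the spk-anemone 0.3.0 gem is installed and `require 'anemone'` loads PageStore and Page (everything else is taken from the diff above):

require 'anemone'
require 'uri'

# Default backend is a plain in-memory Hash.
store = Anemone::PageStore.new({})

# touch_key seeds an unfetched Page; the URI key is stored as a String.
store.touch_key(URI('http://example.com/'))

store.has_key?(URI('http://example.com/'))  # => true
store.size                                  # => 1 (delegated to the backing Hash)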
@@ -0,0 +1,19 @@
+module Anemone
+  module Storage
+
+    def self.Hash(*args)
+      Hash.new(*args)
+    end
+
+    def self.PStore(*args)
+      require 'anemone/storage/pstore'
+      self::PStore.new(*args)
+    end
+
+    def self.TokyoCabinet(file)
+      require 'anemone/storage/tokyo_cabinet'
+      self::TokyoCabinet.new(file)
+    end
+
+  end
+end
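These factory methods are how a storage backend gets selected. Based on the specs further down (which pass `:storage => Storage.PStore(...)` into `Anemone.crawl`), a hedged usage sketch; the URL and filename are placeholders:

require 'anemone'

Anemone.crawl('http://www.example.com/',
              :storage => Anemone::Storage.PStore('crawl.pstore')) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end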
@@ -0,0 +1,48 @@
+require 'pstore'
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class PStore
+      extend Forwardable
+
+      def_delegators :@keys, :has_key?, :keys, :size
+
+      def initialize(file)
+        File.delete(file) if File.exists?(file)
+        @store = ::PStore.new(file)
+        @keys = {}
+      end
+
+      def [](key)
+        @store.transaction { |s| s[key] }
+      end
+
+      def []=(key,value)
+        @keys[key] = nil
+        @store.transaction { |s| s[key] = value }
+      end
+
+      def delete(key)
+        @keys.delete(key)
+        @store.transaction { |s| s.delete key}
+      end
+
+      def each
+        @keys.each_key do |key|
+          value = nil
+          @store.transaction { |s| value = s[key] }
+          yield key, value
+        end
+      end
+
+      def merge!(hash)
+        @store.transaction do |s|
+          hash.each { |key, value| s[key] = value; @keys[key] = nil }
+        end
+        self
+      end
+
+    end
+  end
+end
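The adapter can also be exercised directly; a sketch assuming the gem is on the load path ('pages.pstore' is an arbitrary filename):

require 'anemone/storage/pstore'

store = Anemone::Storage::PStore.new('pages.pstore')
store['http://example.com/'] = 'any Marshal-able value'   # written inside a PStore transaction
store.merge!('http://example.com/a' => 1, 'http://example.com/b' => 2)
store.each { |key, value| puts "#{key} => #{value.inspect}" }
puts store.size                               # delegated to the in-memory key index
puts store.has_key?('http://example.com/a')   # => true
store.delete('http://example.com/')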
@@ -0,0 +1,57 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.each { |k, v| yield k, load_value(v) }
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
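As above, this adapter can be used on its own; a sketch that needs the tokyocabinet gem installed ('pages.tch' is an arbitrary filename, and the .tch extension is required by initialize):

require 'anemone/storage/tokyo_cabinet'

store = Anemone::Storage::TokyoCabinet.new('pages.tch')

# Values round-trip through Marshal.dump plus Base64 ("m" pack), as shown in []= and load_value.
store['http://example.com/'] = { :code => 200 }
puts store['http://example.com/'].inspect   # => {:code=>200}

store.close   # delegated to the underlying TokyoCabinet::HDB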
@@ -2,7 +2,7 @@ require 'anemone/http'

 module Anemone
   class Tentacle
-
+
     #
     # Create a new Tentacle
     #
@@ -12,18 +12,18 @@ module Anemone
       @http = Anemone::HTTP.new(opts)
       @opts = opts
     end
-
+
     #
     # Gets links from @link_queue, and returns the fetched
     # Page objects into @page_queue
     #
     def run
       loop do
-        link, from_page = @link_queue.deq
-
+        link, referer, depth = @link_queue.deq
+
         break if link == :END

-        @page_queue << @http.fetch_page(link, from_page)
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }

         delay
       end
@@ -32,8 +32,8 @@ module Anemone
     private

     def delay
-      sleep @opts[:delay] if @opts[:delay]
+      sleep @opts[:delay] if @opts[:delay] > 0
     end

   end
-end
+end
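Per the delay change above, the option now only sleeps when positive, so a crawl delay is still requested the same way as in the specs below; a sketch with a placeholder URL:

require 'anemone'

# A positive :delay pauses between requests; per the spec at the end of
# core_spec.rb, requesting a delay also forces the crawl down to a single thread.
Anemone.crawl('http://www.example.com/', :delay => 0.5) do |anemone|
  anemone.on_every_page { |page| puts "#{page.depth} #{page.url}" }
end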
data/spec/anemone_spec.rb CHANGED
@@ -1,15 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'

 describe Anemone do
-
+
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end

-  it "should return a Anemone::Core from the crawl, which has a PageHash" do
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
-    result.pages.should be_an_instance_of(Anemone::PageHash)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
   end
-
+
 end
data/spec/core_spec.rb CHANGED
@@ -1,178 +1,222 @@
 require File.dirname(__FILE__) + '/spec_helper'
+%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }

 module Anemone
   describe Core do
-
+
     before(:each) do
       FakeWeb.clean_registry
     end
-
-    it "should crawl all the html pages in a domain by following <a> href's" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1', :links => ['3'])
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      Anemone.crawl(pages[0].url).should have(4).pages
-    end
-
-    it "should not leave the original domain" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include('http://www.other.com/')
-    end
-
-    it "should follow http redirects" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1', :redirect => '2')
-      pages << FakePage.new('2')
-
-      Anemone.crawl(pages[0].url).should have(3).pages
-    end
-
-    it "should accept multiple starting URLs" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2', :links => ['3'])
-      pages << FakePage.new('3')
-
-      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
-    end
-
-    it "should include the query string when following links" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1?foo=1'])
-      pages << FakePage.new('1?foo=1')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[2].url)
-    end
-
-    it "should be able to skip links based on a RegEx" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/, /3/
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
-      core.pages.keys.should_not include(pages[3].url)
-    end
-
-    it "should be able to call a block on every page" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      count = 0
-      Anemone.crawl(pages[0].url) do |a|
-        a.on_every_page { count += 1 }
-      end
-
-      count.should == 3
-    end
-
-    it "should not discard page bodies by default" do
-      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
-    end
-
-    it "should optionally discard page bodies to conserve memory" do
-      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
-      core.pages.values.first.doc.should be_nil
-    end
-
-    it "should provide a focus_crawl method to select the links on each page to follow" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+        core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
     end
-
-    it "should optionally delay between page requests" do
-      delay = 0.25
-
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-
-      start = Time.now
-      Anemone.crawl(pages[0].url, :delay => delay)
-      finish = Time.now
-
-      (finish - start).should satisfy {|t| t > delay * 2}
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
     end
-
-    it "should optionally obey the robots exclusion protocol" do
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-      pages << FakePage.new('robots.txt',
-                            :body => "User-agent: *\nDisallow: /1",
-                            :content_type => 'text/plain')
-
-      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
-      urls = core.pages.keys
-
-      urls.should include(pages[0].url)
-      urls.should_not include(pages[1].url)
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:each) do
+        @test_file = 'test.pstore'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
     end
-
-    describe "many pages" do
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
      before(:each) do
-        @pages, size = [], 5
-
-        size.times do |n|
-          # register this page with a link to the next page
-          link = (n + 1).to_s if n + 1 < size
-          @pages << FakePage.new(n.to_s, :links => Array(link))
-        end
-      end
-
-      it "should track the page depth and referer" do
-        core = Anemone.crawl(@pages[0].url)
-        previous_page = nil
-
-        @pages.each_with_index do |page, i|
-          page = core.pages[page.url]
-          page.should be
-          page.depth.should == i
-
-          if previous_page
-            page.referer.should == previous_page.url
-          else
-            page.referer.should be_nil
-          end
-          previous_page = page
-        end
+        @test_file = 'test.tch'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
      end
-
-      it "should optionally limit the depth of the crawl" do
-        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
-        core.should have(4).pages
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
      end
    end

@@ -194,6 +238,25 @@ module Anemone
      core.opts[:depth_limit].should == 3
    end

+    it "should accept options via setter methods in the crawl block" do
+      core = Anemone.crawl(SPEC_DOMAIN) do |a|
+        a.verbose = false
+        a.threads = 2
+        a.discard_page_bodies = true
+        a.user_agent = 'test'
+        a.obey_robots_txt = true
+        a.depth_limit = 3
+      end
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
    it "should use 1 thread if a delay is requested" do
      Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
    end
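The new setter-method spec above implies the crawl block now accepts plain attribute writers as an alternative to the options hash; a usage sketch (the URL and user agent string are placeholders):

require 'anemone'

Anemone.crawl('http://www.example.com/') do |anemone|
  anemone.threads = 2
  anemone.depth_limit = 3
  anemone.obey_robots_txt = true
  anemone.user_agent = 'my-crawler'
  anemone.on_every_page { |page| puts page.url }
end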