spk-anemone 0.2.4 → 0.3.0

This diff shows the changes between two publicly released versions of this package, as published to the supported registries. It is provided for informational purposes only.
@@ -1,21 +1,52 @@
+require 'forwardable'
+
 module Anemone
-  class PageHash < Hash
-
+  class PageStore
+    extend Forwardable
+
+    def_delegators :@storage, :keys, :values, :size, :each
+
+    def initialize(storage = {})
+      @storage = storage
+    end
+
     # We typically index the hash with a URI,
     # but convert it to a String for easier retrieval
     def [](index)
-      super(index.to_s)
+      @storage[index.to_s]
     end
-
+
     def []=(index, other)
-      super(index.to_s, other)
+      @storage[index.to_s] = other
+    end
+
+    def delete(key)
+      @storage.delete key.to_s
     end
-
+
     def has_key?(key)
-      super(key.to_s)
+      @storage.has_key? key.to_s
+    end
+
+    def each_value
+      each { |key, value| yield value }
+    end
+
+    def values
+      result = []
+      each { |key, value| result << value }
+      result
+    end
+
+    def touch_key(key)
+      self[key] = Page.new(key)
     end
 
-    # Does this PageHash contain the specified URL?
+    def touch_keys(keys)
+      @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
+    end
+
+    # Does this PageStore contain the specified URL?
     # HTTP and HTTPS versions of a URL are considered to be the same page.
     def has_page?(url)
       schemes = %w(http https)
@@ -24,80 +55,67 @@ module Anemone
         return schemes.any? { |s| u.scheme = s; has_key?(u) }
       end
 
-      has_key?(url)
+      has_key? url
     end
-
+
     #
     # Use a breadth-first search to calculate the single-source
-    # shortest paths from *root* to all pages in the PageHash
+    # shortest paths from *root* to all pages in the PageStore
     #
     def shortest_paths!(root)
       root = URI(root) if root.is_a?(String)
       raise "Root node not found" if !has_key?(root)
-
-      each_value {|p| p.visited = false if p}
-
+
       q = Queue.new
-
-      q.enq(root)
-      self[root].depth = 0
-      self[root].visited = true
-      while(!q.empty?)
-        url = q.deq
-
-        next if !has_key?(url)
-
-        page = self[url]
-
+
+      q.enq root
+      root_page = self[root]
+      root_page.depth = 0
+      root_page.visited = true
+      self[root] = root_page
+      while !q.empty?
+        page = self[q.deq]
         page.links.each do |u|
-          next if !has_key?(u) or self[u].nil?
-          link = self[u]
-          aliases = [link].concat(link.aliases.map {|a| self[a] })
-
-          aliases.each do |node|
-            if node.depth.nil? or page.depth + 1 < node.depth
-              node.depth = page.depth + 1
+          begin
+            link = self[u]
+            next if link.nil? || !link.fetched? || link.visited
+
+            q << u unless link.redirect?
+            link.visited = true
+            link.depth = page.depth + 1
+            self[u] = link
+
+            if link.redirect?
+              u = link.redirect_to
+              redo
             end
           end
-
-          q.enq(self[u].url) if !self[u].visited
-          self[u].visited = true
         end
       end
-
+
       self
     end
-
+
     #
-    # Returns a new PageHash by removing redirect-aliases for each
-    # non-redirect Page
+    # Removes all Pages from storage where redirect? is true
     #
-    def uniq
-      results = PageHash.new
-      each do |url, page|
-        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
-        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
-        if !page.redirect? and !page_added
-          results[url] = page.clone
-          results[url].aliases = []
-        end
-      end
-
-      results
+    def uniq!
+      each_value { |page| delete page.url if page.redirect? }
+      self
     end
-
+
     #
     # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
     # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
     #
     def pages_linking_to(urls)
       unless urls.is_a?(Array)
-        urls = [urls] unless urls.is_a?(Array)
+        urls = [urls]
         single = true
       end
 
       urls.map! do |url|
-        if url.is_a?(String)
+        unless url.is_a?(URI)
           URI(url) rescue nil
         else
           url
@@ -112,7 +130,7 @@ module Anemone
       end
 
       if single and !links.empty?
-        return links.first
+        return links[urls.first]
       else
         return links
       end
@@ -132,11 +150,11 @@ module Anemone
       links.each { |url, pages| links[url] = pages.map{|p| p.url} }
 
       if single and !links.empty?
-        return links.first
+        return links[urls.first]
       else
         return links
-      end
+      end
     end
 
   end
-end
+end
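The net effect of this file is that the old PageHash (a Hash subclass) becomes a PageStore wrapping a pluggable storage object. A minimal usage sketch, not part of the diff, assuming the 0.3.0 gem is installed and that the Storage factory module shown below ships with it:

    require 'anemone'

    store = Anemone::PageStore.new(Anemone::Storage.Hash)  # default in-memory backend
    store.touch_key URI('http://example.com/')             # seed an unfetched Page for that URL
    store.has_page? URI('https://example.com/')            # => true; http and https count as the same page
    store.size                                             # => 1, delegated to the backend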
@@ -0,0 +1,19 @@
+module Anemone
+  module Storage
+
+    def self.Hash(*args)
+      Hash.new(*args)
+    end
+
+    def self.PStore(*args)
+      require 'anemone/storage/pstore'
+      self::PStore.new(*args)
+    end
+
+    def self.TokyoCabinet(file)
+      require 'anemone/storage/tokyo_cabinet'
+      self::TokyoCabinet.new(file)
+    end
+
+  end
+end
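This new factory module is how a crawl chooses its backend. Based on the updated specs further down, the backend is handed to the crawl through a :storage option; a hedged sketch (the URL and file name are illustrative):

    require 'anemone'

    # Keep crawl results in a PStore file instead of the default in-memory Hash.
    Anemone.crawl('http://example.com/',
                  :storage => Anemone::Storage.PStore('anemone-crawl.pstore')) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end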
@@ -0,0 +1,48 @@
+require 'pstore'
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class PStore
+      extend Forwardable
+
+      def_delegators :@keys, :has_key?, :keys, :size
+
+      def initialize(file)
+        File.delete(file) if File.exists?(file)
+        @store = ::PStore.new(file)
+        @keys = {}
+      end
+
+      def [](key)
+        @store.transaction { |s| s[key] }
+      end
+
+      def []=(key,value)
+        @keys[key] = nil
+        @store.transaction { |s| s[key] = value }
+      end
+
+      def delete(key)
+        @keys.delete(key)
+        @store.transaction { |s| s.delete key}
+      end
+
+      def each
+        @keys.each_key do |key|
+          value = nil
+          @store.transaction { |s| value = s[key] }
+          yield key, value
+        end
+      end
+
+      def merge!(hash)
+        @store.transaction do |s|
+          hash.each { |key, value| s[key] = value; @keys[key] = nil }
+        end
+        self
+      end
+
+    end
+  end
+end
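As the new file shows, the PStore backend wipes any existing file on construction, runs each read and write in its own ::PStore transaction, and answers has_key?/keys/size from an in-memory key index. A small sketch of that behavior (the file name is illustrative):

    require 'anemone'

    store = Anemone::Storage.PStore('pages.pstore')  # deletes any existing pages.pstore first
    store['http://example.com/'] = 'any Marshal-able value'
    store.has_key?('http://example.com/')            # => true, served from the in-memory key index
    store['http://example.com/']                     # each read runs in its own transaction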
@@ -0,0 +1,57 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.each { |k, v| yield k, load_value(v) }
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
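The TokyoCabinet backend can only store strings, so values are Marshal-dumped and Base64-encoded (the "m" pack directive) on write and decoded on read. The round trip can be illustrated with plain Ruby, independent of the tokyocabinet gem (the hash below merely stands in for a Page object):

    record  = { 'url' => 'http://example.com/', 'depth' => 2 }
    encoded = [Marshal.dump(record)].pack('m')      # Base64-encode the marshaled bytes
    decoded = Marshal.load(encoded.unpack('m')[0])  # decode, then un-marshal
    decoded == record                               # => true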
@@ -2,7 +2,7 @@ require 'anemone/http'
 
 module Anemone
   class Tentacle
-
+
     #
     # Create a new Tentacle
     #
@@ -12,18 +12,18 @@ module Anemone
       @http = Anemone::HTTP.new(opts)
       @opts = opts
     end
-
+
     #
     # Gets links from @link_queue, and returns the fetched
     # Page objects into @page_queue
     #
     def run
       loop do
-        link, from_page = @link_queue.deq
-
+        link, referer, depth = @link_queue.deq
+
         break if link == :END
 
-        @page_queue << @http.fetch_page(link, from_page)
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
 
         delay
       end
@@ -32,8 +32,8 @@ module Anemone
     private
 
     def delay
-      sleep @opts[:delay] if @opts[:delay]
+      sleep @opts[:delay] if @opts[:delay] > 0
     end
 
   end
-end
+end
data/spec/anemone_spec.rb CHANGED
@@ -1,15 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
-
+
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
 
-  it "should return a Anemone::Core from the crawl, which has a PageHash" do
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
-    result.pages.should be_an_instance_of(Anemone::PageHash)
+    result.pages.should be_an_instance_of(Anemone::PageStore)
   end
-
+
 end
data/spec/core_spec.rb CHANGED
@@ -1,178 +1,222 @@
 require File.dirname(__FILE__) + '/spec_helper'
+%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
 
 module Anemone
   describe Core do
-
+
     before(:each) do
      FakeWeb.clean_registry
    end
-
-    it "should crawl all the html pages in a domain by following <a> href's" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1', :links => ['3'])
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      Anemone.crawl(pages[0].url).should have(4).pages
-    end
-
-    it "should not leave the original domain" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include('http://www.other.com/')
-    end
-
-    it "should follow http redirects" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1', :redirect => '2')
-      pages << FakePage.new('2')
-
-      Anemone.crawl(pages[0].url).should have(3).pages
-    end
-
-    it "should accept multiple starting URLs" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2', :links => ['3'])
-      pages << FakePage.new('3')
-
-      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
-    end
-
-    it "should include the query string when following links" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1?foo=1'])
-      pages << FakePage.new('1?foo=1')
-      pages << FakePage.new('1')
-
-      core = Anemone.crawl(pages[0].url)
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[2].url)
-    end
-
-    it "should be able to skip links based on a RegEx" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-      pages << FakePage.new('3')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/, /3/
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
-      core.pages.keys.should_not include(pages[3].url)
-    end
-
-    it "should be able to call a block on every page" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      count = 0
-      Anemone.crawl(pages[0].url) do |a|
-        a.on_every_page { count += 1 }
-      end
-
-      count.should == 3
-    end
-
-    it "should not discard page bodies by default" do
-      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
-    end
-
-    it "should optionally discard page bodies to conserve memory" do
-      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
-      core.pages.values.first.doc.should be_nil
-    end
-
-    it "should provide a focus_crawl method to select the links on each page to follow" do
-      pages = []
-      pages << FakePage.new('0', :links => ['1', '2'])
-      pages << FakePage.new('1')
-      pages << FakePage.new('2')
-
-      core = Anemone.crawl(pages[0].url) do |a|
-        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
-      end
-
-      core.should have(2).pages
-      core.pages.keys.should_not include(pages[1].url)
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+        core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
     end
-
-    it "should optionally delay between page requests" do
-      delay = 0.25
-
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-
-      start = Time.now
-      Anemone.crawl(pages[0].url, :delay => delay)
-      finish = Time.now
-
-      (finish - start).should satisfy {|t| t > delay * 2}
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
     end
-
-    it "should optionally obey the robots exclusion protocol" do
-      pages = []
-      pages << FakePage.new('0', :links => '1')
-      pages << FakePage.new('1')
-      pages << FakePage.new('robots.txt',
-                            :body => "User-agent: *\nDisallow: /1",
-                            :content_type => 'text/plain')
-
-      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
-      urls = core.pages.keys
-
-      urls.should include(pages[0].url)
-      urls.should_not include(pages[1].url)
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:each) do
+        @test_file = 'test.pstore'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
     end
-
-    describe "many pages" do
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
       before(:each) do
-        @pages, size = [], 5
-
-        size.times do |n|
-          # register this page with a link to the next page
-          link = (n + 1).to_s if n + 1 < size
-          @pages << FakePage.new(n.to_s, :links => Array(link))
-        end
-      end
-
-      it "should track the page depth and referer" do
-        core = Anemone.crawl(@pages[0].url)
-        previous_page = nil
-
-        @pages.each_with_index do |page, i|
-          page = core.pages[page.url]
-          page.should be
-          page.depth.should == i
-
-          if previous_page
-            page.referer.should == previous_page.url
-          else
-            page.referer.should be_nil
-          end
-          previous_page = page
-        end
+        @test_file = 'test.tch'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
       end
-
-      it "should optionally limit the depth of the crawl" do
-        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
-        core.should have(4).pages
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
       end
     end
 
@@ -194,6 +238,25 @@ module Anemone
       core.opts[:depth_limit].should == 3
     end
 
+    it "should accept options via setter methods in the crawl block" do
+      core = Anemone.crawl(SPEC_DOMAIN) do |a|
+        a.verbose = false
+        a.threads = 2
+        a.discard_page_bodies = true
+        a.user_agent = 'test'
+        a.obey_robots_txt = true
+        a.depth_limit = 3
+      end
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
     it "should use 1 thread if a delay is requested" do
       Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
     end