spk-anemone 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +10 -0
- data/README.rdoc +2 -0
- data/lib/anemone/cli/serialize.rb +2 -2
- data/lib/anemone/core.rb +43 -53
- data/lib/anemone/http.rb +32 -21
- data/lib/anemone/page.rb +43 -50
- data/lib/anemone/{page_hash.rb → page_store.rb} +76 -58
- data/lib/anemone/storage.rb +19 -0
- data/lib/anemone/storage/pstore.rb +48 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +57 -0
- data/lib/anemone/tentacle.rb +7 -7
- data/spec/anemone_spec.rb +4 -4
- data/spec/core_spec.rb +226 -163
- data/spec/http_spec.rb +23 -0
- data/spec/page_spec.rb +28 -14
- data/spec/page_store_spec.rb +128 -0
- data/spec/storage_spec.rb +123 -0
- metadata +10 -5
data/lib/anemone/{page_hash.rb → page_store.rb}
RENAMED
@@ -1,21 +1,52 @@
+require 'forwardable'
+
 module Anemone
-  class
+  class PageStore
+    extend Forwardable
+
+    def_delegators :@storage, :keys, :values, :size, :each
+
+    def initialize(storage = {})
+      @storage = storage
+    end
+
     # We typically index the hash with a URI,
     # but convert it to a String for easier retrieval
     def [](index)
+      @storage[index.to_s]
     end
+
     def []=(index, other)
+      @storage[index.to_s] = other
+    end
+
+    def delete(key)
+      @storage.delete key.to_s
     end
+
     def has_key?(key)
+      @storage.has_key? key.to_s
+    end
+
+    def each_value
+      each { |key, value| yield value }
+    end
+
+    def values
+      result = []
+      each { |key, value| result << value }
+      result
+    end
+
+    def touch_key(key)
+      self[key] = Page.new(key)
     end
 
+    def touch_keys(keys)
+      @storage.merge! keys.inject({}) { |h, k| h[k.to_s] = Page.new(k); h }
+    end
+
+    # Does this PageStore contain the specified URL?
     # HTTP and HTTPS versions of a URL are considered to be the same page.
     def has_page?(url)
       schemes = %w(http https)
@@ -24,80 +55,67 @@ module Anemone
         return schemes.any? { |s| u.scheme = s; has_key?(u) }
       end
 
-      has_key?
+      has_key? url
     end
+
     #
     # Use a breadth-first search to calculate the single-source
-    # shortest paths from *root* to all pages in the
+    # shortest paths from *root* to all pages in the PageStore
    #
     def shortest_paths!(root)
       root = URI(root) if root.is_a?(String)
       raise "Root node not found" if !has_key?(root)
-      each_value {|p| p.visited = false if p}
+
       q = Queue.new
-      q.enq
-      self[root]
-        page = self[url]
+
+      q.enq root
+      root_page = self[root]
+      root_page.depth = 0
+      root_page.visited = true
+      self[root] = root_page
+      while !q.empty?
+        page = self[q.deq]
         page.links.each do |u|
+          begin
+            link = self[u]
+            next if link.nil? || !link.fetched? || link.visited
+
+            q << u unless link.redirect?
+            link.visited = true
+            link.depth = page.depth + 1
+            self[u] = link
+
+            if link.redirect?
+              u = link.redirect_to
+              redo
             end
           end
-        q.enq(self[u].url) if !self[u].visited
-        self[u].visited = true
         end
       end
+
      self
    end
+
     #
-    #
-    # non-redirect Page
+    # Removes all Pages from storage where redirect? is true
     #
-    def uniq
-      #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
-      page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
-      if !page.redirect? and !page_added
-        results[url] = page.clone
-        results[url].aliases = []
-      end
-    end
-      results
+    def uniq!
+      each_value { |page| delete page.url if page.redirect? }
+      self
     end
+
     #
     # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
     # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
     #
     def pages_linking_to(urls)
       unless urls.is_a?(Array)
-        urls = [urls]
+        urls = [urls]
         single = true
       end
 
       urls.map! do |url|
+        unless url.is_a?(URI)
           URI(url) rescue nil
         else
           url
@@ -112,7 +130,7 @@ module Anemone
       end
 
       if single and !links.empty?
-        return links.first
+        return links[urls.first]
       else
         return links
       end
@@ -132,11 +150,11 @@ module Anemone
       links.each { |url, pages| links[url] = pages.map{|p| p.url} }
 
       if single and !links.empty?
-        return links.first
+        return links[urls.first]
       else
         return links
-      end
+      end
     end
 
   end
-end
+end
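
The rename from page_hash.rb to page_store.rb is the core of this release: the crawl's page collection is no longer a bare Hash subclass but a wrapper that delegates to an injected storage object (a plain Hash by default) and normalizes URI keys to Strings. A minimal usage sketch, assuming `require 'anemone'` loads `Anemone::PageStore` and `Anemone::Page` (not shown in this diff):

    require 'anemone'

    # Any Hash-like object works as the backend; the default is an in-memory Hash.
    store = Anemone::PageStore.new
    store.touch_key URI('http://example.com/')   # stores an unfetched Page under 'http://example.com/'
    store.has_page? URI('https://example.com/')  # => true; http and https count as the same page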
data/lib/anemone/storage.rb
ADDED
@@ -0,0 +1,19 @@
+module Anemone
+  module Storage
+
+    def self.Hash(*args)
+      Hash.new(*args)
+    end
+
+    def self.PStore(*args)
+      require 'anemone/storage/pstore'
+      self::PStore.new(*args)
+    end
+
+    def self.TokyoCabinet(file)
+      require 'anemone/storage/tokyo_cabinet'
+      self::TokyoCabinet.new(file)
+    end
+
+  end
+end
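
These factory methods are how a crawl selects its backend: the specs further down pass their result as the `:storage` option to `Anemone.crawl`. A sketch of the three choices (the file names are illustrative; the TokyoCabinet backend insists on a `.tch` extension):

    require 'anemone'

    # Default: crawl state lives in an in-memory Hash.
    Anemone.crawl('http://example.com/')

    # On-disk alternatives; each factory lazily requires its backend file.
    Anemone.crawl('http://example.com/', :storage => Anemone::Storage.PStore('crawl.pstore'))
    Anemone.crawl('http://example.com/', :storage => Anemone::Storage.TokyoCabinet('crawl.tch'))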
data/lib/anemone/storage/pstore.rb
ADDED
@@ -0,0 +1,48 @@
+require 'pstore'
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class PStore
+      extend Forwardable
+
+      def_delegators :@keys, :has_key?, :keys, :size
+
+      def initialize(file)
+        File.delete(file) if File.exists?(file)
+        @store = ::PStore.new(file)
+        @keys = {}
+      end
+
+      def [](key)
+        @store.transaction { |s| s[key] }
+      end
+
+      def []=(key,value)
+        @keys[key] = nil
+        @store.transaction { |s| s[key] = value }
+      end
+
+      def delete(key)
+        @keys.delete(key)
+        @store.transaction { |s| s.delete key}
+      end
+
+      def each
+        @keys.each_key do |key|
+          value = nil
+          @store.transaction { |s| value = s[key] }
+          yield key, value
+        end
+      end
+
+      def merge!(hash)
+        @store.transaction do |s|
+          hash.each { |key, value| s[key] = value; @keys[key] = nil }
+        end
+        self
+      end
+
+    end
+  end
+end
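
The stdlib PStore only exposes its contents inside a transaction, so this adapter mirrors every key in an in-memory Hash (`@keys`) and delegates `has_key?`, `keys`, and `size` to that index; each read and write opens its own transaction. A small round-trip sketch (the file name is illustrative):

    require 'anemone/storage'

    store = Anemone::Storage.PStore('demo.pstore')  # note: deletes any existing file first
    store['http://example.com/'] = 'value'          # one PStore transaction per write
    store.size                                      # => 1, answered from the in-memory key index
    store.each { |key, value| p [key, value] }      # one transaction per key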
data/lib/anemone/storage/tokyo_cabinet.rb
ADDED
@@ -0,0 +1,57 @@
+begin
+  require 'tokyocabinet'
+rescue LoadError
+  puts "You need the tokyocabinet gem to use Anemone::Storage::TokyoCabinet"
+  exit
+end
+
+require 'forwardable'
+
+module Anemone
+  module Storage
+    class TokyoCabinet
+      extend Forwardable
+
+      def_delegators :@db, :close, :size, :keys, :has_key?
+
+      def initialize(file)
+        raise "TokyoCabinet filename must have .tch extension" if File.extname(file) != '.tch'
+        @db = ::TokyoCabinet::HDB::new
+        @db.open(file, ::TokyoCabinet::HDB::OWRITER | ::TokyoCabinet::HDB::OCREAT)
+        @db.clear
+      end
+
+      def [](key)
+        if value = @db[key]
+          load_value(value)
+        end
+      end
+
+      def []=(key, value)
+        @db[key] = [Marshal.dump(value)].pack("m")
+      end
+
+      def delete(key)
+        value = self[key]
+        @db.delete(key)
+        value
+      end
+
+      def each
+        @db.each { |k, v| yield k, load_value(v) }
+      end
+
+      def merge!(hash)
+        hash.each { |key, value| self[key] = value }
+        self
+      end
+
+      private
+
+      def load_value(value)
+        Marshal.load(value.unpack("m")[0])
+      end
+
+    end
+  end
+end
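
Tokyo Cabinet's hash database stores plain strings, so the adapter serializes each value with Marshal and Base64-encodes it via `Array#pack("m")`, reversing both steps on read. The round-trip in isolation:

    value   = {'url' => 'http://example.com/'}
    encoded = [Marshal.dump(value)].pack('m')        # Base64 text, safe to store in the HDB
    Marshal.load(encoded.unpack('m')[0]) == value    # => true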
data/lib/anemone/tentacle.rb
CHANGED
@@ -2,7 +2,7 @@ require 'anemone/http'
 
 module Anemone
   class Tentacle
+
     #
     # Create a new Tentacle
     #
@@ -12,18 +12,18 @@ module Anemone
       @http = Anemone::HTTP.new(opts)
       @opts = opts
     end
+
     #
     # Gets links from @link_queue, and returns the fetched
     # Page objects into @page_queue
     #
     def run
       loop do
-        link,
+        link, referer, depth = @link_queue.deq
+
         break if link == :END
 
-        @
+        @http.fetch_pages(link, referer, depth).each { |page| @page_queue << page }
 
         delay
       end
@@ -32,8 +32,8 @@ module Anemone
     private
 
     def delay
-      sleep @opts[:delay] if @opts[:delay]
+      sleep @opts[:delay] if @opts[:delay] > 0
     end
 
   end
-end
+end
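
The queue protocol changed here: each entry dequeued by `run` is now a `[link, referer, depth]` triple, so fetched pages can record where they came from and how deep they are, and `delay` now assumes `opts[:delay]` is a number, sleeping only when it is positive. A hedged sketch of driving a Tentacle directly; it assumes the constructor takes the two queues plus an options hash, which the instance variables above suggest but this diff does not show:

    require 'anemone'
    require 'thread'  # Queue

    link_queue, page_queue = Queue.new, Queue.new
    tentacle = Anemone::Tentacle.new(link_queue, page_queue, :delay => 0)

    Thread.new { tentacle.run }
    link_queue << [URI('http://example.com/'), nil, 0]  # link, referer, depth
    link_queue << :END                                  # run's loop exits on :END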
data/spec/anemone_spec.rb
CHANGED
@@ -1,15 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
+
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
 
-  it "should return a Anemone::Core from the crawl, which has a
+  it "should return a Anemone::Core from the crawl, which has a PageStore" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
-    result.pages.should be_an_instance_of(Anemone::
+    result.pages.should be_an_instance_of(Anemone::PageStore)
   end
+
 end
data/spec/core_spec.rb
CHANGED
@@ -1,178 +1,222 @@
 require File.dirname(__FILE__) + '/spec_helper'
+%w[pstore tokyo_cabinet].each { |file| require "anemone/storage/#{file}.rb" }
 
 module Anemone
   describe Core do
+
     before(:each) do
       FakeWeb.clean_registry
     end
-      pages
+
+    shared_examples_for "crawl" do
+      it "should crawl all the html pages in a domain by following <a> href's" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1', :links => ['3'])
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        Anemone.crawl(pages[0].url, @opts).should have(4).pages
+      end
+
+      it "should not leave the original domain" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include('http://www.other.com/')
+      end
+
+      it "should follow http redirects" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1', :redirect => '2')
+        pages << FakePage.new('2')
+
+        Anemone.crawl(pages[0].url, @opts).should have(3).pages
+      end
+
+      it "should accept multiple starting URLs" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2', :links => ['3'])
+        pages << FakePage.new('3')
+
+        Anemone.crawl([pages[0].url, pages[2].url], @opts).should have(4).pages
+      end
+
+      it "should include the query string when following links" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1?foo=1'])
+        pages << FakePage.new('1?foo=1')
+        pages << FakePage.new('1')
+
+        core = Anemone.crawl(pages[0].url, @opts)
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[2].url)
+      end
+
+      it "should be able to skip links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.skip_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should be able to call a block on every page" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        count = 0
+        Anemone.crawl(pages[0].url, @opts) do |a|
+          a.on_every_page { count += 1 }
+        end
+
+        count.should == 3
+      end
+
+      it "should not discard page bodies by default" do
+        Anemone.crawl(FakePage.new('0').url, @opts).pages.values.first.doc.should_not be_nil
+      end
+
+      it "should optionally discard page bodies to conserve memory" do
+        core = Anemone.crawl(FakePage.new('0').url, @opts.merge({:discard_page_bodies => true}))
+        core.pages.values.first.doc.should be_nil
+      end
+
+      it "should provide a focus_crawl method to select the links on each page to follow" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should_not include(pages[1].url)
+      end
+
+      it "should optionally delay between page requests" do
+        delay = 0.25
+
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+
+        start = Time.now
+        Anemone.crawl(pages[0].url, @opts.merge({:delay => delay}))
+        finish = Time.now
+
+        (finish - start).should satisfy {|t| t > delay * 2}
+      end
+
+      it "should optionally obey the robots exclusion protocol" do
+        pages = []
+        pages << FakePage.new('0', :links => '1')
+        pages << FakePage.new('1')
+        pages << FakePage.new('robots.txt',
+                              :body => "User-agent: *\nDisallow: /1",
+                              :content_type => 'text/plain')
+
+        core = Anemone.crawl(pages[0].url, @opts.merge({:obey_robots_txt => true}))
+        urls = core.pages.keys
+
+        urls.should include(pages[0].url)
+        urls.should_not include(pages[1].url)
+      end
+
+      describe "many pages" do
+        before(:each) do
+          @pages, size = [], 5
+
+          size.times do |n|
+            # register this page with a link to the next page
+            link = (n + 1).to_s if n + 1 < size
+            @pages << FakePage.new(n.to_s, :links => Array(link))
+          end
+        end
+
+        it "should track the page depth and referer" do
+          core = Anemone.crawl(@pages[0].url, @opts)
+          previous_page = nil
+
+          @pages.each_with_index do |page, i|
+            page = core.pages[page.url]
+            page.should be
+            page.depth.should == i
+
+            if previous_page
+              page.referer.should == previous_page.url
+            else
+              page.referer.should be_nil
+            end
+            previous_page = page
+          end
+        end
+
+        it "should optionally limit the depth of the crawl" do
+          core = Anemone.crawl(@pages[0].url, @opts.merge({:depth_limit => 3}))
+          core.should have(4).pages
+        end
+      end
+
     end
-        start = Time.now
-        Anemone.crawl(pages[0].url, :delay => delay)
-        finish = Time.now
-        (finish - start).should satisfy {|t| t > delay * 2}
+
+    describe Hash do
+      it_should_behave_like "crawl"
+
+      before(:all) do
+        @opts = {}
+      end
     end
-        urls.should_not include(pages[1].url)
+
+    describe Storage::PStore do
+      it_should_behave_like "crawl"
+
+      before(:each) do
+        @test_file = 'test.pstore'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => Storage.PStore(@test_file)}
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
+      end
     end
-    describe
+
+    describe Storage::TokyoCabinet do
+      it_should_behave_like "crawl"
+
       before(:each) do
-        @
-          # register this page with a link to the next page
-          link = (n + 1).to_s if n + 1 < size
-          @pages << FakePage.new(n.to_s, :links => Array(link))
-        end
-      end
-
-      it "should track the page depth and referer" do
-        core = Anemone.crawl(@pages[0].url)
-        previous_page = nil
-
-        @pages.each_with_index do |page, i|
-          page = core.pages[page.url]
-          page.should be
-          page.depth.should == i
-
-          if previous_page
-            page.referer.should == previous_page.url
-          else
-            page.referer.should be_nil
-          end
-          previous_page = page
-        end
+        @test_file = 'test.tch'
+        File.delete(@test_file) if File.exists?(@test_file)
+        @opts = {:storage => @store = Storage.TokyoCabinet(@test_file)}
       end
+
+      after(:each) do
+        @store.close
+      end
+
+      after(:all) do
+        File.delete(@test_file) if File.exists?(@test_file)
      end
    end
 
@@ -194,6 +238,25 @@ module Anemone
       core.opts[:depth_limit].should == 3
     end
 
+    it "should accept options via setter methods in the crawl block" do
+      core = Anemone.crawl(SPEC_DOMAIN) do |a|
+        a.verbose = false
+        a.threads = 2
+        a.discard_page_bodies = true
+        a.user_agent = 'test'
+        a.obey_robots_txt = true
+        a.depth_limit = 3
+      end
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
     it "should use 1 thread if a delay is requested" do
       Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
     end