anemone 0.1.2 → 0.2.0
- data/lib/anemone/anemone.rb +21 -6
- data/lib/anemone/core.rb +23 -9
- data/lib/anemone/http.rb +11 -5
- data/lib/anemone/page.rb +17 -7
- data/lib/anemone/page_hash.rb +14 -0
- data/lib/anemone/tentacle.rb +7 -3
- data/spec/anemone_spec.rb +5 -1
- data/spec/core_spec.rb +57 -4
- data/spec/fakeweb_helper.rb +5 -2
- data/spec/spec_helper.rb +1 -1
- metadata +2 -2
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.1.2'
+  VERSION = '0.2.0'
 
   #module-wide options
   def Anemone.options=(options)
@@ -20,21 +20,36 @@ module Anemone
   def Anemone.crawl(urls, options = {}, &block)
     Anemone.options = OpenStruct.new(options)
 
-    #by default, run 4 Tentacle threads to fetch pages
+    # by default, run 4 Tentacle threads to fetch pages
     Anemone.options.threads ||= 4
 
-    #disable verbose output by default
+    # disable verbose output by default
     Anemone.options.verbose ||= false
 
-    #by default, don't throw away the page response body after scanning it for links
+    # by default, don't throw away the page response body after scanning it for links
     Anemone.options.discard_page_bodies ||= false
 
-    #by default, identify self as Anemone/VERSION
+    # by default, identify self as Anemone/VERSION
     Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
 
-    #no delay between requests by default
+    # no delay between requests by default
     Anemone.options.delay ||= 0
+
+    # by default, don't obey the robots exclusion protocol
+    if Anemone.options.obey_robots_txt ||= false
+      begin
+        require 'robots'
+      rescue LoadError
+        warn "To support the robot exclusion protocol, install the robots gem:\n" \
+             "sudo gem sources -a http://gems.github.com\n" \
+             "sudo gem install fizx-robots"
+        exit
+      end
+    end
 
+    # by default, don't limit the depth of the crawl
+    Anemone.options.depth_limit ||= :infinity
+
     #use a single thread if a delay was requested
     if(Anemone.options.delay != 0)
       Anemone.options.threads = 1
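Taken together, the new options can be exercised like this (a minimal usage sketch, not taken from the gem; the URL is a placeholder, and :obey_robots_txt assumes the fizx-robots gem is installed):

    require 'anemone'

    # Hypothetical crawl using the options introduced in 0.2.0
    Anemone.crawl("http://example.com/",
                  :obey_robots_txt => true,   # skip URLs disallowed by robots.txt
                  :depth_limit => 3) do |anemone|  # ignore links more than 3 hops from the root
      anemone.on_every_page do |page|
        puts "#{page.depth} #{page.referer} #{page.url}"
      end
    end
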
data/lib/anemone/core.rb
CHANGED
@@ -23,6 +23,10 @@ module Anemone
       @skip_link_patterns = []
       @after_crawl_blocks = []
 
+      if Anemone.options.obey_robots_txt
+        @robots = Robots.new(Anemone.options.user_agent)
+      end
+
       block.call(self) if block
     end
 
@@ -113,18 +117,18 @@ module Anemone
 
         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
 
-        #perform the on_every_page blocks for this page
+        # perform the on_every_page blocks for this page
         do_page_blocks(page)
 
         page.doc = nil if Anemone.options.discard_page_bodies
 
         links_to_follow(page).each do |link|
-          link_queue.enq(link)
+          link_queue.enq([link, page])
           @pages[link] = nil
         end
 
-        #create an entry in the page hash for each alias of this page,
-        #i.e. all the pages that redirected to this page
+        # create an entry in the page hash for each alias of this page,
+        # i.e. all the pages that redirected to this page
         page.aliases.each do |aka|
           if !@pages.has_key?(aka) or @pages[aka].nil?
             @pages[aka] = page.alias_clone(aka)
@@ -184,16 +188,26 @@ module Anemone
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.select { |link| visit_link?(link) }
+      links.select { |link| visit_link?(link, page) }
     end
 
     #
     # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern
-    #
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
     #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
+    def visit_link?(link, from_page = nil)
+      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+      if from_page
+        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      else
+        too_deep = false
+      end
+
+      !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
    end
 
     #
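One detail worth noting in visit_link?: depth_limit defaults to the symbol :infinity, so the Integer comparison raises and the inline rescue treats the page as "not too deep". A standalone sketch of that behaviour (plain Ruby, not part of the gem):

    depth_limit = :infinity
    too_deep = 7 >= depth_limit rescue false   # comparison raises, so too_deep is false

    depth_limit = 3
    too_deep = 7 >= depth_limit rescue false   # ordinary Integer comparison, so too_deep is true
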
data/lib/anemone/http.rb
CHANGED
@@ -9,8 +9,8 @@ module Anemone
     # Retrieve an HTTP response for *url*, following redirects.
     # Returns the response object, response code, and final URI location.
     #
-    def self.get(url)
-      response = get_response(url)
+    def self.get(url, referer = nil)
+      response = get_response(url, referer)
       code = Integer(response.code)
       loc = url
 
@@ -18,7 +18,7 @@ module Anemone
       while response.is_a?(Net::HTTPRedirection) and limit > 0
         loc = URI(response['location'])
         loc = url.merge(loc) if loc.relative?
-        response = get_response(loc)
+        response = get_response(loc, referer)
         limit -= 1
       end
 
@@ -28,10 +28,16 @@ module Anemone
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
-    def self.get_response(url)
+    def self.get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+      user_agent = Anemone.options.user_agent rescue nil
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+
       Net::HTTP.start(url.host, url.port) do |http|
-        return http.get(full_path, {'User-Agent' => Anemone.options.user_agent})
+        return http.get(full_path, opts)
       end
     end
   end
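The second argument to Net::HTTP#get is a hash of request headers, so the crawler now identifies itself and reports the referring page on every request. A rough sketch of the request this produces (host, path, and header values are illustrative, not from the gem):

    require 'net/http'

    # Roughly what get_response sends for a link found on the site's index page
    headers = {
      'User-Agent' => 'Anemone/0.2.0',
      'Referer'    => 'http://example.com/index.html'
    }

    Net::HTTP.start('example.com', 80) do |http|
      response = http.get('/about', headers)
      puts response.code
    end
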
data/lib/anemone/page.rb
CHANGED
@@ -22,24 +22,32 @@ module Anemone
     attr_accessor :aliases
     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
     attr_accessor :visited
-    #
+    # Depth of this page from the root of the crawl. This is not necessarily the
+    # shortest path; use PageHash#shortest_paths! to find that value.
     attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
 
     #
     # Create a new Page from the response of an HTTP request to *url*
     #
-    def self.fetch(url)
+    def self.fetch(url, from_page = nil)
       begin
-        url = URI(url)
+        url = URI(url) unless url.is_a?(URI)
 
-        response, code, location = Anemone::HTTP.get(url)
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location = Anemone::HTTP.get(url, referer)
 
         aka = nil
         if !url.eql?(location)
           aka = location
         end
 
-        return Page.new(url, response.body, code, response.to_hash, aka)
+        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
       rescue
         return Page.new(url)
       end
@@ -48,14 +56,16 @@ module Anemone
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
       @url = url
       @code = code
       @headers = headers
       @links = []
       @aliases = []
       @data = OpenStruct.new
-
+      @referer = referer
+      @depth = depth || 0
+
       @aliases << aka if !aka.nil?
 
       if body
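The constructor change is easiest to see in isolation. A contrived sketch (URLs are placeholders; a real crawl builds these objects through Page.fetch rather than by hand):

    require 'anemone'

    root  = Anemone::Page.new(URI('http://example.com/'))
    child = Anemone::Page.new(URI('http://example.com/about'), nil, nil, nil, nil,
                              root.url, root.depth + 1)

    root.depth      # => 0, the default when no referer is given
    child.depth     # => 1
    child.referer   # => root.url
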
data/lib/anemone/page_hash.rb
CHANGED
@@ -1,6 +1,20 @@
 module Anemone
   class PageHash < Hash
 
+    # We typically index the hash with a URI,
+    # but convert it to a String for easier retrieval
+    def [](index)
+      super(index.to_s)
+    end
+
+    def []=(index, other)
+      super(index.to_s, other)
+    end
+
+    def has_key?(key)
+      super(key.to_s)
+    end
+
     #
     # Use a breadth-first search to calculate the single-source
     # shortest paths from *root* to all pages in the PageHash
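In practice this means URIs and Strings now address the same entry. A quick illustration (the URL and stored value are placeholders):

    require 'anemone'

    hash = Anemone::PageHash.new
    url  = URI('http://example.com/')

    hash[url] = :some_page            # stored under the String form of the URI
    hash['http://example.com/']       # => :some_page
    hash.has_key?(url)                # => true
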
data/lib/anemone/tentacle.rb
CHANGED
@@ -17,11 +17,15 @@ module Anemone
     #
     def run
       while true do
-        link = @link_queue.deq
+        link, from_page = @link_queue.deq
 
         break if link == :END
-
-        page = Page.fetch(link)
+
+        if from_page
+          page = Page.fetch(link, from_page)
+        else
+          page = Page.fetch(link)
+        end
 
         @page_queue.enq(page)
 
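Destructuring assignment keeps the :END sentinel working: deq-ing a bare symbol assigns it to link and leaves from_page nil. A standalone sketch of that behaviour (URL is a placeholder, not the gem's own queue handling):

    require 'thread'
    require 'uri'

    queue = Queue.new
    queue.enq([URI('http://example.com/about'), nil])  # a link with no originating page
    queue.enq(:END)

    link, from_page = queue.deq   # link is a URI, from_page is nil
    link, from_page = queue.deq   # link is :END, from_page is nil, so the loop breaks
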
data/spec/anemone_spec.rb
CHANGED
@@ -14,12 +14,16 @@ describe Anemone do
     Anemone.crawl(SPEC_DOMAIN, :verbose => false,
                   :threads => 2,
                   :discard_page_bodies => true,
-                  :user_agent => 'test')
+                  :user_agent => 'test',
+                  :obey_robots_txt => true,
+                  :depth_limit => 3)
     Anemone.options.verbose.should == false
     Anemone.options.threads.should == 2
     Anemone.options.discard_page_bodies.should == true
     Anemone.options.delay.should == 0
     Anemone.options.user_agent.should == 'test'
+    Anemone.options.obey_robots_txt.should == true
+    Anemone.options.depth_limit.should == 3
   end
 
   it "should use 1 thread if a delay is requested" do
data/spec/core_spec.rb
CHANGED
@@ -25,7 +25,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.
+      core.pages.keys.should_not include('http://www.other.com/')
     end
 
     it "should follow http redirects" do
@@ -56,7 +56,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.
+      core.pages.keys.should_not include(pages[2].url)
     end
 
     it "should be able to skip links based on a RegEx" do
@@ -70,7 +70,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should be able to call a block on every page" do
@@ -107,7 +107,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should optionally delay between page requests" do
@@ -123,6 +123,59 @@ module Anemone
 
       (finish - start).should satisfy {|t| t > delay * 2}
     end
+
+    it "should optionally obey the robots exclusion protocol" do
+      pages = []
+      pages << FakePage.new('0', :links => '1')
+      pages << FakePage.new('1')
+      pages << FakePage.new('robots.txt',
+                            :body => "User-agent: *\nDisallow: /1",
+                            :content_type => 'text/plain')
+
+      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+      urls = core.pages.keys
+
+      urls.should include(pages[0].url)
+      urls.should_not include(pages[1].url)
+    end
+
+    it "should track the page depth and referer" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url)
+
+      num_pages.times do |n|
+        page = core.pages[pages[n].url]
+        page.depth.should == n
+        page.referer.should == core.pages[pages[n-1].url].url if n > 0
+      end
+
+      core.pages[pages[0].url].referer.should == nil
+    end
 
+    it "should optionally limit the depth of the crawl" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
+
+      core.should have(4).pages
+    end
+
   end
 end
data/spec/fakeweb_helper.rb
CHANGED
@@ -13,14 +13,17 @@ module Anemone
   class FakePage
     attr_accessor :links
     attr_accessor :hrefs
+    attr_accessor :body
 
     def initialize(name = '', options = {})
       @name = name
       @links = [options[:links]].flatten if options.has_key?(:links)
       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
       @redirect = options[:redirect] if options.has_key?(:redirect)
+      @content_type = options[:content_type] || "text/html"
+      @body = options[:body]
 
-      create_body
+      create_body unless @body
       add_to_fakeweb
     end
 
@@ -38,7 +41,7 @@ module Anemone
     end
 
     def add_to_fakeweb
-      options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
+      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
 
       if @redirect
         options[:status] = [301, "Permanently Moved"]
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-09-07 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency