anemone 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/anemone/anemone.rb +21 -6
- data/lib/anemone/core.rb +23 -9
- data/lib/anemone/http.rb +11 -5
- data/lib/anemone/page.rb +17 -7
- data/lib/anemone/page_hash.rb +14 -0
- data/lib/anemone/tentacle.rb +7 -3
- data/spec/anemone_spec.rb +5 -1
- data/spec/core_spec.rb +57 -4
- data/spec/fakeweb_helper.rb +5 -2
- data/spec/spec_helper.rb +1 -1
- metadata +2 -2
data/lib/anemone/anemone.rb
CHANGED
@@ -3,7 +3,7 @@ require 'anemone/core'
|
|
3
3
|
|
4
4
|
module Anemone
|
5
5
|
# Version number
|
6
|
-
VERSION = '0.1.2'
|
6
|
+
VERSION = '0.2.0'
|
7
7
|
|
8
8
|
#module-wide options
|
9
9
|
def Anemone.options=(options)
|
@@ -20,21 +20,36 @@ module Anemone
|
|
20
20
|
def Anemone.crawl(urls, options = {}, &block)
|
21
21
|
Anemone.options = OpenStruct.new(options)
|
22
22
|
|
23
|
-
#by default, run 4 Tentacle threads to fetch pages
|
23
|
+
# by default, run 4 Tentacle threads to fetch pages
|
24
24
|
Anemone.options.threads ||= 4
|
25
25
|
|
26
|
-
#disable verbose output by default
|
26
|
+
# disable verbose output by default
|
27
27
|
Anemone.options.verbose ||= false
|
28
28
|
|
29
|
-
#by default, don't throw away the page response body after scanning it for links
|
29
|
+
# by default, don't throw away the page response body after scanning it for links
|
30
30
|
Anemone.options.discard_page_bodies ||= false
|
31
31
|
|
32
|
-
#by default, identify self as Anemone/VERSION
|
32
|
+
# by default, identify self as Anemone/VERSION
|
33
33
|
Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
|
34
34
|
|
35
|
-
#no delay between requests by default
|
35
|
+
# no delay between requests by default
|
36
36
|
Anemone.options.delay ||= 0
|
37
|
+
|
38
|
+
# by default, don't obey the robots exclusion protocol
|
39
|
+
if Anemone.options.obey_robots_txt ||= false
|
40
|
+
begin
|
41
|
+
require 'robots'
|
42
|
+
rescue LoadError
|
43
|
+
warn "To support the robot exclusion protocol, install the robots gem:\n" \
|
44
|
+
"sudo gem sources -a http://gems.github.com\n" \
|
45
|
+
"sudo gem install fizx-robots"
|
46
|
+
exit
|
47
|
+
end
|
48
|
+
end
|
37
49
|
|
50
|
+
# by default, don't limit the depth of the crawl
|
51
|
+
Anemone.options.depth_limit ||= :infinity
|
52
|
+
|
38
53
|
#use a single thread if a delay was requested
|
39
54
|
if(Anemone.options.delay != 0)
|
40
55
|
Anemone.options.threads = 1
|
data/lib/anemone/core.rb
CHANGED
@@ -23,6 +23,10 @@ module Anemone
|
|
23
23
|
@skip_link_patterns = []
|
24
24
|
@after_crawl_blocks = []
|
25
25
|
|
26
|
+
if Anemone.options.obey_robots_txt
|
27
|
+
@robots = Robots.new(Anemone.options.user_agent)
|
28
|
+
end
|
29
|
+
|
26
30
|
block.call(self) if block
|
27
31
|
end
|
28
32
|
|
@@ -113,18 +117,18 @@ module Anemone
|
|
113
117
|
|
114
118
|
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
115
119
|
|
116
|
-
#perform the on_every_page blocks for this page
|
120
|
+
# perform the on_every_page blocks for this page
|
117
121
|
do_page_blocks(page)
|
118
122
|
|
119
123
|
page.doc = nil if Anemone.options.discard_page_bodies
|
120
124
|
|
121
125
|
links_to_follow(page).each do |link|
|
122
|
-
link_queue.enq(link)
|
126
|
+
link_queue.enq([link, page])
|
123
127
|
@pages[link] = nil
|
124
128
|
end
|
125
129
|
|
126
|
-
#create an entry in the page hash for each alias of this page,
|
127
|
-
#i.e. all the pages that redirected to this page
|
130
|
+
# create an entry in the page hash for each alias of this page,
|
131
|
+
# i.e. all the pages that redirected to this page
|
128
132
|
page.aliases.each do |aka|
|
129
133
|
if !@pages.has_key?(aka) or @pages[aka].nil?
|
130
134
|
@pages[aka] = page.alias_clone(aka)
|
@@ -184,16 +188,26 @@ module Anemone
|
|
184
188
|
#
|
185
189
|
def links_to_follow(page)
|
186
190
|
links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
|
187
|
-
links.select { |link| visit_link?(link) }
|
191
|
+
links.select { |link| visit_link?(link, page) }
|
188
192
|
end
|
189
193
|
|
190
194
|
#
|
191
195
|
# Returns +true+ if *link* has not been visited already,
|
192
|
-
# and is not excluded by a skip_link pattern
|
193
|
-
#
|
196
|
+
# and is not excluded by a skip_link pattern...
|
197
|
+
# and is not excluded by robots.txt...
|
198
|
+
# and is not deeper than the depth limit
|
199
|
+
# Returns +false+ otherwise.
|
194
200
|
#
|
195
|
-
def visit_link?(link)
|
196
|
-
!@pages.has_key?(link) and !skip_link?(link)
|
201
|
+
def visit_link?(link, from_page = nil)
|
202
|
+
allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
|
203
|
+
|
204
|
+
if from_page
|
205
|
+
too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
|
206
|
+
else
|
207
|
+
too_deep = false
|
208
|
+
end
|
209
|
+
|
210
|
+
!@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
|
197
211
|
end
|
198
212
|
|
199
213
|
#
|
data/lib/anemone/http.rb
CHANGED
@@ -9,8 +9,8 @@ module Anemone
|
|
9
9
|
# Retrieve an HTTP response for *url*, following redirects.
|
10
10
|
# Returns the response object, response code, and final URI location.
|
11
11
|
#
|
12
|
-
def self.get(url)
|
13
|
-
response = get_response(url)
|
12
|
+
def self.get(url, referer = nil)
|
13
|
+
response = get_response(url, referer)
|
14
14
|
code = Integer(response.code)
|
15
15
|
loc = url
|
16
16
|
|
@@ -18,7 +18,7 @@ module Anemone
|
|
18
18
|
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
19
19
|
loc = URI(response['location'])
|
20
20
|
loc = url.merge(loc) if loc.relative?
|
21
|
-
response = get_response(loc)
|
21
|
+
response = get_response(loc, referer)
|
22
22
|
limit -= 1
|
23
23
|
end
|
24
24
|
|
@@ -28,10 +28,16 @@ module Anemone
|
|
28
28
|
#
|
29
29
|
# Get an HTTPResponse for *url*, sending the appropriate User-Agent string
|
30
30
|
#
|
31
|
-
def self.get_response(url)
|
31
|
+
def self.get_response(url, referer = nil)
|
32
32
|
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
33
|
+
user_agent = Anemone.options.user_agent rescue nil
|
34
|
+
|
35
|
+
opts = {}
|
36
|
+
opts['User-Agent'] = user_agent if user_agent
|
37
|
+
opts['Referer'] = referer.to_s if referer
|
38
|
+
|
33
39
|
Net::HTTP.start(url.host, url.port) do |http|
|
34
|
-
return http.get(full_path, {'User-Agent' => Anemone.options.user_agent})
|
40
|
+
return http.get(full_path, opts)
|
35
41
|
end
|
36
42
|
end
|
37
43
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -22,24 +22,32 @@ module Anemone
|
|
22
22
|
attr_accessor :aliases
|
23
23
|
# Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
|
24
24
|
attr_accessor :visited
|
25
|
-
#
|
25
|
+
# Depth of this page from the root of the crawl. This is not necessarily the
|
26
|
+
# shortest path; use PageHash#shortest_paths! to find that value.
|
26
27
|
attr_accessor :depth
|
28
|
+
# URL of the page that brought us to this page
|
29
|
+
attr_accessor :referer
|
27
30
|
|
28
31
|
#
|
29
32
|
# Create a new Page from the response of an HTTP request to *url*
|
30
33
|
#
|
31
|
-
def self.fetch(url)
|
34
|
+
def self.fetch(url, from_page = nil)
|
32
35
|
begin
|
33
|
-
url = URI(url)
|
36
|
+
url = URI(url) unless url.is_a?(URI)
|
34
37
|
|
35
|
-
|
38
|
+
if from_page
|
39
|
+
referer = from_page.url
|
40
|
+
depth = from_page.depth + 1
|
41
|
+
end
|
42
|
+
|
43
|
+
response, code, location = Anemone::HTTP.get(url, referer)
|
36
44
|
|
37
45
|
aka = nil
|
38
46
|
if !url.eql?(location)
|
39
47
|
aka = location
|
40
48
|
end
|
41
49
|
|
42
|
-
return Page.new(url, response.body, code, response.to_hash, aka)
|
50
|
+
return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
|
43
51
|
rescue
|
44
52
|
return Page.new(url)
|
45
53
|
end
|
@@ -48,14 +56,16 @@ module Anemone
|
|
48
56
|
#
|
49
57
|
# Create a new page
|
50
58
|
#
|
51
|
-
def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
|
59
|
+
def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
|
52
60
|
@url = url
|
53
61
|
@code = code
|
54
62
|
@headers = headers
|
55
63
|
@links = []
|
56
64
|
@aliases = []
|
57
65
|
@data = OpenStruct.new
|
58
|
-
|
66
|
+
@referer = referer
|
67
|
+
@depth = depth || 0
|
68
|
+
|
59
69
|
@aliases << aka if !aka.nil?
|
60
70
|
|
61
71
|
if body
|
data/lib/anemone/page_hash.rb
CHANGED
@@ -1,6 +1,20 @@
|
|
1
1
|
module Anemone
|
2
2
|
class PageHash < Hash
|
3
3
|
|
4
|
+
# We typically index the hash with a URI,
|
5
|
+
# but convert it to a String for easier retrieval
|
6
|
+
def [](index)
|
7
|
+
super(index.to_s)
|
8
|
+
end
|
9
|
+
|
10
|
+
def []=(index, other)
|
11
|
+
super(index.to_s, other)
|
12
|
+
end
|
13
|
+
|
14
|
+
def has_key?(key)
|
15
|
+
super(key.to_s)
|
16
|
+
end
|
17
|
+
|
4
18
|
#
|
5
19
|
# Use a breadth-first search to calculate the single-source
|
6
20
|
# shortest paths from *root* to all pages in the PageHash
|
data/lib/anemone/tentacle.rb
CHANGED
@@ -17,11 +17,15 @@ module Anemone
|
|
17
17
|
#
|
18
18
|
def run
|
19
19
|
while true do
|
20
|
-
link = @link_queue.deq
|
20
|
+
link, from_page = @link_queue.deq
|
21
21
|
|
22
22
|
break if link == :END
|
23
|
-
|
24
|
-
page = Page.fetch(link)
|
23
|
+
|
24
|
+
if from_page
|
25
|
+
page = Page.fetch(link, from_page)
|
26
|
+
else
|
27
|
+
page = Page.fetch(link)
|
28
|
+
end
|
25
29
|
|
26
30
|
@page_queue.enq(page)
|
27
31
|
|
data/spec/anemone_spec.rb
CHANGED
@@ -14,12 +14,16 @@ describe Anemone do
|
|
14
14
|
Anemone.crawl(SPEC_DOMAIN, :verbose => false,
|
15
15
|
:threads => 2,
|
16
16
|
:discard_page_bodies => true,
|
17
|
-
:user_agent => 'test'
|
17
|
+
:user_agent => 'test',
|
18
|
+
:obey_robots_txt => true,
|
19
|
+
:depth_limit => 3)
|
18
20
|
Anemone.options.verbose.should == false
|
19
21
|
Anemone.options.threads.should == 2
|
20
22
|
Anemone.options.discard_page_bodies.should == true
|
21
23
|
Anemone.options.delay.should == 0
|
22
24
|
Anemone.options.user_agent.should == 'test'
|
25
|
+
Anemone.options.obey_robots_txt.should == true
|
26
|
+
Anemone.options.depth_limit.should == 3
|
23
27
|
end
|
24
28
|
|
25
29
|
it "should use 1 thread if a delay is requested" do
|
data/spec/core_spec.rb
CHANGED
@@ -25,7 +25,7 @@ module Anemone
|
|
25
25
|
core = Anemone.crawl(pages[0].url)
|
26
26
|
|
27
27
|
core.should have(2).pages
|
28
|
-
core.pages.keys.
|
28
|
+
core.pages.keys.should_not include('http://www.other.com/')
|
29
29
|
end
|
30
30
|
|
31
31
|
it "should follow http redirects" do
|
@@ -56,7 +56,7 @@ module Anemone
|
|
56
56
|
core = Anemone.crawl(pages[0].url)
|
57
57
|
|
58
58
|
core.should have(2).pages
|
59
|
-
core.pages.keys.
|
59
|
+
core.pages.keys.should_not include(pages[2].url)
|
60
60
|
end
|
61
61
|
|
62
62
|
it "should be able to skip links based on a RegEx" do
|
@@ -70,7 +70,7 @@ module Anemone
|
|
70
70
|
end
|
71
71
|
|
72
72
|
core.should have(2).pages
|
73
|
-
core.pages.keys.
|
73
|
+
core.pages.keys.should_not include(pages[1].url)
|
74
74
|
end
|
75
75
|
|
76
76
|
it "should be able to call a block on every page" do
|
@@ -107,7 +107,7 @@ module Anemone
|
|
107
107
|
end
|
108
108
|
|
109
109
|
core.should have(2).pages
|
110
|
-
core.pages.keys.
|
110
|
+
core.pages.keys.should_not include(pages[1].url)
|
111
111
|
end
|
112
112
|
|
113
113
|
it "should optionally delay between page requests" do
|
@@ -123,6 +123,59 @@ module Anemone
|
|
123
123
|
|
124
124
|
(finish - start).should satisfy {|t| t > delay * 2}
|
125
125
|
end
|
126
|
+
|
127
|
+
it "should optionally obey the robots exclusion protocol" do
|
128
|
+
pages = []
|
129
|
+
pages << FakePage.new('0', :links => '1')
|
130
|
+
pages << FakePage.new('1')
|
131
|
+
pages << FakePage.new('robots.txt',
|
132
|
+
:body => "User-agent: *\nDisallow: /1",
|
133
|
+
:content_type => 'text/plain')
|
134
|
+
|
135
|
+
core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
|
136
|
+
urls = core.pages.keys
|
137
|
+
|
138
|
+
urls.should include(pages[0].url)
|
139
|
+
urls.should_not include(pages[1].url)
|
140
|
+
end
|
141
|
+
|
142
|
+
it "should track the page depth and referer" do
|
143
|
+
num_pages = 5
|
144
|
+
|
145
|
+
pages = []
|
146
|
+
|
147
|
+
num_pages.times do |n|
|
148
|
+
# register this page with a link to the next page
|
149
|
+
link = (n + 1).to_s if n + 1 < num_pages
|
150
|
+
pages << FakePage.new(n.to_s, :links => [link].compact)
|
151
|
+
end
|
152
|
+
|
153
|
+
core = Anemone.crawl(pages[0].url)
|
154
|
+
|
155
|
+
num_pages.times do |n|
|
156
|
+
page = core.pages[pages[n].url]
|
157
|
+
page.depth.should == n
|
158
|
+
page.referer.should == core.pages[pages[n-1].url].url if n > 0
|
159
|
+
end
|
160
|
+
|
161
|
+
core.pages[pages[0].url].referer.should == nil
|
162
|
+
end
|
126
163
|
|
164
|
+
it "should optionally limit the depth of the crawl" do
|
165
|
+
num_pages = 5
|
166
|
+
|
167
|
+
pages = []
|
168
|
+
|
169
|
+
num_pages.times do |n|
|
170
|
+
# register this page with a link to the next page
|
171
|
+
link = (n + 1).to_s if n + 1 < num_pages
|
172
|
+
pages << FakePage.new(n.to_s, :links => [link].compact)
|
173
|
+
end
|
174
|
+
|
175
|
+
core = Anemone.crawl(pages[0].url, :depth_limit => 3)
|
176
|
+
|
177
|
+
core.should have(4).pages
|
178
|
+
end
|
179
|
+
|
127
180
|
end
|
128
181
|
end
|
data/spec/fakeweb_helper.rb
CHANGED
@@ -13,14 +13,17 @@ module Anemone
|
|
13
13
|
class FakePage
|
14
14
|
attr_accessor :links
|
15
15
|
attr_accessor :hrefs
|
16
|
+
attr_accessor :body
|
16
17
|
|
17
18
|
def initialize(name = '', options = {})
|
18
19
|
@name = name
|
19
20
|
@links = [options[:links]].flatten if options.has_key?(:links)
|
20
21
|
@hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
|
21
22
|
@redirect = options[:redirect] if options.has_key?(:redirect)
|
23
|
+
@content_type = options[:content_type] || "text/html"
|
24
|
+
@body = options[:body]
|
22
25
|
|
23
|
-
create_body
|
26
|
+
create_body unless @body
|
24
27
|
add_to_fakeweb
|
25
28
|
end
|
26
29
|
|
@@ -38,7 +41,7 @@ module Anemone
|
|
38
41
|
end
|
39
42
|
|
40
43
|
def add_to_fakeweb
|
41
|
-
options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
|
44
|
+
options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
|
42
45
|
|
43
46
|
if @redirect
|
44
47
|
options[:status] = [301, "Permanently Moved"]
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-07 00:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|