anemone 0.2.2 → 0.2.3
- data/CHANGELOG.rdoc +27 -0
- data/README.rdoc +1 -3
- data/lib/anemone.rb +1 -1
- data/lib/anemone/core.rb +57 -24
- data/lib/anemone/http.rb +19 -6
- data/lib/anemone/page.rb +1 -1
- data/lib/anemone/tentacle.rb +11 -4
- data/spec/anemone_spec.rb +0 -35
- data/spec/core_spec.rb +28 -2
- data/spec/fakeweb_helper.rb +0 -1
- metadata +13 -3
- data/lib/anemone/anemone.rb +0 -54
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,27 @@
+== 0.2.3 / 2009-11-01
+
+* Minor enhancements
+
+  * Options are now applied per-crawl, rather than module-wide.
+
+* Bug fixes
+
+  * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+== 0.2.2 / 2009-10-26
+
+* Minor enhancements
+
+  * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+== 0.2.1 / 2009-10-24
+
+* Major enhancements
+
+  * Added HTTPS support.
+  * CLI program 'anemone', which is a frontend for several tasks.
+
+* Minor enhancements
+
+  * HTTP request response time recorded in Page.
+  * Use of persistent HTTP connections.
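The headline change in 0.2.3 is visible throughout the diffs below: options are applied per crawl instead of being stored module-wide, so settings from one crawl no longer leak into the next. A minimal sketch of the new calling style, with a placeholder URL and hypothetical skip patterns:

    require 'anemone'

    # Options are passed to the crawl itself and merged over Core::DEFAULT_OPTS.
    core = Anemone.crawl("http://example.com/", :verbose => true, :depth_limit => 3) do |a|
      # skip_links_like takes one or more Regexp patterns
      a.skip_links_like /login/, /\.pdf$/
    end

    core.opts[:depth_limit]   # => 3, readable on the returned Core
    core.pages.keys           # => URLs of every crawled page (a PageHash)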
data/README.rdoc
CHANGED
data/lib/anemone.rb
CHANGED
@@ -1,2 +1,2 @@
 require 'rubygems'
-require 'anemone/
+require 'anemone/core'
data/lib/anemone/core.rb
CHANGED
@@ -1,19 +1,51 @@
-require 'net/http'
 require 'thread'
+require 'robots'
 require 'anemone/tentacle'
 require 'anemone/page'
 require 'anemone/page_hash'
 
 module Anemone
+
+  VERSION = '0.2.3';
+
+  #
+  # Convenience method to start a crawl
+  #
+  def Anemone.crawl(urls, options = {}, &block)
+    Core.crawl(urls, options, &block)
+  end
+
   class Core
     # PageHash storing all Page objects encountered during the crawl
     attr_reader :pages
-
+
+    # Hash of options for the crawl
+    attr_accessor :opts
+
+    DEFAULT_OPTS = {
+      # run 4 Tentacle threads to fetch pages
+      :threads => 4,
+      # disable verbose output
+      :verbose => false,
+      # don't throw away the page response body after scanning it for links
+      :discard_page_bodies => false,
+      # identify self as Anemone/VERSION
+      :user_agent => "Anemone/#{Anemone::VERSION}",
+      # no delay between requests
+      :delay => 0,
+      # don't obey the robots exclusion protocol
+      :obey_robots_txt => false,
+      # by default, don't limit the depth of the crawl
+      :depth_limit => false,
+      # number of times HTTP redirects will be followed
+      :redirect_limit => 5
+    }
+
    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
-    def initialize(urls)
+    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }
 
@@ -23,10 +55,8 @@ module Anemone
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
-
-
-      @robots = Robots.new(Anemone.options.user_agent)
-      end
+
+      process_options opts
 
      yield self if block_given?
    end
@@ -34,8 +64,8 @@ module Anemone
    #
    # Convenience method to start a new crawl
    #
-    def self.crawl(
-      self.new(
+    def self.crawl(urls, opts = {})
+      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
@@ -55,11 +85,7 @@ module Anemone
    # followed
    #
    def skip_links_like(*patterns)
-
-      patterns.each do |pattern|
-        @skip_link_patterns << pattern
-      end
-    end
+      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end
 
@@ -104,8 +130,8 @@ module Anemone
      link_queue = Queue.new
      page_queue = Queue.new
 
-
-      @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
+      @opts[:threads].times do
+        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end
 
      @urls.each{ |url| link_queue.enq(url) }
@@ -115,12 +141,12 @@ module Anemone
 
        @pages[page.url] = page
 
-        puts "#{page.url} Queue: #{link_queue.size}" if
+        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
 
        # perform the on_every_page blocks for this page
        do_page_blocks(page)
 
-        page.discard_doc! if
+        page.discard_doc! if @opts[:discard_page_bodies]
 
        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
@@ -158,7 +184,15 @@ module Anemone
    end
 
    private
-
+
+    def process_options(options)
+      @opts = DEFAULT_OPTS.merge options
+
+      @opts[:threads] = 1 if @opts[:delay] > 0
+
+      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+    end
+
    #
    # Execute the after_crawl blocks
    #
@@ -199,10 +233,10 @@ module Anemone
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
-      allowed =
+      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
 
-      if from_page
-        too_deep = from_page.depth >=
+      if from_page && @opts[:depth_limit]
+        too_deep = from_page.depth >= @opts[:depth_limit]
      else
        too_deep = false
      end
@@ -215,8 +249,7 @@ module Anemone
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
-      @skip_link_patterns.
-      false
+      @skip_link_patterns.any? { |p| link.path =~ p }
    end
 
  end
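The net effect of the new process_options is that each crawl starts from DEFAULT_OPTS and only overrides what the caller supplies, with two adjustments: a delay forces a single Tentacle thread, and a Robots checker is only built when :obey_robots_txt is set. A rough sketch of the observable behaviour, using a placeholder URL:

    core = Anemone.crawl("http://example.com/", :threads => 2, :delay => 0.01)
    core.opts[:threads]         # => 1, the delay overrides the requested thread count
    core.opts[:redirect_limit]  # => 5, untouched default from DEFAULT_OPTS
    core.opts[:user_agent]      # => "Anemone/0.2.3", also an untouched default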
data/lib/anemone/http.rb
CHANGED
@@ -4,10 +4,11 @@ require 'anemone/page'
 module Anemone
   class HTTP
     # Maximum number of redirects to follow on each get_response
-
+    REDIRECT_LIMIT = 5
 
-    def initialize
+    def initialize(opts = {})
      @connections = {}
+      @opts = opts
    end
 
    #
@@ -31,7 +32,7 @@ module Anemone
 
        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
      rescue => e
-        if
+        if verbose?
          puts e.inspect
          puts e.backtrace
        end
@@ -50,7 +51,7 @@ module Anemone
      code = Integer(response.code)
      loc = url
 
-      limit =
+      limit = redirect_limit
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
@@ -66,7 +67,6 @@ module Anemone
    #
    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
-      user_agent = Anemone.options.user_agent rescue nil
 
      opts = {}
      opts['User-Agent'] = user_agent if user_agent
@@ -82,7 +82,7 @@ module Anemone
      rescue EOFError
        refresh_connection(url)
        retries += 1
-        retry unless retries >
+        retry unless retries > 3
      end
    end
 
@@ -104,5 +104,18 @@ module Anemone
      end
      @connections[url.host][url.port] = http.start
    end
+
+    def redirect_limit
+      @opts[:redirect_limit] || REDIRECT_LIMIT
+    end
+
+    def user_agent
+      @opts[:user_agent]
+    end
+
+    def verbose?
+      @opts[:verbose]
+    end
+
  end
 end
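Because Core now hands its options hash to every Tentacle, and each Tentacle passes it on to HTTP.new, per-request behaviour is configured in the same call as everything else. A sketch of how those options are expected to flow through, with a placeholder URL and agent string:

    Anemone.crawl("http://example.com/",
                  :user_agent     => "MyBot/1.0",  # sent as the User-Agent header
                  :redirect_limit => 2,            # overrides the REDIRECT_LIMIT default of 5
                  :verbose        => true)         # prints exception backtraces on fetch errors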
data/lib/anemone/page.rb
CHANGED
@@ -33,7 +33,7 @@
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
-      @headers = headers
+      @headers = headers || {}
      @headers['content-type'] ||= ['']
      @aliases = Array(aka)
      @data = OpenStruct.new
data/lib/anemone/tentacle.rb
CHANGED
@@ -6,10 +6,11 @@ module Anemone
    #
    # Create a new Tentacle
    #
-    def initialize(link_queue, page_queue)
+    def initialize(link_queue, page_queue, opts = {})
      @link_queue = link_queue
      @page_queue = page_queue
-      @http = Anemone::HTTP.new
+      @http = Anemone::HTTP.new(opts)
+      @opts = opts
    end
 
    #
@@ -22,11 +23,17 @@ module Anemone
 
        break if link == :END
 
-        @page_queue
+        @page_queue << @http.fetch_page(link, from_page)
 
-
+        delay
      end
    end
 
+    private
+
+    def delay
+      sleep @opts[:delay] if @opts[:delay]
+    end
+
  end
 end
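With the delay now handled inside each Tentacle, and threads forced to 1 by Core whenever a delay is set, a polite crawl is just another per-crawl option. A one-line sketch with a placeholder URL:

    # Sleep 0.5 seconds between requests; process_options drops :threads to 1
    # so the delay is honoured across the whole crawl.
    Anemone.crawl("http://example.com/", :delay => 0.5)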
data/spec/anemone_spec.rb
CHANGED
@@ -2,45 +2,10 @@ require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
 
-  before(:all) do
-    Anemone::FakePage.new
-  end
-
-  after(:each) do
-    # reset global options object to defaults
-    Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
-  end
-
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
   end
 
-  it "should have options" do
-    Anemone.should respond_to(:options)
-  end
-
-  it "should accept options for the crawl" do
-    Anemone.crawl(SPEC_DOMAIN, :verbose => false,
-                  :threads => 2,
-                  :discard_page_bodies => true,
-                  :user_agent => 'test',
-                  :obey_robots_txt => true,
-                  :depth_limit => 3)
-
-    Anemone.options.verbose.should == false
-    Anemone.options.threads.should == 2
-    Anemone.options.discard_page_bodies.should == true
-    Anemone.options.delay.should == 0
-    Anemone.options.user_agent.should == 'test'
-    Anemone.options.obey_robots_txt.should == true
-    Anemone.options.depth_limit.should == 3
-  end
-
-  it "should use 1 thread if a delay is requested" do
-    Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
-    Anemone.options.threads.should == 1
-  end
-
   it "should return a Anemone::Core from the crawl, which has a PageHash" do
     result = Anemone.crawl(SPEC_DOMAIN)
     result.should be_an_instance_of(Anemone::Core)
data/spec/core_spec.rb
CHANGED
@@ -64,13 +64,15 @@ module Anemone
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')
-
+      pages << FakePage.new('3')
+
      core = Anemone.crawl(pages[0].url) do |a|
-        a.skip_links_like /1/
+        a.skip_links_like /1/, /3/
      end
 
      core.should have(2).pages
      core.pages.keys.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[3].url)
    end
 
    it "should be able to call a block on every page" do
@@ -173,5 +175,29 @@ module Anemone
      core.should have(4).pages
    end
  end
+
+  describe "options" do
+    it "should accept options for the crawl" do
+      core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+                           :threads => 2,
+                           :discard_page_bodies => true,
+                           :user_agent => 'test',
+                           :obey_robots_txt => true,
+                           :depth_limit => 3)
+
+      core.opts[:verbose].should == false
+      core.opts[:threads].should == 2
+      core.opts[:discard_page_bodies].should == true
+      core.opts[:delay].should == 0
+      core.opts[:user_agent].should == 'test'
+      core.opts[:obey_robots_txt].should == true
+      core.opts[:depth_limit].should == 3
+    end
+
+    it "should use 1 thread if a delay is requested" do
+      Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
+    end
+  end
+
 end
 end
data/spec/fakeweb_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.3
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-11-01 01:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -22,6 +22,16 @@ dependencies:
      - !ruby/object:Gem::Version
        version: 1.3.0
  version:
+- !ruby/object:Gem::Dependency
+  name: robots
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.7.2
+  version:
 description:
 email:
 executables:
@@ -32,10 +42,10 @@ extra_rdoc_files:
 - README.rdoc
 files:
 - LICENSE.txt
+- CHANGELOG.rdoc
 - README.rdoc
 - bin/anemone
 - lib/anemone.rb
-- lib/anemone/anemone.rb
 - lib/anemone/core.rb
 - lib/anemone/http.rb
 - lib/anemone/page.rb
data/lib/anemone/anemone.rb
DELETED
@@ -1,54 +0,0 @@
-require 'ostruct'
-require 'anemone/core'
-
-module Anemone
-  # Version number
-  VERSION = '0.2.2'
-
-  # default options
-  DEFAULTS = {
-    # run 4 Tentacle threads to fetch pages
-    :threads => 4,
-    # disable verbose output
-    :verbose => false,
-    # don't throw away the page response body after scanning it for links
-    :discard_page_bodies => false,
-    # identify self as Anemone/VERSION
-    :user_agent => "Anemone/#{VERSION}",
-    # no delay between requests
-    :delay => 0,
-    # don't obey the robots exclusion protocol
-    :obey_robots_txt => false,
-    # by default, don't limit the depth of the crawl
-    :depth_limit => false,
-    # number of times HTTP redirects will be followed
-    :redirect_limit => 5
-  }
-
-  def self.options
-    @options ||= OpenStruct.new(DEFAULTS)
-  end
-
-  #
-  # Convenience method to start a crawl using Core
-  #
-  def Anemone.crawl(urls, options = {}, &block)
-    options.each { |key, value| Anemone.options.send("#{key}=", value) }
-
-    if Anemone.options.obey_robots_txt
-      begin
-        require 'robots'
-      rescue LoadError
-        warn "To support the robot exclusion protocol, install the robots gem:\n" \
-             "sudo gem sources -a http://gems.github.com\n" \
-             "sudo gem install fizx-robots"
-        exit
-      end
-    end
-
-    #use a single thread if a delay was requested
-    Anemone.options.threads = 1 if Anemone.options.delay > 0
-
-    Core.crawl(urls, &block)
-  end
-end