anemone 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +27 -0
- data/README.rdoc +1 -3
- data/lib/anemone.rb +1 -1
- data/lib/anemone/core.rb +57 -24
- data/lib/anemone/http.rb +19 -6
- data/lib/anemone/page.rb +1 -1
- data/lib/anemone/tentacle.rb +11 -4
- data/spec/anemone_spec.rb +0 -35
- data/spec/core_spec.rb +28 -2
- data/spec/fakeweb_helper.rb +0 -1
- metadata +13 -3
- data/lib/anemone/anemone.rb +0 -54
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
== 0.2.3 / 2009-11-01
|
2
|
+
|
3
|
+
* Minor enhancements
|
4
|
+
|
5
|
+
* Options are now applied per-crawl, rather than module-wide.
|
6
|
+
|
7
|
+
* Bug fixes
|
8
|
+
|
9
|
+
* Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
|
10
|
+
|
11
|
+
== 0.2.2 / 2009-10-26
|
12
|
+
|
13
|
+
* Minor enhancements
|
14
|
+
|
15
|
+
* When the :verbose option is set to true, exception backtraces are printed to aid debugging.
|
16
|
+
|
17
|
+
== 0.2.1 / 2009-10-24
|
18
|
+
|
19
|
+
* Major enhancements
|
20
|
+
|
21
|
+
* Added HTTPS support.
|
22
|
+
* CLI program 'anemone', which is a frontend for several tasks.
|
23
|
+
|
24
|
+
* Minor enhancements
|
25
|
+
|
26
|
+
* HTTP request response time recorded in Page.
|
27
|
+
* Use of persistent HTTP connections.
|
data/README.rdoc
CHANGED
data/lib/anemone.rb
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'anemone/anemone'
|
2
|
+
require 'anemone/core'
|
data/lib/anemone/core.rb
CHANGED
@@ -1,19 +1,51 @@
|
|
1
|
-
require 'net/http'
|
2
1
|
require 'thread'
|
2
|
+
require 'robots'
|
3
3
|
require 'anemone/tentacle'
|
4
4
|
require 'anemone/page'
|
5
5
|
require 'anemone/page_hash'
|
6
6
|
|
7
7
|
module Anemone
|
8
|
+
|
9
|
+
VERSION = '0.2.3';
|
10
|
+
|
11
|
+
#
|
12
|
+
# Convenience method to start a crawl
|
13
|
+
#
|
14
|
+
def Anemone.crawl(urls, options = {}, &block)
|
15
|
+
Core.crawl(urls, options, &block)
|
16
|
+
end
|
17
|
+
|
8
18
|
class Core
|
9
19
|
# PageHash storing all Page objects encountered during the crawl
|
10
20
|
attr_reader :pages
|
11
|
-
|
21
|
+
|
22
|
+
# Hash of options for the crawl
|
23
|
+
attr_accessor :opts
|
24
|
+
|
25
|
+
DEFAULT_OPTS = {
|
26
|
+
# run 4 Tentacle threads to fetch pages
|
27
|
+
:threads => 4,
|
28
|
+
# disable verbose output
|
29
|
+
:verbose => false,
|
30
|
+
# don't throw away the page response body after scanning it for links
|
31
|
+
:discard_page_bodies => false,
|
32
|
+
# identify self as Anemone/VERSION
|
33
|
+
:user_agent => "Anemone/#{Anemone::VERSION}",
|
34
|
+
# no delay between requests
|
35
|
+
:delay => 0,
|
36
|
+
# don't obey the robots exclusion protocol
|
37
|
+
:obey_robots_txt => false,
|
38
|
+
# by default, don't limit the depth of the crawl
|
39
|
+
:depth_limit => false,
|
40
|
+
# number of times HTTP redirects will be followed
|
41
|
+
:redirect_limit => 5
|
42
|
+
}
|
43
|
+
|
12
44
|
#
|
13
45
|
# Initialize the crawl with starting *urls* (single URL or Array of URLs)
|
14
46
|
# and optional *block*
|
15
47
|
#
|
16
|
-
def initialize(urls)
|
48
|
+
def initialize(urls, opts = {})
|
17
49
|
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
|
18
50
|
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
19
51
|
|
@@ -23,10 +55,8 @@ module Anemone
|
|
23
55
|
@on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
|
24
56
|
@skip_link_patterns = []
|
25
57
|
@after_crawl_blocks = []
|
26
|
-
|
27
|
-
|
28
|
-
@robots = Robots.new(Anemone.options.user_agent)
|
29
|
-
end
|
58
|
+
|
59
|
+
process_options opts
|
30
60
|
|
31
61
|
yield self if block_given?
|
32
62
|
end
|
@@ -34,8 +64,8 @@ module Anemone
|
|
34
64
|
#
|
35
65
|
# Convenience method to start a new crawl
|
36
66
|
#
|
37
|
-
def self.crawl(
|
38
|
-
self.new(
|
67
|
+
def self.crawl(urls, opts = {})
|
68
|
+
self.new(urls, opts) do |core|
|
39
69
|
yield core if block_given?
|
40
70
|
core.run
|
41
71
|
end
|
@@ -55,11 +85,7 @@ module Anemone
|
|
55
85
|
# followed
|
56
86
|
#
|
57
87
|
def skip_links_like(*patterns)
|
58
|
-
|
59
|
-
patterns.each do |pattern|
|
60
|
-
@skip_link_patterns << pattern
|
61
|
-
end
|
62
|
-
end
|
88
|
+
@skip_link_patterns.concat [patterns].flatten.compact
|
63
89
|
self
|
64
90
|
end
|
65
91
|
|
@@ -104,8 +130,8 @@ module Anemone
|
|
104
130
|
link_queue = Queue.new
|
105
131
|
page_queue = Queue.new
|
106
132
|
|
107
|
-
|
108
|
-
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
|
133
|
+
@opts[:threads].times do
|
134
|
+
@tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
|
109
135
|
end
|
110
136
|
|
111
137
|
@urls.each{ |url| link_queue.enq(url) }
|
@@ -115,12 +141,12 @@ module Anemone
|
|
115
141
|
|
116
142
|
@pages[page.url] = page
|
117
143
|
|
118
|
-
puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
|
144
|
+
puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
|
119
145
|
|
120
146
|
# perform the on_every_page blocks for this page
|
121
147
|
do_page_blocks(page)
|
122
148
|
|
123
|
-
page.discard_doc! if Anemone.options.discard_page_bodies
|
149
|
+
page.discard_doc! if @opts[:discard_page_bodies]
|
124
150
|
|
125
151
|
links_to_follow(page).each do |link|
|
126
152
|
link_queue.enq([link, page])
|
@@ -158,7 +184,15 @@ module Anemone
|
|
158
184
|
end
|
159
185
|
|
160
186
|
private
|
161
|
-
|
187
|
+
|
188
|
+
def process_options(options)
|
189
|
+
@opts = DEFAULT_OPTS.merge options
|
190
|
+
|
191
|
+
@opts[:threads] = 1 if @opts[:delay] > 0
|
192
|
+
|
193
|
+
@robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
|
194
|
+
end
|
195
|
+
|
162
196
|
#
|
163
197
|
# Execute the after_crawl blocks
|
164
198
|
#
|
@@ -199,10 +233,10 @@ module Anemone
|
|
199
233
|
# Returns +false+ otherwise.
|
200
234
|
#
|
201
235
|
def visit_link?(link, from_page = nil)
|
202
|
-
allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
|
236
|
+
allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
|
203
237
|
|
204
|
-
if from_page
|
205
|
-
too_deep = from_page.depth >=
|
238
|
+
if from_page && @opts[:depth_limit]
|
239
|
+
too_deep = from_page.depth >= @opts[:depth_limit]
|
206
240
|
else
|
207
241
|
too_deep = false
|
208
242
|
end
|
@@ -215,8 +249,7 @@ module Anemone
|
|
215
249
|
# its URL matches a skip_link pattern.
|
216
250
|
#
|
217
251
|
def skip_link?(link)
|
218
|
-
@skip_link_patterns.
|
219
|
-
false
|
252
|
+
@skip_link_patterns.any? { |p| link.path =~ p }
|
220
253
|
end
|
221
254
|
|
222
255
|
end
|
data/lib/anemone/http.rb
CHANGED
@@ -4,10 +4,11 @@ require 'anemone/page'
|
|
4
4
|
module Anemone
|
5
5
|
class HTTP
|
6
6
|
# Maximum number of redirects to follow on each get_response
|
7
|
-
|
7
|
+
REDIRECT_LIMIT = 5
|
8
8
|
|
9
|
-
def initialize
|
9
|
+
def initialize(opts = {})
|
10
10
|
@connections = {}
|
11
|
+
@opts = opts
|
11
12
|
end
|
12
13
|
|
13
14
|
#
|
@@ -31,7 +32,7 @@ module Anemone
|
|
31
32
|
|
32
33
|
return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
|
33
34
|
rescue => e
|
34
|
-
if Anemone.options.verbose
|
35
|
+
if verbose?
|
35
36
|
puts e.inspect
|
36
37
|
puts e.backtrace
|
37
38
|
end
|
@@ -50,7 +51,7 @@ module Anemone
|
|
50
51
|
code = Integer(response.code)
|
51
52
|
loc = url
|
52
53
|
|
53
|
-
limit =
|
54
|
+
limit = redirect_limit
|
54
55
|
while response.is_a?(Net::HTTPRedirection) and limit > 0
|
55
56
|
loc = URI(response['location'])
|
56
57
|
loc = url.merge(loc) if loc.relative?
|
@@ -66,7 +67,6 @@ module Anemone
|
|
66
67
|
#
|
67
68
|
def get_response(url, referer = nil)
|
68
69
|
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
|
69
|
-
user_agent = Anemone.options.user_agent rescue nil
|
70
70
|
|
71
71
|
opts = {}
|
72
72
|
opts['User-Agent'] = user_agent if user_agent
|
@@ -82,7 +82,7 @@ module Anemone
|
|
82
82
|
rescue EOFError
|
83
83
|
refresh_connection(url)
|
84
84
|
retries += 1
|
85
|
-
retry unless retries >
|
85
|
+
retry unless retries > 3
|
86
86
|
end
|
87
87
|
end
|
88
88
|
|
@@ -104,5 +104,18 @@ module Anemone
|
|
104
104
|
end
|
105
105
|
@connections[url.host][url.port] = http.start
|
106
106
|
end
|
107
|
+
|
108
|
+
def redirect_limit
|
109
|
+
@opts[:redirect_limit] || REDIRECT_LIMIT
|
110
|
+
end
|
111
|
+
|
112
|
+
def user_agent
|
113
|
+
@opts[:user_agent]
|
114
|
+
end
|
115
|
+
|
116
|
+
def verbose?
|
117
|
+
@opts[:verbose]
|
118
|
+
end
|
119
|
+
|
107
120
|
end
|
108
121
|
end
|
data/lib/anemone/page.rb
CHANGED
@@ -33,7 +33,7 @@ module Anemone
|
|
33
33
|
def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
|
34
34
|
@url = url
|
35
35
|
@code = code
|
36
|
-
@headers = headers
|
36
|
+
@headers = headers || {}
|
37
37
|
@headers['content-type'] ||= ['']
|
38
38
|
@aliases = Array(aka)
|
39
39
|
@data = OpenStruct.new
|
data/lib/anemone/tentacle.rb
CHANGED
@@ -6,10 +6,11 @@ module Anemone
|
|
6
6
|
#
|
7
7
|
# Create a new Tentacle
|
8
8
|
#
|
9
|
-
def initialize(link_queue, page_queue)
|
9
|
+
def initialize(link_queue, page_queue, opts = {})
|
10
10
|
@link_queue = link_queue
|
11
11
|
@page_queue = page_queue
|
12
|
-
@http = Anemone::HTTP.new
|
12
|
+
@http = Anemone::HTTP.new(opts)
|
13
|
+
@opts = opts
|
13
14
|
end
|
14
15
|
|
15
16
|
#
|
@@ -22,11 +23,17 @@ module Anemone
|
|
22
23
|
|
23
24
|
break if link == :END
|
24
25
|
|
25
|
-
@page_queue
|
26
|
+
@page_queue << @http.fetch_page(link, from_page)
|
26
27
|
|
27
|
-
|
28
|
+
delay
|
28
29
|
end
|
29
30
|
end
|
30
31
|
|
32
|
+
private
|
33
|
+
|
34
|
+
def delay
|
35
|
+
sleep @opts[:delay] if @opts[:delay]
|
36
|
+
end
|
37
|
+
|
31
38
|
end
|
32
39
|
end
|
data/spec/anemone_spec.rb
CHANGED
@@ -2,45 +2,10 @@ require File.dirname(__FILE__) + '/spec_helper'
|
|
2
2
|
|
3
3
|
describe Anemone do
|
4
4
|
|
5
|
-
before(:all) do
|
6
|
-
Anemone::FakePage.new
|
7
|
-
end
|
8
|
-
|
9
|
-
after(:each) do
|
10
|
-
# reset global options object to defaults
|
11
|
-
Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
|
12
|
-
end
|
13
|
-
|
14
5
|
it "should have a version" do
|
15
6
|
Anemone.const_defined?('VERSION').should == true
|
16
7
|
end
|
17
8
|
|
18
|
-
it "should have options" do
|
19
|
-
Anemone.should respond_to(:options)
|
20
|
-
end
|
21
|
-
|
22
|
-
it "should accept options for the crawl" do
|
23
|
-
Anemone.crawl(SPEC_DOMAIN, :verbose => false,
|
24
|
-
:threads => 2,
|
25
|
-
:discard_page_bodies => true,
|
26
|
-
:user_agent => 'test',
|
27
|
-
:obey_robots_txt => true,
|
28
|
-
:depth_limit => 3)
|
29
|
-
|
30
|
-
Anemone.options.verbose.should == false
|
31
|
-
Anemone.options.threads.should == 2
|
32
|
-
Anemone.options.discard_page_bodies.should == true
|
33
|
-
Anemone.options.delay.should == 0
|
34
|
-
Anemone.options.user_agent.should == 'test'
|
35
|
-
Anemone.options.obey_robots_txt.should == true
|
36
|
-
Anemone.options.depth_limit.should == 3
|
37
|
-
end
|
38
|
-
|
39
|
-
it "should use 1 thread if a delay is requested" do
|
40
|
-
Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
|
41
|
-
Anemone.options.threads.should == 1
|
42
|
-
end
|
43
|
-
|
44
9
|
it "should return a Anemone::Core from the crawl, which has a PageHash" do
|
45
10
|
result = Anemone.crawl(SPEC_DOMAIN)
|
46
11
|
result.should be_an_instance_of(Anemone::Core)
|
data/spec/core_spec.rb
CHANGED
@@ -64,13 +64,15 @@ module Anemone
|
|
64
64
|
pages << FakePage.new('0', :links => ['1', '2'])
|
65
65
|
pages << FakePage.new('1')
|
66
66
|
pages << FakePage.new('2')
|
67
|
-
|
67
|
+
pages << FakePage.new('3')
|
68
|
+
|
68
69
|
core = Anemone.crawl(pages[0].url) do |a|
|
69
|
-
a.skip_links_like /1/
|
70
|
+
a.skip_links_like /1/, /3/
|
70
71
|
end
|
71
72
|
|
72
73
|
core.should have(2).pages
|
73
74
|
core.pages.keys.should_not include(pages[1].url)
|
75
|
+
core.pages.keys.should_not include(pages[3].url)
|
74
76
|
end
|
75
77
|
|
76
78
|
it "should be able to call a block on every page" do
|
@@ -173,5 +175,29 @@ module Anemone
|
|
173
175
|
core.should have(4).pages
|
174
176
|
end
|
175
177
|
end
|
178
|
+
|
179
|
+
describe "options" do
|
180
|
+
it "should accept options for the crawl" do
|
181
|
+
core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
|
182
|
+
:threads => 2,
|
183
|
+
:discard_page_bodies => true,
|
184
|
+
:user_agent => 'test',
|
185
|
+
:obey_robots_txt => true,
|
186
|
+
:depth_limit => 3)
|
187
|
+
|
188
|
+
core.opts[:verbose].should == false
|
189
|
+
core.opts[:threads].should == 2
|
190
|
+
core.opts[:discard_page_bodies].should == true
|
191
|
+
core.opts[:delay].should == 0
|
192
|
+
core.opts[:user_agent].should == 'test'
|
193
|
+
core.opts[:obey_robots_txt].should == true
|
194
|
+
core.opts[:depth_limit].should == 3
|
195
|
+
end
|
196
|
+
|
197
|
+
it "should use 1 thread if a delay is requested" do
|
198
|
+
Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
176
202
|
end
|
177
203
|
end
|
data/spec/fakeweb_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anemone
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Kite
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-11-01 01:00:00 -05:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,6 +22,16 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.3.0
|
24
24
|
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: robots
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.7.2
|
34
|
+
version:
|
25
35
|
description:
|
26
36
|
email:
|
27
37
|
executables:
|
@@ -32,10 +42,10 @@ extra_rdoc_files:
|
|
32
42
|
- README.rdoc
|
33
43
|
files:
|
34
44
|
- LICENSE.txt
|
45
|
+
- CHANGELOG.rdoc
|
35
46
|
- README.rdoc
|
36
47
|
- bin/anemone
|
37
48
|
- lib/anemone.rb
|
38
|
-
- lib/anemone/anemone.rb
|
39
49
|
- lib/anemone/core.rb
|
40
50
|
- lib/anemone/http.rb
|
41
51
|
- lib/anemone/page.rb
|
data/lib/anemone/anemone.rb
DELETED
@@ -1,54 +0,0 @@
|
|
1
|
-
require 'ostruct'
|
2
|
-
require 'anemone/core'
|
3
|
-
|
4
|
-
module Anemone
|
5
|
-
# Version number
|
6
|
-
VERSION = '0.2.2'
|
7
|
-
|
8
|
-
# default options
|
9
|
-
DEFAULTS = {
|
10
|
-
# run 4 Tentacle threads to fetch pages
|
11
|
-
:threads => 4,
|
12
|
-
# disable verbose output
|
13
|
-
:verbose => false,
|
14
|
-
# don't throw away the page response body after scanning it for links
|
15
|
-
:discard_page_bodies => false,
|
16
|
-
# identify self as Anemone/VERSION
|
17
|
-
:user_agent => "Anemone/#{VERSION}",
|
18
|
-
# no delay between requests
|
19
|
-
:delay => 0,
|
20
|
-
# don't obey the robots exclusion protocol
|
21
|
-
:obey_robots_txt => false,
|
22
|
-
# by default, don't limit the depth of the crawl
|
23
|
-
:depth_limit => false,
|
24
|
-
# number of times HTTP redirects will be followed
|
25
|
-
:redirect_limit => 5
|
26
|
-
}
|
27
|
-
|
28
|
-
def self.options
|
29
|
-
@options ||= OpenStruct.new(DEFAULTS)
|
30
|
-
end
|
31
|
-
|
32
|
-
#
|
33
|
-
# Convenience method to start a crawl using Core
|
34
|
-
#
|
35
|
-
def Anemone.crawl(urls, options = {}, &block)
|
36
|
-
options.each { |key, value| Anemone.options.send("#{key}=", value) }
|
37
|
-
|
38
|
-
if Anemone.options.obey_robots_txt
|
39
|
-
begin
|
40
|
-
require 'robots'
|
41
|
-
rescue LoadError
|
42
|
-
warn "To support the robot exclusion protocol, install the robots gem:\n" \
|
43
|
-
"sudo gem sources -a http://gems.github.com\n" \
|
44
|
-
"sudo gem install fizx-robots"
|
45
|
-
exit
|
46
|
-
end
|
47
|
-
end
|
48
|
-
|
49
|
-
#use a single thread if a delay was requested
|
50
|
-
Anemone.options.threads = 1 if Anemone.options.delay > 0
|
51
|
-
|
52
|
-
Core.crawl(urls, &block)
|
53
|
-
end
|
54
|
-
end
|