anemone 0.2.2 → 0.2.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ == 0.2.3 / 2009-11-01
2
+
3
+ * Minor enhancements
4
+
5
+ * Options are now applied per-crawl, rather than module-wide.
6
+
7
+ * Bug fixes
8
+
9
+ * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
10
+
11
+ == 0.2.2 / 2009-10-26
12
+
13
+ * Minor enhancements
14
+
15
+ * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
16
+
17
+ == 0.2.1 / 2009-10-24
18
+
19
+ * Major enhancements
20
+
21
+ * Added HTTPS support.
22
+ * CLI program 'anemone', which is a frontend for several tasks.
23
+
24
+ * Minor enhancements
25
+
26
+ * HTTP request response time recorded in Page.
27
+ * Use of persistent HTTP connections.
@@ -21,6 +21,4 @@ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of sev
21
21
 
22
22
  == Requirements
23
23
  * nokogiri
24
-
25
- == Optional
26
- * fizx-robots (required if obey_robots_txt is set to true)
24
+ * robots
@@ -1,2 +1,2 @@
1
1
  require 'rubygems'
2
- require 'anemone/anemone'
2
+ require 'anemone/core'
@@ -1,19 +1,51 @@
1
- require 'net/http'
2
1
  require 'thread'
2
+ require 'robots'
3
3
  require 'anemone/tentacle'
4
4
  require 'anemone/page'
5
5
  require 'anemone/page_hash'
6
6
 
7
7
  module Anemone
8
+
9
+ VERSION = '0.2.3';
10
+
11
+ #
12
+ # Convenience method to start a crawl
13
+ #
14
+ def Anemone.crawl(urls, options = {}, &block)
15
+ Core.crawl(urls, options, &block)
16
+ end
17
+
8
18
  class Core
9
19
  # PageHash storing all Page objects encountered during the crawl
10
20
  attr_reader :pages
11
-
21
+
22
+ # Hash of options for the crawl
23
+ attr_accessor :opts
24
+
25
+ DEFAULT_OPTS = {
26
+ # run 4 Tentacle threads to fetch pages
27
+ :threads => 4,
28
+ # disable verbose output
29
+ :verbose => false,
30
+ # don't throw away the page response body after scanning it for links
31
+ :discard_page_bodies => false,
32
+ # identify self as Anemone/VERSION
33
+ :user_agent => "Anemone/#{Anemone::VERSION}",
34
+ # no delay between requests
35
+ :delay => 0,
36
+ # don't obey the robots exclusion protocol
37
+ :obey_robots_txt => false,
38
+ # by default, don't limit the depth of the crawl
39
+ :depth_limit => false,
40
+ # number of times HTTP redirects will be followed
41
+ :redirect_limit => 5
42
+ }
43
+
12
44
  #
13
45
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
14
46
  # and optional *block*
15
47
  #
16
- def initialize(urls)
48
+ def initialize(urls, opts = {})
17
49
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
18
50
  @urls.each{ |url| url.path = '/' if url.path.empty? }
19
51
 
@@ -23,10 +55,8 @@ module Anemone
23
55
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
24
56
  @skip_link_patterns = []
25
57
  @after_crawl_blocks = []
26
-
27
- if Anemone.options.obey_robots_txt
28
- @robots = Robots.new(Anemone.options.user_agent)
29
- end
58
+
59
+ process_options opts
30
60
 
31
61
  yield self if block_given?
32
62
  end
@@ -34,8 +64,8 @@ module Anemone
34
64
  #
35
65
  # Convenience method to start a new crawl
36
66
  #
37
- def self.crawl(root)
38
- self.new(root) do |core|
67
+ def self.crawl(urls, opts = {})
68
+ self.new(urls, opts) do |core|
39
69
  yield core if block_given?
40
70
  core.run
41
71
  end
@@ -55,11 +85,7 @@ module Anemone
55
85
  # followed
56
86
  #
57
87
  def skip_links_like(*patterns)
58
- if patterns
59
- patterns.each do |pattern|
60
- @skip_link_patterns << pattern
61
- end
62
- end
88
+ @skip_link_patterns.concat [patterns].flatten.compact
63
89
  self
64
90
  end
65
91
 
@@ -104,8 +130,8 @@ module Anemone
104
130
  link_queue = Queue.new
105
131
  page_queue = Queue.new
106
132
 
107
- Anemone.options.threads.times do
108
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
133
+ @opts[:threads].times do
134
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
109
135
  end
110
136
 
111
137
  @urls.each{ |url| link_queue.enq(url) }
@@ -115,12 +141,12 @@ module Anemone
115
141
 
116
142
  @pages[page.url] = page
117
143
 
118
- puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
144
+ puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
119
145
 
120
146
  # perform the on_every_page blocks for this page
121
147
  do_page_blocks(page)
122
148
 
123
- page.discard_doc! if Anemone.options.discard_page_bodies
149
+ page.discard_doc! if @opts[:discard_page_bodies]
124
150
 
125
151
  links_to_follow(page).each do |link|
126
152
  link_queue.enq([link, page])
@@ -158,7 +184,15 @@ module Anemone
158
184
  end
159
185
 
160
186
  private
161
-
187
+
188
+ def process_options(options)
189
+ @opts = DEFAULT_OPTS.merge options
190
+
191
+ @opts[:threads] = 1 if @opts[:delay] > 0
192
+
193
+ @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
194
+ end
195
+
162
196
  #
163
197
  # Execute the after_crawl blocks
164
198
  #
@@ -199,10 +233,10 @@ module Anemone
199
233
  # Returns +false+ otherwise.
200
234
  #
201
235
  def visit_link?(link, from_page = nil)
202
- allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
236
+ allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
203
237
 
204
- if from_page
205
- too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
238
+ if from_page && @opts[:depth_limit]
239
+ too_deep = from_page.depth >= @opts[:depth_limit]
206
240
  else
207
241
  too_deep = false
208
242
  end
@@ -215,8 +249,7 @@ module Anemone
215
249
  # its URL matches a skip_link pattern.
216
250
  #
217
251
  def skip_link?(link)
218
- @skip_link_patterns.each { |p| return true if link.path =~ p}
219
- false
252
+ @skip_link_patterns.any? { |p| link.path =~ p }
220
253
  end
221
254
 
222
255
  end
@@ -4,10 +4,11 @@ require 'anemone/page'
4
4
  module Anemone
5
5
  class HTTP
6
6
  # Maximum number of redirects to follow on each get_response
7
- REDIRECTION_LIMIT = 5
7
+ REDIRECT_LIMIT = 5
8
8
 
9
- def initialize
9
+ def initialize(opts = {})
10
10
  @connections = {}
11
+ @opts = opts
11
12
  end
12
13
 
13
14
  #
@@ -31,7 +32,7 @@ module Anemone
31
32
 
32
33
  return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
33
34
  rescue => e
34
- if Anemone.options.verbose
35
+ if verbose?
35
36
  puts e.inspect
36
37
  puts e.backtrace
37
38
  end
@@ -50,7 +51,7 @@ module Anemone
50
51
  code = Integer(response.code)
51
52
  loc = url
52
53
 
53
- limit = REDIRECTION_LIMIT
54
+ limit = redirect_limit
54
55
  while response.is_a?(Net::HTTPRedirection) and limit > 0
55
56
  loc = URI(response['location'])
56
57
  loc = url.merge(loc) if loc.relative?
@@ -66,7 +67,6 @@ module Anemone
66
67
  #
67
68
  def get_response(url, referer = nil)
68
69
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
69
- user_agent = Anemone.options.user_agent rescue nil
70
70
 
71
71
  opts = {}
72
72
  opts['User-Agent'] = user_agent if user_agent
@@ -82,7 +82,7 @@ module Anemone
82
82
  rescue EOFError
83
83
  refresh_connection(url)
84
84
  retries += 1
85
- retry unless retries > 1
85
+ retry unless retries > 3
86
86
  end
87
87
  end
88
88
 
@@ -104,5 +104,18 @@ module Anemone
104
104
  end
105
105
  @connections[url.host][url.port] = http.start
106
106
  end
107
+
108
+ def redirect_limit
109
+ @opts[:redirect_limit] || REDIRECT_LIMIT
110
+ end
111
+
112
+ def user_agent
113
+ @opts[:user_agent]
114
+ end
115
+
116
+ def verbose?
117
+ @opts[:verbose]
118
+ end
119
+
107
120
  end
108
121
  end
@@ -33,7 +33,7 @@ module Anemone
33
33
  def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
34
34
  @url = url
35
35
  @code = code
36
- @headers = headers
36
+ @headers = headers || {}
37
37
  @headers['content-type'] ||= ['']
38
38
  @aliases = Array(aka)
39
39
  @data = OpenStruct.new
@@ -6,10 +6,11 @@ module Anemone
6
6
  #
7
7
  # Create a new Tentacle
8
8
  #
9
- def initialize(link_queue, page_queue)
9
+ def initialize(link_queue, page_queue, opts = {})
10
10
  @link_queue = link_queue
11
11
  @page_queue = page_queue
12
- @http = Anemone::HTTP.new
12
+ @http = Anemone::HTTP.new(opts)
13
+ @opts = opts
13
14
  end
14
15
 
15
16
  #
@@ -22,11 +23,17 @@ module Anemone
22
23
 
23
24
  break if link == :END
24
25
 
25
- @page_queue.enq @http.fetch_page(link, from_page)
26
+ @page_queue << @http.fetch_page(link, from_page)
26
27
 
27
- sleep Anemone.options.delay
28
+ delay
28
29
  end
29
30
  end
30
31
 
32
+ private
33
+
34
+ def delay
35
+ sleep @opts[:delay] if @opts[:delay]
36
+ end
37
+
31
38
  end
32
39
  end
@@ -2,45 +2,10 @@ require File.dirname(__FILE__) + '/spec_helper'
2
2
 
3
3
  describe Anemone do
4
4
 
5
- before(:all) do
6
- Anemone::FakePage.new
7
- end
8
-
9
- after(:each) do
10
- # reset global options object to defaults
11
- Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
12
- end
13
-
14
5
  it "should have a version" do
15
6
  Anemone.const_defined?('VERSION').should == true
16
7
  end
17
8
 
18
- it "should have options" do
19
- Anemone.should respond_to(:options)
20
- end
21
-
22
- it "should accept options for the crawl" do
23
- Anemone.crawl(SPEC_DOMAIN, :verbose => false,
24
- :threads => 2,
25
- :discard_page_bodies => true,
26
- :user_agent => 'test',
27
- :obey_robots_txt => true,
28
- :depth_limit => 3)
29
-
30
- Anemone.options.verbose.should == false
31
- Anemone.options.threads.should == 2
32
- Anemone.options.discard_page_bodies.should == true
33
- Anemone.options.delay.should == 0
34
- Anemone.options.user_agent.should == 'test'
35
- Anemone.options.obey_robots_txt.should == true
36
- Anemone.options.depth_limit.should == 3
37
- end
38
-
39
- it "should use 1 thread if a delay is requested" do
40
- Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
41
- Anemone.options.threads.should == 1
42
- end
43
-
44
9
  it "should return a Anemone::Core from the crawl, which has a PageHash" do
45
10
  result = Anemone.crawl(SPEC_DOMAIN)
46
11
  result.should be_an_instance_of(Anemone::Core)
@@ -64,13 +64,15 @@ module Anemone
64
64
  pages << FakePage.new('0', :links => ['1', '2'])
65
65
  pages << FakePage.new('1')
66
66
  pages << FakePage.new('2')
67
-
67
+ pages << FakePage.new('3')
68
+
68
69
  core = Anemone.crawl(pages[0].url) do |a|
69
- a.skip_links_like /1/
70
+ a.skip_links_like /1/, /3/
70
71
  end
71
72
 
72
73
  core.should have(2).pages
73
74
  core.pages.keys.should_not include(pages[1].url)
75
+ core.pages.keys.should_not include(pages[3].url)
74
76
  end
75
77
 
76
78
  it "should be able to call a block on every page" do
@@ -173,5 +175,29 @@ module Anemone
173
175
  core.should have(4).pages
174
176
  end
175
177
  end
178
+
179
+ describe "options" do
180
+ it "should accept options for the crawl" do
181
+ core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
182
+ :threads => 2,
183
+ :discard_page_bodies => true,
184
+ :user_agent => 'test',
185
+ :obey_robots_txt => true,
186
+ :depth_limit => 3)
187
+
188
+ core.opts[:verbose].should == false
189
+ core.opts[:threads].should == 2
190
+ core.opts[:discard_page_bodies].should == true
191
+ core.opts[:delay].should == 0
192
+ core.opts[:user_agent].should == 'test'
193
+ core.opts[:obey_robots_txt].should == true
194
+ core.opts[:depth_limit].should == 3
195
+ end
196
+
197
+ it "should use 1 thread if a delay is requested" do
198
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
199
+ end
200
+ end
201
+
176
202
  end
177
203
  end
@@ -55,4 +55,3 @@ end
55
55
 
56
56
  #default root
57
57
  Anemone::FakePage.new
58
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-26 00:00:00 -05:00
12
+ date: 2009-11-01 01:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -22,6 +22,16 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: 1.3.0
24
24
  version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: robots
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.7.2
34
+ version:
25
35
  description:
26
36
  email:
27
37
  executables:
@@ -32,10 +42,10 @@ extra_rdoc_files:
32
42
  - README.rdoc
33
43
  files:
34
44
  - LICENSE.txt
45
+ - CHANGELOG.rdoc
35
46
  - README.rdoc
36
47
  - bin/anemone
37
48
  - lib/anemone.rb
38
- - lib/anemone/anemone.rb
39
49
  - lib/anemone/core.rb
40
50
  - lib/anemone/http.rb
41
51
  - lib/anemone/page.rb
@@ -1,54 +0,0 @@
1
- require 'ostruct'
2
- require 'anemone/core'
3
-
4
- module Anemone
5
- # Version number
6
- VERSION = '0.2.2'
7
-
8
- # default options
9
- DEFAULTS = {
10
- # run 4 Tentacle threads to fetch pages
11
- :threads => 4,
12
- # disable verbose output
13
- :verbose => false,
14
- # don't throw away the page response body after scanning it for links
15
- :discard_page_bodies => false,
16
- # identify self as Anemone/VERSION
17
- :user_agent => "Anemone/#{VERSION}",
18
- # no delay between requests
19
- :delay => 0,
20
- # don't obey the robots exclusion protocol
21
- :obey_robots_txt => false,
22
- # by default, don't limit the depth of the crawl
23
- :depth_limit => false,
24
- # number of times HTTP redirects will be followed
25
- :redirect_limit => 5
26
- }
27
-
28
- def self.options
29
- @options ||= OpenStruct.new(DEFAULTS)
30
- end
31
-
32
- #
33
- # Convenience method to start a crawl using Core
34
- #
35
- def Anemone.crawl(urls, options = {}, &block)
36
- options.each { |key, value| Anemone.options.send("#{key}=", value) }
37
-
38
- if Anemone.options.obey_robots_txt
39
- begin
40
- require 'robots'
41
- rescue LoadError
42
- warn "To support the robot exclusion protocol, install the robots gem:\n" \
43
- "sudo gem sources -a http://gems.github.com\n" \
44
- "sudo gem install fizx-robots"
45
- exit
46
- end
47
- end
48
-
49
- #use a single thread if a delay was requested
50
- Anemone.options.threads = 1 if Anemone.options.delay > 0
51
-
52
- Core.crawl(urls, &block)
53
- end
54
- end