anemone 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,27 @@
1
+ == 0.2.3 / 2009-11-01
2
+
3
+ * Minor enhancements
4
+
5
+ * Options are now applied per-crawl, rather than module-wide.
6
+
7
+ * Bug fixes
8
+
9
+ * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
10
+
11
+ == 0.2.2 / 2009-10-26
12
+
13
+ * Minor enhancements
14
+
15
+ * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
16
+
17
+ == 0.2.1 / 2009-10-24
18
+
19
+ * Major enhancements
20
+
21
+ * Added HTTPS support.
22
+ * CLI program 'anemone', which is a frontend for several tasks.
23
+
24
+ * Minor enhancements
25
+
26
+ * HTTP request response time recorded in Page.
27
+ * Use of persistent HTTP connections.
@@ -21,6 +21,4 @@ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of sev
21
21
 
22
22
  == Requirements
23
23
  * nokogiri
24
-
25
- == Optional
26
- * fizx-robots (required if obey_robots_txt is set to true)
24
+ * robots
@@ -1,2 +1,2 @@
1
1
  require 'rubygems'
2
- require 'anemone/anemone'
2
+ require 'anemone/core'
@@ -1,19 +1,51 @@
1
- require 'net/http'
2
1
  require 'thread'
2
+ require 'robots'
3
3
  require 'anemone/tentacle'
4
4
  require 'anemone/page'
5
5
  require 'anemone/page_hash'
6
6
 
7
7
  module Anemone
8
+
9
+ VERSION = '0.2.3';
10
+
11
+ #
12
+ # Convenience method to start a crawl
13
+ #
14
+ def Anemone.crawl(urls, options = {}, &block)
15
+ Core.crawl(urls, options, &block)
16
+ end
17
+
8
18
  class Core
9
19
  # PageHash storing all Page objects encountered during the crawl
10
20
  attr_reader :pages
11
-
21
+
22
+ # Hash of options for the crawl
23
+ attr_accessor :opts
24
+
25
+ DEFAULT_OPTS = {
26
+ # run 4 Tentacle threads to fetch pages
27
+ :threads => 4,
28
+ # disable verbose output
29
+ :verbose => false,
30
+ # don't throw away the page response body after scanning it for links
31
+ :discard_page_bodies => false,
32
+ # identify self as Anemone/VERSION
33
+ :user_agent => "Anemone/#{Anemone::VERSION}",
34
+ # no delay between requests
35
+ :delay => 0,
36
+ # don't obey the robots exclusion protocol
37
+ :obey_robots_txt => false,
38
+ # by default, don't limit the depth of the crawl
39
+ :depth_limit => false,
40
+ # number of times HTTP redirects will be followed
41
+ :redirect_limit => 5
42
+ }
43
+
12
44
  #
13
45
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
14
46
  # and optional *block*
15
47
  #
16
- def initialize(urls)
48
+ def initialize(urls, opts = {})
17
49
  @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
18
50
  @urls.each{ |url| url.path = '/' if url.path.empty? }
19
51
 
@@ -23,10 +55,8 @@ module Anemone
23
55
  @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
24
56
  @skip_link_patterns = []
25
57
  @after_crawl_blocks = []
26
-
27
- if Anemone.options.obey_robots_txt
28
- @robots = Robots.new(Anemone.options.user_agent)
29
- end
58
+
59
+ process_options opts
30
60
 
31
61
  yield self if block_given?
32
62
  end
@@ -34,8 +64,8 @@ module Anemone
34
64
  #
35
65
  # Convenience method to start a new crawl
36
66
  #
37
- def self.crawl(root)
38
- self.new(root) do |core|
67
+ def self.crawl(urls, opts = {})
68
+ self.new(urls, opts) do |core|
39
69
  yield core if block_given?
40
70
  core.run
41
71
  end
@@ -55,11 +85,7 @@ module Anemone
55
85
  # followed
56
86
  #
57
87
  def skip_links_like(*patterns)
58
- if patterns
59
- patterns.each do |pattern|
60
- @skip_link_patterns << pattern
61
- end
62
- end
88
+ @skip_link_patterns.concat [patterns].flatten.compact
63
89
  self
64
90
  end
65
91
 
@@ -104,8 +130,8 @@ module Anemone
104
130
  link_queue = Queue.new
105
131
  page_queue = Queue.new
106
132
 
107
- Anemone.options.threads.times do
108
- @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
133
+ @opts[:threads].times do
134
+ @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
109
135
  end
110
136
 
111
137
  @urls.each{ |url| link_queue.enq(url) }
@@ -115,12 +141,12 @@ module Anemone
115
141
 
116
142
  @pages[page.url] = page
117
143
 
118
- puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
144
+ puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
119
145
 
120
146
  # perform the on_every_page blocks for this page
121
147
  do_page_blocks(page)
122
148
 
123
- page.discard_doc! if Anemone.options.discard_page_bodies
149
+ page.discard_doc! if @opts[:discard_page_bodies]
124
150
 
125
151
  links_to_follow(page).each do |link|
126
152
  link_queue.enq([link, page])
@@ -158,7 +184,15 @@ module Anemone
158
184
  end
159
185
 
160
186
  private
161
-
187
+
188
+ def process_options(options)
189
+ @opts = DEFAULT_OPTS.merge options
190
+
191
+ @opts[:threads] = 1 if @opts[:delay] > 0
192
+
193
+ @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
194
+ end
195
+
162
196
  #
163
197
  # Execute the after_crawl blocks
164
198
  #
@@ -199,10 +233,10 @@ module Anemone
199
233
  # Returns +false+ otherwise.
200
234
  #
201
235
  def visit_link?(link, from_page = nil)
202
- allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
236
+ allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
203
237
 
204
- if from_page
205
- too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
238
+ if from_page && @opts[:depth_limit]
239
+ too_deep = from_page.depth >= @opts[:depth_limit]
206
240
  else
207
241
  too_deep = false
208
242
  end
@@ -215,8 +249,7 @@ module Anemone
215
249
  # its URL matches a skip_link pattern.
216
250
  #
217
251
  def skip_link?(link)
218
- @skip_link_patterns.each { |p| return true if link.path =~ p}
219
- false
252
+ @skip_link_patterns.any? { |p| link.path =~ p }
220
253
  end
221
254
 
222
255
  end
@@ -4,10 +4,11 @@ require 'anemone/page'
4
4
  module Anemone
5
5
  class HTTP
6
6
  # Maximum number of redirects to follow on each get_response
7
- REDIRECTION_LIMIT = 5
7
+ REDIRECT_LIMIT = 5
8
8
 
9
- def initialize
9
+ def initialize(opts = {})
10
10
  @connections = {}
11
+ @opts = opts
11
12
  end
12
13
 
13
14
  #
@@ -31,7 +32,7 @@ module Anemone
31
32
 
32
33
  return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
33
34
  rescue => e
34
- if Anemone.options.verbose
35
+ if verbose?
35
36
  puts e.inspect
36
37
  puts e.backtrace
37
38
  end
@@ -50,7 +51,7 @@ module Anemone
50
51
  code = Integer(response.code)
51
52
  loc = url
52
53
 
53
- limit = REDIRECTION_LIMIT
54
+ limit = redirect_limit
54
55
  while response.is_a?(Net::HTTPRedirection) and limit > 0
55
56
  loc = URI(response['location'])
56
57
  loc = url.merge(loc) if loc.relative?
@@ -66,7 +67,6 @@ module Anemone
66
67
  #
67
68
  def get_response(url, referer = nil)
68
69
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
69
- user_agent = Anemone.options.user_agent rescue nil
70
70
 
71
71
  opts = {}
72
72
  opts['User-Agent'] = user_agent if user_agent
@@ -82,7 +82,7 @@ module Anemone
82
82
  rescue EOFError
83
83
  refresh_connection(url)
84
84
  retries += 1
85
- retry unless retries > 1
85
+ retry unless retries > 3
86
86
  end
87
87
  end
88
88
 
@@ -104,5 +104,18 @@ module Anemone
104
104
  end
105
105
  @connections[url.host][url.port] = http.start
106
106
  end
107
+
108
+ def redirect_limit
109
+ @opts[:redirect_limit] || REDIRECT_LIMIT
110
+ end
111
+
112
+ def user_agent
113
+ @opts[:user_agent]
114
+ end
115
+
116
+ def verbose?
117
+ @opts[:verbose]
118
+ end
119
+
107
120
  end
108
121
  end
@@ -33,7 +33,7 @@ module Anemone
33
33
  def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
34
34
  @url = url
35
35
  @code = code
36
- @headers = headers
36
+ @headers = headers || {}
37
37
  @headers['content-type'] ||= ['']
38
38
  @aliases = Array(aka)
39
39
  @data = OpenStruct.new
@@ -6,10 +6,11 @@ module Anemone
6
6
  #
7
7
  # Create a new Tentacle
8
8
  #
9
- def initialize(link_queue, page_queue)
9
+ def initialize(link_queue, page_queue, opts = {})
10
10
  @link_queue = link_queue
11
11
  @page_queue = page_queue
12
- @http = Anemone::HTTP.new
12
+ @http = Anemone::HTTP.new(opts)
13
+ @opts = opts
13
14
  end
14
15
 
15
16
  #
@@ -22,11 +23,17 @@ module Anemone
22
23
 
23
24
  break if link == :END
24
25
 
25
- @page_queue.enq @http.fetch_page(link, from_page)
26
+ @page_queue << @http.fetch_page(link, from_page)
26
27
 
27
- sleep Anemone.options.delay
28
+ delay
28
29
  end
29
30
  end
30
31
 
32
+ private
33
+
34
+ def delay
35
+ sleep @opts[:delay] if @opts[:delay]
36
+ end
37
+
31
38
  end
32
39
  end
@@ -2,45 +2,10 @@ require File.dirname(__FILE__) + '/spec_helper'
2
2
 
3
3
  describe Anemone do
4
4
 
5
- before(:all) do
6
- Anemone::FakePage.new
7
- end
8
-
9
- after(:each) do
10
- # reset global options object to defaults
11
- Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
12
- end
13
-
14
5
  it "should have a version" do
15
6
  Anemone.const_defined?('VERSION').should == true
16
7
  end
17
8
 
18
- it "should have options" do
19
- Anemone.should respond_to(:options)
20
- end
21
-
22
- it "should accept options for the crawl" do
23
- Anemone.crawl(SPEC_DOMAIN, :verbose => false,
24
- :threads => 2,
25
- :discard_page_bodies => true,
26
- :user_agent => 'test',
27
- :obey_robots_txt => true,
28
- :depth_limit => 3)
29
-
30
- Anemone.options.verbose.should == false
31
- Anemone.options.threads.should == 2
32
- Anemone.options.discard_page_bodies.should == true
33
- Anemone.options.delay.should == 0
34
- Anemone.options.user_agent.should == 'test'
35
- Anemone.options.obey_robots_txt.should == true
36
- Anemone.options.depth_limit.should == 3
37
- end
38
-
39
- it "should use 1 thread if a delay is requested" do
40
- Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
41
- Anemone.options.threads.should == 1
42
- end
43
-
44
9
  it "should return a Anemone::Core from the crawl, which has a PageHash" do
45
10
  result = Anemone.crawl(SPEC_DOMAIN)
46
11
  result.should be_an_instance_of(Anemone::Core)
@@ -64,13 +64,15 @@ module Anemone
64
64
  pages << FakePage.new('0', :links => ['1', '2'])
65
65
  pages << FakePage.new('1')
66
66
  pages << FakePage.new('2')
67
-
67
+ pages << FakePage.new('3')
68
+
68
69
  core = Anemone.crawl(pages[0].url) do |a|
69
- a.skip_links_like /1/
70
+ a.skip_links_like /1/, /3/
70
71
  end
71
72
 
72
73
  core.should have(2).pages
73
74
  core.pages.keys.should_not include(pages[1].url)
75
+ core.pages.keys.should_not include(pages[3].url)
74
76
  end
75
77
 
76
78
  it "should be able to call a block on every page" do
@@ -173,5 +175,29 @@ module Anemone
173
175
  core.should have(4).pages
174
176
  end
175
177
  end
178
+
179
+ describe "options" do
180
+ it "should accept options for the crawl" do
181
+ core = Anemone.crawl(SPEC_DOMAIN, :verbose => false,
182
+ :threads => 2,
183
+ :discard_page_bodies => true,
184
+ :user_agent => 'test',
185
+ :obey_robots_txt => true,
186
+ :depth_limit => 3)
187
+
188
+ core.opts[:verbose].should == false
189
+ core.opts[:threads].should == 2
190
+ core.opts[:discard_page_bodies].should == true
191
+ core.opts[:delay].should == 0
192
+ core.opts[:user_agent].should == 'test'
193
+ core.opts[:obey_robots_txt].should == true
194
+ core.opts[:depth_limit].should == 3
195
+ end
196
+
197
+ it "should use 1 thread if a delay is requested" do
198
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2).opts[:threads].should == 1
199
+ end
200
+ end
201
+
176
202
  end
177
203
  end
@@ -55,4 +55,3 @@ end
55
55
 
56
56
  #default root
57
57
  Anemone::FakePage.new
58
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anemone
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-26 00:00:00 -05:00
12
+ date: 2009-11-01 01:00:00 -05:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -22,6 +22,16 @@ dependencies:
22
22
  - !ruby/object:Gem::Version
23
23
  version: 1.3.0
24
24
  version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: robots
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.7.2
34
+ version:
25
35
  description:
26
36
  email:
27
37
  executables:
@@ -32,10 +42,10 @@ extra_rdoc_files:
32
42
  - README.rdoc
33
43
  files:
34
44
  - LICENSE.txt
45
+ - CHANGELOG.rdoc
35
46
  - README.rdoc
36
47
  - bin/anemone
37
48
  - lib/anemone.rb
38
- - lib/anemone/anemone.rb
39
49
  - lib/anemone/core.rb
40
50
  - lib/anemone/http.rb
41
51
  - lib/anemone/page.rb
@@ -1,54 +0,0 @@
1
- require 'ostruct'
2
- require 'anemone/core'
3
-
4
- module Anemone
5
- # Version number
6
- VERSION = '0.2.2'
7
-
8
- # default options
9
- DEFAULTS = {
10
- # run 4 Tentacle threads to fetch pages
11
- :threads => 4,
12
- # disable verbose output
13
- :verbose => false,
14
- # don't throw away the page response body after scanning it for links
15
- :discard_page_bodies => false,
16
- # identify self as Anemone/VERSION
17
- :user_agent => "Anemone/#{VERSION}",
18
- # no delay between requests
19
- :delay => 0,
20
- # don't obey the robots exclusion protocol
21
- :obey_robots_txt => false,
22
- # by default, don't limit the depth of the crawl
23
- :depth_limit => false,
24
- # number of times HTTP redirects will be followed
25
- :redirect_limit => 5
26
- }
27
-
28
- def self.options
29
- @options ||= OpenStruct.new(DEFAULTS)
30
- end
31
-
32
- #
33
- # Convenience method to start a crawl using Core
34
- #
35
- def Anemone.crawl(urls, options = {}, &block)
36
- options.each { |key, value| Anemone.options.send("#{key}=", value) }
37
-
38
- if Anemone.options.obey_robots_txt
39
- begin
40
- require 'robots'
41
- rescue LoadError
42
- warn "To support the robot exclusion protocol, install the robots gem:\n" \
43
- "sudo gem sources -a http://gems.github.com\n" \
44
- "sudo gem install fizx-robots"
45
- exit
46
- end
47
- end
48
-
49
- #use a single thread if a delay was requested
50
- Anemone.options.threads = 1 if Anemone.options.delay > 0
51
-
52
- Core.crawl(urls, &block)
53
- end
54
- end