anemone 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.1.2'
+  VERSION = '0.2.0'
 
   #module-wide options
   def Anemone.options=(options)
@@ -20,21 +20,36 @@ module Anemone
   def Anemone.crawl(urls, options = {}, &block)
     Anemone.options = OpenStruct.new(options)
 
-    #by default, run 4 Tentacle threads to fetch pages
+    # by default, run 4 Tentacle threads to fetch pages
     Anemone.options.threads ||= 4
 
-    #disable verbose output by default
+    # disable verbose output by default
     Anemone.options.verbose ||= false
 
-    #by default, don't throw away the page response body after scanning it for links
+    # by default, don't throw away the page response body after scanning it for links
     Anemone.options.discard_page_bodies ||= false
 
-    #by default, identify self as Anemone/VERSION
+    # by default, identify self as Anemone/VERSION
     Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
 
-    #no delay between requests by default
+    # no delay between requests by default
     Anemone.options.delay ||= 0
+
+    # by default, don't obey the robots exclusion protocol
+    if Anemone.options.obey_robots_txt ||= false
+      begin
+        require 'robots'
+      rescue LoadError
+        warn "To support the robot exclusion protocol, install the robots gem:\n" \
+             "sudo gem sources -a http://gems.github.com\n" \
+             "sudo gem install fizx-robots"
+        exit
+      end
+    end
 
+    # by default, don't limit the depth of the crawl
+    Anemone.options.depth_limit ||= :infinity
+
     #use a single thread if a delay was requested
     if(Anemone.options.delay != 0)
       Anemone.options.threads = 1
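The two options introduced above, :obey_robots_txt and :depth_limit, are read from the same options hash as the existing settings. A minimal usage sketch, assuming the fizx-robots gem is installed; the URL and the block body are placeholders, not part of the gem:

    require 'anemone'

    # Hypothetical crawl using the new 0.2.0 options: skip links disallowed by
    # robots.txt and stop following links more than 3 hops from the start page.
    Anemone.crawl("http://www.example.com/",
                  :obey_robots_txt => true,
                  :depth_limit => 3) do |anemone|
      anemone.on_every_page { |page| puts "#{page.depth}  #{page.url}" }
    end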
data/lib/anemone/core.rb CHANGED
@@ -23,6 +23,10 @@ module Anemone
       @skip_link_patterns = []
       @after_crawl_blocks = []
 
+      if Anemone.options.obey_robots_txt
+        @robots = Robots.new(Anemone.options.user_agent)
+      end
+
       block.call(self) if block
     end
 
@@ -113,18 +117,18 @@ module Anemone
 
         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
 
-        #perform the on_every_page blocks for this page
+        # perform the on_every_page blocks for this page
         do_page_blocks(page)
 
         page.doc = nil if Anemone.options.discard_page_bodies
 
         links_to_follow(page).each do |link|
-          link_queue.enq(link)
+          link_queue.enq([link, page])
           @pages[link] = nil
         end
 
-        #create an entry in the page hash for each alias of this page,
-        #i.e. all the pages that redirected to this page
+        # create an entry in the page hash for each alias of this page,
+        # i.e. all the pages that redirected to this page
         page.aliases.each do |aka|
           if !@pages.has_key?(aka) or @pages[aka].nil?
             @pages[aka] = page.alias_clone(aka)
@@ -184,16 +188,26 @@ module Anemone
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.find_all { |link| visit_link?(link) }
+      links.select { |link| visit_link?(link, page) }
     end
 
     #
     # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern. Returns
-    # +false+ otherwise.
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
     #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
+    def visit_link?(link, from_page = nil)
+      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+      if from_page
+        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      else
+        too_deep = false
+      end
+
+      !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
     end
 
     #
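The depth check in visit_link? leans on the default :depth_limit of :infinity: comparing an Integer against a Symbol raises, and the rescue modifier turns that into "not too deep". A standalone illustration of that idiom (not part of the gem):

    depth_limit = :infinity
    too_deep = 1 >= depth_limit rescue false   # comparison raises, so false

    depth_limit = 3
    too_deep = 4 >= depth_limit rescue false   # ordinary comparison, so true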
data/lib/anemone/http.rb CHANGED
@@ -9,8 +9,8 @@ module Anemone
     # Retrieve an HTTP response for *url*, following redirects.
     # Returns the response object, response code, and final URI location.
     #
-    def self.get(url)
-      response = get_response(url)
+    def self.get(url, referer = nil)
+      response = get_response(url, referer)
       code = Integer(response.code)
       loc = url
 
@@ -18,7 +18,7 @@
       while response.is_a?(Net::HTTPRedirection) and limit > 0
         loc = URI(response['location'])
         loc = url.merge(loc) if loc.relative?
-        response = get_response(loc)
+        response = get_response(loc, referer)
         limit -= 1
       end
 
@@ -28,10 +28,16 @@
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
-    def self.get_response(url)
+    def self.get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+      user_agent = Anemone.options.user_agent rescue nil
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+
       Net::HTTP.start(url.host, url.port) do |http|
-        return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
+        return http.get(full_path, opts)
       end
     end
   end
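HTTP.get and get_response now take an optional referer, which Page.fetch (below) uses to pass the linking page's URL through to the request headers. A hedged sketch of a direct call; both URLs are placeholders:

    require 'anemone'

    # Returns the response object, the numeric status code, and the final URI
    # after following redirects, sending a Referer header along the way.
    response, code, location = Anemone::HTTP.get(URI('http://www.example.com/about'),
                                                 URI('http://www.example.com/'))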
data/lib/anemone/page.rb CHANGED
@@ -22,24 +22,32 @@ module Anemone
     attr_accessor :aliases
     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
     attr_accessor :visited
-    # Used by PageHash#shortest_paths! to store depth of the page
+    # Depth of this page from the root of the crawl. This is not necessarily the
+    # shortest path; use PageHash#shortest_paths! to find that value.
     attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
 
     #
     # Create a new Page from the response of an HTTP request to *url*
     #
-    def self.fetch(url)
+    def self.fetch(url, from_page = nil)
       begin
-        url = URI(url) if url.is_a?(String)
+        url = URI(url) unless url.is_a?(URI)
 
-        response, code, location = Anemone::HTTP.get(url)
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location = Anemone::HTTP.get(url, referer)
 
         aka = nil
         if !url.eql?(location)
           aka = location
         end
 
-        return Page.new(url, response.body, code, response.to_hash, aka)
+        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
       rescue
         return Page.new(url)
       end
@@ -48,14 +56,16 @@
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
       @url = url
       @code = code
       @headers = headers
       @links = []
       @aliases = []
       @data = OpenStruct.new
-
+      @referer = referer
+      @depth = depth || 0
+
       @aliases << aka if !aka.nil?
 
       if body
data/lib/anemone/page_hash.rb CHANGED
@@ -1,6 +1,20 @@
 module Anemone
   class PageHash < Hash
 
+    # We typically index the hash with a URI,
+    # but convert it to a String for easier retrieval
+    def [](index)
+      super(index.to_s)
+    end
+
+    def []=(index, other)
+      super(index.to_s, other)
+    end
+
+    def has_key?(key)
+      super(key.to_s)
+    end
+
     #
     # Use a breadth-first search to calculate the single-source
     # shortest paths from *root* to all pages in the PageHash
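These overrides make lookups indifferent to whether the key is a URI or a String, which is what lets the core specs below drop the .map{|k| k.to_s} conversion. An illustrative sketch, assuming anemone is loaded; the URL is a placeholder:

    require 'anemone'

    pages = Anemone::PageHash.new
    pages[URI('http://www.example.com/')] = :some_page   # stored under the String form

    pages['http://www.example.com/']                # => :some_page
    pages.has_key?(URI('http://www.example.com/'))  # => true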
data/lib/anemone/tentacle.rb CHANGED
@@ -17,11 +17,15 @@ module Anemone
     #
     def run
       while true do
-        link = @link_queue.deq
+        link, from_page = @link_queue.deq
 
         break if link == :END
-
-        page = Page.fetch(link)
+
+        if from_page
+          page = Page.fetch(link, from_page)
+        else
+          page = Page.fetch(link)
+        end
 
         @page_queue.enq(page)
 
data/spec/anemone_spec.rb CHANGED
@@ -14,12 +14,16 @@ describe Anemone do
     Anemone.crawl(SPEC_DOMAIN, :verbose => false,
                                :threads => 2,
                                :discard_page_bodies => true,
-                               :user_agent => 'test')
+                               :user_agent => 'test',
+                               :obey_robots_txt => true,
+                               :depth_limit => 3)
     Anemone.options.verbose.should == false
     Anemone.options.threads.should == 2
     Anemone.options.discard_page_bodies.should == true
     Anemone.options.delay.should == 0
     Anemone.options.user_agent.should == 'test'
+    Anemone.options.obey_robots_txt.should == true
+    Anemone.options.depth_limit.should == 3
   end
 
   it "should use 1 thread if a delay is requested" do
data/spec/core_spec.rb CHANGED
@@ -25,7 +25,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
+      core.pages.keys.should_not include('http://www.other.com/')
     end
 
     it "should follow http redirects" do
@@ -56,7 +56,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
+      core.pages.keys.should_not include(pages[2].url)
     end
 
     it "should be able to skip links based on a RegEx" do
@@ -70,7 +70,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should be able to call a block on every page" do
@@ -107,7 +107,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should optionally delay between page requests" do
@@ -123,6 +123,59 @@ module Anemone
 
       (finish - start).should satisfy {|t| t > delay * 2}
     end
+
+    it "should optionally obey the robots exclusion protocol" do
+      pages = []
+      pages << FakePage.new('0', :links => '1')
+      pages << FakePage.new('1')
+      pages << FakePage.new('robots.txt',
+                            :body => "User-agent: *\nDisallow: /1",
+                            :content_type => 'text/plain')
+
+      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+      urls = core.pages.keys
+
+      urls.should include(pages[0].url)
+      urls.should_not include(pages[1].url)
+    end
+
+    it "should track the page depth and referer" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url)
+
+      num_pages.times do |n|
+        page = core.pages[pages[n].url]
+        page.depth.should == n
+        page.referer.should == core.pages[pages[n-1].url].url if n > 0
+      end
+
+      core.pages[pages[0].url].referer.should == nil
+    end
 
+    it "should optionally limit the depth of the crawl" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
+
+      core.should have(4).pages
+    end
+
   end
 end
data/spec/fakeweb_helper.rb CHANGED
@@ -13,14 +13,17 @@ module Anemone
   class FakePage
     attr_accessor :links
     attr_accessor :hrefs
+    attr_accessor :body
 
     def initialize(name = '', options = {})
       @name = name
       @links = [options[:links]].flatten if options.has_key?(:links)
       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
       @redirect = options[:redirect] if options.has_key?(:redirect)
+      @content_type = options[:content_type] || "text/html"
+      @body = options[:body]
 
-      create_body
+      create_body unless @body
       add_to_fakeweb
     end
 
@@ -38,7 +41,7 @@ module Anemone
     end
 
     def add_to_fakeweb
-      options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
+      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
 
       if @redirect
         options[:status] = [301, "Permanently Moved"]
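The new :body and :content_type options let a spec register an arbitrary fixture with FakeWeb instead of the generated HTML, as the robots.txt spec above does. A short sketch from inside a spec; the disallowed path is a placeholder:

    # Serve a plain-text robots.txt instead of a generated HTML page.
    FakePage.new('robots.txt',
                 :body => "User-agent: *\nDisallow: /private",
                 :content_type => 'text/plain')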
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,5 @@
-require File.dirname(__FILE__) + '/fakeweb_helper'
 require 'rubygems'
+require File.dirname(__FILE__) + '/fakeweb_helper'
 
 $:.unshift(File.dirname(__FILE__) + '/../lib/')
 require 'anemone'
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-08-10 00:00:00 -05:00
+date: 2009-09-07 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency