anemone 0.1.2 → 0.2.0

@@ -3,7 +3,7 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.1.2'
+  VERSION = '0.2.0'
 
   #module-wide options
   def Anemone.options=(options)
@@ -20,21 +20,36 @@ module Anemone
   def Anemone.crawl(urls, options = {}, &block)
     Anemone.options = OpenStruct.new(options)
 
-    #by default, run 4 Tentacle threads to fetch pages
+    # by default, run 4 Tentacle threads to fetch pages
     Anemone.options.threads ||= 4
 
-    #disable verbose output by default
+    # disable verbose output by default
     Anemone.options.verbose ||= false
 
-    #by default, don't throw away the page response body after scanning it for links
+    # by default, don't throw away the page response body after scanning it for links
     Anemone.options.discard_page_bodies ||= false
 
-    #by default, identify self as Anemone/VERSION
+    # by default, identify self as Anemone/VERSION
     Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
 
-    #no delay between requests by default
+    # no delay between requests by default
     Anemone.options.delay ||= 0
+
+    # by default, don't obey the robots exclusion protocol
+    if Anemone.options.obey_robots_txt ||= false
+      begin
+        require 'robots'
+      rescue LoadError
+        warn "To support the robot exclusion protocol, install the robots gem:\n" \
+             "sudo gem sources -a http://gems.github.com\n" \
+             "sudo gem install fizx-robots"
+        exit
+      end
+    end
 
+    # by default, don't limit the depth of the crawl
+    Anemone.options.depth_limit ||= :infinity
+
     #use a single thread if a delay was requested
     if(Anemone.options.delay != 0)
       Anemone.options.threads = 1
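
The two options introduced here, :obey_robots_txt and :depth_limit, sit alongside the existing crawl options. A minimal usage sketch (the URL and the on_every_page block are illustrative, not taken from this diff):

    require 'anemone'

    # Hypothetical crawl using the options added in 0.2.0.
    Anemone.crawl("http://www.example.com/",
                  :obey_robots_txt => true,  # skip pages disallowed by robots.txt
                  :depth_limit     => 3,     # stop following links 3 hops from the start page
                  :verbose         => true) do |anemone|
      anemone.on_every_page do |page|
        puts "#{page.depth}  #{page.url}  (referer: #{page.referer})"
      end
    end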
data/lib/anemone/core.rb CHANGED
@@ -23,6 +23,10 @@ module Anemone
       @skip_link_patterns = []
       @after_crawl_blocks = []
 
+      if Anemone.options.obey_robots_txt
+        @robots = Robots.new(Anemone.options.user_agent)
+      end
+
       block.call(self) if block
     end
 
@@ -113,18 +117,18 @@ module Anemone
 
         puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose
 
-        #perform the on_every_page blocks for this page
+        # perform the on_every_page blocks for this page
         do_page_blocks(page)
 
         page.doc = nil if Anemone.options.discard_page_bodies
 
         links_to_follow(page).each do |link|
-          link_queue.enq(link)
+          link_queue.enq([link, page])
           @pages[link] = nil
         end
 
-        #create an entry in the page hash for each alias of this page,
-        #i.e. all the pages that redirected to this page
+        # create an entry in the page hash for each alias of this page,
+        # i.e. all the pages that redirected to this page
         page.aliases.each do |aka|
           if !@pages.has_key?(aka) or @pages[aka].nil?
             @pages[aka] = page.alias_clone(aka)
@@ -184,16 +188,26 @@ module Anemone
     #
     def links_to_follow(page)
       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
-      links.find_all { |link| visit_link?(link) }
+      links.select { |link| visit_link?(link, page) }
     end
 
     #
     # Returns +true+ if *link* has not been visited already,
-    # and is not excluded by a skip_link pattern. Returns
-    # +false+ otherwise.
+    # and is not excluded by a skip_link pattern...
+    # and is not excluded by robots.txt...
+    # and is not deeper than the depth limit
+    # Returns +false+ otherwise.
     #
-    def visit_link?(link)
-      !@pages.has_key?(link) and !skip_link?(link)
+    def visit_link?(link, from_page = nil)
+      allowed = Anemone.options.obey_robots_txt ? @robots.allowed?(link) : true
+
+      if from_page
+        too_deep = from_page.depth >= Anemone.options.depth_limit rescue false
+      else
+        too_deep = false
+      end
+
+      !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
     end
 
     #
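
A note on the too_deep check above: the default depth_limit is the symbol :infinity, so comparing an Integer depth against it raises ArgumentError, and the inline rescue turns that into false, i.e. no link is ever considered too deep unless a numeric limit is set. A standalone sketch of that behaviour (plain Ruby, not gem code):

    depth_limit = :infinity            # default set in Anemone.crawl
    depth       = 7

    too_deep = depth >= depth_limit rescue false
    too_deep   # => false, because Integer#>= raises when given a Symbol

    too_deep = depth >= 3 rescue false
    too_deep   # => true once a numeric depth_limit is configured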
data/lib/anemone/http.rb CHANGED
@@ -9,8 +9,8 @@ module Anemone
     # Retrieve an HTTP response for *url*, following redirects.
     # Returns the response object, response code, and final URI location.
     #
-    def self.get(url)
-      response = get_response(url)
+    def self.get(url, referer = nil)
+      response = get_response(url, referer)
       code = Integer(response.code)
       loc = url
 
@@ -18,7 +18,7 @@ module Anemone
       while response.is_a?(Net::HTTPRedirection) and limit > 0
         loc = URI(response['location'])
         loc = url.merge(loc) if loc.relative?
-        response = get_response(loc)
+        response = get_response(loc, referer)
         limit -= 1
       end
 
@@ -28,10 +28,16 @@ module Anemone
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
     #
-    def self.get_response(url)
+    def self.get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
+      user_agent = Anemone.options.user_agent rescue nil
+
+      opts = {}
+      opts['User-Agent'] = user_agent if user_agent
+      opts['Referer'] = referer.to_s if referer
+
       Net::HTTP.start(url.host, url.port) do |http|
-        return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
+        return http.get(full_path, opts)
       end
     end
   end
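
The practical effect of this change is that each request now carries the configured User-Agent and, when a referring page is known, a Referer header. Roughly the equivalent raw Net::HTTP call would look like the sketch below (URL and header values are illustrative):

    require 'net/http'
    require 'uri'

    url     = URI('http://www.example.com/page')
    referer = URI('http://www.example.com/')

    Net::HTTP.start(url.host, url.port) do |http|
      # each header is only sent when the corresponding value is known
      http.get(url.path, 'User-Agent' => 'Anemone/0.2.0',
                         'Referer'    => referer.to_s)
    end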
data/lib/anemone/page.rb CHANGED
@@ -22,24 +22,32 @@ module Anemone
     attr_accessor :aliases
     # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
     attr_accessor :visited
-    # Used by PageHash#shortest_paths! to store depth of the page
+    # Depth of this page from the root of the crawl. This is not necessarily the
+    # shortest path; use PageHash#shortest_paths! to find that value.
     attr_accessor :depth
+    # URL of the page that brought us to this page
+    attr_accessor :referer
 
     #
     # Create a new Page from the response of an HTTP request to *url*
     #
-    def self.fetch(url)
+    def self.fetch(url, from_page = nil)
       begin
-        url = URI(url) if url.is_a?(String)
+        url = URI(url) unless url.is_a?(URI)
 
-        response, code, location = Anemone::HTTP.get(url)
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location = Anemone::HTTP.get(url, referer)
 
         aka = nil
         if !url.eql?(location)
           aka = location
         end
 
-        return Page.new(url, response.body, code, response.to_hash, aka)
+        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
       rescue
         return Page.new(url)
       end
@@ -48,14 +56,16 @@ module Anemone
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
       @url = url
       @code = code
       @headers = headers
       @links = []
       @aliases = []
       @data = OpenStruct.new
-
+      @referer = referer
+      @depth = depth || 0
+
       @aliases << aka if !aka.nil?
 
       if body
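
Taken together, Page.fetch now copies the referer and a depth of one more hop from the page whose link was followed. A small sketch of the resulting bookkeeping (URLs are illustrative, and the pages are fetched outside a crawl purely to show the effect):

    require 'anemone'

    root  = Anemone::Page.fetch('http://www.example.com/')
    child = Anemone::Page.fetch('http://www.example.com/about', root)

    root.depth      # => 0
    root.referer    # => nil
    child.depth     # => 1
    child.referer   # => root.url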
@@ -1,6 +1,20 @@
 module Anemone
   class PageHash < Hash
 
+    # We typically index the hash with a URI,
+    # but convert it to a String for easier retrieval
+    def [](index)
+      super(index.to_s)
+    end
+
+    def []=(index, other)
+      super(index.to_s, other)
+    end
+
+    def has_key?(key)
+      super(key.to_s)
+    end
+
     #
     # Use a breadth-first search to calculate the single-source
     # shortest paths from *root* to all pages in the PageHash
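
With the [], []= and has_key? overrides above, a PageHash entry stored under a URI can be read back with either the URI object or its string form. A small sketch (assumes the gem is loaded; the URL is illustrative):

    require 'anemone'
    require 'uri'

    pages = Anemone::PageHash.new
    url   = URI('http://www.example.com/')

    pages[url] = Anemone::Page.new(url)

    pages[url]                          # => the stored page
    pages['http://www.example.com/']    # => the same page
    pages.has_key?(url)                 # => true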
@@ -17,11 +17,15 @@ module Anemone
     #
     def run
       while true do
-        link = @link_queue.deq
+        link, from_page = @link_queue.deq
 
         break if link == :END
-
-        page = Page.fetch(link)
+
+        if from_page
+          page = Page.fetch(link, from_page)
+        else
+          page = Page.fetch(link)
+        end
 
         @page_queue.enq(page)
 
data/spec/anemone_spec.rb CHANGED
@@ -14,12 +14,16 @@ describe Anemone do
     Anemone.crawl(SPEC_DOMAIN, :verbose => false,
                                :threads => 2,
                                :discard_page_bodies => true,
-                               :user_agent => 'test')
+                               :user_agent => 'test',
+                               :obey_robots_txt => true,
+                               :depth_limit => 3)
     Anemone.options.verbose.should == false
     Anemone.options.threads.should == 2
     Anemone.options.discard_page_bodies.should == true
     Anemone.options.delay.should == 0
     Anemone.options.user_agent.should == 'test'
+    Anemone.options.obey_robots_txt.should == true
+    Anemone.options.depth_limit.should == 3
   end
 
   it "should use 1 thread if a delay is requested" do
data/spec/core_spec.rb CHANGED
@@ -25,7 +25,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
+      core.pages.keys.should_not include('http://www.other.com/')
     end
 
     it "should follow http redirects" do
@@ -56,7 +56,7 @@ module Anemone
       core = Anemone.crawl(pages[0].url)
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
+      core.pages.keys.should_not include(pages[2].url)
     end
 
     it "should be able to skip links based on a RegEx" do
@@ -70,7 +70,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should be able to call a block on every page" do
@@ -107,7 +107,7 @@ module Anemone
       end
 
       core.should have(2).pages
-      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
+      core.pages.keys.should_not include(pages[1].url)
     end
 
     it "should optionally delay between page requests" do
@@ -123,6 +123,59 @@
 
       (finish - start).should satisfy {|t| t > delay * 2}
     end
+
+    it "should optionally obey the robots exclusion protocol" do
+      pages = []
+      pages << FakePage.new('0', :links => '1')
+      pages << FakePage.new('1')
+      pages << FakePage.new('robots.txt',
+                            :body => "User-agent: *\nDisallow: /1",
+                            :content_type => 'text/plain')
+
+      core = Anemone.crawl(pages[0].url, :obey_robots_txt => true)
+      urls = core.pages.keys
+
+      urls.should include(pages[0].url)
+      urls.should_not include(pages[1].url)
+    end
+
+    it "should track the page depth and referer" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url)
+
+      num_pages.times do |n|
+        page = core.pages[pages[n].url]
+        page.depth.should == n
+        page.referer.should == core.pages[pages[n-1].url].url if n > 0
+      end
+
+      core.pages[pages[0].url].referer.should == nil
+    end
 
+    it "should optionally limit the depth of the crawl" do
+      num_pages = 5
+
+      pages = []
+
+      num_pages.times do |n|
+        # register this page with a link to the next page
+        link = (n + 1).to_s if n + 1 < num_pages
+        pages << FakePage.new(n.to_s, :links => [link].compact)
+      end
+
+      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
+
+      core.should have(4).pages
+    end
+
   end
 end
@@ -13,14 +13,17 @@ module Anemone
   class FakePage
     attr_accessor :links
     attr_accessor :hrefs
+    attr_accessor :body
 
     def initialize(name = '', options = {})
       @name = name
       @links = [options[:links]].flatten if options.has_key?(:links)
       @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
       @redirect = options[:redirect] if options.has_key?(:redirect)
+      @content_type = options[:content_type] || "text/html"
+      @body = options[:body]
 
-      create_body
+      create_body unless @body
       add_to_fakeweb
     end
 
@@ -38,7 +41,7 @@ module Anemone
     end
 
     def add_to_fakeweb
-      options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}
+      options = {:body => @body, :content_type => @content_type, :status => [200, "OK"]}
 
       if @redirect
         options[:status] = [301, "Permanently Moved"]
data/spec/spec_helper.rb CHANGED
@@ -1,5 +1,5 @@
-require File.dirname(__FILE__) + '/fakeweb_helper'
 require 'rubygems'
+require File.dirname(__FILE__) + '/fakeweb_helper'
 
 $:.unshift(File.dirname(__FILE__) + '/../lib/')
 require 'anemone'
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.2.0
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-08-10 00:00:00 -05:00
+date: 2009-09-07 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency