anemone 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -1,18 +1,26 @@
  = Anemone
 
- == DESCRIPTION
  Anemone is a web spider framework that can spider a domain and collect useful
  information about the pages it visits. It is versatile, allowing you to
  write your own specialized spider tasks quickly and easily.
 
- == FEATURES
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
  * Multi-threaded design for high performance
  * Tracks 301 HTTP redirects to understand a page's aliases
  * Built-in BFS algorithm for determining page depth
  * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
 
- == REQUIREMENTS
+ == Requirements
  * nokogiri
 
- == EXAMPLES
- See the +bin+ directory for several examples of useful Anemone tasks.
+ == Optional
+ * fizx-robots (required if obey_robots_txt is set to true)
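The feature list above mentions focus_crawl() and the new response-time tracking. A minimal, illustrative crawl script (not part of this diff; the URL and the link filter are placeholders) could look like this:

    require 'anemone'

    Anemone.crawl("http://example.com/", :depth_limit => 2) do |anemone|
      # focus_crawl picks which of a page's links the spider follows;
      # the block should return the links to keep
      anemone.focus_crawl { |page| page.links.reject { |link| link.path =~ /login/ } }

      anemone.on_every_page do |page|
        puts "#{page.url} (#{page.response_time} ms)"   # response_time is new in 0.2.1
      end
    end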
data/bin/anemone ADDED
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
data/lib/anemone/anemone.rb CHANGED
@@ -3,42 +3,41 @@ require 'anemone/core'
 
  module Anemone
  # Version number
- VERSION = '0.2.0'
-
- #module-wide options
- def Anemone.options=(options)
- @options = options
- end
+ VERSION = '0.2.1'
 
- def Anemone.options
- @options
+ # default options
+ DEFAULTS = {
+ # run 4 Tentacle threads to fetch pages
+ :threads => 4,
+ # disable verbose output
+ :verbose => false,
+ # don't throw away the page response body after scanning it for links
+ :discard_page_bodies => false,
+ # identify self as Anemone/VERSION
+ :user_agent => "Anemone/#{VERSION}",
+ # no delay between requests
+ :delay => 0,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false,
+ # by default, don't limit the depth of the crawl
+ :depth_limit => false,
+ # number of times HTTP redirects will be followed
+ :redirect_limit => 5
+ }
+
+ def self.options
+ @options ||= OpenStruct.new(DEFAULTS)
  end
 
  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(urls, options = {}, &block)
- Anemone.options = OpenStruct.new(options)
-
- # by default, run 4 Tentacle threads to fetch pages
- Anemone.options.threads ||= 4
-
- # disable verbose output by default
- Anemone.options.verbose ||= false
-
- # by default, don't throw away the page response body after scanning it for links
- Anemone.options.discard_page_bodies ||= false
-
- # by default, identify self as Anemone/VERSION
- Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+ options.each { |key, value| Anemone.options.send("#{key}=", value) }
 
- # no delay between requests by default
- Anemone.options.delay ||= 0
-
- # by default, don't obey the robots exclusion protocol
- if Anemone.options.obey_robots_txt ||= false
+ if Anemone.options.obey_robots_txt
  begin
- require 'robots'
+ require 'robots'
  rescue LoadError
  warn "To support the robot exclusion protocol, install the robots gem:\n" \
  "sudo gem sources -a http://gems.github.com\n" \
@@ -46,15 +45,10 @@ module Anemone
  exit
  end
  end
-
- # by default, don't limit the depth of the crawl
- Anemone.options.depth_limit ||= :infinity
 
  #use a single thread if a delay was requested
- if(Anemone.options.delay != 0)
- Anemone.options.threads = 1
- end
-
+ Anemone.options.threads = 1 if Anemone.options.delay > 0
+
  Core.crawl(urls, &block)
  end
  end
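The hunks above replace the old per-option ||= defaults with a DEFAULTS hash: Anemone.options is an OpenStruct seeded from DEFAULTS, and any options passed to Anemone.crawl are written over it with send. A standalone sketch of that merge (not from the gem's source):

    require 'ostruct'

    DEFAULTS = { :threads => 4, :verbose => false, :delay => 0 }

    def options
      @options ||= OpenStruct.new(DEFAULTS)
    end

    # the same merge loop Anemone.crawl now uses
    { :threads => 2, :verbose => true }.each { |key, value| options.send("#{key}=", value) }

    puts options.threads  # => 2 (overridden by the caller)
    puts options.delay    # => 0 (falls back to the default)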
data/lib/anemone/cli.rb ADDED
@@ -0,0 +1,24 @@
+ module Anemone
+ module CLI
+ COMMANDS = %w[count cron pagedepth serialize url-list]
+
+ def self.run
+ command = ARGV.shift
+
+ if COMMANDS.include? command
+ load "anemone/cli/#{command.tr('-', '_')}.rb"
+ else
+ puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+ anemone <command> [arguments]
+
+ Commands:
+ #{COMMANDS.join(', ')}
+ INFO
+ end
+ end
+ end
+ end
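The new runner shifts the command name off ARGV, checks it against COMMANDS, and loads the matching script from anemone/cli/. An illustrative way to drive it from Ruby rather than through the anemone executable (the URL is a placeholder):

    require 'anemone/cli'

    ARGV.replace(%w[count http://example.com/])
    Anemone::CLI.run   # shifts "count" off ARGV, then loads anemone/cli/count.rb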
data/lib/anemone/cli/count.rb ADDED
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+ # make sure that the first option is a URL we can crawl
+ url = URI(ARGV[0])
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone count <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL and outputs the total number
+ of unique pages on the site.
+ INFO
+ exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+ anemone.after_crawl do |pages|
+ puts pages.uniq.size
+ end
+ end
data/bin/anemone_cron.rb → data/lib/anemone/cli/cron.rb RENAMED
@@ -1,44 +1,30 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Performs pagedepth, url list, and count functionality
- # Meant to be run daily as a cron job
- #
- # == Usage
- # anemone_url_list.rb [options] url
- #
- # == Options
- # -r, --relative Output relative URLs (rather than absolute)
- # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
  require 'anemone'
  require 'optparse'
  require 'ostruct'
 
- def usage
- puts <<END
- Usage: anemone_url_list.rb [options] url
-
- Options:
- -r, --relative Output relative URLs (rather than absolute)
- -o, --output filename Filename to save URL list to. Defautls to urls.txt.
- END
- end
-
  options = OpenStruct.new
  options.relative = false
  options.output_file = 'urls.txt'
 
- # make sure that the last option is a URL we can crawl
  begin
- URI(ARGV.last)
+ # make sure that the last argument is a URL we can crawl
+ root = URI(ARGV.last)
  rescue
- usage
- Process.exit
+ puts <<-INFO
+ Usage:
+ anemone cron [options] <url>
+
+ Synopsis:
+ Combination of `count`, `pagedepth` and `url-list` commands.
+ Performs pagedepth, url list, and count functionality.
+ Outputs results to STDOUT and link list to file (urls.txt).
+ Meant to be run daily as a cron job.
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ -o, --output filename Filename to save URL list to. Defautls to urls.txt.
+ INFO
+ exit(0)
  end
 
  # parse command-line options
@@ -47,8 +33,6 @@ opts.on('-r', '--relative') { options.relative = true }
  opts.on('-o', '--output filename') {|o| options.output_file = o }
  opts.parse!(ARGV)
 
- root = ARGV.last
-
  Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
 
  anemone.after_crawl do |pages|
@@ -101,6 +85,6 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
  url = options.relative ? url.path.to_s : url.to_s
  file.puts url
  end
-
  end
- end
+
+ end
data/lib/anemone/cli/pagedepth.rb ADDED
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+ # make sure that the first option is a URL we can crawl
+ root = URI(ARGV[0])
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone pagedepth <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL and outputs a count of
+ the number of pages at each depth of the crawl.
+ INFO
+ exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+ anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+ anemone.after_crawl do |pages|
+ pages = pages.shortest_paths!(root).uniq
+
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+ end
+ end
data/lib/anemone/cli/serialize.rb ADDED
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+ # make sure that the first option is a URL we can crawl
+ root = URI(ARGV[0])
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone serialize [options] <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL and saves the resulting
+ PageHash object to a file using Marshal serialization.
+
+ Options:
+ -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ INFO
+ exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+ anemone.after_crawl do |pages|
+ open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+ end
+ end
data/lib/anemone/cli/url_list.rb ADDED
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+ # make sure that the last option is a URL we can crawl
+ root = URI(ARGV.last)
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone url-list [options] <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL, and outputs the URL of each page
+ in the domain as they are encountered.
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ INFO
+ exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+ anemone.on_every_page do |page|
+ if options.relative
+ puts page.url.path
+ else
+ puts page.url
+ end
+ end
+
+ end
data/lib/anemone/core.rb CHANGED
@@ -1,6 +1,7 @@
  require 'net/http'
  require 'thread'
  require 'anemone/tentacle'
+ require 'anemone/page'
  require 'anemone/page_hash'
 
  module Anemone
@@ -12,10 +13,10 @@ module Anemone
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
  #
- def initialize(urls, &block)
- @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
+ def initialize(urls)
+ @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }
-
+
  @tentacles = []
  @pages = PageHash.new
  @on_every_page_blocks = []
@@ -26,18 +27,17 @@ module Anemone
  if Anemone.options.obey_robots_txt
  @robots = Robots.new(Anemone.options.user_agent)
  end
-
- block.call(self) if block
+
+ yield self if block_given?
  end
 
  #
  # Convenience method to start a new crawl
  #
- def self.crawl(root, &block)
+ def self.crawl(root)
  self.new(root) do |core|
- block.call(core) if block
+ yield core if block_given?
  core.run
- return core
  end
  end
 
@@ -104,7 +104,7 @@ module Anemone
  link_queue = Queue.new
  page_queue = Queue.new
 
- Anemone.options.threads.times do |id|
+ Anemone.options.threads.times do
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
  end
 
@@ -120,7 +120,7 @@ module Anemone
  # perform the on_every_page blocks for this page
  do_page_blocks(page)
 
- page.doc = nil if Anemone.options.discard_page_bodies
+ page.discard_doc! if Anemone.options.discard_page_bodies
 
  links_to_follow(page).each do |link|
  link_queue.enq([link, page])
@@ -143,7 +143,7 @@ module Anemone
  end
 
  if page_queue.empty?
- @tentacles.size.times { |i| link_queue.enq(:END)}
+ @tentacles.size.times { link_queue.enq(:END)}
  break
  end
  end
@@ -207,7 +207,7 @@ module Anemone
  too_deep = false
  end
 
- !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
+ !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
  end
 
  #
@@ -216,7 +216,7 @@ module Anemone
  #
  def skip_link?(link)
  @skip_link_patterns.each { |p| return true if link.path =~ p}
- return false
+ false
  end
 
  end
data/lib/anemone/http.rb CHANGED
@@ -1,16 +1,48 @@
- require 'net/http'
+ require 'net/https'
+ require 'anemone/page'
 
  module Anemone
- class HTTP < Net::HTTP
+ class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECTION_LIMIT = 5
-
+
+ def initialize
+ @connections = {}
+ end
+
+ #
+ # Create a new Page from the response of an HTTP request to *url*
+ #
+ def fetch_page(url, from_page = nil)
+ begin
+ url = URI(url) unless url.is_a?(URI)
+
+ if from_page
+ referer = from_page.url
+ depth = from_page.depth + 1
+ end
+
+ response, code, location, response_time = get(url, referer)
+
+ aka = nil
+ if !url.eql?(location)
+ aka = location
+ end
+
+ return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ rescue
+ return Page.new(url)
+ end
+ end
+
+ private
+
  #
  # Retrieve an HTTP response for *url*, following redirects.
  # Returns the response object, response code, and final URI location.
  #
- def self.get(url, referer = nil)
- response = get_response(url, referer)
+ def get(url, referer = nil)
+ response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url
 
@@ -18,17 +50,17 @@ module Anemone
  while response.is_a?(Net::HTTPRedirection) and limit > 0
  loc = URI(response['location'])
  loc = url.merge(loc) if loc.relative?
- response = get_response(loc, referer)
+ response, response_time = get_response(loc, referer)
  limit -= 1
  end
 
- return response, code, loc
+ return response, code, loc, response_time
  end
 
  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
  #
- def self.get_response(url, referer = nil)
+ def get_response(url, referer = nil)
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
  user_agent = Anemone.options.user_agent rescue nil
 
@@ -36,9 +68,37 @@ module Anemone
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer
 
- Net::HTTP.start(url.host, url.port) do |http|
- return http.get(full_path, opts)
+ retries = 0
+ begin
+ start = Time.now()
+ response = connection(url).get(full_path, opts)
+ finish = Time.now()
+ response_time = ((finish - start) * 1000).round
+ return response, response_time
+ rescue EOFError
+ refresh_connection(url)
+ retries += 1
+ retry unless retries > 1
+ end
+ end
+
+ def connection(url)
+ @connections[url.host] ||= {}
+
+ if conn = @connections[url.host][url.port]
+ return conn
+ end
+
+ refresh_connection(url)
+ end
+
+ def refresh_connection(url)
+ http = Net::HTTP.new(url.host, url.port)
+ if url.scheme == 'https'
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
+ @connections[url.host][url.port] = http.start
  end
  end
  end
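With this change HTTP is no longer a Net::HTTP subclass: it keeps one connection per host and port (reopened on EOFError), supports https URLs, and wraps each request in a Page that records the response time. An illustrative direct use of fetch_page (placeholder URL; the attributes shown are the ones added in this release):

    require 'anemone'

    http = Anemone::HTTP.new
    page = http.fetch_page(URI('https://example.com/'))

    puts page.response_time                 # request duration in milliseconds
    puts page.links.size                    # in-domain links parsed from the body
    puts page.headers['content-type'].first # defaults to '' when the header is missing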
data/lib/anemone/page.rb CHANGED
@@ -1,4 +1,3 @@
- require 'anemone/http'
  require 'nokogiri'
  require 'ostruct'
 
@@ -7,8 +6,6 @@ module Anemone
 
  # The URL of the page
  attr_reader :url
- # Array of distinct A tag HREFs from the page
- attr_reader :links
  # Headers of the HTTP response
  attr_reader :headers
 
@@ -27,74 +24,45 @@ module Anemone
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
-
- #
- # Create a new Page from the response of an HTTP request to *url*
- #
- def self.fetch(url, from_page = nil)
- begin
- url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location = Anemone::HTTP.get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
- rescue
- return Page.new(url)
- end
- end
+ # Response time of the request for this page in milliseconds
+ attr_accessor :response_time
 
  #
  # Create a new page
  #
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
  @url = url
  @code = code
  @headers = headers
- @links = []
- @aliases = []
+ @headers['content-type'] ||= ['']
+ @aliases = Array(aka)
  @data = OpenStruct.new
  @referer = referer
  @depth = depth || 0
+ @response_time = response_time
+ @doc = Nokogiri::HTML(body) if body && html? rescue nil
+ end
 
- @aliases << aka if !aka.nil?
-
- if body
- begin
- @doc = Nokogiri::HTML(body)
- rescue
- return
- end
-
- return if @doc.nil?
-
- #get a list of distinct links on the page, in absolute url form
- @doc.css('a').each do |a|
- u = a.attributes['href'].content if a.attributes['href']
- next if u.nil?
-
- begin
- abs = to_absolute(URI(u))
- rescue
- next
- end
-
- @links << abs if in_domain?(abs)
- end
-
- @links.uniq!
+ # Array of distinct A tag HREFs from the page
+ def links
+ return @links unless @links.nil?
+ @links = []
+ return @links if !doc
+
+ doc.css('a').each do |a|
+ u = a.attributes['href'].content rescue nil
+ next if u.nil? or u.empty?
+ abs = to_absolute(URI(u)) rescue next
+ @links << abs if in_domain?(abs)
  end
+ @links.uniq!
+ @links
  end
 
+ def discard_doc!
+ links # force parsing of page links before we trash the document
+ @doc = nil
+ end
 
  #
  # Return a new page with the same *response* and *url*, but
@@ -124,7 +92,7 @@ module Anemone
  # *page_hash* is a PageHash object with the results of the current crawl.
  #
  def links_and_their_aliases(page_hash)
- @links.inject([]) do |results, link|
+ links.inject([]) do |results, link|
  results.concat([link].concat(page_hash[link].aliases))
  end
  end
@@ -133,7 +101,7 @@ module Anemone
  # The content-type returned by the HTTP request for this page
  #
  def content_type
- @headers['content-type'][0] rescue nil
+ headers['content-type'].first
  end
 
  #
@@ -141,7 +109,7 @@ module Anemone
  # otherwise.
  #
  def html?
- (@content_type =~ /text\/html/) == 0
+ !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
  end
 
  #
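Page now builds its link list lazily: the first call to links parses the Nokogiri document, and discard_doc! forces that parse before dropping the document so the body can be released without losing the links. Illustrative use (placeholder URL, not from the gem's own docs):

    require 'anemone'

    page = Anemone::HTTP.new.fetch_page(URI('http://example.com/'))
    page.discard_doc!     # parses and caches the links, then drops the HTML document
    puts page.links.size  # links remain available after the doc is discarded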
data/lib/anemone/page_hash.rb CHANGED
@@ -14,6 +14,18 @@ module Anemone
  def has_key?(key)
  super(key.to_s)
  end
+
+ # Does this PageHash contain the specified URL?
+ # HTTP and HTTPS versions of a URL are considered to be the same page.
+ def has_page?(url)
+ schemes = %w(http https)
+ if schemes.include? url.scheme
+ u = url.dup
+ return schemes.any? { |s| u.scheme = s; has_key?(u) }
+ end
+
+ has_key?(url)
+ end
 
  #
  # Use a breadth-first search to calculate the single-source
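has_page? above treats the http and https forms of a URL as the same page by retrying the lookup under both schemes. A standalone sketch of that check (not from the gem; PageHash stores its keys as strings via has_key?):

    require 'uri'

    crawled = { 'http://example.com/' => :page }

    def has_page?(crawled, url)
      schemes = %w(http https)
      if schemes.include? url.scheme
        u = url.dup
        return schemes.any? { |s| u.scheme = s; crawled.key?(u.to_s) }
      end
      crawled.key?(url.to_s)
    end

    puts has_page?(crawled, URI('https://example.com/'))  # => true, the http form was crawled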
data/lib/anemone/tentacle.rb CHANGED
@@ -1,4 +1,4 @@
- require 'anemone/page'
+ require 'anemone/http'
 
  module Anemone
  class Tentacle
@@ -9,6 +9,7 @@ module Anemone
  def initialize(link_queue, page_queue)
  @link_queue = link_queue
  @page_queue = page_queue
+ @http = Anemone::HTTP.new
  end
 
  #
@@ -16,22 +17,16 @@ module Anemone
  # Page objects into @page_queue
  #
  def run
- while true do
+ loop do
  link, from_page = @link_queue.deq
 
  break if link == :END
-
- if from_page
- page = Page.fetch(link, from_page)
- else
- page = Page.fetch(link)
- end
-
- @page_queue.enq(page)
+
+ @page_queue.enq @http.fetch_page(link, from_page)
 
  sleep Anemone.options.delay
  end
  end
-
+
 
  end
data/spec/anemone_spec.rb CHANGED
@@ -1,6 +1,15 @@
  require File.dirname(__FILE__) + '/spec_helper'
 
  describe Anemone do
+
+ before(:all) do
+ Anemone::FakePage.new
+ end
+
+ after(:each) do
+ # reset global options object to defaults
+ Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
+ end
 
  it "should have a version" do
  Anemone.const_defined?('VERSION').should == true
@@ -17,6 +26,7 @@ describe Anemone do
  :user_agent => 'test',
  :obey_robots_txt => true,
  :depth_limit => 3)
+
  Anemone.options.verbose.should == false
  Anemone.options.threads.should == 2
  Anemone.options.discard_page_bodies.should == true
data/spec/core_spec.rb CHANGED
@@ -139,43 +139,39 @@ module Anemone
  urls.should_not include(pages[1].url)
  end
 
- it "should track the page depth and referer" do
- num_pages = 5
-
- pages = []
-
- num_pages.times do |n|
- # register this page with a link to the next page
- link = (n + 1).to_s if n + 1 < num_pages
- pages << FakePage.new(n.to_s, :links => [link].compact)
+ describe "many pages" do
+ before(:each) do
+ @pages, size = [], 5
+
+ size.times do |n|
+ # register this page with a link to the next page
+ link = (n + 1).to_s if n + 1 < size
+ @pages << FakePage.new(n.to_s, :links => Array(link))
+ end
  end
-
- core = Anemone.crawl(pages[0].url)
-
- num_pages.times do |n|
- page = core.pages[pages[n].url]
- page.depth.should == n
- page.referer.should == core.pages[pages[n-1].url].url if n > 0
+
+ it "should track the page depth and referer" do
+ core = Anemone.crawl(@pages[0].url)
+ previous_page = nil
+
+ @pages.each_with_index do |page, i|
+ page = core.pages[page.url]
+ page.should be
+ page.depth.should == i
+
+ if previous_page
+ page.referer.should == previous_page.url
+ else
+ page.referer.should be_nil
+ end
+ previous_page = page
+ end
  end
-
- core.pages[pages[0].url].referer.should == nil
- end
 
- it "should optionally limit the depth of the crawl" do
- num_pages = 5
-
- pages = []
-
- num_pages.times do |n|
- # register this page with a link to the next page
- link = (n + 1).to_s if n + 1 < num_pages
- pages << FakePage.new(n.to_s, :links => [link].compact)
+ it "should optionally limit the depth of the crawl" do
+ core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+ core.should have(4).pages
  end
-
- core = Anemone.crawl(pages[0].url, :depth_limit => 3)
-
- core.should have(4).pages
  end
-
  end
  end
data/spec/page_spec.rb CHANGED
@@ -2,14 +2,13 @@ require File.dirname(__FILE__) + '/spec_helper'
 
  module Anemone
  describe Page do
-
- before(:each) do
- @page = Page.fetch(FakePage.new('home').url)
+
+ before(:all) do
+ @http = Anemone::HTTP.new
  end
-
- it "should be able to fetch a page" do
- @page.should_not be_nil
- @page.url.to_s.should include('home')
+
+ before(:each) do
+ @page = @http.fetch_page(FakePage.new('home').url)
  end
 
  it "should store the response headers when fetching a page" do
@@ -35,7 +34,7 @@ module Anemone
 
  @page.redirect?.should == false
 
- Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+ @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
  end
 
  it "should have a method to tell if a URI is in the same domain as the page" do
@@ -44,6 +43,10 @@ module Anemone
  @page.in_domain?(URI(FakePage.new('test').url)).should == true
  @page.in_domain?(URI('http://www.other.com/')).should == false
  end
+
+ it "should include the response time for the HTTP request" do
+ @page.should respond_to(:response_time)
+ end
 
  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: anemone
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2009-09-07 00:00:00 -05:00
+ date: 2009-10-24 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -25,11 +25,7 @@ dependencies:
  description:
  email:
  executables:
- - anemone_count.rb
- - anemone_cron.rb
- - anemone_pagedepth.rb
- - anemone_serialize.rb
- - anemone_url_list.rb
+ - anemone
  extensions: []
 
  extra_rdoc_files:
@@ -37,11 +33,7 @@ extra_rdoc_files:
  files:
  - LICENSE.txt
  - README.rdoc
- - bin/anemone_count.rb
- - bin/anemone_cron.rb
- - bin/anemone_pagedepth.rb
- - bin/anemone_serialize.rb
- - bin/anemone_url_list.rb
+ - bin/anemone
  - lib/anemone.rb
  - lib/anemone/anemone.rb
  - lib/anemone/core.rb
@@ -49,6 +41,12 @@ files:
  - lib/anemone/page.rb
  - lib/anemone/page_hash.rb
  - lib/anemone/tentacle.rb
+ - lib/anemone/cli.rb
+ - lib/anemone/cli/url_list.rb
+ - lib/anemone/cli/cron.rb
+ - lib/anemone/cli/count.rb
+ - lib/anemone/cli/pagedepth.rb
+ - lib/anemone/cli/serialize.rb
  has_rdoc: true
  homepage: http://anemone.rubyforge.org
  post_install_message:
data/bin/anemone_count.rb DELETED
@@ -1,36 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs the total number
- # of unique pages on the site.
- #
- # == Usage
- # anemone_count.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
-
- def usage
- puts <<END
- Usage: anemone_count.rb url
- END
- end
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- usage
- Process.exit
- end
-
- Anemone.crawl(ARGV[0]) do |anemone|
- anemone.after_crawl do |pages|
- puts pages.uniq.size
- end
- end
-
-
data/bin/anemone_pagedepth.rb DELETED
@@ -1,44 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs a count of
- # the number of Pages at each depth in the site.
- #
- # == Usage
- # anemone_pagedepth.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
-
- def usage
- puts <<END
- Usage: anemone_pagedepth.rb url
- END
- end
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- usage
- Process.exit
- end
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
- anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
- anemone.after_crawl do |pages|
- pages = pages.shortest_paths!(root).uniq
- depths = pages.values.inject({}) do |depths, page|
- depths[page.depth] ||= 0
- depths[page.depth] += 1
- depths
- end
-
- depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
- end
- end
data/bin/anemone_serialize.rb DELETED
@@ -1,51 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and saves the resulting
- # PageHash object to a file using Marshal serialization.
- #
- # == Usage
- # anemone_serialize.rb [options] url
- #
- # == Options
- # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'ostruct'
-
- def usage
- puts <<END
- Usage: anemone_serialize.rb [options] url
-
- Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
- END
- end
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- usage
- Process.exit
- end
-
- options = OpenStruct.new
- options.output_file = "crawl.#{Time.now.to_i}"
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-o', '--output filename') {|o| options.output_file = o }
- opts.parse!(ARGV)
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
- anemone.after_crawl do |pages|
- open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
- end
- end
data/bin/anemone_url_list.rb DELETED
@@ -1,54 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs the URL of each page
- # in the domain as they are encountered.
- #
- # == Usage
- # anemone_url_list.rb [options] url
- #
- # == Options
- # -r, --relative Output relative URLs (rather than absolute)
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'ostruct'
-
- def usage
- puts <<END
- Usage: anemone_url_list.rb [options] url
-
- Options:
- -r, --relative Output relative URLs (rather than absolute)
- END
- end
-
- options = OpenStruct.new
- options.relative = false
-
- # make sure that the last option is a URL we can crawl
- begin
- URI(ARGV.last)
- rescue
- usage
- Process.exit
- end
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-r', '--relative') { options.relative = true }
- opts.parse!(ARGV)
-
- Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
- anemone.on_every_page do |page|
- if options.relative
- puts page.url.path
- else
- puts page.url
- end
- end
- end