anemone 0.2.0 → 0.2.1

data/README.rdoc CHANGED
@@ -1,18 +1,26 @@
  = Anemone

- == DESCRIPTION
  Anemone is a web spider framework that can spider a domain and collect useful
  information about the pages it visits. It is versatile, allowing you to
  write your own specialized spider tasks quickly and easily.

- == FEATURES
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
  * Multi-threaded design for high performance
  * Tracks 301 HTTP redirects to understand a page's aliases
  * Built-in BFS algorithm for determining page depth
  * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.

- == REQUIREMENTS
+ == Requirements
  * nokogiri

- == EXAMPLES
- See the +bin+ directory for several examples of useful Anemone tasks.
+ == Optional
+ * fizx-robots (required if obey_robots_txt is set to true)
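To make the feature list above concrete, here is a minimal usage sketch. It is not taken from the gem itself: the URL is a placeholder and the filter pattern is hypothetical; the options (:threads, :depth_limit) and hooks (focus_crawl, on_every_page) are the ones named in this README and in the code below.

  require 'anemone'

  # Placeholder domain; crawl at most 2 links deep with 4 fetcher threads.
  Anemone.crawl("http://example.com/", :depth_limit => 2, :threads => 4) do |anemone|
    # focus_crawl picks which of a page's links are followed
    anemone.focus_crawl do |page|
      page.links.reject { |link| link.path =~ /logout/ }   # hypothetical filter
    end

    # on_every_page runs as each page is fetched
    anemone.on_every_page do |page|
      puts "#{page.depth}  #{page.response_time}ms  #{page.url}"
    end
  end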
data/bin/anemone ADDED
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
@@ -3,42 +3,41 @@ require 'anemone/core'

  module Anemone
  # Version number
- VERSION = '0.2.0'
-
- #module-wide options
- def Anemone.options=(options)
- @options = options
- end
+ VERSION = '0.2.1'

- def Anemone.options
- @options
+ # default options
+ DEFAULTS = {
+ # run 4 Tentacle threads to fetch pages
+ :threads => 4,
+ # disable verbose output
+ :verbose => false,
+ # don't throw away the page response body after scanning it for links
+ :discard_page_bodies => false,
+ # identify self as Anemone/VERSION
+ :user_agent => "Anemone/#{VERSION}",
+ # no delay between requests
+ :delay => 0,
+ # don't obey the robots exclusion protocol
+ :obey_robots_txt => false,
+ # by default, don't limit the depth of the crawl
+ :depth_limit => false,
+ # number of times HTTP redirects will be followed
+ :redirect_limit => 5
+ }
+
+ def self.options
+ @options ||= OpenStruct.new(DEFAULTS)
  end

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(urls, options = {}, &block)
- Anemone.options = OpenStruct.new(options)
-
- # by default, run 4 Tentacle threads to fetch pages
- Anemone.options.threads ||= 4
-
- # disable verbose output by default
- Anemone.options.verbose ||= false
-
- # by default, don't throw away the page response body after scanning it for links
- Anemone.options.discard_page_bodies ||= false
-
- # by default, identify self as Anemone/VERSION
- Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+ options.each { |key, value| Anemone.options.send("#{key}=", value) }

- # no delay between requests by default
- Anemone.options.delay ||= 0
-
- # by default, don't obey the robots exclusion protocol
- if Anemone.options.obey_robots_txt ||= false
+ if Anemone.options.obey_robots_txt
  begin
- require 'robots'
+ require 'robots'
  rescue LoadError
  warn "To support the robot exclusion protocol, install the robots gem:\n" \
  "sudo gem sources -a http://gems.github.com\n" \
@@ -46,15 +45,10 @@ module Anemone
  exit
  end
  end
-
- # by default, don't limit the depth of the crawl
- Anemone.options.depth_limit ||= :infinity

  #use a single thread if a delay was requested
- if(Anemone.options.delay != 0)
- Anemone.options.threads = 1
- end
-
+ Anemone.options.threads = 1 if Anemone.options.delay > 0
+
  Core.crawl(urls, &block)
  end
  end
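A quick sketch of how the reworked option handling above behaves: DEFAULTS seeds a module-wide OpenStruct the first time Anemone.options is read, and any options passed to Anemone.crawl simply overwrite those fields. The URL below is a placeholder.

  require 'anemone'

  Anemone.options.threads     # => 4, seeded from DEFAULTS
  Anemone.options.user_agent  # => "Anemone/0.2.1"

  # Passed-in options are assigned onto the same object; a non-zero
  # :delay also forces the crawl down to a single thread.
  Anemone.crawl("http://example.com/", :delay => 1, :verbose => true) do |anemone|
    anemone.after_crawl { |pages| puts pages.size }
  end

  Anemone.options.delay    # => 1
  Anemone.options.threads  # => 1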
@@ -0,0 +1,24 @@
+ module Anemone
+ module CLI
+ COMMANDS = %w[count cron pagedepth serialize url-list]
+
+ def self.run
+ command = ARGV.shift
+
+ if COMMANDS.include? command
+ load "anemone/cli/#{command.tr('-', '_')}.rb"
+ else
+ puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+ anemone <command> [arguments]
+
+ Commands:
+ #{COMMANDS.join(', ')}
+ INFO
+ end
+ end
+ end
+ end
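Assuming the gem's bin/anemone executable is on the PATH, CLI.run dispatches to the per-command scripts added below; example.com is a placeholder:

  anemone count http://example.com/
  anemone pagedepth http://example.com/
  anemone url-list -r http://example.com/
  anemone serialize -o example.crawl http://example.com/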
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+ # make sure that the first option is a URL we can crawl
+ url = URI(ARGV[0])
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone count <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL and outputs the total number
+ of unique pages on the site.
+ INFO
+ exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+ anemone.after_crawl do |pages|
+ puts pages.uniq.size
+ end
+ end
@@ -1,44 +1,30 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Performs pagedepth, url list, and count functionality
- # Meant to be run daily as a cron job
- #
- # == Usage
- # anemone_url_list.rb [options] url
- #
- # == Options
- # -r, --relative Output relative URLs (rather than absolute)
- # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
  require 'anemone'
  require 'optparse'
  require 'ostruct'

- def usage
- puts <<END
- Usage: anemone_url_list.rb [options] url
-
- Options:
- -r, --relative Output relative URLs (rather than absolute)
- -o, --output filename Filename to save URL list to. Defautls to urls.txt.
- END
- end
-
  options = OpenStruct.new
  options.relative = false
  options.output_file = 'urls.txt'

- # make sure that the last option is a URL we can crawl
  begin
- URI(ARGV.last)
+ # make sure that the last argument is a URL we can crawl
+ root = URI(ARGV.last)
  rescue
- usage
- Process.exit
+ puts <<-INFO
+ Usage:
+ anemone cron [options] <url>
+
+ Synopsis:
+ Combination of `count`, `pagedepth` and `url-list` commands.
+ Performs pagedepth, url list, and count functionality.
+ Outputs results to STDOUT and link list to file (urls.txt).
+ Meant to be run daily as a cron job.
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ -o, --output filename Filename to save URL list to. Defautls to urls.txt.
+ INFO
+ exit(0)
  end

  # parse command-line options
@@ -47,8 +33,6 @@ opts.on('-r', '--relative') { options.relative = true }
  opts.on('-o', '--output filename') {|o| options.output_file = o }
  opts.parse!(ARGV)

- root = ARGV.last
-
  Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
@@ -101,6 +85,6 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
  url = options.relative ? url.path.to_s : url.to_s
  file.puts url
  end
-
  end
- end
+
+ end
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+ # make sure that the first option is a URL we can crawl
+ root = URI(ARGV[0])
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone pagedepth <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL and outputs a count of
+ the number of pages at each depth of the crawl.
+ INFO
+ exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+ anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+ anemone.after_crawl do |pages|
+ pages = pages.shortest_paths!(root).uniq
+
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+ end
+ end
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+ # make sure that the first option is a URL we can crawl
+ root = URI(ARGV[0])
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone serialize [options] <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL and saves the resulting
+ PageHash object to a file using Marshal serialization.
+
+ Options:
+ -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ INFO
+ exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+ anemone.after_crawl do |pages|
+ open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+ end
+ end
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+ # make sure that the last option is a URL we can crawl
+ root = URI(ARGV.last)
+ rescue
+ puts <<-INFO
+ Usage:
+ anemone url-list [options] <url>
+
+ Synopsis:
+ Crawls a site starting at the given URL, and outputs the URL of each page
+ in the domain as they are encountered.
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ INFO
+ exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+ anemone.on_every_page do |page|
+ if options.relative
+ puts page.url.path
+ else
+ puts page.url
+ end
+ end
+
+ end
data/lib/anemone/core.rb CHANGED
@@ -1,6 +1,7 @@
  require 'net/http'
  require 'thread'
  require 'anemone/tentacle'
+ require 'anemone/page'
  require 'anemone/page_hash'

  module Anemone
@@ -12,10 +13,10 @@ module Anemone
  # Initialize the crawl with starting *urls* (single URL or Array of URLs)
  # and optional *block*
  #
- def initialize(urls, &block)
- @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
+ def initialize(urls)
+ @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
  @urls.each{ |url| url.path = '/' if url.path.empty? }
-
+
  @tentacles = []
  @pages = PageHash.new
  @on_every_page_blocks = []
@@ -26,18 +27,17 @@ module Anemone
  if Anemone.options.obey_robots_txt
  @robots = Robots.new(Anemone.options.user_agent)
  end
-
- block.call(self) if block
+
+ yield self if block_given?
  end

  #
  # Convenience method to start a new crawl
  #
- def self.crawl(root, &block)
+ def self.crawl(root)
  self.new(root) do |core|
- block.call(core) if block
+ yield core if block_given?
  core.run
- return core
  end
  end

@@ -104,7 +104,7 @@ module Anemone
  link_queue = Queue.new
  page_queue = Queue.new

- Anemone.options.threads.times do |id|
+ Anemone.options.threads.times do
  @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
  end

@@ -120,7 +120,7 @@ module Anemone
  # perform the on_every_page blocks for this page
  do_page_blocks(page)

- page.doc = nil if Anemone.options.discard_page_bodies
+ page.discard_doc! if Anemone.options.discard_page_bodies

  links_to_follow(page).each do |link|
  link_queue.enq([link, page])
@@ -143,7 +143,7 @@ module Anemone
  end

  if page_queue.empty?
- @tentacles.size.times { |i| link_queue.enq(:END)}
+ @tentacles.size.times { link_queue.enq(:END)}
  break
  end
  end
@@ -207,7 +207,7 @@ module Anemone
  too_deep = false
  end

- !@pages.has_key?(link) and !skip_link?(link) and allowed and !too_deep
+ !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
  end

  #
@@ -216,7 +216,7 @@ module Anemone
  #
  def skip_link?(link)
  @skip_link_patterns.each { |p| return true if link.path =~ p}
- return false
+ false
  end

  end
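For reference, a small sketch of driving Core after these changes: Anemone.crawl still returns the Core instance (self.new returns it even without the removed `return core`), so the crawled PageHash stays accessible. The URL and skip pattern are placeholders.

  core = Anemone.crawl("http://example.com/") do |anemone|
    anemone.skip_links_like %r{/private/}
  end

  puts core.pages.size   # number of unique pages crawled
  core.pages.each_value { |page| puts "#{page.depth}  #{page.url}" }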
data/lib/anemone/http.rb CHANGED
@@ -1,16 +1,48 @@
- require 'net/http'
+ require 'net/https'
+ require 'anemone/page'

  module Anemone
- class HTTP < Net::HTTP
+ class HTTP
  # Maximum number of redirects to follow on each get_response
  REDIRECTION_LIMIT = 5
-
+
+ def initialize
+ @connections = {}
+ end
+
+ #
+ # Create a new Page from the response of an HTTP request to *url*
+ #
+ def fetch_page(url, from_page = nil)
+ begin
+ url = URI(url) unless url.is_a?(URI)
+
+ if from_page
+ referer = from_page.url
+ depth = from_page.depth + 1
+ end
+
+ response, code, location, response_time = get(url, referer)
+
+ aka = nil
+ if !url.eql?(location)
+ aka = location
+ end
+
+ return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+ rescue
+ return Page.new(url)
+ end
+ end
+
+ private
+
  #
  # Retrieve an HTTP response for *url*, following redirects.
  # Returns the response object, response code, and final URI location.
  #
- def self.get(url, referer = nil)
- response = get_response(url, referer)
+ def get(url, referer = nil)
+ response, response_time = get_response(url, referer)
  code = Integer(response.code)
  loc = url

@@ -18,17 +50,17 @@ module Anemone
  while response.is_a?(Net::HTTPRedirection) and limit > 0
  loc = URI(response['location'])
  loc = url.merge(loc) if loc.relative?
- response = get_response(loc, referer)
+ response, response_time = get_response(loc, referer)
  limit -= 1
  end

- return response, code, loc
+ return response, code, loc, response_time
  end

  #
  # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
  #
- def self.get_response(url, referer = nil)
+ def get_response(url, referer = nil)
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
  user_agent = Anemone.options.user_agent rescue nil

@@ -36,9 +68,37 @@ module Anemone
  opts['User-Agent'] = user_agent if user_agent
  opts['Referer'] = referer.to_s if referer

- Net::HTTP.start(url.host, url.port) do |http|
- return http.get(full_path, opts)
+ retries = 0
+ begin
+ start = Time.now()
+ response = connection(url).get(full_path, opts)
+ finish = Time.now()
+ response_time = ((finish - start) * 1000).round
+ return response, response_time
+ rescue EOFError
+ refresh_connection(url)
+ retries += 1
+ retry unless retries > 1
+ end
+ end
+
+ def connection(url)
+ @connections[url.host] ||= {}
+
+ if conn = @connections[url.host][url.port]
+ return conn
+ end
+
+ refresh_connection(url)
+ end
+
+ def refresh_connection(url)
+ http = Net::HTTP.new(url.host, url.port)
+ if url.scheme == 'https'
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
+ @connections[url.host][url.port] = http.start
  end
  end
  end
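A sketch of the reworked HTTP class used on its own: connections are cached per host and port, HTTPS URLs get an SSL connection, and each fetch records its response time. Normally Tentacle calls fetch_page for you; the URL here is a placeholder.

  require 'anemone/http'

  http = Anemone::HTTP.new
  page = http.fetch_page("http://example.com/")

  puts page.code            # status code of the final (post-redirect) response
  puts page.response_time   # request time in milliseconds
  puts page.links.size      # in-domain links parsed from the body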
data/lib/anemone/page.rb CHANGED
@@ -1,4 +1,3 @@
- require 'anemone/http'
  require 'nokogiri'
  require 'ostruct'

@@ -7,8 +6,6 @@ module Anemone

  # The URL of the page
  attr_reader :url
- # Array of distinct A tag HREFs from the page
- attr_reader :links
  # Headers of the HTTP response
  attr_reader :headers

@@ -27,74 +24,45 @@ module Anemone
  attr_accessor :depth
  # URL of the page that brought us to this page
  attr_accessor :referer
-
- #
- # Create a new Page from the response of an HTTP request to *url*
- #
- def self.fetch(url, from_page = nil)
- begin
- url = URI(url) unless url.is_a?(URI)
-
- if from_page
- referer = from_page.url
- depth = from_page.depth + 1
- end
-
- response, code, location = Anemone::HTTP.get(url, referer)
-
- aka = nil
- if !url.eql?(location)
- aka = location
- end
-
- return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
- rescue
- return Page.new(url)
- end
- end
+ # Response time of the request for this page in milliseconds
+ attr_accessor :response_time

  #
  # Create a new page
  #
- def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+ def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
  @url = url
  @code = code
  @headers = headers
- @links = []
- @aliases = []
+ @headers['content-type'] ||= ['']
+ @aliases = Array(aka)
  @data = OpenStruct.new
  @referer = referer
  @depth = depth || 0
+ @response_time = response_time
+ @doc = Nokogiri::HTML(body) if body && html? rescue nil
+ end

- @aliases << aka if !aka.nil?
-
- if body
- begin
- @doc = Nokogiri::HTML(body)
- rescue
- return
- end
-
- return if @doc.nil?
-
- #get a list of distinct links on the page, in absolute url form
- @doc.css('a').each do |a|
- u = a.attributes['href'].content if a.attributes['href']
- next if u.nil?
-
- begin
- abs = to_absolute(URI(u))
- rescue
- next
- end
-
- @links << abs if in_domain?(abs)
- end
-
- @links.uniq!
+ # Array of distinct A tag HREFs from the page
+ def links
+ return @links unless @links.nil?
+ @links = []
+ return @links if !doc
+
+ doc.css('a').each do |a|
+ u = a.attributes['href'].content rescue nil
+ next if u.nil? or u.empty?
+ abs = to_absolute(URI(u)) rescue next
+ @links << abs if in_domain?(abs)
  end
+ @links.uniq!
+ @links
  end

+ def discard_doc!
+ links # force parsing of page links before we trash the document
+ @doc = nil
+ end

  #
  # Return a new page with the same *response* and *url*, but
@@ -124,7 +92,7 @@ module Anemone
  # *page_hash* is a PageHash object with the results of the current crawl.
  #
  def links_and_their_aliases(page_hash)
- @links.inject([]) do |results, link|
+ links.inject([]) do |results, link|
  results.concat([link].concat(page_hash[link].aliases))
  end
  end
@@ -133,7 +101,7 @@ module Anemone
  # The content-type returned by the HTTP request for this page
  #
  def content_type
- @headers['content-type'][0] rescue nil
+ headers['content-type'].first
  end

  #
@@ -141,7 +109,7 @@ module Anemone
  # otherwise.
  #
  def html?
- (@content_type =~ /text\/html/) == 0
+ !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
  end

  #
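A brief sketch of the lazy link handling above, with `page` fetched as in the earlier HTTP example:

  page.links          # <a href> targets are parsed out of the Nokogiri doc on first access
  page.discard_doc!   # memoizes the links, then drops the parsed document to save memory
  page.links          # still answered from the memoized array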
@@ -14,6 +14,18 @@ module Anemone
  def has_key?(key)
  super(key.to_s)
  end
+
+ # Does this PageHash contain the specified URL?
+ # HTTP and HTTPS versions of a URL are considered to be the same page.
+ def has_page?(url)
+ schemes = %w(http https)
+ if schemes.include? url.scheme
+ u = url.dup
+ return schemes.any? { |s| u.scheme = s; has_key?(u) }
+ end
+
+ has_key?(url)
+ end

  #
  # Use a breadth-first search to calculate the single-source
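To illustrate has_page? above: a URL stored under one scheme is also found under the other. Here `pages` stands for the PageHash produced by a crawl, and the URL is a placeholder.

  pages.has_page?(URI('http://example.com/about'))   # true if that page was crawled
  pages.has_page?(URI('https://example.com/about'))  # also true, same page under https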
@@ -1,4 +1,4 @@
- require 'anemone/page'
+ require 'anemone/http'

  module Anemone
  class Tentacle
@@ -9,6 +9,7 @@ module Anemone
  def initialize(link_queue, page_queue)
  @link_queue = link_queue
  @page_queue = page_queue
+ @http = Anemone::HTTP.new
  end

  #
@@ -16,22 +17,16 @@ module Anemone
  # Page objects into @page_queue
  #
  def run
- while true do
+ loop do
  link, from_page = @link_queue.deq

  break if link == :END
-
- if from_page
- page = Page.fetch(link, from_page)
- else
- page = Page.fetch(link)
- end
-
- @page_queue.enq(page)
+
+ @page_queue.enq @http.fetch_page(link, from_page)

  sleep Anemone.options.delay
  end
  end
-
+
  end
  end
data/spec/anemone_spec.rb CHANGED
@@ -1,6 +1,15 @@
  require File.dirname(__FILE__) + '/spec_helper'

  describe Anemone do
+
+ before(:all) do
+ Anemone::FakePage.new
+ end
+
+ after(:each) do
+ # reset global options object to defaults
+ Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
+ end

  it "should have a version" do
  Anemone.const_defined?('VERSION').should == true
@@ -17,6 +26,7 @@ describe Anemone do
  :user_agent => 'test',
  :obey_robots_txt => true,
  :depth_limit => 3)
+
  Anemone.options.verbose.should == false
  Anemone.options.threads.should == 2
  Anemone.options.discard_page_bodies.should == true
data/spec/core_spec.rb CHANGED
@@ -139,43 +139,39 @@ module Anemone
  urls.should_not include(pages[1].url)
  end

- it "should track the page depth and referer" do
- num_pages = 5
-
- pages = []
-
- num_pages.times do |n|
- # register this page with a link to the next page
- link = (n + 1).to_s if n + 1 < num_pages
- pages << FakePage.new(n.to_s, :links => [link].compact)
+ describe "many pages" do
+ before(:each) do
+ @pages, size = [], 5
+
+ size.times do |n|
+ # register this page with a link to the next page
+ link = (n + 1).to_s if n + 1 < size
+ @pages << FakePage.new(n.to_s, :links => Array(link))
+ end
  end
-
- core = Anemone.crawl(pages[0].url)
-
- num_pages.times do |n|
- page = core.pages[pages[n].url]
- page.depth.should == n
- page.referer.should == core.pages[pages[n-1].url].url if n > 0
+
+ it "should track the page depth and referer" do
+ core = Anemone.crawl(@pages[0].url)
+ previous_page = nil
+
+ @pages.each_with_index do |page, i|
+ page = core.pages[page.url]
+ page.should be
+ page.depth.should == i
+
+ if previous_page
+ page.referer.should == previous_page.url
+ else
+ page.referer.should be_nil
+ end
+ previous_page = page
+ end
  end
-
- core.pages[pages[0].url].referer.should == nil
- end

- it "should optionally limit the depth of the crawl" do
- num_pages = 5
-
- pages = []
-
- num_pages.times do |n|
- # register this page with a link to the next page
- link = (n + 1).to_s if n + 1 < num_pages
- pages << FakePage.new(n.to_s, :links => [link].compact)
+ it "should optionally limit the depth of the crawl" do
+ core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+ core.should have(4).pages
  end
-
- core = Anemone.crawl(pages[0].url, :depth_limit => 3)
-
- core.should have(4).pages
  end
-
  end
  end
data/spec/page_spec.rb CHANGED
@@ -2,14 +2,13 @@ require File.dirname(__FILE__) + '/spec_helper'

  module Anemone
  describe Page do
-
- before(:each) do
- @page = Page.fetch(FakePage.new('home').url)
+
+ before(:all) do
+ @http = Anemone::HTTP.new
  end
-
- it "should be able to fetch a page" do
- @page.should_not be_nil
- @page.url.to_s.should include('home')
+
+ before(:each) do
+ @page = @http.fetch_page(FakePage.new('home').url)
  end

  it "should store the response headers when fetching a page" do
@@ -35,7 +34,7 @@ module Anemone

  @page.redirect?.should == false

- Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+ @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
  end

  it "should have a method to tell if a URI is in the same domain as the page" do
@@ -44,6 +43,10 @@ module Anemone
  @page.in_domain?(URI(FakePage.new('test').url)).should == true
  @page.in_domain?(URI('http://www.other.com/')).should == false
  end
+
+ it "should include the response time for the HTTP request" do
+ @page.should respond_to(:response_time)
+ end

  end
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: anemone
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2009-09-07 00:00:00 -05:00
+ date: 2009-10-24 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -25,11 +25,7 @@ dependencies:
  description:
  email:
  executables:
- - anemone_count.rb
- - anemone_cron.rb
- - anemone_pagedepth.rb
- - anemone_serialize.rb
- - anemone_url_list.rb
+ - anemone
  extensions: []

  extra_rdoc_files:
@@ -37,11 +33,7 @@ extra_rdoc_files:
  files:
  - LICENSE.txt
  - README.rdoc
- - bin/anemone_count.rb
- - bin/anemone_cron.rb
- - bin/anemone_pagedepth.rb
- - bin/anemone_serialize.rb
- - bin/anemone_url_list.rb
+ - bin/anemone
  - lib/anemone.rb
  - lib/anemone/anemone.rb
  - lib/anemone/core.rb
@@ -49,6 +41,12 @@ files:
  - lib/anemone/page.rb
  - lib/anemone/page_hash.rb
  - lib/anemone/tentacle.rb
+ - lib/anemone/cli.rb
+ - lib/anemone/cli/url_list.rb
+ - lib/anemone/cli/cron.rb
+ - lib/anemone/cli/count.rb
+ - lib/anemone/cli/pagedepth.rb
+ - lib/anemone/cli/serialize.rb
  has_rdoc: true
  homepage: http://anemone.rubyforge.org
  post_install_message:
data/bin/anemone_count.rb DELETED
@@ -1,36 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs the total number
- # of unique pages on the site.
- #
- # == Usage
- # anemone_count.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
-
- def usage
- puts <<END
- Usage: anemone_count.rb url
- END
- end
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- usage
- Process.exit
- end
-
- Anemone.crawl(ARGV[0]) do |anemone|
- anemone.after_crawl do |pages|
- puts pages.uniq.size
- end
- end
-
-
@@ -1,44 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs a count of
- # the number of Pages at each depth in the site.
- #
- # == Usage
- # anemone_pagedepth.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
-
- def usage
- puts <<END
- Usage: anemone_pagedepth.rb url
- END
- end
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- usage
- Process.exit
- end
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
- anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
- anemone.after_crawl do |pages|
- pages = pages.shortest_paths!(root).uniq
- depths = pages.values.inject({}) do |depths, page|
- depths[page.depth] ||= 0
- depths[page.depth] += 1
- depths
- end
-
- depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
- end
- end
@@ -1,51 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and saves the resulting
- # PageHash object to a file using Marshal serialization.
- #
- # == Usage
- # anemone_serialize.rb [options] url
- #
- # == Options
- # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'ostruct'
-
- def usage
- puts <<END
- Usage: anemone_serialize.rb [options] url
-
- Options:
- -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
- END
- end
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- usage
- Process.exit
- end
-
- options = OpenStruct.new
- options.output_file = "crawl.#{Time.now.to_i}"
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-o', '--output filename') {|o| options.output_file = o }
- opts.parse!(ARGV)
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
- anemone.after_crawl do |pages|
- open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
- end
- end
@@ -1,54 +0,0 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs the URL of each page
- # in the domain as they are encountered.
- #
- # == Usage
- # anemone_url_list.rb [options] url
- #
- # == Options
- # -r, --relative Output relative URLs (rather than absolute)
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'ostruct'
-
- def usage
- puts <<END
- Usage: anemone_url_list.rb [options] url
-
- Options:
- -r, --relative Output relative URLs (rather than absolute)
- END
- end
-
- options = OpenStruct.new
- options.relative = false
-
- # make sure that the last option is a URL we can crawl
- begin
- URI(ARGV.last)
- rescue
- usage
- Process.exit
- end
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-r', '--relative') { options.relative = true }
- opts.parse!(ARGV)
-
- Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
- anemone.on_every_page do |page|
- if options.relative
- puts page.url.path
- else
- puts page.url
- end
- end
- end