anemone 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +13 -5
- data/bin/anemone +4 -0
- data/lib/anemone/anemone.rb +28 -34
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/{bin/anemone_cron.rb → lib/anemone/cli/cron.rb} +19 -35
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +13 -13
- data/lib/anemone/http.rb +70 -10
- data/lib/anemone/page.rb +28 -60
- data/lib/anemone/page_hash.rb +12 -0
- data/lib/anemone/tentacle.rb +6 -11
- data/spec/anemone_spec.rb +10 -0
- data/spec/core_spec.rb +29 -33
- data/spec/page_spec.rb +11 -8
- metadata +10 -12
- data/bin/anemone_count.rb +0 -36
- data/bin/anemone_pagedepth.rb +0 -44
- data/bin/anemone_serialize.rb +0 -51
- data/bin/anemone_url_list.rb +0 -54
data/README.rdoc
CHANGED
@@ -1,18 +1,26 @@
 = Anemone
 
-== DESCRIPTION
 Anemone is a web spider framework that can spider a domain and collect useful
 information about the pages it visits. It is versatile, allowing you to
 write your own specialized spider tasks quickly and easily.
 
-
+See http://anemone.rubyforge.org for more information.
+
+== Features
 * Multi-threaded design for high performance
 * Tracks 301 HTTP redirects to understand a page's aliases
 * Built-in BFS algorithm for determining page depth
 * Allows exclusion of URLs based on regular expressions
+* Choose the links to follow on each page with focus_crawl()
+* HTTPS support
+* Records response time for each page
+* CLI program can list all pages in a domain, calculate page depths, and more
+
+== Examples
+See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
 
-==
+== Requirements
 * nokogiri
 
-==
-
+== Optional
+* fizx-robots (required if obey_robots_txt is set to true)
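The feature list above maps directly onto the block-based crawl API. A minimal illustrative sketch against 0.2.1 (the start URL and the link filter are placeholders, not part of the package):

  require 'anemone'

  Anemone.crawl("http://example.com/", :threads => 2, :depth_limit => 3) do |anemone|
    # focus_crawl: return only the links you want followed from each page
    anemone.focus_crawl do |page|
      page.links.select { |uri| uri.path =~ %r{^/docs} }
    end

    # response_time (in milliseconds) is new in this release
    anemone.on_every_page do |page|
      puts "#{page.url} (#{page.response_time} ms)"
    end

    anemone.after_crawl do |pages|
      puts "#{pages.uniq.size} unique pages"
    end
  end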
data/bin/anemone
ADDED
data/lib/anemone/anemone.rb
CHANGED
@@ -3,42 +3,41 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.2.0'
-
-  #module-wide options
-  def Anemone.options=(options)
-    @options = options
-  end
+  VERSION = '0.2.1'
 
-
-
+  # default options
+  DEFAULTS = {
+    # run 4 Tentacle threads to fetch pages
+    :threads => 4,
+    # disable verbose output
+    :verbose => false,
+    # don't throw away the page response body after scanning it for links
+    :discard_page_bodies => false,
+    # identify self as Anemone/VERSION
+    :user_agent => "Anemone/#{VERSION}",
+    # no delay between requests
+    :delay => 0,
+    # don't obey the robots exclusion protocol
+    :obey_robots_txt => false,
+    # by default, don't limit the depth of the crawl
+    :depth_limit => false,
+    # number of times HTTP redirects will be followed
+    :redirect_limit => 5
+  }
+
+  def self.options
+    @options ||= OpenStruct.new(DEFAULTS)
   end
 
   #
   # Convenience method to start a crawl using Core
   #
   def Anemone.crawl(urls, options = {}, &block)
-    Anemone.options
-
-    # by default, run 4 Tentacle threads to fetch pages
-    Anemone.options.threads ||= 4
-
-    # disable verbose output by default
-    Anemone.options.verbose ||= false
-
-    # by default, don't throw away the page response body after scanning it for links
-    Anemone.options.discard_page_bodies ||= false
-
-    # by default, identify self as Anemone/VERSION
-    Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+    options.each { |key, value| Anemone.options.send("#{key}=", value) }
 
-
-    Anemone.options.delay ||= 0
-
-    # by default, don't obey the robots exclusion protocol
-    if Anemone.options.obey_robots_txt ||= false
+    if Anemone.options.obey_robots_txt
       begin
-
+        require 'robots'
       rescue LoadError
        warn "To support the robot exclusion protocol, install the robots gem:\n" \
             "sudo gem sources -a http://gems.github.com\n" \
@@ -46,15 +45,10 @@ module Anemone
        exit
      end
    end
-
-    # by default, don't limit the depth of the crawl
-    Anemone.options.depth_limit ||= :infinity
 
    #use a single thread if a delay was requested
-    if
-
-    end
-
+    Anemone.options.threads = 1 if Anemone.options.delay > 0
+
    Core.crawl(urls, &block)
  end
 end
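The practical effect of the new DEFAULTS hash and the OpenStruct-backed Anemone.options is that per-crawl options simply overwrite module-wide defaults. A short sketch using only names shown in the diff above:

  require 'anemone'

  Anemone.options.threads      # => 4, from DEFAULTS
  Anemone.options.user_agent   # => "Anemone/0.2.1"

  # Anemone.crawl applies caller-supplied options by assignment onto the
  # same struct, exactly as the options.each line above does:
  { :threads => 2, :verbose => true }.each do |key, value|
    Anemone.options.send("#{key}=", value)
  end

  Anemone.options.threads      # => 2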
data/lib/anemone/cli.rb
ADDED
@@ -0,0 +1,24 @@
+module Anemone
+  module CLI
+    COMMANDS = %w[count cron pagedepth serialize url-list]
+
+    def self.run
+      command = ARGV.shift
+
+      if COMMANDS.include? command
+        load "anemone/cli/#{command.tr('-', '_')}.rb"
+      else
+        puts <<-INFO
+Anemone is a web spider framework that can collect
+useful information about pages it visits.
+
+Usage:
+  anemone <command> [arguments]
+
+Commands:
+  #{COMMANDS.join(', ')}
+        INFO
+      end
+    end
+  end
+end
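CLI.run shifts the sub-command off ARGV and loads the matching script from lib/anemone/cli. A minimal sketch of what running `anemone pagedepth <url>` amounts to, assuming the gem's lib directory is on the load path (the URL is a placeholder):

  require 'anemone/cli'

  # roughly equivalent to running: anemone pagedepth http://example.com/
  ARGV.replace(%w[pagedepth http://example.com/])
  Anemone::CLI.run   # shifts "pagedepth", then loads anemone/cli/pagedepth.rb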
data/lib/anemone/cli/count.rb
ADDED
@@ -0,0 +1,22 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  url = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone count <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs the total number
+  of unique pages on the site.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(url) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
data/{bin/anemone_cron.rb → lib/anemone/cli/cron.rb}
CHANGED
@@ -1,44 +1,30 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Performs pagedepth, url list, and count functionality
-# Meant to be run daily as a cron job
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
 require 'anemone'
 require 'optparse'
 require 'ostruct'
 
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-
-Options:
-  -r, --relative Output relative URLs (rather than absolute)
-  -o, --output filename Filename to save URL list to. Defautls to urls.txt.
-END
-end
-
 options = OpenStruct.new
 options.relative = false
 options.output_file = 'urls.txt'
 
-# make sure that the last option is a URL we can crawl
 begin
-
+  # make sure that the last argument is a URL we can crawl
+  root = URI(ARGV.last)
 rescue
-
-
+  puts <<-INFO
+Usage:
+  anemone cron [options] <url>
+
+Synopsis:
+  Combination of `count`, `pagedepth` and `url-list` commands.
+  Performs pagedepth, url list, and count functionality.
+  Outputs results to STDOUT and link list to file (urls.txt).
+  Meant to be run daily as a cron job.
+
+Options:
+  -r, --relative Output relative URLs (rather than absolute)
+  -o, --output filename Filename to save URL list to. Defautls to urls.txt.
+INFO
+  exit(0)
 end
 
 # parse command-line options
@@ -47,8 +33,6 @@ opts.on('-r', '--relative') { options.relative = true }
 opts.on('-o', '--output filename') {|o| options.output_file = o }
 opts.parse!(ARGV)
 
-root = ARGV.last
-
 Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
 
   anemone.after_crawl do |pages|
@@ -101,6 +85,6 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end
-
  end
-
+
+end
data/lib/anemone/cli/pagedepth.rb
ADDED
@@ -0,0 +1,32 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone pagedepth <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs a count of
+  the number of pages at each depth of the crawl.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
+end
data/lib/anemone/cli/serialize.rb
ADDED
@@ -0,0 +1,35 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone serialize [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and saves the resulting
+  PageHash object to a file using Marshal serialization.
+
+Options:
+  -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+INFO
+  exit(0)
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
+end
data/lib/anemone/cli/url_list.rb
ADDED
@@ -0,0 +1,41 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+begin
+  # make sure that the last option is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  anemone url-list [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL, and outputs the URL of each page
+  in the domain as they are encountered.
+
+Options:
+  -r, --relative Output relative URLs (rather than absolute)
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+
+end
data/lib/anemone/core.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'net/http'
 require 'thread'
 require 'anemone/tentacle'
+require 'anemone/page'
 require 'anemone/page_hash'
 
 module Anemone
@@ -12,10 +13,10 @@ module Anemone
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
-    def initialize(urls
-      @urls = [urls].flatten.map{ |url| URI
+    def initialize(urls)
+      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }
-
+
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
@@ -26,18 +27,17 @@ module Anemone
      if Anemone.options.obey_robots_txt
        @robots = Robots.new(Anemone.options.user_agent)
      end
-
-
+
+      yield self if block_given?
    end
 
    #
    # Convenience method to start a new crawl
    #
-    def self.crawl(root
+    def self.crawl(root)
      self.new(root) do |core|
-
+        yield core if block_given?
        core.run
-        return core
      end
    end
 
@@ -104,7 +104,7 @@ module Anemone
      link_queue = Queue.new
      page_queue = Queue.new
 
-      Anemone.options.threads.times do
+      Anemone.options.threads.times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end
 
@@ -120,7 +120,7 @@ module Anemone
        # perform the on_every_page blocks for this page
        do_page_blocks(page)
 
-        page.
+        page.discard_doc! if Anemone.options.discard_page_bodies
 
        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
@@ -143,7 +143,7 @@ module Anemone
        end
 
        if page_queue.empty?
-          @tentacles.size.times {
+          @tentacles.size.times { link_queue.enq(:END)}
          break
        end
      end
@@ -207,7 +207,7 @@ module Anemone
        too_deep = false
      end
 
-      !@pages.
+      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
    end
 
    #
@@ -216,7 +216,7 @@ module Anemone
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p}
-
+      false
    end
 
  end
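One user-visible effect of the initialize change above: start URLs may now be given as URI objects or strings, singly or as an array, and the crawl block is yielded via block_given? instead of a stored proc. An illustrative sketch (placeholder URLs):

  require 'anemone'

  starts = [URI('http://example.com/'), 'http://example.com/docs/']

  Anemone.crawl(starts) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end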
data/lib/anemone/http.rb
CHANGED
@@ -1,16 +1,48 @@
-require 'net/
+require 'net/https'
+require 'anemone/page'
 
 module Anemone
-  class HTTP
+  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5
-
+
+    def initialize
+      @connections = {}
+    end
+
+    #
+    # Create a new Page from the response of an HTTP request to *url*
+    #
+    def fetch_page(url, from_page = nil)
+      begin
+        url = URI(url) unless url.is_a?(URI)
+
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location, response_time = get(url, referer)
+
+        aka = nil
+        if !url.eql?(location)
+          aka = location
+        end
+
+        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+      rescue
+        return Page.new(url)
+      end
+    end
+
+    private
+
    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
-    def
-      response = get_response(url, referer)
+    def get(url, referer = nil)
+      response, response_time = get_response(url, referer)
      code = Integer(response.code)
      loc = url
 
@@ -18,17 +50,17 @@ module Anemone
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
-        response = get_response(loc, referer)
+        response, response_time = get_response(loc, referer)
        limit -= 1
      end
 
-      return response, code, loc
+      return response, code, loc, response_time
    end
 
    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
-    def
+    def get_response(url, referer = nil)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
      user_agent = Anemone.options.user_agent rescue nil
 
@@ -36,9 +68,37 @@ module Anemone
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
 
-
-
+      retries = 0
+      begin
+        start = Time.now()
+        response = connection(url).get(full_path, opts)
+        finish = Time.now()
+        response_time = ((finish - start) * 1000).round
+        return response, response_time
+      rescue EOFError
+        refresh_connection(url)
+        retries += 1
+        retry unless retries > 1
+      end
+    end
+
+    def connection(url)
+      @connections[url.host] ||= {}
+
+      if conn = @connections[url.host][url.port]
+        return conn
+      end
+
+      refresh_connection(url)
+    end
+
+    def refresh_connection(url)
+      http = Net::HTTP.new(url.host, url.port)
+      if url.scheme == 'https'
+        http.use_ssl = true
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
+      @connections[url.host][url.port] = http.start
    end
  end
 end
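Anemone::HTTP is now an instantiable object, so each Tentacle can keep its own cache of one persistent Net::HTTP connection per host:port, and fetch_page returns a Page carrying the new response_time. A standalone sketch (placeholder URL; assumes the 'anemone' lib directory is on the load path):

  require 'anemone'

  http = Anemone::HTTP.new
  page = http.fetch_page('https://example.com/')   # HTTPS handled via use_ssl

  puts page.content_type    # e.g. "text/html"
  puts page.response_time   # milliseconds, measured around the GET

  # a second request to the same host:port reuses the cached connection
  http.fetch_page('https://example.com/about')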
data/lib/anemone/page.rb
CHANGED
@@ -1,4 +1,3 @@
-require 'anemone/http'
 require 'nokogiri'
 require 'ostruct'
 
@@ -7,8 +6,6 @@ module Anemone
 
    # The URL of the page
    attr_reader :url
-    # Array of distinct A tag HREFs from the page
-    attr_reader :links
    # Headers of the HTTP response
    attr_reader :headers
 
@@ -27,74 +24,45 @@ module Anemone
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
-
-
-    # Create a new Page from the response of an HTTP request to *url*
-    #
-    def self.fetch(url, from_page = nil)
-      begin
-        url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
-        end
-
-        response, code, location = Anemone::HTTP.get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
-      rescue
-        return Page.new(url)
-      end
-    end
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
 
    #
    # Create a new page
    #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
      @url = url
      @code = code
      @headers = headers
-      @
-      @aliases =
+      @headers['content-type'] ||= ['']
+      @aliases = Array(aka)
      @data = OpenStruct.new
      @referer = referer
      @depth = depth || 0
+      @response_time = response_time
+      @doc = Nokogiri::HTML(body) if body && html? rescue nil
+    end
 
-
-
-
-
-
-
-
-
-
-
-
-    #get a list of distinct links on the page, in absolute url form
-    @doc.css('a').each do |a|
-      u = a.attributes['href'].content if a.attributes['href']
-      next if u.nil?
-
-      begin
-        abs = to_absolute(URI(u))
-      rescue
-        next
-      end
-
-      @links << abs if in_domain?(abs)
-    end
-
-    @links.uniq!
+    # Array of distinct A tag HREFs from the page
+    def links
+      return @links unless @links.nil?
+      @links = []
+      return @links if !doc
+
+      doc.css('a').each do |a|
+        u = a.attributes['href'].content rescue nil
+        next if u.nil? or u.empty?
+        abs = to_absolute(URI(u)) rescue next
+        @links << abs if in_domain?(abs)
      end
+      @links.uniq!
+      @links
    end
 
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = nil
+    end
 
    #
    # Return a new page with the same *response* and *url*, but
@@ -124,7 +92,7 @@ module Anemone
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
-
+      links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end
@@ -133,7 +101,7 @@ module Anemone
    # The content-type returned by the HTTP request for this page
    #
    def content_type
-
+      headers['content-type'].first
    end
 
    #
@@ -141,7 +109,7 @@ module Anemone
    # otherwise.
    #
    def html?
-      (
+      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
    end
 
    #
data/lib/anemone/page_hash.rb
CHANGED
@@ -14,6 +14,18 @@ module Anemone
    def has_key?(key)
      super(key.to_s)
    end
+
+    # Does this PageHash contain the specified URL?
+    # HTTP and HTTPS versions of a URL are considered to be the same page.
+    def has_page?(url)
+      schemes = %w(http https)
+      if schemes.include? url.scheme
+        u = url.dup
+        return schemes.any? { |s| u.scheme = s; has_key?(u) }
+      end
+
+      has_key?(url)
+    end
 
    #
    # Use a breadth-first search to calculate the single-source
data/lib/anemone/tentacle.rb
CHANGED
@@ -1,4 +1,4 @@
-require 'anemone/
+require 'anemone/http'
 
 module Anemone
  class Tentacle
@@ -9,6 +9,7 @@ module Anemone
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
+      @http = Anemone::HTTP.new
    end
 
    #
@@ -16,22 +17,16 @@ module Anemone
    # Page objects into @page_queue
    #
    def run
-
+      loop do
        link, from_page = @link_queue.deq
 
        break if link == :END
-
-
-          page = Page.fetch(link, from_page)
-        else
-          page = Page.fetch(link)
-        end
-
-        @page_queue.enq(page)
+
+        @page_queue.enq @http.fetch_page(link, from_page)
 
        sleep Anemone.options.delay
      end
    end
-
+
  end
 end
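Each Tentacle now owns its own HTTP instance (and therefore its own connection cache), while the :END sentinel remains how Core stops the loop. A minimal sketch of the queue protocol Core uses (placeholder URL):

  require 'anemone'

  link_queue, page_queue = Queue.new, Queue.new

  worker = Thread.new { Anemone::Tentacle.new(link_queue, page_queue).run }

  link_queue.enq([URI('http://example.com/'), nil])   # [link, from_page], as Core enqueues them
  link_queue.enq(:END)                                # sentinel that breaks the loop

  worker.join
  puts page_queue.deq.url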
data/spec/anemone_spec.rb
CHANGED
@@ -1,6 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
+
+  before(:all) do
+    Anemone::FakePage.new
+  end
+
+  after(:each) do
+    # reset global options object to defaults
+    Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
+  end
 
  it "should have a version" do
    Anemone.const_defined?('VERSION').should == true
@@ -17,6 +26,7 @@ describe Anemone do
                   :user_agent => 'test',
                   :obey_robots_txt => true,
                   :depth_limit => 3)
+
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
data/spec/core_spec.rb
CHANGED
@@ -139,43 +139,39 @@ module Anemone
      urls.should_not include(pages[1].url)
    end
 
-
-
-
-
-
-
-
-
-
+    describe "many pages" do
+      before(:each) do
+        @pages, size = [], 5
+
+        size.times do |n|
+          # register this page with a link to the next page
+          link = (n + 1).to_s if n + 1 < size
+          @pages << FakePage.new(n.to_s, :links => Array(link))
+        end
      end
-
-
-
-
-
-
-
+
+      it "should track the page depth and referer" do
+        core = Anemone.crawl(@pages[0].url)
+        previous_page = nil
+
+        @pages.each_with_index do |page, i|
+          page = core.pages[page.url]
+          page.should be
+          page.depth.should == i
+
+          if previous_page
+            page.referer.should == previous_page.url
+          else
+            page.referer.should be_nil
+          end
+          previous_page = page
+        end
      end
-
-      core.pages[pages[0].url].referer.should == nil
-    end
 
-
-
-
-      pages = []
-
-      num_pages.times do |n|
-        # register this page with a link to the next page
-        link = (n + 1).to_s if n + 1 < num_pages
-        pages << FakePage.new(n.to_s, :links => [link].compact)
+      it "should optionally limit the depth of the crawl" do
+        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+        core.should have(4).pages
      end
-
-      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
-
-      core.should have(4).pages
    end
-
  end
 end
data/spec/page_spec.rb
CHANGED
@@ -2,14 +2,13 @@ require File.dirname(__FILE__) + '/spec_helper'
 
 module Anemone
  describe Page do
-
-    before(:
-      @
+
+    before(:all) do
+      @http = Anemone::HTTP.new
    end
-
-
-      @page.
-      @page.url.to_s.should include('home')
+
+    before(:each) do
+      @page = @http.fetch_page(FakePage.new('home').url)
    end
 
    it "should store the response headers when fetching a page" do
@@ -35,7 +34,7 @@ module Anemone
 
      @page.redirect?.should == false
 
-
+      @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end
 
    it "should have a method to tell if a URI is in the same domain as the page" do
@@ -44,6 +43,10 @@ module Anemone
      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end
+
+    it "should include the response time for the HTTP request" do
+      @page.should respond_to(:response_time)
+    end
 
  end
 end
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-10-24 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -25,11 +25,7 @@ dependencies:
 description:
 email:
 executables:
-
-- anemone_cron.rb
-- anemone_pagedepth.rb
-- anemone_serialize.rb
-- anemone_url_list.rb
+- anemone
 extensions: []
 
 extra_rdoc_files:
@@ -37,11 +33,7 @@ extra_rdoc_files:
 files:
 - LICENSE.txt
 - README.rdoc
-- bin/
-- bin/anemone_cron.rb
-- bin/anemone_pagedepth.rb
-- bin/anemone_serialize.rb
-- bin/anemone_url_list.rb
+- bin/anemone
 - lib/anemone.rb
 - lib/anemone/anemone.rb
 - lib/anemone/core.rb
@@ -49,6 +41,12 @@ files:
 - lib/anemone/page.rb
 - lib/anemone/page_hash.rb
 - lib/anemone/tentacle.rb
+- lib/anemone/cli.rb
+- lib/anemone/cli/url_list.rb
+- lib/anemone/cli/cron.rb
+- lib/anemone/cli/count.rb
+- lib/anemone/cli/pagedepth.rb
+- lib/anemone/cli/serialize.rb
 has_rdoc: true
 homepage: http://anemone.rubyforge.org
 post_install_message:
data/bin/anemone_count.rb
DELETED
@@ -1,36 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the total number
-# of unique pages on the site.
-#
-# == Usage
-# anemone_count.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
-def usage
-  puts <<END
-Usage: anemone_count.rb url
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-Anemone.crawl(ARGV[0]) do |anemone|
-  anemone.after_crawl do |pages|
-    puts pages.uniq.size
-  end
-end
-
-
data/bin/anemone_pagedepth.rb
DELETED
@@ -1,44 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs a count of
-# the number of Pages at each depth in the site.
-#
-# == Usage
-# anemone_pagedepth.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
-def usage
-  puts <<END
-Usage: anemone_pagedepth.rb url
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-root = ARGV[0]
-Anemone.crawl(root) do |anemone|
-  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
-  anemone.after_crawl do |pages|
-    pages = pages.shortest_paths!(root).uniq
-    depths = pages.values.inject({}) do |depths, page|
-      depths[page.depth] ||= 0
-      depths[page.depth] += 1
-      depths
-    end
-
-    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-  end
-end
data/bin/anemone_serialize.rb
DELETED
@@ -1,51 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and saves the resulting
-# PageHash object to a file using Marshal serialization.
-#
-# == Usage
-# anemone_serialize.rb [options] url
-#
-# == Options
-# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-
-def usage
-  puts <<END
-Usage: anemone_serialize.rb [options] url
-
-Options:
-  -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-options = OpenStruct.new
-options.output_file = "crawl.#{Time.now.to_i}"
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-o', '--output filename') {|o| options.output_file = o }
-opts.parse!(ARGV)
-
-root = ARGV[0]
-Anemone.crawl(root) do |anemone|
-  anemone.after_crawl do |pages|
-    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
-  end
-end
data/bin/anemone_url_list.rb
DELETED
@@ -1,54 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the URL of each page
-# in the domain as they are encountered.
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-
-Options:
-  -r, --relative Output relative URLs (rather than absolute)
-END
-end
-
-options = OpenStruct.new
-options.relative = false
-
-# make sure that the last option is a URL we can crawl
-begin
-  URI(ARGV.last)
-rescue
-  usage
-  Process.exit
-end
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-r', '--relative') { options.relative = true }
-opts.parse!(ARGV)
-
-Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
-  anemone.on_every_page do |page|
-    if options.relative
-      puts page.url.path
-    else
-      puts page.url
-    end
-  end
-end