anemone 0.2.0 → 0.2.1
- data/README.rdoc +13 -5
- data/bin/anemone +4 -0
- data/lib/anemone/anemone.rb +28 -34
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/{bin/anemone_cron.rb → lib/anemone/cli/cron.rb} +19 -35
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +13 -13
- data/lib/anemone/http.rb +70 -10
- data/lib/anemone/page.rb +28 -60
- data/lib/anemone/page_hash.rb +12 -0
- data/lib/anemone/tentacle.rb +6 -11
- data/spec/anemone_spec.rb +10 -0
- data/spec/core_spec.rb +29 -33
- data/spec/page_spec.rb +11 -8
- metadata +10 -12
- data/bin/anemone_count.rb +0 -36
- data/bin/anemone_pagedepth.rb +0 -44
- data/bin/anemone_serialize.rb +0 -51
- data/bin/anemone_url_list.rb +0 -54
data/README.rdoc CHANGED
@@ -1,18 +1,26 @@
 = Anemone
 
-== DESCRIPTION
 Anemone is a web spider framework that can spider a domain and collect useful
 information about the pages it visits. It is versatile, allowing you to
 write your own specialized spider tasks quickly and easily.
 
-== FEATURES
+See http://anemone.rubyforge.org for more information.
+
+== Features
 * Multi-threaded design for high performance
 * Tracks 301 HTTP redirects to understand a page's aliases
 * Built-in BFS algorithm for determining page depth
 * Allows exclusion of URLs based on regular expressions
+* Choose the links to follow on each page with focus_crawl()
+* HTTPS support
+* Records response time for each page
+* CLI program can list all pages in a domain, calculate page depths, and more
+
+== Examples
+See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
 
-== REQUIREMENTS
+== Requirements
 * nokogiri
 
-== EXAMPLES
-
+== Optional
+* fizx-robots (required if obey_robots_txt is set to true)
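The focus_crawl() feature listed above is easiest to see in code. A minimal sketch (the URL is a placeholder, and the block contract, receiving each Page and returning the subset of its links to follow, is assumed from the feature description rather than shown in this diff):

require 'anemone'

Anemone.crawl("http://www.example.com/") do |anemone|
  # Assumed contract: the crawler follows only the links the block returns.
  anemone.focus_crawl do |page|
    page.links.reject { |uri| uri.path =~ /calendar/ }
  end

  anemone.on_every_page do |page|
    puts "#{page.url} (#{page.response_time} ms)"   # response_time is new in 0.2.1
  end
end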
data/bin/anemone ADDED
data/lib/anemone/anemone.rb CHANGED
@@ -3,42 +3,41 @@ require 'anemone/core'
 
 module Anemone
   # Version number
-  VERSION = '0.2.0'
-
-  #module-wide options
-  def Anemone.options=(options)
-    @options = options
-  end
+  VERSION = '0.2.1'
 
-  def Anemone.options
-    @options
+  # default options
+  DEFAULTS = {
+    # run 4 Tentacle threads to fetch pages
+    :threads => 4,
+    # disable verbose output
+    :verbose => false,
+    # don't throw away the page response body after scanning it for links
+    :discard_page_bodies => false,
+    # identify self as Anemone/VERSION
+    :user_agent => "Anemone/#{VERSION}",
+    # no delay between requests
+    :delay => 0,
+    # don't obey the robots exclusion protocol
+    :obey_robots_txt => false,
+    # by default, don't limit the depth of the crawl
+    :depth_limit => false,
+    # number of times HTTP redirects will be followed
+    :redirect_limit => 5
+  }
+
+  def self.options
+    @options ||= OpenStruct.new(DEFAULTS)
   end
 
   #
   # Convenience method to start a crawl using Core
  #
   def Anemone.crawl(urls, options = {}, &block)
-    Anemone.options = OpenStruct.new(options)
-
-    # by default, run 4 Tentacle threads to fetch pages
-    Anemone.options.threads ||= 4
-
-    # disable verbose output by default
-    Anemone.options.verbose ||= false
-
-    # by default, don't throw away the page response body after scanning it for links
-    Anemone.options.discard_page_bodies ||= false
-
-    # by default, identify self as Anemone/VERSION
-    Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+    options.each { |key, value| Anemone.options.send("#{key}=", value) }
 
-
-    Anemone.options.delay ||= 0
-
-    # by default, don't obey the robots exclusion protocol
-    if Anemone.options.obey_robots_txt ||= false
+    if Anemone.options.obey_robots_txt
       begin
-        require 'robots'
+        require 'robots'
       rescue LoadError
         warn "To support the robot exclusion protocol, install the robots gem:\n" \
              "sudo gem sources -a http://gems.github.com\n" \
@@ -46,15 +45,10 @@ module Anemone
         exit
       end
     end
-
-    # by default, don't limit the depth of the crawl
-    Anemone.options.depth_limit ||= :infinity
 
     #use a single thread if a delay was requested
-    if Anemone.options.delay > 0
-      Anemone.options.threads = 1
-    end
-
+    Anemone.options.threads = 1 if Anemone.options.delay > 0
+
     Core.crawl(urls, &block)
   end
 end
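With options seeded from DEFAULTS, a caller overrides only what it needs; every key left out of the hash keeps its default value. A sketch against the API above (placeholder URL; performs real HTTP requests):

require 'anemone'

Anemone.crawl("http://www.example.com/",
              :depth_limit => 2,    # crawl at most two levels deep
              :delay => 1) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end

# Because :delay is non-zero, Anemone.crawl forces :threads down to 1
# before handing off to Core.crawl, as shown in the diff above.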
data/lib/anemone/cli.rb ADDED
@@ -0,0 +1,24 @@
+module Anemone
+  module CLI
+    COMMANDS = %w[count cron pagedepth serialize url-list]
+
+    def self.run
+      command = ARGV.shift
+
+      if COMMANDS.include? command
+        load "anemone/cli/#{command.tr('-', '_')}.rb"
+      else
+        puts <<-INFO
+Anemone is a web spider framework that can collect
+useful information about pages it visits.
+
+Usage:
+  anemone <command> [arguments]
+
+Commands:
+  #{COMMANDS.join(', ')}
+INFO
+      end
+    end
+  end
+end
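This dispatcher is what the new bin/anemone executable invokes. A hypothetical session, driven from Ruby since the binstub's contents are not shown in this diff:

require 'anemone/cli'

# Rough equivalent of running `anemone count http://www.example.com/`
# from a shell; 'url-list' would map to cli/url_list.rb via tr('-', '_').
ARGV.replace(%w[count http://www.example.com/])
Anemone::CLI.run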
data/lib/anemone/cli/count.rb ADDED
@@ -0,0 +1,22 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  url = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone count <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs the total number
+  of unique pages on the site.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(url) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
data/{bin/anemone_cron.rb → lib/anemone/cli/cron.rb} RENAMED
@@ -1,44 +1,30 @@
-#! /usr/bin/env ruby
-# == Synopsis
-#   Performs pagedepth, url list, and count functionality
-#   Meant to be run daily as a cron job
-#
-# == Usage
-#   anemone_url_list.rb [options] url
-#
-# == Options
-#   -r, --relative    Output relative URLs (rather than absolute)
-#   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-#   Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
 require 'anemone'
 require 'optparse'
 require 'ostruct'
 
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-
-Options:
-  -r, --relative    Output relative URLs (rather than absolute)
-  -o, --output filename    Filename to save URL list to. Defautls to urls.txt.
-END
-end
-
 options = OpenStruct.new
 options.relative = false
 options.output_file = 'urls.txt'
 
-# make sure that the last option is a URL we can crawl
 begin
-  URI(ARGV.last)
+  # make sure that the last argument is a URL we can crawl
+  root = URI(ARGV.last)
 rescue
-  usage
-  Process.exit
+  puts <<-INFO
+Usage:
+  anemone cron [options] <url>
+
+Synopsis:
+  Combination of `count`, `pagedepth` and `url-list` commands.
+  Performs pagedepth, url list, and count functionality.
+  Outputs results to STDOUT and link list to file (urls.txt).
+  Meant to be run daily as a cron job.
+
+Options:
+  -r, --relative           Output relative URLs (rather than absolute)
+  -o, --output filename    Filename to save URL list to. Defautls to urls.txt.
+INFO
+  exit(0)
 end
 
 # parse command-line options
@@ -47,8 +33,6 @@ opts.on('-r', '--relative') { options.relative = true }
 opts.on('-o', '--output filename') {|o| options.output_file = o }
 opts.parse!(ARGV)
 
-root = ARGV.last
-
 Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
 
   anemone.after_crawl do |pages|
@@ -101,6 +85,6 @@ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
       url = options.relative ? url.path.to_s : url.to_s
      file.puts url
     end
-
   end
-
+
+end
data/lib/anemone/cli/pagedepth.rb ADDED
@@ -0,0 +1,32 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone pagedepth <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs a count of
+  the number of pages at each depth of the crawl.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
+end
data/lib/anemone/cli/serialize.rb ADDED
@@ -0,0 +1,35 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone serialize [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and saves the resulting
+  PageHash object to a file using Marshal serialization.
+
+Options:
+  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+INFO
+  exit(0)
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
+end
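Reading the dump back is the mirror image of the command above. A sketch (the output file name is hypothetical):

require 'anemone'

# Load a PageHash saved by `anemone serialize -o crawl.dump <url>` and
# reuse the same APIs the pagedepth command relies on.
pages = open('crawl.dump') { |f| Marshal.load(f) }
puts pages.size
pages.shortest_paths!(URI('http://www.example.com/'))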
data/lib/anemone/cli/url_list.rb ADDED
@@ -0,0 +1,41 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+
+begin
+  # make sure that the last option is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  anemone url-list [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL, and outputs the URL of each page
+  in the domain as they are encountered.
+
+Options:
+  -r, --relative    Output relative URLs (rather than absolute)
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.parse!(ARGV)
+
+Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+  anemone.on_every_page do |page|
+    if options.relative
+      puts page.url.path
+    else
+      puts page.url
+    end
+  end
+
+end
data/lib/anemone/core.rb CHANGED
@@ -1,6 +1,7 @@
 require 'net/http'
 require 'thread'
 require 'anemone/tentacle'
+require 'anemone/page'
 require 'anemone/page_hash'
 
 module Anemone
@@ -12,10 +13,10 @@ module Anemone
     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
     # and optional *block*
     #
-    def initialize(urls, &block)
-      @urls = [urls].flatten.map{ |url| URI(url) }
+    def initialize(urls)
+      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
       @urls.each{ |url| url.path = '/' if url.path.empty? }
-
+
       @tentacles = []
       @pages = PageHash.new
       @on_every_page_blocks = []
@@ -26,18 +27,17 @@ module Anemone
      if Anemone.options.obey_robots_txt
        @robots = Robots.new(Anemone.options.user_agent)
      end
-
-      block.call(self) if block
+
+      yield self if block_given?
    end
 
    #
    # Convenience method to start a new crawl
    #
-    def self.crawl(root, &block)
      self.new(root) do |core|
-        block.call(core) if block
+    def self.crawl(root)
+      self.new(root) do |core|
+        yield core if block_given?
        core.run
-        return core
      end
    end
 
@@ -104,7 +104,7 @@ module Anemone
       link_queue = Queue.new
       page_queue = Queue.new
 
-      Anemone.options.threads.times do
+      Anemone.options.threads.times do
         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
       end
 
@@ -120,7 +120,7 @@ module Anemone
         # perform the on_every_page blocks for this page
         do_page_blocks(page)
 
-        page.
+        page.discard_doc! if Anemone.options.discard_page_bodies
 
         links_to_follow(page).each do |link|
           link_queue.enq([link, page])
@@ -143,7 +143,7 @@ module Anemone
         end
 
         if page_queue.empty?
-          @tentacles.size.times { link_queue.enq(:END)}
+          @tentacles.size.times { link_queue.enq(:END)}
           break
         end
       end
@@ -207,7 +207,7 @@ module Anemone
         too_deep = false
       end
 
-      !@pages.has_key?(link) && !skip_link?(link) && allowed && !too_deep
+      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
     end
 
     #
@@ -216,7 +216,7 @@ module Anemone
     #
     def skip_link?(link)
       @skip_link_patterns.each { |p| return true if link.path =~ p}
-      return false
+      false
     end
 
   end
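Because initialize now yields itself and self.new returns the instance, Core.crawl no longer needs the removed `return core`; the crawl object is simply the return value. A sketch (placeholder URL; performs real HTTP requests):

require 'anemone'

core = Anemone.crawl("http://www.example.com/") do |a|
  a.skip_links_like %r{/private/}
end

puts core.pages.size   # the PageHash built during the crawl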
data/lib/anemone/http.rb CHANGED
@@ -1,16 +1,48 @@
-require 'net/http'
+require 'net/https'
+require 'anemone/page'
 
 module Anemone
-  class HTTP
+  class HTTP
     # Maximum number of redirects to follow on each get_response
     REDIRECTION_LIMIT = 5
-
+
+    def initialize
+      @connections = {}
+    end
+
+    #
+    # Create a new Page from the response of an HTTP request to *url*
+    #
+    def fetch_page(url, from_page = nil)
+      begin
+        url = URI(url) unless url.is_a?(URI)
+
+        if from_page
+          referer = from_page.url
+          depth = from_page.depth + 1
+        end
+
+        response, code, location, response_time = get(url, referer)
+
+        aka = nil
+        if !url.eql?(location)
+          aka = location
+        end
+
+        return Page.new(url, response.body.dup, code, response.to_hash, aka, referer, depth, response_time)
+      rescue
+        return Page.new(url)
+      end
+    end
+
+    private
+
     #
     # Retrieve an HTTP response for *url*, following redirects.
     # Returns the response object, response code, and final URI location.
     #
-    def self.get(url, referer = nil)
-      response = get_response(url, referer)
+    def get(url, referer = nil)
+      response, response_time = get_response(url, referer)
       code = Integer(response.code)
       loc = url
 
@@ -18,17 +50,17 @@ module Anemone
       while response.is_a?(Net::HTTPRedirection) and limit > 0
         loc = URI(response['location'])
         loc = url.merge(loc) if loc.relative?
-        response = get_response(loc, referer)
+        response, response_time = get_response(loc, referer)
         limit -= 1
       end
 
-      return response, code, loc
+      return response, code, loc, response_time
     end
 
     #
     # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
     #
-    def self.get_response(url, referer = nil)
+    def get_response(url, referer = nil)
       full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
       user_agent = Anemone.options.user_agent rescue nil
 
@@ -36,9 +68,37 @@ module Anemone
       opts['User-Agent'] = user_agent if user_agent
       opts['Referer'] = referer.to_s if referer
 
-      Net::HTTP.start(url.host, url.port) do |http|
-        return http.get(full_path, opts)
+      retries = 0
+      begin
+        start = Time.now()
+        response = connection(url).get(full_path, opts)
+        finish = Time.now()
+        response_time = ((finish - start) * 1000).round
+        return response, response_time
+      rescue EOFError
+        refresh_connection(url)
+        retries += 1
+        retry unless retries > 1
+      end
+    end
+
+    def connection(url)
+      @connections[url.host] ||= {}
+
+      if conn = @connections[url.host][url.port]
+        return conn
+      end
+
+      refresh_connection(url)
+    end
+
+    def refresh_connection(url)
+      http = Net::HTTP.new(url.host, url.port)
+      if url.scheme == 'https'
+        http.use_ssl = true
+        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
       end
+      @connections[url.host][url.port] = http.start
     end
   end
 end
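Used standalone, the new connection-caching client looks like this (placeholder URLs; performs real HTTP requests):

require 'anemone'

http = Anemone::HTTP.new
page = http.fetch_page(URI('http://www.example.com/'))

puts page.response_time   # milliseconds, measured around the GET call
puts page.links.size      # parsed lazily by Page#links

# Same host and port, so this reuses the cached Net::HTTP connection;
# an EOFError on a stale keep-alive socket triggers one refresh-and-retry.
http.fetch_page(URI('http://www.example.com/about'))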
data/lib/anemone/page.rb CHANGED
@@ -1,4 +1,3 @@
-require 'anemone/http'
 require 'nokogiri'
 require 'ostruct'
 
@@ -7,8 +6,6 @@ module Anemone
 
     # The URL of the page
     attr_reader :url
-    # Array of distinct A tag HREFs from the page
-    attr_reader :links
     # Headers of the HTTP response
     attr_reader :headers
 
@@ -27,74 +24,45 @@ module Anemone
     attr_accessor :depth
     # URL of the page that brought us to this page
     attr_accessor :referer
-
-
-    # Create a new Page from the response of an HTTP request to *url*
-    #
-    def self.fetch(url, from_page = nil)
-      begin
-        url = URI(url) unless url.is_a?(URI)
-
-        if from_page
-          referer = from_page.url
-          depth = from_page.depth + 1
-        end
-
-        response, code, location = Anemone::HTTP.get(url, referer)
-
-        aka = nil
-        if !url.eql?(location)
-          aka = location
-        end
-
-        return Page.new(url, response.body, code, response.to_hash, aka, referer, depth)
-      rescue
-        return Page.new(url)
-      end
-    end
+    # Response time of the request for this page in milliseconds
+    attr_accessor :response_time
 
     #
     # Create a new page
     #
-    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0)
+    def initialize(url, body = nil, code = nil, headers = nil, aka = nil, referer = nil, depth = 0, response_time = nil)
       @url = url
       @code = code
       @headers = headers
-      @links = []
-      @aliases = []
+      @headers['content-type'] ||= ['']
+      @aliases = Array(aka)
       @data = OpenStruct.new
       @referer = referer
       @depth = depth || 0
+      @response_time = response_time
+      @doc = Nokogiri::HTML(body) if body && html? rescue nil
+    end
 
-
-
-
-
-
-
-
-
-
-
-
-      #get a list of distinct links on the page, in absolute url form
-      @doc.css('a').each do |a|
-        u = a.attributes['href'].content if a.attributes['href']
-        next if u.nil?
-
-        begin
-          abs = to_absolute(URI(u))
-        rescue
-          next
-        end
-
-        @links << abs if in_domain?(abs)
-      end
-
-      @links.uniq!
+    # Array of distinct A tag HREFs from the page
+    def links
+      return @links unless @links.nil?
+      @links = []
+      return @links if !doc
+
+      doc.css('a').each do |a|
+        u = a.attributes['href'].content rescue nil
+        next if u.nil? or u.empty?
+        abs = to_absolute(URI(u)) rescue next
+        @links << abs if in_domain?(abs)
       end
+      @links.uniq!
+      @links
     end
 
+    def discard_doc!
+      links # force parsing of page links before we trash the document
+      @doc = nil
+    end
 
     #
     # Return a new page with the same *response* and *url*, but
@@ -124,7 +92,7 @@ module Anemone
     # *page_hash* is a PageHash object with the results of the current crawl.
     #
     def links_and_their_aliases(page_hash)
-      @links.inject([]) do |results, link|
+      links.inject([]) do |results, link|
         results.concat([link].concat(page_hash[link].aliases))
       end
     end
@@ -133,7 +101,7 @@ module Anemone
     # The content-type returned by the HTTP request for this page
     #
     def content_type
-
+      headers['content-type'].first
     end
 
@@ -141,7 +109,7 @@ module Anemone
     # otherwise.
     #
     def html?
-      (
+      !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
     end
 
     #
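The lazy link parsing and discard_doc! work together as follows; a minimal sketch with made-up HTML and URL:

require 'anemone'

body    = '<html><body><a href="/about">About</a></body></html>'
headers = { 'content-type' => ['text/html'] }
page    = Anemone::Page.new(URI('http://www.example.com/'), body, 200, headers)

page.links         # first call parses the doc and memoizes the URI array
page.discard_doc!  # forces that parse, then frees the Nokogiri document
puts page.links    # still answered from the memoized @links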
data/lib/anemone/page_hash.rb CHANGED
@@ -14,6 +14,18 @@ module Anemone
     def has_key?(key)
       super(key.to_s)
     end
+
+    # Does this PageHash contain the specified URL?
+    # HTTP and HTTPS versions of a URL are considered to be the same page.
+    def has_page?(url)
+      schemes = %w(http https)
+      if schemes.include? url.scheme
+        u = url.dup
+        return schemes.any? { |s| u.scheme = s; has_key?(u) }
+      end
+
+      has_key?(url)
+    end
 
     #
     # Use a breadth-first search to calculate the single-source
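A quick illustration of the scheme normalization (the URL is a placeholder; keys are compared as strings via the overridden has_key?):

require 'anemone'

pages = Anemone::PageHash.new
pages['http://www.example.com/'] = :seen   # any value; only the key matters here

pages.has_page?(URI('http://www.example.com/'))    # => true
pages.has_page?(URI('https://www.example.com/'))   # => true, same page over SSL
pages.has_page?(URI('http://www.example.com/x'))   # => false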
data/lib/anemone/tentacle.rb CHANGED
@@ -1,4 +1,4 @@
-require 'anemone/page'
+require 'anemone/http'
 
 module Anemone
   class Tentacle
@@ -9,6 +9,7 @@ module Anemone
     def initialize(link_queue, page_queue)
       @link_queue = link_queue
       @page_queue = page_queue
+      @http = Anemone::HTTP.new
     end
 
     #
@@ -16,22 +17,16 @@ module Anemone
     # Page objects into @page_queue
     #
     def run
-
+      loop do
         link, from_page = @link_queue.deq
 
         break if link == :END
-
-        if from_page
-          page = Page.fetch(link, from_page)
-        else
-          page = Page.fetch(link)
-        end
-
-        @page_queue.enq(page)
+
+        @page_queue.enq @http.fetch_page(link, from_page)
 
         sleep Anemone.options.delay
       end
     end
-
+
   end
 end
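The queue protocol between Core and Tentacle is small enough to drive by hand; a sketch (placeholder URL; performs a real HTTP request):

require 'anemone'
require 'thread'

links = Queue.new
pages = Queue.new

worker = Thread.new { Anemone::Tentacle.new(links, pages).run }
links.enq([URI('http://www.example.com/'), nil])   # a [link, from_page] pair
links.enq(:END)                                    # the loop above breaks on this token
worker.join

puts pages.deq.url   # the fetched Page (an empty Page if the request failed)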
data/spec/anemone_spec.rb CHANGED
@@ -1,6 +1,15 @@
 require File.dirname(__FILE__) + '/spec_helper'
 
 describe Anemone do
+
+  before(:all) do
+    Anemone::FakePage.new
+  end
+
+  after(:each) do
+    # reset global options object to defaults
+    Anemone::DEFAULTS.each { |key, value| Anemone.options.send("#{key}=", value) }
+  end
 
   it "should have a version" do
     Anemone.const_defined?('VERSION').should == true
@@ -17,6 +26,7 @@ describe Anemone do
                   :user_agent => 'test',
                   :obey_robots_txt => true,
                   :depth_limit => 3)
+
     Anemone.options.verbose.should == false
     Anemone.options.threads.should == 2
     Anemone.options.discard_page_bodies.should == true
data/spec/core_spec.rb CHANGED
@@ -139,43 +139,39 @@ module Anemone
       urls.should_not include(pages[1].url)
     end
 
-
-
-
-
-
-
-
-
-
+    describe "many pages" do
+      before(:each) do
+        @pages, size = [], 5
+
+        size.times do |n|
+          # register this page with a link to the next page
+          link = (n + 1).to_s if n + 1 < size
+          @pages << FakePage.new(n.to_s, :links => Array(link))
+        end
       end
-
-
-
-
-
-
-
+
+      it "should track the page depth and referer" do
+        core = Anemone.crawl(@pages[0].url)
+        previous_page = nil
+
+        @pages.each_with_index do |page, i|
+          page = core.pages[page.url]
+          page.should be
+          page.depth.should == i
+
+          if previous_page
+            page.referer.should == previous_page.url
+          else
+            page.referer.should be_nil
+          end
+          previous_page = page
+        end
       end
-
-      core.pages[pages[0].url].referer.should == nil
-    end
 
-
-
-
-      pages = []
-
-      num_pages.times do |n|
-        # register this page with a link to the next page
-        link = (n + 1).to_s if n + 1 < num_pages
-        pages << FakePage.new(n.to_s, :links => [link].compact)
+      it "should optionally limit the depth of the crawl" do
+        core = Anemone.crawl(@pages[0].url, :depth_limit => 3)
+        core.should have(4).pages
       end
-
-      core = Anemone.crawl(pages[0].url, :depth_limit => 3)
-
-      core.should have(4).pages
     end
-
   end
 end
data/spec/page_spec.rb CHANGED
@@ -2,14 +2,13 @@ require File.dirname(__FILE__) + '/spec_helper'
 
 module Anemone
   describe Page do
-
-    before(:each) do
-      @page = Page.fetch(FakePage.new('home').url)
+
+    before(:all) do
+      @http = Anemone::HTTP.new
     end
-
-
-      @page.
-      @page.url.to_s.should include('home')
+
+    before(:each) do
+      @page = @http.fetch_page(FakePage.new('home').url)
     end
 
     it "should store the response headers when fetching a page" do
@@ -35,7 +34,7 @@ module Anemone
 
       @page.redirect?.should == false
 
-      Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
+      @http.fetch_page(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
     end
 
     it "should have a method to tell if a URI is in the same domain as the page" do
@@ -44,6 +43,10 @@ module Anemone
       @page.in_domain?(URI(FakePage.new('test').url)).should == true
       @page.in_domain?(URI('http://www.other.com/')).should == false
     end
+
+    it "should include the response time for the HTTP request" do
+      @page.should respond_to(:response_time)
+    end
 
   end
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: anemone
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Chris Kite
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2009-
+date: 2009-10-24 00:00:00 -05:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -25,11 +25,7 @@ dependencies:
 description:
 email:
 executables:
-- anemone_count.rb
-- anemone_cron.rb
-- anemone_pagedepth.rb
-- anemone_serialize.rb
-- anemone_url_list.rb
+- anemone
 extensions: []
 
 extra_rdoc_files:
@@ -37,11 +33,7 @@ extra_rdoc_files:
 files:
 - LICENSE.txt
 - README.rdoc
-- bin/anemone_count.rb
-- bin/anemone_cron.rb
-- bin/anemone_pagedepth.rb
-- bin/anemone_serialize.rb
-- bin/anemone_url_list.rb
+- bin/anemone
 - lib/anemone.rb
 - lib/anemone/anemone.rb
 - lib/anemone/core.rb
@@ -49,6 +41,12 @@ files:
 - lib/anemone/page.rb
 - lib/anemone/page_hash.rb
 - lib/anemone/tentacle.rb
+- lib/anemone/cli.rb
+- lib/anemone/cli/url_list.rb
+- lib/anemone/cli/cron.rb
+- lib/anemone/cli/count.rb
+- lib/anemone/cli/pagedepth.rb
+- lib/anemone/cli/serialize.rb
 has_rdoc: true
 homepage: http://anemone.rubyforge.org
 post_install_message:
data/bin/anemone_count.rb DELETED
@@ -1,36 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-#   Crawls a site starting at the given URL, and outputs the total number
-#   of unique pages on the site.
-#
-# == Usage
-#   anemone_count.rb url
-#
-# == Author
-#   Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
-def usage
-  puts <<END
-Usage: anemone_count.rb url
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-Anemone.crawl(ARGV[0]) do |anemone|
-  anemone.after_crawl do |pages|
-    puts pages.uniq.size
-  end
-end
-
-
data/bin/anemone_pagedepth.rb DELETED
@@ -1,44 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-#   Crawls a site starting at the given URL, and outputs a count of
-#   the number of Pages at each depth in the site.
-#
-# == Usage
-#   anemone_pagedepth.rb url
-#
-# == Author
-#   Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
-def usage
-  puts <<END
-Usage: anemone_pagedepth.rb url
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-root = ARGV[0]
-Anemone.crawl(root) do |anemone|
-  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
-  anemone.after_crawl do |pages|
-    pages = pages.shortest_paths!(root).uniq
-    depths = pages.values.inject({}) do |depths, page|
-      depths[page.depth] ||= 0
-      depths[page.depth] += 1
-      depths
-    end
-
-    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-  end
-end
data/bin/anemone_serialize.rb DELETED
@@ -1,51 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-#   Crawls a site starting at the given URL, and saves the resulting
-#   PageHash object to a file using Marshal serialization.
-#
-# == Usage
-#   anemone_serialize.rb [options] url
-#
-# == Options
-#   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
-#
-# == Author
-#   Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-
-def usage
-  puts <<END
-Usage: anemone_serialize.rb [options] url
-
-Options:
-  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
-END
-end
-
-# make sure that the first option is a URL we can crawl
-begin
-  URI(ARGV[0])
-rescue
-  usage
-  Process.exit
-end
-
-options = OpenStruct.new
-options.output_file = "crawl.#{Time.now.to_i}"
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-o', '--output filename') {|o| options.output_file = o }
-opts.parse!(ARGV)
-
-root = ARGV[0]
-Anemone.crawl(root) do |anemone|
-  anemone.after_crawl do |pages|
-    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
-  end
-end
data/bin/anemone_url_list.rb DELETED
@@ -1,54 +0,0 @@
-#! /usr/bin/env ruby
-# == Synopsis
-#   Crawls a site starting at the given URL, and outputs the URL of each page
-#   in the domain as they are encountered.
-#
-# == Usage
-#   anemone_url_list.rb [options] url
-#
-# == Options
-#   -r, --relative    Output relative URLs (rather than absolute)
-#
-# == Author
-#   Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'ostruct'
-
-def usage
-  puts <<END
-Usage: anemone_url_list.rb [options] url
-
-Options:
-  -r, --relative    Output relative URLs (rather than absolute)
-END
-end
-
-options = OpenStruct.new
-options.relative = false
-
-# make sure that the last option is a URL we can crawl
-begin
-  URI(ARGV.last)
-rescue
-  usage
-  Process.exit
-end
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-r', '--relative') { options.relative = true }
-opts.parse!(ARGV)
-
-Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
-  anemone.on_every_page do |page|
-    if options.relative
-      puts page.url.path
-    else
-      puts page.url
-    end
-  end
-end