anemone 0.0.1
- data/README.txt +18 -0
- data/bin/anemone_count.rb +31 -0
- data/bin/anemone_cron.rb +99 -0
- data/bin/anemone_pagedepth.rb +39 -0
- data/bin/anemone_serialize.rb +43 -0
- data/bin/anemone_url_list.rb +46 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +16 -0
- data/lib/anemone/core.rb +183 -0
- data/lib/anemone/http.rb +37 -0
- data/lib/anemone/page.rb +165 -0
- data/lib/anemone/page_hash.rb +83 -0
- data/lib/anemone/tentacle.rb +31 -0
- metadata +82 -0
data/README.txt
ADDED
@@ -0,0 +1,18 @@
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions

== REQUIREMENTS
* hpricot

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
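For illustration only (this is not one of the gem's files): a minimal crawl using the API defined in the lib/ files below might look like the following sketch, where http://example.com/ is just a placeholder URL.

  require 'anemone'

  # Print every in-domain URL as the spider encounters it.
  Anemone.crawl("http://example.com/") do |anemone|
    anemone.on_every_page do |page|
      puts "#{page.url} (#{page.code})"
    end
  end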
data/bin/anemone_count.rb
ADDED
@@ -0,0 +1,31 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the total number
# of unique pages on the site.
#
# == Usage
# anemone_count.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,99 @@
#! /usr/bin/env ruby
# == Synopsis
# Performs pagedepth, url list, and count functionality
# Meant to be run daily as a cron job
#
# == Usage
# anemone_cron.rb [options] url
#
# == Options
# -r, --relative           Output relative URLs (rather than absolute)
# -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    if !not_found.empty?
      puts "\n404's:"
      not_found.each do |url|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        num_linked_from = 0
        pages.urls_linking_to(url).each do |u|
          u = u.path if options.relative
          num_linked_from += 1
          puts "  linked from #{u}"
          if num_linked_from > 10
            puts "  ..."
            break
          end
        end
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end

  end
end
data/bin/anemone_pagedepth.rb
ADDED
@@ -0,0 +1,39 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs a count of
# the number of Pages at each depth in the site.
#
# == Usage
# anemone_pagedepth.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
data/bin/anemone_serialize.rb
ADDED
@@ -0,0 +1,43 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and saves the resulting
# PageHash object to a file using Marshal serialization.
#
# == Usage
# anemone_serialize.rb [options] url
#
# == Options
# -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  RDoc::usage()
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
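As an illustrative sketch rather than part of the gem: a PageHash saved by anemone_serialize.rb could later be restored with Marshal.load, the counterpart of the Marshal.dump call above. The filename argument here is assumed to be whatever was passed to -o (or the crawl.{timestamp} default).

  #! /usr/bin/env ruby
  # Sketch: reload a serialized PageHash and print how many pages it holds.
  $:.unshift File.join(File.dirname(__FILE__), "..", "lib")

  require 'anemone'

  pages = open(ARGV[0]) { |f| Marshal.load(f) }
  puts "#{pages.size} pages in #{ARGV[0]}"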
data/bin/anemone_url_list.rb
ADDED
@@ -0,0 +1,46 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the URL of each page
# in the domain as they are encountered.
#
# == Usage
# anemone_url_list.rb [options] url
#
# == Options
# -r, --relative    Output relative URLs (rather than absolute)
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  RDoc::usage()
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(ARGV.last) do |anemone|
  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end
end
data/lib/anemone/anemone.rb
ADDED
@@ -0,0 +1,16 @@
require 'anemone/core'

module Anemone
  # Version number
  VERSION = '0.0.1'

  # User-Agent string used for HTTP requests
  USER_AGENT = "Anemone/#{self::VERSION}"

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(url, options = {}, &block)
    Core.crawl(url, options, &block)
  end
end
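Another sketch, not shipped with the gem: Anemone.crawl passes the options hash straight through to Core (next file), so the :threads and :verbose options read in Core#initialize can be set as follows; the URL is a placeholder.

  require 'anemone'

  # Crawl with 8 worker threads and verbose queue output instead of the defaults.
  Anemone.crawl("http://example.com/", :threads => 8, :verbose => true) do |anemone|
    anemone.after_crawl do |pages|
      puts "Crawled #{pages.size} pages"
    end
  end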
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,183 @@
require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with a starting *url*, *options*, and optional *block*
    #
    def initialize(url, options={}, &block)
      url = URI(url) if url.is_a?(String)
      @url = url
      @options = options
      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      @options[:threads] ||= 4
      @options[:verbose] ||= false

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, options={}, &block)
      self.new(root, options) do |core|
        block.call(core) if block
        core.run
        core.do_after_crawl_blocks
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Perform the crawl
    #
    def run
      link_queue = Queue.new
      page_queue = Queue.new

      @options[:threads].times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      return if !visit_link?(@url)

      link_queue.enq(@url)

      while true do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @options[:verbose]

        do_page_blocks(page)

        page.links.each do |link|
          if visit_link?(link)
            link_queue.enq(link)
            @pages[link] = nil
          end
        end

        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END) }
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      self
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      # each value of @on_pages_like_blocks is an Array of blocks for that pattern
      @on_pages_like_blocks.each do |pattern, blks|
        blks.each { |blk| blk.call(page) } if page.url.to_s =~ pattern
      end
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p }
      return false
    end

  end
end
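A further illustrative sketch (not part of the gem) combining the hooks above: skip_link? matches its patterns against the link path, while do_page_blocks matches on_pages_like patterns against the full page URL. The URL and patterns are placeholders.

  require 'anemone'

  Anemone.crawl("http://example.com/") do |anemone|
    # never follow links into the calendar section
    anemone.skip_links_like %r{^/calendar/}

    # run this block only on pages whose URL looks like a product page
    anemone.on_pages_like(%r{/products/\d+}) do |page|
      puts "#{page.url} returned #{page.code}"
    end
  end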
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,37 @@
require 'net/http'

module Anemone
  class HTTP < Net::HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
    def self.get(url)
      response = get_response(url)
      code = Integer(response.code)
      loc = url

      limit = REDIRECTION_LIMIT
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response = get_response(loc)
        limit -= 1
      end

      return response, code, loc
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def self.get_response(url)
      Net::HTTP.start(url.host, url.port) do |http|
        return http.get(url.path, {'User-Agent' => Anemone::USER_AGENT })
      end
    end
  end
end
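For a standalone illustration (again, not a gem file): HTTP.get can be called on its own. It returns the final response, the first response's numeric code, and the final location after following up to REDIRECTION_LIMIT redirects; the URL is a placeholder.

  require 'anemone'

  url = URI("http://example.com/")
  response, code, location = Anemone::HTTP.get(url)
  puts "#{code} -> #{location}"
  puts response['Content-Type']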
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,165 @@
require 'anemone/http'
require 'hpricot'

module Anemone
  class Page
    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Integer response code of the page
    attr_reader :code

    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response, code, aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page
    #
    def initialize(url, response = nil, code = nil, aka = nil)
      @url = url
      @response = response
      @code = code
      @links = []
      @aliases = []

      @aliases << aka if !aka.nil?

      # get a list of distinct links on the page, in absolute url form
      if @response and @response.body
        Hpricot(@response.body).search('a').each do |a|
          u = a['href']
          next if u.nil?

          begin
            u = URI(u)
          rescue
            next
          end

          abs = to_absolute(u)
          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end

    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      Page.new(url, @response, 200, @url)
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # Returns the response body for the page
    #
    def body
      @response.body
    end

    #
    # Returns the +Content-Type+ header for the page
    #
    def content_type
      @response['Content-Type']
    end

    #
    # Returns +true+ if the page is an HTML document, returns +false+
    # otherwise.
    #
    def html?
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is an HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
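One more sketch outside the gem's files: Page.fetch can also be used directly to inspect a single page without running a crawl; the URL is a placeholder.

  require 'anemone'

  page = Anemone::Page.fetch("http://example.com/")
  puts "code: #{page.code}"
  puts "redirect aliases: #{page.aliases.inspect}"
  page.links.each { |link| puts link }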
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,83 @@
module Anemone
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a }
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # Return an Array of Page objects which link to the given url
    #
    def pages_linking_to url
      begin
        url = URI(url) if url.is_a?(String)
      rescue
        return []
      end

      values.delete_if { |p| !p.links.include?(url) }
    end

    #
    # Return an Array of URI objects of Pages linking to the given url
    def urls_linking_to url
      pages_linking_to(url).map{|p| p.url}
    end

  end
end
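A final illustrative sketch, not part of the gem: inside an after_crawl block these PageHash helpers combine in the same way anemone_cron.rb uses them; the URLs are placeholders.

  require 'anemone'

  root = "http://example.com/"
  Anemone.crawl(root) do |anemone|
    anemone.after_crawl do |pages|
      # who links to the contact page?
      pages.urls_linking_to("http://example.com/contact").each do |u|
        puts "contact page linked from #{u}"
      end

      # depth of every unique, non-redirect page from the root
      pages.shortest_paths!(root).uniq.each_value do |page|
        puts "#{page.url} is at depth #{page.depth}"
      end
    end
  end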
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,31 @@
require 'anemone/page'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      while true do
        link = @link_queue.deq

        break if link == :END

        page = Page.fetch(link)

        @page_queue.enq(page)
      end
    end

  end
end
metadata
ADDED
@@ -0,0 +1,82 @@
--- !ruby/object:Gem::Specification
name: anemone
version: !ruby/object:Gem::Version
  version: 0.0.1
platform: ruby
authors:
- Chris Kite
autorequire:
bindir: bin
cert_chain: []

date: 2009-04-14 00:00:00 -05:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: hpricot
  type: :runtime
  version_requirement:
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 0.7.0
    version:
description:
email:
executables:
- anemone_count.rb
- anemone_cron.rb
- anemone_pagedepth.rb
- anemone_serialize.rb
- anemone_url_list.rb
extensions: []

extra_rdoc_files:
- README.txt
files:
- bin/anemone_count.rb
- bin/anemone_cron.rb
- bin/anemone_pagedepth.rb
- bin/anemone_serialize.rb
- bin/anemone_url_list.rb
- lib/anemone
- lib/anemone/anemone.rb
- lib/anemone/core.rb
- lib/anemone/http.rb
- lib/anemone/page.rb
- lib/anemone/page_hash.rb
- lib/anemone/tentacle.rb
- lib/anemone.rb
- README.txt
has_rdoc: true
homepage: http://anemone.rubyforge.org
post_install_message:
rdoc_options:
- -m
- README.txt
- -t
- Anemone
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project: anemone
rubygems_version: 1.3.1
signing_key:
specification_version: 2
summary: Anemone web-spider framework
test_files: []