parolkar-anemone 0.1.2
- data/LICENSE.txt +19 -0
- data/README.rdoc +19 -0
- data/bin/anemone_count.rb +36 -0
- data/bin/anemone_cron.rb +106 -0
- data/bin/anemone_pagedepth.rb +44 -0
- data/bin/anemone_serialize.rb +51 -0
- data/bin/anemone_url_list.rb +54 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +56 -0
- data/lib/anemone/core.rb +209 -0
- data/lib/anemone/http.rb +38 -0
- data/lib/anemone/page.rb +177 -0
- data/lib/anemone/page_hash.rb +116 -0
- data/lib/anemone/tentacle.rb +33 -0
- data/spec/anemone_spec.rb +41 -0
- data/spec/core_spec.rb +128 -0
- data/spec/fakeweb_helper.rb +55 -0
- data/spec/page_spec.rb +49 -0
- data/spec/spec_helper.rb +7 -0
- metadata +86 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Can crawl obeying robots.txt

== REQUIREMENTS
* nokogiri

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
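A minimal usage sketch (not one of the gem's files; the URL is a placeholder), based on the API defined in the files below:

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # print every URL in the domain as it is visited
    anemone.on_every_page do |page|
      puts page.url
    end
  end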
data/bin/anemone_count.rb
ADDED
@@ -0,0 +1,36 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the total number
#   of unique pages on the site.
#
# == Usage
#   anemone_count.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'

def usage
  puts <<END
Usage: anemone_count.rb url
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,106 @@
#! /usr/bin/env ruby
# == Synopsis
#   Performs pagedepth, url list, and count functionality.
#   Meant to be run daily as a cron job.
#
# == Usage
#   anemone_cron.rb [options] url
#
# == Options
#   -r, --relative           Output relative URLs (rather than absolute)
#   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_cron.rb [options] url

Options:
  -r, --relative           Output relative URLs (rather than absolute)
  -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
END
end

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  usage
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        links.slice(0..10).each do |u|
          u = u.path if options.relative
          puts " linked from #{u}"
        end

        puts " ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file, closing it when done
    open(options.output_file, 'w') do |file|
      pages.each_key do |url|
        url = options.relative ? url.path.to_s : url.to_s
        file.puts url
      end
    end

  end
end
data/bin/anemone_pagedepth.rb
ADDED
@@ -0,0 +1,44 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs a count of
#   the number of Pages at each depth in the site.
#
# == Usage
#   anemone_pagedepth.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'

def usage
  puts <<END
Usage: anemone_pagedepth.rb url
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
data/bin/anemone_serialize.rb
ADDED
@@ -0,0 +1,51 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and saves the resulting
#   PageHash object to a file using Marshal serialization.
#
# == Usage
#   anemone_serialize.rb [options] url
#
# == Options
#   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_serialize.rb [options] url

Options:
  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
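Loading the dump back is the mirror operation; a minimal sketch (the filename is a placeholder for whatever crawl.{Time.now} the script produced):

  require 'anemone'

  # deserialize the PageHash saved by anemone_serialize.rb
  pages = open('crawl.1242345600', 'r') { |f| Marshal.load(f) }
  puts pages.size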
data/bin/anemone_url_list.rb
ADDED
@@ -0,0 +1,54 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the URL of each page
#   in the domain as they are encountered.
#
# == Usage
#   anemone_url_list.rb [options] url
#
# == Options
#   -r, --relative    Output relative URLs (rather than absolute)
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_url_list.rb [options] url

Options:
  -r, --relative    Output relative URLs (rather than absolute)
END
end

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  usage
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end
end
data/lib/anemone/anemone.rb
ADDED
@@ -0,0 +1,56 @@
require 'ostruct'
require 'anemone/core'

module Anemone
  # Version number
  VERSION = '0.1.2'

  # module-wide options
  def Anemone.options=(options)
    @options = options
  end

  def Anemone.options
    @options
  end

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(urls, options = {}, &block)
    Anemone.options = OpenStruct.new(options)

    # by default, run 4 Tentacle threads to fetch pages
    Anemone.options.threads ||= 4

    # disable verbose output by default
    Anemone.options.verbose ||= false

    # by default, don't throw away the page response body after scanning it for links
    Anemone.options.discard_page_bodies ||= false

    # by default, identify self as Anemone/VERSION
    Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"

    # obey robots.txt only when requested
    Anemone.options.obey_robots_dot_txt ||= false
    if Anemone.options.obey_robots_dot_txt == true
      begin
        require 'obey_robots_dot_txt'
      rescue LoadError
        warn "You need the 'obey_robots_dot_txt' gem installed (you may run: sudo gem install parolkar-obey_robots_dot_txt --source http://gems.github.com)"
        exit
      end
    end

    # no delay between requests by default
    Anemone.options.delay ||= 0

    # use a single thread if a delay was requested
    if Anemone.options.delay != 0
      Anemone.options.threads = 1
    end

    Core.crawl(urls, &block)
  end
end
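Every option falls back to the defaults set above; a short sketch of the observable effect (the URL is a placeholder):

  Anemone.crawl("http://www.example.com/", :delay => 1)
  Anemone.options.threads     # => 1 (a non-zero delay forces a single thread)
  Anemone.options.user_agent  # => "Anemone/0.1.2" unless overridden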
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,209 @@
require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, &block)
      # accept both String and URI starting points
      @urls = [urls].flatten.map{ |url| url.is_a?(String) ? URI(url) : url }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, &block)
      self.new(root) do |core|
        block.call(core) if block
        core.run
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      Anemone.options.threads.times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose

        # perform the on_every_page blocks for this page
        do_page_blocks(page)

        page.doc = nil if Anemone.options.discard_page_bodies

        links_to_follow(page).each do |link|
          link_queue.enq(link)
          @pages[link] = nil
        end

        # create an entry in the page hash for each alias of this page,
        # i.e. all the pages that redirected to this page
        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END) }
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      do_after_crawl_blocks()

      self
    end

    private

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blks|
        if page.url.to_s =~ pattern
          blks.each { |blk| blk.call(page) }
        end
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.find_all { |link| visit_link?(link) }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p }
      return false
    end

  end
end
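The callback hooks above compose; a sketch of a focused crawl (the URL and patterns are placeholders):

  Anemone.crawl("http://www.example.com/") do |anemone|
    # never follow URLs whose path matches this pattern
    anemone.skip_links_like %r{/private/}

    # follow only non-image links from each page
    anemone.focus_crawl do |page|
      page.links.reject { |uri| uri.path =~ /\.(jpg|gif|png)$/ }
    end

    # run a block only on pages whose URL matches a pattern
    anemone.on_pages_like(%r{/articles/}) do |page|
      puts page.url
    end
  end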
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,38 @@
require 'net/http'

module Anemone
  class HTTP < Net::HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
    def self.get(url)
      response = get_response(url)
      code = Integer(response.code)
      loc = url

      limit = REDIRECTION_LIMIT
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response = (Anemone.options.obey_robots_dot_txt ? Net::HTTP.get_obeying_robots(loc) : get_response(loc))
        limit -= 1
      end

      return response, code, loc
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def self.get_response(url)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
      Net::HTTP.start(url.host, url.port) do |http|
        return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
      end
    end
  end
end
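A sketch of the three-value return (assumes Anemone.options has already been populated, e.g. by Anemone.crawl, since get_response reads the user_agent option; the URL is a placeholder):

  response, code, loc = Anemone::HTTP.get(URI('http://www.example.com/'))
  code  # code of the *first* response; it is captured before the redirect loop
  loc   # final URI after following up to REDIRECTION_LIMIT redirects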
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,177 @@
require 'anemone/http'
require 'nokogiri'
require 'ostruct'

module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Headers of the HTTP response
    attr_reader :headers

    # OpenStruct for user-stored data
    attr_accessor :data
    # Nokogiri document for the HTML body
    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response.body, code, response.to_hash, aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page
    #
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
      @url = url
      @code = code
      @headers = headers
      @links = []
      @aliases = []
      @data = OpenStruct.new

      @aliases << aka if !aka.nil?

      if body
        begin
          @doc = Nokogiri::HTML(body)
        rescue
          return
        end

        return if @doc.nil?

        # get a list of distinct links on the page, in absolute url form
        @doc.css('a').each do |a|
          u = a.attributes['href'].content if a.attributes['href']
          next if u.nil?

          begin
            abs = to_absolute(URI(u))
          rescue
            next
          end

          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end


    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      p = clone
      p.add_alias!(@aka) if !@aka.nil?
      p.code = 200
      p
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      @headers['content-type'][0] rescue nil
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      # match against the content_type accessor above
      (content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
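A sketch of fetching and inspecting a single page (assumes Anemone.options is populated, since the fetch goes through Anemone::HTTP; the URL is a placeholder):

  page = Anemone::Page.fetch('http://www.example.com/')
  page.content_type             # e.g. "text/html"
  page.links                    # absolute, same-domain URIs from the <a> tags
  page.data.category = 'root'   # stash arbitrary per-page data in the OpenStruct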
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,116 @@
module Anemone
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while !q.empty?
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        # if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a }
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      urls.map! do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end
      # remove any URLs that failed to parse
      urls.compact!

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls]
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
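A sketch of post-crawl analysis with these methods (the URLs are placeholders):

  core = Anemone.crawl('http://www.example.com/')

  # compute BFS depths from the root, then drop redirect aliases
  pages = core.pages.shortest_paths!('http://www.example.com/').uniq
  pages.each_value { |page| puts "#{page.url} depth=#{page.depth}" }

  # URLs of every page linking to a given URL
  core.pages.urls_linking_to('http://www.example.com/about')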
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,33 @@
require 'anemone/page'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      while true do
        link = @link_queue.deq

        break if link == :END

        page = Page.fetch(link)

        @page_queue.enq(page)

        sleep Anemone.options.delay
      end
    end

  end
end
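Core normally owns the queues, but a Tentacle can be driven by hand; a sketch (options are set explicitly here because run sleeps for Anemone.options.delay and the fetch reads the user_agent option):

  require 'ostruct'
  require 'thread'

  Anemone.options = OpenStruct.new(:delay => 0, :user_agent => 'Anemone/0.1.2')

  link_queue = Queue.new
  page_queue = Queue.new

  worker = Thread.new { Anemone::Tentacle.new(link_queue, page_queue).run }

  link_queue.enq(URI('http://www.example.com/'))
  link_queue.enq(:END)      # the sentinel that makes run return

  worker.join
  page = page_queue.deq     # the fetched Anemone::Page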
data/spec/anemone_spec.rb
ADDED
@@ -0,0 +1,41 @@
require File.dirname(__FILE__) + '/spec_helper'

describe Anemone do

  it "should have a version" do
    Anemone.const_defined?('VERSION').should == true
  end

  it "should have options" do
    Anemone.should respond_to(:options)
  end

  it "should accept options for the crawl" do
    Anemone.crawl(SPEC_DOMAIN, :verbose => false,
                               :threads => 2,
                               :discard_page_bodies => true,
                               :user_agent => 'test')
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
    Anemone.options.delay.should == 0
    Anemone.options.user_agent.should == 'test'
  end

  it "should accept options of obeying Robots.txt for the crawl" do
    Anemone.crawl(SPEC_DOMAIN, :obey_robots_dot_txt => true)
    Anemone.options.obey_robots_dot_txt.should == true
  end

  it "should use 1 thread if a delay is requested" do
    Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
    Anemone.options.threads.should == 1
  end

  it "should return a Anemone::Core from the crawl, which has a PageHash" do
    result = Anemone.crawl(SPEC_DOMAIN)
    result.should be_an_instance_of(Anemone::Core)
    result.pages.should be_an_instance_of(Anemone::PageHash)
  end

end
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,128 @@
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  describe Core do

    before(:each) do
      FakeWeb.clean_registry
    end

    it "should crawl all the html pages in a domain by following <a> href's" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1', :links => ['3'])
      pages << FakePage.new('2')
      pages << FakePage.new('3')

      Anemone.crawl(pages[0].url).should have(4).pages
    end

    it "should not leave the original domain" do
      pages = []
      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
    end

    it "should follow http redirects" do
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1', :redirect => '2')
      pages << FakePage.new('2')

      Anemone.crawl(pages[0].url).should have(3).pages
    end

    it "should accept multiple starting URLs" do
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1')
      pages << FakePage.new('2', :links => ['3'])
      pages << FakePage.new('3')

      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
    end

    it "should include the query string when following links" do
      pages = []
      pages << FakePage.new('0', :links => ['1?foo=1'])
      pages << FakePage.new('1?foo=1')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
    end

    it "should be able to skip links based on a RegEx" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.skip_links_like /1/
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should be able to call a block on every page" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      count = 0
      Anemone.crawl(pages[0].url) do |a|
        a.on_every_page { count += 1 }
      end

      count.should == 3
    end

    it "should not discard page bodies by default" do
      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
    end

    it "should optionally discard page bodies to conserve memory" do
      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
      core.pages.values.first.doc.should be_nil
    end

    it "should provide a focus_crawl method to select the links on each page to follow" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should optionally delay between page requests" do
      delay = 0.25

      pages = []
      pages << FakePage.new('0', :links => '1')
      pages << FakePage.new('1')

      start = Time.now
      Anemone.crawl(pages[0].url, :delay => delay)
      finish = Time.now

      (finish - start).should satisfy {|t| t > delay * 2}
    end

  end
end
data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,55 @@
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test Anemone"
  exit
end

FakeWeb.allow_net_connect = false

module Anemone
  SPEC_DOMAIN = "http://www.example.com/"

  class FakePage
    attr_accessor :links
    attr_accessor :hrefs

    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)

      create_body
      add_to_fakeweb
    end

    def url
      SPEC_DOMAIN + @name
    end

    private

    def create_body
      @body = "<html><body>"
      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
      @body += "</body></html>"
    end

    def add_to_fakeweb
      options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]
        options[:location] = SPEC_DOMAIN + @redirect
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
    end
  end
end

# default root
Anemone::FakePage.new
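In a spec, FakePage both builds the HTML body and registers it with FakeWeb, so a tiny site can be assembled inline; a sketch:

  page_a = Anemone::FakePage.new('a', :links => ['b'])
  page_b = Anemone::FakePage.new('b', :redirect => 'a')

  core = Anemone.crawl(page_a.url)
  core.pages.size   # pages crawled without any real network traffic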
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,49 @@
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  describe Page do

    before(:each) do
      @page = Page.fetch(FakePage.new('home').url)
    end

    it "should be able to fetch a page" do
      @page.should_not be_nil
      @page.url.to_s.should include('home')
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    it "should have a Nokogiri::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

  end
end
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,86 @@
--- !ruby/object:Gem::Specification
name: parolkar-anemone
version: !ruby/object:Gem::Version
  version: 0.1.2
platform: ruby
authors:
- Chris Kite
autorequire:
bindir: bin
cert_chain: []

date: 2009-05-16 00:00:00 -07:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  type: :runtime
  version_requirement:
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.3.0
    version:
description:
email:
executables:
- anemone_count.rb
- anemone_cron.rb
- anemone_pagedepth.rb
- anemone_serialize.rb
- anemone_url_list.rb
extensions: []

extra_rdoc_files:
- README.rdoc
files:
- LICENSE.txt
- README.rdoc
- bin/anemone_count.rb
- bin/anemone_cron.rb
- bin/anemone_pagedepth.rb
- bin/anemone_serialize.rb
- bin/anemone_url_list.rb
- lib/anemone.rb
- lib/anemone/anemone.rb
- lib/anemone/core.rb
- lib/anemone/http.rb
- lib/anemone/page.rb
- lib/anemone/page_hash.rb
- lib/anemone/tentacle.rb
has_rdoc: true
homepage: http://anemone.rubyforge.org
post_install_message:
rdoc_options:
- -m
- README.rdoc
- -t
- Anemone
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project: anemone
rubygems_version: 1.2.0
signing_key:
specification_version: 2
summary: Anemone web-spider framework
test_files:
- spec/anemone_spec.rb
- spec/core_spec.rb
- spec/page_spec.rb
- spec/fakeweb_helper.rb
- spec/spec_helper.rb