parolkar-anemone 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +19 -0
- data/README.rdoc +19 -0
- data/bin/anemone_count.rb +36 -0
- data/bin/anemone_cron.rb +106 -0
- data/bin/anemone_pagedepth.rb +44 -0
- data/bin/anemone_serialize.rb +51 -0
- data/bin/anemone_url_list.rb +54 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +56 -0
- data/lib/anemone/core.rb +209 -0
- data/lib/anemone/http.rb +38 -0
- data/lib/anemone/page.rb +177 -0
- data/lib/anemone/page_hash.rb +116 -0
- data/lib/anemone/tentacle.rb +33 -0
- data/spec/anemone_spec.rb +41 -0
- data/spec/core_spec.rb +128 -0
- data/spec/fakeweb_helper.rb +55 -0
- data/spec/page_spec.rb +49 -0
- data/spec/spec_helper.rb +7 -0
- metadata +86 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,19 @@
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Can crawl obeying robots.txt

== REQUIREMENTS
* nokogiri

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
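For orientation, a crawl along the lines of those bundled scripts might look like the sketch below; it is only an illustration, and the domain is a placeholder rather than anything shipped with the gem.

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # called for each page while the crawl is running
      anemone.on_every_page { |page| puts "#{page.code} #{page.url}" }

      # called once at the end with the PageHash of everything collected
      anemone.after_crawl do |pages|
        puts "#{pages.uniq.size} unique pages"
      end
    end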
data/bin/anemone_count.rb
ADDED
@@ -0,0 +1,36 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the total number
# of unique pages on the site.
#
# == Usage
# anemone_count.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'

def usage
  puts <<END
Usage: anemone_count.rb url
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,106 @@
#! /usr/bin/env ruby
# == Synopsis
# Performs pagedepth, url list, and count functionality
# Meant to be run daily as a cron job
#
# == Usage
# anemone_url_list.rb [options] url
#
# == Options
# -r, --relative Output relative URLs (rather than absolute)
# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_url_list.rb [options] url

Options:
-r, --relative Output relative URLs (rather than absolute)
-o, --output filename Filename to save URL list to. Defautls to urls.txt.
END
end

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  usage
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        links.slice(0..10).each do |u|
          u = u.path if options.relative
          puts " linked from #{u}"
        end

        puts " ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end

  end
end
data/bin/anemone_pagedepth.rb
ADDED
@@ -0,0 +1,44 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs a count of
# the number of Pages at each depth in the site.
#
# == Usage
# anemone_pagedepth.rb url
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'

def usage
  puts <<END
Usage: anemone_pagedepth.rb url
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
data/bin/anemone_serialize.rb
ADDED
@@ -0,0 +1,51 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and saves the resulting
# PageHash object to a file using Marshal serialization.
#
# == Usage
# anemone_serialize.rb [options] url
#
# == Options
# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_serialize.rb [options] url

Options:
-o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
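anemone_serialize.rb above only writes the dump; reading it back is the mirror image. A hedged companion sketch, not part of the gem, with a hypothetical dump file name:

    require 'anemone'

    # Reload a PageHash saved by anemone_serialize.rb and print a short summary.
    dump_file = ARGV[0] || 'crawl.1242457200'            # assumed name from a prior run
    pages = open(dump_file, 'r') { |f| Marshal.load(f) }

    puts "Pages in dump: #{pages.size}"
    pages.each_value do |page|
      next if page.nil?
      puts "#{page.code} #{page.url}"
    end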
data/bin/anemone_url_list.rb
ADDED
@@ -0,0 +1,54 @@
#! /usr/bin/env ruby
# == Synopsis
# Crawls a site starting at the given URL, and outputs the URL of each page
# in the domain as they are encountered.
#
# == Usage
# anemone_url_list.rb [options] url
#
# == Options
# -r, --relative Output relative URLs (rather than absolute)
#
# == Author
# Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_url_list.rb [options] url

Options:
-r, --relative Output relative URLs (rather than absolute)
END
end

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  usage
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end
end
data/lib/anemone/anemone.rb
ADDED
@@ -0,0 +1,56 @@
require 'ostruct'
require 'anemone/core'

module Anemone
  # Version number
  VERSION = '0.1.2'

  #module-wide options
  def Anemone.options=(options)
    @options = options
  end

  def Anemone.options
    @options
  end

  #
  # Convenience method to start a crawl using Core
  #
  def Anemone.crawl(urls, options = {}, &block)
    Anemone.options = OpenStruct.new(options)

    #by default, run 4 Tentacle threads to fetch pages
    Anemone.options.threads ||= 4

    #disable verbose output by default
    Anemone.options.verbose ||= false

    #by default, don't throw away the page response body after scanning it for links
    Anemone.options.discard_page_bodies ||= false

    #by default, identify self as Anemone/VERSION
    Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"

    #Obey Robots.txt
    Anemone.options.obey_robots_dot_txt ||= false
    if Anemone.options.obey_robots_dot_txt == true
      begin
        require 'obey_robots_dot_txt'
      rescue LoadError
        warn "You need the 'obey_robots_dot_txt' gem installed, (you may run sudo gem install parolkar-obey_robots_dot_txt --source http://gems.github.com )"
        exit
      end
    end

    #no delay between requests by default
    Anemone.options.delay ||= 0

    #use a single thread if a delay was requested
    if(Anemone.options.delay != 0)
      Anemone.options.threads = 1
    end

    Core.crawl(urls, &block)
  end
end
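To make the defaults above concrete, here is a hedged sketch of passing options through Anemone.crawl; the domain and user agent are placeholders, and it is meant as an illustration rather than something to run against a real site.

    require 'anemone'

    Anemone.crawl("http://www.example.com/",
                  :verbose    => true,          # print each URL and queue size during the crawl
                  :delay      => 1,             # seconds to sleep between requests
                  :user_agent => 'MyBot/1.0') do |anemone|
      anemone.on_every_page { |page| puts page.code }
    end

    # A non-zero :delay forces a single Tentacle thread, per the logic above.
    puts Anemone.options.threads    # => 1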
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,209 @@
require 'net/http'
require 'thread'
require 'anemone/tentacle'
require 'anemone/page_hash'

module Anemone
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, &block)
      @urls = [urls].flatten.map{ |url| URI(url) if url.is_a?(String) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      block.call(self) if block
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(root, &block)
      self.new(root) do |core|
        block.call(core) if block
        core.run
        return core
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one ore more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      if patterns
        patterns.each do |pattern|
          @skip_link_patterns << pattern
        end
      end
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      Anemone.options.threads.times do |id|
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if Anemone.options.verbose

        #perform the on_every_page blocks for this page
        do_page_blocks(page)

        page.doc = nil if Anemone.options.discard_page_bodies

        links_to_follow(page).each do |link|
          link_queue.enq(link)
          @pages[link] = nil
        end

        #create an entry in the page hash for each alias of this page,
        #i.e. all the pages that redirected to this page
        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { |i| link_queue.enq(:END)}
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      do_after_crawl_blocks()

      self
    end

    private

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each {|b| b.call(@pages)}
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blks|
        if page.url.to_s =~ pattern
          blks.each { |blk| blk.call(page) }
        end
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.find_all { |link| visit_link?(link) }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern. Returns
    # +false+ otherwise.
    #
    def visit_link?(link)
      !@pages.has_key?(link) and !skip_link?(link)
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.each { |p| return true if link.path =~ p}
      return false
    end

  end
end
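A hedged sketch tying the Core callbacks above together; the domain and patterns are placeholders, not part of the gem.

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # never follow these URLs
      anemone.skip_links_like %r{/logout}, %r{\.pdf$}

      # restrict which of a page's links get enqueued
      anemone.focus_crawl do |page|
        page.links.reject { |link| link.to_s =~ /calendar/ }
      end

      # run a block only on URLs matching a pattern
      anemone.on_pages_like(%r{/articles/}) do |page|
        puts "article: #{page.url}"
      end

      # run once on the finished PageHash
      anemone.after_crawl do |pages|
        puts "crawled #{pages.uniq.size} unique pages"
      end
    end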
data/lib/anemone/http.rb
ADDED
@@ -0,0 +1,38 @@
require 'net/http'

module Anemone
  class HTTP < Net::HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECTION_LIMIT = 5

    #
    # Retrieve an HTTP response for *url*, following redirects.
    # Returns the response object, response code, and final URI location.
    #
    def self.get(url)
      response = get_response(url)
      code = Integer(response.code)
      loc = url

      limit = REDIRECTION_LIMIT
      while response.is_a?(Net::HTTPRedirection) and limit > 0
        loc = URI(response['location'])
        loc = url.merge(loc) if loc.relative?
        response = (Anemone.options.obey_robots_dot_txt ? (Net::HTTP.get_obeying_robots(loc)) : get_response(loc) )
        limit -= 1
      end

      return response, code, loc
    end

    #
    # Get an HTTPResponse for *url*, sending the appropriate User-Agent string
    #
    def self.get_response(url)
      full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
      Net::HTTP.start(url.host, url.port) do |http|
        return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
      end
    end
  end
end
data/lib/anemone/page.rb
ADDED
@@ -0,0 +1,177 @@
require 'anemone/http'
require 'nokogiri'
require 'ostruct'

module Anemone
  class Page

    # The URL of the page
    attr_reader :url
    # Array of distinct A tag HREFs from the page
    attr_reader :links
    # Headers of the HTTP response
    attr_reader :headers

    # OpenStruct for user-stored data
    attr_accessor :data
    # Nokogiri document for the HTML body
    attr_accessor :doc
    # Integer response code of the page
    attr_accessor :code
    # Array of redirect-aliases for the page
    attr_accessor :aliases
    # Boolean indicating whether or not this page has been visited in PageHash#shortest_paths!
    attr_accessor :visited
    # Used by PageHash#shortest_paths! to store depth of the page
    attr_accessor :depth

    #
    # Create a new Page from the response of an HTTP request to *url*
    #
    def self.fetch(url)
      begin
        url = URI(url) if url.is_a?(String)

        response, code, location = Anemone::HTTP.get(url)

        aka = nil
        if !url.eql?(location)
          aka = location
        end

        return Page.new(url, response.body, code, response.to_hash, aka)
      rescue
        return Page.new(url)
      end
    end

    #
    # Create a new page
    #
    def initialize(url, body = nil, code = nil, headers = nil, aka = nil)
      @url = url
      @code = code
      @headers = headers
      @links = []
      @aliases = []
      @data = OpenStruct.new

      @aliases << aka if !aka.nil?

      if body
        begin
          @doc = Nokogiri::HTML(body)
        rescue
          return
        end

        return if @doc.nil?

        #get a list of distinct links on the page, in absolute url form
        @doc.css('a').each do |a|
          u = a.attributes['href'].content if a.attributes['href']
          next if u.nil?

          begin
            abs = to_absolute(URI(u))
          rescue
            next
          end

          @links << abs if in_domain?(abs)
        end

        @links.uniq!
      end
    end


    #
    # Return a new page with the same *response* and *url*, but
    # with a 200 response code
    #
    def alias_clone(url)
      p = clone
      p.add_alias!(@aka) if !@aka.nil?
      p.code = 200
      p
    end

    #
    # Add a redirect-alias String *aka* to the list of the page's aliases
    #
    # Returns *self*
    #
    def add_alias!(aka)
      @aliases << aka if !@aliases.include?(aka)
      self
    end

    #
    # Returns an Array of all links from this page, and all the
    # redirect-aliases of those pages, as String objects.
    #
    # *page_hash* is a PageHash object with the results of the current crawl.
    #
    def links_and_their_aliases(page_hash)
      @links.inject([]) do |results, link|
        results.concat([link].concat(page_hash[link].aliases))
      end
    end

    #
    # The content-type returned by the HTTP request for this page
    #
    def content_type
      @headers['content-type'][0] rescue nil
    end

    #
    # Returns +true+ if the page is a HTML document, returns +false+
    # otherwise.
    #
    def html?
      (@content_type =~ /text\/html/) == 0
    end

    #
    # Returns +true+ if the page is a HTTP redirect, returns +false+
    # otherwise.
    #
    def redirect?
      (300..399).include?(@code)
    end

    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
    #
    def not_found?
      404 == @code
    end

    #
    # Converts relative URL *link* into an absolute URL based on the
    # location of the page
    #
    def to_absolute(link)
      # remove anchor
      link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))

      relative = URI(link)
      absolute = @url.merge(relative)

      absolute.path = '/' if absolute.path.empty?

      return absolute
    end

    #
    # Returns +true+ if *uri* is in the same domain as the page, returns
    # +false+ otherwise
    #
    def in_domain?(uri)
      uri.host == @url.host
    end
  end
end
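Pages are normally built by Tentacle threads during a crawl, but Page.fetch can also be used on its own. A hedged sketch, assuming the gem is on the load path and noting that Anemone::HTTP reads the module-wide options for its User-Agent; the URL is a placeholder.

    require 'ostruct'
    require 'anemone'

    # Anemone.crawl normally populates these; set them by hand for a one-off fetch.
    Anemone.options = OpenStruct.new(:user_agent => "Anemone/#{Anemone::VERSION}")

    page = Anemone::Page.fetch("http://www.example.com/")
    puts page.code                        # e.g. 200
    puts page.content_type                # e.g. "text/html"
    page.links.each { |uri| puts uri }    # absolute, same-domain links from the body
    page.data.label = 'homepage'          # arbitrary user data on the OpenStruct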
data/lib/anemone/page_hash.rb
ADDED
@@ -0,0 +1,116 @@
module Anemone
  class PageHash < Hash

    #
    # Use a breadth-first search to calculate the single-source
    # shortest paths from *root* to all pages in the PageHash
    #
    def shortest_paths!(root)
      root = URI(root) if root.is_a?(String)
      raise "Root node not found" if !has_key?(root)

      each_value {|p| p.visited = false if p}

      q = Queue.new

      q.enq(root)
      self[root].depth = 0
      self[root].visited = true
      while(!q.empty?)
        url = q.deq

        next if !has_key?(url)

        page = self[url]

        page.links.each do |u|
          next if !has_key?(u) or self[u].nil?
          link = self[u]
          aliases = [link].concat(link.aliases.map {|a| self[a] })

          aliases.each do |node|
            if node.depth.nil? or page.depth + 1 < node.depth
              node.depth = page.depth + 1
            end
          end

          q.enq(self[u].url) if !self[u].visited
          self[u].visited = true
        end
      end

      self
    end

    #
    # Returns a new PageHash by removing redirect-aliases for each
    # non-redirect Page
    #
    def uniq
      results = PageHash.new
      each do |url, page|
        #if none of the aliases of this page have been added, and this isn't a redirect page, add this page
        page_added = page.aliases.inject(false) { |r, a| r ||= results.has_key? a}
        if !page.redirect? and !page_added
          results[url] = page.clone
          results[url].aliases = []
        end
      end

      results
    end

    #
    # If given a single URL (as a String or URI), returns an Array of Pages which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [Page, Page...]) of Pages linking to those URLs
    #
    def pages_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls] unless urls.is_a?(Array)
        single = true
      end

      urls.map! do |url|
        if url.is_a?(String)
          URI(url) rescue nil
        else
          url
        end
      end
      urls.compact

      links = {}
      urls.each { |url| links[url] = [] }
      values.each do |page|
        urls.each { |url| links[url] << page if page.links.include?(url) }
      end

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

    #
    # If given a single URL (as a String or URI), returns an Array of URLs which link to that URL
    # If given an Array of URLs, returns a Hash (URI => [URI, URI...]) of URLs linking to those URLs
    #
    def urls_linking_to(urls)
      unless urls.is_a?(Array)
        urls = [urls] unless urls.is_a?(Array)
        single = true
      end

      links = pages_linking_to(urls)
      links.each { |url, pages| links[url] = pages.map{|p| p.url} }

      if single and !links.empty?
        return links.first
      else
        return links
      end
    end

  end
end
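A hedged sketch of a post-crawl report built on the PageHash methods above, in the spirit of bin/anemone_cron.rb; the root URL is a placeholder.

    require 'anemone'

    root = "http://www.example.com/"

    Anemone.crawl(root) do |anemone|
      anemone.after_crawl do |pages|
        # URLs that returned 404, and the URLs that link to them
        broken = pages.values.select { |p| p && p.not_found? }.map { |p| p.url.to_s }
        pages.urls_linking_to(broken).each do |url, from|
          puts "#{url} is missing; linked from #{from.join(', ')}"
        end

        # collapse redirect aliases, then compute BFS depth from the root
        pages = pages.shortest_paths!(root).uniq
        puts "deepest page: #{pages.values.map { |p| p.depth }.compact.max} clicks from the root"
      end
    end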
data/lib/anemone/tentacle.rb
ADDED
@@ -0,0 +1,33 @@
require 'anemone/page'

module Anemone
  class Tentacle

    #
    # Create a new Tentacle
    #
    def initialize(link_queue, page_queue)
      @link_queue = link_queue
      @page_queue = page_queue
    end

    #
    # Gets links from @link_queue, and returns the fetched
    # Page objects into @page_queue
    #
    def run
      while true do
        link = @link_queue.deq

        break if link == :END

        page = Page.fetch(link)

        @page_queue.enq(page)

        sleep Anemone.options.delay
      end
    end

  end
end
data/spec/anemone_spec.rb
ADDED
@@ -0,0 +1,41 @@
require File.dirname(__FILE__) + '/spec_helper'

describe Anemone do

  it "should have a version" do
    Anemone.const_defined?('VERSION').should == true
  end

  it "should have options" do
    Anemone.should respond_to(:options)
  end

  it "should accept options for the crawl" do
    Anemone.crawl(SPEC_DOMAIN, :verbose => false,
                               :threads => 2,
                               :discard_page_bodies => true,
                               :user_agent => 'test')
    Anemone.options.verbose.should == false
    Anemone.options.threads.should == 2
    Anemone.options.discard_page_bodies.should == true
    Anemone.options.delay.should == 0
    Anemone.options.user_agent.should == 'test'
  end

  it "should accept options of obeying Robots.txt for the crawl" do
    Anemone.crawl(SPEC_DOMAIN, :obey_robots_dot_txt => true)
    Anemone.options.obey_robots_dot_txt.should == true
  end

  it "should use 1 thread if a delay is requested" do
    Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
    Anemone.options.threads.should == 1
  end

  it "should return a Anemone::Core from the crawl, which has a PageHash" do
    result = Anemone.crawl(SPEC_DOMAIN)
    result.should be_an_instance_of(Anemone::Core)
    result.pages.should be_an_instance_of(Anemone::PageHash)
  end

end
data/spec/core_spec.rb
ADDED
@@ -0,0 +1,128 @@
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  describe Core do

    before(:each) do
      FakeWeb.clean_registry
    end

    it "should crawl all the html pages in a domain by following <a> href's" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1', :links => ['3'])
      pages << FakePage.new('2')
      pages << FakePage.new('3')

      Anemone.crawl(pages[0].url).should have(4).pages
    end

    it "should not leave the original domain" do
      pages = []
      pages << FakePage.new('0', :links => ['1'], :hrefs => 'http://www.other.com/')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include('http://www.other.com/')
    end

    it "should follow http redirects" do
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1', :redirect => '2')
      pages << FakePage.new('2')

      Anemone.crawl(pages[0].url).should have(3).pages
    end

    it "should accept multiple starting URLs" do
      pages = []
      pages << FakePage.new('0', :links => ['1'])
      pages << FakePage.new('1')
      pages << FakePage.new('2', :links => ['3'])
      pages << FakePage.new('3')

      Anemone.crawl([pages[0].url, pages[2].url]).should have(4).pages
    end

    it "should include the query string when following links" do
      pages = []
      pages << FakePage.new('0', :links => ['1?foo=1'])
      pages << FakePage.new('1?foo=1')
      pages << FakePage.new('1')

      core = Anemone.crawl(pages[0].url)

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[2].url)
    end

    it "should be able to skip links based on a RegEx" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.skip_links_like /1/
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should be able to call a block on every page" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      count = 0
      Anemone.crawl(pages[0].url) do |a|
        a.on_every_page { count += 1 }
      end

      count.should == 3
    end

    it "should not discard page bodies by default" do
      Anemone.crawl(FakePage.new('0').url).pages.values.first.doc.should_not be_nil
    end

    it "should optionally discard page bodies to conserve memory" do
      core = Anemone.crawl(FakePage.new('0').url, :discard_page_bodies => true)
      core.pages.values.first.doc.should be_nil
    end

    it "should provide a focus_crawl method to select the links on each page to follow" do
      pages = []
      pages << FakePage.new('0', :links => ['1', '2'])
      pages << FakePage.new('1')
      pages << FakePage.new('2')

      core = Anemone.crawl(pages[0].url) do |a|
        a.focus_crawl {|p| p.links.reject{|l| l.to_s =~ /1/}}
      end

      core.should have(2).pages
      core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
    end

    it "should optionally delay between page requests" do
      delay = 0.25

      pages = []
      pages << FakePage.new('0', :links => '1')
      pages << FakePage.new('1')

      start = Time.now
      Anemone.crawl(pages[0].url, :delay => delay)
      finish = Time.now

      (finish - start).should satisfy {|t| t > delay * 2}
    end

  end
end
data/spec/fakeweb_helper.rb
ADDED
@@ -0,0 +1,55 @@
begin
  require 'fakeweb'
rescue LoadError
  warn "You need the 'fakeweb' gem installed to test Anemone"
  exit
end

FakeWeb.allow_net_connect = false

module Anemone
  SPEC_DOMAIN = "http://www.example.com/"

  class FakePage
    attr_accessor :links
    attr_accessor :hrefs

    def initialize(name = '', options = {})
      @name = name
      @links = [options[:links]].flatten if options.has_key?(:links)
      @hrefs = [options[:hrefs]].flatten if options.has_key?(:hrefs)
      @redirect = options[:redirect] if options.has_key?(:redirect)

      create_body
      add_to_fakeweb
    end

    def url
      SPEC_DOMAIN + @name
    end

    private

    def create_body
      @body = "<html><body>"
      @links.each{|l| @body += "<a href=\"#{SPEC_DOMAIN}#{l}\"></a>"} if @links
      @hrefs.each{|h| @body += "<a href=\"#{h}\"></a>"} if @hrefs
      @body += "</body></html>"
    end

    def add_to_fakeweb
      options = {:body => @body, :content_type => "text/html", :status => [200, "OK"]}

      if @redirect
        options[:status] = [301, "Permanently Moved"]
        options[:location] = SPEC_DOMAIN + @redirect
      end

      FakeWeb.register_uri(:get, SPEC_DOMAIN + @name, options)
    end
  end
end

#default root
Anemone::FakePage.new
data/spec/page_spec.rb
ADDED
@@ -0,0 +1,49 @@
require File.dirname(__FILE__) + '/spec_helper'

module Anemone
  describe Page do

    before(:each) do
      @page = Page.fetch(FakePage.new('home').url)
    end

    it "should be able to fetch a page" do
      @page.should_not be_nil
      @page.url.to_s.should include('home')
    end

    it "should store the response headers when fetching a page" do
      @page.headers.should_not be_nil
      @page.headers.should have_key('content-type')
    end

    it "should have an OpenStruct attribute for the developer to store data in" do
      @page.data.should_not be_nil
      @page.data.should be_an_instance_of(OpenStruct)

      @page.data.test = 'test'
      @page.data.test.should == 'test'
    end

    it "should have a Nokogori::HTML::Document attribute for the page body" do
      @page.doc.should_not be_nil
      @page.doc.should be_an_instance_of(Nokogiri::HTML::Document)
    end

    it "should indicate whether it was fetched after an HTTP redirect" do
      @page.should respond_to(:redirect?)

      @page.redirect?.should == false

      Page.fetch(FakePage.new('redir', :redirect => 'home').url).redirect?.should == true
    end

    it "should have a method to tell if a URI is in the same domain as the page" do
      @page.should respond_to(:in_domain?)

      @page.in_domain?(URI(FakePage.new('test').url)).should == true
      @page.in_domain?(URI('http://www.other.com/')).should == false
    end

  end
end
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,86 @@
--- !ruby/object:Gem::Specification
name: parolkar-anemone
version: !ruby/object:Gem::Version
  version: 0.1.2
platform: ruby
authors:
- Chris Kite
autorequire:
bindir: bin
cert_chain: []

date: 2009-05-16 00:00:00 -07:00
default_executable:
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  type: :runtime
  version_requirement:
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.3.0
    version:
description:
email:
executables:
- anemone_count.rb
- anemone_cron.rb
- anemone_pagedepth.rb
- anemone_serialize.rb
- anemone_url_list.rb
extensions: []

extra_rdoc_files:
- README.rdoc
files:
- LICENSE.txt
- README.rdoc
- bin/anemone_count.rb
- bin/anemone_cron.rb
- bin/anemone_pagedepth.rb
- bin/anemone_serialize.rb
- bin/anemone_url_list.rb
- lib/anemone.rb
- lib/anemone/anemone.rb
- lib/anemone/core.rb
- lib/anemone/http.rb
- lib/anemone/page.rb
- lib/anemone/page_hash.rb
- lib/anemone/tentacle.rb
has_rdoc: true
homepage: http://anemone.rubyforge.org
post_install_message:
rdoc_options:
- -m
- README.rdoc
- -t
- Anemone
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: "0"
  version:
requirements: []

rubyforge_project: anemone
rubygems_version: 1.2.0
signing_key:
specification_version: 2
summary: Anemone web-spider framework
test_files:
- spec/anemone_spec.rb
- spec/core_spec.rb
- spec/page_spec.rb
- spec/fakeweb_helper.rb
- spec/spec_helper.rb