shingara-anemone 0.2.4
- data/CHANGELOG.rdoc +27 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +24 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/core.rb +256 -0
- data/lib/anemone/http.rb +123 -0
- data/lib/anemone/page.rb +155 -0
- data/lib/anemone/page_hash.rb +142 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +15 -0
- data/spec/core_spec.rb +203 -0
- data/spec/fakeweb_helper.rb +57 -0
- data/spec/page_spec.rb +52 -0
- data/spec/spec_helper.rb +7 -0
- metadata +96 -0
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,27 @@
== 0.2.3 / 2009-11-01

* Minor enhancements

  * Options are now applied per-crawl, rather than module-wide.

* Bug fixes

  * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.

== 0.2.2 / 2009-10-26

* Minor enhancements

  * When the :verbose option is set to true, exception backtraces are printed to aid debugging.

== 0.2.1 / 2009-10-24

* Major enhancements

  * Added HTTPS support.
  * CLI program 'anemone', which is a frontend for several tasks.

* Minor enhancements

  * HTTP request response time recorded in Page.
  * Use of persistent HTTP connections.
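The 0.2.3 change above means crawl settings are now passed to each Anemone.crawl call rather than configured module-wide. A minimal sketch of the new style (option keys taken from DEFAULT_OPTS in lib/anemone/core.rb below; the URL is a placeholder):

require 'anemone'

# These options affect only this crawl, not the Anemone module as a whole.
Anemone.crawl('http://www.example.com/', :verbose => true, :depth_limit => 2) do |anemone|
  anemone.on_every_page { |page| puts page.url }
end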
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,24 @@
= Anemone

Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

See http://anemone.rubyforge.org for more information.

== Features
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions
* Choose the links to follow on each page with focus_crawl()
* HTTPS support
* Records response time for each page
* CLI program can list all pages in a domain, calculate page depths, and more

== Examples
See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.

== Requirements
* nokogiri
* robots
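For orientation, a minimal crawl using the hooks listed above might look like the sketch below; the URL and patterns are placeholders, and the block names correspond to the Core API in lib/anemone/core.rb further down.

require 'anemone'

Anemone.crawl('http://www.example.com/') do |anemone|
  # Don't follow links whose path matches these patterns.
  anemone.skip_links_like %r{/cart}, %r{/login}

  # Run on every page as it is fetched.
  anemone.on_every_page do |page|
    puts page.url
  end

  # Inspect the full PageHash once the crawl finishes.
  anemone.after_crawl do |pages|
    puts "#{pages.uniq.size} unique pages"
  end
end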
data/bin/anemone
ADDED
data/lib/anemone.rb
ADDED
data/lib/anemone/cli.rb
ADDED
@@ -0,0 +1,24 @@
module Anemone
  module CLI
    COMMANDS = %w[count cron pagedepth serialize url-list]

    def self.run
      command = ARGV.shift

      if COMMANDS.include? command
        load "anemone/cli/#{command.tr('-', '_')}.rb"
      else
        puts <<-INFO
Anemone is a web spider framework that can collect
useful information about pages it visits.

Usage:
  anemone <command> [arguments]

Commands:
  #{COMMANDS.join(', ')}
INFO
      end
    end
  end
end
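The dispatcher above maps a command name to a script under anemone/cli/, translating '-' to '_'. The 4-line data/bin/anemone executable is not shown in this diff; the following is a hypothetical driver that exercises the same entry point, not the gem's actual binary.

require 'anemone/cli'

# Equivalent to running `anemone pagedepth http://www.example.com/` from a shell.
ARGV.replace(%w[pagedepth http://www.example.com/])
Anemone::CLI.run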
data/lib/anemone/cli/count.rb
ADDED
@@ -0,0 +1,22 @@
require 'anemone'

begin
  # make sure that the first option is a URL we can crawl
  url = URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
  anemone count <url>

Synopsis:
  Crawls a site starting at the given URL and outputs the total number
  of unique pages on the site.
INFO
  exit(0)
end

Anemone.crawl(url) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
data/lib/anemone/cli/cron.rb
ADDED
@@ -0,0 +1,90 @@
require 'anemone'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

begin
  # make sure that the last argument is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
  anemone cron [options] <url>

Synopsis:
  Combination of `count`, `pagedepth` and `url-list` commands.
  Performs pagedepth, url list, and count functionality.
  Outputs results to STDOUT and link list to file (urls.txt).
  Meant to be run daily as a cron job.

Options:
  -r, --relative           Output relative URLs (rather than absolute)
  -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
INFO
  exit(0)
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(ARGV)

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        links.slice(0..10).each do |u|
          u = u.path if options.relative
          puts " linked from #{u}"
        end

        puts " ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end
  end

end
data/lib/anemone/cli/pagedepth.rb
ADDED
@@ -0,0 +1,32 @@
require 'anemone'

begin
  # make sure that the first option is a URL we can crawl
  root = URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
  anemone pagedepth <url>

Synopsis:
  Crawls a site starting at the given URL and outputs a count of
  the number of pages at each depth of the crawl.
INFO
  exit(0)
end

Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq

    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end
data/lib/anemone/cli/serialize.rb
ADDED
@@ -0,0 +1,35 @@
require 'anemone'
require 'optparse'
require 'ostruct'

begin
  # make sure that the first option is a URL we can crawl
  root = URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
  anemone serialize [options] <url>

Synopsis:
  Crawls a site starting at the given URL and saves the resulting
  PageHash object to a file using Marshal serialization.

Options:
  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
INFO
  exit(0)
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(ARGV)

Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') { |f| Marshal.dump(pages, f) }
  end
end
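To read a saved crawl back into memory, the Marshal dump written above can be loaded the same way. A sketch, assuming the dump was written to a hypothetical crawl.12345 file via -o and that requiring 'anemone' pulls in the Page and PageHash classes needed to unmarshal:

require 'anemone'

# Load the PageHash previously written by `anemone serialize -o crawl.12345 <url>`.
pages = open('crawl.12345', 'rb') { |f| Marshal.load(f) }

puts "Loaded #{pages.size} pages"
pages.each_value { |page| puts page.url }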
data/lib/anemone/cli/url_list.rb
ADDED
@@ -0,0 +1,41 @@
require 'anemone'
require 'optparse'
require 'ostruct'

options = OpenStruct.new
options.relative = false

begin
  # make sure that the last option is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
  anemone url-list [options] <url>

Synopsis:
  Crawls a site starting at the given URL, and outputs the URL of each page
  in the domain as they are encountered.

Options:
  -r, --relative    Output relative URLs (rather than absolute)
INFO
  exit(0)
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

Anemone.crawl(root, :discard_page_bodies => true) do |anemone|

  anemone.on_every_page do |page|
    if options.relative
      puts page.url.path
    else
      puts page.url
    end
  end

end
data/lib/anemone/core.rb
ADDED
@@ -0,0 +1,256 @@
require 'thread'
require 'robots'
require 'anemone/tentacle'
require 'anemone/page'
require 'anemone/page_hash'

module Anemone

  VERSION = '0.2.3';

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    # Hash of options for the crawl
    attr_accessor :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5
    }

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*
    #
    def initialize(urls, opts = {})
      @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }

      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []

      process_options opts

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      if patterns
        patterns.each do |pattern|
          @on_pages_like_blocks[pattern] << block
        end
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl
    #
    def run
      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each{ |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]

        # perform the on_every_page blocks for this page
        do_page_blocks(page)

        page.discard_doc! if @opts[:discard_page_bodies]

        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
          @pages[link] = nil
        end

        # create an entry in the page hash for each alias of this page,
        # i.e. all the pages that redirected to this page
        page.aliases.each do |aka|
          if !@pages.has_key?(aka) or @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? and page_queue.empty?
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          if page_queue.empty?
            @tentacles.size.times { link_queue.enq(:END) }
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      do_after_crawl_blocks()

      self
    end

    private

    def process_options(options)
      @opts = DEFAULT_OPTS.merge options

      @opts[:threads] = 1 if @opts[:delay] > 0

      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |b| b.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blks|
        if page.url.to_s =~ pattern
          blks.each { |blk| blk.call(page) }
        end
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true

      if from_page && @opts[:depth_limit]
        too_deep = from_page.depth >= @opts[:depth_limit]
      else
        too_deep = false
      end

      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |p| link.path =~ p }
    end

  end
end
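Taken together, the public Core API above (skip_links_like, on_every_page, on_pages_like, focus_crawl, after_crawl) can be combined in a single crawl. A sketch with a placeholder URL, pattern, and link-selection rule:

require 'anemone'

Anemone.crawl('http://www.example.com/', :obey_robots_txt => true, :depth_limit => 3) do |anemone|
  # Only follow links on the same host as the page they appear on (illustrative rule).
  anemone.focus_crawl do |page|
    page.links.select { |uri| uri.host == page.url.host }
  end

  # Run a block only on pages whose URL matches a pattern.
  anemone.on_pages_like(%r{/articles/}) do |page|
    puts "article: #{page.url}"
  end
end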