spk-anemone 0.2.4

data/CHANGELOG.rdoc ADDED
@@ -0,0 +1,34 @@
+ == 0.2.4 / 2009-11-26
+
+ * Minor enhancements
+
+   * Authorization header can be set via an Anemone option or in the URI
+   * Added an accessor for the HTML body
+
+ == 0.2.3 / 2009-11-01
+
+ * Minor enhancements
+
+   * Options are now applied per-crawl, rather than module-wide.
+
+ * Bug fixes
+
+   * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+ == 0.2.2 / 2009-10-26
+
+ * Minor enhancements
+
+   * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+ == 0.2.1 / 2009-10-24
+
+ * Major enhancements
+
+   * Added HTTPS support.
+   * CLI program 'anemone', which is a frontend for several tasks.
+
+ * Minor enhancements
+
+   * HTTP request response time recorded in Page.
+   * Use of persistent HTTP connections.
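
Editor's note: the 0.2.4 entry above introduces HTTP Basic authorization. As a minimal, hypothetical sketch (not part of the gem's files; the host and credentials are placeholders), based on the Core#authorization code in data/lib/anemone/core.rb later in this listing, credentials can be supplied either in the start URL or through the :authorization option:

  require 'anemone'

  # Credentials embedded in the start URL; Core#initialize calls
  # authorization(url) whenever url.user is present.
  Anemone.crawl("http://user:secret@example.com/") do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end

  # ...or passed per crawl as an option ("user:pass" string or [user, pass] array).
  Anemone.crawl("http://example.com/", :authorization => "user:secret") do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end
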
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,24 @@
+ = Anemone
+
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+ == Requirements
+ * nokogiri
+ * robots
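
Editor's note: as a quick illustration of the features listed above (an editorial sketch, not one of the gem's files; the URL and patterns are invented), a crawl combining skip_links_like, focus_crawl and on_every_page from data/lib/anemone/core.rb could look like this:

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # never follow URLs whose path matches this pattern
    anemone.skip_links_like %r{/secret/}

    # only follow links that stay under /docs/
    anemone.focus_crawl do |page|
      page.links.select { |uri| uri.path =~ %r{^/docs/} }
    end

    # print every page as it is fetched
    anemone.on_every_page { |page| puts page.url }
  end
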
data/bin/anemone ADDED
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/core'
data/lib/anemone/cli.rb ADDED
@@ -0,0 +1,24 @@
+ module Anemone
+   module CLI
+     COMMANDS = %w[count cron pagedepth serialize url-list]
+
+     def self.run
+       command = ARGV.shift
+
+       if COMMANDS.include? command
+         load "anemone/cli/#{command.tr('-', '_')}.rb"
+       else
+         puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+   anemone <command> [arguments]
+
+ Commands:
+   #{COMMANDS.join(', ')}
+         INFO
+       end
+     end
+   end
+ end
data/lib/anemone/cli/count.rb ADDED
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   url = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone count <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs the total number
+   of unique pages on the site.
+   INFO
+   exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
data/lib/anemone/cli/cron.rb ADDED
@@ -0,0 +1,90 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone cron [options] <url>
+
+ Synopsis:
+   Combination of `count`, `pagedepth` and `url-list` commands.
+   Performs pagedepth, url list, and count functionality.
+   Outputs results to STDOUT and link list to file (urls.txt).
+   Meant to be run daily as a cron job.
+
+ Options:
+   -r, --relative           Output relative URLs (rather than absolute)
+   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+   INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+
+ end
data/lib/anemone/cli/pagedepth.rb ADDED
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone pagedepth <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs a count of
+   the number of pages at each depth of the crawl.
+   INFO
+   exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
data/lib/anemone/cli/serialize.rb ADDED
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone serialize [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and saves the resulting
+   PageHash object to a file using Marshal serialization.
+
+ Options:
+   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+   INFO
+   exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
data/lib/anemone/cli/url_list.rb ADDED
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as it is encountered.
+
+ Options:
+   -r, --relative           Output relative URLs (rather than absolute)
+   INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
data/lib/anemone/core.rb ADDED
@@ -0,0 +1,280 @@
+ require 'thread'
+ require 'robots'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/page_hash'
+
+ module Anemone
+
+   VERSION = '0.2.4';
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     # Hash of options for the crawl
+     attr_accessor :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # Authentication
+       :authorization => nil,
+     }
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       process_options opts
+
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url|
+         url.path = '/' if url.path.empty?
+         authorization(url) if url.user
+       }
+
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+
+         # perform the on_every_page blocks for this page
+         do_page_blocks(page)
+
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links_to_follow(page).each do |link|
+           link_queue.enq([link, page])
+           @pages[link] = nil
+         end
+
+         # create an entry in the page hash for each alias of this page,
+         # i.e. all the pages that redirected to this page
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { link_queue.enq(:END)}
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       do_after_crawl_blocks()
+
+       self
+     end
+
+     private
+
+     def process_options(options)
+       @opts = DEFAULT_OPTS.merge options
+
+       authorization(@opts[:authorization])
+
+       @opts[:threads] = 1 if @opts[:delay] > 0
+
+       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+     end
+
+     # Generate Authorization string only if not already set
+     def authorization(auth=nil)
+       return if @opts[:authorization] =~ /^Basic .*/
+       require 'base64'
+       # Base64.encode64 appends a trailing newline; strip it so the header
+       # value stays on a single line.
+       if auth.is_a?(String) && auth.include?(':')
+         @opts[:authorization] = "Basic #{Base64.encode64(auth).strip}"
+       elsif auth.is_a?(Array)
+         user = auth.first
+         password = auth.last
+         @opts[:authorization] = "Basic #{Base64.encode64(user + ':' + password).strip}"
+       elsif auth.is_a?(URI)
+         user = auth.user
+         password = auth.password
+         @opts[:authorization] = "Basic #{Base64.encode64(user + ':' + password).strip}"
+       end
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each {|b| b.call(@pages)}
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blks|
+         if page.url.to_s =~ pattern
+           blks.each { |blk| blk.call(page) }
+         end
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+
+       if from_page && @opts[:depth_limit]
+         too_deep = from_page.depth >= @opts[:depth_limit]
+       else
+         too_deep = false
+       end
+
+       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |p| link.path =~ p }
+     end
+
+   end
+ end
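
Editor's note: as DEFAULT_OPTS and process_options above show, options are merged per crawl, so each Anemone.crawl call can tune threading, politeness and depth independently. A hedged sketch using only keys that appear in DEFAULT_OPTS (the URL is a placeholder, and the PageHash yielded to after_crawl is assumed to behave like a Hash, as it does throughout core.rb and the CLI scripts):

  require 'anemone'

  Anemone.crawl("http://example.com/",
                :threads         => 2,     # worker Tentacle threads
                :delay           => 1,     # seconds between requests; any delay > 0 forces :threads to 1
                :obey_robots_txt => true,  # consult robots.txt via the robots gem
                :depth_limit     => 3,     # don't follow links deeper than this
                :verbose         => true) do |anemone|
    anemone.after_crawl do |pages|
      puts "Crawled #{pages.size} pages"
    end
  end
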