spk-anemone 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGELOG.rdoc ADDED
@@ -0,0 +1,34 @@
1
+ == 0.2.4 / 2009-11-26
2
+
3
+ * Minor enhancements
4
+
5
+ * Authorization header can be set by an Anemone option or in the URI
6
+ * Add accessor for html body
7
+
8
+ == 0.2.3 / 2009-11-01
9
+
10
+ * Minor enhancements
11
+
12
+ * Options are now applied per-crawl, rather than module-wide.
13
+
14
+ * Bug fixes
15
+
16
+ * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
17
+
18
+ == 0.2.2 / 2009-10-26
19
+
20
+ * Minor enhancements
21
+
22
+ * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
23
+
24
+ == 0.2.1 / 2009-10-24
25
+
26
+ * Major enhancements
27
+
28
+ * Added HTTPS support.
29
+ * CLI program 'anemone', which is a frontend for several tasks.
30
+
31
+ * Minor enhancements
32
+
33
+ * HTTP request response time recorded in Page.
34
+ * Use of persistent HTTP connections.
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2009 Vertive, Inc.
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,24 @@
1
+ = Anemone
2
+
3
+ Anemone is a web spider framework that can spider a domain and collect useful
4
+ information about the pages it visits. It is versatile, allowing you to
5
+ write your own specialized spider tasks quickly and easily.
6
+
7
+ See http://anemone.rubyforge.org for more information.
8
+
9
+ == Features
10
+ * Multi-threaded design for high performance
11
+ * Tracks 301 HTTP redirects to understand a page's aliases
12
+ * Built-in BFS algorithm for determining page depth
13
+ * Allows exclusion of URLs based on regular expressions
14
+ * Choose the links to follow on each page with focus_crawl()
15
+ * HTTPS support
16
+ * Records response time for each page
17
+ * CLI program can list all pages in a domain, calculate page depths, and more
18
+
19
+ == Examples
20
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
21
+
22
+ == Requirements
23
+ * nokogiri
24
+ * robots
data/bin/anemone ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ require 'anemone/cli'
3
+
# Hand control to the command dispatcher defined in anemone/cli.
Anemone::CLI.run
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
1
+ require 'rubygems'
2
+ require 'anemone/core'
@@ -0,0 +1,24 @@
module Anemone
  #
  # Command-line front end: maps the first command-line argument to one of
  # the task scripts under anemone/cli.
  #
  module CLI
    # Names of the sub-commands accepted by the dispatcher
    COMMANDS = %w[count cron pagedepth serialize url-list]

    #
    # Remove the first argument from ARGV and load the script of the same
    # name (dashes become underscores). Prints a usage summary instead when
    # the argument is missing or is not one of COMMANDS.
    #
    def self.run
      requested = ARGV.shift

      unless COMMANDS.include?(requested)
        puts <<-INFO
Anemone is a web spider framework that can collect
useful information about pages it visits.

Usage:
anemone <command> [arguments]

Commands:
#{COMMANDS.join(', ')}
INFO
        return
      end

      script = requested.tr('-', '_')
      load "anemone/cli/#{script}.rb"
    end
  end
end
@@ -0,0 +1,22 @@
1
+ require 'anemone'
2
+
# Help text shown when no crawlable URL was supplied
usage = <<-INFO
Usage:
anemone count <url>

Synopsis:
Crawls a site starting at the given URL and outputs the total number
of unique pages on the site.
INFO

# The first argument must be convertible to a URI; otherwise print the
# usage text and stop.
start_url = begin
  URI(ARGV[0])
rescue
  puts usage
  exit(0)
end

# Crawl from the start URL and print the number of unique pages once done.
Anemone.crawl(start_url) do |spider|
  spider.after_crawl { |page_hash| puts page_hash.uniq.size }
end
@@ -0,0 +1,90 @@
1
+ require 'anemone'
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# parse command-line options first, so that only the URL is left in ARGV
# (otherwise a lone option such as "-r" would be mistaken for the URL)
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(ARGV)

begin
  # make sure that the remaining argument is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
anemone cron [options] <url>

Synopsis:
Combination of `count`, `pagedepth` and `url-list` commands.
Performs pagedepth, url list, and count functionality.
Outputs results to STDOUT and link list to file (urls.txt).
Meant to be run daily as a cron job.

Options:
-r, --relative Output relative URLs (rather than absolute)
-o, --output filename Filename to save URL list to. Defaults to urls.txt.
INFO
  exit(0)
end

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      not_found << page.url.to_s if page.not_found?
    end

    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        puts(options.relative ? URI(url).path.to_s : url)

        # show at most 10 referrers; the ellipsis below marks any that
        # were left out
        links.slice(0, 10).each do |link|
          referrer = options.relative ? link.path : link
          puts " linked from #{referrer}"
        end

        puts " ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject(Hash.new(0)) do |counts, page|
      counts[page.depth] += 1
      counts
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file; the block form closes (and flushes)
    # the file, and File.open never treats the name as a "|command" pipe
    File.open(options.output_file, 'w') do |file|
      pages.each_key do |url|
        file.puts(options.relative ? url.path.to_s : url.to_s)
      end
    end
  end

end
@@ -0,0 +1,32 @@
1
+ require 'anemone'
2
+
# The crawl starts from the first command-line argument; when it cannot be
# turned into a URI, show the usage text and stop.
root = begin
  URI(ARGV[0])
rescue
  puts <<-INFO
Usage:
anemone pagedepth <url>

Synopsis:
Crawls a site starting at the given URL and outputs a count of
the number of pages at each depth of the crawl.
INFO
  exit(0)
end

Anemone.crawl(root) do |anemone|
  # NOTE(review): these two path patterns look site-specific -- confirm
  # they are meant to apply to every crawl
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    unique_pages = pages.shortest_paths!(root).uniq

    # tally the number of pages found at each depth
    depth_counts = Hash.new(0)
    unique_pages.values.each { |page| depth_counts[page.depth] += 1 }

    # one output line per depth, in ascending order of depth
    depth_counts.sort.each do |depth, count|
      puts "Depth: #{depth} Count: #{count}"
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require 'anemone'
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options first, so that only the URL is left in ARGV;
# the usage line puts options before the URL, so ARGV[0] may be "-o"
opts = OptionParser.new
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(ARGV)

begin
  # make sure that the remaining argument is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
anemone serialize [options] <url>

Synopsis:
Crawls a site starting at the given URL and saves the resulting
PageHash object to a file using Marshal serialization.

Options:
-o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
INFO
  exit(0)
end

Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    # Marshal output is binary data, so write it in binary mode; File.open
    # (unlike Kernel#open) never treats the name as a "|command" pipe
    File.open(options.output_file, 'wb') { |f| Marshal.dump(pages, f) }
  end
end
@@ -0,0 +1,41 @@
1
+ require 'anemone'
2
+ require 'optparse'
3
+ require 'ostruct'
4
+
options = OpenStruct.new
options.relative = false

# parse command-line options first, so that only the URL is left in ARGV
# (otherwise a lone "-r" would be mistaken for the URL)
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

begin
  # make sure that the remaining argument is a URL we can crawl
  root = URI(ARGV.last)
rescue
  puts <<-INFO
Usage:
anemone url-list [options] <url>

Synopsis:
Crawls a site starting at the given URL, and outputs the URL of each page
in the domain as they are encountered.

Options:
-r, --relative Output relative URLs (rather than absolute)
INFO
  exit(0)
end

Anemone.crawl(root, :discard_page_bodies => true) do |anemone|

  # print each page's URL (or only its path with --relative) as it arrives
  anemone.on_every_page do |page|
    puts(options.relative ? page.url.path : page.url)
  end

end
@@ -0,0 +1,280 @@
1
+ require 'thread'
2
+ require 'robots'
3
+ require 'anemone/tentacle'
4
+ require 'anemone/page'
5
+ require 'anemone/page_hash'
6
+
module Anemone

  VERSION = '0.2.4'

  #
  # Convenience method to start a crawl
  #
  def Anemone.crawl(urls, options = {}, &block)
    Core.crawl(urls, options, &block)
  end

  #
  # Drives a crawl: hands URLs to Tentacle threads through a link queue,
  # collects the resulting Page objects from a page queue into a PageHash,
  # and runs the user-supplied callback blocks.
  #
  class Core
    # PageHash storing all Page objects encountered during the crawl
    attr_reader :pages

    # Hash of options for the crawl
    attr_accessor :opts

    DEFAULT_OPTS = {
      # run 4 Tentacle threads to fetch pages
      :threads => 4,
      # disable verbose output
      :verbose => false,
      # don't throw away the page response body after scanning it for links
      :discard_page_bodies => false,
      # identify self as Anemone/VERSION
      :user_agent => "Anemone/#{Anemone::VERSION}",
      # no delay between requests
      :delay => 0,
      # don't obey the robots exclusion protocol
      :obey_robots_txt => false,
      # by default, don't limit the depth of the crawl
      :depth_limit => false,
      # number of times HTTP redirects will be followed
      :redirect_limit => 5,
      # Authentication: "user:password" String, [user, password] Array,
      # or a ready-made "Basic ..." header value
      :authorization => nil,
    }

    #
    # Initialize the crawl with starting *urls* (single URL or Array of URLs)
    # and optional *block*, which receives the new Core instance.
    # *opts* is merged over DEFAULT_OPTS.
    #
    def initialize(urls, opts = {})
      process_options opts

      @urls = [urls].flatten.map { |url| url.is_a?(URI) ? url : URI(url) }
      @urls.each do |url|
        url.path = '/' if url.path.empty?
        # credentials embedded in the URL are used unless the options
        # already produced an Authorization header
        authorization(url) if url.user
      end

      @tentacles = []
      @pages = PageHash.new
      @on_every_page_blocks = []
      @on_pages_like_blocks = Hash.new { |hash, key| hash[key] = [] }
      @skip_link_patterns = []
      @after_crawl_blocks = []
      @focus_crawl_block = nil

      yield self if block_given?
    end

    #
    # Convenience method to start a new crawl: builds a Core, yields it to
    # the optional block for configuration, then runs it.
    #
    def self.crawl(urls, opts = {})
      self.new(urls, opts) do |core|
        yield core if block_given?
        core.run
      end
    end

    #
    # Add a block to be executed on the PageHash after the crawl
    # is finished
    #
    def after_crawl(&block)
      @after_crawl_blocks << block
      self
    end

    #
    # Add one or more Regex patterns for URLs which should not be
    # followed
    #
    def skip_links_like(*patterns)
      @skip_link_patterns.concat [patterns].flatten.compact
      self
    end

    #
    # Add a block to be executed on every Page as they are encountered
    # during the crawl
    #
    def on_every_page(&block)
      @on_every_page_blocks << block
      self
    end

    #
    # Add a block to be executed on Page objects with a URL matching
    # one or more patterns
    #
    def on_pages_like(*patterns, &block)
      patterns.each do |pattern|
        @on_pages_like_blocks[pattern] << block
      end
      self
    end

    #
    # Specify a block which will select which links to follow on each page.
    # The block should return an Array of URI objects.
    #
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    #
    # Perform the crawl. Returns +nil+ without crawling when none of the
    # starting URLs may be visited, otherwise returns +self+ after the
    # after_crawl blocks have run.
    #
    def run
      @urls.delete_if { |url| !visit_link?(url) }
      return if @urls.empty?

      link_queue = Queue.new
      page_queue = Queue.new

      @opts[:threads].times do
        @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
      end

      @urls.each { |url| link_queue.enq(url) }

      loop do
        page = page_queue.deq

        @pages[page.url] = page

        puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]

        # perform the on_every_page blocks for this page
        do_page_blocks(page)

        page.discard_doc! if @opts[:discard_page_bodies]

        links_to_follow(page).each do |link|
          link_queue.enq([link, page])
          # reserve an entry for the queued link; presumably this is what
          # lets PageHash#has_page? reject it later -- confirm in PageHash
          @pages[link] = nil
        end

        # create an entry in the page hash for each alias of this page,
        # i.e. all the pages that redirected to this page
        page.aliases.each do |aka|
          if !@pages.has_key?(aka) || @pages[aka].nil?
            @pages[aka] = page.alias_clone(aka)
          end
          @pages[aka].add_alias!(page.url)
        end

        # if we are done with the crawl, tell the threads to end
        if link_queue.empty? && page_queue.empty?
          # wait until every tentacle thread is blocked on the link queue,
          # so that none of them can still be producing a page
          until link_queue.num_waiting == @tentacles.size
            Thread.pass
          end

          # re-check: a page may have arrived while we were waiting
          if page_queue.empty?
            @tentacles.size.times { link_queue.enq(:END) }
            break
          end
        end

      end

      @tentacles.each { |t| t.join }

      do_after_crawl_blocks

      self
    end

    private

    #
    # Merge *options* over DEFAULT_OPTS into @opts, normalise the
    # authorization option, force a single thread when a delay is
    # requested, and create the Robots helper when robots.txt is obeyed.
    #
    def process_options(options)
      @opts = DEFAULT_OPTS.merge options

      authorization(@opts[:authorization])

      @opts[:threads] = 1 if @opts[:delay] > 0

      @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
    end

    #
    # Generate the Authorization string only if not already set.
    #
    # *auth* may be a "user:password" String, a [user, password] Array, or
    # a URI carrying user info (a missing password is sent as empty).
    # Any other value leaves @opts[:authorization] untouched.
    #
    def authorization(auth = nil)
      current = @opts[:authorization]
      # only Strings are matched: =~ is not defined for every object
      return if current.is_a?(String) && current =~ /^Basic .*/

      credentials =
        if auth.is_a?(String) && auth.include?(':')
          auth
        elsif auth.is_a?(Array)
          "#{auth.first}:#{auth.last}"
        elsif auth.is_a?(URI)
          # interpolation turns a nil password into an empty string
          "#{auth.user}:#{auth.password}"
        end
      return unless credentials

      @opts[:authorization] = "Basic #{encode_base64(credentials)}"
    end

    #
    # Base64-encode *string* on a single line. pack('m') wraps long output
    # with newlines, which must not appear in an HTTP header value.
    #
    def encode_base64(string)
      [string].pack('m').delete("\n")
    end

    #
    # Execute the after_crawl blocks
    #
    def do_after_crawl_blocks
      @after_crawl_blocks.each { |b| b.call(@pages) }
    end

    #
    # Execute the on_every_page blocks for *page*, then the on_pages_like
    # blocks whose pattern matches the page's URL
    #
    def do_page_blocks(page)
      @on_every_page_blocks.each do |blk|
        blk.call(page)
      end

      @on_pages_like_blocks.each do |pattern, blks|
        if page.url.to_s =~ pattern
          blks.each { |blk| blk.call(page) }
        end
      end
    end

    #
    # Return an Array of links to follow from the given page.
    # Based on whether or not the link has already been crawled,
    # and the block given to focus_crawl()
    #
    def links_to_follow(page)
      links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
      links.select { |link| visit_link?(link, page) }
    end

    #
    # Returns +true+ if *link* has not been visited already,
    # and is not excluded by a skip_link pattern...
    # and is not excluded by robots.txt...
    # and is not deeper than the depth limit
    # Returns +false+ otherwise.
    #
    def visit_link?(link, from_page = nil)
      allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true

      if from_page && @opts[:depth_limit]
        too_deep = from_page.depth >= @opts[:depth_limit]
      else
        too_deep = false
      end

      !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
    end

    #
    # Returns +true+ if *link* should not be visited because
    # its URL matches a skip_link pattern.
    #
    def skip_link?(link)
      @skip_link_patterns.any? { |p| link.path =~ p }
    end

  end
end