shingara-anemone 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
+ == 0.2.3 / 2009-11-01
+
+ * Minor enhancements
+
+   * Options are now applied per-crawl, rather than module-wide.
+
+ * Bug fixes
+
+   * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+ == 0.2.2 / 2009-10-26
+
+ * Minor enhancements
+
+   * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+ == 0.2.1 / 2009-10-24
+
+ * Major enhancements
+
+   * Added HTTPS support.
+   * CLI program 'anemone', which is a frontend for several tasks.
+
+ * Minor enhancements
+
+   * HTTP request response time recorded in Page.
+   * Use of persistent HTTP connections.
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
@@ -0,0 +1,24 @@
+ = Anemone
+
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+ == Requirements
+ * nokogiri
+ * robots
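
As a quick orientation to the API described in the README above, a minimal crawl might look like the sketch below. It uses only methods that appear later in this diff (Anemone.crawl, skip_links_like, on_every_page, after_crawl); the URL and the skip pattern are placeholders, not part of the package.

  require 'anemone'

  Anemone.crawl("http://www.example.com/") do |anemone|
    # don't follow links whose paths match this pattern (placeholder regex)
    anemone.skip_links_like %r{^/private/}

    # print each page's URL as it is fetched
    anemone.on_every_page do |page|
      puts page.url
    end

    # once the crawl finishes, report the number of unique pages found
    anemone.after_crawl do |pages|
      puts "#{pages.uniq.size} unique pages"
    end
  end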
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/core'
@@ -0,0 +1,24 @@
+ module Anemone
+   module CLI
+     COMMANDS = %w[count cron pagedepth serialize url-list]
+
+     def self.run
+       command = ARGV.shift
+
+       if COMMANDS.include? command
+         load "anemone/cli/#{command.tr('-', '_')}.rb"
+       else
+         puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+   anemone <command> [arguments]
+
+ Commands:
+   #{COMMANDS.join(', ')}
+ INFO
+       end
+     end
+   end
+ end
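
For reference, the dispatcher above can also be driven from Ruby rather than from a shell. The sketch below mimics `anemone count <url>` by filling ARGV before calling Anemone::CLI::run; the URL is a placeholder, and running this starts a real crawl.

  require 'anemone/cli'

  # Equivalent to running `anemone count http://www.example.com/` from a shell.
  # CLI::run shifts the command name off ARGV and loads anemone/cli/count.rb,
  # which reads the remaining argument as the start URL.
  ARGV.replace(%w[count http://www.example.com/])
  Anemone::CLI::run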
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   url = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone count <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs the total number
+   of unique pages on the site.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
@@ -0,0 +1,90 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone cron [options] <url>
+
+ Synopsis:
+   Combination of `count`, `pagedepth` and `url-list` commands.
+   Performs pagedepth, url list, and count functionality.
+   Outputs results to STDOUT and link list to file (urls.txt).
+   Meant to be run daily as a cron job.
+
+ Options:
+   -r, --relative         Output relative URLs (rather than absolute)
+   -o, --output filename  Filename to save URL list to. Defaults to urls.txt.
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+
+ end
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone pagedepth <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs a count of
+   the number of pages at each depth of the crawl.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone serialize [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and saves the resulting
+   PageHash object to a file using Marshal serialization.
+
+ Options:
+   -o, --output filename  Filename to save PageHash to. Defaults to crawl.{Time.now}
+ INFO
+   exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
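
To make use of a dump produced by the script above, the PageHash can be read back with Marshal.load. A minimal sketch follows; the file name is only an example of the default `crawl.<timestamp>` pattern.

  # Reload a PageHash written by `anemone serialize` (file name is illustrative).
  pages = File.open('crawl.1257033600', 'rb') { |f| Marshal.load(f) }
  puts pages.size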
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as they are encountered.
+
+ Options:
+   -r, --relative         Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
@@ -0,0 +1,256 @@
+ require 'thread'
+ require 'robots'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/page_hash'
+
+ module Anemone
+
+   VERSION = '0.2.3';
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     # Hash of options for the crawl
+     attr_accessor :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5
+     }
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       process_options opts
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regex patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as they are encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+
+         # perform the on_every_page blocks for this page
+         do_page_blocks(page)
+
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links_to_follow(page).each do |link|
+           link_queue.enq([link, page])
+           @pages[link] = nil
+         end
+
+         # create an entry in the page hash for each alias of this page,
+         # i.e. all the pages that redirected to this page
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { link_queue.enq(:END) }
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       do_after_crawl_blocks()
+
+       self
+     end
+
+     private
+
+     def process_options(options)
+       @opts = DEFAULT_OPTS.merge options
+
+       @opts[:threads] = 1 if @opts[:delay] > 0
+
+       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each {|b| b.call(@pages)}
+     end
+
+     #
+     # Execute the on_every_page blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blks|
+         if page.url.to_s =~ pattern
+           blks.each { |blk| blk.call(page) }
+         end
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page.
+     # Based on whether or not the link has already been crawled,
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+
+       if from_page && @opts[:depth_limit]
+         too_deep = from_page.depth >= @opts[:depth_limit]
+       else
+         too_deep = false
+       end
+
+       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |p| link.path =~ p }
+     end
+
+   end
+ end
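
Putting the per-crawl options from DEFAULT_OPTS together with focus_crawl, a more selective crawl might look like the sketch below. The URL, option values, and the /docs/ filter are illustrative; note that process_options forces :threads to 1 whenever :delay is greater than zero.

  require 'anemone'

  Anemone.crawl("http://www.example.com/",
                :threads => 2,             # number of Tentacle threads
                :verbose => true,          # print each URL and queue size as pages arrive
                :obey_robots_txt => true,  # check links against robots.txt via the robots gem
                :depth_limit => 3) do |anemone|
    # follow only links that stay under /docs/ (illustrative filter)
    anemone.focus_crawl do |page|
      page.links.select { |uri| uri.path =~ %r{^/docs/} }
    end
  end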