shingara-anemone 0.2.4

@@ -0,0 +1,27 @@
+ == 0.2.3 / 2009-11-01
+
+ * Minor enhancements
+
+   * Options are now applied per-crawl, rather than module-wide.
+
+ * Bug fixes
+
+   * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+ == 0.2.2 / 2009-10-26
+
+ * Minor enhancements
+
+   * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+ == 0.2.1 / 2009-10-24
+
+ * Major enhancements
+
+   * Added HTTPS support.
+   * CLI program 'anemone', which is a frontend for several tasks.
+
+ * Minor enhancements
+
+   * HTTP request response time recorded in Page.
+   * Use of persistent HTTP connections.
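The 0.2.3 change above means crawl options are passed to each Anemone.crawl call instead of being set module-wide. A minimal sketch, assuming a placeholder start URL; the option keys are taken from DEFAULT_OPTS in lib/anemone/core.rb:

  require 'anemone'

  # every option lives in this crawl's own options hash; nothing is stored on the module
  Anemone.crawl('http://example.com/', :verbose => true, :depth_limit => 2) do |anemone|
    anemone.on_every_page { |page| puts page.url }
  end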
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
@@ -0,0 +1,24 @@
+ = Anemone
+
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+ == Requirements
+ * nokogiri
+ * robots
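For orientation, a short usage sketch built only from methods that appear in this gem's sources (Anemone.crawl, skip_links_like, focus_crawl, on_every_page); the URL and patterns are placeholders:

  require 'anemone'

  Anemone.crawl('http://example.com/') do |anemone|
    # don't follow links whose paths match these patterns
    anemone.skip_links_like %r{/login}, %r{/logout}

    # only follow the links this block returns (it must return an Array of URIs)
    anemone.focus_crawl { |page| page.links.reject { |uri| uri.path =~ /\.pdf$/ } }

    # runs for every page fetched during the crawl
    anemone.on_every_page { |page| puts page.url }
  end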
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/core'
@@ -0,0 +1,24 @@
+ module Anemone
+   module CLI
+     COMMANDS = %w[count cron pagedepth serialize url-list]
+
+     def self.run
+       command = ARGV.shift
+
+       if COMMANDS.include? command
+         load "anemone/cli/#{command.tr('-', '_')}.rb"
+       else
+         puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+   anemone <command> [arguments]
+
+ Commands:
+   #{COMMANDS.join(', ')}
+ INFO
+       end
+     end
+   end
+ end
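The dispatcher above simply loads anemone/cli/<command>.rb for any name listed in COMMANDS, so adding a task means adding its name to COMMANDS and dropping a matching script into that directory. A hedged sketch of a hypothetical `broken` command (not part of the gem) that prints 404s, reusing only methods shown elsewhere in these sources:

  # lib/anemone/cli/broken.rb -- hypothetical; 'broken' would also need to be added to COMMANDS
  require 'anemone'

  root = URI(ARGV[0])

  Anemone.crawl(root) do |anemone|
    anemone.on_every_page do |page|
      puts page.url if page.not_found?
    end
  end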
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   url = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone count <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs the total number
+   of unique pages on the site.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
@@ -0,0 +1,90 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone cron [options] <url>
+
+ Synopsis:
+   Combination of the `count`, `pagedepth` and `url-list` commands.
+   Performs pagedepth, url list, and count functionality.
+   Outputs results to STDOUT and saves the link list to a file (urls.txt).
+   Meant to be run daily as a cron job.
+
+ Options:
+   -r, --relative           Output relative URLs (rather than absolute)
+   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+
+ end
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone pagedepth <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs a count of
+   the number of pages at each depth of the crawl.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone serialize [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and saves the resulting
+   PageHash object to a file using Marshal serialization.
+
+ Options:
+   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ INFO
+   exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
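Reading the dump back later is plain Marshal loading; a sketch, assuming the file written by the script above is passed as the first argument (the default filename is crawl.<timestamp>):

  require 'anemone'  # defines the Page/PageHash classes the dump refers to

  pages = File.open(ARGV[0], 'rb') { |f| Marshal.load(f) }
  puts "#{pages.size} pages in #{ARGV[0]}"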
@@ -0,0 +1,41 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+
+ begin
+   # make sure that the last option is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone url-list [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL, and outputs the URL of each page
+   in the domain as it is encountered.
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, :discard_page_bodies => true) do |anemone|
+
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+
+ end
@@ -0,0 +1,256 @@
+ require 'thread'
+ require 'robots'
+ require 'anemone/tentacle'
+ require 'anemone/page'
+ require 'anemone/page_hash'
+
+ module Anemone
+
+   VERSION = '0.2.3'
+
+   #
+   # Convenience method to start a crawl
+   #
+   def Anemone.crawl(urls, options = {}, &block)
+     Core.crawl(urls, options, &block)
+   end
+
+   class Core
+     # PageHash storing all Page objects encountered during the crawl
+     attr_reader :pages
+
+     # Hash of options for the crawl
+     attr_accessor :opts
+
+     DEFAULT_OPTS = {
+       # run 4 Tentacle threads to fetch pages
+       :threads => 4,
+       # disable verbose output
+       :verbose => false,
+       # don't throw away the page response body after scanning it for links
+       :discard_page_bodies => false,
+       # identify self as Anemone/VERSION
+       :user_agent => "Anemone/#{Anemone::VERSION}",
+       # no delay between requests
+       :delay => 0,
+       # don't obey the robots exclusion protocol
+       :obey_robots_txt => false,
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5
+     }
+
+     #
+     # Initialize the crawl with starting *urls* (single URL or Array of URLs)
+     # and optional *block*
+     #
+     def initialize(urls, opts = {})
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+
+       @tentacles = []
+       @pages = PageHash.new
+       @on_every_page_blocks = []
+       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
+       @skip_link_patterns = []
+       @after_crawl_blocks = []
+
+       process_options opts
+
+       yield self if block_given?
+     end
+
+     #
+     # Convenience method to start a new crawl
+     #
+     def self.crawl(urls, opts = {})
+       self.new(urls, opts) do |core|
+         yield core if block_given?
+         core.run
+       end
+     end
+
+     #
+     # Add a block to be executed on the PageHash after the crawl
+     # is finished
+     #
+     def after_crawl(&block)
+       @after_crawl_blocks << block
+       self
+     end
+
+     #
+     # Add one or more Regexp patterns for URLs which should not be
+     # followed
+     #
+     def skip_links_like(*patterns)
+       @skip_link_patterns.concat [patterns].flatten.compact
+       self
+     end
+
+     #
+     # Add a block to be executed on every Page as it is encountered
+     # during the crawl
+     #
+     def on_every_page(&block)
+       @on_every_page_blocks << block
+       self
+     end
+
+     #
+     # Add a block to be executed on Page objects with a URL matching
+     # one or more patterns
+     #
+     def on_pages_like(*patterns, &block)
+       if patterns
+         patterns.each do |pattern|
+           @on_pages_like_blocks[pattern] << block
+         end
+       end
+       self
+     end
+
+     #
+     # Specify a block which will select which links to follow on each page.
+     # The block should return an Array of URI objects.
+     #
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     #
+     # Perform the crawl
+     #
+     def run
+       @urls.delete_if { |url| !visit_link?(url) }
+       return if @urls.empty?
+
+       link_queue = Queue.new
+       page_queue = Queue.new
+
+       @opts[:threads].times do
+         @tentacles << Thread.new { Tentacle.new(link_queue, page_queue, @opts).run }
+       end
+
+       @urls.each{ |url| link_queue.enq(url) }
+
+       loop do
+         page = page_queue.deq
+
+         @pages[page.url] = page
+
+         puts "#{page.url} Queue: #{link_queue.size}" if @opts[:verbose]
+
+         # perform the on_every_page and on_pages_like blocks for this page
+         do_page_blocks(page)
+
+         page.discard_doc! if @opts[:discard_page_bodies]
+
+         links_to_follow(page).each do |link|
+           link_queue.enq([link, page])
+           @pages[link] = nil
+         end
+
+         # create an entry in the page hash for each alias of this page,
+         # i.e. all the pages that redirected to this page
+         page.aliases.each do |aka|
+           if !@pages.has_key?(aka) or @pages[aka].nil?
+             @pages[aka] = page.alias_clone(aka)
+           end
+           @pages[aka].add_alias!(page.url)
+         end
+
+         # if we are done with the crawl, tell the threads to end
+         if link_queue.empty? and page_queue.empty?
+           until link_queue.num_waiting == @tentacles.size
+             Thread.pass
+           end
+
+           if page_queue.empty?
+             @tentacles.size.times { link_queue.enq(:END) }
+             break
+           end
+         end
+
+       end
+
+       @tentacles.each { |t| t.join }
+
+       do_after_crawl_blocks()
+
+       self
+     end
+
+     private
+
+     def process_options(options)
+       @opts = DEFAULT_OPTS.merge options
+
+       @opts[:threads] = 1 if @opts[:delay] > 0
+
+       @robots = Robots.new(@opts[:user_agent]) if @opts[:obey_robots_txt]
+     end
+
+     #
+     # Execute the after_crawl blocks
+     #
+     def do_after_crawl_blocks
+       @after_crawl_blocks.each {|b| b.call(@pages)}
+     end
+
+     #
+     # Execute the on_every_page and on_pages_like blocks for *page*
+     #
+     def do_page_blocks(page)
+       @on_every_page_blocks.each do |blk|
+         blk.call(page)
+       end
+
+       @on_pages_like_blocks.each do |pattern, blks|
+         if page.url.to_s =~ pattern
+           blks.each { |blk| blk.call(page) }
+         end
+       end
+     end
+
+     #
+     # Return an Array of links to follow from the given page,
+     # based on whether or not the link has already been crawled
+     # and the block given to focus_crawl()
+     #
+     def links_to_follow(page)
+       links = @focus_crawl_block ? @focus_crawl_block.call(page) : page.links
+       links.select { |link| visit_link?(link, page) }
+     end
+
+     #
+     # Returns +true+ if *link* has not been visited already,
+     # and is not excluded by a skip_link pattern...
+     # and is not excluded by robots.txt...
+     # and is not deeper than the depth limit
+     # Returns +false+ otherwise.
+     #
+     def visit_link?(link, from_page = nil)
+       allowed = @opts[:obey_robots_txt] ? @robots.allowed?(link) : true
+
+       if from_page && @opts[:depth_limit]
+         too_deep = from_page.depth >= @opts[:depth_limit]
+       else
+         too_deep = false
+       end
+
+       !@pages.has_page?(link) && !skip_link?(link) && allowed && !too_deep
+     end
+
+     #
+     # Returns +true+ if *link* should not be visited because
+     # its URL matches a skip_link pattern.
+     #
+     def skip_link?(link)
+       @skip_link_patterns.any? { |p| link.path =~ p }
+     end
+
+   end
+ end
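Tying the Core API together, a hedged end-to-end sketch; the URL and pattern are placeholders, and every method and option key used here appears in the class above:

  require 'anemone'

  Anemone.crawl('http://example.com/', :obey_robots_txt => true, :delay => 1) do |anemone|
    # run a block only for pages whose URL matches a pattern
    anemone.on_pages_like(%r{/articles/}) do |page|
      puts "article: #{page.url}"
    end

    # runs on the full PageHash once the crawl finishes
    anemone.after_crawl do |pages|
      puts "crawled #{pages.size} pages"
    end
  end

Note that setting :delay greater than zero forces the crawl down to a single Tentacle thread, per process_options above.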