sutch-anemone 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     YWExNTI1ZGRiMjA0NTZhNzk3Y2YyMDYzNDcxMDQwNTk4NDJkMGI2NQ==
+   data.tar.gz: !binary |-
+     YWE2MDg1OTQzNTEyOTZiZTJjMTUzNWZiMzgxNTBjMDJmMzBkYjYzZQ==
+ !binary "U0hBNTEy":
+   metadata.gz: !binary |-
+     YWY1MjliMGJjMzRhZGQ1OTVmYWVlYzI3YmU3YTI1ZGVjMDk5NjAwZGEwZmJh
+     MzA0Y2Q4ZWMwODM3MzgyMTU5ZTk2NTE3ZDFhNDc4MDBmOWZjNWViOWQ1Nzlk
+     ZDU4OTA4Y2VkNWI1MDA1ZjUyNWQ2YzJkMDA3YmJiNGQwZTczMGM=
+   data.tar.gz: !binary |-
+     MjYzODE4ZTIwYjM5OTljMDY2NTdkNzRiY2FlOWJkOGMxNmZjNzE4M2JkN2Fk
+     YjQ4MjE5NjM2NjllZmJhODc4M2UzYjYwYTZhY2ZhZWRiYzgwZjk4MDZmYzEy
+     N2FkMjZiZjdiMmI2NWI2N2I3MDUyNWM2YmI0YTIyODAzZmQ1Yzg=
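
The !binary entries above are Base64-encoded strings: the two top-level keys decode to the digest names SHA1 and SHA512, and each value decodes to a hex digest of the corresponding archive inside the .gem file. A minimal verification sketch (not part of the gem; the path "metadata.gz" is an assumption about where the gem's archives were unpacked):

    require 'base64'
    require 'digest'

    # Decode the Base64 strings quoted in checksums.yaml above.
    Base64.decode64("U0hBMQ==")    # => "SHA1"
    Base64.decode64("U0hBNTEy")    # => "SHA512"

    # The SHA1 entry for metadata.gz decodes to a 40-character hex digest;
    # compare it with a freshly computed digest of the unpacked archive.
    expected = Base64.decode64("YWExNTI1ZGRiMjA0NTZhNzk3Y2YyMDYzNDcxMDQwNTk4NDJkMGI2NQ==")
    actual   = Digest::SHA1.file("metadata.gz").hexdigest
    puts(expected == actual ? "metadata.gz OK" : "metadata.gz does not match")
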
data/CHANGELOG.rdoc ADDED
@@ -0,0 +1,136 @@
+ * Enhancements
+
+ == sutch's branch
+
+ * Added Anemone::Resource to provide for spidering of resources other than HTML pages
+
+ == 0.7.2 / 2012-05-30
+
+ * Bug fixes
+
+ * Fix bug causing anchor links to have '#' converted to '%23'
+
+ == 0.7.1 / 2012-01-20
+
+ * Minor enhancements
+
+ * Switch from robots gem (which people reported problems with) to new robotex gem
+
+ * Bug fixes
+
+ * Fix incorrect default file extension for KyotoCabinet
+
+ == 0.7.0 / 2012-01-19
+
+ * Major enhancements
+
+ * Added support for SQLite3 and Kyoto Cabinet storage
+
+ * Minor enhancements
+
+ * Added Page#base to use base HTML element
+ * Use bundler for development dependencies
+
+ * Bug fixes
+
+ * Encode characters in URLs
+ * Fix specs to run under rake
+ * Fix handling of redirect_to in storage adapters
+
+ == 0.6.1 / 2011-02-24
+
+ * Bug fixes
+
+ * Fix a bug preventing SSL connections from working
+
+ == 0.6.0 / 2011-02-17
+
+ * Major enhancements
+
+ * Added support for HTTP Basic Auth with URLs containing a username and password
+ * Added support for anonymous HTTP proxies
+
+ * Minor enhancements
+
+ * Added read_timeout option to set the HTTP request timeout in seconds
+
+ * Bug fixes
+
+ * Don't fatal error if a page request times out
+ * Fix double encoding of links containing %20
+
+ == 0.5.0 / 2010-09-01
+
+ * Major enhancements
+
+ * Added page storage engines for MongoDB and Redis
+
+ * Minor enhancements
+
+ * Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
+ * Added skip_query_strings option to skip links with query strings (Joost Baaij)
+
+ * Bug fixes
+
+ * Only consider status code 300..307 a redirect (Marc Seeger)
+ * Canonicalize redirect links (Marc Seeger)
+
+ == 0.4.0 / 2010-04-08
+
+ * Major enhancements
+
+ * Cookies can be accepted and sent with each HTTP request.
+
+ == 0.3.2 / 2010-02-04
+
+ * Bug fixes
+
+ * Fixed issue that allowed following redirects off the original domain
+
+ == 0.3.1 / 2010-01-22
+
+ * Minor enhancements
+
+ * Added an attr_accessor to Page for the HTTP response body
+
+ * Bug fixes
+
+ * Fixed incorrect method calls in CLI scripts
+
+ == 0.3.0 / 2009-12-15
+
+ * Major enhancements
+
+ * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+ * Minor enhancements
+
+ * Options can be set via methods on the Core object in the crawl block
+
+ == 0.2.3 / 2009-11-01
+
+ * Minor enhancements
+
+ * Options are now applied per-crawl, rather than module-wide.
+
+ * Bug fixes
+
+ * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+ == 0.2.2 / 2009-10-26
+
+ * Minor enhancements
+
+ * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+ == 0.2.1 / 2009-10-24
+
+ * Major enhancements
+
+ * Added HTTPS support.
+ * CLI program 'anemone', which is a frontend for several tasks.
+
+ * Minor enhancements
+
+ * HTTP request response time recorded in Page.
+ * Use of persistent HTTP connections.
data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,38 @@
+ = Anemone
+
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ See http://anemone.rubyforge.org for more information.
+
+ == Features
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+ * Choose the links to follow on each page with focus_crawl()
+ * HTTPS support
+ * Records response time for each page
+ * CLI program can list all pages in a domain, calculate page depths, and more
+ * Obey robots.txt
+ * In-memory or persistent storage of pages during crawl, using TokyoCabinet, SQLite3, MongoDB, or Redis
+
+ == Examples
+ See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+ == Requirements
+ * nokogiri
+ * robots
+
+ == Development
+ To test and develop this gem, additional requirements are:
+ * rspec
+ * fakeweb
+ * tokyocabinet
+ * kyotocabinet-ruby
+ * mongo
+ * redis
+ * sqlite3
+
+ You will need to have KyotoCabinet, {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
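
For orientation, here is a minimal crawl sketch in the style of the CLI scripts shipped under lib/anemone/cli (the URL and skip pattern are placeholders; the options and callbacks are the ones those scripts exercise):

    require 'anemone'

    # Placeholder URL and pattern; crawl a site, skip some links,
    # then report each visited page's URL and depth after the crawl.
    Anemone.crawl("http://www.example.com", :discard_page_bodies => true) do |anemone|
      anemone.skip_links_like %r{/private/}
      anemone.after_crawl do |pages|
        pages.each_value { |page| puts "#{page.url} (depth #{page.depth})" }
      end
    end
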
data/Rakefile ADDED
@@ -0,0 +1,23 @@
+ require 'rspec/core/rake_task'
+ require 'rdoc/task'
+
+ desc "Run all specs"
+ RSpec::Core::RakeTask.new(:rspec) do |spec|
+   spec.pattern = 'spec/**/*_spec.rb'
+ end
+
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
+   spec.pattern = 'spec/**/*_spec.rb'
+   spec.rcov = true
+ end
+
+ task :default => :rspec
+
+ RDoc::Task.new do |rdoc|
+   version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "anemone #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.7.2
data/bin/anemone ADDED
@@ -0,0 +1,4 @@
+ #!/usr/bin/env ruby
+ require 'anemone/cli'
+
+ Anemone::CLI::run
data/lib/anemone.rb ADDED
@@ -0,0 +1,2 @@
+ require 'rubygems'
+ require 'anemone/core'
@@ -0,0 +1,24 @@
+ module Anemone
+   module CLI
+     COMMANDS = %w[count cron pagedepth serialize url-list]
+
+     def self.run
+       command = ARGV.shift
+
+       if COMMANDS.include? command
+         load "anemone/cli/#{command.tr('-', '_')}.rb"
+       else
+         puts <<-INFO
+ Anemone is a web spider framework that can collect
+ useful information about pages it visits.
+
+ Usage:
+   anemone <command> [arguments]
+
+ Commands:
+   #{COMMANDS.join(', ')}
+ INFO
+       end
+     end
+   end
+ end
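
The dispatcher above maps each hyphenated command name to an underscored script under anemone/cli/. A small illustration (not part of the gem):

    command = "url-list"
    "anemone/cli/#{command.tr('-', '_')}.rb"   # => "anemone/cli/url_list.rb"
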
@@ -0,0 +1,22 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   url = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone count <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs the total number
+   of unique pages on the site.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(url) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq!.size
+   end
+ end
@@ -0,0 +1,90 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ begin
+   # make sure that the last argument is a URL we can crawl
+   root = URI(ARGV.last)
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone cron [options] <url>
+
+ Synopsis:
+   Combination of `count`, `pagedepth` and `url-list` commands.
+   Performs pagedepth, url list, and count functionality.
+   Outputs results to STDOUT and link list to file (urls.txt).
+   Meant to be run daily as a cron job.
+
+ Options:
+   -r, --relative Output relative URLs (rather than absolute)
+   -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ INFO
+   exit(0)
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+
+ end
@@ -0,0 +1,32 @@
+ require 'anemone'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone pagedepth <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and outputs a count of
+   the number of pages at each depth of the crawl.
+ INFO
+   exit(0)
+ end
+
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq!
+
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
@@ -0,0 +1,35 @@
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ begin
+   # make sure that the first option is a URL we can crawl
+   root = URI(ARGV[0])
+ rescue
+   puts <<-INFO
+ Usage:
+   anemone serialize [options] <url>
+
+ Synopsis:
+   Crawls a site starting at the given URL and saves the resulting
+   PageStore object to a file using Marshal serialization.
+
+ Options:
+   -o, --output filename Filename to save PageStore to. Defaults to crawl.{Time.now}
+ INFO
+   exit(0)
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
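
A crawl saved this way can be read back with Marshal. A minimal sketch (not part of the gem; the file name is a placeholder for whatever was passed to --output, or the timestamped default):

    # Reload a PageStore written by "anemone serialize".
    pages = open("crawl.1338400000", "rb") { |f| Marshal.load(f) }
    puts "Loaded #{pages.size} pages"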