sutch-anemone 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/CHANGELOG.rdoc +136 -0
- data/LICENSE.txt +19 -0
- data/README.rdoc +38 -0
- data/Rakefile +23 -0
- data/VERSION +1 -0
- data/bin/anemone +4 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/cli.rb +24 -0
- data/lib/anemone/cli/count.rb +22 -0
- data/lib/anemone/cli/cron.rb +90 -0
- data/lib/anemone/cli/pagedepth.rb +32 -0
- data/lib/anemone/cli/serialize.rb +35 -0
- data/lib/anemone/cli/url_list.rb +41 -0
- data/lib/anemone/cookie_store.rb +35 -0
- data/lib/anemone/core.rb +339 -0
- data/lib/anemone/exceptions.rb +5 -0
- data/lib/anemone/http.rb +187 -0
- data/lib/anemone/page.rb +217 -0
- data/lib/anemone/page_store.rb +161 -0
- data/lib/anemone/resource.rb +42 -0
- data/lib/anemone/storage.rb +44 -0
- data/lib/anemone/storage/base.rb +75 -0
- data/lib/anemone/storage/exceptions.rb +15 -0
- data/lib/anemone/storage/kyoto_cabinet.rb +72 -0
- data/lib/anemone/storage/mongodb.rb +89 -0
- data/lib/anemone/storage/pstore.rb +50 -0
- data/lib/anemone/storage/redis.rb +90 -0
- data/lib/anemone/storage/sqlite3.rb +90 -0
- data/lib/anemone/storage/tokyo_cabinet.rb +60 -0
- data/lib/anemone/tentacle.rb +39 -0
- data/spec/anemone_spec.rb +16 -0
- data/spec/cookie_store_spec.rb +28 -0
- data/spec/core_spec.rb +344 -0
- data/spec/fakeweb_helper.rb +77 -0
- data/spec/http_spec.rb +19 -0
- data/spec/page_spec.rb +186 -0
- data/spec/page_store_spec.rb +171 -0
- data/spec/resource_spec.rb +91 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/storage_spec.rb +252 -0
- metadata +281 -0
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    YWExNTI1ZGRiMjA0NTZhNzk3Y2YyMDYzNDcxMDQwNTk4NDJkMGI2NQ==
+  data.tar.gz: !binary |-
+    YWE2MDg1OTQzNTEyOTZiZTJjMTUzNWZiMzgxNTBjMDJmMzBkYjYzZQ==
+!binary "U0hBNTEy":
+  metadata.gz: !binary |-
+    YWY1MjliMGJjMzRhZGQ1OTVmYWVlYzI3YmU3YTI1ZGVjMDk5NjAwZGEwZmJh
+    MzA0Y2Q4ZWMwODM3MzgyMTU5ZTk2NTE3ZDFhNDc4MDBmOWZjNWViOWQ1Nzlk
+    ZDU4OTA4Y2VkNWI1MDA1ZjUyNWQ2YzJkMDA3YmJiNGQwZTczMGM=
+  data.tar.gz: !binary |-
+    MjYzODE4ZTIwYjM5OTljMDY2NTdkNzRiY2FlOWJkOGMxNmZjNzE4M2JkN2Fk
+    YjQ4MjE5NjM2NjllZmJhODc4M2UzYjYwYTZhY2ZhZWRiYzgwZjk4MDZmYzEy
+    N2FkMjZiZjdiMmI2NWI2N2I3MDUyNWM2YmI0YTIyODAzZmQ1Yzg=
data/CHANGELOG.rdoc
ADDED
@@ -0,0 +1,136 @@
+* Enhancements
+
+== sutch's branch
+
+  * Added Anemone::Resource to provide for spidering of resources other than HTML pages
+
+== 0.7.2 / 2012-05-30
+
+* Bug fixes
+
+  * Fix bug causing anchor links to have '#' converted to '%23'
+
+== 0.7.1 / 2012-01-20
+
+* Minor enhancements
+
+  * Switch from robots gem (which people reported problems with) to new robotex gem
+
+* Bug fixes
+
+  * Fix incorrect default file extension for KyotoCabinet
+
+== 0.7.0 / 2012-01-19
+
+* Major enhancements
+
+  * Added support for SQLite3 and Kyoto Cabinet storage
+
+* Minor enhancements
+
+  * Added Page#base to use base HTML element
+  * Use bundler for development dependencies
+
+* Bug fixes
+
+  * Encode characters in URLs
+  * Fix specs to run under rake
+  * Fix handling of redirect_to in storage adapters
+
+== 0.6.1 / 2011-02-24
+
+* Bug fixes
+
+  * Fix a bug preventing SSL connections from working
+
+== 0.6.0 / 2011-02-17
+
+* Major enhancements
+
+  * Added support for HTTP Basic Auth with URLs containing a username and password
+  * Added support for anonymous HTTP proxies
+
+* Minor enhancements
+
+  * Added read_timeout option to set the HTTP request timeout in seconds
+
+* Bug fixes
+
+  * Don't fatal error if a page request times out
+  * Fix double encoding of links containing %20
+
+== 0.5.0 / 2010-09-01
+
+* Major enhancements
+
+  * Added page storage engines for MongoDB and Redis
+
+* Minor enhancements
+
+  * Use xpath for link parsing instead of CSS (faster) (Marc Seeger)
+  * Added skip_query_strings option to skip links with query strings (Joost Baaij)
+
+* Bug fixes
+
+  * Only consider status code 300..307 a redirect (Marc Seeger)
+  * Canonicalize redirect links (Marc Seeger)
+
+== 0.4.0 / 2010-04-08
+
+* Major enhancements
+
+  * Cookies can be accepted and sent with each HTTP request.
+
+== 0.3.2 / 2010-02-04
+
+* Bug fixes
+
+  * Fixed issue that allowed following redirects off the original domain
+
+== 0.3.1 / 2010-01-22
+
+* Minor enhancements
+
+  * Added an attr_accessor to Page for the HTTP response body
+
+* Bug fixes
+
+  * Fixed incorrect method calls in CLI scripts
+
+== 0.3.0 / 2009-12-15
+
+* Major enhancements
+
+  * Option for persistent storage of pages during crawl with TokyoCabinet or PStore
+
+* Minor enhancements
+
+  * Options can be set via methods on the Core object in the crawl block
+
+== 0.2.3 / 2009-11-01
+
+* Minor enhancements
+
+  * Options are now applied per-crawl, rather than module-wide.
+
+* Bug fixes
+
+  * Fixed a bug which caused deadlock if an exception occurred when crawling the last page in the queue.
+
+== 0.2.2 / 2009-10-26
+
+* Minor enhancements
+
+  * When the :verbose option is set to true, exception backtraces are printed to aid debugging.
+
+== 0.2.1 / 2009-10-24
+
+* Major enhancements
+
+  * Added HTTPS support.
+  * CLI program 'anemone', which is a frontend for several tasks.
+
+* Minor enhancements
+
+  * HTTP request response time recorded in Page.
+  * Use of persistent HTTP connections.
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2009 Vertive, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,38 @@
+= Anemone
+
+Anemone is a web spider framework that can spider a domain and collect useful
+information about the pages it visits. It is versatile, allowing you to
+write your own specialized spider tasks quickly and easily.
+
+See http://anemone.rubyforge.org for more information.
+
+== Features
+* Multi-threaded design for high performance
+* Tracks 301 HTTP redirects
+* Built-in BFS algorithm for determining page depth
+* Allows exclusion of URLs based on regular expressions
+* Choose the links to follow on each page with focus_crawl()
+* HTTPS support
+* Records response time for each page
+* CLI program can list all pages in a domain, calculate page depths, and more
+* Obey robots.txt
+* In-memory or persistent storage of pages during crawl, using TokyoCabinet, SQLite3, MongoDB, or Redis
+
+== Examples
+See the scripts under the <tt>lib/anemone/cli</tt> directory for examples of several useful Anemone tasks.
+
+== Requirements
+* nokogiri
+* robots
+
+== Development
+To test and develop this gem, additional requirements are:
+* rspec
+* fakeweb
+* tokyocabinet
+* kyotocabinet-ruby
+* mongo
+* redis
+* sqlite3
+
+You will need to have KyotoCabinet, {Tokyo Cabinet}[http://fallabs.com/tokyocabinet/], {MongoDB}[http://www.mongodb.org/], and {Redis}[http://code.google.com/p/redis/] installed on your system and running.
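
For orientation, here is a minimal usage sketch of the block API described above. skip_links_like and after_crawl appear in the CLI scripts included in this diff; on_every_page and focus_crawl belong to the same Anemone crawl interface. The URL and the link filter are placeholders, not part of the gem.

  require 'anemone'

  # Crawl a site (placeholder URL), skip any /admin/ links, print each page
  # as it is fetched, and only follow links without a query string.
  Anemone.crawl("http://www.example.com/") do |anemone|
    anemone.skip_links_like %r{/admin/}

    anemone.on_every_page do |page|
      puts "#{page.code} #{page.url}"
    end

    anemone.focus_crawl do |page|
      # the block returns the subset of page.links that should be followed
      page.links.reject { |link| link.query }
    end
  end
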
data/Rakefile
ADDED
@@ -0,0 +1,23 @@
+require 'rspec/core/rake_task'
+require 'rdoc/task'
+
+desc "Run all specs"
+RSpec::Core::RakeTask.new(:rspec) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+end
+
+RSpec::Core::RakeTask.new(:rcov) do |spec|
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+
+task :default => :rspec
+
+RDoc::Task.new do |rdoc|
+  version = File.exist?('VERSION') ? File.read('VERSION') : ""
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "anemone #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.7.2
data/bin/anemone
ADDED
data/lib/anemone.rb
ADDED
data/lib/anemone/cli.rb
ADDED
@@ -0,0 +1,24 @@
+module Anemone
+  module CLI
+    COMMANDS = %w[count cron pagedepth serialize url-list]
+
+    def self.run
+      command = ARGV.shift
+
+      if COMMANDS.include? command
+        load "anemone/cli/#{command.tr('-', '_')}.rb"
+      else
+        puts <<-INFO
+Anemone is a web spider framework that can collect
+useful information about pages it visits.
+
+Usage:
+  anemone <command> [arguments]
+
+Commands:
+  #{COMMANDS.join(', ')}
+INFO
+      end
+    end
+  end
+end
data/lib/anemone/cli/count.rb
ADDED
@@ -0,0 +1,22 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  url = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone count <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs the total number
+  of unique pages on the site.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(url) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq!.size
+  end
+end
data/lib/anemone/cli/cron.rb
ADDED
@@ -0,0 +1,90 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+begin
+  # make sure that the last argument is a URL we can crawl
+  root = URI(ARGV.last)
+rescue
+  puts <<-INFO
+Usage:
+  anemone cron [options] <url>
+
+Synopsis:
+  Combination of `count`, `pagedepth` and `url-list` commands.
+  Performs pagedepth, url list, and count functionality.
+  Outputs results to STDOUT and link list to file (urls.txt).
+  Meant to be run daily as a cron job.
+
+Options:
+  -r, --relative           Output relative URLs (rather than absolute)
+  -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+INFO
+  exit(0)
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+  anemone.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    unless not_found.empty?
+      puts "\n404's:"
+
+      missing_links = pages.urls_linking_to(not_found)
+      missing_links.each do |url, links|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
+        links.slice(0..10).each do |u|
+          u = u.path if options.relative
+          puts "  linked from #{u}"
+        end
+
+        puts "  ..." if links.size > 10
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file
+    file = open(options.output_file, 'w')
+    pages.each_key do |url|
+      url = options.relative ? url.path.to_s : url.to_s
+      file.puts url
+    end
+  end
+
+end
data/lib/anemone/cli/pagedepth.rb
ADDED
@@ -0,0 +1,32 @@
+require 'anemone'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone pagedepth <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and outputs a count of
+  the number of pages at each depth of the crawl.
+INFO
+  exit(0)
+end
+
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq!
+
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
+end
data/lib/anemone/cli/serialize.rb
ADDED
@@ -0,0 +1,35 @@
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+begin
+  # make sure that the first option is a URL we can crawl
+  root = URI(ARGV[0])
+rescue
+  puts <<-INFO
+Usage:
+  anemone serialize [options] <url>
+
+Synopsis:
+  Crawls a site starting at the given URL and saves the resulting
+  PageStore object to a file using Marshal serialization.
+
+Options:
+  -o, --output filename    Filename to save PageStore to. Defaults to crawl.{Time.now}
+INFO
+  exit(0)
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
+end
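
For reference, a PageStore dumped by the serialize command above can be loaded back with plain Ruby Marshal. A minimal sketch, assuming the gem is installed; the filename is a placeholder for whatever was passed via -o (or the default crawl.<timestamp>):

  require 'anemone'   # defines the Anemone classes referenced by the dump

  # Read the Marshal-serialized PageStore and report how many pages it holds.
  pages = File.open('crawl.1338336000', 'rb') { |f| Marshal.load(f) }
  puts pages.size
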