jeremyf-anemone 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.txt +19 -0
- data/README.rdoc +18 -0
- data/Rakefile +48 -0
- data/VERSION.yml +4 -0
- data/anemone.gemspec +62 -0
- data/bin/anemone_count.rb +36 -0
- data/bin/anemone_cron.rb +106 -0
- data/bin/anemone_pagedepth.rb +44 -0
- data/bin/anemone_serialize.rb +51 -0
- data/bin/anemone_url_list.rb +51 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +37 -0
- data/lib/anemone/core.rb +211 -0
- data/lib/anemone/http.rb +38 -0
- data/lib/anemone/page.rb +180 -0
- data/lib/anemone/page_hash.rb +116 -0
- data/lib/anemone/tentacle.rb +31 -0
- data/spec/anemone_spec.rb +27 -0
- data/spec/core_spec.rb +114 -0
- data/spec/fakeweb_helper.rb +55 -0
- data/spec/page_spec.rb +49 -0
- data/spec/spec_helper.rb +5 -0
- metadata +85 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2009 Vertive, Inc.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
= Anemone
|
2
|
+
|
3
|
+
== DESCRIPTION
|
4
|
+
Anemone is a web spider framework that can spider a domain and collect useful
|
5
|
+
information about the pages it visits. It is versatile, allowing you to
|
6
|
+
write your own specialized spider tasks quickly and easily.
|
7
|
+
|
8
|
+
== FEATURES
|
9
|
+
* Multi-threaded design for high performance
|
10
|
+
* Tracks 301 HTTP redirects to understand a page's aliases
|
11
|
+
* Built-in BFS algorithm for determining page depth
|
12
|
+
* Allows exclusion of URLs based on regular expressions
|
13
|
+
|
14
|
+
== REQUIREMENTS
|
15
|
+
* nokogiri
|
16
|
+
|
17
|
+
== EXAMPLES
|
18
|
+
See the +bin+ directory for several examples of useful Anemone tasks.
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
begin
|
5
|
+
require 'jeweler'
|
6
|
+
Jeweler::Tasks.new do |gem|
|
7
|
+
gem.name = "anemone"
|
8
|
+
gem.summary = %Q{Anemone is a web spider framework that can spider a domain.}
|
9
|
+
gem.email = "jeremy.n.friesen@gmail.com"
|
10
|
+
gem.homepage = "http://github.com/jeremyf/anemone"
|
11
|
+
gem.authors = ["Chris Kite", "Jeremy Friesen"]
|
12
|
+
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
13
|
+
end
|
14
|
+
|
15
|
+
rescue LoadError
|
16
|
+
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
17
|
+
end
|
18
|
+
|
19
|
+
require 'spec/rake/spectask'
|
20
|
+
Spec::Rake::SpecTask.new(:spec) do |spec|
|
21
|
+
spec.libs << 'lib' << 'spec'
|
22
|
+
spec.spec_files = FileList['spec/**/*_spec.rb']
|
23
|
+
end
|
24
|
+
|
25
|
+
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
26
|
+
spec.libs << 'lib' << 'spec'
|
27
|
+
spec.pattern = 'spec/**/*_spec.rb'
|
28
|
+
spec.rcov = true
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
task :default => :spec
|
33
|
+
|
34
|
+
require 'rake/rdoctask'
|
35
|
+
Rake::RDocTask.new do |rdoc|
|
36
|
+
if File.exist?('VERSION.yml')
|
37
|
+
config = YAML.load(File.read('VERSION.yml'))
|
38
|
+
version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
|
39
|
+
else
|
40
|
+
version = ""
|
41
|
+
end
|
42
|
+
|
43
|
+
rdoc.rdoc_dir = 'rdoc'
|
44
|
+
rdoc.title = "anemone #{version}"
|
45
|
+
rdoc.rdoc_files.include('README*')
|
46
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
47
|
+
end
|
48
|
+
|
data/VERSION.yml
ADDED
data/anemone.gemspec
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{anemone}
|
5
|
+
s.version = "0.1.3"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Chris Kite", "Jeremy Friesen"]
|
9
|
+
s.date = %q{2009-08-05}
|
10
|
+
s.email = %q{jeremy.n.friesen@gmail.com}
|
11
|
+
s.executables = ["anemone_count.rb", "anemone_cron.rb", "anemone_pagedepth.rb", "anemone_serialize.rb", "anemone_url_list.rb"]
|
12
|
+
s.extra_rdoc_files = [
|
13
|
+
"LICENSE.txt",
|
14
|
+
"README.rdoc"
|
15
|
+
]
|
16
|
+
s.files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.rdoc",
|
19
|
+
"Rakefile",
|
20
|
+
"VERSION.yml",
|
21
|
+
"anemone.gemspec",
|
22
|
+
"bin/anemone_count.rb",
|
23
|
+
"bin/anemone_cron.rb",
|
24
|
+
"bin/anemone_pagedepth.rb",
|
25
|
+
"bin/anemone_serialize.rb",
|
26
|
+
"bin/anemone_url_list.rb",
|
27
|
+
"lib/anemone.rb",
|
28
|
+
"lib/anemone/anemone.rb",
|
29
|
+
"lib/anemone/core.rb",
|
30
|
+
"lib/anemone/http.rb",
|
31
|
+
"lib/anemone/page.rb",
|
32
|
+
"lib/anemone/page_hash.rb",
|
33
|
+
"lib/anemone/tentacle.rb",
|
34
|
+
"spec/anemone_spec.rb",
|
35
|
+
"spec/core_spec.rb",
|
36
|
+
"spec/fakeweb_helper.rb",
|
37
|
+
"spec/page_spec.rb",
|
38
|
+
"spec/spec_helper.rb"
|
39
|
+
]
|
40
|
+
s.homepage = %q{http://github.com/jeremyf/anemone}
|
41
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
42
|
+
s.require_paths = ["lib"]
|
43
|
+
s.rubygems_version = %q{1.3.4}
|
44
|
+
s.summary = %q{Anemone is a web spider framework that can spider a domain.}
|
45
|
+
s.test_files = [
|
46
|
+
"spec/anemone_spec.rb",
|
47
|
+
"spec/core_spec.rb",
|
48
|
+
"spec/fakeweb_helper.rb",
|
49
|
+
"spec/page_spec.rb",
|
50
|
+
"spec/spec_helper.rb"
|
51
|
+
]
|
52
|
+
|
53
|
+
if s.respond_to? :specification_version then
|
54
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
55
|
+
s.specification_version = 3
|
56
|
+
|
57
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
58
|
+
else
|
59
|
+
end
|
60
|
+
else
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# == Synopsis
|
3
|
+
# Crawls a site starting at the given URL, and outputs the total number
|
4
|
+
# of unique pages on the site.
|
5
|
+
#
|
6
|
+
# == Usage
|
7
|
+
# anemone_count.rb url
|
8
|
+
#
|
9
|
+
# == Author
|
10
|
+
# Chris Kite
|
11
|
+
|
12
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
13
|
+
|
14
|
+
require 'anemone'
|
15
|
+
|
16
|
+
def usage
|
17
|
+
puts <<END
|
18
|
+
Usage: anemone_count.rb url
|
19
|
+
END
|
20
|
+
end
|
21
|
+
|
22
|
+
# make sure that the first option is a URL we can crawl
|
23
|
+
begin
|
24
|
+
URI(ARGV[0])
|
25
|
+
rescue
|
26
|
+
usage
|
27
|
+
Process.exit
|
28
|
+
end
|
29
|
+
|
30
|
+
Anemone.crawl(ARGV[0]) do |anemone|
|
31
|
+
anemone.after_crawl do |pages|
|
32
|
+
puts pages.uniq.size
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
|
data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# == Synopsis
|
3
|
+
# Performs pagedepth, url list, and count functionality
|
4
|
+
# Meant to be run daily as a cron job
|
5
|
+
#
|
6
|
+
# == Usage
|
7
|
+
# anemone_cron.rb [options] url
|
8
|
+
#
|
9
|
+
# == Options
|
10
|
+
# -r, --relative Output relative URLs (rather than absolute)
|
11
|
+
# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
|
12
|
+
#
|
13
|
+
# == Author
|
14
|
+
# Chris Kite
|
15
|
+
|
16
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
17
|
+
|
18
|
+
require 'anemone'
|
19
|
+
require 'optparse'
|
20
|
+
require 'ostruct'
|
21
|
+
|
22
|
+
# Prints the command-line usage summary for this script (anemone_cron.rb)
# to standard output.
def usage
  puts <<END
Usage: anemone_cron.rb [options] url

Options:
-r, --relative Output relative URLs (rather than absolute)
-o, --output filename Filename to save URL list to. Defaults to urls.txt.
END
end
|
31
|
+
|
32
|
+
options = OpenStruct.new
|
33
|
+
options.relative = false
|
34
|
+
options.output_file = 'urls.txt'
|
35
|
+
|
36
|
+
# make sure that the last option is a URL we can crawl
|
37
|
+
begin
|
38
|
+
URI(ARGV.last)
|
39
|
+
rescue
|
40
|
+
usage
|
41
|
+
Process.exit
|
42
|
+
end
|
43
|
+
|
44
|
+
# parse command-line options
|
45
|
+
opts = OptionParser.new
|
46
|
+
opts.on('-r', '--relative') { options.relative = true }
|
47
|
+
opts.on('-o', '--output filename') {|o| options.output_file = o }
|
48
|
+
opts.parse!(ARGV)
|
49
|
+
|
50
|
+
root = ARGV.last
|
51
|
+
|
52
|
+
# Crawl the site starting at +root+ with the :discard_page_bodies option
# enabled. When the crawl finishes, print a 404 report, the total page count
# and a per-depth page count, then write the list of URLs to
# options.output_file.
Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        # Show at most 10 referrers, matching the "..." marker below which
        # fires when there are more than 10 (the previous 0..10 range
        # printed 11 entries).
        links.slice(0, 10).each do |u|
          u = u.path if options.relative
          puts " linked from #{u}"
        end

        puts " ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    # Tally how many pages sit at each depth; the block parameter is named
    # differently from the result so it no longer shadows the outer local.
    depths = pages.values.inject({}) do |counts, page|
      counts[page.depth] ||= 0
      counts[page.depth] += 1
      counts
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }

    # output a list of urls to file
    # File.open with a block closes (and flushes) the handle when the block
    # exits; it also avoids Kernel#open treating a filename that starts with
    # "|" as a command to run.
    File.open(options.output_file, 'w') do |file|
      pages.each_key do |url|
        url = options.relative ? url.path.to_s : url.to_s
        file.puts url
      end
    end

  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# == Synopsis
|
3
|
+
# Crawls a site starting at the given URL, and outputs a count of
|
4
|
+
# the number of Pages at each depth in the site.
|
5
|
+
#
|
6
|
+
# == Usage
|
7
|
+
# anemone_pagedepth.rb url
|
8
|
+
#
|
9
|
+
# == Author
|
10
|
+
# Chris Kite
|
11
|
+
|
12
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
13
|
+
|
14
|
+
require 'anemone'
|
15
|
+
|
16
|
+
def usage
|
17
|
+
puts <<END
|
18
|
+
Usage: anemone_pagedepth.rb url
|
19
|
+
END
|
20
|
+
end
|
21
|
+
|
22
|
+
# make sure that the first option is a URL we can crawl
|
23
|
+
begin
|
24
|
+
URI(ARGV[0])
|
25
|
+
rescue
|
26
|
+
usage
|
27
|
+
Process.exit
|
28
|
+
end
|
29
|
+
|
30
|
+
root = ARGV[0]
|
31
|
+
Anemone.crawl(root) do |anemone|
|
32
|
+
anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
|
33
|
+
|
34
|
+
anemone.after_crawl do |pages|
|
35
|
+
pages = pages.shortest_paths!(root).uniq
|
36
|
+
depths = pages.values.inject({}) do |depths, page|
|
37
|
+
depths[page.depth] ||= 0
|
38
|
+
depths[page.depth] += 1
|
39
|
+
depths
|
40
|
+
end
|
41
|
+
|
42
|
+
depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
|
43
|
+
end
|
44
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# == Synopsis
|
3
|
+
# Crawls a site starting at the given URL, and saves the resulting
|
4
|
+
# PageHash object to a file using Marshal serialization.
|
5
|
+
#
|
6
|
+
# == Usage
|
7
|
+
# anemone_serialize.rb [options] url
|
8
|
+
#
|
9
|
+
# == Options
|
10
|
+
# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now.to_i}
|
11
|
+
#
|
12
|
+
# == Author
|
13
|
+
# Chris Kite
|
14
|
+
|
15
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
16
|
+
|
17
|
+
require 'anemone'
|
18
|
+
require 'optparse'
|
19
|
+
require 'ostruct'
|
20
|
+
|
21
|
+
def usage
|
22
|
+
puts <<END
|
23
|
+
Usage: anemone_serialize.rb [options] url
|
24
|
+
|
25
|
+
Options:
|
26
|
+
-o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
|
27
|
+
END
|
28
|
+
end
|
29
|
+
|
30
|
+
# make sure that the first option is a URL we can crawl
|
31
|
+
begin
|
32
|
+
URI(ARGV[0])
|
33
|
+
rescue
|
34
|
+
usage
|
35
|
+
Process.exit
|
36
|
+
end
|
37
|
+
|
38
|
+
options = OpenStruct.new
|
39
|
+
options.output_file = "crawl.#{Time.now.to_i}"
|
40
|
+
|
41
|
+
# parse command-line options
|
42
|
+
opts = OptionParser.new
|
43
|
+
opts.on('-o', '--output filename') {|o| options.output_file = o }
|
44
|
+
opts.parse!(ARGV)
|
45
|
+
|
46
|
+
root = ARGV[0]
|
47
|
+
# Crawl the site starting at +root+ and, once the crawl has finished,
# serialize the resulting pages object to options.output_file with Marshal.
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    # Marshal output is binary, so open in 'wb' to avoid newline translation
    # on platforms that distinguish text and binary mode. File.open (rather
    # than Kernel#open) never interprets a leading "|" in the filename as a
    # command to run.
    File.open(options.output_file, 'wb') { |f| Marshal.dump(pages, f) }
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
# == Synopsis
|
3
|
+
# Crawls a site starting at the given URL, and outputs the URL of each page
|
4
|
+
# in the domain as they are encountered.
|
5
|
+
#
|
6
|
+
# == Usage
|
7
|
+
# anemone_url_list.rb [options] url
|
8
|
+
#
|
9
|
+
# == Options
|
10
|
+
# -r, --relative Output relative URLs (rather than absolute)
|
11
|
+
#
|
12
|
+
# == Author
|
13
|
+
# Chris Kite
|
14
|
+
|
15
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
16
|
+
require 'anemone'
|
17
|
+
require 'optparse'
|
18
|
+
require 'ostruct'
|
19
|
+
|
20
|
+
def usage
|
21
|
+
puts <<END
|
22
|
+
Usage: anemone_url_list.rb [options] url
|
23
|
+
|
24
|
+
Options:
|
25
|
+
-r, --relative Output relative URLs (rather than absolute)
|
26
|
+
END
|
27
|
+
end
|
28
|
+
|
29
|
+
options = OpenStruct.new
|
30
|
+
options.relative = false
|
31
|
+
|
32
|
+
# make sure that the last option is a URL we can crawl
|
33
|
+
begin
|
34
|
+
URI(ARGV.last)
|
35
|
+
rescue
|
36
|
+
usage
|
37
|
+
Process.exit
|
38
|
+
end
|
39
|
+
|
40
|
+
# parse command-line options
|
41
|
+
opts = OptionParser.new
|
42
|
+
opts.on('-r', '--relative') { options.relative = true }
|
43
|
+
opts.parse!(ARGV)
|
44
|
+
|
45
|
+
puts "CODE\tFROM\tTO"
|
46
|
+
# Crawl starting from the last command-line argument with the
# :discard_page_bodies option enabled, printing one tab-separated line per
# page under the "CODE\tFROM\tTO" header: page.code, page.from_url, and the
# page's URL (path only when --relative was given).
Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
  anemone.on_every_page do |page|
    # Use #path for the relative form: URI objects have no #page method, so
    # the previous page.url.page raised NoMethodError whenever -r was passed.
    # NOTE(review): assumes page.url is a URI, as the other scripts in bin/
    # treat it -- confirm against Anemone::Page.
    link = options.relative ? page.url.path : page.url
    puts "#{page.code}\t#{page.from_url}\t#{link}"
  end
end
|