jeremyf-anemone 0.1.3
- data/LICENSE.txt +19 -0
- data/README.rdoc +18 -0
- data/Rakefile +48 -0
- data/VERSION.yml +4 -0
- data/anemone.gemspec +62 -0
- data/bin/anemone_count.rb +36 -0
- data/bin/anemone_cron.rb +106 -0
- data/bin/anemone_pagedepth.rb +44 -0
- data/bin/anemone_serialize.rb +51 -0
- data/bin/anemone_url_list.rb +51 -0
- data/lib/anemone.rb +2 -0
- data/lib/anemone/anemone.rb +37 -0
- data/lib/anemone/core.rb +211 -0
- data/lib/anemone/http.rb +38 -0
- data/lib/anemone/page.rb +180 -0
- data/lib/anemone/page_hash.rb +116 -0
- data/lib/anemone/tentacle.rb +31 -0
- data/spec/anemone_spec.rb +27 -0
- data/spec/core_spec.rb +114 -0
- data/spec/fakeweb_helper.rb +55 -0
- data/spec/page_spec.rb +49 -0
- data/spec/spec_helper.rb +5 -0
- metadata +85 -0
data/LICENSE.txt
ADDED
@@ -0,0 +1,19 @@
Copyright (c) 2009 Vertive, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.rdoc
ADDED
@@ -0,0 +1,18 @@
= Anemone

== DESCRIPTION
Anemone is a web spider framework that can spider a domain and collect useful
information about the pages it visits. It is versatile, allowing you to
write your own specialized spider tasks quickly and easily.

== FEATURES
* Multi-threaded design for high performance
* Tracks 301 HTTP redirects to understand a page's aliases
* Built-in BFS algorithm for determining page depth
* Allows exclusion of URLs based on regular expressions

== REQUIREMENTS
* nokogiri

== EXAMPLES
See the +bin+ directory for several examples of useful Anemone tasks.
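The README points to the +bin+ scripts below for full examples. As a quick orientation, the following is a minimal sketch of the crawl API those scripts use (Anemone.crawl, skip_links_like, on_every_page, after_crawl); the target URL and skip patterns are placeholders, not part of the gem.

require 'anemone'

Anemone.crawl("http://www.example.com/") do |anemone|
  # don't follow URLs matching these patterns (placeholder patterns)
  anemone.skip_links_like %r{/login}, %r{\.pdf$}

  # called for every page as it is fetched
  anemone.on_every_page do |page|
    puts "#{page.code}\t#{page.url}"
  end

  # called once with the full PageHash when the crawl finishes
  anemone.after_crawl do |pages|
    puts "#{pages.uniq.size} unique pages crawled"
  end
end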
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "anemone"
    gem.summary = %Q{Anemone is a web spider framework that can spider a domain.}
    gem.email = "jeremy.n.friesen@gmail.com"
    gem.homepage = "http://github.com/jeremyf/anemone"
    gem.authors = ["Chris Kite", "Jeremy Friesen"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "anemone #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION.yml
ADDED
data/anemone.gemspec
ADDED
@@ -0,0 +1,62 @@
# -*- encoding: utf-8 -*-

Gem::Specification.new do |s|
  s.name = %q{anemone}
  s.version = "0.1.3"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Chris Kite", "Jeremy Friesen"]
  s.date = %q{2009-08-05}
  s.email = %q{jeremy.n.friesen@gmail.com}
  s.executables = ["anemone_count.rb", "anemone_cron.rb", "anemone_pagedepth.rb", "anemone_serialize.rb", "anemone_url_list.rb"]
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.rdoc"
  ]
  s.files = [
    "LICENSE.txt",
    "README.rdoc",
    "Rakefile",
    "VERSION.yml",
    "anemone.gemspec",
    "bin/anemone_count.rb",
    "bin/anemone_cron.rb",
    "bin/anemone_pagedepth.rb",
    "bin/anemone_serialize.rb",
    "bin/anemone_url_list.rb",
    "lib/anemone.rb",
    "lib/anemone/anemone.rb",
    "lib/anemone/core.rb",
    "lib/anemone/http.rb",
    "lib/anemone/page.rb",
    "lib/anemone/page_hash.rb",
    "lib/anemone/tentacle.rb",
    "spec/anemone_spec.rb",
    "spec/core_spec.rb",
    "spec/fakeweb_helper.rb",
    "spec/page_spec.rb",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/jeremyf/anemone}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.4}
  s.summary = %q{Anemone is a web spider framework that can spider a domain.}
  s.test_files = [
    "spec/anemone_spec.rb",
    "spec/core_spec.rb",
    "spec/fakeweb_helper.rb",
    "spec/page_spec.rb",
    "spec/spec_helper.rb"
  ]

  if s.respond_to? :specification_version then
    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
    s.specification_version = 3

    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
    else
    end
  else
  end
end
data/bin/anemone_count.rb
ADDED
@@ -0,0 +1,36 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the total number
#   of unique pages on the site.
#
# == Usage
#   anemone_count.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'

def usage
  puts <<END
Usage: anemone_count.rb url
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

Anemone.crawl(ARGV[0]) do |anemone|
  anemone.after_crawl do |pages|
    puts pages.uniq.size
  end
end
data/bin/anemone_cron.rb
ADDED
@@ -0,0 +1,106 @@
#! /usr/bin/env ruby
# == Synopsis
#   Performs pagedepth, url list, and count functionality.
#   Meant to be run daily as a cron job.
#
# == Usage
#   anemone_cron.rb [options] url
#
# == Options
#   -r, --relative           Output relative URLs (rather than absolute)
#   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_cron.rb [options] url

Options:
  -r, --relative           Output relative URLs (rather than absolute)
  -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
END
end

options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  usage
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV.last

Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|

  anemone.after_crawl do |pages|
    puts "Crawl results for #{root}\n"

    # print a list of 404's
    not_found = []
    pages.each_value do |page|
      url = page.url.to_s
      not_found << url if page.not_found?
    end
    unless not_found.empty?
      puts "\n404's:"

      missing_links = pages.urls_linking_to(not_found)
      missing_links.each do |url, links|
        if options.relative
          puts URI(url).path.to_s
        else
          puts url
        end
        links.slice(0..10).each do |u|
          u = u.path if options.relative
          puts "  linked from #{u}"
        end

        puts "  ..." if links.size > 10
      end

      print "\n"
    end

    # remove redirect aliases, and calculate pagedepths
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    # print the page count
    puts "Total pages: #{pages.size}\n"

    # print a list of depths
    depths.sort.each { |depth, count| puts "Depth: #{depth}   Count: #{count}" }

    # output a list of urls to file
    file = open(options.output_file, 'w')
    pages.each_key do |url|
      url = options.relative ? url.path.to_s : url.to_s
      file.puts url
    end

  end
end
data/bin/anemone_pagedepth.rb
ADDED
@@ -0,0 +1,44 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs a count of
#   the number of Pages at each depth in the site.
#
# == Usage
#   anemone_pagedepth.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'

def usage
  puts <<END
Usage: anemone_pagedepth.rb url
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    pages = pages.shortest_paths!(root).uniq
    depths = pages.values.inject({}) do |depths, page|
      depths[page.depth] ||= 0
      depths[page.depth] += 1
      depths
    end

    depths.sort.each { |depth, count| puts "Depth: #{depth}   Count: #{count}" }
  end
end
data/bin/anemone_serialize.rb
ADDED
@@ -0,0 +1,51 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and saves the resulting
#   PageHash object to a file using Marshal serialization.
#
# == Usage
#   anemone_serialize.rb [options] url
#
# == Options
#   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_serialize.rb [options] url

Options:
  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
END
end

# make sure that the first option is a URL we can crawl
begin
  URI(ARGV[0])
rescue
  usage
  Process.exit
end

options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# parse command-line options
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
opts.parse!(ARGV)

root = ARGV[0]
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
  end
end
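Because anemone_serialize.rb stores the whole PageHash with Marshal, a later script can reload the crawl results and inspect them offline. A minimal sketch under that assumption; the dump filename is a placeholder, and 'anemone' must be required first so Marshal can rebuild the Page objects:

require 'anemone'

# reload a PageHash previously written by anemone_serialize.rb (filename is hypothetical)
pages = File.open("crawl.1249500000", "rb") { |f| Marshal.load(f) }
puts "#{pages.uniq.size} unique pages in the saved crawl"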
data/bin/anemone_url_list.rb
ADDED
@@ -0,0 +1,51 @@
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the URL of each page
#   in the domain as they are encountered.
#
# == Usage
#   anemone_url_list.rb [options] url
#
# == Options
#   -r, --relative    Output relative URLs (rather than absolute)
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
require 'anemone'
require 'optparse'
require 'ostruct'

def usage
  puts <<END
Usage: anemone_url_list.rb [options] url

Options:
  -r, --relative    Output relative URLs (rather than absolute)
END
end

options = OpenStruct.new
options.relative = false

# make sure that the last option is a URL we can crawl
begin
  URI(ARGV.last)
rescue
  usage
  Process.exit
end

# parse command-line options
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.parse!(ARGV)

puts "CODE\tFROM\tTO"
Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
  anemone.on_every_page do |page|
    # output each page's response code, referring URL, and URL as a tab-separated row
    link = options.relative ? page.url.path : page.url
    puts "#{page.code}\t#{page.from_url}\t#{link}"
  end
end