jeremyf-anemone 0.1.3

data/LICENSE.txt ADDED
@@ -0,0 +1,19 @@
+ Copyright (c) 2009 Vertive, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,18 @@
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * nokogiri
+
+ == EXAMPLES
+ See the +bin+ directory for several examples of useful Anemone tasks.
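
As a quick orientation before the task scripts later in this gem, here is a
minimal crawl sketch; it uses only calls that appear in the bin/ scripts
below, and the URL is a placeholder:

    require 'anemone'

    # Print every URL encountered, skipping paths under /private/.
    # "http://www.example.com" is a placeholder; point it at a real site.
    Anemone.crawl("http://www.example.com") do |anemone|
      anemone.skip_links_like %r{^/private/}

      anemone.on_every_page do |page|
        puts page.url
      end
    end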
data/Rakefile ADDED
@@ -0,0 +1,48 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "anemone"
+     gem.summary = %Q{Anemone is a web spider framework that can spider a domain.}
+     gem.email = "jeremy.n.friesen@gmail.com"
+     gem.homepage = "http://github.com/jeremyf/anemone"
+     gem.authors = ["Chris Kite", "Jeremy Friesen"]
+     # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+   end
+
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'spec/rake/spectask'
+ Spec::Rake::SpecTask.new(:spec) do |spec|
+   spec.libs << 'lib' << 'spec'
+   spec.spec_files = FileList['spec/**/*_spec.rb']
+ end
+
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
+   spec.libs << 'lib' << 'spec'
+   spec.pattern = 'spec/**/*_spec.rb'
+   spec.rcov = true
+ end
+
+ task :default => :spec
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "anemone #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
+ ---
+ :major: 0
+ :minor: 1
+ :patch: 3
data/anemone.gemspec ADDED
@@ -0,0 +1,62 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{anemone}
+   s.version = "0.1.3"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Chris Kite", "Jeremy Friesen"]
+   s.date = %q{2009-08-05}
+   s.email = %q{jeremy.n.friesen@gmail.com}
+   s.executables = ["anemone_count.rb", "anemone_cron.rb", "anemone_pagedepth.rb", "anemone_serialize.rb", "anemone_url_list.rb"]
+   s.extra_rdoc_files = [
+     "LICENSE.txt",
+     "README.rdoc"
+   ]
+   s.files = [
+     "LICENSE.txt",
+     "README.rdoc",
+     "Rakefile",
+     "VERSION.yml",
+     "anemone.gemspec",
+     "bin/anemone_count.rb",
+     "bin/anemone_cron.rb",
+     "bin/anemone_pagedepth.rb",
+     "bin/anemone_serialize.rb",
+     "bin/anemone_url_list.rb",
+     "lib/anemone.rb",
+     "lib/anemone/anemone.rb",
+     "lib/anemone/core.rb",
+     "lib/anemone/http.rb",
+     "lib/anemone/page.rb",
+     "lib/anemone/page_hash.rb",
+     "lib/anemone/tentacle.rb",
+     "spec/anemone_spec.rb",
+     "spec/core_spec.rb",
+     "spec/fakeweb_helper.rb",
+     "spec/page_spec.rb",
+     "spec/spec_helper.rb"
+   ]
+   s.homepage = %q{http://github.com/jeremyf/anemone}
+   s.rdoc_options = ["--charset=UTF-8"]
+   s.require_paths = ["lib"]
+   s.rubygems_version = %q{1.3.4}
+   s.summary = %q{Anemone is a web spider framework that can spider a domain.}
+   s.test_files = [
+     "spec/anemone_spec.rb",
+     "spec/core_spec.rb",
+     "spec/fakeweb_helper.rb",
+     "spec/page_spec.rb",
+     "spec/spec_helper.rb"
+   ]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+     else
+     end
+   else
+   end
+ end
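
An aside for readers unfamiliar with generated gemspecs: the file above
evaluates to a Gem::Specification object, so it can be loaded and inspected
directly. A minimal sketch, assuming it is run from the gem's root directory:

    # Evaluate the gemspec into a Gem::Specification and inspect it.
    spec = eval(File.read("anemone.gemspec"), binding, "anemone.gemspec")
    puts spec.full_name              # => "anemone-0.1.3"
    puts spec.executables.join(", ")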
data/bin/anemone_count.rb ADDED
@@ -0,0 +1,36 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ #   Crawls a site starting at the given URL, and outputs the total number
+ #   of unique pages on the site.
+ #
+ # == Usage
+ #   anemone_count.rb url
+ #
+ # == Author
+ #   Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_count.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
data/bin/anemone_cron.rb ADDED
@@ -0,0 +1,106 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ #   Performs pagedepth, url list, and count functionality.
+ #   Meant to be run daily as a cron job.
+ #
+ # == Usage
+ #   anemone_cron.rb [options] url
+ #
+ # == Options
+ #   -r, --relative           Output relative URLs (rather than absolute)
+ #   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ #   Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_cron.rb [options] url
+
+ Options:
+   -r, --relative           Output relative URLs (rather than absolute)
+   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative')        { options.relative = true }
+ opts.on('-o', '--output filename') { |o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     unless not_found.empty?
+       puts "\n404's:"
+
+       missing_links = pages.urls_linking_to(not_found)
+       missing_links.each do |url, links|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         links.slice(0..10).each do |u|
+           u = u.path if options.relative
+           puts " linked from #{u}"
+         end
+
+         puts " ..." if links.size > 10
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+   end
+ end
data/bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,44 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ #   Crawls a site starting at the given URL, and outputs a count of
+ #   the number of Pages at each depth in the site.
+ #
+ # == Usage
+ #   anemone_pagedepth.rb url
+ #
+ # == Author
+ #   Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_pagedepth.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
data/bin/anemone_serialize.rb ADDED
@@ -0,0 +1,51 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ #   Crawls a site starting at the given URL, and saves the resulting
+ #   PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ #   anemone_serialize.rb [options] url
+ #
+ # == Options
+ #   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ #   Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_serialize.rb [options] url
+
+ Options:
+   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') { |o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') { |f| Marshal.dump(pages, f) }
+   end
+ end
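
The natural counterpart to this script, reloading the dump for offline
analysis, is not shipped in the gem. A minimal sketch, where the filename is
a placeholder for one produced by anemone_serialize.rb:

    require 'anemone'

    # Reload a PageHash written by anemone_serialize.rb.
    # "crawl.1249500000" is a placeholder filename.
    pages = open("crawl.1249500000", 'rb') { |f| Marshal.load(f) }

    # The loaded object answers the same calls the bin/ scripts use,
    # e.g. counting unique pages as anemone_count.rb does:
    puts pages.uniq.size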
data/bin/anemone_url_list.rb ADDED
@@ -0,0 +1,51 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ #   Crawls a site starting at the given URL, and outputs the URL of each page
+ #   in the domain as they are encountered.
+ #
+ # == Usage
+ #   anemone_url_list.rb [options] url
+ #
+ # == Options
+ #   -r, --relative    Output relative URLs (rather than absolute)
+ #
+ # == Author
+ #   Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ puts "CODE\tFROM\tTO"
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
+   anemone.on_every_page do |page|
+     # page.url is a URI; use its path when relative output is requested
+     link = options.relative ? page.url.path : page.url
+     puts "#{page.code}\t#{page.from_url}\t#{link}"
+   end
+ end
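
Since this fork's anemone_url_list.rb emits a three-column TSV report (CODE,
FROM, TO), a small consumer sketch may be useful; the filename is a
placeholder for captured output:

    # Filter captured anemone_url_list.rb output for non-200 responses.
    # "urls.tsv" is a placeholder; capture with: anemone_url_list.rb url > urls.tsv
    File.foreach("urls.tsv").drop(1).each do |line|
      code, from, to = line.chomp.split("\t", 3)
      puts "#{code} #{to} (linked from #{from})" unless code == "200"
    end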