anemone 0.0.2 → 0.0.3
- data/README.txt +17 -17
- data/bin/anemone_count.rb +36 -31
- data/bin/anemone_cron.rb +107 -98
- data/bin/anemone_pagedepth.rb +43 -38
- data/bin/anemone_serialize.rb +50 -42
- data/bin/anemone_url_list.rb +54 -46
- data/bin/anemone_url_list.rb~ +58 -0
- data/lib/anemone.rb +1 -1
- data/lib/anemone/anemone.rb +36 -36
- data/lib/anemone/core.rb +181 -179
- data/lib/anemone/http.rb +36 -36
- data/lib/anemone/page.rb +184 -159
- data/lib/anemone/page_hash.rb +82 -82
- data/lib/anemone/tentacle.rb +30 -30
- metadata +10 -9
data/README.txt
CHANGED
@@ -1,18 +1,18 @@
-= Anemone
-
-== DESCRIPTION
-Anemone is a web spider framework that can spider a domain and collect useful
-information about the pages it visits. It is versatile, allowing you to
-write your own specialized spider tasks quickly and easily.
-
-== FEATURES
-* Multi-threaded design for high performance
-* Tracks 301 HTTP redirects to understand a page's aliases
-* Built-in BFS algorithm for determining page depth
-* Allows exclusion of URLs based on regular expressions
-
-== REQUIREMENTS
-* hpricot
-
-== EXAMPLES
+= Anemone
+
+== DESCRIPTION
+Anemone is a web spider framework that can spider a domain and collect useful
+information about the pages it visits. It is versatile, allowing you to
+write your own specialized spider tasks quickly and easily.
+
+== FEATURES
+* Multi-threaded design for high performance
+* Tracks 301 HTTP redirects to understand a page's aliases
+* Built-in BFS algorithm for determining page depth
+* Allows exclusion of URLs based on regular expressions
+
+== REQUIREMENTS
+* hpricot
+
+== EXAMPLES
 See the +bin+ directory for several examples of useful Anemone tasks.
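Every line of the README hunk is rewritten, yet the visible text is identical on both sides, so the change is presumably whitespace or line endings. Its EXAMPLES section defers to the bin/ scripts that follow; as orientation, here is a minimal sketch of the crawl API those scripts use (the start URL and skip pattern are hypothetical, while Anemone.crawl, skip_links_like, after_crawl, shortest_paths!, and Page#depth all appear in the 0.0.3 code below):

#! /usr/bin/env ruby
require 'anemone'

root = 'http://example.com'   # hypothetical start URL

Anemone.crawl(root) do |anemone|
  # regex-based URL exclusion, per the FEATURES list
  anemone.skip_links_like %r{/private/}

  anemone.after_crawl do |pages|
    # collapse 301-redirect aliases, then report each page's BFS depth
    pages = pages.shortest_paths!(root).uniq
    pages.each_value { |page| puts "#{page.url} depth=#{page.depth}" }
  end
end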
data/bin/anemone_count.rb
CHANGED
@@ -1,31 +1,36 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the total number
-# of unique pages on the site.
-#
-# == Usage
-# anemone_count.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
[old lines 16-31 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the total number
+# of unique pages on the site.
+#
+# == Usage
+# anemone_count.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+
+def usage
+  puts <<END
+Usage: anemone_count.rb url
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+Anemone.crawl(ARGV[0]) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
+
+
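The usage/URI check added at the top of anemone_count.rb (and repeated in the other 0.0.3 scripts) leans on Kernel#URI raising for unusable input: a malformed string raises URI::InvalidURIError, and a missing argument (nil) also raises, so the bare rescue routes both failure modes to the usage message. A standalone sketch of the same check:

require 'uri'

# URI() raises for malformed strings (URI::InvalidURIError) and for nil,
# so one rescue covers both "bad" and "missing" arguments.
# Note: any parseable string passes; 'foo' is a valid generic URI, so
# this checks parseability, not crawlability.
def crawlable?(arg)
  URI(arg)
  true
rescue StandardError
  false
end

puts crawlable?('http://example.com')   # true
puts crawlable?('not a uri')            # false
puts crawlable?(nil)                    # false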
data/bin/anemone_cron.rb
CHANGED
@@ -1,99 +1,108 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Performs pagedepth, url list, and count functionality
-# Meant to be run daily as a cron job
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
[old lines 21-98 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Performs pagedepth, url list, and count functionality
+# Meant to be run daily as a cron job
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_url_list.rb [options] url
+
+Options:
+-r, --relative Output relative URLs (rather than absolute)
+-o, --output filename Filename to save URL list to. Defautls to urls.txt.
+END
+end
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+# make sure that the last option is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  usage
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV.last
+
+Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+  anemone.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    if !not_found.empty?
+      puts "\n404's:"
+      not_found.each do |url|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
+        num_linked_from = 0
+        pages.urls_linking_to(url).each do |u|
+          u = u.path if options.relative
+          num_linked_from += 1
+          puts " linked from #{u}"
+          if num_linked_from > 10
+            puts " ..."
+            break
+          end
+        end
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file
+    file = open(options.output_file, 'w')
+    pages.each_key do |url|
+      url = options.relative ? url.path.to_s : url.to_s
+      file.puts url
+    end
+
+  end
 end
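The option handling added to anemone_cron.rb is a common stdlib pairing: defaults live on an OpenStruct and OptionParser overwrites them in place, which is why 'ostruct' joins 'optparse' in the new requires. A self-contained sketch of the pattern (the argument vector is illustrative):

require 'optparse'
require 'ostruct'

# defaults on an OpenStruct; OptionParser callbacks mutate them in place
options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(%w[-r -o list.txt http://example.com])   # hypothetical ARGV

puts options.relative      # true
puts options.output_file   # "list.txt"

parse! also strips the recognized switches and their arguments from the array, leaving the trailing URL in place, which is what lets the script read ARGV.last for the crawl root.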
data/bin/anemone_pagedepth.rb
CHANGED
@@ -1,39 +1,44 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs a count of
-# the number of Pages at each depth in the site.
-#
-# == Usage
-# anemone_pagedepth.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
[old lines 16-38 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs a count of
+# the number of Pages at each depth in the site.
+#
+# == Usage
+# anemone_pagedepth.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+
+def usage
+  puts <<END
+Usage: anemone_pagedepth.rb url
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
 end
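The inject block that tallies depths (here and in anemone_cron.rb) is a histogram idiom: the hash must be the block's final expression, because inject feeds each block's return value back in as the next accumulator. A standalone sketch with made-up depth values:

# build {depth => count}; the trailing `counts` is load-bearing, since
# inject uses the block's return value as the next accumulator
depths = [0, 1, 1, 2, 2, 2]

histogram = depths.inject({}) do |counts, depth|
  counts[depth] ||= 0
  counts[depth] += 1
  counts
end

histogram.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
# Depth: 0 Count: 1
# Depth: 1 Count: 2
# Depth: 2 Count: 3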
data/bin/anemone_serialize.rb
CHANGED
@@ -1,43 +1,51 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and saves the resulting
-# PageHash object to a file using Marshal serialization.
-#
-# == Usage
-# anemone_serialize.rb [options] url
-#
-# == Options
-# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
[old lines 20-27 not captured in the source diff view]
-end
[old lines 29-42 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and saves the resulting
+# PageHash object to a file using Marshal serialization.
+#
+# == Usage
+# anemone_serialize.rb [options] url
+#
+# == Options
+# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_serialize.rb [options] url
+
+Options:
+-o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
 end
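anemone_serialize.rb only writes the crawl; reading it back is the mirror-image call. A minimal sketch, assuming the anemone lib is on the load path and a dump file produced by the script's -o option (the filename here is hypothetical):

require 'anemone'   # defines the PageHash class being unmarshaled

# Marshal.load is the inverse of the Marshal.dump call in the script
pages = open('crawl.1234567890', 'rb') { |f| Marshal.load(f) }
puts pages.size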