anemone 0.0.2 → 0.0.3
This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between the two versions.
- data/README.txt +17 -17
- data/bin/anemone_count.rb +36 -31
- data/bin/anemone_cron.rb +107 -98
- data/bin/anemone_pagedepth.rb +43 -38
- data/bin/anemone_serialize.rb +50 -42
- data/bin/anemone_url_list.rb +54 -46
- data/bin/anemone_url_list.rb~ +58 -0
- data/lib/anemone.rb +1 -1
- data/lib/anemone/anemone.rb +36 -36
- data/lib/anemone/core.rb +181 -179
- data/lib/anemone/http.rb +36 -36
- data/lib/anemone/page.rb +184 -159
- data/lib/anemone/page_hash.rb +82 -82
- data/lib/anemone/tentacle.rb +30 -30
- metadata +10 -9
data/README.txt
CHANGED
@@ -1,18 +1,18 @@
-= Anemone
-
-== DESCRIPTION
-Anemone is a web spider framework that can spider a domain and collect useful
-information about the pages it visits. It is versatile, allowing you to
-write your own specialized spider tasks quickly and easily.
-
-== FEATURES
-* Multi-threaded design for high performance
-* Tracks 301 HTTP redirects to understand a page's aliases
-* Built-in BFS algorithm for determining page depth
-* Allows exclusion of URLs based on regular expressions
-
-== REQUIREMENTS
-* hpricot
-
-== EXAMPLES
+= Anemone
+
+== DESCRIPTION
+Anemone is a web spider framework that can spider a domain and collect useful
+information about the pages it visits. It is versatile, allowing you to
+write your own specialized spider tasks quickly and easily.
+
+== FEATURES
+* Multi-threaded design for high performance
+* Tracks 301 HTTP redirects to understand a page's aliases
+* Built-in BFS algorithm for determining page depth
+* Allows exclusion of URLs based on regular expressions
+
+== REQUIREMENTS
+* hpricot
+
+== EXAMPLES
 See the +bin+ directory for several examples of useful Anemone tasks.
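The bin/ scripts diffed below are the examples the README points to. For orientation, a minimal task built only from calls that appear elsewhere in this diff (Anemone.crawl, skip_links_like, after_crawl) might look roughly like this; the URL and the skip pattern are placeholders, not part of the package:

  require 'anemone'

  # Minimal sketch of an Anemone task in the style of the bundled bin/ scripts.
  # The URL and the skip pattern below are placeholders.
  Anemone.crawl("http://example.com") do |anemone|
    # exclude URLs by regular expression (see the README's FEATURES list)
    anemone.skip_links_like %r{^/private/}

    # runs once the crawl finishes, receiving the collected pages
    anemone.after_crawl do |pages|
      puts "Unique pages: #{pages.uniq.size}"
    end
  end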
data/bin/anemone_count.rb
CHANGED
@@ -1,31 +1,36 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the total number
-# of unique pages on the site.
-#
-# == Usage
-# anemone_count.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the total number
+# of unique pages on the site.
+#
+# == Usage
+# anemone_count.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+
+def usage
+  puts <<END
+Usage: anemone_count.rb url
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+Anemone.crawl(ARGV[0]) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
+
+
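Each rewritten script now opens with the same pair: a local usage function and a begin/rescue guard around URI(ARGV[0]). Note that Ruby's URI() is permissive and accepts many strings that are not crawlable HTTP URLs, so the guard mostly catches nil or badly malformed arguments. A stricter variant, purely a sketch and not part of anemone, could test the class of the parsed result:

  require 'uri'

  # Hypothetical stricter version of the scripts' bare URI(ARGV[0]) guard;
  # not part of anemone 0.0.3.
  def crawlable_url?(arg)
    URI(arg.to_s).is_a?(URI::HTTP)  # URI::HTTPS is a subclass of URI::HTTP
  rescue URI::InvalidURIError
    false
  end

  puts crawlable_url?('http://example.com')  # => true
  puts crawlable_url?('not a url')           # => false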
data/bin/anemone_cron.rb
CHANGED
@@ -1,99 +1,108 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Performs pagedepth, url list, and count functionality
-# Meant to be run daily as a cron job
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
+#! /usr/bin/env ruby
+# == Synopsis
+# Performs pagedepth, url list, and count functionality
+# Meant to be run daily as a cron job
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_url_list.rb [options] url
+
+Options:
+  -r, --relative           Output relative URLs (rather than absolute)
+  -o, --output filename    Filename to save URL list to. Defautls to urls.txt.
+END
+end
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+# make sure that the last option is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  usage
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV.last
+
+Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+  anemone.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    if !not_found.empty?
+      puts "\n404's:"
+      not_found.each do |url|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
+        num_linked_from = 0
+        pages.urls_linking_to(url).each do |u|
+          u = u.path if options.relative
+          num_linked_from += 1
+          puts " linked from #{u}"
+          if num_linked_from > 10
+            puts " ..."
+            break
+          end
+        end
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file
+    file = open(options.output_file, 'w')
+    pages.each_key do |url|
+      url = options.relative ? url.path.to_s : url.to_s
+      file.puts url
+    end
+
+  end
 end
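Because the script validates URI(ARGV.last) before OptionParser strips the switches, the URL must come last on the command line, matching the "[options] url" usage string. A plausible invocation to schedule from cron (the URL here is a placeholder) would be: anemone_cron.rb -r -o urls.txt http://example.com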
data/bin/anemone_pagedepth.rb
CHANGED
@@ -1,39 +1,44 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs a count of
-# the number of Pages at each depth in the site.
-#
-# == Usage
-# anemone_pagedepth.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs a count of
+# the number of Pages at each depth in the site.
+#
+# == Usage
+# anemone_pagedepth.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+
+def usage
+  puts <<END
+Usage: anemone_pagedepth.rb url
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
 end
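This script and anemone_cron.rb reduce the crawl to a depth histogram with the same inject fold over pages.values. Isolated from the crawl, with a hypothetical Struct standing in for Anemone's page objects (only the depth attribute matters here), the fold behaves like this:

  # Stand-in for Anemone's page objects; only depth is needed.
  Page = Struct.new(:depth)
  pages = [Page.new(0), Page.new(1), Page.new(1), Page.new(2)]

  # Same fold the scripts use: build a hash of depth => page count.
  depths = pages.inject({}) do |depths, page|
    depths[page.depth] ||= 0
    depths[page.depth] += 1
    depths
  end

  depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  # Depth: 0 Count: 1
  # Depth: 1 Count: 2
  # Depth: 2 Count: 1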
data/bin/anemone_serialize.rb
CHANGED
@@ -1,43 +1,51 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and saves the resulting
-# PageHash object to a file using Marshal serialization.
-#
-# == Usage
-# anemone_serialize.rb [options] url
-#
-# == Options
-# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and saves the resulting
+# PageHash object to a file using Marshal serialization.
+#
+# == Usage
+# anemone_serialize.rb [options] url
+#
+# == Options
+# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_serialize.rb [options] url
+
+Options:
+  -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
 end
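The script only writes the snapshot; restoring it is the usual Marshal round-trip. A sketch, assuming a dump produced by this script (the filename is a placeholder for whatever -o or the crawl.{timestamp} default yielded):

  require 'anemone'  # the page classes must be defined before unmarshaling

  # Sketch: restore a PageHash written by anemone_serialize.rb.
  pages = open('crawl.1234567890', 'rb') { |f| Marshal.load(f) }
  puts "Restored #{pages.size} pages"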