anemone 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.txt CHANGED
@@ -1,18 +1,18 @@
- = Anemone
-
- == DESCRIPTION
- Anemone is a web spider framework that can spider a domain and collect useful
- information about the pages it visits. It is versatile, allowing you to
- write your own specialized spider tasks quickly and easily.
-
- == FEATURES
- * Multi-threaded design for high performance
- * Tracks 301 HTTP redirects to understand a page's aliases
- * Built-in BFS algorithm for determining page depth
- * Allows exclusion of URLs based on regular expressions
-
- == REQUIREMENTS
- * hpricot
-
- == EXAMPLES
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * hpricot
+
+ == EXAMPLES
  See the +bin+ directory for several examples of useful Anemone tasks.
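
For orientation, the API those examples rely on can be sketched briefly. The following is a minimal, hypothetical crawl (the URL and the regex are placeholders); Anemone.crawl, skip_links_like, after_crawl, and pages.uniq all appear in the bin/ scripts diffed below.

  require 'anemone'

  Anemone.crawl("http://www.example.com") do |anemone|
    # exclude URLs matching a regular expression (the FEATURES list above)
    anemone.skip_links_like %r{/private/}

    # after_crawl yields the collection of pages visited during the crawl
    anemone.after_crawl do |pages|
      puts "Unique pages found: #{pages.uniq.size}"
    end
  end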
data/bin/anemone_count.rb CHANGED
@@ -1,31 +1,36 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs the total number
- # of unique pages on the site.
- #
- # == Usage
- # anemone_count.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'rdoc/usage'
-
- # make sure that the first option is a URL we can crawl
- begin
-   URI(ARGV[0])
- rescue
-   RDoc::usage()
-   Process.exit
- end
-
- Anemone.crawl(ARGV[0]) do |anemone|
-   anemone.after_crawl do |pages|
-     puts pages.uniq.size
-   end
- end
-
-
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_count.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
+
+
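
The substantive change above, repeated in every script in 0.0.3, is replacing the rdoc/usage dependency with a local usage method guarded by URI(). Condensed into a standalone sketch (the script name is hypothetical): URI() raises on a missing or unparseable argument, which routes execution through usage and exits.

  require 'uri'

  def usage
    puts "Usage: some_script.rb url"  # hypothetical script name
  end

  # URI() raises unless ARGV[0] parses as a URI, so a bad or absent
  # argument prints the usage banner and exits
  begin
    URI(ARGV[0])
  rescue
    usage
    Process.exit
  end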
data/bin/anemone_cron.rb CHANGED
@@ -1,99 +1,108 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Performs pagedepth, url list, and count functionality
- # Meant to be run daily as a cron job
- #
- # == Usage
- # anemone_url_list.rb [options] url
- #
- # == Options
- # -r, --relative Output relative URLs (rather than absolute)
- # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'rdoc/usage'
- require 'ostruct'
-
- options = OpenStruct.new
- options.relative = false
- options.output_file = 'urls.txt'
-
- # make sure that the last option is a URL we can crawl
- begin
-   URI(ARGV.last)
- rescue
-   RDoc::usage()
-   Process.exit
- end
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-r', '--relative') { options.relative = true }
- opts.on('-o', '--output filename') {|o| options.output_file = o }
- opts.parse!(ARGV)
-
- root = ARGV.last
-
- Anemone.crawl(root) do |anemone|
-
-   anemone.after_crawl do |pages|
-     puts "Crawl results for #{root}\n"
-
-     # print a list of 404's
-     not_found = []
-     pages.each_value do |page|
-       url = page.url.to_s
-       not_found << url if page.not_found?
-     end
-     if !not_found.empty?
-       puts "\n404's:"
-       not_found.each do |url|
-         if options.relative
-           puts URI(url).path.to_s
-         else
-           puts url
-         end
-         num_linked_from = 0
-         pages.urls_linking_to(url).each do |u|
-           u = u.path if options.relative
-           num_linked_from += 1
-           puts " linked from #{u}"
-           if num_linked_from > 10
-             puts " ..."
-             break
-           end
-         end
-       end
-
-       print "\n"
-     end
-
-     # remove redirect aliases, and calculate pagedepths
-     pages = pages.shortest_paths!(root).uniq
-     depths = pages.values.inject({}) do |depths, page|
-       depths[page.depth] ||= 0
-       depths[page.depth] += 1
-       depths
-     end
-
-     # print the page count
-     puts "Total pages: #{pages.size}\n"
-
-     # print a list of depths
-     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-
-     # output a list of urls to file
-     file = open(options.output_file, 'w')
-     pages.each_key do |url|
-       url = options.relative ? url.path.to_s : url.to_s
-       file.puts url
-     end
-
-   end
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality
+ # Meant to be run daily as a cron job
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ # -r, --relative Output relative URLs (rather than absolute)
+ # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     if !not_found.empty?
+       puts "\n404's:"
+       not_found.each do |url|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         num_linked_from = 0
+         pages.urls_linking_to(url).each do |u|
+           u = u.path if options.relative
+           num_linked_from += 1
+           puts " linked from #{u}"
+           if num_linked_from > 10
+             puts " ..."
+             break
+           end
+         end
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+
+   end
  end
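
Besides the usage-method change, the crawl call above now passes an options hash. A minimal sketch of the same call, assuming only the :discard_page_bodies option shown in the diff (the URL is a placeholder; the option name suggests page bodies are freed after processing to reduce memory use on large crawls, though the diff itself does not document this):

  require 'anemone'

  # pass crawl options as a hash; :discard_page_bodies is the option the
  # 0.0.3 cron script uses
  Anemone.crawl("http://www.example.com", :discard_page_bodies => true) do |anemone|
    anemone.after_crawl do |pages|
      puts "Total pages: #{pages.size}"
    end
  end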
data/bin/anemone_pagedepth.rb CHANGED
@@ -1,39 +1,44 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs a count of
- # the number of Pages at each depth in the site.
- #
- # == Usage
- # anemone_pagedepth.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'rdoc/usage'
-
- # make sure that the first option is a URL we can crawl
- begin
-   URI(ARGV[0])
- rescue
-   RDoc::usage()
-   Process.exit
- end
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
-   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
-   anemone.after_crawl do |pages|
-     pages = pages.shortest_paths!(root).uniq
-     depths = pages.values.inject({}) do |depths, page|
-       depths[page.depth] ||= 0
-       depths[page.depth] += 1
-       depths
-     end
-
-     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-   end
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_pagedepth.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
  end
data/bin/anemone_serialize.rb CHANGED
@@ -1,43 +1,51 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and saves the resulting
- # PageHash object to a file using Marshal serialization.
- #
- # == Usage
- # anemone_serialize.rb [options] url
- #
- # == Options
- # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'rdoc/usage'
- require 'ostruct'
-
- # make sure that the first option is a URL we can crawl
- begin
-   URI(ARGV[0])
- rescue
-   RDoc::usage()
-   Process.exit
- end
-
- options = OpenStruct.new
- options.output_file = "crawl.#{Time.now.to_i}"
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-o', '--output filename') {|o| options.output_file = o }
- opts.parse!(ARGV)
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
-   anemone.after_crawl do |pages|
-     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
-   end
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_serialize.rb [options] url
+
+ Options:
+ -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
  end
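
Since anemone_serialize.rb only writes the PageHash out, a natural companion is loading it back. A minimal sketch, assuming a dump file exists (the filename and timestamp are placeholders); Marshal.load is standard Ruby, and requiring anemone first ensures the serialized classes are defined before deserialization:

  require 'anemone'

  # restore the PageHash written by bin/anemone_serialize.rb
  pages = open("crawl.1234567890", "rb") { |f| Marshal.load(f) }
  puts "Restored #{pages.size} pages"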