anemone 0.0.2 → 0.0.3

data/README.txt CHANGED
@@ -1,18 +1,18 @@
- = Anemone
-
- == DESCRIPTION
- Anemone is a web spider framework that can spider a domain and collect useful
- information about the pages it visits. It is versatile, allowing you to
- write your own specialized spider tasks quickly and easily.
-
- == FEATURES
- * Multi-threaded design for high performance
- * Tracks 301 HTTP redirects to understand a page's aliases
- * Built-in BFS algorithm for determining page depth
- * Allows exclusion of URLs based on regular expressions
-
- == REQUIREMENTS
- * hpricot
-
- == EXAMPLES
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * hpricot
+
+ == EXAMPLES
  See the +bin+ directory for several examples of useful Anemone tasks.
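
The crawling API those scripts share is small. A minimal task of your own looks like the sketch below; the URL and the exclusion regex are placeholders, while Anemone.crawl, skip_links_like, after_crawl, and the pages collection are the same calls the bin scripts in this diff use.

  #! /usr/bin/env ruby
  require 'anemone'

  # Crawl one domain, skip URLs matching a pattern, and report the
  # number of unique pages once the crawl finishes.
  Anemone.crawl("http://www.example.com") do |anemone|
    anemone.skip_links_like %r{/private/}    # placeholder exclusion pattern
    anemone.after_crawl do |pages|
      puts "Unique pages: #{pages.uniq.size}"
    end
  end
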
data/bin/anemone_count.rb CHANGED
@@ -1,31 +1,36 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs the total number
- # of unique pages on the site.
- #
- # == Usage
- # anemone_count.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'rdoc/usage'
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- RDoc::usage()
- Process.exit
- end
-
- Anemone.crawl(ARGV[0]) do |anemone|
- anemone.after_crawl do |pages|
- puts pages.uniq.size
- end
- end
-
-
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+ puts <<END
+ Usage: anemone_count.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+ URI(ARGV[0])
+ rescue
+ usage
+ Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+ anemone.after_crawl do |pages|
+ puts pages.uniq.size
+ end
+ end
+
+
data/bin/anemone_cron.rb CHANGED
@@ -1,99 +1,108 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Performs pagedepth, url list, and count functionality
- # Meant to be run daily as a cron job
- #
- # == Usage
- # anemone_url_list.rb [options] url
- #
- # == Options
- # -r, --relative Output relative URLs (rather than absolute)
- # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'rdoc/usage'
- require 'ostruct'
-
- options = OpenStruct.new
- options.relative = false
- options.output_file = 'urls.txt'
-
- # make sure that the last option is a URL we can crawl
- begin
- URI(ARGV.last)
- rescue
- RDoc::usage()
- Process.exit
- end
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-r', '--relative') { options.relative = true }
- opts.on('-o', '--output filename') {|o| options.output_file = o }
- opts.parse!(ARGV)
-
- root = ARGV.last
-
- Anemone.crawl(root) do |anemone|
-
- anemone.after_crawl do |pages|
- puts "Crawl results for #{root}\n"
-
- # print a list of 404's
- not_found = []
- pages.each_value do |page|
- url = page.url.to_s
- not_found << url if page.not_found?
- end
- if !not_found.empty?
- puts "\n404's:"
- not_found.each do |url|
- if options.relative
- puts URI(url).path.to_s
- else
- puts url
- end
- num_linked_from = 0
- pages.urls_linking_to(url).each do |u|
- u = u.path if options.relative
- num_linked_from += 1
- puts " linked from #{u}"
- if num_linked_from > 10
- puts " ..."
- break
- end
- end
- end
-
- print "\n"
- end
-
- # remove redirect aliases, and calculate pagedepths
- pages = pages.shortest_paths!(root).uniq
- depths = pages.values.inject({}) do |depths, page|
- depths[page.depth] ||= 0
- depths[page.depth] += 1
- depths
- end
-
- # print the page count
- puts "Total pages: #{pages.size}\n"
-
- # print a list of depths
- depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-
- # output a list of urls to file
- file = open(options.output_file, 'w')
- pages.each_key do |url|
- url = options.relative ? url.path.to_s : url.to_s
- file.puts url
- end
-
- end
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality
+ # Meant to be run daily as a cron job
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ # -r, --relative Output relative URLs (rather than absolute)
+ # -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+ puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+ -r, --relative Output relative URLs (rather than absolute)
+ -o, --output filename Filename to save URL list to. Defautls to urls.txt.
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+ URI(ARGV.last)
+ rescue
+ usage
+ Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+ anemone.after_crawl do |pages|
+ puts "Crawl results for #{root}\n"
+
+ # print a list of 404's
+ not_found = []
+ pages.each_value do |page|
+ url = page.url.to_s
+ not_found << url if page.not_found?
+ end
+ if !not_found.empty?
+ puts "\n404's:"
+ not_found.each do |url|
+ if options.relative
+ puts URI(url).path.to_s
+ else
+ puts url
+ end
+ num_linked_from = 0
+ pages.urls_linking_to(url).each do |u|
+ u = u.path if options.relative
+ num_linked_from += 1
+ puts " linked from #{u}"
+ if num_linked_from > 10
+ puts " ..."
+ break
+ end
+ end
+ end
+
+ print "\n"
+ end
+
+ # remove redirect aliases, and calculate pagedepths
+ pages = pages.shortest_paths!(root).uniq
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ # print the page count
+ puts "Total pages: #{pages.size}\n"
+
+ # print a list of depths
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+ # output a list of urls to file
+ file = open(options.output_file, 'w')
+ pages.each_key do |url|
+ url = options.relative ? url.path.to_s : url.to_s
+ file.puts url
+ end
+
+ end
  end
data/bin/anemone_pagedepth.rb CHANGED
@@ -1,39 +1,44 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and outputs a count of
- # the number of Pages at each depth in the site.
- #
- # == Usage
- # anemone_pagedepth.rb url
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'rdoc/usage'
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- RDoc::usage()
- Process.exit
- end
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
- anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
-
- anemone.after_crawl do |pages|
- pages = pages.shortest_paths!(root).uniq
- depths = pages.values.inject({}) do |depths, page|
- depths[page.depth] ||= 0
- depths[page.depth] += 1
- depths
- end
-
- depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
- end
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+ puts <<END
+ Usage: anemone_pagedepth.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+ URI(ARGV[0])
+ rescue
+ usage
+ Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+ anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+ anemone.after_crawl do |pages|
+ pages = pages.shortest_paths!(root).uniq
+ depths = pages.values.inject({}) do |depths, page|
+ depths[page.depth] ||= 0
+ depths[page.depth] += 1
+ depths
+ end
+
+ depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+ end
  end
data/bin/anemone_serialize.rb CHANGED
@@ -1,43 +1,51 @@
- #! /usr/bin/env ruby
- # == Synopsis
- # Crawls a site starting at the given URL, and saves the resulting
- # PageHash object to a file using Marshal serialization.
- #
- # == Usage
- # anemone_serialize.rb [options] url
- #
- # == Options
- # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
- #
- # == Author
- # Chris Kite
-
- $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
- require 'anemone'
- require 'optparse'
- require 'rdoc/usage'
- require 'ostruct'
-
- # make sure that the first option is a URL we can crawl
- begin
- URI(ARGV[0])
- rescue
- RDoc::usage()
- Process.exit
- end
-
- options = OpenStruct.new
- options.output_file = "crawl.#{Time.now.to_i}"
-
- # parse command-line options
- opts = OptionParser.new
- opts.on('-o', '--output filename') {|o| options.output_file = o }
- opts.parse!(ARGV)
-
- root = ARGV[0]
- Anemone.crawl(root) do |anemone|
- anemone.after_crawl do |pages|
- open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
- end
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ # -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+ puts <<END
+ Usage: anemone_serialize.rb [options] url
+
+ Options:
+ -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+ URI(ARGV[0])
+ rescue
+ usage
+ Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+ anemone.after_crawl do |pages|
+ open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+ end
  end
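
anemone_serialize.rb only writes the PageHash out; reading it back in a later session is plain Marshal. A minimal sketch, assuming a dump file named crawl.1234567890 written by the script above (the filename is a placeholder):

  require 'anemone'   # defines the Page/PageHash classes Marshal needs to rebuild

  # Load a previously serialized crawl and report how many pages it holds.
  pages = open("crawl.1234567890", "rb") { |f| Marshal.load(f) }
  puts "Pages in saved crawl: #{pages.size}"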