anemone 0.0.2 → 0.0.3
- data/README.txt +17 -17
- data/bin/anemone_count.rb +36 -31
- data/bin/anemone_cron.rb +107 -98
- data/bin/anemone_pagedepth.rb +43 -38
- data/bin/anemone_serialize.rb +50 -42
- data/bin/anemone_url_list.rb +54 -46
- data/bin/anemone_url_list.rb~ +58 -0
- data/lib/anemone.rb +1 -1
- data/lib/anemone/anemone.rb +36 -36
- data/lib/anemone/core.rb +181 -179
- data/lib/anemone/http.rb +36 -36
- data/lib/anemone/page.rb +184 -159
- data/lib/anemone/page_hash.rb +82 -82
- data/lib/anemone/tentacle.rb +30 -30
- metadata +10 -9
data/README.txt
CHANGED
@@ -1,18 +1,18 @@
-= Anemone
-
-== DESCRIPTION
-Anemone is a web spider framework that can spider a domain and collect useful
-information about the pages it visits. It is versatile, allowing you to
-write your own specialized spider tasks quickly and easily.
-
-== FEATURES
-* Multi-threaded design for high performance
-* Tracks 301 HTTP redirects to understand a page's aliases
-* Built-in BFS algorithm for determining page depth
-* Allows exclusion of URLs based on regular expressions
-
-== REQUIREMENTS
-* hpricot
-
-== EXAMPLES
+= Anemone
+
+== DESCRIPTION
+Anemone is a web spider framework that can spider a domain and collect useful
+information about the pages it visits. It is versatile, allowing you to
+write your own specialized spider tasks quickly and easily.
+
+== FEATURES
+* Multi-threaded design for high performance
+* Tracks 301 HTTP redirects to understand a page's aliases
+* Built-in BFS algorithm for determining page depth
+* Allows exclusion of URLs based on regular expressions
+
+== REQUIREMENTS
+* hpricot
+
+== EXAMPLES
 See the +bin+ directory for several examples of useful Anemone tasks.
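Every line of the README hunk is rewritten, yet the visible text is identical on both sides, so the change is presumably whitespace or line endings. Its EXAMPLES section defers to the bin/ scripts that follow; as orientation, here is a minimal sketch of the crawl API those scripts use (the start URL and skip pattern are hypothetical, while Anemone.crawl, skip_links_like, after_crawl, shortest_paths!, and Page#depth all appear in the 0.0.3 code below):

#! /usr/bin/env ruby
require 'anemone'

root = 'http://example.com'   # hypothetical start URL

Anemone.crawl(root) do |anemone|
  # regex-based URL exclusion, per the FEATURES list
  anemone.skip_links_like %r{/private/}

  anemone.after_crawl do |pages|
    # collapse 301-redirect aliases, then report each page's BFS depth
    pages = pages.shortest_paths!(root).uniq
    pages.each_value { |page| puts "#{page.url} depth=#{page.depth}" }
  end
end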
data/bin/anemone_count.rb
CHANGED
@@ -1,31 +1,36 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs the total number
-# of unique pages on the site.
-#
-# == Usage
-# anemone_count.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
[old lines 16-31 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs the total number
+# of unique pages on the site.
+#
+# == Usage
+# anemone_count.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+
+def usage
+  puts <<END
+Usage: anemone_count.rb url
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+Anemone.crawl(ARGV[0]) do |anemone|
+  anemone.after_crawl do |pages|
+    puts pages.uniq.size
+  end
+end
+
+
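The usage/URI check added at the top of anemone_count.rb (and repeated in the other 0.0.3 scripts) leans on Kernel#URI raising for unusable input: a malformed string raises URI::InvalidURIError, and a missing argument (nil) also raises, so the bare rescue routes both failure modes to the usage message. A standalone sketch of the same check:

require 'uri'

# URI() raises for malformed strings (URI::InvalidURIError) and for nil,
# so one rescue covers both "bad" and "missing" arguments.
# Note: any parseable string passes; 'foo' is a valid generic URI, so
# this checks parseability, not crawlability.
def crawlable?(arg)
  URI(arg)
  true
rescue StandardError
  false
end

puts crawlable?('http://example.com')   # true
puts crawlable?('not a uri')            # false
puts crawlable?(nil)                    # false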
data/bin/anemone_cron.rb
CHANGED
@@ -1,99 +1,108 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Performs pagedepth, url list, and count functionality
-# Meant to be run daily as a cron job
-#
-# == Usage
-# anemone_url_list.rb [options] url
-#
-# == Options
-# -r, --relative Output relative URLs (rather than absolute)
-# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
[old lines 21-98 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Performs pagedepth, url list, and count functionality
+# Meant to be run daily as a cron job
+#
+# == Usage
+# anemone_url_list.rb [options] url
+#
+# == Options
+# -r, --relative Output relative URLs (rather than absolute)
+# -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_url_list.rb [options] url
+
+Options:
+-r, --relative Output relative URLs (rather than absolute)
+-o, --output filename Filename to save URL list to. Defautls to urls.txt.
+END
+end
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+# make sure that the last option is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  usage
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV.last
+
+Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+  anemone.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    if !not_found.empty?
+      puts "\n404's:"
+      not_found.each do |url|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
+        num_linked_from = 0
+        pages.urls_linking_to(url).each do |u|
+          u = u.path if options.relative
+          num_linked_from += 1
+          puts " linked from #{u}"
+          if num_linked_from > 10
+            puts " ..."
+            break
+          end
+        end
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file
+    file = open(options.output_file, 'w')
+    pages.each_key do |url|
+      url = options.relative ? url.path.to_s : url.to_s
+      file.puts url
+    end
+
+  end
 end
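The option handling added to anemone_cron.rb is a common stdlib pairing: defaults live on an OpenStruct and OptionParser overwrites them in place, which is why 'ostruct' joins 'optparse' in the new requires. A self-contained sketch of the pattern (the argument vector is illustrative):

require 'optparse'
require 'ostruct'

# defaults on an OpenStruct; OptionParser callbacks mutate them in place
options = OpenStruct.new
options.relative = false
options.output_file = 'urls.txt'

opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(%w[-r -o list.txt http://example.com])   # hypothetical ARGV

puts options.relative      # true
puts options.output_file   # "list.txt"

parse! also strips the recognized switches and their arguments from the array, leaving the trailing URL in place, which is what lets the script read ARGV.last for the crawl root.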
data/bin/anemone_pagedepth.rb
CHANGED
@@ -1,39 +1,44 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and outputs a count of
-# the number of Pages at each depth in the site.
-#
-# == Usage
-# anemone_pagedepth.rb url
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-
[old lines 16-38 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and outputs a count of
+# the number of Pages at each depth in the site.
+#
+# == Usage
+# anemone_pagedepth.rb url
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+
+def usage
+  puts <<END
+Usage: anemone_pagedepth.rb url
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+  anemone.after_crawl do |pages|
+    pages = pages.shortest_paths!(root).uniq
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+  end
 end
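The inject block that tallies depths (here and in anemone_cron.rb) is a histogram idiom: the hash must be the block's final expression, because inject feeds each block's return value back in as the next accumulator. A standalone sketch with made-up depth values:

# build {depth => count}; the trailing `counts` is load-bearing, since
# inject uses the block's return value as the next accumulator
depths = [0, 1, 1, 2, 2, 2]

histogram = depths.inject({}) do |counts, depth|
  counts[depth] ||= 0
  counts[depth] += 1
  counts
end

histogram.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
# Depth: 0 Count: 1
# Depth: 1 Count: 2
# Depth: 2 Count: 3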
data/bin/anemone_serialize.rb
CHANGED
@@ -1,43 +1,51 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Crawls a site starting at the given URL, and saves the resulting
-# PageHash object to a file using Marshal serialization.
-#
-# == Usage
-# anemone_serialize.rb [options] url
-#
-# == Options
-# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require '
[old lines 20-27 not captured in the source diff view]
-end
[old lines 29-42 not captured in the source diff view]
+#! /usr/bin/env ruby
+# == Synopsis
+# Crawls a site starting at the given URL, and saves the resulting
+# PageHash object to a file using Marshal serialization.
+#
+# == Usage
+# anemone_serialize.rb [options] url
+#
+# == Options
+# -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
+def usage
+  puts <<END
+Usage: anemone_serialize.rb [options] url
+
+Options:
+-o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+END
+end
+
+# make sure that the first option is a URL we can crawl
+begin
+  URI(ARGV[0])
+rescue
+  usage
+  Process.exit
+end
+
+options = OpenStruct.new
+options.output_file = "crawl.#{Time.now.to_i}"
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV[0]
+Anemone.crawl(root) do |anemone|
+  anemone.after_crawl do |pages|
+    open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+  end
 end
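anemone_serialize.rb only writes the crawl; reading it back is the mirror-image call. A minimal sketch, assuming the anemone lib is on the load path and a dump file produced by the script's -o option (the filename here is hypothetical):

require 'anemone'   # defines the PageHash class being unmarshaled

# Marshal.load is the inverse of the Marshal.dump call in the script
pages = open('crawl.1234567890', 'rb') { |f| Marshal.load(f) }
puts pages.size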