chriskite-anemone 0.0.4

data/README.txt ADDED
@@ -0,0 +1,19 @@
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * nokogiri
+ * facets
+
+ == EXAMPLES
+ See the +bin+ directory for several examples of useful Anemone tasks.
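
A minimal sketch of such a task, using only the calls that appear in those scripts (the URL and regex are hypothetical):

    require 'anemone'

    Anemone.crawl("http://www.example.com/") do |anemone|
      # never enqueue links whose URLs match this (hypothetical) pattern
      anemone.skip_links_like %r{/private/}

      # runs once per page, as each page is fetched
      anemone.on_every_page do |page|
        puts page.url
      end

      # runs once at the end, with the PageHash of all crawled pages
      anemone.after_crawl do |pages|
        puts "Crawled #{pages.uniq.size} unique pages"
      end
    end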
data/bin/anemone_count.rb ADDED
@@ -0,0 +1,36 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_count.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
+
+
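
The counter takes the start URL as its only argument and prints a single integer, e.g. (hypothetical URL):

    ruby bin/anemone_count.rb http://www.example.com/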
data/bin/anemone_cron.rb ADDED
@@ -0,0 +1,109 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality.
+ # Meant to be run daily as a cron job.
+ #
+ # == Usage
+ # anemone_cron.rb [options] url
+ #
+ # == Options
+ # -r, --relative           Output relative URLs (rather than absolute)
+ # -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_cron.rb [options] url
+
+ Options:
+   -r, --relative           Output relative URLs (rather than absolute)
+   -o, --output filename    Filename to save URL list to. Defaults to urls.txt.
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     if !not_found.empty?
+       puts "\n404's:"
+       not_found.each do |url|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         num_linked_from = 0
+         pages.urls_linking_to(url).each do |u|
+           u = u.path if options.relative
+           num_linked_from += 1
+           puts "  linked from #{u}"
+           if num_linked_from > 10
+             puts "  ..."
+             break
+           end
+         end
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+     # output a list of urls to file, closing the handle when done
+     File.open(options.output_file, 'w') do |file|
+       pages.each_key do |url|
+         url = options.relative ? url.path.to_s : url.to_s
+         file.puts url
+       end
+     end
+
+   end
+ end
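
The depth report above builds a histogram with inject: the accumulator hash maps each depth to a page count and must be the block's return value. The same pattern in isolation, with plain integers standing in for Page depths:

    # hypothetical depths of five crawled pages
    page_depths = [0, 1, 1, 2, 1]

    histogram = page_depths.inject({}) do |counts, depth|
      counts[depth] ||= 0
      counts[depth] += 1
      counts  # inject passes this back in as the next accumulator
    end

    histogram.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
    # Depth: 0 Count: 1
    # Depth: 1 Count: 3
    # Depth: 2 Count: 1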
data/bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,44 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_pagedepth.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+   end
+ end
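
Note that the two skip_links_like patterns above appear specific to whatever site this script was first written against; to reuse it elsewhere, substitute patterns for your own site, e.g. (hypothetical paths):

    anemone.skip_links_like %r{^/calendar/}, %r{\?print=true$}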
data/bin/anemone_serialize.rb ADDED
@@ -0,0 +1,51 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ # -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_serialize.rb [options] url
+
+ Options:
+   -o, --output filename    Filename to save PageHash to. Defaults to crawl.{Time.now}
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     # write in binary mode, since Marshal output is binary data
+     open(options.output_file, 'wb') {|f| Marshal.dump(pages, f)}
+   end
+ end
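
A later process can restore the crawl with Marshal.load; a minimal sketch, assuming a dump file written by the script above (the filename is hypothetical):

    require 'anemone'  # the Page/PageHash classes must be defined before loading

    pages = open("crawl.1242500000", "rb") { |f| Marshal.load(f) }
    puts pages.size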
data/bin/anemone_url_list.rb ADDED
@@ -0,0 +1,54 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the URL of each page
+ # in the domain as they are encountered.
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ # -r, --relative    Output relative URLs (rather than absolute)
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+   -r, --relative    Output relative URLs (rather than absolute)
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+ end
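
Because URLs are printed as they are encountered, the script composes with ordinary shell tools, e.g. (hypothetical URL):

    ruby bin/anemone_url_list.rb -r http://www.example.com/ | sort > urls.txt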
metadata ADDED
@@ -0,0 +1,79 @@
+ --- !ruby/object:Gem::Specification
+ name: chriskite-anemone
+ version: !ruby/object:Gem::Version
+   version: 0.0.4
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-05-16 00:00:00 -07:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.3.0
+     version:
+ - !ruby/object:Gem::Dependency
+   name: facets
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 2.5.0
+     version:
+ description:
+ email:
+ executables:
+ - anemone_count.rb
+ - anemone_cron.rb
+ - anemone_pagedepth.rb
+ - anemone_serialize.rb
+ - anemone_url_list.rb
+ extensions: []
+
+ extra_rdoc_files:
+ - README.txt
+ files:
+ - README.txt
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.txt
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Anemone web-spider framework
+ test_files: []
+
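
As a GitHub-built gem of this era, it would typically be installed from the GitHub gem source (assuming that source is still configured):

    gem install chriskite-anemone --source http://gems.github.com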