chriskite-anemone 0.0.4

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
data/README.txt ADDED
@@ -0,0 +1,19 @@
+ = Anemone
+
+ == DESCRIPTION
+ Anemone is a web spider framework that can spider a domain and collect useful
+ information about the pages it visits. It is versatile, allowing you to
+ write your own specialized spider tasks quickly and easily.
+
+ == FEATURES
+ * Multi-threaded design for high performance
+ * Tracks 301 HTTP redirects to understand a page's aliases
+ * Built-in BFS algorithm for determining page depth
+ * Allows exclusion of URLs based on regular expressions
+
+ == REQUIREMENTS
+ * nokogiri
+ * facets
+
+ == EXAMPLES
+ See the +bin+ directory for several examples of useful Anemone tasks.
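
All of the scripts below share the same block-based API. As a quick orientation, here is a minimal sketch of that pattern, assembled only from calls that appear elsewhere in this package; the URL and the skip pattern are placeholders:

    require 'anemone'

    # Crawl a site, skipping URLs that match a pattern, and report results.
    # http://example.com and %r{/private/} are placeholder values.
    Anemone.crawl("http://example.com", :discard_page_bodies => true) do |anemone|
      # never follow links matching these expressions
      anemone.skip_links_like %r{/private/}

      # runs for every page as it is fetched
      anemone.on_every_page { |page| puts page.url }

      # runs once, with the full PageHash, after the crawl finishes
      anemone.after_crawl { |pages| puts "#{pages.uniq.size} unique pages" }
    end
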
bin/anemone_count.rb ADDED
@@ -0,0 +1,36 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the total number
+ # of unique pages on the site.
+ #
+ # == Usage
+ # anemone_count.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_count.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ Anemone.crawl(ARGV[0]) do |anemone|
+   anemone.after_crawl do |pages|
+     puts pages.uniq.size
+   end
+ end
+
+
bin/anemone_cron.rb ADDED
@@ -0,0 +1,108 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Performs pagedepth, url list, and count functionality.
+ # Meant to be run daily as a cron job.
+ #
+ # == Usage
+ # anemone_cron.rb [options] url
+ #
+ # == Options
+ #   -r, --relative        Output relative URLs (rather than absolute)
+ #   -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_cron.rb [options] url
+
+ Options:
+   -r, --relative        Output relative URLs (rather than absolute)
+   -o, --output filename Filename to save URL list to. Defaults to urls.txt.
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+ options.output_file = 'urls.txt'
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV.last
+
+ Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+   anemone.after_crawl do |pages|
+     puts "Crawl results for #{root}\n"
+
+     # print a list of 404's
+     not_found = []
+     pages.each_value do |page|
+       url = page.url.to_s
+       not_found << url if page.not_found?
+     end
+     if !not_found.empty?
+       puts "\n404's:"
+       not_found.each do |url|
+         if options.relative
+           puts URI(url).path.to_s
+         else
+           puts url
+         end
+         num_linked_from = 0
+         pages.urls_linking_to(url).each do |u|
+           u = u.path if options.relative
+           num_linked_from += 1
+           puts "  linked from #{u}"
+           if num_linked_from > 10
+             puts "  ..."
+             break
+           end
+         end
+       end
+
+       print "\n"
+     end
+
+     # remove redirect aliases, and calculate pagedepths
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     # print the page count
+     puts "Total pages: #{pages.size}\n"
+
+     # print a list of depths
+     depths.sort.each { |depth, count| puts "Depth: #{depth}  Count: #{count}" }
+
+     # output a list of urls to file
+     file = open(options.output_file, 'w')
+     pages.each_key do |url|
+       url = options.relative ? url.path.to_s : url.to_s
+       file.puts url
+     end
+     file.close
+
+   end
+ end
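
The depth tally in anemone_cron.rb (repeated in anemone_pagedepth.rb below) uses inject with an explicit hash seed. An equivalent formulation, shown here only as a sketch, uses a zero-default Hash and reads a little more directly:

    # Count pages per depth; behaves the same as the inject version above.
    depths = Hash.new(0)
    pages.values.each { |page| depths[page.depth] += 1 }
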
bin/anemone_pagedepth.rb ADDED
@@ -0,0 +1,44 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs a count of
+ # the number of Pages at each depth in the site.
+ #
+ # == Usage
+ # anemone_pagedepth.rb url
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+
+ def usage
+   puts <<END
+ Usage: anemone_pagedepth.rb url
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}
+
+   anemone.after_crawl do |pages|
+     pages = pages.shortest_paths!(root).uniq
+     depths = pages.values.inject({}) do |depths, page|
+       depths[page.depth] ||= 0
+       depths[page.depth] += 1
+       depths
+     end
+
+     depths.sort.each { |depth, count| puts "Depth: #{depth}  Count: #{count}" }
+   end
+ end
bin/anemone_serialize.rb ADDED
@@ -0,0 +1,51 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and saves the resulting
+ # PageHash object to a file using Marshal serialization.
+ #
+ # == Usage
+ # anemone_serialize.rb [options] url
+ #
+ # == Options
+ #   -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_serialize.rb [options] url
+
+ Options:
+   -o, --output filename Filename to save PageHash to. Defaults to crawl.{Time.now}
+ END
+ end
+
+ # make sure that the first option is a URL we can crawl
+ begin
+   URI(ARGV[0])
+ rescue
+   usage
+   Process.exit
+ end
+
+ options = OpenStruct.new
+ options.output_file = "crawl.#{Time.now.to_i}"
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-o', '--output filename') {|o| options.output_file = o }
+ opts.parse!(ARGV)
+
+ root = ARGV[0]
+ Anemone.crawl(root) do |anemone|
+   anemone.after_crawl do |pages|
+     open(options.output_file, 'w') {|f| Marshal.dump(pages, f)}
+   end
+ end
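
Reading a serialized crawl back in is the mirror image of the dump above, using Ruby's standard Marshal.load. A minimal sketch; the filename is a stand-in for whatever was passed to --output:

    # Anemone must be loaded so Marshal can reconstruct PageHash/Page objects.
    require 'anemone'

    # "crawl.1242457200" is a placeholder for the timestamped default name.
    pages = open("crawl.1242457200") { |f| Marshal.load(f) }
    puts "#{pages.uniq.size} unique pages restored"
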
bin/anemone_url_list.rb ADDED
@@ -0,0 +1,54 @@
+ #! /usr/bin/env ruby
+ # == Synopsis
+ # Crawls a site starting at the given URL, and outputs the URL of each page
+ # in the domain as they are encountered.
+ #
+ # == Usage
+ # anemone_url_list.rb [options] url
+ #
+ # == Options
+ #   -r, --relative Output relative URLs (rather than absolute)
+ #
+ # == Author
+ # Chris Kite
+
+ $:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+ require 'anemone'
+ require 'optparse'
+ require 'ostruct'
+
+ def usage
+   puts <<END
+ Usage: anemone_url_list.rb [options] url
+
+ Options:
+   -r, --relative Output relative URLs (rather than absolute)
+ END
+ end
+
+ options = OpenStruct.new
+ options.relative = false
+
+ # make sure that the last option is a URL we can crawl
+ begin
+   URI(ARGV.last)
+ rescue
+   usage
+   Process.exit
+ end
+
+ # parse command-line options
+ opts = OptionParser.new
+ opts.on('-r', '--relative') { options.relative = true }
+ opts.parse!(ARGV)
+
+ Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
+   anemone.on_every_page do |page|
+     if options.relative
+       puts page.url.path
+     else
+       puts page.url
+     end
+   end
+ end
metadata ADDED
@@ -0,0 +1,79 @@
+ --- !ruby/object:Gem::Specification
+ name: chriskite-anemone
+ version: !ruby/object:Gem::Version
+   version: 0.0.4
+ platform: ruby
+ authors:
+ - Chris Kite
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2009-05-16 00:00:00 -07:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 1.3.0
+     version:
+ - !ruby/object:Gem::Dependency
+   name: facets
+   type: :runtime
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: 2.5.0
+     version:
+ description:
+ email:
+ executables:
+ - anemone_count.rb
+ - anemone_cron.rb
+ - anemone_pagedepth.rb
+ - anemone_serialize.rb
+ - anemone_url_list.rb
+ extensions: []
+
+ extra_rdoc_files:
+ - README.txt
+ files:
+ - README.txt
+ has_rdoc: true
+ homepage: http://anemone.rubyforge.org
+ post_install_message:
+ rdoc_options:
+ - -m
+ - README.txt
+ - -t
+ - Anemone
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ requirements: []
+
+ rubyforge_project: anemone
+ rubygems_version: 1.2.0
+ signing_key:
+ specification_version: 2
+ summary: Anemone web-spider framework
+ test_files: []
+