RubyGems - imw - Versions diffs - 0.1.0 - Mend

imw 0.1.0

Files changed (111) hide show

data/.gitignore +15 -0
data/CHANGELOG +0 -0
data/LICENSE +674 -0
data/README.rdoc +101 -0
data/Rakefile +20 -0
data/VERSION +1 -0
data/etc/imwrc.rb +76 -0
data/lib/imw.rb +42 -0
data/lib/imw/boot.rb +58 -0
data/lib/imw/dataset.rb +233 -0
data/lib/imw/dataset/datamapper.rb +66 -0
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
data/lib/imw/dataset/loaddump.rb +50 -0
data/lib/imw/dataset/old/file_collection.rb +88 -0
data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
data/lib/imw/dataset/scaffold.rb +132 -0
data/lib/imw/dataset/scraped_uri.rb +305 -0
data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
data/lib/imw/dataset/scrub/scrub.rb +147 -0
data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
data/lib/imw/dataset/scrub/slug.rb +101 -0
data/lib/imw/dataset/stats.rb +73 -0
data/lib/imw/dataset/stats/counter.rb +23 -0
data/lib/imw/dataset/task.rb +38 -0
data/lib/imw/dataset/workflow.rb +81 -0
data/lib/imw/files.rb +110 -0
data/lib/imw/files/archive.rb +113 -0
data/lib/imw/files/basicfile.rb +122 -0
data/lib/imw/files/binary.rb +28 -0
data/lib/imw/files/compressed_file.rb +93 -0
data/lib/imw/files/compressed_files_and_archives.rb +348 -0
data/lib/imw/files/compressible.rb +103 -0
data/lib/imw/files/csv.rb +112 -0
data/lib/imw/files/json.rb +41 -0
data/lib/imw/files/sgml.rb +65 -0
data/lib/imw/files/text.rb +68 -0
data/lib/imw/files/yaml.rb +46 -0
data/lib/imw/packagers.rb +8 -0
data/lib/imw/packagers/archiver.rb +108 -0
data/lib/imw/packagers/s3_mover.rb +28 -0
data/lib/imw/parsers.rb +7 -0
data/lib/imw/parsers/html_parser.rb +382 -0
data/lib/imw/parsers/html_parser/matchers.rb +306 -0
data/lib/imw/parsers/line_parser.rb +87 -0
data/lib/imw/parsers/regexp_parser.rb +72 -0
data/lib/imw/utils.rb +24 -0
data/lib/imw/utils/components.rb +61 -0
data/lib/imw/utils/config.rb +46 -0
data/lib/imw/utils/error.rb +54 -0
data/lib/imw/utils/extensions/array.rb +125 -0
data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
data/lib/imw/utils/extensions/core.rb +43 -0
data/lib/imw/utils/extensions/dir.rb +24 -0
data/lib/imw/utils/extensions/file_core.rb +64 -0
data/lib/imw/utils/extensions/hash.rb +218 -0
data/lib/imw/utils/extensions/hpricot.rb +48 -0
data/lib/imw/utils/extensions/string.rb +49 -0
data/lib/imw/utils/extensions/struct.rb +42 -0
data/lib/imw/utils/extensions/symbol.rb +28 -0
data/lib/imw/utils/extensions/typed_struct.rb +22 -0
data/lib/imw/utils/extensions/uri.rb +59 -0
data/lib/imw/utils/log.rb +67 -0
data/lib/imw/utils/misc.rb +63 -0
data/lib/imw/utils/paths.rb +115 -0
data/lib/imw/utils/uri.rb +59 -0
data/lib/imw/utils/uuid.rb +33 -0
data/lib/imw/utils/validate.rb +38 -0
data/lib/imw/utils/version.rb +12 -0
data/lib/imw/utils/view.rb +113 -0
data/lib/imw/utils/view/dump_csv.rb +112 -0
data/lib/imw/utils/view/dump_csv_older.rb +117 -0
data/spec/data/sample.csv +131 -0
data/spec/data/sample.tsv +131 -0
data/spec/data/sample.txt +131 -0
data/spec/data/sample.xml +653 -0
data/spec/data/sample.yaml +652 -0
data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
data/spec/imw/files/archive_spec.rb +118 -0
data/spec/imw/files/basicfile_spec.rb +121 -0
data/spec/imw/files/bz2_spec.rb +32 -0
data/spec/imw/files/compressed_file_spec.rb +96 -0
data/spec/imw/files/compressible_spec.rb +100 -0
data/spec/imw/files/file_spec.rb +144 -0
data/spec/imw/files/gz_spec.rb +32 -0
data/spec/imw/files/rar_spec.rb +33 -0
data/spec/imw/files/tar_spec.rb +31 -0
data/spec/imw/files/text_spec.rb +23 -0
data/spec/imw/files/zip_spec.rb +31 -0
data/spec/imw/files_spec.rb +38 -0
data/spec/imw/packagers/archiver_spec.rb +125 -0
data/spec/imw/packagers/s3_mover_spec.rb +7 -0
data/spec/imw/parsers/line_parser_spec.rb +96 -0
data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
data/spec/imw/utils/extensions/find_spec.rb +113 -0
data/spec/imw/utils/paths_spec.rb +38 -0
data/spec/imw/workflow/rip/local_spec.rb +89 -0
data/spec/imw/workflow/rip_spec.rb +27 -0
data/spec/rcov.opts +1 -0
data/spec/spec.opts +4 -0
data/spec/spec_helper.rb +32 -0
data/spec/support/archive_contents_matcher.rb +94 -0
data/spec/support/custom_matchers.rb +21 -0
data/spec/support/directory_contents_matcher.rb +61 -0
data/spec/support/extensions.rb +18 -0
data/spec/support/file_contents_matcher.rb +50 -0
data/spec/support/random.rb +210 -0
data/spec/support/without_regard_to_order_matcher.rb +58 -0
metadata +196 -0

data/lib/imw/dataset/scrub/scrub_simple_url.rb ADDED

@@ -0,0 +1,38 @@
+module IMW
+  module URIScrubber
+    def scrubbed
+      to_dirpath
+    end
+  end
+end
+module Scrub
+  #
+  # start with a letter, and contain only A-Za-z0-9_
+  #
+  class SimplifiedURL < Scrub::Generic
+    self.complaint  = "should follow our zany simplified URL rules: com.host.dot-reversed:schemeifnothttp/path/seg_men-ts/stuff.ext-SHA1ifweird"
+    self.validator  = %r{#{Addressable::URI::SAFE_CHARS}#{Addressable::URI::RESERVED_CHARS}}u
+    self.replacer   = ''
+    include Scrub::Lowercased
+    attr_accessor :uri
+    def valid? str
+      str.to_s.downcase == sanitize(str)
+    end
+    def sanitize str
+      # if this fails just normalize once, or don't set $KCODE: http://bit.ly/1664vp
+      uri = Addressable::URI.heuristic_parse(str.to_s).normalize
+      # print [uri.host, uri.host_valid?, uri.path, uri.path_valid?].inspect
+      if uri.host_valid?
+        uri.scrubbed
+      else
+        uri.uuid_path
+      end
+    end
+  end
+end

data/lib/imw/dataset/scrub/scrub_test.rb ADDED

@@ -0,0 +1,60 @@
+#!/usr/bin/env ruby
+# -*- coding: utf-8 -*-
+require 'scrub'
+require 'scrub_simple_url'
+test_strings = [
+  nil, '', '12', '123', 'simple', 'UPPER', 'CamelCased', 'iden_tifier_23_',
+  'twentyfouralphacharslong', 'twentyfiveatozonlyletters', 'hello.-_there@funnychar.com',
+  "tab\t", "newline\n",
+  "Iñtërnâtiônàlizætiøn",
+  'semicolon;', 'quote"', 'tick\'', 'backtick`', 'percent%', 'plus+', 'space ',
+  'leftanglebracket<', 'ampersand&',
+  "control char-bel\x07",
+  "http://foo.bar.com/",
+  "HTTP://FOO.BAR.com",
+  ".com/zazz",
+  "scheme://user_name@user_acct:passwd@host-name.museum:9047/path;pathquery/p!a-th~2/path?query=param&amp;query=pa%20ram#fragment",
+  "http://web.site.com/path/path/file.ext",
+  "ftp://ftp.site.com/path/path/file.ext",
+  "/absolute/pathname/file.ext",
+  "http://foo.bar.com/.hidden_file_with.ext",
+  "http://foo.bar.com/.hidden_file",
+  "dir/--/non_alpha_path_segment.ext",
+  "http://foo.bar.com/dir/../two_dots_in_path",
+]
+scrubbers = {
+  # :unicode_title   => Scrub::UnicodeTitle.new,
+  # :title           => Scrub::Title.new,
+  # :identifier      => Scrub::Identifier.new,
+  # :free_text       => Scrub::FreeText.new,
+  :handle        => Scrub::Handle.new,
+  :simplified_url  => Scrub::SimplifiedURL.new,
+  # :domain        => Scrub::Domain.new,
+  # :email         => Scrub::Email.new,
+}
+scrubbers.each do |scrubber_name, scrubber|
+  puts scrubber_name
+  results = test_strings.map do |test_string|
+    [!!scrubber.valid?(test_string), scrubber.sanitize(test_string).inspect, test_string.inspect ]
+  end
+  results.sort_by{|val,san,orig| val ? 1 : -1 }.each do |val,san,orig|
+    puts "  %-5s %-30s %-30s" % [val,san,orig]
+  end
+end
+# 'foo@bar.com', 'foo@newskool-tld.museum', 'foo@twoletter-tld.de', 'foo@nonexistant-tld.qq',
+#         'r@a.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail.com',
+#         'hello.-_there@funnychar.com', 'uucp%addr@gmail.com', 'hello+routing-str@gmail.com',
+#         'domain@can.haz.many.sub.doma.in',],
+#       :invalid => [nil, '', '!!@nobadchars.com', 'foo@no-rep-dots..com', 'foo@badtld.xxx', 'foo@toolongtld.abcdefg',
+#         'Iñtërnâtiônàlizætiøn@hasnt.happened.to.email', 'need.domain.and.tld@de', "tab\t", "newline\n",
+#         'r@.wk', '1234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890-234567890@gmail2.com',
+#         # these are technically allowed but not seen in practice:
+#         'uucp!addr@gmail.com', 'semicolon;@gmail.com', 'quote"@gmail.com', 'tick\'@gmail.com', 'backtick`@gmail.com', 'space @gmail.com', 'bracket<@gmail.com', 'bracket>@gmail.com'

data/lib/imw/dataset/scrub/slug.rb ADDED

@@ -0,0 +1,101 @@
+require 'rubygems'
+require 'addressable/uri'
+require 'uuidtools'
+require 'scrub'
+require 'scrub_simple_url'
+module IMW
+  #
+  #
+  # +handle+ -- reasonable effort at a uniq-ish, but human-comprehensible string
+  # Handle should only contain the characters A-Za-z0-9_-./
+  #
+  #
+  class Slug
+    # A humane representation of the handle ('that-one-time-at_foo')
+    attr_reader :handle
+    # The purportedly unique string ('')
+    attr_accessor :uniqish
+    def initialize handle
+      self.handle = handle
+      self.uniqish  = handle
+    end
+    #
+    # Unless overridden, use the uniqish to
+    # make a name-based UUID within the infochimps.org
+    # namespace
+    #
+    def uuid
+      UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
+    end
+    # Handle with only \w characters -- safe for everything there be
+    def url_sane
+      return '' if !handle
+      handle.gsub(/[^\w\/\:]+/, '-').gsub(/_/, '__').gsub(%r{[/:]+}, '_')
+    end
+    def handle= t
+      @handle = self.class.sanitize_handle(t)
+    end
+    # Strip all but handle-safe characters
+    def self.sanitize_handle t, turd='-'
+      t = t.gsub(%r{[^\w\-\./]+}, turd)
+    end
+  end
+  #
+  # Uses a URL (that's locator, not URI) as a
+  # presumed-uniq identifier.
+  #
+  # +uniqish+ returns the full normalized URL
+  #
+  # +handle+ is formed from the dot-reversed host, the scheme (if not http) and a
+  # sanitized version of the path. (The query string, fragment, etc are stripped
+  # from the handle)
+  #
+  #
+  class URLSlug < Slug
+    attr_accessor :url
+    def initialize url_str
+      self.url     = Addressable::URI.heuristic_parse(url_str).normalize
+      raise "Bad URL #{url}" unless url.host
+      self.uniqish = url.to_s
+      self.handle   = munge_url
+    end
+    def uuid
+      UUID.sha1_create(UUID_URL_NAMESPACE, full_handle)
+    end
+  end
+end
+module Sluggable
+  protected
+  def create_slug
+    "Slugging #{self.attributes}"
+    if (self.class.slug_on == :url) || (self.name.blank?)
+      slug = IMW::URLSlug.new(self.url)
+      self.name = slug.handle
+    else
+      slug = IMW::Slug.new(self.name)
+    end
+    self.handle ||= slug.handle
+  end
+  public
+  def self.included base
+    base.before :save, :create_slug
+    base.class_eval do
+      def self.slug_on s=nil
+        @slug_on ||= s
+      end
+    end
+  end
+end

data/lib/imw/dataset/stats.rb ADDED

@@ -0,0 +1,73 @@
+#
+# h2. lib/imw/dataset/stats.rb -- statistics for datasets
+#
+# == About
+#
+# Implements methods to calculate very basic statistical properties of
+# a dataset.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+module IMW
+  class Dataset
+    #
+    # simple histogram
+    #
+    # Runs down one column/attribute of a dataset
+    # returning counts for that column
+    #
+    def hist slicer
+      counts = { }
+      els = slice(slicer)
+      els.each do |el|
+        counts[el] ||= 0
+        counts[el]  += 1
+      end
+      self.class.new(counts.map{ |el,ct| [ct,el] })
+    end
+    def slice slicer
+      case
+      when slicer.respond_to?(:call) then self.map{ |row| slicer.call(row) }
+      else
+        self.map{ |row| row[slicer] }
+      end
+    end
+    #
+    # Report
+    #
+    def report slicer, opts={}
+      opts.reverse_merge! :n_top => 20, :hist_args => [], :fmt => "%7d\t%s", :do_hist => true
+      counts = hist(slicer)
+      report_hist  data, counts, slicer, opts if opts[:do_hist]
+      report_sizes data, counts, slicer, opts
+    end
+    def report_sizes data, counts, slicer, opts={}
+      fmt  = opts[:fmt]
+      puts fmt % [counts.length,              "unique elements"]
+      puts fmt % [data.length,                "total elements"]
+      puts fmt % [counts.find_all(&:nil?).length, "nil elements"]
+      uniqvals = counts.map{|ct,el| el}.reject(&:nil?)
+      puts " min:\t#{uniqvals.min}"
+      puts " max:\t#{uniqvals.max}"
+    end
+    # Most popular
+    def report_hist data, counts, slicer, opts={}
+      top = counts.sort_by{|ct,el| ct}[-opts[:n_top]..-1]
+      puts "Top #{opts[:n_top]} elements for slice through #{slicer}:"
+      puts " -freq-\t-element-"
+      puts top.map{ |ct,el| opts[:fmt] % [ct,el] }
+      puts "-------\t-------"
+    end
+  end
+end

data/lib/imw/dataset/stats/counter.rb ADDED

@@ -0,0 +1,23 @@
+module IMW
+  class RecordCounter < Hash
+    def record val
+      self[val] ||= 0
+      self[val]  += 1
+    end
+    def if_seen val, &block
+      if self[val]
+        yield
+      end
+      record val
+    end
+    def unless_seen val, &block
+      unless self[val]
+        yield
+      end
+      record val
+    end
+  end
+end

data/lib/imw/dataset/task.rb ADDED

@@ -0,0 +1,38 @@
+#
+# h2. lib/imw/dataset/task.rb -- defines IMW::Task
+#
+# == About
+#
+# This file defines a class <tt>IMW::Task</tt> which subclasses
+# <tt>Rake::Task</tt>.  Tasks defined in IMW should be instances of
+# <tt>IMW::Task</tt>.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'rake'
+module IMW
+  class Task < Rake::Task
+  end
+  class Dataset
+    include Rake::TaskManager
+    # Return a new (or existing) <tt>IMW::Task</tt> with the given
+    # +name+.  Dependencies can be declared and a block passed in just
+    # as in Rake.
+    def task name, &block
+      self.define_task IMW::Task, name, &block
+    end
+  end
+end

data/lib/imw/dataset/workflow.rb ADDED

@@ -0,0 +1,81 @@
+#
+# lib/imw/dataset/workflow.rb -- implements the workflow module
+#
+# == About
+#
+# This file implements the <tt>IMW::Workflow</tt> module which tailors
+# the functionality of Rake for IMW objects.
+#
+# Author::    Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+require 'imw/dataset/scaffold'
+require 'imw/dataset/task'
+module IMW
+  # The <tt>IMW::Workflow</tt> module is a collection of methods which
+  # define Rake[http://rake.rubyforge.org/] tasks specialized for each
+  # dataset.
+  module Workflow
+    # The functions called here define the default tasks associated
+    # with each dataset.
+    def create_default_tasks
+      create_directories_task
+      create_symlinks_task
+      create_initialize_task
+      create_delete_data_task
+      create_destroy_task
+      create_workflow_tasks
+    end
+    # Sets the default tasks in this workflow.
+    #
+    # The default tasks constitute a set of consecutive actions that
+    # must be taken in order: <tt>:rip</tt>, <tt>parse</tt>,
+    # <tt>munge</tt>, <tt>fix</tt>, and <tt>package</tt>.  Each task
+    # is a <tt>Rake::Task</tt> which depends on the one before it.
+    #
+    # Each task does nothing by default other than create directories
+    # to hold files for this dataset as it undergoes the workflow.
+    def set_default_tasks
+      define_task(Rake::Task, {:rip => []})
+      define_task(Rake::Task, {:parse => :rip})
+      define_task(Rake::Task, {:munge => :parse})
+      define_task(Rake::Task, {:fix => :munge})
+      define_task(Rake::Task, {:package => :fix})
+      comment_default_tasks
+    end
+    # Set the initial comments for each of the default tasks.
+    def comment_default_tasks
+      self[:rip].comment = "Rip dataset from an origin"
+      self[:parse].comment = "Parse dataset into intermediate form"
+      self[:munge].comment = "Munge dataset's structure into desired form"
+      self[:fix].comment = "Fix and format dataset"
+      self[:package].comment = "Package dataset into a final format"
+    end
+    # Creates the task dependency chain <tt>:package => :fix => :munge
+    # => :peel => :rip => :initialize</tt>.
+    def create_workflow_tasks
+      @last_description = "Obtain data from some source."
+      define_task(IMW::Task, :rip     => [:initialize])
+      @last_description = "Extract datafiles from ripped data."
+      define_task(IMW::Task, :peel    => [:rip])
+      @last_description = "Transform records in a dataset."
+      define_task(IMW::Task, :munge   => [:peel])
+      @last_description = "Reconcile records."
+      define_task(IMW::Task, :fix     => [:munge])
+      @last_description = "Package dataset in final form."
+      define_task(IMW::Task, :package => [:fix])
+    end
+  end
+end
+# puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree.  Ahhhh."

data/lib/imw/files.rb ADDED

@@ -0,0 +1,110 @@
+#
+# h2. lib/imw/files.rb -- uniform interface to various files
+#
+# == About
+#
+# Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
+# object given a URI.
+#
+# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
+# Copyright:: Copyright (c) 2008 infochimps.org
+# License::   GPL 3.0
+# Website::   http://infinitemonkeywrench.org/
+#
+# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
+require 'uri'
+require 'open-uri'
+require 'imw/utils'
+require 'imw/files/basicfile'
+require 'imw/files/archive'
+require 'imw/files/compressible'
+require 'imw/files/compressed_file'
+module IMW
+  # Parse +path+ and return an appropriate handler.  Pass in <tt>:write
+  # => true</tt> to open for writing.
+  #
+  #   IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv')
+  #
+  #
+  def self.open path, options = {}
+    mode = options[:write] ? 'w' : 'r'
+    Files.file_class_for(path, options).new(path, mode, options)
+  end
+  def self.open! path, options = {}
+    self.open path, options.reverse_merge(:write => true)
+  end
+  module Files
+    # There is certainly a cleaner way to do this.
+    autoload :Text,   'imw/files/text'
+    autoload :Binary, 'imw/files/binary'
+    autoload :Yaml,   'imw/files/yaml'
+    autoload :Csv,    'imw/files/csv'
+    autoload :Json,   'imw/files/json'
+    autoload :Bz2,    'imw/files/compressed_files_and_archives'
+    autoload :Gz,     'imw/files/compressed_files_and_archives'
+    autoload :Tar,    'imw/files/compressed_files_and_archives'
+    autoload :TarBz2, 'imw/files/compressed_files_and_archives'
+    autoload :TarGz,  'imw/files/compressed_files_and_archives'
+    autoload :Rar,    'imw/files/compressed_files_and_archives'
+    autoload :Zip,    'imw/files/compressed_files_and_archives'
+    autoload :Xml,    'imw/files/sgml'
+    autoload :Html,   'imw/files/sgml'
+    # An array used to match files to classes to handle them.  The
+    # first element of each array is the regexp and the second names
+    # the class to handle the file.
+    #
+    #  IMW::Files::EXTENSION_HANDLERS << [ /\.csv$/, :csv ] #=> IMW::Files::Csv
+    #  IMW::Files::EXTENSION_HANDLERS << [ /\.txt$/, "Text" ] #=> IMW::Files::Text
+    #  IMW::Files::EXTENSION_HANDLERS << [ /\.myclass%/, MyClass ] #=> MyClass
+    #
+    # Elements at the end of the array have greater precedence which
+    # allows, say, <tt>.tar.gz</tt> to be handled differently from
+    # <tt>.gz</tt>.
+    EXTENSION_HANDLERS = [
+                          [/./,           :Text], # catchall
+                          [/\.txt$/,      :Text],
+                          [/\.txt$/,      :Text],
+                          [/\.dat$/,      :Text],
+                          [/\.ascii$/,    :Text],
+                          [/\.yaml$/,     :Yaml],
+                          [/\.yml$/,      :Yaml],
+                          [/\.csv$/,      :Csv],
+                          [/\.tsv$/,      :Tsv],
+                          [/\.json$/,     :Json],
+                          [/\.bz2$/,      :Bz2],
+                          [/\.gz$/,       :Gz],
+                          [/\.tar\.bz2$/, :TarBz2],
+                          [/\.tbz2$/,     :TarBz2],
+                          [/\.tar\.gz$/,  :TarGz],
+                          [/\.tgz$/,      :TarGz],
+                          [/\.tar$/,      :Tar],
+                          [/\.rar$/,      :Rar],
+                          [/\.zip$/,      :Zip],
+                          [/\.xml$/,      :Xml],
+                          [/\.html$/,     :Html],
+                          [/\.htm$/,      :Html]
+                         ]
+    protected
+    def self.file_class_for path, options = {}
+      klass = options.delete(:as)
+      unless klass
+        EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
+          next unless regexp =~ path
+          klass = thing
+          break
+        end
+      end
+      klass.is_a?(Class) ? klass : class_eval(klass.to_s)
+    end
+  end
+end