RubyGems - imw - Versions diffs - 0.1.0 → 0.1.1 - Mend

imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

data/README.rdoc +194 -31
data/VERSION +1 -1
data/bin/imw +5 -0
data/lib/imw/boot.rb +0 -15
data/lib/imw/dataset/paths.rb +38 -0
data/lib/imw/dataset/task.rb +21 -18
data/lib/imw/dataset/workflow.rb +126 -65
data/lib/imw/dataset.rb +56 -82
data/lib/imw/files/basicfile.rb +3 -3
data/lib/imw/files/compressed_files_and_archives.rb +23 -37
data/lib/imw/files/csv.rb +2 -1
data/lib/imw/files/directory.rb +62 -0
data/lib/imw/files/excel.rb +84 -0
data/lib/imw/files/sgml.rb +4 -23
data/lib/imw/files.rb +62 -47
data/lib/imw/packagers/archiver.rb +19 -1
data/lib/imw/packagers/s3_mover.rb +8 -0
data/lib/imw/parsers/html_parser/matchers.rb +251 -268
data/lib/imw/parsers/html_parser.rb +181 -176
data/lib/imw/parsers.rb +1 -1
data/lib/imw/repository.rb +35 -0
data/lib/imw/runner.rb +114 -0
data/lib/imw/utils/extensions/core.rb +0 -16
data/lib/imw/utils/paths.rb +0 -28
data/lib/imw.rb +21 -32
metadata +11 -19
data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
data/lib/imw/dataset/datamapper.rb +0 -66
data/lib/imw/dataset/loaddump.rb +0 -50
data/lib/imw/dataset/old/file_collection.rb +0 -88
data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
data/lib/imw/dataset/scaffold.rb +0 -132
data/lib/imw/dataset/scraped_uri.rb +0 -305
data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
data/lib/imw/dataset/scrub/scrub.rb +0 -147
data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
data/lib/imw/dataset/scrub/slug.rb +0 -101
data/lib/imw/dataset/stats/counter.rb +0 -23
data/lib/imw/dataset/stats.rb +0 -73

data/lib/imw/dataset.rb CHANGED Viewed

@@ -1,50 +1,64 @@
-#
-# h2. lib/imw/dataset.rb -- imw dataset
-#
-# == About
-#
-# Defines basic properties of the <tt>IMW::Dataset</tt>
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom
-require 'rake'
-require 'ostruct'
 require 'imw/utils'
 require 'imw/dataset/workflow'
-require 'imw/dataset/loaddump'
-require 'imw/dataset/stats'
+require 'imw/dataset/paths'
 module IMW
-  # The basic unit in IMW is the dataset.  Each dataset has a handle
-  # which is meant to be unique (at least in the context of a
-  # particular pool of datasets, see <tt>IMW::Pool</tt>).  A dataset
-  # can also have a taxonomic classification or _taxon_
+  # The IMW::Dataset class is useful for organizing a complex data
+  # transformation because it is capable of managing a collection of
+  # paths and the interdependencies between subparts of the
+  # transformation.
+  #
+  # == Manipulating Paths
+  #
+  # Storing paths makes code shorter and more readable.  By default
+  # (this assumes the executing script is in a file
+  # /home/imw_user/data/foo.rb):
+  #
+  #   dataset = IMW::Dataset.new
+  #   dataset.path_to(:self)
+  #   #=> '/home/imw_user/data'
+  #   dataset.path_to(:ripd)
+  #   #=> '/home/imw_user/data/ripd'
+  #   dataset.path_to(:pkgd, 'final.tar.gz')
+  #   #=> '/home/imw_user/data/pkgd/final.tar.gz'
+  #
+  # Paths can be added
+  #
+  #   dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
+  #   dataset.path_to(:sorted_output)
+  #   #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
+  #
+  # as well as removed (via +remove_path+).
+  #
+  # == Defining Workflows
+  #
+  # IMW encourages you to think of transforming data as a network of
+  # interdependent steps (see IMW::Workflow).  Each of IMW's five
+  # default steps maps to a named directory remembered by each
+  # dataset.
   #
-  #   dataset = IMW::Dataset.new :recent_history_of_banana_prices,
-  #                              :taxon => [:economics,:alarming_trends]
+  # The following example shows why this is a useful abstraction as
+  # well as illustrating some of the other functionality in IMW.
   #
-  # but it isn't required like the handle.
+  # == Example Dataset
   #
-  # Processing a dataset commonly occurs in four course steps.  IMW
-  # defines a task[http://rake.rubyforge.org] for each of these steps
-  # and keeps files involved in different steps in different
-  # directories.
+  # The first step is to import IMW and create the dataset
+  #
+  #   require 'rubygems'
+  #   require 'imw'
+  #   dataset = IMW::Dataset.new
+  #
+  # You can pass in a handle (the name or "slug" for the dataset) as
+  # well as some options.  Now define the steps you intend to take to
+  # complete the transformation:
   #
   # rip::
-  #   Managed by the <tt>:rip</tt> task, data is collected from a
-  #   source (+http+, +ftp+, database, &c.) and deposited in a
-  #   subdirectory of the <tt>:ripd</tt> directory named for the URI
-  #   of the source.
+  #   Data is collected from a source (+http+, +ftp+, database, &c.)
+  #   and deposited in the <tt>:ripd</tt> directory of this dataset.
   #
   #     dataset.task :rip do
-  #       IMW::Rip.from_web 'http://econ.chimpu.edu/datasets/produce_prices.tar.bz2'
+  #       IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
   #         #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
   #
   #       IMW::Rip.from_database :named  => "weather_records",
@@ -174,59 +188,19 @@ module IMW
   # framework in which to program.
   class Dataset
-    # The <tt>Rake::TaskManager</tt> module allows the
-    # <tt>IMW::Dataset</tt> class to leverage the functionality of the
-    # Rake[http://rake.rubyforge.org/] library to manage tasks
-    # associated with the processing of this dataset.
-    include Rake::TaskManager
     # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
     # dataset processing.
     include IMW::Workflow
-    attr_reader   :handle, :taxon, :options
-    attr_accessor :data
-    # The default taxon assigned to a dataset.
-    DEFAULT_TAXON = nil
-    # Default options passed to <tt>Rake</tt>.  Any class including
-    # the <tt>Rake::TaskManager</tt> module must define a constant by
-    # this name.
-    DEFAULT_OPTIONS = {
-      :dry_run => false,
-      :trace   => false,
-      :verbose => false
-    }
+    attr_accessor :handle, :options, :data
-    # Create a new dataset.  Arguments include
-    #
-    #   <tt>:taxon</tt> (+DEFAULT_TAXON+):: a string or sequence
-    #   giving the taxonomic classification of the dataset.  See
-    #   <tt>IMW::Dataset.taxon=</tt> for more details on how this
-    #   argument is interpreted.
-    def initialize handle, options = {}
-      options = options.reverse_merge :taxon => DEFAULT_TAXON
-      # FIXME is this how the attribute writer functions should be
-      # called?
-      @handle = handle
-      @taxon = options[:taxon]
-      # for rake
-      @tasks = Hash.new
-      @rules = Array.new
-      @scope = Array.new
-      @last_description = nil
-      @options = OpenStruct.new(DEFAULT_OPTIONS)
-      create_default_tasks
-      # sets an empty @paths hash; see utils/paths.rb
+    def initialize options = {}
+      @options = options
+      @handle  = options[:handle]
+      initialize_workflow
+      set_root_paths
       set_paths
-    end
-    def handle= thing
-      @handle = thing.is_a?(String) ? thing.to_handle : thing
+      set_tasks
     end
   end

data/lib/imw/files/basicfile.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module IMW
       protected
       def uri= uri
-        @uri      = URI.parse(uri) if uri.is_a?(String)
+        @uri      = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
         @host     = self.uri.host
         @path     = local? ? ::File.expand_path(self.uri.path) : self.uri.path
         @dirname  = ::File.dirname path
@@ -53,7 +53,7 @@ module IMW
       # path as a first argument.
       [:executable?, :executable_real?, :file?, :directory?, :ftype, :owned?, :pipe?, :readable?, :readable_real?, :setgid?, :setuid?, :size, :size?, :socket?, :split, :stat, :sticky?, :writable?, :writable_real?, :zero?].each do |class_method|
         define_method class_method do
-          File.send(class_method, path) if local?
+          File.send(class_method, path)
         end
       end
@@ -61,7 +61,7 @@ module IMW
       # to open files online too to check.
       def exist?
         if local?
-          ::File.exist?(path) ? true : false
+          ::File.exist?(path)
         else
           begin
             true if open(uri)

data/lib/imw/files/compressed_files_and_archives.rb CHANGED Viewed

@@ -1,17 +1,3 @@
-#
-# h2. lib/imw/files/compressed_files_and_archives.rb -- require farm
-#
-# == About
-#
-# Just required all the archive and compressed formats (+tar+, +bz2+,
-# &c.)
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
 module IMW
   module Files
@@ -29,9 +15,9 @@ module IMW
       # The default flags used creating, appending to, listing, and
       # extracting a tar archive.
       DEFAULT_FLAGS = {
-        :create => "-cf",
-        :append => "-rf",
-        :list => "-tf",
+        :create  => "-cf",
+        :append  => "-rf",
+        :list    => "-tf",
         :extract => "-xf",
         :program => :tar
       }
@@ -39,10 +25,10 @@ module IMW
       def initialize uri, *args
         self.uri= uri
         @archive = {
-          :program => DEFAULT_FLAGS[:program],
-          :create_flags => DEFAULT_FLAGS[:create],
-          :append_flags => DEFAULT_FLAGS[:append],
-          :list_flags => DEFAULT_FLAGS[:list],
+          :program       => DEFAULT_FLAGS[:program],
+          :create_flags  => DEFAULT_FLAGS[:create],
+          :append_flags  => DEFAULT_FLAGS[:append],
+          :list_flags    => DEFAULT_FLAGS[:list],
           :extract_flags => DEFAULT_FLAGS[:extract]
         }
       end
@@ -51,9 +37,9 @@ module IMW
     # A class to wrap a <tt>tar.gz</tt> archive.
     #
     # Creation, appending, listing, and extraction flags are stored in
-    # <tt>IMW::Files::TarGz::DEFAULT_FLAGS</tt> and all are passed to
+    # <tt>IMW::Files::Targz::DEFAULT_FLAGS</tt> and all are passed to
     # the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
-    class TarGz
+    class Targz
       include IMW::Files::BasicFile
       include IMW::Files::Archive
@@ -63,21 +49,21 @@ module IMW
       # extracting a <tt>tar.gz</tt> archive.
       DEFAULT_FLAGS = {
         :decompression_program => :gzip,
-        :decompression_flags => '-fd',
-        :archive_program => :tar,
-        :archive_list_flags => "-tf",
+        :decompression_flags   => '-fd',
+        :archive_program       => :tar,
+        :archive_list_flags    => "-tf",
         :archive_extract_flags => "-xzf"
       }
       def initialize uri, *args
         self.uri= uri
         @compression = {
-          :program => DEFAULT_FLAGS[:decompression_program],
+          :program             => DEFAULT_FLAGS[:decompression_program],
           :decompression_flags => DEFAULT_FLAGS[:decompression_flags]
         }
         @archive = {
-          :program => DEFAULT_FLAGS[:archive_program],
-          :list_flags => DEFAULT_FLAGS[:archive_list_flags],
+          :program       => DEFAULT_FLAGS[:archive_program],
+          :list_flags    => DEFAULT_FLAGS[:archive_list_flags],
           :extract_flags => DEFAULT_FLAGS[:archive_extract_flags]
         }
       end
@@ -99,14 +85,14 @@ module IMW
         end
       end
-    end # TarGz
+    end # Targz
     # A class to wrap a <tt>tar.bz2</tt> archive.
     #
     # Creation, appending, listing, and extraction flags are stored in
-    # <tt>IMW::Files::TarBz2::DEFAULT_FLAGS</tt> and all are passed to
+    # <tt>IMW::Files::Tarbz2::DEFAULT_FLAGS</tt> and all are passed to
     # the <tt>:tar</tt> program in <tt>IMW::EXTERNAL_PROGRAMS</tt>.
-    class TarBz2
+    class Tarbz2
       include IMW::Files::BasicFile
       include IMW::Files::Archive
@@ -169,7 +155,7 @@ module IMW
         File.join(dirname,name + '.tar')
       end
-    end # TarBz2
+    end # Tarbz2
     # A class to wrap a +rar+ archive.
     #
@@ -330,11 +316,11 @@ module IMW
     # make sure that tar.bz2 precedes bz2 and so on...
-    FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::TarBz2]
-    FILE_REGEXPS << [/\.tbz2$/,     IMW::Files::TarBz2]
+    FILE_REGEXPS << [/\.tar\.bz2$/, IMW::Files::Tarbz2]
+    FILE_REGEXPS << [/\.tbz2$/,     IMW::Files::Tarbz2]
-    FILE_REGEXPS << [/\.tar\.gz$/,  IMW::Files::TarGz]
-    FILE_REGEXPS << [/\.tgz$/,      IMW::Files::TarGz]
+    FILE_REGEXPS << [/\.tar\.gz$/,  IMW::Files::Targz]
+    FILE_REGEXPS << [/\.tgz$/,      IMW::Files::Targz]
     FILE_REGEXPS << [/\.tar$/,      IMW::Files::Tar]
     FILE_REGEXPS << [/\.bz2$/,      IMW::Files::Bz2]

data/lib/imw/files/csv.rb CHANGED Viewed

@@ -39,7 +39,8 @@ module IMW
       def initialize uri, mode='r', options = {}
         options.reverse_merge!(self.class::DEFAULT_OPTIONS)
         self.uri= uri
-        super open(uri,mode),options
+        options.delete(:write)  # FasterCSV complains about unknown options
+        super open(uri,mode), options
       end
       # Return the contents of this CSV file as an array of arrays.

data/lib/imw/files/directory.rb ADDED Viewed

@@ -0,0 +1,62 @@
+require 'imw/files/basicfile'
+module IMW
+  module Files
+    class Directory
+      include IMW::Files::BasicFile
+      # FIXME these should be defined by BasicFile and then removed here but I don't see how...
+      # [:executable?, :executable_real?, :pipe?, :socket?, :rm, :rm!, :extname, :extname=, :name, :name=].each do |method|
+      #   instance_eval do
+      #     remove_method method
+      #   end
+      # end
+      def uri= uri
+        @uri      = uri.is_a?(URI::Generic) ? uri : URI.parse(uri)
+        @host     = self.uri.host
+        @path     = local? ? ::File.expand_path(self.uri.path) : self.uri.path
+        @dirname  = ::File.dirname path
+        @basename = ::File.basename path
+      end
+      def initialize uri
+        self.uri = uri
+      end
+      def [] selector='*'
+        Dir[File.join(path, selector)] if local?
+      end
+      # Copy the contents of this directory to +new_dir+.
+      def cp new_dir
+        raise IMW::PathError.new("cannot copy from #{path}, doesn't exist!") unless exist?
+        if local?
+          FileUtils.cp_r path, new_dir
+        else
+          raise IMW::PathError.new("cannot recursively copy remote directories (yet!)")
+        end
+        self.class.new(new_dir)
+      end
+      # Move this directory to +new_dir+.
+      def mv new_dir
+        raise IMW::PathError.new("cannot move from #{path}, doesn't exist!") unless exist?
+        if local?
+          FileUtils.mv path, new_dir
+        else
+          raise IMW::PathError.new("cannot move remote directories (yet!)")
+        end
+        self.class.new(new_dir)
+      end
+      alias_method :mv!, :mv
+      # Move this directory so it sits beneath +dir+.
+      def mv_to_dir dir
+        mv File.join(File.expand_path(dir),basename)
+      end
+      alias_method :mv_to_dir!, :mv_to_dir
+    end
+  end
+end

data/lib/imw/files/excel.rb ADDED Viewed

@@ -0,0 +1,84 @@
+require 'spreadsheet'
+# FIXME Main issue with this:
+# You can make a new excel book and dump data to it no problem.
+# However, something that doesn't seem to work is dumping to a file, opening,
+# and dumping to it again. At the moment this is probably not a big deal.
+module IMW
+  module Files
+    class Excel
+      include IMW::Files::BasicFile
+      include IMW::Files::Compressible
+      #need to initialize, load, and dump
+      attr_accessor :book,:idx, :max_lines, :sht_idx, :sht_row, :book_idx
+      def initialize uri, mode, options={}
+        self.uri = uri
+        @max_lines = options[:max_lines] || 65000
+        @idx = 0
+        @book_idx = 0
+        @sht_idx = 0
+        unless self.exist?
+          make_new_book
+          make_new_sheet
+        else
+          get_existing_book
+        end
+      end
+      def load
+        @sheet.map{|row| row.to_a}
+      end
+      def dump data
+        data.each do |line|
+          raise "too many lines" if too_many?
+          self << line
+        end
+        save unless no_data?
+      end
+      def << line
+        @sheet.row(@sht_row).concat( line )
+        @sht_row += 1
+        @idx += 1
+      end
+      def make_new_book
+        @book = Spreadsheet::Workbook.new
+        @book_idx += 1
+      end
+      def make_new_sheet
+        @sheet = @book.create_worksheet
+        @sht_idx += 1
+        @sht_row = 0 #always start at row 0 in a new sheet
+      end
+      def get_existing_book
+        @book = Spreadsheet.open path
+        @sheet = book.worksheet 0
+        @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
+        @sht_idx += 1
+      end
+      def incr_sheet
+        @sheet = book.worksheet @sht_idx
+      end
+      def too_many?
+        @sht_row >= @max_lines
+      end
+      def no_data?
+        @sht_row == 0
+      end
+      def save
+        @book.write path
+      end
+    end
+  end
+end

data/lib/imw/files/sgml.rb CHANGED Viewed

@@ -1,17 +1,3 @@
-#
-# h2. lib/imw/files/sgml.rb -- SGML files
-#
-# == About
-#
-# For SGML-derived files, including XML, HTML, &c..
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
 require 'hpricot'
 require 'imw/files/text'
 require 'imw/parsers/html_parser'
@@ -23,21 +9,16 @@ module IMW
       attr_accessor :doc
-      def initialize uri, mode='r', options={}
-        super uri, mode, options
-        @doc = Hpricot(open(uri))
-      end
       # Delegate to Hpricot
       def method_missing method, *args, &block
         @doc.send method, *args, &block
       end
-      # Parse this file using the IMW HTMLParser.  The parser can
-      # either be passed in directly or constructed from a passed hash
-      # of matchers.
+      # Parse this file using the IMW::Parsers::HtmlParser.  The
+      # parser can either be passed in directly or constructed from a
+      # passed hash of specs and/or matchers.
       def parse *args
-        parser = args.first.is_a?(IMW::HTMLParser) ? args.first : IMW::HTMLParser.new(*args)
+        parser = args.first.is_a?(IMW::Parsers::HtmlParser) ? args.first : IMW::Parsers::HtmlParser.new(*args)
         parser.parse(self)
       end

data/lib/imw/files.rb CHANGED Viewed

@@ -1,22 +1,8 @@
-#
-# h2. lib/imw/files.rb -- uniform interface to various files
-#
-# == About
-#
-# Implements <tt>IMW.open</tt> which returns an appropriate +IMW+
-# object given a URI.
-#
-# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
-# Copyright:: Copyright (c) 2008 infochimps.org
-# License::   GPL 3.0
-# Website::   http://infinitemonkeywrench.org/
-#
-# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
 require 'uri'
 require 'open-uri'
 require 'imw/utils'
 require 'imw/files/basicfile'
+require 'imw/files/directory'
 require 'imw/files/archive'
 require 'imw/files/compressible'
 require 'imw/files/compressed_file'
@@ -28,13 +14,21 @@ module IMW
   #
   #   IMW.open("/tmp/test.csv") # => IMW::Files::Csv("/tmp/test.csv")
   #
-  #
-  def self.open path, options = {}
-    mode = options[:write] ? 'w' : 'r'
-    Files.file_class_for(path, options).new(path, mode, options)
+  #
+  def self.open path, options = {}, &block
+    if File.directory?(File.expand_path(path))
+      dir = Files::Directory.new(path)
+      yield dir if block_given?
+      dir
+    else
+      mode = options[:write] ? 'w' : 'r'
+      file = Files.file_class_for(path, options).new(path, mode, options)
+      yield file if block_given?
+      file
+    end
   end
-  def self.open! path, options = {}
+  def self.open! path, options = {}, &block
     self.open path, options.reverse_merge(:write => true)
   end
@@ -50,13 +44,14 @@ module IMW
     autoload :Bz2,    'imw/files/compressed_files_and_archives'
     autoload :Gz,     'imw/files/compressed_files_and_archives'
     autoload :Tar,    'imw/files/compressed_files_and_archives'
-    autoload :TarBz2, 'imw/files/compressed_files_and_archives'
-    autoload :TarGz,  'imw/files/compressed_files_and_archives'
+    autoload :Tarbz2, 'imw/files/compressed_files_and_archives'
+    autoload :Targz,  'imw/files/compressed_files_and_archives'
     autoload :Rar,    'imw/files/compressed_files_and_archives'
     autoload :Zip,    'imw/files/compressed_files_and_archives'
     autoload :Xml,    'imw/files/sgml'
     autoload :Html,   'imw/files/sgml'
+    autoload :Excel,  'imw/files/excel'
     # An array used to match files to classes to handle them.  The
     # first element of each array is the regexp and the second names
@@ -70,33 +65,39 @@ module IMW
     # allows, say, <tt>.tar.gz</tt> to be handled differently from
     # <tt>.gz</tt>.
     EXTENSION_HANDLERS = [
-                          [/./,           :Text], # catchall
-                          [/\.txt$/,      :Text],
-                          [/\.txt$/,      :Text],
-                          [/\.dat$/,      :Text],
-                          [/\.ascii$/,    :Text],
-                          [/\.yaml$/,     :Yaml],
-                          [/\.yml$/,      :Yaml],
-                          [/\.csv$/,      :Csv],
-                          [/\.tsv$/,      :Tsv],
-                          [/\.json$/,     :Json],
-                          [/\.bz2$/,      :Bz2],
-                          [/\.gz$/,       :Gz],
-                          [/\.tar\.bz2$/, :TarBz2],
-                          [/\.tbz2$/,     :TarBz2],
-                          [/\.tar\.gz$/,  :TarGz],
-                          [/\.tgz$/,      :TarGz],
-                          [/\.tar$/,      :Tar],
-                          [/\.rar$/,      :Rar],
-                          [/\.zip$/,      :Zip],
-                          [/\.xml$/,      :Xml],
-                          [/\.html$/,     :Html],
-                          [/\.htm$/,      :Html]
+                          [/\.txt$/,      :text],
+                          [/\.txt$/,      :text],
+                          [/\.dat$/,      :text],
+                          [/\.ascii$/,    :text],
+                          [/\.yaml$/,     :yaml],
+                          [/\.yml$/,      :yaml],
+                          [/\.csv$/,      :csv],
+                          [/\.tsv$/,      :tsv],
+                          [/\.json$/,     :json],
+                          [/\.bz2$/,      :bz2],
+                          [/\.gz$/,       :gz],
+                          [/\.tar\.bz2$/, :tarbz2],
+                          [/\.tbz2$/,     :tarbz2],
+                          [/\.tar\.gz$/,  :targz],
+                          [/\.tgz$/,      :targz],
+                          [/\.tar$/,      :tar],
+                          [/\.rar$/,      :rar],
+                          [/\.zip$/,      :zip],
+                          [/\.xml$/,      :xml],
+                          [/\.html$/,     :html],
+                          [/\.htm$/,      :html],
+                          [/\.xlsx?$/,    :excel]
                          ]
+    SCHEME_HANDLERS = [
+                       [/http/, :html]
+                       ]
     protected
     def self.file_class_for path, options = {}
       klass = options.delete(:as)
+      # try to choose klass from path extension if not already set
       unless klass
         EXTENSION_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
           next unless regexp =~ path
@@ -104,7 +105,21 @@ module IMW
           break
         end
       end
-      klass.is_a?(Class) ? klass : class_eval(klass.to_s)
+      # try to choose klass from uri scheme if not already set
+      unless klass
+        scheme = URI.parse(path).scheme
+        SCHEME_HANDLERS.reverse_each do |regexp, thing| # end has greater precedence
+          next unless regexp =~ scheme
+          klass = thing
+          break
+        end
+      end
+      # just stick with text if still not set
+      klass = :text unless klass
+      klass.is_a?(Class) ? klass : class_eval(klass.to_s.downcase.capitalize)
     end
   end
 end