RubyGems - imw - Versions diffs - 0.2.7 → 0.2.8 - Mend

imw 0.2.7 → 0.2.8

Files changed (93) hide show

data/Gemfile +23 -0
data/Gemfile.lock +47 -0
data/LICENSE +20 -674
data/README.rdoc +3 -4
data/VERSION +1 -1
data/lib/imw.rb +64 -35
data/lib/imw/dataset.rb +12 -2
data/lib/imw/formats.rb +4 -2
data/lib/imw/formats/delimited.rb +96 -36
data/lib/imw/formats/excel.rb +69 -101
data/lib/imw/formats/json.rb +3 -5
data/lib/imw/formats/pdf.rb +71 -0
data/lib/imw/formats/yaml.rb +3 -5
data/lib/imw/metadata.rb +66 -0
data/lib/imw/metadata/contains_metadata.rb +44 -0
data/lib/imw/metadata/dsl.rb +111 -0
data/lib/imw/metadata/field.rb +65 -0
data/lib/imw/metadata/schema.rb +227 -0
data/lib/imw/metadata/schematized.rb +27 -0
data/lib/imw/parsers.rb +1 -0
data/lib/imw/parsers/flat.rb +44 -0
data/lib/imw/resource.rb +36 -224
data/lib/imw/schemes.rb +3 -1
data/lib/imw/schemes/hdfs.rb +12 -1
data/lib/imw/schemes/http.rb +1 -2
data/lib/imw/schemes/local.rb +139 -16
data/lib/imw/schemes/remote.rb +14 -9
data/lib/imw/schemes/s3.rb +12 -0
data/lib/imw/schemes/sql.rb +117 -0
data/lib/imw/tools.rb +5 -3
data/lib/imw/tools/downloader.rb +63 -0
data/lib/imw/tools/summarizer.rb +21 -10
data/lib/imw/utils.rb +10 -0
data/lib/imw/utils/dynamically_extendable.rb +137 -0
data/lib/imw/utils/error.rb +3 -0
data/lib/imw/utils/extensions.rb +0 -4
data/lib/imw/utils/extensions/array.rb +6 -7
data/lib/imw/utils/extensions/hash.rb +3 -5
data/lib/imw/utils/extensions/string.rb +3 -3
data/lib/imw/utils/has_uri.rb +114 -0
data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
data/spec/data/formats/excel/sample.xls +0 -0
data/spec/data/formats/json/sample.json +1 -0
data/spec/data/formats/none/sample +650 -0
data/spec/data/formats/sgml/sample.xml +617 -0
data/spec/data/formats/text/sample.txt +650 -0
data/spec/data/formats/yaml/sample.yaml +410 -0
data/spec/data/schema-tabular.yaml +11 -0
data/spec/imw/formats/delimited_spec.rb +34 -2
data/spec/imw/formats/excel_spec.rb +55 -0
data/spec/imw/formats/json_spec.rb +3 -3
data/spec/imw/formats/sgml_spec.rb +4 -4
data/spec/imw/formats/yaml_spec.rb +3 -3
data/spec/imw/metadata/field_spec.rb +26 -0
data/spec/imw/metadata/schema_spec.rb +27 -0
data/spec/imw/metadata_spec.rb +39 -0
data/spec/imw/parsers/line_parser_spec.rb +1 -1
data/spec/imw/resource_spec.rb +0 -100
data/spec/imw/schemes/hdfs_spec.rb +19 -13
data/spec/imw/schemes/local_spec.rb +59 -3
data/spec/imw/schemes/s3_spec.rb +4 -0
data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
data/spec/imw/utils/has_uri_spec.rb +55 -0
data/spec/spec_helper.rb +1 -2
data/spec/support/random.rb +4 -4
metadata +58 -17
data/CHANGELOG +0 -0
data/TODO +0 -18
data/spec/data/sample.json +0 -782
data/spec/data/sample.txt +0 -131
data/spec/data/sample.xml +0 -653
data/spec/data/sample.yaml +0 -651
data/spec/spec.opts +0 -4
data/spec/support/extensions.rb +0 -18

data/README.rdoc CHANGED Viewed

@@ -1,4 +1,3 @@
 = What is the Infinite Monkeywrench?
 The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
@@ -58,18 +57,18 @@ IMW is centered around processing resources.  A resource can be
 _anything_ with a URI and you create one using IMW.open.
   csv     = IMW.open('/path/to/my_data.csv')
-  html    = IMW.open('http://www.infochimps.com')
+  html    = IMW.open('http://www.example.com/history/march_2007')
 IMW dynamically extends a resource with modules appropriate to it when
 you open it.  In the above case, +csv+ would be automatically extended
 by the IMW::Resources::Formats::Csv module, among others:
-  csv.resource_modules
+  csv.modules
   => [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
 while +html+ will use a different set
-  html.resource_modules
+  html.modules
   => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
 Consult the documentation for the modules a resource uses to learn

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.2.7
1	+ 0.2.8

data/lib/imw.rb CHANGED Viewed

@@ -1,4 +1,6 @@
 require 'rubygems'
+require 'bundler'
+Bundler.setup
 require 'imw/boot'
 require 'imw/utils'
@@ -8,15 +10,18 @@ require 'imw/utils'
 # transformations of data as a network of dependencies (a la Make or
 # Rake).
 #
-# IMW has a few central concepts: resources, datasets, workflows, and
-# repositories.
+# IMW has a few central concepts: resources, metadata, datasets,
+# workflows, and repositories.
 #
 # Resources represent individual data resources like local files,
-# websites, databases, &c.  Resources are typically instantiated via
-# IMW.open, with IMW doing the work of figuring out what to return
+# websites, databases, &c.  An IMW::Resource is typically instantiated
+# via IMW.open, with IMW doing the work of figuring out what to return
 # based on the URI passed in.
 #
-# Datasets represent collections of related data resources.  An
+# A Resource can have a schema which describes the fields in its data.
+# IMW::Metadata consists of classes which describe fields.
+#
+# Datasets represent collections of related data resources.  An
 # IMW::Dataset comes with a pre-defined (but customizable) workflow
 # that takes data resources through several steps: rip, parse, munge,
 # and package.  The workflow leverages Rake and so the various tasks
@@ -35,6 +40,7 @@ module IMW
   autoload :Parsers,         'imw/parsers'
   autoload :Dataset,         'imw/dataset'
   autoload :Repository,      'imw/repository'
+  autoload :Metadata,        'imw/metadata'
   # Open a resource at the given +uri+.  The resource will
   # automatically be extended by modules which make sense given the
@@ -47,14 +53,23 @@ module IMW
   #
   # @param  [String, Addressable::URI, IMW::Resource] obj the URI to open
   # @param [Hash] options
-  # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_resource!
-  # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_resource!
+  # @option options [Array<String,Module>] as same as <tt>:use_modules</tt> in IMW::Resource.extend_instance!
+  # @option options [Array<String,Module>] without same as <tt>:skip_modules</tt> in IMW::Resource.extend_instance!
   # @return [IMW::Resource] the resulting resource, properly extended for the given URI
-  def self.open obj, options={}
-    return obj if obj.is_a?(IMW::Resource)
-    options[:use_modules]  ||= (options[:as]      || [])
-    options[:skip_modules] ||= (options[:without] || [])
-    IMW::Resource.new(obj, options)
+  def self.open obj, options={}, &block
+    if obj.is_a?(IMW::Resource)
+      resource = obj
+    else
+      options[:use_modules]  ||= (options[:as]      || [])
+      options[:skip_modules] ||= (options[:without] || [])
+      resource = IMW::Resource.new(obj, options)
+    end
+    if block_given?
+      yield resource
+      resource.close
+    else
+      resource
+    end
   end
   # Works the same way as IMW.open except opens the resource for
@@ -62,8 +77,8 @@ module IMW
   #
   # @param  [String, Addressable::URI] uri the URI to open
   # @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing.
-  def self.open! uri, options={}
-    IMW::Resource.new(uri, options.merge(:mode => 'w'))
+  def self.open! uri, options={}, &block
+    open(uri, options.merge(:mode => 'w'), &block)
   end
   # The default repository in which to place datasets.  See the
@@ -75,32 +90,41 @@ module IMW
     @@repository ||= IMW::Repository.new
   end
-  # Create a dataset and put it in the default IMW repository.  Also
-  # yields the dataset so you can define its workflow
+  # Create a dataset and put it in the default IMW repository.
   #
-  # IMW.dataset :my_dataset do
-  #
-  #   # Define some paths we're going to use
-  #   add_path :raw_data,  :ripd, 'raw_data.csv'
-  #   add_path :fixd_data, :fixd, 'fixed_data.csv'
+  # Evaluates the given block in the context of the new dataset.  This
+  # allows you to define tasks, add paths, and use defined metadata in
+  # an elegant way.
   #
-  #   # Copy a file from a website to this dataset's +ripd+ directory.
-  #   rip do
-  #     IMW.open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:raw_data))
-  #   end
+  #   IMW.dataset :my_dataset do
+  #
+  #     # Define some paths we're going to use
+  #     add_path :original, :rawd, 'original.csv'
+  #     add_path :filtered, :fixd, 'filtered.csv'
+  #     add_path :package,  :pkgd, 'filtered.tar.bz2'
   #
-  #   # Filter the raw data to those values which match some criterion defined by <tt>accept?</tt>
-  #   munge do
-  #     IMW.open(path_to(:raw_data)).map do |row|
-  #       row if accept?(row)
-  #     end.compact.dump(path_to(:fixd_data))
-  #   end
+  #     # Copy a CSV file from a website to this machine.
+  #     rip do
+  #       open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:original))
+  #     end
   #
-  #   # Compress this new data
-  #   package do
-  #     IMW.open(path_to(:fixd_data)).compress.mv(path_to(:pkgd))
+  #     # Filter the original CSV data by the
+  #     # <tt>meets_some_condition?</tt> method we define elsewhere...
+  #     munge do
+  #       open!(path_to(:filtered)) do |filtered|
+  #         open(path_to(:original)).each do |row|
+  #           filtered << row if meets_some_condition?(row)
+  #         end
+  #       end
+  #     end
+  #
+  #     # Compress the filtered data to an archive.
+  #     package do
+  #       open(path_to(:filtered)).compress.mv(path_to(:package))
+  #     end
   #   end
-  # end
+  #
+  # See the <tt>/examples</tt> directory of the IMW distribution for
+  # more examples.
   #
   # @param [Symbol, String] handle the handle to identify this dataset with
   # @param [Hash]   options a hash of options (see IMW::Dataset)
@@ -112,3 +136,8 @@ module IMW
   end
 end
+# Works just like IMW.dataset but defined at a top-level scope.
+def dataset handle, options={}, &block
+  IMW.dataset(handle, options, &block)
+end

data/lib/imw/dataset.rb CHANGED Viewed

@@ -96,9 +96,12 @@ module IMW
   #   dataset = IMW::Dataset.new :my_dataset, :repository => repo
   class Dataset
-    include IMW::Workflow
+    # The handle this dataset goes by.  Used for identifying it within
+    # a repository.
+    attr_accessor :handle
-    attr_accessor :handle, :options
+    # Options for this dataset.
+    attr_accessor :options
     def initialize handle, options = {}
       @options = options
@@ -111,5 +114,12 @@ module IMW
       end
     end
+    # Provides this dataset with a workflow of tasks managed by Rake.
+    include IMW::Workflow
+    # Provides this dataset with DSL like methods to construct a
+    # schema in an IMW file.
+    include IMW::Metadata::DSL
   end
 end

data/lib/imw/formats.rb CHANGED Viewed

@@ -10,20 +10,22 @@ module IMW
     autoload :Xhtml, 'imw/formats/sgml'
     autoload :Rdf,   'imw/formats/sgml'
     autoload :Yaml,  'imw/formats/yaml'
+    autoload :Pdf,   'imw/formats/pdf'
     # Handlers which augment a resource with data format specific
     # methods.
     HANDLERS = [
                 [ "Formats::Csv",   /\.csv$/i    ],
                 [ "Formats::Tsv",   /\.tsv$/i    ],
-                [ "Formats::Excel", /\.xslx?$/i  ],
+                [ "Formats::Excel", /\.xlsx?$/i  ],
                 [ "Formats::Json",  /\.json$/i   ],
                 [ "Formats::Xml",   /\.xml$/i    ],
                 [ "Formats::Xsl",   /\.xsl$/i    ],
                 [ "Formats::Html",  /\.html?$/i  ],
                 [ "Formats::Xhtml", /\.xhtml?$/i ],
                 [ "Formats::Rdf",   /\.rdf?$/i   ],
-                [ "Formats::Yaml",  /\.ya?ml$/i  ]
+                [ "Formats::Yaml",  /\.ya?ml$/i  ],
+                [ "Formats::Pdf",   /\.pdf$/i    ]
                ]
   end
 end

data/lib/imw/formats/delimited.rb CHANGED Viewed

@@ -11,9 +11,22 @@ module IMW
     # @abstract
     module Delimited
-      include Enumerable
+      # Ensure that this delimited resource is described by an
+      # ordered collection of flat fields.
+      def validate_schema!
+        raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
+      end
-      attr_accessor :delimited_settings
+      # Default options to be passed to
+      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
+      # documentation for more information.
+      #
+      # @return [Hash]
+      def delimited_options
+        @delimited_options ||= {
+          :headers        => schema && schema.map { |field| field['name'] }
+        }.merge(resource_options_compatible_with_faster_csv)
+      end
       # Return the data in this delimited resource as an array of
       # arrays.
@@ -27,24 +40,70 @@ module IMW
         FasterCSV.parse(read, delimited_options, &block)
       end
+      # Gives us goodies!  Needs +each+ below.
+      include Enumerable
       # Call +block+ with each row in this delimited resource.
       def each &block
-        load(&block)
+        require 'fastercsv'
+        FasterCSV.new(io, delimited_options).each(&block)
       end
-      # Dump an array of arrays into this resource.
+      # Emit a single array or an array of arrays into this resource.
       #
-      # @param [Array] data array of arrays to dump
+      # @param [Array<Array>, Array] data array or array of arrays to emit
       # @param [Hash] options
-      # @option options [true, false] :persist Keep this resource's IO object open after dumping
-      def dump data, options={}
+      # @option options [true, false] :persist Keep this resource's IO object open after emitting
+      def emit data, options={}
         require 'fastercsv'
+        data = [data] unless data.first.is_a?(Array)
         data.each do |row|
           write(FasterCSV.generate_line(row, delimited_options))
         end
-        io.close unless options[:persist]
         self
       end
+      alias_method :<<, :emit
+      # Do a heuristic check to determine whether or not the first row
+      # of this delimited data is a row of headers.
+      #
+      # @return [true, false]
+      def headers_in_first_line?
+        # grab the header and up to 10 body rows
+        require 'fastercsv'
+        copy  = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
+        header = (copy.shift || []) rescue []
+        body   = 10.times.map { (copy.shift || []) rescue []}.flatten
+        # guess how many elements in a row
+        #size_guess = ((header.size + body.map(&:size).inject(0.0) { |e, s| s += e }).to_f / (1 + body.length).to_f).to_i
+        # calculate the fraction of bytes that are [-A-Za-z_] (letters +
+        # underscore + hyphen) for header and body and compute a
+        # threshold determinant
+        header_chars           = header.map(&:to_s).join
+        header_schema_bytes    = header_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
+        body_chars             = body.map(&:to_s).join
+        body_schema_bytes      = body_chars.bytes.find_all { |byte| (byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122) || byte == 95 || byte == 45 }
+        header_schema_fraction = header_schema_bytes.size.to_f / header_chars.size.to_f    rescue nil
+        body_schema_fraction   = body_schema_bytes.size.to_f   / body_chars.size.to_f      rescue nil
+        determinant            = (body_schema_fraction - header_schema_fraction).abs / 2.0 rescue nil
+        # decide, setting the threshold at 0.05 based on some guesswork...
+        determinant && determinant >= 0.05
+      end
+      # If it seems like there are headers in the first line of this
+      # data then go ahead and use them to define a schema.
+      #
+      # Will overwrite a schema already present for this resource.
+      def guess_schema!
+        return unless headers_in_first_line?
+        copy                        = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
+        names                       = (copy.shift || []) rescue []
+        self.schema                 = IMW::Metadata::Schema.new(names)
+        delimited_options[:headers] = names
+      end
       # Return a 10-line sample of this file.
       #
@@ -53,52 +112,53 @@ module IMW
         require 'fastercsv'
         returning([]) do |rows|
           row_num = 1
-          FasterCSV.new(io, delimited_options).each do |row|
+          each do |row|
             break if row_num > 10
-            rows << row
+            rows << row.size.times.map { |index| row[index] }
             row_num += 1
           end
         end
       end
-    end
-    module Csv
-      include Delimited
+      protected
+      # An array of option names used by FasterCSV.
+      FASTER_CSV_OPTION_NAMES = %w[col_sep row_sep quote_char encoding field_size_limit converters unconverted_fields headers return_headers write_headers header_converters skip_blanks force_quotes].map(&:to_sym)
-      # Default options to be passed to
-      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
-      # documentation for more information.
+      # Return the subset of options this resource was initialized
+      # with that are compatible with FasterCSV (it complains when you
+      # give it keywords it doesn't know).
       #
       # @return [Hash]
+      def resource_options_compatible_with_faster_csv
+        @compatible_options ||= returning({}) do |compatible_options|
+          FASTER_CSV_OPTION_NAMES.each do |option_name|
+            compatible_options[option_name] = resource_options[option_name] if resource_options.has_key?(option_name.to_sym)
+          end
+        end
+      end
+    end
+    # A module for working with CSV (comma-separated value) formatted
+    # data.
+    #
+    # @see IMW::Formats::Delimited
+    module Csv
+      include Delimited
       def delimited_options
-        @delimited_options ||= {
-          :col_sep        => ',',
-          :headers        => false,
-          :return_headers => false,
-          :write_headers  => true,
-          :skip_blanks    => false,
-          :force_quotes   => false
-        }
+        @delimited_options ||= {:col_sep => ","}.merge(super())
       end
     end
+    # A module for working with TSV (tab-separated value) formatted
+    # data.
+    #
+    # @see IMW::Formats::Delimited
     module Tsv
       include Delimited
-      # Default options to be passed to
-      # FasterCSV[http://fastercsv.rubyforge.org/]; see its
-      # documentation for more information.
-      #
-      # @return [Hash]
       def delimited_options
         @delimited_options ||= {
           :col_sep        => "\t",
-          :headers        => false,
-          :return_headers => false,
-          :write_headers  => true,
-          :skip_blanks    => false,
-          :force_quotes   => false
-        }
+        }.merge(super())
       end
     end
   end

data/lib/imw/formats/excel.rb CHANGED Viewed

@@ -4,120 +4,88 @@ module IMW
     # Defines methods for reading and writing Microsoft Excel data.
     module Excel
-      attr_accessor :book, :sheet
-      def self.extended obj
-        if obj.exist?
-          @book = Spreadsheet.open path
-          @sheet = book.worksheet(0)
-        end
-      end
-      def book
-        return @book if @book
-        if exists?
-          @book = Spreadsheet.open(path)
-        else
-          @book = Spreadsheet::Workbook.new
-        end
+      # Ensure that this Excel resource is described by an ordered
+      # collection of flat fields.
+      def validate_schema!
+        raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
       end
-      def sheet
-        @sheet = @book.create_worksheet
-        @sheet
-      end
-      #If an Excel file exists at the location specified by uri then
-      #it is opened and can be read out with a subsequent call to
-      #load(). Otherwise, a new workbook is created and can be written
-      #to with the dump() method.
-      def initialize uri, mode='r', options={}
-        self.uri = uri
-        @max_lines = options[:max_lines] || 65000
-        @idx = 0
-        @book_idx = 0
-        @sht_idx = 0
-        unless self.exist?
-          make_new_book
-          make_new_sheet
-        else
-          get_existing_book
-        end
-      end
-      #Returns the data in an existing workbook as an
-      #array of arrays. Only capable of reading a single sheet.
+      # Return the data in this Excel document as an array of arrays.
+      #
+      # Data from consecutive worksheets will be concatenated into a
+      # single outer array.
+      #
+      # @return [Array<Array>]
       def load
-        @sheet.map{|row| row.to_a}
-      end
-      #Dumps data, which is assumed to be an array of arrays, to a
-      #newly created Excel workbook. Attempting to dump to a book
-      #that already exists will typically result in file corruption.
-      #Raises a 'too many lines' error if the number of lines
-      #of data exceeds max_lines.
-      def dump data
-        data.each do |line|
-          raise "too many lines" if too_many?
-          self << line
+        require 'spreadsheet'
+        data = []
+        Spreadsheet.open(path).worksheets.each do |worksheet|
+          data += worksheet.map do |row|
+            row.to_a
+          end
         end
-        save unless no_data?
+        data
       end
-      #Processes a single line of data and updates internal variables.
-      #You shouldn't need to call this directly.
-      def << line
-        @sheet.row(@sht_row).concat( line )
-        @sht_row += 1
-        @idx += 1
-      end
-      #Instantiates a new Excel workbook in memory. You shouldn't
-      #need to call this directly.
-      def make_new_book
-        @book = Spreadsheet::Workbook.new
-        @book_idx += 1
-      end
-      #Makes a new worksheet for a pre-existing Excel workbook.
-      #This should be called after recovering from the
-      #'too many lines' error.
-      def make_new_sheet
-        @sheet = @book.create_worksheet
-        @sht_idx += 1
-        @sht_row = 0 #always start at row 0 in a new sheet
-      end
+      # Gives us goodies!  Needs +each+ below.
+      include Enumerable
-      #Opens an existing Excel workbook. You shoudn't need to
-      #call this directly.
-      def get_existing_book
-        @sht_row = @sheet.row_count #would like to be able to dump new data, doesn't work
-        @sht_idx += 1
+      # Yield each row of this Excel document.
+      #
+      # Will loop from one worksheet to the next.
+      #
+      # @yield [Spreadsheet::Excel::Row]
+      def each &block
+        require 'spreadsheet'
+        Spreadsheet.open(path).worksheets.each do |worksheet|
+          worksheet.each(&block)
+        end
       end
-      #Increments the current sheet to the next one in
-      #an open book. Not necessary at the moment.
-      def incr_sheet
-        @sheet = book.worksheet @sht_idx
+      # Return the number of lines in this Excel document.
+      #
+      # Measured across worksheets.
+      #
+      # @return [Integer]
+      def num_lines
+        require 'spreadsheet'
+        Spreadsheet.open(path).worksheets.inject(0) do |sum, worksheet|
+          sum += worksheet.row_count
+        end
       end
-      #There are too many lines if the number of rows attempting
-      #to be written exceeds max_lines.
-      def too_many?
-        @sht_row >= @max_lines
-      end
+      # TODO
+      #
+      # def emit
+      # end
-      #There is no data if the number of rows attempting to be written
-      #is zero.
-      def no_data?
-        @sht_row == 0
-      end
+      # TODO
+      #
+      # Extract the following methods from delimited into a module and
+      # let both Excel and Delimited use them.
+      #
+      # Or let Excel include Delimited and let it override
+      # appropriately.
+      #
+      #   headers_in_first_line?
+      #   guess_schema!
+      #
+      #
-      #Saves the workbook.
-      def save
-        @book.write path
+      #
+      def snippet
+        require 'spreadsheet'
+        returning([]) do |snip|
+          row_num = 1
+          Spreadsheet.open(path).worksheets.each do |worksheet|
+            worksheet.each do |row|
+              break if row_num > 10
+              snip << row.to_a
+              row_num += 1
+            end
+            break if row_num > 10
+          end
+        end
       end
     end
   end