RubyGems - iron-import - Versions diffs - 0.6.1 → 0.7.0 - Mend

iron-import 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

checksums.yaml +4 -4
data/History.txt +16 -1
data/README.rdoc +43 -16
data/Version.txt +1 -1
data/lib/iron/import/column.rb +27 -14
data/lib/iron/import/csv_reader.rb +4 -4
data/lib/iron/import/custom_reader.rb +14 -8
data/lib/iron/import/data_reader.rb +42 -30
data/lib/iron/import/error.rb +4 -16
data/lib/iron/import/excel_reader.rb +69 -0
data/lib/iron/import/html_reader.rb +78 -0
data/lib/iron/import/importer.rb +432 -103
data/lib/iron/import/row.rb +15 -11
data/lib/iron/import/xls_reader.rb +3 -37
data/lib/iron/import/xlsx_reader.rb +2 -37
data/lib/iron/import.rb +2 -1
data/spec/importer/column_spec.rb +4 -5
data/spec/importer/csv_reader_spec.rb +1 -1
data/spec/importer/custom_reader_spec.rb +6 -10
data/spec/importer/data_reader_spec.rb +6 -5
data/spec/importer/html_reader_spec.rb +105 -0
data/spec/importer/importer_spec.rb +107 -0
data/spec/importer/row_spec.rb +9 -2
data/spec/importer/xls_reader_spec.rb +77 -0
data/spec/importer/xlsx_reader_spec.rb +2 -3
data/spec/samples/3-sheets.xls +0 -0
data/spec/samples/col-span.html +29 -0
data/spec/samples/html-th-td.html +11 -0
data/spec/samples/multi-table.html +29 -0
data/spec/samples/nanodrop.xlsx +0 -0
data/spec/samples/scores.html +30 -0
data/spec/samples/simple.html +14 -0
data/spec/spec_helper.rb +1 -0
metadata +30 -8
data/lib/iron/import/sheet.rb +0 -263
data/spec/importer/sheet_spec.rb +0 -65

data/lib/iron/import/importer.rb CHANGED

@@ -18,119 +18,242 @@
 #     end
 #   end
 #
-# The row.all? call will verify that each row passed contains a value for all defined columns.
-#
 # A more realistic and complex example follows:
 #
-#   importer = Importer.build do
+#   Importer.build do
+#     # Define our columns and settings
 #     column :order_number do
-#       match /order (num.*|id)/i
+#       header /order (num.*|id)/i
+#       type :int
 #     end
-#     column :date
-#     column :amount
+#     column :date do
+#       type :date
+#     end
+#     column :amount do
+#       type :cents
+#     end
+#
+#     # Filter out any rows missing an order number
+#     filter do |row|
+#       !row[:order_number].nil?
+#     end
+#
+#   end.import('/path/to/file.csv', format: :csv) do |row|
+#     # Process each row as basically a hash of :column_key => value,
+#     # only called on import success
+#     Order.create(row.to_hash)
+#
+#   end.on_error do
+#     # If we have any errors, do something
+#     raise error_summary
 #   end
 #
 class Importer
-  # Array of error message or nil for each non-header row
-  attr_accessor :errors, :warnings
-  attr_accessor :sheets
-  attr_reader :data, :custom_reader
+  # Inner class for holding load-time data that gets reset on each load call
+  class Data
+    attr_accessor :start_row, :rows
+    def initialize
+      @start_row = nil
+      @rows = []
+    end
+  end
+  # Array of defined columns
+  attr_reader :columns
+  # Array of error messages collected during an import/process run
+  attr_accessor :errors
+  # Custom reader, if one has been defined using #on_file or #on_stream
+  attr_reader :custom_reader
+  # Set to the format selected during past import
+  attr_reader :format
+  # Import data
+  attr_reader :data
+  # Missing headers post-import
+  attr_reader :missing_headers
+  # When true, skips header detection
+  dsl_flag :headerless
+  # Explicitly sets the row number (1-indexed) where data rows begin,
+  # usually left defaulted to nil to automatically start after the header
+  # row.
+  dsl_accessor :start_row
+  # Set to a block/lambda taking a parsed but unvalidated row as a hash,
+  # return true to keep, false to skip.
+  dsl_accessor :filter
   # Source file/stream encoding, assumes UTF-8 if none specified
   dsl_accessor :encoding
+  # Create a new importer!  See #build for details on what to do
+  # in the block.
   def self.build(options = {}, &block)
     importer = Importer.new(options)
     importer.build(&block)
     importer
   end
+  # Ye standard constructor!
   def initialize(options = {})
+    @scopes = {}
     @encoding = 'UTF-8'
-    @sheets = {}
+    @headerless = false
+    @filter = nil
+    @columns = []
     reset
   end
-  # Takes a block, and sets self to be importer instance, so you can
-  # just call #column, #sheet, etc. directly.
+  # Call to define the importer's column configuration and other setup options.
+  #
+  # The following builder options are available:
+  #
+  #   importer = Importer.build do
+  #     # Don't try to look for a header using column definitions, there is no header
+  #     headerless!
+  #
+  #     # Manually set the start row for data, defaults to nil
+  #     # indicating that the data rows start immediatly following the header.
+  #     start_row 4
+  #
+  #     # Define a filter that will skip unneeded rows.  The filter command takes
+  #     # a block that receives the parsed (but not validated!) row data as an
+  #     # associative hash of :col_key => <parsed value>, and returns
+  #     # true to keep the row or false to exclude it.
+  #     filter do |row|
+  #       row[:id].to_i > 5000
+  #     end
+  #
+  #     # If you need to process a type of input that isn't built in, define
+  #     # a custom reader with #on_file or #on_stream
+  #     on_file do |path|
+  #       ... read file at path, return array of each row's raw column values ...
+  #     end
+  #
+  #     # Got a multi-block format like Excel or HTML?  You can optionally limit
+  #     # searching by setting a scope or scopes to search:
+  #     scope :xls, 'Sheet 2'
+  #     # Or set a bunch of scopes in one go:
+  #     scopes :html => ['div > table.data', 'table.aux-data'],
+  #            :xls => [2, 'Orders']
+  #
+  #     # Of course, the main thing you're going to do is to define columns.  See the
+  #     # Column class' notes for options when defining a column.  Note that
+  #     # you can define columns using either hash-style:
+  #     column :id, :type => :integer
+  #     # or builder-style:
+  #     column :name do
+  #       header /company\s*name/i
+  #       type :string
+  #     end
+  #   end
   def build(&block)
     DslProxy.exec(self, &block) if block
     self
   end
-  # For the common case where there is only one "sheet", e.g. CSV files.
-  def default_sheet(&block)
-    sheet(1, true, &block)
-  end
-  # Access a Sheet definition by id (either number (1-N) or sheet name).
-  # Used during #build calls to define a sheet with a passed block, like so:
+  # Add a new column definition to our list, allows customizing the new
+  # column with a builder block.  See Importer::Column docs for
+  # options.  In lieu of a builder mode, you can pass the same values
+  # as key => value pairs in the options hash to this method, so:
   #
-  #   Importer.build do
-  #     sheet(1) do
-  #       column :store_name
-  #       column :store_address
-  #     end
-  #     sheet('Orders') do
-  #       column :id
-  #       column :price
-  #       filter do |row|
-  #         row[:price].prensent?
-  #       end
+  #   column(:foo) do
+  #     type :string
+  #     parse do |val|
+  #       val.to_s.upcase
   #     end
   #   end
-  def sheet(id, create=true, &block)
-    # Find the sheet, creating it if needed (and requested!)
-    if @sheets[id].nil?
-      if create
-        @sheets[id] = Sheet.new(self, id)
-      else
-        return nil
-      end
+  #
+  # Is equivalent to:
+  #
+  #   column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
+  #
+  # Use whichever you prefer!
+  def column(key, options_hash = {}, &block)
+    # Find existing column with key to allow re-opening an existing definition
+    col = @columns.detect {|c| c.key == key }
+    unless col
+      # if none found, add a new one
+      col = Column.new(self, key, options_hash)
+      @columns << col
     end
-    sheet = @sheets[id]
-    # Allow customization by DSL block if requested
-    sheet.build(&block) if block
+    # Customize if needed
+    DslProxy::exec(col, &block) if block
-    # Return the sheet
-    sheet
+    col
+  end
+  # Limit the search scope for a single format (:xls, :xlsx, :html, :custom)
+  # to the given value or values - the meaning and format of scopes is determined
+  # by that format's data reader.
+  def scope(format, *scopes)
+    @scopes[format] = scopes.flatten
+  end
+  # Limit the search scope for more than one format at a time.  For example, if
+  # you support both XLS and XLSX formats (and why wouldn't you?) then you
+  # could tell the importer to look only at the sheets named "Orders" and
+  # "Legacy Orders" like so:
+  #
+  #   scopes :xls => ['Orders', 'Legacy Orders'],
+  #          :xlsx => ['Orders', 'Legacy Orders']
+  #
+  def scopes(map = :__read__)
+    if map == :__read__
+      return @scopes
+    else
+      map.each_pair do |format, scope|
+        scope(format, scope)
+      end
+    end
   end
-  # Define a custom file reader to implement your own sheet parsing.
+  # Define a custom file reader to implement your own parsing.  Pass
+  # a block accepting a file path, and returning an array of arrays (rows of
+  # raw column values).  Use #add_error(msg) to add a reading error.
+  #
+  # Adding a custom stream parser will change the importer's default
+  # format to :custom, though you can override it when calling #import as
+  # usual.
+  #
+  # Only one of #on_file or #on_stream needs to be implemented - the importer
+  # will cross convert as needed!
+  #
+  # Example:
+  #
+  #   on_file do |path|
+  #     # Read a file line by line
+  #     File.readlines(path).collect do |line|
+  #       # Each line has colon-separated values, so split 'em up
+  #       line.split(/\s*:\s*/)
+  #     end
+  #   end
+  #
   def on_file(&block)
     @custom_reader = CustomReader.new(self) unless @custom_reader
     @custom_reader.set_reader(:file, block)
   end
+  # Just like #on_file, but for streams.  Pass
+  # a block accepting a stream, and returning an array of arrays (rows of
+  # raw column values).  Use #add_error(msg) to add a reading error.
+  #
+  # Example:
+  #
+  #   on_stream do |stream|
+  #     # Stream contains rows separated by a | char
+  #     stream.readlines('|').collect do |line|
+  #       # Each line has 3 fields of 10 characters each
+  #       [line[0...10], line[10...20], line[20...30]]
+  #     end
+  #   end
+  #
   def on_stream(&block)
     @custom_reader = CustomReader.new(self) unless @custom_reader
     @custom_reader.set_reader(:stream, block)
   end
-  # Very, very commonly we only want to deal with the default sheet.  In this case,
-  # let folks skip the sheet(n) do ... end block wrapper and just define columns
-  # against the main importer.  Internally, proxy those calls to the first sheet.
-  def column(*args, &block)
-    default_sheet.column(*args, &block)
-  end
-  # Ditto for filters
-  def filter(*args, &block)
-    default_sheet.filter(*args, &block)
-  end
-  # Ditto for start row too
-  def start_row(row_num)
-    default_sheet.start_row(row_num)
-  end
-  # More facading
-  def headerless!
-    default_sheet.headerless!
-  end
   # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
   # validate the required values, run custom validations... basically pre-parse and
   # massage the supplied data.  It will return true on success, or false if one
@@ -139,87 +262,293 @@ class Importer
   # You may supply various options for the import using the options hash.  Supported
   # options include:
   #
-  #   format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied
-  #           source as the specified format, or auto-detects if set to :auto
+  #   format: one of :auto, :csv, :html, :xls, :xlsx, defaults to :auto, forces treating the supplied
+  #           source as the specified format, or attempts to auto-detect if set to :auto
+  #   scope: specify the search scope for the data/format, overriding any scope set with #scope
   #   encoding: source encoding override, defaults to guessing based on input
   #
-  # Generally, you should be able to throw a source at it and it should work.  The
+  # Generally, you should be able to throw a path or stream at it and it should work.  The
   # options exist to allow overriding in cases where the automation heuristics
   # have failed and the input type is known by the caller.
   #
+  # If you're trying to import from a raw string, use Importer#import_string instead.
+  #
   # After #import has completed successfully, you can process the resulting data
-  # using #process or extract the raw data by calling #to_hash or #sheet(num).to_a
-  def import(path_or_stream, options = {})
+  # using #process or extract the raw data by calling #to_a to get an array of row hashes
+  #
+  # Note that as of version 0.7.0, there is a more compact operation mode enabled by passing
+  # a block to this call:
+  #
+  #    importer.import(...) do |row|
+  #      # Process each row here
+  #    end
+  #
+  # In this mode, the block is called with each row as in #process, conditionally on no
+  # errors.  In addition, when a block is passed, true/false is not returned (as the
+  # block is already conditionally called).  Instead, it will return the importer to allow
+  # chaining to #on_error or other calls.
+  def import(path_or_stream, options = {}, &block)
     # Clear all our load-time state, including all rows, header locations... you name it
     reset
     # Get the reader for this format
     default = @custom_reader ? :custom : :auto
-    format = options.delete(:format) { default }
-    if format == :custom
+    @format = options.delete(:format) { default }
+    if @format == :custom
       # Custom format selected, use our internal custom reader
-      @data = @custom_reader
+      @reader = @custom_reader
-    elsif format && format != :auto
+    elsif @format && @format != :auto
       # Explicit format requested
-      @data = DataReader::for_format(self, format)
-      unless @data
-        add_error("Unable to find format handler for format #{format} - aborting")
-        return
-      end
+      @reader = DataReader::for_format(self, @format)
     else
       # Auto select
-      @data = DataReader::for_source(self, path_or_stream)
+      @reader = DataReader::for_source(self, path_or_stream)
+      @format = @reader.format
+    end
+    # Verify we got one
+    unless @reader
+      add_error("Unable to find format handler for format :#{format} on import of #{path_or_stream.class.name} source - aborting")
+      return
+    end
+    # What scopes (if any) should we limit our searching to?
+    scopes = options.delete(:scope) { @scopes[@format] }
+    if scopes && !scopes.is_a?(Array)
+      scopes = [scopes]
     end
     # Read in the data!
-    @data.load(path_or_stream)
+    @reader.load(path_or_stream, scopes) do |raw_rows|
+      # Find our column layout, start of data, etc
+      if find_header(raw_rows)
+        # Now, run all the data and add it as a Row instance
+        raw_rows.each_with_index do |raw, index|
+          row_num = index + 1
+          if row_num >= @data.start_row
+            add_row(row_num, raw)
+          end
+        end
+        # We've found a workable sheet/table/whatever, stop looking
+        true
+      else
+        # This sheet/table/whatever didn't have the needed header, try
+        # the next one (if any)
+        false
+      end
+    end
+    # If we have any missing headers, note that fact
+    if @missing_headers && @missing_headers.count > 0
+      add_error("Unable to locate required column header for column(s): " + @missing_headers.collect{|c| ":#{c}"}.list_join(', '))
+    end
+    # If we're here with no errors, we rule!
+    success = !has_errors?
+    if block
+      # New way, if block is passed, process it on success
+      process(&block) if success
+      self
+    else
+      # Old way, return result
+      success
+    end
   end
-  # Process a specific sheet, or the default sheet if none is provided.  Your
-  # passed block will be handed one Row at a time.
-  def process(sheet_id = nil, &block)
-    s = sheet(sheet_id, false) || default_sheet
-    s.process(&block)
+  # Use this form of import for the common case of having a raw CSV or HTML string.
+  def import_string(string, options = {}, &block)
+    # Get a format here if needed
+    if options[:format].nil?
+      if @custom_reader
+        format = :custom
+      else
+        format = string.include?('<table') && string.include?('</tr>') ? :html : :csv
+      end
+      options[:format] = format
+    end
+    # Do the import, converting the string to a stream
+    import(StringIO.new(string), options, &block)
+  end
+  # Call with a block accepting a single Importer::Row with contents that
+  # look like :column_key => <parsed value>.  Any filtered rows
+  # will not be present.  If you want to register an error, simply
+  # raise "some text" and it will be added to the importer's error
+  # list for display to the user, logging, or whatever.
+  def process
+    @data.rows.each do |row|
+      begin
+        yield row
+      rescue Exception => e
+        add_error(row, e.to_s)
+      end
+    end
   end
-  def add_error(context, msg = nil)
-    if context.is_a?(String) && msg.nil?
-      msg = context
-      context = nil
+  def on_error(&block)
+    raise 'Invalid block passed to Importer#on_error: block may accept 0, 1 or 2 arguments' if block.arity > 2
+    if has_errors?
+      case block.arity
+      when 0 then DslProxy.exec(self, &block)
+      when 1 then DslProxy.exec(self, @errors, &block)
+      when 2 then DslProxy.exec(self, @errors, error_summary, &block)
+      end
     end
-    @errors << Error.new(context, msg)
+    self
   end
+  # Process the raw values for the first rows in a sheet,
+  # and attempt to build a map of the column layout, and
+  # detect the first row of real data
+  def find_header(raw_rows)
+    if headerless?
+      # Use implicit or explicit column position when told to not look for a header
+      next_index = 0
+      @columns.each do |col|
+        unless col.position.nil?
+          next_index = col.fixed_index
+        end
+        col.data.index = next_index
+        next_index += 1
+      end
+      @data.start_row = @start_row || 1
+      @missing_headers = nil
+      return true
+    else
+      # Match by testing
+      missing = nil
+      raw_rows.each_with_index do |row, i|
+        # Um, have data?
+        next unless row
+        # Set up for this iteration
+        remaining = @columns.dup
+        # Step through this row's raw values, and look for a matching column for all columns
+        row.each_with_index do |val, i|
+          col = remaining.detect {|c| c.match_header?(val.to_s, i) }
+          if col
+            remaining -= [col]
+            col.data.index = i
+          end
+        end
+        if remaining.empty?
+          # Found all columns, have a map, update our start row to be the next line and return!
+          @data.start_row = @start_row || i+2
+          @missing_headers = nil
+          return true
+        else
+          missing = remaining if (missing.nil? || missing.count > remaining.count)
+        end
+      end
+      # If we get here, we're hosed
+      @missing_headers = missing.collect(&:key) if @missing_headers.nil? || @missing_headers.count > missing.count
+      false
+    end
+  end
+  # Add a new row to our stash, parsing/filtering/validating as we go!
+  def add_row(line, raw_data)
+    # Gracefully handle custom parsers that return nil for a row's data
+    raw_data ||= []
+    # Add the row
+    row = Row.new(self, line)
+    # Parse out the values
+    values = {}
+    @columns.each do |col|
+      index = col.data.index
+      raw_val = raw_data[index]
+      if col.parse
+        # Use custom parser if this row has one
+        val = col.parse_value(row, raw_val)
+      else
+        # Otherwise use our standard parser
+        val = @reader.parse_value(raw_val, col.type)
+      end
+      values[col.key] = val
+    end
+    # Set the values and filter if needed
+    row.set_values(values)
+    return nil if @filter && !@filter.call(row)
+    # Row is desired, now validate values
+    @columns.each do |col|
+      val = values[col.key]
+      col.validate_value(row, val)
+    end
+    # We is good
+    @data.rows << row
+    row
+  end
+  # When true, one or more errors have been recorded during this import/process
+  # cycle.
   def has_errors?
     @errors.any?
   end
-  def add_warning(context, msg)
+  # Add an error to our error list.  Will result in a failed import.
+  def add_error(context, msg = nil)
     if context.is_a?(String) && msg.nil?
       msg = context
       context = nil
     end
-    @warnings << Error.new(context, msg)
-  end
-  def has_warnings?
-    @warnings.any?
+    @errors << Error.new(context, msg)
   end
-  # Returns a human-readable summary of the errors present on the importer
+  # Returns a human-readable summary of the errors present on the importer, or
+  # nil if no errors are present
   def error_summary
+    # Simple case
     return nil unless has_errors?
-    @errors.collect(&:summary).list_join(', ')
+    # Group by error text - we often get the same error dozens of times
+    list = {}
+    @errors.each do |err|
+      errs = list[err.text] || []
+      errs << err
+      list[err.text] = errs
+    end
+    # Build summary & return
+    list.values.collect do |errs|
+      summary = errs.first.summary
+      if errs.count == 1
+        summary
+      else
+        errs.count.to_s + ' x ' + summary
+      end
+    end.list_join(', ')
+  end
+  # After calling #import, you can dump the final values for each row
+  # as an array of hashes.  Useful in debugging!  For general processing,
+  # use #process or the block form of #import instead.
+  def to_a
+    @data.rows.collect(&:values)
   end
   protected
   def reset
     @errors = []
-    @warnings = []
-    @sheets.values.each(&:reset)
+    @missing_headers = nil
+    @format = nil
+    @reader = nil
+    @data = Data.new
   end
 end

data/lib/iron/import/row.rb CHANGED

@@ -2,12 +2,12 @@ class Importer
   class Row
-    attr_reader :sheet, :line, :values
+    attr_reader :line, :values
-    def initialize(sheet, line, value_hash = nil)
-      @sheet = sheet
+    def initialize(importer, line, value_hash = nil)
+      @importer = importer
       @line = line
-      @values = value_hash
+      set_values(value_hash)
     end
     def set_values(value_hash)
@@ -15,8 +15,9 @@ class Importer
     end
     # True when all columns have a non-nil value, useful in filtering out junk
-    # rows
+    # rows.  Pass in one or more keys to check only those keys for presence.
     def all?(*keys)
+      keys.flatten!
       if keys.any?
         # Check only the specified keys
         valid = true
@@ -33,25 +34,28 @@ class Importer
       end
     end
+    # True when all row columns have nil values.
     def empty?
       @values.values.all?(&:nil?)
     end
-    # Returns the value of a column
+    # Returns the value of a column.
     def [](column_key)
       @values[column_key]
     end
+    # The row's name, e.g. 'Row 4'
     def to_s
       "Row #{@line}"
     end
-    def add_error(msg)
-      @sheet.importer.add_error(self, msg)
+    # This row's values as a hash of :column_key => <parsed + validated value>
+    def to_hash
+      @values.dup
     end
-    def add_warning(msg)
-      @sheet.importer.add_warning(self, msg)
+    def add_error(msg)
+      @importer.add_error(self, msg)
     end
   end