RubyGems - iron-import - Versions diffs - 0.5.0 - Mend

iron-import 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +7 -0
data/.rspec +1 -0
data/History.txt +12 -0
data/LICENSE +20 -0
data/README.rdoc +70 -0
data/Version.txt +1 -0
data/lib/iron/import/column.rb +177 -0
data/lib/iron/import/csv_reader.rb +26 -0
data/lib/iron/import/data_reader.rb +176 -0
data/lib/iron/import/error.rb +66 -0
data/lib/iron/import/importer.rb +188 -0
data/lib/iron/import/row.rb +59 -0
data/lib/iron/import/sheet.rb +186 -0
data/lib/iron/import/xls_reader.rb +60 -0
data/lib/iron/import/xlsx_reader.rb +60 -0
data/lib/iron/import.rb +14 -0
data/spec/importer/column_spec.rb +116 -0
data/spec/importer/csv_reader_spec.rb +31 -0
data/spec/importer/data_reader_spec.rb +93 -0
data/spec/importer/importer_spec.rb +28 -0
data/spec/importer/row_spec.rb +37 -0
data/spec/importer/sheet_spec.rb +65 -0
data/spec/importer/xlsx_reader_spec.rb +35 -0
data/spec/samples/nanodrop.xlsx +0 -0
data/spec/samples/simple.csv +4 -0
data/spec/samples/test-products.xls +0 -0
data/spec/spec_helper.rb +21 -0
metadata +128 -0

data/lib/iron/import/importer.rb ADDED Viewed

@@ -0,0 +1,188 @@
+# Implements the entry-point for our importing system.  To use, construct
+# an importer using the builder syntax (examples below), then run one or more
+# files or streams through the import system.
+#
+# Constructing a simple importer:
+#
+#   importer = Importer.build do
+#     column :order_number
+#     column :date
+#     column :amount
+#   end
+#
+# To use this importer simply call:
+#
+#   if importer.import('/path/to/file.xls')
+#     importer.process do |row|
+#       puts "Order #{row[:order_number]}: #{row[:amount]} on #{row[:date]}"
+#     end
+#   end
+#
+# If needed, calling row.all? inside the process block will verify that the row passed contains a value for all defined columns.
+#
+# A more realistic and complex example follows:
+#
+#   importer = Importer.build do
+#     column :order_number do
+#       match /order (num.*|id)/i
+#     end
+#     column :date
+#     column :amount
+#   end
+#
+class Importer
+  # Array of error message or nil for each non-header row
+  attr_accessor :errors, :warnings, :data
+  attr_accessor :sheets
+  # Source file/stream encoding, assumes UTF-8 if none specified
+  dsl_accessor :encoding
+  def self.build(options = {}, &block)
+    importer = Importer.new(options)
+    importer.build(&block)
+    importer
+  end
+  def initialize(options = {})
+    @encoding = 'UTF-8'
+    @sheets = {}
+    reset
+  end
+  def build(&block)
+    DslProxy.exec(self, &block) if block
+    self
+  end
+  def default_sheet
+    sheet(1)
+  end
+  # Access a Sheet definition by id (either number (1-N) or sheet name)
+  def sheet(id, create=true, &block)
+    # Find the sheet, creating it if needed (and requested!)
+    if @sheets[id].nil?
+      if create
+        @sheets[id] = Sheet.new(self, id)
+      else
+        return nil
+      end
+    end
+    sheet = @sheets[id]
+    # Allow customization by DSL block if requested
+    sheet.build(&block) if block
+    # Return the sheet
+    sheet
+  end
+  # Very, very commonly we only want to deal with the default sheet.  In this case,
+  # let folks skip the sheet(n) do ... end block wrapper and just define columns
+  # against the main importer.  Internally, proxy those calls to the first sheet
+  def column(*args, &block)
+    default_sheet.column(*args, &block)
+  end
+  def filter(*args, &block)
+    default_sheet.filter(*args, &block)
+  end
+  # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
+  # validate the required values, run custom validations... basically pre-parse and
+  # massage the supplied data.  It will return true on success, or false if one
+  # or more errors were encountered and the import failed.
+  #
+  # You may supply various options for the import using the options hash.  Supported
+  # options include:
+  #
+  #   format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied
+  #           source as the specified format, or auto-detects if set to :auto
+  #   encoding: source encoding override, defaults to guessing based on input
+  #
+  # Generally, you should be able to throw a source at it and it should work.  The
+  # options exist to allow overriding in cases where the automation heuristics
+  # have failed and the input type is known by the caller.
+  #
+  # After #import has completed successfully, you can process the resulting data
+  # using #process or extract the raw data by calling #to_hash or #sheet(num).to_a
+  def import(path_or_stream, options = {})
+    # Clear all our load-time state, including all rows, header locations... you name it
+    reset
+    # Get the reader for this format
+    format = options.delete(:format)
+    if format && format != :auto
+      @data = DataReader::for_format(self, format)
+      unless reader
+        add_error("Unable to find format handler for format #{format} - aborting")
+        return
+      end
+    else
+      if path_or_stream.respond_to?(:read)
+        @data = DataReader::for_stream(self, path_or_stream)
+        unless @data
+          add_error("Unable to find format handler for stream - aborting")
+          return
+        end
+      else
+        @data = DataReader::for_path(self, path_or_stream)
+        unless @data
+          add_error("Unable to find format handler for file #{path_or_stream} - aborting")
+          return
+        end
+      end
+    end
+    # Read in the data!
+    @data.load(path_or_stream)
+  end
+  # Process a specific sheet, or the default sheet if none is provided.  Your
+  # passed block will be handed one Row at a time.
+  def process(sheet_id = nil, &block)
+    s = sheet(sheet_id, false) || default_sheet
+    s.process(&block)
+  end
+  def add_error(context, msg = nil)
+    if context.is_a?(String) && msg.nil?
+      msg = context
+      context = nil
+    end
+    @errors << Error.new(context, msg)
+  end
+  def has_errors?
+    @errors.any?
+  end
+  def add_warning(context, msg)
+    if context.is_a?(String) && msg.nil?
+      msg = context
+      context = nil
+    end
+    @warnings << Error.new(context, msg)
+  end
+  def has_warnings?
+    @warnings.any?
+  end
+  # Returns a human-readable summary of the errors present on the importer
+  def error_summary
+    return nil unless has_errors?
+    @errors.collect(&:summary).list_join(', ')
+  end
+  protected
+  def reset
+    @errors = []
+    @warnings = []
+    @sheets.values.each(&:reset)
+  end
+end

data/lib/iron/import/row.rb ADDED Viewed

@@ -0,0 +1,59 @@
+class Importer
+  class Row
+    attr_reader :sheet, :line, :values
+    def initialize(sheet, line, value_hash = nil)
+      @sheet = sheet
+      @line = line
+      @values = value_hash
+    end
+    def set_values(value_hash)
+      @values = value_hash
+    end
+    # True when all columns have a non-nil value, useful in filtering out junk
+    # rows
+    def all?(*keys)
+      if keys.any?
+        # Check only the specified keys
+        valid = true
+        keys.each do |key|
+          unless @values.has_key?(key)
+            raise "Unknown column key :#{key} in call to Row#all?"
+          end
+          valid = valid && !@values[key].nil?
+        end
+        valid
+      else
+        # Check all value keys
+        @values.values.all? {|v| !v.nil? }
+      end
+    end
+    def empty?
+      @values.values.all?(&:nil?)
+    end
+    # Returns the value of a column
+    def [](column_key)
+      @values[column_key]
+    end
+    def to_s
+      "Row #{@line}"
+    end
+    def add_error(msg)
+      @sheet.importer.add_error(self, msg)
+    end
+    def add_warning(msg)
+      @sheet.importer.add_warning(self, msg)
+    end
+  end
+end

data/lib/iron/import/sheet.rb ADDED Viewed

@@ -0,0 +1,186 @@
+class Importer
+  # The Sheet class handles building the sheet's column configuration and other
+  # setup, then holds all load-time row data.
+  class Sheet
+    # Inner class for holding load-time data that gets reset on each load call
+    class Data
+      attr_accessor :start_row, :rows
+      def initialize
+        @start_row = nil
+        @rows = []
+      end
+    end
+    # Key data
+    attr_reader :importer
+    attr_reader :columns
+    attr_reader :data
+    # Settings
+    dsl_flag :headerless
+    dsl_accessor :id
+    dsl_accessor :start_row
+    dsl_accessor :filter
+    def initialize(importer, id)
+      @importer = importer
+      @id = id
+      @headerless = false
+      @start_row = nil
+      @filter = nil
+      @columns = []
+      reset
+    end
+    def build(&block)
+      DslProxy.exec(self, &block)
+    end
+    def process
+      @data.rows.each do |row|
+        begin
+          yield row
+        rescue Exception => e
+          @importer.add_error(row, e.to_s)
+        end
+      end
+    end
+    def column(key, &block)
+      col = @columns.detect {|c| c.key == key }
+      unless col
+        col = Column.new(self, key)
+        @columns << col
+      end
+      DslProxy::exec(col, &block) if block
+      col
+    end
+    # Reset for load attempt
+    def reset
+      @data = Data.new
+    end
+    def parse_raw_data(raw_rows)
+      # Find our column layout, start of data, etc
+      if parse_header(raw_rows)
+        # Now, run all the data and add it as a Row instance
+        raw_rows.each_with_index do |raw, index|
+          line = index + 1
+          if line >= @data.start_row
+            add_row(line, raw)
+          end
+        end
+      end
+    end
+    # Add a new row to our stash
+    def add_row(line, raw_data)
+      # Add the row
+      row = Row.new(self, line)
+      # Parse out the values
+      values = {}
+      @columns.each do |col|
+        index = col.data.index
+        raw_val = raw_data[index]
+        if col.parse
+          val = col.parse_value(row, raw_val)
+        else
+          val = @importer.data.parse_value(raw_val, col.type)
+        end
+        values[col.key] = val
+      end
+      # Set the values and filter if needed
+      row.set_values(values)
+      return nil unless !@filter || @filter.call(row)
+      # Row is solid, now check for missing required vals
+      @columns.each do |col|
+        val = values[col.key]
+        if col.validate_value(row, val)
+          if col.required?
+            if values[col.key].nil?
+              @importer.add_error(row, "Missing required value for #{col}")
+            end
+          end
+        end
+      end
+      # We is good
+      @data.rows << row
+      row
+    end
+    # Process the raw values for the first rows in a sheet,
+    # and attempt to build a map of the column layout, and
+    # detect the first row of real data
+    def parse_header(raw_rows)
+      if headerless?
+        # Use implicit or explicit column position when told to not look for a header
+        next_index = 0
+        @columns.each do |col|
+          if col.index.present?
+            next_index = col.index
+          end
+          col.data.index = next_index
+          next_index += 1
+        end
+        @data.start_row = @start_row || 1
+        return true
+      else
+        # Match by testing
+        raw_rows.each_with_index do |row, i|
+          # Set up for this iteration
+          remaining = @columns.dup
+          # Step through this row's raw values, and look for a matching column for all columns
+          row.each_with_index do |val, i|
+            col = remaining.detect {|c| c.match_header?(val.to_s, i) }
+            if col
+              remaining -= [col]
+              col.data.index = i
+            end
+          end
+          if remaining.empty?
+            # Found the cols, have a map, update our start row to be the next line and return!
+            @data.start_row = @start_row || i+2
+            return true
+          end
+        end
+        # If we get here, we're hosed
+        @importer.add_error(self, "Unable to locate required column header(s) in sheet")
+        false
+      end
+    end
+    def match_sheet?(name, index)
+      if @id.is_a?(Fixnum)
+        @id.to_i == index+1
+      else
+        @id.to_s == name
+      end
+    end
+    def to_s
+      "Sheet #{@id}"
+    end
+    def dump
+      @data.rows.collect(&:values)
+    end
+  end
+end

data/lib/iron/import/xls_reader.rb ADDED Viewed

@@ -0,0 +1,60 @@
+class Importer
+  class XlsReader < DataReader
+    def initialize(importer)
+      super(importer, :xlsx)
+    end
+    def load_file(path)
+      spreadsheet = Roo::Excel.new(path, :file_warning => :ignore)
+      if spreadsheet
+        # Get our list of sheet definitions, and run all the sheets in the spreadsheet
+        remaining_sheets = @importer.sheets.values
+        spreadsheet.sheets.each_with_index do |name, index|
+          # Look for a sheet definition that matches this sheet's name/index
+          sheet = remaining_sheets.detect {|s| s.match_sheet?(name, index) }
+          if sheet
+            # Remove from our list of remaining sheets
+            remaining_sheets.delete(sheet)
+            # Extract our raw data
+            raw_rows = []
+            spreadsheet.sheet(name).each_with_index do |row, line|
+              raw_rows << row
+            end
+            # Let the sheet sort it out
+            sheet.parse_raw_data(raw_rows)
+          end
+        end
+        return true
+      else
+        @importer.add_error("Unable to read Excel file at path #{path}")
+        return false
+      end
+    rescue Exception => e
+      @importer.add_error("Error reading file #{path}: #{e}")
+      false
+    end
+    private
+    def load_raw_rows(sheet, raw_rows)
+      # Figure out where our columns are and where our data starts
+      column_map = sheet.find_header(raw_rows[0...5])
+      start_row = sheet.data.start_row
+      # Run all the raw rows and convert them to Row instances, making notes of errors along the way...
+      if !@importer.has_errors?
+        raw_rows.each_with_index do |raw, index|
+          line = index + 1
+          if line >= start_row
+            row = sheet.add_row(line, raw)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/iron/import/xlsx_reader.rb ADDED Viewed

@@ -0,0 +1,60 @@
+class Importer
+  class XlsxReader < DataReader
+    def initialize(importer)
+      super(importer, :xlsx)
+    end
+    def load_file(path)
+      spreadsheet = Roo::Excelx.new(path, :file_warning => :ignore)
+      if spreadsheet
+        # Get our list of sheet definitions, and run all the sheets in the spreadsheet
+        remaining_sheets = @importer.sheets.values
+        spreadsheet.sheets.each_with_index do |name, index|
+          # Look for a sheet definition that matches this sheet's name/index
+          sheet = remaining_sheets.detect {|s| s.match_sheet?(name, index) }
+          if sheet
+            # Remove from our list of remaining sheets
+            remaining_sheets.delete(sheet)
+            # Extract our raw data
+            raw_rows = []
+            spreadsheet.sheet(name).each_with_index do |row, line|
+              raw_rows << row
+            end
+            # Let the sheet sort it out
+            sheet.parse_raw_data(raw_rows)
+          end
+        end
+        return true
+      else
+        @importer.add_error("Unable to read ExcelX file at path #{path}")
+        return false
+      end
+    rescue Exception => e
+      @importer.add_error("Error reading file #{path}: #{e} @ #{e.backtrace.first}")
+      false
+    end
+    private
+    def load_raw_rows(sheet, raw_rows)
+      # Figure out where our columns are and where our data starts
+      column_map = sheet.find_header(raw_rows[0...5])
+      start_row = sheet.data.start_row
+      # Run all the raw rows and convert them to Row instances, making notes of errors along the way...
+      if !@importer.has_errors?
+        raw_rows.each_with_index do |raw, index|
+          line = index + 1
+          if line >= start_row
+            row = sheet.add_row(line, raw)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/iron/import.rb ADDED Viewed

@@ -0,0 +1,14 @@
+# Dependencies
+require 'iron/extensions'
+require 'iron/dsl'
+# Include required classes
+require_relative 'import/column'
+require_relative 'import/sheet'
+require_relative 'import/row'
+require_relative 'import/error'
+require_relative 'import/data_reader'
+require_relative 'import/csv_reader'
+require_relative 'import/xls_reader'
+require_relative 'import/xlsx_reader'
+require_relative 'import/importer'

data/spec/importer/column_spec.rb ADDED Viewed

@@ -0,0 +1,116 @@
+describe Importer::Column do
+  before do
+    @importer = Importer.new
+    @sheet = @importer.default_sheet
+    @col = Importer::Column.new(@sheet, :test)
+    @row = Importer::Row.new(@sheet, 1)
+  end
+  it 'should respond to build' do
+    @col.should respond_to(:build)
+    @col.build do
+      required!
+    end
+    @col.required?.should be_true
+  end
+  it 'should convert position strings to indexes' do
+    {
+      'A' => 0,
+      'C' => 2,
+      'AA' => 26,
+      'BAA' => 2*26*26 + 26
+    }.each_pair do |pos, index|
+      Importer::Column.pos_to_index(pos).should == index
+    end
+  end
+  it 'should convert position ints to position codes' do
+    {
+      0 => 'A',
+      25 => 'Z',
+      26 => 'AA',
+      2*26*26 + 26 + 3 => 'BAD'
+    }.each_pair do |index, pos|
+      Importer::Column.index_to_pos(index).should == pos
+    end
+  end
+  it 'should accept both int and string positions, and convert them to an index' do
+    {
+      'A' => 0,
+      5 => 4,
+      'Z' => 25
+    }.each_pair do |pos, index|
+      @col.position = pos
+      @col.fixed_index.should == index
+    end
+  end
+  it 'should put a pretty output on conversion to string' do
+    @col.data.index = 3
+    @col.to_s.should == 'Column D'
+  end
+  it 'should match by key by default' do
+    ['Test', 'test', '  TEST  '].each do |header|
+      @col.match_header?(header, 888).should be_true
+    end
+    ['', nil, 'Foo', 'Testy'].each do |header|
+      @col.match_header?(header, 888).should be_false
+    end
+  end
+  it 'should default to string type' do
+    @col.type.should == :string
+  end
+  it 'should match by position if position is specified' do
+    @col.position 'B'
+    @col.match_header?('junk', 1).should be_true
+    @col.match_header?('junk', 2).should be_false
+  end
+  it 'should match custom header matchers' do
+    {
+      /(alpha|beta)/ => { 'alphabet' => true, 'beta  X' => true, 'gamma' => false },
+      /^test.$/i => { 'Testy' => true, 'test?' => true, 'notest' => false }
+    }.each_pair do |matcher, tests|
+      @col.header matcher
+      tests.each_pair do |val, res|
+        @col.match_header?(val, 1234).should == res
+      end
+    end
+  end
+  it 'should properly apply custom parsers' do
+    @col.parse_value(@row, 5).should == 5
+    @col.parse do |raw|
+      raw.to_i + 2
+    end
+    @col.parse_value(@row, 5).should == 7
+  end
+  it 'should record exceptions during parsing as errors' do
+    @col.parse do |raw|
+      raise 'nope'
+    end
+    @importer.has_errors?.should be_false
+    @col.parse_value(@row, 5).should be_nil
+    @importer.has_errors?.should be_true
+  end
+  it 'should allow custom validation' do
+    @col.validate do |val|
+      raise 'nope' if val != 5
+    end
+    @importer.has_errors?.should be_false
+    @col.validate_value(@row, 5).should be_true
+    @importer.has_errors?.should be_false
+    @col.validate_value(@row, 4).should be_false
+    @importer.has_errors?.should be_true
+  end
+end

data/spec/importer/csv_reader_spec.rb ADDED Viewed

@@ -0,0 +1,31 @@
+describe Importer::CsvReader do
+  before do
+    @importer = Importer.new
+    @reader = Importer::CsvReader.new(@importer)
+  end
+  it 'should load our simple CSV data' do
+    importer = Importer.build do
+      column :number do
+        type :integer
+      end
+      column :string do
+        type :string
+      end
+      column :date do
+        type :date
+      end
+      column :cost do
+        type :cents
+      end
+    end
+    importer.import(SpecHelper.sample_path('simple.csv')).should be_true
+    importer.default_sheet.dump.should == [
+      {:number => 123, :string => 'Abc', :date => Date.new(1977,5,13), :cost => 899},
+      {:number => nil, :string => nil, :date => nil, :cost => nil},
+      {:number => 5, :string => 'String with end spaces', :date => Date.new(2004,2,1), :cost => 1000}
+    ]
+  end
+end