RubyGems - iron-import - Versions diffs - 0.5.0 → 0.6.0 - Mend

iron-import 0.5.0 → 0.6.0

Files changed (18) hide show

checksums.yaml +4 -4
data/History.txt +6 -0
data/README.rdoc +2 -2
data/Version.txt +1 -1
data/lib/iron/import/column.rb +35 -14
data/lib/iron/import/csv_reader.rb +26 -11
data/lib/iron/import/custom_reader.rb +39 -0
data/lib/iron/import/data_reader.rb +98 -12
data/lib/iron/import/importer.rb +58 -21
data/lib/iron/import/sheet.rb +74 -9
data/lib/iron/import/xls_reader.rb +25 -39
data/lib/iron/import/xlsx_reader.rb +25 -38
data/lib/iron/import.rb +1 -0
data/lib/iron-import.rb +1 -0
data/spec/importer/custom_reader_spec.rb +46 -0
data/spec/importer/data_reader_spec.rb +1 -1
data/spec/samples/icd10-custom.txt +4 -0
metadata +12 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 70c4748d780e9854cbd60622563b74d3b7ce2b5c
-  data.tar.gz: d6503f0f7a08b4c88da5813b3114446baf1fff1a
+  metadata.gz: 04d666ea1e0170b0186d75fc8b0ec367a16e528a
+  data.tar.gz: 9dad576e17b7d8fc4b523ffe6b4e53c85feae9c8
 SHA512:
-  metadata.gz: 488a0e4b2d8ed83914bb2a6c907358ee584c0849f26bf9e64d6cc4bd8c2296997e4bc580f59b3bff4db6fa699a6abf94f5a85cd31c1585f03f728523025529a3
-  data.tar.gz: 00c6e27cf433423c9c1cc14828c11cd895459b0c12e86aa57ec65b35b049b0b7939dda98edd3359aa4dab8af945b0a17b9ef5b7fc486300edeb8b987b21d65dd
+  metadata.gz: e5a31e81381d78c29da480b296a8e9569ed415a32f50610881014de52a7b92c925687d3ee4ba683bf64a50562b26bf5785acf09070017710938d39c52f0087ad
+  data.tar.gz: d29901644886a98c617dd215b52edec0e8c011875ae2c0d6724c8f0f03c26bfdb9ea8028a3322c9cf1d16f91d9010950203c73ff0fc0485ce0bfc09ffc53e6f2

data/History.txt CHANGED Viewed

@@ -1,3 +1,9 @@
+== 0.6.0 / 2015-08-17
+* Refactored readers to allow for custom format reading
+* Vastly improved internal and user-facing comments
+* Improved error logging, replaced some exceptions with errors
 == 0.5.0 / 2015-02-XX
 * Initial revision

data/README.rdoc CHANGED Viewed

@@ -25,7 +25,7 @@ any warnings and errors encountered... well, this is the library for you!
 IMPORTANT NOTE: this gem is in flux as we work to define the best possible abstraction
 for the task.  Breaking changes will be noted by increases in the second-level version,
-ie 0.5.0 and 0.5.1 will be compatible, but 0.6.0 will not.
+ie 0.5.0 and 0.5.1 will be compatible, but 0.6.0 will not (i.e. we follow semantic versioning).
 == SAMPLE USAGE
@@ -65,6 +65,6 @@ RVM users can skip the sudo:
 Then use
-    require 'iron/import'
+    require 'iron-import'
 to require the library code.

data/Version.txt CHANGED Viewed

	@@ -1 +1 @@
1	- 0.5.0
1	+ 0.6.0

data/lib/iron/import/column.rb CHANGED Viewed

@@ -24,13 +24,14 @@ class Importer
   #       # Instead of a type, you can set an explicit parse block.  Be aware
   #       # that different source types may give you different raw values for what
   #       # seems like the "same" source value, for example an Excel source file
-  #       # will give you a float value for all numeric types, even "integers"
+  #       # will give you a float value for all numeric types, even "integers".
   #       parse do |raw_value|
   #         raw_value.to_i + 1000
   #       end
   #
   #       # You can also add a custom validator to check the value and add
-  #       # an error if it's not within a given range, or whatever:
+  #       # an error if it's not within a given range, or whatever.  To fail validation,
+  #       # simply raise the error you wish recorded.
   #       validate do |parsed_value|
   #         raise "Out of range" unless (parsed_value > 0 && parsed_value < 5000)
   #       end
@@ -83,50 +84,60 @@ class Importer
       str = chars[index] + str
       str
     end
-    def initialize(sheet, key)
+    # Create a new column definition, with the owning sheet, the key for the column,
+    # and an optional set of options.  The options supported are the same as those supported
+    # in block/builder mode.
+    def initialize(sheet, key, options_hash = {})
       # Save off our info
       @key = key
       @sheet = sheet
       @importer = @sheet.importer
       # Return it as a string, by default
-      @type = :string
+      @type = options_hash.delete(:type) { :string }
       # By default, we allow empty values
-      @required = false
+      @required = options_hash.delete(:required) { false }
       # Position can be explicitly set
-      @position = nil
+      @position = options_hash.delete(:position)
       # By default, don't parse incoming data, just pass it through
-      @parse = nil
+      @parse = options_hash.delete(:parse)
       # Default matcher, looks for the presence of the column key as text anywhere
       # in the header string, ignoring case and using underscores as spaces, ie
       # :order_id => /\A\s*order id\s*\z/i
-      @header = Regexp.new('\A\s*' + key.to_s.gsub('_', ' ') + '\s*\z', Regexp::IGNORECASE)
+      @header = options_hash.delete(:header) {
+        Regexp.new('\A\s*' + key.to_s.gsub('_', ' ') + '\s*\z', Regexp::IGNORECASE)
+      }
       # Reset our state to pre-load status
       reset
     end
+    # Customize ourselves using block syntax
     def build(&block)
       DslProxy.exec(self, &block)
     end
+    # Deletes all stored data in prep for an import run
     def reset
       @data = Data.new
     end
-    # When true, matches either the passed value or the index (if position has been explicitly set)
+    # When true, our header definition or index match the passed text or column index.
     def match_header?(text, index)
-      res = index == self.fixed_index || (@header && !@header.match(text).nil?)
-      # puts "#{@header.inspect} ~ #{text.inspect} => #{res.inspect}"
-      res
+      return true if index == self.fixed_index
+      if @header.is_a?(Regexp)
+        return !@header.match(text).nil?
+      else
+        return @header.to_s.downcase == text
+      end
     end
-    # Use any custom parser defined to process the given value, capturing
+    # Applies any custom parser defined to process the given value, capturing
     # errors as needed
     def parse_value(row, val)
       return val if @parse.nil?
@@ -138,6 +149,7 @@ class Importer
       end
     end
+    # Applies any validation to a parsed value
     def validate_value(row, val)
       return unless @validate
       begin
@@ -149,6 +161,9 @@ class Importer
       end
     end
+    # Returns the fixed index of this column based on the set position.
+    # In other words, a position of 2 would return an index of 1 (as
+    # indices are 0-based), where a position of 'C' would return 2.
     def fixed_index
       return nil unless @position
       if @position.is_a?(Fixnum)
@@ -158,14 +173,20 @@ class Importer
       end
     end
+    # Pretty name for ourselves
     def to_s
       'Column ' + @data.pos
     end
+    # Extracts the sheet's values for this column and returns them in an array.
+    # Note that the array indices ARE NOT row indices, as the rows may have been
+    # filtered and any header rows have been skipped.
     def to_a
       @sheet.data.rows.collect {|r| r[@key] }
     end
+    # Extracts the sheet's values for this column and returns them in a hash of
+    # row num => value for all non-filtered, non-header rows.
     def to_h
       res = {}
       @sheet.data.rows.collect {|r| res[r.num] = r[@key] }

data/lib/iron/import/csv_reader.rb CHANGED Viewed

@@ -6,19 +6,34 @@ class Importer
     def initialize(importer)
       super(importer, :csv)
-    end
-    def load_stream(stream)
-      text = stream.read
-      encoding = @importer.encoding || 'UTF-8'
-      raw_rows = CSV.parse(text, :encoding => "#{encoding}:UTF-8")
-      @importer.default_sheet.parse_raw_data(raw_rows)
+      supports_file!
+      supports_stream!
     end
-    def load_file(path)
-      encoding = @importer.encoding || 'UTF-8'
-      raw_rows = CSV.read(path, :encoding => "#{encoding}:UTF-8")
-      @importer.default_sheet.parse_raw_data(raw_rows)
+    def init_source(mode, source)
+      if mode == :stream
+        # For streams, we just read 'em in and parse 'em
+        text = source.read
+        encoding = @importer.encoding || 'UTF-8'
+        @raw_rows = CSV.parse(text, :encoding => "#{encoding}:UTF-8")
+        true
+      elsif mode == :file
+        # Files have a different path
+        encoding = @importer.encoding || 'UTF-8'
+        @raw_rows = CSV.read(source, :encoding => "#{encoding}:UTF-8")
+        true
+      else
+        @importer.add_error("Unsupported CSV mode: #{mode}")
+        false
+      end
+    end
+    # Normally, we'd check the key and return the proper data, but for CSV files,
+    # there's only one "sheet"
+    def load_raw_sheet(key)
+      @raw_rows
     end
   end

data/lib/iron/import/custom_reader.rb ADDED Viewed

@@ -0,0 +1,39 @@
+class Importer
+  # Special data reader that allows you to define a block to do the import yourself for cases
+  # where you have an odd text-based format or something else you want to be able to process
+  # using this gem.  Check out Importer#on_file and Importer#on_stream to see how to use
+  # this reader type.
+  class CustomReader < DataReader
+    attr_accessor :readers
+    def initialize(importer)
+      super(importer, :custom)
+      @readers = {}
+    end
+    # Called by the importer to add a handler for the given mode
+    def set_reader(mode, block)
+      @readers[mode] = block
+      @supports << mode
+    end
+    def init_source(mode, source)
+      @mode = mode
+      @source = source
+    end
+    def load_raw_sheet(sheet)
+      reader = @readers[@mode]
+      reader.call(@source, sheet)
+    rescue Exception => e
+      # Catch any exceptions thrown and note them with helpful stacktrace info for debugging custom readers
+      @importer.add_error("Error in custom reader when loading sheet #{sheet}: #{e} @ #{e.backtrace.first}")
+      false
+    end
+  end
+end

data/lib/iron/import/data_reader.rb CHANGED Viewed

@@ -14,6 +14,24 @@ class Importer
       end
     end
+    # Implement our automatic reader selection, based on the import source
+    def self.for_source(importer, source)
+      data = nil
+      if is_stream?(source)
+        data = DataReader::for_stream(importer, source)
+        unless data
+          importer.add_error("Unable to find format handler for stream")
+        end
+      else
+        data = DataReader::for_path(importer, source)
+        unless data
+          importer.add_error("Unable to find format handler for file #{source}")
+        end
+      end
+      data
+    end
+    # Factory method to build a reader from an explicit format selector
     def self.for_format(importer, format)
       case format
       when :csv
@@ -29,6 +47,7 @@ class Importer
       end
     end
+    # Figure out which format to use for a given path based on file name
     def self.for_path(importer, path)
       format = path.to_s.extract(/\.(csv|xlsx?)\z/i)
       if format
@@ -39,11 +58,19 @@ class Importer
       end
     end
+    # Figure out which format to use based on a stream's source file info
     def self.for_stream(importer, stream)
       path = path_from_stream(stream)
       for_path(importer, path)
     end
+    # Attempt to determine if the given source is a stream
+    def self.is_stream?(source)
+      # For now, just assume anything that has a #read method is a stream, in
+      # duck-type fashion
+      source.respond_to?(:read)
+    end
     # Try to find the original file name for the given stream,
     # as in the case where a file is uploaded to Rails and we're dealing with an
     # ActionDispatch::Http::UploadedFile.
@@ -60,16 +87,40 @@ class Importer
     def initialize(importer, format)
       @importer = importer
       @format = format
-      @multisheet = true
+      @supports = []
     end
+    def supports_stream!
+      @supports << :stream
+    end
+    def supports_file!
+      @supports << :file
+    end
+    def supports?(mode)
+      @supports.include?(mode)
+    end
+    def supports_file?
+      supports?(:file)
+    end
+    def supports_stream?
+      supports?(:stream)
+    end
+    # Core data reader method.  Takes a given input source (either a stream or
+    # a file path) and attempts to load it.  Returns true if successful, false
+    # if not.  If false, there will be one or more errors explaining what went
+    # wrong.
     def load(path_or_stream)
       # Figure out what we've been passed, and handle it
-      if path_or_stream.respond_to?(:read)
+      if self.class.is_stream?(path_or_stream)
         # We have a stream (open file, upload, whatever)
-        if respond_to?(:load_stream)
+        if supports_stream?
           # Stream loader defined, run it
-          load_stream(path_or_stream)
+          load_sheets(:stream, path_or_stream)
         else
           # Write to temp file, as some of our readers only read physical files, annoyingly
           file = Tempfile.new(['importer', ".#{format}"])
@@ -77,7 +128,7 @@ class Importer
           begin
             file.write path_or_stream.read
             file.close
-            load_file(file.path)
+            load_sheets(:file, file.path)
           ensure
             file.close
             file.unlink
@@ -86,23 +137,58 @@ class Importer
       elsif path_or_stream.is_a?(String)
         # Assume it's a path
-        if respond_to?(:load_file)
-          # We're all set, load up the given path
-          load_file(path_or_stream)
+        if File.exist?(path_or_stream)
+          if supports_file?
+            # We're all set, load up the given path
+            load_sheets(:file, path_or_stream)
+          else
+            # No file handler, so open the file and run the stream processor
+            file = File.open(path_or_stream, 'rb')
+            load_sheets(:stream, file)
+          end
         else
-          # No file handler, so open the file and run the stream processor
-          file = File.open(path_or_stream, 'rb')
-          load_stream(file)
+          @importer.add_error("Unable to locate source file #{path_or_stream}")
         end
       else
-        raise "Unable to load data: #{path_or_stream.inspect}"
+        @importer.add_error("Unable to load data source - not a file path or stream: #{path_or_stream.inspect}")
       end
       # Return our status
       !@importer.has_errors?
     end
+    # Load up the sheets in the correct mode
+    def load_sheets(mode, source)
+      # Let our derived classes open the file, etc. as they need
+      if init_source(mode, source)
+        # Once the source is set, run through each defined sheet, pass it to
+        # our sheet loader, and have the sheet parse it out.
+        @importer.sheets.values.each do |sheet|
+          res = load_raw_sheet(sheet)
+          if res === false
+            # D'oh.
+          else
+            # Tell the sheet to parse the data
+            sheet.parse_raw_data(res)
+          end
+        end
+      end
+    end
+    # Override this method in derived classes to set up
+    # the given source in the given mode
+    def init_source(mode, source)
+      raise "Unimplemented method #init_source in data reader #{self.class.name}"
+    end
+    # Override this method in derived classes to take the given sheet definition,
+    # find that sheet in the input source, and read out the raw (unparsed) rows
+    # as an array of arrays.  Return false if the sheet cannot be loaded.
+    def load_raw_sheet(sheet)
+      raise "Unimplemented method #load_raw_sheet in data reader #{self.class.name}"
+    end
     # Provides default value parsing/coercion for all derived data readers.  Attempts to be clever and
     # handle edge cases like converting '5.00' to 5 when in integer mode, etc.  If you find your inputs aren't
     # being parsed correctly, add a custom #parse block on your Column definition.

data/lib/iron/import/importer.rb CHANGED Viewed

@@ -33,8 +33,9 @@
 class Importer
   # Array of error message or nil for each non-header row
-  attr_accessor :errors, :warnings, :data
+  attr_accessor :errors, :warnings
   attr_accessor :sheets
+  attr_reader :data, :custom_reader
   # Source file/stream encoding, assumes UTF-8 if none specified
   dsl_accessor :encoding
@@ -51,16 +52,34 @@ class Importer
     reset
   end
+  # Takes a block, and sets self to be importer instance, so you can
+  # just call #column, #sheet, etc. directly.
   def build(&block)
     DslProxy.exec(self, &block) if block
     self
   end
-  def default_sheet
-    sheet(1)
+  # For the common case where there is only one "sheet", e.g. CSV files.
+  def default_sheet(&block)
+    sheet(1, true, &block)
   end
-  # Access a Sheet definition by id (either number (1-N) or sheet name)
+  # Access a Sheet definition by id (either number (1-N) or sheet name).
+  # Used during #build calls to define a sheet with a passed block, like so:
+  #
+  #   Importer.build do
+  #     sheet(1) do
+  #       column :store_name
+  #       column :store_address
+  #     end
+  #     sheet('Orders') do
+  #       column :id
+  #       column :price
+  #       filter do |row|
+  #         row[:price].present?
+  #       end
+  #     end
+  #   end
   def sheet(id, create=true, &block)
     # Find the sheet, creating it if needed (and requested!)
     if @sheets[id].nil?
@@ -78,18 +97,40 @@ class Importer
     # Return the sheet
     sheet
   end
+  # Define a custom file reader to implement your own sheet parsing.
+  def on_file(&block)
+    @custom_reader = CustomReader.new(self) unless @custom_reader
+    @custom_reader.set_reader(:file, block)
+  end
+  def on_stream(&block)
+    @custom_reader = CustomReader.new(self) unless @custom_reader
+    @custom_reader.set_reader(:stream, block)
+  end
   # Very, very commonly we only want to deal with the default sheet.  In this case,
   # let folks skip the sheet(n) do ... end block wrapper and just define columns
-  # against the main importer.  Internally, proxy those calls to the first sheet
+  # against the main importer.  Internally, proxy those calls to the first sheet.
   def column(*args, &block)
     default_sheet.column(*args, &block)
   end
+  # Ditto for filters
   def filter(*args, &block)
     default_sheet.filter(*args, &block)
   end
+  # Ditto for start row too
+  def start_row(row_num)
+    default_sheet.start_row(row_num)
+  end
+  # More facading
+  def headerless!
+    default_sheet.headerless!
+  end
   # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
   # validate the required values, run custom validations... basically pre-parse and
   # massage the supplied data.  It will return true on success, or false if one
@@ -113,27 +154,23 @@ class Importer
     reset
     # Get the reader for this format
-    format = options.delete(:format)
-    if format && format != :auto
+    default = @custom_reader ? :custom : :auto
+    format = options.delete(:format) { default }
+    if format == :custom
+      # Custom format selected, use our internal custom reader
+      @data = @custom_reader
+    elsif format && format != :auto
+      # Explicit format requested
       @data = DataReader::for_format(self, format)
-      unless reader
+      unless @data
         add_error("Unable to find format handler for format #{format} - aborting")
         return
       end
     else
-      if path_or_stream.respond_to?(:read)
-        @data = DataReader::for_stream(self, path_or_stream)
-        unless @data
-          add_error("Unable to find format handler for stream - aborting")
-          return
-        end
-      else
-        @data = DataReader::for_path(self, path_or_stream)
-        unless @data
-          add_error("Unable to find format handler for file #{path_or_stream} - aborting")
-          return
-        end
-      end
+      # Auto select
+      @data = DataReader::for_source(self, path_or_stream)
     end
     # Read in the data!

data/lib/iron/import/sheet.rb CHANGED Viewed

@@ -1,7 +1,39 @@
 class Importer
   # The Sheet class handles building the sheet's column configuration and other
-  # setup, then holds all load-time row data.
+  # setup, then holds all load-time row data.  In some file types (Excel mostly)
+  # there may be more than one sheet definition in a given importer.  In others,
+  # the default sheet is the only one (possibly implicitly) defined.
+  #
+  # The following builder options are available:
+  #
+  #   Importer.build do
+  #     sheet('Some Sheet Name') do
+  #       # Don't try to look for a header using column definitions, there is no header
+  #       headerless!
+  #
+  #       # Manually set the start row for data in this sheet, defaults to nil
+  #       # indicating that the data rows start immediately following the header.
+  #       start_row 4
+  #
+  #       # Define a filter that will skip unneeded rows.  The filter command takes
+  #       # a block that receives the parsed (but not validated!) row data as an
+  #       # associative hash of :col_key => <parsed value>, and returns
+  #       # true to keep the row or false to exclude it.
+  #       filter do |row|
+  #         row[:id].to_i > 5000
+  #       end
+  #
+  #       # Of course, the main thing to do in a sheet is define columns.  See the
+  #       # Column class' notes for options when defining a column.  Note that
+  #       # you can define columns using either hash-style:
+  #       column :id, :type => :integer
+  #       # or builder-style:
+  #       column :name do
+  #         header /company\s*name/
+  #         type :string
+  #       end
+  #    end
   class Sheet
     # Inner class for holding load-time data that gets reset on each load call
@@ -37,10 +69,16 @@ class Importer
       reset
     end
+    # Define our columns etc. via builder-style method calling
     def build(&block)
       DslProxy.exec(self, &block)
     end
+    # Call with a block accepting a single Importer::Row with contents that
+    # look like :column_key => <parsed value>.  Any filtered rows
+    # will not be present.  If you want to register an error, simply
+    # raise "some text" and it will be added to the importer's error
+    # list for display to the user, logging, or whatever.
     def process
       @data.rows.each do |row|
         begin
@@ -51,13 +89,33 @@ class Importer
       end
     end
-    def column(key, &block)
+    # Add a new column definition to our list, allows customizing the new
+    # column with a builder block.  See Importer::Column docs for
+    # options.  In lieu of a builder mode, you can pass the same values
+    # as key => value pairs in the options hash to this method, so:
+    #
+    #   column(:foo) do
+    #     type :string
+    #     parse do |val|
+    #       val.to_s.upcase
+    #     end
+    #   end
+    #
+    # Is equivalent to:
+    #
+    #   column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
+    #
+    # Use whichever you prefer!
+    def column(key, options_hash = {}, &block)
+      # Find existing column with key to allow re-opening an existing definition
       col = @columns.detect {|c| c.key == key }
       unless col
-        col = Column.new(self, key)
+        # if none found, add a new one
+        col = Column.new(self, key, options_hash)
         @columns << col
       end
+      # Customize if needed
       DslProxy::exec(col, &block) if block
       col
@@ -73,9 +131,9 @@ class Importer
       if parse_header(raw_rows)
         # Now, run all the data and add it as a Row instance
         raw_rows.each_with_index do |raw, index|
-          line = index + 1
-          if line >= @data.start_row
-            add_row(line, raw)
+          row_num = index + 1
+          if row_num >= @data.start_row
+            add_row(row_num, raw)
           end
         end
       end
@@ -128,8 +186,8 @@ class Importer
         # Use implicit or explicit column position when told to not look for a header
         next_index = 0
         @columns.each do |col|
-          if col.index.present?
-            next_index = col.index
+          unless col.position.nil?
+            next_index = col.fixed_index
           end
           col.data.index = next_index
           next_index += 1
@@ -140,6 +198,9 @@ class Importer
       else
         # Match by testing
         raw_rows.each_with_index do |row, i|
+          # Um, have data?
+          next unless row
           # Set up for this iteration
           remaining = @columns.dup
@@ -165,11 +226,13 @@ class Importer
       end
     end
+    # When true, the given sheet name or zero-based index
+    # is a match with our id.
     def match_sheet?(name, index)
       if @id.is_a?(Fixnum)
         @id.to_i == index+1
       else
-        @id.to_s == name
+        @id.to_s.downcase == name.downcase
       end
     end
@@ -177,6 +240,8 @@ class Importer
       "Sheet #{@id}"
     end
+    # Return all parsed, filtered data in the sheet as an
+    # array of arrays.
     def dump
       @data.rows.collect(&:values)
     end

data/lib/iron/import/xls_reader.rb CHANGED Viewed

@@ -6,55 +6,41 @@ class Importer
       super(importer, :xlsx)
     end
-    def load_file(path)
-      spreadsheet = Roo::Excel.new(path, :file_warning => :ignore)
-      if spreadsheet
-        # Get our list of sheet definitions, and run all the sheets in the spreadsheet
-        remaining_sheets = @importer.sheets.values
-        spreadsheet.sheets.each_with_index do |name, index|
-          # Look for a sheet definition that matches this sheet's name/index
-          sheet = remaining_sheets.detect {|s| s.match_sheet?(name, index) }
-          if sheet
-            # Remove from our list of remaining sheets
-            remaining_sheets.delete(sheet)
-            # Extract our raw data
-            raw_rows = []
-            spreadsheet.sheet(name).each_with_index do |row, line|
-              raw_rows << row
-            end
-            # Let the sheet sort it out
-            sheet.parse_raw_data(raw_rows)
-          end
-        end
-        return true
+    def init_source(mode, source)
+      if mode == :file
+        @spreadsheet = Roo::Excel.new(source, :file_warning => :ignore)
+        true
       else
-        @importer.add_error("Unable to read Excel file at path #{path}")
-        return false
+        @importer.add_error("Unsupported XLS mode: #{mode}")
+        false
       end
     rescue Exception => e
-      @importer.add_error("Error reading file #{path}: #{e}")
+      @importer.add_error("Error reading file #{source}: #{e}")
       false
     end
-    private
-    def load_raw_rows(sheet, raw_rows)
-      # Figure out where our columns are and where our data starts
-      column_map = sheet.find_header(raw_rows[0...5])
-      start_row = sheet.data.start_row
-      # Run all the raw rows and convert them to Row instances, making notes of errors along the way...
-      if !@importer.has_errors?
-        raw_rows.each_with_index do |raw, index|
-          line = index + 1
-          if line >= start_row
-            row = sheet.add_row(line, raw)
+    def load_raw_sheet(sheet)
+      @spreadsheet.sheets.each_with_index do |name, index|
+        # See if this sheet's name or index matches the requested sheet definition
+        if sheet.match_sheet?(name, index)
+          # Extract our raw data
+          raw_rows = []
+          @spreadsheet.sheet(name).each_with_index do |row, line|
+            raw_rows << row
           end
+          return raw_rows
         end
       end
+      # This is not good.
+      @importer.add_error("Unable to find sheet #{sheet}")
+      return false
+    rescue Exception => e
+      # Not sure why we'd get here, but we strive for error-freedom here, yessir.
+      @importer.add_error("Error loading sheet #{sheet}: #{e}")
+      false
     end
   end
 end

data/lib/iron/import/xlsx_reader.rb CHANGED Viewed

@@ -1,58 +1,45 @@
 class Importer
+  # Uses the Roo gem to read in .xlsx files
   class XlsxReader < DataReader
     def initialize(importer)
       super(importer, :xlsx)
+      supports_file!
     end
-    def load_file(path)
-      spreadsheet = Roo::Excelx.new(path, :file_warning => :ignore)
-      if spreadsheet
-        # Get our list of sheet definitions, and run all the sheets in the spreadsheet
-        remaining_sheets = @importer.sheets.values
-        spreadsheet.sheets.each_with_index do |name, index|
-          # Look for a sheet definition that matches this sheet's name/index
-          sheet = remaining_sheets.detect {|s| s.match_sheet?(name, index) }
-          if sheet
-            # Remove from our list of remaining sheets
-            remaining_sheets.delete(sheet)
-            # Extract our raw data
-            raw_rows = []
-            spreadsheet.sheet(name).each_with_index do |row, line|
-              raw_rows << row
-            end
-            # Let the sheet sort it out
-            sheet.parse_raw_data(raw_rows)
-          end
-        end
-        return true
+    def init_source(mode, source)
+      if mode == :file
+        @spreadsheet = Roo::Excelx.new(source, :file_warning => :ignore)
+        true
       else
-        @importer.add_error("Unable to read ExcelX file at path #{path}")
-        return false
+        @importer.add_error("Unsupported XLSX mode: #{mode}")
+        false
       end
     rescue Exception => e
-      @importer.add_error("Error reading file #{path}: #{e} @ #{e.backtrace.first}")
+      @importer.add_error("Error reading file #{source}: #{e}")
       false
     end
-    private
-    def load_raw_rows(sheet, raw_rows)
-      # Figure out where our columns are and where our data starts
-      column_map = sheet.find_header(raw_rows[0...5])
-      start_row = sheet.data.start_row
-      # Run all the raw rows and convert them to Row instances, making notes of errors along the way...
-      if !@importer.has_errors?
-        raw_rows.each_with_index do |raw, index|
-          line = index + 1
-          if line >= start_row
-            row = sheet.add_row(line, raw)
+    def load_raw_sheet(sheet)
+      @spreadsheet.sheets.each_with_index do |name, index|
+        # See if this sheet's name or index matches the requested sheet definition
+        if sheet.match_sheet?(name, index)
+          # Extract our raw data
+          raw_rows = []
+          @spreadsheet.sheet(name).each_with_index do |row, line|
+            raw_rows << row
           end
+          return raw_rows
         end
       end
+      @importer.add_error("Unable to find sheet #{sheet}")
+      return false
+    rescue Exception => e
+      # Not sure why we'd get here, but we strive for error-freedom here, yessir.
+      @importer.add_error("Error loading sheet #{sheet}: #{e}")
+      false
     end
   end

data/lib/iron/import.rb CHANGED Viewed

@@ -11,4 +11,5 @@ require_relative 'import/data_reader'
 require_relative 'import/csv_reader'
 require_relative 'import/xls_reader'
 require_relative 'import/xlsx_reader'
+require_relative 'import/custom_reader'
 require_relative 'import/importer'

data/lib/iron-import.rb ADDED Viewed

	@@ -0,0 +1 @@
1	+ require 'iron/import'

data/spec/importer/custom_reader_spec.rb ADDED Viewed

@@ -0,0 +1,46 @@
+describe Importer::CustomReader do
+  before do
+    @importer = Importer.new
+  end
+  it 'should set up correctly for on_file handling' do
+    @importer.custom_reader.should be_nil
+    @importer.build do
+      headerless!
+      on_file do |source, sheet|
+        []
+      end
+    end
+    @importer.custom_reader.should be_an(Importer::CustomReader)
+    @importer.custom_reader.should be_supports_file
+    @importer.custom_reader.should_not be_supports_stream
+  end
+  it 'should load the ICD10 test document' do
+    importer = Importer.build do
+      headerless!
+      column :code do
+        required!
+      end
+      column :desc do
+        required!
+      end
+      on_file do |source, sheet|
+        File.readlines(source).collect do |line|
+          line.extract(/([A-TV-Z][0-9][A-Z0-9]{1,5})\s+(.*)/)
+        end
+      end
+    end
+    importer.import(SpecHelper.sample_path('icd10-custom.txt'))
+    importer.error_summary.should be_nil
+    importer.default_sheet.dump.should == [
+      {:code => 'A000', :desc => 'Cholera due to Vibrio cholerae 01, biovar cholerae'},
+      {:code => 'A001', :desc => 'Cholera due to Vibrio cholerae 01, biovar eltor'},
+      {:code => 'A009', :desc => 'Cholera, unspecified'},
+      {:code => 'A0100', :desc => 'Typhoid fever, unspecified'}
+    ]
+  end
+end

data/spec/importer/data_reader_spec.rb CHANGED Viewed

@@ -87,7 +87,7 @@ describe Importer::DataReader do
   end
   it 'should build an instance based on stream' do
-    Importer::DataReader.for_stream(@importer, mock(original_filename: "nanodrop.xlsx", content_type: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")).should be_a(Importer::XlsxReader)
+    Importer::DataReader.for_stream(@importer, double(original_filename: "nanodrop.xlsx", content_type: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")).should be_a(Importer::XlsxReader)
   end
 end

data/spec/samples/icd10-custom.txt ADDED Viewed

@@ -0,0 +1,4 @@
+A000    Cholera due to Vibrio cholerae 01, biovar cholerae
+A001    Cholera due to Vibrio cholerae 01, biovar eltor
+A009    Cholera, unspecified
+A0100   Typhoid fever, unspecified

metadata CHANGED Viewed

@@ -1,20 +1,23 @@
 --- !ruby/object:Gem::Specification
 name: iron-import
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.6.0
 platform: ruby
 authors:
 - Rob Morris
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-03-19 00:00:00.000000000 Z
+date: 2015-08-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: iron-extensions
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.2.1
   type: :runtime
@@ -22,6 +25,9 @@ dependencies:
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+    - - ">="
       - !ruby/object:Gem::Version
         version: 1.2.1
 - !ruby/object:Gem::Dependency
@@ -80,9 +86,11 @@ files:
 - LICENSE
 - README.rdoc
 - Version.txt
+- lib/iron-import.rb
 - lib/iron/import.rb
 - lib/iron/import/column.rb
 - lib/iron/import/csv_reader.rb
+- lib/iron/import/custom_reader.rb
 - lib/iron/import/data_reader.rb
 - lib/iron/import/error.rb
 - lib/iron/import/importer.rb
@@ -92,11 +100,13 @@ files:
 - lib/iron/import/xlsx_reader.rb
 - spec/importer/column_spec.rb
 - spec/importer/csv_reader_spec.rb
+- spec/importer/custom_reader_spec.rb
 - spec/importer/data_reader_spec.rb
 - spec/importer/importer_spec.rb
 - spec/importer/row_spec.rb
 - spec/importer/sheet_spec.rb
 - spec/importer/xlsx_reader_spec.rb
+- spec/samples/icd10-custom.txt
 - spec/samples/nanodrop.xlsx
 - spec/samples/simple.csv
 - spec/samples/test-products.xls