RubyGems - smarter_csv - Versions diffs - 1.4.0 → 1.5.1 - Mend

smarter_csv 1.4.0 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

checksums.yaml +4 -4
data/.gitignore +2 -0
data/CHANGELOG.md +24 -2
data/CONTRIBUTORS.md +46 -0
data/LICENSE.txt +1 -1
data/README.md +53 -69
data/Rakefile +8 -15
data/lib/smarter_csv/smarter_csv.rb +168 -112
data/lib/smarter_csv/version.rb +1 -1
data/lib/smarter_csv.rb +8 -0
data/smarter_csv.gemspec +1 -0
data/spec/fixtures/additional_separator.csv +6 -0
data/spec/fixtures/duplicate_headers.csv +1 -1
data/spec/fixtures/hard_sample.csv +2 -0
data/spec/smarter_csv/additional_separator_spec.rb +45 -0
data/spec/smarter_csv/binary_file2_spec.rb +1 -1
data/spec/smarter_csv/carriage_return_spec.rb +27 -7
data/spec/smarter_csv/column_separator_spec.rb +7 -1
data/spec/smarter_csv/duplicate_headers_spec.rb +76 -0
data/spec/smarter_csv/hard_sample_spec.rb +24 -0
data/spec/smarter_csv/ignore_comments_spec.rb +45 -30
data/spec/smarter_csv/invalid_headers_spec.rb +8 -22
data/spec/smarter_csv/no_header_spec.rb +16 -11
metadata +28 -3

data/lib/smarter_csv/smarter_csv.rb CHANGED Viewed

@@ -5,116 +5,37 @@ module SmarterCSV
   class DuplicateHeaders < SmarterCSVException; end
   class MissingHeaders < SmarterCSVException; end
   class NoColSepDetected < SmarterCSVException; end
+  class KeyMappingError < SmarterCSVException; end
-  def SmarterCSV.process(input, options={}, &block)   # first parameter: filename or input object with readline method
-    default_options = {:col_sep => ',', :row_sep => $INPUT_RECORD_SEPARATOR, :quote_char => '"', :force_simple_split => false , :verbose => false ,
-      :remove_empty_values => true, :remove_zero_values => false , :remove_values_matching => nil , :remove_empty_hashes => true , :strip_whitespace => true,
-      :convert_values_to_numeric => true, :strip_chars_from_headers => nil , :user_provided_headers => nil , :headers_in_file => true,
-      :comment_regexp => /\A#/, :chunk_size => nil , :key_mapping_hash => nil , :downcase_header => true, :strings_as_keys => false, :file_encoding => 'utf-8',
-      :remove_unmapped_keys => false, :keep_original_headers => false, :value_converters => nil, :skip_lines => nil, :force_utf8 => false, :invalid_byte_sequence => '',
-      :auto_row_sep_chars => 500, :required_headers => nil
-    }
+  # first parameter: filename or input object which responds to readline method
+  def SmarterCSV.process(input, options={}, &block)
     options = default_options.merge(options)
     options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
-    csv_options = options.select{|k,v| [:col_sep, :row_sep, :quote_char].include?(k)} # options.slice(:col_sep, :row_sep, :quote_char)
     headerA = []
     result = []
-    old_row_sep = $INPUT_RECORD_SEPARATOR
-    file_line_count = 0
-    csv_line_count = 0
+    @file_line_count = 0
+    @csv_line_count = 0
     has_rails = !! defined?(Rails)
     begin
       f = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
+      # auto-detect the row separator
+      options[:row_sep] = SmarterCSV.guess_line_ending(f, options) if options[:row_sep].to_sym == :auto
       # attempt to auto-detect column separator
-      options[:col_sep] = guess_column_separator(f) if options[:col_sep] == 'auto'
+      options[:col_sep] = guess_column_separator(f, options) if options[:col_sep].to_sym == :auto
+      # preserve options, in case we need to call the CSV class
+      csv_options = options.select{|k,v| [:col_sep, :row_sep, :quote_char].include?(k)} # options.slice(:col_sep, :row_sep, :quote_char)
+      csv_options.delete(:row_sep) if [nil, :auto].include?( options[:row_sep].to_sym )
+      csv_options.delete(:col_sep) if [nil, :auto].include?( options[:col_sep].to_sym )
       if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && ( f.respond_to?(:external_encoding) && f.external_encoding != Encoding.find('UTF-8') || f.respond_to?(:encoding) && f.encoding != Encoding.find('UTF-8') )
         puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
       end
-      if options[:row_sep] == :auto
-        options[:row_sep] = line_ending = SmarterCSV.guess_line_ending( f, options )
-        f.rewind
-      end
-      $INPUT_RECORD_SEPARATOR = options[:row_sep]
-      if options[:skip_lines].to_i > 0
-        options[:skip_lines].to_i.times{f.readline}
-      end
+      options[:skip_lines].to_i.times{f.readline(options[:row_sep])} if options[:skip_lines].to_i > 0
-      if options[:headers_in_file]        # extract the header line
-        # process the header line in the CSV file..
-        # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
-        header = f.readline
-        header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
-        header = header.sub(options[:comment_regexp],'').chomp(options[:row_sep])
-        file_line_count += 1
-        csv_line_count += 1
-        header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
-        if (header =~ %r{#{options[:quote_char]}}) and (! options[:force_simple_split])
-          file_headerA = begin
-            CSV.parse( header, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
-          rescue CSV::MalformedCSVError => e
-            raise $!, "#{$!} [SmarterCSV: csv line #{csv_line_count}]", $!.backtrace
-          end
-        else
-          file_headerA =  header.split(options[:col_sep])
-        end
-        file_header_size = file_headerA.size # before mapping, which could delete keys
-        file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/,'') }
-        file_headerA.map!{|x| x.strip}  if options[:strip_whitespace]
-        unless options[:keep_original_headers]
-          file_headerA.map!{|x| x.gsub(/\s+|-+/,'_')}
-          file_headerA.map!{|x| x.downcase }   if options[:downcase_header]
-        end
-      else
-        raise SmarterCSV::IncorrectOption , "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" if options[:user_provided_headers].nil?
-      end
-      if options[:user_provided_headers] && options[:user_provided_headers].class == Array && ! options[:user_provided_headers].empty?
-        # use user-provided headers
-        headerA = options[:user_provided_headers]
-        if defined?(file_header_size) && ! file_header_size.nil?
-          if headerA.size != file_header_size
-            raise SmarterCSV::HeaderSizeMismatch , "ERROR: :user_provided_headers defines #{headerA.size} headers !=  CSV-file #{input} has #{file_header_size} headers"
-          else
-            # we could print out the mapping of file_headerA to headerA here
-          end
-        end
-      else
-        headerA = file_headerA
-      end
-      header_size = headerA.size
-      headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
-      unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
-        key_mappingH = options[:key_mapping]
-        # do some key mapping on the keys in the file header
-        #   if you want to completely delete a key, then map it to nil or to ''
-        if ! key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
-          headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
-        end
-      end
-      # header_validations
-      duplicate_headers = []
-      headerA.compact.each do |k|
-        duplicate_headers << k if headerA.select{|x| x == k}.size > 1
-      end
-      raise SmarterCSV::DuplicateHeaders , "ERROR: duplicate headers: #{duplicate_headers.join(',')}" unless duplicate_headers.empty?
-      if options[:required_headers] && options[:required_headers].is_a?(Array)
-        missing_headers = []
-        options[:required_headers].each do |k|
-          missing_headers << k unless headerA.include?(k)
-        end
-        raise SmarterCSV::MissingHeaders , "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
-      end
+      headerA, header_size = process_headers(f, options, csv_options)
       # in case we use chunking.. we'll need to set it up..
       if ! options[:chunk_size].nil? && options[:chunk_size].to_i > 0
@@ -128,41 +49,42 @@ module SmarterCSV
       # now on to processing all the rest of the lines in the CSV file:
       while ! f.eof?    # we can't use f.readlines() here, because this would read the whole file into memory at once, and eof => true
-        line = f.readline  # read one line.. this uses the input_record_separator $INPUT_RECORD_SEPARATOR which we set previously!
+        line = f.readline(options[:row_sep])  # read one line
+        @file_line_count += 1
+        @csv_line_count += 1
         # replace invalid byte sequence in UTF-8 with question mark to avoid errors
         line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
-        file_line_count += 1
-        csv_line_count += 1
-        print "processing file line %10d, csv line %10d\r" % [file_line_count, csv_line_count] if options[:verbose]
-        next  if  line =~ options[:comment_regexp]  # ignore all comment lines if there are any
+        print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]
+        next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
         # cater for the quoted csv data containing the row separator carriage return character
         # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
         # by detecting the existence of an uneven number of quote characters
-        multiline = line.count(options[:quote_char])%2 == 1
-        while line.count(options[:quote_char])%2 == 1
-          next_line = f.readline
+        multiline = line.count(options[:quote_char])%2 == 1 # should handle quote_char nil
+        while line.count(options[:quote_char])%2 == 1 # should handle quote_char nil
+          next_line = f.readline(options[:row_sep])
           next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
           line += next_line
-          file_line_count += 1
+          @file_line_count += 1
         end
-        print "\nline contains uneven number of quote chars so including content through file line %d\n" % file_line_count if options[:verbose] && multiline
+        print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline
-        line.chomp!    # will use $INPUT_RECORD_SEPARATOR which is set to options[:col_sep]
+        line.chomp!(options[:row_sep])
         if (line =~ %r{#{options[:quote_char]}}) and (! options[:force_simple_split])
           dataA = begin
             CSV.parse( line, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
           rescue CSV::MalformedCSVError => e
-            raise $!, "#{$!} [SmarterCSV: csv line #{csv_line_count}]", $!.backtrace
+            raise $!, "#{$!} [SmarterCSV: csv line #{@csv_line_count}]", $!.backtrace
           end
         else
-          dataA =  line.split(options[:col_sep], header_size)
+          dataA = line.split(options[:col_sep], header_size)
         end
-####     dataA.map!{|x| x.gsub(%r/#{options[:quote_char]}/,'') }  # this is actually not a good idea as a default
-        dataA.map!{|x| x.strip}  if options[:strip_whitespace]
+        dataA.map!{|x| x.sub(/(#{options[:col_sep]})+\z/, '')} # remove any unwanted trailing col_sep characters at the end
+        dataA.map!{|x| x.strip} if options[:strip_whitespace]
         # if all values are blank, then ignore this line
         # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
@@ -257,7 +179,6 @@ module SmarterCSV
         chunk = []  # initialize for next chunk of data
       end
     ensure
-      $INPUT_RECORD_SEPARATOR = old_row_sep   # make sure this stupid global variable is always reset to it's previous value after we're done!
       f.close if f.respond_to?(:close)
     end
     if block_given?
@@ -269,6 +190,40 @@ module SmarterCSV
   private
+  def self.default_options
+    {
+      auto_row_sep_chars: 500,
+      chunk_size: nil ,
+      col_sep: ',',
+      comment_regexp: nil, # was: /\A#/,
+      convert_values_to_numeric: true,
+      downcase_header: true,
+      duplicate_header_suffix: nil,
+      file_encoding: 'utf-8',
+      force_simple_split: false ,
+      force_utf8: false,
+      headers_in_file: true,
+      invalid_byte_sequence: '',
+      keep_original_headers: false,
+      key_mapping_hash: nil ,
+      quote_char: '"',
+      remove_empty_hashes: true ,
+      remove_empty_values: true,
+      remove_unmapped_keys: false,
+      remove_values_matching: nil,
+      remove_zero_values: false,
+      required_headers: nil,
+      row_sep: $INPUT_RECORD_SEPARATOR,
+      skip_lines: nil,
+      strings_as_keys: false,
+      strip_chars_from_headers: nil,
+      strip_whitespace: true,
+      user_provided_headers: nil,
+      value_converters: nil,
+      verbose: false,
+    }
+  end
   def self.blank?(value)
     case value
     when Array
@@ -304,11 +259,11 @@ module SmarterCSV
   end
   # raise exception if none is found
-  def self.guess_column_separator(filehandle)
+  def self.guess_column_separator(filehandle, options)
     del = [',', "\t", ';', ':', '|']
     n = Hash.new(0)
     5.times do
-      line = filehandle.readline
+      line = filehandle.readline(options[:row_sep])
       del.each do |d|
         n[d] += line.scan(d).count
       end
@@ -347,9 +302,110 @@ module SmarterCSV
       lines += 1
       break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
     end
+    filehandle.rewind
     counts["\r"] += 1 if last_char == "\r"
     # find the key/value pair with the largest counter:
     k,_ = counts.max_by{|_,v| v}
     return k                    # the most frequent one is it
   end
+  def self.process_headers(filehandle, options, csv_options)
+    if options[:headers_in_file]        # extract the header line
+      # process the header line in the CSV file..
+      # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
+      header = filehandle.readline(options[:row_sep])
+      @file_line_count += 1
+      @csv_line_count += 1
+      header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
+      header = header.sub(options[:comment_regexp],'') if options[:comment_regexp]
+      header = header.chomp(options[:row_sep])
+      header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
+      if (header =~ %r{#{options[:quote_char]}}) and (! options[:force_simple_split])
+        file_headerA = begin
+          CSV.parse( header, **csv_options ).flatten.collect!{|x| x.nil? ? '' : x} # to deal with nil values from CSV.parse
+        rescue CSV::MalformedCSVError => e
+          raise $!, "#{$!} [SmarterCSV: csv line #{@csv_line_count}]", $!.backtrace
+        end
+      else
+        file_headerA =  header.split(options[:col_sep])
+      end
+      file_header_size = file_headerA.size # before mapping, which could delete keys
+      file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/,'') }
+      file_headerA.map!{|x| x.strip}  if options[:strip_whitespace]
+      unless options[:keep_original_headers]
+        file_headerA.map!{|x| x.gsub(/\s+|-+/,'_')}
+        file_headerA.map!{|x| x.downcase }   if options[:downcase_header]
+      end
+    else
+      raise SmarterCSV::IncorrectOption , "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
+    end
+    if options[:user_provided_headers] && options[:user_provided_headers].class == Array && ! options[:user_provided_headers].empty?
+      # use user-provided headers
+      headerA = options[:user_provided_headers]
+      if defined?(file_header_size) && ! file_header_size.nil?
+        if headerA.size != file_header_size
+          raise SmarterCSV::HeaderSizeMismatch , "ERROR: :user_provided_headers defines #{headerA.size} headers !=  CSV-file #{input} has #{file_header_size} headers"
+        else
+          # we could print out the mapping of file_headerA to headerA here
+        end
+      end
+    else
+      headerA = file_headerA
+    end
+    # detect duplicate headers and disambiguate
+    headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
+    header_size = headerA.size # used for splitting lines
+    headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
+    unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
+      key_mappingH = options[:key_mapping]
+      # do some key mapping on the keys in the file header
+      #   if you want to completely delete a key, then map it to nil or to ''
+      if ! key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
+        # we can't map keys that are not there
+        raise SmarterCSV::KeyMappingError unless (key_mappingH.keys - headerA).empty?
+        headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
+      end
+    end
+    # header_validations
+    duplicate_headers = []
+    headerA.compact.each do |k|
+      duplicate_headers << k if headerA.select{|x| x == k}.size > 1
+    end
+    raise SmarterCSV::DuplicateHeaders , "ERROR: duplicate headers: #{duplicate_headers.join(',')}" unless duplicate_headers.empty?
+    if options[:required_headers] && options[:required_headers].is_a?(Array)
+      missing_headers = []
+      options[:required_headers].each do |k|
+        missing_headers << k unless headerA.include?(k)
+      end
+      raise SmarterCSV::MissingHeaders , "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
+    end
+    [headerA, header_size]
+  end
+  def self.process_duplicate_headers(headers, options)
+    counts = Hash.new(0)
+    result = []
+    headers.each do |key|
+      counts[key] += 1
+      if counts[key] == 1
+        result << key
+      else
+        result << [key, options[:duplicate_header_suffix], counts[key]].join
+      end
+    end
+    result
+  end
 end

data/lib/smarter_csv/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module SmarterCSV
-  VERSION = "1.4.0"
+  VERSION = "1.5.1"
 end

data/lib/smarter_csv.rb CHANGED Viewed

@@ -1,3 +1,11 @@
+if ENV['COVERAGE'] # coverage tooling is only loaded when COVERAGE is set in the environment
+  require 'simplecov'
+  SimpleCov.start do # started before the library files below are required
+    add_filter "/spec/" # presumably keeps spec files out of the report -- confirm against SimpleCov docs
+    add_filter "/pkg/" # presumably keeps packaged gem output out of the report -- confirm
+  end
+end
 require 'csv'
 require "smarter_csv/version"
 require "extensions/hash.rb"

data/smarter_csv.gemspec CHANGED Viewed

@@ -18,6 +18,7 @@ Gem::Specification.new do |spec|
   spec.require_paths = ["lib"]
   spec.requirements  = ['csv'] # for CSV.parse() only needed in case we have quoted fields
   spec.add_development_dependency "rspec"
+  spec.add_development_dependency "simplecov"
   #  spec.add_development_dependency "guard-rspec"
   spec.metadata["homepage_uri"] = spec.homepage

data/spec/fixtures/additional_separator.csv ADDED Viewed

@@ -0,0 +1,6 @@
+col1,col2
+eins,zwei
+uno,dos,
+one,two ,,,
+ichi, ,,,,,
+un

data/spec/fixtures/duplicate_headers.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 email,firstname,lastname,email,age
 tom@bla.com,Tom,Sawyer,mike@bla.com,34
-eri@bla.com,Eri Chan,tom@bla.com,21
+eri@bla.com,Eri,Chan,tom@bla.com,21

data/spec/fixtures/hard_sample.csv ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ Name,Email,Financial Status,Paid at,Fulfillment Status,Fulfilled at,Accepts Marketing,Currency,Subtotal,Shipping,Taxes,Total,Discount Code,Discount Amount,Shipping Method,Created at,Lineitem quantity,Lineitem name,Lineitem price,Lineitem compare at price,Lineitem sku,Lineitem requires shipping,Lineitem taxable,Lineitem fulfillment status,Billing Name,Billing Street,Billing Address1,Billing Address2,Billing Company,Billing City,Billing Zip,Billing Province,Billing Country,Billing Phone,Shipping Name,Shipping Street,Shipping Address1,Shipping Address2,Shipping Company,Shipping City,Shipping Zip,Shipping Province,Shipping Country,Shipping Phone,Notes,Note Attributes,Cancelled at,Payment Method,Payment Reference,Refunded Amount,Vendor, rece,Tags,Risk Level,Source,Lineitem discount,Tax 1 Name,Tax 1 Value,Tax 2 Name,Tax 2 Value,Tax 3 Name,Tax 3 Value,Tax 4 Name,Tax 4 Value,Tax 5 Name,Tax 5 Value,Phone,Receipt Number,Duties,Billing Province Name,Shipping Province Name,Payment ID,Payment Terms Name,Next Payment Due At
2	+ #MR1220817,foo@bar.com,paid,2022-02-08 22:31:28 +0100,unfulfilled,,yes,EUR,144,0,24,144,VIP,119.6,"Livraison Standard GRATUITE, 2-5 jours avec suivi",2022-02-08 22:31:26 +0100,2,Cire Épilation Nacrée,37,,WAX-200-NAC,true,true,pending,French Fry,64 Boulevard Budgié,64 Boulevard Budgié,,,dootdoot’,'49100,,FR,06 12 34 56 78,French Fry,64 Boulevard Budgi,64 Boulevard Budgié,,,dootdoot,'49100,,FR,06 12 34 56 78,,,,Stripe,c23800013619353.2,0,Goober Rég,4331065802905,902,Low,web,0,FR TVA 20%,24,,,,,,,,,3366012111111,,,,,,,

data/spec/smarter_csv/additional_separator_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require 'spec_helper'
+fixture_path = 'spec/fixtures'
+describe 'handling of additional trailing column separators' do
+  let(:file) { "#{fixture_path}/additional_separator.csv" }
+  describe '' do
+    let(:data) { SmarterCSV.process(file) }
+    it 'reads all lines' do
+      data.size.should eq 5
+    end
+    it 'reads regular lines' do
+      item = data[0]
+      item[:col1].should == 'eins'
+      item[:col2].should == 'zwei'
+    end
+    it 'strips single trailing col_sep character' do
+      item = data[1]
+      item[:col1].should == 'uno'
+      item[:col2].should == 'dos'
+    end
+    it 'strips multiple trailing col_sep characters' do
+      item = data[2]
+      item[:col1].should == 'one'
+      item[:col2].should == 'two'
+    end
+    it 'strips multiple trailing col_sep chars' do
+      item = data[3]
+      item[:col1].should == 'ichi'
+      item[:col2].should == nil
+    end
+    it 'strips multiple trailing col_sep chars' do
+      item = data[4]
+      item[:col1].should == 'un'
+      item[:col2].should == nil
+    end
+  end
+end

data/spec/smarter_csv/binary_file2_spec.rb CHANGED Viewed

@@ -12,7 +12,7 @@ describe 'be_able_to' do
   it 'loads_binary_file_with_strings_as_keys' do
     options = {:col_sep => "\cA", :row_sep => "\cB", :comment_regexp => /^#/, :strings_as_keys => true}
     data = SmarterCSV.process("#{fixture_path}/binary.csv", options)
-    data.flatten.size.should == 8
+    data.size.should == 8
     data.each do |item|
       # all keys should be strings
       item.keys.each{|x| x.class.should be == String}

data/spec/smarter_csv/carriage_return_spec.rb CHANGED Viewed

@@ -3,7 +3,6 @@ require 'spec_helper'
 fixture_path = 'spec/fixtures'
 describe 'process files with line endings explicitly pre-specified' do
   it 'should process a file with \n for line endings and within data fields' do
     sep = "\n"
     options = {:row_sep => sep}
@@ -83,14 +82,14 @@ describe 'process files with line endings explicitly pre-specified' do
     data[1][:members].should == ["Jimmy Page", "Robert Plant", "John Bonham", "John Paul Jones"].join(text_sep)
     data[1][:albums].should == ["Led Zeppelin", "Led Zeppelin II", "Led Zeppelin III", "Led Zeppelin IV"].join(text_sep)
   end
 end
 describe 'process files with line endings in automatic mode' do
+  let(:options) { { row_sep: :auto } }
   it 'should process a file with \n for line endings and within data fields' do
     sep = "\n"
-    data = SmarterCSV.process("#{fixture_path}/carriage_returns_n.csv", {:row_sep => :auto})
+    data = SmarterCSV.process("#{fixture_path}/carriage_returns_n.csv", options)
     data.flatten.size.should == 8
     data[0][:name].should == "Anfield"
     data[0][:street].should == "Anfield Road"
@@ -112,7 +111,29 @@ describe 'process files with line endings in automatic mode' do
   it 'should process a file with \r for line endings and within data fields' do
     sep = "\r"
-    data = SmarterCSV.process("#{fixture_path}/carriage_returns_r.csv", {:row_sep => :auto})
+    data = SmarterCSV.process("#{fixture_path}/carriage_returns_r.csv", options)
+    data.flatten.size.should == 8
+    data[0][:name].should == "Anfield"
+    data[0][:street].should == "Anfield Road"
+    data[0][:city].should == "Liverpool"
+    data[1][:name].should == ["Highbury", "Highbury House"].join(sep)
+    data[2][:street].should == ["Sir Matt ", "Busby Way"].join(sep)
+    data[3][:city].should == ["Newcastle-upon-tyne ", "Tyne and Wear"].join(sep)
+    data[4][:name].should == ["White Hart Lane", "(The Lane)"].join(sep)
+    data[4][:street].should == ["Bill Nicholson Way ", "748 High Rd"].join(sep)
+    data[4][:city].should == ["Tottenham", "London"].join(sep)
+    data[5][:name].should == "Stamford Bridge"
+    data[5][:street].should == ["Fulham Road", "London"].join(sep)
+    data[5][:city].should be_nil
+    data[6][:name].should == ["Etihad Stadium", "Rowsley St", "Manchester"].join(sep)
+    data[7][:name].should == "Goodison"
+    data[7][:street].should == "Goodison Road"
+    data[7][:city].should == "Liverpool"
+  end
+  it 'also works when auto is given a string' do
+    sep = "\r"
+    data = SmarterCSV.process("#{fixture_path}/carriage_returns_r.csv", {row_sep: 'auto'})
     data.flatten.size.should == 8
     data[0][:name].should == "Anfield"
     data[0][:street].should == "Anfield Road"
@@ -134,7 +155,7 @@ describe 'process files with line endings in automatic mode' do
   it 'should process a file with \r\n for line endings and within data fields' do
     sep = "\r\n"
-    data = SmarterCSV.process("#{fixture_path}/carriage_returns_rn.csv", {:row_sep => :auto})
+    data = SmarterCSV.process("#{fixture_path}/carriage_returns_rn.csv", options)
     data.flatten.size.should == 8
     data[0][:name].should == "Anfield"
     data[0][:street].should == "Anfield Road"
@@ -157,7 +178,7 @@ describe 'process files with line endings in automatic mode' do
   it 'should process a file with more quoted text carriage return characters (\r) than line ending characters (\n)' do
     row_sep = "\n"
     text_sep = "\r"
-    data = SmarterCSV.process("#{fixture_path}/carriage_returns_quoted.csv", {:row_sep => :auto})
+    data = SmarterCSV.process("#{fixture_path}/carriage_returns_quoted.csv", options)
     data.flatten.size.should == 2
     data[0][:band].should == "New Order"
     data[0][:members].should == ["Bernard Sumner", "Peter Hook", "Stephen Morris", "Gillian Gilbert"].join(text_sep)
@@ -166,5 +187,4 @@ describe 'process files with line endings in automatic mode' do
     data[1][:members].should == ["Jimmy Page", "Robert Plant", "John Bonham", "John Paul Jones"].join(text_sep)
     data[1][:albums].should == ["Led Zeppelin", "Led Zeppelin II", "Led Zeppelin III", "Led Zeppelin IV"].join(text_sep)
   end
 end

data/spec/smarter_csv/column_separator_spec.rb CHANGED Viewed

@@ -48,7 +48,7 @@ describe 'can handle col_sep' do
   end
   describe 'auto-detection of separator' do
-    options = {:col_sep => 'auto'}
+    options = {col_sep: :auto}
     it 'auto-detects comma separator and loads data' do
       data = SmarterCSV.process("#{fixture_path}/separator_comma.csv", options)
@@ -85,5 +85,11 @@ describe 'can handle col_sep' do
         SmarterCSV.process("#{fixture_path}/binary.csv", options)
       }.to raise_exception SmarterCSV::NoColSepDetected
     end
+    it 'also works when auto is given a string' do
+      data = SmarterCSV.process("#{fixture_path}/separator_pipe.csv", col_sep: 'auto')
+      data.first.keys.size.should == 4
+      data.size.should eq 3
+    end
   end
 end

data/spec/smarter_csv/duplicate_headers_spec.rb ADDED Viewed

@@ -0,0 +1,76 @@
+require 'spec_helper'
+fixture_path = 'spec/fixtures'
+describe 'duplicate headers' do # fixture header row is: email,firstname,lastname,email,age
+  describe 'without special handling / default behavior' do # duplicate_header_suffix is not set here
+    it 'raises error on duplicate headers' do
+      expect {
+        SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", {})
+      }.to raise_exception(SmarterCSV::DuplicateHeaders)
+    end
+    it 'raises error on duplicate given headers' do
+      expect {
+        options = {:user_provided_headers => [:a,:b,:c,:d,:a]} # :a appears twice
+        SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+      }.to raise_exception(SmarterCSV::DuplicateHeaders)
+    end
+    it 'raises error on missing mapped headers' do
+      expect {
+        # the mapping is right, but the underlying csv file is bad
+        options = {:key_mapping => {:email => :a, :firstname => :b, :lastname => :c, :manager_email => :d, :age => :e} } # :manager_email is not a header in the fixture
+        SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+      }.to raise_exception(SmarterCSV::KeyMappingError)
+    end
+  end
+  describe 'with special handling' do # duplicate_header_suffix is set, so repeated headers get numbered
+    context 'with given suffix' do
+      let(:options) { {duplicate_header_suffix: '_'} }
+      it 'reads whole file' do
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.size).to eq 2
+      end
+      it 'generates the correct keys' do
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.first.keys).to eq [:email, :firstname, :lastname, :email_2, :age] # second email becomes email + suffix + 2
+      end
+      it 'enumerates when duplicate headers are given' do
+        options.merge!({:user_provided_headers => [:a,:b,:c,:a,:a]}) # merges into the hash returned by the let above
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.first.keys).to eq [:a, :b, :c, :a_2, :a_3]
+      end
+      it 'can remap duplicated headers' do
+        options.merge!({:key_mapping => {:email => :a, :firstname => :b, :lastname => :c, :email_2 => :d, :age => :e}}) # the mapping refers to the disambiguated key :email_2
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.first).to eq({a: 'tom@bla.com', b: 'Tom', c: 'Sawyer', d: 'mike@bla.com', e: 34})
+      end
+    end
+    context 'with empty suffix' do # an empty string still enables the numbering, just without a separator
+      let(:options) { {duplicate_header_suffix: ''} }
+      it 'reads whole file' do
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.size).to eq 2
+      end
+      it 'generates the correct keys' do
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.first.keys).to eq [:email, :firstname, :lastname, :email2, :age]
+      end
+      it 'enumerates when duplicate headers are given' do
+        options.merge!({:user_provided_headers => [:a,:b,:c,:a,:a]})
+        data = SmarterCSV.process("#{fixture_path}/duplicate_headers.csv", options)
+        expect(data.first.keys).to eq [:a, :b, :c, :a2, :a3]
+      end
+    end
+  end
+end