RubyGems - smarter_csv - Versions diffs - 1.9.2.pre01 → 1.9.3 - Mend

smarter_csv 1.9.2.pre01 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

checksums.yaml +4 -4
data/CHANGELOG.md +8 -3
data/README.md +1 -1
data/lib/smarter_csv/auto_detection.rb +73 -0
data/lib/smarter_csv/file_io.rb +50 -0
data/lib/smarter_csv/headers.rb +160 -0
data/lib/smarter_csv/parse.rb +90 -0
data/lib/smarter_csv/smarter_csv.rb +27 -340
data/lib/smarter_csv/variables.rb +26 -0
data/lib/smarter_csv/version.rb +1 -1
data/lib/smarter_csv.rb +9 -5
metadata +9 -5
data/lib/core_ext/hash.rb +0 -9

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 75d9d441d771c2fbe0861e0bc5f84dbf05d010b4844a867fc4679df002822d07
-  data.tar.gz: 0d5d37d4f2654fd354a2adac23019b2955540c1356c57f72052c01220598ffa2
+  metadata.gz: 5f35e10ff8bc0e79ff1ed9bea8e413f746f51128a6f6a9622d246873fd588366
+  data.tar.gz: 5cc30cf6f4422dd16f3019915bc5305a92aaaa4b99665e4c4c525d3bbf489cfd
 SHA512:
-  metadata.gz: b24e2b09ea919994da347eb52b781868a19e0f28dc367bfeb43b8a254619ab8dd882d3035f0546683c2ebc893fd600ce05a3abb800e4b124c7369d314607ee3f
-  data.tar.gz: 3a1115ac4937c2fedf469d1f45e3aa1cf7ed03f1f55d66f6cb310c767a3b5e8cb4966a19ba968f065c5d1de2d7074f479b5eeb0686dbab8012e5e6b8ed0f2628
+  metadata.gz: 057472a73ae0be95318b16428b276ecffba384a68479af715c5ec3ca7601405ca73928b0fbf245c9b3f46fd33b82a8c6d9c9e6330ddb0305b83ae23f58173df0
+  data.tar.gz: 319b12a53875c1963eed6d27aa67850135d33a5b3a9f70607e6d812906733b711ade6c3ee6e789d78c2e159004a879e59e700145224134745b16d279039ac38a

data/CHANGELOG.md CHANGED Viewed

@@ -1,9 +1,14 @@
 # SmarterCSV 1.x Change Log
-## 1.9.2.pre01 (2023-11-11)
-  * fixed bug with '\\' at end of line (issue #252)
-  * fixed require statements
+## 1.9.3 (2023-12-16)
+  * raise SmarterCSV::IncorrectOption when `user_provided_headers` are empty
+  * code refactor / no functional changes
+  * added test cases
+## 1.9.2 (2023-11-12)
+  * fixed bug with '\\' at end of line (issue #252, thanks to averycrespi-moz)
+  * fixed require statements (issue #249, thanks to PikachuEXE, courtsimas)
 ## 1.9.1 (2023-10-30) (YANKED)
   * yanked

data/README.md CHANGED Viewed

@@ -300,7 +300,7 @@ And header and data validations will also be supported in 2.x
      | Option                      | Default  |  Explanation                                                                         |
      ---------------------------------------------------------------------------------------------------------------------------------
      | :key_mapping                |   nil    | a hash which maps headers from the CSV file to keys in the result hash               |
-     | :silence_missing_key        |   false  | ignore missing keys in `key_mapping`                                   |
+     | :silence_missing_keys       |   false  | ignore missing keys in `key_mapping`                                   |
      |                             |          | if set to true: makes all mapped keys optional                         |
      |                             |          | if given an array, makes only the keys listed in it optional                         |
      | :required_keys              |   nil    | An array. Specify the required names AFTER header transformation.                  |

data/lib/smarter_csv/auto_detection.rb ADDED Viewed

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+module SmarterCSV
+  class << self
+    protected
+    # If file has headers, then guesses column separator from headers.
+    # Otherwise guesses column separator from contents.
+    # Raises exception if none is found.
+    def guess_column_separator(filehandle, options)
+      skip_lines(filehandle, options)
+      delimiters = [',', "\t", ';', ':', '|']
+      line = nil
+      has_header = options[:headers_in_file]
+      candidates = Hash.new(0)
+      count = has_header ? 1 : 5
+      count.times do
+        line = readline_with_counts(filehandle, options)
+        delimiters.each do |d|
+          candidates[d] += line.scan(d).count
+        end
+      rescue EOFError # short files
+        break
+      end
+      rewind(filehandle)
+      if candidates.values.max == 0
+        # if the header only contains word characters (i.e. a single column), fall back to ','
+        return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
+        raise SmarterCSV::NoColSepDetected
+      end
+      candidates.key(candidates.values.max)
+    end
+    # limitation: this currently reads the whole file in before making a decision
+    def guess_line_ending(filehandle, options)
+      counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
+      quoted_char = false
+      # count how many of the pre-defined line-endings we find
+      # ignoring those contained within quote characters
+      last_char = nil
+      lines = 0
+      filehandle.each_char do |c|
+        quoted_char = !quoted_char if c == options[:quote_char]
+        next if quoted_char
+        if last_char == "\r"
+          if c == "\n"
+            counts["\r\n"] += 1
+          else
+            counts["\r"] += 1 # \r are counted after they appeared
+          end
+        elsif c == "\n"
+          counts["\n"] += 1
+        end
+        last_char = c
+        lines += 1
+        break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
+      end
+      rewind(filehandle)
+      counts["\r"] += 1 if last_char == "\r"
+      # find the most frequent key/value pair:
+      most_frequent_key, _count = counts.max_by{|_, v| v}
+      most_frequent_key
+    end
+  end
+end

data/lib/smarter_csv/file_io.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module SmarterCSV
+  class << self
+    protected
+    def readline_with_counts(filehandle, options)
+      line = filehandle.readline(options[:row_sep])
+      @file_line_count += 1
+      @csv_line_count += 1
+      line = remove_bom(line) if @csv_line_count == 1
+      line
+    end
+    def skip_lines(filehandle, options)
+      options[:skip_lines].to_i.times do
+        readline_with_counts(filehandle, options)
+      end
+    end
+    def rewind(filehandle)
+      @file_line_count = 0
+      @csv_line_count = 0
+      filehandle.rewind
+    end
+    private
+    UTF_32_BOM = %w[0 0 fe ff].freeze
+    UTF_32LE_BOM = %w[ff fe 0 0].freeze
+    UTF_8_BOM = %w[ef bb bf].freeze
+    UTF_16_BOM = %w[fe ff].freeze
+    UTF_16LE_BOM = %w[ff fe].freeze
+    def remove_bom(str)
+      str_as_hex = str.bytes.map{|x| x.to_s(16)}
+      # if string does not start with one of the bytes, there is no BOM
+      return str unless %w[ef fe ff 0].include?(str_as_hex[0])
+      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
+      return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
+      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
+      # :nocov:
+      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
+      str
+      # :nocov:
+    end
+  end
+end

data/lib/smarter_csv/headers.rb ADDED Viewed

@@ -0,0 +1,160 @@
+# frozen_string_literal: true
+module SmarterCSV
+  class << self
+    def process_headers(filehandle, options)
+      @raw_header = nil # header as it appears in the file
+      @headers = nil # the processed headers
+      header_array = []
+      file_header_size = nil
+      # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
+      if options[:headers_in_file] # extract the header line
+        # process the header line in the CSV file..
+        # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
+        header_line = @raw_header = readline_with_counts(filehandle, options)
+        header_line = preprocess_header_line(header_line, options)
+        file_header_array, file_header_size = parse_and_modify_headers(header_line, options)
+      else
+        unless options[:user_provided_headers]
+          raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
+        end
+      end
+      if options[:user_provided_headers]
+        unless options[:user_provided_headers].is_a?(Array) && !options[:user_provided_headers].empty?
+          raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
+        end
+        # use user-provided headers
+        user_header_array = options[:user_provided_headers]
+        # user_provided_headers: their count should match the headers_in_file if any
+        if defined?(file_header_size) && !file_header_size.nil?
+          if user_header_array.size != file_header_size
+            raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers !=  CSV-file has #{file_header_size} headers"
+          else
+            # we could print out the mapping of file_header_array to header_array here
+          end
+        end
+        header_array = user_header_array
+      else
+        header_array = file_header_array
+      end
+      # detect duplicate headers and disambiguate
+      header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]
+      # symbolize headers
+      header_array.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
+      # wouldn't make sense to re-map user provided headers
+      header_array = remap_headers(header_array, options) if options[:key_mapping] && !options[:user_provided_headers]
+      validate_and_deprecate_headers(header_array, options)
+      [header_array, header_array.size]
+    end
+    private
+    def preprocess_header_line(header_line, options)
+      header_line = enforce_utf8_encoding(header_line, options)
+      header_line = remove_comments_from_header(header_line, options)
+      header_line = header_line.chomp(options[:row_sep])
+      header_line.gsub!(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
+      header_line
+    end
+    def parse_and_modify_headers(header_line, options)
+      file_header_array, file_header_size = parse(header_line, options)
+      file_header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
+      file_header_array.map!{|x| x.strip} if options[:strip_whitespace]
+      unless options[:keep_original_headers]
+        file_header_array.map!{|x| x.gsub(/\s+|-+/, '_')}
+        file_header_array.map!{|x| x.downcase} if options[:downcase_header]
+      end
+      [file_header_array, file_header_size]
+    end
+    def disambiguate_headers(headers, options)
+      counts = Hash.new(0)
+      headers.map do |header|
+        counts[header] += 1
+        counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
+      end
+    end
+    # do some key mapping on the keys in the file header
+    # if you want to completely delete a key, then map it to nil or to ''
+    def remap_headers(headers, options)
+      key_mapping = options[:key_mapping]
+      if key_mapping.empty? || !key_mapping.is_a?(Hash) || key_mapping.keys.empty?
+        raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
+      end
+      key_mapping = options[:key_mapping]
+      # if silence_missing_keys are not set, raise error if missing header
+      missing_keys = key_mapping.keys - headers
+      # if the user passes a list of specific mapped keys that are optional
+      missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
+      unless missing_keys.empty? || options[:silence_missing_keys] == true
+        raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
+      end
+      headers.map! do |header|
+        if key_mapping.has_key?(header)
+          key_mapping[header].nil? ? nil : key_mapping[header]
+        elsif options[:remove_unmapped_keys]
+          nil
+        else
+          header
+        end
+      end
+      headers
+    end
+    # header_validations
+    def validate_and_deprecate_headers(headers, options)
+      duplicate_headers = []
+      headers.compact.each do |k|
+        duplicate_headers << k if headers.select{|x| x == k}.size > 1
+      end
+      unless options[:user_provided_headers] || duplicate_headers.empty?
+        raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
+      end
+      # deprecate required_headers
+      unless options[:required_headers].nil?
+        puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
+        if options[:required_keys].nil?
+          options[:required_keys] = options[:required_headers]
+          options[:required_headers] = nil
+        end
+      end
+      if options[:required_keys] && options[:required_keys].is_a?(Array)
+        missing_keys = []
+        options[:required_keys].each do |k|
+          missing_keys << k unless headers.include?(k)
+        end
+        raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
+      end
+    end
+    def enforce_utf8_encoding(header, options)
+      return header unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
+      header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
+    end
+    def remove_comments_from_header(header, options)
+      return header unless options[:comment_regexp]
+      header.sub(options[:comment_regexp], '')
+    end
+  end
+end

data/lib/smarter_csv/parse.rb ADDED Viewed

@@ -0,0 +1,90 @@
+# frozen_string_literal: true
+module SmarterCSV
+  class << self
+    protected
+    ###
+    ### Thin wrapper around C-extension
+    ###
+    def parse(line, options, header_size = nil)
+      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
+      if options[:acceleration] && has_acceleration?
+        # :nocov:
+        has_quotes = line =~ /#{options[:quote_char]}/
+        elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
+        elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
+        [elements, elements.size]
+        # :nocov:
+      else
+        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
+        parse_csv_line_ruby(line, options, header_size)
+      end
+    end
+    # ------------------------------------------------------------------
+    # Ruby equivalent of the C-extension for parse_line
+    #
+    # parses a single line: either a CSV header or a body line
+    # - quoting rules compared to RFC-4180 are somewhat relaxed
+    # - we are not assuming that quotes inside a fields need to be doubled
+    # - we are not assuming that all fields need to be quoted (0 is even)
+    # - works with multi-char col_sep
+    # - if header_size is given, only up to header_size fields are parsed
+    #
+    # We use header_size for parsing the body lines to make sure we always match the number of headers
+    # in case there are trailing col_sep characters in line
+    #
+    # Our convention is that empty fields are returned as empty strings, not as nil.
+    #
+    #
+    # the purpose of the header_size parameter is to handle a corner case where
+    # CSV lines contain more fields than the header.
+    # In which case the remaining fields in the line are ignored
+    #
+    def parse_csv_line_ruby(line, options, header_size = nil)
+      return [] if line.nil?
+      line_size = line.size
+      col_sep = options[:col_sep]
+      col_sep_size = col_sep.size
+      quote = options[:quote_char]
+      quote_count = 0
+      elements = []
+      start = 0
+      i = 0
+      previous_char = ''
+      while i < line_size
+        if line[i...i+col_sep_size] == col_sep && quote_count.even?
+          break if !header_size.nil? && elements.size >= header_size
+          elements << cleanup_quotes(line[start...i], quote)
+          previous_char = line[i]
+          i += col_sep.size
+          start = i
+        else
+          quote_count += 1 if line[i] == quote && previous_char != '\\'
+          previous_char = line[i]
+          i += 1
+        end
+      end
+      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
+      [elements, elements.size]
+    end
+    def cleanup_quotes(field, quote)
+      return field if field.nil?
+      # return if field !~ /#{quote}/ # this check can probably be eliminated
+      if field.start_with?(quote) && field.end_with?(quote)
+        field.delete_prefix!(quote)
+        field.delete_suffix!(quote)
+      end
+      field.gsub!("#{quote}#{quote}", quote)
+      field
+    end
+  end
+end

data/lib/smarter_csv/smarter_csv.rb CHANGED Viewed

@@ -14,10 +14,8 @@ module SmarterCSV
   def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
     options = process_options(given_options)
-    headerA = []
-    result = []
-    @file_line_count = 0
-    @csv_line_count = 0
+    initialize_variables
     has_rails = !!defined?(Rails)
     begin
       fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
@@ -33,13 +31,14 @@ module SmarterCSV
       skip_lines(fh, options)
-      headerA, header_size = process_headers(fh, options)
+      @headers, header_size = process_headers(fh, options)
+      @headerA = @headers # @headerA is deprecated, use @headers
       # in case we use chunking.. we'll need to set it up..
-      if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
+      if options[:chunk_size].to_i > 0
         use_chunks = true
         chunk_size = options[:chunk_size].to_i
-        chunk_count = 0
+        @chunk_count = 0
         chunk = []
       else
         use_chunks = false
@@ -78,7 +77,7 @@ module SmarterCSV
         # if all values are blank, then ignore this line
         next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
-        hash = Hash.zip(headerA, dataA) # from Facets of Ruby library
+        hash = @headers.zip(dataA).to_h
         # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
         hash.delete(nil)
@@ -95,7 +94,7 @@ module SmarterCSV
         if options[:convert_values_to_numeric]
           hash.each do |k, v|
             # deal with the :only / :except options to :convert_values_to_numeric
-            next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
+            next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
             # convert if it's a numeric value:
             case v
@@ -128,9 +127,9 @@ module SmarterCSV
             if block_given?
               yield chunk # do something with the hashes in the chunk in the block
             else
-              result << chunk # not sure yet, why anybody would want to do this without a block
+              @result << chunk # not sure yet, why anybody would want to do this without a block
             end
-            chunk_count += 1
+            @chunk_count += 1
             chunk = [] # initialize for next chunk of data
           else
@@ -144,7 +143,7 @@ module SmarterCSV
           if block_given?
             yield [hash] # do something with the hash in the block (better to use chunking here)
           else
-            result << hash
+            @result << hash
           end
         end
       end
@@ -158,34 +157,23 @@ module SmarterCSV
         if block_given?
           yield chunk # do something with the hashes in the chunk in the block
         else
-          result << chunk # not sure yet, why anybody would want to do this without a block
+          @result << chunk # not sure yet, why anybody would want to do this without a block
         end
-        chunk_count += 1
+        @chunk_count += 1
         # chunk = [] # initialize for next chunk of data
       end
     ensure
       fh.close if fh.respond_to?(:close)
     end
     if block_given?
-      chunk_count # when we do processing through a block we only care how many chunks we processed
+      @chunk_count # when we do processing through a block we only care how many chunks we processed
     else
-      result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
+      @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
     end
   end
   class << self
-    def has_acceleration?
-      @has_acceleration ||= !!defined?(parse_csv_line_c)
-    end
-    def raw_header
-      @raw_header
-    end
-    def headers
-      @headers
-    end
     # * the `scan` method iterates through the string and finds all occurrences of the pattern
     # * The regular expression:
     #   - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
@@ -198,111 +186,22 @@ module SmarterCSV
       line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
     end
-    protected
-    def readline_with_counts(filehandle, options)
-      line = filehandle.readline(options[:row_sep])
-      @file_line_count += 1
-      @csv_line_count += 1
-      line = remove_bom(line) if @csv_line_count == 1
-      line
-    end
-    def skip_lines(filehandle, options)
-      return unless options[:skip_lines].to_i > 0
-      options[:skip_lines].to_i.times do
-        readline_with_counts(filehandle, options)
-      end
-    end
-    def rewind(filehandle)
-      @file_line_count = 0
-      @csv_line_count = 0
-      filehandle.rewind
+    def has_acceleration?
+      @has_acceleration ||= !!defined?(parse_csv_line_c)
     end
-    ###
-    ### Thin wrapper around C-extension
-    ###
-    def parse(line, options, header_size = nil)
-      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
-      if options[:acceleration] && has_acceleration?
-        # :nocov:
-        has_quotes = line =~ /#{options[:quote_char]}/
-        elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
-        elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
-        [elements, elements.size]
-        # :nocov:
-      else
-        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
-        parse_csv_line_ruby(line, options, header_size)
-      end
-    end
+    protected
-    # ------------------------------------------------------------------
-    # Ruby equivalent of the C-extension for parse_line
-    #
-    # parses a single line: either a CSV header and body line
-    # - quoting rules compared to RFC-4180 are somewhat relaxed
-    # - we are not assuming that quotes inside a fields need to be doubled
-    # - we are not assuming that all fields need to be quoted (0 is even)
-    # - works with multi-char col_sep
-    # - if header_size is given, only up to header_size fields are parsed
-    #
-    # We use header_size for parsing the body lines to make sure we always match the number of headers
-    # in case there are trailing col_sep characters in line
-    #
-    # Our convention is that empty fields are returned as empty strings, not as nil.
-    #
-    #
-    # the purpose of the max_size parameter is to handle a corner case where
-    # CSV lines contain more fields than the header.
-    # In which case the remaining fields in the line are ignored
-    #
-    def parse_csv_line_ruby(line, options, header_size = nil)
-      return [] if line.nil?
-      line_size = line.size
-      col_sep = options[:col_sep]
-      col_sep_size = col_sep.size
-      quote = options[:quote_char]
-      quote_count = 0
-      elements = []
-      start = 0
-      i = 0
-      previous_char = ''
-      while i < line_size
-        if line[i...i+col_sep_size] == col_sep && quote_count.even?
-          break if !header_size.nil? && elements.size >= header_size
-          elements << cleanup_quotes(line[start...i], quote)
-          previous_char = line[i]
-          i += col_sep.size
-          start = i
-        else
-          quote_count += 1 if line[i] == quote && previous_char != '\\'
-          previous_char = line[i]
-          i += 1
+    # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
+    def limit_execution_for_only_or_except(options, option_name, key)
+      if options[option_name].is_a?(Hash)
+        if options[option_name].has_key?(:except)
+          return true if Array(options[option_name][:except]).include?(key)
+        elsif options[option_name].has_key?(:only)
+          return true unless Array(options[option_name][:only]).include?(key)
         end
       end
-      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
-      [elements, elements.size]
-    end
-    def cleanup_quotes(field, quote)
-      return field if field.nil?
-      # return if field !~ /#{quote}/ # this check can probably eliminated
-      if field.start_with?(quote) && field.end_with?(quote)
-        field.delete_prefix!(quote)
-        field.delete_suffix!(quote)
-      end
-      field.gsub!("#{quote}#{quote}", quote)
-      field
+      false
     end
     # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
@@ -340,217 +239,5 @@ module SmarterCSV
         false
       end
     end
-    # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
-    def only_or_except_limit_execution(options, option_name, key)
-      if options[option_name].is_a?(Hash)
-        if options[option_name].has_key?(:except)
-          return true if Array(options[option_name][:except]).include?(key)
-        elsif options[option_name].has_key?(:only)
-          return true unless Array(options[option_name][:only]).include?(key)
-        end
-      end
-      false
-    end
-    # If file has headers, then guesses column separator from headers.
-    # Otherwise guesses column separator from contents.
-    # Raises exception if none is found.
-    def guess_column_separator(filehandle, options)
-      skip_lines(filehandle, options)
-      delimiters = [',', "\t", ';', ':', '|']
-      line = nil
-      has_header = options[:headers_in_file]
-      candidates = Hash.new(0)
-      count = has_header ? 1 : 5
-      count.times do
-        line = readline_with_counts(filehandle, options)
-        delimiters.each do |d|
-          candidates[d] += line.scan(d).count
-        end
-      rescue EOFError # short files
-        break
-      end
-      rewind(filehandle)
-      if candidates.values.max == 0
-        # if the header only contains
-        return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
-        raise SmarterCSV::NoColSepDetected
-      end
-      candidates.key(candidates.values.max)
-    end
-    # limitation: this currently reads the whole file in before making a decision
-    def guess_line_ending(filehandle, options)
-      counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
-      quoted_char = false
-      # count how many of the pre-defined line-endings we find
-      # ignoring those contained within quote characters
-      last_char = nil
-      lines = 0
-      filehandle.each_char do |c|
-        quoted_char = !quoted_char if c == options[:quote_char]
-        next if quoted_char
-        if last_char == "\r"
-          if c == "\n"
-            counts["\r\n"] += 1
-          else
-            counts["\r"] += 1 # \r are counted after they appeared
-          end
-        elsif c == "\n"
-          counts["\n"] += 1
-        end
-        last_char = c
-        lines += 1
-        break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
-      end
-      rewind(filehandle)
-      counts["\r"] += 1 if last_char == "\r"
-      # find the most frequent key/value pair:
-      most_frequent_key, _count = counts.max_by{|_, v| v}
-      most_frequent_key
-    end
-    def process_headers(filehandle, options)
-      @raw_header = nil
-      @headers = nil
-      if options[:headers_in_file] # extract the header line
-        # process the header line in the CSV file..
-        # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
-        header = readline_with_counts(filehandle, options)
-        @raw_header = header
-        header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
-        header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
-        header = header.chomp(options[:row_sep])
-        header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
-        file_headerA, file_header_size = parse(header, options)
-        file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
-        file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
-        unless options[:keep_original_headers]
-          file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
-          file_headerA.map!{|x| x.downcase} if options[:downcase_header]
-        end
-      else
-        raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
-      end
-      if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
-        # use user-provided headers
-        headerA = options[:user_provided_headers]
-        if defined?(file_header_size) && !file_header_size.nil?
-          if headerA.size != file_header_size
-            raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers !=  CSV-file has #{file_header_size} headers"
-          else
-            # we could print out the mapping of file_headerA to headerA here
-          end
-        end
-      else
-        headerA = file_headerA
-      end
-      # detect duplicate headers and disambiguate
-      headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
-      header_size = headerA.size # used for splitting lines
-      headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
-      unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
-        key_mappingH = options[:key_mapping]
-        # do some key mapping on the keys in the file header
-        #   if you want to completely delete a key, then map it to nil or to ''
-        if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
-          # if silence_missing_keys are not set, raise error if missing header
-          missing_keys = key_mappingH.keys - headerA
-          # if the user passes a list of speciffic mapped keys that are optional
-          missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
-          unless missing_keys.empty? || options[:silence_missing_keys] == true
-            raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
-          end
-          headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
-        end
-      end
-      # header_validations
-      duplicate_headers = []
-      headerA.compact.each do |k|
-        duplicate_headers << k if headerA.select{|x| x == k}.size > 1
-      end
-      unless options[:user_provided_headers] || duplicate_headers.empty?
-        raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
-      end
-      # deprecate required_headers
-      unless options[:required_headers].nil?
-        puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
-        if options[:required_keys].nil?
-          options[:required_keys] = options[:required_headers]
-          options[:required_headers] = nil
-        end
-      end
-      if options[:required_keys] && options[:required_keys].is_a?(Array)
-        missing_keys = []
-        options[:required_keys].each do |k|
-          missing_keys << k unless headerA.include?(k)
-        end
-        raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
-      end
-      @headers = headerA
-      [headerA, header_size]
-    end
-    def process_duplicate_headers(headers, options)
-      counts = Hash.new(0)
-      result = []
-      headers.each do |key|
-        counts[key] += 1
-        if counts[key] == 1
-          result << key
-        else
-          result << [key, options[:duplicate_header_suffix], counts[key]].join
-        end
-      end
-      result
-    end
-    private
-    UTF_32_BOM = %w[0 0 fe ff].freeze
-    UTF_32LE_BOM = %w[ff fe 0 0].freeze
-    UTF_8_BOM = %w[ef bb bf].freeze
-    UTF_16_BOM = %w[fe ff].freeze
-    UTF_16LE_BOM = %w[ff fe].freeze
-    def remove_bom(str)
-      str_as_hex = str.bytes.map{|x| x.to_s(16)}
-      # if string does not start with one of the bytes, there is no BOM
-      return str unless %w[ef fe ff 0].include?(str_as_hex[0])
-      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
-      return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
-      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
-      # :nocov:
-      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
-      str
-      # :nocov:
-    end
   end
 end

data/lib/smarter_csv/variables.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# frozen_string_literal: true
+module SmarterCSV
+  class << self
+    attr_reader :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
+    def initialize_variables
+      @csv_line_count = 0
+      @chunk_count = 0
+      @errors = {}
+      @file_line_count = 0
+      @headerA = []
+      @headers = nil
+      @raw_header = nil # header as it appears in the file
+      @result = []
+      @warnings = {}
+    end
+    # :nocov:
+    def headerA
+      warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
+      @headerA
+    end
+    # :nocov:
+  end
+end

data/lib/smarter_csv/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module SmarterCSV
-  VERSION = "1.9.2.pre01" # this is a pretty odd situation
+  VERSION = "1.9.3"
 end

data/lib/smarter_csv.rb CHANGED Viewed

@@ -1,9 +1,12 @@
 # frozen_string_literal: true
-require "core_ext/hash"
 require "smarter_csv/version"
+require "smarter_csv/file_io"
 require "smarter_csv/options_processing"
+require "smarter_csv/auto_detection"
+require "smarter_csv/variables"
+require "smarter_csv/headers"
+require "smarter_csv/parse"
 case RUBY_ENGINE
 when 'ruby'
@@ -11,10 +14,12 @@ when 'ruby'
     if `uname -s`.chomp == 'Darwin'
       require 'smarter_csv/smarter_csv.bundle'
     else
+      # :nocov:
       require_relative "smarter_csv/smarter_csv"
+      # :nocov:
     end
-  rescue Exception
-  #  require_relative 'smarter_csv/smarter_csv'
+  rescue Exception # rubocop:disable Lint/RescueException
+    #  require_relative 'smarter_csv/smarter_csv'
   end
 # :nocov:
 # when 'truffleruby'
@@ -36,4 +41,3 @@ else
 end
 # :nocov:
 require "smarter_csv/smarter_csv"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: smarter_csv
 version: !ruby/object:Gem::Version
-  version: 1.9.2.pre01
+  version: 1.9.3
 platform: ruby
 authors:
 - Tilo Sloboda
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-11-12 00:00:00.000000000 Z
+date: 2023-12-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: awesome_print
@@ -115,10 +115,14 @@ files:
 - TO_DO_v2.md
 - ext/smarter_csv/extconf.rb
 - ext/smarter_csv/smarter_csv.c
-- lib/core_ext/hash.rb
 - lib/smarter_csv.rb
+- lib/smarter_csv/auto_detection.rb
+- lib/smarter_csv/file_io.rb
+- lib/smarter_csv/headers.rb
 - lib/smarter_csv/options_processing.rb
+- lib/smarter_csv/parse.rb
 - lib/smarter_csv/smarter_csv.rb
+- lib/smarter_csv/variables.rb
 - lib/smarter_csv/version.rb
 - smarter_csv.gemspec
 homepage: https://github.com/tilo/smarter_csv
@@ -140,9 +144,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: 2.5.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">"
+  - - ">="
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
 rubygems_version: 3.2.3
 signing_key:

data/lib/core_ext/hash.rb DELETED Viewed

@@ -1,9 +0,0 @@
-# frozen_string_literal: true
-# the following extension for class Hash is needed (from Facets of Ruby library):
-class Hash
-  def self.zip(keys, values) # from Facets of Ruby library
-    keys.zip(values).to_h
-  end
-end