RubyGems - csv-utils - Versions diffs - 0.3.25 → 0.5.0 - Mend

csv-utils 0.3.25 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/.github/workflows/ci.yml +53 -0
data/.rubocop.yml +81 -0
data/ARCHITECTURE.md +154 -0
data/CLAUDE.md +63 -0
data/Gemfile +2 -1
data/Gemfile.lock +5 -0
data/README.md +238 -16
data/bin/csv-diff +3 -3
data/bin/csv-duplicate-finder +1 -1
data/bin/csv-grep +3 -3
data/bin/csv-readline +4 -5
data/bin/csv-splitter +1 -1
data/bin/csv-validator +38 -36
data/csv-utils.gemspec +6 -5
data/lib/csv-utils.rb +3 -0
data/lib/csv_utils/csv_compare.rb +77 -71
data/lib/csv_utils/csv_extender.rb +45 -41
data/lib/csv_utils/csv_iterator.rb +90 -75
data/lib/csv_utils/csv_options.rb +11 -11
data/lib/csv_utils/csv_report.rb +5 -2
data/lib/csv_utils/csv_row.rb +3 -1
data/lib/csv_utils/csv_row_matcher.rb +34 -0
data/lib/csv_utils/csv_sort.rb +110 -96
data/lib/csv_utils/csv_transformer.rb +95 -92
data/lib/csv_utils/csv_wrapper.rb +40 -36
metadata +13 -6
data/docs/ARCHITECTURE.md +0 -134

data/bin/csv-diff CHANGED Viewed

@@ -15,16 +15,16 @@ OptionParser.new do |opts|
     exit
   end
-  opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
+  opts.on('-u', '--unique HEADERS', 'Comma separated list of headers that generate a unique key per row, uses 1st column by default') do |v|
     options[:unique_headers] = v.split(',')
   end
-  opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
+  opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore during row comparison') do |v|
     options[:ignore_headers] = v.split(',')
   end
   opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
-    opts[:sort_batch_size] = v
+    options[:sort_batch_size] = v
   end
 end.parse!

data/bin/csv-duplicate-finder CHANGED Viewed

@@ -26,7 +26,7 @@ csv = CSVUtils::CSVIterator.new(ARGV[0])
 missing_headers = options[:ignore_columns] - csv.first.keys
 unless missing_headers.empty?
-  raise("unkown headers #{missing_headers.join(', ')} configured ingnore headers")
+  raise("unknown headers #{missing_headers.join(', ')} in configured ignore headers")
 end
 hashed_rows = {}

data/bin/csv-grep CHANGED Viewed

@@ -34,7 +34,7 @@ OptionParser.new do |opts|
     options[:limit] = v
   end
-  opts.on('-i', '--ignore-case', 'Ignore case') do |v|
+  opts.on('-i', '--ignore-case', 'Ignore case') do
     options[:search_regex_options] = Regexp::IGNORECASE
   end
 end.parse!
@@ -55,7 +55,7 @@ search_regex =
 headers =
   case options[:headers]
   when :first
-    [csv .headers.first]
+    [csv.headers.first]
   when :all
     csv.headers
   else
@@ -63,7 +63,7 @@ headers =
   end
 missing_headers = headers - csv.headers
-raise("unknown headers #{headers.join(', ')}") unless missing_headers.empty?
+raise("unknown headers #{missing_headers.join(', ')}") unless missing_headers.empty?
 matching_row_proc = proc do |row|
   result = false

data/bin/csv-readline CHANGED Viewed

@@ -114,16 +114,15 @@ OptionParser.new do |opts|
   end
 end.parse!
-file = File.open(ARGV[0], 'rb')
 lineno = ARGV[1].to_i
 number_of_lines = (ARGV[2] || 1).to_i
 raise "no lineno specified" unless lineno > 0
-headers = strip_byte_order_mark(file.readline.strip).split(',')
-data = headers.zip(parse_csv_row(file, lineno, number_of_lines))
-file.close
+data = File.open(ARGV[0], 'rb') do |file|
+  headers = strip_byte_order_mark(file.readline.strip).split(',')
+  headers.zip(parse_csv_row(file, lineno, number_of_lines))
+end
 cnt = 0
 data.each do |k, (v, status)|

data/bin/csv-splitter CHANGED Viewed

@@ -78,4 +78,4 @@ while (row = csv.shift)
   append_row_proc.call(row)
 end
-out.close
+out&.close

data/bin/csv-validator CHANGED Viewed

@@ -29,53 +29,55 @@ def strip_bom!(col)
   col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
 end
-csv = CSV.open(ARGV[0], 'rb')
 id_column_name = ARGV[1]
+csv = CSV.open(ARGV[0], 'rb')
+out = nil
-headers = csv.shift
-strip_bom!(headers[0])
+begin
+  headers = csv.shift
+  strip_bom!(headers[0])
-id_column_name ||= headers[0]
-unless headers.include?(id_column_name)
-  $stderr.puts("header #{id_column_name} not found in current set of headers")
-  exit 1
-end
+  id_column_name ||= headers[0]
+  unless headers.include?(id_column_name)
+    $stderr.puts("header #{id_column_name} not found in current set of headers")
+    exit 1
+  end
-id_column_num = headers.index(id_column_name)
+  id_column_num = headers.index(id_column_name)
-out = nil
-out_proc = Proc.new do |row|
-  out ||=
-    begin
-      out = CSV.open('utf8-correctsion.csv', 'wb')
-      out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
-      out
-    end
+  out_proc = proc do |row|
+    out ||=
+      begin
+        out = CSV.open('utf8-correction.csv', 'wb')
+        out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
+        out
+      end
-  out << row
-end
+    out << row
+  end
-csv_lineno = 1
+  csv_lineno = 1
-while (row = csv.shift)
-  csv_lineno += 1
+  while (row = csv.shift)
+    csv_lineno += 1
-  unless row.size == headers.size
-    $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
-  end
+    unless row.size == headers.size
+      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
+    end
-  row.each_with_index do |col, idx|
-    next if col.nil? || utf8?(col)
+    row.each_with_index do |col, idx|
+      next if col.nil? || utf8?(col)
-    $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
-    if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
-      puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
-      out_proc.call [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
-    else
-      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
+      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
+      if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
+        puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
+        out_proc.call [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
+      else
+        $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
+      end
     end
   end
+ensure
+  csv.close
+  out&.close
 end
-csv.close
-out.close if out

data/csv-utils.gemspec CHANGED Viewed

@@ -2,10 +2,10 @@
 Gem::Specification.new do |s|
   s.name        = 'csv-utils'
-  s.version     = '0.3.25'
+  s.version     = '0.5.0'
   s.licenses    = ['MIT']
-  s.summary     = 'CSV Utils'
-  s.description = 'Tools for debugging malformed CSV files'
+  s.summary     = 'Comprehensive CSV manipulation and debugging utilities for Ruby'
+  s.description = 'A Ruby library for CSV file processing featuring comparison, transformation, sorting, and validation. Includes CLI tools for debugging malformed CSVs, auto-detection of encodings and separators, and efficient handling of large files.'
   s.authors     = ['Doug Youch']
   s.email       = 'dougyouch@gmail.com'
   s.homepage    = 'https://github.com/dougyouch/csv-utils'
@@ -13,6 +13,7 @@ Gem::Specification.new do |s|
   s.bindir      = 'bin'
   s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
-  s.add_runtime_dependency 'csv'
-  s.add_runtime_dependency 'inheritance-helper'
+  s.add_dependency 'csv'
+  s.add_dependency 'inheritance-helper'
+  s.metadata['rubygems_mfa_required'] = 'true'
 end

data/lib/csv-utils.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'csv'
 # Collection of tools for working with CSV files.
@@ -8,6 +10,7 @@ module CSVUtils
   autoload :CSVOptions, 'csv_utils/csv_options'
   autoload :CSVReport, 'csv_utils/csv_report'
   autoload :CSVRow, 'csv_utils/csv_row'
+  autoload :CSVRowMatcher, 'csv_utils/csv_row_matcher'
   autoload :CSVSort, 'csv_utils/csv_sort'
   autoload :CSVTransformer, 'csv_utils/csv_transformer'
   autoload :CSVWrapper, 'csv_utils/csv_wrapper'

data/lib/csv_utils/csv_compare.rb CHANGED Viewed

@@ -1,87 +1,93 @@
+# frozen_string_literal: true
 # CSVUtils::CSVCompare purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
 # **requires both CSV files to be sorted on the same columns, CSVUtils::CSVSort can accomplish this
 # In order to receive updates, update_comparison_columns must configured or use inheritance and change the update_row? method
-class CSVUtils::CSVCompare
-  # primary_data_file is the source of truth
-  # compare_proc used to compare the id column(s)
-  # update_comparison_columns column(s) to compare for equality, ex: updated_at, timestamp, hash
-  #  caveat: update_comparison_columns need to be in both csv files
-  attr_reader :primary_data_file,
-              :update_comparison_columns,
-              :compare_proc
-  def initialize(primary_data_file, update_comparison_columns=nil, &block)
-    @primary_data_file = primary_data_file
-    @update_comparison_columns = update_comparison_columns
-    @compare_proc = block
-  end
-  def compare(secondary_data_file)
-    src = CSV.open(primary_data_file, 'rb')
-    src_headers = src.shift
-    strip_bom!(src_headers[0])
-    dest = CSV.open(secondary_data_file, 'rb')
-    dest_headers = dest.shift
-    strip_bom!(dest_headers[0])
-    read_next_src = true
-    read_next_dest = true
-    while(!src.eof? || !dest.eof?)
-      src_record = next_record_from_file(src_headers, src) if read_next_src
-      dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
-      if ! src_record
-        read_next_src = false
-        read_next_dest = true
-        yield :delete, dest_record
-      elsif ! dest_record
-        read_next_src = true
-        read_next_dest = false
-        yield :create, src_record
-      elsif compare_proc.call(src_record, dest_record) == 0
-        read_next_src = true
-        read_next_dest = true
-        yield(:update, src_record) if update_row?(src_record, dest_record)
-      elsif compare_proc.call(src_record, dest_record) > 0
-        read_next_src = false
-        read_next_dest = true
-        yield :delete, dest_record
-      else
-        read_next_src = true
-        read_next_dest = false
+module CSVUtils
+  class CSVCompare
+    # primary_data_file is the source of truth
+    # compare_proc used to compare the id column(s)
+    # update_comparison_columns column(s) to compare for equality, ex: updated_at, timestamp, hash
+    #  caveat: update_comparison_columns need to be in both csv files
+    attr_reader :primary_data_file,
+                :update_comparison_columns,
+                :compare_proc
+    def initialize(primary_data_file, update_comparison_columns = nil, &block)
+      @primary_data_file = primary_data_file
+      @update_comparison_columns = update_comparison_columns
+      @compare_proc = block
+    end
-        yield :create, src_record
+    # rubocop:disable Metrics/MethodLength
+    def compare(secondary_data_file)
+      src = CSV.open(primary_data_file, 'rb')
+      begin
+        src_headers = src.shift
+        strip_bom!(src_headers[0])
+        dest = CSV.open(secondary_data_file, 'rb')
+        begin
+          dest_headers = dest.shift
+          strip_bom!(dest_headers[0])
+          read_next_src = true
+          read_next_dest = true
+          while !src.eof? || !dest.eof?
+            src_record = next_record_from_file(src_headers, src) if read_next_src
+            dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
+            if !src_record
+              read_next_src = false
+              read_next_dest = true
+              yield :delete, dest_record
+            elsif !dest_record
+              read_next_src = true
+              read_next_dest = false
+              yield :create, src_record
+            elsif compare_proc.call(src_record, dest_record).zero?
+              read_next_src = true
+              read_next_dest = true
+              yield(:update, src_record) if update_row?(src_record, dest_record)
+            elsif compare_proc.call(src_record, dest_record).positive?
+              read_next_src = false
+              read_next_dest = true
+              yield :delete, dest_record
+            else
+              read_next_src = true
+              read_next_dest = false
+              yield :create, src_record
+            end
+          end
+        ensure
+          dest.close
+        end
+      ensure
+        src.close
       end
     end
+    # rubocop:enable Metrics/MethodLength
-    src.close
-    dest.close
-  end
+    private
-  private
+    def next_record_from_file(headers, file)
+      return nil if file.eof?
-  def next_record_from_file(headers, file)
-    return nil if file.eof?
+      headers.zip(file.shift).to_h
+    end
-    Hash[headers.zip(file.shift)]
-  end
+    def update_row?(src_record, dest_record)
+      return false unless update_comparison_columns
-  def update_row?(src_record, dest_record)
-    return false unless update_comparison_columns
+      update_comparison_columns.each do |column_name|
+        return true unless src_record[column_name] == dest_record[column_name]
+      end
-    update_comparison_columns.each do |column_name|
-      return true unless src_record[column_name] == dest_record[column_name]
+      false
     end
-    false
-  end
-  def strip_bom!(col)
-    col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
+    def strip_bom!(col)
+      col.sub!((+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT'), '')
+    end
   end
 end

data/lib/csv_utils/csv_extender.rb CHANGED Viewed

@@ -1,63 +1,67 @@
+# frozen_string_literal: true
 # Utility class for appending data to a csv file.
-class CSVUtils::CSVExtender
-  def initialize(src_csv, dest_csv, csv_options = {})
-    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
-    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
-  end
+module CSVUtils
+  class CSVExtender
+    def initialize(src_csv, dest_csv, csv_options = {})
+      @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
+      @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
+    end
-  def append(additional_headers)
-    process(additional_headers) do |current_headers|
-      while (row = @src_csv.shift)
-        additional_columns = yield row, current_headers
-        @dest_csv << (row + additional_columns)
+    def append(additional_headers)
+      process(additional_headers) do |current_headers|
+        while (row = @src_csv.shift)
+          additional_columns = yield row, current_headers
+          @dest_csv << (row + additional_columns)
+        end
       end
     end
-  end
-  def append_in_batches(additional_headers, batch_size = 1_000)
-    process(additional_headers) do |current_headers|
-      batch = []
+    def append_in_batches(additional_headers, batch_size = 1_000)
+      process(additional_headers) do |current_headers|
+        batch = []
+        process_batch_proc = proc do
+          additional_rows = yield batch, current_headers
-      process_batch_proc = Proc.new do
-        additional_rows = yield batch, current_headers
+          batch.each_with_index do |row, idx|
+            @dest_csv << (row + additional_rows[idx])
+          end
-        batch.each_with_index do |row, idx|
-          @dest_csv << (row + additional_rows[idx])
+          batch = []
         end
-        batch = []
-      end
+        while (row = @src_csv.shift)
+          batch << row
-      while (row = @src_csv.shift)
-        batch << row
+          process_batch_proc.call if batch.size >= batch_size
+        end
-        process_batch_proc.call if batch.size >= batch_size
+        process_batch_proc.call if batch.size.positive?
       end
-      process_batch_proc.call if batch.size > 0
     end
-  end
-  private
+    private
-  def process(additional_headers)
-    current_headers = append_headers(additional_headers)
+    def process(additional_headers)
+      current_headers = append_headers(additional_headers)
-    yield current_headers
+      yield current_headers
-    close
-  end
+      close
+    end
-  def close
-    @src_csv.close
-    @dest_csv.close
-  end
+    def close
+      @src_csv.close
+      @dest_csv.close
+    end
-  def append_headers(additional_headers)
-    return nil unless additional_headers
+    def append_headers(additional_headers)
+      return nil unless additional_headers
-    current_headers = @src_csv.shift
-    @dest_csv << (current_headers + additional_headers)
-    current_headers
+      current_headers = @src_csv.shift
+      @dest_csv << (current_headers + additional_headers)
+      current_headers
+    end
   end
 end