RubyGems - csv-utils - Versions diffs - 0.3.25 → 0.5.0 - Mend

csv-utils 0.3.25 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +4 -4
data/.github/workflows/ci.yml +53 -0
data/.rubocop.yml +81 -0
data/ARCHITECTURE.md +154 -0
data/CLAUDE.md +63 -0
data/Gemfile +2 -1
data/Gemfile.lock +5 -0
data/README.md +238 -16
data/bin/csv-diff +3 -3
data/bin/csv-duplicate-finder +1 -1
data/bin/csv-grep +3 -3
data/bin/csv-readline +4 -5
data/bin/csv-splitter +1 -1
data/bin/csv-validator +38 -36
data/csv-utils.gemspec +6 -5
data/lib/csv-utils.rb +3 -0
data/lib/csv_utils/csv_compare.rb +77 -71
data/lib/csv_utils/csv_extender.rb +45 -41
data/lib/csv_utils/csv_iterator.rb +90 -75
data/lib/csv_utils/csv_options.rb +11 -11
data/lib/csv_utils/csv_report.rb +5 -2
data/lib/csv_utils/csv_row.rb +3 -1
data/lib/csv_utils/csv_row_matcher.rb +34 -0
data/lib/csv_utils/csv_sort.rb +110 -96
data/lib/csv_utils/csv_transformer.rb +95 -92
data/lib/csv_utils/csv_wrapper.rb +40 -36
metadata +13 -6
data/docs/ARCHITECTURE.md +0 -134

data/lib/csv_utils/csv_iterator.rb CHANGED Viewed

@@ -1,102 +1,117 @@
-# Search a CSV given a series of steps
-class CSVUtils::CSVIterator
-  include Enumerable
-  attr_reader :prev_row
+# frozen_string_literal: true
-  class RowWrapper < Hash
-    attr_accessor :lineno
+# Search a CSV given a series of steps
+module CSVUtils
+  class CSVIterator
+    include Enumerable
+    # Byte order marks, as binary strings, that may prefix the first header cell.
+    # Longest marks come first: the UTF-32 LE mark begins with the UTF-16 LE mark,
+    # so a first-match scan over this list must see the 4-byte marks before the
+    # 2-byte ones or it would strip only half of a UTF-32 LE mark.
+    BYTE_ORDER_MARKS = [
+      (+"\x00\x00\xFE\xFF").force_encoding('ASCII-8BIT'),   # UTF-32 BE
+      (+"\xFF\xFE\x00\x00").force_encoding('ASCII-8BIT'),   # UTF-32 LE
+      (+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT'),       # UTF-8
+      (+"\xFE\xFF").force_encoding('ASCII-8BIT'),           # UTF-16 BE
+      (+"\xFF\xFE").force_encoding('ASCII-8BIT')            # UTF-16 LE
+    ].freeze
+    attr_reader :prev_row
+    class RowWrapper < Hash
+      attr_accessor :lineno
+      # Builds a RowWrapper that maps each header to the cell at the same
+      # position in row, and records lineno on the new wrapper.
+      def self.create(headers, row, lineno)
+        RowWrapper[headers.zip(row)].tap { |wrapped| wrapped.lineno = lineno }
+      end
-    def self.create(headers, row, lineno)
-      row_wrapper = RowWrapper[headers.zip(row)]
-      row_wrapper.lineno = lineno
-      row_wrapper
+      # Renders the non-blank cells, one numbered "header: value" line per cell,
+      # followed by a trailing newline. Nil and whitespace-only values are left out.
+      def to_pretty_s
+        populated = reject { |_, value| value.nil? || value.strip.empty? }
+        lines = populated.each.with_index(1).map do |(header, value), position|
+          format('  %-3d %s: %s', position, header, value)
+        end
+        "#{lines.join("\n")}\n"
+      end
     end
-    def to_pretty_s
-      reject { |_, v| v.nil? || v.strip.empty? }
-        .each_with_index
-        .map { |(k, v), idx| sprintf('  %-3d %s: %s', idx+1, k, v) }
-        .join("\n") + "\n"
+    # Wraps src_csv in a CSVUtils::CSVWrapper, passing along the open mode
+    # (default 'rb') and csv_options (default: empty hash).
+    def initialize(src_csv, csv_options = {}, mode = 'rb')
+      @src_csv = CSVUtils::CSVWrapper.new(src_csv, mode, csv_options)
     end
-  end
-  def initialize(src_csv, csv_options = {}, mode = 'rb')
-    @src_csv = CSVUtils::CSVWrapper.new(src_csv, mode, csv_options)
-  end
+    def each(headers = nil)
+      @src_csv.rewind
-  def each(headers = nil)
-    @src_csv.rewind
+      lineno = 0
+      unless headers
+        headers = @src_csv.shift
+        strip_bom!(headers[0])
+        lineno += 1
+      end
-    lineno = 0
-    unless headers
-      headers = @src_csv.shift
-      strip_bom!(headers[0])
-      lineno += 1
+      @prev_row = nil
+      while (row = @src_csv.shift)
+        lineno += 1
+        yield RowWrapper.create(headers, row, lineno)
+        @prev_row = row
+      end
     end
-    @prev_row = nil
-    while (row = @src_csv.shift)
-      lineno += 1
-      yield RowWrapper.create(headers, row, lineno)
-      @prev_row = row
+    # Rewinds the source and returns its first row, with any byte order mark
+    # removed from the first cell.
+    def headers
+      @src_csv.rewind
+      @src_csv.shift.tap { |header_row| strip_bom!(header_row[0]) }
     end
-  end
-  def headers
-    @src_csv.rewind
-    headers = @src_csv.shift
-    strip_bom!(headers[0])
-    headers
-  end
+    def to_hash(key, value = nil, &)
+      raise("header #{key} not found in #{headers}") unless headers.include?(key)
+      raise("headers #{value} not found in #{headers}") if value && !headers.include?(value)
-  def to_hash(key, value = nil)
-    raise("header #{key} not found in #{headers}") unless headers.include?(key)
-    raise("headers #{value} not found in #{headers}") if value && !headers.include?(value)
+      value_proc =
+        if value
+          proc { |row| row[value] }
+        else
+          proc(&)
+        end
-    value_proc =
-      if value
-        proc { |row| row[value] }
-      else
-        proc { |row| yield(row) }
+      each_with_object({}) do |row, hsh|
+        hsh[row[key]] = value_proc.call(row)
       end
-    each_with_object({}) do |row, hsh|
-      hsh[row[key]] = value_proc.call(row)
     end
-  end
-  def size
-    @src_csv.rewind
-    @src_csv.shift
-    cnt = 0
-    while @src_csv.shift
-      cnt +=1
+    # Counts the rows that follow the first row of the source. The source is
+    # rewound first, so the result does not depend on the current read position.
+    def size
+      @src_csv.rewind
+      @src_csv.shift # the first row is skipped, not counted
+      row_count = 0
+      while @src_csv.shift
+        row_count += 1
+      end
+      row_count
     end
-    cnt
-  end
-  def each_batch(batch_size = 1_000)
-    batch = []
-    process_batch_proc = Proc.new do
-      yield batch
+    def each_batch(batch_size = 1_000)
       batch = []
-    end
-    each do |row|
-      batch << row
-      process_batch_proc.call if batch.size >= batch_size
-    end
+      process_batch_proc = proc do
+        yield batch
+        batch = []
+      end
-    process_batch_proc.call if batch.size > 0
+      each do |row|
+        batch << row
+        process_batch_proc.call if batch.size >= batch_size
+      end
-    nil
-  end
+      process_batch_proc.call if batch.size.positive?
-  private
+      nil
+    end
-  def strip_bom!(col)
-    col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
+    private
+    # Removes a leading byte order mark from col in place.
+    # When several marks match (the UTF-32 LE mark starts with the UTF-16 LE
+    # mark) the longest one is removed, so no stray mark bytes are left behind.
+    # col is left untouched when it starts with none of the known marks.
+    def strip_bom!(col)
+      bom = BYTE_ORDER_MARKS.select { |mark| col.start_with?(mark) }.max_by(&:bytesize)
+      col.sub!(bom, '') if bom
+    end
   end
 end

data/lib/csv_utils/csv_options.rb CHANGED Viewed

@@ -1,35 +1,35 @@
+# frozen_string_literal: true
 # Auto detect a csv files options
 module CSVUtils
   class CSVOptions
     # this list is from https://en.wikipedia.org/wiki/Byte_order_mark
     BYTE_ORDER_MARKS = {
-      "\xEF\xBB\xBF".force_encoding('ASCII-8BIT') => 'UTF-8',
-      "\xFE\xFF".force_encoding('ASCII-8BIT') => 'UTF-16',
-      "\xFF\xFE".force_encoding('ASCII-8BIT') => 'UTF-16',
-      "\x00\x00\xFE\xFF".force_encoding('ASCII-8BIT') => 'UTF-32',
-      "\xFF\xFE\x00\x00".force_encoding('ASCII-8BIT') => 'UTF-32'
-    }
+      (+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT') => 'UTF-8',
+      (+"\xFE\xFF").force_encoding('ASCII-8BIT') => 'UTF-16',
+      (+"\xFF\xFE").force_encoding('ASCII-8BIT') => 'UTF-16',
+      (+"\x00\x00\xFE\xFF").force_encoding('ASCII-8BIT') => 'UTF-32',
+      (+"\xFF\xFE\x00\x00").force_encoding('ASCII-8BIT') => 'UTF-32'
+    }.freeze
     COL_SEPARATORS = [
       "\x02",
       "\t",
       '|',
       ','
-    ]
+    ].freeze
     ROW_SEPARATORS = [
       "\r\n",
       "\n",
       "\r"
-    ]
+    ].freeze
     attr_reader :columns,
                 :byte_order_mark,
                 :encoding,
                 :col_separator,
                 :row_separator
     def initialize(io)
       line =
@@ -81,7 +81,7 @@ module CSVUtils
     end
     def strip_byte_order_marks(header)
-      @byte_order_marks ? header.sub(@byte_order_marks, '') : header
+      @byte_order_mark ? header.sub(@byte_order_mark, '') : header
     end
   end
 end

data/lib/csv_utils/csv_report.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 # Builds a csv file from csv rows
 module CSVUtils
   class CSVReport
@@ -8,8 +10,9 @@ module CSVUtils
       @csv =
         if csv.is_a?(String)
           @must_close = true
-          mode = csv_options.delete(:mode) || 'wb'
-          CSV.open(csv, mode, **csv_options)
+          opts = csv_options.dup
+          mode = opts.delete(:mode) || 'wb'
+          CSV.open(csv, mode, **opts)
         else
           @must_close = false
           csv

data/lib/csv_utils/csv_row.rb CHANGED Viewed

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'inheritance-helper'
 module CSVUtils
@@ -38,7 +40,7 @@ module CSVUtils
     def csv_row
       self.class.csv_columns.values.map { |column_options| csv_column_value(column_options) }
     end
-    alias_method :to_a, :csv_row
+    alias to_a csv_row
     def csv_headers
       self.class.csv_headers

data/lib/csv_utils/csv_row_matcher.rb ADDED Viewed

@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+# Decides whether a row contains a value matching a pattern, looking either
+# at every value of the row or only at a chosen list of columns.
+module CSVUtils
+  class CSVRowMatcher
+    attr_accessor :regex, :columns
+    # regex is handed to match? on each candidate value; columns is :all
+    # (the default) or a collection of keys to look up in the row.
+    def initialize(regex, columns = :all)
+      self.regex = regex
+      self.columns = columns
+    end
+    # True as soon as one non-nil candidate value matches regex, false otherwise.
+    def match?(row)
+      return row.each_value.any? { |value| value&.match?(regex) } if columns == :all
+      columns.any? { |column_name| row[column_name]&.match?(regex) }
+    end
+    # Lets a matcher be passed with & wherever a block taking a row is expected.
+    def to_proc
+      proc { |row| match?(row) }
+    end
+  end
+end

data/lib/csv_utils/csv_sort.rb CHANGED Viewed

@@ -1,116 +1,130 @@
+# frozen_string_literal: true
 require 'fileutils'
 # Utility class for sorting the rows for a csv file
-class CSVUtils::CSVSort
-  attr_reader :csv_file,
-              :new_csv_file,
-              :has_headers,
-              :csv_options,
-              :headers
-  def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
-    @csv_file = csv_file
-    @new_csv_file = new_csv_file
-    @has_headers = has_headers
-    @csv_options = csv_options
-    @csv_part_files = []
-    @files_to_delete = []
-  end
-  def sort(batch_size = 100_000, &block)
-    create_sorted_csv_part_files(batch_size, &block)
-    merge_csv_part_files(&block)
-  end
-  private
-  def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
-    src1 = CSV.open(src_csv_file1, 'rb', **csv_options)
-    src2 = CSV.open(src_csv_file2, 'rb', **csv_options)
-    dest = CSV.open(dest_csv_file, 'wb', **csv_options)
-    if @headers
-      dest << @headers
-      src1.shift
-      src2.shift
+module CSVUtils
+  class CSVSort
+    attr_reader :csv_file,
+                :new_csv_file,
+                :has_headers,
+                :csv_options,
+                :headers
+    def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
+      @csv_file = csv_file
+      @new_csv_file = new_csv_file
+      @has_headers = has_headers
+      @csv_options = csv_options
+      @csv_part_files = []
+      @files_to_delete = []
     end
-    row1 = src1.shift
-    row2 = src2.shift
-    append_row1_proc = Proc.new do
-      dest << row1
-      row1 = src1.shift
-    end
-    append_row2_proc = Proc.new do
-      dest << row2
-      row2 = src2.shift
+    # Sorts the CSV in two passes: first writes sorted part files holding up to
+    # batch_size rows each, then merges the part files into the new CSV file.
+    # The given block is forwarded to both passes as the row comparator.
+    def sort(batch_size = 100_000, &)
+      create_sorted_csv_part_files(batch_size, &)
+      merge_csv_part_files(&)
     end
-    while row1 || row2
-      if row1.nil?
-        append_row2_proc.call
-      elsif row2.nil?
-        append_row1_proc.call
-      elsif yield(row1, row2) <= 0
-        append_row1_proc.call
-      else
-        append_row2_proc.call
+    private
+    # rubocop:disable Metrics/MethodLength
+    # Merges two already-sorted CSV files into dest_csv_file. The block compares
+    # two rows; a result <= 0 means the row from the first file is written first.
+    # When @headers is set, it is written once to the destination and the first
+    # row of each source is skipped. All three files are closed on exit, the
+    # destination first and the first source last, even when an error is raised.
+    def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
+      CSV.open(src_csv_file1, 'rb', **csv_options) do |left|
+        CSV.open(src_csv_file2, 'rb', **csv_options) do |right|
+          CSV.open(dest_csv_file, 'wb', **csv_options) do |merged|
+            if @headers
+              merged << @headers
+              left.shift
+              right.shift
+            end
+            left_row = left.shift
+            right_row = right.shift
+            while left_row || right_row
+              # The comparator is only consulted when both sides still have a row.
+              take_left = right_row.nil? || (!left_row.nil? && yield(left_row, right_row) <= 0)
+              if take_left
+                merged << left_row
+                left_row = left.shift
+              else
+                merged << right_row
+                right_row = right.shift
+              end
+            end
+          end
+        end
       end
     end
-    src1.close
-    src2.close
-    dest.close
-  end
-  def create_sorted_csv_part_files(batch_size, &block)
-    src = CSV.open(csv_file, 'rb', **csv_options)
-    @headers = src.shift if has_headers
-    batch = []
-    create_batch_part_proc = Proc.new do
-      batch.sort!(&block)
-      @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
-      CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
-        csv << @headers if @headers
-        batch.each { |row| csv << row }
+    # rubocop:enable Metrics/MethodLength
+    def create_sorted_csv_part_files(batch_size, &block)
+      src = CSV.open(csv_file, 'rb', **csv_options)
+      begin
+        @headers = src.shift if has_headers
+        batch = []
+        create_batch_part_proc = proc do
+          batch.sort!(&block)
+          @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
+          CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
+            csv << @headers if @headers
+            batch.each { |row| csv << row }
+          end
+          batch = []
+        end
+        while (row = src.shift)
+          batch << row
+          create_batch_part_proc.call if batch.size >= batch_size
+        end
+        create_batch_part_proc.call if batch.size.positive?
+      ensure
+        src.close
       end
-      batch = []
     end
-    while (row = src.shift)
-      batch << row
-      create_batch_part_proc.call if batch.size >= batch_size
-    end
-    create_batch_part_proc.call if batch.size > 0
-    src.close
-  end
-  def merge_csv_part_files(&block)
-    file_merge_cnt = 0
+    def merge_csv_part_files(&)
+      file_merge_cnt = 0
-    while @csv_part_files.size > 1
-      file_merge_cnt += 1
+      while @csv_part_files.size > 1
+        file_merge_cnt += 1
-      csv_part_file1 = @csv_part_files.shift
-      csv_part_file2 = @csv_part_files.shift
-      @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
+        csv_part_file1 = @csv_part_files.shift
+        csv_part_file2 = @csv_part_files.shift
+        @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
-      merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)
+        merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &)
-      File.unlink(csv_part_file1)
-      File.unlink(csv_part_file2)
-    end
+        File.unlink(csv_part_file1)
+        File.unlink(csv_part_file2)
+      end
-    if @csv_part_files.size > 0
-      FileUtils.mv(@csv_part_files.last, new_csv_file)
-    else
-      FileUtils.cp(@csv_file, new_csv_file)
+      if @csv_part_files.size.positive?
+        FileUtils.mv(@csv_part_files.last, new_csv_file)
+      else
+        FileUtils.cp(@csv_file, new_csv_file)
+      end
     end
   end
 end