RubyGems - csv-utils - Versions diffs - 0.2.0 → 0.3.1 - Mend

csv-utils 0.2.0 → 0.3.1

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/bin/csv-validator +47 -0
data/csv-utils.gemspec +1 -1
data/lib/csv-utils.rb +4 -0
data/lib/csv_utils/csv_extender.rb +63 -0
data/lib/csv_utils/csv_report.rb +4 -3
data/lib/csv_utils/csv_row.rb +13 -7
data/lib/csv_utils/csv_sort.rb +112 -0
data/lib/csv_utils/csv_transformer.rb +119 -0
data/lib/csv_utils/csv_wrapper.rb +47 -0
metadata +8 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ecb75f60c8e9b9db4cc3eb0e4ca3a0ac53aad67726ed995b7e8c341cd0dc76a3
-  data.tar.gz: 5138b5cc82eec0b7667c9e3435c2662bbfa1de51e469582372a158b943e57d7f
+  metadata.gz: '019dcd269f036bc21e93019e567e8a0223d8436e87c56519d74af02383640bdf'
+  data.tar.gz: 34b4e8035a533e897c395943e892de0bae16fbdc3847a4990cc0281225d21bd4
 SHA512:
-  metadata.gz: 1a7d685b0db28805833596b32793fca968c6a5a1f57346223b487579622423d0151a122253c69806e377015fe0cf9cf02381bafbee37cb0ba54fa290a857c1cc
-  data.tar.gz: 7c572f9e7c74d626084612afa188bb15b036377bbac16ed5374d7fcff300e58fab8100a77729c2fa428e4294ac6b3b643feb00b2e87e7de78883c8548193de54
+  metadata.gz: e770276baa097fa30551266882910818f331890c6e9bfd7fa92ab01654826a14ad3b67f151f2ce858c816d6d628170174fcc872038636998c15c818dc129130a
+  data.tar.gz: a1689e7404f5d9b70f092b7cda83c8f200df0af1725d27ecc222d798bafcbb1ea1612cbff74dbef493fe427a12ff8330f7828ed608becb4a3d3cd7267179319e

data/bin/csv-validator ADDED

@@ -0,0 +1,47 @@
+#!/usr/bin/env ruby
+require 'csv'
+require 'rchardet'
+def utf8?(str)
+  str
+    .force_encoding('utf-8')
+    .valid_encoding?
+end
+def convert_to_utf8(str, current_encoding)
+  str.force_encoding(current_encoding)
+  return nil unless str.valid_encoding?
+  str.encode('utf-8')
+end
+def detect_encoding(col)
+  CharDet.detect(col)['encoding']
+end
+csv = CSV.open(ARGV[0], 'rb')
+headers = csv.shift
+csv_lineno = 1
+while (row = csv.shift)
+  csv_lineno += 1
+  unless row.size == headers.size
+    $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
+  end
+  row.each_with_index do |col, idx|
+    next if utf8?(col)
+    $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): none UTF-8 characters found in \"#{col}\""
+    if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
+      puts "row(#{csv_lineno}),col(#{idx + 1}): converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
+    else
+      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): unknown character encoding"
+    end
+  end
+end
+csv.close

data/csv-utils.gemspec CHANGED

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name        = 'csv-utils'
-  s.version     = '0.2.0'
+  s.version     = '0.3.1'
   s.licenses    = ['MIT']
   s.summary     = 'CSV Utils'
   s.description = 'Tools for debugging malformed CSV files'

data/lib/csv-utils.rb CHANGED

@@ -2,7 +2,11 @@ require 'csv'
 # Collection of tools for working with CSV files.
 module CSVUtils
+  autoload :CSVExtender, 'csv_utils/csv_extender'
   autoload :CSVOptions, 'csv_utils/csv_options'
   autoload :CSVReport, 'csv_utils/csv_report'
   autoload :CSVRow, 'csv_utils/csv_row'
+  autoload :CSVSort, 'csv_utils/csv_sort'
+  autoload :CSVTransformer, 'csv_utils/csv_transformer'
+  autoload :CSVWrapper, 'csv_utils/csv_wrapper'
 end

data/lib/csv_utils/csv_extender.rb ADDED

@@ -0,0 +1,63 @@
+# Utility class for appending data to a csv file.
+class CSVUtils::CSVExtender
+  def initialize(src_csv, dest_csv, csv_options = {})
+    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
+    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
+  end
+  def append(additional_headers)
+    process(additional_headers) do |current_headers|
+      while (row = @src_csv.shift)
+        additional_columns = yield row, current_headers
+        @dest_csv << (row + additional_columns)
+      end
+    end
+  end
+  def append_in_batches(additional_headers, batch_size = 1_000)
+    process(additional_headers) do |current_headers|
+      batch = []
+      process_batch_proc = Proc.new do
+        additional_rows = yield batch, current_headers
+        batch.each_with_index do |row, idx|
+          @dest_csv << (row + additional_rows[idx])
+        end
+        batch = []
+      end
+      while (row = @src_csv.shift)
+        batch << row
+        process_batch_proc.call if batch.size >= batch_size
+      end
+      process_batch_proc.call if batch.size > 0
+    end
+  end
+  private
+  def process(additional_headers)
+    current_headers = append_headers(additional_headers)
+    yield current_headers
+    close
+  end
+  def close
+    @src_csv.close
+    @dest_csv.close
+  end
+  def append_headers(additional_headers)
+    return nil unless additional_headers
+    current_headers = @src_csv.shift
+    @dest_csv << (current_headers + additional_headers)
+    current_headers
+  end
+end

data/lib/csv_utils/csv_report.rb CHANGED

@@ -4,7 +4,7 @@ module CSVUtils
     attr_reader :csv,
                 :must_close
-    def initialize(csv, csv_options = {}, &block)
+    def initialize(csv, headers = nil, csv_options = {}, &block)
       @csv =
         if csv.is_a?(String)
           @must_close = true
@@ -15,10 +15,11 @@ module CSVUtils
           csv
         end
-      generate(&block) if block
+      generate(headers, &block) if block
     end
-    def generate
+    def generate(headers = nil)
+      add_headers(headers) if headers
       yield self
       @csv.close if @must_close
     end

data/lib/csv_utils/csv_row.rb CHANGED

@@ -23,10 +23,16 @@ module CSVUtils
         add_value_to_class_method(:csv_columns, header => options)
       end
-    end
-    def csv_headers
-      self.class.csv_columns.values.map { |column_options| csv_column_header(column_options) }
+      def csv_headers
+        csv_columns.values.map { |column_options| csv_column_header(column_options) }
+      end
+      private
+      def csv_column_header(column_options)
+        column_options[:header]
+      end
     end
     def csv_row
@@ -34,12 +40,12 @@ module CSVUtils
     end
     alias_method :to_a, :csv_row
-    private
-    def csv_column_header(column_options)
-      column_options[:header]
+    def csv_headers
+      self.class.csv_headers
     end
+    private
     def csv_column_value(column_options)
       if column_options[:proc]
         instance_eval(&column_options[:proc])

data/lib/csv_utils/csv_sort.rb ADDED

@@ -0,0 +1,112 @@
+require 'fileutils'
+# Utility class for sorting the rows for a csv file
+class CSVUtils::CSVSort
+  attr_reader :csv_file,
+              :new_csv_file,
+              :has_headers,
+              :csv_options,
+              :headers
+  def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
+    @csv_file = csv_file
+    @new_csv_file = new_csv_file
+    @has_headers = has_headers
+    @csv_options = csv_options
+    @csv_part_files = []
+    @files_to_delete = []
+  end
+  def sort(batch_size = 100_000, &block)
+    create_sorted_csv_part_files(batch_size, &block)
+    merge_csv_part_files(&block)
+  end
+  private
+  def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
+    src1 = CSV.open(src_csv_file1, 'rb', csv_options)
+    src2 = CSV.open(src_csv_file2, 'rb', csv_options)
+    dest = CSV.open(dest_csv_file, 'wb', csv_options)
+    if @headers
+      dest << @headers
+      src1.shift
+      src2.shift
+    end
+    row1 = src1.shift
+    row2 = src2.shift
+    append_row1_proc = Proc.new do
+      dest << row1
+      row1 = src1.shift
+    end
+    append_row2_proc = Proc.new do
+      dest << row2
+      row2 = src2.shift
+    end
+    while row1 || row2
+      if row1.nil?
+        append_row2_proc.call
+      elsif row2.nil?
+        append_row1_proc.call
+      elsif yield(row1, row2) <= 0
+        append_row1_proc.call
+      else
+        append_row2_proc.call
+      end
+    end
+    src1.close
+    src2.close
+    dest.close
+  end
+  def create_sorted_csv_part_files(batch_size, &block)
+    src = CSV.open(csv_file, 'rb', csv_options)
+    @headers = src.shift if has_headers
+    batch = []
+    create_batch_part_proc = Proc.new do
+      batch.sort!(&block)
+      @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
+      CSV.open(@csv_part_files.last, 'wb', csv_options) do |csv|
+        csv << @headers if @headers
+        batch.each { |row| csv << row }
+      end
+      batch = []
+    end
+    while (row = src.shift)
+      batch << row
+      create_batch_part_proc.call if batch.size >= batch_size
+    end
+    create_batch_part_proc.call if batch.size > 0
+    src.close
+  end
+  def merge_csv_part_files(&block)
+    file_merge_cnt = 0
+    while @csv_part_files.size > 1
+      file_merge_cnt += 1
+      csv_part_file1 = @csv_part_files.shift
+      csv_part_file2 = @csv_part_files.shift
+      @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
+      merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)
+      File.unlink(csv_part_file1)
+      File.unlink(csv_part_file2)
+    end
+    FileUtils.mv(@csv_part_files.last, new_csv_file)
+  end
+end

data/lib/csv_utils/csv_transformer.rb ADDED

@@ -0,0 +1,119 @@
+# Transforms a CSV given a series of steps
+class CSVUtils::CSVTransformer
+  attr_reader :headers
+  def initialize(src_csv, dest_csv, csv_options = {})
+    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
+    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
+  end
+  def read_headers
+    @headers = @src_csv.shift
+    self
+  end
+  def additional_data(&block)
+    steps << [:additional_data, @headers, block]
+    self
+  end
+  def select(&block)
+    steps << [:select, @headers, block]
+    self
+  end
+  def reject(&block)
+    steps << [:reject, @headers, block]
+    self
+  end
+  def map(new_headers, &block)
+    steps << [:map, @headers, block]
+    @headers = new_headers
+    self
+  end
+  def append(additional_headers, &block)
+    steps << [:append, @headers, block]
+    if additional_headers
+      @headers += additional_headers
+    else
+      @headers = nil
+    end
+    self
+  end
+  def each(&block)
+    steps << [:each, @headers, block]
+    self
+  end
+  def set_headers(headers)
+    @headers = headers
+    self
+  end
+  def process(batch_size = 10_000, &block)
+    batch = []
+    @dest_csv << @headers if @headers
+    steps_proc = Proc.new do
+      steps.each do |step_type, current_headers, proc|
+        batch = process_step(step_type, current_headers, batch, &proc)
+      end
+      batch.each { |row| @dest_csv << row }
+      batch = []
+    end
+    while (row = @src_csv.shift)
+      batch << row
+      steps_proc.call if batch.size >= batch_size
+    end
+    steps_proc.call if batch.size > 0
+    @src_csv.close
+    @dest_csv.close
+  end
+  private
+  def steps
+    @steps ||= []
+  end
+  def process_step(step_type, current_headers, batch, &block)
+    case step_type
+    when :select
+      batch.select! do |row|
+        block.call row, current_headers, @additional_data
+      end
+    when :reject
+      batch.reject! do |row|
+        block.call row, current_headers, @additional_data
+      end
+    when :map
+      batch.map! do |row|
+        block.call row, current_headers, @additional_data
+      end
+    when :append
+      batch.map! do |row|
+        row + block.call(row, current_headers, @additional_data)
+      end
+    when :additional_data
+      @additional_data = block.call(batch, current_headers)
+    when :each
+      batch.each do |row|
+        block.call(row, current_headers, @additional_data)
+      end
+    end
+    batch
+  end
+end

data/lib/csv_utils/csv_wrapper.rb ADDED

@@ -0,0 +1,47 @@
+# Wraps a CSV object, if wrapper opens the csv file it will close it
+class CSVUtils::CSVWrapper
+  attr_reader :csv
+  def initialize(csv, mode, csv_options)
+    open(csv, mode, csv_options)
+  end
+  def self.open(file, mode, csv_options = {})
+    csv = new(file, mode, csv_options)
+    if block_given?
+      yield csv
+      csv.close
+    else
+      csv
+    end
+  end
+  def open(csv, mode, csv_options)
+    if csv.is_a?(String)
+      @close_when_done = true
+      @csv = CSV.open(csv, mode, csv_options)
+    else
+      @close_when_done = false
+      @csv = csv
+    end
+  end
+  def <<(row)
+    csv << row
+  end
+  def shift
+    csv.shift
+  end
+  def close
+    csv.close if close_when_done?
+  end
+  private
+  def close_when_done?
+    @close_when_done
+  end
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: csv-utils
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.1
 platform: ruby
 authors:
 - Doug Youch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-06-23 00:00:00.000000000 Z
+date: 2020-07-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: inheritance-helper
@@ -30,6 +30,7 @@ executables:
 - csv-change-eol
 - csv-find-error
 - csv-readline
+- csv-validator
 extensions: []
 extra_rdoc_files: []
 files:
@@ -43,11 +44,16 @@ files:
 - bin/csv-change-eol
 - bin/csv-find-error
 - bin/csv-readline
+- bin/csv-validator
 - csv-utils.gemspec
 - lib/csv-utils.rb
+- lib/csv_utils/csv_extender.rb
 - lib/csv_utils/csv_options.rb
 - lib/csv_utils/csv_report.rb
 - lib/csv_utils/csv_row.rb
+- lib/csv_utils/csv_sort.rb
+- lib/csv_utils/csv_transformer.rb
+- lib/csv_utils/csv_wrapper.rb
 - script/console
 homepage: https://github.com/dougyouch/csv-utils
 licenses: