RubyGems - csv-utils - Versions diffs - 0.2.1 → 0.3.2 - Mend

csv-utils 0.2.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/bin/csv-validator +60 -0
data/csv-utils.gemspec +1 -1
data/lib/csv-utils.rb +3 -0
data/lib/csv_utils/csv_extender.rb +13 -26
data/lib/csv_utils/csv_report.rb +4 -3
data/lib/csv_utils/csv_row.rb +13 -7
data/lib/csv_utils/csv_sort.rb +112 -0
data/lib/csv_utils/csv_transformer.rb +119 -0
data/lib/csv_utils/csv_wrapper.rb +47 -0
metadata +7 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: '083044bd714b955ff9d5f6a44cf6d4cb3344cf7f649b105d81a94b3ddb1c9425'
-  data.tar.gz: 5fa1cd9acacf10c275a23176e36722e189f8e6d8a85e1bfc7faf18eb45a1ca31
+  metadata.gz: 7b3f6bdded232bf3be2009d4bbb5ab99e083cd9f48dff5fabe89324f5217550a
+  data.tar.gz: 86b130f177d173a74bc1611c5677cc6bad5053e953a1a5fabfeabee494e372c8
 SHA512:
-  metadata.gz: de36fb6c80a68b33c92f3c943a665c419ac1287f5b4b2901c1a60f45964d4b13c2199f53e8dd779b2947d6c86439fa1b1b781cc5ba2225656b2d8b50f690e4cc
-  data.tar.gz: 0a6b5a9301f2386c2ad5bf3009ca77f949a92510ebb606844a1e6bf9fcc573f524966e68d1d8cec7db9778be3c93fb58a006fee9008e531c53a2c5391af6c0c3
+  metadata.gz: dd0a299cbe4b153f122d605bbb1b2ab08726194d58010e909090048f1cef0e384d08c8a3a59683b8108c1dc3766deacac46a9063af35e57502f6fe3c92b855ff
+  data.tar.gz: e4233e0c38338d24a6105d465745092146a65e03f297442fabada13ecf380db70740196726ec2d40a8f92828b6cfbc6945c0274701091cb2687af549126e34fa

data/bin/csv-validator ADDED

@@ -0,0 +1,60 @@
+#!/usr/bin/env ruby
+require 'csv'
+begin
+  require 'rchardet'
+rescue LoadError
+  $stderr.puts 'gem install rchardet'
+  exit 1
+end
+def utf8?(str)
+  str
+    .force_encoding('utf-8')
+    .valid_encoding?
+end
+def convert_to_utf8(str, current_encoding)
+  str.force_encoding(current_encoding)
+  return nil unless str.valid_encoding?
+  str.encode('utf-8')
+end
+def detect_encoding(col)
+  CharDet.detect(col)['encoding']
+end
+csv = CSV.open(ARGV[0], 'rb')
+out = CSV.open(ARGV[1], 'wb') if ARGV[1]
+headers = csv.shift
+out << headers if out
+csv_lineno = 1
+while (row = csv.shift)
+  csv_lineno += 1
+  unless row.size == headers.size
+    $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
+  end
+  converted = false
+  row.each_with_index do |col, idx|
+    next if utf8?(col)
+    $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): none UTF-8 characters found in \"#{col}\""
+    if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
+      converted = true
+      puts "row(#{csv_lineno}),col(#{idx + 1}): converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
+      row[idx] = col_utf8_encoded
+    else
+      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): unknown character encoding"
+    end
+  end
+  out << row if out && converted
+end
+csv.close
+out.close if out

data/csv-utils.gemspec CHANGED

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name        = 'csv-utils'
-  s.version     = '0.2.1'
+  s.version     = '0.3.2'
   s.licenses    = ['MIT']
   s.summary     = 'CSV Utils'
   s.description = 'Tools for debugging malformed CSV files'

data/lib/csv-utils.rb CHANGED

@@ -6,4 +6,7 @@ module CSVUtils
   autoload :CSVOptions, 'csv_utils/csv_options'
   autoload :CSVReport, 'csv_utils/csv_report'
   autoload :CSVRow, 'csv_utils/csv_row'
+  autoload :CSVSort, 'csv_utils/csv_sort'
+  autoload :CSVTransformer, 'csv_utils/csv_transformer'
+  autoload :CSVWrapper, 'csv_utils/csv_wrapper'
 end

data/lib/csv_utils/csv_extender.rb CHANGED

@@ -1,20 +1,15 @@
 # Utility class for appending data to a csv file.
 class CSVUtils::CSVExtender
-  attr_reader :csv_file,
-              :new_csv_file,
-              :csv_options
-  def initialize(csv_file, new_csv_file, csv_options = {})
-    @csv_file = csv_file
-    @new_csv_file = new_csv_file
-    @csv_options = csv_options
+  def initialize(src_csv, dest_csv, csv_options = {})
+    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
+    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
   end
   def append(additional_headers)
     process(additional_headers) do |current_headers|
-      while (row = src.shift)
+      while (row = @src_csv.shift)
         additional_columns = yield row, current_headers
-        dest << (row + additional_columns)
+        @dest_csv << (row + additional_columns)
       end
     end
   end
@@ -27,13 +22,13 @@ class CSVUtils::CSVExtender
         additional_rows = yield batch, current_headers
         batch.each_with_index do |row, idx|
-          dest << (row + additional_rows[idx])
+          @dest_csv << (row + additional_rows[idx])
         end
         batch = []
       end
-      while (row = src.shift)
+      while (row = @src_csv.shift)
         batch << row
         process_batch_proc.call if batch.size >= batch_size
@@ -43,6 +38,8 @@ class CSVUtils::CSVExtender
     end
   end
+  private
   def process(additional_headers)
     current_headers = append_headers(additional_headers)
@@ -51,26 +48,16 @@ class CSVUtils::CSVExtender
     close
   end
-  def src
-    @src ||= CSV.open(csv_file, 'rb', csv_options)
-  end
-  def dest
-    @dest ||= CSV.open(new_csv_file, 'wb', csv_options)
-  end
   def close
-    src.close
-    dest.close
+    @src_csv.close
+    @dest_csv.close
   end
-  private
   def append_headers(additional_headers)
     return nil unless additional_headers
-    current_headers = src.shift
-    dest << (current_headers + additional_headers)
+    current_headers = @src_csv.shift
+    @dest_csv << (current_headers + additional_headers)
     current_headers
   end
 end

data/lib/csv_utils/csv_report.rb CHANGED

@@ -4,7 +4,7 @@ module CSVUtils
     attr_reader :csv,
                 :must_close
-    def initialize(csv, csv_options = {}, &block)
+    def initialize(csv, headers = nil, csv_options = {}, &block)
       @csv =
         if csv.is_a?(String)
           @must_close = true
@@ -15,10 +15,11 @@ module CSVUtils
           csv
         end
-      generate(&block) if block
+      generate(headers, &block) if block
     end
-    def generate
+    def generate(headers = nil)
+      add_headers(headers) if headers
       yield self
       @csv.close if @must_close
     end

data/lib/csv_utils/csv_row.rb CHANGED

@@ -23,10 +23,16 @@ module CSVUtils
         add_value_to_class_method(:csv_columns, header => options)
       end
-    end
-    def csv_headers
-      self.class.csv_columns.values.map { |column_options| csv_column_header(column_options) }
+      def csv_headers
+        csv_columns.values.map { |column_options| csv_column_header(column_options) }
+      end
+      private
+      def csv_column_header(column_options)
+        column_options[:header]
+      end
     end
     def csv_row
@@ -34,12 +40,12 @@ module CSVUtils
     end
     alias_method :to_a, :csv_row
-    private
-    def csv_column_header(column_options)
-      column_options[:header]
+    def csv_headers
+      self.class.csv_headers
     end
+    private
     def csv_column_value(column_options)
       if column_options[:proc]
         instance_eval(&column_options[:proc])

data/lib/csv_utils/csv_sort.rb ADDED

@@ -0,0 +1,112 @@
+require 'fileutils'
+# Utility class for sorting the rows for a csv file
+class CSVUtils::CSVSort
+  attr_reader :csv_file,
+              :new_csv_file,
+              :has_headers,
+              :csv_options,
+              :headers
+  def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
+    @csv_file = csv_file
+    @new_csv_file = new_csv_file
+    @has_headers = has_headers
+    @csv_options = csv_options
+    @csv_part_files = []
+    @files_to_delete = []
+  end
+  def sort(batch_size = 100_000, &block)
+    create_sorted_csv_part_files(batch_size, &block)
+    merge_csv_part_files(&block)
+  end
+  private
+  def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
+    src1 = CSV.open(src_csv_file1, 'rb', csv_options)
+    src2 = CSV.open(src_csv_file2, 'rb', csv_options)
+    dest = CSV.open(dest_csv_file, 'wb', csv_options)
+    if @headers
+      dest << @headers
+      src1.shift
+      src2.shift
+    end
+    row1 = src1.shift
+    row2 = src2.shift
+    append_row1_proc = Proc.new do
+      dest << row1
+      row1 = src1.shift
+    end
+    append_row2_proc = Proc.new do
+      dest << row2
+      row2 = src2.shift
+    end
+    while row1 || row2
+      if row1.nil?
+        append_row2_proc.call
+      elsif row2.nil?
+        append_row1_proc.call
+      elsif yield(row1, row2) <= 0
+        append_row1_proc.call
+      else
+        append_row2_proc.call
+      end
+    end
+    src1.close
+    src2.close
+    dest.close
+  end
+  def create_sorted_csv_part_files(batch_size, &block)
+    src = CSV.open(csv_file, 'rb', csv_options)
+    @headers = src.shift if has_headers
+    batch = []
+    create_batch_part_proc = Proc.new do
+      batch.sort!(&block)
+      @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
+      CSV.open(@csv_part_files.last, 'wb', csv_options) do |csv|
+        csv << @headers if @headers
+        batch.each { |row| csv << row }
+      end
+      batch = []
+    end
+    while (row = src.shift)
+      batch << row
+      create_batch_part_proc.call if batch.size >= batch_size
+    end
+    create_batch_part_proc.call if batch.size > 0
+    src.close
+  end
+  def merge_csv_part_files(&block)
+    file_merge_cnt = 0
+    while @csv_part_files.size > 1
+      file_merge_cnt += 1
+      csv_part_file1 = @csv_part_files.shift
+      csv_part_file2 = @csv_part_files.shift
+      @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
+      merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)
+      File.unlink(csv_part_file1)
+      File.unlink(csv_part_file2)
+    end
+    FileUtils.mv(@csv_part_files.last, new_csv_file)
+  end
+end

data/lib/csv_utils/csv_transformer.rb ADDED

@@ -0,0 +1,119 @@
+# Transforms a CSV given a series of steps
+class CSVUtils::CSVTransformer
+  attr_reader :headers
+  def initialize(src_csv, dest_csv, csv_options = {})
+    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
+    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
+  end
+  def read_headers
+    @headers = @src_csv.shift
+    self
+  end
+  def additional_data(&block)
+    steps << [:additional_data, @headers, block]
+    self
+  end
+  def select(&block)
+    steps << [:select, @headers, block]
+    self
+  end
+  def reject(&block)
+    steps << [:reject, @headers, block]
+    self
+  end
+  def map(new_headers, &block)
+    steps << [:map, @headers, block]
+    @headers = new_headers
+    self
+  end
+  def append(additional_headers, &block)
+    steps << [:append, @headers, block]
+    if additional_headers
+      @headers += additional_headers
+    else
+      @headers = nil
+    end
+    self
+  end
+  def each(&block)
+    steps << [:each, @headers, block]
+    self
+  end
+  def set_headers(headers)
+    @headers = headers
+    self
+  end
+  def process(batch_size = 10_000, &block)
+    batch = []
+    @dest_csv << @headers if @headers
+    steps_proc = Proc.new do
+      steps.each do |step_type, current_headers, proc|
+        batch = process_step(step_type, current_headers, batch, &proc)
+      end
+      batch.each { |row| @dest_csv << row }
+      batch = []
+    end
+    while (row = @src_csv.shift)
+      batch << row
+      steps_proc.call if batch.size >= batch_size
+    end
+    steps_proc.call if batch.size > 0
+    @src_csv.close
+    @dest_csv.close
+  end
+  private
+  def steps
+    @steps ||= []
+  end
+  def process_step(step_type, current_headers, batch, &block)
+    case step_type
+    when :select
+      batch.select! do |row|
+        block.call row, current_headers, @additional_data
+      end
+    when :reject
+      batch.reject! do |row|
+        block.call row, current_headers, @additional_data
+      end
+    when :map
+      batch.map! do |row|
+        block.call row, current_headers, @additional_data
+      end
+    when :append
+      batch.map! do |row|
+        row + block.call(row, current_headers, @additional_data)
+      end
+    when :additional_data
+      @additional_data = block.call(batch, current_headers)
+    when :each
+      batch.each do |row|
+        block.call(row, current_headers, @additional_data)
+      end
+    end
+    batch
+  end
+end

data/lib/csv_utils/csv_wrapper.rb ADDED

@@ -0,0 +1,47 @@
+# Wraps a CSV object, if wrapper opens the csv file it will close it
+class CSVUtils::CSVWrapper
+  attr_reader :csv
+  def initialize(csv, mode, csv_options)
+    open(csv, mode, csv_options)
+  end
+  def self.open(file, mode, csv_options = {})
+    csv = new(file, mode, csv_options)
+    if block_given?
+      yield csv
+      csv.close
+    else
+      csv
+    end
+  end
+  def open(csv, mode, csv_options)
+    if csv.is_a?(String)
+      @close_when_done = true
+      @csv = CSV.open(csv, mode, csv_options)
+    else
+      @close_when_done = false
+      @csv = csv
+    end
+  end
+  def <<(row)
+    csv << row
+  end
+  def shift
+    csv.shift
+  end
+  def close
+    csv.close if close_when_done?
+  end
+  private
+  def close_when_done?
+    @close_when_done
+  end
+end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: csv-utils
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.3.2
 platform: ruby
 authors:
 - Doug Youch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-07-04 00:00:00.000000000 Z
+date: 2020-07-20 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: inheritance-helper
@@ -30,6 +30,7 @@ executables:
 - csv-change-eol
 - csv-find-error
 - csv-readline
+- csv-validator
 extensions: []
 extra_rdoc_files: []
 files:
@@ -43,12 +44,16 @@ files:
 - bin/csv-change-eol
 - bin/csv-find-error
 - bin/csv-readline
+- bin/csv-validator
 - csv-utils.gemspec
 - lib/csv-utils.rb
 - lib/csv_utils/csv_extender.rb
 - lib/csv_utils/csv_options.rb
 - lib/csv_utils/csv_report.rb
 - lib/csv_utils/csv_row.rb
+- lib/csv_utils/csv_sort.rb
+- lib/csv_utils/csv_transformer.rb
+- lib/csv_utils/csv_wrapper.rb
 - script/console
 homepage: https://github.com/dougyouch/csv-utils
 licenses: