csv-utils 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '083044bd714b955ff9d5f6a44cf6d4cb3344cf7f649b105d81a94b3ddb1c9425'
4
- data.tar.gz: 5fa1cd9acacf10c275a23176e36722e189f8e6d8a85e1bfc7faf18eb45a1ca31
3
+ metadata.gz: f85c431ad42ed20382fbe91c3696153be9437a8ee755ede313be2d6f488b3770
4
+ data.tar.gz: b620cfb208a7a28573160103155564875b05990f965f9206ad89bd7cb6b5fcc7
5
5
  SHA512:
6
- metadata.gz: de36fb6c80a68b33c92f3c943a665c419ac1287f5b4b2901c1a60f45964d4b13c2199f53e8dd779b2947d6c86439fa1b1b781cc5ba2225656b2d8b50f690e4cc
7
- data.tar.gz: 0a6b5a9301f2386c2ad5bf3009ca77f949a92510ebb606844a1e6bf9fcc573f524966e68d1d8cec7db9778be3c93fb58a006fee9008e531c53a2c5391af6c0c3
6
+ metadata.gz: 02fe7a0d34f61c54a3788739cc5455dc685cad7b627a9091f2b6e5b3ed0323bf5d162cff53150e89a431c6ab42e76e014b1350c6b47f4aeed66ef727ffe76554
7
+ data.tar.gz: 117d50507b9661c1d70b89df658a97649ea2562604c9dd0764f160e9294586293edbf10e9d968da0107a2f072b0e1814d105c364f91ed2c22cca509fb48f2fd6
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.2.1'
5
+ s.version = '0.2.2'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
@@ -6,4 +6,5 @@ module CSVUtils
6
6
  autoload :CSVOptions, 'csv_utils/csv_options'
7
7
  autoload :CSVReport, 'csv_utils/csv_report'
8
8
  autoload :CSVRow, 'csv_utils/csv_row'
9
+ autoload :CSVSort, 'csv_utils/csv_sort'
9
10
  end
@@ -0,0 +1,112 @@
1
+ require 'fileutils'
2
+
3
module CSVUtils
  # Sorts the rows of a CSV file with an on-disk (external) merge sort, so the
  # whole file never has to be held in memory at once.
  #
  # The source file is read in batches of +batch_size+ rows. Each batch is
  # sorted in memory and written to "<new_csv_file>.part.<n>". The part files
  # are then merged two at a time into "<new_csv_file>.merge.<n>" files until a
  # single file is left, which is moved to +new_csv_file+.
  #
  # @example Sort by the numeric value of the first column
  #   CSVUtils::CSVSort.new('in.csv', 'out.csv').sort do |a, b|
  #     a[0].to_i <=> b[0].to_i
  #   end
  class CSVSort
    attr_reader :csv_file,
                :new_csv_file,
                :has_headers,
                :csv_options,
                :headers

    # @param csv_file [String] path of the CSV file to read
    # @param new_csv_file [String] path the sorted CSV file is written to; also
    #   used as the prefix for the temporary part/merge files
    # @param has_headers [Boolean] when true the first row is treated as a
    #   header row: it is kept out of the sort and written first in the output
    # @param csv_options [Hash] options forwarded as keyword arguments to every
    #   CSV.open call (reads and writes alike)
    def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
      @csv_file = csv_file
      @new_csv_file = new_csv_file
      @has_headers = has_headers
      @csv_options = csv_options
      @csv_part_files = []
      @files_to_delete = []
    end

    # Sorts +csv_file+ into +new_csv_file+.
    #
    # @param batch_size [Integer] maximum number of rows sorted in memory at once
    # @yield [row_a, row_b] compares two rows as returned by CSV#shift
    # @yieldreturn [Integer] negative, zero or positive, like +<=>+
    # @return [void]
    def sort(batch_size = 100_000, &block)
      # The merge step needs a callable comparator; without this default a
      # block-less call would sort each batch but fail while merging.
      block ||= ->(row_a, row_b) { row_a <=> row_b }

      # Start from a clean slate so the instance can be reused.
      @csv_part_files = []

      create_sorted_csv_part_files(batch_size, &block)
      merge_csv_part_files(&block)
    ensure
      # After a successful run the remaining entry has already been moved and
      # no longer exists; after a failure this removes whatever was left behind.
      FileUtils.rm_f(@csv_part_files)
    end

    private

    # Merges two already-sorted CSV files into +dest_csv_file+, using
    # +compare+ to decide which source supplies the next row.
    def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file, &compare)
      CSV.open(src_csv_file1, 'rb', **csv_options) do |src1|
        CSV.open(src_csv_file2, 'rb', **csv_options) do |src2|
          CSV.open(dest_csv_file, 'wb', **csv_options) do |dest|
            if @headers
              dest << @headers
              # Every part file starts with its own copy of the header row.
              src1.shift
              src2.shift
            end

            row1 = src1.shift
            row2 = src2.shift

            while row1 || row2
              # Take from src1 when src2 is exhausted, or when both have a row
              # and row1 does not sort after row2.
              if row2.nil? || (row1 && compare.call(row1, row2) <= 0)
                dest << row1
                row1 = src1.shift
              else
                dest << row2
                row2 = src2.shift
              end
            end
          end
        end
      end
    end

    # Reads +csv_file+ in batches of +batch_size+ rows and writes each batch,
    # sorted, to its own part file. Also captures the header row, if any.
    def create_sorted_csv_part_files(batch_size, &block)
      CSV.open(csv_file, 'rb', **csv_options) do |src|
        @headers = has_headers ? src.shift : nil

        batch = []
        while (row = src.shift)
          batch << row
          next if batch.size < batch_size

          write_sorted_csv_part_file(batch, &block)
          batch = []
        end

        # Flush the final, partially filled batch.
        write_sorted_csv_part_file(batch, &block) unless batch.empty?
      end
    end

    # Sorts +batch+ in place and writes it (preceded by the header row, if
    # any) to the next part file, recording that file's path.
    def write_sorted_csv_part_file(batch, &block)
      batch.sort!(&block)
      @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
      CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
        batch.each { |row| csv << row }
      end
    end

    # Merges the part files pairwise until one remains, then moves it to
    # +new_csv_file+.
    def merge_csv_part_files(&block)
      file_merge_cnt = 0

      while @csv_part_files.size > 1
        file_merge_cnt += 1

        csv_part_file1, csv_part_file2 = @csv_part_files.shift(2)
        @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"

        begin
          merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)
        ensure
          # The two inputs are no longer tracked in @csv_part_files, so they
          # must be removed here whether or not the merge succeeded.
          FileUtils.rm_f([csv_part_file1, csv_part_file2])
        end
      end

      if @csv_part_files.empty?
        # The source had no data rows, so no part file was produced; emit just
        # the header row (or an empty file) instead of moving a missing file.
        CSV.open(new_csv_file, 'wb', **csv_options) do |csv|
          csv << @headers if @headers
        end
      else
        FileUtils.mv(@csv_part_files.last, new_csv_file)
      end
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-04 00:00:00.000000000 Z
11
+ date: 2020-07-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -49,6 +49,7 @@ files:
49
49
  - lib/csv_utils/csv_options.rb
50
50
  - lib/csv_utils/csv_report.rb
51
51
  - lib/csv_utils/csv_row.rb
52
+ - lib/csv_utils/csv_sort.rb
52
53
  - script/console
53
54
  homepage: https://github.com/dougyouch/csv-utils
54
55
  licenses: