csv-utils 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/csv-utils.gemspec +1 -1
- data/lib/csv-utils.rb +1 -0
- data/lib/csv_utils/csv_sort.rb +112 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f85c431ad42ed20382fbe91c3696153be9437a8ee755ede313be2d6f488b3770
|
4
|
+
data.tar.gz: b620cfb208a7a28573160103155564875b05990f965f9206ad89bd7cb6b5fcc7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02fe7a0d34f61c54a3788739cc5455dc685cad7b627a9091f2b6e5b3ed0323bf5d162cff53150e89a431c6ab42e76e014b1350c6b47f4aeed66ef727ffe76554
|
7
|
+
data.tar.gz: 117d50507b9661c1d70b89df658a97649ea2562604c9dd0764f160e9294586293edbf10e9d968da0107a2f072b0e1814d105c364f91ed2c22cca509fb48f2fd6
|
data/csv-utils.gemspec
CHANGED
data/lib/csv-utils.rb
CHANGED
data/lib/csv_utils/csv_sort.rb
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
# Utility class for sorting the rows of a csv file.
#
# The sort is done on disk so the whole file never has to be held in memory:
# rows are read in batches, each batch is sorted in memory and written to a
# "<new_csv_file>.part.N" file, and the part files are then merged pairwise
# into "<new_csv_file>.merge.N" files until a single sorted file remains.
# That file is finally moved to new_csv_file.
#
# Example:
#   sorter = CSVUtils::CSVSort.new('in.csv', 'sorted.csv')
#   sorter.sort { |a, b| a[0] <=> b[0] }
class CSVUtils::CSVSort
  attr_reader :csv_file,
              :new_csv_file,
              :has_headers,
              :csv_options,
              :headers

  # @param csv_file [String] path of the csv file to read
  # @param new_csv_file [String] path the sorted csv file is written to; also
  #   used as the prefix of the temporary part/merge files
  # @param has_headers [Boolean] when true the first row is treated as a
  #   header row: it is kept out of the sort and written first in the output
  # @param csv_options [Hash] options forwarded to every CSV.open call
  def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
    @csv_file = csv_file
    @new_csv_file = new_csv_file
    @has_headers = has_headers
    @csv_options = csv_options
    @csv_part_files = []
  end

  # Sorts csv_file into new_csv_file.
  #
  # @param batch_size [Integer] maximum number of rows sorted in memory at once
  # @yield [row1, row2] comparator returning a negative number, zero or a
  #   positive number, like <=>; defaults to comparing the rows with <=>
  # @return [String] new_csv_file
  def sort(batch_size = 100_000, &block)
    block ||= ->(row1, row2) { row1 <=> row2 }

    # Start from a clean slate so the instance can be reused; a previous run
    # must not leave stale part file names behind.
    @csv_part_files = []

    create_sorted_csv_part_files(batch_size, &block)
    merge_csv_part_files(&block)

    new_csv_file
  end

  private

  # Merges two already sorted csv files into dest_csv_file, using the given
  # block as the comparator. When both rows compare equal the row from
  # src_csv_file1 is written first.
  #
  # Options are passed with ** because Ruby 3 no longer converts a trailing
  # positional Hash into keyword arguments. The block form of CSV.open
  # guarantees the files are closed even if the comparator raises.
  def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
    CSV.open(src_csv_file1, 'rb', **csv_options) do |src1|
      CSV.open(src_csv_file2, 'rb', **csv_options) do |src2|
        CSV.open(dest_csv_file, 'wb', **csv_options) do |dest|
          if @headers
            dest << @headers
            # every part file starts with the header row, skip it
            src1.shift
            src2.shift
          end

          row1 = src1.shift
          row2 = src2.shift

          while row1 || row2
            if row2.nil? || (row1 && yield(row1, row2) <= 0)
              dest << row1
              row1 = src1.shift
            else
              dest << row2
              row2 = src2.shift
            end
          end
        end
      end
    end
  end

  # Reads csv_file in batches of at most batch_size rows and writes each
  # batch, sorted with the given block, to its own part file.
  def create_sorted_csv_part_files(batch_size, &block)
    CSV.open(csv_file, 'rb', **csv_options) do |src|
      @headers = src.shift if has_headers

      batch = []
      while (row = src.shift)
        batch << row
        next if batch.size < batch_size

        write_sorted_csv_part_file(batch, &block)
        batch = []
      end

      # remaining rows that did not fill a whole batch
      write_sorted_csv_part_file(batch, &block) unless batch.empty?
    end
  end

  # Sorts batch in place and writes it (preceded by the header row, if any)
  # to the next "<new_csv_file>.part.N" file.
  def write_sorted_csv_part_file(batch, &block)
    batch.sort!(&block)

    part_file = "#{new_csv_file}.part.#{@csv_part_files.size}"
    @csv_part_files << part_file

    CSV.open(part_file, 'wb', **csv_options) do |csv|
      csv << @headers if @headers
      batch.each { |row| csv << row }
    end
  end

  # Merges the part files two at a time until one file is left, then moves it
  # to new_csv_file. Merged files are appended to the end of the queue.
  def merge_csv_part_files(&block)
    # No data rows were read, so there is nothing to merge or move; still
    # produce the output file (with the header row when there is one).
    if @csv_part_files.empty?
      CSV.open(new_csv_file, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
      end
      return
    end

    file_merge_cnt = 0

    while @csv_part_files.size > 1
      file_merge_cnt += 1

      csv_part_file1 = @csv_part_files.shift
      csv_part_file2 = @csv_part_files.shift
      merged_file = "#{new_csv_file}.merge.#{file_merge_cnt}"

      merge_sort_csv_files(csv_part_file1, csv_part_file2, merged_file, &block)
      @csv_part_files << merged_file

      File.unlink(csv_part_file1)
      File.unlink(csv_part_file2)
    end

    FileUtils.mv(@csv_part_files.shift, new_csv_file)
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Doug Youch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-07-
|
11
|
+
date: 2020-07-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: inheritance-helper
|
@@ -49,6 +49,7 @@ files:
|
|
49
49
|
- lib/csv_utils/csv_options.rb
|
50
50
|
- lib/csv_utils/csv_report.rb
|
51
51
|
- lib/csv_utils/csv_row.rb
|
52
|
+
- lib/csv_utils/csv_sort.rb
|
52
53
|
- script/console
|
53
54
|
homepage: https://github.com/dougyouch/csv-utils
|
54
55
|
licenses:
|