csv-utils 0.1.6 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b6af40dc417430366452f9b5ddc1670595fb16ca17ab8452b7842faaa0726343
4
- data.tar.gz: 11db2a2e86c54f4f9593113cd2f1ddd9b712213beb7da49b0660501d63c148a5
3
+ metadata.gz: 8b1634cd26b21129c05cf1c5655e46ff3b81307135ce8dd686022cd8a4ff5adc
4
+ data.tar.gz: 53312f6841e32ddcae0e97cd9aab4513a47580c35a3ec39964f8933fdbbcff10
5
5
  SHA512:
6
- metadata.gz: d97f0d8b564b3819713250b52f83a2b538862654423c9d932c77a0d188199949a31ed8acaf3fd0db7c1bcc00a57c1cdde2997553c2cd062429de691354472611
7
- data.tar.gz: 1c486515fba9b72c9c37a4952f86f872947518fb82a2573e5868c57520085516f202acf146f2f276fe4fe88b20f49c304d391b30f9bdb96d8c82982d38109b03
6
+ metadata.gz: 5733c0ff8b730e957c46fc73b1080690dd79fb481b30f9591f4256507abc3ea46139d9758605c4c4e143e23781468b9adbd623812ceb105bb610a61943ffbc57
7
+ data.tar.gz: 4ce9256c44ed50fc289fbe2b0ce970b2798310d50b4f69dffe9fa57098b0acab0539049bd5c945b43768ea7cb3d97b2421e54bf4c069df7751bd3704b68db9ec
@@ -0,0 +1 @@
1
+ csv-utils
@@ -0,0 +1 @@
1
+ 2.6.3
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
# frozen_string_literal: true

# Fetch gems over HTTPS; a plain http:// source lets gem downloads be
# tampered with in transit.
source 'https://rubygems.org'

# Runtime dependency (also declared in the gemspec).
gem 'inheritance-helper'

group :development do
  gem 'rake'
  gem 'rubocop'
end

group :spec do
  gem 'rspec'
  gem 'simplecov'
end
@@ -0,0 +1,57 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ ast (2.4.1)
5
+ diff-lcs (1.3)
6
+ docile (1.3.2)
7
+ inheritance-helper (0.1.5)
8
+ parallel (1.19.2)
9
+ parser (2.7.1.4)
10
+ ast (~> 2.4.1)
11
+ rainbow (3.0.0)
12
+ rake (13.0.1)
13
+ regexp_parser (1.7.1)
14
+ rexml (3.2.4)
15
+ rspec (3.9.0)
16
+ rspec-core (~> 3.9.0)
17
+ rspec-expectations (~> 3.9.0)
18
+ rspec-mocks (~> 3.9.0)
19
+ rspec-core (3.9.2)
20
+ rspec-support (~> 3.9.3)
21
+ rspec-expectations (3.9.2)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.9.0)
24
+ rspec-mocks (3.9.1)
25
+ diff-lcs (>= 1.2.0, < 2.0)
26
+ rspec-support (~> 3.9.0)
27
+ rspec-support (3.9.3)
28
+ rubocop (0.86.0)
29
+ parallel (~> 1.10)
30
+ parser (>= 2.7.0.1)
31
+ rainbow (>= 2.2.2, < 4.0)
32
+ regexp_parser (>= 1.7)
33
+ rexml
34
+ rubocop-ast (>= 0.0.3, < 1.0)
35
+ ruby-progressbar (~> 1.7)
36
+ unicode-display_width (>= 1.4.0, < 2.0)
37
+ rubocop-ast (0.0.3)
38
+ parser (>= 2.7.0.1)
39
+ ruby-progressbar (1.10.1)
40
+ simplecov (0.18.5)
41
+ docile (~> 1.1)
42
+ simplecov-html (~> 0.11)
43
+ simplecov-html (0.12.2)
44
+ unicode-display_width (1.7.0)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ inheritance-helper
51
+ rake
52
+ rspec
53
+ rubocop
54
+ simplecov
55
+
56
+ BUNDLED WITH
57
+ 1.17.3
@@ -0,0 +1,54 @@
1
#!/usr/bin/env ruby

# Rewrites a CSV file so every row ends with a caller-supplied end-of-line
# byte sequence (given in hex). The result is written next to the input as
# "<name>.escaped-eol.csv" (or "<name>.escaped-eol" when the input does not
# end in .csv) and the new file name is printed on stdout.

require 'csv'

# Wraps +str+ in the ANSI bold (ESC[1m) and reset (ESC[0m) escape sequences.
def bold_string(str)
  "\033[1m#{str}\033[0m"
end

USAGE = "Usage: #{bold_string('csv-change-eol')} <csv_file> <end of line character sequence in hex>"

# Prints +msg+ followed by the usage/help text to stderr and exits with
# status 1.
def exit_on_error(msg)
  $stderr.print <<STR
Error: #{bold_string(msg)}

#{USAGE}

End of line example: '7C5E7C0A' is '|^|\\n'
- 0A is new line
- 0D is carriage return

Goto: #{bold_string('http://www.asciitable.com/')} for help with the character sequence

STR
  exit 1
end

csv_file = ARGV.shift || exit_on_error('no csv file specified')
eol_sequence = ARGV.shift || exit_on_error('no EOL character sequence specified')

exit_on_error("file #{csv_file} not found") unless File.exist?(csv_file)
exit_on_error("not a HEX sequence (#{eol_sequence})") unless eol_sequence.match?(/\A[0-9a-f]+\z/i)
exit_on_error("incorrect number of characters in (#{eol_sequence}), should be even") unless eol_sequence.size.even?

# Two hex digits per byte -> raw (binary) byte string.
eol_sequence = [eol_sequence].pack('H*')

escaped_csv_file =
  if csv_file =~ /\.csv$/i
    csv_file.sub(/(\.csv)$/i, '.escaped-eol\1')
  else
    csv_file + '.escaped-eol'
  end

File.open(escaped_csv_file, 'wb') do |out|
  CSV.foreach(csv_file) do |row|
    # Remove only the "\n" terminator that to_csv appends. A blanket rstrip
    # would also eat genuine trailing whitespace of an unquoted last field.
    line = row.to_csv(row_sep: "\n").chomp("\n")

    # Write the row and the binary terminator separately so a UTF-8 row is
    # never concatenated with an incompatible binary string.
    out << line << eol_sequence
  end
end

puts escaped_csv_file
@@ -2,7 +2,8 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.1.6'
5
+ s.version = '0.2.3'
6
+ s.licenses = ['MIT']
6
7
  s.summary = 'CSV Utils'
7
8
  s.description = 'Tools for debugging malformed CSV files'
8
9
  s.authors = ['Doug Youch']
@@ -11,4 +12,6 @@ Gem::Specification.new do |s|
11
12
  s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
12
13
  s.bindir = 'bin'
13
14
  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
15
+
16
+ s.add_runtime_dependency 'inheritance-helper'
14
17
  end
@@ -0,0 +1,10 @@
1
+ require 'csv'
2
+
3
# Namespace for a collection of tools for working with CSV files.
# Every component is registered with autoload, so its file is only required
# the first time the constant is referenced.
module CSVUtils
  {
    CSVExtender: 'csv_utils/csv_extender',
    CSVOptions: 'csv_utils/csv_options',
    CSVReport: 'csv_utils/csv_report',
    CSVRow: 'csv_utils/csv_row',
    CSVSort: 'csv_utils/csv_sort'
  }.each do |const_name, feature|
    autoload const_name, feature
  end
end
@@ -0,0 +1,76 @@
1
module CSVUtils
  # Utility class for appending columns to a csv file.
  #
  # Reads rows from +csv_file+, asks the caller's block for the extra
  # column values and writes the widened rows to +new_csv_file+.
  class CSVExtender
    attr_reader :csv_file,
                :new_csv_file,
                :csv_options

    # @param csv_file [String] path of the csv file to read
    # @param new_csv_file [String] path of the csv file to write
    # @param csv_options [Hash] options handed to CSV.open for both files
    def initialize(csv_file, new_csv_file, csv_options = {})
      @csv_file = csv_file
      @new_csv_file = new_csv_file
      @csv_options = csv_options
    end

    # Appends columns one row at a time.
    #
    # @param additional_headers [Array, nil] header cells to add; when nil the
    #   first row is treated as data and no header row is written
    # @yield [row, current_headers] must return the Array of extra cells
    def append(additional_headers)
      process(additional_headers) do |current_headers|
        while (row = src.shift)
          additional_columns = yield row, current_headers
          dest << (row + additional_columns)
        end
      end
    end

    # Appends columns in batches of up to +batch_size+ rows.
    #
    # @param additional_headers [Array, nil] see #append
    # @param batch_size [Integer] maximum number of rows per yielded batch
    # @yield [batch, current_headers] must return one Array of extra cells
    #   per row of +batch+, in the same order
    def append_in_batches(additional_headers, batch_size = 1_000)
      process(additional_headers) do |current_headers|
        batch = []

        flush_batch = lambda do
          additional_rows = yield batch, current_headers
          batch.each_with_index { |row, idx| dest << (row + additional_rows[idx]) }
          batch = []
        end

        while (row = src.shift)
          batch << row
          flush_batch.call if batch.size >= batch_size
        end

        flush_batch.call unless batch.empty?
      end
    end

    # Copies the header row (when +additional_headers+ is given), yields the
    # original headers and always closes both files afterwards, even when
    # the block raises.
    def process(additional_headers)
      begin
        # Open both files up front so the destination exists even when no
        # row is ever written.
        src
        dest
        yield append_headers(additional_headers)
      ensure
        close
      end
      nil
    end

    # Lazily opened reader for +csv_file+.
    def src
      @src ||= CSV.open(csv_file, 'rb', **csv_options)
    end

    # Lazily opened writer for +new_csv_file+.
    def dest
      @dest ||= CSV.open(new_csv_file, 'wb', **csv_options)
    end

    # Closes whichever of the two files has been opened.
    def close
      @src&.close
    ensure
      @dest&.close
    end

    private

    # Writes the widened header row and returns the original headers.
    # Returns nil when no headers were requested or the source is empty.
    def append_headers(additional_headers)
      return nil unless additional_headers

      current_headers = src.shift
      return nil unless current_headers

      dest << (current_headers + additional_headers)
      current_headers
    end
  end
end
@@ -0,0 +1,87 @@
1
# Auto detect a csv file's options
module CSVUtils
  # Inspects the first line of a csv file (or IO) and guesses its byte order
  # mark, character encoding, column separator, row separator and number of
  # columns.
  class CSVOptions
    # This list is from https://en.wikipedia.org/wiki/Byte_order_mark
    # Keys are binary strings. The 4-byte marks come first because the
    # UTF-32 LE mark begins with the UTF-16 LE mark and lookup returns the
    # first matching entry.
    BYTE_ORDER_MARKS = {
      "\x00\x00\xFE\xFF".b => 'UTF-32',
      "\xFF\xFE\x00\x00".b => 'UTF-32',
      "\xEF\xBB\xBF".b => 'UTF-8',
      "\xFE\xFF".b => 'UTF-16',
      "\xFF\xFE".b => 'UTF-16'
    }.freeze

    # Candidate column separators, checked in this order.
    COL_SEPARATORS = [
      "\x02",
      "\t",
      '|',
      ','
    ].freeze

    # Candidate row separators, checked in this order ("\r\n" before its
    # single-character parts).
    ROW_SEPARATORS = [
      "\r\n",
      "\n",
      "\r"
    ].freeze

    attr_reader :columns,
                :byte_order_mark,
                :encoding,
                :col_separator,
                :row_separator

    # @param io [String, #readline] a file path (opened in binary mode) or an
    #   object responding to readline; only its first line is read
    def initialize(io)
      line =
        if io.is_a?(String)
          File.open(io, 'rb', &:readline)
        else
          io.readline
        end

      @col_separator = auto_detect_col_sep(line)
      @row_separator = auto_detect_row_sep(line)
      @byte_order_mark = get_byte_order_mark(line)
      @encoding = get_character_encoding(@byte_order_mark)
      @columns = get_number_of_columns(line) if @col_separator
    end

    # True when both a column and a row separator were detected.
    def valid?
      !@col_separator.nil? && !@row_separator.nil?
    end

    # Returns the first known column separator present in +line+, or nil.
    def auto_detect_col_sep(line)
      COL_SEPARATORS.find { |sep| line.include?(sep) }
    end

    # Returns the first known row separator present in +line+, or nil.
    def auto_detect_row_sep(line)
      ROW_SEPARATORS.find { |sep| line.include?(sep) }
    end

    # Splits +line+ on the detected column separator and removes the byte
    # order mark from the first cell. Quoting is not interpreted.
    def get_headers(line)
      headers = line.split(col_separator)
      headers[0] = strip_byte_order_marks(headers[0]) unless headers.empty?
      headers
    end

    def get_number_of_columns(line)
      get_headers(line).size
    end

    # Returns the byte order mark (binary String) that +line+ starts with,
    # or nil. Compares raw bytes so the encoding tag of +line+ is irrelevant.
    def get_byte_order_mark(line)
      bytes = line.b
      BYTE_ORDER_MARKS.keys.find { |bom| bytes.start_with?(bom) }
    end

    # Encoding name for +bom+; defaults to 'UTF-8' when there is no mark.
    def get_character_encoding(bom)
      BYTE_ORDER_MARKS[bom] || 'UTF-8'
    end

    # Drops the detected byte order mark from the front of +header+.
    # Works on bytes, so it is safe for both binary and UTF-8 strings.
    def strip_byte_order_marks(header)
      return header unless header && @byte_order_mark
      return header unless header.b.start_with?(@byte_order_mark)

      header.byteslice(@byte_order_mark.bytesize..-1)
    end
  end
end
@@ -0,0 +1,41 @@
1
# Builds a csv file from csv rows
module CSVUtils
  # Thin writer around a CSV object (or anything responding to <<).
  # Rows may be Arrays or objects responding to to_a; headers may be an
  # Array or an object responding to csv_headers.
  class CSVReport
    attr_reader :csv,
                :must_close

    # @param csv [String, #<<] a file path to open, or an already open
    #   destination that the caller keeps ownership of
    # @param headers [Array, #csv_headers, nil] optional header row
    # @param csv_options [Hash] options for CSV.open; the :mode key selects
    #   the file mode (default 'wb'). Only used when +csv+ is a path.
    # @yield [report] when a block is given, #generate is run with it
    def initialize(csv, headers = nil, csv_options = {}, &block)
      if csv.is_a?(String)
        # Work on a copy so the caller's options hash is left untouched.
        options = csv_options.dup
        mode = options.delete(:mode) || 'wb'
        @must_close = true
        @csv = CSV.open(csv, mode, **options)
      else
        @must_close = false
        @csv = csv
      end

      generate(headers, &block) if block
    end

    # Writes the optional header row, yields self so rows can be appended
    # and closes the file if this object opened it, even when the block
    # raises.
    def generate(headers = nil)
      begin
        add_headers(headers) if headers
        yield self
      ensure
        @csv.close if @must_close
      end
      nil
    end

    # Appends one row; non-Array rows are converted with to_a.
    def append(csv_row)
      @csv << (csv_row.is_a?(Array) ? csv_row : csv_row.to_a)
    end
    alias << append

    # Appends a header row taken from an Array or from csv_row.csv_headers.
    def add_headers(csv_row)
      append(csv_row.is_a?(Array) ? csv_row : csv_row.csv_headers)
    end
  end
end
@@ -0,0 +1,57 @@
1
+ require 'inheritance-helper'
2
+
3
module CSVUtils
  # Mixin that lets a class declare csv columns and turn its instances into
  # csv rows. Including it extends the class with InheritanceHelper::Methods
  # and with ClassMethods below.
  module CSVRow
    def self.included(base)
      base.extend(InheritanceHelper::Methods)
      base.extend(ClassMethods)
    end

    # Class-level column declaration DSL.
    module ClassMethods
      # Base column table: empty. Declared columns are registered through
      # add_value_to_class_method (from InheritanceHelper — presumably it
      # redefines this method with the merged hash; verify against that gem).
      def csv_columns
        {}
      end

      # Declares a column.
      #
      # The value comes from the block if one is given, else from
      # options[:proc], else from calling options[:method], which defaults
      # to +header+. options[:header] defaults to header.to_s.
      def csv_column(header, options = {}, &block)
        options[:header] = header.to_s unless options[:header]

        options[:proc] = block if block
        options[:method] ||= header if options[:proc].nil?

        add_value_to_class_method(:csv_columns, header => options)
      end

      # Header cells of all declared columns.
      def csv_headers
        csv_columns.values.map do |column_options|
          csv_column_header(column_options)
        end
      end

      private

      def csv_column_header(column_options)
        column_options[:header]
      end
    end

    # Values of all declared columns for this instance.
    def csv_row
      self.class.csv_columns.values.map do |column_options|
        csv_column_value(column_options)
      end
    end
    alias_method :to_a, :csv_row

    def csv_headers
      self.class.csv_headers
    end

    private

    # Evaluates the column's proc in the context of this instance when one
    # is configured; otherwise sends the configured method name to self.
    def csv_column_value(column_options)
      value_proc = column_options[:proc]
      return instance_eval(&value_proc) if value_proc

      send(column_options[:method])
    end
  end
end
@@ -0,0 +1,112 @@
1
+ require 'fileutils'
2
+
3
module CSVUtils
  # Utility class for sorting the rows of a csv file.
  #
  # The source is split into sorted part files of at most +batch_size+ rows
  # ("<new_csv_file>.part.N"), which are then merged pairwise
  # ("<new_csv_file>.merge.N") until a single file remains; that file is
  # moved to +new_csv_file+.
  class CSVSort
    attr_reader :csv_file,
                :new_csv_file,
                :has_headers,
                :csv_options,
                :headers

    # @param csv_file [String] path of the csv file to sort
    # @param new_csv_file [String] path of the sorted csv file to create
    # @param has_headers [Boolean] whether the first row is a header row
    # @param csv_options [Hash] options handed to every CSV.open call
    def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
      @csv_file = csv_file
      @new_csv_file = new_csv_file
      @has_headers = has_headers
      @csv_options = csv_options
      @csv_part_files = []
    end

    # Sorts +csv_file+ into +new_csv_file+.
    #
    # @param batch_size [Integer] maximum number of rows sorted in memory
    # @yield [row1, row2] comparator returning -1, 0 or 1; defaults to
    #   comparing the rows with <=>
    def sort(batch_size = 100_000, &block)
      block ||= ->(row1, row2) { row1 <=> row2 }
      @csv_part_files = []

      create_sorted_csv_part_files(batch_size, &block)
      merge_csv_part_files(&block)
    ensure
      # Only non-empty when an error interrupted the sort.
      remove_csv_part_files
    end

    private

    # Merges two sorted csv files into +dest_csv_file+, using the block as
    # comparator. Ties take the row from the first file.
    def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
      CSV.open(src_csv_file1, 'rb', **csv_options) do |src1|
        CSV.open(src_csv_file2, 'rb', **csv_options) do |src2|
          CSV.open(dest_csv_file, 'wb', **csv_options) do |dest|
            if @headers
              dest << @headers
              # every part file repeats the header row; skip it
              src1.shift
              src2.shift
            end

            row1 = src1.shift
            row2 = src2.shift

            while row1 || row2
              if row2.nil? || (row1 && yield(row1, row2) <= 0)
                dest << row1
                row1 = src1.shift
              else
                dest << row2
                row2 = src2.shift
              end
            end
          end
        end
      end
    end

    # Reads the source in batches and writes each batch, sorted, to its own
    # part file.
    def create_sorted_csv_part_files(batch_size, &block)
      CSV.open(csv_file, 'rb', **csv_options) do |src|
        @headers = src.shift if has_headers

        batch = []
        while (row = src.shift)
          batch << row
          next if batch.size < batch_size

          write_csv_part_file(batch.sort(&block))
          batch = []
        end

        write_csv_part_file(batch.sort(&block)) unless batch.empty?
      end
    end

    # Writes +rows+ (preceded by the headers, if any) to the next part file
    # and registers that file for merging.
    def write_csv_part_file(rows)
      part_file = "#{new_csv_file}.part.#{@csv_part_files.size}"
      @csv_part_files << part_file

      CSV.open(part_file, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
        rows.each { |row| csv << row }
      end
    end

    # Merges the part files two at a time until one is left, then moves it
    # to +new_csv_file+.
    def merge_csv_part_files(&block)
      # A source without data rows produced no part file; emit one holding
      # just the headers (or nothing) so there is always a file to move.
      write_csv_part_file([]) if @csv_part_files.empty?

      file_merge_cnt = 0

      while @csv_part_files.size > 1
        file_merge_cnt += 1

        csv_part_file1, csv_part_file2 = @csv_part_files.first(2)
        merged_file = "#{new_csv_file}.merge.#{file_merge_cnt}"
        # Registered before merging so a failed merge is cleaned up too.
        @csv_part_files << merged_file

        merge_sort_csv_files(csv_part_file1, csv_part_file2, merged_file, &block)

        @csv_part_files.shift(2).each { |file| File.unlink(file) }
      end

      FileUtils.mv(@csv_part_files.shift, new_csv_file)
    end

    # Deletes any temporary files that are still registered.
    def remove_csv_part_files
      @csv_part_files.each { |file| File.unlink(file) if File.exist?(file) }
      @csv_part_files.clear
    end
  end
end
@@ -0,0 +1,7 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# Interactive console: appends the sibling lib directory to the load path,
# loads csv-utils and starts an IRB session.
lib_dir = File.expand_path('../lib', __dir__)
$LOAD_PATH << lib_dir

require 'csv-utils'
require 'irb'

IRB.start(__FILE__)
metadata CHANGED
@@ -1,33 +1,61 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-07-30 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2020-07-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: inheritance-helper
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  description: Tools for debugging malformed CSV files
14
28
  email: dougyouch@gmail.com
15
29
  executables:
30
+ - csv-change-eol
16
31
  - csv-find-error
17
32
  - csv-readline
18
33
  extensions: []
19
34
  extra_rdoc_files: []
20
35
  files:
21
36
  - ".gitignore"
37
+ - ".ruby-gemset"
38
+ - ".ruby-version"
39
+ - Gemfile
40
+ - Gemfile.lock
22
41
  - LICENSE
23
42
  - README.md
43
+ - bin/csv-change-eol
24
44
  - bin/csv-find-error
25
45
  - bin/csv-readline
26
46
  - csv-utils.gemspec
47
+ - lib/csv-utils.rb
48
+ - lib/csv_utils/csv_extender.rb
49
+ - lib/csv_utils/csv_options.rb
50
+ - lib/csv_utils/csv_report.rb
51
+ - lib/csv_utils/csv_row.rb
52
+ - lib/csv_utils/csv_sort.rb
53
+ - script/console
27
54
  homepage: https://github.com/dougyouch/csv-utils
28
- licenses: []
55
+ licenses:
56
+ - MIT
29
57
  metadata: {}
30
- post_install_message:
58
+ post_install_message:
31
59
  rdoc_options: []
32
60
  require_paths:
33
61
  - lib
@@ -42,8 +70,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
42
70
  - !ruby/object:Gem::Version
43
71
  version: '0'
44
72
  requirements: []
45
- rubygems_version: 3.0.3
46
- signing_key:
73
+ rubygems_version: 3.0.8
74
+ signing_key:
47
75
  specification_version: 4
48
76
  summary: CSV Utils
49
77
  test_files: []