csv-utils 0.1.5 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 48b1bf2dfbd79b8035b19d96af6a0fb06d55c8b6abdfeaeb7f7c3f561985aed7
4
- data.tar.gz: 63257717b646afe688126a8323a1d839ee52cc9332837c53d4bc003c1d9c82d5
3
+ metadata.gz: f85c431ad42ed20382fbe91c3696153be9437a8ee755ede313be2d6f488b3770
4
+ data.tar.gz: b620cfb208a7a28573160103155564875b05990f965f9206ad89bd7cb6b5fcc7
5
5
  SHA512:
6
- metadata.gz: 2d1e351a8b331c68b28b8cc392b8607412cef3e7d529a959b5175afce720054434b3aa3d70d18e1a30dd0765e1afa0c48813fb495f183d963e51152010f1a7f9
7
- data.tar.gz: b47d36f27c3657197eebc71145302edd4fce1a813b011fae3b4442a093e1eacc2bcaa032d021d542bb7eaed8f0c3ef37f4d373b9627161aae82dd48f0b7e2225
6
+ metadata.gz: 02fe7a0d34f61c54a3788739cc5455dc685cad7b627a9091f2b6e5b3ed0323bf5d162cff53150e89a431c6ab42e76e014b1350c6b47f4aeed66ef727ffe76554
7
+ data.tar.gz: 117d50507b9661c1d70b89df658a97649ea2562604c9dd0764f160e9294586293edbf10e9d968da0107a2f072b0e1814d105c364f91ed2c22cca509fb48f2fd6
@@ -0,0 +1 @@
1
+ csv-utils
@@ -0,0 +1 @@
1
+ 2.6.3
data/Gemfile ADDED
@@ -0,0 +1,15 @@
1
# frozen_string_literal: true

# Development/test dependency manifest for the csv-utils gem.
#
# Gems are fetched over HTTPS: a plain http:// source lets anyone on the
# network path tamper with downloaded gem code.
source 'https://rubygems.org'

# Required at runtime by lib/csv_utils/csv_row.rb.
gem 'inheritance-helper'

group :development do
  gem 'rake'
  gem 'rubocop'
end

group :spec do
  gem 'rspec'
  gem 'simplecov'
end
@@ -0,0 +1,57 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ ast (2.4.1)
5
+ diff-lcs (1.3)
6
+ docile (1.3.2)
7
+ inheritance-helper (0.1.5)
8
+ parallel (1.19.2)
9
+ parser (2.7.1.4)
10
+ ast (~> 2.4.1)
11
+ rainbow (3.0.0)
12
+ rake (13.0.1)
13
+ regexp_parser (1.7.1)
14
+ rexml (3.2.4)
15
+ rspec (3.9.0)
16
+ rspec-core (~> 3.9.0)
17
+ rspec-expectations (~> 3.9.0)
18
+ rspec-mocks (~> 3.9.0)
19
+ rspec-core (3.9.2)
20
+ rspec-support (~> 3.9.3)
21
+ rspec-expectations (3.9.2)
22
+ diff-lcs (>= 1.2.0, < 2.0)
23
+ rspec-support (~> 3.9.0)
24
+ rspec-mocks (3.9.1)
25
+ diff-lcs (>= 1.2.0, < 2.0)
26
+ rspec-support (~> 3.9.0)
27
+ rspec-support (3.9.3)
28
+ rubocop (0.86.0)
29
+ parallel (~> 1.10)
30
+ parser (>= 2.7.0.1)
31
+ rainbow (>= 2.2.2, < 4.0)
32
+ regexp_parser (>= 1.7)
33
+ rexml
34
+ rubocop-ast (>= 0.0.3, < 1.0)
35
+ ruby-progressbar (~> 1.7)
36
+ unicode-display_width (>= 1.4.0, < 2.0)
37
+ rubocop-ast (0.0.3)
38
+ parser (>= 2.7.0.1)
39
+ ruby-progressbar (1.10.1)
40
+ simplecov (0.18.5)
41
+ docile (~> 1.1)
42
+ simplecov-html (~> 0.11)
43
+ simplecov-html (0.12.2)
44
+ unicode-display_width (1.7.0)
45
+
46
+ PLATFORMS
47
+ ruby
48
+
49
+ DEPENDENCIES
50
+ inheritance-helper
51
+ rake
52
+ rspec
53
+ rubocop
54
+ simplecov
55
+
56
+ BUNDLED WITH
57
+ 1.17.3
@@ -0,0 +1,54 @@
1
#!/usr/bin/env ruby

# csv-change-eol: rewrites a CSV file so that every row ends with a custom
# end-of-line byte sequence (given on the command line as hex digits).
#
# The result is written next to the input as "<name>.escaped-eol.csv" (or
# "<name>.escaped-eol" when the input does not end in ".csv") and the new
# path is printed on stdout.

require 'csv'

# Wraps +str+ in ANSI escape codes so terminals render it in bold.
def bold_string(str)
  "\033[1m#{str}\033[0m"
end

USAGE = "Usage: #{bold_string('csv-change-eol')} <csv_file> <end of line character sequence in hex>".freeze

# Prints +msg+ together with the usage help on stderr and terminates the
# process with exit status 1. Never returns.
def exit_on_error(msg)
  $stderr.print <<~STR
    Error: #{bold_string(msg)}

    #{USAGE}

    End of line example: '7C5E7C0A' is '|^|\\n'
    - 0A is new line
    - 0D is carriage return

    Goto: #{bold_string('http://www.asciitable.com/')} for help with the character sequence

  STR
  exit 1
end

csv_file = ARGV.shift || exit_on_error('no csv file specified')
eol_sequence = ARGV.shift || exit_on_error('no EOL character sequence specified')

exit_on_error("file #{csv_file} not found") unless File.exist?(csv_file)
exit_on_error("not a HEX sequence (#{eol_sequence})") unless eol_sequence.match?(/\A[0-9a-f]+\z/i)
exit_on_error("incorrect number of characters in (#{eol_sequence}), should be even") unless eol_sequence.size.even?

# Two hex digits per byte -> raw (binary) byte string.
eol_sequence = [eol_sequence].pack('H*')

escaped_csv_file =
  if csv_file.match?(/\.csv$/i)
    csv_file.sub(/(\.csv)$/i, '.escaped-eol\1')
  else
    "#{csv_file}.escaped-eol"
  end

File.open(escaped_csv_file, 'wb') do |out|
  CSV.foreach(csv_file) do |row|
    # chomp removes only the row terminator added by to_csv; rstrip would also
    # eat trailing spaces/tabs that belong to the last field.
    out.write(row.to_csv.chomp)
    # Written separately (the file is in binary mode) so an arbitrary byte
    # sequence never has to be concatenated onto a text-encoded row.
    out.write(eol_sequence)
  end
end

puts escaped_csv_file
@@ -1,5 +1,7 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'optparse'
4
+
3
5
  BYTE_ORDER_MARKS = {
4
6
  "\xEF\xBB\xBF".force_encoding('ASCII-8BIT') => 'UTF-8',
5
7
  "\xFE\xFF".force_encoding('ASCII-8BIT') => 'UTF-16',
@@ -25,7 +27,7 @@ def csv_parse_line(line)
25
27
  last_comma_pos = -1
26
28
  column = 1
27
29
 
28
- while pos = line.index(/([",])/, pos + 1)
30
+ while pos = line.index(/([",\n])/, pos + 1)
29
31
  case line[pos]
30
32
  when '"'
31
33
  if opened_quote
@@ -62,7 +64,8 @@ def csv_parse_line(line)
62
64
  opened_quote = true
63
65
  end
64
66
  end
65
- when ','
67
+ when ',',
68
+ "\n"
66
69
  if ! opened_quote
67
70
  column += 1
68
71
  columns << [line[last_comma_pos + 1, pos - last_comma_pos - 1], :ok]
@@ -95,6 +98,22 @@ def parse_csv_row(file, lineno, number_of_lines)
95
98
  csv_parse_line(str)
96
99
  end
97
100
 
101
+ options = {
102
+ all_columns: false
103
+ }
104
+ OptionParser.new do |opts|
105
+ opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file> <line number> [<number of lines>]'
106
+
107
+ opts.on('-h', '--help', 'Prints this help') do
108
+ puts opts
109
+ exit
110
+ end
111
+
112
+ opts.on('-a', '--all', 'Display all columns') do
113
+ options[:all_columns] = true
114
+ end
115
+ end.parse!
116
+
98
117
  file = File.open(ARGV[0], 'rb')
99
118
  lineno = ARGV[1].to_i
100
119
  number_of_lines = (ARGV[2] || 1).to_i
@@ -109,7 +128,7 @@ file.close
109
128
  cnt = 0
110
129
  data.each do |k, (v, status)|
111
130
  cnt += 1
112
- next if empty_column?(v)
131
+ next if !options[:all_columns] && empty_column?(v)
113
132
  if status == :ok
114
133
  puts sprintf(' %-3d %s: %s', cnt, k, v)
115
134
  else
@@ -2,7 +2,8 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.1.5'
5
+ s.version = '0.2.2'
6
+ s.licenses = ['MIT']
6
7
  s.summary = 'CSV Utils'
7
8
  s.description = 'Tools for debugging malformed CSV files'
8
9
  s.authors = ['Doug Youch']
@@ -11,4 +12,6 @@ Gem::Specification.new do |s|
11
12
  s.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
12
13
  s.bindir = 'bin'
13
14
  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
15
+
16
+ s.add_runtime_dependency 'inheritance-helper'
14
17
  end
@@ -0,0 +1,10 @@
1
+ require 'csv'
2
+
3
+ # Collection of tools for working with CSV files.
4
+ module CSVUtils
5
+ autoload :CSVExtender, 'csv_utils/csv_extender'
6
+ autoload :CSVOptions, 'csv_utils/csv_options'
7
+ autoload :CSVReport, 'csv_utils/csv_report'
8
+ autoload :CSVRow, 'csv_utils/csv_row'
9
+ autoload :CSVSort, 'csv_utils/csv_sort'
10
+ end
@@ -0,0 +1,76 @@
1
# Collection of tools for working with CSV files.
module CSVUtils
  # Utility class for appending data to a csv file.
  #
  # Reads rows from +csv_file+, asks the caller's block for extra columns and
  # writes the widened rows to +new_csv_file+.
  #
  # Defined with nested module/class keywords (like the sibling classes) so the
  # file also loads when CSVUtils has not been defined beforehand.
  class CSVExtender
    attr_reader :csv_file,
                :new_csv_file,
                :csv_options

    # @param csv_file [String] path of the csv file to read
    # @param new_csv_file [String] path of the csv file to create
    # @param csv_options [Hash] options handed to CSV.open for both files
    def initialize(csv_file, new_csv_file, csv_options = {})
      @csv_file = csv_file
      @new_csv_file = new_csv_file
      @csv_options = csv_options
    end

    # Appends columns row by row.
    #
    # @param additional_headers [Array, nil] header cells to append to the first
    #   row; when nil no row is treated as a header row
    # @yield [row, current_headers] one source row and the source header row
    #   (nil when +additional_headers+ is nil)
    # @yieldreturn [Array] the cells to append to +row+
    # @return [nil]
    def append(additional_headers)
      process(additional_headers) do |current_headers|
        while (row = src.shift)
          additional_columns = yield row, current_headers
          dest << (row + additional_columns)
        end
      end
    end

    # Appends columns in batches, e.g. to look extra data up in bulk.
    #
    # @param additional_headers [Array, nil] see #append
    # @param batch_size [Integer] number of rows yielded at once
    # @yield [batch, current_headers] an array of source rows and the source
    #   header row
    # @yieldreturn [Array<Array>] one array of extra cells per row of +batch+,
    #   in the same order
    # @return [nil]
    def append_in_batches(additional_headers, batch_size = 1_000)
      process(additional_headers) do |current_headers|
        batch = []

        flush_batch = lambda do
          additional_rows = yield batch, current_headers
          batch.each_with_index { |row, idx| dest << (row + additional_rows[idx]) }
          batch = []
        end

        while (row = src.shift)
          batch << row
          flush_batch.call if batch.size >= batch_size
        end

        flush_batch.call unless batch.empty?
      end
    end

    # Opens both files, copies the header row (if any), yields the source
    # headers and always closes the files afterwards - also when the block
    # raises.
    #
    # @return [nil]
    def process(additional_headers)
      # Open the source first so a missing input does not leave an empty output
      # file behind; opening dest here guarantees the output file exists even
      # when the source has no rows.
      src
      dest

      yield append_headers(additional_headers)

      nil
    ensure
      close
    end

    # @return [CSV] lazily opened reader on +csv_file+
    def src
      # Options are splatted: CSV.open takes keyword options, and a positional
      # Hash is rejected on Ruby 3.
      @src ||= CSV.open(csv_file, 'rb', **csv_options)
    end

    # @return [CSV] lazily opened writer on +new_csv_file+
    def dest
      @dest ||= CSV.open(new_csv_file, 'wb', **csv_options)
    end

    # Closes whichever files are open and forgets the handles, so the instance
    # can be used for another run.
    def close
      @src&.close
      @dest&.close
    ensure
      @src = nil
      @dest = nil
    end

    private

    # Copies the first source row plus +additional_headers+ to the destination.
    #
    # @return [Array, nil] the source header row, or nil when no headers were
    #   requested
    def append_headers(additional_headers)
      return nil unless additional_headers

      current_headers = src.shift
      dest << (current_headers + additional_headers)
      current_headers
    end
  end
end
@@ -0,0 +1,87 @@
1
# Auto detect a csv file's options
module CSVUtils
  # Inspects the first line of a csv file and guesses the column separator,
  # row separator, byte order mark, character encoding and number of columns.
  #
  # Detection is a plain substring search; quoted fields are not parsed.
  class CSVOptions
    # this list is from https://en.wikipedia.org/wiki/Byte_order_mark
    BYTE_ORDER_MARKS = {
      "\xEF\xBB\xBF".b => 'UTF-8',
      "\xFE\xFF".b => 'UTF-16',
      "\xFF\xFE".b => 'UTF-16',
      "\x00\x00\xFE\xFF".b => 'UTF-32',
      "\xFF\xFE\x00\x00".b => 'UTF-32'
    }.freeze

    # Candidates in priority order: the first one found in the line wins.
    COL_SEPARATORS = [
      "\x02",
      "\t",
      '|',
      ','
    ].freeze

    # "\r\n" must be tried before its two halves.
    ROW_SEPARATORS = [
      "\r\n",
      "\n",
      "\r"
    ].freeze

    attr_reader :columns,
                :byte_order_mark,
                :encoding,
                :col_separator,
                :row_separator

    # @param io [String, #readline] path of a csv file, or an object that
    #   responds to +readline+ (the first line is consumed from it)
    def initialize(io)
      line = read_first_line(io)

      @col_separator = auto_detect_col_sep(line)
      @row_separator = auto_detect_row_sep(line)
      @byte_order_mark = get_byte_order_mark(line)
      @encoding = get_character_encoding(@byte_order_mark)
      @columns = get_number_of_columns(line) if @col_separator
    end

    # @return [Boolean] true when both separators could be detected
    def valid?
      !@col_separator.nil? && !@row_separator.nil?
    end

    # @return [String, nil] first entry of COL_SEPARATORS contained in +line+
    def auto_detect_col_sep(line)
      COL_SEPARATORS.find { |sep| line.include?(sep) }
    end

    # @return [String, nil] first entry of ROW_SEPARATORS contained in +line+
    def auto_detect_row_sep(line)
      ROW_SEPARATORS.find { |sep| line.include?(sep) }
    end

    # Splits +line+ into header cells using the detected separators.
    #
    # @return [Array<String>] header cells without byte order mark and without
    #   the row terminator
    def get_headers(line)
      # Keep only the first record so the terminator (or, for "\r"-separated
      # files, the following records) does not end up in the last header.
      record = row_separator ? line.partition(row_separator).first : line
      # limit -1 keeps trailing empty columns, e.g. "a,b," has three columns.
      headers = record.split(col_separator, -1)
      headers[0] = strip_byte_order_marks(headers[0]) unless headers.empty?
      headers
    end

    # @return [Integer] number of cells in +line+
    def get_number_of_columns(line)
      get_headers(line).size
    end

    # @return [String, nil] the BYTE_ORDER_MARKS key +line+ starts with
    def get_byte_order_mark(line)
      bytes = line.b
      # Longest first: the UTF-32 LE mark begins with the UTF-16 LE mark.
      BYTE_ORDER_MARKS.keys
                      .sort_by { |bom| -bom.bytesize }
                      .find { |bom| bytes.start_with?(bom) }
    end

    # @return [String] encoding name for +bom+, 'UTF-8' when there is no mark
    def get_character_encoding(bom)
      BYTE_ORDER_MARKS[bom] || 'UTF-8'
    end

    # Removes the detected byte order mark from the front of +header+.
    #
    # @return [String, nil] +header+ without the mark; +header+ itself when no
    #   mark was detected or it does not start with it
    def strip_byte_order_marks(header)
      bom = @byte_order_mark
      return header unless bom && header && header.b.start_with?(bom)

      # byteslice works on bytes and keeps the header's own encoding.
      header.byteslice(bom.bytesize..-1)
    end

    private

    # Reads the first line; an empty source yields an empty line (so #valid?
    # reports false) instead of raising EOFError.
    def read_first_line(io)
      io.is_a?(String) ? File.open(io, 'rb', &:readline) : io.readline
    rescue EOFError
      ''
    end
  end
end
@@ -0,0 +1,40 @@
1
# Builds a csv file from csv rows
module CSVUtils
  # Thin writer around a CSV object. Rows may be plain arrays or any object
  # responding to +to_a+ (and +csv_headers+ for #add_headers).
  class CSVReport
    attr_reader :csv,
                :must_close

    # @param csv [String, #<<] path of the file to create, or an already open
    #   CSV-like object that is written to and left open
    # @param csv_options [Hash] options for CSV.open; the extra key +:mode+
    #   selects the file mode (default 'wb'). Only used when +csv+ is a path.
    # @yield [report] optional; when given, #generate is run with it
    def initialize(csv, csv_options = {}, &block)
      @csv =
        if csv.is_a?(String)
          @must_close = true
          # Work on a copy so the caller's hash keeps its :mode entry.
          options = csv_options.dup
          mode = options.delete(:mode) || 'wb'
          # Splatted because CSV.open takes keyword options; a positional Hash
          # is rejected on Ruby 3.
          CSV.open(csv, mode, **options)
        else
          @must_close = false
          csv
        end

      generate(&block) if block
    end

    # Yields the report and afterwards closes the file if this object opened
    # it - also when the block raises.
    #
    # @return [nil]
    def generate
      yield self
      nil
    ensure
      @csv.close if @must_close
    end

    # Writes one row.
    #
    # @param csv_row [Array, #to_a]
    # @return the underlying csv object
    def append(csv_row)
      @csv << (csv_row.is_a?(Array) ? csv_row : csv_row.to_a)
    end
    alias << append

    # Writes a header row.
    #
    # @param csv_row [Array, #csv_headers] the header cells, or an object
    #   providing them through +csv_headers+
    def add_headers(csv_row)
      append(csv_row.is_a?(Array) ? csv_row : csv_row.csv_headers)
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'inheritance-helper'
2
+
3
module CSVUtils
  # Mixin that lets a class declare how its instances are rendered as a csv
  # row. Including it extends the class with InheritanceHelper::Methods and
  # ClassMethods.
  module CSVRow
    def self.included(base)
      base.extend InheritanceHelper::Methods
      base.extend ClassMethods
    end

    # Class-level DSL.
    module ClassMethods
      # @return [Hash] declared columns keyed by the name given to #csv_column;
      #   starts out empty and is extended through add_value_to_class_method
      def csv_columns
        {}
      end

      # Declares a column.
      #
      # @param header [Symbol, String] column name; also the default header
      #   text and the default method called on the instance
      # @param options [Hash] optional +:header+ (header text), +:proc+ (value
      #   computed by evaluating the proc on the instance) or +:method+ (name of
      #   the method to call)
      # @yield optional block, stored as +:proc+
      def csv_column(header, options = {}, &block)
        # Copy first: the defaults below must not leak into a hash the caller
        # may reuse for other columns.
        options = options.dup
        options[:header] ||= header.to_s

        if block
          options[:proc] = block
        elsif options[:proc].nil?
          options[:method] ||= header
        end

        add_value_to_class_method(:csv_columns, header => options)
      end
    end

    # @return [Array] header text of every declared column
    def csv_headers
      self.class.csv_columns.values.map { |column_options| csv_column_header(column_options) }
    end

    # @return [Array] value of every declared column for this instance
    def csv_row
      self.class.csv_columns.values.map { |column_options| csv_column_value(column_options) }
    end
    alias_method :to_a, :csv_row

    private

    def csv_column_header(column_options)
      column_options[:header]
    end

    # A +:proc+ is evaluated in the context of the instance; otherwise the
    # configured method is called via send.
    def csv_column_value(column_options)
      if column_options[:proc]
        instance_eval(&column_options[:proc])
      else
        send(column_options[:method])
      end
    end
  end
end
@@ -0,0 +1,112 @@
1
+ require 'fileutils'
2
+
3
# Collection of tools for working with CSV files.
module CSVUtils
  # Utility class for sorting the rows for a csv file
  #
  # Works as an external merge sort: the source is read in batches, each batch
  # is sorted in memory and written to a part file, then part files are merged
  # pairwise until one file is left, which becomes +new_csv_file+.
  #
  # Defined with nested module/class keywords (like the sibling classes) so the
  # file also loads when CSVUtils has not been defined beforehand.
  class CSVSort
    attr_reader :csv_file,
                :new_csv_file,
                :has_headers,
                :csv_options,
                :headers

    # @param csv_file [String] path of the csv file to sort
    # @param new_csv_file [String] path of the sorted file to create; part
    #   files are created next to it as "<new_csv_file>.part.N" / ".merge.N"
    # @param has_headers [Boolean] whether the first row is a header row that
    #   must stay on top
    # @param csv_options [Hash] options handed to CSV.open for every file
    def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
      @csv_file = csv_file
      @new_csv_file = new_csv_file
      @has_headers = has_headers
      @csv_options = csv_options
      @csv_part_files = []
      @files_to_delete = []
    end

    # Sorts +csv_file+ into +new_csv_file+.
    #
    # @param batch_size [Integer] number of rows sorted in memory at once
    #   (must be positive)
    # @yield [row1, row2] comparator returning a negative number, zero or a
    #   positive number, like <=>; rows are compared with <=> when omitted
    def sort(batch_size = 100_000, &block)
      comparator = block || ->(row1, row2) { row1 <=> row2 }
      # Start from a clean slate so the instance can sort more than once.
      @csv_part_files = []

      create_sorted_csv_part_files(batch_size, &comparator)
      merge_csv_part_files(&comparator)
    end

    private

    # Merges two sorted csv files into +dest_csv_file+. On ties the row of the
    # first file is written first. All three files are closed even when the
    # comparator raises.
    def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
      # Options are splatted: CSV.open takes keyword options, and a positional
      # Hash is rejected on Ruby 3.
      src1 = CSV.open(src_csv_file1, 'rb', **csv_options)
      src2 = CSV.open(src_csv_file2, 'rb', **csv_options)
      dest = CSV.open(dest_csv_file, 'wb', **csv_options)

      if @headers
        dest << @headers
        # every part file repeats the header row; skip it
        src1.shift
        src2.shift
      end

      row1 = src1.shift
      row2 = src2.shift

      while row1 || row2
        if row2.nil? || (row1 && yield(row1, row2) <= 0)
          dest << row1
          row1 = src1.shift
        else
          dest << row2
          row2 = src2.shift
        end
      end
    ensure
      [src1, src2, dest].each { |csv| csv&.close }
    end

    # Splits the source into sorted part files of at most +batch_size+ rows and
    # remembers the header row in @headers.
    def create_sorted_csv_part_files(batch_size, &block)
      CSV.open(csv_file, 'rb', **csv_options) do |src|
        @headers = has_headers ? src.shift : nil
        src.each_slice(batch_size) { |batch| write_sorted_part(batch, &block) }
      end
    end

    # Sorts +batch+ and writes it (below the header row, if any) to a new part
    # file, which is appended to @csv_part_files.
    def write_sorted_part(batch, &block)
      batch.sort!(&block)
      part_file = "#{new_csv_file}.part.#{@csv_part_files.size}"
      @csv_part_files << part_file

      CSV.open(part_file, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
        batch.each { |row| csv << row }
      end
    end

    # Merges the part files pairwise until one is left and moves it to
    # +new_csv_file+. Merged inputs are deleted as soon as they are consumed.
    def merge_csv_part_files(&block)
      # A source without data rows produced no part file; write one holding
      # just the header row (or nothing) so there is something to move.
      write_sorted_part([]) if @csv_part_files.empty?

      file_merge_cnt = 0

      while @csv_part_files.size > 1
        file_merge_cnt += 1

        csv_part_file1 = @csv_part_files.shift
        csv_part_file2 = @csv_part_files.shift
        @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"

        merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)

        File.unlink(csv_part_file1)
        File.unlink(csv_part_file2)
      end

      FileUtils.mv(@csv_part_files.last, new_csv_file)
    end
  end
end
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ $LOAD_PATH << File.expand_path('../lib', __dir__)
5
+ require 'csv-utils'
6
+ require 'irb'
7
+ IRB.start(__FILE__)
metadata CHANGED
@@ -1,33 +1,61 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-06-19 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2020-07-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: inheritance-helper
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  description: Tools for debugging malformed CSV files
14
28
  email: dougyouch@gmail.com
15
29
  executables:
30
+ - csv-change-eol
16
31
  - csv-find-error
17
32
  - csv-readline
18
33
  extensions: []
19
34
  extra_rdoc_files: []
20
35
  files:
21
36
  - ".gitignore"
37
+ - ".ruby-gemset"
38
+ - ".ruby-version"
39
+ - Gemfile
40
+ - Gemfile.lock
22
41
  - LICENSE
23
42
  - README.md
43
+ - bin/csv-change-eol
24
44
  - bin/csv-find-error
25
45
  - bin/csv-readline
26
46
  - csv-utils.gemspec
47
+ - lib/csv-utils.rb
48
+ - lib/csv_utils/csv_extender.rb
49
+ - lib/csv_utils/csv_options.rb
50
+ - lib/csv_utils/csv_report.rb
51
+ - lib/csv_utils/csv_row.rb
52
+ - lib/csv_utils/csv_sort.rb
53
+ - script/console
27
54
  homepage: https://github.com/dougyouch/csv-utils
28
- licenses: []
55
+ licenses:
56
+ - MIT
29
57
  metadata: {}
30
- post_install_message:
58
+ post_install_message:
31
59
  rdoc_options: []
32
60
  require_paths:
33
61
  - lib
@@ -42,8 +70,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
42
70
  - !ruby/object:Gem::Version
43
71
  version: '0'
44
72
  requirements: []
45
- rubygems_version: 3.0.3
46
- signing_key:
73
+ rubygems_version: 3.0.8
74
+ signing_key:
47
75
  specification_version: 4
48
76
  summary: CSV Utils
49
77
  test_files: []