smarter_csv 1.9.2 → 1.9.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/lib/smarter_csv/auto_detection.rb +73 -0
- data/lib/smarter_csv/file_io.rb +50 -0
- data/lib/smarter_csv/headers.rb +160 -0
- data/lib/smarter_csv/parse.rb +90 -0
- data/lib/smarter_csv/smarter_csv.rb +27 -340
- data/lib/smarter_csv/variables.rb +26 -0
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +8 -3
- metadata +7 -3
- data/lib/core_ext/hash.rb +0 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f35e10ff8bc0e79ff1ed9bea8e413f746f51128a6f6a9622d246873fd588366
|
4
|
+
data.tar.gz: 5cc30cf6f4422dd16f3019915bc5305a92aaaa4b99665e4c4c525d3bbf489cfd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 057472a73ae0be95318b16428b276ecffba384a68479af715c5ec3ca7601405ca73928b0fbf245c9b3f46fd33b82a8c6d9c9e6330ddb0305b83ae23f58173df0
|
7
|
+
data.tar.gz: 319b12a53875c1963eed6d27aa67850135d33a5b3a9f70607e6d812906733b711ade6c3ee6e789d78c2e159004a879e59e700145224134745b16d279039ac38a
|
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.9.3 (2023-12-16)
|
5
|
+
* raise SmarterCSV::IncorrectOption when `user_provided_headers` are empty
|
6
|
+
* code refactor / no functional changes
|
7
|
+
* added test cases
|
8
|
+
|
4
9
|
## 1.9.2 (2023-11-12)
|
5
10
|
* fixed bug with '\\' at end of line (issue #252, thanks to averycrespi-moz)
|
6
11
|
* fixed require statements (issue #249, thanks to PikachuEXE, courtsimas)
|
data/README.md
CHANGED
@@ -300,7 +300,7 @@ And header and data validations will also be supported in 2.x
|
|
300
300
|
| Option | Default | Explanation |
|
301
301
|
---------------------------------------------------------------------------------------------------------------------------------
|
302
302
|
| :key_mapping | nil | a hash which maps headers from the CSV file to keys in the result hash |
|
303
|
-
| :
|
303
|
+
| :silence_missing_keys | false | ignore missing keys in `key_mapping` |
|
304
304
|
| | | if set to true: makes all mapped keys optional |
|
305
305
|
| | | if given an array, makes only the keys listed in it optional |
|
306
306
|
| :required_keys | nil | An array. Specify the required names AFTER header transformation. |
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  class << self
    protected

    # Guesses the column separator of the CSV data behind +filehandle+.
    #
    # If the file has headers (+options[:headers_in_file]+), only the header
    # line is inspected; otherwise up to five lines are sampled. The lines
    # configured via +options[:skip_lines]+ are skipped first, and the
    # filehandle is rewound before returning.
    #
    # Returns the candidate delimiter that occurs most often in the sample.
    # Returns ',' when no candidate occurs but the last sampled line consists
    # of word characters only (single-column file).
    # Raises SmarterCSV::NoColSepDetected if no separator can be determined,
    # which includes the case that not a single line could be read.
    def guess_column_separator(filehandle, options)
      skip_lines(filehandle, options)

      delimiters = [',', "\t", ';', ':', '|']

      line = nil
      has_header = options[:headers_in_file]
      candidates = Hash.new(0)
      count = has_header ? 1 : 5
      count.times do
        line = readline_with_counts(filehandle, options)
        delimiters.each do |d|
          candidates[d] += line.scan(d).count
        end
      rescue EOFError # short files
        break
      end
      rewind(filehandle)

      # `candidates` stays empty when no line could be read at all, in which
      # case `max` is nil; nil.to_i is 0, so both cases are handled alike.
      highest_count = candidates.values.max.to_i

      if highest_count.zero?
        # if the line only contains word characters, assume a single-column file
        return ',' if line && line.chomp(options[:row_sep]) =~ /^\w+$/

        raise SmarterCSV::NoColSepDetected
      end

      candidates.key(highest_count)
    end

    # Guesses the line ending ("\n", "\r" or "\r\n") of the data behind +filehandle+.
    #
    # Counts how often each of the pre-defined line endings occurs, ignoring
    # those found between +options[:quote_char]+ characters, and returns the
    # most frequent one. The filehandle is rewound before returning.
    #
    # limitation: unless +options[:auto_row_sep_chars]+ is a positive number,
    # this reads the whole file before making a decision. With a positive
    # value, scanning stops after that many characters outside of quotes.
    def guess_line_ending(filehandle, options)
      counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
      quoted_char = false

      last_char = nil
      chars_seen = 0 # characters examined outside of quoted sections
      filehandle.each_char do |c|
        quoted_char = !quoted_char if c == options[:quote_char]
        next if quoted_char

        if last_char == "\r"
          if c == "\n"
            counts["\r\n"] += 1
          else
            counts["\r"] += 1 # \r are counted after they appeared
          end
        elsif c == "\n"
          counts["\n"] += 1
        end
        last_char = c
        chars_seen += 1
        break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && chars_seen >= options[:auto_row_sep_chars]
      end
      rewind(filehandle)

      # a trailing \r has not been counted yet, because \r is only counted
      # once the character following it is known
      counts["\r"] += 1 if last_char == "\r"
      # find the most frequent key/value pair:
      most_frequent_key, _count = counts.max_by{|_, v| v}
      most_frequent_key
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  class << self
    protected

    # Reads the next line (terminated by +options[:row_sep]+) from +filehandle+
    # and increments both @file_line_count and @csv_line_count.
    # A byte order mark is removed from the very first line read.
    # Errors raised by +filehandle.readline+ (e.g. EOFError) are not rescued here.
    def readline_with_counts(filehandle, options)
      line = filehandle.readline(options[:row_sep])
      @file_line_count += 1
      @csv_line_count += 1
      line = remove_bom(line) if @csv_line_count == 1
      line
    end

    # Reads and discards +options[:skip_lines]+ lines.
    # A nil, zero or negative value skips nothing.
    def skip_lines(filehandle, options)
      options[:skip_lines].to_i.times do
        readline_with_counts(filehandle, options)
      end
    end

    # Resets both line counters and rewinds +filehandle+ to its beginning.
    def rewind(filehandle)
      @file_line_count = 0
      @csv_line_count = 0
      filehandle.rewind
    end

    private

    # byte order marks, as arrays of lower-case hex strings without leading zeros
    UTF_32_BOM = %w[0 0 fe ff].freeze
    UTF_32LE_BOM = %w[ff fe 0 0].freeze
    UTF_8_BOM = %w[ef bb bf].freeze
    UTF_16_BOM = %w[fe ff].freeze
    UTF_16LE_BOM = %w[ff fe].freeze

    # Returns +str+ without a leading UTF-8 / UTF-16 / UTF-32 byte order mark.
    # Returns +str+ itself when it does not start with one of the BOM lead bytes.
    def remove_bom(str)
      # a BOM is at most four bytes long, so only those need to be inspected
      leading_hex = str.byteslice(0, 4).bytes.map{|x| x.to_s(16)}
      # if string does not start with one of the bytes, there is no BOM
      return str unless %w[ef fe ff 0].include?(leading_hex[0])

      # the 4-byte marks must be checked first: UTF_32LE_BOM starts with UTF_16LE_BOM
      return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(leading_hex[0..3])
      return str.byteslice(3..-1) if leading_hex[0..2] == UTF_8_BOM
      return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(leading_hex[0..1])

      # :nocov:
      puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
      str
      # :nocov:
    end
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  class << self
    # Determines the headers used as keys for the result hashes.
    #
    # When +options[:headers_in_file]+ is set, the header line is read from
    # +filehandle+ and kept unmodified in @raw_header. When
    # +options[:user_provided_headers]+ is given, those headers are used
    # instead; the header line is still read from the file, so that the number
    # of columns can be compared.
    #
    # Returns [header_array, header_array.size].
    #
    # Raises SmarterCSV::IncorrectOption if there are neither headers in the
    # file nor user provided headers, or if :user_provided_headers is not a
    # non-empty Array.
    # Raises SmarterCSV::HeaderSizeMismatch if the number of user provided
    # headers differs from the number of headers in the file.
    # See the private helpers below for further errors raised during
    # key mapping and validation.
    def process_headers(filehandle, options)
      @raw_header = nil # header as it appears in the file
      @headers = nil # the processed headers
      file_header_array = nil
      file_header_size = nil

      # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
      if options[:headers_in_file] # extract the header line
        # process the header line in the CSV file..
        # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
        header_line = @raw_header = readline_with_counts(filehandle, options)
        header_line = preprocess_header_line(header_line, options)
        file_header_array, file_header_size = parse_and_modify_headers(header_line, options)
      elsif !options[:user_provided_headers]
        raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
      end

      if options[:user_provided_headers]
        unless options[:user_provided_headers].is_a?(Array) && !options[:user_provided_headers].empty?
          raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
        end

        # use user-provided headers
        user_header_array = options[:user_provided_headers]
        # user_provided_headers: their count should match the headers_in_file if any
        if file_header_size && user_header_array.size != file_header_size
          raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers != CSV-file has #{file_header_size} headers"
        end

        # work on a copy: the headers are modified in place below, and the
        # Array handed in by the caller must not change as a side effect
        header_array = user_header_array.dup
      else
        header_array = file_header_array
      end

      # detect duplicate headers and disambiguate
      header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]

      # symbolize headers
      header_array.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]

      # wouldn't make sense to re-map user provided headers
      header_array = remap_headers(header_array, options) if options[:key_mapping] && !options[:user_provided_headers]

      validate_and_deprecate_headers(header_array, options)

      [header_array, header_array.size]
    end

    private

    # Prepares the raw header line for parsing: enforces UTF-8 encoding,
    # removes comments, removes the trailing row separator, and removes
    # whatever matches +options[:strip_chars_from_headers]+.
    def preprocess_header_line(header_line, options)
      header_line = enforce_utf8_encoding(header_line, options)
      header_line = remove_comments_from_header(header_line, options)
      header_line = header_line.chomp(options[:row_sep])
      header_line.gsub!(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
      header_line
    end

    # Splits the header line into individual headers and normalizes them:
    # quote characters are removed, whitespace is stripped if
    # +options[:strip_whitespace]+ is set, and - unless
    # +options[:keep_original_headers]+ is set - runs of whitespace or dashes
    # are replaced by '_' and headers are downcased if
    # +options[:downcase_header]+ is set.
    #
    # Returns [file_header_array, file_header_size], where the size is the
    # one reported by +parse+.
    def parse_and_modify_headers(header_line, options)
      file_header_array, file_header_size = parse(header_line, options)

      # the quote character has to be matched literally, it could be a regexp meta character
      quote_pattern = Regexp.new(Regexp.escape(options[:quote_char].to_s))
      file_header_array.map!{|x| x.gsub(quote_pattern, '')}
      file_header_array.map!{|x| x.strip} if options[:strip_whitespace]

      unless options[:keep_original_headers]
        file_header_array.map!{|x| x.gsub(/\s+|-+/, '_')}
        file_header_array.map!{|x| x.downcase} if options[:downcase_header]
      end
      [file_header_array, file_header_size]
    end

    # Returns a new Array in which the 2nd, 3rd, .. occurrence of a header has
    # +options[:duplicate_header_suffix]+ and its occurrence number appended.
    def disambiguate_headers(headers, options)
      counts = Hash.new(0)
      headers.map do |header|
        counts[header] += 1
        counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
      end
    end

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    #
    # Modifies +headers+ in place and returns it. Headers without a mapping
    # are kept, or replaced by nil if +options[:remove_unmapped_keys]+ is set.
    #
    # Raises SmarterCSV::IncorrectOption if :key_mapping is not a non-empty Hash.
    # Raises SmarterCSV::KeyMappingError if mapped keys are missing from the
    # headers and are not covered by +options[:silence_missing_keys]+.
    def remap_headers(headers, options)
      key_mapping = options[:key_mapping]
      # check the type first: not every object responds to `empty?`
      unless key_mapping.is_a?(Hash) && !key_mapping.empty?
        raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
      end

      # if silence_missing_keys are not set, raise error if missing header
      missing_keys = key_mapping.keys - headers
      # if the user passes a list of specific mapped keys that are optional
      missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)

      unless missing_keys.empty? || options[:silence_missing_keys] == true
        raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
      end

      headers.map! do |header|
        if key_mapping.has_key?(header)
          key_mapping[header]
        elsif options[:remove_unmapped_keys]
          nil
        else
          header
        end
      end
      headers
    end

    # header_validations
    #
    # Raises SmarterCSV::DuplicateHeaders if a header occurs more than once,
    # unless the headers were provided by the user.
    # Raises SmarterCSV::MissingKeys if one of +options[:required_keys]+ is
    # not among the headers.
    #
    # Side effect: the deprecated option :required_headers is moved to
    # :required_keys in +options+, unless :required_keys is already set.
    def validate_and_deprecate_headers(headers, options)
      # count every header once, instead of re-scanning the Array for each header
      occurrences = headers.each_with_object(Hash.new(0)) {|header, counts| counts[header] += 1}
      duplicate_headers = headers.compact.select {|header| occurrences[header] > 1}

      unless options[:user_provided_headers] || duplicate_headers.empty?
        raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
      end

      # deprecate required_headers
      unless options[:required_headers].nil?
        puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
        if options[:required_keys].nil?
          options[:required_keys] = options[:required_headers]
          options[:required_headers] = nil
        end
      end

      if options[:required_keys] && options[:required_keys].is_a?(Array)
        missing_keys = options[:required_keys].reject {|k| headers.include?(k)}
        raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
      end
    end

    # Returns +header+ as UTF-8 with invalid or undefined byte sequences
    # replaced by +options[:invalid_byte_sequence]+, if +options[:force_utf8]+
    # is set or +options[:file_encoding]+ is not UTF-8.
    # Otherwise returns +header+ unchanged.
    def enforce_utf8_encoding(header, options)
      return header unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

      header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
    end

    # Returns +header+ with the first match of +options[:comment_regexp]+
    # removed, or +header+ itself if that option is not set.
    def remove_comments_from_header(header, options)
      return header unless options[:comment_regexp]

      header.sub(options[:comment_regexp], '')
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  class << self
    protected

    ###
    ### Thin wrapper around C-extension
    ###
    # Parses one CSV line and returns [elements, elements.size].
    # Uses the C-extension if +options[:acceleration]+ is set and the
    # extension is available, otherwise the Ruby implementation below.
    def parse(line, options, header_size = nil)
      # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

      if options[:acceleration] && has_acceleration?
        # :nocov:
        # plain substring test: the quote character could be a regexp meta character
        has_quotes = !line.nil? && line.include?(options[:quote_char].to_s)
        elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
        elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
        [elements, elements.size]
        # :nocov:
      else
        # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
        parse_csv_line_ruby(line, options, header_size)
      end
    end

    # ------------------------------------------------------------------
    # Ruby equivalent of the C-extension for parse_line
    #
    # parses a single line: either a CSV header and body line
    # - quoting rules compared to RFC-4180 are somewhat relaxed
    # - we are not assuming that quotes inside a fields need to be doubled
    # - we are not assuming that all fields need to be quoted (0 is even)
    # - works with multi-char col_sep
    # - if header_size is given, only up to header_size fields are parsed
    #
    # We use header_size for parsing the body lines to make sure we always match the number of headers
    # in case there are trailing col_sep characters in line
    #
    # Our convention is that empty fields are returned as empty strings, not as nil.
    #
    # the purpose of the header_size parameter is to handle a corner case where
    # CSV lines contain more fields than the header.
    # In which case the remaining fields in the line are ignored
    #
    # Returns [elements, elements.size]; for a nil line that is [[], 0].
    def parse_csv_line_ruby(line, options, header_size = nil)
      # same shape as the regular return value, so that callers can always destructure it
      return [[], 0] if line.nil?

      line_size = line.size
      col_sep = options[:col_sep]
      col_sep_size = col_sep.size
      quote = options[:quote_char]
      quote_count = 0
      elements = []
      start = 0
      i = 0

      previous_char = ''
      while i < line_size
        # a col_sep only separates fields when we are not inside a quoted section
        if line[i...i+col_sep_size] == col_sep && quote_count.even?
          break if !header_size.nil? && elements.size >= header_size

          elements << cleanup_quotes(line[start...i], quote)
          previous_char = line[i]
          i += col_sep_size
          start = i
        else
          # quote characters preceded by a backslash are not counted
          quote_count += 1 if line[i] == quote && previous_char != '\\'
          previous_char = line[i]
          i += 1
        end
      end
      # the remainder of the line is the last field
      elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
      [elements, elements.size]
    end

    # Removes the enclosing quote characters from +field+, if it both starts
    # and ends with +quote+, and replaces doubled quote characters by a
    # single one. Modifies +field+ in place and returns it; nil is returned as is.
    def cleanup_quotes(field, quote)
      return field if field.nil?

      # return if field !~ /#{quote}/ # this check can probably eliminated

      if field.start_with?(quote) && field.end_with?(quote)
        field.delete_prefix!(quote)
        field.delete_suffix!(quote)
      end
      field.gsub!("#{quote}#{quote}", quote)
      field
    end
  end
end
|
@@ -14,10 +14,8 @@ module SmarterCSV
|
|
14
14
|
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
|
15
15
|
options = process_options(given_options)
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
@file_line_count = 0
|
20
|
-
@csv_line_count = 0
|
17
|
+
initialize_variables
|
18
|
+
|
21
19
|
has_rails = !!defined?(Rails)
|
22
20
|
begin
|
23
21
|
fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
|
@@ -33,13 +31,14 @@ module SmarterCSV
|
|
33
31
|
|
34
32
|
skip_lines(fh, options)
|
35
33
|
|
36
|
-
|
34
|
+
@headers, header_size = process_headers(fh, options)
|
35
|
+
@headerA = @headers # @headerA is deprecated, use @headers
|
37
36
|
|
38
37
|
# in case we use chunking.. we'll need to set it up..
|
39
|
-
if
|
38
|
+
if options[:chunk_size].to_i > 0
|
40
39
|
use_chunks = true
|
41
40
|
chunk_size = options[:chunk_size].to_i
|
42
|
-
chunk_count = 0
|
41
|
+
@chunk_count = 0
|
43
42
|
chunk = []
|
44
43
|
else
|
45
44
|
use_chunks = false
|
@@ -78,7 +77,7 @@ module SmarterCSV
|
|
78
77
|
# if all values are blank, then ignore this line
|
79
78
|
next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
|
80
79
|
|
81
|
-
hash =
|
80
|
+
hash = @headers.zip(dataA).to_h
|
82
81
|
|
83
82
|
# make sure we delete any key/value pairs from the hash, which the user wanted to delete:
|
84
83
|
hash.delete(nil)
|
@@ -95,7 +94,7 @@ module SmarterCSV
|
|
95
94
|
if options[:convert_values_to_numeric]
|
96
95
|
hash.each do |k, v|
|
97
96
|
# deal with the :only / :except options to :convert_values_to_numeric
|
98
|
-
next if
|
97
|
+
next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
|
99
98
|
|
100
99
|
# convert if it's a numeric value:
|
101
100
|
case v
|
@@ -128,9 +127,9 @@ module SmarterCSV
|
|
128
127
|
if block_given?
|
129
128
|
yield chunk # do something with the hashes in the chunk in the block
|
130
129
|
else
|
131
|
-
result << chunk # not sure yet, why anybody would want to do this without a block
|
130
|
+
@result << chunk # not sure yet, why anybody would want to do this without a block
|
132
131
|
end
|
133
|
-
chunk_count += 1
|
132
|
+
@chunk_count += 1
|
134
133
|
chunk = [] # initialize for next chunk of data
|
135
134
|
else
|
136
135
|
|
@@ -144,7 +143,7 @@ module SmarterCSV
|
|
144
143
|
if block_given?
|
145
144
|
yield [hash] # do something with the hash in the block (better to use chunking here)
|
146
145
|
else
|
147
|
-
result << hash
|
146
|
+
@result << hash
|
148
147
|
end
|
149
148
|
end
|
150
149
|
end
|
@@ -158,34 +157,23 @@ module SmarterCSV
|
|
158
157
|
if block_given?
|
159
158
|
yield chunk # do something with the hashes in the chunk in the block
|
160
159
|
else
|
161
|
-
result << chunk # not sure yet, why anybody would want to do this without a block
|
160
|
+
@result << chunk # not sure yet, why anybody would want to do this without a block
|
162
161
|
end
|
163
|
-
chunk_count += 1
|
162
|
+
@chunk_count += 1
|
164
163
|
# chunk = [] # initialize for next chunk of data
|
165
164
|
end
|
166
165
|
ensure
|
167
166
|
fh.close if fh.respond_to?(:close)
|
168
167
|
end
|
168
|
+
|
169
169
|
if block_given?
|
170
|
-
chunk_count # when we do processing through a block we only care how many chunks we processed
|
170
|
+
@chunk_count # when we do processing through a block we only care how many chunks we processed
|
171
171
|
else
|
172
|
-
result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
172
|
+
@result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
173
173
|
end
|
174
174
|
end
|
175
175
|
|
176
176
|
class << self
|
177
|
-
def has_acceleration?
|
178
|
-
@has_acceleration ||= !!defined?(parse_csv_line_c)
|
179
|
-
end
|
180
|
-
|
181
|
-
def raw_header
|
182
|
-
@raw_header
|
183
|
-
end
|
184
|
-
|
185
|
-
def headers
|
186
|
-
@headers
|
187
|
-
end
|
188
|
-
|
189
177
|
# * the `scan` method iterates through the string and finds all occurrences of the pattern
|
190
178
|
# * The regular expression:
|
191
179
|
# - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
|
@@ -198,111 +186,22 @@ module SmarterCSV
|
|
198
186
|
line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
|
199
187
|
end
|
200
188
|
|
201
|
-
|
202
|
-
|
203
|
-
def readline_with_counts(filehandle, options)
|
204
|
-
line = filehandle.readline(options[:row_sep])
|
205
|
-
@file_line_count += 1
|
206
|
-
@csv_line_count += 1
|
207
|
-
line = remove_bom(line) if @csv_line_count == 1
|
208
|
-
line
|
209
|
-
end
|
210
|
-
|
211
|
-
def skip_lines(filehandle, options)
|
212
|
-
return unless options[:skip_lines].to_i > 0
|
213
|
-
|
214
|
-
options[:skip_lines].to_i.times do
|
215
|
-
readline_with_counts(filehandle, options)
|
216
|
-
end
|
217
|
-
end
|
218
|
-
|
219
|
-
def rewind(filehandle)
|
220
|
-
@file_line_count = 0
|
221
|
-
@csv_line_count = 0
|
222
|
-
filehandle.rewind
|
189
|
+
def has_acceleration?
|
190
|
+
@has_acceleration ||= !!defined?(parse_csv_line_c)
|
223
191
|
end
|
224
192
|
|
225
|
-
|
226
|
-
### Thin wrapper around C-extension
|
227
|
-
###
|
228
|
-
def parse(line, options, header_size = nil)
|
229
|
-
# puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
|
230
|
-
|
231
|
-
if options[:acceleration] && has_acceleration?
|
232
|
-
# :nocov:
|
233
|
-
has_quotes = line =~ /#{options[:quote_char]}/
|
234
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
|
235
|
-
elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
|
236
|
-
[elements, elements.size]
|
237
|
-
# :nocov:
|
238
|
-
else
|
239
|
-
# puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
|
240
|
-
parse_csv_line_ruby(line, options, header_size)
|
241
|
-
end
|
242
|
-
end
|
193
|
+
protected
|
243
194
|
|
244
|
-
#
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
# - works with multi-char col_sep
|
252
|
-
# - if header_size is given, only up to header_size fields are parsed
|
253
|
-
#
|
254
|
-
# We use header_size for parsing the body lines to make sure we always match the number of headers
|
255
|
-
# in case there are trailing col_sep characters in line
|
256
|
-
#
|
257
|
-
# Our convention is that empty fields are returned as empty strings, not as nil.
|
258
|
-
#
|
259
|
-
#
|
260
|
-
# the purpose of the max_size parameter is to handle a corner case where
|
261
|
-
# CSV lines contain more fields than the header.
|
262
|
-
# In which case the remaining fields in the line are ignored
|
263
|
-
#
|
264
|
-
def parse_csv_line_ruby(line, options, header_size = nil)
|
265
|
-
return [] if line.nil?
|
266
|
-
|
267
|
-
line_size = line.size
|
268
|
-
col_sep = options[:col_sep]
|
269
|
-
col_sep_size = col_sep.size
|
270
|
-
quote = options[:quote_char]
|
271
|
-
quote_count = 0
|
272
|
-
elements = []
|
273
|
-
start = 0
|
274
|
-
i = 0
|
275
|
-
|
276
|
-
previous_char = ''
|
277
|
-
while i < line_size
|
278
|
-
if line[i...i+col_sep_size] == col_sep && quote_count.even?
|
279
|
-
break if !header_size.nil? && elements.size >= header_size
|
280
|
-
|
281
|
-
elements << cleanup_quotes(line[start...i], quote)
|
282
|
-
previous_char = line[i]
|
283
|
-
i += col_sep.size
|
284
|
-
start = i
|
285
|
-
else
|
286
|
-
quote_count += 1 if line[i] == quote && previous_char != '\\'
|
287
|
-
previous_char = line[i]
|
288
|
-
i += 1
|
195
|
+
# acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
|
196
|
+
def limit_execution_for_only_or_except(options, option_name, key)
|
197
|
+
if options[option_name].is_a?(Hash)
|
198
|
+
if options[option_name].has_key?(:except)
|
199
|
+
return true if Array(options[option_name][:except]).include?(key)
|
200
|
+
elsif options[option_name].has_key?(:only)
|
201
|
+
return true unless Array(options[option_name][:only]).include?(key)
|
289
202
|
end
|
290
203
|
end
|
291
|
-
|
292
|
-
[elements, elements.size]
|
293
|
-
end
|
294
|
-
|
295
|
-
def cleanup_quotes(field, quote)
|
296
|
-
return field if field.nil?
|
297
|
-
|
298
|
-
# return if field !~ /#{quote}/ # this check can probably eliminated
|
299
|
-
|
300
|
-
if field.start_with?(quote) && field.end_with?(quote)
|
301
|
-
field.delete_prefix!(quote)
|
302
|
-
field.delete_suffix!(quote)
|
303
|
-
end
|
304
|
-
field.gsub!("#{quote}#{quote}", quote)
|
305
|
-
field
|
204
|
+
false
|
306
205
|
end
|
307
206
|
|
308
207
|
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
|
@@ -340,217 +239,5 @@ module SmarterCSV
|
|
340
239
|
false
|
341
240
|
end
|
342
241
|
end
|
343
|
-
|
344
|
-
# acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
|
345
|
-
def only_or_except_limit_execution(options, option_name, key)
|
346
|
-
if options[option_name].is_a?(Hash)
|
347
|
-
if options[option_name].has_key?(:except)
|
348
|
-
return true if Array(options[option_name][:except]).include?(key)
|
349
|
-
elsif options[option_name].has_key?(:only)
|
350
|
-
return true unless Array(options[option_name][:only]).include?(key)
|
351
|
-
end
|
352
|
-
end
|
353
|
-
false
|
354
|
-
end
|
355
|
-
|
356
|
-
# If file has headers, then guesses column separator from headers.
|
357
|
-
# Otherwise guesses column separator from contents.
|
358
|
-
# Raises exception if none is found.
|
359
|
-
def guess_column_separator(filehandle, options)
|
360
|
-
skip_lines(filehandle, options)
|
361
|
-
|
362
|
-
delimiters = [',', "\t", ';', ':', '|']
|
363
|
-
|
364
|
-
line = nil
|
365
|
-
has_header = options[:headers_in_file]
|
366
|
-
candidates = Hash.new(0)
|
367
|
-
count = has_header ? 1 : 5
|
368
|
-
count.times do
|
369
|
-
line = readline_with_counts(filehandle, options)
|
370
|
-
delimiters.each do |d|
|
371
|
-
candidates[d] += line.scan(d).count
|
372
|
-
end
|
373
|
-
rescue EOFError # short files
|
374
|
-
break
|
375
|
-
end
|
376
|
-
rewind(filehandle)
|
377
|
-
|
378
|
-
if candidates.values.max == 0
|
379
|
-
# if the header only contains
|
380
|
-
return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
|
381
|
-
|
382
|
-
raise SmarterCSV::NoColSepDetected
|
383
|
-
end
|
384
|
-
|
385
|
-
candidates.key(candidates.values.max)
|
386
|
-
end
|
387
|
-
|
388
|
-
# limitation: this currently reads the whole file in before making a decision
|
389
|
-
def guess_line_ending(filehandle, options)
|
390
|
-
counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
|
391
|
-
quoted_char = false
|
392
|
-
|
393
|
-
# count how many of the pre-defined line-endings we find
|
394
|
-
# ignoring those contained within quote characters
|
395
|
-
last_char = nil
|
396
|
-
lines = 0
|
397
|
-
filehandle.each_char do |c|
|
398
|
-
quoted_char = !quoted_char if c == options[:quote_char]
|
399
|
-
next if quoted_char
|
400
|
-
|
401
|
-
if last_char == "\r"
|
402
|
-
if c == "\n"
|
403
|
-
counts["\r\n"] += 1
|
404
|
-
else
|
405
|
-
counts["\r"] += 1 # \r are counted after they appeared
|
406
|
-
end
|
407
|
-
elsif c == "\n"
|
408
|
-
counts["\n"] += 1
|
409
|
-
end
|
410
|
-
last_char = c
|
411
|
-
lines += 1
|
412
|
-
break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
|
413
|
-
end
|
414
|
-
rewind(filehandle)
|
415
|
-
|
416
|
-
counts["\r"] += 1 if last_char == "\r"
|
417
|
-
# find the most frequent key/value pair:
|
418
|
-
most_frequent_key, _count = counts.max_by{|_, v| v}
|
419
|
-
most_frequent_key
|
420
|
-
end
|
421
|
-
|
422
|
-
def process_headers(filehandle, options)
|
423
|
-
@raw_header = nil
|
424
|
-
@headers = nil
|
425
|
-
if options[:headers_in_file] # extract the header line
|
426
|
-
# process the header line in the CSV file..
|
427
|
-
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
|
428
|
-
header = readline_with_counts(filehandle, options)
|
429
|
-
@raw_header = header
|
430
|
-
|
431
|
-
header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
432
|
-
header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
|
433
|
-
header = header.chomp(options[:row_sep])
|
434
|
-
|
435
|
-
header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
|
436
|
-
|
437
|
-
file_headerA, file_header_size = parse(header, options)
|
438
|
-
|
439
|
-
file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
|
440
|
-
file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
|
441
|
-
|
442
|
-
unless options[:keep_original_headers]
|
443
|
-
file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
|
444
|
-
file_headerA.map!{|x| x.downcase} if options[:downcase_header]
|
445
|
-
end
|
446
|
-
else
|
447
|
-
raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
|
448
|
-
end
|
449
|
-
if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
|
450
|
-
# use user-provided headers
|
451
|
-
headerA = options[:user_provided_headers]
|
452
|
-
if defined?(file_header_size) && !file_header_size.nil?
|
453
|
-
if headerA.size != file_header_size
|
454
|
-
raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
|
455
|
-
else
|
456
|
-
# we could print out the mapping of file_headerA to headerA here
|
457
|
-
end
|
458
|
-
end
|
459
|
-
else
|
460
|
-
headerA = file_headerA
|
461
|
-
end
|
462
|
-
|
463
|
-
# detect duplicate headers and disambiguate
|
464
|
-
headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
|
465
|
-
header_size = headerA.size # used for splitting lines
|
466
|
-
|
467
|
-
headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
|
468
|
-
|
469
|
-
unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
|
470
|
-
key_mappingH = options[:key_mapping]
|
471
|
-
|
472
|
-
# do some key mapping on the keys in the file header
|
473
|
-
# if you want to completely delete a key, then map it to nil or to ''
|
474
|
-
if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
|
475
|
-
# if silence_missing_keys are not set, raise error if missing header
|
476
|
-
missing_keys = key_mappingH.keys - headerA
|
477
|
-
# if the user passes a list of speciffic mapped keys that are optional
|
478
|
-
missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
|
479
|
-
|
480
|
-
unless missing_keys.empty? || options[:silence_missing_keys] == true
|
481
|
-
raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
|
482
|
-
end
|
483
|
-
|
484
|
-
headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
|
485
|
-
end
|
486
|
-
end
|
487
|
-
|
488
|
-
# header_validations
|
489
|
-
duplicate_headers = []
|
490
|
-
headerA.compact.each do |k|
|
491
|
-
duplicate_headers << k if headerA.select{|x| x == k}.size > 1
|
492
|
-
end
|
493
|
-
|
494
|
-
unless options[:user_provided_headers] || duplicate_headers.empty?
|
495
|
-
raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
|
496
|
-
end
|
497
|
-
|
498
|
-
# deprecate required_headers
|
499
|
-
unless options[:required_headers].nil?
|
500
|
-
puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
|
501
|
-
if options[:required_keys].nil?
|
502
|
-
options[:required_keys] = options[:required_headers]
|
503
|
-
options[:required_headers] = nil
|
504
|
-
end
|
505
|
-
end
|
506
|
-
|
507
|
-
if options[:required_keys] && options[:required_keys].is_a?(Array)
|
508
|
-
missing_keys = []
|
509
|
-
options[:required_keys].each do |k|
|
510
|
-
missing_keys << k unless headerA.include?(k)
|
511
|
-
end
|
512
|
-
raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
|
513
|
-
end
|
514
|
-
|
515
|
-
@headers = headerA
|
516
|
-
[headerA, header_size]
|
517
|
-
end
|
518
|
-
|
519
|
-
# Makes repeated header names unique.
#
# The first occurrence of a name is returned unchanged. Every later occurrence
# is replaced by the name, options[:duplicate_header_suffix] and the running
# occurrence number joined together (e.g. "a", "a_2", "a_3" for suffix "_").
#
# headers - list of header names, in column order
# options - hash; only :duplicate_header_suffix is read, and only for repeats
#
# Returns a new array with one entry per input header.
def process_duplicate_headers(headers, options)
  occurrences = Hash.new(0)

  headers.map do |name|
    seen = (occurrences[name] += 1)
    next name if seen == 1

    # Array#join keeps a nil suffix as an empty string and stringifies symbols.
    [name, options[:duplicate_header_suffix], seen].join
  end
end
|
532
|
-
|
533
|
-
private
|
534
|
-
|
535
|
-
# Byte-order marks written as lowercase hex strings without zero padding,
# i.e. in the format produced by Integer#to_s(16), so a 0x00 byte is "0".
UTF_32_BOM = %w[0 0 fe ff].freeze
UTF_32LE_BOM = %w[ff fe 0 0].freeze
UTF_8_BOM = %w[ef bb bf].freeze
UTF_16_BOM = %w[fe ff].freeze
UTF_16LE_BOM = %w[ff fe].freeze

# Strips a leading byte-order mark from str.
#
# str - the string to inspect (typically the first line read from a file)
#
# Returns str itself when it does not start with a possible BOM byte,
# otherwise a byteslice of str with the 4-, 3- or 2-byte BOM removed.
# When the first byte looks like a BOM start but no known BOM matches,
# a notice is printed to stdout and str is returned unchanged.
def remove_bom(str)
  # A BOM is at most four bytes long, so only the first four bytes are
  # hex-encoded instead of converting the whole string.
  leading_hex = str.byteslice(0, 4).bytes.map { |byte| byte.to_s(16) }

  # if string does not start with one of the bytes, there is no BOM
  return str unless %w[ef fe ff 0].include?(leading_hex[0])

  # The 4-byte marks are tested first: the UTF-32LE mark begins with the
  # same two bytes as the UTF-16LE mark.
  return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(leading_hex[0..3])
  return str.byteslice(3..-1) if leading_hex[0..2] == UTF_8_BOM
  return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(leading_hex[0..1])

  # :nocov:
  puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
  str
  # :nocov:
end
|
555
242
|
end
|
556
243
|
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
  class << self
    # Read-only accessors for the module-level state assigned in
    # initialize_variables.
    attr_reader :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings

    # Resets all module-level instance variables to their initial values:
    # counters to 0, errors/warnings to empty hashes, result and @headerA to
    # empty arrays, and headers/raw_header to nil.
    def initialize_variables
      @csv_line_count = 0
      @chunk_count = 0
      @errors = {}
      @file_line_count = 0
      @headerA = []
      @headers = nil
      @raw_header = nil # header as it appears in the file
      @result = []
      @warnings = {}
    end

    # :nocov:
    # Deprecated reader for @headerA. Emits a deprecation warning via Kernel#warn
    # on every call and points callers at the `headers` reader instead.
    def headerA
      warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
      @headerA
    end
    # :nocov:
  end
end
|
data/lib/smarter_csv/version.rb
CHANGED
data/lib/smarter_csv.rb
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require "core_ext/hash"
|
4
|
-
|
5
3
|
require "smarter_csv/version"
|
4
|
+
require "smarter_csv/file_io"
|
6
5
|
require "smarter_csv/options_processing"
|
6
|
+
require "smarter_csv/auto_detection"
|
7
|
+
require "smarter_csv/variables"
|
8
|
+
require "smarter_csv/headers"
|
9
|
+
require "smarter_csv/parse"
|
7
10
|
|
8
11
|
case RUBY_ENGINE
|
9
12
|
when 'ruby'
|
@@ -11,9 +14,11 @@ when 'ruby'
|
|
11
14
|
if `uname -s`.chomp == 'Darwin'
|
12
15
|
require 'smarter_csv/smarter_csv.bundle'
|
13
16
|
else
|
17
|
+
# :nocov:
|
14
18
|
require_relative "smarter_csv/smarter_csv"
|
19
|
+
# :nocov:
|
15
20
|
end
|
16
|
-
rescue Exception
|
21
|
+
rescue Exception # rubocop:disable Lint/RescueException
|
17
22
|
# require_relative 'smarter_csv/smarter_csv'
|
18
23
|
end
|
19
24
|
# :nocov:
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.9.2
|
4
|
+
version: 1.9.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tilo Sloboda
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-12-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|
@@ -115,10 +115,14 @@ files:
|
|
115
115
|
- TO_DO_v2.md
|
116
116
|
- ext/smarter_csv/extconf.rb
|
117
117
|
- ext/smarter_csv/smarter_csv.c
|
118
|
-
- lib/core_ext/hash.rb
|
119
118
|
- lib/smarter_csv.rb
|
119
|
+
- lib/smarter_csv/auto_detection.rb
|
120
|
+
- lib/smarter_csv/file_io.rb
|
121
|
+
- lib/smarter_csv/headers.rb
|
120
122
|
- lib/smarter_csv/options_processing.rb
|
123
|
+
- lib/smarter_csv/parse.rb
|
121
124
|
- lib/smarter_csv/smarter_csv.rb
|
125
|
+
- lib/smarter_csv/variables.rb
|
122
126
|
- lib/smarter_csv/version.rb
|
123
127
|
- smarter_csv.gemspec
|
124
128
|
homepage: https://github.com/tilo/smarter_csv
|