smarter_csv 1.9.2.pre01 → 1.9.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 75d9d441d771c2fbe0861e0bc5f84dbf05d010b4844a867fc4679df002822d07
4
- data.tar.gz: 0d5d37d4f2654fd354a2adac23019b2955540c1356c57f72052c01220598ffa2
3
+ metadata.gz: 5f35e10ff8bc0e79ff1ed9bea8e413f746f51128a6f6a9622d246873fd588366
4
+ data.tar.gz: 5cc30cf6f4422dd16f3019915bc5305a92aaaa4b99665e4c4c525d3bbf489cfd
5
5
  SHA512:
6
- metadata.gz: b24e2b09ea919994da347eb52b781868a19e0f28dc367bfeb43b8a254619ab8dd882d3035f0546683c2ebc893fd600ce05a3abb800e4b124c7369d314607ee3f
7
- data.tar.gz: 3a1115ac4937c2fedf469d1f45e3aa1cf7ed03f1f55d66f6cb310c767a3b5e8cb4966a19ba968f065c5d1de2d7074f479b5eeb0686dbab8012e5e6b8ed0f2628
6
+ metadata.gz: 057472a73ae0be95318b16428b276ecffba384a68479af715c5ec3ca7601405ca73928b0fbf245c9b3f46fd33b82a8c6d9c9e6330ddb0305b83ae23f58173df0
7
+ data.tar.gz: 319b12a53875c1963eed6d27aa67850135d33a5b3a9f70607e6d812906733b711ade6c3ee6e789d78c2e159004a879e59e700145224134745b16d279039ac38a
data/CHANGELOG.md CHANGED
@@ -1,9 +1,14 @@
1
1
 
2
2
  # SmarterCSV 1.x Change Log
3
3
 
4
- ## 1.9.2.pre01 (2023-11-11)
5
- * fixed bug with '\\' at end of line (issue #252)
6
- * fixed require statements
4
+ ## 1.9.3 (2023-12-16)
5
+ * raise SmarterCSV::IncorrectOption when `user_provided_headers` are empty
6
+ * code refactor / no functional changes
7
+ * added test cases
8
+
9
+ ## 1.9.2 (2023-11-12)
10
+ * fixed bug with '\\' at end of line (issue #252, thanks to averycrespi-moz)
11
+ * fixed require statements (issue #249, thanks to PikachuEXE, courtsimas)
7
12
 
8
13
  ## 1.9.1 (2023-10-30) (YANKED)
9
14
  * yanked
data/README.md CHANGED
@@ -300,7 +300,7 @@ And header and data validations will also be supported in 2.x
300
300
  | Option | Default | Explanation |
301
301
  ---------------------------------------------------------------------------------------------------------------------------------
302
302
  | :key_mapping | nil | a hash which maps headers from the CSV file to keys in the result hash |
303
- | :silence_missing_key | false | ignore missing keys in `key_mapping` |
303
+ | :silence_missing_keys | false | ignore missing keys in `key_mapping` |
304
304
  | | | if set to true: makes all mapped keys optional |
305
305
  | | | if given an array, makes only the keys listed in it optional |
306
306
  | :required_keys | nil | An array. Specify the required names AFTER header transformation. |
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ protected
6
+
7
+ # If file has headers, then guesses column separator from headers.
8
+ # Otherwise guesses column separator from contents.
9
+ # Raises exception if none is found.
10
+ def guess_column_separator(filehandle, options)
11
+ skip_lines(filehandle, options)
12
+
13
+ delimiters = [',', "\t", ';', ':', '|']
14
+
15
+ line = nil
16
+ has_header = options[:headers_in_file]
17
+ candidates = Hash.new(0)
18
+ count = has_header ? 1 : 5
19
+ count.times do
20
+ line = readline_with_counts(filehandle, options)
21
+ delimiters.each do |d|
22
+ candidates[d] += line.scan(d).count
23
+ end
24
+ rescue EOFError # short files
25
+ break
26
+ end
27
+ rewind(filehandle)
28
+
29
+ if candidates.values.max == 0
30
+ # if the line only contains word characters (no delimiter found, i.e. a single column), fall back to ','
31
+ return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
32
+
33
+ raise SmarterCSV::NoColSepDetected
34
+ end
35
+
36
+ candidates.key(candidates.values.max)
37
+ end
38
+
39
+ # limitation: unless options[:auto_row_sep_chars] is a positive number, this reads the whole file in before making a decision
40
+ def guess_line_ending(filehandle, options)
41
+ counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
42
+ quoted_char = false
43
+
44
+ # count how many of the pre-defined line-endings we find
45
+ # ignoring those contained within quote characters
46
+ last_char = nil
47
+ lines = 0
48
+ filehandle.each_char do |c|
49
+ quoted_char = !quoted_char if c == options[:quote_char]
50
+ next if quoted_char
51
+
52
+ if last_char == "\r"
53
+ if c == "\n"
54
+ counts["\r\n"] += 1
55
+ else
56
+ counts["\r"] += 1 # \r are counted after they appeared
57
+ end
58
+ elsif c == "\n"
59
+ counts["\n"] += 1
60
+ end
61
+ last_char = c
62
+ lines += 1
63
+ break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
64
+ end
65
+ rewind(filehandle)
66
+
67
+ counts["\r"] += 1 if last_char == "\r"
68
+ # find the most frequent key/value pair:
69
+ most_frequent_key, _count = counts.max_by{|_, v| v}
70
+ most_frequent_key
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ protected
6
+
7
+ def readline_with_counts(filehandle, options)
8
+ line = filehandle.readline(options[:row_sep])
9
+ @file_line_count += 1
10
+ @csv_line_count += 1
11
+ line = remove_bom(line) if @csv_line_count == 1
12
+ line
13
+ end
14
+
15
+ def skip_lines(filehandle, options)
16
+ options[:skip_lines].to_i.times do
17
+ readline_with_counts(filehandle, options)
18
+ end
19
+ end
20
+
21
+ def rewind(filehandle)
22
+ @file_line_count = 0
23
+ @csv_line_count = 0
24
+ filehandle.rewind
25
+ end
26
+
27
+ private
28
+
29
+ UTF_32_BOM = %w[0 0 fe ff].freeze
30
+ UTF_32LE_BOM = %w[ff fe 0 0].freeze
31
+ UTF_8_BOM = %w[ef bb bf].freeze
32
+ UTF_16_BOM = %w[fe ff].freeze
33
+ UTF_16LE_BOM = %w[ff fe].freeze
34
+
35
+ def remove_bom(str)
36
+ str_as_hex = str.bytes.map{|x| x.to_s(16)}
37
+ # if string does not start with one of the bytes, there is no BOM
38
+ return str unless %w[ef fe ff 0].include?(str_as_hex[0])
39
+
40
+ return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
41
+ return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
42
+ return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
43
+
44
+ # :nocov:
45
+ puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
46
+ str
47
+ # :nocov:
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ def process_headers(filehandle, options)
6
+ @raw_header = nil # header as it appears in the file
7
+ @headers = nil # the processed headers
8
+ header_array = []
9
+ file_header_size = nil
10
+
11
+ # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
12
+ if options[:headers_in_file] # extract the header line
13
+ # process the header line in the CSV file..
14
+ # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
15
+ header_line = @raw_header = readline_with_counts(filehandle, options)
16
+ header_line = preprocess_header_line(header_line, options)
17
+ file_header_array, file_header_size = parse_and_modify_headers(header_line, options)
18
+ else
19
+ unless options[:user_provided_headers]
20
+ raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
21
+ end
22
+ end
23
+
24
+ if options[:user_provided_headers]
25
+ unless options[:user_provided_headers].is_a?(Array) && !options[:user_provided_headers].empty?
26
+ raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
27
+ end
28
+
29
+ # use user-provided headers
30
+ user_header_array = options[:user_provided_headers]
31
+ # user_provided_headers: their count should match the headers_in_file if any
32
+ if defined?(file_header_size) && !file_header_size.nil?
33
+ if user_header_array.size != file_header_size
34
+ raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers != CSV-file has #{file_header_size} headers"
35
+ else
36
+ # we could print out the mapping of file_header_array to header_array here
37
+ end
38
+ end
39
+ header_array = user_header_array
40
+ else
41
+ header_array = file_header_array
42
+ end
43
+
44
+ # detect duplicate headers and disambiguate
45
+ header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]
46
+
47
+ # symbolize headers
48
+ header_array.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
49
+
50
+ # wouldn't make sense to re-map user provided headers
51
+ header_array = remap_headers(header_array, options) if options[:key_mapping] && !options[:user_provided_headers]
52
+
53
+ validate_and_deprecate_headers(header_array, options)
54
+
55
+ [header_array, header_array.size]
56
+ end
57
+
58
+ private
59
+
60
+ def preprocess_header_line(header_line, options)
61
+ header_line = enforce_utf8_encoding(header_line, options)
62
+ header_line = remove_comments_from_header(header_line, options)
63
+ header_line = header_line.chomp(options[:row_sep])
64
+ header_line.gsub!(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
65
+ header_line
66
+ end
67
+
68
+ def parse_and_modify_headers(header_line, options)
69
+ file_header_array, file_header_size = parse(header_line, options)
70
+
71
+ file_header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
72
+ file_header_array.map!{|x| x.strip} if options[:strip_whitespace]
73
+
74
+ unless options[:keep_original_headers]
75
+ file_header_array.map!{|x| x.gsub(/\s+|-+/, '_')}
76
+ file_header_array.map!{|x| x.downcase} if options[:downcase_header]
77
+ end
78
+ [file_header_array, file_header_size]
79
+ end
80
+
81
+ def disambiguate_headers(headers, options)
82
+ counts = Hash.new(0)
83
+ headers.map do |header|
84
+ counts[header] += 1
85
+ counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
86
+ end
87
+ end
88
+
89
+ # do some key mapping on the keys in the file header
90
+ # if you want to completely delete a key, then map it to nil or to ''
91
+ def remap_headers(headers, options)
92
+ key_mapping = options[:key_mapping]
93
+ if key_mapping.empty? || !key_mapping.is_a?(Hash) || key_mapping.keys.empty?
94
+ raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
95
+ end
96
+
97
+ key_mapping = options[:key_mapping]
98
+ # if silence_missing_keys are not set, raise error if missing header
99
+ missing_keys = key_mapping.keys - headers
100
+ # if the user passes a list of specific mapped keys that are optional
101
+ missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
102
+
103
+ unless missing_keys.empty? || options[:silence_missing_keys] == true
104
+ raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
105
+ end
106
+
107
+ headers.map! do |header|
108
+ if key_mapping.has_key?(header)
109
+ key_mapping[header].nil? ? nil : key_mapping[header]
110
+ elsif options[:remove_unmapped_keys]
111
+ nil
112
+ else
113
+ header
114
+ end
115
+ end
116
+ headers
117
+ end
118
+
119
+ # header_validations
120
+ def validate_and_deprecate_headers(headers, options)
121
+ duplicate_headers = []
122
+ headers.compact.each do |k|
123
+ duplicate_headers << k if headers.select{|x| x == k}.size > 1
124
+ end
125
+
126
+ unless options[:user_provided_headers] || duplicate_headers.empty?
127
+ raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
128
+ end
129
+
130
+ # deprecate required_headers
131
+ unless options[:required_headers].nil?
132
+ puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
133
+ if options[:required_keys].nil?
134
+ options[:required_keys] = options[:required_headers]
135
+ options[:required_headers] = nil
136
+ end
137
+ end
138
+
139
+ if options[:required_keys] && options[:required_keys].is_a?(Array)
140
+ missing_keys = []
141
+ options[:required_keys].each do |k|
142
+ missing_keys << k unless headers.include?(k)
143
+ end
144
+ raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
145
+ end
146
+ end
147
+
148
+ def enforce_utf8_encoding(header, options)
149
+ return header unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
150
+
151
+ header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
152
+ end
153
+
154
+ def remove_comments_from_header(header, options)
155
+ return header unless options[:comment_regexp]
156
+
157
+ header.sub(options[:comment_regexp], '')
158
+ end
159
+ end
160
+ end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ protected
6
+
7
+ ###
8
+ ### Thin wrapper around C-extension
9
+ ###
10
+ def parse(line, options, header_size = nil)
11
+ # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
12
+
13
+ if options[:acceleration] && has_acceleration?
14
+ # :nocov:
15
+ has_quotes = line =~ /#{options[:quote_char]}/
16
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
17
+ elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
18
+ [elements, elements.size]
19
+ # :nocov:
20
+ else
21
+ # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
22
+ parse_csv_line_ruby(line, options, header_size)
23
+ end
24
+ end
25
+
26
+ # ------------------------------------------------------------------
27
+ # Ruby equivalent of the C-extension for parse_line
28
+ #
29
+ # parses a single line: either a CSV header or a body line
30
+ # - quoting rules compared to RFC-4180 are somewhat relaxed
31
+ # - we are not assuming that quotes inside a fields need to be doubled
32
+ # - we are not assuming that all fields need to be quoted (0 is even)
33
+ # - works with multi-char col_sep
34
+ # - if header_size is given, only up to header_size fields are parsed
35
+ #
36
+ # We use header_size for parsing the body lines to make sure we always match the number of headers
37
+ # in case there are trailing col_sep characters in line
38
+ #
39
+ # Our convention is that empty fields are returned as empty strings, not as nil.
40
+ #
41
+ #
42
+ # the purpose of the header_size parameter is to handle a corner case where
43
+ # CSV lines contain more fields than the header.
44
+ # In which case the remaining fields in the line are ignored
45
+ #
46
+ def parse_csv_line_ruby(line, options, header_size = nil)
47
+ return [] if line.nil?
48
+
49
+ line_size = line.size
50
+ col_sep = options[:col_sep]
51
+ col_sep_size = col_sep.size
52
+ quote = options[:quote_char]
53
+ quote_count = 0
54
+ elements = []
55
+ start = 0
56
+ i = 0
57
+
58
+ previous_char = ''
59
+ while i < line_size
60
+ if line[i...i+col_sep_size] == col_sep && quote_count.even?
61
+ break if !header_size.nil? && elements.size >= header_size
62
+
63
+ elements << cleanup_quotes(line[start...i], quote)
64
+ previous_char = line[i]
65
+ i += col_sep.size
66
+ start = i
67
+ else
68
+ quote_count += 1 if line[i] == quote && previous_char != '\\'
69
+ previous_char = line[i]
70
+ i += 1
71
+ end
72
+ end
73
+ elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
74
+ [elements, elements.size]
75
+ end
76
+
77
+ def cleanup_quotes(field, quote)
78
+ return field if field.nil?
79
+
80
+ # return if field !~ /#{quote}/ # this check can probably be eliminated
81
+
82
+ if field.start_with?(quote) && field.end_with?(quote)
83
+ field.delete_prefix!(quote)
84
+ field.delete_suffix!(quote)
85
+ end
86
+ field.gsub!("#{quote}#{quote}", quote)
87
+ field
88
+ end
89
+ end
90
+ end
@@ -14,10 +14,8 @@ module SmarterCSV
14
14
  def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
15
  options = process_options(given_options)
16
16
 
17
- headerA = []
18
- result = []
19
- @file_line_count = 0
20
- @csv_line_count = 0
17
+ initialize_variables
18
+
21
19
  has_rails = !!defined?(Rails)
22
20
  begin
23
21
  fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
@@ -33,13 +31,14 @@ module SmarterCSV
33
31
 
34
32
  skip_lines(fh, options)
35
33
 
36
- headerA, header_size = process_headers(fh, options)
34
+ @headers, header_size = process_headers(fh, options)
35
+ @headerA = @headers # @headerA is deprecated, use @headers
37
36
 
38
37
  # in case we use chunking.. we'll need to set it up..
39
- if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
38
+ if options[:chunk_size].to_i > 0
40
39
  use_chunks = true
41
40
  chunk_size = options[:chunk_size].to_i
42
- chunk_count = 0
41
+ @chunk_count = 0
43
42
  chunk = []
44
43
  else
45
44
  use_chunks = false
@@ -78,7 +77,7 @@ module SmarterCSV
78
77
  # if all values are blank, then ignore this line
79
78
  next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
80
79
 
81
- hash = Hash.zip(headerA, dataA) # from Facets of Ruby library
80
+ hash = @headers.zip(dataA).to_h
82
81
 
83
82
  # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
84
83
  hash.delete(nil)
@@ -95,7 +94,7 @@ module SmarterCSV
95
94
  if options[:convert_values_to_numeric]
96
95
  hash.each do |k, v|
97
96
  # deal with the :only / :except options to :convert_values_to_numeric
98
- next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
97
+ next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
99
98
 
100
99
  # convert if it's a numeric value:
101
100
  case v
@@ -128,9 +127,9 @@ module SmarterCSV
128
127
  if block_given?
129
128
  yield chunk # do something with the hashes in the chunk in the block
130
129
  else
131
- result << chunk # not sure yet, why anybody would want to do this without a block
130
+ @result << chunk # not sure yet, why anybody would want to do this without a block
132
131
  end
133
- chunk_count += 1
132
+ @chunk_count += 1
134
133
  chunk = [] # initialize for next chunk of data
135
134
  else
136
135
 
@@ -144,7 +143,7 @@ module SmarterCSV
144
143
  if block_given?
145
144
  yield [hash] # do something with the hash in the block (better to use chunking here)
146
145
  else
147
- result << hash
146
+ @result << hash
148
147
  end
149
148
  end
150
149
  end
@@ -158,34 +157,23 @@ module SmarterCSV
158
157
  if block_given?
159
158
  yield chunk # do something with the hashes in the chunk in the block
160
159
  else
161
- result << chunk # not sure yet, why anybody would want to do this without a block
160
+ @result << chunk # not sure yet, why anybody would want to do this without a block
162
161
  end
163
- chunk_count += 1
162
+ @chunk_count += 1
164
163
  # chunk = [] # initialize for next chunk of data
165
164
  end
166
165
  ensure
167
166
  fh.close if fh.respond_to?(:close)
168
167
  end
168
+
169
169
  if block_given?
170
- chunk_count # when we do processing through a block we only care how many chunks we processed
170
+ @chunk_count # when we do processing through a block we only care how many chunks we processed
171
171
  else
172
- result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
172
+ @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
173
173
  end
174
174
  end
175
175
 
176
176
  class << self
177
- def has_acceleration?
178
- @has_acceleration ||= !!defined?(parse_csv_line_c)
179
- end
180
-
181
- def raw_header
182
- @raw_header
183
- end
184
-
185
- def headers
186
- @headers
187
- end
188
-
189
177
  # * the `scan` method iterates through the string and finds all occurrences of the pattern
190
178
  # * The regular expression:
191
179
  # - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
@@ -198,111 +186,22 @@ module SmarterCSV
198
186
  line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
199
187
  end
200
188
 
201
- protected
202
-
203
- def readline_with_counts(filehandle, options)
204
- line = filehandle.readline(options[:row_sep])
205
- @file_line_count += 1
206
- @csv_line_count += 1
207
- line = remove_bom(line) if @csv_line_count == 1
208
- line
209
- end
210
-
211
- def skip_lines(filehandle, options)
212
- return unless options[:skip_lines].to_i > 0
213
-
214
- options[:skip_lines].to_i.times do
215
- readline_with_counts(filehandle, options)
216
- end
217
- end
218
-
219
- def rewind(filehandle)
220
- @file_line_count = 0
221
- @csv_line_count = 0
222
- filehandle.rewind
189
+ def has_acceleration?
190
+ @has_acceleration ||= !!defined?(parse_csv_line_c)
223
191
  end
224
192
 
225
- ###
226
- ### Thin wrapper around C-extension
227
- ###
228
- def parse(line, options, header_size = nil)
229
- # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
230
-
231
- if options[:acceleration] && has_acceleration?
232
- # :nocov:
233
- has_quotes = line =~ /#{options[:quote_char]}/
234
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
235
- elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
236
- [elements, elements.size]
237
- # :nocov:
238
- else
239
- # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
240
- parse_csv_line_ruby(line, options, header_size)
241
- end
242
- end
193
+ protected
243
194
 
244
- # ------------------------------------------------------------------
245
- # Ruby equivalent of the C-extension for parse_line
246
- #
247
- # parses a single line: either a CSV header and body line
248
- # - quoting rules compared to RFC-4180 are somewhat relaxed
249
- # - we are not assuming that quotes inside a fields need to be doubled
250
- # - we are not assuming that all fields need to be quoted (0 is even)
251
- # - works with multi-char col_sep
252
- # - if header_size is given, only up to header_size fields are parsed
253
- #
254
- # We use header_size for parsing the body lines to make sure we always match the number of headers
255
- # in case there are trailing col_sep characters in line
256
- #
257
- # Our convention is that empty fields are returned as empty strings, not as nil.
258
- #
259
- #
260
- # the purpose of the max_size parameter is to handle a corner case where
261
- # CSV lines contain more fields than the header.
262
- # In which case the remaining fields in the line are ignored
263
- #
264
- def parse_csv_line_ruby(line, options, header_size = nil)
265
- return [] if line.nil?
266
-
267
- line_size = line.size
268
- col_sep = options[:col_sep]
269
- col_sep_size = col_sep.size
270
- quote = options[:quote_char]
271
- quote_count = 0
272
- elements = []
273
- start = 0
274
- i = 0
275
-
276
- previous_char = ''
277
- while i < line_size
278
- if line[i...i+col_sep_size] == col_sep && quote_count.even?
279
- break if !header_size.nil? && elements.size >= header_size
280
-
281
- elements << cleanup_quotes(line[start...i], quote)
282
- previous_char = line[i]
283
- i += col_sep.size
284
- start = i
285
- else
286
- quote_count += 1 if line[i] == quote && previous_char != '\\'
287
- previous_char = line[i]
288
- i += 1
195
+ # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
196
+ def limit_execution_for_only_or_except(options, option_name, key)
197
+ if options[option_name].is_a?(Hash)
198
+ if options[option_name].has_key?(:except)
199
+ return true if Array(options[option_name][:except]).include?(key)
200
+ elsif options[option_name].has_key?(:only)
201
+ return true unless Array(options[option_name][:only]).include?(key)
289
202
  end
290
203
  end
291
- elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
292
- [elements, elements.size]
293
- end
294
-
295
- def cleanup_quotes(field, quote)
296
- return field if field.nil?
297
-
298
- # return if field !~ /#{quote}/ # this check can probably eliminated
299
-
300
- if field.start_with?(quote) && field.end_with?(quote)
301
- field.delete_prefix!(quote)
302
- field.delete_suffix!(quote)
303
- end
304
- field.gsub!("#{quote}#{quote}", quote)
305
- field
204
+ false
306
205
  end
307
206
 
308
207
  # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
@@ -340,217 +239,5 @@ module SmarterCSV
340
239
  false
341
240
  end
342
241
  end
343
-
344
- # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
345
- def only_or_except_limit_execution(options, option_name, key)
346
- if options[option_name].is_a?(Hash)
347
- if options[option_name].has_key?(:except)
348
- return true if Array(options[option_name][:except]).include?(key)
349
- elsif options[option_name].has_key?(:only)
350
- return true unless Array(options[option_name][:only]).include?(key)
351
- end
352
- end
353
- false
354
- end
355
-
356
- # If file has headers, then guesses column separator from headers.
357
- # Otherwise guesses column separator from contents.
358
- # Raises exception if none is found.
359
- def guess_column_separator(filehandle, options)
360
- skip_lines(filehandle, options)
361
-
362
- delimiters = [',', "\t", ';', ':', '|']
363
-
364
- line = nil
365
- has_header = options[:headers_in_file]
366
- candidates = Hash.new(0)
367
- count = has_header ? 1 : 5
368
- count.times do
369
- line = readline_with_counts(filehandle, options)
370
- delimiters.each do |d|
371
- candidates[d] += line.scan(d).count
372
- end
373
- rescue EOFError # short files
374
- break
375
- end
376
- rewind(filehandle)
377
-
378
- if candidates.values.max == 0
379
- # if the header only contains
380
- return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
381
-
382
- raise SmarterCSV::NoColSepDetected
383
- end
384
-
385
- candidates.key(candidates.values.max)
386
- end
387
-
388
- # limitation: this currently reads the whole file in before making a decision
389
- def guess_line_ending(filehandle, options)
390
- counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
391
- quoted_char = false
392
-
393
- # count how many of the pre-defined line-endings we find
394
- # ignoring those contained within quote characters
395
- last_char = nil
396
- lines = 0
397
- filehandle.each_char do |c|
398
- quoted_char = !quoted_char if c == options[:quote_char]
399
- next if quoted_char
400
-
401
- if last_char == "\r"
402
- if c == "\n"
403
- counts["\r\n"] += 1
404
- else
405
- counts["\r"] += 1 # \r are counted after they appeared
406
- end
407
- elsif c == "\n"
408
- counts["\n"] += 1
409
- end
410
- last_char = c
411
- lines += 1
412
- break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
413
- end
414
- rewind(filehandle)
415
-
416
- counts["\r"] += 1 if last_char == "\r"
417
- # find the most frequent key/value pair:
418
- most_frequent_key, _count = counts.max_by{|_, v| v}
419
- most_frequent_key
420
- end
421
-
422
- def process_headers(filehandle, options)
423
- @raw_header = nil
424
- @headers = nil
425
- if options[:headers_in_file] # extract the header line
426
- # process the header line in the CSV file..
427
- # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
428
- header = readline_with_counts(filehandle, options)
429
- @raw_header = header
430
-
431
- header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
432
- header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
433
- header = header.chomp(options[:row_sep])
434
-
435
- header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
436
-
437
- file_headerA, file_header_size = parse(header, options)
438
-
439
- file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
440
- file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
441
-
442
- unless options[:keep_original_headers]
443
- file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
444
- file_headerA.map!{|x| x.downcase} if options[:downcase_header]
445
- end
446
- else
447
- raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
448
- end
449
- if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
450
- # use user-provided headers
451
- headerA = options[:user_provided_headers]
452
- if defined?(file_header_size) && !file_header_size.nil?
453
- if headerA.size != file_header_size
454
- raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
455
- else
456
- # we could print out the mapping of file_headerA to headerA here
457
- end
458
- end
459
- else
460
- headerA = file_headerA
461
- end
462
-
463
- # detect duplicate headers and disambiguate
464
- headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
465
- header_size = headerA.size # used for splitting lines
466
-
467
- headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
468
-
469
- unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
470
- key_mappingH = options[:key_mapping]
471
-
472
- # do some key mapping on the keys in the file header
473
- # if you want to completely delete a key, then map it to nil or to ''
474
- if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
475
- # if silence_missing_keys are not set, raise error if missing header
476
- missing_keys = key_mappingH.keys - headerA
477
- # if the user passes a list of speciffic mapped keys that are optional
478
- missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
479
-
480
- unless missing_keys.empty? || options[:silence_missing_keys] == true
481
- raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
482
- end
483
-
484
- headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
485
- end
486
- end
487
-
488
- # header_validations
489
- duplicate_headers = []
490
- headerA.compact.each do |k|
491
- duplicate_headers << k if headerA.select{|x| x == k}.size > 1
492
- end
493
-
494
- unless options[:user_provided_headers] || duplicate_headers.empty?
495
- raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
496
- end
497
-
498
- # deprecate required_headers
499
- unless options[:required_headers].nil?
500
- puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
501
- if options[:required_keys].nil?
502
- options[:required_keys] = options[:required_headers]
503
- options[:required_headers] = nil
504
- end
505
- end
506
-
507
- if options[:required_keys] && options[:required_keys].is_a?(Array)
508
- missing_keys = []
509
- options[:required_keys].each do |k|
510
- missing_keys << k unless headerA.include?(k)
511
- end
512
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
513
- end
514
-
515
- @headers = headerA
516
- [headerA, header_size]
517
- end
518
-
519
- def process_duplicate_headers(headers, options)
520
- counts = Hash.new(0)
521
- result = []
522
- headers.each do |key|
523
- counts[key] += 1
524
- if counts[key] == 1
525
- result << key
526
- else
527
- result << [key, options[:duplicate_header_suffix], counts[key]].join
528
- end
529
- end
530
- result
531
- end
532
-
533
- private
534
-
535
- UTF_32_BOM = %w[0 0 fe ff].freeze
536
- UTF_32LE_BOM = %w[ff fe 0 0].freeze
537
- UTF_8_BOM = %w[ef bb bf].freeze
538
- UTF_16_BOM = %w[fe ff].freeze
539
- UTF_16LE_BOM = %w[ff fe].freeze
540
-
541
- def remove_bom(str)
542
- str_as_hex = str.bytes.map{|x| x.to_s(16)}
543
- # if string does not start with one of the bytes, there is no BOM
544
- return str unless %w[ef fe ff 0].include?(str_as_hex[0])
545
-
546
- return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
547
- return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
548
- return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
549
-
550
- # :nocov:
551
- puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
552
- str
553
- # :nocov:
554
- end
555
242
  end
556
243
  end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ attr_reader :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
+
7
+ def initialize_variables
8
+ @csv_line_count = 0
9
+ @chunk_count = 0
10
+ @errors = {}
11
+ @file_line_count = 0
12
+ @headerA = []
13
+ @headers = nil
14
+ @raw_header = nil # header as it appears in the file
15
+ @result = []
16
+ @warnings = {}
17
+ end
18
+
19
+ # :nocov:
20
+ def headerA
21
+ warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
22
+ @headerA
23
+ end
24
+ # :nocov:
25
+ end
26
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.9.2.pre01" # this is a pretty odd situation
4
+ VERSION = "1.9.3"
5
5
  end
data/lib/smarter_csv.rb CHANGED
@@ -1,9 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "core_ext/hash"
4
-
5
3
  require "smarter_csv/version"
4
+ require "smarter_csv/file_io"
6
5
  require "smarter_csv/options_processing"
6
+ require "smarter_csv/auto_detection"
7
+ require "smarter_csv/variables"
8
+ require "smarter_csv/headers"
9
+ require "smarter_csv/parse"
7
10
 
8
11
  case RUBY_ENGINE
9
12
  when 'ruby'
@@ -11,10 +14,12 @@ when 'ruby'
11
14
  if `uname -s`.chomp == 'Darwin'
12
15
  require 'smarter_csv/smarter_csv.bundle'
13
16
  else
17
+ # :nocov:
14
18
  require_relative "smarter_csv/smarter_csv"
19
+ # :nocov:
15
20
  end
16
- rescue Exception
17
- # require_relative 'smarter_csv/smarter_csv'
21
+ rescue Exception # rubocop:disable Lint/RescueException
22
+ # require_relative 'smarter_csv/smarter_csv'
18
23
  end
19
24
  # :nocov:
20
25
  # when 'truffleruby'
@@ -36,4 +41,3 @@ else
36
41
  end
37
42
  # :nocov:
38
43
  require "smarter_csv/smarter_csv"
39
-
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.2.pre01
4
+ version: 1.9.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-12 00:00:00.000000000 Z
11
+ date: 2023-12-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -115,10 +115,14 @@ files:
115
115
  - TO_DO_v2.md
116
116
  - ext/smarter_csv/extconf.rb
117
117
  - ext/smarter_csv/smarter_csv.c
118
- - lib/core_ext/hash.rb
119
118
  - lib/smarter_csv.rb
119
+ - lib/smarter_csv/auto_detection.rb
120
+ - lib/smarter_csv/file_io.rb
121
+ - lib/smarter_csv/headers.rb
120
122
  - lib/smarter_csv/options_processing.rb
123
+ - lib/smarter_csv/parse.rb
121
124
  - lib/smarter_csv/smarter_csv.rb
125
+ - lib/smarter_csv/variables.rb
122
126
  - lib/smarter_csv/version.rb
123
127
  - smarter_csv.gemspec
124
128
  homepage: https://github.com/tilo/smarter_csv
@@ -140,9 +144,9 @@ required_ruby_version: !ruby/object:Gem::Requirement
140
144
  version: 2.5.0
141
145
  required_rubygems_version: !ruby/object:Gem::Requirement
142
146
  requirements:
143
- - - ">"
147
+ - - ">="
144
148
  - !ruby/object:Gem::Version
145
- version: 1.3.1
149
+ version: '0'
146
150
  requirements: []
147
151
  rubygems_version: 3.2.3
148
152
  signing_key:
data/lib/core_ext/hash.rb DELETED
@@ -1,9 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # the following extension for class Hash is needed (from Facets of Ruby library):
4
-
5
- class Hash
6
- def self.zip(keys, values) # from Facets of Ruby library
7
- keys.zip(values).to_h
8
- end
9
- end