smarter_csv 1.15.2 → 1.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +4 -4
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +9 -0
  4. data/CHANGELOG.md +112 -1
  5. data/CONTRIBUTORS.md +4 -1
  6. data/Gemfile +1 -0
  7. data/README.md +129 -27
  8. data/docs/_introduction.md +45 -24
  9. data/docs/bad_row_quarantine.md +342 -0
  10. data/docs/basic_read_api.md +152 -9
  11. data/docs/basic_write_api.md +475 -59
  12. data/docs/batch_processing.md +162 -4
  13. data/docs/column_selection.md +184 -0
  14. data/docs/data_transformations.md +163 -29
  15. data/docs/examples.md +340 -46
  16. data/docs/header_transformations.md +94 -12
  17. data/docs/header_validations.md +57 -18
  18. data/docs/history.md +119 -0
  19. data/docs/instrumentation.md +166 -0
  20. data/docs/migrating_from_csv.md +565 -0
  21. data/docs/options.md +151 -87
  22. data/docs/parsing_strategy.md +64 -1
  23. data/docs/real_world_csv.md +263 -0
  24. data/docs/releases/1.16.0/benchmarks.md +223 -0
  25. data/docs/releases/1.16.0/changes.md +273 -0
  26. data/docs/releases/1.16.0/performance_notes.md +114 -0
  27. data/docs/row_col_sep.md +15 -5
  28. data/docs/ruby_csv_pitfalls.md +514 -0
  29. data/docs/value_converters.md +194 -57
  30. data/ext/smarter_csv/extconf.rb +3 -0
  31. data/ext/smarter_csv/smarter_csv.c +1017 -82
  32. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
  33. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
  34. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
  35. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
  36. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
  37. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
  38. data/lib/smarter_csv/errors.rb +8 -0
  39. data/lib/smarter_csv/file_io.rb +1 -1
  40. data/lib/smarter_csv/hash_transformations.rb +14 -13
  41. data/lib/smarter_csv/header_transformations.rb +21 -2
  42. data/lib/smarter_csv/headers.rb +2 -1
  43. data/lib/smarter_csv/options.rb +124 -7
  44. data/lib/smarter_csv/parser.rb +358 -74
  45. data/lib/smarter_csv/reader.rb +494 -46
  46. data/lib/smarter_csv/version.rb +1 -1
  47. data/lib/smarter_csv/writer.rb +71 -19
  48. data/lib/smarter_csv.rb +134 -13
  49. data/smarter_csv.gemspec +20 -10
  50. metadata +38 -80
@@ -1,7 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require 'set'
4
+
3
5
  module SmarterCSV
4
6
  class Reader
7
+ include Enumerable
8
+
9
+ # Default chunk size used by each_chunk when chunk_size is not explicitly set.
10
+ # A warning is emitted to STDERR so users know to configure it explicitly.
11
+ DEFAULT_CHUNK_SIZE = 100
12
+
5
13
  include ::SmarterCSV::Options
6
14
  include ::SmarterCSV::FileIO
7
15
  include ::SmarterCSV::AutoDetection
@@ -16,14 +24,12 @@ module SmarterCSV
16
24
  attr_reader :enforce_utf8, :has_rails, :has_acceleration
17
25
  attr_reader :errors, :warnings, :headers, :raw_header, :result
18
26
 
19
- # :nocov:
20
27
  # rubocop:disable Naming/MethodName
21
28
  def headerA
22
29
  warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
23
30
  @headerA
24
31
  end
25
32
  # rubocop:enable Naming/MethodName
26
- # :nocov:
27
33
 
28
34
  # first parameter: filename or input object which responds to readline method
29
35
  def initialize(input, given_options = {})
@@ -44,6 +50,58 @@ module SmarterCSV
44
50
  @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
45
51
  end
46
52
 
53
+ # Yields each successfully parsed row as a Hash.
54
+ # Ignores chunk_size — always row-by-row, enabling standard Enumerable usage.
55
+ # Returns an Enumerator when called without a block.
56
+ #
57
+ # Examples:
58
+ # reader.each { |hash| MyModel.upsert(hash) }
59
+ # reader.each_with_index { |hash, i| puts "Row #{i}: #{hash}" }
60
+ # reader.select { |h| h[:country] == "US" }
61
+ # reader.lazy.map { |h| h[:name] }.first(10)
62
def each
  # Without a block, hand back an Enumerator and leave @options untouched.
  # The restore logic below must NOT run on this path: a method-level `ensure`
  # would fire here too and overwrite the configured chunk_size with nil.
  return enum_for(:each) unless block_given?

  # Force row-by-row mode regardless of the chunk_size setting.
  original_chunk_size = @options[:chunk_size]
  @options[:chunk_size] = nil
  begin
    # In row-by-row mode process yields a one-element Array; unwrap it.
    process { |row_array, _| yield row_array.first }
  ensure
    # Always restore the caller's chunk_size, even if the block raises.
    @options[:chunk_size] = original_chunk_size
  end
end
72
+
73
+ # Yields each chunk as Array<Hash> plus its 0-based chunk index.
74
+ # Uses chunk_size from options; raises ArgumentError if chunk_size < 1.
75
+ # Returns an Enumerator when called without a block.
76
+ #
77
+ # Examples:
78
+ # reader = SmarterCSV::Reader.new("big.csv", chunk_size: 500)
79
+ # reader.each_chunk { |chunk, i| Sidekiq.push_bulk(chunk) }
80
+ # reader.each_chunk.with_index { |chunk, i| puts "Chunk #{i}: #{chunk.size} rows" }
81
def each_chunk
  return enum_for(:each_chunk) unless block_given?

  # Remember what the caller configured so it can be put back afterwards.
  configured_size = @options[:chunk_size]
  effective_size = configured_size

  if effective_size.nil?
    unless @options[:verbose] == :quiet
      warn "SmarterCSV: chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
    end
    effective_size = DEFAULT_CHUNK_SIZE
  end

  size_ok = effective_size.is_a?(Integer) && effective_size >= 1
  raise ArgumentError, "chunk_size must be an Integer >= 1 (got #{effective_size.inspect})" unless size_ok

  # Apply the effective size only for the duration of this iteration.
  @options[:chunk_size] = effective_size
  begin
    # Each chunk is dup'ed before being handed to the caller so the caller
    # gets its own Array rather than the one process passed to this block.
    process do |rows, chunk_index|
      yield rows.dup, chunk_index
    end
  ensure
    @options[:chunk_size] = configured_size
  end
end
104
+
47
105
  def process(&block) # rubocop:disable Lint/UnusedMethodArgument
48
106
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
49
107
  @verbose = options[:verbose]
@@ -52,7 +110,7 @@ module SmarterCSV
52
110
  fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
53
111
 
54
112
  if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
55
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
113
+ warn 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' unless options[:verbose] == :quiet
56
114
  end
57
115
 
58
116
  # auto-detect the row separator
@@ -66,10 +124,78 @@ module SmarterCSV
66
124
  @headers, _header_size = process_headers(fh, options)
67
125
  @headerA = @headers # @headerA is deprecated, use @headers
68
126
 
69
- puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
127
+ $stderr.puts "Effective headers:\n#{pp(@headers)}\n" if @verbose == :debug
70
128
 
71
129
  header_validations(@headers, options)
72
130
 
131
+ # Precompute column filter sets for only_headers / except_headers (O(1) lookup per row)
132
+ @only_headers_set = options[:only_headers] ? Set.new(options[:only_headers]) : nil
133
+ @except_headers_set = options[:except_headers] ? Set.new(options[:except_headers]) : nil
134
+
135
+ # Precompute column-filter bitmap for the C extension.
136
+ #
137
+ # The bitmap is a loop invariant — headers and filter settings never change between rows.
138
+ # We store it as a packed binary String so C can copy it with a single memcpy instead of
139
+ # N rb_ary_entry calls per row. early_exit_after and keep_extra_cols are pre-stored so
140
+ # C reads them with O(1) hash lookups rather than recomputing per row.
141
+ if @only_headers_set || @except_headers_set
142
+ keep_flags = @headers.map { |h| @only_headers_set ? @only_headers_set.include?(h) : !@except_headers_set.include?(h) }
143
+ options[:_keep_bitmap] = keep_flags.map { |f| f ? 1 : 0 }.pack('C*').freeze
144
+ options[:_keep_extra_cols] = @only_headers_set ? false : true
145
+ options[:_early_exit_after] = (@only_headers_set && !options[:strict]) ? (keep_flags.rindex(true) || -1) : -1
146
+ options[:_keep_cols] = nil # nil signals C: "filter active, check _keep_bitmap"
147
+ else
148
+ options[:_keep_cols] = false # sentinel: no filtering active — C skips all bitmap paths
149
+ # Do NOT insert _keep_bitmap/_keep_extra_cols/_early_exit_after when unused.
150
+ # Keeping the options hash as small as possible avoids hash table resize and
151
+ # keeps all 10 per-row rb_hash_aref lookups hitting the same cache lines.
152
+ end
153
+
154
+ # Precompute all hot-path strategy ivars once — eliminates per-row option lookups
155
+ # and method-dispatch overhead in the main loop.
156
+ #
157
+ # @quote_escaping_backslash / @quote_escaping_double may already exist if
158
+ # parse_with_auto_fallback ran during header parsing (lazily created there).
159
+ # Ensure they exist and carry the now-final _keep_cols (and bitmap keys only when active).
160
+ @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
161
+ @quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
162
+ @quote_escaping_backslash[:_keep_cols] = options[:_keep_cols]
163
+ @quote_escaping_double[:_keep_cols] = options[:_keep_cols]
164
+ if @only_headers_set || @except_headers_set
165
+ %i[_keep_bitmap _keep_extra_cols _early_exit_after].each do |k|
166
+ @quote_escaping_backslash[k] = options[k]
167
+ @quote_escaping_double[k] = options[k]
168
+ end
169
+ end
170
+
171
+ @quote_escaping_auto = options[:quote_escaping] == :auto
172
+ @use_acceleration = options[:acceleration] && has_acceleration
173
+
174
+ # The single options hash used on the hot path — for :auto we always try backslash
175
+ # first (C downgrades to RFC internally via Opt #5 when no backslash is found).
176
+ @hot_path_options = @quote_escaping_auto ? @quote_escaping_backslash : options
177
+
178
+ # Build ParseContext objects once after headers are known.
179
+ # Eliminates ~10 rb_hash_aref calls per row by pre-baking all loop-invariant
180
+ # options into a C struct accessed via direct pointer dereference.
181
+ if @use_acceleration
182
+ hot_opts = @hot_path_options
183
+ double_opts = @quote_escaping_double
184
+ @parse_ctx = SmarterCSV::Parser.new_parse_context_c(@headers, hot_opts)
185
+ @parse_ctx_double = SmarterCSV::Parser.new_parse_context_c(@headers, double_opts)
186
+ end
187
+
188
+ # Key-cleanup flags — computed once, checked per row via cheap ivar reads.
189
+ # hash.delete(nil) / hash.delete('') only occur when key_mapping maps a header to nil/"".
190
+ # hash.delete(:"") also catches empty headers produced by ,, in the CSV.
191
+ @delete_nil_keys = !!options[:key_mapping]
192
+ @delete_empty_keys = !!options[:key_mapping] || @headers.include?(:"")
193
+
194
+ # Cache quote_char as an ivar for the stitch-loop memchr guard (avoids hash lookup per continuation line).
195
+ @quote_char = options[:quote_char]
196
+ # Cache field_size_limit as an ivar (nil when unset → one nil-check per row, no method calls).
197
+ @field_size_limit = options[:field_size_limit]
198
+
73
199
  # in case we use chunking.. we'll need to set it up..
74
200
  if options[:chunk_size].to_i > 0
75
201
  use_chunks = true
@@ -80,59 +206,182 @@ module SmarterCSV
80
206
  use_chunks = false
81
207
  end
82
208
 
209
+ # --- INSTRUMENTATION HOOKS ---
210
+ # on_start / on_chunk / on_complete are optional callables (nil by default).
211
+ # Hooks only fire from `process` (library-controlled iteration). Enumerator
212
+ # modes (each / each_chunk) do not fire hooks — the caller owns the lifecycle.
213
+ _on_start = options[:on_start]
214
+ _on_chunk = options[:on_chunk]
215
+ _on_complete = options[:on_complete]
216
+ _start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) if _on_start || _on_complete
217
+
218
+ if _on_start
219
+ _input_meta = if @input.is_a?(String)
220
+ { input: @input, file_size: (File.size(@input) rescue nil) }
221
+ else
222
+ { input: @input.class.name, file_size: nil }
223
+ end
224
+ _on_start.call(_input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
225
+ end
226
+
83
227
  # now on to processing all the rest of the lines in the CSV file:
84
228
  while (line = next_line_with_counts(fh, options))
85
229
 
86
230
  # replace invalid byte sequence in UTF-8 with question mark to avoid errors
87
231
  line = enforce_utf8_encoding(line, options) if @enforce_utf8
88
232
 
89
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
233
+ $stderr.print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose == :debug
90
234
 
91
235
  next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
92
236
 
93
- # cater for the quoted csv data containing the row separator carriage return character
94
- # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
95
- # by detecting the existence of an uneven number of quote characters
96
- multiline = detect_multiline(line, options)
97
-
98
- while multiline
99
- next_line = fh.gets(options[:row_sep])
100
- if next_line.nil?
101
- # End of file reached. Check if quotes are balanced.
102
- if detect_multiline(line, options)
103
- raise MalformedCSV, "Unclosed quoted field detected in multiline data"
104
- else
237
+ # Snapshot line counters before multiline stitching so error records reflect
238
+ # where the bad row started, not where it failed.
239
+ bad_row_start_csv_line = @csv_line_count
240
+ bad_row_start_file_line = @file_line_count
241
+
242
+ begin
243
+ # --- PARSE (inlined — no method-wrapper overhead on the hot path) ---
244
+ # Replaces: process_line_to_hash → parse_line_to_hash → parse_line_to_hash_auto
245
+ # All routing decisions are pre-baked into ivars set up after header processing.
246
+ if @use_acceleration
247
+ hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
248
+ # :auto only: if unclosed quote AND backslash present, RFC may close it differently
249
+ if @quote_escaping_auto && data_size == -1 && line.include?('\\')
250
+ hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
251
+ end
252
+ else
253
+ has_quotes = line.include?(options[:quote_char])
254
+ hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
255
+ if @quote_escaping_auto && data_size == -1 && line.include?('\\')
256
+ hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
257
+ end
258
+ end
259
+
260
+ # --- MULTILINE STITCH ---
261
+ # data_size == -1 means the parser saw an unclosed quoted field at end-of-line.
262
+ # Fetch the next physical line, append, and re-parse until the field closes.
263
+ while data_size == -1
264
+ next_line = fh.gets(options[:row_sep])
265
+ raise MalformedCSV, "Unclosed quoted field detected in multiline data" if next_line.nil?
266
+
267
+ next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
268
+ line += next_line
269
+ @file_line_count += 1
270
+ $stderr.print "\nline contains unclosed quoted field, including content through file line %d\n" % @file_line_count if @verbose == :debug
271
+
272
+ # DoS guard: prevent runaway multiline accumulation (vectors: never-closing quote, huge embedded content)
273
+ if @field_size_limit && line.bytesize > @field_size_limit
274
+ raise SmarterCSV::FieldSizeLimitExceeded,
275
+ "Multiline field exceeds field_size_limit of #{@field_size_limit} bytes " \
276
+ "(accumulated #{line.bytesize} bytes)"
277
+ end
278
+
279
+ # Opt #8 (memchr guard): if the newly appended line contains no quote character,
280
+ # it cannot close the currently open quoted field — skip the full re-parse and
281
+ # keep accumulating physical lines. String#include? uses memchr internally (C speed).
282
+ next unless next_line.include?(@quote_char)
283
+
284
+ if @use_acceleration
105
285
  # :nocov:
106
- # Quotes are balanced; proceed without raising an error.
107
- break
286
+ hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
287
+ if @quote_escaping_auto && data_size == -1 && line.include?('\\')
288
+ hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
289
+ end
108
290
  # :nocov:
291
+ else
292
+ # Optimization #18: use detect_multiline as a cheap gate before attempting a full
293
+ # Ruby re-parse on the growing stitched line. detect_multiline_strict now uses
294
+ # byteindex skip-ahead (Opt #17) and is faster than parse_line_to_hash_ruby on
295
+ # the same content. Saves N-2 wasted full parses per multiline row.
296
+ next if detect_multiline(line, options)
297
+
298
+ has_quotes = true # we know the line has quotes — we've been stitching a quoted field
299
+ hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
300
+ if @quote_escaping_auto && data_size == -1 && line.include?('\\')
301
+ hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
302
+ end
109
303
  end
110
304
  end
111
- next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
112
- line += next_line
113
- @file_line_count += 1
114
305
 
115
- multiline = detect_multiline(line, options)
116
- end
306
+ # --- EXTRA COLUMNS ---
307
+ if data_size > @headers.size
308
+ raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}" if options[:missing_headers] == :raise
117
309
 
118
- # :nocov:
119
- if multiline && @verbose
120
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
121
- end
122
- # :nocov:
310
+ while @headers.size < data_size
311
+ @headers << "#{options[:missing_header_prefix]}#{@headers.size + 1}".to_sym
312
+ end
313
+ end
123
314
 
124
- hash = process_line_to_hash(line, options)
125
- next if hash.nil?
315
+ next if hash.nil?
316
+
317
+ # --- FIELD SIZE LIMIT CHECK ---
318
+ # Pre-filter: if the raw line fits within the limit, no individual field can exceed it
319
+ # (a field is always a substring of its row). Only iterate over values for large rows.
320
+ if @field_size_limit && line.bytesize > @field_size_limit
321
+ hash.each_value do |v|
322
+ if v.is_a?(String) && v.bytesize > @field_size_limit
323
+ raise SmarterCSV::FieldSizeLimitExceeded,
324
+ "Field exceeds field_size_limit of #{@field_size_limit} bytes (got #{v.bytesize} bytes)"
325
+ end
326
+ end
327
+ end
126
328
 
127
- puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
128
- # optional adding of csv_line_number to the hash to help debugging
129
- hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
329
+ # --- COLUMN SELECTION ---
330
+ hash.select! { |k, _| @only_headers_set.include?(k) } if @only_headers_set
331
+ hash.reject! { |k, _| @except_headers_set.include?(k) } if @except_headers_set
332
+
333
+ # --- HASH CLEANUP & TRANSFORMATIONS ---
334
+ if @use_acceleration
335
+ # C already applied: remove_empty_values, convert_values_to_numeric, remove_zero_values.
336
+ # Remove nil/"" keys left by key_mapping or empty CSV headers.
337
+ if @delete_nil_keys
338
+ hash.delete(nil)
339
+ hash.delete('')
340
+ end
341
+ hash.delete(:"") if @delete_empty_keys
342
+
343
+ if (matcher = options[:nil_values_matching])
344
+ if options[:remove_empty_values]
345
+ hash.delete_if do |_k, v|
346
+ str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
347
+ str_val && matcher.match?(str_val)
348
+ end
349
+ else
350
+ hash.each_key do |k|
351
+ v = hash[k]
352
+ str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
353
+ hash[k] = nil if str_val && matcher.match?(str_val)
354
+ end
355
+ end
356
+ end
357
+
358
+ if options[:value_converters]
359
+ options[:value_converters].each do |key, converter|
360
+ hash[key] = converter.convert(hash[key]) if hash.key?(key)
361
+ end
362
+ end
363
+ else
364
+ hash = hash_transformations(hash, options)
365
+ end
366
+
367
+ next if options[:remove_empty_hashes] && hash.empty?
368
+
369
+ $stderr.puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == :debug
370
+ # optional adding of csv_line_number to the hash to help debugging
371
+ hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
372
+ rescue SmarterCSV::Error, EOFError => e
373
+ raise if options[:on_bad_row] == :raise
374
+
375
+ handle_bad_row(e, line, bad_row_start_csv_line, bad_row_start_file_line, options)
376
+ next
377
+ end
130
378
 
131
379
  # process the chunks or the resulting hash
132
380
  if use_chunks
133
381
  chunk << hash # append temp result to chunk
134
382
 
135
383
  if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
384
+ _on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
136
385
  # do something with the chunk
137
386
  if block_given?
138
387
  yield chunk, @chunk_count # do something with the hashes in the chunk in the block
@@ -157,10 +406,11 @@ module SmarterCSV
157
406
  end
158
407
 
159
408
  # print new line to retain last processing line message
160
- print "\n" if @verbose
409
+ $stderr.print "\n" if @verbose == :debug
161
410
 
162
411
  # handling of last chunk:
163
412
  if !chunk.nil? && chunk.size > 0
413
+ _on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
164
414
  # do something with the chunk
165
415
  if block_given?
166
416
  yield chunk, @chunk_count # do something with the hashes in the chunk in the block
@@ -170,6 +420,15 @@ module SmarterCSV
170
420
  @chunk_count += 1
171
421
  # chunk = [] # initialize for next chunk of data
172
422
  end
423
+
424
+ if _on_complete
425
+ _on_complete.call({
426
+ total_rows: @csv_line_count,
427
+ total_chunks: @chunk_count,
428
+ duration: Process.clock_gettime(Process::CLOCK_MONOTONIC) - _start_time,
429
+ bad_rows: @errors[:bad_row_count] || 0,
430
+ })
431
+ end
173
432
  ensure
174
433
  fh.close if fh.respond_to?(:close)
175
434
  end
@@ -254,11 +513,15 @@ module SmarterCSV
254
513
 
255
514
  # Determine if a line has unbalanced quotes requiring multiline stitching.
256
515
  # For :auto mode, uses dual counting to avoid false multiline detection.
516
+ # For :standard quote_boundary mode, uses a full state machine so that
517
+ # mid-field quotes (which are literals in standard mode) do not trigger stitching.
257
518
  # Optimization #8: skip quote counting entirely when line has no quote chars.
258
519
  def detect_multiline(line, options)
259
520
  return false unless line.include?(options[:quote_char])
260
521
 
261
- if options[:quote_escaping] == :auto
522
+ if options[:quote_boundary] == :standard
523
+ detect_multiline_strict(line, options)
524
+ elsif options[:quote_escaping] == :auto
262
525
  escaped_count, rfc_count = count_quote_chars_auto(line, options[:quote_char], options[:col_sep])
263
526
  # If backslash-aware count is even → line is self-contained either way
264
527
  # If backslash-aware count is odd AND rfc_count is also odd → truly multiline
@@ -270,6 +533,146 @@ module SmarterCSV
270
533
  end
271
534
  end
272
535
 
536
+ # Boundary-aware multiline detection for quote_boundary: :standard mode.
537
+ # Walks the line as a state machine tracking quote state only for boundary quotes.
538
+ # A quote only opens/closes a quoted field if it appears at a field boundary
539
+ # (start of field, or after leading whitespace when strip_whitespace is true).
540
+ # Mid-field quotes are treated as literals and do not affect quote state.
541
+ #
542
+ # Optimization #17: single-char col_sep fast path uses byteindex skip-ahead
543
+ # (mirrors Opt #10/#12 in parse_csv_line_ruby) so that:
544
+ # - inside a quoted field: jump directly to next quote char via C-level byteindex
545
+ # - inside an unquoted field: jump directly to next col_sep via C-level byteindex
546
+ # This makes detect_multiline_strict competitive with parse_csv_line_ruby on the same
547
+ # content, enabling it to serve as a cheap gate in the stitch loop (Opt #18).
548
def detect_multiline_strict(line, options)
  col_sep = options[:col_sep]
  quote = options[:quote_char]
  strip = options[:strip_whitespace]
  row_sep = options[:row_sep]

  col_sep_size = col_sep.size
  row_sep_size = row_sep.is_a?(String) ? row_sep.size : 0
  in_quotes = false
  field_started = false

  # The byte-level fast path compares single bytes, so it is only valid when both the
  # separator and the quote are exactly one BYTE long (not merely one character).
  # Anything else goes through the character-based path below.
  if col_sep.bytesize == 1 && quote.bytesize == 1
    # Fast path: byte-level scanning with byteindex skip-ahead (Opt #17).
    #
    # The scan runs on binary-encoded views of the strings. The loop advances `i`
    # one byte at a time, so `i` can land in the middle of a multibyte character
    # (e.g. right after the first byte of "é"). String#byteindex raises IndexError
    # for such offsets on a multibyte-encoded string; in binary encoding every
    # byte offset is a valid character boundary.
    bytes = line.b
    sep_bin = col_sep.b
    quote_bin = quote.b
    row_sep_bin = row_sep.is_a?(String) ? row_sep.b : nil

    col_sep_byte = sep_bin.getbyte(0)
    quote_byte = quote_bin.getbyte(0)
    row_sep_bytesize = row_sep_bin ? row_sep_bin.bytesize : 0
    bytesize = bytes.bytesize
    byteindex_available = SmarterCSV::Parser::BYTEINDEX_AVAILABLE
    i = 0

    while i < bytesize
      if in_quotes
        # Inside a quoted field: jump directly to the next quote byte.
        # Uses byteindex when available; otherwise a manual getbyte loop, kept
        # inline to avoid a method call per iteration.
        next_q = if byteindex_available
                   bytes.byteindex(quote_bin, i)
                 else
                   j = i
                   j += 1 while j < bytesize && bytes.getbyte(j) != quote_byte
                   j < bytesize ? j : nil
                 end
        return true if next_q.nil? # no closing quote → line is incomplete

        i = next_q
        b = quote_byte
      elsif field_started
        # Unquoted field in progress: jump directly to the next separator byte.
        # Quotes skipped over here are mid-field literals and do not change state.
        next_sep = if byteindex_available
                     bytes.byteindex(sep_bin, i)
                   else
                     j = i
                     j += 1 while j < bytesize && bytes.getbyte(j) != col_sep_byte
                     j < bytesize ? j : nil
                   end
        break if next_sep.nil? # no more separators → end of line, not multiline

        i = next_sep
        b = col_sep_byte
      else
        b = bytes.getbyte(i)
      end

      if b == col_sep_byte && !in_quotes
        field_started = false
      elsif b == quote_byte
        if in_quotes
          # closing quote: only valid if followed by col_sep, row_sep, or end of line
          next_i = i + 1
          if next_i >= bytesize ||
             bytes.getbyte(next_i) == col_sep_byte ||
             (row_sep_bytesize > 0 && bytes.byteslice(next_i, row_sep_bytesize) == row_sep_bin)
            in_quotes = false
            field_started = true
          end
          # else: quote inside quoted field → treated as a literal, state unchanged
        elsif !field_started # at field boundary: open quoted field
          in_quotes = true
          field_started = true
        end
        # else: mid-field quote → literal, no state change
      else
        unless in_quotes
          # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
          field_started = true unless strip && (b == 32 || b == 9) # ' ' == 32, '\t' == 9
          # rubocop:enable Style/MultipleComparison
        end
      end
      i += 1
    end
  else
    # Character-based path: multi-char col_sep, or a separator/quote that is a
    # single multibyte character.
    line_size = line.size
    i = 0

    while i < line_size
      # Check for column separator (only outside quotes)
      if !in_quotes && line[i...i + col_sep_size] == col_sep
        field_started = false
        i += col_sep_size
        next
      end

      if line[i] == quote
        if in_quotes
          # closing quote: only valid if followed by col_sep, row_sep, or end of line
          next_i = i + 1
          if next_i >= line_size ||
             line[next_i...next_i + col_sep_size] == col_sep ||
             (row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
            in_quotes = false
            field_started = true
          end
          # else: quote inside quoted field → treated as a literal, state unchanged
        elsif !field_started # at field boundary: open quoted field
          in_quotes = true
          field_started = true
        end
        # else: mid-field quote → literal, no state change
      elsif !in_quotes
        # Non-quote character: track whether field has started
        if strip
          # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
          field_started = true unless line[i] == ' ' || line[i] == "\t"
          # rubocop:enable Style/MultipleComparison
        else
          field_started = true
        end
      end
      i += 1
    end
  end

  in_quotes # true → line ends inside a quoted field → needs stitching
end
675
+
273
676
  protected
274
677
 
275
678
  # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
@@ -302,9 +705,12 @@ module SmarterCSV
302
705
  # we create additional columns on-the-fly when we find more data fields than headers
303
706
  hash, data_size = parse_line_to_hash(line, @headers, options)
304
707
 
708
+ # Unclosed quote at end of line: signal caller to stitch next physical line
709
+ return :needs_more if data_size == -1
710
+
305
711
  # Handle extra columns (more data fields than headers)
306
712
  if data_size > @headers.size
307
- if options[:strict]
713
+ if options[:missing_headers] == :raise
308
714
  raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}"
309
715
  end
310
716
 
@@ -317,19 +723,33 @@ module SmarterCSV
317
723
  # if all values were blank (hash is nil) we ignore this CSV line
318
724
  return nil if hash.nil?
319
725
 
726
+ # Apply column selection (only_headers / except_headers)
727
+ hash.select! { |k, _| @only_headers_set.include?(k) } if @only_headers_set
728
+ hash.reject! { |k, _| @except_headers_set.include?(k) } if @except_headers_set
729
+
320
730
  # --- HASH TRANSFORMATIONS / POST-FILTERS --------------------------------
321
- if options[:acceleration] && @has_acceleration
731
+ if @use_acceleration
322
732
  # C already handled: remove_empty_values, convert_values_to_numeric, remove_zero_values.
323
- # Clean up nil/empty keys (from key_mapping setting keys to nil)
324
- hash.delete(nil)
325
- hash.delete('')
326
- hash.delete(:"")
733
+ # Remove nil/"" keys left by key_mapping or empty CSV headers.
734
+ if @delete_nil_keys
735
+ hash.delete(nil)
736
+ hash.delete('')
737
+ end
738
+ hash.delete(:"") if @delete_empty_keys
327
739
 
328
740
  # Only these Ruby-only post-filters remain (user-provided Ruby objects):
329
- if options[:remove_values_matching]
330
- hash.delete_if do |_k, v|
331
- str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
332
- str_val && options[:remove_values_matching].match?(str_val)
741
+ if (matcher = options[:nil_values_matching])
742
+ if options[:remove_empty_values]
743
+ hash.delete_if do |_k, v|
744
+ str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
745
+ str_val && matcher.match?(str_val)
746
+ end
747
+ else
748
+ hash.each_key do |k|
749
+ v = hash[k]
750
+ str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
751
+ hash[k] = nil if str_val && matcher.match?(str_val)
752
+ end
333
753
  end
334
754
  end
335
755
 
@@ -360,5 +780,33 @@ module SmarterCSV
360
780
 
361
781
  line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
362
782
  end
783
+
784
def handle_bad_row(error, line, start_csv_line, start_file_line, options)
  # Every bad row is counted, whatever the handling mode.
  @errors[:bad_row_count] = (@errors[:bad_row_count] || 0) + 1

  # Describe the failure; line numbers refer to where the logical row started.
  record = {
    csv_line_number: start_csv_line,
    file_line_number: start_file_line,
    file_lines_consumed: @file_line_count - start_file_line + 1,
    error_class: error.class,
    error_message: error.message,
  }
  record[:raw_logical_line] = line if options[:collect_raw_lines]

  # Dispatch on the configured mode: :skip only counts, :collect stores the
  # record, anything else is invoked as a callable with the record.
  handler = options[:on_bad_row]
  if :skip == handler
    # counted above; nothing more to collect
  elsif :collect == handler
    @errors[:bad_rows] ||= []
    @errors[:bad_rows] << record
  else
    handler.call(record)
  end

  # Abort once more bad rows than the configured limit have been seen.
  limit_exceeded = options[:bad_row_limit] && @errors[:bad_row_count] > options[:bad_row_limit]
  return unless limit_exceeded

  raise TooManyBadRows, "Bad row limit of #{options[:bad_row_limit]} exceeded (#{@errors[:bad_row_count]} bad rows encountered)"
end
363
811
  end
364
812
  end