smarter_csv 1.15.2 → 1.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +112 -1
- data/CONTRIBUTORS.md +4 -1
- data/Gemfile +1 -0
- data/README.md +129 -27
- data/docs/_introduction.md +45 -24
- data/docs/bad_row_quarantine.md +342 -0
- data/docs/basic_read_api.md +152 -9
- data/docs/basic_write_api.md +475 -59
- data/docs/batch_processing.md +162 -4
- data/docs/column_selection.md +184 -0
- data/docs/data_transformations.md +163 -29
- data/docs/examples.md +340 -46
- data/docs/header_transformations.md +94 -12
- data/docs/header_validations.md +57 -18
- data/docs/history.md +119 -0
- data/docs/instrumentation.md +166 -0
- data/docs/migrating_from_csv.md +565 -0
- data/docs/options.md +151 -87
- data/docs/parsing_strategy.md +64 -1
- data/docs/real_world_csv.md +263 -0
- data/docs/releases/1.16.0/benchmarks.md +223 -0
- data/docs/releases/1.16.0/changes.md +273 -0
- data/docs/releases/1.16.0/performance_notes.md +114 -0
- data/docs/row_col_sep.md +15 -5
- data/docs/ruby_csv_pitfalls.md +514 -0
- data/docs/value_converters.md +194 -57
- data/ext/smarter_csv/extconf.rb +3 -0
- data/ext/smarter_csv/smarter_csv.c +1017 -82
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
- data/lib/smarter_csv/errors.rb +8 -0
- data/lib/smarter_csv/file_io.rb +1 -1
- data/lib/smarter_csv/hash_transformations.rb +14 -13
- data/lib/smarter_csv/header_transformations.rb +21 -2
- data/lib/smarter_csv/headers.rb +2 -1
- data/lib/smarter_csv/options.rb +124 -7
- data/lib/smarter_csv/parser.rb +358 -74
- data/lib/smarter_csv/reader.rb +494 -46
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv/writer.rb +71 -19
- data/lib/smarter_csv.rb +134 -13
- data/smarter_csv.gemspec +20 -10
- metadata +38 -80
data/lib/smarter_csv/reader.rb
CHANGED
|
@@ -1,7 +1,15 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
3
5
|
module SmarterCSV
|
|
4
6
|
class Reader
|
|
7
|
+
include Enumerable
|
|
8
|
+
|
|
9
|
+
# Default chunk size used by each_chunk when chunk_size is not explicitly set.
|
|
10
|
+
# A warning is emitted to STDERR so users know to configure it explicitly.
|
|
11
|
+
DEFAULT_CHUNK_SIZE = 100
|
|
12
|
+
|
|
5
13
|
include ::SmarterCSV::Options
|
|
6
14
|
include ::SmarterCSV::FileIO
|
|
7
15
|
include ::SmarterCSV::AutoDetection
|
|
@@ -16,14 +24,12 @@ module SmarterCSV
|
|
|
16
24
|
attr_reader :enforce_utf8, :has_rails, :has_acceleration
|
|
17
25
|
attr_reader :errors, :warnings, :headers, :raw_header, :result
|
|
18
26
|
|
|
19
|
-
# :nocov:
|
|
20
27
|
# rubocop:disable Naming/MethodName
|
|
21
28
|
# Deprecated accessor kept for backwards compatibility; prefer `headers`.
# Emits a deprecation warning on STDERR on every call and returns @headerA.
def headerA
  warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
  @headerA
end
|
|
25
32
|
# rubocop:enable Naming/MethodName
|
|
26
|
-
# :nocov:
|
|
27
33
|
|
|
28
34
|
# first parameter: filename or input object which responds to readline method
|
|
29
35
|
def initialize(input, given_options = {})
|
|
@@ -44,6 +50,58 @@ module SmarterCSV
|
|
|
44
50
|
@has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
|
|
45
51
|
end
|
|
46
52
|
|
|
53
|
+
# Yields each successfully parsed row as a Hash.
|
|
54
|
+
# Ignores chunk_size — always row-by-row, enabling standard Enumerable usage.
|
|
55
|
+
# Returns an Enumerator when called without a block.
|
|
56
|
+
#
|
|
57
|
+
# Examples:
|
|
58
|
+
# reader.each { |hash| MyModel.upsert(hash) }
|
|
59
|
+
# reader.each_with_index { |hash, i| puts "Row #{i}: #{hash}" }
|
|
60
|
+
# reader.select { |h| h[:country] == "US" }
|
|
61
|
+
# reader.lazy.map { |h| h[:name] }.first(10)
|
|
62
|
+
def each
  # The no-block guard must stay OUTSIDE the begin/ensure below. A method-level
  # `ensure` also runs on this early return, at which point the saved chunk_size
  # local is still nil — that would silently wipe the configured chunk_size
  # whenever `each` is called just to obtain an Enumerator.
  return enum_for(:each) unless block_given?

  # Force row-by-row mode regardless of chunk_size setting
  original_chunk_size = @options[:chunk_size]
  @options[:chunk_size] = nil
  begin
    # In row-by-row mode process yields a one-element Array; unwrap it.
    process { |row_array, _| yield row_array.first }
  ensure
    # Restore the caller's setting even if the block or the parser raises.
    @options[:chunk_size] = original_chunk_size
  end
end
|
|
72
|
+
|
|
73
|
+
# Yields each chunk as Array<Hash> plus its 0-based chunk index.
|
|
74
|
+
# Uses chunk_size from options; raises ArgumentError if chunk_size < 1.
|
|
75
|
+
# Returns an Enumerator when called without a block.
|
|
76
|
+
#
|
|
77
|
+
# Examples:
|
|
78
|
+
# reader = SmarterCSV::Reader.new("big.csv", chunk_size: 500)
|
|
79
|
+
# reader.each_chunk { |chunk, i| Sidekiq.push_bulk(chunk) }
|
|
80
|
+
# reader.each_chunk.with_index { |chunk, i| puts "Chunk #{i}: #{chunk.size} rows" }
|
|
81
|
+
def each_chunk
  return enum_for(:each_chunk) unless block_given?

  # Remember what the caller configured so it can be put back afterwards.
  configured_size = @options[:chunk_size]
  effective_size = configured_size

  if effective_size.nil?
    # No chunk_size given: fall back to the default and tell the user, unless silenced.
    unless @options[:verbose] == :quiet
      warn "SmarterCSV: chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
    end
    effective_size = DEFAULT_CHUNK_SIZE
  end

  valid_size = effective_size.is_a?(Integer) && effective_size >= 1
  raise ArgumentError, "chunk_size must be an Integer >= 1 (got #{effective_size.inspect})" unless valid_size

  # Apply the effective size only for the duration of this iteration.
  @options[:chunk_size] = effective_size
  begin
    # process hands out its working Array; yield a copy so callers can keep it.
    process { |rows, chunk_index| yield rows.dup, chunk_index }
  ensure
    @options[:chunk_size] = configured_size
  end
end
|
|
104
|
+
|
|
47
105
|
def process(&block) # rubocop:disable Lint/UnusedMethodArgument
|
|
48
106
|
@enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
|
49
107
|
@verbose = options[:verbose]
|
|
@@ -52,7 +110,7 @@ module SmarterCSV
|
|
|
52
110
|
fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
|
|
53
111
|
|
|
54
112
|
if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
|
|
55
|
-
|
|
113
|
+
warn 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' unless options[:verbose] == :quiet
|
|
56
114
|
end
|
|
57
115
|
|
|
58
116
|
# auto-detect the row separator
|
|
@@ -66,10 +124,78 @@ module SmarterCSV
|
|
|
66
124
|
@headers, _header_size = process_headers(fh, options)
|
|
67
125
|
@headerA = @headers # @headerA is deprecated, use @headers
|
|
68
126
|
|
|
69
|
-
puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
|
|
127
|
+
$stderr.puts "Effective headers:\n#{pp(@headers)}\n" if @verbose == :debug
|
|
70
128
|
|
|
71
129
|
header_validations(@headers, options)
|
|
72
130
|
|
|
131
|
+
# Precompute column filter sets for only_headers / except_headers (O(1) lookup per row)
|
|
132
|
+
@only_headers_set = options[:only_headers] ? Set.new(options[:only_headers]) : nil
|
|
133
|
+
@except_headers_set = options[:except_headers] ? Set.new(options[:except_headers]) : nil
|
|
134
|
+
|
|
135
|
+
# Precompute column-filter bitmap for the C extension.
|
|
136
|
+
#
|
|
137
|
+
# The bitmap is a loop invariant — headers and filter settings never change between rows.
|
|
138
|
+
# We store it as a packed binary String so C can copy it with a single memcpy instead of
|
|
139
|
+
# N rb_ary_entry calls per row. early_exit_after and keep_extra_cols are pre-stored so
|
|
140
|
+
# C reads them with O(1) hash lookups rather than recomputing per row.
|
|
141
|
+
if @only_headers_set || @except_headers_set
|
|
142
|
+
keep_flags = @headers.map { |h| @only_headers_set ? @only_headers_set.include?(h) : !@except_headers_set.include?(h) }
|
|
143
|
+
options[:_keep_bitmap] = keep_flags.map { |f| f ? 1 : 0 }.pack('C*').freeze
|
|
144
|
+
options[:_keep_extra_cols] = @only_headers_set ? false : true
|
|
145
|
+
options[:_early_exit_after] = (@only_headers_set && !options[:strict]) ? (keep_flags.rindex(true) || -1) : -1
|
|
146
|
+
options[:_keep_cols] = nil # nil signals C: "filter active, check _keep_bitmap"
|
|
147
|
+
else
|
|
148
|
+
options[:_keep_cols] = false # sentinel: no filtering active — C skips all bitmap paths
|
|
149
|
+
# Do NOT insert _keep_bitmap/_keep_extra_cols/_early_exit_after when unused.
|
|
150
|
+
# Keeping the options hash as small as possible avoids hash table resize and
|
|
151
|
+
# keeps all 10 per-row rb_hash_aref lookups hitting the same cache lines.
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
# Precompute all hot-path strategy ivars once — eliminates per-row option lookups
|
|
155
|
+
# and method-dispatch overhead in the main loop.
|
|
156
|
+
#
|
|
157
|
+
# @quote_escaping_backslash / @quote_escaping_double may already exist if
|
|
158
|
+
# parse_with_auto_fallback ran during header parsing (lazily created there).
|
|
159
|
+
# Ensure they exist and carry the now-final _keep_cols (and bitmap keys only when active).
|
|
160
|
+
@quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
|
|
161
|
+
@quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
|
|
162
|
+
@quote_escaping_backslash[:_keep_cols] = options[:_keep_cols]
|
|
163
|
+
@quote_escaping_double[:_keep_cols] = options[:_keep_cols]
|
|
164
|
+
if @only_headers_set || @except_headers_set
|
|
165
|
+
%i[_keep_bitmap _keep_extra_cols _early_exit_after].each do |k|
|
|
166
|
+
@quote_escaping_backslash[k] = options[k]
|
|
167
|
+
@quote_escaping_double[k] = options[k]
|
|
168
|
+
end
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
@quote_escaping_auto = options[:quote_escaping] == :auto
|
|
172
|
+
@use_acceleration = options[:acceleration] && has_acceleration
|
|
173
|
+
|
|
174
|
+
# The single options hash used on the hot path — for :auto we always try backslash
|
|
175
|
+
# first (C downgrades to RFC internally via Opt #5 when no backslash is found).
|
|
176
|
+
@hot_path_options = @quote_escaping_auto ? @quote_escaping_backslash : options
|
|
177
|
+
|
|
178
|
+
# Build ParseContext objects once after headers are known.
|
|
179
|
+
# Eliminates ~10 rb_hash_aref calls per row by pre-baking all loop-invariant
|
|
180
|
+
# options into a C struct accessed via direct pointer dereference.
|
|
181
|
+
if @use_acceleration
|
|
182
|
+
hot_opts = @hot_path_options
|
|
183
|
+
double_opts = @quote_escaping_double
|
|
184
|
+
@parse_ctx = SmarterCSV::Parser.new_parse_context_c(@headers, hot_opts)
|
|
185
|
+
@parse_ctx_double = SmarterCSV::Parser.new_parse_context_c(@headers, double_opts)
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# Key-cleanup flags — computed once, checked per row via cheap ivar reads.
|
|
189
|
+
# hash.delete(nil) / hash.delete('') only occur when key_mapping maps a header to nil/"".
|
|
190
|
+
# hash.delete(:"") also catches empty headers produced by ,, in the CSV.
|
|
191
|
+
@delete_nil_keys = !!options[:key_mapping]
|
|
192
|
+
@delete_empty_keys = !!options[:key_mapping] || @headers.include?(:"")
|
|
193
|
+
|
|
194
|
+
# Cache quote_char as an ivar for the stitch-loop memchr guard (avoids hash lookup per continuation line).
|
|
195
|
+
@quote_char = options[:quote_char]
|
|
196
|
+
# Cache field_size_limit as an ivar (nil when unset → one nil-check per row, no method calls).
|
|
197
|
+
@field_size_limit = options[:field_size_limit]
|
|
198
|
+
|
|
73
199
|
# in case we use chunking.. we'll need to set it up..
|
|
74
200
|
if options[:chunk_size].to_i > 0
|
|
75
201
|
use_chunks = true
|
|
@@ -80,59 +206,182 @@ module SmarterCSV
|
|
|
80
206
|
use_chunks = false
|
|
81
207
|
end
|
|
82
208
|
|
|
209
|
+
# --- INSTRUMENTATION HOOKS ---
|
|
210
|
+
# on_start / on_chunk / on_complete are optional callables (nil by default).
|
|
211
|
+
# Hooks only fire from `process` (library-controlled iteration). Enumerator
|
|
212
|
+
# modes (each / each_chunk) do not fire hooks — the caller owns the lifecycle.
|
|
213
|
+
_on_start = options[:on_start]
|
|
214
|
+
_on_chunk = options[:on_chunk]
|
|
215
|
+
_on_complete = options[:on_complete]
|
|
216
|
+
_start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) if _on_start || _on_complete
|
|
217
|
+
|
|
218
|
+
if _on_start
|
|
219
|
+
_input_meta = if @input.is_a?(String)
|
|
220
|
+
{ input: @input, file_size: (File.size(@input) rescue nil) }
|
|
221
|
+
else
|
|
222
|
+
{ input: @input.class.name, file_size: nil }
|
|
223
|
+
end
|
|
224
|
+
_on_start.call(_input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
|
|
225
|
+
end
|
|
226
|
+
|
|
83
227
|
# now on to processing all the rest of the lines in the CSV file:
|
|
84
228
|
while (line = next_line_with_counts(fh, options))
|
|
85
229
|
|
|
86
230
|
# replace invalid byte sequence in UTF-8 with question mark to avoid errors
|
|
87
231
|
line = enforce_utf8_encoding(line, options) if @enforce_utf8
|
|
88
232
|
|
|
89
|
-
print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
|
|
233
|
+
$stderr.print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose == :debug
|
|
90
234
|
|
|
91
235
|
next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
|
|
92
236
|
|
|
93
|
-
#
|
|
94
|
-
#
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
237
|
+
# Snapshot line counters before multiline stitching so error records reflect
|
|
238
|
+
# where the bad row started, not where it failed.
|
|
239
|
+
bad_row_start_csv_line = @csv_line_count
|
|
240
|
+
bad_row_start_file_line = @file_line_count
|
|
241
|
+
|
|
242
|
+
begin
|
|
243
|
+
# --- PARSE (inlined — no method-wrapper overhead on the hot path) ---
|
|
244
|
+
# Replaces: process_line_to_hash → parse_line_to_hash → parse_line_to_hash_auto
|
|
245
|
+
# All routing decisions are pre-baked into ivars set up after header processing.
|
|
246
|
+
if @use_acceleration
|
|
247
|
+
hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
|
|
248
|
+
# :auto only: if unclosed quote AND backslash present, RFC may close it differently
|
|
249
|
+
if @quote_escaping_auto && data_size == -1 && line.include?('\\')
|
|
250
|
+
hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
|
|
251
|
+
end
|
|
252
|
+
else
|
|
253
|
+
has_quotes = line.include?(options[:quote_char])
|
|
254
|
+
hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
|
|
255
|
+
if @quote_escaping_auto && data_size == -1 && line.include?('\\')
|
|
256
|
+
hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
|
|
260
|
+
# --- MULTILINE STITCH ---
|
|
261
|
+
# data_size == -1 means the parser saw an unclosed quoted field at end-of-line.
|
|
262
|
+
# Fetch the next physical line, append, and re-parse until the field closes.
|
|
263
|
+
while data_size == -1
|
|
264
|
+
next_line = fh.gets(options[:row_sep])
|
|
265
|
+
raise MalformedCSV, "Unclosed quoted field detected in multiline data" if next_line.nil?
|
|
266
|
+
|
|
267
|
+
next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
|
|
268
|
+
line += next_line
|
|
269
|
+
@file_line_count += 1
|
|
270
|
+
$stderr.print "\nline contains unclosed quoted field, including content through file line %d\n" % @file_line_count if @verbose == :debug
|
|
271
|
+
|
|
272
|
+
# DoS guard: prevent runaway multiline accumulation (vectors: never-closing quote, huge embedded content)
|
|
273
|
+
if @field_size_limit && line.bytesize > @field_size_limit
|
|
274
|
+
raise SmarterCSV::FieldSizeLimitExceeded,
|
|
275
|
+
"Multiline field exceeds field_size_limit of #{@field_size_limit} bytes " \
|
|
276
|
+
"(accumulated #{line.bytesize} bytes)"
|
|
277
|
+
end
|
|
278
|
+
|
|
279
|
+
# Opt #8 (memchr guard): if the newly appended line contains no quote character,
|
|
280
|
+
# it cannot close the currently open quoted field — skip the full re-parse and
|
|
281
|
+
# keep accumulating physical lines. String#include? uses memchr internally (C speed).
|
|
282
|
+
next unless next_line.include?(@quote_char)
|
|
283
|
+
|
|
284
|
+
if @use_acceleration
|
|
105
285
|
# :nocov:
|
|
106
|
-
|
|
107
|
-
|
|
286
|
+
hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx)
|
|
287
|
+
if @quote_escaping_auto && data_size == -1 && line.include?('\\')
|
|
288
|
+
hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
|
|
289
|
+
end
|
|
108
290
|
# :nocov:
|
|
291
|
+
else
|
|
292
|
+
# Optimization #18: use detect_multiline as a cheap gate before attempting a full
|
|
293
|
+
# Ruby re-parse on the growing stitched line. detect_multiline_strict now uses
|
|
294
|
+
# byteindex skip-ahead (Opt #17) and is faster than parse_line_to_hash_ruby on
|
|
295
|
+
# the same content. Saves N-2 wasted full parses per multiline row.
|
|
296
|
+
next if detect_multiline(line, options)
|
|
297
|
+
|
|
298
|
+
has_quotes = true # we know the line has quotes — we've been stitching a quoted field
|
|
299
|
+
hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
|
|
300
|
+
if @quote_escaping_auto && data_size == -1 && line.include?('\\')
|
|
301
|
+
hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
|
|
302
|
+
end
|
|
109
303
|
end
|
|
110
304
|
end
|
|
111
|
-
next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
|
|
112
|
-
line += next_line
|
|
113
|
-
@file_line_count += 1
|
|
114
305
|
|
|
115
|
-
|
|
116
|
-
|
|
306
|
+
# --- EXTRA COLUMNS ---
|
|
307
|
+
if data_size > @headers.size
|
|
308
|
+
raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}" if options[:missing_headers] == :raise
|
|
117
309
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
# :nocov:
|
|
310
|
+
while @headers.size < data_size
|
|
311
|
+
@headers << "#{options[:missing_header_prefix]}#{@headers.size + 1}".to_sym
|
|
312
|
+
end
|
|
313
|
+
end
|
|
123
314
|
|
|
124
|
-
|
|
125
|
-
|
|
315
|
+
next if hash.nil?
|
|
316
|
+
|
|
317
|
+
# --- FIELD SIZE LIMIT CHECK ---
|
|
318
|
+
# Pre-filter: if the raw line fits within the limit, no individual field can exceed it
|
|
319
|
+
# (a field is always a substring of its row). Only iterate over values for large rows.
|
|
320
|
+
if @field_size_limit && line.bytesize > @field_size_limit
|
|
321
|
+
hash.each_value do |v|
|
|
322
|
+
if v.is_a?(String) && v.bytesize > @field_size_limit
|
|
323
|
+
raise SmarterCSV::FieldSizeLimitExceeded,
|
|
324
|
+
"Field exceeds field_size_limit of #{@field_size_limit} bytes (got #{v.bytesize} bytes)"
|
|
325
|
+
end
|
|
326
|
+
end
|
|
327
|
+
end
|
|
126
328
|
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
329
|
+
# --- COLUMN SELECTION ---
|
|
330
|
+
hash.select! { |k, _| @only_headers_set.include?(k) } if @only_headers_set
|
|
331
|
+
hash.reject! { |k, _| @except_headers_set.include?(k) } if @except_headers_set
|
|
332
|
+
|
|
333
|
+
# --- HASH CLEANUP & TRANSFORMATIONS ---
|
|
334
|
+
if @use_acceleration
|
|
335
|
+
# C already applied: remove_empty_values, convert_values_to_numeric, remove_zero_values.
|
|
336
|
+
# Remove nil/"" keys left by key_mapping or empty CSV headers.
|
|
337
|
+
if @delete_nil_keys
|
|
338
|
+
hash.delete(nil)
|
|
339
|
+
hash.delete('')
|
|
340
|
+
end
|
|
341
|
+
hash.delete(:"") if @delete_empty_keys
|
|
342
|
+
|
|
343
|
+
if (matcher = options[:nil_values_matching])
|
|
344
|
+
if options[:remove_empty_values]
|
|
345
|
+
hash.delete_if do |_k, v|
|
|
346
|
+
str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
|
|
347
|
+
str_val && matcher.match?(str_val)
|
|
348
|
+
end
|
|
349
|
+
else
|
|
350
|
+
hash.each_key do |k|
|
|
351
|
+
v = hash[k]
|
|
352
|
+
str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
|
|
353
|
+
hash[k] = nil if str_val && matcher.match?(str_val)
|
|
354
|
+
end
|
|
355
|
+
end
|
|
356
|
+
end
|
|
357
|
+
|
|
358
|
+
if options[:value_converters]
|
|
359
|
+
options[:value_converters].each do |key, converter|
|
|
360
|
+
hash[key] = converter.convert(hash[key]) if hash.key?(key)
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
else
|
|
364
|
+
hash = hash_transformations(hash, options)
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
next if options[:remove_empty_hashes] && hash.empty?
|
|
368
|
+
|
|
369
|
+
$stderr.puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == :debug
|
|
370
|
+
# optional adding of csv_line_number to the hash to help debugging
|
|
371
|
+
hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
|
|
372
|
+
rescue SmarterCSV::Error, EOFError => e
|
|
373
|
+
raise if options[:on_bad_row] == :raise
|
|
374
|
+
|
|
375
|
+
handle_bad_row(e, line, bad_row_start_csv_line, bad_row_start_file_line, options)
|
|
376
|
+
next
|
|
377
|
+
end
|
|
130
378
|
|
|
131
379
|
# process the chunks or the resulting hash
|
|
132
380
|
if use_chunks
|
|
133
381
|
chunk << hash # append temp result to chunk
|
|
134
382
|
|
|
135
383
|
if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
|
|
384
|
+
_on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
|
|
136
385
|
# do something with the chunk
|
|
137
386
|
if block_given?
|
|
138
387
|
yield chunk, @chunk_count # do something with the hashes in the chunk in the block
|
|
@@ -157,10 +406,11 @@ module SmarterCSV
|
|
|
157
406
|
end
|
|
158
407
|
|
|
159
408
|
# print new line to retain last processing line message
|
|
160
|
-
print "\n" if @verbose
|
|
409
|
+
$stderr.print "\n" if @verbose == :debug
|
|
161
410
|
|
|
162
411
|
# handling of last chunk:
|
|
163
412
|
if !chunk.nil? && chunk.size > 0
|
|
413
|
+
_on_chunk&.call({ chunk_number: @chunk_count + 1, rows_in_chunk: chunk.size, total_rows_so_far: @csv_line_count })
|
|
164
414
|
# do something with the chunk
|
|
165
415
|
if block_given?
|
|
166
416
|
yield chunk, @chunk_count # do something with the hashes in the chunk in the block
|
|
@@ -170,6 +420,15 @@ module SmarterCSV
|
|
|
170
420
|
@chunk_count += 1
|
|
171
421
|
# chunk = [] # initialize for next chunk of data
|
|
172
422
|
end
|
|
423
|
+
|
|
424
|
+
if _on_complete
|
|
425
|
+
_on_complete.call({
|
|
426
|
+
total_rows: @csv_line_count,
|
|
427
|
+
total_chunks: @chunk_count,
|
|
428
|
+
duration: Process.clock_gettime(Process::CLOCK_MONOTONIC) - _start_time,
|
|
429
|
+
bad_rows: @errors[:bad_row_count] || 0,
|
|
430
|
+
})
|
|
431
|
+
end
|
|
173
432
|
ensure
|
|
174
433
|
fh.close if fh.respond_to?(:close)
|
|
175
434
|
end
|
|
@@ -254,11 +513,15 @@ module SmarterCSV
|
|
|
254
513
|
|
|
255
514
|
# Determine if a line has unbalanced quotes requiring multiline stitching.
|
|
256
515
|
# For :auto mode, uses dual counting to avoid false multiline detection.
|
|
516
|
+
# For :standard quote_boundary mode, uses a full state machine so that
|
|
517
|
+
# mid-field quotes (which are literals in standard mode) do not trigger stitching.
|
|
257
518
|
# Optimization #8: skip quote counting entirely when line has no quote chars.
|
|
258
519
|
def detect_multiline(line, options)
|
|
259
520
|
return false unless line.include?(options[:quote_char])
|
|
260
521
|
|
|
261
|
-
if options[:
|
|
522
|
+
if options[:quote_boundary] == :standard
|
|
523
|
+
detect_multiline_strict(line, options)
|
|
524
|
+
elsif options[:quote_escaping] == :auto
|
|
262
525
|
escaped_count, rfc_count = count_quote_chars_auto(line, options[:quote_char], options[:col_sep])
|
|
263
526
|
# If backslash-aware count is even → line is self-contained either way
|
|
264
527
|
# If backslash-aware count is odd AND rfc_count is also odd → truly multiline
|
|
@@ -270,6 +533,146 @@ module SmarterCSV
|
|
|
270
533
|
end
|
|
271
534
|
end
|
|
272
535
|
|
|
536
|
+
# Boundary-aware multiline detection for quote_boundary: :standard mode.
|
|
537
|
+
# Walks the line as a state machine tracking quote state only for boundary quotes.
|
|
538
|
+
# A quote only opens/closes a quoted field if it appears at a field boundary
|
|
539
|
+
# (start of field, or after leading whitespace when strip_whitespace is true).
|
|
540
|
+
# Mid-field quotes are treated as literals and do not affect quote state.
|
|
541
|
+
#
|
|
542
|
+
# Optimization #17: single-char col_sep fast path uses byteindex skip-ahead
|
|
543
|
+
# (mirrors Opt #10/#12 in parse_csv_line_ruby) so that:
|
|
544
|
+
# - inside a quoted field: jump directly to next quote char via C-level byteindex
|
|
545
|
+
# - inside an unquoted field: jump directly to next col_sep via C-level byteindex
|
|
546
|
+
# This makes detect_multiline_strict competitive with parse_csv_line_ruby on the same
|
|
547
|
+
# content, enabling it to serve as a cheap gate in the stitch loop (Opt #18).
|
|
548
|
+
# Returns true when +line+ ends inside a quoted field (caller must stitch the next
# physical line), false otherwise. Reads :col_sep, :quote_char, :strip_whitespace
# and :row_sep from +options+.
def detect_multiline_strict(line, options)
  col_sep = options[:col_sep]
  quote = options[:quote_char]
  strip = options[:strip_whitespace]
  row_sep = options[:row_sep]

  in_quotes = false
  field_started = false

  # The byte-level fast path compares single bytes, so it is only valid when both
  # the separator and the quote are exactly one BYTE long. A one-character but
  # multibyte separator/quote (e.g. "§" in UTF-8) must use the character path below.
  if col_sep.bytesize == 1 && quote.bytesize == 1
    # Fast path: byte-level scanning with byteindex skip-ahead (Opt #17)
    col_sep_byte = col_sep.getbyte(0)
    quote_byte = quote.getbyte(0)
    row_sep_bytesize = row_sep.is_a?(String) ? row_sep.bytesize : 0
    bytesize = line.bytesize
    byteindex_available = SmarterCSV::Parser::BYTEINDEX_AVAILABLE
    # String#byteindex raises IndexError when the offset falls inside a multibyte
    # character, which happens here after stepping one byte into a field that starts
    # with a non-ASCII character. Searching a binary view avoids the boundary check;
    # byte offsets are identical. ASCII-only lines need no copy.
    haystack = (byteindex_available && !line.ascii_only?) ? line.b : line
    i = 0

    while i < bytesize
      if in_quotes
        # Opt #10 mirror: jump directly to the next quote byte.
        # Fallback without byteindex: manual getbyte loop, kept inline for speed.
        next_q = if byteindex_available
                   haystack.byteindex(quote, i)
                 else
                   j = i
                   j += 1 while j < bytesize && line.getbyte(j) != quote_byte
                   j < bytesize ? j : nil
                 end
        return true if next_q.nil? # no closing quote → line is incomplete

        i = next_q
        b = quote_byte
      elsif field_started
        # Opt #12 mirror: unquoted field in progress — jump to the next col_sep.
        # Quotes skipped over here are mid-field literals and never change state.
        next_sep = if byteindex_available
                     haystack.byteindex(col_sep, i)
                   else
                     j = i
                     j += 1 while j < bytesize && line.getbyte(j) != col_sep_byte
                     j < bytesize ? j : nil
                   end
        break if next_sep.nil? # no more separators → end of line, not multiline

        i = next_sep
        b = col_sep_byte
      else
        b = line.getbyte(i)
      end

      if b == col_sep_byte && !in_quotes
        field_started = false
      elsif b == quote_byte
        if in_quotes
          # closing quote: only valid if followed by col_sep, row_sep, or end of line
          next_i = i + 1
          if next_i >= bytesize ||
             line.getbyte(next_i) == col_sep_byte ||
             (row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
            in_quotes = false
            field_started = true
          end
          # else: quote inside quoted field → literal (handles "" doubling)
        elsif !field_started # at field boundary: open quoted field
          in_quotes = true
          field_started = true
        end
        # else: mid-field quote → literal, no state change
      else
        unless in_quotes
          # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
          field_started = true unless strip && (b == 32 || b == 9) # ' ' == 32, '\t' == 9
          # rubocop:enable Style/MultipleComparison
        end
      end
      i += 1
    end
  else
    # Multi-char (or multibyte) col_sep / quote: character-by-character scan
    col_sep_size = col_sep.size
    row_sep_size = row_sep.is_a?(String) ? row_sep.size : 0
    line_size = line.size
    i = 0

    while i < line_size
      # Check for column separator (only outside quotes)
      if !in_quotes && line[i...i + col_sep_size] == col_sep
        field_started = false
        i += col_sep_size
        next
      end

      if line[i] == quote
        if in_quotes
          # closing quote: only valid if followed by col_sep, row_sep, or end of line
          next_i = i + 1
          if next_i >= line_size ||
             line[next_i...next_i + col_sep_size] == col_sep ||
             (row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
            in_quotes = false
            field_started = true
          end
          # else: quote inside quoted field → literal (handles "" doubling)
        elsif !field_started # at field boundary: open quoted field
          in_quotes = true
          field_started = true
        end
        # else: mid-field quote → literal, no state change
      elsif !in_quotes
        # Non-quote character: track whether field has started
        if strip
          # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
          field_started = true unless line[i] == ' ' || line[i] == "\t"
          # rubocop:enable Style/MultipleComparison
        else
          field_started = true
        end
      end
      i += 1
    end
  end

  in_quotes # true → line ends inside a quoted field → needs stitching
end
|
|
675
|
+
|
|
273
676
|
protected
|
|
274
677
|
|
|
275
678
|
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
|
|
@@ -302,9 +705,12 @@ module SmarterCSV
|
|
|
302
705
|
# we create additional columns on-the-fly when we find more data fields than headers
|
|
303
706
|
hash, data_size = parse_line_to_hash(line, @headers, options)
|
|
304
707
|
|
|
708
|
+
# Unclosed quote at end of line: signal caller to stitch next physical line
|
|
709
|
+
return :needs_more if data_size == -1
|
|
710
|
+
|
|
305
711
|
# Handle extra columns (more data fields than headers)
|
|
306
712
|
if data_size > @headers.size
|
|
307
|
-
if options[:
|
|
713
|
+
if options[:missing_headers] == :raise
|
|
308
714
|
raise SmarterCSV::HeaderSizeMismatch, "extra columns detected on line #{@file_line_count}"
|
|
309
715
|
end
|
|
310
716
|
|
|
@@ -317,19 +723,33 @@ module SmarterCSV
|
|
|
317
723
|
# if all values were blank (hash is nil) we ignore this CSV line
|
|
318
724
|
return nil if hash.nil?
|
|
319
725
|
|
|
726
|
+
# Apply column selection (only_headers / except_headers)
|
|
727
|
+
hash.select! { |k, _| @only_headers_set.include?(k) } if @only_headers_set
|
|
728
|
+
hash.reject! { |k, _| @except_headers_set.include?(k) } if @except_headers_set
|
|
729
|
+
|
|
320
730
|
# --- HASH TRANSFORMATIONS / POST-FILTERS --------------------------------
|
|
321
|
-
if
|
|
731
|
+
if @use_acceleration
|
|
322
732
|
# C already handled: remove_empty_values, convert_values_to_numeric, remove_zero_values.
|
|
323
|
-
#
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
733
|
+
# Remove nil/"" keys left by key_mapping or empty CSV headers.
|
|
734
|
+
if @delete_nil_keys
|
|
735
|
+
hash.delete(nil)
|
|
736
|
+
hash.delete('')
|
|
737
|
+
end
|
|
738
|
+
hash.delete(:"") if @delete_empty_keys
|
|
327
739
|
|
|
328
740
|
# Only these Ruby-only post-filters remain (user-provided Ruby objects):
|
|
329
|
-
if options[:
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
741
|
+
if (matcher = options[:nil_values_matching])
|
|
742
|
+
if options[:remove_empty_values]
|
|
743
|
+
hash.delete_if do |_k, v|
|
|
744
|
+
str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
|
|
745
|
+
str_val && matcher.match?(str_val)
|
|
746
|
+
end
|
|
747
|
+
else
|
|
748
|
+
hash.each_key do |k|
|
|
749
|
+
v = hash[k]
|
|
750
|
+
str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
|
|
751
|
+
hash[k] = nil if str_val && matcher.match?(str_val)
|
|
752
|
+
end
|
|
333
753
|
end
|
|
334
754
|
end
|
|
335
755
|
|
|
@@ -360,5 +780,33 @@ module SmarterCSV
|
|
|
360
780
|
|
|
361
781
|
line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
|
|
362
782
|
end
|
|
783
|
+
|
|
784
|
+
# Records one bad row: bumps @errors[:bad_row_count], builds an error record and
# dispatches it according to options[:on_bad_row] (:skip, :collect, or a callable).
# Raises TooManyBadRows once the count exceeds options[:bad_row_limit] (when set).
def handle_bad_row(error, line, start_csv_line, start_file_line, options)
  @errors[:bad_row_count] = (@errors[:bad_row_count] || 0) + 1

  # A stitched multiline row spans several physical lines; report how many.
  lines_consumed = @file_line_count - start_file_line + 1
  record = {
    csv_line_number: start_csv_line,
    file_line_number: start_file_line,
    file_lines_consumed: lines_consumed,
    error_class: error.class,
    error_message: error.message,
  }
  record[:raw_logical_line] = line if options[:collect_raw_lines]

  handler = options[:on_bad_row]
  case handler
  when :collect
    @errors[:bad_rows] ||= []
    @errors[:bad_rows] << record
  when :skip
    nil # already counted; nothing to store
  else
    # anything else is treated as a callable receiving the record
    handler.call(record)
  end

  limit = options[:bad_row_limit]
  return unless limit && @errors[:bad_row_count] > limit

  raise TooManyBadRows, "Bad row limit of #{options[:bad_row_limit]} exceeded (#{@errors[:bad_row_count]} bad rows encountered)"
end
|
|
363
811
|
end
|
|
364
812
|
end
|