smarter_csv 1.15.2 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +68 -1
  4. data/CONTRIBUTORS.md +3 -1
  5. data/Gemfile +1 -0
  6. data/README.md +123 -27
  7. data/docs/_introduction.md +40 -24
  8. data/docs/bad_row_quarantine.md +285 -0
  9. data/docs/basic_read_api.md +151 -9
  10. data/docs/basic_write_api.md +474 -59
  11. data/docs/batch_processing.md +161 -4
  12. data/docs/column_selection.md +183 -0
  13. data/docs/data_transformations.md +162 -29
  14. data/docs/examples.md +339 -46
  15. data/docs/header_transformations.md +93 -12
  16. data/docs/header_validations.md +56 -18
  17. data/docs/history.md +117 -0
  18. data/docs/instrumentation.md +165 -0
  19. data/docs/migrating_from_csv.md +290 -0
  20. data/docs/options.md +150 -87
  21. data/docs/parsing_strategy.md +63 -1
  22. data/docs/real_world_csv.md +262 -0
  23. data/docs/releases/1.16.0/benchmarks.md +223 -0
  24. data/docs/releases/1.16.0/changes.md +272 -0
  25. data/docs/releases/1.16.0/performance_notes.md +114 -0
  26. data/docs/row_col_sep.md +14 -5
  27. data/docs/value_converters.md +193 -57
  28. data/ext/smarter_csv/extconf.rb +3 -0
  29. data/ext/smarter_csv/smarter_csv.c +1007 -71
  30. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
  31. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
  32. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
  33. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
  34. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
  35. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
  36. data/lib/smarter_csv/errors.rb +8 -0
  37. data/lib/smarter_csv/file_io.rb +1 -1
  38. data/lib/smarter_csv/hash_transformations.rb +14 -13
  39. data/lib/smarter_csv/header_transformations.rb +21 -2
  40. data/lib/smarter_csv/headers.rb +2 -1
  41. data/lib/smarter_csv/options.rb +124 -7
  42. data/lib/smarter_csv/parser.rb +362 -75
  43. data/lib/smarter_csv/reader.rb +494 -46
  44. data/lib/smarter_csv/version.rb +1 -1
  45. data/lib/smarter_csv/writer.rb +71 -19
  46. data/lib/smarter_csv.rb +95 -12
  47. data/smarter_csv.gemspec +20 -10
  48. metadata +37 -80
@@ -13,44 +13,57 @@ module SmarterCSV
13
13
  DEFAULT_OPTIONS = {
14
14
  acceleration: true, # if user wants to use acceleration or not
15
15
  auto_row_sep_chars: 500,
16
+ bad_row_limit: nil,
16
17
  chunk_size: nil,
17
18
  col_sep: :auto, # was: ',',
19
+ collect_raw_lines: true,
18
20
  comment_regexp: nil, # was: /\A#/,
19
21
  convert_values_to_numeric: true,
20
22
  downcase_header: true,
21
23
  duplicate_header_suffix: '', # was: nil,
24
+ field_size_limit: nil, # Integer (bytes) or nil for no limit. Raises FieldSizeLimitExceeded if any
25
+ # extracted field exceeds this size. Prevents DoS from runaway quoted
26
+ # fields (unbounded multiline stitching) or huge inline payloads.
22
27
  file_encoding: 'utf-8',
23
28
  force_utf8: false,
24
29
  headers_in_file: true,
25
30
  invalid_byte_sequence: '',
26
31
  keep_original_headers: false,
27
32
  key_mapping: nil,
33
+ strict: false, # DEPRECATED -> use missing_headers
34
+ missing_headers: :auto, # :auto (auto-generate names for extra cols) or :raise (raise HeaderSizeMismatch)
28
35
  missing_header_prefix: 'column_',
36
+ nil_values_matching: nil, # regex: set matching values to nil (key kept); pairs with remove_empty_values
37
+ on_bad_row: :raise,
38
+ on_chunk: nil, # callable: fired after each chunk is parsed, before yielding to the block
39
+ on_complete: nil, # callable: fired once after the entire file is processed
40
+ on_start: nil, # callable: fired once before the first row is parsed
41
+ quote_boundary: :standard, # :standard (only at field boundary 👍) or :legacy (any quote toggles state 👎)
29
42
  quote_char: '"',
30
43
  quote_escaping: :auto,
31
44
  remove_empty_hashes: true,
32
45
  remove_empty_values: true,
33
46
  remove_unmapped_keys: false,
34
- remove_values_matching: nil,
47
+ remove_values_matching: nil, # DEPRECATED: use nil_values_matching instead
35
48
  remove_zero_values: false,
36
49
  required_headers: nil,
37
50
  required_keys: nil,
38
51
  row_sep: :auto, # was: $/,
39
52
  silence_missing_keys: false,
40
53
  skip_lines: nil,
41
- strict: false,
42
54
  strings_as_keys: false,
43
55
  strip_chars_from_headers: nil,
44
56
  strip_whitespace: true,
45
57
  user_provided_headers: nil,
46
58
  value_converters: nil,
47
- verbose: false,
59
+ verbose: :normal, # nil/:normal (default), :quiet (suppress warnings), :debug (print diagnostics); true/false are deprecated
48
60
  with_line_numbers: false,
49
61
  }.freeze
50
62
 
51
63
  # NOTE: this is not called when "parse" methods are tested by themselves
52
64
  def process_options(given_options = {})
53
- puts "User provided options:\n#{pp(given_options)}\n" if given_options[:verbose]
65
+ # Debug output before merge — check raw verbose value (true or :debug)
66
+ $stderr.puts "User provided options:\n#{pp(given_options)}\n" if [true, :debug].include?(given_options[:verbose])
54
67
 
55
68
  # Special case for :user_provided_headers:
56
69
  #
@@ -61,15 +74,95 @@ module SmarterCSV
61
74
  #
62
75
  if given_options[:user_provided_headers] && !given_options.keys.include?(:headers_in_file)
63
76
  given_options[:headers_in_file] = false
64
- puts "WARNING: setting `headers_in_file: false` as a precaution to not lose the first row. Set explicitly to `true` if you have headers."
77
+ warn "WARNING: setting `headers_in_file: false` as a precaution to not lose the first row. Set explicitly to `true` if you have headers." unless given_options[:verbose] == :quiet
65
78
  end
66
79
 
67
80
  @options = DEFAULT_OPTIONS.dup.merge!(given_options)
68
81
 
82
+ # Normalize verbose to a symbol — done once here, stored back into @options.
83
+ # All subsequent checks are free symbol comparisons; no re-evaluation needed.
84
+ # :quiet — suppress all warnings and notices (good for production)
85
+ # :normal — show behavioral warnings (default; helpful for new users)
86
+ # :debug — :normal + print computed options and per-row diagnostics
87
+ # nil is silently normalized to :normal; true/false are deprecated.
88
+ case @options[:verbose]
89
+ when :quiet, :normal, :debug
90
+ # keep as is
91
+ when nil
92
+ @options[:verbose] = :normal
93
+ when false
94
+ warn "DEPRECATION WARNING: verbose: false is deprecated. Use verbose: :normal instead (or omit — it is the default)."
95
+ @options[:verbose] = :normal
96
+ when true
97
+ warn "DEPRECATION WARNING: verbose: true is deprecated. Use verbose: :debug instead."
98
+ @options[:verbose] = :debug
99
+ else
100
+ warn "WARNING: unknown verbose value #{@options[:verbose].inspect}, defaulting to :normal. Valid values: :quiet, :normal, :debug."
101
+ @options[:verbose] = :normal
102
+ end
103
+
69
104
  # fix invalid input
70
105
  @options[:invalid_byte_sequence] ||= ''
71
106
 
72
- puts "Computed options:\n#{pp(@options)}\n" if @options[:verbose]
107
+ # Normalize headers: { only: [...] } / { except: [...] } to internal option names.
108
+ # The public API is headers: { only: } or headers: { except: }.
109
+ # Internally we use only_headers: / except_headers: (what the C extension reads).
110
+ if (hdr = @options.delete(:headers)).is_a?(Hash)
111
+ @options[:only_headers] = hdr[:only] if hdr.key?(:only)
112
+ @options[:except_headers] = hdr[:except] if hdr.key?(:except)
113
+ end
114
+
115
+ # Deprecation: direct use of only_headers: / except_headers: (use headers: { only: } instead)
116
+ if given_options.key?(:only_headers) && !given_options.key?(:headers)
117
+ warn "DEPRECATION WARNING: 'only_headers:' is deprecated. Use 'headers: { only: [...] }' instead." unless @options[:verbose] == :quiet
118
+ end
119
+ if given_options.key?(:except_headers) && !given_options.key?(:headers)
120
+ warn "DEPRECATION WARNING: 'except_headers:' is deprecated. Use 'headers: { except: [...] }' instead." unless @options[:verbose] == :quiet
121
+ end
122
+
123
+ # Normalize only_headers/except_headers to arrays of symbols (internal names, read by C extension)
124
+ if @options[:only_headers]
125
+ values = Array(@options[:only_headers])
126
+ bad = values.reject { |v| v.is_a?(Symbol) || v.is_a?(String) }
127
+ raise SmarterCSV::ValidationError, "headers: { only: } elements must be String or Symbol, got: #{bad.map(&:class).uniq.inspect}" if bad.any?
128
+ @options[:only_headers] = values.map(&:to_sym)
129
+ end
130
+ if @options[:except_headers]
131
+ values = Array(@options[:except_headers])
132
+ bad = values.reject { |v| v.is_a?(Symbol) || v.is_a?(String) }
133
+ raise SmarterCSV::ValidationError, "headers: { except: } elements must be String or Symbol, got: #{bad.map(&:class).uniq.inspect}" if bad.any?
134
+ @options[:except_headers] = values.map(&:to_sym)
135
+ end
136
+
137
+ # Deprecation: remove_values_matching → nil_values_matching
138
+ # Old behavior: removes the key-value pair entirely.
139
+ # New behavior: nil_values_matching sets the value to nil (key kept);
140
+ # combined with the default remove_empty_values: true the net effect is identical.
141
+ # With remove_empty_values: false, the key is retained with a nil value.
142
+ if given_options.key?(:remove_values_matching)
143
+ unless @options[:verbose] == :quiet
144
+ warn "DEPRECATION WARNING: 'remove_values_matching' is deprecated. " \
145
+ "Use 'nil_values_matching' instead. With the default 'remove_empty_values: true' " \
146
+ "the net behavior is identical. With 'remove_empty_values: false', matching values " \
147
+ "are set to nil but the key is retained in the result hash."
148
+ end
149
+ @options[:nil_values_matching] ||= @options[:remove_values_matching]
150
+ @options[:remove_values_matching] = nil # clear to prevent double-processing
151
+ end
152
+
153
+ # Translate deprecated :strict option to :missing_headers
154
+ if given_options.key?(:strict)
155
+ unless @options[:verbose] == :quiet
156
+ warn "DEPRECATION WARNING: 'strict' option is deprecated and will be removed in a future version. " \
157
+ "Use 'missing_headers: :raise' instead of 'strict: true', or 'missing_headers: :auto' instead of 'strict: false'."
158
+ end
159
+ @options[:missing_headers] = @options[:strict] ? :raise : :auto unless given_options.key?(:missing_headers)
160
+ end
161
+
162
+ # Keep :strict synchronized with :missing_headers (C extension reads :strict directly)
163
+ @options[:strict] = (@options[:missing_headers] == :raise)
164
+
165
+ $stderr.puts "Computed options:\n#{pp(@options)}\n" if @options[:verbose] == :debug
73
166
 
74
167
  validate_options!(@options)
75
168
  @options
@@ -80,7 +173,7 @@ module SmarterCSV
80
173
  def validate_options!(options)
81
174
  # deprecate required_headers
82
175
  unless options[:required_headers].nil?
83
- puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
176
+ warn "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'" unless options[:verbose] == :quiet
84
177
  if options[:required_keys].nil?
85
178
  options[:required_keys] = options[:required_headers]
86
179
  options[:required_headers] = nil
@@ -92,9 +185,33 @@ module SmarterCSV
92
185
  errors << "invalid row_sep" if keys.include?(:row_sep) && !option_valid?(options[:row_sep])
93
186
  errors << "invalid col_sep" if keys.include?(:col_sep) && !option_valid?(options[:col_sep])
94
187
  errors << "invalid quote_char" if keys.include?(:quote_char) && !option_valid?(options[:quote_char])
188
+ if keys.include?(:quote_char) && options[:quote_char].is_a?(String) && options[:quote_char].bytesize > 1
189
+ errors << "invalid quote_char: must be a single byte (got #{options[:quote_char].inspect})"
190
+ end
95
191
  unless %i[double_quotes backslash auto].include?(options[:quote_escaping])
96
192
  errors << "invalid quote_escaping: must be :double_quotes, :backslash, or :auto"
97
193
  end
194
+ unless %i[legacy standard].include?(options[:quote_boundary])
195
+ errors << "invalid quote_boundary: must be :legacy or :standard"
196
+ end
197
+ fsl = options[:field_size_limit]
198
+ unless fsl.nil? || (fsl.is_a?(Integer) && fsl > 0)
199
+ errors << "invalid field_size_limit: must be nil or a positive Integer (got #{fsl.inspect})"
200
+ end
201
+ obr = options[:on_bad_row]
202
+ unless %i[raise skip collect].include?(obr) || obr.respond_to?(:call)
203
+ errors << "invalid on_bad_row: must be :raise, :skip, :collect, or a callable"
204
+ end
205
+ %i[on_start on_chunk on_complete].each do |hook|
206
+ val = options[hook]
207
+ errors << "invalid #{hook}: must be nil or a callable" if !val.nil? && !val.respond_to?(:call)
208
+ end
209
+ unless %i[auto raise].include?(options[:missing_headers])
210
+ errors << "invalid missing_headers: must be :auto or :raise"
211
+ end
212
+ if options[:only_headers] && options[:except_headers]
213
+ errors << "cannot use both 'headers: { only: }' and 'headers: { except: }' at the same time"
214
+ end
98
215
  raise SmarterCSV::ValidationError, errors.inspect if errors.any?
99
216
  end
100
217