smarter_csv 1.16.4 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +10 -1
  3. data/CHANGELOG.md +54 -0
  4. data/Gemfile +10 -5
  5. data/README.md +98 -14
  6. data/TO_DO.md +109 -0
  7. data/docs/_introduction.md +1 -0
  8. data/docs/bad_row_quarantine.md +2 -1
  9. data/docs/basic_read_api.md +6 -1
  10. data/docs/basic_write_api.md +30 -0
  11. data/docs/batch_processing.md +25 -0
  12. data/docs/column_selection.md +1 -0
  13. data/docs/data_transformations.md +1 -0
  14. data/docs/examples.md +126 -0
  15. data/docs/header_transformations.md +23 -0
  16. data/docs/header_validations.md +1 -0
  17. data/docs/history.md +1 -0
  18. data/docs/instrumentation.md +2 -1
  19. data/docs/migrating_from_csv.md +1 -0
  20. data/docs/options.md +20 -18
  21. data/docs/parsing_strategy.md +1 -0
  22. data/docs/real_world_csv.md +51 -1
  23. data/docs/releases/1.16.0/performance_notes.md +15 -15
  24. data/docs/releases/1.17.0/benchmarks.md +121 -0
  25. data/docs/releases/1.17.0/changes.md +161 -0
  26. data/docs/releases/1.17.0/performance_notes.md +126 -0
  27. data/docs/row_col_sep.md +21 -1
  28. data/docs/ruby_csv_pitfalls.md +1 -0
  29. data/docs/value_converters.md +24 -0
  30. data/docs/warnings.md +141 -0
  31. data/ext/smarter_csv/smarter_csv.c +98 -32
  32. data/images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg +106 -0
  33. data/images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg +181 -0
  34. data/images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg +179 -0
  35. data/lib/smarter_csv/auto_detection.rb +215 -30
  36. data/lib/smarter_csv/file_io.rb +2 -2
  37. data/lib/smarter_csv/hash_transformations.rb +29 -13
  38. data/lib/smarter_csv/parser.rb +42 -33
  39. data/lib/smarter_csv/peekable_io.rb +453 -0
  40. data/lib/smarter_csv/reader.rb +119 -23
  41. data/lib/smarter_csv/reader_options.rb +61 -1
  42. data/lib/smarter_csv/version.rb +1 -1
  43. data/lib/smarter_csv.rb +40 -12
  44. metadata +12 -5
  45. data/TO_DO_v2.md +0 -14
  46. data/ext/smarter_csv/Makefile +0 -270
@@ -0,0 +1,179 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="820" height="668"
2
+ font-family="ui-monospace, 'Cascadia Code', 'Courier New', monospace" font-size="12">
3
+ <rect width="820" height="668" fill="#ffffff"/>
4
+ <text x="410" y="18" text-anchor="middle" font-size="13" font-weight="bold" fill="#212121">SmarterCSV improvements 1.15.2, 1.16.4, 1.17.0 vs 1.14.4 — Ruby (not accelerated)</text>
5
+ <text x="410" y="32" text-anchor="middle" font-size="10" fill="#9e9e9e">Speedup ratio = baseline version time ÷ newer version time (higher = newer version is faster)</text>
6
+ <text x="410" y="48" text-anchor="middle" font-size="11" fill="#616161">Ruby 3.4.7 [log scale, best of 40]</text>
7
+ <line x1="220" y1="68" x2="220" y2="580" stroke="#e0e0e0" stroke-width="1"/>
8
+ <text x="220" y="64" text-anchor="middle" font-size="11" fill="#757575">1×</text>
9
+ <line x1="323" y1="68" x2="323" y2="580" stroke="#e0e0e0" stroke-width="1"/>
10
+ <text x="323" y="64" text-anchor="middle" font-size="11" fill="#757575">2×</text>
11
+ <line x1="459" y1="68" x2="459" y2="580" stroke="#e0e0e0" stroke-width="1"/>
12
+ <text x="459" y="64" text-anchor="middle" font-size="11" fill="#757575">5×</text>
13
+ <line x1="561" y1="68" x2="561" y2="580" stroke="#e0e0e0" stroke-width="1"/>
14
+ <text x="561" y="64" text-anchor="middle" font-size="11" fill="#757575">10×</text>
15
+ <line x1="664" y1="68" x2="664" y2="580" stroke="#e0e0e0" stroke-width="1"/>
16
+ <text x="664" y="64" text-anchor="middle" font-size="11" fill="#757575">20×</text>
17
+ <line x1="800" y1="68" x2="800" y2="580" stroke="#e0e0e0" stroke-width="1"/>
18
+ <text x="800" y="64" text-anchor="middle" font-size="11" fill="#757575">50×</text>
19
+ <line x1="220" y1="68" x2="220" y2="580" stroke="#9e9e9e" stroke-width="1.5"/>
20
+ <line x1="220" y1="68" x2="800" y2="68" stroke="#bdbdbd" stroke-width="1"/>
21
+ <rect x="0" y="86" width="820" height="26" fill="#f5f5f5"/>
22
+ <text x="212" y="103" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_C</text>
23
+ <rect x="556" y="94" width="10" height="10" fill="#1565C0"/>
24
+ <text x="518" y="103" font-size="10" fill="#1565C0">10.0×</text>
25
+ <rect x="652" y="94" width="10" height="10" fill="#BF360C"/>
26
+ <rect x="658" y="94" width="10" height="10" fill="#2E7D32"/>
27
+ <text x="671" y="103" font-size="10" fill="#BF360C">19×</text>
28
+ <text x="698" y="103" font-size="10" fill="#2E7D32">20×</text>
29
+ <rect x="0" y="112" width="820" height="26" fill="#ffffff"/>
30
+ <text x="212" y="129" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NC</text>
31
+ <rect x="506" y="120" width="10" height="10" fill="#1565C0"/>
32
+ <text x="475" y="129" font-size="10" fill="#1565C0">7.1×</text>
33
+ <rect x="570" y="120" width="10" height="10" fill="#BF360C"/>
34
+ <rect x="586" y="120" width="10" height="10" fill="#2E7D32"/>
35
+ <text x="599" y="129" font-size="10" fill="#BF360C">11×</text>
36
+ <text x="626" y="129" font-size="10" fill="#2E7D32">12×</text>
37
+ <rect x="0" y="138" width="820" height="26" fill="#f5f5f5"/>
38
+ <text x="212" y="155" text-anchor="end" font-size="11" fill="#424242">whitespace_heavy_60k</text>
39
+ <rect x="538" y="146" width="10" height="10" fill="#1565C0"/>
40
+ <rect x="543" y="146" width="10" height="10" fill="#BF360C"/>
41
+ <rect x="572" y="146" width="10" height="10" fill="#2E7D32"/>
42
+ <text x="507" y="155" font-size="10" fill="#1565C0">8.8×</text>
43
+ <text x="585" y="155" font-size="10" fill="#BF360C">9.1×</text>
44
+ <text x="619" y="155" font-size="10" fill="#2E7D32">11×</text>
45
+ <rect x="0" y="164" width="820" height="26" fill="#ffffff"/>
46
+ <text x="212" y="181" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NB</text>
47
+ <rect x="525" y="172" width="10" height="10" fill="#1565C0"/>
48
+ <rect x="544" y="172" width="10" height="10" fill="#BF360C"/>
49
+ <rect x="563" y="172" width="10" height="10" fill="#2E7D32"/>
50
+ <text x="494" y="181" font-size="10" fill="#1565C0">8.1×</text>
51
+ <text x="576" y="181" font-size="10" fill="#BF360C">9.2×</text>
52
+ <text x="610" y="181" font-size="10" fill="#2E7D32">10×</text>
53
+ <rect x="0" y="190" width="820" height="26" fill="#f5f5f5"/>
54
+ <text x="212" y="207" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_B</text>
55
+ <rect x="525" y="198" width="10" height="10" fill="#1565C0"/>
56
+ <rect x="538" y="198" width="10" height="10" fill="#BF360C"/>
57
+ <rect x="550" y="198" width="10" height="10" fill="#2E7D32"/>
58
+ <text x="494" y="207" font-size="10" fill="#1565C0">8.1×</text>
59
+ <text x="563" y="207" font-size="10" fill="#BF360C">8.8×</text>
60
+ <text x="597" y="207" font-size="10" fill="#2E7D32">9.6×</text>
61
+ <rect x="0" y="216" width="820" height="26" fill="#ffffff"/>
62
+ <text x="212" y="233" text-anchor="end" font-size="11" fill="#424242">tab_separated_60k</text>
63
+ <rect x="504" y="224" width="10" height="10" fill="#1565C0"/>
64
+ <rect x="509" y="224" width="10" height="10" fill="#BF360C"/>
65
+ <rect x="539" y="224" width="10" height="10" fill="#2E7D32"/>
66
+ <text x="473" y="233" font-size="10" fill="#1565C0">7.0×</text>
67
+ <text x="552" y="233" font-size="10" fill="#BF360C">7.3×</text>
68
+ <text x="586" y="233" font-size="10" fill="#2E7D32">8.9×</text>
69
+ <rect x="0" y="242" width="820" height="26" fill="#f5f5f5"/>
70
+ <text x="212" y="259" text-anchor="end" font-size="11" fill="#424242">multi_char_separator_60k</text>
71
+ <rect x="466" y="250" width="10" height="10" fill="#1565C0"/>
72
+ <text x="435" y="259" font-size="10" fill="#1565C0">5.4×</text>
73
+ <rect x="503" y="250" width="10" height="10" fill="#BF360C"/>
74
+ <rect x="529" y="250" width="10" height="10" fill="#2E7D32"/>
75
+ <text x="542" y="259" font-size="10" fill="#BF360C">7.0×</text>
76
+ <text x="576" y="259" font-size="10" fill="#2E7D32">8.3×</text>
77
+ <rect x="0" y="268" width="820" height="26" fill="#ffffff"/>
78
+ <text x="212" y="285" text-anchor="end" font-size="11" fill="#424242">wide_500_cols_20k</text>
79
+ <rect x="483" y="276" width="10" height="10" fill="#1565C0"/>
80
+ <rect x="483" y="276" width="10" height="10" fill="#BF360C"/>
81
+ <text x="452" y="285" font-size="10" fill="#1565C0">6.1×</text>
82
+ <text x="496" y="285" font-size="10" fill="#BF360C">6.1×</text>
83
+ <rect x="516" y="276" width="10" height="10" fill="#2E7D32"/>
84
+ <text x="529" y="285" font-size="10" fill="#2E7D32">7.6×</text>
85
+ <rect x="0" y="294" width="820" height="26" fill="#f5f5f5"/>
86
+ <text x="212" y="311" text-anchor="end" font-size="11" fill="#424242">utf8_multibyte_60k</text>
87
+ <rect x="475" y="302" width="10" height="10" fill="#1565C0"/>
88
+ <rect x="483" y="302" width="10" height="10" fill="#BF360C"/>
89
+ <rect x="502" y="302" width="10" height="10" fill="#2E7D32"/>
90
+ <text x="444" y="311" font-size="10" fill="#1565C0">5.8×</text>
91
+ <text x="515" y="311" font-size="10" fill="#BF360C">6.1×</text>
92
+ <text x="549" y="311" font-size="10" fill="#2E7D32">6.9×</text>
93
+ <rect x="0" y="320" width="820" height="26" fill="#ffffff"/>
94
+ <text x="212" y="337" text-anchor="end" font-size="11" fill="#424242">many_empty_fields_60k</text>
95
+ <rect x="393" y="328" width="10" height="10" fill="#1565C0"/>
96
+ <text x="362" y="337" font-size="10" fill="#1565C0">3.3×</text>
97
+ <rect x="454" y="328" width="10" height="10" fill="#BF360C"/>
98
+ <rect x="474" y="328" width="10" height="10" fill="#2E7D32"/>
99
+ <text x="487" y="337" font-size="10" fill="#BF360C">5.0×</text>
100
+ <text x="521" y="337" font-size="10" fill="#2E7D32">5.7×</text>
101
+ <rect x="0" y="346" width="820" height="26" fill="#f5f5f5"/>
102
+ <text x="212" y="363" text-anchor="end" font-size="11" fill="#424242">sample_100k</text>
103
+ <rect x="434" y="354" width="10" height="10" fill="#1565C0"/>
104
+ <rect x="439" y="354" width="10" height="10" fill="#BF360C"/>
105
+ <rect x="456" y="354" width="10" height="10" fill="#2E7D32"/>
106
+ <text x="403" y="363" font-size="10" fill="#1565C0">4.4×</text>
107
+ <text x="469" y="363" font-size="10" fill="#BF360C">4.5×</text>
108
+ <text x="503" y="363" font-size="10" fill="#2E7D32">5.1×</text>
109
+ <rect x="0" y="372" width="820" height="26" fill="#ffffff"/>
110
+ <text x="212" y="389" text-anchor="end" font-size="11" fill="#424242">sensor_data_50krows_50cols</text>
111
+ <rect x="454" y="380" width="10" height="10" fill="#1565C0"/>
112
+ <rect x="455" y="380" width="10" height="10" fill="#2E7D32"/>
113
+ <rect x="456" y="380" width="10" height="10" fill="#BF360C"/>
114
+ <text x="423" y="389" font-size="10" fill="#1565C0">5.0×</text>
115
+ <text x="469" y="389" font-size="10" fill="#2E7D32">5.1×</text>
116
+ <text x="503" y="389" font-size="10" fill="#BF360C">5.1×</text>
117
+ <rect x="0" y="398" width="820" height="26" fill="#f5f5f5"/>
118
+ <text x="212" y="415" text-anchor="end" font-size="11" fill="#424242">long_fields_40k</text>
119
+ <rect x="301" y="406" width="10" height="10" fill="#1565C0"/>
120
+ <text x="270" y="415" font-size="10" fill="#1565C0">1.8×</text>
121
+ <rect x="416" y="406" width="10" height="10" fill="#BF360C"/>
122
+ <rect x="416" y="406" width="10" height="10" fill="#2E7D32"/>
123
+ <text x="429" y="415" font-size="10" fill="#BF360C">3.9×</text>
124
+ <text x="463" y="415" font-size="10" fill="#2E7D32">3.9×</text>
125
+ <rect x="0" y="424" width="820" height="26" fill="#ffffff"/>
126
+ <text x="212" y="441" text-anchor="end" font-size="11" fill="#424242">heavy_quoting_60k</text>
127
+ <rect x="268" y="432" width="10" height="10" fill="#1565C0"/>
128
+ <text x="237" y="441" font-size="10" fill="#1565C0">1.4×</text>
129
+ <rect x="371" y="432" width="10" height="10" fill="#BF360C"/>
130
+ <rect x="381" y="432" width="10" height="10" fill="#2E7D32"/>
131
+ <text x="394" y="441" font-size="10" fill="#BF360C">2.9×</text>
132
+ <text x="428" y="441" font-size="10" fill="#2E7D32">3.1×</text>
133
+ <rect x="0" y="450" width="820" height="26" fill="#f5f5f5"/>
134
+ <text x="212" y="467" text-anchor="end" font-size="11" fill="#424242">embedded_separators_60k</text>
135
+ <rect x="277" y="458" width="10" height="10" fill="#1565C0"/>
136
+ <text x="246" y="467" font-size="10" fill="#1565C0">1.5×</text>
137
+ <rect x="364" y="458" width="10" height="10" fill="#BF360C"/>
138
+ <rect x="374" y="458" width="10" height="10" fill="#2E7D32"/>
139
+ <text x="387" y="467" font-size="10" fill="#BF360C">2.7×</text>
140
+ <text x="421" y="467" font-size="10" fill="#2E7D32">2.9×</text>
141
+ <rect x="0" y="476" width="820" height="26" fill="#ffffff"/>
142
+ <text x="212" y="493" text-anchor="end" font-size="11" fill="#424242">worldcities</text>
143
+ <rect x="270" y="484" width="10" height="10" fill="#1565C0"/>
144
+ <text x="239" y="493" font-size="10" fill="#1565C0">1.4×</text>
145
+ <rect x="366" y="484" width="10" height="10" fill="#BF360C"/>
146
+ <rect x="372" y="484" width="10" height="10" fill="#2E7D32"/>
147
+ <text x="385" y="493" font-size="10" fill="#BF360C">2.8×</text>
148
+ <text x="419" y="493" font-size="10" fill="#2E7D32">2.9×</text>
149
+ <rect x="0" y="502" width="820" height="26" fill="#f5f5f5"/>
150
+ <text x="212" y="519" text-anchor="end" font-size="11" fill="#424242">uscities</text>
151
+ <rect x="269" y="510" width="10" height="10" fill="#1565C0"/>
152
+ <text x="238" y="519" font-size="10" fill="#1565C0">1.4×</text>
153
+ <rect x="359" y="510" width="10" height="10" fill="#BF360C"/>
154
+ <rect x="365" y="510" width="10" height="10" fill="#2E7D32"/>
155
+ <text x="378" y="519" font-size="10" fill="#BF360C">2.6×</text>
156
+ <text x="412" y="519" font-size="10" fill="#2E7D32">2.8×</text>
157
+ <rect x="0" y="528" width="820" height="26" fill="#ffffff"/>
158
+ <text x="212" y="545" text-anchor="end" font-size="11" fill="#424242">uszips</text>
159
+ <rect x="273" y="536" width="10" height="10" fill="#1565C0"/>
160
+ <text x="242" y="545" font-size="10" fill="#1565C0">1.5×</text>
161
+ <rect x="356" y="536" width="10" height="10" fill="#BF360C"/>
162
+ <rect x="361" y="536" width="10" height="10" fill="#2E7D32"/>
163
+ <text x="374" y="545" font-size="10" fill="#BF360C">2.6×</text>
164
+ <text x="408" y="545" font-size="10" fill="#2E7D32">2.7×</text>
165
+ <rect x="0" y="554" width="820" height="26" fill="#f5f5f5"/>
166
+ <text x="212" y="571" text-anchor="end" font-size="11" fill="#424242">embedded_newlines_60k</text>
167
+ <rect x="315" y="562" width="10" height="10" fill="#1565C0"/>
168
+ <rect x="341" y="562" width="10" height="10" fill="#BF360C"/>
169
+ <rect x="346" y="562" width="10" height="10" fill="#2E7D32"/>
170
+ <text x="284" y="571" font-size="10" fill="#1565C0">2.0×</text>
171
+ <text x="359" y="571" font-size="10" fill="#BF360C">2.3×</text>
172
+ <text x="393" y="571" font-size="10" fill="#2E7D32">2.4×</text>
173
+ <rect x="223" y="589" width="10" height="10" fill="#1565C0"/>
174
+ <text x="240" y="598" font-size="11" fill="#1565C0">Ruby path (v1.15.2)</text>
175
+ <rect x="223" y="609" width="10" height="10" fill="#BF360C"/>
176
+ <text x="240" y="618" font-size="11" fill="#BF360C">Ruby path (v1.16.4)</text>
177
+ <rect x="223" y="629" width="10" height="10" fill="#2E7D32"/>
178
+ <text x="240" y="638" font-size="11" fill="#2E7D32">Ruby path (v1.17.0)</text>
179
+ </svg>
@@ -8,8 +8,6 @@ module SmarterCSV
8
8
  # Otherwise guesses column separator from contents.
9
9
  # Raises exception if none is found.
10
10
  def guess_column_separator(filehandle, options)
11
- skip_lines(filehandle, options)
12
-
13
11
  delimiters = [',', "\t", ';', ':', '|']
14
12
 
15
13
  line = nil
@@ -29,11 +27,13 @@ module SmarterCSV
29
27
  candidates[d] += non_quoted_text.scan(d).count
30
28
  end
31
29
  end
32
- rewind(filehandle)
30
+ # No lines were read at all — empty file or stream.
31
+ # Return a safe default and let process_headers raise EmptyFileError.
32
+ return ',' if line.nil?
33
33
 
34
34
  if candidates.values.max == 0
35
35
  # if the header only contains word characters and whitespace, assume comma separator
36
- return ',' if line && line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
36
+ return ',' if line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
37
37
 
38
38
  raise SmarterCSV::NoColSepDetected
39
39
  end
@@ -41,38 +41,223 @@ module SmarterCSV
41
41
  candidates.key(candidates.values.max)
42
42
  end
43
43
 
44
- # limitation: this currently reads the whole file in before making a decision
44
+ # Lower bound on auto_row_sep_chars. Below this the initial scan would be
45
+ # too small to reliably catch a row separator on the very first read on
46
+ # most real-world CSV files. Most well-formed CSVs reveal a clear majority
47
+ # within ~200 bytes; 512 gives comfortable headroom while keeping cheap
48
+ # files cheap.
49
+ MIN_AUTO_ROW_SEP_CHARS = 512
50
+
51
+ # Default auto_row_sep_chars. Sized to cover wide-header CSVs (e.g. 100+
52
+ # columns) in a single read so escalation rarely fires. 4096 also matches
53
+ # a typical filesystem block, so the OS-level read cost is the same as a
54
+ # smaller request.
55
+ DEFAULT_AUTO_ROW_SEP_CHARS = 4096
56
+
57
+ # Upper bound on auto_row_sep_chars. Serves three roles:
58
+ # 1. The hard cap on the user-facing `auto_row_sep_chars` option.
59
+ # 2. The cap on the doubling escalation inside guess_line_ending —
60
+ # a single chunk read never exceeds this.
61
+ # 3. The hard cap on total bytes scanned during auto-detection
62
+ # (`break if total_bytes >= MAX_AUTO_ROW_SEP_CHARS`).
63
+ # All three roles use the same value because beyond this point further
64
+ # scanning would not improve detection accuracy and only delays parsing.
65
+ MAX_AUTO_ROW_SEP_CHARS = 65_536
66
+
67
+ # Guess the row separator ("\n", "\r\n", or "\r") by counting occurrences
68
+ # outside of quoted regions, scanning each chunk only once and accumulating
69
+ # counts across iterations.
70
+ #
71
+ # Reads one chunk of options[:auto_row_sep_chars] bytes at a time and
72
+ # grows up to MAX_AUTO_ROW_SEP_CHARS bytes total while no candidate has
73
+ # a clear majority (count > sum of the others).
74
+ #
75
+ # State carried across iterations:
76
+ # * crlf, lf, cr — running counts; never reset
77
+ # * in_quote — true if a previous chunk ended inside a quoted region
78
+ # * pending_cr — true if a previous chunk's last byte was a lone "\r"
79
+ # (deferred so it can pair with a leading "\n" of the
80
+ # next chunk without an extra read).
81
+ #
82
+ # Falls back to "\n" (and emits a warning unless verbose: :quiet) when:
83
+ # * no known separator is found within MAX_AUTO_ROW_SEP_CHARS bytes — e.g. a file
84
+ # that uses an exotic separator like "\u2028"; or
85
+ # * a tie between candidates persists across MAX_AUTO_ROW_SEP_CHARS bytes.
86
+ #
87
+ # The fallback preserves 14 years of permissive behavior; the warning lets
88
+ # infrastructure code (logs, captured stderr) surface the ambiguity.
45
89
  def guess_line_ending(filehandle, options)
46
- counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
47
- quoted_char = false
48
-
49
- # count how many of the pre-defined line-endings we find
50
- # ignoring those contained within quote characters
51
- last_char = nil
52
- lines = 0
53
- filehandle.each_char do |c|
54
- quoted_char = !quoted_char if c == options[:quote_char]
55
- next if quoted_char
56
-
57
- if last_char == "\r"
58
- if c == "\n"
59
- counts["\r\n"] += 1
90
+ q = Regexp.escape(options[:quote_char])
91
+ # Combined regex: matches complete "..." pairs AND unclosed "...\z (open
92
+ # quote followed by content to end of string). One gsub pass strips both
93
+ # cases; quote count parity tells us whether an unclosed open existed.
94
+ # /n flag: byte-level matching, encoding-agnostic.
95
+ quoted_re = /#{q}[^#{q}]*(?:#{q}|\z)/n
96
+ quote_str = options[:quote_char].b
97
+ # Adaptive doubling: the first read is auto_row_sep_chars bytes (default 4096).
98
+ # Iter 2 reuses the same size so files with a clear separator slightly past
99
+ # the initial chunk resolve cheaply; iter 3+ doubles each iteration up to
100
+ # MAX_AUTO_ROW_SEP_CHARS.
101
+ #
102
+ # Read pattern with default auto_row_sep_chars = 4096:
103
+ # iter | chunk | cumulative
104
+ # 1 | 4096 | 4096
105
+ # 2 | 4096 | 8192
106
+ # 3 | 8192 | 16384
107
+ # 4 | 16384 | 32768
108
+ # 5 | 32768 | 65536 (loop ends at MAX_AUTO_ROW_SEP_CHARS)
109
+ #
110
+ # MIN_AUTO_ROW_SEP_CHARS is the defensive floor — catches direct callers
111
+ # that bypass option validation (e.g. tests calling via send). Through the
112
+ # public process_options pipeline, validation already enforces this floor,
113
+ # so this .max is inert in normal use.
114
+ chunk_size = [options[:auto_row_sep_chars].to_i, MIN_AUTO_ROW_SEP_CHARS].max
115
+ chunk_size = [chunk_size, MAX_AUTO_ROW_SEP_CHARS].min # Defensive cap: do not read beyond MAX
116
+ bytes_read = false
117
+ total_bytes = 0
118
+ crlf = lf = cr = 0
119
+ in_quote = false # carries across chunks; an open quote with no close
120
+ pending_cr = false # carries across chunks; "\r" deferred for "\r\n" pairing
121
+ iter = 0
122
+
123
+ loop do
124
+ part = filehandle.read(chunk_size)
125
+ break if part.nil? || part.empty?
126
+
127
+ bytes_read = true
128
+ total_bytes += part.bytesize
129
+
130
+ # Resolve a "\r" left pending from the previous chunk's last byte.
131
+ # If the new chunk starts with "\n", the pair is "\r\n"; otherwise
132
+ # the deferred "\r" was a lone "\r" and the new first byte is
133
+ # processed below. (pending_cr and in_quote can never both be true
134
+ # at the start of an iteration — see the open-quote handling below.)
135
+ if pending_cr
136
+ pending_cr = false
137
+ if part.getbyte(0) == 0x0A # \n
138
+ crlf += 1
139
+ part = part.byteslice(1, part.bytesize - 1)
60
140
  else
61
- counts["\r"] += 1 # \r are counted after they appeared
141
+ cr += 1
142
+ # part stays as-is; the new first byte is processed below.
62
143
  end
63
- elsif c == "\n"
64
- counts["\n"] += 1
65
144
  end
66
- last_char = c
67
- lines += 1
68
- break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
145
+
146
+ # Fast path: chunk has no quote char AND we're not carrying an open
147
+ # quote from a previous chunk. Skip the gsub + index + .b machinery
148
+ # and count separators directly — most CSV chunks contain no quote
149
+ # chars. (`include?` is one C-level byte scan, vs gsub + index = two
150
+ # passes plus a string copy.)
151
+ if !in_quote && !part.include?(quote_str)
152
+ unquoted = part
153
+ if unquoted.end_with?("\r")
154
+ pending_cr = true
155
+ unquoted = unquoted.byteslice(0, unquoted.bytesize - 1)
156
+ end
157
+ delta_crlf = unquoted.scan("\r\n").size
158
+ delta_lf = unquoted.count("\n") - delta_crlf
159
+ delta_cr = unquoted.count("\r") - delta_crlf
160
+ crlf += delta_crlf
161
+ lf += delta_lf
162
+ cr += delta_cr
163
+ else
164
+ # Slow path: chunk contains quote chars or we're carrying in_quote
165
+ # state from a previous chunk. Convert to binary so index/byteslice
166
+ # are byte-level (safe even with multibyte UTF-8 content before the
167
+ # quote position).
168
+ part = part.b
169
+
170
+ if in_quote
171
+ close_idx = part.index(quote_str)
172
+ if close_idx
173
+ in_quote = false
174
+ part = part.byteslice(close_idx + 1, part.bytesize - close_idx - 1)
175
+ else
176
+ # Whole chunk is still inside the quote.
177
+ part = nil
178
+ end
179
+ end
180
+
181
+ if part && !part.empty?
182
+ # Single regex pass: gsub with the combined regex strips every
183
+ # complete "..." pair AND, if there's an unclosed open quote at
184
+ # the end, strips "...\z too. After this, no quote chars remain
185
+ # in `unquoted`.
186
+ unquoted = part.gsub(quoted_re, '')
187
+
188
+ # Parity check on the original chunk's quote count: an odd count
189
+ # means an unclosed open quote existed (and the gsub stripped its
190
+ # content along with the open). Set in_quote so the next chunk
191
+ # will look for the close. (count is a fast C-level byte scan.)
192
+ in_quote = true if part.count(quote_str).odd?
193
+
194
+ if unquoted.end_with?("\r".b)
195
+ if in_quote
196
+ # The byte right after this trailing "\r" was the open quote
197
+ # char (NOT "\n"), so the "\r" is a lone cr — count it now.
198
+ # Deferring would mispair against the next chunk's first
199
+ # byte, which is inside the (now-open) quoted region.
200
+ cr += 1
201
+ else
202
+ # No open quote — safe to defer trailing "\r" so it can pair
203
+ # with the next chunk's leading "\n" if any.
204
+ pending_cr = true
205
+ end
206
+ unquoted = unquoted.byteslice(0, unquoted.bytesize - 1)
207
+ end
208
+
209
+ # Count separators in the new bytes and add to running totals.
210
+ delta_crlf = unquoted.scan("\r\n".b).size
211
+ delta_lf = unquoted.count("\n") - delta_crlf
212
+ delta_cr = unquoted.count("\r") - delta_crlf
213
+ crlf += delta_crlf
214
+ lf += delta_lf
215
+ cr += delta_cr
216
+ end
217
+ end
218
+
219
+ # Clear majority: winner strictly greater than the sum of the others.
220
+ return "\r\n" if crlf > lf + cr
221
+ return "\n" if lf > crlf + cr
222
+ return "\r" if cr > crlf + lf
223
+
224
+ break if total_bytes >= MAX_AUTO_ROW_SEP_CHARS
225
+
226
+ # Iter 2 keeps the iter-1 chunk size; iter 3+ doubles each iteration,
227
+ # capped at MAX_AUTO_ROW_SEP_CHARS.
228
+ iter += 1
229
+ chunk_size = [chunk_size * 2, MAX_AUTO_ROW_SEP_CHARS].min if iter >= 2
69
230
  end
70
- rewind(filehandle)
71
231
 
72
- counts["\r"] += 1 if last_char == "\r"
73
- # find the most frequent key/value pair:
74
- most_frequent_key, _count = counts.max_by{|_, v| v}
75
- most_frequent_key
232
+ # Empty stream: return a harmless fallback; downstream raises EmptyFileError.
233
+ return "\n" unless bytes_read
234
+
235
+ # If we exited with a deferred "\r" (EOF or cap reached and no following
236
+ # byte to pair it with), count it as a lone "\r" now and re-check majority.
237
+ # Without this, a file ending in a lone "\r" with no other separators would
238
+ # fall through to the no-clear-row-sep warning instead of returning "\r".
239
+ if pending_cr
240
+ cr += 1
241
+ return "\r\n" if crlf > lf + cr
242
+ return "\n" if lf > crlf + cr
243
+ return "\r" if cr > crlf + lf
244
+ end
245
+
246
+ unless options[:verbose] == :quiet
247
+ if crlf == 0 && lf == 0 && cr == 0
248
+ record_warning(type: :row_sep, code: :no_row_sep_found, severity: :error) do
249
+ "no row separator found in first #{total_bytes} bytes; " \
250
+ "defaulting to \"\\n\". Pass row_sep: explicitly if this is wrong."
251
+ end
252
+ else
253
+ record_warning(type: :row_sep, code: :no_clear_row_sep, severity: :error) do
254
+ "no clear row separator in first #{total_bytes} bytes " \
255
+ "(saw #{lf}×\"\\n\", #{crlf}×\"\\r\\n\", #{cr}×\"\\r\"); defaulting to \"\\n\". " \
256
+ "Pass row_sep: explicitly if this is wrong."
257
+ end
258
+ end
259
+ end
260
+ "\n"
76
261
  end
77
262
  end
78
263
  end
@@ -20,10 +20,10 @@ module SmarterCSV
20
20
  end
21
21
  end
22
22
 
23
- def rewind(filehandle)
23
+ def rewind_buffer(filehandle)
24
24
  @file_line_count = 0
25
25
  @csv_line_count = 0
26
- filehandle.rewind
26
+ filehandle.rewind_buffer # this is PeekableIO.rewind_buffer, not io.rewind !
27
27
  end
28
28
 
29
29
  private
@@ -3,9 +3,17 @@
3
3
  module SmarterCSV
4
4
  module HashTransformations
5
5
  # Frozen regex constants for performance (avoid recompilation on every value)
6
- FLOAT_REGEX = /\A[+-]?\d+\.\d+\z/.freeze
7
- INTEGER_REGEX = /\A[+-]?\d+\z/.freeze
8
- ZERO_REGEX = /\A0+(?:\.0+)?\z/.freeze
6
+ NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?\z/.freeze
7
+ # FLOAT_REGEX = /\A[+-]?\d+\.\d+\z/.freeze
8
+ # INTEGER_REGEX = /\A[+-]?\d+\z/.freeze
9
+ ZERO_REGEX = /\A[+-]?0+(?:\.0+)?\z/.freeze # could be +0.0
10
+
11
+ # First-byte values that can begin a numeric literal — used to skip the numeric
12
+ # regexes for values that obviously aren't numbers (e.g. city names).
13
+ ZERO_BYTE = '0'.ord # 48
14
+ NINE_BYTE = '9'.ord # 57
15
+ PLUS_BYTE = '+'.ord # 43
16
+ MINUS_BYTE = '-'.ord # 45
9
17
 
10
18
  def hash_transformations(hash, options)
11
19
  # Modify hash in-place for performance (avoids allocating a second hash per row)
@@ -24,7 +32,11 @@ module SmarterCSV
24
32
  # Early return if no transformations needed
25
33
  return hash unless remove_empty_values || remove_zero_values || nil_values_matching || convert_to_numeric || value_converters
26
34
 
27
- keys_to_delete = []
35
+ # {only:}/{except:} limits on numeric conversion apply only when the option is a Hash;
36
+ # in the common case (true/false) skip the per-key check entirely.
37
+ numeric_has_limits = convert_to_numeric.is_a?(Hash)
38
+ rails = has_rails
39
+ keys_to_delete = nil # lazily allocated only if something is actually removed
28
40
 
29
41
  hash.each do |k, v|
30
42
  # Nil-ify values matching the pattern (keeps the key; remove_empty_values handles deletion)
@@ -39,23 +51,27 @@ module SmarterCSV
39
51
 
40
52
  # Check if this key/value should be removed
41
53
  # Note: numeric values (Integer/Float) are never blank, so skip the blank check for them
42
- if remove_empty_values && !v.is_a?(Numeric) && (has_rails ? v.blank? : blank?(v))
43
- keys_to_delete << k
54
+ if remove_empty_values && !v.is_a?(Numeric) && (rails ? v.blank? : blank?(v))
55
+ (keys_to_delete ||= []) << k
44
56
  next
45
57
  end
46
58
 
47
59
  # Handle both string zeros ("0", "0.0") and numeric zeros (already converted by C)
48
60
  if remove_zero_values && ((v.is_a?(String) && ZERO_REGEX.match?(v)) || (v.is_a?(Numeric) && v == 0))
49
- keys_to_delete << k
61
+ (keys_to_delete ||= []) << k
50
62
  next
51
63
  end
52
64
 
53
65
  # Convert to numeric if requested
54
- if convert_to_numeric && v.is_a?(String) && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
55
- if FLOAT_REGEX.match?(v)
56
- hash[k] = v.to_f
57
- elsif INTEGER_REGEX.match?(v)
58
- hash[k] = v.to_i
66
+ if convert_to_numeric && v.is_a?(String) &&
67
+ (!numeric_has_limits || !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k))
68
+ # Fast-reject: the string is already stripped and NUMERIC_REGEX is \A-anchored on a digit or sign,
69
+ # so a value whose first byte isn't a digit, '+', or '-' cannot be numeric — skip the regex entirely.
70
+ first_byte = v.getbyte(0)
71
+ if first_byte && ((first_byte >= ZERO_BYTE && first_byte <= NINE_BYTE) || first_byte == MINUS_BYTE || first_byte == PLUS_BYTE)
72
+ if NUMERIC_REGEX.match?(v)
73
+ hash[k] = v.include?('.') ? v.to_f : v.to_i
74
+ end
59
75
  end
60
76
  end
61
77
 
@@ -67,7 +83,7 @@ module SmarterCSV
67
83
  end
68
84
 
69
85
  # Delete marked keys
70
- keys_to_delete.each { |k| hash.delete(k) }
86
+ keys_to_delete&.each { |key| hash.delete(key) }
71
87
 
72
88
  hash
73
89
  end