smarter_csv 1.16.4 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -1
- data/CHANGELOG.md +54 -0
- data/Gemfile +10 -5
- data/README.md +98 -14
- data/TO_DO.md +109 -0
- data/docs/_introduction.md +1 -0
- data/docs/bad_row_quarantine.md +2 -1
- data/docs/basic_read_api.md +6 -1
- data/docs/basic_write_api.md +30 -0
- data/docs/batch_processing.md +25 -0
- data/docs/column_selection.md +1 -0
- data/docs/data_transformations.md +1 -0
- data/docs/examples.md +126 -0
- data/docs/header_transformations.md +23 -0
- data/docs/header_validations.md +1 -0
- data/docs/history.md +1 -0
- data/docs/instrumentation.md +2 -1
- data/docs/migrating_from_csv.md +1 -0
- data/docs/options.md +20 -18
- data/docs/parsing_strategy.md +1 -0
- data/docs/real_world_csv.md +51 -1
- data/docs/releases/1.16.0/performance_notes.md +15 -15
- data/docs/releases/1.17.0/benchmarks.md +121 -0
- data/docs/releases/1.17.0/changes.md +161 -0
- data/docs/releases/1.17.0/performance_notes.md +126 -0
- data/docs/row_col_sep.md +21 -1
- data/docs/ruby_csv_pitfalls.md +1 -0
- data/docs/value_converters.md +24 -0
- data/docs/warnings.md +141 -0
- data/ext/smarter_csv/smarter_csv.c +98 -32
- data/images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg +106 -0
- data/images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg +181 -0
- data/images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg +179 -0
- data/lib/smarter_csv/auto_detection.rb +215 -30
- data/lib/smarter_csv/file_io.rb +2 -2
- data/lib/smarter_csv/hash_transformations.rb +29 -13
- data/lib/smarter_csv/parser.rb +42 -33
- data/lib/smarter_csv/peekable_io.rb +453 -0
- data/lib/smarter_csv/reader.rb +119 -23
- data/lib/smarter_csv/reader_options.rb +61 -1
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +40 -12
- metadata +12 -5
- data/TO_DO_v2.md +0 -14
- data/ext/smarter_csv/Makefile +0 -270
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
<svg xmlns="http://www.w3.org/2000/svg" width="820" height="668"
|
|
2
|
+
font-family="ui-monospace, 'Cascadia Code', 'Courier New', monospace" font-size="12">
|
|
3
|
+
<rect width="820" height="668" fill="#ffffff"/>
|
|
4
|
+
<text x="410" y="18" text-anchor="middle" font-size="13" font-weight="bold" fill="#212121">SmarterCSV improvements 1.15.2, 1.16.4, 1.17.0 vs 1.14.4 — Ruby (not accelerated)</text>
|
|
5
|
+
<text x="410" y="32" text-anchor="middle" font-size="10" fill="#9e9e9e">Speedup ratio = baseline version time ÷ newer version time (higher = newer version is faster)</text>
|
|
6
|
+
<text x="410" y="48" text-anchor="middle" font-size="11" fill="#616161">Ruby 3.4.7 [log scale, best of 40]</text>
|
|
7
|
+
<line x1="220" y1="68" x2="220" y2="580" stroke="#e0e0e0" stroke-width="1"/>
|
|
8
|
+
<text x="220" y="64" text-anchor="middle" font-size="11" fill="#757575">1×</text>
|
|
9
|
+
<line x1="323" y1="68" x2="323" y2="580" stroke="#e0e0e0" stroke-width="1"/>
|
|
10
|
+
<text x="323" y="64" text-anchor="middle" font-size="11" fill="#757575">2×</text>
|
|
11
|
+
<line x1="459" y1="68" x2="459" y2="580" stroke="#e0e0e0" stroke-width="1"/>
|
|
12
|
+
<text x="459" y="64" text-anchor="middle" font-size="11" fill="#757575">5×</text>
|
|
13
|
+
<line x1="561" y1="68" x2="561" y2="580" stroke="#e0e0e0" stroke-width="1"/>
|
|
14
|
+
<text x="561" y="64" text-anchor="middle" font-size="11" fill="#757575">10×</text>
|
|
15
|
+
<line x1="664" y1="68" x2="664" y2="580" stroke="#e0e0e0" stroke-width="1"/>
|
|
16
|
+
<text x="664" y="64" text-anchor="middle" font-size="11" fill="#757575">20×</text>
|
|
17
|
+
<line x1="800" y1="68" x2="800" y2="580" stroke="#e0e0e0" stroke-width="1"/>
|
|
18
|
+
<text x="800" y="64" text-anchor="middle" font-size="11" fill="#757575">50×</text>
|
|
19
|
+
<line x1="220" y1="68" x2="220" y2="580" stroke="#9e9e9e" stroke-width="1.5"/>
|
|
20
|
+
<line x1="220" y1="68" x2="800" y2="68" stroke="#bdbdbd" stroke-width="1"/>
|
|
21
|
+
<rect x="0" y="86" width="820" height="26" fill="#f5f5f5"/>
|
|
22
|
+
<text x="212" y="103" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_C</text>
|
|
23
|
+
<rect x="556" y="94" width="10" height="10" fill="#1565C0"/>
|
|
24
|
+
<text x="518" y="103" font-size="10" fill="#1565C0">10.0×</text>
|
|
25
|
+
<rect x="652" y="94" width="10" height="10" fill="#BF360C"/>
|
|
26
|
+
<rect x="658" y="94" width="10" height="10" fill="#2E7D32"/>
|
|
27
|
+
<text x="671" y="103" font-size="10" fill="#BF360C">19×</text>
|
|
28
|
+
<text x="698" y="103" font-size="10" fill="#2E7D32">20×</text>
|
|
29
|
+
<rect x="0" y="112" width="820" height="26" fill="#ffffff"/>
|
|
30
|
+
<text x="212" y="129" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NC</text>
|
|
31
|
+
<rect x="506" y="120" width="10" height="10" fill="#1565C0"/>
|
|
32
|
+
<text x="475" y="129" font-size="10" fill="#1565C0">7.1×</text>
|
|
33
|
+
<rect x="570" y="120" width="10" height="10" fill="#BF360C"/>
|
|
34
|
+
<rect x="586" y="120" width="10" height="10" fill="#2E7D32"/>
|
|
35
|
+
<text x="599" y="129" font-size="10" fill="#BF360C">11×</text>
|
|
36
|
+
<text x="626" y="129" font-size="10" fill="#2E7D32">12×</text>
|
|
37
|
+
<rect x="0" y="138" width="820" height="26" fill="#f5f5f5"/>
|
|
38
|
+
<text x="212" y="155" text-anchor="end" font-size="11" fill="#424242">whitespace_heavy_60k</text>
|
|
39
|
+
<rect x="538" y="146" width="10" height="10" fill="#1565C0"/>
|
|
40
|
+
<rect x="543" y="146" width="10" height="10" fill="#BF360C"/>
|
|
41
|
+
<rect x="572" y="146" width="10" height="10" fill="#2E7D32"/>
|
|
42
|
+
<text x="507" y="155" font-size="10" fill="#1565C0">8.8×</text>
|
|
43
|
+
<text x="585" y="155" font-size="10" fill="#BF360C">9.1×</text>
|
|
44
|
+
<text x="619" y="155" font-size="10" fill="#2E7D32">11×</text>
|
|
45
|
+
<rect x="0" y="164" width="820" height="26" fill="#ffffff"/>
|
|
46
|
+
<text x="212" y="181" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NB</text>
|
|
47
|
+
<rect x="525" y="172" width="10" height="10" fill="#1565C0"/>
|
|
48
|
+
<rect x="544" y="172" width="10" height="10" fill="#BF360C"/>
|
|
49
|
+
<rect x="563" y="172" width="10" height="10" fill="#2E7D32"/>
|
|
50
|
+
<text x="494" y="181" font-size="10" fill="#1565C0">8.1×</text>
|
|
51
|
+
<text x="576" y="181" font-size="10" fill="#BF360C">9.2×</text>
|
|
52
|
+
<text x="610" y="181" font-size="10" fill="#2E7D32">10×</text>
|
|
53
|
+
<rect x="0" y="190" width="820" height="26" fill="#f5f5f5"/>
|
|
54
|
+
<text x="212" y="207" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_B</text>
|
|
55
|
+
<rect x="525" y="198" width="10" height="10" fill="#1565C0"/>
|
|
56
|
+
<rect x="538" y="198" width="10" height="10" fill="#BF360C"/>
|
|
57
|
+
<rect x="550" y="198" width="10" height="10" fill="#2E7D32"/>
|
|
58
|
+
<text x="494" y="207" font-size="10" fill="#1565C0">8.1×</text>
|
|
59
|
+
<text x="563" y="207" font-size="10" fill="#BF360C">8.8×</text>
|
|
60
|
+
<text x="597" y="207" font-size="10" fill="#2E7D32">9.6×</text>
|
|
61
|
+
<rect x="0" y="216" width="820" height="26" fill="#ffffff"/>
|
|
62
|
+
<text x="212" y="233" text-anchor="end" font-size="11" fill="#424242">tab_separated_60k</text>
|
|
63
|
+
<rect x="504" y="224" width="10" height="10" fill="#1565C0"/>
|
|
64
|
+
<rect x="509" y="224" width="10" height="10" fill="#BF360C"/>
|
|
65
|
+
<rect x="539" y="224" width="10" height="10" fill="#2E7D32"/>
|
|
66
|
+
<text x="473" y="233" font-size="10" fill="#1565C0">7.0×</text>
|
|
67
|
+
<text x="552" y="233" font-size="10" fill="#BF360C">7.3×</text>
|
|
68
|
+
<text x="586" y="233" font-size="10" fill="#2E7D32">8.9×</text>
|
|
69
|
+
<rect x="0" y="242" width="820" height="26" fill="#f5f5f5"/>
|
|
70
|
+
<text x="212" y="259" text-anchor="end" font-size="11" fill="#424242">multi_char_separator_60k</text>
|
|
71
|
+
<rect x="466" y="250" width="10" height="10" fill="#1565C0"/>
|
|
72
|
+
<text x="435" y="259" font-size="10" fill="#1565C0">5.4×</text>
|
|
73
|
+
<rect x="503" y="250" width="10" height="10" fill="#BF360C"/>
|
|
74
|
+
<rect x="529" y="250" width="10" height="10" fill="#2E7D32"/>
|
|
75
|
+
<text x="542" y="259" font-size="10" fill="#BF360C">7.0×</text>
|
|
76
|
+
<text x="576" y="259" font-size="10" fill="#2E7D32">8.3×</text>
|
|
77
|
+
<rect x="0" y="268" width="820" height="26" fill="#ffffff"/>
|
|
78
|
+
<text x="212" y="285" text-anchor="end" font-size="11" fill="#424242">wide_500_cols_20k</text>
|
|
79
|
+
<rect x="483" y="276" width="10" height="10" fill="#1565C0"/>
|
|
80
|
+
<rect x="483" y="276" width="10" height="10" fill="#BF360C"/>
|
|
81
|
+
<text x="452" y="285" font-size="10" fill="#1565C0">6.1×</text>
|
|
82
|
+
<text x="496" y="285" font-size="10" fill="#BF360C">6.1×</text>
|
|
83
|
+
<rect x="516" y="276" width="10" height="10" fill="#2E7D32"/>
|
|
84
|
+
<text x="529" y="285" font-size="10" fill="#2E7D32">7.6×</text>
|
|
85
|
+
<rect x="0" y="294" width="820" height="26" fill="#f5f5f5"/>
|
|
86
|
+
<text x="212" y="311" text-anchor="end" font-size="11" fill="#424242">utf8_multibyte_60k</text>
|
|
87
|
+
<rect x="475" y="302" width="10" height="10" fill="#1565C0"/>
|
|
88
|
+
<rect x="483" y="302" width="10" height="10" fill="#BF360C"/>
|
|
89
|
+
<rect x="502" y="302" width="10" height="10" fill="#2E7D32"/>
|
|
90
|
+
<text x="444" y="311" font-size="10" fill="#1565C0">5.8×</text>
|
|
91
|
+
<text x="515" y="311" font-size="10" fill="#BF360C">6.1×</text>
|
|
92
|
+
<text x="549" y="311" font-size="10" fill="#2E7D32">6.9×</text>
|
|
93
|
+
<rect x="0" y="320" width="820" height="26" fill="#ffffff"/>
|
|
94
|
+
<text x="212" y="337" text-anchor="end" font-size="11" fill="#424242">many_empty_fields_60k</text>
|
|
95
|
+
<rect x="393" y="328" width="10" height="10" fill="#1565C0"/>
|
|
96
|
+
<text x="362" y="337" font-size="10" fill="#1565C0">3.3×</text>
|
|
97
|
+
<rect x="454" y="328" width="10" height="10" fill="#BF360C"/>
|
|
98
|
+
<rect x="474" y="328" width="10" height="10" fill="#2E7D32"/>
|
|
99
|
+
<text x="487" y="337" font-size="10" fill="#BF360C">5.0×</text>
|
|
100
|
+
<text x="521" y="337" font-size="10" fill="#2E7D32">5.7×</text>
|
|
101
|
+
<rect x="0" y="346" width="820" height="26" fill="#f5f5f5"/>
|
|
102
|
+
<text x="212" y="363" text-anchor="end" font-size="11" fill="#424242">sample_100k</text>
|
|
103
|
+
<rect x="434" y="354" width="10" height="10" fill="#1565C0"/>
|
|
104
|
+
<rect x="439" y="354" width="10" height="10" fill="#BF360C"/>
|
|
105
|
+
<rect x="456" y="354" width="10" height="10" fill="#2E7D32"/>
|
|
106
|
+
<text x="403" y="363" font-size="10" fill="#1565C0">4.4×</text>
|
|
107
|
+
<text x="469" y="363" font-size="10" fill="#BF360C">4.5×</text>
|
|
108
|
+
<text x="503" y="363" font-size="10" fill="#2E7D32">5.1×</text>
|
|
109
|
+
<rect x="0" y="372" width="820" height="26" fill="#ffffff"/>
|
|
110
|
+
<text x="212" y="389" text-anchor="end" font-size="11" fill="#424242">sensor_data_50krows_50cols</text>
|
|
111
|
+
<rect x="454" y="380" width="10" height="10" fill="#1565C0"/>
|
|
112
|
+
<rect x="455" y="380" width="10" height="10" fill="#2E7D32"/>
|
|
113
|
+
<rect x="456" y="380" width="10" height="10" fill="#BF360C"/>
|
|
114
|
+
<text x="423" y="389" font-size="10" fill="#1565C0">5.0×</text>
|
|
115
|
+
<text x="469" y="389" font-size="10" fill="#2E7D32">5.1×</text>
|
|
116
|
+
<text x="503" y="389" font-size="10" fill="#BF360C">5.1×</text>
|
|
117
|
+
<rect x="0" y="398" width="820" height="26" fill="#f5f5f5"/>
|
|
118
|
+
<text x="212" y="415" text-anchor="end" font-size="11" fill="#424242">long_fields_40k</text>
|
|
119
|
+
<rect x="301" y="406" width="10" height="10" fill="#1565C0"/>
|
|
120
|
+
<text x="270" y="415" font-size="10" fill="#1565C0">1.8×</text>
|
|
121
|
+
<rect x="416" y="406" width="10" height="10" fill="#BF360C"/>
|
|
122
|
+
<rect x="416" y="406" width="10" height="10" fill="#2E7D32"/>
|
|
123
|
+
<text x="429" y="415" font-size="10" fill="#BF360C">3.9×</text>
|
|
124
|
+
<text x="463" y="415" font-size="10" fill="#2E7D32">3.9×</text>
|
|
125
|
+
<rect x="0" y="424" width="820" height="26" fill="#ffffff"/>
|
|
126
|
+
<text x="212" y="441" text-anchor="end" font-size="11" fill="#424242">heavy_quoting_60k</text>
|
|
127
|
+
<rect x="268" y="432" width="10" height="10" fill="#1565C0"/>
|
|
128
|
+
<text x="237" y="441" font-size="10" fill="#1565C0">1.4×</text>
|
|
129
|
+
<rect x="371" y="432" width="10" height="10" fill="#BF360C"/>
|
|
130
|
+
<rect x="381" y="432" width="10" height="10" fill="#2E7D32"/>
|
|
131
|
+
<text x="394" y="441" font-size="10" fill="#BF360C">2.9×</text>
|
|
132
|
+
<text x="428" y="441" font-size="10" fill="#2E7D32">3.1×</text>
|
|
133
|
+
<rect x="0" y="450" width="820" height="26" fill="#f5f5f5"/>
|
|
134
|
+
<text x="212" y="467" text-anchor="end" font-size="11" fill="#424242">embedded_separators_60k</text>
|
|
135
|
+
<rect x="277" y="458" width="10" height="10" fill="#1565C0"/>
|
|
136
|
+
<text x="246" y="467" font-size="10" fill="#1565C0">1.5×</text>
|
|
137
|
+
<rect x="364" y="458" width="10" height="10" fill="#BF360C"/>
|
|
138
|
+
<rect x="374" y="458" width="10" height="10" fill="#2E7D32"/>
|
|
139
|
+
<text x="387" y="467" font-size="10" fill="#BF360C">2.7×</text>
|
|
140
|
+
<text x="421" y="467" font-size="10" fill="#2E7D32">2.9×</text>
|
|
141
|
+
<rect x="0" y="476" width="820" height="26" fill="#ffffff"/>
|
|
142
|
+
<text x="212" y="493" text-anchor="end" font-size="11" fill="#424242">worldcities</text>
|
|
143
|
+
<rect x="270" y="484" width="10" height="10" fill="#1565C0"/>
|
|
144
|
+
<text x="239" y="493" font-size="10" fill="#1565C0">1.4×</text>
|
|
145
|
+
<rect x="366" y="484" width="10" height="10" fill="#BF360C"/>
|
|
146
|
+
<rect x="372" y="484" width="10" height="10" fill="#2E7D32"/>
|
|
147
|
+
<text x="385" y="493" font-size="10" fill="#BF360C">2.8×</text>
|
|
148
|
+
<text x="419" y="493" font-size="10" fill="#2E7D32">2.9×</text>
|
|
149
|
+
<rect x="0" y="502" width="820" height="26" fill="#f5f5f5"/>
|
|
150
|
+
<text x="212" y="519" text-anchor="end" font-size="11" fill="#424242">uscities</text>
|
|
151
|
+
<rect x="269" y="510" width="10" height="10" fill="#1565C0"/>
|
|
152
|
+
<text x="238" y="519" font-size="10" fill="#1565C0">1.4×</text>
|
|
153
|
+
<rect x="359" y="510" width="10" height="10" fill="#BF360C"/>
|
|
154
|
+
<rect x="365" y="510" width="10" height="10" fill="#2E7D32"/>
|
|
155
|
+
<text x="378" y="519" font-size="10" fill="#BF360C">2.6×</text>
|
|
156
|
+
<text x="412" y="519" font-size="10" fill="#2E7D32">2.8×</text>
|
|
157
|
+
<rect x="0" y="528" width="820" height="26" fill="#ffffff"/>
|
|
158
|
+
<text x="212" y="545" text-anchor="end" font-size="11" fill="#424242">uszips</text>
|
|
159
|
+
<rect x="273" y="536" width="10" height="10" fill="#1565C0"/>
|
|
160
|
+
<text x="242" y="545" font-size="10" fill="#1565C0">1.5×</text>
|
|
161
|
+
<rect x="356" y="536" width="10" height="10" fill="#BF360C"/>
|
|
162
|
+
<rect x="361" y="536" width="10" height="10" fill="#2E7D32"/>
|
|
163
|
+
<text x="374" y="545" font-size="10" fill="#BF360C">2.6×</text>
|
|
164
|
+
<text x="408" y="545" font-size="10" fill="#2E7D32">2.7×</text>
|
|
165
|
+
<rect x="0" y="554" width="820" height="26" fill="#f5f5f5"/>
|
|
166
|
+
<text x="212" y="571" text-anchor="end" font-size="11" fill="#424242">embedded_newlines_60k</text>
|
|
167
|
+
<rect x="315" y="562" width="10" height="10" fill="#1565C0"/>
|
|
168
|
+
<rect x="341" y="562" width="10" height="10" fill="#BF360C"/>
|
|
169
|
+
<rect x="346" y="562" width="10" height="10" fill="#2E7D32"/>
|
|
170
|
+
<text x="284" y="571" font-size="10" fill="#1565C0">2.0×</text>
|
|
171
|
+
<text x="359" y="571" font-size="10" fill="#BF360C">2.3×</text>
|
|
172
|
+
<text x="393" y="571" font-size="10" fill="#2E7D32">2.4×</text>
|
|
173
|
+
<rect x="223" y="589" width="10" height="10" fill="#1565C0"/>
|
|
174
|
+
<text x="240" y="598" font-size="11" fill="#1565C0">Ruby path (v1.15.2)</text>
|
|
175
|
+
<rect x="223" y="609" width="10" height="10" fill="#BF360C"/>
|
|
176
|
+
<text x="240" y="618" font-size="11" fill="#BF360C">Ruby path (v1.16.4)</text>
|
|
177
|
+
<rect x="223" y="629" width="10" height="10" fill="#2E7D32"/>
|
|
178
|
+
<text x="240" y="638" font-size="11" fill="#2E7D32">Ruby path (v1.17.0)</text>
|
|
179
|
+
</svg>
|
|
@@ -8,8 +8,6 @@ module SmarterCSV
|
|
|
8
8
|
# Otherwise guesses column separator from contents.
|
|
9
9
|
# Raises exception if none is found.
|
|
10
10
|
def guess_column_separator(filehandle, options)
|
|
11
|
-
skip_lines(filehandle, options)
|
|
12
|
-
|
|
13
11
|
delimiters = [',', "\t", ';', ':', '|']
|
|
14
12
|
|
|
15
13
|
line = nil
|
|
@@ -29,11 +27,13 @@ module SmarterCSV
|
|
|
29
27
|
candidates[d] += non_quoted_text.scan(d).count
|
|
30
28
|
end
|
|
31
29
|
end
|
|
32
|
-
|
|
30
|
+
# No lines were read at all — empty file or stream.
|
|
31
|
+
# Return a safe default and let process_headers raise EmptyFileError.
|
|
32
|
+
return ',' if line.nil?
|
|
33
33
|
|
|
34
34
|
if candidates.values.max == 0
|
|
35
35
|
# if the header only contains word characters and whitespace, assume comma separator
|
|
36
|
-
return ',' if line
|
|
36
|
+
return ',' if line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
|
|
37
37
|
|
|
38
38
|
raise SmarterCSV::NoColSepDetected
|
|
39
39
|
end
|
|
@@ -41,38 +41,223 @@ module SmarterCSV
|
|
|
41
41
|
candidates.key(candidates.values.max)
|
|
42
42
|
end
|
|
43
43
|
|
|
44
|
-
#
|
|
44
|
+
# Lower bound on auto_row_sep_chars. Below this the initial scan would be
|
|
45
|
+
# too small to reliably catch a row separator on the very first read on
|
|
46
|
+
# most real-world CSV files. Most well-formed CSVs reveal a clear majority
|
|
47
|
+
# within ~200 bytes; 512 gives comfortable headroom while keeping cheap
|
|
48
|
+
# files cheap.
|
|
49
|
+
MIN_AUTO_ROW_SEP_CHARS = 512
|
|
50
|
+
|
|
51
|
+
# Default auto_row_sep_chars. Sized to cover wide-header CSVs (e.g. 100+
|
|
52
|
+
# columns) in a single read so escalation rarely fires. 4096 also matches
|
|
53
|
+
# a typical filesystem block, so the OS-level read cost is the same as a
|
|
54
|
+
# smaller request.
|
|
55
|
+
DEFAULT_AUTO_ROW_SEP_CHARS = 4096
|
|
56
|
+
|
|
57
|
+
# Upper bound on auto_row_sep_chars. Serves three roles:
|
|
58
|
+
# 1. The hard cap on the user-facing `auto_row_sep_chars` option.
|
|
59
|
+
# 2. The cap on the doubling escalation inside guess_line_ending —
|
|
60
|
+
# a single chunk read never exceeds this.
|
|
61
|
+
# 3. The hard cap on total bytes scanned during auto-detection
|
|
62
|
+
# (`break if total_bytes >= MAX_AUTO_ROW_SEP_CHARS`).
|
|
63
|
+
# All three roles use the same value because beyond this point further
|
|
64
|
+
# scanning would not improve detection accuracy and only delays parsing.
|
|
65
|
+
MAX_AUTO_ROW_SEP_CHARS = 65_536
|
|
66
|
+
|
|
67
|
+
# Guess the row separator ("\n", "\r\n", or "\r") by counting occurrences
|
|
68
|
+
# outside of quoted regions, scanning each chunk only once and accumulating
|
|
69
|
+
# counts across iterations.
|
|
70
|
+
#
|
|
71
|
+
# Reads in chunks, starting at options[:auto_row_sep_chars] bytes, and
|
|
72
|
+
# grows up to MAX_AUTO_ROW_SEP_CHARS bytes total while no candidate has
|
|
73
|
+
# a clear majority (count > sum of the others).
|
|
74
|
+
#
|
|
75
|
+
# State carried across iterations:
|
|
76
|
+
# * crlf, lf, cr — running counts; never reset
|
|
77
|
+
# * in_quote — true if a previous chunk ended inside a quoted region
|
|
78
|
+
# * pending_cr — true if a previous chunk's last byte was a lone "\r"
|
|
79
|
+
# (deferred so it can pair with a leading "\n" of the
|
|
80
|
+
# next chunk without an extra read).
|
|
81
|
+
#
|
|
82
|
+
# Falls back to "\n" (and emits a warning unless verbose: :quiet) when:
|
|
83
|
+
# * no known separator is found within MAX_AUTO_ROW_SEP_CHARS bytes — e.g. a file
|
|
84
|
+
# that uses an exotic separator like "\u2028"; or
|
|
85
|
+
# * a tie between candidates persists across MAX_AUTO_ROW_SEP_CHARS bytes.
|
|
86
|
+
#
|
|
87
|
+
# The fallback preserves 14 years of permissive behavior; the warning lets
|
|
88
|
+
# infrastructure code (logs, captured stderr) surface the ambiguity.
|
|
45
89
|
def guess_line_ending(filehandle, options)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
# count
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
90
|
+
q = Regexp.escape(options[:quote_char])
|
|
91
|
+
# Combined regex: matches complete "..." pairs AND unclosed "...\z (open
|
|
92
|
+
# quote followed by content to end of string). One gsub pass strips both
|
|
93
|
+
# cases; quote count parity tells us whether an unclosed open existed.
|
|
94
|
+
# /n flag: byte-level matching, encoding-agnostic.
|
|
95
|
+
quoted_re = /#{q}[^#{q}]*(?:#{q}|\z)/n
|
|
96
|
+
quote_str = options[:quote_char].b
|
|
97
|
+
# Adaptive doubling: the first read is auto_row_sep_chars bytes (default 4096).
|
|
98
|
+
# Iter 2 reuses the same size so files with a clear separator slightly past
|
|
99
|
+
# the initial chunk resolve cheaply; iter 3+ doubles each iteration up to
|
|
100
|
+
# MAX_AUTO_ROW_SEP_CHARS.
|
|
101
|
+
#
|
|
102
|
+
# Read pattern with default auto_row_sep_chars = 4096:
|
|
103
|
+
# iter | chunk | cumulative
|
|
104
|
+
# 1 | 4096 | 4096
|
|
105
|
+
# 2 | 4096 | 8192
|
|
106
|
+
# 3 | 8192 | 16384
|
|
107
|
+
# 4 | 16384 | 32768
|
|
108
|
+
# 5 | 32768 | 65536 (loop ends at MAX_AUTO_ROW_SEP_CHARS)
|
|
109
|
+
#
|
|
110
|
+
# MIN_AUTO_ROW_SEP_CHARS is the defensive floor — catches direct callers
|
|
111
|
+
# that bypass option validation (e.g. tests calling via send). Through the
|
|
112
|
+
# public process_options pipeline, validation already enforces this floor,
|
|
113
|
+
# so this .max is inert in normal use.
|
|
114
|
+
chunk_size = [options[:auto_row_sep_chars].to_i, MIN_AUTO_ROW_SEP_CHARS].max
|
|
115
|
+
chunk_size = [chunk_size, MAX_AUTO_ROW_SEP_CHARS].min # Defensive cap: do not read beyond MAX
|
|
116
|
+
bytes_read = false
|
|
117
|
+
total_bytes = 0
|
|
118
|
+
crlf = lf = cr = 0
|
|
119
|
+
in_quote = false # carries across chunks; an open quote with no close
|
|
120
|
+
pending_cr = false # carries across chunks; "\r" deferred for "\r\n" pairing
|
|
121
|
+
iter = 0
|
|
122
|
+
|
|
123
|
+
loop do
|
|
124
|
+
part = filehandle.read(chunk_size)
|
|
125
|
+
break if part.nil? || part.empty?
|
|
126
|
+
|
|
127
|
+
bytes_read = true
|
|
128
|
+
total_bytes += part.bytesize
|
|
129
|
+
|
|
130
|
+
# Resolve a "\r" left pending from the previous chunk's last byte.
|
|
131
|
+
# If the new chunk starts with "\n", the pair is "\r\n"; otherwise
|
|
132
|
+
# the deferred "\r" was a lone "\r" and the new first byte is
|
|
133
|
+
# processed below. (pending_cr and in_quote can never both be true
|
|
134
|
+
# at the start of an iteration — see the open-quote handling below.)
|
|
135
|
+
if pending_cr
|
|
136
|
+
pending_cr = false
|
|
137
|
+
if part.getbyte(0) == 0x0A # \n
|
|
138
|
+
crlf += 1
|
|
139
|
+
part = part.byteslice(1, part.bytesize - 1)
|
|
60
140
|
else
|
|
61
|
-
|
|
141
|
+
cr += 1
|
|
142
|
+
# part stays as-is; the new first byte is processed below.
|
|
62
143
|
end
|
|
63
|
-
elsif c == "\n"
|
|
64
|
-
counts["\n"] += 1
|
|
65
144
|
end
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
145
|
+
|
|
146
|
+
# Fast path: chunk has no quote char AND we're not carrying an open
|
|
147
|
+
# quote from a previous chunk. Skip the gsub + index + .b machinery
|
|
148
|
+
# and count separators directly — most CSV chunks contain no quote
|
|
149
|
+
# chars. (`include?` is one C-level byte scan, vs gsub + index = two
|
|
150
|
+
# passes plus a string copy.)
|
|
151
|
+
if !in_quote && !part.include?(quote_str)
|
|
152
|
+
unquoted = part
|
|
153
|
+
if unquoted.end_with?("\r")
|
|
154
|
+
pending_cr = true
|
|
155
|
+
unquoted = unquoted.byteslice(0, unquoted.bytesize - 1)
|
|
156
|
+
end
|
|
157
|
+
delta_crlf = unquoted.scan("\r\n").size
|
|
158
|
+
delta_lf = unquoted.count("\n") - delta_crlf
|
|
159
|
+
delta_cr = unquoted.count("\r") - delta_crlf
|
|
160
|
+
crlf += delta_crlf
|
|
161
|
+
lf += delta_lf
|
|
162
|
+
cr += delta_cr
|
|
163
|
+
else
|
|
164
|
+
# Slow path: chunk contains quote chars or we're carrying in_quote
|
|
165
|
+
# state from a previous chunk. Convert to binary so index/byteslice
|
|
166
|
+
# are byte-level (safe even with multibyte UTF-8 content before the
|
|
167
|
+
# quote position).
|
|
168
|
+
part = part.b
|
|
169
|
+
|
|
170
|
+
if in_quote
|
|
171
|
+
close_idx = part.index(quote_str)
|
|
172
|
+
if close_idx
|
|
173
|
+
in_quote = false
|
|
174
|
+
part = part.byteslice(close_idx + 1, part.bytesize - close_idx - 1)
|
|
175
|
+
else
|
|
176
|
+
# Whole chunk is still inside the quote.
|
|
177
|
+
part = nil
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
|
|
181
|
+
if part && !part.empty?
|
|
182
|
+
# Single regex pass: gsub with the combined regex strips every
|
|
183
|
+
# complete "..." pair AND, if there's an unclosed open quote at
|
|
184
|
+
# the end, strips "...\z too. After this, no quote chars remain
|
|
185
|
+
# in `unquoted`.
|
|
186
|
+
unquoted = part.gsub(quoted_re, '')
|
|
187
|
+
|
|
188
|
+
# Parity check on the original chunk's quote count: an odd count
|
|
189
|
+
# means an unclosed open quote existed (and the gsub stripped its
|
|
190
|
+
# content along with the open). Set in_quote so the next chunk
|
|
191
|
+
# will look for the close. (count is a fast C-level byte scan.)
|
|
192
|
+
in_quote = true if part.count(quote_str).odd?
|
|
193
|
+
|
|
194
|
+
if unquoted.end_with?("\r".b)
|
|
195
|
+
if in_quote
|
|
196
|
+
# The byte right after this trailing "\r" was the open quote
|
|
197
|
+
# char (NOT "\n"), so the "\r" is a lone cr — count it now.
|
|
198
|
+
# Deferring would mispair against the next chunk's first
|
|
199
|
+
# byte, which is inside the (now-open) quoted region.
|
|
200
|
+
cr += 1
|
|
201
|
+
else
|
|
202
|
+
# No open quote — safe to defer trailing "\r" so it can pair
|
|
203
|
+
# with the next chunk's leading "\n" if any.
|
|
204
|
+
pending_cr = true
|
|
205
|
+
end
|
|
206
|
+
unquoted = unquoted.byteslice(0, unquoted.bytesize - 1)
|
|
207
|
+
end
|
|
208
|
+
|
|
209
|
+
# Count separators in the new bytes and add to running totals.
|
|
210
|
+
delta_crlf = unquoted.scan("\r\n".b).size
|
|
211
|
+
delta_lf = unquoted.count("\n") - delta_crlf
|
|
212
|
+
delta_cr = unquoted.count("\r") - delta_crlf
|
|
213
|
+
crlf += delta_crlf
|
|
214
|
+
lf += delta_lf
|
|
215
|
+
cr += delta_cr
|
|
216
|
+
end
|
|
217
|
+
end
|
|
218
|
+
|
|
219
|
+
# Clear majority: winner strictly greater than the sum of the others.
|
|
220
|
+
return "\r\n" if crlf > lf + cr
|
|
221
|
+
return "\n" if lf > crlf + cr
|
|
222
|
+
return "\r" if cr > crlf + lf
|
|
223
|
+
|
|
224
|
+
break if total_bytes >= MAX_AUTO_ROW_SEP_CHARS
|
|
225
|
+
|
|
226
|
+
# Iter 2 keeps the iter-1 chunk size; iter 3+ doubles each iteration,
|
|
227
|
+
# capped at MAX_AUTO_ROW_SEP_CHARS.
|
|
228
|
+
iter += 1
|
|
229
|
+
chunk_size = [chunk_size * 2, MAX_AUTO_ROW_SEP_CHARS].min if iter >= 2
|
|
69
230
|
end
|
|
70
|
-
rewind(filehandle)
|
|
71
231
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
232
|
+
# Empty stream — return harmless fallback; downstream raises EmptyFileError.
|
|
233
|
+
return "\n" unless bytes_read
|
|
234
|
+
|
|
235
|
+
# If we exited with a deferred "\r" (EOF or cap reached and no following
|
|
236
|
+
# byte to pair it with), count it as a lone "\r" now and re-check majority.
|
|
237
|
+
# Without this, a file ending in a lone "\r" with no other separators would
|
|
238
|
+
# fall through to the no-clear-row-sep warning instead of returning "\r".
|
|
239
|
+
if pending_cr
|
|
240
|
+
cr += 1
|
|
241
|
+
return "\r\n" if crlf > lf + cr
|
|
242
|
+
return "\n" if lf > crlf + cr
|
|
243
|
+
return "\r" if cr > crlf + lf
|
|
244
|
+
end
|
|
245
|
+
|
|
246
|
+
unless options[:verbose] == :quiet
|
|
247
|
+
if crlf == 0 && lf == 0 && cr == 0
|
|
248
|
+
record_warning(type: :row_sep, code: :no_row_sep_found, severity: :error) do
|
|
249
|
+
"no row separator found in first #{total_bytes} bytes; " \
|
|
250
|
+
"defaulting to \"\\n\". Pass row_sep: explicitly if this is wrong."
|
|
251
|
+
end
|
|
252
|
+
else
|
|
253
|
+
record_warning(type: :row_sep, code: :no_clear_row_sep, severity: :error) do
|
|
254
|
+
"no clear row separator in first #{total_bytes} bytes " \
|
|
255
|
+
"(saw #{lf}×\"\\n\", #{crlf}×\"\\r\\n\", #{cr}×\"\\r\"); defaulting to \"\\n\". " \
|
|
256
|
+
"Pass row_sep: explicitly if this is wrong."
|
|
257
|
+
end
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
"\n"
|
|
76
261
|
end
|
|
77
262
|
end
|
|
78
263
|
end
|
data/lib/smarter_csv/file_io.rb
CHANGED
|
@@ -20,10 +20,10 @@ module SmarterCSV
|
|
|
20
20
|
end
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
-
def
|
|
23
|
+
def rewind_buffer(filehandle)
|
|
24
24
|
@file_line_count = 0
|
|
25
25
|
@csv_line_count = 0
|
|
26
|
-
filehandle.rewind
|
|
26
|
+
filehandle.rewind_buffer # this is PeekableIO.rewind_buffer, not io.rewind !
|
|
27
27
|
end
|
|
28
28
|
|
|
29
29
|
private
|
|
@@ -3,9 +3,17 @@
|
|
|
3
3
|
module SmarterCSV
|
|
4
4
|
module HashTransformations
|
|
5
5
|
# Frozen regex constants for performance (avoid recompilation on every value)
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
6
|
+
NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?\z/.freeze
|
|
7
|
+
# FLOAT_REGEX = /\A[+-]?\d+\.\d+\z/.freeze
|
|
8
|
+
# INTEGER_REGEX = /\A[+-]?\d+\z/.freeze
|
|
9
|
+
ZERO_REGEX = /\A[+-]?0+(?:\.0+)?\z/.freeze # could be +0.0
|
|
10
|
+
|
|
11
|
+
# First-byte values that can begin a numeric literal — used to skip the numeric
|
|
12
|
+
# regexes for values that obviously aren't numbers (e.g. city names).
|
|
13
|
+
ZERO_BYTE = '0'.ord # 48
|
|
14
|
+
NINE_BYTE = '9'.ord # 57
|
|
15
|
+
PLUS_BYTE = '+'.ord # 43
|
|
16
|
+
MINUS_BYTE = '-'.ord # 45
|
|
9
17
|
|
|
10
18
|
def hash_transformations(hash, options)
|
|
11
19
|
# Modify hash in-place for performance (avoids allocating a second hash per row)
|
|
@@ -24,7 +32,11 @@ module SmarterCSV
|
|
|
24
32
|
# Early return if no transformations needed
|
|
25
33
|
return hash unless remove_empty_values || remove_zero_values || nil_values_matching || convert_to_numeric || value_converters
|
|
26
34
|
|
|
27
|
-
|
|
35
|
+
# {only:}/{except:} limits on numeric conversion apply only when the option is a Hash;
|
|
36
|
+
# in the common case (true/false) skip the per-key check entirely.
|
|
37
|
+
numeric_has_limits = convert_to_numeric.is_a?(Hash)
|
|
38
|
+
rails = has_rails
|
|
39
|
+
keys_to_delete = nil # lazily allocated only if something is actually removed
|
|
28
40
|
|
|
29
41
|
hash.each do |k, v|
|
|
30
42
|
# Nil-ify values matching the pattern (keeps the key; remove_empty_values handles deletion)
|
|
@@ -39,23 +51,27 @@ module SmarterCSV
|
|
|
39
51
|
|
|
40
52
|
# Check if this key/value should be removed
|
|
41
53
|
# Note: numeric values (Integer/Float) are never blank, so skip the blank check for them
|
|
42
|
-
if remove_empty_values && !v.is_a?(Numeric) && (
|
|
43
|
-
keys_to_delete << k
|
|
54
|
+
if remove_empty_values && !v.is_a?(Numeric) && (rails ? v.blank? : blank?(v))
|
|
55
|
+
(keys_to_delete ||= []) << k
|
|
44
56
|
next
|
|
45
57
|
end
|
|
46
58
|
|
|
47
59
|
# Handle both string zeros ("0", "0.0") and numeric zeros (already converted by C)
|
|
48
60
|
if remove_zero_values && ((v.is_a?(String) && ZERO_REGEX.match?(v)) || (v.is_a?(Numeric) && v == 0))
|
|
49
|
-
keys_to_delete << k
|
|
61
|
+
(keys_to_delete ||= []) << k
|
|
50
62
|
next
|
|
51
63
|
end
|
|
52
64
|
|
|
53
65
|
# Convert to numeric if requested
|
|
54
|
-
if convert_to_numeric && v.is_a?(String) &&
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
66
|
+
if convert_to_numeric && v.is_a?(String) &&
|
|
67
|
+
(!numeric_has_limits || !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k))
|
|
68
|
+
# Fast-reject: the string is already stripped and NUMERIC_REGEX is \A-anchored on a digit or sign,
|
|
69
|
+
# so a value whose first byte isn't a digit, '+', or '-' cannot be numeric — skip the regex entirely.
|
|
70
|
+
first_byte = v.getbyte(0)
|
|
71
|
+
if first_byte && ((first_byte >= ZERO_BYTE && first_byte <= NINE_BYTE) || first_byte == MINUS_BYTE || first_byte == PLUS_BYTE)
|
|
72
|
+
if NUMERIC_REGEX.match?(v)
|
|
73
|
+
hash[k] = v.include?('.') ? v.to_f : v.to_i
|
|
74
|
+
end
|
|
59
75
|
end
|
|
60
76
|
end
|
|
61
77
|
|
|
@@ -67,7 +83,7 @@ module SmarterCSV
|
|
|
67
83
|
end
|
|
68
84
|
|
|
69
85
|
# Delete marked keys
|
|
70
|
-
keys_to_delete
|
|
86
|
+
keys_to_delete&.each { |key| hash.delete(key) }
|
|
71
87
|
|
|
72
88
|
hash
|
|
73
89
|
end
|