smarter_csv 1.15.2 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +68 -1
  4. data/CONTRIBUTORS.md +3 -1
  5. data/Gemfile +1 -0
  6. data/README.md +123 -27
  7. data/docs/_introduction.md +40 -24
  8. data/docs/bad_row_quarantine.md +285 -0
  9. data/docs/basic_read_api.md +151 -9
  10. data/docs/basic_write_api.md +474 -59
  11. data/docs/batch_processing.md +161 -4
  12. data/docs/column_selection.md +183 -0
  13. data/docs/data_transformations.md +162 -29
  14. data/docs/examples.md +339 -46
  15. data/docs/header_transformations.md +93 -12
  16. data/docs/header_validations.md +56 -18
  17. data/docs/history.md +117 -0
  18. data/docs/instrumentation.md +165 -0
  19. data/docs/migrating_from_csv.md +290 -0
  20. data/docs/options.md +150 -87
  21. data/docs/parsing_strategy.md +63 -1
  22. data/docs/real_world_csv.md +262 -0
  23. data/docs/releases/1.16.0/benchmarks.md +223 -0
  24. data/docs/releases/1.16.0/changes.md +272 -0
  25. data/docs/releases/1.16.0/performance_notes.md +114 -0
  26. data/docs/row_col_sep.md +14 -5
  27. data/docs/value_converters.md +193 -57
  28. data/ext/smarter_csv/extconf.rb +3 -0
  29. data/ext/smarter_csv/smarter_csv.c +1007 -71
  30. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
  31. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
  32. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
  33. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
  34. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
  35. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
  36. data/lib/smarter_csv/errors.rb +8 -0
  37. data/lib/smarter_csv/file_io.rb +1 -1
  38. data/lib/smarter_csv/hash_transformations.rb +14 -13
  39. data/lib/smarter_csv/header_transformations.rb +21 -2
  40. data/lib/smarter_csv/headers.rb +2 -1
  41. data/lib/smarter_csv/options.rb +124 -7
  42. data/lib/smarter_csv/parser.rb +362 -75
  43. data/lib/smarter_csv/reader.rb +494 -46
  44. data/lib/smarter_csv/version.rb +1 -1
  45. data/lib/smarter_csv/writer.rb +71 -19
  46. data/lib/smarter_csv.rb +95 -12
  47. data/smarter_csv.gemspec +20 -10
  48. metadata +37 -80
@@ -0,0 +1,108 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="764" height="632"
2
+ font-family="ui-monospace, 'Cascadia Code', 'Courier New', monospace" font-size="12">
3
+ <rect width="764" height="632" fill="#ffffff"/>
4
+ <text x="382" y="20" text-anchor="middle" font-size="14" font-weight="bold" fill="#212121">SmarterCSV 1.16.0 vs Ruby CSV.read 3.3.5</text>
5
+ <text x="382" y="36" text-anchor="middle" font-size="10" fill="#9e9e9e">Speedup = CSV.read time ÷ SmarterCSV time (higher = SmarterCSV is faster) · Ruby 3.4.7 · best of 30</text>
6
+ <text x="490" y="620" text-anchor="middle" font-size="11" fill="#616161">Speedup (CSV.read ÷ SmarterCSV 1.16.0 C)</text>
7
+ <line x1="240" y1="62" x2="240" y2="594" stroke="#e0e0e0" stroke-width="1"/>
8
+ <text x="240" y="606" text-anchor="middle" font-size="11" fill="#757575">0×</text>
9
+ <line x1="290" y1="62" x2="290" y2="594" stroke="#e0e0e0" stroke-width="1"/>
10
+ <text x="290" y="606" text-anchor="middle" font-size="11" fill="#757575">1×</text>
11
+ <line x1="340" y1="62" x2="340" y2="594" stroke="#e0e0e0" stroke-width="1"/>
12
+ <text x="340" y="606" text-anchor="middle" font-size="11" fill="#757575">2×</text>
13
+ <line x1="390" y1="62" x2="390" y2="594" stroke="#e0e0e0" stroke-width="1"/>
14
+ <text x="390" y="606" text-anchor="middle" font-size="11" fill="#757575">3×</text>
15
+ <line x1="440" y1="62" x2="440" y2="594" stroke="#e0e0e0" stroke-width="1"/>
16
+ <text x="440" y="606" text-anchor="middle" font-size="11" fill="#757575">4×</text>
17
+ <line x1="490" y1="62" x2="490" y2="594" stroke="#e0e0e0" stroke-width="1"/>
18
+ <text x="490" y="606" text-anchor="middle" font-size="11" fill="#757575">5×</text>
19
+ <line x1="540" y1="62" x2="540" y2="594" stroke="#e0e0e0" stroke-width="1"/>
20
+ <text x="540" y="606" text-anchor="middle" font-size="11" fill="#757575">6×</text>
21
+ <line x1="590" y1="62" x2="590" y2="594" stroke="#e0e0e0" stroke-width="1"/>
22
+ <text x="590" y="606" text-anchor="middle" font-size="11" fill="#757575">7×</text>
23
+ <line x1="640" y1="62" x2="640" y2="594" stroke="#e0e0e0" stroke-width="1"/>
24
+ <text x="640" y="606" text-anchor="middle" font-size="11" fill="#757575">8×</text>
25
+ <line x1="690" y1="62" x2="690" y2="594" stroke="#e0e0e0" stroke-width="1"/>
26
+ <text x="690" y="606" text-anchor="middle" font-size="11" fill="#757575">9×</text>
27
+ <line x1="740" y1="62" x2="740" y2="594" stroke="#e0e0e0" stroke-width="1"/>
28
+ <text x="740" y="606" text-anchor="middle" font-size="11" fill="#757575">10×</text>
29
+ <line x1="290" y1="62" x2="290" y2="594" stroke="#9e9e9e" stroke-width="1.5" stroke-dasharray="4,3"/>
30
+ <line x1="240" y1="594" x2="740" y2="594" stroke="#bdbdbd" stroke-width="1"/>
31
+ <line x1="240" y1="62" x2="240" y2="594" stroke="#bdbdbd" stroke-width="1"/>
32
+ <rect x="0" y="62" width="764" height="28" fill="#f5f5f5"/>
33
+ <text x="232" y="80" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_C.csv</text>
34
+ <rect x="240" y="67" width="425" height="18" fill="#1565C0" rx="2"/>
35
+ <text x="661" y="80" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">8.51×</text>
36
+ <rect x="0" y="90" width="764" height="28" fill="#ffffff"/>
37
+ <text x="232" y="108" text-anchor="end" font-size="11" fill="#424242">uszips.csv</text>
38
+ <rect x="240" y="95" width="348" height="18" fill="#1565C0" rx="2"/>
39
+ <text x="584" y="108" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">6.97×</text>
40
+ <rect x="0" y="118" width="764" height="28" fill="#f5f5f5"/>
41
+ <text x="232" y="136" text-anchor="end" font-size="11" fill="#424242">worldcities.csv</text>
42
+ <rect x="240" y="123" width="311" height="18" fill="#1565C0" rx="2"/>
43
+ <text x="547" y="136" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">6.22×</text>
44
+ <rect x="0" y="146" width="764" height="28" fill="#ffffff"/>
45
+ <text x="232" y="164" text-anchor="end" font-size="11" fill="#424242">long_fields_20k.csv</text>
46
+ <rect x="240" y="151" width="261" height="18" fill="#1565C0" rx="2"/>
47
+ <text x="497" y="164" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">5.22×</text>
48
+ <rect x="0" y="174" width="764" height="28" fill="#f5f5f5"/>
49
+ <text x="232" y="192" text-anchor="end" font-size="11" fill="#424242">uscities.csv</text>
50
+ <rect x="240" y="179" width="248" height="18" fill="#1565C0" rx="2"/>
51
+ <text x="484" y="192" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">4.96×</text>
52
+ <rect x="0" y="202" width="764" height="28" fill="#ffffff"/>
53
+ <text x="232" y="220" text-anchor="end" font-size="11" fill="#424242">embedded_separators_20k.csv</text>
54
+ <rect x="240" y="207" width="241" height="18" fill="#1565C0" rx="2"/>
55
+ <text x="477" y="220" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">4.83×</text>
56
+ <rect x="0" y="230" width="764" height="28" fill="#f5f5f5"/>
57
+ <text x="232" y="248" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NC.csv</text>
58
+ <rect x="240" y="235" width="226" height="18" fill="#1565C0" rx="2"/>
59
+ <text x="462" y="248" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">4.52×</text>
60
+ <rect x="0" y="258" width="764" height="28" fill="#ffffff"/>
61
+ <text x="232" y="276" text-anchor="end" font-size="11" fill="#424242">many_empty_fields_20k.csv</text>
62
+ <rect x="240" y="263" width="207" height="18" fill="#1565C0" rx="2"/>
63
+ <text x="443" y="276" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">4.15×</text>
64
+ <rect x="0" y="286" width="764" height="28" fill="#f5f5f5"/>
65
+ <text x="232" y="304" text-anchor="end" font-size="11" fill="#424242">utf8_multibyte_20k.csv</text>
66
+ <rect x="240" y="291" width="190" height="18" fill="#1565C0" rx="2"/>
67
+ <text x="426" y="304" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">3.81×</text>
68
+ <rect x="0" y="314" width="764" height="28" fill="#ffffff"/>
69
+ <text x="232" y="332" text-anchor="end" font-size="11" fill="#424242">tab_separated_20k.tsv</text>
70
+ <rect x="240" y="319" width="165" height="18" fill="#1565C0" rx="2"/>
71
+ <text x="401" y="332" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">3.30×</text>
72
+ <rect x="0" y="342" width="764" height="28" fill="#f5f5f5"/>
73
+ <text x="232" y="360" text-anchor="end" font-size="11" fill="#424242">sample_10M.csv</text>
74
+ <rect x="240" y="347" width="163" height="18" fill="#1565C0" rx="2"/>
75
+ <text x="399" y="360" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">3.26×</text>
76
+ <rect x="0" y="370" width="764" height="28" fill="#ffffff"/>
77
+ <text x="232" y="388" text-anchor="end" font-size="11" fill="#424242">whitespace_heavy_20k.csv</text>
78
+ <rect x="240" y="375" width="160" height="18" fill="#1565C0" rx="2"/>
79
+ <text x="396" y="388" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">3.19×</text>
80
+ <rect x="0" y="398" width="764" height="28" fill="#f5f5f5"/>
81
+ <text x="232" y="416" text-anchor="end" font-size="11" fill="#424242">heavy_quoting_20k.csv</text>
82
+ <rect x="240" y="403" width="157" height="18" fill="#1565C0" rx="2"/>
83
+ <text x="393" y="416" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">3.14×</text>
84
+ <rect x="0" y="426" width="764" height="28" fill="#ffffff"/>
85
+ <text x="232" y="444" text-anchor="end" font-size="11" fill="#424242">embedded_newlines_20k.csv</text>
86
+ <rect x="240" y="431" width="148" height="18" fill="#1565C0" rx="2"/>
87
+ <text x="384" y="444" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">2.96×</text>
88
+ <rect x="0" y="454" width="764" height="28" fill="#f5f5f5"/>
89
+ <text x="232" y="472" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_B.csv</text>
90
+ <rect x="240" y="459" width="147" height="18" fill="#1565C0" rx="2"/>
91
+ <text x="383" y="472" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">2.94×</text>
92
+ <rect x="0" y="482" width="764" height="28" fill="#ffffff"/>
93
+ <text x="232" y="500" text-anchor="end" font-size="11" fill="#424242">multi_char_separator_20k.csv</text>
94
+ <rect x="240" y="487" width="145" height="18" fill="#1565C0" rx="2"/>
95
+ <text x="381" y="500" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">2.91×</text>
96
+ <rect x="0" y="510" width="764" height="28" fill="#f5f5f5"/>
97
+ <text x="232" y="528" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NB.csv</text>
98
+ <rect x="240" y="515" width="140" height="18" fill="#1565C0" rx="2"/>
99
+ <text x="376" y="528" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">2.81×</text>
100
+ <rect x="0" y="538" width="764" height="28" fill="#ffffff"/>
101
+ <text x="232" y="556" text-anchor="end" font-size="11" fill="#424242">sensor_data_50krows_50cols.csv</text>
102
+ <rect x="240" y="543" width="105" height="18" fill="#1565C0" rx="2"/>
103
+ <text x="341" y="556" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">2.11×</text>
104
+ <rect x="0" y="566" width="764" height="28" fill="#f5f5f5"/>
105
+ <text x="232" y="584" text-anchor="end" font-size="11" fill="#424242">wide_500_cols_20k.csv</text>
106
+ <rect x="240" y="571" width="88" height="18" fill="#1565C0" rx="2"/>
107
+ <text x="324" y="584" text-anchor="end" font-size="10" fill="#ffffff" font-weight="bold">1.75×</text>
108
+ </svg>
@@ -0,0 +1,141 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="820" height="648"
2
+ font-family="ui-monospace, 'Cascadia Code', 'Courier New', monospace" font-size="12">
3
+ <rect width="820" height="648" fill="#ffffff"/>
4
+ <text x="410" y="18" text-anchor="middle" font-size="13" font-weight="bold" fill="#212121">SmarterCSV improvements 1.15.2, 1.16.0 vs 1.14.4 — C accelerated</text>
5
+ <text x="410" y="32" text-anchor="middle" font-size="10" fill="#9e9e9e">Speedup ratio = baseline version time ÷ newer version time (higher = newer version is faster)</text>
6
+ <text x="410" y="48" text-anchor="middle" font-size="11" fill="#616161">Ruby 3.4.7 [log scale, best of 30]</text>
7
+ <line x1="220" y1="68" x2="220" y2="580" stroke="#e0e0e0" stroke-width="1"/>
8
+ <text x="220" y="64" text-anchor="middle" font-size="11" fill="#757575">1×</text>
9
+ <line x1="307" y1="68" x2="307" y2="580" stroke="#e0e0e0" stroke-width="1"/>
10
+ <text x="307" y="64" text-anchor="middle" font-size="11" fill="#757575">2×</text>
11
+ <line x1="423" y1="68" x2="423" y2="580" stroke="#e0e0e0" stroke-width="1"/>
12
+ <text x="423" y="64" text-anchor="middle" font-size="11" fill="#757575">5×</text>
13
+ <line x1="510" y1="68" x2="510" y2="580" stroke="#e0e0e0" stroke-width="1"/>
14
+ <text x="510" y="64" text-anchor="middle" font-size="11" fill="#757575">10×</text>
15
+ <line x1="597" y1="68" x2="597" y2="580" stroke="#e0e0e0" stroke-width="1"/>
16
+ <text x="597" y="64" text-anchor="middle" font-size="11" fill="#757575">20×</text>
17
+ <line x1="713" y1="68" x2="713" y2="580" stroke="#e0e0e0" stroke-width="1"/>
18
+ <text x="713" y="64" text-anchor="middle" font-size="11" fill="#757575">50×</text>
19
+ <line x1="800" y1="68" x2="800" y2="580" stroke="#e0e0e0" stroke-width="1"/>
20
+ <text x="800" y="64" text-anchor="middle" font-size="11" fill="#757575">100×</text>
21
+ <line x1="220" y1="68" x2="220" y2="580" stroke="#9e9e9e" stroke-width="1.5"/>
22
+ <line x1="220" y1="68" x2="800" y2="68" stroke="#bdbdbd" stroke-width="1"/>
23
+ <rect x="0" y="86" width="820" height="26" fill="#f5f5f5"/>
24
+ <text x="212" y="103" text-anchor="end" font-size="11" fill="#424242">long_fields_20k</text>
25
+ <circle cx="635" cy="99" r="5" fill="#1565C0"/>
26
+ <text x="606" y="103" font-size="10" fill="#1565C0">27×</text>
27
+ <circle cx="744" cy="99" r="5" fill="#BF360C"/>
28
+ <text x="752" y="103" font-size="10" fill="#BF360C">64×</text>
29
+ <rect x="0" y="112" width="820" height="26" fill="#ffffff"/>
30
+ <text x="212" y="129" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_C</text>
31
+ <circle cx="683" cy="125" r="5" fill="#1565C0"/>
32
+ <text x="654" y="129" font-size="10" fill="#1565C0">40×</text>
33
+ <circle cx="708" cy="125" r="5" fill="#BF360C"/>
34
+ <text x="716" y="132" font-size="10" fill="#BF360C">48×</text>
35
+ <rect x="0" y="138" width="820" height="26" fill="#f5f5f5"/>
36
+ <text x="212" y="155" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NC</text>
37
+ <circle cx="595" cy="151" r="5" fill="#1565C0"/>
38
+ <text x="566" y="155" font-size="10" fill="#1565C0">20×</text>
39
+ <circle cx="619" cy="151" r="5" fill="#BF360C"/>
40
+ <text x="627" y="158" font-size="10" fill="#BF360C">24×</text>
41
+ <rect x="0" y="164" width="820" height="26" fill="#ffffff"/>
42
+ <text x="212" y="181" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NB</text>
43
+ <circle cx="589" cy="177" r="5" fill="#1565C0"/>
44
+ <text x="560" y="181" font-size="10" fill="#1565C0">19×</text>
45
+ <circle cx="598" cy="177" r="5" fill="#BF360C"/>
46
+ <text x="606" y="184" font-size="10" fill="#BF360C">20×</text>
47
+ <rect x="0" y="190" width="820" height="26" fill="#f5f5f5"/>
48
+ <text x="212" y="207" text-anchor="end" font-size="11" fill="#424242">multi_char_separator_20k</text>
49
+ <circle cx="573" cy="203" r="5" fill="#1565C0"/>
50
+ <text x="544" y="207" font-size="10" fill="#1565C0">16×</text>
51
+ <circle cx="596" cy="203" r="5" fill="#BF360C"/>
52
+ <text x="604" y="210" font-size="10" fill="#BF360C">20×</text>
53
+ <rect x="0" y="216" width="820" height="26" fill="#ffffff"/>
54
+ <text x="212" y="233" text-anchor="end" font-size="11" fill="#424242">whitespace_heavy_20k</text>
55
+ <circle cx="562" cy="229" r="5" fill="#1565C0"/>
56
+ <text x="533" y="233" font-size="10" fill="#1565C0">15×</text>
57
+ <circle cx="591" cy="229" r="5" fill="#BF360C"/>
58
+ <text x="599" y="236" font-size="10" fill="#BF360C">19×</text>
59
+ <rect x="0" y="242" width="820" height="26" fill="#f5f5f5"/>
60
+ <text x="212" y="259" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_B</text>
61
+ <circle cx="572" cy="255" r="5" fill="#1565C0"/>
62
+ <text x="543" y="259" font-size="10" fill="#1565C0">16×</text>
63
+ <circle cx="591" cy="255" r="5" fill="#BF360C"/>
64
+ <text x="599" y="262" font-size="10" fill="#BF360C">19×</text>
65
+ <rect x="0" y="268" width="820" height="26" fill="#ffffff"/>
66
+ <text x="212" y="285" text-anchor="end" font-size="11" fill="#424242">tab_separated_20k</text>
67
+ <circle cx="547" cy="281" r="5" fill="#1565C0"/>
68
+ <text x="518" y="285" font-size="10" fill="#1565C0">13×</text>
69
+ <circle cx="590" cy="281" r="5" fill="#BF360C"/>
70
+ <text x="598" y="288" font-size="10" fill="#BF360C">19×</text>
71
+ <rect x="0" y="294" width="820" height="26" fill="#f5f5f5"/>
72
+ <text x="212" y="311" text-anchor="end" font-size="11" fill="#424242">many_empty_fields_20k</text>
73
+ <circle cx="539" cy="307" r="5" fill="#1565C0"/>
74
+ <text x="510" y="311" font-size="10" fill="#1565C0">13×</text>
75
+ <circle cx="563" cy="307" r="5" fill="#BF360C"/>
76
+ <text x="571" y="314" font-size="10" fill="#BF360C">15×</text>
77
+ <rect x="0" y="320" width="820" height="26" fill="#ffffff"/>
78
+ <text x="212" y="337" text-anchor="end" font-size="11" fill="#424242">sensor_data_50krows_50cols</text>
79
+ <circle cx="558" cy="333" r="5" fill="#1565C0"/>
80
+ <text x="529" y="337" font-size="10" fill="#1565C0">15×</text>
81
+ <circle cx="560" cy="333" r="5" fill="#BF360C"/>
82
+ <text x="568" y="340" font-size="10" fill="#BF360C">15×</text>
83
+ <rect x="0" y="346" width="820" height="26" fill="#f5f5f5"/>
84
+ <text x="212" y="363" text-anchor="end" font-size="11" fill="#424242">heavy_quoting_20k</text>
85
+ <circle cx="506" cy="359" r="5" fill="#1565C0"/>
86
+ <text x="470" y="363" font-size="10" fill="#1565C0">9.7×</text>
87
+ <circle cx="557" cy="359" r="5" fill="#BF360C"/>
88
+ <text x="565" y="363" font-size="10" fill="#BF360C">15×</text>
89
+ <rect x="0" y="372" width="820" height="26" fill="#ffffff"/>
90
+ <text x="212" y="389" text-anchor="end" font-size="11" fill="#424242">utf8_multibyte_20k</text>
91
+ <circle cx="524" cy="385" r="5" fill="#1565C0"/>
92
+ <text x="495" y="389" font-size="10" fill="#1565C0">11×</text>
93
+ <circle cx="549" cy="385" r="5" fill="#BF360C"/>
94
+ <text x="557" y="392" font-size="10" fill="#BF360C">14×</text>
95
+ <rect x="0" y="398" width="820" height="26" fill="#f5f5f5"/>
96
+ <text x="212" y="415" text-anchor="end" font-size="11" fill="#424242">wide_500_cols_20k</text>
97
+ <circle cx="538" cy="411" r="5" fill="#1565C0"/>
98
+ <text x="509" y="415" font-size="10" fill="#1565C0">12×</text>
99
+ <circle cx="541" cy="411" r="5" fill="#BF360C"/>
100
+ <text x="549" y="418" font-size="10" fill="#BF360C">13×</text>
101
+ <rect x="0" y="424" width="820" height="26" fill="#ffffff"/>
102
+ <text x="212" y="441" text-anchor="end" font-size="11" fill="#424242">uszips</text>
103
+ <circle cx="527" cy="437" r="5" fill="#1565C0"/>
104
+ <text x="498" y="441" font-size="10" fill="#1565C0">11×</text>
105
+ <circle cx="538" cy="437" r="5" fill="#BF360C"/>
106
+ <text x="546" y="444" font-size="10" fill="#BF360C">13×</text>
107
+ <rect x="0" y="450" width="820" height="26" fill="#f5f5f5"/>
108
+ <text x="212" y="467" text-anchor="end" font-size="11" fill="#424242">embedded_separators_20k</text>
109
+ <circle cx="492" cy="463" r="5" fill="#1565C0"/>
110
+ <text x="456" y="467" font-size="10" fill="#1565C0">8.7×</text>
111
+ <circle cx="525" cy="463" r="5" fill="#BF360C"/>
112
+ <text x="533" y="470" font-size="10" fill="#BF360C">11×</text>
113
+ <rect x="0" y="476" width="820" height="26" fill="#ffffff"/>
114
+ <text x="212" y="493" text-anchor="end" font-size="11" fill="#424242">worldcities</text>
115
+ <circle cx="500" cy="489" r="5" fill="#1565C0"/>
116
+ <text x="464" y="493" font-size="10" fill="#1565C0">9.2×</text>
117
+ <circle cx="522" cy="489" r="5" fill="#BF360C"/>
118
+ <text x="530" y="496" font-size="10" fill="#BF360C">11×</text>
119
+ <rect x="0" y="502" width="820" height="26" fill="#f5f5f5"/>
120
+ <text x="212" y="519" text-anchor="end" font-size="11" fill="#424242">sample_10M</text>
121
+ <circle cx="491" cy="515" r="5" fill="#1565C0"/>
122
+ <text x="455" y="519" font-size="10" fill="#1565C0">8.6×</text>
123
+ <circle cx="509" cy="515" r="5" fill="#BF360C"/>
124
+ <text x="517" y="522" font-size="10" fill="#BF360C">9.9×</text>
125
+ <rect x="0" y="528" width="820" height="26" fill="#ffffff"/>
126
+ <text x="212" y="545" text-anchor="end" font-size="11" fill="#424242">uscities</text>
127
+ <circle cx="502" cy="541" r="5" fill="#1565C0"/>
128
+ <text x="466" y="545" font-size="10" fill="#1565C0">9.4×</text>
129
+ <circle cx="507" cy="541" r="5" fill="#BF360C"/>
130
+ <text x="515" y="548" font-size="10" fill="#BF360C">9.8×</text>
131
+ <rect x="0" y="554" width="820" height="26" fill="#f5f5f5"/>
132
+ <text x="212" y="571" text-anchor="end" font-size="11" fill="#424242">embedded_newlines_20k</text>
133
+ <circle cx="505" cy="567" r="5" fill="#1565C0"/>
134
+ <text x="469" y="571" font-size="10" fill="#1565C0">9.6×</text>
135
+ <circle cx="507" cy="567" r="5" fill="#BF360C"/>
136
+ <text x="515" y="574" font-size="10" fill="#BF360C">9.8×</text>
137
+ <circle cx="228" cy="594" r="5" fill="#1565C0"/>
138
+ <text x="240" y="598" font-size="11" fill="#1565C0">C accelerated (v1.15.2)</text>
139
+ <circle cx="228" cy="614" r="5" fill="#BF360C"/>
140
+ <text x="240" y="618" font-size="11" fill="#BF360C">C accelerated (v1.16.0)</text>
141
+ </svg>
@@ -0,0 +1,139 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" width="820" height="648"
2
+ font-family="ui-monospace, 'Cascadia Code', 'Courier New', monospace" font-size="12">
3
+ <rect width="820" height="648" fill="#ffffff"/>
4
+ <text x="410" y="18" text-anchor="middle" font-size="13" font-weight="bold" fill="#212121">SmarterCSV improvements 1.15.2, 1.16.0 vs 1.14.4 — Ruby (not accelerated)</text>
5
+ <text x="410" y="32" text-anchor="middle" font-size="10" fill="#9e9e9e">Speedup ratio = baseline version time ÷ newer version time (higher = newer version is faster)</text>
6
+ <text x="410" y="48" text-anchor="middle" font-size="11" fill="#616161">Ruby 3.4.7 [log scale, best of 30]</text>
7
+ <line x1="220" y1="68" x2="220" y2="580" stroke="#e0e0e0" stroke-width="1"/>
8
+ <text x="220" y="64" text-anchor="middle" font-size="11" fill="#757575">1×</text>
9
+ <line x1="323" y1="68" x2="323" y2="580" stroke="#e0e0e0" stroke-width="1"/>
10
+ <text x="323" y="64" text-anchor="middle" font-size="11" fill="#757575">2×</text>
11
+ <line x1="459" y1="68" x2="459" y2="580" stroke="#e0e0e0" stroke-width="1"/>
12
+ <text x="459" y="64" text-anchor="middle" font-size="11" fill="#757575">5×</text>
13
+ <line x1="561" y1="68" x2="561" y2="580" stroke="#e0e0e0" stroke-width="1"/>
14
+ <text x="561" y="64" text-anchor="middle" font-size="11" fill="#757575">10×</text>
15
+ <line x1="664" y1="68" x2="664" y2="580" stroke="#e0e0e0" stroke-width="1"/>
16
+ <text x="664" y="64" text-anchor="middle" font-size="11" fill="#757575">20×</text>
17
+ <line x1="800" y1="68" x2="800" y2="580" stroke="#e0e0e0" stroke-width="1"/>
18
+ <text x="800" y="64" text-anchor="middle" font-size="11" fill="#757575">50×</text>
19
+ <line x1="220" y1="68" x2="220" y2="580" stroke="#9e9e9e" stroke-width="1.5"/>
20
+ <line x1="220" y1="68" x2="800" y2="68" stroke="#bdbdbd" stroke-width="1"/>
21
+ <rect x="0" y="86" width="820" height="26" fill="#f5f5f5"/>
22
+ <text x="212" y="103" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_C</text>
23
+ <rect x="563" y="94" width="10" height="10" fill="#1565C0"/>
24
+ <text x="539" y="103" font-size="10" fill="#1565C0">10×</text>
25
+ <rect x="660" y="94" width="10" height="10" fill="#BF360C"/>
26
+ <text x="673" y="103" font-size="10" fill="#BF360C">20×</text>
27
+ <rect x="0" y="112" width="820" height="26" fill="#ffffff"/>
28
+ <text x="212" y="129" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NC</text>
29
+ <rect x="515" y="120" width="10" height="10" fill="#1565C0"/>
30
+ <text x="484" y="129" font-size="10" fill="#1565C0">7.6×</text>
31
+ <rect x="572" y="120" width="10" height="10" fill="#BF360C"/>
32
+ <text x="585" y="129" font-size="10" fill="#BF360C">11×</text>
33
+ <rect x="0" y="138" width="820" height="26" fill="#f5f5f5"/>
34
+ <text x="212" y="155" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_NB</text>
35
+ <rect x="535" y="146" width="10" height="10" fill="#1565C0"/>
36
+ <text x="504" y="155" font-size="10" fill="#1565C0">8.6×</text>
37
+ <rect x="554" y="146" width="10" height="10" fill="#BF360C"/>
38
+ <text x="567" y="158" font-size="10" fill="#BF360C">9.8×</text>
39
+ <rect x="0" y="164" width="820" height="26" fill="#ffffff"/>
40
+ <text x="212" y="181" text-anchor="end" font-size="11" fill="#424242">whitespace_heavy_20k</text>
41
+ <rect x="546" y="172" width="10" height="10" fill="#1565C0"/>
42
+ <text x="515" y="181" font-size="10" fill="#1565C0">9.3×</text>
43
+ <rect x="550" y="172" width="10" height="10" fill="#BF360C"/>
44
+ <text x="563" y="184" font-size="10" fill="#BF360C">9.6×</text>
45
+ <rect x="0" y="190" width="820" height="26" fill="#f5f5f5"/>
46
+ <text x="212" y="207" text-anchor="end" font-size="11" fill="#424242">PEOPLE_IMPORT_B</text>
47
+ <rect x="535" y="198" width="10" height="10" fill="#1565C0"/>
48
+ <text x="504" y="207" font-size="10" fill="#1565C0">8.7×</text>
49
+ <rect x="541" y="198" width="10" height="10" fill="#BF360C"/>
50
+ <text x="554" y="210" font-size="10" fill="#BF360C">9.0×</text>
51
+ <rect x="0" y="216" width="820" height="26" fill="#ffffff"/>
52
+ <text x="212" y="233" text-anchor="end" font-size="11" fill="#424242">tab_separated_20k</text>
53
+ <rect x="521" y="224" width="10" height="10" fill="#1565C0"/>
54
+ <text x="490" y="233" font-size="10" fill="#1565C0">7.9×</text>
55
+ <rect x="528" y="224" width="10" height="10" fill="#BF360C"/>
56
+ <text x="541" y="236" font-size="10" fill="#BF360C">8.2×</text>
57
+ <rect x="0" y="242" width="820" height="26" fill="#f5f5f5"/>
58
+ <text x="212" y="259" text-anchor="end" font-size="11" fill="#424242">multi_char_separator_20k</text>
59
+ <rect x="476" y="250" width="10" height="10" fill="#1565C0"/>
60
+ <text x="445" y="259" font-size="10" fill="#1565C0">5.8×</text>
61
+ <rect x="519" y="250" width="10" height="10" fill="#BF360C"/>
62
+ <text x="532" y="262" font-size="10" fill="#BF360C">7.8×</text>
63
+ <rect x="0" y="268" width="820" height="26" fill="#ffffff"/>
64
+ <text x="212" y="285" text-anchor="end" font-size="11" fill="#424242">wide_500_cols_20k</text>
65
+ <rect x="486" y="276" width="10" height="10" fill="#1565C0"/>
66
+ <text x="455" y="285" font-size="10" fill="#1565C0">6.2×</text>
67
+ <rect x="487" y="276" width="10" height="10" fill="#BF360C"/>
68
+ <text x="500" y="288" font-size="10" fill="#BF360C">6.3×</text>
69
+ <rect x="0" y="294" width="820" height="26" fill="#f5f5f5"/>
70
+ <text x="212" y="311" text-anchor="end" font-size="11" fill="#424242">utf8_multibyte_20k</text>
71
+ <rect x="473" y="302" width="10" height="10" fill="#1565C0"/>
72
+ <text x="442" y="311" font-size="10" fill="#1565C0">5.7×</text>
73
+ <rect x="481" y="302" width="10" height="10" fill="#BF360C"/>
74
+ <text x="494" y="314" font-size="10" fill="#BF360C">6.0×</text>
75
+ <rect x="0" y="320" width="820" height="26" fill="#ffffff"/>
76
+ <text x="212" y="337" text-anchor="end" font-size="11" fill="#424242">many_empty_fields_20k</text>
77
+ <rect x="395" y="328" width="10" height="10" fill="#1565C0"/>
78
+ <text x="364" y="337" font-size="10" fill="#1565C0">3.4×</text>
79
+ <rect x="461" y="328" width="10" height="10" fill="#BF360C"/>
80
+ <text x="474" y="337" font-size="10" fill="#BF360C">5.3×</text>
81
+ <rect x="0" y="346" width="820" height="26" fill="#f5f5f5"/>
82
+ <text x="212" y="363" text-anchor="end" font-size="11" fill="#424242">sensor_data_50krows_50cols</text>
83
+ <rect x="453" y="354" width="10" height="10" fill="#1565C0"/>
84
+ <text x="422" y="363" font-size="10" fill="#1565C0">5.0×</text>
85
+ <rect x="457" y="354" width="10" height="10" fill="#BF360C"/>
86
+ <text x="470" y="366" font-size="10" fill="#BF360C">5.1×</text>
87
+ <rect x="0" y="372" width="820" height="26" fill="#ffffff"/>
88
+ <text x="212" y="389" text-anchor="end" font-size="11" fill="#424242">sample_10M</text>
89
+ <rect x="435" y="380" width="10" height="10" fill="#1565C0"/>
90
+ <text x="404" y="389" font-size="10" fill="#1565C0">4.4×</text>
91
+ <rect x="447" y="380" width="10" height="10" fill="#BF360C"/>
92
+ <text x="460" y="392" font-size="10" fill="#BF360C">4.8×</text>
93
+ <rect x="0" y="398" width="820" height="26" fill="#f5f5f5"/>
94
+ <text x="212" y="415" text-anchor="end" font-size="11" fill="#424242">long_fields_20k</text>
95
+ <rect x="308" y="406" width="10" height="10" fill="#1565C0"/>
96
+ <text x="277" y="415" font-size="10" fill="#1565C0">1.9×</text>
97
+ <rect x="402" y="406" width="10" height="10" fill="#BF360C"/>
98
+ <text x="415" y="415" font-size="10" fill="#BF360C">3.5×</text>
99
+ <rect x="0" y="424" width="820" height="26" fill="#ffffff"/>
100
+ <text x="212" y="441" text-anchor="end" font-size="11" fill="#424242">heavy_quoting_20k</text>
101
+ <rect x="289" y="432" width="10" height="10" fill="#1565C0"/>
102
+ <text x="258" y="441" font-size="10" fill="#1565C0">1.7×</text>
103
+ <rect x="364" y="432" width="10" height="10" fill="#BF360C"/>
104
+ <text x="377" y="441" font-size="10" fill="#BF360C">2.7×</text>
105
+ <rect x="0" y="450" width="820" height="26" fill="#f5f5f5"/>
106
+ <text x="212" y="467" text-anchor="end" font-size="11" fill="#424242">worldcities</text>
107
+ <rect x="275" y="458" width="10" height="10" fill="#1565C0"/>
108
+ <text x="244" y="467" font-size="10" fill="#1565C0">1.5×</text>
109
+ <rect x="360" y="458" width="10" height="10" fill="#BF360C"/>
110
+ <text x="373" y="467" font-size="10" fill="#BF360C">2.7×</text>
111
+ <rect x="0" y="476" width="820" height="26" fill="#ffffff"/>
112
+ <text x="212" y="493" text-anchor="end" font-size="11" fill="#424242">embedded_separators_20k</text>
113
+ <rect x="280" y="484" width="10" height="10" fill="#1565C0"/>
114
+ <text x="249" y="493" font-size="10" fill="#1565C0">1.6×</text>
115
+ <rect x="357" y="484" width="10" height="10" fill="#BF360C"/>
116
+ <text x="370" y="493" font-size="10" fill="#BF360C">2.6×</text>
117
+ <rect x="0" y="502" width="820" height="26" fill="#f5f5f5"/>
118
+ <text x="212" y="519" text-anchor="end" font-size="11" fill="#424242">uscities</text>
119
+ <rect x="274" y="510" width="10" height="10" fill="#1565C0"/>
120
+ <text x="243" y="519" font-size="10" fill="#1565C0">1.5×</text>
121
+ <rect x="352" y="510" width="10" height="10" fill="#BF360C"/>
122
+ <text x="365" y="519" font-size="10" fill="#BF360C">2.5×</text>
123
+ <rect x="0" y="528" width="820" height="26" fill="#ffffff"/>
124
+ <text x="212" y="545" text-anchor="end" font-size="11" fill="#424242">uszips</text>
125
+ <rect x="274" y="536" width="10" height="10" fill="#1565C0"/>
126
+ <text x="243" y="545" font-size="10" fill="#1565C0">1.5×</text>
127
+ <rect x="348" y="536" width="10" height="10" fill="#BF360C"/>
128
+ <text x="361" y="545" font-size="10" fill="#BF360C">2.4×</text>
129
+ <rect x="0" y="554" width="820" height="26" fill="#f5f5f5"/>
130
+ <text x="212" y="571" text-anchor="end" font-size="11" fill="#424242">embedded_newlines_20k</text>
131
+ <rect x="322" y="562" width="10" height="10" fill="#1565C0"/>
132
+ <text x="291" y="571" font-size="10" fill="#1565C0">2.1×</text>
133
+ <rect x="336" y="562" width="10" height="10" fill="#BF360C"/>
134
+ <text x="349" y="574" font-size="10" fill="#BF360C">2.3×</text>
135
+ <rect x="223" y="589" width="10" height="10" fill="#1565C0"/>
136
+ <text x="240" y="598" font-size="11" fill="#1565C0">Ruby path (v1.15.2)</text>
137
+ <rect x="223" y="609" width="10" height="10" fill="#BF360C"/>
138
+ <text x="240" y="618" font-size="11" fill="#BF360C">Ruby path (v1.16.0)</text>
139
+ </svg>
@@ -2,11 +2,13 @@
2
2
 
3
3
  module SmarterCSV
4
4
  class Error < StandardError; end # new code should rescue this instead
5
+
5
6
  # Reader:
6
7
  class SmarterCSVException < Error; end # for backwards compatibility
7
8
  class HeaderSizeMismatch < SmarterCSVException; end
8
9
  class IncorrectOption < SmarterCSVException; end
9
10
  class ValidationError < SmarterCSVException; end
11
+
10
12
  class DuplicateHeaders < SmarterCSVException
11
13
  attr_reader :headers
12
14
 
@@ -25,9 +27,15 @@ module SmarterCSV
25
27
  end
26
28
  end
27
29
 
30
+ class EmptyFileError < SmarterCSVException; end
28
31
  class NoColSepDetected < SmarterCSVException; end
29
32
  class KeyMappingError < SmarterCSVException; end
30
33
  class MalformedCSV < SmarterCSVException; end
34
+ class FieldSizeLimitExceeded < SmarterCSVException; end
35
+
31
36
  # Writer:
32
37
  class InvalidInputData < SmarterCSVException; end
38
+
39
+ # Bad-row quarantine:
40
+ class TooManyBadRows < SmarterCSVException; end
33
41
  end
@@ -44,7 +44,7 @@ module SmarterCSV
44
44
  return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
45
45
 
46
46
  # :nocov:
47
- puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
47
+ warn "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}" unless @options[:verbose] == :quiet
48
48
  str
49
49
  # :nocov:
50
50
  end
@@ -17,16 +17,26 @@ module SmarterCSV
17
17
 
18
18
  remove_empty_values = options[:remove_empty_values] == true
19
19
  remove_zero_values = options[:remove_zero_values]
20
- remove_values_matching = options[:remove_values_matching]
20
+ nil_values_matching = options[:nil_values_matching]
21
21
  convert_to_numeric = options[:convert_values_to_numeric]
22
22
  value_converters = options[:value_converters]
23
23
 
24
24
  # Early return if no transformations needed
25
- return hash unless remove_empty_values || remove_zero_values || remove_values_matching || convert_to_numeric || value_converters
25
+ return hash unless remove_empty_values || remove_zero_values || nil_values_matching || convert_to_numeric || value_converters
26
26
 
27
27
  keys_to_delete = []
28
28
 
29
29
  hash.each do |k, v|
30
+ # Nil-ify values matching the pattern (keeps the key; remove_empty_values handles deletion)
31
+ if nil_values_matching
32
+ str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
33
+ if str_val && nil_values_matching.match?(str_val)
34
+ hash[k] = nil
35
+ v = nil
36
+ # fall through: remove_empty_values will delete the key if true
37
+ end
38
+ end
39
+
30
40
  # Check if this key/value should be removed
31
41
  # Note: numeric values (Integer/Float) are never blank, so skip the blank check for them
32
42
  if remove_empty_values && !v.is_a?(Numeric) && (has_rails ? v.blank? : blank?(v))
@@ -40,15 +50,6 @@ module SmarterCSV
40
50
  next
41
51
  end
42
52
 
43
- # Match against string values, or against the string representation of numeric values
44
- if remove_values_matching
45
- str_val = v.is_a?(String) ? v : (v.is_a?(Numeric) ? v.to_s : nil)
46
- if str_val && remove_values_matching.match?(str_val)
47
- keys_to_delete << k
48
- next
49
- end
50
- end
51
-
52
53
  # Convert to numeric if requested
53
54
  if convert_to_numeric && v.is_a?(String) && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
54
55
  if FLOAT_REGEX.match?(v)
@@ -75,7 +76,7 @@ module SmarterCSV
75
76
  # def hash_transformations(hash, options)
76
77
  # remove_empty_values = options[:remove_empty_values] == true
77
78
  # remove_zero_values = options[:remove_zero_values]
78
- # remove_values_matching = options[:remove_values_matching]
79
+ # nil_values_matching = options[:nil_values_matching] # replaces deprecated remove_values_matching
79
80
  # convert_to_numeric = options[:convert_values_to_numeric]
80
81
  # value_converters = options[:value_converters]
81
82
  #
@@ -83,7 +84,7 @@ module SmarterCSV
83
84
  # next if k.nil? || k == '' || k == :""
84
85
  # next if remove_empty_values && (has_rails ? v.blank? : blank?(v))
85
86
  # next if remove_zero_values && v.is_a?(String) && ZERO_REGEX.match?(v)
86
- # next if remove_values_matching && remove_values_matching.match?(v)
87
+ # next if nil_values_matching && nil_values_matching.match?(v)
87
88
  #
88
89
  # if convert_to_numeric && !limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
89
90
  # if v.is_a?(String)
@@ -8,6 +8,9 @@ module SmarterCSV
8
8
  header_array.map!{|x| x.strip} if options[:strip_whitespace]
9
9
 
10
10
  unless options[:keep_original_headers]
11
+ # Normalize whitespace-only headers to "" before gsub so they are treated as
12
+ # blank/missing by disambiguate_headers rather than converted to "_".
13
+ header_array.map!{|x| blank?(x) ? '' : x} unless options[:strip_whitespace]
11
14
  header_array.map!{|x| x.gsub(/\s+|-+/, '_')}
12
15
  header_array.map!{|x| x.downcase} if options[:downcase_header]
13
16
  end
@@ -24,9 +27,25 @@ module SmarterCSV
24
27
 
25
28
  def disambiguate_headers(headers, options)
26
29
  counts = Hash.new(0)
30
+ empty_count = 0
31
+ prefix = options[:missing_header_prefix] || 'column_'
32
+ # Pre-collect non-blank header names so auto-generated names can avoid collisions.
33
+ used = headers.reject { |h| blank?(h) }
27
34
  headers.map do |header|
28
- counts[header] += 1
29
- counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
35
+ if blank?(header)
36
+ # Empty headers use missing_header_prefix (e.g. "column_1", "column_2") so they
37
+ # produce a usable key instead of :"" which gets silently deleted downstream.
38
+ # Skip ahead if the generated name collides with an existing header.
39
+ begin
40
+ empty_count += 1
41
+ candidate = "#{prefix}#{empty_count}"
42
+ end while used.include?(candidate)
43
+ used << candidate
44
+ candidate
45
+ else
46
+ counts[header] += 1
47
+ counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
48
+ end
30
49
  end
31
50
  end
32
51
 
@@ -13,7 +13,8 @@ module SmarterCSV
13
13
  # process the header line in the CSV file..
14
14
  # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
15
15
  header_line = @raw_header = next_line_with_counts(filehandle, options)
16
- header_line = preprocess_header_line(header_line, options)
16
+ header_line = preprocess_header_line(header_line, options) unless header_line.nil?
17
+ raise SmarterCSV::EmptyFileError, "Empty CSV file" if blank?(header_line)
17
18
 
18
19
  file_header_array, file_header_size = parse(header_line, options)
19
20