vfcsv 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.tool-versions +1 -0
- data/Gemfile +14 -0
- data/Gemfile.lock +65 -0
- data/LICENSE +21 -0
- data/README.md +268 -0
- data/Rakefile +37 -0
- data/bench/run_all_jit.sh +20 -0
- data/bench/vs_competitors.rb +253 -0
- data/bench/vs_stdlib.rb +137 -0
- data/ext/vfcsv_rust/Cargo.lock +289 -0
- data/ext/vfcsv_rust/Cargo.toml +27 -0
- data/ext/vfcsv_rust/extconf.rb +6 -0
- data/ext/vfcsv_rust/src/lib.rs +476 -0
- data/lib/vfcsv/row.rb +296 -0
- data/lib/vfcsv/table.rb +270 -0
- data/lib/vfcsv/version.rb +5 -0
- data/lib/vfcsv.rb +568 -0
- data/vfcsv.gemspec +43 -0
- metadata +149 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 30c8959e89b63c211bb93b9a3ba56a290688a2fefdd8b8644c57d95b55d2ccfb
|
|
4
|
+
data.tar.gz: 2e1ff95060b5c3a9ef9edc6187d6546c72a5bb4b5811e480bc542e48ef4540b7
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: '0695ec5e9e3e1b92489eb46ef250a62d939d671f113357809cbf88444e3198aa6f244488240cabc941c83f15be8d5578d8be64a5fb99e7ab182dd359ffff274f'
|
|
7
|
+
data.tar.gz: 5e41903584067f08c8597d5c5358e9ad866b93f2dd11530b870e5db375ebb097f1c207b2b614ce2011339c89785119b7c27c6c46b01737bcff7b3bdc6d65e0e9
|
data/.tool-versions
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruby 4.0.0
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
PATH
|
|
2
|
+
remote: .
|
|
3
|
+
specs:
|
|
4
|
+
vfcsv (0.1.0)
|
|
5
|
+
rb_sys (~> 0.9)
|
|
6
|
+
|
|
7
|
+
GEM
|
|
8
|
+
remote: https://rubygems.org/
|
|
9
|
+
specs:
|
|
10
|
+
benchmark-ips (2.14.0)
|
|
11
|
+
csv (3.3.5)
|
|
12
|
+
diff-lcs (1.6.2)
|
|
13
|
+
minitest (5.27.0)
|
|
14
|
+
rake (13.3.1)
|
|
15
|
+
rake-compiler (1.3.1)
|
|
16
|
+
rake
|
|
17
|
+
rake-compiler-dock (1.11.0)
|
|
18
|
+
rb_sys (0.9.124)
|
|
19
|
+
rake-compiler-dock (= 1.11.0)
|
|
20
|
+
rspec (3.13.2)
|
|
21
|
+
rspec-core (~> 3.13.0)
|
|
22
|
+
rspec-expectations (~> 3.13.0)
|
|
23
|
+
rspec-mocks (~> 3.13.0)
|
|
24
|
+
rspec-core (3.13.6)
|
|
25
|
+
rspec-support (~> 3.13.0)
|
|
26
|
+
rspec-expectations (3.13.5)
|
|
27
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
28
|
+
rspec-support (~> 3.13.0)
|
|
29
|
+
rspec-mocks (3.13.7)
|
|
30
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
|
31
|
+
rspec-support (~> 3.13.0)
|
|
32
|
+
rspec-support (3.13.6)
|
|
33
|
+
|
|
34
|
+
PLATFORMS
|
|
35
|
+
arm64-darwin-25
|
|
36
|
+
ruby
|
|
37
|
+
|
|
38
|
+
DEPENDENCIES
|
|
39
|
+
benchmark-ips (~> 2.0, >= 0)
|
|
40
|
+
csv
|
|
41
|
+
minitest (~> 5.0, >= 0)
|
|
42
|
+
rake (~> 13.0)
|
|
43
|
+
rake-compiler (~> 1.2, >= 0)
|
|
44
|
+
rb_sys (~> 0.9)
|
|
45
|
+
rspec
|
|
46
|
+
vfcsv!
|
|
47
|
+
|
|
48
|
+
CHECKSUMS
|
|
49
|
+
benchmark-ips (2.14.0) sha256=b72bc8a65d525d5906f8cd94270dccf73452ee3257a32b89fbd6684d3e8a9b1d
|
|
50
|
+
csv (3.3.5) sha256=6e5134ac3383ef728b7f02725d9872934f523cb40b961479f69cf3afa6c8e73f
|
|
51
|
+
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
|
|
52
|
+
minitest (5.27.0)
|
|
53
|
+
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
54
|
+
rake-compiler (1.3.1) sha256=6b351612b6e2d73ddd5563ee799bb58685176e05363db6758504bd11573d670a
|
|
55
|
+
rake-compiler-dock (1.11.0) sha256=eab51f2cd533eb35cea6b624a75281f047123e70a64c58b607471bb49428f8c2
|
|
56
|
+
rb_sys (0.9.124) sha256=513476557b12eaf73764b3da9f8746024558fe8699bda785fb548c9aa3877ae7
|
|
57
|
+
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
58
|
+
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
59
|
+
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
60
|
+
rspec-mocks (3.13.7) sha256=0979034e64b1d7a838aaaddf12bf065ea4dc40ef3d4c39f01f93ae2c66c62b1c
|
|
61
|
+
rspec-support (3.13.6) sha256=2e8de3702427eab064c9352fe74488cc12a1bfae887ad8b91cba480ec9f8afb2
|
|
62
|
+
vfcsv (0.1.0)
|
|
63
|
+
|
|
64
|
+
BUNDLED WITH
|
|
65
|
+
4.0.3
|
data/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Chris Hasinski
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# VFCSV - Very Fast CSV
|
|
2
|
+
|
|
3
|
+
**The only SIMD-accelerated CSV parser for Ruby with full stdlib API compatibility.**
|
|
4
|
+
|
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/vfcsv.svg)](https://badge.fury.io/rb/vfcsv)
|
|
6
|
+
|
|
7
|
+
VFCSV is a drop-in replacement for Ruby's CSV library that delivers 2-6x faster parsing through SIMD acceleration (NEON on ARM64, AVX2 on x86_64), while maintaining 100% API compatibility with Ruby's CSV—including `Row`, `Table`, converters, and all standard options.
|
|
8
|
+
|
|
9
|
+
## Why VFCSV?
|
|
10
|
+
|
|
11
|
+
| Library | Speed | Drop-in? | Row/Table | Converters | SIMD | Dependencies |
|
|
12
|
+
|---------|-------|----------|-----------|------------|------|--------------|
|
|
13
|
+
| **VFCSV** | **2-6x** | **✓** | **✓** | **✓** | **✓** | Pure Rust |
|
|
14
|
+
| zsv-ruby | 5-6x | Partial | ✗ | ✗ | ✓ | C (zsv) |
|
|
15
|
+
| OSV | 8x | ✗ | ✗ | ✗ | ✗ | Rust |
|
|
16
|
+
| FastCSV | 1.5x | ✓ | ✓ | ✓ | ✗ | C (Ragel) |
|
|
17
|
+
| FastestCSV | 3x | ✗ | ✗ | ✗ | ✗ | C |
|
|
18
|
+
| CSV (stdlib) | 1x | N/A | ✓ | ✓ | ✗ | None |
|
|
19
|
+
|
|
20
|
+
**VFCSV is the only library that combines SIMD acceleration with full API compatibility.**
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
Add to your Gemfile:
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
gem 'vfcsv'
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Or install directly:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
gem install vfcsv
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Requires a Rust toolchain for compilation. Works on Ruby 3.0+ (optimized for Ruby 4.0).
|
|
37
|
+
|
|
38
|
+
## Quick Start
|
|
39
|
+
|
|
40
|
+
```ruby
|
|
41
|
+
# Just replace your require
|
|
42
|
+
require 'vfcsv' # instead of require 'csv'
|
|
43
|
+
|
|
44
|
+
# Use exactly like Ruby's CSV
|
|
45
|
+
data = VFCSV.parse("name,age\nAlice,30\nBob,25", headers: true)
|
|
46
|
+
data[0]["name"] # => "Alice"
|
|
47
|
+
data["age"] # => ["30", "25"]
|
|
48
|
+
|
|
49
|
+
# All the same methods work
|
|
50
|
+
VFCSV.read("data.csv")
|
|
51
|
+
VFCSV.foreach("data.csv") { |row| puts row }
|
|
52
|
+
VFCSV.generate { |csv| csv << [1, 2, 3] }
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Performance
|
|
56
|
+
|
|
57
|
+
Benchmarks on Apple M1 (Ruby 4.0, no YJIT):
|
|
58
|
+
|
|
59
|
+
| Data Type | CSV stdlib | VFCSV | Speedup |
|
|
60
|
+
|-----------|------------|-------|---------|
|
|
61
|
+
| Simple CSV | 40 MB/s | 90 MB/s | **2.2x** |
|
|
62
|
+
| Quoted CSV | 21 MB/s | 120 MB/s | **5.6x** |
|
|
63
|
+
| With Headers | 10.7 i/s | 27.0 i/s | **2.5x** |
|
|
64
|
+
|
|
65
|
+
SIMD excels at quote detection, making quoted CSV parsing significantly faster.
|
|
66
|
+
|
|
67
|
+
```ruby
|
|
68
|
+
# Check your SIMD capabilities
|
|
69
|
+
VFCSV.simd_info
|
|
70
|
+
# => {neon: true, arch: "aarch64", backend: "vfcsv-simd"}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Full API Compatibility
|
|
74
|
+
|
|
75
|
+
### Parsing
|
|
76
|
+
|
|
77
|
+
```ruby
|
|
78
|
+
# Basic parsing
|
|
79
|
+
VFCSV.parse("a,b,c\n1,2,3")
|
|
80
|
+
# => [["a", "b", "c"], ["1", "2", "3"]]
|
|
81
|
+
|
|
82
|
+
# With headers (returns Table with Row objects)
|
|
83
|
+
table = VFCSV.parse("name,age\nAlice,30", headers: true)
|
|
84
|
+
table.class # => VFCSV::Table
|
|
85
|
+
table[0].class # => VFCSV::Row
|
|
86
|
+
table[0]["name"] # => "Alice"
|
|
87
|
+
table[0][0] # => "Alice"
|
|
88
|
+
table["name"] # => ["Alice"]
|
|
89
|
+
|
|
90
|
+
# Parse single line
|
|
91
|
+
VFCSV.parse_line("a,b,c") # => ["a", "b", "c"]
|
|
92
|
+
|
|
93
|
+
# File operations
|
|
94
|
+
VFCSV.read("file.csv")
|
|
95
|
+
VFCSV.foreach("file.csv") { |row| process(row) }
|
|
96
|
+
VFCSV.table("file.csv") # Shortcut for read with headers
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Converters
|
|
100
|
+
|
|
101
|
+
```ruby
|
|
102
|
+
# Built-in converters
|
|
103
|
+
VFCSV.parse("a,b\n1,2.5", headers: true, converters: :numeric)
|
|
104
|
+
# => a: 1 (Integer), b: 2.5 (Float)
|
|
105
|
+
|
|
106
|
+
# Available: :integer, :float, :numeric, :date, :date_time, :all
|
|
107
|
+
|
|
108
|
+
# Custom converters
|
|
109
|
+
upcase = ->(val) { val.upcase rescue val }
|
|
110
|
+
VFCSV.parse("a\nhello", headers: true, converters: [upcase])
|
|
111
|
+
# => a: "HELLO"
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Header Converters
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
# Downcase headers
|
|
118
|
+
VFCSV.parse("NAME,AGE\na,1", headers: true, header_converters: :downcase)
|
|
119
|
+
# headers: ["name", "age"]
|
|
120
|
+
|
|
121
|
+
# Symbol headers
|
|
122
|
+
VFCSV.parse("Name,Age\na,1", headers: true, header_converters: :symbol)
|
|
123
|
+
# headers: [:name, :age]
|
|
124
|
+
# Access: row[:name]
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Row Class
|
|
128
|
+
|
|
129
|
+
Full `CSV::Row` compatibility:
|
|
130
|
+
|
|
131
|
+
```ruby
|
|
132
|
+
row = table[0]
|
|
133
|
+
row.headers # => ["name", "age"]
|
|
134
|
+
row.fields # => ["Alice", "30"]
|
|
135
|
+
row["name"] # => "Alice"
|
|
136
|
+
row[0] # => "Alice"
|
|
137
|
+
row.to_h # => {"name" => "Alice", "age" => "30"}
|
|
138
|
+
row.to_csv # => "Alice,30\n"
|
|
139
|
+
row.header?("name") # => true
|
|
140
|
+
row.field?("Alice") # => true
|
|
141
|
+
|
|
142
|
+
# Mutation
|
|
143
|
+
row["city"] = "NYC"
|
|
144
|
+
row << ["country", "USA"]
|
|
145
|
+
row.delete("country")
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Table Class
|
|
149
|
+
|
|
150
|
+
Full `CSV::Table` compatibility:
|
|
151
|
+
|
|
152
|
+
```ruby
|
|
153
|
+
table.headers # => ["name", "age"]
|
|
154
|
+
table.size # => 2
|
|
155
|
+
table[0] # => Row
|
|
156
|
+
table["name"] # => ["Alice", "Bob"] (column)
|
|
157
|
+
|
|
158
|
+
# Access modes
|
|
159
|
+
table.by_col["name"] # Column access
|
|
160
|
+
table.by_row[0] # Row access
|
|
161
|
+
|
|
162
|
+
# Mutation
|
|
163
|
+
table << ["Carol", "35"]
|
|
164
|
+
table.delete(0)
|
|
165
|
+
|
|
166
|
+
# Output
|
|
167
|
+
table.to_csv # Full CSV string with headers
|
|
168
|
+
table.to_a # Array of arrays
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### Generation
|
|
172
|
+
|
|
173
|
+
```ruby
|
|
174
|
+
# Generate CSV string
|
|
175
|
+
csv = VFCSV.generate do |out|
|
|
176
|
+
out << ["name", "age"]
|
|
177
|
+
out << ["Alice", 30]
|
|
178
|
+
end
|
|
179
|
+
# => "name,age\nAlice,30\n"
|
|
180
|
+
|
|
181
|
+
# Generate single line
|
|
182
|
+
VFCSV.generate_line([1, 2, 3]) # => "1,2,3\n"
|
|
183
|
+
VFCSV.generate_line([1, 2], col_sep: "|") # => "1|2\n"
|
|
184
|
+
|
|
185
|
+
# Force quotes
|
|
186
|
+
VFCSV.generate_line([1, 2], force_quotes: true) # => "\"1\",\"2\"\n"
|
|
187
|
+
|
|
188
|
+
# Write to file
|
|
189
|
+
VFCSV.open("out.csv", "w") do |csv|
|
|
190
|
+
csv << [1, 2, 3]
|
|
191
|
+
end
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### Options
|
|
195
|
+
|
|
196
|
+
All standard CSV options are supported:
|
|
197
|
+
|
|
198
|
+
```ruby
|
|
199
|
+
VFCSV.parse(data,
|
|
200
|
+
col_sep: ",", # Column separator
|
|
201
|
+
row_sep: :auto, # Row separator (:auto, "\n", "\r\n")
|
|
202
|
+
quote_char: '"', # Quote character
|
|
203
|
+
headers: false, # First row as headers
|
|
204
|
+
converters: nil, # Value converters
|
|
205
|
+
header_converters: nil, # Header converters
|
|
206
|
+
skip_blanks: false, # Skip empty rows
|
|
207
|
+
skip_lines: nil, # Regexp to skip lines
|
|
208
|
+
force_quotes: false, # Quote all fields on output
|
|
209
|
+
liberal_parsing: false # Lenient parsing
|
|
210
|
+
)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
## Architecture
|
|
214
|
+
|
|
215
|
+
VFCSV uses a two-stage SIMD-accelerated parsing approach inspired by simdjson:
|
|
216
|
+
|
|
217
|
+
1. **Stage 1: Structural Detection** - SIMD instructions process 16 bytes at a time to identify commas, quotes, and newlines
|
|
218
|
+
2. **Stage 2: Field Extraction** - Extract fields based on structural indices with optimized quote handling
|
|
219
|
+
|
|
220
|
+
The Rust core is wrapped with Magnus for zero-copy Ruby string handling.
|
|
221
|
+
|
|
222
|
+
```
|
|
223
|
+
┌───────────────────────────────────────────────┐
|
|
224
|
+
│ Ruby API │
|
|
225
|
+
│ VFCSV.parse / Row / Table / Generator │
|
|
226
|
+
├───────────────────────────────────────────────┤
|
|
227
|
+
│ Magnus FFI │
|
|
228
|
+
├───────────────────────────────────────────────┤
|
|
229
|
+
│ Rust SIMD Parser │
|
|
230
|
+
│ ┌─────────────┐ ┌───────────────────┐ │
|
|
231
|
+
│ │ NEON (ARM64)│ │ Portable Fallback │ │
|
|
232
|
+
│ └─────────────┘ └───────────────────┘ │
|
|
233
|
+
└───────────────────────────────────────────────┘
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## When to Use VFCSV
|
|
237
|
+
|
|
238
|
+
**Use VFCSV when:**
|
|
239
|
+
- You need faster CSV parsing without changing your code
|
|
240
|
+
- You're processing large CSV files
|
|
241
|
+
- You need full CSV API compatibility (Row, Table, converters)
|
|
242
|
+
- You want SIMD acceleration with zero C dependencies
|
|
243
|
+
|
|
244
|
+
**Consider alternatives when:**
|
|
245
|
+
- You only need hash output (OSV might be faster)
|
|
246
|
+
- You don't need Row/Table classes (zsv-ruby offers comparable speed)
|
|
247
|
+
- You need streaming for files larger than memory
|
|
248
|
+
|
|
249
|
+
## Running Tests
|
|
250
|
+
|
|
251
|
+
```bash
|
|
252
|
+
bundle exec rake test # Run all tests (136 tests)
|
|
253
|
+
bundle exec rake bench # Run benchmarks
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Contributing
|
|
257
|
+
|
|
258
|
+
Bug reports and pull requests welcome at https://github.com/khasinski/vfcsv.
|
|
259
|
+
|
|
260
|
+
## License
|
|
261
|
+
|
|
262
|
+
MIT License. See [LICENSE](LICENSE) for details.
|
|
263
|
+
|
|
264
|
+
## Acknowledgments
|
|
265
|
+
|
|
266
|
+
- Inspired by [simdjson](https://github.com/simdjson/simdjson)'s SIMD parsing techniques
|
|
267
|
+
- Built with [Magnus](https://github.com/matsadler/magnus) for Ruby bindings
|
|
268
|
+
- Benchmarked against [zsv-ruby](https://github.com/sebyx07/zsv-ruby), [OSV](https://github.com/njaremko/osv), and [FastCSV](https://github.com/jpmckinney/fastcsv)
|
data/Rakefile
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "bundler/gem_tasks"
|
|
4
|
+
require "rspec/core/rake_task"
|
|
5
|
+
require "rake/extensiontask"
|
|
6
|
+
require "rake/testtask"
|
|
7
|
+
|
|
8
|
+
RSpec::Core::RakeTask.new(:spec)
|
|
9
|
+
|
|
10
|
+
# Minitest task for CSV compatibility tests
|
|
11
|
+
Rake::TestTask.new(:test) do |t|
|
|
12
|
+
t.libs << "test"
|
|
13
|
+
t.libs << "lib"
|
|
14
|
+
t.test_files = FileList["test/test_*.rb"]
|
|
15
|
+
t.verbose = true
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
# Rust extension (via rb_sys)
|
|
19
|
+
Rake::ExtensionTask.new("vfcsv_rust") do |ext|
|
|
20
|
+
ext.lib_dir = "lib/vfcsv"
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
task default: %i[compile test]
|
|
24
|
+
|
|
25
|
+
# Benchmark tasks. Each task runs one of the scripts shipped under bench/
# in a child Ruby process.
namespace :bench do
  desc "Benchmark against stdlib CSV"
  task :stdlib do
    ruby "bench/vs_stdlib.rb"
  end

  desc "Benchmark against all CSV libraries"
  task :all do
    # The multi-library comparison ships as bench/vs_competitors.rb; the
    # package contains no bench/vs_all.rb, so pointing at that name made
    # this task fail with a missing-file error.
    ruby "bench/vs_competitors.rb"
  end
end
|
|
36
|
+
|
|
37
|
+
task bench: "bench:stdlib"
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
set -e
|
|
3
|
+
|
|
4
|
+
cd "$(dirname "$0")/.."
|
|
5
|
+
|
|
6
|
+
echo "=================================================="
|
|
7
|
+
echo "VFCSV vs CSV - All JIT Modes Comparison"
|
|
8
|
+
echo "=================================================="
|
|
9
|
+
|
|
10
|
+
echo ""
|
|
11
|
+
echo "==================== NO JIT ===================="
|
|
12
|
+
bundle exec ruby bench/vs_stdlib.rb
|
|
13
|
+
|
|
14
|
+
echo ""
|
|
15
|
+
echo "==================== YJIT ===================="
|
|
16
|
+
RUBY_YJIT_ENABLE=1 bundle exec ruby bench/vs_stdlib.rb
|
|
17
|
+
|
|
18
|
+
echo ""
|
|
19
|
+
echo "==================== ZJIT ===================="
|
|
20
|
+
RUBY_ZJIT_ENABLE=1 bundle exec ruby bench/vs_stdlib.rb
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Comprehensive benchmark comparing VFCSV against all major CSV libraries
|
|
5
|
+
#
|
|
6
|
+
# Competitors:
|
|
7
|
+
# - csv (stdlib) - Ruby's built-in CSV
|
|
8
|
+
# - fastcsv - Ragel-based, drop-in replacement
|
|
9
|
+
# - fastest-csv - C-based, fastest pure parsing
|
|
10
|
+
# - smarter_csv - Feature-rich, hash output
|
|
11
|
+
# - zsv-ruby - SIMD-accelerated (wraps C zsv library)
|
|
12
|
+
# - osv - Rust-based (wraps csv-rs)
|
|
13
|
+
|
|
14
|
+
require "bundler/setup"
|
|
15
|
+
require "benchmark/ips"
|
|
16
|
+
require "csv"
|
|
17
|
+
require_relative "../lib/vfcsv"
|
|
18
|
+
|
|
19
|
+
# Try to load optional competitors.
#
# COMPETITORS maps each competitor's key to true when its library could be
# required and false when requiring it raised LoadError (i.e. the gem is not
# installed). The keys on the left are the ones the rest of the script looks
# up; the strings on the right are the feature names passed to `require`.
COMPETITORS = {
  fastcsv: "fastcsv",
  fastest_csv: "fastest-csv",
  smarter_csv: "smarter_csv",
  zsv: "zsv",
  osv: "osv"
}.transform_values do |feature|
  require feature
  # `require` itself returns false when the feature was already loaded, so
  # report availability explicitly instead of relying on its return value.
  true
rescue LoadError
  false
end.freeze
|
|
56
|
+
|
|
57
|
+
puts "=" * 70
|
|
58
|
+
puts "VFCSV vs All Competitors - Comprehensive Benchmark"
|
|
59
|
+
puts "=" * 70
|
|
60
|
+
puts
|
|
61
|
+
puts "Ruby: #{RUBY_VERSION}"
|
|
62
|
+
puts "YJIT: #{defined?(RubyVM::YJIT) && RubyVM::YJIT.enabled? ? 'enabled' : 'disabled'}"
|
|
63
|
+
puts "VFCSV SIMD: #{VFCSV.simd_info.inspect}"
|
|
64
|
+
puts
|
|
65
|
+
puts "Competitors available:"
|
|
66
|
+
puts " - csv (stdlib): ✓"
|
|
67
|
+
COMPETITORS.each do |name, available|
|
|
68
|
+
puts " - #{name}: #{available ? '✓' : '✗ (not installed)'}"
|
|
69
|
+
end
|
|
70
|
+
puts
|
|
71
|
+
|
|
72
|
+
# Generate test data
|
|
73
|
+
# Builds an unquoted CSV document: a header line ("col1,...,colN") followed
# by `rows` data lines whose cells read "value<row>_<col>".
#
# @param rows [Integer] number of data rows; 0 yields a header-only document
# @param cols [Integer] number of columns
# @return [String] CSV text in which every line, including the last, ends in "\n"
def generate_csv(rows, cols)
  header = (1..cols).map { |i| "col#{i}" }.join(",")
  lines = (1..rows).map do |r|
    (1..cols).map { |c| "value#{r}_#{c}" }.join(",")
  end
  # Terminate each line individually so that rows == 0 does not leave a
  # stray blank record after the header.
  [header, *lines].map { |line| "#{line}\n" }.join
end
|
|
80
|
+
|
|
81
|
+
# Builds a CSV document in which every field is double-quoted and every data
# cell contains an embedded comma ("value <row>, col <col>"), so a parser has
# to honour the quotes to split fields correctly.
#
# @param rows [Integer] number of data rows; 0 yields a header-only document
# @param cols [Integer] number of columns
# @return [String] CSV text in which every line, including the last, ends in "\n"
def generate_quoted_csv(rows, cols)
  header = (1..cols).map { |i| "\"col#{i}\"" }.join(",")
  lines = (1..rows).map do |r|
    (1..cols).map { |c| "\"value #{r}, col #{c}\"" }.join(",")
  end
  # Terminate each line individually so that rows == 0 does not leave a
  # stray blank record after the header.
  [header, *lines].map { |line| "#{line}\n" }.join
end
|
|
88
|
+
|
|
89
|
+
# Builds a CSV document whose data cells are floating-point numbers: the cell
# at (row, col) is `row * col * 1.5` rendered with Float#to_s.
#
# @param rows [Integer] number of data rows; 0 yields a header-only document
# @param cols [Integer] number of columns
# @return [String] CSV text in which every line, including the last, ends in "\n"
def generate_numeric_csv(rows, cols)
  header = (1..cols).map { |i| "col#{i}" }.join(",")
  lines = (1..rows).map do |r|
    (1..cols).map { |c| (r * c * 1.5).to_s }.join(",")
  end
  # Terminate each line individually so that rows == 0 does not leave a
  # stray blank record after the header.
  [header, *lines].map { |line| "#{line}\n" }.join
end
|
|
96
|
+
|
|
97
|
+
# Test datasets
|
|
98
|
+
SMALL_CSV = generate_csv(1000, 10)
|
|
99
|
+
MEDIUM_CSV = generate_csv(10_000, 10)
|
|
100
|
+
LARGE_CSV = generate_csv(50_000, 10)
|
|
101
|
+
QUOTED_CSV = generate_quoted_csv(5000, 10)
|
|
102
|
+
NUMERIC_CSV = generate_numeric_csv(5000, 10)
|
|
103
|
+
|
|
104
|
+
puts "Test data sizes:"
|
|
105
|
+
puts " Small: #{SMALL_CSV.bytesize / 1024}KB (1,000 rows x 10 cols)"
|
|
106
|
+
puts " Medium: #{MEDIUM_CSV.bytesize / 1024}KB (10,000 rows x 10 cols)"
|
|
107
|
+
puts " Large: #{LARGE_CSV.bytesize / 1024}KB (50,000 rows x 10 cols)"
|
|
108
|
+
puts " Quoted: #{QUOTED_CSV.bytesize / 1024}KB (5,000 rows x 10 cols, quoted)"
|
|
109
|
+
puts " Numeric: #{NUMERIC_CSV.bytesize / 1024}KB (5,000 rows x 10 cols, numeric)"
|
|
110
|
+
puts
|
|
111
|
+
|
|
112
|
+
# Verify correctness first
|
|
113
|
+
puts "-" * 70
|
|
114
|
+
puts "Correctness Check (comparing output to CSV stdlib)"
|
|
115
|
+
puts "-" * 70
|
|
116
|
+
|
|
117
|
+
csv_result = CSV.parse(SMALL_CSV)
|
|
118
|
+
vfcsv_result = VFCSV.parse(SMALL_CSV)
|
|
119
|
+
puts "VFCSV: #{csv_result == vfcsv_result ? '✓ PASS' : '✗ FAIL'}"
|
|
120
|
+
|
|
121
|
+
if COMPETITORS[:fastcsv]
|
|
122
|
+
fastcsv_result = FastCSV.parse(SMALL_CSV)
|
|
123
|
+
puts "FastCSV: #{csv_result == fastcsv_result ? '✓ PASS' : '✗ FAIL'}"
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
if COMPETITORS[:fastest_csv]
|
|
127
|
+
fastest_result = FastestCSV.parse(SMALL_CSV)
|
|
128
|
+
puts "FastestCSV: #{csv_result == fastest_result ? '✓ PASS' : '✗ FAIL (known - multiline issues)'}"
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
puts
|
|
132
|
+
|
|
133
|
+
# Run benchmarks
|
|
134
|
+
[
|
|
135
|
+
["Small (1K rows)", SMALL_CSV],
|
|
136
|
+
["Medium (10K rows)", MEDIUM_CSV],
|
|
137
|
+
["Large (50K rows)", LARGE_CSV],
|
|
138
|
+
["Quoted (5K rows)", QUOTED_CSV],
|
|
139
|
+
].each do |name, csv_data|
|
|
140
|
+
puts "-" * 70
|
|
141
|
+
puts "Benchmark: #{name}"
|
|
142
|
+
puts "-" * 70
|
|
143
|
+
|
|
144
|
+
Benchmark.ips do |x|
|
|
145
|
+
x.config(time: 3, warmup: 1)
|
|
146
|
+
|
|
147
|
+
x.report("CSV (stdlib)") { CSV.parse(csv_data) }
|
|
148
|
+
x.report("VFCSV (SIMD)") { VFCSV.parse(csv_data) }
|
|
149
|
+
|
|
150
|
+
if COMPETITORS[:fastcsv]
|
|
151
|
+
x.report("FastCSV") { FastCSV.parse(csv_data) }
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
if COMPETITORS[:fastest_csv]
|
|
155
|
+
x.report("FastestCSV") { FastestCSV.parse(csv_data) }
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
if COMPETITORS[:zsv]
|
|
159
|
+
x.report("ZSV") { ZSV.parse(csv_data) }
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
if COMPETITORS[:osv]
|
|
163
|
+
x.report("OSV") { OSV.for_each(StringIO.new(csv_data), result_type: :array).to_a }
|
|
164
|
+
end
|
|
165
|
+
|
|
166
|
+
x.compare!
|
|
167
|
+
end
|
|
168
|
+
puts
|
|
169
|
+
end
|
|
170
|
+
|
|
171
|
+
# Headers benchmark
|
|
172
|
+
puts "-" * 70
|
|
173
|
+
puts "Benchmark: With Headers (returns hash/Row objects)"
|
|
174
|
+
puts "-" * 70
|
|
175
|
+
|
|
176
|
+
Benchmark.ips do |x|
|
|
177
|
+
x.config(time: 3, warmup: 1)
|
|
178
|
+
|
|
179
|
+
x.report("CSV (headers)") { CSV.parse(MEDIUM_CSV, headers: true).map(&:to_h) }
|
|
180
|
+
x.report("VFCSV (headers)") { VFCSV.parse(MEDIUM_CSV, headers: true).map(&:to_h) }
|
|
181
|
+
|
|
182
|
+
if COMPETITORS[:fastcsv]
|
|
183
|
+
x.report("FastCSV (headers)") { FastCSV.parse(MEDIUM_CSV, headers: true).map(&:to_h) }
|
|
184
|
+
end
|
|
185
|
+
|
|
186
|
+
if COMPETITORS[:smarter_csv]
|
|
187
|
+
x.report("SmarterCSV") { SmarterCSV.parse(MEDIUM_CSV) }
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
if COMPETITORS[:osv]
|
|
191
|
+
x.report("OSV (hash)") { OSV.for_each(StringIO.new(MEDIUM_CSV), result_type: :hash).to_a }
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
x.compare!
|
|
195
|
+
end
|
|
196
|
+
|
|
197
|
+
# Throughput summary
|
|
198
|
+
puts
|
|
199
|
+
puts "=" * 70
|
|
200
|
+
puts "THROUGHPUT SUMMARY (MB/s)"
|
|
201
|
+
puts "=" * 70
|
|
202
|
+
|
|
203
|
+
# Hand-timed throughput per dataset: parse each document a fixed number of
# times with CSV and with VFCSV, then turn the elapsed time into MB/s.
# `results` ends up keyed by dataset label, each value holding the two
# throughputs and their ratio under :csv, :vfcsv and :speedup.
results = {}

# Reads the clock used for both measurements.
clock = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }

[
  ["Small", SMALL_CSV, 1000],
  ["Medium", MEDIUM_CSV, 100],
  ["Large", LARGE_CSV, 20] # iteration count is adjusted based on size
].each do |label, csv_data, iterations|
  size_mb = csv_data.bytesize / 1_000_000.0

  # CSV stdlib
  csv_started = clock.call
  iterations.times { CSV.parse(csv_data) }
  csv_finished = clock.call
  csv_mbs = (size_mb * iterations) / (csv_finished - csv_started)

  # VFCSV
  vfcsv_started = clock.call
  iterations.times { VFCSV.parse(csv_data) }
  vfcsv_finished = clock.call
  vfcsv_mbs = (size_mb * iterations) / (vfcsv_finished - vfcsv_started)

  results[label] = {
    csv: csv_mbs,
    vfcsv: vfcsv_mbs,
    speedup: vfcsv_mbs / csv_mbs
  }
end
|
|
227
|
+
|
|
228
|
+
puts
|
|
229
|
+
puts "| Dataset | CSV (MB/s) | VFCSV (MB/s) | Speedup |"
|
|
230
|
+
puts "|---------|------------|--------------|---------|"
|
|
231
|
+
results.each do |name, data|
|
|
232
|
+
puts "| #{name.ljust(7)} | #{data[:csv].round(1).to_s.rjust(10)} | #{data[:vfcsv].round(1).to_s.rjust(12)} | #{data[:speedup].round(1)}x |"
|
|
233
|
+
end
|
|
234
|
+
|
|
235
|
+
puts
|
|
236
|
+
puts "=" * 70
|
|
237
|
+
puts "FEATURE COMPARISON"
|
|
238
|
+
puts "=" * 70
|
|
239
|
+
puts
|
|
240
|
+
puts "| Feature | CSV | VFCSV | FastCSV | FastestCSV | ZSV | OSV |"
|
|
241
|
+
puts "|----------------------------|-----|-------|---------|------------|-----|-----|"
|
|
242
|
+
puts "| Drop-in replacement | N/A | ✓ | ✓ | ✗ | ~ | ✗ |"
|
|
243
|
+
puts "| SIMD acceleration | ✗ | ✓ | ✗ | ✗ | ✓ | ✗ |"
|
|
244
|
+
puts "| Row/Table classes | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ |"
|
|
245
|
+
puts "| Converters (:integer, etc) | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ |"
|
|
246
|
+
puts "| Header converters | ✓ | ✓ | ✓ | ✗ | ✗ | ✗ |"
|
|
247
|
+
puts "| Multiline fields | ✓ | ✓ | ✓ | ✗ | ✓ | ✓ |"
|
|
248
|
+
puts "| Quoted fields | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |"
|
|
249
|
+
puts "| Custom col_sep | ✓ | ✓ | ✓ | ✗ | ✓ | ✓ |"
|
|
250
|
+
puts "| Ruby 4.0 compatible | ✓ | ✓ | ? | ? | ? | ✓ |"
|
|
251
|
+
puts "| Pure Rust/no C deps | N/A | ✓ | ✗ | ✗ | ✗ | ✓ |"
|
|
252
|
+
puts
|
|
253
|
+
puts "Legend: ✓ = supported, ✗ = not supported, ~ = partial, ? = unknown"
|