cisv 0.0.42 → 0.0.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -72
- package/benchmark/benchmark.js +107 -0
- package/cisv/cisv_addon.cc +2 -1
- package/cisv/cisv_parser.c +648 -891
- package/cisv/cisv_transformer.c +0 -1
- package/cisv/cisv_writer.c +0 -1
- package/package.json +11 -8
package/README.md
CHANGED
|
@@ -5,20 +5,15 @@
|
|
|
5
5
|

|
|
6
6
|

|
|
7
7
|
|
|
8
|
+
> # DISCLAIMER
|
|
9
|
+
>
|
|
10
|
+
> This CSV parser does not cover all quote/comment edge cases; for now it is meant only to be extremely fast, and is therefore not production-ready yet.
|
|
8
11
|
|
|
9
12
|
Cisv is a csv parser on steroids... literally.
|
|
10
13
|
It's a high-performance CSV parser/writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool with extensive configuration options.
|
|
11
14
|
|
|
12
15
|
I wrote about the basics in a blog post, which you can read here: https://sanixdk.xyz/blogs/how-i-accidentally-created-the-fastest-csv-parser-ever-made.
|
|
13
16
|
|
|
14
|
-
## PERFORMANCE
|
|
15
|
-
|
|
16
|
-
- **469,968 MB/s** throughput on 2M row CSV files (AVX-512)
|
|
17
|
-
- **10-100x faster** than popular CSV parsers
|
|
18
|
-
- Zero-copy memory-mapped I/O with kernel optimizations
|
|
19
|
-
- SIMD accelerated with AVX-512/AVX2 auto-detection
|
|
20
|
-
- Dynamic lookup tables for configurable parsing
|
|
21
|
-
|
|
22
17
|
## CLI BENCHMARKS WITH DOCKER
|
|
23
18
|
|
|
24
19
|
```bash
|
|
@@ -41,41 +36,47 @@ $ docker run --rm \
|
|
|
41
36
|
## BENCHMARKS
|
|
42
37
|
|
|
43
38
|
Benchmarks comparison with existing popular tools,
|
|
44
|
-
cf pipeline you can check : (https://github.com/Sanix-Darker/cisv/actions/runs/
|
|
39
|
+
see the CI pipeline run (https://github.com/Sanix-Darker/cisv/actions/runs/18422464917/job/52498590205), at step "Publish to npm".
|
|
45
40
|
|
|
46
41
|
### SYNCHRONOUS RESULTS
|
|
47
42
|
|
|
48
43
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
49
44
|
|--------------------|--------------|---------------|----------------|
|
|
50
|
-
| cisv (sync) |
|
|
51
|
-
| csv-parse (sync) | 18.
|
|
52
|
-
| papaparse (sync) |
|
|
45
|
+
| cisv (sync) | 71.10 | 0.01 | 153723 |
|
|
46
|
+
| csv-parse (sync) | 18.76 | 0.02 | 40563 |
|
|
47
|
+
| papaparse (sync) | 27.97 | 0.02 | 60467 |
|
|
48
|
+
| udsv (sync) | 69.81 | 0.01 | 150930 |
|
|
49
|
+
| d3-dsv (sync) | 98.11 | 0.00 | 212117 |
|
|
53
50
|
|
|
54
51
|
### SYNCHRONOUS RESULTS (WITH DATA ACCESS)
|
|
55
52
|
|
|
56
53
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
57
54
|
|--------------------|--------------|---------------|----------------|
|
|
58
|
-
| cisv (sync) |
|
|
59
|
-
| csv-parse (sync) |
|
|
60
|
-
| papaparse (sync) | 28.
|
|
61
|
-
|
|
55
|
+
| cisv (sync) | 104.58 | 0.00 | 226108 |
|
|
56
|
+
| csv-parse (sync) | 16.87 | 0.03 | 36482 |
|
|
57
|
+
| papaparse (sync) | 28.13 | 0.02 | 60807 |
|
|
58
|
+
| udsv (sync) | 69.29 | 0.01 | 149812 |
|
|
59
|
+
| d3-dsv (sync) | 96.32 | 0.00 | 208248 |
|
|
62
60
|
|
|
63
61
|
### ASYNCHRONOUS RESULTS
|
|
64
62
|
|
|
65
63
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
66
64
|
|--------------------------|--------------|---------------|----------------|
|
|
67
|
-
| cisv (async/stream) |
|
|
68
|
-
| papaparse (async/stream) | 21.
|
|
69
|
-
|
|
|
70
|
-
|
|
65
|
+
| cisv (async/stream) | 98.36 | 0.00 | 212662 |
|
|
66
|
+
| papaparse (async/stream) | 21.56 | 0.02 | 46609 |
|
|
67
|
+
| fast-csv (async/stream) | 10.09 | 0.05 | 21817 |
|
|
68
|
+
| neat-csv (async/promise) | 9.20 | 0.05 | 19898 |
|
|
69
|
+
| udsv (async/stream) | 51.74 | 0.01 | 111858 |
|
|
71
70
|
|
|
72
71
|
### ASYNCHRONOUS RESULTS (WITH DATA ACCESS)
|
|
73
72
|
|
|
74
73
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
75
74
|
|--------------------------|--------------|---------------|----------------|
|
|
76
|
-
| cisv (async/stream) |
|
|
77
|
-
| papaparse (async/stream) |
|
|
78
|
-
|
|
|
75
|
+
| cisv (async/stream) | 27.50 | 0.02 | 59460 |
|
|
76
|
+
| papaparse (async/stream) | 21.98 | 0.02 | 47513 |
|
|
77
|
+
| fast-csv (async/stream) | 10.05 | 0.05 | 21719 |
|
|
78
|
+
| neat-csv (async/promise) | 9.58 | 0.05 | 20711 |
|
|
79
|
+
| udsv (async/stream) | 53.26 | 0.01 | 115146 |
|
|
79
80
|
|
|
80
81
|
## INSTALLATION
|
|
81
82
|
|
|
@@ -120,16 +121,16 @@ const tsv_rows = tsv_parser.parseSync('./data.tsv');
|
|
|
120
121
|
### CLI
|
|
121
122
|
```bash
|
|
122
123
|
# Basic parsing
|
|
123
|
-
|
|
124
|
+
cisv_bin data.csv
|
|
124
125
|
|
|
125
126
|
# Parse TSV file
|
|
126
|
-
|
|
127
|
+
cisv_bin -d $'\t' data.tsv
|
|
127
128
|
|
|
128
129
|
# Parse with custom quote and trim
|
|
129
|
-
|
|
130
|
+
cisv_bin -q "'" -t data.csv
|
|
130
131
|
|
|
131
132
|
# Skip comment lines
|
|
132
|
-
|
|
133
|
+
cisv_bin -m '#' config.csv
|
|
133
134
|
```
|
|
134
135
|
|
|
135
136
|
## CONFIGURATION OPTIONS
|
|
@@ -352,7 +353,7 @@ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
|
|
|
352
353
|
### PARSING OPTIONS
|
|
353
354
|
|
|
354
355
|
```bash
|
|
355
|
-
|
|
356
|
+
cisv_bin [OPTIONS] [FILE]
|
|
356
357
|
|
|
357
358
|
General Options:
|
|
358
359
|
-h, --help Show help message
|
|
@@ -384,34 +385,34 @@ Processing Options:
|
|
|
384
385
|
|
|
385
386
|
```bash
|
|
386
387
|
# Parse TSV file
|
|
387
|
-
|
|
388
|
+
cisv_bin -d $'\t' data.tsv
|
|
388
389
|
|
|
389
390
|
# Parse CSV with semicolon delimiter and single quotes
|
|
390
|
-
|
|
391
|
+
cisv_bin -d ';' -q "'" european.csv
|
|
391
392
|
|
|
392
393
|
# Skip comment lines starting with #
|
|
393
|
-
|
|
394
|
+
cisv_bin -m '#' config.csv
|
|
394
395
|
|
|
395
396
|
# Trim whitespace and skip empty lines
|
|
396
|
-
|
|
397
|
+
cisv_bin -t --skip-empty messy.csv
|
|
397
398
|
|
|
398
399
|
# Parse lines 100-1000 only
|
|
399
|
-
|
|
400
|
+
cisv_bin --from-line 100 --to-line 1000 large.csv
|
|
400
401
|
|
|
401
402
|
# Select specific columns
|
|
402
|
-
|
|
403
|
+
cisv_bin -s 0,2,5,7 data.csv
|
|
403
404
|
|
|
404
405
|
# Count rows with specific configuration
|
|
405
|
-
|
|
406
|
+
cisv_bin -c -d $'\t' --skip-empty data.tsv
|
|
406
407
|
|
|
407
408
|
# Benchmark with custom delimiter
|
|
408
|
-
|
|
409
|
+
cisv_bin -b -d ';' european.csv
|
|
409
410
|
```
|
|
410
411
|
|
|
411
412
|
### WRITING
|
|
412
413
|
|
|
413
414
|
```bash
|
|
414
|
-
|
|
415
|
+
cisv_bin write [OPTIONS]
|
|
415
416
|
|
|
416
417
|
Options:
|
|
417
418
|
-g, --generate N Generate N rows of test data
|
|
@@ -423,44 +424,9 @@ Options:
|
|
|
423
424
|
-b, --benchmark Benchmark mode
|
|
424
425
|
```
|
|
425
426
|
|
|
426
|
-
## BENCHMARKS
|
|
427
|
-
|
|
428
|
-
### PARSER PERFORMANCE (273 MB, 5M ROWS)
|
|
429
|
-
|
|
430
|
-
| Parser | Speed (MB/s) | Time (ms) | Relative |
|
|
431
|
-
|---------------|--------------|-----------|----------------|
|
|
432
|
-
| **cisv** | 7,184 | 38 | 1.0x (fastest) |
|
|
433
|
-
| rust-csv | 391 | 698 | 18x slower |
|
|
434
|
-
| xsv | 650 | 420 | 11x slower |
|
|
435
|
-
| csvkit | 28 | 9,875 | 260x slower |
|
|
436
|
-
|
|
437
|
-
### NODE.JS LIBRARY BENCHMARKS
|
|
438
|
-
|
|
439
|
-
| Library | Speed (MB/s) | Operations/sec | Configuration Support |
|
|
440
|
-
|--------------------|--------------|----------------|----------------------|
|
|
441
|
-
| cisv | 61.24 | 136,343 | Full |
|
|
442
|
-
| csv-parse | 15.48 | 34,471 | Partial |
|
|
443
|
-
| papaparse | 25.67 | 57,147 | Partial |
|
|
444
|
-
|
|
445
|
-
(you can check more benchmarks details from release pipelines)
|
|
446
|
-
|
|
447
|
-
### RUNNING BENCHMARKS
|
|
448
|
-
|
|
449
|
-
```bash
|
|
450
|
-
# CLI benchmarks
|
|
451
|
-
make clean && make cli && make benchmark-cli
|
|
452
|
-
|
|
453
|
-
# Node.js benchmarks
|
|
454
|
-
npm run benchmark
|
|
455
|
-
|
|
456
|
-
# Benchmark with custom configuration
|
|
457
|
-
cisv -b -d ';' -q "'" --trim european.csv
|
|
458
|
-
```
|
|
459
|
-
|
|
460
427
|
## TECHNICAL ARCHITECTURE
|
|
461
428
|
|
|
462
429
|
- **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
|
|
463
|
-
- **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
|
|
464
430
|
- **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
|
|
465
431
|
- **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
|
|
466
432
|
- **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
|
package/benchmark/benchmark.js
CHANGED
|
@@ -6,6 +6,9 @@ const { parse: csvParseSync } = require('csv-parse/sync');
|
|
|
6
6
|
const { parse: csvParseStream } = require('csv-parse');
|
|
7
7
|
const Papa = require('papaparse');
|
|
8
8
|
const fastCsv = require('fast-csv');
|
|
9
|
+
const { inferSchema, initParser } = require('udsv');
|
|
10
|
+
const d3 = require('d3-dsv');
|
|
11
|
+
// const { iter } = require('but-csv');
|
|
9
12
|
const fs = require('fs');
|
|
10
13
|
const { Suite } = require('benchmark');
|
|
11
14
|
const stream = require('stream');
|
|
@@ -90,6 +93,17 @@ async function runAllBenchmarks() {
|
|
|
90
93
|
.add('papaparse (sync)', () => {
|
|
91
94
|
Papa.parse(fileString, { fastMode: true });
|
|
92
95
|
})
|
|
96
|
+
.add('udsv (sync)', () => {
|
|
97
|
+
const schema = inferSchema(fileString);
|
|
98
|
+
const parser = initParser(schema);
|
|
99
|
+
parser.stringArrs(fileString);
|
|
100
|
+
})
|
|
101
|
+
.add('d3-dsv (sync)', () => {
|
|
102
|
+
d3.csvParse(fileString);
|
|
103
|
+
})
|
|
104
|
+
// .add('but-csv (sync)', () => {
|
|
105
|
+
// Array.from(iter(fileString));
|
|
106
|
+
// })
|
|
93
107
|
.on('cycle', (event) => logCycle(event, 'sync'))
|
|
94
108
|
.on('error', reject)
|
|
95
109
|
.on('complete', function() {
|
|
@@ -122,6 +136,20 @@ async function runAllBenchmarks() {
|
|
|
122
136
|
const result = Papa.parse(fileString, { fastMode: true });
|
|
123
137
|
const specificRow = result.data[TARGET_ROW_INDEX];
|
|
124
138
|
})
|
|
139
|
+
.add('udsv (sync)', () => {
|
|
140
|
+
const schema = inferSchema(fileString);
|
|
141
|
+
const parser = initParser(schema);
|
|
142
|
+
const rows = parser.stringArrs(fileString);
|
|
143
|
+
const specificRow = rows[TARGET_ROW_INDEX];
|
|
144
|
+
})
|
|
145
|
+
.add('d3-dsv (sync)', () => {
|
|
146
|
+
const rows = d3.csvParse(fileString);
|
|
147
|
+
const specificRow = rows[TARGET_ROW_INDEX];
|
|
148
|
+
})
|
|
149
|
+
//.add('but-csv (sync)', () => {
|
|
150
|
+
// const rows = Array.from(iter(fileString));
|
|
151
|
+
// const specificRow = rows[TARGET_ROW_INDEX];
|
|
152
|
+
//})
|
|
125
153
|
.on('cycle', (event) => logCycle(event, 'sync_data'))
|
|
126
154
|
.on('error', reject)
|
|
127
155
|
.on('complete', function() {
|
|
@@ -168,6 +196,18 @@ async function runAllBenchmarks() {
|
|
|
168
196
|
});
|
|
169
197
|
}
|
|
170
198
|
})
|
|
199
|
+
.add('fast-csv (async/stream)', {
|
|
200
|
+
defer: true,
|
|
201
|
+
fn: (deferred) => {
|
|
202
|
+
const rows = [];
|
|
203
|
+
const readable = stream.Readable.from(fileBuffer);
|
|
204
|
+
readable
|
|
205
|
+
.pipe(fastCsv.parse({ headers: true }))
|
|
206
|
+
.on('data', (row) => rows.push(row))
|
|
207
|
+
.on('end', () => deferred.resolve())
|
|
208
|
+
.on('error', (err) => deferred.reject(err));
|
|
209
|
+
}
|
|
210
|
+
})
|
|
171
211
|
.add('neat-csv (async/promise)', {
|
|
172
212
|
defer: true,
|
|
173
213
|
fn: (deferred) => {
|
|
@@ -178,6 +218,32 @@ async function runAllBenchmarks() {
|
|
|
178
218
|
.catch((err) => deferred.reject(err));
|
|
179
219
|
}
|
|
180
220
|
})
|
|
221
|
+
.add('udsv (async/stream)', {
|
|
222
|
+
defer: true,
|
|
223
|
+
fn: (deferred) => {
|
|
224
|
+
const readable = stream.Readable.from(fileString);
|
|
225
|
+
let parser = null;
|
|
226
|
+
|
|
227
|
+
readable
|
|
228
|
+
.on('data', (chunk) => {
|
|
229
|
+
const strChunk = chunk.toString();
|
|
230
|
+
if (parser == null) {
|
|
231
|
+
const schema = inferSchema(strChunk);
|
|
232
|
+
parser = initParser(schema);
|
|
233
|
+
}
|
|
234
|
+
parser.chunk(strChunk);
|
|
235
|
+
})
|
|
236
|
+
.on('end', () => {
|
|
237
|
+
if (parser != null) {
|
|
238
|
+
parser.end();
|
|
239
|
+
}
|
|
240
|
+
deferred.resolve();
|
|
241
|
+
})
|
|
242
|
+
.on('error', (err) => deferred.reject(err));
|
|
243
|
+
}
|
|
244
|
+
})
|
|
245
|
+
// Note: d3-dsv and but-csv don't have native async streaming support
|
|
246
|
+
// so they are only included in sync benchmarks
|
|
181
247
|
.on('cycle', (event) => logCycle(event, 'async'))
|
|
182
248
|
.on('error', reject)
|
|
183
249
|
.on('complete', function() {
|
|
@@ -227,6 +293,21 @@ async function runAllBenchmarks() {
|
|
|
227
293
|
});
|
|
228
294
|
}
|
|
229
295
|
})
|
|
296
|
+
.add('fast-csv (async/stream)', {
|
|
297
|
+
defer: true,
|
|
298
|
+
fn: (deferred) => {
|
|
299
|
+
const rows = [];
|
|
300
|
+
const readable = stream.Readable.from(fileBuffer);
|
|
301
|
+
readable
|
|
302
|
+
.pipe(fastCsv.parse({ headers: true }))
|
|
303
|
+
.on('data', (row) => rows.push(row))
|
|
304
|
+
.on('end', () => {
|
|
305
|
+
const specificRow = rows[TARGET_ROW_INDEX];
|
|
306
|
+
deferred.resolve();
|
|
307
|
+
})
|
|
308
|
+
.on('error', (err) => deferred.reject(err));
|
|
309
|
+
}
|
|
310
|
+
})
|
|
230
311
|
.add('neat-csv (async/promise)', {
|
|
231
312
|
defer: true,
|
|
232
313
|
fn: (deferred) => {
|
|
@@ -238,6 +319,32 @@ async function runAllBenchmarks() {
|
|
|
238
319
|
.catch((err) => deferred.reject(err));
|
|
239
320
|
}
|
|
240
321
|
})
|
|
322
|
+
.add('udsv (async/stream)', {
|
|
323
|
+
defer: true,
|
|
324
|
+
fn: (deferred) => {
|
|
325
|
+
const readable = stream.Readable.from(fileString);
|
|
326
|
+
let parser = null;
|
|
327
|
+
let result = null;
|
|
328
|
+
|
|
329
|
+
readable
|
|
330
|
+
.on('data', (chunk) => {
|
|
331
|
+
const strChunk = chunk.toString();
|
|
332
|
+
if (!parser) {
|
|
333
|
+
const schema = inferSchema(strChunk);
|
|
334
|
+
parser = initParser(schema);
|
|
335
|
+
}
|
|
336
|
+
parser.chunk(strChunk, parser.stringArrs);
|
|
337
|
+
})
|
|
338
|
+
.on('end', () => {
|
|
339
|
+
if (parser) {
|
|
340
|
+
result = parser.end();
|
|
341
|
+
const specificRow = result[TARGET_ROW_INDEX];
|
|
342
|
+
}
|
|
343
|
+
deferred.resolve();
|
|
344
|
+
})
|
|
345
|
+
.on('error', (err) => deferred.reject(err));
|
|
346
|
+
}
|
|
347
|
+
})
|
|
241
348
|
.on('cycle', (event) => logCycle(event, 'async_data'))
|
|
242
349
|
.on('error', reject)
|
|
243
350
|
.on('complete', function() {
|
package/cisv/cisv_addon.cc
CHANGED