cisv 0.0.42 → 0.0.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -57
- package/benchmark/benchmark.js +107 -0
- package/cisv/cisv_addon.cc +2 -1
- package/cisv/cisv_parser.c +648 -891
- package/cisv/cisv_transformer.c +0 -1
- package/cisv/cisv_writer.c +0 -1
- package/package.json +10 -7
package/README.md
CHANGED
@@ -5,20 +5,15 @@
 
 
 
+> # DISCLAIMER
+>
+> This CSV parser does not cover all quotes/comments edge cases; for now it is meant to be just extremely fast, and is thus not PROD-ready yet.
 
 Cisv is a csv parser on steroids... literally.
 It's a high-performance CSV parser/writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and a standalone CLI tool with extensive configuration options.
 
 I wrote about the basics in a blog post; you can read it here: https://sanixdk.xyz/blogs/how-i-accidentally-created-the-fastest-csv-parser-ever-made.
 
-## PERFORMANCE
-
-- **469,968 MB/s** throughput on 2M row CSV files (AVX-512)
-- **10-100x faster** than popular CSV parsers
-- Zero-copy memory-mapped I/O with kernel optimizations
-- SIMD accelerated with AVX-512/AVX2 auto-detection
-- Dynamic lookup tables for configurable parsing
-
 ## CLI BENCHMARKS WITH DOCKER
 
 ```bash
@@ -120,16 +115,16 @@ const tsv_rows = tsv_parser.parseSync('./data.tsv');
 ### CLI
 ```bash
 # Basic parsing
-cisv data.csv
+cisv_bin data.csv
 
 # Parse TSV file
-cisv -d $'\t' data.tsv
+cisv_bin -d $'\t' data.tsv
 
 # Parse with custom quote and trim
-cisv -q "'" -t data.csv
+cisv_bin -q "'" -t data.csv
 
 # Skip comment lines
-cisv -m '#' config.csv
+cisv_bin -m '#' config.csv
 ```
 
 ## CONFIGURATION OPTIONS
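
The hunk header above quotes the Node.js API that these CLI examples mirror (`tsv_parser.parseSync('./data.tsv')`). As a rough illustration of that addon usage, here is a minimal sketch; the export name and option key below are assumptions, not confirmed cisv API:

```js
// Hypothetical sketch based on the `tsv_parser.parseSync('./data.tsv')`
// excerpt in the hunk header; verify the class and option names in the docs.
const { CisvParser } = require('cisv'); // assumed export name

// Assumed option key; mirrors the CLI's `-d $'\t'` flag.
const tsv_parser = new CisvParser({ delimiter: '\t' });

// parseSync reads and parses the whole file in one call.
const tsv_rows = tsv_parser.parseSync('./data.tsv');
console.log(tsv_rows.length, 'rows');
```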
@@ -352,7 +347,7 @@ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
 ### PARSING OPTIONS
 
 ```bash
-cisv [OPTIONS] [FILE]
+cisv_bin [OPTIONS] [FILE]
 
 General Options:
   -h, --help        Show help message
@@ -384,34 +379,34 @@ Processing Options:
 
 ```bash
 # Parse TSV file
-cisv -d $'\t' data.tsv
+cisv_bin -d $'\t' data.tsv
 
 # Parse CSV with semicolon delimiter and single quotes
-cisv -d ';' -q "'" european.csv
+cisv_bin -d ';' -q "'" european.csv
 
 # Skip comment lines starting with #
-cisv -m '#' config.csv
+cisv_bin -m '#' config.csv
 
 # Trim whitespace and skip empty lines
-cisv -t --skip-empty messy.csv
+cisv_bin -t --skip-empty messy.csv
 
 # Parse lines 100-1000 only
-cisv --from-line 100 --to-line 1000 large.csv
+cisv_bin --from-line 100 --to-line 1000 large.csv
 
 # Select specific columns
-cisv -s 0,2,5,7 data.csv
+cisv_bin -s 0,2,5,7 data.csv
 
 # Count rows with specific configuration
-cisv -c -d $'\t' --skip-empty data.tsv
+cisv_bin -c -d $'\t' --skip-empty data.tsv
 
 # Benchmark with custom delimiter
-cisv -b -d ';' european.csv
+cisv_bin -b -d ';' european.csv
 ```
 
 ### WRITING
 
 ```bash
-cisv write [OPTIONS]
+cisv_bin write [OPTIONS]
 
 Options:
   -g, --generate N  Generate N rows of test data
@@ -423,44 +418,9 @@ Options:
   -b, --benchmark   Benchmark mode
 ```
 
-## BENCHMARKS
-
-### PARSER PERFORMANCE (273 MB, 5M ROWS)
-
-| Parser   | Speed (MB/s) | Time (ms) | Relative       |
-|----------|--------------|-----------|----------------|
-| **cisv** | 7,184        | 38        | 1.0x (fastest) |
-| rust-csv | 391          | 698       | 18x slower     |
-| xsv      | 650          | 420       | 11x slower     |
-| csvkit   | 28           | 9,875     | 260x slower    |
-
-### NODE.JS LIBRARY BENCHMARKS
-
-| Library   | Speed (MB/s) | Operations/sec | Configuration Support |
-|-----------|--------------|----------------|-----------------------|
-| cisv      | 61.24        | 136,343        | Full                  |
-| csv-parse | 15.48        | 34,471         | Partial               |
-| papaparse | 25.67        | 57,147         | Partial               |
-
-(you can check more benchmark details from the release pipelines)
-
-### RUNNING BENCHMARKS
-
-```bash
-# CLI benchmarks
-make clean && make cli && make benchmark-cli
-
-# Node.js benchmarks
-npm run benchmark
-
-# Benchmark with custom configuration
-cisv -b -d ';' -q "'" --trim european.csv
-```
-
 ## TECHNICAL ARCHITECTURE
 
 - **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
-- **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
 - **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
 - **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
 - **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
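
For completeness, the `@@ -352,7 +347,7` hunk header above quotes `cisvParser.countRowsWithConfig('data.tsv', { ... })`, the library counterpart of the `-c` CLI examples. A sketch of that call, with option keys inferred from the CLI flags (the key names are assumptions):

```js
// Sketch only: option keys are guesses mapped from the CLI flags
// (-d delimiter, --skip-empty); check the cisv README for exact names.
const { CisvParser } = require('cisv'); // assumed export name
const cisvParser = new CisvParser();

// Library counterpart of `cisv_bin -c -d $'\t' --skip-empty data.tsv`.
const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
  delimiter: '\t',
  skipEmptyLines: true, // assumed key for --skip-empty
});
console.log(`${tsvCount} rows`);
```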
package/benchmark/benchmark.js
CHANGED
@@ -6,6 +6,9 @@ const { parse: csvParseSync } = require('csv-parse/sync');
 const { parse: csvParseStream } = require('csv-parse');
 const Papa = require('papaparse');
 const fastCsv = require('fast-csv');
+const { inferSchema, initParser } = require('udsv');
+const d3 = require('d3-dsv');
+// const { iter } = require('but-csv');
 const fs = require('fs');
 const { Suite } = require('benchmark');
 const stream = require('stream');
@@ -90,6 +93,17 @@ async function runAllBenchmarks() {
     .add('papaparse (sync)', () => {
       Papa.parse(fileString, { fastMode: true });
     })
+    .add('udsv (sync)', () => {
+      const schema = inferSchema(fileString);
+      const parser = initParser(schema);
+      parser.stringArrs(fileString);
+    })
+    .add('d3-dsv (sync)', () => {
+      d3.csvParse(fileString);
+    })
+    // .add('but-csv (sync)', () => {
+    //   Array.from(iter(fileString));
+    // })
     .on('cycle', (event) => logCycle(event, 'sync'))
     .on('error', reject)
     .on('complete', function() {
@@ -122,6 +136,20 @@ async function runAllBenchmarks() {
       const result = Papa.parse(fileString, { fastMode: true });
       const specificRow = result.data[TARGET_ROW_INDEX];
     })
+    .add('udsv (sync)', () => {
+      const schema = inferSchema(fileString);
+      const parser = initParser(schema);
+      const rows = parser.stringArrs(fileString);
+      const specificRow = rows[TARGET_ROW_INDEX];
+    })
+    .add('d3-dsv (sync)', () => {
+      const rows = d3.csvParse(fileString);
+      const specificRow = rows[TARGET_ROW_INDEX];
+    })
+    // .add('but-csv (sync)', () => {
+    //   const rows = Array.from(iter(fileString));
+    //   const specificRow = rows[TARGET_ROW_INDEX];
+    // })
     .on('cycle', (event) => logCycle(event, 'sync_data'))
     .on('error', reject)
     .on('complete', function() {
@@ -168,6 +196,18 @@ async function runAllBenchmarks() {
         });
       }
     })
+    .add('fast-csv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const rows = [];
+        const readable = stream.Readable.from(fileBuffer);
+        readable
+          .pipe(fastCsv.parse({ headers: true }))
+          .on('data', (row) => rows.push(row))
+          .on('end', () => deferred.resolve())
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
     .add('neat-csv (async/promise)', {
       defer: true,
       fn: (deferred) => {
@@ -178,6 +218,32 @@ async function runAllBenchmarks() {
           .catch((err) => deferred.reject(err));
       }
     })
+    .add('udsv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const readable = stream.Readable.from(fileString);
+        let parser = null;
+
+        readable
+          .on('data', (chunk) => {
+            const strChunk = chunk.toString();
+            if (parser == null) {
+              const schema = inferSchema(strChunk);
+              parser = initParser(schema);
+            }
+            parser.chunk(strChunk);
+          })
+          .on('end', () => {
+            if (parser != null) {
+              parser.end();
+            }
+            deferred.resolve();
+          })
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
+    // Note: d3-dsv and but-csv don't have native async streaming support,
+    // so they are only included in sync benchmarks
     .on('cycle', (event) => logCycle(event, 'async'))
     .on('error', reject)
     .on('complete', function() {
@@ -227,6 +293,21 @@ async function runAllBenchmarks() {
         });
       }
     })
+    .add('fast-csv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const rows = [];
+        const readable = stream.Readable.from(fileBuffer);
+        readable
+          .pipe(fastCsv.parse({ headers: true }))
+          .on('data', (row) => rows.push(row))
+          .on('end', () => {
+            const specificRow = rows[TARGET_ROW_INDEX];
+            deferred.resolve();
+          })
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
     .add('neat-csv (async/promise)', {
       defer: true,
       fn: (deferred) => {
@@ -238,6 +319,32 @@ async function runAllBenchmarks() {
           .catch((err) => deferred.reject(err));
       }
     })
+    .add('udsv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const readable = stream.Readable.from(fileString);
+        let parser = null;
+        let result = null;
+
+        readable
+          .on('data', (chunk) => {
+            const strChunk = chunk.toString();
+            if (!parser) {
+              const schema = inferSchema(strChunk);
+              parser = initParser(schema);
+            }
+            parser.chunk(strChunk, parser.stringArrs);
+          })
+          .on('end', () => {
+            if (parser) {
+              result = parser.end();
+              const specificRow = result[TARGET_ROW_INDEX];
+            }
+            deferred.resolve();
+          })
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
     .on('cycle', (event) => logCycle(event, 'async_data'))
     .on('error', reject)
     .on('complete', function() {
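
The udsv entries added above all follow the same incremental pattern from udsv's API as used in the diff: infer a schema from the first chunk, create a parser, feed chunks, then flush with `end()`. A self-contained sketch of that pattern outside the Benchmark.js harness (the file path is a placeholder):

```js
const fs = require('fs');
const { inferSchema, initParser } = require('udsv');

let parser = null;

fs.createReadStream('data.csv') // placeholder path
  .on('data', (chunk) => {
    const strChunk = chunk.toString();
    // First chunk: infer the column schema, then build the parser once.
    if (parser == null) {
      parser = initParser(inferSchema(strChunk));
    }
    // Accumulate rows as arrays of strings while streaming.
    parser.chunk(strChunk, parser.stringArrs);
  })
  .on('end', () => {
    const rows = parser.end(); // flush the buffered tail and collect rows
    console.log(rows.length, 'rows');
  });
```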
package/cisv/cisv_addon.cc
CHANGED