cisv 0.0.42 → 0.0.60

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,20 +5,15 @@
5
5
  ![Size](https://deno.bundlejs.com/badge?q=spring-easing)
6
6
  ![Downloads](https://badgen.net/npm/dw/cisv)
7
7
 
8
+ > # DISCLAIMER
9
+ >
10
+ > This csv parser does not cover all quotes/comments edge cases; for now it is meant to be just extremely fast, and is thus not PROD-ready yet.
8
11
 
9
12
  Cisv is a csv parser on steroids... literally.
10
13
  It's a high-performance CSV parser/writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool with extensive configuration options.
11
14
 
12
15
  I wrote about the basics in a blog post; you can read it here: https://sanixdk.xyz/blogs/how-i-accidentally-created-the-fastest-csv-parser-ever-made.
13
16
 
14
- ## PERFORMANCE
15
-
16
- - **469,968 MB/s** throughput on 2M row CSV files (AVX-512)
17
- - **10-100x faster** than popular CSV parsers
18
- - Zero-copy memory-mapped I/O with kernel optimizations
19
- - SIMD accelerated with AVX-512/AVX2 auto-detection
20
- - Dynamic lookup tables for configurable parsing
21
-
22
17
  ## CLI BENCHMARKS WITH DOCKER
23
18
 
24
19
  ```bash
@@ -41,41 +36,47 @@ $ docker run --rm \
41
36
  ## BENCHMARKS
42
37
 
43
38
  Benchmarks comparison with existing popular tools,
44
- cf pipeline you can check : (https://github.com/Sanix-Darker/cisv/actions/runs/17697547058/job/50298916576) a step "Publish to npm"
39
+ cf. the pipeline you can check: (https://github.com/Sanix-Darker/cisv/actions/runs/18422464917/job/52498590205) at the "Publish to npm" step.
45
40
 
46
41
  ### SYNCHRONOUS RESULTS
47
42
 
48
43
  | Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
49
44
  |--------------------|--------------|---------------|----------------|
50
- | cisv (sync) | 45.58 | 0.01 | 98543 |
51
- | csv-parse (sync) | 18.11 | 0.03 | 39155 |
52
- | papaparse (sync) | 28.03 | 0.02 | 60596 |
45
+ | cisv (sync) | 71.10 | 0.01 | 153723 |
46
+ | csv-parse (sync) | 18.76 | 0.02 | 40563 |
47
+ | papaparse (sync) | 27.97 | 0.02 | 60467 |
48
+ | udsv (sync) | 69.81 | 0.01 | 150930 |
49
+ | d3-dsv (sync) | 98.11 | 0.00 | 212117 |
53
50
 
54
51
  ### SYNCHRONOUS RESULTS (WITH DATA ACCESS)
55
52
 
56
53
  | Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
57
54
  |--------------------|--------------|---------------|----------------|
58
- | cisv (sync) | 46.80 | 0.01 | 101185 |
59
- | csv-parse (sync) | 18.92 | 0.02 | 40900 |
60
- | papaparse (sync) | 28.38 | 0.02 | 61363 |
61
-
55
+ | cisv (sync) | 104.58 | 0.00 | 226108 |
56
+ | csv-parse (sync) | 16.87 | 0.03 | 36482 |
57
+ | papaparse (sync) | 28.13 | 0.02 | 60807 |
58
+ | udsv (sync) | 69.29 | 0.01 | 149812 |
59
+ | d3-dsv (sync) | 96.32 | 0.00 | 208248 |
62
60
 
63
61
  ### ASYNCHRONOUS RESULTS
64
62
 
65
63
  | Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
66
64
  |--------------------------|--------------|---------------|----------------|
67
- | cisv (async/stream) | 70.07 | 0.01 | 151485 |
68
- | papaparse (async/stream) | 21.58 | 0.02 | 46646 |
69
- | neat-csv (async/promise) | 9.77 | 0.05 | 21126 |
70
-
65
+ | cisv (async/stream) | 98.36 | 0.00 | 212662 |
66
+ | papaparse (async/stream) | 21.56 | 0.02 | 46609 |
67
+ | fast-csv (async/stream) | 10.09 | 0.05 | 21817 |
68
+ | neat-csv (async/promise) | 9.20 | 0.05 | 19898 |
69
+ | udsv (async/stream) | 51.74 | 0.01 | 111858 |
71
70
 
72
71
  ### ASYNCHRONOUS RESULTS (WITH DATA ACCESS)
73
72
 
74
73
  | Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
75
74
  |--------------------------|--------------|---------------|----------------|
76
- | cisv (async/stream) | 25.23 | 0.02 | 54545 |
77
- | papaparse (async/stream) | 22.49 | 0.02 | 48622 |
78
- | neat-csv (async/promise) | 9.91 | 0.05 | 21428 |
75
+ | cisv (async/stream) | 27.50 | 0.02 | 59460 |
76
+ | papaparse (async/stream) | 21.98 | 0.02 | 47513 |
77
+ | fast-csv (async/stream) | 10.05 | 0.05 | 21719 |
78
+ | neat-csv (async/promise) | 9.58 | 0.05 | 20711 |
79
+ | udsv (async/stream) | 53.26 | 0.01 | 115146 |
79
80
 
80
81
  ## INSTALLATION
81
82
 
@@ -120,16 +121,16 @@ const tsv_rows = tsv_parser.parseSync('./data.tsv');
120
121
  ### CLI
121
122
  ```bash
122
123
  # Basic parsing
123
- cisv data.csv
124
+ cisv_bin data.csv
124
125
 
125
126
  # Parse TSV file
126
- cisv -d $'\t' data.tsv
127
+ cisv_bin -d $'\t' data.tsv
127
128
 
128
129
  # Parse with custom quote and trim
129
- cisv -q "'" -t data.csv
130
+ cisv_bin -q "'" -t data.csv
130
131
 
131
132
  # Skip comment lines
132
- cisv -m '#' config.csv
133
+ cisv_bin -m '#' config.csv
133
134
  ```
134
135
 
135
136
  ## CONFIGURATION OPTIONS
@@ -352,7 +353,7 @@ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
352
353
  ### PARSING OPTIONS
353
354
 
354
355
  ```bash
355
- cisv [OPTIONS] [FILE]
356
+ cisv_bin [OPTIONS] [FILE]
356
357
 
357
358
  General Options:
358
359
  -h, --help Show help message
@@ -384,34 +385,34 @@ Processing Options:
384
385
 
385
386
  ```bash
386
387
  # Parse TSV file
387
- cisv -d $'\t' data.tsv
388
+ cisv_bin -d $'\t' data.tsv
388
389
 
389
390
  # Parse CSV with semicolon delimiter and single quotes
390
- cisv -d ';' -q "'" european.csv
391
+ cisv_bin -d ';' -q "'" european.csv
391
392
 
392
393
  # Skip comment lines starting with #
393
- cisv -m '#' config.csv
394
+ cisv_bin -m '#' config.csv
394
395
 
395
396
  # Trim whitespace and skip empty lines
396
- cisv -t --skip-empty messy.csv
397
+ cisv_bin -t --skip-empty messy.csv
397
398
 
398
399
  # Parse lines 100-1000 only
399
- cisv --from-line 100 --to-line 1000 large.csv
400
+ cisv_bin --from-line 100 --to-line 1000 large.csv
400
401
 
401
402
  # Select specific columns
402
- cisv -s 0,2,5,7 data.csv
403
+ cisv_bin -s 0,2,5,7 data.csv
403
404
 
404
405
  # Count rows with specific configuration
405
- cisv -c -d $'\t' --skip-empty data.tsv
406
+ cisv_bin -c -d $'\t' --skip-empty data.tsv
406
407
 
407
408
  # Benchmark with custom delimiter
408
- cisv -b -d ';' european.csv
409
+ cisv_bin -b -d ';' european.csv
409
410
  ```
410
411
 
411
412
  ### WRITING
412
413
 
413
414
  ```bash
414
- cisv write [OPTIONS]
415
+ cisv_bin write [OPTIONS]
415
416
 
416
417
  Options:
417
418
  -g, --generate N Generate N rows of test data
@@ -423,44 +424,9 @@ Options:
423
424
  -b, --benchmark Benchmark mode
424
425
  ```
425
426
 
426
- ## BENCHMARKS
427
-
428
- ### PARSER PERFORMANCE (273 MB, 5M ROWS)
429
-
430
- | Parser | Speed (MB/s) | Time (ms) | Relative |
431
- |---------------|--------------|-----------|----------------|
432
- | **cisv** | 7,184 | 38 | 1.0x (fastest) |
433
- | rust-csv | 391 | 698 | 18x slower |
434
- | xsv | 650 | 420 | 11x slower |
435
- | csvkit | 28 | 9,875 | 260x slower |
436
-
437
- ### NODE.JS LIBRARY BENCHMARKS
438
-
439
- | Library | Speed (MB/s) | Operations/sec | Configuration Support |
440
- |--------------------|--------------|----------------|----------------------|
441
- | cisv | 61.24 | 136,343 | Full |
442
- | csv-parse | 15.48 | 34,471 | Partial |
443
- | papaparse | 25.67 | 57,147 | Partial |
444
-
445
- (you can check more benchmarks details from release pipelines)
446
-
447
- ### RUNNING BENCHMARKS
448
-
449
- ```bash
450
- # CLI benchmarks
451
- make clean && make cli && make benchmark-cli
452
-
453
- # Node.js benchmarks
454
- npm run benchmark
455
-
456
- # Benchmark with custom configuration
457
- cisv -b -d ';' -q "'" --trim european.csv
458
- ```
459
-
460
427
  ## TECHNICAL ARCHITECTURE
461
428
 
462
429
  - **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
463
- - **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
464
430
  - **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
465
431
  - **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
466
432
  - **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
@@ -6,6 +6,9 @@ const { parse: csvParseSync } = require('csv-parse/sync');
6
6
  const { parse: csvParseStream } = require('csv-parse');
7
7
  const Papa = require('papaparse');
8
8
  const fastCsv = require('fast-csv');
9
+ const { inferSchema, initParser } = require('udsv');
10
+ const d3 = require('d3-dsv');
11
+ // const { iter } = require('but-csv');
9
12
  const fs = require('fs');
10
13
  const { Suite } = require('benchmark');
11
14
  const stream = require('stream');
@@ -90,6 +93,17 @@ async function runAllBenchmarks() {
90
93
  .add('papaparse (sync)', () => {
91
94
  Papa.parse(fileString, { fastMode: true });
92
95
  })
96
+ .add('udsv (sync)', () => {
97
+ const schema = inferSchema(fileString);
98
+ const parser = initParser(schema);
99
+ parser.stringArrs(fileString);
100
+ })
101
+ .add('d3-dsv (sync)', () => {
102
+ d3.csvParse(fileString);
103
+ })
104
+ // .add('but-csv (sync)', () => {
105
+ // Array.from(iter(fileString));
106
+ // })
93
107
  .on('cycle', (event) => logCycle(event, 'sync'))
94
108
  .on('error', reject)
95
109
  .on('complete', function() {
@@ -122,6 +136,20 @@ async function runAllBenchmarks() {
122
136
  const result = Papa.parse(fileString, { fastMode: true });
123
137
  const specificRow = result.data[TARGET_ROW_INDEX];
124
138
  })
139
+ .add('udsv (sync)', () => {
140
+ const schema = inferSchema(fileString);
141
+ const parser = initParser(schema);
142
+ const rows = parser.stringArrs(fileString);
143
+ const specificRow = rows[TARGET_ROW_INDEX];
144
+ })
145
+ .add('d3-dsv (sync)', () => {
146
+ const rows = d3.csvParse(fileString);
147
+ const specificRow = rows[TARGET_ROW_INDEX];
148
+ })
149
+ //.add('but-csv (sync)', () => {
150
+ // const rows = Array.from(iter(fileString));
151
+ // const specificRow = rows[TARGET_ROW_INDEX];
152
+ //})
125
153
  .on('cycle', (event) => logCycle(event, 'sync_data'))
126
154
  .on('error', reject)
127
155
  .on('complete', function() {
@@ -168,6 +196,18 @@ async function runAllBenchmarks() {
168
196
  });
169
197
  }
170
198
  })
199
+ .add('fast-csv (async/stream)', {
200
+ defer: true,
201
+ fn: (deferred) => {
202
+ const rows = [];
203
+ const readable = stream.Readable.from(fileBuffer);
204
+ readable
205
+ .pipe(fastCsv.parse({ headers: true }))
206
+ .on('data', (row) => rows.push(row))
207
+ .on('end', () => deferred.resolve())
208
+ .on('error', (err) => deferred.reject(err));
209
+ }
210
+ })
171
211
  .add('neat-csv (async/promise)', {
172
212
  defer: true,
173
213
  fn: (deferred) => {
@@ -178,6 +218,32 @@ async function runAllBenchmarks() {
178
218
  .catch((err) => deferred.reject(err));
179
219
  }
180
220
  })
221
+ .add('udsv (async/stream)', {
222
+ defer: true,
223
+ fn: (deferred) => {
224
+ const readable = stream.Readable.from(fileString);
225
+ let parser = null;
226
+
227
+ readable
228
+ .on('data', (chunk) => {
229
+ const strChunk = chunk.toString();
230
+ if (parser == null) {
231
+ const schema = inferSchema(strChunk);
232
+ parser = initParser(schema);
233
+ }
234
+ parser.chunk(strChunk);
235
+ })
236
+ .on('end', () => {
237
+ if (parser != null) {
238
+ parser.end();
239
+ }
240
+ deferred.resolve();
241
+ })
242
+ .on('error', (err) => deferred.reject(err));
243
+ }
244
+ })
245
+ // Note: d3-dsv and but-csv don't have native async streaming support
246
+ // so they are only included in sync benchmarks
181
247
  .on('cycle', (event) => logCycle(event, 'async'))
182
248
  .on('error', reject)
183
249
  .on('complete', function() {
@@ -227,6 +293,21 @@ async function runAllBenchmarks() {
227
293
  });
228
294
  }
229
295
  })
296
+ .add('fast-csv (async/stream)', {
297
+ defer: true,
298
+ fn: (deferred) => {
299
+ const rows = [];
300
+ const readable = stream.Readable.from(fileBuffer);
301
+ readable
302
+ .pipe(fastCsv.parse({ headers: true }))
303
+ .on('data', (row) => rows.push(row))
304
+ .on('end', () => {
305
+ const specificRow = rows[TARGET_ROW_INDEX];
306
+ deferred.resolve();
307
+ })
308
+ .on('error', (err) => deferred.reject(err));
309
+ }
310
+ })
230
311
  .add('neat-csv (async/promise)', {
231
312
  defer: true,
232
313
  fn: (deferred) => {
@@ -238,6 +319,32 @@ async function runAllBenchmarks() {
238
319
  .catch((err) => deferred.reject(err));
239
320
  }
240
321
  })
322
+ .add('udsv (async/stream)', {
323
+ defer: true,
324
+ fn: (deferred) => {
325
+ const readable = stream.Readable.from(fileString);
326
+ let parser = null;
327
+ let result = null;
328
+
329
+ readable
330
+ .on('data', (chunk) => {
331
+ const strChunk = chunk.toString();
332
+ if (!parser) {
333
+ const schema = inferSchema(strChunk);
334
+ parser = initParser(schema);
335
+ }
336
+ parser.chunk(strChunk, parser.stringArrs);
337
+ })
338
+ .on('end', () => {
339
+ if (parser) {
340
+ result = parser.end();
341
+ const specificRow = result[TARGET_ROW_INDEX];
342
+ }
343
+ deferred.resolve();
344
+ })
345
+ .on('error', (err) => deferred.reject(err));
346
+ }
347
+ })
241
348
  .on('cycle', (event) => logCycle(event, 'async_data'))
242
349
  .on('error', reject)
243
350
  .on('complete', function() {
@@ -476,7 +476,8 @@ public:
476
476
  if (!is_destroyed_) {
477
477
  cisv_parser_end(parser_);
478
478
  // Clear the environment reference after ending
479
- rc_->env = nullptr;
479
+ // FIXME: the transformer may need this
480
+ // rc_->env = nullptr;
480
481
  }
481
482
  }
482
483