cisv 0.0.41 → 0.0.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,20 +5,15 @@
5
5
  ![Size](https://deno.bundlejs.com/badge?q=spring-easing)
6
6
  ![Downloads](https://badgen.net/npm/dw/cisv)
7
7
 
8
+ > # DISCLAIMER
9
+ >
10
+ > This csv parser does not cover all quotes/comments edge cases; it is meant for now to be just extremely fast, thus not PROD ready yet.
8
11
 
9
12
  Cisv is a csv parser on steroids... literally.
10
13
  It's a high-performance CSV parser/writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool with extensive configuration options.
11
14
 
12
15
  I wrote about the basics in a blog post; you can read it here: https://sanixdk.xyz/blogs/how-i-accidentally-created-the-fastest-csv-parser-ever-made.
13
16
 
14
- ## PERFORMANCE
15
-
16
- - **469,968 MB/s** throughput on 2M row CSV files (AVX-512)
17
- - **10-100x faster** than popular CSV parsers
18
- - Zero-copy memory-mapped I/O with kernel optimizations
19
- - SIMD accelerated with AVX-512/AVX2 auto-detection
20
- - Dynamic lookup tables for configurable parsing
21
-
22
17
  ## CLI BENCHMARKS WITH DOCKER
23
18
 
24
19
  ```bash
@@ -120,16 +115,16 @@ const tsv_rows = tsv_parser.parseSync('./data.tsv');
120
115
  ### CLI
121
116
  ```bash
122
117
  # Basic parsing
123
- cisv data.csv
118
+ cisv_bin data.csv
124
119
 
125
120
  # Parse TSV file
126
- cisv -d $'\t' data.tsv
121
+ cisv_bin -d $'\t' data.tsv
127
122
 
128
123
  # Parse with custom quote and trim
129
- cisv -q "'" -t data.csv
124
+ cisv_bin -q "'" -t data.csv
130
125
 
131
126
  # Skip comment lines
132
- cisv -m '#' config.csv
127
+ cisv_bin -m '#' config.csv
133
128
  ```
134
129
 
135
130
  ## CONFIGURATION OPTIONS
@@ -352,7 +347,7 @@ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
352
347
  ### PARSING OPTIONS
353
348
 
354
349
  ```bash
355
- cisv [OPTIONS] [FILE]
350
+ cisv_bin [OPTIONS] [FILE]
356
351
 
357
352
  General Options:
358
353
  -h, --help Show help message
@@ -384,34 +379,34 @@ Processing Options:
384
379
 
385
380
  ```bash
386
381
  # Parse TSV file
387
- cisv -d $'\t' data.tsv
382
+ cisv_bin -d $'\t' data.tsv
388
383
 
389
384
  # Parse CSV with semicolon delimiter and single quotes
390
- cisv -d ';' -q "'" european.csv
385
+ cisv_bin -d ';' -q "'" european.csv
391
386
 
392
387
  # Skip comment lines starting with #
393
- cisv -m '#' config.csv
388
+ cisv_bin -m '#' config.csv
394
389
 
395
390
  # Trim whitespace and skip empty lines
396
- cisv -t --skip-empty messy.csv
391
+ cisv_bin -t --skip-empty messy.csv
397
392
 
398
393
  # Parse lines 100-1000 only
399
- cisv --from-line 100 --to-line 1000 large.csv
394
+ cisv_bin --from-line 100 --to-line 1000 large.csv
400
395
 
401
396
  # Select specific columns
402
- cisv -s 0,2,5,7 data.csv
397
+ cisv_bin -s 0,2,5,7 data.csv
403
398
 
404
399
  # Count rows with specific configuration
405
- cisv -c -d $'\t' --skip-empty data.tsv
400
+ cisv_bin -c -d $'\t' --skip-empty data.tsv
406
401
 
407
402
  # Benchmark with custom delimiter
408
- cisv -b -d ';' european.csv
403
+ cisv_bin -b -d ';' european.csv
409
404
  ```
410
405
 
411
406
  ### WRITING
412
407
 
413
408
  ```bash
414
- cisv write [OPTIONS]
409
+ cisv_bin write [OPTIONS]
415
410
 
416
411
  Options:
417
412
  -g, --generate N Generate N rows of test data
@@ -423,44 +418,9 @@ Options:
423
418
  -b, --benchmark Benchmark mode
424
419
  ```
425
420
 
426
- ## BENCHMARKS
427
-
428
- ### PARSER PERFORMANCE (273 MB, 5M ROWS)
429
-
430
- | Parser | Speed (MB/s) | Time (ms) | Relative |
431
- |---------------|--------------|-----------|----------------|
432
- | **cisv** | 7,184 | 38 | 1.0x (fastest) |
433
- | rust-csv | 391 | 698 | 18x slower |
434
- | xsv | 650 | 420 | 11x slower |
435
- | csvkit | 28 | 9,875 | 260x slower |
436
-
437
- ### NODE.JS LIBRARY BENCHMARKS
438
-
439
- | Library | Speed (MB/s) | Operations/sec | Configuration Support |
440
- |--------------------|--------------|----------------|----------------------|
441
- | cisv | 61.24 | 136,343 | Full |
442
- | csv-parse | 15.48 | 34,471 | Partial |
443
- | papaparse | 25.67 | 57,147 | Partial |
444
-
445
- (you can check more benchmarks details from release pipelines)
446
-
447
- ### RUNNING BENCHMARKS
448
-
449
- ```bash
450
- # CLI benchmarks
451
- make clean && make cli && make benchmark-cli
452
-
453
- # Node.js benchmarks
454
- npm run benchmark
455
-
456
- # Benchmark with custom configuration
457
- cisv -b -d ';' -q "'" --trim european.csv
458
- ```
459
-
460
421
  ## TECHNICAL ARCHITECTURE
461
422
 
462
423
  - **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
463
- - **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
464
424
  - **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
465
425
  - **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
466
426
  - **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
@@ -6,6 +6,9 @@ const { parse: csvParseSync } = require('csv-parse/sync');
6
6
  const { parse: csvParseStream } = require('csv-parse');
7
7
  const Papa = require('papaparse');
8
8
  const fastCsv = require('fast-csv');
9
+ const { inferSchema, initParser } = require('udsv');
10
+ const d3 = require('d3-dsv');
11
+ // const { iter } = require('but-csv');
9
12
  const fs = require('fs');
10
13
  const { Suite } = require('benchmark');
11
14
  const stream = require('stream');
@@ -90,6 +93,17 @@ async function runAllBenchmarks() {
90
93
  .add('papaparse (sync)', () => {
91
94
  Papa.parse(fileString, { fastMode: true });
92
95
  })
96
+ .add('udsv (sync)', () => {
97
+ const schema = inferSchema(fileString);
98
+ const parser = initParser(schema);
99
+ parser.stringArrs(fileString);
100
+ })
101
+ .add('d3-dsv (sync)', () => {
102
+ d3.csvParse(fileString);
103
+ })
104
+ // .add('but-csv (sync)', () => {
105
+ // Array.from(iter(fileString));
106
+ // })
93
107
  .on('cycle', (event) => logCycle(event, 'sync'))
94
108
  .on('error', reject)
95
109
  .on('complete', function() {
@@ -122,6 +136,20 @@ async function runAllBenchmarks() {
122
136
  const result = Papa.parse(fileString, { fastMode: true });
123
137
  const specificRow = result.data[TARGET_ROW_INDEX];
124
138
  })
139
+ .add('udsv (sync)', () => {
140
+ const schema = inferSchema(fileString);
141
+ const parser = initParser(schema);
142
+ const rows = parser.stringArrs(fileString);
143
+ const specificRow = rows[TARGET_ROW_INDEX];
144
+ })
145
+ .add('d3-dsv (sync)', () => {
146
+ const rows = d3.csvParse(fileString);
147
+ const specificRow = rows[TARGET_ROW_INDEX];
148
+ })
149
+ //.add('but-csv (sync)', () => {
150
+ // const rows = Array.from(iter(fileString));
151
+ // const specificRow = rows[TARGET_ROW_INDEX];
152
+ //})
125
153
  .on('cycle', (event) => logCycle(event, 'sync_data'))
126
154
  .on('error', reject)
127
155
  .on('complete', function() {
@@ -168,6 +196,18 @@ async function runAllBenchmarks() {
168
196
  });
169
197
  }
170
198
  })
199
+ .add('fast-csv (async/stream)', {
200
+ defer: true,
201
+ fn: (deferred) => {
202
+ const rows = [];
203
+ const readable = stream.Readable.from(fileBuffer);
204
+ readable
205
+ .pipe(fastCsv.parse({ headers: true }))
206
+ .on('data', (row) => rows.push(row))
207
+ .on('end', () => deferred.resolve())
208
+ .on('error', (err) => deferred.reject(err));
209
+ }
210
+ })
171
211
  .add('neat-csv (async/promise)', {
172
212
  defer: true,
173
213
  fn: (deferred) => {
@@ -178,6 +218,32 @@ async function runAllBenchmarks() {
178
218
  .catch((err) => deferred.reject(err));
179
219
  }
180
220
  })
221
+ .add('udsv (async/stream)', {
222
+ defer: true,
223
+ fn: (deferred) => {
224
+ const readable = stream.Readable.from(fileString);
225
+ let parser = null;
226
+
227
+ readable
228
+ .on('data', (chunk) => {
229
+ const strChunk = chunk.toString();
230
+ if (parser == null) {
231
+ const schema = inferSchema(strChunk);
232
+ parser = initParser(schema);
233
+ }
234
+ parser.chunk(strChunk);
235
+ })
236
+ .on('end', () => {
237
+ if (parser != null) {
238
+ parser.end();
239
+ }
240
+ deferred.resolve();
241
+ })
242
+ .on('error', (err) => deferred.reject(err));
243
+ }
244
+ })
245
+ // Note: d3-dsv and but-csv don't have native async streaming support
246
+ // so they are only included in sync benchmarks
181
247
  .on('cycle', (event) => logCycle(event, 'async'))
182
248
  .on('error', reject)
183
249
  .on('complete', function() {
@@ -227,6 +293,21 @@ async function runAllBenchmarks() {
227
293
  });
228
294
  }
229
295
  })
296
+ .add('fast-csv (async/stream)', {
297
+ defer: true,
298
+ fn: (deferred) => {
299
+ const rows = [];
300
+ const readable = stream.Readable.from(fileBuffer);
301
+ readable
302
+ .pipe(fastCsv.parse({ headers: true }))
303
+ .on('data', (row) => rows.push(row))
304
+ .on('end', () => {
305
+ const specificRow = rows[TARGET_ROW_INDEX];
306
+ deferred.resolve();
307
+ })
308
+ .on('error', (err) => deferred.reject(err));
309
+ }
310
+ })
230
311
  .add('neat-csv (async/promise)', {
231
312
  defer: true,
232
313
  fn: (deferred) => {
@@ -238,6 +319,32 @@ async function runAllBenchmarks() {
238
319
  .catch((err) => deferred.reject(err));
239
320
  }
240
321
  })
322
+ .add('udsv (async/stream)', {
323
+ defer: true,
324
+ fn: (deferred) => {
325
+ const readable = stream.Readable.from(fileString);
326
+ let parser = null;
327
+ let result = null;
328
+
329
+ readable
330
+ .on('data', (chunk) => {
331
+ const strChunk = chunk.toString();
332
+ if (!parser) {
333
+ const schema = inferSchema(strChunk);
334
+ parser = initParser(schema);
335
+ }
336
+ parser.chunk(strChunk, parser.stringArrs);
337
+ })
338
+ .on('end', () => {
339
+ if (parser) {
340
+ result = parser.end();
341
+ const specificRow = result[TARGET_ROW_INDEX];
342
+ }
343
+ deferred.resolve();
344
+ })
345
+ .on('error', (err) => deferred.reject(err));
346
+ }
347
+ })
241
348
  .on('cycle', (event) => logCycle(event, 'async_data'))
242
349
  .on('error', reject)
243
350
  .on('complete', function() {
@@ -476,7 +476,8 @@ public:
476
476
  if (!is_destroyed_) {
477
477
  cisv_parser_end(parser_);
478
478
  // Clear the environment reference after ending
479
- rc_->env = nullptr;
479
+ // FIXME: the transformer may need this
480
+ // rc_->env = nullptr;
480
481
  }
481
482
  }
482
483