cisv 0.0.42 → 0.0.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -57
- package/benchmark/benchmark.js +107 -0
- package/cisv/cisv_addon.cc +2 -1
- package/cisv/cisv_parser.c +648 -891
- package/cisv/cisv_transformer.c +0 -1
- package/cisv/cisv_writer.c +0 -1
- package/package.json +10 -7
package/README.md
CHANGED
@@ -5,20 +5,15 @@
 
 
 
+> # DISCLAIMER
+>
+> This CSV parser does not cover all quotes/comments edge cases; for now it is meant to be just extremely fast, and is thus not PROD-ready yet.
 
 Cisv is a csv parser on steroids... literally.
 It's a high-performance CSV parser/writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and a standalone CLI tool with extensive configuration options.
 
 I wrote about the basics in a blog post; you can read it here: https://sanixdk.xyz/blogs/how-i-accidentally-created-the-fastest-csv-parser-ever-made.
 
-## PERFORMANCE
-
-- **469,968 MB/s** throughput on 2M row CSV files (AVX-512)
-- **10-100x faster** than popular CSV parsers
-- Zero-copy memory-mapped I/O with kernel optimizations
-- SIMD accelerated with AVX-512/AVX2 auto-detection
-- Dynamic lookup tables for configurable parsing
-
 ## CLI BENCHMARKS WITH DOCKER
 
 ```bash
@@ -120,16 +115,16 @@ const tsv_rows = tsv_parser.parseSync('./data.tsv');
 ### CLI
 ```bash
 # Basic parsing
-cisv data.csv
+cisv_bin data.csv
 
 # Parse TSV file
-cisv -d $'\t' data.tsv
+cisv_bin -d $'\t' data.tsv
 
 # Parse with custom quote and trim
-cisv -q "'" -t data.csv
+cisv_bin -q "'" -t data.csv
 
 # Skip comment lines
-cisv -m '#' config.csv
+cisv_bin -m '#' config.csv
 ```
 
 ## CONFIGURATION OPTIONS
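
The hunk header above quotes the Node.js API that these CLI examples mirror (`tsv_parser.parseSync('./data.tsv')`). As a rough illustration of that addon usage, here is a minimal sketch; the export name and option key below are assumptions, not confirmed cisv API:

```js
// Hypothetical sketch based on the `tsv_parser.parseSync('./data.tsv')`
// excerpt in the hunk header; verify the class and option names in the docs.
const { CisvParser } = require('cisv'); // assumed export name

// Assumed option key; mirrors the CLI's `-d $'\t'` flag.
const tsv_parser = new CisvParser({ delimiter: '\t' });

// parseSync reads and parses the whole file in one call.
const tsv_rows = tsv_parser.parseSync('./data.tsv');
console.log(tsv_rows.length, 'rows');
```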
@@ -352,7 +347,7 @@ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
 ### PARSING OPTIONS
 
 ```bash
-cisv [OPTIONS] [FILE]
+cisv_bin [OPTIONS] [FILE]
 
 General Options:
   -h, --help        Show help message
@@ -384,34 +379,34 @@ Processing Options:
 
 ```bash
 # Parse TSV file
-cisv -d $'\t' data.tsv
+cisv_bin -d $'\t' data.tsv
 
 # Parse CSV with semicolon delimiter and single quotes
-cisv -d ';' -q "'" european.csv
+cisv_bin -d ';' -q "'" european.csv
 
 # Skip comment lines starting with #
-cisv -m '#' config.csv
+cisv_bin -m '#' config.csv
 
 # Trim whitespace and skip empty lines
-cisv -t --skip-empty messy.csv
+cisv_bin -t --skip-empty messy.csv
 
 # Parse lines 100-1000 only
-cisv --from-line 100 --to-line 1000 large.csv
+cisv_bin --from-line 100 --to-line 1000 large.csv
 
 # Select specific columns
-cisv -s 0,2,5,7 data.csv
+cisv_bin -s 0,2,5,7 data.csv
 
 # Count rows with specific configuration
-cisv -c -d $'\t' --skip-empty data.tsv
+cisv_bin -c -d $'\t' --skip-empty data.tsv
 
 # Benchmark with custom delimiter
-cisv -b -d ';' european.csv
+cisv_bin -b -d ';' european.csv
 ```
 
 ### WRITING
 
 ```bash
-cisv write [OPTIONS]
+cisv_bin write [OPTIONS]
 
 Options:
   -g, --generate N  Generate N rows of test data
@@ -423,44 +418,9 @@ Options:
   -b, --benchmark   Benchmark mode
 ```
 
-## BENCHMARKS
-
-### PARSER PERFORMANCE (273 MB, 5M ROWS)
-
-| Parser   | Speed (MB/s) | Time (ms) | Relative       |
-|----------|--------------|-----------|----------------|
-| **cisv** | 7,184        | 38        | 1.0x (fastest) |
-| rust-csv | 391          | 698       | 18x slower     |
-| xsv      | 650          | 420       | 11x slower     |
-| csvkit   | 28           | 9,875     | 260x slower    |
-
-### NODE.JS LIBRARY BENCHMARKS
-
-| Library   | Speed (MB/s) | Operations/sec | Configuration Support |
-|-----------|--------------|----------------|-----------------------|
-| cisv      | 61.24        | 136,343        | Full                  |
-| csv-parse | 15.48        | 34,471         | Partial               |
-| papaparse | 25.67        | 57,147         | Partial               |
-
-(you can check more benchmark details from the release pipelines)
-
-### RUNNING BENCHMARKS
-
-```bash
-# CLI benchmarks
-make clean && make cli && make benchmark-cli
-
-# Node.js benchmarks
-npm run benchmark
-
-# Benchmark with custom configuration
-cisv -b -d ';' -q "'" --trim european.csv
-```
-
 ## TECHNICAL ARCHITECTURE
 
 - **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
-- **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
 - **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
 - **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
 - **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
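
For completeness, the `@@ -352,7 +347,7` hunk header above quotes `cisvParser.countRowsWithConfig('data.tsv', { ... })`, the library counterpart of the `-c` CLI examples. A sketch of that call, with option keys inferred from the CLI flags (the key names are assumptions):

```js
// Sketch only: option keys are guesses mapped from the CLI flags
// (-d delimiter, --skip-empty); check the cisv README for exact names.
const { CisvParser } = require('cisv'); // assumed export name
const cisvParser = new CisvParser();

// Library counterpart of `cisv_bin -c -d $'\t' --skip-empty data.tsv`.
const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
  delimiter: '\t',
  skipEmptyLines: true, // assumed key for --skip-empty
});
console.log(`${tsvCount} rows`);
```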
package/benchmark/benchmark.js
CHANGED
@@ -6,6 +6,9 @@ const { parse: csvParseSync } = require('csv-parse/sync');
 const { parse: csvParseStream } = require('csv-parse');
 const Papa = require('papaparse');
 const fastCsv = require('fast-csv');
+const { inferSchema, initParser } = require('udsv');
+const d3 = require('d3-dsv');
+// const { iter } = require('but-csv');
 const fs = require('fs');
 const { Suite } = require('benchmark');
 const stream = require('stream');
@@ -90,6 +93,17 @@ async function runAllBenchmarks() {
     .add('papaparse (sync)', () => {
       Papa.parse(fileString, { fastMode: true });
     })
+    .add('udsv (sync)', () => {
+      const schema = inferSchema(fileString);
+      const parser = initParser(schema);
+      parser.stringArrs(fileString);
+    })
+    .add('d3-dsv (sync)', () => {
+      d3.csvParse(fileString);
+    })
+    // .add('but-csv (sync)', () => {
+    //   Array.from(iter(fileString));
+    // })
     .on('cycle', (event) => logCycle(event, 'sync'))
     .on('error', reject)
     .on('complete', function() {
@@ -122,6 +136,20 @@ async function runAllBenchmarks() {
       const result = Papa.parse(fileString, { fastMode: true });
       const specificRow = result.data[TARGET_ROW_INDEX];
     })
+    .add('udsv (sync)', () => {
+      const schema = inferSchema(fileString);
+      const parser = initParser(schema);
+      const rows = parser.stringArrs(fileString);
+      const specificRow = rows[TARGET_ROW_INDEX];
+    })
+    .add('d3-dsv (sync)', () => {
+      const rows = d3.csvParse(fileString);
+      const specificRow = rows[TARGET_ROW_INDEX];
+    })
+    // .add('but-csv (sync)', () => {
+    //   const rows = Array.from(iter(fileString));
+    //   const specificRow = rows[TARGET_ROW_INDEX];
+    // })
     .on('cycle', (event) => logCycle(event, 'sync_data'))
     .on('error', reject)
     .on('complete', function() {
@@ -168,6 +196,18 @@ async function runAllBenchmarks() {
         });
       }
     })
+    .add('fast-csv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const rows = [];
+        const readable = stream.Readable.from(fileBuffer);
+        readable
+          .pipe(fastCsv.parse({ headers: true }))
+          .on('data', (row) => rows.push(row))
+          .on('end', () => deferred.resolve())
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
     .add('neat-csv (async/promise)', {
       defer: true,
       fn: (deferred) => {
@@ -178,6 +218,32 @@ async function runAllBenchmarks() {
           .catch((err) => deferred.reject(err));
       }
     })
+    .add('udsv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const readable = stream.Readable.from(fileString);
+        let parser = null;
+
+        readable
+          .on('data', (chunk) => {
+            const strChunk = chunk.toString();
+            if (parser == null) {
+              const schema = inferSchema(strChunk);
+              parser = initParser(schema);
+            }
+            parser.chunk(strChunk);
+          })
+          .on('end', () => {
+            if (parser != null) {
+              parser.end();
+            }
+            deferred.resolve();
+          })
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
+    // Note: d3-dsv and but-csv don't have native async streaming support,
+    // so they are only included in sync benchmarks
     .on('cycle', (event) => logCycle(event, 'async'))
     .on('error', reject)
     .on('complete', function() {
@@ -227,6 +293,21 @@ async function runAllBenchmarks() {
         });
       }
     })
+    .add('fast-csv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const rows = [];
+        const readable = stream.Readable.from(fileBuffer);
+        readable
+          .pipe(fastCsv.parse({ headers: true }))
+          .on('data', (row) => rows.push(row))
+          .on('end', () => {
+            const specificRow = rows[TARGET_ROW_INDEX];
+            deferred.resolve();
+          })
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
     .add('neat-csv (async/promise)', {
       defer: true,
       fn: (deferred) => {
@@ -238,6 +319,32 @@ async function runAllBenchmarks() {
           .catch((err) => deferred.reject(err));
       }
     })
+    .add('udsv (async/stream)', {
+      defer: true,
+      fn: (deferred) => {
+        const readable = stream.Readable.from(fileString);
+        let parser = null;
+        let result = null;
+
+        readable
+          .on('data', (chunk) => {
+            const strChunk = chunk.toString();
+            if (!parser) {
+              const schema = inferSchema(strChunk);
+              parser = initParser(schema);
+            }
+            parser.chunk(strChunk, parser.stringArrs);
+          })
+          .on('end', () => {
+            if (parser) {
+              result = parser.end();
+              const specificRow = result[TARGET_ROW_INDEX];
+            }
+            deferred.resolve();
+          })
+          .on('error', (err) => deferred.reject(err));
+      }
+    })
     .on('cycle', (event) => logCycle(event, 'async_data'))
     .on('error', reject)
     .on('complete', function() {
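
The udsv entries added above all follow the same incremental pattern from udsv's API as used in the diff: infer a schema from the first chunk, create a parser, feed chunks, then flush with `end()`. A self-contained sketch of that pattern outside the Benchmark.js harness (the file path is a placeholder):

```js
const fs = require('fs');
const { inferSchema, initParser } = require('udsv');

let parser = null;

fs.createReadStream('data.csv') // placeholder path
  .on('data', (chunk) => {
    const strChunk = chunk.toString();
    // First chunk: infer the column schema, then build the parser once.
    if (parser == null) {
      parser = initParser(inferSchema(strChunk));
    }
    // Accumulate rows as arrays of strings while streaming.
    parser.chunk(strChunk, parser.stringArrs);
  })
  .on('end', () => {
    const rows = parser.end(); // flush the buffered tail and collect rows
    console.log(rows.length, 'rows');
  });
```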
package/cisv/cisv_addon.cc
CHANGED