cisv 0.0.60 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +0 -209
- package/binding.gyp +23 -9
- package/build/Release/cisv.node +0 -0
- package/cisv/cisv_addon.cc +204 -16
- package/package.json +17 -9
- package/LICENSE +0 -21
- package/benchmark/benchmark.js +0 -418
- package/cisv/cisv_parser.c +0 -1333
- package/cisv/cisv_parser.h +0 -91
- package/cisv/cisv_simd.h +0 -54
- package/cisv/cisv_transformer.c +0 -624
- package/cisv/cisv_transformer.h +0 -171
- package/cisv/cisv_writer.c +0 -531
- package/cisv/cisv_writer.h +0 -60
- package/data.csv +0 -11
- package/index.d.ts +0 -280
- /package/{types → cisv/types}/cisv.d.ts +0 -0
package/README.md
CHANGED
|
@@ -5,79 +5,6 @@
|
|
|
5
5
|

|
|
6
6
|

|
|
7
7
|
|
|
8
|
-
> # DISCLAIMER
|
|
9
|
-
>
|
|
10
|
-
> This CSV parser does not cover all quote/comment edge cases; for now it is meant to be just extremely fast, and is therefore not production-ready yet.
|
|
11
|
-
|
|
12
|
-
Cisv is a csv parser on steroids... literally.
|
|
13
|
-
It's a high-performance CSV parser/writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool with extensive configuration options.
|
|
14
|
-
|
|
15
|
-
I wrote about the basics in a blog post, which you can read here: https://sanixdk.xyz/blogs/how-i-accidentally-created-the-fastest-csv-parser-ever-made.
|
|
16
|
-
|
|
17
|
-
## CLI BENCHMARKS WITH DOCKER
|
|
18
|
-
|
|
19
|
-
```bash
|
|
20
|
-
$ docker build -t cisv-benchmark .
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
To run them while choosing resource limits (CPU, memory) for the container, you can use:
|
|
24
|
-
|
|
25
|
-
```bash
|
|
26
|
-
$ docker run --rm \
|
|
27
|
-
--cpus="2.0" \
|
|
28
|
-
--memory="4g" \
|
|
29
|
-
--memory-swap="4g" \
|
|
30
|
-
--cpu-shares=1024 \
|
|
31
|
-
--security-opt \
|
|
32
|
-
seccomp=unconfined \
|
|
33
|
-
cisv-benchmark
|
|
34
|
-
```
|
|
35
|
-
|
|
36
|
-
## BENCHMARKS
|
|
37
|
-
|
|
38
|
-
Benchmarks comparison with existing popular tools,
|
|
39
|
-
see the CI pipeline run (https://github.com/Sanix-Darker/cisv/actions/runs/18422464917/job/52498590205), at the step "Publish to npm".
|
|
40
|
-
|
|
41
|
-
### SYNCHRONOUS RESULTS
|
|
42
|
-
|
|
43
|
-
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
44
|
-
|--------------------|--------------|---------------|----------------|
|
|
45
|
-
| cisv (sync) | 71.10 | 0.01 | 153723 |
|
|
46
|
-
| csv-parse (sync) | 18.76 | 0.02 | 40563 |
|
|
47
|
-
| papaparse (sync) | 27.97 | 0.02 | 60467 |
|
|
48
|
-
| udsv (sync) | 69.81 | 0.01 | 150930 |
|
|
49
|
-
| d3-dsv (sync) | 98.11 | 0.00 | 212117 |
|
|
50
|
-
|
|
51
|
-
### SYNCHRONOUS RESULTS (WITH DATA ACCESS)
|
|
52
|
-
|
|
53
|
-
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
54
|
-
|--------------------|--------------|---------------|----------------|
|
|
55
|
-
| cisv (sync) | 104.58 | 0.00 | 226108 |
|
|
56
|
-
| csv-parse (sync) | 16.87 | 0.03 | 36482 |
|
|
57
|
-
| papaparse (sync) | 28.13 | 0.02 | 60807 |
|
|
58
|
-
| udsv (sync) | 69.29 | 0.01 | 149812 |
|
|
59
|
-
| d3-dsv (sync) | 96.32 | 0.00 | 208248 |
|
|
60
|
-
|
|
61
|
-
### ASYNCHRONOUS RESULTS
|
|
62
|
-
|
|
63
|
-
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
64
|
-
|--------------------------|--------------|---------------|----------------|
|
|
65
|
-
| cisv (async/stream) | 98.36 | 0.00 | 212662 |
|
|
66
|
-
| papaparse (async/stream) | 21.56 | 0.02 | 46609 |
|
|
67
|
-
| fast-csv (async/stream) | 10.09 | 0.05 | 21817 |
|
|
68
|
-
| neat-csv (async/promise) | 9.20 | 0.05 | 19898 |
|
|
69
|
-
| udsv (async/stream) | 51.74 | 0.01 | 111858 |
|
|
70
|
-
|
|
71
|
-
### ASYNCHRONOUS RESULTS (WITH DATA ACCESS)
|
|
72
|
-
|
|
73
|
-
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
74
|
-
|--------------------------|--------------|---------------|----------------|
|
|
75
|
-
| cisv (async/stream) | 27.50 | 0.02 | 59460 |
|
|
76
|
-
| papaparse (async/stream) | 21.98 | 0.02 | 47513 |
|
|
77
|
-
| fast-csv (async/stream) | 10.05 | 0.05 | 21719 |
|
|
78
|
-
| neat-csv (async/promise) | 9.58 | 0.05 | 20711 |
|
|
79
|
-
| udsv (async/stream) | 53.26 | 0.01 | 115146 |
|
|
80
|
-
|
|
81
8
|
## INSTALLATION
|
|
82
9
|
|
|
83
10
|
### NODE.JS PACKAGE
|
|
@@ -85,18 +12,9 @@ cf pipeline you can check : (https://github.com/Sanix-Darker/cisv/actions/runs/1
|
|
|
85
12
|
npm install cisv
|
|
86
13
|
```
|
|
87
14
|
|
|
88
|
-
### CLI TOOL (FROM SOURCE)
|
|
89
|
-
```bash
|
|
90
|
-
git clone https://github.com/sanix-darker/cisv
|
|
91
|
-
cd cisv
|
|
92
|
-
make cli
|
|
93
|
-
sudo make install-cli
|
|
94
|
-
```
|
|
95
|
-
|
|
96
15
|
### BUILD FROM SOURCE (NODE.JS ADDON)
|
|
97
16
|
```bash
|
|
98
17
|
npm install -g node-gyp
|
|
99
|
-
make build
|
|
100
18
|
```
|
|
101
19
|
|
|
102
20
|
## QUICK START
|
|
@@ -118,20 +36,6 @@ const tsv_parser = new cisvParser({
|
|
|
118
36
|
const tsv_rows = tsv_parser.parseSync('./data.tsv');
|
|
119
37
|
```
|
|
120
38
|
|
|
121
|
-
### CLI
|
|
122
|
-
```bash
|
|
123
|
-
# Basic parsing
|
|
124
|
-
cisv_bin data.csv
|
|
125
|
-
|
|
126
|
-
# Parse TSV file
|
|
127
|
-
cisv_bin -d $'\t' data.tsv
|
|
128
|
-
|
|
129
|
-
# Parse with custom quote and trim
|
|
130
|
-
cisv_bin -q "'" -t data.csv
|
|
131
|
-
|
|
132
|
-
# Skip comment lines
|
|
133
|
-
cisv_bin -m '#' config.csv
|
|
134
|
-
```
|
|
135
39
|
|
|
136
40
|
## CONFIGURATION OPTIONS
|
|
137
41
|
|
|
@@ -347,116 +251,3 @@ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
|
|
|
347
251
|
toLine: 1000
|
|
348
252
|
});
|
|
349
253
|
```
|
|
350
|
-
|
|
351
|
-
## CLI USAGE
|
|
352
|
-
|
|
353
|
-
### PARSING OPTIONS
|
|
354
|
-
|
|
355
|
-
```bash
|
|
356
|
-
cisv_bin [OPTIONS] [FILE]
|
|
357
|
-
|
|
358
|
-
General Options:
|
|
359
|
-
-h, --help Show help message
|
|
360
|
-
-v, --version Show version
|
|
361
|
-
-o, --output FILE Write to FILE instead of stdout
|
|
362
|
-
-b, --benchmark Run benchmark mode
|
|
363
|
-
|
|
364
|
-
Configuration Options:
|
|
365
|
-
-d, --delimiter DELIM Field delimiter (default: ,)
|
|
366
|
-
-q, --quote CHAR Quote character (default: ")
|
|
367
|
-
-e, --escape CHAR Escape character (default: RFC4180 style)
|
|
368
|
-
-m, --comment CHAR Comment character (default: none)
|
|
369
|
-
-t, --trim Trim whitespace from fields
|
|
370
|
-
-r, --relaxed Use relaxed parsing rules
|
|
371
|
-
--skip-empty Skip empty lines
|
|
372
|
-
--skip-errors Skip lines with parse errors
|
|
373
|
-
--max-row SIZE Maximum row size in bytes
|
|
374
|
-
--from-line N Start from line N (1-based)
|
|
375
|
-
--to-line N Stop at line N
|
|
376
|
-
|
|
377
|
-
Processing Options:
|
|
378
|
-
-s, --select COLS Select columns (comma-separated indices)
|
|
379
|
-
-c, --count Show only row count
|
|
380
|
-
--head N Show first N rows
|
|
381
|
-
--tail N Show last N rows
|
|
382
|
-
```
|
|
383
|
-
|
|
384
|
-
### EXAMPLES
|
|
385
|
-
|
|
386
|
-
```bash
|
|
387
|
-
# Parse TSV file
|
|
388
|
-
cisv_bin -d $'\t' data.tsv
|
|
389
|
-
|
|
390
|
-
# Parse CSV with semicolon delimiter and single quotes
|
|
391
|
-
cisv_bin -d ';' -q "'" european.csv
|
|
392
|
-
|
|
393
|
-
# Skip comment lines starting with #
|
|
394
|
-
cisv_bin -m '#' config.csv
|
|
395
|
-
|
|
396
|
-
# Trim whitespace and skip empty lines
|
|
397
|
-
cisv_bin -t --skip-empty messy.csv
|
|
398
|
-
|
|
399
|
-
# Parse lines 100-1000 only
|
|
400
|
-
cisv_bin --from-line 100 --to-line 1000 large.csv
|
|
401
|
-
|
|
402
|
-
# Select specific columns
|
|
403
|
-
cisv_bin -s 0,2,5,7 data.csv
|
|
404
|
-
|
|
405
|
-
# Count rows with specific configuration
|
|
406
|
-
cisv_bin -c -d $'\t' --skip-empty data.tsv
|
|
407
|
-
|
|
408
|
-
# Benchmark with custom delimiter
|
|
409
|
-
cisv_bin -b -d ';' european.csv
|
|
410
|
-
```
|
|
411
|
-
|
|
412
|
-
### WRITING
|
|
413
|
-
|
|
414
|
-
```bash
|
|
415
|
-
cisv_bin write [OPTIONS]
|
|
416
|
-
|
|
417
|
-
Options:
|
|
418
|
-
-g, --generate N Generate N rows of test data
|
|
419
|
-
-o, --output FILE Output file
|
|
420
|
-
-d, --delimiter DELIM Field delimiter
|
|
421
|
-
-Q, --quote-all Quote all fields
|
|
422
|
-
-r, --crlf Use CRLF line endings
|
|
423
|
-
-n, --null TEXT Null representation
|
|
424
|
-
-b, --benchmark Benchmark mode
|
|
425
|
-
```
|
|
426
|
-
|
|
427
|
-
## TECHNICAL ARCHITECTURE
|
|
428
|
-
|
|
429
|
-
- **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
|
|
430
|
-
- **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
|
|
431
|
-
- **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
|
|
432
|
-
- **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
|
|
433
|
-
- **Configurable Parsing**: RFC 4180 compliant with extensive customization options
|
|
434
|
-
|
|
435
|
-
## FEATURES (PROS)
|
|
436
|
-
|
|
437
|
-
- RFC 4180 compliant with configurable extensions
|
|
438
|
-
- Handles quoted fields with embedded delimiters
|
|
439
|
-
- Support for multiple CSV dialects (TSV, PSV, etc.)
|
|
440
|
-
- Comment line support
|
|
441
|
-
- Field trimming and empty line handling
|
|
442
|
-
- Line range parsing for large files
|
|
443
|
-
- Streaming API for unlimited file sizes
|
|
444
|
-
- Safe fallback for non-x86 architectures
|
|
445
|
-
- High-performance CSV writer with SIMD optimization
|
|
446
|
-
- Row counting without full parsing
|
|
447
|
-
|
|
448
|
-
## LIMITATIONS
|
|
449
|
-
|
|
450
|
-
- Linux/Unix support only (optimized for x86_64 CPU)
|
|
451
|
-
- Windows support planned for future release
|
|
452
|
-
|
|
453
|
-
## LICENSE
|
|
454
|
-
|
|
455
|
-
MIT © [sanix-darker](https://github.com/sanix-darker)
|
|
456
|
-
|
|
457
|
-
## ACKNOWLEDGMENTS
|
|
458
|
-
|
|
459
|
-
Inspired by:
|
|
460
|
-
- [simdjson](https://github.com/simdjson/simdjson) - Parsing gigabytes of JSON per second
|
|
461
|
-
- [xsv](https://github.com/BurntSushi/xsv) - Fast CSV command line toolkit
|
|
462
|
-
- [rust-csv](https://github.com/BurntSushi/rust-csv) - CSV parser for Rust
|
package/binding.gyp
CHANGED
|
@@ -4,20 +4,22 @@
|
|
|
4
4
|
"target_name": "cisv",
|
|
5
5
|
"sources": [
|
|
6
6
|
"cisv/cisv_addon.cc",
|
|
7
|
-
"
|
|
8
|
-
"
|
|
9
|
-
"
|
|
7
|
+
"../../core/src/parser.c",
|
|
8
|
+
"../../core/src/writer.c",
|
|
9
|
+
"../../core/src/transformer.c"
|
|
10
10
|
],
|
|
11
11
|
"include_dirs": [
|
|
12
12
|
"<!@(node -p \"require('node-addon-api').include\")",
|
|
13
|
+
"../../core/include/",
|
|
13
14
|
"cisv/"
|
|
14
15
|
],
|
|
15
16
|
"dependencies": [
|
|
16
17
|
"<!(node -p \"require('node-addon-api').gyp\")"
|
|
17
18
|
],
|
|
18
19
|
"cflags!": [ "-fno-exceptions" ],
|
|
19
|
-
"cflags": ["-O3"
|
|
20
|
+
"cflags": ["-O3"],
|
|
20
21
|
"cflags_cc!": [ "-fno-exceptions" ],
|
|
22
|
+
"cflags_cc": ["-O3"],
|
|
21
23
|
"defines": [
|
|
22
24
|
"NAPI_DISABLE_CPP_EXCEPTIONS",
|
|
23
25
|
"NAPI_VERSION=6"
|
|
@@ -28,27 +30,39 @@
|
|
|
28
30
|
"-O3",
|
|
29
31
|
"-march=native",
|
|
30
32
|
"-mtune=native",
|
|
31
|
-
"-ffast-math"
|
|
33
|
+
"-ffast-math",
|
|
34
|
+
"-funroll-loops",
|
|
35
|
+
"-fomit-frame-pointer",
|
|
36
|
+
"-flto"
|
|
32
37
|
],
|
|
33
38
|
"cflags_cc": [
|
|
34
39
|
"-O3",
|
|
35
40
|
"-march=native",
|
|
36
41
|
"-mtune=native",
|
|
37
|
-
"-ffast-math"
|
|
38
|
-
|
|
42
|
+
"-ffast-math",
|
|
43
|
+
"-funroll-loops",
|
|
44
|
+
"-fomit-frame-pointer",
|
|
45
|
+
"-flto"
|
|
46
|
+
],
|
|
47
|
+
"ldflags": ["-flto"]
|
|
39
48
|
}],
|
|
40
49
|
["OS=='mac'", {
|
|
41
50
|
"xcode_settings": {
|
|
42
51
|
"GCC_OPTIMIZATION_LEVEL": "3",
|
|
52
|
+
"LLVM_LTO": "YES",
|
|
43
53
|
"OTHER_CFLAGS": [
|
|
44
54
|
"-march=native",
|
|
45
55
|
"-mtune=native",
|
|
46
|
-
"-ffast-math"
|
|
56
|
+
"-ffast-math",
|
|
57
|
+
"-funroll-loops",
|
|
58
|
+
"-fomit-frame-pointer"
|
|
47
59
|
],
|
|
48
60
|
"OTHER_CPLUSPLUSFLAGS": [
|
|
49
61
|
"-march=native",
|
|
50
62
|
"-mtune=native",
|
|
51
|
-
"-ffast-math"
|
|
63
|
+
"-ffast-math",
|
|
64
|
+
"-funroll-loops",
|
|
65
|
+
"-fomit-frame-pointer"
|
|
52
66
|
]
|
|
53
67
|
}
|
|
54
68
|
}]
|
|
Binary file
|
package/cisv/cisv_addon.cc
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#include <napi.h>
|
|
2
|
-
#include "
|
|
3
|
-
#include "
|
|
2
|
+
#include "cisv/parser.h"
|
|
3
|
+
#include "cisv/transformer.h"
|
|
4
4
|
#include <vector>
|
|
5
5
|
#include <memory>
|
|
6
6
|
#include <string>
|
|
@@ -9,6 +9,109 @@
|
|
|
9
9
|
|
|
10
10
|
namespace {
|
|
11
11
|
|
|
12
|
+
// =============================================================================
|
|
13
|
+
// SECURITY: UTF-8 validation to prevent V8 crashes on invalid input
|
|
14
|
+
// Invalid UTF-8 data can cause Napi::String::New to throw or crash
|
|
15
|
+
// =============================================================================
|
|
16
|
+
// Reports whether the `len` bytes starting at `data` are well-formed UTF-8.
//
// A sequence is rejected when it is truncated, starts with a continuation or
// otherwise illegal lead byte, is an overlong encoding, encodes a UTF-16
// surrogate (U+D800-U+DFFF), or encodes a code point above U+10FFFF.
// An empty range (len == 0) is reported as valid.
static bool isValidUtf8(const char* data, size_t len) {
    const unsigned char* p = reinterpret_cast<const unsigned char*>(data);
    const unsigned char* const end = p + len;

    while (p != end) {
        const unsigned char lead = *p;

        // Single-byte (ASCII) code point.
        if (lead <= 0x7F) {
            ++p;
            continue;
        }

        // Classify the lead byte: how many continuation bytes follow, and
        // which range the FIRST continuation byte must fall into. Narrowing
        // that range is what excludes overlongs, surrogates and > U+10FFFF.
        size_t tail;
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF) {
            tail = 1;                       // C0/C1 would be overlong
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            tail = 2;
            if (lead == 0xE0) {
                lo = 0xA0;                  // below A0 is overlong
            } else if (lead == 0xED) {
                hi = 0x9F;                  // A0 and above is a surrogate
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            tail = 3;
            if (lead == 0xF0) {
                lo = 0x90;                  // below 90 is overlong
            } else if (lead == 0xF4) {
                hi = 0x8F;                  // 90 and above exceeds U+10FFFF
            }
        } else {
            // 0x80-0xC1 (stray continuation / overlong lead) or 0xF5-0xFF.
            return false;
        }

        // The whole sequence must fit in the remaining input.
        if (static_cast<size_t>(end - p) <= tail) {
            return false;
        }
        if (p[1] < lo || p[1] > hi) {
            return false;
        }
        for (size_t k = 2; k <= tail; ++k) {
            if ((p[k] & 0xC0) != 0x80) {
                return false;
            }
        }

        p += tail + 1;
    }

    return true;
}
|
|
62
|
+
|
|
63
|
+
// Create Napi::String with UTF-8 validation (safe version)
|
|
64
|
+
// Falls back to replacement character representation for invalid UTF-8
|
|
65
|
+
static Napi::String SafeNewString(Napi::Env env, const char* data, size_t len) {
|
|
66
|
+
if (isValidUtf8(data, len)) {
|
|
67
|
+
return Napi::String::New(env, data, len);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
// Invalid UTF-8 - replace invalid bytes with replacement character
|
|
71
|
+
// This prevents V8 crashes while preserving data visibility
|
|
72
|
+
std::string safe_str;
|
|
73
|
+
safe_str.reserve(len);
|
|
74
|
+
|
|
75
|
+
const unsigned char* bytes = reinterpret_cast<const unsigned char*>(data);
|
|
76
|
+
size_t i = 0;
|
|
77
|
+
|
|
78
|
+
while (i < len) {
|
|
79
|
+
unsigned char c = bytes[i];
|
|
80
|
+
|
|
81
|
+
if (c < 0x80) {
|
|
82
|
+
safe_str += static_cast<char>(c);
|
|
83
|
+
i++;
|
|
84
|
+
} else if ((c & 0xE0) == 0xC0 && i + 1 < len &&
|
|
85
|
+
(bytes[i + 1] & 0xC0) == 0x80 && c >= 0xC2) {
|
|
86
|
+
safe_str += static_cast<char>(c);
|
|
87
|
+
safe_str += static_cast<char>(bytes[i + 1]);
|
|
88
|
+
i += 2;
|
|
89
|
+
} else if ((c & 0xF0) == 0xE0 && i + 2 < len &&
|
|
90
|
+
(bytes[i + 1] & 0xC0) == 0x80 &&
|
|
91
|
+
(bytes[i + 2] & 0xC0) == 0x80) {
|
|
92
|
+
safe_str += static_cast<char>(c);
|
|
93
|
+
safe_str += static_cast<char>(bytes[i + 1]);
|
|
94
|
+
safe_str += static_cast<char>(bytes[i + 2]);
|
|
95
|
+
i += 3;
|
|
96
|
+
} else if ((c & 0xF8) == 0xF0 && i + 3 < len &&
|
|
97
|
+
(bytes[i + 1] & 0xC0) == 0x80 &&
|
|
98
|
+
(bytes[i + 2] & 0xC0) == 0x80 &&
|
|
99
|
+
(bytes[i + 3] & 0xC0) == 0x80 && c <= 0xF4) {
|
|
100
|
+
safe_str += static_cast<char>(c);
|
|
101
|
+
safe_str += static_cast<char>(bytes[i + 1]);
|
|
102
|
+
safe_str += static_cast<char>(bytes[i + 2]);
|
|
103
|
+
safe_str += static_cast<char>(bytes[i + 3]);
|
|
104
|
+
i += 4;
|
|
105
|
+
} else {
|
|
106
|
+
// Invalid byte - use UTF-8 replacement character U+FFFD
|
|
107
|
+
safe_str += "\xEF\xBF\xBD";
|
|
108
|
+
i++;
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
return Napi::String::New(env, safe_str);
|
|
113
|
+
}
|
|
114
|
+
|
|
12
115
|
// Extended RowCollector that handles transforms
|
|
13
116
|
struct RowCollector {
|
|
14
117
|
std::vector<std::string> current;
|
|
@@ -34,6 +137,13 @@ struct RowCollector {
|
|
|
34
137
|
cisv_transform_pipeline_destroy(pipeline);
|
|
35
138
|
pipeline = nullptr;
|
|
36
139
|
}
|
|
140
|
+
// SECURITY FIX: Properly release all persistent references to prevent memory leak
|
|
141
|
+
// Napi::Persistent references must be Reset() before being destroyed
|
|
142
|
+
for (auto& pair : js_transforms) {
|
|
143
|
+
if (!pair.second.IsEmpty()) {
|
|
144
|
+
pair.second.Reset(); // Release the persistent handle
|
|
145
|
+
}
|
|
146
|
+
}
|
|
37
147
|
js_transforms.clear();
|
|
38
148
|
rows.clear();
|
|
39
149
|
current.clear();
|
|
@@ -76,7 +186,8 @@ struct RowCollector {
|
|
|
76
186
|
auto it = js_transforms.find(field_index);
|
|
77
187
|
if (it != js_transforms.end() && !it->second.IsEmpty()) {
|
|
78
188
|
try {
|
|
79
|
-
|
|
189
|
+
// SECURITY: Use safe string creation to handle invalid UTF-8
|
|
190
|
+
Napi::String input = SafeNewString(env, result.c_str(), result.length());
|
|
80
191
|
Napi::Number field = Napi::Number::New(env, field_index);
|
|
81
192
|
|
|
82
193
|
Napi::Value js_result = it->second.Call({input, field});
|
|
@@ -84,8 +195,15 @@ struct RowCollector {
|
|
|
84
195
|
if (js_result.IsString()) {
|
|
85
196
|
result = js_result.As<Napi::String>().Utf8Value();
|
|
86
197
|
}
|
|
198
|
+
} catch (const Napi::Error& e) {
|
|
199
|
+
// Keep original result but log the error
|
|
200
|
+
fprintf(stderr, "CISV: JS transform error for field %d: %s\n",
|
|
201
|
+
field_index, e.Message().c_str());
|
|
202
|
+
} catch (const std::exception& e) {
|
|
203
|
+
fprintf(stderr, "CISV: C++ exception in JS transform: %s\n", e.what());
|
|
87
204
|
} catch (...) {
|
|
88
|
-
|
|
205
|
+
fprintf(stderr, "CISV: Unknown exception in JS transform for field %d\n",
|
|
206
|
+
field_index);
|
|
89
207
|
}
|
|
90
208
|
}
|
|
91
209
|
|
|
@@ -93,7 +211,8 @@ struct RowCollector {
|
|
|
93
211
|
auto it_all = js_transforms.find(-1);
|
|
94
212
|
if (it_all != js_transforms.end() && !it_all->second.IsEmpty()) {
|
|
95
213
|
try {
|
|
96
|
-
|
|
214
|
+
// SECURITY: Use safe string creation to handle invalid UTF-8
|
|
215
|
+
Napi::String input = SafeNewString(env, result.c_str(), result.length());
|
|
97
216
|
Napi::Number field = Napi::Number::New(env, field_index);
|
|
98
217
|
|
|
99
218
|
Napi::Value js_result = it_all->second.Call({input, field});
|
|
@@ -101,8 +220,13 @@ struct RowCollector {
|
|
|
101
220
|
if (js_result.IsString()) {
|
|
102
221
|
result = js_result.As<Napi::String>().Utf8Value();
|
|
103
222
|
}
|
|
223
|
+
} catch (const Napi::Error& e) {
|
|
224
|
+
// Keep original result but log the error
|
|
225
|
+
fprintf(stderr, "CISV: JS transform error (all fields): %s\n", e.Message().c_str());
|
|
226
|
+
} catch (const std::exception& e) {
|
|
227
|
+
fprintf(stderr, "CISV: C++ exception in JS transform: %s\n", e.what());
|
|
104
228
|
} catch (...) {
|
|
105
|
-
|
|
229
|
+
fprintf(stderr, "CISV: Unknown exception in JS transform (all fields)\n");
|
|
106
230
|
}
|
|
107
231
|
}
|
|
108
232
|
}
|
|
@@ -114,9 +238,19 @@ struct RowCollector {
|
|
|
114
238
|
static void field_cb(void *user, const char *data, size_t len) {
|
|
115
239
|
auto *rc = reinterpret_cast<RowCollector *>(user);
|
|
116
240
|
|
|
117
|
-
//
|
|
241
|
+
// Fast path: no transforms - avoid unnecessary string copies
|
|
242
|
+
bool has_c_transforms = rc->pipeline && rc->pipeline->count > 0;
|
|
243
|
+
bool has_js_transforms = !rc->js_transforms.empty();
|
|
244
|
+
|
|
245
|
+
if (!has_c_transforms && !has_js_transforms) {
|
|
246
|
+
rc->current.emplace_back(data, len);
|
|
247
|
+
rc->current_field_index++;
|
|
248
|
+
return;
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
// Slow path: apply transforms
|
|
118
252
|
std::string transformed = rc->applyTransforms(data, len, rc->current_field_index);
|
|
119
|
-
rc->current.emplace_back(transformed);
|
|
253
|
+
rc->current.emplace_back(std::move(transformed));
|
|
120
254
|
rc->current_field_index++;
|
|
121
255
|
}
|
|
122
256
|
|
|
@@ -457,15 +591,25 @@ public:
|
|
|
457
591
|
|
|
458
592
|
if (info[0].IsBuffer()) {
|
|
459
593
|
auto buf = info[0].As<Napi::Buffer<uint8_t>>();
|
|
460
|
-
|
|
461
|
-
|
|
594
|
+
size_t buf_len = buf.Length();
|
|
595
|
+
// Check for overflow before adding to total_bytes_
|
|
596
|
+
if (buf_len > SIZE_MAX - total_bytes_) {
|
|
597
|
+
throw Napi::Error::New(env, "Total bytes would overflow");
|
|
598
|
+
}
|
|
599
|
+
cisv_parser_write(parser_, buf.Data(), buf_len);
|
|
600
|
+
total_bytes_ += buf_len;
|
|
462
601
|
return;
|
|
463
602
|
}
|
|
464
603
|
|
|
465
604
|
if (info[0].IsString()) {
|
|
466
605
|
std::string chunk = info[0].As<Napi::String>();
|
|
467
|
-
|
|
468
|
-
|
|
606
|
+
size_t chunk_size = chunk.size();
|
|
607
|
+
// Check for overflow before adding to total_bytes_
|
|
608
|
+
if (chunk_size > SIZE_MAX - total_bytes_) {
|
|
609
|
+
throw Napi::Error::New(env, "Total bytes would overflow");
|
|
610
|
+
}
|
|
611
|
+
cisv_parser_write(parser_, reinterpret_cast<const uint8_t*>(chunk.data()), chunk_size);
|
|
612
|
+
total_bytes_ += chunk_size;
|
|
469
613
|
return;
|
|
470
614
|
}
|
|
471
615
|
|
|
@@ -475,9 +619,10 @@ public:
|
|
|
475
619
|
void End(const Napi::CallbackInfo &info) {
|
|
476
620
|
if (!is_destroyed_) {
|
|
477
621
|
cisv_parser_end(parser_);
|
|
478
|
-
// Clear the environment reference after ending
|
|
479
|
-
|
|
480
|
-
// rc_->
|
|
622
|
+
// Clear the environment reference after ending to prevent stale references
|
|
623
|
+
rc_->env = nullptr;
|
|
624
|
+
// Note: JS transforms stored in rc_->js_transforms remain valid
|
|
625
|
+
// as they are Persistent references managed by the addon lifecycle
|
|
481
626
|
}
|
|
482
627
|
}
|
|
483
628
|
|
|
@@ -554,6 +699,9 @@ public:
|
|
|
554
699
|
if (info.Length() >= 3 && info[2].IsObject()) {
|
|
555
700
|
Napi::Object context_obj = info[2].As<Napi::Object>();
|
|
556
701
|
ctx = (cisv_transform_context_t*)calloc(1, sizeof(cisv_transform_context_t));
|
|
702
|
+
if (!ctx) {
|
|
703
|
+
throw Napi::Error::New(env, "Memory allocation failed for transform context");
|
|
704
|
+
}
|
|
557
705
|
|
|
558
706
|
// Extract context properties if they exist
|
|
559
707
|
if (context_obj.Has("key")) {
|
|
@@ -561,6 +709,10 @@ public:
|
|
|
561
709
|
if (key_val.IsString()) {
|
|
562
710
|
std::string key = key_val.As<Napi::String>();
|
|
563
711
|
ctx->key = strdup(key.c_str());
|
|
712
|
+
if (!ctx->key) {
|
|
713
|
+
free(ctx);
|
|
714
|
+
throw Napi::Error::New(env, "Memory allocation failed for key");
|
|
715
|
+
}
|
|
564
716
|
ctx->key_len = key.length();
|
|
565
717
|
}
|
|
566
718
|
}
|
|
@@ -570,6 +722,11 @@ public:
|
|
|
570
722
|
if (iv_val.IsString()) {
|
|
571
723
|
std::string iv = iv_val.As<Napi::String>();
|
|
572
724
|
ctx->iv = strdup(iv.c_str());
|
|
725
|
+
if (!ctx->iv) {
|
|
726
|
+
if (ctx->key) free((void*)ctx->key);
|
|
727
|
+
free(ctx);
|
|
728
|
+
throw Napi::Error::New(env, "Memory allocation failed for iv");
|
|
729
|
+
}
|
|
573
730
|
ctx->iv_len = iv.length();
|
|
574
731
|
}
|
|
575
732
|
}
|
|
@@ -653,6 +810,9 @@ Napi::Value TransformByName(const Napi::CallbackInfo &info) {
|
|
|
653
810
|
if (info.Length() >= 3 && info[2].IsObject()) {
|
|
654
811
|
Napi::Object context_obj = info[2].As<Napi::Object>();
|
|
655
812
|
ctx = (cisv_transform_context_t*)calloc(1, sizeof(cisv_transform_context_t));
|
|
813
|
+
if (!ctx) {
|
|
814
|
+
throw Napi::Error::New(env, "Memory allocation failed for transform context");
|
|
815
|
+
}
|
|
656
816
|
|
|
657
817
|
// Extract context properties if they exist
|
|
658
818
|
if (context_obj.Has("key")) {
|
|
@@ -660,6 +820,10 @@ Napi::Value TransformByName(const Napi::CallbackInfo &info) {
|
|
|
660
820
|
if (key_val.IsString()) {
|
|
661
821
|
std::string key = key_val.As<Napi::String>();
|
|
662
822
|
ctx->key = strdup(key.c_str());
|
|
823
|
+
if (!ctx->key) {
|
|
824
|
+
free(ctx);
|
|
825
|
+
throw Napi::Error::New(env, "Memory allocation failed for key");
|
|
826
|
+
}
|
|
663
827
|
ctx->key_len = key.length();
|
|
664
828
|
}
|
|
665
829
|
}
|
|
@@ -669,6 +833,11 @@ Napi::Value TransformByName(const Napi::CallbackInfo &info) {
|
|
|
669
833
|
if (iv_val.IsString()) {
|
|
670
834
|
std::string iv = iv_val.As<Napi::String>();
|
|
671
835
|
ctx->iv = strdup(iv.c_str());
|
|
836
|
+
if (!ctx->iv) {
|
|
837
|
+
if (ctx->key) free((void*)ctx->key);
|
|
838
|
+
free(ctx);
|
|
839
|
+
throw Napi::Error::New(env, "Memory allocation failed for iv");
|
|
840
|
+
}
|
|
672
841
|
ctx->iv_len = iv.length();
|
|
673
842
|
}
|
|
674
843
|
}
|
|
@@ -722,14 +891,31 @@ void SetHeaderFields(const Napi::CallbackInfo &info) {
|
|
|
722
891
|
throw Napi::Error::New(env, "Memory allocation failed");
|
|
723
892
|
}
|
|
724
893
|
|
|
894
|
+
// Initialize to NULL for safe cleanup on partial failure
|
|
895
|
+
for (size_t i = 0; i < field_count; i++) {
|
|
896
|
+
c_field_names[i] = nullptr;
|
|
897
|
+
}
|
|
898
|
+
|
|
725
899
|
for (size_t i = 0; i < field_count; i++) {
|
|
726
900
|
Napi::Value field_val = field_names[i];
|
|
727
901
|
if (!field_val.IsString()) {
|
|
902
|
+
// Clean up all previously allocated strings
|
|
903
|
+
for (size_t j = 0; j < i; j++) {
|
|
904
|
+
if (c_field_names[j]) free((void*)c_field_names[j]);
|
|
905
|
+
}
|
|
728
906
|
free(c_field_names);
|
|
729
907
|
throw Napi::TypeError::New(env, "Field names must be strings");
|
|
730
908
|
}
|
|
731
909
|
std::string field_str = field_val.As<Napi::String>();
|
|
732
910
|
c_field_names[i] = strdup(field_str.c_str());
|
|
911
|
+
if (!c_field_names[i]) {
|
|
912
|
+
// Clean up all previously allocated strings
|
|
913
|
+
for (size_t j = 0; j < i; j++) {
|
|
914
|
+
if (c_field_names[j]) free((void*)c_field_names[j]);
|
|
915
|
+
}
|
|
916
|
+
free(c_field_names);
|
|
917
|
+
throw Napi::Error::New(env, "Memory allocation failed for field name");
|
|
918
|
+
}
|
|
733
919
|
}
|
|
734
920
|
|
|
735
921
|
// Ensure pipeline exists
|
|
@@ -981,7 +1167,9 @@ private:
|
|
|
981
1167
|
for (size_t i = 0; i < rc_->rows.size(); ++i) {
|
|
982
1168
|
Napi::Array row = Napi::Array::New(env, rc_->rows[i].size());
|
|
983
1169
|
for (size_t j = 0; j < rc_->rows[i].size(); ++j) {
|
|
984
|
-
|
|
1170
|
+
// SECURITY: Use safe string creation to handle invalid UTF-8 in CSV data
|
|
1171
|
+
const std::string& field = rc_->rows[i][j];
|
|
1172
|
+
row[j] = SafeNewString(env, field.c_str(), field.length());
|
|
985
1173
|
}
|
|
986
1174
|
rows[i] = row;
|
|
987
1175
|
}
|