cisv 0.0.40 → 0.0.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -13
- package/cisv/cisv_addon.cc +6 -1
- package/cisv/cisv_parser.c +29 -15
- package/cisv/cisv_transformer.c +4 -1
- package/cisv/index.js +4 -0
- package/cisv/index.mjs +12 -0
- package/package.json +11 -4
package/README.md
CHANGED
|
@@ -41,41 +41,41 @@ $ docker run --rm \
|
|
|
41
41
|
## BENCHMARKS
|
|
42
42
|
|
|
43
43
|
Benchmark comparison with existing popular tools,
|
|
44
|
-
cf pipeline you can check : (https://github.com/Sanix-Darker/cisv/actions/runs/
|
|
44
|
+
cf. the "Publish to npm" step of this CI pipeline run: https://github.com/Sanix-Darker/cisv/actions/runs/17697547058/job/50298916576
|
|
45
45
|
|
|
46
46
|
### SYNCHRONOUS RESULTS
|
|
47
47
|
|
|
48
48
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
49
49
|
|--------------------|--------------|---------------|----------------|
|
|
50
|
-
| cisv (sync) |
|
|
51
|
-
| csv-parse (sync) |
|
|
52
|
-
| papaparse (sync) |
|
|
50
|
+
| cisv (sync) | 45.58 | 0.01 | 98543 |
|
|
51
|
+
| csv-parse (sync) | 18.11 | 0.03 | 39155 |
|
|
52
|
+
| papaparse (sync) | 28.03 | 0.02 | 60596 |
|
|
53
53
|
|
|
54
54
|
### SYNCHRONOUS RESULTS (WITH DATA ACCESS)
|
|
55
55
|
|
|
56
56
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
57
57
|
|--------------------|--------------|---------------|----------------|
|
|
58
|
-
| cisv (sync) |
|
|
59
|
-
| csv-parse (sync) |
|
|
60
|
-
| papaparse (sync) |
|
|
58
|
+
| cisv (sync) | 46.80 | 0.01 | 101185 |
|
|
59
|
+
| csv-parse (sync) | 18.92 | 0.02 | 40900 |
|
|
60
|
+
| papaparse (sync) | 28.38 | 0.02 | 61363 |
|
|
61
61
|
|
|
62
62
|
|
|
63
63
|
### ASYNCHRONOUS RESULTS
|
|
64
64
|
|
|
65
65
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
66
66
|
|--------------------------|--------------|---------------|----------------|
|
|
67
|
-
| cisv (async/stream) |
|
|
68
|
-
| papaparse (async/stream) |
|
|
69
|
-
| neat-csv (async/promise) | 9.
|
|
67
|
+
| cisv (async/stream) | 70.07 | 0.01 | 151485 |
|
|
68
|
+
| papaparse (async/stream) | 21.58 | 0.02 | 46646 |
|
|
69
|
+
| neat-csv (async/promise) | 9.77 | 0.05 | 21126 |
|
|
70
70
|
|
|
71
71
|
|
|
72
72
|
### ASYNCHRONOUS RESULTS (WITH DATA ACCESS)
|
|
73
73
|
|
|
74
74
|
| Library | Speed (MB/s) | Avg Time (ms) | Operations/sec |
|
|
75
75
|
|--------------------------|--------------|---------------|----------------|
|
|
76
|
-
| cisv (async/stream) |
|
|
77
|
-
| papaparse (async/stream) |
|
|
78
|
-
| neat-csv (async/promise) | 9.
|
|
76
|
+
| cisv (async/stream) | 25.23 | 0.02 | 54545 |
|
|
77
|
+
| papaparse (async/stream) | 22.49 | 0.02 | 48622 |
|
|
78
|
+
| neat-csv (async/promise) | 9.91 | 0.05 | 21428 |
|
|
79
79
|
|
|
80
80
|
## INSTALLATION
|
|
81
81
|
|
package/cisv/cisv_addon.cc
CHANGED
|
@@ -174,6 +174,8 @@ public:
|
|
|
174
174
|
// Initialize configuration with defaults
|
|
175
175
|
cisv_config_init(&config_);
|
|
176
176
|
|
|
177
|
+
config_.max_row_size = 0;
|
|
178
|
+
|
|
177
179
|
// Handle constructor options if provided
|
|
178
180
|
if (info.Length() > 0 && info[0].IsObject()) {
|
|
179
181
|
Napi::Object options = info[0].As<Napi::Object>();
|
|
@@ -261,7 +263,10 @@ public:
|
|
|
261
263
|
|
|
262
264
|
// Numeric options
|
|
263
265
|
if (options.Has("maxRowSize")) {
|
|
264
|
-
|
|
266
|
+
Napi::Value val = options.Get("maxRowSize");
|
|
267
|
+
if (!val.IsNull() && !val.IsUndefined()) {
|
|
268
|
+
config_.max_row_size = val.As<Napi::Number>().Uint32Value();
|
|
269
|
+
}
|
|
265
270
|
}
|
|
266
271
|
|
|
267
272
|
if (options.Has("fromLine")) {
|
package/cisv/cisv_parser.c
CHANGED
|
@@ -6,7 +6,6 @@
|
|
|
6
6
|
#include <errno.h>
|
|
7
7
|
#include <time.h>
|
|
8
8
|
#include <stdbool.h>
|
|
9
|
-
#include <ctype.h>
|
|
10
9
|
// NOTE: Windows is not supported for now; too many issues
|
|
11
10
|
#include <sys/mman.h>
|
|
12
11
|
#include <fcntl.h>
|
|
@@ -14,7 +13,6 @@
|
|
|
14
13
|
#include <getopt.h>
|
|
15
14
|
#include <sys/time.h>
|
|
16
15
|
#include "cisv_parser.h"
|
|
17
|
-
#include "cisv_simd.h"
|
|
18
16
|
|
|
19
17
|
#ifdef __AVX512F__
|
|
20
18
|
#include <immintrin.h>
|
|
@@ -24,8 +22,8 @@
|
|
|
24
22
|
#include <immintrin.h>
|
|
25
23
|
#endif
|
|
26
24
|
|
|
27
|
-
#define RINGBUF_SIZE (
|
|
28
|
-
|
|
25
|
+
#define RINGBUF_SIZE (256 * 1024)
|
|
26
|
+
#define DIRECT_PARSE_THRESHOLD (64 * 1024) // Parse directly if chunk > 64KB
|
|
29
27
|
#define PREFETCH_DISTANCE 256
|
|
30
28
|
|
|
31
29
|
struct cisv_parser {
|
|
@@ -494,6 +492,8 @@ static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t
|
|
|
494
492
|
// Handle newline
|
|
495
493
|
if (is_newline) {
|
|
496
494
|
yield_row(parser);
|
|
495
|
+
parser->current_row_size = 0;
|
|
496
|
+
parser->row_start = special_pos + 1;
|
|
497
497
|
}
|
|
498
498
|
|
|
499
499
|
// Update state branchlessly
|
|
@@ -786,6 +786,7 @@ static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t
|
|
|
786
786
|
if (action & ACT_ROW) {
|
|
787
787
|
yield_row(parser);
|
|
788
788
|
parser->current_row_size = 0;
|
|
789
|
+
parser->row_start = cur + 1;
|
|
789
790
|
}
|
|
790
791
|
|
|
791
792
|
cur += 1 - ((action & ACT_REPROCESS) >> 2);
|
|
@@ -808,6 +809,7 @@ static int parse_memory(cisv_parser *parser, const uint8_t *buffer, size_t len)
|
|
|
808
809
|
// Yield final row if there's content
|
|
809
810
|
if (parser->field_start > parser->row_start || !parser->skip_empty_lines) {
|
|
810
811
|
yield_row(parser);
|
|
812
|
+
parser->current_row_size = 0;
|
|
811
813
|
}
|
|
812
814
|
}
|
|
813
815
|
return 0;
|
|
@@ -1025,24 +1027,36 @@ int cisv_parser_parse_file(cisv_parser *parser, const char *path) {
|
|
|
1025
1027
|
}
|
|
1026
1028
|
|
|
1027
1029
|
int cisv_parser_write(cisv_parser *parser, const uint8_t *chunk, size_t len) {
|
|
1028
|
-
if (!parser || !chunk
|
|
1030
|
+
if (!parser || !chunk) return -EINVAL;
|
|
1029
1031
|
|
|
1030
|
-
//
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
parse_memory(parser,
|
|
1034
|
-
|
|
1032
|
+
// For large chunks, bypass ring buffer entirely
|
|
1033
|
+
if (len > DIRECT_PARSE_THRESHOLD) {
|
|
1034
|
+
// Parse directly - this is actually FASTER for large data
|
|
1035
|
+
return parse_memory(parser, chunk, len);
|
|
1036
|
+
}
|
|
1037
|
+
|
|
1038
|
+
// Small chunks use ring buffer for efficiency
|
|
1039
|
+
if (parser->head + len > RINGBUF_SIZE) {
|
|
1040
|
+
// Flush current buffer
|
|
1041
|
+
if (parser->head > 0) {
|
|
1042
|
+
parse_memory(parser, parser->ring, parser->head);
|
|
1043
|
+
parser->head = 0;
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
// If still too large, parse directly
|
|
1047
|
+
if (len > RINGBUF_SIZE) {
|
|
1048
|
+
return parse_memory(parser, chunk, len);
|
|
1049
|
+
}
|
|
1035
1050
|
}
|
|
1036
1051
|
|
|
1037
1052
|
memcpy(parser->ring + parser->head, chunk, len);
|
|
1038
1053
|
parser->head += len;
|
|
1039
1054
|
|
|
1040
|
-
//
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
if (has_newline | threshold) {
|
|
1044
|
-
parse_memory(parser, parser->ring, parser->head);
|
|
1055
|
+
// Process on newline or when buffer is getting full
|
|
1056
|
+
if (memchr(chunk, '\n', len) || parser->head > (RINGBUF_SIZE * 3 / 4)) {
|
|
1057
|
+
int result = parse_memory(parser, parser->ring, parser->head);
|
|
1045
1058
|
parser->head = 0;
|
|
1059
|
+
return result;
|
|
1046
1060
|
}
|
|
1047
1061
|
return 0;
|
|
1048
1062
|
}
|
package/cisv/cisv_transformer.c
CHANGED
|
@@ -5,12 +5,15 @@
|
|
|
5
5
|
#include <ctype.h>
|
|
6
6
|
#include <stdio.h>
|
|
7
7
|
|
|
8
|
+
#ifdef __AVX512F__
|
|
9
|
+
#include <immintrin.h>
|
|
10
|
+
#endif
|
|
11
|
+
|
|
8
12
|
#ifdef __AVX2__
|
|
9
13
|
#include <immintrin.h>
|
|
10
14
|
#endif
|
|
11
15
|
|
|
12
16
|
#define TRANSFORM_POOL_SIZE (1 << 20) // 1MB default pool
|
|
13
|
-
// #define TRANSFORM_POOL_SIZE (1 << 16) // 64kb (for memory safe reasons)
|
|
14
17
|
#define SIMD_ALIGNMENT 64
|
|
15
18
|
|
|
16
19
|
// Create transform pipeline
|
package/cisv/index.js
ADDED
package/cisv/index.mjs
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import { createRequire } from 'module';
|
|
2
|
+
import { fileURLToPath } from 'url';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
|
|
5
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
6
|
+
const require = createRequire(import.meta.url);
|
|
7
|
+
|
|
8
|
+
const gyp = require('node-gyp-build');
|
|
9
|
+
const addon = gyp(path.join(__dirname, '..'));
|
|
10
|
+
|
|
11
|
+
export const cisvParser = addon.cisvParser;
|
|
12
|
+
export default addon;
|
package/package.json
CHANGED
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "cisv",
|
|
3
|
-
"version": "0.0.
|
|
4
|
-
"description": "The
|
|
3
|
+
"version": "0.0.42",
|
|
4
|
+
"description": "The csv parser on steroids.",
|
|
5
5
|
"author": "sanix<s4nixd@gmail.com>",
|
|
6
6
|
"main": "./build/Release/cisv.node",
|
|
7
7
|
"types": "./types/cisv.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"import": "./cisv/index.mjs",
|
|
11
|
+
"require": "./cisv/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
8
14
|
"scripts": {
|
|
9
15
|
"install": "node-gyp rebuild",
|
|
10
16
|
"build": "node-gyp rebuild",
|
|
11
17
|
"test": "mocha ./tests/*.test.js && bash ./test_transform.sh",
|
|
12
|
-
"test:build": "npm run test",
|
|
18
|
+
"test:build": "npm run build && npm run test",
|
|
13
19
|
"benchmark": "node benchmark/benchmark.js",
|
|
14
20
|
"lint": "clang-format -i cisv/*.{cc,h}",
|
|
15
21
|
"prepublishOnly": "npm run benchmark",
|
|
@@ -20,7 +26,8 @@
|
|
|
20
26
|
"test:perf": "node test/performance.test.js"
|
|
21
27
|
},
|
|
22
28
|
"dependencies": {
|
|
23
|
-
"node-addon-api": "^5.0.0"
|
|
29
|
+
"node-addon-api": "^5.0.0",
|
|
30
|
+
"node-gyp-build": "^4.8.4"
|
|
24
31
|
},
|
|
25
32
|
"devDependencies": {
|
|
26
33
|
"@types/mocha": "^10.0.10",
|