cisv 0.2.5 → 0.3.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,298 +1,154 @@
1
- # CISV
1
+ # CISV Node.js Binding
2
2
 
3
- ![License](https://img.shields.io/badge/license-MIT-blue)
4
- ![Build](https://img.shields.io/badge/build-passing-brightgreen)
5
- ![Size](https://deno.bundlejs.com/badge?q=spring-easing)
6
- ![Downloads](https://badgen.net/npm/dw/cisv)
3
+ Native Node-API binding for the CISV C core.
7
4
 
8
- ## INSTALLATION
5
+ ## Install
9
6
 
10
- ### NODE.JS PACKAGE
11
7
  ```bash
12
8
  npm install cisv
13
9
  ```
14
10
 
15
- ### BUILD FROM SOURCE (NODE.JS ADDON)
11
+ From source in this repository:
12
+
16
13
  ```bash
17
- npm install -g node-gyp
14
+ cd bindings/nodejs
15
+ npm ci
16
+ npm run build
17
+ npm test
18
18
  ```
19
19
 
20
- ## QUICK START
20
+ ## Quick Start
21
21
 
22
- ### NODE.JS
23
- ```javascript
22
+ ```js
24
23
  const { cisvParser } = require('cisv');
25
24
 
26
- // Basic usage
27
- const parser = new cisvParser();
28
- const rows = parser.parseSync('./data.csv');
29
-
30
- // With configuration (optional)
31
- const tsv_parser = new cisvParser({
32
- delimiter: '\t',
33
- quote: "'",
34
- trim: true
35
- });
36
- const tsv_rows = tsv_parser.parseSync('./data.tsv');
37
- ```
38
-
39
-
40
- ## CONFIGURATION OPTIONS
41
-
42
- ### Parser Configuration
43
-
44
- ```javascript
45
- const parser = new cisvParser({
46
- // Field delimiter character (default: ',')
47
- delimiter: ',',
48
-
49
- // Quote character (default: '"')
50
- quote: '"',
51
-
52
- // Escape character (null for RFC4180 "" style, default: null)
53
- escape: null,
54
-
55
- // Comment character to skip lines (default: null)
56
- comment: '#',
57
-
58
- // Trim whitespace from fields (default: false)
59
- trim: true,
60
-
61
- // Skip empty lines (default: false)
62
- skipEmptyLines: true,
63
-
64
- // Use relaxed parsing rules (default: false)
65
- relaxed: false,
66
-
67
- // Skip lines with parse errors (default: false)
68
- skipLinesWithError: true,
69
-
70
- // Maximum row size in bytes (0 = unlimited, default: 0)
71
- maxRowSize: 1048576,
72
-
73
- // Start parsing from line N (1-based, default: 1)
74
- fromLine: 10,
75
-
76
- // Stop parsing at line N (0 = until end, default: 0)
77
- toLine: 1000
78
- });
79
- ```
80
-
81
- ### Dynamic Configuration
82
-
83
- ```javascript
84
- // Set configuration after creation
85
- parser.setConfig({
86
- delimiter: ';',
87
- quote: "'",
88
- trim: true
89
- });
90
-
91
- // Get current configuration
92
- const config = parser.getConfig();
93
- console.log(config);
94
- ```
95
-
96
- ## API REFERENCE
97
-
98
- ### TYPESCRIPT DEFINITIONS
99
- ```typescript
100
- interface CisvConfig {
101
- delimiter?: string;
102
- quote?: string;
103
- escape?: string | null;
104
- comment?: string | null;
105
- trim?: boolean;
106
- skipEmptyLines?: boolean;
107
- relaxed?: boolean;
108
- skipLinesWithError?: boolean;
109
- maxRowSize?: number;
110
- fromLine?: number;
111
- toLine?: number;
112
- }
113
-
114
- interface ParsedRow extends Array<string> {}
115
-
116
- interface ParseStats {
117
- rowCount: number;
118
- fieldCount: number;
119
- totalBytes: number;
120
- parseTime: number;
121
- currentLine: number;
122
- }
123
-
124
- interface TransformInfo {
125
- cTransformCount: number;
126
- jsTransformCount: number;
127
- fieldIndices: number[];
128
- }
129
-
130
- class cisvParser {
131
- constructor(config?: CisvConfig);
132
- parseSync(path: string): ParsedRow[];
133
- parse(path: string): Promise<ParsedRow[]>;
134
- parseString(csv: string): ParsedRow[];
135
- write(chunk: string | Buffer): void;
136
- end(): void;
137
- getRows(): ParsedRow[];
138
- clear(): void;
139
- setConfig(config: CisvConfig): void;
140
- getConfig(): CisvConfig;
141
- transform(fieldIndex: number, type: string | Function): this;
142
- removeTransform(fieldIndex: number): this;
143
- clearTransforms(): this;
144
- getStats(): ParseStats;
145
- getTransformInfo(): TransformInfo;
146
- destroy(): void;
147
-
148
- static countRows(path: string): number;
149
- static countRowsWithConfig(path: string, config?: CisvConfig): number;
150
- }
151
- ```
152
-
153
- ### BASIC PARSING
154
-
155
- ```javascript
156
- import { cisvParser } from "cisv";
157
-
158
- // Default configuration (standard CSV)
159
- const parser = new cisvParser();
25
+ const parser = new cisvParser({ delimiter: ',', trim: true });
160
26
  const rows = parser.parseSync('data.csv');
161
27
 
162
- // Custom configuration (TSV with single quotes)
163
- const tsvParser = new cisvParser({
164
- delimiter: '\t',
165
- quote: "'"
166
- });
167
- const tsvRows = tsvParser.parseSync('data.tsv');
168
-
169
- // Parse specific line range
170
- const rangeParser = new cisvParser({
171
- fromLine: 100,
172
- toLine: 1000
173
- });
174
- const subset = rangeParser.parseSync('large.csv');
175
-
176
- // Skip comments and empty lines
177
- const cleanParser = new cisvParser({
178
- comment: '#',
179
- skipEmptyLines: true,
180
- trim: true
181
- });
182
- const cleanData = cleanParser.parseSync('config.csv');
28
+ console.log(rows.length);
29
+ console.log(rows[0]);
183
30
  ```
184
31
 
185
- ### STREAMING
186
-
187
- ```javascript
188
- import { cisvParser } from "cisv";
189
- import fs from 'fs';
190
-
191
- const streamParser = new cisvParser({
192
- delimiter: ',',
193
- trim: true
194
- });
195
-
196
- const stream = fs.createReadStream('huge-file.csv');
197
-
198
- stream.on('data', chunk => streamParser.write(chunk));
199
- stream.on('end', () => {
200
- streamParser.end();
201
- const results = streamParser.getRows();
202
- console.log(`Parsed ${results.length} rows`);
203
- });
204
- ```
205
-
206
- ### DATA TRANSFORMATION
207
-
208
- ```javascript
209
- const parser = new cisvParser();
210
-
211
- // Built-in C transforms (optimized)
212
- parser
213
- .transform(0, 'uppercase') // Column 0 to uppercase
214
- .transform(1, 'lowercase') // Column 1 to lowercase
215
- .transform(2, 'trim') // Column 2 trim whitespace
216
- .transform(3, 'to_int') // Column 3 to integer
217
- .transform(4, 'to_float') // Column 4 to float
218
- .transform(5, 'base64_encode') // Column 5 to base64
219
- .transform(6, 'hash_sha256'); // Column 6 to SHA256
220
-
221
- // Custom fieldname transform :
222
- parser
223
- .transform('name', 'uppercase');
224
-
225
- // Custom row transform :
226
- parser
227
- .transformRow((row, rowObj) => {console.log(row}});
228
-
229
- // Custom JavaScript transforms
230
- parser.transform(7, value => new Date(value).toISOString());
231
-
232
- // Apply to all fields
233
- parser.transform(-1, value => value.replace(/[^\w\s]/gi, ''));
32
+ ## Parser API
33
+
34
+ ### Constructor options
35
+
36
+ - `delimiter?: string` (first character used)
37
+ - `quote?: string` (first character used)
38
+ - `escape?: string | null` (`null` means RFC4180 doubled quote escaping)
39
+ - `comment?: string | null`
40
+ - `trim?: boolean`
41
+ - `skipEmptyLines?: boolean`
42
+ - `relaxed?: boolean`
43
+ - `skipLinesWithError?: boolean`
44
+ - `maxRowSize?: number`
45
+ - `fromLine?: number`
46
+ - `toLine?: number`
47
+
48
+ ### Instance methods
49
+
50
+ - `parseSync(path: string): string[][]`
51
+ - `parse(path: string): Promise<string[][]>`
52
+ - `parseString(csv: string): string[][]`
53
+ - `write(chunk: Buffer | string): void`
54
+ - `end(): void`
55
+ - `getRows(): string[][]`
56
+ - `clear(): void`
57
+ - `setConfig(config): this`
58
+ - `getConfig(): object`
59
+ - `transform(fieldIndex: number, kindOrFn: string | Function, context?): this`
60
+ - `transformByName(fieldName: string, kindOrFn: string | Function, context?): this`
61
+ - `setHeaderFields(fields: string[]): void`
62
+ - `removeTransform(fieldIndex: number): this`
63
+ - `removeTransformByName(fieldName: string): this`
64
+ - `clearTransforms(): this`
65
+ - `getTransformInfo(): { cTransformCount: number, jsTransformCount: number, fieldIndices: number[] }`
66
+ - `getStats(): { rowCount: number, fieldCount: number, totalBytes: number, parseTime: number, currentLine: number }`
67
+ - `openIterator(path: string): this`
68
+ - `fetchRow(): string[] | null`
69
+ - `closeIterator(): this`
70
+ - `destroy(): void`
71
+
72
+ ### Static methods
73
+
74
+ - `cisvParser.countRows(path: string): number`
75
+ - `cisvParser.countRowsWithConfig(path: string, config?): number`
76
+
77
+ ## Transform Types
78
+
79
+ Built-in transform names:
80
+
81
+ - `uppercase`
82
+ - `lowercase`
83
+ - `trim`
84
+ - `to_int` (or `int`)
85
+ - `to_float` (or `float`)
86
+ - `hash_sha256` (or `sha256`)
87
+ - `base64_encode` (or `base64`)
88
+
89
+ ## Examples
90
+
91
+ ### Async parse
92
+
93
+ ```js
94
+ const { cisvParser } = require('cisv');
234
95
 
235
- const transformed = parser.parseSync('data.csv');
96
+ (async () => {
97
+ const parser = new cisvParser();
98
+ const rows = await parser.parse('data.csv');
99
+ console.log(rows.length);
100
+ })();
236
101
  ```
237
102
 
238
- ### ROW COUNTING
103
+ ### Streaming chunks
239
104
 
240
- ```javascript
241
- import { cisvParser } from "cisv";
105
+ ```js
106
+ const fs = require('fs');
107
+ const { cisvParser } = require('cisv');
242
108
 
243
- // Fast row counting without parsing
244
- const count = cisvParser.countRows('large.csv');
109
+ const parser = new cisvParser();
110
+ for (const chunk of [
111
+ Buffer.from('id,name\n1,'),
112
+ Buffer.from('john\n2,jane\n')
113
+ ]) {
114
+ parser.write(chunk);
115
+ }
116
+ parser.end();
245
117
 
246
- // Count with specific configuration
247
- const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
248
- delimiter: '\t',
249
- skipEmptyLines: true,
250
- fromLine: 10,
251
- toLine: 1000
252
- });
118
+ console.log(parser.getRows());
253
119
  ```
254
120
 
255
- ### ROW-BY-ROW ITERATION
121
+ ### Iterator mode (low memory)
256
122
 
257
- The iterator API provides fgetcsv-style streaming with minimal memory footprint and early exit support.
258
-
259
- ```javascript
260
- import { cisvParser } from "cisv";
261
-
262
- const parser = new cisvParser({ delimiter: ',', trim: true });
123
+ ```js
124
+ const { cisvParser } = require('cisv');
263
125
 
264
- // Open iterator for a file
265
- parser.openIterator('/path/to/large.csv');
126
+ const parser = new cisvParser({ delimiter: ',' });
127
+ parser.openIterator('large.csv');
266
128
 
267
- // Fetch rows one at a time
268
129
  let row;
269
130
  while ((row = parser.fetchRow()) !== null) {
270
- console.log(row); // string[]
271
-
272
- // Early exit - no wasted work
273
- if (row[0] === 'stop') {
274
- break;
275
- }
131
+ if (row[0] === 'stop') break;
276
132
  }
277
133
 
278
- // Close iterator when done
279
134
  parser.closeIterator();
280
-
281
- // Methods support chaining
282
- parser.openIterator('data.csv')
283
- .closeIterator();
284
135
  ```
285
136
 
286
- **Iterator Methods:**
137
+ ### Name-based transforms
138
+
139
+ ```js
140
+ const { cisvParser } = require('cisv');
141
+
142
+ const parser = new cisvParser();
143
+ parser.setHeaderFields(['id', 'name', 'email']);
144
+ parser.transformByName('name', 'uppercase');
145
+
146
+ const rows = parser.parseString('id,name,email\n1,john,john@test.com');
147
+ console.log(rows[1][1]); // JOHN
148
+ ```
287
149
 
288
- | Method | Description |
289
- |--------|-------------|
290
- | `openIterator(path)` | Open a file for row-by-row iteration |
291
- | `fetchRow()` | Get next row as `string[]`, or `null` if at EOF |
292
- | `closeIterator()` | Close iterator and release resources |
150
+ ## Notes
293
151
 
294
- **Notes:**
295
- - The iterator uses the parser's current configuration (delimiter, quote, trim, etc.)
296
- - Calling `destroy()` automatically closes any open iterator
297
- - Only one iterator can be open at a time per parser instance
298
- - Breaking out of iteration and calling `closeIterator()` stops parsing immediately
152
+ - Returned rows include the header row when the input has one.
153
+ - `removeTransform*` currently removes JavaScript transforms; C-transform removal by index/name is not fully implemented yet.
154
+ - `parse()` runs in a worker thread when no transforms are attached; when transforms are attached, it keeps the existing behavior for compatibility: the file is parsed synchronously and the returned promise is resolved (or rejected) with the result.
Binary file
@@ -6,6 +6,7 @@
6
6
  #include <string>
7
7
  #include <unordered_map>
8
8
  #include <chrono>
9
+ #include <cstdint>
9
10
 
10
11
  namespace {
11
12
 
@@ -60,11 +61,77 @@ static bool isValidUtf8(const char* data, size_t len) {
60
61
  return true;
61
62
  }
62
63
 
64
+ // Fast path for common ASCII-only CSV data.
65
+ static inline bool isAllAscii(const char* data, size_t len) {
66
+ const unsigned char* bytes = reinterpret_cast<const unsigned char*>(data);
67
+ size_t i = 0;
68
+
69
+ // Check machine-word chunks first.
70
+ const size_t word_size = sizeof(uintptr_t);
71
+ const uintptr_t high_mask = sizeof(uintptr_t) == 8
72
+ ? static_cast<uintptr_t>(0x8080808080808080ULL)
73
+ : static_cast<uintptr_t>(0x80808080UL);
74
+
75
+ while (i + word_size <= len) {
76
+ uintptr_t word;
77
+ memcpy(&word, bytes + i, word_size);
78
+ if (word & high_mask) {
79
+ return false;
80
+ }
81
+ i += word_size;
82
+ }
83
+
84
+ while (i < len) {
85
+ if (bytes[i] & 0x80) {
86
+ return false;
87
+ }
88
+ i++;
89
+ }
90
+ return true;
91
+ }
92
+
63
93
  // Create Napi::String with UTF-8 validation (safe version)
64
94
  // Falls back to replacement character representation for invalid UTF-8
65
- static Napi::String SafeNewString(Napi::Env env, const char* data, size_t len) {
95
+ static napi_value SafeNewStringValue(napi_env env, const char* data, size_t len) {
96
+ // Short fields are extremely common in CSV; avoid heavier ASCII/UTF-8 scans.
97
+ if (len <= 32) {
98
+ bool ascii = true;
99
+ for (size_t i = 0; i < len; i++) {
100
+ if (static_cast<unsigned char>(data[i]) & 0x80) {
101
+ ascii = false;
102
+ break;
103
+ }
104
+ }
105
+
106
+ napi_value short_value = nullptr;
107
+ if (ascii) {
108
+ if (napi_create_string_latin1(env, data, len, &short_value) == napi_ok && short_value) {
109
+ return short_value;
110
+ }
111
+ } else {
112
+ if (napi_create_string_utf8(env, data, len, &short_value) == napi_ok && short_value) {
113
+ return short_value;
114
+ }
115
+ }
116
+ }
117
+
118
+ // Fastest path: ASCII-only data is valid Latin-1.
119
+ // Using Latin-1 creation avoids UTF-8 decoding overhead.
120
+ if (isAllAscii(data, len)) {
121
+ napi_value latin1_value = nullptr;
122
+ if (napi_create_string_latin1(env, data, len, &latin1_value) == napi_ok && latin1_value) {
123
+ return latin1_value;
124
+ }
125
+ // Fallback to UTF-8 path if Latin-1 creation fails unexpectedly.
126
+ napi_value utf8_value = nullptr;
127
+ napi_create_string_utf8(env, data, len, &utf8_value);
128
+ return utf8_value;
129
+ }
130
+
66
131
  if (isValidUtf8(data, len)) {
67
- return Napi::String::New(env, data, len);
132
+ napi_value utf8_value = nullptr;
133
+ napi_create_string_utf8(env, data, len, &utf8_value);
134
+ return utf8_value;
68
135
  }
69
136
 
70
137
  // Invalid UTF-8 - replace invalid bytes with replacement character
@@ -109,7 +176,13 @@ static Napi::String SafeNewString(Napi::Env env, const char* data, size_t len) {
109
176
  }
110
177
  }
111
178
 
112
- return Napi::String::New(env, safe_str);
179
+ napi_value safe_value = nullptr;
180
+ napi_create_string_utf8(env, safe_str.c_str(), safe_str.length(), &safe_value);
181
+ return safe_value;
182
+ }
183
+
184
+ static Napi::String SafeNewString(Napi::Env env, const char* data, size_t len) {
185
+ return Napi::String(env, SafeNewStringValue(env, data, len));
113
186
  }
114
187
 
115
188
  // Extended RowCollector that handles transforms
@@ -266,6 +339,76 @@ static void error_cb(void *user, int line, const char *msg) {
266
339
  fprintf(stderr, "CSV Parse Error at line %d: %s\n", line, msg);
267
340
  }
268
341
 
342
+ class ParseFileWorker final : public Napi::AsyncWorker {
343
+ public:
344
+ ParseFileWorker(
345
+ Napi::Env env,
346
+ std::string path,
347
+ cisv_config config,
348
+ Napi::Promise::Deferred deferred
349
+ ) : Napi::AsyncWorker(env),
350
+ path_(std::move(path)),
351
+ config_(config),
352
+ deferred_(deferred) {}
353
+
354
+ void Execute() override {
355
+ cisv_result_t *result = cisv_parse_file_batch(path_.c_str(), &config_);
356
+ if (!result) {
357
+ SetError("parse error: " + std::string(strerror(errno)));
358
+ return;
359
+ }
360
+
361
+ if (result->error_code != 0) {
362
+ std::string msg = result->error_message[0] ? result->error_message : "parse error";
363
+ if (msg.rfind("parse error", 0) != 0) {
364
+ msg = "parse error: " + msg;
365
+ }
366
+ SetError(msg);
367
+ cisv_result_free(result);
368
+ return;
369
+ }
370
+
371
+ rows_.reserve(result->row_count);
372
+ for (size_t i = 0; i < result->row_count; i++) {
373
+ cisv_row_t *row = &result->rows[i];
374
+ std::vector<std::string> out_row;
375
+ out_row.reserve(row->field_count);
376
+ for (size_t j = 0; j < row->field_count; j++) {
377
+ out_row.emplace_back(row->fields[j], row->field_lengths[j]);
378
+ }
379
+ rows_.emplace_back(std::move(out_row));
380
+ }
381
+
382
+ cisv_result_free(result);
383
+ }
384
+
385
+ void OnOK() override {
386
+ Napi::Env env = Env();
387
+ Napi::Array out = Napi::Array::New(env, rows_.size());
388
+
389
+ for (size_t i = 0; i < rows_.size(); i++) {
390
+ Napi::Array row = Napi::Array::New(env, rows_[i].size());
391
+ for (size_t j = 0; j < rows_[i].size(); j++) {
392
+ const std::string &field = rows_[i][j];
393
+ row[j] = SafeNewString(env, field.c_str(), field.length());
394
+ }
395
+ out[i] = row;
396
+ }
397
+
398
+ deferred_.Resolve(out);
399
+ }
400
+
401
+ void OnError(const Napi::Error &e) override {
402
+ deferred_.Reject(e.Value());
403
+ }
404
+
405
+ private:
406
+ std::string path_;
407
+ cisv_config config_;
408
+ Napi::Promise::Deferred deferred_;
409
+ std::vector<std::vector<std::string>> rows_;
410
+ };
411
+
269
412
  } // namespace
270
413
 
271
414
  class CisvParser : public Napi::ObjectWrap<CisvParser> {
@@ -310,6 +453,8 @@ public:
310
453
  total_bytes_ = 0;
311
454
  is_destroyed_ = false;
312
455
  iterator_ = nullptr;
456
+ batch_result_ = nullptr;
457
+ stream_buffering_active_ = true;
313
458
 
314
459
  // Initialize configuration with defaults
315
460
  cisv_config_init(&config_);
@@ -503,6 +648,7 @@ public:
503
648
  delete rc_;
504
649
  rc_ = nullptr;
505
650
  }
651
+ clearBatchResult();
506
652
  is_destroyed_ = true;
507
653
  }
508
654
  }
@@ -528,26 +674,35 @@ public:
528
674
 
529
675
  auto start = std::chrono::high_resolution_clock::now();
530
676
 
531
- // Clear previous data
532
- rc_->rows.clear();
533
- rc_->current.clear();
534
- rc_->current_field_index = 0;
677
+ resetRowState();
535
678
 
536
- // Set environment for JS transforms
537
- rc_->env = env;
538
-
539
- int result = cisv_parser_parse_file(parser_, path.c_str());
679
+ int result = 0;
680
+ if (!hasTransforms()) {
681
+ cisv_result_t *batch = cisv_parse_file_batch(path.c_str(), &config_);
682
+ if (!batch) {
683
+ throw Napi::Error::New(env, "parse error: " + std::string(strerror(errno)));
684
+ }
685
+ if (batch->error_code != 0) {
686
+ std::string msg = batch->error_message[0] ? batch->error_message : "parse error";
687
+ cisv_result_free(batch);
688
+ throw Napi::Error::New(env, msg);
689
+ }
690
+ clearBatchResult();
691
+ batch_result_ = batch;
692
+ } else {
693
+ // Set environment for JS transforms
694
+ rc_->env = env;
695
+ result = cisv_parser_parse_file(parser_, path.c_str());
696
+ // Clear the environment reference after parsing
697
+ rc_->env = nullptr;
698
+ if (result < 0) {
699
+ throw Napi::Error::New(env, "parse error: " + std::to_string(result));
700
+ }
701
+ }
540
702
 
541
703
  auto end = std::chrono::high_resolution_clock::now();
542
704
  parse_time_ = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
543
705
 
544
- // Clear the environment reference after parsing
545
- rc_->env = nullptr;
546
-
547
- if (result < 0) {
548
- throw Napi::Error::New(env, "parse error: " + std::to_string(result));
549
- }
550
-
551
706
  return drainRows(env);
552
707
  }
553
708
 
@@ -565,22 +720,33 @@ public:
565
720
 
566
721
  std::string content = info[0].As<Napi::String>();
567
722
 
568
- // Clear previous data
569
- rc_->rows.clear();
570
- rc_->current.clear();
571
- rc_->current_field_index = 0;
723
+ resetRowState();
572
724
 
573
- // Set environment for JS transforms
574
- rc_->env = env;
725
+ if (!hasTransforms()) {
726
+ cisv_result_t *batch = cisv_parse_string_batch(content.c_str(), content.length(), &config_);
727
+ if (!batch) {
728
+ throw Napi::Error::New(env, "parse error: " + std::string(strerror(errno)));
729
+ }
730
+ if (batch->error_code != 0) {
731
+ std::string msg = batch->error_message[0] ? batch->error_message : "parse error";
732
+ cisv_result_free(batch);
733
+ throw Napi::Error::New(env, msg);
734
+ }
735
+ clearBatchResult();
736
+ batch_result_ = batch;
737
+ } else {
738
+ // Set environment for JS transforms
739
+ rc_->env = env;
575
740
 
576
- // Write the string content as chunks
577
- cisv_parser_write(parser_, (const uint8_t*)content.c_str(), content.length());
578
- cisv_parser_end(parser_);
741
+ // Write the string content as chunks
742
+ cisv_parser_write(parser_, (const uint8_t*)content.c_str(), content.length());
743
+ cisv_parser_end(parser_);
579
744
 
580
- total_bytes_ = content.length();
745
+ // Clear the environment reference after parsing
746
+ rc_->env = nullptr;
747
+ }
581
748
 
582
- // Clear the environment reference after parsing
583
- rc_->env = nullptr;
749
+ total_bytes_ = content.length();
584
750
 
585
751
  return drainRows(env);
586
752
  }
@@ -597,44 +763,95 @@ public:
597
763
  throw Napi::TypeError::New(env, "Expected one argument");
598
764
  }
599
765
 
766
+ // Streaming writes produce row-callback data, not batch results.
767
+ clearBatchResult();
768
+
600
769
  // Set environment for JS transforms
601
770
  rc_->env = env;
602
771
 
772
+ const uint8_t* chunk_data = nullptr;
773
+ size_t chunk_size = 0;
774
+ std::string chunk_storage;
775
+
603
776
  if (info[0].IsBuffer()) {
604
777
  auto buf = info[0].As<Napi::Buffer<uint8_t>>();
605
- size_t buf_len = buf.Length();
606
- // Check for overflow before adding to total_bytes_
607
- if (buf_len > SIZE_MAX - total_bytes_) {
608
- throw Napi::Error::New(env, "Total bytes would overflow");
609
- }
610
- cisv_parser_write(parser_, buf.Data(), buf_len);
611
- total_bytes_ += buf_len;
612
- return;
778
+ chunk_data = buf.Data();
779
+ chunk_size = buf.Length();
780
+ } else if (info[0].IsString()) {
781
+ chunk_storage = info[0].As<Napi::String>();
782
+ chunk_data = reinterpret_cast<const uint8_t*>(chunk_storage.data());
783
+ chunk_size = chunk_storage.size();
784
+ } else {
785
+ throw Napi::TypeError::New(env, "Expected Buffer or String");
786
+ }
787
+
788
+ // Check for overflow before adding to total_bytes_
789
+ if (chunk_size > SIZE_MAX - total_bytes_) {
790
+ throw Napi::Error::New(env, "Total bytes would overflow");
613
791
  }
614
792
 
615
- if (info[0].IsString()) {
616
- std::string chunk = info[0].As<Napi::String>();
617
- size_t chunk_size = chunk.size();
618
- // Check for overflow before adding to total_bytes_
619
- if (chunk_size > SIZE_MAX - total_bytes_) {
620
- throw Napi::Error::New(env, "Total bytes would overflow");
793
+ // Fast streaming mode:
794
+ // Buffer chunks when no transforms/iterator are active and batch-parse on end().
795
+ // If buffered payload exceeds threshold, flush once to parser and continue streaming.
796
+ if (!hasTransforms() && iterator_ == nullptr) {
797
+ if (chunk_size > SIZE_MAX - pending_stream_.size()) {
798
+ throw Napi::Error::New(env, "Buffered stream size would overflow");
621
799
  }
622
- cisv_parser_write(parser_, reinterpret_cast<const uint8_t*>(chunk.data()), chunk_size);
623
- total_bytes_ += chunk_size;
624
- return;
800
+
801
+ if (stream_buffering_active_) {
802
+ pending_stream_.append(reinterpret_cast<const char*>(chunk_data), chunk_size);
803
+ total_bytes_ += chunk_size;
804
+
805
+ if (pending_stream_.size() > kStreamBufferLimitBytes) {
806
+ flushPendingStreamToParser();
807
+ stream_buffering_active_ = false;
808
+ }
809
+ return;
810
+ }
811
+ } else if (!pending_stream_.empty()) {
812
+ flushPendingStreamToParser();
813
+ stream_buffering_active_ = false;
625
814
  }
626
815
 
627
- throw Napi::TypeError::New(env, "Expected Buffer or String");
816
+ cisv_parser_write(parser_, chunk_data, chunk_size);
817
+ total_bytes_ += chunk_size;
628
818
  }
629
819
 
630
820
  void End(const Napi::CallbackInfo &info) {
631
- if (!is_destroyed_) {
632
- cisv_parser_end(parser_);
633
- // Clear the environment reference after ending to prevent stale references
821
+ if (is_destroyed_) {
822
+ return;
823
+ }
824
+
825
+ if (stream_buffering_active_ && !pending_stream_.empty() &&
826
+ !hasTransforms() && iterator_ == nullptr &&
827
+ rc_ && rc_->rows.empty() && rc_->current.empty()) {
828
+ cisv_result_t *batch = cisv_parse_string_batch(
829
+ pending_stream_.data(), pending_stream_.size(), &config_);
830
+ if (!batch) {
831
+ throw Napi::Error::New(info.Env(), "parse error: " + std::string(strerror(errno)));
832
+ }
833
+ if (batch->error_code != 0) {
834
+ std::string msg = batch->error_message[0] ? batch->error_message : "parse error";
835
+ cisv_result_free(batch);
836
+ throw Napi::Error::New(info.Env(), msg);
837
+ }
838
+ clearBatchResult();
839
+ batch_result_ = batch;
840
+ pending_stream_.clear();
634
841
  rc_->env = nullptr;
635
- // Note: JS transforms stored in rc_->js_transforms remain valid
636
- // as they are Persistent references managed by the addon lifecycle
842
+ return;
637
843
  }
844
+
845
+ if (!pending_stream_.empty()) {
846
+ flushPendingStreamToParser();
847
+ stream_buffering_active_ = false;
848
+ }
849
+
850
+ cisv_parser_end(parser_);
851
+ // Clear the environment reference after ending to prevent stale references
852
+ rc_->env = nullptr;
853
+ // Note: JS transforms stored in rc_->js_transforms remain valid
854
+ // as they are Persistent references managed by the addon lifecycle
638
855
  }
639
856
 
640
857
  Napi::Value GetRows(const Napi::CallbackInfo &info) {
@@ -642,16 +859,23 @@ public:
642
859
  Napi::Env env = info.Env();
643
860
  throw Napi::Error::New(env, "Parser has been destroyed");
644
861
  }
862
+ if (!pending_stream_.empty()) {
863
+ flushPendingStreamToParser();
864
+ stream_buffering_active_ = false;
865
+ }
645
866
  return drainRows(info.Env());
646
867
  }
647
868
 
648
869
  void Clear(const Napi::CallbackInfo &info) {
649
870
  if (!is_destroyed_ && rc_) {
871
+ clearBatchResult();
650
872
  rc_->rows.clear();
651
873
  rc_->current.clear();
652
874
  rc_->current_field_index = 0;
653
875
  total_bytes_ = 0;
654
876
  parse_time_ = 0;
877
+ pending_stream_.clear();
878
+ stream_buffering_active_ = true;
655
879
  // Also clear the environment reference
656
880
  rc_->env = nullptr;
657
881
  }
@@ -870,11 +1094,26 @@ Napi::Value TransformByName(const Napi::CallbackInfo &info) {
870
1094
  // Handle JavaScript function transforms by name
871
1095
  Napi::Function func = info[1].As<Napi::Function>();
872
1096
 
873
- // Add to the C transform pipeline by name
874
- if (cisv_transform_pipeline_add_js_by_name(rc_->pipeline, field_name.c_str(), &func) < 0) {
875
- throw Napi::Error::New(env, "Failed to add JS transform for field: " + field_name);
1097
+ if (!rc_->pipeline || !rc_->pipeline->header_fields) {
1098
+ throw Napi::Error::New(env,
1099
+ "Header fields are not set. Call setHeaderFields([...]) before transformByName(..., fn).");
876
1100
  }
877
1101
 
1102
+ int field_index = -1;
1103
+ for (size_t i = 0; i < rc_->pipeline->header_count; i++) {
1104
+ if (strcmp(rc_->pipeline->header_fields[i], field_name.c_str()) == 0) {
1105
+ field_index = static_cast<int>(i);
1106
+ break;
1107
+ }
1108
+ }
1109
+
1110
+ if (field_index < 0) {
1111
+ throw Napi::Error::New(env, "Unknown field name: " + field_name);
1112
+ }
1113
+
1114
+ // Store callback in the same map used by applyTransforms().
1115
+ rc_->js_transforms[field_index] = Napi::Persistent(func);
1116
+
878
1117
  } else {
879
1118
  throw Napi::TypeError::New(env, "Transform must be a string type or function");
880
1119
  }
@@ -1008,6 +1247,11 @@ Napi::Value RemoveTransformByName(const Napi::CallbackInfo &info) {
1008
1247
  }
1009
1248
 
1010
1249
  // Clear JavaScript transforms
1250
+ for (auto &pair : rc_->js_transforms) {
1251
+ if (!pair.second.IsEmpty()) {
1252
+ pair.second.Reset();
1253
+ }
1254
+ }
1011
1255
  rc_->js_transforms.clear();
1012
1256
 
1013
1257
  // Clear C transforms - destroy and DON'T recreate pipeline yet
@@ -1033,18 +1277,32 @@ Napi::Value RemoveTransformByName(const Napi::CallbackInfo &info) {
1033
1277
 
1034
1278
  std::string path = info[0].As<Napi::String>();
1035
1279
 
1036
- // Create a promise
1037
1280
  auto deferred = Napi::Promise::Deferred::New(env);
1038
1281
 
1039
- // For simplicity, we'll use sync parsing here
1040
- // FIXME: In production, this should use worker threads
1041
- try {
1042
- Napi::Value result = ParseSync(info);
1043
- deferred.Resolve(result);
1044
- } catch (const Napi::Error& e) {
1045
- deferred.Reject(e.Value());
1282
+ // Preserve behavior for transform-enabled parsers (native + JS transforms)
1283
+ // until async transform execution is implemented.
1284
+ bool has_c_transforms = rc_ && rc_->pipeline && rc_->pipeline->count > 0;
1285
+ bool has_js_transforms = rc_ && !rc_->js_transforms.empty();
1286
+ if (has_c_transforms || has_js_transforms) {
1287
+ try {
1288
+ Napi::Value result = ParseSync(info);
1289
+ deferred.Resolve(result);
1290
+ } catch (const Napi::Error &e) {
1291
+ deferred.Reject(e.Value());
1292
+ }
1293
+ return deferred.Promise();
1046
1294
  }
1047
1295
 
1296
+ // Use batch parser in a worker thread to avoid blocking the event loop.
1297
+ cisv_config worker_config = config_;
1298
+ worker_config.field_cb = nullptr;
1299
+ worker_config.row_cb = nullptr;
1300
+ worker_config.error_cb = nullptr;
1301
+ worker_config.user = nullptr;
1302
+
1303
+ auto *worker = new ParseFileWorker(env, path, worker_config, deferred);
1304
+ worker->Queue();
1305
+
1048
1306
  return deferred.Promise();
1049
1307
  }
1050
1308
 
@@ -1090,10 +1348,22 @@ Napi::Value RemoveTransformByName(const Napi::CallbackInfo &info) {
1090
1348
  }
1091
1349
 
1092
1350
  Napi::Object stats = Napi::Object::New(env);
1351
+ size_t row_count = 0;
1352
+ size_t field_count = 0;
1353
+ if (batch_result_) {
1354
+ row_count = batch_result_->row_count;
1355
+ if (batch_result_->row_count > 0) {
1356
+ field_count = batch_result_->rows[0].field_count;
1357
+ }
1358
+ } else if (rc_) {
1359
+ row_count = rc_->rows.size();
1360
+ if (!rc_->rows.empty()) {
1361
+ field_count = rc_->rows[0].size();
1362
+ }
1363
+ }
1093
1364
 
1094
- stats.Set("rowCount", Napi::Number::New(env, rc_ ? rc_->rows.size() : 0));
1095
- stats.Set("fieldCount", Napi::Number::New(env,
1096
- (rc_ && !rc_->rows.empty()) ? rc_->rows[0].size() : 0));
1365
+ stats.Set("rowCount", Napi::Number::New(env, row_count));
1366
+ stats.Set("fieldCount", Napi::Number::New(env, field_count));
1097
1367
  stats.Set("totalBytes", Napi::Number::New(env, total_bytes_));
1098
1368
  stats.Set("parseTime", Napi::Number::New(env, parse_time_));
1099
1369
  stats.Set("currentLine", Napi::Number::New(env,
@@ -1233,14 +1503,13 @@ Napi::Value RemoveTransformByName(const Napi::CallbackInfo &info) {
1233
1503
  throw Napi::Error::New(env, "Error reading CSV row");
1234
1504
  }
1235
1505
 
1236
- // Create array of strings for the row
1237
- Napi::Array row = Napi::Array::New(env, field_count);
1506
+ napi_value row;
1507
+ napi_create_array_with_length(env, field_count, &row);
1238
1508
  for (size_t i = 0; i < field_count; i++) {
1239
- // SECURITY: Use safe string creation to handle invalid UTF-8
1240
- row.Set(i, SafeNewString(env, fields[i], lengths[i]));
1509
+ napi_set_element(env, row, i, SafeNewStringValue(env, fields[i], lengths[i]));
1241
1510
  }
1242
1511
 
1243
- return row;
1512
+ return Napi::Value(env, row);
1244
1513
  }
1245
1514
 
1246
1515
  /**
@@ -1263,27 +1532,94 @@ Napi::Value RemoveTransformByName(const Napi::CallbackInfo &info) {
1263
1532
  }
1264
1533
 
1265
1534
  private:
1535
+ void clearBatchResult() {
1536
+ if (batch_result_) {
1537
+ cisv_result_free(batch_result_);
1538
+ batch_result_ = nullptr;
1539
+ }
1540
+ }
1541
+
1542
+ bool hasTransforms() const {
1543
+ bool has_c_transforms = rc_ && rc_->pipeline && rc_->pipeline->count > 0;
1544
+ bool has_js_transforms = rc_ && !rc_->js_transforms.empty();
1545
+ return has_c_transforms || has_js_transforms;
1546
+ }
1547
+
1548
+ void resetRowState() {
1549
+ clearBatchResult();
1550
+ pending_stream_.clear();
1551
+ stream_buffering_active_ = true;
1552
+ if (!rc_) return;
1553
+ rc_->rows.clear();
1554
+ rc_->current.clear();
1555
+ rc_->current_field_index = 0;
1556
+ }
1557
+
1558
+ void flushPendingStreamToParser() {
1559
+ if (pending_stream_.empty()) {
1560
+ return;
1561
+ }
1562
+ cisv_parser_write(
1563
+ parser_,
1564
+ reinterpret_cast<const uint8_t*>(pending_stream_.data()),
1565
+ pending_stream_.size());
1566
+ pending_stream_.clear();
1567
+ }
1568
+
1569
+ void loadRowsFromBatch(const cisv_result_t *result) {
1570
+ if (!rc_ || !result) return;
1571
+ rc_->rows.clear();
1572
+ rc_->rows.reserve(result->row_count);
1573
+
1574
+ for (size_t i = 0; i < result->row_count; i++) {
1575
+ const cisv_row_t *row = &result->rows[i];
1576
+ std::vector<std::string> out_row;
1577
+ out_row.reserve(row->field_count);
1578
+ for (size_t j = 0; j < row->field_count; j++) {
1579
+ out_row.emplace_back(row->fields[j], row->field_lengths[j]);
1580
+ }
1581
+ rc_->rows.emplace_back(std::move(out_row));
1582
+ }
1583
+ }
1584
+
1266
1585
  Napi::Value drainRows(Napi::Env env) {
1586
+ if (batch_result_) {
1587
+ napi_value rows;
1588
+ napi_create_array_with_length(env, batch_result_->row_count, &rows);
1589
+ for (size_t i = 0; i < batch_result_->row_count; ++i) {
1590
+ const cisv_row_t *src_row = &batch_result_->rows[i];
1591
+ napi_value row;
1592
+ napi_create_array_with_length(env, src_row->field_count, &row);
1593
+ for (size_t j = 0; j < src_row->field_count; ++j) {
1594
+ napi_set_element(env, row, j, SafeNewStringValue(env, src_row->fields[j], src_row->field_lengths[j]));
1595
+ }
1596
+ napi_set_element(env, rows, i, row);
1597
+ }
1598
+ return Napi::Value(env, rows);
1599
+ }
1600
+
1267
1601
  if (!rc_) {
1268
1602
  return Napi::Array::New(env, 0);
1269
1603
  }
1270
1604
 
1271
- Napi::Array rows = Napi::Array::New(env, rc_->rows.size());
1605
+ napi_value rows;
1606
+ napi_create_array_with_length(env, rc_->rows.size(), &rows);
1272
1607
 
1273
1608
  for (size_t i = 0; i < rc_->rows.size(); ++i) {
1274
- Napi::Array row = Napi::Array::New(env, rc_->rows[i].size());
1609
+ napi_value row;
1610
+ napi_create_array_with_length(env, rc_->rows[i].size(), &row);
1275
1611
  for (size_t j = 0; j < rc_->rows[i].size(); ++j) {
1276
1612
  // SECURITY: Use safe string creation to handle invalid UTF-8 in CSV data
1277
1613
  const std::string& field = rc_->rows[i][j];
1278
- row[j] = SafeNewString(env, field.c_str(), field.length());
1614
+ napi_set_element(env, row, j, SafeNewStringValue(env, field.c_str(), field.length()));
1279
1615
  }
1280
- rows[i] = row;
1616
+ napi_set_element(env, rows, i, row);
1281
1617
  }
1282
1618
 
1283
1619
  // Don't clear here if we want to keep data for multiple reads
1284
1620
  // rc_->rows.clear();
1285
1621
 
1286
- return rows;
1622
+ return Napi::Value(env, rows);
1287
1623
  }
1288
1624
 
1289
1625
  cisv_parser *parser_;
@@ -1293,6 +1629,10 @@ private:
1293
1629
  double parse_time_;
1294
1630
  bool is_destroyed_;
1295
1631
  cisv_iterator_t *iterator_; // For row-by-row iteration
1632
+ cisv_result_t *batch_result_;
1633
+ std::string pending_stream_;
1634
+ bool stream_buffering_active_;
1635
+ static constexpr size_t kStreamBufferLimitBytes = 8 * 1024 * 1024;
1296
1636
  };
1297
1637
 
1298
1638
  // Initialize all exports
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cisv",
3
- "version": "0.2.5",
3
+ "version": "0.3.2",
4
4
  "description": "The csv parser on steroids.",
5
5
  "author": "sanix<s4nixd@gmail.com>",
6
6
  "main": "./build/Release/cisv.node",