cisv 0.0.25 → 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +248 -67
- package/cisv/cisv_addon.cc +228 -9
- package/cisv/cisv_parser.c +564 -148
- package/cisv/cisv_parser.h +36 -0
- package/cisv/cisv_simd.h +46 -45
- package/cisv/cisv_transformer.c +3 -2
- package/index.d.ts +74 -10
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|

|
|
6
6
|

|
|
7
7
|
|
|
8
|
-
High-performance CSV parser and writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool.
|
|
8
|
+
High-performance CSV parser and writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and a standalone CLI tool, with extensive configuration options.
|
|
9
9
|
|
|
10
10
|
## PERFORMANCE
|
|
11
11
|
|
|
@@ -13,6 +13,7 @@ High-performance CSV parser and writer leveraging SIMD instructions and zero-cop
|
|
|
13
13
|
- **10-100x faster** than popular CSV parsers
|
|
14
14
|
- Zero-copy memory-mapped I/O with kernel optimizations
|
|
15
15
|
- SIMD accelerated with AVX-512/AVX2 auto-detection
|
|
16
|
+
- Dynamic lookup tables for configurable parsing
|
|
16
17
|
|
|
17
18
|
## INSTALLATION
|
|
18
19
|
|
|
@@ -41,39 +42,144 @@ make build
|
|
|
41
42
|
```javascript
|
|
42
43
|
const { cisvParser } = require('cisv');
|
|
43
44
|
|
|
45
|
+
// Basic usage
|
|
44
46
|
const parser = new cisvParser();
|
|
45
47
|
const rows = parser.parseSync('./data.csv');
|
|
46
|
-
|
|
48
|
+
|
|
49
|
+
// With configuration (optional)
|
|
50
|
+
const tsv_parser = new cisvParser({
|
|
51
|
+
delimiter: '\t',
|
|
52
|
+
quote: "'",
|
|
53
|
+
trim: true
|
|
54
|
+
});
|
|
55
|
+
const tsv_rows = tsv_parser.parseSync('./data.tsv');
|
|
47
56
|
```
|
|
48
57
|
|
|
49
58
|
### CLI
|
|
50
|
-
|
|
51
59
|
```bash
|
|
52
|
-
#
|
|
53
|
-
cisv
|
|
60
|
+
# Basic parsing
|
|
61
|
+
cisv data.csv
|
|
62
|
+
|
|
63
|
+
# Parse TSV file
|
|
64
|
+
cisv -d $'\t' data.tsv
|
|
65
|
+
|
|
66
|
+
# Parse with custom quote and trim
|
|
67
|
+
cisv -q "'" -t data.csv
|
|
68
|
+
|
|
69
|
+
# Skip comment lines
|
|
70
|
+
cisv -m '#' config.csv
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## CONFIGURATION OPTIONS
|
|
74
|
+
|
|
75
|
+
### Parser Configuration
|
|
76
|
+
|
|
77
|
+
```javascript
|
|
78
|
+
const parser = new cisvParser({
|
|
79
|
+
// Field delimiter character (default: ',')
|
|
80
|
+
delimiter: ',',
|
|
81
|
+
|
|
82
|
+
// Quote character (default: '"')
|
|
83
|
+
quote: '"',
|
|
84
|
+
|
|
85
|
+
// Escape character (null = RFC 4180 style, where a quote inside a quoted field is written as ""; default: null)
|
|
86
|
+
escape: null,
|
|
87
|
+
|
|
88
|
+
// Comment character to skip lines (default: null)
|
|
89
|
+
comment: '#',
|
|
90
|
+
|
|
91
|
+
// Trim whitespace from fields (default: false)
|
|
92
|
+
trim: true,
|
|
93
|
+
|
|
94
|
+
// Skip empty lines (default: false)
|
|
95
|
+
skipEmptyLines: true,
|
|
96
|
+
|
|
97
|
+
// Use relaxed parsing rules (default: false)
|
|
98
|
+
relaxed: false,
|
|
54
99
|
|
|
55
|
-
|
|
56
|
-
|
|
100
|
+
// Skip lines with parse errors (default: false)
|
|
101
|
+
skipLinesWithError: true,
|
|
57
102
|
|
|
58
|
-
|
|
59
|
-
|
|
103
|
+
// Maximum row size in bytes (0 = unlimited, default: 0)
|
|
104
|
+
maxRowSize: 1048576,
|
|
105
|
+
|
|
106
|
+
// Start parsing from line N (1-based, default: 1)
|
|
107
|
+
fromLine: 10,
|
|
108
|
+
|
|
109
|
+
// Stop parsing at line N (0 = until end, default: 0)
|
|
110
|
+
toLine: 1000
|
|
111
|
+
});
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### Dynamic Configuration
|
|
115
|
+
|
|
116
|
+
```javascript
|
|
117
|
+
// Set configuration after creation
|
|
118
|
+
parser.setConfig({
|
|
119
|
+
delimiter: ';',
|
|
120
|
+
quote: "'",
|
|
121
|
+
trim: true
|
|
122
|
+
});
|
|
123
|
+
|
|
124
|
+
// Get current configuration
|
|
125
|
+
const config = parser.getConfig();
|
|
126
|
+
console.log(config);
|
|
60
127
|
```
|
|
61
128
|
|
|
62
129
|
## API REFERENCE
|
|
63
130
|
|
|
64
131
|
### TYPESCRIPT DEFINITIONS
|
|
65
132
|
```typescript
|
|
133
|
+
interface CisvConfig {
|
|
134
|
+
delimiter?: string;
|
|
135
|
+
quote?: string;
|
|
136
|
+
escape?: string | null;
|
|
137
|
+
comment?: string | null;
|
|
138
|
+
trim?: boolean;
|
|
139
|
+
skipEmptyLines?: boolean;
|
|
140
|
+
relaxed?: boolean;
|
|
141
|
+
skipLinesWithError?: boolean;
|
|
142
|
+
maxRowSize?: number;
|
|
143
|
+
fromLine?: number;
|
|
144
|
+
toLine?: number;
|
|
145
|
+
}
|
|
146
|
+
|
|
66
147
|
interface ParsedRow extends Array<string> {}
|
|
148
|
+
|
|
67
149
|
interface ParseStats {
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
150
|
+
rowCount: number;
|
|
151
|
+
fieldCount: number;
|
|
152
|
+
totalBytes: number;
|
|
153
|
+
parseTime: number;
|
|
154
|
+
currentLine: number;
|
|
72
155
|
}
|
|
156
|
+
|
|
73
157
|
interface TransformInfo {
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
158
|
+
cTransformCount: number;
|
|
159
|
+
jsTransformCount: number;
|
|
160
|
+
fieldIndices: number[];
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
class cisvParser {
|
|
164
|
+
constructor(config?: CisvConfig);
|
|
165
|
+
parseSync(path: string): ParsedRow[];
|
|
166
|
+
parse(path: string): Promise<ParsedRow[]>;
|
|
167
|
+
parseString(csv: string): ParsedRow[];
|
|
168
|
+
write(chunk: string | Buffer): void;
|
|
169
|
+
end(): void;
|
|
170
|
+
getRows(): ParsedRow[];
|
|
171
|
+
clear(): void;
|
|
172
|
+
setConfig(config: CisvConfig): void;
|
|
173
|
+
getConfig(): CisvConfig;
|
|
174
|
+
transform(fieldIndex: number, type: string | Function): this;
|
|
175
|
+
removeTransform(fieldIndex: number): this;
|
|
176
|
+
clearTransforms(): this;
|
|
177
|
+
getStats(): ParseStats;
|
|
178
|
+
getTransformInfo(): TransformInfo;
|
|
179
|
+
destroy(): void;
|
|
180
|
+
|
|
181
|
+
static countRows(path: string): number;
|
|
182
|
+
static countRowsWithConfig(path: string, config?: CisvConfig): number;
|
|
77
183
|
}
|
|
78
184
|
```
|
|
79
185
|
|
|
@@ -82,38 +188,60 @@ interface TransformInfo {
|
|
|
82
188
|
```javascript
|
|
83
189
|
import { cisvParser } from "cisv";
|
|
84
190
|
|
|
191
|
+
// Default configuration (standard CSV)
|
|
85
192
|
const parser = new cisvParser();
|
|
86
|
-
|
|
87
|
-
// Synchronous
|
|
88
193
|
const rows = parser.parseSync('data.csv');
|
|
89
194
|
|
|
90
|
-
//
|
|
91
|
-
const
|
|
195
|
+
// Custom configuration (TSV with single quotes)
|
|
196
|
+
const tsvParser = new cisvParser({
|
|
197
|
+
delimiter: '\t',
|
|
198
|
+
quote: "'"
|
|
199
|
+
});
|
|
200
|
+
const tsvRows = tsvParser.parseSync('data.tsv');
|
|
201
|
+
|
|
202
|
+
// Parse specific line range
|
|
203
|
+
const rangeParser = new cisvParser({
|
|
204
|
+
fromLine: 100,
|
|
205
|
+
toLine: 1000
|
|
206
|
+
});
|
|
207
|
+
const subset = rangeParser.parseSync('large.csv');
|
|
92
208
|
|
|
93
|
-
//
|
|
94
|
-
const
|
|
95
|
-
|
|
209
|
+
// Skip comments and empty lines
|
|
210
|
+
const cleanParser = new cisvParser({
|
|
211
|
+
comment: '#',
|
|
212
|
+
skipEmptyLines: true,
|
|
213
|
+
trim: true
|
|
214
|
+
});
|
|
215
|
+
const cleanData = cleanParser.parseSync('config.csv');
|
|
96
216
|
```
|
|
97
217
|
|
|
98
218
|
### STREAMING
|
|
99
219
|
|
|
100
220
|
```javascript
|
|
101
221
|
import { cisvParser } from "cisv";
|
|
222
|
+
import fs from 'fs';
|
|
223
|
+
|
|
224
|
+
const streamParser = new cisvParser({
|
|
225
|
+
delimiter: ',',
|
|
226
|
+
trim: true
|
|
227
|
+
});
|
|
102
228
|
|
|
103
|
-
const streamParser = new cisvParser();
|
|
104
229
|
const stream = fs.createReadStream('huge-file.csv');
|
|
105
230
|
|
|
106
231
|
stream.on('data', chunk => streamParser.write(chunk));
|
|
107
232
|
stream.on('end', () => {
|
|
108
233
|
streamParser.end();
|
|
109
234
|
const results = streamParser.getRows();
|
|
235
|
+
console.log(`Parsed ${results.length} rows`);
|
|
110
236
|
});
|
|
111
237
|
```
|
|
112
238
|
|
|
113
239
|
### DATA TRANSFORMATION
|
|
114
240
|
|
|
115
|
-
Built-in C transforms (optimized):
|
|
116
241
|
```javascript
|
|
242
|
+
const parser = new cisvParser();
|
|
243
|
+
|
|
244
|
+
// Built-in C transforms (optimized)
|
|
117
245
|
parser
|
|
118
246
|
.transform(0, 'uppercase') // Column 0 to uppercase
|
|
119
247
|
.transform(1, 'lowercase') // Column 1 to lowercase
|
|
@@ -122,39 +250,92 @@ parser
|
|
|
122
250
|
.transform(4, 'to_float') // Column 4 to float
|
|
123
251
|
.transform(5, 'base64_encode') // Column 5 to base64
|
|
124
252
|
.transform(6, 'hash_sha256'); // Column 6 to SHA256
|
|
125
|
-
```
|
|
126
253
|
|
|
127
|
-
Custom JavaScript transforms
|
|
128
|
-
```javascript
|
|
129
|
-
// Single field
|
|
254
|
+
// Custom JavaScript transforms
|
|
130
255
|
parser.transform(7, value => new Date(value).toISOString());
|
|
131
256
|
|
|
132
|
-
//
|
|
257
|
+
// Apply to all fields
|
|
133
258
|
parser.transform(-1, value => value.replace(/[^\w\s]/gi, ''));
|
|
134
259
|
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
260
|
+
const transformed = parser.parseSync('data.csv');
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
### ROW COUNTING
|
|
264
|
+
|
|
265
|
+
```javascript
|
|
266
|
+
import { cisvParser } from "cisv";
|
|
267
|
+
|
|
268
|
+
// Fast row counting without parsing
|
|
269
|
+
const count = cisvParser.countRows('large.csv');
|
|
270
|
+
|
|
271
|
+
// Count with specific configuration
|
|
272
|
+
const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
|
|
273
|
+
delimiter: '\t',
|
|
274
|
+
skipEmptyLines: true,
|
|
275
|
+
fromLine: 10,
|
|
276
|
+
toLine: 1000
|
|
277
|
+
});
|
|
140
278
|
```
|
|
141
279
|
|
|
142
280
|
## CLI USAGE
|
|
143
281
|
|
|
144
|
-
### PARSING
|
|
282
|
+
### PARSING OPTIONS
|
|
283
|
+
|
|
145
284
|
```bash
|
|
146
285
|
cisv [OPTIONS] [FILE]
|
|
147
286
|
|
|
148
|
-
Options:
|
|
287
|
+
General Options:
|
|
149
288
|
-h, --help Show help message
|
|
150
289
|
-v, --version Show version
|
|
290
|
+
-o, --output FILE Write to FILE instead of stdout
|
|
291
|
+
-b, --benchmark Run benchmark mode
|
|
292
|
+
|
|
293
|
+
Configuration Options:
|
|
151
294
|
-d, --delimiter DELIM Field delimiter (default: ,)
|
|
295
|
+
-q, --quote CHAR Quote character (default: ")
|
|
296
|
+
-e, --escape CHAR Escape character (default: RFC4180 style)
|
|
297
|
+
-m, --comment CHAR Comment character (default: none)
|
|
298
|
+
-t, --trim Trim whitespace from fields
|
|
299
|
+
-r, --relaxed Use relaxed parsing rules
|
|
300
|
+
--skip-empty Skip empty lines
|
|
301
|
+
--skip-errors Skip lines with parse errors
|
|
302
|
+
--max-row SIZE Maximum row size in bytes
|
|
303
|
+
--from-line N Start from line N (1-based)
|
|
304
|
+
--to-line N Stop at line N
|
|
305
|
+
|
|
306
|
+
Processing Options:
|
|
152
307
|
-s, --select COLS Select columns (comma-separated indices)
|
|
153
|
-
-c, --count
|
|
154
|
-
--head N
|
|
155
|
-
--tail N
|
|
156
|
-
|
|
157
|
-
|
|
308
|
+
-c, --count Show only row count
|
|
309
|
+
--head N Show first N rows
|
|
310
|
+
--tail N Show last N rows
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
### EXAMPLES
|
|
314
|
+
|
|
315
|
+
```bash
|
|
316
|
+
# Parse TSV file
|
|
317
|
+
cisv -d $'\t' data.tsv
|
|
318
|
+
|
|
319
|
+
# Parse CSV with semicolon delimiter and single quotes
|
|
320
|
+
cisv -d ';' -q "'" european.csv
|
|
321
|
+
|
|
322
|
+
# Skip comment lines starting with #
|
|
323
|
+
cisv -m '#' config.csv
|
|
324
|
+
|
|
325
|
+
# Trim whitespace and skip empty lines
|
|
326
|
+
cisv -t --skip-empty messy.csv
|
|
327
|
+
|
|
328
|
+
# Parse lines 100-1000 only
|
|
329
|
+
cisv --from-line 100 --to-line 1000 large.csv
|
|
330
|
+
|
|
331
|
+
# Select specific columns
|
|
332
|
+
cisv -s 0,2,5,7 data.csv
|
|
333
|
+
|
|
334
|
+
# Count rows with specific configuration
|
|
335
|
+
cisv -c -d $'\t' --skip-empty data.tsv
|
|
336
|
+
|
|
337
|
+
# Benchmark with custom delimiter
|
|
338
|
+
cisv -b -d ';' european.csv
|
|
158
339
|
```
|
|
159
340
|
|
|
160
341
|
### WRITING
|
|
@@ -185,21 +366,13 @@ Options:
|
|
|
185
366
|
|
|
186
367
|
### NODE.JS LIBRARY BENCHMARKS
|
|
187
368
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
|
191
|
-
|
|
192
|
-
|
|
|
193
|
-
| csv-parse | 15.48 | 34,471 |
|
|
194
|
-
| papaparse | 25.67 | 57,147 |
|
|
195
|
-
|
|
196
|
-
- **Asynchronous Streaming:**
|
|
369
|
+
| Library | Speed (MB/s) | Operations/sec | Configuration Support |
|
|
370
|
+
|--------------------|--------------|----------------|----------------------|
|
|
371
|
+
| cisv | 61.24 | 136,343 | Full |
|
|
372
|
+
| csv-parse | 15.48 | 34,471 | Partial |
|
|
373
|
+
| papaparse | 25.67 | 57,147 | Partial |
|
|
197
374
|
|
|
198
|
-
|
|
199
|
-
|--------------------|--------------|----------------|
|
|
200
|
-
| cisv | 76.94 | 171,287 |
|
|
201
|
-
| papaparse | 16.54 | 36,815 |
|
|
202
|
-
| neat-csv | 8.11 | 18,055 |
|
|
375
|
+
(More detailed benchmark results are available in the release pipelines.)
|
|
203
376
|
|
|
204
377
|
### RUNNING BENCHMARKS
|
|
205
378
|
|
|
@@ -210,38 +383,46 @@ make clean && make cli && make benchmark-cli
|
|
|
210
383
|
# Node.js benchmarks
|
|
211
384
|
npm run benchmark
|
|
212
385
|
|
|
213
|
-
#
|
|
214
|
-
|
|
215
|
-
docker run --rm --cpus="2.0" --memory="4g" cisv-benchmark
|
|
386
|
+
# Benchmark with custom configuration
|
|
387
|
+
cisv -b -d ';' -q "'" --trim european.csv
|
|
216
388
|
```
|
|
217
389
|
|
|
218
390
|
## TECHNICAL ARCHITECTURE
|
|
219
391
|
|
|
220
392
|
- **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
|
|
393
|
+
- **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
|
|
221
394
|
- **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
|
|
222
395
|
- **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
|
|
223
396
|
- **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
|
|
397
|
+
- **Configurable Parsing**: RFC 4180 compliant with extensive customization options
|
|
224
398
|
|
|
225
399
|
## FEATURES (PROS)
|
|
226
400
|
|
|
227
|
-
- RFC 4180 compliant
|
|
401
|
+
- RFC 4180 compliant with configurable extensions
|
|
228
402
|
- Handles quoted fields with embedded delimiters
|
|
403
|
+
- Support for multiple CSV dialects (TSV, PSV, etc.)
|
|
404
|
+
- Comment line support
|
|
405
|
+
- Field trimming and empty line handling
|
|
406
|
+
- Line range parsing for large files
|
|
229
407
|
- Streaming API for unlimited file sizes
|
|
230
408
|
- Safe fallback for non-x86 architectures
|
|
231
409
|
- High-performance CSV writer with SIMD optimization
|
|
410
|
+
- Row counting without full parsing
|
|
232
411
|
|
|
233
|
-
##
|
|
412
|
+
## LIMITATIONS
|
|
234
413
|
|
|
235
|
-
-
|
|
414
|
+
- Linux/Unix support only (optimized for x86_64 CPUs)
|
|
415
|
+
- Windows support planned for future release
|
|
236
416
|
|
|
237
417
|
## CONTRIBUTING
|
|
238
418
|
|
|
239
419
|
Areas of interest:
|
|
240
|
-
- ARM NEON/SVE
|
|
241
|
-
- Windows native support
|
|
242
|
-
- Parallel parsing for multi-core systems
|
|
243
|
-
- Custom memory allocators
|
|
244
|
-
- Streaming compression support
|
|
420
|
+
- [ ] ARM NEON/SVE optimization improvements (in progress)
|
|
421
|
+
- [ ] Windows native support
|
|
422
|
+
- [ ] Parallel parsing for multi-core systems
|
|
423
|
+
- [ ] Custom memory allocators
|
|
424
|
+
- [ ] Streaming compression support
|
|
425
|
+
- [ ] Additional transform functions
|
|
245
426
|
|
|
246
427
|
## LICENSE
|
|
247
428
|
|