cisv 0.0.25 → 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  ![License](https://img.shields.io/badge/license-GPL2-blue)
6
6
  ![Build](https://img.shields.io/badge/build-passing-brightgreen)
7
7
 
8
- High-performance CSV parser and writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool.
8
+ High-performance CSV parser and writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and standalone CLI tool with extensive configuration options.
9
9
 
10
10
  ## PERFORMANCE
11
11
 
@@ -13,6 +13,7 @@ High-performance CSV parser and writer leveraging SIMD instructions and zero-cop
13
13
  - **10-100x faster** than popular CSV parsers
14
14
  - Zero-copy memory-mapped I/O with kernel optimizations
15
15
  - SIMD accelerated with AVX-512/AVX2 auto-detection
16
+ - Dynamic lookup tables for configurable parsing
16
17
 
17
18
  ## INSTALLATION
18
19
 
@@ -41,39 +42,144 @@ make build
41
42
  ```javascript
42
43
  const { cisvParser } = require('cisv');
43
44
 
45
+ // Basic usage
44
46
  const parser = new cisvParser();
45
47
  const rows = parser.parseSync('./data.csv');
46
- console.log(`Parsed ${rows.length} rows`);
48
+
49
+ // With configuration (optional)
50
+ const tsv_parser = new cisvParser({
51
+ delimiter: '\t',
52
+ quote: "'",
53
+ trim: true
54
+ });
55
+ const tsv_rows = tsv_parser.parseSync('./data.tsv');
47
56
  ```
48
57
 
49
58
  ### CLI
50
-
51
59
  ```bash
52
- # Count rows
53
- cisv -c large_file.csv
60
+ # Basic parsing
61
+ cisv data.csv
62
+
63
+ # Parse TSV file
64
+ cisv -d $'\t' data.tsv
65
+
66
+ # Parse with custom quote and trim
67
+ cisv -q "'" -t data.csv
68
+
69
+ # Skip comment lines
70
+ cisv -m '#' config.csv
71
+ ```
72
+
73
+ ## CONFIGURATION OPTIONS
74
+
75
+ ### Parser Configuration
76
+
77
+ ```javascript
78
+ const parser = new cisvParser({
79
+ // Field delimiter character (default: ',')
80
+ delimiter: ',',
81
+
82
+ // Quote character (default: '"')
83
+ quote: '"',
84
+
85
+ // Escape character (null for RFC4180 "" style, default: null)
86
+ escape: null,
87
+
88
+ // Comment character to skip lines (default: null)
89
+ comment: '#',
90
+
91
+ // Trim whitespace from fields (default: false)
92
+ trim: true,
93
+
94
+ // Skip empty lines (default: false)
95
+ skipEmptyLines: true,
96
+
97
+ // Use relaxed parsing rules (default: false)
98
+ relaxed: false,
54
99
 
55
- # Select columns
56
- cisv -s 0,2,5 data.csv
100
+ // Skip lines with parse errors (default: false)
101
+ skipLinesWithError: true,
57
102
 
58
- # First 100 rows
59
- cisv --head 100 data.csv
103
+ // Maximum row size in bytes (0 = unlimited, default: 0)
104
+ maxRowSize: 1048576,
105
+
106
+ // Start parsing from line N (1-based, default: 1)
107
+ fromLine: 10,
108
+
109
+ // Stop parsing at line N (0 = until end, default: 0)
110
+ toLine: 1000
111
+ });
112
+ ```
113
+
114
+ ### Dynamic Configuration
115
+
116
+ ```javascript
117
+ // Set configuration after creation
118
+ parser.setConfig({
119
+ delimiter: ';',
120
+ quote: "'",
121
+ trim: true
122
+ });
123
+
124
+ // Get current configuration
125
+ const config = parser.getConfig();
126
+ console.log(config);
60
127
  ```
61
128
 
62
129
  ## API REFERENCE
63
130
 
64
131
  ### TYPESCRIPT DEFINITIONS
65
132
  ```typescript
133
+ interface CisvConfig {
134
+ delimiter?: string;
135
+ quote?: string;
136
+ escape?: string | null;
137
+ comment?: string | null;
138
+ trim?: boolean;
139
+ skipEmptyLines?: boolean;
140
+ relaxed?: boolean;
141
+ skipLinesWithError?: boolean;
142
+ maxRowSize?: number;
143
+ fromLine?: number;
144
+ toLine?: number;
145
+ }
146
+
66
147
  interface ParsedRow extends Array<string> {}
148
+
67
149
  interface ParseStats {
68
- rowCount: number;
69
- fieldCount: number;
70
- totalBytes: number;
71
- parseTime: number;
150
+ rowCount: number;
151
+ fieldCount: number;
152
+ totalBytes: number;
153
+ parseTime: number;
154
+ currentLine: number;
72
155
  }
156
+
73
157
  interface TransformInfo {
74
- cTransformCount: number;
75
- jsTransformCount: number;
76
- fieldIndices: number[];
158
+ cTransformCount: number;
159
+ jsTransformCount: number;
160
+ fieldIndices: number[];
161
+ }
162
+
163
+ class cisvParser {
164
+ constructor(config?: CisvConfig);
165
+ parseSync(path: string): ParsedRow[];
166
+ parse(path: string): Promise<ParsedRow[]>;
167
+ parseString(csv: string): ParsedRow[];
168
+ write(chunk: string | Buffer): void;
169
+ end(): void;
170
+ getRows(): ParsedRow[];
171
+ clear(): void;
172
+ setConfig(config: CisvConfig): void;
173
+ getConfig(): CisvConfig;
174
+ transform(fieldIndex: number, type: string | Function): this;
175
+ removeTransform(fieldIndex: number): this;
176
+ clearTransforms(): this;
177
+ getStats(): ParseStats;
178
+ getTransformInfo(): TransformInfo;
179
+ destroy(): void;
180
+
181
+ static countRows(path: string): number;
182
+ static countRowsWithConfig(path: string, config?: CisvConfig): number;
77
183
  }
78
184
  ```
79
185
 
@@ -82,38 +188,60 @@ interface TransformInfo {
82
188
  ```javascript
83
189
  import { cisvParser } from "cisv";
84
190
 
191
+ // Default configuration (standard CSV)
85
192
  const parser = new cisvParser();
86
-
87
- // Synchronous
88
193
  const rows = parser.parseSync('data.csv');
89
194
 
90
- // Asynchronous
91
- const asyncRows = await parser.parse('large-file.csv');
195
+ // Custom configuration (TSV with single quotes)
196
+ const tsvParser = new cisvParser({
197
+ delimiter: '\t',
198
+ quote: "'"
199
+ });
200
+ const tsvRows = tsvParser.parseSync('data.tsv');
201
+
202
+ // Parse specific line range
203
+ const rangeParser = new cisvParser({
204
+ fromLine: 100,
205
+ toLine: 1000
206
+ });
207
+ const subset = rangeParser.parseSync('large.csv');
92
208
 
93
- // From string
94
- const csvString = 'name,age,city\nJohn,30,NYC\nJane,25,LA';
95
- const stringRows = parser.parseString(csvString);
209
+ // Skip comments and empty lines
210
+ const cleanParser = new cisvParser({
211
+ comment: '#',
212
+ skipEmptyLines: true,
213
+ trim: true
214
+ });
215
+ const cleanData = cleanParser.parseSync('config.csv');
96
216
  ```
97
217
 
98
218
  ### STREAMING
99
219
 
100
220
  ```javascript
101
221
  import { cisvParser } from "cisv";
222
+ import fs from 'fs';
223
+
224
+ const streamParser = new cisvParser({
225
+ delimiter: ',',
226
+ trim: true
227
+ });
102
228
 
103
- const streamParser = new cisvParser();
104
229
  const stream = fs.createReadStream('huge-file.csv');
105
230
 
106
231
  stream.on('data', chunk => streamParser.write(chunk));
107
232
  stream.on('end', () => {
108
233
  streamParser.end();
109
234
  const results = streamParser.getRows();
235
+ console.log(`Parsed ${results.length} rows`);
110
236
  });
111
237
  ```
112
238
 
113
239
  ### DATA TRANSFORMATION
114
240
 
115
- Built-in C transforms (optimized):
116
241
  ```javascript
242
+ const parser = new cisvParser();
243
+
244
+ // Built-in C transforms (optimized)
117
245
  parser
118
246
  .transform(0, 'uppercase') // Column 0 to uppercase
119
247
  .transform(1, 'lowercase') // Column 1 to lowercase
@@ -122,39 +250,92 @@ parser
122
250
  .transform(4, 'to_float') // Column 4 to float
123
251
  .transform(5, 'base64_encode') // Column 5 to base64
124
252
  .transform(6, 'hash_sha256'); // Column 6 to SHA256
125
- ```
126
253
 
127
- Custom JavaScript transforms:
128
- ```javascript
129
- // Single field
254
+ // Custom JavaScript transforms
130
255
  parser.transform(7, value => new Date(value).toISOString());
131
256
 
132
- // All fields
257
+ // Apply to all fields
133
258
  parser.transform(-1, value => value.replace(/[^\w\s]/gi, ''));
134
259
 
135
- // Chain transforms
136
- parser
137
- .transform(0, 'trim')
138
- .transform(0, 'uppercase')
139
- .transform(0, val => val.substring(0, 10));
260
+ const transformed = parser.parseSync('data.csv');
261
+ ```
262
+
263
+ ### ROW COUNTING
264
+
265
+ ```javascript
266
+ import { cisvParser } from "cisv";
267
+
268
+ // Fast row counting without parsing
269
+ const count = cisvParser.countRows('large.csv');
270
+
271
+ // Count with specific configuration
272
+ const tsvCount = cisvParser.countRowsWithConfig('data.tsv', {
273
+ delimiter: '\t',
274
+ skipEmptyLines: true,
275
+ fromLine: 10,
276
+ toLine: 1000
277
+ });
140
278
  ```
141
279
 
142
280
  ## CLI USAGE
143
281
 
144
- ### PARSING
282
+ ### PARSING OPTIONS
283
+
145
284
  ```bash
146
285
  cisv [OPTIONS] [FILE]
147
286
 
148
- Options:
287
+ General Options:
149
288
  -h, --help Show help message
150
289
  -v, --version Show version
290
+ -o, --output FILE Write to FILE instead of stdout
291
+ -b, --benchmark Run benchmark mode
292
+
293
+ Configuration Options:
151
294
  -d, --delimiter DELIM Field delimiter (default: ,)
295
+ -q, --quote CHAR Quote character (default: ")
296
+ -e, --escape CHAR Escape character (default: RFC4180 style)
297
+ -m, --comment CHAR Comment character (default: none)
298
+ -t, --trim Trim whitespace from fields
299
+ -r, --relaxed Use relaxed parsing rules
300
+ --skip-empty Skip empty lines
301
+ --skip-errors Skip lines with parse errors
302
+ --max-row SIZE Maximum row size in bytes
303
+ --from-line N Start from line N (1-based)
304
+ --to-line N Stop at line N
305
+
306
+ Processing Options:
152
307
  -s, --select COLS Select columns (comma-separated indices)
153
- -c, --count Show only row count
154
- --head N Show first N rows
155
- --tail N Show last N rows
156
- -o, --output FILE Write to FILE instead of stdout
157
- -b, --benchmark Run benchmark mode
308
+ -c, --count Show only row count
309
+ --head N Show first N rows
310
+ --tail N Show last N rows
311
+ ```
312
+
313
+ ### EXAMPLES
314
+
315
+ ```bash
316
+ # Parse TSV file
317
+ cisv -d $'\t' data.tsv
318
+
319
+ # Parse CSV with semicolon delimiter and single quotes
320
+ cisv -d ';' -q "'" european.csv
321
+
322
+ # Skip comment lines starting with #
323
+ cisv -m '#' config.csv
324
+
325
+ # Trim whitespace and skip empty lines
326
+ cisv -t --skip-empty messy.csv
327
+
328
+ # Parse lines 100-1000 only
329
+ cisv --from-line 100 --to-line 1000 large.csv
330
+
331
+ # Select specific columns
332
+ cisv -s 0,2,5,7 data.csv
333
+
334
+ # Count rows with specific configuration
335
+ cisv -c -d $'\t' --skip-empty data.tsv
336
+
337
+ # Benchmark with custom delimiter
338
+ cisv -b -d ';' european.csv
158
339
  ```
159
340
 
160
341
  ### WRITING
@@ -185,21 +366,13 @@ Options:
185
366
 
186
367
  ### NODE.JS LIBRARY BENCHMARKS
187
368
 
188
- - **Synchronous with Data Access:**
189
-
190
- | Library | Speed (MB/s) | Operations/sec |
191
- |--------------------|--------------|----------------|
192
- | cisv | 61.24 | 136,343 |
193
- | csv-parse | 15.48 | 34,471 |
194
- | papaparse | 25.67 | 57,147 |
195
-
196
- - **Asynchronous Streaming:**
369
+ | Library | Speed (MB/s) | Operations/sec | Configuration Support |
370
+ |--------------------|--------------|----------------|----------------------|
371
+ | cisv | 61.24 | 136,343 | Full |
372
+ | csv-parse | 15.48 | 34,471 | Partial |
373
+ | papaparse | 25.67 | 57,147 | Partial |
197
374
 
198
- | Library | Speed (MB/s) | Operations/sec |
199
- |--------------------|--------------|----------------|
200
- | cisv | 76.94 | 171,287 |
201
- | papaparse | 16.54 | 36,815 |
202
- | neat-csv | 8.11 | 18,055 |
375
+ (more benchmark details are available in the release pipelines)
203
376
 
204
377
  ### RUNNING BENCHMARKS
205
378
 
@@ -210,38 +383,46 @@ make clean && make cli && make benchmark-cli
210
383
  # Node.js benchmarks
211
384
  npm run benchmark
212
385
 
213
- # Docker isolated benchmarks
214
- docker build -t cisv-benchmark .
215
- docker run --rm --cpus="2.0" --memory="4g" cisv-benchmark
386
+ # Benchmark with custom configuration
387
+ cisv -b -d ';' -q "'" --trim european.csv
216
388
  ```
217
389
 
218
390
  ## TECHNICAL ARCHITECTURE
219
391
 
220
392
  - **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
393
+ - **Dynamic Lookup Tables**: Generated per-configuration for optimal state transitions
221
394
  - **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
222
395
  - **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
223
396
  - **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
397
+ - **Configurable Parsing**: RFC 4180 compliant with extensive customization options
224
398
 
225
399
  ## FEATURES (PROS)
226
400
 
227
- - RFC 4180 compliant
401
+ - RFC 4180 compliant with configurable extensions
228
402
  - Handles quoted fields with embedded delimiters
403
+ - Support for multiple CSV dialects (TSV, PSV, etc.)
404
+ - Comment line support
405
+ - Field trimming and empty line handling
406
+ - Line range parsing for large files
229
407
  - Streaming API for unlimited file sizes
230
408
  - Safe fallback for non-x86 architectures
231
409
  - High-performance CSV writer with SIMD optimization
410
+ - Row counting without full parsing
232
411
 
233
- ## CONS
412
+ ## LIMITATIONS
234
413
 
235
- - Only Linux support for now (really good on x86_64 CPU)
414
+ Linux/Unix support only (optimized for x86_64 CPUs)
415
+ - Windows support planned for future release
236
416
 
237
417
  ## CONTRIBUTING
238
418
 
239
419
  Areas of interest:
240
- - ARM NEON/SVE support
241
- - Windows native support
242
- - Parallel parsing for multi-core systems
243
- - Custom memory allocators
244
- - Streaming compression support
420
+ - [ ] ARM NEON/SVE optimization improvements (in progress)
421
+ - [ ] Windows native support
422
+ - [ ] Parallel parsing for multi-core systems
423
+ - [ ] Custom memory allocators
424
+ - [ ] Streaming compression support
425
+ - [ ] Additional transform functions
245
426
 
246
427
  ## LICENSE
247
428