cisv 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +158 -0
- package/.github/workflows/release.yml +167 -0
- package/Dockerfile +63 -0
- package/LICENSE +7 -0
- package/Makefile +160 -0
- package/README.md +249 -0
- package/SIMD_benchmarks.md +658 -0
- package/benchmark/benchmark.js +287 -0
- package/benchmark_cli_reader.sh +236 -0
- package/benchmark_cli_writer.sh +280 -0
- package/binding.gyp +57 -0
- package/debug-addon.js +64 -0
- package/examples/basic-parse.js +65 -0
- package/examples/large-file.js +35 -0
- package/examples/transform.js +152 -0
- package/examples/typescript.ts +38 -0
- package/index.d.ts +336 -0
- package/install_benchmark_deps.sh +156 -0
- package/package.json +47 -0
- package/run_benchmarks.sh +53 -0
- package/src/cisv_addon.cc +614 -0
- package/src/cisv_parser.c +988 -0
- package/src/cisv_parser.h +55 -0
- package/src/cisv_simd.h +53 -0
- package/src/cisv_transformer.c +537 -0
- package/src/cisv_transformer.h +145 -0
- package/src/cisv_writer.c +535 -0
- package/src/cisv_writer.h +60 -0
- package/src/index.ts +2 -0
- package/src/test/typescript.test.ts +43 -0
- package/src/win_getopt.h +100 -0
- package/src/win_sys_time.h +50 -0
- package/test/basic.test.js +104 -0
- package/test_select.sh +92 -0
- package/test_transform.sh +167 -0
- package/test_transform_leak_test.js +94 -0
- package/tsconfig.json +17 -0
- package/types/cisv.d.ts +8 -0
- package/valgrind-node.supp +69 -0
package/README.md
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# CISV
|
|
2
|
+
|
|
3
|
+

|
|
4
|
+

|
|
5
|
+

|
|
6
|
+

|
|
7
|
+
|
|
8
|
+
High-performance CSV parser and writer leveraging SIMD instructions and zero-copy memory mapping. Available as both a Node.js native addon and a standalone CLI tool.
|
|
9
|
+
|
|
10
|
+
## PERFORMANCE
|
|
11
|
+
|
|
12
|
+
- **469,968 MB/s** throughput on 2M row CSV files (AVX-512)
|
|
13
|
+
- **10-100x faster** than popular CSV parsers
|
|
14
|
+
- Zero-copy memory-mapped I/O with kernel optimizations
|
|
15
|
+
- SIMD accelerated with AVX-512/AVX2 auto-detection
|
|
16
|
+
|
|
17
|
+
## INSTALLATION
|
|
18
|
+
|
|
19
|
+
### NODE.JS PACKAGE
|
|
20
|
+
```bash
|
|
21
|
+
npm install cisv
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### CLI TOOL (FROM SOURCE)
|
|
25
|
+
```bash
|
|
26
|
+
git clone https://github.com/sanix-darker/cisv
|
|
27
|
+
cd cisv
|
|
28
|
+
make cli
|
|
29
|
+
sudo make install-cli
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### BUILD FROM SOURCE (NODE.JS ADDON)
|
|
33
|
+
```bash
|
|
34
|
+
npm install -g node-gyp
|
|
35
|
+
make build
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## QUICK START
|
|
39
|
+
|
|
40
|
+
### NODE.JS
|
|
41
|
+
```javascript
|
|
42
|
+
const { cisvParser } = require('cisv');
|
|
43
|
+
|
|
44
|
+
const parser = new cisvParser();
|
|
45
|
+
const rows = parser.parseSync('./data.csv');
|
|
46
|
+
console.log(`Parsed ${rows.length} rows`);
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### CLI
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Count rows
|
|
53
|
+
cisv -c large_file.csv
|
|
54
|
+
|
|
55
|
+
# Select columns
|
|
56
|
+
cisv -s 0,2,5 data.csv
|
|
57
|
+
|
|
58
|
+
# First 100 rows
|
|
59
|
+
cisv --head 100 data.csv
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## API REFERENCE
|
|
63
|
+
|
|
64
|
+
### TYPESCRIPT DEFINITIONS
|
|
65
|
+
```typescript
|
|
66
|
+
interface ParsedRow extends Array<string> {}
|
|
67
|
+
interface ParseStats {
|
|
68
|
+
rowCount: number;
|
|
69
|
+
fieldCount: number;
|
|
70
|
+
totalBytes: number;
|
|
71
|
+
parseTime: number;
|
|
72
|
+
}
|
|
73
|
+
interface TransformInfo {
|
|
74
|
+
cTransformCount: number;
|
|
75
|
+
jsTransformCount: number;
|
|
76
|
+
fieldIndices: number[];
|
|
77
|
+
}
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### BASIC PARSING
|
|
81
|
+
```javascript
|
|
82
|
+
const parser = new cisv.cisvParser();
|
|
83
|
+
|
|
84
|
+
// Synchronous
|
|
85
|
+
const rows = parser.parseSync('data.csv');
|
|
86
|
+
|
|
87
|
+
// Asynchronous
|
|
88
|
+
const asyncRows = await parser.parse('large-file.csv');
|
|
89
|
+
|
|
90
|
+
// From string
|
|
91
|
+
const csvString = 'name,age,city\nJohn,30,NYC\nJane,25,LA';
|
|
92
|
+
const stringRows = parser.parseString(csvString);
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### STREAMING
|
|
96
|
+
```javascript
|
|
97
|
+
const streamParser = new cisv.cisvParser();
|
|
98
|
+
const stream = fs.createReadStream('huge-file.csv');
|
|
99
|
+
|
|
100
|
+
stream.on('data', chunk => streamParser.write(chunk));
|
|
101
|
+
stream.on('end', () => {
|
|
102
|
+
streamParser.end();
|
|
103
|
+
const results = streamParser.getRows();
|
|
104
|
+
});
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### DATA TRANSFORMATION
|
|
108
|
+
|
|
109
|
+
Built-in C transforms (optimized):
|
|
110
|
+
```javascript
|
|
111
|
+
parser
|
|
112
|
+
.transform(0, 'uppercase') // Column 0 to uppercase
|
|
113
|
+
.transform(1, 'lowercase') // Column 1 to lowercase
|
|
114
|
+
.transform(2, 'trim') // Column 2 trim whitespace
|
|
115
|
+
.transform(3, 'to_int') // Column 3 to integer
|
|
116
|
+
.transform(4, 'to_float') // Column 4 to float
|
|
117
|
+
.transform(5, 'base64_encode') // Column 5 to base64
|
|
118
|
+
.transform(6, 'hash_sha256'); // Column 6 to SHA256
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Custom JavaScript transforms:
|
|
122
|
+
```javascript
|
|
123
|
+
// Single field
|
|
124
|
+
parser.transform(7, value => new Date(value).toISOString());
|
|
125
|
+
|
|
126
|
+
// All fields
|
|
127
|
+
parser.transform(-1, value => value.replace(/[^\w\s]/gi, ''));
|
|
128
|
+
|
|
129
|
+
// Chain transforms
|
|
130
|
+
parser
|
|
131
|
+
.transform(0, 'trim')
|
|
132
|
+
.transform(0, 'uppercase')
|
|
133
|
+
.transform(0, val => val.substring(0, 10));
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## CLI USAGE
|
|
137
|
+
|
|
138
|
+
### PARSING
|
|
139
|
+
```bash
|
|
140
|
+
cisv [OPTIONS] [FILE]
|
|
141
|
+
|
|
142
|
+
Options:
|
|
143
|
+
-h, --help Show help message
|
|
144
|
+
-v, --version Show version
|
|
145
|
+
-d, --delimiter DELIM Field delimiter (default: ,)
|
|
146
|
+
-s, --select COLS Select columns (comma-separated indices)
|
|
147
|
+
-c, --count Show only row count
|
|
148
|
+
--head N Show first N rows
|
|
149
|
+
--tail N Show last N rows
|
|
150
|
+
-o, --output FILE Write to FILE instead of stdout
|
|
151
|
+
-b, --benchmark Run benchmark mode
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### WRITING
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
cisv write [OPTIONS]
|
|
158
|
+
|
|
159
|
+
Options:
|
|
160
|
+
-g, --generate N Generate N rows of test data
|
|
161
|
+
-o, --output FILE Output file
|
|
162
|
+
-d, --delimiter DELIM Field delimiter
|
|
163
|
+
-Q, --quote-all Quote all fields
|
|
164
|
+
-r, --crlf Use CRLF line endings
|
|
165
|
+
-n, --null TEXT Null representation
|
|
166
|
+
-b, --benchmark Benchmark mode
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
## BENCHMARKS
|
|
170
|
+
|
|
171
|
+
### PARSER PERFORMANCE (273 MB, 5M ROWS)
|
|
172
|
+
|
|
173
|
+
| Parser | Speed (MB/s) | Time (ms) | Relative |
|
|
174
|
+
|---------------|--------------|-----------|----------------|
|
|
175
|
+
| **cisv** | 7,184 | 38 | 1.0x (fastest) |
|
|
176
|
+
| rust-csv | 391 | 698 | 18x slower |
|
|
177
|
+
| xsv | 650 | 420 | 11x slower |
|
|
178
|
+
| csvkit | 28 | 9,875 | 260x slower |
|
|
179
|
+
|
|
180
|
+
### NODE.JS LIBRARY BENCHMARKS
|
|
181
|
+
|
|
182
|
+
- **Synchronous with Data Access:**
|
|
183
|
+
|
|
184
|
+
| Library | Speed (MB/s) | Operations/sec |
|
|
185
|
+
|--------------------|--------------|----------------|
|
|
186
|
+
| cisv | 61.24 | 136,343 |
|
|
187
|
+
| csv-parse | 15.48 | 34,471 |
|
|
188
|
+
| papaparse | 25.67 | 57,147 |
|
|
189
|
+
|
|
190
|
+
- **Asynchronous Streaming:**
|
|
191
|
+
|
|
192
|
+
| Library | Speed (MB/s) | Operations/sec |
|
|
193
|
+
|--------------------|--------------|----------------|
|
|
194
|
+
| cisv | 76.94 | 171,287 |
|
|
195
|
+
| papaparse | 16.54 | 36,815 |
|
|
196
|
+
| neat-csv | 8.11 | 18,055 |
|
|
197
|
+
|
|
198
|
+
### RUNNING BENCHMARKS
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# CLI benchmarks
|
|
202
|
+
make clean && make cli && make benchmark-cli
|
|
203
|
+
|
|
204
|
+
# Node.js benchmarks
|
|
205
|
+
npm run benchmark
|
|
206
|
+
|
|
207
|
+
# Docker isolated benchmarks
|
|
208
|
+
docker build -t cisv-benchmark .
|
|
209
|
+
docker run --rm --cpus="2.0" --memory="4g" cisv-benchmark
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
## TECHNICAL ARCHITECTURE
|
|
213
|
+
|
|
214
|
+
- **SIMD Processing**: AVX-512 (64-byte vectors) or AVX2 (32-byte vectors) for parallel processing
|
|
215
|
+
- **Memory Mapping**: Direct kernel-to-userspace zero-copy with `mmap()`
|
|
216
|
+
- **Optimized Buffering**: 1MB ring buffer sized for L3 cache efficiency
|
|
217
|
+
- **Compiler Optimizations**: LTO and architecture-specific tuning with `-march=native`
|
|
218
|
+
|
|
219
|
+
## FEATURES (PROS)
|
|
220
|
+
|
|
221
|
+
- RFC 4180 compliant
|
|
222
|
+
- Handles quoted fields with embedded delimiters
|
|
223
|
+
- Streaming API for unlimited file sizes
|
|
224
|
+
- Safe fallback for non-x86 architectures
|
|
225
|
+
- High-performance CSV writer with SIMD optimization
|
|
226
|
+
|
|
227
|
+
## CONS
|
|
228
|
+
|
|
229
|
+
- Currently supports Linux only (performs best on x86_64 CPUs)
|
|
230
|
+
|
|
231
|
+
## CONTRIBUTING
|
|
232
|
+
|
|
233
|
+
Areas of interest:
|
|
234
|
+
- ARM NEON/SVE support
|
|
235
|
+
- Windows native support
|
|
236
|
+
- Parallel parsing for multi-core systems
|
|
237
|
+
- Custom memory allocators
|
|
238
|
+
- Streaming compression support
|
|
239
|
+
|
|
240
|
+
## LICENSE
|
|
241
|
+
|
|
242
|
+
GPL2 © [sanix-darker](https://github.com/sanix-darker)
|
|
243
|
+
|
|
244
|
+
## ACKNOWLEDGMENTS
|
|
245
|
+
|
|
246
|
+
Inspired by:
|
|
247
|
+
- [simdjson](https://github.com/simdjson/simdjson) - Parsing gigabytes of JSON per second
|
|
248
|
+
- [xsv](https://github.com/BurntSushi/xsv) - Fast CSV command line toolkit
|
|
249
|
+
- [rust-csv](https://github.com/BurntSushi/rust-csv) - CSV parser for Rust
|