cisv 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +158 -0
- package/.github/workflows/release.yml +167 -0
- package/Dockerfile +63 -0
- package/LICENSE +7 -0
- package/Makefile +160 -0
- package/README.md +249 -0
- package/SIMD_benchmarks.md +658 -0
- package/benchmark/benchmark.js +287 -0
- package/benchmark_cli_reader.sh +236 -0
- package/benchmark_cli_writer.sh +280 -0
- package/binding.gyp +57 -0
- package/debug-addon.js +64 -0
- package/examples/basic-parse.js +65 -0
- package/examples/large-file.js +35 -0
- package/examples/transform.js +152 -0
- package/examples/typescript.ts +38 -0
- package/index.d.ts +336 -0
- package/install_benchmark_deps.sh +156 -0
- package/package.json +47 -0
- package/run_benchmarks.sh +53 -0
- package/src/cisv_addon.cc +614 -0
- package/src/cisv_parser.c +988 -0
- package/src/cisv_parser.h +55 -0
- package/src/cisv_simd.h +53 -0
- package/src/cisv_transformer.c +537 -0
- package/src/cisv_transformer.h +145 -0
- package/src/cisv_writer.c +535 -0
- package/src/cisv_writer.h +60 -0
- package/src/index.ts +2 -0
- package/src/test/typescript.test.ts +43 -0
- package/src/win_getopt.h +100 -0
- package/src/win_sys_time.h +50 -0
- package/test/basic.test.js +104 -0
- package/test_select.sh +92 -0
- package/test_transform.sh +167 -0
- package/test_transform_leak_test.js +94 -0
- package/tsconfig.json +17 -0
- package/types/cisv.d.ts +8 -0
- package/valgrind-node.supp +69 -0
|
@@ -0,0 +1,988 @@
|
|
|
1
|
+
#define _GNU_SOURCE
|
|
2
|
+
#include <stdio.h>
|
|
3
|
+
#include <stdlib.h>
|
|
4
|
+
#include <string.h>
|
|
5
|
+
#include "win_getopt.h"
|
|
6
|
+
#include <sys/stat.h>
|
|
7
|
+
#include <errno.h>
|
|
8
|
+
#include <time.h>
|
|
9
|
+
// NOTE: Windows is not supported for now; it raises too many issues
|
|
10
|
+
#include <sys/mman.h>
|
|
11
|
+
#include <fcntl.h>
|
|
12
|
+
#include <unistd.h>
|
|
13
|
+
#include <getopt.h>
|
|
14
|
+
#include <sys/time.h>
|
|
15
|
+
#include "./cisv_simd.h"
|
|
16
|
+
#include "cisv_parser.h"
|
|
17
|
+
|
|
18
|
+
// #define RINGBUF_SIZE (1 << 20) // 1 MiB (we may adjust according to needs)
|
|
19
|
+
#define RINGBUF_SIZE (1 << 16) // 64kb (for memory safe reasons)
|
|
20
|
+
#define PREFETCH_DISTANCE 256
|
|
21
|
+
|
|
22
|
+
struct cisv_parser {
|
|
23
|
+
uint8_t *base; // pointer to the whole input, if memory-mapped
|
|
24
|
+
size_t size; // length of that mapping
|
|
25
|
+
int fd; // the underlying file descriptor (-1 ⇒ none)
|
|
26
|
+
uint8_t *ring; // malloc’ed circular buffer when not mmapped
|
|
27
|
+
size_t head; // write head: next byte slot to fill
|
|
28
|
+
uint8_t st;
|
|
29
|
+
cisv_field_cb fcb; // field callback fired whenever a full cell is ready
|
|
30
|
+
// (delimiter or row-ending newline encountered, consistent with RFC 4180 rules)
|
|
31
|
+
|
|
32
|
+
cisv_row_cb rcb; // row callback fired after the last field of each record
|
|
33
|
+
void *user;
|
|
34
|
+
const uint8_t *field_start; // where the in-progress field began
|
|
35
|
+
};
|
|
36
|
+
// State constants for branchless operations
|
|
37
|
+
#define S_UNQUOTED 0
|
|
38
|
+
#define S_QUOTED 1
|
|
39
|
+
#define S_QUOTE_ESC 2
|
|
40
|
+
|
|
41
|
+
// Action flags for each character in each state
|
|
42
|
+
#define ACT_NONE 0
|
|
43
|
+
#define ACT_FIELD 1
|
|
44
|
+
#define ACT_ROW 2
|
|
45
|
+
#define ACT_REPROCESS 4
|
|
46
|
+
|
|
47
|
+
// Lookup tables for branchless state transitions - filled lazily at run time by init_tables()
|
|
48
|
+
static uint8_t state_table[3][256];
|
|
49
|
+
static uint8_t action_table[3][256];
|
|
50
|
+
static int tables_initialized = 0;
|
|
51
|
+
|
|
52
|
+
// Initialize lookup tables (called lazily)
|
|
53
|
+
static void init_tables(void) {
|
|
54
|
+
if (tables_initialized) return;
|
|
55
|
+
|
|
56
|
+
// Initialize state transitions
|
|
57
|
+
for (int c = 0; c < 256; c++) {
|
|
58
|
+
// S_UNQUOTED transitions
|
|
59
|
+
state_table[S_UNQUOTED][c] = S_UNQUOTED;
|
|
60
|
+
if (c == '"') state_table[S_UNQUOTED][c] = S_QUOTED;
|
|
61
|
+
|
|
62
|
+
// S_QUOTED transitions
|
|
63
|
+
state_table[S_QUOTED][c] = S_QUOTED;
|
|
64
|
+
if (c == '"') state_table[S_QUOTED][c] = S_QUOTE_ESC;
|
|
65
|
+
|
|
66
|
+
// S_QUOTE_ESC transitions
|
|
67
|
+
if (c == '"') {
|
|
68
|
+
state_table[S_QUOTE_ESC][c] = S_QUOTED;
|
|
69
|
+
} else {
|
|
70
|
+
state_table[S_QUOTE_ESC][c] = S_UNQUOTED;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// Initialize action table
|
|
75
|
+
memset(action_table, ACT_NONE, sizeof(action_table));
|
|
76
|
+
|
|
77
|
+
// S_UNQUOTED actions
|
|
78
|
+
action_table[S_UNQUOTED][','] = ACT_FIELD;
|
|
79
|
+
action_table[S_UNQUOTED]['\n'] = ACT_FIELD | ACT_ROW;
|
|
80
|
+
|
|
81
|
+
// S_QUOTE_ESC actions
|
|
82
|
+
for (int c = 0; c < 256; c++) {
|
|
83
|
+
if (c != '"') {
|
|
84
|
+
action_table[S_QUOTE_ESC][c] = ACT_REPROCESS;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
tables_initialized = 1;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
// Report the field [start, end) to the field callback.
// Nothing happens when no callback is installed, when either pointer is NULL,
// or when end lies before start. An empty field (end == start) is reported
// with length 0.
static inline void yield_field(cisv_parser *parser, const uint8_t *start, const uint8_t *end) {
    if (parser->fcb == NULL) {
        return;
    }
    if (start == NULL || end == NULL) {
        return;
    }
    if (end < start) {
        return;
    }
    parser->fcb(parser->user, (const char *)start, (size_t)(end - start));
}
|
|
98
|
+
|
|
99
|
+
// Signal the end of a row to the row callback, if one is installed.
static inline void yield_row(cisv_parser *parser) {
    cisv_row_cb on_row = parser->rcb;
    if (on_row != NULL) {
        on_row(parser->user);
    }
}
|
|
104
|
+
|
|
105
|
+
#if defined(cisv_HAVE_AVX512) || defined(cisv_HAVE_AVX2)
|
|
106
|
+
|
|
107
|
+
// x86 (AVX2 / AVX-512) parser for one contiguous buffer.
//
// Scans `len` bytes at `buffer`. Outside quotes, every ',' and '\n' ends the
// current field (yield_field) and every '\n' additionally ends the row
// (yield_row). parser->st and parser->field_start are read and updated, so
// the caller must have set them before the call.
//
// Two alternating phases:
//   * vector phase  - while the state is S_UNQUOTED and a full vector of
//     input remains, one compare per special byte locates delimiters;
//   * scalar phase  - the table-driven state machine handles quoted regions
//     and the final partial vector, then hands back to the vector phase as
//     soon as the state is S_UNQUOTED again.
//
// Both phases report empty fields, so "a,,b" produces three fields no matter
// where it falls in the buffer.
static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t len) {
    if (!tables_initialized) init_tables();

    const uint8_t *cur = buffer;
    const uint8_t *end = buffer + len;

    // Splat each special byte through a 64-byte stack buffer (64 bytes is
    // enough for a full AVX-512 register) and load it as a vector constant.
    uint8_t comma_bytes[64];
    uint8_t quote_bytes[64];
    uint8_t newline_bytes[64];
    memset(comma_bytes, ',', 64);
    memset(quote_bytes, '"', 64);
    memset(newline_bytes, '\n', 64);

    const cisv_vec comma_vec = cisv_LOAD(comma_bytes);
    const cisv_vec quote_vec = cisv_LOAD(quote_bytes);
    const cisv_vec newline_vec = cisv_LOAD(newline_bytes);

    while (cur < end) {
        // ---- vector phase: unquoted text, at least one full vector left ----
        while (parser->st == S_UNQUOTED && (size_t)(end - cur) >= (size_t)cisv_VEC_BYTES) {
            // Prefetch is only a hint; the address may lie past the buffer.
            __builtin_prefetch(cur + PREFETCH_DISTANCE, 0, 1);

            cisv_vec chunk = cisv_LOAD(cur);

#ifdef cisv_HAVE_AVX512
            uint64_t comma_mask = cisv_CMP_EQ(chunk, comma_vec);
            uint64_t quote_mask = cisv_CMP_EQ(chunk, quote_vec);
            uint64_t newline_mask = cisv_CMP_EQ(chunk, newline_vec);
            uint64_t combined = comma_mask | quote_mask | newline_mask;
#else
            cisv_vec comma_cmp = cisv_CMP_EQ(chunk, comma_vec);
            cisv_vec quote_cmp = cisv_CMP_EQ(chunk, quote_vec);
            cisv_vec newline_cmp = cisv_CMP_EQ(chunk, newline_vec);
            cisv_vec combined_vec = cisv_OR_MASK(cisv_OR_MASK(comma_cmp, quote_cmp), newline_cmp);
            uint32_t combined = cisv_MOVEMASK(combined_vec);
#endif

            // Where scanning resumes: normally the next vector, but right
            // after an opening quote if one is found below.
            const uint8_t *resume = cur + cisv_VEC_BYTES;

            // Visit the special bytes in ascending position order.
            while (combined) {
                size_t pos = cisv_CTZ(combined);
                const uint8_t *special_pos = cur + pos;
                uint8_t c = *special_pos;

                if (c == '"') {
                    // The mask was computed assuming unquoted text, so the
                    // bits after this quote are no longer trustworthy. Let
                    // the scalar state machine take over from the next byte.
                    parser->st = S_QUOTED;
                    resume = special_pos + 1;
                    break;
                }

                // ',' or '\n': close the field, even when it is empty, and
                // always move field_start past the delimiter.
                yield_field(parser, parser->field_start, special_pos);
                parser->field_start = special_pos + 1;
                if (c == '\n') {
                    yield_row(parser);
                }

                combined &= combined - 1; // clear the lowest set bit
            }

            cur = resume;
        }

        // ---- scalar phase: quoted text, or fewer than a vector of bytes ----
        while (cur < end &&
               (parser->st != S_UNQUOTED || (size_t)(end - cur) < (size_t)cisv_VEC_BYTES)) {
            uint8_t c = *cur;
            uint8_t next_state = state_table[parser->st][c];
            uint8_t action = action_table[parser->st][c];

            parser->st = next_state;

            if (action & ACT_FIELD) {
                yield_field(parser, parser->field_start, cur);
                parser->field_start = cur + 1;
            }
            if (action & ACT_ROW) {
                yield_row(parser);
            }

            // ACT_REPROCESS (value 4): stay on this byte so it is examined
            // again in the new state; otherwise advance by one.
            cur += 1 - ((action & ACT_REPROCESS) >> 2);
        }
    }
}
|
|
211
|
+
|
|
212
|
+
#elif defined(HAS_NEON)
|
|
213
|
+
|
|
214
|
+
// ARM NEON optimized parsing
|
|
215
|
+
static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t len) {
|
|
216
|
+
// Ensure tables are initialized
|
|
217
|
+
if (!tables_initialized) init_tables();
|
|
218
|
+
|
|
219
|
+
const uint8_t *cur = buffer;
|
|
220
|
+
const uint8_t *end = buffer + len;
|
|
221
|
+
|
|
222
|
+
// NEON constants
|
|
223
|
+
uint8x16_t comma_vec = vdupq_n_u8(',');
|
|
224
|
+
uint8x16_t quote_vec = vdupq_n_u8('"');
|
|
225
|
+
uint8x16_t newline_vec = vdupq_n_u8('\n');
|
|
226
|
+
|
|
227
|
+
while (cur + 16 <= end && parser->st == S_UNQUOTED) {
|
|
228
|
+
// Prefetch next chunk
|
|
229
|
+
__builtin_prefetch(cur + 64, 0, 1);
|
|
230
|
+
|
|
231
|
+
// Load 16 bytes
|
|
232
|
+
uint8x16_t chunk = vld1q_u8(cur);
|
|
233
|
+
|
|
234
|
+
// Compare with special characters
|
|
235
|
+
uint8x16_t comma_cmp = vceqq_u8(chunk, comma_vec);
|
|
236
|
+
uint8x16_t quote_cmp = vceqq_u8(chunk, quote_vec);
|
|
237
|
+
uint8x16_t newline_cmp = vceqq_u8(chunk, newline_vec);
|
|
238
|
+
|
|
239
|
+
// Combine masks
|
|
240
|
+
uint8x16_t combined = vorrq_u8(vorrq_u8(comma_cmp, quote_cmp), newline_cmp);
|
|
241
|
+
|
|
242
|
+
// Check if any special char found
|
|
243
|
+
uint64_t mask_low = vgetq_lane_u64(vreinterpretq_u64_u8(combined), 0);
|
|
244
|
+
uint64_t mask_high = vgetq_lane_u64(vreinterpretq_u64_u8(combined), 1);
|
|
245
|
+
|
|
246
|
+
if (!(mask_low | mask_high)) {
|
|
247
|
+
// No special chars, advance
|
|
248
|
+
cur += 16;
|
|
249
|
+
continue;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
// Process special characters byte by byte
|
|
253
|
+
for (int i = 0; i < 16 && cur + i < end; i++) {
|
|
254
|
+
uint8_t c = cur[i];
|
|
255
|
+
if (c == ',' || c == '\n' || c == '"') {
|
|
256
|
+
if (c == ',' || c == '\n') {
|
|
257
|
+
if (cur + i > parser->field_start) {
|
|
258
|
+
yield_field(parser, parser->field_start, cur + i);
|
|
259
|
+
parser->field_start = cur + i + 1;
|
|
260
|
+
}
|
|
261
|
+
if (c == '\n') {
|
|
262
|
+
yield_row(parser);
|
|
263
|
+
}
|
|
264
|
+
} else if (c == '"') {
|
|
265
|
+
parser->st = S_QUOTED;
|
|
266
|
+
cur += i + 1;
|
|
267
|
+
goto scalar_fallback;
|
|
268
|
+
}
|
|
269
|
+
}
|
|
270
|
+
}
|
|
271
|
+
cur += 16;
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
scalar_fallback:
|
|
275
|
+
// Handle remainder with scalar code
|
|
276
|
+
while (cur < end) {
|
|
277
|
+
uint8_t c = *cur;
|
|
278
|
+
uint8_t next_state = state_table[parser->st][c];
|
|
279
|
+
uint8_t action = action_table[parser->st][c];
|
|
280
|
+
|
|
281
|
+
parser->st = next_state;
|
|
282
|
+
|
|
283
|
+
if (action & ACT_FIELD) {
|
|
284
|
+
yield_field(parser, parser->field_start, cur);
|
|
285
|
+
parser->field_start = cur + 1;
|
|
286
|
+
}
|
|
287
|
+
if (action & ACT_ROW) {
|
|
288
|
+
yield_row(parser);
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
cur += 1 - ((action & ACT_REPROCESS) >> 2);
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
#else
|
|
296
|
+
// Non-SIMD fallback with branchless optimizations
|
|
297
|
+
static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t len) {
|
|
298
|
+
// Ensure tables are initialized
|
|
299
|
+
if (!tables_initialized) init_tables();
|
|
300
|
+
|
|
301
|
+
const uint8_t *cur = buffer;
|
|
302
|
+
const uint8_t *end = buffer + len;
|
|
303
|
+
|
|
304
|
+
// Unroll loop by 8 for better performance
|
|
305
|
+
while (cur + 8 <= end) {
|
|
306
|
+
__builtin_prefetch(cur + 64, 0, 1);
|
|
307
|
+
|
|
308
|
+
for (int i = 0; i < 8; i++) {
|
|
309
|
+
uint8_t c = cur[i];
|
|
310
|
+
uint8_t next_state = state_table[parser->st][c];
|
|
311
|
+
uint8_t action = action_table[parser->st][c];
|
|
312
|
+
|
|
313
|
+
parser->st = next_state;
|
|
314
|
+
|
|
315
|
+
if (action & ACT_FIELD) {
|
|
316
|
+
yield_field(parser, parser->field_start, cur + i);
|
|
317
|
+
parser->field_start = cur + i + 1;
|
|
318
|
+
}
|
|
319
|
+
if (action & ACT_ROW) {
|
|
320
|
+
yield_row(parser);
|
|
321
|
+
}
|
|
322
|
+
if (action & ACT_REPROCESS) {
|
|
323
|
+
i--;
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
cur += 8;
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
// Handle remainder
|
|
330
|
+
while (cur < end) {
|
|
331
|
+
uint8_t c = *cur;
|
|
332
|
+
uint8_t next_state = state_table[parser->st][c];
|
|
333
|
+
uint8_t action = action_table[parser->st][c];
|
|
334
|
+
|
|
335
|
+
parser->st = next_state;
|
|
336
|
+
|
|
337
|
+
if (action & ACT_FIELD) {
|
|
338
|
+
yield_field(parser, parser->field_start, cur);
|
|
339
|
+
parser->field_start = cur + 1;
|
|
340
|
+
}
|
|
341
|
+
if (action & ACT_ROW) {
|
|
342
|
+
yield_row(parser);
|
|
343
|
+
}
|
|
344
|
+
|
|
345
|
+
cur += 1 - ((action & ACT_REPROCESS) >> 2);
|
|
346
|
+
}
|
|
347
|
+
}
|
|
348
|
+
#endif
|
|
349
|
+
|
|
350
|
+
// Parse one self-contained buffer.
//
// The state machine is reset to S_UNQUOTED and the first field is taken to
// start at `buffer`, so no state carries over from a previous call. After the
// scan, any bytes following the last delimiter are reported as one final
// field; no row callback is fired for that trailing field.
// Always returns 0.
static int parse_memory(cisv_parser *parser, const uint8_t *buffer, size_t len) {
    const uint8_t *const limit = buffer + len;

    parser->st = S_UNQUOTED;
    parser->field_start = buffer;

    parse_simd_chunk(parser, buffer, len);

    // Unterminated last field (input did not end in a delimiter).
    if (parser->field_start < limit) {
        yield_field(parser, parser->field_start, limit);
    }
    return 0;
}
|
|
362
|
+
|
|
363
|
+
// Count the '\n' bytes in the file at `path`.
//
// The file is mapped read-only and scanned with the widest available path
// (AVX-512/AVX2, NEON, or an 8-way unrolled scalar loop). Quoting is not
// interpreted: every newline byte counts, and a last line without a trailing
// newline does not.
//
// Returns 0 when the file cannot be opened, stat'ed or mapped, and for an
// empty file.
size_t cisv_parser_count_rows(const char *path) {
    const int fd = open(path, O_RDONLY);
    if (fd < 0) {
        return 0;
    }

    struct stat info;
    if (fstat(fd, &info) < 0 || info.st_size == 0) {
        close(fd);
        return 0;
    }
    const size_t total = (size_t)info.st_size;

    uint8_t *data = (uint8_t *)mmap(NULL, total, PROT_READ, MAP_PRIVATE, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        return 0;
    }

    // Best-effort hint: the mapping is read front to back exactly once.
    madvise(data, total, MADV_SEQUENTIAL);

    size_t rows = 0;
    size_t pos = 0;

#if defined(cisv_HAVE_AVX512) || defined(cisv_HAVE_AVX2)
    // x86: compare a whole vector against '\n' and popcount the match mask.
    uint8_t newline_bytes[64];
    memset(newline_bytes, '\n', 64);
    const cisv_vec newline_vec = cisv_LOAD(newline_bytes);

    while (pos + cisv_VEC_BYTES <= total) {
        const cisv_vec chunk = cisv_LOAD(data + pos);
#ifdef cisv_HAVE_AVX512
        const uint64_t hits = cisv_CMP_EQ(chunk, newline_vec);
        rows += __builtin_popcountll(hits);
#else
        const uint32_t hits = cisv_MOVEMASK(cisv_CMP_EQ(chunk, newline_vec));
        rows += __builtin_popcount(hits);
#endif
        pos += cisv_VEC_BYTES;
    }
#elif defined(HAS_NEON)
    // NEON: turn each matching lane (0xFF) into 1 and sum the 16 lanes.
    const uint8x16_t newline_vec = vdupq_n_u8('\n');
    const uint8x16_t lane_one = vdupq_n_u8(1);

    // 64 bytes (four registers) per iteration.
    for (; pos + 64 <= total; pos += 64) {
        for (size_t sub = 0; sub < 64; sub += 16) {
            const uint8x16_t match = vceqq_u8(vld1q_u8(data + pos + sub), newline_vec);
            rows += vaddvq_u8(vandq_u8(match, lane_one));
        }
    }

    // Then one register at a time.
    for (; pos + 16 <= total; pos += 16) {
        const uint8x16_t match = vceqq_u8(vld1q_u8(data + pos), newline_vec);
        rows += vaddvq_u8(vandq_u8(match, lane_one));
    }
#else
    // Scalar: eight independent comparisons per iteration.
    for (size_t blocks = total / 8; blocks != 0; blocks--, pos += 8) {
        const uint8_t *p = data + pos;
        rows += (p[0] == '\n');
        rows += (p[1] == '\n');
        rows += (p[2] == '\n');
        rows += (p[3] == '\n');
        rows += (p[4] == '\n');
        rows += (p[5] == '\n');
        rows += (p[6] == '\n');
        rows += (p[7] == '\n');
    }
#endif

    // Bytes left over after the wide loops.
    while (pos < total) {
        rows += (data[pos] == '\n');
        pos++;
    }

    munmap(data, total);
    close(fd);
    return rows;
}
|
|
468
|
+
|
|
469
|
+
// Allocate a parser.
//
// fcb  - called for every field (may be NULL)
// rcb  - called at the end of every row (may be NULL)
// user - opaque pointer handed back to both callbacks
//
// Returns NULL when either allocation fails. Release the parser with
// cisv_parser_destroy().
cisv_parser *cisv_parser_create(cisv_field_cb fcb, cisv_row_cb rcb, void *user) {
    cisv_parser *parser = calloc(1, sizeof(*parser));
    if (!parser) return NULL;

    // 64-byte aligned staging buffer with 64 spare bytes past RINGBUF_SIZE.
    // posix_memalign writes a void *, so receive it in a real void * object
    // instead of viewing the uint8_t * member through a void ** cast.
    void *ring = NULL;
    if (posix_memalign(&ring, 64, RINGBUF_SIZE + 64) != 0) {
        free(parser);
        return NULL;
    }
    parser->ring = ring;

    parser->fd = -1; // no file open yet
    parser->fcb = fcb;
    parser->rcb = rcb;
    parser->user = user;
    return parser;
}
|
|
485
|
+
|
|
486
|
+
// Release everything a parser owns: the file mapping (if one exists), the
// file descriptor (if one is open), the staging buffer and the parser
// itself. A NULL parser is ignored.
void cisv_parser_destroy(cisv_parser *parser) {
    if (parser == NULL) {
        return;
    }

    const int has_mapping = (parser->base != NULL) && (parser->base != MAP_FAILED);
    if (has_mapping) {
        munmap(parser->base, parser->size);
    }

    if (parser->fd >= 0) {
        close(parser->fd);
    }

    free(parser->ring);
    free(parser);
}
|
|
498
|
+
|
|
499
|
+
// Map the file at `path` into memory and parse it in a single pass.
//
// Returns 0 on success (including an empty file, for which no callback
// fires), -EINVAL when `parser` or `path` is NULL, or a negated errno value
// when open/fstat/mmap fails.
//
// After a successful parse the mapping and descriptor remain owned by the
// parser until the next cisv_parser_parse_file() call or
// cisv_parser_destroy(). A mapping and descriptor left over from an earlier
// call are released first, so a parser can be reused without leaking.
int cisv_parser_parse_file(cisv_parser *parser, const char *path) {
    if (!parser || !path) return -EINVAL;

    // Drop whatever a previous call left behind.
    if (parser->base && parser->base != MAP_FAILED) {
        munmap(parser->base, parser->size);
    }
    parser->base = NULL;
    parser->size = 0;
    if (parser->fd >= 0) {
        close(parser->fd);
        parser->fd = -1;
    }

    struct stat st;
    parser->fd = open(path, O_RDONLY);
    if (parser->fd < 0) return -errno;

    // Platform-specific read-ahead hints (best effort, results ignored)
#ifdef HAS_POSIX_FADVISE
    posix_fadvise(parser->fd, 0, 0, POSIX_FADV_SEQUENTIAL);
#elif defined(__APPLE__) && defined(HAS_RDADVISE)
    struct radvisory radv;
    radv.ra_offset = 0;
    radv.ra_count = 0; // 0 means whole file
    fcntl(parser->fd, F_RDADVISE, &radv);
#endif

    if (fstat(parser->fd, &st) < 0) {
        int err = errno; // save before close() can overwrite it
        close(parser->fd);
        parser->fd = -1;
        return -err;
    }

    // A zero-length mapping is invalid, and there is nothing to parse anyway.
    if (st.st_size == 0) {
        close(parser->fd);
        parser->fd = -1;
        return 0;
    }

    parser->size = st.st_size;

    // mmap with platform-specific flags
    int mmap_flags = MAP_PRIVATE;
#ifdef HAS_MAP_POPULATE
    mmap_flags |= MAP_POPULATE;
#endif

    parser->base = mmap(NULL, parser->size, PROT_READ, mmap_flags, parser->fd, 0);
    if (parser->base == MAP_FAILED) {
        int err = errno;
        // Never leave MAP_FAILED behind in the parser.
        parser->base = NULL;
        parser->size = 0;
        close(parser->fd);
        parser->fd = -1;
        return -err;
    }

    // Advise the kernel about the access pattern (best effort)
    madvise(parser->base, parser->size, MADV_SEQUENTIAL);
#ifdef MADV_WILLNEED
    // Ask for at most the first 1 MiB up front.
    madvise(parser->base, parser->size < (1<<20) ? parser->size : (1<<20), MADV_WILLNEED);
#endif

    return parse_memory(parser, parser->base, parser->size);
}
|
|
551
|
+
|
|
552
|
+
// Feed one chunk of streamed input to the parser.
//
// The chunk is appended to the staging buffer. Everything up to and including
// the last '\n' now in the buffer is parsed immediately; the bytes after it
// (an incomplete record) stay buffered so the next call can complete them.
// This keeps a field that is split across two chunks in one piece.
//
// The buffer is flushed in full, even in the middle of a record, when the new
// chunk would not fit or when more than RINGBUF_SIZE / 2 bytes are pending.
// A record longer than that is therefore still split.
//
// Call cisv_parser_end() after the last chunk to parse the buffered tail.
//
// NOTE(review): the boundary scan looks at raw '\n' bytes, and parse_memory()
// restarts in the unquoted state on every call, so a newline inside a quoted
// field is still treated as a record boundary when streaming.
//
// Returns 0 on success, -EINVAL when `parser` or `chunk` is NULL or when
// len >= RINGBUF_SIZE.
int cisv_parser_write(cisv_parser *parser, const uint8_t *chunk, size_t len) {
    if (!parser || !chunk || len >= RINGBUF_SIZE) return -EINVAL;

    // Make room: flush what is pending if the chunk would not fit.
    if (parser->head + len > RINGBUF_SIZE) {
        parse_memory(parser, parser->ring, parser->head);
        parser->head = 0;
    }

    const size_t chunk_begin = parser->head;
    memcpy(parser->ring + parser->head, chunk, len);
    parser->head += len;

    // Find the end of the last complete record. Bytes before chunk_begin
    // cannot contain a newline (they were left behind as an incomplete tail
    // or the buffer was just flushed), so only the new bytes are searched,
    // back to front.
    size_t complete = parser->head;
    while (complete > chunk_begin && parser->ring[complete - 1] != '\n') {
        complete--;
    }

    if (complete > chunk_begin) {
        parse_memory(parser, parser->ring, complete);

        // Slide the incomplete tail to the front of the buffer.
        const size_t tail = parser->head - complete;
        memmove(parser->ring, parser->ring + complete, tail);
        parser->head = tail;
    }

    // Do not let an unterminated record grow without bound.
    if (parser->head > (RINGBUF_SIZE / 2)) {
        parse_memory(parser, parser->ring, parser->head);
        parser->head = 0;
    }
    return 0;
}
|
|
574
|
+
|
|
575
|
+
// Finish a streamed parse: parse whatever is still staged in the buffer and
// mark the buffer empty. Does nothing for a NULL parser or an empty buffer.
void cisv_parser_end(cisv_parser *parser) {
    if (parser != NULL && parser->head != 0) {
        parse_memory(parser, parser->ring, parser->head);
        parser->head = 0;
    }
}
|
|
580
|
+
|
|
581
|
+
#ifdef CISV_CLI
|
|
582
|
+
|
|
583
|
+
#include "cisv_writer.h"
|
|
584
|
+
|
|
585
|
+
// Forward declaration for writer
|
|
586
|
+
int cisv_writer_main(int argc, char *argv[]);
|
|
587
|
+
|
|
588
|
+
// State shared between the command-line front end and the parser callbacks.
// Field order is unchanged.
typedef struct {
    size_t row_count;               // rows kept so far (rows skipped by --head are not counted)
    size_t field_count;             // NOTE(review): not referenced by the callbacks in this file section
    int count_only;                 // non-zero when -c/--count was given
    int head;                       // --head N: rows at index >= N are discarded (0 = no limit)
    int tail;                       // --tail N: number of slots in tail_buffer (0 = disabled)
    char delimiter;                 // written between fields on output; set by -d, default ','
    int *select_cols;               // column indices from -s/--select, NULL when all columns are printed
    int select_count;               // number of entries in select_cols
    FILE *output;                   // stream rows are written to

    // --tail support: circular store of the most recent rows
    char ***tail_buffer;            // one row (array of field strings) per slot, NULL while unused
    size_t *tail_field_counts;      // number of fields held by each slot
    size_t tail_pos;                // slot the next row goes into

    // row currently being assembled by field_callback
    char **current_row;             // heap-allocated field strings
    size_t current_field_count;     // fields stored in current_row
    size_t current_field_capacity;  // allocated length of current_row
    size_t current_row_num;         // rows seen so far, including discarded ones
    int in_header;                  // NOTE(review): not referenced by the callbacks in this file section
} cli_context;
|
|
609
|
+
|
|
610
|
+
static void field_callback(void *user, const char *data, size_t len) {
|
|
611
|
+
cli_context *ctx = (cli_context *)user;
|
|
612
|
+
|
|
613
|
+
// Ensure we have space for the field
|
|
614
|
+
if (ctx->current_field_count >= ctx->current_field_capacity) {
|
|
615
|
+
size_t new_capacity = ctx->current_field_capacity * 2;
|
|
616
|
+
if (new_capacity < 16) new_capacity = 16;
|
|
617
|
+
|
|
618
|
+
char **new_row = realloc(ctx->current_row, new_capacity * sizeof(char *));
|
|
619
|
+
if (!new_row) {
|
|
620
|
+
fprintf(stderr, "Failed to allocate memory for fields\n");
|
|
621
|
+
exit(1);
|
|
622
|
+
}
|
|
623
|
+
ctx->current_row = new_row;
|
|
624
|
+
ctx->current_field_capacity = new_capacity;
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
// Allocate and copy field data
|
|
628
|
+
ctx->current_row[ctx->current_field_count] = malloc(len + 1);
|
|
629
|
+
if (!ctx->current_row[ctx->current_field_count]) {
|
|
630
|
+
fprintf(stderr, "Failed to allocate memory for field data\n");
|
|
631
|
+
exit(1);
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
memcpy(ctx->current_row[ctx->current_field_count], data, len);
|
|
635
|
+
ctx->current_row[ctx->current_field_count][len] = '\0';
|
|
636
|
+
ctx->current_field_count++;
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
static void row_callback(void *user) {
|
|
640
|
+
cli_context *ctx = (cli_context *)user;
|
|
641
|
+
|
|
642
|
+
// Skip row if we're past head limit
|
|
643
|
+
if (ctx->head > 0 && ctx->current_row_num >= (size_t)ctx->head) {
|
|
644
|
+
for (size_t i = 0; i < ctx->current_field_count; i++) {
|
|
645
|
+
free(ctx->current_row[i]);
|
|
646
|
+
}
|
|
647
|
+
ctx->current_field_count = 0;
|
|
648
|
+
ctx->current_row_num++;
|
|
649
|
+
return;
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
// Handle tail buffering
|
|
653
|
+
if (ctx->tail > 0) {
|
|
654
|
+
// Free old row if buffer is full
|
|
655
|
+
if (ctx->tail_buffer[ctx->tail_pos]) {
|
|
656
|
+
for (size_t i = 0; i < ctx->tail_field_counts[ctx->tail_pos]; i++) {
|
|
657
|
+
free(ctx->tail_buffer[ctx->tail_pos][i]);
|
|
658
|
+
}
|
|
659
|
+
free(ctx->tail_buffer[ctx->tail_pos]);
|
|
660
|
+
}
|
|
661
|
+
|
|
662
|
+
// Store current row in circular buffer
|
|
663
|
+
ctx->tail_buffer[ctx->tail_pos] = ctx->current_row;
|
|
664
|
+
ctx->tail_field_counts[ctx->tail_pos] = ctx->current_field_count;
|
|
665
|
+
ctx->tail_pos = (ctx->tail_pos + 1) % ctx->tail;
|
|
666
|
+
|
|
667
|
+
// Allocate new row
|
|
668
|
+
ctx->current_row = calloc(ctx->current_field_capacity, sizeof(char *));
|
|
669
|
+
if (!ctx->current_row) {
|
|
670
|
+
fprintf(stderr, "Failed to allocate memory for new row\n");
|
|
671
|
+
exit(1);
|
|
672
|
+
}
|
|
673
|
+
ctx->current_field_count = 0;
|
|
674
|
+
} else {
|
|
675
|
+
// Output row immediately
|
|
676
|
+
int first = 1;
|
|
677
|
+
for (size_t i = 0; i < ctx->current_field_count; i++) {
|
|
678
|
+
int should_output = 1;
|
|
679
|
+
|
|
680
|
+
if (ctx->select_cols && ctx->select_count > 0) {
|
|
681
|
+
should_output = 0;
|
|
682
|
+
for (int j = 0; j < ctx->select_count; j++) {
|
|
683
|
+
if (ctx->select_cols[j] == (int)i) {
|
|
684
|
+
should_output = 1;
|
|
685
|
+
break;
|
|
686
|
+
}
|
|
687
|
+
}
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
if (should_output) {
|
|
691
|
+
if (!first) fprintf(ctx->output, "%c", ctx->delimiter);
|
|
692
|
+
fprintf(ctx->output, "%s", ctx->current_row[i]);
|
|
693
|
+
first = 0;
|
|
694
|
+
}
|
|
695
|
+
|
|
696
|
+
free(ctx->current_row[i]);
|
|
697
|
+
}
|
|
698
|
+
fprintf(ctx->output, "\n");
|
|
699
|
+
ctx->current_field_count = 0;
|
|
700
|
+
}
|
|
701
|
+
|
|
702
|
+
ctx->row_count++;
|
|
703
|
+
ctx->current_row_num++;
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
// Write the usage text for the CLI to stdout. `prog` is the program name
// inserted into the usage and example lines.
static void print_help(const char *prog) {
    fputs("cisv - The fastest CSV parser of the multiverse\n\n", stdout);
    printf("Usage: %s [COMMAND] [OPTIONS] [FILE]\n\n", prog);

    fputs("Commands:\n"
          " parse Parse CSV file (default if no command given)\n"
          " write Write/generate CSV files\n\n"
          "\n",
          stdout);

    fputs("Options:\n"
          " -h, --help Show this help message\n"
          " -v, --version Show version information\n"
          " -d, --delimiter DELIM Field delimiter (default: ,)\n"
          " -s, --select COLS Select columns (comma-separated indices)\n"
          " -c, --count Show only row count\n"
          " --head N Show first N rows\n"
          " --tail N Show last N rows\n"
          " -o, --output FILE Write to FILE instead of stdout\n"
          " -b, --benchmark Run benchmark mode\n",
          stdout);

    fputs("\n----------\nExamples:\n", stdout);
    printf(" %s data.csv # Parse and display CSV\n", prog);
    printf(" %s -c data.csv # Count rows\n", prog);
    printf(" %s -s 0,2,3 data.csv # Select columns 0, 2, and 3\n", prog);
    printf(" %s --head 10 data.csv # Show first 10 rows\n", prog);
    printf(" %s -d ';' data.csv # Use semicolon as delimiter\n", prog);

    printf("\nFor write options, use: %s write --help\n", prog);
}
|
|
731
|
+
|
|
732
|
+
// Current wall-clock time in milliseconds, as reported by gettimeofday().
// Intended for measuring elapsed time as the difference of two calls.
// Declared with (void) so the function has a real prototype; an empty
// parameter list is an obsolescent non-prototype declaration in C11.
static double get_time_ms(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}
|
|
737
|
+
|
|
738
|
+
// Benchmark mode: print the size of `filename`, then time
// cisv_parser_count_rows() on it five times and print each run's duration
// and row count. If the file cannot be opened or its size cannot be
// determined, the error is reported with perror() and nothing is measured.
static void benchmark_file(const char *filename) {
    FILE *f = fopen(filename, "rb");
    if (!f) {
        perror("fopen");
        return;
    }

    // Determine the size by seeking to the end. Both calls can fail (for
    // example on a non-seekable file); check them so a bogus negative size
    // is never printed.
    if (fseek(f, 0, SEEK_END) != 0) {
        perror("fseek");
        fclose(f);
        return;
    }
    long size = ftell(f);
    if (size < 0) {
        perror("ftell");
        fclose(f);
        return;
    }
    fclose(f);

    double size_mb = size / (1024.0 * 1024.0);
    printf("Benchmarking file: %s\n", filename);
    printf("File size: %.2f MB\n\n", size_mb);

    const int iterations = 5;
    for (int i = 0; i < iterations; i++) {
        double start = get_time_ms();
        size_t count = cisv_parser_count_rows(filename);
        double end = get_time_ms();

        printf("Run %d: %.2f ms, %zu rows\n", i + 1, end - start, count);
    }
}
|
|
762
|
+
|
|
763
|
+
int main(int argc, char *argv[]) {
|
|
764
|
+
// Check for write command
|
|
765
|
+
if (argc > 1 && strcmp(argv[1], "write") == 0) {
|
|
766
|
+
// Shift arguments and call writer main
|
|
767
|
+
return cisv_writer_main(argc - 1, argv + 1);
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
// If first arg is "parse", skip it
|
|
771
|
+
if (argc > 1 && strcmp(argv[1], "parse") == 0) {
|
|
772
|
+
argc--;
|
|
773
|
+
argv++;
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
static struct option long_options[] = {
|
|
777
|
+
{"help", no_argument, 0, 'h'},
|
|
778
|
+
{"version", no_argument, 0, 'v'},
|
|
779
|
+
{"delimiter", required_argument, 0, 'd'},
|
|
780
|
+
{"select", required_argument, 0, 's'},
|
|
781
|
+
{"count", no_argument, 0, 'c'},
|
|
782
|
+
{"head", required_argument, 0, 1},
|
|
783
|
+
{"tail", required_argument, 0, 2},
|
|
784
|
+
{"output", required_argument, 0, 'o'},
|
|
785
|
+
{"benchmark", no_argument, 0, 'b'},
|
|
786
|
+
{0, 0, 0, 0}
|
|
787
|
+
};
|
|
788
|
+
|
|
789
|
+
// Initialize context
|
|
790
|
+
cli_context ctx = {0};
|
|
791
|
+
ctx.delimiter = ',';
|
|
792
|
+
ctx.output = stdout;
|
|
793
|
+
ctx.current_field_capacity = 16;
|
|
794
|
+
ctx.current_row = calloc(ctx.current_field_capacity, sizeof(char *));
|
|
795
|
+
if (!ctx.current_row) {
|
|
796
|
+
fprintf(stderr, "Failed to allocate initial row buffer\n");
|
|
797
|
+
return 1;
|
|
798
|
+
}
|
|
799
|
+
|
|
800
|
+
int opt;
|
|
801
|
+
int option_index = 0;
|
|
802
|
+
const char *filename = NULL;
|
|
803
|
+
const char *output_file = NULL;
|
|
804
|
+
int benchmark = 0;
|
|
805
|
+
|
|
806
|
+
while ((opt = getopt_long(argc, argv, "hvd:s:co:b", long_options, &option_index)) != -1) {
|
|
807
|
+
switch (opt) {
|
|
808
|
+
case 'h':
|
|
809
|
+
print_help(argv[0]);
|
|
810
|
+
free(ctx.current_row);
|
|
811
|
+
return 0;
|
|
812
|
+
|
|
813
|
+
case 'v':
|
|
814
|
+
printf("cisv version v0.0.1-rc10\n");
|
|
815
|
+
free(ctx.current_row);
|
|
816
|
+
return 0;
|
|
817
|
+
|
|
818
|
+
case 'd':
|
|
819
|
+
ctx.delimiter = optarg[0];
|
|
820
|
+
break;
|
|
821
|
+
|
|
822
|
+
case 's': {
|
|
823
|
+
// Parse column selection
|
|
824
|
+
char *cols_copy = strdup(optarg);
|
|
825
|
+
if (!cols_copy) {
|
|
826
|
+
fprintf(stderr, "Memory allocation failed\n");
|
|
827
|
+
free(ctx.current_row);
|
|
828
|
+
return 1;
|
|
829
|
+
}
|
|
830
|
+
|
|
831
|
+
// Count columns
|
|
832
|
+
int count = 1;
|
|
833
|
+
for (char *p = cols_copy; *p; p++) {
|
|
834
|
+
if (*p == ',') count++;
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
ctx.select_cols = calloc(count, sizeof(int));
|
|
838
|
+
if (!ctx.select_cols) {
|
|
839
|
+
fprintf(stderr, "Memory allocation failed\n");
|
|
840
|
+
free(cols_copy);
|
|
841
|
+
free(ctx.current_row);
|
|
842
|
+
return 1;
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
// Parse column numbers
|
|
846
|
+
char *tok = strtok(cols_copy, ",");
|
|
847
|
+
int i = 0;
|
|
848
|
+
while (tok && i < count) {
|
|
849
|
+
ctx.select_cols[i++] = atoi(tok);
|
|
850
|
+
tok = strtok(NULL, ",");
|
|
851
|
+
}
|
|
852
|
+
ctx.select_count = i;
|
|
853
|
+
|
|
854
|
+
free(cols_copy);
|
|
855
|
+
break;
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
case 'c':
|
|
859
|
+
ctx.count_only = 1;
|
|
860
|
+
break;
|
|
861
|
+
|
|
862
|
+
case 'o':
|
|
863
|
+
output_file = optarg;
|
|
864
|
+
break;
|
|
865
|
+
|
|
866
|
+
case 'b':
|
|
867
|
+
benchmark = 1;
|
|
868
|
+
break;
|
|
869
|
+
|
|
870
|
+
case 1: // --head
|
|
871
|
+
ctx.head = atoi(optarg);
|
|
872
|
+
break;
|
|
873
|
+
|
|
874
|
+
case 2: // --tail
|
|
875
|
+
ctx.tail = atoi(optarg);
|
|
876
|
+
ctx.tail_buffer = calloc(ctx.tail, sizeof(char **));
|
|
877
|
+
ctx.tail_field_counts = calloc(ctx.tail, sizeof(size_t));
|
|
878
|
+
if (!ctx.tail_buffer || !ctx.tail_field_counts) {
|
|
879
|
+
fprintf(stderr, "Memory allocation failed\n");
|
|
880
|
+
free(ctx.current_row);
|
|
881
|
+
free(ctx.select_cols);
|
|
882
|
+
return 1;
|
|
883
|
+
}
|
|
884
|
+
break;
|
|
885
|
+
|
|
886
|
+
default:
|
|
887
|
+
fprintf(stderr, "Try '%s --help' for more information.\n", argv[0]);
|
|
888
|
+
free(ctx.current_row);
|
|
889
|
+
free(ctx.select_cols);
|
|
890
|
+
return 1;
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
if (optind < argc) {
|
|
895
|
+
filename = argv[optind];
|
|
896
|
+
}
|
|
897
|
+
|
|
898
|
+
if (!filename) {
|
|
899
|
+
fprintf(stderr, "Error: No input file specified\n");
|
|
900
|
+
print_help(argv[0]);
|
|
901
|
+
free(ctx.current_row);
|
|
902
|
+
free(ctx.select_cols);
|
|
903
|
+
return 1;
|
|
904
|
+
}
|
|
905
|
+
|
|
906
|
+
if (benchmark) {
|
|
907
|
+
benchmark_file(filename);
|
|
908
|
+
free(ctx.current_row);
|
|
909
|
+
free(ctx.select_cols);
|
|
910
|
+
return 0;
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
if (ctx.count_only) {
|
|
914
|
+
size_t count = cisv_parser_count_rows(filename);
|
|
915
|
+
printf("%zu\n", count);
|
|
916
|
+
free(ctx.current_row);
|
|
917
|
+
free(ctx.select_cols);
|
|
918
|
+
return 0;
|
|
919
|
+
}
|
|
920
|
+
|
|
921
|
+
if (output_file) {
|
|
922
|
+
ctx.output = fopen(output_file, "w");
|
|
923
|
+
if (!ctx.output) {
|
|
924
|
+
perror("fopen");
|
|
925
|
+
free(ctx.current_row);
|
|
926
|
+
free(ctx.select_cols);
|
|
927
|
+
free(ctx.tail_buffer);
|
|
928
|
+
free(ctx.tail_field_counts);
|
|
929
|
+
return 1;
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
cisv_parser *parser = cisv_parser_create(field_callback, row_callback, &ctx);
|
|
934
|
+
if (!parser) {
|
|
935
|
+
fprintf(stderr, "Failed to create parser\n");
|
|
936
|
+
free(ctx.current_row);
|
|
937
|
+
free(ctx.select_cols);
|
|
938
|
+
free(ctx.tail_buffer);
|
|
939
|
+
free(ctx.tail_field_counts);
|
|
940
|
+
if (ctx.output != stdout) fclose(ctx.output);
|
|
941
|
+
return 1;
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
int result = cisv_parser_parse_file(parser, filename);
|
|
945
|
+
if (result < 0) {
|
|
946
|
+
fprintf(stderr, "Parse error: %s\n", strerror(-result));
|
|
947
|
+
cisv_parser_destroy(parser);
|
|
948
|
+
free(ctx.current_row);
|
|
949
|
+
free(ctx.select_cols);
|
|
950
|
+
free(ctx.tail_buffer);
|
|
951
|
+
free(ctx.tail_field_counts);
|
|
952
|
+
if (ctx.output != stdout) fclose(ctx.output);
|
|
953
|
+
return 1;
|
|
954
|
+
}
|
|
955
|
+
|
|
956
|
+
// Output tail buffer if used
|
|
957
|
+
if (ctx.tail > 0 && ctx.tail_buffer) {
|
|
958
|
+
size_t start = ctx.tail_pos;
|
|
959
|
+
for (int i = 0; i < ctx.tail; i++) {
|
|
960
|
+
size_t idx = (start + i) % ctx.tail;
|
|
961
|
+
if (!ctx.tail_buffer[idx]) continue;
|
|
962
|
+
|
|
963
|
+
int first = 1;
|
|
964
|
+
for (size_t j = 0; j < ctx.tail_field_counts[idx]; j++) {
|
|
965
|
+
if (!first) fprintf(ctx.output, "%c", ctx.delimiter);
|
|
966
|
+
fprintf(ctx.output, "%s", ctx.tail_buffer[idx][j]);
|
|
967
|
+
free(ctx.tail_buffer[idx][j]);
|
|
968
|
+
first = 0;
|
|
969
|
+
}
|
|
970
|
+
fprintf(ctx.output, "\n");
|
|
971
|
+
free(ctx.tail_buffer[idx]);
|
|
972
|
+
}
|
|
973
|
+
free(ctx.tail_buffer);
|
|
974
|
+
free(ctx.tail_field_counts);
|
|
975
|
+
}
|
|
976
|
+
|
|
977
|
+
cisv_parser_destroy(parser);
|
|
978
|
+
free(ctx.current_row);
|
|
979
|
+
free(ctx.select_cols);
|
|
980
|
+
|
|
981
|
+
if (ctx.output != stdout) {
|
|
982
|
+
fclose(ctx.output);
|
|
983
|
+
}
|
|
984
|
+
|
|
985
|
+
return 0;
|
|
986
|
+
}
|
|
987
|
+
|
|
988
|
+
#endif // CISV_CLI
|