cisv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,988 @@
1
+ #define _GNU_SOURCE
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <string.h>
5
+ #include "win_getopt.h"
6
+ #include <sys/stat.h>
7
+ #include <errno.h>
8
+ #include <time.h>
9
+ // NOTE: Windows is not supported for now; there are too many issues
10
+ #include <sys/mman.h>
11
+ #include <fcntl.h>
12
+ #include <unistd.h>
13
+ #include <getopt.h>
14
+ #include <sys/time.h>
15
+ #include "./cisv_simd.h"
16
+ #include "cisv_parser.h"
17
+
18
// Capacity in bytes of the staging buffer used by cisv_parser_write().
// Currently 64 KiB to keep memory use small; it may be raised (for example
// to 1 << 20, i.e. 1 MiB) if needed.
#define RINGBUF_SIZE (1 << 16)
// Byte offset ahead of the cursor handed to __builtin_prefetch by the x86 SIMD parser.
#define PREFETCH_DISTANCE 256
21
+
22
+ struct cisv_parser {
23
+ uint8_t *base; // pointer to the whole input, if memory-mapped
24
+ size_t size; // length of that mapping
25
+ int fd; // the underlying file descriptor (-1 ⇒ none)
26
+ uint8_t *ring; // malloc’ed circular buffer when not mmapped
27
+ size_t head; // write head: next byte slot to fill
28
+ uint8_t st;
29
+ cisv_field_cb fcb; // field callback fired whenever a full cell is ready
30
+ // (delimiter or row-ending newline encountered, consistent with RFC 4180 rules)
31
+
32
+ cisv_row_cb rcb; // row callback fired after the last field of each record
33
+ void *user;
34
+ const uint8_t *field_start; // where the in-progress field began
35
+ };
36
// Parser states; each value is also the first index into the lookup tables.
#define S_UNQUOTED  0
#define S_QUOTED    1
#define S_QUOTE_ESC 2

// Bit flags stored in action_table describing what to do for a byte.
#define ACT_NONE      0
#define ACT_FIELD     1
#define ACT_ROW       2
#define ACT_REPROCESS 4

// Lookup tables indexed by [state][byte]. They are filled at run time, on
// first use, by init_tables(); tables_initialized records that this happened.
static uint8_t state_table[3][256];
static uint8_t action_table[3][256];
static int tables_initialized = 0;
51
+
52
+ // Initialize lookup tables (called lazily)
53
+ static void init_tables(void) {
54
+ if (tables_initialized) return;
55
+
56
+ // Initialize state transitions
57
+ for (int c = 0; c < 256; c++) {
58
+ // S_UNQUOTED transitions
59
+ state_table[S_UNQUOTED][c] = S_UNQUOTED;
60
+ if (c == '"') state_table[S_UNQUOTED][c] = S_QUOTED;
61
+
62
+ // S_QUOTED transitions
63
+ state_table[S_QUOTED][c] = S_QUOTED;
64
+ if (c == '"') state_table[S_QUOTED][c] = S_QUOTE_ESC;
65
+
66
+ // S_QUOTE_ESC transitions
67
+ if (c == '"') {
68
+ state_table[S_QUOTE_ESC][c] = S_QUOTED;
69
+ } else {
70
+ state_table[S_QUOTE_ESC][c] = S_UNQUOTED;
71
+ }
72
+ }
73
+
74
+ // Initialize action table
75
+ memset(action_table, ACT_NONE, sizeof(action_table));
76
+
77
+ // S_UNQUOTED actions
78
+ action_table[S_UNQUOTED][','] = ACT_FIELD;
79
+ action_table[S_UNQUOTED]['\n'] = ACT_FIELD | ACT_ROW;
80
+
81
+ // S_QUOTE_ESC actions
82
+ for (int c = 0; c < 256; c++) {
83
+ if (c != '"') {
84
+ action_table[S_QUOTE_ESC][c] = ACT_REPROCESS;
85
+ }
86
+ }
87
+
88
+ tables_initialized = 1;
89
+ }
90
+
91
+ static inline void yield_field(cisv_parser *parser, const uint8_t *start, const uint8_t *end) {
92
+ // Branchless check: multiply callback by validity flag
93
+ size_t valid = (parser->fcb != NULL) & (start != NULL) & (end != NULL) & (end >= start);
94
+ if (valid) {
95
+ parser->fcb(parser->user, (const char *)start, (size_t)(end - start));
96
+ }
97
+ }
98
+
99
+ static inline void yield_row(cisv_parser *parser) {
100
+ if (parser->rcb) {
101
+ parser->rcb(parser->user);
102
+ }
103
+ }
104
+
105
+ #if defined(cisv_HAVE_AVX512) || defined(cisv_HAVE_AVX2)
106
+
107
+ static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t len) {
108
+ // Ensure tables are initialized
109
+ if (!tables_initialized) init_tables();
110
+
111
+ const uint8_t *cur = buffer;
112
+ const uint8_t *end = buffer + len;
113
+
114
+ // SIMD constants - create them on stack to avoid segfault
115
+ uint8_t comma_bytes[64];
116
+ uint8_t quote_bytes[64];
117
+ uint8_t newline_bytes[64];
118
+ memset(comma_bytes, ',', 64);
119
+ memset(quote_bytes, '"', 64);
120
+ memset(newline_bytes, '\n', 64);
121
+
122
+ const cisv_vec comma_vec = cisv_LOAD(comma_bytes);
123
+ const cisv_vec quote_vec = cisv_LOAD(quote_bytes);
124
+ const cisv_vec newline_vec = cisv_LOAD(newline_bytes);
125
+
126
+ while (cur + cisv_VEC_BYTES <= end) {
127
+ // Prefetch next chunk
128
+ __builtin_prefetch(cur + PREFETCH_DISTANCE, 0, 1);
129
+
130
+ // Fast path for unquoted state
131
+ if (parser->st == S_UNQUOTED) {
132
+ cisv_vec chunk = cisv_LOAD(cur);
133
+
134
+ #ifdef cisv_HAVE_AVX512
135
+ uint64_t comma_mask = cisv_CMP_EQ(chunk, comma_vec);
136
+ uint64_t quote_mask = cisv_CMP_EQ(chunk, quote_vec);
137
+ uint64_t newline_mask = cisv_CMP_EQ(chunk, newline_vec);
138
+ uint64_t combined = comma_mask | quote_mask | newline_mask;
139
+ #else
140
+ cisv_vec comma_cmp = cisv_CMP_EQ(chunk, comma_vec);
141
+ cisv_vec quote_cmp = cisv_CMP_EQ(chunk, quote_vec);
142
+ cisv_vec newline_cmp = cisv_CMP_EQ(chunk, newline_vec);
143
+ cisv_vec combined_vec = cisv_OR_MASK(cisv_OR_MASK(comma_cmp, quote_cmp), newline_cmp);
144
+ uint32_t combined = cisv_MOVEMASK(combined_vec);
145
+ #endif
146
+
147
+ if (!combined) {
148
+ // No special chars, skip entire vector
149
+ cur += cisv_VEC_BYTES;
150
+ continue;
151
+ }
152
+
153
+ // Process special characters
154
+ while (combined) {
155
+ size_t pos = cisv_CTZ(combined);
156
+ const uint8_t *special_pos = cur + pos;
157
+ uint8_t c = *special_pos;
158
+
159
+ // Branchless field/row handling
160
+ uint8_t is_comma = (c == ',');
161
+ uint8_t is_newline = (c == '\n');
162
+ uint8_t is_quote = (c == '"');
163
+
164
+ // Process field before special char
165
+ if (special_pos > parser->field_start && (is_comma | is_newline)) {
166
+ yield_field(parser, parser->field_start, special_pos);
167
+ parser->field_start = special_pos + 1;
168
+ }
169
+
170
+ // Handle newline
171
+ if (is_newline) {
172
+ yield_row(parser);
173
+ }
174
+
175
+ // Update state branchlessly
176
+ parser->st = (parser->st & ~is_quote) | (S_QUOTED & -is_quote);
177
+
178
+ // Clear processed bit
179
+ combined &= combined - 1;
180
+ }
181
+
182
+ cur += cisv_VEC_BYTES;
183
+ } else {
184
+ // In quoted state - need scalar processing
185
+ break;
186
+ }
187
+ }
188
+
189
+ // Handle remainder with scalar code
190
+ while (cur < end) {
191
+ uint8_t c = *cur;
192
+ uint8_t next_state = state_table[parser->st][c];
193
+ uint8_t action = action_table[parser->st][c];
194
+
195
+ // Branchless state update
196
+ parser->st = next_state;
197
+
198
+ // Handle actions branchlessly where possible
199
+ if (action & ACT_FIELD) {
200
+ yield_field(parser, parser->field_start, cur);
201
+ parser->field_start = cur + 1;
202
+ }
203
+ if (action & ACT_ROW) {
204
+ yield_row(parser);
205
+ }
206
+
207
+ // Reprocess requires going back
208
+ cur += 1 - ((action & ACT_REPROCESS) >> 2);
209
+ }
210
+ }
211
+
212
+ #elif defined(HAS_NEON)
213
+
214
+ // ARM NEON optimized parsing
215
+ static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t len) {
216
+ // Ensure tables are initialized
217
+ if (!tables_initialized) init_tables();
218
+
219
+ const uint8_t *cur = buffer;
220
+ const uint8_t *end = buffer + len;
221
+
222
+ // NEON constants
223
+ uint8x16_t comma_vec = vdupq_n_u8(',');
224
+ uint8x16_t quote_vec = vdupq_n_u8('"');
225
+ uint8x16_t newline_vec = vdupq_n_u8('\n');
226
+
227
+ while (cur + 16 <= end && parser->st == S_UNQUOTED) {
228
+ // Prefetch next chunk
229
+ __builtin_prefetch(cur + 64, 0, 1);
230
+
231
+ // Load 16 bytes
232
+ uint8x16_t chunk = vld1q_u8(cur);
233
+
234
+ // Compare with special characters
235
+ uint8x16_t comma_cmp = vceqq_u8(chunk, comma_vec);
236
+ uint8x16_t quote_cmp = vceqq_u8(chunk, quote_vec);
237
+ uint8x16_t newline_cmp = vceqq_u8(chunk, newline_vec);
238
+
239
+ // Combine masks
240
+ uint8x16_t combined = vorrq_u8(vorrq_u8(comma_cmp, quote_cmp), newline_cmp);
241
+
242
+ // Check if any special char found
243
+ uint64_t mask_low = vgetq_lane_u64(vreinterpretq_u64_u8(combined), 0);
244
+ uint64_t mask_high = vgetq_lane_u64(vreinterpretq_u64_u8(combined), 1);
245
+
246
+ if (!(mask_low | mask_high)) {
247
+ // No special chars, advance
248
+ cur += 16;
249
+ continue;
250
+ }
251
+
252
+ // Process special characters byte by byte
253
+ for (int i = 0; i < 16 && cur + i < end; i++) {
254
+ uint8_t c = cur[i];
255
+ if (c == ',' || c == '\n' || c == '"') {
256
+ if (c == ',' || c == '\n') {
257
+ if (cur + i > parser->field_start) {
258
+ yield_field(parser, parser->field_start, cur + i);
259
+ parser->field_start = cur + i + 1;
260
+ }
261
+ if (c == '\n') {
262
+ yield_row(parser);
263
+ }
264
+ } else if (c == '"') {
265
+ parser->st = S_QUOTED;
266
+ cur += i + 1;
267
+ goto scalar_fallback;
268
+ }
269
+ }
270
+ }
271
+ cur += 16;
272
+ }
273
+
274
+ scalar_fallback:
275
+ // Handle remainder with scalar code
276
+ while (cur < end) {
277
+ uint8_t c = *cur;
278
+ uint8_t next_state = state_table[parser->st][c];
279
+ uint8_t action = action_table[parser->st][c];
280
+
281
+ parser->st = next_state;
282
+
283
+ if (action & ACT_FIELD) {
284
+ yield_field(parser, parser->field_start, cur);
285
+ parser->field_start = cur + 1;
286
+ }
287
+ if (action & ACT_ROW) {
288
+ yield_row(parser);
289
+ }
290
+
291
+ cur += 1 - ((action & ACT_REPROCESS) >> 2);
292
+ }
293
+ }
294
+
295
+ #else
296
+ // Non-SIMD fallback with branchless optimizations
297
+ static void parse_simd_chunk(cisv_parser *parser, const uint8_t *buffer, size_t len) {
298
+ // Ensure tables are initialized
299
+ if (!tables_initialized) init_tables();
300
+
301
+ const uint8_t *cur = buffer;
302
+ const uint8_t *end = buffer + len;
303
+
304
+ // Unroll loop by 8 for better performance
305
+ while (cur + 8 <= end) {
306
+ __builtin_prefetch(cur + 64, 0, 1);
307
+
308
+ for (int i = 0; i < 8; i++) {
309
+ uint8_t c = cur[i];
310
+ uint8_t next_state = state_table[parser->st][c];
311
+ uint8_t action = action_table[parser->st][c];
312
+
313
+ parser->st = next_state;
314
+
315
+ if (action & ACT_FIELD) {
316
+ yield_field(parser, parser->field_start, cur + i);
317
+ parser->field_start = cur + i + 1;
318
+ }
319
+ if (action & ACT_ROW) {
320
+ yield_row(parser);
321
+ }
322
+ if (action & ACT_REPROCESS) {
323
+ i--;
324
+ }
325
+ }
326
+ cur += 8;
327
+ }
328
+
329
+ // Handle remainder
330
+ while (cur < end) {
331
+ uint8_t c = *cur;
332
+ uint8_t next_state = state_table[parser->st][c];
333
+ uint8_t action = action_table[parser->st][c];
334
+
335
+ parser->st = next_state;
336
+
337
+ if (action & ACT_FIELD) {
338
+ yield_field(parser, parser->field_start, cur);
339
+ parser->field_start = cur + 1;
340
+ }
341
+ if (action & ACT_ROW) {
342
+ yield_row(parser);
343
+ }
344
+
345
+ cur += 1 - ((action & ACT_REPROCESS) >> 2);
346
+ }
347
+ }
348
+ #endif
349
+
350
+ static int parse_memory(cisv_parser *parser, const uint8_t *buffer, size_t len) {
351
+ parser->field_start = buffer;
352
+ parser->st = S_UNQUOTED;
353
+
354
+ parse_simd_chunk(parser, buffer, len);
355
+
356
+ // Handle final field if needed
357
+ if (parser->field_start < buffer + len) {
358
+ yield_field(parser, parser->field_start, buffer + len);
359
+ }
360
+ return 0;
361
+ }
362
+
363
+ size_t cisv_parser_count_rows(const char *path) {
364
+ int fd = open(path, O_RDONLY);
365
+ if (fd < 0) return 0;
366
+
367
+ struct stat st;
368
+ if (fstat(fd, &st) < 0) {
369
+ close(fd);
370
+ return 0;
371
+ }
372
+
373
+ if (st.st_size == 0) {
374
+ close(fd);
375
+ return 0;
376
+ }
377
+
378
+ uint8_t *base = (uint8_t *)mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
379
+ if (base == MAP_FAILED) {
380
+ close(fd);
381
+ return 0;
382
+ }
383
+
384
+ // Use madvise for sequential access
385
+ madvise(base, st.st_size, MADV_SEQUENTIAL);
386
+
387
+ size_t count = 0;
388
+
389
+ #if defined(cisv_HAVE_AVX512) || defined(cisv_HAVE_AVX2)
390
+ // x86 SIMD path
391
+ uint8_t newline_bytes[64];
392
+ memset(newline_bytes, '\n', 64);
393
+ const cisv_vec newline_vec = cisv_LOAD(newline_bytes);
394
+
395
+ size_t i = 0;
396
+ for (; i + cisv_VEC_BYTES <= (size_t)st.st_size; i += cisv_VEC_BYTES) {
397
+ cisv_vec chunk = cisv_LOAD(base + i);
398
+ #ifdef cisv_HAVE_AVX512
399
+ uint64_t mask = cisv_CMP_EQ(chunk, newline_vec);
400
+ count += __builtin_popcountll(mask);
401
+ #else
402
+ uint32_t mask = cisv_MOVEMASK(cisv_CMP_EQ(chunk, newline_vec));
403
+ count += __builtin_popcount(mask);
404
+ #endif
405
+ }
406
+
407
+ // Handle remainder
408
+ for (; i < (size_t)st.st_size; i++) {
409
+ count += (base[i] == '\n');
410
+ }
411
+ #elif defined(HAS_NEON)
412
+ // ARM NEON path
413
+ uint8x16_t newline_vec = vdupq_n_u8('\n');
414
+ size_t i = 0;
415
+
416
+ // Process 64 bytes at a time using 4 NEON registers
417
+ for (; i + 64 <= (size_t)st.st_size; i += 64) {
418
+ uint8x16_t data0 = vld1q_u8(base + i);
419
+ uint8x16_t data1 = vld1q_u8(base + i + 16);
420
+ uint8x16_t data2 = vld1q_u8(base + i + 32);
421
+ uint8x16_t data3 = vld1q_u8(base + i + 48);
422
+
423
+ uint8x16_t cmp0 = vceqq_u8(data0, newline_vec);
424
+ uint8x16_t cmp1 = vceqq_u8(data1, newline_vec);
425
+ uint8x16_t cmp2 = vceqq_u8(data2, newline_vec);
426
+ uint8x16_t cmp3 = vceqq_u8(data3, newline_vec);
427
+
428
+ // Count set bits in comparison results
429
+ count += vaddvq_u8(vandq_u8(cmp0, vdupq_n_u8(1)));
430
+ count += vaddvq_u8(vandq_u8(cmp1, vdupq_n_u8(1)));
431
+ count += vaddvq_u8(vandq_u8(cmp2, vdupq_n_u8(1)));
432
+ count += vaddvq_u8(vandq_u8(cmp3, vdupq_n_u8(1)));
433
+ }
434
+
435
+ // Process 16 bytes at a time
436
+ for (; i + 16 <= (size_t)st.st_size; i += 16) {
437
+ uint8x16_t data = vld1q_u8(base + i);
438
+ uint8x16_t cmp = vceqq_u8(data, newline_vec);
439
+ count += vaddvq_u8(vandq_u8(cmp, vdupq_n_u8(1)));
440
+ }
441
+
442
+ // Handle remainder
443
+ for (; i < (size_t)st.st_size; i++) {
444
+ count += (base[i] == '\n');
445
+ }
446
+ #else
447
+ // Unrolled scalar version
448
+ size_t i = 0;
449
+ for (; i + 8 <= (size_t)st.st_size; i += 8) {
450
+ count += (base[i] == '\n');
451
+ count += (base[i+1] == '\n');
452
+ count += (base[i+2] == '\n');
453
+ count += (base[i+3] == '\n');
454
+ count += (base[i+4] == '\n');
455
+ count += (base[i+5] == '\n');
456
+ count += (base[i+6] == '\n');
457
+ count += (base[i+7] == '\n');
458
+ }
459
+ for (; i < (size_t)st.st_size; i++) {
460
+ count += (base[i] == '\n');
461
+ }
462
+ #endif
463
+
464
+ munmap(base, st.st_size);
465
+ close(fd);
466
+ return count;
467
+ }
468
+
469
+ cisv_parser *cisv_parser_create(cisv_field_cb fcb, cisv_row_cb rcb, void *user) {
470
+ cisv_parser *parser = calloc(1, sizeof(*parser));
471
+ if (!parser) return NULL;
472
+
473
+ // Align ring buffer to cache line
474
+ if (posix_memalign((void**)&parser->ring, 64, RINGBUF_SIZE + 64) != 0) {
475
+ free(parser);
476
+ return NULL;
477
+ }
478
+
479
+ parser->fd = -1;
480
+ parser->fcb = fcb;
481
+ parser->rcb = rcb;
482
+ parser->user = user;
483
+ return parser;
484
+ }
485
+
486
+ void cisv_parser_destroy(cisv_parser *parser) {
487
+ if (!parser) return;
488
+
489
+ if (parser->base && parser->base != MAP_FAILED) {
490
+ munmap(parser->base, parser->size);
491
+ }
492
+ if (parser->fd >= 0) {
493
+ close(parser->fd);
494
+ }
495
+ free(parser->ring);
496
+ free(parser);
497
+ }
498
+
499
+ int cisv_parser_parse_file(cisv_parser *parser, const char *path) {
500
+ struct stat st;
501
+ parser->fd = open(path, O_RDONLY);
502
+ if (parser->fd < 0) return -errno;
503
+
504
+ // Platform-specific file hints
505
+ #ifdef HAS_POSIX_FADVISE
506
+ posix_fadvise(parser->fd, 0, 0, POSIX_FADV_SEQUENTIAL);
507
+ #elif defined(__APPLE__) && defined(HAS_RDADVISE)
508
+ struct radvisory radv;
509
+ radv.ra_offset = 0;
510
+ radv.ra_count = 0; // 0 means whole file
511
+ fcntl(parser->fd, F_RDADVISE, &radv);
512
+ #endif
513
+
514
+ if (fstat(parser->fd, &st) < 0) {
515
+ int err = errno;
516
+ close(parser->fd);
517
+ parser->fd = -1;
518
+ return -err;
519
+ }
520
+
521
+ if (st.st_size == 0) {
522
+ close(parser->fd);
523
+ parser->fd = -1;
524
+ return 0;
525
+ }
526
+
527
+ parser->size = st.st_size;
528
+
529
+ // mmap with platform-specific flags
530
+ int mmap_flags = MAP_PRIVATE;
531
+ #ifdef HAS_MAP_POPULATE
532
+ mmap_flags |= MAP_POPULATE;
533
+ #endif
534
+
535
+ parser->base = mmap(NULL, parser->size, PROT_READ, mmap_flags, parser->fd, 0);
536
+ if (parser->base == MAP_FAILED) {
537
+ int err = errno;
538
+ close(parser->fd);
539
+ parser->fd = -1;
540
+ return -err;
541
+ }
542
+
543
+ // Advise kernel about access pattern
544
+ madvise(parser->base, parser->size, MADV_SEQUENTIAL);
545
+ #ifdef MADV_WILLNEED
546
+ madvise(parser->base, parser->size < (1<<20) ? parser->size : (1<<20), MADV_WILLNEED);
547
+ #endif
548
+
549
+ return parse_memory(parser, parser->base, parser->size);
550
+ }
551
+
552
+ int cisv_parser_write(cisv_parser *parser, const uint8_t *chunk, size_t len) {
553
+ if (!parser || !chunk || len >= RINGBUF_SIZE) return -EINVAL;
554
+
555
+ // Branchless overflow handling
556
+ size_t overflow = (parser->head + len > RINGBUF_SIZE);
557
+ if (overflow) {
558
+ parse_memory(parser, parser->ring, parser->head);
559
+ parser->head = 0;
560
+ }
561
+
562
+ memcpy(parser->ring + parser->head, chunk, len);
563
+ parser->head += len;
564
+
565
+ // Check for newline or buffer threshold
566
+ uint8_t has_newline = (memchr(chunk, '\n', len) != NULL);
567
+ uint8_t threshold = (parser->head > (RINGBUF_SIZE / 2));
568
+ if (has_newline | threshold) {
569
+ parse_memory(parser, parser->ring, parser->head);
570
+ parser->head = 0;
571
+ }
572
+ return 0;
573
+ }
574
+
575
+ void cisv_parser_end(cisv_parser *parser) {
576
+ if (!parser || parser->head == 0) return;
577
+ parse_memory(parser, parser->ring, parser->head);
578
+ parser->head = 0;
579
+ }
580
+
581
+ #ifdef CISV_CLI
582
+
583
+ #include "cisv_writer.h"
584
+
585
// Entry point of the "write" sub-command, implemented outside this file.
int cisv_writer_main(int argc, char *argv[]);

// State shared between main() and the parser callbacks of the CLI.
typedef struct {
    size_t row_count;              // rows kept (printed or stored for --tail)
    size_t field_count;            // not referenced elsewhere in the visible code
    int count_only;                // -c: only print the number of rows
    int head;                      // --head N (0 = no limit)
    int tail;                      // --tail N (0 = disabled)
    char delimiter;                // delimiter written between output fields
    int *select_cols;              // -s: column indices to print, or NULL
    int select_count;              // number of entries in select_cols
    FILE *output;                  // destination stream (stdout or -o FILE)

    char ***tail_buffer;           // circular buffer of the last `tail` rows
    size_t *tail_field_counts;     // field count of each row in tail_buffer
    size_t tail_pos;               // next slot of tail_buffer to overwrite

    char **current_row;            // copies of the fields of the row in progress
    size_t current_field_count;    // fields stored in current_row
    size_t current_field_capacity; // allocated slots in current_row
    size_t current_row_num;        // rows seen so far, including skipped ones
    int in_header;                 // not referenced elsewhere in the visible code
} cli_context;
609
+
610
+ static void field_callback(void *user, const char *data, size_t len) {
611
+ cli_context *ctx = (cli_context *)user;
612
+
613
+ // Ensure we have space for the field
614
+ if (ctx->current_field_count >= ctx->current_field_capacity) {
615
+ size_t new_capacity = ctx->current_field_capacity * 2;
616
+ if (new_capacity < 16) new_capacity = 16;
617
+
618
+ char **new_row = realloc(ctx->current_row, new_capacity * sizeof(char *));
619
+ if (!new_row) {
620
+ fprintf(stderr, "Failed to allocate memory for fields\n");
621
+ exit(1);
622
+ }
623
+ ctx->current_row = new_row;
624
+ ctx->current_field_capacity = new_capacity;
625
+ }
626
+
627
+ // Allocate and copy field data
628
+ ctx->current_row[ctx->current_field_count] = malloc(len + 1);
629
+ if (!ctx->current_row[ctx->current_field_count]) {
630
+ fprintf(stderr, "Failed to allocate memory for field data\n");
631
+ exit(1);
632
+ }
633
+
634
+ memcpy(ctx->current_row[ctx->current_field_count], data, len);
635
+ ctx->current_row[ctx->current_field_count][len] = '\0';
636
+ ctx->current_field_count++;
637
+ }
638
+
639
+ static void row_callback(void *user) {
640
+ cli_context *ctx = (cli_context *)user;
641
+
642
+ // Skip row if we're past head limit
643
+ if (ctx->head > 0 && ctx->current_row_num >= (size_t)ctx->head) {
644
+ for (size_t i = 0; i < ctx->current_field_count; i++) {
645
+ free(ctx->current_row[i]);
646
+ }
647
+ ctx->current_field_count = 0;
648
+ ctx->current_row_num++;
649
+ return;
650
+ }
651
+
652
+ // Handle tail buffering
653
+ if (ctx->tail > 0) {
654
+ // Free old row if buffer is full
655
+ if (ctx->tail_buffer[ctx->tail_pos]) {
656
+ for (size_t i = 0; i < ctx->tail_field_counts[ctx->tail_pos]; i++) {
657
+ free(ctx->tail_buffer[ctx->tail_pos][i]);
658
+ }
659
+ free(ctx->tail_buffer[ctx->tail_pos]);
660
+ }
661
+
662
+ // Store current row in circular buffer
663
+ ctx->tail_buffer[ctx->tail_pos] = ctx->current_row;
664
+ ctx->tail_field_counts[ctx->tail_pos] = ctx->current_field_count;
665
+ ctx->tail_pos = (ctx->tail_pos + 1) % ctx->tail;
666
+
667
+ // Allocate new row
668
+ ctx->current_row = calloc(ctx->current_field_capacity, sizeof(char *));
669
+ if (!ctx->current_row) {
670
+ fprintf(stderr, "Failed to allocate memory for new row\n");
671
+ exit(1);
672
+ }
673
+ ctx->current_field_count = 0;
674
+ } else {
675
+ // Output row immediately
676
+ int first = 1;
677
+ for (size_t i = 0; i < ctx->current_field_count; i++) {
678
+ int should_output = 1;
679
+
680
+ if (ctx->select_cols && ctx->select_count > 0) {
681
+ should_output = 0;
682
+ for (int j = 0; j < ctx->select_count; j++) {
683
+ if (ctx->select_cols[j] == (int)i) {
684
+ should_output = 1;
685
+ break;
686
+ }
687
+ }
688
+ }
689
+
690
+ if (should_output) {
691
+ if (!first) fprintf(ctx->output, "%c", ctx->delimiter);
692
+ fprintf(ctx->output, "%s", ctx->current_row[i]);
693
+ first = 0;
694
+ }
695
+
696
+ free(ctx->current_row[i]);
697
+ }
698
+ fprintf(ctx->output, "\n");
699
+ ctx->current_field_count = 0;
700
+ }
701
+
702
+ ctx->row_count++;
703
+ ctx->current_row_num++;
704
+ }
705
+
706
// Print the usage text to stdout; `prog` is the program name shown in it.
static void print_help(const char *prog) {
    static const char options_text[] =
        "Commands:\n"
        " parse Parse CSV file (default if no command given)\n"
        " write Write/generate CSV files\n\n"
        "\n"
        "Options:\n"
        " -h, --help Show this help message\n"
        " -v, --version Show version information\n"
        " -d, --delimiter DELIM Field delimiter (default: ,)\n"
        " -s, --select COLS Select columns (comma-separated indices)\n"
        " -c, --count Show only row count\n"
        " --head N Show first N rows\n"
        " --tail N Show last N rows\n"
        " -o, --output FILE Write to FILE instead of stdout\n"
        " -b, --benchmark Run benchmark mode\n"
        "\n----------\nExamples:\n";

    fputs("cisv - The fastest CSV parser of the multiverse\n\n", stdout);
    printf("Usage: %s [COMMAND] [OPTIONS] [FILE]\n\n", prog);
    fputs(options_text, stdout);
    printf(" %s data.csv # Parse and display CSV\n", prog);
    printf(" %s -c data.csv # Count rows\n", prog);
    printf(" %s -s 0,2,3 data.csv # Select columns 0, 2, and 3\n", prog);
    printf(" %s --head 10 data.csv # Show first 10 rows\n", prog);
    printf(" %s -d ';' data.csv # Use semicolon as delimiter\n", prog);
    printf("\nFor write options, use: %s write --help\n", prog);
}
731
+
732
// Current wall-clock time from gettimeofday(), expressed in milliseconds.
static double get_time_ms(void) {
    struct timeval now;
    gettimeofday(&now, NULL);

    const double whole_ms = now.tv_sec * 1000.0;
    const double fraction_ms = now.tv_usec / 1000.0;
    return whole_ms + fraction_ms;
}
737
+
738
// Benchmark mode: print the size of `filename`, then time five runs of
// cisv_parser_count_rows() on it and print each duration and row count.
// Reports an error and returns early when the file cannot be opened or its
// size cannot be determined.
static void benchmark_file(const char *filename) {
    FILE *f = fopen(filename, "rb");
    if (!f) {
        perror("fopen");
        return;
    }

    // Determine the size by seeking to the end; remember errno before
    // fclose() has a chance to overwrite it.
    long size = -1;
    int size_err = 0;
    if (fseek(f, 0, SEEK_END) != 0 || (size = ftell(f)) < 0) {
        size_err = errno;
        size = -1;
    }
    fclose(f);

    if (size < 0) {
        fprintf(stderr, "Failed to determine size of %s: %s\n", filename, strerror(size_err));
        return;
    }

    double size_mb = size / (1024.0 * 1024.0);
    printf("Benchmarking file: %s\n", filename);
    printf("File size: %.2f MB\n\n", size_mb);

    const int iterations = 5;
    for (int i = 0; i < iterations; i++) {
        double start = get_time_ms();
        size_t count = cisv_parser_count_rows(filename);
        double end = get_time_ms();

        printf("Run %d: %.2f ms, %zu rows\n", i + 1, end - start, count);
    }
}
762
+
763
+ int main(int argc, char *argv[]) {
764
+ // Check for write command
765
+ if (argc > 1 && strcmp(argv[1], "write") == 0) {
766
+ // Shift arguments and call writer main
767
+ return cisv_writer_main(argc - 1, argv + 1);
768
+ }
769
+
770
+ // If first arg is "parse", skip it
771
+ if (argc > 1 && strcmp(argv[1], "parse") == 0) {
772
+ argc--;
773
+ argv++;
774
+ }
775
+
776
+ static struct option long_options[] = {
777
+ {"help", no_argument, 0, 'h'},
778
+ {"version", no_argument, 0, 'v'},
779
+ {"delimiter", required_argument, 0, 'd'},
780
+ {"select", required_argument, 0, 's'},
781
+ {"count", no_argument, 0, 'c'},
782
+ {"head", required_argument, 0, 1},
783
+ {"tail", required_argument, 0, 2},
784
+ {"output", required_argument, 0, 'o'},
785
+ {"benchmark", no_argument, 0, 'b'},
786
+ {0, 0, 0, 0}
787
+ };
788
+
789
+ // Initialize context
790
+ cli_context ctx = {0};
791
+ ctx.delimiter = ',';
792
+ ctx.output = stdout;
793
+ ctx.current_field_capacity = 16;
794
+ ctx.current_row = calloc(ctx.current_field_capacity, sizeof(char *));
795
+ if (!ctx.current_row) {
796
+ fprintf(stderr, "Failed to allocate initial row buffer\n");
797
+ return 1;
798
+ }
799
+
800
+ int opt;
801
+ int option_index = 0;
802
+ const char *filename = NULL;
803
+ const char *output_file = NULL;
804
+ int benchmark = 0;
805
+
806
+ while ((opt = getopt_long(argc, argv, "hvd:s:co:b", long_options, &option_index)) != -1) {
807
+ switch (opt) {
808
+ case 'h':
809
+ print_help(argv[0]);
810
+ free(ctx.current_row);
811
+ return 0;
812
+
813
+ case 'v':
814
+ printf("cisv version v0.0.1-rc10\n");
815
+ free(ctx.current_row);
816
+ return 0;
817
+
818
+ case 'd':
819
+ ctx.delimiter = optarg[0];
820
+ break;
821
+
822
+ case 's': {
823
+ // Parse column selection
824
+ char *cols_copy = strdup(optarg);
825
+ if (!cols_copy) {
826
+ fprintf(stderr, "Memory allocation failed\n");
827
+ free(ctx.current_row);
828
+ return 1;
829
+ }
830
+
831
+ // Count columns
832
+ int count = 1;
833
+ for (char *p = cols_copy; *p; p++) {
834
+ if (*p == ',') count++;
835
+ }
836
+
837
+ ctx.select_cols = calloc(count, sizeof(int));
838
+ if (!ctx.select_cols) {
839
+ fprintf(stderr, "Memory allocation failed\n");
840
+ free(cols_copy);
841
+ free(ctx.current_row);
842
+ return 1;
843
+ }
844
+
845
+ // Parse column numbers
846
+ char *tok = strtok(cols_copy, ",");
847
+ int i = 0;
848
+ while (tok && i < count) {
849
+ ctx.select_cols[i++] = atoi(tok);
850
+ tok = strtok(NULL, ",");
851
+ }
852
+ ctx.select_count = i;
853
+
854
+ free(cols_copy);
855
+ break;
856
+ }
857
+
858
+ case 'c':
859
+ ctx.count_only = 1;
860
+ break;
861
+
862
+ case 'o':
863
+ output_file = optarg;
864
+ break;
865
+
866
+ case 'b':
867
+ benchmark = 1;
868
+ break;
869
+
870
+ case 1: // --head
871
+ ctx.head = atoi(optarg);
872
+ break;
873
+
874
+ case 2: // --tail
875
+ ctx.tail = atoi(optarg);
876
+ ctx.tail_buffer = calloc(ctx.tail, sizeof(char **));
877
+ ctx.tail_field_counts = calloc(ctx.tail, sizeof(size_t));
878
+ if (!ctx.tail_buffer || !ctx.tail_field_counts) {
879
+ fprintf(stderr, "Memory allocation failed\n");
880
+ free(ctx.current_row);
881
+ free(ctx.select_cols);
882
+ return 1;
883
+ }
884
+ break;
885
+
886
+ default:
887
+ fprintf(stderr, "Try '%s --help' for more information.\n", argv[0]);
888
+ free(ctx.current_row);
889
+ free(ctx.select_cols);
890
+ return 1;
891
+ }
892
+ }
893
+
894
+ if (optind < argc) {
895
+ filename = argv[optind];
896
+ }
897
+
898
+ if (!filename) {
899
+ fprintf(stderr, "Error: No input file specified\n");
900
+ print_help(argv[0]);
901
+ free(ctx.current_row);
902
+ free(ctx.select_cols);
903
+ return 1;
904
+ }
905
+
906
+ if (benchmark) {
907
+ benchmark_file(filename);
908
+ free(ctx.current_row);
909
+ free(ctx.select_cols);
910
+ return 0;
911
+ }
912
+
913
+ if (ctx.count_only) {
914
+ size_t count = cisv_parser_count_rows(filename);
915
+ printf("%zu\n", count);
916
+ free(ctx.current_row);
917
+ free(ctx.select_cols);
918
+ return 0;
919
+ }
920
+
921
+ if (output_file) {
922
+ ctx.output = fopen(output_file, "w");
923
+ if (!ctx.output) {
924
+ perror("fopen");
925
+ free(ctx.current_row);
926
+ free(ctx.select_cols);
927
+ free(ctx.tail_buffer);
928
+ free(ctx.tail_field_counts);
929
+ return 1;
930
+ }
931
+ }
932
+
933
+ cisv_parser *parser = cisv_parser_create(field_callback, row_callback, &ctx);
934
+ if (!parser) {
935
+ fprintf(stderr, "Failed to create parser\n");
936
+ free(ctx.current_row);
937
+ free(ctx.select_cols);
938
+ free(ctx.tail_buffer);
939
+ free(ctx.tail_field_counts);
940
+ if (ctx.output != stdout) fclose(ctx.output);
941
+ return 1;
942
+ }
943
+
944
+ int result = cisv_parser_parse_file(parser, filename);
945
+ if (result < 0) {
946
+ fprintf(stderr, "Parse error: %s\n", strerror(-result));
947
+ cisv_parser_destroy(parser);
948
+ free(ctx.current_row);
949
+ free(ctx.select_cols);
950
+ free(ctx.tail_buffer);
951
+ free(ctx.tail_field_counts);
952
+ if (ctx.output != stdout) fclose(ctx.output);
953
+ return 1;
954
+ }
955
+
956
+ // Output tail buffer if used
957
+ if (ctx.tail > 0 && ctx.tail_buffer) {
958
+ size_t start = ctx.tail_pos;
959
+ for (int i = 0; i < ctx.tail; i++) {
960
+ size_t idx = (start + i) % ctx.tail;
961
+ if (!ctx.tail_buffer[idx]) continue;
962
+
963
+ int first = 1;
964
+ for (size_t j = 0; j < ctx.tail_field_counts[idx]; j++) {
965
+ if (!first) fprintf(ctx.output, "%c", ctx.delimiter);
966
+ fprintf(ctx.output, "%s", ctx.tail_buffer[idx][j]);
967
+ free(ctx.tail_buffer[idx][j]);
968
+ first = 0;
969
+ }
970
+ fprintf(ctx.output, "\n");
971
+ free(ctx.tail_buffer[idx]);
972
+ }
973
+ free(ctx.tail_buffer);
974
+ free(ctx.tail_field_counts);
975
+ }
976
+
977
+ cisv_parser_destroy(parser);
978
+ free(ctx.current_row);
979
+ free(ctx.select_cols);
980
+
981
+ if (ctx.output != stdout) {
982
+ fclose(ctx.output);
983
+ }
984
+
985
+ return 0;
986
+ }
987
+
988
+ #endif // CISV_CLI