cisv 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +158 -0
- package/.github/workflows/release.yml +167 -0
- package/Dockerfile +63 -0
- package/LICENSE +7 -0
- package/Makefile +160 -0
- package/README.md +249 -0
- package/SIMD_benchmarks.md +658 -0
- package/benchmark/benchmark.js +287 -0
- package/benchmark_cli_reader.sh +236 -0
- package/benchmark_cli_writer.sh +280 -0
- package/binding.gyp +57 -0
- package/debug-addon.js +64 -0
- package/examples/basic-parse.js +65 -0
- package/examples/large-file.js +35 -0
- package/examples/transform.js +152 -0
- package/examples/typescript.ts +38 -0
- package/index.d.ts +336 -0
- package/install_benchmark_deps.sh +156 -0
- package/package.json +47 -0
- package/run_benchmarks.sh +53 -0
- package/src/cisv_addon.cc +614 -0
- package/src/cisv_parser.c +988 -0
- package/src/cisv_parser.h +55 -0
- package/src/cisv_simd.h +53 -0
- package/src/cisv_transformer.c +537 -0
- package/src/cisv_transformer.h +145 -0
- package/src/cisv_writer.c +535 -0
- package/src/cisv_writer.h +60 -0
- package/src/index.ts +2 -0
- package/src/test/typescript.test.ts +43 -0
- package/src/win_getopt.h +100 -0
- package/src/win_sys_time.h +50 -0
- package/test/basic.test.js +104 -0
- package/test_select.sh +92 -0
- package/test_transform.sh +167 -0
- package/test_transform_leak_test.js +94 -0
- package/tsconfig.json +17 -0
- package/types/cisv.d.ts +8 -0
- package/valgrind-node.supp +69 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
#ifndef CISV_TRANSFORMER_H
|
|
2
|
+
#define CISV_TRANSFORMER_H
|
|
3
|
+
|
|
4
|
+
#include <stddef.h>
|
|
5
|
+
#include <stdint.h>
|
|
6
|
+
|
|
7
|
+
#ifdef __cplusplus
|
|
8
|
+
extern "C" {
|
|
9
|
+
#endif
|
|
10
|
+
|
|
11
|
+
// Transform types
|
|
12
|
+
/**
 * Kind of transformation a pipeline entry applies.
 *
 * Values are consecutive starting at 0. TRANSFORM_MAX is a sentinel that is
 * one greater than the last real transform kind.
 */
typedef enum {
    TRANSFORM_NONE           = 0,

    /* String transforms */
    TRANSFORM_UPPERCASE      = 1,
    TRANSFORM_LOWERCASE      = 2,
    TRANSFORM_TRIM           = 3,
    TRANSFORM_TRIM_LEFT      = 4,
    TRANSFORM_TRIM_RIGHT     = 5,

    /* Type conversions */
    TRANSFORM_TO_INT         = 6,
    TRANSFORM_TO_FLOAT       = 7,
    TRANSFORM_TO_BOOL        = 8,

    /* Crypto transforms */
    TRANSFORM_HASH_MD5       = 9,
    TRANSFORM_HASH_SHA256    = 10,
    TRANSFORM_ENCRYPT_AES256 = 11,
    TRANSFORM_DECRYPT_AES256 = 12,

    /* Data (encoding) transforms */
    TRANSFORM_BASE64_ENCODE  = 13,
    TRANSFORM_BASE64_DECODE  = 14,
    TRANSFORM_URL_ENCODE     = 15,
    TRANSFORM_URL_DECODE     = 16,

    /* Custom / JavaScript callback */
    TRANSFORM_CUSTOM_JS      = 17,

    /* Sentinel: number of enumerators above */
    TRANSFORM_MAX            = 18
} cisv_transform_type_t;
|
|
44
|
+
|
|
45
|
+
// Transform result
|
|
46
|
+
/**
 * Value returned by a transform function.
 */
typedef struct {
    char  *data;        /* Result bytes */
    size_t len;         /* Length associated with data (presumably in bytes -- confirm) */
    int    needs_free;  /* Whether data needs to be freed by the receiver */
} cisv_transform_result_t;
|
|
51
|
+
|
|
52
|
+
// Transform context (for crypto operations)
|
|
53
|
+
/**
 * Optional parameters handed to a transform function.
 * The header describes this as the context for crypto operations.
 */
typedef struct {
    void  *key;      /* Key material */
    size_t key_len;  /* Length of key */
    void  *iv;       /* Initialization vector */
    size_t iv_len;   /* Length of iv */
    void  *extra;    /* Any additional, transform-specific data */
} cisv_transform_context_t;
|
|
60
|
+
|
|
61
|
+
// Transform function signature
|
|
62
|
+
/**
 * Signature shared by every transform function.
 *
 * Takes the input bytes (data, len) and an optional context, and returns a
 * cisv_transform_result_t by value.
 */
typedef cisv_transform_result_t (*cisv_transform_fn)(const char *data,
                                                     size_t len,
                                                     cisv_transform_context_t *ctx);
|
|
67
|
+
|
|
68
|
+
// Transform pipeline entry
|
|
69
|
+
typedef struct {
|
|
70
|
+
cisv_transform_type_t type;
|
|
71
|
+
cisv_transform_fn fn;
|
|
72
|
+
cisv_transform_context_t *ctx;
|
|
73
|
+
int field_index; // -1 for all fields
|
|
74
|
+
void *js_callback; // For JS callbacks (napi_ref)
|
|
75
|
+
} cisv_transform_t;
|
|
76
|
+
|
|
77
|
+
// Transform pipeline
|
|
78
|
+
typedef struct {
|
|
79
|
+
cisv_transform_t *transforms;
|
|
80
|
+
size_t count;
|
|
81
|
+
size_t capacity;
|
|
82
|
+
|
|
83
|
+
// Memory pool for transformations
|
|
84
|
+
char *buffer_pool;
|
|
85
|
+
size_t pool_size;
|
|
86
|
+
size_t pool_used;
|
|
87
|
+
|
|
88
|
+
// SIMD alignment
|
|
89
|
+
size_t alignment;
|
|
90
|
+
} cisv_transform_pipeline_t;
|
|
91
|
+
|
|
92
|
+
/**
 * Handles needed to invoke a JavaScript callback from native code.
 * All members are opaque pointers so this header does not depend on N-API.
 */
typedef struct cisv_js_callback {
    void *env;       /* napi_env */
    void *callback;  /* napi_ref to the JavaScript function */
    void *instance;  /* napi_ref to the parser instance (for 'this' context) */
} cisv_js_callback_t;
|
|
97
|
+
|
|
98
|
+
// Create/destroy pipeline
|
|
99
|
+
/**
 * Create a transform pipeline.
 *
 * @param initial_capacity  Presumably the number of transform slots to
 *                          reserve up front -- confirm in cisv_transformer.c.
 * @return The new pipeline. NOTE(review): failure value (likely NULL) is not
 *         visible from this header.
 */
cisv_transform_pipeline_t *
cisv_transform_pipeline_create(size_t initial_capacity);

/**
 * Destroy a pipeline created by cisv_transform_pipeline_create().
 *
 * @param pipeline  Pipeline to destroy.
 */
void
cisv_transform_pipeline_destroy(cisv_transform_pipeline_t *pipeline);
|
|
101
|
+
|
|
102
|
+
// Add transforms to pipeline
|
|
103
|
+
int cisv_transform_pipeline_add(
|
|
104
|
+
cisv_transform_pipeline_t *pipeline,
|
|
105
|
+
int field_index,
|
|
106
|
+
cisv_transform_type_t type,
|
|
107
|
+
cisv_transform_context_t *ctx
|
|
108
|
+
);
|
|
109
|
+
|
|
110
|
+
int cisv_transform_pipeline_add_js(
|
|
111
|
+
cisv_transform_pipeline_t *pipeline,
|
|
112
|
+
int field_index,
|
|
113
|
+
void *js_callback
|
|
114
|
+
);
|
|
115
|
+
|
|
116
|
+
// Apply transforms
|
|
117
|
+
cisv_transform_result_t cisv_transform_apply(
|
|
118
|
+
cisv_transform_pipeline_t *pipeline,
|
|
119
|
+
int field_index,
|
|
120
|
+
const char *data,
|
|
121
|
+
size_t len
|
|
122
|
+
);
|
|
123
|
+
|
|
124
|
+
// Built-in transform functions
|
|
125
|
+
/*
 * Built-in transforms. Each has the cisv_transform_fn signature:
 * (data, len, ctx) in, cisv_transform_result_t out.
 * NOTE(review): whether a given transform reads ctx is not visible here.
 */

/* String transforms */
cisv_transform_result_t cisv_transform_uppercase(
    const char *data, size_t len, cisv_transform_context_t *ctx);
cisv_transform_result_t cisv_transform_lowercase(
    const char *data, size_t len, cisv_transform_context_t *ctx);
cisv_transform_result_t cisv_transform_trim(
    const char *data, size_t len, cisv_transform_context_t *ctx);

/* Type conversions */
cisv_transform_result_t cisv_transform_to_int(
    const char *data, size_t len, cisv_transform_context_t *ctx);
cisv_transform_result_t cisv_transform_to_float(
    const char *data, size_t len, cisv_transform_context_t *ctx);

/* Hashing and encoding */
cisv_transform_result_t cisv_transform_hash_sha256(
    const char *data, size_t len, cisv_transform_context_t *ctx);
cisv_transform_result_t cisv_transform_base64_encode(
    const char *data, size_t len, cisv_transform_context_t *ctx);
|
|
132
|
+
|
|
133
|
+
/**
 * Free a transform result.
 * NOTE(review): presumably frees result->data only when needs_free is set --
 * confirm in cisv_transformer.c.
 */
void
cisv_transform_result_free(cisv_transform_result_t *result);
|
|
134
|
+
|
|
135
|
+
// SIMD-optimized transforms
|
|
136
|
+
#ifdef __AVX2__
/*
 * Case-conversion helpers declared only when compiling with AVX2.
 * Each takes a destination, a source and a length.
 * NOTE(review): whether dst may alias src is not visible from this header.
 */
void cisv_transform_uppercase_simd(
    char *dst, const char *src, size_t len);
void cisv_transform_lowercase_simd(
    char *dst, const char *src, size_t len);
#endif /* __AVX2__ */
|
|
140
|
+
|
|
141
|
+
#ifdef __cplusplus
|
|
142
|
+
}
|
|
143
|
+
#endif
|
|
144
|
+
|
|
145
|
+
#endif // CISV_TRANSFORMER_H
|
|
@@ -0,0 +1,535 @@
|
|
|
1
|
+
#define _GNU_SOURCE
|
|
2
|
+
#include "cisv_writer.h"
|
|
3
|
+
#include "cisv_simd.h"
|
|
4
|
+
#include <stdlib.h>
|
|
5
|
+
#include <string.h>
|
|
6
|
+
#include <stdio.h>
|
|
7
|
+
#include <errno.h>
|
|
8
|
+
#include <ctype.h>
|
|
9
|
+
|
|
10
|
+
/* Output buffer size used when the caller does not choose one: 1 MiB. */
#define DEFAULT_BUFFER_SIZE (1024 * 1024)
/* Smallest buffer a writer will use; smaller requests are raised to this: 64 KiB. */
#define MIN_BUFFER_SIZE (64 * 1024)
|
|
12
|
+
|
|
13
|
+
/*
 * Buffered CSV writer state.
 *
 * Output is staged in `buffer` and handed to `output` by cisv_writer_flush();
 * very large fields bypass the buffer and go straight to `output`.
 */
struct cisv_writer {
    /* Destination and staging buffer */
    FILE    *output;        /* Stream that receives the CSV bytes */
    uint8_t *buffer;        /* Staging buffer of buffer_size bytes */
    size_t   buffer_size;   /* Capacity of buffer */
    size_t   buffer_pos;    /* Bytes currently pending in buffer */

    /* Configuration (copied from cisv_writer_config at creation) */
    char        delimiter;     /* Field separator */
    char        quote_char;    /* Quote / escape character */
    int         always_quote;  /* Non-zero: quote every field */
    int         use_crlf;      /* Non-zero: end rows with "\r\n" instead of "\n" */
    const char *null_string;   /* Text written for NULL fields; borrowed, not owned */

    /* State */
    int    in_field;     /* Reset to 0 after each field is written */
    size_t field_count;  /* Fields written so far in the current row */

    /* Statistics */
    size_t bytes_written;  /* Bytes already handed to output (excludes pending buffer) */
    size_t rows_written;   /* Rows completed via cisv_writer_row_end() */
};
|
|
34
|
+
|
|
35
|
+
// Check if field needs quoting
|
|
36
|
+
/**
 * Decide whether a field has to be quoted.
 *
 * A field needs quoting when it contains the delimiter, the quote character,
 * a carriage return, or a line feed. When cisv_ARCH_AVX512 or cisv_ARCH_AVX2
 * is defined the bulk of the field is scanned 64 or 32 bytes at a time; the
 * remainder (or the whole field on other builds) is scanned byte by byte.
 *
 * @param data   Field bytes; need not be NUL-terminated.
 * @param len    Number of bytes in data.
 * @param delim  Field delimiter in use.
 * @param quote  Quote character in use.
 * @return 1 if a special byte is present, 0 otherwise (including len == 0).
 */
static inline int needs_quoting(const char *data, size_t len, char delim, char quote) {
    const char *cur = data;
    size_t remaining = len;

    /*
     * The vector loops are bounded by the remaining byte count instead of
     * comparing `cur + N` with an end pointer: forming a pointer beyond
     * one-past-the-end of the field is undefined behavior.
     */
#if defined(cisv_ARCH_AVX512)
    const __m512i delim_vec = _mm512_set1_epi8(delim);
    const __m512i quote_vec = _mm512_set1_epi8(quote);
    const __m512i cr_vec = _mm512_set1_epi8('\r');
    const __m512i lf_vec = _mm512_set1_epi8('\n');

    while (remaining >= 64) {
        const __m512i chunk = _mm512_loadu_si512((const __m512i *)cur);
        const __mmask64 hits = _mm512_cmpeq_epi8_mask(chunk, delim_vec)
                             | _mm512_cmpeq_epi8_mask(chunk, quote_vec)
                             | _mm512_cmpeq_epi8_mask(chunk, cr_vec)
                             | _mm512_cmpeq_epi8_mask(chunk, lf_vec);
        if (hits) {
            return 1;
        }
        cur += 64;
        remaining -= 64;
    }
#elif defined(cisv_ARCH_AVX2)
    const __m256i delim_vec = _mm256_set1_epi8(delim);
    const __m256i quote_vec = _mm256_set1_epi8(quote);
    const __m256i cr_vec = _mm256_set1_epi8('\r');
    const __m256i lf_vec = _mm256_set1_epi8('\n');

    while (remaining >= 32) {
        const __m256i chunk = _mm256_loadu_si256((const __m256i *)cur);
        const __m256i any_match = _mm256_or_si256(
            _mm256_or_si256(_mm256_cmpeq_epi8(chunk, delim_vec),
                            _mm256_cmpeq_epi8(chunk, quote_vec)),
            _mm256_or_si256(_mm256_cmpeq_epi8(chunk, cr_vec),
                            _mm256_cmpeq_epi8(chunk, lf_vec)));
        if (_mm256_movemask_epi8(any_match)) {
            return 1;
        }
        cur += 32;
        remaining -= 32;
    }
#endif

    /* Scalar scan: vector tail, or the whole field on non-SIMD builds. */
    for (; remaining > 0; ++cur, --remaining) {
        const char c = *cur;
        if (c == delim || c == quote || c == '\r' || c == '\n') {
            return 1;
        }
    }
    return 0;
}
|
|
104
|
+
|
|
105
|
+
// Ensure buffer has at least 'needed' bytes available
|
|
106
|
+
/**
 * Make room for `needed` more bytes in the writer's staging buffer.
 *
 * If the bytes do not fit after what is already pending, the buffer is
 * flushed first.
 *
 * @param writer  Writer whose buffer is checked (must not be NULL).
 * @param needed  Number of bytes the caller is about to append.
 * @return  0 when `needed` bytes can now be appended to the buffer,
 *         -1 when flushing failed,
 *         -2 when the buffer was flushed but `needed` exceeds the whole
 *            buffer, so the caller must write directly to the stream.
 */
static int ensure_buffer_space(cisv_writer *writer, size_t needed) {
    /*
     * Compare against the free space rather than computing
     * `buffer_pos + needed`, which could wrap around for a huge `needed`.
     * buffer_pos stays <= buffer_size as long as callers reserve space
     * through this function before appending.
     */
    const size_t free_space = writer->buffer_size - writer->buffer_pos;
    if (needed <= free_space) {
        return 0;
    }

    if (cisv_writer_flush(writer) < 0) {
        return -1;
    }

    /* Even an empty buffer cannot hold it: signal direct write. */
    if (needed > writer->buffer_size) {
        return -2;
    }
    return 0;
}
|
|
118
|
+
|
|
119
|
+
// Write data to buffer
|
|
120
|
+
/*
 * Append `len` bytes from `data` to the staging buffer and advance the
 * write position. Performs no bounds check: the caller must have reserved
 * the space beforehand.
 */
static void buffer_write(cisv_writer *writer, const void *data, size_t len) {
    uint8_t *dest = writer->buffer + writer->buffer_pos;

    memcpy(dest, data, len);
    writer->buffer_pos = writer->buffer_pos + len;
}
|
|
124
|
+
|
|
125
|
+
// Write escaped field to buffer
|
|
126
|
+
/**
 * Write a field wrapped in quote characters, doubling every embedded quote.
 *
 * Normally the field goes into the staging buffer. If its worst-case size
 * exceeds the whole buffer, the (already flushed) buffer is bypassed and the
 * field is written straight to the output stream.
 *
 * @param writer  Destination writer (must not be NULL).
 * @param data    Field bytes.
 * @param len     Number of bytes in data.
 * @return 0 on success, -1 on a flush or stream write error.
 */
static int write_quoted_field(cisv_writer *writer, const char *data, size_t len) {
    const char quote = writer->quote_char;

    /* Worst case: every byte is a quote that gets doubled, plus 2 enclosing quotes. */
    const size_t max_size = len * 2 + 2;
    const int space_result = ensure_buffer_space(writer, max_size);

    if (space_result == -2) {
        /* Field too large for the buffer: stream it directly. */
        FILE *out = writer->output;
        const char *cur = data;
        const char *const end = data + len;
        size_t emitted = 0;  /* exact byte count, including doubled quotes */

        if (fputc(quote, out) == EOF) return -1;
        emitted++;

        while (cur < end) {
            /* Write everything up to and including the next quote in one call... */
            const char *hit = memchr(cur, quote, (size_t)(end - cur));
            const size_t run = hit ? (size_t)(hit - cur) + 1 : (size_t)(end - cur);

            if (fwrite(cur, 1, run, out) != run) return -1;
            emitted += run;

            /* ...then repeat the quote so it is escaped by doubling. */
            if (hit) {
                if (fputc(quote, out) == EOF) return -1;
                emitted++;
            }
            cur += run;
        }

        if (fputc(quote, out) == EOF) return -1;
        emitted++;

        writer->bytes_written += emitted;
        return 0;
    }
    if (space_result < 0) {
        return -1;
    }

    /* Buffered path: space for max_size bytes has been reserved above. */
    uint8_t *dst = writer->buffer + writer->buffer_pos;

    *dst++ = (uint8_t)quote;
    for (size_t i = 0; i < len; i++) {
        if (data[i] == quote) {
            *dst++ = (uint8_t)quote;
        }
        *dst++ = (uint8_t)data[i];
    }
    *dst++ = (uint8_t)quote;

    writer->buffer_pos = (size_t)(dst - writer->buffer);
    return 0;
}
|
|
162
|
+
|
|
163
|
+
cisv_writer *cisv_writer_create(FILE *output) {
|
|
164
|
+
cisv_writer_config config = {
|
|
165
|
+
.delimiter = ',',
|
|
166
|
+
.quote_char = '"',
|
|
167
|
+
.always_quote = 0,
|
|
168
|
+
.use_crlf = 0,
|
|
169
|
+
.null_string = "",
|
|
170
|
+
.buffer_size = DEFAULT_BUFFER_SIZE
|
|
171
|
+
};
|
|
172
|
+
return cisv_writer_create_config(output, &config);
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
cisv_writer *cisv_writer_create_config(FILE *output, const cisv_writer_config *config) {
|
|
176
|
+
if (!output) return NULL;
|
|
177
|
+
|
|
178
|
+
cisv_writer *writer = calloc(1, sizeof(*writer));
|
|
179
|
+
if (!writer) return NULL;
|
|
180
|
+
|
|
181
|
+
writer->buffer_size = config->buffer_size;
|
|
182
|
+
if (writer->buffer_size < MIN_BUFFER_SIZE) {
|
|
183
|
+
writer->buffer_size = MIN_BUFFER_SIZE;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
writer->buffer = malloc(writer->buffer_size);
|
|
187
|
+
if (!writer->buffer) {
|
|
188
|
+
free(writer);
|
|
189
|
+
return NULL;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
writer->output = output;
|
|
193
|
+
writer->delimiter = config->delimiter;
|
|
194
|
+
writer->quote_char = config->quote_char;
|
|
195
|
+
writer->always_quote = config->always_quote;
|
|
196
|
+
writer->use_crlf = config->use_crlf;
|
|
197
|
+
writer->null_string = config->null_string ? config->null_string : "";
|
|
198
|
+
|
|
199
|
+
return writer;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
/**
 * Flush pending output and release the writer and its buffer.
 *
 * The output stream is not closed. A flush failure cannot be reported from
 * here; call cisv_writer_flush() first if the result matters. NULL is
 * accepted and ignored.
 */
void cisv_writer_destroy(cisv_writer *writer) {
    if (writer != NULL) {
        (void)cisv_writer_flush(writer);
        free(writer->buffer);
        free(writer);
    }
}
|
|
209
|
+
|
|
210
|
+
/**
 * Append one field to the current row.
 *
 * A delimiter is emitted first unless this is the first field of the row.
 * A NULL `data` is replaced by the writer's null_string. The field is quoted
 * when always_quote is set or when it contains a special character.
 *
 * @param writer  Destination writer.
 * @param data    Field bytes, or NULL for a NULL value.
 * @param len     Number of bytes in data (ignored when data is NULL).
 * @return 0 on success, -1 if writer is NULL or a write fails.
 */
int cisv_writer_field(cisv_writer *writer, const char *data, size_t len) {
    if (writer == NULL) {
        return -1;
    }

    /* Separate from the previous field, if there was one. */
    if (writer->field_count != 0) {
        if (ensure_buffer_space(writer, 1) < 0) {
            return -1;
        }
        writer->buffer[writer->buffer_pos] = writer->delimiter;
        writer->buffer_pos += 1;
    }

    /* Substitute the configured placeholder for NULL values. */
    const char *payload = data;
    size_t payload_len = len;
    if (payload == NULL) {
        payload = writer->null_string;
        payload_len = strlen(payload);
    }

    const int quoted = writer->always_quote
        || needs_quoting(payload, payload_len, writer->delimiter, writer->quote_char);

    if (quoted) {
        if (write_quoted_field(writer, payload, payload_len) < 0) {
            return -1;
        }
    } else {
        /* Plain field: copy as-is, bypassing the buffer if it is too big. */
        const int rc = ensure_buffer_space(writer, payload_len);
        if (rc >= 0) {
            buffer_write(writer, payload, payload_len);
        } else if (rc == -2) {
            if (fwrite(payload, 1, payload_len, writer->output) != payload_len) {
                return -1;
            }
            writer->bytes_written += payload_len;
        } else {
            return -1;
        }
    }

    writer->field_count += 1;
    writer->in_field = 0;
    return 0;
}
|
|
246
|
+
|
|
247
|
+
/**
 * Append a NUL-terminated string as a field.
 *
 * @param writer  Destination writer.
 * @param str     String to write; NULL is written as the writer's NULL
 *                placeholder.
 * @return Result of cisv_writer_field().
 */
int cisv_writer_field_str(cisv_writer *writer, const char *str) {
    const size_t n = (str != NULL) ? strlen(str) : 0;

    return cisv_writer_field(writer, str, n);
}
|
|
251
|
+
|
|
252
|
+
/**
 * Append a signed 64-bit integer as a decimal field.
 *
 * @param writer  Destination writer.
 * @param value   Number to format.
 * @return Result of cisv_writer_field().
 */
int cisv_writer_field_int(cisv_writer *writer, int64_t value) {
    char digits[32];  /* ample for a sign plus 19 digits */
    const int n = snprintf(digits, sizeof digits, "%lld", (long long)value);

    return cisv_writer_field(writer, digits, (size_t)n);
}
|
|
257
|
+
|
|
258
|
+
/**
 * Append a double as a fixed-point ("%.*f") field.
 *
 * @param writer     Destination writer.
 * @param value      Number to format.
 * @param precision  Digits after the decimal point (passed to "%.*f").
 * @return 0 on success, -1 on a formatting, allocation or write error.
 */
int cisv_writer_field_double(cisv_writer *writer, double value, int precision) {
    char stack_buf[64];
    const int len = snprintf(stack_buf, sizeof(stack_buf), "%.*f", precision, value);

    if (len < 0) {
        return -1;
    }
    if ((size_t)len < sizeof(stack_buf)) {
        return cisv_writer_field(writer, stack_buf, (size_t)len);
    }

    /*
     * The text did not fit (large magnitudes such as 1e308, or a large
     * precision). snprintf returned the length it needs, so format again
     * into a buffer of that size instead of reading past stack_buf.
     */
    const size_t cap = (size_t)len + 1;
    char *heap_buf = malloc(cap);
    if (!heap_buf) {
        return -1;
    }

    int rc = -1;
    const int len2 = snprintf(heap_buf, cap, "%.*f", precision, value);
    if (len2 >= 0 && (size_t)len2 < cap) {
        rc = cisv_writer_field(writer, heap_buf, (size_t)len2);
    }
    free(heap_buf);
    return rc;
}
|
|
263
|
+
|
|
264
|
+
/**
 * Terminate the current row.
 *
 * Emits "\r\n" when use_crlf is set, otherwise "\n", resets the per-row
 * field counter and increments the row count.
 *
 * @param writer  Destination writer.
 * @return 0 on success, -1 if writer is NULL or space could not be reserved.
 */
int cisv_writer_row_end(cisv_writer *writer) {
    if (writer == NULL) {
        return -1;
    }

    const size_t eol_len = writer->use_crlf ? 2 : 1;
    if (ensure_buffer_space(writer, eol_len) < 0) {
        return -1;
    }

    if (writer->use_crlf) {
        writer->buffer[writer->buffer_pos++] = '\r';
    }
    writer->buffer[writer->buffer_pos++] = '\n';

    writer->field_count = 0;
    writer->rows_written += 1;
    return 0;
}
|
|
281
|
+
|
|
282
|
+
/**
 * Write a complete row from an array of NUL-terminated strings.
 *
 * @param writer  Destination writer.
 * @param fields  Array of `count` strings; NULL entries are written as the
 *                writer's NULL placeholder.
 * @param count   Number of entries in fields.
 * @return 0 on success, -1 as soon as a field or the row terminator fails.
 */
int cisv_writer_row(cisv_writer *writer, const char **fields, size_t count) {
    const char **cursor = fields;
    size_t left = count;

    while (left > 0) {
        if (cisv_writer_field_str(writer, *cursor) < 0) {
            return -1;
        }
        cursor++;
        left--;
    }
    return cisv_writer_row_end(writer);
}
|
|
288
|
+
|
|
289
|
+
/**
 * Hand all pending buffered bytes to the output stream.
 *
 * This calls fwrite() only; it does not fflush() the underlying FILE, so the
 * data may still sit in the stdio buffer afterwards.
 *
 * On a short write, the bytes that were accepted are counted and removed
 * from the buffer, so a later retry does not emit them twice and
 * cisv_writer_bytes_written() stays accurate.
 *
 * @param writer  Writer to flush; NULL is treated as "nothing to do".
 * @return 0 on success (or nothing pending), -1 if the stream accepted fewer
 *         bytes than were pending.
 */
int cisv_writer_flush(cisv_writer *writer) {
    if (!writer || writer->buffer_pos == 0) return 0;

    const size_t pending = writer->buffer_pos;
    const size_t written = fwrite(writer->buffer, 1, pending, writer->output);

    writer->bytes_written += written;

    if (written != pending) {
        /* Keep only the unwritten tail at the front of the buffer. */
        const size_t left = pending - written;
        memmove(writer->buffer, writer->buffer + written, left);
        writer->buffer_pos = left;
        return -1;
    }

    writer->buffer_pos = 0;
    return 0;
}
|
|
300
|
+
|
|
301
|
+
size_t cisv_writer_bytes_written(const cisv_writer *writer) {
|
|
302
|
+
return writer ? writer->bytes_written + writer->buffer_pos : 0;
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
size_t cisv_writer_rows_written(const cisv_writer *writer) {
|
|
306
|
+
return writer ? writer->rows_written : 0;
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
// CLI integration
|
|
310
|
+
#ifdef CISV_CLI
|
|
311
|
+
|
|
312
|
+
#include <getopt.h>
|
|
313
|
+
#include <sys/time.h>
|
|
314
|
+
#include <time.h>
|
|
315
|
+
|
|
316
|
+
/* Operating mode chosen on the `write` command line. */
typedef enum {
    MODE_GENERATE  = 0,  /* -g: generate rows of test data */
    MODE_TRANSFORM = 1,  /* -t: transform an existing CSV */
    MODE_CONVERT   = 2   /* -j: convert JSON to CSV */
} write_mode_t;
|
|
321
|
+
|
|
322
|
+
/*
 * Print usage information for the `write` subcommand to stdout.
 * `prog` is the program name shown in the usage line.
 */
static void print_write_help(const char *prog) {
    /* Everything after the usage line is fixed text, emitted in order. */
    static const char *const body[] = {
        "Modes:\n",
        " -g, --generate N Generate N rows of test data\n",
        " -t, --transform FILE Transform existing CSV\n",
        " -j, --json FILE Convert JSON to CSV\n\n",
        "Options:\n",
        " -o, --output FILE Output file (default: stdout)\n",
        " -d, --delimiter CHAR Field delimiter (default: ,)\n",
        " -q, --quote CHAR Quote character (default: \")\n",
        " -Q, --always-quote Always quote fields\n",
        " -r, --crlf Use CRLF line endings\n",
        " -n, --null STRING String for NULL values (default: empty)\n",
        " -c, --columns LIST Column names for generation\n",
        " -b, --benchmark Run in benchmark mode\n"
    };

    fputs("cisv write - High-performance CSV writer\n\n", stdout);
    printf("Usage: %s write [OPTIONS]\n\n", prog);

    for (size_t i = 0; i < sizeof(body) / sizeof(body[0]); i++) {
        fputs(body[i], stdout);
    }
}
|
|
339
|
+
|
|
340
|
+
/*
 * Current time from gettimeofday(), as seconds (with a microsecond
 * fraction) in a double.
 */
static double get_time_seconds(void) {
    struct timeval now;

    gettimeofday(&now, NULL);

    const double micros = now.tv_usec / 1000000.0;
    return now.tv_sec + micros;
}
|
|
345
|
+
|
|
346
|
+
// Generate test data
|
|
347
|
+
/**
 * Write a header row followed by `rows` rows of synthetic test data.
 *
 * Columns are fixed: id, name, email, value, timestamp. A progress line is
 * printed to stderr every 1,000,000 rows.
 *
 * @param writer   Destination writer.
 * @param rows     Number of data rows to generate.
 * @param columns  Currently ignored (custom columns are not implemented).
 * @return 0 on success, -1 on the first write or time-formatting failure.
 */
static int generate_csv(cisv_writer *writer, size_t rows, const char *columns) {
    (void)columns; // TODO: implement custom columns support

    // Default columns; the count is derived from the array so they cannot drift apart
    const char *default_cols[] = {"id", "name", "email", "value", "timestamp"};
    const size_t col_count = sizeof(default_cols) / sizeof(default_cols[0]);

    // Write header
    if (cisv_writer_row(writer, default_cols, col_count) < 0) return -1;

    // Generate rows
    char buffer[256];
    for (size_t i = 0; i < rows; i++) {
        // ID (1-based)
        if (cisv_writer_field_int(writer, (int64_t)(i + 1)) < 0) return -1;

        // Name
        snprintf(buffer, sizeof(buffer), "User_%zu", i);
        if (cisv_writer_field_str(writer, buffer) < 0) return -1;

        // Email
        snprintf(buffer, sizeof(buffer), "user%zu@example.com", i);
        if (cisv_writer_field_str(writer, buffer) < 0) return -1;

        // Value
        if (cisv_writer_field_double(writer, (double)i * 1.23, 2) < 0) return -1;

        // Timestamp: current time shifted by one second per row
        const time_t stamp = time(NULL) + (time_t)i;
        const struct tm *tm = localtime(&stamp);
        // localtime() may return NULL; do not hand that to strftime()
        if (tm == NULL) return -1;
        if (strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", tm) == 0) return -1;
        if (cisv_writer_field_str(writer, buffer) < 0) return -1;

        if (cisv_writer_row_end(writer) < 0) return -1;

        // Progress report every 1M rows
        if ((i + 1) % 1000000 == 0) {
            fprintf(stderr, "Generated %zu rows...\n", i + 1);
        }
    }

    return 0;
}
|
|
389
|
+
|
|
390
|
+
int cisv_writer_main(int argc, char *argv[]) {
|
|
391
|
+
static struct option long_options[] = {
|
|
392
|
+
{"generate", required_argument, 0, 'g'},
|
|
393
|
+
{"transform", required_argument, 0, 't'},
|
|
394
|
+
{"json", required_argument, 0, 'j'},
|
|
395
|
+
{"output", required_argument, 0, 'o'},
|
|
396
|
+
{"delimiter", required_argument, 0, 'd'},
|
|
397
|
+
{"quote", required_argument, 0, 'q'},
|
|
398
|
+
{"always-quote", no_argument, 0, 'Q'},
|
|
399
|
+
{"crlf", no_argument, 0, 'r'},
|
|
400
|
+
{"null", required_argument, 0, 'n'},
|
|
401
|
+
{"columns", required_argument, 0, 'c'},
|
|
402
|
+
{"benchmark", no_argument, 0, 'b'},
|
|
403
|
+
{"help", no_argument, 0, 'h'},
|
|
404
|
+
{0, 0, 0, 0}
|
|
405
|
+
};
|
|
406
|
+
|
|
407
|
+
write_mode_t mode = MODE_GENERATE;
|
|
408
|
+
size_t generate_rows = 0;
|
|
409
|
+
//const char *input_file = NULL;
|
|
410
|
+
const char *output_file = NULL;
|
|
411
|
+
const char *columns = NULL;
|
|
412
|
+
int benchmark = 0;
|
|
413
|
+
|
|
414
|
+
cisv_writer_config config = {
|
|
415
|
+
.delimiter = ',',
|
|
416
|
+
.quote_char = '"',
|
|
417
|
+
.always_quote = 0,
|
|
418
|
+
.use_crlf = 0,
|
|
419
|
+
.null_string = "",
|
|
420
|
+
.buffer_size = DEFAULT_BUFFER_SIZE
|
|
421
|
+
};
|
|
422
|
+
|
|
423
|
+
int opt;
|
|
424
|
+
while ((opt = getopt_long(argc, argv, "g:t:j:o:d:q:Qrn:c:bh", long_options, NULL)) != -1) {
|
|
425
|
+
switch (opt) {
|
|
426
|
+
case 'g':
|
|
427
|
+
mode = MODE_GENERATE;
|
|
428
|
+
generate_rows = strtoull(optarg, NULL, 10);
|
|
429
|
+
break;
|
|
430
|
+
case 't':
|
|
431
|
+
mode = MODE_TRANSFORM;
|
|
432
|
+
//input_file = optarg;
|
|
433
|
+
break;
|
|
434
|
+
case 'j':
|
|
435
|
+
mode = MODE_CONVERT;
|
|
436
|
+
//input_file = optarg;
|
|
437
|
+
break;
|
|
438
|
+
case 'o':
|
|
439
|
+
output_file = optarg;
|
|
440
|
+
break;
|
|
441
|
+
case 'd':
|
|
442
|
+
config.delimiter = optarg[0];
|
|
443
|
+
break;
|
|
444
|
+
case 'q':
|
|
445
|
+
config.quote_char = optarg[0];
|
|
446
|
+
break;
|
|
447
|
+
case 'Q':
|
|
448
|
+
config.always_quote = 1;
|
|
449
|
+
break;
|
|
450
|
+
case 'r':
|
|
451
|
+
config.use_crlf = 1;
|
|
452
|
+
break;
|
|
453
|
+
case 'n':
|
|
454
|
+
config.null_string = optarg;
|
|
455
|
+
break;
|
|
456
|
+
case 'c':
|
|
457
|
+
columns = optarg;
|
|
458
|
+
break;
|
|
459
|
+
case 'b':
|
|
460
|
+
benchmark = 1;
|
|
461
|
+
break;
|
|
462
|
+
case 'h':
|
|
463
|
+
print_write_help(argv[0]);
|
|
464
|
+
return 0;
|
|
465
|
+
default:
|
|
466
|
+
fprintf(stderr, "Try '%s write --help' for more information.\n", argv[0]);
|
|
467
|
+
return 1;
|
|
468
|
+
}
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
// Open output file
|
|
472
|
+
FILE *output = stdout;
|
|
473
|
+
if (output_file) {
|
|
474
|
+
output = fopen(output_file, "wb");
|
|
475
|
+
if (!output) {
|
|
476
|
+
perror("fopen");
|
|
477
|
+
return 1;
|
|
478
|
+
}
|
|
479
|
+
}
|
|
480
|
+
|
|
481
|
+
// Create writer
|
|
482
|
+
cisv_writer *writer = cisv_writer_create_config(output, &config);
|
|
483
|
+
if (!writer) {
|
|
484
|
+
fprintf(stderr, "Failed to create writer\n");
|
|
485
|
+
if (output != stdout) fclose(output);
|
|
486
|
+
return 1;
|
|
487
|
+
}
|
|
488
|
+
|
|
489
|
+
double start_time = 0;
|
|
490
|
+
if (benchmark) {
|
|
491
|
+
start_time = get_time_seconds();
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
int result = 0;
|
|
495
|
+
switch (mode) {
|
|
496
|
+
case MODE_GENERATE:
|
|
497
|
+
if (generate_rows == 0) {
|
|
498
|
+
fprintf(stderr, "Error: Must specify number of rows to generate\n");
|
|
499
|
+
result = 1;
|
|
500
|
+
} else {
|
|
501
|
+
result = generate_csv(writer, generate_rows, columns);
|
|
502
|
+
}
|
|
503
|
+
break;
|
|
504
|
+
|
|
505
|
+
case MODE_TRANSFORM:
|
|
506
|
+
case MODE_CONVERT:
|
|
507
|
+
fprintf(stderr, "Transform/convert modes not yet implemented\n");
|
|
508
|
+
result = 1;
|
|
509
|
+
break;
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
cisv_writer_flush(writer);
|
|
513
|
+
|
|
514
|
+
if (benchmark && result == 0) {
|
|
515
|
+
double elapsed = get_time_seconds() - start_time;
|
|
516
|
+
size_t bytes = cisv_writer_bytes_written(writer);
|
|
517
|
+
size_t rows = cisv_writer_rows_written(writer);
|
|
518
|
+
double mb = bytes / (1024.0 * 1024.0);
|
|
519
|
+
double throughput = mb / elapsed;
|
|
520
|
+
|
|
521
|
+
fprintf(stderr, "\nBenchmark Results:\n");
|
|
522
|
+
fprintf(stderr, " Rows written: %zu\n", rows);
|
|
523
|
+
fprintf(stderr, " Bytes written: %zu (%.2f MB)\n", bytes, mb);
|
|
524
|
+
fprintf(stderr, " Time: %.3f seconds\n", elapsed);
|
|
525
|
+
fprintf(stderr, " Throughput: %.2f MB/s\n", throughput);
|
|
526
|
+
fprintf(stderr, " Rows/sec: %.0f\n", rows / elapsed);
|
|
527
|
+
}
|
|
528
|
+
|
|
529
|
+
cisv_writer_destroy(writer);
|
|
530
|
+
if (output != stdout) fclose(output);
|
|
531
|
+
|
|
532
|
+
return result;
|
|
533
|
+
}
|
|
534
|
+
|
|
535
|
+
#endif // CISV_CLI
|