js-stream-sas7bdat 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/binding.gyp +58 -0
  2. package/package.json +4 -2
  3. package/src/binding/ReadStat/LICENSE +19 -0
  4. package/src/binding/ReadStat/README.md +483 -0
  5. package/src/binding/ReadStat/src/CKHashTable.c +309 -0
  6. package/src/binding/ReadStat/src/CKHashTable.h +37 -0
  7. package/src/binding/ReadStat/src/readstat.h +627 -0
  8. package/src/binding/ReadStat/src/readstat_bits.c +69 -0
  9. package/src/binding/ReadStat/src/readstat_bits.h +20 -0
  10. package/src/binding/ReadStat/src/readstat_convert.c +36 -0
  11. package/src/binding/ReadStat/src/readstat_convert.h +2 -0
  12. package/src/binding/ReadStat/src/readstat_error.c +126 -0
  13. package/src/binding/ReadStat/src/readstat_iconv.h +15 -0
  14. package/src/binding/ReadStat/src/readstat_io_unistd.c +147 -0
  15. package/src/binding/ReadStat/src/readstat_io_unistd.h +11 -0
  16. package/src/binding/ReadStat/src/readstat_malloc.c +34 -0
  17. package/src/binding/ReadStat/src/readstat_malloc.h +4 -0
  18. package/src/binding/ReadStat/src/readstat_metadata.c +53 -0
  19. package/src/binding/ReadStat/src/readstat_parser.c +121 -0
  20. package/src/binding/ReadStat/src/readstat_strings.h +6 -0
  21. package/src/binding/ReadStat/src/readstat_value.c +178 -0
  22. package/src/binding/ReadStat/src/readstat_variable.c +123 -0
  23. package/src/binding/ReadStat/src/readstat_writer.c +677 -0
  24. package/src/binding/ReadStat/src/readstat_writer.h +21 -0
  25. package/src/binding/ReadStat/src/sas/ieee.c +420 -0
  26. package/src/binding/ReadStat/src/sas/ieee.h +6 -0
  27. package/src/binding/ReadStat/src/sas/readstat_sas.c +528 -0
  28. package/src/binding/ReadStat/src/sas/readstat_sas.h +131 -0
  29. package/src/binding/ReadStat/src/sas/readstat_sas7bcat_read.c +515 -0
  30. package/src/binding/ReadStat/src/sas/readstat_sas7bcat_write.c +218 -0
  31. package/src/binding/ReadStat/src/sas/readstat_sas7bdat_read.c +1304 -0
  32. package/src/binding/ReadStat/src/sas/readstat_sas7bdat_write.c +812 -0
  33. package/src/binding/ReadStat/src/sas/readstat_sas_rle.c +286 -0
  34. package/src/binding/ReadStat/src/sas/readstat_sas_rle.h +8 -0
  35. package/src/binding/ReadStat/src/sas/readstat_xport.c +28 -0
  36. package/src/binding/ReadStat/src/sas/readstat_xport.h +47 -0
  37. package/src/binding/ReadStat/src/sas/readstat_xport_parse_format.c +265 -0
  38. package/src/binding/ReadStat/src/sas/readstat_xport_parse_format.h +4 -0
  39. package/src/binding/ReadStat/src/sas/readstat_xport_parse_format.rl +68 -0
  40. package/src/binding/ReadStat/src/sas/readstat_xport_read.c +777 -0
  41. package/src/binding/ReadStat/src/sas/readstat_xport_write.c +561 -0
  42. package/src/binding/readstat_binding.cc +393 -0
@@ -0,0 +1,1304 @@
1
+
2
+ #include <stdio.h>
3
+ #include <stdlib.h>
4
+ #include <errno.h>
5
+ #include <string.h>
6
+ #include <math.h>
7
+ #include <inttypes.h>
8
+ #include "readstat_sas.h"
9
+ #include "readstat_sas_rle.h"
10
+ #include "../readstat_iconv.h"
11
+ #include "../readstat_convert.h"
12
+ #include "../readstat_malloc.h"
13
+
14
+ typedef struct col_info_s { /* Per-column metadata accumulated from the column name/attributes/format subheaders. */
15
+ sas_text_ref_t name_ref; /* text reference resolved into the variable name */
16
+ sas_text_ref_t format_ref; /* text reference resolved into the variable format */
17
+ sas_text_ref_t label_ref; /* text reference resolved into the variable label */
18
+
19
+ int index; /* position of this entry in ctx->col_info, set by the attributes parser */
20
+ uint64_t offset; /* byte offset of the column value inside a row */
21
+ uint32_t width; /* byte width of the column value inside a row */
22
+ int type; /* READSTAT_TYPE_DOUBLE or READSTAT_TYPE_STRING */
23
+ int format_len; /* when non-zero, appended as a decimal suffix to a non-empty format string */
24
+ } col_info_t;
25
+
26
+ typedef struct subheader_pointer_s { /* Decoded form of one subheader pointer entry read from a page. */
27
+ uint64_t offset; /* subheader start; validated against the page size before use */
28
+ uint64_t len; /* subheader length in bytes */
29
+ unsigned char compression; /* compared against the SAS_COMPRESSION_* constants */
30
+ unsigned char is_compressed_data; /* byte immediately following the compression byte in the pointer */
31
+ } subheader_pointer_t;
32
+
33
+ typedef struct sas7bdat_ctx_s { /* Parser state shared by every sas7bdat reading routine in this file. */
34
+ readstat_callbacks_t handle; /* user callbacks (metadata, variable, value, error, progress) */
35
+ int64_t file_size; /* passed to the I/O layer's update hook */
36
+
37
+ int little_endian; /* non-zero when the file stores numbers little-endian */
38
+ int u64; /* non-zero selects the 64-bit subheader layouts */
39
+ int vendor;
40
+ void *user_ctx; /* opaque pointer handed back to every callback */
41
+ readstat_io_t *io;
42
+ int bswap; /* passed to sas_read2/4/8 for every multi-byte read */
43
+ int did_submit_columns; /* set to 1 once sas7bdat_submit_columns has succeeded */
44
+
45
+ uint32_t row_length; /* bytes per (decompressed) row */
46
+ uint32_t page_row_count; /* upper bound on rows parsed per page */
47
+ uint32_t parsed_row_count; /* rows delivered so far */
48
+ uint32_t column_count; /* set once by the column-size subheader */
49
+ uint32_t row_limit; /* parsing stops when parsed_row_count reaches this value */
50
+ uint32_t row_offset; /* rows still to be skipped before delivery starts */
51
+
52
+ uint64_t header_size;
53
+ uint64_t page_count;
54
+ uint64_t page_size;
55
+ char *page; /* heap buffer, released in sas7bdat_ctx_free */
56
+ char *row; /* heap buffer of row_length bytes used for RLE decompression */
57
+
58
+ uint64_t page_header_size;
59
+ uint64_t subheader_signature_size;
60
+ uint64_t subheader_pointer_size;
61
+
62
+ int text_blob_count; /* number of entries in text_blobs / text_blob_lengths */
63
+ size_t *text_blob_lengths; /* length of each text blob, parallel to text_blobs */
64
+ char **text_blobs; /* copies of the column-text subheader payloads */
65
+
66
+ int col_names_count; /* running total of names seen across name subheaders */
67
+ int col_attrs_count; /* running total of attribute entries seen */
68
+ int col_formats_count; /* running total of format subheaders seen */
69
+
70
+ size_t max_col_width; /* widest column seen; sizes the scratch buffer */
71
+ char *scratch_buffer; /* destination for converted string values */
72
+ size_t scratch_buffer_len;
73
+
74
+ int col_info_count; /* number of entries allocated in col_info */
75
+ col_info_t *col_info;
76
+
77
+ readstat_variable_t **variables; /* column_count entries, allocated when columns are submitted */
78
+
79
+ const char *input_encoding;
80
+ const char *output_encoding;
81
+ iconv_t converter; /* closed with iconv_close in sas7bdat_ctx_free when non-zero */
82
+
83
+ time_t ctime;
84
+ time_t mtime;
85
+ int version;
86
+ char table_name[4*32+1];
87
+ char file_label[4*256+1];
88
+ char error_buf[2048]; /* formatting area for messages passed to handle.error */
89
+
90
+ unsigned int rdc_compression:1; /* set when the compression text matches SAS_COMPRESSION_SIGNATURE_RDC */
91
+ } sas7bdat_ctx_t;
92
+
93
+ static void sas7bdat_ctx_free(sas7bdat_ctx_t *ctx) {
94
+ int i;
95
+ if (ctx->text_blobs) {
96
+ for (i=0; i<ctx->text_blob_count; i++) {
97
+ free(ctx->text_blobs[i]);
98
+ }
99
+ free(ctx->text_blobs);
100
+ free(ctx->text_blob_lengths);
101
+ }
102
+ if (ctx->variables) {
103
+ for (i=0; i<ctx->column_count; i++) {
104
+ if (ctx->variables[i])
105
+ free(ctx->variables[i]);
106
+ }
107
+ free(ctx->variables);
108
+ }
109
+ if (ctx->col_info)
110
+ free(ctx->col_info);
111
+
112
+ if (ctx->scratch_buffer)
113
+ free(ctx->scratch_buffer);
114
+
115
+ if (ctx->page)
116
+ free(ctx->page);
117
+
118
+ if (ctx->row)
119
+ free(ctx->row);
120
+
121
+ if (ctx->converter)
122
+ iconv_close(ctx->converter);
123
+
124
+ free(ctx);
125
+ }
126
+
127
+ static readstat_error_t sas7bdat_update_progress(sas7bdat_ctx_t *ctx) {
128
+ readstat_io_t *io = ctx->io;
129
+ return io->update(ctx->file_size, ctx->handle.progress, ctx->user_ctx, io->io_ctx);
130
+ }
131
+
132
+ static sas_text_ref_t sas7bdat_parse_text_ref(const char *data, sas7bdat_ctx_t *ctx) {
133
+ sas_text_ref_t ref;
134
+
135
+ ref.index = sas_read2(&data[0], ctx->bswap);
136
+ ref.offset = sas_read2(&data[2], ctx->bswap);
137
+ ref.length = sas_read2(&data[4], ctx->bswap);
138
+
139
+ return ref;
140
+ }
141
+
142
+ static readstat_error_t sas7bdat_copy_text_ref(char *out_buffer, size_t out_buffer_len, sas_text_ref_t text_ref, sas7bdat_ctx_t *ctx) {
143
+ if (text_ref.index >= ctx->text_blob_count)
144
+ return READSTAT_ERROR_PARSE;
145
+
146
+ if (text_ref.length == 0) {
147
+ out_buffer[0] = '\0';
148
+ return READSTAT_OK;
149
+ }
150
+
151
+ char *blob = ctx->text_blobs[text_ref.index];
152
+
153
+ if (text_ref.offset + text_ref.length > ctx->text_blob_lengths[text_ref.index])
154
+ return READSTAT_ERROR_PARSE;
155
+
156
+ return readstat_convert(out_buffer, out_buffer_len, &blob[text_ref.offset], text_ref.length,
157
+ ctx->converter);
158
+ }
159
+
160
+ static readstat_error_t sas7bdat_parse_column_text_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
161
+ readstat_error_t retval = READSTAT_OK;
162
+ size_t signature_len = ctx->subheader_signature_size;
163
+ uint16_t remainder = sas_read2(&subheader[signature_len], ctx->bswap);
164
+ char *blob = NULL;
165
+ if (remainder != sas_subheader_remainder(len, signature_len)) {
166
+ retval = READSTAT_ERROR_PARSE;
167
+ goto cleanup;
168
+ }
169
+ ctx->text_blob_count++;
170
+ ctx->text_blobs = readstat_realloc(ctx->text_blobs, ctx->text_blob_count * sizeof(char *));
171
+ ctx->text_blob_lengths = readstat_realloc(ctx->text_blob_lengths,
172
+ ctx->text_blob_count * sizeof(ctx->text_blob_lengths[0]));
173
+ if (ctx->text_blobs == NULL || ctx->text_blob_lengths == NULL) {
174
+ retval = READSTAT_ERROR_MALLOC;
175
+ goto cleanup;
176
+ }
177
+
178
+ if ((blob = readstat_malloc(len-signature_len)) == NULL) {
179
+ retval = READSTAT_ERROR_MALLOC;
180
+ goto cleanup;
181
+ }
182
+ memcpy(blob, subheader+signature_len, len-signature_len);
183
+ ctx->text_blob_lengths[ctx->text_blob_count-1] = len-signature_len;
184
+ ctx->text_blobs[ctx->text_blob_count-1] = blob;
185
+
186
+ cleanup:
187
+ return retval;
188
+ }
189
+
190
+ static readstat_error_t sas7bdat_realloc_col_info(sas7bdat_ctx_t *ctx, size_t count) {
191
+ if (ctx->col_info_count < count) {
192
+ size_t old_count = ctx->col_info_count;
193
+ ctx->col_info_count = count;
194
+ ctx->col_info = readstat_realloc(ctx->col_info, ctx->col_info_count * sizeof(col_info_t));
195
+ if (ctx->col_info == NULL) {
196
+ return READSTAT_ERROR_MALLOC;
197
+ }
198
+ memset(ctx->col_info + old_count, 0, (count - old_count) * sizeof(col_info_t));
199
+ }
200
+ return READSTAT_OK;
201
+ }
202
+
203
+ static readstat_error_t sas7bdat_parse_column_size_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
204
+ uint64_t col_count;
205
+ readstat_error_t retval = READSTAT_OK;
206
+
207
+ if (ctx->column_count || ctx->did_submit_columns) {
208
+ retval = READSTAT_ERROR_PARSE;
209
+ goto cleanup;
210
+ }
211
+
212
+ if (len < (ctx->u64 ? 16 : 8)) {
213
+ retval = READSTAT_ERROR_PARSE;
214
+ goto cleanup;
215
+ }
216
+
217
+ if (ctx->u64) {
218
+ col_count = sas_read8(&subheader[8], ctx->bswap);
219
+ } else {
220
+ col_count = sas_read4(&subheader[4], ctx->bswap);
221
+ }
222
+
223
+ ctx->column_count = col_count;
224
+
225
+ retval = sas7bdat_realloc_col_info(ctx, ctx->column_count);
226
+
227
+ cleanup:
228
+ return retval;
229
+ }
230
+
231
+ static readstat_error_t sas7bdat_parse_row_size_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) { /* Reads row length, row counts, file label and compression name from a row-size subheader into ctx. */
231
+ readstat_error_t retval = READSTAT_OK;
232
+ uint64_t total_row_count;
233
+ uint64_t row_length, page_row_count;
234
+
235
+ if (len < (ctx->u64 ? 250: 190)) { /* minimum length accepted for the 64-bit / 32-bit layout */
236
+ retval = READSTAT_ERROR_PARSE;
237
+ goto cleanup;
238
+ }
239
+
240
+ if (ctx->u64) { /* 64-bit layout: 8-byte fields */
241
+ row_length = sas_read8(&subheader[40], ctx->bswap);
242
+ total_row_count = sas_read8(&subheader[48], ctx->bswap);
243
+ page_row_count = sas_read8(&subheader[120], ctx->bswap);
244
+ } else { /* 32-bit layout: 4-byte fields at half the offsets */
245
+ row_length = sas_read4(&subheader[20], ctx->bswap);
246
+ total_row_count = sas_read4(&subheader[24], ctx->bswap);
247
+ page_row_count = sas_read4(&subheader[60], ctx->bswap);
248
+ }
249
+
250
+ sas_text_ref_t file_label_ref = sas7bdat_parse_text_ref(&subheader[len-130], ctx); /* located relative to the END of the subheader */
251
+ if (file_label_ref.length) {
252
+ if ((retval = sas7bdat_copy_text_ref(ctx->file_label, sizeof(ctx->file_label),
253
+ file_label_ref, ctx)) != READSTAT_OK) {
254
+ goto cleanup;
255
+ }
256
+ }
257
+
258
+ sas_text_ref_t compression_ref = sas7bdat_parse_text_ref(&subheader[len-118], ctx); /* also addressed from the end of the subheader */
259
+ if (compression_ref.length) {
260
+ char compression[9];
261
+ if ((retval = sas7bdat_copy_text_ref(compression, sizeof(compression),
262
+ compression_ref, ctx)) != READSTAT_OK) {
263
+ goto cleanup;
264
+ }
265
+ ctx->rdc_compression = (memcmp(compression, SAS_COMPRESSION_SIGNATURE_RDC, 8) == 0); /* compares the first 8 bytes of the compression name */
266
+ }
267
+
268
+ ctx->row_length = row_length; /* NOTE(review): uint64_t narrowed to the uint32_t context field */
269
+ ctx->row = readstat_realloc(ctx->row, ctx->row_length); /* row buffer later used as the RLE decompression target */
270
+ if (ctx->row == NULL) {
271
+ retval = READSTAT_ERROR_MALLOC;
272
+ goto cleanup;
273
+ }
274
+
275
+ ctx->page_row_count = page_row_count;
276
+ uint64_t total_row_count_after_skipping = total_row_count;
277
+ if (total_row_count > ctx->row_offset) {
278
+ total_row_count_after_skipping -= ctx->row_offset;
279
+ } else { /* requested offset is at or past the end: nothing left to deliver */
280
+ total_row_count_after_skipping = 0;
281
+ ctx->row_offset = total_row_count;
282
+ }
283
+ if (ctx->row_limit == 0 || total_row_count_after_skipping < ctx->row_limit) /* a row_limit of 0 means "no limit requested" here */
284
+ ctx->row_limit = total_row_count_after_skipping;
285
+
286
+ cleanup:
287
+ return retval;
288
+ }
290
+
291
+ static readstat_error_t sas7bdat_parse_column_name_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
292
+ readstat_error_t retval = READSTAT_OK;
293
+ size_t signature_len = ctx->subheader_signature_size;
294
+ int cmax = ctx->u64 ? (len-28)/8 : (len-20)/8;
295
+ int i;
296
+ const char *cnp = &subheader[signature_len+8];
297
+ uint16_t remainder = sas_read2(&subheader[signature_len], ctx->bswap);
298
+
299
+ if (remainder != sas_subheader_remainder(len, signature_len)) {
300
+ retval = READSTAT_ERROR_PARSE;
301
+ goto cleanup;
302
+ }
303
+
304
+ ctx->col_names_count += cmax;
305
+
306
+ if ((retval = sas7bdat_realloc_col_info(ctx, ctx->col_names_count)) != READSTAT_OK)
307
+ goto cleanup;
308
+
309
+ for (i=ctx->col_names_count-cmax; i<ctx->col_names_count; i++) {
310
+ ctx->col_info[i].name_ref = sas7bdat_parse_text_ref(cnp, ctx);
311
+ cnp += 8;
312
+ }
313
+
314
+ cleanup:
315
+
316
+ return retval;
317
+ }
318
+
319
+ static readstat_error_t sas7bdat_parse_column_attributes_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
320
+ readstat_error_t retval = READSTAT_OK;
321
+ size_t signature_len = ctx->subheader_signature_size;
322
+ int cmax = ctx->u64 ? (len-28)/16 : (len-20)/12;
323
+ int i;
324
+ const char *cap = &subheader[signature_len+8];
325
+ uint16_t remainder = sas_read2(&subheader[signature_len], ctx->bswap);
326
+
327
+ if (remainder != sas_subheader_remainder(len, signature_len)) {
328
+ retval = READSTAT_ERROR_PARSE;
329
+ goto cleanup;
330
+ }
331
+ ctx->col_attrs_count += cmax;
332
+ if ((retval = sas7bdat_realloc_col_info(ctx, ctx->col_attrs_count)) != READSTAT_OK)
333
+ goto cleanup;
334
+
335
+ for (i=ctx->col_attrs_count-cmax; i<ctx->col_attrs_count; i++) {
336
+ if (ctx->u64) {
337
+ ctx->col_info[i].offset = sas_read8(&cap[0], ctx->bswap);
338
+ } else {
339
+ ctx->col_info[i].offset = sas_read4(&cap[0], ctx->bswap);
340
+ }
341
+
342
+ readstat_off_t off=4;
343
+ if (ctx->u64)
344
+ off=8;
345
+
346
+ ctx->col_info[i].width = sas_read4(&cap[off], ctx->bswap);
347
+ if (ctx->col_info[i].width > ctx->max_col_width)
348
+ ctx->max_col_width = ctx->col_info[i].width;
349
+
350
+ if (cap[off+6] == SAS_COLUMN_TYPE_NUM) {
351
+ ctx->col_info[i].type = READSTAT_TYPE_DOUBLE;
352
+ } else if (cap[off+6] == SAS_COLUMN_TYPE_CHR) {
353
+ ctx->col_info[i].type = READSTAT_TYPE_STRING;
354
+ } else {
355
+ retval = READSTAT_ERROR_PARSE;
356
+ goto cleanup;
357
+ }
358
+ ctx->col_info[i].index = i;
359
+ cap += off+8;
360
+ }
361
+
362
+ cleanup:
363
+
364
+ return retval;
365
+ }
366
+
367
+ static readstat_error_t sas7bdat_parse_column_format_subheader(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
368
+ readstat_error_t retval = READSTAT_OK;
369
+
370
+ if (len < (ctx->u64 ? 58 : 46)) {
371
+ retval = READSTAT_ERROR_PARSE;
372
+ goto cleanup;
373
+ }
374
+
375
+ ctx->col_formats_count++;
376
+ if ((retval = sas7bdat_realloc_col_info(ctx, ctx->col_formats_count)) != READSTAT_OK)
377
+ goto cleanup;
378
+
379
+ if (ctx->u64)
380
+ ctx->col_info[ctx->col_formats_count-1].format_len = sas_read2(&subheader[24], ctx->bswap);
381
+ ctx->col_info[ctx->col_formats_count-1].format_ref = sas7bdat_parse_text_ref(
382
+ ctx->u64 ? &subheader[46] : &subheader[34], ctx);
383
+ ctx->col_info[ctx->col_formats_count-1].label_ref = sas7bdat_parse_text_ref(
384
+ ctx->u64 ? &subheader[52] : &subheader[40], ctx);
385
+
386
+ cleanup:
387
+ return retval;
388
+ }
389
+
390
+ static readstat_error_t sas7bdat_handle_data_value(readstat_variable_t *variable,
391
+ col_info_t *col_info, const char *col_data, sas7bdat_ctx_t *ctx) {
392
+ readstat_error_t retval = READSTAT_OK;
393
+ int cb_retval = 0;
394
+ readstat_value_t value;
395
+ memset(&value, 0, sizeof(readstat_value_t));
396
+
397
+ value.type = col_info->type;
398
+
399
+ if (col_info->type == READSTAT_TYPE_STRING) {
400
+ retval = readstat_convert(ctx->scratch_buffer, ctx->scratch_buffer_len,
401
+ col_data, col_info->width, ctx->converter);
402
+ if (retval != READSTAT_OK) {
403
+ if (ctx->handle.error) {
404
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf),
405
+ "ReadStat: Error converting string (row=%u, col=%u) to specified encoding: %.*s",
406
+ ctx->parsed_row_count+1, col_info->index+1, col_info->width, col_data);
407
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
408
+ }
409
+ goto cleanup;
410
+ }
411
+
412
+ value.v.string_value = ctx->scratch_buffer;
413
+ } else if (col_info->type == READSTAT_TYPE_DOUBLE) {
414
+ uint64_t val = 0;
415
+ double dval = NAN;
416
+ if (ctx->little_endian) {
417
+ int k;
418
+ for (k=0; k<col_info->width; k++) {
419
+ val = (val << 8) | (unsigned char)col_data[col_info->width-1-k];
420
+ }
421
+ } else {
422
+ int k;
423
+ for (k=0; k<col_info->width; k++) {
424
+ val = (val << 8) | (unsigned char)col_data[k];
425
+ }
426
+ }
427
+ val <<= (8-col_info->width)*8;
428
+
429
+ memcpy(&dval, &val, 8);
430
+
431
+ if (isnan(dval)) {
432
+ value.v.double_value = NAN;
433
+ sas_assign_tag(&value, ~((val >> 40) & 0xFF));
434
+ } else {
435
+ value.v.double_value = dval;
436
+ }
437
+ }
438
+ cb_retval = ctx->handle.value(ctx->parsed_row_count, variable, value, ctx->user_ctx);
439
+
440
+ if (cb_retval != READSTAT_HANDLER_OK)
441
+ retval = READSTAT_ERROR_USER_ABORT;
442
+
443
+ cleanup:
444
+ return retval;
445
+ }
446
+
447
+ static readstat_error_t sas7bdat_parse_single_row(const char *data, sas7bdat_ctx_t *ctx) {
448
+ if (ctx->parsed_row_count == ctx->row_limit)
449
+ return READSTAT_OK;
450
+ if (ctx->row_offset) {
451
+ ctx->row_offset--;
452
+ return READSTAT_OK;
453
+ }
454
+
455
+ readstat_error_t retval = READSTAT_OK;
456
+ int j;
457
+ if (ctx->handle.value) {
458
+ ctx->scratch_buffer_len = 4*ctx->max_col_width+1;
459
+ ctx->scratch_buffer = readstat_realloc(ctx->scratch_buffer, ctx->scratch_buffer_len);
460
+ if (ctx->scratch_buffer == NULL) {
461
+ retval = READSTAT_ERROR_MALLOC;
462
+ goto cleanup;
463
+ }
464
+
465
+ for (j=0; j<ctx->column_count; j++) {
466
+ col_info_t *col_info = &ctx->col_info[j];
467
+ readstat_variable_t *variable = ctx->variables[j];
468
+ if (variable->skip)
469
+ continue;
470
+
471
+ if (col_info->offset > ctx->row_length || col_info->offset + col_info->width > ctx->row_length) {
472
+ retval = READSTAT_ERROR_PARSE;
473
+ goto cleanup;
474
+ }
475
+ retval = sas7bdat_handle_data_value(variable, col_info, &data[col_info->offset], ctx);
476
+ if (retval != READSTAT_OK) {
477
+ goto cleanup;
478
+ }
479
+ }
480
+ }
481
+ ctx->parsed_row_count++;
482
+
483
+ cleanup:
484
+ return retval;
485
+ }
486
+
487
+ static readstat_error_t sas7bdat_parse_rows(const char *data, size_t len, sas7bdat_ctx_t *ctx) {
488
+ readstat_error_t retval = READSTAT_OK;
489
+ int i;
490
+ size_t row_offset=0;
491
+ for (i=0; i<ctx->page_row_count && ctx->parsed_row_count < ctx->row_limit; i++) {
492
+ if (row_offset + ctx->row_length > len) {
493
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
494
+ goto cleanup;
495
+ }
496
+ if ((retval = sas7bdat_parse_single_row(&data[row_offset], ctx)) != READSTAT_OK)
497
+ goto cleanup;
498
+
499
+ row_offset += ctx->row_length;
500
+ }
501
+
502
+ cleanup:
503
+ return retval;
504
+ }
505
+
506
+ static readstat_error_t sas7bdat_parse_subheader_rdc(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
507
+ readstat_error_t retval = READSTAT_OK;
508
+ const unsigned char *input = (const unsigned char *)subheader;
509
+ char *buffer = malloc(ctx->row_length);
510
+ char *output = buffer;
511
+ while (input + 2 <= (const unsigned char *)subheader + len) {
512
+ int i;
513
+ unsigned short prefix = (input[0] << 8) + input[1];
514
+ input += 2;
515
+ for (i=0; i<16; i++) {
516
+ if ((prefix & (1 << (15 - i))) == 0) {
517
+ if (input + 1 > (const unsigned char *)subheader + len) {
518
+ break;
519
+ }
520
+ if (output + 1 > buffer + ctx->row_length) {
521
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
522
+ goto cleanup;
523
+ }
524
+ *output++ = *input++;
525
+ continue;
526
+ }
527
+
528
+ if (input + 2 > (const unsigned char *)subheader + len) {
529
+ retval = READSTAT_ERROR_PARSE;
530
+ goto cleanup;
531
+ }
532
+
533
+ unsigned char marker_byte = *input++;
534
+ unsigned char next_byte = *input++;
535
+ size_t insert_len = 0, copy_len = 0;
536
+ unsigned char insert_byte = 0x00;
537
+ size_t back_offset = 0;
538
+
539
+ if (marker_byte <= 0x0F) {
540
+ insert_len = 3 + marker_byte;
541
+ insert_byte = next_byte;
542
+ } else if ((marker_byte >> 4) == 1) {
543
+ if (input + 1 > (const unsigned char *)subheader + len) {
544
+ retval = READSTAT_ERROR_PARSE;
545
+ goto cleanup;
546
+ }
547
+ insert_len = 19 + (marker_byte & 0x0F) + next_byte * 16;
548
+ insert_byte = *input++;
549
+ } else if ((marker_byte >> 4) == 2) {
550
+ if (input + 1 > (const unsigned char *)subheader + len) {
551
+ retval = READSTAT_ERROR_PARSE;
552
+ goto cleanup;
553
+ }
554
+ copy_len = 16 + (*input++);
555
+ back_offset = 3 + (marker_byte & 0x0F) + next_byte * 16;
556
+ } else {
557
+ copy_len = (marker_byte >> 4);
558
+ back_offset = 3 + (marker_byte & 0x0F) + next_byte * 16;
559
+ }
560
+
561
+ if (insert_len) {
562
+ if (output + insert_len > buffer + ctx->row_length) {
563
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
564
+ goto cleanup;
565
+ }
566
+ memset(output, insert_byte, insert_len);
567
+ output += insert_len;
568
+ } else if (copy_len) {
569
+ if (output - buffer < back_offset || copy_len > back_offset) {
570
+ retval = READSTAT_ERROR_PARSE;
571
+ goto cleanup;
572
+ }
573
+ if (output + copy_len > buffer + ctx->row_length) {
574
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
575
+ goto cleanup;
576
+ }
577
+ memcpy(output, output - back_offset, copy_len);
578
+ output += copy_len;
579
+ }
580
+ }
581
+ }
582
+
583
+ if (output - buffer != ctx->row_length) {
584
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
585
+ goto cleanup;
586
+ }
587
+ retval = sas7bdat_parse_single_row(buffer, ctx);
588
+ cleanup:
589
+ free(buffer);
590
+
591
+ return retval;
592
+ }
593
+
594
+ static readstat_error_t sas7bdat_parse_subheader_rle(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
595
+ if (ctx->row_limit == ctx->parsed_row_count)
596
+ return READSTAT_OK;
597
+
598
+ readstat_error_t retval = READSTAT_OK;
599
+ ssize_t bytes_decompressed = 0;
600
+
601
+ bytes_decompressed = sas_rle_decompress(ctx->row, ctx->row_length, subheader, len);
602
+
603
+ if (bytes_decompressed != ctx->row_length) {
604
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
605
+ if (ctx->handle.error) {
606
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf),
607
+ "ReadStat: Row #%d decompressed to %ld bytes (expected %d bytes)",
608
+ ctx->parsed_row_count, (long)(bytes_decompressed), ctx->row_length);
609
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
610
+ }
611
+ goto cleanup;
612
+ }
613
+ retval = sas7bdat_parse_single_row(ctx->row, ctx);
614
+
615
+ cleanup:
616
+ return retval;
617
+ }
618
+
619
+ static readstat_error_t sas7bdat_parse_subheader_compressed(const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
620
+ if (ctx->rdc_compression)
621
+ return sas7bdat_parse_subheader_rdc(subheader, len, ctx);
622
+
623
+ return sas7bdat_parse_subheader_rle(subheader, len, ctx);
624
+ }
625
+
626
+ static readstat_error_t sas7bdat_parse_subheader(uint32_t signature, const char *subheader, size_t len, sas7bdat_ctx_t *ctx) {
627
+ readstat_error_t retval = READSTAT_OK;
628
+
629
+ if (len < 2 + ctx->subheader_signature_size) {
630
+ retval = READSTAT_ERROR_PARSE;
631
+ goto cleanup;
632
+ }
633
+ if (signature == SAS_SUBHEADER_SIGNATURE_ROW_SIZE) {
634
+ retval = sas7bdat_parse_row_size_subheader(subheader, len, ctx);
635
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_SIZE) {
636
+ retval = sas7bdat_parse_column_size_subheader(subheader, len, ctx);
637
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COUNTS) {
638
+ /* void */
639
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_TEXT) {
640
+ retval = sas7bdat_parse_column_text_subheader(subheader, len, ctx);
641
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_NAME) {
642
+ retval = sas7bdat_parse_column_name_subheader(subheader, len, ctx);
643
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_ATTRS) {
644
+ retval = sas7bdat_parse_column_attributes_subheader(subheader, len, ctx);
645
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_FORMAT) {
646
+ retval = sas7bdat_parse_column_format_subheader(subheader, len, ctx);
647
+ } else if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_LIST) {
648
+ /* void */
649
+ } else if ((signature & SAS_SUBHEADER_SIGNATURE_COLUMN_MASK) == SAS_SUBHEADER_SIGNATURE_COLUMN_MASK) {
650
+ /* void */
651
+ } else {
652
+ retval = READSTAT_ERROR_PARSE;
653
+ }
654
+
655
+ cleanup:
656
+
657
+ return retval;
658
+ }
659
+
660
+ static readstat_error_t sas7bdat_validate_column(col_info_t *col_info) {
661
+ if (col_info->type == READSTAT_TYPE_DOUBLE) {
662
+ if (col_info->width > 8 || col_info->width < 3) {
663
+ return READSTAT_ERROR_PARSE;
664
+ }
665
+ }
666
+ if (col_info->type == READSTAT_TYPE_STRING) {
667
+ if (col_info->width > INT16_MAX || col_info->width == 0) {
668
+ return READSTAT_ERROR_PARSE;
669
+ }
670
+ }
671
+ return READSTAT_OK;
672
+ }
673
+
674
+ static readstat_variable_t *sas7bdat_init_variable(sas7bdat_ctx_t *ctx, int i,
675
+ int index_after_skipping, readstat_error_t *out_retval) {
676
+ readstat_error_t retval = READSTAT_OK;
677
+ readstat_variable_t *variable = readstat_calloc(1, sizeof(readstat_variable_t));
678
+
679
+ variable->index = i;
680
+ variable->index_after_skipping = index_after_skipping;
681
+ variable->type = ctx->col_info[i].type;
682
+ variable->storage_width = ctx->col_info[i].width;
683
+
684
+ if ((retval = sas7bdat_validate_column(&ctx->col_info[i])) != READSTAT_OK) {
685
+ goto cleanup;
686
+ }
687
+ if ((retval = sas7bdat_copy_text_ref(variable->name, sizeof(variable->name),
688
+ ctx->col_info[i].name_ref, ctx)) != READSTAT_OK) {
689
+ goto cleanup;
690
+ }
691
+ if ((retval = sas7bdat_copy_text_ref(variable->format, sizeof(variable->format),
692
+ ctx->col_info[i].format_ref, ctx)) != READSTAT_OK) {
693
+ goto cleanup;
694
+ }
695
+ size_t len = strlen(variable->format);
696
+ if (len && ctx->col_info[i].format_len) {
697
+ snprintf(variable->format + len, sizeof(variable->format) - len, "%d", ctx->col_info[i].format_len);
698
+ }
699
+ if ((retval = sas7bdat_copy_text_ref(variable->label, sizeof(variable->label),
700
+ ctx->col_info[i].label_ref, ctx)) != READSTAT_OK) {
701
+ goto cleanup;
702
+ }
703
+
704
+ cleanup:
705
+ if (retval != READSTAT_OK) {
706
+ if (out_retval)
707
+ *out_retval = retval;
708
+
709
+ if (retval == READSTAT_ERROR_CONVERT_BAD_STRING) {
710
+ if (ctx->handle.error) {
711
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf),
712
+ "ReadStat: Error converting variable #%d info to specified encoding: %s %s (%s)",
713
+ i, variable->name, variable->format, variable->label);
714
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
715
+ }
716
+ }
717
+
718
+ free(variable);
719
+
720
+ return NULL;
721
+ }
722
+
723
+ return variable;
724
+ }
725
+
726
+ static readstat_error_t sas7bdat_submit_columns(sas7bdat_ctx_t *ctx, int compressed) { /* Reports file metadata, then builds and reports one variable per column; compressed selects the compression value reported. */
727
+ readstat_error_t retval = READSTAT_OK;
728
+ if (ctx->handle.metadata) { /* metadata is only assembled when a handler is installed */
729
+ readstat_metadata_t metadata = {
730
+ .row_count = ctx->row_limit,
731
+ .var_count = ctx->column_count,
732
+ .table_name = ctx->table_name,
733
+ .file_label = ctx->file_label,
734
+ .file_encoding = ctx->input_encoding, /* orig encoding? */
735
+ .creation_time = ctx->ctime,
736
+ .modified_time = ctx->mtime,
737
+ .file_format_version = ctx->version,
738
+ .compression = READSTAT_COMPRESS_NONE,
739
+ .endianness = ctx->little_endian ? READSTAT_ENDIAN_LITTLE : READSTAT_ENDIAN_BIG,
740
+ .is64bit = ctx->u64
741
+ };
742
+ if (compressed) { /* RDC is reported as BINARY, anything else as ROWS */
743
+ if (ctx->rdc_compression) {
744
+ metadata.compression = READSTAT_COMPRESS_BINARY;
745
+ } else {
746
+ metadata.compression = READSTAT_COMPRESS_ROWS;
747
+ }
748
+ }
749
+ if (ctx->handle.metadata(&metadata, ctx->user_ctx) != READSTAT_HANDLER_OK) {
750
+ retval = READSTAT_ERROR_USER_ABORT;
751
+ goto cleanup;
752
+ }
753
+ }
754
+ if (ctx->column_count == 0) /* no columns: nothing to allocate or report */
755
+ goto cleanup;
756
+
757
+ if ((ctx->variables = readstat_calloc(ctx->column_count, sizeof(readstat_variable_t *))) == NULL) {
758
+ retval = READSTAT_ERROR_MALLOC;
759
+ goto cleanup;
760
+ }
761
+ int i;
762
+ int index_after_skipping = 0; /* counts variables the handler did not ask to skip */
763
+ for (i=0; i<ctx->column_count; i++) {
764
+ ctx->variables[i] = sas7bdat_init_variable(ctx, i, index_after_skipping, &retval); /* on failure retval receives the error code */
765
+ if (ctx->variables[i] == NULL)
766
+ break;
767
+
768
+ int cb_retval = READSTAT_HANDLER_OK;
769
+ if (ctx->handle.variable) {
770
+ cb_retval = ctx->handle.variable(i, ctx->variables[i], ctx->variables[i]->format, ctx->user_ctx);
771
+ }
772
+ if (cb_retval == READSTAT_HANDLER_ABORT) {
773
+ retval = READSTAT_ERROR_USER_ABORT;
774
+ goto cleanup;
775
+ }
776
+
777
+ if (cb_retval == READSTAT_HANDLER_SKIP_VARIABLE) {
778
+ ctx->variables[i]->skip = 1; /* skipped variables are ignored by sas7bdat_parse_single_row */
779
+ } else {
780
+ index_after_skipping++;
781
+ }
782
+ }
783
+ cleanup:
784
+ return retval;
785
+ }
786
+
787
+ static readstat_error_t sas7bdat_submit_columns_if_needed(sas7bdat_ctx_t *ctx, int compressed) {
788
+ readstat_error_t retval = READSTAT_OK;
789
+ if (!ctx->did_submit_columns) {
790
+ if ((retval = sas7bdat_submit_columns(ctx, compressed)) != READSTAT_OK) {
791
+ goto cleanup;
792
+ }
793
+ ctx->did_submit_columns = 1;
794
+ }
795
+ cleanup:
796
+ return retval;
797
+ }
798
+
799
+ static int sas7bdat_signature_is_recognized(uint32_t signature) {
800
+ return (signature == SAS_SUBHEADER_SIGNATURE_ROW_SIZE ||
801
+ signature == SAS_SUBHEADER_SIGNATURE_COLUMN_SIZE ||
802
+ signature == SAS_SUBHEADER_SIGNATURE_COUNTS ||
803
+ signature == SAS_SUBHEADER_SIGNATURE_COLUMN_FORMAT ||
804
+ (signature & SAS_SUBHEADER_SIGNATURE_COLUMN_MASK) == SAS_SUBHEADER_SIGNATURE_COLUMN_MASK);
805
+ }
806
+
807
+ static readstat_error_t sas7bdat_parse_subheader_pointer(const char *shp, size_t shp_size,
808
+ subheader_pointer_t *info, sas7bdat_ctx_t *ctx) {
809
+ readstat_error_t retval = READSTAT_OK;
810
+ if (ctx->u64) {
811
+ if (shp_size <= 17) {
812
+ retval = READSTAT_ERROR_PARSE;
813
+ goto cleanup;
814
+ }
815
+ info->offset = sas_read8(&shp[0], ctx->bswap);
816
+ info->len = sas_read8(&shp[8], ctx->bswap);
817
+ info->compression = shp[16];
818
+ info->is_compressed_data = shp[17];
819
+ } else {
820
+ if (shp_size <= 9) {
821
+ retval = READSTAT_ERROR_PARSE;
822
+ goto cleanup;
823
+ }
824
+ info->offset = sas_read4(&shp[0], ctx->bswap);
825
+ info->len = sas_read4(&shp[4], ctx->bswap);
826
+ info->compression = shp[8];
827
+ info->is_compressed_data = shp[9];
828
+ }
829
+ cleanup:
830
+ return retval;
831
+ }
832
+
833
+ static readstat_error_t sas7bdat_validate_subheader_pointer(subheader_pointer_t *shp_info, size_t page_size,
834
+ uint16_t subheader_count, sas7bdat_ctx_t *ctx) {
835
+ if (shp_info->offset > page_size)
836
+ return READSTAT_ERROR_PARSE;
837
+ if (shp_info->len > page_size)
838
+ return READSTAT_ERROR_PARSE;
839
+ if (shp_info->offset + shp_info->len > page_size)
840
+ return READSTAT_ERROR_PARSE;
841
+ if (shp_info->offset < ctx->page_header_size + subheader_count*ctx->subheader_pointer_size)
842
+ return READSTAT_ERROR_PARSE;
843
+ if (shp_info->compression == SAS_COMPRESSION_NONE) {
844
+ if (shp_info->len < ctx->subheader_signature_size)
845
+ return READSTAT_ERROR_PARSE;
846
+ if (shp_info->offset + ctx->subheader_signature_size > page_size)
847
+ return READSTAT_ERROR_PARSE;
848
+ }
849
+
850
+ return READSTAT_OK;
851
+ }
852
+
853
+ /* First pass over a page: walk its subheader pointers and store only the column-text subheaders, so that text references can be resolved later. */
854
+ static readstat_error_t sas7bdat_parse_page_pass1(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
855
+ readstat_error_t retval = READSTAT_OK;
856
+
857
+ uint16_t subheader_count = sas_read2(&page[ctx->page_header_size-4], ctx->bswap); /* stored 4 bytes before the end of the page header */
858
+
859
+ int i;
860
+ const char *shp = &page[ctx->page_header_size]; /* pointer table starts right after the page header */
861
+ int lshp = ctx->subheader_pointer_size;
862
+
863
+ if (ctx->page_header_size + subheader_count*lshp > page_size) { /* the whole pointer table must fit in the page */
864
+ retval = READSTAT_ERROR_PARSE;
865
+ goto cleanup;
866
+ }
867
+
868
+ for (i=0; i<subheader_count; i++) {
869
+ subheader_pointer_t shp_info = { 0 };
870
+ uint32_t signature = 0;
871
+ size_t signature_len = ctx->subheader_signature_size;
872
+ if ((retval = sas7bdat_parse_subheader_pointer(shp, page + page_size - shp, &shp_info, ctx)) != READSTAT_OK) { /* second argument: bytes remaining in the page from shp */
873
+ goto cleanup;
874
+ }
875
+ if (shp_info.len > 0 && shp_info.compression != SAS_COMPRESSION_TRUNC) { /* empty and TRUNC entries are skipped */
876
+ if ((retval = sas7bdat_validate_subheader_pointer(&shp_info, page_size, subheader_count, ctx)) != READSTAT_OK) {
877
+ goto cleanup;
878
+ }
879
+ if (shp_info.compression == SAS_COMPRESSION_NONE) {
880
+ signature = sas_read4(page + shp_info.offset, ctx->bswap);
881
+ if (!ctx->little_endian && signature == -1 && signature_len == 8) { /* big-endian 8-byte signature whose first word is all ones: use the second word */
882
+ signature = sas_read4(page + shp_info.offset + 4, ctx->bswap);
883
+ }
884
+ if (signature == SAS_SUBHEADER_SIGNATURE_COLUMN_TEXT) { /* every other signature is ignored in this pass */
885
+ if ((retval = sas7bdat_parse_subheader(signature, page + shp_info.offset, shp_info.len, ctx))
886
+ != READSTAT_OK) {
887
+ goto cleanup;
888
+ }
889
+ }
890
+ } else if (shp_info.compression == SAS_COMPRESSION_ROW) {
891
+ /* compressed row data: nothing to do in this pass */
892
+ } else {
893
+ retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
894
+ goto cleanup;
895
+ }
896
+ }
897
+
898
+ shp += lshp; /* advance to the next pointer entry */
899
+ }
900
+
901
+ cleanup:
902
+
903
+ return retval;
904
+ }
905
+
906
+ static readstat_error_t sas7bdat_parse_page_pass2(const char *page, size_t page_size, sas7bdat_ctx_t *ctx) {
907
+ uint16_t page_type;
908
+
909
+ readstat_error_t retval = READSTAT_OK;
910
+
911
+ page_type = sas_read2(&page[ctx->page_header_size-8], ctx->bswap);
912
+
913
+ const char *data = NULL;
914
+
915
+ if ((page_type & SAS_PAGE_TYPE_MASK) == SAS_PAGE_TYPE_DATA) {
916
+ ctx->page_row_count = sas_read2(&page[ctx->page_header_size-6], ctx->bswap);
917
+ data = &page[ctx->page_header_size];
918
+ } else if (!(page_type & SAS_PAGE_TYPE_COMP)) {
919
+ uint16_t subheader_count = sas_read2(&page[ctx->page_header_size-4], ctx->bswap);
920
+
921
+ int i;
922
+ const char *shp = &page[ctx->page_header_size];
923
+ int lshp = ctx->subheader_pointer_size;
924
+
925
+ if (ctx->page_header_size + subheader_count*lshp > page_size) {
926
+ retval = READSTAT_ERROR_PARSE;
927
+ goto cleanup;
928
+ }
929
+
930
+ for (i=0; i<subheader_count; i++) {
931
+ subheader_pointer_t shp_info = { 0 };
932
+ uint32_t signature = 0;
933
+ if ((retval = sas7bdat_parse_subheader_pointer(shp, page + page_size - shp, &shp_info, ctx)) != READSTAT_OK) {
934
+ goto cleanup;
935
+ }
936
+ if (shp_info.len > 0 && shp_info.compression != SAS_COMPRESSION_TRUNC) {
937
+ if ((retval = sas7bdat_validate_subheader_pointer(&shp_info, page_size, subheader_count, ctx)) != READSTAT_OK) {
938
+ goto cleanup;
939
+ }
940
+ if (shp_info.compression == SAS_COMPRESSION_NONE) {
941
+ signature = sas_read4(page + shp_info.offset, ctx->bswap);
942
+ if (!ctx->little_endian && signature == -1 && ctx->u64) {
943
+ signature = sas_read4(page + shp_info.offset + 4, ctx->bswap);
944
+ }
945
+ if (shp_info.is_compressed_data && !sas7bdat_signature_is_recognized(signature)) {
946
+ if (shp_info.len != ctx->row_length) {
947
+ retval = READSTAT_ERROR_ROW_WIDTH_MISMATCH;
948
+ goto cleanup;
949
+ }
950
+ if ((retval = sas7bdat_submit_columns_if_needed(ctx, 1)) != READSTAT_OK) {
951
+ goto cleanup;
952
+ }
953
+ if ((retval = sas7bdat_parse_single_row(page + shp_info.offset, ctx)) != READSTAT_OK) {
954
+ goto cleanup;
955
+ }
956
+ } else {
957
+ if (signature != SAS_SUBHEADER_SIGNATURE_COLUMN_TEXT) {
958
+ if ((retval = sas7bdat_parse_subheader(signature, page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
959
+ goto cleanup;
960
+ }
961
+ }
962
+ }
963
+ } else if (shp_info.compression == SAS_COMPRESSION_ROW) {
964
+ if ((retval = sas7bdat_submit_columns_if_needed(ctx, 1)) != READSTAT_OK) {
965
+ goto cleanup;
966
+ }
967
+ if ((retval = sas7bdat_parse_subheader_compressed(page + shp_info.offset, shp_info.len, ctx)) != READSTAT_OK) {
968
+ goto cleanup;
969
+ }
970
+ } else {
971
+ retval = READSTAT_ERROR_UNSUPPORTED_COMPRESSION;
972
+ goto cleanup;
973
+ }
974
+ }
975
+
976
+ shp += lshp;
977
+ }
978
+
979
+ if ((page_type & SAS_PAGE_TYPE_MASK) == SAS_PAGE_TYPE_MIX) {
980
+ /* HACK - this is supposed to obey 8-byte boundaries but
981
+ * some files created by Stat/Transfer don't. So verify that the
982
+ * padding is { 0, 0, 0, 0 } or { ' ', ' ', ' ', ' ' } (or that
983
+ * the file is not from Stat/Transfer) before skipping it */
984
+ if ((shp-page)%8 == 4 && shp + 4 <= page + page_size &&
985
+ (*(uint32_t *)shp == 0x00000000 ||
986
+ *(uint32_t *)shp == 0x20202020 ||
987
+ ctx->vendor != READSTAT_VENDOR_STAT_TRANSFER)) {
988
+ data = shp + 4;
989
+ } else {
990
+ data = shp;
991
+ }
992
+ }
993
+ }
994
+ if (data) {
995
+ if ((retval = sas7bdat_submit_columns_if_needed(ctx, 0)) != READSTAT_OK) {
996
+ goto cleanup;
997
+ }
998
+ if (ctx->handle.value) {
999
+ retval = sas7bdat_parse_rows(data, page + page_size - data, ctx);
1000
+ }
1001
+ }
1002
+ cleanup:
1003
+
1004
+ return retval;
1005
+ }
1006
+
1007
+ static readstat_error_t sas7bdat_parse_meta_pages_pass1(sas7bdat_ctx_t *ctx, int64_t *outLastExaminedPage) {
1008
+ readstat_error_t retval = READSTAT_OK;
1009
+ readstat_io_t *io = ctx->io;
1010
+ int64_t i;
1011
+
1012
+ /* look for META and MIX pages at beginning... */
1013
+ for (i=0; i<ctx->page_count; i++) {
1014
+ if (io->seek(ctx->header_size + i*ctx->page_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
1015
+ retval = READSTAT_ERROR_SEEK;
1016
+ if (ctx->handle.error) {
1017
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to position %" PRId64
1018
+ " (= %" PRId64 " + %" PRId64 "*%" PRId64 ")",
1019
+ ctx->header_size + i*ctx->page_size, ctx->header_size, i, ctx->page_size);
1020
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1021
+ }
1022
+ goto cleanup;
1023
+ }
1024
+
1025
+ readstat_off_t off = 0;
1026
+ if (ctx->u64)
1027
+ off = 16;
1028
+
1029
+ size_t head_len = off + 16 + 2;
1030
+ size_t tail_len = ctx->page_size - head_len;
1031
+
1032
+ if (io->read(ctx->page, head_len, io->io_ctx) < head_len) {
1033
+ retval = READSTAT_ERROR_READ;
1034
+ goto cleanup;
1035
+ }
1036
+
1037
+ uint16_t page_type = sas_read2(&ctx->page[off+16], ctx->bswap);
1038
+
1039
+ if ((page_type & SAS_PAGE_TYPE_MASK) == SAS_PAGE_TYPE_DATA)
1040
+ break;
1041
+ if ((page_type & SAS_PAGE_TYPE_COMP))
1042
+ continue;
1043
+
1044
+ if (io->read(ctx->page + head_len, tail_len, io->io_ctx) < tail_len) {
1045
+ retval = READSTAT_ERROR_READ;
1046
+ goto cleanup;
1047
+ }
1048
+
1049
+ if ((retval = sas7bdat_parse_page_pass1(ctx->page, ctx->page_size, ctx)) != READSTAT_OK) {
1050
+ if (ctx->handle.error && retval != READSTAT_ERROR_USER_ABORT) {
1051
+ int64_t pos = io->seek(0, READSTAT_SEEK_CUR, io->io_ctx);
1052
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf),
1053
+ "ReadStat: Error parsing page %" PRId64 ", bytes %" PRId64 "-%" PRId64,
1054
+ i, pos - ctx->page_size, pos-1);
1055
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1056
+ }
1057
+ goto cleanup;
1058
+ }
1059
+ }
1060
+
1061
+ cleanup:
1062
+ if (outLastExaminedPage)
1063
+ *outLastExaminedPage = i;
1064
+
1065
+ return retval;
1066
+ }
1067
+
1068
+ static readstat_error_t sas7bdat_parse_amd_pages_pass1(int64_t last_examined_page_pass1, sas7bdat_ctx_t *ctx) {
1069
+ readstat_error_t retval = READSTAT_OK;
1070
+ readstat_io_t *io = ctx->io;
1071
+ uint64_t i;
1072
+ uint64_t amd_page_count = 0;
1073
+
1074
+ /* ...then AMD pages at the end */
1075
+ for (i=ctx->page_count-1; i>last_examined_page_pass1; i--) {
1076
+ if (io->seek(ctx->header_size + i*ctx->page_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
1077
+ retval = READSTAT_ERROR_SEEK;
1078
+ if (ctx->handle.error) {
1079
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to position %" PRId64
1080
+ " (= %" PRId64 " + %" PRId64 "*%" PRId64 ")",
1081
+ ctx->header_size + i*ctx->page_size, ctx->header_size, i, ctx->page_size);
1082
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1083
+ }
1084
+ goto cleanup;
1085
+ }
1086
+
1087
+ readstat_off_t off = 0;
1088
+ if (ctx->u64)
1089
+ off = 16;
1090
+
1091
+ size_t head_len = off + 16 + 2;
1092
+ size_t tail_len = ctx->page_size - head_len;
1093
+
1094
+ if (io->read(ctx->page, head_len, io->io_ctx) < head_len) {
1095
+ retval = READSTAT_ERROR_READ;
1096
+ goto cleanup;
1097
+ }
1098
+
1099
+ uint16_t page_type = sas_read2(&ctx->page[off+16], ctx->bswap);
1100
+
1101
+ if ((page_type & SAS_PAGE_TYPE_MASK) == SAS_PAGE_TYPE_DATA) {
1102
+ /* Usually AMD pages are at the end but sometimes data pages appear after them */
1103
+ if (amd_page_count > 0)
1104
+ break;
1105
+ continue;
1106
+ }
1107
+ if ((page_type & SAS_PAGE_TYPE_COMP))
1108
+ continue;
1109
+
1110
+ if (io->read(ctx->page + head_len, tail_len, io->io_ctx) < tail_len) {
1111
+ retval = READSTAT_ERROR_READ;
1112
+ goto cleanup;
1113
+ }
1114
+
1115
+ if ((retval = sas7bdat_parse_page_pass1(ctx->page, ctx->page_size, ctx)) != READSTAT_OK) {
1116
+ if (ctx->handle.error && retval != READSTAT_ERROR_USER_ABORT) {
1117
+ int64_t pos = io->seek(0, READSTAT_SEEK_CUR, io->io_ctx);
1118
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf),
1119
+ "ReadStat: Error parsing page %" PRId64 ", bytes %" PRId64 "-%" PRId64,
1120
+ i, pos - ctx->page_size, pos-1);
1121
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1122
+ }
1123
+ goto cleanup;
1124
+ }
1125
+
1126
+ amd_page_count++;
1127
+ }
1128
+
1129
+ cleanup:
1130
+
1131
+ return retval;
1132
+ }
1133
+
1134
+ static readstat_error_t sas7bdat_parse_all_pages_pass2(sas7bdat_ctx_t *ctx) {
1135
+ readstat_error_t retval = READSTAT_OK;
1136
+ readstat_io_t *io = ctx->io;
1137
+ int64_t i;
1138
+
1139
+ for (i=0; i<ctx->page_count; i++) {
1140
+ if ((retval = sas7bdat_update_progress(ctx)) != READSTAT_OK) {
1141
+ goto cleanup;
1142
+ }
1143
+ if (io->read(ctx->page, ctx->page_size, io->io_ctx) < ctx->page_size) {
1144
+ retval = READSTAT_ERROR_READ;
1145
+ goto cleanup;
1146
+ }
1147
+
1148
+ if ((retval = sas7bdat_parse_page_pass2(ctx->page, ctx->page_size, ctx)) != READSTAT_OK) {
1149
+ if (ctx->handle.error && retval != READSTAT_ERROR_USER_ABORT) {
1150
+ int64_t pos = io->seek(0, READSTAT_SEEK_CUR, io->io_ctx);
1151
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf),
1152
+ "ReadStat: Error parsing page %" PRId64 ", bytes %" PRId64 "-%" PRId64,
1153
+ i, pos - ctx->page_size, pos-1);
1154
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1155
+ }
1156
+ goto cleanup;
1157
+ }
1158
+ if (ctx->parsed_row_count == ctx->row_limit)
1159
+ break;
1160
+ }
1161
+ cleanup:
1162
+
1163
+ return retval;
1164
+ }
1165
+
1166
+ readstat_error_t readstat_parse_sas7bdat(readstat_parser_t *parser, const char *path, void *user_ctx) {
1167
+ int64_t last_examined_page_pass1 = 0;
1168
+ readstat_error_t retval = READSTAT_OK;
1169
+ readstat_io_t *io = parser->io;
1170
+
1171
+ sas7bdat_ctx_t *ctx = calloc(1, sizeof(sas7bdat_ctx_t));
1172
+ sas_header_info_t *hinfo = calloc(1, sizeof(sas_header_info_t));
1173
+
1174
+ ctx->handle = parser->handlers;
1175
+ ctx->input_encoding = parser->input_encoding;
1176
+ ctx->output_encoding = parser->output_encoding;
1177
+ ctx->user_ctx = user_ctx;
1178
+ ctx->io = parser->io;
1179
+ ctx->row_limit = parser->row_limit;
1180
+ if (parser->row_offset > 0)
1181
+ ctx->row_offset = parser->row_offset;
1182
+
1183
+ if (io->open(path, io->io_ctx) == -1) {
1184
+ retval = READSTAT_ERROR_OPEN;
1185
+ goto cleanup;
1186
+ }
1187
+
1188
+ if ((ctx->file_size = io->seek(0, READSTAT_SEEK_END, io->io_ctx)) == -1) {
1189
+ retval = READSTAT_ERROR_SEEK;
1190
+ if (ctx->handle.error) {
1191
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to end of file");
1192
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1193
+ }
1194
+ goto cleanup;
1195
+ }
1196
+
1197
+ if (io->seek(0, READSTAT_SEEK_SET, io->io_ctx) == -1) {
1198
+ retval = READSTAT_ERROR_SEEK;
1199
+ if (ctx->handle.error) {
1200
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to beginning of file");
1201
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1202
+ }
1203
+ goto cleanup;
1204
+ }
1205
+
1206
+ if ((retval = sas_read_header(io, hinfo, ctx->handle.error, user_ctx)) != READSTAT_OK) {
1207
+ goto cleanup;
1208
+ }
1209
+
1210
+ ctx->u64 = hinfo->u64;
1211
+ ctx->little_endian = hinfo->little_endian;
1212
+ ctx->vendor = hinfo->vendor;
1213
+ ctx->bswap = machine_is_little_endian() ^ hinfo->little_endian;
1214
+ ctx->header_size = hinfo->header_size;
1215
+ ctx->page_count = hinfo->page_count;
1216
+ ctx->page_size = hinfo->page_size;
1217
+ ctx->page_header_size = hinfo->page_header_size;
1218
+ ctx->subheader_pointer_size = hinfo->subheader_pointer_size;
1219
+ ctx->subheader_signature_size = ctx->u64 ? 8 : 4;
1220
+ ctx->ctime = hinfo->creation_time;
1221
+ ctx->mtime = hinfo->modification_time;
1222
+ ctx->version = hinfo->major_version;
1223
+ if (ctx->input_encoding == NULL) {
1224
+ ctx->input_encoding = hinfo->encoding;
1225
+ }
1226
+ if ((ctx->page = readstat_malloc(ctx->page_size)) == NULL) {
1227
+ retval = READSTAT_ERROR_MALLOC;
1228
+ goto cleanup;
1229
+ }
1230
+
1231
+ if (ctx->input_encoding && ctx->output_encoding && strcmp(ctx->input_encoding, ctx->output_encoding) != 0) {
1232
+ iconv_t converter = iconv_open(ctx->output_encoding, ctx->input_encoding);
1233
+ if (converter == (iconv_t)-1) {
1234
+ retval = READSTAT_ERROR_UNSUPPORTED_CHARSET;
1235
+ goto cleanup;
1236
+ }
1237
+ ctx->converter = converter;
1238
+ }
1239
+
1240
+ if ((retval = readstat_convert(ctx->table_name, sizeof(ctx->table_name),
1241
+ hinfo->table_name, sizeof(hinfo->table_name), ctx->converter)) != READSTAT_OK) {
1242
+ goto cleanup;
1243
+ }
1244
+
1245
+ if ((retval = sas7bdat_parse_meta_pages_pass1(ctx, &last_examined_page_pass1)) != READSTAT_OK) {
1246
+ goto cleanup;
1247
+ }
1248
+
1249
+ if ((retval = sas7bdat_parse_amd_pages_pass1(last_examined_page_pass1, ctx)) != READSTAT_OK) {
1250
+ goto cleanup;
1251
+ }
1252
+
1253
+ if (io->seek(ctx->header_size, READSTAT_SEEK_SET, io->io_ctx) == -1) {
1254
+ retval = READSTAT_ERROR_SEEK;
1255
+ if (ctx->handle.error) {
1256
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Failed to seek to position %" PRId64,
1257
+ ctx->header_size);
1258
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1259
+ }
1260
+ goto cleanup;
1261
+ }
1262
+
1263
+ if ((retval = sas7bdat_parse_all_pages_pass2(ctx)) != READSTAT_OK) {
1264
+ goto cleanup;
1265
+ }
1266
+
1267
+ if ((retval = sas7bdat_submit_columns_if_needed(ctx, 0)) != READSTAT_OK) {
1268
+ goto cleanup;
1269
+ }
1270
+
1271
+ if (ctx->handle.value && ctx->parsed_row_count != ctx->row_limit) {
1272
+ retval = READSTAT_ERROR_ROW_COUNT_MISMATCH;
1273
+ if (ctx->handle.error) {
1274
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: Expected %d rows in file, found %d",
1275
+ ctx->row_limit, ctx->parsed_row_count);
1276
+ ctx->handle.error(ctx->error_buf, ctx->user_ctx);
1277
+ }
1278
+ goto cleanup;
1279
+ }
1280
+
1281
+ if ((retval = sas7bdat_update_progress(ctx)) != READSTAT_OK) {
1282
+ goto cleanup;
1283
+ }
1284
+
1285
+ cleanup:
1286
+ io->close(io->io_ctx);
1287
+
1288
+ if (retval == READSTAT_ERROR_OPEN ||
1289
+ retval == READSTAT_ERROR_READ ||
1290
+ retval == READSTAT_ERROR_SEEK) {
1291
+ if (ctx->handle.error) {
1292
+ snprintf(ctx->error_buf, sizeof(ctx->error_buf), "ReadStat: %s (retval = %d): %s (errno = %d)",
1293
+ readstat_error_message(retval), retval, strerror(errno), errno);
1294
+ ctx->handle.error(ctx->error_buf, user_ctx);
1295
+ }
1296
+ }
1297
+
1298
+ if (ctx)
1299
+ sas7bdat_ctx_free(ctx);
1300
+ if (hinfo)
1301
+ free(hinfo);
1302
+
1303
+ return retval;
1304
+ }