node-s3tables 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2,13 +2,1026 @@
2
2
 
3
3
  Object.defineProperty(exports, '__esModule', { value: true });
4
4
 
5
+ var node_crypto = require('node:crypto');
6
+ var avsc = require('avsc');
5
7
  var clientS3 = require('@aws-sdk/client-s3');
6
8
  var clientS3tables = require('@aws-sdk/client-s3tables');
9
+ var libStorage = require('@aws-sdk/lib-storage');
10
+ var node_stream = require('node:stream');
11
+ var LosslessJson = require('lossless-json');
7
12
  var signatureV4 = require('@smithy/signature-v4');
8
13
  var sha256Js = require('@aws-crypto/sha256-js');
9
14
  var protocolHttp = require('@smithy/protocol-http');
10
15
  var credentialProviderNode = require('@aws-sdk/credential-provider-node');
11
16
 
17
+ function _interopNamespaceDefault(e) {
18
+ var n = Object.create(null);
19
+ if (e) {
20
+ Object.keys(e).forEach(function (k) {
21
+ if (k !== 'default') {
22
+ var d = Object.getOwnPropertyDescriptor(e, k);
23
+ Object.defineProperty(n, k, d.get ? d : {
24
+ enumerable: true,
25
+ get: function () { return e[k]; }
26
+ });
27
+ }
28
+ });
29
+ }
30
+ n.default = e;
31
+ return Object.freeze(n);
32
+ }
33
+
34
+ var avsc__namespace = /*#__PURE__*/_interopNamespaceDefault(avsc);
35
+ var LosslessJson__namespace = /*#__PURE__*/_interopNamespaceDefault(LosslessJson);
36
+
37
+ function fixupMetadata(metadata) {
38
+ const newMetadata = {};
39
+ for (const [key, value] of Object.entries(metadata)) {
40
+ if (Buffer.isBuffer(value)) {
41
+ newMetadata[key] = value;
42
+ }
43
+ else {
44
+ newMetadata[key] = Buffer.from(value, 'utf8');
45
+ }
46
+ }
47
+ return newMetadata;
48
+ }
49
+ async function avroToBuffer(params) {
50
+ const metadata = params.metadata
51
+ ? fixupMetadata(params.metadata)
52
+ : params.metadata;
53
+ return new Promise((resolve, reject) => {
54
+ try {
55
+ const buffers = [];
56
+ const opts = {
57
+ writeHeader: true,
58
+ codec: 'deflate',
59
+ metadata,
60
+ };
61
+ const encoder = new avsc__namespace.streams.BlockEncoder(params.type, opts);
62
+ encoder.on('data', (chunk) => {
63
+ buffers.push(chunk);
64
+ });
65
+ encoder.on('end', () => {
66
+ resolve(Buffer.concat(buffers));
67
+ });
68
+ encoder.on('error', reject);
69
+ params.records.forEach((record) => {
70
+ encoder.write(record);
71
+ });
72
+ encoder.end();
73
+ }
74
+ catch (err) {
75
+ if (err instanceof Error) {
76
+ reject(err);
77
+ }
78
+ else {
79
+ reject(new Error(String(err)));
80
+ }
81
+ }
82
+ });
83
+ }
84
+ function icebergToAvroFields(spec, schema) {
85
+ return spec.fields.map((p) => _icebergToAvroField(p, schema));
86
+ }
87
+ function _icebergToAvroField(field, schema) {
88
+ const source = schema.fields.find((f) => f.id === field['source-id']);
89
+ if (!source) {
90
+ throw new Error(`Source field ${field['source-id']} not found in schema`);
91
+ }
92
+ let avroType;
93
+ switch (field.transform) {
94
+ case 'identity':
95
+ if (typeof source.type === 'string') {
96
+ avroType = _mapPrimitiveToAvro(source.type);
97
+ break;
98
+ }
99
+ throw new Error(`Unsupported transform: ${field.transform} for complex type`);
100
+ case 'year':
101
+ avroType = { type: 'int', logicalType: 'year' };
102
+ break;
103
+ case 'month':
104
+ avroType = { type: 'int', logicalType: 'month' };
105
+ break;
106
+ case 'day':
107
+ avroType = { type: 'int', logicalType: 'date' };
108
+ break;
109
+ case 'hour':
110
+ avroType = { type: 'long', logicalType: 'hour' };
111
+ break;
112
+ default:
113
+ if (field.transform.startsWith('bucket[')) {
114
+ avroType = 'int';
115
+ break;
116
+ }
117
+ else if (field.transform.startsWith('truncate[')) {
118
+ avroType = 'string';
119
+ break;
120
+ }
121
+ throw new Error(`Unsupported transform: ${field.transform} for type`);
122
+ }
123
+ return { name: field.name, type: ['null', avroType], default: null };
124
+ }
125
+ function _mapPrimitiveToAvro(type) {
126
+ switch (type) {
127
+ case 'boolean':
128
+ return 'int';
129
+ case 'int':
130
+ return 'int';
131
+ case 'long':
132
+ case 'time':
133
+ case 'timestamp':
134
+ case 'timestamptz':
135
+ return 'long';
136
+ case 'float':
137
+ case 'double':
138
+ return 'double';
139
+ case 'date':
140
+ return { type: 'int', logicalType: 'date' };
141
+ case 'string':
142
+ case 'uuid':
143
+ return 'string';
144
+ case 'binary':
145
+ return 'bytes';
146
+ default:
147
+ throw new Error(`Unsupported primitive: ${type}`);
148
+ }
149
+ }
150
+
151
+ var ManifestFileStatus;
152
+ (function (ManifestFileStatus) {
153
+ ManifestFileStatus[ManifestFileStatus["EXISTING"] = 0] = "EXISTING";
154
+ ManifestFileStatus[ManifestFileStatus["ADDED"] = 1] = "ADDED";
155
+ ManifestFileStatus[ManifestFileStatus["DELETED"] = 2] = "DELETED";
156
+ })(ManifestFileStatus || (ManifestFileStatus = {}));
157
+ var DataFileContent;
158
+ (function (DataFileContent) {
159
+ DataFileContent[DataFileContent["DATA"] = 0] = "DATA";
160
+ DataFileContent[DataFileContent["POSITION_DELETES"] = 1] = "POSITION_DELETES";
161
+ DataFileContent[DataFileContent["EQUALITY_DELETES"] = 2] = "EQUALITY_DELETES";
162
+ })(DataFileContent || (DataFileContent = {}));
163
+ var ListContent;
164
+ (function (ListContent) {
165
+ ListContent[ListContent["DATA"] = 0] = "DATA";
166
+ ListContent[ListContent["DELETES"] = 1] = "DELETES";
167
+ })(ListContent || (ListContent = {}));
168
+ const BigIntType = avsc__namespace.types.LongType.__with({
169
+ fromBuffer: (buf) => buf.readBigInt64LE(),
170
+ toBuffer(n) {
171
+ const buf = Buffer.alloc(8);
172
+ buf.writeBigInt64LE(n);
173
+ return buf;
174
+ },
175
+ fromJSON: BigInt,
176
+ toJSON: Number,
177
+ isValid: (n) => typeof n === 'bigint',
178
+ compare(n1, n2) {
179
+ return n1 === n2 ? 0 : n1 < n2 ? -1 : 1;
180
+ },
181
+ });
182
+ class YearStringType extends avsc__namespace.types.LogicalType {
183
+ _fromValue(val) {
184
+ return (1970 + val).toString();
185
+ }
186
+ _toValue(str) {
187
+ return parseInt(str, 10) - 1970;
188
+ }
189
+ _resolve(type) {
190
+ if (avsc__namespace.Type.isType(type, 'int')) {
191
+ return (val) => this._fromValue(val);
192
+ }
193
+ return null;
194
+ }
195
+ }
196
+ class MonthStringType extends avsc__namespace.types.LogicalType {
197
+ _fromValue(val) {
198
+ const year = 1970 + Math.floor(val / 12);
199
+ const month = (val % 12) + 1;
200
+ return `${year}-${String(month).padStart(2, '0')}`;
201
+ }
202
+ _toValue(str) {
203
+ const [y, m] = str.split('-').map(Number);
204
+ return ((y ?? 1970) - 1970) * 12 + ((m ?? 1) - 1);
205
+ }
206
+ _resolve(type) {
207
+ if (avsc__namespace.Type.isType(type, 'int')) {
208
+ return (val) => this._fromValue(val);
209
+ }
210
+ return null;
211
+ }
212
+ }
213
+ class DateStringType extends avsc__namespace.types.LogicalType {
214
+ _fromValue(val) {
215
+ const ms = val * 86400000;
216
+ return new Date(ms).toISOString().slice(0, 10);
217
+ }
218
+ _toValue(str) {
219
+ const [year, month, day] = str.split('-').map(Number);
220
+ return Math.floor(Date.UTC(year ?? 1970, (month ?? 1) - 1, day ?? 1) / 86400000);
221
+ }
222
+ _resolve(type) {
223
+ if (avsc__namespace.Type.isType(type, 'int')) {
224
+ return (val) => this._fromValue(val);
225
+ }
226
+ return null;
227
+ }
228
+ }
229
+ class HourStringType extends avsc__namespace.types.LogicalType {
230
+ _fromValue(val) {
231
+ const ms = val * 3600000;
232
+ return new Date(ms).toISOString().slice(0, 13);
233
+ }
234
+ _toValue(str) {
235
+ const d = new Date(str);
236
+ return Math.floor(d.getTime() / 3600000);
237
+ }
238
+ _resolve(type) {
239
+ if (avsc__namespace.Type.isType(type, 'long')) {
240
+ return (val) => this._fromValue(val);
241
+ }
242
+ return null;
243
+ }
244
+ }
245
+ const AvroRegistry = { long: BigIntType };
246
+ const AvroLogicalTypes = {
247
+ year: YearStringType,
248
+ month: MonthStringType,
249
+ date: DateStringType,
250
+ hour: HourStringType,
251
+ };
252
+
253
+ function makeManifestType(spec, schema) {
254
+ const part_fields = icebergToAvroFields(spec, schema);
255
+ return avsc__namespace.Type.forSchema({
256
+ type: 'record',
257
+ name: 'manifest_entry',
258
+ fields: [
259
+ { name: 'status', type: 'int', 'field-id': 0 },
260
+ {
261
+ name: 'snapshot_id',
262
+ type: ['null', 'long'],
263
+ default: null,
264
+ 'field-id': 1,
265
+ },
266
+ {
267
+ name: 'sequence_number',
268
+ type: ['null', 'long'],
269
+ default: null,
270
+ 'field-id': 3,
271
+ },
272
+ {
273
+ name: 'file_sequence_number',
274
+ type: ['null', 'long'],
275
+ default: null,
276
+ 'field-id': 4,
277
+ },
278
+ {
279
+ name: 'data_file',
280
+ type: {
281
+ type: 'record',
282
+ name: 'r2',
283
+ fields: [
284
+ {
285
+ name: 'content',
286
+ type: 'int',
287
+ doc: 'Contents of the file: 0=data, 1=position deletes, 2=equality deletes',
288
+ 'field-id': 134,
289
+ },
290
+ {
291
+ name: 'file_path',
292
+ type: 'string',
293
+ doc: 'Location URI with FS scheme',
294
+ 'field-id': 100,
295
+ },
296
+ {
297
+ name: 'file_format',
298
+ type: 'string',
299
+ doc: 'File format name: avro, orc, or parquet',
300
+ 'field-id': 101,
301
+ },
302
+ {
303
+ name: 'partition',
304
+ type: { type: 'record', name: 'r102', fields: part_fields },
305
+ doc: 'Partition data tuple, schema based on the partition spec',
306
+ 'field-id': 102,
307
+ },
308
+ {
309
+ name: 'record_count',
310
+ type: 'long',
311
+ doc: 'Number of records in the file',
312
+ 'field-id': 103,
313
+ },
314
+ {
315
+ name: 'file_size_in_bytes',
316
+ type: 'long',
317
+ doc: 'Total file size in bytes',
318
+ 'field-id': 104,
319
+ },
320
+ {
321
+ name: 'column_sizes',
322
+ type: [
323
+ 'null',
324
+ {
325
+ type: 'array',
326
+ items: {
327
+ type: 'record',
328
+ name: 'k117_v118',
329
+ fields: [
330
+ { name: 'key', type: 'int', 'field-id': 117 },
331
+ { name: 'value', type: 'long', 'field-id': 118 },
332
+ ],
333
+ },
334
+ logicalType: 'map',
335
+ },
336
+ ],
337
+ doc: 'Map of column id to total size on disk',
338
+ default: null,
339
+ 'field-id': 108,
340
+ },
341
+ {
342
+ name: 'value_counts',
343
+ type: [
344
+ 'null',
345
+ {
346
+ type: 'array',
347
+ items: {
348
+ type: 'record',
349
+ name: 'k119_v120',
350
+ fields: [
351
+ { name: 'key', type: 'int', 'field-id': 119 },
352
+ { name: 'value', type: 'long', 'field-id': 120 },
353
+ ],
354
+ },
355
+ logicalType: 'map',
356
+ },
357
+ ],
358
+ doc: 'Map of column id to total count, including null and NaN',
359
+ default: null,
360
+ 'field-id': 109,
361
+ },
362
+ {
363
+ name: 'null_value_counts',
364
+ type: [
365
+ 'null',
366
+ {
367
+ type: 'array',
368
+ items: {
369
+ type: 'record',
370
+ name: 'k121_v122',
371
+ fields: [
372
+ { name: 'key', type: 'int', 'field-id': 121 },
373
+ { name: 'value', type: 'long', 'field-id': 122 },
374
+ ],
375
+ },
376
+ logicalType: 'map',
377
+ },
378
+ ],
379
+ doc: 'Map of column id to null value count',
380
+ default: null,
381
+ 'field-id': 110,
382
+ },
383
+ {
384
+ name: 'nan_value_counts',
385
+ type: [
386
+ 'null',
387
+ {
388
+ type: 'array',
389
+ items: {
390
+ type: 'record',
391
+ name: 'k138_v139',
392
+ fields: [
393
+ { name: 'key', type: 'int', 'field-id': 138 },
394
+ { name: 'value', type: 'long', 'field-id': 139 },
395
+ ],
396
+ },
397
+ logicalType: 'map',
398
+ },
399
+ ],
400
+ doc: 'Map of column id to number of NaN values in the column',
401
+ default: null,
402
+ 'field-id': 137,
403
+ },
404
+ {
405
+ name: 'lower_bounds',
406
+ type: [
407
+ 'null',
408
+ {
409
+ type: 'array',
410
+ items: {
411
+ type: 'record',
412
+ name: 'k126_v127',
413
+ fields: [
414
+ { name: 'key', type: 'int', 'field-id': 126 },
415
+ { name: 'value', type: 'bytes', 'field-id': 127 },
416
+ ],
417
+ },
418
+ logicalType: 'map',
419
+ },
420
+ ],
421
+ doc: 'Map of column id to lower bound',
422
+ default: null,
423
+ 'field-id': 125,
424
+ },
425
+ {
426
+ name: 'upper_bounds',
427
+ type: [
428
+ 'null',
429
+ {
430
+ type: 'array',
431
+ items: {
432
+ type: 'record',
433
+ name: 'k129_v130',
434
+ fields: [
435
+ { name: 'key', type: 'int', 'field-id': 129 },
436
+ { name: 'value', type: 'bytes', 'field-id': 130 },
437
+ ],
438
+ },
439
+ logicalType: 'map',
440
+ },
441
+ ],
442
+ doc: 'Map of column id to upper bound',
443
+ default: null,
444
+ 'field-id': 128,
445
+ },
446
+ {
447
+ name: 'key_metadata',
448
+ type: ['null', 'bytes'],
449
+ doc: 'Encryption key metadata blob',
450
+ default: null,
451
+ 'field-id': 131,
452
+ },
453
+ {
454
+ name: 'split_offsets',
455
+ type: [
456
+ 'null',
457
+ { type: 'array', items: 'long', 'element-id': 133 },
458
+ ],
459
+ doc: 'Splittable offsets',
460
+ default: null,
461
+ 'field-id': 132,
462
+ },
463
+ {
464
+ name: 'equality_ids',
465
+ type: [
466
+ 'null',
467
+ { type: 'array', items: 'int', 'element-id': 136 },
468
+ ],
469
+ doc: 'Equality comparison field IDs',
470
+ default: null,
471
+ 'field-id': 135,
472
+ },
473
+ {
474
+ name: 'sort_order_id',
475
+ type: ['null', 'int'],
476
+ doc: 'Sort order ID',
477
+ default: null,
478
+ 'field-id': 140,
479
+ },
480
+ ],
481
+ },
482
+ 'field-id': 2,
483
+ },
484
+ ],
485
+ }, { registry: { ...AvroRegistry }, logicalTypes: AvroLogicalTypes });
486
+ }
487
+ const ManifestListType = avsc__namespace.Type.forSchema({
488
+ type: 'record',
489
+ name: 'manifest_file',
490
+ fields: [
491
+ {
492
+ name: 'manifest_path',
493
+ type: 'string',
494
+ doc: 'Location URI with FS scheme',
495
+ 'field-id': 500,
496
+ },
497
+ {
498
+ name: 'manifest_length',
499
+ type: 'long',
500
+ doc: 'Total file size in bytes',
501
+ 'field-id': 501,
502
+ },
503
+ {
504
+ name: 'partition_spec_id',
505
+ type: 'int',
506
+ doc: 'Spec ID used to write',
507
+ 'field-id': 502,
508
+ },
509
+ {
510
+ name: 'content',
511
+ type: 'int',
512
+ doc: 'Contents of the manifest: 0=data, 1=deletes',
513
+ 'field-id': 517,
514
+ },
515
+ {
516
+ name: 'sequence_number',
517
+ type: 'long',
518
+ doc: 'Sequence number when the manifest was added',
519
+ 'field-id': 515,
520
+ },
521
+ {
522
+ name: 'min_sequence_number',
523
+ type: 'long',
524
+ doc: 'Lowest sequence number in the manifest',
525
+ 'field-id': 516,
526
+ },
527
+ {
528
+ name: 'added_snapshot_id',
529
+ type: 'long',
530
+ doc: 'Snapshot ID that added the manifest',
531
+ 'field-id': 503,
532
+ },
533
+ {
534
+ name: 'added_data_files_count',
535
+ type: 'int',
536
+ doc: 'Added entry count',
537
+ 'field-id': 504,
538
+ },
539
+ {
540
+ name: 'existing_data_files_count',
541
+ type: 'int',
542
+ doc: 'Existing entry count',
543
+ 'field-id': 505,
544
+ },
545
+ {
546
+ name: 'deleted_data_files_count',
547
+ type: 'int',
548
+ doc: 'Deleted entry count',
549
+ 'field-id': 506,
550
+ },
551
+ {
552
+ name: 'added_rows_count',
553
+ type: 'long',
554
+ doc: 'Added rows count',
555
+ 'field-id': 512,
556
+ },
557
+ {
558
+ name: 'existing_rows_count',
559
+ type: 'long',
560
+ doc: 'Existing rows count',
561
+ 'field-id': 513,
562
+ },
563
+ {
564
+ name: 'deleted_rows_count',
565
+ type: 'long',
566
+ doc: 'Deleted rows count',
567
+ 'field-id': 514,
568
+ },
569
+ {
570
+ name: 'partitions',
571
+ type: [
572
+ 'null',
573
+ {
574
+ type: 'array',
575
+ items: {
576
+ type: 'record',
577
+ name: 'r508',
578
+ fields: [
579
+ {
580
+ name: 'contains_null',
581
+ type: 'boolean',
582
+ doc: 'True if any file has a null partition value',
583
+ 'field-id': 509,
584
+ },
585
+ {
586
+ name: 'contains_nan',
587
+ type: ['null', 'boolean'],
588
+ doc: 'True if any file has a nan partition value',
589
+ default: null,
590
+ 'field-id': 518,
591
+ },
592
+ {
593
+ name: 'lower_bound',
594
+ type: ['null', 'bytes'],
595
+ doc: 'Partition lower bound for all files',
596
+ default: null,
597
+ 'field-id': 510,
598
+ },
599
+ {
600
+ name: 'upper_bound',
601
+ type: ['null', 'bytes'],
602
+ doc: 'Partition upper bound for all files',
603
+ default: null,
604
+ 'field-id': 511,
605
+ },
606
+ ],
607
+ },
608
+ 'element-id': 508,
609
+ },
610
+ ],
611
+ doc: 'Summary for each partition',
612
+ default: null,
613
+ 'field-id': 507,
614
+ },
615
+ ],
616
+ }, { registry: { ...AvroRegistry } });
617
+
618
+ function _isPrimitive(t) {
619
+ return typeof t === 'string';
620
+ }
621
+ function _outputType(transform, sourceType) {
622
+ if (transform === 'identity' || transform.startsWith('truncate[')) {
623
+ if (_isPrimitive(sourceType)) {
624
+ return sourceType;
625
+ }
626
+ return null;
627
+ }
628
+ if (transform.startsWith('bucket[')) {
629
+ return 'int';
630
+ }
631
+ if (transform === 'year' ||
632
+ transform === 'month' ||
633
+ transform === 'day' ||
634
+ transform === 'hour') {
635
+ return 'int';
636
+ }
637
+ return null;
638
+ }
639
+ function _encodeValue(raw, transform, out_type) {
640
+ if (raw === null || transform === null || out_type === null) {
641
+ return null;
642
+ }
643
+ switch (transform) {
644
+ case 'identity': {
645
+ if (Buffer.isBuffer(raw)) {
646
+ if (out_type === 'binary' ||
647
+ out_type.startsWith('decimal(') ||
648
+ out_type.startsWith('fixed[')) {
649
+ return raw;
650
+ }
651
+ throw new Error(`Buffer not allowed for identity with type ${out_type}`);
652
+ }
653
+ switch (out_type) {
654
+ case 'int': {
655
+ const n = typeof raw === 'number' ? raw : Number(raw);
656
+ const buf = Buffer.alloc(4);
657
+ buf.writeInt32LE(Math.floor(n));
658
+ return buf;
659
+ }
660
+ case 'long': {
661
+ const n = typeof raw === 'bigint' ? raw : BigInt(raw);
662
+ const buf = Buffer.alloc(8);
663
+ buf.writeBigInt64LE(n);
664
+ return buf;
665
+ }
666
+ case 'float': {
667
+ const n = typeof raw === 'number' ? raw : Number(raw);
668
+ const buf = Buffer.alloc(4);
669
+ buf.writeFloatLE(n);
670
+ return buf;
671
+ }
672
+ case 'double': {
673
+ const n = typeof raw === 'number' ? raw : Number(raw);
674
+ const buf = Buffer.alloc(8);
675
+ buf.writeDoubleLE(n);
676
+ return buf;
677
+ }
678
+ case 'string':
679
+ case 'uuid': {
680
+ const s = typeof raw === 'string' ? raw : String(raw);
681
+ return Buffer.from(s, 'utf8');
682
+ }
683
+ case 'boolean': {
684
+ const buf = Buffer.alloc(1);
685
+ buf.writeUInt8(raw ? 1 : 0);
686
+ return buf;
687
+ }
688
+ case 'binary':
689
+ case 'date':
690
+ case 'time':
691
+ case 'timestamp':
692
+ case 'timestamptz':
693
+ throw new Error(`Identity not implemented for type ${out_type}`);
694
+ default:
695
+ throw new Error(`Identity not implemented for type ${out_type}`);
696
+ }
697
+ }
698
+ case 'year':
699
+ case 'month':
700
+ case 'day':
701
+ case 'hour': {
702
+ let n;
703
+ if (typeof raw === 'string') {
704
+ const d = new Date(raw);
705
+ if (transform === 'year') {
706
+ n = d.getUTCFullYear();
707
+ }
708
+ else if (transform === 'month') {
709
+ n = d.getUTCFullYear() * 12 + d.getUTCMonth();
710
+ }
711
+ else if (transform === 'day') {
712
+ n = Math.floor(d.getTime() / (24 * 3600 * 1000));
713
+ }
714
+ else {
715
+ n = Math.floor(d.getTime() / (3600 * 1000));
716
+ }
717
+ }
718
+ else if (typeof raw === 'number' || typeof raw === 'bigint') {
719
+ n = Number(raw);
720
+ }
721
+ else {
722
+ throw new Error(`${transform} requires string|number|bigint`);
723
+ }
724
+ const buf = Buffer.alloc(4);
725
+ buf.writeInt32LE(n);
726
+ return buf;
727
+ }
728
+ default:
729
+ if (transform.startsWith('bucket[')) {
730
+ if (typeof raw !== 'number') {
731
+ throw new Error('bucket requires number input');
732
+ }
733
+ const buf = Buffer.alloc(4);
734
+ buf.writeInt32LE(raw);
735
+ return buf;
736
+ }
737
+ if (transform.startsWith('truncate[')) {
738
+ if (typeof raw !== 'string') {
739
+ throw new Error('truncate requires string input');
740
+ }
741
+ const width = Number(/\d+/.exec(transform)?.[0]);
742
+ return Buffer.from(raw.substring(0, width), 'utf8');
743
+ }
744
+ throw new Error(`Unsupported transform ${transform}`);
745
+ }
746
+ }
747
+ const NaNValue = NaN;
748
+ function makeBounds(paritions, spec, schema) {
749
+ return spec.fields.map((f) => {
750
+ const schemaField = schema.fields.find((sf) => sf.id === f['source-id']);
751
+ if (!schemaField) {
752
+ throw new Error(`Schema field not found for source-id ${f['source-id']}`);
753
+ }
754
+ if (!(f.name in paritions)) {
755
+ throw new Error(`paritions missing ${f.name}`);
756
+ }
757
+ const raw = paritions[f.name];
758
+ if (typeof raw === 'number' && isNaN(raw)) {
759
+ return NaNValue;
760
+ }
761
+ if (raw === null || raw === undefined) {
762
+ return null;
763
+ }
764
+ const out_type = _outputType(f.transform, schemaField.type);
765
+ return _encodeValue(raw, f.transform, out_type);
766
+ });
767
+ }
768
+
769
+ const S3_REGEX = /^s3:\/\/([^/]+)\/(.+)$/;
770
+ function parseS3Url(url) {
771
+ const match = S3_REGEX.exec(url);
772
+ if (!match) {
773
+ throw new Error('Invalid S3 URL');
774
+ }
775
+ return { bucket: match[1], key: match[2] };
776
+ }
777
+ const g_s3Map = new Map();
778
+ const g_s3TablesMap = new Map();
779
+ function getS3Client(params) {
780
+ const { region, credentials } = params;
781
+ let ret = g_s3Map.get(region)?.get(credentials);
782
+ if (!ret) {
783
+ const opts = {};
784
+ if (region) {
785
+ opts.region = region;
786
+ }
787
+ if (credentials) {
788
+ opts.credentials = credentials;
789
+ }
790
+ ret = new clientS3.S3Client(opts);
791
+ _setMap(g_s3Map, region, credentials, ret);
792
+ }
793
+ return ret;
794
+ }
795
+ function getS3TablesClient(params) {
796
+ const { region, credentials } = params;
797
+ let ret = g_s3TablesMap.get(region)?.get(credentials);
798
+ if (!ret) {
799
+ const opts = {};
800
+ if (region) {
801
+ opts.region = region;
802
+ }
803
+ if (credentials) {
804
+ opts.credentials = credentials;
805
+ }
806
+ ret = new clientS3tables.S3TablesClient(opts);
807
+ _setMap(g_s3TablesMap, region, credentials, ret);
808
+ }
809
+ return ret;
810
+ }
811
+ function _setMap(map, region, credentials, client) {
812
+ let region_map = map.get(region);
813
+ region_map ??= new Map();
814
+ region_map.set(credentials, client);
815
+ }
816
+ async function writeS3File(params) {
817
+ const { credentials, region, bucket, key, body } = params;
818
+ const s3 = getS3Client({ region, credentials });
819
+ const command = new clientS3.PutObjectCommand({
820
+ Bucket: bucket,
821
+ Key: key,
822
+ Body: body,
823
+ });
824
+ await s3.send(command);
825
+ }
826
+ async function updateManifestList(params) {
827
+ const { region, credentials, bucket, key, outKey, prepend } = params;
828
+ const metadata = params.metadata
829
+ ? fixupMetadata(params.metadata)
830
+ : params.metadata;
831
+ const s3 = getS3Client({ region, credentials });
832
+ const get = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
833
+ const response = await s3.send(get);
834
+ const source = response.Body;
835
+ if (!source) {
836
+ throw new Error('failed to get source manifest list');
837
+ }
838
+ const passthrough = new node_stream.PassThrough();
839
+ const decoder = new avsc__namespace.streams.BlockDecoder({
840
+ parseHook: () => ManifestListType,
841
+ });
842
+ const encoder = new avsc__namespace.streams.BlockEncoder(ManifestListType, {
843
+ codec: 'deflate',
844
+ metadata,
845
+ });
846
+ encoder.pipe(passthrough);
847
+ for (const record of prepend) {
848
+ encoder.write(record);
849
+ }
850
+ const upload = new libStorage.Upload({
851
+ client: s3,
852
+ params: { Bucket: bucket, Key: outKey, Body: passthrough },
853
+ });
854
+ const stream_promise = new Promise((resolve, reject) => {
855
+ decoder.on('error', reject);
856
+ decoder.on('data', (record) => {
857
+ encoder.write(record);
858
+ });
859
+ decoder.on('end', () => {
860
+ encoder.end();
861
+ });
862
+ decoder.on('finish', () => {
863
+ resolve();
864
+ });
865
+ source.pipe(decoder);
866
+ });
867
+ await Promise.all([stream_promise, upload.done()]);
868
+ }
869
+
870
+ async function addManifest(params) {
871
+ const { credentials, region, metadata } = params;
872
+ const bucket = metadata.location.split('/').slice(-1)[0];
873
+ const schema = metadata.schemas.find((s) => s['schema-id'] === params.schemaId);
874
+ const spec = metadata['partition-specs'].find((p) => p['spec-id'] === params.specId);
875
+ if (!bucket) {
876
+ throw new Error('bad manifest location');
877
+ }
878
+ if (!schema) {
879
+ throw new Error('schema not found');
880
+ }
881
+ if (!spec) {
882
+ throw new Error('partition spec not found');
883
+ }
884
+ if (!params.files[0]) {
885
+ throw new Error('must have at least 1 file');
886
+ }
887
+ let added_rows_count = 0n;
888
+ const partitions = spec.fields.map(() => ({
889
+ contains_null: false,
890
+ contains_nan: false,
891
+ upper_bound: null,
892
+ lower_bound: null,
893
+ }));
894
+ const records = params.files.map((file) => {
895
+ added_rows_count += file.recordCount;
896
+ const bounds = makeBounds(file.partitions, spec, schema);
897
+ for (let i = 0; i < partitions.length; i++) {
898
+ const part = partitions[i];
899
+ const bound = bounds[i];
900
+ if (!part) {
901
+ throw new Error('impossible');
902
+ }
903
+ else if (bound === null) {
904
+ part.contains_null = true;
905
+ }
906
+ else if (Buffer.isBuffer(bound)) {
907
+ part.upper_bound = _maxBuffer(part.upper_bound ?? null, bound);
908
+ part.lower_bound = _minBuffer(part.lower_bound ?? null, bound);
909
+ }
910
+ else {
911
+ part.contains_nan = true;
912
+ }
913
+ }
914
+ return {
915
+ status: ManifestFileStatus.ADDED,
916
+ snapshot_id: params.snapshotId,
917
+ sequence_number: params.sequenceNumber,
918
+ file_sequence_number: params.sequenceNumber,
919
+ data_file: {
920
+ content: DataFileContent.DATA,
921
+ file_path: file.file,
922
+ file_format: 'PARQUET',
923
+ record_count: file.recordCount,
924
+ file_size_in_bytes: file.fileSize,
925
+ partition: file.partitions,
926
+ column_sizes: _transformRecord(schema, file.columnSizes),
927
+ value_counts: _transformRecord(schema, file.valueCounts),
928
+ null_value_counts: _transformRecord(schema, file.nullValueCounts),
929
+ nan_value_counts: _transformRecord(schema, file.nanValueCounts),
930
+ lower_bounds: _transformRecord(schema, file.lowerBounds),
931
+ upper_bounds: _transformRecord(schema, file.upperBounds),
932
+ key_metadata: file.keyMetadata ?? null,
933
+ split_offsets: file.splitOffsets ?? null,
934
+ equality_ids: file.equalityIds ?? null,
935
+ sort_order_id: file.sortOrderId ?? null,
936
+ },
937
+ };
938
+ });
939
+ const manifest_type = makeManifestType(spec, schema);
940
+ const manifest_buf = await avroToBuffer({
941
+ type: manifest_type,
942
+ metadata: {
943
+ 'partition-spec-id': String(params.specId),
944
+ 'partition-spec': JSON.stringify(spec.fields),
945
+ },
946
+ records,
947
+ });
948
+ const manifest_key = `metadata/${node_crypto.randomUUID()}.avro`;
949
+ await writeS3File({
950
+ credentials,
951
+ region,
952
+ bucket,
953
+ key: manifest_key,
954
+ body: manifest_buf,
955
+ });
956
+ const manifest_record = {
957
+ manifest_path: `s3://${bucket}/${manifest_key}`,
958
+ manifest_length: BigInt(manifest_buf.length),
959
+ partition_spec_id: params.specId,
960
+ content: ListContent.DATA,
961
+ sequence_number: params.sequenceNumber,
962
+ min_sequence_number: params.sequenceNumber,
963
+ added_snapshot_id: params.snapshotId,
964
+ added_data_files_count: params.files.length,
965
+ existing_data_files_count: 0,
966
+ deleted_data_files_count: 0,
967
+ added_rows_count,
968
+ existing_rows_count: 0n,
969
+ deleted_rows_count: 0n,
970
+ partitions,
971
+ };
972
+ return manifest_record;
973
+ }
974
+ function _transformRecord(schema, map) {
975
+ if (!map) {
976
+ return null;
977
+ }
978
+ const ret = [];
979
+ for (const field of schema.fields) {
980
+ const value = map[field.name];
981
+ if (value !== undefined) {
982
+ ret.push({ key: field.id, value });
983
+ }
984
+ }
985
+ return ret.length > 0 ? ret : null;
986
+ }
987
+ function _minBuffer(a, b) {
988
+ if (!a && !b) {
989
+ return null;
990
+ }
991
+ else if (!a) {
992
+ return b;
993
+ }
994
+ else if (!b) {
995
+ return a;
996
+ }
997
+ return Buffer.compare(a, b) <= 0 ? a : b;
998
+ }
999
+ function _maxBuffer(a, b) {
1000
+ if (!a && !b) {
1001
+ return null;
1002
+ }
1003
+ else if (!a) {
1004
+ return b;
1005
+ }
1006
+ else if (!b) {
1007
+ return a;
1008
+ }
1009
+ return Buffer.compare(a, b) >= 0 ? a : b;
1010
+ }
1011
+
1012
+ function customNumberParser(value) {
1013
+ if (LosslessJson__namespace.isInteger(value)) {
1014
+ if (LosslessJson__namespace.isSafeNumber(value)) {
1015
+ return parseInt(value, 10);
1016
+ }
1017
+ return BigInt(value);
1018
+ }
1019
+ return parseFloat(value);
1020
+ }
1021
+ function parse(text) {
1022
+ return LosslessJson__namespace.parse(text, null, customNumberParser);
1023
+ }
1024
+
12
1025
  async function icebergRequest(params) {
13
1026
  const region = params.tableBucketARN.split(':')[3];
14
1027
  if (!region) {
@@ -17,7 +1030,7 @@ async function icebergRequest(params) {
17
1030
  const arn = encodeURIComponent(params.tableBucketARN);
18
1031
  const hostname = `s3tables.${region}.amazonaws.com`;
19
1032
  const full_path = `/iceberg/v1/${arn}${params.suffix}`;
20
- const body = params.body ? JSON.stringify(params.body) : null;
1033
+ const body = params.body ? LosslessJson.stringify(params.body) : null;
21
1034
  const req_opts = {
22
1035
  method: params.method ?? 'GET',
23
1036
  protocol: 'https:',
@@ -47,29 +1060,47 @@ async function icebergRequest(params) {
47
1060
  fetch_opts.body = signed.body;
48
1061
  }
49
1062
  const res = await fetch(url, fetch_opts);
1063
+ const text = await res.text();
50
1064
  if (!res.ok) {
51
- throw new Error(`request failed: ${res.status} ${res.statusText}`);
1065
+ throw new Error(`request failed: ${res.status} ${res.statusText} ${text}`);
1066
+ }
1067
+ try {
1068
+ return parse(text);
1069
+ }
1070
+ catch {
1071
+ return text;
52
1072
  }
53
- return (await res.json());
54
1073
  }
55
1074
 
56
1075
  async function getMetadata(params) {
57
- const { config, ...other } = params;
58
- const client = new clientS3tables.S3TablesClient(config ?? {});
1076
+ if ('tableBucketARN' in params) {
1077
+ const icebergResponse = await icebergRequest({
1078
+ credentials: params.credentials,
1079
+ tableBucketARN: params.tableBucketARN,
1080
+ method: 'GET',
1081
+ suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
1082
+ });
1083
+ if (icebergResponse.metadata) {
1084
+ return icebergResponse.metadata;
1085
+ }
1086
+ throw new Error('invalid table metadata');
1087
+ }
1088
+ const { ...other } = params;
1089
+ const client = getS3TablesClient(params);
59
1090
  const get_table_cmd = new clientS3tables.GetTableCommand(other);
60
1091
  const response = await client.send(get_table_cmd);
61
1092
  if (!response.metadataLocation) {
62
1093
  throw new Error('missing metadataLocation');
63
1094
  }
64
- const s3_client = new clientS3.S3Client(config ?? {});
65
- const { key, bucket } = _parseS3Url(response.metadataLocation);
1095
+ const s3_client = getS3Client(params);
1096
+ const { key, bucket } = parseS3Url(response.metadataLocation);
66
1097
  const get_file_cmd = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
67
1098
  const file_response = await s3_client.send(get_file_cmd);
68
1099
  const body = await file_response.Body?.transformToString();
69
1100
  if (!body) {
70
1101
  throw new Error('missing body');
71
1102
  }
72
- return JSON.parse(body);
1103
+ return parse(body);
73
1104
  }
74
1105
  async function addSchema(params) {
75
1106
  return icebergRequest({
@@ -113,18 +1144,173 @@ async function addPartitionSpec(params) {
113
1144
  },
114
1145
  });
115
1146
  }
116
- const S3_REGEX = /^s3:\/\/([^/]+)\/(.+)$/;
117
- function _parseS3Url(url) {
118
- const match = S3_REGEX.exec(url);
119
- if (!match) {
120
- throw new Error('Invalid S3 URL');
1147
+
1148
/**
 * Append pre-written data files to an Iceberg table as a new snapshot.
 *
 * High-level flow:
 *  1. Fetch current table metadata and locate the parent snapshot.
 *  2. Write one manifest per entry in `params.lists` (via addManifest).
 *  3. Write (or extend) the manifest list for the new snapshot.
 *  4. POST an `add-snapshot` + `set-snapshot-ref` commit to the Iceberg
 *     REST endpoint, guarded by an `assert-ref-snapshot-id` requirement
 *     so concurrent commits are rejected.
 *
 * @param {object} params - credentials, tableBucketARN, namespace, name,
 *   and `lists`: per-manifest groups of files ({ schemaId, specId, files }).
 *   Each file is expected to carry BigInt `recordCount` and `fileSize`
 *   fields — TODO confirm against addManifest's contract.
 * @returns {Promise<object>} the commit response from the Iceberg endpoint.
 * @throws {Error} on a malformed ARN/location or a missing parent snapshot.
 */
async function addDataFiles(params) {
    const { credentials } = params;
    // ARN format: arn:aws:s3tables:<region>:<account>:bucket/<name>;
    // field 3 is the region.
    const region = params.tableBucketARN.split(':')[3];
    if (!region) {
        throw new Error('bad tableBucketARN');
    }
    // Fresh random 63-bit snapshot id for the commit we are building.
    const snapshot_id = _randomBigInt64();
    const metadata = await getMetadata(params);
    // NOTE(review): -1 is treated as "no current snapshot" below; this
    // matches Iceberg's convention for an empty table — confirm the parser
    // yields a plain number here and not a wrapped lossless value.
    const parent_snapshot_id = metadata['current-snapshot-id'];
    // The table's warehouse bucket is the last path segment of `location`.
    const bucket = metadata.location.split('/').slice(-1)[0];
    const snapshot = parent_snapshot_id === -1
        ? null
        : metadata.snapshots.find((s) => s['snapshot-id'] === parent_snapshot_id);
    if (!bucket) {
        throw new Error('bad manifest location');
    }
    if (parent_snapshot_id !== -1 && !snapshot) {
        throw new Error('no old snapshot');
    }
    // Next sequence number = max over existing snapshots + 1.
    const sequence_number = BigInt(metadata.snapshots.reduce((memo, s) => s['sequence-number'] > memo ? s['sequence-number'] : memo, 0)) + 1n;
    // Totals for the snapshot summary; records/size are BigInt because
    // Iceberg counts can exceed Number.MAX_SAFE_INTEGER.
    let added_files = 0;
    let added_records = 0n;
    let added_size = 0n;
    // Write all manifests in parallel; each resolves to a manifest-list record.
    const records = await Promise.all(params.lists.map(async (list) => {
        added_files += list.files.length;
        for (const file of list.files) {
            added_records += file.recordCount;
            added_size += file.fileSize;
        }
        const opts = {
            credentials,
            region,
            metadata,
            schemaId: list.schemaId,
            specId: list.specId,
            snapshotId: snapshot_id,
            sequenceNumber: sequence_number,
            files: list.files,
        };
        return addManifest(opts);
    }));
    const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
    const manifest_list_url = `s3://${bucket}/${manifest_list_key}`;
    if (snapshot) {
        // Existing table: copy the parent's manifest list, prepending the
        // new manifest records, so history is preserved.
        const { key: old_list_key } = parseS3Url(snapshot['manifest-list']);
        if (!old_list_key) {
            throw new Error('snapshot invalid');
        }
        await updateManifestList({
            credentials,
            region,
            bucket,
            key: old_list_key,
            outKey: manifest_list_key,
            // Avro file metadata values must be strings.
            metadata: {
                'sequence-number': String(sequence_number),
                'snapshot-id': String(snapshot_id),
                'parent-snapshot-id': String(parent_snapshot_id),
            },
            prepend: records,
        });
    }
    else {
        // First snapshot: serialize a brand-new manifest list from scratch.
        const manifest_list_buf = await avroToBuffer({
            type: ManifestListType,
            metadata: {
                'sequence-number': String(sequence_number),
                'snapshot-id': String(snapshot_id),
                'parent-snapshot-id': String(parent_snapshot_id),
            },
            records,
        });
        await writeS3File({
            credentials,
            region,
            bucket,
            key: manifest_list_key,
            body: manifest_list_buf,
        });
    }
    // Commit: atomically add the snapshot and advance the `main` branch ref.
    const commit_result = await icebergRequest({
        credentials: params.credentials,
        tableBucketARN: params.tableBucketARN,
        method: 'POST',
        suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
        body: {
            // Optimistic-concurrency guard: only require the parent snapshot
            // to still be current when the table was non-empty.
            requirements: parent_snapshot_id === -1
                ? []
                : [
                    {
                        type: 'assert-ref-snapshot-id',
                        ref: 'main',
                        'snapshot-id': parent_snapshot_id,
                    },
                ],
            updates: [
                {
                    action: 'add-snapshot',
                    snapshot: {
                        'sequence-number': sequence_number,
                        'snapshot-id': snapshot_id,
                        'parent-snapshot-id': parent_snapshot_id,
                        'timestamp-ms': Date.now(),
                        summary: {
                            operation: 'append',
                            'added-data-files': String(added_files),
                            'added-records': String(added_records),
                            'added-files-size': String(added_size),
                        },
                        'manifest-list': manifest_list_url,
                        'schema-id': metadata['current-schema-id'],
                    },
                },
                {
                    action: 'set-snapshot-ref',
                    'snapshot-id': snapshot_id,
                    type: 'branch',
                    'ref-name': 'main',
                },
            ],
        },
    });
    return commit_result;
}
1272
/**
 * Point the table's `main` branch at an existing snapshot.
 *
 * Issues a single `set-snapshot-ref` update against the Iceberg REST
 * commit endpoint; useful for rolling the table back (or forward) to a
 * snapshot that is already present in the metadata.
 *
 * @param {object} params - credentials, tableBucketARN, namespace, name,
 *   and the target `snapshotId`.
 * @returns {Promise<object>} the commit response from the Iceberg endpoint.
 */
async function setCurrentCommit(params) {
    const { credentials, tableBucketARN, namespace, name, snapshotId } = params;
    const body = {
        updates: [
            {
                action: 'set-snapshot-ref',
                'snapshot-id': snapshotId,
                type: 'branch',
                'ref-name': 'main',
            },
        ],
    };
    return icebergRequest({
        credentials,
        tableBucketARN,
        method: 'POST',
        suffix: `/namespaces/${namespace}/tables/${name}`,
        body,
    });
}
1291
/**
 * Generate a random positive 63-bit BigInt, suitable for an Iceberg
 * snapshot id (which must fit in a signed 64-bit integer and be non-zero).
 *
 * @returns {bigint} a uniformly random value in [1, 2^63 - 1].
 */
function _randomBigInt64() {
    // Draw 8 random bytes and clear the sign bit so the value is a
    // non-negative signed-64-bit quantity.
    const raw = node_crypto.randomBytes(8).readBigUInt64BE();
    const masked = raw & 0x7FFFFFFFFFFFFFFFn;
    // Zero is not a valid id; remap it to 1.
    return masked === 0n ? 1n : masked;
}
124
1300
 
125
// Default export: bundles the public API so both
// `require('node-s3tables')` and named requires work.
var index = {
    getMetadata,
    addSchema,
    addPartitionSpec,
    addManifest,
    addDataFiles,
    setCurrentCommit,
};

// Named CommonJS exports (generated ESM->CJS interop wiring; keep the
// explicit one-per-line assignments so bundlers can statically detect them).
exports.addDataFiles = addDataFiles;
exports.addManifest = addManifest;
exports.addPartitionSpec = addPartitionSpec;
exports.addSchema = addSchema;
exports.default = index;
exports.getMetadata = getMetadata;
exports.setCurrentCommit = setCurrentCommit;