node-s3tables 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +206 -1
- package/dist/index.d.ts +107 -7
- package/dist/index.js +1201 -15
- package/package.json +13 -6
package/dist/index.js
CHANGED
|
@@ -2,13 +2,1026 @@
|
|
|
2
2
|
|
|
3
3
|
Object.defineProperty(exports, '__esModule', { value: true });
|
|
4
4
|
|
|
5
|
+
var node_crypto = require('node:crypto');
|
|
6
|
+
var avsc = require('avsc');
|
|
5
7
|
var clientS3 = require('@aws-sdk/client-s3');
|
|
6
8
|
var clientS3tables = require('@aws-sdk/client-s3tables');
|
|
9
|
+
var libStorage = require('@aws-sdk/lib-storage');
|
|
10
|
+
var node_stream = require('node:stream');
|
|
11
|
+
var LosslessJson = require('lossless-json');
|
|
7
12
|
var signatureV4 = require('@smithy/signature-v4');
|
|
8
13
|
var sha256Js = require('@aws-crypto/sha256-js');
|
|
9
14
|
var protocolHttp = require('@smithy/protocol-http');
|
|
10
15
|
var credentialProviderNode = require('@aws-sdk/credential-provider-node');
|
|
11
16
|
|
|
17
|
+
/**
 * Wraps a CommonJS module in a frozen, prototype-less namespace object so it
 * can be consumed like an ES-module namespace. Named exports become live
 * getters delegating to the original module; the whole module is exposed as
 * `.default`. A nullish input still yields `{ default: e }`.
 *
 * @param {object|null|undefined} e - the required CommonJS module
 * @returns {object} frozen namespace object
 */
function _interopNamespaceDefault(e) {
    const n = Object.create(null);
    if (e) {
        for (const k of Object.keys(e)) {
            if (k === 'default') {
                continue; // `.default` is reserved for the module itself below
            }
            const d = Object.getOwnPropertyDescriptor(e, k);
            if (d.get) {
                // Preserve existing accessor descriptors verbatim.
                Object.defineProperty(n, k, d);
            } else {
                // Plain data property: expose as a live getter so later
                // reassignment on the source module is still observed.
                Object.defineProperty(n, k, {
                    enumerable: true,
                    get: () => e[k],
                });
            }
        }
    }
    n.default = e;
    return Object.freeze(n);
}
|
|
33
|
+
|
|
34
|
+
// ES-module-style namespace wrappers for the CommonJS requires above, so the
// rest of the bundle can use `avsc__namespace.streams`, `.types`, etc.
var avsc__namespace = /*#__PURE__*/_interopNamespaceDefault(avsc);
var LosslessJson__namespace = /*#__PURE__*/_interopNamespaceDefault(LosslessJson);
|
|
36
|
+
|
|
37
|
+
/**
 * Normalizes an Avro file-metadata map so every value is a Buffer: Buffers
 * pass through untouched, everything else is UTF-8 encoded.
 *
 * @param {Record<string, string|Buffer>} metadata
 * @returns {Record<string, Buffer>} new object; the input is not mutated
 */
function fixupMetadata(metadata) {
    const result = {};
    Object.entries(metadata).forEach(([name, val]) => {
        result[name] = Buffer.isBuffer(val) ? val : Buffer.from(val, 'utf8');
    });
    return result;
}
|
|
49
|
+
/**
 * Serializes records to an in-memory Avro object-container file (deflate
 * codec, header included) using avsc's streaming BlockEncoder.
 *
 * @param {{ type: object, records: unknown[], metadata?: Record<string, string|Buffer> }} params
 *   - `type`: avsc Type used to encode each record
 *   - `records`: records written synchronously before `end()`
 *   - `metadata`: optional file metadata; values coerced to Buffers
 * @returns {Promise<Buffer>} the complete encoded file
 */
async function avroToBuffer(params) {
    const metadata = params.metadata
        ? fixupMetadata(params.metadata)
        : params.metadata;
    return new Promise((resolve, reject) => {
        // try/catch converts synchronous encoder-construction/write failures
        // into a rejection instead of an unhandled throw.
        try {
            const buffers = [];
            const opts = {
                writeHeader: true,
                codec: 'deflate',
                metadata,
            };
            const encoder = new avsc__namespace.streams.BlockEncoder(params.type, opts);
            encoder.on('data', (chunk) => {
                buffers.push(chunk);
            });
            encoder.on('end', () => {
                // All blocks flushed: concatenate into one Buffer.
                resolve(Buffer.concat(buffers));
            });
            encoder.on('error', reject);
            params.records.forEach((record) => {
                encoder.write(record);
            });
            encoder.end();
        }
        catch (err) {
            // Preserve Error instances; stringify anything else.
            if (err instanceof Error) {
                reject(err);
            }
            else {
                reject(new Error(String(err)));
            }
        }
    });
}
|
|
84
|
+
/**
 * Converts every field of an Iceberg partition spec into its Avro field
 * definition, resolving source columns against the given Iceberg schema.
 *
 * @param {{ fields: object[] }} spec - Iceberg partition spec
 * @param {{ fields: object[] }} schema - Iceberg table schema
 * @returns {object[]} Avro record-field definitions
 */
function icebergToAvroFields(spec, schema) {
    const fields = [];
    for (const partitionField of spec.fields) {
        fields.push(_icebergToAvroField(partitionField, schema));
    }
    return fields;
}
|
|
87
|
+
/**
 * Maps one Iceberg partition-spec field to a nullable Avro field definition.
 * Time-based transforms get logical types matching AvroLogicalTypes;
 * bucket[N] yields int and truncate[N] yields string.
 *
 * @param {{ name: string, transform: string, 'source-id': number }} field
 * @param {{ fields: Array<{ id: number, type: unknown }> }} schema
 * @returns {{ name: string, type: [string, unknown], default: null }}
 * @throws {Error} when the source column is missing or the transform is unsupported
 */
function _icebergToAvroField(field, schema) {
    const source = schema.fields.find((f) => f.id === field['source-id']);
    if (!source) {
        throw new Error(`Source field ${field['source-id']} not found in schema`);
    }
    const transform = field.transform;
    let avroType;
    if (transform === 'identity') {
        // Identity is only supported for primitive (string-named) types.
        if (typeof source.type !== 'string') {
            throw new Error(`Unsupported transform: ${field.transform} for complex type`);
        }
        avroType = _mapPrimitiveToAvro(source.type);
    }
    else if (transform === 'year') {
        avroType = { type: 'int', logicalType: 'year' };
    }
    else if (transform === 'month') {
        avroType = { type: 'int', logicalType: 'month' };
    }
    else if (transform === 'day') {
        avroType = { type: 'int', logicalType: 'date' };
    }
    else if (transform === 'hour') {
        avroType = { type: 'long', logicalType: 'hour' };
    }
    else if (transform.startsWith('bucket[')) {
        avroType = 'int';
    }
    else if (transform.startsWith('truncate[')) {
        avroType = 'string';
    }
    else {
        throw new Error(`Unsupported transform: ${field.transform} for type`);
    }
    // All partition fields are nullable in the manifest partition record.
    return { name: field.name, type: ['null', avroType], default: null };
}
|
|
125
|
+
/**
 * Maps an Iceberg primitive type name to the Avro type used in manifest
 * partition records. Note: boolean maps to 'int' and float to 'double',
 * matching the original encoding choices of this module.
 *
 * @param {string} type - Iceberg primitive name
 * @returns {string|{type: string, logicalType: string}}
 * @throws {Error} for unknown primitives
 */
function _mapPrimitiveToAvro(type) {
    // 'date' is the only primitive carrying a logical type; return a fresh
    // object per call so callers cannot alias a shared literal.
    if (type === 'date') {
        return { type: 'int', logicalType: 'date' };
    }
    const primitives = new Map([
        ['boolean', 'int'],
        ['int', 'int'],
        ['long', 'long'],
        ['time', 'long'],
        ['timestamp', 'long'],
        ['timestamptz', 'long'],
        ['float', 'double'],
        ['double', 'double'],
        ['string', 'string'],
        ['uuid', 'string'],
        ['binary', 'bytes'],
    ]);
    const mapped = primitives.get(type);
    if (mapped === undefined) {
        throw new Error(`Unsupported primitive: ${type}`);
    }
    return mapped;
}
|
|
150
|
+
|
|
151
|
+
// Manifest entry status (Iceberg manifest_entry.status), with TS-enum-style
// reverse mapping: ManifestFileStatus[1] === 'ADDED'.
var ManifestFileStatus;
(function (status) {
    status.EXISTING = 0;
    status[0] = 'EXISTING';
    status.ADDED = 1;
    status[1] = 'ADDED';
    status.DELETED = 2;
    status[2] = 'DELETED';
})(ManifestFileStatus || (ManifestFileStatus = {}));
|
|
157
|
+
// Data-file content kind (Iceberg data_file.content), with reverse mapping.
var DataFileContent;
(function (content) {
    content.DATA = 0;
    content[0] = 'DATA';
    content.POSITION_DELETES = 1;
    content[1] = 'POSITION_DELETES';
    content.EQUALITY_DELETES = 2;
    content[2] = 'EQUALITY_DELETES';
})(DataFileContent || (DataFileContent = {}));
|
|
163
|
+
// Manifest-list content kind (Iceberg manifest_file.content), with reverse
// mapping.
var ListContent;
(function (content) {
    content.DATA = 0;
    content[0] = 'DATA';
    content.DELETES = 1;
    content[1] = 'DELETES';
})(ListContent || (ListContent = {}));
|
|
168
|
+
// avsc 'long' override backed by JS BigInt so the full 64-bit range survives
// round-trips (the default avsc long is a lossy Number). Registered under
// 'long' via AvroRegistry below.
const BigIntType = avsc__namespace.types.LongType.__with({
    fromBuffer: (buf) => buf.readBigInt64LE(),
    toBuffer(n) {
        const buf = Buffer.alloc(8);
        buf.writeBigInt64LE(n);
        return buf;
    },
    fromJSON: BigInt,
    // NOTE(review): Number() is lossy above 2^53-1 — confirm JSON output of
    // very large longs is acceptable.
    toJSON: Number,
    isValid: (n) => typeof n === 'bigint',
    compare(n1, n2) {
        return n1 === n2 ? 0 : n1 < n2 ? -1 : 1;
    },
});
|
|
182
|
+
/**
 * avsc logical type for the Iceberg 'year' transform: the stored int is an
 * offset in years from 1970, surfaced as a "YYYY" string.
 */
class YearStringType extends avsc__namespace.types.LogicalType {
    _fromValue(val) {
        return String(1970 + val);
    }
    _toValue(str) {
        return parseInt(str, 10) - 1970;
    }
    _resolve(type) {
        // Only resolvable from a plain int.
        if (!avsc__namespace.Type.isType(type, 'int')) {
            return null;
        }
        return (val) => this._fromValue(val);
    }
}
|
|
196
|
+
/**
 * avsc logical type for the Iceberg 'month' transform: the stored int is an
 * offset in months from 1970-01, surfaced as a "YYYY-MM" string.
 */
class MonthStringType extends avsc__namespace.types.LogicalType {
    _fromValue(val) {
        const year = 1970 + Math.floor(val / 12);
        const month = String((val % 12) + 1).padStart(2, '0');
        return `${year}-${month}`;
    }
    _toValue(str) {
        const [y, m] = str.split('-').map(Number);
        // Missing pieces default to the epoch (1970-01).
        return ((y ?? 1970) - 1970) * 12 + ((m ?? 1) - 1);
    }
    _resolve(type) {
        if (!avsc__namespace.Type.isType(type, 'int')) {
            return null;
        }
        return (val) => this._fromValue(val);
    }
}
|
|
213
|
+
/**
 * avsc logical type for date ints: days since 1970-01-01 surfaced as a
 * "YYYY-MM-DD" string (UTC).
 */
class DateStringType extends avsc__namespace.types.LogicalType {
    _fromValue(val) {
        return new Date(val * 86400000).toISOString().slice(0, 10);
    }
    _toValue(str) {
        const [year, month, day] = str.split('-').map(Number);
        const utcMs = Date.UTC(year ?? 1970, (month ?? 1) - 1, day ?? 1);
        return Math.floor(utcMs / 86400000);
    }
    _resolve(type) {
        if (!avsc__namespace.Type.isType(type, 'int')) {
            return null;
        }
        return (val) => this._fromValue(val);
    }
}
|
|
229
|
+
/**
 * avsc logical type for the Iceberg 'hour' transform: hours since epoch
 * surfaced as a "YYYY-MM-DDTHH" string (UTC). Backed by 'long'.
 */
class HourStringType extends avsc__namespace.types.LogicalType {
    _fromValue(val) {
        return new Date(val * 3600000).toISOString().slice(0, 13);
    }
    _toValue(str) {
        const parsed = new Date(str);
        return Math.floor(parsed.getTime() / 3600000);
    }
    _resolve(type) {
        if (!avsc__namespace.Type.isType(type, 'long')) {
            return null;
        }
        return (val) => this._fromValue(val);
    }
}
|
|
245
|
+
// Shared avsc options: override 'long' with the BigInt-backed type, and
// register the logical types produced by _icebergToAvroField.
const AvroRegistry = { long: BigIntType };
const AvroLogicalTypes = {
    year: YearStringType,
    month: MonthStringType,
    date: DateStringType,
    hour: HourStringType,
};
|
|
252
|
+
|
|
253
|
+
/**
 * Builds the avsc Type for an Iceberg v2 manifest file ('manifest_entry'
 * records). The nested partition record (name 'r102') is generated from the
 * table's partition spec and schema; everything else is the fixed Iceberg
 * manifest layout, with Iceberg field-ids attached as custom attributes.
 * Map-typed columns use the Iceberg array-of-key/value encoding with
 * logicalType 'map'.
 *
 * @param {{ fields: object[] }} spec - Iceberg partition spec
 * @param {{ fields: object[] }} schema - Iceberg table schema
 * @returns {object} avsc Type for manifest_entry records
 */
function makeManifestType(spec, schema) {
    const part_fields = icebergToAvroFields(spec, schema);
    return avsc__namespace.Type.forSchema({
        type: 'record',
        name: 'manifest_entry',
        fields: [
            { name: 'status', type: 'int', 'field-id': 0 },
            { name: 'snapshot_id', type: ['null', 'long'], default: null, 'field-id': 1 },
            { name: 'sequence_number', type: ['null', 'long'], default: null, 'field-id': 3 },
            { name: 'file_sequence_number', type: ['null', 'long'], default: null, 'field-id': 4 },
            {
                name: 'data_file',
                type: {
                    type: 'record',
                    name: 'r2',
                    fields: [
                        {
                            name: 'content',
                            type: 'int',
                            doc: 'Contents of the file: 0=data, 1=position deletes, 2=equality deletes',
                            'field-id': 134,
                        },
                        {
                            name: 'file_path',
                            type: 'string',
                            doc: 'Location URI with FS scheme',
                            'field-id': 100,
                        },
                        {
                            name: 'file_format',
                            type: 'string',
                            doc: 'File format name: avro, orc, or parquet',
                            'field-id': 101,
                        },
                        {
                            // Shape varies per table: built from the partition spec.
                            name: 'partition',
                            type: { type: 'record', name: 'r102', fields: part_fields },
                            doc: 'Partition data tuple, schema based on the partition spec',
                            'field-id': 102,
                        },
                        {
                            name: 'record_count',
                            type: 'long',
                            doc: 'Number of records in the file',
                            'field-id': 103,
                        },
                        {
                            name: 'file_size_in_bytes',
                            type: 'long',
                            doc: 'Total file size in bytes',
                            'field-id': 104,
                        },
                        {
                            name: 'column_sizes',
                            type: [
                                'null',
                                {
                                    type: 'array',
                                    items: {
                                        type: 'record',
                                        name: 'k117_v118',
                                        fields: [
                                            { name: 'key', type: 'int', 'field-id': 117 },
                                            { name: 'value', type: 'long', 'field-id': 118 },
                                        ],
                                    },
                                    logicalType: 'map',
                                },
                            ],
                            doc: 'Map of column id to total size on disk',
                            default: null,
                            'field-id': 108,
                        },
                        {
                            name: 'value_counts',
                            type: [
                                'null',
                                {
                                    type: 'array',
                                    items: {
                                        type: 'record',
                                        name: 'k119_v120',
                                        fields: [
                                            { name: 'key', type: 'int', 'field-id': 119 },
                                            { name: 'value', type: 'long', 'field-id': 120 },
                                        ],
                                    },
                                    logicalType: 'map',
                                },
                            ],
                            doc: 'Map of column id to total count, including null and NaN',
                            default: null,
                            'field-id': 109,
                        },
                        {
                            name: 'null_value_counts',
                            type: [
                                'null',
                                {
                                    type: 'array',
                                    items: {
                                        type: 'record',
                                        name: 'k121_v122',
                                        fields: [
                                            { name: 'key', type: 'int', 'field-id': 121 },
                                            { name: 'value', type: 'long', 'field-id': 122 },
                                        ],
                                    },
                                    logicalType: 'map',
                                },
                            ],
                            doc: 'Map of column id to null value count',
                            default: null,
                            'field-id': 110,
                        },
                        {
                            name: 'nan_value_counts',
                            type: [
                                'null',
                                {
                                    type: 'array',
                                    items: {
                                        type: 'record',
                                        name: 'k138_v139',
                                        fields: [
                                            { name: 'key', type: 'int', 'field-id': 138 },
                                            { name: 'value', type: 'long', 'field-id': 139 },
                                        ],
                                    },
                                    logicalType: 'map',
                                },
                            ],
                            doc: 'Map of column id to number of NaN values in the column',
                            default: null,
                            'field-id': 137,
                        },
                        {
                            name: 'lower_bounds',
                            type: [
                                'null',
                                {
                                    type: 'array',
                                    items: {
                                        type: 'record',
                                        name: 'k126_v127',
                                        fields: [
                                            { name: 'key', type: 'int', 'field-id': 126 },
                                            { name: 'value', type: 'bytes', 'field-id': 127 },
                                        ],
                                    },
                                    logicalType: 'map',
                                },
                            ],
                            doc: 'Map of column id to lower bound',
                            default: null,
                            'field-id': 125,
                        },
                        {
                            name: 'upper_bounds',
                            type: [
                                'null',
                                {
                                    type: 'array',
                                    items: {
                                        type: 'record',
                                        name: 'k129_v130',
                                        fields: [
                                            { name: 'key', type: 'int', 'field-id': 129 },
                                            { name: 'value', type: 'bytes', 'field-id': 130 },
                                        ],
                                    },
                                    logicalType: 'map',
                                },
                            ],
                            doc: 'Map of column id to upper bound',
                            default: null,
                            'field-id': 128,
                        },
                        {
                            name: 'key_metadata',
                            type: ['null', 'bytes'],
                            doc: 'Encryption key metadata blob',
                            default: null,
                            'field-id': 131,
                        },
                        {
                            name: 'split_offsets',
                            type: [
                                'null',
                                { type: 'array', items: 'long', 'element-id': 133 },
                            ],
                            doc: 'Splittable offsets',
                            default: null,
                            'field-id': 132,
                        },
                        {
                            name: 'equality_ids',
                            type: [
                                'null',
                                { type: 'array', items: 'int', 'element-id': 136 },
                            ],
                            doc: 'Equality comparison field IDs',
                            default: null,
                            'field-id': 135,
                        },
                        {
                            name: 'sort_order_id',
                            type: ['null', 'int'],
                            doc: 'Sort order ID',
                            default: null,
                            'field-id': 140,
                        },
                    ],
                },
                'field-id': 2,
            },
        ],
    }, { registry: { ...AvroRegistry }, logicalTypes: AvroLogicalTypes });
}
|
|
487
|
+
// avsc Type for the Iceberg v2 manifest list ('manifest_file' records).
// Static because its layout does not depend on the table's partition spec;
// the per-partition summary array (r508) carries only generic bounds/flags.
// Uses the BigInt-backed 'long' via AvroRegistry (no logical types needed).
const ManifestListType = avsc__namespace.Type.forSchema({
    type: 'record',
    name: 'manifest_file',
    fields: [
        {
            name: 'manifest_path',
            type: 'string',
            doc: 'Location URI with FS scheme',
            'field-id': 500,
        },
        {
            name: 'manifest_length',
            type: 'long',
            doc: 'Total file size in bytes',
            'field-id': 501,
        },
        {
            name: 'partition_spec_id',
            type: 'int',
            doc: 'Spec ID used to write',
            'field-id': 502,
        },
        {
            name: 'content',
            type: 'int',
            doc: 'Contents of the manifest: 0=data, 1=deletes',
            'field-id': 517,
        },
        {
            name: 'sequence_number',
            type: 'long',
            doc: 'Sequence number when the manifest was added',
            'field-id': 515,
        },
        {
            name: 'min_sequence_number',
            type: 'long',
            doc: 'Lowest sequence number in the manifest',
            'field-id': 516,
        },
        {
            name: 'added_snapshot_id',
            type: 'long',
            doc: 'Snapshot ID that added the manifest',
            'field-id': 503,
        },
        {
            name: 'added_data_files_count',
            type: 'int',
            doc: 'Added entry count',
            'field-id': 504,
        },
        {
            name: 'existing_data_files_count',
            type: 'int',
            doc: 'Existing entry count',
            'field-id': 505,
        },
        {
            name: 'deleted_data_files_count',
            type: 'int',
            doc: 'Deleted entry count',
            'field-id': 506,
        },
        {
            name: 'added_rows_count',
            type: 'long',
            doc: 'Added rows count',
            'field-id': 512,
        },
        {
            name: 'existing_rows_count',
            type: 'long',
            doc: 'Existing rows count',
            'field-id': 513,
        },
        {
            name: 'deleted_rows_count',
            type: 'long',
            doc: 'Deleted rows count',
            'field-id': 514,
        },
        {
            name: 'partitions',
            type: [
                'null',
                {
                    type: 'array',
                    items: {
                        type: 'record',
                        name: 'r508',
                        fields: [
                            {
                                name: 'contains_null',
                                type: 'boolean',
                                doc: 'True if any file has a null partition value',
                                'field-id': 509,
                            },
                            {
                                name: 'contains_nan',
                                type: ['null', 'boolean'],
                                doc: 'True if any file has a nan partition value',
                                default: null,
                                'field-id': 518,
                            },
                            {
                                name: 'lower_bound',
                                type: ['null', 'bytes'],
                                doc: 'Partition lower bound for all files',
                                default: null,
                                'field-id': 510,
                            },
                            {
                                name: 'upper_bound',
                                type: ['null', 'bytes'],
                                doc: 'Partition upper bound for all files',
                                default: null,
                                'field-id': 511,
                            },
                        ],
                    },
                    'element-id': 508,
                },
            ],
            doc: 'Summary for each partition',
            default: null,
            'field-id': 507,
        },
    ],
}, { registry: { ...AvroRegistry } });
|
|
617
|
+
|
|
618
|
+
/**
 * True when an Iceberg type descriptor is a primitive (represented as a bare
 * string, e.g. 'int'), false for complex/object descriptors.
 *
 * @param {unknown} t
 * @returns {boolean}
 */
function _isPrimitive(t) {
    if (typeof t === 'string') {
        return true;
    }
    return false;
}
|
|
621
|
+
/**
 * Resolves the Iceberg result type of a partition transform applied to a
 * source column type. identity/truncate keep the (primitive) source type;
 * bucket and the time transforms produce 'int'. Returns null when the
 * combination is unsupported (e.g. identity on a complex type).
 *
 * @param {string} transform
 * @param {unknown} sourceType - Iceberg type descriptor of the source column
 * @returns {string|null}
 */
function _outputType(transform, sourceType) {
    const isTimeTransform = transform === 'year' || transform === 'month' ||
        transform === 'day' || transform === 'hour';
    if (transform.startsWith('bucket[') || isTimeTransform) {
        return 'int';
    }
    if (transform === 'identity' || transform.startsWith('truncate[')) {
        return _isPrimitive(sourceType) ? sourceType : null;
    }
    return null;
}
|
|
639
|
+
/**
 * Encodes one partition value into the little-endian binary form used for
 * manifest partition bounds.
 *
 * @param {unknown} raw - the raw partition value
 * @param {string|null} transform - Iceberg partition transform name
 * @param {string|null} out_type - result type from _outputType
 * @returns {Buffer|null} encoded bytes, or null when any input is null
 * @throws {Error} for unsupported transform/type/value combinations
 */
function _encodeValue(raw, transform, out_type) {
    if (raw === null || transform === null || out_type === null) {
        return null;
    }
    switch (transform) {
        case 'identity': {
            if (Buffer.isBuffer(raw)) {
                // Raw bytes are only legal for byte-like Iceberg types.
                if (out_type === 'binary' ||
                    out_type.startsWith('decimal(') ||
                    out_type.startsWith('fixed[')) {
                    return raw;
                }
                throw new Error(`Buffer not allowed for identity with type ${out_type}`);
            }
            switch (out_type) {
                case 'int': {
                    const n = typeof raw === 'number' ? raw : Number(raw);
                    const buf = Buffer.alloc(4);
                    buf.writeInt32LE(Math.floor(n));
                    return buf;
                }
                case 'long': {
                    const n = typeof raw === 'bigint' ? raw : BigInt(raw);
                    const buf = Buffer.alloc(8);
                    buf.writeBigInt64LE(n);
                    return buf;
                }
                case 'float': {
                    const n = typeof raw === 'number' ? raw : Number(raw);
                    const buf = Buffer.alloc(4);
                    buf.writeFloatLE(n);
                    return buf;
                }
                case 'double': {
                    const n = typeof raw === 'number' ? raw : Number(raw);
                    const buf = Buffer.alloc(8);
                    buf.writeDoubleLE(n);
                    return buf;
                }
                case 'string':
                case 'uuid': {
                    const s = typeof raw === 'string' ? raw : String(raw);
                    return Buffer.from(s, 'utf8');
                }
                case 'boolean': {
                    const buf = Buffer.alloc(1);
                    buf.writeUInt8(raw ? 1 : 0);
                    return buf;
                }
                case 'binary':
                case 'date':
                case 'time':
                case 'timestamp':
                case 'timestamptz':
                    // Not yet supported as non-Buffer identity values.
                    throw new Error(`Identity not implemented for type ${out_type}`);
                default:
                    throw new Error(`Identity not implemented for type ${out_type}`);
            }
        }
        case 'year':
        case 'month':
        case 'day':
        case 'hour': {
            let n;
            if (typeof raw === 'string') {
                const d = new Date(raw);
                if (transform === 'year') {
                    // Iceberg 'year' transform is an ordinal: years since 1970
                    // (previously encoded the absolute year, disagreeing with
                    // YearStringType which decodes as 1970 + val).
                    n = d.getUTCFullYear() - 1970;
                }
                else if (transform === 'month') {
                    // Iceberg 'month' transform: months since 1970-01
                    // (previously used the absolute year, disagreeing with
                    // MonthStringType._toValue).
                    n = (d.getUTCFullYear() - 1970) * 12 + d.getUTCMonth();
                }
                else if (transform === 'day') {
                    n = Math.floor(d.getTime() / (24 * 3600 * 1000));
                }
                else {
                    n = Math.floor(d.getTime() / (3600 * 1000));
                }
            }
            else if (typeof raw === 'number' || typeof raw === 'bigint') {
                // Numeric input is assumed to already be the transform ordinal.
                n = Number(raw);
            }
            else {
                throw new Error(`${transform} requires string|number|bigint`);
            }
            const buf = Buffer.alloc(4);
            buf.writeInt32LE(n);
            return buf;
        }
        default:
            if (transform.startsWith('bucket[')) {
                if (typeof raw !== 'number') {
                    throw new Error('bucket requires number input');
                }
                const buf = Buffer.alloc(4);
                buf.writeInt32LE(raw);
                return buf;
            }
            if (transform.startsWith('truncate[')) {
                if (typeof raw !== 'string') {
                    throw new Error('truncate requires string input');
                }
                // Width comes from the transform name, e.g. truncate[5] -> 5.
                const width = Number(/\d+/.exec(transform)?.[0]);
                return Buffer.from(raw.substring(0, width), 'utf8');
            }
            throw new Error(`Unsupported transform ${transform}`);
    }
}
|
|
747
|
+
// Sentinel returned by makeBounds for NaN partition values, so callers can
// distinguish "NaN" (neither null nor a Buffer) from real bounds.
const NaNValue = NaN;
|
|
748
|
+
/**
 * Computes the encoded bound value for each partition-spec field of one data
 * file. Per field the result is: NaNValue for NaN inputs, null for
 * null/undefined inputs, otherwise the Buffer from _encodeValue.
 *
 * @param {Record<string, unknown>} paritions - partition values keyed by spec field name (sic)
 * @param {{ fields: object[] }} spec - Iceberg partition spec
 * @param {{ fields: object[] }} schema - Iceberg table schema
 * @returns {Array<Buffer|number|null>}
 * @throws {Error} when a spec field has no schema source or no partition value
 */
function makeBounds(paritions, spec, schema) {
    return spec.fields.map((specField) => {
        const schemaField = schema.fields.find((sf) => sf.id === specField['source-id']);
        if (!schemaField) {
            throw new Error(`Schema field not found for source-id ${specField['source-id']}`);
        }
        // Every spec field must be present as a key, even if its value is null.
        if (!(specField.name in paritions)) {
            throw new Error(`paritions missing ${specField.name}`);
        }
        const raw = paritions[specField.name];
        if (typeof raw === 'number' && isNaN(raw)) {
            return NaNValue;
        }
        if (raw === null || raw === undefined) {
            return null;
        }
        const out_type = _outputType(specField.transform, schemaField.type);
        return _encodeValue(raw, specField.transform, out_type);
    });
}
|
|
768
|
+
|
|
769
|
+
// Matches s3://<bucket>/<key>; the key must be non-empty.
const S3_REGEX = /^s3:\/\/([^/]+)\/(.+)$/;
/**
 * Splits an s3:// URL into bucket and key.
 *
 * @param {string} url
 * @returns {{ bucket: string, key: string }}
 * @throws {Error} for anything that is not s3://bucket/key
 */
function parseS3Url(url) {
    const parts = S3_REGEX.exec(url);
    if (parts === null) {
        throw new Error('Invalid S3 URL');
    }
    const [, bucket, key] = parts;
    return { bucket, key };
}
|
|
777
|
+
// Client caches: region -> (credentials -> client). Populated via _setMap
// and read by getS3Client / getS3TablesClient.
const g_s3Map = new Map();
const g_s3TablesMap = new Map();
|
|
779
|
+
/**
 * Returns a cached S3Client for the given region/credentials pair, creating
 * and caching a new one on miss. region/credentials may be undefined, in
 * which case the SDK's default resolution applies.
 *
 * @param {{ region?: string, credentials?: object }} params
 * @returns {object} S3Client
 */
function getS3Client(params) {
    const { region, credentials } = params;
    const cached = g_s3Map.get(region)?.get(credentials);
    if (cached) {
        return cached;
    }
    const opts = {};
    if (region) {
        opts.region = region;
    }
    if (credentials) {
        opts.credentials = credentials;
    }
    const client = new clientS3.S3Client(opts);
    _setMap(g_s3Map, region, credentials, client);
    return client;
}
|
|
795
|
+
/**
 * Returns a cached S3TablesClient for the given region/credentials pair,
 * creating and caching a new one on miss. Mirrors getS3Client.
 *
 * @param {{ region?: string, credentials?: object }} params
 * @returns {object} S3TablesClient
 */
function getS3TablesClient(params) {
    const { region, credentials } = params;
    const cached = g_s3TablesMap.get(region)?.get(credentials);
    if (cached) {
        return cached;
    }
    const opts = {};
    if (region) {
        opts.region = region;
    }
    if (credentials) {
        opts.credentials = credentials;
    }
    const client = new clientS3tables.S3TablesClient(opts);
    _setMap(g_s3TablesMap, region, credentials, client);
    return client;
}
|
|
811
|
+
/**
 * Stores a client in the two-level region -> credentials cache.
 *
 * Bug fix: the original created a fresh inner Map on a region miss but never
 * inserted it into the outer map, so the cache silently dropped every entry
 * for a new region and getS3Client/getS3TablesClient re-created clients on
 * every call.
 *
 * @param {Map<string|undefined, Map<object|undefined, object>>} map
 * @param {string|undefined} region
 * @param {object|undefined} credentials
 * @param {object} client
 */
function _setMap(map, region, credentials, client) {
    let region_map = map.get(region);
    if (!region_map) {
        region_map = new Map();
        map.set(region, region_map); // was missing: persist the new inner map
    }
    region_map.set(credentials, client);
}
|
|
816
|
+
/**
 * Uploads a single object to S3 via PutObject using the cached client.
 *
 * @param {{ credentials?: object, region?: string, bucket: string, key: string, body: Buffer|string }} params
 * @returns {Promise<void>}
 */
async function writeS3File(params) {
    const { credentials, region, bucket, key, body } = params;
    const client = getS3Client({ region, credentials });
    await client.send(new clientS3.PutObjectCommand({
        Bucket: bucket,
        Key: key,
        Body: body,
    }));
}
|
|
826
|
+
/**
 * Rewrites an Iceberg manifest list: streams the existing list at
 * s3://bucket/key through an Avro decode→re-encode pipeline, writes the
 * `prepend` records first, and uploads the result to s3://bucket/outKey via
 * a multipart Upload.
 *
 * @param {{ region?: string, credentials?: object, bucket: string, key: string,
 *           outKey: string, prepend: object[],
 *           metadata?: Record<string, string|Buffer> }} params
 * @returns {Promise<void>} resolves when both the decode and the upload finish
 */
async function updateManifestList(params) {
    const { region, credentials, bucket, key, outKey, prepend } = params;
    const metadata = params.metadata
        ? fixupMetadata(params.metadata)
        : params.metadata;
    const s3 = getS3Client({ region, credentials });
    const get = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
    const response = await s3.send(get);
    const source = response.Body;
    if (!source) {
        throw new Error('failed to get source manifest list');
    }
    const passthrough = new node_stream.PassThrough();
    // parseHook forces our ManifestListType (BigInt longs) instead of the
    // writer schema embedded in the source file.
    const decoder = new avsc__namespace.streams.BlockDecoder({
        parseHook: () => ManifestListType,
    });
    const encoder = new avsc__namespace.streams.BlockEncoder(ManifestListType, {
        codec: 'deflate',
        metadata,
    });
    encoder.pipe(passthrough);
    // New entries go first, before any re-encoded existing entries.
    for (const record of prepend) {
        encoder.write(record);
    }
    const upload = new libStorage.Upload({
        client: s3,
        params: { Bucket: bucket, Key: outKey, Body: passthrough },
    });
    const stream_promise = new Promise((resolve, reject) => {
        decoder.on('error', reject);
        // Copy every existing record through the encoder.
        decoder.on('data', (record) => {
            encoder.write(record);
        });
        decoder.on('end', () => {
            encoder.end();
        });
        decoder.on('finish', () => {
            resolve();
        });
        source.pipe(decoder);
    });
    // Wait for the copy to complete and the multipart upload to be committed.
    await Promise.all([stream_promise, upload.done()]);
}
|
|
869
|
+
|
|
870
|
+
/**
 * Builds and uploads one Iceberg manifest file for a set of newly added data
 * files, and returns the manifest_file record to be prepended to the
 * manifest list (see updateManifestList).
 *
 * @param {{ credentials?: object, region?: string, metadata: object,
 *           schemaId: number, specId: number, snapshotId: bigint|number,
 *           sequenceNumber: bigint|number, files: object[] }} params
 * @returns {Promise<object>} manifest_file record (ManifestListType-shaped)
 * @throws {Error} on missing schema/spec, bad location, or empty file list
 */
async function addManifest(params) {
    const { credentials, region, metadata } = params;
    // NOTE(review): takes the LAST path segment of metadata.location as the
    // bucket — correct for `s3://bucket` but not for locations with a key
    // suffix; confirm the expected location format.
    const bucket = metadata.location.split('/').slice(-1)[0];
    const schema = metadata.schemas.find((s) => s['schema-id'] === params.schemaId);
    const spec = metadata['partition-specs'].find((p) => p['spec-id'] === params.specId);
    if (!bucket) {
        throw new Error('bad manifest location');
    }
    if (!schema) {
        throw new Error('schema not found');
    }
    if (!spec) {
        throw new Error('partition spec not found');
    }
    if (!params.files[0]) {
        throw new Error('must have at least 1 file');
    }
    let added_rows_count = 0n;
    // One running summary (null/nan flags + byte bounds) per spec field.
    const partitions = spec.fields.map(() => ({
        contains_null: false,
        contains_nan: false,
        upper_bound: null,
        lower_bound: null,
    }));
    const records = params.files.map((file) => {
        added_rows_count += file.recordCount;
        // bounds[i] is null, NaNValue, or an encoded Buffer per spec field.
        const bounds = makeBounds(file.partitions, spec, schema);
        for (let i = 0; i < partitions.length; i++) {
            const part = partitions[i];
            const bound = bounds[i];
            if (!part) {
                throw new Error('impossible');
            }
            else if (bound === null) {
                part.contains_null = true;
            }
            else if (Buffer.isBuffer(bound)) {
                // Widen the running byte-wise min/max for this partition field.
                part.upper_bound = _maxBuffer(part.upper_bound ?? null, bound);
                part.lower_bound = _minBuffer(part.lower_bound ?? null, bound);
            }
            else {
                // Non-null, non-Buffer bound is the NaN sentinel.
                part.contains_nan = true;
            }
        }
        return {
            status: ManifestFileStatus.ADDED,
            snapshot_id: params.snapshotId,
            sequence_number: params.sequenceNumber,
            file_sequence_number: params.sequenceNumber,
            data_file: {
                content: DataFileContent.DATA,
                file_path: file.file,
                file_format: 'PARQUET',
                record_count: file.recordCount,
                file_size_in_bytes: file.fileSize,
                partition: file.partitions,
                column_sizes: _transformRecord(schema, file.columnSizes),
                value_counts: _transformRecord(schema, file.valueCounts),
                null_value_counts: _transformRecord(schema, file.nullValueCounts),
                nan_value_counts: _transformRecord(schema, file.nanValueCounts),
                lower_bounds: _transformRecord(schema, file.lowerBounds),
                upper_bounds: _transformRecord(schema, file.upperBounds),
                key_metadata: file.keyMetadata ?? null,
                split_offsets: file.splitOffsets ?? null,
                equality_ids: file.equalityIds ?? null,
                sort_order_id: file.sortOrderId ?? null,
            },
        };
    });
    const manifest_type = makeManifestType(spec, schema);
    const manifest_buf = await avroToBuffer({
        type: manifest_type,
        metadata: {
            'partition-spec-id': String(params.specId),
            'partition-spec': JSON.stringify(spec.fields),
        },
        records,
    });
    // Random object key under metadata/ for the new manifest file.
    const manifest_key = `metadata/${node_crypto.randomUUID()}.avro`;
    await writeS3File({
        credentials,
        region,
        bucket,
        key: manifest_key,
        body: manifest_buf,
    });
    const manifest_record = {
        manifest_path: `s3://${bucket}/${manifest_key}`,
        manifest_length: BigInt(manifest_buf.length),
        partition_spec_id: params.specId,
        content: ListContent.DATA,
        sequence_number: params.sequenceNumber,
        min_sequence_number: params.sequenceNumber,
        added_snapshot_id: params.snapshotId,
        added_data_files_count: params.files.length,
        existing_data_files_count: 0,
        deleted_data_files_count: 0,
        added_rows_count,
        existing_rows_count: 0n,
        deleted_rows_count: 0n,
        partitions,
    };
    return manifest_record;
}
|
|
974
|
+
/**
 * Project a name-keyed map onto the schema's field ids as an Avro-style
 * key/value entry list.
 *
 * @param {{fields: Array<{id: number, name: string}>}} schema - Iceberg schema
 *   whose fields define both the allowed keys and their numeric ids.
 * @param {Object<string, *>|null|undefined} map - Per-column values keyed by
 *   field name (e.g. column sizes or value counts).
 * @returns {Array<{key: number, value: *}>|null} Entries for every field
 *   present in `map`, or null when `map` is missing or contributes nothing.
 */
function _transformRecord(schema, map) {
    if (!map) {
        return null;
    }
    // Only fields actually present in the map are emitted; `undefined`
    // means "not provided", while null/0 are legitimate values.
    const entries = schema.fields
        .filter((field) => map[field.name] !== undefined)
        .map((field) => ({ key: field.id, value: map[field.name] }));
    return entries.length > 0 ? entries : null;
}
|
|
987
|
+
/**
 * Return the lexicographically smaller of two buffers, tolerating nulls.
 *
 * @param {Buffer|null} a - First candidate (may be null/undefined).
 * @param {Buffer|null} b - Second candidate (may be null/undefined).
 * @returns {Buffer|null} The smaller buffer; the non-null one when only one
 *   is given; null when both are absent. Ties return `a`.
 */
function _minBuffer(a, b) {
    if (!a) {
        // Covers both the "both missing" (null) and "only b" cases.
        return b ?? null;
    }
    if (!b) {
        return a;
    }
    return Buffer.compare(a, b) <= 0 ? a : b;
}
|
|
999
|
+
/**
 * Return the lexicographically larger of two buffers, tolerating nulls.
 *
 * @param {Buffer|null} a - First candidate (may be null/undefined).
 * @param {Buffer|null} b - Second candidate (may be null/undefined).
 * @returns {Buffer|null} The larger buffer; the non-null one when only one
 *   is given; null when both are absent. Ties return `a`.
 */
function _maxBuffer(a, b) {
    if (!a) {
        // Covers both the "both missing" (null) and "only b" cases.
        return b ?? null;
    }
    if (!b) {
        return a;
    }
    return Buffer.compare(a, b) >= 0 ? a : b;
}
|
|
1011
|
+
|
|
1012
|
+
/**
 * Number reviver for lossless-json: keep safe integers as JS numbers,
 * promote unsafe (large) integers to BigInt, and parse everything else
 * as a float.
 *
 * @param {string} value - Raw numeric token from the JSON text.
 * @returns {number|bigint} Lossless numeric representation.
 */
function customNumberParser(value) {
    if (!LosslessJson__namespace.isInteger(value)) {
        return parseFloat(value);
    }
    // Integers beyond Number.MAX_SAFE_INTEGER (e.g. snapshot ids) must
    // survive as BigInt or they would silently lose precision.
    return LosslessJson__namespace.isSafeNumber(value)
        ? parseInt(value, 10)
        : BigInt(value);
}
|
|
1021
|
+
/**
 * Parse JSON text without losing 64-bit integer precision.
 *
 * Thin wrapper over lossless-json's parse with `customNumberParser`, so
 * large integers (snapshot ids, sequence numbers) come back as BigInt.
 *
 * @param {string} text - JSON document.
 * @returns {*} Parsed value with number/BigInt numerics.
 */
function parse(text) {
    return LosslessJson__namespace.parse(text, null, customNumberParser);
}
|
|
1024
|
+
|
|
12
1025
|
async function icebergRequest(params) {
|
|
13
1026
|
const region = params.tableBucketARN.split(':')[3];
|
|
14
1027
|
if (!region) {
|
|
@@ -17,7 +1030,7 @@ async function icebergRequest(params) {
|
|
|
17
1030
|
const arn = encodeURIComponent(params.tableBucketARN);
|
|
18
1031
|
const hostname = `s3tables.${region}.amazonaws.com`;
|
|
19
1032
|
const full_path = `/iceberg/v1/${arn}${params.suffix}`;
|
|
20
|
-
const body = params.body ?
|
|
1033
|
+
const body = params.body ? LosslessJson.stringify(params.body) : null;
|
|
21
1034
|
const req_opts = {
|
|
22
1035
|
method: params.method ?? 'GET',
|
|
23
1036
|
protocol: 'https:',
|
|
@@ -47,29 +1060,47 @@ async function icebergRequest(params) {
|
|
|
47
1060
|
fetch_opts.body = signed.body;
|
|
48
1061
|
}
|
|
49
1062
|
const res = await fetch(url, fetch_opts);
|
|
1063
|
+
const text = await res.text();
|
|
50
1064
|
if (!res.ok) {
|
|
51
|
-
throw new Error(`request failed: ${res.status} ${res.statusText}`);
|
|
1065
|
+
throw new Error(`request failed: ${res.status} ${res.statusText} ${text}`);
|
|
1066
|
+
}
|
|
1067
|
+
try {
|
|
1068
|
+
return parse(text);
|
|
1069
|
+
}
|
|
1070
|
+
catch {
|
|
1071
|
+
return text;
|
|
52
1072
|
}
|
|
53
|
-
return (await res.json());
|
|
54
1073
|
}
|
|
55
1074
|
|
|
56
1075
|
/**
 * Fetch a table's Iceberg metadata document.
 *
 * Two lookup paths:
 *  - `tableBucketARN` present: query the s3tables Iceberg REST catalog and
 *    return the `metadata` field of the response.
 *  - otherwise: resolve the metadata file location via the s3tables
 *    control-plane GetTable API, then download and parse the JSON from S3.
 *
 * @param {object} params - Either `{credentials?, tableBucketARN, namespace,
 *   name}` or the GetTableCommand-shaped params used by the SDK path.
 * @returns {Promise<object>} Parsed table metadata (BigInt-preserving).
 * @throws {Error} When the REST response lacks `metadata`, the SDK response
 *   lacks `metadataLocation`, or the S3 object has no body.
 */
async function getMetadata(params) {
    // REST-catalog path.
    if ('tableBucketARN' in params) {
        const response = await icebergRequest({
            credentials: params.credentials,
            tableBucketARN: params.tableBucketARN,
            method: 'GET',
            suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
        });
        if (!response.metadata) {
            throw new Error('invalid table metadata');
        }
        return response.metadata;
    }
    // SDK path: shallow-copy params for the command, then read the
    // metadata file the table points at.
    const other = { ...params };
    const tablesClient = getS3TablesClient(params);
    const table = await tablesClient.send(new clientS3tables.GetTableCommand(other));
    if (!table.metadataLocation) {
        throw new Error('missing metadataLocation');
    }
    const s3Client = getS3Client(params);
    const { key, bucket } = parseS3Url(table.metadataLocation);
    const object = await s3Client.send(new clientS3.GetObjectCommand({ Bucket: bucket, Key: key }));
    const body = await object.Body?.transformToString();
    if (!body) {
        throw new Error('missing body');
    }
    // Use the lossless parser so snapshot ids keep full precision.
    return parse(body);
}
|
|
74
1105
|
async function addSchema(params) {
|
|
75
1106
|
return icebergRequest({
|
|
@@ -113,18 +1144,173 @@ async function addPartitionSpec(params) {
|
|
|
113
1144
|
},
|
|
114
1145
|
});
|
|
115
1146
|
}
|
|
116
|
-
|
|
117
|
-
function
|
|
118
|
-
const
|
|
119
|
-
|
|
120
|
-
|
|
1147
|
+
|
|
1148
|
+
/**
 * Append data files to an s3tables Iceberg table as a new snapshot.
 *
 * Steps: read current metadata, write one manifest per input list, produce a
 * manifest list (either by rewriting the parent snapshot's list with the new
 * manifests prepended, or a fresh list for the first snapshot), then commit
 * an `add-snapshot` + `set-snapshot-ref` update through the Iceberg REST
 * endpoint, guarded by an `assert-ref-snapshot-id` requirement when a parent
 * snapshot exists.
 *
 * @param {object} params - {credentials?, tableBucketARN, namespace, name,
 *   lists: Array<{schemaId, specId, files}>}.
 * @returns {Promise<*>} The commit response from the Iceberg REST endpoint.
 * @throws {Error} On a malformed ARN/location, a missing parent snapshot, or
 *   a failed commit request.
 */
async function addDataFiles(params) {
    const { credentials } = params;
    // Region is the 4th ARN component: arn:aws:s3tables:REGION:...
    const region = params.tableBucketARN.split(':')[3];
    if (!region) {
        throw new Error('bad tableBucketARN');
    }
    const snapshot_id = _randomBigInt64();
    const metadata = await getMetadata(params);
    const parent_snapshot_id = metadata['current-snapshot-id'];
    // Warehouse bucket is the last path segment of the table location.
    const bucket = metadata.location.split('/').slice(-1)[0];
    // -1 means "no current snapshot" (empty table).
    const snapshot = parent_snapshot_id === -1
        ? null
        : metadata.snapshots.find((s) => s['snapshot-id'] === parent_snapshot_id);
    if (!bucket) {
        throw new Error('bad manifest location');
    }
    if (parent_snapshot_id !== -1 && !snapshot) {
        throw new Error('no old snapshot');
    }
    // Next sequence number = max over existing snapshots + 1.
    const sequence_number = BigInt(metadata.snapshots.reduce((memo, s) => s['sequence-number'] > memo ? s['sequence-number'] : memo, 0)) + 1n;
    let added_files = 0;
    let added_records = 0n;
    let added_size = 0n;
    // One manifest per list, written in parallel; the counter updates run in
    // the synchronous prefix of each callback, so they complete before any
    // await and feed the commit summary below.
    const records = await Promise.all(params.lists.map(async (list) => {
        added_files += list.files.length;
        for (const file of list.files) {
            added_records += file.recordCount;
            added_size += file.fileSize;
        }
        const opts = {
            credentials,
            region,
            metadata,
            schemaId: list.schemaId,
            specId: list.specId,
            snapshotId: snapshot_id,
            sequenceNumber: sequence_number,
            files: list.files,
        };
        return addManifest(opts);
    }));
    const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
    const manifest_list_url = `s3://${bucket}/${manifest_list_key}`;
    if (snapshot) {
        // Existing snapshot: rewrite its manifest list under a new key with
        // the freshly-written manifests prepended.
        const { key: old_list_key } = parseS3Url(snapshot['manifest-list']);
        if (!old_list_key) {
            throw new Error('snapshot invalid');
        }
        await updateManifestList({
            credentials,
            region,
            bucket,
            key: old_list_key,
            outKey: manifest_list_key,
            metadata: {
                'sequence-number': String(sequence_number),
                'snapshot-id': String(snapshot_id),
                'parent-snapshot-id': String(parent_snapshot_id),
            },
            prepend: records,
        });
    }
    else {
        // First snapshot: build a brand-new manifest list from scratch.
        const manifest_list_buf = await avroToBuffer({
            type: ManifestListType,
            metadata: {
                'sequence-number': String(sequence_number),
                'snapshot-id': String(snapshot_id),
                'parent-snapshot-id': String(parent_snapshot_id),
            },
            records,
        });
        await writeS3File({
            credentials,
            region,
            bucket,
            key: manifest_list_key,
            body: manifest_list_buf,
        });
    }
    // Commit: add the snapshot and move the 'main' branch ref to it. The
    // assert-ref-snapshot-id requirement makes the commit fail if another
    // writer advanced 'main' in the meantime (optimistic concurrency).
    const commit_result = await icebergRequest({
        credentials: params.credentials,
        tableBucketARN: params.tableBucketARN,
        method: 'POST',
        suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
        body: {
            requirements: parent_snapshot_id === -1
                ? []
                : [
                    {
                        type: 'assert-ref-snapshot-id',
                        ref: 'main',
                        'snapshot-id': parent_snapshot_id,
                    },
                ],
            updates: [
                {
                    action: 'add-snapshot',
                    snapshot: {
                        'sequence-number': sequence_number,
                        'snapshot-id': snapshot_id,
                        'parent-snapshot-id': parent_snapshot_id,
                        'timestamp-ms': Date.now(),
                        summary: {
                            operation: 'append',
                            'added-data-files': String(added_files),
                            'added-records': String(added_records),
                            'added-files-size': String(added_size),
                        },
                        'manifest-list': manifest_list_url,
                        'schema-id': metadata['current-schema-id'],
                    },
                },
                {
                    action: 'set-snapshot-ref',
                    'snapshot-id': snapshot_id,
                    type: 'branch',
                    'ref-name': 'main',
                },
            ],
        },
    });
    return commit_result;
}
|
|
1272
|
+
/**
 * Point the table's 'main' branch at an existing snapshot.
 *
 * Issues a single `set-snapshot-ref` update through the s3tables Iceberg
 * REST commit endpoint (no requirements, so this is an unconditional move).
 *
 * @param {object} params - {credentials?, tableBucketARN, namespace, name,
 *   snapshotId}.
 * @returns {Promise<*>} The commit response from the Iceberg REST endpoint.
 */
async function setCurrentCommit(params) {
    const refUpdate = {
        action: 'set-snapshot-ref',
        'snapshot-id': params.snapshotId,
        type: 'branch',
        'ref-name': 'main',
    };
    return icebergRequest({
        credentials: params.credentials,
        tableBucketARN: params.tableBucketARN,
        method: 'POST',
        suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
        body: { updates: [refUpdate] },
    });
}
|
|
1291
|
+
/**
 * Generate a random positive 63-bit BigInt suitable for use as an Iceberg
 * snapshot id.
 *
 * Draws 8 cryptographically-random bytes, clears the sign bit so the value
 * fits a signed 64-bit slot, and maps 0 to 1 so the result is never zero.
 *
 * @returns {bigint} A value in [1, 2^63 - 1].
 */
function _randomBigInt64() {
    const masked = node_crypto.randomBytes(8).readBigUInt64BE() & 0x7FFFFFFFFFFFFFFFn;
    return masked === 0n ? 1n : masked;
}
|
|
124
1300
|
|
|
125
|
-
// Aggregate object used as the module's default export; mirrors the named
// exports below so both `import s3tables from ...` and named imports work.
var index = {
    getMetadata,
    addSchema,
    addPartitionSpec,
    addManifest,
    addDataFiles,
    setCurrentCommit,
};

// CommonJS named exports (generated bundler output style).
exports.addDataFiles = addDataFiles;
exports.addManifest = addManifest;
exports.addPartitionSpec = addPartitionSpec;
exports.addSchema = addSchema;
exports.default = index;
exports.getMetadata = getMetadata;
exports.setCurrentCommit = setCurrentCommit;
|