node-s3tables 0.0.16 → 0.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin.js +25 -0
- package/dist/index.d.ts +50 -3
- package/dist/index.js +587 -19
- package/package.json +5 -2
package/dist/bin.js
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#!/usr/bin/env node
'use strict';

var nodeS3tables = require('node-s3tables');

/* eslint-disable no-console */
/*
 * Command-line entry point: runs manifestCompact() for one table and prints
 * the result.
 *
 *   node-s3tables compact <tableBucketARN> <namespace> <name>
 *   node-s3tables <tableBucketARN> <namespace> <name>
 *
 * Exit status: 0 on success, 1 when manifestCompact() rejects, and the
 * status produced by process.exit(-1) when arguments are missing.
 */
const cliArgs = process.argv.slice(2);
// The usage text advertises a `compact` subcommand, so accept and drop it.
// Without this, `compact` would be taken as the table bucket ARN. The bare
// three-argument form keeps working unchanged.
if (cliArgs[0] === 'compact') {
    cliArgs.shift();
}
const [tableBucketARN, namespace, name] = cliArgs;
if (!tableBucketARN || !namespace || !name) {
    console.error('Usage: node-s3tables compact <tableBucketARN> <namespace> <name>');
    process.exit(-1);
}
nodeS3tables.manifestCompact({ tableBucketARN, namespace, name })
    .then((result) => {
        console.log('Compact result:', result);
        process.exit(0);
    })
    .catch((error) => {
        // Print only the message for real Error objects; anything else is
        // printed as-is.
        if (error instanceof Error) {
            console.error('Error:', error.message);
        }
        else {
            console.error('Error:', error);
        }
        process.exit(1);
    });
|
package/dist/index.d.ts
CHANGED
|
@@ -130,7 +130,7 @@ type TableLocation = {
|
|
|
130
130
|
};
|
|
131
131
|
type GetMetadataParams = TableLocation & {
|
|
132
132
|
region?: string;
|
|
133
|
-
credentials?: AwsCredentialIdentity;
|
|
133
|
+
credentials?: AwsCredentialIdentity | undefined;
|
|
134
134
|
};
|
|
135
135
|
declare function getMetadata(params: GetMetadataParams): Promise<IcebergMetadata>;
|
|
136
136
|
interface AddSchemaParams {
|
|
@@ -194,6 +194,34 @@ interface AddDataFilesResult {
|
|
|
194
194
|
sequenceNumber: bigint;
|
|
195
195
|
}
|
|
196
196
|
declare function addDataFiles(params: AddDataFilesParams): Promise<AddDataFilesResult>;
|
|
197
|
+
|
|
198
|
+
/** Input for {@link submitSnapshot}. */
interface SubmitSnapshotParams {
    /** Credentials forwarded to the Iceberg REST request. */
    credentials?: AwsCredentialIdentity | undefined;
    /** ARN of the table bucket that holds the table. */
    tableBucketARN: string;
    /** Namespace of the table; used in the request path. */
    namespace: string;
    /** Table name; used in the request path. */
    name: string;
    /** Sent as the new snapshot's `schema-id`. */
    currentSchemaId: number;
    /**
     * Sent as the new snapshot's `parent-snapshot-id`. When greater than 0 it
     * is also the snapshot the `main` ref is initially required to point at.
     */
    parentSnapshotId: bigint;
    /** Id of the snapshot being added; `main` is pointed at it. */
    snapshotId: bigint;
    /**
     * Sequence number for the first attempt. It is incremented each time a
     * conflict with a concurrent append is retried.
     */
    sequenceNumber: bigint;
    /** Maximum number of retries after an HTTP 409; defaults to 5. */
    retryCount?: number | undefined;
    /**
     * When greater than 0, this snapshot is removed in the same commit on the
     * first attempt. It is not removed on retries.
     */
    removeSnapshotId?: bigint | undefined;
    /** Sent as the new snapshot's `manifest-list`. */
    manifestListUrl: string;
    /** Sent as the new snapshot's `summary`. */
    summary: Record<string, string>;
    /**
     * Called before each retry with the snapshot that now heads the table.
     * Its result replaces `manifestListUrl` and `summary` for the retry. When
     * omitted, a conflict makes {@link submitSnapshot} throw `conflict`.
     */
    resolveConflict?: ((conflictSnapshot: IcebergSnapshot) => Promise<ResolveConflictResult>) | undefined;
}
/** Replacement values returned by `SubmitSnapshotParams.resolveConflict`. */
interface ResolveConflictResult {
    /** Manifest list to commit on the retry. */
    manifestListUrl: string;
    /** Snapshot summary to commit on the retry. */
    summary: Record<string, string>;
}
/** Outcome of a successful {@link submitSnapshot} call. */
interface SubmitSnapshotResult {
    /** Body returned by the commit request. */
    result: JSONObject;
    /** Number of attempts that failed with a conflict before the commit succeeded. */
    retriesNeeded: number;
    /** The `parentSnapshotId` that was passed in. */
    parentSnapshotId: bigint;
    /** The `snapshotId` that was passed in. */
    snapshotId: bigint;
    /** Sequence number actually committed (may be higher than the one passed in). */
    sequenceNumber: bigint;
}
/**
 * Commits a new snapshot and points the `main` branch at it, retrying on
 * HTTP 409 when the conflicting snapshot is an append.
 */
declare function submitSnapshot(params: SubmitSnapshotParams): Promise<SubmitSnapshotResult>;
|
|
197
225
|
interface SetCurrentCommitParams {
|
|
198
226
|
credentials?: AwsCredentialIdentity;
|
|
199
227
|
tableBucketARN: string;
|
|
@@ -210,6 +238,25 @@ declare class IcebergHttpError extends Error {
|
|
|
210
238
|
constructor(status: number, body: JSONValue, message: string);
|
|
211
239
|
}
|
|
212
240
|
|
|
241
|
+
/**
 * Assigns a weight to a group of manifest-list records. Groups are sorted by
 * ascending weight before being combined towards `targetCount`.
 */
type CalculateWeightFunction = (group: ManifestListRecord[]) => number;
/** Input for {@link manifestCompact}. */
interface ManifestCompactParams {
    /** Credentials used for the metadata, S3 and commit requests. */
    credentials?: AwsCredentialIdentity | undefined;
    /** ARN of the table bucket; its fourth `:`-separated field is used as the region. */
    tableBucketARN: string;
    /** Namespace of the table. */
    namespace: string;
    /** Table name. */
    name: string;
    /** Id for the new snapshot; a random positive 63-bit id is generated when omitted. */
    snapshotId?: bigint | undefined;
    /**
     * Desired maximum number of output manifests. Only applied when
     * `calculateWeight` is also given and there are more groups than this.
     */
    targetCount?: number | undefined;
    /** Weight function used together with `targetCount`. */
    calculateWeight?: CalculateWeightFunction | undefined;
    /** Rewrite manifests even when grouping would leave the list unchanged. */
    forceRewrite?: boolean | undefined;
    /** Forwarded to {@link submitSnapshot} as its retry limit. */
    retryCount?: number | undefined;
    /**
     * When the table already has at least this many snapshots, the one with
     * the smallest `timestamp-ms` is removed in the same commit.
     */
    maxSnapshots?: number | undefined;
}
/** Outcome of {@link manifestCompact}. */
interface ManifestCompactResult extends SubmitSnapshotResult {
    /** `false` when nothing was committed. */
    changed: boolean;
    /** Number of manifests in the new manifest list; 0 when `changed` is `false`. */
    outputManifestCount: number;
}
/**
 * Groups the manifests of the current snapshot, rewrites each multi-manifest
 * group as one manifest and commits the result as a `replace` snapshot.
 */
declare function manifestCompact(params: ManifestCompactParams): Promise<ManifestCompactResult>;
|
|
259
|
+
|
|
213
260
|
declare const _default: {
|
|
214
261
|
IcebergHttpError: typeof IcebergHttpError;
|
|
215
262
|
getMetadata: typeof getMetadata;
|
|
@@ -221,5 +268,5 @@ declare const _default: {
|
|
|
221
268
|
removeSnapshots: typeof removeSnapshots;
|
|
222
269
|
};
|
|
223
270
|
|
|
224
|
-
export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, removeSnapshots, setCurrentCommit };
|
|
225
|
-
export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, RemoveSnapshotsParams, SetCurrentCommitParams, TableLocation };
|
|
271
|
+
export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, manifestCompact, removeSnapshots, setCurrentCommit, submitSnapshot };
|
|
272
|
+
export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, CalculateWeightFunction, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, ManifestCompactParams, ManifestCompactResult, ManifestListRecord, RemoveSnapshotsParams, ResolveConflictResult, SetCurrentCommitParams, SubmitSnapshotParams, SubmitSnapshotResult, TableLocation };
|
package/dist/index.js
CHANGED
|
@@ -84,13 +84,21 @@ async function avroToBuffer(params) {
|
|
|
84
84
|
}
|
|
85
85
|
});
|
|
86
86
|
}
|
|
87
|
-
function icebergToAvroFields(spec,
|
|
88
|
-
return spec.fields.map((p) => _icebergToAvroField(p,
|
|
87
|
+
function icebergToAvroFields(spec, schemas, skipPartitionLogicalType) {
|
|
88
|
+
return spec.fields.map((p) => _icebergToAvroField(p, schemas, skipPartitionLogicalType));
|
|
89
89
|
}
|
|
90
|
-
function _icebergToAvroField(field,
|
|
91
|
-
|
|
90
|
+
function _icebergToAvroField(field, schemas, skipPartitionLogicalType) {
|
|
91
|
+
let source;
|
|
92
|
+
for (const schema of schemas) {
|
|
93
|
+
for (const f of schema.fields) {
|
|
94
|
+
if (f.id === field['source-id']) {
|
|
95
|
+
source = f;
|
|
96
|
+
break;
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
}
|
|
92
100
|
if (!source) {
|
|
93
|
-
throw new Error(`Source field ${field['source-id']} not found in
|
|
101
|
+
throw new Error(`Source field ${field['source-id']} not found in schemas`);
|
|
94
102
|
}
|
|
95
103
|
let avroType;
|
|
96
104
|
switch (field.transform) {
|
|
@@ -123,6 +131,9 @@ function _icebergToAvroField(field, schema) {
|
|
|
123
131
|
}
|
|
124
132
|
throw new Error(`Unsupported transform: ${field.transform} for type`);
|
|
125
133
|
}
|
|
134
|
+
if (typeof avroType === 'object' && skipPartitionLogicalType) {
|
|
135
|
+
avroType = avroType.type;
|
|
136
|
+
}
|
|
126
137
|
return {
|
|
127
138
|
name: field.name,
|
|
128
139
|
type: ['null', avroType],
|
|
@@ -260,9 +271,9 @@ const AvroLogicalTypes = {
|
|
|
260
271
|
hour: HourStringType,
|
|
261
272
|
};
|
|
262
273
|
|
|
263
|
-
function
|
|
264
|
-
const part_fields = icebergToAvroFields(spec,
|
|
265
|
-
return
|
|
274
|
+
function makeManifestSchema(spec, schemas, skipPartitionLogicalType) {
|
|
275
|
+
const part_fields = icebergToAvroFields(spec, schemas, skipPartitionLogicalType);
|
|
276
|
+
return {
|
|
266
277
|
type: 'record',
|
|
267
278
|
name: 'manifest_entry',
|
|
268
279
|
fields: [
|
|
@@ -492,9 +503,16 @@ function makeManifestType(spec, schema) {
|
|
|
492
503
|
'field-id': 2,
|
|
493
504
|
},
|
|
494
505
|
],
|
|
495
|
-
}
|
|
506
|
+
};
|
|
507
|
+
}
|
|
508
|
+
function makeManifestType(spec, schemas, skipPartitionLogicalType) {
|
|
509
|
+
const schema = makeManifestSchema(spec, schemas, skipPartitionLogicalType);
|
|
510
|
+
return avsc__namespace.Type.forSchema(schema, {
|
|
511
|
+
registry: { ...AvroRegistry },
|
|
512
|
+
logicalTypes: AvroLogicalTypes,
|
|
513
|
+
});
|
|
496
514
|
}
|
|
497
|
-
const
|
|
515
|
+
const ManifestListSchema = {
|
|
498
516
|
type: 'record',
|
|
499
517
|
name: 'manifest_file',
|
|
500
518
|
fields: [
|
|
@@ -630,7 +648,10 @@ const ManifestListType = avsc__namespace.Type.forSchema({
|
|
|
630
648
|
'field-id': 519,
|
|
631
649
|
},
|
|
632
650
|
],
|
|
633
|
-
}
|
|
651
|
+
};
|
|
652
|
+
// Compiled avsc type for manifest-list records, built from ManifestListSchema
// with its own copy of the shared registry.
const ManifestListType = avsc__namespace.Type.forSchema(ManifestListSchema, { registry: { ...AvroRegistry } });
|
|
634
655
|
|
|
635
656
|
function _isPrimitive(t) {
|
|
636
657
|
return typeof t === 'string';
|
|
@@ -1033,10 +1054,14 @@ async function updateManifestList(params) {
|
|
|
1033
1054
|
reject(err);
|
|
1034
1055
|
});
|
|
1035
1056
|
decoder.on('data', (record) => {
|
|
1036
|
-
const translated = translateRecord(sourceSchema,
|
|
1037
|
-
if (
|
|
1038
|
-
|
|
1039
|
-
|
|
1057
|
+
const translated = translateRecord(sourceSchema, ManifestListSchema, record);
|
|
1058
|
+
if (translated.content !== ListContent.DATA ||
|
|
1059
|
+
translated.added_files_count > 0 ||
|
|
1060
|
+
translated.existing_files_count > 0) {
|
|
1061
|
+
if (!encoder.write(translated)) {
|
|
1062
|
+
decoder.pause();
|
|
1063
|
+
encoder.once('drain', () => decoder.resume());
|
|
1064
|
+
}
|
|
1040
1065
|
}
|
|
1041
1066
|
});
|
|
1042
1067
|
decoder.on('end', () => {
|
|
@@ -1049,6 +1074,89 @@ async function updateManifestList(params) {
|
|
|
1049
1074
|
});
|
|
1050
1075
|
await Promise.all([stream_promise, upload.done()]);
|
|
1051
1076
|
}
|
|
1077
|
+
/**
 * Encodes batches of records as a deflate-compressed Avro container file and
 * uploads it to S3 while it is being produced.
 *
 * @param params.region / params.credentials used to obtain the S3 client
 * @param params.bucket / params.key destination object
 * @param params.metadata optional Avro file metadata; passed through
 *   fixupMetadata() when present
 * @param params.avroType avsc type used to encode each record
 * @param params.iter async iterable yielding arrays of records
 * @returns the last `loaded` byte count reported by the upload (0 if none was reported)
 * @throws the first encoder or upload error, or whatever `params.iter` throws
 */
async function streamWriteAvro(params) {
    const { region, credentials, bucket, key } = params;
    const metadata = params.metadata
        ? fixupMetadata(params.metadata)
        : params.metadata;
    const s3 = getS3Client({ region, credentials });
    const encoder = new avsc__namespace.streams.BlockEncoder(params.avroType, {
        codec: 'deflate',
        codecs: { deflate: zlib__namespace.deflateRaw },
        metadata,
    });
    const upload = new libStorage.Upload({
        client: s3,
        params: { Bucket: bucket, Key: key, Body: encoder },
    });
    let file_size = 0;
    upload.on('httpUploadProgress', (progress) => {
        if (progress.loaded) {
            file_size = progress.loaded;
        }
    });
    let found_err;
    const note_error = (err) => {
        found_err ??= err;
    };
    // Never rejects: an upload failure is recorded in found_err instead, so
    // this promise can be raced against 'drain' without unhandled rejections.
    const upload_settled = upload.done().then(() => undefined, note_error);
    encoder.on('error', note_error);
    // Resolves when the encoder can accept more data or has failed.
    const encoder_writable = () => new Promise((resolve) => {
        const finish = () => {
            encoder.off('drain', finish);
            encoder.off('error', finish);
            resolve();
        };
        encoder.on('drain', finish);
        encoder.on('error', finish);
    });
    try {
        for await (const batch of params.iter) {
            for (const record of batch) {
                if (found_err) {
                    throw found_err;
                }
                if (!encoder.write(record)) {
                    // Respect backpressure so records are not buffered without
                    // bound. Also wake up if the upload ends first, because a
                    // failed upload stops draining the encoder.
                    await Promise.race([encoder_writable(), upload_settled]);
                }
            }
        }
        if (found_err) {
            throw found_err;
        }
        encoder.end();
        await upload_settled;
    }
    catch (err) {
        // Tear the stream down so the pending upload fails promptly instead of
        // waiting forever for an end that will never come.
        encoder.destroy(err);
        throw err;
    }
    if (found_err) {
        throw found_err;
    }
    return file_size;
}
|
|
1121
|
+
/**
 * Downloads an Avro container file from S3 and returns all of its records,
 * each translated from the file's own writer schema to `avroSchema`.
 *
 * @param params.region / params.credentials used to obtain the S3 client
 * @param params.bucket / params.key object to read
 * @param params.avroSchema schema the returned records are translated to
 * @returns every record of the file, in file order
 * @throws when the object has no body, when the download or decoding fails,
 *   or when a record cannot be translated
 */
async function downloadAvro(params) {
    const { region, credentials, bucket, key, avroSchema } = params;
    const s3 = getS3Client({ region, credentials });
    const get = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
    const response = await s3.send(get);
    const source = response.Body;
    if (!source) {
        throw new Error('failed to get source manifest list');
    }
    let sourceSchema;
    const decoder = new avsc__namespace.streams.BlockDecoder({
        codecs: { deflate: zlib__namespace.inflateRaw },
        parseHook(schema) {
            // Remember the writer schema so records can be translated below.
            sourceSchema = schema;
            return avsc__namespace.Type.forSchema(schema, {
                registry: { ...AvroRegistry },
            });
        },
    });
    const records = [];
    const stream_promise = new Promise((resolve, reject) => {
        const fail = (err) => {
            reject(err);
            // Stop feeding the decoder and release the response stream.
            source.unpipe(decoder);
            source.destroy();
        };
        source.on('error', fail);
        decoder.on('error', fail);
        decoder.on('data', (record) => {
            // A throw inside an event listener would escape the promise, so
            // convert translation failures into a rejection.
            try {
                records.push(translateRecord(sourceSchema, avroSchema, record));
            }
            catch (err) {
                fail(err);
            }
        });
        decoder.on('end', () => {
            resolve();
        });
        source.pipe(decoder);
    });
    await stream_promise;
    return records;
}
|
|
1052
1160
|
|
|
1053
1161
|
async function addManifest(params) {
|
|
1054
1162
|
const { credentials, region, metadata } = params;
|
|
@@ -1119,7 +1227,7 @@ async function addManifest(params) {
|
|
|
1119
1227
|
},
|
|
1120
1228
|
};
|
|
1121
1229
|
});
|
|
1122
|
-
const manifest_type = makeManifestType(spec, schema);
|
|
1230
|
+
const manifest_type = makeManifestType(spec, [schema]);
|
|
1123
1231
|
const manifest_buf = await avroToBuffer({
|
|
1124
1232
|
type: manifest_type,
|
|
1125
1233
|
metadata: {
|
|
@@ -1365,15 +1473,15 @@ async function removeSnapshots(params) {
|
|
|
1365
1473
|
});
|
|
1366
1474
|
}
|
|
1367
1475
|
|
|
1368
|
-
const DEFAULT_RETRY_COUNT = 5;
|
|
1476
|
+
const DEFAULT_RETRY_COUNT$1 = 5;
|
|
1369
1477
|
async function addDataFiles(params) {
|
|
1370
1478
|
const { credentials } = params;
|
|
1371
|
-
const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT;
|
|
1479
|
+
const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT$1;
|
|
1372
1480
|
const region = params.tableBucketARN.split(':')[3];
|
|
1373
1481
|
if (!region) {
|
|
1374
1482
|
throw new Error('bad tableBucketARN');
|
|
1375
1483
|
}
|
|
1376
|
-
const snapshot_id = params.snapshotId ?? _randomBigInt64();
|
|
1484
|
+
const snapshot_id = params.snapshotId ?? _randomBigInt64$1();
|
|
1377
1485
|
const metadata = await getMetadata(params);
|
|
1378
1486
|
const bucket = metadata.location.split('/').slice(-1)[0];
|
|
1379
1487
|
const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
|
|
@@ -1554,6 +1662,116 @@ async function addDataFiles(params) {
|
|
|
1554
1662
|
}
|
|
1555
1663
|
}
|
|
1556
1664
|
}
|
|
1665
|
+
/**
 * Returns a random bigint in the range 1 .. 2^63 - 1.
 *
 * Eight random bytes are read as an unsigned big-endian integer, the top bit
 * is dropped, and a zero result is replaced by 1.
 */
function _randomBigInt64$1() {
    const raw = node_crypto.randomBytes(8).readBigUInt64BE();
    const positive = BigInt.asUintN(63, raw);
    return positive === 0n ? 1n : positive;
}
|
|
1674
|
+
|
|
1675
|
+
// Retry limit used when params.retryCount is not given.
const DEFAULT_RETRY_COUNT = 5;
/**
 * Commits a new snapshot to a table and points the `main` branch at it.
 *
 * On HTTP 409 the current table metadata is fetched. If the snapshot now
 * heading the table is an `append` whose sequence number equals the one just
 * attempted, the commit is retried against that snapshot with the sequence
 * number incremented. `params.resolveConflict` is called before each retry
 * to supply a new manifest list and summary.
 *
 * @param params see SubmitSnapshotParams in the type declarations
 * @returns the commit response plus the ids and sequence number that were used
 * @throws Error('conflict') when a conflict cannot be retried;
 *   any other request error is rethrown unchanged
 */
async function submitSnapshot(params) {
    const { snapshotId, parentSnapshotId, resolveConflict } = params;
    let { sequenceNumber, removeSnapshotId, manifestListUrl, summary } = params;
    const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT;
    let expected_snapshot_id = parentSnapshotId;
    let conflict_snap;
    for (let try_count = 0;; try_count++) {
        if (conflict_snap && resolveConflict) {
            const resolve_result = await resolveConflict(conflict_snap);
            summary = resolve_result.summary;
            manifestListUrl = resolve_result.manifestListUrl;
        }
        else if (conflict_snap) {
            // Nobody can rebuild the manifest list on top of the new head.
            throw new Error('conflict');
        }
        try {
            // NOTE(review): on a retry the snapshot still names the original
            // parentSnapshotId as its parent while the requirement below
            // asserts the newer head. Confirm this is intended.
            const updates = [
                {
                    action: 'add-snapshot',
                    snapshot: {
                        'sequence-number': sequenceNumber,
                        'snapshot-id': snapshotId,
                        'parent-snapshot-id': parentSnapshotId,
                        'timestamp-ms': Date.now(),
                        summary,
                        'manifest-list': manifestListUrl,
                        'schema-id': params.currentSchemaId,
                    },
                },
                {
                    action: 'set-snapshot-ref',
                    'snapshot-id': snapshotId,
                    type: 'branch',
                    'ref-name': 'main',
                },
            ];
            if (removeSnapshotId && removeSnapshotId > 0n) {
                updates.push({
                    action: 'remove-snapshots',
                    'snapshot-ids': [removeSnapshotId],
                });
            }
            const result = await icebergRequest({
                credentials: params.credentials,
                tableBucketARN: params.tableBucketARN,
                method: 'POST',
                suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
                body: {
                    // Only assert the head of `main` when a parent exists.
                    requirements: expected_snapshot_id > 0n
                        ? [
                            {
                                type: 'assert-ref-snapshot-id',
                                ref: 'main',
                                'snapshot-id': expected_snapshot_id,
                            },
                        ]
                        : [],
                    updates,
                },
            });
            return {
                result,
                retriesNeeded: try_count,
                parentSnapshotId,
                snapshotId,
                sequenceNumber,
            };
        }
        catch (e) {
            if (e instanceof IcebergHttpError &&
                e.status === 409 &&
                try_count < retry_max) {
                // retry case: the snapshot removal is not repeated on retries
                removeSnapshotId = 0n;
            }
            else {
                throw e;
            }
        }
        // we do a merge in the append-only simultaneous case
        const conflict_metadata = await getMetadata(params);
        const conflict_snapshot_id = BigInt(conflict_metadata['current-snapshot-id']);
        if (conflict_snapshot_id <= 0n) {
            throw new Error('conflict');
        }
        // Normalise to bigint before comparing: a strict comparison between a
        // numeric id and a bigint is never true, which would turn every
        // retry into a 'conflict' error.
        conflict_snap = conflict_metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === conflict_snapshot_id);
        if (!conflict_snap) {
            throw new Error('conflict');
        }
        if (conflict_snap.summary.operation === 'append' &&
            BigInt(conflict_snap['sequence-number']) === sequenceNumber) {
            expected_snapshot_id = conflict_snapshot_id;
            sequenceNumber++;
        }
        else {
            throw new Error('conflict');
        }
    }
}
|
|
1557
1775
|
async function setCurrentCommit(params) {
|
|
1558
1776
|
const commit_result = await icebergRequest({
|
|
1559
1777
|
credentials: params.credentials,
|
|
@@ -1574,6 +1792,345 @@ async function setCurrentCommit(params) {
|
|
|
1574
1792
|
});
|
|
1575
1793
|
return commit_result;
|
|
1576
1794
|
}
|
|
1795
|
+
|
|
1796
|
+
async function* asyncIterMap(items, func) {
|
|
1797
|
+
const pending = new Set();
|
|
1798
|
+
for (const item of items) {
|
|
1799
|
+
const ref = {};
|
|
1800
|
+
const wrapper = func(item).then((value) => ({
|
|
1801
|
+
self: ref.current,
|
|
1802
|
+
value,
|
|
1803
|
+
}));
|
|
1804
|
+
ref.current = wrapper;
|
|
1805
|
+
pending.add(wrapper);
|
|
1806
|
+
}
|
|
1807
|
+
while (pending.size) {
|
|
1808
|
+
const { self, value } = await Promise.race(pending);
|
|
1809
|
+
if (self) {
|
|
1810
|
+
pending.delete(self);
|
|
1811
|
+
}
|
|
1812
|
+
yield value;
|
|
1813
|
+
}
|
|
1814
|
+
}
|
|
1815
|
+
|
|
1816
|
+
/**
 * Compacts the manifest list of a table's current snapshot.
 *
 * Manifests are grouped by _canGroupManifests(), optionally combined further
 * towards `params.targetCount`, each group is rewritten as a single manifest,
 * and the new manifest list is committed as a `replace` snapshot.
 *
 * @param params see ManifestCompactParams in the type declarations
 * @returns the commit result with `changed: true`, or a `changed: false`
 *   result when there is no current snapshot or nothing would change
 * @throws on a malformed ARN or metadata location, an invalid current
 *   snapshot, a missing partition spec, or any S3 / commit failure
 */
async function manifestCompact(params) {
    const { credentials, targetCount, calculateWeight } = params;
    const region = params.tableBucketARN.split(':')[3];
    if (!region) {
        throw new Error('bad tableBucketARN');
    }
    const snapshot_id = params.snapshotId ?? _randomBigInt64();
    const metadata = await getMetadata(params);
    const bucket = metadata.location.split('/').slice(-1)[0];
    const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
    const snapshot = metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === parent_snapshot_id) ?? null;
    if (!bucket) {
        throw new Error('bad manifest location');
    }
    if (!snapshot) {
        return _noopCompactResult(parent_snapshot_id, 0n);
    }
    if (parent_snapshot_id <= 0n) {
        throw new Error('no old snapshot');
    }
    const old_list_key = parseS3Url(snapshot['manifest-list']).key;
    if (!old_list_key) {
        throw new Error('last snapshot invalid');
    }
    const sequence_number = BigInt(metadata['last-sequence-number']) + 1n;
    const remove_snapshot_id = params.maxSnapshots && metadata.snapshots.length >= params.maxSnapshots
        ? _oldestSnapshotId(metadata.snapshots)
        : 0n;
    const list = await downloadAvro({
        credentials,
        region,
        bucket,
        key: old_list_key,
        avroSchema: ManifestListSchema,
    });
    const filtered = list.filter(_filterDeletes);
    const groups = _groupList(filtered, _canGroupManifests);
    const final_groups = targetCount !== undefined &&
        calculateWeight !== undefined &&
        groups.length > targetCount
        ? _combineWeightGroups(groups, targetCount, calculateWeight)
        : groups;
    // Nothing to write, or one output manifest per input manifest.
    if (final_groups.length === 0 ||
        (final_groups.length === list.length && !params.forceRewrite)) {
        return _noopCompactResult(parent_snapshot_id, sequence_number);
    }
    const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
    // Each group yields the manifest-list record(s) that replace it.
    const iter = asyncIterMap(final_groups, async (group) => {
        if (!group[0]) {
            return [];
        }
        const { partition_spec_id } = group[0];
        const spec = metadata['partition-specs'].find((p) => p['spec-id'] === partition_spec_id);
        if (!spec) {
            throw new Error(`Partition spec not found: ${partition_spec_id}`);
        }
        return _combineGroup({
            credentials,
            region,
            bucket,
            group,
            spec,
            snapshotId: snapshot_id,
            schemas: metadata.schemas,
            sequenceNumber: sequence_number,
            forceRewrite: params.forceRewrite ?? false,
        });
    });
    await streamWriteAvro({
        credentials,
        region,
        bucket,
        key: manifest_list_key,
        metadata: {
            'sequence-number': String(sequence_number),
            'snapshot-id': String(snapshot_id),
            'parent-snapshot-id': String(parent_snapshot_id),
        },
        avroType: ManifestListType,
        iter,
    });
    const summary = {
        operation: 'replace',
        'added-data-files': '0',
        'deleted-data-files': '0',
        'added-records': '0',
        'deleted-records': '0',
        'added-files-size': '0',
        'removed-files-size': '0',
        'changed-partition-count': '0',
    };
    const snap_result = await submitSnapshot({
        credentials,
        tableBucketARN: params.tableBucketARN,
        namespace: params.namespace,
        name: params.name,
        currentSchemaId: metadata['current-schema-id'],
        parentSnapshotId: parent_snapshot_id,
        snapshotId: snapshot_id,
        sequenceNumber: sequence_number,
        manifestListUrl: `s3://${bucket}/${manifest_list_key}`,
        summary,
        removeSnapshotId: remove_snapshot_id,
        retryCount: params.retryCount,
    });
    return {
        ...snap_result,
        changed: true,
        outputManifestCount: final_groups.length,
    };
}
/** Builds the result returned when manifestCompact() commits nothing. */
function _noopCompactResult(parentSnapshotId, sequenceNumber) {
    return {
        result: {},
        retriesNeeded: 0,
        parentSnapshotId,
        snapshotId: 0n,
        sequenceNumber,
        changed: false,
        outputManifestCount: 0,
    };
}
/** Returns the id of the snapshot with the smallest `timestamp-ms`, or 0n for an empty list. */
function _oldestSnapshotId(snapshots) {
    let oldest_id = 0n;
    let earliest_time = 0;
    for (const snap of snapshots) {
        const snap_time = snap['timestamp-ms'];
        // 0 doubles as the "nothing chosen yet" marker.
        if (earliest_time === 0 || snap_time < earliest_time) {
            earliest_time = snap_time;
            oldest_id = BigInt(snap['snapshot-id']);
        }
    }
    return oldest_id;
}
/** True when two partition bounds are identical or hold the same bytes. */
function _sameBound(left, right) {
    return (left === right ||
        (left && right && Buffer.compare(left, right) === 0));
}
/**
 * Decides whether two manifest-list records may be rewritten into one
 * manifest: both are data manifests without deleted files, they use the same
 * partition spec, and every partition summary has matching bounds.
 */
function _canGroupManifests(a, b) {
    if (a.content === ListContent.DATA &&
        b.content === ListContent.DATA &&
        a.deleted_files_count === 0 &&
        b.deleted_files_count === 0 &&
        a.partition_spec_id === b.partition_spec_id) {
        return (!a.partitions ||
            a.partitions.every((part, i) => {
                const other = b.partitions?.[i];
                return (other &&
                    _sameBound(part.upper_bound, other.upper_bound) &&
                    _sameBound(part.lower_bound, other.lower_bound));
            }));
    }
    return false;
}
|
|
1972
|
+
async function _combineGroup(params) {
|
|
1973
|
+
const { credentials, region, bucket, group } = params;
|
|
1974
|
+
const record0 = group[0];
|
|
1975
|
+
if ((group.length === 1 && !params.forceRewrite) || !record0) {
|
|
1976
|
+
return group;
|
|
1977
|
+
}
|
|
1978
|
+
const key = `metadata/${node_crypto.randomUUID()}.avro`;
|
|
1979
|
+
const schema = makeManifestSchema(params.spec, params.schemas, true);
|
|
1980
|
+
const type = makeManifestType(params.spec, params.schemas, true);
|
|
1981
|
+
const iter = asyncIterMap(group, async (record) => {
|
|
1982
|
+
return _streamReadManifest({
|
|
1983
|
+
credentials,
|
|
1984
|
+
region,
|
|
1985
|
+
bucket,
|
|
1986
|
+
url: record.manifest_path,
|
|
1987
|
+
schema,
|
|
1988
|
+
});
|
|
1989
|
+
});
|
|
1990
|
+
const manifest_length = await streamWriteAvro({
|
|
1991
|
+
credentials,
|
|
1992
|
+
region,
|
|
1993
|
+
bucket,
|
|
1994
|
+
key,
|
|
1995
|
+
metadata: {
|
|
1996
|
+
'partition-spec-id': String(params.spec['spec-id']),
|
|
1997
|
+
'partition-spec': JSON.stringify(params.spec.fields),
|
|
1998
|
+
},
|
|
1999
|
+
avroType: type,
|
|
2000
|
+
iter,
|
|
2001
|
+
});
|
|
2002
|
+
const ret = {
|
|
2003
|
+
manifest_path: `s3://${bucket}/${key}`,
|
|
2004
|
+
manifest_length: BigInt(manifest_length),
|
|
2005
|
+
partition_spec_id: record0.partition_spec_id,
|
|
2006
|
+
content: record0.content,
|
|
2007
|
+
sequence_number: params.sequenceNumber,
|
|
2008
|
+
min_sequence_number: params.sequenceNumber,
|
|
2009
|
+
added_snapshot_id: params.snapshotId,
|
|
2010
|
+
added_files_count: 0,
|
|
2011
|
+
existing_files_count: 0,
|
|
2012
|
+
deleted_files_count: 0,
|
|
2013
|
+
added_rows_count: 0n,
|
|
2014
|
+
existing_rows_count: 0n,
|
|
2015
|
+
deleted_rows_count: 0n,
|
|
2016
|
+
partitions: record0.partitions ?? null,
|
|
2017
|
+
};
|
|
2018
|
+
for (const record of group) {
|
|
2019
|
+
ret.added_files_count += record.added_files_count;
|
|
2020
|
+
ret.existing_files_count += record.existing_files_count;
|
|
2021
|
+
ret.deleted_files_count += record.deleted_files_count;
|
|
2022
|
+
ret.added_rows_count += record.added_rows_count;
|
|
2023
|
+
ret.existing_rows_count += record.existing_rows_count;
|
|
2024
|
+
ret.deleted_rows_count += record.deleted_rows_count;
|
|
2025
|
+
ret.min_sequence_number = _bigintMin(ret.min_sequence_number, record.min_sequence_number);
|
|
2026
|
+
}
|
|
2027
|
+
for (let i = 1; i < group.length; i++) {
|
|
2028
|
+
const parts = group[i]?.partitions;
|
|
2029
|
+
if (ret.partitions && parts) {
|
|
2030
|
+
for (let j = 0; j < parts.length; j++) {
|
|
2031
|
+
const part = parts[j];
|
|
2032
|
+
const ret_part = ret.partitions[j];
|
|
2033
|
+
if (part && ret_part) {
|
|
2034
|
+
ret_part.contains_null ||= part.contains_null;
|
|
2035
|
+
if (part.contains_nan !== undefined) {
|
|
2036
|
+
ret_part.contains_nan =
|
|
2037
|
+
(ret_part.contains_nan ?? false) || part.contains_nan;
|
|
2038
|
+
}
|
|
2039
|
+
if (!ret_part.upper_bound ||
|
|
2040
|
+
(part.upper_bound &&
|
|
2041
|
+
Buffer.compare(part.upper_bound, ret_part.upper_bound) > 0)) {
|
|
2042
|
+
ret_part.upper_bound = part.upper_bound ?? null;
|
|
2043
|
+
}
|
|
2044
|
+
if (!ret_part.lower_bound ||
|
|
2045
|
+
(part.lower_bound &&
|
|
2046
|
+
Buffer.compare(part.lower_bound, ret_part.lower_bound) < 0)) {
|
|
2047
|
+
ret_part.lower_bound = part.lower_bound ?? null;
|
|
2048
|
+
}
|
|
2049
|
+
}
|
|
2050
|
+
}
|
|
2051
|
+
}
|
|
2052
|
+
else if (parts) {
|
|
2053
|
+
ret.partitions = parts;
|
|
2054
|
+
}
|
|
2055
|
+
}
|
|
2056
|
+
return [ret];
|
|
2057
|
+
}
|
|
2058
|
+
/**
 * Downloads one manifest and returns its records translated to `params.schema`.
 *
 * `params.url` is either a full `s3://` URL, which supplies both bucket and
 * key, or a bare key inside `params.bucket`.
 *
 * @throws when no bucket or key can be determined
 */
async function _streamReadManifest(params) {
    const { url } = params;
    const location = url.startsWith('s3://')
        ? parseS3Url(url)
        : { bucket: params.bucket, key: url };
    if (!location.bucket || !location.key) {
        throw new Error(`invalid manfiest url: ${url}`);
    }
    const records = await downloadAvro({
        credentials: params.credentials,
        region: params.region,
        bucket: location.bucket,
        key: location.key,
        avroSchema: params.schema,
    });
    return records;
}
|
|
2078
|
+
/**
 * Filter predicate for manifest-list records: drops data manifests that have
 * neither added nor existing files, and keeps everything else.
 */
function _filterDeletes(record) {
    const is_data_manifest = record.content === ListContent.DATA;
    const has_live_files = record.added_files_count > 0 || record.existing_files_count > 0;
    return !is_data_manifest || has_live_files;
}
|
|
2083
|
+
function _groupList(list, compare) {
|
|
2084
|
+
const ret = [];
|
|
2085
|
+
for (const item of list) {
|
|
2086
|
+
let added = false;
|
|
2087
|
+
for (const group of ret) {
|
|
2088
|
+
if (group[0] && compare(group[0], item)) {
|
|
2089
|
+
group.push(item);
|
|
2090
|
+
added = true;
|
|
2091
|
+
break;
|
|
2092
|
+
}
|
|
2093
|
+
}
|
|
2094
|
+
if (!added) {
|
|
2095
|
+
ret.push([item]);
|
|
2096
|
+
}
|
|
2097
|
+
}
|
|
2098
|
+
return ret;
|
|
2099
|
+
}
|
|
2100
|
+
function _combineWeightGroups(groups, targetCount, calculateWeight) {
|
|
2101
|
+
const weighted_groups = groups.map((group) => ({
|
|
2102
|
+
group,
|
|
2103
|
+
weight: calculateWeight(group),
|
|
2104
|
+
}));
|
|
2105
|
+
weighted_groups.sort(_sortGroup);
|
|
2106
|
+
while (weighted_groups.length > targetCount) {
|
|
2107
|
+
let remove_item;
|
|
2108
|
+
let merge_item;
|
|
2109
|
+
for (let i = 0; i < weighted_groups.length; i++) {
|
|
2110
|
+
const check_item = weighted_groups[i];
|
|
2111
|
+
const partition_spec_id = check_item?.group[0]?.partition_spec_id;
|
|
2112
|
+
if (partition_spec_id !== undefined) {
|
|
2113
|
+
for (let j = i + 1; j < weighted_groups.length; j++) {
|
|
2114
|
+
merge_item = weighted_groups[j];
|
|
2115
|
+
if (merge_item?.group[0]?.partition_spec_id === partition_spec_id) {
|
|
2116
|
+
remove_item = weighted_groups.splice(i, 1)[0];
|
|
2117
|
+
break;
|
|
2118
|
+
}
|
|
2119
|
+
}
|
|
2120
|
+
}
|
|
2121
|
+
}
|
|
2122
|
+
if (!remove_item || !merge_item) {
|
|
2123
|
+
break;
|
|
2124
|
+
}
|
|
2125
|
+
for (const item of remove_item.group) {
|
|
2126
|
+
merge_item.group.push(item);
|
|
2127
|
+
}
|
|
2128
|
+
}
|
|
2129
|
+
return weighted_groups.map((g) => g.group);
|
|
2130
|
+
}
|
|
2131
|
+
/** Sort comparator that orders weighted groups by ascending `weight`. */
function _sortGroup(a, b) {
    const { weight: left } = a;
    const { weight: right } = b;
    return left - right;
}
|
|
1577
2134
|
function _randomBigInt64() {
|
|
1578
2135
|
const bytes = node_crypto.randomBytes(8);
|
|
1579
2136
|
let ret = bytes.readBigUInt64BE();
|
|
@@ -1583,6 +2140,15 @@ function _randomBigInt64() {
|
|
|
1583
2140
|
}
|
|
1584
2141
|
return ret;
|
|
1585
2142
|
}
|
|
2143
|
+
/**
 * Returns the smallest of the given values.
 *
 * @param value0 first value; returned when no other value is smaller
 * @param values further values to compare
 */
function _bigintMin(value0, ...values) {
    return values.reduce((smallest, candidate) => (candidate < smallest ? candidate : smallest), value0);
}
|
|
1586
2152
|
|
|
1587
2153
|
var index = {
|
|
1588
2154
|
IcebergHttpError,
|
|
@@ -1602,5 +2168,7 @@ exports.addPartitionSpec = addPartitionSpec;
|
|
|
1602
2168
|
exports.addSchema = addSchema;
|
|
1603
2169
|
exports.default = index;
|
|
1604
2170
|
exports.getMetadata = getMetadata;
|
|
2171
|
+
exports.manifestCompact = manifestCompact;
|
|
1605
2172
|
exports.removeSnapshots = removeSnapshots;
|
|
1606
2173
|
exports.setCurrentCommit = setCurrentCommit;
|
|
2174
|
+
exports.submitSnapshot = submitSnapshot;
|
package/package.json
CHANGED
|
@@ -1,15 +1,18 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "node-s3tables",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.18",
|
|
4
4
|
"description": "node api for dealing with s3tables",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
7
|
+
"bin": {
|
|
8
|
+
"node-s3tables": "dist/bin-compact.js"
|
|
9
|
+
},
|
|
7
10
|
"files": [
|
|
8
11
|
"dist/**/*"
|
|
9
12
|
],
|
|
10
13
|
"scripts": {
|
|
11
14
|
"build": "rollup -c",
|
|
12
|
-
"ts
|
|
15
|
+
"ts:check": "tsc --noEmit",
|
|
13
16
|
"lint": "eslint src test",
|
|
14
17
|
"pretty": "prettier -u --write \"**/*\" --log-level warn",
|
|
15
18
|
"test": "dotenv -- tsx --test test/*.test.ts",
|