node-s3tables 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bin.js +105 -14
- package/dist/index.d.ts +56 -40
- package/dist/index.js +386 -260
- package/package.json +4 -3
package/dist/bin.js
CHANGED
|
@@ -1,25 +1,116 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
'use strict';
|
|
3
3
|
|
|
4
|
+
var node_util = require('node:util');
|
|
4
5
|
var nodeS3tables = require('node-s3tables');
|
|
5
6
|
|
|
6
7
|
/* eslint-disable no-console */
|
|
7
|
-
const
|
|
8
|
-
|
|
9
|
-
|
|
8
|
+
const { positionals, values } = node_util.parseArgs({
|
|
9
|
+
allowPositionals: true,
|
|
10
|
+
options: {
|
|
11
|
+
'force-rewrite': { type: 'boolean' },
|
|
12
|
+
'spec-id': { type: 'string' },
|
|
13
|
+
'schema-id': { type: 'string' },
|
|
14
|
+
files: { type: 'string' },
|
|
15
|
+
'max-snapshots': { type: 'string' },
|
|
16
|
+
'redshift-manifest-url': { type: 'string' },
|
|
17
|
+
},
|
|
18
|
+
});
|
|
19
|
+
const [command, tableBucketARN, namespace, name] = positionals;
|
|
20
|
+
if (!command || !tableBucketARN || !namespace || !name) {
|
|
21
|
+
console.error('Usage: node-s3tables <command> <tableBucketARN> <namespace> <name> [options]\n');
|
|
22
|
+
console.error('Commands:');
|
|
23
|
+
console.error(' compact Compact manifest files');
|
|
24
|
+
console.error(' Options: --force-rewrite');
|
|
25
|
+
console.error('');
|
|
26
|
+
console.error(' add_files Add data files to table');
|
|
27
|
+
console.error(' Options: --spec-id <id> --schema-id <id> --files <json> [--max-snapshots <n>]');
|
|
28
|
+
console.error(' Example: --files \'[{"file":"s3://bucket/data.parquet","partitions":{},"recordCount":"1000","fileSize":"52428"}]\'');
|
|
29
|
+
console.error('');
|
|
30
|
+
console.error(' import_redshift Import redshift manifest created by UNLOAD');
|
|
31
|
+
console.error(' Options: --redshift-manifest-url s3://s3table-bucket/unload/manfiest');
|
|
32
|
+
console.error('');
|
|
10
33
|
process.exit(-1);
|
|
11
34
|
}
|
|
12
|
-
|
|
13
|
-
.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
35
|
+
if (command === 'compact') {
|
|
36
|
+
console.log('Compact:', tableBucketARN, namespace, name, 'forceRewrite:', Boolean(values['force-rewrite']));
|
|
37
|
+
nodeS3tables.manifestCompact({
|
|
38
|
+
tableBucketARN,
|
|
39
|
+
namespace,
|
|
40
|
+
name,
|
|
41
|
+
forceRewrite: Boolean(values['force-rewrite']),
|
|
42
|
+
})
|
|
43
|
+
.then((result) => {
|
|
44
|
+
console.log('Compact result:', result);
|
|
45
|
+
process.exit(0);
|
|
46
|
+
})
|
|
47
|
+
.catch((error) => {
|
|
48
|
+
console.error('Error:', error);
|
|
49
|
+
process.exit(1);
|
|
50
|
+
});
|
|
51
|
+
}
|
|
52
|
+
else if (command === 'add_files') {
|
|
53
|
+
const specId = values['spec-id'];
|
|
54
|
+
const schemaId = values['schema-id'];
|
|
55
|
+
const filesJson = values.files;
|
|
56
|
+
if (!specId || !schemaId || !filesJson) {
|
|
57
|
+
console.error('Error: Missing required options for add_files command\n');
|
|
58
|
+
console.error('Usage: node-s3tables add_files <tableBucketARN> <namespace> <name> --spec-id <id> --schema-id <id> --files <json> [--max-snapshots <n>]\n');
|
|
59
|
+
console.error('Example:');
|
|
60
|
+
console.error(' --spec-id 1 --schema-id 2 --files \'[{"file":"s3://bucket/data.parquet","partitions":{"date":"2024-01-01"},"recordCount":"1000","fileSize":"52428"}]\'');
|
|
61
|
+
process.exit(-1);
|
|
20
62
|
}
|
|
21
|
-
|
|
63
|
+
const files = JSON.parse(filesJson);
|
|
64
|
+
const maxSnapshots = values['max-snapshots']
|
|
65
|
+
? parseInt(values['max-snapshots'], 10)
|
|
66
|
+
: undefined;
|
|
67
|
+
console.log('Adding files:', tableBucketARN, namespace, name);
|
|
68
|
+
const params = {
|
|
69
|
+
tableBucketARN,
|
|
70
|
+
namespace,
|
|
71
|
+
name,
|
|
72
|
+
lists: [
|
|
73
|
+
{ specId: parseInt(specId, 10), schemaId: parseInt(schemaId, 10), files },
|
|
74
|
+
],
|
|
75
|
+
};
|
|
76
|
+
if (maxSnapshots !== undefined) {
|
|
77
|
+
params.maxSnapshots = maxSnapshots;
|
|
78
|
+
}
|
|
79
|
+
nodeS3tables.addDataFiles(params)
|
|
80
|
+
.then((result) => {
|
|
81
|
+
console.log('Add files result:', result);
|
|
82
|
+
process.exit(0);
|
|
83
|
+
})
|
|
84
|
+
.catch((error) => {
|
|
22
85
|
console.error('Error:', error);
|
|
86
|
+
process.exit(1);
|
|
87
|
+
});
|
|
88
|
+
}
|
|
89
|
+
else if (command === 'import_redshift') {
|
|
90
|
+
const redshiftManifestUrl = values['redshift-manifest-url'];
|
|
91
|
+
if (!redshiftManifestUrl) {
|
|
92
|
+
console.error('Error: Missing required options for import_redshift command\n');
|
|
93
|
+
console.error('Usage: node-s3tables import_redshift <tableBucketARN> <namespace> <name> --redshift-manfiest-url <s3url>\n');
|
|
94
|
+
console.error('Example:');
|
|
95
|
+
console.error(' --redshift-manifest-url s3://s3table-bucket/exported_manfiest.json');
|
|
96
|
+
process.exit(-1);
|
|
23
97
|
}
|
|
24
|
-
|
|
25
|
-
|
|
98
|
+
nodeS3tables.importRedshiftManifest({
|
|
99
|
+
tableBucketARN,
|
|
100
|
+
namespace,
|
|
101
|
+
name,
|
|
102
|
+
redshiftManifestUrl,
|
|
103
|
+
})
|
|
104
|
+
.then((result) => {
|
|
105
|
+
console.log('Import result:', result);
|
|
106
|
+
process.exit(0);
|
|
107
|
+
})
|
|
108
|
+
.catch((error) => {
|
|
109
|
+
console.error('Error:', error);
|
|
110
|
+
process.exit(1);
|
|
111
|
+
});
|
|
112
|
+
}
|
|
113
|
+
else {
|
|
114
|
+
console.error('Unknown command:', command);
|
|
115
|
+
process.exit(-1);
|
|
116
|
+
}
|
package/dist/index.d.ts
CHANGED
|
@@ -120,6 +120,51 @@ interface AddManifestParams {
|
|
|
120
120
|
files: AddFile[];
|
|
121
121
|
}
|
|
122
122
|
declare function addManifest(params: AddManifestParams): Promise<ManifestListRecord>;
|
|
123
|
+
declare function minBuffer(a: Buffer | null | undefined, b: Buffer | null | undefined, field: IcebergPartitionField, schema: IcebergSchema): Buffer | null;
|
|
124
|
+
declare function maxBuffer(a: Buffer | null | undefined, b: Buffer | null | undefined, field: IcebergPartitionField, schema: IcebergSchema): Buffer | null;
|
|
125
|
+
|
|
126
|
+
type JSONPrimitive = string | number | boolean | null | bigint | undefined;
|
|
127
|
+
type JSONValue = JSONPrimitive | JSONObject | JSONArray;
|
|
128
|
+
interface JSONObject {
|
|
129
|
+
[key: string]: JSONValue;
|
|
130
|
+
}
|
|
131
|
+
type JSONArray = JSONValue[];
|
|
132
|
+
|
|
133
|
+
interface AddFileList {
|
|
134
|
+
specId: number;
|
|
135
|
+
schemaId: number;
|
|
136
|
+
files: AddFile[];
|
|
137
|
+
}
|
|
138
|
+
interface AddDataFilesParams {
|
|
139
|
+
credentials?: AwsCredentialIdentity | undefined;
|
|
140
|
+
tableBucketARN: string;
|
|
141
|
+
namespace: string;
|
|
142
|
+
name: string;
|
|
143
|
+
snapshotId?: bigint;
|
|
144
|
+
lists: AddFileList[];
|
|
145
|
+
retryCount?: number | undefined;
|
|
146
|
+
maxSnapshots?: number;
|
|
147
|
+
}
|
|
148
|
+
interface AddDataFilesResult {
|
|
149
|
+
result: JSONObject;
|
|
150
|
+
retriesNeeded: number;
|
|
151
|
+
parentSnapshotId: bigint;
|
|
152
|
+
snapshotId: bigint;
|
|
153
|
+
sequenceNumber: bigint;
|
|
154
|
+
}
|
|
155
|
+
declare function addDataFiles(params: AddDataFilesParams): Promise<AddDataFilesResult>;
|
|
156
|
+
|
|
157
|
+
interface ImportRedshiftManifestParams {
|
|
158
|
+
credentials?: AwsCredentialIdentity;
|
|
159
|
+
tableBucketARN: string;
|
|
160
|
+
namespace: string;
|
|
161
|
+
name: string;
|
|
162
|
+
redshiftManifestUrl: string;
|
|
163
|
+
schemaId?: number;
|
|
164
|
+
specId?: number;
|
|
165
|
+
retryCount?: number | undefined;
|
|
166
|
+
}
|
|
167
|
+
declare function importRedshiftManifest(params: ImportRedshiftManifestParams): Promise<AddDataFilesResult>;
|
|
123
168
|
|
|
124
169
|
type TableLocation = {
|
|
125
170
|
tableArn: string;
|
|
@@ -164,36 +209,12 @@ interface RemoveSnapshotsParams {
|
|
|
164
209
|
}
|
|
165
210
|
declare function removeSnapshots(params: RemoveSnapshotsParams): Promise<IcebergUpdateResponse>;
|
|
166
211
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
type JSONArray = JSONValue[];
|
|
173
|
-
|
|
174
|
-
interface AddFileList {
|
|
175
|
-
specId: number;
|
|
176
|
-
schemaId: number;
|
|
177
|
-
files: AddFile[];
|
|
178
|
-
}
|
|
179
|
-
interface AddDataFilesParams {
|
|
180
|
-
credentials?: AwsCredentialIdentity;
|
|
181
|
-
tableBucketARN: string;
|
|
182
|
-
namespace: string;
|
|
183
|
-
name: string;
|
|
184
|
-
snapshotId?: bigint;
|
|
185
|
-
lists: AddFileList[];
|
|
186
|
-
retryCount?: number;
|
|
187
|
-
maxSnapshots?: number;
|
|
188
|
-
}
|
|
189
|
-
interface AddDataFilesResult {
|
|
190
|
-
result: JSONObject;
|
|
191
|
-
retriesNeeded: number;
|
|
192
|
-
parentSnapshotId: bigint;
|
|
193
|
-
snapshotId: bigint;
|
|
194
|
-
sequenceNumber: bigint;
|
|
212
|
+
declare class IcebergHttpError extends Error {
|
|
213
|
+
status: number;
|
|
214
|
+
text?: string;
|
|
215
|
+
body?: JSONObject;
|
|
216
|
+
constructor(status: number, body: JSONValue, message: string);
|
|
195
217
|
}
|
|
196
|
-
declare function addDataFiles(params: AddDataFilesParams): Promise<AddDataFilesResult>;
|
|
197
218
|
|
|
198
219
|
interface SubmitSnapshotParams {
|
|
199
220
|
credentials?: AwsCredentialIdentity | undefined;
|
|
@@ -231,13 +252,6 @@ interface SetCurrentCommitParams {
|
|
|
231
252
|
}
|
|
232
253
|
declare function setCurrentCommit(params: SetCurrentCommitParams): Promise<JSONObject>;
|
|
233
254
|
|
|
234
|
-
declare class IcebergHttpError extends Error {
|
|
235
|
-
status: number;
|
|
236
|
-
text?: string;
|
|
237
|
-
body?: JSONObject;
|
|
238
|
-
constructor(status: number, body: JSONValue, message: string);
|
|
239
|
-
}
|
|
240
|
-
|
|
241
255
|
type CalculateWeightFunction = (group: ManifestListRecord[]) => number;
|
|
242
256
|
interface ManifestCompactParams {
|
|
243
257
|
credentials?: AwsCredentialIdentity;
|
|
@@ -253,20 +267,22 @@ interface ManifestCompactParams {
|
|
|
253
267
|
}
|
|
254
268
|
interface ManifestCompactResult extends SubmitSnapshotResult {
|
|
255
269
|
changed: boolean;
|
|
270
|
+
inputManifestCount: number;
|
|
256
271
|
outputManifestCount: number;
|
|
257
272
|
}
|
|
258
273
|
declare function manifestCompact(params: ManifestCompactParams): Promise<ManifestCompactResult>;
|
|
259
274
|
|
|
260
275
|
declare const _default: {
|
|
261
276
|
IcebergHttpError: typeof IcebergHttpError;
|
|
262
|
-
getMetadata: typeof getMetadata;
|
|
263
277
|
addSchema: typeof addSchema;
|
|
264
278
|
addPartitionSpec: typeof addPartitionSpec;
|
|
265
279
|
addManifest: typeof addManifest;
|
|
266
280
|
addDataFiles: typeof addDataFiles;
|
|
267
|
-
|
|
281
|
+
getMetadata: typeof getMetadata;
|
|
282
|
+
importRedshiftManifest: typeof importRedshiftManifest;
|
|
268
283
|
removeSnapshots: typeof removeSnapshots;
|
|
284
|
+
setCurrentCommit: typeof setCurrentCommit;
|
|
269
285
|
};
|
|
270
286
|
|
|
271
|
-
export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, manifestCompact, removeSnapshots, setCurrentCommit, submitSnapshot };
|
|
272
|
-
export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, CalculateWeightFunction, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, ManifestCompactParams, ManifestCompactResult, ManifestListRecord, RemoveSnapshotsParams, ResolveConflictResult, SetCurrentCommitParams, SubmitSnapshotParams, SubmitSnapshotResult, TableLocation };
|
|
287
|
+
export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, importRedshiftManifest, manifestCompact, maxBuffer, minBuffer, removeSnapshots, setCurrentCommit, submitSnapshot };
|
|
288
|
+
export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, CalculateWeightFunction, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, ImportRedshiftManifestParams, ManifestCompactParams, ManifestCompactResult, ManifestListRecord, RemoveSnapshotsParams, ResolveConflictResult, SetCurrentCommitParams, SubmitSnapshotParams, SubmitSnapshotResult, TableLocation };
|
package/dist/index.js
CHANGED
|
@@ -803,6 +803,37 @@ function makeBounds(partitions, spec, schema) {
|
|
|
803
803
|
return _encodeValue(raw, f.transform, out_type);
|
|
804
804
|
});
|
|
805
805
|
}
|
|
806
|
+
function compareBounds(a, b, field, schema) {
|
|
807
|
+
const schemaField = schema.fields.find((sf) => sf.id === field['source-id']);
|
|
808
|
+
if (!schemaField) {
|
|
809
|
+
throw new Error(`Schema field not found for source-id ${field['source-id']}`);
|
|
810
|
+
}
|
|
811
|
+
const out_type = _outputType(field.transform, schemaField.type);
|
|
812
|
+
switch (out_type) {
|
|
813
|
+
case 'boolean':
|
|
814
|
+
return Buffer.from(a).readUInt8() - Buffer.from(b).readUInt8();
|
|
815
|
+
case 'int':
|
|
816
|
+
return Buffer.from(a).readInt32LE() - Buffer.from(b).readInt32LE();
|
|
817
|
+
case 'long': {
|
|
818
|
+
const diff = Buffer.from(a).readBigInt64LE() - Buffer.from(b).readBigInt64LE();
|
|
819
|
+
return diff > 0n ? 1 : diff < 0n ? -1 : 0;
|
|
820
|
+
}
|
|
821
|
+
case 'float':
|
|
822
|
+
return Buffer.from(a).readFloatLE() - Buffer.from(b).readFloatLE();
|
|
823
|
+
case 'double':
|
|
824
|
+
return Buffer.from(a).readDoubleLE() - Buffer.from(b).readDoubleLE();
|
|
825
|
+
case null:
|
|
826
|
+
case 'date':
|
|
827
|
+
case 'time':
|
|
828
|
+
case 'timestamp':
|
|
829
|
+
case 'timestamptz':
|
|
830
|
+
case 'string':
|
|
831
|
+
case 'uuid':
|
|
832
|
+
case 'binary':
|
|
833
|
+
default:
|
|
834
|
+
return Buffer.compare(a, b);
|
|
835
|
+
}
|
|
836
|
+
}
|
|
806
837
|
|
|
807
838
|
function isRawRecordSchema(schema) {
|
|
808
839
|
return (typeof schema === 'object' &&
|
|
@@ -957,6 +988,13 @@ function parseS3Url(url) {
|
|
|
957
988
|
}
|
|
958
989
|
const g_s3Map = new Map();
|
|
959
990
|
const g_s3TablesMap = new Map();
|
|
991
|
+
class ByteCounter extends node_stream.Transform {
|
|
992
|
+
bytes = 0;
|
|
993
|
+
_transform(chunk, _encoding, callback) {
|
|
994
|
+
this.bytes += chunk.length;
|
|
995
|
+
callback(null, chunk);
|
|
996
|
+
}
|
|
997
|
+
}
|
|
960
998
|
function getS3Client(params) {
|
|
961
999
|
const { region, credentials } = params;
|
|
962
1000
|
let ret = g_s3Map.get(region)?.get(credentials);
|
|
@@ -1085,26 +1123,32 @@ async function streamWriteAvro(params) {
|
|
|
1085
1123
|
codecs: { deflate: zlib__namespace.deflateRaw },
|
|
1086
1124
|
metadata,
|
|
1087
1125
|
});
|
|
1126
|
+
const counter = new ByteCounter();
|
|
1127
|
+
encoder.pipe(counter);
|
|
1088
1128
|
const upload = new libStorage.Upload({
|
|
1089
1129
|
client: s3,
|
|
1090
|
-
params: { Bucket: bucket, Key: key, Body:
|
|
1130
|
+
params: { Bucket: bucket, Key: key, Body: counter },
|
|
1091
1131
|
});
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
file_size = progress.loaded;
|
|
1132
|
+
async function _abortUpload() {
|
|
1133
|
+
try {
|
|
1134
|
+
await upload.abort();
|
|
1096
1135
|
}
|
|
1097
|
-
|
|
1136
|
+
catch {
|
|
1137
|
+
// noop
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1098
1140
|
const upload_promise = upload.done();
|
|
1099
1141
|
let found_err;
|
|
1100
1142
|
upload_promise.catch((err) => {
|
|
1101
|
-
found_err
|
|
1143
|
+
found_err ??= err;
|
|
1102
1144
|
});
|
|
1103
1145
|
encoder.on('error', (err) => {
|
|
1104
|
-
found_err
|
|
1146
|
+
found_err ??= err;
|
|
1147
|
+
void _abortUpload();
|
|
1105
1148
|
});
|
|
1106
1149
|
for await (const batch of params.iter) {
|
|
1107
1150
|
if (found_err) {
|
|
1151
|
+
void _abortUpload();
|
|
1108
1152
|
throw found_err;
|
|
1109
1153
|
}
|
|
1110
1154
|
for (const record of batch) {
|
|
@@ -1114,9 +1158,10 @@ async function streamWriteAvro(params) {
|
|
|
1114
1158
|
encoder.end();
|
|
1115
1159
|
await upload_promise;
|
|
1116
1160
|
if (found_err) {
|
|
1161
|
+
void _abortUpload();
|
|
1117
1162
|
throw found_err;
|
|
1118
1163
|
}
|
|
1119
|
-
return
|
|
1164
|
+
return counter.bytes;
|
|
1120
1165
|
}
|
|
1121
1166
|
async function downloadAvro(params) {
|
|
1122
1167
|
const { region, credentials, bucket, key, avroSchema } = params;
|
|
@@ -1188,15 +1233,16 @@ async function addManifest(params) {
|
|
|
1188
1233
|
for (let i = 0; i < partitions.length; i++) {
|
|
1189
1234
|
const part = partitions[i];
|
|
1190
1235
|
const bound = bounds[i];
|
|
1191
|
-
|
|
1236
|
+
const field = spec.fields[i];
|
|
1237
|
+
if (!part || !field) {
|
|
1192
1238
|
throw new Error('impossible');
|
|
1193
1239
|
}
|
|
1194
1240
|
else if (bound === null) {
|
|
1195
1241
|
part.contains_null = true;
|
|
1196
1242
|
}
|
|
1197
1243
|
else if (Buffer.isBuffer(bound)) {
|
|
1198
|
-
part.upper_bound =
|
|
1199
|
-
part.lower_bound =
|
|
1244
|
+
part.upper_bound = maxBuffer(part.upper_bound ?? null, bound, field, schema);
|
|
1245
|
+
part.lower_bound = minBuffer(part.lower_bound ?? null, bound, field, schema);
|
|
1200
1246
|
}
|
|
1201
1247
|
else {
|
|
1202
1248
|
part.contains_nan = true;
|
|
@@ -1275,29 +1321,29 @@ function _transformRecord(schema, map) {
|
|
|
1275
1321
|
}
|
|
1276
1322
|
return ret.length > 0 ? ret : null;
|
|
1277
1323
|
}
|
|
1278
|
-
function
|
|
1279
|
-
if (
|
|
1280
|
-
return
|
|
1324
|
+
function minBuffer(a, b, field, schema) {
|
|
1325
|
+
if (a && b) {
|
|
1326
|
+
return compareBounds(a, b, field, schema) <= 0 ? a : b;
|
|
1281
1327
|
}
|
|
1282
|
-
else if (
|
|
1283
|
-
return b;
|
|
1284
|
-
}
|
|
1285
|
-
else if (!b) {
|
|
1328
|
+
else if (a) {
|
|
1286
1329
|
return a;
|
|
1287
1330
|
}
|
|
1288
|
-
|
|
1289
|
-
}
|
|
1290
|
-
function _maxBuffer(a, b) {
|
|
1291
|
-
if (!a && !b) {
|
|
1292
|
-
return null;
|
|
1293
|
-
}
|
|
1294
|
-
else if (!a) {
|
|
1331
|
+
else if (b) {
|
|
1295
1332
|
return b;
|
|
1296
1333
|
}
|
|
1297
|
-
|
|
1334
|
+
return null;
|
|
1335
|
+
}
|
|
1336
|
+
function maxBuffer(a, b, field, schema) {
|
|
1337
|
+
if (a && b) {
|
|
1338
|
+
return compareBounds(a, b, field, schema) >= 0 ? a : b;
|
|
1339
|
+
}
|
|
1340
|
+
else if (a) {
|
|
1298
1341
|
return a;
|
|
1299
1342
|
}
|
|
1300
|
-
|
|
1343
|
+
else if (b) {
|
|
1344
|
+
return b;
|
|
1345
|
+
}
|
|
1346
|
+
return null;
|
|
1301
1347
|
}
|
|
1302
1348
|
|
|
1303
1349
|
function customNumberParser(value) {
|
|
@@ -1473,205 +1519,6 @@ async function removeSnapshots(params) {
|
|
|
1473
1519
|
});
|
|
1474
1520
|
}
|
|
1475
1521
|
|
|
1476
|
-
const DEFAULT_RETRY_COUNT$1 = 5;
|
|
1477
|
-
async function addDataFiles(params) {
|
|
1478
|
-
const { credentials } = params;
|
|
1479
|
-
const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT$1;
|
|
1480
|
-
const region = params.tableBucketARN.split(':')[3];
|
|
1481
|
-
if (!region) {
|
|
1482
|
-
throw new Error('bad tableBucketARN');
|
|
1483
|
-
}
|
|
1484
|
-
const snapshot_id = params.snapshotId ?? _randomBigInt64$1();
|
|
1485
|
-
const metadata = await getMetadata(params);
|
|
1486
|
-
const bucket = metadata.location.split('/').slice(-1)[0];
|
|
1487
|
-
const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
|
|
1488
|
-
const snapshot = metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === parent_snapshot_id) ?? null;
|
|
1489
|
-
if (!bucket) {
|
|
1490
|
-
throw new Error('bad manifest location');
|
|
1491
|
-
}
|
|
1492
|
-
if (parent_snapshot_id > 0n && !snapshot) {
|
|
1493
|
-
throw new Error('no old snapshot');
|
|
1494
|
-
}
|
|
1495
|
-
let old_list_key = snapshot ? parseS3Url(snapshot['manifest-list']).key : '';
|
|
1496
|
-
if (snapshot && !old_list_key) {
|
|
1497
|
-
throw new Error('last snapshot invalid');
|
|
1498
|
-
}
|
|
1499
|
-
let sequence_number = BigInt(metadata['last-sequence-number']) + 1n;
|
|
1500
|
-
let remove_snapshot_id = 0n;
|
|
1501
|
-
if (params.maxSnapshots && metadata.snapshots.length >= params.maxSnapshots) {
|
|
1502
|
-
let earliest_time = 0;
|
|
1503
|
-
for (const snap of metadata.snapshots) {
|
|
1504
|
-
const snap_time = snap['timestamp-ms'];
|
|
1505
|
-
if (earliest_time === 0 || snap_time < earliest_time) {
|
|
1506
|
-
earliest_time = snap_time;
|
|
1507
|
-
remove_snapshot_id = BigInt(snap['snapshot-id']);
|
|
1508
|
-
}
|
|
1509
|
-
}
|
|
1510
|
-
}
|
|
1511
|
-
let added_files = 0;
|
|
1512
|
-
let added_records = 0n;
|
|
1513
|
-
let added_size = 0n;
|
|
1514
|
-
const records = await Promise.all(params.lists.map(async (list) => {
|
|
1515
|
-
added_files += list.files.length;
|
|
1516
|
-
for (const file of list.files) {
|
|
1517
|
-
added_records += file.recordCount;
|
|
1518
|
-
added_size += file.fileSize;
|
|
1519
|
-
}
|
|
1520
|
-
const opts = {
|
|
1521
|
-
credentials,
|
|
1522
|
-
region,
|
|
1523
|
-
metadata,
|
|
1524
|
-
schemaId: list.schemaId,
|
|
1525
|
-
specId: list.specId,
|
|
1526
|
-
snapshotId: snapshot_id,
|
|
1527
|
-
sequenceNumber: sequence_number,
|
|
1528
|
-
files: list.files,
|
|
1529
|
-
};
|
|
1530
|
-
return addManifest(opts);
|
|
1531
|
-
}));
|
|
1532
|
-
let expected_snapshot_id = parent_snapshot_id;
|
|
1533
|
-
for (let try_count = 0;; try_count++) {
|
|
1534
|
-
const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
|
|
1535
|
-
const manifest_list_url = `s3://${bucket}/${manifest_list_key}`;
|
|
1536
|
-
if (old_list_key) {
|
|
1537
|
-
await updateManifestList({
|
|
1538
|
-
credentials,
|
|
1539
|
-
region,
|
|
1540
|
-
bucket,
|
|
1541
|
-
key: old_list_key,
|
|
1542
|
-
outKey: manifest_list_key,
|
|
1543
|
-
metadata: {
|
|
1544
|
-
'sequence-number': String(sequence_number),
|
|
1545
|
-
'snapshot-id': String(snapshot_id),
|
|
1546
|
-
'parent-snapshot-id': String(parent_snapshot_id),
|
|
1547
|
-
},
|
|
1548
|
-
prepend: records,
|
|
1549
|
-
});
|
|
1550
|
-
}
|
|
1551
|
-
else {
|
|
1552
|
-
const manifest_list_buf = await avroToBuffer({
|
|
1553
|
-
type: ManifestListType,
|
|
1554
|
-
metadata: {
|
|
1555
|
-
'sequence-number': String(sequence_number),
|
|
1556
|
-
'snapshot-id': String(snapshot_id),
|
|
1557
|
-
'parent-snapshot-id': 'null',
|
|
1558
|
-
},
|
|
1559
|
-
records,
|
|
1560
|
-
});
|
|
1561
|
-
await writeS3File({
|
|
1562
|
-
credentials,
|
|
1563
|
-
region,
|
|
1564
|
-
bucket,
|
|
1565
|
-
key: manifest_list_key,
|
|
1566
|
-
body: manifest_list_buf,
|
|
1567
|
-
});
|
|
1568
|
-
}
|
|
1569
|
-
try {
|
|
1570
|
-
const updates = [
|
|
1571
|
-
{
|
|
1572
|
-
action: 'add-snapshot',
|
|
1573
|
-
snapshot: {
|
|
1574
|
-
'sequence-number': sequence_number,
|
|
1575
|
-
'snapshot-id': snapshot_id,
|
|
1576
|
-
'parent-snapshot-id': parent_snapshot_id,
|
|
1577
|
-
'timestamp-ms': Date.now(),
|
|
1578
|
-
summary: {
|
|
1579
|
-
operation: 'append',
|
|
1580
|
-
'added-data-files': String(added_files),
|
|
1581
|
-
'added-records': String(added_records),
|
|
1582
|
-
'added-files-size': String(added_size),
|
|
1583
|
-
},
|
|
1584
|
-
'manifest-list': manifest_list_url,
|
|
1585
|
-
'schema-id': metadata['current-schema-id'],
|
|
1586
|
-
},
|
|
1587
|
-
},
|
|
1588
|
-
{
|
|
1589
|
-
action: 'set-snapshot-ref',
|
|
1590
|
-
'snapshot-id': snapshot_id,
|
|
1591
|
-
type: 'branch',
|
|
1592
|
-
'ref-name': 'main',
|
|
1593
|
-
},
|
|
1594
|
-
];
|
|
1595
|
-
if (remove_snapshot_id > 0n) {
|
|
1596
|
-
updates.push({
|
|
1597
|
-
action: 'remove-snapshots',
|
|
1598
|
-
'snapshot-ids': [remove_snapshot_id],
|
|
1599
|
-
});
|
|
1600
|
-
}
|
|
1601
|
-
const result = await icebergRequest({
|
|
1602
|
-
credentials: params.credentials,
|
|
1603
|
-
tableBucketARN: params.tableBucketARN,
|
|
1604
|
-
method: 'POST',
|
|
1605
|
-
suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
|
|
1606
|
-
body: {
|
|
1607
|
-
requirements: expected_snapshot_id > 0n
|
|
1608
|
-
? [
|
|
1609
|
-
{
|
|
1610
|
-
type: 'assert-ref-snapshot-id',
|
|
1611
|
-
ref: 'main',
|
|
1612
|
-
'snapshot-id': expected_snapshot_id,
|
|
1613
|
-
},
|
|
1614
|
-
]
|
|
1615
|
-
: [],
|
|
1616
|
-
updates,
|
|
1617
|
-
},
|
|
1618
|
-
});
|
|
1619
|
-
return {
|
|
1620
|
-
result,
|
|
1621
|
-
retriesNeeded: try_count,
|
|
1622
|
-
parentSnapshotId: parent_snapshot_id,
|
|
1623
|
-
snapshotId: snapshot_id,
|
|
1624
|
-
sequenceNumber: sequence_number,
|
|
1625
|
-
};
|
|
1626
|
-
}
|
|
1627
|
-
catch (e) {
|
|
1628
|
-
if (e instanceof IcebergHttpError &&
|
|
1629
|
-
e.status === 409 &&
|
|
1630
|
-
try_count < retry_max) {
|
|
1631
|
-
// retry case
|
|
1632
|
-
remove_snapshot_id = 0n;
|
|
1633
|
-
}
|
|
1634
|
-
else {
|
|
1635
|
-
throw e;
|
|
1636
|
-
}
|
|
1637
|
-
}
|
|
1638
|
-
// we do a merge in the append only simultanious case
|
|
1639
|
-
const conflict_metadata = await getMetadata(params);
|
|
1640
|
-
const conflict_snapshot_id = BigInt(conflict_metadata['current-snapshot-id']);
|
|
1641
|
-
if (conflict_snapshot_id <= 0n) {
|
|
1642
|
-
throw new Error('conflict');
|
|
1643
|
-
}
|
|
1644
|
-
const conflict_snap = conflict_metadata.snapshots.find((s) => s['snapshot-id'] === conflict_snapshot_id);
|
|
1645
|
-
if (!conflict_snap) {
|
|
1646
|
-
throw new Error('conflict');
|
|
1647
|
-
}
|
|
1648
|
-
if (conflict_snap.summary.operation === 'append' &&
|
|
1649
|
-
BigInt(conflict_snap['sequence-number']) === sequence_number) {
|
|
1650
|
-
old_list_key = parseS3Url(conflict_snap['manifest-list']).key;
|
|
1651
|
-
if (!old_list_key) {
|
|
1652
|
-
throw new Error('conflict');
|
|
1653
|
-
}
|
|
1654
|
-
added_files += parseInt(conflict_snap.summary['added-data-files'] ?? '0', 10);
|
|
1655
|
-
added_records += BigInt(conflict_snap.summary['added-records'] ?? '0');
|
|
1656
|
-
added_size += BigInt(conflict_snap.summary['added-files-size'] ?? '0');
|
|
1657
|
-
expected_snapshot_id = conflict_snapshot_id;
|
|
1658
|
-
sequence_number++;
|
|
1659
|
-
}
|
|
1660
|
-
else {
|
|
1661
|
-
throw new Error('conflict');
|
|
1662
|
-
}
|
|
1663
|
-
}
|
|
1664
|
-
}
|
|
1665
|
-
function _randomBigInt64$1() {
|
|
1666
|
-
const bytes = node_crypto.randomBytes(8);
|
|
1667
|
-
let ret = bytes.readBigUInt64BE();
|
|
1668
|
-
ret &= BigInt('0x7FFFFFFFFFFFFFFF');
|
|
1669
|
-
if (ret === 0n) {
|
|
1670
|
-
ret = 1n;
|
|
1671
|
-
}
|
|
1672
|
-
return ret;
|
|
1673
|
-
}
|
|
1674
|
-
|
|
1675
1522
|
const DEFAULT_RETRY_COUNT = 5;
|
|
1676
1523
|
async function submitSnapshot(params) {
|
|
1677
1524
|
const { snapshotId, parentSnapshotId, resolveConflict } = params;
|
|
@@ -1793,26 +1640,296 @@ async function setCurrentCommit(params) {
|
|
|
1793
1640
|
return commit_result;
|
|
1794
1641
|
}
|
|
1795
1642
|
|
|
1796
|
-
async function
|
|
1643
|
+
async function addDataFiles(params) {
|
|
1644
|
+
const { credentials } = params;
|
|
1645
|
+
const region = params.tableBucketARN.split(':')[3];
|
|
1646
|
+
if (!region) {
|
|
1647
|
+
throw new Error('bad tableBucketARN');
|
|
1648
|
+
}
|
|
1649
|
+
const snapshot_id = params.snapshotId ?? _randomBigInt64$1();
|
|
1650
|
+
const metadata = await getMetadata(params);
|
|
1651
|
+
const bucket = metadata.location.split('/').slice(-1)[0];
|
|
1652
|
+
if (!bucket) {
|
|
1653
|
+
throw new Error('bad manifest location');
|
|
1654
|
+
}
|
|
1655
|
+
const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
|
|
1656
|
+
const snapshot = metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === parent_snapshot_id) ?? null;
|
|
1657
|
+
if (parent_snapshot_id > 0n && !snapshot) {
|
|
1658
|
+
throw new Error('no old snapshot');
|
|
1659
|
+
}
|
|
1660
|
+
let old_list_key = snapshot ? parseS3Url(snapshot['manifest-list']).key : '';
|
|
1661
|
+
if (snapshot && !old_list_key) {
|
|
1662
|
+
throw new Error('last snapshot invalid');
|
|
1663
|
+
}
|
|
1664
|
+
let sequence_number = BigInt(metadata['last-sequence-number']) + 1n;
|
|
1665
|
+
let remove_snapshot_id = 0n;
|
|
1666
|
+
if (params.maxSnapshots && metadata.snapshots.length >= params.maxSnapshots) {
|
|
1667
|
+
let earliest_time = 0;
|
|
1668
|
+
for (const snap of metadata.snapshots) {
|
|
1669
|
+
const snap_time = snap['timestamp-ms'];
|
|
1670
|
+
if (earliest_time === 0 || snap_time < earliest_time) {
|
|
1671
|
+
earliest_time = snap_time;
|
|
1672
|
+
remove_snapshot_id = BigInt(snap['snapshot-id']);
|
|
1673
|
+
}
|
|
1674
|
+
}
|
|
1675
|
+
}
|
|
1676
|
+
let added_files = 0;
|
|
1677
|
+
let added_records = 0n;
|
|
1678
|
+
let added_size = 0n;
|
|
1679
|
+
const records = await Promise.all(params.lists.map(async (list) => {
|
|
1680
|
+
added_files += list.files.length;
|
|
1681
|
+
for (const file of list.files) {
|
|
1682
|
+
added_records += file.recordCount;
|
|
1683
|
+
added_size += file.fileSize;
|
|
1684
|
+
}
|
|
1685
|
+
const opts = {
|
|
1686
|
+
credentials,
|
|
1687
|
+
region,
|
|
1688
|
+
metadata,
|
|
1689
|
+
schemaId: list.schemaId,
|
|
1690
|
+
specId: list.specId,
|
|
1691
|
+
snapshotId: snapshot_id,
|
|
1692
|
+
sequenceNumber: sequence_number,
|
|
1693
|
+
files: list.files,
|
|
1694
|
+
};
|
|
1695
|
+
return addManifest(opts);
|
|
1696
|
+
}));
|
|
1697
|
+
async function createManifestList() {
|
|
1698
|
+
if (!bucket) {
|
|
1699
|
+
throw new Error('bad manifest location');
|
|
1700
|
+
}
|
|
1701
|
+
if (!region) {
|
|
1702
|
+
throw new Error('bad tableBucketARN');
|
|
1703
|
+
}
|
|
1704
|
+
const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
|
|
1705
|
+
const url = `s3://${bucket}/${manifest_list_key}`;
|
|
1706
|
+
if (old_list_key) {
|
|
1707
|
+
await updateManifestList({
|
|
1708
|
+
credentials,
|
|
1709
|
+
region,
|
|
1710
|
+
bucket,
|
|
1711
|
+
key: old_list_key,
|
|
1712
|
+
outKey: manifest_list_key,
|
|
1713
|
+
metadata: {
|
|
1714
|
+
'sequence-number': String(sequence_number),
|
|
1715
|
+
'snapshot-id': String(snapshot_id),
|
|
1716
|
+
'parent-snapshot-id': String(parent_snapshot_id),
|
|
1717
|
+
},
|
|
1718
|
+
prepend: records,
|
|
1719
|
+
});
|
|
1720
|
+
}
|
|
1721
|
+
else {
|
|
1722
|
+
const manifest_list_buf = await avroToBuffer({
|
|
1723
|
+
type: ManifestListType,
|
|
1724
|
+
metadata: {
|
|
1725
|
+
'sequence-number': String(sequence_number),
|
|
1726
|
+
'snapshot-id': String(snapshot_id),
|
|
1727
|
+
'parent-snapshot-id': 'null',
|
|
1728
|
+
},
|
|
1729
|
+
records,
|
|
1730
|
+
});
|
|
1731
|
+
await writeS3File({
|
|
1732
|
+
credentials,
|
|
1733
|
+
region,
|
|
1734
|
+
bucket,
|
|
1735
|
+
key: manifest_list_key,
|
|
1736
|
+
body: manifest_list_buf,
|
|
1737
|
+
});
|
|
1738
|
+
}
|
|
1739
|
+
return url;
|
|
1740
|
+
}
|
|
1741
|
+
const manifest_list_url = await createManifestList();
|
|
1742
|
+
// Conflict handler handed to submitSnapshot. If the commit lost a race to
// another writer, this gets the winning snapshot and may produce a retry
// payload; any unrecoverable situation throws 'conflict'.
// NOTE(review): the retry semantics depend on submitSnapshot's contract,
// which is defined elsewhere — presumably it re-submits with the returned
// manifestListUrl/summary; confirm against submitSnapshot.
async function resolveConflict(conflict_snap) {
    // Only recoverable case: the competing snapshot is also an 'append'
    // that claimed exactly the sequence number we were about to use.
    if (conflict_snap.summary.operation === 'append' &&
        BigInt(conflict_snap['sequence-number']) === sequence_number) {
        // Re-anchor on the winner's manifest list so our new entries are
        // prepended on top of it (createManifestList reads old_list_key).
        old_list_key = parseS3Url(conflict_snap['manifest-list']).key;
        if (!old_list_key) {
            throw new Error('conflict');
        }
        // Fold the winner's stats into our running totals; missing summary
        // fields default to '0'.
        added_files += parseInt(conflict_snap.summary['added-data-files'] ?? '0', 10);
        added_records += BigInt(conflict_snap.summary['added-records'] ?? '0');
        added_size += BigInt(conflict_snap.summary['added-files-size'] ?? '0');
        // Advance past the sequence number the winner consumed.
        sequence_number++;
        // Rebuild the manifest list against the winner's list and updated counters.
        const url = await createManifestList();
        return {
            manifestListUrl: url,
            summary: {
                operation: 'append',
                'added-data-files': String(added_files),
                'added-records': String(added_records),
                'added-files-size': String(added_size),
            },
        };
    }
    // Non-append winner or mismatched sequence number: cannot reconcile.
    throw new Error('conflict');
}
|
|
1766
|
+
return submitSnapshot({
|
|
1767
|
+
credentials,
|
|
1768
|
+
tableBucketARN: params.tableBucketARN,
|
|
1769
|
+
namespace: params.namespace,
|
|
1770
|
+
name: params.name,
|
|
1771
|
+
currentSchemaId: metadata['current-schema-id'],
|
|
1772
|
+
parentSnapshotId: parent_snapshot_id,
|
|
1773
|
+
snapshotId: snapshot_id,
|
|
1774
|
+
sequenceNumber: sequence_number,
|
|
1775
|
+
retryCount: params.retryCount,
|
|
1776
|
+
removeSnapshotId: remove_snapshot_id,
|
|
1777
|
+
manifestListUrl: manifest_list_url,
|
|
1778
|
+
summary: {
|
|
1779
|
+
operation: 'append',
|
|
1780
|
+
'added-data-files': String(added_files),
|
|
1781
|
+
'added-records': String(added_records),
|
|
1782
|
+
'added-files-size': String(added_size),
|
|
1783
|
+
},
|
|
1784
|
+
resolveConflict,
|
|
1785
|
+
});
|
|
1786
|
+
}
|
|
1787
|
+
/**
 * Produce a random positive bigint that fits in a signed 64-bit integer.
 * Zero is remapped to 1 so the result is never 0n.
 */
function _randomBigInt64$1() {
    // Draw 8 random bytes and clear the sign bit so the value is a
    // non-negative int64.
    const raw = node_crypto.randomBytes(8).readBigUInt64BE() & 0x7fffffffffffffffn;
    // 0 is reserved; substitute 1.
    return raw === 0n ? 1n : raw;
}
|
|
1796
|
+
|
|
1797
|
+
/**
 * Import every data file referenced by a Redshift UNLOAD manifest into the
 * table as new data files.
 *
 * Flow: download and parse the manifest, resolve the destination bucket from
 * the table metadata location, copy each entry into a fresh per-import
 * prefix (unless it already lives in the destination bucket), group files by
 * (spec-id, schema-id), and hand the groups to addDataFiles.
 *
 * params: credentials, tableBucketARN, namespace, name, redshiftManifestUrl,
 *         optional specId / schemaId overrides, optional retryCount.
 * Throws on a malformed ARN or an unusable metadata location.
 */
async function importRedshiftManifest(params) {
    const { credentials } = params;
    // Region is the 4th ':'-separated component of the table bucket ARN.
    const region = params.tableBucketARN.split(':')[3];
    if (!region) {
        throw new Error('bad tableBucketARN');
    }
    const manifest = await _downloadRedshift(params);
    const metadata = await getMetadata(params);
    // Destination bucket is the final path segment of metadata.location.
    const bucket = metadata.location.split('/').slice(-1)[0];
    if (!bucket) {
        throw new Error('bad manifest location');
    }
    // Every imported object lands under a unique per-import prefix.
    const dest_prefix = `data/${node_crypto.randomUUID()}/`;
    const grouped = [];
    for (const entry of manifest.entries) {
        const { url } = entry;
        const { content_length, record_count } = entry.meta;
        const file_name = url.split('/').pop() ?? '';
        // Hive-style "key=value" path segments carry the partition values.
        const segments = [...url.matchAll(/\/([^=/]*=[^/=]*)/g)].map((m) => m[1] ?? '');
        const partitions = {};
        for (const segment of segments) {
            const [seg_key, seg_value] = segment.split('=');
            partitions[seg_key ?? ''] = seg_value ?? '';
        }
        const partition_keys = Object.keys(partitions);
        // Explicit overrides win; otherwise resolve from the table metadata.
        const specId = params.specId ?? _findSpec(metadata, partition_keys);
        const schemaId = params.schemaId ?? _findSchema(metadata, manifest);
        // Files are batched per (spec, schema) pair for addDataFiles.
        let batch = grouped.find((g) => g.schemaId === schemaId && g.specId === specId);
        if (batch === undefined) {
            batch = { specId, schemaId, files: [] };
            grouped.push(batch);
        }
        const partition_path = segments.length > 0 ? `${segments.join('/')}/` : '';
        const key = dest_prefix + partition_path + file_name;
        batch.files.push({
            // Copies the object into the table bucket unless already there.
            file: await _maybeMoveFile({ credentials, region, bucket, key, url }),
            partitions,
            fileSize: BigInt(content_length),
            recordCount: BigInt(record_count),
        });
    }
    return addDataFiles({
        credentials,
        tableBucketARN: params.tableBucketARN,
        namespace: params.namespace,
        name: params.name,
        lists: grouped,
        retryCount: params.retryCount,
    });
}
|
|
1847
|
+
/**
 * Fetch the Redshift UNLOAD manifest object from S3 and parse it.
 * Throws 'missing body' when the object comes back empty.
 * NOTE(review): `parse` is a module-level helper defined elsewhere —
 * presumably it validates/deserializes the manifest JSON.
 */
async function _downloadRedshift(params) {
    const client = getS3Client(params);
    const { bucket, key } = parseS3Url(params.redshiftManifestUrl);
    const command = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
    const response = await client.send(command);
    const text = await response.Body?.transformToString();
    if (!text) {
        throw new Error('missing body');
    }
    return parse(text);
}
|
|
1858
|
+
/**
 * Ensure a source data file is available in the destination bucket.
 *
 * If the file already lives in `params.bucket` its original URL is returned
 * unchanged; otherwise the object is streamed from its source bucket into
 * `params.bucket` at `params.key` and the new s3:// URL is returned.
 * Throws on an unparseable entry URL or a missing response body.
 */
async function _maybeMoveFile(params) {
    const { bucket: src_bucket, key: src_key } = parseS3Url(params.url);
    if (!src_bucket || !src_key) {
        throw new Error(`bad entry url: ${params.url}`);
    }
    // Already in the destination bucket — nothing to copy.
    if (src_bucket === params.bucket) {
        return params.url;
    }
    const client = getS3Client(params);
    const { Body } = await client.send(new clientS3.GetObjectCommand({ Bucket: src_bucket, Key: src_key }));
    if (!Body) {
        throw new Error(`body missing for file: ${params.url}`);
    }
    // Stream across buckets; lib-storage Upload manages multipart chunking.
    const copy = new libStorage.Upload({
        client,
        params: { Bucket: params.bucket, Key: params.key, Body },
    });
    await copy.done();
    return `s3://${params.bucket}/${params.key}`;
}
|
|
1879
|
+
/**
 * Resolve a partition spec id from a set of partition key names.
 * An empty key set maps to the default spec (0). Otherwise the first spec
 * whose fields are exactly the given keys (same count, every key present)
 * wins. Throws when no spec matches.
 */
function _findSpec(metadata, keys) {
    // Unpartitioned entries always use the default spec.
    if (keys.length === 0) {
        return 0;
    }
    const match = metadata['partition-specs'].find((spec) =>
        spec.fields.length === keys.length &&
        keys.every((key) => spec.fields.some((field) => field.name === key)));
    if (match) {
        return match['spec-id'];
    }
    throw new Error(`spec not found for keys ${keys.join(', ')}`);
}
|
|
1892
|
+
/**
 * Resolve a schema id compatible with the manifest's column elements.
 * A schema qualifies when every one of its required fields appears (by name)
 * in manifest.schema.elements; the first qualifying schema wins.
 * Throws when none qualifies.
 */
function _findSchema(metadata, manifest) {
    const { elements } = manifest.schema;
    const match = metadata.schemas.find((schema) =>
        schema.fields.every((field) =>
            !field.required || elements.some((el) => el.name === field.name)));
    if (match) {
        return match['schema-id'];
    }
    throw new Error('schema not found for schema.elements');
}
|
|
1901
|
+
|
|
1902
|
+
async function* asyncIterMap(items, limit, func) {
|
|
1797
1903
|
const pending = new Set();
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
const
|
|
1801
|
-
|
|
1802
|
-
value
|
|
1803
|
-
|
|
1804
|
-
|
|
1805
|
-
|
|
1904
|
+
let index = 0;
|
|
1905
|
+
function enqueue() {
|
|
1906
|
+
const item = items[index++];
|
|
1907
|
+
if (item !== undefined) {
|
|
1908
|
+
const result = { promise: undefined, value: undefined };
|
|
1909
|
+
const promise = func(item).then((value) => {
|
|
1910
|
+
result.value = value;
|
|
1911
|
+
return result;
|
|
1912
|
+
});
|
|
1913
|
+
result.promise = promise;
|
|
1914
|
+
pending.add(promise);
|
|
1915
|
+
}
|
|
1916
|
+
}
|
|
1917
|
+
for (let i = 0; i < limit && i < items.length; i++) {
|
|
1918
|
+
enqueue();
|
|
1806
1919
|
}
|
|
1807
1920
|
while (pending.size) {
|
|
1808
|
-
const {
|
|
1809
|
-
if (
|
|
1810
|
-
pending.delete(
|
|
1921
|
+
const { promise, value } = await Promise.race(pending);
|
|
1922
|
+
if (promise) {
|
|
1923
|
+
pending.delete(promise);
|
|
1811
1924
|
}
|
|
1812
|
-
|
|
1925
|
+
if (value !== undefined) {
|
|
1926
|
+
yield value;
|
|
1927
|
+
}
|
|
1928
|
+
enqueue();
|
|
1813
1929
|
}
|
|
1814
1930
|
}
|
|
1815
1931
|
|
|
1932
|
+
// Upper bound on concurrently in-flight tasks when asyncIterMap is used to
// read/write manifests during compaction.
const ITER_LIMIT = 10;
|
|
1816
1933
|
async function manifestCompact(params) {
|
|
1817
1934
|
const { credentials, targetCount, calculateWeight } = params;
|
|
1818
1935
|
const region = params.tableBucketARN.split(':')[3];
|
|
@@ -1835,6 +1952,7 @@ async function manifestCompact(params) {
|
|
|
1835
1952
|
snapshotId: 0n,
|
|
1836
1953
|
sequenceNumber: 0n,
|
|
1837
1954
|
changed: false,
|
|
1955
|
+
inputManifestCount: 0,
|
|
1838
1956
|
outputManifestCount: 0,
|
|
1839
1957
|
};
|
|
1840
1958
|
}
|
|
@@ -1901,11 +2019,12 @@ async function manifestCompact(params) {
|
|
|
1901
2019
|
snapshotId: 0n,
|
|
1902
2020
|
sequenceNumber: sequence_number,
|
|
1903
2021
|
changed: false,
|
|
2022
|
+
inputManifestCount: list.length,
|
|
1904
2023
|
outputManifestCount: 0,
|
|
1905
2024
|
};
|
|
1906
2025
|
}
|
|
1907
2026
|
const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
|
|
1908
|
-
const iter = asyncIterMap(final_groups, async (group) => {
|
|
2027
|
+
const iter = asyncIterMap(final_groups, ITER_LIMIT, async (group) => {
|
|
1909
2028
|
if (!group[0]) {
|
|
1910
2029
|
return [];
|
|
1911
2030
|
}
|
|
@@ -1966,19 +2085,21 @@ async function manifestCompact(params) {
|
|
|
1966
2085
|
return {
|
|
1967
2086
|
...snap_result,
|
|
1968
2087
|
changed: true,
|
|
2088
|
+
inputManifestCount: list.length,
|
|
1969
2089
|
outputManifestCount: final_groups.length,
|
|
1970
2090
|
};
|
|
1971
2091
|
}
|
|
1972
2092
|
async function _combineGroup(params) {
|
|
1973
|
-
const { credentials, region, bucket, group } = params;
|
|
2093
|
+
const { credentials, region, bucket, group, spec } = params;
|
|
1974
2094
|
const record0 = group[0];
|
|
1975
2095
|
if ((group.length === 1 && !params.forceRewrite) || !record0) {
|
|
1976
2096
|
return group;
|
|
1977
2097
|
}
|
|
1978
2098
|
const key = `metadata/${node_crypto.randomUUID()}.avro`;
|
|
1979
|
-
const
|
|
1980
|
-
const
|
|
1981
|
-
const
|
|
2099
|
+
const icebergSchema = _schemaForSpec(params.schemas, spec);
|
|
2100
|
+
const schema = makeManifestSchema(spec, params.schemas, true);
|
|
2101
|
+
const type = makeManifestType(spec, params.schemas, true);
|
|
2102
|
+
const iter = asyncIterMap(group, ITER_LIMIT, async (record) => {
|
|
1982
2103
|
return _streamReadManifest({
|
|
1983
2104
|
credentials,
|
|
1984
2105
|
region,
|
|
@@ -1993,8 +2114,8 @@ async function _combineGroup(params) {
|
|
|
1993
2114
|
bucket,
|
|
1994
2115
|
key,
|
|
1995
2116
|
metadata: {
|
|
1996
|
-
'partition-spec-id': String(
|
|
1997
|
-
'partition-spec': JSON.stringify(
|
|
2117
|
+
'partition-spec-id': String(spec['spec-id']),
|
|
2118
|
+
'partition-spec': JSON.stringify(spec.fields),
|
|
1998
2119
|
},
|
|
1999
2120
|
avroType: type,
|
|
2000
2121
|
iter,
|
|
@@ -2030,22 +2151,15 @@ async function _combineGroup(params) {
|
|
|
2030
2151
|
for (let j = 0; j < parts.length; j++) {
|
|
2031
2152
|
const part = parts[j];
|
|
2032
2153
|
const ret_part = ret.partitions[j];
|
|
2033
|
-
|
|
2154
|
+
const field = spec.fields[i];
|
|
2155
|
+
if (part && ret_part && field) {
|
|
2034
2156
|
ret_part.contains_null ||= part.contains_null;
|
|
2035
2157
|
if (part.contains_nan !== undefined) {
|
|
2036
2158
|
ret_part.contains_nan =
|
|
2037
2159
|
(ret_part.contains_nan ?? false) || part.contains_nan;
|
|
2038
2160
|
}
|
|
2039
|
-
|
|
2040
|
-
|
|
2041
|
-
Buffer.compare(part.upper_bound, ret_part.upper_bound) > 0)) {
|
|
2042
|
-
ret_part.upper_bound = part.upper_bound ?? null;
|
|
2043
|
-
}
|
|
2044
|
-
if (!ret_part.lower_bound ||
|
|
2045
|
-
(part.lower_bound &&
|
|
2046
|
-
Buffer.compare(part.lower_bound, ret_part.lower_bound) < 0)) {
|
|
2047
|
-
ret_part.lower_bound = part.lower_bound ?? null;
|
|
2048
|
-
}
|
|
2161
|
+
ret_part.upper_bound = maxBuffer(ret_part.upper_bound, part.upper_bound, field, icebergSchema);
|
|
2162
|
+
ret_part.lower_bound = minBuffer(ret_part.lower_bound, part.lower_bound, field, icebergSchema);
|
|
2049
2163
|
}
|
|
2050
2164
|
}
|
|
2051
2165
|
}
|
|
@@ -2131,6 +2245,14 @@ function _combineWeightGroups(groups, targetCount, calculateWeight) {
|
|
|
2131
2245
|
/** Ascending comparator on `weight`, for use with Array.prototype.sort. */
function _sortGroup(a, b) {
    const delta = a.weight - b.weight;
    return delta;
}
|
|
2248
|
+
/**
 * Find the first schema that declares every source field referenced by the
 * partition spec (matched by field id against the spec's 'source-id').
 * Throws when no schema covers the spec.
 */
function _schemaForSpec(schemas, spec) {
    const match = schemas.find((schema) =>
        spec.fields.every((specField) =>
            schema.fields.some((schemaField) => schemaField.id === specField['source-id'])));
    if (!match) {
        throw new Error(`schema not found for spec: ${spec['spec-id']}`);
    }
    return match;
}
|
|
2134
2256
|
function _randomBigInt64() {
|
|
2135
2257
|
const bytes = node_crypto.randomBytes(8);
|
|
2136
2258
|
let ret = bytes.readBigUInt64BE();
|
|
@@ -2152,13 +2274,14 @@ function _bigintMin(value0, ...values) {
|
|
|
2152
2274
|
|
|
2153
2275
|
var index = {
|
|
2154
2276
|
IcebergHttpError,
|
|
2155
|
-
getMetadata,
|
|
2156
2277
|
addSchema,
|
|
2157
2278
|
addPartitionSpec,
|
|
2158
2279
|
addManifest,
|
|
2159
2280
|
addDataFiles,
|
|
2160
|
-
|
|
2281
|
+
getMetadata,
|
|
2282
|
+
importRedshiftManifest,
|
|
2161
2283
|
removeSnapshots,
|
|
2284
|
+
setCurrentCommit,
|
|
2162
2285
|
};
|
|
2163
2286
|
|
|
2164
2287
|
exports.IcebergHttpError = IcebergHttpError;
|
|
@@ -2168,7 +2291,10 @@ exports.addPartitionSpec = addPartitionSpec;
|
|
|
2168
2291
|
exports.addSchema = addSchema;
|
|
2169
2292
|
exports.default = index;
|
|
2170
2293
|
exports.getMetadata = getMetadata;
|
|
2294
|
+
exports.importRedshiftManifest = importRedshiftManifest;
|
|
2171
2295
|
exports.manifestCompact = manifestCompact;
|
|
2296
|
+
exports.maxBuffer = maxBuffer;
|
|
2297
|
+
exports.minBuffer = minBuffer;
|
|
2172
2298
|
exports.removeSnapshots = removeSnapshots;
|
|
2173
2299
|
exports.setCurrentCommit = setCurrentCommit;
|
|
2174
2300
|
exports.submitSnapshot = submitSnapshot;
|
package/package.json
CHANGED
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "node-s3tables",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.19",
|
|
4
4
|
"description": "node api for dealing with s3tables",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"types": "dist/index.d.ts",
|
|
7
7
|
"bin": {
|
|
8
|
-
"node-s3tables": "dist/bin
|
|
8
|
+
"node-s3tables": "dist/bin.js"
|
|
9
9
|
},
|
|
10
10
|
"files": [
|
|
11
11
|
"dist/**/*"
|
|
12
12
|
],
|
|
13
13
|
"scripts": {
|
|
14
|
+
"bin": "tsx src/bin.ts",
|
|
14
15
|
"build": "rollup -c",
|
|
15
16
|
"ts:check": "tsc --noEmit",
|
|
16
|
-
"lint": "eslint src test",
|
|
17
|
+
"lint": "eslint src test --fix",
|
|
17
18
|
"pretty": "prettier -u --write \"**/*\" --log-level warn",
|
|
18
19
|
"test": "dotenv -- tsx --test test/*.test.ts",
|
|
19
20
|
"test:single": "dotenv -- tsx --test",
|