node-s3tables 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin.js CHANGED
@@ -1,25 +1,116 @@
1
1
  #!/usr/bin/env node
2
2
  'use strict';
3
3
 
4
+ var node_util = require('node:util');
4
5
  var nodeS3tables = require('node-s3tables');
5
6
 
6
7
  /* eslint-disable no-console */
7
- const [tableBucketARN, namespace, name] = process.argv.slice(2);
8
- if (!tableBucketARN || !namespace || !name) {
9
- console.error('Usage: node-s3tables compact <tableBucketARN> <namespace> <name>');
8
+ const { positionals, values } = node_util.parseArgs({
9
+ allowPositionals: true,
10
+ options: {
11
+ 'force-rewrite': { type: 'boolean' },
12
+ 'spec-id': { type: 'string' },
13
+ 'schema-id': { type: 'string' },
14
+ files: { type: 'string' },
15
+ 'max-snapshots': { type: 'string' },
16
+ 'redshift-manifest-url': { type: 'string' },
17
+ },
18
+ });
19
+ const [command, tableBucketARN, namespace, name] = positionals;
20
+ if (!command || !tableBucketARN || !namespace || !name) {
21
+ console.error('Usage: node-s3tables <command> <tableBucketARN> <namespace> <name> [options]\n');
22
+ console.error('Commands:');
23
+ console.error(' compact Compact manifest files');
24
+ console.error(' Options: --force-rewrite');
25
+ console.error('');
26
+ console.error(' add_files Add data files to table');
27
+ console.error(' Options: --spec-id <id> --schema-id <id> --files <json> [--max-snapshots <n>]');
28
+ console.error(' Example: --files \'[{"file":"s3://bucket/data.parquet","partitions":{},"recordCount":"1000","fileSize":"52428"}]\'');
29
+ console.error('');
30
+ console.error(' import_redshift Import redshift manifest created by UNLOAD');
31
+ console.error(' Options: --redshift-manifest-url s3://s3table-bucket/unload/manfiest');
32
+ console.error('');
10
33
  process.exit(-1);
11
34
  }
12
- nodeS3tables.manifestCompact({ tableBucketARN, namespace, name })
13
- .then((result) => {
14
- console.log('Compact result:', result);
15
- process.exit(0);
16
- })
17
- .catch((error) => {
18
- if (error instanceof Error) {
19
- console.error('Error:', error.message);
35
+ if (command === 'compact') {
36
+ console.log('Compact:', tableBucketARN, namespace, name, 'forceRewrite:', Boolean(values['force-rewrite']));
37
+ nodeS3tables.manifestCompact({
38
+ tableBucketARN,
39
+ namespace,
40
+ name,
41
+ forceRewrite: Boolean(values['force-rewrite']),
42
+ })
43
+ .then((result) => {
44
+ console.log('Compact result:', result);
45
+ process.exit(0);
46
+ })
47
+ .catch((error) => {
48
+ console.error('Error:', error);
49
+ process.exit(1);
50
+ });
51
+ }
52
+ else if (command === 'add_files') {
53
+ const specId = values['spec-id'];
54
+ const schemaId = values['schema-id'];
55
+ const filesJson = values.files;
56
+ if (!specId || !schemaId || !filesJson) {
57
+ console.error('Error: Missing required options for add_files command\n');
58
+ console.error('Usage: node-s3tables add_files <tableBucketARN> <namespace> <name> --spec-id <id> --schema-id <id> --files <json> [--max-snapshots <n>]\n');
59
+ console.error('Example:');
60
+ console.error(' --spec-id 1 --schema-id 2 --files \'[{"file":"s3://bucket/data.parquet","partitions":{"date":"2024-01-01"},"recordCount":"1000","fileSize":"52428"}]\'');
61
+ process.exit(-1);
20
62
  }
21
- else {
63
+ const files = JSON.parse(filesJson);
64
+ const maxSnapshots = values['max-snapshots']
65
+ ? parseInt(values['max-snapshots'], 10)
66
+ : undefined;
67
+ console.log('Adding files:', tableBucketARN, namespace, name);
68
+ const params = {
69
+ tableBucketARN,
70
+ namespace,
71
+ name,
72
+ lists: [
73
+ { specId: parseInt(specId, 10), schemaId: parseInt(schemaId, 10), files },
74
+ ],
75
+ };
76
+ if (maxSnapshots !== undefined) {
77
+ params.maxSnapshots = maxSnapshots;
78
+ }
79
+ nodeS3tables.addDataFiles(params)
80
+ .then((result) => {
81
+ console.log('Add files result:', result);
82
+ process.exit(0);
83
+ })
84
+ .catch((error) => {
22
85
  console.error('Error:', error);
86
+ process.exit(1);
87
+ });
88
+ }
89
+ else if (command === 'import_redshift') {
90
+ const redshiftManifestUrl = values['redshift-manifest-url'];
91
+ if (!redshiftManifestUrl) {
92
+ console.error('Error: Missing required options for import_redshift command\n');
93
+ console.error('Usage: node-s3tables import_redshift <tableBucketARN> <namespace> <name> --redshift-manfiest-url <s3url>\n');
94
+ console.error('Example:');
95
+ console.error(' --redshift-manifest-url s3://s3table-bucket/exported_manfiest.json');
96
+ process.exit(-1);
23
97
  }
24
- process.exit(1);
25
- });
98
+ nodeS3tables.importRedshiftManifest({
99
+ tableBucketARN,
100
+ namespace,
101
+ name,
102
+ redshiftManifestUrl,
103
+ })
104
+ .then((result) => {
105
+ console.log('Import result:', result);
106
+ process.exit(0);
107
+ })
108
+ .catch((error) => {
109
+ console.error('Error:', error);
110
+ process.exit(1);
111
+ });
112
+ }
113
+ else {
114
+ console.error('Unknown command:', command);
115
+ process.exit(-1);
116
+ }
package/dist/index.d.ts CHANGED
@@ -120,6 +120,51 @@ interface AddManifestParams {
120
120
  files: AddFile[];
121
121
  }
122
122
  declare function addManifest(params: AddManifestParams): Promise<ManifestListRecord>;
123
+ declare function minBuffer(a: Buffer | null | undefined, b: Buffer | null | undefined, field: IcebergPartitionField, schema: IcebergSchema): Buffer | null;
124
+ declare function maxBuffer(a: Buffer | null | undefined, b: Buffer | null | undefined, field: IcebergPartitionField, schema: IcebergSchema): Buffer | null;
125
+
126
+ type JSONPrimitive = string | number | boolean | null | bigint | undefined;
127
+ type JSONValue = JSONPrimitive | JSONObject | JSONArray;
128
+ interface JSONObject {
129
+ [key: string]: JSONValue;
130
+ }
131
+ type JSONArray = JSONValue[];
132
+
133
+ interface AddFileList {
134
+ specId: number;
135
+ schemaId: number;
136
+ files: AddFile[];
137
+ }
138
+ interface AddDataFilesParams {
139
+ credentials?: AwsCredentialIdentity | undefined;
140
+ tableBucketARN: string;
141
+ namespace: string;
142
+ name: string;
143
+ snapshotId?: bigint;
144
+ lists: AddFileList[];
145
+ retryCount?: number | undefined;
146
+ maxSnapshots?: number;
147
+ }
148
+ interface AddDataFilesResult {
149
+ result: JSONObject;
150
+ retriesNeeded: number;
151
+ parentSnapshotId: bigint;
152
+ snapshotId: bigint;
153
+ sequenceNumber: bigint;
154
+ }
155
+ declare function addDataFiles(params: AddDataFilesParams): Promise<AddDataFilesResult>;
156
+
157
+ interface ImportRedshiftManifestParams {
158
+ credentials?: AwsCredentialIdentity;
159
+ tableBucketARN: string;
160
+ namespace: string;
161
+ name: string;
162
+ redshiftManifestUrl: string;
163
+ schemaId?: number;
164
+ specId?: number;
165
+ retryCount?: number | undefined;
166
+ }
167
+ declare function importRedshiftManifest(params: ImportRedshiftManifestParams): Promise<AddDataFilesResult>;
123
168
 
124
169
  type TableLocation = {
125
170
  tableArn: string;
@@ -164,36 +209,12 @@ interface RemoveSnapshotsParams {
164
209
  }
165
210
  declare function removeSnapshots(params: RemoveSnapshotsParams): Promise<IcebergUpdateResponse>;
166
211
 
167
- type JSONPrimitive = string | number | boolean | null | bigint | undefined;
168
- type JSONValue = JSONPrimitive | JSONObject | JSONArray;
169
- interface JSONObject {
170
- [key: string]: JSONValue;
171
- }
172
- type JSONArray = JSONValue[];
173
-
174
- interface AddFileList {
175
- specId: number;
176
- schemaId: number;
177
- files: AddFile[];
178
- }
179
- interface AddDataFilesParams {
180
- credentials?: AwsCredentialIdentity;
181
- tableBucketARN: string;
182
- namespace: string;
183
- name: string;
184
- snapshotId?: bigint;
185
- lists: AddFileList[];
186
- retryCount?: number;
187
- maxSnapshots?: number;
188
- }
189
- interface AddDataFilesResult {
190
- result: JSONObject;
191
- retriesNeeded: number;
192
- parentSnapshotId: bigint;
193
- snapshotId: bigint;
194
- sequenceNumber: bigint;
212
+ declare class IcebergHttpError extends Error {
213
+ status: number;
214
+ text?: string;
215
+ body?: JSONObject;
216
+ constructor(status: number, body: JSONValue, message: string);
195
217
  }
196
- declare function addDataFiles(params: AddDataFilesParams): Promise<AddDataFilesResult>;
197
218
 
198
219
  interface SubmitSnapshotParams {
199
220
  credentials?: AwsCredentialIdentity | undefined;
@@ -231,13 +252,6 @@ interface SetCurrentCommitParams {
231
252
  }
232
253
  declare function setCurrentCommit(params: SetCurrentCommitParams): Promise<JSONObject>;
233
254
 
234
- declare class IcebergHttpError extends Error {
235
- status: number;
236
- text?: string;
237
- body?: JSONObject;
238
- constructor(status: number, body: JSONValue, message: string);
239
- }
240
-
241
255
  type CalculateWeightFunction = (group: ManifestListRecord[]) => number;
242
256
  interface ManifestCompactParams {
243
257
  credentials?: AwsCredentialIdentity;
@@ -253,20 +267,22 @@ interface ManifestCompactParams {
253
267
  }
254
268
  interface ManifestCompactResult extends SubmitSnapshotResult {
255
269
  changed: boolean;
270
+ inputManifestCount: number;
256
271
  outputManifestCount: number;
257
272
  }
258
273
  declare function manifestCompact(params: ManifestCompactParams): Promise<ManifestCompactResult>;
259
274
 
260
275
  declare const _default: {
261
276
  IcebergHttpError: typeof IcebergHttpError;
262
- getMetadata: typeof getMetadata;
263
277
  addSchema: typeof addSchema;
264
278
  addPartitionSpec: typeof addPartitionSpec;
265
279
  addManifest: typeof addManifest;
266
280
  addDataFiles: typeof addDataFiles;
267
- setCurrentCommit: typeof setCurrentCommit;
281
+ getMetadata: typeof getMetadata;
282
+ importRedshiftManifest: typeof importRedshiftManifest;
268
283
  removeSnapshots: typeof removeSnapshots;
284
+ setCurrentCommit: typeof setCurrentCommit;
269
285
  };
270
286
 
271
- export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, manifestCompact, removeSnapshots, setCurrentCommit, submitSnapshot };
272
- export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, CalculateWeightFunction, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, ManifestCompactParams, ManifestCompactResult, ManifestListRecord, RemoveSnapshotsParams, ResolveConflictResult, SetCurrentCommitParams, SubmitSnapshotParams, SubmitSnapshotResult, TableLocation };
287
+ export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, importRedshiftManifest, manifestCompact, maxBuffer, minBuffer, removeSnapshots, setCurrentCommit, submitSnapshot };
288
+ export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, CalculateWeightFunction, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, ImportRedshiftManifestParams, ManifestCompactParams, ManifestCompactResult, ManifestListRecord, RemoveSnapshotsParams, ResolveConflictResult, SetCurrentCommitParams, SubmitSnapshotParams, SubmitSnapshotResult, TableLocation };
package/dist/index.js CHANGED
@@ -803,6 +803,37 @@ function makeBounds(partitions, spec, schema) {
803
803
  return _encodeValue(raw, f.transform, out_type);
804
804
  });
805
805
  }
806
+ function compareBounds(a, b, field, schema) {
807
+ const schemaField = schema.fields.find((sf) => sf.id === field['source-id']);
808
+ if (!schemaField) {
809
+ throw new Error(`Schema field not found for source-id ${field['source-id']}`);
810
+ }
811
+ const out_type = _outputType(field.transform, schemaField.type);
812
+ switch (out_type) {
813
+ case 'boolean':
814
+ return Buffer.from(a).readUInt8() - Buffer.from(b).readUInt8();
815
+ case 'int':
816
+ return Buffer.from(a).readInt32LE() - Buffer.from(b).readInt32LE();
817
+ case 'long': {
818
+ const diff = Buffer.from(a).readBigInt64LE() - Buffer.from(b).readBigInt64LE();
819
+ return diff > 0n ? 1 : diff < 0n ? -1 : 0;
820
+ }
821
+ case 'float':
822
+ return Buffer.from(a).readFloatLE() - Buffer.from(b).readFloatLE();
823
+ case 'double':
824
+ return Buffer.from(a).readDoubleLE() - Buffer.from(b).readDoubleLE();
825
+ case null:
826
+ case 'date':
827
+ case 'time':
828
+ case 'timestamp':
829
+ case 'timestamptz':
830
+ case 'string':
831
+ case 'uuid':
832
+ case 'binary':
833
+ default:
834
+ return Buffer.compare(a, b);
835
+ }
836
+ }
806
837
 
807
838
  function isRawRecordSchema(schema) {
808
839
  return (typeof schema === 'object' &&
@@ -957,6 +988,13 @@ function parseS3Url(url) {
957
988
  }
958
989
  const g_s3Map = new Map();
959
990
  const g_s3TablesMap = new Map();
991
+ class ByteCounter extends node_stream.Transform {
992
+ bytes = 0;
993
+ _transform(chunk, _encoding, callback) {
994
+ this.bytes += chunk.length;
995
+ callback(null, chunk);
996
+ }
997
+ }
960
998
  function getS3Client(params) {
961
999
  const { region, credentials } = params;
962
1000
  let ret = g_s3Map.get(region)?.get(credentials);
@@ -1085,26 +1123,32 @@ async function streamWriteAvro(params) {
1085
1123
  codecs: { deflate: zlib__namespace.deflateRaw },
1086
1124
  metadata,
1087
1125
  });
1126
+ const counter = new ByteCounter();
1127
+ encoder.pipe(counter);
1088
1128
  const upload = new libStorage.Upload({
1089
1129
  client: s3,
1090
- params: { Bucket: bucket, Key: key, Body: encoder },
1130
+ params: { Bucket: bucket, Key: key, Body: counter },
1091
1131
  });
1092
- let file_size = 0;
1093
- upload.on('httpUploadProgress', (progress) => {
1094
- if (progress.loaded) {
1095
- file_size = progress.loaded;
1132
+ async function _abortUpload() {
1133
+ try {
1134
+ await upload.abort();
1096
1135
  }
1097
- });
1136
+ catch {
1137
+ // noop
1138
+ }
1139
+ }
1098
1140
  const upload_promise = upload.done();
1099
1141
  let found_err;
1100
1142
  upload_promise.catch((err) => {
1101
- found_err = err;
1143
+ found_err ??= err;
1102
1144
  });
1103
1145
  encoder.on('error', (err) => {
1104
- found_err = err;
1146
+ found_err ??= err;
1147
+ void _abortUpload();
1105
1148
  });
1106
1149
  for await (const batch of params.iter) {
1107
1150
  if (found_err) {
1151
+ void _abortUpload();
1108
1152
  throw found_err;
1109
1153
  }
1110
1154
  for (const record of batch) {
@@ -1114,9 +1158,10 @@ async function streamWriteAvro(params) {
1114
1158
  encoder.end();
1115
1159
  await upload_promise;
1116
1160
  if (found_err) {
1161
+ void _abortUpload();
1117
1162
  throw found_err;
1118
1163
  }
1119
- return file_size;
1164
+ return counter.bytes;
1120
1165
  }
1121
1166
  async function downloadAvro(params) {
1122
1167
  const { region, credentials, bucket, key, avroSchema } = params;
@@ -1188,15 +1233,16 @@ async function addManifest(params) {
1188
1233
  for (let i = 0; i < partitions.length; i++) {
1189
1234
  const part = partitions[i];
1190
1235
  const bound = bounds[i];
1191
- if (!part) {
1236
+ const field = spec.fields[i];
1237
+ if (!part || !field) {
1192
1238
  throw new Error('impossible');
1193
1239
  }
1194
1240
  else if (bound === null) {
1195
1241
  part.contains_null = true;
1196
1242
  }
1197
1243
  else if (Buffer.isBuffer(bound)) {
1198
- part.upper_bound = _maxBuffer(part.upper_bound ?? null, bound);
1199
- part.lower_bound = _minBuffer(part.lower_bound ?? null, bound);
1244
+ part.upper_bound = maxBuffer(part.upper_bound ?? null, bound, field, schema);
1245
+ part.lower_bound = minBuffer(part.lower_bound ?? null, bound, field, schema);
1200
1246
  }
1201
1247
  else {
1202
1248
  part.contains_nan = true;
@@ -1275,29 +1321,29 @@ function _transformRecord(schema, map) {
1275
1321
  }
1276
1322
  return ret.length > 0 ? ret : null;
1277
1323
  }
1278
- function _minBuffer(a, b) {
1279
- if (!a && !b) {
1280
- return null;
1324
+ function minBuffer(a, b, field, schema) {
1325
+ if (a && b) {
1326
+ return compareBounds(a, b, field, schema) <= 0 ? a : b;
1281
1327
  }
1282
- else if (!a) {
1283
- return b;
1284
- }
1285
- else if (!b) {
1328
+ else if (a) {
1286
1329
  return a;
1287
1330
  }
1288
- return Buffer.compare(a, b) <= 0 ? a : b;
1289
- }
1290
- function _maxBuffer(a, b) {
1291
- if (!a && !b) {
1292
- return null;
1293
- }
1294
- else if (!a) {
1331
+ else if (b) {
1295
1332
  return b;
1296
1333
  }
1297
- else if (!b) {
1334
+ return null;
1335
+ }
1336
+ function maxBuffer(a, b, field, schema) {
1337
+ if (a && b) {
1338
+ return compareBounds(a, b, field, schema) >= 0 ? a : b;
1339
+ }
1340
+ else if (a) {
1298
1341
  return a;
1299
1342
  }
1300
- return Buffer.compare(a, b) >= 0 ? a : b;
1343
+ else if (b) {
1344
+ return b;
1345
+ }
1346
+ return null;
1301
1347
  }
1302
1348
 
1303
1349
  function customNumberParser(value) {
@@ -1473,205 +1519,6 @@ async function removeSnapshots(params) {
1473
1519
  });
1474
1520
  }
1475
1521
 
1476
- const DEFAULT_RETRY_COUNT$1 = 5;
1477
- async function addDataFiles(params) {
1478
- const { credentials } = params;
1479
- const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT$1;
1480
- const region = params.tableBucketARN.split(':')[3];
1481
- if (!region) {
1482
- throw new Error('bad tableBucketARN');
1483
- }
1484
- const snapshot_id = params.snapshotId ?? _randomBigInt64$1();
1485
- const metadata = await getMetadata(params);
1486
- const bucket = metadata.location.split('/').slice(-1)[0];
1487
- const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
1488
- const snapshot = metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === parent_snapshot_id) ?? null;
1489
- if (!bucket) {
1490
- throw new Error('bad manifest location');
1491
- }
1492
- if (parent_snapshot_id > 0n && !snapshot) {
1493
- throw new Error('no old snapshot');
1494
- }
1495
- let old_list_key = snapshot ? parseS3Url(snapshot['manifest-list']).key : '';
1496
- if (snapshot && !old_list_key) {
1497
- throw new Error('last snapshot invalid');
1498
- }
1499
- let sequence_number = BigInt(metadata['last-sequence-number']) + 1n;
1500
- let remove_snapshot_id = 0n;
1501
- if (params.maxSnapshots && metadata.snapshots.length >= params.maxSnapshots) {
1502
- let earliest_time = 0;
1503
- for (const snap of metadata.snapshots) {
1504
- const snap_time = snap['timestamp-ms'];
1505
- if (earliest_time === 0 || snap_time < earliest_time) {
1506
- earliest_time = snap_time;
1507
- remove_snapshot_id = BigInt(snap['snapshot-id']);
1508
- }
1509
- }
1510
- }
1511
- let added_files = 0;
1512
- let added_records = 0n;
1513
- let added_size = 0n;
1514
- const records = await Promise.all(params.lists.map(async (list) => {
1515
- added_files += list.files.length;
1516
- for (const file of list.files) {
1517
- added_records += file.recordCount;
1518
- added_size += file.fileSize;
1519
- }
1520
- const opts = {
1521
- credentials,
1522
- region,
1523
- metadata,
1524
- schemaId: list.schemaId,
1525
- specId: list.specId,
1526
- snapshotId: snapshot_id,
1527
- sequenceNumber: sequence_number,
1528
- files: list.files,
1529
- };
1530
- return addManifest(opts);
1531
- }));
1532
- let expected_snapshot_id = parent_snapshot_id;
1533
- for (let try_count = 0;; try_count++) {
1534
- const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
1535
- const manifest_list_url = `s3://${bucket}/${manifest_list_key}`;
1536
- if (old_list_key) {
1537
- await updateManifestList({
1538
- credentials,
1539
- region,
1540
- bucket,
1541
- key: old_list_key,
1542
- outKey: manifest_list_key,
1543
- metadata: {
1544
- 'sequence-number': String(sequence_number),
1545
- 'snapshot-id': String(snapshot_id),
1546
- 'parent-snapshot-id': String(parent_snapshot_id),
1547
- },
1548
- prepend: records,
1549
- });
1550
- }
1551
- else {
1552
- const manifest_list_buf = await avroToBuffer({
1553
- type: ManifestListType,
1554
- metadata: {
1555
- 'sequence-number': String(sequence_number),
1556
- 'snapshot-id': String(snapshot_id),
1557
- 'parent-snapshot-id': 'null',
1558
- },
1559
- records,
1560
- });
1561
- await writeS3File({
1562
- credentials,
1563
- region,
1564
- bucket,
1565
- key: manifest_list_key,
1566
- body: manifest_list_buf,
1567
- });
1568
- }
1569
- try {
1570
- const updates = [
1571
- {
1572
- action: 'add-snapshot',
1573
- snapshot: {
1574
- 'sequence-number': sequence_number,
1575
- 'snapshot-id': snapshot_id,
1576
- 'parent-snapshot-id': parent_snapshot_id,
1577
- 'timestamp-ms': Date.now(),
1578
- summary: {
1579
- operation: 'append',
1580
- 'added-data-files': String(added_files),
1581
- 'added-records': String(added_records),
1582
- 'added-files-size': String(added_size),
1583
- },
1584
- 'manifest-list': manifest_list_url,
1585
- 'schema-id': metadata['current-schema-id'],
1586
- },
1587
- },
1588
- {
1589
- action: 'set-snapshot-ref',
1590
- 'snapshot-id': snapshot_id,
1591
- type: 'branch',
1592
- 'ref-name': 'main',
1593
- },
1594
- ];
1595
- if (remove_snapshot_id > 0n) {
1596
- updates.push({
1597
- action: 'remove-snapshots',
1598
- 'snapshot-ids': [remove_snapshot_id],
1599
- });
1600
- }
1601
- const result = await icebergRequest({
1602
- credentials: params.credentials,
1603
- tableBucketARN: params.tableBucketARN,
1604
- method: 'POST',
1605
- suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
1606
- body: {
1607
- requirements: expected_snapshot_id > 0n
1608
- ? [
1609
- {
1610
- type: 'assert-ref-snapshot-id',
1611
- ref: 'main',
1612
- 'snapshot-id': expected_snapshot_id,
1613
- },
1614
- ]
1615
- : [],
1616
- updates,
1617
- },
1618
- });
1619
- return {
1620
- result,
1621
- retriesNeeded: try_count,
1622
- parentSnapshotId: parent_snapshot_id,
1623
- snapshotId: snapshot_id,
1624
- sequenceNumber: sequence_number,
1625
- };
1626
- }
1627
- catch (e) {
1628
- if (e instanceof IcebergHttpError &&
1629
- e.status === 409 &&
1630
- try_count < retry_max) {
1631
- // retry case
1632
- remove_snapshot_id = 0n;
1633
- }
1634
- else {
1635
- throw e;
1636
- }
1637
- }
1638
- // we do a merge in the append only simultanious case
1639
- const conflict_metadata = await getMetadata(params);
1640
- const conflict_snapshot_id = BigInt(conflict_metadata['current-snapshot-id']);
1641
- if (conflict_snapshot_id <= 0n) {
1642
- throw new Error('conflict');
1643
- }
1644
- const conflict_snap = conflict_metadata.snapshots.find((s) => s['snapshot-id'] === conflict_snapshot_id);
1645
- if (!conflict_snap) {
1646
- throw new Error('conflict');
1647
- }
1648
- if (conflict_snap.summary.operation === 'append' &&
1649
- BigInt(conflict_snap['sequence-number']) === sequence_number) {
1650
- old_list_key = parseS3Url(conflict_snap['manifest-list']).key;
1651
- if (!old_list_key) {
1652
- throw new Error('conflict');
1653
- }
1654
- added_files += parseInt(conflict_snap.summary['added-data-files'] ?? '0', 10);
1655
- added_records += BigInt(conflict_snap.summary['added-records'] ?? '0');
1656
- added_size += BigInt(conflict_snap.summary['added-files-size'] ?? '0');
1657
- expected_snapshot_id = conflict_snapshot_id;
1658
- sequence_number++;
1659
- }
1660
- else {
1661
- throw new Error('conflict');
1662
- }
1663
- }
1664
- }
1665
- function _randomBigInt64$1() {
1666
- const bytes = node_crypto.randomBytes(8);
1667
- let ret = bytes.readBigUInt64BE();
1668
- ret &= BigInt('0x7FFFFFFFFFFFFFFF');
1669
- if (ret === 0n) {
1670
- ret = 1n;
1671
- }
1672
- return ret;
1673
- }
1674
-
1675
1522
  const DEFAULT_RETRY_COUNT = 5;
1676
1523
  async function submitSnapshot(params) {
1677
1524
  const { snapshotId, parentSnapshotId, resolveConflict } = params;
@@ -1793,26 +1640,296 @@ async function setCurrentCommit(params) {
1793
1640
  return commit_result;
1794
1641
  }
1795
1642
 
1796
- async function* asyncIterMap(items, func) {
1643
+ async function addDataFiles(params) {
1644
+ const { credentials } = params;
1645
+ const region = params.tableBucketARN.split(':')[3];
1646
+ if (!region) {
1647
+ throw new Error('bad tableBucketARN');
1648
+ }
1649
+ const snapshot_id = params.snapshotId ?? _randomBigInt64$1();
1650
+ const metadata = await getMetadata(params);
1651
+ const bucket = metadata.location.split('/').slice(-1)[0];
1652
+ if (!bucket) {
1653
+ throw new Error('bad manifest location');
1654
+ }
1655
+ const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
1656
+ const snapshot = metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === parent_snapshot_id) ?? null;
1657
+ if (parent_snapshot_id > 0n && !snapshot) {
1658
+ throw new Error('no old snapshot');
1659
+ }
1660
+ let old_list_key = snapshot ? parseS3Url(snapshot['manifest-list']).key : '';
1661
+ if (snapshot && !old_list_key) {
1662
+ throw new Error('last snapshot invalid');
1663
+ }
1664
+ let sequence_number = BigInt(metadata['last-sequence-number']) + 1n;
1665
+ let remove_snapshot_id = 0n;
1666
+ if (params.maxSnapshots && metadata.snapshots.length >= params.maxSnapshots) {
1667
+ let earliest_time = 0;
1668
+ for (const snap of metadata.snapshots) {
1669
+ const snap_time = snap['timestamp-ms'];
1670
+ if (earliest_time === 0 || snap_time < earliest_time) {
1671
+ earliest_time = snap_time;
1672
+ remove_snapshot_id = BigInt(snap['snapshot-id']);
1673
+ }
1674
+ }
1675
+ }
1676
+ let added_files = 0;
1677
+ let added_records = 0n;
1678
+ let added_size = 0n;
1679
+ const records = await Promise.all(params.lists.map(async (list) => {
1680
+ added_files += list.files.length;
1681
+ for (const file of list.files) {
1682
+ added_records += file.recordCount;
1683
+ added_size += file.fileSize;
1684
+ }
1685
+ const opts = {
1686
+ credentials,
1687
+ region,
1688
+ metadata,
1689
+ schemaId: list.schemaId,
1690
+ specId: list.specId,
1691
+ snapshotId: snapshot_id,
1692
+ sequenceNumber: sequence_number,
1693
+ files: list.files,
1694
+ };
1695
+ return addManifest(opts);
1696
+ }));
1697
+ async function createManifestList() {
1698
+ if (!bucket) {
1699
+ throw new Error('bad manifest location');
1700
+ }
1701
+ if (!region) {
1702
+ throw new Error('bad tableBucketARN');
1703
+ }
1704
+ const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
1705
+ const url = `s3://${bucket}/${manifest_list_key}`;
1706
+ if (old_list_key) {
1707
+ await updateManifestList({
1708
+ credentials,
1709
+ region,
1710
+ bucket,
1711
+ key: old_list_key,
1712
+ outKey: manifest_list_key,
1713
+ metadata: {
1714
+ 'sequence-number': String(sequence_number),
1715
+ 'snapshot-id': String(snapshot_id),
1716
+ 'parent-snapshot-id': String(parent_snapshot_id),
1717
+ },
1718
+ prepend: records,
1719
+ });
1720
+ }
1721
+ else {
1722
+ const manifest_list_buf = await avroToBuffer({
1723
+ type: ManifestListType,
1724
+ metadata: {
1725
+ 'sequence-number': String(sequence_number),
1726
+ 'snapshot-id': String(snapshot_id),
1727
+ 'parent-snapshot-id': 'null',
1728
+ },
1729
+ records,
1730
+ });
1731
+ await writeS3File({
1732
+ credentials,
1733
+ region,
1734
+ bucket,
1735
+ key: manifest_list_key,
1736
+ body: manifest_list_buf,
1737
+ });
1738
+ }
1739
+ return url;
1740
+ }
1741
+ const manifest_list_url = await createManifestList();
1742
+ async function resolveConflict(conflict_snap) {
1743
+ if (conflict_snap.summary.operation === 'append' &&
1744
+ BigInt(conflict_snap['sequence-number']) === sequence_number) {
1745
+ old_list_key = parseS3Url(conflict_snap['manifest-list']).key;
1746
+ if (!old_list_key) {
1747
+ throw new Error('conflict');
1748
+ }
1749
+ added_files += parseInt(conflict_snap.summary['added-data-files'] ?? '0', 10);
1750
+ added_records += BigInt(conflict_snap.summary['added-records'] ?? '0');
1751
+ added_size += BigInt(conflict_snap.summary['added-files-size'] ?? '0');
1752
+ sequence_number++;
1753
+ const url = await createManifestList();
1754
+ return {
1755
+ manifestListUrl: url,
1756
+ summary: {
1757
+ operation: 'append',
1758
+ 'added-data-files': String(added_files),
1759
+ 'added-records': String(added_records),
1760
+ 'added-files-size': String(added_size),
1761
+ },
1762
+ };
1763
+ }
1764
+ throw new Error('conflict');
1765
+ }
1766
+ return submitSnapshot({
1767
+ credentials,
1768
+ tableBucketARN: params.tableBucketARN,
1769
+ namespace: params.namespace,
1770
+ name: params.name,
1771
+ currentSchemaId: metadata['current-schema-id'],
1772
+ parentSnapshotId: parent_snapshot_id,
1773
+ snapshotId: snapshot_id,
1774
+ sequenceNumber: sequence_number,
1775
+ retryCount: params.retryCount,
1776
+ removeSnapshotId: remove_snapshot_id,
1777
+ manifestListUrl: manifest_list_url,
1778
+ summary: {
1779
+ operation: 'append',
1780
+ 'added-data-files': String(added_files),
1781
+ 'added-records': String(added_records),
1782
+ 'added-files-size': String(added_size),
1783
+ },
1784
+ resolveConflict,
1785
+ });
1786
+ }
1787
+ function _randomBigInt64$1() {
1788
+ const bytes = node_crypto.randomBytes(8);
1789
+ let ret = bytes.readBigUInt64BE();
1790
+ ret &= BigInt('0x7FFFFFFFFFFFFFFF');
1791
+ if (ret === 0n) {
1792
+ ret = 1n;
1793
+ }
1794
+ return ret;
1795
+ }
1796
+
1797
+ async function importRedshiftManifest(params) {
1798
+ const { credentials } = params;
1799
+ const region = params.tableBucketARN.split(':')[3];
1800
+ if (!region) {
1801
+ throw new Error('bad tableBucketARN');
1802
+ }
1803
+ const manifest = await _downloadRedshift(params);
1804
+ const metadata = await getMetadata(params);
1805
+ const bucket = metadata.location.split('/').slice(-1)[0];
1806
+ if (!bucket) {
1807
+ throw new Error('bad manifest location');
1808
+ }
1809
+ const import_prefix = `data/${node_crypto.randomUUID()}/`;
1810
+ const lists = [];
1811
+ for (const entry of manifest.entries) {
1812
+ const { url } = entry;
1813
+ const { content_length, record_count } = entry.meta;
1814
+ const file = url.split('/').pop() ?? '';
1815
+ const parts = [...url.matchAll(/\/([^=/]*=[^/=]*)/g)].map((m) => m[1] ?? '');
1816
+ const partitions = {};
1817
+ for (const part of parts) {
1818
+ const [part_key, part_value] = part.split('=');
1819
+ partitions[part_key ?? ''] = part_value ?? '';
1820
+ }
1821
+ const keys = Object.keys(partitions);
1822
+ const specId = params.specId ?? _findSpec(metadata, keys);
1823
+ const schemaId = params.schemaId ?? _findSchema(metadata, manifest);
1824
+ let list = lists.find((l) => l.schemaId === schemaId && l.specId === specId);
1825
+ if (!list) {
1826
+ list = { specId, schemaId, files: [] };
1827
+ lists.push(list);
1828
+ }
1829
+ const part_path = parts.length > 0 ? `${parts.join('/')}/` : '';
1830
+ const key = import_prefix + part_path + file;
1831
+ list.files.push({
1832
+ file: await _maybeMoveFile({ credentials, region, bucket, key, url }),
1833
+ partitions,
1834
+ fileSize: BigInt(content_length),
1835
+ recordCount: BigInt(record_count),
1836
+ });
1837
+ }
1838
+ return addDataFiles({
1839
+ credentials,
1840
+ tableBucketARN: params.tableBucketARN,
1841
+ namespace: params.namespace,
1842
+ name: params.name,
1843
+ lists,
1844
+ retryCount: params.retryCount,
1845
+ });
1846
+ }
1847
+ async function _downloadRedshift(params) {
1848
+ const s3_client = getS3Client(params);
1849
+ const { bucket, key } = parseS3Url(params.redshiftManifestUrl);
1850
+ const get_file_cmd = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
1851
+ const file_response = await s3_client.send(get_file_cmd);
1852
+ const body = await file_response.Body?.transformToString();
1853
+ if (!body) {
1854
+ throw new Error('missing body');
1855
+ }
1856
+ return parse(body);
1857
+ }
1858
+ async function _maybeMoveFile(params) {
1859
+ const { bucket, key } = parseS3Url(params.url);
1860
+ if (!bucket || !key) {
1861
+ throw new Error(`bad entry url: ${params.url}`);
1862
+ }
1863
+ if (bucket === params.bucket) {
1864
+ return params.url;
1865
+ }
1866
+ const s3_client = getS3Client(params);
1867
+ const get = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
1868
+ const { Body } = await s3_client.send(get);
1869
+ if (!Body) {
1870
+ throw new Error(`body missing for file: ${params.url}`);
1871
+ }
1872
+ const upload = new libStorage.Upload({
1873
+ client: s3_client,
1874
+ params: { Bucket: params.bucket, Key: params.key, Body },
1875
+ });
1876
+ await upload.done();
1877
+ return `s3://${params.bucket}/${params.key}`;
1878
+ }
1879
+ function _findSpec(metadata, keys) {
1880
+ if (keys.length === 0) {
1881
+ return 0;
1882
+ }
1883
+ for (const spec of metadata['partition-specs']) {
1884
+ if (spec.fields.length === keys.length) {
1885
+ if (keys.every((key) => spec.fields.find((f) => f.name === key))) {
1886
+ return spec['spec-id'];
1887
+ }
1888
+ }
1889
+ }
1890
+ throw new Error(`spec not found for keys ${keys.join(', ')}`);
1891
+ }
1892
+ function _findSchema(metadata, manifest) {
1893
+ const { elements } = manifest.schema;
1894
+ for (const schema of metadata.schemas) {
1895
+ if (schema.fields.every((f) => !f.required || elements.find((e) => e.name === f.name))) {
1896
+ return schema['schema-id'];
1897
+ }
1898
+ }
1899
+ throw new Error('schema not found for schema.elements');
1900
+ }
1901
+
1902
+ async function* asyncIterMap(items, limit, func) {
1797
1903
  const pending = new Set();
1798
- for (const item of items) {
1799
- const ref = {};
1800
- const wrapper = func(item).then((value) => ({
1801
- self: ref.current,
1802
- value,
1803
- }));
1804
- ref.current = wrapper;
1805
- pending.add(wrapper);
1904
+ let index = 0;
1905
+ function enqueue() {
1906
+ const item = items[index++];
1907
+ if (item !== undefined) {
1908
+ const result = { promise: undefined, value: undefined };
1909
+ const promise = func(item).then((value) => {
1910
+ result.value = value;
1911
+ return result;
1912
+ });
1913
+ result.promise = promise;
1914
+ pending.add(promise);
1915
+ }
1916
+ }
1917
+ for (let i = 0; i < limit && i < items.length; i++) {
1918
+ enqueue();
1806
1919
  }
1807
1920
  while (pending.size) {
1808
- const { self, value } = await Promise.race(pending);
1809
- if (self) {
1810
- pending.delete(self);
1921
+ const { promise, value } = await Promise.race(pending);
1922
+ if (promise) {
1923
+ pending.delete(promise);
1811
1924
  }
1812
- yield value;
1925
+ if (value !== undefined) {
1926
+ yield value;
1927
+ }
1928
+ enqueue();
1813
1929
  }
1814
1930
  }
1815
1931
 
1932
+ const ITER_LIMIT = 10;
1816
1933
  async function manifestCompact(params) {
1817
1934
  const { credentials, targetCount, calculateWeight } = params;
1818
1935
  const region = params.tableBucketARN.split(':')[3];
@@ -1835,6 +1952,7 @@ async function manifestCompact(params) {
1835
1952
  snapshotId: 0n,
1836
1953
  sequenceNumber: 0n,
1837
1954
  changed: false,
1955
+ inputManifestCount: 0,
1838
1956
  outputManifestCount: 0,
1839
1957
  };
1840
1958
  }
@@ -1901,11 +2019,12 @@ async function manifestCompact(params) {
1901
2019
  snapshotId: 0n,
1902
2020
  sequenceNumber: sequence_number,
1903
2021
  changed: false,
2022
+ inputManifestCount: list.length,
1904
2023
  outputManifestCount: 0,
1905
2024
  };
1906
2025
  }
1907
2026
  const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
1908
- const iter = asyncIterMap(final_groups, async (group) => {
2027
+ const iter = asyncIterMap(final_groups, ITER_LIMIT, async (group) => {
1909
2028
  if (!group[0]) {
1910
2029
  return [];
1911
2030
  }
@@ -1966,19 +2085,21 @@ async function manifestCompact(params) {
1966
2085
  return {
1967
2086
  ...snap_result,
1968
2087
  changed: true,
2088
+ inputManifestCount: list.length,
1969
2089
  outputManifestCount: final_groups.length,
1970
2090
  };
1971
2091
  }
1972
2092
  async function _combineGroup(params) {
1973
- const { credentials, region, bucket, group } = params;
2093
+ const { credentials, region, bucket, group, spec } = params;
1974
2094
  const record0 = group[0];
1975
2095
  if ((group.length === 1 && !params.forceRewrite) || !record0) {
1976
2096
  return group;
1977
2097
  }
1978
2098
  const key = `metadata/${node_crypto.randomUUID()}.avro`;
1979
- const schema = makeManifestSchema(params.spec, params.schemas, true);
1980
- const type = makeManifestType(params.spec, params.schemas, true);
1981
- const iter = asyncIterMap(group, async (record) => {
2099
+ const icebergSchema = _schemaForSpec(params.schemas, spec);
2100
+ const schema = makeManifestSchema(spec, params.schemas, true);
2101
+ const type = makeManifestType(spec, params.schemas, true);
2102
+ const iter = asyncIterMap(group, ITER_LIMIT, async (record) => {
1982
2103
  return _streamReadManifest({
1983
2104
  credentials,
1984
2105
  region,
@@ -1993,8 +2114,8 @@ async function _combineGroup(params) {
1993
2114
  bucket,
1994
2115
  key,
1995
2116
  metadata: {
1996
- 'partition-spec-id': String(params.spec['spec-id']),
1997
- 'partition-spec': JSON.stringify(params.spec.fields),
2117
+ 'partition-spec-id': String(spec['spec-id']),
2118
+ 'partition-spec': JSON.stringify(spec.fields),
1998
2119
  },
1999
2120
  avroType: type,
2000
2121
  iter,
@@ -2030,22 +2151,15 @@ async function _combineGroup(params) {
2030
2151
  for (let j = 0; j < parts.length; j++) {
2031
2152
  const part = parts[j];
2032
2153
  const ret_part = ret.partitions[j];
2033
- if (part && ret_part) {
2154
+ const field = spec.fields[i];
2155
+ if (part && ret_part && field) {
2034
2156
  ret_part.contains_null ||= part.contains_null;
2035
2157
  if (part.contains_nan !== undefined) {
2036
2158
  ret_part.contains_nan =
2037
2159
  (ret_part.contains_nan ?? false) || part.contains_nan;
2038
2160
  }
2039
- if (!ret_part.upper_bound ||
2040
- (part.upper_bound &&
2041
- Buffer.compare(part.upper_bound, ret_part.upper_bound) > 0)) {
2042
- ret_part.upper_bound = part.upper_bound ?? null;
2043
- }
2044
- if (!ret_part.lower_bound ||
2045
- (part.lower_bound &&
2046
- Buffer.compare(part.lower_bound, ret_part.lower_bound) < 0)) {
2047
- ret_part.lower_bound = part.lower_bound ?? null;
2048
- }
2161
+ ret_part.upper_bound = maxBuffer(ret_part.upper_bound, part.upper_bound, field, icebergSchema);
2162
+ ret_part.lower_bound = minBuffer(ret_part.lower_bound, part.lower_bound, field, icebergSchema);
2049
2163
  }
2050
2164
  }
2051
2165
  }
@@ -2131,6 +2245,14 @@ function _combineWeightGroups(groups, targetCount, calculateWeight) {
2131
2245
// Ascending comparator on group weight, for Array.prototype.sort.
function _sortGroup(a, b) {
  const delta = a.weight - b.weight;
  return delta;
}
/**
 * Returns the first schema whose field ids cover every source-id referenced
 * by the partition spec; throws when no schema qualifies.
 */
function _schemaForSpec(schemas, spec) {
  const covers = (schema, specField) =>
    schema.fields.some((schemaField) => schemaField.id === specField['source-id']);
  for (const candidate of schemas) {
    const coversAll = spec.fields.every((field) => covers(candidate, field));
    if (coversAll) {
      return candidate;
    }
  }
  throw new Error(`schema not found for spec: ${spec['spec-id']}`);
}
2134
2256
  function _randomBigInt64() {
2135
2257
  const bytes = node_crypto.randomBytes(8);
2136
2258
  let ret = bytes.readBigUInt64BE();
@@ -2152,13 +2274,14 @@ function _bigintMin(value0, ...values) {
2152
2274
 
2153
2275
// Aggregate object used as the module's default export (see exports.default).
// NOTE(review): the module's named exports also expose manifestCompact,
// submitSnapshot, maxBuffer and minBuffer, none of which appear here — confirm
// whether that asymmetry between default and named exports is intentional.
var index = {
  IcebergHttpError,
  addSchema,
  addPartitionSpec,
  addManifest,
  addDataFiles,
  getMetadata,
  importRedshiftManifest,
  removeSnapshots,
  setCurrentCommit,
};
2163
2286
 
2164
2287
  exports.IcebergHttpError = IcebergHttpError;
@@ -2168,7 +2291,10 @@ exports.addPartitionSpec = addPartitionSpec;
2168
2291
  exports.addSchema = addSchema;
2169
2292
  exports.default = index;
2170
2293
  exports.getMetadata = getMetadata;
2294
+ exports.importRedshiftManifest = importRedshiftManifest;
2171
2295
  exports.manifestCompact = manifestCompact;
2296
+ exports.maxBuffer = maxBuffer;
2297
+ exports.minBuffer = minBuffer;
2172
2298
  exports.removeSnapshots = removeSnapshots;
2173
2299
  exports.setCurrentCommit = setCurrentCommit;
2174
2300
  exports.submitSnapshot = submitSnapshot;
package/package.json CHANGED
@@ -1,19 +1,20 @@
1
1
  {
2
2
  "name": "node-s3tables",
3
- "version": "0.0.18",
3
+ "version": "0.0.19",
4
4
  "description": "node api for dealing with s3tables",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
7
7
  "bin": {
8
- "node-s3tables": "dist/bin-compact.js"
8
+ "node-s3tables": "dist/bin.js"
9
9
  },
10
10
  "files": [
11
11
  "dist/**/*"
12
12
  ],
13
13
  "scripts": {
14
+ "bin": "tsx src/bin.ts",
14
15
  "build": "rollup -c",
15
16
  "ts:check": "tsc --noEmit",
16
- "lint": "eslint src test",
17
+ "lint": "eslint src test --fix",
17
18
  "pretty": "prettier -u --write \"**/*\" --log-level warn",
18
19
  "test": "dotenv -- tsx --test test/*.test.ts",
19
20
  "test:single": "dotenv -- tsx --test",