node-s3tables 0.0.16 → 0.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/bin.js ADDED
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ var nodeS3tables = require('node-s3tables');
5
+
6
+ /* eslint-disable no-console */
7
+ const [tableBucketARN, namespace, name] = process.argv.slice(2);
8
+ if (!tableBucketARN || !namespace || !name) {
9
+ console.error('Usage: node-s3tables compact <tableBucketARN> <namespace> <name>');
10
+ process.exit(-1);
11
+ }
12
+ nodeS3tables.manifestCompact({ tableBucketARN, namespace, name })
13
+ .then((result) => {
14
+ console.log('Compact result:', result);
15
+ process.exit(0);
16
+ })
17
+ .catch((error) => {
18
+ if (error instanceof Error) {
19
+ console.error('Error:', error.message);
20
+ }
21
+ else {
22
+ console.error('Error:', error);
23
+ }
24
+ process.exit(1);
25
+ });
package/dist/index.d.ts CHANGED
@@ -130,7 +130,7 @@ type TableLocation = {
130
130
  };
131
131
  type GetMetadataParams = TableLocation & {
132
132
  region?: string;
133
- credentials?: AwsCredentialIdentity;
133
+ credentials?: AwsCredentialIdentity | undefined;
134
134
  };
135
135
  declare function getMetadata(params: GetMetadataParams): Promise<IcebergMetadata>;
136
136
  interface AddSchemaParams {
@@ -194,6 +194,34 @@ interface AddDataFilesResult {
194
194
  sequenceNumber: bigint;
195
195
  }
196
196
  declare function addDataFiles(params: AddDataFilesParams): Promise<AddDataFilesResult>;
197
+
198
+ interface SubmitSnapshotParams {
199
+ credentials?: AwsCredentialIdentity | undefined;
200
+ tableBucketARN: string;
201
+ namespace: string;
202
+ name: string;
203
+ currentSchemaId: number;
204
+ parentSnapshotId: bigint;
205
+ snapshotId: bigint;
206
+ sequenceNumber: bigint;
207
+ retryCount?: number | undefined;
208
+ removeSnapshotId?: bigint | undefined;
209
+ manifestListUrl: string;
210
+ summary: Record<string, string>;
211
+ resolveConflict?: (conflictSnapshot: IcebergSnapshot) => Promise<ResolveConflictResult>;
212
+ }
213
+ interface ResolveConflictResult {
214
+ manifestListUrl: string;
215
+ summary: Record<string, string>;
216
+ }
217
+ interface SubmitSnapshotResult {
218
+ result: JSONObject;
219
+ retriesNeeded: number;
220
+ parentSnapshotId: bigint;
221
+ snapshotId: bigint;
222
+ sequenceNumber: bigint;
223
+ }
224
+ declare function submitSnapshot(params: SubmitSnapshotParams): Promise<SubmitSnapshotResult>;
197
225
  interface SetCurrentCommitParams {
198
226
  credentials?: AwsCredentialIdentity;
199
227
  tableBucketARN: string;
@@ -210,6 +238,25 @@ declare class IcebergHttpError extends Error {
210
238
  constructor(status: number, body: JSONValue, message: string);
211
239
  }
212
240
 
241
+ type CalculateWeightFunction = (group: ManifestListRecord[]) => number;
242
+ interface ManifestCompactParams {
243
+ credentials?: AwsCredentialIdentity;
244
+ tableBucketARN: string;
245
+ namespace: string;
246
+ name: string;
247
+ snapshotId?: bigint;
248
+ targetCount?: number;
249
+ calculateWeight?: CalculateWeightFunction;
250
+ forceRewrite?: boolean;
251
+ retryCount?: number;
252
+ maxSnapshots?: number;
253
+ }
254
+ interface ManifestCompactResult extends SubmitSnapshotResult {
255
+ changed: boolean;
256
+ outputManifestCount: number;
257
+ }
258
+ declare function manifestCompact(params: ManifestCompactParams): Promise<ManifestCompactResult>;
259
+
213
260
  declare const _default: {
214
261
  IcebergHttpError: typeof IcebergHttpError;
215
262
  getMetadata: typeof getMetadata;
@@ -221,5 +268,5 @@ declare const _default: {
221
268
  removeSnapshots: typeof removeSnapshots;
222
269
  };
223
270
 
224
- export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, removeSnapshots, setCurrentCommit };
225
- export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, RemoveSnapshotsParams, SetCurrentCommitParams, TableLocation };
271
+ export { IcebergHttpError, addDataFiles, addManifest, addPartitionSpec, addSchema, _default as default, getMetadata, manifestCompact, removeSnapshots, setCurrentCommit, submitSnapshot };
272
+ export type { AddDataFilesParams, AddDataFilesResult, AddFile, AddFileList, AddManifestParams, AddPartitionSpecParams, AddSchemaParams, CalculateWeightFunction, GetMetadataParams, IcebergComplexType, IcebergMetadata, IcebergPartitionField, IcebergPartitionSpec, IcebergPrimitiveType, IcebergSchema, IcebergSchemaField, IcebergSnapshot, IcebergSnapshotSummary, IcebergTransform, IcebergType, IcebergUpdateResponse, ManifestCompactParams, ManifestCompactResult, ManifestListRecord, RemoveSnapshotsParams, ResolveConflictResult, SetCurrentCommitParams, SubmitSnapshotParams, SubmitSnapshotResult, TableLocation };
package/dist/index.js CHANGED
@@ -84,13 +84,21 @@ async function avroToBuffer(params) {
84
84
  }
85
85
  });
86
86
  }
87
- function icebergToAvroFields(spec, schema) {
88
- return spec.fields.map((p) => _icebergToAvroField(p, schema));
87
+ function icebergToAvroFields(spec, schemas, skipPartitionLogicalType) {
88
+ return spec.fields.map((p) => _icebergToAvroField(p, schemas, skipPartitionLogicalType));
89
89
  }
90
- function _icebergToAvroField(field, schema) {
91
- const source = schema.fields.find((f) => f.id === field['source-id']);
90
+ function _icebergToAvroField(field, schemas, skipPartitionLogicalType) {
91
+ let source;
92
+ for (const schema of schemas) {
93
+ for (const f of schema.fields) {
94
+ if (f.id === field['source-id']) {
95
+ source = f;
96
+ break;
97
+ }
98
+ }
99
+ }
92
100
  if (!source) {
93
- throw new Error(`Source field ${field['source-id']} not found in schema`);
101
+ throw new Error(`Source field ${field['source-id']} not found in schemas`);
94
102
  }
95
103
  let avroType;
96
104
  switch (field.transform) {
@@ -123,6 +131,9 @@ function _icebergToAvroField(field, schema) {
123
131
  }
124
132
  throw new Error(`Unsupported transform: ${field.transform} for type`);
125
133
  }
134
+ if (typeof avroType === 'object' && skipPartitionLogicalType) {
135
+ avroType = avroType.type;
136
+ }
126
137
  return {
127
138
  name: field.name,
128
139
  type: ['null', avroType],
@@ -260,9 +271,9 @@ const AvroLogicalTypes = {
260
271
  hour: HourStringType,
261
272
  };
262
273
 
263
- function makeManifestType(spec, schema) {
264
- const part_fields = icebergToAvroFields(spec, schema);
265
- return avsc__namespace.Type.forSchema({
274
+ function makeManifestSchema(spec, schemas, skipPartitionLogicalType) {
275
+ const part_fields = icebergToAvroFields(spec, schemas, skipPartitionLogicalType);
276
+ return {
266
277
  type: 'record',
267
278
  name: 'manifest_entry',
268
279
  fields: [
@@ -492,9 +503,16 @@ function makeManifestType(spec, schema) {
492
503
  'field-id': 2,
493
504
  },
494
505
  ],
495
- }, { registry: { ...AvroRegistry }, logicalTypes: AvroLogicalTypes });
506
+ };
507
+ }
508
+ function makeManifestType(spec, schemas, skipPartitionLogicalType) {
509
+ const schema = makeManifestSchema(spec, schemas, skipPartitionLogicalType);
510
+ return avsc__namespace.Type.forSchema(schema, {
511
+ registry: { ...AvroRegistry },
512
+ logicalTypes: AvroLogicalTypes,
513
+ });
496
514
  }
497
- const ManifestListType = avsc__namespace.Type.forSchema({
515
+ const ManifestListSchema = {
498
516
  type: 'record',
499
517
  name: 'manifest_file',
500
518
  fields: [
@@ -630,7 +648,10 @@ const ManifestListType = avsc__namespace.Type.forSchema({
630
648
  'field-id': 519,
631
649
  },
632
650
  ],
633
- }, { registry: { ...AvroRegistry } });
651
+ };
652
+ const ManifestListType = avsc__namespace.Type.forSchema(ManifestListSchema, {
653
+ registry: { ...AvroRegistry },
654
+ });
634
655
 
635
656
  function _isPrimitive(t) {
636
657
  return typeof t === 'string';
@@ -1033,10 +1054,14 @@ async function updateManifestList(params) {
1033
1054
  reject(err);
1034
1055
  });
1035
1056
  decoder.on('data', (record) => {
1036
- const translated = translateRecord(sourceSchema, ManifestListType.schema(), record);
1037
- if (!encoder.write(translated)) {
1038
- decoder.pause();
1039
- encoder.once('drain', () => decoder.resume());
1057
+ const translated = translateRecord(sourceSchema, ManifestListSchema, record);
1058
+ if (translated.content !== ListContent.DATA ||
1059
+ translated.added_files_count > 0 ||
1060
+ translated.existing_files_count > 0) {
1061
+ if (!encoder.write(translated)) {
1062
+ decoder.pause();
1063
+ encoder.once('drain', () => decoder.resume());
1064
+ }
1040
1065
  }
1041
1066
  });
1042
1067
  decoder.on('end', () => {
@@ -1049,6 +1074,89 @@ async function updateManifestList(params) {
1049
1074
  });
1050
1075
  await Promise.all([stream_promise, upload.done()]);
1051
1076
  }
1077
+ async function streamWriteAvro(params) {
1078
+ const { region, credentials, bucket, key } = params;
1079
+ const metadata = params.metadata
1080
+ ? fixupMetadata(params.metadata)
1081
+ : params.metadata;
1082
+ const s3 = getS3Client({ region, credentials });
1083
+ const encoder = new avsc__namespace.streams.BlockEncoder(params.avroType, {
1084
+ codec: 'deflate',
1085
+ codecs: { deflate: zlib__namespace.deflateRaw },
1086
+ metadata,
1087
+ });
1088
+ const upload = new libStorage.Upload({
1089
+ client: s3,
1090
+ params: { Bucket: bucket, Key: key, Body: encoder },
1091
+ });
1092
+ let file_size = 0;
1093
+ upload.on('httpUploadProgress', (progress) => {
1094
+ if (progress.loaded) {
1095
+ file_size = progress.loaded;
1096
+ }
1097
+ });
1098
+ const upload_promise = upload.done();
1099
+ let found_err;
1100
+ upload_promise.catch((err) => {
1101
+ found_err = err;
1102
+ });
1103
+ encoder.on('error', (err) => {
1104
+ found_err = err;
1105
+ });
1106
+ for await (const batch of params.iter) {
1107
+ if (found_err) {
1108
+ throw found_err;
1109
+ }
1110
+ for (const record of batch) {
1111
+ encoder.write(record);
1112
+ }
1113
+ }
1114
+ encoder.end();
1115
+ await upload_promise;
1116
+ if (found_err) {
1117
+ throw found_err;
1118
+ }
1119
+ return file_size;
1120
+ }
1121
+ async function downloadAvro(params) {
1122
+ const { region, credentials, bucket, key, avroSchema } = params;
1123
+ const s3 = getS3Client({ region, credentials });
1124
+ const get = new clientS3.GetObjectCommand({ Bucket: bucket, Key: key });
1125
+ const response = await s3.send(get);
1126
+ const source = response.Body;
1127
+ if (!source) {
1128
+ throw new Error('failed to get source manifest list');
1129
+ }
1130
+ let sourceSchema;
1131
+ const decoder = new avsc__namespace.streams.BlockDecoder({
1132
+ codecs: { deflate: zlib__namespace.inflateRaw },
1133
+ parseHook(schema) {
1134
+ sourceSchema = schema;
1135
+ return avsc__namespace.Type.forSchema(schema, {
1136
+ registry: { ...AvroRegistry },
1137
+ });
1138
+ },
1139
+ });
1140
+ const records = [];
1141
+ const stream_promise = new Promise((resolve, reject) => {
1142
+ source.on('error', (err) => {
1143
+ reject(err);
1144
+ });
1145
+ decoder.on('error', (err) => {
1146
+ reject(err);
1147
+ });
1148
+ decoder.on('data', (record) => {
1149
+ const translated = translateRecord(sourceSchema, avroSchema, record);
1150
+ records.push(translated);
1151
+ });
1152
+ decoder.on('end', () => {
1153
+ resolve();
1154
+ });
1155
+ source.pipe(decoder);
1156
+ });
1157
+ await stream_promise;
1158
+ return records;
1159
+ }
1052
1160
 
1053
1161
  async function addManifest(params) {
1054
1162
  const { credentials, region, metadata } = params;
@@ -1119,7 +1227,7 @@ async function addManifest(params) {
1119
1227
  },
1120
1228
  };
1121
1229
  });
1122
- const manifest_type = makeManifestType(spec, schema);
1230
+ const manifest_type = makeManifestType(spec, [schema]);
1123
1231
  const manifest_buf = await avroToBuffer({
1124
1232
  type: manifest_type,
1125
1233
  metadata: {
@@ -1365,15 +1473,15 @@ async function removeSnapshots(params) {
1365
1473
  });
1366
1474
  }
1367
1475
 
1368
- const DEFAULT_RETRY_COUNT = 5;
1476
+ const DEFAULT_RETRY_COUNT$1 = 5;
1369
1477
  async function addDataFiles(params) {
1370
1478
  const { credentials } = params;
1371
- const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT;
1479
+ const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT$1;
1372
1480
  const region = params.tableBucketARN.split(':')[3];
1373
1481
  if (!region) {
1374
1482
  throw new Error('bad tableBucketARN');
1375
1483
  }
1376
- const snapshot_id = params.snapshotId ?? _randomBigInt64();
1484
+ const snapshot_id = params.snapshotId ?? _randomBigInt64$1();
1377
1485
  const metadata = await getMetadata(params);
1378
1486
  const bucket = metadata.location.split('/').slice(-1)[0];
1379
1487
  const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
@@ -1554,6 +1662,116 @@ async function addDataFiles(params) {
1554
1662
  }
1555
1663
  }
1556
1664
  }
1665
+ function _randomBigInt64$1() {
1666
+ const bytes = node_crypto.randomBytes(8);
1667
+ let ret = bytes.readBigUInt64BE();
1668
+ ret &= BigInt('0x7FFFFFFFFFFFFFFF');
1669
+ if (ret === 0n) {
1670
+ ret = 1n;
1671
+ }
1672
+ return ret;
1673
+ }
1674
+
1675
+ const DEFAULT_RETRY_COUNT = 5;
1676
+ async function submitSnapshot(params) {
1677
+ const { snapshotId, parentSnapshotId, resolveConflict } = params;
1678
+ let { sequenceNumber, removeSnapshotId, manifestListUrl, summary } = params;
1679
+ const retry_max = params.retryCount ?? DEFAULT_RETRY_COUNT;
1680
+ let expected_snapshot_id = parentSnapshotId;
1681
+ let conflict_snap;
1682
+ for (let try_count = 0;; try_count++) {
1683
+ if (conflict_snap && resolveConflict) {
1684
+ const resolve_result = await resolveConflict(conflict_snap);
1685
+ summary = resolve_result.summary;
1686
+ manifestListUrl = resolve_result.manifestListUrl;
1687
+ }
1688
+ else if (conflict_snap) {
1689
+ throw new Error('conflict');
1690
+ }
1691
+ try {
1692
+ const updates = [
1693
+ {
1694
+ action: 'add-snapshot',
1695
+ snapshot: {
1696
+ 'sequence-number': sequenceNumber,
1697
+ 'snapshot-id': snapshotId,
1698
+ 'parent-snapshot-id': parentSnapshotId,
1699
+ 'timestamp-ms': Date.now(),
1700
+ summary,
1701
+ 'manifest-list': manifestListUrl,
1702
+ 'schema-id': params.currentSchemaId,
1703
+ },
1704
+ },
1705
+ {
1706
+ action: 'set-snapshot-ref',
1707
+ 'snapshot-id': snapshotId,
1708
+ type: 'branch',
1709
+ 'ref-name': 'main',
1710
+ },
1711
+ ];
1712
+ if (removeSnapshotId && removeSnapshotId > 0n) {
1713
+ updates.push({
1714
+ action: 'remove-snapshots',
1715
+ 'snapshot-ids': [removeSnapshotId],
1716
+ });
1717
+ }
1718
+ const result = await icebergRequest({
1719
+ credentials: params.credentials,
1720
+ tableBucketARN: params.tableBucketARN,
1721
+ method: 'POST',
1722
+ suffix: `/namespaces/${params.namespace}/tables/${params.name}`,
1723
+ body: {
1724
+ requirements: expected_snapshot_id > 0n
1725
+ ? [
1726
+ {
1727
+ type: 'assert-ref-snapshot-id',
1728
+ ref: 'main',
1729
+ 'snapshot-id': expected_snapshot_id,
1730
+ },
1731
+ ]
1732
+ : [],
1733
+ updates,
1734
+ },
1735
+ });
1736
+ return {
1737
+ result,
1738
+ retriesNeeded: try_count,
1739
+ parentSnapshotId,
1740
+ snapshotId,
1741
+ sequenceNumber,
1742
+ };
1743
+ }
1744
+ catch (e) {
1745
+ if (e instanceof IcebergHttpError &&
1746
+ e.status === 409 &&
1747
+ try_count < retry_max) {
1748
+ // retry case
1749
+ removeSnapshotId = 0n;
1750
+ }
1751
+ else {
1752
+ throw e;
1753
+ }
1754
+ }
1755
+ // we do a merge in the append-only simultaneous case
1756
+ const conflict_metadata = await getMetadata(params);
1757
+ const conflict_snapshot_id = BigInt(conflict_metadata['current-snapshot-id']);
1758
+ if (conflict_snapshot_id <= 0n) {
1759
+ throw new Error('conflict');
1760
+ }
1761
+ conflict_snap = conflict_metadata.snapshots.find((s) => s['snapshot-id'] === conflict_snapshot_id);
1762
+ if (!conflict_snap) {
1763
+ throw new Error('conflict');
1764
+ }
1765
+ if (conflict_snap.summary.operation === 'append' &&
1766
+ BigInt(conflict_snap['sequence-number']) === sequenceNumber) {
1767
+ expected_snapshot_id = conflict_snapshot_id;
1768
+ sequenceNumber++;
1769
+ }
1770
+ else {
1771
+ throw new Error('conflict');
1772
+ }
1773
+ }
1774
+ }
1557
1775
  async function setCurrentCommit(params) {
1558
1776
  const commit_result = await icebergRequest({
1559
1777
  credentials: params.credentials,
@@ -1574,6 +1792,345 @@ async function setCurrentCommit(params) {
1574
1792
  });
1575
1793
  return commit_result;
1576
1794
  }
1795
+
1796
+ async function* asyncIterMap(items, func) {
1797
+ const pending = new Set();
1798
+ for (const item of items) {
1799
+ const ref = {};
1800
+ const wrapper = func(item).then((value) => ({
1801
+ self: ref.current,
1802
+ value,
1803
+ }));
1804
+ ref.current = wrapper;
1805
+ pending.add(wrapper);
1806
+ }
1807
+ while (pending.size) {
1808
+ const { self, value } = await Promise.race(pending);
1809
+ if (self) {
1810
+ pending.delete(self);
1811
+ }
1812
+ yield value;
1813
+ }
1814
+ }
1815
+
1816
+ async function manifestCompact(params) {
1817
+ const { credentials, targetCount, calculateWeight } = params;
1818
+ const region = params.tableBucketARN.split(':')[3];
1819
+ if (!region) {
1820
+ throw new Error('bad tableBucketARN');
1821
+ }
1822
+ const snapshot_id = params.snapshotId ?? _randomBigInt64();
1823
+ const metadata = await getMetadata(params);
1824
+ const bucket = metadata.location.split('/').slice(-1)[0];
1825
+ const parent_snapshot_id = BigInt(metadata['current-snapshot-id']);
1826
+ const snapshot = metadata.snapshots.find((s) => BigInt(s['snapshot-id']) === parent_snapshot_id) ?? null;
1827
+ if (!bucket) {
1828
+ throw new Error('bad manifest location');
1829
+ }
1830
+ if (!snapshot) {
1831
+ return {
1832
+ result: {},
1833
+ retriesNeeded: 0,
1834
+ parentSnapshotId: parent_snapshot_id,
1835
+ snapshotId: 0n,
1836
+ sequenceNumber: 0n,
1837
+ changed: false,
1838
+ outputManifestCount: 0,
1839
+ };
1840
+ }
1841
+ if (parent_snapshot_id <= 0n) {
1842
+ throw new Error('no old snapshot');
1843
+ }
1844
+ const old_list_key = parseS3Url(snapshot['manifest-list']).key;
1845
+ if (!old_list_key) {
1846
+ throw new Error('last snapshot invalid');
1847
+ }
1848
+ const sequence_number = BigInt(metadata['last-sequence-number']) + 1n;
1849
+ let remove_snapshot_id = 0n;
1850
+ if (params.maxSnapshots && metadata.snapshots.length >= params.maxSnapshots) {
1851
+ let earliest_time = 0;
1852
+ for (const snap of metadata.snapshots) {
1853
+ const snap_time = snap['timestamp-ms'];
1854
+ if (earliest_time === 0 || snap_time < earliest_time) {
1855
+ earliest_time = snap_time;
1856
+ remove_snapshot_id = BigInt(snap['snapshot-id']);
1857
+ }
1858
+ }
1859
+ }
1860
+ const list = await downloadAvro({
1861
+ credentials,
1862
+ region,
1863
+ bucket,
1864
+ key: old_list_key,
1865
+ avroSchema: ManifestListSchema,
1866
+ });
1867
+ const filtered = list.filter(_filterDeletes);
1868
+ const groups = _groupList(filtered, (a, b) => {
1869
+ if (a.content === ListContent.DATA &&
1870
+ b.content === ListContent.DATA &&
1871
+ a.deleted_files_count === 0 &&
1872
+ b.deleted_files_count === 0 &&
1873
+ a.partition_spec_id === b.partition_spec_id) {
1874
+ return (!a.partitions ||
1875
+ a.partitions.every((part, i) => {
1876
+ const other = b.partitions?.[i];
1877
+ return (other &&
1878
+ (part.upper_bound === other.upper_bound ||
1879
+ (part.upper_bound &&
1880
+ other.upper_bound &&
1881
+ Buffer.compare(part.upper_bound, other.upper_bound) === 0)) &&
1882
+ (part.lower_bound === other.lower_bound ||
1883
+ (part.lower_bound &&
1884
+ other.lower_bound &&
1885
+ Buffer.compare(part.lower_bound, other.lower_bound) === 0)));
1886
+ }));
1887
+ }
1888
+ return false;
1889
+ });
1890
+ const final_groups = targetCount !== undefined &&
1891
+ calculateWeight !== undefined &&
1892
+ groups.length > targetCount
1893
+ ? _combineWeightGroups(groups, targetCount, calculateWeight)
1894
+ : groups;
1895
+ if (final_groups.length === 0 ||
1896
+ (final_groups.length === list.length && !params.forceRewrite)) {
1897
+ return {
1898
+ result: {},
1899
+ retriesNeeded: 0,
1900
+ parentSnapshotId: parent_snapshot_id,
1901
+ snapshotId: 0n,
1902
+ sequenceNumber: sequence_number,
1903
+ changed: false,
1904
+ outputManifestCount: 0,
1905
+ };
1906
+ }
1907
+ const manifest_list_key = `metadata/${node_crypto.randomUUID()}.avro`;
1908
+ const iter = asyncIterMap(final_groups, async (group) => {
1909
+ if (!group[0]) {
1910
+ return [];
1911
+ }
1912
+ const { partition_spec_id } = group[0];
1913
+ const spec = metadata['partition-specs'].find((p) => p['spec-id'] === partition_spec_id);
1914
+ if (!spec) {
1915
+ throw new Error(`Partition spec not found: ${partition_spec_id}`);
1916
+ }
1917
+ return _combineGroup({
1918
+ credentials,
1919
+ region,
1920
+ bucket,
1921
+ group,
1922
+ spec,
1923
+ snapshotId: snapshot_id,
1924
+ schemas: metadata.schemas,
1925
+ sequenceNumber: sequence_number,
1926
+ forceRewrite: params.forceRewrite ?? false,
1927
+ });
1928
+ });
1929
+ await streamWriteAvro({
1930
+ credentials,
1931
+ region,
1932
+ bucket,
1933
+ key: manifest_list_key,
1934
+ metadata: {
1935
+ 'sequence-number': String(sequence_number),
1936
+ 'snapshot-id': String(snapshot_id),
1937
+ 'parent-snapshot-id': String(parent_snapshot_id),
1938
+ },
1939
+ avroType: ManifestListType,
1940
+ iter,
1941
+ });
1942
+ const summary = {
1943
+ operation: 'replace',
1944
+ 'added-data-files': '0',
1945
+ 'deleted-data-files': '0',
1946
+ 'added-records': '0',
1947
+ 'deleted-records': '0',
1948
+ 'added-files-size': '0',
1949
+ 'removed-files-size': '0',
1950
+ 'changed-partition-count': '0',
1951
+ };
1952
+ const snap_result = await submitSnapshot({
1953
+ credentials,
1954
+ tableBucketARN: params.tableBucketARN,
1955
+ namespace: params.namespace,
1956
+ name: params.name,
1957
+ currentSchemaId: metadata['current-schema-id'],
1958
+ parentSnapshotId: parent_snapshot_id,
1959
+ snapshotId: snapshot_id,
1960
+ sequenceNumber: sequence_number,
1961
+ manifestListUrl: `s3://${bucket}/${manifest_list_key}`,
1962
+ summary,
1963
+ removeSnapshotId: remove_snapshot_id,
1964
+ retryCount: params.retryCount,
1965
+ });
1966
+ return {
1967
+ ...snap_result,
1968
+ changed: true,
1969
+ outputManifestCount: final_groups.length,
1970
+ };
1971
+ }
1972
+ async function _combineGroup(params) {
1973
+ const { credentials, region, bucket, group } = params;
1974
+ const record0 = group[0];
1975
+ if ((group.length === 1 && !params.forceRewrite) || !record0) {
1976
+ return group;
1977
+ }
1978
+ const key = `metadata/${node_crypto.randomUUID()}.avro`;
1979
+ const schema = makeManifestSchema(params.spec, params.schemas, true);
1980
+ const type = makeManifestType(params.spec, params.schemas, true);
1981
+ const iter = asyncIterMap(group, async (record) => {
1982
+ return _streamReadManifest({
1983
+ credentials,
1984
+ region,
1985
+ bucket,
1986
+ url: record.manifest_path,
1987
+ schema,
1988
+ });
1989
+ });
1990
+ const manifest_length = await streamWriteAvro({
1991
+ credentials,
1992
+ region,
1993
+ bucket,
1994
+ key,
1995
+ metadata: {
1996
+ 'partition-spec-id': String(params.spec['spec-id']),
1997
+ 'partition-spec': JSON.stringify(params.spec.fields),
1998
+ },
1999
+ avroType: type,
2000
+ iter,
2001
+ });
2002
+ const ret = {
2003
+ manifest_path: `s3://${bucket}/${key}`,
2004
+ manifest_length: BigInt(manifest_length),
2005
+ partition_spec_id: record0.partition_spec_id,
2006
+ content: record0.content,
2007
+ sequence_number: params.sequenceNumber,
2008
+ min_sequence_number: params.sequenceNumber,
2009
+ added_snapshot_id: params.snapshotId,
2010
+ added_files_count: 0,
2011
+ existing_files_count: 0,
2012
+ deleted_files_count: 0,
2013
+ added_rows_count: 0n,
2014
+ existing_rows_count: 0n,
2015
+ deleted_rows_count: 0n,
2016
+ partitions: record0.partitions ?? null,
2017
+ };
2018
+ for (const record of group) {
2019
+ ret.added_files_count += record.added_files_count;
2020
+ ret.existing_files_count += record.existing_files_count;
2021
+ ret.deleted_files_count += record.deleted_files_count;
2022
+ ret.added_rows_count += record.added_rows_count;
2023
+ ret.existing_rows_count += record.existing_rows_count;
2024
+ ret.deleted_rows_count += record.deleted_rows_count;
2025
+ ret.min_sequence_number = _bigintMin(ret.min_sequence_number, record.min_sequence_number);
2026
+ }
2027
+ for (let i = 1; i < group.length; i++) {
2028
+ const parts = group[i]?.partitions;
2029
+ if (ret.partitions && parts) {
2030
+ for (let j = 0; j < parts.length; j++) {
2031
+ const part = parts[j];
2032
+ const ret_part = ret.partitions[j];
2033
+ if (part && ret_part) {
2034
+ ret_part.contains_null ||= part.contains_null;
2035
+ if (part.contains_nan !== undefined) {
2036
+ ret_part.contains_nan =
2037
+ (ret_part.contains_nan ?? false) || part.contains_nan;
2038
+ }
2039
+ if (!ret_part.upper_bound ||
2040
+ (part.upper_bound &&
2041
+ Buffer.compare(part.upper_bound, ret_part.upper_bound) > 0)) {
2042
+ ret_part.upper_bound = part.upper_bound ?? null;
2043
+ }
2044
+ if (!ret_part.lower_bound ||
2045
+ (part.lower_bound &&
2046
+ Buffer.compare(part.lower_bound, ret_part.lower_bound) < 0)) {
2047
+ ret_part.lower_bound = part.lower_bound ?? null;
2048
+ }
2049
+ }
2050
+ }
2051
+ }
2052
+ else if (parts) {
2053
+ ret.partitions = parts;
2054
+ }
2055
+ }
2056
+ return [ret];
2057
+ }
2058
+ async function _streamReadManifest(params) {
2059
+ let bucket = params.bucket;
2060
+ let key = params.url;
2061
+ if (params.url.startsWith('s3://')) {
2062
+ const parsed = parseS3Url(params.url);
2063
+ bucket = parsed.bucket;
2064
+ key = parsed.key;
2065
+ }
2066
+ if (!bucket || !key) {
2067
+ throw new Error(`invalid manfiest url: ${params.url}`);
2068
+ }
2069
+ const results = await downloadAvro({
2070
+ credentials: params.credentials,
2071
+ region: params.region,
2072
+ bucket,
2073
+ key,
2074
+ avroSchema: params.schema,
2075
+ });
2076
+ return results;
2077
+ }
2078
+ function _filterDeletes(record) {
2079
+ return (record.content !== ListContent.DATA ||
2080
+ record.added_files_count > 0 ||
2081
+ record.existing_files_count > 0);
2082
+ }
2083
+ function _groupList(list, compare) {
2084
+ const ret = [];
2085
+ for (const item of list) {
2086
+ let added = false;
2087
+ for (const group of ret) {
2088
+ if (group[0] && compare(group[0], item)) {
2089
+ group.push(item);
2090
+ added = true;
2091
+ break;
2092
+ }
2093
+ }
2094
+ if (!added) {
2095
+ ret.push([item]);
2096
+ }
2097
+ }
2098
+ return ret;
2099
+ }
2100
+ function _combineWeightGroups(groups, targetCount, calculateWeight) {
2101
+ const weighted_groups = groups.map((group) => ({
2102
+ group,
2103
+ weight: calculateWeight(group),
2104
+ }));
2105
+ weighted_groups.sort(_sortGroup);
2106
+ while (weighted_groups.length > targetCount) {
2107
+ let remove_item;
2108
+ let merge_item;
2109
+ for (let i = 0; i < weighted_groups.length; i++) {
2110
+ const check_item = weighted_groups[i];
2111
+ const partition_spec_id = check_item?.group[0]?.partition_spec_id;
2112
+ if (partition_spec_id !== undefined) {
2113
+ for (let j = i + 1; j < weighted_groups.length; j++) {
2114
+ merge_item = weighted_groups[j];
2115
+ if (merge_item?.group[0]?.partition_spec_id === partition_spec_id) {
2116
+ remove_item = weighted_groups.splice(i, 1)[0];
2117
+ break;
2118
+ }
2119
+ }
2120
+ }
2121
+ }
2122
+ if (!remove_item || !merge_item) {
2123
+ break;
2124
+ }
2125
+ for (const item of remove_item.group) {
2126
+ merge_item.group.push(item);
2127
+ }
2128
+ }
2129
+ return weighted_groups.map((g) => g.group);
2130
+ }
2131
+ function _sortGroup(a, b) {
2132
+ return a.weight - b.weight;
2133
+ }
1577
2134
  function _randomBigInt64() {
1578
2135
  const bytes = node_crypto.randomBytes(8);
1579
2136
  let ret = bytes.readBigUInt64BE();
@@ -1583,6 +2140,15 @@ function _randomBigInt64() {
1583
2140
  }
1584
2141
  return ret;
1585
2142
  }
2143
+ function _bigintMin(value0, ...values) {
2144
+ let ret = value0;
2145
+ for (const val of values) {
2146
+ if (val < ret) {
2147
+ ret = val;
2148
+ }
2149
+ }
2150
+ return ret;
2151
+ }
1586
2152
 
1587
2153
  var index = {
1588
2154
  IcebergHttpError,
@@ -1602,5 +2168,7 @@ exports.addPartitionSpec = addPartitionSpec;
1602
2168
  exports.addSchema = addSchema;
1603
2169
  exports.default = index;
1604
2170
  exports.getMetadata = getMetadata;
2171
+ exports.manifestCompact = manifestCompact;
1605
2172
  exports.removeSnapshots = removeSnapshots;
1606
2173
  exports.setCurrentCommit = setCurrentCommit;
2174
+ exports.submitSnapshot = submitSnapshot;
package/package.json CHANGED
@@ -1,15 +1,18 @@
1
1
  {
2
2
  "name": "node-s3tables",
3
- "version": "0.0.16",
3
+ "version": "0.0.18",
4
4
  "description": "node api for dealing with s3tables",
5
5
  "main": "dist/index.js",
6
6
  "types": "dist/index.d.ts",
7
+ "bin": {
8
+ "node-s3tables": "dist/bin-compact.js"
9
+ },
7
10
  "files": [
8
11
  "dist/**/*"
9
12
  ],
10
13
  "scripts": {
11
14
  "build": "rollup -c",
12
- "ts-check": "tsc --noEmit",
15
+ "ts:check": "tsc --noEmit",
13
16
  "lint": "eslint src test",
14
17
  "pretty": "prettier -u --write \"**/*\" --log-level warn",
15
18
  "test": "dotenv -- tsx --test test/*.test.ts",