@gscdump/engine 0.27.2 → 0.28.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/THIRD-PARTY-LICENSES.md +443 -0
- package/dist/_chunks/engine.mjs +26 -12
- package/dist/_chunks/libs/chokidar.d.mts +1 -0
- package/dist/_chunks/libs/db0.d.mts +1 -0
- package/dist/_chunks/libs/denque.d.mts +1 -0
- package/dist/_chunks/libs/fzstd.mjs +545 -0
- package/dist/_chunks/libs/hyparquet-compressors.mjs +2796 -0
- package/dist/_chunks/libs/icebird.d.mts +441 -0
- package/dist/_chunks/libs/icebird.mjs +3708 -0
- package/dist/_chunks/libs/ioredis.d.mts +1 -0
- package/dist/_chunks/libs/lru-cache.d.mts +1 -0
- package/dist/_chunks/libs/unstorage.d.mts +120 -0
- package/dist/_chunks/sink.d.mts +62 -11
- package/dist/_chunks/storage.d.mts +33 -1
- package/dist/iceberg/index.d.mts +3 -2
- package/dist/iceberg/index.mjs +169 -17
- package/dist/index.d.mts +21 -2
- package/dist/index.mjs +27 -1
- package/dist/vendor/hysnappy-purejs.mjs +1 -12
- package/package.json +7 -6
- package/LICENSE +0 -21
|
@@ -0,0 +1,3708 @@
|
|
|
1
|
+
import { gunzip } from "./hyparquet-compressors.mjs";
|
|
2
|
+
import { asyncBufferFromUrl, cachedAsyncBuffer } from "hyparquet";
|
|
3
|
+
import { ByteWriter, parquetWrite } from "hyparquet-writer";
|
|
4
|
+
import { parseDecimal } from "hyparquet/src/convert.js";
|
|
5
|
+
/**
 * Read one variable-length, zig-zag encoded integer from the reader.
 *
 * Bytes are consumed 7 payload bits at a time (low bits first) until a byte
 * without the continuation bit (0x80) is seen. Arithmetic is done with 32-bit
 * bitwise operators, so the result is a signed 32-bit value.
 *
 * @param {{view: DataView, offset: number}} reader - cursor; `offset` is advanced past the consumed bytes
 * @returns {number} the decoded signed integer
 */
function readZigZag(reader) {
  let accumulated = 0;
  for (let bitOffset = 0; ; bitOffset += 7) {
    const octet = reader.view.getUint8(reader.offset++);
    accumulated |= (octet & 127) << bitOffset;
    if ((octet & 128) === 0) {
      // Undo the zig-zag mapping: even -> n / 2, odd -> -(n + 1) / 2.
      return (accumulated >>> 1) ^ -(accumulated & 1);
    }
  }
}
|
|
15
|
+
/**
 * BigInt variant of the zig-zag varint decoder, for values that do not fit
 * in 32 bits.
 *
 * @param {{view: DataView, offset: number}} reader - cursor; `offset` is advanced past the consumed bytes
 * @returns {bigint} the decoded signed integer
 */
function readZigZagBigInt(reader) {
  let accumulated = 0n;
  for (let bitOffset = 0n; ; bitOffset += 7n) {
    const octet = reader.view.getUint8(reader.offset++);
    accumulated |= BigInt(octet & 127) << bitOffset;
    if ((octet & 128) === 0) {
      return (accumulated >> 1n) ^ -(accumulated & 1n);
    }
  }
}
|
|
25
|
+
/**
 * Read a length-prefixed string: a zig-zag varint byte count followed by
 * that many bytes, decoded with the default TextDecoder (UTF-8).
 *
 * @param {{view: DataView, offset: number}} reader - cursor; advanced past the prefix and the payload
 * @returns {string} the decoded text
 */
function readAvroString(reader) {
  const byteCount = readZigZag(reader);
  const payloadStart = reader.view.byteOffset + reader.offset;
  const payload = new Uint8Array(reader.view.buffer, payloadStart, byteCount);
  reader.offset += byteCount;
  const decoder = new TextDecoder();
  return decoder.decode(payload);
}
|
|
31
|
+
/**
 * Parse the header of an Avro container: magic bytes, the string-to-string
 * metadata map, and the 16-byte sync marker that follows it.
 *
 * The `avro.schema` entry is always JSON-parsed; `schema` and
 * `iceberg.schema` are JSON-parsed when present.
 *
 * @param {{view: DataView, offset: number}} reader - cursor positioned at the start of the file; advanced past the header
 * @returns {{metadata: Object, syncMarker: Uint8Array}} parsed metadata and a view over the sync marker bytes
 * @throws {Error} when the first four bytes are not the Avro magic
 */
function avroMetadata(reader) {
  // 0x4F626A01: the bytes "O", "b", "j", 0x01 read as a big-endian uint32.
  const AVRO_MAGIC = 1331849729;
  if (reader.view.getUint32(reader.offset) !== AVRO_MAGIC) throw new Error("avro invalid magic bytes");
  reader.offset += 4;

  const metadata = {};
  for (let entries = readZigZag(reader); entries !== 0; entries = readZigZag(reader)) {
    if (entries < 0) {
      // A negative count is followed by one extra varint, which is read and discarded.
      entries = -entries;
      readZigZag(reader);
    }
    for (let n = 0; n < entries; n++) {
      const key = readAvroString(reader);
      metadata[key] = readAvroString(reader);
    }
  }

  metadata["avro.schema"] = JSON.parse(metadata["avro.schema"]);
  for (const optionalKey of ["schema", "iceberg.schema"]) {
    if (metadata[optionalKey]) metadata[optionalKey] = JSON.parse(metadata[optionalKey]);
  }

  const markerStart = reader.view.byteOffset + reader.offset;
  const syncMarker = new Uint8Array(reader.view.buffer, markerStart, 16);
  reader.offset += 16;
  return { metadata, syncMarker };
}
|
|
57
|
+
/**
 * Decode every data block that follows an Avro container header and return
 * the records as plain objects.
 *
 * Each block is: record count, byte size, payload, 16-byte sync marker.
 * The payload is decompressed according to `metadata["avro.codec"]`; a
 * missing codec entry is treated as "null" (uncompressed), as the Avro
 * specification requires.
 *
 * @param {Object} args
 * @param {{view: DataView, offset: number}} args.reader - cursor positioned just after the header
 * @param {Object} args.metadata - header metadata with a parsed `avro.schema`
 * @param {Uint8Array} args.syncMarker - marker from the header; every block must end with the same bytes
 * @returns {Object[]} decoded records in file order
 * @throws {Error} on a sync marker mismatch or an unsupported codec
 */
function avroRead({ reader, metadata, syncMarker }) {
  const records = [];
  while (reader.offset < reader.view.byteLength) {
    let recordCount = readZigZag(reader);
    if (recordCount === 0) break;
    if (recordCount < 0) recordCount = -recordCount;

    const blockSize = readZigZag(reader);
    let data = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, blockSize);
    reader.offset += blockSize;

    const blockSync = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, 16);
    reader.offset += 16;
    for (let i = 0; i < 16; i++) {
      if (blockSync[i] !== syncMarker[i]) throw new Error("sync marker does not match");
    }

    // An absent "avro.codec" means the blocks are stored uncompressed.
    const codec = metadata["avro.codec"] ?? "null";
    if (codec === "deflate") data = gunzip(data);
    else if (codec !== "null") throw new Error(`unsupported codec: ${codec}`);

    const { fields } = metadata["avro.schema"];
    const dataReader = {
      view: new DataView(data.buffer, data.byteOffset, data.byteLength),
      offset: 0
    };
    for (let i = 0; i < recordCount; i++) {
      const record = {};
      for (const field of fields) record[field.name] = readType(dataReader, field.type);
      records.push(record);
    }
  }
  return records;
}
|
|
88
|
+
/**
 * Decode a single value of the given Avro schema type from the reader.
 *
 * Supported schemas, checked in this order:
 *  - "null" (returns undefined) and unions (array of schemas, selected by a varint index)
 *  - object schemas: record, array, logical types (date, time-millis,
 *    time-micros, timestamp-millis/micros/nanos, decimal, fixed-16 uuid),
 *    fixed, and `{ type: "<name>" }` wrappers, which are decoded as the
 *    named type
 *  - primitives: boolean, int, long (as bigint), float, double, bytes, string
 *
 * An unknown logical type logs a warning and is decoded as its underlying type.
 *
 * @param {{view: DataView, offset: number}} reader - cursor; advanced past the decoded value
 * @param {*} type - Avro schema: a type name, a union array, or a schema object
 * @returns {*} the decoded value
 * @throws {Error} when the schema is not supported
 */
function readType(reader, type) {
  if (type === "null") return;
  if (Array.isArray(type)) return readType(reader, type[readZigZag(reader)]);

  if (typeof type === "object" && type !== null) {
    if (type.type === "record") {
      const record = {};
      for (const subField of type.fields) record[subField.name] = readType(reader, subField.type);
      return record;
    }

    if (type.type === "array") {
      const items = [];
      while (true) {
        let count = readZigZag(reader);
        if (count === 0) break;
        if (count < 0) {
          // A negative count is followed by one extra varint, which is read and discarded.
          count = -count;
          readZigZag(reader);
        }
        for (let i = 0; i < count; i++) items.push(readType(reader, type.items));
      }
      return items;
    }

    if (type.logicalType) {
      const { logicalType, type: baseType } = type;
      // 864e5 = milliseconds per day: the stored int is multiplied up to a Date timestamp.
      if (logicalType === "date" && baseType === "int") return new Date(readZigZag(reader) * 864e5);
      if (logicalType === "time-millis" && baseType === "int") return readZigZag(reader);
      if (logicalType === "time-micros" && baseType === "long") return readZigZagBigInt(reader);
      if (logicalType === "timestamp-millis" && baseType === "long") return new Date(Number(readZigZagBigInt(reader)));
      if (logicalType === "timestamp-micros" && baseType === "long") return new Date(Number(readZigZagBigInt(reader) / 1000n));
      if (logicalType === "timestamp-nanos" && baseType === "long") return new Date(Number(readZigZagBigInt(reader) / 1000000n));
      if (logicalType === "decimal" && "precision" in type) {
        const unscaled = baseType === "fixed" ? readFixed(reader, type.size) : readType(reader, baseType);
        const factor = 10 ** -(type.scale || 0);
        return parseDecimal(unscaled) * factor;
      }
      if (logicalType === "uuid" && baseType === "fixed" && type.size === 16) return bytesToUuid(readFixed(reader, 16));
      console.warn(`unknown logical type: ${logicalType}`);
      return baseType === "fixed" ? readFixed(reader, type.size) : readType(reader, baseType);
    }

    if (type.type === "fixed") return readFixed(reader, type.size);
    // `{ "type": "string" }` is a valid way to spell a primitive schema.
    if (typeof type.type === "string") return readType(reader, type.type);
    throw new Error(`unsupported type: ${JSON.stringify(type)}`);
  }

  switch (type) {
    case "boolean": {
      const value = reader.view.getUint8(reader.offset) === 1;
      reader.offset++;
      return value;
    }
    case "int":
      return readZigZag(reader);
    case "long":
      return readZigZagBigInt(reader);
    case "float": {
      const value = reader.view.getFloat32(reader.offset, true);
      reader.offset += 4;
      return value;
    }
    case "double": {
      const value = reader.view.getFloat64(reader.offset, true);
      reader.offset += 8;
      return value;
    }
    case "bytes": {
      const length = readZigZag(reader);
      const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length);
      reader.offset += length;
      return bytes;
    }
    case "string": {
      const length = readZigZag(reader);
      const bytes = new Uint8Array(reader.view.buffer, reader.view.byteOffset + reader.offset, length);
      const text = new TextDecoder().decode(bytes);
      reader.offset += length;
      return text;
    }
    default:
      throw new Error(`unsupported type: ${type}`);
  }
}
|
|
158
|
+
/**
 * Return a view over the next `size` bytes and advance the reader past them.
 * The returned array shares memory with the reader's buffer (no copy).
 *
 * @param {{view: DataView, offset: number}} reader - cursor; `offset` is increased by `size`
 * @param {number} size - number of bytes to take
 * @returns {Uint8Array} view over the consumed bytes
 */
function readFixed(reader, size) {
  const { view } = reader;
  const chunk = new Uint8Array(view.buffer, view.byteOffset + reader.offset, size);
  reader.offset += size;
  return chunk;
}
|
|
163
|
+
/**
 * Format the first 16 bytes as a lowercase, dash-separated UUID string
 * (8-4-4-4-12 hex digits).
 *
 * @param {ArrayLike<number>} bytes - at least 16 byte values
 * @returns {string} the UUID text
 */
function bytesToUuid(bytes) {
  const pairs = [];
  for (let idx = 0; idx < 16; idx++) {
    pairs.push(bytes[idx].toString(16).padStart(2, "0"));
  }
  const group = (from, to) => pairs.slice(from, to).join("");
  return [group(0, 4), group(4, 6), group(6, 8), group(8, 10), group(10, 16)].join("-");
}
|
|
171
|
+
/**
 * Rewrite a name so it contains only ASCII letters, digits and underscores
 * and does not start with a digit.
 *
 * - A leading digit gets an underscore prefix.
 * - Any other disallowed character becomes `_x` plus its UTF-16 code unit in
 *   upper-case hex.
 *
 * @param {string} name - the raw name
 * @returns {string} the sanitized name
 */
function sanitize(name) {
  const escapeChar = (ch) => "_x" + ch.charCodeAt(0).toString(16).toUpperCase();
  let out = "";
  for (let idx = 0; idx < name.length; idx++) {
    const ch = name.charAt(idx);
    if (/^[A-Za-z]$/.test(ch) || ch === "_") {
      out += ch;
    } else if (/^[0-9]$/.test(ch)) {
      out += idx === 0 ? "_" + ch : ch;
    } else {
      out += escapeChar(ch);
    }
  }
  return out;
}
|
|
184
|
+
/**
 * Render bytes as a lowercase hex string, two digits per byte.
 *
 * @param {ArrayLike<number>} bytes - byte values to encode
 * @returns {string} hex text, empty for empty input
 */
function bytesToHex$1(bytes) {
  const pairs = [];
  for (let idx = 0; idx < bytes.length; idx++) {
    pairs.push(bytes[idx].toString(16).padStart(2, "0"));
  }
  return pairs.join("");
}
|
|
189
|
+
/**
 * Generate a version-4 style UUID string.
 *
 * Uses `crypto.randomUUID()` when the runtime provides it; otherwise fills a
 * template with `Math.random()` digits (not cryptographically secure).
 *
 * @returns {string} a UUID in 8-4-4-4-12 form
 */
function uuid4() {
  const cryptoApi = globalThis.crypto;
  if (cryptoApi?.randomUUID) return cryptoApi.randomUUID();

  const template = "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx";
  return template.replace(/[xy]/g, (placeholder) => {
    const nibble = Math.random() * 16 | 0;
    // "y" positions are forced into the 8..b range.
    const digit = placeholder === "x" ? nibble : nibble & 3 | 8;
    return digit.toString(16);
  });
}
|
|
196
|
+
/**
 * Convert an `s3://bucket/key` or `s3a://bucket/key` URL into the
 * `https://bucket.s3.amazonaws.com/key` form. Other URLs pass through
 * untouched.
 *
 * @param {string} url - the URL to translate
 * @returns {string} an https URL for S3 inputs, otherwise the input
 * @throws {Error} when an S3 URL has no "/" after the bucket name
 */
function translateS3Url(url) {
  const isS3Scheme = url.startsWith("s3a://") || url.startsWith("s3://");
  if (!isS3Scheme) return url;

  const afterScheme = url.slice(url.indexOf("://") + 3);
  const firstSlash = afterScheme.indexOf("/");
  if (firstSlash === -1) throw new Error("Invalid S3 URL, missing \"/\" after bucket");

  const bucket = afterScheme.slice(0, firstSlash);
  const keyPath = afterScheme.slice(firstSlash);
  return `https://${bucket}.s3.amazonaws.com${keyPath}`;
}
|
|
205
|
+
/**
 * Build a resolver that reads, writes and deletes objects over HTTP(S).
 * `s3://` and `s3a://` URLs are translated with `translateS3Url` first.
 *
 * @param {Object} [options]
 * @param {RequestInit} [options.requestInit] - fetch options applied to every request
 * @returns {{reader: Function, writer: Function, deleter: Function}}
 *   - `reader(url, byteLength)` returns the result of `asyncBufferFromUrl`
 *   - `writer(url, options)` returns a `ByteWriter` whose `finish()` PUTs the
 *     buffered bytes; `options.ifNoneMatch` is sent as `If-None-Match`
 *   - `deleter(url)` issues a DELETE and treats 404 as success
 */
function urlResolver({ requestInit } = {}) {
  // Flatten caller headers into a plain object and add the conditional-write header.
  function buildPutHeaders(options) {
    const headers = {};
    if (requestInit?.headers) {
      for (const [name, value] of new Headers(requestInit.headers)) headers[name] = value;
    }
    if (options?.ifNoneMatch) headers["If-None-Match"] = options.ifNoneMatch;
    return headers;
  }

  return {
    reader(url, byteLength) {
      const target = translateS3Url(url);
      return asyncBufferFromUrl({ url: target, byteLength, requestInit });
    },

    writer(url, options) {
      const sink = new ByteWriter();
      sink.finish = async function() {
        const target = translateS3Url(url);
        // Copy so the request body is detached from the writer's internal buffer.
        const body = sink.getBytes().slice();
        const headers = buildPutHeaders(options);
        const response = await fetch(target, { ...requestInit, method: "PUT", headers, body });
        if (response.ok) return;
        const failure = /* @__PURE__ */ new Error(`PUT ${url}: ${response.status} ${response.statusText}`);
        failure.status = response.status;
        throw failure;
      };
      return sink;
    },

    async deleter(url) {
      const response = await fetch(translateS3Url(url), { ...requestInit, method: "DELETE" });
      const tolerated = response.ok || response.status === 404;
      if (!tolerated) throw new Error(`DELETE ${url}: ${response.status} ${response.statusText}`);
    }
  };
}
|
|
247
|
+
/**
 * Wrap a resolver so that reads of the same path share one cached buffer.
 *
 * - `reader` caches the pending promise per path; a rejected read is evicted
 *   so a later call retries.
 * - `writer` and `deleter` are only exposed when `base` has them, and evict
 *   the path from the cache after the underlying operation succeeds.
 *
 * @param {{reader: Function, writer?: Function, deleter?: Function}} base - resolver to wrap
 * @returns {{reader: Function, writer?: Function, deleter?: Function}} caching resolver
 */
function cachingResolver(base) {
  const pendingByPath = /* @__PURE__ */ new Map();

  const wrapped = {
    reader(path, byteLength) {
      const cached = pendingByPath.get(path);
      if (cached) return cached;

      const pending = (async () => cachedAsyncBuffer(await base.reader(path, byteLength)))();
      pendingByPath.set(path, pending);
      pending.catch(() => {
        // Only evict our own entry; a newer read may have replaced it.
        if (pendingByPath.get(path) === pending) pendingByPath.delete(path);
      });
      return pending;
    }
  };

  if (base.writer) {
    const innerWriter = base.writer;
    wrapped.writer = (path, options) => {
      const sink = innerWriter(path, options);
      const innerFinish = sink.finish.bind(sink);
      sink.finish = async function() {
        await innerFinish();
        pendingByPath.delete(path);
      };
      return sink;
    };
  }

  if (base.deleter) {
    const innerDeleter = base.deleter;
    wrapped.deleter = async (path) => {
      await innerDeleter(path);
      pendingByPath.delete(path);
    };
  }

  return wrapped;
}
|
|
281
|
+
/**
 * Build a function that lists the file names directly under an S3 prefix
 * using the ListObjectsV2 REST call (`list-type=2`, `delimiter=/`).
 *
 * All result pages are followed via `NextContinuationToken`, so prefixes with
 * more keys than fit in one response page are listed completely.
 *
 * @param {Object} [options]
 * @param {RequestInit} [options.requestInit] - fetch options applied to every list request
 * @returns {(url: string) => Promise<string[]>} lister returning the last path
 *   segment of each key; rejects when the URL is not an S3 URL, the request
 *   fails, or a `<Contents>` entry has no `<Key>`
 */
function s3Lister({ requestInit } = {}) {
  return async function list(url) {
    const s3parts = s3ParseUrl(url);
    if (!s3parts) throw new Error(`not an S3 URL: ${url}`);
    const { bucket, prefix } = s3parts;
    // NOTE(review): the prefix is interpolated unencoded, as before — confirm whether callers pass pre-encoded keys.
    const listUrl = `https://${bucket}.s3.amazonaws.com/?list-type=2&prefix=${prefix.replace(/\/$/, "")}/&delimiter=/`;

    const names = [];
    let continuationToken;
    do {
      const pageUrl = continuationToken === void 0
        ? listUrl
        : `${listUrl}&continuation-token=${encodeURIComponent(continuationToken)}`;
      const res = await fetch(pageUrl, requestInit);
      if (!res.ok) throw new Error(`${res.status} ${res.statusText}`);
      const xml = await res.text();

      for (const entry of xml.match(/<Contents>(.*?)<\/Contents>/gs) || []) {
        const keyMatch = entry.match(/<Key>(.*?)<\/Key>/);
        if (!keyMatch) throw new Error("failed to parse S3 list response");
        const name = keyMatch[1].split("/").pop() ?? "";
        if (name) names.push(name);
      }

      // Keep paging only while the service reports a truncated result with a token.
      const truncated = /<IsTruncated>\s*true\s*<\/IsTruncated>/.test(xml);
      const next = xml.match(/<NextContinuationToken>(.*?)<\/NextContinuationToken>/s);
      continuationToken = truncated && next ? next[1] : void 0;
    } while (continuationToken !== void 0);

    return names;
  };
}
|
|
296
|
+
/**
 * Split an S3 URL into bucket and key prefix.
 *
 * Recognized forms:
 *  - `s3://bucket/prefix` and `s3a://bucket/prefix`
 *  - `https://s3.amazonaws.com/bucket/prefix`
 *  - `https://bucket.s3[.-region].amazonaws.com/prefix`
 *
 * @param {string} url - the URL to inspect
 * @returns {{bucket: string, prefix: string}|undefined} the parts, or undefined for other URLs
 */
function s3ParseUrl(url) {
  // Take the bucket from one "/"-separated segment and join the rest as the prefix.
  const fromSegments = (bucketIndex) => {
    const segments = url.split("/");
    return {
      bucket: segments[bucketIndex],
      prefix: segments.slice(bucketIndex + 1).join("/")
    };
  };

  if (url.startsWith("s3://") || url.startsWith("s3a://")) return fromSegments(2);
  if (url.startsWith("https://s3.amazonaws.com/")) return fromSegments(3);

  const hosted = url.match(/^https:\/\/([a-z0-9][a-z0-9-]*)\.s3(?:[.-][a-z0-9-]+)?\.amazonaws\.com\/(.*)$/);
  if (hosted) {
    const [, bucket, prefix] = hosted;
    return { bucket, prefix };
  }
}
|
|
317
|
+
/**
 * Read a whole file through the resolver and decode it as text,
 * transparently gunzipping it when it starts with the gzip magic bytes.
 *
 * @param {{reader: Function}} resolver - resolver whose `reader(path)` yields an async buffer
 * @param {string} path - location of the file
 * @returns {Promise<string>} the decoded text
 */
async function resolveText(resolver, path) {
  const source = await resolver.reader(path);
  const raw = await source.slice(0, source.byteLength);
  const plain = isGzip(raw) ? await decompressGzip(raw) : raw;
  return new TextDecoder().decode(plain);
}
|
|
323
|
+
/**
 * Report whether a buffer begins with the gzip magic bytes 0x1f 0x8b.
 *
 * @param {ArrayBuffer} buf - the bytes to inspect
 * @returns {boolean} true when the buffer has at least two bytes and they match
 */
function isGzip(buf) {
  if (buf.byteLength < 2) return false;
  const [first, second] = new Uint8Array(buf, 0, 2);
  return first === 31 && second === 139;
}
|
|
328
|
+
/**
 * Gunzip a buffer using the platform `DecompressionStream`.
 *
 * @param {ArrayBuffer} buf - gzip-compressed bytes
 * @returns {Promise<ArrayBuffer>} the decompressed bytes
 * @throws {Error} when `DecompressionStream` is not available
 */
async function decompressGzip(buf) {
  if (!globalThis.DecompressionStream) throw new Error("gzip decompression is not supported in this environment");
  const compressed = new Blob([buf]).stream();
  const inflated = compressed.pipeThrough(new DecompressionStream("gzip"));
  const response = new Response(inflated);
  return await response.arrayBuffer();
}
|
|
333
|
+
/**
 * Load an Avro file through the resolver and decode all of its records.
 *
 * @param {string} url - location of the Avro file
 * @param {{reader: Function}} resolver - resolver used to read the file
 * @param {number} [byteLength] - file size hint; forwarded to the resolver only when finite
 * @returns {Promise<Object[]>} the decoded records
 */
async function fetchAvroRecords(url, resolver, byteLength) {
  const hasUsableHint = byteLength !== void 0 && Number.isFinite(byteLength);
  const source = await resolver.reader(url, hasUsableHint ? byteLength : void 0);
  const bytes = await source.slice(0, source.byteLength);
  const reader = { view: new DataView(bytes), offset: 0 };
  const header = await avroMetadata(reader);
  return await avroRead({
    reader,
    metadata: header.metadata,
    syncMarker: header.syncMarker
  });
}
|
|
348
|
+
// Number.MAX_SAFE_INTEGER as a BigInt, so it can be compared against BigInt values.
const MAX_SAFE = BigInt(Number.MAX_SAFE_INTEGER);
|
|
349
|
+
/**
 * Serialize a value to JSON, writing BigInt values as bare integer literals
 * (which `JSON.stringify` cannot do).
 *
 * Otherwise follows `JSON.stringify` conventions: object properties whose
 * value is `undefined`, a function or a symbol are omitted; such values (and
 * holes) inside arrays become `null`; objects with a `toJSON` method are
 * serialized through it.
 *
 * @param {*} value - the value to serialize
 * @param {number} [indent] - spaces per nesting level; falsy for compact output
 * @returns {string|undefined} JSON text, or undefined when `value` itself is not serializable
 */
function stringifyIcebergJson(value, indent) {
  const unit = indent ? " ".repeat(indent) : "";

  // Join already-serialized members, compactly or one per line.
  function wrap(open, close, parts, depth) {
    if (!unit) return open + parts.join(",") + close;
    const pad = unit.repeat(depth + 1);
    return `${open}\n${pad}${parts.join(`,\n${pad}`)}\n${unit.repeat(depth)}${close}`;
  }

  // Returns the JSON text for v, or undefined when v has no JSON representation.
  function emit(v, depth) {
    if (typeof v === "bigint") return v.toString();
    if (v === null) return "null";
    // Primitives: strings, numbers and booleans serialize; undefined, functions and symbols yield undefined.
    if (typeof v !== "object") return JSON.stringify(v);
    if (typeof v.toJSON === "function") return emit(v.toJSON(), depth);

    if (Array.isArray(v)) {
      if (v.length === 0) return "[]";
      // Array.from visits holes too, so they become null like other unserializable entries.
      const items = Array.from(v, (item) => emit(item, depth + 1) ?? "null");
      return wrap("[", "]", items, depth);
    }

    const members = [];
    for (const key of Object.keys(v)) {
      const text = emit(v[key], depth + 1);
      if (text !== void 0) members.push(JSON.stringify(key) + (unit ? ": " : ":") + text);
    }
    return members.length === 0 ? "{}" : wrap("{", "}", members, depth);
  }

  return emit(value, 0);
}
|
|
375
|
+
/**
 * Parse JSON text, returning integer literals that fall outside the safe
 * integer range as BigInt instead of losing precision.
 *
 * Integers within the safe range and all fractional or exponent numbers are
 * returned as regular numbers.
 *
 * @param {string} text - the JSON document
 * @returns {*} the parsed value
 * @throws {Error} on malformed input, including empty input, a missing or
 *   malformed number, a bad `\u` escape, an unterminated string, or trailing
 *   characters
 */
function parseIcebergJson(text) {
  const maxSafe = BigInt(Number.MAX_SAFE_INTEGER);
  let i = 0;

  const isDigit = (ch) => ch >= "0" && ch <= "9";

  function skipWs() {
    while (i < text.length) {
      const c = text.charCodeAt(i);
      // space, tab, line feed, carriage return
      if (c !== 32 && c !== 9 && c !== 10 && c !== 13) break;
      i++;
    }
  }

  function parseString() {
    if (text[i] !== "\"") throw new Error(`expected " at ${i}`);
    i++;
    let s = "";
    while (i < text.length) {
      const c = text[i++];
      if (c === "\"") return s;
      if (c !== "\\") {
        s += c;
        continue;
      }
      if (i >= text.length) break;
      const e = text[i++];
      if (e === "u") {
        const hex = text.slice(i, i + 4);
        if (!/^[0-9a-fA-F]{4}$/.test(hex)) throw new Error(`bad unicode escape at ${i}`);
        s += String.fromCharCode(Number.parseInt(hex, 16));
        i += 4;
      } else if (e === "n") s += "\n";
      else if (e === "t") s += "\t";
      else if (e === "r") s += "\r";
      else if (e === "b") s += "\b";
      else if (e === "f") s += "\f";
      // Covers \" \\ \/ ; any other escaped character is kept literally.
      else s += e;
    }
    throw new Error("unterminated string");
  }

  function parseNumber() {
    const start = i;
    if (text[i] === "-") i++;
    const digitsStart = i;
    while (isDigit(text[i])) i++;
    // Without this check "" and "-" would silently parse as 0 and NaN.
    if (i === digitsStart) throw new Error(`unexpected token at ${start}`);
    const intEnd = i;
    let isFloat = false;
    if (text[i] === ".") {
      isFloat = true;
      i++;
      while (isDigit(text[i])) i++;
    }
    if (text[i] === "e" || text[i] === "E") {
      isFloat = true;
      i++;
      if (text[i] === "+" || text[i] === "-") i++;
      while (isDigit(text[i])) i++;
    }
    if (isFloat) {
      const parsed = Number(text.slice(start, i));
      if (Number.isNaN(parsed)) throw new Error(`bad number at ${start}`);
      return parsed;
    }
    const intStr = text.slice(start, intEnd);
    // Only literals of 16+ characters can exceed the safe range; skip BigInt work for the rest.
    if (intStr.length >= 16) {
      const n = BigInt(intStr);
      if (n > maxSafe || n < -maxSafe) return n;
    }
    return Number(intStr);
  }

  function parseLiteral(lit, val) {
    if (text.slice(i, i + lit.length) !== lit) throw new Error(`bad literal at ${i}`);
    i += lit.length;
    return val;
  }

  function parseValue() {
    skipWs();
    const ch = text[i];
    if (ch === "\"") return parseString();
    if (ch === "{") return parseObject();
    if (ch === "[") return parseArray();
    if (ch === "t") return parseLiteral("true", true);
    if (ch === "f") return parseLiteral("false", false);
    if (ch === "n") return parseLiteral("null", null);
    return parseNumber();
  }

  function parseObject() {
    i++;
    skipWs();
    const obj = {};
    if (text[i] === "}") {
      i++;
      return obj;
    }
    while (true) {
      skipWs();
      const key = parseString();
      skipWs();
      if (text[i] !== ":") throw new Error(`expected : at ${i}`);
      i++;
      const val = parseValue();
      if (key === "__proto__") {
        // Plain assignment would replace the prototype; define an own property instead.
        Object.defineProperty(obj, key, { value: val, writable: true, enumerable: true, configurable: true });
      } else {
        obj[key] = val;
      }
      skipWs();
      if (text[i] === ",") {
        i++;
        continue;
      }
      if (text[i] === "}") {
        i++;
        return obj;
      }
      throw new Error(`expected , or } at ${i}`);
    }
  }

  function parseArray() {
    i++;
    skipWs();
    const arr = [];
    if (text[i] === "]") {
      i++;
      return arr;
    }
    while (true) {
      arr.push(parseValue());
      skipWs();
      if (text[i] === ",") {
        i++;
        continue;
      }
      if (text[i] === "]") {
        i++;
        return arr;
      }
      throw new Error(`expected , or ] at ${i}`);
    }
  }

  const value = parseValue();
  skipWs();
  if (i !== text.length) throw new Error(`unexpected trailing input at ${i}`);
  return value;
}
|
|
503
|
+
/**
 * Extract the version number from a metadata file name.
 *
 * Accepted names are `v<N>` or `<N>-<anything>` followed by one of
 * `.metadata.json`, `.gz.metadata.json` or `.metadata.json.gz`.
 *
 * @param {string} file - file name (no directory)
 * @returns {number|undefined} the version, or undefined when the name does not match
 */
function metadataFileVersionNumber(file) {
  const parsed = file.match(/^(?:v(\d+)|(\d+)-.+)(?:\.metadata\.json|\.gz\.metadata\.json|\.metadata\.json\.gz)$/);
  if (!parsed) return void 0;
  const [, vPrefixed, dashPrefixed] = parsed;
  return Number(vPrefixed ?? dashPrefixed);
}
|
|
508
|
+
/**
 * Strip the metadata extension from a recognized metadata file name.
 *
 * @param {string} file - file name (no directory)
 * @returns {string|undefined} the name without its `.metadata.json` /
 *   `.gz.metadata.json` / `.metadata.json.gz` suffix, or undefined when
 *   `metadataFileVersionNumber` does not recognize the file
 */
function metadataFileVersionName(file) {
  const recognized = metadataFileVersionNumber(file) !== void 0;
  return recognized
    ? file.replace(/(?:\.metadata\.json\.gz|\.gz\.metadata\.json|\.metadata\.json)$/, "")
    : void 0;
}
|
|
512
|
+
/**
 * Reduce a directory listing to one version name per metadata version,
 * ordered by ascending version number.
 *
 * When several files share a version, the one with the lower
 * `metadataFilePreference` rank wins. The stored candidate is compared using
 * its name with `.metadata.json` re-appended.
 *
 * @param {Iterable<string>} files - file names from the metadata directory
 * @returns {string[]} extension-less version names, oldest first
 */
function metadataVersions(files) {
  const bestByVersion = /* @__PURE__ */ new Map();
  for (const candidate of files) {
    const version = metadataFileVersionNumber(candidate);
    const name = metadataFileVersionName(candidate);
    if (version === void 0 || name === void 0) continue;

    const incumbent = bestByVersion.get(version);
    const padded = String(version).padStart(5, "0");
    const replaces = incumbent === void 0
      || metadataFilePreference(candidate, padded) < metadataFilePreference(`${incumbent}.metadata.json`, padded);
    if (replaces) bestByVersion.set(version, name);
  }
  return Array.from(bestByVersion.entries())
    .sort((left, right) => left[0] - right[0])
    .map((entry) => entry[1]);
}
|
|
524
|
+
/**
 * Determine the latest metadata version name of a table.
 *
 * First reads `<tableUrl>/metadata/version-hint.text` and returns
 * `v<number>`. If that fails for any reason, lists the metadata directory and
 * returns the highest version found there.
 *
 * @param {Object} args
 * @param {string} args.tableUrl - base URL of the table
 * @param {Object} [args.resolver] - file resolver; defaults to `urlResolver()`
 * @param {Function} [args.lister] - directory lister; defaults to `s3Lister()`
 * @returns {Promise<string>} version name without the metadata extension
 * @throws {Error} (as a rejection) when neither source yields a version; the
 *   underlying error is attached as `cause`
 */
function icebergLatestVersion({ tableUrl, resolver, lister }) {
  resolver ??= urlResolver();
  lister ??= s3Lister();
  const url = `${tableUrl}/metadata/version-hint.text`;
  return resolveText(resolver, url).then((text) => {
    // The hint is a decimal integer; pin the radix so "0x.." style text is not read as hex.
    const version = Number.parseInt(text, 10);
    if (Number.isNaN(version)) throw new Error(`invalid version: ${text}`);
    return `v${version}`;
  }).catch(() => {
    // Deliberate fallback: any hint failure falls through to a directory listing.
    const metadataDir = `${tableUrl}/metadata`;
    return lister(metadataDir).then((files) => {
      const versions = metadataVersions(files);
      if (versions.length === 0) throw new Error("no metadata files found");
      return versions[versions.length - 1];
    });
  }).catch((err) => {
    throw new Error(`failed to determine latest iceberg version: ${err.message}`, { cause: err });
  });
}
|
|
543
|
+
/**
 * Load and parse a table's metadata JSON.
 *
 * When `metadataFileName` is omitted, the latest version is looked up and
 * `.metadata.json` is appended to it. If reading that exact file fails, the
 * metadata directory is listed and the best matching file for the same
 * version (for example a gzip variant) is tried instead.
 *
 * @param {Object} args
 * @param {string} args.tableUrl - base URL of the table
 * @param {string} [args.metadataFileName] - metadata file to load
 * @param {Object} [args.resolver] - file resolver; defaults to `urlResolver()`
 * @param {Function} [args.lister] - directory lister; defaults to `s3Lister()`
 * @returns {Promise<{metadata: *, metadataFileName: string}>} parsed metadata and the file actually read
 * @throws {Error} when no candidate could be read; the first failure is attached as `cause`
 */
async function resolveMetadata({ tableUrl, metadataFileName, resolver, lister }) {
  resolver ??= urlResolver();
  lister ??= s3Lister();
  if (!metadataFileName) {
    const latest = await icebergLatestVersion({ tableUrl, resolver, lister });
    metadataFileName = `${latest}.metadata.json`;
  }
  const url = `${tableUrl}/metadata/${metadataFileName}`;
  try {
    return {
      metadata: parseIcebergJson(await resolveText(resolver, url)),
      metadataFileName
    };
  } catch (err) {
    try {
      const metadataDir = `${tableUrl}/metadata`;
      const match = findMetadataFile(await lister(metadataDir), metadataFileName);
      if (match) return {
        metadata: parseIcebergJson(await resolveText(resolver, `${metadataDir}/${match}`)),
        metadataFileName: match
      };
    } catch {
      // Best-effort fallback: its own failure is ignored and the original error is reported below.
    }
    throw new Error(`failed to get iceberg metadata: ${err.message}`, { cause: err });
  }
}
|
|
569
|
+
/**
 * Pick the file from a listing that best corresponds to a requested
 * metadata file name.
 *
 * An exact match wins. Otherwise, among files with the same version number,
 * the one with the lowest `metadataFilePreference` rank is returned (the
 * earliest in the listing on ties).
 *
 * @param {string[]} files - file names in the metadata directory
 * @param {string} metadataFileName - the requested file name
 * @returns {string|undefined} the chosen file name, or undefined when nothing matches
 */
function findMetadataFile(files, metadataFileName) {
  if (files.includes(metadataFileName)) return metadataFileName;
  const version = metadataFileVersionNumber(metadataFileName);
  if (version === void 0) return void 0;

  const padded = String(version).padStart(5, "0");
  let best;
  let bestRank = Infinity;
  for (const candidate of files) {
    if (metadataFileVersionNumber(candidate) !== version) continue;
    const rank = metadataFilePreference(candidate, padded);
    // Strict "<" keeps the first of equally ranked files, like a stable sort would.
    if (rank < bestRank) {
      best = candidate;
      bestRank = rank;
    }
  }
  return best;
}
|
|
576
|
+
/**
 * Load the highest-versioned metadata file found by listing
 * `<tableUrl>/metadata`.
 *
 * If the listing itself fails, `hintProbeFallback` is tried; when that yields
 * nothing the listing error is rethrown.
 *
 * @param {Object} args
 * @param {string} args.tableUrl - base URL of the table
 * @param {Object} [args.resolver] - file resolver; defaults to `urlResolver()`
 * @param {Function} [args.lister] - directory lister; defaults to `s3Lister()`
 * @param {number} [args.maxProbe=64] - probe budget passed to the fallback
 * @returns {Promise<{version: number, metadata: *, metadataFileName: string, metadataLocation: string}>}
 * @throws {Error} when the directory contains no recognizable metadata file
 */
async function loadLatestFileCatalogMetadata({ tableUrl, resolver, lister, maxProbe = 64 }) {
  resolver ??= urlResolver();
  lister ??= s3Lister();

  let listing;
  try {
    listing = await lister(`${tableUrl}/metadata`);
  } catch (listError) {
    const probed = await hintProbeFallback(resolver, tableUrl, maxProbe);
    if (probed) return probed;
    throw listError;
  }

  let topVersion = -1;
  let topFile;
  for (const entry of listing) {
    const entryVersion = metadataFileVersionNumber(entry);
    if (entryVersion !== void 0 && entryVersion > topVersion) {
      topVersion = entryVersion;
      topFile = entry;
    }
  }
  if (topVersion < 0 || !topFile) throw new Error(`no metadata files found at ${tableUrl}/metadata`);

  const metadataLocation = `${tableUrl}/metadata/${topFile}`;
  const metadata = parseIcebergJson(await resolveText(resolver, metadataLocation));
  return {
    version: topVersion,
    metadata,
    metadataFileName: topFile,
    metadataLocation
  };
}
|
|
607
|
+
/**
 * Find the latest metadata version without listing the directory.
 *
 * Reads `version-hint.text`, loads that version, then probes successive
 * versions until one is missing. Returns undefined when the hint is missing
 * or invalid, when the hinted version cannot be read, or when every probe up
 * to `hintVersion + maxProbe` succeeded (so the true latest is unknown).
 *
 * @param {Object} resolver - file resolver
 * @param {string} tableUrl - base URL of the table
 * @param {number} maxProbe - how many versions past the hint to try
 * @returns {Promise<Object|undefined>} the last readable version record, if determined
 */
async function hintProbeFallback(resolver, tableUrl, maxProbe) {
  let hintVersion;
  try {
    const hintText = await resolveText(resolver, `${tableUrl}/metadata/version-hint.text`);
    const candidate = parseInt(hintText);
    if (!isNaN(candidate)) hintVersion = candidate;
  } catch {}
  if (hintVersion === void 0 || hintVersion < 0) return void 0;

  let latest = await tryReadVersion(resolver, tableUrl, hintVersion);
  if (!latest) return void 0;

  const lastProbe = hintVersion + maxProbe;
  let probe = hintVersion + 1;
  for (; probe <= lastProbe; probe++) {
    const found = await tryReadVersion(resolver, tableUrl, probe);
    if (!found) break;
    latest = found;
  }
  // Budget exhausted without hitting a gap: cannot tell which version is the latest.
  return probe > lastProbe ? void 0 : latest;
}
|
|
628
|
+
/**
 * Try to load `v<version>.metadata.json` for a table.
 *
 * @param {Object} resolver - file resolver
 * @param {string} tableUrl - base URL of the table
 * @param {number} version - version number to load
 * @returns {Promise<{version: number, metadata: *, metadataFileName: string, metadataLocation: string}|undefined>}
 *   the loaded record, or undefined when reading or parsing fails
 */
async function tryReadVersion(resolver, tableUrl, version) {
  const metadataFileName = `v${version}.metadata.json`;
  const metadataLocation = `${tableUrl}/metadata/${metadataFileName}`;
  let metadata;
  try {
    metadata = parseIcebergJson(await resolveText(resolver, metadataLocation));
  } catch {
    return;
  }
  return { version, metadata, metadataFileName, metadataLocation };
}
|
|
642
|
+
/**
 * Rank a metadata file name for a given version; lower is preferred.
 *
 *  0  v<N>.metadata.json
 *  1  v<N>.gz.metadata.json
 *  2  v<N>.metadata.json.gz
 *  3  <padded>-*.metadata.json (not gzip)
 *  4  <padded>-*.gz.metadata.json
 *  5  <padded>-*.metadata.json.gz
 *  6  anything else
 *
 * @param {string} file - file name to rank
 * @param {string} paddedVersion - zero-padded version, e.g. "00003"
 * @returns {number} rank from 0 (best) to 6
 */
function metadataFilePreference(file, paddedVersion) {
  const vName = `v${Number(paddedVersion)}`;
  if (file === `${vName}.metadata.json`) return 0;
  if (file === `${vName}.gz.metadata.json`) return 1;
  if (file === `${vName}.metadata.json.gz`) return 2;
  if (file.startsWith(`${paddedVersion}-`)) {
    // ".gz.metadata.json" also ends with ".metadata.json", so test the longer suffix first.
    if (file.endsWith(".gz.metadata.json")) return 4;
    if (file.endsWith(".metadata.json")) return 3;
    if (file.endsWith(".metadata.json.gz")) return 5;
  }
  return 6;
}
|
|
651
|
+
/**
 * Fetch `<url>/v1/config` from a REST catalog and build a frozen connection
 * context from the response.
 *
 * The prefix is taken from `overrides.prefix`, then `defaults.prefix`, and
 * falls back to "" (also when the value is not a string).
 *
 * @param {Object} args
 * @param {string} args.url - catalog base URL; one trailing "/" is removed
 * @param {string} [args.warehouse] - sent as the `warehouse` query parameter when truthy
 * @param {RequestInit} [args.requestInit] - fetch options, also stored on the context
 * @returns {Promise<Readonly<{type: "rest", url: string, prefix: string, defaults: Object, overrides: Object, requestInit: *}>>}
 */
async function restCatalogConnect({ url, warehouse, requestInit }) {
  const baseUrl = url.replace(/\/$/, "");
  let configUrl = `${baseUrl}/v1/config`;
  if (warehouse) configUrl += `?warehouse=${encodeURIComponent(warehouse)}`;

  const response = await fetch(configUrl, requestInit);
  if (!response.ok) await throwRestError(response);

  const config = parseIcebergJson(await response.text());
  const defaults = config.defaults ?? {};
  const overrides = config.overrides ?? {};
  const rawPrefix = overrides.prefix ?? defaults.prefix ?? "";
  const prefix = typeof rawPrefix === "string" ? rawPrefix : "";

  return Object.freeze({
    type: "rest",
    url: baseUrl,
    prefix,
    defaults,
    overrides,
    requestInit
  });
}
|
|
669
|
+
/**
 * List the tables of a namespace, driving the page-by-page fetching through
 * `paginate`.
 *
 * @param {Object} ctx - catalog context passed to `restFetch`
 * @param {Object} args
 * @param {*} args.namespace - namespace, encoded with `encodeNamespace`
 * @returns {*} whatever `paginate` returns for the collected `identifiers`
 */
function restCatalogListTables(ctx, { namespace }) {
  const encodedNamespace = encodeNamespace(namespace);
  const fetchPage = async (query) => {
    const response = await restFetch(ctx, `namespaces/${encodedNamespace}/tables${query}`);
    const page = parseIcebergJson(await response.text());
    return {
      items: page.identifiers ?? [],
      nextPageToken: page["next-page-token"]
    };
  };
  return paginate({}, fetchPage);
}
|
|
679
|
+
/**
 * Load a table from the REST catalog.
 *
 * @param {Object} ctx - catalog context passed to `restFetch`
 * @param {Object} args
 * @param {*} args.namespace - namespace, encoded with `encodeNamespace`
 * @param {string} args.table - table name
 * @returns {Promise<{metadataLocation: *, metadata: *, config: Object}>}
 *   fields from the response; `config` defaults to `{}`
 */
async function restCatalogLoadTable(ctx, { namespace, table }) {
  const path = `namespaces/${encodeNamespace(namespace)}/tables/${encodeURIComponent(table)}`;
  const response = await restFetch(ctx, path);
  const payload = parseIcebergJson(await response.text());
  const { metadata, config = null } = payload;
  return {
    metadataLocation: payload["metadata-location"],
    metadata,
    config: config ?? {}
  };
}
|
|
687
|
+
/**
 * Create a table through the REST catalog (POST to the namespace's `tables`).
 *
 * Optional arguments are added to the request body only when they are not
 * `undefined`.
 *
 * @param {Object} ctx - catalog context passed to `restFetch`
 * @param {Object} args
 * @param {*} args.namespace - namespace, encoded with `encodeNamespace`
 * @param {string} args.table - sent as `name`
 * @param {*} args.schema - sent as `schema`
 * @param {*} [args.location] - sent as `location`
 * @param {*} [args.partitionSpec] - sent as `partition-spec`
 * @param {*} [args.writeOrder] - sent as `write-order`
 * @param {*} [args.stageCreate] - sent as `stage-create`
 * @param {*} [args.properties] - sent as `properties`
 * @returns {Promise<{metadataLocation: *, metadata: *, config: Object}>}
 *   fields from the response; `config` defaults to `{}`
 */
async function restCatalogCreateTable(ctx, { namespace, table, schema, location, partitionSpec, writeOrder, stageCreate, properties }) {
  const encodedNamespace = encodeNamespace(namespace);

  const request = { name: table, schema };
  const optionalFields = [
    ["location", location],
    ["partition-spec", partitionSpec],
    ["write-order", writeOrder],
    ["stage-create", stageCreate],
    ["properties", properties]
  ];
  for (const [key, fieldValue] of optionalFields) {
    if (fieldValue !== void 0) request[key] = fieldValue;
  }

  const response = await restFetch(ctx, `namespaces/${encodedNamespace}/tables`, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: stringifyIcebergJson(request)
  });
  const payload = parseIcebergJson(await response.text());
  return {
    metadataLocation: payload["metadata-location"],
    metadata: payload.metadata,
    config: payload.config ?? {}
  };
}
|
|
709
|
+
/**
 * Commit `requirements` and `updates` to a table by POSTing them to
 * `namespaces/{ns}/tables/{table}`.
 *
 * @param {object} ctx - REST catalog context passed through to `restFetch`.
 * @returns {Promise<{metadataLocation: *, metadata: *, config: object}>}
 */
async function restCatalogUpdateTable(ctx, { namespace, table, requirements, updates }) {
  const path = `namespaces/${encodeNamespace(namespace)}/tables/${encodeURIComponent(table)}`;
  const payload = stringifyIcebergJson({
    requirements,
    updates
  });
  const response = await restFetch(ctx, path, {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: payload
  });
  const committed = parseIcebergJson(await response.text());
  return {
    metadataLocation: committed["metadata-location"],
    metadata: committed.metadata,
    config: committed.config ?? {}
  };
}
|
|
724
|
+
/**
 * Drop a table (`DELETE namespaces/{ns}/tables/{table}`); appends
 * `?purgeRequested=true` when `purgeRequested` is truthy.
 *
 * @param {object} ctx - REST catalog context passed through to `restFetch`.
 * @returns {Promise<void>}
 */
async function restCatalogDropTable(ctx, { namespace, table, purgeRequested }) {
  const query = purgeRequested ? "?purgeRequested=true" : "";
  const path = `namespaces/${encodeNamespace(namespace)}/tables/${encodeURIComponent(table)}${query}`;
  await restFetch(ctx, path, { method: "DELETE" });
}
|
|
727
|
+
/**
 * Create a namespace (`POST namespaces`). A dotted string namespace is split
 * into its levels before being sent.
 *
 * @param {object} ctx - REST catalog context passed through to `restFetch`.
 * @returns {Promise<{namespace: *, properties: object}>} values echoed by the
 *   catalog, falling back to the requested levels / an empty object.
 */
async function restCatalogCreateNamespace(ctx, { namespace, properties }) {
  const levels = Array.isArray(namespace) ? namespace : namespace.split(".");
  const payload = stringifyIcebergJson({
    namespace: levels,
    properties: properties ?? {}
  });
  const response = await restFetch(ctx, "namespaces", {
    method: "POST",
    headers: { "content-type": "application/json" },
    body: payload
  });
  const created = parseIcebergJson(await response.text());
  return {
    namespace: created.namespace ?? levels,
    properties: created.properties ?? {}
  };
}
|
|
742
|
+
/**
 * Encode a namespace for use as a URL path segment: each level is
 * URI-component-encoded and the levels are joined with `%1F`.
 *
 * @param {string|string[]} namespace - array of levels, or a dotted string.
 * @returns {string}
 */
function encodeNamespace(namespace) {
  const levels = Array.isArray(namespace) ? namespace : namespace.split(".");
  const encodedLevels = levels.map((level) => encodeURIComponent(level));
  return encodedLevels.join("%1F");
}
|
|
745
|
+
/**
 * Fetch `{ctx.url}/v1/[{ctx.prefix}/]{path}` with `ctx.requestInit` merged
 * under the per-call `init`. Non-OK responses are turned into a thrown error
 * by `throwRestError`.
 *
 * @param {{url: string, prefix?: string, requestInit?: object}} ctx
 * @param {string} path - path below `/v1/` (and below the prefix, if any).
 * @param {object} [init] - per-request fetch options.
 * @returns {Promise<Response>} the successful response.
 */
async function restFetch(ctx, path, init) {
  let prefixSegment = "";
  if (ctx.prefix) {
    // Drop one leading and one trailing slash so the join below stays clean.
    prefixSegment = `${ctx.prefix.replace(/^\/|\/$/g, "")}/`;
  }
  const target = `${ctx.url}/v1/${prefixSegment}${path}`;
  const options = mergeRequestInit(ctx.requestInit, init);
  const response = await fetch(target, options);
  if (!response.ok) await throwRestError(response);
  return response;
}
|
|
753
|
+
/**
 * Combine two fetch option objects. When only one is provided it is returned
 * as-is; otherwise `b` overrides `a` key by key and the two header sets are
 * merged (after `headersToObject`) with `b`'s entries winning.
 *
 * @param {object} [a] - base options.
 * @param {object} [b] - overriding options.
 * @returns {object|undefined}
 */
function mergeRequestInit(a, b) {
  if (a && b) {
    const headers = {
      ...headersToObject(a.headers),
      ...headersToObject(b.headers)
    };
    return { ...a, ...b, headers };
  }
  return a ? a : b;
}
|
|
765
|
+
/**
 * Normalize any fetch `HeadersInit` shape into a plain object keyed by
 * lower-case header name.
 *
 * Header names are case-insensitive, and `Headers#forEach` already reports
 * them lower-cased. Lower-casing the plain-object and array forms as well
 * means that spreading two results over each other lets the later value
 * replace the earlier one, instead of leaving both `Content-Type` and
 * `content-type` behind for fetch to combine.
 *
 * @param {Headers|Array<[string, string]>|Record<string, string>|null|undefined} h
 * @returns {Record<string, string>} a new object; the last value wins when a
 *   name occurs more than once.
 */
function headersToObject(h) {
  if (!h) return {};
  const out = {};
  if (h instanceof Headers) {
    h.forEach((v, k) => {
      out[k] = v;
    });
    return out;
  }
  const entries = Array.isArray(h) ? h : Object.entries(h);
  for (const [name, value] of entries) out[String(name).toLowerCase()] = value;
  return out;
}
|
|
777
|
+
/**
 * Always throws: converts a non-OK response into an `Error` carrying the HTTP
 * status as `err.status`. When the body parses and has an `error` object, the
 * message is "<code> <type>: <message>"; otherwise it is
 * "<status> <statusText>".
 *
 * @param {Response} res
 * @returns {Promise<never>}
 */
async function throwRestError(res) {
  let detail = "";
  try {
    const payload = parseIcebergJson(await res.text());
    const reported = payload?.error;
    if (reported) {
      const code = reported.code ?? res.status;
      const type = reported.type ?? "";
      const message = reported.message ?? "";
      detail = `${code} ${type}: ${message}`.trim();
    }
  } catch {
    // Best effort only: an unreadable or non-JSON body falls back to the status line.
  }
  const failure = new Error(detail || `${res.status} ${res.statusText}`);
  failure.status = res.status;
  throw failure;
}
|
|
790
|
+
/**
 * Collect the items of every page produced by `fetchPage`.
 *
 * `fetchPage` receives the query string to append ("" or "?k=v&...") and must
 * resolve to `{ items, nextPageToken }`. Paging stops at the first falsy
 * `nextPageToken`; otherwise the token is URI-encoded and sent as `pageToken`.
 *
 * @param {object} baseParams - query parameters sent with every page.
 * @param {(query: string) => Promise<{items: Array, nextPageToken?: string}>} fetchPage
 * @returns {Promise<Array>}
 */
async function paginate(baseParams, fetchPage) {
  const collected = [];
  let pageToken;
  for (;;) {
    const params = pageToken ? { ...baseParams, pageToken } : { ...baseParams };
    const pairs = Object.keys(params).map((k) => `${k}=${params[k]}`);
    const query = pairs.length ? "?" + pairs.join("&") : "";
    const { items, nextPageToken } = await fetchPage(query);
    collected.push(...items);
    if (!nextPageToken) return collected;
    pageToken = encodeURIComponent(nextPageToken);
  }
}
|
|
803
|
+
/**
 * Resolve a table's current metadata through the given catalog.
 *
 * - "rest" catalogs: requires `namespace` and `table`; metadata comes from
 *   `restCatalogLoadTable` and `tableUrl` is taken from `metadata.location`.
 * - "file" catalogs: requires `tableUrl`; uses the explicit `resolver` or the
 *   catalog's own. With `catalog.conditionalCommits` the latest metadata is
 *   loaded via `loadLatestFileCatalogMetadata` and its `version` is returned
 *   too; otherwise `resolveMetadata` is used.
 *
 * @param {{catalog: object, namespace?: *, table?: string, tableUrl?: string, resolver?: *}} options
 * @returns {Promise<object>} `{ metadata, metadataFileName, tableUrl, resolver }`
 *   plus `version` for conditional-commit file catalogs.
 * @throws {Error} when a required argument is missing or the catalog type is
 *   not "rest" or "file" (including a missing catalog).
 */
async function loadTable({ catalog, namespace, table, tableUrl, resolver }) {
  // Read the type defensively so a missing catalog reaches the descriptive
  // "unknown catalog type" error below instead of a bare TypeError.
  const catalogType = catalog?.type;
  if (catalogType === "rest") {
    if (!namespace || !table) throw new Error("namespace and table are required for rest catalogs");
    const { metadata } = await restCatalogLoadTable(catalog, {
      namespace,
      table
    });
    return {
      metadata,
      metadataFileName: void 0,
      tableUrl: metadata.location,
      resolver
    };
  }
  if (catalogType === "file") {
    if (!tableUrl) throw new Error("tableUrl is required for file catalogs");
    const eff = resolver ?? catalog.resolver;
    if (catalog.conditionalCommits) {
      const { metadata, metadataFileName, version } = await loadLatestFileCatalogMetadata({
        tableUrl,
        resolver: eff,
        lister: catalog.lister
      });
      return {
        metadata,
        metadataFileName,
        version,
        tableUrl,
        resolver: eff
      };
    }
    const { metadata, metadataFileName } = await resolveMetadata({
      tableUrl,
      resolver: eff,
      lister: catalog.lister
    });
    return {
      metadata,
      metadataFileName,
      tableUrl,
      resolver: eff
    };
  }
  throw new Error(`unknown catalog type: ${catalogType}`);
}
|
|
848
|
+
/**
 * Run `validateFieldForVersion` on every top-level field of `schema`, using
 * the field's own name as the starting path for error messages.
 *
 * @param {{fields: Array<object>}} schema
 * @param {number} formatVersion
 */
function validateSchemaForVersion(schema, formatVersion) {
  const { fields } = schema;
  for (const topLevelField of fields) {
    validateFieldForVersion(topLevelField, formatVersion, topLevelField.name);
  }
}
|
|
851
|
+
/**
 * Highest field id among `fields`, including ids nested inside their types
 * (via `maxNestedFieldId`). Returns 0 for an empty or omitted list.
 *
 * @param {Array<{id: number, type: *}>} [fields]
 * @returns {number}
 */
function maxFieldId(fields = []) {
  let highest = 0;
  for (const field of fields) {
    const ownId = field.id;
    if (ownId > highest) highest = ownId;
    const nestedId = maxNestedFieldId(field.type);
    if (nestedId > highest) highest = nestedId;
  }
  return highest;
}
|
|
860
|
+
/**
 * Highest field id found inside a (possibly nested) type: list element ids,
 * map key/value ids and struct field ids. Primitive (string) types and
 * unrecognized type objects contribute 0.
 *
 * @param {string|object} type
 * @returns {number}
 */
function maxNestedFieldId(type) {
  if (typeof type === "string") return 0;
  switch (type.type) {
    case "list":
      return Math.max(type["element-id"] ?? 0, maxNestedFieldId(type.element));
    case "map":
      return Math.max(
        type["key-id"] ?? 0,
        type["value-id"] ?? 0,
        maxNestedFieldId(type.key),
        maxNestedFieldId(type.value)
      );
    case "struct":
      return maxFieldId(type.fields);
    default:
      return 0;
  }
}
|
|
874
|
+
// Largest field id the schema validators in this file accept; any numeric id
// above it is rejected as being "in the reserved range".
// Numerically this is 2^31 - 1 - 200.
const MAX_USER_FIELD_ID = 2147483447;
|
|
875
|
+
/**
 * Validate one schema field (and, recursively, its nested types) against a
 * table format version.
 *
 * Rejects numeric ids above `MAX_USER_FIELD_ID`. Below format version 3 it
 * also rejects v3-only types and any `initial-default` / `write-default`;
 * otherwise the v3 default rules are applied via `checkV3Default`.
 *
 * @param {object} field
 * @param {number} formatVersion
 * @param {string} path - dotted field path used in error messages.
 * @throws {Error} on the first violation found.
 */
function validateFieldForVersion(field, formatVersion, path) {
  const { id } = field;
  if (typeof id === "number" && id > MAX_USER_FIELD_ID) {
    throw new Error(`field id ${id} is in the reserved range (> ${MAX_USER_FIELD_ID}) (field: ${path})`);
  }
  const isPreV3 = formatVersion < 3;
  if (!isPreV3) {
    checkV3Default(field, path);
  } else {
    checkTypeForV2(field.type, path);
    for (const key of ["initial-default", "write-default"]) {
      if (field[key] !== void 0) throw new Error(`${key} requires format-version 3 (field: ${path})`);
    }
  }
  checkNestedFieldsForVersion(field.type, formatVersion, path);
}
|
|
884
|
+
/**
 * Throw if `type`, or any type nested inside it, only exists from format
 * version 3 on: unknown, variant, timestamp_ns, timestamptz_ns, geometry or
 * geography (with or without parameters).
 *
 * @param {string|object} type
 * @param {string} path - dotted field path used in the error message.
 * @throws {Error} naming the offending type and path.
 */
function checkTypeForV2(type, path) {
  if (typeof type !== "string") {
    switch (type.type) {
      case "struct":
        for (const child of type.fields) checkTypeForV2(child.type, `${path}.${child.name}`);
        break;
      case "list":
        checkTypeForV2(type.element, `${path}.element`);
        break;
      case "map":
        checkTypeForV2(type.key, `${path}.key`);
        checkTypeForV2(type.value, `${path}.value`);
        break;
    }
    return;
  }
  const exactV3Names = ["unknown", "variant", "timestamp_ns", "timestamptz_ns", "geometry", "geography"];
  const isV3Only = exactV3Names.includes(type) || type.startsWith("geometry(") || type.startsWith("geography(");
  if (isV3Only) throw new Error(`type ${type} requires format-version 3 (field: ${path})`);
}
|
|
896
|
+
/**
 * For field types that `requiresNullDefault` flags, reject a non-null
 * `initial-default` or `write-default`.
 *
 * @param {object} field
 * @param {string} path - dotted field path used in the error message.
 * @throws {Error} for the first of the two keys holding a non-null value.
 */
function checkV3Default(field, path) {
  const type = typeName(field.type);
  if (!requiresNullDefault(type)) return;
  const offendingKey = ["initial-default", "write-default"].find((key) => field[key] != null);
  if (offendingKey !== void 0) {
    throw new Error(`${offendingKey} for field ${path} of type ${type} must default to null`);
  }
}
|
|
901
|
+
/**
 * Recurse into a nested type: struct fields are validated with
 * `validateFieldForVersion`; list element ids and map key/value ids are
 * checked against the reserved range before descending further.
 *
 * @param {string|object} type
 * @param {number} formatVersion
 * @param {string} path - dotted path of the field owning `type`.
 */
function checkNestedFieldsForVersion(type, formatVersion, path) {
  if (typeof type === "string") return;
  switch (type.type) {
    case "struct":
      for (const child of type.fields) validateFieldForVersion(child, formatVersion, `${path}.${child.name}`);
      break;
    case "list": {
      const elementPath = `${path}.element`;
      checkReservedFieldId(type["element-id"], elementPath);
      checkNestedFieldsForVersion(type.element, formatVersion, elementPath);
      break;
    }
    case "map": {
      const keyPath = `${path}.key`;
      const valuePath = `${path}.value`;
      checkReservedFieldId(type["key-id"], keyPath);
      checkReservedFieldId(type["value-id"], valuePath);
      checkNestedFieldsForVersion(type.key, formatVersion, keyPath);
      checkNestedFieldsForVersion(type.value, formatVersion, valuePath);
      break;
    }
  }
}
|
|
914
|
+
/**
 * Throw when `id` is a number above `MAX_USER_FIELD_ID`; any other value
 * (including a missing id) passes.
 *
 * @param {*} id
 * @param {string} path - dotted field path used in the error message.
 */
function checkReservedFieldId(id, path) {
  if (typeof id !== "number") return;
  if (!(id > MAX_USER_FIELD_ID)) return;
  throw new Error(`field id ${id} is in the reserved range (> ${MAX_USER_FIELD_ID}) (field: ${path})`);
}
|
|
917
|
+
/**
 * Name of a type: the string itself for primitive types, or the `type`
 * property of a nested type object.
 *
 * @param {string|{type: string}} type
 * @returns {string}
 */
function typeName(type) {
  if (typeof type === "string") return type;
  return type.type;
}
|
|
920
|
+
/**
 * Whether a type name is one whose defaults must be null: unknown, variant,
 * geometry or geography (the last two with or without parameters).
 *
 * @param {string} type - a type name as returned by `typeName`.
 * @returns {boolean}
 */
function requiresNullDefault(type) {
  const exactNames = ["unknown", "variant", "geometry", "geography"];
  if (exactNames.includes(type)) return true;
  return type.startsWith("geometry(") || type.startsWith("geography(");
}
|
|
923
|
+
/**
 * Parse a `decimal(P,S)` type name (whitespace allowed after the comma).
 *
 * @param {string} type
 * @returns {{precision: number, scale: number}|undefined} `undefined` when
 *   `type` is not a decimal type name.
 */
function parseDecimalType(type) {
  const match = /^decimal\((\d+),\s*(\d+)\)$/.exec(type);
  if (match === null) return void 0;
  const [, precisionDigits, scaleDigits] = match;
  return {
    precision: parseInt(precisionDigits, 10),
    scale: parseInt(scaleDigits, 10)
  };
}
|
|
931
|
+
/**
 * Smallest byte count n such that 10^precision <= 2^(8n - 1), i.e. the
 * narrowest signed two's-complement width that can hold every unscaled value
 * of the given precision.
 *
 * @param {number} precision
 * @returns {number}
 */
function decimalRequiredBytes(precision) {
  const limit = 10n ** BigInt(precision);
  let byteCount = 1;
  // `bound` is 2^(8 * byteCount - 1): the magnitude a signed value of that width can reach.
  for (let bound = 128n; limit > bound; bound <<= 8n) byteCount += 1;
  return byteCount;
}
|
|
941
|
+
/**
 * Encode a decimal value as fixed-width bytes sized by
 * `decimalRequiredBytes(precision)`.
 *
 * - `Uint8Array` input is returned unchanged if it already has that length.
 * - `bigint` input is multiplied by 10^scale.
 * - `number` input is multiplied by 10^scale and rounded.
 *
 * @param {Uint8Array|number|bigint} value
 * @param {number} precision
 * @param {number} scale
 * @param {string} label - description of the value used in error messages.
 * @returns {Uint8Array} (whatever `bigintToFixedBytes` returns for numeric input)
 * @throws {Error} on wrong input type or length, or when the unscaled value
 *   has more than `precision` digits.
 */
function decimalToFixedBytes(value, precision, scale, label) {
  const width = decimalRequiredBytes(precision);
  if (value instanceof Uint8Array) {
    if (value.length === width) return value;
    throw new Error(`expected ${label}`);
  }
  const isBigInt = typeof value === "bigint";
  if (!isBigInt && typeof value !== "number") throw new Error(`expected ${label}`);
  const scaleFactor = 10n ** BigInt(scale);
  const unscaled = isBigInt ? value * scaleFactor : BigInt(Math.round(value * Number(scaleFactor)));
  const bound = 10n ** BigInt(precision);
  const outOfRange = unscaled >= bound || unscaled <= -bound;
  if (outOfRange) throw new Error(`${label} exceeds precision ${precision}`);
  return bigintToFixedBytes(unscaled, width, label);
}
|
|
954
|
+
/**
 * Return `value` itself when it already is a `Uint8Array`; otherwise build a
 * new `Uint8Array` from it.
 *
 * @param {*} value
 * @returns {Uint8Array}
 */
function toUint8Array(value) {
  if (value instanceof Uint8Array) return value;
  return new Uint8Array(value);
}
|
|
957
|
+
/**
 * Convert a UUID to its 16 bytes. A 16-byte `Uint8Array` is returned as-is; a
 * string must contain exactly 32 hex digits once dashes are removed.
 *
 * @param {Uint8Array|string} value
 * @param {string} label - description of the value used in the error message.
 * @returns {Uint8Array}
 * @throws {Error} `expected <label>` for any other input.
 */
function uuidToBytes(value, label) {
  const invalid = () => new Error(`expected ${label}`);
  if (value instanceof Uint8Array) {
    if (value.length !== 16) throw invalid();
    return value;
  }
  if (typeof value !== "string") throw invalid();
  const hex = value.toLowerCase().replace(/-/g, "");
  if (!/^[0-9a-f]{32}$/.test(hex)) throw invalid();
  const bytes = new Uint8Array(16);
  bytes.forEach((_, index) => {
    const start = index * 2;
    bytes[index] = parseInt(hex.slice(start, start + 2), 16);
  });
  return bytes;
}
|
|
969
|
+
/**
 * Write a bigint as big-endian two's-complement into exactly `size` bytes.
 *
 * @param {bigint} value
 * @param {number} size - byte width of the result.
 * @param {string} label - description of the value used in the error message.
 * @returns {Uint8Array}
 * @throws {Error} when the value does not fit in `size` bytes.
 */
function bigintToFixedBytes(value, size, label) {
  const out = new Uint8Array(size);
  let rest = value;
  let index = size;
  while (index-- > 0) {
    out[index] = Number(rest & 255n);
    rest >>= 8n;
  }
  // After consuming `size` bytes, what is left must be pure sign extension
  // and the top bit must agree with the sign of the input.
  const topBitSet = (out[0] & 128) !== 0;
  const fits = value < 0n ? rest === -1n && topBitSet : rest === 0n && !topBitSet;
  if (!fits) throw new Error(`${label} does not fit in ${size} bytes`);
  return out;
}
|
|
981
|
+
/**
 * Parse a partition transform string.
 *
 * @param {string} transform - "identity", "void", "year", "month", "day",
 *   "hour", "bucket[N]" or "truncate[W]" with a positive N / W.
 * @returns {{kind: string, n?: number, w?: number}}
 * @throws {Error} for anything else (including `bucket[0]` / `truncate[0]`).
 */
function parseTransform(transform) {
  const parameterless = ["identity", "void", "year", "month", "day", "hour"];
  if (parameterless.includes(transform)) return { kind: transform };
  const bucket = /^bucket\[(\d+)\]$/.exec(transform);
  if (bucket) {
    const n = parseInt(bucket[1], 10);
    if (n > 0) return { kind: "bucket", n };
  }
  const truncate = /^truncate\[(\d+)\]$/.exec(transform);
  if (truncate) {
    const w = parseInt(truncate[1], 10);
    if (w > 0) return { kind: "truncate", w };
  }
  throw new Error(`unsupported partition transform: ${transform}`);
}
|
|
1001
|
+
/**
 * Result type of applying `transform` to a column of `sourceType`:
 * identity and truncate keep the source type, every other supported
 * transform yields "int". The transform is parsed and validated against the
 * source type first.
 *
 * @param {string} transform
 * @param {string|object} sourceType
 * @returns {string|object|undefined}
 */
function transformResultType(transform, sourceType) {
  const parsed = parseTransform(transform);
  validateTransformSource(parsed, sourceType);
  const { kind } = parsed;
  if (kind === "identity" || kind === "truncate") return sourceType;
  const intKinds = ["void", "year", "month", "day", "hour", "bucket"];
  return intKinds.includes(kind) ? "int" : void 0;
}
|
|
1015
|
+
/**
 * Apply a partition transform to one value. The transform is parsed and
 * validated against `sourceType` even for null input; null / undefined
 * values (and the void transform) produce `null`.
 *
 * @param {string} transform
 * @param {*} value
 * @param {string|object} sourceType
 * @returns {*} the partition value.
 */
function applyTransform(transform, value, sourceType) {
  const parsed = parseTransform(transform);
  validateTransformSource(parsed, sourceType);
  if (value == null) return null;
  const { kind } = parsed;
  if (kind === "identity") return value;
  if (kind === "void") return null;
  if (kind === "year") return yearTransform(value, sourceType);
  if (kind === "month") return monthTransform(value, sourceType);
  if (kind === "day") return dayTransform(value, sourceType);
  if (kind === "hour") return hourTransform(value, sourceType);
  if (kind === "bucket") return bucketTransform(value, sourceType, parsed.n);
  if (kind === "truncate") return truncateTransform(value, sourceType, parsed.w);
}
|
|
1030
|
+
/**
 * Convert a date/timestamp partition source value to milliseconds since the
 * Unix epoch so the year/month/day/hour transforms can share one code path.
 *
 * - `Date` values use `getTime()`.
 * - "date" values are day counts.
 * - "timestamp" / "timestamptz" values are divided by 1,000.
 * - every other accepted type (the `_ns` timestamps) is divided by 1,000,000.
 *
 * Sub-millisecond remainders are discarded with FLOOR semantics. Plain BigInt
 * division truncates toward zero, which would move pre-epoch instants such as
 * -1 µs to millisecond 0 and therefore into the wrong hour/day/month/year.
 *
 * @param {Date|number|bigint|string} value
 * @param {string|object} sourceType
 * @param {string} transform - transform kind, used only to validate `sourceType`.
 * @returns {number} milliseconds since the epoch (may be negative).
 */
function dateAsMillis(value, sourceType, transform) {
  const t = typeName(sourceType);
  validateTransformSource({ kind: transform }, sourceType);
  if (value instanceof Date) return value.getTime();
  const n = typeof value === "bigint" ? value : BigInt(value);
  // Floor division for a positive divisor: step the truncated quotient down
  // by one when the dividend is negative and not an exact multiple.
  const floorDiv = (dividend, divisor) => {
    const quotient = dividend / divisor;
    return dividend < 0n && dividend % divisor !== 0n ? quotient - 1n : quotient;
  };
  switch (t) {
    case "date": return Number(n) * 864e5;
    case "timestamp":
    case "timestamptz": return Number(floorDiv(n, 1000n));
    default: return Number(floorDiv(n, 1000000n));
  }
}
|
|
1042
|
+
/**
 * Year transform: UTC calendar years since 1970.
 *
 * @param {*} v - source value, converted with `dateAsMillis`.
 * @param {string|object} t - source type.
 * @returns {number}
 */
function yearTransform(v, t) {
  const millis = dateAsMillis(v, t, "year");
  const utcYear = new Date(millis).getUTCFullYear();
  return utcYear - 1970;
}
|
|
1045
|
+
/**
 * Month transform: UTC calendar months since January 1970.
 *
 * @param {*} v - source value, converted with `dateAsMillis`.
 * @param {string|object} t - source type.
 * @returns {number}
 */
function monthTransform(v, t) {
  const instant = new Date(dateAsMillis(v, t, "month"));
  const yearsSinceEpoch = instant.getUTCFullYear() - 1970;
  return yearsSinceEpoch * 12 + instant.getUTCMonth();
}
|
|
1049
|
+
/**
 * Day transform: whole days since the Unix epoch (floored).
 *
 * @param {*} v - source value, converted with `dateAsMillis`.
 * @param {string|object} t - source type.
 * @returns {number}
 */
function dayTransform(v, t) {
  const millis = dateAsMillis(v, t, "day");
  return Math.floor(millis / 864e5);
}
|
|
1052
|
+
/**
 * Hour transform: whole hours since the Unix epoch (floored).
 *
 * @param {*} v - source value, converted with `dateAsMillis`.
 * @param {string|object} t - source type.
 * @returns {number}
 */
function hourTransform(v, t) {
  const millis = dateAsMillis(v, t, "hour");
  return Math.floor(millis / 36e5);
}
|
|
1055
|
+
/**
 * Bucket transform: 32-bit Murmur3 hash (seed 0) of the value's byte
 * encoding, with the sign bit cleared, modulo `n`.
 *
 * @param {*} value
 * @param {string|object} sourceType
 * @param {number} n - number of buckets.
 * @returns {number} bucket index in [0, n).
 */
function bucketTransform(value, sourceType, n) {
  const hash = murmur3_32(bucketBytes(value, sourceType), 0);
  const nonNegative = hash & 2147483647;
  return nonNegative % n;
}
|
|
1058
|
+
/**
 * Byte encoding of a value as fed to the bucket hash.
 *
 * - decimal: minimal big-endian two's-complement of the unscaled value.
 * - uuid: its 16 bytes.
 * - fixed / binary: the bytes themselves.
 * - int, long, date, time and all timestamps: an 8-byte little-endian signed
 *   integer. Dates are day counts, timestamps are microseconds; nanosecond
 *   timestamps are first reduced to microseconds.
 * - string: UTF-8.
 *
 * The nanosecond-to-microsecond reduction uses FLOOR division. Plain BigInt
 * division truncates toward zero, which maps a negative (pre-epoch) value
 * such as -1 ns to microsecond 0 instead of -1 and so to a different bucket.
 *
 * @param {*} value
 * @param {string|object} sourceType
 * @returns {Uint8Array}
 * @throws {Error} for source types that cannot be bucketed.
 */
function bucketBytes(value, sourceType) {
  const t = typeName(sourceType);
  if (t.startsWith("decimal(")) return decimalToUnscaledBytes(value, t);
  if (t === "uuid") return uuidToBytes(value, "uuid partition value");
  if (t.startsWith("fixed[") || t === "binary" || t === "fixed") return value instanceof Uint8Array ? value : new Uint8Array(value);
  switch (t) {
    case "int":
    case "long":
    case "date":
    case "time":
    case "timestamp":
    case "timestamptz":
    case "timestamp_ns":
    case "timestamptz_ns": {
      let v;
      if (t === "date") v = value instanceof Date ? BigInt(Math.floor(value.getTime() / 864e5)) : BigInt(value);
      else if (t === "timestamp" || t === "timestamptz") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value);
      else if (t === "timestamp_ns" || t === "timestamptz_ns") {
        if (value instanceof Date) v = BigInt(value.getTime()) * 1000n;
        else {
          const nanos = BigInt(value);
          v = nanos / 1000n;
          // `%` keeps the dividend's sign, so a negative remainder means the
          // truncated quotient sits one above the floor.
          if (nanos % 1000n < 0n) v -= 1n;
        }
      } else v = typeof value === "bigint" ? value : BigInt(value);
      const out = new Uint8Array(8);
      new DataView(out.buffer).setBigInt64(0, v, true);
      return out;
    }
    case "string": return new TextEncoder().encode(String(value));
    default: throw new Error(`bucket transform: unsupported source type ${t}`);
  }
}
|
|
1085
|
+
/**
 * Minimal-length big-endian two's-complement bytes of a decimal's unscaled
 * value. A bigint is multiplied by 10^scale; anything else is converted with
 * `Number`, multiplied by 10^scale and rounded.
 *
 * @param {number|bigint|string} value
 * @param {string} decimalType - type name of the form `decimal(P,S)`.
 * @returns {Uint8Array}
 * @throws {Error} when `decimalType` is not a decimal type name.
 */
function decimalToUnscaledBytes(value, decimalType) {
  const match = /^decimal\((\d+),\s*(\d+)\)$/.exec(decimalType);
  if (!match) throw new Error(`bucket transform: invalid decimal type ${decimalType}`);
  const factor = 10n ** BigInt(parseInt(match[2], 10));
  const unscaled = typeof value === "bigint" ? value * factor : BigInt(Math.round(Number(value) * Number(factor)));
  const bigEndian = [];
  let rest = unscaled;
  let finished = false;
  do {
    const byte = Number(rest & 255n);
    bigEndian.unshift(byte);
    rest >>= 8n;
    // Stop once the remainder is pure sign extension matching this byte's top bit.
    finished = (byte & 128) === 0 ? rest === 0n : rest === -1n;
  } while (!finished);
  return new Uint8Array(bigEndian);
}
|
|
1102
|
+
/**
 * Truncate transform with width `w`.
 *
 * - decimal: the unscaled value is rounded down to a multiple of `w`, then
 *   returned as a Number divided by 10^scale.
 * - binary: the first `w` bytes.
 * - int / long: rounded down to a multiple of `w` (long returns a bigint).
 * - string: the first `w` Unicode code points.
 *
 * @param {*} value
 * @param {string|object} sourceType
 * @param {number} w
 * @returns {number|bigint|string|Uint8Array}
 * @throws {Error} for unsupported source types.
 */
function truncateTransform(value, sourceType, w) {
  const t = typeName(sourceType);
  if (t.startsWith("decimal(")) {
    const match = /^decimal\((\d+),\s*(\d+)\)$/.exec(t);
    if (!match) throw new Error(`truncate transform: invalid decimal type ${t}`);
    const factor = 10n ** BigInt(parseInt(match[2], 10));
    const unscaled = typeof value === "bigint" ? value * factor : BigInt(Math.round(Number(value) * Number(factor)));
    const width = BigInt(w);
    // ((x % w) + w) % w is the non-negative remainder, so subtracting it rounds toward -infinity.
    const remainder = (unscaled % width + width) % width;
    return Number(unscaled - remainder) / Number(factor);
  }
  if (t === "binary") {
    const bytes = value instanceof Uint8Array ? value : new Uint8Array(value);
    return bytes.slice(0, w);
  }
  if (t === "int") {
    const n = Number(value);
    return n - (n % w + w) % w;
  }
  if (t === "long") {
    const width = BigInt(w);
    const n = typeof value === "bigint" ? value : BigInt(value);
    return n - (n % width + width) % width;
  }
  if (t === "string") {
    const text = String(value);
    let end = 0;
    // Advance code point by code point so surrogate pairs are never split.
    for (let taken = 0; end < text.length && taken < w; taken++) {
      end += text.codePointAt(end) > 65535 ? 2 : 1;
    }
    return text.slice(0, end);
  }
  throw new Error(`truncate transform: unsupported source type ${t}`);
}
|
|
1139
|
+
/**
 * Throw unless the parsed transform may be applied to `sourceType`.
 *
 * - identity: everything except variant / geometry / geography.
 * - void: everything.
 * - bucket: int, long, decimal, date, time, timestamps, string, uuid, fixed, binary.
 * - truncate: int, long, decimal, string, binary.
 * - year / month / day: date and timestamps.
 * - hour: timestamps only.
 *
 * Unrecognized transform kinds are not checked.
 *
 * @param {{kind: string}} parsed
 * @param {string|object} sourceType
 * @throws {Error} `<kind> transform: unsupported source type <type>`.
 */
function validateTransformSource(parsed, sourceType) {
  const t = typeName(sourceType);
  const { kind } = parsed;
  // The predicates are functions so that `t` is only inspected for the kinds that need it.
  const isTimestamp = () => t === "timestamp" || t === "timestamptz" || t === "timestamp_ns" || t === "timestamptz_ns";
  const isGeo = () => t === "geometry" || t.startsWith("geometry(") || t === "geography" || t.startsWith("geography(");
  let supported;
  switch (kind) {
    case "identity":
      supported = !(t === "variant" || isGeo());
      break;
    case "void":
      return;
    case "bucket":
      supported = t === "int" || t === "long" || t.startsWith("decimal(") || t === "date" || t === "time" || isTimestamp() || t === "string" || t === "uuid" || t.startsWith("fixed[") || t === "binary" || t === "fixed";
      break;
    case "truncate":
      supported = t === "int" || t === "long" || t.startsWith("decimal(") || t === "string" || t === "binary";
      break;
    case "year":
    case "month":
    case "day":
      supported = t === "date" || isTimestamp();
      break;
    case "hour":
      supported = isTimestamp();
      break;
    default:
      return;
  }
  if (!supported) throw new Error(`${kind} transform: unsupported source type ${t}`);
}
|
|
1162
|
+
/**
 * 32-bit MurmurHash3 (x86 variant) of a byte sequence.
 *
 * @param {Uint8Array|number[]} data - bytes to hash (indexable, with `length`).
 * @param {number} seed
 * @returns {number} the hash as an unsigned 32-bit integer.
 */
function murmur3_32(data, seed) {
  const C1 = 3432918353;
  const C2 = 461845907;
  // Per-word scramble: multiply, rotate left by 15, multiply.
  const scramble = (word) => {
    const mixed = Math.imul(word, C1);
    return Math.imul(mixed << 15 | mixed >>> 17, C2);
  };
  const length = data.length;
  const remaining = length & 3;
  const tailStart = length - remaining;
  let hash = seed >>> 0;
  // Body: whole little-endian 32-bit words.
  for (let at = 0; at < tailStart; at += 4) {
    const word = data[at] | data[at + 1] << 8 | data[at + 2] << 16 | data[at + 3] << 24;
    hash ^= scramble(word);
    hash = hash << 13 | hash >>> 19;
    hash = Math.imul(hash, 5) + 3864292196 | 0;
  }
  // Tail: the 1-3 trailing bytes, if any.
  if (remaining !== 0) {
    let tailWord = 0;
    if (remaining === 3) tailWord ^= data[tailStart + 2] << 16;
    if (remaining >= 2) tailWord ^= data[tailStart + 1] << 8;
    tailWord ^= data[tailStart];
    hash ^= scramble(tailWord);
  }
  // Finalization mix.
  hash ^= length;
  hash ^= hash >>> 16;
  hash = Math.imul(hash, 2246822507);
  hash ^= hash >>> 13;
  hash = Math.imul(hash, 3266489909);
  hash ^= hash >>> 16;
  return hash >>> 0;
}
|
|
1198
|
+
/**
 * Split records into groups that share the same partition tuple.
 *
 * Each partition field is resolved to its top-level source column once. Per
 * record, a missing source value falls back to the column's `write-default`
 * (then to null) before the transform is applied. Groups are returned in the
 * order their partition was first seen.
 *
 * @param {Iterable<object>} records
 * @param {{fields: Array<object>}} schema
 * @param {{fields: Array<object>}} partitionSpec
 * @returns {Array<{partition: object, records: object[]}>}
 * @throws {Error} when a partition field has no `source-id` or its source
 *   column is not a top-level schema field.
 */
function groupByPartition(records, schema, partitionSpec) {
  const resolvedFields = partitionSpec.fields.map((pf) => {
    const sourceId = pf["source-id"];
    if (sourceId === void 0) throw new Error(`partition field ${pf.name} is missing source-id`);
    const sourceField = schema.fields.find((candidate) => candidate.id === sourceId);
    if (!sourceField) throw new Error(`partition source field id ${sourceId} not found in schema`);
    return {
      partitionName: pf.name,
      sourceName: sourceField.name,
      sourceType: sourceField.type,
      sourceWriteDefault: sourceField["write-default"],
      transform: pf.transform,
      resultType: transformResultType(pf.transform, sourceField.type)
    };
  });
  const groupsByKey = /* @__PURE__ */ new Map();
  for (const record of records) {
    const partition = {};
    const keyParts = resolvedFields.map((spec) => {
      const raw = record[spec.sourceName];
      const effective = raw === void 0 && spec.sourceWriteDefault !== void 0 ? spec.sourceWriteDefault : raw;
      const transformed = applyTransform(spec.transform, effective === void 0 ? null : effective, spec.sourceType);
      partition[spec.partitionName] = transformed;
      return partitionKeyPart(transformed, spec.resultType);
    });
    const key = JSON.stringify(keyParts);
    const existing = groupsByKey.get(key);
    if (existing) {
      existing.records.push(record);
    } else {
      groupsByKey.set(key, { partition, records: [record] });
    }
  }
  return [...groupsByKey.values()];
}
|
|
1236
|
+
/**
 * Check that every partition field can be written: it has a `source-id`, the
 * source is a top-level schema field, the transform accepts the source type
 * and the transform's result type has an Avro mapping.
 *
 * @param {{fields: Array<object>}} schema
 * @param {{fields: Array<object>}} partitionSpec
 * @param {string} [label="partition spec"] - prefix for the source-id errors.
 * @throws {Error} on the first field that fails a check.
 */
function validatePartitionSpecForWrite(schema, partitionSpec, label = "partition spec") {
  for (const partitionField of partitionSpec.fields) {
    const sourceId = partitionField["source-id"];
    if (sourceId === void 0) throw new Error(`${label}: partition field ${partitionField.name} is missing source-id`);
    const source = schema.fields.find((candidate) => candidate.id === sourceId);
    if (!source) throw new Error(`${label}: partition source field id ${sourceId} not found in schema`);
    // Called only for the errors they raise; the results are discarded.
    const resultType = transformResultType(partitionField.transform, source.type);
    icebergTypeToAvro(resultType, partitionField["field-id"]);
  }
}
|
|
1245
|
+
/**
 * Build the Avro record schema (named "r102") describing a partition tuple:
 * one nullable field per partition field, typed by the transform's result
 * type.
 *
 * @param {{fields: Array<object>}} schema
 * @param {{fields: Array<object>}} partitionSpec
 * @returns {{type: string, name: string, fields: Array<object>}}
 * @throws {Error} when a partition field's source is not a top-level schema field.
 */
function partitionAvroSchema(schema, partitionSpec) {
  const toAvroField = (pf) => {
    const sourceId = pf["source-id"];
    const source = schema.fields.find((candidate) => candidate.id === sourceId);
    if (!source) throw new Error(`partition source field id ${sourceId} not found`);
    const fieldId = pf["field-id"];
    const avroType = icebergTypeToAvro(transformResultType(pf.transform, source.type), fieldId);
    return {
      name: pf.name,
      "field-id": fieldId,
      default: null,
      type: ["null", avroType]
    };
  };
  return {
    type: "record",
    name: "r102",
    fields: partitionSpec.fields.map((pf) => toAvroField(pf))
  };
}
|
|
1262
|
+
/**
 * JSON text of a partition spec's `fields` array (the spec id is not included).
 *
 * @param {{fields: Array<object>}} partitionSpec
 * @returns {string}
 */
function partitionSpecJson(partitionSpec) {
  const { fields } = partitionSpec;
  return JSON.stringify(fields);
}
|
|
1265
|
+
/**
 * Convert a partition tuple into the record written to Avro: null / missing
 * values become `null`, everything else is coerced for the transform's result
 * type with `coerceForAvro`.
 *
 * @param {object} partition - partition values keyed by partition field name.
 * @param {{fields: Array<object>}} schema
 * @param {{fields: Array<object>}} partitionSpec
 * @returns {object}
 * @throws {Error} when a partition field's source is not a top-level schema field.
 */
function partitionToAvroRecord(partition, schema, partitionSpec) {
  const record = {};
  for (const partitionField of partitionSpec.fields) {
    const sourceId = partitionField["source-id"];
    const source = schema.fields.find((candidate) => candidate.id === sourceId);
    if (!source) throw new Error(`partition source field id ${sourceId} not found`);
    const resultType = transformResultType(partitionField.transform, source.type);
    const raw = partition[partitionField.name];
    record[partitionField.name] = raw == null ? null : coerceForAvro(raw, resultType);
  }
  return record;
}
|
|
1276
|
+
/**
 * Build the string used for one partition value inside a grouping key. The
 * value is tagged by kind so that equal-looking values of different kinds do
 * not collide.
 *
 * @param {*} value - transformed partition value.
 * @param {string|object} type - result type of the transform.
 * @returns {string}
 */
function partitionKeyPart(value, type) {
  if (value === null || value === void 0) return "__null__";
  const name = typeof type === "string" ? type : type.type;
  if (name === "uuid") {
    return `uuid:${bytesToHex$1(uuidToBytes(value, "uuid partition value"))}`;
  }
  const isFloatingType = name === "float" || name === "double";
  if (isFloatingType && typeof value === "number") {
    return `${name}:${floatPartitionKey(value, name)}`;
  }
  if (name === "long") return `long:${BigInt(value)}`;
  if (typeof value === "bigint") return `b:${value.toString()}`;
  if (value instanceof Date) return `d:${value.getTime()}`;
  if (value instanceof Uint8Array) return `x:${bytesToHex$1(value)}`;
  return `${typeof value}:${String(value)}`;
}
|
|
1287
|
+
/**
 * Key text for a float/double partition value: "nan" for any NaN, otherwise
 * the hex of its big-endian IEEE-754 bytes (4 bytes for "float", 8 otherwise).
 *
 * @param {number} value
 * @param {string} type - "float" or "double".
 * @returns {string}
 */
function floatPartitionKey(value, type) {
  if (Number.isNaN(value)) return "nan";
  const isSinglePrecision = type === "float";
  const bytes = new Uint8Array(isSinglePrecision ? 4 : 8);
  const view = new DataView(bytes.buffer);
  if (isSinglePrecision) {
    view.setFloat32(0, value, false);
  } else {
    view.setFloat64(0, value, false);
  }
  return bytesToHex$1(bytes);
}
|
|
1295
|
+
/**
 * Map a partition value type to its Avro schema.
 *
 * Decimal, `fixed[N]` and uuid become Avro `fixed` types named
 * `r102_<fieldId>`; date, time and the timestamp types become int/long with a
 * logical type; boolean, int, long, float, double and string map to the Avro
 * primitive of the same name; binary maps to "bytes".
 *
 * @param {string|object} type
 * @param {*} fieldId - partition field id, used to name fixed types.
 * @returns {string|object}
 * @throws {Error} for any other type.
 */
function icebergTypeToAvro(type, fieldId) {
  const name = typeof type === "string" ? type : type.type;
  const fixedOf = (size, extra) => ({
    type: "fixed",
    name: `r102_${fieldId}`,
    size,
    ...extra
  });
  const timestampOf = (logicalType, adjustToUtc) => ({
    type: "long",
    logicalType,
    "adjust-to-utc": adjustToUtc
  });
  const decimal = parseDecimalType(name);
  if (decimal) {
    return fixedOf(decimalRequiredBytes(decimal.precision), {
      logicalType: "decimal",
      precision: decimal.precision,
      scale: decimal.scale
    });
  }
  const fixed = /^fixed\[(\d+)\]$/.exec(name);
  if (fixed) return fixedOf(parseInt(fixed[1], 10));
  switch (name) {
    case "boolean":
    case "int":
    case "long":
    case "float":
    case "double":
    case "string":
      return name;
    case "binary":
      return "bytes";
    case "uuid":
      return fixedOf(16, { logicalType: "uuid" });
    case "date":
      return { type: "int", logicalType: "date" };
    case "time":
      return { type: "long", logicalType: "time-micros" };
    case "timestamp":
      return timestampOf("timestamp-micros", false);
    case "timestamptz":
      return timestampOf("timestamp-micros", true);
    case "timestamp_ns":
      return timestampOf("timestamp-nanos", false);
    case "timestamptz_ns":
      return timestampOf("timestamp-nanos", true);
  }
  throw new Error(`unsupported partition source type: ${name}`);
}
|
|
1357
|
+
/**
 * Coerce a non-null partition value into the representation written to Avro
 * for the given type: bigint for long, 16 bytes for uuid, fixed-width bytes
 * for decimal, and length-checked bytes for `fixed[N]`. Values of any other
 * type pass through unchanged.
 *
 * @param {*} value
 * @param {string|object} type
 * @returns {*}
 * @throws {Error} when a `fixed[N]` value does not have exactly N bytes (the
 *   uuid and decimal helpers may throw as well).
 */
function coerceForAvro(value, type) {
  const name = typeof type === "string" ? type : type.type;
  switch (name) {
    case "long":
      return typeof value === "bigint" ? value : BigInt(value);
    case "uuid":
      return uuidToBytes(value, "uuid partition value");
  }
  const decimal = parseDecimalType(name);
  if (decimal) {
    const { precision, scale } = decimal;
    return decimalToFixedBytes(value, precision, scale, `decimal(${precision},${scale}) partition value`);
  }
  const fixed = /^fixed\[(\d+)\]$/.exec(name);
  if (!fixed) return value;
  const bytes = toUint8Array(value);
  const expected = parseInt(fixed[1], 10);
  if (bytes.length !== expected) throw new Error(`expected fixed[${expected}] partition value`);
  return bytes;
}
|
|
1372
|
+
/**
 * Create a new table by writing its first metadata file
 * (`<tableUrl>/metadata/v1.metadata.json`) followed by `version-hint.text`.
 *
 * @param {object} options
 * @param {string} options.tableUrl - base location of the table
 * @param {object} options.resolver - must expose `writer(url, options?)`
 * @param {object} [options.schema] - initial schema; defaults to an empty struct with schema-id 0
 * @param {number} [options.formatVersion] - 2 or 3; defaults to `properties["format-version"]`, else 2
 * @param {object} [options.partitionSpec] - defaults to an empty spec with spec-id 0
 * @param {object} [options.sortOrder] - defaults to an empty order with order-id 0
 * @param {object} [options.properties] - table properties stored verbatim
 * @param {boolean} [options.conditionalCommits] - when truthy the metadata writer is opened with `ifNoneMatch: "*"`
 * @returns {Promise<object>} the metadata object that was written
 * @throws {Error} on a missing tableUrl / resolver.writer or an unsupported format version
 */
async function icebergCreate({ tableUrl, resolver, schema, formatVersion, partitionSpec, sortOrder, properties, conditionalCommits }) {
  if (!tableUrl) throw new Error("tableUrl is required");
  // Fail fast, and tolerate a missing resolver object, before doing any validation work.
  if (!resolver?.writer) throw new Error("resolver.writer is required");
  let version = formatVersion;
  if (version === void 0) {
    const propVersion = properties?.["format-version"];
    version = propVersion !== void 0 ? Number(propVersion) : 2;
  }
  if (version !== 2 && version !== 3) throw new Error(`unsupported format-version: ${version}`);
  const metadataVersion = 1;
  const metadataUrl = `${tableUrl}/metadata/v${metadataVersion}.metadata.json`;
  const initialSchema = schema ?? {
    type: "struct",
    "schema-id": 0,
    fields: []
  };
  validateSchemaForVersion(initialSchema, version);
  const initialPartitionSpec = partitionSpec ?? {
    "spec-id": 0,
    fields: []
  };
  validatePartitionSpecForWrite(initialSchema, initialPartitionSpec);
  const initialSortOrder = sortOrder ?? {
    "order-id": 0,
    fields: []
  };
  const metadata = {
    "format-version": version,
    "table-uuid": uuid4(),
    location: tableUrl,
    "last-sequence-number": 0,
    "last-updated-ms": Date.now(),
    "last-column-id": maxFieldId(initialSchema.fields),
    "current-schema-id": initialSchema["schema-id"] ?? 0,
    schemas: [initialSchema],
    "default-spec-id": initialPartitionSpec["spec-id"],
    "partition-specs": [initialPartitionSpec],
    "last-partition-id": maxPartitionFieldId(initialPartitionSpec.fields),
    "sort-orders": [initialSortOrder],
    "default-sort-order-id": initialSortOrder["order-id"]
  };
  if (properties) metadata.properties = properties;
  if (version >= 3) metadata["next-row-id"] = 0;
  const metadataWriter = conditionalCommits ? resolver.writer(metadataUrl, { ifNoneMatch: "*" }) : resolver.writer(metadataUrl);
  const encoder = new TextEncoder();
  metadataWriter.appendBytes(encoder.encode(stringifyIcebergJson(metadata, 2)));
  await metadataWriter.finish();
  const versionHintUrl = `${tableUrl}/metadata/version-hint.text`;
  try {
    const versionHintWriter = resolver.writer(versionHintUrl);
    versionHintWriter.appendBytes(encoder.encode(String(metadataVersion)));
    await versionHintWriter.finish();
  } catch (err) {
    // With conditional commits the hint is treated as best-effort; otherwise its failure is fatal.
    if (!conditionalCommits) throw err;
  }
  return metadata;
}
|
|
1429
|
+
/**
 * Largest `field-id` among the given partition fields.
 *
 * @param {Array<object>} [partitionFields] - partition field objects
 * @returns {number} the highest `field-id`, or 0 when none exceeds 0
 */
function maxPartitionFieldId(partitionFields = []) {
  return partitionFields.reduce((highest, partitionField) => {
    const candidate = partitionField["field-id"];
    return highest < candidate ? candidate : highest;
  }, 0);
}
|
|
1434
|
+
/**
 * Commit staged changes by writing the next `v<N>.metadata.json` file under
 * `<tableUrl>/metadata/`, then updating `version-hint.text`.
 *
 * @param {object} options
 * @param {string} options.tableUrl - base location of the table
 * @param {object} options.metadata - current table metadata
 * @param {string} [options.metadataFileName] - file name of the current metadata, recorded in the metadata log
 * @param {number} [options.currentVersion] - current version; derived from the metadata log when omitted
 * @param {{requirements: object[], updates: object[]}} options.staged - requirements to check and updates to apply
 * @param {object} options.resolver - must expose `writer`; an optional `deleter` removes dropped metadata files
 * @param {boolean} [options.conditionalCommits] - when truthy the metadata writer is opened with `ifNoneMatch: "*"`
 * @returns {Promise<object>} the new metadata that was written
 * @throws {Error} on missing arguments, failed requirements or invalid updates
 */
async function fileCatalogCommit({ tableUrl, metadata, metadataFileName, currentVersion, staged, resolver, conditionalCommits }) {
  if (!tableUrl) throw new Error("tableUrl is required");
  if (!resolver?.writer) throw new Error("resolver.writer is required");
  checkRequirements(metadata, staged.requirements);
  // add-snapshot supplies its own last-updated-ms; any other commit is stamped with the current time.
  const addsSnapshot = staged.updates.some((up) => up.action === "add-snapshot");
  const base = addsSnapshot ? metadata : {
    ...metadata,
    "last-updated-ms": Date.now()
  };
  const updated = applyUpdates(base, staged.updates);
  const priorMetadataLog = metadata["metadata-log"] ?? [];
  const derivedVersion = currentVersion ?? deriveCurrentVersion(priorMetadataLog);
  const newVersion = derivedVersion + 1;
  const currentMetadataPath = metadataFileName ? `${tableUrl}/metadata/${metadataFileName}` : `${tableUrl}/metadata/v${derivedVersion}.metadata.json`;
  const newMetadataPath = `${tableUrl}/metadata/v${newVersion}.metadata.json`;
  const appendedLog = [...priorMetadataLog, {
    "timestamp-ms": metadata["last-updated-ms"],
    "metadata-file": currentMetadataPath
  }];
  // Keep at most `max` log entries; a non-positive or non-numeric setting disables trimming.
  const max = Number(updated.properties?.["write.metadata.previous-versions-max"] ?? 100);
  const droppedLog = max > 0 && appendedLog.length > max ? appendedLog.slice(0, appendedLog.length - max) : [];
  const trimmedLog = droppedLog.length > 0 ? appendedLog.slice(-max) : appendedLog;
  const newMetadata = {
    ...updated,
    "metadata-log": trimmedLog
  };
  const encoder = new TextEncoder();
  const metaWriter = conditionalCommits ? resolver.writer(newMetadataPath, { ifNoneMatch: "*" }) : resolver.writer(newMetadataPath);
  metaWriter.appendBytes(encoder.encode(stringifyIcebergJson(newMetadata, 2)));
  await metaWriter.finish();
  try {
    const hintWriter = resolver.writer(`${tableUrl}/metadata/version-hint.text`);
    hintWriter.appendBytes(encoder.encode(String(newVersion)));
    await hintWriter.finish();
  } catch {
    // Deliberately best-effort: the metadata file is already written at this point.
  }
  if (updated.properties?.["write.metadata.delete-after-commit.enabled"] === "true" && droppedLog.length > 0 && resolver.deleter) {
    const { deleter } = resolver;
    // The async wrapper turns a synchronous throw from `deleter` into a rejection that
    // allSettled absorbs, so cleanup can never fail a commit that has already been written.
    await Promise.allSettled(droppedLog.map(async (entry) => deleter(entry["metadata-file"])));
  }
  return newMetadata;
}
|
|
1472
|
+
/**
 * Infer the current metadata version from the metadata log.
 *
 * The last log entry names the previous metadata file, so the current
 * version is that file's number plus one. File names of the form
 * `v<N>.metadata.json` and `<N>-<hex/dashes>.metadata.json` are recognised.
 *
 * @param {Array<object>} priorMetadataLog - entries carrying a `metadata-file` path
 * @returns {number} 1 for an empty log; otherwise parsed number + 1, or log length + 1 when the name is not recognised
 */
function deriveCurrentVersion(priorMetadataLog) {
  const entryCount = priorMetadataLog.length;
  if (entryCount === 0) return 1;
  const lastEntry = priorMetadataLog[entryCount - 1];
  const fileName = lastEntry["metadata-file"].split("/").pop() ?? "";
  const parsed = /^(?:v(\d+)|0*(\d+)-[0-9a-f-]+)\.metadata\.json$/.exec(fileName);
  if (!parsed) return entryCount + 1;
  const [, vNumber, sequenceNumber] = parsed;
  return Number(vNumber ?? sequenceNumber) + 1;
}
|
|
1478
|
+
/**
 * Verify every commit requirement against the current table metadata.
 *
 * @param {object} metadata - current table metadata
 * @param {Array<object>} requirements - requirement objects discriminated by `type`
 * @throws {Error} on the first requirement that does not hold, or on an unknown requirement type
 */
function checkRequirements(metadata, requirements) {
  for (const req of requirements) {
    switch (req.type) {
      case "assert-create":
        throw new Error("requirement failed: assert-create against an existing table");
      case "assert-table-uuid": {
        if (metadata["table-uuid"] !== req.uuid) throw new Error(`requirement failed: table-uuid expected ${req.uuid}, got ${metadata["table-uuid"]}`);
        break;
      }
      case "assert-ref-snapshot-id": {
        const refSnapshot = (metadata.refs ?? {})[req.ref]?.["snapshot-id"] ?? null;
        // "main" falls back to current-snapshot-id when no explicit ref is recorded.
        const current = refSnapshot === null && req.ref === "main" ? metadata["current-snapshot-id"] ?? null : refSnapshot;
        const expected = req["snapshot-id"];
        // Ids are compared through BigInt so differing representations of the same id match.
        const matches = current === expected || (current != null && expected != null && BigInt(current) === BigInt(expected));
        if (!matches) throw new Error(`requirement failed: ref ${req.ref} expected snapshot ${expected}, got ${current}`);
        break;
      }
      case "assert-next-row-id": {
        const current = Number(metadata["next-row-id"] ?? 0);
        if (current !== req["next-row-id"]) throw new Error(`requirement failed: next-row-id expected ${req["next-row-id"]}, got ${current}`);
        break;
      }
      case "assert-current-schema-id": {
        const current = metadata["current-schema-id"];
        if (current !== req["current-schema-id"]) throw new Error(`requirement failed: current-schema-id expected ${req["current-schema-id"]}, got ${current}`);
        break;
      }
      case "assert-last-assigned-field-id": {
        const current = metadata["last-column-id"];
        if (current !== req["last-assigned-field-id"]) throw new Error(`requirement failed: last-assigned-field-id expected ${req["last-assigned-field-id"]}, got ${current}`);
        break;
      }
      case "assert-last-assigned-partition-id": {
        const current = metadata["last-partition-id"];
        if (current !== req["last-assigned-partition-id"]) throw new Error(`requirement failed: last-assigned-partition-id expected ${req["last-assigned-partition-id"]}, got ${current}`);
        break;
      }
      case "assert-default-spec-id": {
        const current = metadata["default-spec-id"];
        if (current !== req["default-spec-id"]) throw new Error(`requirement failed: default-spec-id expected ${req["default-spec-id"]}, got ${current}`);
        break;
      }
      case "assert-default-sort-order-id": {
        const current = metadata["default-sort-order-id"];
        if (current !== req["default-sort-order-id"]) throw new Error(`requirement failed: default-sort-order-id expected ${req["default-sort-order-id"]}, got ${current}`);
        break;
      }
      default:
        throw new Error(`unknown requirement: ${JSON.stringify(req)}`);
    }
  }
}
|
|
1507
|
+
/**
 * Apply a list of table updates to a shallow copy of `metadata`.
 *
 * Each update is discriminated by `action`. An id of -1 means "assign the
 * next free id" for the add-* actions and "use the last added entry" for
 * the set-* actions.
 *
 * @param {object} metadata - current table metadata (the object itself is not modified)
 * @param {Array<object>} updates - updates applied in order
 * @returns {object} the resulting metadata
 * @throws {Error} on duplicate or missing ids, failed validation, or an unknown action
 */
function applyUpdates(metadata, updates) {
  let next = { ...metadata };
  for (const up of updates) {
    switch (up.action) {
      case "add-snapshot": {
        const snapshot = up.snapshot;
        const existing = next.snapshots ?? [];
        if (existing.some((s) => s["snapshot-id"] === snapshot["snapshot-id"])) throw new Error(`add-snapshot: snapshot-id ${snapshot["snapshot-id"]} already exists`);
        next = {
          ...next,
          snapshots: [...existing, snapshot],
          "last-sequence-number": Math.max(next["last-sequence-number"] ?? 0, snapshot["sequence-number"]),
          "last-updated-ms": snapshot["timestamp-ms"]
        };
        const tracksRows = next["format-version"] >= 3 && snapshot["first-row-id"] !== void 0 && snapshot["added-rows"] !== void 0;
        if (tracksRows) {
          const rowIdAfterSnapshot = snapshot["first-row-id"] + snapshot["added-rows"];
          next["next-row-id"] = Math.max(Number(next["next-row-id"] ?? 0), rowIdAfterSnapshot);
        }
        break;
      }
      case "set-properties":
        next = { ...next, properties: { ...next.properties, ...up.updates } };
        break;
      case "remove-properties": {
        const remaining = { ...next.properties };
        for (const key of up.removals) delete remaining[key];
        next = { ...next, properties: remaining };
        break;
      }
      case "add-schema": {
        const knownSchemas = next.schemas ?? [];
        let schemaId = up.schema["schema-id"];
        if (schemaId === -1) {
          schemaId = knownSchemas.reduce((highest, s) => Math.max(highest, s["schema-id"]), -1) + 1;
        } else if (knownSchemas.some((s) => s["schema-id"] === schemaId)) {
          throw new Error(`add-schema: schema-id ${schemaId} already exists`);
        }
        const addedSchema = { ...up.schema, "schema-id": schemaId };
        validateSchemaForVersion(addedSchema, next["format-version"]);
        const priorLastColumnId = next["last-column-id"] ?? 0;
        validateAssignedFieldIds(addedSchema, currentAssignedIdIndex(knownSchemas, next["current-schema-id"]), priorLastColumnId);
        validateSchemaEvolution(knownSchemas, addedSchema, priorLastColumnId, next["format-version"]);
        validateNewRequiredFields(addedSchema, priorLastColumnId);
        next = {
          ...next,
          schemas: [...knownSchemas, addedSchema],
          "last-column-id": Math.max(priorLastColumnId, maxFieldId(addedSchema.fields))
        };
        break;
      }
      case "set-current-schema": {
        const knownSchemas = next.schemas ?? [];
        let targetId = up["schema-id"];
        if (targetId === -1) {
          if (knownSchemas.length === 0) throw new Error("set-current-schema: table has no schemas");
          targetId = knownSchemas[knownSchemas.length - 1]["schema-id"];
        } else if (!knownSchemas.some((s) => s["schema-id"] === targetId)) {
          throw new Error(`set-current-schema: schema-id ${targetId} not found`);
        }
        next = { ...next, "current-schema-id": targetId };
        break;
      }
      case "add-sort-order": {
        const knownOrders = next["sort-orders"] ?? [];
        let orderId = up["sort-order"]["order-id"];
        if (orderId === -1) {
          orderId = knownOrders.reduce((highest, o) => Math.max(highest, o["order-id"]), -1) + 1;
        } else if (knownOrders.some((o) => o["order-id"] === orderId)) {
          throw new Error(`add-sort-order: order-id ${orderId} already exists`);
        }
        const addedOrder = { ...up["sort-order"], "order-id": orderId };
        next = { ...next, "sort-orders": [...knownOrders, addedOrder] };
        break;
      }
      case "set-default-sort-order": {
        const knownOrders = next["sort-orders"] ?? [];
        let targetId = up["sort-order-id"];
        if (targetId === -1) {
          if (knownOrders.length === 0) throw new Error("set-default-sort-order: table has no sort orders");
          targetId = knownOrders[knownOrders.length - 1]["order-id"];
        } else if (!knownOrders.some((o) => o["order-id"] === targetId)) {
          throw new Error(`set-default-sort-order: sort-order-id ${targetId} not found`);
        }
        next = { ...next, "default-sort-order-id": targetId };
        break;
      }
      case "add-spec": {
        const knownSpecs = next["partition-specs"] ?? [];
        let specId = up.spec["spec-id"];
        if (specId === -1) {
          specId = knownSpecs.reduce((highest, s) => Math.max(highest, s["spec-id"]), -1) + 1;
        } else if (knownSpecs.some((s) => s["spec-id"] === specId)) {
          throw new Error(`add-spec: spec-id ${specId} already exists`);
        }
        const addedSpec = { ...up.spec, "spec-id": specId };
        validatePartitionSpecEvolution(knownSpecs, addedSpec, currentSchemaForMetadata(next));
        let lastPartitionId = next["last-partition-id"] ?? 0;
        for (const partitionField of addedSpec.fields) {
          if (partitionField["field-id"] > lastPartitionId) lastPartitionId = partitionField["field-id"];
        }
        next = {
          ...next,
          "partition-specs": [...knownSpecs, addedSpec],
          "last-partition-id": lastPartitionId
        };
        break;
      }
      case "set-default-spec": {
        const knownSpecs = next["partition-specs"] ?? [];
        let targetId = up["spec-id"];
        if (targetId === -1) {
          if (knownSpecs.length === 0) throw new Error("set-default-spec: table has no partition specs");
          targetId = knownSpecs[knownSpecs.length - 1]["spec-id"];
        } else if (!knownSpecs.some((s) => s["spec-id"] === targetId)) {
          throw new Error(`set-default-spec: spec-id ${targetId} not found`);
        }
        next = { ...next, "default-spec-id": targetId };
        break;
      }
      case "remove-snapshots": {
        const doomed = new Set(up["snapshot-ids"]);
        const isKept = (entry) => !doomed.has(entry["snapshot-id"]);
        next = {
          ...next,
          snapshots: (next.snapshots ?? []).filter(isKept),
          "snapshot-log": (next["snapshot-log"] ?? []).filter(isKept)
        };
        break;
      }
      case "set-snapshot-ref": {
        const ref = {
          "snapshot-id": up["snapshot-id"],
          type: up.type
        };
        // Retention settings are copied only when the update provides them.
        for (const option of ["min-snapshots-to-keep", "max-snapshot-age-ms", "max-ref-age-ms"]) {
          if (up[option] !== void 0) ref[option] = up[option];
        }
        next = {
          ...next,
          refs: { ...next.refs, [up["ref-name"]]: ref }
        };
        // Moving the main branch also moves current-snapshot-id and extends the snapshot log.
        if (up["ref-name"] === "main" && up.type === "branch") {
          next["current-snapshot-id"] = up["snapshot-id"];
          next["snapshot-log"] = [...next["snapshot-log"] ?? [], {
            "timestamp-ms": next["last-updated-ms"],
            "snapshot-id": up["snapshot-id"]
          }];
        }
        break;
      }
      default:
        throw new Error(`unknown update: ${JSON.stringify(up)}`);
    }
  }
  return next;
}
|
|
1653
|
+
/**
 * Build an index of the ids assigned in the current schema.
 *
 * @param {Array<object>} schemas - all known schemas
 * @param {number} currentSchemaId - id of the current schema; the last schema is used when it is not found
 * @returns {Map} index filled by `indexAssignedFieldIds` (empty when there are no schemas)
 */
function currentAssignedIdIndex(schemas, currentSchemaId) {
  const index = /* @__PURE__ */ new Map();
  const active = schemas.find((candidate) => candidate["schema-id"] === currentSchemaId) ?? schemas[schemas.length - 1];
  if (!active) return index;
  indexAssignedFieldIds(active.fields, "", index);
  return index;
}
|
|
1659
|
+
/**
 * Record each field's id with its dotted path, then descend into the field's type.
 *
 * @param {Array<object>} fields - struct fields with `id`, `name` and `type`
 * @param {string} prefix - dotted path of the enclosing struct ("" at the top level)
 * @param {Map} assignedIds - index to fill: id -> { kind, path }
 */
function indexAssignedFieldIds(fields, prefix, assignedIds) {
  for (const { id, name, type } of fields) {
    const path = prefix ? `${prefix}.${name}` : name;
    assignedIds.set(id, { kind: "field", path });
    indexAssignedTypeIds(type, path, assignedIds);
  }
}
|
|
1669
|
+
/**
 * Record the ids carried by a nested type: list element ids and map
 * key / value ids. Struct types are handed to `indexAssignedFieldIds`.
 * String (primitive) types carry no ids and are ignored.
 *
 * @param {string|object} type - the type to inspect
 * @param {string} path - dotted path of the value that has this type
 * @param {Map} assignedIds - index to fill: id -> { kind, path }
 */
function indexAssignedTypeIds(type, path, assignedIds) {
  if (typeof type === "string") return;
  switch (type.type) {
    case "struct":
      indexAssignedFieldIds(type.fields, path, assignedIds);
      break;
    case "list": {
      const elementPath = `${path}.element`;
      assignedIds.set(type["element-id"], { kind: "list element", path: elementPath });
      indexAssignedTypeIds(type.element, elementPath, assignedIds);
      break;
    }
    case "map": {
      const keyPath = `${path}.key`;
      const valuePath = `${path}.value`;
      assignedIds.set(type["key-id"], { kind: "map key", path: keyPath });
      assignedIds.set(type["value-id"], { kind: "map value", path: valuePath });
      indexAssignedTypeIds(type.key, keyPath, assignedIds);
      indexAssignedTypeIds(type.value, valuePath, assignedIds);
      break;
    }
  }
}
|
|
1691
|
+
/**
 * Validate every id used by `schema`, starting from its top-level fields.
 *
 * @param {object} schema - schema whose `fields` are checked
 * @param {Map} priorAssignedIds - index of ids assigned before this schema
 * @param {number} priorLastColumnId - highest id assigned before this schema
 */
function validateAssignedFieldIds(schema, priorAssignedIds, priorLastColumnId) {
  const { fields } = schema;
  validateAssignedFields(fields, "", priorAssignedIds, priorLastColumnId);
}
|
|
1694
|
+
/**
 * Validate the id of each field, then the ids nested inside its type.
 *
 * @param {Array<object>} fields - struct fields with `id`, `name` and `type`
 * @param {string} prefix - dotted path of the enclosing struct ("" at the top level)
 * @param {Map} priorAssignedIds - index of ids assigned before this schema
 * @param {number} priorLastColumnId - highest id assigned before this schema
 */
function validateAssignedFields(fields, prefix, priorAssignedIds, priorLastColumnId) {
  for (const { id, name, type } of fields) {
    const path = prefix ? `${prefix}.${name}` : name;
    validateAssignedId(id, "field", path, priorAssignedIds, priorLastColumnId);
    validateAssignedTypeIds(type, path, priorAssignedIds, priorLastColumnId);
  }
}
|
|
1701
|
+
/**
 * Validate the ids carried by a nested type: list element ids and map
 * key / value ids. Struct types are handed to `validateAssignedFields`.
 * String (primitive) types carry no ids and are ignored.
 *
 * @param {string|object} type - the type to inspect
 * @param {string} path - dotted path of the value that has this type
 * @param {Map} priorAssignedIds - index of ids assigned before this schema
 * @param {number} priorLastColumnId - highest id assigned before this schema
 */
function validateAssignedTypeIds(type, path, priorAssignedIds, priorLastColumnId) {
  if (typeof type === "string") return;
  switch (type.type) {
    case "struct":
      validateAssignedFields(type.fields, path, priorAssignedIds, priorLastColumnId);
      break;
    case "list": {
      const elementPath = `${path}.element`;
      validateAssignedId(type["element-id"], "list element", elementPath, priorAssignedIds, priorLastColumnId);
      validateAssignedTypeIds(type.element, elementPath, priorAssignedIds, priorLastColumnId);
      break;
    }
    case "map": {
      const keyPath = `${path}.key`;
      const valuePath = `${path}.value`;
      validateAssignedId(type["key-id"], "map key", keyPath, priorAssignedIds, priorLastColumnId);
      validateAssignedId(type["value-id"], "map value", valuePath, priorAssignedIds, priorLastColumnId);
      validateAssignedTypeIds(type.key, keyPath, priorAssignedIds, priorLastColumnId);
      validateAssignedTypeIds(type.value, valuePath, priorAssignedIds, priorLastColumnId);
      break;
    }
  }
}
|
|
1714
|
+
/**
 * Check a single id used by a new schema. Ids above `priorLastColumnId`
 * are new and always accepted; older ids must already be assigned to the
 * same kind of element.
 *
 * @param {number} id - id being used
 * @param {string} kind - "field", "list element", "map key" or "map value"
 * @param {string} path - dotted path, used in error messages
 * @param {Map} priorAssignedIds - index of previously assigned ids: id -> { kind, path }
 * @param {number} priorLastColumnId - highest id assigned before this schema
 * @throws {Error} when an old id was never assigned or belongs to a different kind
 */
function validateAssignedId(id, kind, path, priorAssignedIds, priorLastColumnId) {
  const isNewId = id > priorLastColumnId;
  if (isNewId) return;
  const previous = priorAssignedIds.get(id);
  if (!previous) {
    throw new Error(`add-schema: ${kind} ${path} uses unassigned id ${id} (last-column-id ${priorLastColumnId})`);
  }
  if (previous.kind !== kind) {
    throw new Error(`add-schema: ${kind} ${path} uses id ${id} previously assigned to ${previous.kind} ${previous.path}`);
  }
}
|
|
1720
|
+
/**
 * Require non-null `initial-default` and `write-default` values on every
 * newly added required top-level field (id above `priorLastColumnId`).
 *
 * @param {object} schema - schema whose top-level `fields` are checked
 * @param {number} priorLastColumnId - highest id assigned before this schema
 * @throws {Error} when a new required field lacks either default
 */
function validateNewRequiredFields(schema, priorLastColumnId) {
  for (const field of schema.fields) {
    const isNewRequired = field.id > priorLastColumnId && field.required;
    if (!isNewRequired) continue;
    if (field["initial-default"] == null) {
      throw new Error(`add-schema: required field ${field.name} (id ${field.id}) needs a non-null initial-default`);
    }
    if (field["write-default"] == null) {
      throw new Error(`add-schema: required field ${field.name} (id ${field.id}) needs a non-null write-default`);
    }
  }
}
|
|
1726
|
+
/**
 * Check that each pre-existing top-level field of `newSchema` evolves
 * legally: its type must be promotable from the latest prior definition
 * and its `initial-default` must not change.
 *
 * @param {Array<object>} schemas - previously known schemas
 * @param {object} newSchema - schema being added
 * @param {number} priorLastColumnId - highest id assigned before this schema; higher ids are new and skipped
 * @param {number} formatVersion - table format version, passed to the promotion check
 * @throws {Error} on an illegal type change or a changed initial-default
 */
function validateSchemaEvolution(schemas, newSchema, priorLastColumnId, formatVersion) {
  const carriedOver = newSchema.fields.filter((field) => !(field.id > priorLastColumnId));
  for (const field of carriedOver) {
    const previous = latestFieldById(schemas, field.id);
    if (!previous) continue;
    if (!canPromoteType(previous.type, field.type, formatVersion)) {
      throw new Error(`add-schema: cannot promote field ${field.name} from ${typeToString(previous.type)} to ${typeToString(field.type)}`);
    }
    if (!defaultsEqual(previous["initial-default"], field["initial-default"])) {
      throw new Error(`add-schema: initial-default for field ${field.name} cannot change`);
    }
  }
}
|
|
1735
|
+
/**
 * Find the most recent top-level definition of a field id, searching the
 * schemas from last to first.
 *
 * @param {Array<object>} schemas - schemas in the order they were added
 * @param {number} id - field id to look up
 * @returns {object|undefined} the field from the latest schema containing it
 */
function latestFieldById(schemas, id) {
  let index = schemas.length;
  while (index-- > 0) {
    const hit = schemas[index].fields.find((candidate) => candidate.id === id);
    if (hit) return hit;
  }
  return undefined;
}
|
|
1741
|
+
/**
 * Decide whether a field may change from type `from` to type `to`.
 *
 * Equal types are always allowed. Otherwise only primitive (string) types
 * can be promoted: int -> long, float -> double, a decimal change accepted
 * by `decimalPromotionAllowed`, and for format version 3+ also
 * unknown -> anything and date -> timestamp / timestamp_ns.
 *
 * @param {string|object} from - previous type
 * @param {string|object} to - new type
 * @param {number} formatVersion - table format version
 * @returns {boolean} true when the change is allowed
 */
function canPromoteType(from, to, formatVersion) {
  if (typesEqual(from, to)) return true;
  const bothPrimitive = typeof from === "string" && typeof to === "string";
  if (!bothPrimitive) return false;
  const isV3 = formatVersion >= 3;
  return (
    (isV3 && from === "unknown") ||
    (from === "int" && to === "long") ||
    (from === "float" && to === "double") ||
    (isV3 && from === "date" && (to === "timestamp" || to === "timestamp_ns")) ||
    decimalPromotionAllowed(from, to)
  );
}
|
|
1750
|
+
/**
 * Structural equality of two types.
 *
 * Primitive types are strings and compared directly. Nested types are
 * compared through their JSON form with object keys sorted, so the same
 * type serialised with a different key order still compares equal. Array
 * order (for example struct field order) remains significant.
 *
 * @param {string|object} a - first type
 * @param {string|object} b - second type
 * @returns {boolean} true when both describe the same type
 */
function typesEqual(a, b) {
  if (typeof a === "string" || typeof b === "string") return a === b;
  // The replacer rebuilds each plain object with sorted keys before it is serialised.
  const canonical = (node) => JSON.stringify(node, (_key, val) => {
    if (val === null || typeof val !== "object" || Array.isArray(val)) return val;
    return Object.fromEntries(Object.keys(val).sort().map((key) => [key, val[key]]));
  });
  return canonical(a) === canonical(b);
}
|
|
1754
|
+
/**
 * A decimal may only be promoted to a strictly wider precision with the same scale.
 *
 * @param {string} from - previous type name
 * @param {string} to - new type name
 * @returns {boolean} true when both parse as decimals and the change widens precision only
 */
function decimalPromotionAllowed(from, to) {
  const source = parseDecimalType(from);
  const target = parseDecimalType(to);
  if (!source || !target) return false;
  return target.precision > source.precision && target.scale === source.scale;
}
|
|
1759
|
+
/**
 * Render a type for error messages: primitive names as-is, nested types as JSON.
 *
 * @param {string|object} type - type to render
 * @returns {string} printable form of the type
 */
function typeToString(type) {
  if (typeof type === "string") return type;
  return JSON.stringify(type);
}
|
|
1762
|
+
/**
 * Deep equality for default values.
 *
 * Primitives are compared with `Object.is`; arrays element by element;
 * other objects by their own enumerable keys, ignoring key order.
 *
 * @param {*} a - first value
 * @param {*} b - second value
 * @returns {boolean} true when the two values are deeply equal
 */
function defaultsEqual(a, b) {
  if (Object.is(a, b)) return true;
  const bothObjects = Boolean(a) && Boolean(b) && typeof a === "object" && typeof b === "object";
  if (!bothObjects) return false;
  const aIsArray = Array.isArray(a);
  const bIsArray = Array.isArray(b);
  if (aIsArray || bIsArray) {
    if (!(aIsArray && bIsArray) || a.length !== b.length) return false;
    let position = 0;
    while (position < a.length) {
      if (!defaultsEqual(a[position], b[position])) return false;
      position += 1;
    }
    return true;
  }
  const leftKeys = Object.keys(a);
  if (leftKeys.length !== Object.keys(b).length) return false;
  for (const key of leftKeys) {
    if (!Object.hasOwn(b, key)) return false;
    if (!defaultsEqual(a[key], b[key])) return false;
  }
  return true;
}
|
|
1779
|
+
/**
 * Validate a partition spec that is about to be added to the table.
 *
 * The spec must be writable against `schema`, must not be equivalent to an
 * existing spec, and any field equivalent to an existing partition field
 * must reuse that field's `field-id`.
 *
 * @param {Array<object>} specs - existing partition specs
 * @param {object} newSpec - spec being added
 * @param {object} schema - current table schema
 * @throws {Error} when any of the checks fails
 */
function validatePartitionSpecEvolution(specs, newSpec, schema) {
  validateWritablePartitionSpec(newSpec, schema);
  const alreadyPresent = specs.some((existing) => partitionSpecsEquivalent(existing, newSpec));
  if (alreadyPresent) throw new Error("add-spec: equivalent partition spec already exists");
  for (const candidate of newSpec.fields) {
    const twin = equivalentPartitionField(specs, candidate);
    if (!twin) continue;
    if (twin["field-id"] !== candidate["field-id"]) {
      throw new Error(`add-spec: partition field ${candidate.name} must reuse field-id ${twin["field-id"]}`);
    }
  }
}
|
|
1787
|
+
/**
 * Run the write-time partition spec validation, prefixing
 * "unsupported partition transform" failures with "add-spec: ".
 * Every other failure is rethrown untouched.
 *
 * @param {object} spec - partition spec being added
 * @param {object} schema - current table schema
 * @throws {Error} whatever the underlying validation throws
 */
function validateWritablePartitionSpec(spec, schema) {
  try {
    validatePartitionSpecForWrite(schema, spec, "add-spec");
  } catch (err) {
    const text = err instanceof Error ? err.message : String(err);
    if (!text.startsWith("unsupported partition transform: ")) throw err;
    throw new Error(`add-spec: ${text}`);
  }
}
|
|
1796
|
+
/**
 * Return the schema whose `schema-id` equals the metadata's `current-schema-id`.
 *
 * @param {object} metadata - table metadata
 * @returns {object} the current schema
 * @throws {Error} when no schema has the current id
 */
function currentSchemaForMetadata(metadata) {
  const wantedId = metadata["current-schema-id"];
  const current = metadata.schemas?.find((candidate) => candidate["schema-id"] === wantedId);
  if (current) return current;
  throw new Error("add-spec: current schema not found in metadata");
}
|
|
1801
|
+
/**
 * Find the first existing partition field equivalent to `field`,
 * scanning specs in order.
 *
 * @param {Array<object>} specs - existing partition specs
 * @param {object} field - partition field to match
 * @returns {object|undefined} the first equivalent existing field, if any
 */
function equivalentPartitionField(specs, field) {
  for (const { fields } of specs) {
    const match = fields.find((candidate) => partitionFieldsEquivalent(candidate, field));
    if (match) return match;
  }
  return undefined;
}
|
|
1807
|
+
/**
 * Two specs are equivalent when they have the same number of fields and
 * the fields are pairwise equivalent in order.
 *
 * @param {object} a - first partition spec
 * @param {object} b - second partition spec
 * @returns {boolean} true when the specs are equivalent
 */
function partitionSpecsEquivalent(a, b) {
  const total = a.fields.length;
  if (total !== b.fields.length) return false;
  let position = 0;
  while (position < total) {
    if (!partitionFieldsEquivalent(a.fields[position], b.fields[position])) return false;
    position += 1;
  }
  return true;
}
|
|
1812
|
+
/**
 * Two partition fields are equivalent when source-id, source-ids,
 * transform and name all match. `field-id` is not compared.
 *
 * @param {object} a - first partition field
 * @param {object} b - second partition field
 * @returns {boolean} true when the fields are equivalent
 */
function partitionFieldsEquivalent(a, b) {
  if (a["source-id"] !== b["source-id"]) return false;
  if (!idsListEquivalent(a["source-ids"], b["source-ids"])) return false;
  if (a.transform !== b.transform) return false;
  return a.name === b.name;
}
|
|
1815
|
+
/**
 * Compare two optional id lists. Two missing lists are equivalent; a
 * missing list never matches a present one; present lists must have the
 * same length and the same ids in the same order.
 *
 * @param {Array<number>|undefined} a - first list
 * @param {Array<number>|undefined} b - second list
 * @returns {boolean} true when the lists are equivalent
 */
function idsListEquivalent(a, b) {
  if (a === void 0 || b === void 0) return a === b;
  if (a.length !== b.length) return false;
  let position = a.length;
  while (position-- > 0) {
    if (a[position] !== b[position]) return false;
  }
  return true;
}
|
|
1821
|
+
/**
 * Write records as an Avro object container file with the "null" codec.
 *
 * Layout written: the 4-byte magic, a metadata map (caller metadata plus
 * `avro.schema` and `avro.codec`), a 16-byte sync marker, then one block
 * per `blockSize` records (record count, byte size, payload, sync marker).
 *
 * @param {object} options
 * @param {object} options.writer - byte writer receiving the output
 * @param {object|string} options.schema - record schema, as an object or as its JSON text
 * @param {Array<object>} options.records - records keyed by field name
 * @param {number} [options.blockSize=512] - records per block; must be positive when there are records
 * @param {object} [options.metadata] - extra header metadata entries
 * @returns {*} the result of `writer.finish()`
 * @throws {RangeError} when records are given with a non-positive blockSize
 */
function avroWrite({ writer, schema, records, blockSize = 512, metadata }) {
  const hasRecords = records.length > 0;
  // A non-positive step would never advance the block loop below.
  if (hasRecords && !(blockSize > 0)) throw new RangeError("blockSize must be a positive number");
  // A schema given as JSON text is parsed so its field list can drive record encoding.
  const fields = hasRecords ? (typeof schema === "string" ? JSON.parse(schema) : schema).fields : [];
  // Magic bytes "Obj" + 0x01, written as a single uint32.
  writer.appendUint32(23749199);
  const meta = {
    ...metadata,
    "avro.schema": typeof schema === "string" ? schema : JSON.stringify(schema),
    "avro.codec": "null"
  };
  const encoder = new TextEncoder();
  appendZigZag(writer, Object.keys(meta).length);
  for (const [key, value] of Object.entries(meta)) {
    const kb = encoder.encode(key);
    appendZigZag(writer, kb.length);
    writer.appendBytes(kb);
    const vb = encoder.encode(value);
    appendZigZag(writer, vb.length);
    writer.appendBytes(vb);
  }
  // A zero count terminates the metadata map.
  writer.appendVarInt(0);
  const sync = new Uint8Array(16);
  for (let i = 0; i < 16; i++) sync[i] = Math.random() * 256 | 0;
  writer.appendBytes(sync);
  for (let i = 0; i < records.length; i += blockSize) {
    const block = records.slice(i, i + blockSize);
    appendZigZag(writer, block.length);
    // The payload is buffered first because its byte size precedes it in the file.
    const blockWriter = new ByteWriter();
    for (const record of block) {
      for (const { name, type } of fields) writeType(blockWriter, type, record[name]);
    }
    appendZigZag(writer, blockWriter.offset);
    writer.appendBytes(blockWriter.getBytes());
    writer.appendBytes(sync);
  }
  return writer.finish();
}
|
|
1852
|
+
/**
 * Encode a single value into `writer` according to an Avro schema.
 *
 * Supports unions (arrays of schemas), the primitive type names, records,
 * arrays, fixed, and the logical types date, time-millis, time-micros,
 * timestamp-millis/micros/nanos and decimal.
 *
 * "long" accepts a bigint or an integer number; union resolution applies
 * the same rule, so a branch is only selected for values it can encode.
 *
 * @param {object} writer - byte writer receiving the encoding
 * @param {string|object|Array} schema - schema of the value
 * @param {*} value - value to encode
 * @throws {Error} when the value does not fit the schema or the schema is not supported
 */
function writeType(writer, schema, value) {
  if (Array.isArray(schema)) {
    const unionIndex = schema.findIndex((s) => {
      if (Array.isArray(s)) throw new Error("nested unions not supported");
      // Objects are tagged by structural type when there is one, otherwise by logical type.
      const tag = typeof s === "string" ? s : s.type === "record" || s.type === "array" || s.type === "fixed" ? s.type : s.logicalType;
      if (value == null) return tag === "null";
      switch (tag) {
        case "boolean":
          return typeof value === "boolean";
        case "int":
          return typeof value === "number" && Number.isInteger(value);
        case "long":
          // Only integers can be encoded as long; other numbers fall through to later branches.
          return typeof value === "bigint" || Number.isInteger(value);
        case "float":
        case "double":
          return typeof value === "number";
        case "string":
          return typeof value === "string";
        case "bytes":
          return value instanceof Uint8Array;
        case "date":
          return value instanceof Date || typeof value === "number";
        case "time-millis":
          return typeof value === "number";
        case "time-micros":
          return typeof value === "bigint" || typeof value === "number";
        case "timestamp-millis":
        case "timestamp-micros":
        case "timestamp-nanos":
          return value instanceof Date || typeof value === "bigint" || typeof value === "number";
        case "decimal":
          return typeof value === "number" || typeof value === "bigint";
        case "record":
          return typeof value === "object" && value !== null;
        case "array":
          return Array.isArray(value);
        case "fixed":
          if (value instanceof Uint8Array) return true;
          return typeof s === "object" && "logicalType" in s && s.logicalType === "uuid" && typeof value === "string";
        default:
          return false;
      }
    });
    if (unionIndex === -1) throw new Error("union branch not found");
    appendZigZag(writer, unionIndex);
    writeType(writer, schema[unionIndex], value);
    return;
  }
  if (typeof schema === "string") {
    switch (schema) {
      case "null":
        return;
      case "boolean":
        writer.appendUint8(value ? 1 : 0);
        return;
      case "int":
        if (typeof value !== "number" || !Number.isInteger(value)) throw new Error("expected integer value");
        appendZigZag(writer, value);
        return;
      case "long": {
        // Integer numbers are widened so that values a union routes here can be written.
        const long = typeof value === "number" && Number.isInteger(value) ? BigInt(value) : value;
        if (typeof long !== "bigint") throw new Error("expected bigint value");
        appendZigZag64(writer, long);
        return;
      }
      case "float":
        if (typeof value !== "number") throw new Error("expected number value");
        writer.appendFloat32(value);
        return;
      case "double":
        if (typeof value !== "number") throw new Error("expected number value");
        writer.appendFloat64(value);
        return;
      case "bytes":
        if (!(value instanceof Uint8Array)) throw new Error("expected Uint8Array value");
        appendZigZag(writer, value.length);
        writer.appendBytes(value);
        return;
      case "string": {
        if (typeof value !== "string") throw new Error("expected string value");
        const encoded = new TextEncoder().encode(value);
        appendZigZag(writer, encoded.length);
        writer.appendBytes(encoded);
        return;
      }
      default:
        // Any other primitive name writes nothing, matching the previous behaviour.
        return;
    }
  }
  if (schema.type === "record") {
    for (const f of schema.fields) writeType(writer, f.type, value[f.name]);
    return;
  }
  if (schema.type === "array") {
    // A non-empty array is one block of items; the zero count terminates the array.
    if (value.length) {
      appendZigZag(writer, value.length);
      for (const it of value) writeType(writer, schema.items, it);
    }
    writer.appendVarInt(0);
    return;
  }
  if (schema.type === "fixed") {
    const bytes = schema.logicalType === "uuid" && typeof value === "string" ? uuidStringToBytes$1(value) : value;
    if (!(bytes instanceof Uint8Array)) throw new Error("expected Uint8Array value");
    if (bytes.length !== schema.size) throw new Error(`expected fixed[${schema.size}] value`);
    writer.appendBytes(bytes);
    return;
  }
  if (!("logicalType" in schema)) throw new Error(`unknown schema type ${JSON.stringify(schema)}`);
  switch (schema.logicalType) {
    case "date":
      // 864e5 ms per day: Dates become whole days since the epoch.
      appendZigZag(writer, value instanceof Date ? Math.floor(value.getTime() / 864e5) : value);
      return;
    case "time-millis":
      appendZigZag(writer, value);
      return;
    case "time-micros":
      appendZigZag64(writer, BigInt(value));
      return;
    case "timestamp-millis":
      appendZigZag64(writer, value instanceof Date ? BigInt(value.getTime()) : BigInt(value));
      return;
    case "timestamp-micros":
      appendZigZag64(writer, value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value));
      return;
    case "timestamp-nanos":
      appendZigZag64(writer, value instanceof Date ? BigInt(value.getTime()) * 1000000n : BigInt(value));
      return;
    case "decimal": {
      const scale = "scale" in schema ? schema.scale ?? 0 : 0;
      let unscaled;
      if (typeof value === "bigint") unscaled = value;
      else if (typeof value === "number") unscaled = BigInt(Math.round(value * 10 ** scale));
      else throw new Error("decimal value must be bigint or number");
      const encoded = bigIntToBytes(unscaled);
      appendZigZag(writer, encoded.length);
      writer.appendBytes(encoded);
      return;
    }
    default:
      throw new Error(`unknown logical type ${schema.logicalType}`);
  }
}
|
|
1934
|
+
/**
 * Zigzag-encode an integer and append it to the writer as a varint.
 *
 * @param {{ appendVarInt: (value: number) => void }} writer destination writer
 * @param {number} v value to encode; the bitwise operators coerce it to a signed 32-bit integer
 */
function appendZigZag(writer, v) {
	const doubled = v << 1;
	// Arithmetic shift: 0 for non-negative input, -1 (all ones) for negative input.
	const signMask = v >> 31;
	writer.appendVarInt(doubled ^ signMask);
}
|
|
1937
|
+
/**
 * Zigzag-encode a BigInt with a 63-bit sign shift and append it as a variable-length BigInt.
 *
 * @param {{ appendVarBigInt: (value: bigint) => void }} writer destination writer
 * @param {bigint} v value to encode
 */
function appendZigZag64(writer, v) {
	const doubled = v << 1n;
	// 0n for non-negative values in the 64-bit range, -1n for negative ones.
	const signMask = v >> 63n;
	writer.appendVarBigInt(doubled ^ signMask);
}
|
|
1940
|
+
/**
 * Parse a UUID string (hyphens optional, case-insensitive) into its 16 bytes.
 *
 * @param {string} value UUID text
 * @returns {Uint8Array} 16 bytes in the order the hex digits appear
 * @throws {Error} when the text is not exactly 32 hex digits after removing hyphens
 */
function uuidStringToBytes$1(value) {
	const digits = value.toLowerCase().replace(/-/g, "");
	const wellFormed = /^[0-9a-f]{32}$/.test(digits);
	if (!wellFormed) throw new Error("expected uuid string");
	return Uint8Array.from({ length: 16 }, (_, index) => {
		const start = index * 2;
		return parseInt(digits.slice(start, start + 2), 16);
	});
}
|
|
1947
|
+
/**
 * Encode a BigInt as the shortest big-endian two's-complement byte sequence.
 *
 * Zero is encoded as a single 0 byte. Positive values gain a leading 0x00 and
 * negative values a leading 0xFF only when needed to keep the sign bit correct.
 *
 * @param {bigint} value integer to encode
 * @returns {Uint8Array} big-endian two's-complement bytes
 */
function bigIntToBytes(value) {
	const littleEndian = [];
	let rest = value;
	for (;;) {
		const low = Number(rest & 255n);
		littleEndian.push(low);
		// BigInt >> is sign-propagating: positives drain to 0n, negatives to -1n.
		rest >>= 8n;
		const signBitSet = (low & 128) !== 0;
		// Stop once the remaining bits are pure sign extension of the byte just emitted.
		if (signBitSet ? rest === -1n : rest === 0n) break;
	}
	return Uint8Array.from(littleEndian.reverse());
}
|
|
1966
|
+
/**
 * Build the Avro record schema ("manifest_entry") used when writing manifest entries.
 *
 * @param {object} schema table schema, forwarded to partitionAvroSchema
 * @param {object} partitionSpec partition spec, forwarded to partitionAvroSchema
 * @param {number} formatVersion table format version; >= 3 adds the row-id/content-offset fields
 * @param {number} [manifestContent=0] 1 adds equality_ids / referenced_data_file (and, for v3,
 *   content_offset / content_size_in_bytes) to the data_file record
 * @returns {object} Avro record schema object
 */
function manifestEntrySchema(schema, partitionSpec, formatVersion, manifestContent = 0) {
	// Both builders fix the property order of the objects they emit.
	const required = (name, type, fieldId) => ({
		name,
		type,
		"field-id": fieldId
	});
	const optional = (name, type, fieldId) => ({
		name,
		type: ["null", type],
		default: null,
		"field-id": fieldId
	});
	const hasV3Fields = formatVersion >= 3;

	const dataFileFields = [
		required("content", "int", 134),
		required("file_path", "string", 100),
		required("file_format", "string", 101),
		{
			name: "partition",
			"field-id": 102,
			type: partitionAvroSchema(schema, partitionSpec)
		},
		required("record_count", "long", 103),
		required("file_size_in_bytes", "long", 104),
		mapField("column_sizes", 108, "k117_v118", 117, 118, "long"),
		mapField("value_counts", 109, "k119_v120", 119, 120, "long"),
		mapField("null_value_counts", 110, "k121_v122", 121, 122, "long"),
		mapField("nan_value_counts", 137, "k138_v139", 138, 139, "long"),
		mapField("lower_bounds", 125, "k126_v127", 126, 127, "bytes"),
		mapField("upper_bounds", 128, "k129_v130", 129, 130, "bytes"),
		optional("sort_order_id", "int", 140)
	];

	if (manifestContent === 1) {
		dataFileFields.push(
			{
				name: "equality_ids",
				"field-id": 135,
				default: null,
				type: ["null", {
					type: "array",
					items: "int",
					"element-id": 136
				}]
			},
			optional("referenced_data_file", "string", 143)
		);
		if (hasV3Fields) {
			dataFileFields.push(
				optional("content_offset", "long", 144),
				optional("content_size_in_bytes", "long", 145)
			);
		}
	}
	if (hasV3Fields) dataFileFields.push(optional("first_row_id", "long", 142));

	const dataFileRecord = {
		type: "record",
		name: "r2",
		fields: dataFileFields
	};
	return {
		type: "record",
		name: "manifest_entry",
		fields: [
			required("status", "int", 0),
			optional("snapshot_id", "long", 1),
			optional("sequence_number", "long", 3),
			optional("file_sequence_number", "long", 4),
			{
				name: "data_file",
				"field-id": 2,
				type: dataFileRecord
			}
		]
	};
}
|
|
2088
|
+
/**
 * Describe an optional map-valued field as a nullable Avro array of key/value records.
 *
 * @param {string} name field name
 * @param {number} fieldId field id of the map field
 * @param {string} recName name of the key/value record type
 * @param {number} keyId field id of the "key" member (always typed "int")
 * @param {number} valueId field id of the "value" member
 * @param {*} valueType Avro type of the "value" member
 * @returns {object} field definition with a null default
 */
function mapField(name, fieldId, recName, keyId, valueId, valueType) {
	const keyMember = { name: "key", type: "int", "field-id": keyId };
	const valueMember = { name: "value", type: valueType, "field-id": valueId };
	const entryRecord = {
		type: "record",
		name: recName,
		fields: [keyMember, valueMember]
	};
	const entryArray = {
		type: "array",
		logicalType: "map",
		items: entryRecord
	};
	return {
		name,
		"field-id": fieldId,
		default: null,
		type: ["null", entryArray]
	};
}
|
|
2112
|
+
/**
 * Serialize a schema object to its JSON text.
 *
 * @param {*} schema value to serialize
 * @returns {string} result of JSON.stringify on the schema
 */
function icebergSchemaJson(schema) {
	const json = JSON.stringify(schema);
	return json;
}
|
|
2115
|
+
/**
 * Convert a plain object keyed by numeric ids into a list of { key, value } records.
 *
 * @param {object|null|undefined} m source object
 * @returns {Array<{key: number, value: *}>|null} null when `m` is falsy or has no own entries
 */
function encodeMap(m) {
	if (!m) return null;
	const pairs = [];
	for (const [rawKey, value] of Object.entries(m)) {
		// Object keys are always strings; convert them back to numbers.
		pairs.push({ key: Number(rawKey), value });
	}
	return pairs.length === 0 ? null : pairs;
}
|
|
2124
|
+
/**
 * Write a manifest of data files through avroWrite.
 *
 * @param {object} options
 * @param {*} options.writer writer handed to avroWrite
 * @param {object} options.schema table schema
 * @param {object} options.partitionSpec partition spec (its "spec-id" is recorded in the file metadata)
 * @param {*} options.snapshotId snapshot id stored on every entry
 * @param {Array<object>} options.dataFiles files to record; each must have content === 0
 * @param {number} [options.formatVersion=2] table format version
 * @returns {*} whatever avroWrite returns
 * @throws {Error} when a file's content is not 0
 */
function writeDataManifest({ writer, schema, partitionSpec, snapshotId, dataFiles, formatVersion = 2 }) {
	const toEntry = (dataFile) => {
		const { content } = dataFile;
		if (content !== 0) throw new Error(`writeDataManifest expects data files (content=0), got content=${content}`);
		return manifestEntryRecord(dataFile, schema, partitionSpec, snapshotId, formatVersion, 0);
	};
	// Entries are built first so an invalid file fails before any schema/metadata work.
	const records = dataFiles.map(toEntry);
	const entrySchema = manifestEntrySchema(schema, partitionSpec, formatVersion, 0);
	const metadata = {
		"format-version": String(formatVersion),
		content: "data",
		schema: icebergSchemaJson(schema),
		"partition-spec": partitionSpecJson(partitionSpec),
		"partition-spec-id": String(partitionSpec["spec-id"])
	};
	return avroWrite({
		writer,
		schema: entrySchema,
		records,
		metadata
	});
}
|
|
2142
|
+
/**
 * Build one manifest entry record (status 1) for a file description.
 *
 * @param {object} dataFile file description (content, file_path, file_format, stats maps, ...)
 * @param {object} schema table schema, forwarded to partitionToAvroRecord
 * @param {object} partitionSpec partition spec, forwarded to partitionToAvroRecord
 * @param {*} snapshotId snapshot id stored on the entry
 * @param {number} formatVersion table format version; >= 3 adds first_row_id (and content offsets when manifestContent is 1)
 * @param {number} manifestContent 1 adds equality_ids and referenced_data_file
 * @returns {object} entry with null sequence numbers and the nested data_file record
 */
function manifestEntryRecord(dataFile, schema, partitionSpec, snapshotId, formatVersion, manifestContent) {
	const hasV3Fields = formatVersion >= 3;
	const statMaps = ["column_sizes", "value_counts", "null_value_counts", "nan_value_counts", "lower_bounds", "upper_bounds"];

	const dataFileRecord = {
		content: dataFile.content,
		file_path: dataFile.file_path,
		file_format: dataFile.file_format.toUpperCase(),
		partition: partitionToAvroRecord(dataFile.partition ?? {}, schema, partitionSpec),
		record_count: dataFile.record_count,
		file_size_in_bytes: dataFile.file_size_in_bytes
	};
	for (const key of statMaps) dataFileRecord[key] = encodeMap(dataFile[key]);
	// Files with content === 1 never carry a sort order; everything else defaults to 0.
	dataFileRecord.sort_order_id = dataFile.content === 1 ? null : dataFile.sort_order_id ?? 0;

	if (manifestContent === 1) {
		const equalityIds = dataFile.equality_ids;
		dataFileRecord.equality_ids = equalityIds?.length ? equalityIds : null;
		dataFileRecord.referenced_data_file = dataFile.referenced_data_file ?? null;
		if (hasV3Fields) {
			dataFileRecord.content_offset = dataFile.content_offset ?? null;
			dataFileRecord.content_size_in_bytes = dataFile.content_size_in_bytes ?? null;
		}
	}
	if (hasV3Fields) dataFileRecord.first_row_id = dataFile.first_row_id ?? null;

	return {
		status: 1,
		snapshot_id: snapshotId,
		sequence_number: null,
		file_sequence_number: null,
		data_file: dataFileRecord
	};
}
|
|
2175
|
+
/**
 * Build the Avro record schema ("manifest_file") used for manifest-list records.
 *
 * @param {number} formatVersion table format version; >= 3 appends the optional first_row_id field
 * @returns {{type: string, name: string, fields: Array<object>}} Avro record schema object
 */
function manifestFileSchema(formatVersion) {
	// Both builders fix the property order of the objects they emit.
	const required = (name, type, fieldId) => ({
		name,
		type,
		"field-id": fieldId
	});
	const optional = (name, type, fieldId) => ({
		name,
		type: ["null", type],
		default: null,
		"field-id": fieldId
	});

	const partitionSummary = {
		type: "record",
		name: "r508",
		fields: [
			required("contains_null", "boolean", 509),
			optional("contains_nan", "boolean", 518),
			optional("lower_bound", "bytes", 510),
			optional("upper_bound", "bytes", 511)
		]
	};
	const partitionSummaries = {
		type: "array",
		"element-id": 508,
		items: partitionSummary
	};

	const fields = [
		required("manifest_path", "string", 500),
		required("manifest_length", "long", 501),
		required("partition_spec_id", "int", 502),
		required("content", "int", 517),
		required("sequence_number", "long", 515),
		required("min_sequence_number", "long", 516),
		required("added_snapshot_id", "long", 503),
		required("added_files_count", "int", 504),
		required("existing_files_count", "int", 505),
		required("deleted_files_count", "int", 506),
		required("added_rows_count", "long", 512),
		required("existing_rows_count", "long", 513),
		required("deleted_rows_count", "long", 514),
		optional("partitions", partitionSummaries, 507)
	];
	if (formatVersion >= 3) fields.push(optional("first_row_id", "long", 520));

	return {
		type: "record",
		name: "manifest_file",
		fields
	};
}
|
|
2293
|
+
/**
 * Write manifest-list records through avroWrite.
 *
 * @param {object} options
 * @param {*} options.writer writer handed to avroWrite
 * @param {*} options.snapshotId snapshot id recorded in the file metadata
 * @param {*} options.sequenceNumber fallback for manifests lacking sequence_number / min_sequence_number
 * @param {Array<object>} options.manifests manifest descriptions
 * @param {number} [options.formatVersion=2] table format version; >= 3 adds first_row_id to every record
 * @returns {*} whatever avroWrite returns
 */
function writeManifestList({ writer, snapshotId, sequenceNumber, manifests, formatVersion = 2 }) {
	const hasV3Fields = formatVersion >= 3;
	const toRecord = (m) => {
		const {
			manifest_path,
			manifest_length,
			partition_spec_id,
			content,
			added_snapshot_id,
			added_files_count,
			existing_files_count,
			deleted_files_count,
			added_rows_count,
			existing_rows_count,
			deleted_rows_count
		} = m;
		const record = {
			manifest_path,
			manifest_length,
			partition_spec_id,
			content,
			sequence_number: m.sequence_number ?? sequenceNumber,
			min_sequence_number: m.min_sequence_number ?? sequenceNumber,
			added_snapshot_id,
			added_files_count,
			existing_files_count,
			deleted_files_count,
			added_rows_count,
			existing_rows_count,
			deleted_rows_count,
			partitions: m.partitions ?? null
		};
		if (hasV3Fields) {
			// Only manifests with content === 0 keep their first_row_id; all others get null.
			record.first_row_id = content === 0 ? m.first_row_id ?? null : null;
		}
		return record;
	};
	const records = manifests.map(toRecord);
	const listSchema = manifestFileSchema(formatVersion);
	const metadata = {
		"format-version": String(formatVersion),
		"snapshot-id": String(snapshotId),
		"sequence-number": String(sequenceNumber)
	};
	return avroWrite({
		writer,
		schema: listSchema,
		records,
		metadata
	});
}
|
|
2325
|
+
/**
 * Tell whether a type name denotes a geospatial type.
 *
 * @param {string} name type name
 * @returns {boolean} true when the name starts with "geometry" or "geography"
 */
function isGeoType(name) {
	return ["geometry", "geography"].some((prefix) => name.startsWith(prefix));
}
|
|
2328
|
+
/**
 * Compute counts and bounding-box bounds for a geospatial column.
 *
 * @param {Array<object>} records rows to scan
 * @param {object} field schema field; `name` selects the column, "write-default" replaces undefined values
 * @returns {{value_count: bigint, null_count: bigint, lower?: Uint8Array, upper?: Uint8Array}}
 *   lower/upper are present only when at least one x/y pair was observed
 * @throws {Error} when a non-null value is not an object
 */
function computeGeoBounds(records, field) {
	const fallback = field["write-default"];
	let bbox;
	let nullCount = 0n;
	for (const record of records) {
		const raw = record[field.name];
		const geometry = raw === void 0 && fallback !== void 0 ? fallback : raw;
		if (geometry === null || geometry === void 0) {
			nullCount++;
			continue;
		}
		if (typeof geometry !== "object") throw new Error("geospatial column expects GeoJSON geometries");
		bbox = extendBoundsFromGeometry(bbox, geometry);
	}

	const counts = {
		value_count: BigInt(records.length),
		null_count: nullCount
	};
	const box = bbox ?? {};
	const missingXY = [box.xmin, box.ymin, box.xmax, box.ymax].some((edge) => edge === void 0);
	if (missingXY) return counts;

	// z and m participate only when some coordinate supplied them.
	const hasZ = box.zmin !== void 0;
	const hasM = box.mmin !== void 0;
	return {
		...counts,
		lower: encodeGeoPoint(box.xmin, box.ymin, box.zmin, box.mmin, hasZ, hasM),
		upper: encodeGeoPoint(box.xmax, box.ymax, box.zmax, box.mmax, hasZ, hasM)
	};
}
|
|
2356
|
+
/**
 * Grow a bounding box so that it covers a geometry object.
 *
 * @param {object|undefined} bbox box accumulated so far
 * @param {object} geometry object with a `type`; a "GeometryCollection" is walked through its
 *   `geometries`, anything else through its `coordinates`
 * @returns {object|undefined} the extended box
 */
function extendBoundsFromGeometry(bbox, geometry) {
	if (geometry.type !== "GeometryCollection") {
		return extendBoundsFromCoordinates(bbox, geometry.coordinates);
	}
	let acc = bbox;
	// A collection without a `geometries` list contributes nothing.
	for (const member of geometry.geometries || []) acc = extendBoundsFromGeometry(acc, member);
	return acc;
}
|
|
2363
|
+
/**
 * Grow a bounding box from a (possibly nested) coordinate array.
 *
 * An array whose first element is a number is treated as one position
 * [x, y, z?, m?]; any other array is walked recursively.
 *
 * @param {object|undefined} bbox box accumulated so far
 * @param {Array} coordinates position or nested arrays of positions
 * @returns {object|undefined} the extended box
 */
function extendBoundsFromCoordinates(bbox, coordinates) {
	let acc = bbox;
	if (typeof coordinates[0] !== "number") {
		for (const nested of coordinates) acc = extendBoundsFromCoordinates(acc, nested);
		return acc;
	}
	const axes = [
		["xmin", "xmax"],
		["ymin", "ymax"],
		["zmin", "zmax"],
		["mmin", "mmax"]
	];
	axes.forEach(([minKey, maxKey], index) => {
		// x and y are always offered to updateAxis; z and m only when the position is long enough.
		if (index < 2 || coordinates.length > index) acc = updateAxis(acc, minKey, maxKey, coordinates[index]);
	});
	return acc;
}
|
|
2374
|
+
/**
 * Widen one axis of a bounding box to include a value.
 *
 * @param {object|undefined} bbox box to update in place; a new object is created when falsy
 * @param {string} minKey property holding the axis minimum
 * @param {string} maxKey property holding the axis maximum
 * @param {*} value candidate; anything that is not a finite number leaves the box untouched
 * @returns {object|undefined} the (possibly newly created) box
 */
function updateAxis(bbox, minKey, maxKey, value) {
	// Number.isFinite is false for undefined, NaN, ±Infinity and all non-number values.
	if (!Number.isFinite(value)) return bbox;
	const box = bbox || {};
	const { [minKey]: lowest, [maxKey]: highest } = box;
	if (lowest === void 0 || value < lowest) box[minKey] = value;
	if (highest === void 0 || value > highest) box[maxKey] = value;
	return box;
}
|
|
2383
|
+
/**
 * Encode a point as consecutive little-endian float64 values.
 *
 * Layout: x, y (16 bytes); x, y, z (24 bytes) when only z is present;
 * x, y, z, m (32 bytes) when m is present, with NaN in the z slot if z is absent.
 *
 * @param {number} x
 * @param {number} y
 * @param {number|undefined} z
 * @param {number|undefined} m
 * @param {boolean} hasZ whether z is present
 * @param {boolean} hasM whether m is present
 * @returns {Uint8Array} encoded bytes
 */
function encodeGeoPoint(x, y, z, m, hasZ, hasM) {
	const bytes = new Uint8Array(hasM ? 32 : hasZ ? 24 : 16);
	const view = new DataView(bytes.buffer);
	view.setFloat64(0, x, true);
	view.setFloat64(8, y, true);
	if (hasM) {
		view.setFloat64(16, hasZ ? z : NaN, true);
		view.setFloat64(24, m, true);
	} else if (hasZ) {
		view.setFloat64(16, z, true);
	}
	return bytes;
}
|
|
2396
|
+
/**
 * Serialize a single bound value to bytes according to its type name.
 *
 * Fixed-width numbers are written little-endian; decimals as minimal big-endian
 * two's complement of the scaled value; strings as UTF-8; binary, fixed and
 * 16-byte uuid values are returned as-is.
 *
 * @param {*} value value to serialize
 * @param {*} type schema type, resolved to a name through typeName
 * @returns {Uint8Array|undefined} the bytes, or undefined when the type is not handled or
 *   the value cannot be represented for that type
 */
function serializeValue(value, type) {
	const name = typeName(type);
	if (name.startsWith("decimal(")) {
		const m = /^decimal\((\d+),\s*(\d+)\)$/.exec(name);
		if (!m) return void 0;
		const scale = parseInt(m[2], 10);
		if (typeof value !== "number" && typeof value !== "bigint") return void 0;
		const factor = 10n ** BigInt(scale);
		return twosComplementMinBigEndian(typeof value === "bigint" ? value * factor : BigInt(Math.round(value * Number(factor))));
	}
	if (name.startsWith("fixed[")) return value instanceof Uint8Array ? value : void 0;

	// Little-endian writers for the fixed-width cases below.
	const int32 = (n) => {
		const buf = new ArrayBuffer(4);
		new DataView(buf).setInt32(0, n, true);
		return new Uint8Array(buf);
	};
	const int64 = (n) => {
		const buf = new ArrayBuffer(8);
		new DataView(buf).setBigInt64(0, n, true);
		return new Uint8Array(buf);
	};
	const toBigInt = (n) => typeof n === "bigint" ? n : BigInt(n);

	switch (name) {
		case "boolean": return new Uint8Array([value ? 1 : 0]);
		case "int": return int32(value);
		case "long": return int64(toBigInt(value));
		case "float": {
			const buf = new ArrayBuffer(4);
			new DataView(buf).setFloat32(0, value, true);
			return new Uint8Array(buf);
		}
		case "double": {
			const buf = new ArrayBuffer(8);
			new DataView(buf).setFloat64(0, value, true);
			return new Uint8Array(buf);
		}
		case "date": {
			let days = value instanceof Date ? Math.floor(value.getTime() / 864e5) : Number(value);
			// Number("2024-01-01") is NaN, and DataView.setInt32 stores NaN as 0, which would
			// silently publish the epoch as a bound. compare() orders date strings through
			// dateToDays, so use the same parser here and emit no bound when it also fails.
			if (Number.isNaN(days)) days = dateToDays(value);
			if (Number.isNaN(days)) return void 0;
			return int32(days);
		}
		case "time": return int64(toBigInt(value));
		case "timestamp":
		case "timestamptz": return int64(timestampToMicros(value));
		case "timestamp_ns":
		case "timestamptz_ns": return int64(timestampToNanos(value));
		case "string": return new TextEncoder().encode(value);
		case "binary": return value instanceof Uint8Array ? value : void 0;
		case "uuid":
			if (value instanceof Uint8Array && value.length === 16) return value;
			if (typeof value === "string") return uuidStringToBytes(value);
			return;
		default: return;
	}
}
|
|
2461
|
+
/**
 * Three-way comparison of two column values according to their schema type.
 *
 * @param {*} a left value
 * @param {*} b right value
 * @param {*} type schema type, resolved to a name through typeName
 * @returns {number} negative when a sorts first, positive when b sorts first, 0 when neither does;
 *   NaN for "date" values that dateToDays cannot convert
 */
function compare(a, b, type) {
	const name = typeName(type);
	const toBigInt = (v) => typeof v === "bigint" ? v : BigInt(v);

	if (name === "boolean") return (a ? 1 : 0) - (b ? 1 : 0);
	if (name === "float" || name === "double") return compareFloating(a, b);
	if (name === "long") return compareBigInt(toBigInt(a), toBigInt(b));
	if (name === "date") {
		const left = dateToDays(a);
		const right = dateToDays(b);
		if (Number.isNaN(left) || Number.isNaN(right)) return NaN;
		return left < right ? -1 : left > right ? 1 : 0;
	}
	if (name === "timestamp" || name === "timestamptz") {
		return compareBigInt(timestampToMicros(a), timestampToMicros(b));
	}
	if (name === "timestamp_ns" || name === "timestamptz_ns") {
		return compareBigInt(timestampToNanos(a), timestampToNanos(b));
	}
	if (name === "binary" || name === "uuid" || name.startsWith("fixed[")) return compareBytes(a, b);
	// "int", "string" and every remaining type use the native relational operators.
	return a < b ? -1 : a > b ? 1 : 0;
}
|
|
2490
|
+
/**
 * Three-way comparison for floating-point numbers that orders -0 before +0.
 *
 * @param {number} a
 * @param {number} b
 * @returns {number} -1, 0 or 1; 0 is also returned when exactly one operand is NaN
 */
function compareFloating(a, b) {
	if (Object.is(a, b)) return 0;
	const bothZero = a === 0 && b === 0;
	// Reaching here with two zeros means their signs differ.
	if (bothZero) return Object.is(a, -0) ? -1 : 1;
	if (a < b) return -1;
	return a > b ? 1 : 0;
}
|
|
2495
|
+
/**
 * Three-way comparison using the native relational operators.
 *
 * @param {bigint} a
 * @param {bigint} b
 * @returns {number} -1 when a < b, 1 when a > b, otherwise 0
 */
function compareBigInt(a, b) {
	if (a < b) return -1;
	if (a > b) return 1;
	return 0;
}
|
|
2498
|
+
/**
 * Convert a date-like value to a day number (milliseconds since the epoch / 86 400 000, floored).
 *
 * @param {*} value Date, bigint or number (taken as a day number already), or a string
 *   accepted by Date.parse
 * @returns {number} the day number, or NaN when the value cannot be converted
 */
function dateToDays(value) {
	const msPerDay = 864e5;
	if (value instanceof Date) return Math.floor(value.getTime() / msPerDay);
	switch (typeof value) {
		case "bigint": return Number(value);
		case "number": return value;
		case "string": {
			const millis = Date.parse(value);
			return Number.isNaN(millis) ? NaN : Math.floor(millis / msPerDay);
		}
		default: return NaN;
	}
}
|
|
2508
|
+
/**
 * Normalize a timestamp value to a BigInt count of microseconds.
 *
 * @param {*} value bigint (returned unchanged), Date (its millisecond time × 1000),
 *   or anything else accepted by BigInt()
 * @returns {bigint}
 */
function timestampToMicros(value) {
	if (typeof value === "bigint") return value;
	if (value instanceof Date) return BigInt(value.getTime()) * 1000n;
	return BigInt(value);
}
|
|
2511
|
+
/**
 * Normalize a timestamp value to a BigInt count of nanoseconds.
 *
 * @param {*} value bigint (returned unchanged), Date (its millisecond time × 1 000 000),
 *   or anything else accepted by BigInt()
 * @returns {bigint}
 */
function timestampToNanos(value) {
	if (typeof value === "bigint") return value;
	if (value instanceof Date) return BigInt(value.getTime()) * 1000000n;
	return BigInt(value);
}
|
|
2514
|
+
/**
 * Lexicographic comparison of two byte sequences.
 *
 * @param {ArrayLike<number>} a
 * @param {ArrayLike<number>} b
 * @returns {number} difference of the first mismatching elements, or the length difference
 *   when one sequence is a prefix of the other
 */
function compareBytes(a, b) {
	const shared = Math.min(a.length, b.length);
	let index = 0;
	while (index < shared && a[index] === b[index]) index++;
	return index < shared ? a[index] - b[index] : a.length - b.length;
}
|
|
2519
|
+
/**
 * Encode a BigInt as the shortest big-endian two's-complement byte sequence.
 *
 * @param {bigint} value integer to encode
 * @returns {Uint8Array} at least one byte; the top bit of the first byte carries the sign
 */
function twosComplementMinBigEndian(value) {
	const out = [];
	let rest = value;
	let finished = false;
	do {
		const low = Number(rest & 255n);
		out.unshift(low);
		// BigInt >> is sign-propagating: positives drain to 0n, negatives to -1n.
		rest >>= 8n;
		// Done once the remainder is pure sign extension of the byte just emitted.
		finished = (low & 128) === 0 ? rest === 0n : rest === -1n;
	} while (!finished);
	return new Uint8Array(out);
}
|
|
2531
|
+
/**
 * Parse a UUID string (hyphens optional, either letter case) into its 16 bytes.
 *
 * @param {string} s UUID text
 * @returns {Uint8Array|undefined} the 16 bytes, or undefined when the text is not exactly
 *   32 hexadecimal digits after removing hyphens
 */
function uuidStringToBytes(s) {
	const hex = s.replace(/-/g, "");
	// parseInt stops at the first character it cannot use ("1g" -> 1) and accepts signs and
	// leading whitespace, so a per-byte NaN check lets malformed text through. Validate the
	// whole string up front instead.
	if (!/^[0-9a-fA-F]{32}$/.test(hex)) return void 0;
	const out = new Uint8Array(16);
	for (let i = 0; i < 16; i++) out[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
	return out;
}
|
|
2542
|
+
// Maximum number of string code points, or of binary/fixed bytes, that truncateLower and
// truncateUpper keep when shortening a bound value.
const TRUNCATE_LIMIT = 16;
|
|
2543
|
+
/**
 * Compute per-column statistics for a batch of records, keyed by field id.
 *
 * Fields typed "unknown", "list", "map" or "struct" are skipped. Geospatial fields are
 * delegated to computeGeoBounds. For every other field the records are scanned once to
 * count nulls (and NaNs for float/double) and, when hasComparableBounds allows it, to
 * find the minimum and maximum, which are truncated and serialized into bounds.
 *
 * @param {Array<object>} records rows to scan
 * @param {{fields: Array<object>}} schema schema whose fields provide id, name, type and
 *   an optional "write-default" used in place of undefined values
 * @returns {{value_counts: object, null_value_counts: object, nan_value_counts: object,
 *   lower_bounds: object, upper_bounds: object}}
 */
function computeColumnStats(records, schema) {
	const stats = {
		value_counts: {},
		null_value_counts: {},
		nan_value_counts: {},
		lower_bounds: {},
		upper_bounds: {}
	};
	for (const field of schema.fields) {
		const type = typeName(field.type);
		if (["unknown", "list", "map", "struct"].includes(type)) continue;
		const id = field.id;

		if (isGeoType(type)) {
			const geo = computeGeoBounds(records, field);
			stats.value_counts[id] = geo.value_count;
			stats.null_value_counts[id] = geo.null_count;
			if (geo.lower) stats.lower_bounds[id] = geo.lower;
			if (geo.upper) stats.upper_bounds[id] = geo.upper;
			continue;
		}

		const isFloat = type === "float" || type === "double";
		const trackBounds = hasComparableBounds(field.type);
		const fallback = field["write-default"];
		let nullCount = 0n;
		let nanCount = 0n;
		let lowest;
		let highest;
		for (const record of records) {
			const raw = record[field.name];
			const v = raw === void 0 && fallback !== void 0 ? fallback : raw;
			if (v === null || v === void 0) {
				nullCount++;
			} else if (isFloat && Number.isNaN(v)) {
				// NaNs are counted separately and never take part in the bounds.
				nanCount++;
			} else if (trackBounds) {
				if (lowest === void 0 || compare(v, lowest, field.type) < 0) lowest = v;
				if (highest === void 0 || compare(v, highest, field.type) > 0) highest = v;
			}
		}

		stats.value_counts[id] = BigInt(records.length);
		stats.null_value_counts[id] = nullCount;
		if (isFloat) stats.nan_value_counts[id] = nanCount;
		if (lowest !== void 0) {
			const encoded = serializeValue(truncateLower(lowest, field.type), field.type);
			if (encoded) stats.lower_bounds[id] = encoded;
		}
		if (highest !== void 0) {
			// truncateUpper may yield undefined, in which case no upper bound is recorded.
			const capped = truncateUpper(highest, field.type);
			const encoded = capped === void 0 ? void 0 : serializeValue(capped, field.type);
			if (encoded) stats.upper_bounds[id] = encoded;
		}
	}
	return stats;
}
|
|
2607
|
+
/**
 * Tell whether min/max bounds are tracked for a type.
 *
 * @param {*} type schema type, resolved to a name through typeName
 * @returns {boolean} false for geospatial types, "unknown" and "variant"; true otherwise
 */
function hasComparableBounds(type) {
	const name = typeName(type);
	return !(isGeoType(name) || name === "unknown" || name === "variant");
}
|
|
2612
|
+
/**
 * Summarize a list of values of one type: null/NaN presence and serialized bounds.
 *
 * @param {Iterable<*>} values values to scan
 * @param {*} type schema type of the values
 * @returns {{contains_null: boolean, contains_nan?: boolean, lower_bound?: *, upper_bound?: *}}
 *   contains_nan is set only for float/double; a bound is set only when a minimum/maximum
 *   was found and serializeValue produced a result for it
 */
function computeFieldSummary(values, type) {
	const name = typeName(type);
	const isFloat = name === "float" || name === "double";
	const trackBounds = hasComparableBounds(type);
	let sawNull = false;
	let sawNan = false;
	let lowest;
	let highest;
	for (const v of values) {
		if (v === null || v === void 0) {
			sawNull = true;
		} else if (isFloat && Number.isNaN(v)) {
			// NaNs are flagged but never take part in the bounds.
			sawNan = true;
		} else if (trackBounds) {
			if (lowest === void 0 || compare(v, lowest, type) < 0) lowest = v;
			if (highest === void 0 || compare(v, highest, type) > 0) highest = v;
		}
	}

	const summary = { contains_null: sawNull };
	if (isFloat) summary.contains_nan = sawNan;
	if (lowest !== void 0) {
		const encoded = serializeValue(truncateLower(lowest, type), type);
		if (encoded) summary.lower_bound = encoded;
	}
	if (highest !== void 0) {
		// truncateUpper may yield undefined, in which case no upper bound is recorded.
		const capped = truncateUpper(highest, type);
		const encoded = capped === void 0 ? void 0 : serializeValue(capped, type);
		if (encoded) summary.upper_bound = encoded;
	}
	return summary;
}
|
|
2649
|
+
/**
 * Shorten a lower-bound value to at most TRUNCATE_LIMIT code points (strings) or bytes
 * (binary / fixed). A prefix never sorts after the full value, so it remains a valid lower bound.
 *
 * @param {*} value bound value
 * @param {*} type schema type, resolved to a name through typeName
 * @returns {*} the prefix, or the value itself when it is short enough or of another type
 */
function truncateLower(value, type) {
	const name = typeName(type);
	const isText = name === "string" && typeof value === "string";
	if (isText) {
		// Spread iterates by code point, so surrogate pairs are never split.
		const codePoints = [...value];
		return codePoints.length <= TRUNCATE_LIMIT ? value : codePoints.slice(0, TRUNCATE_LIMIT).join("");
	}
	const isBytes = (name === "binary" || name.startsWith("fixed[")) && value instanceof Uint8Array;
	if (isBytes) {
		return value.length <= TRUNCATE_LIMIT ? value : value.slice(0, TRUNCATE_LIMIT);
	}
	return value;
}
|
|
2662
|
+
/**
 * Shorten an upper-bound value to at most TRUNCATE_LIMIT code points (strings) or bytes
 * (binary / fixed) while keeping it greater than or equal to the original.
 *
 * The value is cut to a prefix and the last element that can be incremented is bumped by
 * one; elements already at their maximum are dropped from the end first.
 *
 * @param {*} value bound value; never modified
 * @param {*} type schema type, resolved to a name through typeName
 * @returns {*} the truncated bound, the value itself when it is short enough or of another
 *   type, or undefined when no element of the prefix can be incremented
 */
function truncateUpper(value, type) {
	const name = typeName(type);
	if (name === "string" && typeof value === "string") {
		// Array.from iterates by code point, so surrogate pairs are never split.
		const cps = Array.from(value);
		if (cps.length <= TRUNCATE_LIMIT) return value;
		const prefix = cps.slice(0, TRUNCATE_LIMIT);
		while (prefix.length > 0) {
			const cp = prefix[prefix.length - 1].codePointAt(0);
			// Jump over the surrogate range (U+D800..U+DFFF) when stepping to the next code point.
			const next = cp + 1 === 55296 ? 57344 : cp + 1;
			if (next <= 1114111) {
				prefix[prefix.length - 1] = String.fromCodePoint(next);
				return prefix.join("");
			}
			prefix.pop();
		}
		return;
	}
	if ((name === "binary" || name.startsWith("fixed[")) && value instanceof Uint8Array) {
		if (value.length <= TRUNCATE_LIMIT) return value;
		for (let i = TRUNCATE_LIMIT - 1; i >= 0; i--) if (value[i] < 255) {
			// Copy explicitly: on a Node Buffer (a Uint8Array subclass) slice() returns a view
			// over the same memory, so incrementing the result would corrupt the caller's value.
			const out = new Uint8Array(value.subarray(0, i + 1));
			out[i]++;
			return out;
		}
		return;
	}
	return value;
}
|
|
2691
|
+
/**
 * Look up the snapshot referenced by the metadata's "current-snapshot-id".
 *
 * @param {object} metadata table metadata with "current-snapshot-id" and an optional `snapshots` list
 * @returns {object|undefined} the matching snapshot; undefined when the id is undefined,
 *   `snapshots` is missing, or no snapshot has that "snapshot-id"
 */
function currentSnapshot(metadata) {
	const { "current-snapshot-id": id, snapshots } = metadata;
	if (id === void 0) return void 0;
	return snapshots?.find((candidate) => candidate["snapshot-id"] === id);
}
|
|
2696
|
+
/**
 * Load the manifest-list records of the current snapshot.
 *
 * @param {object} metadata table metadata
 * @param {*} resolver forwarded to fetchAvroRecords
 * @returns {Promise<Array>} the fetched records, or an empty array when there is no current
 *   snapshot or it has no "manifest-list"
 */
async function loadPriorManifests(metadata, resolver) {
	const manifestList = currentSnapshot(metadata)?.["manifest-list"];
	if (!manifestList) return [];
	const records = await fetchAvroRecords(manifestList, resolver);
	return records;
}
|
|
2701
|
+
/**
 * Write the manifest list for a new snapshot and assemble the commit payload
 * (snapshot object, requirements and updates) that points `main` at it.
 *
 * Prior manifests are loaded from the current snapshot unless supplied, and
 * any whose path is in `skipPriorManifestPaths` are left out. For format
 * version 3 and above, row-lineage fields are filled in on the snapshot and an
 * `assert-next-row-id` requirement is added.
 *
 * @returns {Promise<{snapshot: object, requirements: object[], updates: object[], writtenFiles: string[]}>}
 * @throws {Error} When `resolver.writer` is missing.
 */
async function buildSnapshotUpdate({ tableUrl, metadata, resolver, snapshotId, sequenceNumber, manifestUuid, timestampMs, formatVersion, newManifests, summary, writtenFiles, priorManifests, skipPriorManifestPaths }) {
  const openWriter = resolver.writer;
  if (!openWriter) throw new Error("resolver.writer is required");

  const tracksRowLineage = formatVersion >= 3;
  const startingRowId = tracksRowLineage ? BigInt(metadata["next-row-id"] ?? 0) : 0n;

  let carried = priorManifests ?? await loadPriorManifests(metadata, resolver);
  if (skipPriorManifestPaths?.size) {
    carried = carried.filter((entry) => !skipPriorManifestPaths.has(entry.manifest_path));
  }
  const manifests = [...carried, ...newManifests];
  const rowsAdded = tracksRowLineage ? assignFirstRowIds$1(manifests, startingRowId) : 0n;

  const manifestListPath = `${tableUrl}/metadata/snap-${snapshotId}-1-${manifestUuid}.avro`;
  await writeManifestList({
    writer: openWriter(manifestListPath),
    snapshotId,
    sequenceNumber,
    manifests,
    formatVersion,
  });

  const snapshot = {
    "snapshot-id": Number(snapshotId),
    "sequence-number": Number(sequenceNumber),
    "timestamp-ms": timestampMs,
    "manifest-list": manifestListPath,
    summary,
    "schema-id": metadata["current-schema-id"],
  };
  if (tracksRowLineage) {
    snapshot["first-row-id"] = toMetadataLong(startingRowId);
    snapshot["added-rows"] = toMetadataLong(rowsAdded);
  }

  // An absent, null or -1 current snapshot id all mean "no parent snapshot".
  const rawParentId = metadata["current-snapshot-id"];
  const parentId = rawParentId == null || rawParentId === -1 ? null : rawParentId;
  if (parentId !== null) {
    snapshot["parent-snapshot-id"] = parentId;
  }

  const requirements = [
    { type: "assert-table-uuid", uuid: metadata["table-uuid"] },
    { type: "assert-ref-snapshot-id", ref: "main", "snapshot-id": parentId },
  ];
  if (tracksRowLineage) {
    requirements.push({
      type: "assert-next-row-id",
      "next-row-id": toMetadataLong(metadata["next-row-id"] ?? 0),
    });
  }

  const updates = [
    { action: "add-snapshot", snapshot },
    {
      action: "set-snapshot-ref",
      "ref-name": "main",
      type: "branch",
      "snapshot-id": snapshot["snapshot-id"],
    },
  ];

  return {
    snapshot,
    requirements,
    updates,
    writtenFiles: [...writtenFiles, manifestListPath],
  };
}
|
|
2760
|
+
/**
 * Build one field summary per partition-spec field, in spec order.
 *
 * @param {object[]} partitions - Partition tuples keyed by partition field name.
 * @param {object} schema - Schema whose `fields` are searched by id.
 * @param {object} partitionSpec - Spec whose `fields` are summarised.
 * @returns {Array} Results of `computeFieldSummary`, one per spec field.
 * @throws {Error} When a spec field's `source-id` is not in the schema.
 */
function buildPartitionSummaries(partitions, schema, partitionSpec) {
  const summarize = (partitionField) => {
    const sourceId = partitionField["source-id"];
    const source = schema.fields.find((candidate) => candidate.id === sourceId);
    if (!source) {
      throw new Error(`partition source field id ${sourceId} not found`);
    }
    const resultType = transformResultType(partitionField.transform, source.type);
    const values = partitions.map((tuple) => tuple[partitionField.name]);
    return computeFieldSummary(values, resultType);
  };
  return partitionSpec.fields.map((partitionField) => summarize(partitionField));
}
|
|
2768
|
+
/**
 * Assign `first_row_id` to data manifests that do not have one yet.
 *
 * Manifests whose `content` is not 0 get `first_row_id` reset to `undefined`.
 * Data manifests that already carry an id only push the running cursor past
 * their own row range. The manifests are mutated in place.
 *
 * @param {object[]} manifests - Manifest-list entries, in order.
 * @param {bigint} firstRowId - First row id available for assignment.
 * @returns {bigint} Number of rows covered by the newly assigned ranges.
 */
function assignFirstRowIds$1(manifests, firstRowId) {
  let cursor = firstRowId;
  let newlyAssigned = 0n;
  for (const manifest of manifests) {
    if (manifest.content !== 0) {
      manifest.first_row_id = undefined;
      continue;
    }
    const added = BigInt(manifest.added_rows_count ?? 0);
    const existing = BigInt(manifest.existing_rows_count ?? 0);
    const span = added + existing;
    if (manifest.first_row_id == null) {
      manifest.first_row_id = cursor;
      cursor += span;
      newlyAssigned += span;
      continue;
    }
    const rangeEnd = BigInt(manifest.first_row_id) + span;
    if (rangeEnd > cursor) {
      cursor = rangeEnd;
    }
  }
  return newlyAssigned;
}
|
|
2788
|
+
/**
 * Convert a value to a plain number for use in table metadata.
 *
 * @param {*} value - Value passed through `Number()`.
 * @returns {number} The converted value.
 * @throws {Error} When the result is not a safe integer.
 */
function toMetadataLong(value) {
  const asNumber = Number(value);
  if (Number.isSafeInteger(asNumber)) {
    return asNumber;
  }
  throw new Error(`metadata long exceeds JavaScript safe integer range: ${value}`);
}
|
|
2793
|
+
/**
 * Write `records` as a Parquet file laid out according to an Iceberg schema.
 *
 * Schema fields that map to no Parquet elements are left out of both the
 * column data and the Parquet schema. The Iceberg schema is stored as JSON
 * under the `iceberg.schema` key-value metadata entry.
 *
 * @returns {*} Whatever `parquetWrite` returns.
 */
function writeParquet({ writer, schema, records, codec }) {
  const columns = [];
  const elements = [];
  for (const field of schema.fields) {
    const columnName = sanitize(field.name);
    const mapped = icebergTypeToParquetFields(columnName, field.type, field.required, field.id);
    if (!mapped.length) continue;
    columns.push({ name: columnName, data: extractColumn(records, field) });
    elements.push(...mapped);
  }
  // Exactly one column is pushed per kept field, so this is the root's child count.
  const root = { name: "root", num_children: columns.length };
  const kvMetadata = [{ key: "iceberg.schema", value: JSON.stringify(schema) }];
  return parquetWrite({
    writer,
    columnData: columns,
    schema: [root, ...elements],
    kvMetadata,
    codec,
  });
}
|
|
2822
|
+
/**
 * Pull one field out of every record as a column of materialized values.
 *
 * @param {object[]} records - Row objects keyed by field name.
 * @param {object} field - Schema field; `field.name` selects the property.
 * @returns {Array} One entry per record, in record order.
 */
function extractColumn(records, field) {
  return Array.from(
    { length: records.length },
    (_, row) => materializeFieldValue(records[row][field.name], field),
  );
}
|
|
2827
|
+
/**
 * Resolve the value to write for a field: the given value, else the field's
 * `write-default`, else `null`; nested defaults are then filled in.
 *
 * @param {*} value - Value read from the record (`undefined` when absent).
 * @param {object} field - Schema field with `type` and optional `write-default`.
 * @returns {*} Result of `materializeNestedDefaults` on the resolved value.
 */
function materializeFieldValue(value, field) {
  let resolved = value;
  if (resolved === undefined) {
    const fallback = field["write-default"];
    resolved = fallback === undefined ? null : fallback;
  }
  return materializeNestedDefaults(resolved, field.type);
}
|
|
2831
|
+
/**
 * Recursively fill in write-defaults inside struct, list and map values.
 *
 * Null/undefined values and primitive (non-object) types are returned as-is,
 * as are values whose shape does not match the nested type.
 *
 * @param {*} value - Value to materialize.
 * @param {*} type - Iceberg type; only object (nested) types are processed.
 * @returns {*} A copy with defaults applied, or the original value.
 */
function materializeNestedDefaults(value, type) {
  if (value == null || typeof type !== "object") return value;
  switch (type.type) {
    case "struct": {
      if (typeof value !== "object" || Array.isArray(value)) return value;
      // Start from a shallow copy so properties outside the schema survive.
      const filled = { ...value };
      for (const child of type.fields) {
        filled[child.name] = materializeFieldValue(value[child.name], child);
      }
      return filled;
    }
    case "list":
      return Array.isArray(value)
        ? value.map((item) => materializeNestedDefaults(item, type.element))
        : value;
    case "map":
      return materializeMapDefaults(value, type);
    default:
      return value;
  }
}
|
|
2846
|
+
/**
 * Apply nested defaults to the keys and values of a map value.
 *
 * Accepts a `Map`, an array of `{ key, value }` objects or `[key, value]`
 * pairs, or a plain object. `Map` and array inputs yield an array of
 * `{ key, value }` entries; plain objects yield a new object with only the
 * values materialized. When both key and value types are primitive, the
 * input is returned unchanged.
 *
 * @param {*} value - Map value in one of the accepted shapes.
 * @param {object} type - Iceberg map type with `key` and `value` types.
 * @returns {*} Materialized entries, or the original value.
 */
function materializeMapDefaults(value, type) {
  const keyType = type.key;
  const valueType = type.value;
  if (typeof keyType !== "object" && typeof valueType !== "object") return value;

  const toEntry = (rawKey, rawValue) => ({
    key: materializeNestedDefaults(rawKey, keyType),
    value: materializeNestedDefaults(rawValue, valueType),
  });

  if (value instanceof Map) {
    return Array.from(value.entries(), ([rawKey, rawValue]) => toEntry(rawKey, rawValue));
  }
  if (Array.isArray(value)) {
    return value.map((entry) => {
      const isKeyValueObject = entry && typeof entry === "object" && "key" in entry && "value" in entry;
      if (isKeyValueObject) return toEntry(entry.key, entry.value);
      if (Array.isArray(entry) && entry.length === 2) return toEntry(entry[0], entry[1]);
      return entry;
    });
  }
  if (typeof value === "object") {
    const pairs = Object.entries(value).map(([rawKey, rawValue]) => [
      rawKey,
      materializeNestedDefaults(rawValue, valueType),
    ]);
    return Object.fromEntries(pairs);
  }
  return value;
}
|
|
2866
|
+
/**
 * Translate one Iceberg field into a flat, depth-first list of Parquet schema
 * elements (the parent element followed by all of its descendants).
 *
 * @param {string} name - Parquet element name for this field.
 * @param {string|object} type - Iceberg type: a primitive type string or a
 *   nested `list` / `struct` / `map` type object.
 * @param {boolean} required - Truthy maps to REQUIRED, otherwise OPTIONAL.
 * @param {number} [fieldId] - Iceberg field id stored as `field_id`.
 * @returns {object[]} Schema elements; an empty array for an optional
 *   `unknown` field, which callers treat as "nothing to write".
 * @throws {Error} For unsupported types, a required `unknown`, a struct with
 *   no writable children, or map keys other than `string` / `int`.
 */
function icebergTypeToParquetFields(name, type, required, fieldId) {
  const repetition_type = required ? "REQUIRED" : "OPTIONAL";
  if (typeof type === "object") {
    if (type.type === "list") {
      const elementFields = icebergTypeToParquetFields("element", type.element, type["element-required"], type["element-id"]);
      if (!elementFields.length) throw new Error(`unsupported iceberg list element type: ${typeName(type.element)}`);
      // Three-level layout: <name> (LIST) -> repeated "list" group -> "element".
      return [
        {
          name,
          converted_type: "LIST",
          logical_type: { type: "LIST" },
          repetition_type,
          num_children: 1,
          field_id: fieldId
        },
        {
          name: "list",
          repetition_type: "REPEATED",
          num_children: 1
        },
        ...elementFields
      ];
    }
    if (type.type === "struct") {
      const allChildren = [];
      // Counts direct children only; allChildren also holds their descendants.
      let directChildren = 0;
      for (const child of type.fields) {
        const sub = icebergTypeToParquetFields(child.name, child.type, child.required, child.id);
        // Children that map to no elements (optional `unknown`) are skipped.
        if (!sub.length) continue;
        allChildren.push(...sub);
        directChildren++;
      }
      if (!directChildren) throw new Error(`struct ${name} has no writable children`);
      return [{
        name,
        repetition_type,
        num_children: directChildren,
        field_id: fieldId
      }, ...allChildren];
    }
    if (type.type === "map") {
      if (type.key !== "string" && type.key !== "int") throw new Error(`unsupported iceberg map key type: ${typeName(type.key)}`);
      // Keys are always emitted as REQUIRED.
      const keyFields = icebergTypeToParquetFields("key", type.key, true, type["key-id"]);
      const valueFields = icebergTypeToParquetFields("value", type.value, type["value-required"], type["value-id"]);
      if (!keyFields.length) throw new Error(`unsupported iceberg map key type: ${typeName(type.key)}`);
      if (!valueFields.length) throw new Error(`unsupported iceberg map value type: ${typeName(type.value)}`);
      // Layout: <name> (MAP) -> repeated "key_value" group -> "key", "value".
      return [
        {
          name,
          converted_type: "MAP",
          logical_type: { type: "MAP" },
          repetition_type,
          num_children: 1,
          field_id: fieldId
        },
        {
          name: "key_value",
          repetition_type: "REPEATED",
          num_children: 2
        },
        ...keyFields,
        ...valueFields
      ];
    }
    throw new Error(`unsupported iceberg nested type: ${JSON.stringify(type)}`);
  }
  // From here on `type` is a primitive type string. Prefix checks run first so
  // that any string starting with "geometry" / "geography" is accepted.
  if (type.startsWith("geometry")) return [{
    name,
    type: "BYTE_ARRAY",
    logical_type: { type: "GEOMETRY" },
    repetition_type,
    field_id: fieldId
  }];
  if (type.startsWith("geography")) return [{
    name,
    type: "BYTE_ARRAY",
    logical_type: { type: "GEOGRAPHY" },
    repetition_type,
    field_id: fieldId
  }];
  const decimal = parseDecimalType(type);
  if (decimal) {
    const { precision, scale } = decimal;
    // Decimals are always written as fixed-length byte arrays sized by precision.
    return [{
      name,
      type: "FIXED_LEN_BYTE_ARRAY",
      type_length: decimalRequiredBytes(precision),
      converted_type: "DECIMAL",
      logical_type: {
        type: "DECIMAL",
        precision,
        scale
      },
      precision,
      scale,
      repetition_type,
      field_id: fieldId
    }];
  }
  const fixedLen = parseFixedType(type);
  if (fixedLen !== void 0) return [{
    name,
    type: "FIXED_LEN_BYTE_ARRAY",
    type_length: fixedLen,
    repetition_type,
    field_id: fieldId
  }];
  switch (type) {
    case "unknown":
      // An optional `unknown` column is omitted entirely; a required one cannot be written.
      if (required) throw new Error("unsupported required iceberg type: unknown");
      return [];
    // Variant is a group with two binary children: "metadata" and "value".
    case "variant": return [
      {
        name,
        repetition_type,
        num_children: 2,
        logical_type: { type: "VARIANT" },
        field_id: fieldId
      },
      {
        name: "metadata",
        type: "BYTE_ARRAY",
        repetition_type: "REQUIRED"
      },
      {
        name: "value",
        type: "BYTE_ARRAY",
        repetition_type: "OPTIONAL"
      }
    ];
    case "boolean": return [{
      name,
      type: "BOOLEAN",
      repetition_type,
      field_id: fieldId
    }];
    case "int": return [{
      name,
      type: "INT32",
      repetition_type,
      field_id: fieldId
    }];
    case "long": return [{
      name,
      type: "INT64",
      repetition_type,
      field_id: fieldId
    }];
    case "float": return [{
      name,
      type: "FLOAT",
      repetition_type,
      field_id: fieldId
    }];
    case "double": return [{
      name,
      type: "DOUBLE",
      repetition_type,
      field_id: fieldId
    }];
    case "string": return [{
      name,
      type: "BYTE_ARRAY",
      converted_type: "UTF8",
      repetition_type,
      field_id: fieldId
    }];
    case "binary": return [{
      name,
      type: "BYTE_ARRAY",
      repetition_type,
      field_id: fieldId
    }];
    case "uuid": return [{
      name,
      type: "FIXED_LEN_BYTE_ARRAY",
      type_length: 16,
      logical_type: { type: "UUID" },
      repetition_type,
      field_id: fieldId
    }];
    case "date": return [{
      name,
      type: "INT32",
      converted_type: "DATE",
      logical_type: { type: "DATE" },
      repetition_type,
      field_id: fieldId
    }];
    case "time": return [{
      name,
      type: "INT64",
      converted_type: "TIME_MICROS",
      logical_type: {
        type: "TIME",
        isAdjustedToUTC: false,
        unit: "MICROS"
      },
      repetition_type,
      field_id: fieldId
    }];
    // The four timestamp variants differ only in UTC adjustment and unit.
    case "timestamp": return [timestampField(name, repetition_type, false, "MICROS", fieldId)];
    case "timestamptz": return [timestampField(name, repetition_type, true, "MICROS", fieldId)];
    case "timestamp_ns": return [timestampField(name, repetition_type, false, "NANOS", fieldId)];
    case "timestamptz_ns": return [timestampField(name, repetition_type, true, "NANOS", fieldId)];
    default: throw new Error(`unsupported iceberg type: ${type}`);
  }
}
|
|
3074
|
+
/**
 * Extract the byte length from a `fixed[N]` type string.
 *
 * @param {string} type - Iceberg primitive type string.
 * @returns {number|undefined} N as an integer, or `undefined` when `type` is
 *   not exactly of the form `fixed[<digits>]`.
 */
function parseFixedType(type) {
  const match = /^fixed\[(\d+)\]$/.exec(type);
  return match ? parseInt(match[1], 10) : undefined;
}
|
|
3079
|
+
/**
 * Build the Parquet schema element for a timestamp column (INT64 with a
 * TIMESTAMP logical type).
 *
 * @param {string} name - Element name.
 * @param {string} repetition_type - Parquet repetition type.
 * @param {boolean} isAdjustedToUTC - Stored on the logical type.
 * @param {string} unit - Logical-type unit as given by the caller.
 * @param {number} [field_id] - Iceberg field id.
 * @returns {object} The schema element.
 */
function timestampField(name, repetition_type, isAdjustedToUTC, unit, field_id) {
  const logical_type = { type: "TIMESTAMP", isAdjustedToUTC, unit };
  return { name, type: "INT64", logical_type, repetition_type, field_id };
}
|
|
3092
|
+
/**
 * Build a record comparator from an Iceberg sort order.
 *
 * Each sort field is resolved against top-level schema fields by id
 * (`source-id`, falling back to the first of `source-ids`). Records are
 * compared field by field; the first non-zero result wins.
 *
 * @param {object} [sortOrder] - Sort order with a `fields` array.
 * @param {object} schema - Schema whose `fields` are searched by id.
 * @returns {Function|undefined} `(a, b) => number`, or `undefined` when the
 *   sort order is missing or has no fields.
 * @throws {Error} When a sort field's source id is not in the schema.
 */
function buildSortComparator(sortOrder, schema) {
  const sortFields = sortOrder?.fields;
  if (!sortFields?.length) return undefined;

  const keys = sortFields.map((sortField) => {
    const sourceId = sortField["source-id"] ?? sortField["source-ids"]?.[0];
    const source = schema.fields.find((candidate) => candidate.id === sourceId);
    if (!source) {
      throw new Error(`sort source field id ${sourceId} not found in schema`);
    }
    const { transform } = sortField;
    return {
      name: source.name,
      transform,
      sourceType: source.type,
      resultType: transformResultType(transform, source.type),
      desc: sortField.direction === "desc",
      nullsFirst: sortField["null-order"] === "nulls-first",
    };
  });

  return (a, b) => {
    for (const key of keys) {
      const left = sortKey(a[key.name], key.transform, key.sourceType);
      const right = sortKey(b[key.name], key.transform, key.sourceType);
      const order = compareKeys(left, right, key.resultType, key.desc, key.nullsFirst);
      if (order !== 0) return order;
    }
    return 0;
  };
}
|
|
3115
|
+
/**
 * Derive the sort key for a value under a sort-field transform.
 *
 * @param {*} value - Source column value.
 * @param {string} transform - Transform name; `"identity"` passes through.
 * @param {*} sourceType - Source type handed to `applyTransform`.
 * @returns {*} `null` for null/undefined input, the value itself for the
 *   identity transform, otherwise the result of `applyTransform`.
 */
function sortKey(value, transform, sourceType) {
  if (value == null) return null;
  return transform === "identity" ? value : applyTransform(transform, value, sourceType);
}
|
|
3120
|
+
/**
 * Compare two sort keys.
 *
 * Null/undefined keys are placed according to `nullsFirst` regardless of
 * direction. NaN sorts after every other number in ascending order; `desc`
 * flips that, as it does the result of `compare` for ordinary values.
 *
 * @param {*} ka - Left key.
 * @param {*} kb - Right key.
 * @param {*} resultType - Type handed to `compare`.
 * @param {boolean} desc - Whether the order is descending.
 * @param {boolean} nullsFirst - Whether nulls sort before non-nulls.
 * @returns {number} Negative, zero or positive.
 */
function compareKeys(ka, kb, resultType, desc, nullsFirst) {
  const leftMissing = ka == null;
  const rightMissing = kb == null;
  if (leftMissing || rightMissing) {
    if (leftMissing && rightMissing) return 0;
    const whenLeftMissing = nullsFirst ? -1 : 1;
    return leftMissing ? whenLeftMissing : -whenLeftMissing;
  }

  // Number.isNaN is false for non-numbers, so no separate typeof check is needed.
  const leftNaN = Number.isNaN(ka);
  const rightNaN = Number.isNaN(kb);
  let ascending;
  if (leftNaN || rightNaN) {
    if (leftNaN && rightNaN) return 0;
    ascending = leftNaN ? 1 : -1;
  } else {
    ascending = compare(ka, kb, resultType);
  }
  return desc ? -ascending : ascending;
}
|
|
3136
|
+
/**
 * Write the data files and the data manifest for an append, without touching
 * snapshot metadata. The returned descriptor is consumed when the snapshot is
 * staged for commit.
 *
 * Records are grouped by partition when the default spec has fields (one data
 * file per group); otherwise all records go into a single file. Each group is
 * sorted with the resolved sort order, if any.
 *
 * @param {object} args
 * @param {string} args.tableUrl - Table root; files go under `/data` and `/metadata`.
 * @param {object} args.metadata - Table metadata (format version 2 or 3).
 * @param {object[]} args.records - Rows to append.
 * @param {object} args.resolver - Must provide `writer(path)`.
 * @param {number} [args.sortOrderId] - Explicit sort order; defaults to the
 *   table's `default-sort-order-id`, then 0.
 * @returns {Promise<object>} Snapshot id, manifest uuid/path/length, partition
 *   summaries, counts, sizes and the list of files written.
 * @throws {Error} On missing inputs, an unsupported format version, a missing
 *   default partition spec / current schema, or an unknown explicit sort order.
 */
async function prepareAppend({ tableUrl, metadata, records, resolver, sortOrderId }) {
  if (!tableUrl) throw new Error("tableUrl is required");
  if (!resolver?.writer) throw new Error("resolver.writer is required");
  const writerFn = resolver.writer;
  if (metadata["format-version"] !== 2 && metadata["format-version"] !== 3) throw new Error(`unsupported format-version: ${metadata["format-version"]}`);
  const formatVersion = metadata["format-version"];
  const partitionSpec = metadata["partition-specs"].find((s) => s["spec-id"] === metadata["default-spec-id"]);
  if (!partitionSpec) throw new Error("default partition spec not found in metadata");
  const schema = metadata.schemas.find((s) => s["schema-id"] === metadata["current-schema-id"]);
  if (!schema) throw new Error("current schema not found in metadata");
  validateSchemaForVersion(schema, formatVersion);
  const snapshotId = newSnapshotId(metadata);
  const manifestUuid = uuid4();
  // Both table properties are validated up front, before anything is written.
  checkWriteFormat(metadata.properties?.["write.format.default"]);
  const codec = resolveParquetCodec(metadata.properties?.["write.parquet.compression-codec"]);
  const orderId = sortOrderId ?? metadata["default-sort-order-id"] ?? 0;
  const sortOrder = (metadata["sort-orders"] ?? []).find((o) => o["order-id"] === orderId);
  // Only an explicitly requested sort order must exist; a missing default is tolerated.
  if (sortOrderId !== void 0 && !sortOrder) throw new Error(`sort order ${sortOrderId} not found in metadata`);
  const comparator = buildSortComparator(sortOrder, schema);
  // Files are recorded with sort order 0 when no comparator was actually applied.
  const appliedSortOrderId = comparator ? orderId : 0;
  const groups = partitionSpec.fields.length ? groupByPartition(records, schema, partitionSpec) : [{
    partition: {},
    records
  }];
  // One data file per group, written concurrently.
  const writtenDataFiles = await Promise.all(groups.map(async (group) => {
    // Sort a copy so the caller's record array is left in its original order.
    const sortedRecords = comparator ? [...group.records].sort(comparator) : group.records;
    const dataPath = `${tableUrl}/data/${uuid4()}.parquet`;
    const dataWriter = writerFn(dataPath);
    await writeParquet({
      writer: dataWriter,
      schema,
      records: sortedRecords,
      codec
    });
    const stats = computeColumnStats(sortedRecords, schema);
    return {
      partition: group.partition,
      records: sortedRecords,
      dataFile: {
        content: 0,
        file_path: dataPath,
        file_format: "parquet",
        partition: group.partition,
        record_count: BigInt(sortedRecords.length),
        // The writer's offset after writing is used as the file size.
        file_size_in_bytes: BigInt(dataWriter.offset),
        value_counts: stats.value_counts,
        null_value_counts: stats.null_value_counts,
        nan_value_counts: stats.nan_value_counts,
        lower_bounds: stats.lower_bounds,
        upper_bounds: stats.upper_bounds,
        sort_order_id: appliedSortOrderId
      },
      path: dataPath
    };
  }));
  const manifestPath = `${tableUrl}/metadata/${manifestUuid}-m0.avro`;
  const manifestWriter = writerFn(manifestPath);
  await writeDataManifest({
    writer: manifestWriter,
    schema,
    partitionSpec,
    snapshotId,
    dataFiles: writtenDataFiles.map((f) => f.dataFile),
    formatVersion
  });
  const manifestLength = BigInt(manifestWriter.offset);
  const addedRowCount = writtenDataFiles.reduce((sum, f) => sum + BigInt(f.records.length), 0n);
  const addedFilesSize = writtenDataFiles.reduce((sum, f) => sum + f.dataFile.file_size_in_bytes, 0n);
  const partitions = buildPartitionSummaries(writtenDataFiles.map((f) => f.dataFile.partition), schema, partitionSpec);
  return {
    snapshotId,
    manifestUuid,
    formatVersion,
    manifestPath,
    manifestLength,
    partitionSpecId: partitionSpec["spec-id"],
    partitions,
    addedDataFilesCount: writtenDataFiles.length,
    addedRowCount,
    addedFilesSize,
    recordsCount: records.length,
    writtenFiles: [...writtenDataFiles.map((f) => f.path), manifestPath]
  };
}
|
|
3220
|
+
/**
 * Stage the snapshot for a prepared append against the given metadata.
 *
 * Builds the manifest-list entry for the prepared manifest, derives the
 * snapshot summary from the previous snapshot's totals, and delegates to
 * `buildSnapshotUpdate`.
 *
 * An append adds no delete files, so the delete-related totals are carried
 * forward from the previous snapshot summary rather than reset to zero.
 *
 * @param {object} args
 * @param {string} args.tableUrl - Table root URL.
 * @param {object} args.metadata - Metadata the commit is staged against.
 * @param {object} args.prepared - Result of `prepareAppend`.
 * @param {object} args.resolver - Must provide `writer(path)`.
 * @returns {Promise<object>} The staged commit from `buildSnapshotUpdate`.
 * @throws {Error} When `tableUrl` or `resolver.writer` is missing.
 */
async function stageSnapshotForAppend({ tableUrl, metadata, prepared, resolver }) {
  if (!tableUrl) throw new Error("tableUrl is required");
  if (!resolver?.writer) throw new Error("resolver.writer is required");
  const sequenceNumber = BigInt(metadata["last-sequence-number"] ?? 0) + 1n;
  const timestampMs = Date.now();
  const newManifest = {
    manifest_path: prepared.manifestPath,
    manifest_length: prepared.manifestLength,
    partition_spec_id: prepared.partitionSpecId,
    content: 0,
    sequence_number: sequenceNumber,
    min_sequence_number: sequenceNumber,
    added_snapshot_id: prepared.snapshotId,
    added_files_count: prepared.addedDataFilesCount,
    existing_files_count: 0,
    deleted_files_count: 0,
    added_rows_count: prepared.addedRowCount,
    existing_rows_count: 0n,
    deleted_rows_count: 0n,
    partitions: prepared.partitions
  };
  const prevSummary = currentSnapshot(metadata)?.summary;
  // Summary values are strings; a missing previous total counts as zero.
  const prevTotal = (key) => BigInt(prevSummary?.[key] ?? "0");
  const summary = {
    operation: "append",
    "added-data-files": String(prepared.addedDataFilesCount),
    "added-records": String(prepared.recordsCount),
    "added-files-size": String(prepared.addedFilesSize),
    "changed-partition-count": String(prepared.addedDataFilesCount),
    "total-records": String(prevTotal("total-records") + BigInt(prepared.recordsCount)),
    "total-files-size": String(prevTotal("total-files-size") + prepared.addedFilesSize),
    "total-data-files": String(prevTotal("total-data-files") + BigInt(prepared.addedDataFilesCount)),
    "total-delete-files": String(prevTotal("total-delete-files")),
    "total-position-deletes": String(prevTotal("total-position-deletes")),
    "total-equality-deletes": String(prevTotal("total-equality-deletes"))
  };
  return await buildSnapshotUpdate({
    tableUrl,
    metadata,
    resolver,
    snapshotId: prepared.snapshotId,
    sequenceNumber,
    manifestUuid: prepared.manifestUuid,
    timestampMs,
    formatVersion: prepared.formatVersion,
    newManifests: [newManifest],
    summary,
    writtenFiles: []
  });
}
|
|
3274
|
+
/**
 * Validate the `write.format.default` table property.
 *
 * @param {string} [value] - Property value; `undefined` means "not set".
 * @throws {Error} When set to anything other than `parquet` (case-insensitive).
 */
function checkWriteFormat(value) {
  const isSet = value !== undefined;
  if (isSet && value.toLowerCase() !== "parquet") {
    throw new Error(`unsupported write.format.default: ${value}`);
  }
}
|
|
3278
|
+
/**
 * Map the `write.parquet.compression-codec` table property to a Parquet
 * codec name.
 *
 * @param {string} [value] - Property value; `undefined` means "not set".
 * @returns {string|undefined} `"SNAPPY"`, `"UNCOMPRESSED"`, or `undefined`
 *   when the property is not set.
 * @throws {Error} For any other codec name.
 */
function resolveParquetCodec(value) {
  if (value === undefined) return undefined;
  const normalized = value.toLowerCase();
  if (normalized === "snappy") return "SNAPPY";
  if (normalized === "none" || normalized === "uncompressed") return "UNCOMPRESSED";
  throw new Error(`unsupported write.parquet.compression-codec: ${value}`);
}
|
|
3287
|
+
function newSnapshotId(metadata) {
|
|
3288
|
+
const used = new Set((metadata?.snapshots ?? []).map((s) => BigInt(s["snapshot-id"])));
|
|
3289
|
+
const arr = new BigInt64Array(1);
|
|
3290
|
+
for (let attempt = 0; attempt < 32; attempt++) {
|
|
3291
|
+
globalThis.crypto.getRandomValues(arr);
|
|
3292
|
+
const masked = arr[0] & 9007199254740991n;
|
|
3293
|
+
const id = masked === 0n ? 1n : masked;
|
|
3294
|
+
if (!used.has(id)) return id;
|
|
3295
|
+
}
|
|
3296
|
+
throw new Error("newSnapshotId: failed to find an unused id after 32 attempts");
|
|
3297
|
+
}
|
|
3298
|
+
/**
 * Load the manifests (with their entries) of a snapshot.
 *
 * @param {object} args
 * @param {object} args.metadata - Table metadata.
 * @param {object} [args.resolver] - Defaults to `urlResolver()`.
 * @param {number|bigint} [args.snapshotId] - Defaults to the current snapshot.
 * @param {Function} [args.partitionFilter] - Called with
 *   `(partitions, partitionSpecId, manifest)`; a manifest is dropped only
 *   when it returns exactly `false`. A filter that throws keeps the manifest.
 * @returns {Promise<Array>} Result of `fetchManifests` for the kept manifests.
 * @throws {Error} When no snapshot id is available, the snapshot is not in
 *   the metadata, or it carries no manifest information.
 */
async function icebergManifests({ metadata, resolver, snapshotId, partitionFilter }) {
  const activeResolver = resolver ?? urlResolver();

  const requestedId = snapshotId ?? metadata["current-snapshot-id"];
  if (requestedId == null || requestedId < 0) {
    throw new Error("No current snapshot id found in table metadata");
  }
  // Normalise to bigint so number and bigint ids compare equal.
  const wanted = BigInt(requestedId);
  const snapshot = metadata.snapshots?.find((s) => BigInt(s["snapshot-id"]) === wanted);
  if (!snapshot) throw new Error(`Snapshot ${requestedId} not found in metadata`);

  let manifests;
  const manifestListUrl = snapshot["manifest-list"];
  if (manifestListUrl) {
    manifests = await fetchAvroRecords(manifestListUrl, activeResolver);
  } else if (snapshot.manifests) {
    manifests = snapshot.manifests;
  } else {
    throw new Error("No manifest information found in snapshot");
  }

  if (partitionFilter) {
    const shouldKeep = (manifest) => {
      try {
        return partitionFilter(manifest.partitions, manifest.partition_spec_id ?? 0, manifest) !== false;
      } catch {
        // Pruning is only an optimisation: a failing filter must not hide data.
        return true;
      }
    };
    manifests = manifests.filter((manifest) => shouldKeep(manifest));
  }
  return await fetchManifests(manifests, activeResolver);
}
|
|
3322
|
+
/**
 * Fetch the entries of each manifest and fill in values inherited from the
 * manifest-list record (partition spec id, sequence numbers, first row ids).
 *
 * @param {object[]} manifests - Manifest-list records.
 * @param {object} resolver - Resolver handed through to `fetchAvroRecords`.
 * @returns {Promise<Array<{url: string, entries: object[]}>>} One result per
 *   manifest, in input order.
 * @throws {Error} When an entry whose status is not 1 has no
 *   `file_sequence_number`.
 */
async function fetchManifests(manifests, resolver) {
  return await Promise.all(manifests.map(async (manifest) => {
    const url = manifest.manifest_path;
    const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
    for (const entry of entries) {
      entry.partition_spec_id = manifest.partition_spec_id ?? 0;
      // Every entry with no data sequence number inherits the manifest's
      // (0n when the manifest has none). Because this runs for all statuses,
      // later `sequence_number === undefined` checks could never fire and
      // have been removed.
      // NOTE(review): this also means entries whose status is not 1 are never
      // rejected for a missing data sequence number — confirm that is intended.
      if (entry.sequence_number === undefined) {
        entry.sequence_number = manifest.sequence_number ?? 0n;
      }
      if (entry.file_sequence_number === undefined) {
        // Only entries with status 1 may inherit the file sequence number.
        if (entry.status !== 1) {
          throw new Error("iceberg manifest entry missing sequence number");
        }
        entry.file_sequence_number = manifest.sequence_number;
      }
    }
    assignFirstRowIds(manifest, entries);
    return {
      url,
      entries
    };
  }));
}
|
|
3341
|
+
/**
 * Fill in `first_row_id` on the data files of a data manifest.
 *
 * Does nothing unless the manifest has `content === 0` and its own
 * `first_row_id`. Data files that already carry an id are left alone and do
 * not advance the running id. Entries are mutated in place.
 *
 * @param {object} manifest - Manifest-list record.
 * @param {object[]} entries - Manifest entries, each with a `data_file`.
 */
function assignFirstRowIds(manifest, entries) {
  const isDataManifest = manifest.content === 0;
  if (!isDataManifest || manifest.first_row_id == null) return;
  let cursor = BigInt(manifest.first_row_id);
  for (const { data_file: file } of entries) {
    const needsId = file.content === 0 && file.first_row_id == null;
    if (!needsId) continue;
    file.first_row_id = cursor;
    cursor += BigInt(file.record_count);
  }
}
|
|
3353
|
+
/**
 * Frozen default retry settings.
 * NOTE(review): field meanings are inferred from their names (attempt cap,
 * initial/maximum delay in ms, growth factor, overall time budget in ms) —
 * confirm against the code that consumes this object.
 */
const DEFAULT_RETRY = Object.freeze({
  maxAttempts: 50,
  initialMs: 50,
  maxMs: 3000,
  factor: 2,
  // 30 minutes expressed in milliseconds.
  totalTimeoutMs: 30 * 60 * 1000,
});
|
|
3360
|
+
/**
 * Append records to a table: load it, write data files and the manifest
 * once, then commit through `commitWithRetry`.
 *
 * The prepared files are reused on every staging call; only the snapshot is
 * re-staged against the working context that `commitWithRetry` supplies.
 *
 * @returns {Promise<*>} Whatever `commitWithRetry` resolves to.
 */
async function icebergAppend({ catalog, namespace, table, tableUrl, resolver, records, sortOrderId }) {
  const caller = "icebergAppend";
  const ctx = await loadTable({ catalog, namespace, table, tableUrl, resolver });

  const prepared = await prepareAppend({
    tableUrl: ctx.tableUrl,
    metadata: ctx.metadata,
    records,
    resolver: requireResolver(ctx.resolver, caller),
    sortOrderId,
  });

  const stage = (workingCtx) => stageSnapshotForAppend({
    tableUrl: workingCtx.tableUrl,
    metadata: workingCtx.metadata,
    prepared,
    resolver: requireResolver(workingCtx.resolver, caller),
  });

  return await commitWithRetry({
    catalog,
    target: { namespace, table },
    ctx,
    stage,
  });
}
|
|
3390
|
+
/**
 * Create a table through a REST catalog or a file catalog.
 *
 * REST catalogs need `namespace`, `table` and `schema`; `tableUrl` is sent as
 * the location and `sortOrder` as the write order. File catalogs need
 * `tableUrl` and use the catalog's resolver and conditional-commit setting.
 *
 * @returns {Promise<object>} Table metadata for REST catalogs, otherwise
 *   whatever `icebergCreate` resolves to.
 * @throws {Error} When a required argument for the catalog type is missing.
 */
async function icebergCreateTable({ catalog, namespace, table, tableUrl, schema, partitionSpec, sortOrder, properties, formatVersion, stageCreate }) {
  const isRest = catalog.type === "rest";
  if (!isRest) {
    if (!tableUrl) throw new Error("tableUrl is required for file catalogs");
    return await icebergCreate({
      tableUrl,
      resolver: catalog.resolver,
      schema,
      formatVersion,
      partitionSpec,
      sortOrder,
      properties,
      conditionalCommits: catalog.conditionalCommits,
    });
  }

  if (!namespace || !table) throw new Error("namespace and table are required for rest catalogs");
  if (!schema) throw new Error("schema is required for rest catalogs");
  const request = {
    namespace,
    table,
    schema,
    location: tableUrl,
    partitionSpec,
    writeOrder: sortOrder,
    stageCreate,
    properties,
  };
  const response = await restCatalogCreateTable(catalog, request);
  return response.metadata;
}
|
|
3418
|
+
/**
 * Drop an Iceberg table.
 *
 * For REST catalogs the drop is delegated to `restCatalogDropTable`. For any other
 * catalog type the objects listed under `<tableUrl>/metadata` (and `<tableUrl>/data`
 * when `purgeRequested` is truthy) are deleted with the catalog resolver's `deleter`.
 * Deletion is best effort: a rejected listing is treated as an empty directory and
 * individual delete rejections are ignored.
 *
 * @param {object} options
 * @param {object} options.catalog - Catalog descriptor; `type` selects the code path.
 * @param {*} [options.namespace] - Required for REST catalogs.
 * @param {*} [options.table] - Required for REST catalogs.
 * @param {string} [options.tableUrl] - Required for file catalogs.
 * @param {(dirUrl: string) => Promise<string[]>} [options.lister] - Required for file catalogs.
 * @param {boolean} [options.purgeRequested] - Also remove data files (file catalogs) / forwarded to REST.
 * @returns {Promise<void>}
 * @throws {Error} When an argument required by the selected catalog type is missing.
 */
async function icebergDropTable({ catalog, namespace, table, tableUrl, lister, purgeRequested }) {
  if (catalog.type === "rest") {
    if (!(namespace && table)) throw new Error("namespace and table are required for rest catalogs");
    const dropRequest = { namespace, table, purgeRequested };
    await restCatalogDropTable(catalog, dropRequest);
    return;
  }

  if (!tableUrl) throw new Error("tableUrl is required for file catalogs");
  if (!lister) throw new Error("lister is required to drop a file catalog table");
  // Kept as a detached function reference, exactly as the destructured original was called.
  const removeObject = catalog.resolver.deleter;
  if (!removeObject) throw new Error("resolver.deleter is required to drop a file catalog table");

  const targetDirs = ["metadata"];
  if (purgeRequested) targetDirs.push("data");

  // Directories are processed one after another; deletes within a directory run concurrently.
  for (const dirName of targetDirs) {
    const entries = await lister(`${tableUrl}/${dirName}`).catch(() => []);
    const deletions = entries.map((entry) => removeObject(`${tableUrl}/${dirName}/${entry}`));
    await Promise.allSettled(deletions);
  }
}
|
|
3438
|
+
/**
 * Return `resolver` unchanged when it is truthy, otherwise fail with a message
 * that names the calling operation.
 *
 * @param {*} resolver - Resolver to validate.
 * @param {string} caller - Operation name used as the error message prefix.
 * @returns {*} The same `resolver` value.
 * @throws {Error} When `resolver` is falsy.
 */
function requireResolver(resolver, caller) {
  if (resolver) return resolver;
  throw new Error(`${caller}: resolver is required`);
}
|
|
3442
|
+
/**
 * Commit a staged change set to the catalog.
 *
 * REST catalogs receive the staged `requirements` and `updates` through
 * `restCatalogUpdateTable`, and the metadata from the response is returned.
 * Every other catalog type is committed with `fileCatalogCommit`, which needs
 * `ctx.resolver`.
 *
 * @param {object} catalog - Catalog descriptor; `type` selects the code path.
 * @param {{namespace?: *, table?: *}} target - Table identity used on the REST path.
 * @param {object} ctx - Working context (`tableUrl`, `metadata`, `metadataFileName`, `version`, `resolver`).
 * @param {object} staged - Staged commit; `requirements` and `updates` are read on the REST path.
 * @returns {Promise<*>} Metadata from the REST response, or the result of `fileCatalogCommit`.
 * @throws {Error} When committing to a non-REST catalog without `ctx.resolver`.
 */
async function commitStaged(catalog, target, ctx, staged) {
  if (catalog.type !== "rest") {
    const { resolver } = ctx;
    if (!resolver) throw new Error("resolver is required to commit to a file catalog");
    // Evaluates to `false` for any non-"file" type, otherwise to the catalog's own flag value.
    const conditionalCommits = catalog.type === "file" && catalog.conditionalCommits;
    return await fileCatalogCommit({
      tableUrl: ctx.tableUrl,
      metadata: ctx.metadata,
      metadataFileName: ctx.metadataFileName,
      currentVersion: ctx.version,
      staged,
      resolver,
      conditionalCommits
    });
  }

  const updateRequest = {
    namespace: target.namespace,
    table: target.table,
    requirements: staged.requirements,
    updates: staged.updates
  };
  const response = await restCatalogUpdateTable(catalog, updateRequest);
  return response.metadata;
}
|
|
3463
|
+
/**
 * Stage and commit a change, retrying on commit conflicts.
 *
 * Each attempt re-stages against the current working context and commits it with
 * `commitStaged`. Retries only happen for REST catalogs and for file catalogs with
 * `conditionalCommits === true`, and only when `isCommitConflict` recognises the error.
 * Between attempts the function sleeps for a jittered backoff (capped by the remaining
 * time budget) and reloads the context with `reloadCtx`.
 *
 * When the function gives up, the thrown error carries the last conflict error as
 * `cause`, so callers can still inspect the underlying failure.
 *
 * @param {object} options
 * @param {object} options.catalog - Catalog descriptor.
 * @param {object} options.target - Table identity passed to `commitStaged` / `reloadCtx`.
 * @param {object} options.ctx - Initial working context; `ctx.metadata` supplies the retry policy.
 * @param {(workingCtx: object) => Promise<object>} options.stage - Produces the staged commit for a context.
 * @returns {Promise<*>} Whatever `commitStaged` returns for the successful attempt.
 * @throws {*} The original error when retries are disabled or the error is not a conflict.
 * @throws {Error} When the attempt count or the total time budget is exhausted.
 */
async function commitWithRetry({ catalog, target, ctx, stage }) {
  const retryEnabled = catalog.type === "rest" || (catalog.type === "file" && catalog.conditionalCommits === true);
  const policy = resolveRetryPolicy(ctx.metadata);
  // Normalise to a whole number of attempts (at least one). A fractional or invalid
  // value would otherwise slip past an exact-equality exhaustion check and end in a
  // wasted sleep/reload followed by a misleading "unreachable" error.
  const maxAttempts = Math.max(1, Math.floor(policy.maxAttempts) || 1);
  const startedAt = Date.now();
  let workingCtx = ctx;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const staged = await stage(workingCtx);
    try {
      return await commitStaged(catalog, target, workingCtx, staged);
    } catch (err) {
      if (!retryEnabled || !isCommitConflict(err)) throw err;
      if (attempt >= maxAttempts) {
        throw new Error(`${catalog.type} catalog commit failed after ${maxAttempts} attempts due to concurrent commits`, { cause: err });
      }
      const elapsed = Date.now() - startedAt;
      if (elapsed >= policy.totalTimeoutMs) {
        throw new Error(`${catalog.type} catalog commit retry budget exhausted after ${attempt} attempts and ${elapsed}ms (limit ${policy.totalTimeoutMs}ms)`, { cause: err });
      }
      // Never sleep past the overall time budget.
      const remaining = policy.totalTimeoutMs - elapsed;
      await sleep(Math.min(jitteredBackoff(attempt, policy), remaining));
      // Pick up whatever the concurrent committer wrote before re-staging.
      workingCtx = await reloadCtx(catalog, target, workingCtx, err);
    }
  }
  throw new Error("unreachable");
}
|
|
3484
|
+
/**
 * Build a fresh working context after a failed commit attempt.
 *
 * REST catalogs reload the table metadata with `restCatalogLoadTable` and keep the
 * previous `metadataFileName` and `version`. Other catalogs reload metadata, file
 * name and version with `loadLatestFileCatalogMetadata`. `tableUrl` and `resolver`
 * are always carried over from the previous context.
 *
 * @param {object} catalog - Catalog descriptor; `type` selects the code path.
 * @param {{namespace?: *, table?: *}} target - Table identity, required on the REST path.
 * @param {object} workingCtx - Context used by the failed attempt.
 * @param {*} lastErr - Error from the failed attempt; rethrown when a reload is impossible.
 * @returns {Promise<{metadata: *, metadataFileName: *, version: *, tableUrl: *, resolver: *}>}
 * @throws {*} `lastErr` when the REST target is incomplete or the file path has no resolver.
 */
async function reloadCtx(catalog, target, workingCtx, lastErr) {
  if (catalog.type === "rest") {
    const { namespace, table } = target;
    if (!(namespace && table)) throw lastErr;
    const loaded = await restCatalogLoadTable(catalog, { namespace, table });
    return {
      metadata: loaded.metadata,
      metadataFileName: workingCtx.metadataFileName,
      version: workingCtx.version,
      tableUrl: workingCtx.tableUrl,
      resolver: workingCtx.resolver
    };
  }

  if (!workingCtx.resolver) throw lastErr;
  const latest = await loadLatestFileCatalogMetadata({
    tableUrl: workingCtx.tableUrl,
    resolver: workingCtx.resolver,
    lister: catalog.lister
  });
  const { metadata, metadataFileName, version } = latest;
  return {
    metadata,
    metadataFileName,
    version,
    tableUrl: workingCtx.tableUrl,
    resolver: workingCtx.resolver
  };
}
|
|
3513
|
+
/**
 * Derive the commit retry policy from table metadata properties.
 *
 * Reads `commit.retry.num-retries`, `commit.retry.min-wait-ms`,
 * `commit.retry.max-wait-ms` and `commit.retry.total-timeout-ms` through
 * `parseTableProp`; anything missing or unparsable falls back to `DEFAULT_RETRY`.
 * The backoff `factor` always comes from `DEFAULT_RETRY`.
 *
 * @param {{properties?: Record<string, *>}} metadata - Table metadata.
 * @returns {{maxAttempts: number, initialMs: number, maxMs: number, factor: number, totalTimeoutMs: number}}
 */
function resolveRetryPolicy(metadata) {
  const props = metadata.properties ?? {};
  const numRetries = parseTableProp(props["commit.retry.num-retries"]);
  // Attempts = retries + the initial try. Floor the retry count so a fractional
  // property value cannot produce a fractional attempt limit, which an
  // exact-equality exhaustion check in the retry loop would never match.
  const maxAttempts = numRetries === void 0 ? DEFAULT_RETRY.maxAttempts : Math.floor(numRetries) + 1;
  const initialMs = parseTableProp(props["commit.retry.min-wait-ms"]) ?? DEFAULT_RETRY.initialMs;
  const maxMs = parseTableProp(props["commit.retry.max-wait-ms"]) ?? DEFAULT_RETRY.maxMs;
  const totalTimeoutMs = parseTableProp(props["commit.retry.total-timeout-ms"]) ?? DEFAULT_RETRY.totalTimeoutMs;
  return {
    maxAttempts,
    initialMs,
    maxMs,
    factor: DEFAULT_RETRY.factor,
    totalTimeoutMs
  };
}
|
|
3528
|
+
/**
 * Parse a table property into a non-negative finite number.
 *
 * String values are trimmed first, so a blank or whitespace-only value counts as
 * "not set" instead of being coerced to 0 by `Number()`.
 *
 * @param {*} value - Raw property value (normally a string).
 * @returns {number | undefined} The parsed number, or `undefined` when the value is
 *   missing, blank, not a finite number, or negative.
 */
function parseTableProp(value) {
  if (value === void 0 || value === null) return void 0;
  const candidate = typeof value === "string" ? value.trim() : value;
  if (candidate === "") return void 0;
  const n = Number(candidate);
  if (!Number.isFinite(n) || n < 0) return void 0;
  return n;
}
|
|
3534
|
+
/**
 * Compute a randomised wait before the next retry attempt.
 *
 * The upper bound grows as `initialMs * factor ** (attempt - 1)` and is capped at
 * `maxMs`; the result is a random whole number of milliseconds below that bound.
 * A policy with `initialMs` or `maxMs` equal to 0 disables waiting.
 *
 * @param {number} attempt - 1-based number of the attempt that just failed.
 * @param {{initialMs: number, maxMs: number, factor: number}} policy - Retry policy.
 * @returns {number} Milliseconds to wait.
 */
function jitteredBackoff(attempt, policy) {
  const { initialMs, maxMs, factor } = policy;
  const waitingDisabled = initialMs === 0 || maxMs === 0;
  if (waitingDisabled) return 0;
  const uncapped = initialMs * Math.pow(factor, attempt - 1);
  const ceiling = Math.min(maxMs, uncapped);
  return Math.floor(Math.random() * ceiling);
}
|
|
3539
|
+
/**
 * Wait for `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds; values `<= 0` resolve without scheduling a timer.
 * @returns {Promise<void>} Resolves once the delay has elapsed.
 */
function sleep(ms) {
  const skipTimer = ms <= 0;
  return skipTimer
    ? Promise.resolve()
    : new Promise((done) => setTimeout(done, ms));
}
|
|
3543
|
+
/**
 * Tell whether an error represents a commit conflict, i.e. an object whose
 * `status` is 412 or 409.
 *
 * @param {*} err - Value caught from a commit attempt.
 * @returns {boolean} `true` for conflict errors, `false` for everything else.
 */
function isCommitConflict(err) {
  const isObject = Boolean(err) && typeof err === "object";
  if (!isObject) return false;
  const code = err.status;
  return code === 412 || code === 409;
}
|
|
3548
|
+
// Shared text encoder; sha256hex and hmac use it to turn string inputs into bytes.
const enc = new TextEncoder();
|
|
3549
|
+
/**
 * Create a resolver (`reader`, `writer`, `deleter`) that accesses S3-style object
 * storage through `fetch`, signing every request with AWS Signature Version 4.
 *
 * `s3://` and `s3a://` URLs are mapped to HTTP(S) URLs: against the custom
 * `endpoint` when one is given (path-style or virtual-hosted, per `pathStyle`),
 * otherwise against `https://<bucket>.s3.amazonaws.com/`. Other URLs are used as-is.
 *
 * @param {object} options
 * @param {string} options.accessKeyId - Access key id placed in the Authorization credential.
 * @param {string} options.secretAccessKey - Secret used to derive the signing key.
 * @param {string} [options.sessionToken] - Sent (and signed) as `x-amz-security-token` when set.
 * @param {string} options.region - Region used in the credential scope.
 * @param {string} [options.endpoint] - Base URL of a custom S3-compatible endpoint.
 * @param {boolean} [options.pathStyle=false] - With `endpoint`: put the bucket in the path instead of the host.
 * @returns {{reader: Function, writer: Function, deleter: Function}}
 */
function s3SignedResolver({ accessKeyId, secretAccessKey, sessionToken, region, endpoint, pathStyle = false }) {
  // Normalise the endpoint so it ends with exactly one trailing slash.
  const ep = endpoint ? new URL(endpoint.replace(/\/$/, "") + "/") : void 0;

  /** Translate an `s3://bucket/key` or `s3a://bucket/key` URL into the HTTP(S) URL to fetch. */
  function toHttps(url) {
    if (!url.startsWith("s3://") && !url.startsWith("s3a://")) return url;
    const rest = url.slice(url.indexOf("://") + 3);
    const slash = rest.indexOf("/");
    if (slash === -1) throw new Error(`invalid S3 URL: ${url}`);
    const bucket = rest.slice(0, slash);
    const key = rest.slice(slash + 1);
    if (ep) {
      if (pathStyle) return `${ep.origin}${ep.pathname}${bucket}/${key}`;
      return `${ep.protocol}//${bucket}.${ep.host}/${key}`;
    }
    return `https://${bucket}.s3.amazonaws.com/${key}`;
  }

  /**
   * Compute the signed header set for one request.
   * Returns every signed header except `host` (fetch supplies it) plus `Authorization`.
   */
  async function signRequest(method, url, body, extra = {}) {
    const u = new URL(url);
    // ISO-8601 basic format (YYYYMMDDTHHMMSSZ): strip dashes, colons and milliseconds.
    const xAmzDate = (/* @__PURE__ */ new Date()).toISOString().replace(/[-:]|\.\d{3}/g, "");
    const dStamp = xAmzDate.slice(0, 8);
    const payloadHash = body !== void 0 ? await sha256hex(body) : await sha256hex("");
    const lc = {};
    for (const [k, v] of Object.entries(extra)) lc[k.toLowerCase()] = String(v);
    lc["host"] = u.host;
    lc["x-amz-date"] = xAmzDate;
    lc["x-amz-content-sha256"] = payloadHash;
    if (sessionToken) lc["x-amz-security-token"] = sessionToken;
    const sortedKeys = Object.keys(lc).sort();
    const canonicalHeaders = sortedKeys.map((k) => `${k}:${lc[k].trim().replace(/\s+/g, " ")}\n`).join("");
    const signedHeaders = sortedKeys.join(";");
    const canonicalRequest = [
      method,
      // Re-encode each path segment so the canonical URI uses strict RFC 3986 escaping.
      u.pathname.split("/").map((seg) => encodeRfc3986(decodeURIComponent(seg))).join("/"),
      [...u.searchParams.entries()].sort((a, b) => {
        if (a[0] !== b[0]) return a[0] < b[0] ? -1 : 1;
        return a[1] < b[1] ? -1 : a[1] > b[1] ? 1 : 0;
      }).map(([k, v]) => `${encodeRfc3986(k)}=${encodeRfc3986(v)}`).join("&"),
      canonicalHeaders,
      signedHeaders,
      payloadHash
    ].join("\n");
    const credentialScope = `${dStamp}/${region}/s3/aws4_request`;
    const stringToSign = [
      "AWS4-HMAC-SHA256",
      xAmzDate,
      credentialScope,
      await sha256hex(canonicalRequest)
    ].join("\n");
    const signature = bytesToHex(await hmac(await deriveSigningKey(secretAccessKey, dStamp, region, "s3"), stringToSign));
    const out = {};
    for (const [k, v] of Object.entries(lc)) {
      if (k === "host") continue;
      out[k] = v;
    }
    out["Authorization"] = `AWS4-HMAC-SHA256 Credential=${accessKeyId}/${credentialScope}, SignedHeaders=${signedHeaders}, Signature=${signature}`;
    return out;
  }

  return {
    /**
     * Open an object for ranged reads.
     * @param {string} path - Object URL (`s3://`, `s3a://` or HTTP(S)).
     * @param {number} [byteLength] - Known object size; when omitted it is fetched with a HEAD request.
     * @returns {Promise<{byteLength: number, slice: (start: number, end?: number) => Promise<ArrayBuffer>}>}
     * @throws {Error} When the HEAD request fails or reports no usable Content-Length.
     */
    async reader(path, byteLength) {
      const url = toHttps(path);
      let len = byteLength;
      if (len === void 0) {
        const headers = await signRequest("HEAD", url);
        const res = await fetch(url, {
          method: "HEAD",
          headers
        });
        if (!res.ok) throw new Error(`HEAD ${path}: ${res.status} ${res.statusText}`);
        // headers.get() yields null for an absent header and Number(null) is 0, so the
        // raw value must be checked before converting; otherwise a missing
        // Content-Length would be mistaken for an empty object.
        const rawLength = res.headers.get("content-length");
        len = rawLength === null || rawLength.trim() === "" ? NaN : Number(rawLength);
        if (!Number.isFinite(len) || len < 0) throw new Error(`HEAD ${path}: missing Content-Length`);
      }
      const fileLength = len;
      return {
        byteLength: fileLength,
        async slice(start, end) {
          // HTTP ranges are inclusive, hence the -1 on the exclusive end offset.
          const range = `bytes=${start}-${(end ?? fileLength) - 1}`;
          const headers = await signRequest("GET", url, void 0, { range });
          const res = await fetch(url, {
            method: "GET",
            headers
          });
          if (!res.ok) throw new Error(`GET ${path} ${range}: ${res.status} ${res.statusText}`);
          return await res.arrayBuffer();
        }
      };
    },
    /**
     * Create a buffering writer whose `finish()` uploads the bytes with a single PUT.
     * @param {string} path - Destination object URL.
     * @param {{ifNoneMatch?: string}} [options] - `ifNoneMatch` is sent as an `if-none-match` header.
     * @returns {ByteWriter} Writer; `finish()` rejects with an error carrying `status` on a failed PUT.
     */
    writer(path, options) {
      const w = new ByteWriter();
      w.finish = async function() {
        const url = toHttps(path);
        const body = w.getBytes().slice();
        const extra = {};
        if (options?.ifNoneMatch) extra["if-none-match"] = options.ifNoneMatch;
        const headers = await signRequest("PUT", url, body, extra);
        const res = await fetch(url, {
          method: "PUT",
          headers,
          body
        });
        if (!res.ok) {
          const err = /* @__PURE__ */ new Error(`PUT ${path}: ${res.status} ${res.statusText}`);
          // Expose the HTTP status so callers can inspect it (e.g. 409/412 conflicts).
          err.status = res.status;
          throw err;
        }
      };
      return w;
    },
    /**
     * Delete an object; a 404 response is treated as success.
     * @param {string} path - Object URL.
     * @returns {Promise<void>}
     * @throws {Error} On any other non-OK response.
     */
    async deleter(path) {
      const url = toHttps(path);
      const headers = await signRequest("DELETE", url);
      const res = await fetch(url, {
        method: "DELETE",
        headers
      });
      if (!res.ok && res.status !== 404) throw new Error(`DELETE ${path}: ${res.status} ${res.statusText}`);
    }
  };
}
|
|
3666
|
+
/**
 * Compute the SHA-256 digest of `data` as a lowercase hex string.
 *
 * @param {string | BufferSource} data - Strings are encoded with the shared text encoder first.
 * @returns {Promise<string>} Hex-encoded digest.
 */
async function sha256hex(data) {
  let input = data;
  if (typeof input === "string") input = enc.encode(input);
  const digest = await crypto.subtle.digest("SHA-256", input);
  return bytesToHex(new Uint8Array(digest));
}
|
|
3671
|
+
/**
 * Compute an HMAC-SHA-256 signature.
 *
 * @param {string | BufferSource} key - Secret key; strings are encoded with the shared text encoder.
 * @param {string | BufferSource} data - Message to sign; strings are encoded the same way.
 * @returns {Promise<Uint8Array>} Raw signature bytes.
 */
async function hmac(key, data) {
  const asBytes = (value) => (typeof value === "string" ? enc.encode(value) : value);
  const rawKey = asBytes(key);
  const message = asBytes(data);
  const algorithm = { name: "HMAC", hash: "SHA-256" };
  const signingKey = await crypto.subtle.importKey("raw", rawKey, algorithm, false, ["sign"]);
  const signature = await crypto.subtle.sign("HMAC", signingKey, message);
  return new Uint8Array(signature);
}
|
|
3681
|
+
/**
 * Derive a signing key by chaining `hmac` calls: starting from `"AWS4" + secret`,
 * each step signs the next scope component (date stamp, region, service,
 * then the literal "aws4_request") with the previous result as the key.
 *
 * @param {string} secret - Secret access key.
 * @param {string} dateStamp - Date component of the credential scope.
 * @param {string} region - Region component of the credential scope.
 * @param {string} service - Service component of the credential scope.
 * @returns {Promise<Uint8Array>} Result of the final `hmac` call.
 */
async function deriveSigningKey(secret, dateStamp, region, service) {
  const scopeParts = [dateStamp, region, service, "aws4_request"];
  let key = `AWS4${secret}`;
  for (const part of scopeParts) {
    key = await hmac(key, part);
  }
  return key;
}
|
|
3684
|
+
/**
 * Percent-encode a string like `encodeURIComponent`, and additionally escape
 * the characters `! * ' ( )` that `encodeURIComponent` leaves untouched.
 *
 * @param {string} str - Text to encode.
 * @returns {string} Encoded text with uppercase hex escapes for the extra characters.
 */
function encodeRfc3986(str) {
  const escapeChar = (ch) => "%" + ch.charCodeAt(0).toString(16).toUpperCase();
  const encoded = encodeURIComponent(str);
  return encoded.replace(/[!*'()]/g, escapeChar);
}
|
|
3687
|
+
/**
 * Render a sequence of byte values as a lowercase hex string, two digits per byte.
 *
 * @param {Iterable<number>} bytes - Byte values to format.
 * @returns {string} Concatenated hex digits.
 */
function bytesToHex(bytes) {
  const pairs = [];
  for (const byte of bytes) {
    pairs.push(byte.toString(16).padStart(2, "0"));
  }
  return pairs.join("");
}
|
|
3692
|
+
// Picks a "yield to the event loop" helper for the current runtime: setImmediate when
// available, otherwise a MessageChannel round-trip, otherwise setTimeout(0).
// The selected helper is not assigned to anything here, so its value is discarded.
(() => {
  const hasSetImmediate = typeof setImmediate === "function";
  if (hasSetImmediate) {
    return () => new Promise((done) => setImmediate(done));
  }
  if (typeof MessageChannel === "undefined") {
    return () => new Promise((done) => setTimeout(done, 0));
  }
  const channel = new MessageChannel();
  const waiters = [];
  // Each message received on port1 releases the oldest pending waiter.
  channel.port1.onmessage = () => {
    const next = waiters.shift();
    if (next) next();
  };
  return () => new Promise((done) => {
    waiters.push(done);
    channel.port2.postMessage(0);
  });
})();
|
|
3708
|
+
export { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver };
|