lakesync 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +74 -0
- package/dist/adapter.d.ts +369 -0
- package/dist/adapter.js +39 -0
- package/dist/adapter.js.map +1 -0
- package/dist/analyst.d.ts +268 -0
- package/dist/analyst.js +495 -0
- package/dist/analyst.js.map +1 -0
- package/dist/auth-CAVutXzx.d.ts +30 -0
- package/dist/base-poller-Qo_SmCZs.d.ts +82 -0
- package/dist/catalogue.d.ts +65 -0
- package/dist/catalogue.js +17 -0
- package/dist/catalogue.js.map +1 -0
- package/dist/chunk-4ARO6KTJ.js +257 -0
- package/dist/chunk-4ARO6KTJ.js.map +1 -0
- package/dist/chunk-5YOFCJQ7.js +1115 -0
- package/dist/chunk-5YOFCJQ7.js.map +1 -0
- package/dist/chunk-7D4SUZUM.js +38 -0
- package/dist/chunk-7D4SUZUM.js.map +1 -0
- package/dist/chunk-BNJOGBYK.js +335 -0
- package/dist/chunk-BNJOGBYK.js.map +1 -0
- package/dist/chunk-ICNT7I3K.js +1180 -0
- package/dist/chunk-ICNT7I3K.js.map +1 -0
- package/dist/chunk-P5DRFKIT.js +413 -0
- package/dist/chunk-P5DRFKIT.js.map +1 -0
- package/dist/chunk-X3RO5SYJ.js +880 -0
- package/dist/chunk-X3RO5SYJ.js.map +1 -0
- package/dist/client.d.ts +428 -0
- package/dist/client.js +2048 -0
- package/dist/client.js.map +1 -0
- package/dist/compactor.d.ts +342 -0
- package/dist/compactor.js +793 -0
- package/dist/compactor.js.map +1 -0
- package/dist/coordinator-CxckTzYW.d.ts +396 -0
- package/dist/db-types-BR6Kt4uf.d.ts +29 -0
- package/dist/gateway-D5SaaMvT.d.ts +337 -0
- package/dist/gateway-server.d.ts +306 -0
- package/dist/gateway-server.js +4663 -0
- package/dist/gateway-server.js.map +1 -0
- package/dist/gateway.d.ts +196 -0
- package/dist/gateway.js +79 -0
- package/dist/gateway.js.map +1 -0
- package/dist/hlc-DiD8QNG3.d.ts +70 -0
- package/dist/index.d.ts +245 -0
- package/dist/index.js +102 -0
- package/dist/index.js.map +1 -0
- package/dist/json-dYtqiL0F.d.ts +18 -0
- package/dist/nessie-client-DrNikVXy.d.ts +160 -0
- package/dist/parquet.d.ts +78 -0
- package/dist/parquet.js +15 -0
- package/dist/parquet.js.map +1 -0
- package/dist/proto.d.ts +434 -0
- package/dist/proto.js +67 -0
- package/dist/proto.js.map +1 -0
- package/dist/react.d.ts +147 -0
- package/dist/react.js +224 -0
- package/dist/react.js.map +1 -0
- package/dist/resolver-C3Wphi6O.d.ts +10 -0
- package/dist/result-CojzlFE2.d.ts +64 -0
- package/dist/src-QU2YLPZY.js +383 -0
- package/dist/src-QU2YLPZY.js.map +1 -0
- package/dist/src-WYBF5LOI.js +102 -0
- package/dist/src-WYBF5LOI.js.map +1 -0
- package/dist/src-WZNPHANQ.js +426 -0
- package/dist/src-WZNPHANQ.js.map +1 -0
- package/dist/types-Bs-QyOe-.d.ts +143 -0
- package/dist/types-DAQL_vU_.d.ts +118 -0
- package/dist/types-DSC_EiwR.d.ts +45 -0
- package/dist/types-V_jVu2sA.d.ts +73 -0
- package/package.json +119 -0
|
@@ -0,0 +1,793 @@
|
|
|
1
|
+
import {
|
|
2
|
+
encodeSyncResponse
|
|
3
|
+
} from "./chunk-BNJOGBYK.js";
|
|
4
|
+
import {
|
|
5
|
+
readParquetToDeltas,
|
|
6
|
+
writeDeltasToParquet
|
|
7
|
+
} from "./chunk-4ARO6KTJ.js";
|
|
8
|
+
import {
|
|
9
|
+
Err,
|
|
10
|
+
FlushError,
|
|
11
|
+
HLC,
|
|
12
|
+
LakeSyncError,
|
|
13
|
+
Ok,
|
|
14
|
+
rowKey
|
|
15
|
+
} from "./chunk-ICNT7I3K.js";
|
|
16
|
+
import "./chunk-7D4SUZUM.js";
|
|
17
|
+
|
|
18
|
+
// ../compactor/src/checkpoint-generator.ts
// Flush a checkpoint chunk once roughly 16 MiB of estimated payload has accumulated.
var DEFAULT_CHECKPOINT_CONFIG = { chunkBytes: 16 * 1024 * 1024 };
// Rough per-delta size estimates used only for chunk-threshold accounting,
// not for the actual encoded size.
var ESTIMATED_BASE_BYTES = 200;
var ESTIMATED_BYTES_PER_COLUMN = 50;
|
|
24
|
+
var CheckpointGenerator = class {
  adapter;
  gatewayId;
  config;
  /**
   * Create a new CheckpointGenerator.
   *
   * @param adapter - Lake adapter used for object reads/writes
   * @param _schema - Unused; retained for signature compatibility
   * @param gatewayId - Namespaces all checkpoint keys under `checkpoints/<gatewayId>`
   * @param config - Optional chunking config; defaults to DEFAULT_CHECKPOINT_CONFIG
   */
  constructor(adapter, _schema, gatewayId, config) {
    this.adapter = adapter;
    this.gatewayId = gatewayId;
    this.config = config ?? DEFAULT_CHECKPOINT_CONFIG;
  }
  /**
   * Generate checkpoint chunks from base Parquet files.
   *
   * Reads each base file sequentially, accumulates deltas, and flushes
   * chunks when the estimated byte size exceeds the configured threshold.
   * A manifest.json describing all chunks is written last.
   *
   * @param baseFileKeys - Storage keys of the base Parquet files
   * @param snapshotHlc - The HLC timestamp representing this snapshot point
   * @returns A Result containing the CheckpointResult, or a LakeSyncError on failure
   */
  async generate(baseFileKeys, snapshotHlc) {
    if (baseFileKeys.length === 0) {
      return Ok({ chunksWritten: 0, bytesWritten: 0, snapshotHlc });
    }
    const prefix = `checkpoints/${this.gatewayId}`;
    const chunkNames = [];
    let totalBytesWritten = 0;
    let totalDeltas = 0;
    let accumulator = [];
    let accumulatedBytes = 0;
    // Flush the current accumulator as the next chunk, record bookkeeping,
    // and reset the accumulator. Returns the failed Result on error, or
    // null on success. (Previously this logic was duplicated inside the
    // loop and after it.)
    const flushAccumulated = async () => {
      const flushResult = await this.flushChunk(
        prefix,
        chunkNames.length,
        accumulator,
        snapshotHlc
      );
      if (!flushResult.ok) return flushResult;
      totalBytesWritten += flushResult.value;
      totalDeltas += accumulator.length;
      chunkNames.push(this.chunkFileName(chunkNames.length));
      accumulator = [];
      accumulatedBytes = 0;
      return null;
    };
    for (const key of baseFileKeys) {
      const getResult = await this.adapter.getObject(key);
      if (!getResult.ok) {
        return Err(
          new LakeSyncError(
            `Failed to read base file: ${key}`,
            "CHECKPOINT_READ_ERROR",
            getResult.error
          )
        );
      }
      const parseResult = await readParquetToDeltas(getResult.value);
      if (!parseResult.ok) {
        return Err(
          new LakeSyncError(
            `Failed to parse base file: ${key}`,
            "CHECKPOINT_PARSE_ERROR",
            parseResult.error
          )
        );
      }
      for (const delta of parseResult.value) {
        accumulator.push(delta);
        // Estimated size only; actual encoded chunk size may differ.
        accumulatedBytes += ESTIMATED_BASE_BYTES + delta.columns.length * ESTIMATED_BYTES_PER_COLUMN;
        if (accumulatedBytes >= this.config.chunkBytes) {
          const failure = await flushAccumulated();
          if (failure) return failure;
        }
      }
    }
    // Flush any remaining deltas as a final (possibly undersized) chunk.
    if (accumulator.length > 0) {
      const failure = await flushAccumulated();
      if (failure) return failure;
    }
    const manifest = {
      snapshotHlc: snapshotHlc.toString(),
      generatedAt: (/* @__PURE__ */ new Date()).toISOString(),
      chunkCount: chunkNames.length,
      totalDeltas,
      chunks: chunkNames
    };
    const manifestBytes = new TextEncoder().encode(JSON.stringify(manifest));
    const manifestResult = await this.adapter.putObject(
      `${prefix}/manifest.json`,
      manifestBytes,
      "application/json"
    );
    if (!manifestResult.ok) {
      return Err(
        new LakeSyncError(
          "Failed to write checkpoint manifest",
          "CHECKPOINT_WRITE_ERROR",
          manifestResult.error
        )
      );
    }
    totalBytesWritten += manifestBytes.byteLength;
    return Ok({
      chunksWritten: chunkNames.length,
      bytesWritten: totalBytesWritten,
      snapshotHlc
    });
  }
  /**
   * Get all storage keys produced by a checkpoint generation.
   * Useful for adding to activeKeys in maintenance to prevent orphan removal.
   *
   * @param chunkCount - Number of chunks written by the last generation
   * @returns The manifest key followed by every chunk key
   */
  getCheckpointKeys(chunkCount) {
    const prefix = `checkpoints/${this.gatewayId}`;
    const keys = [`${prefix}/manifest.json`];
    for (let i = 0; i < chunkCount; i++) {
      keys.push(`${prefix}/${this.chunkFileName(i)}`);
    }
    return keys;
  }
  /** Zero-padded chunk file name, e.g. index 7 -> "chunk-007.bin". */
  chunkFileName(index) {
    return `chunk-${String(index).padStart(3, "0")}.bin`;
  }
  /**
   * Encode one batch of deltas as a sync-response payload and store it.
   *
   * @returns A Result containing the number of bytes written for this chunk
   */
  async flushChunk(prefix, index, deltas, snapshotHlc) {
    const encodeResult = encodeSyncResponse({
      deltas,
      serverHlc: snapshotHlc,
      hasMore: false
    });
    if (!encodeResult.ok) {
      return Err(
        new LakeSyncError(
          `Failed to encode checkpoint chunk ${index}`,
          "CHECKPOINT_ENCODE_ERROR",
          encodeResult.error
        )
      );
    }
    const data = encodeResult.value;
    const chunkKey = `${prefix}/${this.chunkFileName(index)}`;
    const putResult = await this.adapter.putObject(chunkKey, data, "application/octet-stream");
    if (!putResult.ok) {
      return Err(
        new LakeSyncError(
          `Failed to write checkpoint chunk: ${chunkKey}`,
          "CHECKPOINT_WRITE_ERROR",
          putResult.error
        )
      );
    }
    return Ok(data.byteLength);
  }
};
|
|
179
|
+
|
|
180
|
+
// ../compactor/src/equality-delete.ts
// Synthetic schema used when serializing equality deletes: only the
// built-in table/rowId fields are written, no user-defined columns.
var EQUALITY_DELETE_SCHEMA = {
  table: "_equality_delete",
  columns: []
};
// Placeholder HLC (wall time 0, counter 0) attached to the synthetic
// DELETE deltas emitted by writeEqualityDeletes.
var SENTINEL_HLC = HLC.encode(0, 0);
|
|
186
|
+
/**
 * Serialize deleted row identities as an equality-delete Parquet payload.
 *
 * Each (table, rowId) pair is wrapped in a synthetic DELETE delta carrying
 * the sentinel HLC and a "_compactor" client id, then written with the
 * bare equality-delete schema.
 *
 * @param deletedRows - Array of { table, rowId } identities to record
 * @param _schema - Unused; retained for signature compatibility
 * @returns A Result with the Parquet bytes (empty Uint8Array for no rows),
 *   or a FlushError on failure
 */
async function writeEqualityDeletes(deletedRows, _schema) {
  if (deletedRows.length === 0) {
    return Ok(new Uint8Array(0));
  }
  try {
    const synthetic = [];
    for (let i = 0; i < deletedRows.length; i++) {
      const row = deletedRows[i];
      synthetic.push({
        op: "DELETE",
        table: row.table,
        rowId: row.rowId,
        clientId: "_compactor",
        columns: [],
        hlc: SENTINEL_HLC,
        deltaId: `eq-delete-${i}`
      });
    }
    return await writeDeltasToParquet(synthetic, EQUALITY_DELETE_SCHEMA);
  } catch (err) {
    const cause = err instanceof Error ? err : new Error(String(err));
    return Err(new FlushError(`Failed to write equality deletes: ${cause.message}`, cause));
  }
}
|
|
206
|
+
/**
 * Parse an equality-delete payload back into row identities.
 *
 * The inverse of writeEqualityDeletes: decodes the Parquet bytes and
 * projects each delta down to its { table, rowId } pair.
 *
 * @param data - Parquet bytes; a zero-length buffer yields an empty list
 * @returns A Result with the deleted-row identities, or a FlushError on failure
 */
async function readEqualityDeletes(data) {
  if (data.byteLength === 0) {
    return Ok([]);
  }
  const readResult = await readParquetToDeltas(data);
  if (!readResult.ok) {
    return Err(
      new FlushError(
        `Failed to read equality deletes: ${readResult.error.message}`,
        readResult.error
      )
    );
  }
  const rows = [];
  for (const delta of readResult.value) {
    rows.push({ table: delta.table, rowId: delta.rowId });
  }
  return Ok(rows);
}
|
|
225
|
+
|
|
226
|
+
// ../compactor/src/compactor.ts
// Merges many small delta Parquet files into consolidated base files plus
// equality-delete files, resolving conflicts per column with last-writer-wins
// (LWW) ordered by HLC.
var Compactor = class {
  adapter;
  config;
  schema;
  /**
   * Create a new Compactor instance.
   *
   * @param adapter - The lake adapter for reading/writing Parquet files
   * @param config - Compaction configuration (thresholds and limits)
   * @param schema - The table schema describing user-defined columns
   */
  constructor(adapter, config, schema) {
    this.adapter = adapter;
    this.config = config;
    this.schema = schema;
  }
  /**
   * Compact delta files into base data files.
   *
   * Reads delta files from storage, resolves all deltas per row using LWW,
   * and writes consolidated base files + equality delete files.
   *
   * Fewer than `config.minDeltaFiles` inputs is a no-op (all-zero result,
   * not an error); at most `config.maxDeltaFiles` inputs are consumed per run.
   *
   * @param deltaFileKeys - Storage keys of the delta Parquet files to compact
   * @param outputPrefix - Prefix for the output base/delete file keys
   * @returns A Result containing the CompactionResult, or a LakeSyncError on failure
   */
  async compact(deltaFileKeys, outputPrefix) {
    if (deltaFileKeys.length < this.config.minDeltaFiles) {
      return Ok({
        baseFilesWritten: 0,
        deleteFilesWritten: 0,
        deltaFilesCompacted: 0,
        bytesRead: 0,
        bytesWritten: 0
      });
    }
    // Cap the batch; keys beyond maxDeltaFiles are left for a later run.
    const keysToCompact = deltaFileKeys.slice(0, this.config.maxDeltaFiles);
    const resolveResult = await this.readAndResolveIncrementally(keysToCompact);
    if (!resolveResult.ok) return resolveResult;
    const { liveRows, deletedRows, bytesRead } = resolveResult.value;
    const writeResult = await this.writeOutputFiles(liveRows, deletedRows, outputPrefix);
    if (!writeResult.ok) return writeResult;
    return Ok({
      ...writeResult.value,
      deltaFilesCompacted: keysToCompact.length,
      bytesRead
    });
  }
  /**
   * Read delta files one at a time and incrementally resolve to final row state.
   *
   * Memory usage is O(unique rows x columns) rather than O(total deltas),
   * since each file's deltas are processed and discarded before reading the next.
   *
   * Resolution rules (all comparisons via HLC.compare):
   * - per row, track the max HLC seen (`latestHlc`) and its deltaId/clientId;
   * - DELETE deltas only raise `deleteHlc` (the newest delete seen);
   * - non-DELETE deltas update each column iff the delta's HLC is newer
   *   than that column's recorded HLC (column-level LWW);
   * - a row is emitted as deleted unless some column write is newer than
   *   the delete (which "revives" the row), and rows with no surviving
   *   columns are also treated as deleted;
   * - live rows are materialized as INSERT deltas containing only schema
   *   columns whose value outlived the delete (or all, when never deleted).
   */
  async readAndResolveIncrementally(keysToCompact) {
    // rowKey(table, rowId) -> accumulated LWW state for that row.
    const rowStates = /* @__PURE__ */ new Map();
    let bytesRead = 0;
    for (const key of keysToCompact) {
      const getResult = await this.adapter.getObject(key);
      if (!getResult.ok) {
        return Err(
          new LakeSyncError(
            `Failed to read delta file: ${key}`,
            "COMPACTION_READ_ERROR",
            getResult.error
          )
        );
      }
      const data = getResult.value;
      bytesRead += data.byteLength;
      const parseResult = await readParquetToDeltas(data);
      if (!parseResult.ok) {
        return Err(
          new LakeSyncError(
            `Failed to parse delta file: ${key}`,
            "COMPACTION_PARSE_ERROR",
            parseResult.error
          )
        );
      }
      for (const delta of parseResult.value) {
        const k = rowKey(delta.table, delta.rowId);
        let state = rowStates.get(k);
        if (!state) {
          // First delta for this row: 0n sentinels mean "nothing seen yet".
          state = {
            table: delta.table,
            rowId: delta.rowId,
            clientId: delta.clientId,
            columns: /* @__PURE__ */ new Map(),
            latestHlc: 0n,
            latestDeltaId: delta.deltaId,
            deleteHlc: 0n
          };
          rowStates.set(k, state);
        }
        // Track the overall newest delta for this row (any op).
        if (HLC.compare(delta.hlc, state.latestHlc) > 0) {
          state.latestHlc = delta.hlc;
          state.latestDeltaId = delta.deltaId;
          state.clientId = delta.clientId;
        }
        if (delta.op === "DELETE") {
          // Keep only the newest delete timestamp.
          if (HLC.compare(delta.hlc, state.deleteHlc) > 0) {
            state.deleteHlc = delta.hlc;
          }
        } else {
          // Column-level LWW: newer HLC wins per column.
          for (const col of delta.columns) {
            const existing = state.columns.get(col.column);
            if (!existing || HLC.compare(delta.hlc, existing.hlc) > 0) {
              state.columns.set(col.column, {
                value: col.value,
                hlc: delta.hlc
              });
            }
          }
        }
      }
    }
    // Second pass: classify each resolved row as live or deleted.
    const liveRows = [];
    const deletedRows = [];
    for (const [, state] of rowStates) {
      // deleteHlc > 0n means at least one DELETE was observed.
      let isDeleted = state.deleteHlc > 0n;
      if (isDeleted) {
        // Any column write newer than the delete revives the row.
        for (const col of state.columns.values()) {
          if (HLC.compare(state.deleteHlc, col.hlc) < 0) {
            isDeleted = false;
            break;
          }
        }
      }
      if (isDeleted || state.columns.size === 0) {
        deletedRows.push({ table: state.table, rowId: state.rowId });
      } else {
        // Emit only schema columns whose value survives the (older) delete.
        const columns = [];
        for (const col of this.schema.columns) {
          const colState = state.columns.get(col.name);
          if (colState && (state.deleteHlc === 0n || HLC.compare(colState.hlc, state.deleteHlc) > 0)) {
            columns.push({ column: col.name, value: colState.value });
          }
        }
        liveRows.push({
          op: "INSERT",
          table: state.table,
          rowId: state.rowId,
          clientId: state.clientId,
          columns,
          hlc: state.latestHlc,
          deltaId: state.latestDeltaId
        });
      }
    }
    return Ok({ liveRows, deletedRows, bytesRead });
  }
  /** Write base Parquet file(s) for live rows and equality delete file(s) for deleted rows. */
  async writeOutputFiles(liveRows, deletedRows, outputPrefix) {
    let bytesWritten = 0;
    let baseFilesWritten = 0;
    let deleteFilesWritten = 0;
    if (liveRows.length > 0) {
      const writeResult = await writeDeltasToParquet(liveRows, this.schema);
      if (!writeResult.ok) {
        return Err(
          new LakeSyncError(
            "Failed to write base file",
            "COMPACTION_WRITE_ERROR",
            writeResult.error
          )
        );
      }
      const baseData = writeResult.value;
      const timestamp = this.generateTimestamp();
      const basePath = `${outputPrefix}/base-${timestamp}.parquet`;
      const putResult = await this.adapter.putObject(
        basePath,
        baseData,
        "application/octet-stream"
      );
      if (!putResult.ok) {
        return Err(
          new LakeSyncError(
            `Failed to store base file: ${basePath}`,
            "COMPACTION_STORE_ERROR",
            putResult.error
          )
        );
      }
      bytesWritten += baseData.byteLength;
      baseFilesWritten = 1;
    }
    if (deletedRows.length > 0) {
      const writeResult = await writeEqualityDeletes(deletedRows, this.schema);
      if (!writeResult.ok) {
        return Err(
          new LakeSyncError(
            "Failed to write equality delete file",
            "COMPACTION_WRITE_ERROR",
            writeResult.error
          )
        );
      }
      const deleteData = writeResult.value;
      const timestamp = this.generateTimestamp();
      const deletePath = `${outputPrefix}/delete-${timestamp}.parquet`;
      const putResult = await this.adapter.putObject(
        deletePath,
        deleteData,
        "application/octet-stream"
      );
      if (!putResult.ok) {
        return Err(
          new LakeSyncError(
            `Failed to store delete file: ${deletePath}`,
            "COMPACTION_STORE_ERROR",
            putResult.error
          )
        );
      }
      bytesWritten += deleteData.byteLength;
      deleteFilesWritten = 1;
    }
    return Ok({ baseFilesWritten, deleteFilesWritten, bytesWritten });
  }
  /**
   * Generate a timestamp string for output file naming.
   * Uses the current wall clock time with a random suffix for uniqueness.
   * NOTE: Math.random is not cryptographically secure; the suffix only
   * reduces the chance of same-millisecond name collisions.
   */
  generateTimestamp() {
    const now = Date.now();
    const suffix = Math.random().toString(36).slice(2, 8);
    return `${now}-${suffix}`;
  }
};
|
|
458
|
+
|
|
459
|
+
// ../compactor/src/maintenance.ts
var DEFAULT_MAINTENANCE_CONFIG = {
  retainSnapshots: 5,
  // Unreferenced files younger than one hour are never deleted, to avoid
  // racing in-progress flushes.
  orphanAgeMs: 3_600_000
};
|
|
465
|
+
var MaintenanceRunner = class {
  compactor;
  adapter;
  config;
  checkpointGenerator;
  /**
   * Create a new MaintenanceRunner.
   *
   * @param compactor - Compactor used to merge delta files on each run
   * @param adapter - Lake adapter used for listing and deleting objects
   * @param config - Retention settings (`retainSnapshots`, `orphanAgeMs`)
   * @param checkpointGenerator - Optional; when provided, a checkpoint is
   *   generated after a compaction that produced base files
   */
  constructor(compactor, adapter, config, checkpointGenerator) {
    this.compactor = compactor;
    this.adapter = adapter;
    this.config = config;
    this.checkpointGenerator = checkpointGenerator ?? null;
  }
  /**
   * Run one full maintenance cycle: compact, checkpoint (optional), clean.
   *
   * Compacts delta files into base/delete files, then removes orphaned
   * storage objects no longer referenced by any active data. Files younger
   * than `orphanAgeMs` are never deleted (see removeOrphans).
   *
   * @param deltaFileKeys - Storage keys of the delta Parquet files to compact
   * @param outputPrefix - Prefix for the output base/delete file keys
   * @param storagePrefix - Prefix under which all related storage files live
   * @returns A Result containing the MaintenanceReport, or a LakeSyncError on failure
   */
  async run(deltaFileKeys, outputPrefix, storagePrefix) {
    const compacted = await this.compactor.compact(deltaFileKeys, outputPrefix);
    if (!compacted.ok) {
      return Err(
        new LakeSyncError(
          `Maintenance compaction failed: ${compacted.error.message}`,
          "MAINTENANCE_COMPACTION_ERROR",
          compacted.error
        )
      );
    }
    const compaction = compacted.value;
    // Everything that must survive orphan removal: delta files NOT consumed
    // by this compaction, every current output file, and (below) any freshly
    // written checkpoint files.
    const activeKeys = new Set(deltaFileKeys.slice(compaction.deltaFilesCompacted));
    const outputListing = await this.adapter.listObjects(outputPrefix);
    if (!outputListing.ok) {
      return Err(
        new LakeSyncError(
          `Failed to list output files: ${outputListing.error.message}`,
          "MAINTENANCE_LIST_ERROR",
          outputListing.error
        )
      );
    }
    for (const { key } of outputListing.value) {
      activeKeys.add(key);
    }
    let checkpoint;
    if (this.checkpointGenerator && compaction.baseFilesWritten > 0) {
      const baseFileKeys = outputListing.value
        .filter((obj) => obj.key.endsWith(".parquet") && obj.key.includes("/base-"))
        .map((obj) => obj.key);
      if (baseFileKeys.length > 0) {
        const snapshotHlc = HLC.encode(Date.now(), 0);
        const generated = await this.checkpointGenerator.generate(baseFileKeys, snapshotHlc);
        // A failed checkpoint generation is non-fatal: maintenance proceeds
        // and the report simply carries no checkpoint.
        if (generated.ok) {
          checkpoint = generated.value;
          for (const key of this.checkpointGenerator.getCheckpointKeys(checkpoint.chunksWritten)) {
            activeKeys.add(key);
          }
        }
      }
    }
    const orphans = await this.removeOrphans(storagePrefix, activeKeys);
    if (!orphans.ok) {
      return Err(
        new LakeSyncError(
          `Maintenance orphan removal failed: ${orphans.error.message}`,
          "MAINTENANCE_ORPHAN_ERROR",
          orphans.error
        )
      );
    }
    return Ok({
      compaction,
      snapshotsExpired: 0,
      orphansRemoved: orphans.value,
      checkpoint
    });
  }
  /**
   * Delete orphaned files not referenced by any active data.
   *
   * Lists everything under `storagePrefix`, keeps anything in `activeKeys`
   * or younger than `orphanAgeMs` (guards against racing in-progress
   * flushes), and deletes the rest in one batch.
   *
   * @param storagePrefix - The storage prefix to scan for orphaned files
   * @param activeKeys - Set of storage keys that must be retained
   * @returns A Result containing the count of deleted files, or a LakeSyncError on failure
   */
  async removeOrphans(storagePrefix, activeKeys) {
    const listing = await this.adapter.listObjects(storagePrefix);
    if (!listing.ok) {
      return Err(
        new LakeSyncError(
          `Failed to list objects for orphan removal: ${listing.error.message}`,
          "MAINTENANCE_LIST_ERROR",
          listing.error
        )
      );
    }
    const orphanKeys = this.findOrphans(listing.value, activeKeys, Date.now());
    if (orphanKeys.length === 0) {
      return Ok(0);
    }
    const deletion = await this.adapter.deleteObjects(orphanKeys);
    if (!deletion.ok) {
      return Err(
        new LakeSyncError(
          `Failed to delete orphaned files: ${deletion.error.message}`,
          "MAINTENANCE_DELETE_ERROR",
          deletion.error
        )
      );
    }
    return Ok(orphanKeys.length);
  }
  /**
   * Identify orphaned file keys: objects that are neither in the active
   * set nor newer than the configured orphan age.
   */
  findOrphans(objects, activeKeys, now) {
    return objects
      .filter((obj) => !activeKeys.has(obj.key))
      .filter((obj) => now - obj.lastModified.getTime() >= this.config.orphanAgeMs)
      .map((obj) => obj.key);
  }
};
|
|
622
|
+
|
|
623
|
+
// ../compactor/src/scheduler.ts
// Run maintenance once a minute by default.
var DEFAULT_SCHEDULER_CONFIG = { intervalMs: 60_000, enabled: true };
|
|
628
|
+
var CompactionScheduler = class {
|
|
629
|
+
runner;
|
|
630
|
+
taskProvider;
|
|
631
|
+
config;
|
|
632
|
+
timer = null;
|
|
633
|
+
running = false;
|
|
634
|
+
inFlightPromise = null;
|
|
635
|
+
/**
|
|
636
|
+
* Create a new CompactionScheduler instance.
|
|
637
|
+
*
|
|
638
|
+
* @param runner - The maintenance runner to execute on each tick
|
|
639
|
+
* @param taskProvider - Function that provides maintenance task parameters for each run
|
|
640
|
+
* @param config - Scheduler configuration (interval and enabled flag)
|
|
641
|
+
*/
|
|
642
|
+
constructor(runner, taskProvider, config = {}) {
|
|
643
|
+
this.runner = runner;
|
|
644
|
+
this.taskProvider = taskProvider;
|
|
645
|
+
this.config = { ...DEFAULT_SCHEDULER_CONFIG, ...config };
|
|
646
|
+
}
|
|
647
|
+
/**
|
|
648
|
+
* Whether the scheduler is currently active (timer is ticking).
|
|
649
|
+
*/
|
|
650
|
+
get isRunning() {
|
|
651
|
+
return this.running;
|
|
652
|
+
}
|
|
653
|
+
/**
|
|
654
|
+
* Start the scheduler interval timer.
|
|
655
|
+
*
|
|
656
|
+
* Begins executing maintenance runs at the configured interval.
|
|
657
|
+
* If the scheduler is already running or disabled, returns an error.
|
|
658
|
+
*
|
|
659
|
+
* @returns A Result indicating success or a descriptive error
|
|
660
|
+
*/
|
|
661
|
+
start() {
|
|
662
|
+
if (!this.config.enabled) {
|
|
663
|
+
return Err(new LakeSyncError("Scheduler is disabled", "SCHEDULER_DISABLED"));
|
|
664
|
+
}
|
|
665
|
+
if (this.running) {
|
|
666
|
+
return Err(new LakeSyncError("Scheduler is already running", "SCHEDULER_ALREADY_RUNNING"));
|
|
667
|
+
}
|
|
668
|
+
this.running = true;
|
|
669
|
+
this.timer = setInterval(() => {
|
|
670
|
+
void this.tick();
|
|
671
|
+
}, this.config.intervalMs);
|
|
672
|
+
return Ok(void 0);
|
|
673
|
+
}
|
|
674
|
+
/**
|
|
675
|
+
* Stop the scheduler and wait for any in-progress run to finish.
|
|
676
|
+
*
|
|
677
|
+
* Clears the interval timer and, if a maintenance run is currently
|
|
678
|
+
* executing, awaits its completion before returning.
|
|
679
|
+
*
|
|
680
|
+
* @returns A Result indicating success or a descriptive error
|
|
681
|
+
*/
|
|
682
|
+
async stop() {
|
|
683
|
+
if (!this.running) {
|
|
684
|
+
return Err(new LakeSyncError("Scheduler is not running", "SCHEDULER_NOT_RUNNING"));
|
|
685
|
+
}
|
|
686
|
+
if (this.timer !== null) {
|
|
687
|
+
clearInterval(this.timer);
|
|
688
|
+
this.timer = null;
|
|
689
|
+
}
|
|
690
|
+
this.running = false;
|
|
691
|
+
if (this.inFlightPromise !== null) {
|
|
692
|
+
await this.inFlightPromise;
|
|
693
|
+
this.inFlightPromise = null;
|
|
694
|
+
}
|
|
695
|
+
return Ok(void 0);
|
|
696
|
+
}
|
|
697
|
+
/**
|
|
698
|
+
* Manually trigger a single maintenance run.
|
|
699
|
+
*
|
|
700
|
+
* Useful for testing or administrative purposes. If a run is already
|
|
701
|
+
* in progress, skips and returns an error.
|
|
702
|
+
*
|
|
703
|
+
* @returns A Result containing the MaintenanceReport, or a LakeSyncError on failure
|
|
704
|
+
*/
|
|
705
|
+
async runOnce() {
|
|
706
|
+
if (this.inFlightPromise !== null) {
|
|
707
|
+
return Err(new LakeSyncError("A maintenance run is already in progress", "SCHEDULER_BUSY"));
|
|
708
|
+
}
|
|
709
|
+
return this.executeMaintenance();
|
|
710
|
+
}
|
|
711
|
+
/**
|
|
712
|
+
* Internal tick handler called by the interval timer.
|
|
713
|
+
* Skips if a previous run is still in progress.
|
|
714
|
+
*/
|
|
715
|
+
async tick() {
|
|
716
|
+
if (this.inFlightPromise !== null) {
|
|
717
|
+
return;
|
|
718
|
+
}
|
|
719
|
+
await this.executeMaintenance();
|
|
720
|
+
}
|
|
721
|
+
/**
|
|
722
|
+
* Execute a single maintenance cycle.
|
|
723
|
+
*
|
|
724
|
+
* Calls the task provider to get parameters, then runs the maintenance
|
|
725
|
+
* runner. Tracks the in-flight promise so concurrent runs are prevented.
|
|
726
|
+
*/
|
|
727
|
+
async executeMaintenance() {
|
|
728
|
+
const taskResult = await this.resolveTask();
|
|
729
|
+
if (!taskResult.ok) {
|
|
730
|
+
return taskResult;
|
|
731
|
+
}
|
|
732
|
+
const task = taskResult.value;
|
|
733
|
+
if (task === null) {
|
|
734
|
+
return Ok({
|
|
735
|
+
compaction: {
|
|
736
|
+
baseFilesWritten: 0,
|
|
737
|
+
deleteFilesWritten: 0,
|
|
738
|
+
deltaFilesCompacted: 0,
|
|
739
|
+
bytesRead: 0,
|
|
740
|
+
bytesWritten: 0
|
|
741
|
+
},
|
|
742
|
+
snapshotsExpired: 0,
|
|
743
|
+
orphansRemoved: 0
|
|
744
|
+
});
|
|
745
|
+
}
|
|
746
|
+
const promise = this.runner.run(task.deltaFileKeys, task.outputPrefix, task.storagePrefix);
|
|
747
|
+
this.inFlightPromise = promise;
|
|
748
|
+
try {
|
|
749
|
+
const result = await promise;
|
|
750
|
+
return result;
|
|
751
|
+
} finally {
|
|
752
|
+
this.inFlightPromise = null;
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
/**
|
|
756
|
+
* Resolve the maintenance task from the provider, wrapping any thrown
|
|
757
|
+
* exceptions into a Result error.
|
|
758
|
+
*/
|
|
759
|
+
async resolveTask() {
|
|
760
|
+
try {
|
|
761
|
+
const task = await this.taskProvider();
|
|
762
|
+
return Ok(task);
|
|
763
|
+
} catch (error) {
|
|
764
|
+
return Err(
|
|
765
|
+
new LakeSyncError(
|
|
766
|
+
`Task provider failed: ${error instanceof Error ? error.message : String(error)}`,
|
|
767
|
+
"SCHEDULER_TASK_PROVIDER_ERROR"
|
|
768
|
+
)
|
|
769
|
+
);
|
|
770
|
+
}
|
|
771
|
+
}
|
|
772
|
+
};
|
|
773
|
+
|
|
774
|
+
// ../compactor/src/types.ts
var DEFAULT_COMPACTION_CONFIG = {
  // Skip compaction entirely below this many delta files ...
  minDeltaFiles: 10,
  // ... and consume at most this many per run.
  maxDeltaFiles: 20,
  targetFileSizeBytes: 134_217_728
  // 128 MB
};
|
|
781
|
+
// Named exports of this bundled module.
export {
  CheckpointGenerator,
  CompactionScheduler,
  Compactor,
  DEFAULT_CHECKPOINT_CONFIG,
  DEFAULT_COMPACTION_CONFIG,
  DEFAULT_MAINTENANCE_CONFIG,
  DEFAULT_SCHEDULER_CONFIG,
  MaintenanceRunner,
  readEqualityDeletes,
  writeEqualityDeletes
};
//# sourceMappingURL=compactor.js.map
|