@gscdump/engine 0.31.0 → 0.31.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/engine.mjs +7 -4
- package/dist/_chunks/sink.d.mts +28 -1
- package/dist/iceberg/index.d.mts +2 -2
- package/dist/iceberg/index.mjs +40 -3
- package/package.json +4 -3
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
|
|
2
|
-
import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
|
|
2
|
+
import { SCHEMAS, TABLE_METADATA, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
|
|
3
3
|
import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
|
|
4
4
|
import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
|
|
5
5
|
import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
|
|
@@ -100,11 +100,14 @@ function createDuckDBCodec(factory) {
|
|
|
100
100
|
}
|
|
101
101
|
};
|
|
102
102
|
}
|
|
103
|
+
/**
 * Wrap a column name in double quotes for use as a SQL identifier,
 * doubling any double-quote characters already present in the name.
 */
const quoteCol = (c) => {
	const escaped = c.split("\"").join("\"\"");
	return "\"" + escaped + "\"";
};
|
|
103
104
|
/**
 * Build the SELECT used to merge a set of parquet files for `table`.
 *
 * The statement reads every file in `fileListSql` with `union_by_name = true`.
 * When the table's `sortKey` is non-empty, a `QUALIFY row_number() ... = 1`
 * filter keeps a single row per `sortKey` tuple (the window has no ORDER BY,
 * so which duplicate survives is not specified here). When the table's
 * `clusterKey` is non-empty, the result is ordered by those columns.
 *
 * @param table Key into `SCHEMAS` and `TABLE_METADATA`.
 * @param fileListSql Pre-rendered SQL list of file references, interpolated verbatim.
 * @returns The assembled SQL text.
 */
function dedupedMergeSql(table, fileListSql) {
	let sql = `SELECT * FROM read_parquet([${fileListSql}], union_by_name = true)`;
	const { sortKey } = SCHEMAS[table];
	const { clusterKey } = TABLE_METADATA[table];
	const quotedList = (cols) => cols.map(quoteCol).join(", ");
	if (sortKey.length !== 0) {
		sql = `${sql} QUALIFY row_number() OVER (PARTITION BY ${quotedList(sortKey)}) = 1`;
	}
	if (clusterKey.length !== 0) {
		sql = `${sql} ORDER BY ${quotedList(clusterKey)}`;
	}
	return sql;
}
|
|
109
112
|
function rewriteEmptyFileSets(sql, placeholders, defaultTable, placeholderTables) {
|
|
110
113
|
let out = sql;
|
package/dist/_chunks/sink.d.mts
CHANGED
|
@@ -185,6 +185,18 @@ interface IcebergPartitionSpec {
|
|
|
185
185
|
'spec-id': number;
|
|
186
186
|
'fields': IcebergPartitionSpecField[];
|
|
187
187
|
}
|
|
188
|
+
/**
 * A field in an icebird `SortOrder`: one sort column plus how it is ordered.
 *
 * - `source-id`: numeric id of the schema field being sorted on.
 * - `transform`: only `'identity'` is representable with this type.
 * - `direction`: ascending or descending.
 * - `null-order`: whether nulls sort before or after non-null values.
 */
|
|
189
|
+
interface IcebergSortOrderField {
|
|
190
|
+
'source-id': number;
|
|
191
|
+
'transform': 'identity';
|
|
192
|
+
'direction': 'asc' | 'desc';
|
|
193
|
+
'null-order': 'nulls-first' | 'nulls-last';
|
|
194
|
+
}
|
|
195
|
+
/**
 * An icebird `SortOrder` (Iceberg write-order).
 *
 * - `order-id`: numeric identifier of this sort order.
 * - `fields`: the sort fields; per the Iceberg spec the list order is the
 *   order in which the sort is applied.
 */
|
|
196
|
+
interface IcebergSortOrder {
|
|
197
|
+
'order-id': number;
|
|
198
|
+
'fields': IcebergSortOrderField[];
|
|
199
|
+
}
|
|
188
200
|
/** Everything needed to talk to the R2 Data Catalog. */
|
|
189
201
|
interface IcebergCatalogConfig {
|
|
190
202
|
/** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
|
|
@@ -220,6 +232,21 @@ declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionK
|
|
|
220
232
|
* {@link icebergSchemaFor}.
|
|
221
233
|
*/
|
|
222
234
|
declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
|
|
235
|
+
/**
|
|
236
|
+
* Build the icebird `SortOrder` for a fact table from its `clusterKey`
|
|
237
|
+
* (dimension-first, then `date`) — e.g. `pages` → sort by `url`, then `date`.
|
|
238
|
+
*
|
|
239
|
+
* Declared so any sort-aware compaction (a self-run `icebergRewrite`, or R2
|
|
240
|
+
* managed compaction if/when it honors sort order) re-clusters merged files the
|
|
241
|
+
* same way the append path already orders them ({@link sortByClusterKey} in
|
|
242
|
+
* `append-sink.ts`). R2's managed compaction currently only bin-packs small
|
|
243
|
+
* files without re-sorting, so this is forward-looking: it costs nothing today
|
|
244
|
+
* (the table simply carries the metadata) and means a future sort-aware pass
|
|
245
|
+
* produces globally clustered files for free, maximizing row-group skipping on
|
|
246
|
+
* the DuckDB-over-R2 read path. clusterKey columns are all non-null, so the
|
|
247
|
+
* null ordering is moot; `identity`/`asc` mirrors the physical write order.
*
* @param table Table whose `clusterKey` columns become the sort fields.
* @param encoding Selects which schema set the column field ids are resolved
*   from; the implementation defaults it to `"string"`.
* @returns A sort order with one `identity` / `asc` / `nulls-last` field per
*   `clusterKey` column, in `clusterKey` order.
* @throws Error when a `clusterKey` column is not present in the table's schema.
|
|
248
|
+
*/
|
|
249
|
+
declare function icebergSortOrderFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSortOrder;
|
|
223
250
|
/** Options for {@link connectIcebergCatalog}. */
|
|
224
251
|
interface ConnectIcebergOptions {
|
|
225
252
|
/**
|
|
@@ -501,4 +528,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
|
|
|
501
528
|
/** S3-compatible warehouse location (POC: MinIO). */
|
|
502
529
|
warehouse: string;
|
|
503
530
|
}
|
|
504
|
-
export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
|
531
|
+
export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
|
package/dist/iceberg/index.d.mts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
1
|
+
import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
|
|
2
2
|
import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
|
|
3
3
|
type IcebergAppendSink = Sink;
|
|
4
4
|
/**
|
|
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
|
|
|
10
10
|
* with no rows never touches the network.
|
|
11
11
|
*/
|
|
12
12
|
declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
|
|
13
|
-
export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
|
13
|
+
export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergSortOrder, type IcebergSortOrderField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
package/dist/iceberg/index.mjs
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { TABLE_METADATA } from "../_chunks/schema.mjs";
|
|
1
2
|
import { engineErrors } from "../errors.mjs";
|
|
2
3
|
import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
|
|
3
4
|
import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
|
|
@@ -107,6 +108,23 @@ function icebergPartitionSpecFor(table, encoding = "string") {
|
|
|
107
108
|
}))
|
|
108
109
|
};
|
|
109
110
|
}
|
|
111
|
+
/**
 * Build the Iceberg sort order for `table` from its `clusterKey` columns.
 *
 * Each `clusterKey` column becomes one sort field (`identity` transform,
 * ascending, nulls last) whose `source-id` is that column's field id in the
 * schema selected by `encoding`.
 *
 * @param table Key into the Iceberg schemas and `TABLE_METADATA`.
 * @param encoding Schema variant passed to `icebergSchemasFor`; defaults to `"string"`.
 * @returns `{ "order-id", "fields" }`. `order-id` is 1 when there is at least
 *   one sort field, and 0 when `clusterKey` is empty.
 * @throws Error when a `clusterKey` column is missing from the table's schema.
 */
function icebergSortOrderFor(table, encoding = "string") {
	const columns = icebergSchemasFor(encoding)[table].columns;
	const fieldId = (name) => {
		const col = columns.find((c) => c.name === name);
		if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
		return col.fieldId;
	};
	const fields = TABLE_METADATA[table].clusterKey.map((col) => ({
		"source-id": fieldId(col),
		"transform": "identity",
		"direction": "asc",
		"null-order": "nulls-last"
	}));
	return {
		// Iceberg reserves order-id 0 for the unsorted order, which is the only
		// order allowed to have no fields; a field-less order with a non-zero id
		// can be rejected by spec-validating catalogs.
		"order-id": fields.length === 0 ? 0 : 1,
		"fields": fields
	};
}
|
|
110
128
|
const CATALOG_CONFIG_TTL_MS = 3600 * 1e3;
|
|
111
129
|
function catalogConfigKey(config) {
|
|
112
130
|
return `gsc-catalog-cfg\0${config.catalogUri}\0${config.warehouse}`;
|
|
@@ -187,7 +205,8 @@ async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "st
|
|
|
187
205
|
namespace: conn.namespace,
|
|
188
206
|
table,
|
|
189
207
|
schema: icebergSchemaFor(table, encoding),
|
|
190
|
-
partitionSpec: icebergPartitionSpecFor(table, encoding)
|
|
208
|
+
partitionSpec: icebergPartitionSpecFor(table, encoding),
|
|
209
|
+
sortOrder: icebergSortOrderFor(table, encoding)
|
|
191
210
|
}).then(() => results.push({
|
|
192
211
|
table,
|
|
193
212
|
outcome: ok(void 0)
|
|
@@ -369,6 +388,24 @@ function dedupeByIdentity(table, records) {
|
|
|
369
388
|
}
|
|
370
389
|
return seen.size === records.length ? records : [...seen.values()];
|
|
371
390
|
}
|
|
391
|
+
/**
 * Return `records` ordered by the table's `clusterKey` columns, ascending,
 * with missing values (`null` / `undefined`) last.
 *
 * Nulls-last matches the `"null-order": "nulls-last"` that
 * `icebergSortOrderFor` declares for the same columns, so the physical write
 * order and the declared sort order agree even if a nullable column is ever
 * added to a `clusterKey`.
 *
 * Two numbers compare numerically; any other pair compares by its string form.
 * The input array is never mutated: it is returned as-is when no sorting is
 * needed (empty `clusterKey` or fewer than two records), otherwise a sorted
 * copy is returned.
 *
 * @param table Key into `TABLE_METADATA`.
 * @param records Row objects keyed by column name.
 * @returns The original array or a sorted shallow copy.
 */
function sortByClusterKey(table, records) {
	const cols = TABLE_METADATA[table].clusterKey;
	if (cols.length === 0 || records.length < 2) return records;
	return records.slice().sort((a, b) => {
		for (const col of cols) {
			const av = a[col];
			const bv = b[col];
			if (av === bv) continue;
			const aMissing = av == null;
			const bMissing = bv == null;
			if (aMissing || bMissing) {
				// null and undefined are both "missing": treat the pair as a tie so
				// the comparator stays consistent in both directions.
				if (aMissing && bMissing) continue;
				return aMissing ? 1 : -1;
			}
			if (typeof av === "number" && typeof bv === "number") return av - bv;
			const aText = String(av);
			const bText = String(bv);
			if (aText !== bText) return aText < bText ? -1 : 1;
		}
		return 0;
	});
}
|
|
372
409
|
function toRecords(slice, rows, encoding) {
|
|
373
410
|
const siteVal = encoding === "int" ? toIntPartitionSiteId(slice.ctx.siteId) : slice.ctx.siteId ?? "";
|
|
374
411
|
const searchVal = encoding === "int" ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
|
|
@@ -423,7 +460,7 @@ function createIcebergAppendSink(options) {
|
|
|
423
460
|
}
|
|
424
461
|
for (const [table, records] of buffers) {
|
|
425
462
|
if (records.length === 0) continue;
|
|
426
|
-
const deduped = dedupeByIdentity(table, records);
|
|
463
|
+
const deduped = sortByClusterKey(table, dedupeByIdentity(table, records));
|
|
427
464
|
await icebergAppendRetrying({
|
|
428
465
|
catalog: conn.catalog,
|
|
429
466
|
namespace: conn.namespace,
|
|
@@ -447,4 +484,4 @@ function createIcebergAppendSink(options) {
|
|
|
447
484
|
}
|
|
448
485
|
};
|
|
449
486
|
}
|
|
450
|
-
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
|
487
|
+
export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.31.0",
|
|
4
|
+
"version": "0.31.1",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -191,8 +191,8 @@
|
|
|
191
191
|
"hyparquet": "^1.26.1",
|
|
192
192
|
"hyparquet-writer": "^0.16.1",
|
|
193
193
|
"proper-lockfile": "^4.1.2",
|
|
194
|
-
"@gscdump/contracts": "0.31.0",
|
|
195
|
-
"gscdump": "0.31.0"
|
|
194
|
+
"@gscdump/contracts": "0.31.1",
|
|
195
|
+
"gscdump": "0.31.1"
|
|
196
196
|
},
|
|
197
197
|
"devDependencies": {
|
|
198
198
|
"@duckdb/duckdb-wasm": "^1.32.0",
|
|
@@ -208,6 +208,7 @@
|
|
|
208
208
|
"build": "obuild",
|
|
209
209
|
"typecheck": "tsc --noEmit",
|
|
210
210
|
"test": "vitest",
|
|
211
|
+
"benchmark-store": "tsx scripts/benchmark-store.mts",
|
|
211
212
|
"r2-harness": "tsx scripts/r2-contention-harness.ts",
|
|
212
213
|
"backfill-audit": "tsx scripts/backfill-audit.ts"
|
|
213
214
|
}
|