npm - @gscdump/engine - Versions diffs - 0.31.0 → 0.31.1 - Mend

@gscdump/engine 0.31.0 → 0.31.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5)

package/dist/_chunks/engine.mjs CHANGED Viewed

@@ -1,5 +1,5 @@
 import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
-import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
+import { SCHEMAS, TABLE_METADATA, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
 import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
 import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
 import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
@@ -100,11 +100,14 @@ function createDuckDBCodec(factory) {
 		}
 	};
 }
+const quoteCol = (c) => `"${c.replace(/"/g, "\"\"")}"`;
 function dedupedMergeSql(table, fileListSql) {
 	const base = `SELECT * FROM read_parquet([${fileListSql}], union_by_name = true)`;
-	const key = SCHEMAS[table].sortKey;
-	if (key.length === 0) return base;
-	return `${base} QUALIFY row_number() OVER (PARTITION BY ${key.map((c) => `"${c.replace(/"/g, "\"\"")}"`).join(", ")}) = 1`;
+	const sortKey = SCHEMAS[table].sortKey;
+	const clusterKey = TABLE_METADATA[table].clusterKey;
+	const dedup = sortKey.length === 0 ? base : `${base} QUALIFY row_number() OVER (PARTITION BY ${sortKey.map(quoteCol).join(", ")}) = 1`;
+	if (clusterKey.length === 0) return dedup;
+	return `${dedup} ORDER BY ${clusterKey.map(quoteCol).join(", ")}`;
 }
 function rewriteEmptyFileSets(sql, placeholders, defaultTable, placeholderTables) {
 	let out = sql;

package/dist/_chunks/sink.d.mts CHANGED Viewed

@@ -185,6 +185,18 @@ interface IcebergPartitionSpec {
   'spec-id': number;
   'fields': IcebergPartitionSpecField[];
 }
+/** A field in an icebird `SortOrder`. */
+interface IcebergSortOrderField {
+  'source-id': number;
+  'transform': 'identity';
+  'direction': 'asc' | 'desc';
+  'null-order': 'nulls-first' | 'nulls-last';
+}
+/** An icebird `SortOrder` (Iceberg write-order). */
+interface IcebergSortOrder {
+  'order-id': number;
+  'fields': IcebergSortOrderField[];
+}
 /** Everything needed to talk to the R2 Data Catalog. */
 interface IcebergCatalogConfig {
   /** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
@@ -220,6 +232,21 @@ declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionK
  * {@link icebergSchemaFor}.
  */
 declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
+/**
+ * Build the icebird `SortOrder` for a fact table from its `clusterKey`
+ * (dimension-first, then `date`) — e.g. `pages` → sort by `url`, then `date`.
+ *
+ * Declared so any sort-aware compaction (a self-run `icebergRewrite`, or R2
+ * managed compaction if/when it honors sort order) re-clusters merged files the
+ * same way the append path already orders them ({@link sortByClusterKey} in
+ * `append-sink.ts`). R2's managed compaction currently only bin-packs small
+ * files without re-sorting, so this is forward-looking: it costs nothing today
+ * (the table simply carries the metadata) and means a future sort-aware pass
+ * produces globally clustered files for free, maximizing row-group skipping on
+ * the DuckDB-over-R2 read path. clusterKey columns are all non-null, so the
+ * null ordering is moot; `identity`/`asc` mirrors the physical write order.
+ */
+declare function icebergSortOrderFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSortOrder;
 /** Options for {@link connectIcebergCatalog}. */
 interface ConnectIcebergOptions {
   /**
@@ -501,4 +528,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
   /** S3-compatible warehouse location (POC: MinIO). */
   warehouse: string;
 }
-export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
+export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };

package/dist/iceberg/index.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
+import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
 import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
 type IcebergAppendSink = Sink;
 /**
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
  * with no rows never touches the network.
  */
 declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
-export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
+export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergSortOrder, type IcebergSortOrderField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };

package/dist/iceberg/index.mjs CHANGED Viewed

@@ -1,3 +1,4 @@
+import { TABLE_METADATA } from "../_chunks/schema.mjs";
 import { engineErrors } from "../errors.mjs";
 import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
 import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
@@ -107,6 +108,23 @@ function icebergPartitionSpecFor(table, encoding = "string") {
 		}))
 	};
 }
+function icebergSortOrderFor(table, encoding = "string") {
+	const fields = icebergSchemasFor(encoding)[table].columns;
+	const fieldId = (name) => {
+		const col = fields.find((c) => c.name === name);
+		if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
+		return col.fieldId;
+	};
+	return {
+		"order-id": 1,
+		"fields": TABLE_METADATA[table].clusterKey.map((col) => ({
+			"source-id": fieldId(col),
+			"transform": "identity",
+			"direction": "asc",
+			"null-order": "nulls-last"
+		}))
+	};
+}
 const CATALOG_CONFIG_TTL_MS = 3600 * 1e3;
 function catalogConfigKey(config) {
 	return `gsc-catalog-cfg\0${config.catalogUri}\0${config.warehouse}`;
@@ -187,7 +205,8 @@ async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "st
 		namespace: conn.namespace,
 		table,
 		schema: icebergSchemaFor(table, encoding),
-		partitionSpec: icebergPartitionSpecFor(table, encoding)
+		partitionSpec: icebergPartitionSpecFor(table, encoding),
+		sortOrder: icebergSortOrderFor(table, encoding)
 	}).then(() => results.push({
 		table,
 		outcome: ok(void 0)
@@ -369,6 +388,24 @@ function dedupeByIdentity(table, records) {
 	}
 	return seen.size === records.length ? records : [...seen.values()];
 }
+function sortByClusterKey(table, records) {
+	const cols = TABLE_METADATA[table].clusterKey;
+	if (cols.length === 0 || records.length < 2) return records;
+	return records.slice().sort((a, b) => {
+		for (const col of cols) {
+			const av = a[col];
+			const bv = b[col];
+			if (av === bv) continue;
+			if (av == null) return -1;
+			if (bv == null) return 1;
+			if (typeof av === "number" && typeof bv === "number") return av - bv;
+			const as = String(av);
+			const bs = String(bv);
+			if (as !== bs) return as < bs ? -1 : 1;
+		}
+		return 0;
+	});
+}
 function toRecords(slice, rows, encoding) {
 	const siteVal = encoding === "int" ? toIntPartitionSiteId(slice.ctx.siteId) : slice.ctx.siteId ?? "";
 	const searchVal = encoding === "int" ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
@@ -423,7 +460,7 @@ function createIcebergAppendSink(options) {
 			}
 			for (const [table, records] of buffers) {
 				if (records.length === 0) continue;
-				const deduped = dedupeByIdentity(table, records);
+				const deduped = sortByClusterKey(table, dedupeByIdentity(table, records));
 				await icebergAppendRetrying({
 					catalog: conn.catalog,
 					namespace: conn.namespace,
@@ -447,4 +484,4 @@ function createIcebergAppendSink(options) {
 		}
 	};
 }
-export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
+export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@gscdump/engine",
   "type": "module",
-  "version": "0.31.0",
+  "version": "0.31.1",
   "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
   "author": {
     "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
     "hyparquet": "^1.26.1",
     "hyparquet-writer": "^0.16.1",
     "proper-lockfile": "^4.1.2",
-    "@gscdump/contracts": "0.31.0",
-    "gscdump": "0.31.0"
+    "@gscdump/contracts": "0.31.1",
+    "gscdump": "0.31.1"
   },
   "devDependencies": {
     "@duckdb/duckdb-wasm": "^1.32.0",
@@ -208,6 +208,7 @@
     "build": "obuild",
     "typecheck": "tsc --noEmit",
     "test": "vitest",
+    "benchmark-store": "tsx scripts/benchmark-store.mts",
     "r2-harness": "tsx scripts/r2-contention-harness.ts",
     "backfill-audit": "tsx scripts/backfill-audit.ts"
   }