@gscdump/engine 0.31.0 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
2
- import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
2
+ import { SCHEMAS, TABLE_METADATA, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
3
3
  import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
4
4
  import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
5
5
  import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
@@ -100,11 +100,14 @@ function createDuckDBCodec(factory) {
100
100
  }
101
101
  };
102
102
  }
103
+ const quoteCol = (c) => `"${c.replace(/"/g, "\"\"")}"`;
103
104
  function dedupedMergeSql(table, fileListSql) {
104
105
  const base = `SELECT * FROM read_parquet([${fileListSql}], union_by_name = true)`;
105
- const key = SCHEMAS[table].sortKey;
106
- if (key.length === 0) return base;
107
- return `${base} QUALIFY row_number() OVER (PARTITION BY ${key.map((c) => `"${c.replace(/"/g, "\"\"")}"`).join(", ")}) = 1`;
106
+ const sortKey = SCHEMAS[table].sortKey;
107
+ const clusterKey = TABLE_METADATA[table].clusterKey;
108
+ const dedup = sortKey.length === 0 ? base : `${base} QUALIFY row_number() OVER (PARTITION BY ${sortKey.map(quoteCol).join(", ")}) = 1`;
109
+ if (clusterKey.length === 0) return dedup;
110
+ return `${dedup} ORDER BY ${clusterKey.map(quoteCol).join(", ")}`;
108
111
  }
109
112
  function rewriteEmptyFileSets(sql, placeholders, defaultTable, placeholderTables) {
110
113
  let out = sql;
@@ -185,6 +185,18 @@ interface IcebergPartitionSpec {
185
185
  'spec-id': number;
186
186
  'fields': IcebergPartitionSpecField[];
187
187
  }
188
+ /** A field in an icebird `SortOrder`. */
189
+ interface IcebergSortOrderField {
190
+ 'source-id': number;
191
+ 'transform': 'identity';
192
+ 'direction': 'asc' | 'desc';
193
+ 'null-order': 'nulls-first' | 'nulls-last';
194
+ }
195
+ /** An icebird `SortOrder` (Iceberg write-order). */
196
+ interface IcebergSortOrder {
197
+ 'order-id': number;
198
+ 'fields': IcebergSortOrderField[];
199
+ }
188
200
  /** Everything needed to talk to the R2 Data Catalog. */
189
201
  interface IcebergCatalogConfig {
190
202
  /** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
@@ -220,6 +232,21 @@ declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionK
220
232
  * {@link icebergSchemaFor}.
221
233
  */
222
234
  declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
235
+ /**
236
+ * Build the icebird `SortOrder` for a fact table from its `clusterKey`
237
+ * (dimension-first, then `date`) — e.g. `pages` → sort by `url`, then `date`.
238
+ *
239
+ * Declared so any sort-aware compaction (a self-run `icebergRewrite`, or R2
240
+ * managed compaction if/when it honors sort order) re-clusters merged files the
241
+ * same way the append path already orders them ({@link sortByClusterKey} in
242
+ * `append-sink.ts`). R2's managed compaction currently only bin-packs small
243
+ * files without re-sorting, so this is forward-looking: it costs nothing today
244
+ * (the table simply carries the metadata) and means a future sort-aware pass
245
+ * produces globally clustered files for free, maximizing row-group skipping on
246
+ * the DuckDB-over-R2 read path. clusterKey columns are all non-null, so the
247
+ * null ordering is moot; `identity`/`asc` mirrors the physical write order.
248
+ */
249
+ declare function icebergSortOrderFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSortOrder;
223
250
  /** Options for {@link connectIcebergCatalog}. */
224
251
  interface ConnectIcebergOptions {
225
252
  /**
@@ -501,4 +528,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
501
528
  /** S3-compatible warehouse location (POC: MinIO). */
502
529
  warehouse: string;
503
530
  }
504
- export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
531
+ export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
@@ -1,4 +1,4 @@
1
- import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
1
+ import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
2
2
  import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
3
3
  type IcebergAppendSink = Sink;
4
4
  /**
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
10
10
  * with no rows never touches the network.
11
11
  */
12
12
  declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
13
- export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
13
+ export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergSortOrder, type IcebergSortOrderField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
@@ -1,3 +1,4 @@
1
+ import { TABLE_METADATA } from "../_chunks/schema.mjs";
1
2
  import { engineErrors } from "../errors.mjs";
2
3
  import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
3
4
  import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
@@ -107,6 +108,23 @@ function icebergPartitionSpecFor(table, encoding = "string") {
107
108
  }))
108
109
  };
109
110
  }
111
+ function icebergSortOrderFor(table, encoding = "string") {
112
+ const fields = icebergSchemasFor(encoding)[table].columns;
113
+ const fieldId = (name) => {
114
+ const col = fields.find((c) => c.name === name);
115
+ if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
116
+ return col.fieldId;
117
+ };
118
+ return {
119
+ "order-id": 1,
120
+ "fields": TABLE_METADATA[table].clusterKey.map((col) => ({
121
+ "source-id": fieldId(col),
122
+ "transform": "identity",
123
+ "direction": "asc",
124
+ "null-order": "nulls-last"
125
+ }))
126
+ };
127
+ }
110
128
  const CATALOG_CONFIG_TTL_MS = 3600 * 1e3;
111
129
  function catalogConfigKey(config) {
112
130
  return `gsc-catalog-cfg\0${config.catalogUri}\0${config.warehouse}`;
@@ -187,7 +205,8 @@ async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "st
187
205
  namespace: conn.namespace,
188
206
  table,
189
207
  schema: icebergSchemaFor(table, encoding),
190
- partitionSpec: icebergPartitionSpecFor(table, encoding)
208
+ partitionSpec: icebergPartitionSpecFor(table, encoding),
209
+ sortOrder: icebergSortOrderFor(table, encoding)
191
210
  }).then(() => results.push({
192
211
  table,
193
212
  outcome: ok(void 0)
@@ -369,6 +388,24 @@ function dedupeByIdentity(table, records) {
369
388
  }
370
389
  return seen.size === records.length ? records : [...seen.values()];
371
390
  }
391
+ function sortByClusterKey(table, records) {
392
+ const cols = TABLE_METADATA[table].clusterKey;
393
+ if (cols.length === 0 || records.length < 2) return records;
394
+ return records.slice().sort((a, b) => {
395
+ for (const col of cols) {
396
+ const av = a[col];
397
+ const bv = b[col];
398
+ if (av === bv) continue;
399
+ if (av == null) return -1;
400
+ if (bv == null) return 1;
401
+ if (typeof av === "number" && typeof bv === "number") return av - bv;
402
+ const as = String(av);
403
+ const bs = String(bv);
404
+ if (as !== bs) return as < bs ? -1 : 1;
405
+ }
406
+ return 0;
407
+ });
408
+ }
372
409
  function toRecords(slice, rows, encoding) {
373
410
  const siteVal = encoding === "int" ? toIntPartitionSiteId(slice.ctx.siteId) : slice.ctx.siteId ?? "";
374
411
  const searchVal = encoding === "int" ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
@@ -423,7 +460,7 @@ function createIcebergAppendSink(options) {
423
460
  }
424
461
  for (const [table, records] of buffers) {
425
462
  if (records.length === 0) continue;
426
- const deduped = dedupeByIdentity(table, records);
463
+ const deduped = sortByClusterKey(table, dedupeByIdentity(table, records));
427
464
  await icebergAppendRetrying({
428
465
  catalog: conn.catalog,
429
466
  namespace: conn.namespace,
@@ -447,4 +484,4 @@ function createIcebergAppendSink(options) {
447
484
  }
448
485
  };
449
486
  }
450
- export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
487
+ export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.31.0",
4
+ "version": "0.31.1",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
191
191
  "hyparquet": "^1.26.1",
192
192
  "hyparquet-writer": "^0.16.1",
193
193
  "proper-lockfile": "^4.1.2",
194
- "@gscdump/contracts": "0.31.0",
195
- "gscdump": "0.31.0"
194
+ "@gscdump/contracts": "0.31.1",
195
+ "gscdump": "0.31.1"
196
196
  },
197
197
  "devDependencies": {
198
198
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -208,6 +208,7 @@
208
208
  "build": "obuild",
209
209
  "typecheck": "tsc --noEmit",
210
210
  "test": "vitest",
211
+ "benchmark-store": "tsx scripts/benchmark-store.mts",
211
212
  "r2-harness": "tsx scripts/r2-contention-harness.ts",
212
213
  "backfill-audit": "tsx scripts/backfill-audit.ts"
213
214
  }