@gscdump/engine 0.30.0 → 0.31.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
2
- import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
2
+ import { SCHEMAS, TABLE_METADATA, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
3
3
  import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
4
4
  import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
5
5
  import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
@@ -100,11 +100,14 @@ function createDuckDBCodec(factory) {
100
100
  }
101
101
  };
102
102
  }
103
/** Wrap a column name in double quotes for SQL, doubling every embedded double quote. */
const quoteCol = (c) => {
  const escaped = c.split("\"").join("\"\"");
  return `"${escaped}"`;
};
103
104
  function dedupedMergeSql(table, fileListSql) {
104
105
  const base = `SELECT * FROM read_parquet([${fileListSql}], union_by_name = true)`;
105
- const key = SCHEMAS[table].sortKey;
106
- if (key.length === 0) return base;
107
- return `${base} QUALIFY row_number() OVER (PARTITION BY ${key.map((c) => `"${c.replace(/"/g, "\"\"")}"`).join(", ")}) = 1`;
106
+ const sortKey = SCHEMAS[table].sortKey;
107
+ const clusterKey = TABLE_METADATA[table].clusterKey;
108
+ const dedup = sortKey.length === 0 ? base : `${base} QUALIFY row_number() OVER (PARTITION BY ${sortKey.map(quoteCol).join(", ")}) = 1`;
109
+ if (clusterKey.length === 0) return dedup;
110
+ return `${dedup} ORDER BY ${clusterKey.map(quoteCol).join(", ")}`;
108
111
  }
109
112
  function rewriteEmptyFileSets(sql, placeholders, defaultTable, placeholderTables) {
110
113
  let out = sql;
@@ -16,6 +16,105 @@ async function readOptional(ds, key, signal) {
16
16
  throw e;
17
17
  });
18
18
  }
19
/**
 * Column layout of the query-dimension parquet, in write order.
 * Every column is declared non-nullable.
 */
const QUERY_DIM_COLUMNS = [
  ["query", "VARCHAR"],
  ["query_canonical", "VARCHAR"],
  ["intent_code", "INTEGER"],
  ["normalizer_version", "INTEGER"],
  ["intent_version", "INTEGER"]
].map(([name, type]) => ({
  name,
  type,
  nullable: false
}));
46
/**
 * Storage prefix (no trailing slash) under which the query-dimension files live.
 * The site segment is included only when `ctx.siteId` is truthy.
 */
function queryDimPrefix(ctx) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/query_dim`;
}
49
/** Object key of the query-dimension parquet file for the given tenant context. */
function queryDimParquetKey(ctx) {
  const prefix = queryDimPrefix(ctx);
  return `${prefix}/index.parquet`;
}
52
/** Object key of the query-dimension JSON sidecar for the given tenant context. */
function queryDimMetaKey(ctx) {
  const prefix = queryDimPrefix(ctx);
  return `${prefix}/index.json`;
}
55
/**
 * Turn raw query strings into query-dimension records.
 *
 * Each input is coerced with `String()`. Blank (empty or whitespace-only)
 * queries and repeats of an already-seen raw query are dropped; the output
 * keeps first-seen order.
 *
 * `query_canonical` is never empty: whenever the injected normalizer yields
 * nothing usable (a non-string such as null/undefined, an empty string, or a
 * whitespace-only string) the raw query is used instead.
 *
 * @param {Iterable<string>} queries - raw queries, possibly with duplicates.
 * @param {object} deps - injected derivation.
 * @param {(query: string) => string} deps.normalizeQuery - raw query → canonical form.
 * @param {(query: string) => number} deps.classifyIntentCode - raw query → intent code.
 * @param {number} deps.normalizerVersion - stamped on every record.
 * @param {number} deps.intentVersion - stamped on every record.
 * @returns {Array<{query: string, query_canonical: string, intent_code: number, normalizer_version: number, intent_version: number}>}
 */
function buildQueryDimRecords(queries, deps) {
  const seen = /* @__PURE__ */ new Set();
  const out = [];
  for (const raw of queries) {
    const query = String(raw);
    if (query.trim() === "" || seen.has(query)) continue;
    seen.add(query);
    const canonical = deps.normalizeQuery(query);
    // Only a non-blank string counts as a canonical; anything else folds back
    // to the raw query so the non-nullable column is always populated.
    const hasCanonical = typeof canonical === "string" && canonical.trim() !== "";
    out.push({
      query,
      query_canonical: hasCanonical ? canonical : query,
      intent_code: deps.classifyIntentCode(query),
      normalizer_version: deps.normalizerVersion,
      intent_version: deps.intentVersion
    });
  }
  return out;
}
73
/**
 * Create the store that persists the query dimension as a parquet file plus a
 * JSON sidecar, both addressed by the tenant context.
 *
 * @param {object} options
 * @param {object} options.dataSource - storage backend; this store uses its
 *   `list(prefix)`, `read(key)` and `write(key, bytes)` methods.
 * @returns {{parquetKey: Function, write: Function, loadMeta: Function, loadRecords: Function}}
 */
function createQueryDimStore({ dataSource }) {
  // Existence check via a prefix listing.
  // NOTE(review): assumes `list` resolves to an array of keys — confirm.
  const isListed = async (key, prefix) => {
    const listed = await dataSource.list(prefix);
    return listed.includes(key);
  };
  const listingPrefix = (ctx) => `${queryDimPrefix(ctx)}/`;

  /** Write the parquet first, then the sidecar describing it. */
  async function write(ctx, records, builtAt) {
    const parquetKey = queryDimParquetKey(ctx);
    const bytes = encodeRowsToParquetFlex(records, {
      columns: QUERY_DIM_COLUMNS,
      sortKey: ["query"]
    });
    await dataSource.write(parquetKey, bytes);
    // Versions are taken from the first record; an empty build records 0.
    const first = records[0];
    const meta = {
      version: 1,
      builtAt,
      rowCount: records.length,
      normalizerVersion: first?.normalizer_version ?? 0,
      intentVersion: first?.intent_version ?? 0
    };
    const metaKey = queryDimMetaKey(ctx);
    const metaBytes = new TextEncoder().encode(JSON.stringify(meta));
    await dataSource.write(metaKey, metaBytes);
    return {
      parquetKey,
      rowCount: records.length
    };
  }

  /** Parsed sidecar, or null when the sidecar key is not listed. */
  async function loadMeta(ctx) {
    const key = queryDimMetaKey(ctx);
    const present = await isListed(key, listingPrefix(ctx));
    if (!present) return null;
    const bytes = await dataSource.read(key);
    const text = new TextDecoder().decode(bytes);
    return JSON.parse(text);
  }

  /** Decoded dimension rows, or an empty array when the parquet key is not listed. */
  async function loadRecords(ctx) {
    const key = queryDimParquetKey(ctx);
    const present = await isListed(key, listingPrefix(ctx));
    if (!present) return [];
    const rows = await decodeParquetToRows(await dataSource.read(key));
    // Coerce every field so callers get plain strings and numbers.
    return rows.map((r) => ({
      query: String(r.query),
      query_canonical: String(r.query_canonical),
      intent_code: Number(r.intent_code),
      normalizer_version: Number(r.normalizer_version),
      intent_version: Number(r.intent_version)
    }));
  }

  return {
    parquetKey: queryDimParquetKey,
    write,
    loadMeta,
    loadRecords
  };
}
19
118
  const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
20
119
  function inspectionIndexKey(ctx) {
21
120
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
@@ -893,4 +992,4 @@ function createEmptyTypesStore(opts) {
893
992
  }
894
993
  };
895
994
  }
896
- export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
995
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -133,7 +133,7 @@ interface Snapshot {
133
133
  'sequence-number': number;
134
134
  'timestamp-ms': number;
135
135
  'manifest-list': string;
136
- manifests?: Manifest$1[];
136
+ manifests?: Manifest[];
137
137
  summary: {
138
138
  // spec: "value of these fields should be of string type"
139
139
  operation: string; // 'spark.app.id'?: string
@@ -192,7 +192,7 @@ interface MetadataLog {
192
192
  'timestamp-ms': number;
193
193
  'metadata-file': string;
194
194
  }
195
- interface Manifest$1 {
195
+ interface Manifest {
196
196
  manifest_path: string;
197
197
  manifest_length: bigint;
198
198
  partition_spec_id: number;
@@ -297,8 +297,10 @@ function createSqlFragments(config) {
297
297
  if (isMetricDimension(f.dimension)) continue;
298
298
  if (f.dimension === "date") continue;
299
299
  if (f.operator === "topLevel") continue;
300
- const cRef = colRef(tableKey, dimColumn(f.dimension, tableKey));
301
- const matchExpr = f.dimension === "page" ? dimExprSql(f.dimension, tableKey) : cRef;
300
+ const dim = f.dimension;
301
+ const cRef = colRef(tableKey, dimColumn(dim, tableKey));
302
+ const matchExpr = dim === "page" || dim === "queryCanonical" ? dimExprSql(dim, tableKey) : cRef;
303
+ const patternExpr = dim === "queryCanonical" ? matchExpr : cRef;
302
304
  switch (f.operator) {
303
305
  case "equals":
304
306
  preds.push(sql`${matchExpr} = ${f.expression}`);
@@ -307,16 +309,16 @@ function createSqlFragments(config) {
307
309
  preds.push(sql`${matchExpr} != ${f.expression}`);
308
310
  break;
309
311
  case "contains":
310
- preds.push(sql`${cRef} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
312
+ preds.push(sql`${patternExpr} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
311
313
  break;
312
314
  case "notContains":
313
- preds.push(sql`${cRef} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
315
+ preds.push(sql`${patternExpr} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
314
316
  break;
315
317
  case "includingRegex":
316
- preds.push(regexPredicate(cRef, f.expression, false));
318
+ preds.push(regexPredicate(patternExpr, f.expression, false));
317
319
  break;
318
320
  case "excludingRegex":
319
- preds.push(regexPredicate(cRef, f.expression, true));
321
+ preds.push(regexPredicate(patternExpr, f.expression, true));
320
322
  break;
321
323
  }
322
324
  }
@@ -185,6 +185,18 @@ interface IcebergPartitionSpec {
185
185
  'spec-id': number;
186
186
  'fields': IcebergPartitionSpecField[];
187
187
  }
188
+ /** A field in an icebird `SortOrder`. */
189
+ interface IcebergSortOrderField {
190
+ 'source-id': number;
191
+ 'transform': 'identity';
192
+ 'direction': 'asc' | 'desc';
193
+ 'null-order': 'nulls-first' | 'nulls-last';
194
+ }
195
+ /** An icebird `SortOrder` (Iceberg write-order). */
196
+ interface IcebergSortOrder {
197
+ 'order-id': number;
198
+ 'fields': IcebergSortOrderField[];
199
+ }
188
200
  /** Everything needed to talk to the R2 Data Catalog. */
189
201
  interface IcebergCatalogConfig {
190
202
  /** REST catalog URI, e.g. `https://catalog.cloudflarestorage.com/<acct>/<warehouse>`. */
@@ -220,6 +232,21 @@ declare function icebergSchemaFor(table: IcebergTableName, encoding?: PartitionK
220
232
  * {@link icebergSchemaFor}.
221
233
  */
222
234
  declare function icebergPartitionSpecFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergPartitionSpec;
235
+ /**
236
+ * Build the icebird `SortOrder` for a fact table from its `clusterKey`
237
+ * (dimension-first, then `date`) — e.g. `pages` → sort by `url`, then `date`.
238
+ *
239
+ * Declared so any sort-aware compaction (a self-run `icebergRewrite`, or R2
240
+ * managed compaction if/when it honors sort order) re-clusters merged files the
241
+ * same way the append path already orders them ({@link sortByClusterKey} in
242
+ * `append-sink.ts`). R2's managed compaction currently only bin-packs small
243
+ * files without re-sorting, so this is forward-looking: it costs nothing today
244
+ * (the table simply carries the metadata) and means a future sort-aware pass
245
+ * produces globally clustered files for free, maximizing row-group skipping on
246
+ * the DuckDB-over-R2 read path. clusterKey columns are all non-null, so the
247
+ * null ordering is moot; `identity`/`asc` mirrors the physical write order.
248
+ */
249
+ declare function icebergSortOrderFor(table: IcebergTableName, encoding?: PartitionKeyEncoding): IcebergSortOrder;
223
250
  /** Options for {@link connectIcebergCatalog}. */
224
251
  interface ConnectIcebergOptions {
225
252
  /**
@@ -501,4 +528,4 @@ interface LocalIcebergSinkOptions extends SinkOptions {
501
528
  /** S3-compatible warehouse location (POC: MinIO). */
502
529
  warehouse: string;
503
530
  }
504
- export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
531
+ export { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, LocalIcebergSinkOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, SinkCapabilities, SinkCloseResult, SinkOptions, SinkSlice, SinkWriteResult, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables };
@@ -1,6 +1,60 @@
1
1
  import { DataSource } from "./_chunks/storage.mjs";
2
2
  import { ScheduleState } from "./schedule.mjs";
3
3
  import { ColumnDef, TenantCtx } from "@gscdump/contracts";
4
+ interface QueryDimRecord {
5
+ query: string;
6
+ /** Lexical canonical, never empty: NULL/'' folds to the raw query. */
7
+ query_canonical: string;
8
+ /** Packed search-intent code (see `@gscdump/analysis` `encodeIntent`). */
9
+ intent_code: number;
10
+ normalizer_version: number;
11
+ intent_version: number;
12
+ }
13
+ /** JSON sidecar: versions + freshness, readable without decoding the parquet. */
14
+ interface QueryDimMeta {
15
+ version: 1;
16
+ builtAt: number;
17
+ rowCount: number;
18
+ normalizerVersion: number;
19
+ intentVersion: number;
20
+ }
21
+ declare function queryDimParquetKey(ctx: TenantCtx): string;
22
+ declare function queryDimMetaKey(ctx: TenantCtx): string;
23
+ /**
24
+ * Injected derivation. `engine` never imports `@gscdump/analysis`; the host
25
+ * passes `normalizeQuery` / `classifyIntentCode` (e.g. `encodeIntent ∘
26
+ * classifyQueryIntent`) plus their version constants.
27
+ */
28
+ interface QueryDimDeps {
29
+ normalizeQuery: (query: string) => string;
30
+ normalizerVersion: number;
31
+ /** Returns the packed intent code for a raw query. */
32
+ classifyIntentCode: (query: string) => number;
33
+ intentVersion: number;
34
+ }
35
+ /**
36
+ * Pure: distinct raw queries → dimension records. De-dupes, drops empties, and
37
+ * folds an empty/whitespace canonical back to the raw query so the key is
38
+ * total (matches the read path's `COALESCE(NULLIF(query_canonical, ''), query)`).
39
+ */
40
+ declare function buildQueryDimRecords(queries: Iterable<string>, deps: QueryDimDeps): QueryDimRecord[];
41
+ interface QueryDimStore {
42
+ parquetKey: (ctx: TenantCtx) => string;
43
+ /** Write the parquet + JSON sidecar. Last-write-wins; no history. */
44
+ write: (ctx: TenantCtx, records: readonly QueryDimRecord[], builtAt: number) => Promise<{
45
+ parquetKey: string;
46
+ rowCount: number;
47
+ }>;
48
+ /** Read the sidecar (versions + freshness), or null on first build. */
49
+ loadMeta: (ctx: TenantCtx) => Promise<QueryDimMeta | null>;
50
+ /** Decode the dimension rows (test/inspection; reads JOIN the parquet by key). */
51
+ loadRecords: (ctx: TenantCtx) => Promise<QueryDimRecord[]>;
52
+ }
53
+ declare function createQueryDimStore({
54
+ dataSource
55
+ }: {
56
+ dataSource: DataSource;
57
+ }): QueryDimStore;
4
58
  /**
5
59
  * GSC URL inspection result fields we persist. Mirrors the
6
60
  * `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
@@ -442,4 +496,4 @@ interface CreateEmptyTypesStoreOptions {
442
496
  now?: () => number;
443
497
  }
444
498
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
445
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
499
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, QueryDimDeps, QueryDimMeta, QueryDimRecord, QueryDimStore, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/entities.mjs CHANGED
@@ -1,2 +1,2 @@
1
- import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
- export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
1
+ import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -1,4 +1,4 @@
1
- import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
1
+ import { CatalogCache, CommitRetryOptions, ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, IcebergAppendSinkOptions, IcebergCatalogConfig, IcebergColumn, IcebergColumnType, IcebergConnection, IcebergListedDataFile, IcebergPartitionField, IcebergPartitionSpec, IcebergPartitionSpecField, IcebergPartitionTransform, IcebergPrimitiveType, IcebergS3Config, IcebergSchema, IcebergSchemaField, IcebergSortOrder, IcebergSortOrderField, IcebergTableName, IcebergTableOpResult, IcebergTableSpec, ListIcebergDataFilesOptions, PartitionKeyEncoding, SEARCH_TYPE_INT, Sink, assertIcebergTable, connectIcebergCatalog, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables } from "../_chunks/sink.mjs";
2
2
  import { icebergCreateTable, icebergManifests, restCatalogLoadTable } from "../_chunks/libs/icebird.mjs";
3
3
  type IcebergAppendSink = Sink;
4
4
  /**
@@ -10,4 +10,4 @@ type IcebergAppendSink = Sink;
10
10
  * with no rows never touches the network.
11
11
  */
12
12
  declare function createIcebergAppendSink(options: IcebergAppendSinkOptions): IcebergAppendSink;
13
- export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
13
+ export { type CatalogCache, type CommitRetryOptions, type ConnectIcebergOptions, ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, type IcebergAppendSink, type IcebergAppendSinkOptions, type IcebergCatalogConfig, type IcebergColumn, type IcebergColumnType, type IcebergConnection, type IcebergListedDataFile, type IcebergPartitionField, type IcebergPartitionSpec, type IcebergPartitionSpecField, type IcebergPartitionTransform, type IcebergPrimitiveType, type IcebergS3Config, type IcebergSchema, type IcebergSchemaField, type IcebergSortOrder, type IcebergSortOrderField, type IcebergTableName, type IcebergTableOpResult, type IcebergTableSpec, type ListIcebergDataFilesOptions, type PartitionKeyEncoding, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
@@ -1,3 +1,4 @@
1
+ import { TABLE_METADATA } from "../_chunks/schema.mjs";
1
2
  import { engineErrors } from "../errors.mjs";
2
3
  import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
3
4
  import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
@@ -107,6 +108,23 @@ function icebergPartitionSpecFor(table, encoding = "string") {
107
108
  }))
108
109
  };
109
110
  }
111
/**
 * Build the Iceberg sort order for `table` from its `clusterKey`.
 *
 * Each cluster-key column becomes an identity / ascending / nulls-last sort
 * field that references the column's field id in the schema selected by
 * `encoding`.
 *
 * A table with an empty `clusterKey` yields the unsorted order. Iceberg
 * reserves order-id 0 for that case, so the id is 0 when there are no fields
 * and 1 otherwise.
 *
 * @param {string} table - key into the Iceberg schemas and `TABLE_METADATA`.
 * @param {string} [encoding="string"] - passed through to `icebergSchemasFor`.
 * @returns {{"order-id": number, fields: Array<object>}}
 * @throws {Error} when a cluster-key column is missing from the table schema.
 */
function icebergSortOrderFor(table, encoding = "string") {
  const columns = icebergSchemasFor(encoding)[table].columns;
  const fields = TABLE_METADATA[table].clusterKey.map((name) => {
    const col = columns.find((c) => c.name === name);
    if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
    return {
      "source-id": col.fieldId,
      "transform": "identity",
      "direction": "asc",
      "null-order": "nulls-last"
    };
  });
  return {
    // order-id 0 is reserved for "unsorted"; a fieldless order must use it.
    "order-id": fields.length === 0 ? 0 : 1,
    "fields": fields
  };
}
110
128
  const CATALOG_CONFIG_TTL_MS = 3600 * 1e3;
111
129
  function catalogConfigKey(config) {
112
130
  return `gsc-catalog-cfg\0${config.catalogUri}\0${config.warehouse}`;
@@ -187,7 +205,8 @@ async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "st
187
205
  namespace: conn.namespace,
188
206
  table,
189
207
  schema: icebergSchemaFor(table, encoding),
190
- partitionSpec: icebergPartitionSpecFor(table, encoding)
208
+ partitionSpec: icebergPartitionSpecFor(table, encoding),
209
+ sortOrder: icebergSortOrderFor(table, encoding)
191
210
  }).then(() => results.push({
192
211
  table,
193
212
  outcome: ok(void 0)
@@ -369,6 +388,24 @@ function dedupeByIdentity(table, records) {
369
388
  }
370
389
  return seen.size === records.length ? records : [...seen.values()];
371
390
  }
391
/**
 * Return `records` ordered by the table's `clusterKey` columns, ascending.
 *
 * The input array is never mutated. It is returned as-is when the table has no
 * cluster key or there are fewer than two records; otherwise a sorted copy is
 * returned.
 *
 * Ordering per column:
 * - null/undefined values sort last, matching the `nulls-last` sort order
 *   declared for the Iceberg tables; two missing values compare equal.
 * - number pairs compare numerically, and so do bigint pairs.
 * - everything else compares by its `String()` form.
 *
 * @param {string} table - key into `TABLE_METADATA`.
 * @param {Array<object>} records - rows to order.
 * @returns {Array<object>} `records` itself or a sorted shallow copy.
 */
function sortByClusterKey(table, records) {
  const cols = TABLE_METADATA[table].clusterKey;
  if (cols.length === 0 || records.length < 2) return records;
  return records.slice().sort((a, b) => {
    for (const col of cols) {
      const av = a[col];
      const bv = b[col];
      if (av === bv) continue;
      if (av == null) {
        // null vs undefined: both missing, so fall through to the next column.
        if (bv == null) continue;
        return 1;
      }
      if (bv == null) return -1;
      if (typeof av === "number" && typeof bv === "number") return av - bv;
      // Stringified bigints would order "10" before "9"; compare them directly.
      if (typeof av === "bigint" && typeof bv === "bigint") return av < bv ? -1 : 1;
      const as = String(av);
      const bs = String(bv);
      if (as !== bs) return as < bs ? -1 : 1;
    }
    return 0;
  });
}
372
409
  function toRecords(slice, rows, encoding) {
373
410
  const siteVal = encoding === "int" ? toIntPartitionSiteId(slice.ctx.siteId) : slice.ctx.siteId ?? "";
374
411
  const searchVal = encoding === "int" ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
@@ -423,7 +460,7 @@ function createIcebergAppendSink(options) {
423
460
  }
424
461
  for (const [table, records] of buffers) {
425
462
  if (records.length === 0) continue;
426
- const deduped = dedupeByIdentity(table, records);
463
+ const deduped = sortByClusterKey(table, dedupeByIdentity(table, records));
427
464
  await icebergAppendRetrying({
428
465
  catalog: conn.catalog,
429
466
  namespace: conn.namespace,
@@ -447,4 +484,4 @@ function createIcebergAppendSink(options) {
447
484
  }
448
485
  };
449
486
  }
450
- export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
487
+ export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergSortOrderFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
@@ -244,6 +244,12 @@ declare function runWindowed(opts: {
244
244
  start: string;
245
245
  end: string;
246
246
  }) => string;
247
+ /**
248
+ * Extra named file sets merged into every window's `runSQL` (alongside the
249
+ * windowed `FILES`). Use to JOIN a non-windowed sidecar (e.g. the query
250
+ * dimension parquet via `{ QUERY_DIM: { keys: [...] } }`) inside `sqlFor`.
251
+ */
252
+ extraFileSets?: Record<string, FileSetRef>;
247
253
  }): Promise<Row$1[]>;
248
254
  /**
249
255
  * Daily totals across the full history. One row per (date, table) with
package/dist/rollups.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  import "./_chunks/layout.mjs";
2
2
  import { engineErrors } from "./errors.mjs";
3
3
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
4
- import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
4
+ import { createIndexingMetadataStore, createQueryDimStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
5
5
  import { MS_PER_DAY } from "gscdump";
6
6
  function rollupPrefix(ctx, searchType) {
7
7
  const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -237,10 +237,13 @@ async function runWindowed(opts) {
237
237
  const result = await opts.engine.runSQL({
238
238
  ctx: opts.ctx,
239
239
  table: opts.table,
240
- fileSets: { FILES: {
241
- table: opts.table,
242
- partitions: w.partitions
243
- } },
240
+ fileSets: {
241
+ FILES: {
242
+ table: opts.table,
243
+ partitions: w.partitions
244
+ },
245
+ ...opts.extraFileSets
246
+ },
244
247
  sql: opts.sqlFor(w),
245
248
  ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
246
249
  });
@@ -638,23 +641,41 @@ const queryCanonicalDailyRollup = {
638
641
  }
639
642
  ],
640
643
  parquetSortKey: ["date", "query_canonical"],
641
- async build({ engine, ctx, searchType }) {
644
+ async build({ engine, ctx, dataSource, searchType }) {
645
+ const dimStore = createQueryDimStore({ dataSource });
646
+ const useDim = await dimStore.loadMeta(ctx) !== null;
647
+ const canonExpr = useDim ? `COALESCE(qd.query_canonical, NULLIF(q.query_canonical, ''), q.query)` : `COALESCE(NULLIF(query_canonical, ''), query)`;
642
648
  return (await runWindowed({
643
649
  engine,
644
650
  ctx,
645
651
  table: "queries",
646
652
  ...searchType !== void 0 ? { searchType } : {},
647
- sqlFor: (w) => `
648
- SELECT
649
- COALESCE(NULLIF(query_canonical, ''), query) AS query_canonical,
650
- CAST(date AS VARCHAR) AS date,
651
- SUM(clicks)::BIGINT AS clicks,
652
- SUM(impressions)::BIGINT AS impressions,
653
- SUM(sum_position)::DOUBLE AS sum_position
654
- FROM read_parquet({{FILES}}, union_by_name = true)
655
- WHERE date >= '${w.start}' AND date <= '${w.end}'
656
- GROUP BY COALESCE(NULLIF(query_canonical, ''), query), date
657
- `
653
+ ...useDim ? { extraFileSets: { QUERY_DIM: {
654
+ table: "queries",
655
+ keys: [dimStore.parquetKey(ctx)]
656
+ } } } : {},
657
+ sqlFor: useDim ? (w) => `
658
+ SELECT
659
+ ${canonExpr} AS query_canonical,
660
+ CAST(q.date AS VARCHAR) AS date,
661
+ SUM(q.clicks)::BIGINT AS clicks,
662
+ SUM(q.impressions)::BIGINT AS impressions,
663
+ SUM(q.sum_position)::DOUBLE AS sum_position
664
+ FROM read_parquet({{FILES}}, union_by_name = true) q
665
+ LEFT JOIN read_parquet({{QUERY_DIM}}, union_by_name = true) qd ON q.query = qd.query
666
+ WHERE q.date >= '${w.start}' AND q.date <= '${w.end}'
667
+ GROUP BY ${canonExpr}, q.date
668
+ ` : (w) => `
669
+ SELECT
670
+ ${canonExpr} AS query_canonical,
671
+ CAST(date AS VARCHAR) AS date,
672
+ SUM(clicks)::BIGINT AS clicks,
673
+ SUM(impressions)::BIGINT AS impressions,
674
+ SUM(sum_position)::DOUBLE AS sum_position
675
+ FROM read_parquet({{FILES}}, union_by_name = true)
676
+ WHERE date >= '${w.start}' AND date <= '${w.end}'
677
+ GROUP BY ${canonExpr}, date
678
+ `
658
679
  })).map((r) => ({
659
680
  query_canonical: String(r.query_canonical),
660
681
  date: String(r.date),
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.30.0",
4
+ "version": "0.31.1",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
191
191
  "hyparquet": "^1.26.1",
192
192
  "hyparquet-writer": "^0.16.1",
193
193
  "proper-lockfile": "^4.1.2",
194
- "@gscdump/contracts": "0.30.0",
195
- "gscdump": "0.30.0"
194
+ "@gscdump/contracts": "0.31.1",
195
+ "gscdump": "0.31.1"
196
196
  },
197
197
  "devDependencies": {
198
198
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -208,6 +208,7 @@
208
208
  "build": "obuild",
209
209
  "typecheck": "tsc --noEmit",
210
210
  "test": "vitest",
211
+ "benchmark-store": "tsx scripts/benchmark-store.mts",
211
212
  "r2-harness": "tsx scripts/r2-contention-harness.ts",
212
213
  "backfill-audit": "tsx scripts/backfill-audit.ts"
213
214
  }