@gscdump/engine 0.29.0 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,105 @@ async function readOptional(ds, key, signal) {
16
16
  throw e;
17
17
  });
18
18
  }
19
/**
 * Parquet column layout of the query-dimension index, in write order.
 * Each entry is `{ name, type, nullable }`; every column is non-nullable.
 */
const QUERY_DIM_COLUMNS = [
  ["query", "VARCHAR"],
  ["query_canonical", "VARCHAR"],
  ["intent_code", "INTEGER"],
  ["normalizer_version", "INTEGER"],
  ["intent_version", "INTEGER"]
].map(([name, type]) => ({ name, type, nullable: false }));
46
/**
 * Storage prefix (no trailing slash) for a tenant's query-dimension files.
 * Site-scoped when `ctx.siteId` is truthy, otherwise scoped to the user only.
 */
function queryDimPrefix(ctx) {
  const userRoot = `u_${ctx.userId}`;
  if (ctx.siteId) return `${userRoot}/${ctx.siteId}/entities/query_dim`;
  return `${userRoot}/entities/query_dim`;
}
49
+ function queryDimParquetKey(ctx) {
50
+ return `${queryDimPrefix(ctx)}/index.parquet`;
51
+ }
52
+ function queryDimMetaKey(ctx) {
53
+ return `${queryDimPrefix(ctx)}/index.json`;
54
+ }
55
+ function buildQueryDimRecords(queries, deps) {
56
+ const seen = /* @__PURE__ */ new Set();
57
+ const out = [];
58
+ for (const raw of queries) {
59
+ const query = String(raw);
60
+ if (query.trim() === "" || seen.has(query)) continue;
61
+ seen.add(query);
62
+ const canonical = deps.normalizeQuery(query);
63
+ out.push({
64
+ query,
65
+ query_canonical: canonical === "" ? query : canonical,
66
+ intent_code: deps.classifyIntentCode(query),
67
+ normalizer_version: deps.normalizerVersion,
68
+ intent_version: deps.intentVersion
69
+ });
70
+ }
71
+ return out;
72
+ }
73
+ function createQueryDimStore({ dataSource }) {
74
+ async function exists(key, prefix) {
75
+ return (await dataSource.list(prefix)).includes(key);
76
+ }
77
+ return {
78
+ parquetKey: queryDimParquetKey,
79
+ async write(ctx, records, builtAt) {
80
+ const parquetKey = queryDimParquetKey(ctx);
81
+ const bytes = encodeRowsToParquetFlex(records, {
82
+ columns: QUERY_DIM_COLUMNS,
83
+ sortKey: ["query"]
84
+ });
85
+ await dataSource.write(parquetKey, bytes);
86
+ const meta = {
87
+ version: 1,
88
+ builtAt,
89
+ rowCount: records.length,
90
+ normalizerVersion: records[0]?.normalizer_version ?? 0,
91
+ intentVersion: records[0]?.intent_version ?? 0
92
+ };
93
+ await dataSource.write(queryDimMetaKey(ctx), new TextEncoder().encode(JSON.stringify(meta)));
94
+ return {
95
+ parquetKey,
96
+ rowCount: records.length
97
+ };
98
+ },
99
+ async loadMeta(ctx) {
100
+ const key = queryDimMetaKey(ctx);
101
+ if (!await exists(key, `${queryDimPrefix(ctx)}/`)) return null;
102
+ const bytes = await dataSource.read(key);
103
+ return JSON.parse(new TextDecoder().decode(bytes));
104
+ },
105
+ async loadRecords(ctx) {
106
+ const key = queryDimParquetKey(ctx);
107
+ if (!await exists(key, `${queryDimPrefix(ctx)}/`)) return [];
108
+ return (await decodeParquetToRows(await dataSource.read(key))).map((r) => ({
109
+ query: String(r.query),
110
+ query_canonical: String(r.query_canonical),
111
+ intent_code: Number(r.intent_code),
112
+ normalizer_version: Number(r.normalizer_version),
113
+ intent_version: Number(r.intent_version)
114
+ }));
115
+ }
116
+ };
117
+ }
19
118
  const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
20
119
  function inspectionIndexKey(ctx) {
21
120
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
@@ -893,4 +992,4 @@ function createEmptyTypesStore(opts) {
893
992
  }
894
993
  };
895
994
  }
896
- export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
995
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -133,7 +133,7 @@ interface Snapshot {
133
133
  'sequence-number': number;
134
134
  'timestamp-ms': number;
135
135
  'manifest-list': string;
136
- manifests?: Manifest$1[];
136
+ manifests?: Manifest[];
137
137
  summary: {
138
138
  // spec: "value of these fields should be of string type"
139
139
  operation: string; // 'spark.app.id'?: string
@@ -192,7 +192,7 @@ interface MetadataLog {
192
192
  'timestamp-ms': number;
193
193
  'metadata-file': string;
194
194
  }
195
- interface Manifest$1 {
195
+ interface Manifest {
196
196
  manifest_path: string;
197
197
  manifest_length: bigint;
198
198
  partition_spec_id: number;
@@ -3319,24 +3319,38 @@ async function icebergManifests({ metadata, resolver, snapshotId, partitionFilte
3319
3319
  });
3320
3320
  return await fetchManifests(manifests, resolver);
3321
3321
  }
3322
+ const MANIFEST_FETCH_CONCURRENCY = 8;
3323
+ async function fetchOneManifest(manifest, resolver) {
3324
+ const url = manifest.manifest_path;
3325
+ const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
3326
+ for (const entry of entries) {
3327
+ entry.partition_spec_id = manifest.partition_spec_id ?? 0;
3328
+ if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number ?? 0n;
3329
+ if (entry.status === 1) {
3330
+ if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number;
3331
+ if (entry.file_sequence_number === void 0) entry.file_sequence_number = manifest.sequence_number;
3332
+ } else if (entry.sequence_number === void 0 || entry.file_sequence_number === void 0) throw new Error("iceberg manifest entry missing sequence number");
3333
+ }
3334
+ assignFirstRowIds(manifest, entries);
3335
+ return {
3336
+ url,
3337
+ entries
3338
+ };
3339
+ }
3322
3340
  async function fetchManifests(manifests, resolver) {
3323
- return await Promise.all(manifests.map(async (manifest) => {
3324
- const url = manifest.manifest_path;
3325
- const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
3326
- for (const entry of entries) {
3327
- entry.partition_spec_id = manifest.partition_spec_id ?? 0;
3328
- if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number ?? 0n;
3329
- if (entry.status === 1) {
3330
- if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number;
3331
- if (entry.file_sequence_number === void 0) entry.file_sequence_number = manifest.sequence_number;
3332
- } else if (entry.sequence_number === void 0 || entry.file_sequence_number === void 0) throw new Error("iceberg manifest entry missing sequence number");
3333
- }
3334
- assignFirstRowIds(manifest, entries);
3335
- return {
3336
- url,
3337
- entries
3338
- };
3339
- }));
3341
+ const results = new Array(manifests.length);
3342
+ let next = 0;
3343
+ async function worker() {
3344
+ while (next < manifests.length) {
3345
+ const i = next++;
3346
+ results[i] = await fetchOneManifest(manifests[i], resolver);
3347
+ }
3348
+ }
3349
+ const poolSize = Math.min(MANIFEST_FETCH_CONCURRENCY, manifests.length);
3350
+ const workers = [];
3351
+ for (let w = 0; w < poolSize; w++) workers.push(worker());
3352
+ await Promise.all(workers);
3353
+ return results;
3340
3354
  }
3341
3355
  function assignFirstRowIds(manifest, entries) {
3342
3356
  if (manifest.content !== 0 || manifest.first_row_id == null) return;
@@ -13,7 +13,15 @@ declare const pgResolverAdapter: ResolverAdapter<PgTableKey>;
13
13
  * Single-use: build a fresh adapter per query. Cheap (no I/O) and avoids
14
14
  * accidental adapter caching that would lock in a stale `{{FILES}}` set.
15
15
  */
16
- declare function createParquetResolverAdapter(): ResolverAdapter<PgTableKey>;
16
/** Options accepted by the resolver-adapter factories. */
interface ResolverAdapterOptions {
  /**
   * Opt-in canonical-primary correctness. When true, a NULL or `''`
   * `query_canonical` is replaced by the raw `query`, so the canonical value
   * is a total GROUP BY / join key. Defaults to false, which keeps the legacy
   * raw-column behaviour. See ADR-0018.
   */
  canonicalFallback?: boolean;
}
24
+ declare function createParquetResolverAdapter(options?: ResolverAdapterOptions): ResolverAdapter<PgTableKey>;
17
25
  /**
18
26
  * Multi-tenant pg-flavored adapter for the Iceberg / R2 SQL read path.
19
27
  * Identical SQL output to `pgResolverAdapter` except WHERE clauses inject
@@ -24,5 +32,5 @@ declare function createParquetResolverAdapter(): ResolverAdapter<PgTableKey>;
24
32
  * so callers must rewrite bare table names to their qualified form (e.g.
25
33
  * `${namespace}.pages`) before sending to R2 SQL.
26
34
  */
27
- declare function createIcebergResolverAdapter(): ResolverAdapter<PgTableKey>;
28
- export { PgTableKey, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter };
35
+ declare function createIcebergResolverAdapter(options?: ResolverAdapterOptions): ResolverAdapter<PgTableKey>;
36
+ export { PgTableKey, ResolverAdapterOptions, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter };
@@ -180,7 +180,7 @@ function buildDimensionColumnMap(datasetToTableKey) {
180
180
  return Object.fromEntries(entries);
181
181
  }
182
182
  function createSqlFragments(config) {
183
- const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, includeSearchType, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride } = config;
183
+ const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, includeSearchType, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride, canonicalFallback = false } = config;
184
184
  const DIM_COLUMN_MAP = buildDimensionColumnMap(datasetToTableKey);
185
185
  function isMetricDimension(dim) {
186
186
  return METRIC_NAMES.includes(dim);
@@ -217,6 +217,7 @@ function createSqlFragments(config) {
217
217
  function dimExprSql(dim, tableKey) {
218
218
  const colName = dimColumn(dim, tableKey);
219
219
  if (dim === "page") return sql.raw(urlToPathExpr(colName));
220
+ if (canonicalFallback && dim === "queryCanonical") return sql`COALESCE(NULLIF(${colRef(tableKey, colName)}, ''), ${colRef(tableKey, "query")})`;
220
221
  return colRef(tableKey, colName);
221
222
  }
222
223
  function metricSql(metric, tableKey) {
@@ -296,8 +297,10 @@ function createSqlFragments(config) {
296
297
  if (isMetricDimension(f.dimension)) continue;
297
298
  if (f.dimension === "date") continue;
298
299
  if (f.operator === "topLevel") continue;
299
- const cRef = colRef(tableKey, dimColumn(f.dimension, tableKey));
300
- const matchExpr = f.dimension === "page" ? dimExprSql(f.dimension, tableKey) : cRef;
300
+ const dim = f.dimension;
301
+ const cRef = colRef(tableKey, dimColumn(dim, tableKey));
302
+ const matchExpr = dim === "page" || dim === "queryCanonical" ? dimExprSql(dim, tableKey) : cRef;
303
+ const patternExpr = dim === "queryCanonical" ? matchExpr : cRef;
301
304
  switch (f.operator) {
302
305
  case "equals":
303
306
  preds.push(sql`${matchExpr} = ${f.expression}`);
@@ -306,16 +309,16 @@ function createSqlFragments(config) {
306
309
  preds.push(sql`${matchExpr} != ${f.expression}`);
307
310
  break;
308
311
  case "contains":
309
- preds.push(sql`${cRef} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
312
+ preds.push(sql`${patternExpr} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
310
313
  break;
311
314
  case "notContains":
312
- preds.push(sql`${cRef} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
315
+ preds.push(sql`${patternExpr} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
313
316
  break;
314
317
  case "includingRegex":
315
- preds.push(regexPredicate(cRef, f.expression, false));
318
+ preds.push(regexPredicate(patternExpr, f.expression, false));
316
319
  break;
317
320
  case "excludingRegex":
318
- preds.push(regexPredicate(cRef, f.expression, true));
321
+ preds.push(regexPredicate(patternExpr, f.expression, true));
319
322
  break;
320
323
  }
321
324
  }
@@ -431,23 +434,37 @@ const pgResolverAdapter = createResolverAdapter({
431
434
  ...PG_BASE_CONFIG,
432
435
  tableLabel: "pg-resolver-adapter"
433
436
  });
434
- function createParquetResolverAdapter() {
437
+ function createParquetResolverAdapter(options = {}) {
435
438
  return createResolverAdapter({
436
439
  ...PG_BASE_CONFIG,
437
440
  tableLabel: "parquet-resolver-adapter",
441
+ canonicalFallback: options.canonicalFallback ?? false,
438
442
  tableRef: (tk) => sql.raw(`read_parquet({{FILES}}, union_by_name = true) AS "${tk}"`)
439
443
  });
440
444
  }
441
- function createIcebergResolverAdapter() {
445
+ function createIcebergResolverAdapter(options = {}) {
442
446
  return createResolverAdapter({
443
447
  ...PG_BASE_CONFIG,
444
448
  schema: icebergSchema,
445
449
  includeSiteId: true,
446
450
  includeSearchType: true,
447
451
  tableLabel: "iceberg-resolver-adapter",
452
+ canonicalFallback: options.canonicalFallback ?? false,
448
453
  tableRef: (tk) => sql.raw(`"${tk}"`)
449
454
  });
450
455
  }
456
+ const ALLOWED_FILTER_DIMS = /* @__PURE__ */ new Set(["date", "queryCanonical"]);
457
+ function planCoveredByCanonicalRollup(plan) {
458
+ if (plan.dataset !== "queries") return false;
459
+ if (plan.groupByDimensions.length !== 1 || plan.groupByDimensions[0] !== "queryCanonical") return false;
460
+ if (!plan.dimensionFilters.every((f) => ALLOWED_FILTER_DIMS.has(f.dimension))) return false;
461
+ if (plan.prefilters.length > 0) return false;
462
+ if (plan.specialFilters.topLevel) return false;
463
+ return true;
464
+ }
465
+ function canonicalRollupCovers(state, capabilities) {
466
+ return planCoveredByCanonicalRollup(buildLogicalPlan(state, capabilities));
467
+ }
451
468
  const COMPARISON_FILTER_SQL = {
452
469
  new: sql`AND COALESCE(p.impressions, 0) = 0 AND COALESCE(c.impressions, 0) > 0`,
453
470
  lost: sql`AND COALESCE(p.impressions, 0) > 0 AND COALESCE(c.impressions, 0) = 0`,
@@ -726,7 +743,8 @@ function buildExtrasQueries(state, options) {
726
743
  whereParts.push(sql`${adapter.dateColRef(queriesKey)} <= ${plan.dateRange.endDate}`);
727
744
  const whereExpr = whereParts.length > 0 ? sql`WHERE ${joinAnd(whereParts)}` : sql``;
728
745
  const outerQueryCol = sql.raw("query");
729
- const compiled = compileCollapsed(adapter, sql`WITH per_variant AS (SELECT ${t.query_canonical} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${t.query_canonical} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${t.query_canonical}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${t.query_canonical}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`);
746
+ const canonKey = sql`COALESCE(NULLIF(${t.query_canonical}, ''), ${t.query})`;
747
+ const compiled = compileCollapsed(adapter, sql`WITH per_variant AS (SELECT ${canonKey} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${canonKey} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${canonKey}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${canonKey}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`);
730
748
  extras.push({
731
749
  key: "canonicalExtras",
732
750
  sql: compiled.sql,
@@ -802,6 +820,22 @@ function mergeExtras(rows, extrasResults) {
802
820
  return enriched;
803
821
  });
804
822
  }
823
+ const EXTRA_ROLLUP_IDS = { canonicalExtras: "query_canonical_variants" };
824
+ function createRollupExtrasOverlay(readRollupRows) {
825
+ return async ({ key, ctx, dateRange }) => {
826
+ const id = EXTRA_ROLLUP_IDS[key];
827
+ if (id === void 0) return null;
828
+ return readRollupRows({
829
+ id,
830
+ ctx: {
831
+ userId: ctx.userId,
832
+ siteId: ctx.siteId
833
+ },
834
+ dateRange,
835
+ ...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
836
+ });
837
+ };
838
+ }
805
839
  function collectInternalFilters(filter) {
806
840
  if (!filter || !("_filters" in filter)) return [];
807
841
  const flat = filter._filters;
@@ -856,6 +890,9 @@ function matchesMetricFilter(row, filter) {
856
890
  function matchesTopLevelPage(row) {
857
891
  return (normalizeUrl(dimensionValue(row, "page")).match(/\//g)?.length ?? 0) <= 1;
858
892
  }
893
/**
 * True when `source` declares no `coversThrough` bound, or when `windowEnd`
 * does not exceed that bound.
 */
function canonicalSourceWithinCoverage(source, windowEnd) {
  const { coversThrough } = source;
  if (coversThrough === undefined) return true;
  return windowEnd <= coversThrough;
}
859
896
  function runArgs(ctx, partitions) {
860
897
  return {
861
898
  ctx: {
@@ -870,9 +907,11 @@ function runArgs(ctx, partitions) {
870
907
  ...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
871
908
  };
872
909
  }
873
- async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
874
- const adapter = createParquetResolverAdapter();
910
+ async function runOptimizedQuery(runSQL, ctx, state, dateRange, options = {}) {
875
911
  const base = runArgs(ctx, enumeratePartitions(dateRange.startDate, dateRange.endDate));
912
+ const probe = createParquetResolverAdapter({ canonicalFallback: options.canonicalFallback ?? false });
913
+ const useCanonicalSource = options.canonicalSource !== void 0 && (options.canonicalFallback ?? false) && canonicalSourceWithinCoverage(options.canonicalSource, dateRange.endDate) && canonicalRollupCovers(state, probe.capabilities);
914
+ const adapter = useCanonicalSource ? createParquetResolverAdapter({ canonicalFallback: false }) : probe;
876
915
  const optimized = resolveToSQLOptimized(state, {
877
916
  adapter,
878
917
  siteId: void 0
@@ -881,15 +920,31 @@ async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
881
920
  adapter,
882
921
  siteId: void 0
883
922
  });
884
- const [optRes, ...extrasRows] = await Promise.all([runSQL({
923
+ const mainArgs = useCanonicalSource ? {
885
924
  ...base,
925
+ fileSets: { FILES: {
926
+ table: ctx.table,
927
+ keys: options.canonicalSource.keys
928
+ } }
929
+ } : base;
930
+ const resolveExtra = options.resolveExtra;
931
+ const [optRes, ...extrasRows] = await Promise.all([runSQL({
932
+ ...mainArgs,
886
933
  sql: optimized.sql,
887
934
  params: optimized.params
888
- }), ...extras.map((e) => runSQL({
889
- ...base,
890
- sql: e.sql,
891
- params: e.params
892
- }))]);
935
+ }), ...extras.map(async (e) => {
936
+ const overlaid = resolveExtra ? await resolveExtra({
937
+ key: e.key,
938
+ state,
939
+ ctx,
940
+ dateRange
941
+ }) : null;
942
+ return overlaid !== null ? { rows: overlaid } : runSQL({
943
+ ...base,
944
+ sql: e.sql,
945
+ params: e.params
946
+ });
947
+ })]);
893
948
  const firstRow = optRes.rows[0];
894
949
  const totalCount = Number(firstRow?.totalCount ?? 0);
895
950
  const totals = {
@@ -911,8 +966,10 @@ async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
911
966
  }))
912
967
  };
913
968
  }
914
- async function runComparisonQuery(runSQL, ctx, current, previous, windows, filter) {
915
- const adapter = createParquetResolverAdapter();
969
+ async function runComparisonQuery(runSQL, ctx, current, previous, windows, filter, options = {}) {
970
+ const probe = createParquetResolverAdapter({ canonicalFallback: options.canonicalFallback ?? false });
971
+ const useCanonicalSource = options.canonicalSource !== void 0 && (options.canonicalFallback ?? false) && canonicalSourceWithinCoverage(options.canonicalSource, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate) && canonicalRollupCovers(current, probe.capabilities) && canonicalRollupCovers(previous, probe.capabilities);
972
+ const adapter = useCanonicalSource ? createParquetResolverAdapter({ canonicalFallback: false }) : probe;
916
973
  const comparison = resolveComparisonSQL(current, previous, {
917
974
  adapter,
918
975
  siteId: void 0
@@ -921,7 +978,14 @@ async function runComparisonQuery(runSQL, ctx, current, previous, windows, filte
921
978
  adapter,
922
979
  siteId: void 0
923
980
  });
924
- const base = runArgs(ctx, enumeratePartitions(windows.current.startDate < windows.previous.startDate ? windows.current.startDate : windows.previous.startDate, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate));
981
+ const partitions = enumeratePartitions(windows.current.startDate < windows.previous.startDate ? windows.current.startDate : windows.previous.startDate, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate);
982
+ const base = useCanonicalSource ? {
983
+ ...runArgs(ctx, partitions),
984
+ fileSets: { FILES: {
985
+ table: ctx.table,
986
+ keys: options.canonicalSource.keys
987
+ } }
988
+ } : runArgs(ctx, partitions);
925
989
  const main = await runSQL({
926
990
  ...base,
927
991
  sql: comparison.sql,
@@ -953,4 +1017,4 @@ function assertSchemaInSync(options) {
953
1017
  if (missing.length > 0 || extra.length > 0) throw new Error(`${label} drizzle schema for '${key}' drifted from SCHEMAS. Missing: [${missing.join(", ")}]. Extra: [${extra.join(", ")}].`);
954
1018
  }
955
1019
  }
956
- export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
1020
+ export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
@@ -1,6 +1,60 @@
1
1
  import { DataSource } from "./_chunks/storage.mjs";
2
2
  import { ScheduleState } from "./schedule.mjs";
3
3
  import { ColumnDef, TenantCtx } from "@gscdump/contracts";
4
/** One row of the query-dimension index. */
interface QueryDimRecord {
  /** The raw query string. */
  query: string;
  /** Lexical canonical form; never empty, because NULL/'' folds to the raw query. */
  query_canonical: string;
  /** Packed search-intent code (see `@gscdump/analysis` `encodeIntent`). */
  intent_code: number;
  /** Version of the normalizer that produced `query_canonical`. */
  normalizer_version: number;
  /** Version of the classifier that produced `intent_code`. */
  intent_version: number;
}
13
/**
 * JSON sidecar stored next to the parquet: versions and freshness that can be
 * read without decoding the parquet itself.
 */
interface QueryDimMeta {
  /** Sidecar format version. */
  version: 1;
  /** Build time supplied by the caller of `write`. */
  builtAt: number;
  /** Number of records written to the parquet. */
  rowCount: number;
  normalizerVersion: number;
  intentVersion: number;
}
21
/** Object key of the tenant's query-dimension parquet file. */
declare function queryDimParquetKey(ctx: TenantCtx): string;
/** Object key of the tenant's query-dimension JSON sidecar. */
declare function queryDimMetaKey(ctx: TenantCtx): string;
23
/**
 * Derivation functions injected by the host. `engine` never imports
 * `@gscdump/analysis`; the host supplies `normalizeQuery` and
 * `classifyIntentCode` (e.g. `encodeIntent ∘ classifyQueryIntent`) together
 * with their version constants.
 */
interface QueryDimDeps {
  /** Maps a raw query to its canonical form. */
  normalizeQuery: (query: string) => string;
  normalizerVersion: number;
  /** Returns the packed intent code for a raw query. */
  classifyIntentCode: (query: string) => number;
  intentVersion: number;
}
35
/**
 * Pure: distinct raw queries → dimension records. De-dupes raw queries, drops
 * empty or whitespace-only ones, and folds an empty canonical back to the raw
 * query so the key is total (matches the read path's
 * `COALESCE(NULLIF(query_canonical, ''), query)`).
 */
declare function buildQueryDimRecords(queries: Iterable<string>, deps: QueryDimDeps): QueryDimRecord[];
41
/** Persistence API for the per-tenant query-dimension index. */
interface QueryDimStore {
  /** Key of the tenant's parquet file. */
  parquetKey: (ctx: TenantCtx) => string;
  /** Writes the parquet and its JSON sidecar. Last-write-wins; no history. */
  write: (ctx: TenantCtx, records: readonly QueryDimRecord[], builtAt: number) => Promise<{
    parquetKey: string;
    rowCount: number;
  }>;
  /** Reads the sidecar (versions + freshness); null on first build. */
  loadMeta: (ctx: TenantCtx) => Promise<QueryDimMeta | null>;
  /** Decodes the dimension rows (test/inspection; reads JOIN the parquet by key). */
  loadRecords: (ctx: TenantCtx) => Promise<QueryDimRecord[]>;
}
53
/** Creates a `QueryDimStore` backed by the given `dataSource`. */
declare function createQueryDimStore({
  dataSource
}: {
  dataSource: DataSource;
}): QueryDimStore;
4
58
  /**
5
59
  * GSC URL inspection result fields we persist. Mirrors the
6
60
  * `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
@@ -442,4 +496,4 @@ interface CreateEmptyTypesStoreOptions {
442
496
  now?: () => number;
443
497
  }
444
498
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
445
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
499
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, QueryDimDeps, QueryDimMeta, QueryDimRecord, QueryDimStore, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/entities.mjs CHANGED
@@ -1,2 +1,2 @@
1
- import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
- export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
1
+ import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -1,7 +1,7 @@
1
1
  import { SearchType as SearchType$1, TableName as TableName$1 } from "../_chunks/storage.mjs";
2
2
  import { ComparisonFilter, ExtraQuery, ResolvedComparisonSQL, ResolvedSQL, ResolvedSQLOptimized, ResolverAdapter, ResolverOptions } from "../_chunks/types.mjs";
3
- import { PgTableKey, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "../_chunks/pg-adapter.mjs";
4
- import { LogicalDataset, LogicalDataset as LogicalDataset$1, PlannerCapabilities, UnresolvableDatasetError, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
3
+ import { PgTableKey, ResolverAdapterOptions, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "../_chunks/pg-adapter.mjs";
4
+ import { LogicalDataset, LogicalDataset as LogicalDataset$1, LogicalQueryPlan, PlannerCapabilities, UnresolvableDatasetError, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
5
5
  import { SQL } from "drizzle-orm";
6
6
  import { BuilderState, Dimension, FilterInput, InternalFilter, Metric } from "gscdump/query";
7
7
  import { Grain, TableName } from "@gscdump/contracts";
@@ -35,6 +35,19 @@ interface SqlFragmentsConfig<TableKey extends string> {
35
35
  * against the alias.
36
36
  */
37
37
  tableRef?: (tableKey: TableKey) => SQL;
38
+ /**
39
+ * Opt-in correctness for canonical-primary lookups. When true, the
40
+ * `queryCanonical` dimension expression falls back to the raw `query` when
41
+ * the stored `query_canonical` is NULL (no normalizer ran at ingest) or `''`
42
+ * (a fully-stripped query like "free online"), i.e.
43
+ * `COALESCE(NULLIF(query_canonical, ''), query)`. This makes canonical a
44
+ * TOTAL key, valid for GROUP BY / comparison joins.
45
+ *
46
+ * Default (false) preserves legacy behaviour: the raw nullable column, so a
47
+ * NULL/'' bucket pollutes top results and — because `NULL = NULL` is UNKNOWN
48
+ * — double-counts in the gaining/losing FULL OUTER JOIN. See ADR-0018.
49
+ */
50
+ canonicalFallback?: boolean;
38
51
  }
39
52
  interface SqlFragments<TableKey extends string> {
40
53
  METRIC_NAMES: Metric[];
@@ -65,6 +78,16 @@ interface CreateResolverAdapterConfig<TableKey extends string> extends SqlFragme
65
78
  capabilities: PlannerCapabilities;
66
79
  }
67
80
  declare function createResolverAdapter<TableKey extends string>(config: CreateResolverAdapterConfig<TableKey>): ResolverAdapter<TableKey>;
81
+ /**
82
+ * True when `plan` can be served from the canonical-grained rollup instead of
83
+ * the raw `queries` fact partitions. Conservative: anything that would read a
84
+ * dropped column or the raw row grain disqualifies the query, so a false
85
+ * negative just falls back to live aggregation (correct, slower) — never wrong
86
+ * data.
87
+ */
88
+ declare function planCoveredByCanonicalRollup(plan: LogicalQueryPlan): boolean;
89
+ /** State-level convenience: build the plan then gate. */
90
+ declare function canonicalRollupCovers(state: BuilderState, capabilities: PlannerCapabilities): boolean;
68
91
  declare function resolveToSQLOptimized<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): ResolvedSQLOptimized;
69
92
  declare function resolveToSQL<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): ResolvedSQL;
70
93
  declare function buildTotalsSql<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): {
@@ -77,14 +100,6 @@ declare function mergeExtras(rows: Record<string, unknown>[], extrasResults: {
77
100
  key: string;
78
101
  results: Record<string, unknown>[];
79
102
  }[]): Record<string, unknown>[];
80
- declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
81
- declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
82
- declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
83
- declare function metricValue(row: Record<string, unknown>, metric: string): number;
84
- declare function dimensionValue(row: Record<string, unknown>, dimension: string): string;
85
- declare function matchesDimensionFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
86
- declare function matchesMetricFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
87
- declare function matchesTopLevelPage(row: Record<string, unknown>): boolean;
88
103
  interface RunQueryCtx {
89
104
  userId: string;
90
105
  siteId: string;
@@ -118,6 +133,60 @@ interface RunSQLFn {
118
133
  rows: Array<Record<string, unknown>>;
119
134
  }>;
120
135
  }
136
+ /**
137
+ * Optional overlay that serves a resolver extra (e.g. canonical-variant
138
+ * grouping, keyed `'canonicalExtras'`) from a precomputed source — typically a
139
+ * materialised rollup — instead of the live window-function SQL. Return the
140
+ * rows in the exact shape the live extra produces (`mergeExtras` consumes
141
+ * either source unchanged), or `null` to decline so the caller falls back to
142
+ * the live query. Pure seam: storage/tenant routing lives in the host's
143
+ * implementation, not here. See ADR-0017.
144
+ */
145
+ interface ResolveExtraFn {
146
+ (opts: {
147
+ key: string;
148
+ state: BuilderState;
149
+ ctx: RunQueryCtx;
150
+ dateRange: {
151
+ startDate: string;
152
+ endDate: string;
153
+ };
154
+ }): Promise<Array<Record<string, unknown>> | null>;
155
+ }
156
+ interface RunOptimizedQueryOptions {
157
+ /** Overlay tried per extra before the live SQL; absent → today's live path. */
158
+ resolveExtra?: ResolveExtraFn;
159
+ /**
160
+ * Opt-in canonical-primary correctness: group/compare `queryCanonical` as a
161
+ * total key (NULL/'' folds to the raw `query`). Default false = legacy raw
162
+ * nullable column. See ADR-0018.
163
+ */
164
+ canonicalFallback?: boolean;
165
+ /**
166
+ * Opt-in canonical-primary performance (ADR-0018 Gap 2): object keys of the
167
+ * `query_canonical_daily` rollup parquet(s). When supplied AND the query is
168
+ * coverable (`canonicalRollupCovers`) AND `canonicalFallback` is on AND the
169
+ * window is within the rollup's coverage, the MAIN query reads these
170
+ * pre-summed `(query_canonical × date)` rows instead of re-aggregating raw
171
+ * partitions; variant extras still read raw. Ignored (live path) on any miss,
172
+ * so a mis-wired host degrades to correct-but-slow, never wrong.
173
+ *
174
+ * `canonicalFallback` is REQUIRED: the rollup is built with
175
+ * `COALESCE(NULLIF(query_canonical, ''), query)` (fallback semantics), so
176
+ * serving it to a legacy (`canonicalFallback: false`) caller would change
177
+ * NULL/'' rows from legacy buckets to raw-query keys. The rollup is already
178
+ * null-free, so the rollup READ itself runs without fallback.
179
+ *
180
+ * `coversThrough` (ISO `YYYY-MM-DD`, the rollup's newest covered date) gates
181
+ * staleness: the source is used only when `dateRange.endDate <= coversThrough`,
182
+ * else the live path serves the window so the recent tail is never silently
183
+ * undercounted. Omit to assert full coverage (use with care).
184
+ */
185
+ canonicalSource?: {
186
+ keys: string[];
187
+ coversThrough?: string;
188
+ };
189
+ }
121
190
  interface OptimizedQueryResult {
122
191
  rows: Array<Record<string, unknown>>;
123
192
  totalCount: number;
@@ -140,7 +209,7 @@ interface ComparisonQueryResult {
140
209
  declare function runOptimizedQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, state: BuilderState, dateRange: {
141
210
  startDate: string;
142
211
  endDate: string;
143
- }): Promise<OptimizedQueryResult>;
212
+ }, options?: RunOptimizedQueryOptions): Promise<OptimizedQueryResult>;
144
213
  declare function runComparisonQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, current: BuilderState, previous: BuilderState, windows: {
145
214
  current: {
146
215
  startDate: string;
@@ -150,7 +219,55 @@ declare function runComparisonQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, current:
150
219
  startDate: string;
151
220
  endDate: string;
152
221
  };
153
- }, filter?: ComparisonFilter): Promise<ComparisonQueryResult>;
222
+ }, filter?: ComparisonFilter, options?: {
223
+ canonicalFallback?: boolean;
224
+ canonicalSource?: {
225
+ keys: string[];
226
+ coversThrough?: string;
227
+ };
228
+ }): Promise<ComparisonQueryResult>;
229
+ /**
230
+ * Host-supplied reader: return the materialised rollup's rows for an
231
+ * `(id, tenant, slice)`, in the exact shape the live extra produces, or `null`
232
+ * when no rollup exists (first sync, never built, stale) so the overlay
233
+ * declines and the resolver falls back to the live query. Typically wired with
234
+ * `readLatestRollup` + a `read_parquet` of the pointer.
235
+ *
236
+ * `dateRange` is the request window. `query_canonical_variants` is full-history
237
+ * (its grouping/variant metrics span all dates), but `buildExtrasQueries`
238
+ * windows the live `canonicalExtras` to the requested range — so for a narrow
239
+ * window the reader MUST decline (return `null`) rather than attach
240
+ * out-of-window variantCount/canonicalName/variants. A common rule: serve only
241
+ * when the request window covers full history.
242
+ */
243
+ interface RollupRowsReader {
244
+ (opts: {
245
+ id: string;
246
+ ctx: {
247
+ userId: string;
248
+ siteId: string;
249
+ };
250
+ searchType?: SearchType$1;
251
+ dateRange: {
252
+ startDate: string;
253
+ endDate: string;
254
+ };
255
+ }): Promise<Array<Record<string, unknown>> | null>;
256
+ }
257
+ /**
258
+ * Build a {@link ResolveExtraFn} that serves resolver extras from materialised
259
+ * rollups when one is mapped for the extra's key, else returns `null` to fall
260
+ * back to the live SQL. Pure wiring around the host's `readRollupRows`.
261
+ */
262
+ declare function createRollupExtrasOverlay(readRollupRows: RollupRowsReader): ResolveExtraFn;
263
+ declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
264
+ declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
265
+ declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
266
+ declare function metricValue(row: Record<string, unknown>, metric: string): number;
267
+ declare function dimensionValue(row: Record<string, unknown>, dimension: string): string;
268
+ declare function matchesDimensionFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
269
+ declare function matchesMetricFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
270
+ declare function matchesTopLevelPage(row: Record<string, unknown>): boolean;
154
271
  interface AssertSchemaInSyncOptions {
155
272
  /** Label used in the thrown error (e.g. 'browser', 'sqlite'). */
156
273
  label: string;
@@ -164,4 +281,4 @@ interface AssertSchemaInSyncOptions {
164
281
  mode: 'exact' | 'superset';
165
282
  }
166
283
  declare function assertSchemaInSync(options: AssertSchemaInSyncOptions): void;
167
- export { type AssertSchemaInSyncOptions, type ComparisonFilter, type ComparisonQueryResult, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type OptimizedQueryResult, type PgTableKey, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverOptions, type RunQueryCtx, type RunSQLFn, type SqlFragments, type SqlFragmentsConfig, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
284
+ export { type AssertSchemaInSyncOptions, type ComparisonFilter, type ComparisonQueryResult, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type OptimizedQueryResult, type PgTableKey, type ResolveExtraFn, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverAdapterOptions, type ResolverOptions, type RollupRowsReader, type RunOptimizedQueryOptions, type RunQueryCtx, type RunSQLFn, type SqlFragments, type SqlFragmentsConfig, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
@@ -1,2 +1,2 @@
1
- import { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface } from "../_chunks/resolver.mjs";
2
- export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
1
+ import { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface } from "../_chunks/resolver.mjs";
2
+ export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
@@ -244,6 +244,12 @@ declare function runWindowed(opts: {
244
244
  start: string;
245
245
  end: string;
246
246
  }) => string;
247
+ /**
248
+ * Extra named file sets merged into every window's `runSQL` (alongside the
249
+ * windowed `FILES`). Use to JOIN a non-windowed sidecar (e.g. the query
250
+ * dimension parquet via `{ QUERY_DIM: { keys: [...] } }`) inside `sqlFor`.
251
+ */
252
+ extraFileSets?: Record<string, FileSetRef>;
247
253
  }): Promise<Row$1[]>;
248
254
  /**
249
255
  * Daily totals across the full history. One row per (date, table) with
@@ -283,6 +289,47 @@ declare const topKeywords28dRollup: RollupDef;
283
289
  * coexist during a migration.
284
290
  */
285
291
  declare const topKeywords28dParquetRollup: RollupDef;
292
+ /**
293
+ * Materialises canonical-query variant grouping so the read path
294
+ * (`buildExtrasQueries` in `resolver/compile.ts`) becomes a passthrough scan
295
+ * instead of two window passes (`ROW_NUMBER`/`COUNT` over `PARTITION BY
296
+ * query_canonical`) plus a `GROUP_CONCAT` over the whole `queries` table on
297
+ * every request — work that is single-threaded under DuckDB-WASM/Workers and
298
+ * scales with table size. See ADR-0017.
299
+ *
300
+ * One row per `query_canonical` group, columns named 1:1 with the live query's
301
+ * output (`joinKey`, `variantCount`, `canonicalName`, `variants`) so
302
+ * `mergeExtras` consumes either source unchanged. `variants` packs the top-10
303
+ * variants as `query:::clicks:::impressions:::position` joined by `||`,
304
+ * identical to the live composer.
305
+ *
306
+ * Full history (`windowDays: null`), not a trailing window: grouping metadata
307
+ * is global (which variant is canonical, how many variants exist) and stays
308
+ * stable across requests rather than shifting with each query's date range.
309
+ * Reflects the last sync/compaction, not the live tail — readers that need the
310
+ * tail can layer a recent-overlay later (the envelope carries `builtAt`).
311
+ */
312
+ declare const queryCanonicalVariantsRollup: RollupDef;
313
+ /**
314
+ * Canonical-grained fact aggregate (ADR-0018 Gap 2): pre-sums the raw
315
+ * `(query × date)` query rows to `(query_canonical × date)`, so canonical-
316
+ * primary top/gaining/losing reads a small pre-aggregated table instead of
317
+ * re-collapsing variants on every request. Metrics are additive, so summing
318
+ * these per-date sums over a window is exact — identical to aggregating the raw
319
+ * rows.
320
+ *
321
+ * Null-free by construction: groups by `COALESCE(NULLIF(query_canonical, ''),
322
+ * query)`, the same total-key expression the opt-in read path uses (ADR-0018
323
+ * Gap 1), so the rollup never carries a NULL/'' canonical bucket and the read
324
+ * path needs no fallback when pointed at it.
325
+ *
326
+ * Date-grained full history (`windowDays: null`): one rollup serves every date
327
+ * range (reads filter by `date`) and both windows of a comparison. Opt-in (not
328
+ * in `DEFAULT_ROLLUPS`); the host points the main query's file set at it for
329
+ * queries the rollup covers (see `canonicalRollupCovers` /
330
+ * `RunOptimizedQueryOptions.canonicalSource`).
331
+ */
332
+ declare const queryCanonicalDailyRollup: RollupDef;
286
333
  /**
287
334
  * Aggregates the per-URL Indexing API metadata entity store (populated by
288
335
  * `gscdump entities indexing snapshot`) into daily counts of `URL_UPDATED`
@@ -360,4 +407,12 @@ declare function rebuildDailyFromHourly(opts: RebuildDailyFromHourlyOptions): Pr
360
407
  rowsWritten: number;
361
408
  }>;
362
409
  declare const DEFAULT_ROLLUPS: readonly RollupDef[];
363
- export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
410
+ /**
411
+ * Canonical-primary rollups (ADR-0017 / ADR-0018). Opt-in — kept out of
412
+ * `DEFAULT_ROLLUPS` because they only pay off once the consumer queries by
413
+ * `queryCanonical` and wires the read seams (`resolveExtra` /
414
+ * `canonicalSource`). Hosts opt in by concatenating these onto their def list
415
+ * (CLI: `gscdump rollups --with-canonical`).
416
+ */
417
+ declare const CANONICAL_ROLLUPS: readonly RollupDef[];
418
+ export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/dist/rollups.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  import "./_chunks/layout.mjs";
2
2
  import { engineErrors } from "./errors.mjs";
3
3
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
4
- import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
4
+ import { createIndexingMetadataStore, createQueryDimStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
5
5
  import { MS_PER_DAY } from "gscdump";
6
6
  function rollupPrefix(ctx, searchType) {
7
7
  const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -237,10 +237,13 @@ async function runWindowed(opts) {
237
237
  const result = await opts.engine.runSQL({
238
238
  ctx: opts.ctx,
239
239
  table: opts.table,
240
- fileSets: { FILES: {
241
- table: opts.table,
242
- partitions: w.partitions
243
- } },
240
+ fileSets: {
241
+ FILES: {
242
+ table: opts.table,
243
+ partitions: w.partitions
244
+ },
245
+ ...opts.extraFileSets
246
+ },
244
247
  sql: opts.sqlFor(w),
245
248
  ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
246
249
  });
@@ -534,6 +537,154 @@ const topKeywords28dParquetRollup = {
534
537
  }));
535
538
  }
536
539
  };
540
+ const queryCanonicalVariantsRollup = {
541
+ id: "query_canonical_variants",
542
+ windowDays: null,
543
+ format: "parquet",
544
+ parquetColumns: [
545
+ {
546
+ name: "joinKey",
547
+ type: "VARCHAR",
548
+ nullable: false
549
+ },
550
+ {
551
+ name: "variantCount",
552
+ type: "BIGINT",
553
+ nullable: false
554
+ },
555
+ {
556
+ name: "canonicalName",
557
+ type: "VARCHAR",
558
+ nullable: true
559
+ },
560
+ {
561
+ name: "variants",
562
+ type: "VARCHAR",
563
+ nullable: true
564
+ }
565
+ ],
566
+ parquetSortKey: ["joinKey"],
567
+ async build({ engine, ctx, searchType }) {
568
+ const parts = await engine.listPartitions({
569
+ ctx,
570
+ table: "queries",
571
+ ...searchType !== void 0 ? { searchType } : {}
572
+ });
573
+ if (parts.length === 0) return [];
574
+ const partitions = parts.map((p) => p.partition);
575
+ return (await engine.runSQL({
576
+ ctx,
577
+ table: "queries",
578
+ fileSets: { FILES: {
579
+ table: "queries",
580
+ partitions
581
+ } },
582
+ ...searchType !== void 0 ? { searchType } : {},
583
+ sql: `
584
+ WITH per_variant AS (
585
+ SELECT
586
+ COALESCE(NULLIF(query_canonical, ''), query) AS joinKey,
587
+ query AS query,
588
+ SUM(clicks) AS clicks,
589
+ SUM(impressions) AS impressions,
590
+ SUM(sum_position) AS sum_pos,
591
+ ROW_NUMBER() OVER (PARTITION BY COALESCE(NULLIF(query_canonical, ''), query) ORDER BY SUM(clicks) DESC) AS rn,
592
+ COUNT(*) OVER (PARTITION BY COALESCE(NULLIF(query_canonical, ''), query)) AS variantCount
593
+ FROM read_parquet({{FILES}}, union_by_name = true)
594
+ GROUP BY COALESCE(NULLIF(query_canonical, ''), query), query
595
+ )
596
+ SELECT
597
+ joinKey,
598
+ MAX(variantCount)::BIGINT AS variantCount,
599
+ MAX(CASE WHEN rn = 1 THEN query END) AS canonicalName,
600
+ GROUP_CONCAT(CASE WHEN rn <= 10 THEN query || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') AS variants
601
+ FROM per_variant
602
+ GROUP BY joinKey
603
+ `
604
+ })).rows.map((r) => ({
605
+ joinKey: String(r.joinKey),
606
+ variantCount: BigInt(r.variantCount),
607
+ canonicalName: r.canonicalName == null ? null : String(r.canonicalName),
608
+ variants: r.variants == null ? null : String(r.variants)
609
+ }));
610
+ }
611
+ };
612
+ const queryCanonicalDailyRollup = {
613
+ id: "query_canonical_daily",
614
+ windowDays: null,
615
+ format: "parquet",
616
+ parquetColumns: [
617
+ {
618
+ name: "query_canonical",
619
+ type: "VARCHAR",
620
+ nullable: false
621
+ },
622
+ {
623
+ name: "date",
624
+ type: "DATE",
625
+ nullable: false
626
+ },
627
+ {
628
+ name: "clicks",
629
+ type: "BIGINT",
630
+ nullable: false
631
+ },
632
+ {
633
+ name: "impressions",
634
+ type: "BIGINT",
635
+ nullable: false
636
+ },
637
+ {
638
+ name: "sum_position",
639
+ type: "DOUBLE",
640
+ nullable: false
641
+ }
642
+ ],
643
+ parquetSortKey: ["date", "query_canonical"],
644
+ async build({ engine, ctx, dataSource, searchType }) {
645
+ const dimStore = createQueryDimStore({ dataSource });
646
+ const useDim = await dimStore.loadMeta(ctx) !== null;
647
+ const canonExpr = useDim ? `COALESCE(qd.query_canonical, NULLIF(q.query_canonical, ''), q.query)` : `COALESCE(NULLIF(query_canonical, ''), query)`;
648
+ return (await runWindowed({
649
+ engine,
650
+ ctx,
651
+ table: "queries",
652
+ ...searchType !== void 0 ? { searchType } : {},
653
+ ...useDim ? { extraFileSets: { QUERY_DIM: {
654
+ table: "queries",
655
+ keys: [dimStore.parquetKey(ctx)]
656
+ } } } : {},
657
+ sqlFor: useDim ? (w) => `
658
+ SELECT
659
+ ${canonExpr} AS query_canonical,
660
+ CAST(q.date AS VARCHAR) AS date,
661
+ SUM(q.clicks)::BIGINT AS clicks,
662
+ SUM(q.impressions)::BIGINT AS impressions,
663
+ SUM(q.sum_position)::DOUBLE AS sum_position
664
+ FROM read_parquet({{FILES}}, union_by_name = true) q
665
+ LEFT JOIN read_parquet({{QUERY_DIM}}, union_by_name = true) qd ON q.query = qd.query
666
+ WHERE q.date >= '${w.start}' AND q.date <= '${w.end}'
667
+ GROUP BY ${canonExpr}, q.date
668
+ ` : (w) => `
669
+ SELECT
670
+ ${canonExpr} AS query_canonical,
671
+ CAST(date AS VARCHAR) AS date,
672
+ SUM(clicks)::BIGINT AS clicks,
673
+ SUM(impressions)::BIGINT AS impressions,
674
+ SUM(sum_position)::DOUBLE AS sum_position
675
+ FROM read_parquet({{FILES}}, union_by_name = true)
676
+ WHERE date >= '${w.start}' AND date <= '${w.end}'
677
+ GROUP BY ${canonExpr}, date
678
+ `
679
+ })).map((r) => ({
680
+ query_canonical: String(r.query_canonical),
681
+ date: String(r.date),
682
+ clicks: BigInt(r.clicks),
683
+ impressions: BigInt(r.impressions),
684
+ sum_position: Number(r.sum_position)
685
+ }));
686
+ }
687
+ };
537
688
  const indexingMetadataRollup = {
538
689
  id: "indexing_metadata",
539
690
  windowDays: null,
@@ -845,4 +996,5 @@ const DEFAULT_ROLLUPS = [
845
996
  sitemapHealthRollup,
846
997
  sitemapChanges28dRollup
847
998
  ];
848
- export { DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
999
+ const CANONICAL_ROLLUPS = [queryCanonicalVariantsRollup, queryCanonicalDailyRollup];
1000
+ export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.29.0",
4
+ "version": "0.31.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
191
191
  "hyparquet": "^1.26.1",
192
192
  "hyparquet-writer": "^0.16.1",
193
193
  "proper-lockfile": "^4.1.2",
194
- "@gscdump/contracts": "0.29.0",
195
- "gscdump": "0.29.0"
194
+ "@gscdump/contracts": "0.31.0",
195
+ "gscdump": "0.31.0"
196
196
  },
197
197
  "devDependencies": {
198
198
  "@duckdb/duckdb-wasm": "^1.32.0",