npm - @gscdump/engine - Versions diffs - 0.29.0 → 0.31.0 - Mend

@gscdump/engine 0.29.0 → 0.31.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/_chunks/entities.mjs +100 -1
package/dist/_chunks/libs/icebird.d.mts +2 -2
package/dist/_chunks/libs/icebird.mjs +31 -17
package/dist/_chunks/pg-adapter.d.mts +11 -3
package/dist/_chunks/resolver.mjs +86 -22
package/dist/entities.d.mts +55 -1
package/dist/entities.mjs +2 -2
package/dist/resolver/index.d.mts +130 -13
package/dist/resolver/index.mjs +2 -2
package/dist/rollups.d.mts +56 -1
package/dist/rollups.mjs +158 -6
package/package.json +3 -3

package/dist/_chunks/entities.mjs CHANGED Viewed

@@ -16,6 +16,105 @@ async function readOptional(ds, key, signal) {
 		throw e;
 	});
 }
+const QUERY_DIM_COLUMNS = [
+	{
+		name: "query",
+		type: "VARCHAR",
+		nullable: false
+	},
+	{
+		name: "query_canonical",
+		type: "VARCHAR",
+		nullable: false
+	},
+	{
+		name: "intent_code",
+		type: "INTEGER",
+		nullable: false
+	},
+	{
+		name: "normalizer_version",
+		type: "INTEGER",
+		nullable: false
+	},
+	{
+		name: "intent_version",
+		type: "INTEGER",
+		nullable: false
+	}
+];
+function queryDimPrefix(ctx) {
+	return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/query_dim` : `u_${ctx.userId}/entities/query_dim`;
+}
+function queryDimParquetKey(ctx) {
+	return `${queryDimPrefix(ctx)}/index.parquet`;
+}
+function queryDimMetaKey(ctx) {
+	return `${queryDimPrefix(ctx)}/index.json`;
+}
+function buildQueryDimRecords(queries, deps) {
+	const seen = /* @__PURE__ */ new Set();
+	const out = [];
+	for (const raw of queries) {
+		const query = String(raw);
+		if (query.trim() === "" || seen.has(query)) continue;
+		seen.add(query);
+		const canonical = deps.normalizeQuery(query);
+		out.push({
+			query,
+			query_canonical: canonical === "" ? query : canonical,
+			intent_code: deps.classifyIntentCode(query),
+			normalizer_version: deps.normalizerVersion,
+			intent_version: deps.intentVersion
+		});
+	}
+	return out;
+}
+function createQueryDimStore({ dataSource }) {
+	async function exists(key, prefix) {
+		return (await dataSource.list(prefix)).includes(key);
+	}
+	return {
+		parquetKey: queryDimParquetKey,
+		async write(ctx, records, builtAt) {
+			const parquetKey = queryDimParquetKey(ctx);
+			const bytes = encodeRowsToParquetFlex(records, {
+				columns: QUERY_DIM_COLUMNS,
+				sortKey: ["query"]
+			});
+			await dataSource.write(parquetKey, bytes);
+			const meta = {
+				version: 1,
+				builtAt,
+				rowCount: records.length,
+				normalizerVersion: records[0]?.normalizer_version ?? 0,
+				intentVersion: records[0]?.intent_version ?? 0
+			};
+			await dataSource.write(queryDimMetaKey(ctx), new TextEncoder().encode(JSON.stringify(meta)));
+			return {
+				parquetKey,
+				rowCount: records.length
+			};
+		},
+		async loadMeta(ctx) {
+			const key = queryDimMetaKey(ctx);
+			if (!await exists(key, `${queryDimPrefix(ctx)}/`)) return null;
+			const bytes = await dataSource.read(key);
+			return JSON.parse(new TextDecoder().decode(bytes));
+		},
+		async loadRecords(ctx) {
+			const key = queryDimParquetKey(ctx);
+			if (!await exists(key, `${queryDimPrefix(ctx)}/`)) return [];
+			return (await decodeParquetToRows(await dataSource.read(key))).map((r) => ({
+				query: String(r.query),
+				query_canonical: String(r.query_canonical),
+				intent_code: Number(r.intent_code),
+				normalizer_version: Number(r.normalizer_version),
+				intent_version: Number(r.intent_version)
+			}));
+		}
+	};
+}
 const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
 function inspectionIndexKey(ctx) {
 	return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
@@ -893,4 +992,4 @@ function createEmptyTypesStore(opts) {
 		}
 	};
 }
-export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
+export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };

package/dist/_chunks/libs/icebird.d.mts CHANGED Viewed

@@ -133,7 +133,7 @@ interface Snapshot {
   'sequence-number': number;
   'timestamp-ms': number;
   'manifest-list': string;
-  manifests?: Manifest$1[];
+  manifests?: Manifest[];
   summary: {
     // spec: "value of these fields should be of string type"
     operation: string; // 'spark.app.id'?: string
@@ -192,7 +192,7 @@ interface MetadataLog {
   'timestamp-ms': number;
   'metadata-file': string;
 }
-interface Manifest$1 {
+interface Manifest {
   manifest_path: string;
   manifest_length: bigint;
   partition_spec_id: number;

package/dist/_chunks/libs/icebird.mjs CHANGED Viewed

@@ -3319,24 +3319,38 @@ async function icebergManifests({ metadata, resolver, snapshotId, partitionFilte
 	});
 	return await fetchManifests(manifests, resolver);
 }
+const MANIFEST_FETCH_CONCURRENCY = 8;
+async function fetchOneManifest(manifest, resolver) {
+	const url = manifest.manifest_path;
+	const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
+	for (const entry of entries) {
+		entry.partition_spec_id = manifest.partition_spec_id ?? 0;
+		if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number ?? 0n;
+		if (entry.status === 1) {
+			if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number;
+			if (entry.file_sequence_number === void 0) entry.file_sequence_number = manifest.sequence_number;
+		} else if (entry.sequence_number === void 0 || entry.file_sequence_number === void 0) throw new Error("iceberg manifest entry missing sequence number");
+	}
+	assignFirstRowIds(manifest, entries);
+	return {
+		url,
+		entries
+	};
+}
 async function fetchManifests(manifests, resolver) {
-	return await Promise.all(manifests.map(async (manifest) => {
-		const url = manifest.manifest_path;
-		const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
-		for (const entry of entries) {
-			entry.partition_spec_id = manifest.partition_spec_id ?? 0;
-			if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number ?? 0n;
-			if (entry.status === 1) {
-				if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number;
-				if (entry.file_sequence_number === void 0) entry.file_sequence_number = manifest.sequence_number;
-			} else if (entry.sequence_number === void 0 || entry.file_sequence_number === void 0) throw new Error("iceberg manifest entry missing sequence number");
-		}
-		assignFirstRowIds(manifest, entries);
-		return {
-			url,
-			entries
-		};
-	}));
+	const results = new Array(manifests.length);
+	let next = 0;
+	async function worker() {
+		while (next < manifests.length) {
+			const i = next++;
+			results[i] = await fetchOneManifest(manifests[i], resolver);
+		}
+	}
+	const poolSize = Math.min(MANIFEST_FETCH_CONCURRENCY, manifests.length);
+	const workers = [];
+	for (let w = 0; w < poolSize; w++) workers.push(worker());
+	await Promise.all(workers);
+	return results;
 }
 function assignFirstRowIds(manifest, entries) {
 	if (manifest.content !== 0 || manifest.first_row_id == null) return;

package/dist/_chunks/pg-adapter.d.mts CHANGED Viewed

@@ -13,7 +13,15 @@ declare const pgResolverAdapter: ResolverAdapter<PgTableKey>;
  * Single-use: build a fresh adapter per query. Cheap (no I/O) and avoids
  * accidental adapter caching that would lock in a stale `{{FILES}}` set.
  */
-declare function createParquetResolverAdapter(): ResolverAdapter<PgTableKey>;
+interface ResolverAdapterOptions {
+  /**
+   * Opt-in canonical-primary correctness: fold NULL/'' `query_canonical` back
+   * to the raw `query` so canonical is a total GROUP BY / join key. Default
+   * false preserves the legacy raw-column behaviour. See ADR-0018.
+   */
+  canonicalFallback?: boolean;
+}
+declare function createParquetResolverAdapter(options?: ResolverAdapterOptions): ResolverAdapter<PgTableKey>;
 /**
  * Multi-tenant pg-flavored adapter for the Iceberg / R2 SQL read path.
  * Identical SQL output to `pgResolverAdapter` except WHERE clauses inject
@@ -24,5 +32,5 @@ declare function createParquetResolverAdapter(): ResolverAdapter<PgTableKey>;
  * so callers must rewrite bare table names to their qualified form (e.g.
  * `${namespace}.pages`) before sending to R2 SQL.
  */
-declare function createIcebergResolverAdapter(): ResolverAdapter<PgTableKey>;
-export { PgTableKey, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter };
+declare function createIcebergResolverAdapter(options?: ResolverAdapterOptions): ResolverAdapter<PgTableKey>;
+export { PgTableKey, ResolverAdapterOptions, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter };

package/dist/_chunks/resolver.mjs CHANGED Viewed

@@ -180,7 +180,7 @@ function buildDimensionColumnMap(datasetToTableKey) {
 	return Object.fromEntries(entries);
 }
 function createSqlFragments(config) {
-	const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, includeSearchType, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride } = config;
+	const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, includeSearchType, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride, canonicalFallback = false } = config;
 	const DIM_COLUMN_MAP = buildDimensionColumnMap(datasetToTableKey);
 	function isMetricDimension(dim) {
 		return METRIC_NAMES.includes(dim);
@@ -217,6 +217,7 @@ function createSqlFragments(config) {
 	function dimExprSql(dim, tableKey) {
 		const colName = dimColumn(dim, tableKey);
 		if (dim === "page") return sql.raw(urlToPathExpr(colName));
+		if (canonicalFallback && dim === "queryCanonical") return sql`COALESCE(NULLIF(${colRef(tableKey, colName)}, ''), ${colRef(tableKey, "query")})`;
 		return colRef(tableKey, colName);
 	}
 	function metricSql(metric, tableKey) {
@@ -296,8 +297,10 @@ function createSqlFragments(config) {
 			if (isMetricDimension(f.dimension)) continue;
 			if (f.dimension === "date") continue;
 			if (f.operator === "topLevel") continue;
-			const cRef = colRef(tableKey, dimColumn(f.dimension, tableKey));
-			const matchExpr = f.dimension === "page" ? dimExprSql(f.dimension, tableKey) : cRef;
+			const dim = f.dimension;
+			const cRef = colRef(tableKey, dimColumn(dim, tableKey));
+			const matchExpr = dim === "page" || dim === "queryCanonical" ? dimExprSql(dim, tableKey) : cRef;
+			const patternExpr = dim === "queryCanonical" ? matchExpr : cRef;
 			switch (f.operator) {
 				case "equals":
 					preds.push(sql`${matchExpr} = ${f.expression}`);
@@ -306,16 +309,16 @@ function createSqlFragments(config) {
 					preds.push(sql`${matchExpr} != ${f.expression}`);
 					break;
 				case "contains":
-					preds.push(sql`${cRef} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
+					preds.push(sql`${patternExpr} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
 					break;
 				case "notContains":
-					preds.push(sql`${cRef} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
+					preds.push(sql`${patternExpr} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
 					break;
 				case "includingRegex":
-					preds.push(regexPredicate(cRef, f.expression, false));
+					preds.push(regexPredicate(patternExpr, f.expression, false));
 					break;
 				case "excludingRegex":
-					preds.push(regexPredicate(cRef, f.expression, true));
+					preds.push(regexPredicate(patternExpr, f.expression, true));
 					break;
 			}
 		}
@@ -431,23 +434,37 @@ const pgResolverAdapter = createResolverAdapter({
 	...PG_BASE_CONFIG,
 	tableLabel: "pg-resolver-adapter"
 });
-function createParquetResolverAdapter() {
+function createParquetResolverAdapter(options = {}) {
 	return createResolverAdapter({
 		...PG_BASE_CONFIG,
 		tableLabel: "parquet-resolver-adapter",
+		canonicalFallback: options.canonicalFallback ?? false,
 		tableRef: (tk) => sql.raw(`read_parquet({{FILES}}, union_by_name = true) AS "${tk}"`)
 	});
 }
-function createIcebergResolverAdapter() {
+function createIcebergResolverAdapter(options = {}) {
 	return createResolverAdapter({
 		...PG_BASE_CONFIG,
 		schema: icebergSchema,
 		includeSiteId: true,
 		includeSearchType: true,
 		tableLabel: "iceberg-resolver-adapter",
+		canonicalFallback: options.canonicalFallback ?? false,
 		tableRef: (tk) => sql.raw(`"${tk}"`)
 	});
 }
+const ALLOWED_FILTER_DIMS = /* @__PURE__ */ new Set(["date", "queryCanonical"]);
+function planCoveredByCanonicalRollup(plan) {
+	if (plan.dataset !== "queries") return false;
+	if (plan.groupByDimensions.length !== 1 || plan.groupByDimensions[0] !== "queryCanonical") return false;
+	if (!plan.dimensionFilters.every((f) => ALLOWED_FILTER_DIMS.has(f.dimension))) return false;
+	if (plan.prefilters.length > 0) return false;
+	if (plan.specialFilters.topLevel) return false;
+	return true;
+}
+function canonicalRollupCovers(state, capabilities) {
+	return planCoveredByCanonicalRollup(buildLogicalPlan(state, capabilities));
+}
 const COMPARISON_FILTER_SQL = {
 	new: sql`AND COALESCE(p.impressions, 0) = 0 AND COALESCE(c.impressions, 0) > 0`,
 	lost: sql`AND COALESCE(p.impressions, 0) > 0 AND COALESCE(c.impressions, 0) = 0`,
@@ -726,7 +743,8 @@ function buildExtrasQueries(state, options) {
 	whereParts.push(sql`${adapter.dateColRef(queriesKey)} <= ${plan.dateRange.endDate}`);
 	const whereExpr = whereParts.length > 0 ? sql`WHERE ${joinAnd(whereParts)}` : sql``;
 	const outerQueryCol = sql.raw("query");
-	const compiled = compileCollapsed(adapter, sql`WITH per_variant AS (SELECT ${t.query_canonical} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${t.query_canonical} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${t.query_canonical}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${t.query_canonical}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`);
+	const canonKey = sql`COALESCE(NULLIF(${t.query_canonical}, ''), ${t.query})`;
+	const compiled = compileCollapsed(adapter, sql`WITH per_variant AS (SELECT ${canonKey} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${canonKey} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${canonKey}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${canonKey}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`);
 	extras.push({
 		key: "canonicalExtras",
 		sql: compiled.sql,
@@ -802,6 +820,22 @@ function mergeExtras(rows, extrasResults) {
 		return enriched;
 	});
 }
+const EXTRA_ROLLUP_IDS = { canonicalExtras: "query_canonical_variants" };
+function createRollupExtrasOverlay(readRollupRows) {
+	return async ({ key, ctx, dateRange }) => {
+		const id = EXTRA_ROLLUP_IDS[key];
+		if (id === void 0) return null;
+		return readRollupRows({
+			id,
+			ctx: {
+				userId: ctx.userId,
+				siteId: ctx.siteId
+			},
+			dateRange,
+			...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
+		});
+	};
+}
 function collectInternalFilters(filter) {
 	if (!filter || !("_filters" in filter)) return [];
 	const flat = filter._filters;
@@ -856,6 +890,9 @@ function matchesMetricFilter(row, filter) {
 function matchesTopLevelPage(row) {
 	return (normalizeUrl(dimensionValue(row, "page")).match(/\//g)?.length ?? 0) <= 1;
 }
+function canonicalSourceWithinCoverage(source, windowEnd) {
+	return source.coversThrough === void 0 || windowEnd <= source.coversThrough;
+}
 function runArgs(ctx, partitions) {
 	return {
 		ctx: {
@@ -870,9 +907,11 @@ function runArgs(ctx, partitions) {
 		...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
 	};
 }
-async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
-	const adapter = createParquetResolverAdapter();
+async function runOptimizedQuery(runSQL, ctx, state, dateRange, options = {}) {
 	const base = runArgs(ctx, enumeratePartitions(dateRange.startDate, dateRange.endDate));
+	const probe = createParquetResolverAdapter({ canonicalFallback: options.canonicalFallback ?? false });
+	const useCanonicalSource = options.canonicalSource !== void 0 && (options.canonicalFallback ?? false) && canonicalSourceWithinCoverage(options.canonicalSource, dateRange.endDate) && canonicalRollupCovers(state, probe.capabilities);
+	const adapter = useCanonicalSource ? createParquetResolverAdapter({ canonicalFallback: false }) : probe;
 	const optimized = resolveToSQLOptimized(state, {
 		adapter,
 		siteId: void 0
@@ -881,15 +920,31 @@ async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
 		adapter,
 		siteId: void 0
 	});
-	const [optRes, ...extrasRows] = await Promise.all([runSQL({
+	const mainArgs = useCanonicalSource ? {
 		...base,
+		fileSets: { FILES: {
+			table: ctx.table,
+			keys: options.canonicalSource.keys
+		} }
+	} : base;
+	const resolveExtra = options.resolveExtra;
+	const [optRes, ...extrasRows] = await Promise.all([runSQL({
+		...mainArgs,
 		sql: optimized.sql,
 		params: optimized.params
-	}), ...extras.map((e) => runSQL({
-		...base,
-		sql: e.sql,
-		params: e.params
-	}))]);
+	}), ...extras.map(async (e) => {
+		const overlaid = resolveExtra ? await resolveExtra({
+			key: e.key,
+			state,
+			ctx,
+			dateRange
+		}) : null;
+		return overlaid !== null ? { rows: overlaid } : runSQL({
+			...base,
+			sql: e.sql,
+			params: e.params
+		});
+	})]);
 	const firstRow = optRes.rows[0];
 	const totalCount = Number(firstRow?.totalCount ?? 0);
 	const totals = {
@@ -911,8 +966,10 @@ async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
 		}))
 	};
 }
-async function runComparisonQuery(runSQL, ctx, current, previous, windows, filter) {
-	const adapter = createParquetResolverAdapter();
+async function runComparisonQuery(runSQL, ctx, current, previous, windows, filter, options = {}) {
+	const probe = createParquetResolverAdapter({ canonicalFallback: options.canonicalFallback ?? false });
+	const useCanonicalSource = options.canonicalSource !== void 0 && (options.canonicalFallback ?? false) && canonicalSourceWithinCoverage(options.canonicalSource, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate) && canonicalRollupCovers(current, probe.capabilities) && canonicalRollupCovers(previous, probe.capabilities);
+	const adapter = useCanonicalSource ? createParquetResolverAdapter({ canonicalFallback: false }) : probe;
 	const comparison = resolveComparisonSQL(current, previous, {
 		adapter,
 		siteId: void 0
@@ -921,7 +978,14 @@ async function runComparisonQuery(runSQL, ctx, current, previous, windows, filte
 		adapter,
 		siteId: void 0
 	});
-	const base = runArgs(ctx, enumeratePartitions(windows.current.startDate < windows.previous.startDate ? windows.current.startDate : windows.previous.startDate, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate));
+	const partitions = enumeratePartitions(windows.current.startDate < windows.previous.startDate ? windows.current.startDate : windows.previous.startDate, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate);
+	const base = useCanonicalSource ? {
+		...runArgs(ctx, partitions),
+		fileSets: { FILES: {
+			table: ctx.table,
+			keys: options.canonicalSource.keys
+		} }
+	} : runArgs(ctx, partitions);
 	const main = await runSQL({
 		...base,
 		sql: comparison.sql,
@@ -953,4 +1017,4 @@ function assertSchemaInSync(options) {
 		if (missing.length > 0 || extra.length > 0) throw new Error(`${label} drizzle schema for '${key}' drifted from SCHEMAS. Missing: [${missing.join(", ")}]. Extra: [${extra.join(", ")}].`);
 	}
 }
-export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
+export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };

package/dist/entities.d.mts CHANGED Viewed

@@ -1,6 +1,60 @@
 import { DataSource } from "./_chunks/storage.mjs";
 import { ScheduleState } from "./schedule.mjs";
 import { ColumnDef, TenantCtx } from "@gscdump/contracts";
+interface QueryDimRecord {
+  query: string;
+  /** Lexical canonical, never empty: NULL/'' folds to the raw query. */
+  query_canonical: string;
+  /** Packed search-intent code (see `@gscdump/analysis` `encodeIntent`). */
+  intent_code: number;
+  normalizer_version: number;
+  intent_version: number;
+}
+/** JSON sidecar: versions + freshness, readable without decoding the parquet. */
+interface QueryDimMeta {
+  version: 1;
+  builtAt: number;
+  rowCount: number;
+  normalizerVersion: number;
+  intentVersion: number;
+}
+declare function queryDimParquetKey(ctx: TenantCtx): string;
+declare function queryDimMetaKey(ctx: TenantCtx): string;
+/**
+ * Injected derivation. `engine` never imports `@gscdump/analysis`; the host
+ * passes `normalizeQuery` / `classifyIntentCode` (e.g. `encodeIntent ∘
+ * classifyQueryIntent`) plus their version constants.
+ */
+interface QueryDimDeps {
+  normalizeQuery: (query: string) => string;
+  normalizerVersion: number;
+  /** Returns the packed intent code for a raw query. */
+  classifyIntentCode: (query: string) => number;
+  intentVersion: number;
+}
+/**
+ * Pure: distinct raw queries → dimension records. De-dupes, drops empties, and
+ * folds an empty/whitespace canonical back to the raw query so the key is
+ * total (matches the read path's `COALESCE(NULLIF(query_canonical, ''), query)`).
+ */
+declare function buildQueryDimRecords(queries: Iterable<string>, deps: QueryDimDeps): QueryDimRecord[];
+interface QueryDimStore {
+  parquetKey: (ctx: TenantCtx) => string;
+  /** Write the parquet + JSON sidecar. Last-write-wins; no history. */
+  write: (ctx: TenantCtx, records: readonly QueryDimRecord[], builtAt: number) => Promise<{
+    parquetKey: string;
+    rowCount: number;
+  }>;
+  /** Read the sidecar (versions + freshness), or null on first build. */
+  loadMeta: (ctx: TenantCtx) => Promise<QueryDimMeta | null>;
+  /** Decode the dimension rows (test/inspection; reads JOIN the parquet by key). */
+  loadRecords: (ctx: TenantCtx) => Promise<QueryDimRecord[]>;
+}
+declare function createQueryDimStore({
+  dataSource
+}: {
+  dataSource: DataSource;
+}): QueryDimStore;
 /**
  * GSC URL inspection result fields we persist. Mirrors the
  *  `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
@@ -442,4 +496,4 @@ interface CreateEmptyTypesStoreOptions {
   now?: () => number;
 }
 declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
-export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
+export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, QueryDimDeps, QueryDimMeta, QueryDimRecord, QueryDimStore, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };

package/dist/entities.mjs CHANGED Viewed

@@ -1,2 +1,2 @@
-import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
-export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
+import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
+export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };

package/dist/resolver/index.d.mts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { SearchType as SearchType$1, TableName as TableName$1 } from "../_chunks/storage.mjs";
 import { ComparisonFilter, ExtraQuery, ResolvedComparisonSQL, ResolvedSQL, ResolvedSQLOptimized, ResolverAdapter, ResolverOptions } from "../_chunks/types.mjs";
-import { PgTableKey, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "../_chunks/pg-adapter.mjs";
-import { LogicalDataset, LogicalDataset as LogicalDataset$1, PlannerCapabilities, UnresolvableDatasetError, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
+import { PgTableKey, ResolverAdapterOptions, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "../_chunks/pg-adapter.mjs";
+import { LogicalDataset, LogicalDataset as LogicalDataset$1, LogicalQueryPlan, PlannerCapabilities, UnresolvableDatasetError, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
 import { SQL } from "drizzle-orm";
 import { BuilderState, Dimension, FilterInput, InternalFilter, Metric } from "gscdump/query";
 import { Grain, TableName } from "@gscdump/contracts";
@@ -35,6 +35,19 @@ interface SqlFragmentsConfig<TableKey extends string> {
    * against the alias.
    */
   tableRef?: (tableKey: TableKey) => SQL;
+  /**
+   * Opt-in correctness for canonical-primary lookups. When true, the
+   * `queryCanonical` dimension expression falls back to the raw `query` when
+   * the stored `query_canonical` is NULL (no normalizer ran at ingest) or `''`
+   * (a fully-stripped query like "free online"), i.e.
+   * `COALESCE(NULLIF(query_canonical, ''), query)`. This makes canonical a
+   * TOTAL key, valid for GROUP BY / comparison joins.
+   *
+   * Default (false) preserves legacy behaviour: the raw nullable column, so a
+   * NULL/'' bucket pollutes top results and — because `NULL = NULL` is UNKNOWN
+   * — double-counts in the gaining/losing FULL OUTER JOIN. See ADR-0018.
+   */
+  canonicalFallback?: boolean;
 }
 interface SqlFragments<TableKey extends string> {
   METRIC_NAMES: Metric[];
@@ -65,6 +78,16 @@ interface CreateResolverAdapterConfig<TableKey extends string> extends SqlFragme
   capabilities: PlannerCapabilities;
 }
 declare function createResolverAdapter<TableKey extends string>(config: CreateResolverAdapterConfig<TableKey>): ResolverAdapter<TableKey>;
+/**
+ * True when `plan` can be served from the canonical-grained rollup instead of
+ * the raw `queries` fact partitions. Conservative: anything that would read a
+ * dropped column or the raw row grain disqualifies the query, so a false
+ * negative just falls back to live aggregation (correct, slower) — never wrong
+ * data.
+ */
+declare function planCoveredByCanonicalRollup(plan: LogicalQueryPlan): boolean;
+/** State-level convenience: build the plan then gate. */
+declare function canonicalRollupCovers(state: BuilderState, capabilities: PlannerCapabilities): boolean;
 declare function resolveToSQLOptimized<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): ResolvedSQLOptimized;
 declare function resolveToSQL<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): ResolvedSQL;
 declare function buildTotalsSql<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): {
@@ -77,14 +100,6 @@ declare function mergeExtras(rows: Record<string, unknown>[], extrasResults: {
   key: string;
   results: Record<string, unknown>[];
 }[]): Record<string, unknown>[];
-declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
-declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
-declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
-declare function metricValue(row: Record<string, unknown>, metric: string): number;
-declare function dimensionValue(row: Record<string, unknown>, dimension: string): string;
-declare function matchesDimensionFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
-declare function matchesMetricFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
-declare function matchesTopLevelPage(row: Record<string, unknown>): boolean;
 interface RunQueryCtx {
   userId: string;
   siteId: string;
@@ -118,6 +133,60 @@ interface RunSQLFn {
     rows: Array<Record<string, unknown>>;
   }>;
 }
+/**
+ * Optional overlay that serves a resolver extra (e.g. canonical-variant
+ * grouping, keyed `'canonicalExtras'`) from a precomputed source — typically a
+ * materialised rollup — instead of the live window-function SQL. Return the
+ * rows in the exact shape the live extra produces (`mergeExtras` consumes
+ * either source unchanged), or `null` to decline so the caller falls back to
+ * the live query. Pure seam: storage/tenant routing lives in the host's
+ * implementation, not here. See ADR-0017.
+ */
+interface ResolveExtraFn {
+  (opts: {
+    key: string;
+    state: BuilderState;
+    ctx: RunQueryCtx;
+    dateRange: {
+      startDate: string;
+      endDate: string;
+    };
+  }): Promise<Array<Record<string, unknown>> | null>;
+}
+interface RunOptimizedQueryOptions {
+  /** Overlay tried per extra before the live SQL; absent → today's live path. */
+  resolveExtra?: ResolveExtraFn;
+  /**
+   * Opt-in canonical-primary correctness: group/compare `queryCanonical` as a
+   * total key (NULL/'' folds to the raw `query`). Default false = legacy raw
+   * nullable column. See ADR-0018.
+   */
+  canonicalFallback?: boolean;
+  /**
+   * Opt-in canonical-primary performance (ADR-0018 Gap 2): object keys of the
+   * `query_canonical_daily` rollup parquet(s). When supplied AND the query is
+   * coverable (`canonicalRollupCovers`) AND `canonicalFallback` is on AND the
+   * window is within the rollup's coverage, the MAIN query reads these
+   * pre-summed `(query_canonical × date)` rows instead of re-aggregating raw
+   * partitions; variant extras still read raw. Ignored (live path) on any miss,
+   * so a mis-wired host degrades to correct-but-slow, never wrong.
+   *
+   * `canonicalFallback` is REQUIRED: the rollup is built with
+   * `COALESCE(NULLIF(query_canonical, ''), query)` (fallback semantics), so
+   * serving it to a legacy (`canonicalFallback: false`) caller would change
+   * NULL/'' rows from legacy buckets to raw-query keys. The rollup is already
+   * null-free, so the rollup READ itself runs without fallback.
+   *
+   * `coversThrough` (ISO `YYYY-MM-DD`, the rollup's newest covered date) gates
+   * staleness: the source is used only when `dateRange.endDate <= coversThrough`,
+   * else the live path serves the window so the recent tail is never silently
+   * undercounted. Omit to assert full coverage (use with care).
+   */
+  canonicalSource?: {
+    keys: string[];
+    coversThrough?: string;
+  };
+}
 interface OptimizedQueryResult {
   rows: Array<Record<string, unknown>>;
   totalCount: number;
@@ -140,7 +209,7 @@ interface ComparisonQueryResult {
 declare function runOptimizedQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, state: BuilderState, dateRange: {
   startDate: string;
   endDate: string;
-}): Promise<OptimizedQueryResult>;
+}, options?: RunOptimizedQueryOptions): Promise<OptimizedQueryResult>;
 declare function runComparisonQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, current: BuilderState, previous: BuilderState, windows: {
   current: {
     startDate: string;
@@ -150,7 +219,55 @@ declare function runComparisonQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, current:
     startDate: string;
     endDate: string;
   };
-}, filter?: ComparisonFilter): Promise<ComparisonQueryResult>;
+}, filter?: ComparisonFilter, options?: {
+  canonicalFallback?: boolean;
+  canonicalSource?: {
+    keys: string[];
+    coversThrough?: string;
+  };
+}): Promise<ComparisonQueryResult>;
+/**
+ * Host-supplied reader: return the materialised rollup's rows for an
+ * `(id, tenant, slice)`, in the exact shape the live extra produces, or `null`
+ * when no rollup exists (first sync, never built, stale) so the overlay
+ * declines and the resolver falls back to the live query. Typically wired with
+ * `readLatestRollup` + a `read_parquet` of the pointer.
+ *
+ * `dateRange` is the request window. `query_canonical_variants` is full-history
+ * (its grouping/variant metrics span all dates), but `buildExtrasQueries`
+ * windows the live `canonicalExtras` to the requested range — so for a narrow
+ * window the reader MUST decline (return `null`) rather than attach
+ * out-of-window variantCount/canonicalName/variants. A common rule: serve only
+ * when the request window covers full history.
+ */
+interface RollupRowsReader {
+  (opts: {
+    id: string;
+    ctx: {
+      userId: string;
+      siteId: string;
+    };
+    searchType?: SearchType$1;
+    dateRange: {
+      startDate: string;
+      endDate: string;
+    };
+  }): Promise<Array<Record<string, unknown>> | null>;
+}
+/**
+ * Build a {@link ResolveExtraFn} that serves resolver extras from materialised
+ * rollups when one is mapped for the extra's key, else returns `null` to fall
+ * back to the live SQL. Pure wiring around the host's `readRollupRows`.
+ */
+declare function createRollupExtrasOverlay(readRollupRows: RollupRowsReader): ResolveExtraFn;
+declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
+declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
+declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
+declare function metricValue(row: Record<string, unknown>, metric: string): number;
+declare function dimensionValue(row: Record<string, unknown>, dimension: string): string;
+declare function matchesDimensionFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
+declare function matchesMetricFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
+declare function matchesTopLevelPage(row: Record<string, unknown>): boolean;
 interface AssertSchemaInSyncOptions {
   /** Label used in the thrown error (e.g. 'browser', 'sqlite'). */
   label: string;
@@ -164,4 +281,4 @@ interface AssertSchemaInSyncOptions {
   mode: 'exact' | 'superset';
 }
 declare function assertSchemaInSync(options: AssertSchemaInSyncOptions): void;
-export { type AssertSchemaInSyncOptions, type ComparisonFilter, type ComparisonQueryResult, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type OptimizedQueryResult, type PgTableKey, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverOptions, type RunQueryCtx, type RunSQLFn, type SqlFragments, type SqlFragmentsConfig, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
+export { type AssertSchemaInSyncOptions, type ComparisonFilter, type ComparisonQueryResult, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type OptimizedQueryResult, type PgTableKey, type ResolveExtraFn, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverAdapterOptions, type ResolverOptions, type RollupRowsReader, type RunOptimizedQueryOptions, type RunQueryCtx, type RunSQLFn, type SqlFragments, type SqlFragmentsConfig, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };

package/dist/resolver/index.mjs CHANGED Viewed

@@ -1,2 +1,2 @@
-import { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface } from "../_chunks/resolver.mjs";
-export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
+import { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface } from "../_chunks/resolver.mjs";
+export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };

package/dist/rollups.d.mts CHANGED Viewed

@@ -244,6 +244,12 @@ declare function runWindowed(opts: {
     start: string;
     end: string;
   }) => string;
+  /**
+   * Extra named file sets merged into every window's `runSQL` (alongside the
+   * windowed `FILES`). Use to JOIN a non-windowed sidecar (e.g. the query
+   * dimension parquet via `{ QUERY_DIM: { keys: [...] } }`) inside `sqlFor`.
+   */
+  extraFileSets?: Record<string, FileSetRef>;
 }): Promise<Row$1[]>;
 /**
  * Daily totals across the full history. One row per (date, table) with
@@ -283,6 +289,47 @@ declare const topKeywords28dRollup: RollupDef;
  * coexist during a migration.
  */
 declare const topKeywords28dParquetRollup: RollupDef;
+/**
+ * Materialises canonical-query variant grouping so the read path
+ * (`buildExtrasQueries` in `resolver/compile.ts`) becomes a passthrough scan
+ * instead of two window passes (`ROW_NUMBER`/`COUNT` over `PARTITION BY
+ * query_canonical`) plus a `GROUP_CONCAT` over the whole `queries` table on
+ * every request — work that is single-threaded under DuckDB-WASM/Workers and
+ * scales with table size. See ADR-0017.
+ *
+ * One row per `query_canonical` group, columns named 1:1 with the live query's
+ * output (`joinKey`, `variantCount`, `canonicalName`, `variants`) so
+ * `mergeExtras` consumes either source unchanged. `variants` packs the top-10
+ * variants as `query:::clicks:::impressions:::position` joined by `||`,
+ * identical to the live composer.
+ *
+ * Full history (`windowDays: null`), not a trailing window: grouping metadata
+ * is global (which variant is canonical, how many variants exist) and stays
+ * stable across requests rather than shifting with each query's date range.
+ * Reflects the last sync/compaction, not the live tail — readers that need the
+ * tail can layer a recent-overlay later (the envelope carries `builtAt`).
+ */
+declare const queryCanonicalVariantsRollup: RollupDef;
+/**
+ * Canonical-grained fact aggregate (ADR-0018 Gap 2): pre-sums the raw
+ * `(query × date)` query rows to `(query_canonical × date)`, so canonical-
+ * primary top/gaining/losing reads a small pre-aggregated table instead of
+ * re-collapsing variants on every request. Metrics are additive, so summing
+ * these per-date sums over a window is exact — identical to aggregating the raw
+ * rows.
+ *
+ * Null-free by construction: groups by `COALESCE(NULLIF(query_canonical, ''),
+ * query)`, the same total-key expression the opt-in read path uses (ADR-0018
+ * Gap 1), so the rollup never carries a NULL/'' canonical bucket and the read
+ * path needs no fallback when pointed at it.
+ *
+ * Date-grained full history (`windowDays: null`): one rollup serves every date
+ * range (reads filter by `date`) and both windows of a comparison. Opt-in (not
+ * in `DEFAULT_ROLLUPS`); the host points the main query's file set at it for
+ * queries the rollup covers (see `canonicalRollupCovers` /
+ * `RunOptimizedQueryOptions.canonicalSource`).
+ */
+declare const queryCanonicalDailyRollup: RollupDef;
 /**
  * Aggregates the per-URL Indexing API metadata entity store (populated by
  * `gscdump entities indexing snapshot`) into daily counts of `URL_UPDATED`
@@ -360,4 +407,12 @@ declare function rebuildDailyFromHourly(opts: RebuildDailyFromHourlyOptions): Pr
   rowsWritten: number;
 }>;
 declare const DEFAULT_ROLLUPS: readonly RollupDef[];
-export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
+/**
+ * Canonical-primary rollups (ADR-0017 / ADR-0018). Opt-in — kept out of
+ * `DEFAULT_ROLLUPS` because they only pay off once the consumer queries by
+ * `queryCanonical` and wires the read seams (`resolveExtra` /
+ * `canonicalSource`). Hosts opt in by concatenating these onto their def list
+ * (CLI: `gscdump rollups --with-canonical`).
+ */
+declare const CANONICAL_ROLLUPS: readonly RollupDef[];
+export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };

package/dist/rollups.mjs CHANGED Viewed

@@ -1,7 +1,7 @@
 import "./_chunks/layout.mjs";
 import { engineErrors } from "./errors.mjs";
 import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
-import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
+import { createIndexingMetadataStore, createQueryDimStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
 import { MS_PER_DAY } from "gscdump";
 function rollupPrefix(ctx, searchType) {
 	const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -237,10 +237,13 @@ async function runWindowed(opts) {
 		const result = await opts.engine.runSQL({
 			ctx: opts.ctx,
 			table: opts.table,
-			fileSets: { FILES: {
-				table: opts.table,
-				partitions: w.partitions
-			} },
+			fileSets: {
+				FILES: {
+					table: opts.table,
+					partitions: w.partitions
+				},
+				...opts.extraFileSets
+			},
 			sql: opts.sqlFor(w),
 			...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
 		});
@@ -534,6 +537,154 @@ const topKeywords28dParquetRollup = {
 		}));
 	}
 };
+const queryCanonicalVariantsRollup = { // Parquet rollup over the whole `queries` table: one output row per canonical-query group.
+	id: "query_canonical_variants",
+	windowDays: null,
+	format: "parquet",
+	parquetColumns: [
+		{
+			name: "joinKey", // group key: query_canonical, or the raw query when that is NULL/'' (see SQL)
+			type: "VARCHAR",
+			nullable: false
+		},
+		{
+			name: "variantCount", // number of distinct raw queries in the group
+			type: "BIGINT",
+			nullable: false
+		},
+		{
+			name: "canonicalName", // raw query ranked first by summed clicks within the group
+			type: "VARCHAR",
+			nullable: true
+		},
+		{
+			name: "variants", // up to 10 top-click variants, packed as query:::clicks:::impressions:::position joined by '||'
+			type: "VARCHAR",
+			nullable: true
+		}
+	],
+	parquetSortKey: ["joinKey"],
+	async build({ engine, ctx, searchType }) {
+		const parts = await engine.listPartitions({
+			ctx,
+			table: "queries",
+			...searchType !== void 0 ? { searchType } : {} // pass searchType through only when the caller supplied one
+		});
+		if (parts.length === 0) return []; // no partitions listed: emit no rows and skip the query
+		const partitions = parts.map((p) => p.partition);
+		return (await engine.runSQL({ // single query over every listed partition (not windowed)
+			ctx,
+			table: "queries",
+			fileSets: { FILES: {
+				table: "queries",
+				partitions
+			} },
+			...searchType !== void 0 ? { searchType } : {},
+			sql: `
+        WITH per_variant AS (
+          SELECT
+            COALESCE(NULLIF(query_canonical, ''), query) AS joinKey,
+            query AS query,
+            SUM(clicks) AS clicks,
+            SUM(impressions) AS impressions,
+            SUM(sum_position) AS sum_pos,
+            ROW_NUMBER() OVER (PARTITION BY COALESCE(NULLIF(query_canonical, ''), query) ORDER BY SUM(clicks) DESC) AS rn,
+            COUNT(*) OVER (PARTITION BY COALESCE(NULLIF(query_canonical, ''), query)) AS variantCount
+          FROM read_parquet({{FILES}}, union_by_name = true)
+          GROUP BY COALESCE(NULLIF(query_canonical, ''), query), query
+        )
+        SELECT
+          joinKey,
+          MAX(variantCount)::BIGINT AS variantCount,
+          MAX(CASE WHEN rn = 1 THEN query END) AS canonicalName,
+          GROUP_CONCAT(CASE WHEN rn <= 10 THEN query || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') AS variants
+        FROM per_variant
+        GROUP BY joinKey
+      `
+		})).rows.map((r) => ({ // coerce engine rows to the value types declared in parquetColumns
+			joinKey: String(r.joinKey),
+			variantCount: BigInt(r.variantCount),
+			canonicalName: r.canonicalName == null ? null : String(r.canonicalName), // keep NULL as null rather than the string "null"
+			variants: r.variants == null ? null : String(r.variants)
+		}));
+	}
+};
+const queryCanonicalDailyRollup = {
+	id: "query_canonical_daily",
+	windowDays: null,
+	format: "parquet",
+	parquetColumns: [
+		{
+			name: "query_canonical",
+			type: "VARCHAR",
+			nullable: false
+		},
+		{
+			name: "date",
+			type: "DATE",
+			nullable: false
+		},
+		{
+			name: "clicks",
+			type: "BIGINT",
+			nullable: false
+		},
+		{
+			name: "impressions",
+			type: "BIGINT",
+			nullable: false
+		},
+		{
+			name: "sum_position",
+			type: "DOUBLE",
+			nullable: false
+		}
+	],
+	parquetSortKey: ["date", "query_canonical"],
+	async build({ engine, ctx, dataSource, searchType }) {
+		const dimStore = createQueryDimStore({ dataSource });
+		const useDim = await dimStore.loadMeta(ctx) !== null;
+		const canonExpr = useDim ? `COALESCE(qd.query_canonical, NULLIF(q.query_canonical, ''), q.query)` : `COALESCE(NULLIF(query_canonical, ''), query)`;
+		return (await runWindowed({
+			engine,
+			ctx,
+			table: "queries",
+			...searchType !== void 0 ? { searchType } : {},
+			...useDim ? { extraFileSets: { QUERY_DIM: {
+				table: "queries",
+				keys: [dimStore.parquetKey(ctx)]
+			} } } : {},
+			sqlFor: useDim ? (w) => `
+          SELECT
+            ${canonExpr} AS query_canonical,
+            CAST(q.date AS VARCHAR) AS date,
+            SUM(q.clicks)::BIGINT AS clicks,
+            SUM(q.impressions)::BIGINT AS impressions,
+            SUM(q.sum_position)::DOUBLE AS sum_position
+          FROM read_parquet({{FILES}}, union_by_name = true) q
+          LEFT JOIN read_parquet({{QUERY_DIM}}, union_by_name = true) qd ON q.query = qd.query
+          WHERE q.date >= '${w.start}' AND q.date <= '${w.end}'
+          GROUP BY ${canonExpr}, q.date
+        ` : (w) => `
+          SELECT
+            ${canonExpr} AS query_canonical,
+            CAST(date AS VARCHAR) AS date,
+            SUM(clicks)::BIGINT AS clicks,
+            SUM(impressions)::BIGINT AS impressions,
+            SUM(sum_position)::DOUBLE AS sum_position
+          FROM read_parquet({{FILES}}, union_by_name = true)
+          WHERE date >= '${w.start}' AND date <= '${w.end}'
+          GROUP BY ${canonExpr}, date
+        `
+		})).map((r) => ({
+			query_canonical: String(r.query_canonical),
+			date: String(r.date),
+			clicks: BigInt(r.clicks),
+			impressions: BigInt(r.impressions),
+			sum_position: Number(r.sum_position)
+		}));
+	}
+};
 const indexingMetadataRollup = {
 	id: "indexing_metadata",
 	windowDays: null,
@@ -845,4 +996,5 @@ const DEFAULT_ROLLUPS = [
 	sitemapHealthRollup,
 	sitemapChanges28dRollup
 ];
-export { DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
+const CANONICAL_ROLLUPS = [queryCanonicalVariantsRollup, queryCanonicalDailyRollup]; // the two canonical-query rollups, exported as a list separate from DEFAULT_ROLLUPS
+export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@gscdump/engine",
   "type": "module",
-  "version": "0.29.0",
+  "version": "0.31.0",
   "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
   "author": {
     "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
     "hyparquet": "^1.26.1",
     "hyparquet-writer": "^0.16.1",
     "proper-lockfile": "^4.1.2",
-    "@gscdump/contracts": "0.29.0",
-    "gscdump": "0.29.0"
+    "@gscdump/contracts": "0.31.0",
+    "gscdump": "0.31.0"
   },
   "devDependencies": {
     "@duckdb/duckdb-wasm": "^1.32.0",