@gscdump/engine 0.28.3 → 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import { engineErrors } from "../errors.mjs";
2
- import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
2
+ import { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, icebergPartitionColumns, icebergSchemasFor, icebergTableSpec, isIcebergTable } from "../_chunks/schema2.mjs";
3
3
  import { cachingResolver, icebergAppend, icebergCreateTable, icebergDropTable, icebergManifests, restCatalogConnect, restCatalogCreateNamespace, restCatalogListTables, restCatalogLoadTable, s3SignedResolver } from "../_chunks/libs/icebird.mjs";
4
4
  import { err, ok } from "gscdump/result";
5
5
  async function cacheGet(cache, key, now) {
@@ -35,21 +35,25 @@ function decodeInt(bytes) {
35
35
  if (u == null) return null;
36
36
  return new DataView(u.buffer, u.byteOffset, u.byteLength).getInt32(0, true);
37
37
  }
38
- function buildPartitionFilter(siteId, searchType, wantedMonths) {
38
+ function buildPartitionFilter(siteId, searchType, wantedMonths, encoding = "string") {
39
39
  return (partitions) => {
40
40
  const parts = partitions;
41
41
  if (!parts || parts.length === 0) return true;
42
- const siteSummary = parts[SITE_ID_FIELD_INDEX];
43
- if (siteSummary && (siteSummary.lower_bound != null || siteSummary.upper_bound != null)) {
44
- const lo = decodeString(siteSummary.lower_bound);
45
- const hi = decodeString(siteSummary.upper_bound);
46
- if (lo != null && hi != null && (siteId < lo || siteId > hi)) return false;
47
- }
48
- const searchTypeSummary = parts[SEARCH_TYPE_FIELD_INDEX];
49
- if (searchTypeSummary && (searchTypeSummary.lower_bound != null || searchTypeSummary.upper_bound != null)) {
50
- const lo = decodeString(searchTypeSummary.lower_bound);
51
- const hi = decodeString(searchTypeSummary.upper_bound);
52
- if (lo != null && hi != null && (searchType < lo || searchType > hi)) return false;
42
+ if (encoding === "string") {
43
+ const siteStr = String(siteId);
44
+ const searchStr = String(searchType);
45
+ const siteSummary = parts[SITE_ID_FIELD_INDEX];
46
+ if (siteSummary && (siteSummary.lower_bound != null || siteSummary.upper_bound != null)) {
47
+ const lo = decodeString(siteSummary.lower_bound);
48
+ const hi = decodeString(siteSummary.upper_bound);
49
+ if (lo != null && hi != null && (siteStr < lo || siteStr > hi)) return false;
50
+ }
51
+ const searchTypeSummary = parts[SEARCH_TYPE_FIELD_INDEX];
52
+ if (searchTypeSummary && (searchTypeSummary.lower_bound != null || searchTypeSummary.upper_bound != null)) {
53
+ const lo = decodeString(searchTypeSummary.lower_bound);
54
+ const hi = decodeString(searchTypeSummary.upper_bound);
55
+ if (lo != null && hi != null && (searchStr < lo || searchStr > hi)) return false;
56
+ }
53
57
  }
54
58
  const monthSummary = parts[DATE_MONTH_FIELD_INDEX];
55
59
  if (monthSummary && (monthSummary.lower_bound != null || monthSummary.upper_bound != null)) {
@@ -74,11 +78,11 @@ const ICEBERG_TYPE_MAP = {
74
78
  DOUBLE: "double",
75
79
  DATE: "date"
76
80
  };
77
- function icebergSchemaFor(table) {
81
+ function icebergSchemaFor(table, encoding = "string") {
78
82
  return {
79
83
  "type": "struct",
80
84
  "schema-id": 0,
81
- "fields": ICEBERG_SCHEMAS[table].columns.map((col) => ({
85
+ "fields": icebergSchemasFor(encoding)[table].columns.map((col) => ({
82
86
  id: col.fieldId,
83
87
  name: col.name,
84
88
  required: col.required,
@@ -86,8 +90,8 @@ function icebergSchemaFor(table) {
86
90
  }))
87
91
  };
88
92
  }
89
- function icebergPartitionSpecFor(table) {
90
- const fields = ICEBERG_SCHEMAS[table].columns;
93
+ function icebergPartitionSpecFor(table, encoding = "string") {
94
+ const fields = icebergSchemasFor(encoding)[table].columns;
91
95
  const fieldId = (name) => {
92
96
  const col = fields.find((c) => c.name === name);
93
97
  if (!col) throw new Error(`iceberg-catalog: table '${table}' has no '${name}' column`);
@@ -176,14 +180,14 @@ async function icebergAppendRetrying(args, options = {}) {
176
180
  async function ensureIcebergNamespace(conn) {
177
181
  await restCatalogCreateNamespace(conn.catalog, { namespace: conn.namespace }).catch(() => {});
178
182
  }
179
- async function createIcebergTables(conn, tables = ICEBERG_TABLES) {
183
+ async function createIcebergTables(conn, tables = ICEBERG_TABLES, encoding = "string") {
180
184
  const results = [];
181
185
  for (const table of tables) await icebergCreateTable({
182
186
  catalog: conn.catalog,
183
187
  namespace: conn.namespace,
184
188
  table,
185
- schema: icebergSchemaFor(table),
186
- partitionSpec: icebergPartitionSpecFor(table)
189
+ schema: icebergSchemaFor(table, encoding),
190
+ partitionSpec: icebergPartitionSpecFor(table, encoding)
187
191
  }).then(() => results.push({
188
192
  table,
189
193
  outcome: ok(void 0)
@@ -275,20 +279,22 @@ async function listIcebergDataFiles(conn, opts) {
275
279
  if (snapshotId == null || !metadata) return [];
276
280
  }
277
281
  const endWalk = profiler?.start("iceberg.walk");
278
- const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths);
282
+ const partitionFilter = buildPartitionFilter(opts.siteId, opts.searchType, wantedMonths, opts.encoding ?? "string");
279
283
  const manifests = await icebergManifests({
280
284
  metadata,
281
285
  resolver: conn.resolver,
282
286
  partitionFilter
283
287
  });
288
+ const wantSite = String(opts.siteId);
289
+ const wantSearch = String(opts.searchType);
284
290
  const out = [];
285
291
  for (const m of manifests) for (const entry of m.entries) {
286
292
  if (entry.status === 2) continue;
287
293
  const df = entry.data_file;
288
294
  if (df.content !== 0) continue;
289
295
  const part = df.partition;
290
- if (part.site_id !== opts.siteId) continue;
291
- if (part.search_type !== opts.searchType) continue;
296
+ if (String(part.site_id) !== wantSite) continue;
297
+ if (String(part.search_type) !== wantSearch) continue;
292
298
  const month = part.date_month;
293
299
  if (typeof month !== "number" || !wantedMonths.has(month)) continue;
294
300
  out.push({
@@ -326,6 +332,8 @@ async function dropIcebergTables(conn, tables) {
326
332
  return results;
327
333
  }
328
334
  const DAY_MILLIS = 864e5;
335
+ const INT32_MIN = -2147483648;
336
+ const INT32_MAX = 2147483647;
329
337
  function toIcebergDate(value) {
330
338
  if (typeof value === "string") {
331
339
  const ms = Date.parse(`${value}T00:00:00Z`);
@@ -343,6 +351,14 @@ function coerceJsonSafe(value) {
343
351
  if (typeof value === "bigint") return Number(value);
344
352
  return value;
345
353
  }
354
+ function toIntPartitionSiteId(value) {
355
+ if (value == null || typeof value === "string" && value.trim() === "") throw new TypeError("toRecords: slice.ctx.siteId is required for int partition encoding");
356
+ if (typeof value !== "string" && typeof value !== "number" && typeof value !== "bigint") throw new TypeError(`toRecords: int partition site_id must be a safe integer, got '${String(value)}'`);
357
+ const siteId = Number(value);
358
+ if (!Number.isSafeInteger(siteId)) throw new TypeError(`toRecords: int partition site_id must be a safe integer, got '${String(value)}'`);
359
+ if (siteId < INT32_MIN || siteId > INT32_MAX) throw new TypeError(`toRecords: int partition site_id must fit Iceberg INT, got '${String(value)}'`);
360
+ return siteId;
361
+ }
346
362
  function dedupeByIdentity(table, records) {
347
363
  if (records.length < 2) return records;
348
364
  const key = ICEBERG_SCHEMAS[table].identityColumns;
@@ -353,19 +369,21 @@ function dedupeByIdentity(table, records) {
353
369
  }
354
370
  return seen.size === records.length ? records : [...seen.values()];
355
371
  }
356
- function toRecords(slice, rows) {
357
- const siteId = slice.ctx.siteId ?? "";
372
+ function toRecords(slice, rows, encoding) {
373
+ const siteVal = encoding === "int" ? toIntPartitionSiteId(slice.ctx.siteId) : slice.ctx.siteId ?? "";
374
+ const searchVal = encoding === "int" ? SEARCH_TYPE_INT[slice.searchType] : slice.searchType;
358
375
  return rows.map((row) => {
359
376
  const out = {};
360
377
  for (const k in row) out[k] = coerceJsonSafe(row[k]);
361
378
  out.date = toIcebergDate(out.date);
362
- out.site_id = siteId;
363
- out.search_type = slice.searchType;
379
+ out.site_id = siteVal;
380
+ out.search_type = searchVal;
364
381
  return out;
365
382
  });
366
383
  }
367
384
  function createIcebergAppendSink(options) {
368
385
  let connection;
386
+ const encoding = options.encoding ?? "string";
369
387
  const buffers = /* @__PURE__ */ new Map();
370
388
  function connect() {
371
389
  connection ??= connectIcebergCatalog(options.catalog);
@@ -375,7 +393,7 @@ function createIcebergAppendSink(options) {
375
393
  capabilities: { appendOnly: true },
376
394
  async emit(slice, rows) {
377
395
  if (rows.length === 0) return { rowCount: 0 };
378
- const records = toRecords(slice, rows);
396
+ const records = toRecords(slice, rows, encoding);
379
397
  const buffer = buffers.get(slice.table);
380
398
  if (buffer) for (let i = 0; i < records.length; i++) buffer.push(records[i]);
381
399
  else buffers.set(slice.table, records);
@@ -429,4 +447,4 @@ function createIcebergAppendSink(options) {
429
447
  }
430
448
  };
431
449
  }
432
- export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_TABLES, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionSpecFor, icebergSchemaFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
450
+ export { ICEBERG_FIELD_ID_BASE, ICEBERG_PARTITION_COLUMNS, ICEBERG_PARTITION_SPEC, ICEBERG_SCHEMAS, ICEBERG_SCHEMAS_INT, ICEBERG_TABLES, INT_SEARCH_TYPE, SEARCH_TYPE_INT, assertIcebergTable, connectIcebergCatalog, createIcebergAppendSink, createIcebergTables, dropIcebergTables, ensureIcebergNamespace, icebergAppendRetrying, icebergCreateTable, icebergManifests, icebergPartitionColumns, icebergPartitionSpecFor, icebergSchemaFor, icebergSchemasFor, icebergTableSpec, isCommitRateLimited, isIcebergTable, listIcebergDataFiles, listIcebergTables, restCatalogLoadTable };
package/dist/ingest.mjs CHANGED
@@ -26,7 +26,7 @@ function toPath(gscUrl) {
26
26
  }
27
27
  }
28
28
  function toSumPosition(apiPosition, impressions) {
29
- return (apiPosition - 1) * Math.max(impressions, 1);
29
+ return ((apiPosition >= 1 ? apiPosition : 1) - 1) * Math.max(impressions, 1);
30
30
  }
31
31
  function transformGscRow(table, apiRow, options = {}) {
32
32
  const keys = apiRow.keys;
@@ -76,8 +76,10 @@ function transformGscRow(table, apiRow, options = {}) {
76
76
  };
77
77
  }
78
78
  if (table === "hourly_pages") {
79
- const hour = String(keys[0] ?? "");
80
- const date = hour.slice(0, 10);
79
+ const hourStamp = String(keys[0] ?? "");
80
+ const date = hourStamp.slice(0, 10);
81
+ const hour = Number.parseInt(hourStamp.slice(11, 13), 10);
82
+ if (!Number.isInteger(hour) || hour < 0 || hour > 23) throw new Error(`hourly_pages: cannot derive hour-of-day from '${hourStamp}'`);
81
83
  return {
82
84
  date,
83
85
  row: {
@@ -1,7 +1,7 @@
1
1
  import { SearchType as SearchType$1, TableName as TableName$1 } from "../_chunks/storage.mjs";
2
2
  import { ComparisonFilter, ExtraQuery, ResolvedComparisonSQL, ResolvedSQL, ResolvedSQLOptimized, ResolverAdapter, ResolverOptions } from "../_chunks/types.mjs";
3
- import { PgTableKey, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "../_chunks/pg-adapter.mjs";
4
- import { LogicalDataset, LogicalDataset as LogicalDataset$1, PlannerCapabilities, UnresolvableDatasetError, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
3
+ import { PgTableKey, ResolverAdapterOptions, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter } from "../_chunks/pg-adapter.mjs";
4
+ import { LogicalDataset, LogicalDataset as LogicalDataset$1, LogicalQueryPlan, PlannerCapabilities, UnresolvableDatasetError, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
5
5
  import { SQL } from "drizzle-orm";
6
6
  import { BuilderState, Dimension, FilterInput, InternalFilter, Metric } from "gscdump/query";
7
7
  import { Grain, TableName } from "@gscdump/contracts";
@@ -35,6 +35,19 @@ interface SqlFragmentsConfig<TableKey extends string> {
35
35
  * against the alias.
36
36
  */
37
37
  tableRef?: (tableKey: TableKey) => SQL;
38
+ /**
39
+ * Opt-in correctness for canonical-primary lookups. When true, the
40
+ * `queryCanonical` dimension expression falls back to the raw `query` when
41
+ * the stored `query_canonical` is NULL (no normalizer ran at ingest) or `''`
42
+ * (a fully-stripped query like "free online"), i.e.
43
+ * `COALESCE(NULLIF(query_canonical, ''), query)`. This makes canonical a
44
+ * TOTAL key, valid for GROUP BY / comparison joins.
45
+ *
46
+ * Default (false) preserves legacy behaviour: the raw nullable column, so a
47
+ * NULL/'' bucket pollutes top results and — because `NULL = NULL` is UNKNOWN
48
+ * — double-counts in the gaining/losing FULL OUTER JOIN. See ADR-0018.
49
+ */
50
+ canonicalFallback?: boolean;
38
51
  }
39
52
  interface SqlFragments<TableKey extends string> {
40
53
  METRIC_NAMES: Metric[];
@@ -65,6 +78,16 @@ interface CreateResolverAdapterConfig<TableKey extends string> extends SqlFragme
65
78
  capabilities: PlannerCapabilities;
66
79
  }
67
80
  declare function createResolverAdapter<TableKey extends string>(config: CreateResolverAdapterConfig<TableKey>): ResolverAdapter<TableKey>;
81
+ /**
82
+ * True when `plan` can be served from the canonical-grained rollup instead of
83
+ * the raw `queries` fact partitions. Conservative: anything that would read a
84
+ * dropped column or the raw row grain disqualifies the query, so a false
85
+ * negative just falls back to live aggregation (correct, slower) — never wrong
86
+ * data.
87
+ */
88
+ declare function planCoveredByCanonicalRollup(plan: LogicalQueryPlan): boolean;
89
+ /** State-level convenience: build the plan then gate. */
90
+ declare function canonicalRollupCovers(state: BuilderState, capabilities: PlannerCapabilities): boolean;
68
91
  declare function resolveToSQLOptimized<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): ResolvedSQLOptimized;
69
92
  declare function resolveToSQL<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): ResolvedSQL;
70
93
  declare function buildTotalsSql<TK extends string>(state: BuilderState, options: ResolverOptions<TK>): {
@@ -77,14 +100,6 @@ declare function mergeExtras(rows: Record<string, unknown>[], extrasResults: {
77
100
  key: string;
78
101
  results: Record<string, unknown>[];
79
102
  }[]): Record<string, unknown>[];
80
- declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
81
- declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
82
- declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
83
- declare function metricValue(row: Record<string, unknown>, metric: string): number;
84
- declare function dimensionValue(row: Record<string, unknown>, dimension: string): string;
85
- declare function matchesDimensionFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
86
- declare function matchesMetricFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
87
- declare function matchesTopLevelPage(row: Record<string, unknown>): boolean;
88
103
  interface RunQueryCtx {
89
104
  userId: string;
90
105
  siteId: string;
@@ -118,6 +133,60 @@ interface RunSQLFn {
118
133
  rows: Array<Record<string, unknown>>;
119
134
  }>;
120
135
  }
136
+ /**
137
+ * Optional overlay that serves a resolver extra (e.g. canonical-variant
138
+ * grouping, keyed `'canonicalExtras'`) from a precomputed source — typically a
139
+ * materialised rollup — instead of the live window-function SQL. Return the
140
+ * rows in the exact shape the live extra produces (`mergeExtras` consumes
141
+ * either source unchanged), or `null` to decline so the caller falls back to
142
+ * the live query. Pure seam: storage/tenant routing lives in the host's
143
+ * implementation, not here. See ADR-0017.
144
+ */
145
+ interface ResolveExtraFn {
146
+ (opts: {
147
+ key: string;
148
+ state: BuilderState;
149
+ ctx: RunQueryCtx;
150
+ dateRange: {
151
+ startDate: string;
152
+ endDate: string;
153
+ };
154
+ }): Promise<Array<Record<string, unknown>> | null>;
155
+ }
156
+ interface RunOptimizedQueryOptions {
157
+ /** Overlay tried per extra before the live SQL; absent → today's live path. */
158
+ resolveExtra?: ResolveExtraFn;
159
+ /**
160
+ * Opt-in canonical-primary correctness: group/compare `queryCanonical` as a
161
+ * total key (NULL/'' folds to the raw `query`). Default false = legacy raw
162
+ * nullable column. See ADR-0018.
163
+ */
164
+ canonicalFallback?: boolean;
165
+ /**
166
+ * Opt-in canonical-primary performance (ADR-0018 Gap 2): object keys of the
167
+ * `query_canonical_daily` rollup parquet(s). When supplied AND the query is
168
+ * coverable (`canonicalRollupCovers`) AND `canonicalFallback` is on AND the
169
+ * window is within the rollup's coverage, the MAIN query reads these
170
+ * pre-summed `(query_canonical × date)` rows instead of re-aggregating raw
171
+ * partitions; variant extras still read raw. Ignored (live path) on any miss,
172
+ * so a mis-wired host degrades to correct-but-slow, never wrong.
173
+ *
174
+ * `canonicalFallback` is REQUIRED: the rollup is built with
175
+ * `COALESCE(NULLIF(query_canonical, ''), query)` (fallback semantics), so
176
+ * serving it to a legacy (`canonicalFallback: false`) caller would change
177
+ * NULL/'' rows from legacy buckets to raw-query keys. The rollup is already
178
+ * null-free, so the rollup READ itself runs without fallback.
179
+ *
180
+ * `coversThrough` (ISO `YYYY-MM-DD`, the rollup's newest covered date) gates
181
+ * staleness: the source is used only when `dateRange.endDate <= coversThrough`,
182
+ * else the live path serves the window so the recent tail is never silently
183
+ * undercounted. Omit to assert full coverage (use with care).
184
+ */
185
+ canonicalSource?: {
186
+ keys: string[];
187
+ coversThrough?: string;
188
+ };
189
+ }
121
190
  interface OptimizedQueryResult {
122
191
  rows: Array<Record<string, unknown>>;
123
192
  totalCount: number;
@@ -140,7 +209,7 @@ interface ComparisonQueryResult {
140
209
  declare function runOptimizedQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, state: BuilderState, dateRange: {
141
210
  startDate: string;
142
211
  endDate: string;
143
- }): Promise<OptimizedQueryResult>;
212
+ }, options?: RunOptimizedQueryOptions): Promise<OptimizedQueryResult>;
144
213
  declare function runComparisonQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, current: BuilderState, previous: BuilderState, windows: {
145
214
  current: {
146
215
  startDate: string;
@@ -150,7 +219,55 @@ declare function runComparisonQuery(runSQL: RunSQLFn, ctx: RunQueryCtx, current:
150
219
  startDate: string;
151
220
  endDate: string;
152
221
  };
153
- }, filter?: ComparisonFilter): Promise<ComparisonQueryResult>;
222
+ }, filter?: ComparisonFilter, options?: {
223
+ canonicalFallback?: boolean;
224
+ canonicalSource?: {
225
+ keys: string[];
226
+ coversThrough?: string;
227
+ };
228
+ }): Promise<ComparisonQueryResult>;
229
+ /**
230
+ * Host-supplied reader: return the materialised rollup's rows for an
231
+ * `(id, tenant, slice)`, in the exact shape the live extra produces, or `null`
232
+ * when no rollup exists (first sync, never built, stale) so the overlay
233
+ * declines and the resolver falls back to the live query. Typically wired with
234
+ * `readLatestRollup` + a `read_parquet` of the pointer.
235
+ *
236
+ * `dateRange` is the request window. `query_canonical_variants` is full-history
237
+ * (its grouping/variant metrics span all dates), but `buildExtrasQueries`
238
+ * windows the live `canonicalExtras` to the requested range — so for a narrow
239
+ * window the reader MUST decline (return `null`) rather than attach
240
+ * out-of-window variantCount/canonicalName/variants. A common rule: serve only
241
+ * when the request window covers full history.
242
+ */
243
+ interface RollupRowsReader {
244
+ (opts: {
245
+ id: string;
246
+ ctx: {
247
+ userId: string;
248
+ siteId: string;
249
+ };
250
+ searchType?: SearchType$1;
251
+ dateRange: {
252
+ startDate: string;
253
+ endDate: string;
254
+ };
255
+ }): Promise<Array<Record<string, unknown>> | null>;
256
+ }
257
+ /**
258
+ * Build a {@link ResolveExtraFn} that serves resolver extras from materialised
259
+ * rollups when one is mapped for the extra's key, else returns `null` to fall
260
+ * back to the live SQL. Pure wiring around the host's `readRollupRows`.
261
+ */
262
+ declare function createRollupExtrasOverlay(readRollupRows: RollupRowsReader): ResolveExtraFn;
263
+ declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
264
+ declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
265
+ declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
266
+ declare function metricValue(row: Record<string, unknown>, metric: string): number;
267
+ declare function dimensionValue(row: Record<string, unknown>, dimension: string): string;
268
+ declare function matchesDimensionFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
269
+ declare function matchesMetricFilter(row: Record<string, unknown>, filter: InternalFilter): boolean;
270
+ declare function matchesTopLevelPage(row: Record<string, unknown>): boolean;
154
271
  interface AssertSchemaInSyncOptions {
155
272
  /** Label used in the thrown error (e.g. 'browser', 'sqlite'). */
156
273
  label: string;
@@ -164,4 +281,4 @@ interface AssertSchemaInSyncOptions {
164
281
  mode: 'exact' | 'superset';
165
282
  }
166
283
  declare function assertSchemaInSync(options: AssertSchemaInSyncOptions): void;
167
- export { type AssertSchemaInSyncOptions, type ComparisonFilter, type ComparisonQueryResult, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type OptimizedQueryResult, type PgTableKey, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverOptions, type RunQueryCtx, type RunSQLFn, type SqlFragments, type SqlFragmentsConfig, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
284
+ export { type AssertSchemaInSyncOptions, type ComparisonFilter, type ComparisonQueryResult, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type OptimizedQueryResult, type PgTableKey, type ResolveExtraFn, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverAdapterOptions, type ResolverOptions, type RollupRowsReader, type RunOptimizedQueryOptions, type RunQueryCtx, type RunSQLFn, type SqlFragments, type SqlFragmentsConfig, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
@@ -1,2 +1,2 @@
1
- import { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface } from "../_chunks/resolver.mjs";
2
- export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
1
+ import { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface } from "../_chunks/resolver.mjs";
2
+ export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
@@ -283,6 +283,47 @@ declare const topKeywords28dRollup: RollupDef;
283
283
  * coexist during a migration.
284
284
  */
285
285
  declare const topKeywords28dParquetRollup: RollupDef;
286
+ /**
287
+ * Materialises canonical-query variant grouping so the read path
288
+ * (`buildExtrasQueries` in `resolver/compile.ts`) becomes a passthrough scan
289
+ * instead of two window passes (`ROW_NUMBER`/`COUNT` over `PARTITION BY
290
+ * query_canonical`) plus a `GROUP_CONCAT` over the whole `queries` table on
291
+ * every request — work that is single-threaded under DuckDB-WASM/Workers and
292
+ * scales with table size. See ADR-0017.
293
+ *
294
+ * One row per `query_canonical` group, columns named 1:1 with the live query's
295
+ * output (`joinKey`, `variantCount`, `canonicalName`, `variants`) so
296
+ * `mergeExtras` consumes either source unchanged. `variants` packs the top-10
297
+ * variants as `query:::clicks:::impressions:::position` joined by `||`,
298
+ * identical to the live composer.
299
+ *
300
+ * Full history (`windowDays: null`), not a trailing window: grouping metadata
301
+ * is global (which variant is canonical, how many variants exist) and stays
302
+ * stable across requests rather than shifting with each query's date range.
303
+ * Reflects the last sync/compaction, not the live tail — readers that need the
304
+ * tail can layer a recent-overlay later (the envelope carries `builtAt`).
305
+ */
306
+ declare const queryCanonicalVariantsRollup: RollupDef;
307
+ /**
308
+ * Canonical-grained fact aggregate (ADR-0018 Gap 2): pre-sums the raw
309
+ * `(query × date)` query rows to `(query_canonical × date)`, so canonical-
310
+ * primary top/gaining/losing reads a small pre-aggregated table instead of
311
+ * re-collapsing variants on every request. Metrics are additive, so summing
312
+ * these per-date sums over a window is exact — identical to aggregating the raw
313
+ * rows.
314
+ *
315
+ * Null-free by construction: groups by `COALESCE(NULLIF(query_canonical, ''),
316
+ * query)`, the same total-key expression the opt-in read path uses (ADR-0018
317
+ * Gap 1), so the rollup never carries a NULL/'' canonical bucket and the read
318
+ * path needs no fallback when pointed at it.
319
+ *
320
+ * Date-grained full history (`windowDays: null`): one rollup serves every date
321
+ * range (reads filter by `date`) and both windows of a comparison. Opt-in (not
322
+ * in `DEFAULT_ROLLUPS`); the host points the main query's file set at it for
323
+ * queries the rollup covers (see `canonicalRollupCovers` /
324
+ * `RunOptimizedQueryOptions.canonicalSource`).
325
+ */
326
+ declare const queryCanonicalDailyRollup: RollupDef;
286
327
  /**
287
328
  * Aggregates the per-URL Indexing API metadata entity store (populated by
288
329
  * `gscdump entities indexing snapshot`) into daily counts of `URL_UPDATED`
@@ -360,4 +401,12 @@ declare function rebuildDailyFromHourly(opts: RebuildDailyFromHourlyOptions): Pr
360
401
  rowsWritten: number;
361
402
  }>;
362
403
  declare const DEFAULT_ROLLUPS: readonly RollupDef[];
363
- export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
404
+ /**
405
+ * Canonical-primary rollups (ADR-0017 / ADR-0018). Opt-in — kept out of
406
+ * `DEFAULT_ROLLUPS` because they only pay off once the consumer queries by
407
+ * `queryCanonical` and wires the read seams (`resolveExtra` /
408
+ * `canonicalSource`). Hosts opt in by concatenating these onto their def list
409
+ * (CLI: `gscdump rollups --with-canonical`).
410
+ */
411
+ declare const CANONICAL_ROLLUPS: readonly RollupDef[];
412
+ export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildDailyFromHourlyOptions, RebuildRollupResult, RebuildRollupsOptions, RollupBucket, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/dist/rollups.mjs CHANGED
@@ -534,6 +534,136 @@ const topKeywords28dParquetRollup = {
534
534
  }));
535
535
  }
536
536
  };
537
/**
 * Full-history (`windowDays: null`) parquet rollup with one row per canonical
 * query group of the `queries` table.
 *
 * Output columns:
 * - `joinKey`: `COALESCE(NULLIF(query_canonical, ''), query)`, i.e. the
 *   canonical form, falling back to the raw query when it is NULL or empty.
 * - `variantCount`: number of distinct raw queries sharing that key.
 * - `canonicalName`: the rank-1 raw query of the group.
 * - `variants`: up to ten variants packed as
 *   `query:::clicks:::impressions:::position`, joined by `||`, in rank order.
 *
 * Ranking is clicks first, then impressions, then the query text. The extra
 * keys make the ranking total, so `canonicalName`, the top-10 membership and
 * the order inside `variants` do not change between rebuilds of the same data
 * when several variants tie on clicks.
 */
const queryCanonicalVariantsRollup = {
  id: "query_canonical_variants",
  windowDays: null,
  format: "parquet",
  parquetColumns: [
    { name: "joinKey", type: "VARCHAR", nullable: false },
    { name: "variantCount", type: "BIGINT", nullable: false },
    { name: "canonicalName", type: "VARCHAR", nullable: true },
    { name: "variants", type: "VARCHAR", nullable: true }
  ],
  parquetSortKey: ["joinKey"],
  /**
   * Builds the rollup rows from every partition of the `queries` table.
   *
   * @param {object} args
   * @param {object} args.engine - Must provide `listPartitions` and `runSQL`.
   * @param {object} args.ctx - Passed through unchanged to both engine calls.
   * @param {*} [args.searchType] - Forwarded to the engine only when defined.
   * @returns {Promise<Array<{joinKey: string, variantCount: bigint, canonicalName: (string|null), variants: (string|null)}>>}
   *   One row per canonical group; an empty array when no partitions exist.
   */
  async build({ engine, ctx, searchType }) {
    // Only forward searchType when the caller actually supplied one.
    const searchTypeOpt = searchType !== void 0 ? { searchType } : {};
    const parts = await engine.listPartitions({
      ctx,
      table: "queries",
      ...searchTypeOpt
    });
    // Nothing synced yet: skip the SQL round-trip entirely.
    if (parts.length === 0) return [];
    const partitions = parts.map((p) => p.partition);
    const result = await engine.runSQL({
      ctx,
      table: "queries",
      fileSets: { FILES: {
        table: "queries",
        partitions
      } },
      ...searchTypeOpt,
      // Position is rendered as sum_pos / impressions + 1, rounded to one
      // decimal (presumably sum_position is zero-based; confirm against the
      // ingest schema). NULLIF keeps zero-impression variants from dividing
      // by zero; their packed entry becomes NULL and is skipped by the
      // aggregate.
      sql: `
        WITH per_variant AS (
          SELECT
            COALESCE(NULLIF(query_canonical, ''), query) AS joinKey,
            query AS query,
            SUM(clicks) AS clicks,
            SUM(impressions) AS impressions,
            SUM(sum_position) AS sum_pos,
            ROW_NUMBER() OVER (PARTITION BY COALESCE(NULLIF(query_canonical, ''), query) ORDER BY SUM(clicks) DESC, SUM(impressions) DESC, query ASC) AS rn,
            COUNT(*) OVER (PARTITION BY COALESCE(NULLIF(query_canonical, ''), query)) AS variantCount
          FROM read_parquet({{FILES}}, union_by_name = true)
          GROUP BY COALESCE(NULLIF(query_canonical, ''), query), query
        )
        SELECT
          joinKey,
          MAX(variantCount)::BIGINT AS variantCount,
          MAX(CASE WHEN rn = 1 THEN query END) AS canonicalName,
          GROUP_CONCAT(CASE WHEN rn <= 10 THEN query || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||' ORDER BY rn) AS variants
        FROM per_variant
        GROUP BY joinKey
      `
    });
    // Normalise driver values to the declared parquet column types.
    return result.rows.map((r) => ({
      joinKey: String(r.joinKey),
      variantCount: BigInt(r.variantCount),
      canonicalName: r.canonicalName == null ? null : String(r.canonicalName),
      variants: r.variants == null ? null : String(r.variants)
    }));
  }
};
609
/**
 * Full-history (`windowDays: null`) parquet rollup that pre-sums the `queries`
 * table to one row per `(query_canonical, date)`.
 *
 * The grouping key is `COALESCE(NULLIF(query_canonical, ''), query)`, so a
 * NULL or empty canonical value falls back to the raw query and the output
 * never carries a NULL/'' bucket. Clicks, impressions and `sum_position` are
 * plain sums per key and date.
 */
const queryCanonicalDailyRollup = {
  id: "query_canonical_daily",
  windowDays: null,
  format: "parquet",
  parquetColumns: [
    { name: "query_canonical", type: "VARCHAR", nullable: false },
    { name: "date", type: "DATE", nullable: false },
    { name: "clicks", type: "BIGINT", nullable: false },
    { name: "impressions", type: "BIGINT", nullable: false },
    { name: "sum_position", type: "DOUBLE", nullable: false }
  ],
  parquetSortKey: ["date", "query_canonical"],
  /**
   * Runs the per-window aggregate through `runWindowed` and normalises the
   * returned rows to the declared column types.
   *
   * @param {object} args
   * @param {object} args.engine - Forwarded to `runWindowed`.
   * @param {object} args.ctx - Forwarded to `runWindowed`.
   * @param {*} [args.searchType] - Forwarded only when defined.
   * @returns {Promise<Array<{query_canonical: string, date: string, clicks: bigint, impressions: bigint, sum_position: number}>>}
   */
  async build({ engine, ctx, searchType }) {
    // SQL for a single window; `win.start` / `win.end` bound the date filter.
    const windowSql = (win) => [
      "",
      "SELECT",
      "COALESCE(NULLIF(query_canonical, ''), query) AS query_canonical,",
      "CAST(date AS VARCHAR) AS date,",
      "SUM(clicks)::BIGINT AS clicks,",
      "SUM(impressions)::BIGINT AS impressions,",
      "SUM(sum_position)::DOUBLE AS sum_position",
      "FROM read_parquet({{FILES}}, union_by_name = true)",
      `WHERE date >= '${win.start}' AND date <= '${win.end}'`,
      "GROUP BY COALESCE(NULLIF(query_canonical, ''), query), date",
      ""
    ].join("\n");

    const request = { engine, ctx, table: "queries" };
    // Leave the key out entirely when no search type was supplied.
    if (searchType !== void 0) request.searchType = searchType;
    request.sqlFor = windowSql;

    const aggregated = await runWindowed(request);
    return aggregated.map((row) => {
      const { query_canonical, date, clicks, impressions, sum_position } = row;
      return {
        query_canonical: String(query_canonical),
        date: String(date),
        clicks: BigInt(clicks),
        impressions: BigInt(impressions),
        sum_position: Number(sum_position)
      };
    });
  }
};
537
667
  const indexingMetadataRollup = {
538
668
  id: "indexing_metadata",
539
669
  windowDays: null,
@@ -560,7 +690,7 @@ const indexingMetadataRollup = {
560
690
  if (!latestRemove || r.latestRemoveAt > latestRemove) latestRemove = r.latestRemoveAt;
561
691
  }
562
692
  }
563
- const days = new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
693
+ const days = /* @__PURE__ */ new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
564
694
  const perDay = Array.from(days).sort().map((day) => ({
565
695
  day,
566
696
  updates: updatesByDay.get(day) ?? 0,
@@ -845,4 +975,5 @@ const DEFAULT_ROLLUPS = [
845
975
  sitemapHealthRollup,
846
976
  sitemapChanges28dRollup
847
977
  ];
848
- export { DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
978
// Opt-in canonical-primary rollup definitions, exported separately from
// DEFAULT_ROLLUPS so hosts can concatenate them onto their own def list.
const CANONICAL_ROLLUPS = [
  queryCanonicalVariantsRollup,
  queryCanonicalDailyRollup
];
979
+ export { CANONICAL_ROLLUPS, DEFAULT_ROLLUPS, WINDOW_BYTE_BUDGET, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, partitionDaySpan, partitionsInRange, planRollupWindows, queryCanonicalDailyRollup, queryCanonicalVariantsRollup, readLatestRollup, rebuildDailyFromHourly, rebuildRollups, rollupKey, rollupParquetKey, runWindowed, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/dist/schema.d.mts CHANGED
@@ -1,2 +1,2 @@
1
- import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
2
- export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
1
+ import { ColumnDef, ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
2
+ export { type ColumnDef, type ColumnType, type DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
package/dist/schema.mjs CHANGED
@@ -1,2 +1,2 @@
1
- import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
2
- export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
1
+ import { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries } from "./_chunks/schema.mjs";
2
+ export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };