@gscdump/engine 0.28.3 → 0.30.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,7 +19,7 @@ declare function createDuckDBExecutor(factory: DuckDBFactory): QueryExecutor;
19
19
  /**
20
20
  * Canonical "empty-file" SELECT clause for a table. Codecs that need to
21
21
  * emit a schema-correct empty Parquet can wrap this in:
22
- * `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET)`
22
+ * `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET, COMPRESSION ZSTD)`
23
23
  * to satisfy the ParquetCodec empty-rows invariant.
24
24
  */
25
25
  declare function canonicalEmptyParquetSchema(table: TableName): string;
@@ -1,6 +1,7 @@
1
1
  import { dayPartition, hourPartition, inferSearchType, objectKey, tenantPrefix } from "./layout.mjs";
2
- import { SCHEMAS, currentSchemaVersion, dedupeByNaturalKey } from "./schema.mjs";
2
+ import { SCHEMAS, currentSchemaVersion, dateColumnsFor, dedupeByNaturalKey } from "./schema.mjs";
3
3
  import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from "./compaction.mjs";
4
+ import { dateReplaceClause as dateReplaceClause$1 } from "../sql-fragments.mjs";
4
5
  import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
5
6
  import { sqlEscape } from "../sql-bind.mjs";
6
7
  import { buildLogicalPlan } from "gscdump/query/plan";
@@ -13,7 +14,7 @@ async function encodeBytes(db, table, rows) {
13
14
  await db.registerFileBuffer(inName, jsonBytes);
14
15
  registered.push(inName);
15
16
  try {
16
- const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`;
17
+ const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`;
17
18
  await db.query(sql);
18
19
  registered.push(outName);
19
20
  return await db.copyFileToBuffer(outName);
@@ -58,7 +59,7 @@ function createDuckDBCodec(factory) {
58
59
  const outName = db.makeTempPath("parquet");
59
60
  const fileList = inputUris.map((u) => `'${sqlEscape(u)}'`).join(", ");
60
61
  try {
61
- await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
62
+ await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`);
62
63
  const bytes = await db.copyFileToBuffer(outName);
63
64
  const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
64
65
  const rowCount = Number(countRows[0]?.n ?? 0);
@@ -83,7 +84,7 @@ function createDuckDBCodec(factory) {
83
84
  }
84
85
  try {
85
86
  const fileList = inNames.map((n) => `'${sqlEscape(n)}'`).join(", ");
86
- await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
87
+ await db.query(`COPY (${dedupedMergeSql(ctx.table, fileList)}) TO '${sqlEscape(outName)}' (FORMAT PARQUET, COMPRESSION ZSTD)`);
87
88
  registered.push(outName);
88
89
  const bytes = await db.copyFileToBuffer(outName);
89
90
  const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
@@ -162,9 +163,7 @@ function canonicalEmptyParquetSchema(table) {
162
163
  }
163
164
  function dateReplaceClause(table) {
164
165
  if (!table) return "";
165
- const dateCols = SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
166
- if (dateCols.length === 0) return "";
167
- return `REPLACE (${dateCols.map((n) => `strftime(CAST(${n} AS DATE), '%Y-%m-%d') AS ${n}`).join(", ")})`;
166
+ return dateReplaceClause$1(dateColumnsFor(table), "string");
168
167
  }
169
168
  function columnList(table) {
170
169
  return SCHEMAS[table].columns.map((c) => c.name).join(", ");
@@ -272,6 +271,52 @@ async function gcOrphansImpl(deps, now, graceMs, opts = {}) {
272
271
  }
273
272
  return { deleted: retired.length + sweptOrphans + hourlyDeleted };
274
273
  }
274
+ const PUSHABLE_COLUMN = { query: "query" };
275
+ function txLeaf(leaf, columns) {
276
+ if (leaf.operator !== "equals") return null;
277
+ const column = PUSHABLE_COLUMN[leaf.dimension];
278
+ if (!column || !columns.has(column)) return null;
279
+ return { [column]: { $eq: leaf.expression } };
280
+ }
281
+ function txExact(node, columns) {
282
+ const groupType = node._groupType ?? "and";
283
+ const leafParts = [];
284
+ for (const leaf of node._filters) {
285
+ const t = txLeaf(leaf, columns);
286
+ if (!t) return null;
287
+ leafParts.push(t);
288
+ }
289
+ if (groupType === "or") {
290
+ if (node._nestedGroups?.length || leafParts.length === 0) return null;
291
+ return leafParts.length === 1 ? leafParts[0] : { $or: leafParts };
292
+ }
293
+ const parts = leafParts;
294
+ for (const group of node._nestedGroups ?? []) {
295
+ const t = txExact(group, columns);
296
+ if (!t) return null;
297
+ parts.push(t);
298
+ }
299
+ if (parts.length === 0) return null;
300
+ return parts.length === 1 ? parts[0] : { $and: parts };
301
+ }
302
+ function extractParquetPushdown(state, table) {
303
+ const filter = state?.filter;
304
+ const schema = SCHEMAS[table];
305
+ if (!filter || !schema) return void 0;
306
+ const columns = new Set(schema.columns.map((c) => c.name));
307
+ if ((filter._groupType ?? "and") === "or") return txExact(filter, columns) ?? void 0;
308
+ const parts = [];
309
+ for (const leaf of filter._filters) {
310
+ const t = txLeaf(leaf, columns);
311
+ if (t) parts.push(t);
312
+ }
313
+ for (const group of filter._nestedGroups ?? []) {
314
+ const t = txExact(group, columns);
315
+ if (t) parts.push(t);
316
+ }
317
+ if (parts.length === 0) return void 0;
318
+ return parts.length === 1 ? parts[0] : { $and: parts };
319
+ }
275
320
  const URL_PURGE_TABLES = ["pages", "page_queries"];
276
321
  const MAX_DAY_BYTES = 100 * 1024 * 1024;
277
322
  const URL_COLUMNS = /* @__PURE__ */ new Set();
@@ -463,6 +508,7 @@ function createStorageEngine(opts) {
463
508
  dataSource,
464
509
  table,
465
510
  signal: opts.signal,
511
+ ...opts.pushdownFilters ? { pushdownFilters: opts.pushdownFilters } : {},
466
512
  ...profiler ? { profiler } : {}
467
513
  });
468
514
  endExec?.({ rows: result.rows.length });
@@ -476,6 +522,7 @@ function createStorageEngine(opts) {
476
522
  const plan = buildLogicalPlan(state, { regex: true });
477
523
  const table = ctx.table ?? plan.dataset;
478
524
  const resolved = compileLogicalQueryPlan(plan, table);
525
+ const pushdown = extractParquetPushdown(state, table);
479
526
  return runSQL({
480
527
  ctx: {
481
528
  userId: ctx.userId,
@@ -489,6 +536,7 @@ function createStorageEngine(opts) {
489
536
  sql: resolved.sql,
490
537
  params: resolved.params,
491
538
  signal: ctx.signal,
539
+ ...pushdown ? { pushdownFilters: { FILES: pushdown } } : {},
492
540
  ...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {},
493
541
  ...ctx.profiler ? { profiler: ctx.profiler } : {}
494
542
  });
@@ -60,8 +60,12 @@ interface CreateSqlQuerySourceOptions<TKey extends string> {
60
60
  execute: (sql: string, params: unknown[]) => Promise<QueryRow[]>;
61
61
  /** Tenant id for multi-tenant dialects; forwarded to `resolveToSQL`. */
62
62
  siteId?: string | number;
63
- /** Search-type scope for multi-tenant dialects; forwarded to `resolveToSQL`. */
64
- searchType?: string;
63
+ /**
64
+ * Search-type scope for multi-tenant dialects; forwarded to `resolveToSQL`.
65
+ * `number` = int-encoded code (`SEARCH_TYPE_INT`) for INT `search_type`
66
+ * catalogs (bound bare so the int partition prunes); `string` otherwise.
67
+ */
68
+ searchType?: string | number;
65
69
  /** Additional capability flags merged on top of `adapter.capabilities`. */
66
70
  extraCapabilities?: Partial<SourceCapabilities>;
67
71
  }
@@ -175,8 +175,8 @@ function nextTableBitSize(count, len, root_bits) {
175
175
  }
176
176
  function buildHuffmanTable(root_table, table, root_bits, code_lengths, code_lengths_size) {
177
177
  const start_table = table;
178
- const count = new Int32Array(16);
179
- const offset = new Int32Array(16);
178
+ const count = /* @__PURE__ */ new Int32Array(16);
179
+ const offset = /* @__PURE__ */ new Int32Array(16);
180
180
  const sorted = new Int32Array(code_lengths_size);
181
181
  for (let i = 0; i < code_lengths_size; i++) count[code_lengths[i]]++;
182
182
  offset[1] = 0;
@@ -220,7 +220,7 @@ function readHuffmanCode(alphabet_size, tables, table, br) {
220
220
  if (simple_code_or_skip === 1) {
221
221
  let max_bits_counter = alphabet_size - 1;
222
222
  let max_bits = 0;
223
- const symbols = new Int32Array(4);
223
+ const symbols = /* @__PURE__ */ new Int32Array(4);
224
224
  const num_symbols = br.readBits(2) + 1;
225
225
  while (max_bits_counter) {
226
226
  max_bits_counter >>= 1;
@@ -505,7 +505,7 @@ const fixedDistanceExtraBits = new Uint8Array([
505
505
  0
506
506
  ]);
507
507
  function freb(eb, start) {
508
- const base = new Uint16Array(31);
508
+ const base = /* @__PURE__ */ new Uint16Array(31);
509
509
  for (let i = 0; i < 31; i++) base[i] = start += 1 << eb[i - 1];
510
510
  const rev = new Int32Array(base[30]);
511
511
  for (let i = 1; i < 30; i++) for (let j = base[i]; j < base[i + 1]; ++j) rev[j] = j - base[i] << 5 | i;
@@ -518,7 +518,7 @@ const { base: fixedLength, rev: revfl } = freb(fixedLengthExtraBits, 2);
518
518
  fixedLength[28] = 258;
519
519
  revfl[258] = 28;
520
520
  const { base: fixedDistance } = freb(fixedDistanceExtraBits, 0);
521
- const rev = new Uint16Array(32768);
521
+ const rev = /* @__PURE__ */ new Uint16Array(32768);
522
522
  for (let i = 0; i < 32768; i++) {
523
523
  let x = (i & 43690) >> 1 | (i & 21845) << 1;
524
524
  x = (x & 52428) >> 2 | (x & 13107) << 2;
@@ -546,12 +546,12 @@ function huffMap(cd, maxBits, r) {
546
546
  }
547
547
  return co;
548
548
  }
549
- const fixedLengthTree = new Uint8Array(288);
549
+ const fixedLengthTree = /* @__PURE__ */ new Uint8Array(288);
550
550
  for (let i = 0; i < 144; i++) fixedLengthTree[i] = 8;
551
551
  for (let i = 144; i < 256; i++) fixedLengthTree[i] = 9;
552
552
  for (let i = 256; i < 280; i++) fixedLengthTree[i] = 7;
553
553
  for (let i = 280; i < 288; i++) fixedLengthTree[i] = 8;
554
- const fixedDistanceTree = new Uint8Array(32);
554
+ const fixedDistanceTree = /* @__PURE__ */ new Uint8Array(32);
555
555
  for (let i = 0; i < 32; i++) fixedDistanceTree[i] = 5;
556
556
  const fixedLengthMap = /*#__PURE__*/ huffMap(fixedLengthTree, 9, 1);
557
557
  const fixedDistanceMap = /*#__PURE__*/ huffMap(fixedDistanceTree, 5, 1);
@@ -2420,7 +2420,7 @@ function gzipStart(input, i) {
2420
2420
  return i + (flag & 2);
2421
2421
  }
2422
2422
  function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
2423
- let out = output ?? new Uint8Array(1024);
2423
+ let out = output ?? /* @__PURE__ */ new Uint8Array(1024);
2424
2424
  if (!(input.length - inputIndex)) return out;
2425
2425
  const payloadStart = gzipStart(input, inputIndex);
2426
2426
  if (payloadStart === input.length - 8) return out;
@@ -2465,7 +2465,7 @@ function gunzip(input, output, inputIndex = 0, outputIndex = 0) {
2465
2465
  const tl = hLiteral + bits(input, pos + 5, 31) + 1;
2466
2466
  pos += 14;
2467
2467
  const lengthDistanceTree = new Uint8Array(tl);
2468
- const codeLengthTree = new Uint8Array(19);
2468
+ const codeLengthTree = /* @__PURE__ */ new Uint8Array(19);
2469
2469
  for (let i = 0; i < hcLengths; ++i) codeLengthTree[codeLengthIndexMap[i]] = bits(input, pos + i * 3, 7);
2470
2470
  pos += hcLengths * 3;
2471
2471
  const codeLengthBits = Math.max(...codeLengthTree);
@@ -962,7 +962,7 @@ function uuidToBytes(value, label) {
962
962
  if (typeof value !== "string") throw new Error(`expected ${label}`);
963
963
  const hex = value.toLowerCase().replace(/-/g, "");
964
964
  if (!/^[0-9a-f]{32}$/.test(hex)) throw new Error(`expected ${label}`);
965
- const bytes = new Uint8Array(16);
965
+ const bytes = /* @__PURE__ */ new Uint8Array(16);
966
966
  for (let i = 0; i < bytes.length; i++) bytes[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
967
967
  return bytes;
968
968
  }
@@ -1074,7 +1074,7 @@ function bucketBytes(value, sourceType) {
1074
1074
  else if (t === "timestamp" || t === "timestamptz") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value);
1075
1075
  else if (t === "timestamp_ns" || t === "timestamptz_ns") v = value instanceof Date ? BigInt(value.getTime()) * 1000n : BigInt(value) / 1000n;
1076
1076
  else v = typeof value === "bigint" ? value : BigInt(value);
1077
- const out = new Uint8Array(8);
1077
+ const out = /* @__PURE__ */ new Uint8Array(8);
1078
1078
  new DataView(out.buffer).setBigInt64(0, v, true);
1079
1079
  return out;
1080
1080
  }
@@ -1835,7 +1835,7 @@ function avroWrite({ writer, schema, records, blockSize = 512, metadata }) {
1835
1835
  writer.appendBytes(vb);
1836
1836
  }
1837
1837
  writer.appendVarInt(0);
1838
- const sync = new Uint8Array(16);
1838
+ const sync = /* @__PURE__ */ new Uint8Array(16);
1839
1839
  for (let i = 0; i < 16; i++) sync[i] = Math.random() * 256 | 0;
1840
1840
  writer.appendBytes(sync);
1841
1841
  for (let i = 0; i < records.length; i += blockSize) {
@@ -1940,7 +1940,7 @@ function appendZigZag64(writer, v) {
1940
1940
  function uuidStringToBytes$1(value) {
1941
1941
  const hex = value.toLowerCase().replace(/-/g, "");
1942
1942
  if (!/^[0-9a-f]{32}$/.test(hex)) throw new Error("expected uuid string");
1943
- const bytes = new Uint8Array(16);
1943
+ const bytes = /* @__PURE__ */ new Uint8Array(16);
1944
1944
  for (let i = 0; i < 16; i++) bytes[i] = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
1945
1945
  return bytes;
1946
1946
  }
@@ -2531,7 +2531,7 @@ function twosComplementMinBigEndian(value) {
2531
2531
  function uuidStringToBytes(s) {
2532
2532
  const hex = s.replace(/-/g, "");
2533
2533
  if (hex.length !== 32) return void 0;
2534
- const out = new Uint8Array(16);
2534
+ const out = /* @__PURE__ */ new Uint8Array(16);
2535
2535
  for (let i = 0; i < 16; i++) {
2536
2536
  const byte = parseInt(hex.slice(i * 2, i * 2 + 2), 16);
2537
2537
  if (Number.isNaN(byte)) return void 0;
@@ -3286,7 +3286,7 @@ function resolveParquetCodec(value) {
3286
3286
  }
3287
3287
  function newSnapshotId(metadata) {
3288
3288
  const used = new Set((metadata?.snapshots ?? []).map((s) => BigInt(s["snapshot-id"])));
3289
- const arr = new BigInt64Array(1);
3289
+ const arr = /* @__PURE__ */ new BigInt64Array(1);
3290
3290
  for (let attempt = 0; attempt < 32; attempt++) {
3291
3291
  globalThis.crypto.getRandomValues(arr);
3292
3292
  const masked = arr[0] & 9007199254740991n;
@@ -3319,24 +3319,38 @@ async function icebergManifests({ metadata, resolver, snapshotId, partitionFilte
3319
3319
  });
3320
3320
  return await fetchManifests(manifests, resolver);
3321
3321
  }
3322
+ const MANIFEST_FETCH_CONCURRENCY = 8;
3323
+ async function fetchOneManifest(manifest, resolver) {
3324
+ const url = manifest.manifest_path;
3325
+ const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
3326
+ for (const entry of entries) {
3327
+ entry.partition_spec_id = manifest.partition_spec_id ?? 0;
3328
+ if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number ?? 0n;
3329
+ if (entry.status === 1) {
3330
+ if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number;
3331
+ if (entry.file_sequence_number === void 0) entry.file_sequence_number = manifest.sequence_number;
3332
+ } else if (entry.sequence_number === void 0 || entry.file_sequence_number === void 0) throw new Error("iceberg manifest entry missing sequence number");
3333
+ }
3334
+ assignFirstRowIds(manifest, entries);
3335
+ return {
3336
+ url,
3337
+ entries
3338
+ };
3339
+ }
3322
3340
  async function fetchManifests(manifests, resolver) {
3323
- return await Promise.all(manifests.map(async (manifest) => {
3324
- const url = manifest.manifest_path;
3325
- const entries = await fetchAvroRecords(url, resolver, Number(manifest.manifest_length));
3326
- for (const entry of entries) {
3327
- entry.partition_spec_id = manifest.partition_spec_id ?? 0;
3328
- if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number ?? 0n;
3329
- if (entry.status === 1) {
3330
- if (entry.sequence_number === void 0) entry.sequence_number = manifest.sequence_number;
3331
- if (entry.file_sequence_number === void 0) entry.file_sequence_number = manifest.sequence_number;
3332
- } else if (entry.sequence_number === void 0 || entry.file_sequence_number === void 0) throw new Error("iceberg manifest entry missing sequence number");
3333
- }
3334
- assignFirstRowIds(manifest, entries);
3335
- return {
3336
- url,
3337
- entries
3338
- };
3339
- }));
3341
+ const results = new Array(manifests.length);
3342
+ let next = 0;
3343
+ async function worker() {
3344
+ while (next < manifests.length) {
3345
+ const i = next++;
3346
+ results[i] = await fetchOneManifest(manifests[i], resolver);
3347
+ }
3348
+ }
3349
+ const poolSize = Math.min(MANIFEST_FETCH_CONCURRENCY, manifests.length);
3350
+ const workers = [];
3351
+ for (let w = 0; w < poolSize; w++) workers.push(worker());
3352
+ await Promise.all(workers);
3353
+ return results;
3340
3354
  }
3341
3355
  function assignFirstRowIds(manifest, entries) {
3342
3356
  if (manifest.content !== 0 || manifest.first_row_id == null) return;
@@ -13,7 +13,15 @@ declare const pgResolverAdapter: ResolverAdapter<PgTableKey>;
13
13
  * Single-use: build a fresh adapter per query. Cheap (no I/O) and avoids
14
14
  * accidental adapter caching that would lock in a stale `{{FILES}}` set.
15
15
  */
16
- declare function createParquetResolverAdapter(): ResolverAdapter<PgTableKey>;
16
+ interface ResolverAdapterOptions {
17
+ /**
18
+ * Opt-in canonical-primary correctness: fold NULL/'' `query_canonical` back
19
+ * to the raw `query` so canonical is a total GROUP BY / join key. Default
20
+ * false preserves the legacy raw-column behaviour. See ADR-0018.
21
+ */
22
+ canonicalFallback?: boolean;
23
+ }
24
+ declare function createParquetResolverAdapter(options?: ResolverAdapterOptions): ResolverAdapter<PgTableKey>;
17
25
  /**
18
26
  * Multi-tenant pg-flavored adapter for the Iceberg / R2 SQL read path.
19
27
  * Identical SQL output to `pgResolverAdapter` except WHERE clauses inject
@@ -24,5 +32,5 @@ declare function createParquetResolverAdapter(): ResolverAdapter<PgTableKey>;
24
32
  * so callers must rewrite bare table names to their qualified form (e.g.
25
33
  * `${namespace}.pages`) before sending to R2 SQL.
26
34
  */
27
- declare function createIcebergResolverAdapter(): ResolverAdapter<PgTableKey>;
28
- export { PgTableKey, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter };
35
+ declare function createIcebergResolverAdapter(options?: ResolverAdapterOptions): ResolverAdapter<PgTableKey>;
36
+ export { PgTableKey, ResolverAdapterOptions, createIcebergResolverAdapter, createParquetResolverAdapter, pgResolverAdapter };
@@ -180,7 +180,7 @@ function buildDimensionColumnMap(datasetToTableKey) {
180
180
  return Object.fromEntries(entries);
181
181
  }
182
182
  function createSqlFragments(config) {
183
- const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, includeSearchType, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride } = config;
183
+ const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, includeSearchType, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride, canonicalFallback = false } = config;
184
184
  const DIM_COLUMN_MAP = buildDimensionColumnMap(datasetToTableKey);
185
185
  function isMetricDimension(dim) {
186
186
  return METRIC_NAMES.includes(dim);
@@ -217,6 +217,7 @@ function createSqlFragments(config) {
217
217
  function dimExprSql(dim, tableKey) {
218
218
  const colName = dimColumn(dim, tableKey);
219
219
  if (dim === "page") return sql.raw(urlToPathExpr(colName));
220
+ if (canonicalFallback && dim === "queryCanonical") return sql`COALESCE(NULLIF(${colRef(tableKey, colName)}, ''), ${colRef(tableKey, "query")})`;
220
221
  return colRef(tableKey, colName);
221
222
  }
222
223
  function metricSql(metric, tableKey) {
@@ -431,23 +432,37 @@ const pgResolverAdapter = createResolverAdapter({
431
432
  ...PG_BASE_CONFIG,
432
433
  tableLabel: "pg-resolver-adapter"
433
434
  });
434
- function createParquetResolverAdapter() {
435
+ function createParquetResolverAdapter(options = {}) {
435
436
  return createResolverAdapter({
436
437
  ...PG_BASE_CONFIG,
437
438
  tableLabel: "parquet-resolver-adapter",
439
+ canonicalFallback: options.canonicalFallback ?? false,
438
440
  tableRef: (tk) => sql.raw(`read_parquet({{FILES}}, union_by_name = true) AS "${tk}"`)
439
441
  });
440
442
  }
441
- function createIcebergResolverAdapter() {
443
+ function createIcebergResolverAdapter(options = {}) {
442
444
  return createResolverAdapter({
443
445
  ...PG_BASE_CONFIG,
444
446
  schema: icebergSchema,
445
447
  includeSiteId: true,
446
448
  includeSearchType: true,
447
449
  tableLabel: "iceberg-resolver-adapter",
450
+ canonicalFallback: options.canonicalFallback ?? false,
448
451
  tableRef: (tk) => sql.raw(`"${tk}"`)
449
452
  });
450
453
  }
454
+ const ALLOWED_FILTER_DIMS = /* @__PURE__ */ new Set(["date", "queryCanonical"]);
455
+ function planCoveredByCanonicalRollup(plan) {
456
+ if (plan.dataset !== "queries") return false;
457
+ if (plan.groupByDimensions.length !== 1 || plan.groupByDimensions[0] !== "queryCanonical") return false;
458
+ if (!plan.dimensionFilters.every((f) => ALLOWED_FILTER_DIMS.has(f.dimension))) return false;
459
+ if (plan.prefilters.length > 0) return false;
460
+ if (plan.specialFilters.topLevel) return false;
461
+ return true;
462
+ }
463
+ function canonicalRollupCovers(state, capabilities) {
464
+ return planCoveredByCanonicalRollup(buildLogicalPlan(state, capabilities));
465
+ }
451
466
  const COMPARISON_FILTER_SQL = {
452
467
  new: sql`AND COALESCE(p.impressions, 0) = 0 AND COALESCE(c.impressions, 0) > 0`,
453
468
  lost: sql`AND COALESCE(p.impressions, 0) > 0 AND COALESCE(c.impressions, 0) = 0`,
@@ -726,7 +741,8 @@ function buildExtrasQueries(state, options) {
726
741
  whereParts.push(sql`${adapter.dateColRef(queriesKey)} <= ${plan.dateRange.endDate}`);
727
742
  const whereExpr = whereParts.length > 0 ? sql`WHERE ${joinAnd(whereParts)}` : sql``;
728
743
  const outerQueryCol = sql.raw("query");
729
- const compiled = compileCollapsed(adapter, sql`WITH per_variant AS (SELECT ${t.query_canonical} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${t.query_canonical} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${t.query_canonical}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${t.query_canonical}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`);
744
+ const canonKey = sql`COALESCE(NULLIF(${t.query_canonical}, ''), ${t.query})`;
745
+ const compiled = compileCollapsed(adapter, sql`WITH per_variant AS (SELECT ${canonKey} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${canonKey} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${canonKey}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${canonKey}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`);
730
746
  extras.push({
731
747
  key: "canonicalExtras",
732
748
  sql: compiled.sql,
@@ -802,6 +818,22 @@ function mergeExtras(rows, extrasResults) {
802
818
  return enriched;
803
819
  });
804
820
  }
821
+ const EXTRA_ROLLUP_IDS = { canonicalExtras: "query_canonical_variants" };
822
+ function createRollupExtrasOverlay(readRollupRows) {
823
+ return async ({ key, ctx, dateRange }) => {
824
+ const id = EXTRA_ROLLUP_IDS[key];
825
+ if (id === void 0) return null;
826
+ return readRollupRows({
827
+ id,
828
+ ctx: {
829
+ userId: ctx.userId,
830
+ siteId: ctx.siteId
831
+ },
832
+ dateRange,
833
+ ...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
834
+ });
835
+ };
836
+ }
805
837
  function collectInternalFilters(filter) {
806
838
  if (!filter || !("_filters" in filter)) return [];
807
839
  const flat = filter._filters;
@@ -856,6 +888,9 @@ function matchesMetricFilter(row, filter) {
856
888
  function matchesTopLevelPage(row) {
857
889
  return (normalizeUrl(dimensionValue(row, "page")).match(/\//g)?.length ?? 0) <= 1;
858
890
  }
891
+ function canonicalSourceWithinCoverage(source, windowEnd) {
892
+ return source.coversThrough === void 0 || windowEnd <= source.coversThrough;
893
+ }
859
894
  function runArgs(ctx, partitions) {
860
895
  return {
861
896
  ctx: {
@@ -870,9 +905,11 @@ function runArgs(ctx, partitions) {
870
905
  ...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
871
906
  };
872
907
  }
873
- async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
874
- const adapter = createParquetResolverAdapter();
908
+ async function runOptimizedQuery(runSQL, ctx, state, dateRange, options = {}) {
875
909
  const base = runArgs(ctx, enumeratePartitions(dateRange.startDate, dateRange.endDate));
910
+ const probe = createParquetResolverAdapter({ canonicalFallback: options.canonicalFallback ?? false });
911
+ const useCanonicalSource = options.canonicalSource !== void 0 && (options.canonicalFallback ?? false) && canonicalSourceWithinCoverage(options.canonicalSource, dateRange.endDate) && canonicalRollupCovers(state, probe.capabilities);
912
+ const adapter = useCanonicalSource ? createParquetResolverAdapter({ canonicalFallback: false }) : probe;
876
913
  const optimized = resolveToSQLOptimized(state, {
877
914
  adapter,
878
915
  siteId: void 0
@@ -881,15 +918,31 @@ async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
881
918
  adapter,
882
919
  siteId: void 0
883
920
  });
884
- const [optRes, ...extrasRows] = await Promise.all([runSQL({
921
+ const mainArgs = useCanonicalSource ? {
885
922
  ...base,
923
+ fileSets: { FILES: {
924
+ table: ctx.table,
925
+ keys: options.canonicalSource.keys
926
+ } }
927
+ } : base;
928
+ const resolveExtra = options.resolveExtra;
929
+ const [optRes, ...extrasRows] = await Promise.all([runSQL({
930
+ ...mainArgs,
886
931
  sql: optimized.sql,
887
932
  params: optimized.params
888
- }), ...extras.map((e) => runSQL({
889
- ...base,
890
- sql: e.sql,
891
- params: e.params
892
- }))]);
933
+ }), ...extras.map(async (e) => {
934
+ const overlaid = resolveExtra ? await resolveExtra({
935
+ key: e.key,
936
+ state,
937
+ ctx,
938
+ dateRange
939
+ }) : null;
940
+ return overlaid !== null ? { rows: overlaid } : runSQL({
941
+ ...base,
942
+ sql: e.sql,
943
+ params: e.params
944
+ });
945
+ })]);
893
946
  const firstRow = optRes.rows[0];
894
947
  const totalCount = Number(firstRow?.totalCount ?? 0);
895
948
  const totals = {
@@ -911,8 +964,10 @@ async function runOptimizedQuery(runSQL, ctx, state, dateRange) {
911
964
  }))
912
965
  };
913
966
  }
914
- async function runComparisonQuery(runSQL, ctx, current, previous, windows, filter) {
915
- const adapter = createParquetResolverAdapter();
967
+ async function runComparisonQuery(runSQL, ctx, current, previous, windows, filter, options = {}) {
968
+ const probe = createParquetResolverAdapter({ canonicalFallback: options.canonicalFallback ?? false });
969
+ const useCanonicalSource = options.canonicalSource !== void 0 && (options.canonicalFallback ?? false) && canonicalSourceWithinCoverage(options.canonicalSource, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate) && canonicalRollupCovers(current, probe.capabilities) && canonicalRollupCovers(previous, probe.capabilities);
970
+ const adapter = useCanonicalSource ? createParquetResolverAdapter({ canonicalFallback: false }) : probe;
916
971
  const comparison = resolveComparisonSQL(current, previous, {
917
972
  adapter,
918
973
  siteId: void 0
@@ -921,7 +976,14 @@ async function runComparisonQuery(runSQL, ctx, current, previous, windows, filte
921
976
  adapter,
922
977
  siteId: void 0
923
978
  });
924
- const base = runArgs(ctx, enumeratePartitions(windows.current.startDate < windows.previous.startDate ? windows.current.startDate : windows.previous.startDate, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate));
979
+ const partitions = enumeratePartitions(windows.current.startDate < windows.previous.startDate ? windows.current.startDate : windows.previous.startDate, windows.current.endDate > windows.previous.endDate ? windows.current.endDate : windows.previous.endDate);
980
+ const base = useCanonicalSource ? {
981
+ ...runArgs(ctx, partitions),
982
+ fileSets: { FILES: {
983
+ table: ctx.table,
984
+ keys: options.canonicalSource.keys
985
+ } }
986
+ } : runArgs(ctx, partitions);
925
987
  const main = await runSQL({
926
988
  ...base,
927
989
  sql: comparison.sql,
@@ -953,4 +1015,4 @@ function assertSchemaInSync(options) {
953
1015
  if (missing.length > 0 || extra.length > 0) throw new Error(`${label} drizzle schema for '${key}' drifted from SCHEMAS. Missing: [${missing.join(", ")}]. Extra: [${extra.join(", ")}].`);
954
1016
  }
955
1017
  }
956
- export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
1018
+ export { DIMENSION_SURFACES, LOGICAL_DATASETS, UnresolvableDatasetError, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, canonicalRollupCovers, createIcebergResolverAdapter, createParquetResolverAdapter, createResolverAdapter, createRollupExtrasOverlay, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, isDatasetResolvable, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, planCoveredByCanonicalRollup, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, runComparisonQuery, runOptimizedQuery, supportsDimensionOnSurface };
@@ -1087,12 +1087,12 @@ declare const hourly_pages: import("drizzle-orm/pg-core").PgTableWithColumns<{
1087
1087
  identity: undefined;
1088
1088
  generated: undefined;
1089
1089
  }>;
1090
- hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgVarcharBuilder<[string, ...string[]]>>, {
1090
+ hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgIntegerBuilder>, {
1091
1091
  name: string;
1092
1092
  tableName: "hourly_pages";
1093
- dataType: "string";
1094
- data: string;
1095
- driverParam: string;
1093
+ dataType: "number int32";
1094
+ data: number;
1095
+ driverParam: string | number;
1096
1096
  notNull: true;
1097
1097
  hasDefault: false;
1098
1098
  isPrimaryKey: false;
@@ -2193,12 +2193,12 @@ declare const drizzleSchema: {
2193
2193
  identity: undefined;
2194
2194
  generated: undefined;
2195
2195
  }>;
2196
- hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgVarcharBuilder<[string, ...string[]]>>, {
2196
+ hour: import("drizzle-orm/pg-core").PgBuildColumn<"hourly_pages", import("drizzle-orm/pg-core").SetNotNull<import("drizzle-orm/pg-core").PgIntegerBuilder>, {
2197
2197
  name: string;
2198
2198
  tableName: "hourly_pages";
2199
- dataType: "string";
2200
- data: string;
2201
- driverParam: string;
2199
+ dataType: "number int32";
2200
+ data: number;
2201
+ driverParam: string | number;
2202
2202
  notNull: true;
2203
2203
  hasDefault: false;
2204
2204
  isPrimaryKey: false;
@@ -2236,6 +2236,13 @@ declare const TABLE_METADATA: Record<TableName, {
2236
2236
  declare const SCHEMAS: Record<TableName, TableSchema>;
2237
2237
  declare function currentSchemaVersion(table: TableName): number;
2238
2238
  declare function schemaFor(table: TableName): TableSchema;
2239
+ /**
2240
+ * DATE column names for a table. The single schema-derived source every read
2241
+ * path uses to build the legacy-VARCHAR date canonicalization (see
2242
+ * `dateReplaceClause` in `./sql-fragments`), so the engine codec and the CLI
2243
+ * `dump`/`export` commands agree on which columns to cast.
2244
+ */
2245
+ declare function dateColumnsFor(table: TableName): string[];
2239
2246
  declare function allTables(): readonly TableName[];
2240
2247
  declare function inferTable(dimensions: readonly string[]): TableName;
2241
2248
  /**
@@ -2259,4 +2266,4 @@ declare function naturalKeyColumns(table: TableName): readonly string[];
2259
2266
  */
2260
2267
  declare function dedupeByNaturalKey(table: TableName, rows: readonly Row[]): Row[];
2261
2268
  declare function dimensionToColumn(dim: string, _table: TableName): string;
2262
- export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
2269
+ export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
@@ -74,7 +74,7 @@ const search_appearance_page_queries = pgTable("search_appearance_page_queries",
74
74
  });
75
75
  const hourly_pages = pgTable("hourly_pages", {
76
76
  url: varchar("url").notNull(),
77
- hour: varchar("hour").notNull(),
77
+ hour: integer("hour").notNull(),
78
78
  date: dateCol(),
79
79
  ...metricCols()
80
80
  });
@@ -181,7 +181,7 @@ const TABLE_METADATA = {
181
181
  "date",
182
182
  "hour"
183
183
  ],
184
- version: 1
184
+ version: 2
185
185
  }
186
186
  };
187
187
  function pgSqlTypeToColumnType(sqlType) {
@@ -226,6 +226,9 @@ function currentSchemaVersion(table) {
226
226
  function schemaFor(table) {
227
227
  return SCHEMAS[table];
228
228
  }
229
+ function dateColumnsFor(table) {
230
+ return SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
231
+ }
229
232
  function allTables() {
230
233
  return METRIC_TABLES;
231
234
  }
@@ -260,4 +263,4 @@ function dimensionToColumn(dim, _table) {
260
263
  if (dim === "queryCanonical") return "query_canonical";
261
264
  return dim;
262
265
  }
263
- export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
266
+ export { SCHEMAS, TABLE_METADATA, allTables, countries, currentSchemaVersion, dateColumnsFor, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };