@gscdump/engine 0.30.0 → 0.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,105 @@ async function readOptional(ds, key, signal) {
16
16
  throw e;
17
17
  });
18
18
  }
19
/**
 * Parquet column schema for the query-dimension table, listed in column order.
 * Every column is declared non-nullable.
 */
const QUERY_DIM_COLUMNS = [
  ["query", "VARCHAR"],
  ["query_canonical", "VARCHAR"],
  ["intent_code", "INTEGER"],
  ["normalizer_version", "INTEGER"],
  ["intent_version", "INTEGER"]
].map(([name, type]) => ({ name, type, nullable: false }));
46
/**
 * Storage prefix under which the query-dimension objects for a tenant live.
 * The site segment is included only when `ctx.siteId` is truthy.
 */
function queryDimPrefix(ctx) {
  if (!ctx.siteId) return `u_${ctx.userId}/entities/query_dim`;
  return `u_${ctx.userId}/${ctx.siteId}/entities/query_dim`;
}
49
/** Object key of the query-dimension parquet file for the given tenant context. */
function queryDimParquetKey(ctx) {
  const prefix = queryDimPrefix(ctx);
  return `${prefix}/index.parquet`;
}
52
/** Object key of the JSON sidecar stored next to the query-dimension parquet. */
function queryDimMetaKey(ctx) {
  const prefix = queryDimPrefix(ctx);
  return `${prefix}/index.json`;
}
55
+ function buildQueryDimRecords(queries, deps) {
56
+ const seen = /* @__PURE__ */ new Set();
57
+ const out = [];
58
+ for (const raw of queries) {
59
+ const query = String(raw);
60
+ if (query.trim() === "" || seen.has(query)) continue;
61
+ seen.add(query);
62
+ const canonical = deps.normalizeQuery(query);
63
+ out.push({
64
+ query,
65
+ query_canonical: canonical === "" ? query : canonical,
66
+ intent_code: deps.classifyIntentCode(query),
67
+ normalizer_version: deps.normalizerVersion,
68
+ intent_version: deps.intentVersion
69
+ });
70
+ }
71
+ return out;
72
+ }
73
/**
 * Creates the query-dimension store on top of `dataSource`.
 *
 * The store keeps two objects per tenant: the parquet with the dimension rows
 * and a small JSON sidecar describing it. Presence of either object is decided
 * by listing the store prefix and looking for the exact key.
 */
function createQueryDimStore({ dataSource }) {
  // True when `key` shows up in the listing of `prefix`.
  const isListed = async (key, prefix) => {
    const listed = await dataSource.list(prefix);
    return listed.includes(key);
  };

  // Maps one decoded parquet row onto the record shape, coercing each field.
  const toRecord = (r) => ({
    query: String(r.query),
    query_canonical: String(r.query_canonical),
    intent_code: Number(r.intent_code),
    normalizer_version: Number(r.normalizer_version),
    intent_version: Number(r.intent_version)
  });

  // Writes the parquet first, then the sidecar that describes it.
  async function write(ctx, records, builtAt) {
    const parquetKey = queryDimParquetKey(ctx);
    const encoded = encodeRowsToParquetFlex(records, {
      columns: QUERY_DIM_COLUMNS,
      sortKey: ["query"]
    });
    await dataSource.write(parquetKey, encoded);

    const rowCount = records.length;
    // Versions are taken from the first record; 0 when there are no records.
    const sidecar = {
      version: 1,
      builtAt,
      rowCount,
      normalizerVersion: records[0]?.normalizer_version ?? 0,
      intentVersion: records[0]?.intent_version ?? 0
    };
    const sidecarKey = queryDimMetaKey(ctx);
    const sidecarBytes = new TextEncoder().encode(JSON.stringify(sidecar));
    await dataSource.write(sidecarKey, sidecarBytes);

    return { parquetKey, rowCount };
  }

  // Returns the parsed sidecar, or null when its key is not listed.
  async function loadMeta(ctx) {
    const key = queryDimMetaKey(ctx);
    const present = await isListed(key, `${queryDimPrefix(ctx)}/`);
    if (!present) return null;
    const raw = await dataSource.read(key);
    const text = new TextDecoder().decode(raw);
    return JSON.parse(text);
  }

  // Returns the decoded dimension rows, or [] when the parquet key is not listed.
  async function loadRecords(ctx) {
    const key = queryDimParquetKey(ctx);
    const present = await isListed(key, `${queryDimPrefix(ctx)}/`);
    if (!present) return [];
    const raw = await dataSource.read(key);
    const rows = await decodeParquetToRows(raw);
    return rows.map((r) => toRecord(r));
  }

  return {
    parquetKey: queryDimParquetKey,
    write,
    loadMeta,
    loadRecords
  };
}
19
118
  const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
20
119
  function inspectionIndexKey(ctx) {
21
120
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
@@ -893,4 +992,4 @@ function createEmptyTypesStore(opts) {
893
992
  }
894
993
  };
895
994
  }
896
- export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
995
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -133,7 +133,7 @@ interface Snapshot {
133
133
  'sequence-number': number;
134
134
  'timestamp-ms': number;
135
135
  'manifest-list': string;
136
- manifests?: Manifest$1[];
136
+ manifests?: Manifest[];
137
137
  summary: {
138
138
  // spec: "value of these fields should be of string type"
139
139
  operation: string; // 'spark.app.id'?: string
@@ -192,7 +192,7 @@ interface MetadataLog {
192
192
  'timestamp-ms': number;
193
193
  'metadata-file': string;
194
194
  }
195
- interface Manifest$1 {
195
+ interface Manifest {
196
196
  manifest_path: string;
197
197
  manifest_length: bigint;
198
198
  partition_spec_id: number;
@@ -297,8 +297,10 @@ function createSqlFragments(config) {
297
297
  if (isMetricDimension(f.dimension)) continue;
298
298
  if (f.dimension === "date") continue;
299
299
  if (f.operator === "topLevel") continue;
300
- const cRef = colRef(tableKey, dimColumn(f.dimension, tableKey));
301
- const matchExpr = f.dimension === "page" ? dimExprSql(f.dimension, tableKey) : cRef;
300
+ const dim = f.dimension;
301
+ const cRef = colRef(tableKey, dimColumn(dim, tableKey));
302
+ const matchExpr = dim === "page" || dim === "queryCanonical" ? dimExprSql(dim, tableKey) : cRef;
303
+ const patternExpr = dim === "queryCanonical" ? matchExpr : cRef;
302
304
  switch (f.operator) {
303
305
  case "equals":
304
306
  preds.push(sql`${matchExpr} = ${f.expression}`);
@@ -307,16 +309,16 @@ function createSqlFragments(config) {
307
309
  preds.push(sql`${matchExpr} != ${f.expression}`);
308
310
  break;
309
311
  case "contains":
310
- preds.push(sql`${cRef} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
312
+ preds.push(sql`${patternExpr} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
311
313
  break;
312
314
  case "notContains":
313
- preds.push(sql`${cRef} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
315
+ preds.push(sql`${patternExpr} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
314
316
  break;
315
317
  case "includingRegex":
316
- preds.push(regexPredicate(cRef, f.expression, false));
318
+ preds.push(regexPredicate(patternExpr, f.expression, false));
317
319
  break;
318
320
  case "excludingRegex":
319
- preds.push(regexPredicate(cRef, f.expression, true));
321
+ preds.push(regexPredicate(patternExpr, f.expression, true));
320
322
  break;
321
323
  }
322
324
  }
@@ -1,6 +1,60 @@
1
1
  import { DataSource } from "./_chunks/storage.mjs";
2
2
  import { ScheduleState } from "./schedule.mjs";
3
3
  import { ColumnDef, TenantCtx } from "@gscdump/contracts";
4
+ interface QueryDimRecord { // one row of the query dimension; rows are distinct on `query`
5
+ query: string; // raw query string; the parquet is written sorted on this column
6
+ /** Lexical canonical, never empty: NULL/'' folds to the raw query. */
7
+ query_canonical: string;
8
+ /** Packed search-intent code (see `@gscdump/analysis` `encodeIntent`). */
9
+ intent_code: number;
10
+ normalizer_version: number; // copied from `QueryDimDeps.normalizerVersion` when the record is built
11
+ intent_version: number; // copied from `QueryDimDeps.intentVersion` when the record is built
12
+ }
13
+ /** JSON sidecar: versions + freshness, readable without decoding the parquet. */
14
+ interface QueryDimMeta {
15
+ version: 1; // sidecar format version; the store's `write` always emits the literal 1
16
+ builtAt: number; // the `builtAt` argument of `write`, stored verbatim (presumably an epoch timestamp; confirm with the caller)
17
+ rowCount: number; // number of records passed to the `write` call that produced this sidecar
18
+ normalizerVersion: number; // `normalizer_version` of the first written record; 0 when no records were written
19
+ intentVersion: number; // `intent_version` of the first written record; 0 when no records were written
20
+ }
21
+ declare function queryDimParquetKey(ctx: TenantCtx): string; // `u_<userId>[/<siteId>]/entities/query_dim/index.parquet`; the site segment appears only when `ctx.siteId` is truthy
22
+ declare function queryDimMetaKey(ctx: TenantCtx): string; // same prefix as the parquet key, ending in `/index.json`
23
+ /**
24
+ * Injected derivation. `engine` never imports `@gscdump/analysis`; the host
25
+ * passes `normalizeQuery` / `classifyIntentCode` (e.g. `encodeIntent ∘
26
+ * classifyQueryIntent`) plus their version constants.
27
+ */
28
+ interface QueryDimDeps {
29
+ normalizeQuery: (query: string) => string; // called once per distinct raw query; an empty result is replaced by the raw query
30
+ normalizerVersion: number; // stamped onto every record as `normalizer_version`
31
+ /** Returns the packed intent code for a raw query. */
32
+ classifyIntentCode: (query: string) => number;
33
+ intentVersion: number; // stamped onto every record as `intent_version`
34
+ }
35
+ /**
36
+ * Pure: raw queries → one dimension record per distinct query, in input order.
37
+ * De-dupes on the exact string, drops empty/whitespace-only queries, and folds
38
+ * an empty canonical back to the raw query (matches the read path's `COALESCE(NULLIF(query_canonical, ''), query)`).
39
+ */
40
+ declare function buildQueryDimRecords(queries: Iterable<string>, deps: QueryDimDeps): QueryDimRecord[];
41
+ interface QueryDimStore {
42
+ parquetKey: (ctx: TenantCtx) => string; // the same function as the exported `queryDimParquetKey`
43
+ /** Write the parquet, then the JSON sidecar. Last-write-wins; no history. Resolves with the parquet key and the number of records written. */
44
+ write: (ctx: TenantCtx, records: readonly QueryDimRecord[], builtAt: number) => Promise<{
45
+ parquetKey: string;
46
+ rowCount: number;
47
+ }>;
48
+ /** Read the sidecar (versions + freshness), or null when the sidecar key is not listed under the store prefix (e.g. first build). */
49
+ loadMeta: (ctx: TenantCtx) => Promise<QueryDimMeta | null>;
50
+ /** Decode the dimension rows (test/inspection; reads JOIN the parquet by key). Resolves to [] when the parquet key is not listed. */
51
+ loadRecords: (ctx: TenantCtx) => Promise<QueryDimRecord[]>;
52
+ }
53
+ declare function createQueryDimStore({ // builds a QueryDimStore over the given data source
54
+ dataSource
55
+ }: {
56
+ dataSource: DataSource; // the implementation calls its `list`, `read` and `write` methods
57
+ }): QueryDimStore;
4
58
  /**
5
59
  * GSC URL inspection result fields we persist. Mirrors the
6
60
  * `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
@@ -442,4 +496,4 @@ interface CreateEmptyTypesStoreOptions {
442
496
  now?: () => number;
443
497
  }
444
498
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
445
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
499
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, QueryDimDeps, QueryDimMeta, QueryDimRecord, QueryDimStore, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/entities.mjs CHANGED
@@ -1,2 +1,2 @@
1
- import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
- export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
1
+ import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, buildQueryDimRecords, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createQueryDimStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, queryDimMetaKey, queryDimParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -244,6 +244,12 @@ declare function runWindowed(opts: {
244
244
  start: string;
245
245
  end: string;
246
246
  }) => string;
247
+ /**
248
+ * Extra named file sets merged into every window's `runSQL` (alongside the
249
+ * windowed `FILES`). Use to JOIN a non-windowed sidecar (e.g. the query
250
+ * dimension parquet via `{ QUERY_DIM: { keys: [...] } }`) inside `sqlFor`.
251
+ */
252
+ extraFileSets?: Record<string, FileSetRef>;
247
253
  }): Promise<Row$1[]>;
248
254
  /**
249
255
  * Daily totals across the full history. One row per (date, table) with
package/dist/rollups.mjs CHANGED
@@ -1,7 +1,7 @@
1
1
  import "./_chunks/layout.mjs";
2
2
  import { engineErrors } from "./errors.mjs";
3
3
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
4
- import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
4
+ import { createIndexingMetadataStore, createQueryDimStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
5
5
  import { MS_PER_DAY } from "gscdump";
6
6
  function rollupPrefix(ctx, searchType) {
7
7
  const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -237,10 +237,13 @@ async function runWindowed(opts) {
237
237
  const result = await opts.engine.runSQL({
238
238
  ctx: opts.ctx,
239
239
  table: opts.table,
240
- fileSets: { FILES: {
241
- table: opts.table,
242
- partitions: w.partitions
243
- } },
240
+ fileSets: {
241
+ FILES: {
242
+ table: opts.table,
243
+ partitions: w.partitions
244
+ },
245
+ ...opts.extraFileSets
246
+ },
244
247
  sql: opts.sqlFor(w),
245
248
  ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
246
249
  });
@@ -638,23 +641,41 @@ const queryCanonicalDailyRollup = {
638
641
  }
639
642
  ],
640
643
  parquetSortKey: ["date", "query_canonical"],
641
- async build({ engine, ctx, searchType }) {
644
+ async build({ engine, ctx, dataSource, searchType }) {
645
+ const dimStore = createQueryDimStore({ dataSource });
646
+ const useDim = await dimStore.loadMeta(ctx) !== null;
647
+ const canonExpr = useDim ? `COALESCE(qd.query_canonical, NULLIF(q.query_canonical, ''), q.query)` : `COALESCE(NULLIF(query_canonical, ''), query)`;
642
648
  return (await runWindowed({
643
649
  engine,
644
650
  ctx,
645
651
  table: "queries",
646
652
  ...searchType !== void 0 ? { searchType } : {},
647
- sqlFor: (w) => `
648
- SELECT
649
- COALESCE(NULLIF(query_canonical, ''), query) AS query_canonical,
650
- CAST(date AS VARCHAR) AS date,
651
- SUM(clicks)::BIGINT AS clicks,
652
- SUM(impressions)::BIGINT AS impressions,
653
- SUM(sum_position)::DOUBLE AS sum_position
654
- FROM read_parquet({{FILES}}, union_by_name = true)
655
- WHERE date >= '${w.start}' AND date <= '${w.end}'
656
- GROUP BY COALESCE(NULLIF(query_canonical, ''), query), date
657
- `
653
+ ...useDim ? { extraFileSets: { QUERY_DIM: {
654
+ table: "queries",
655
+ keys: [dimStore.parquetKey(ctx)]
656
+ } } } : {},
657
+ sqlFor: useDim ? (w) => `
658
+ SELECT
659
+ ${canonExpr} AS query_canonical,
660
+ CAST(q.date AS VARCHAR) AS date,
661
+ SUM(q.clicks)::BIGINT AS clicks,
662
+ SUM(q.impressions)::BIGINT AS impressions,
663
+ SUM(q.sum_position)::DOUBLE AS sum_position
664
+ FROM read_parquet({{FILES}}, union_by_name = true) q
665
+ LEFT JOIN read_parquet({{QUERY_DIM}}, union_by_name = true) qd ON q.query = qd.query
666
+ WHERE q.date >= '${w.start}' AND q.date <= '${w.end}'
667
+ GROUP BY ${canonExpr}, q.date
668
+ ` : (w) => `
669
+ SELECT
670
+ ${canonExpr} AS query_canonical,
671
+ CAST(date AS VARCHAR) AS date,
672
+ SUM(clicks)::BIGINT AS clicks,
673
+ SUM(impressions)::BIGINT AS impressions,
674
+ SUM(sum_position)::DOUBLE AS sum_position
675
+ FROM read_parquet({{FILES}}, union_by_name = true)
676
+ WHERE date >= '${w.start}' AND date <= '${w.end}'
677
+ GROUP BY ${canonExpr}, date
678
+ `
658
679
  })).map((r) => ({
659
680
  query_canonical: String(r.query_canonical),
660
681
  date: String(r.date),
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.30.0",
4
+ "version": "0.31.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -191,8 +191,8 @@
191
191
  "hyparquet": "^1.26.1",
192
192
  "hyparquet-writer": "^0.16.1",
193
193
  "proper-lockfile": "^4.1.2",
194
- "@gscdump/contracts": "0.30.0",
195
- "gscdump": "0.30.0"
194
+ "@gscdump/contracts": "0.31.0",
195
+ "gscdump": "0.31.0"
196
196
  },
197
197
  "devDependencies": {
198
198
  "@duckdb/duckdb-wasm": "^1.32.0",