@gscdump/engine 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -274,12 +274,13 @@ function createStorageEngine(opts) {
274
274
  table: ctx.table,
275
275
  partition
276
276
  }, async () => {
277
- const superseding = (await manifestStore.listLive({
277
+ const superseding = await manifestStore.listLive({
278
278
  userId: ctx.userId,
279
279
  siteId: ctx.siteId,
280
280
  table: ctx.table,
281
- partitions: [partition]
282
- })).filter((e) => inferSearchType(e) === inferSearchType({ searchType }));
281
+ partitions: [partition],
282
+ searchType: inferSearchType({ searchType })
283
+ });
283
284
  const normalizedRows = rows.map((r) => normalizeRow(ctx.table, r));
284
285
  const key = objectKey(ctx, ctx.table, partition, now, searchType);
285
286
  const { bytes: writtenBytes, rowCount } = await codec.writeRows({ table: ctx.table }, normalizedRows, key, dataSource);
@@ -322,7 +323,8 @@ function createStorageEngine(opts) {
322
323
  userId: opts.ctx.userId,
323
324
  siteId: opts.ctx.siteId,
324
325
  table: ref.table,
325
- partitions: ref.partitions
326
+ partitions: ref.partitions,
327
+ ...opts.searchType !== void 0 ? { searchType: opts.searchType } : {}
326
328
  })).map((e) => e.objectKey)];
327
329
  }));
328
330
  opts.signal?.throwIfAborted();
@@ -368,7 +370,8 @@ function createStorageEngine(opts) {
368
370
  } },
369
371
  sql: resolved.sql,
370
372
  params: resolved.params,
371
- signal: ctx.signal
373
+ signal: ctx.signal,
374
+ ...ctx.searchType !== void 0 ? { searchType: ctx.searchType } : {}
372
375
  });
373
376
  }
374
377
  async function compactTiered(ctx, thresholds) {
@@ -33,6 +33,13 @@ interface WriteCtx extends TenantCtx {
33
33
  interface QueryCtx extends TenantCtx {
34
34
  table?: TableName;
35
35
  signal?: AbortSignal;
36
+ /**
37
+ * Restrict the query to a single GSC search-type partition (`web`,
38
+ * `discover`, etc.). Undefined preserves the cross-type union for
39
+ * legacy/web-only deployments; explicit value scopes the read to
40
+ * manifest entries written for that type. Mirrors {@link WriteCtx.searchType}.
41
+ */
42
+ searchType?: SearchType;
36
43
  }
37
44
  interface GcCtx {
38
45
  now?: () => number;
@@ -103,6 +110,18 @@ interface ListLiveFilter {
103
110
  * an explicit `tier` field match on {@link inferLegacyTier}.
104
111
  */
105
112
  tier?: CompactionTier;
113
+ /**
114
+ * Narrow to a single GSC searchType slice. Undefined means "no filter" — used
115
+ * by cross-type admin paths (GC / orphan sweep, tenant-stats site discovery).
116
+ * Explicit value filters to that slice; pass `'web'` to match the legacy /
117
+ * sentinel-`''` entries via {@link inferSearchType}.
118
+ *
119
+ * Read paths that scope to a single (user, site, table) cohort MUST set this
120
+ * once writes from multiple search types coexist for that cohort, otherwise
121
+ * the result unions web + non-web entries into a single query and double-
122
+ * counts metrics.
123
+ */
124
+ searchType?: SearchType;
106
125
  }
107
126
  interface DataSource {
108
127
  read: (key: string, range?: {
@@ -379,6 +398,14 @@ interface RunSQLOptions {
379
398
  sql: string;
380
399
  params?: unknown[];
381
400
  signal?: AbortSignal;
401
+ /**
402
+ * Restrict every manifest lookup the runner performs to a single
403
+ * search-type slice. Applies uniformly across all `fileSets`; per-
404
+ * fileSet overrides aren't supported (the only multi-fileSet caller,
405
+ * comparison joins, always wants the same slice for both windows).
406
+ * Undefined keeps the legacy cross-type union.
407
+ */
408
+ searchType?: SearchType;
382
409
  }
383
410
  interface StorageEngine {
384
411
  writeDay: (ctx: WriteCtx, rows: Row[]) => Promise<void>;
@@ -119,6 +119,7 @@ function matchesFilter(entry, filter) {
119
119
  if (filter.table !== void 0 && entry.table !== filter.table) return false;
120
120
  if (filter.partitions && !filter.partitions.includes(entry.partition)) return false;
121
121
  if (filter.tier !== void 0 && inferLegacyTier(entry) !== filter.tier) return false;
122
+ if (filter.searchType !== void 0 && inferSearchType(entry) !== filter.searchType) return false;
122
123
  return true;
123
124
  }
124
125
  function lockFileFor(locksDir, scope) {
@@ -2,6 +2,7 @@ import { T as StorageEngine, i as DataSource } from "../_chunks/storage.mjs";
2
2
  import { NodeDuckDBOptions, createNodeDuckDBHandle, resetNodeDuckDB } from "./duckdb-node.mjs";
3
3
  import { t as SnapshotIndex } from "../_chunks/snapshot.mjs";
4
4
  import { Row, TableName } from "@gscdump/contracts";
5
+ import { SearchType } from "gscdump/query";
5
6
  interface NodeHarnessOptions {
6
7
  dataDir: string;
7
8
  /** Tenant user id. Defaults to `'local'` for single-user CLI installs. */
@@ -24,6 +25,11 @@ interface NodeHarness {
24
25
  siteUrl: string;
25
26
  table: TableName;
26
27
  params?: unknown[];
28
+ /**
29
+ * Restrict the underlying manifest lookup to a single GSC search-type
30
+ * slice. Undefined keeps the legacy cross-type union.
31
+ */
32
+ searchType?: SearchType;
27
33
  }) => Promise<{
28
34
  rows: Row[];
29
35
  sql: string;
@@ -25,7 +25,8 @@ function createNodeHarness(opts) {
25
25
  table: runOpts.table,
26
26
  fileSets: { FILES: { table: runOpts.table } },
27
27
  sql: runOpts.sql,
28
- params: runOpts.params ?? []
28
+ params: runOpts.params ?? [],
29
+ ...runOpts.searchType !== void 0 ? { searchType: runOpts.searchType } : {}
29
30
  });
30
31
  return {
31
32
  rows: result.rows,
@@ -33,6 +33,7 @@ function matchesEntryFilter(entry, filter) {
33
33
  if (filter.table !== void 0 && entry.table !== filter.table) return false;
34
34
  if (filter.partitions && !filter.partitions.includes(entry.partition)) return false;
35
35
  if (filter.tier !== void 0 && inferLegacyTier(entry) !== filter.tier) return false;
36
+ if (filter.searchType !== void 0 && inferSearchType(entry) !== filter.searchType) return false;
36
37
  return true;
37
38
  }
38
39
  function matchesWatermarkFilter(w, filter) {
@@ -1,6 +1,7 @@
1
1
  import { i as DataSource, o as FileSetRef } from "./_chunks/storage.mjs";
2
2
  import { t as ColumnDef } from "./_chunks/schema.mjs";
3
3
  import { TenantCtx } from "@gscdump/contracts";
4
+ import { SearchType } from "gscdump/query";
4
5
  import * as _$_gscdump_engine_contracts0 from "@gscdump/engine/contracts";
5
6
  interface RollupCtx extends TenantCtx {
6
7
  /** When the rollup was built. Stamped into payload + filename. */
@@ -17,6 +18,13 @@ interface RollupEngine {
17
18
  table?: _$_gscdump_engine_contracts0.TableName;
18
19
  sql: string;
19
20
  params?: unknown[];
21
+ /**
22
+ * Restrict every manifest lookup to a single GSC search-type slice. The
23
+ * rollup runner forwards `RebuildRollupsOptions.searchType` so the
24
+ * aggregated facts never mix web + non-web rows. Undefined preserves
25
+ * the legacy cross-type union (web-only tenants).
26
+ */
27
+ searchType?: SearchType;
20
28
  }) => Promise<{
21
29
  rows: _$_gscdump_engine_contracts0.Row[];
22
30
  }>;
@@ -67,6 +75,13 @@ interface RollupDef {
67
75
  * lives in ICU).
68
76
  */
69
77
  builtAt: number;
78
+ /**
79
+ * GSC search-type slice the runner was invoked for. Builders forward
80
+ * this to every `engine.runSQL` call so the aggregated facts come
81
+ * from one cohort. Undefined preserves the legacy cross-type union
82
+ * (used by web-only tenants and admin paths).
83
+ */
84
+ searchType?: SearchType;
70
85
  }) => Promise<unknown>;
71
86
  }
72
87
  /**
@@ -85,14 +100,24 @@ interface ParquetRollupPointer {
85
100
  parquetKey: string;
86
101
  rowCount: number;
87
102
  }
88
- declare function rollupKey(ctx: TenantCtx, id: string, builtAt: number): string;
89
- declare function rollupParquetKey(ctx: TenantCtx, id: string, builtAt: number): string;
103
+ declare function rollupKey(ctx: TenantCtx, id: string, builtAt: number, searchType?: SearchType): string;
104
+ declare function rollupParquetKey(ctx: TenantCtx, id: string, builtAt: number, searchType?: SearchType): string;
90
105
  interface RebuildRollupsOptions {
91
106
  engine: RollupEngine;
92
107
  dataSource: DataSource;
93
108
  ctx: TenantCtx;
94
109
  defs: readonly RollupDef[];
95
110
  now?: () => number;
111
+ /**
112
+ * Build rollups for a single GSC search-type slice. Threads into every
113
+ * builder's `engine.runSQL` call so the aggregated facts come from one
114
+ * cohort, and namespaces the output object keys under a `<searchType>/`
115
+ * segment so per-slice rollups coexist without overwriting each other.
116
+ * Undefined preserves the legacy cross-type behaviour (one rollup over
117
+ * the union of all slices, written to the legacy path) — fine for web-
118
+ * only tenants and explicit cross-type admin views.
119
+ */
120
+ searchType?: SearchType;
96
121
  }
97
122
  interface RebuildRollupResult {
98
123
  id: string;
package/dist/rollups.mjs CHANGED
@@ -1,25 +1,28 @@
1
1
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
2
2
  import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexKey } from "./entities.mjs";
3
3
  import { MS_PER_DAY } from "gscdump";
4
- function rollupPrefix(ctx) {
5
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
4
+ function rollupPrefix(ctx, searchType) {
5
+ const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
6
+ return searchType !== void 0 && searchType !== "web" ? `${base}/${searchType}` : base;
6
7
  }
7
- function rollupKey(ctx, id, builtAt) {
8
- return `${rollupPrefix(ctx)}/${id}__v${builtAt}.json`;
8
+ function rollupKey(ctx, id, builtAt, searchType) {
9
+ return `${rollupPrefix(ctx, searchType)}/${id}__v${builtAt}.json`;
9
10
  }
10
- function rollupParquetKey(ctx, id, builtAt) {
11
- return `${rollupPrefix(ctx)}/${id}__v${builtAt}.parquet`;
11
+ function rollupParquetKey(ctx, id, builtAt, searchType) {
12
+ return `${rollupPrefix(ctx, searchType)}/${id}__v${builtAt}.parquet`;
12
13
  }
13
14
  async function rebuildRollups(opts) {
14
15
  const now = opts.now ?? (() => Date.now());
15
16
  const results = [];
17
+ const searchType = opts.searchType;
16
18
  for (const def of opts.defs) {
17
19
  const builtAt = now();
18
20
  const payload = await def.build({
19
21
  engine: opts.engine,
20
22
  ctx: opts.ctx,
21
23
  dataSource: opts.dataSource,
22
- builtAt
24
+ builtAt,
25
+ ...searchType !== void 0 ? { searchType } : {}
23
26
  });
24
27
  if (def.format === "parquet") {
25
28
  if (!def.parquetColumns || def.parquetColumns.length === 0) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
@@ -28,7 +31,7 @@ async function rebuildRollups(opts) {
28
31
  columns: def.parquetColumns,
29
32
  sortKey: def.parquetSortKey
30
33
  });
31
- const parquetKey = rollupParquetKey(opts.ctx, def.id, builtAt);
34
+ const parquetKey = rollupParquetKey(opts.ctx, def.id, builtAt, searchType);
32
35
  await opts.dataSource.write(parquetKey, parquetBytes);
33
36
  const pointer = {
34
37
  parquetKey,
@@ -42,7 +45,7 @@ async function rebuildRollups(opts) {
42
45
  payload: pointer
43
46
  };
44
47
  const envelopeBytes = new TextEncoder().encode(JSON.stringify(envelope));
45
- const key = rollupKey(opts.ctx, def.id, builtAt);
48
+ const key = rollupKey(opts.ctx, def.id, builtAt, searchType);
46
49
  await opts.dataSource.write(key, envelopeBytes);
47
50
  results.push({
48
51
  id: def.id,
@@ -63,7 +66,7 @@ async function rebuildRollups(opts) {
63
66
  };
64
67
  const json = JSON.stringify(envelope);
65
68
  const bytes = new TextEncoder().encode(json);
66
- const key = rollupKey(opts.ctx, def.id, builtAt);
69
+ const key = rollupKey(opts.ctx, def.id, builtAt, searchType);
67
70
  await opts.dataSource.write(key, bytes);
68
71
  results.push({
69
72
  id: def.id,
@@ -81,7 +84,7 @@ function utcDateMinusDays(at, days) {
81
84
  const dailyTotalsRollup = {
82
85
  id: "daily_totals",
83
86
  windowDays: null,
84
- async build({ engine, ctx }) {
87
+ async build({ engine, ctx, searchType }) {
85
88
  const pages = await engine.runSQL({
86
89
  ctx,
87
90
  table: "pages",
@@ -95,7 +98,8 @@ const dailyTotalsRollup = {
95
98
  FROM read_parquet({{FILES}}, union_by_name = true)
96
99
  GROUP BY date
97
100
  ORDER BY date
98
- `
101
+ `,
102
+ ...searchType !== void 0 ? { searchType } : {}
99
103
  });
100
104
  const keywords = await engine.runSQL({
101
105
  ctx,
@@ -107,7 +111,8 @@ const dailyTotalsRollup = {
107
111
  SUM(impressions)::BIGINT AS impressions
108
112
  FROM read_parquet({{FILES}}, union_by_name = true)
109
113
  GROUP BY date
110
- `
114
+ `,
115
+ ...searchType !== void 0 ? { searchType } : {}
111
116
  });
112
117
  const keywordImpressionsByDate = /* @__PURE__ */ new Map();
113
118
  for (const r of keywords.rows) keywordImpressionsByDate.set(String(r.date), BigInt(r.impressions));
@@ -128,11 +133,12 @@ const dailyTotalsRollup = {
128
133
  const weeklyTotalsRollup = {
129
134
  id: "weekly_totals",
130
135
  windowDays: null,
131
- async build({ engine, ctx }) {
136
+ async build({ engine, ctx, searchType }) {
132
137
  return (await engine.runSQL({
133
138
  ctx,
134
139
  table: "pages",
135
140
  fileSets: { FILES: { table: "pages" } },
141
+ ...searchType !== void 0 ? { searchType } : {},
136
142
  sql: `
137
143
  SELECT
138
144
  strftime(date_trunc('week', date::DATE), '%Y-%m-%d') AS week,
@@ -154,12 +160,13 @@ const weeklyTotalsRollup = {
154
160
  const topPages28dRollup = {
155
161
  id: "top_pages_28d",
156
162
  windowDays: 28,
157
- async build({ engine, ctx, builtAt }) {
163
+ async build({ engine, ctx, builtAt, searchType }) {
158
164
  const cutoff = utcDateMinusDays(builtAt, 28);
159
165
  return (await engine.runSQL({
160
166
  ctx,
161
167
  table: "pages",
162
168
  fileSets: { FILES: { table: "pages" } },
169
+ ...searchType !== void 0 ? { searchType } : {},
163
170
  sql: `
164
171
  SELECT
165
172
  url,
@@ -183,12 +190,13 @@ const topPages28dRollup = {
183
190
  const topCountries28dRollup = {
184
191
  id: "top_countries_28d",
185
192
  windowDays: 28,
186
- async build({ engine, ctx, builtAt }) {
193
+ async build({ engine, ctx, builtAt, searchType }) {
187
194
  const cutoff = utcDateMinusDays(builtAt, 28);
188
195
  return (await engine.runSQL({
189
196
  ctx,
190
197
  table: "countries",
191
198
  fileSets: { FILES: { table: "countries" } },
199
+ ...searchType !== void 0 ? { searchType } : {},
192
200
  sql: `
193
201
  SELECT
194
202
  country,
@@ -212,12 +220,13 @@ const topCountries28dRollup = {
212
220
  const topKeywords28dRollup = {
213
221
  id: "top_keywords_28d",
214
222
  windowDays: 28,
215
- async build({ engine, ctx, builtAt }) {
223
+ async build({ engine, ctx, builtAt, searchType }) {
216
224
  const cutoff = utcDateMinusDays(builtAt, 28);
217
225
  return (await engine.runSQL({
218
226
  ctx,
219
227
  table: "keywords",
220
228
  fileSets: { FILES: { table: "keywords" } },
229
+ ...searchType !== void 0 ? { searchType } : {},
221
230
  sql: `
222
231
  SELECT
223
232
  query,
@@ -265,12 +274,13 @@ const topKeywords28dParquetRollup = {
265
274
  }
266
275
  ],
267
276
  parquetSortKey: ["clicks"],
268
- async build({ engine, ctx, builtAt }) {
277
+ async build({ engine, ctx, builtAt, searchType }) {
269
278
  const cutoff = utcDateMinusDays(builtAt, 28);
270
279
  return (await engine.runSQL({
271
280
  ctx,
272
281
  table: "keywords",
273
282
  fileSets: { FILES: { table: "keywords" } },
283
+ ...searchType !== void 0 ? { searchType } : {},
274
284
  sql: `
275
285
  SELECT
276
286
  query,
@@ -381,7 +391,7 @@ const indexingHealthRollup = {
381
391
  const indexPercentRollup = {
382
392
  id: "index_percent",
383
393
  windowDays: 90,
384
- async build({ engine, ctx, dataSource, builtAt }) {
394
+ async build({ engine, ctx, dataSource, builtAt, searchType }) {
385
395
  const urlsKey = sitemapUrlsIndexKey(ctx);
386
396
  if (!await dataSource.head?.(urlsKey)) return {
387
397
  totalSitemapUrls: 0,
@@ -398,6 +408,7 @@ const indexPercentRollup = {
398
408
  keys: [urlsKey]
399
409
  }
400
410
  },
411
+ ...searchType !== void 0 ? { searchType } : {},
401
412
  sql: `
402
413
  SELECT
403
414
  p.date AS date,
@@ -1,4 +1,4 @@
1
- import { M as TenantCtx, S as Row, T as StorageEngine } from "../_chunks/storage.mjs";
1
+ import { M as TenantCtx, S as Row, T as StorageEngine, w as SearchType$1 } from "../_chunks/storage.mjs";
2
2
  import { n as AnalysisResult, t as AnalysisParams } from "../_chunks/analysis-types.mjs";
3
3
  import { o as ResolverAdapter } from "../_chunks/types.mjs";
4
4
  import { C as ExecuteSqlOptions, E as SourceCapabilities, S as AnalysisSourceKind, T as QueryRow, t as AnalyzerRegistry, w as FileSet, x as AnalysisQuerySource } from "../_chunks/registry.mjs";
@@ -72,6 +72,14 @@ declare const ENGINE_QUERY_CAPABILITIES: PlannerCapabilities;
72
72
  interface EngineQuerySourceOptions {
73
73
  engine: StorageEngine;
74
74
  ctx: TenantCtx;
75
+ /**
76
+ * Restrict every manifest lookup the source performs to a single search-type
77
+ * slice. Threads into `engine.query` and `engine.runSQL` so the wrapped
78
+ * source returns rows from one cohort instead of unioning web + non-web
79
+ * parquet. Undefined preserves legacy cross-type behaviour for web-only
80
+ * tenants and admin paths.
81
+ */
82
+ searchType?: SearchType$1;
75
83
  }
76
84
  /**
77
85
  * Wraps a storage engine as an `AnalysisQuerySource` with SQL execution.
@@ -94,7 +94,7 @@ const ENGINE_SOURCE_CAPABILITIES = {
94
94
  adapter: true
95
95
  };
96
96
  function createEngineQuerySource(options) {
97
- const { engine, ctx } = options;
97
+ const { engine, ctx, searchType } = options;
98
98
  return {
99
99
  name: "engine",
100
100
  kind: "local",
@@ -104,7 +104,10 @@ function createEngineQuerySource(options) {
104
104
  const filterDims = getFilterDimensions(state.filter, isMetricDimension);
105
105
  assertDimensionsSupported([...state.dimensions, ...filterDims], "stored", "engine query source");
106
106
  if (state.dimensions.includes("queryCanonical") || filterDims.includes("queryCanonical")) throw new Error("engine query source does not support queryCanonical; use browser/sqlite query sources for derived dimensions");
107
- return coerceRows((await engine.query(ctx, state)).rows);
107
+ return coerceRows((await engine.query({
108
+ ...ctx,
109
+ ...searchType !== void 0 ? { searchType } : {}
110
+ }, state)).rows);
108
111
  },
109
112
  async executeSql(sql, params, opts) {
110
113
  const fileSets = opts?.fileSets;
@@ -114,7 +117,8 @@ function createEngineQuerySource(options) {
114
117
  table: fileSets.FILES.table,
115
118
  fileSets,
116
119
  sql,
117
- params: params ?? []
120
+ params: params ?? [],
121
+ ...searchType !== void 0 ? { searchType } : {}
118
122
  });
119
123
  return coerceRows(rows);
120
124
  }
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.15.0",
4
+ "version": "0.17.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "@gscdump/contracts": "0.15.0",
173
- "gscdump": "0.15.0"
172
+ "@gscdump/contracts": "0.17.0",
173
+ "gscdump": "0.17.0"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -178,8 +178,8 @@
178
178
  "aws4fetch": "^1.0.20",
179
179
  "hyparquet": "^1.25.8",
180
180
  "hyparquet-writer": "^0.15.1",
181
- "tsx": "^4.21.0",
182
- "vitest": "^4.1.5"
181
+ "tsx": "^4.22.1",
182
+ "vitest": "^4.1.6"
183
183
  },
184
184
  "scripts": {
185
185
  "dev": "obuild --stub",