@gscdump/engine 0.11.0 → 0.11.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -318,6 +318,7 @@ function createStorageEngine(opts) {
318
318
  opts.signal?.throwIfAborted();
319
319
  const entries = Object.entries(opts.fileSets);
320
320
  const perSet = await Promise.all(entries.map(async ([name, ref]) => {
321
+ if (ref.keys !== void 0) return [name, ref.keys];
321
322
  return [name, (await manifestStore.listLive({
322
323
  userId: opts.ctx.userId,
323
324
  siteId: opts.ctx.siteId,
@@ -375,6 +375,15 @@ interface QueryExecutor {
375
375
  interface FileSetRef {
376
376
  table: TableName;
377
377
  partitions?: string[];
378
+ /**
379
+ * Pre-resolved object keys, bypassing the manifest lookup. When provided,
380
+ * runSQL skips `manifestStore.listLive` for this entry and uses these keys
381
+ * directly. Use for entity-store sidecars (`entities/inspections/index.parquet`,
382
+ * `entities/sitemaps/urls/index.parquet`) which aren't registered in the
383
+ * analytics manifest. `table` is still required as the schema sentinel for
384
+ * the empty-fallback rewrite, but isn't consulted when `keys` is non-empty.
385
+ */
386
+ keys?: string[];
378
387
  }
379
388
  interface RunSQLOptions {
380
389
  ctx: TenantCtx;
package/dist/rollups.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
2
- import { createIndexingMetadataStore, createInspectionStore, createSitemapStore } from "./entities.mjs";
2
+ import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexKey } from "./entities.mjs";
3
3
  import { MS_PER_DAY } from "gscdump";
4
4
  function rollupPrefix(ctx) {
5
5
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -335,36 +335,35 @@ const indexingMetadataRollup = {
335
335
  };
336
336
  }
337
337
  };
338
- function sqlString(s) {
339
- return `'${s.replace(/'/g, "''")}'`;
340
- }
341
338
  const indexingHealthRollup = {
342
339
  id: "indexing_health",
343
340
  windowDays: 90,
344
341
  async build({ engine, ctx, dataSource, builtAt }) {
345
- const uri = createInspectionStore({ dataSource }).parquetUri(ctx);
346
- if (!uri) return { days: [] };
347
- const cutoff = utcDateMinusDays(builtAt, 90);
342
+ const key = inspectionParquetKey(ctx);
343
+ if (!await dataSource.head?.(key)) return { days: [] };
348
344
  const sql = `
349
345
  SELECT
350
- substr(inspectedAt, 1, 10) AS date,
346
+ substr(CAST(inspectedAt AS VARCHAR), 1, 10) AS date,
351
347
  COUNT(*)::BIGINT AS total_urls,
352
- SUM(CASE WHEN indexStatus = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
353
- SUM(CASE WHEN pageFetchState = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
354
- SUM(CASE WHEN pageFetchState = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
355
- SUM(CASE WHEN pageFetchState = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
356
- SUM(CASE WHEN mobileUsabilityVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
357
- SUM(CASE WHEN richResultsVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
358
- SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND userCanonical <> googleCanonical THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
359
- FROM read_parquet(${sqlString(uri)})
360
- WHERE substr(inspectedAt, 1, 10) >= '${cutoff}'
348
+ SUM(CASE WHEN CAST(indexStatus AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
349
+ SUM(CASE WHEN CAST(pageFetchState AS VARCHAR) = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
350
+ SUM(CASE WHEN CAST(pageFetchState AS VARCHAR) = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
351
+ SUM(CASE WHEN CAST(pageFetchState AS VARCHAR) = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
352
+ SUM(CASE WHEN CAST(mobileUsabilityVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
353
+ SUM(CASE WHEN CAST(richResultsVerdict AS VARCHAR) = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
354
+ SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND CAST(userCanonical AS VARCHAR) <> CAST(googleCanonical AS VARCHAR) THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
355
+ FROM read_parquet({{INSPECTIONS}}, union_by_name = true)
356
+ WHERE substr(CAST(inspectedAt AS VARCHAR), 1, 10) >= '${utcDateMinusDays(builtAt, 90)}'
361
357
  GROUP BY 1
362
358
  ORDER BY 1
363
359
  `;
364
360
  return { days: (await engine.runSQL({
365
361
  ctx,
366
362
  table: "pages",
367
- fileSets: {},
363
+ fileSets: { INSPECTIONS: {
364
+ table: "pages",
365
+ keys: [key]
366
+ } },
368
367
  sql
369
368
  })).rows.map((r) => ({
370
369
  date: String(r.date),
@@ -383,8 +382,8 @@ const indexPercentRollup = {
383
382
  id: "index_percent",
384
383
  windowDays: 90,
385
384
  async build({ engine, ctx, dataSource, builtAt }) {
386
- const urlsUri = createSitemapStore({ dataSource }).urlsParquetUri(ctx);
387
- if (!urlsUri) return {
385
+ const urlsKey = sitemapUrlsIndexKey(ctx);
386
+ if (!await dataSource.head?.(urlsKey)) return {
388
387
  totalSitemapUrls: 0,
389
388
  days: []
390
389
  };
@@ -392,13 +391,19 @@ const indexPercentRollup = {
392
391
  const numerator = await engine.runSQL({
393
392
  ctx,
394
393
  table: "pages",
395
- fileSets: { PAGES: { table: "pages" } },
394
+ fileSets: {
395
+ PAGES: { table: "pages" },
396
+ URLS: {
397
+ table: "pages",
398
+ keys: [urlsKey]
399
+ }
400
+ },
396
401
  sql: `
397
402
  SELECT
398
403
  p.date AS date,
399
404
  COUNT(DISTINCT p.url)::BIGINT AS clicked_urls
400
405
  FROM read_parquet({{PAGES}}, union_by_name = true) p
401
- INNER JOIN read_parquet(${sqlString(urlsUri)}) s
406
+ INNER JOIN read_parquet({{URLS}}, union_by_name = true) s
402
407
  ON s.loc = p.url AND s.removed_at IS NULL
403
408
  WHERE p.clicks > 0 AND p.date >= '${cutoff}'
404
409
  GROUP BY p.date
@@ -408,10 +413,13 @@ const indexPercentRollup = {
408
413
  const denom = await engine.runSQL({
409
414
  ctx,
410
415
  table: "pages",
411
- fileSets: {},
416
+ fileSets: { URLS: {
417
+ table: "pages",
418
+ keys: [urlsKey]
419
+ } },
412
420
  sql: `
413
421
  SELECT COUNT(*)::BIGINT AS total
414
- FROM read_parquet(${sqlString(urlsUri)})
422
+ FROM read_parquet({{URLS}}, union_by_name = true)
415
423
  WHERE removed_at IS NULL
416
424
  `
417
425
  });
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.11.0",
4
+ "version": "0.11.2",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,7 +169,7 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.11.0"
172
+ "gscdump": "0.11.2"
173
173
  },
174
174
  "devDependencies": {
175
175
  "@duckdb/duckdb-wasm": "^1.32.0",