npm - @gscdump/engine - Versions diffs - 0.6.3 → 0.7.2 - Mend

@gscdump/engine 0.6.3 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +1 -1
package/dist/_chunks/analysis-types.d.mts +47 -0
package/dist/_chunks/contracts.d.mts +1 -0
package/dist/_chunks/dispatch.mjs +75 -0
package/dist/_chunks/registry.d.mts +92 -0
package/dist/_chunks/resolver.mjs +91 -0
package/dist/_chunks/source-types.d.mts +31 -0
package/dist/analysis-types.d.mts +2 -0
package/dist/analysis-types.mjs +7 -0
package/dist/analyzer/index.d.mts +59 -0
package/dist/analyzer/index.mjs +104 -0
package/dist/contracts.d.mts +1 -1
package/dist/period/index.d.mts +57 -0
package/dist/period/index.mjs +150 -0
package/dist/resolver/index.d.mts +1 -27
package/dist/resolver/index.mjs +1 -89
package/dist/scope.d.mts +3 -3
package/dist/source/index.d.mts +78 -0
package/dist/source/index.mjs +113 -0
package/package.json +64 -27
package/dist/rollups.d.mts +0 -162
package/dist/rollups.mjs +0 -346

package/dist/rollups.d.mts DELETED

@@ -1,162 +0,0 @@
-import { N as TableName$1, a as DataSource, w as Row$1 } from "./_chunks/storage.mjs";
-import { t as ColumnDef } from "./_chunks/schema.mjs";
-import { TenantCtx } from "gscdump/contracts";
-interface RollupCtx extends TenantCtx {
-  /** When the rollup was built. Stamped into payload + filename. */
-  builtAt: number;
-}
-/**
- * Tenant-scoped engine surface a rollup builder needs. Subset of
- * `StorageEngine.runSQL` so rollups stay testable without a full engine.
- */
-interface RollupEngine {
-  runSQL: (opts: {
-    ctx: TenantCtx;
-    fileSets: Record<string, {
-      table: TableName$1;
-      partitions?: string[];
-    }>;
-    table?: TableName$1;
-    sql: string;
-    params?: unknown[];
-  }) => Promise<{
-    rows: Row$1[];
-  }>;
-}
-/**
- * One rollup definition. Build runs SQL over the tenant's facts and/or reads
- * from entity stores via `dataSource`, returning a JSON-serializable payload
- * that the runner timestamps + writes.
- */
-interface RollupDef {
-  id: string;
-  /**
-   * Window in days the rollup covers. `null` means full history. Used by
-   * the runner to populate `windowDays` in the payload metadata so readers
-   * can validate freshness.
-   */
-  windowDays: number | null;
-  /**
-   * Storage format. `'json'` (default) wraps the build payload in a
-   * `RollupEnvelope` and writes as a JSON blob. `'parquet'` expects `build`
-   * to return rows matching `parquetColumns` and writes a parquet file plus
-   * a tiny JSON sidecar envelope that points at it, so metadata
-   * (`builtAt` / `windowDays`) stays readable without decoding parquet.
-   */
-  format?: 'json' | 'parquet';
-  /**
-   * Column schema for parquet output. Required when `format === 'parquet'`.
-   * Types map the same way as the fact-table encoder: VARCHAR / DATE go
-   * through BYTE_ARRAY/UTF8; BIGINT → INT64; INTEGER → INT32; DOUBLE → DOUBLE.
-   */
-  parquetColumns?: readonly ColumnDef[];
-  /** Sort-key column names for parquet row-group stats. Optional. */
-  parquetSortKey?: readonly string[];
-  build: (deps: {
-    engine: RollupEngine;
-    ctx: TenantCtx;
-    /**
-     * Tenant-scoped object store. Rollups that aggregate over entity
-     * snapshots (e.g. indexing metadata) read JSON docs through this.
-     * Pure-SQL rollups can ignore it.
-     */
-    dataSource: DataSource;
-    /**
-     * Wall-clock millis when the runner started this rollup. Use for
-     * derived window cutoffs (e.g. trailing-28d boundary) so the SQL can
-     * inline a date literal and stay portable across DuckDB builds that
-     * don't bundle the ICU extension (Workers DuckDB, for one — CURRENT_DATE
-     * lives in ICU).
-     */
-    builtAt: number;
-  }) => Promise<unknown>;
-}
-/**
- * Wire shape persisted to R2/disk. Readers can rely on the `version` + `builtAt`.
- * Parquet rollups write this envelope as a sidecar whose `payload` points at
- * the co-located `.parquet` object via `{ parquetKey, rowCount }`.
- */
-interface RollupEnvelope<T = unknown> {
-  version: 1;
-  id: string;
-  builtAt: number;
-  windowDays: number | null;
-  payload: T;
-}
-interface ParquetRollupPointer {
-  parquetKey: string;
-  rowCount: number;
-}
-declare function rollupKey(ctx: TenantCtx, id: string, builtAt: number): string;
-declare function rollupParquetKey(ctx: TenantCtx, id: string, builtAt: number): string;
-interface RebuildRollupsOptions {
-  engine: RollupEngine;
-  dataSource: DataSource;
-  ctx: TenantCtx;
-  defs: readonly RollupDef[];
-  now?: () => number;
-}
-interface RebuildRollupResult {
-  id: string;
-  /** JSON envelope key. For parquet rollups this is the sidecar pointer. */
-  objectKey: string;
-  /** Parquet payload key. Present only when `format === 'parquet'`. */
-  parquetKey?: string;
-  /** Envelope byte size; for parquet rollups does NOT include parquet bytes. */
-  bytes: number;
-  /** Parquet payload byte size when `format === 'parquet'`. */
-  parquetBytes?: number;
-  builtAt: number;
-}
-declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRollupResult[]>;
-/**
- * Daily totals across the full history. One row per (date, table) with
- * clicks + impressions + position. Powers sparklines and headline totals.
- *
- * Includes `anonymizedImpressionsPct` per day computed as
- *   1 - sum(query_grained_impressions) / sum(page_grained_impressions)
- * — surfaces GSC's anonymous-query gap so the dashboard can warn users not
- * to trust query-grained breakdowns as comprehensive.
- */
-declare const dailyTotalsRollup: RollupDef;
-/** Weekly totals, ISO week aligned. Cheap and stable for trend widgets. */
-declare const weeklyTotalsRollup: RollupDef;
-/**
- * Top 1000 pages by clicks over the trailing 28-day window. JSON for v1;
- * promote to parquet (`top_pages_28d.parquet`) when the dashboard needs
- * server-side WHERE filtering on this rollup.
- */
-declare const topPages28dRollup: RollupDef;
-/**
- * Top 250 countries by clicks over the trailing 28-day window. Countries
- * cardinality is bounded (~250 ISO codes), so the list fits in a tiny JSON
- * payload regardless of traffic shape. Powers a geo-overview widget without
- * spinning up DuckDB-WASM.
- */
-declare const topCountries28dRollup: RollupDef;
-/** Top 1000 keywords by clicks over the trailing 28-day window. */
-declare const topKeywords28dRollup: RollupDef;
-/**
- * Parquet-format companion to `topKeywords28dRollup`. Same shape, but persists
- * as a parquet object plus JSON sidecar pointer so widgets that need
- * server-side WHERE (filter by prefix, by clicks threshold, paginate) can scan
- * it directly with DuckDB-WASM instead of loading all 1000 rows into JS.
- *
- * Opt-in: include in the caller's rollup def list alongside (or instead of)
- * the JSON variant; the runner treats the two as independent ids so they can
- * coexist during a migration.
- */
-declare const topKeywords28dParquetRollup: RollupDef;
-/**
- * Aggregates the per-URL Indexing API metadata entity store (populated by
- * `gscdump entities indexing snapshot`) into daily counts of `URL_UPDATED`
- * and `URL_REMOVED` notifications. Covers the third entity-snapshot shape
- * without needing its own parquet family — publish events are sparse and
- * aggregate cleanly into a small JSON rollup.
- *
- * Safe no-op when the entity store is empty: returns `{ totals: {...}, days: [] }`
- * so downstream readers don't have to special-case first-run sites.
- */
-declare const indexingMetadataRollup: RollupDef;
-declare const DEFAULT_ROLLUPS: readonly RollupDef[];
-export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildRollupResult, RebuildRollupsOptions, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };

package/dist/rollups.mjs DELETED

@@ -1,346 +0,0 @@
-import { createIndexingMetadataStore } from "./entities.mjs";
-import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
-import { MS_PER_DAY } from "gscdump";
-function rollupPrefix(ctx) {
-	return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
-}
-function rollupKey(ctx, id, builtAt) {
-	return `${rollupPrefix(ctx)}/${id}__v${builtAt}.json`;
-}
-function rollupParquetKey(ctx, id, builtAt) {
-	return `${rollupPrefix(ctx)}/${id}__v${builtAt}.parquet`;
-}
-async function rebuildRollups(opts) {
-	const now = opts.now ?? (() => Date.now());
-	const results = [];
-	for (const def of opts.defs) {
-		const builtAt = now();
-		const payload = await def.build({
-			engine: opts.engine,
-			ctx: opts.ctx,
-			dataSource: opts.dataSource,
-			builtAt
-		});
-		if (def.format === "parquet") {
-			if (!def.parquetColumns || def.parquetColumns.length === 0) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
-			const rows = payload ?? [];
-			const parquetBytes = encodeRowsToParquetFlex(rows, {
-				columns: def.parquetColumns,
-				sortKey: def.parquetSortKey
-			});
-			const parquetKey = rollupParquetKey(opts.ctx, def.id, builtAt);
-			await opts.dataSource.write(parquetKey, parquetBytes);
-			const pointer = {
-				parquetKey,
-				rowCount: rows.length
-			};
-			const envelope = {
-				version: 1,
-				id: def.id,
-				builtAt,
-				windowDays: def.windowDays,
-				payload: pointer
-			};
-			const envelopeBytes = new TextEncoder().encode(JSON.stringify(envelope));
-			const key = rollupKey(opts.ctx, def.id, builtAt);
-			await opts.dataSource.write(key, envelopeBytes);
-			results.push({
-				id: def.id,
-				objectKey: key,
-				parquetKey,
-				bytes: envelopeBytes.byteLength,
-				parquetBytes: parquetBytes.byteLength,
-				builtAt
-			});
-			continue;
-		}
-		const envelope = {
-			version: 1,
-			id: def.id,
-			builtAt,
-			windowDays: def.windowDays,
-			payload
-		};
-		const json = JSON.stringify(envelope);
-		const bytes = new TextEncoder().encode(json);
-		const key = rollupKey(opts.ctx, def.id, builtAt);
-		await opts.dataSource.write(key, bytes);
-		results.push({
-			id: def.id,
-			objectKey: key,
-			bytes: bytes.byteLength,
-			builtAt
-		});
-	}
-	return results;
-}
-function utcDateMinusDays(at, days) {
-	const d = new Date(at - days * MS_PER_DAY);
-	return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
-}
-const dailyTotalsRollup = {
-	id: "daily_totals",
-	windowDays: null,
-	async build({ engine, ctx }) {
-		const pages = await engine.runSQL({
-			ctx,
-			table: "pages",
-			fileSets: { FILES: { table: "pages" } },
-			sql: `
-        SELECT
-          date,
-          SUM(clicks)::BIGINT AS clicks,
-          SUM(impressions)::BIGINT AS impressions,
-          SUM(sum_position)::DOUBLE AS sum_position
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        GROUP BY date
-        ORDER BY date
-      `
-		});
-		const keywords = await engine.runSQL({
-			ctx,
-			table: "keywords",
-			fileSets: { FILES: { table: "keywords" } },
-			sql: `
-        SELECT
-          date,
-          SUM(impressions)::BIGINT AS impressions
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        GROUP BY date
-      `
-		});
-		const keywordImpressionsByDate = /* @__PURE__ */ new Map();
-		for (const r of keywords.rows) keywordImpressionsByDate.set(String(r.date), BigInt(r.impressions));
-		return pages.rows.map((r) => {
-			const totalImpressions = BigInt(r.impressions);
-			const queryImpressions = keywordImpressionsByDate.get(String(r.date)) ?? BigInt(0);
-			const anonymized = totalImpressions === BigInt(0) ? 0 : 1 - Number(queryImpressions) / Number(totalImpressions);
-			return {
-				date: r.date,
-				clicks: Number(r.clicks),
-				impressions: Number(r.impressions),
-				sum_position: Number(r.sum_position),
-				anonymizedImpressionsPct: Math.max(0, Math.min(1, anonymized))
-			};
-		});
-	}
-};
-const weeklyTotalsRollup = {
-	id: "weekly_totals",
-	windowDays: null,
-	async build({ engine, ctx }) {
-		return (await engine.runSQL({
-			ctx,
-			table: "pages",
-			fileSets: { FILES: { table: "pages" } },
-			sql: `
-        SELECT
-          strftime(date_trunc('week', date::DATE), '%Y-%m-%d') AS week,
-          SUM(clicks)::BIGINT AS clicks,
-          SUM(impressions)::BIGINT AS impressions,
-          SUM(sum_position)::DOUBLE AS sum_position
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        GROUP BY 1
-        ORDER BY 1
-      `
-		})).rows.map((r) => ({
-			week: r.week,
-			clicks: Number(r.clicks),
-			impressions: Number(r.impressions),
-			sum_position: Number(r.sum_position)
-		}));
-	}
-};
-const topPages28dRollup = {
-	id: "top_pages_28d",
-	windowDays: 28,
-	async build({ engine, ctx, builtAt }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
-		return (await engine.runSQL({
-			ctx,
-			table: "pages",
-			fileSets: { FILES: { table: "pages" } },
-			sql: `
-        SELECT
-          url,
-          SUM(clicks)::BIGINT AS clicks,
-          SUM(impressions)::BIGINT AS impressions,
-          SUM(sum_position)::DOUBLE AS sum_position
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        WHERE date >= '${cutoff}'
-        GROUP BY url
-        ORDER BY clicks DESC
-        LIMIT 1000
-      `
-		})).rows.map((r) => ({
-			url: r.url,
-			clicks: Number(r.clicks),
-			impressions: Number(r.impressions),
-			sum_position: Number(r.sum_position)
-		}));
-	}
-};
-const topCountries28dRollup = {
-	id: "top_countries_28d",
-	windowDays: 28,
-	async build({ engine, ctx, builtAt }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
-		return (await engine.runSQL({
-			ctx,
-			table: "countries",
-			fileSets: { FILES: { table: "countries" } },
-			sql: `
-        SELECT
-          country,
-          SUM(clicks)::BIGINT AS clicks,
-          SUM(impressions)::BIGINT AS impressions,
-          SUM(sum_position)::DOUBLE AS sum_position
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        WHERE date >= '${cutoff}'
-        GROUP BY country
-        ORDER BY clicks DESC
-        LIMIT 250
-      `
-		})).rows.map((r) => ({
-			country: r.country,
-			clicks: Number(r.clicks),
-			impressions: Number(r.impressions),
-			sum_position: Number(r.sum_position)
-		}));
-	}
-};
-const topKeywords28dRollup = {
-	id: "top_keywords_28d",
-	windowDays: 28,
-	async build({ engine, ctx, builtAt }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
-		return (await engine.runSQL({
-			ctx,
-			table: "keywords",
-			fileSets: { FILES: { table: "keywords" } },
-			sql: `
-        SELECT
-          query,
-          SUM(clicks)::BIGINT AS clicks,
-          SUM(impressions)::BIGINT AS impressions,
-          SUM(sum_position)::DOUBLE AS sum_position
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        WHERE date >= '${cutoff}'
-        GROUP BY query
-        ORDER BY clicks DESC
-        LIMIT 1000
-      `
-		})).rows.map((r) => ({
-			query: r.query,
-			clicks: Number(r.clicks),
-			impressions: Number(r.impressions),
-			sum_position: Number(r.sum_position)
-		}));
-	}
-};
-const topKeywords28dParquetRollup = {
-	id: "top_keywords_28d_parquet",
-	windowDays: 28,
-	format: "parquet",
-	parquetColumns: [
-		{
-			name: "query",
-			type: "VARCHAR",
-			nullable: false
-		},
-		{
-			name: "clicks",
-			type: "BIGINT",
-			nullable: false
-		},
-		{
-			name: "impressions",
-			type: "BIGINT",
-			nullable: false
-		},
-		{
-			name: "sum_position",
-			type: "DOUBLE",
-			nullable: false
-		}
-	],
-	parquetSortKey: ["clicks"],
-	async build({ engine, ctx, builtAt }) {
-		const cutoff = utcDateMinusDays(builtAt, 28);
-		return (await engine.runSQL({
-			ctx,
-			table: "keywords",
-			fileSets: { FILES: { table: "keywords" } },
-			sql: `
-        SELECT
-          query,
-          SUM(clicks)::BIGINT AS clicks,
-          SUM(impressions)::BIGINT AS impressions,
-          SUM(sum_position)::DOUBLE AS sum_position
-        FROM read_parquet({{FILES}}, union_by_name = true)
-        WHERE date >= '${cutoff}'
-        GROUP BY query
-        ORDER BY clicks DESC
-        LIMIT 1000
-      `
-		})).rows.map((r) => ({
-			query: String(r.query),
-			clicks: BigInt(r.clicks),
-			impressions: BigInt(r.impressions),
-			sum_position: Number(r.sum_position)
-		}));
-	}
-};
-const indexingMetadataRollup = {
-	id: "indexing_metadata",
-	windowDays: null,
-	async build({ dataSource, ctx }) {
-		const index = await createIndexingMetadataStore({ dataSource }).loadIndex(ctx);
-		const records = Object.values(index.records);
-		const updatesByDay = /* @__PURE__ */ new Map();
-		const removesByDay = /* @__PURE__ */ new Map();
-		let totalUpdates = 0;
-		let totalRemoves = 0;
-		let latestUpdate;
-		let latestRemove;
-		for (const r of records) {
-			if (r.latestUpdateAt) {
-				totalUpdates++;
-				const day = r.latestUpdateAt.slice(0, 10);
-				updatesByDay.set(day, (updatesByDay.get(day) ?? 0) + 1);
-				if (!latestUpdate || r.latestUpdateAt > latestUpdate) latestUpdate = r.latestUpdateAt;
-			}
-			if (r.latestRemoveAt) {
-				totalRemoves++;
-				const day = r.latestRemoveAt.slice(0, 10);
-				removesByDay.set(day, (removesByDay.get(day) ?? 0) + 1);
-				if (!latestRemove || r.latestRemoveAt > latestRemove) latestRemove = r.latestRemoveAt;
-			}
-		}
-		const days = new Set([...updatesByDay.keys(), ...removesByDay.keys()]);
-		const perDay = Array.from(days).sort().map((day) => ({
-			day,
-			updates: updatesByDay.get(day) ?? 0,
-			removes: removesByDay.get(day) ?? 0
-		}));
-		return {
-			totals: {
-				urls: records.length,
-				updates: totalUpdates,
-				removes: totalRemoves,
-				latestUpdateAt: latestUpdate ?? null,
-				latestRemoveAt: latestRemove ?? null
-			},
-			days: perDay
-		};
-	}
-};
-const DEFAULT_ROLLUPS = [
-	dailyTotalsRollup,
-	weeklyTotalsRollup,
-	topPages28dRollup,
-	topKeywords28dRollup,
-	topCountries28dRollup,
-	indexingMetadataRollup
-];
-export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };