npm - @gscdump/engine - Versions diffs - 0.4.0 → 0.6.2 - Mend

@gscdump/engine 0.4.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

package/README.md +20 -3
package/dist/_chunks/compiler.mjs +288 -0
package/dist/_chunks/duckdb.d.mts +26 -0
package/dist/_chunks/engine.mjs +578 -0
package/dist/_chunks/pg-adapter.mjs +676 -0
package/dist/_chunks/planner.d.mts +15 -0
package/dist/_chunks/schema.d.mts +1258 -0
package/dist/_chunks/schema.mjs +139 -0
package/dist/_chunks/storage.d.mts +476 -0
package/dist/_chunks/storage.mjs +39 -0
package/dist/_chunks/types.d.mts +53 -0
package/dist/adapters/duckdb-node.d.mts +1 -13
package/dist/adapters/duckdb-node.mjs +1 -7
package/dist/adapters/filesystem.d.mts +1 -193
package/dist/adapters/filesystem.mjs +2 -9
package/dist/adapters/http.d.mts +1 -193
package/dist/adapters/http.mjs +1 -5
package/dist/adapters/hyparquet.d.mts +6 -83
package/dist/adapters/hyparquet.mjs +1 -105
package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
package/dist/adapters/inspection-sqlite-node.mjs +1 -1
package/dist/adapters/node-harness.d.mts +3 -306
package/dist/adapters/node-harness.mjs +4 -1866
package/dist/adapters/r2-manifest.d.mts +4 -149
package/dist/adapters/r2-manifest.mjs +1 -8
package/dist/adapters/r2.d.mts +1 -47
package/dist/contracts.d.mts +1 -435
package/dist/entities.d.mts +1 -47
package/dist/index.d.mts +8 -1844
package/dist/index.mjs +8 -1962
package/dist/ingest.d.mts +1 -1
package/dist/planner.d.mts +3 -16
package/dist/planner.mjs +1 -320
package/dist/resolver/index.d.mts +3 -51
package/dist/resolver/index.mjs +2 -780
package/dist/rollups.d.mts +6 -51
package/dist/rollups.mjs +2 -209
package/dist/schema.d.mts +2 -1258
package/dist/schema.mjs +1 -138
package/package.json +5 -5

package/README.md CHANGED Viewed

@@ -1,8 +1,12 @@
-## @gscdump/engine
+# @gscdump/engine
-Append-only Parquet/DuckDB storage engine for the gscdump pipeline. Owns the storage runtime, planner, schema, and adapters that were previously bundled into `gscdump`.
+[![npm version](https://img.shields.io/npm/v/@gscdump/engine?color=yellow)](https://npmjs.com/package/@gscdump/engine)
+[![npm downloads](https://img.shields.io/npm/dm/@gscdump/engine?color=yellow)](https://npm.chart.dev/@gscdump/engine)
+[![license](https://img.shields.io/github/license/harlan-zw/gscdump?color=yellow)](https://github.com/harlan-zw/gscdump/blob/main/LICENSE)
-Edge consumers stay on [`gscdump`](../gscdump). Anything that needs to read/write Parquet, run the DuckDB executor, or attach a snapshot lives here.
+> Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.
+Owns the storage runtime, planner, schema, and adapters that were previously bundled into `gscdump`. Edge consumers stay on [`gscdump`](../gscdump); anything that needs to read/write Parquet, run the DuckDB executor, or attach a snapshot lives here.
 ## Install
@@ -26,11 +30,21 @@ Optional peers (install only what your runtime needs):
 | `@gscdump/engine/snapshot` | `SnapshotIndex` contract for hot/cold snapshot files. |
 | `@gscdump/engine/ingest` | GSC row → storage row helpers (`createRowAccumulator`, `transformGscRow`). |
 | `@gscdump/engine/sql` | SQL literal binding helpers (`bindLiterals`, `formatLiteral`). |
+| `@gscdump/engine/sql-fragments` | Reusable SQL fragments shared across analyzers. |
+| `@gscdump/engine/rollups` | Pre-aggregated rollup contracts + helpers. |
+| `@gscdump/engine/entities` | Entity helpers (sites, tenants, scope keys). |
+| `@gscdump/engine/resolver` | Dialect-neutral SQL composition: `ResolverAdapter`, `pgResolverAdapter`, `compilePg`/`compileSqlite`, `resolveToSQL`. |
+| `@gscdump/engine/scope` | Multi-tenant scope predicates. |
+| `@gscdump/engine/arrow` | Apache Arrow utilities for engine result conversion. |
 | `@gscdump/engine/node` | Node-only DuckDB handle. |
+| `@gscdump/engine/node-harness` | Node test harness for engine integration tests. |
 | `@gscdump/engine/filesystem` | Node-only `DataSource` + `ManifestStore` adapters. |
 | `@gscdump/engine/http` | Read-only HTTP `DataSource` (signed URLs, Range requests). |
 | `@gscdump/engine/hyparquet` | Pure-JS `ParquetCodec`. |
 | `@gscdump/engine/r2` | Cloudflare R2 `DataSource` (structurally typed against `R2Bucket`). |
+| `@gscdump/engine/r2-manifest` | R2-backed `ManifestStore` for hosted deployments. |
+| `@gscdump/engine/inspection-sqlite-node` | Node SQLite adapter for URL-inspection cache. |
+| `@gscdump/engine/inspection-sqlite-browser` | Browser (wa-sqlite) adapter for URL-inspection cache. |
 ## Stability
@@ -46,6 +60,9 @@ Optional peers (install only what your runtime needs):
 - [`gscdump`](../gscdump) — REST client + query builder (edge-safe peer dep).
 - [`@gscdump/analysis`](../analysis) — analyzers; consumes `StorageEngine` via `createEngine` factories.
+- [`@gscdump/engine-duckdb-node`](../engine-duckdb-node) — Node DuckDB analyzer adapter.
+- [`@gscdump/engine-wasm`](../engine-wasm) — DuckDB-WASM browser adapter.
+- [`@gscdump/engine-sqlite`](../engine-sqlite) — SQLite / D1 adapter.
 - [`@gscdump/cli`](../cli) — CLI wrapping engine + analysis.
 ## License

package/dist/_chunks/compiler.mjs ADDED Viewed

@@ -0,0 +1,288 @@
+import { i as dimensionToColumn, r as currentSchemaVersion } from "./schema.mjs";
+import { a as mondayOfWeek, c as quarterOfMonth, d as weekPartition, i as inferSearchType, l as quarterPartition, n as dayPartition, o as monthPartition, s as objectKey } from "./storage.mjs";
+import { METRIC_EXPR, escapeLike, topLevelPagePredicateSql } from "../sql-fragments.mjs";
+import { buildLogicalPlan } from "gscdump/query/plan";
+import { MS_PER_DAY } from "gscdump";
+// Partition-name patterns for the daily, weekly and monthly tiers; group 1 captures the date part.
+const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
+const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
+const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
+// Default cutoff, in days, for each compaction input tier (used as STAGES[].cutoffDays).
+const DEFAULT_THRESHOLDS = {
+	raw: 7,
+	d7: 30,
+	d30: 90
+};
+// Minimum cutoff in days: runStage never uses a cutoff window shorter than this.
+const PENDING_WINDOW_DAYS = 4;
+/**
+ * Compaction stages in execution order: raw → d7 → d30 → d90.
+ *
+ * Each stage provides:
+ *  - inputTier / outputTier: the manifest tier read from and the tier written to;
+ *  - cutoffDays: default age threshold in days (replaced per call in compactTieredImpl);
+ *  - bucketKey: maps a manifest entry to its bucket key, or undefined when the
+ *    entry's partition name does not match the pattern expected for the tier;
+ *  - bucketLatestMs: epoch-ms timestamp compared against the cutoff in runStage;
+ *  - outputPartition: maps a bucket key to the target partition name.
+ */
+const STAGES = [
+	{
+		inputTier: "raw",
+		outputTier: "d7",
+		cutoffDays: DEFAULT_THRESHOLDS.raw,
+		// Daily partitions are bucketed by the result of mondayOfWeek for their date.
+		bucketKey: (e) => {
+			const m = e.partition.match(DAILY_PARTITION_RE);
+			if (!m) return void 0;
+			return mondayOfWeek(m[1]);
+		},
+		// 00:00 UTC six days after the bucket date.
+		// NOTE(review): month/quarter stages use 23:59:59.999 of the last day instead — confirm the difference is intended.
+		bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
+		outputPartition: weekPartition
+	},
+	{
+		inputTier: "d7",
+		outputTier: "d30",
+		cutoffDays: DEFAULT_THRESHOLDS.d7,
+		// Weekly partitions are bucketed by the YYYY-MM prefix of their date.
+		bucketKey: (e) => {
+			const m = e.partition.match(WEEKLY_PARTITION_RE);
+			if (!m) return void 0;
+			return m[1].slice(0, 7);
+		},
+		bucketLatestMs: monthEndMs,
+		outputPartition: monthPartition
+	},
+	{
+		inputTier: "d30",
+		outputTier: "d90",
+		cutoffDays: DEFAULT_THRESHOLDS.d30,
+		// Monthly partitions are bucketed by the result of quarterOfMonth for their YYYY-MM.
+		bucketKey: (e) => {
+			const m = e.partition.match(MONTHLY_PARTITION_RE);
+			if (!m) return void 0;
+			return quarterOfMonth(m[1]);
+		},
+		bucketLatestMs: quarterEndMs,
+		outputPartition: quarterPartition
+	}
+];
+async function compactTieredImpl(deps, ctx, now, overrides = {}) {
+	const thresholds = {
+		...DEFAULT_THRESHOLDS,
+		...overrides
+	};
+	const stagesWithThresholds = STAGES.map((s) => ({
+		...s,
+		cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
+	}));
+	for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
+}
+/**
+ * Executes one compaction stage: groups the live manifest entries of the
+ * stage's input tier into buckets and rewrites each eligible bucket as a
+ * single object registered under the stage's output tier.
+ *
+ * @param deps - provides `manifestStore` (listLive, withLock, registerVersion),
+ *   `codec` (compactRows) and `dataSource`.
+ * @param ctx - supplies `userId`, `siteId` and `table`; also passed to objectKey.
+ * @param stage - one STAGES element carrying its effective `cutoffDays`.
+ * @param now - current time in epoch milliseconds; used for the cutoff, the
+ *   object key and the new entry's `createdAt`.
+ */
+async function runStage(deps, ctx, stage, now) {
+	// Only buckets whose latest timestamp is before this instant are compacted;
+	// the window is never shorter than PENDING_WINDOW_DAYS.
+	const cutoff = now - Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS) * MS_PER_DAY;
+	const candidates = await deps.manifestStore.listLive({
+		userId: ctx.userId,
+		siteId: ctx.siteId,
+		table: ctx.table,
+		tier: stage.inputTier
+	});
+	const buckets = /* @__PURE__ */ new Map();
+	for (const entry of candidates) {
+		const key = stage.bucketKey(entry);
+		// Partition name does not match this tier's pattern.
+		if (!key) continue;
+		// Bucket is still too recent.
+		if (stage.bucketLatestMs(key) >= cutoff) continue;
+		// One bucket per (search type, bucket key); NUL separates the two parts.
+		const compositeKey = `${inferSearchType(entry)}\0${key}`;
+		if (!buckets.has(compositeKey)) buckets.set(compositeKey, []);
+		buckets.get(compositeKey).push(entry);
+	}
+	for (const [compositeKey, entries] of buckets) {
+		const [searchType, bucket] = compositeKey.split("\0");
+		const targetPartition = stage.outputPartition(bucket);
+		// A lone entry that already sits in the target partition needs no merge.
+		if (entries.length === 1 && entries[0].partition === targetPartition) continue;
+		await deps.manifestStore.withLock({
+			userId: ctx.userId,
+			siteId: ctx.siteId,
+			table: ctx.table,
+			partition: targetPartition
+		}, async () => {
+			const key = objectKey(ctx, ctx.table, targetPartition, now, searchType);
+			const { bytes, rowCount } = await deps.codec.compactRows({ table: ctx.table }, entries.map((e) => e.objectKey), key, deps.dataSource);
+			const newEntry = {
+				userId: ctx.userId,
+				siteId: ctx.siteId,
+				table: ctx.table,
+				partition: targetPartition,
+				objectKey: key,
+				rowCount,
+				bytes,
+				createdAt: now,
+				schemaVersion: currentSchemaVersion(ctx.table),
+				tier: stage.outputTier,
+				// searchType is only recorded when it is not "web".
+				...searchType !== "web" ? { searchType } : {}
+			};
+			// NOTE(review): the input entries are passed alongside the new one,
+			// presumably so the store can retire them — confirm against ManifestStore.
+			await deps.manifestStore.registerVersion(newEntry, entries);
+		});
+	}
+}
+function enumeratePartitions(startDate, endDate) {
+	const out = [];
+	const [sy, sm, sd] = startDate.split("-").map(Number);
+	const [ey, em, ed] = endDate.split("-").map(Number);
+	const start = Date.UTC(sy, sm - 1, sd);
+	const end = Date.UTC(ey, em - 1, ed);
+	if (end < start) return out;
+	const seenWeeks = /* @__PURE__ */ new Set();
+	const seenMonths = /* @__PURE__ */ new Set();
+	const seenQuarters = /* @__PURE__ */ new Set();
+	for (let t = start; t <= end; t += 864e5) {
+		const d = new Date(t);
+		const y = d.getUTCFullYear();
+		const m = String(d.getUTCMonth() + 1).padStart(2, "0");
+		const isoDay = `${y}-${m}-${String(d.getUTCDate()).padStart(2, "0")}`;
+		const isoMonth = `${y}-${m}`;
+		out.push(dayPartition(isoDay));
+		const monday = mondayOfWeek(isoDay);
+		if (!seenWeeks.has(monday)) {
+			seenWeeks.add(monday);
+			out.push(weekPartition(monday));
+		}
+		if (!seenMonths.has(isoMonth)) {
+			seenMonths.add(isoMonth);
+			out.push(monthPartition(isoMonth));
+		}
+		const quarter = quarterOfMonth(isoMonth);
+		if (!seenQuarters.has(quarter)) {
+			seenQuarters.add(quarter);
+			out.push(quarterPartition(quarter));
+		}
+	}
+	return out;
+}
+function monthEndMs(month) {
+	const [y, m] = month.split("-").map(Number);
+	return Date.UTC(y, m, 0, 23, 59, 59, 999);
+}
+function quarterEndMs(quarter) {
+	const [yStr, qStr] = quarter.split("-Q");
+	const y = Number(yStr);
+	const q = Number(qStr);
+	return Date.UTC(y, q * 3, 0, 23, 59, 59, 999);
+}
+// Token emitted into compiled SQL where the Parquet file list belongs (the read_parquet argument in compileLogicalQueryPlan).
+const FILES_PLACEHOLDER = "{{FILES}}";
+function buildDimensionWhere(filters, table) {
+	const clauses = [];
+	const params = [];
+	for (const filter of filters) {
+		const column = dimensionToColumn(filter.dimension, table);
+		switch (filter.operator) {
+			case "equals":
+				clauses.push(`${column} = ?`);
+				params.push(filter.expression);
+				break;
+			case "notEquals":
+				clauses.push(`${column} != ?`);
+				params.push(filter.expression);
+				break;
+			case "contains":
+				clauses.push(`${column} LIKE ? ESCAPE '\\'`);
+				params.push(`%${escapeLike(filter.expression)}%`);
+				break;
+			case "notContains":
+				clauses.push(`${column} NOT LIKE ? ESCAPE '\\'`);
+				params.push(`%${escapeLike(filter.expression)}%`);
+				break;
+			case "includingRegex":
+				clauses.push(`regexp_matches(${column}, ?)`);
+				params.push(filter.expression);
+				break;
+			case "excludingRegex":
+				clauses.push(`NOT regexp_matches(${column}, ?)`);
+				params.push(filter.expression);
+				break;
+		}
+	}
+	return {
+		clause: clauses.join(" AND "),
+		params
+	};
+}
+function buildTopLevelWhere(plan, table) {
+	if (!plan.specialFilters.topLevel) return "";
+	return topLevelPagePredicateSql(dimensionToColumn("page", table));
+}
+function buildHaving(filters) {
+	if (filters.length === 0) return {
+		clause: "",
+		params: []
+	};
+	const clauses = [];
+	const params = [];
+	for (const filter of filters) {
+		const expr = METRIC_EXPR[filter.metric];
+		switch (filter.operator) {
+			case "metricGte":
+				clauses.push(`${expr} >= ?`);
+				params.push(filter.expression);
+				break;
+			case "metricGt":
+				clauses.push(`${expr} > ?`);
+				params.push(filter.expression);
+				break;
+			case "metricLte":
+				clauses.push(`${expr} <= ?`);
+				params.push(filter.expression);
+				break;
+			case "metricLt":
+				clauses.push(`${expr} < ?`);
+				params.push(filter.expression);
+				break;
+			case "metricBetween":
+				clauses.push(`${expr} >= ? AND ${expr} <= ?`);
+				params.push(filter.expression, filter.expression2 ?? filter.expression);
+				break;
+		}
+	}
+	return {
+		clause: clauses.length > 0 ? `HAVING ${clauses.join(" AND ")}` : "",
+		params
+	};
+}
+/**
+ * Compiles a logical query plan into a parameterised SQL statement over
+ * `read_parquet`, together with the partitions covering the plan's date range.
+ *
+ * @param plan - logical plan; reads `dateRange`, `metrics`, `groupByDimensions`,
+ *   `dimensionFilters`, `specialFilters`, `metricFilters`, `hasDate`, `orderBy`,
+ *   `rowLimit` and `startRow`.
+ * @param table - table used to resolve dimension columns; defaults to `plan.dataset`.
+ * @returns `{ sql, params, partitions, table, filesPlaceholder }`. `sql` still
+ *   contains FILES_PLACEHOLDER where the file list goes; `params` holds the
+ *   WHERE values followed by the HAVING values, in `?` order.
+ */
+function compileLogicalQueryPlan(plan, table = plan.dataset) {
+	const partitions = enumeratePartitions(plan.dateRange.startDate, plan.dateRange.endDate);
+	const metricSelects = plan.metrics.map((metric) => `${METRIC_EXPR[metric]} AS ${metric}`);
+	// Alias the physical column back to the dimension name when they differ.
+	const dimSelects = plan.groupByDimensions.map((dimension) => {
+		const column = dimensionToColumn(dimension, table);
+		return column !== dimension ? `${column} AS ${dimension}` : dimension;
+	});
+	// The date range is always the first pair of bound parameters.
+	const whereClauses = ["date >= ?", "date <= ?"];
+	const whereParams = [plan.dateRange.startDate, plan.dateRange.endDate];
+	const dimWhere = buildDimensionWhere(plan.dimensionFilters, table);
+	if (dimWhere.clause) {
+		whereClauses.push(dimWhere.clause);
+		whereParams.push(...dimWhere.params);
+	}
+	const topLevelClause = buildTopLevelWhere(plan, table);
+	if (topLevelClause) whereClauses.push(topLevelClause);
+	const having = buildHaving(plan.metricFilters);
+	const groupByCols = [...plan.groupByDimensions.map((dimension) => dimensionToColumn(dimension, table)), ...plan.hasDate ? ["date"] : []];
+	const groupBy = groupByCols.length > 0 ? `GROUP BY ${groupByCols.join(", ")}` : "";
+	// NOTE(review): orderBy.column/dir, rowLimit and startRow are interpolated into
+	// the SQL text rather than bound — confirm buildLogicalPlan validates them.
+	// Defaults: ORDER BY clicks DESC, LIMIT 1000, no OFFSET.
+	const orderBy = plan.orderBy ? `ORDER BY ${plan.orderBy.column} ${plan.orderBy.dir.toUpperCase()}` : "ORDER BY clicks DESC";
+	const limit = `LIMIT ${plan.rowLimit ?? 1e3}`;
+	const offset = plan.startRow ? `OFFSET ${plan.startRow}` : "";
+	return {
+		// Empty fragments are dropped and all whitespace runs collapse to one space.
+		sql: [
+			`SELECT ${[
+				...dimSelects,
+				...plan.hasDate ? ["date"] : [],
+				...metricSelects
+			].join(", ")}`,
+			`FROM read_parquet(${FILES_PLACEHOLDER}, union_by_name = true)`,
+			`WHERE ${whereClauses.join(" AND ")}`,
+			groupBy,
+			having.clause,
+			orderBy,
+			limit,
+			offset
+		].filter(Boolean).join(" ").replace(/\s+/g, " ").trim(),
+		params: [...whereParams, ...having.params],
+		partitions,
+		table,
+		filesPlaceholder: FILES_PLACEHOLDER
+	};
+}
+function resolveToSQL(state, table) {
+	const plan = buildLogicalPlan(state, { regex: true });
+	return compileLogicalQueryPlan(plan, table ?? plan.dataset);
+}
+function fileList(keys) {
+	return keys.length === 0 ? "[]" : `[${keys.map((key) => `'${key.replace(/'/g, "''")}'`).join(", ")}]`;
+}
+function substituteNamedFiles(sql, sets) {
+	let out = sql;
+	for (const [name, keys] of Object.entries(sets)) out = out.replace(new RegExp(`\\{\\{${name}\\}\\}`, "g"), fileList(keys));
+	return out;
+}
+export { compactTieredImpl as a, substituteNamedFiles as i, compileLogicalQueryPlan as n, enumeratePartitions as o, resolveToSQL as r, FILES_PLACEHOLDER as t };

package/dist/_chunks/duckdb.d.mts ADDED Viewed

@@ -0,0 +1,26 @@
+import { N as TableName, S as QueryExecutor, h as ParquetCodec, w as Row } from "./storage.mjs";
+/**
+ * Operations on a DuckDB instance, as obtained from `DuckDBFactory.getDuckDB`.
+ */
+interface DuckDBHandle {
+  /** Runs `sql` with an optional `params` array and resolves to the result rows. */
+  query: (sql: string, params?: unknown[]) => Promise<Row[]>;
+  /** NOTE(review): presumably exposes `bytes` to DuckDB under the file name `name` — confirm in the adapter implementations. */
+  registerFileBuffer: (name: string, bytes: Uint8Array) => Promise<void>;
+  /** Resolves to the bytes of the file called `name` (see `makeTempPath` for the intended `COPY TO` flow). */
+  copyFileToBuffer: (name: string) => Promise<Uint8Array>;
+  /** NOTE(review): presumably removes the named files again — confirm in the adapter implementations. */
+  dropFiles: (names: string[]) => Promise<void>;
+  /**
+   * Returns a unique path suitable for `COPY TO '…'` + `copyFileToBuffer`.
+   * In Node this is an absolute path under `os.tmpdir()` so DuckDB doesn't
+   * litter the CWD; in browsers/Workers it's a plain virtual-FS name.
+   */
+  makeTempPath: (ext: string) => string;
+}
+/** Asynchronous provider of a `DuckDBHandle`; the argument type of `createDuckDBCodec` and `createDuckDBExecutor`. */
+interface DuckDBFactory {
+  /** Resolves to the DuckDB handle to use. */
+  getDuckDB: () => Promise<DuckDBHandle>;
+}
+/** Creates a `ParquetCodec` from a `DuckDBFactory`. */
+declare function createDuckDBCodec(factory: DuckDBFactory): ParquetCodec;
+/** Creates a `QueryExecutor` from a `DuckDBFactory`. */
+declare function createDuckDBExecutor(factory: DuckDBFactory): QueryExecutor;
+/**
+ * Canonical "empty-file" SELECT clause for a table. Codecs that need to
+ * emit a schema-correct empty Parquet can wrap this in:
+ *   `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET)`
+ * to satisfy the ParquetCodec empty-rows invariant.
+ *
+ * @param table - table whose schema the clause describes.
+ * @returns the clause to substitute for `<clause>` in the statement above.
+ */
+declare function canonicalEmptyParquetSchema(table: TableName): string;
+export { createDuckDBExecutor as a, createDuckDBCodec as i, DuckDBHandle as n, canonicalEmptyParquetSchema as r, DuckDBFactory as t };