@gscdump/engine 0.4.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/README.md +20 -3
  2. package/dist/_chunks/compiler.mjs +288 -0
  3. package/dist/_chunks/duckdb.d.mts +26 -0
  4. package/dist/_chunks/engine.mjs +578 -0
  5. package/dist/_chunks/pg-adapter.mjs +676 -0
  6. package/dist/_chunks/planner.d.mts +15 -0
  7. package/dist/_chunks/schema.d.mts +1258 -0
  8. package/dist/_chunks/schema.mjs +139 -0
  9. package/dist/_chunks/storage.d.mts +476 -0
  10. package/dist/_chunks/storage.mjs +39 -0
  11. package/dist/_chunks/types.d.mts +53 -0
  12. package/dist/adapters/duckdb-node.d.mts +1 -13
  13. package/dist/adapters/duckdb-node.mjs +1 -7
  14. package/dist/adapters/filesystem.d.mts +1 -193
  15. package/dist/adapters/filesystem.mjs +2 -9
  16. package/dist/adapters/http.d.mts +1 -193
  17. package/dist/adapters/http.mjs +1 -5
  18. package/dist/adapters/hyparquet.d.mts +6 -83
  19. package/dist/adapters/hyparquet.mjs +1 -105
  20. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  22. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  23. package/dist/adapters/node-harness.d.mts +3 -306
  24. package/dist/adapters/node-harness.mjs +4 -1866
  25. package/dist/adapters/r2-manifest.d.mts +4 -149
  26. package/dist/adapters/r2-manifest.mjs +1 -8
  27. package/dist/adapters/r2.d.mts +1 -47
  28. package/dist/contracts.d.mts +1 -435
  29. package/dist/entities.d.mts +1 -47
  30. package/dist/index.d.mts +8 -1844
  31. package/dist/index.mjs +8 -1962
  32. package/dist/ingest.d.mts +1 -1
  33. package/dist/planner.d.mts +3 -16
  34. package/dist/planner.mjs +1 -320
  35. package/dist/resolver/index.d.mts +3 -51
  36. package/dist/resolver/index.mjs +2 -780
  37. package/dist/rollups.d.mts +6 -51
  38. package/dist/rollups.mjs +2 -209
  39. package/dist/schema.d.mts +2 -1258
  40. package/dist/schema.mjs +1 -138
  41. package/package.json +5 -5
package/README.md CHANGED
@@ -1,8 +1,12 @@
1
- ## @gscdump/engine
1
+ # @gscdump/engine
2
2
 
3
- Append-only Parquet/DuckDB storage engine for the gscdump pipeline. Owns the storage runtime, planner, schema, and adapters that were previously bundled into `gscdump`.
3
+ [![npm version](https://img.shields.io/npm/v/@gscdump/engine?color=yellow)](https://npmjs.com/package/@gscdump/engine)
4
+ [![npm downloads](https://img.shields.io/npm/dm/@gscdump/engine?color=yellow)](https://npm.chart.dev/@gscdump/engine)
5
+ [![license](https://img.shields.io/github/license/harlan-zw/gscdump?color=yellow)](https://github.com/harlan-zw/gscdump/blob/main/LICENSE)
4
6
 
5
- Edge consumers stay on [`gscdump`](../gscdump). Anything that needs to read/write Parquet, run the DuckDB executor, or attach a snapshot lives here.
7
+ > Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.
8
+
9
+ Owns the storage runtime, planner, schema, and adapters that were previously bundled into `gscdump`. Edge consumers stay on [`gscdump`](../gscdump); anything that needs to read/write Parquet, run the DuckDB executor, or attach a snapshot lives here.
6
10
 
7
11
  ## Install
8
12
 
@@ -26,11 +30,21 @@ Optional peers (install only what your runtime needs):
26
30
  | `@gscdump/engine/snapshot` | `SnapshotIndex` contract for hot/cold snapshot files. |
27
31
  | `@gscdump/engine/ingest` | GSC row → storage row helpers (`createRowAccumulator`, `transformGscRow`). |
28
32
  | `@gscdump/engine/sql` | SQL literal binding helpers (`bindLiterals`, `formatLiteral`). |
33
+ | `@gscdump/engine/sql-fragments` | Reusable SQL fragments shared across analyzers. |
34
+ | `@gscdump/engine/rollups` | Pre-aggregated rollup contracts + helpers. |
35
+ | `@gscdump/engine/entities` | Entity helpers (sites, tenants, scope keys). |
36
+ | `@gscdump/engine/resolver` | Dialect-neutral SQL composition: `ResolverAdapter`, `pgResolverAdapter`, `compilePg`/`compileSqlite`, `resolveToSQL`. |
37
+ | `@gscdump/engine/scope` | Multi-tenant scope predicates. |
38
+ | `@gscdump/engine/arrow` | Apache Arrow utilities for engine result conversion. |
29
39
  | `@gscdump/engine/node` | Node-only DuckDB handle. |
40
+ | `@gscdump/engine/node-harness` | Node test harness for engine integration tests. |
30
41
  | `@gscdump/engine/filesystem` | Node-only `DataSource` + `ManifestStore` adapters. |
31
42
  | `@gscdump/engine/http` | Read-only HTTP `DataSource` (signed URLs, Range requests). |
32
43
  | `@gscdump/engine/hyparquet` | Pure-JS `ParquetCodec`. |
33
44
  | `@gscdump/engine/r2` | Cloudflare R2 `DataSource` (structurally typed against `R2Bucket`). |
45
+ | `@gscdump/engine/r2-manifest` | R2-backed `ManifestStore` for hosted deployments. |
46
+ | `@gscdump/engine/inspection-sqlite-node` | Node SQLite adapter for URL-inspection cache. |
47
+ | `@gscdump/engine/inspection-sqlite-browser` | Browser (wa-sqlite) adapter for URL-inspection cache. |
34
48
 
35
49
  ## Stability
36
50
 
@@ -46,6 +60,9 @@ Optional peers (install only what your runtime needs):
46
60
 
47
61
  - [`gscdump`](../gscdump) — REST client + query builder (edge-safe peer dep).
48
62
  - [`@gscdump/analysis`](../analysis) — analyzers; consumes `StorageEngine` via `createEngine` factories.
63
+ - [`@gscdump/engine-duckdb-node`](../engine-duckdb-node) — Node DuckDB analyzer adapter.
64
+ - [`@gscdump/engine-wasm`](../engine-wasm) — DuckDB-WASM browser adapter.
65
+ - [`@gscdump/engine-sqlite`](../engine-sqlite) — SQLite / D1 adapter.
49
66
  - [`@gscdump/cli`](../cli) — CLI wrapping engine + analysis.
50
67
 
51
68
  ## License
@@ -0,0 +1,288 @@
1
+ import { i as dimensionToColumn, r as currentSchemaVersion } from "./schema.mjs";
2
+ import { a as mondayOfWeek, c as quarterOfMonth, d as weekPartition, i as inferSearchType, l as quarterPartition, n as dayPartition, o as monthPartition, s as objectKey } from "./storage.mjs";
3
+ import { METRIC_EXPR, escapeLike, topLevelPagePredicateSql } from "../sql-fragments.mjs";
4
+ import { buildLogicalPlan } from "gscdump/query/plan";
5
+ import { MS_PER_DAY } from "gscdump";
6
// Partition-name patterns, one per storage tier. The single capture group is
// the date portion that follows the tier prefix.
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;

// Default age thresholds in days, keyed by the tier being compacted away.
const DEFAULT_THRESHOLDS = { raw: 7, d7: 30, d30: 90 };

// Lower bound, in days, that runStage applies to every stage's cutoff.
const PENDING_WINDOW_DAYS = 4;

/**
 * Returns the captured date portion of `entry.partition` when it matches
 * `pattern`, otherwise `undefined`.
 */
function partitionDate(entry, pattern) {
  const match = entry.partition.match(pattern);
  return match ? match[1] : undefined;
}

/**
 * Compaction stages in execution order: raw → d7 → d30 → d90.
 *
 * Each stage provides:
 * - `bucketKey(entry)`: the bucket an input entry belongs to, or `undefined`
 *   when the entry's partition name does not match the stage's input tier.
 * - `bucketLatestMs(bucket)`: a timestamp used to decide whether the bucket is
 *   old enough to compact.
 * - `outputPartition(bucket)`: the partition name the compacted file is
 *   written under.
 *
 * `cutoffDays` here is only the default; compactTieredImpl replaces it per run.
 */
const STAGES = [
  {
    inputTier: "raw",
    outputTier: "d7",
    cutoffDays: DEFAULT_THRESHOLDS.raw,
    // Daily partitions are bucketed by the Monday of their week.
    bucketKey: (entry) => {
      const day = partitionDate(entry, DAILY_PARTITION_RE);
      return day === undefined ? undefined : mondayOfWeek(day);
    },
    // Start of the sixth day after the Monday (the Sunday of that week).
    bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
    outputPartition: weekPartition
  },
  {
    inputTier: "d7",
    outputTier: "d30",
    cutoffDays: DEFAULT_THRESHOLDS.d7,
    // Weekly partitions are bucketed by the YYYY-MM prefix of their date.
    bucketKey: (entry) => {
      const weekStart = partitionDate(entry, WEEKLY_PARTITION_RE);
      return weekStart === undefined ? undefined : weekStart.slice(0, 7);
    },
    bucketLatestMs: monthEndMs,
    outputPartition: monthPartition
  },
  {
    inputTier: "d30",
    outputTier: "d90",
    cutoffDays: DEFAULT_THRESHOLDS.d30,
    // Monthly partitions are bucketed by quarter.
    bucketKey: (entry) => {
      const month = partitionDate(entry, MONTHLY_PARTITION_RE);
      return month === undefined ? undefined : quarterOfMonth(month);
    },
    bucketLatestMs: quarterEndMs,
    outputPartition: quarterPartition
  }
];
53
/**
 * Runs every compaction stage in order (raw → d7 → d30 → d90), awaiting each
 * stage before starting the next.
 *
 * @param deps Collaborators forwarded unchanged to each stage.
 * @param ctx Compaction context forwarded unchanged to each stage.
 * @param now Reference time in epoch milliseconds.
 * @param overrides Optional per-tier cutoffs in days (`raw`, `d7`, `d30`).
 *   Keys whose value is `undefined` are ignored so the default for that tier
 *   is kept.
 */
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
  const thresholds = { ...DEFAULT_THRESHOLDS };
  // An explicit `undefined` must not clobber a default: it would turn the
  // stage cutoff into NaN, which disables the "too recent" guard and lets
  // every bucket be compacted regardless of age.
  for (const [tier, days] of Object.entries(overrides ?? {})) {
    if (days !== undefined) thresholds[tier] = days;
  }
  for (const stage of STAGES) {
    // Thresholds are keyed by the tier being compacted away (the input tier).
    const cutoffDays = thresholds[stage.inputTier];
    await runStage(deps, ctx, { ...stage, cutoffDays }, now);
  }
}
64
/**
 * Compacts one tier into the next for a single user/site/table.
 *
 * Live entries of `stage.inputTier` are grouped by search type and bucket.
 * A bucket is only compacted when `stage.bucketLatestMs(bucket)` is strictly
 * older than the cutoff, which is `now` minus the larger of `stage.cutoffDays`
 * and `PENDING_WINDOW_DAYS`. Each group is merged into a single object under
 * the stage's output partition and registered in the manifest.
 *
 * @param deps Provides `manifestStore`, `codec` and `dataSource`.
 * @param ctx Provides `userId`, `siteId` and `table`.
 * @param stage One element of the stage list, with its cutoff resolved.
 * @param now Reference time in epoch milliseconds.
 */
async function runStage(deps, ctx, stage, now) {
  const effectiveDays = Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS);
  const cutoff = now - effectiveDays * MS_PER_DAY;

  const liveEntries = await deps.manifestStore.listLive({
    userId: ctx.userId,
    siteId: ctx.siteId,
    table: ctx.table,
    tier: stage.inputTier
  });

  // Group by "<searchType>\0<bucket>"; NUL keeps the two halves separable.
  const groups = new Map();
  for (const entry of liveEntries) {
    const bucket = stage.bucketKey(entry);
    if (!bucket || stage.bucketLatestMs(bucket) >= cutoff) continue;
    const groupKey = `${inferSearchType(entry)}\0${bucket}`;
    const members = groups.get(groupKey);
    if (members) members.push(entry);
    else groups.set(groupKey, [entry]);
  }

  for (const [groupKey, members] of groups) {
    const [searchType, bucket] = groupKey.split("\0");
    const targetPartition = stage.outputPartition(bucket);

    // A lone entry that already sits in the target partition needs no work.
    const alreadyInPlace = members.length === 1 && members[0].partition === targetPartition;
    if (alreadyInPlace) continue;

    const lockScope = {
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table,
      partition: targetPartition
    };
    await deps.manifestStore.withLock(lockScope, async () => {
      const key = objectKey(ctx, ctx.table, targetPartition, now, searchType);
      const { bytes, rowCount } = await deps.codec.compactRows(
        { table: ctx.table },
        members.map((member) => member.objectKey),
        key,
        deps.dataSource
      );
      const compacted = {
        userId: ctx.userId,
        siteId: ctx.siteId,
        table: ctx.table,
        partition: targetPartition,
        objectKey: key,
        rowCount,
        bytes,
        createdAt: now,
        schemaVersion: currentSchemaVersion(ctx.table),
        tier: stage.outputTier,
        // "web" is left implicit; other search types are recorded on the entry.
        ...(searchType !== "web" ? { searchType } : {})
      };
      // The new entry is registered together with the entries it was built from.
      await deps.manifestStore.registerVersion(compacted, members);
    });
  }
}
110
/**
 * Lists every partition name that can hold data for the inclusive date range.
 *
 * For each UTC day the daily partition is emitted, followed by the weekly,
 * monthly and quarterly partitions the first time that week, month or quarter
 * is encountered.
 *
 * @param startDate First day, `YYYY-MM-DD`.
 * @param endDate Last day, `YYYY-MM-DD`.
 * @returns Partition names in encounter order; empty when `endDate` precedes
 *   `startDate`.
 */
function enumeratePartitions(startDate, endDate) {
  const toUtcMidnight = (iso) => {
    const [year, month, day] = iso.split("-").map(Number);
    return Date.UTC(year, month - 1, day);
  };
  const pad2 = (value) => String(value).padStart(2, "0");

  const partitions = [];
  const start = toUtcMidnight(startDate);
  const end = toUtcMidnight(endDate);
  if (end < start) return partitions;

  const seenWeeks = new Set();
  const seenMonths = new Set();
  const seenQuarters = new Set();
  // Emits the coarse partition only the first time its bucket shows up.
  const emitOnce = (seen, bucket, toPartition) => {
    if (seen.has(bucket)) return;
    seen.add(bucket);
    partitions.push(toPartition(bucket));
  };

  // Step one day (86,400,000 ms) at a time, inclusive of the end date.
  for (let t = start; t <= end; t += 864e5) {
    const date = new Date(t);
    const isoMonth = `${date.getUTCFullYear()}-${pad2(date.getUTCMonth() + 1)}`;
    const isoDay = `${isoMonth}-${pad2(date.getUTCDate())}`;

    partitions.push(dayPartition(isoDay));
    emitOnce(seenWeeks, mondayOfWeek(isoDay), weekPartition);
    emitOnce(seenMonths, isoMonth, monthPartition);
    emitOnce(seenQuarters, quarterOfMonth(isoMonth), quarterPartition);
  }
  return partitions;
}
144
/**
 * Epoch milliseconds of the last millisecond (23:59:59.999 UTC) of a month.
 *
 * @param month Month as `YYYY-MM`.
 * @returns UTC timestamp in milliseconds; `NaN` when the input does not parse.
 */
function monthEndMs(month) {
  const [year, monthNumber] = month.split("-").map((part) => Number(part));
  // `monthNumber` is 1-based, so as a 0-based index it names the following
  // month; day 0 of that month is the last day of the requested one.
  return Date.UTC(year, monthNumber, 0, 23, 59, 59, 999);
}
148
/**
 * Epoch milliseconds of the last millisecond (23:59:59.999 UTC) of a quarter.
 *
 * @param quarter Quarter as `YYYY-Qn`.
 * @returns UTC timestamp in milliseconds; `NaN` when the input does not parse.
 */
function quarterEndMs(quarter) {
  const [yearText, quarterText] = quarter.split("-Q");
  // Quarter n ends with month 3n (1-based); used as a 0-based index that is
  // the month after, and day 0 of it is the quarter's final day.
  const monthAfterQuarter = Number(quarterText) * 3;
  return Date.UTC(Number(yearText), monthAfterQuarter, 0, 23, 59, 59, 999);
}
154
// Token emitted in compiled SQL where the Parquet file list belongs; it is
// passed as the first argument of read_parquet(...) and returned to callers
// as `filesPlaceholder`.
const FILES_PLACEHOLDER = "{{FILES}}";
155
// Maps each supported dimension-filter operator to a builder that returns the
// SQL fragment and the value bound to its `?` placeholder.
const DIMENSION_FILTER_BUILDERS = new Map([
  ["equals", (column, value) => [`${column} = ?`, value]],
  ["notEquals", (column, value) => [`${column} != ?`, value]],
  ["contains", (column, value) => [`${column} LIKE ? ESCAPE '\\'`, `%${escapeLike(value)}%`]],
  ["notContains", (column, value) => [`${column} NOT LIKE ? ESCAPE '\\'`, `%${escapeLike(value)}%`]],
  ["includingRegex", (column, value) => [`regexp_matches(${column}, ?)`, value]],
  ["excludingRegex", (column, value) => [`NOT regexp_matches(${column}, ?)`, value]]
]);

/**
 * Builds the dimension part of a WHERE clause.
 *
 * @param filters Dimension filters with `dimension`, `operator`, `expression`.
 * @param table Table used to resolve each dimension to a column.
 * @returns `clause` (fragments joined with AND, without the WHERE keyword;
 *   empty when nothing applies) and `params` in placeholder order. Filters
 *   with an operator not listed above contribute nothing.
 */
function buildDimensionWhere(filters, table) {
  const clauses = [];
  const params = [];
  for (const filter of filters) {
    // Resolved for every filter, including ones whose operator is skipped.
    const column = dimensionToColumn(filter.dimension, table);
    const build = DIMENSION_FILTER_BUILDERS.get(filter.operator);
    if (build === undefined) continue;
    const [sql, bound] = build(column, filter.expression);
    clauses.push(sql);
    params.push(bound);
  }
  return { clause: clauses.join(" AND "), params };
}
192
/**
 * Returns the top-level-page predicate for the table's page column when the
 * plan's `specialFilters.topLevel` flag is set, otherwise an empty string.
 */
function buildTopLevelWhere(plan, table) {
  return plan.specialFilters.topLevel
    ? topLevelPagePredicateSql(dimensionToColumn("page", table))
    : "";
}
196
/**
 * Builds the HAVING clause for metric filters.
 *
 * @param filters Metric filters with `metric`, `operator`, `expression` and,
 *   for `metricBetween`, an optional `expression2`.
 * @returns `clause` (including the HAVING keyword, or empty when no filter
 *   applies) and `params` in placeholder order.
 */
function buildHaving(filters) {
  const comparisonSymbols = new Map([
    ["metricGte", ">="],
    ["metricGt", ">"],
    ["metricLte", "<="],
    ["metricLt", "<"]
  ]);
  const clauses = [];
  const params = [];

  for (const filter of filters) {
    const expr = METRIC_EXPR[filter.metric];
    if (filter.operator === "metricBetween") {
      clauses.push(`${expr} >= ? AND ${expr} <= ?`);
      // A missing upper bound falls back to the lower bound.
      params.push(filter.expression, filter.expression2 ?? filter.expression);
      continue;
    }
    const symbol = comparisonSymbols.get(filter.operator);
    if (symbol === undefined) continue; // other operators contribute nothing
    clauses.push(`${expr} ${symbol} ?`);
    params.push(filter.expression);
  }

  const clause = clauses.length > 0 ? `HAVING ${clauses.join(" AND ")}` : "";
  return { clause, params };
}
233
/**
 * Compiles a logical query plan into a parameterised SQL statement over
 * `read_parquet`.
 *
 * @param plan Logical plan (date range, metrics, dimensions, filters,
 *   ordering, paging).
 * @param table Table whose columns are used; defaults to `plan.dataset`.
 * @returns The SQL text (with the file list left as `filesPlaceholder`), the
 *   bound `params` (WHERE values first, then HAVING values), the candidate
 *   `partitions` for the date range, the `table`, and the placeholder token.
 */
function compileLogicalQueryPlan(plan, table = plan.dataset) {
  const { startDate, endDate } = plan.dateRange;
  const partitions = enumeratePartitions(startDate, endDate);
  const dateColumns = plan.hasDate ? ["date"] : [];

  // SELECT list: dimensions (aliased when the column name differs), the date
  // column if requested, then metrics.
  const selectList = [
    ...plan.groupByDimensions.map((dimension) => {
      const column = dimensionToColumn(dimension, table);
      return column === dimension ? dimension : `${column} AS ${dimension}`;
    }),
    ...dateColumns,
    ...plan.metrics.map((metric) => `${METRIC_EXPR[metric]} AS ${metric}`)
  ];

  // WHERE: the date range is always present; other predicates are optional.
  const conditions = ["date >= ?", "date <= ?"];
  const whereParams = [startDate, endDate];
  const dimensionWhere = buildDimensionWhere(plan.dimensionFilters, table);
  if (dimensionWhere.clause) {
    conditions.push(dimensionWhere.clause);
    whereParams.push(...dimensionWhere.params);
  }
  const topLevelCondition = buildTopLevelWhere(plan, table);
  if (topLevelCondition) conditions.push(topLevelCondition);

  const having = buildHaving(plan.metricFilters);

  const groupColumns = [
    ...plan.groupByDimensions.map((dimension) => dimensionToColumn(dimension, table)),
    ...dateColumns
  ];

  // The ORDER BY column and the LIMIT/OFFSET values are interpolated into the
  // SQL text rather than bound as parameters.
  const segments = [
    `SELECT ${selectList.join(", ")}`,
    `FROM read_parquet(${FILES_PLACEHOLDER}, union_by_name = true)`,
    `WHERE ${conditions.join(" AND ")}`,
    groupColumns.length > 0 ? `GROUP BY ${groupColumns.join(", ")}` : "",
    having.clause,
    plan.orderBy
      ? `ORDER BY ${plan.orderBy.column} ${plan.orderBy.dir.toUpperCase()}`
      : "ORDER BY clicks DESC",
    `LIMIT ${plan.rowLimit ?? 1e3}`,
    plan.startRow ? `OFFSET ${plan.startRow}` : ""
  ];

  return {
    // Empty segments are dropped and runs of whitespace collapsed.
    sql: segments.filter(Boolean).join(" ").replace(/\s+/g, " ").trim(),
    params: [...whereParams, ...having.params],
    partitions,
    table,
    filesPlaceholder: FILES_PLACEHOLDER
  };
}
276
/**
 * Builds a logical plan from builder `state` (with regex support enabled) and
 * compiles it to SQL.
 *
 * @param state Query-builder state passed to `buildLogicalPlan`.
 * @param table Optional table override; `null`/`undefined` means the plan's
 *   own dataset.
 */
function resolveToSQL(state, table) {
  const logicalPlan = buildLogicalPlan(state, { regex: true });
  const targetTable = table ?? logicalPlan.dataset;
  return compileLogicalQueryPlan(logicalPlan, targetTable);
}
280
/**
 * Renders object keys as a SQL list literal of single-quoted strings, e.g.
 * `['a', 'b']`. Single quotes inside a key are doubled. An empty input
 * renders as `[]`.
 */
function fileList(keys) {
  if (keys.length === 0) return "[]";
  const quoted = keys.map((key) => `'${key.replace(/'/g, "''")}'`);
  return `[${quoted.join(", ")}]`;
}

/**
 * Replaces every `{{NAME}}` placeholder in `sql` with the file-list literal
 * for `sets[NAME]`.
 *
 * All names are substituted in a single pass, so text inserted for one set is
 * never re-scanned for another set's placeholder. Names are matched
 * literally, and inserted text is never interpreted as a replacement pattern
 * (keys may safely contain `$&`, `$1`, and similar).
 *
 * @param sql SQL text containing zero or more `{{NAME}}` placeholders.
 * @param sets Map of placeholder name to object keys.
 * @returns The SQL with known placeholders expanded; unknown ones are left
 *   untouched.
 */
function substituteNamedFiles(sql, sets) {
  const names = Object.keys(sets);
  if (names.length === 0) return sql;

  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const placeholderPattern = new RegExp(
    names.map((name) => `\\{\\{${escapeRegExp(name)}\\}\\}`).join("|"),
    "g"
  );
  // A function replacer returns its text verbatim; a string replacement would
  // expand `$`-sequences found in object keys.
  return sql.replace(placeholderPattern, (token) => fileList(sets[token.slice(2, -2)]));
}
288
+ export { compactTieredImpl as a, substituteNamedFiles as i, compileLogicalQueryPlan as n, enumeratePartitions as o, resolveToSQL as r, FILES_PLACEHOLDER as t };
@@ -0,0 +1,26 @@
1
+ import { N as TableName, S as QueryExecutor, h as ParquetCodec, w as Row } from "./storage.mjs";
2
/**
 * Minimal handle over a DuckDB instance: run SQL and move file contents in
 * and out by name.
 */
interface DuckDBHandle {
  /** Runs `sql` with optional `params` and resolves to the result rows. */
  query: (sql: string, params?: unknown[]) => Promise<Row[]>;
  /** Registers `bytes` under `name`. NOTE(review): presumably makes the buffer readable by later queries; confirm against the adapters. */
  registerFileBuffer: (name: string, bytes: Uint8Array) => Promise<void>;
  /** Resolves to the bytes of the file called `name`. */
  copyFileToBuffer: (name: string) => Promise<Uint8Array>;
  /** Drops the files called `names`. NOTE(review): behavior for unknown names is not visible here. */
  dropFiles: (names: string[]) => Promise<void>;
  /**
   * Returns a unique path suitable for `COPY TO '…'` + `copyFileToBuffer`.
   * In Node this is an absolute path under `os.tmpdir()` so DuckDB doesn't
   * litter the CWD; in browsers/Workers it's a plain virtual-FS name.
   */
  makeTempPath: (ext: string) => string;
}
14
/** Asynchronous provider of a `DuckDBHandle`. */
interface DuckDBFactory {
  /** Resolves to a DuckDB handle. NOTE(review): whether the handle is shared or created per call is not visible here. */
  getDuckDB: () => Promise<DuckDBHandle>;
}
17
/** Creates a `ParquetCodec` from a DuckDB factory. */
declare function createDuckDBCodec(factory: DuckDBFactory): ParquetCodec;
18
/** Creates a `QueryExecutor` from a DuckDB factory. */
declare function createDuckDBExecutor(factory: DuckDBFactory): QueryExecutor;
19
/**
 * Canonical "empty-file" SELECT clause for a table. Codecs that need to
 * emit a schema-correct empty Parquet file can wrap it in
 * `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET)`
 * to satisfy the ParquetCodec empty-rows invariant.
 *
 * @param table Table whose schema the clause describes.
 * @returns The clause as SQL text.
 */
declare function canonicalEmptyParquetSchema(table: TableName): string;
26
+ export { createDuckDBExecutor as a, createDuckDBCodec as i, DuckDBHandle as n, canonicalEmptyParquetSchema as r, DuckDBFactory as t };