@gscdump/engine 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +53 -0
  3. package/dist/adapters/duckdb-node.d.mts +19 -0
  4. package/dist/adapters/duckdb-node.mjs +78 -0
  5. package/dist/adapters/filesystem.d.mts +206 -0
  6. package/dist/adapters/filesystem.mjs +320 -0
  7. package/dist/adapters/http.d.mts +227 -0
  8. package/dist/adapters/http.mjs +119 -0
  9. package/dist/adapters/hyparquet.d.mts +107 -0
  10. package/dist/adapters/hyparquet.mjs +250 -0
  11. package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
  12. package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
  13. package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
  14. package/dist/adapters/inspection-sqlite-node.mjs +32 -0
  15. package/dist/adapters/node-harness.d.mts +334 -0
  16. package/dist/adapters/node-harness.mjs +1907 -0
  17. package/dist/adapters/r2-manifest.d.mts +227 -0
  18. package/dist/adapters/r2-manifest.mjs +355 -0
  19. package/dist/adapters/r2.d.mts +93 -0
  20. package/dist/adapters/r2.mjs +65 -0
  21. package/dist/arrow-utils.d.mts +14 -0
  22. package/dist/arrow-utils.mjs +8 -0
  23. package/dist/contracts.d.mts +436 -0
  24. package/dist/contracts.mjs +1 -0
  25. package/dist/entities.d.mts +238 -0
  26. package/dist/entities.mjs +359 -0
  27. package/dist/index.d.mts +1849 -0
  28. package/dist/index.mjs +1976 -0
  29. package/dist/ingest.d.mts +96 -0
  30. package/dist/ingest.mjs +187 -0
  31. package/dist/planner.d.mts +16 -0
  32. package/dist/planner.mjs +321 -0
  33. package/dist/resolver/index.d.mts +207 -0
  34. package/dist/resolver/index.mjs +869 -0
  35. package/dist/rollups.d.mts +207 -0
  36. package/dist/rollups.mjs +553 -0
  37. package/dist/schema.d.mts +1258 -0
  38. package/dist/schema.mjs +139 -0
  39. package/dist/scope.d.mts +38 -0
  40. package/dist/scope.mjs +28 -0
  41. package/dist/snapshot.d.mts +14 -0
  42. package/dist/snapshot.mjs +1 -0
  43. package/dist/sql-bind.d.mts +19 -0
  44. package/dist/sql-bind.mjs +92 -0
  45. package/dist/sql-fragments.d.mts +21 -0
  46. package/dist/sql-fragments.mjs +13 -0
  47. package/package.json +168 -0
@@ -0,0 +1,1907 @@
1
+ import { createRequire } from "node:module";
2
+ import path, { dirname, join, resolve } from "node:path";
3
+ import { encodeSiteId } from "gscdump/tenant";
4
+ import { buildLogicalComparisonPlan, buildLogicalPlan } from "gscdump/query/plan";
5
+ import { MS_PER_DAY, toIsoDate } from "gscdump";
6
+ import { PgDialect, date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
7
+ import { normalizeUrl } from "gscdump/normalize";
8
+ import { sql } from "drizzle-orm";
9
+ import { SQLiteAsyncDialect } from "drizzle-orm/sqlite-core";
10
+ import { unlinkSync } from "node:fs";
11
+ import { tmpdir } from "node:os";
12
+ import process from "node:process";
13
+ import { fileURLToPath } from "node:url";
14
+ import { ConsoleLogger, NODE_RUNTIME, VoidLogger, createDuckDB } from "@duckdb/duckdb-wasm/dist/duckdb-node-blocking.cjs";
15
+ import { Buffer } from "node:buffer";
16
+ import { randomBytes } from "node:crypto";
17
+ import { mkdir, readFile, readdir, rename, rm, stat, unlink, writeFile } from "node:fs/promises";
18
+ import { lock } from "proper-lockfile";
19
/**
 * Creates the three metric columns (`clicks`, `impressions`, `sum_position`).
 * Every call builds new column builders; all three are declared NOT NULL.
 *
 * @returns {{clicks: object, impressions: object, sum_position: object}} Column builders keyed by column name.
 */
function metricCols() {
  const clicks = integer("clicks").notNull();
  const impressions = integer("impressions").notNull();
  const sum_position = doublePrecision("sum_position").notNull();
  return { clicks, impressions, sum_position };
}
26
/** Builds the NOT NULL `date` column that every table carries. */
const dateCol = () => date("date").notNull();

/**
 * Drizzle pg-core definitions of the six tables.
 * Column order inside each table: the table's own dimension column(s), then `date`,
 * then the metric columns produced by `metricCols()`.
 */
const drizzleSchema = (() => {
  // NOT NULL text dimension column.
  const required = (name) => varchar(name).notNull();
  // Appends the shared date + metric columns after the table-specific dimension columns.
  const defineTable = (name, dimensionColumns) => pgTable(name, {
    ...dimensionColumns,
    date: dateCol(),
    ...metricCols()
  });
  return {
    pages: defineTable("pages", { url: required("url") }),
    keywords: defineTable("keywords", {
      query: required("query"),
      query_canonical: varchar("query_canonical")
    }),
    countries: defineTable("countries", { country: required("country") }),
    devices: defineTable("devices", { device: required("device") }),
    page_keywords: defineTable("page_keywords", {
      url: required("url"),
      query: required("query"),
      query_canonical: varchar("query_canonical")
    }),
    search_appearance: defineTable("search_appearance", {
      searchAppearance: required("searchAppearance")
    })
  };
})();
62
/**
 * Per-table metadata merged into each table schema by `tableSchemaFrom`:
 * `sortKey` lists the sort columns (always starting with `date`) and `version`
 * is the table's schema version number.
 */
const TABLE_METADATA = (() => {
  const entry = (version, ...dimensionKeys) => ({
    sortKey: ["date", ...dimensionKeys],
    version
  });
  return {
    pages: entry(1, "url"),
    keywords: entry(2, "query"),
    countries: entry(1, "country"),
    devices: entry(1, "device"),
    page_keywords: entry(2, "url", "query"),
    search_appearance: entry(1, "searchAppearance")
  };
})();
92
/**
 * Maps a Postgres SQL type name (case-insensitive) to one of the engine column types
 * `VARCHAR`, `DATE`, `DOUBLE`, `BIGINT` or `INTEGER`.
 *
 * @param {string} sqlType Postgres type, e.g. `varchar(255)` or `double precision`.
 * @returns {string} The engine column type.
 * @throws {Error} When the type matches none of the known families.
 */
function pgSqlTypeToColumnType(sqlType) {
  const normalized = sqlType.toLowerCase();
  const startsWithAny = (...prefixes) => prefixes.some((prefix) => normalized.startsWith(prefix));
  const isAnyOf = (...names) => names.includes(normalized);

  if (startsWithAny("varchar", "char") || isAnyOf("text")) return "VARCHAR";
  // Timestamps are mapped to DATE as well.
  if (isAnyOf("date") || startsWithAny("timestamp")) return "DATE";
  if (startsWithAny("double", "numeric", "decimal") || isAnyOf("real")) return "DOUBLE";
  if (isAnyOf("bigint", "int8")) return "BIGINT";
  if (isAnyOf("integer", "int", "int4", "smallint", "int2")) return "INTEGER";
  throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
}
101
/**
 * Derives the engine schema of one table from its drizzle definition and `TABLE_METADATA`.
 *
 * @param {string} tableName Key of `drizzleSchema` / `TABLE_METADATA`.
 * @returns {{name: string, columns: Array<{name: string, type: string, nullable: boolean}>, sortKey: string[], version: number}}
 */
function tableSchemaFrom(tableName) {
  const config = getTableConfig(drizzleSchema[tableName]);
  const columns = config.columns.map((column) => {
    const type = pgSqlTypeToColumnType(column.getSQLType());
    return { name: column.name, type, nullable: !column.notNull };
  });
  const { sortKey, version } = TABLE_METADATA[tableName];
  return { name: tableName, columns, sortKey, version };
}
115
/** Engine schema of every table, keyed by table name and computed once at module load. */
const SCHEMAS = {};
for (const tableName of [
  "pages",
  "keywords",
  "countries",
  "devices",
  "page_keywords",
  "search_appearance"
]) {
  SCHEMAS[tableName] = tableSchemaFrom(tableName);
}
123
/**
 * Looks up the schema version recorded for `table` in `SCHEMAS`.
 *
 * @param {string} table Table name; must be a key of `SCHEMAS` (otherwise this throws a TypeError).
 * @returns {number} The table's schema version.
 */
function currentSchemaVersion(table) {
  const { version } = SCHEMAS[table];
  return version;
}
126
/**
 * Translates a query dimension into the stored column name.
 * `page` maps to `url`, `queryCanonical` to `query_canonical`; anything else is returned unchanged.
 *
 * @param {string} dim Dimension name.
 * @param {string} [_table] Unused; kept for signature compatibility.
 * @returns {string} Column name.
 */
function dimensionToColumn(dim, _table) {
  switch (dim) {
    case "page":
      return "url";
    case "queryCanonical":
      return "query_canonical";
    default:
      return dim;
  }
}
131
/**
 * Returns the entry's search type, falling back to `"web"` when it is null or undefined.
 *
 * @param {{searchType?: string}} entry Manifest entry.
 * @returns {string} Search type.
 */
function inferSearchType(entry) {
  const { searchType } = entry;
  return searchType ?? "web";
}
134
/**
 * Returns the entry's tier. When `tier` is not set it is inferred from the partition prefix:
 * `daily/` means `raw` and `monthly/` means `d30`; any other prefix yields `undefined`.
 *
 * @param {{tier?: string, partition: string}} entry Manifest entry.
 * @returns {string|undefined} Tier name, or `undefined` when it cannot be inferred.
 */
function inferLegacyTier(entry) {
  if (entry.tier !== void 0) return entry.tier;
  const legacyTiers = [
    ["daily/", "raw"],
    ["monthly/", "d30"]
  ];
  const hit = legacyTiers.find(([prefix]) => entry.partition.startsWith(prefix));
  return hit ? hit[1] : void 0;
}
139
/**
 * Names the daily partition for a date.
 *
 * @param {string} date ISO day (`YYYY-MM-DD`).
 * @returns {string} `daily/<date>`.
 */
function dayPartition(date) {
  const tierPrefix = "daily";
  return `${tierPrefix}/${date}`;
}
142
/**
 * Names the monthly partition for a month.
 *
 * @param {string} month Month as `YYYY-MM`.
 * @returns {string} `monthly/<month>`.
 */
function monthPartition(month) {
  const tierPrefix = "monthly";
  return `${tierPrefix}/${month}`;
}
145
/**
 * Names the weekly partition for a week, identified by its Monday.
 *
 * @param {string} mondayIsoDate ISO date of the week's Monday.
 * @returns {string} `weekly/<mondayIsoDate>`.
 */
function weekPartition(mondayIsoDate) {
  const tierPrefix = "weekly";
  return `${tierPrefix}/${mondayIsoDate}`;
}
148
/**
 * Names the quarterly partition for a quarter.
 *
 * @param {string} quarter Quarter id, e.g. `2024-Q1`.
 * @returns {string} `quarterly/<quarter>`.
 */
function quarterPartition(quarter) {
  const tierPrefix = "quarterly";
  return `${tierPrefix}/${quarter}`;
}
151
/**
 * Returns the Monday (UTC) of the week that contains `isoDate`.
 *
 * @param {string} isoDate ISO day (`YYYY-MM-DD`), interpreted at 00:00 UTC.
 * @returns {string} The Monday, formatted by `toIsoDate`.
 */
function mondayOfWeek(isoDate) {
  const midnightUtc = Date.parse(`${isoDate}T00:00:00Z`);
  const weekday = new Date(midnightUtc).getUTCDay();
  // getUTCDay() is 0 for Sunday, which belongs to the week that began six days earlier.
  const daysSinceMonday = weekday === 0 ? 6 : weekday - 1;
  return toIsoDate(new Date(midnightUtc - daysSinceMonday * MS_PER_DAY));
}
157
/**
 * Converts a month id into its quarter id.
 *
 * @param {string} month Month as `YYYY-MM`.
 * @returns {string} Quarter as `YYYY-Qn`.
 */
function quarterOfMonth(month) {
  const [year, monthNumber] = month.split("-").map(Number);
  const zeroBasedQuarter = Math.floor((monthNumber - 1) / 3);
  return `${year}-Q${zeroBasedQuarter + 1}`;
}
161
/**
 * Builds the storage key of one parquet object:
 * `u_<userId>/[<siteId>/]<table>/[<searchType>/]<partition>__v<version>.parquet`.
 * The site segment is omitted when `ctx.siteId` is falsy; the search-type segment is
 * omitted when `searchType` is undefined or `"web"`.
 *
 * @param {{userId: string, siteId?: string}} ctx Tenant context.
 * @param {string} table Table name.
 * @param {string} partition Partition name, e.g. `daily/2024-01-01`.
 * @param {number|string} version Version number written after `__v`.
 * @param {string} [searchType] Search type.
 * @returns {string} Object key.
 */
function objectKey(ctx, table, partition, version, searchType) {
  const tableRoot = ctx.siteId
    ? `u_${ctx.userId}/${ctx.siteId}/${table}`
    : `u_${ctx.userId}/${table}`;
  const isDefaultSearchType = searchType === void 0 || searchType === "web";
  const searchSegment = isDefaultSearchType ? "" : `${searchType}/`;
  return `${tableRoot}/${searchSegment}${partition}__v${version}.parquet`;
}
164
/**
 * Returns the key prefix under which a tenant's objects live: `u_<userId>/<siteId>/`,
 * or `u_<userId>/` when `ctx.siteId` is falsy.
 *
 * @param {{userId: string, siteId?: string}} ctx Tenant context.
 * @returns {string} Prefix ending in `/`.
 */
function tenantPrefix(ctx) {
  const { siteId } = ctx;
  const userRoot = `u_${ctx.userId}`;
  return siteId ? `${userRoot}/${siteId}/` : `${userRoot}/`;
}
167
/** Matches `daily/YYYY-MM-DD`; group 1 is the date. */
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
/** Matches `weekly/YYYY-MM-DD`; group 1 is the date. */
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
/** Matches `monthly/YYYY-MM`; group 1 is the month. */
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;

/** Default age thresholds in days, keyed by the input tier of each compaction stage. */
const DEFAULT_THRESHOLDS = { raw: 7, d7: 30, d30: 90 };

/** Lower bound, in days, that `runStage` applies to every stage's cutoff. */
const PENDING_WINDOW_DAYS = 4;

/**
 * The compaction stages in execution order (raw -> d7 -> d30 -> d90). Each stage has:
 * - `bucketKey(entry)`: the output bucket id for an input entry, or `undefined` when the
 *   entry's partition name does not match the stage's input pattern;
 * - `bucketLatestMs(bucket)`: a millisecond timestamp for the bucket that `runStage`
 *   compares against the cutoff;
 * - `outputPartition(bucket)`: the partition name of the compacted output.
 */
const STAGES = [
  {
    inputTier: "raw",
    outputTier: "d7",
    cutoffDays: DEFAULT_THRESHOLDS.raw,
    bucketKey: (e) => {
      const hit = e.partition.match(DAILY_PARTITION_RE);
      return hit ? mondayOfWeek(hit[1]) : void 0;
    },
    // Monday 00:00 UTC plus six days, i.e. the start of the week's Sunday.
    bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
    outputPartition: weekPartition
  },
  {
    inputTier: "d7",
    outputTier: "d30",
    cutoffDays: DEFAULT_THRESHOLDS.d7,
    bucketKey: (e) => {
      const hit = e.partition.match(WEEKLY_PARTITION_RE);
      // The first seven characters of the week's date are its `YYYY-MM` month.
      return hit ? hit[1].slice(0, 7) : void 0;
    },
    bucketLatestMs: monthEndMs,
    outputPartition: monthPartition
  },
  {
    inputTier: "d30",
    outputTier: "d90",
    cutoffDays: DEFAULT_THRESHOLDS.d30,
    bucketKey: (e) => {
      const hit = e.partition.match(MONTHLY_PARTITION_RE);
      return hit ? quarterOfMonth(hit[1]) : void 0;
    },
    bucketLatestMs: quarterEndMs,
    outputPartition: quarterPartition
  }
];
214
/**
 * Runs every compaction stage in order (raw -> d7 -> d30 -> d90) for one table.
 * Stages run sequentially because each stage consumes the previous stage's output tier.
 *
 * @param {object} deps Dependencies forwarded to `runStage`.
 * @param {object} ctx Tenant/table context forwarded to `runStage`.
 * @param {number} now Current time in epoch milliseconds.
 * @param {{raw?: number, d7?: number, d30?: number}} [overrides] Per-tier age thresholds in
 *   days. Keys whose value is `undefined` or `null` are ignored, so the default applies.
 * @returns {Promise<void>}
 */
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
  const thresholds = { ...DEFAULT_THRESHOLDS };
  // A plain spread would let `{ raw: undefined }` replace the default. The cutoff would then
  // be NaN and every `>= cutoff` guard in runStage would be false, compacting recent buckets too.
  for (const [tier, days] of Object.entries(overrides ?? {})) {
    if (days != null) thresholds[tier] = days;
  }
  const stagesWithThresholds = STAGES.map((s) => ({
    ...s,
    cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
  }));
  for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
}
225
/**
 * Runs one compaction stage: groups the live entries of the stage's input tier by
 * (search type, bucket), skips buckets whose `bucketLatestMs` is not older than the cutoff,
 * and compacts every remaining group into a single object of the output tier.
 *
 * @param {object} deps Provides `manifestStore` (`listLive`, `withLock`, `registerVersion`),
 *   `codec.compactRows` and `dataSource`.
 * @param {{userId: string, siteId?: string, table: string}} ctx Tenant/table context.
 * @param {object} stage One element of `STAGES`, with `cutoffDays` already resolved.
 * @param {number} now Current time in epoch milliseconds; also used as the object version
 *   and as `createdAt`.
 * @returns {Promise<void>}
 */
async function runStage(deps, ctx, stage, now) {
  // The cutoff never comes closer to `now` than PENDING_WINDOW_DAYS.
  const effectiveDays = Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS);
  const cutoff = now - effectiveDays * MS_PER_DAY;
  const liveInputs = await deps.manifestStore.listLive({
    userId: ctx.userId,
    siteId: ctx.siteId,
    table: ctx.table,
    tier: stage.inputTier
  });

  // Group key is "<searchType>\0<bucket>", so search types are never merged together.
  const grouped = new Map();
  for (const entry of liveInputs) {
    const bucket = stage.bucketKey(entry);
    if (!bucket || stage.bucketLatestMs(bucket) >= cutoff) continue;
    const groupKey = `${inferSearchType(entry)}\0${bucket}`;
    const members = grouped.get(groupKey);
    if (members) members.push(entry);
    else grouped.set(groupKey, [entry]);
  }

  for (const [groupKey, entries] of grouped) {
    const [searchType, bucket] = groupKey.split("\0");
    const targetPartition = stage.outputPartition(bucket);
    // A lone entry that already sits in the target partition needs no rewrite.
    const alreadyInPlace = entries.length === 1 && entries[0].partition === targetPartition;
    if (alreadyInPlace) continue;
    const lockScope = {
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table,
      partition: targetPartition
    };
    await deps.manifestStore.withLock(lockScope, async () => {
      const key = objectKey(ctx, ctx.table, targetPartition, now, searchType);
      const inputKeys = entries.map((e) => e.objectKey);
      const { bytes, rowCount } = await deps.codec.compactRows({ table: ctx.table }, inputKeys, key, deps.dataSource);
      const newEntry = {
        userId: ctx.userId,
        siteId: ctx.siteId,
        table: ctx.table,
        partition: targetPartition,
        objectKey: key,
        rowCount,
        bytes,
        createdAt: now,
        schemaVersion: currentSchemaVersion(ctx.table),
        tier: stage.outputTier
      };
      // Only non-web entries carry an explicit searchType.
      if (searchType !== "web") newEntry.searchType = searchType;
      await deps.manifestStore.registerVersion(newEntry, entries);
    });
  }
}
271
/**
 * Lists every partition name that can hold data for the inclusive date range: one daily
 * partition per day, plus each weekly, monthly and quarterly partition the first time a
 * day of the range falls into it. Partitions are emitted in day order.
 *
 * @param {string} startDate First day (`YYYY-MM-DD`).
 * @param {string} endDate Last day (`YYYY-MM-DD`), inclusive.
 * @returns {string[]} Partition names; empty when `endDate` precedes `startDate`.
 */
function enumeratePartitions(startDate, endDate) {
  const toUtcMs = (isoDate) => {
    const [year, month, day] = isoDate.split("-").map(Number);
    return Date.UTC(year, month - 1, day);
  };
  const partitions = [];
  const start = toUtcMs(startDate);
  const end = toUtcMs(endDate);
  if (end < start) return partitions;

  const seenWeeks = new Set();
  const seenMonths = new Set();
  const seenQuarters = new Set();
  // Emits the roll-up partition for `id` only the first time that id is encountered.
  const pushOnce = (seen, id, toPartition) => {
    if (seen.has(id)) return;
    seen.add(id);
    partitions.push(toPartition(id));
  };
  const pad2 = (value) => String(value).padStart(2, "0");

  for (let t = start; t <= end; t += 864e5) {
    const day = new Date(t);
    const isoMonth = `${day.getUTCFullYear()}-${pad2(day.getUTCMonth() + 1)}`;
    const isoDay = `${isoMonth}-${pad2(day.getUTCDate())}`;
    partitions.push(dayPartition(isoDay));
    pushOnce(seenWeeks, mondayOfWeek(isoDay), weekPartition);
    pushOnce(seenMonths, isoMonth, monthPartition);
    pushOnce(seenQuarters, quarterOfMonth(isoMonth), quarterPartition);
  }
  return partitions;
}
305
/**
 * Returns the last millisecond (UTC) of a month.
 *
 * @param {string} month Month as `YYYY-MM`.
 * @returns {number} Epoch milliseconds of 23:59:59.999 UTC on the month's last day.
 */
function monthEndMs(month) {
  const parts = month.split("-");
  const year = Number(parts[0]);
  const monthNumber = Number(parts[1]);
  // Day 0 of the following (0-based) month is the last day of this month.
  return Date.UTC(year, monthNumber, 0, 23, 59, 59, 999);
}
309
/**
 * Returns the last millisecond (UTC) of a quarter.
 *
 * @param {string} quarter Quarter as `YYYY-Qn`.
 * @returns {number} Epoch milliseconds of 23:59:59.999 UTC on the quarter's last day.
 */
function quarterEndMs(quarter) {
  const [yearText, quarterText] = quarter.split("-Q");
  const year = Number(yearText);
  const lastMonthOfQuarter = Number(quarterText) * 3;
  // Day 0 of the month after the quarter is the quarter's last day.
  return Date.UTC(year, lastMonthOfQuarter, 0, 23, 59, 59, 999);
}
315
/**
 * Escapes the characters that are special in a SQL LIKE pattern (`\`, `%`, `_`) by
 * prefixing each with a backslash. Meant for patterns used with `ESCAPE '\'`.
 *
 * @param {string} value Raw text.
 * @returns {string} Text that matches literally inside a LIKE pattern.
 */
function escapeLike(value) {
  // One pass: every special character is replaced by a backslash followed by itself.
  return value.replace(/[\\%_]/g, "\\$&");
}
318
/**
 * SQL aggregate expression for each metric.
 * `ctr` and `position` divide by `NULLIF(SUM(impressions), 0)`, so they evaluate to NULL
 * instead of dividing by zero when there are no impressions.
 */
const METRIC_EXPR = {
  // Totals, cast to DOUBLE.
  clicks: "CAST(SUM(clicks) AS DOUBLE)",
  impressions: "CAST(SUM(impressions) AS DOUBLE)",
  // Clicks per impression.
  ctr: "CAST(SUM(clicks) AS DOUBLE) / NULLIF(SUM(impressions), 0)",
  // sum_position per impression, plus one.
  position: "SUM(sum_position) / NULLIF(SUM(impressions), 0) + 1"
};
324
/**
 * Builds a SQL predicate that is true when `pathExpr` contains at most one `/`.
 * Slashes are counted as the length difference after removing every `/`.
 *
 * @param {string} pathExpr SQL expression (typically a column name); inserted verbatim.
 * @returns {string} SQL boolean expression.
 */
function topLevelPagePredicateSql(pathExpr) {
  const slashCount = `LENGTH(${pathExpr}) - LENGTH(REPLACE(${pathExpr}, '/', ''))`;
  return `${slashCount} <= 1`;
}

/** Token in compiled SQL that is later replaced by the list of parquet files. */
const FILES_PLACEHOLDER = "{{FILES}}";
328
/**
 * Compiles dimension filters into an AND-joined SQL condition with positional parameters.
 * Filters whose operator is not one of the six supported ones are ignored.
 *
 * @param {Array<{dimension: string, operator: string, expression: string}>} filters Filters to compile.
 * @param {string} table Table name, forwarded to `dimensionToColumn`.
 * @returns {{clause: string, params: string[]}} `clause` is `""` when nothing was compiled.
 */
function buildDimensionWhere(filters, table) {
  // LIKE operands are escaped and wrapped in wildcards so they match as substrings.
  const likeParam = (text) => `%${escapeLike(text)}%`;
  const templates = new Map([
    ["equals", (column, value) => [`${column} = ?`, value]],
    ["notEquals", (column, value) => [`${column} != ?`, value]],
    ["contains", (column, value) => [`${column} LIKE ? ESCAPE '\\'`, likeParam(value)]],
    ["notContains", (column, value) => [`${column} NOT LIKE ? ESCAPE '\\'`, likeParam(value)]],
    ["includingRegex", (column, value) => [`regexp_matches(${column}, ?)`, value]],
    ["excludingRegex", (column, value) => [`NOT regexp_matches(${column}, ?)`, value]]
  ]);

  const clauses = [];
  const params = [];
  for (const filter of filters) {
    const column = dimensionToColumn(filter.dimension, table);
    const build = templates.get(filter.operator);
    if (!build) continue;
    const [clause, param] = build(column, filter.expression);
    clauses.push(clause);
    params.push(param);
  }
  return { clause: clauses.join(" AND "), params };
}
365
/**
 * Returns the "top-level pages only" predicate when the plan requests it.
 *
 * @param {{specialFilters: {topLevel?: boolean}}} plan Logical plan.
 * @param {string} table Table name, forwarded to `dimensionToColumn`.
 * @returns {string} SQL predicate, or `""` when `plan.specialFilters.topLevel` is falsy.
 */
function buildTopLevelWhere(plan, table) {
  const wantsTopLevel = Boolean(plan.specialFilters.topLevel);
  if (!wantsTopLevel) return "";
  const pageColumn = dimensionToColumn("page", table);
  return topLevelPagePredicateSql(pageColumn);
}
369
/**
 * Compiles metric filters into a `HAVING` clause with positional parameters.
 * Filters with an unsupported operator are ignored.
 *
 * @param {Array<{metric: string, operator: string, expression: *, expression2?: *}>} filters Metric filters.
 * @returns {{clause: string, params: Array}} `clause` is `""` when nothing was compiled.
 */
function buildHaving(filters) {
  const comparators = new Map([
    ["metricGte", ">="],
    ["metricGt", ">"],
    ["metricLte", "<="],
    ["metricLt", "<"]
  ]);
  const clauses = [];
  const params = [];
  for (const filter of filters) {
    const expr = METRIC_EXPR[filter.metric];
    if (filter.operator === "metricBetween") {
      clauses.push(`${expr} >= ? AND ${expr} <= ?`);
      // Without an upper bound the range collapses to the single value.
      params.push(filter.expression, filter.expression2 ?? filter.expression);
      continue;
    }
    const comparator = comparators.get(filter.operator);
    if (comparator === void 0) continue;
    clauses.push(`${expr} ${comparator} ?`);
    params.push(filter.expression);
  }
  const clause = clauses.length > 0 ? `HAVING ${clauses.join(" AND ")}` : "";
  return { clause, params };
}
406
/**
 * Compiles a logical query plan into a parameterized SQL statement over parquet files.
 *
 * ORDER BY, LIMIT and OFFSET cannot be bound as parameters, so they are sanitized before
 * interpolation, using the same rules as `orderByClause` and `limitOffsetClause`:
 * the ORDER BY column keeps only word characters, the direction is forced to ASC/DESC,
 * and LIMIT/OFFSET are coerced to non-negative integers.
 *
 * @param {object} plan Logical plan (`dateRange`, `metrics`, `groupByDimensions`,
 *   `dimensionFilters`, `metricFilters`, `specialFilters`, `hasDate`, and optionally
 *   `orderBy`, `rowLimit`, `startRow`).
 * @param {string} [table] Table to query; defaults to `plan.dataset`.
 * @returns {{sql: string, params: Array, partitions: string[], table: string, filesPlaceholder: string}}
 * @throws {TypeError} When `rowLimit` or `startRow` is not a finite number.
 */
function compileLogicalQueryPlan(plan, table = plan.dataset) {
  const partitions = enumeratePartitions(plan.dateRange.startDate, plan.dateRange.endDate);
  const metricSelects = plan.metrics.map((metric) => `${METRIC_EXPR[metric]} AS ${metric}`);
  const dimSelects = plan.groupByDimensions.map((dimension) => {
    const column = dimensionToColumn(dimension, table);
    return column !== dimension ? `${column} AS ${dimension}` : dimension;
  });

  const whereClauses = ["date >= ?", "date <= ?"];
  const whereParams = [plan.dateRange.startDate, plan.dateRange.endDate];
  const dimWhere = buildDimensionWhere(plan.dimensionFilters, table);
  if (dimWhere.clause) {
    whereClauses.push(dimWhere.clause);
    whereParams.push(...dimWhere.params);
  }
  const topLevelClause = buildTopLevelWhere(plan, table);
  if (topLevelClause) whereClauses.push(topLevelClause);

  const having = buildHaving(plan.metricFilters);
  const groupByCols = [
    ...plan.groupByDimensions.map((dimension) => dimensionToColumn(dimension, table)),
    ...(plan.hasDate ? ["date"] : [])
  ];
  const groupBy = groupByCols.length > 0 ? `GROUP BY ${groupByCols.join(", ")}` : "";

  let orderBy = "ORDER BY clicks DESC";
  if (plan.orderBy) {
    const safeColumn = plan.orderBy.column.replace(/\W/g, "");
    const safeDir = plan.orderBy.dir.toUpperCase() === "ASC" ? "ASC" : "DESC";
    orderBy = `ORDER BY ${safeColumn} ${safeDir}`;
  }

  const rowLimit = Math.max(0, Math.floor(Number(plan.rowLimit ?? 1e3)));
  const startRow = plan.startRow ? Math.max(0, Math.floor(Number(plan.startRow))) : 0;
  if (!Number.isFinite(rowLimit) || !Number.isFinite(startRow)) {
    throw new TypeError("rowLimit and startRow must be finite numbers");
  }
  const limit = `LIMIT ${rowLimit}`;
  const offset = startRow > 0 ? `OFFSET ${startRow}` : "";

  const select = `SELECT ${[
    ...dimSelects,
    ...(plan.hasDate ? ["date"] : []),
    ...metricSelects
  ].join(", ")}`;
  const sql = [
    select,
    `FROM read_parquet(${FILES_PLACEHOLDER}, union_by_name = true)`,
    `WHERE ${whereClauses.join(" AND ")}`,
    groupBy,
    having.clause,
    orderBy,
    limit,
    offset
  ].filter(Boolean).join(" ").replace(/\s+/g, " ").trim();

  return {
    sql,
    params: [...whereParams, ...having.params],
    partitions,
    table,
    filesPlaceholder: FILES_PLACEHOLDER
  };
}
449
/**
 * Renders file keys as a SQL list literal of single-quoted strings, e.g. `['a', 'b']`.
 * Single quotes inside a key are doubled. An empty input renders as `[]`.
 *
 * @param {string[]} keys File keys or URIs.
 * @returns {string} SQL list literal.
 */
function fileList(keys) {
  const quoted = keys.map((key) => `'${key.split("'").join("''")}'`);
  return `[${quoted.join(", ")}]`;
}
452
/**
 * Replaces every `{{NAME}}` placeholder in `sql` with the SQL list literal of the files
 * registered under `NAME`.
 *
 * The substitution is purely literal: the placeholder name is not interpreted as a regular
 * expression, and the file list is not interpreted as a replacement pattern. File keys that
 * contain `$&`, `$'`, `$1` and similar sequences are therefore inserted unchanged.
 *
 * @param {string} sql SQL text containing `{{NAME}}` placeholders.
 * @param {Object<string, string[]>} sets File keys per placeholder name.
 * @returns {string} SQL with all known placeholders substituted.
 */
function substituteNamedFiles(sql, sets) {
  let out = sql;
  for (const [name, keys] of Object.entries(sets)) {
    // split/join performs a global literal replacement with no special-character handling.
    out = out.split(`{{${name}}}`).join(fileList(keys));
  }
  return out;
}
457
/**
 * Escapes text for use inside a single-quoted SQL string literal by doubling every `'`.
 *
 * @param {string} s Raw text.
 * @returns {string} Escaped text (without surrounding quotes).
 */
function sqlEscape(s) {
  return s.split("'").join("''");
}
460
/**
 * Encodes rows as a parquet file using the given DuckDB handle and returns the file bytes.
 * The rows are registered as a temporary JSON file and copied to a temporary parquet file;
 * for an empty row set the parquet file is produced from the table's empty schema instead.
 * Temporary files that were registered or written are dropped before returning.
 *
 * @param {object} db DuckDB handle (`makeTempPath`, `registerFileBuffer`, `query`,
 *   `copyFileToBuffer`, `dropFiles`).
 * @param {string} table Table name that selects the column schema.
 * @param {object[]} rows Rows to encode.
 * @returns {Promise<Uint8Array>} Parquet bytes.
 */
async function encodeBytes(db, table, rows) {
  const jsonPath = db.makeTempPath("json");
  const parquetPath = db.makeTempPath("parquet");
  const payload = new TextEncoder().encode(JSON.stringify(rows));
  await db.registerFileBuffer(jsonPath, payload);
  const toDrop = [jsonPath];
  try {
    let source;
    if (rows.length === 0) {
      source = `SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE`;
    } else {
      source = `SELECT * FROM read_json_auto('${sqlEscape(jsonPath)}', format='array', columns=${columnsJson(table)})`;
    }
    await db.query(`COPY (${source}) TO '${sqlEscape(parquetPath)}' (FORMAT PARQUET)`);
    // The output is scheduled for cleanup only once the COPY has succeeded.
    toDrop.push(parquetPath);
    return await db.copyFileToBuffer(parquetPath);
  } finally {
    await db.dropFiles(toDrop);
  }
}
476
/**
 * Decodes parquet bytes into rows by registering them as a temporary file and selecting
 * everything from it. DATE columns of `table` are rewritten by `dateReplaceClause`.
 * The temporary file is dropped whether or not the query succeeds.
 *
 * @param {object} db DuckDB handle (`makeTempPath`, `registerFileBuffer`, `query`, `dropFiles`).
 * @param {Uint8Array} bytes Parquet file content.
 * @param {string} [table] Table name used to pick the date columns.
 * @returns {Promise<object[]>} Decoded rows.
 */
async function decodeBytes(db, bytes, table) {
  const parquetPath = db.makeTempPath("parquet");
  await db.registerFileBuffer(parquetPath, bytes);
  try {
    const projection = `* ${dateReplaceClause(table)}`;
    const rows = await db.query(`SELECT ${projection} FROM read_parquet('${sqlEscape(parquetPath)}')`);
    return rows;
  } finally {
    await db.dropFiles([parquetPath]);
  }
}
485
/**
 * Creates a parquet codec backed by DuckDB.
 *
 * @param {{getDuckDB: function(): Promise<object>}} factory Supplies the DuckDB handle.
 * @returns {{writeRows: Function, readRows: Function, compactRows: Function}} Codec with
 *   - `writeRows(ctx, rows, key, dataSource)`: encodes rows and writes them under `key`;
 *   - `readRows(ctx, key, dataSource)`: reads and decodes the object stored under `key`;
 *   - `compactRows(ctx, inputKeys, outputKey, dataSource)`: merges several parquet objects
 *     into one and writes it under `outputKey`.
 *   `writeRows` and `compactRows` resolve to `{ bytes, rowCount }`.
 */
function createDuckDBCodec(factory) {
  return {
    async writeRows(ctx, rows, key, dataSource) {
      const bytes = await encodeBytes(await factory.getDuckDB(), ctx.table, rows);
      await dataSource.write(key, bytes);
      return {
        bytes: bytes.byteLength,
        rowCount: rows.length
      };
    },
    async readRows(ctx, key, dataSource) {
      return decodeBytes(await factory.getDuckDB(), await dataSource.read(key), ctx.table);
    },
    async compactRows(ctx, inputKeys, outputKey, dataSource) {
      const db = await factory.getDuckDB();
      // Nothing to merge: emit an empty parquet file that still carries the table schema.
      if (inputKeys.length === 0) {
        const bytes = await encodeBytes(db, ctx.table, []);
        await dataSource.write(outputKey, bytes);
        return {
          bytes: bytes.byteLength,
          rowCount: 0
        };
      }
      // Fast path: when every input has a URI DuckDB can read directly, no buffer is registered.
      const inputUris = inputKeys.map((k) => dataSource.uri?.(k));
      if (inputUris.every((u) => u !== void 0)) {
        const outName = db.makeTempPath("parquet");
        const fileList = inputUris.map((u) => `'${sqlEscape(u)}'`).join(", ");
        try {
          await db.query(`COPY (SELECT * FROM read_parquet([${fileList}], union_by_name=true)) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
          const bytes = await db.copyFileToBuffer(outName);
          const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
          const rowCount = Number(countRows[0]?.n ?? 0);
          await dataSource.write(outputKey, bytes);
          return {
            bytes: bytes.byteLength,
            rowCount
          };
        } finally {
          await db.dropFiles([outName]);
        }
      }
      // Fallback: load every input into memory and register it as a temporary DuckDB file.
      const inputs = await Promise.all(inputKeys.map((k) => dataSource.read(k)));
      const outName = db.makeTempPath("parquet");
      const registered = [];
      try {
        // Registration happens inside the try so that a failure on a later input still
        // drops the buffers registered for the earlier ones.
        const inNames = [];
        for (const input of inputs) {
          const name = db.makeTempPath("parquet");
          await db.registerFileBuffer(name, input);
          inNames.push(name);
          registered.push(name);
        }
        const fileList = inNames.map((n) => `'${sqlEscape(n)}'`).join(", ");
        await db.query(`COPY (SELECT * FROM read_parquet([${fileList}], union_by_name = true)) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
        registered.push(outName);
        const bytes = await db.copyFileToBuffer(outName);
        const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
        const rowCount = Number(countRows[0]?.n ?? 0);
        await dataSource.write(outputKey, bytes);
        return {
          bytes: bytes.byteLength,
          rowCount
        };
      } finally {
        // Skip the call when nothing was registered (the first registration failed).
        if (registered.length > 0) await db.dropFiles(registered);
      }
    }
  };
}
554
/**
 * Replaces every `read_parquet({{NAME}})` call (with or without `union_by_name = true`)
 * whose file set is empty by a sub-select over the table's empty schema, so the query
 * still returns the right columns with zero rows.
 *
 * The fallback is built lazily: when no file set is empty, `table` is never looked up,
 * so a missing or unknown table does not cause a failure in that case.
 *
 * @param {string} sql SQL text containing `{{NAME}}` placeholders.
 * @param {Object<string, string[]>} placeholders Resolved file keys per placeholder name.
 * @param {string} table Table name that defines the columns of the fallback.
 * @returns {string} Rewritten SQL.
 */
function rewriteEmptyFileSets(sql, placeholders, table) {
  let emptyFallback;
  let out = sql;
  for (const [name, keys] of Object.entries(placeholders)) {
    if (keys.length > 0) continue;
    if (emptyFallback === void 0) emptyFallback = `(SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE)`;
    const replacement = emptyFallback;
    const pattern = new RegExp(`read_parquet\\(\\s*\\{\\{${name}\\}\\}\\s*(?:,\\s*union_by_name\\s*=\\s*true\\s*)?\\)`, "g");
    // A replacer function inserts the text verbatim (no `$` pattern expansion).
    out = out.replace(pattern, () => replacement);
  }
  return out;
}
564
/**
 * Creates a query executor backed by DuckDB.
 *
 * `execute` resolves every file key either to a URI DuckDB can read directly
 * (`dataSource.uri`) or to a buffer registered under the key itself, substitutes the
 * resolved lists into the SQL, runs the query, and drops the registered buffers afterwards.
 *
 * @param {{getDuckDB: function(): Promise<object>}} factory Supplies the DuckDB handle.
 * @returns {{execute: function(object): Promise<{rows: object[], sql: string}>}} Executor;
 *   `sql` in the result is the final SQL text that was run.
 */
function createDuckDBExecutor(factory) {
  return {
    async execute({ sql, params, fileKeys, dataSource, table, signal }) {
      signal?.throwIfAborted();
      const db = await factory.getDuckDB();
      const placeholders = {};
      const registered = [];
      try {
        // Resolution happens inside the try so that a failed read or registration (or an
        // abort) still drops the buffers that were already registered.
        for (const [name, keys] of Object.entries(fileKeys)) {
          const resolved = [];
          for (const key of keys) {
            const uri = dataSource.uri?.(key);
            if (uri !== void 0) {
              resolved.push(uri);
              continue;
            }
            const bytes = await dataSource.read(key, void 0, signal);
            await db.registerFileBuffer(key, bytes);
            registered.push(key);
            resolved.push(key);
          }
          placeholders[name] = resolved;
        }
        signal?.throwIfAborted();
        const finalSql = substituteNamedFiles(rewriteEmptyFileSets(sql, placeholders, table), placeholders);
        return {
          rows: await db.query(finalSql, params),
          sql: finalSql
        };
      } finally {
        if (registered.length > 0) await db.dropFiles(registered);
      }
    }
  };
}
596
/**
 * Builds a SQL relation with the table's column names and types from a single row of
 * placeholder values. Callers add `WHERE FALSE` to obtain an empty relation.
 *
 * @param {string} table Table name (key of `SCHEMAS`).
 * @returns {string} Parenthesised `FROM (VALUES ...) t(...)` expression.
 */
function emptyTableSchema(table) {
  const valuesRow = placeholderValues(table);
  const columnNames = columnList(table);
  return `(FROM (VALUES ${valuesRow}) t(${columnNames}))`;
}
599
/**
 * Builds a `REPLACE (...)` clause for `SELECT *` that formats every DATE column of the
 * table as a `YYYY-MM-DD` string via `strftime`.
 *
 * @param {string} [table] Table name (key of `SCHEMAS`).
 * @returns {string} The clause, or `""` when `table` is falsy or has no DATE column.
 */
function dateReplaceClause(table) {
  if (!table) return "";
  const replacements = [];
  for (const column of SCHEMAS[table].columns) {
    if (column.type !== "DATE") continue;
    replacements.push(`strftime(${column.name}, '%Y-%m-%d') AS ${column.name}`);
  }
  return replacements.length === 0 ? "" : `REPLACE (${replacements.join(", ")})`;
}
605
/**
 * Returns the table's column names as a comma-separated list, in schema order.
 *
 * @param {string} table Table name (key of `SCHEMAS`).
 * @returns {string} e.g. `url, date, clicks`.
 */
function columnList(table) {
  const names = [];
  for (const column of SCHEMAS[table].columns) names.push(column.name);
  return names.join(", ");
}
608
/**
 * Returns one parenthesised SQL row holding a typed default value per column of the table,
 * in schema order.
 *
 * @param {string} table Table name (key of `SCHEMAS`).
 * @returns {string} e.g. `('', DATE '1970-01-01', 0)`.
 */
function placeholderValues(table) {
  const defaults = SCHEMAS[table].columns.map(({ type }) => defaultForType(type));
  return `(${defaults.join(", ")})`;
}
611
/**
 * Returns a SQL literal of the given engine column type, used as a placeholder value.
 *
 * @param {string} t Column type (`VARCHAR`, `DATE`, `INTEGER`, `BIGINT`, `DOUBLE`).
 * @returns {string} SQL literal; `NULL` for any other type.
 */
function defaultForType(t) {
  switch (t) {
    case "VARCHAR":
      return "''";
    case "DATE":
      return "DATE '1970-01-01'";
    case "INTEGER":
    case "BIGINT":
      return "0";
    case "DOUBLE":
      return "CAST(0 AS DOUBLE)";
    default:
      return "NULL";
  }
}
618
/**
 * Renders the table's columns as a `{'name': 'TYPE', ...}` struct literal, as passed to
 * the `columns=` argument of `read_json_auto`.
 *
 * @param {string} table Table name (key of `SCHEMAS`).
 * @returns {string} Struct literal in schema order.
 */
function columnsJson(table) {
  const pairs = SCHEMAS[table].columns.map(({ name, type }) => `'${name}': '${type}'`);
  return `{${pairs.join(", ")}}`;
}
621
/** Matches the `__v<version>.parquet` suffix of an object key; group 1 is the version. */
const VERSION_RE = /__v(\d+)\.parquet$/;

/**
 * Recovers the lock scope (user, site, table, partition) from an object key produced by
 * `objectKey`:
 * `u_<userId>/[<siteId>/]<table>/[<searchType>/]<tier>/<bucket>__v<version>.parquet`.
 *
 * The partition is always the last two segments. The table normally sits right before it,
 * but keys of non-web search types carry an extra `<searchType>/` segment in between.
 * That case is detected through the known table names in `SCHEMAS`: when the segment before
 * the partition is not a known table but the one before it is, the latter is taken as the
 * table. Keys whose table is not in `SCHEMAS` are parsed positionally, as before.
 *
 * @param {string} key Object key.
 * @returns {{userId: string, siteId: (string|undefined), table: string, partition: string}|undefined}
 *   The scope, or `undefined` when the key does not have the expected shape.
 */
function parseLockScope(key) {
  const match = VERSION_RE.exec(key);
  if (!match) return void 0;
  const parts = key.slice(0, match.index).split("/");
  if (parts.length < 4) return void 0;
  const userPart = parts[0];
  if (!userPart.startsWith("u_")) return void 0;

  const isKnownTable = (name) => Object.prototype.hasOwnProperty.call(SCHEMAS, name);
  let tableIndex = parts.length - 3;
  // Skip over a search-type segment; index 0 is the user segment and is never a table.
  if (!isKnownTable(parts[tableIndex]) && tableIndex >= 2 && isKnownTable(parts[tableIndex - 1])) {
    tableIndex -= 1;
  }
  return {
    userId: userPart.slice(2),
    // Everything between the user segment and the table is the site id (it may contain "/").
    siteId: tableIndex >= 2 ? parts.slice(1, tableIndex).join("/") : void 0,
    table: parts[tableIndex],
    partition: parts.slice(-2).join("/")
  };
}
639
/**
 * Garbage-collects stored objects in two phases:
 * 1. deletes the objects and manifest entries that `listRetired(now - graceMs)` returns;
 * 2. only when `opts.userId` is given: lists the tenant's stored keys and deletes versioned
 *    parquet objects that no manifest entry references and whose version number is not
 *    above the cutoff. Candidates are grouped by lock scope and re-checked against the
 *    manifest while holding that scope's lock before they are deleted.
 *
 * @param {object} deps Provides `manifestStore` (`listRetired`, `delete`, `listAll`,
 *   `withLock`) and `dataSource` (`delete`, `list`, optional `streamList`).
 * @param {number} now Current time in epoch milliseconds.
 * @param {number} graceMs Grace period in milliseconds.
 * @param {{userId?: string, siteId?: string}} [opts] Tenant to sweep for orphans.
 * @returns {Promise<{deleted: number}>} Number of retired plus orphaned objects deleted.
 */
async function gcOrphansImpl(deps, now, graceMs, opts = {}) {
  const cutoff = now - graceMs;

  // Phase 1: retired manifest entries.
  const retired = await deps.manifestStore.listRetired(cutoff);
  if (retired.length > 0) {
    await deps.dataSource.delete(retired.map((e) => e.objectKey));
    await deps.manifestStore.delete(retired);
  }
  if (!opts.userId) return { deleted: retired.length + 0 };

  // Phase 2: stored objects that the manifest does not know about.
  const prefix = tenantPrefix({
    userId: opts.userId,
    siteId: opts.siteId
  });
  const knownEntries = await deps.manifestStore.listAll({
    userId: opts.userId,
    siteId: opts.siteId
  });
  const knownKeys = new Set(knownEntries.map((e) => e.objectKey));

  // Prefer the streaming listing; otherwise adapt the array listing to an async iterator.
  const listEagerly = async function* () {
    const all = await deps.dataSource.list(prefix);
    for (const k of all) yield k;
  };
  const keyStream = deps.dataSource.streamList ? deps.dataSource.streamList(prefix) : listEagerly();

  const orphans = [];
  for await (const key of keyStream) {
    if (knownKeys.has(key)) continue;
    const versionMatch = VERSION_RE.exec(key);
    if (!versionMatch) continue;
    if (Number(versionMatch[1]) <= cutoff) orphans.push(key);
  }

  const groups = new Map();
  for (const key of orphans) {
    const scope = parseLockScope(key);
    if (!scope) continue;
    const groupId = `${scope.userId}|${scope.siteId ?? ""}|${scope.table}|${scope.partition}`;
    const group = groups.get(groupId);
    if (group) group.keys.push(key);
    else groups.set(groupId, { scope, keys: [key] });
  }

  let sweptOrphans = 0;
  for (const { scope, keys } of groups.values()) {
    await deps.manifestStore.withLock(scope, async () => {
      // Re-read the manifest under the lock: a key may have been registered in the meantime.
      const registeredNow = await deps.manifestStore.listAll({
        userId: scope.userId,
        siteId: scope.siteId,
        table: scope.table,
        partitions: [scope.partition]
      });
      const registeredKeys = new Set(registeredNow.map((e) => e.objectKey));
      const stillOrphans = keys.filter((k) => !registeredKeys.has(k));
      if (stillOrphans.length === 0) return;
      await deps.dataSource.delete(stillOrphans);
      sweptOrphans += stillOrphans.length;
    });
  }
  return { deleted: retired.length + sweptOrphans };
}
697
/**
 * Extra `AND ...` predicates for period-over-period comparisons, keyed by comparison filter.
 * `c` and `p` are the table aliases used inside these predicates.
 */
const COMPARISON_FILTER_SQL = {
  // No impressions on the `p` side.
  new: sql`AND (p.impressions IS NULL OR p.impressions = 0)`,
  // Impressions on the `p` side but none on the `c` side.
  lost: sql`AND p.impressions > 0 AND c.impressions = 0`,
  // More clicks on the `c` side; a missing `p` value counts as zero.
  improving: sql`AND c.clicks > COALESCE(p.clicks, 0)`,
  // Fewer clicks on the `c` side than a positive `p` value.
  declining: sql`AND c.clicks < p.clicks AND p.clicks > 0`
};
703
/**
 * Collapses every run of whitespace into a single space and removes leading and trailing
 * whitespace.
 *
 * @param {string} s Text, e.g. multi-line SQL.
 * @returns {string} Single-line text.
 */
function collapseWs(s) {
  return s.split(/\s+/).filter(Boolean).join(" ");
}
706
/**
 * Joins SQL fragments with ` AND `.
 *
 * @param {Array} parts SQL fragments.
 * @returns {object} Combined SQL fragment.
 */
function joinAnd(parts) {
  const separator = sql` AND `;
  return sql.join(parts, separator);
}
709
/**
 * Joins SQL fragments with `, `.
 *
 * @param {Array} parts SQL fragments.
 * @returns {object} Combined SQL fragment.
 */
function joinComma(parts) {
  const separator = sql`, `;
  return sql.join(parts, separator);
}
712
/**
 * Builds the raw `ORDER BY` clause. Without `state.orderBy` the result orders by
 * `clicks DESC`. The column keeps only word characters and the direction is `ASC` only
 * when explicitly requested (case-insensitive); otherwise it is `DESC`.
 *
 * @param {{orderBy?: {column: string, dir: string}}} state Query state.
 * @param {string} [prefix] Text placed directly before the column (e.g. a table alias with dot).
 * @returns {object} Raw SQL fragment.
 */
function orderByClause(state, prefix = "") {
  let column = "clicks";
  let direction = "DESC";
  if (state.orderBy) {
    column = state.orderBy.column.replace(/\W/g, "");
    direction = state.orderBy.dir.toUpperCase() === "ASC" ? "ASC" : "DESC";
  }
  return sql.raw(`ORDER BY ${prefix}${column} ${direction}`);
}
720
/**
 * Builds the raw `LIMIT ... [OFFSET ...]` clause. Both values are coerced to numbers,
 * floored and clamped at zero. The limit defaults to 100, and `OFFSET` is emitted only
 * when it is greater than zero.
 *
 * @param {{rowLimit?: number, startRow?: number}} state Query state.
 * @returns {object} Raw SQL fragment.
 */
function limitOffsetClause(state) {
  const toCount = (value) => Math.max(0, Math.floor(Number(value)));
  const rowLimit = toCount(state.rowLimit ?? 100);
  const offset = state.startRow ? toCount(state.startRow) : 0;
  const clause = offset > 0 ? `LIMIT ${rowLimit} OFFSET ${offset}` : `LIMIT ${rowLimit}`;
  return sql.raw(clause);
}
725
/**
 * Returns `name` as a double-quoted raw SQL identifier, with every non-word character
 * removed first.
 *
 * @param {string} name Alias name.
 * @returns {object} Raw SQL fragment.
 */
function aliasRaw(name) {
  const wordCharsOnly = name.replace(/\W/g, "");
  const quoted = `"${wordCharsOnly}"`;
  return sql.raw(quoted);
}
729
/**
 * Copies each dimension filter into a new object holding exactly the four fields
 * `dimension`, `operator`, `expression` and `expression2`.
 *
 * @param {object[]} filters Dimension filters from the logical plan.
 * @returns {object[]} New filter objects.
 */
function toInternalDimensionFilters(filters) {
  return filters.map(({ dimension, operator, expression, expression2 }) => ({
    dimension,
    operator,
    expression,
    expression2
  }));
}
737
/**
 * Converts metric filters into the internal filter shape: the metric name is stored under
 * `dimension` and the operands are converted to strings. `expression2` stays `undefined`
 * when it is null or undefined.
 *
 * @param {object[]} filters Metric filters from the logical plan.
 * @returns {object[]} Internal filter objects.
 */
function toInternalMetricFilters(filters) {
  const toOptionalString = (value) => (value == null ? void 0 : String(value));
  return filters.map((filter) => {
    const { metric, operator } = filter;
    return {
      dimension: metric,
      operator,
      expression: String(filter.expression),
      expression2: toOptionalString(filter.expression2)
    };
  });
}
745
/**
 * Expresses the plan's "top-level pages only" flag as an internal filter list.
 *
 * @param {{specialFilters: {topLevel?: boolean}}} plan Logical plan.
 * @returns {object[]} A single `topLevel` filter on `page`, or an empty list when the flag is falsy.
 */
function topLevelFilters(plan) {
  const filters = [];
  if (plan.specialFilters.topLevel) {
    filters.push({
      dimension: "page",
      operator: "topLevel",
      expression: ""
    });
  }
  return filters;
}
753
/**
 * Copies a logical filter into a new object holding exactly the four fields
 * `dimension`, `operator`, `expression` and `expression2`.
 *
 * @param {object} filter Logical filter.
 * @returns {object} New internal filter object.
 */
function logicalFilterToInternal(filter) {
  const { dimension, operator, expression, expression2 } = filter;
  return { dimension, operator, expression, expression2 };
}
761
/**
 * Recursively compiles a filter tree into one SQL fragment.
 * A leaf compiles to the first predicate the adapter returns for its filter. A group joins
 * its compiled children with `OR` (when `groupType` is `"or"`) or `AND`, wrapped in
 * parentheses; a group with a single compiled child returns that child unwrapped.
 *
 * @param {object|undefined} node Tree node (`kind: "leaf"` with `filter`, or a group with
 *   `children` and `groupType`).
 * @param {object} adapter Provides `dimensionPredicates(filters, tableKey)`.
 * @param {*} tableKey Table key forwarded to the adapter.
 * @returns {object|undefined} SQL fragment, or `undefined` when nothing was compiled.
 */
function compileFilterTree(node, adapter, tableKey) {
  if (!node) return void 0;
  if (node.kind === "leaf") {
    const predicates = adapter.dimensionPredicates([logicalFilterToInternal(node.filter)], tableKey);
    return predicates[0];
  }
  const compiledChildren = [];
  for (const child of node.children) {
    const childSql = compileFilterTree(child, adapter, tableKey);
    if (childSql !== void 0) compiledChildren.push(childSql);
  }
  if (compiledChildren.length === 0) return void 0;
  if (compiledChildren.length === 1) return compiledChildren[0];
  const separator = node.groupType === "or" ? sql` OR ` : sql` AND `;
  return sql`(${sql.join(compiledChildren, separator)})`;
}
770
/**
 * Builds the shared query scope for a builder state: the logical plan, the resolved table
 * key and the WHERE/HAVING predicates.
 *
 * WHERE predicates are added in this order: site id (only when the adapter exposes
 * `siteIdColRef` and `siteId` is not null/undefined), date lower bound, date upper bound,
 * dimension filters, top-level predicate. Dimension filters come from the plan's filter
 * tree when one exists, otherwise from the flat filter list.
 *
 * @param {object} state Builder state passed to `buildLogicalPlan`.
 * @param {{adapter: object, siteId?: *}} options Adapter and optional site id.
 * @returns {{plan: object, tableKey: *, groupByDims: Array, hasDate: boolean, metrics: Array,
 *   wherePredicates: Array, having: *, dimFilters: Array, startDate: string, endDate: string}}
 */
function buildScope(state, options) {
  const { adapter, siteId } = options;
  const plan = buildLogicalPlan(state, adapter.capabilities);
  const tableKey = adapter.tableKeyForDataset(plan.dataset);
  const dimFilters = toInternalDimensionFilters(plan.dimensionFilters);
  const metricFilters = toInternalMetricFilters(plan.metricFilters);

  const wherePredicates = [];
  if (adapter.siteIdColRef && siteId != null) {
    wherePredicates.push(sql`${adapter.siteIdColRef(tableKey)} = ${siteId}`);
  }
  wherePredicates.push(sql`${adapter.dateColRef(tableKey)} >= ${plan.dateRange.startDate}`);
  wherePredicates.push(sql`${adapter.dateColRef(tableKey)} <= ${plan.dateRange.endDate}`);

  if (plan.dimensionFilterTree) {
    // A tree that compiles to nothing contributes no predicate at all.
    const treeSql = compileFilterTree(plan.dimensionFilterTree, adapter, tableKey);
    if (treeSql) wherePredicates.push(treeSql);
  } else {
    wherePredicates.push(...adapter.dimensionPredicates(dimFilters, tableKey));
  }

  const topLevel = adapter.topLevelPredicate(topLevelFilters(plan), tableKey);
  if (topLevel) wherePredicates.push(topLevel);

  return {
    plan,
    tableKey,
    groupByDims: plan.groupByDimensions,
    hasDate: plan.hasDate,
    metrics: plan.metrics,
    wherePredicates,
    having: adapter.havingPredicates(metricFilters, tableKey),
    dimFilters,
    startDate: plan.dateRange.startDate,
    endDate: plan.dateRange.endDate
  };
}
801
/**
 * Thin wrapper that forwards to `buildLogicalComparisonPlan`.
 *
 * @param {object} current Builder state for the current period.
 * @param {object} previous Builder state for the previous period.
 * @param {object} capabilities Adapter capability flags.
 * @returns {object} Whatever `buildLogicalComparisonPlan` returns.
 */
function buildComparisonPlan(current, previous, capabilities) {
  const comparisonPlan = buildLogicalComparisonPlan(current, previous, capabilities);
  return comparisonPlan;
}
804
/**
 * Compiles a SQL object through the adapter and passes the resulting SQL
 * text through `collapseWs`; the parameter list is returned untouched.
 *
 * @param {{compile: (q: object) => {sql: string, params: unknown[]}}} adapter
 * @param {object} q SQL object to compile.
 * @returns {{sql: string, params: unknown[]}}
 */
function compileCollapsed(adapter, q) {
  const { sql: rawText, params } = adapter.compile(q);
  return { sql: collapseWs(rawText), params };
}
811
/**
 * Builds the un-grouped totals query for a builder state: one aggregated
 * column per plan metric, restricted by the scope's WHERE predicates.
 *
 * @param {object} state Query builder state.
 * @param {{adapter: object, siteId?: unknown}} options
 * @returns {{sql: string, params: unknown[]}} Compiled, whitespace-collapsed query.
 */
function buildTotalsSql(state, options) {
  const { adapter } = options;
  const scope = buildScope(state, options);
  const fromTable = adapter.tableRef(scope.tableKey);
  const projections = scope.metrics.map(
    (metric) => sql`${adapter.metricSql(metric, scope.tableKey)} as ${aliasRaw(metric)}`
  );

  let statement;
  if (scope.wherePredicates.length > 0) {
    statement = sql`SELECT ${joinComma(projections)} FROM ${fromTable} WHERE ${joinAnd(scope.wherePredicates)}`;
  } else {
    statement = sql`SELECT ${joinComma(projections)} FROM ${fromTable}`;
  }
  return compileCollapsed(adapter, statement);
}
818
/**
 * Builds the period-over-period comparison query and its row-count twin.
 *
 * Two CTEs are produced over the same table: `current` (current period,
 * with the current scope's WHERE and HAVING) and `previous` (previous
 * period's date bounds plus the current plan's dimension filters). They
 * are LEFT JOINed on the group-by dimensions, so rows that exist only in
 * the previous period are not returned.
 *
 * @param {object} current Builder state for the current period.
 * @param {object} previous Builder state for the previous period.
 * @param {{adapter: object, siteId?: unknown}} options
 * @param {string} [comparisonFilter] Key into `COMPARISON_FILTER_SQL`; omitted means no extra filter.
 * @returns {{sql: string, params: unknown[], countSql: string, countParams: unknown[]}}
 */
function resolveComparisonSQL(current, previous, options, comparisonFilter) {
  const { adapter, siteId } = options;
  const comparisonPlan = buildComparisonPlan(current, previous, adapter.capabilities);
  const currentScope = buildScope(current, options);
  const previousScope = buildScope(previous, options);
  const { tableKey, groupByDims, wherePredicates: currentWhere, having } = currentScope;
  const table = adapter.tableRef(tableKey);

  // Dimension names are interpolated as raw identifiers further down, so
  // strip every non-word character once and reuse the sanitised form.
  const safeDimNames = groupByDims.map((d) => d.replace(/\W/g, ""));

  const dimSelectExprs = [];
  for (const d of groupByDims) {
    const expr = adapter.dimExprSql(d, tableKey);
    const colName = adapter.dimColumn(d, tableKey);
    // Alias whenever the selected expression would not already be named after the dimension.
    if (d === "page" || colName !== d) dimSelectExprs.push(sql`${expr} as ${aliasRaw(d)}`);
    else dimSelectExprs.push(expr);
  }

  // Both CTEs must expose all four metric columns: the outer SELECT reads
  // c.clicks / c.impressions / c.ctr / c.position unconditionally, so
  // projecting only the plan's requested metrics in `current` would yield
  // a query that references missing columns.
  const allMetricExprs = () => adapter.METRIC_NAMES.map((m) => sql`${adapter.metricSql(m, tableKey)} as ${aliasRaw(m)}`);
  const currentSelect = [...dimSelectExprs, ...allMetricExprs()];
  const prevSelect = [...dimSelectExprs, ...allMetricExprs()];
  const groupByExprs = groupByDims.map((d) => adapter.dimExprSql(d, tableKey));

  const prevWhere = [];
  if (adapter.siteIdColRef && siteId != null) prevWhere.push(sql`${adapter.siteIdColRef(tableKey)} = ${siteId}`);
  if (previousScope.startDate) prevWhere.push(sql`${adapter.dateColRef(tableKey)} >= ${previousScope.startDate}`);
  if (previousScope.endDate) prevWhere.push(sql`${adapter.dateColRef(tableKey)} <= ${previousScope.endDate}`);
  // The previous period is filtered with the *current* plan's dimension filters.
  const prevDimSql = comparisonPlan.current.dimensionFilterTree
    ? compileFilterTree(comparisonPlan.current.dimensionFilterTree, adapter, tableKey)
    : undefined;
  if (prevDimSql) prevWhere.push(prevDimSql);
  else if (!comparisonPlan.current.dimensionFilterTree) {
    prevWhere.push(...adapter.dimensionPredicates(toInternalDimensionFilters(comparisonPlan.current.dimensionFilters), tableKey));
  }

  let currentCte = currentWhere.length > 0
    ? sql`SELECT ${joinComma(currentSelect)} FROM ${table} WHERE ${joinAnd(currentWhere)}`
    : sql`SELECT ${joinComma(currentSelect)} FROM ${table}`;
  if (groupByExprs.length > 0) currentCte = sql`${currentCte} GROUP BY ${joinComma(groupByExprs)}`;
  if (having.length > 0) currentCte = sql`${currentCte} HAVING ${joinAnd(having)}`;

  let previousCte = prevWhere.length > 0
    ? sql`SELECT ${joinComma(prevSelect)} FROM ${table} WHERE ${joinAnd(prevWhere)}`
    : sql`SELECT ${joinComma(prevSelect)} FROM ${table}`;
  if (groupByExprs.length > 0) previousCte = sql`${previousCte} GROUP BY ${joinComma(groupByExprs)}`;

  // Without group-by dimensions both CTEs yield a single row; join them unconditionally.
  const joinOn = safeDimNames.length > 0
    ? sql.raw(safeDimNames.map((name) => `c.${name} = p.${name}`).join(" AND "))
    : sql.raw("1=1");
  const filterClause = comparisonFilter ? COMPARISON_FILTER_SQL[comparisonFilter] : sql.raw("");
  const orderSql = orderByClause(current, "c.");
  const limitSql = limitOffsetClause(current);

  const outerCurrentCols = safeDimNames.map((name) => sql.raw(`c.${name} as "${name}"`));
  outerCurrentCols.push(sql.raw('CAST(c.clicks AS DOUBLE) as "clicks"'));
  outerCurrentCols.push(sql.raw('CAST(c.impressions AS DOUBLE) as "impressions"'));
  outerCurrentCols.push(sql.raw('c.ctr as "ctr"'));
  outerCurrentCols.push(sql.raw('c.position as "position"'));

  const mainQuery = sql`WITH current AS (${currentCte}), previous AS (${previousCte}) SELECT ${joinComma(outerCurrentCols)}, COALESCE(CAST(p.clicks AS DOUBLE), 0) as "prevClicks", COALESCE(CAST(p.impressions AS DOUBLE), 0) as "prevImpressions", COALESCE(p.ctr, 0) as "prevCtr", COALESCE(p.position, 0) as "prevPosition" FROM current c LEFT JOIN previous p ON ${joinOn} WHERE 1=1 ${filterClause} ${orderSql} ${limitSql}`;

  // The count query only needs some column from `current`; fall back to clicks when ungrouped.
  const firstGroupBy = safeDimNames[0] ? safeDimNames[0] : "clicks";
  const countInnerSelect = sql.raw(`c.${firstGroupBy}`);
  const countQuery = sql`WITH current AS (${currentCte}), previous AS (${previousCte}) SELECT COUNT(*) as total FROM (SELECT ${countInnerSelect} FROM current c LEFT JOIN previous p ON ${joinOn} WHERE 1=1 ${filterClause})`;

  const main = compileCollapsed(adapter, mainQuery);
  const count = compileCollapsed(adapter, countQuery);
  return {
    sql: main.sql,
    params: main.params,
    countSql: count.sql,
    countParams: count.params
  };
}
873
/**
 * Builds the supplementary queries that accompany a main query.
 *
 * Currently only one extra exists: when the plan groups by
 * `queryCanonical`, a `canonicalExtras` query over the keywords table
 * reports, per canonical key, the variant count, the top-clicked variant
 * name and an encoded list of up to ten variants.
 *
 * @param {object} state Query builder state.
 * @param {{adapter: object, siteId?: unknown}} options
 * @returns {Array<{key: string, sql: string, params: unknown[]}>} Empty when no extras apply.
 */
function buildExtrasQueries(state, options) {
  const { adapter, siteId } = options;
  const plan = buildLogicalPlan(state, adapter.capabilities);
  if (!plan.groupByDimensions.includes("queryCanonical")) return [];

  const keywordsKey = adapter.tableKeyForDataset("keywords");
  const t = adapter.schema[keywordsKey];
  const table = adapter.tableRef(keywordsKey);

  const conditions = [];
  if (adapter.siteIdColRef && siteId != null) {
    conditions.push(sql`${adapter.siteIdColRef(keywordsKey)} = ${siteId}`);
  }
  conditions.push(
    sql`${adapter.dateColRef(keywordsKey)} >= ${plan.dateRange.startDate}`,
    sql`${adapter.dateColRef(keywordsKey)} <= ${plan.dateRange.endDate}`
  );
  const whereExpr = conditions.length > 0 ? sql`WHERE ${joinAnd(conditions)}` : sql``;

  // Referenced as a bare identifier in the outer SELECT, where it names the CTE's `query` alias.
  const outerQueryCol = sql.raw("query");
  const statement = sql`WITH per_variant AS (SELECT ${t.query_canonical} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${t.query_canonical} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${t.query_canonical}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${t.query_canonical}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`;

  const { sql: text, params } = compileCollapsed(adapter, statement);
  return [{ key: "canonicalExtras", sql: text, params }];
}
896
/**
 * Catalogue of logical datasets. For each dataset it lists the dimensions
 * available, the physical column each dimension maps to, and the
 * `surfaces` tags attached to that dimension.
 */
const LOGICAL_DATASETS = (() => {
  // Each call returns a fresh binding so no `surfaces` array is shared between dimensions.
  const apiStored = (column) => ({ column, surfaces: ["api", "stored"] });
  const storedDerived = (column) => ({ column, surfaces: ["stored", "derived"] });

  return {
    pages: {
      dimensions: {
        page: apiStored("url"),
        date: apiStored("date")
      }
    },
    keywords: {
      dimensions: {
        query: apiStored("query"),
        queryCanonical: storedDerived("query_canonical"),
        date: apiStored("date")
      }
    },
    page_keywords: {
      dimensions: {
        page: apiStored("url"),
        query: apiStored("query"),
        queryCanonical: storedDerived("query_canonical"),
        date: apiStored("date")
      }
    },
    countries: {
      dimensions: {
        country: apiStored("country"),
        date: apiStored("date")
      }
    },
    devices: {
      dimensions: {
        device: apiStored("device"),
        date: apiStored("date")
      }
    },
    search_appearance: {
      dimensions: {
        searchAppearance: apiStored("searchAppearance"),
        date: apiStored("date")
      }
    }
  };
})();
970
/**
 * Picks the logical dataset that can serve a set of grouping and filter
 * dimensions.
 *
 * Precedence: searchAppearance, then page combined with a query dimension,
 * then a query dimension alone, then page, country, device. Falls back to
 * "keywords" when none of those dimensions is present.
 *
 * @param {Iterable<string>} dimensions Grouping dimensions.
 * @param {Iterable<string>} [filterDims] Dimensions referenced by filters.
 * @returns {string} Dataset name.
 */
function inferLogicalDataset(dimensions, filterDims = []) {
  const present = new Set([...dimensions, ...filterDims]);

  if (present.has("searchAppearance")) return "search_appearance";

  const usesQuery = present.has("query") || present.has("queryCanonical");
  if (usesQuery) return present.has("page") ? "page_keywords" : "keywords";

  const singleDimensionDatasets = [
    ["page", "pages"],
    ["country", "countries"],
    ["device", "devices"]
  ];
  for (const [dimension, dataset] of singleDimensionDatasets) {
    if (present.has(dimension)) return dataset;
  }
  return "keywords";
}
981
/** The four metric identifiers the SQL fragments know how to aggregate. */
const METRIC_NAMES = ["clicks", "impressions", "ctr", "position"];
987
/**
 * Returns a SQL expression (INSTR/SUBSTR based) that reduces a URL column
 * to its path: for values starting with "http" it yields everything from
 * the first "/" after the "://" separator, or "/" when there is none;
 * other values are passed through unchanged.
 *
 * @param {string} col Column reference, inserted verbatim into the SQL text.
 * @returns {string} SQL expression text.
 */
function defaultSqliteUrlToPathExpr(col) {
  const schemePos = `INSTR(${col}, '://')`;
  const afterScheme = `SUBSTR(${col}, ${schemePos} + 3)`;
  const firstSlash = `INSTR(${afterScheme}, '/')`;
  const pathPart = `SUBSTR(${col}, ${schemePos} + 2 + ${firstSlash})`;
  return `CASE WHEN ${col} LIKE 'http%' THEN CASE WHEN ${firstSlash} > 0 THEN ${pathPart} ELSE '/' END ELSE ${col} END`;
}
990
/**
 * Builds a `tableKey -> { dimension -> column }` lookup from a
 * `dataset -> tableKey` mapping, using the bindings in `LOGICAL_DATASETS`.
 * A dimension without a bound column maps to its own name.
 *
 * @param {Record<string, string>} datasetToTableKey
 * @returns {Record<string, Record<string, string>>}
 */
function buildDimensionColumnMap(datasetToTableKey) {
  const columnsFor = (dataset) => {
    const bindings = Object.entries(LOGICAL_DATASETS[dataset].dimensions);
    return Object.fromEntries(bindings.map(([dim, binding]) => [dim, binding?.column ?? dim]));
  };

  const perTable = [];
  for (const [dataset, tableKey] of Object.entries(datasetToTableKey)) {
    perTable.push([tableKey, columnsFor(dataset)]);
  }
  return Object.fromEntries(perTable);
}
997
/**
 * Creates the dialect-specific SQL fragment builders shared by resolver
 * adapters.
 *
 * @param {object} config
 * @param {object} config.schema Table objects keyed by table key; each exposes its columns by name.
 * @param {Record<string, string>} config.datasetToTableKey Logical dataset name -> table key.
 * @param {string} config.metricCast SQL type name the clicks sum is cast to when computing CTR.
 * @param {(expr: object, pattern: string, negate: boolean) => object} config.regexPredicate
 * @param {string} config.tableLabel Prefix used in error messages.
 * @param {boolean} config.includeSiteId When falsy, `siteIdColRef` is omitted from the result.
 * @param {(col: string) => string} [config.urlToPathExpr] Overrides the default URL-to-path expression.
 * @param {(tableKey: string) => object} [config.tableRef] Overrides how a table is referenced in FROM.
 * @returns {object} The fragment helpers listed in the return statement.
 */
function createSqlFragments(config) {
  const { schema, datasetToTableKey, metricCast, regexPredicate, tableLabel, includeSiteId, urlToPathExpr: urlToPathExprOverride, tableRef: tableRefOverride } = config;
  const DIM_COLUMN_MAP = buildDimensionColumnMap(datasetToTableKey);

  /** True when `dim` names one of the four metrics rather than a dimension. */
  function isMetricDimension(dim) {
    return METRIC_NAMES.includes(dim);
  }

  /** Physical column for a dimension on a table; falls back to the dimension name. */
  function dimColumn(dim, table) {
    return DIM_COLUMN_MAP[table]?.[dim] ?? dim;
  }

  function tableKeyForDataset(dataset) {
    return datasetToTableKey[dataset];
  }

  function inferTable(dimensions, filterDims = []) {
    return tableKeyForDataset(inferLogicalDataset(dimensions, filterDims));
  }

  const urlToPathExpr = urlToPathExprOverride ?? defaultSqliteUrlToPathExpr;

  /**
   * SQL reference to a column of a table.
   * @throws {Error} When the table key or the column is not present in the schema.
   */
  function colRef(tableKey, colName) {
    // Optional chaining so an unknown table key raises the same labelled
    // error as an unknown column instead of an opaque TypeError.
    const c = schema[tableKey]?.[colName];
    if (!c) throw new Error(`${tableLabel}: unknown column '${colName}' on ${tableKey}`);
    return sql`${c}`;
  }

  function tableRef(tableKey) {
    if (tableRefOverride) return tableRefOverride(tableKey);
    return sql`${schema[tableKey]}`;
  }

  function dateColRef(tableKey) {
    return colRef(tableKey, "date");
  }

  function siteIdColRef(tableKey) {
    return colRef(tableKey, "site_id");
  }

  /** Select/group expression for a dimension; `page` is rewritten to its URL path via raw SQL. */
  function dimExprSql(dim, tableKey) {
    const colName = dimColumn(dim, tableKey);
    if (dim === "page") return sql.raw(urlToPathExpr(colName));
    return colRef(tableKey, colName);
  }

  /** Aggregate expression for one of the four metrics (undefined for any other name). */
  function metricSql(metric, tableKey) {
    const t = schema[tableKey];
    switch (metric) {
      case "clicks": return sql`SUM(${t.clicks})`;
      case "impressions": return sql`SUM(${t.impressions})`;
      case "ctr": return sql`CAST(SUM(${t.clicks}) AS ${sql.raw(metricCast)}) / NULLIF(SUM(${t.impressions}), 0)`;
      case "position": return sql`SUM(${t.sum_position}) / NULLIF(SUM(${t.impressions}), 0) + 1`;
    }
  }

  /** HAVING predicates for metric filters; non-metric filters and unknown operators are ignored. */
  function havingPredicates(filters, tableKey) {
    const preds = [];
    for (const f of filters) {
      const metric = f.dimension;
      if (!isMetricDimension(metric)) continue;
      const expr = metricSql(metric, tableKey);
      const v = Number(f.expression);
      switch (f.operator) {
        case "metricGte":
          preds.push(sql`${expr} >= ${v}`);
          break;
        case "metricGt":
          preds.push(sql`${expr} > ${v}`);
          break;
        case "metricLte":
          preds.push(sql`${expr} <= ${v}`);
          break;
        case "metricLt":
          preds.push(sql`${expr} < ${v}`);
          break;
        case "metricBetween": {
          const v2 = Number(f.expression2);
          preds.push(sql`${expr} >= ${v} AND ${expr} <= ${v2}`);
          break;
        }
      }
    }
    return preds;
  }

  /**
   * WHERE predicates for dimension filters. Metric filters, `date` filters
   * and `topLevel` filters are skipped here (they are handled elsewhere).
   */
  function dimensionPredicates(filters, tableKey) {
    const preds = [];
    for (const f of filters) {
      if (isMetricDimension(f.dimension)) continue;
      if (f.dimension === "date") continue;
      if (f.operator === "topLevel") continue;
      const cRef = colRef(tableKey, dimColumn(f.dimension, tableKey));
      // Equality on `page` compares against the derived path expression;
      // LIKE and regex operators match against the raw column.
      const matchExpr = f.dimension === "page" ? dimExprSql(f.dimension, tableKey) : cRef;
      switch (f.operator) {
        case "equals":
          preds.push(sql`${matchExpr} = ${f.expression}`);
          break;
        case "notEquals":
          preds.push(sql`${matchExpr} != ${f.expression}`);
          break;
        case "contains":
          preds.push(sql`${cRef} LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
          break;
        case "notContains":
          preds.push(sql`${cRef} NOT LIKE ${`%${escapeLike(f.expression)}%`} ESCAPE '\\'`);
          break;
        case "includingRegex":
          preds.push(regexPredicate(cRef, f.expression, false));
          break;
        case "excludingRegex":
          preds.push(regexPredicate(cRef, f.expression, true));
          break;
      }
    }
    return preds;
  }

  /** Predicate keeping only paths that contain at most one "/" — or undefined when not requested. */
  function topLevelPredicate(filters, tableKey) {
    if (!filters.some((f) => f.operator === "topLevel")) return undefined;
    const pathExpr = dimExprSql("page", tableKey);
    return sql`LENGTH(${pathExpr}) - LENGTH(REPLACE(${pathExpr}, '/', '')) <= 1`;
  }

  return {
    METRIC_NAMES,
    DIM_COLUMN_MAP,
    isMetricDimension,
    tableKeyForDataset,
    dimColumn,
    inferTable,
    urlToPathExpr,
    colRef,
    tableRef,
    dateColRef,
    siteIdColRef: includeSiteId ? siteIdColRef : undefined,
    dimExprSql,
    metricSql,
    havingPredicates,
    dimensionPredicates,
    topLevelPredicate
  };
}
1126
/**
 * Assembles a resolver adapter: the SQL fragment helpers built from
 * `config`, plus the config's own `capabilities`, `schema` and `compile`.
 *
 * @param {object} config Same shape `createSqlFragments` accepts, plus `capabilities` and `compile`.
 * @returns {object} Adapter object.
 */
function createResolverAdapter(config) {
  const fragments = createSqlFragments(config);
  const { capabilities, schema, compile } = config;
  return {
    METRIC_NAMES: fragments.METRIC_NAMES,
    capabilities,
    schema,
    tableKeyForDataset: fragments.tableKeyForDataset,
    inferTable: fragments.inferTable,
    dimColumn: fragments.dimColumn,
    isMetricDimension: fragments.isMetricDimension,
    tableRef: fragments.tableRef,
    dateColRef: fragments.dateColRef,
    urlToPathExpr: fragments.urlToPathExpr,
    siteIdColRef: fragments.siteIdColRef,
    dimExprSql: fragments.dimExprSql,
    metricSql: fragments.metricSql,
    dimensionPredicates: fragments.dimensionPredicates,
    havingPredicates: fragments.havingPredicates,
    topLevelPredicate: fragments.topLevelPredicate,
    compile
  };
}
1148
// Shared dialect instance used by compilePg.
const pgDialect = new PgDialect();
// Instance is constructed and discarded; kept as-is from the bundled output.
new SQLiteAsyncDialect();

/**
 * Compiles a SQL object with the Postgres dialect.
 *
 * @param {object} query SQL object.
 * @returns {{sql: string, params: unknown[]}}
 */
function compilePg(query) {
  const { sql: text, params } = pgDialect.sqlToQuery(query);
  return { sql: text, params };
}
1157
/**
 * Base adapter configuration shared by the adapters built below: identity
 * dataset-to-table mapping, `regexp_matches` for regex filters, a
 * `regexp_replace`-based URL-to-path expression, and no site-id column.
 */
const PG_BASE_CONFIG = {
  schema: drizzleSchema,
  datasetToTableKey: {
    pages: "pages",
    keywords: "keywords",
    page_keywords: "page_keywords",
    countries: "countries",
    devices: "devices",
    search_appearance: "search_appearance"
  },
  metricCast: "DOUBLE",
  regexPredicate(expr, pattern, negate) {
    if (negate) return sql`NOT regexp_matches(${expr}, ${pattern})`;
    return sql`regexp_matches(${expr}, ${pattern})`;
  },
  urlToPathExpr(col) {
    // Strip scheme and host; an empty remainder means the URL had no path, so use "/".
    const withoutOrigin = `regexp_replace(${col}, '^https?://[^/]+', '')`;
    return `CASE WHEN ${col} LIKE 'http%' THEN COALESCE(NULLIF(${withoutOrigin}, ''), '/') ELSE ${col} END`;
  },
  includeSiteId: false,
  compile: compilePg,
  capabilities: {
    regex: true,
    comparisonJoin: true,
    windowTotals: true
  }
};

// Result is intentionally unused here; the call is kept from the bundled output.
createResolverAdapter({
  ...PG_BASE_CONFIG,
  tableLabel: "pg-resolver-adapter"
});
1182
/**
 * Creates a resolver adapter whose FROM clause reads Parquet files: every
 * table reference becomes `read_parquet({{FILES}}, union_by_name = true)`
 * aliased to the table key. The `{{FILES}}` placeholder is emitted
 * literally; substituting it is left to whoever executes the SQL.
 *
 * @returns {object} Resolver adapter.
 */
function createParquetResolverAdapter() {
  const parquetTableRef = (tk) => sql.raw(`read_parquet({{FILES}}, union_by_name = true) AS "${tk}"`);
  const config = {
    ...PG_BASE_CONFIG,
    tableLabel: "parquet-resolver-adapter",
    tableRef: parquetTableRef
  };
  return createResolverAdapter(config);
}
1189
// Tables that purgeUrls rewrites.
const URL_PURGE_TABLES = ["pages", "page_keywords"];
// Hard ceiling (100 MiB) on the size of a single day's written object.
const MAX_DAY_BYTES = 100 * 1024 * 1024;
// "<table>:url" markers for every schema table that declares a `url` column.
const URL_COLUMNS = /* @__PURE__ */ new Set();
for (const tableName of Object.keys(SCHEMAS)) {
  for (const column of SCHEMAS[tableName].columns) {
    if (column.name !== "url") continue;
    URL_COLUMNS.add(`${tableName}:url`);
  }
}
1193
/**
 * Returns `row` with its `url` passed through `normalizeUrl`.
 *
 * The input row object is returned as-is (same reference) when the table
 * has no `url` column, when `row.url` is not a string, or when
 * normalization leaves the URL unchanged; otherwise a shallow copy with
 * the normalized URL is returned.
 *
 * @param {string} table Table name.
 * @param {object} row Row to normalize.
 * @returns {object}
 */
function normalizeRow(table, row) {
  const tableHasUrl = URL_COLUMNS.has(`${table}:url`);
  if (!tableHasUrl || typeof row.url !== "string") return row;

  const original = row.url;
  const normalized = normalizeUrl(original);
  return normalized === original ? row : { ...row, url: normalized };
}
1204
+ function createStorageEngine(opts) {
1205
+ const { dataSource, manifestStore, codec, executor } = opts;
1206
+ const defaultNow = opts.now ?? (() => Date.now());
1207
/**
 * Writes one day's rows for a table as a new object and registers it in
 * the manifest, superseding the live entries of the same partition that
 * share the same inferred search type. Runs under the manifest lock for
 * that (user, site, table, partition).
 *
 * @param {object} ctx Write context: `userId`, `siteId`, `table`, `date`, optional `searchType` and `now`.
 * @param {object[]} rows Rows to write; URL columns are normalized first.
 * @returns {Promise<*>} Whatever `manifestStore.withLock` resolves to.
 * @throws {Error} When `ctx.date` is missing, or when the written object exceeds `MAX_DAY_BYTES`.
 */
async function writeDay(ctx, rows) {
  if (!ctx.date) throw new Error("writeDay requires ctx.date");
  const date = ctx.date;
  const now = (ctx.now ?? defaultNow)();
  const partition = dayPartition(date);
  const searchType = ctx.searchType;
  return manifestStore.withLock({
    userId: ctx.userId,
    siteId: ctx.siteId,
    table: ctx.table,
    partition
  }, async () => {
    const superseding = (await manifestStore.listLive({
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table,
      partitions: [partition]
    })).filter((e) => inferSearchType(e) === inferSearchType({ searchType }));
    const normalizedRows = rows.map((r) => normalizeRow(ctx.table, r));
    const key = objectKey(ctx, ctx.table, partition, now, searchType);
    const { bytes: writtenBytes, rowCount } = await codec.writeRows({ table: ctx.table }, normalizedRows, key, dataSource);
    let bytes = writtenBytes;
    // Some codecs report 0 bytes even though rows were written; ask the data source for the real size.
    if (bytes === 0 && rowCount > 0 && dataSource.head) {
      const probed = await dataSource.head(key);
      if (probed) bytes = probed.bytes;
    }
    // Compare against the named constant so the check and the message can never disagree.
    if (bytes > MAX_DAY_BYTES) {
      // Best-effort cleanup of the oversized object; the size error below is what the caller must see.
      await dataSource.delete([key]).catch(() => {});
      throw new Error(`writeDay payload ${bytes} bytes exceeds ${MAX_DAY_BYTES} hard ceiling (table=${ctx.table}, key=${key})`);
    }
    const entry = {
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table,
      partition,
      objectKey: key,
      rowCount,
      bytes,
      createdAt: now,
      schemaVersion: currentSchemaVersion(ctx.table),
      tier: "raw",
      ...searchType !== undefined ? { searchType } : {}
    };
    await manifestStore.registerVersion(entry, superseding);
    await manifestStore.bumpWatermark({
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table
    }, date, now);
  });
}
1258
/**
 * Resolves each named file set to the object keys of its live manifest
 * entries and hands the SQL to the executor.
 *
 * @param {object} opts
 * @param {{userId: *, siteId: *}} opts.ctx Tenant the manifest lookups are scoped to.
 * @param {Record<string, {table: string, partitions: *}>} opts.fileSets Named file sets.
 * @param {string} opts.sql SQL text.
 * @param {unknown[]} [opts.params] Bound parameters (defaults to an empty list).
 * @param {string} [opts.table] Table passed to the executor; inferred when all file sets share one table.
 * @param {AbortSignal} [opts.signal] Checked before and after the manifest lookups.
 * @returns {Promise<{rows: *, sql: *, objectKeys: string[]}>}
 * @throws {Error} When the table cannot be determined.
 */
async function runSQL(opts) {
  opts.signal?.throwIfAborted();

  const entries = Object.entries(opts.fileSets);
  const perSet = await Promise.all(entries.map(async ([name, ref]) => {
    const live = await manifestStore.listLive({
      userId: opts.ctx.userId,
      siteId: opts.ctx.siteId,
      table: ref.table,
      partitions: ref.partitions
    });
    return [name, live.map((e) => e.objectKey)];
  }));
  opts.signal?.throwIfAborted();

  const fileKeys = {};
  const seenKeys = new Set();
  for (const [name, keys] of perSet) {
    fileKeys[name] = keys;
    for (const key of keys) seenKeys.add(key);
  }
  const uniqueKeys = [...seenKeys];

  let table = opts.table;
  if (!table) {
    const referencedTables = new Set(entries.map(([, ref]) => ref.table));
    if (referencedTables.size > 1) throw new Error("runSQL requires explicit ctx.table when fileSets reference multiple tables.");
    table = entries[0]?.[1].table;
  }
  if (!table) throw new Error("runSQL requires at least one fileSet or an explicit table");

  const { rows, sql: executedSql } = await executor.execute({
    sql: opts.sql,
    params: opts.params ?? [],
    fileKeys,
    dataSource,
    table,
    signal: opts.signal
  });
  return { rows, sql: executedSql, objectKeys: uniqueKeys };
}
1293
/**
 * Plans and runs a single query for a builder state. The plan is built
 * with regex support enabled, and the compiled plan's partitions become
 * the `FILES` file set.
 *
 * @param {object} ctx Query context: `userId`, `siteId`, optional `table` and `signal`.
 * @param {object} state Query builder state.
 * @returns {Promise<{rows: *, sql: *, objectKeys: string[]}>} Result of `runSQL`.
 */
async function query(ctx, state) {
  const plan = buildLogicalPlan(state, { regex: true });
  const table = ctx.table ?? plan.dataset;
  const { partitions, sql: text, params } = compileLogicalQueryPlan(plan, table);
  const tenant = { userId: ctx.userId, siteId: ctx.siteId };
  return runSQL({
    ctx: tenant,
    table,
    fileSets: { FILES: { table, partitions } },
    sql: text,
    params,
    signal: ctx.signal
  });
}
1312
/**
 * Runs a period-over-period comparison: the comparison rows, their total
 * count and the current period's totals are fetched concurrently over one
 * file set spanning from the earlier start date to the later end date.
 *
 * @param {object} ctx Query context: `userId`, `siteId`, optional `table` and `signal`.
 * @param {object} current Builder state for the current period.
 * @param {object} previous Builder state for the previous period.
 * @param {string} [filter] Comparison filter key forwarded to `resolveComparisonSQL`.
 * @returns {Promise<{rows: *, totalCount: number, totals: object}>}
 * @throws {Error} When the two states resolve to different datasets.
 */
async function queryComparison(ctx, current, previous, filter) {
  const adapter = createParquetResolverAdapter();
  const currentPlan = buildLogicalPlan(current, adapter.capabilities);
  const previousPlan = buildLogicalPlan(previous, adapter.capabilities);
  if (currentPlan.dataset !== previousPlan.dataset) throw new Error(`queryComparison: current (${currentPlan.dataset}) and previous (${previousPlan.dataset}) must resolve to the same table`);

  const table = ctx.table ?? currentPlan.dataset;
  const resolverOptions = { adapter, siteId: undefined };
  const comparison = resolveComparisonSQL(current, previous, resolverOptions, filter);
  const totals = buildTotalsSql(current, { adapter, siteId: undefined });

  // Dates are compared with </>, exactly as the plan supplies them.
  const currentRange = currentPlan.dateRange;
  const previousRange = previousPlan.dateRange;
  const earliestStart = currentRange.startDate < previousRange.startDate ? currentRange.startDate : previousRange.startDate;
  const latestEnd = currentRange.endDate > previousRange.endDate ? currentRange.endDate : previousRange.endDate;
  const fileSets = { FILES: { table, partitions: enumeratePartitions(earliestStart, latestEnd) } };

  const baseCtx = { userId: ctx.userId, siteId: ctx.siteId };
  const run = (text, params) => runSQL({
    ctx: baseCtx,
    table,
    fileSets,
    sql: text,
    params,
    signal: ctx.signal
  });

  const [main, count, totalsRow] = await Promise.all([
    run(comparison.sql, comparison.params),
    run(comparison.countSql, comparison.countParams),
    run(totals.sql, totals.params)
  ]);
  return {
    rows: main.rows,
    totalCount: Number(count.rows[0]?.total ?? 0),
    totals: totalsRow.rows[0] ?? {}
  };
}
1366
/**
 * Runs every extras query that applies to a builder state and returns the
 * rows keyed by the extra's name.
 *
 * @param {object} ctx Query context: `userId`, `siteId`, optional `table` and `signal`.
 * @param {object} state Query builder state.
 * @returns {Promise<Array<{key: string, rows: *}>>} Empty when no extras apply.
 */
async function queryExtras(ctx, state) {
  const adapter = createParquetResolverAdapter();
  const extras = buildExtrasQueries(state, { adapter, siteId: undefined });
  if (extras.length === 0) return [];

  const plan = buildLogicalPlan(state, adapter.capabilities);
  const table = ctx.table ?? plan.dataset;
  const { startDate, endDate } = plan.dateRange;
  const fileSets = { FILES: { table, partitions: enumeratePartitions(startDate, endDate) } };
  const baseCtx = { userId: ctx.userId, siteId: ctx.siteId };

  const pending = extras.map((extra) => runSQL({
    ctx: baseCtx,
    table,
    fileSets,
    sql: extra.sql,
    params: extra.params,
    signal: ctx.signal
  }));
  const results = await Promise.all(pending);
  return extras.map((extra, index) => ({ key: extra.key, rows: results[index].rows }));
}
1396
/**
 * Delegates to `compactTieredImpl` with this engine's data source,
 * manifest store and codec, using `ctx.now` as the clock when provided.
 *
 * @param {object} ctx Compaction context (optionally carrying `now`).
 * @param {object} thresholds Forwarded unchanged.
 * @returns {Promise<*>} Result of `compactTieredImpl`.
 */
async function compactTiered(ctx, thresholds) {
  const deps = { dataSource, manifestStore, codec };
  const clock = ctx.now ?? defaultNow;
  return compactTieredImpl(deps, ctx, clock(), thresholds);
}
1403
/**
 * Delegates to `gcOrphansImpl` for one tenant, using `ctx.now` as the
 * clock when provided.
 *
 * @param {object} ctx Context with `userId`, `siteId` and optional `now`.
 * @param {number} graceMs Forwarded unchanged.
 * @returns {Promise<*>} Result of `gcOrphansImpl`.
 */
async function gcOrphans(ctx, graceMs) {
  const deps = { dataSource, manifestStore };
  const clock = ctx.now ?? defaultNow;
  const tenant = { userId: ctx.userId, siteId: ctx.siteId };
  return gcOrphansImpl(deps, clock(), graceMs, tenant);
}
1412
/**
 * Deletes every stored object under the tenant's prefix and then purges
 * the tenant from the manifest store.
 *
 * @param {object} ctx Context with `userId` and `siteId`.
 * @returns {Promise<object>} Summary: prefix, number of objects deleted, manifest removal counts and a timestamp.
 */
async function purgeTenant(ctx) {
  const prefix = tenantPrefix(ctx);

  // Prefer the streaming listing when the data source offers one.
  const keys = [];
  if (dataSource.streamList) {
    for await (const key of dataSource.streamList(prefix)) keys.push(key);
  } else {
    for (const key of await dataSource.list(prefix)) keys.push(key);
  }
  if (keys.length > 0) await dataSource.delete(keys);

  const { entriesRemoved, watermarksRemoved, syncStatesRemoved } = await manifestStore.purgeTenant({
    userId: ctx.userId,
    siteId: ctx.siteId
  });
  return {
    userId: ctx.userId,
    siteId: ctx.siteId,
    prefix,
    objectsDeleted: keys.length,
    entriesRemoved,
    watermarksRemoved,
    syncStatesRemoved,
    at: defaultNow()
  };
}
1435
/**
 * Removes every row whose `url` matches one of `urls` from the live
 * objects of the URL-bearing tables (`URL_PURGE_TABLES`).
 *
 * Each affected manifest entry is rewritten under its partition lock: the
 * surviving rows are written to a new object which is registered as
 * superseding the old entry. Entries without matching rows are left alone.
 *
 * Rows are written through `normalizeUrl` by `writeDay`, so each requested
 * URL is matched both verbatim and in its normalized form; otherwise a
 * request using a non-canonical spelling would silently purge nothing.
 *
 * @param {object} ctx Context with `userId` and `siteId`.
 * @param {Iterable<string>} urls URLs to purge.
 * @returns {Promise<object>} Summary: distinct URLs requested, entries rewritten, rows removed, bytes of rewritten objects, timestamp.
 */
async function purgeUrls(ctx, urls) {
  const now = defaultNow();
  const urlSet = new Set(urls);
  let entriesRewritten = 0;
  let rowsRemoved = 0;
  let bytesAfter = 0;
  if (urlSet.size === 0) return {
    userId: ctx.userId,
    siteId: ctx.siteId,
    urlsRequested: 0,
    entriesRewritten: 0,
    rowsRemoved: 0,
    bytesAfter: 0,
    at: now
  };

  // Match on the requested spelling and on the stored (normalized) spelling.
  const matchSet = new Set(urlSet);
  for (const requested of urlSet) {
    if (typeof requested === "string") matchSet.add(normalizeUrl(requested));
  }

  for (const table of URL_PURGE_TABLES) {
    const entries = await manifestStore.listLive({
      userId: ctx.userId,
      siteId: ctx.siteId,
      table
    });
    for (const entry of entries) await manifestStore.withLock({
      userId: entry.userId,
      siteId: entry.siteId,
      table,
      partition: entry.partition
    }, async () => {
      const rows = await codec.readRows({ table }, entry.objectKey, dataSource);
      const kept = rows.filter((r) => typeof r.url !== "string" || !matchSet.has(r.url));
      const removed = rows.length - kept.length;
      if (removed === 0) return;
      const searchType = entry.searchType;
      const newKey = objectKey({
        userId: entry.userId,
        siteId: entry.siteId
      }, table, entry.partition, now, searchType);
      const { bytes, rowCount } = await codec.writeRows({ table }, kept, newKey, dataSource);
      const newEntry = {
        userId: entry.userId,
        siteId: entry.siteId,
        table,
        partition: entry.partition,
        objectKey: newKey,
        rowCount,
        bytes,
        createdAt: now,
        schemaVersion: entry.schemaVersion ?? currentSchemaVersion(table),
        // Carry tier and search type over only when the old entry had them.
        ...entry.tier !== undefined ? { tier: entry.tier } : {},
        ...searchType !== undefined ? { searchType } : {}
      };
      await manifestStore.registerVersion(newEntry, [entry]);
      entriesRewritten++;
      rowsRemoved += removed;
      bytesAfter += bytes;
    });
  }
  return {
    userId: ctx.userId,
    siteId: ctx.siteId,
    urlsRequested: urlSet.size,
    entriesRewritten,
    rowsRemoved,
    bytesAfter,
    at: now
  };
}
1501
+ return {
1502
+ writeDay,
1503
+ query,
1504
+ queryComparison,
1505
+ queryExtras,
1506
+ runSQL,
1507
+ compactTiered,
1508
+ gcOrphans,
1509
+ purgeTenant,
1510
+ purgeUrls,
1511
+ listLive: (filter) => manifestStore.listLive(filter),
1512
+ listAll: (filter) => manifestStore.listAll(filter),
1513
+ getWatermarks: (filter) => manifestStore.getWatermarks(filter),
1514
+ getSyncStates: (filter) => manifestStore.getSyncStates(filter),
1515
+ setSyncState: (scope, state, detail) => manifestStore.setSyncState(scope, state, detail),
1516
+ readObject: (key) => dataSource.read(key)
1517
+ };
1518
+ }
1519
/**
 * Normalizes a query result into a plain array of rows.
 *
 * Accepts either an array or an object exposing `toArray()`; anything else
 * yields an empty array. When the first row exposes `toJSON()`, every row
 * is converted through it; otherwise the rows are returned as they are.
 *
 * @param {*} result Query result.
 * @returns {Array<*>}
 */
function arrowToRows(result) {
  let rows;
  if (Array.isArray(result)) rows = result;
  else if (typeof result?.toArray === "function") rows = result.toArray();
  else rows = [];

  if (!rows || rows.length === 0) return [];

  const firstRow = rows[0];
  if (typeof firstRow?.toJSON === "function") return rows.map((row) => row.toJSON());
  return rows;
}
1526
// CommonJS-style resolver anchored at this module (falls back to the working directory).
const require_ = createRequire(typeof __filename !== "undefined" ? __filename : typeof import.meta !== "undefined" ? fileURLToPath(import.meta.url) : process.cwd());
// Cached initialization promise shared by all handles; null until first use.
let singleton = null;

/**
 * Resolves the on-disk locations of the two DuckDB WASM builds.
 *
 * @returns {{mvp: {mainModule: string, mainWorker: null}, eh: {mainModule: string, mainWorker: null}}}
 */
function bundles() {
  const bundleFor = (specifier) => ({
    mainModule: require_.resolve(specifier),
    mainWorker: null
  });
  const mvp = bundleFor("@duckdb/duckdb-wasm/dist/duckdb-mvp.wasm");
  const eh = bundleFor("@duckdb/duckdb-wasm/dist/duckdb-eh.wasm");
  return { mvp, eh };
}
1540
/**
 * Creates and instantiates a DuckDB instance and opens one connection.
 *
 * @param {{verbose?: boolean}} opts `verbose` selects a console logger instead of a silent one.
 * @returns {Promise<{db: object, conn: object}>}
 */
async function initialize(opts) {
  let logger;
  if (opts.verbose) logger = new ConsoleLogger();
  else logger = new VoidLogger();

  const db = await createDuckDB(bundles(), logger, NODE_RUNTIME);
  await db.instantiate();
  const conn = db.connect();
  return { db, conn };
}
1549
/**
 * Returns a handle onto the process-wide DuckDB instance, starting its
 * initialization on first use.
 *
 * `opts` only takes effect on the call that actually starts
 * initialization; later calls reuse the cached instance. If initialization
 * fails, the cache is cleared so that a later call can retry instead of
 * receiving the same rejected promise forever. Handles created before the
 * failure keep rejecting.
 *
 * @param {{verbose?: boolean}} [opts]
 * @returns {object} Handle with `query`, `registerFileBuffer`, `copyFileToBuffer`, `dropFiles`, `makeTempPath`.
 */
function createNodeDuckDBHandle(opts = {}) {
  if (!singleton) {
    const pending = initialize(opts);
    singleton = pending;
    // Forget a failed initialization (unless a newer attempt has already replaced it).
    pending.catch(() => {
      if (singleton === pending) singleton = null;
    });
  }
  // Bind this handle to the attempt that was current when it was created.
  const ready = singleton;
  return {
    /** Runs `sql`, as a prepared statement when `params` is non-empty, and returns plain rows. */
    async query(sql, params) {
      const { conn } = await ready;
      if (!params || params.length === 0) return arrowToRows(conn.query(sql));
      const stmt = conn.prepare(sql);
      try {
        return arrowToRows(stmt.query(...params));
      } finally {
        stmt.close();
      }
    },
    async registerFileBuffer(name, bytes) {
      const { db } = await ready;
      db.registerFileBuffer(name, bytes);
    },
    async copyFileToBuffer(name) {
      const { db } = await ready;
      return db.copyFileToBuffer(name);
    },
    /** Best-effort cleanup: each name is dropped from DuckDB and unlinked from disk, ignoring failures. */
    async dropFiles(names) {
      const { db } = await ready;
      for (const name of names) {
        try {
          db.dropFile(name);
        } catch {
          // Ignored on purpose: cleanup must not fail because DuckDB could not drop the file.
        }
        try {
          unlinkSync(name);
        } catch {
          // Ignored on purpose: cleanup must not fail because the path could not be unlinked.
        }
      }
    },
    /** Random file path inside the OS temp directory with the given extension. */
    makeTempPath(ext) {
      return join(tmpdir(), `gscdump-${Math.random().toString(36).slice(2, 10)}.${ext}`);
    }
  };
}
1586
/**
 * Data source backed by a directory on the local filesystem. Keys are
 * paths relative to `opts.rootDir`; every key and listing prefix is
 * checked so it cannot resolve to a location outside that root.
 *
 * @param {{rootDir: string}} opts
 * @returns {object} Data source with `read`, `write`, `delete`, `list`, `streamList`, `head`, `uri`.
 */
function createFilesystemDataSource(opts) {
  const root = resolve(opts.rootDir);
  // Root followed by exactly one platform separator. Built with join() so it
  // is correct on Windows and when the root is the filesystem root itself.
  const rootWithSep = join(root, "/");

  /**
   * Absolute path for a key.
   * @throws {Error} When the key resolves outside the root directory.
   */
  function pathFor(key) {
    const resolved = resolve(root, key);
    if (!resolved.startsWith(rootWithSep) && resolved !== root) throw new Error(`path escapes root: ${key}`);
    return resolved;
  }

  return {
    /** Reads the whole object, or the `{offset, length}` slice of it when `range` is given. */
    async read(key, range, signal) {
      const bytes = await readFile(pathFor(key), { signal });
      if (!range) return new Uint8Array(bytes.buffer, bytes.byteOffset, bytes.byteLength);
      const sliced = bytes.subarray(range.offset, range.offset + range.length);
      return new Uint8Array(sliced.buffer, sliced.byteOffset, sliced.byteLength);
    },
    /** Writes the object, creating parent directories as needed. */
    async write(key, bytes) {
      const path = pathFor(key);
      await mkdir(dirname(path), { recursive: true });
      await writeFile(path, Buffer.from(bytes));
    },
    /** Removes the given keys; keys that do not exist are ignored. */
    async delete(keys) {
      await Promise.all(keys.map(async (k) => {
        await rm(pathFor(k), { force: true });
      }));
    },
    /** Lists every key under the directory named by `prefix`. */
    async list(prefix) {
      // Same containment check as for keys: a prefix such as "../" must not list outside the root.
      const full = pathFor(prefix);
      const out = [];
      await walk(full, out);
      return out.map((p) => p.slice(root.length + 1));
    },
    /** Streaming variant of `list`. */
    async *streamList(prefix) {
      const full = pathFor(prefix);
      for await (const p of walkStream(full)) yield p.slice(root.length + 1);
    },
    /** Size of the object, or undefined when it does not exist. */
    async head(key) {
      return stat(pathFor(key)).then((s) => ({ bytes: s.size }), (err) => {
        if (err.code === "ENOENT") return undefined;
        throw err;
      });
    },
    /** Absolute filesystem path of the object. */
    uri(key) {
      return pathFor(key);
    }
  };
}
1631
/**
 * Recursively yields the path of every non-directory entry below `dir`,
 * descending into each subdirectory as it is encountered.
 * A missing directory (ENOENT) yields nothing; other errors propagate.
 *
 * @param {string} dir - Directory to traverse.
 * @returns {AsyncGenerator<string>} Paths formed by joining `dir` with entry names.
 */
async function* walkStream(dir) {
  let children;
  try {
    children = await readdir(dir, { withFileTypes: true });
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    children = [];
  }
  for (const child of children) {
    const childPath = join(dir, String(child.name));
    if (!child.isDirectory()) {
      yield childPath;
      continue;
    }
    yield* walkStream(childPath);
  }
}
1642
/**
 * Recursively collects the path of every non-directory entry below `dir`
 * into `out`, descending into each subdirectory as it is encountered.
 * A missing directory (ENOENT) contributes nothing; other errors propagate.
 *
 * @param {string} dir - Directory to traverse.
 * @param {string[]} out - Array that receives the discovered paths.
 * @returns {Promise<void>}
 */
async function walk(dir, out) {
  let children;
  try {
    children = await readdir(dir, { withFileTypes: true });
  } catch (err) {
    if (err.code !== "ENOENT") throw err;
    children = [];
  }
  for (const child of children) {
    const childPath = join(dir, String(child.name));
    if (!child.isDirectory()) {
      out.push(childPath);
      continue;
    }
    await walk(childPath, out);
  }
}
1653
/**
 * Builds the map key for a watermark: `userId|siteId|table`, with a nullish
 * `siteId` rendered as an empty segment.
 *
 * @param {{ userId: *, siteId?: *, table: * }} w
 * @returns {string}
 */
function watermarkKey(w) {
  const { userId, siteId, table } = w;
  const site = siteId ?? "";
  return `${userId}|${site}|${table}`;
}
1656
/**
 * Tests a watermark against a filter. `userId` must always match; `siteId`
 * and `table` are only compared when the filter defines them.
 *
 * @param {{ userId: *, siteId?: *, table: * }} w
 * @param {{ userId: *, siteId?: *, table?: * }} filter
 * @returns {boolean}
 */
function matchesWatermarkFilter(w, filter) {
  const sameUser = w.userId === filter.userId;
  const siteOk = filter.siteId === void 0 || w.siteId === filter.siteId;
  const tableOk = filter.table === void 0 || w.table === filter.table;
  return sameUser && siteOk && tableOk;
}
1662
/**
 * Builds the map key for a sync-state record:
 * `userId|siteId|table|date|searchType`, where a nullish `siteId` becomes an
 * empty segment and the search type comes from `inferSearchType`.
 *
 * @param {{ userId: *, siteId?: *, table: *, date: * }} s
 * @returns {string}
 */
function syncStateKey(s) {
  const site = s.siteId ?? "";
  const parts = `${s.userId}|${site}|${s.table}|${s.date}`;
  return `${parts}|${inferSearchType(s)}`;
}
1665
/**
 * Tests a sync-state record against a filter. `userId` must always match;
 * `siteId`, `table`, `state` and `searchType` are only compared when the
 * filter defines them. The record's search type is obtained through
 * `inferSearchType`.
 *
 * @param {object} s - Sync-state record.
 * @param {object} filter - Filter with a required `userId`.
 * @returns {boolean}
 */
function matchesSyncStateFilter(s, filter) {
  return s.userId === filter.userId
    && (filter.siteId === void 0 || s.siteId === filter.siteId)
    && (filter.table === void 0 || s.table === filter.table)
    && (filter.state === void 0 || s.state === filter.state)
    && (filter.searchType === void 0 || inferSearchType(s) === filter.searchType);
}
1673
/**
 * Produces the next sync-state record for `scope`.
 *
 * - The timestamp is `detail.at` when provided, otherwise `Date.now()`.
 * - `attempts` grows by one only when the new state is `"inflight"`.
 * - Without an existing record a fresh one is built from `scope`;
 *   `searchType` is copied only when the scope defines it.
 * - With an existing record its fields are kept and `state`, `updatedAt`,
 *   `attempts` and `error` are overwritten. `error` is cleared on `"done"`,
 *   otherwise it is `detail.error` falling back to the previous error.
 *
 * @param {object|undefined} existing - Current record, if any.
 * @param {object} scope - Identifies the record (userId, siteId, table, date, searchType?).
 * @param {string} state - New state.
 * @param {{ at?: number, error?: * }} [detail]
 * @returns {object} The new record; inputs are not mutated.
 */
function mergeSyncState(existing, scope, state, detail) {
  const updatedAt = detail?.at ?? Date.now();
  const increment = state === "inflight" ? 1 : 0;
  if (existing) {
    let error;
    if (state !== "done") error = detail?.error ?? existing.error;
    return {
      ...existing,
      state,
      updatedAt,
      attempts: existing.attempts + increment,
      error
    };
  }
  const created = {
    userId: scope.userId,
    siteId: scope.siteId,
    table: scope.table,
    date: scope.date,
    state,
    updatedAt,
    attempts: increment,
    error: detail?.error
  };
  if (scope.searchType !== void 0) created.searchType = scope.searchType;
  return created;
}
1695
/**
 * Tests a manifest entry against a filter. `userId` must always match;
 * `siteId`, `table` and `tier` are only compared when the filter defines
 * them, and `partitions` (when truthy) must include the entry's partition.
 * The entry's tier is obtained through `inferLegacyTier`.
 *
 * @param {object} entry - Manifest entry.
 * @param {object} filter - Filter with a required `userId`.
 * @returns {boolean}
 */
function matchesFilter(entry, filter) {
  return entry.userId === filter.userId
    && (filter.siteId === void 0 || entry.siteId === filter.siteId)
    && (filter.table === void 0 || entry.table === filter.table)
    && (!filter.partitions || filter.partitions.includes(entry.partition))
    && (filter.tier === void 0 || inferLegacyTier(entry) === filter.tier);
}
1703
/**
 * Returns the lock-file path for a scope inside `locksDir`.
 * The file name is `userId|siteId|table|partition` (nullish `siteId` becomes
 * an empty segment) with every character outside `[A-Za-z0-9_.-]` replaced
 * by `_`, followed by `.lock`.
 *
 * @param {string} locksDir
 * @param {{ userId: *, siteId?: *, table: *, partition: * }} scope
 * @returns {string}
 */
function lockFileFor(locksDir, scope) {
  const rawName = `${scope.userId}|${scope.siteId ?? ""}|${scope.table}|${scope.partition}`;
  const safeName = rawName.replace(/[^\w.-]/g, "_");
  return join(locksDir, `${safeName}.lock`);
}
1706
/**
 * Creates a manifest store persisted as a single JSON file at `opts.path`.
 *
 * The file holds `{ version: 1, entries, watermarks?, syncStates? }`.
 * All mutations are serialized through an in-process queue and written with
 * a temp-file + rename so readers never observe a partially written file.
 * Lock files live in a `locks` directory next to the manifest.
 *
 * @param {{ path: string }} opts - Location of the manifest file.
 * @returns {object} Store exposing listing, registration, deletion,
 *   watermark, sync-state, locking and tenant-purge operations.
 */
function createFilesystemManifestStore(opts) {
  const manifestPath = resolve(opts.path);
  const locksDir = join(dirname(manifestPath), "locks");
  /** Reads the manifest; a missing file yields an empty version-1 manifest. */
  async function load() {
    const content = await readFile(manifestPath, "utf8").catch((err) => {
      if (err.code === "ENOENT") return null;
      throw err;
    });
    if (content === null) return {
      version: 1,
      entries: []
    };
    const parsed = JSON.parse(content);
    if (parsed.version !== 1) throw new Error(`unsupported manifest version ${parsed.version}`);
    return parsed;
  }
  /** Writes the manifest to a uniquely named temp file, then renames it into place. */
  async function save(data) {
    await mkdir(dirname(manifestPath), { recursive: true });
    const tmp = `${manifestPath}.${randomBytes(6).toString("hex")}.tmp`;
    try {
      await writeFile(tmp, JSON.stringify(data), "utf8");
      await rename(tmp, manifestPath);
    } catch (err) {
      // Remove the temp file whether the write or the rename failed.
      await unlink(tmp).catch(() => {});
      throw err;
    }
  }
  const queue = [];
  let running = false;
  /** Queues `fn` behind pending mutations and settles with its outcome. */
  function enqueue(fn) {
    return new Promise((resolvePromise, rejectPromise) => {
      queue.push(async () => {
        // try/await also covers a task that throws synchronously or returns
        // a plain value, so the caller's promise always settles.
        try {
          resolvePromise(await fn());
        } catch (err) {
          rejectPromise(err);
        }
      });
      drain();
    });
  }
  /** Runs queued tasks one at a time; a no-op while a drain is in progress. */
  async function drain() {
    if (running) return;
    running = true;
    while (queue.length > 0) await queue.shift()().catch(() => {});
    running = false;
  }
  /** Entries are identified by their `objectKey`. */
  function entryKey(e) {
    return e.objectKey;
  }
  /**
   * Adds or replaces `newEntries`. Any live entry listed in `superseding` is
   * marked retired at the first new entry's `createdAt` (or now).
   */
  async function registerVersionsImpl(newEntries, superseding) {
    const data = await load();
    const supersededAt = newEntries[0]?.createdAt ?? Date.now();
    const byKey = new Map(data.entries.map((e) => [entryKey(e), e]));
    if (superseding) for (const s of superseding) {
      const existing = byKey.get(entryKey(s));
      if (existing && existing.retiredAt === void 0) byKey.set(entryKey(s), {
        ...existing,
        retiredAt: supersededAt
      });
    }
    for (const e of newEntries) byKey.set(entryKey(e), e);
    data.entries = Array.from(byKey.values());
    await save(data);
  }
  return {
    /** Entries matching `filter` that have not been retired. */
    async listLive(filter) {
      return (await load()).entries.filter((e) => e.retiredAt === void 0 && matchesFilter(e, filter));
    },
    /** Entries matching `filter`, retired or not. */
    async listAll(filter) {
      return (await load()).entries.filter((e) => matchesFilter(e, filter));
    },
    /** Registers one entry, optionally retiring the `superseding` entries. */
    async registerVersion(entry, superseding) {
      return enqueue(() => registerVersionsImpl([entry], superseding));
    },
    /** Registers several entries in a single manifest write. */
    async registerVersions(entries, superseding) {
      return enqueue(() => registerVersionsImpl(entries, superseding));
    },
    /** Entries whose `retiredAt` is set and not later than `olderThan`. */
    async listRetired(olderThan) {
      return (await load()).entries.filter((e) => e.retiredAt !== void 0 && e.retiredAt <= olderThan);
    },
    /** Removes the given entries (matched by `objectKey`) from the manifest. */
    async delete(toDelete) {
      return enqueue(async () => {
        const data = await load();
        const toDeleteKeys = new Set(toDelete.map(entryKey));
        data.entries = data.entries.filter((e) => !toDeleteKeys.has(entryKey(e)));
        await save(data);
      });
    },
    /** Watermarks matching `filter`. */
    async getWatermarks(filter) {
      return ((await load()).watermarks ?? []).filter((w) => matchesWatermarkFilter(w, filter));
    },
    /** Sync-state records matching `filter`. */
    async getSyncStates(filter) {
      return ((await load()).syncStates ?? []).filter((s) => matchesSyncStateFilter(s, filter));
    },
    /** Upserts the sync-state record for `scope` via `mergeSyncState`. */
    async setSyncState(scope, state, detail) {
      return enqueue(async () => {
        const data = await load();
        const key = syncStateKey(scope);
        const byKey = new Map((data.syncStates ?? []).map((s) => [syncStateKey(s), s]));
        byKey.set(key, mergeSyncState(byKey.get(key), scope, state, detail));
        data.syncStates = Array.from(byKey.values());
        await save(data);
      });
    },
    /**
     * Runs `fn` while holding the lock file for `scope` and returns its
     * result. The lock is released even when `fn` throws synchronously or
     * returns a non-promise value; release failures are ignored.
     */
    async withLock(scope, fn) {
      await mkdir(locksDir, { recursive: true });
      const lockPath = lockFileFor(locksDir, scope);
      // Append-mode write creates the file if needed without truncating it.
      await writeFile(lockPath, "", { flag: "a" });
      const release = await lock(lockPath, {
        realpath: false,
        stale: 3e4,
        retries: {
          retries: 20,
          minTimeout: 50,
          maxTimeout: 500,
          factor: 1.5
        }
      });
      try {
        return await fn();
      } finally {
        await release().catch(() => {});
      }
    },
    /**
     * Removes every entry, watermark and sync state belonging to
     * `filter.userId` (restricted to `filter.siteId` when given) and reports
     * how many records of each kind were removed.
     */
    async purgeTenant(filter) {
      return enqueue(async () => {
        const data = await load();
        const matches = (r) => r.userId === filter.userId && (filter.siteId === void 0 || r.siteId === filter.siteId);
        const before = {
          entries: data.entries.length,
          watermarks: (data.watermarks ?? []).length,
          syncStates: (data.syncStates ?? []).length
        };
        data.entries = data.entries.filter((e) => !matches(e));
        data.watermarks = (data.watermarks ?? []).filter((w) => !matches(w));
        data.syncStates = (data.syncStates ?? []).filter((s) => !matches(s));
        await save(data);
        return {
          entriesRemoved: before.entries - data.entries.length,
          watermarksRemoved: before.watermarks - data.watermarks.length,
          syncStatesRemoved: before.syncStates - data.syncStates.length
        };
      });
    },
    /**
     * Widens the watermark for `scope` so it covers `date`, and moves
     * `lastSyncAt` forward to `at` (or now) when that is later.
     */
    async bumpWatermark(scope, date, at) {
      return enqueue(async () => {
        const data = await load();
        const key = watermarkKey(scope);
        const byKey = new Map((data.watermarks ?? []).map((w) => [watermarkKey(w), w]));
        const existing = byKey.get(key);
        const nowMs = at ?? Date.now();
        const next = existing ? {
          ...existing,
          newestDateSynced: date > existing.newestDateSynced ? date : existing.newestDateSynced,
          oldestDateSynced: date < existing.oldestDateSynced ? date : existing.oldestDateSynced,
          lastSyncAt: nowMs > existing.lastSyncAt ? nowMs : existing.lastSyncAt
        } : {
          userId: scope.userId,
          siteId: scope.siteId,
          table: scope.table,
          newestDateSynced: date,
          oldestDateSynced: date,
          lastSyncAt: nowMs
        };
        byKey.set(key, next);
        data.watermarks = Array.from(byKey.values());
        await save(data);
      });
    }
  };
}
1868
/**
 * Wires together a storage engine that keeps its data under `opts.dataDir`
 * and executes SQL through the shared Node DuckDB handle.
 *
 * @param {{ dataDir: string, userId?: string, manifestFilename?: string }} opts
 *   `userId` defaults to `"local"`, `manifestFilename` to `"manifest.json"`.
 * @returns {{ engine: object, dataSource: object, dataDir: string,
 *   userId: string, siteIdFor: Function, runRawSql: Function }}
 */
function createNodeHarness(opts) {
  const { dataDir } = opts;
  const userId = opts.userId ?? "local";
  const manifestFilename = opts.manifestFilename ?? "manifest.json";
  const handle = createNodeDuckDBHandle();
  // Both the codec and the executor resolve the same DuckDB handle.
  const factory = {
    async getDuckDB() {
      return handle;
    }
  };
  const dataSource = createFilesystemDataSource({ rootDir: dataDir });
  const manifestStore = createFilesystemManifestStore({ path: path.join(dataDir, manifestFilename) });
  const codec = createDuckDBCodec(factory);
  const executor = createDuckDBExecutor(factory);
  const engine = createStorageEngine({
    dataSource,
    manifestStore,
    codec,
    executor
  });
  /**
   * Runs `runOpts.sql` against the files of `runOpts.table` for the site
   * identified by `runOpts.siteUrl`; `params` defaults to an empty list.
   * Returns the rows, the SQL reported by the engine and the object keys used.
   */
  async function runRawSql(runOpts) {
    const ctx = {
      userId,
      siteId: encodeSiteId(runOpts.siteUrl)
    };
    const { rows, sql, objectKeys } = await engine.runSQL({
      ctx,
      table: runOpts.table,
      fileSets: { FILES: { table: runOpts.table } },
      sql: runOpts.sql,
      params: runOpts.params ?? []
    });
    return {
      rows,
      sql,
      keys: objectKeys
    };
  }
  return {
    engine,
    dataSource,
    dataDir,
    userId,
    siteIdFor: encodeSiteId,
    runRawSql
  };
}
1907
+ export { createNodeHarness };