@gscdump/engine 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/dist/_chunks/compiler.mjs +288 -0
  2. package/dist/_chunks/duckdb.d.mts +26 -0
  3. package/dist/_chunks/engine.mjs +578 -0
  4. package/dist/_chunks/pg-adapter.mjs +676 -0
  5. package/dist/_chunks/planner.d.mts +15 -0
  6. package/dist/_chunks/schema.d.mts +1258 -0
  7. package/dist/_chunks/schema.mjs +139 -0
  8. package/dist/_chunks/storage.d.mts +476 -0
  9. package/dist/_chunks/storage.mjs +39 -0
  10. package/dist/_chunks/types.d.mts +53 -0
  11. package/dist/adapters/duckdb-node.d.mts +1 -13
  12. package/dist/adapters/duckdb-node.mjs +1 -7
  13. package/dist/adapters/filesystem.d.mts +1 -193
  14. package/dist/adapters/filesystem.mjs +2 -9
  15. package/dist/adapters/http.d.mts +1 -193
  16. package/dist/adapters/http.mjs +1 -5
  17. package/dist/adapters/hyparquet.d.mts +6 -83
  18. package/dist/adapters/hyparquet.mjs +1 -105
  19. package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
  20. package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
  21. package/dist/adapters/inspection-sqlite-node.mjs +1 -1
  22. package/dist/adapters/node-harness.d.mts +3 -306
  23. package/dist/adapters/node-harness.mjs +4 -1866
  24. package/dist/adapters/r2-manifest.d.mts +4 -149
  25. package/dist/adapters/r2-manifest.mjs +1 -8
  26. package/dist/adapters/r2.d.mts +1 -47
  27. package/dist/contracts.d.mts +1 -435
  28. package/dist/entities.d.mts +1 -47
  29. package/dist/index.d.mts +8 -1844
  30. package/dist/index.mjs +8 -1962
  31. package/dist/ingest.d.mts +1 -1
  32. package/dist/planner.d.mts +3 -16
  33. package/dist/planner.mjs +1 -320
  34. package/dist/resolver/index.d.mts +3 -51
  35. package/dist/resolver/index.mjs +2 -780
  36. package/dist/rollups.d.mts +6 -51
  37. package/dist/rollups.mjs +2 -209
  38. package/dist/schema.d.mts +2 -1258
  39. package/dist/schema.mjs +1 -138
  40. package/package.json +2 -2
package/dist/index.mjs CHANGED
@@ -1,9 +1,11 @@
1
- import { buildLogicalComparisonPlan, buildLogicalPlan } from "gscdump/query/plan";
2
- import { MS_PER_DAY, toIsoDate } from "gscdump";
3
- import { PgDialect, date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
4
- import { normalizeUrl } from "gscdump/normalize";
5
- import { sql } from "drizzle-orm";
6
- import { SQLiteAsyncDialect } from "drizzle-orm/sqlite-core";
1
+ import { a as inferTable, c as countries, d as keywords, f as page_keywords, i as dimensionToColumn, l as devices, n as allTables, p as pages, r as currentSchemaVersion, s as TABLE_METADATA, t as SCHEMAS, u as drizzleSchema } from "./_chunks/schema.mjs";
2
+ import { a as mondayOfWeek, c as quarterOfMonth, d as weekPartition, i as inferSearchType, l as quarterPartition, n as dayPartition, o as monthPartition, r as inferLegacyTier, s as objectKey, t as DEFAULT_SEARCH_TYPE } from "./_chunks/storage.mjs";
3
+ import { i as substituteNamedFiles, o as enumeratePartitions, r as resolveToSQL, t as FILES_PLACEHOLDER } from "./_chunks/compiler.mjs";
4
+ import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
5
+ import { a as createDuckDBExecutor, i as createDuckDBCodec, n as createStorageEngine, r as canonicalEmptyParquetSchema, t as MAX_DAY_BYTES } from "./_chunks/engine.mjs";
6
+ import { createInspectionStoreSqlite, inspectionSqliteKey } from "./entities.mjs";
7
+ import { createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
8
+ import "./planner.mjs";
7
9
  function coerceRow(row) {
8
10
  let mutated = null;
9
11
  for (const [k, v] of Object.entries(row)) if (typeof v === "bigint") {
@@ -17,1960 +19,4 @@ function coerceRows(rows) {
17
19
  for (let i = 0; i < rows.length; i++) out[i] = coerceRow(rows[i]);
18
20
  return out;
19
21
  }
20
- function metricCols() {
21
- return {
22
- clicks: integer("clicks").notNull(),
23
- impressions: integer("impressions").notNull(),
24
- sum_position: doublePrecision("sum_position").notNull()
25
- };
26
- }
27
- const dateCol = () => date("date").notNull();
28
- const pages = pgTable("pages", {
29
- url: varchar("url").notNull(),
30
- date: dateCol(),
31
- ...metricCols()
32
- });
33
- const keywords = pgTable("keywords", {
34
- query: varchar("query").notNull(),
35
- query_canonical: varchar("query_canonical"),
36
- date: dateCol(),
37
- ...metricCols()
38
- });
39
- const countries = pgTable("countries", {
40
- country: varchar("country").notNull(),
41
- date: dateCol(),
42
- ...metricCols()
43
- });
44
- const devices = pgTable("devices", {
45
- device: varchar("device").notNull(),
46
- date: dateCol(),
47
- ...metricCols()
48
- });
49
- const page_keywords = pgTable("page_keywords", {
50
- url: varchar("url").notNull(),
51
- query: varchar("query").notNull(),
52
- query_canonical: varchar("query_canonical"),
53
- date: dateCol(),
54
- ...metricCols()
55
- });
56
- const drizzleSchema = {
57
- pages,
58
- keywords,
59
- countries,
60
- devices,
61
- page_keywords,
62
- search_appearance: pgTable("search_appearance", {
63
- searchAppearance: varchar("searchAppearance").notNull(),
64
- date: dateCol(),
65
- ...metricCols()
66
- })
67
- };
68
- const TABLE_METADATA = {
69
- pages: {
70
- sortKey: ["date", "url"],
71
- version: 1
72
- },
73
- keywords: {
74
- sortKey: ["date", "query"],
75
- version: 2
76
- },
77
- countries: {
78
- sortKey: ["date", "country"],
79
- version: 1
80
- },
81
- devices: {
82
- sortKey: ["date", "device"],
83
- version: 1
84
- },
85
- page_keywords: {
86
- sortKey: [
87
- "date",
88
- "url",
89
- "query"
90
- ],
91
- version: 2
92
- },
93
- search_appearance: {
94
- sortKey: ["date", "searchAppearance"],
95
- version: 1
96
- }
97
- };
98
- function pgSqlTypeToColumnType(sqlType) {
99
- const t = sqlType.toLowerCase();
100
- if (t.startsWith("varchar") || t === "text" || t.startsWith("char")) return "VARCHAR";
101
- if (t === "date" || t.startsWith("timestamp")) return "DATE";
102
- if (t.startsWith("double") || t === "real" || t.startsWith("numeric") || t.startsWith("decimal")) return "DOUBLE";
103
- if (t === "bigint" || t === "int8") return "BIGINT";
104
- if (t === "integer" || t === "int" || t === "int4" || t === "smallint" || t === "int2") return "INTEGER";
105
- throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
106
- }
107
- function tableSchemaFrom(tableName) {
108
- const columns = getTableConfig(drizzleSchema[tableName]).columns.map((col) => ({
109
- name: col.name,
110
- type: pgSqlTypeToColumnType(col.getSQLType()),
111
- nullable: !col.notNull
112
- }));
113
- const meta = TABLE_METADATA[tableName];
114
- return {
115
- name: tableName,
116
- columns,
117
- sortKey: meta.sortKey,
118
- version: meta.version
119
- };
120
- }
121
- const METRIC_TABLES = [
122
- "pages",
123
- "keywords",
124
- "countries",
125
- "devices",
126
- "page_keywords",
127
- "search_appearance"
128
- ];
129
- const SCHEMAS = Object.fromEntries(METRIC_TABLES.map((t) => [t, tableSchemaFrom(t)]));
130
- function currentSchemaVersion(table) {
131
- return SCHEMAS[table].version;
132
- }
133
- function allTables() {
134
- return METRIC_TABLES;
135
- }
136
- function inferTable(dimensions) {
137
- const dims = new Set(dimensions);
138
- const hasPage = dims.has("page");
139
- const hasQuery = dims.has("query");
140
- if (hasPage && hasQuery) return "page_keywords";
141
- if (hasQuery) return "keywords";
142
- if (hasPage) return "pages";
143
- if (dims.has("country")) return "countries";
144
- if (dims.has("device")) return "devices";
145
- if (dims.has("searchAppearance")) return "search_appearance";
146
- return "keywords";
147
- }
148
- function dimensionToColumn(dim, _table) {
149
- if (dim === "page") return "url";
150
- if (dim === "queryCanonical") return "query_canonical";
151
- return dim;
152
- }
153
- const DEFAULT_SEARCH_TYPE = "web";
154
- function inferSearchType(entry) {
155
- return entry.searchType ?? "web";
156
- }
157
- function inferLegacyTier(entry) {
158
- if (entry.tier !== void 0) return entry.tier;
159
- if (entry.partition.startsWith("daily/")) return "raw";
160
- if (entry.partition.startsWith("monthly/")) return "d30";
161
- }
162
- function dayPartition(date) {
163
- return `daily/${date}`;
164
- }
165
- function monthPartition(month) {
166
- return `monthly/${month}`;
167
- }
168
- function weekPartition(mondayIsoDate) {
169
- return `weekly/${mondayIsoDate}`;
170
- }
171
- function quarterPartition(quarter) {
172
- return `quarterly/${quarter}`;
173
- }
174
- function mondayOfWeek(isoDate) {
175
- const ms = Date.parse(`${isoDate}T00:00:00Z`);
176
- const dow = new Date(ms).getUTCDay();
177
- const offset = dow === 0 ? -6 : 1 - dow;
178
- return toIsoDate(new Date(ms + offset * MS_PER_DAY));
179
- }
180
- function quarterOfMonth(month) {
181
- const [y, m] = month.split("-").map(Number);
182
- return `${y}-Q${Math.floor((m - 1) / 3) + 1}`;
183
- }
184
- function objectKey(ctx, table, partition, version, searchType) {
185
- return `${ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/${table}` : `u_${ctx.userId}/${table}`}/${searchType !== void 0 && searchType !== "web" ? `${searchType}/` : ""}${partition}__v${version}.parquet`;
186
- }
187
- function tenantPrefix(ctx) {
188
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/` : `u_${ctx.userId}/`;
189
- }
190
- const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
191
- const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
192
- const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;
193
- const DEFAULT_THRESHOLDS = {
194
- raw: 7,
195
- d7: 30,
196
- d30: 90
197
- };
198
- const PENDING_WINDOW_DAYS = 4;
199
- const STAGES = [
200
- {
201
- inputTier: "raw",
202
- outputTier: "d7",
203
- cutoffDays: DEFAULT_THRESHOLDS.raw,
204
- bucketKey: (e) => {
205
- const m = e.partition.match(DAILY_PARTITION_RE);
206
- if (!m) return void 0;
207
- return mondayOfWeek(m[1]);
208
- },
209
- bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
210
- outputPartition: weekPartition
211
- },
212
- {
213
- inputTier: "d7",
214
- outputTier: "d30",
215
- cutoffDays: DEFAULT_THRESHOLDS.d7,
216
- bucketKey: (e) => {
217
- const m = e.partition.match(WEEKLY_PARTITION_RE);
218
- if (!m) return void 0;
219
- return m[1].slice(0, 7);
220
- },
221
- bucketLatestMs: monthEndMs,
222
- outputPartition: monthPartition
223
- },
224
- {
225
- inputTier: "d30",
226
- outputTier: "d90",
227
- cutoffDays: DEFAULT_THRESHOLDS.d30,
228
- bucketKey: (e) => {
229
- const m = e.partition.match(MONTHLY_PARTITION_RE);
230
- if (!m) return void 0;
231
- return quarterOfMonth(m[1]);
232
- },
233
- bucketLatestMs: quarterEndMs,
234
- outputPartition: quarterPartition
235
- }
236
- ];
237
- async function compactTieredImpl(deps, ctx, now, overrides = {}) {
238
- const thresholds = {
239
- ...DEFAULT_THRESHOLDS,
240
- ...overrides
241
- };
242
- const stagesWithThresholds = STAGES.map((s) => ({
243
- ...s,
244
- cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
245
- }));
246
- for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
247
- }
248
- async function runStage(deps, ctx, stage, now) {
249
- const cutoff = now - Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS) * MS_PER_DAY;
250
- const candidates = await deps.manifestStore.listLive({
251
- userId: ctx.userId,
252
- siteId: ctx.siteId,
253
- table: ctx.table,
254
- tier: stage.inputTier
255
- });
256
- const buckets = /* @__PURE__ */ new Map();
257
- for (const entry of candidates) {
258
- const key = stage.bucketKey(entry);
259
- if (!key) continue;
260
- if (stage.bucketLatestMs(key) >= cutoff) continue;
261
- const compositeKey = `${inferSearchType(entry)}\0${key}`;
262
- if (!buckets.has(compositeKey)) buckets.set(compositeKey, []);
263
- buckets.get(compositeKey).push(entry);
264
- }
265
- for (const [compositeKey, entries] of buckets) {
266
- const [searchType, bucket] = compositeKey.split("\0");
267
- const targetPartition = stage.outputPartition(bucket);
268
- if (entries.length === 1 && entries[0].partition === targetPartition) continue;
269
- await deps.manifestStore.withLock({
270
- userId: ctx.userId,
271
- siteId: ctx.siteId,
272
- table: ctx.table,
273
- partition: targetPartition
274
- }, async () => {
275
- const key = objectKey(ctx, ctx.table, targetPartition, now, searchType);
276
- const { bytes, rowCount } = await deps.codec.compactRows({ table: ctx.table }, entries.map((e) => e.objectKey), key, deps.dataSource);
277
- const newEntry = {
278
- userId: ctx.userId,
279
- siteId: ctx.siteId,
280
- table: ctx.table,
281
- partition: targetPartition,
282
- objectKey: key,
283
- rowCount,
284
- bytes,
285
- createdAt: now,
286
- schemaVersion: currentSchemaVersion(ctx.table),
287
- tier: stage.outputTier,
288
- ...searchType !== "web" ? { searchType } : {}
289
- };
290
- await deps.manifestStore.registerVersion(newEntry, entries);
291
- });
292
- }
293
- }
294
- function enumeratePartitions(startDate, endDate) {
295
- const out = [];
296
- const [sy, sm, sd] = startDate.split("-").map(Number);
297
- const [ey, em, ed] = endDate.split("-").map(Number);
298
- const start = Date.UTC(sy, sm - 1, sd);
299
- const end = Date.UTC(ey, em - 1, ed);
300
- if (end < start) return out;
301
- const seenWeeks = /* @__PURE__ */ new Set();
302
- const seenMonths = /* @__PURE__ */ new Set();
303
- const seenQuarters = /* @__PURE__ */ new Set();
304
- for (let t = start; t <= end; t += 864e5) {
305
- const d = new Date(t);
306
- const y = d.getUTCFullYear();
307
- const m = String(d.getUTCMonth() + 1).padStart(2, "0");
308
- const isoDay = `${y}-${m}-${String(d.getUTCDate()).padStart(2, "0")}`;
309
- const isoMonth = `${y}-${m}`;
310
- out.push(dayPartition(isoDay));
311
- const monday = mondayOfWeek(isoDay);
312
- if (!seenWeeks.has(monday)) {
313
- seenWeeks.add(monday);
314
- out.push(weekPartition(monday));
315
- }
316
- if (!seenMonths.has(isoMonth)) {
317
- seenMonths.add(isoMonth);
318
- out.push(monthPartition(isoMonth));
319
- }
320
- const quarter = quarterOfMonth(isoMonth);
321
- if (!seenQuarters.has(quarter)) {
322
- seenQuarters.add(quarter);
323
- out.push(quarterPartition(quarter));
324
- }
325
- }
326
- return out;
327
- }
328
- function monthEndMs(month) {
329
- const [y, m] = month.split("-").map(Number);
330
- return Date.UTC(y, m, 0, 23, 59, 59, 999);
331
- }
332
- function quarterEndMs(quarter) {
333
- const [yStr, qStr] = quarter.split("-Q");
334
- const y = Number(yStr);
335
- const q = Number(qStr);
336
- return Date.UTC(y, q * 3, 0, 23, 59, 59, 999);
337
- }
338
- function escapeLike(value) {
339
- return value.replace(/\\/g, "\\\\").replace(/%/g, "\\%").replace(/_/g, "\\_");
340
- }
341
- const METRIC_EXPR = {
342
- clicks: "CAST(SUM(clicks) AS DOUBLE)",
343
- impressions: "CAST(SUM(impressions) AS DOUBLE)",
344
- ctr: "CAST(SUM(clicks) AS DOUBLE) / NULLIF(SUM(impressions), 0)",
345
- position: "SUM(sum_position) / NULLIF(SUM(impressions), 0) + 1"
346
- };
347
- function topLevelPagePredicateSql(pathExpr) {
348
- return `LENGTH(${pathExpr}) - LENGTH(REPLACE(${pathExpr}, '/', '')) <= 1`;
349
- }
350
- const FILES_PLACEHOLDER = "{{FILES}}";
351
- function buildDimensionWhere(filters, table) {
352
- const clauses = [];
353
- const params = [];
354
- for (const filter of filters) {
355
- const column = dimensionToColumn(filter.dimension, table);
356
- switch (filter.operator) {
357
- case "equals":
358
- clauses.push(`${column} = ?`);
359
- params.push(filter.expression);
360
- break;
361
- case "notEquals":
362
- clauses.push(`${column} != ?`);
363
- params.push(filter.expression);
364
- break;
365
- case "contains":
366
- clauses.push(`${column} LIKE ? ESCAPE '\\'`);
367
- params.push(`%${escapeLike(filter.expression)}%`);
368
- break;
369
- case "notContains":
370
- clauses.push(`${column} NOT LIKE ? ESCAPE '\\'`);
371
- params.push(`%${escapeLike(filter.expression)}%`);
372
- break;
373
- case "includingRegex":
374
- clauses.push(`regexp_matches(${column}, ?)`);
375
- params.push(filter.expression);
376
- break;
377
- case "excludingRegex":
378
- clauses.push(`NOT regexp_matches(${column}, ?)`);
379
- params.push(filter.expression);
380
- break;
381
- }
382
- }
383
- return {
384
- clause: clauses.join(" AND "),
385
- params
386
- };
387
- }
388
- function buildTopLevelWhere(plan, table) {
389
- if (!plan.specialFilters.topLevel) return "";
390
- return topLevelPagePredicateSql(dimensionToColumn("page", table));
391
- }
392
- function buildHaving(filters) {
393
- if (filters.length === 0) return {
394
- clause: "",
395
- params: []
396
- };
397
- const clauses = [];
398
- const params = [];
399
- for (const filter of filters) {
400
- const expr = METRIC_EXPR[filter.metric];
401
- switch (filter.operator) {
402
- case "metricGte":
403
- clauses.push(`${expr} >= ?`);
404
- params.push(filter.expression);
405
- break;
406
- case "metricGt":
407
- clauses.push(`${expr} > ?`);
408
- params.push(filter.expression);
409
- break;
410
- case "metricLte":
411
- clauses.push(`${expr} <= ?`);
412
- params.push(filter.expression);
413
- break;
414
- case "metricLt":
415
- clauses.push(`${expr} < ?`);
416
- params.push(filter.expression);
417
- break;
418
- case "metricBetween":
419
- clauses.push(`${expr} >= ? AND ${expr} <= ?`);
420
- params.push(filter.expression, filter.expression2 ?? filter.expression);
421
- break;
422
- }
423
- }
424
- return {
425
- clause: clauses.length > 0 ? `HAVING ${clauses.join(" AND ")}` : "",
426
- params
427
- };
428
- }
429
- function compileLogicalQueryPlan(plan, table = plan.dataset) {
430
- const partitions = enumeratePartitions(plan.dateRange.startDate, plan.dateRange.endDate);
431
- const metricSelects = plan.metrics.map((metric) => `${METRIC_EXPR[metric]} AS ${metric}`);
432
- const dimSelects = plan.groupByDimensions.map((dimension) => {
433
- const column = dimensionToColumn(dimension, table);
434
- return column !== dimension ? `${column} AS ${dimension}` : dimension;
435
- });
436
- const whereClauses = ["date >= ?", "date <= ?"];
437
- const whereParams = [plan.dateRange.startDate, plan.dateRange.endDate];
438
- const dimWhere = buildDimensionWhere(plan.dimensionFilters, table);
439
- if (dimWhere.clause) {
440
- whereClauses.push(dimWhere.clause);
441
- whereParams.push(...dimWhere.params);
442
- }
443
- const topLevelClause = buildTopLevelWhere(plan, table);
444
- if (topLevelClause) whereClauses.push(topLevelClause);
445
- const having = buildHaving(plan.metricFilters);
446
- const groupByCols = [...plan.groupByDimensions.map((dimension) => dimensionToColumn(dimension, table)), ...plan.hasDate ? ["date"] : []];
447
- const groupBy = groupByCols.length > 0 ? `GROUP BY ${groupByCols.join(", ")}` : "";
448
- const orderBy = plan.orderBy ? `ORDER BY ${plan.orderBy.column} ${plan.orderBy.dir.toUpperCase()}` : "ORDER BY clicks DESC";
449
- const limit = `LIMIT ${plan.rowLimit ?? 1e3}`;
450
- const offset = plan.startRow ? `OFFSET ${plan.startRow}` : "";
451
- return {
452
- sql: [
453
- `SELECT ${[
454
- ...dimSelects,
455
- ...plan.hasDate ? ["date"] : [],
456
- ...metricSelects
457
- ].join(", ")}`,
458
- `FROM read_parquet(${FILES_PLACEHOLDER}, union_by_name = true)`,
459
- `WHERE ${whereClauses.join(" AND ")}`,
460
- groupBy,
461
- having.clause,
462
- orderBy,
463
- limit,
464
- offset
465
- ].filter(Boolean).join(" ").replace(/\s+/g, " ").trim(),
466
- params: [...whereParams, ...having.params],
467
- partitions,
468
- table,
469
- filesPlaceholder: FILES_PLACEHOLDER
470
- };
471
- }
472
- function resolveToSQL(state, table) {
473
- const plan = buildLogicalPlan(state, { regex: true });
474
- return compileLogicalQueryPlan(plan, table ?? plan.dataset);
475
- }
476
- function fileList(keys) {
477
- return keys.length === 0 ? "[]" : `[${keys.map((key) => `'${key.replace(/'/g, "''")}'`).join(", ")}]`;
478
- }
479
- function substituteNamedFiles(sql, sets) {
480
- let out = sql;
481
- for (const [name, keys] of Object.entries(sets)) out = out.replace(new RegExp(`\\{\\{${name}\\}\\}`, "g"), fileList(keys));
482
- return out;
483
- }
484
- function containsDisallowedControlChars(value) {
485
- for (let i = 0; i < value.length; i++) {
486
- const code = value.charCodeAt(i);
487
- if (code >= 0 && code <= 8 || code === 11 || code === 12 || code >= 14 && code <= 31) return true;
488
- }
489
- return false;
490
- }
491
- function sqlEscape(s) {
492
- return s.replace(/'/g, "''");
493
- }
494
- function formatLiteral(value) {
495
- if (value == null) return "NULL";
496
- if (typeof value === "number") {
497
- if (!Number.isFinite(value)) throw new Error(`cannot inline non-finite number: ${value}`);
498
- return String(value);
499
- }
500
- if (typeof value === "boolean") return value ? "TRUE" : "FALSE";
501
- if (typeof value === "bigint") return value.toString();
502
- if (value instanceof Date) return `'${value.toISOString()}'`;
503
- if (typeof value === "string") {
504
- if (containsDisallowedControlChars(value)) throw new Error("string literal contains disallowed control characters");
505
- return `'${value.replace(/'/g, "''")}'`;
506
- }
507
- throw new Error(`cannot inline value of type ${typeof value}`);
508
- }
509
- function bindLiterals(sql, params) {
510
- if (params.length === 0) return sql;
511
- let out = "";
512
- let i = 0;
513
- let qmarkIdx = 0;
514
- const usedDollar = /* @__PURE__ */ new Set();
515
- let inString = false;
516
- while (i < sql.length) {
517
- const c = sql[i];
518
- if (inString) {
519
- out += c;
520
- if (c === "'") {
521
- if (sql[i + 1] === "'") {
522
- out += "'";
523
- i += 2;
524
- continue;
525
- }
526
- inString = false;
527
- }
528
- i++;
529
- continue;
530
- }
531
- if (c === "-" && sql[i + 1] === "-") {
532
- const nl = sql.indexOf("\n", i + 2);
533
- const end = nl === -1 ? sql.length : nl;
534
- out += sql.slice(i, end);
535
- i = end;
536
- continue;
537
- }
538
- if (c === "/" && sql[i + 1] === "*") {
539
- const close = sql.indexOf("*/", i + 2);
540
- const end = close === -1 ? sql.length : close + 2;
541
- out += sql.slice(i, end);
542
- i = end;
543
- continue;
544
- }
545
- if (c === "'") {
546
- inString = true;
547
- out += c;
548
- i++;
549
- continue;
550
- }
551
- if (c === "?") {
552
- if (qmarkIdx >= params.length) throw new Error(`bindLiterals: more '?' placeholders than params (have ${params.length})`);
553
- out += formatLiteral(params[qmarkIdx++]);
554
- i++;
555
- continue;
556
- }
557
- if (c === "$" && sql[i + 1] && sql[i + 1] >= "0" && sql[i + 1] <= "9") {
558
- let j = i + 1;
559
- while (j < sql.length && sql[j] >= "0" && sql[j] <= "9") j++;
560
- const n = Number(sql.slice(i + 1, j));
561
- if (n < 1 || n > params.length) throw new Error(`bindLiterals: $${n} out of range (have ${params.length} params)`);
562
- usedDollar.add(n - 1);
563
- out += formatLiteral(params[n - 1]);
564
- i = j;
565
- continue;
566
- }
567
- out += c;
568
- i++;
569
- }
570
- if (qmarkIdx > 0 && usedDollar.size > 0) throw new Error("bindLiterals: cannot mix '?' and '$N' placeholders in the same query");
571
- const used = qmarkIdx > 0 ? qmarkIdx : usedDollar.size;
572
- if (used !== params.length) throw new Error(`bindLiterals: ${params.length - used} params unused`);
573
- return out;
574
- }
575
- async function encodeBytes(db, table, rows) {
576
- const inName = db.makeTempPath("json");
577
- const outName = db.makeTempPath("parquet");
578
- const jsonBytes = new TextEncoder().encode(JSON.stringify(rows));
579
- const registered = [];
580
- await db.registerFileBuffer(inName, jsonBytes);
581
- registered.push(inName);
582
- try {
583
- const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`;
584
- await db.query(sql);
585
- registered.push(outName);
586
- return await db.copyFileToBuffer(outName);
587
- } finally {
588
- await db.dropFiles(registered);
589
- }
590
- }
591
- async function decodeBytes(db, bytes, table) {
592
- const name = db.makeTempPath("parquet");
593
- await db.registerFileBuffer(name, bytes);
594
- try {
595
- return await db.query(`SELECT * ${dateReplaceClause(table)} FROM read_parquet('${sqlEscape(name)}')`);
596
- } finally {
597
- await db.dropFiles([name]);
598
- }
599
- }
600
- function createDuckDBCodec(factory) {
601
- return {
602
- async writeRows(ctx, rows, key, dataSource) {
603
- const bytes = await encodeBytes(await factory.getDuckDB(), ctx.table, rows);
604
- await dataSource.write(key, bytes);
605
- return {
606
- bytes: bytes.byteLength,
607
- rowCount: rows.length
608
- };
609
- },
610
- async readRows(ctx, key, dataSource) {
611
- return decodeBytes(await factory.getDuckDB(), await dataSource.read(key), ctx.table);
612
- },
613
- async compactRows(ctx, inputKeys, outputKey, dataSource) {
614
- const db = await factory.getDuckDB();
615
- if (inputKeys.length === 0) {
616
- const bytes = await encodeBytes(db, ctx.table, []);
617
- await dataSource.write(outputKey, bytes);
618
- return {
619
- bytes: bytes.byteLength,
620
- rowCount: 0
621
- };
622
- }
623
- const inputUris = inputKeys.map((k) => dataSource.uri?.(k));
624
- if (inputUris.every((u) => u !== void 0)) {
625
- const outName = db.makeTempPath("parquet");
626
- const fileList = inputUris.map((u) => `'${sqlEscape(u)}'`).join(", ");
627
- try {
628
- await db.query(`COPY (SELECT * FROM read_parquet([${fileList}], union_by_name=true)) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
629
- const bytes = await db.copyFileToBuffer(outName);
630
- const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
631
- const rowCount = Number(countRows[0]?.n ?? 0);
632
- await dataSource.write(outputKey, bytes);
633
- return {
634
- bytes: bytes.byteLength,
635
- rowCount
636
- };
637
- } finally {
638
- await db.dropFiles([outName]);
639
- }
640
- }
641
- const inputs = await Promise.all(inputKeys.map((k) => dataSource.read(k)));
642
- const inNames = [];
643
- const outName = db.makeTempPath("parquet");
644
- const registered = [];
645
- for (let i = 0; i < inputs.length; i++) {
646
- const name = db.makeTempPath("parquet");
647
- await db.registerFileBuffer(name, inputs[i]);
648
- inNames.push(name);
649
- registered.push(name);
650
- }
651
- try {
652
- const fileList = inNames.map((n) => `'${sqlEscape(n)}'`).join(", ");
653
- await db.query(`COPY (SELECT * FROM read_parquet([${fileList}], union_by_name = true)) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
654
- registered.push(outName);
655
- const bytes = await db.copyFileToBuffer(outName);
656
- const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
657
- const rowCount = Number(countRows[0]?.n ?? 0);
658
- await dataSource.write(outputKey, bytes);
659
- return {
660
- bytes: bytes.byteLength,
661
- rowCount
662
- };
663
- } finally {
664
- await db.dropFiles(registered);
665
- }
666
- }
667
- };
668
- }
669
- function rewriteEmptyFileSets(sql, placeholders, table) {
670
- const emptyFallback = `(SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE)`;
671
- let out = sql;
672
- for (const [name, keys] of Object.entries(placeholders)) {
673
- if (keys.length > 0) continue;
674
- const pattern = new RegExp(`read_parquet\\(\\s*\\{\\{${name}\\}\\}\\s*(?:,\\s*union_by_name\\s*=\\s*true\\s*)?\\)`, "g");
675
- out = out.replace(pattern, emptyFallback);
676
- }
677
- return out;
678
- }
679
- function createDuckDBExecutor(factory) {
680
- return { async execute({ sql, params, fileKeys, dataSource, table, signal }) {
681
- signal?.throwIfAborted();
682
- const db = await factory.getDuckDB();
683
- const placeholders = {};
684
- const registered = [];
685
- for (const [name, keys] of Object.entries(fileKeys)) {
686
- const resolved = [];
687
- for (const key of keys) {
688
- const uri = dataSource.uri?.(key);
689
- if (uri !== void 0) resolved.push(uri);
690
- else {
691
- const bytes = await dataSource.read(key, void 0, signal);
692
- await db.registerFileBuffer(key, bytes);
693
- registered.push(key);
694
- resolved.push(key);
695
- }
696
- }
697
- placeholders[name] = resolved;
698
- }
699
- try {
700
- signal?.throwIfAborted();
701
- const finalSql = substituteNamedFiles(rewriteEmptyFileSets(sql, placeholders, table), placeholders);
702
- return {
703
- rows: await db.query(finalSql, params),
704
- sql: finalSql
705
- };
706
- } finally {
707
- if (registered.length > 0) await db.dropFiles(registered);
708
- }
709
- } };
710
- }
711
- function emptyTableSchema(table) {
712
- return `(FROM (VALUES ${placeholderValues(table)}) t(${columnList(table)}))`;
713
- }
714
- function canonicalEmptyParquetSchema(table) {
715
- return emptyTableSchema(table);
716
- }
717
- function dateReplaceClause(table) {
718
- if (!table) return "";
719
- const dateCols = SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
720
- if (dateCols.length === 0) return "";
721
- return `REPLACE (${dateCols.map((n) => `strftime(${n}, '%Y-%m-%d') AS ${n}`).join(", ")})`;
722
- }
723
- function columnList(table) {
724
- return SCHEMAS[table].columns.map((c) => c.name).join(", ");
725
- }
726
- function placeholderValues(table) {
727
- return `(${SCHEMAS[table].columns.map((c) => defaultForType(c.type)).join(", ")})`;
728
- }
729
- function defaultForType(t) {
730
- if (t === "VARCHAR") return "''";
731
- if (t === "DATE") return "DATE '1970-01-01'";
732
- if (t === "INTEGER" || t === "BIGINT") return "0";
733
- if (t === "DOUBLE") return "CAST(0 AS DOUBLE)";
734
- return "NULL";
735
- }
736
- function columnsJson(table) {
737
- return `{${SCHEMAS[table].columns.map((c) => `'${c.name}': '${c.type}'`).join(", ")}}`;
738
- }
739
- const VERSION_RE = /__v(\d+)\.parquet$/;
740
- function parseLockScope(key) {
741
- const match = VERSION_RE.exec(key);
742
- if (!match) return void 0;
743
- const parts = key.slice(0, match.index).split("/");
744
- if (parts.length < 4) return void 0;
745
- const userPart = parts[0];
746
- if (!userPart.startsWith("u_")) return void 0;
747
- const userId = userPart.slice(2);
748
- const partition = parts.slice(-2).join("/");
749
- const table = parts[parts.length - 3];
750
- return {
751
- userId,
752
- siteId: parts.length >= 5 ? parts.slice(1, -3).join("/") : void 0,
753
- table,
754
- partition
755
- };
756
- }
757
/**
 * Garbage-collect retired manifest entries and, when a tenant is given,
 * unreferenced versioned objects under that tenant's prefix.
 *
 * Step 1: entries returned by `manifestStore.listRetired(now - graceMs)` are
 * deleted from the data source first and from the manifest second.
 * Step 2 (only when `opts.userId` is set): keys under the tenant prefix that
 * are absent from the manifest and whose `__v<N>` number is <= the cutoff are
 * grouped by lock scope. Inside each scope's lock the manifest is listed again
 * so that keys registered in the meantime are not deleted.
 *
 * @param {{dataSource: object, manifestStore: object}} deps
 * @param {number} now
 * @param {number} graceMs
 * @param {{userId?: string, siteId?: string}} [opts]
 * @returns {Promise<{deleted: number}>} retired entries plus swept orphan objects
 */
async function gcOrphansImpl(deps, now, graceMs, opts = {}) {
  const { dataSource, manifestStore } = deps;
  const cutoff = now - graceMs;

  const retired = await manifestStore.listRetired(cutoff);
  if (retired.length > 0) {
    await dataSource.delete(retired.map((entry) => entry.objectKey));
    await manifestStore.delete(retired);
  }
  if (!opts.userId) return { deleted: retired.length };

  const prefix = tenantPrefix({
    userId: opts.userId,
    siteId: opts.siteId
  });
  const manifestEntries = await manifestStore.listAll({
    userId: opts.userId,
    siteId: opts.siteId
  });
  const referenced = new Set(manifestEntries.map((entry) => entry.objectKey));

  // Prefer the streaming listing; otherwise adapt list() lazily to the same shape.
  const keyStream = dataSource.streamList ? dataSource.streamList(prefix) : async function* () {
    const listed = await dataSource.list(prefix);
    for (const listedKey of listed) yield listedKey;
  }();

  // Bucket orphan candidates per lock scope, in discovery order.
  const buckets = /* @__PURE__ */ new Map();
  for await (const key of keyStream) {
    if (referenced.has(key)) continue;
    const versionMatch = VERSION_RE.exec(key);
    if (!versionMatch) continue;
    if (!(Number(versionMatch[1]) <= cutoff)) continue;
    const scope = parseLockScope(key);
    if (!scope) continue;
    const scopeId = `${scope.userId}|${scope.siteId ?? ""}|${scope.table}|${scope.partition}`;
    let bucket = buckets.get(scopeId);
    if (bucket == null) {
      bucket = { scope, keys: [] };
      buckets.set(scopeId, bucket);
    }
    bucket.keys.push(key);
  }

  let sweptOrphans = 0;
  for (const { scope, keys } of buckets.values()) {
    await manifestStore.withLock(scope, async () => {
      // Re-read under the lock: a key may have been registered since the first listing.
      const current = await manifestStore.listAll({
        userId: scope.userId,
        siteId: scope.siteId,
        table: scope.table,
        partitions: [scope.partition]
      });
      const currentKeys = new Set(current.map((entry) => entry.objectKey));
      const stillOrphans = keys.filter((candidate) => !currentKeys.has(candidate));
      if (stillOrphans.length === 0) return;
      await dataSource.delete(stillOrphans);
      sweptOrphans += stillOrphans.length;
    });
  }
  return { deleted: retired.length + sweptOrphans };
}
815
/**
 * Extra predicates appended after `WHERE 1=1` in the comparison query, keyed by
 * comparison filter name. `c` aliases the current-period rows and `p` the
 * LEFT JOINed previous-period rows.
 */
const COMPARISON_FILTER_SQL = {
  // No previous-period impressions for this row.
  new: sql`AND (p.impressions IS NULL OR p.impressions = 0)`,
  // Previous period had impressions, current period has none.
  lost: sql`AND p.impressions > 0 AND c.impressions = 0`,
  // More clicks than before; a missing previous row counts as 0.
  improving: sql`AND c.clicks > COALESCE(p.clicks, 0)`,
  // Fewer clicks than a previous period that had some.
  declining: sql`AND c.clicks < p.clicks AND p.clicks > 0`
};
821
/**
 * Collapse every run of whitespace to a single space and strip both ends.
 * @param {string} s
 * @returns {string}
 */
function collapseWs(s) {
  const singleSpaced = s.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
824
/** Join SQL fragments with ` AND `. */
function joinAnd(parts) {
  const separator = sql` AND `;
  return sql.join(parts, separator);
}
827
/** Join SQL fragments with `, `. */
function joinComma(parts) {
  const separator = sql`, `;
  return sql.join(parts, separator);
}
830
/**
 * Build the raw `ORDER BY` clause for a query state.
 *
 * The column is reduced to word characters before being inlined, and the
 * direction is forced to `ASC` or `DESC`. Falls back to `clicks DESC` when no
 * ordering is requested or when the requested column has no word characters
 * left after sanitizing (which would otherwise emit `ORDER BY <prefix> DESC`,
 * i.e. invalid SQL). A missing `dir` is treated as `DESC` instead of throwing.
 *
 * @param {{orderBy?: {column: string, dir?: string}}} state
 * @param {string} [prefix=""] - inlined verbatim before the column (e.g. "c.")
 * @returns raw SQL fragment produced by `sql.raw`
 */
function orderByClause(state, prefix = "") {
  const requested = state.orderBy;
  const column = requested ? String(requested.column ?? "").replace(/\W/g, "") : "";
  if (column === "") return sql.raw(`ORDER BY ${prefix}clicks DESC`);
  const direction = String(requested.dir ?? "").toUpperCase() === "ASC" ? "ASC" : "DESC";
  return sql.raw(`ORDER BY ${prefix}${column} ${direction}`);
}
838
/**
 * Build the raw `LIMIT [OFFSET]` clause for a query state.
 *
 * `rowLimit` defaults to 100 and `startRow` to 0; both are floored and clamped
 * to >= 0. Values that do not convert to a finite number fall back to those
 * defaults, so the clause never contains `NaN` or `Infinity`.
 *
 * @param {{rowLimit?: number|string, startRow?: number|string}} state
 * @returns raw SQL fragment produced by `sql.raw`
 */
function limitOffsetClause(state) {
  const toCount = (value, fallback) => {
    const floored = Math.floor(Number(value));
    return Number.isFinite(floored) ? Math.max(0, floored) : fallback;
  };
  const rowLimit = toCount(state.rowLimit ?? 100, 100);
  const offset = state.startRow ? toCount(state.startRow, 0) : 0;
  // OFFSET is only emitted when it is non-zero.
  return sql.raw(offset > 0 ? `LIMIT ${rowLimit} OFFSET ${offset}` : `LIMIT ${rowLimit}`);
}
843
/**
 * Produce a double-quoted raw SQL identifier from `name`, keeping only word characters.
 * @param {string} name
 */
function aliasRaw(name) {
  const identifier = name.replace(/\W/g, "");
  return sql.raw(`"${identifier}"`);
}
847
/**
 * Copy each dimension filter into the internal filter shape
 * `{dimension, operator, expression, expression2}`, dropping any other fields.
 * @param {Array<object>} filters
 * @returns {Array<object>}
 */
function toInternalDimensionFilters(filters) {
  return filters.map(({ dimension, operator, expression, expression2 }) => {
    return { dimension, operator, expression, expression2 };
  });
}
855
/**
 * Convert metric filters to the internal filter shape. The metric name is
 * carried in `dimension`, and expressions are stringified (`expression2`
 * stays undefined when it is null or undefined).
 * @param {Array<object>} filters
 * @returns {Array<object>}
 */
function toInternalMetricFilters(filters) {
  return filters.map(({ metric, operator, expression, expression2 }) => {
    const upperBound = expression2 == null ? void 0 : String(expression2);
    return {
      dimension: metric,
      operator,
      expression: String(expression),
      expression2: upperBound
    };
  });
}
863
/**
 * Return the synthetic `page` / `topLevel` filter when the plan's
 * `specialFilters.topLevel` flag is set, otherwise an empty list.
 * @param {{specialFilters: {topLevel?: unknown}}} plan
 * @returns {Array<{dimension: string, operator: string, expression: string}>}
 */
function topLevelFilters(plan) {
  const wantsTopLevel = Boolean(plan.specialFilters.topLevel);
  return wantsTopLevel
    ? [{ dimension: "page", operator: "topLevel", expression: "" }]
    : [];
}
871
/**
 * Copy one logical filter into the internal filter shape
 * `{dimension, operator, expression, expression2}`.
 * @param {object} filter
 * @returns {object}
 */
function logicalFilterToInternal(filter) {
  const { dimension, operator, expression, expression2 } = filter;
  return { dimension, operator, expression, expression2 };
}
879
/**
 * Recursively compile a dimension filter tree into one SQL predicate.
 *
 * A leaf compiles through `adapter.dimensionPredicates` (first result, which
 * may be undefined). A group compiles its children, drops the undefined ones,
 * and joins the rest with OR (`groupType === "or"`) or AND inside parentheses.
 * A single surviving child is returned unwrapped.
 *
 * @returns SQL fragment, or undefined when nothing compiles
 */
function compileFilterTree(node, adapter, tableKey) {
  if (!node) return void 0;
  if (node.kind === "leaf") {
    const leafPredicates = adapter.dimensionPredicates([logicalFilterToInternal(node.filter)], tableKey);
    return leafPredicates[0];
  }
  const compiled = [];
  for (const child of node.children) {
    const childSql = compileFilterTree(child, adapter, tableKey);
    if (childSql !== void 0) compiled.push(childSql);
  }
  if (compiled.length === 0) return void 0;
  if (compiled.length === 1) return compiled[0];
  const separator = node.groupType === "or" ? sql` OR ` : sql` AND `;
  return sql`(${sql.join(compiled, separator)})`;
}
888
/**
 * Resolve a query state into everything the SQL builders need: the logical
 * plan, the adapter table key, the WHERE predicates (optional site id, date
 * bounds, dimension filters, top-level filter) and the HAVING predicates for
 * metric filters.
 *
 * When the plan carries a `dimensionFilterTree`, only the compiled tree is
 * used (and nothing is added if it compiles to nothing). The flat
 * `dimensionFilters` list is used only when there is no tree.
 *
 * @param {object} state - query state handed to buildLogicalPlan
 * @param {{adapter: object, siteId?: unknown}} options
 */
function buildScope(state, options) {
  const { adapter, siteId } = options;
  const plan = buildLogicalPlan(state, adapter.capabilities);
  const tableKey = adapter.tableKeyForDataset(plan.dataset);
  const dimFilters = toInternalDimensionFilters(plan.dimensionFilters);
  const metricFilters = toInternalMetricFilters(plan.metricFilters);
  const { startDate, endDate } = plan.dateRange;

  const wherePredicates = [];
  if (adapter.siteIdColRef && siteId != null) {
    wherePredicates.push(sql`${adapter.siteIdColRef(tableKey)} = ${siteId}`);
  }
  wherePredicates.push(sql`${adapter.dateColRef(tableKey)} >= ${startDate}`);
  wherePredicates.push(sql`${adapter.dateColRef(tableKey)} <= ${endDate}`);

  if (plan.dimensionFilterTree) {
    const treeSql = compileFilterTree(plan.dimensionFilterTree, adapter, tableKey);
    if (treeSql) wherePredicates.push(treeSql);
  } else {
    wherePredicates.push(...adapter.dimensionPredicates(dimFilters, tableKey));
  }

  const topLevelSql = adapter.topLevelPredicate(topLevelFilters(plan), tableKey);
  if (topLevelSql) wherePredicates.push(topLevelSql);

  return {
    plan,
    tableKey,
    groupByDims: plan.groupByDimensions,
    hasDate: plan.hasDate,
    metrics: plan.metrics,
    wherePredicates,
    having: adapter.havingPredicates(metricFilters, tableKey),
    dimFilters,
    startDate,
    endDate
  };
}
919
/**
 * Thin wrapper: forwards to `buildLogicalComparisonPlan` with the same arguments.
 */
function buildComparisonPlan(current, previous, capabilities) {
  const comparisonPlan = buildLogicalComparisonPlan(current, previous, capabilities);
  return comparisonPlan;
}
922
/**
 * Compile a query through the adapter and collapse whitespace in the SQL text.
 * @returns {{sql: string, params: unknown}} only these two fields are kept
 */
function compileCollapsed(adapter, q) {
  const { sql: text, params } = adapter.compile(q);
  return { sql: collapseWs(text), params };
}
929
/**
 * Build the un-grouped totals query: one aliased aggregate per plan metric over
 * the scope's table, restricted by the scope's WHERE predicates when there are any.
 * @returns {{sql: string, params: unknown}}
 */
function buildTotalsSql(state, options) {
  const { adapter } = options;
  const { tableKey, metrics, wherePredicates } = buildScope(state, options);
  const table = adapter.tableRef(tableKey);
  const metricColumns = metrics.map((metric) => sql`${adapter.metricSql(metric, tableKey)} as ${aliasRaw(metric)}`);
  let totalsQuery;
  if (wherePredicates.length > 0) {
    totalsQuery = sql`SELECT ${joinComma(metricColumns)} FROM ${table} WHERE ${joinAnd(wherePredicates)}`;
  } else {
    totalsQuery = sql`SELECT ${joinComma(metricColumns)} FROM ${table}`;
  }
  return compileCollapsed(adapter, totalsQuery);
}
936
/**
 * Build the period-over-period comparison query plus its row-count query.
 *
 * Two CTEs are produced over the same table: `current` (the plan's metrics,
 * the current WHERE and HAVING) and `previous` (all `adapter.METRIC_NAMES`,
 * the previous date bounds, and the *current* plan's dimension filters). They
 * are LEFT JOINed on the sanitized group-by dimension names, optionally
 * narrowed by a `COMPARISON_FILTER_SQL` entry, then ordered and limited from
 * the current state.
 *
 * NOTE(review): the outer SELECT always reads c.clicks, c.impressions, c.ctr
 * and c.position, so it presumably expects the current plan's metrics to
 * include all four; confirm against buildLogicalPlan.
 *
 * @param {object} current - current-period query state
 * @param {object} previous - previous-period query state
 * @param {{adapter: object, siteId?: unknown}} options
 * @param {string} [comparisonFilter] - key of COMPARISON_FILTER_SQL
 * @returns {{sql: string, params: unknown, countSql: string, countParams: unknown}}
 */
function resolveComparisonSQL(current, previous, options, comparisonFilter) {
  const { adapter, siteId } = options;
  const comparisonPlan = buildComparisonPlan(current, previous, adapter.capabilities);
  const currentScope = buildScope(current, options);
  const previousScope = buildScope(previous, options);
  const { tableKey, groupByDims, metrics, wherePredicates: currentWhere, having } = currentScope;
  const table = adapter.tableRef(tableKey);

  const wordOnly = (name) => name.replace(/\W/g, "");
  const metricColumns = (names) => names.map((m) => sql`${adapter.metricSql(m, tableKey)} as ${aliasRaw(m)}`);
  const selectFrom = (columns, where) => where.length > 0
    ? sql`SELECT ${joinComma(columns)} FROM ${table} WHERE ${joinAnd(where)}`
    : sql`SELECT ${joinComma(columns)} FROM ${table}`;

  // Dimension columns shared by both CTEs; aliased when the expression is not the bare dimension name.
  const dimSelectExprs = groupByDims.map((d) => {
    const expr = adapter.dimExprSql(d, tableKey);
    const colName = adapter.dimColumn(d, tableKey);
    const needsAlias = d === "page" || colName !== d;
    return needsAlias ? sql`${expr} as ${aliasRaw(d)}` : expr;
  });
  const currentSelect = [...dimSelectExprs, ...metricColumns(metrics)];
  const prevSelect = [...dimSelectExprs, ...metricColumns(adapter.METRIC_NAMES)];
  const groupByExprs = groupByDims.map((d) => adapter.dimExprSql(d, tableKey));
  const withGroupBy = (query) => groupByExprs.length > 0
    ? sql`${query} GROUP BY ${joinComma(groupByExprs)}`
    : query;

  const prevWhere = [];
  if (adapter.siteIdColRef && siteId != null) prevWhere.push(sql`${adapter.siteIdColRef(tableKey)} = ${siteId}`);
  if (previousScope.startDate) prevWhere.push(sql`${adapter.dateColRef(tableKey)} >= ${previousScope.startDate}`);
  if (previousScope.endDate) prevWhere.push(sql`${adapter.dateColRef(tableKey)} <= ${previousScope.endDate}`);
  const sharedPlan = comparisonPlan.current;
  if (sharedPlan.dimensionFilterTree) {
    const treeSql = compileFilterTree(sharedPlan.dimensionFilterTree, adapter, tableKey);
    if (treeSql) prevWhere.push(treeSql);
  } else {
    prevWhere.push(...adapter.dimensionPredicates(toInternalDimensionFilters(sharedPlan.dimensionFilters), tableKey));
  }

  let currentCte = withGroupBy(selectFrom(currentSelect, currentWhere));
  if (having.length > 0) currentCte = sql`${currentCte} HAVING ${joinAnd(having)}`;
  const previousCte = withGroupBy(selectFrom(prevSelect, prevWhere));

  const joinOn = groupByDims.length > 0
    ? sql.raw(groupByDims.map((d) => `c.${wordOnly(d)} = p.${wordOnly(d)}`).join(" AND "))
    : sql.raw("1=1");
  const filterClause = comparisonFilter ? COMPARISON_FILTER_SQL[comparisonFilter] : sql.raw("");
  const orderSql = orderByClause(current, "c.");
  const limitSql = limitOffsetClause(current);

  const outerCurrentCols = [
    ...groupByDims.map((d) => {
      const colName = wordOnly(d);
      return sql.raw(`c.${colName} as "${colName}"`);
    }),
    sql.raw('CAST(c.clicks AS DOUBLE) as "clicks"'),
    sql.raw('CAST(c.impressions AS DOUBLE) as "impressions"'),
    sql.raw('c.ctr as "ctr"'),
    sql.raw('c.position as "position"')
  ];
  const mainQuery = sql`WITH current AS (${currentCte}), previous AS (${previousCte}) SELECT ${joinComma(outerCurrentCols)}, COALESCE(CAST(p.clicks AS DOUBLE), 0) as "prevClicks", COALESCE(CAST(p.impressions AS DOUBLE), 0) as "prevImpressions", COALESCE(p.ctr, 0) as "prevCtr", COALESCE(p.position, 0) as "prevPosition" FROM current c LEFT JOIN previous p ON ${joinOn} WHERE 1=1 ${filterClause} ${orderSql} ${limitSql}`;

  // The count query selects a single column from the same join: the first group-by dimension, or clicks.
  const firstGroupBy = groupByDims[0] ? wordOnly(groupByDims[0]) : "clicks";
  const countInnerSelect = sql.raw(`c.${firstGroupBy}`);
  const countQuery = sql`WITH current AS (${currentCte}), previous AS (${previousCte}) SELECT COUNT(*) as total FROM (SELECT ${countInnerSelect} FROM current c LEFT JOIN previous p ON ${joinOn} WHERE 1=1 ${filterClause})`;

  const main = compileCollapsed(adapter, mainQuery);
  const count = compileCollapsed(adapter, countQuery);
  return {
    sql: main.sql,
    params: main.params,
    countSql: count.sql,
    countParams: count.params
  };
}
991
/**
 * Build the auxiliary queries that accompany a main query.
 *
 * Only one exists: when the plan groups by `queryCanonical`, a
 * `canonicalExtras` query over the keywords table that, per canonical query,
 * returns the variant count, the top-clicked variant name, and a
 * `||`-separated list of up to 10 variants encoded as
 * `query:::clicks:::impressions:::position`.
 *
 * @param {object} state
 * @param {{adapter: object, siteId?: unknown}} options
 * @returns {Array<{key: string, sql: string, params: unknown}>} empty when not grouping by queryCanonical
 */
function buildExtrasQueries(state, options) {
  const { adapter, siteId } = options;
  const plan = buildLogicalPlan(state, adapter.capabilities);
  if (!plan.groupByDimensions.includes("queryCanonical")) return [];

  const keywordsKey = adapter.tableKeyForDataset("keywords");
  const t = adapter.schema[keywordsKey];
  const table = adapter.tableRef(keywordsKey);

  const whereParts = [];
  if (adapter.siteIdColRef && siteId != null) {
    whereParts.push(sql`${adapter.siteIdColRef(keywordsKey)} = ${siteId}`);
  }
  whereParts.push(sql`${adapter.dateColRef(keywordsKey)} >= ${plan.dateRange.startDate}`);
  whereParts.push(sql`${adapter.dateColRef(keywordsKey)} <= ${plan.dateRange.endDate}`);
  const whereExpr = whereParts.length > 0 ? sql`WHERE ${joinAnd(whereParts)}` : sql``;

  // The outer SELECT reads the CTE's `query` alias, not the schema column.
  const outerQueryCol = sql.raw("query");
  const extrasQuery = sql`WITH per_variant AS (SELECT ${t.query_canonical} as joinKey, ${t.query} as query, SUM(${t.clicks}) as clicks, SUM(${t.impressions}) as impressions, SUM(${t.sum_position}) as sum_pos, ROW_NUMBER() OVER (PARTITION BY ${t.query_canonical} ORDER BY SUM(${t.clicks}) DESC) as rn, COUNT(*) OVER (PARTITION BY ${t.query_canonical}) as variantCount FROM ${table} ${whereExpr} GROUP BY ${t.query_canonical}, ${t.query}) SELECT joinKey, MAX(variantCount) as variantCount, MAX(CASE WHEN rn = 1 THEN ${outerQueryCol} END) as canonicalName, GROUP_CONCAT(CASE WHEN rn <= 10 THEN ${outerQueryCol} || ':::' || clicks || ':::' || impressions || ':::' || CAST(ROUND(CAST(sum_pos AS REAL) / NULLIF(impressions, 0) + 1, 1) AS TEXT) END, '||') as variants FROM per_variant GROUP BY joinKey`;
  const { sql: text, params } = compileCollapsed(adapter, extrasQuery);
  return [{
    key: "canonicalExtras",
    sql: text,
    params
  }];
}
1014
/**
 * Logical datasets and their dimensions. Each dimension binding names the
 * backing column and lists the surfaces it is tagged with.
 */
const LOGICAL_DATASETS = (() => {
  // Each factory call returns a fresh binding, so no object or array is shared between entries.
  const apiStored = (column) => ({ column, surfaces: ["api", "stored"] });
  const storedDerived = (column) => ({ column, surfaces: ["stored", "derived"] });
  return {
    pages: { dimensions: {
      page: apiStored("url"),
      date: apiStored("date")
    } },
    keywords: { dimensions: {
      query: apiStored("query"),
      queryCanonical: storedDerived("query_canonical"),
      date: apiStored("date")
    } },
    page_keywords: { dimensions: {
      page: apiStored("url"),
      query: apiStored("query"),
      queryCanonical: storedDerived("query_canonical"),
      date: apiStored("date")
    } },
    countries: { dimensions: {
      country: apiStored("country"),
      date: apiStored("date")
    } },
    devices: { dimensions: {
      device: apiStored("device"),
      date: apiStored("date")
    } },
    search_appearance: { dimensions: {
      searchAppearance: apiStored("searchAppearance"),
      date: apiStored("date")
    } }
  };
})();
1088
/**
 * Pick the logical dataset for a set of grouped and filtered dimensions.
 *
 * Precedence: searchAppearance, then page together with query/queryCanonical,
 * then query/queryCanonical alone, then page, country, device. Defaults to
 * "keywords" when none of them is present.
 *
 * @param {Iterable<string>} dimensions
 * @param {Iterable<string>} [filterDims]
 * @returns {string} dataset name
 */
function inferLogicalDataset(dimensions, filterDims = []) {
  const present = new Set([...dimensions, ...filterDims]);
  if (present.has("searchAppearance")) return "search_appearance";
  const hasPage = present.has("page");
  if (present.has("query") || present.has("queryCanonical")) {
    return hasPage ? "page_keywords" : "keywords";
  }
  if (hasPage) return "pages";
  if (present.has("country")) return "countries";
  if (present.has("device")) return "devices";
  return "keywords";
}
1099
/** The metric names this module recognises, in fixed order. */
const METRIC_NAMES = ["clicks", "impressions", "ctr", "position"];
1105
/**
 * Build a SQL expression (INSTR/SUBSTR based) that maps a URL column to its path.
 *
 * Values not starting with `http` pass through unchanged. Otherwise everything
 * from the first `/` after `://` is returned, or `/` when there is none.
 *
 * @param {string} col - column expression, inlined verbatim
 * @returns {string} SQL text
 */
function defaultSqliteUrlToPathExpr(col) {
  const schemePos = `INSTR(${col}, '://')`;
  const afterScheme = `SUBSTR(${col}, ${schemePos} + 3)`;
  const slashPos = `INSTR(${afterScheme}, '/')`;
  const pathFromSlash = `SUBSTR(${col}, ${schemePos} + 2 + ${slashPos})`;
  return `CASE WHEN ${col} LIKE 'http%' THEN CASE WHEN ${slashPos} > 0 THEN ${pathFromSlash} ELSE '/' END ELSE ${col} END`;
}
1108
/**
 * Build `{ [tableKey]: { [dimension]: columnName } }` from a dataset-to-table
 * mapping, using the bindings in LOGICAL_DATASETS. A dimension whose binding
 * has no `column` maps to its own name.
 * @param {Record<string, string>} datasetToTableKey
 * @returns {Record<string, Record<string, string>>}
 */
function buildDimensionColumnMap(datasetToTableKey) {
  const perTable = [];
  for (const [dataset, tableKey] of Object.entries(datasetToTableKey)) {
    const bindings = Object.entries(LOGICAL_DATASETS[dataset].dimensions);
    const columns = Object.fromEntries(bindings.map(([dim, binding]) => [dim, binding?.column ?? dim]));
    perTable.push([tableKey, columns]);
  }
  return Object.fromEntries(perTable);
}
1115
/**
 * Create the SQL fragment builders for one schema / dialect configuration.
 *
 * @param {object} config
 * @param {object} config.schema - tables keyed by table key; columns are read as `schema[tableKey][column]`
 * @param {Record<string, string>} config.datasetToTableKey
 * @param {string} config.metricCast - type name inlined raw into `CAST(... AS <type>)`
 * @param {Function} config.regexPredicate - `(columnSql, pattern, negate) => SQL`
 * @param {string} config.tableLabel - prefix for error messages
 * @param {boolean} config.includeSiteId - when falsy, the returned `siteIdColRef` is undefined
 * @param {Function} [config.urlToPathExpr] - overrides defaultSqliteUrlToPathExpr
 * @param {Function} [config.tableRef] - overrides the schema-based table reference
 * @returns {object} the fragment builders plus METRIC_NAMES and DIM_COLUMN_MAP
 */
function createSqlFragments(config) {
  const {
    schema,
    datasetToTableKey,
    metricCast,
    regexPredicate,
    tableLabel,
    includeSiteId,
    urlToPathExpr: urlToPathExprOverride,
    tableRef: tableRefOverride
  } = config;
  const DIM_COLUMN_MAP = buildDimensionColumnMap(datasetToTableKey);
  const urlToPathExpr = urlToPathExprOverride ?? defaultSqliteUrlToPathExpr;

  /** True when `dim` is one of METRIC_NAMES. */
  function isMetricDimension(dim) {
    return METRIC_NAMES.includes(dim);
  }

  /** Column name backing `dim` on `table`; falls back to the dimension name itself. */
  function dimColumn(dim, table) {
    const mapped = DIM_COLUMN_MAP[table]?.[dim];
    return mapped ?? dim;
  }

  function tableKeyForDataset(dataset) {
    return datasetToTableKey[dataset];
  }

  function inferTable(dimensions, filterDims = []) {
    const dataset = inferLogicalDataset(dimensions, filterDims);
    return tableKeyForDataset(dataset);
  }

  /** SQL reference to a schema column; throws when the column is not on the table. */
  function colRef(tableKey, colName) {
    const column = schema[tableKey][colName];
    if (!column) throw new Error(`${tableLabel}: unknown column '${colName}' on ${tableKey}`);
    return sql`${column}`;
  }

  function tableRef(tableKey) {
    return tableRefOverride ? tableRefOverride(tableKey) : sql`${schema[tableKey]}`;
  }

  function dateColRef(tableKey) {
    return colRef(tableKey, "date");
  }

  function siteIdColRef(tableKey) {
    return colRef(tableKey, "site_id");
  }

  /** Select/group expression for a dimension; `page` goes through the URL-to-path expression. */
  function dimExprSql(dim, tableKey) {
    const colName = dimColumn(dim, tableKey);
    return dim === "page" ? sql.raw(urlToPathExpr(colName)) : colRef(tableKey, colName);
  }

  /** Aggregate expression for a metric; returns undefined for an unknown metric name. */
  function metricSql(metric, tableKey) {
    const t = schema[tableKey];
    switch (metric) {
      case "clicks":
        return sql`CAST(SUM(${t.clicks}) AS ${sql.raw(metricCast)})`;
      case "impressions":
        return sql`CAST(SUM(${t.impressions}) AS ${sql.raw(metricCast)})`;
      case "ctr":
        return sql`CAST(SUM(${t.clicks}) AS ${sql.raw(metricCast)}) / NULLIF(SUM(${t.impressions}), 0)`;
      case "position":
        return sql`SUM(${t.sum_position}) / NULLIF(SUM(${t.impressions}), 0) + 1`;
    }
  }

  /** HAVING predicates for metric filters; non-metric filters and unknown operators add nothing. */
  function havingPredicates(filters, tableKey) {
    const predicates = [];
    for (const filter of filters) {
      const metric = filter.dimension;
      if (!isMetricDimension(metric)) continue;
      const aggregate = metricSql(metric, tableKey);
      const bound = Number(filter.expression);
      switch (filter.operator) {
        case "metricGte":
          predicates.push(sql`${aggregate} >= ${bound}`);
          break;
        case "metricGt":
          predicates.push(sql`${aggregate} > ${bound}`);
          break;
        case "metricLte":
          predicates.push(sql`${aggregate} <= ${bound}`);
          break;
        case "metricLt":
          predicates.push(sql`${aggregate} < ${bound}`);
          break;
        case "metricBetween": {
          const upper = Number(filter.expression2);
          predicates.push(sql`${aggregate} >= ${bound} AND ${aggregate} <= ${upper}`);
          break;
        }
      }
    }
    return predicates;
  }

  /**
   * WHERE predicates for dimension filters. Metric, `date` and `topLevel`
   * filters are skipped here. For `page`, equals/notEquals compare against
   * the path expression, while the LIKE and regex operators use the raw column.
   */
  function dimensionPredicates(filters, tableKey) {
    const predicates = [];
    for (const filter of filters) {
      const skipped = isMetricDimension(filter.dimension) || filter.dimension === "date" || filter.operator === "topLevel";
      if (skipped) continue;
      const rawColumn = colRef(tableKey, dimColumn(filter.dimension, tableKey));
      const matchExpr = filter.dimension === "page" ? dimExprSql(filter.dimension, tableKey) : rawColumn;
      switch (filter.operator) {
        case "equals":
          predicates.push(sql`${matchExpr} = ${filter.expression}`);
          break;
        case "notEquals":
          predicates.push(sql`${matchExpr} != ${filter.expression}`);
          break;
        case "contains":
          predicates.push(sql`${rawColumn} LIKE ${`%${escapeLike(filter.expression)}%`} ESCAPE '\\'`);
          break;
        case "notContains":
          predicates.push(sql`${rawColumn} NOT LIKE ${`%${escapeLike(filter.expression)}%`} ESCAPE '\\'`);
          break;
        case "includingRegex":
          predicates.push(regexPredicate(rawColumn, filter.expression, false));
          break;
        case "excludingRegex":
          predicates.push(regexPredicate(rawColumn, filter.expression, true));
          break;
      }
    }
    return predicates;
  }

  /** When a `topLevel` filter is present: predicate keeping paths that contain at most one `/`. */
  function topLevelPredicate(filters, tableKey) {
    const requested = filters.some((filter) => filter.operator === "topLevel");
    if (!requested) return void 0;
    const pathExpr = dimExprSql("page", tableKey);
    return sql`LENGTH(${pathExpr}) - LENGTH(REPLACE(${pathExpr}, '/', '')) <= 1`;
  }

  return {
    METRIC_NAMES,
    DIM_COLUMN_MAP,
    isMetricDimension,
    tableKeyForDataset,
    dimColumn,
    inferTable,
    urlToPathExpr,
    colRef,
    tableRef,
    dateColRef,
    siteIdColRef: includeSiteId ? siteIdColRef : void 0,
    dimExprSql,
    metricSql,
    havingPredicates,
    dimensionPredicates,
    topLevelPredicate
  };
}
1244
/**
 * Assemble a resolver adapter: the SQL fragment builders from
 * `createSqlFragments(config)` plus `capabilities`, `schema` and `compile`
 * taken straight from the config.
 */
function createResolverAdapter(config) {
  const {
    METRIC_NAMES,
    tableKeyForDataset,
    inferTable,
    dimColumn,
    isMetricDimension,
    tableRef,
    dateColRef,
    urlToPathExpr,
    siteIdColRef,
    dimExprSql,
    metricSql,
    dimensionPredicates,
    havingPredicates,
    topLevelPredicate
  } = createSqlFragments(config);
  const { capabilities, schema, compile } = config;
  return {
    METRIC_NAMES,
    capabilities,
    schema,
    tableKeyForDataset,
    inferTable,
    dimColumn,
    isMetricDimension,
    tableRef,
    dateColRef,
    urlToPathExpr,
    siteIdColRef,
    dimExprSql,
    metricSql,
    dimensionPredicates,
    havingPredicates,
    topLevelPredicate,
    compile
  };
}
1266
/** Dialect instance used by compilePg to turn query objects into SQL text + params. */
const pgDialect = new PgDialect();
// Result is discarded; the construction is kept because it runs at module load.
new SQLiteAsyncDialect();
/**
 * Compile a query with the Pg dialect, keeping only `sql` and `params`.
 * @returns {{sql: string, params: unknown}}
 */
function compilePg(query) {
  const { sql: text, params } = pgDialect.sqlToQuery(query);
  return { sql: text, params };
}
1275
/**
 * Shared adapter configuration: every dataset maps to a table key of the same
 * name, metrics are cast to DOUBLE, regex filters use `regexp_matches`, the URL
 * path is extracted with `regexp_replace`, no site-id column is used, and
 * queries are compiled with compilePg.
 */
const PG_BASE_CONFIG = {
  schema: drizzleSchema,
  datasetToTableKey: {
    pages: "pages",
    keywords: "keywords",
    page_keywords: "page_keywords",
    countries: "countries",
    devices: "devices",
    search_appearance: "search_appearance"
  },
  metricCast: "DOUBLE",
  regexPredicate(expr, pattern, negate) {
    if (negate) return sql`NOT regexp_matches(${expr}, ${pattern})`;
    return sql`regexp_matches(${expr}, ${pattern})`;
  },
  // Strips scheme + host from http(s) URLs; an empty remainder becomes '/'.
  urlToPathExpr(col) {
    return `CASE WHEN ${col} LIKE 'http%' THEN COALESCE(NULLIF(regexp_replace(${col}, '^https?://[^/]+', ''), ''), '/') ELSE ${col} END`;
  },
  includeSiteId: false,
  compile: compilePg,
  capabilities: {
    regex: true,
    comparisonJoin: true,
    windowTotals: true
  }
};
// The adapter built here is discarded; the call is kept as-is.
createResolverAdapter({
  ...PG_BASE_CONFIG,
  tableLabel: "pg-resolver-adapter"
});
1300
/**
 * Create a resolver adapter from PG_BASE_CONFIG whose table reference is
 * `read_parquet({{FILES}}, union_by_name = true) AS "<tableKey>"`.
 * The `{{FILES}}` placeholder is emitted literally by this function.
 */
function createParquetResolverAdapter() {
  const parquetTableRef = (tk) => sql.raw(`read_parquet({{FILES}}, union_by_name = true) AS "${tk}"`);
  return createResolverAdapter({
    ...PG_BASE_CONFIG,
    tableLabel: "parquet-resolver-adapter",
    tableRef: parquetTableRef
  });
}
1307
/** Tables that purgeUrls rewrites. */
const URL_PURGE_TABLES = ["pages", "page_keywords"];
/** Upper bound on the size of a single written day object (100 MiB). */
const MAX_DAY_BYTES = 100 * 1024 * 1024;
/** `"<table>:url"` markers for every table in SCHEMAS that declares a `url` column. */
const URL_COLUMNS = /* @__PURE__ */ new Set();
for (const tableName of Object.keys(SCHEMAS)) {
  for (const column of SCHEMAS[tableName].columns) {
    if (column.name !== "url") continue;
    URL_COLUMNS.add(`${tableName}:url`);
  }
}
1311
/**
 * Return `row` with its `url` passed through normalizeUrl.
 *
 * The same row object is returned when the table has no `url` column marker,
 * when `row.url` is not a string, or when normalization changes nothing.
 * Otherwise a shallow copy with the normalized url is returned.
 */
function normalizeRow(table, row) {
  const tableHasUrl = URL_COLUMNS.has(`${table}:url`);
  if (!tableHasUrl || typeof row.url !== "string") return row;
  const original = row.url;
  const normalized = normalizeUrl(original);
  return normalized === original ? row : { ...row, url: normalized };
}
1322
- function createStorageEngine(opts) {
1323
- const { dataSource, manifestStore, codec, executor } = opts;
1324
- const defaultNow = opts.now ?? (() => Date.now());
1325
/**
 * Write one day's rows for `ctx.table` as a new versioned object and register
 * it in the manifest, all under the partition lock.
 *
 * Inside the lock: live entries of the same partition whose inferred search
 * type matches are collected as superseded, rows are normalized and written,
 * the size ceiling is enforced, the new version is registered, and the
 * watermark is bumped.
 *
 * @param {object} ctx - requires `date`; also reads userId, siteId, table, searchType and optional `now`
 * @param {Array<object>} rows
 * @throws {Error} when `ctx.date` is missing or the written object exceeds MAX_DAY_BYTES
 */
async function writeDay(ctx, rows) {
  if (!ctx.date) throw new Error("writeDay requires ctx.date");
  const { date, searchType } = ctx;
  const now = (ctx.now ?? defaultNow)();
  const partition = dayPartition(date);
  const lockScope = {
    userId: ctx.userId,
    siteId: ctx.siteId,
    table: ctx.table,
    partition
  };
  return manifestStore.withLock(lockScope, async () => {
    const live = await manifestStore.listLive({
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table,
      partitions: [partition]
    });
    const superseding = live.filter((e) => inferSearchType(e) === inferSearchType({ searchType }));
    const normalizedRows = rows.map((r) => normalizeRow(ctx.table, r));
    const key = objectKey(ctx, ctx.table, partition, now, searchType);
    const written = await codec.writeRows({ table: ctx.table }, normalizedRows, key, dataSource);
    const { rowCount } = written;
    let bytes = written.bytes;
    // The codec reported 0 bytes for a non-empty write: ask the data source for the size, if it can.
    if (bytes === 0 && rowCount > 0 && dataSource.head) {
      const probed = await dataSource.head(key);
      if (probed) bytes = probed.bytes;
    }
    if (bytes > MAX_DAY_BYTES) {
      // Best-effort cleanup of the oversized object; a failed delete must not mask the size error.
      await dataSource.delete([key]).catch(() => {});
      throw new Error(`writeDay payload ${bytes} bytes exceeds ${MAX_DAY_BYTES} hard ceiling (table=${ctx.table}, key=${key})`);
    }
    const entry = {
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table,
      partition,
      objectKey: key,
      rowCount,
      bytes,
      createdAt: now,
      schemaVersion: currentSchemaVersion(ctx.table),
      tier: "raw",
      ...searchType !== void 0 ? { searchType } : {}
    };
    await manifestStore.registerVersion(entry, superseding);
    await manifestStore.bumpWatermark({
      userId: ctx.userId,
      siteId: ctx.siteId,
      table: ctx.table
    }, date, now);
  });
}
1376
/**
 * Resolve each named file set to its live object keys and run a SQL statement
 * through the executor.
 *
 * @param {object} opts
 * @param {{userId: string, siteId?: string}} opts.ctx - tenant used for manifest lookups
 * @param {Record<string, {table: string, partitions?: string[]}>} opts.fileSets
 * @param {string} opts.sql
 * @param {unknown[]} [opts.params] - defaults to []
 * @param {string} [opts.table] - required when the file sets span several tables;
 *   otherwise defaults to the table of the first file set
 * @param {AbortSignal} [opts.signal] - checked before and after the manifest lookups
 * @returns {Promise<{rows: unknown, sql: unknown, objectKeys: string[]}>}
 *   `objectKeys` is the de-duplicated union over all file sets
 * @throws {Error} when no table can be determined
 */
async function runSQL(opts) {
  opts.signal?.throwIfAborted();
  const entries = Object.entries(opts.fileSets);
  const perSet = await Promise.all(entries.map(async ([name, ref]) => {
    const live = await manifestStore.listLive({
      userId: opts.ctx.userId,
      siteId: opts.ctx.siteId,
      table: ref.table,
      partitions: ref.partitions
    });
    return [name, live.map((e) => e.objectKey)];
  }));
  opts.signal?.throwIfAborted();
  const fileKeys = Object.fromEntries(perSet);
  const uniqueKeys = [...new Set(perSet.flatMap(([, keys]) => keys))];
  let table = opts.table;
  if (!table) {
    const referencedTables = new Set(entries.map(([, ref]) => ref.table));
    // The table option is read from `opts.table`, so the message names that (it used to say `ctx.table`).
    if (referencedTables.size > 1) throw new Error("runSQL requires explicit opts.table when fileSets reference multiple tables.");
    table = entries[0]?.[1].table;
  }
  if (!table) throw new Error("runSQL requires at least one fileSet or an explicit table");
  const result = await executor.execute({
    sql: opts.sql,
    params: opts.params ?? [],
    fileKeys,
    dataSource,
    table,
    signal: opts.signal
  });
  return {
    rows: result.rows,
    sql: result.sql,
    objectKeys: uniqueKeys
  };
}
1411
/**
 * Plan, compile and run a single query against the live files of one table.
 * The table is `ctx.table` when set, otherwise the plan's dataset.
 * @param {object} ctx - reads userId, siteId, optional table and signal
 * @param {object} state - query state handed to buildLogicalPlan
 */
async function query(ctx, state) {
  const plan = buildLogicalPlan(state, { regex: true });
  const table = ctx.table ?? plan.dataset;
  const { partitions, sql: text, params } = compileLogicalQueryPlan(plan, table);
  const tenant = {
    userId: ctx.userId,
    siteId: ctx.siteId
  };
  return runSQL({
    ctx: tenant,
    table,
    fileSets: { FILES: { table, partitions } },
    sql: text,
    params,
    signal: ctx.signal
  });
}
1430
/**
 * Run a period-over-period comparison: the comparison rows, their total count
 * and the current-period totals are executed concurrently over one file set
 * that spans both date ranges.
 *
 * @param {object} ctx - reads userId, siteId, optional table and signal
 * @param {object} current - current-period query state
 * @param {object} previous - previous-period query state
 * @param {string} [filter] - comparison filter name forwarded to resolveComparisonSQL
 * @returns {Promise<{rows: unknown, totalCount: number, totals: object}>}
 * @throws {Error} when the two states resolve to different datasets
 */
async function queryComparison(ctx, current, previous, filter) {
  const adapter = createParquetResolverAdapter();
  const currentPlan = buildLogicalPlan(current, adapter.capabilities);
  const previousPlan = buildLogicalPlan(previous, adapter.capabilities);
  if (currentPlan.dataset !== previousPlan.dataset) throw new Error(`queryComparison: current (${currentPlan.dataset}) and previous (${previousPlan.dataset}) must resolve to the same table`);
  const table = ctx.table ?? currentPlan.dataset;
  const comparison = resolveComparisonSQL(current, previous, {
    adapter,
    siteId: void 0
  }, filter);
  const totals = buildTotalsSql(current, {
    adapter,
    siteId: void 0
  });

  // Cover both periods: earliest start through latest end.
  const currentRange = currentPlan.dateRange;
  const previousRange = previousPlan.dateRange;
  const earliestStart = currentRange.startDate < previousRange.startDate ? currentRange.startDate : previousRange.startDate;
  const latestEnd = currentRange.endDate > previousRange.endDate ? currentRange.endDate : previousRange.endDate;
  const fileSets = { FILES: {
    table,
    partitions: enumeratePartitions(earliestStart, latestEnd)
  } };
  const baseCtx = {
    userId: ctx.userId,
    siteId: ctx.siteId
  };
  const run = (text, params) => runSQL({
    ctx: baseCtx,
    table,
    fileSets,
    sql: text,
    params,
    signal: ctx.signal
  });
  const [main, count, totalsRow] = await Promise.all([
    run(comparison.sql, comparison.params),
    run(comparison.countSql, comparison.countParams),
    run(totals.sql, totals.params)
  ]);
  return {
    rows: main.rows,
    totalCount: Number(count.rows[0]?.total ?? 0),
    totals: totalsRow.rows[0] ?? {}
  };
}
1484
/**
 * Run the auxiliary queries produced by buildExtrasQueries for a state.
 * @param {object} ctx - reads userId, siteId, optional table and signal
 * @param {object} state
 * @returns {Promise<Array<{key: string, rows: unknown}>>} one entry per extras query, in the same order; empty when there are none
 */
async function queryExtras(ctx, state) {
  const adapter = createParquetResolverAdapter();
  const extras = buildExtrasQueries(state, {
    adapter,
    siteId: void 0
  });
  if (extras.length === 0) return [];
  const plan = buildLogicalPlan(state, adapter.capabilities);
  const table = ctx.table ?? plan.dataset;
  const { startDate, endDate } = plan.dateRange;
  const fileSets = { FILES: {
    table,
    partitions: enumeratePartitions(startDate, endDate)
  } };
  const baseCtx = {
    userId: ctx.userId,
    siteId: ctx.siteId
  };
  const pending = extras.map((extra) => runSQL({
    ctx: baseCtx,
    table,
    fileSets,
    sql: extra.sql,
    params: extra.params,
    signal: ctx.signal
  }));
  const results = await Promise.all(pending);
  return extras.map(({ key }, index) => ({
    key,
    rows: results[index].rows
  }));
}
1514
/**
 * Delegate to compactTieredImpl with this engine's dataSource, manifestStore
 * and codec, using `ctx.now` (when provided) or the engine clock for "now".
 */
async function compactTiered(ctx, thresholds) {
  const deps = { dataSource, manifestStore, codec };
  const now = (ctx.now ?? defaultNow)();
  return compactTieredImpl(deps, ctx, now, thresholds);
}
1521
/**
 * Delegate to gcOrphansImpl for the tenant in `ctx`, using `ctx.now` (when
 * provided) or the engine clock for "now".
 * @param {object} ctx - reads userId, siteId and optional `now`
 * @param {number} graceMs
 */
async function gcOrphans(ctx, graceMs) {
  const deps = { dataSource, manifestStore };
  const now = (ctx.now ?? defaultNow)();
  const tenant = {
    userId: ctx.userId,
    siteId: ctx.siteId
  };
  return gcOrphansImpl(deps, now, graceMs, tenant);
}
1530
/**
 * Delete every object under the tenant's prefix, then purge the tenant from
 * the manifest store.
 * @param {object} ctx - reads userId and siteId
 * @returns {Promise<object>} a report with the prefix, the number of objects deleted, the manifest
 *   store's removal counts, and the engine clock's timestamp in `at`
 */
async function purgeTenant(ctx) {
  const prefix = tenantPrefix(ctx);
  // Prefer the streaming listing; otherwise adapt list() lazily to the same shape.
  const keyStream = dataSource.streamList ? dataSource.streamList(prefix) : async function* () {
    for (const listed of await dataSource.list(prefix)) yield listed;
  }();
  const keys = [];
  for await (const key of keyStream) keys.push(key);
  if (keys.length > 0) await dataSource.delete(keys);
  const { entriesRemoved, watermarksRemoved, syncStatesRemoved } = await manifestStore.purgeTenant({
    userId: ctx.userId,
    siteId: ctx.siteId
  });
  return {
    userId: ctx.userId,
    siteId: ctx.siteId,
    prefix,
    objectsDeleted: keys.length,
    entriesRemoved,
    watermarksRemoved,
    syncStatesRemoved,
    at: defaultNow()
  };
}
1553
/**
 * Remove every row whose `url` matches one of `urls` from the live files of
 * the URL_PURGE_TABLES tables, rewriting each affected file as a new version
 * that supersedes the old entry.
 *
 * Rows are stored with their url passed through normalizeUrl (see
 * normalizeRow), so each requested url is matched both as given and in its
 * normalized form. Matching only the raw string would silently miss rows
 * when the caller passes a non-normalized url.
 *
 * @param {object} ctx - reads userId and siteId
 * @param {Iterable<string>} urls
 * @returns {Promise<object>} report; `urlsRequested` counts the distinct urls as passed in
 */
async function purgeUrls(ctx, urls) {
  const now = defaultNow();
  const requested = new Set(urls);
  let entriesRewritten = 0;
  let rowsRemoved = 0;
  let bytesAfter = 0;
  if (requested.size === 0) return {
    userId: ctx.userId,
    siteId: ctx.siteId,
    urlsRequested: 0,
    entriesRewritten: 0,
    rowsRemoved: 0,
    bytesAfter: 0,
    at: now
  };
  // Raw forms are kept as well, in case some stored rows were never normalized.
  const urlSet = new Set(requested);
  for (const url of requested) {
    if (typeof url === "string") urlSet.add(normalizeUrl(url));
  }
  for (const table of URL_PURGE_TABLES) {
    const entries = await manifestStore.listLive({
      userId: ctx.userId,
      siteId: ctx.siteId,
      table
    });
    for (const entry of entries) await manifestStore.withLock({
      userId: entry.userId,
      siteId: entry.siteId,
      table,
      partition: entry.partition
    }, async () => {
      const rows = await codec.readRows({ table }, entry.objectKey, dataSource);
      const kept = rows.filter((r) => typeof r.url !== "string" || !urlSet.has(r.url));
      const removed = rows.length - kept.length;
      // Untouched files are left as they are: no rewrite, no new version.
      if (removed === 0) return;
      const searchType = entry.searchType;
      const newKey = objectKey({
        userId: entry.userId,
        siteId: entry.siteId
      }, table, entry.partition, now, searchType);
      const { bytes, rowCount } = await codec.writeRows({ table }, kept, newKey, dataSource);
      const newEntry = {
        userId: entry.userId,
        siteId: entry.siteId,
        table,
        partition: entry.partition,
        objectKey: newKey,
        rowCount,
        bytes,
        createdAt: now,
        schemaVersion: entry.schemaVersion ?? currentSchemaVersion(table),
        ...entry.tier !== void 0 ? { tier: entry.tier } : {},
        ...searchType !== void 0 ? { searchType } : {}
      };
      await manifestStore.registerVersion(newEntry, [entry]);
      entriesRewritten++;
      rowsRemoved += removed;
      bytesAfter += bytes;
    });
  }
  return {
    userId: ctx.userId,
    siteId: ctx.siteId,
    urlsRequested: requested.size,
    entriesRewritten,
    rowsRemoved,
    bytesAfter,
    at: now
  };
}
1619
- return {
1620
- writeDay,
1621
- query,
1622
- queryComparison,
1623
- queryExtras,
1624
- runSQL,
1625
- compactTiered,
1626
- gcOrphans,
1627
- purgeTenant,
1628
- purgeUrls,
1629
- listLive: (filter) => manifestStore.listLive(filter),
1630
- listAll: (filter) => manifestStore.listAll(filter),
1631
- getWatermarks: (filter) => manifestStore.getWatermarks(filter),
1632
- getSyncStates: (filter) => manifestStore.getSyncStates(filter),
1633
- setSyncState: (scope, state, detail) => manifestStore.setSyncState(scope, state, detail),
1634
- readObject: (key) => dataSource.read(key)
1635
- };
1636
- }
1637
// Matches a leading `YYYY-MM-` prefix; capture group 1 is the four-digit
// year and capture group 2 is the two-digit month.
const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
1638
/**
 * Hashes a string into a fixed-width 16-character lowercase hex digest.
 *
 * The state is two 32-bit words (`hi`, `lo`). Each UTF-16 code unit of the
 * input is XOR-ed into `lo`, after which both words are multiplied by 435
 * with a carry propagated from the low word into the high word.
 *
 * @param {string} url - Input to hash.
 * @returns {string} `hi` followed by `lo`, each zero-padded to 8 hex digits.
 */
function hashUrl(url) {
	const MULTIPLIER = 435;
	const WORD_SPAN = 4294967296;
	const toHexWord = (word) => (word >>> 0).toString(16).padStart(8, "0");
	let hi = 2166136261;
	let lo = 3421674724;
	let index = 0;
	while (index < url.length) {
		// XOR coerces `lo` to a signed 32-bit integer; the carry below is
		// intentionally derived from that signed value.
		lo ^= url.charCodeAt(index);
		const carry = Math.floor(lo * MULTIPLIER / WORD_SPAN);
		hi = (Math.imul(hi, MULTIPLIER) + (lo | 0) + carry) >>> 0;
		lo = Math.imul(lo, MULTIPLIER) >>> 0;
		index += 1;
	}
	return toHexWord(hi) + toHexWord(lo);
}
1652
/**
 * Builds the object key of the inspections SQLite database for a tenant.
 *
 * @param {object} ctx - Tenant context; `siteId` is included in the path only when truthy.
 * @returns {string} `u_<userId>/[<siteId>/]entities/inspections/inspections.db`.
 */
function inspectionSqliteKey(ctx) {
	const segments = [`u_${ctx.userId}`];
	if (ctx.siteId) segments.push(`${ctx.siteId}`);
	segments.push("entities", "inspections", "inspections.db");
	return segments.join("/");
}
1655
// Idempotent DDL (every statement uses IF NOT EXISTS) for the inspections
// database:
//   - `inspections`: one row per `url_hash` (primary key).
//   - `inspection_history`: rows keyed by (year_month, url_hash, inspected_at),
//     with an index on `year_month`.
const INSPECTION_SCHEMA_SQL = `
CREATE TABLE IF NOT EXISTS inspections (
url_hash TEXT PRIMARY KEY,
url TEXT NOT NULL,
inspected_at TEXT NOT NULL,
index_status TEXT,
last_crawl_time TEXT,
google_canonical TEXT,
user_canonical TEXT,
coverage_state TEXT,
robots_txt_state TEXT,
indexing_state TEXT,
page_fetch_state TEXT,
mobile_usability_verdict TEXT,
rich_results_verdict TEXT,
raw TEXT
);
CREATE TABLE IF NOT EXISTS inspection_history (
year_month TEXT NOT NULL,
url_hash TEXT NOT NULL,
url TEXT NOT NULL,
inspected_at TEXT NOT NULL,
payload TEXT NOT NULL,
PRIMARY KEY (year_month, url_hash, inspected_at)
);
CREATE INDEX IF NOT EXISTS inspection_history_by_month ON inspection_history(year_month);
`;
1682
/**
 * Converts a snake_case `inspections` table row into a camelCase record.
 *
 * `url` and `inspectedAt` are always present. Every other column is copied
 * only when it is neither `null` nor `undefined`. `raw` is stored as JSON
 * text and is parsed back into a value.
 *
 * @param {object} r - Row as returned by the SQLite driver.
 * @returns {object} The inspection record.
 * @throws {SyntaxError} If `r.raw` is present but is not valid JSON.
 */
function rowToRecord(r) {
	const OPTIONAL_COLUMNS = [
		["index_status", "indexStatus"],
		["last_crawl_time", "lastCrawlTime"],
		["google_canonical", "googleCanonical"],
		["user_canonical", "userCanonical"],
		["coverage_state", "coverageState"],
		["robots_txt_state", "robotsTxtState"],
		["indexing_state", "indexingState"],
		["page_fetch_state", "pageFetchState"],
		["mobile_usability_verdict", "mobileUsabilityVerdict"],
		["rich_results_verdict", "richResultsVerdict"]
	];
	const record = {
		url: r.url,
		inspectedAt: r.inspected_at
	};
	for (const [column, field] of OPTIONAL_COLUMNS) {
		const value = r[column];
		if (value !== null && value !== undefined) record[field] = value;
	}
	const rawText = r.raw;
	if (rawText !== null && rawText !== undefined) record.raw = JSON.parse(rawText);
	return record;
}
1700
/**
 * Derives the history shard name for a record from its `inspectedAt` value.
 *
 * @param {object} record - Record whose `inspectedAt` is matched against `YEAR_MONTH_RE`.
 * @returns {string} `"YYYY-MM"` when the value matches, otherwise `"unknown"`.
 */
function shardForRecord(record) {
	const match = YEAR_MONTH_RE.exec(record.inspectedAt);
	if (!match) return "unknown";
	const [, year, month] = match;
	return `${year}-${month}`;
}
1704
/**
 * Creates an inspection store backed by a single SQLite database file per
 * tenant, held in `opts.dataSource` under the key from `inspectionSqliteKey`.
 *
 * Every operation loads the database bytes, opens a driver on them, applies
 * `INSPECTION_SCHEMA_SQL`, runs its queries, and (for writes) serializes the
 * database and writes it back to the data source.
 *
 * @param {object} opts
 * @param {object} opts.dataSource - Provides `read(key)` and `write(key, bytes)`.
 * @param {Function} opts.openDriver - `(bytes | undefined) => driver`; the driver
 *   provides `exec`, `run`, `all`, `serialize` and `close`.
 * @param {Function} [opts.hash] - URL hashing function; defaults to `hashUrl`.
 * @returns {{writeBatch: Function, getLatest: Function, loadIndex: Function, loadHistory: Function}}
 */
function createInspectionStoreSqlite(opts) {
	const ds = opts.dataSource;
	const hash = opts.hash ?? hashUrl;

	/**
	 * Opens the tenant database, runs `fn` against it and always closes the
	 * driver afterwards, whether `fn` (or persisting) succeeds or throws.
	 *
	 * @param {object} ctx - Tenant context used to derive the object key.
	 * @param {Function} fn - Async callback receiving the open driver.
	 * @param {boolean} persist - When true, the database is serialized and written back.
	 * @returns {Promise<*>} The value `fn` resolved with.
	 */
	async function withDriver(ctx, fn, persist) {
		const key = inspectionSqliteKey(ctx);
		// NOTE(review): every read failure is treated as "no database yet".
		// If `read` can reject for reasons other than a missing object, a
		// persisting call would overwrite the stored file with a fresh
		// database. Confirm the data source's error contract.
		const bytes = await ds.read(key).catch(() => void 0);
		const driver = await opts.openDriver(bytes);
		let result;
		try {
			await driver.exec(INSPECTION_SCHEMA_SQL);
			result = await fn(driver);
			if (persist) {
				const out = await driver.serialize();
				await ds.write(key, out);
			}
		} catch (err) {
			// Release the driver on failure, but never let a secondary close
			// error mask the failure that actually happened.
			try {
				await driver.close();
			} catch {}
			throw err;
		}
		await driver.close();
		return result;
	}

	return {
		/**
		 * Upserts the latest inspection per URL and records each inspection
		 * in the history table, then persists the database.
		 * An empty batch is a no-op.
		 */
		async writeBatch(ctx, records) {
			if (records.length === 0) return;
			await withDriver(ctx, async (driver) => {
				for (const r of records) {
					const h = hash(r.url);
					await driver.run(`INSERT INTO inspections (
url_hash, url, inspected_at, index_status, last_crawl_time,
google_canonical, user_canonical, coverage_state, robots_txt_state,
indexing_state, page_fetch_state, mobile_usability_verdict,
rich_results_verdict, raw
) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)
ON CONFLICT(url_hash) DO UPDATE SET
url = excluded.url,
inspected_at = excluded.inspected_at,
index_status = excluded.index_status,
last_crawl_time = excluded.last_crawl_time,
google_canonical = excluded.google_canonical,
user_canonical = excluded.user_canonical,
coverage_state = excluded.coverage_state,
robots_txt_state = excluded.robots_txt_state,
indexing_state = excluded.indexing_state,
page_fetch_state = excluded.page_fetch_state,
mobile_usability_verdict = excluded.mobile_usability_verdict,
rich_results_verdict = excluded.rich_results_verdict,
raw = excluded.raw`, [
						h,
						r.url,
						r.inspectedAt,
						r.indexStatus ?? null,
						r.lastCrawlTime ?? null,
						r.googleCanonical ?? null,
						r.userCanonical ?? null,
						r.coverageState ?? null,
						r.robotsTxtState ?? null,
						r.indexingState ?? null,
						r.pageFetchState ?? null,
						r.mobileUsabilityVerdict ?? null,
						r.richResultsVerdict ?? null,
						r.raw === void 0 ? null : JSON.stringify(r.raw)
					]);
					await driver.run(`INSERT OR REPLACE INTO inspection_history
(year_month, url_hash, url, inspected_at, payload)
VALUES (?,?,?,?,?)`, [
						shardForRecord(r),
						h,
						r.url,
						r.inspectedAt,
						JSON.stringify(r)
					]);
				}
			}, true);
		},
		/** Returns the latest record for `url`, or `undefined` when none is stored. */
		async getLatest(ctx, url) {
			return await withDriver(ctx, async (driver) => {
				const rows = await driver.all("SELECT * FROM inspections WHERE url_hash = ? LIMIT 1", [hash(url)]);
				return rows.length === 0 ? void 0 : rowToRecord(rows[0]);
			}, false);
		},
		/** Loads every latest record, keyed by `url_hash`. */
		async loadIndex(ctx) {
			return await withDriver(ctx, async (driver) => {
				const rows = await driver.all("SELECT * FROM inspections", []);
				const records = {};
				for (const r of rows) records[r.url_hash] = rowToRecord(r);
				return {
					version: 1,
					records
				};
			}, false);
		},
		/**
		 * Loads the history for one `year_month` shard ordered by
		 * `inspected_at`, or `undefined` when the shard has no rows.
		 */
		async loadHistory(ctx, yearMonth) {
			return await withDriver(ctx, async (driver) => {
				const rows = await driver.all("SELECT * FROM inspection_history WHERE year_month = ? ORDER BY inspected_at ASC", [yearMonth]);
				if (rows.length === 0) return void 0;
				return {
					version: 1,
					records: rows.map((r) => JSON.parse(r.payload))
				};
			}, false);
		}
	};
}
1802
/**
 * Reduces an absolute URL to its pathname.
 *
 * @param {string} gscUrl - URL string to reduce.
 * @returns {string} The URL's `pathname`, or `gscUrl` unchanged when it
 *   cannot be parsed as an absolute URL.
 */
function toPath(gscUrl) {
	let parsed;
	try {
		parsed = new URL(gscUrl);
	} catch {
		// Not an absolute URL: pass the input through untouched.
		return gscUrl;
	}
	return parsed.pathname;
}
1809
/**
 * Converts a position into an impression-weighted, zero-based sum.
 *
 * @param {number} apiPosition - Position value; 1 is subtracted before weighting.
 * @param {number} impressions - Weight; values below 1 are treated as 1.
 * @returns {number} `(apiPosition - 1) * max(impressions, 1)`.
 */
function toSumPosition(apiPosition, impressions) {
	const zeroBased = apiPosition - 1;
	const weight = Math.max(impressions, 1);
	return zeroBased * weight;
}
1812
/**
 * Converts one API row into a `{ date, row }` pair shaped for `table`.
 *
 * `apiRow.keys` supplies the dimension values. For `pages`, `keywords`,
 * `countries`, `devices` and `search_appearance` the layout is
 * `[dimension, date]`. Any other table name is treated as
 * `[url, query, date]`.
 * Missing metrics default to 0 and missing keys default to `""`.
 *
 * @param {string} table - Target table name.
 * @param {object} apiRow - Row with `keys`, `clicks`, `impressions`, `position`.
 * @param {object} [options]
 * @param {Function} [options.normalizeQuery] - Produces `query_canonical`;
 *   when absent (or returning nullish) the field is `null`.
 * @returns {{date: string, row: object} | null} `null` when `apiRow.keys` is missing or empty.
 */
function transformGscRow(table, apiRow, options = {}) {
	const { keys } = apiRow;
	if (!keys || keys.length === 0) return null;

	const keyAt = (index) => String(keys[index] ?? "");
	const canonicalize = (text) => options.normalizeQuery?.(text) ?? null;

	const clicks = apiRow.clicks || 0;
	const impressions = apiRow.impressions || 0;
	const metrics = {
		clicks,
		impressions,
		sum_position: toSumPosition(apiRow.position || 0, impressions)
	};

	switch (table) {
		case "pages": {
			const date = keyAt(1);
			return { date, row: { url: toPath(keyAt(0)), date, ...metrics } };
		}
		case "keywords": {
			const query = keyAt(0);
			const date = keyAt(1);
			return { date, row: { query, query_canonical: canonicalize(query), date, ...metrics } };
		}
		case "countries": {
			const date = keyAt(1);
			return { date, row: { country: keyAt(0), date, ...metrics } };
		}
		case "devices": {
			const date = keyAt(1);
			return { date, row: { device: keyAt(0), date, ...metrics } };
		}
		case "search_appearance": {
			const date = keyAt(1);
			return { date, row: { searchAppearance: keyAt(0), date, ...metrics } };
		}
		default: {
			// Every remaining table uses the three-key (url, query, date) layout.
			const query = keyAt(1);
			const date = keyAt(2);
			const query_canonical = canonicalize(query);
			return { date, row: { url: toPath(keyAt(0)), query, query_canonical, date, ...metrics } };
		}
	}
}
1901
// Row cap applied when `options.maxRows` is not provided.
const DEFAULT_MAX_ROWS = 5e5;

/**
 * Creates an in-memory accumulator that buckets transformed rows by table
 * and then by date.
 *
 * @param {object} [options] - Also forwarded to `transformGscRow` for every row.
 * @param {number} [options.maxRows] - Row cap; exceeding it marks the accumulator overflowed.
 * @param {boolean} [options.trackDateBoundary] - Must be exactly `true` to enable
 *   `drainCompleted`, which relies on the latest date seen per table.
 * @returns {object} Accumulator exposing `totalRows`, `overflowed`, `push`,
 *   `drain` and `drainCompleted`.
 */
function createRowAccumulator(options = {}) {
	const rowLimit = options.maxRows ?? DEFAULT_MAX_ROWS;
	const tracksBoundary = options.trackDateBoundary === true;
	let pending = new Map();
	const newestByTable = new Map();
	let buffered = 0;
	let hitLimit = false;

	// Returns the row array for (table, date), creating the nesting on demand.
	const rowsFor = (table, date) => {
		if (!pending.has(table)) pending.set(table, new Map());
		const perDate = pending.get(table);
		if (!perDate.has(date)) perDate.set(date, []);
		return perDate.get(date);
	};

	return {
		/** Number of rows currently buffered. */
		get totalRows() {
			return buffered;
		},
		/** True once the buffered row count has exceeded the cap. */
		get overflowed() {
			return hitLimit;
		},
		/**
		 * Transforms and buffers `rows` for `table`. Rows that transform to
		 * nothing, or to an empty date, are skipped.
		 * @returns {boolean} `false` when already overflowed or when this call
		 *   pushes the count past the cap (remaining rows are not consumed).
		 */
		push(table, rows) {
			if (hitLimit) return false;
			for (const apiRow of rows) {
				const transformed = transformGscRow(table, apiRow, options);
				if (!transformed?.date) continue;
				rowsFor(table, transformed.date).push(transformed.row);
				buffered += 1;
				if (tracksBoundary) {
					const newest = newestByTable.get(table);
					if (!newest || transformed.date > newest) newestByTable.set(table, transformed.date);
				}
				if (buffered > rowLimit) {
					hitLimit = true;
					return false;
				}
			}
			return true;
		},
		/** Hands over everything buffered and resets all state, including the overflow flag. */
		drain() {
			const handedOver = pending;
			pending = new Map();
			newestByTable.clear();
			buffered = 0;
			hitLimit = false;
			return handedOver;
		},
		/**
		 * Hands over, per table, only the dates strictly older than the latest
		 * date seen for that table. The latest date stays buffered.
		 * Returns an empty map when date-boundary tracking is off.
		 */
		drainCompleted() {
			const completed = new Map();
			if (!tracksBoundary) return completed;
			for (const [table, perDate] of pending) {
				const newest = newestByTable.get(table);
				if (!newest) continue;
				const finishedDates = [...perDate.keys()].filter((date) => date < newest);
				if (finishedDates.length === 0) continue;
				const finished = new Map();
				completed.set(table, finished);
				for (const date of finishedDates) {
					const dateRows = perDate.get(date);
					finished.set(date, dateRows);
					buffered -= dateRows.length;
					perDate.delete(date);
				}
			}
			return completed;
		}
	};
}
1976
22
  export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createInspectionStoreSqlite, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionSqliteKey, keywords, mondayOfWeek, monthPartition, objectKey, page_keywords, pages, quarterOfMonth, quarterPartition, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow, weekPartition };