@gscdump/engine 0.9.1 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/_chunks/dispatch.mjs +11 -17
- package/dist/_chunks/engine.mjs +622 -0
- package/dist/_chunks/pg-adapter.mjs +6 -15
- package/dist/_chunks/registry.d.mts +137 -15
- package/dist/_chunks/resolver.mjs +2 -25
- package/dist/_chunks/snapshot.d.mts +14 -0
- package/dist/_chunks/storage.d.mts +1 -20
- package/dist/adapters/node.d.mts +91 -0
- package/dist/adapters/node.mjs +133 -0
- package/dist/analyzer/index.d.mts +4 -50
- package/dist/analyzer/index.mjs +17 -8
- package/dist/index.d.mts +2 -2
- package/dist/index.mjs +6 -621
- package/dist/planner.d.mts +1 -1
- package/dist/planner.mjs +1 -1
- package/dist/resolver/index.d.mts +1 -23
- package/dist/resolver/index.mjs +3 -3
- package/dist/rollups.d.mts +163 -0
- package/dist/rollups.mjs +346 -0
- package/dist/snapshot.d.mts +1 -13
- package/dist/source/index.d.mts +30 -8
- package/dist/source/index.mjs +42 -7
- package/package.json +10 -5
- package/dist/_chunks/source-types.d.mts +0 -31
- /package/dist/_chunks/{planner.mjs → compiler.mjs} +0 -0
package/dist/index.mjs
CHANGED
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
import { a as inferTable, c as countries, d as keywords, f as page_keywords, i as dimensionToColumn, l as devices, n as allTables, p as pages, r as currentSchemaVersion, s as TABLE_METADATA, t as SCHEMAS, u as drizzleSchema } from "./_chunks/schema.mjs";
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import { bindLiterals, formatLiteral
|
|
5
|
-
import {
|
|
2
|
+
import { i as inferSearchType, n as dayPartition, r as inferLegacyTier, s as objectKey, t as DEFAULT_SEARCH_TYPE } from "./_chunks/storage.mjs";
|
|
3
|
+
import { i as substituteNamedFiles, o as enumeratePartitions, r as resolveToSQL, t as FILES_PLACEHOLDER } from "./_chunks/compiler.mjs";
|
|
4
|
+
import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
|
|
5
|
+
import { a as createDuckDBExecutor, i as createDuckDBCodec, n as createStorageEngine, r as canonicalEmptyParquetSchema, t as MAX_DAY_BYTES } from "./_chunks/engine.mjs";
|
|
6
6
|
import { createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
|
|
7
|
-
import
|
|
8
|
-
import { normalizeUrl } from "gscdump/normalize";
|
|
7
|
+
import "./planner.mjs";
|
|
9
8
|
function coerceRow(row) {
|
|
10
9
|
let mutated = null;
|
|
11
10
|
for (const [k, v] of Object.entries(row)) if (typeof v === "bigint") {
|
|
@@ -19,618 +18,4 @@ function coerceRows(rows) {
|
|
|
19
18
|
for (let i = 0; i < rows.length; i++) out[i] = coerceRow(rows[i]);
|
|
20
19
|
return out;
|
|
21
20
|
}
|
|
22
|
-
|
|
23
|
-
const inName = db.makeTempPath("json");
|
|
24
|
-
const outName = db.makeTempPath("parquet");
|
|
25
|
-
const jsonBytes = new TextEncoder().encode(JSON.stringify(rows));
|
|
26
|
-
const registered = [];
|
|
27
|
-
await db.registerFileBuffer(inName, jsonBytes);
|
|
28
|
-
registered.push(inName);
|
|
29
|
-
try {
|
|
30
|
-
const sql = rows.length === 0 ? `COPY (SELECT * FROM ${emptyTableSchema(table)} WHERE FALSE) TO '${sqlEscape(outName)}' (FORMAT PARQUET)` : `COPY (SELECT * FROM read_json_auto('${sqlEscape(inName)}', format='array', columns=${columnsJson(table)})) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`;
|
|
31
|
-
await db.query(sql);
|
|
32
|
-
registered.push(outName);
|
|
33
|
-
return await db.copyFileToBuffer(outName);
|
|
34
|
-
} finally {
|
|
35
|
-
await db.dropFiles(registered);
|
|
36
|
-
}
|
|
37
|
-
}
|
|
38
|
-
async function decodeBytes(db, bytes, table) {
|
|
39
|
-
const name = db.makeTempPath("parquet");
|
|
40
|
-
await db.registerFileBuffer(name, bytes);
|
|
41
|
-
try {
|
|
42
|
-
return await db.query(`SELECT * ${dateReplaceClause(table)} FROM read_parquet('${sqlEscape(name)}')`);
|
|
43
|
-
} finally {
|
|
44
|
-
await db.dropFiles([name]);
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
function createDuckDBCodec(factory) {
|
|
48
|
-
return {
|
|
49
|
-
async writeRows(ctx, rows, key, dataSource) {
|
|
50
|
-
const bytes = await encodeBytes(await factory.getDuckDB(), ctx.table, rows);
|
|
51
|
-
await dataSource.write(key, bytes);
|
|
52
|
-
return {
|
|
53
|
-
bytes: bytes.byteLength,
|
|
54
|
-
rowCount: rows.length
|
|
55
|
-
};
|
|
56
|
-
},
|
|
57
|
-
async readRows(ctx, key, dataSource) {
|
|
58
|
-
return decodeBytes(await factory.getDuckDB(), await dataSource.read(key), ctx.table);
|
|
59
|
-
},
|
|
60
|
-
async compactRows(ctx, inputKeys, outputKey, dataSource) {
|
|
61
|
-
const db = await factory.getDuckDB();
|
|
62
|
-
if (inputKeys.length === 0) {
|
|
63
|
-
const bytes = await encodeBytes(db, ctx.table, []);
|
|
64
|
-
await dataSource.write(outputKey, bytes);
|
|
65
|
-
return {
|
|
66
|
-
bytes: bytes.byteLength,
|
|
67
|
-
rowCount: 0
|
|
68
|
-
};
|
|
69
|
-
}
|
|
70
|
-
const inputUris = inputKeys.map((k) => dataSource.uri?.(k));
|
|
71
|
-
if (inputUris.every((u) => u !== void 0)) {
|
|
72
|
-
const outName = db.makeTempPath("parquet");
|
|
73
|
-
const fileList = inputUris.map((u) => `'${sqlEscape(u)}'`).join(", ");
|
|
74
|
-
try {
|
|
75
|
-
await db.query(`COPY (SELECT * FROM read_parquet([${fileList}], union_by_name=true)) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
|
|
76
|
-
const bytes = await db.copyFileToBuffer(outName);
|
|
77
|
-
const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
|
|
78
|
-
const rowCount = Number(countRows[0]?.n ?? 0);
|
|
79
|
-
await dataSource.write(outputKey, bytes);
|
|
80
|
-
return {
|
|
81
|
-
bytes: bytes.byteLength,
|
|
82
|
-
rowCount
|
|
83
|
-
};
|
|
84
|
-
} finally {
|
|
85
|
-
await db.dropFiles([outName]);
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
const inputs = await Promise.all(inputKeys.map((k) => dataSource.read(k)));
|
|
89
|
-
const inNames = [];
|
|
90
|
-
const outName = db.makeTempPath("parquet");
|
|
91
|
-
const registered = [];
|
|
92
|
-
for (let i = 0; i < inputs.length; i++) {
|
|
93
|
-
const name = db.makeTempPath("parquet");
|
|
94
|
-
await db.registerFileBuffer(name, inputs[i]);
|
|
95
|
-
inNames.push(name);
|
|
96
|
-
registered.push(name);
|
|
97
|
-
}
|
|
98
|
-
try {
|
|
99
|
-
const fileList = inNames.map((n) => `'${sqlEscape(n)}'`).join(", ");
|
|
100
|
-
await db.query(`COPY (SELECT * FROM read_parquet([${fileList}], union_by_name = true)) TO '${sqlEscape(outName)}' (FORMAT PARQUET)`);
|
|
101
|
-
registered.push(outName);
|
|
102
|
-
const bytes = await db.copyFileToBuffer(outName);
|
|
103
|
-
const countRows = await db.query(`SELECT count(*)::BIGINT AS n FROM read_parquet('${sqlEscape(outName)}')`);
|
|
104
|
-
const rowCount = Number(countRows[0]?.n ?? 0);
|
|
105
|
-
await dataSource.write(outputKey, bytes);
|
|
106
|
-
return {
|
|
107
|
-
bytes: bytes.byteLength,
|
|
108
|
-
rowCount
|
|
109
|
-
};
|
|
110
|
-
} finally {
|
|
111
|
-
await db.dropFiles(registered);
|
|
112
|
-
}
|
|
113
|
-
}
|
|
114
|
-
};
|
|
115
|
-
}
|
|
116
|
-
function rewriteEmptyFileSets(sql, placeholders, defaultTable, placeholderTables) {
|
|
117
|
-
let out = sql;
|
|
118
|
-
for (const [name, keys] of Object.entries(placeholders)) {
|
|
119
|
-
if (keys.length > 0) continue;
|
|
120
|
-
const emptyFallback = `(SELECT * FROM ${emptyTableSchema(placeholderTables?.[name] ?? defaultTable)} WHERE FALSE)`;
|
|
121
|
-
const pattern = new RegExp(`read_parquet\\(\\s*\\{\\{${name}\\}\\}\\s*(?:,\\s*union_by_name\\s*=\\s*true\\s*)?\\)`, "g");
|
|
122
|
-
out = out.replace(pattern, emptyFallback);
|
|
123
|
-
}
|
|
124
|
-
return out;
|
|
125
|
-
}
|
|
126
|
-
function createDuckDBExecutor(factory) {
|
|
127
|
-
return { async execute({ sql, params, fileKeys, placeholderTables, dataSource, table, signal }) {
|
|
128
|
-
signal?.throwIfAborted();
|
|
129
|
-
const db = await factory.getDuckDB();
|
|
130
|
-
const placeholders = {};
|
|
131
|
-
const registered = [];
|
|
132
|
-
for (const [name, keys] of Object.entries(fileKeys)) {
|
|
133
|
-
const resolved = [];
|
|
134
|
-
for (const key of keys) {
|
|
135
|
-
const uri = dataSource.uri?.(key);
|
|
136
|
-
if (uri !== void 0) resolved.push(uri);
|
|
137
|
-
else {
|
|
138
|
-
const bytes = await dataSource.read(key, void 0, signal);
|
|
139
|
-
await db.registerFileBuffer(key, bytes);
|
|
140
|
-
registered.push(key);
|
|
141
|
-
resolved.push(key);
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
placeholders[name] = resolved;
|
|
145
|
-
}
|
|
146
|
-
try {
|
|
147
|
-
signal?.throwIfAborted();
|
|
148
|
-
const finalSql = substituteNamedFiles(rewriteEmptyFileSets(sql, placeholders, table, placeholderTables), placeholders);
|
|
149
|
-
return {
|
|
150
|
-
rows: await db.query(finalSql, params),
|
|
151
|
-
sql: finalSql
|
|
152
|
-
};
|
|
153
|
-
} finally {
|
|
154
|
-
if (registered.length > 0) await db.dropFiles(registered);
|
|
155
|
-
}
|
|
156
|
-
} };
|
|
157
|
-
}
|
|
158
|
-
function emptyTableSchema(table) {
|
|
159
|
-
return `(FROM (VALUES ${placeholderValues(table)}) t(${columnList(table)}))`;
|
|
160
|
-
}
|
|
161
|
-
function canonicalEmptyParquetSchema(table) {
|
|
162
|
-
return emptyTableSchema(table);
|
|
163
|
-
}
|
|
164
|
-
function dateReplaceClause(table) {
|
|
165
|
-
if (!table) return "";
|
|
166
|
-
const dateCols = SCHEMAS[table].columns.filter((c) => c.type === "DATE").map((c) => c.name);
|
|
167
|
-
if (dateCols.length === 0) return "";
|
|
168
|
-
return `REPLACE (${dateCols.map((n) => `strftime(${n}, '%Y-%m-%d') AS ${n}`).join(", ")})`;
|
|
169
|
-
}
|
|
170
|
-
function columnList(table) {
|
|
171
|
-
return SCHEMAS[table].columns.map((c) => c.name).join(", ");
|
|
172
|
-
}
|
|
173
|
-
function placeholderValues(table) {
|
|
174
|
-
return `(${SCHEMAS[table].columns.map((c) => defaultForType(c.type)).join(", ")})`;
|
|
175
|
-
}
|
|
176
|
-
function defaultForType(t) {
|
|
177
|
-
if (t === "VARCHAR") return "''";
|
|
178
|
-
if (t === "DATE") return "DATE '1970-01-01'";
|
|
179
|
-
if (t === "INTEGER" || t === "BIGINT") return "0";
|
|
180
|
-
if (t === "DOUBLE") return "CAST(0 AS DOUBLE)";
|
|
181
|
-
return "NULL";
|
|
182
|
-
}
|
|
183
|
-
function columnsJson(table) {
|
|
184
|
-
return `{${SCHEMAS[table].columns.map((c) => `'${c.name}': '${c.type}'`).join(", ")}}`;
|
|
185
|
-
}
|
|
186
|
-
const VERSION_RE = /__v(\d+)\.parquet$/;
|
|
187
|
-
function parseLockScope(key) {
|
|
188
|
-
const match = VERSION_RE.exec(key);
|
|
189
|
-
if (!match) return void 0;
|
|
190
|
-
const parts = key.slice(0, match.index).split("/");
|
|
191
|
-
if (parts.length < 4) return void 0;
|
|
192
|
-
const userPart = parts[0];
|
|
193
|
-
if (!userPart.startsWith("u_")) return void 0;
|
|
194
|
-
const userId = userPart.slice(2);
|
|
195
|
-
const partition = parts.slice(-2).join("/");
|
|
196
|
-
const table = parts[parts.length - 3];
|
|
197
|
-
return {
|
|
198
|
-
userId,
|
|
199
|
-
siteId: parts.length >= 5 ? parts.slice(1, -3).join("/") : void 0,
|
|
200
|
-
table,
|
|
201
|
-
partition
|
|
202
|
-
};
|
|
203
|
-
}
|
|
204
|
-
async function gcOrphansImpl(deps, now, graceMs, opts = {}) {
|
|
205
|
-
const cutoff = now - graceMs;
|
|
206
|
-
const retired = await deps.manifestStore.listRetired(cutoff);
|
|
207
|
-
if (retired.length > 0) {
|
|
208
|
-
await deps.dataSource.delete(retired.map((e) => e.objectKey));
|
|
209
|
-
await deps.manifestStore.delete(retired);
|
|
210
|
-
}
|
|
211
|
-
let sweptOrphans = 0;
|
|
212
|
-
if (opts.userId) {
|
|
213
|
-
const prefix = tenantPrefix({
|
|
214
|
-
userId: opts.userId,
|
|
215
|
-
siteId: opts.siteId
|
|
216
|
-
});
|
|
217
|
-
const knownEntries = await deps.manifestStore.listAll({
|
|
218
|
-
userId: opts.userId,
|
|
219
|
-
siteId: opts.siteId
|
|
220
|
-
});
|
|
221
|
-
const knownSet = new Set(knownEntries.map((e) => e.objectKey));
|
|
222
|
-
const orphans = [];
|
|
223
|
-
const keyStream = deps.dataSource.streamList ? deps.dataSource.streamList(prefix) : async function* () {
|
|
224
|
-
const all = await deps.dataSource.list(prefix);
|
|
225
|
-
for (const k of all) yield k;
|
|
226
|
-
}();
|
|
227
|
-
for await (const key of keyStream) {
|
|
228
|
-
if (knownSet.has(key)) continue;
|
|
229
|
-
const match = VERSION_RE.exec(key);
|
|
230
|
-
if (!match) continue;
|
|
231
|
-
if (Number(match[1]) <= cutoff) orphans.push(key);
|
|
232
|
-
}
|
|
233
|
-
const byScope = /* @__PURE__ */ new Map();
|
|
234
|
-
for (const key of orphans) {
|
|
235
|
-
const scope = parseLockScope(key);
|
|
236
|
-
if (!scope) continue;
|
|
237
|
-
const sk = `${scope.userId}|${scope.siteId ?? ""}|${scope.table}|${scope.partition}`;
|
|
238
|
-
const bucket = byScope.get(sk) ?? {
|
|
239
|
-
scope,
|
|
240
|
-
keys: []
|
|
241
|
-
};
|
|
242
|
-
bucket.keys.push(key);
|
|
243
|
-
byScope.set(sk, bucket);
|
|
244
|
-
}
|
|
245
|
-
for (const { scope, keys } of byScope.values()) await deps.manifestStore.withLock(scope, async () => {
|
|
246
|
-
const known = await deps.manifestStore.listAll({
|
|
247
|
-
userId: scope.userId,
|
|
248
|
-
siteId: scope.siteId,
|
|
249
|
-
table: scope.table,
|
|
250
|
-
partitions: [scope.partition]
|
|
251
|
-
});
|
|
252
|
-
const knownInScope = new Set(known.map((e) => e.objectKey));
|
|
253
|
-
const stillOrphans = keys.filter((k) => !knownInScope.has(k));
|
|
254
|
-
if (stillOrphans.length > 0) {
|
|
255
|
-
await deps.dataSource.delete(stillOrphans);
|
|
256
|
-
sweptOrphans += stillOrphans.length;
|
|
257
|
-
}
|
|
258
|
-
});
|
|
259
|
-
}
|
|
260
|
-
return { deleted: retired.length + sweptOrphans };
|
|
261
|
-
}
|
|
262
|
-
const URL_PURGE_TABLES = ["pages", "page_keywords"];
|
|
263
|
-
const MAX_DAY_BYTES = 100 * 1024 * 1024;
|
|
264
|
-
const URL_COLUMNS = /* @__PURE__ */ new Set();
|
|
265
|
-
for (const t of Object.keys(SCHEMAS)) for (const col of SCHEMAS[t].columns) if (col.name === "url") URL_COLUMNS.add(`${t}:url`);
|
|
266
|
-
function normalizeRow(table, row) {
|
|
267
|
-
if (!URL_COLUMNS.has(`${table}:url`)) return row;
|
|
268
|
-
const url = row.url;
|
|
269
|
-
if (typeof url !== "string") return row;
|
|
270
|
-
const normalized = normalizeUrl(url);
|
|
271
|
-
if (normalized === url) return row;
|
|
272
|
-
return {
|
|
273
|
-
...row,
|
|
274
|
-
url: normalized
|
|
275
|
-
};
|
|
276
|
-
}
|
|
277
|
-
function createStorageEngine(opts) {
|
|
278
|
-
const { dataSource, manifestStore, codec, executor } = opts;
|
|
279
|
-
const defaultNow = opts.now ?? (() => Date.now());
|
|
280
|
-
async function writeDay(ctx, rows) {
|
|
281
|
-
if (!ctx.date) throw new Error("writeDay requires ctx.date");
|
|
282
|
-
const date = ctx.date;
|
|
283
|
-
const now = (ctx.now ?? defaultNow)();
|
|
284
|
-
const partition = dayPartition(date);
|
|
285
|
-
const searchType = ctx.searchType;
|
|
286
|
-
return manifestStore.withLock({
|
|
287
|
-
userId: ctx.userId,
|
|
288
|
-
siteId: ctx.siteId,
|
|
289
|
-
table: ctx.table,
|
|
290
|
-
partition
|
|
291
|
-
}, async () => {
|
|
292
|
-
const superseding = (await manifestStore.listLive({
|
|
293
|
-
userId: ctx.userId,
|
|
294
|
-
siteId: ctx.siteId,
|
|
295
|
-
table: ctx.table,
|
|
296
|
-
partitions: [partition]
|
|
297
|
-
})).filter((e) => inferSearchType(e) === inferSearchType({ searchType }));
|
|
298
|
-
const normalizedRows = rows.map((r) => normalizeRow(ctx.table, r));
|
|
299
|
-
const key = objectKey(ctx, ctx.table, partition, now, searchType);
|
|
300
|
-
const { bytes: writtenBytes, rowCount } = await codec.writeRows({ table: ctx.table }, normalizedRows, key, dataSource);
|
|
301
|
-
let bytes = writtenBytes;
|
|
302
|
-
if (bytes === 0 && rowCount > 0 && dataSource.head) {
|
|
303
|
-
const probed = await dataSource.head(key);
|
|
304
|
-
if (probed) bytes = probed.bytes;
|
|
305
|
-
}
|
|
306
|
-
if (bytes > 104857600) {
|
|
307
|
-
await dataSource.delete([key]).catch(() => {});
|
|
308
|
-
throw new Error(`writeDay payload ${bytes} bytes exceeds ${MAX_DAY_BYTES} hard ceiling (table=${ctx.table}, key=${key})`);
|
|
309
|
-
}
|
|
310
|
-
const entry = {
|
|
311
|
-
userId: ctx.userId,
|
|
312
|
-
siteId: ctx.siteId,
|
|
313
|
-
table: ctx.table,
|
|
314
|
-
partition,
|
|
315
|
-
objectKey: key,
|
|
316
|
-
rowCount,
|
|
317
|
-
bytes,
|
|
318
|
-
createdAt: now,
|
|
319
|
-
schemaVersion: currentSchemaVersion(ctx.table),
|
|
320
|
-
tier: "raw",
|
|
321
|
-
...searchType !== void 0 ? { searchType } : {}
|
|
322
|
-
};
|
|
323
|
-
await manifestStore.registerVersion(entry, superseding);
|
|
324
|
-
await manifestStore.bumpWatermark({
|
|
325
|
-
userId: ctx.userId,
|
|
326
|
-
siteId: ctx.siteId,
|
|
327
|
-
table: ctx.table
|
|
328
|
-
}, date, now);
|
|
329
|
-
});
|
|
330
|
-
}
|
|
331
|
-
async function runSQL(opts) {
|
|
332
|
-
opts.signal?.throwIfAborted();
|
|
333
|
-
const entries = Object.entries(opts.fileSets);
|
|
334
|
-
const perSet = await Promise.all(entries.map(async ([name, ref]) => {
|
|
335
|
-
return [name, (await manifestStore.listLive({
|
|
336
|
-
userId: opts.ctx.userId,
|
|
337
|
-
siteId: opts.ctx.siteId,
|
|
338
|
-
table: ref.table,
|
|
339
|
-
partitions: ref.partitions
|
|
340
|
-
})).map((e) => e.objectKey)];
|
|
341
|
-
}));
|
|
342
|
-
opts.signal?.throwIfAborted();
|
|
343
|
-
const fileKeys = {};
|
|
344
|
-
for (const [name, keys] of perSet) fileKeys[name] = keys;
|
|
345
|
-
const uniqueKeys = [...new Set(perSet.flatMap(([, keys]) => keys))];
|
|
346
|
-
let table = opts.table;
|
|
347
|
-
if (!table) {
|
|
348
|
-
if (new Set(entries.map(([, ref]) => ref.table)).size > 1) throw new Error("runSQL requires explicit ctx.table when fileSets reference multiple tables.");
|
|
349
|
-
table = entries[0]?.[1].table;
|
|
350
|
-
}
|
|
351
|
-
if (!table) throw new Error("runSQL requires at least one fileSet or an explicit table");
|
|
352
|
-
const placeholderTables = {};
|
|
353
|
-
for (const [name, ref] of entries) placeholderTables[name] = ref.table;
|
|
354
|
-
const result = await executor.execute({
|
|
355
|
-
sql: opts.sql,
|
|
356
|
-
params: opts.params ?? [],
|
|
357
|
-
fileKeys,
|
|
358
|
-
placeholderTables,
|
|
359
|
-
dataSource,
|
|
360
|
-
table,
|
|
361
|
-
signal: opts.signal
|
|
362
|
-
});
|
|
363
|
-
return {
|
|
364
|
-
rows: result.rows,
|
|
365
|
-
sql: result.sql,
|
|
366
|
-
objectKeys: uniqueKeys
|
|
367
|
-
};
|
|
368
|
-
}
|
|
369
|
-
async function query(ctx, state) {
|
|
370
|
-
const plan = buildLogicalPlan(state, { regex: true });
|
|
371
|
-
const table = ctx.table ?? plan.dataset;
|
|
372
|
-
const resolved = compileLogicalQueryPlan(plan, table);
|
|
373
|
-
return runSQL({
|
|
374
|
-
ctx: {
|
|
375
|
-
userId: ctx.userId,
|
|
376
|
-
siteId: ctx.siteId
|
|
377
|
-
},
|
|
378
|
-
table,
|
|
379
|
-
fileSets: { FILES: {
|
|
380
|
-
table,
|
|
381
|
-
partitions: resolved.partitions
|
|
382
|
-
} },
|
|
383
|
-
sql: resolved.sql,
|
|
384
|
-
params: resolved.params,
|
|
385
|
-
signal: ctx.signal
|
|
386
|
-
});
|
|
387
|
-
}
|
|
388
|
-
async function queryComparison(ctx, current, previous, filter) {
|
|
389
|
-
const adapter = createParquetResolverAdapter();
|
|
390
|
-
const currentPlan = buildLogicalPlan(current, adapter.capabilities);
|
|
391
|
-
const previousPlan = buildLogicalPlan(previous, adapter.capabilities);
|
|
392
|
-
if (currentPlan.dataset !== previousPlan.dataset) throw new Error(`queryComparison: current (${currentPlan.dataset}) and previous (${previousPlan.dataset}) must resolve to the same table`);
|
|
393
|
-
const table = ctx.table ?? currentPlan.dataset;
|
|
394
|
-
const comparison = resolveComparisonSQL(current, previous, {
|
|
395
|
-
adapter,
|
|
396
|
-
siteId: void 0
|
|
397
|
-
}, filter);
|
|
398
|
-
const totals = buildTotalsSql(current, {
|
|
399
|
-
adapter,
|
|
400
|
-
siteId: void 0
|
|
401
|
-
});
|
|
402
|
-
const fileSets = { FILES: {
|
|
403
|
-
table,
|
|
404
|
-
partitions: enumeratePartitions(currentPlan.dateRange.startDate < previousPlan.dateRange.startDate ? currentPlan.dateRange.startDate : previousPlan.dateRange.startDate, currentPlan.dateRange.endDate > previousPlan.dateRange.endDate ? currentPlan.dateRange.endDate : previousPlan.dateRange.endDate)
|
|
405
|
-
} };
|
|
406
|
-
const baseCtx = {
|
|
407
|
-
userId: ctx.userId,
|
|
408
|
-
siteId: ctx.siteId
|
|
409
|
-
};
|
|
410
|
-
const [main, count, totalsRow] = await Promise.all([
|
|
411
|
-
runSQL({
|
|
412
|
-
ctx: baseCtx,
|
|
413
|
-
table,
|
|
414
|
-
fileSets,
|
|
415
|
-
sql: comparison.sql,
|
|
416
|
-
params: comparison.params,
|
|
417
|
-
signal: ctx.signal
|
|
418
|
-
}),
|
|
419
|
-
runSQL({
|
|
420
|
-
ctx: baseCtx,
|
|
421
|
-
table,
|
|
422
|
-
fileSets,
|
|
423
|
-
sql: comparison.countSql,
|
|
424
|
-
params: comparison.countParams,
|
|
425
|
-
signal: ctx.signal
|
|
426
|
-
}),
|
|
427
|
-
runSQL({
|
|
428
|
-
ctx: baseCtx,
|
|
429
|
-
table,
|
|
430
|
-
fileSets,
|
|
431
|
-
sql: totals.sql,
|
|
432
|
-
params: totals.params,
|
|
433
|
-
signal: ctx.signal
|
|
434
|
-
})
|
|
435
|
-
]);
|
|
436
|
-
return {
|
|
437
|
-
rows: main.rows,
|
|
438
|
-
totalCount: Number(count.rows[0]?.total ?? 0),
|
|
439
|
-
totals: totalsRow.rows[0] ?? {}
|
|
440
|
-
};
|
|
441
|
-
}
|
|
442
|
-
async function queryOptimized(ctx, state) {
|
|
443
|
-
const adapter = createParquetResolverAdapter();
|
|
444
|
-
const plan = buildLogicalPlan(state, adapter.capabilities);
|
|
445
|
-
const table = ctx.table ?? plan.dataset;
|
|
446
|
-
const partitions = enumeratePartitions(plan.dateRange.startDate, plan.dateRange.endDate);
|
|
447
|
-
const { sql, params } = resolveToSQLOptimized(state, {
|
|
448
|
-
adapter,
|
|
449
|
-
siteId: void 0
|
|
450
|
-
});
|
|
451
|
-
const result = await runSQL({
|
|
452
|
-
ctx: {
|
|
453
|
-
userId: ctx.userId,
|
|
454
|
-
siteId: ctx.siteId
|
|
455
|
-
},
|
|
456
|
-
table,
|
|
457
|
-
fileSets: { FILES: {
|
|
458
|
-
table,
|
|
459
|
-
partitions
|
|
460
|
-
} },
|
|
461
|
-
sql,
|
|
462
|
-
params,
|
|
463
|
-
signal: ctx.signal
|
|
464
|
-
});
|
|
465
|
-
const firstRow = result.rows[0];
|
|
466
|
-
const totalCount = Number(firstRow?.totalCount ?? 0);
|
|
467
|
-
const totals = {
|
|
468
|
-
clicks: Number(firstRow?.totalClicks ?? 0),
|
|
469
|
-
impressions: Number(firstRow?.totalImpressions ?? 0),
|
|
470
|
-
ctr: Number(firstRow?.totalCtr ?? 0),
|
|
471
|
-
position: Number(firstRow?.totalPosition ?? 0)
|
|
472
|
-
};
|
|
473
|
-
return {
|
|
474
|
-
rows: result.rows.map((r) => {
|
|
475
|
-
const { totalCount: _tc, totalClicks: _tcl, totalImpressions: _ti, totalCtr: _tr, totalPosition: _tp, ...rest } = r;
|
|
476
|
-
return rest;
|
|
477
|
-
}),
|
|
478
|
-
totalCount,
|
|
479
|
-
totals
|
|
480
|
-
};
|
|
481
|
-
}
|
|
482
|
-
async function queryExtras(ctx, state) {
|
|
483
|
-
const adapter = createParquetResolverAdapter();
|
|
484
|
-
const extras = buildExtrasQueries(state, {
|
|
485
|
-
adapter,
|
|
486
|
-
siteId: void 0
|
|
487
|
-
});
|
|
488
|
-
if (extras.length === 0) return [];
|
|
489
|
-
const plan = buildLogicalPlan(state, adapter.capabilities);
|
|
490
|
-
const table = ctx.table ?? plan.dataset;
|
|
491
|
-
const fileSets = { FILES: {
|
|
492
|
-
table,
|
|
493
|
-
partitions: enumeratePartitions(plan.dateRange.startDate, plan.dateRange.endDate)
|
|
494
|
-
} };
|
|
495
|
-
const baseCtx = {
|
|
496
|
-
userId: ctx.userId,
|
|
497
|
-
siteId: ctx.siteId
|
|
498
|
-
};
|
|
499
|
-
const results = await Promise.all(extras.map((e) => runSQL({
|
|
500
|
-
ctx: baseCtx,
|
|
501
|
-
table,
|
|
502
|
-
fileSets,
|
|
503
|
-
sql: e.sql,
|
|
504
|
-
params: e.params,
|
|
505
|
-
signal: ctx.signal
|
|
506
|
-
})));
|
|
507
|
-
return extras.map((e, i) => ({
|
|
508
|
-
key: e.key,
|
|
509
|
-
rows: results[i].rows
|
|
510
|
-
}));
|
|
511
|
-
}
|
|
512
|
-
async function compactTiered(ctx, thresholds) {
|
|
513
|
-
return compactTieredImpl({
|
|
514
|
-
dataSource,
|
|
515
|
-
manifestStore,
|
|
516
|
-
codec
|
|
517
|
-
}, ctx, (ctx.now ?? defaultNow)(), thresholds);
|
|
518
|
-
}
|
|
519
|
-
async function gcOrphans(ctx, graceMs) {
|
|
520
|
-
return gcOrphansImpl({
|
|
521
|
-
dataSource,
|
|
522
|
-
manifestStore
|
|
523
|
-
}, (ctx.now ?? defaultNow)(), graceMs, {
|
|
524
|
-
userId: ctx.userId,
|
|
525
|
-
siteId: ctx.siteId
|
|
526
|
-
});
|
|
527
|
-
}
|
|
528
|
-
async function purgeTenant(ctx) {
|
|
529
|
-
const prefix = tenantPrefix(ctx);
|
|
530
|
-
const keys = [];
|
|
531
|
-
const keyStream = dataSource.streamList ? dataSource.streamList(prefix) : async function* () {
|
|
532
|
-
for (const k of await dataSource.list(prefix)) yield k;
|
|
533
|
-
}();
|
|
534
|
-
for await (const key of keyStream) keys.push(key);
|
|
535
|
-
if (keys.length > 0) await dataSource.delete(keys);
|
|
536
|
-
const manifestResult = await manifestStore.purgeTenant({
|
|
537
|
-
userId: ctx.userId,
|
|
538
|
-
siteId: ctx.siteId
|
|
539
|
-
});
|
|
540
|
-
return {
|
|
541
|
-
userId: ctx.userId,
|
|
542
|
-
siteId: ctx.siteId,
|
|
543
|
-
prefix,
|
|
544
|
-
objectsDeleted: keys.length,
|
|
545
|
-
entriesRemoved: manifestResult.entriesRemoved,
|
|
546
|
-
watermarksRemoved: manifestResult.watermarksRemoved,
|
|
547
|
-
syncStatesRemoved: manifestResult.syncStatesRemoved,
|
|
548
|
-
at: defaultNow()
|
|
549
|
-
};
|
|
550
|
-
}
|
|
551
|
-
async function purgeUrls(ctx, urls) {
|
|
552
|
-
const now = defaultNow();
|
|
553
|
-
const urlSet = new Set(urls);
|
|
554
|
-
let entriesRewritten = 0;
|
|
555
|
-
let rowsRemoved = 0;
|
|
556
|
-
let bytesAfter = 0;
|
|
557
|
-
if (urlSet.size === 0) return {
|
|
558
|
-
userId: ctx.userId,
|
|
559
|
-
siteId: ctx.siteId,
|
|
560
|
-
urlsRequested: 0,
|
|
561
|
-
entriesRewritten: 0,
|
|
562
|
-
rowsRemoved: 0,
|
|
563
|
-
bytesAfter: 0,
|
|
564
|
-
at: now
|
|
565
|
-
};
|
|
566
|
-
for (const table of URL_PURGE_TABLES) {
|
|
567
|
-
const entries = await manifestStore.listLive({
|
|
568
|
-
userId: ctx.userId,
|
|
569
|
-
siteId: ctx.siteId,
|
|
570
|
-
table
|
|
571
|
-
});
|
|
572
|
-
for (const entry of entries) await manifestStore.withLock({
|
|
573
|
-
userId: entry.userId,
|
|
574
|
-
siteId: entry.siteId,
|
|
575
|
-
table,
|
|
576
|
-
partition: entry.partition
|
|
577
|
-
}, async () => {
|
|
578
|
-
const rows = await codec.readRows({ table }, entry.objectKey, dataSource);
|
|
579
|
-
const kept = rows.filter((r) => typeof r.url !== "string" || !urlSet.has(r.url));
|
|
580
|
-
const removed = rows.length - kept.length;
|
|
581
|
-
if (removed === 0) return;
|
|
582
|
-
const searchType = entry.searchType;
|
|
583
|
-
const newKey = objectKey({
|
|
584
|
-
userId: entry.userId,
|
|
585
|
-
siteId: entry.siteId
|
|
586
|
-
}, table, entry.partition, now, searchType);
|
|
587
|
-
const { bytes, rowCount } = await codec.writeRows({ table }, kept, newKey, dataSource);
|
|
588
|
-
const newEntry = {
|
|
589
|
-
userId: entry.userId,
|
|
590
|
-
siteId: entry.siteId,
|
|
591
|
-
table,
|
|
592
|
-
partition: entry.partition,
|
|
593
|
-
objectKey: newKey,
|
|
594
|
-
rowCount,
|
|
595
|
-
bytes,
|
|
596
|
-
createdAt: now,
|
|
597
|
-
schemaVersion: entry.schemaVersion ?? currentSchemaVersion(table),
|
|
598
|
-
...entry.tier !== void 0 ? { tier: entry.tier } : {},
|
|
599
|
-
...searchType !== void 0 ? { searchType } : {}
|
|
600
|
-
};
|
|
601
|
-
await manifestStore.registerVersion(newEntry, [entry]);
|
|
602
|
-
entriesRewritten++;
|
|
603
|
-
rowsRemoved += removed;
|
|
604
|
-
bytesAfter += bytes;
|
|
605
|
-
});
|
|
606
|
-
}
|
|
607
|
-
return {
|
|
608
|
-
userId: ctx.userId,
|
|
609
|
-
siteId: ctx.siteId,
|
|
610
|
-
urlsRequested: urlSet.size,
|
|
611
|
-
entriesRewritten,
|
|
612
|
-
rowsRemoved,
|
|
613
|
-
bytesAfter,
|
|
614
|
-
at: now
|
|
615
|
-
};
|
|
616
|
-
}
|
|
617
|
-
return {
|
|
618
|
-
writeDay,
|
|
619
|
-
query,
|
|
620
|
-
queryComparison,
|
|
621
|
-
queryExtras,
|
|
622
|
-
queryOptimized,
|
|
623
|
-
runSQL,
|
|
624
|
-
compactTiered,
|
|
625
|
-
gcOrphans,
|
|
626
|
-
purgeTenant,
|
|
627
|
-
purgeUrls,
|
|
628
|
-
listLive: (filter) => manifestStore.listLive(filter),
|
|
629
|
-
listAll: (filter) => manifestStore.listAll(filter),
|
|
630
|
-
getWatermarks: (filter) => manifestStore.getWatermarks(filter),
|
|
631
|
-
getSyncStates: (filter) => manifestStore.getSyncStates(filter),
|
|
632
|
-
setSyncState: (scope, state, detail) => manifestStore.setSyncState(scope, state, detail),
|
|
633
|
-
readObject: (key) => dataSource.read(key)
|
|
634
|
-
};
|
|
635
|
-
}
|
|
636
|
-
export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, mondayOfWeek, monthPartition, objectKey, page_keywords, pages, quarterOfMonth, quarterPartition, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow, weekPartition };
|
|
21
|
+
export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, objectKey, page_keywords, pages, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
|
package/dist/planner.d.mts
CHANGED
|
@@ -1,3 +1,3 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { K as enumeratePartitions } from "./_chunks/storage.mjs";
|
|
2
2
|
import { a as substituteNamedFiles, i as resolveToSQL, n as ResolvedQuery, r as compileLogicalQueryPlan, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
|
|
3
3
|
export { FILES_PLACEHOLDER, ResolvedQuery, compileLogicalQueryPlan, enumeratePartitions, resolveToSQL, substituteNamedFiles };
|
package/dist/planner.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { i as substituteNamedFiles, n as compileLogicalQueryPlan, o as enumeratePartitions, r as resolveToSQL, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
|
|
1
|
+
import { i as substituteNamedFiles, n as compileLogicalQueryPlan, o as enumeratePartitions, r as resolveToSQL, t as FILES_PLACEHOLDER } from "./_chunks/compiler.mjs";
|
|
2
2
|
export { FILES_PLACEHOLDER, compileLogicalQueryPlan, enumeratePartitions, resolveToSQL, substituteNamedFiles };
|
|
package/dist/resolver/index.d.mts
CHANGED
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import { a as ResolvedSQLOptimized, i as ResolvedSQL, n as ExtraQuery, o as ResolverAdapter, r as ResolvedComparisonSQL, s as ResolverOptions, t as ComparisonFilter } from "../_chunks/types.mjs";
|
|
2
|
-
import { a as RowQuerySource, c as isSqlQuerySource, i as QueryRow, n as ExecuteSqlOptions, o as SourceCapabilities, r as FileSet, s as SqlQuerySource, t as AnalysisQuerySource } from "../_chunks/source-types.mjs";
|
|
3
2
|
import { LogicalDataset, LogicalDataset as LogicalDataset$1, PlannerCapabilities } from "gscdump/query/plan";
|
|
4
3
|
import { SQL } from "drizzle-orm";
|
|
5
4
|
import { TableName } from "gscdump/contracts";
|
|
@@ -74,27 +73,6 @@ declare function mergeExtras(rows: Record<string, unknown>[], extrasResults: {
|
|
|
74
73
|
key: string;
|
|
75
74
|
results: Record<string, unknown>[];
|
|
76
75
|
}[]): Record<string, unknown>[];
|
|
77
|
-
interface CreateSqlQuerySourceOptions<TKey extends string> {
|
|
78
|
-
/** Debug-only identifier surfaced on the source for error messages. */
|
|
79
|
-
name: string;
|
|
80
|
-
/** Dialect-specific adapter; compiles `BuilderState` → `{ sql, params }`. */
|
|
81
|
-
adapter: ResolverAdapter<TKey>;
|
|
82
|
-
/** Drives the underlying DB. Called for both typed queries and raw SQL. */
|
|
83
|
-
execute: (sql: string, params: unknown[]) => Promise<QueryRow[]>;
|
|
84
|
-
/** Tenant id for multi-tenant dialects; forwarded to `resolveToSQL`. */
|
|
85
|
-
siteId?: string | number;
|
|
86
|
-
/** Additional capability flags merged on top of `adapter.capabilities`. */
|
|
87
|
-
extraCapabilities?: Partial<SourceCapabilities>;
|
|
88
|
-
}
|
|
89
|
-
declare function createSqlQuerySource<TKey extends string>(options: CreateSqlQuerySourceOptions<TKey>): SqlQuerySource;
|
|
90
|
-
declare function compilePg(query: SQL): {
|
|
91
|
-
sql: string;
|
|
92
|
-
params: unknown[];
|
|
93
|
-
};
|
|
94
|
-
declare function compileSqlite(query: SQL): {
|
|
95
|
-
sql: string;
|
|
96
|
-
params: unknown[];
|
|
97
|
-
};
|
|
98
76
|
declare function getInternalFilters(filter: FilterInput | undefined): InternalFilter[];
|
|
99
77
|
declare function getDimensionFilters(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): InternalFilter[];
|
|
100
78
|
declare function getFilterDimensions(filter: FilterInput | undefined, isMetricDimension: (dim: string) => dim is Metric): Dimension[];
|
|
@@ -130,4 +108,4 @@ interface AssertSchemaInSyncOptions {
|
|
|
130
108
|
mode: 'exact' | 'superset';
|
|
131
109
|
}
|
|
132
110
|
declare function assertSchemaInSync(options: AssertSchemaInSyncOptions): void;
|
|
133
|
-
export { type
|
|
111
|
+
export { type AssertSchemaInSyncOptions, type ComparisonFilter, type CreateResolverAdapterConfig, DIMENSION_SURFACES, type DimensionBinding, type DimensionSurface, type ExtraQuery, LOGICAL_DATASETS, type LogicalDataset, type LogicalDatasetDefinition, type PgTableKey, type ResolvedComparisonSQL, type ResolvedSQL, type ResolvedSQLOptimized, type ResolverAdapter, type ResolverOptions, type SqlFragments, type SqlFragmentsConfig, assertDimensionsSupported, assertSchemaInSync, buildExtrasQueries, buildTotalsSql, createParquetResolverAdapter, createResolverAdapter, createSqlFragments, dimensionColumn, dimensionValue, getDimensionFilters, getFilterDimensions, getInternalFilters, inferLogicalDataset, matchesDimensionFilter, matchesMetricFilter, matchesTopLevelPage, mergeExtras, metricValue, pgResolverAdapter, resolveComparisonSQL, resolveToSQL, resolveToSQLOptimized, supportsDimensionOnSurface };
|