@gscdump/engine 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/compiler.mjs +288 -0
- package/dist/_chunks/duckdb.d.mts +26 -0
- package/dist/_chunks/engine.mjs +578 -0
- package/dist/_chunks/pg-adapter.mjs +676 -0
- package/dist/_chunks/planner.d.mts +15 -0
- package/dist/_chunks/schema.d.mts +1258 -0
- package/dist/_chunks/schema.mjs +139 -0
- package/dist/_chunks/storage.d.mts +476 -0
- package/dist/_chunks/storage.mjs +39 -0
- package/dist/_chunks/types.d.mts +53 -0
- package/dist/adapters/duckdb-node.d.mts +1 -13
- package/dist/adapters/duckdb-node.mjs +1 -7
- package/dist/adapters/filesystem.d.mts +1 -193
- package/dist/adapters/filesystem.mjs +2 -9
- package/dist/adapters/http.d.mts +1 -193
- package/dist/adapters/http.mjs +1 -5
- package/dist/adapters/hyparquet.d.mts +6 -83
- package/dist/adapters/hyparquet.mjs +1 -105
- package/dist/adapters/inspection-sqlite-browser.d.mts +1 -7
- package/dist/adapters/inspection-sqlite-node.d.mts +1 -7
- package/dist/adapters/inspection-sqlite-node.mjs +1 -1
- package/dist/adapters/node-harness.d.mts +3 -306
- package/dist/adapters/node-harness.mjs +4 -1866
- package/dist/adapters/r2-manifest.d.mts +4 -149
- package/dist/adapters/r2-manifest.mjs +1 -8
- package/dist/adapters/r2.d.mts +1 -47
- package/dist/contracts.d.mts +1 -435
- package/dist/entities.d.mts +1 -47
- package/dist/index.d.mts +8 -1844
- package/dist/index.mjs +8 -1962
- package/dist/ingest.d.mts +1 -1
- package/dist/planner.d.mts +3 -16
- package/dist/planner.mjs +1 -320
- package/dist/resolver/index.d.mts +3 -51
- package/dist/resolver/index.mjs +2 -780
- package/dist/rollups.d.mts +6 -51
- package/dist/rollups.mjs +2 -209
- package/dist/schema.d.mts +2 -1258
- package/dist/schema.mjs +1 -138
- package/package.json +2 -2
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
import { i as dimensionToColumn, r as currentSchemaVersion } from "./schema.mjs";
|
|
2
|
+
import { a as mondayOfWeek, c as quarterOfMonth, d as weekPartition, i as inferSearchType, l as quarterPartition, n as dayPartition, o as monthPartition, s as objectKey } from "./storage.mjs";
|
|
3
|
+
import { METRIC_EXPR, escapeLike, topLevelPagePredicateSql } from "../sql-fragments.mjs";
|
|
4
|
+
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
5
|
+
import { MS_PER_DAY } from "gscdump";
|
|
6
|
+
// Partition-name patterns; capture group 1 is the date part that follows the tier prefix.
const DAILY_PARTITION_RE = /^daily\/(\d{4}-\d{2}-\d{2})$/;
const WEEKLY_PARTITION_RE = /^weekly\/(\d{4}-\d{2}-\d{2})$/;
const MONTHLY_PARTITION_RE = /^monthly\/(\d{4}-\d{2})$/;

/** Default age threshold in days for each input tier; a bucket must be older than this before it is compacted. */
const DEFAULT_THRESHOLDS = { raw: 7, d7: 30, d30: 90 };

/** Floor in days applied to every stage's cutoff, whatever threshold is configured. */
const PENDING_WINDOW_DAYS = 4;
|
|
15
|
+
/**
 * Compaction stages, in the order they are run.
 *
 * Each stage describes how live manifest entries of `inputTier` are grouped
 * into buckets and rewritten as a single `outputTier` object:
 * - `cutoffDays`: default age threshold for the stage.
 * - `bucketKey(entry)`: bucket id for an entry, or `undefined` when the
 *   entry's partition name does not match the stage's input pattern.
 * - `bucketLatestMs(bucket)`: timestamp compared against the cutoff.
 * - `outputPartition(bucket)`: partition name of the compacted output.
 */
const STAGES = [
  {
    // Daily partitions -> one bucket per week, keyed by mondayOfWeek(date).
    inputTier: "raw",
    outputTier: "d7",
    cutoffDays: DEFAULT_THRESHOLDS.raw,
    bucketKey: (entry) => {
      const match = entry.partition.match(DAILY_PARTITION_RE);
      return match ? mondayOfWeek(match[1]) : void 0;
    },
    // Monday 00:00 UTC plus six days.
    bucketLatestMs: (monday) => Date.parse(`${monday}T00:00:00Z`) + 6 * MS_PER_DAY,
    outputPartition: weekPartition
  },
  {
    // Weekly partitions -> one bucket per month, keyed by the `YYYY-MM`
    // prefix of the date embedded in the weekly partition name.
    inputTier: "d7",
    outputTier: "d30",
    cutoffDays: DEFAULT_THRESHOLDS.d7,
    bucketKey: (entry) => {
      const match = entry.partition.match(WEEKLY_PARTITION_RE);
      return match ? match[1].slice(0, 7) : void 0;
    },
    bucketLatestMs: monthEndMs,
    outputPartition: monthPartition
  },
  {
    // Monthly partitions -> one bucket per quarter, keyed by quarterOfMonth(month).
    inputTier: "d30",
    outputTier: "d90",
    cutoffDays: DEFAULT_THRESHOLDS.d30,
    bucketKey: (entry) => {
      const match = entry.partition.match(MONTHLY_PARTITION_RE);
      return match ? quarterOfMonth(match[1]) : void 0;
    },
    bucketLatestMs: quarterEndMs,
    outputPartition: quarterPartition
  }
];
|
|
53
|
+
/**
 * Runs every compaction stage in `STAGES`, one after another, for the
 * user/site/table described by `ctx`.
 *
 * @param deps - Collaborators forwarded unchanged to `runStage`.
 * @param ctx - Identifies the user, site and table being compacted.
 * @param now - Current time in epoch milliseconds; cutoffs are computed from it.
 * @param overrides - Optional per-tier thresholds in days (`raw`, `d7`, `d30`)
 *   replacing `DEFAULT_THRESHOLDS`. Entries whose value is `undefined` or
 *   `null` are ignored so the default stays in effect.
 */
async function compactTieredImpl(deps, ctx, now, overrides = {}) {
  // A key that is present but undefined (e.g. `{ raw: opts.raw }`) must not
  // displace the default: Math.max(undefined, n) is NaN, the cutoff would be
  // NaN, and the ">= cutoff" age guard in runStage would never skip a bucket.
  const definedOverrides = Object.fromEntries(
    Object.entries(overrides ?? {}).filter(([, days]) => days != null)
  );
  const thresholds = {
    ...DEFAULT_THRESHOLDS,
    ...definedOverrides
  };
  // A stage's cutoff is the threshold of its *input* tier, selected here by output tier.
  const stagesWithThresholds = STAGES.map((s) => ({
    ...s,
    cutoffDays: s.outputTier === "d7" ? thresholds.raw : s.outputTier === "d30" ? thresholds.d7 : thresholds.d30
  }));
  for (const stage of stagesWithThresholds) await runStage(deps, ctx, stage, now);
}
|
|
64
|
+
/**
 * Executes one compaction stage: groups the live entries of the stage's
 * input tier into buckets, then rewrites each sufficiently old bucket as a
 * single object in the output tier.
 *
 * @param deps - Provides `manifestStore` (`listLive`, `withLock`,
 *   `registerVersion`), `codec.compactRows` and `dataSource`.
 * @param ctx - Supplies `userId`, `siteId` and `table`.
 * @param stage - One element of `STAGES`, with `cutoffDays` already resolved.
 * @param now - Current time in epoch milliseconds.
 */
async function runStage(deps, ctx, stage, now) {
  const { userId, siteId, table } = ctx;
  // The configured threshold is never allowed below the pending window.
  const effectiveDays = Math.max(stage.cutoffDays, PENDING_WINDOW_DAYS);
  const cutoffMs = now - effectiveDays * MS_PER_DAY;

  const liveEntries = await deps.manifestStore.listLive({
    userId,
    siteId,
    table,
    tier: stage.inputTier
  });

  // Group by search type and bucket; "\0" separates the two halves of the key.
  const grouped = new Map();
  for (const entry of liveEntries) {
    const bucketId = stage.bucketKey(entry);
    if (!bucketId) continue;
    // Buckets whose latest timestamp is at or after the cutoff are left alone.
    if (stage.bucketLatestMs(bucketId) >= cutoffMs) continue;
    const groupKey = `${inferSearchType(entry)}\0${bucketId}`;
    const members = grouped.get(groupKey);
    if (members) members.push(entry);
    else grouped.set(groupKey, [entry]);
  }

  for (const [groupKey, members] of grouped) {
    const [searchType, bucketId] = groupKey.split("\0");
    const targetPartition = stage.outputPartition(bucketId);
    // A lone entry that already lives in the target partition needs no rewrite.
    const alreadyInPlace = members.length === 1 && members[0].partition === targetPartition;
    if (alreadyInPlace) continue;

    const lockScope = { userId, siteId, table, partition: targetPartition };
    await deps.manifestStore.withLock(lockScope, async () => {
      const outputKey = objectKey(ctx, table, targetPartition, now, searchType);
      const sourceKeys = members.map((member) => member.objectKey);
      const { bytes, rowCount } = await deps.codec.compactRows({ table }, sourceKeys, outputKey, deps.dataSource);
      const compacted = {
        userId,
        siteId,
        table,
        partition: targetPartition,
        objectKey: outputKey,
        rowCount,
        bytes,
        createdAt: now,
        schemaVersion: currentSchemaVersion(table),
        tier: stage.outputTier,
        // searchType is recorded only when it is not "web".
        ...(searchType !== "web" ? { searchType } : {})
      };
      // Register the new object together with the entries it was built from.
      await deps.manifestStore.registerVersion(compacted, members);
    });
  }
}
|
|
110
|
+
/**
 * Lists every partition that can hold data for the inclusive date range
 * `startDate`..`endDate` (both `YYYY-MM-DD`, interpreted as UTC).
 *
 * For each day, in order, the daily partition is emitted, followed by that
 * day's weekly, monthly and quarterly partitions the first time each one is
 * encountered.
 *
 * @param startDate - First day of the range.
 * @param endDate - Last day of the range.
 * @returns Partition names; empty when `endDate` is before `startDate`.
 */
function enumeratePartitions(startDate, endDate) {
  const toUtcMidnight = (isoDate) => {
    const [year, month, day] = isoDate.split("-").map(Number);
    return Date.UTC(year, month - 1, day);
  };
  const twoDigits = (value) => String(value).padStart(2, "0");

  const partitions = [];
  const firstMs = toUtcMidnight(startDate);
  const lastMs = toUtcMidnight(endDate);
  if (lastMs < firstMs) return partitions;

  const weeks = new Set();
  const months = new Set();
  const quarters = new Set();
  // Emits toPartition(key) only the first time `key` is seen in `seen`.
  const emitOnce = (seen, key, toPartition) => {
    if (seen.has(key)) return;
    seen.add(key);
    partitions.push(toPartition(key));
  };

  // Step one day (86 400 000 ms) at a time, inclusive of both ends.
  for (let ms = firstMs; ms <= lastMs; ms += 864e5) {
    const date = new Date(ms);
    const isoMonth = `${date.getUTCFullYear()}-${twoDigits(date.getUTCMonth() + 1)}`;
    const isoDay = `${isoMonth}-${twoDigits(date.getUTCDate())}`;
    partitions.push(dayPartition(isoDay));
    emitOnce(weeks, mondayOfWeek(isoDay), weekPartition);
    emitOnce(months, isoMonth, monthPartition);
    emitOnce(quarters, quarterOfMonth(isoMonth), quarterPartition);
  }
  return partitions;
}
|
|
144
|
+
/**
 * Last millisecond (UTC) of a calendar month.
 *
 * @param month - Month as `YYYY-MM`.
 * @returns Epoch milliseconds of 23:59:59.999 UTC on the month's final day.
 */
function monthEndMs(month) {
  const parts = month.split("-");
  const year = Number(parts[0]);
  const monthNumber = Number(parts[1]);
  // Day 0 of the following (0-based) month is the last day of this one.
  return Date.UTC(year, monthNumber, 0, 23, 59, 59, 999);
}
|
|
148
|
+
/**
 * Last millisecond (UTC) of a calendar quarter.
 *
 * @param quarter - Quarter id split on `-Q`, i.e. `<year>-Q<n>`.
 * @returns Epoch milliseconds of 23:59:59.999 UTC on the quarter's final day.
 */
function quarterEndMs(quarter) {
  const [yearText, quarterText] = quarter.split("-Q");
  // Quarter n ends with month 3n; day 0 of 0-based month 3n is that month's last day.
  const monthAfterQuarter = Number(quarterText) * 3;
  return Date.UTC(Number(yearText), monthAfterQuarter, 0, 23, 59, 59, 999);
}
|
|
154
|
+
const FILES_PLACEHOLDER = "{{FILES}}";
|
|
155
|
+
/**
 * Translates dimension filters into a parameterised SQL predicate.
 *
 * @param filters - Filters with `dimension`, `operator` and `expression`.
 * @param table - Table used to resolve each dimension to a column name.
 * @returns `clause`: the conditions joined with ` AND ` (empty string when
 *   none were produced); `params`: the bound values in placeholder order.
 *   Filters with an operator not listed below contribute nothing.
 */
function buildDimensionWhere(filters, table) {
  const verbatim = (expression) => expression;
  // LIKE operands are escaped and wrapped in wildcards for substring matching.
  const substring = (expression) => `%${escapeLike(expression)}%`;
  const operators = new Map([
    ["equals", { toSql: (column) => `${column} = ?`, toParam: verbatim }],
    ["notEquals", { toSql: (column) => `${column} != ?`, toParam: verbatim }],
    ["contains", { toSql: (column) => `${column} LIKE ? ESCAPE '\\'`, toParam: substring }],
    ["notContains", { toSql: (column) => `${column} NOT LIKE ? ESCAPE '\\'`, toParam: substring }],
    ["includingRegex", { toSql: (column) => `regexp_matches(${column}, ?)`, toParam: verbatim }],
    ["excludingRegex", { toSql: (column) => `NOT regexp_matches(${column}, ?)`, toParam: verbatim }]
  ]);

  const conditions = [];
  const params = [];
  for (const filter of filters) {
    // The column is resolved for every filter, even one whose operator is then skipped.
    const column = dimensionToColumn(filter.dimension, table);
    const handler = operators.get(filter.operator);
    if (!handler) continue;
    conditions.push(handler.toSql(column));
    params.push(handler.toParam(filter.expression));
  }
  return {
    clause: conditions.join(" AND "),
    params
  };
}
|
|
192
|
+
/**
 * Produces the top-level-page predicate when the plan asks for it.
 *
 * @param plan - Logical plan; only `specialFilters.topLevel` is read.
 * @param table - Table used to resolve the `page` dimension to its column.
 * @returns The predicate SQL, or an empty string when the filter is off.
 */
function buildTopLevelWhere(plan, table) {
  return plan.specialFilters.topLevel
    ? topLevelPagePredicateSql(dimensionToColumn("page", table))
    : "";
}
|
|
196
|
+
/**
 * Translates metric filters into a parameterised `HAVING` clause.
 *
 * @param filters - Filters with `metric`, `operator`, `expression` and, for
 *   `metricBetween`, an optional `expression2`.
 * @returns `clause`: `HAVING ...` with the conditions joined by ` AND `, or
 *   an empty string when no condition was produced; `params`: the bound
 *   values in placeholder order. Unrecognised operators contribute nothing.
 */
function buildHaving(filters) {
  if (filters.length === 0) return { clause: "", params: [] };

  const comparators = new Map([
    ["metricGte", ">="],
    ["metricGt", ">"],
    ["metricLte", "<="],
    ["metricLt", "<"]
  ]);
  const conditions = [];
  const params = [];
  for (const filter of filters) {
    const metricSql = METRIC_EXPR[filter.metric];
    const { operator } = filter;
    if (operator === "metricBetween") {
      conditions.push(`${metricSql} >= ? AND ${metricSql} <= ?`);
      // Without an upper bound the range collapses to the single value.
      params.push(filter.expression, filter.expression2 ?? filter.expression);
    } else if (comparators.has(operator)) {
      conditions.push(`${metricSql} ${comparators.get(operator)} ?`);
      params.push(filter.expression);
    }
  }
  const clause = conditions.length > 0 ? `HAVING ${conditions.join(" AND ")}` : "";
  return { clause, params };
}
|
|
233
|
+
/**
 * Compiles a logical query plan into SQL over `read_parquet`.
 *
 * @param plan - Logical plan; reads `dateRange`, `metrics`,
 *   `groupByDimensions`, `dimensionFilters`, `specialFilters`,
 *   `metricFilters`, `hasDate`, `orderBy`, `rowLimit` and `startRow`.
 * @param table - Table used to resolve dimensions to columns; defaults to
 *   `plan.dataset`.
 * @returns `sql` (with `filesPlaceholder` standing in for the file list),
 *   `params` (WHERE values followed by HAVING values), the candidate
 *   `partitions` for the date range, `table` and `filesPlaceholder`.
 *
 * NOTE(review): `orderBy.column`, `orderBy.dir`, `rowLimit` and `startRow`
 * are interpolated into the SQL text rather than bound — presumably they are
 * validated when the plan is built; confirm.
 */
function compileLogicalQueryPlan(plan, table = plan.dataset) {
  const { startDate, endDate } = plan.dateRange;
  const partitions = enumeratePartitions(startDate, endDate);

  // SELECT list pieces. A dimension is aliased only when its column name differs.
  const metricColumns = plan.metrics.map((metric) => `${METRIC_EXPR[metric]} AS ${metric}`);
  const dimensionColumns = plan.groupByDimensions.map((dimension) => {
    const column = dimensionToColumn(dimension, table);
    return column === dimension ? dimension : `${column} AS ${dimension}`;
  });

  // WHERE: the date range always applies; dimension and top-level filters are optional.
  const conditions = ["date >= ?", "date <= ?"];
  const conditionParams = [startDate, endDate];
  const dimensionWhere = buildDimensionWhere(plan.dimensionFilters, table);
  if (dimensionWhere.clause) {
    conditions.push(dimensionWhere.clause);
    conditionParams.push(...dimensionWhere.params);
  }
  const topLevelCondition = buildTopLevelWhere(plan, table);
  if (topLevelCondition) conditions.push(topLevelCondition);

  const having = buildHaving(plan.metricFilters);

  const dateColumn = plan.hasDate ? ["date"] : [];
  const groupColumns = plan.groupByDimensions
    .map((dimension) => dimensionToColumn(dimension, table))
    .concat(dateColumn);

  let orderBy = "ORDER BY clicks DESC";
  if (plan.orderBy) orderBy = `ORDER BY ${plan.orderBy.column} ${plan.orderBy.dir.toUpperCase()}`;

  const selectList = [...dimensionColumns, ...dateColumn, ...metricColumns].join(", ");
  const fragments = [
    `SELECT ${selectList}`,
    `FROM read_parquet(${FILES_PLACEHOLDER}, union_by_name = true)`,
    `WHERE ${conditions.join(" AND ")}`,
    groupColumns.length > 0 ? `GROUP BY ${groupColumns.join(", ")}` : "",
    having.clause,
    orderBy,
    `LIMIT ${plan.rowLimit ?? 1000}`,
    plan.startRow ? `OFFSET ${plan.startRow}` : ""
  ];
  // Drop empty fragments, then collapse every whitespace run to a single space.
  const sql = fragments.filter(Boolean).join(" ").replace(/\s+/g, " ").trim();

  return {
    sql,
    params: [...conditionParams, ...having.params],
    partitions,
    table,
    filesPlaceholder: FILES_PLACEHOLDER
  };
}
|
|
276
|
+
/**
 * Builds a logical plan from query `state` (with regex enabled) and
 * compiles it to SQL.
 *
 * @param state - Query state handed to `buildLogicalPlan`.
 * @param table - Optional table; falls back to the plan's `dataset` when
 *   `null` or `undefined`.
 * @returns The result of `compileLogicalQueryPlan` for the built plan.
 */
function resolveToSQL(state, table) {
  const logicalPlan = buildLogicalPlan(state, { regex: true });
  const targetTable = table ?? logicalPlan.dataset;
  return compileLogicalQueryPlan(logicalPlan, targetTable);
}
|
|
280
|
+
/**
 * Renders object keys as a SQL list literal of single-quoted strings,
 * e.g. `['a.parquet', 'b.parquet']`.
 *
 * @param keys - Object keys to list.
 * @returns The list literal; `[]` when `keys` is empty. Single quotes inside
 *   a key are doubled.
 */
function fileList(keys) {
  if (keys.length === 0) return "[]";
  const quoted = keys.map((key) => `'${key.replace(/'/g, "''")}'`);
  return `[${quoted.join(", ")}]`;
}

/**
 * Replaces every `{{NAME}}` token in `sql` with the list literal built from
 * `sets[NAME]`.
 *
 * @param sql - SQL text containing `{{NAME}}` placeholders.
 * @param sets - Maps each placeholder name to its array of object keys.
 * @returns `sql` with all occurrences of each named placeholder substituted;
 *   placeholders with no entry in `sets` are left untouched.
 */
function substituteNamedFiles(sql, sets) {
  let out = sql;
  for (const [name, keys] of Object.entries(sets)) {
    // Literal split/join rather than RegExp + string replacement:
    //  - `$&`, `$'`, "$`" and `$$` inside an object key would otherwise be
    //    expanded as replacement patterns and corrupt the file list;
    //  - regex metacharacters in `name` would otherwise change what matches.
    out = out.split(`{{${name}}}`).join(fileList(keys));
  }
  return out;
}
|
|
288
|
+
export { compactTieredImpl as a, substituteNamedFiles as i, compileLogicalQueryPlan as n, enumeratePartitions as o, resolveToSQL as r, FILES_PLACEHOLDER as t };
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import { N as TableName, S as QueryExecutor, h as ParquetCodec, w as Row } from "./storage.mjs";
|
|
2
|
+
/**
 * Handle to a DuckDB instance: a query entry point plus file-buffer helpers.
 * NOTE(review): the per-member notes are read off the signatures and the
 * `makeTempPath` comment; confirm against the implementation.
 */
interface DuckDBHandle {
|
|
3
|
+
/** Runs `sql` with optional `params` and resolves to the resulting rows. */
query: (sql: string, params?: unknown[]) => Promise<Row[]>;
|
|
4
|
+
/** Takes a file name and its bytes; presumably exposes the buffer to DuckDB under `name`. */
registerFileBuffer: (name: string, bytes: Uint8Array) => Promise<void>;
|
|
5
|
+
/** Resolves to the bytes of the file called `name` (paired with `COPY TO` per the `makeTempPath` note). */
copyFileToBuffer: (name: string) => Promise<Uint8Array>;
|
|
6
|
+
/** Takes a list of file names; presumably releases those files. */
dropFiles: (names: string[]) => Promise<void>;
|
|
7
|
+
/**
|
|
8
|
+
* Returns a unique path suitable for `COPY TO '…'` + `copyFileToBuffer`.
|
|
9
|
+
* In Node this is an absolute path under `os.tmpdir()` so DuckDB doesn't
|
|
10
|
+
* litter the CWD; in browsers/Workers it's a plain virtual-FS name.
|
|
11
|
+
*/
|
|
12
|
+
makeTempPath: (ext: string) => string;
|
|
13
|
+
}
|
|
14
|
+
/** Asynchronous provider of a `DuckDBHandle`; the argument type of `createDuckDBCodec` and `createDuckDBExecutor`. */
interface DuckDBFactory {
|
|
15
|
+
/** Resolves to the handle to use. Whether the handle is cached or created per call is not visible here. */
getDuckDB: () => Promise<DuckDBHandle>;
|
|
16
|
+
}
|
|
17
|
+
/**
 * Creates a `ParquetCodec` from a DuckDB factory.
 *
 * @param factory - Provider of the `DuckDBHandle`.
 * @returns A `ParquetCodec`.
 */
declare function createDuckDBCodec(factory: DuckDBFactory): ParquetCodec;
|
|
18
|
+
/**
 * Creates a `QueryExecutor` from a DuckDB factory.
 *
 * @param factory - Provider of the `DuckDBHandle`.
 * @returns A `QueryExecutor`.
 */
declare function createDuckDBExecutor(factory: DuckDBFactory): QueryExecutor;
|
|
19
|
+
/**
|
|
20
|
+
* Canonical "empty-file" SELECT clause for a table. Codecs that need to
|
|
21
|
+
* emit a schema-correct empty Parquet can wrap this in:
|
|
22
|
+
* `COPY (SELECT * FROM <clause> WHERE FALSE) TO '<key>' (FORMAT PARQUET)`
|
|
23
|
+
* to satisfy the ParquetCodec empty-rows invariant.
*
* @param table - Table whose empty-file clause is requested.
* @returns The clause as a string, to be substituted for `<clause>` above.
|
|
24
|
+
*/
|
|
25
|
+
declare function canonicalEmptyParquetSchema(table: TableName): string;
|
|
26
|
+
export { createDuckDBExecutor as a, createDuckDBCodec as i, DuckDBHandle as n, canonicalEmptyParquetSchema as r, DuckDBFactory as t };
|