@gscdump/engine 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +53 -0
- package/dist/adapters/duckdb-node.d.mts +19 -0
- package/dist/adapters/duckdb-node.mjs +78 -0
- package/dist/adapters/filesystem.d.mts +206 -0
- package/dist/adapters/filesystem.mjs +320 -0
- package/dist/adapters/http.d.mts +227 -0
- package/dist/adapters/http.mjs +119 -0
- package/dist/adapters/hyparquet.d.mts +107 -0
- package/dist/adapters/hyparquet.mjs +250 -0
- package/dist/adapters/inspection-sqlite-browser.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-browser.mjs +42 -0
- package/dist/adapters/inspection-sqlite-node.d.mts +9 -0
- package/dist/adapters/inspection-sqlite-node.mjs +32 -0
- package/dist/adapters/node-harness.d.mts +334 -0
- package/dist/adapters/node-harness.mjs +1907 -0
- package/dist/adapters/r2-manifest.d.mts +227 -0
- package/dist/adapters/r2-manifest.mjs +355 -0
- package/dist/adapters/r2.d.mts +93 -0
- package/dist/adapters/r2.mjs +65 -0
- package/dist/arrow-utils.d.mts +14 -0
- package/dist/arrow-utils.mjs +8 -0
- package/dist/contracts.d.mts +436 -0
- package/dist/contracts.mjs +1 -0
- package/dist/entities.d.mts +238 -0
- package/dist/entities.mjs +359 -0
- package/dist/index.d.mts +1849 -0
- package/dist/index.mjs +1976 -0
- package/dist/ingest.d.mts +96 -0
- package/dist/ingest.mjs +187 -0
- package/dist/planner.d.mts +16 -0
- package/dist/planner.mjs +321 -0
- package/dist/resolver/index.d.mts +207 -0
- package/dist/resolver/index.mjs +869 -0
- package/dist/rollups.d.mts +207 -0
- package/dist/rollups.mjs +553 -0
- package/dist/schema.d.mts +1258 -0
- package/dist/schema.mjs +139 -0
- package/dist/scope.d.mts +38 -0
- package/dist/scope.mjs +28 -0
- package/dist/snapshot.d.mts +14 -0
- package/dist/snapshot.mjs +1 -0
- package/dist/sql-bind.d.mts +19 -0
- package/dist/sql-bind.mjs +92 -0
- package/dist/sql-fragments.d.mts +21 -0
- package/dist/sql-fragments.mjs +13 -0
- package/package.json +168 -0
package/dist/rollups.mjs
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
import { MS_PER_DAY } from "gscdump";
|
|
2
|
+
import { parquetWriteBuffer } from "hyparquet-writer";
|
|
3
|
+
import { date, doublePrecision, getTableConfig, integer, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
4
|
+
/**
 * Builds a fresh set of the three metric column definitions shared by every
 * table in `drizzleSchema`: `clicks`, `impressions` and `sum_position`.
 *
 * A new set of builders is created on every call so each table gets its own
 * column instances.
 *
 * @returns {{clicks: object, impressions: object, sum_position: object}}
 *   Non-null drizzle column builders keyed by column name.
 */
function metricCols() {
  const clicks = integer("clicks").notNull();
  const impressions = integer("impressions").notNull();
  const sum_position = doublePrecision("sum_position").notNull();
  return { clicks, impressions, sum_position };
}
|
|
11
|
+
/** Creates the non-null `date` column definition used by every table. */
const dateCol = () => {
  const column = date("date");
  return column.notNull();
};
|
|
12
|
+
/**
 * Drizzle table definitions for every fact table, keyed by table name.
 *
 * Each table lists its dimension column(s) first, then `date`, then the
 * shared metric columns from `metricCols()`. `query_canonical` is the only
 * column declared without `.notNull()`.
 */
const drizzleSchema = (() => {
  // Small local builders so each table definition reads as one line per column.
  const requiredText = (name) => varchar(name).notNull();
  const optionalText = (name) => varchar(name);

  return {
    pages: pgTable("pages", {
      url: requiredText("url"),
      date: dateCol(),
      ...metricCols(),
    }),
    keywords: pgTable("keywords", {
      query: requiredText("query"),
      query_canonical: optionalText("query_canonical"),
      date: dateCol(),
      ...metricCols(),
    }),
    countries: pgTable("countries", {
      country: requiredText("country"),
      date: dateCol(),
      ...metricCols(),
    }),
    devices: pgTable("devices", {
      device: requiredText("device"),
      date: dateCol(),
      ...metricCols(),
    }),
    page_keywords: pgTable("page_keywords", {
      url: requiredText("url"),
      query: requiredText("query"),
      query_canonical: optionalText("query_canonical"),
      date: dateCol(),
      ...metricCols(),
    }),
    search_appearance: pgTable("search_appearance", {
      searchAppearance: requiredText("searchAppearance"),
      date: dateCol(),
      ...metricCols(),
    }),
  };
})();
|
|
47
|
+
/**
 * Per-table metadata that the drizzle definitions do not carry: the ordered
 * list of sort-key columns and a schema version number.
 */
const TABLE_METADATA = (() => {
  // Keeps the property order of every entry as { sortKey, version }.
  const meta = (sortKey, version) => ({ sortKey, version });

  return {
    pages: meta(["date", "url"], 1),
    keywords: meta(["date", "query"], 2),
    countries: meta(["date", "country"], 1),
    devices: meta(["date", "device"], 1),
    page_keywords: meta(["date", "url", "query"], 2),
    search_appearance: meta(["date", "searchAppearance"], 1),
  };
})();
|
|
77
|
+
/**
 * Maps a Postgres SQL type string to one of the engine column type names
 * (`VARCHAR`, `DATE`, `DOUBLE`, `BIGINT`, `INTEGER`).
 *
 * Matching is case-insensitive. Some families are matched by prefix (so a
 * length or precision suffix is accepted) and others by exact name.
 *
 * @param {string} sqlType SQL type string, e.g. `varchar(255)` or `integer`.
 * @returns {"VARCHAR"|"DATE"|"DOUBLE"|"BIGINT"|"INTEGER"} Engine column type.
 * @throws {Error} When the type does not match any known family.
 */
function pgSqlTypeToColumnType(sqlType) {
  const normalized = sqlType.toLowerCase();
  const hasPrefix = (...prefixes) => prefixes.some((prefix) => normalized.startsWith(prefix));
  const isExactly = (...names) => names.includes(normalized);

  if (hasPrefix("varchar", "char") || isExactly("text")) {
    return "VARCHAR";
  }
  // Timestamps are deliberately folded into DATE by the original mapping.
  if (isExactly("date") || hasPrefix("timestamp")) {
    return "DATE";
  }
  if (hasPrefix("double", "numeric", "decimal") || isExactly("real")) {
    return "DOUBLE";
  }
  if (isExactly("bigint", "int8")) {
    return "BIGINT";
  }
  if (isExactly("integer", "int", "int4", "smallint", "int2")) {
    return "INTEGER";
  }
  throw new Error(`unmapped pg type '${sqlType}' — extend pgSqlTypeToColumnType in @gscdump/engine/schema`);
}
|
|
86
|
+
/**
 * Derives the engine-level schema description of one table from its drizzle
 * definition plus its `TABLE_METADATA` entry.
 *
 * @param {string} tableName Key into `drizzleSchema` / `TABLE_METADATA`.
 * @returns {{name: string, columns: Array<{name: string, type: string, nullable: boolean}>, sortKey: string[], version: number}}
 * @throws {Error} When a column uses a SQL type `pgSqlTypeToColumnType` cannot map.
 */
function tableSchemaFrom(tableName) {
  const tableConfig = getTableConfig(drizzleSchema[tableName]);
  const columns = [];
  for (const column of tableConfig.columns) {
    columns.push({
      name: column.name,
      type: pgSqlTypeToColumnType(column.getSQLType()),
      nullable: !column.notNull,
    });
  }
  // Metadata is read only after the columns were mapped, as in the original.
  const { sortKey, version } = TABLE_METADATA[tableName];
  return { name: tableName, columns, sortKey, version };
}
|
|
100
|
+
// Derive the schema of every known table once at module load. The results are
// not retained here; the effect that remains is that an unmapped column type
// makes the module fail at import time (tableSchemaFrom throws).
for (const tableName of [
  "pages",
  "keywords",
  "countries",
  "devices",
  "page_keywords",
  "search_appearance",
]) {
  tableSchemaFrom(tableName);
}
|
|
108
|
+
/** Default number of rows per parquet row group used by `encodeRowsToParquetFlex`. */
const ROW_GROUP_SIZE = 25000;
|
|
109
|
+
/**
 * Translates an engine column type into the basic type name handed to the
 * parquet writer.
 *
 * @param {string} colType One of `VARCHAR`, `DATE`, `BIGINT`, `INTEGER`, `DOUBLE`.
 * @returns {"STRING"|"INT64"|"INT32"|"DOUBLE"} Parquet writer basic type.
 * @throws {Error} For any other column type.
 */
function basicTypeFor(colType) {
  switch (colType) {
    case "VARCHAR":
    case "DATE":
      // Dates are written as strings.
      return "STRING";
    case "BIGINT":
      return "INT64";
    case "INTEGER":
      return "INT32";
    case "DOUBLE":
      return "DOUBLE";
    default:
      throw new Error(`unsupported column type for parquet encoding: ${colType}`);
  }
}
|
|
116
|
+
/**
 * Coerces a single cell value to the JavaScript representation expected for
 * the given parquet basic type.
 *
 * - `null` / `undefined` always become `null`.
 * - `STRING`: non-strings are converted with `String()`.
 * - `INT32`: converted to a number and truncated toward zero.
 * - `INT64`: bigints pass through; anything else is converted to a number,
 *   truncated, then turned into a bigint.
 * - `DOUBLE`: converted to a number.
 * - Any other type: the value is returned unchanged.
 *
 * @param {*} value Raw cell value.
 * @param {string} type Parquet basic type name.
 * @returns {*} The coerced value.
 * @throws {Error} When a numeric target receives a value that is not finite.
 */
function coerceValue(value, type) {
  if (value == null) {
    return null;
  }
  if (type === "STRING") {
    return typeof value === "string" ? value : String(value);
  }

  const isNumericTarget = type === "INT32" || type === "INT64" || type === "DOUBLE";
  if (!isNumericTarget) {
    return value;
  }
  if (type === "INT64" && typeof value === "bigint") {
    return value;
  }

  const numeric = typeof value === "number" ? value : Number(value);
  if (!Number.isFinite(numeric)) {
    throw new Error(`non-finite number for ${type}: ${String(value)}`);
  }
  if (type === "DOUBLE") {
    return numeric;
  }
  const whole = Math.trunc(numeric);
  return type === "INT64" ? BigInt(whole) : whole;
}
|
|
137
|
+
/**
 * Three-way comparator for sort-key cell values.
 *
 * Ordering rules:
 * - `null` and `undefined` are treated as one "missing" value that sorts
 *   before everything else and compares equal to itself.
 * - Two numbers compare by numeric difference.
 * - Bigints (and bigint/number mixes) compare numerically. Comparing them
 *   through `String()` would order `10n` before `9n`.
 * - Everything else compares by its `String()` form, returning 0 when the
 *   string forms are equal so the comparator stays consistent for `sort()`.
 *
 * @param {*} a Left value.
 * @param {*} b Right value.
 * @returns {number} Negative when `a` sorts first, positive when `b` sorts
 *   first, 0 when they are equivalent.
 */
function compareValues(a, b) {
  if (a === b) return 0;

  const aMissing = a === null || a === undefined;
  const bMissing = b === null || b === undefined;
  if (aMissing || bMissing) {
    if (aMissing && bMissing) return 0;
    return aMissing ? -1 : 1;
  }

  if (typeof a === "number" && typeof b === "number") return a - b;

  const aNumeric = typeof a === "number" || typeof a === "bigint";
  const bNumeric = typeof b === "number" || typeof b === "bigint";
  if (aNumeric && bNumeric) {
    // Relational operators compare bigint and number operands by value.
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
  }

  const left = String(a);
  const right = String(b);
  if (left < right) return -1;
  if (left > right) return 1;
  return 0;
}
|
|
144
|
+
/**
 * Encodes an array of row objects into a parquet file held in memory.
 *
 * Rows are optionally sorted (on a copy, the input array is not mutated) by
 * the given sort-key columns, then pivoted into one data array per column
 * with every cell coerced to the column's parquet basic type.
 *
 * @param {Array<object>} rows Row objects keyed by column name.
 * @param {object} opts
 * @param {Array<{name: string, type: string, nullable: boolean}>} opts.columns
 *   Columns to write, in output order.
 * @param {string[]} [opts.sortKey=[]] Columns to sort by, most significant first.
 * @param {number} [opts.rowGroupSize=ROW_GROUP_SIZE] Rows per parquet row group.
 * @returns {Uint8Array} The encoded parquet bytes.
 */
function encodeRowsToParquetFlex(rows, opts) {
  const { columns, sortKey = [], rowGroupSize = ROW_GROUP_SIZE } = opts;

  // Compares two rows column by column; the first differing column decides.
  const bySortKey = (left, right) => {
    for (const keyColumn of sortKey) {
      const order = compareValues(left[keyColumn], right[keyColumn]);
      if (order !== 0) {
        return order;
      }
    }
    return 0;
  };

  const needsSort = sortKey.length > 0 && rows.length > 1;
  const ordered = needsSort ? [...rows].sort(bySortKey) : rows;

  const columnData = columns.map((column) => {
    const type = basicTypeFor(column.type);
    return {
      name: column.name,
      data: ordered.map((row) => coerceValue(row[column.name], type)),
      type,
      nullable: column.nullable,
      columnIndex: true,
    };
  });

  const encoded = parquetWriteBuffer({ columnData, rowGroupSize });
  return new Uint8Array(encoded);
}
|
|
169
|
+
/**
 * Hashes a URL string into a 16-character lowercase hex string.
 *
 * The state is two 32-bit words that are updated once per UTF-16 code unit:
 * the unit is XORed into the low word, then both words are multiplied by 435
 * with the carry from the low word folded into the high word.
 *
 * NOTE(review): the structure resembles a 64-bit FNV-1a, but the seed words
 * and the cross term do not match the published FNV constants. The output is
 * used as a record key by the indexing metadata store, so the arithmetic must
 * stay exactly as it is to keep previously written keys reachable.
 *
 * @param {string} url String to hash.
 * @returns {string} High word then low word, each as 8 zero-padded hex digits.
 */
function hashUrl(url) {
  const MULTIPLIER = 435;
  const TWO_POW_32 = 4294967296;

  let high = 2166136261;
  let low = 3421674724;

  let index = 0;
  while (index < url.length) {
    // XOR coerces `low` to a signed 32-bit integer; later steps rely on that.
    low ^= url.charCodeAt(index);
    const carry = Math.floor(low * MULTIPLIER / TWO_POW_32);
    const nextHigh = (Math.imul(high, MULTIPLIER) + Math.imul(low, 1) + carry) >>> 0;
    low = Math.imul(low, MULTIPLIER) >>> 0;
    high = nextHigh;
    index += 1;
  }

  const toHex32 = (word) => (word >>> 0).toString(16).padStart(8, "0");
  return toHex32(high) + toHex32(low);
}
|
|
183
|
+
/**
 * Builds the object key of the indexing metadata index file for a context.
 *
 * @param {{userId: *, siteId?: *}} ctx Context; a falsy `siteId` selects the
 *   user-level key, otherwise the site segment is included.
 * @returns {string} Object key ending in `entities/indexing/index.json`.
 */
function indexingMetadataIndexKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/indexing/index.json`;
  }
  return `u_${ctx.userId}/entities/indexing/index.json`;
}
|
|
186
|
+
/**
 * Creates a store for per-URL indexing metadata records. All records of one
 * context live in a single JSON index object, keyed by the hash of the URL.
 *
 * @param {object} opts
 * @param {{read: Function, write: Function}} opts.dataSource Storage used to
 *   read and write the index bytes.
 * @param {(url: string) => string} [opts.hash=hashUrl] Function that derives
 *   the record key from a URL.
 * @returns {{writeBatch: Function, loadIndex: Function, getLatest: Function}}
 */
function createIndexingMetadataStore(opts) {
  const { dataSource } = opts;
  const hashOf = opts.hash ?? hashUrl;

  const parseIndex = (bytes) => JSON.parse(new TextDecoder().decode(bytes));
  const emptyIndex = () => ({ version: 1, records: {} });

  /**
   * Loads the index stored under `key`.
   *
   * NOTE(review): any rejection from `dataSource.read` yields an empty index,
   * not only "not found". A transient read failure followed by `writeBatch`
   * would therefore overwrite the stored index. JSON parse errors still
   * propagate.
   */
  async function readIndex(key) {
    return await dataSource.read(key).then(parseIndex, emptyIndex);
  }

  /** Merges `records` into the stored index and writes it back. */
  async function writeBatch(ctx, records) {
    if (records.length === 0) {
      return;
    }
    const key = indexingMetadataIndexKey(ctx);
    const index = await readIndex(key);
    for (const record of records) {
      // The last record for a URL wins.
      index.records[hashOf(record.url)] = record;
    }
    const serialized = JSON.stringify(index);
    await dataSource.write(key, new TextEncoder().encode(serialized));
  }

  /** Returns the whole index for the context. */
  async function loadIndex(ctx) {
    return readIndex(indexingMetadataIndexKey(ctx));
  }

  /** Returns the stored record for `url`, or `undefined` when none exists. */
  async function getLatest(ctx, url) {
    const index = await readIndex(indexingMetadataIndexKey(ctx));
    return index.records[hashOf(url)];
  }

  return { writeBatch, loadIndex, getLatest };
}
|
|
211
|
+
/**
 * Builds the key prefix under which rollup objects of a context are stored.
 *
 * @param {{userId: *, siteId?: *}} ctx Context; a falsy `siteId` selects the
 *   user-level prefix.
 * @returns {string} Prefix ending in `/rollups` (no trailing slash).
 */
function rollupPrefix(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/rollups`;
  }
  return `u_${ctx.userId}/rollups`;
}
|
|
214
|
+
/**
 * Builds the object key of a rollup's JSON envelope.
 *
 * @param {{userId: *, siteId?: *}} ctx Context that selects the key prefix.
 * @param {string} id Rollup id.
 * @param {number} builtAt Build timestamp embedded in the key as the version.
 * @returns {string} `<prefix>/<id>__v<builtAt>.json`
 */
function rollupKey(ctx, id, builtAt) {
  const prefix = rollupPrefix(ctx);
  return `${prefix}/${id}__v${builtAt}.json`;
}
|
|
217
|
+
/**
 * Builds the object key of a rollup's parquet data file.
 *
 * @param {{userId: *, siteId?: *}} ctx Context that selects the key prefix.
 * @param {string} id Rollup id.
 * @param {number} builtAt Build timestamp embedded in the key as the version.
 * @returns {string} `<prefix>/<id>__v<builtAt>.parquet`
 */
function rollupParquetKey(ctx, id, builtAt) {
  const prefix = rollupPrefix(ctx);
  return `${prefix}/${id}__v${builtAt}.parquet`;
}
|
|
220
|
+
/**
 * Builds every rollup definition in order and writes the results through the
 * data source.
 *
 * For each definition a JSON envelope `{version, id, builtAt, windowDays,
 * payload}` is written. For definitions with `format === "parquet"` the built
 * rows are first written as a separate parquet object, and the envelope
 * payload is a pointer `{parquetKey, rowCount}` instead of the rows.
 *
 * Definitions are processed sequentially, and each gets its own `builtAt`
 * taken immediately before its `build` runs.
 *
 * @param {object} opts
 * @param {Array<object>} opts.defs Rollup definitions to build.
 * @param {object} opts.engine Query engine handed to each `build`.
 * @param {object} opts.ctx Context used for queries and object keys.
 * @param {{write: Function}} opts.dataSource Destination for written objects.
 * @param {() => number} [opts.now] Clock override; defaults to `Date.now`.
 * @returns {Promise<Array<object>>} One summary per definition with `id`,
 *   `objectKey`, `bytes`, `builtAt`, plus `parquetKey` and `parquetBytes` for
 *   parquet rollups.
 * @throws {Error} When a parquet rollup declares no `parquetColumns`. The
 *   check runs after that definition's `build` has completed.
 */
async function rebuildRollups(opts) {
  const { defs, engine, ctx, dataSource } = opts;
  const clock = opts.now ?? (() => Date.now());
  const toJsonBytes = (value) => new TextEncoder().encode(JSON.stringify(value));

  const summaries = [];
  for (const def of defs) {
    const builtAt = clock();
    const built = await def.build({ engine, ctx, dataSource, builtAt });

    let payload = built;
    let parquet = null;
    if (def.format === "parquet") {
      const parquetColumns = def.parquetColumns;
      if (!parquetColumns || parquetColumns.length === 0) {
        throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);
      }
      const rows = built ?? [];
      const parquetBytes = encodeRowsToParquetFlex(rows, {
        columns: parquetColumns,
        sortKey: def.parquetSortKey,
      });
      const parquetKey = rollupParquetKey(ctx, def.id, builtAt);
      // The data file is written before the envelope that points at it.
      await dataSource.write(parquetKey, parquetBytes);
      payload = { parquetKey, rowCount: rows.length };
      parquet = { key: parquetKey, byteLength: parquetBytes.byteLength };
    }

    const envelopeBytes = toJsonBytes({
      version: 1,
      id: def.id,
      builtAt,
      windowDays: def.windowDays,
      payload,
    });
    const objectKey = rollupKey(ctx, def.id, builtAt);
    await dataSource.write(objectKey, envelopeBytes);

    if (parquet) {
      summaries.push({
        id: def.id,
        objectKey,
        parquetKey: parquet.key,
        bytes: envelopeBytes.byteLength,
        parquetBytes: parquet.byteLength,
        builtAt,
      });
    } else {
      summaries.push({
        id: def.id,
        objectKey,
        bytes: envelopeBytes.byteLength,
        builtAt,
      });
    }
  }
  return summaries;
}
|
|
284
|
+
/**
 * Formats the UTC calendar date that lies `days` days before the instant `at`.
 *
 * @param {number} at Instant in epoch milliseconds.
 * @param {number} days Number of days to subtract.
 * @returns {string} `YYYY-MM-DD` with month and day zero-padded to two digits
 *   (the year is not padded).
 */
function utcDateMinusDays(at, days) {
  const shifted = new Date(at - days * MS_PER_DAY);
  const twoDigits = (part) => String(part).padStart(2, "0");
  const year = shifted.getUTCFullYear();
  const month = twoDigits(shifted.getUTCMonth() + 1);
  const day = twoDigits(shifted.getUTCDate());
  return `${year}-${month}-${day}`;
}
|
|
288
|
+
/**
 * Rollup `daily_totals`: one row per date with summed clicks, impressions and
 * sum_position from the `pages` table, plus `anonymizedImpressionsPct`.
 *
 * `anonymizedImpressionsPct` is `1 - keywordImpressions / pageImpressions`
 * for the date, clamped to [0, 1], and 0 when the page impressions are 0.
 * Dates missing from the `keywords` result count as 0 keyword impressions.
 */
const dailyTotalsRollup = {
  id: "daily_totals",
  windowDays: null,
  async build({ engine, ctx }) {
    const pagesSql = `
SELECT
date,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
GROUP BY date
ORDER BY date
`;
    const keywordsSql = `
SELECT
date,
SUM(impressions)::BIGINT AS impressions
FROM read_parquet({{FILES}}, union_by_name = true)
GROUP BY date
`;

    // The two queries run one after the other, pages first.
    const pageTotals = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { FILES: { table: "pages" } },
      sql: pagesSql,
    });
    const keywordTotals = await engine.runSQL({
      ctx,
      table: "keywords",
      fileSets: { FILES: { table: "keywords" } },
      sql: keywordsSql,
    });

    const keywordImpressions = new Map(
      keywordTotals.rows.map((row) => [String(row.date), BigInt(row.impressions)]),
    );
    const clampToUnit = (ratio) => Math.max(0, Math.min(1, ratio));

    return pageTotals.rows.map((row) => {
      const pageImpressions = BigInt(row.impressions);
      const fromKeywords = keywordImpressions.get(String(row.date)) ?? 0n;
      let anonymized = 0;
      if (pageImpressions !== 0n) {
        anonymized = 1 - Number(fromKeywords) / Number(pageImpressions);
      }
      return {
        date: row.date,
        clicks: Number(row.clicks),
        impressions: Number(row.impressions),
        sum_position: Number(row.sum_position),
        anonymizedImpressionsPct: clampToUnit(anonymized),
      };
    });
  },
};
|
|
335
|
+
/**
 * Rollup `weekly_totals`: one row per week with summed clicks, impressions
 * and sum_position from the `pages` table. The week label is the date
 * produced by `date_trunc('week', ...)`, formatted as `YYYY-MM-DD`.
 */
const weeklyTotalsRollup = {
  id: "weekly_totals",
  windowDays: null,
  async build({ engine, ctx }) {
    const sql = `
SELECT
strftime(date_trunc('week', date::DATE), '%Y-%m-%d') AS week,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
GROUP BY 1
ORDER BY 1
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { FILES: { table: "pages" } },
      sql,
    });
    // Aggregates may arrive as bigint or string; normalise them to numbers.
    const toPlainRow = (row) => ({
      week: row.week,
      clicks: Number(row.clicks),
      impressions: Number(row.impressions),
      sum_position: Number(row.sum_position),
    });
    return rows.map(toPlainRow);
  },
};
|
|
361
|
+
/**
 * Rollup `top_pages_28d`: up to 1000 URLs from the `pages` table ordered by
 * summed clicks (descending), restricted to dates on or after the UTC date
 * 28 days before `builtAt`.
 */
const topPages28dRollup = {
  id: "top_pages_28d",
  windowDays: 28,
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const sql = `
SELECT
url,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY url
ORDER BY clicks DESC
LIMIT 1000
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { FILES: { table: "pages" } },
      sql,
    });
    const toPlainRow = (row) => ({
      url: row.url,
      clicks: Number(row.clicks),
      impressions: Number(row.impressions),
      sum_position: Number(row.sum_position),
    });
    return rows.map(toPlainRow);
  },
};
|
|
390
|
+
/**
 * Rollup `top_countries_28d`: up to 250 countries from the `countries` table
 * ordered by summed clicks (descending), restricted to dates on or after the
 * UTC date 28 days before `builtAt`.
 */
const topCountries28dRollup = {
  id: "top_countries_28d",
  windowDays: 28,
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const sql = `
SELECT
country,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY country
ORDER BY clicks DESC
LIMIT 250
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "countries",
      fileSets: { FILES: { table: "countries" } },
      sql,
    });
    const toPlainRow = (row) => ({
      country: row.country,
      clicks: Number(row.clicks),
      impressions: Number(row.impressions),
      sum_position: Number(row.sum_position),
    });
    return rows.map(toPlainRow);
  },
};
|
|
419
|
+
/**
 * Rollup `top_keywords_28d`: up to 1000 queries from the `keywords` table
 * ordered by summed clicks (descending), restricted to dates on or after the
 * UTC date 28 days before `builtAt`.
 */
const topKeywords28dRollup = {
  id: "top_keywords_28d",
  windowDays: 28,
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const sql = `
SELECT
query,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY query
ORDER BY clicks DESC
LIMIT 1000
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "keywords",
      fileSets: { FILES: { table: "keywords" } },
      sql,
    });
    const toPlainRow = (row) => ({
      query: row.query,
      clicks: Number(row.clicks),
      impressions: Number(row.impressions),
      sum_position: Number(row.sum_position),
    });
    return rows.map(toPlainRow);
  },
};
|
|
448
|
+
/**
 * Rollup `top_keywords_28d_parquet`: the same query as `top_keywords_28d`,
 * but declared with `format: "parquet"` so the rows are stored as a parquet
 * file. Clicks and impressions are returned as bigints to match the BIGINT
 * columns; rows are sorted by `clicks` when encoded.
 */
const topKeywords28dParquetRollup = {
  id: "top_keywords_28d_parquet",
  windowDays: 28,
  format: "parquet",
  parquetColumns: [
    { name: "query", type: "VARCHAR", nullable: false },
    { name: "clicks", type: "BIGINT", nullable: false },
    { name: "impressions", type: "BIGINT", nullable: false },
    { name: "sum_position", type: "DOUBLE", nullable: false },
  ],
  parquetSortKey: ["clicks"],
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const sql = `
SELECT
query,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY query
ORDER BY clicks DESC
LIMIT 1000
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "keywords",
      fileSets: { FILES: { table: "keywords" } },
      sql,
    });
    const toTypedRow = (row) => ({
      query: String(row.query),
      clicks: BigInt(row.clicks),
      impressions: BigInt(row.impressions),
      sum_position: Number(row.sum_position),
    });
    return rows.map(toTypedRow);
  },
};
|
|
501
|
+
/**
 * Rollup `indexing_metadata`: summarises the indexing metadata index of a
 * context.
 *
 * The payload has overall totals (record count, how many records carry a
 * `latestUpdateAt` / `latestRemoveAt`, and the greatest of each timestamp)
 * and a per-day list counting records by the first 10 characters of those
 * timestamps.
 *
 * NOTE(review): timestamps are compared as strings and sliced to 10
 * characters, which presumably expects ISO-8601 text — confirm against the
 * code that writes the records.
 */
const indexingMetadataRollup = {
  id: "indexing_metadata",
  windowDays: null,
  async build({ dataSource, ctx }) {
    const store = createIndexingMetadataStore({ dataSource });
    const index = await store.loadIndex(ctx);
    const records = Object.values(index.records);

    const newTally = () => ({ perDay: new Map(), total: 0, latest: undefined });
    const updates = newTally();
    const removes = newTally();

    // Counts one timestamp into a tally; falsy timestamps are skipped.
    const count = (tally, timestamp) => {
      if (!timestamp) {
        return;
      }
      tally.total++;
      const day = timestamp.slice(0, 10);
      tally.perDay.set(day, (tally.perDay.get(day) ?? 0) + 1);
      if (!tally.latest || timestamp > tally.latest) {
        tally.latest = timestamp;
      }
    };

    for (const record of records) {
      count(updates, record.latestUpdateAt);
      count(removes, record.latestRemoveAt);
    }

    const allDays = [...new Set([...updates.perDay.keys(), ...removes.perDay.keys()])].sort();
    const days = allDays.map((day) => ({
      day,
      updates: updates.perDay.get(day) ?? 0,
      removes: removes.perDay.get(day) ?? 0,
    }));

    return {
      totals: {
        urls: records.length,
        updates: updates.total,
        removes: removes.total,
        latestUpdateAt: updates.latest ?? null,
        latestRemoveAt: removes.latest ?? null,
      },
      days,
    };
  },
};
|
|
545
|
+
/**
 * The default list of rollup definitions, in build order.
 *
 * `topKeywords28dParquetRollup` is exported by this module but is not part of
 * this list.
 */
const DEFAULT_ROLLUPS = [
  // Totals over time.
  dailyTotalsRollup,
  weeklyTotalsRollup,
  // Top-N lists for the trailing 28 days.
  topPages28dRollup,
  topKeywords28dRollup,
  topCountries28dRollup,
  // Summary of the indexing metadata index.
  indexingMetadataRollup,
];
|
|
553
|
+
export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|