@gscdump/engine 0.9.2 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/_chunks/dispatch.mjs +11 -17
- package/dist/_chunks/engine.mjs +622 -0
- package/dist/_chunks/pg-adapter.mjs +1 -10
- package/dist/_chunks/registry.d.mts +137 -15
- package/dist/_chunks/resolver.mjs +2 -25
- package/dist/_chunks/snapshot.d.mts +14 -0
- package/dist/_chunks/storage.d.mts +1 -20
- package/dist/adapters/node.d.mts +91 -0
- package/dist/adapters/node.mjs +133 -0
- package/dist/analyzer/index.d.mts +4 -50
- package/dist/analyzer/index.mjs +17 -8
- package/dist/entities.d.mts +116 -2
- package/dist/entities.mjs +453 -1
- package/dist/index.d.mts +3 -2
- package/dist/index.mjs +7 -621
- package/dist/planner.d.mts +1 -1
- package/dist/planner.mjs +1 -1
- package/dist/resolver/index.d.mts +1 -23
- package/dist/resolver/index.mjs +3 -3
- package/dist/rollups.d.mts +196 -0
- package/dist/rollups.mjs +546 -0
- package/dist/schedule.d.mts +19 -0
- package/dist/schedule.mjs +100 -0
- package/dist/snapshot.d.mts +1 -13
- package/dist/source/index.d.mts +30 -8
- package/dist/source/index.mjs +42 -7
- package/package.json +15 -5
- package/dist/_chunks/source-types.d.mts +0 -31
- /package/dist/_chunks/{planner.mjs → compiler.mjs} +0 -0
package/dist/rollups.mjs
ADDED
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
|
|
2
|
+
import { createIndexingMetadataStore, createInspectionStore, createSitemapStore } from "./entities.mjs";
|
|
3
|
+
import { MS_PER_DAY } from "gscdump";
|
|
4
|
+
/**
 * Build the object-key prefix under which rollups are stored.
 *
 * @param {object} ctx - Context object; `ctx.userId` is always used and
 *   `ctx.siteId` is included only when it is truthy.
 * @returns {string} `u_<userId>/<siteId>/rollups`, or `u_<userId>/rollups`
 *   when `ctx.siteId` is falsy.
 */
function rollupPrefix(ctx) {
  const segments = [`u_${ctx.userId}`];
  if (ctx.siteId) segments.push(`${ctx.siteId}`);
  segments.push("rollups");
  return segments.join("/");
}
|
|
7
|
+
/**
 * Object key of the JSON envelope for one rollup build.
 *
 * @param {object} ctx - Context passed through to `rollupPrefix`.
 * @param {string} id - Rollup id.
 * @param {number} builtAt - Build stamp embedded in the key as `__v<builtAt>`.
 * @returns {string} `<prefix>/<id>__v<builtAt>.json`
 */
function rollupKey(ctx, id, builtAt) {
  const prefix = rollupPrefix(ctx);
  return `${prefix}/${id}__v${builtAt}.json`;
}
|
|
10
|
+
/**
 * Object key of the Parquet file written for one parquet-format rollup build.
 *
 * @param {object} ctx - Context passed through to `rollupPrefix`.
 * @param {string} id - Rollup id.
 * @param {number} builtAt - Build stamp embedded in the key as `__v<builtAt>`.
 * @returns {string} `<prefix>/<id>__v<builtAt>.parquet`
 */
function rollupParquetKey(ctx, id, builtAt) {
  const prefix = rollupPrefix(ctx);
  return `${prefix}/${id}__v${builtAt}.parquet`;
}
|
|
13
|
+
/**
 * Build every rollup definition in `opts.defs`, one after another, and persist
 * each result through `opts.dataSource.write`.
 *
 * For each definition a JSON envelope `{ version, id, builtAt, windowDays, payload }`
 * is written under `rollupKey(...)`. When `def.format === "parquet"` the rows
 * returned by `def.build` are first encoded to Parquet and written under
 * `rollupParquetKey(...)`; the envelope payload is then a pointer
 * `{ parquetKey, rowCount }` instead of the data itself.
 *
 * @param {object} opts
 * @param {Iterable<object>} opts.defs - Rollup definitions (`id`, `windowDays`, `build`, optional `format`, `parquetColumns`, `parquetSortKey`).
 * @param {object} opts.engine - Passed to each `def.build`.
 * @param {object} opts.ctx - Passed to each `def.build` and used to derive object keys.
 * @param {object} opts.dataSource - Must expose `write(key, bytes)`; also passed to each `def.build`.
 * @param {() => number} [opts.now] - Clock used for `builtAt`; defaults to `Date.now()`.
 * @returns {Promise<object[]>} One summary per definition, in input order.
 * @throws {Error} When a parquet-format definition has no `parquetColumns`
 *   (checked after that definition's `build` has run).
 */
async function rebuildRollups(opts) {
  const { ctx, dataSource, engine } = opts;
  const now = opts.now ?? (() => Date.now());
  const summaries = [];

  // Serialises and writes the JSON envelope; shared by both output formats.
  const writeEnvelope = async (def, builtAt, payload) => {
    const envelope = {
      version: 1,
      id: def.id,
      builtAt,
      windowDays: def.windowDays,
      payload
    };
    const encoded = new TextEncoder().encode(JSON.stringify(envelope));
    const objectKey = rollupKey(ctx, def.id, builtAt);
    await dataSource.write(objectKey, encoded);
    return { objectKey, byteLength: encoded.byteLength };
  };

  for (const def of opts.defs) {
    const builtAt = now();
    const payload = await def.build({ engine, ctx, dataSource, builtAt });

    if (def.format !== "parquet") {
      const written = await writeEnvelope(def, builtAt, payload);
      summaries.push({
        id: def.id,
        objectKey: written.objectKey,
        bytes: written.byteLength,
        builtAt
      });
      continue;
    }

    const hasColumns = Boolean(def.parquetColumns) && def.parquetColumns.length !== 0;
    if (!hasColumns) throw new Error(`rollup '${def.id}' declared format='parquet' without parquetColumns`);

    // A nullish payload is treated as "no rows".
    const rows = payload ?? [];
    const parquetBytes = encodeRowsToParquetFlex(rows, {
      columns: def.parquetColumns,
      sortKey: def.parquetSortKey
    });
    const parquetKey = rollupParquetKey(ctx, def.id, builtAt);
    // The Parquet object is written before the envelope that points at it.
    await dataSource.write(parquetKey, parquetBytes);
    const written = await writeEnvelope(def, builtAt, {
      parquetKey,
      rowCount: rows.length
    });
    summaries.push({
      id: def.id,
      objectKey: written.objectKey,
      parquetKey,
      bytes: written.byteLength,
      parquetBytes: parquetBytes.byteLength,
      builtAt
    });
  }
  return summaries;
}
|
|
77
|
+
/**
 * Format the UTC calendar date that lies `days` days before the instant `at`.
 *
 * @param {number} at - Epoch timestamp in milliseconds.
 * @param {number} days - Number of days to subtract (0 yields the date of `at`).
 * @returns {string} `YYYY-MM-DD`, with month and day zero-padded to two digits.
 */
function utcDateMinusDays(at, days) {
  const shifted = new Date(at - days * MS_PER_DAY);
  const twoDigits = (value) => String(value).padStart(2, "0");
  const year = shifted.getUTCFullYear();
  const month = twoDigits(shifted.getUTCMonth() + 1); // getUTCMonth() is 0-based
  const day = twoDigits(shifted.getUTCDate());
  return [year, month, day].join("-");
}
|
|
81
|
+
/**
 * Rollup `daily_totals`: per-date sums of clicks, impressions and sum_position
 * from the `pages` table, plus `anonymizedImpressionsPct`, the share of page
 * impressions that is not matched by impressions in the `keywords` table for
 * the same date (clamped to the range 0..1).
 */
const dailyTotalsRollup = {
  id: "daily_totals",
  windowDays: null,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Forwarded unchanged to `engine.runSQL`.
   * @returns {Promise<object[]>} One entry per row of the pages query, in query order.
   */
  async build({ engine, ctx }) {
    // The two queries run one after the other (pages first, then keywords).
    const pageTotals = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { FILES: { table: "pages" } },
      sql: `
SELECT
date,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
GROUP BY date
ORDER BY date
`
    });
    const keywordTotals = await engine.runSQL({
      ctx,
      table: "keywords",
      fileSets: { FILES: { table: "keywords" } },
      sql: `
SELECT
date,
SUM(impressions)::BIGINT AS impressions
FROM read_parquet({{FILES}}, union_by_name = true)
GROUP BY date
`
    });

    const ZERO = BigInt(0);
    const keywordImpressionsByDate = new Map(
      keywordTotals.rows.map((row) => [String(row.date), BigInt(row.impressions)])
    );

    const toDailyEntry = (row) => {
      const pageImpressions = BigInt(row.impressions);
      const keywordImpressions = keywordImpressionsByDate.get(String(row.date)) ?? ZERO;
      // With zero page impressions the share is defined as 0 (avoids dividing by zero).
      let anonymizedShare = 0;
      if (pageImpressions !== ZERO) {
        anonymizedShare = 1 - Number(keywordImpressions) / Number(pageImpressions);
      }
      return {
        date: row.date,
        clicks: Number(row.clicks),
        impressions: Number(row.impressions),
        sum_position: Number(row.sum_position),
        anonymizedImpressionsPct: Math.max(0, Math.min(1, anonymizedShare))
      };
    };

    return pageTotals.rows.map(toDailyEntry);
  }
};
|
|
128
|
+
/**
 * Rollup `weekly_totals`: sums of clicks, impressions and sum_position from the
 * `pages` table, grouped by `date_trunc('week', date)` and ordered by week.
 */
const weeklyTotalsRollup = {
  id: "weekly_totals",
  windowDays: null,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Forwarded unchanged to `engine.runSQL`.
   * @returns {Promise<object[]>} `{ week, clicks, impressions, sum_position }` per row, numeric fields coerced with `Number`.
   */
  async build({ engine, ctx }) {
    const sql = `
SELECT
strftime(date_trunc('week', date::DATE), '%Y-%m-%d') AS week,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
GROUP BY 1
ORDER BY 1
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { FILES: { table: "pages" } },
      sql
    });
    const weeks = [];
    for (const row of rows) {
      weeks.push({
        week: row.week,
        clicks: Number(row.clicks),
        impressions: Number(row.impressions),
        sum_position: Number(row.sum_position)
      });
    }
    return weeks;
  }
};
|
|
154
|
+
/**
 * Rollup `top_pages_28d`: the 1000 URLs with the most clicks among `pages`
 * rows dated on or after the UTC date 28 days before `builtAt`.
 */
const topPages28dRollup = {
  id: "top_pages_28d",
  windowDays: 28,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Forwarded unchanged to `engine.runSQL`.
   * @param {number} args.builtAt - Build timestamp the 28-day cutoff is measured from.
   * @returns {Promise<object[]>} `{ url, clicks, impressions, sum_position }` per row.
   */
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const { rows } = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { FILES: { table: "pages" } },
      sql: `
SELECT
url,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY url
ORDER BY clicks DESC
LIMIT 1000
`
    });
    const toEntry = (row) => ({
      url: row.url,
      clicks: Number(row.clicks),
      impressions: Number(row.impressions),
      sum_position: Number(row.sum_position)
    });
    return rows.map((row) => toEntry(row));
  }
};
|
|
183
|
+
/**
 * Rollup `top_countries_28d`: the 250 countries with the most clicks among
 * `countries` rows dated on or after the UTC date 28 days before `builtAt`.
 */
const topCountries28dRollup = {
  id: "top_countries_28d",
  windowDays: 28,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Forwarded unchanged to `engine.runSQL`.
   * @param {number} args.builtAt - Build timestamp the 28-day cutoff is measured from.
   * @returns {Promise<object[]>} `{ country, clicks, impressions, sum_position }` per row.
   */
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const { rows } = await engine.runSQL({
      ctx,
      table: "countries",
      fileSets: { FILES: { table: "countries" } },
      sql: `
SELECT
country,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY country
ORDER BY clicks DESC
LIMIT 250
`
    });
    const entries = [];
    for (const row of rows) {
      entries.push({
        country: row.country,
        clicks: Number(row.clicks),
        impressions: Number(row.impressions),
        sum_position: Number(row.sum_position)
      });
    }
    return entries;
  }
};
|
|
212
|
+
/**
 * Rollup `top_keywords_28d`: the 1000 queries with the most clicks among
 * `keywords` rows dated on or after the UTC date 28 days before `builtAt`.
 * Numeric columns are returned as JS numbers (JSON-serialisable).
 */
const topKeywords28dRollup = {
  id: "top_keywords_28d",
  windowDays: 28,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Forwarded unchanged to `engine.runSQL`.
   * @param {number} args.builtAt - Build timestamp the 28-day cutoff is measured from.
   * @returns {Promise<object[]>} `{ query, clicks, impressions, sum_position }` per row.
   */
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const sql = `
SELECT
query,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY query
ORDER BY clicks DESC
LIMIT 1000
`;
    const result = await engine.runSQL({
      ctx,
      table: "keywords",
      fileSets: { FILES: { table: "keywords" } },
      sql
    });
    return result.rows.map((row) => {
      const clicks = Number(row.clicks);
      const impressions = Number(row.impressions);
      const sum_position = Number(row.sum_position);
      return { query: row.query, clicks, impressions, sum_position };
    });
  }
};
|
|
241
|
+
/**
 * Rollup `top_keywords_28d_parquet`: same query as the JSON top-keywords
 * rollup, but declared with `format: "parquet"` and an explicit column schema.
 * `clicks` and `impressions` are returned as BigInt (matching the BIGINT
 * columns), `query` as a string and `sum_position` as a number.
 */
const topKeywords28dParquetRollup = {
  id: "top_keywords_28d_parquet",
  windowDays: 28,
  format: "parquet",
  parquetColumns: [
    { name: "query", type: "VARCHAR", nullable: false },
    { name: "clicks", type: "BIGINT", nullable: false },
    { name: "impressions", type: "BIGINT", nullable: false },
    { name: "sum_position", type: "DOUBLE", nullable: false }
  ],
  parquetSortKey: ["clicks"],
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Forwarded unchanged to `engine.runSQL`.
   * @param {number} args.builtAt - Build timestamp the 28-day cutoff is measured from.
   * @returns {Promise<object[]>} Rows shaped to match `parquetColumns`.
   */
  async build({ engine, ctx, builtAt }) {
    const cutoff = utcDateMinusDays(builtAt, 28);
    const { rows } = await engine.runSQL({
      ctx,
      table: "keywords",
      fileSets: { FILES: { table: "keywords" } },
      sql: `
SELECT
query,
SUM(clicks)::BIGINT AS clicks,
SUM(impressions)::BIGINT AS impressions,
SUM(sum_position)::DOUBLE AS sum_position
FROM read_parquet({{FILES}}, union_by_name = true)
WHERE date >= '${cutoff}'
GROUP BY query
ORDER BY clicks DESC
LIMIT 1000
`
    });
    const typedRows = [];
    for (const row of rows) {
      typedRows.push({
        query: String(row.query),
        clicks: BigInt(row.clicks),
        impressions: BigInt(row.impressions),
        sum_position: Number(row.sum_position)
      });
    }
    return typedRows;
  }
};
|
|
294
|
+
/**
 * Rollup `indexing_metadata`: summarises the indexing-metadata index into
 * overall totals and per-day counts. A record counts as an "update" when it
 * has a truthy `latestUpdateAt`, and as a "remove" when it has a truthy
 * `latestRemoveAt`; the day is the first 10 characters of that timestamp.
 */
const indexingMetadataRollup = {
  id: "indexing_metadata",
  windowDays: null,
  /**
   * @param {object} args
   * @param {object} args.dataSource - Handed to `createIndexingMetadataStore`.
   * @param {object} args.ctx - Passed to the store's `loadIndex`.
   * @returns {Promise<{ totals: object, days: object[] }>} Totals plus
   *   `{ day, updates, removes }` entries sorted by day.
   */
  async build({ dataSource, ctx }) {
    const index = await createIndexingMetadataStore({ dataSource }).loadIndex(ctx);
    const records = Object.values(index.records);

    const newTally = () => ({ total: 0, latest: undefined, byDay: new Map() });
    const updates = newTally();
    const removes = newTally();

    // Counts one timestamp into a tally; falsy timestamps are ignored.
    const count = (tally, timestamp) => {
      if (!timestamp) return;
      tally.total++;
      const day = timestamp.slice(0, 10);
      tally.byDay.set(day, (tally.byDay.get(day) ?? 0) + 1);
      // Timestamps are compared as strings, as in the per-day bucketing above.
      if (!tally.latest || timestamp > tally.latest) tally.latest = timestamp;
    };

    for (const record of records) {
      count(updates, record.latestUpdateAt);
      count(removes, record.latestRemoveAt);
    }

    const allDays = [...new Set([...updates.byDay.keys(), ...removes.byDay.keys()])].sort();
    return {
      totals: {
        urls: records.length,
        updates: updates.total,
        removes: removes.total,
        latestUpdateAt: updates.latest ?? null,
        latestRemoveAt: removes.latest ?? null
      },
      days: allDays.map((day) => ({
        day,
        updates: updates.byDay.get(day) ?? 0,
        removes: removes.byDay.get(day) ?? 0
      }))
    };
  }
};
|
|
338
|
+
/**
 * Quote a string as a SQL single-quoted literal, doubling any embedded
 * single quotes.
 *
 * @param {string} s - Raw text.
 * @returns {string} `s` wrapped in single quotes with every `'` replaced by `''`.
 */
function sqlString(s) {
  const escaped = s.replaceAll("'", "''");
  return `'${escaped}'`;
}
|
|
341
|
+
/**
 * Rollup `indexing_health`: per-day counts over the inspection Parquet file
 * for rows whose `inspectedAt` date is on or after the UTC date 90 days before
 * `builtAt`. Resolves to `{ days: [] }` when the inspection store reports no
 * Parquet URI.
 */
const indexingHealthRollup = {
  id: "indexing_health",
  windowDays: 90,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Passed to the inspection store and to `engine.runSQL`.
   * @param {object} args.dataSource - Handed to `createInspectionStore`.
   * @param {number} args.builtAt - Build timestamp the 90-day cutoff is measured from.
   * @returns {Promise<{ days: object[] }>}
   */
  async build({ engine, ctx, dataSource, builtAt }) {
    const uri = createInspectionStore({ dataSource }).parquetUri(ctx);
    if (!uri) return { days: [] };

    const cutoff = utcDateMinusDays(builtAt, 90);
    // The file is referenced directly by its (quoted) URI, so no fileSets are declared.
    const sql = `
SELECT
substr(inspectedAt, 1, 10) AS date,
COUNT(*)::BIGINT AS total_urls,
SUM(CASE WHEN indexStatus = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
SUM(CASE WHEN pageFetchState = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
SUM(CASE WHEN pageFetchState = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
SUM(CASE WHEN pageFetchState = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
SUM(CASE WHEN mobileUsabilityVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
SUM(CASE WHEN richResultsVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND userCanonical <> googleCanonical THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
FROM read_parquet(${sqlString(uri)})
WHERE substr(inspectedAt, 1, 10) >= '${cutoff}'
GROUP BY 1
ORDER BY 1
`;
    const { rows } = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: {},
      sql
    });

    // Count columns, in the order they appear in each output entry.
    const countColumns = [
      "total_urls",
      "indexed_count",
      "soft_404",
      "redirect",
      "not_found",
      "mobile_passes",
      "rich_results_passes",
      "canonical_mismatches"
    ];
    const days = rows.map((row) => {
      const entry = { date: String(row.date) };
      for (const column of countColumns) entry[column] = Number(row[column]);
      return entry;
    });
    return { days };
  }
};
|
|
382
|
+
/**
 * Rollup `index_percent`: for each date in the last 90 days, the number of
 * distinct page URLs with clicks that also appear as non-removed `loc` values
 * in the sitemap URL Parquet file, and its ratio to the total count of
 * non-removed sitemap URLs. Resolves to an empty result when the sitemap
 * store reports no URL Parquet URI.
 */
const indexPercentRollup = {
  id: "index_percent",
  windowDays: 90,
  /**
   * @param {object} args
   * @param {object} args.engine - Must expose `runSQL(request)` resolving to `{ rows }`.
   * @param {object} args.ctx - Passed to the sitemap store and to `engine.runSQL`.
   * @param {object} args.dataSource - Handed to `createSitemapStore`.
   * @param {number} args.builtAt - Build timestamp the 90-day cutoff is measured from.
   * @returns {Promise<{ totalSitemapUrls: number, days: object[] }>}
   */
  async build({ engine, ctx, dataSource, builtAt }) {
    const urlsUri = createSitemapStore({ dataSource }).urlsParquetUri(ctx);
    if (!urlsUri) {
      return { totalSitemapUrls: 0, days: [] };
    }

    const cutoff = utcDateMinusDays(builtAt, 90);
    const clickedPerDay = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: { PAGES: { table: "pages" } },
      sql: `
SELECT
p.date AS date,
COUNT(DISTINCT p.url)::BIGINT AS clicked_urls
FROM read_parquet({{PAGES}}, union_by_name = true) p
INNER JOIN read_parquet(${sqlString(urlsUri)}) s
ON s.loc = p.url AND s.removed_at IS NULL
WHERE p.clicks > 0 AND p.date >= '${cutoff}'
GROUP BY p.date
ORDER BY p.date
`
    });
    const sitemapTotal = await engine.runSQL({
      ctx,
      table: "pages",
      fileSets: {},
      sql: `
SELECT COUNT(*)::BIGINT AS total
FROM read_parquet(${sqlString(urlsUri)})
WHERE removed_at IS NULL
`
    });

    const total = Number(sitemapTotal.rows[0]?.total ?? 0);
    const days = [];
    for (const row of clickedPerDay.rows) {
      const clicked = Number(row.clicked_urls);
      // An empty sitemap yields ratio 0 instead of dividing by zero.
      const ratio = total === 0 ? 0 : clicked / total;
      days.push({
        date: String(row.date),
        clicked_urls: clicked,
        total_sitemap_urls: total,
        ratio
      });
    }
    return { totalSitemapUrls: total, days };
  }
};
|
|
433
|
+
/**
 * Rollup `sitemap_health`: aggregates sitemap index records from the last 90
 * days into per-day buckets (feed count, URL count, errors, warnings) and a
 * flat list of the feeds that were counted. A record's day is the first 10
 * characters of `capturedAt`, falling back to `lastDownloaded`; records with
 * neither, or with a day before the cutoff, are skipped.
 */
const sitemapHealthRollup = {
  id: "sitemap_health",
  windowDays: 90,
  /**
   * @param {object} args
   * @param {object} args.dataSource - Handed to `createSitemapStore`.
   * @param {object} args.ctx - Passed to the store's `loadIndex`.
   * @param {number} args.builtAt - Build timestamp the 90-day cutoff is measured from.
   * @returns {Promise<{ days: object[], feeds: object[] }>} `days` sorted ascending by day.
   */
  async build({ dataSource, ctx, builtAt }) {
    const index = await createSitemapStore({ dataSource }).loadIndex(ctx);
    const records = Object.values(index.records);
    const cutoff = utcDateMinusDays(builtAt, 90);

    const buckets = new Map();
    const feeds = [];
    for (const record of records) {
      const stamp = record.capturedAt ?? record.lastDownloaded ?? "";
      const day = stamp.slice(0, 10);
      const inWindow = Boolean(day) && !(day < cutoff);
      if (!inWindow) continue;

      const errors = Number(record.errors ?? 0);
      const warnings = Number(record.warnings ?? 0);
      const urlCount = Number(record.urlCount ?? 0);

      let bucket = buckets.get(day);
      if (bucket == null) {
        bucket = { day, feeds: 0, total_urls: 0, errors: 0, warnings: 0 };
        buckets.set(day, bucket);
      }
      bucket.feeds += 1;
      bucket.total_urls += urlCount;
      bucket.errors += errors;
      bucket.warnings += warnings;

      feeds.push({
        path: record.path,
        urlCount,
        errors,
        warnings,
        contentHash: record.contentHash ?? null,
        lastDownloaded: record.lastDownloaded ?? null,
        capturedAt: record.capturedAt
      });
    }

    // Days are unique Map keys, so the comparator never sees two equal days.
    const days = [...buckets.values()].sort((a, b) => (a.day < b.day ? -1 : 1));
    return { days, feeds };
  }
};
|
|
476
|
+
/**
 * Rollup `sitemap_changes_28d`: walks the sitemap deltas between 28 days
 * before `builtAt` and the date of `builtAt`, counting added/removed URLs per
 * (day, feedpath) and keeping the 200 most recent added and removed entries.
 * Any delta whose `op` is not `"added"` is counted as removed.
 */
const sitemapChanges28dRollup = {
  id: "sitemap_changes_28d",
  windowDays: 28,
  /**
   * @param {object} args
   * @param {object} args.dataSource - Handed to `createSitemapStore`.
   * @param {object} args.ctx - Passed to the store's `loadDeltas`.
   * @param {number} args.builtAt - Build timestamp the window is measured from.
   * @returns {Promise<{ days: object[], topAdded: object[], topRemoved: object[] }>}
   */
  async build({ dataSource, ctx, builtAt }) {
    const store = createSitemapStore({ dataSource });
    const from = utcDateMinusDays(builtAt, 28);
    const to = utcDateMinusDays(builtAt, 0);

    const tallies = new Map();
    const added = [];
    const removed = [];
    // NUL separates the two key parts.
    const tallyKey = (day, feedpath) => `${day}\x00${feedpath}`;

    for await (const delta of store.loadDeltas(ctx, { from, to })) {
      const day = new Date(delta.at).toISOString().slice(0, 10);
      const mapKey = tallyKey(day, delta.feedpath);

      let tally = tallies.get(mapKey);
      if (tally == null) {
        tally = { day, feedpath: delta.feedpath, added: 0, removed: 0 };
        tallies.set(mapKey, tally);
      }

      const entry = { loc: delta.loc, feedpath: delta.feedpath, at: delta.at };
      if (delta.op === "added") {
        tally.added += 1;
        added.push(entry);
      } else {
        tally.removed += 1;
        removed.push(entry);
      }
    }

    // (day, feedpath) pairs are unique, so the comparator never needs to return 0.
    const days = [...tallies.values()].sort((a, b) => {
      if (a.day !== b.day) return a.day < b.day ? -1 : 1;
      return a.feedpath < b.feedpath ? -1 : 1;
    });

    // Newest first; assumes `at` is numeric — TODO confirm against the delta type.
    const newestFirst = (a, b) => b.at - a.at;
    added.sort(newestFirst);
    removed.sort(newestFirst);

    return {
      days,
      topAdded: added.slice(0, 200),
      topRemoved: removed.slice(0, 200)
    };
  }
};
|
|
534
|
+
/**
 * The rollup definitions bundled as the default set, in the order listed.
 * Note that `topKeywords28dParquetRollup` is exported by this module but is
 * not part of this list.
 */
const DEFAULT_ROLLUPS = [
  dailyTotalsRollup,
  weeklyTotalsRollup,
  topPages28dRollup,
  topKeywords28dRollup,
  topCountries28dRollup,
  indexingMetadataRollup,
  indexingHealthRollup,
  indexPercentRollup,
  sitemapHealthRollup,
  sitemapChanges28dRollup
];
|
|
546
|
+
export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Persisted scheduling state for one recurring check.
 */
interface ScheduleState {
  /** Timestamp from which the check is due (`isDue` is `now >= nextAt`); presumably epoch milliseconds — the bundled policies add millisecond intervals to it. */
  nextAt: number;
  /** Number of observations in a row that reported no change; reset to 0 when a change is observed. */
  consecutiveUnchanged: number;
  /** Version of the policy that produced this state; the bundled policies reset the state when it differs from their own version. */
  policyVersion: number;
}
|
|
6
|
+
/**
 * Strategy that decides when a recurring check should next run.
 */
interface SchedulePolicy {
  /** Version stamped into every state this policy produces. */
  readonly version: number;
  /** Create the first state for a check, relative to `now`. */
  initial: (now: number) => ScheduleState;
  /**
   * Derive the next state from the previous one after a check ran at
   * `evt.at`, where `evt.changed` reports whether the check saw a change.
   */
  observe: (prev: ScheduleState, evt: {
    changed: boolean;
    at: number;
  }) => ScheduleState;
  /** Whether the check described by `state` should run at time `now`. */
  isDue: (state: ScheduleState, now: number) => boolean;
}
|
|
15
|
+
/** Policy for sitemap checks: the interval grows as consecutive unchanged observations accumulate. */
declare const sitemapPolicy: SchedulePolicy;
/** Inspection verdict used to pick an inspection cadence. */
type InspectionVerdict = 'PASS' | 'FAIL' | 'NEUTRAL';
/** Create a policy whose fixed interval is chosen from the given inspection verdict. */
declare function inspectionPolicy(verdict: InspectionVerdict): SchedulePolicy;
/** Create a policy that always schedules the next run `intervalMs` after the observation. */
declare function fixedPolicy(intervalMs: number): SchedulePolicy;
|
|
19
|
+
export { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy };
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
/** Length of one day in milliseconds (24 h × 3600 s × 1000 ms). */
const DAY = 24 * (3600 * 1e3);
|
|
2
|
+
/**
 * Report whether a scheduled check is due.
 *
 * @param {{ nextAt: number }} state - Schedule state.
 * @param {number} now - Current time, in the same unit as `state.nextAt`.
 * @returns {boolean} `true` once `now` has reached or passed `state.nextAt`.
 */
function isDue(state, now) {
  const dueAt = state.nextAt;
  return dueAt <= now;
}
|
|
5
|
+
/**
 * Interval between sitemap checks, based on how many checks in a row saw no change.
 *
 * @param {number} consecutiveUnchanged - Count of consecutive unchanged observations.
 * @returns {number} 30 days at 7 or more, 7 days at 3 or more, otherwise 1 day (in ms).
 */
function sitemapCadenceMs(consecutiveUnchanged) {
  let days = 1;
  if (consecutiveUnchanged >= 3) days = 7;
  if (consecutiveUnchanged >= 7) days = 30;
  return days * DAY;
}
|
|
10
|
+
/** Version stamped into every state produced by `sitemapPolicy`. */
const SITEMAP_VERSION = 1;

/**
 * Schedule policy for sitemap checks. The first check is due one day after
 * creation. A change, or a state written by a different policy version,
 * resets the unchanged counter; each unchanged observation increments it and
 * the next interval is taken from `sitemapCadenceMs`.
 */
const sitemapPolicy = {
  version: SITEMAP_VERSION,
  /**
   * @param {number} now - Creation time.
   * @returns {{ nextAt: number, consecutiveUnchanged: number, policyVersion: number }}
   */
  initial(now) {
    return {
      nextAt: now + DAY,
      consecutiveUnchanged: 0,
      policyVersion: SITEMAP_VERSION
    };
  },
  /**
   * @param {{ consecutiveUnchanged: number, policyVersion: number }} prev - Previous state.
   * @param {{ changed: boolean, at: number }} evt - Outcome and time of the check.
   * @returns {{ nextAt: number, consecutiveUnchanged: number, policyVersion: number }}
   */
  observe(prev, evt) {
    // Both reset cases end up at counter 0, whose cadence is exactly one DAY.
    const startOver = prev.policyVersion !== SITEMAP_VERSION || evt.changed;
    const consecutiveUnchanged = startOver ? 0 : prev.consecutiveUnchanged + 1;
    return {
      nextAt: evt.at + sitemapCadenceMs(consecutiveUnchanged),
      consecutiveUnchanged,
      policyVersion: SITEMAP_VERSION
    };
  },
  isDue
};
|
|
40
|
+
/** Version stamped into every state produced by `inspectionPolicy`. */
const INSPECTION_VERSION = 1;

/**
 * Interval between inspections for a given verdict.
 *
 * @param {string} verdict - Inspection verdict.
 * @returns {number} 30 days for `"PASS"`, 7 days for `"FAIL"`, 14 days for anything else (in ms).
 */
function inspectionCadenceMs(verdict) {
  switch (verdict) {
    case "PASS":
      return 30 * DAY;
    case "FAIL":
      return 7 * DAY;
    default:
      return 14 * DAY;
  }
}
|
|
46
|
+
/**
 * Create a schedule policy whose interval is fixed by the inspection verdict.
 * The interval never varies with the unchanged counter; the counter is only
 * tracked in the state.
 *
 * @param {string} verdict - Inspection verdict used to look up the cadence.
 * @returns {{ version: number, initial: Function, observe: Function, isDue: Function }}
 */
function inspectionPolicy(verdict) {
  const cadence = inspectionCadenceMs(verdict);
  // Every state this policy emits has the same shape and the same interval.
  const stateFrom = (at, consecutiveUnchanged) => ({
    nextAt: at + cadence,
    consecutiveUnchanged,
    policyVersion: INSPECTION_VERSION
  });
  return {
    version: INSPECTION_VERSION,
    initial(now) {
      return stateFrom(now, 0);
    },
    observe(prev, evt) {
      // A state from another policy version is treated like a change: counter restarts.
      const startOver = prev.policyVersion !== INSPECTION_VERSION || evt.changed;
      return stateFrom(evt.at, startOver ? 0 : prev.consecutiveUnchanged + 1);
    },
    isDue
  };
}
|
|
73
|
+
/** Version stamped into every state produced by `fixedPolicy`. */
const FIXED_VERSION = 1;

/**
 * Create a schedule policy that always schedules the next run `intervalMs`
 * after the reference time, regardless of whether anything changed. The
 * unchanged counter is still tracked in the state.
 *
 * @param {number} intervalMs - Interval added to the reference time.
 * @returns {{ version: number, initial: Function, observe: Function, isDue: Function }}
 */
function fixedPolicy(intervalMs) {
  const stateFrom = (at, consecutiveUnchanged) => ({
    nextAt: at + intervalMs,
    consecutiveUnchanged,
    policyVersion: FIXED_VERSION
  });
  return {
    version: FIXED_VERSION,
    initial(now) {
      return stateFrom(now, 0);
    },
    observe(prev, evt) {
      // A state from another policy version is treated like a change: counter restarts.
      const startOver = prev.policyVersion !== FIXED_VERSION || evt.changed;
      return stateFrom(evt.at, startOver ? 0 : prev.consecutiveUnchanged + 1);
    },
    isDue
  };
}
|
|
100
|
+
export { fixedPolicy, inspectionPolicy, sitemapPolicy };
|
package/dist/snapshot.d.mts
CHANGED
|
@@ -1,14 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
* Describes a hot/cold snapshot set. Produced by the snapshot builder,
|
|
3
|
-
* consumed by `attachSnapshotIndex`. Filenames are derived from `cold`
|
|
4
|
-
* via `cold-${yearMonth}.duckdb`; hot is always `hot.duckdb` when
|
|
5
|
-
* `hot: true`.
|
|
6
|
-
*/
|
|
7
|
-
interface SnapshotIndex {
|
|
8
|
-
version: 1;
|
|
9
|
-
builtAt: string;
|
|
10
|
-
cold: string[];
|
|
11
|
-
hot: boolean;
|
|
12
|
-
hotDays: number;
|
|
13
|
-
}
|
|
1
|
+
import { t as SnapshotIndex } from "./_chunks/snapshot.mjs";
|
|
14
2
|
export { SnapshotIndex };
|