@gscdump/engine 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import { a as DataSource } from "./_chunks/storage.mjs";
2
+ import { ScheduleState } from "./schedule.mjs";
2
3
  import { TenantCtx } from "gscdump/contracts";
3
4
  /**
4
5
  * GSC URL inspection result fields we persist. Mirrors the
@@ -28,8 +29,17 @@ interface InspectionRecord {
28
29
  * Free-form payload for fields we don't promote to first-class columns
29
30
  * (e.g. `referringUrls`, `crawledAs`). Keeps the wire format forward-compat
30
31
  * without bumping the schema for every API addition.
32
+ *
33
+ * Recognised keys:
34
+ * - `schedule`: optional `ScheduleState` from {@link inspectionPolicy}
35
+ * governing when this URL is next due for re-inspection. Undefined on
36
+ * pre-§0 records — readers must tolerate the missing field and fall
37
+ * back to default policy on first observe.
31
38
  */
32
- raw?: unknown;
39
+ raw?: {
40
+ schedule?: ScheduleState;
41
+ [key: string]: unknown;
42
+ };
33
43
  }
34
44
  /** Wire shape persisted to disk/R2. */
35
45
  interface InspectionIndex {
@@ -44,6 +54,7 @@ interface InspectionHistoryShard {
44
54
  }
45
55
  declare function inspectionIndexKey(ctx: TenantCtx): string;
46
56
  declare function emptyTypesKey(ctx: TenantCtx): string;
57
+ declare function inspectionParquetKey(ctx: TenantCtx): string;
47
58
  declare function inspectionHistoryKey(ctx: TenantCtx, yearMonth: string): string;
48
59
  /**
49
60
  * Stable URL hash used as the index key. Short, URL-safe, deterministic.
@@ -66,6 +77,34 @@ interface InspectionStore {
66
77
  loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
67
78
  /** Read the per-month history shard if it exists. */
68
79
  loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
80
+ /**
81
+ * Snapshot the current JSON index to a parquet sidecar at
82
+ * `entities/inspections/index.parquet`. One PUT. Sorted by `urlHash` so
83
+ * DuckDB row-group stats can prune URL-keyed JOINs efficiently.
84
+ *
85
+ * Internal seam: callers don't choose JSON-vs-parquet — the store materialises
86
+ * the parquet at end-of-batch (e.g. after `indexing/complete`) and readers
87
+ * pick the format that matches their access pattern (parquet for JOINs,
88
+ * JSON for full-index scans / point lookups).
89
+ *
90
+ * Returns the parquet object key (matches {@link parquetUri} after write).
91
+ */
92
+ materialize: (ctx: TenantCtx) => Promise<{
93
+ key: string;
94
+ rowCount: number;
95
+ bytes: number;
96
+ }>;
97
+ /**
98
+ * DuckDB-resolvable URI for the materialised parquet sidecar, or
99
+ * `undefined` if the underlying `DataSource` has no native URI shape
100
+ * (in-memory tests). When defined, read paths can `read_parquet(<uri>)`
101
+ * directly without staging bytes through JS.
102
+ *
103
+ * Does not check existence — caller is responsible for ensuring
104
+ * `materialize` has run at least once. Returning a URI for a missing key
105
+ * is safe; DuckDB will surface a 404 / not-found at query time.
106
+ */
107
+ parquetUri: (ctx: TenantCtx) => string | undefined;
69
108
  }
70
109
  interface CreateInspectionStoreOptions {
71
110
  dataSource: DataSource;
@@ -100,6 +139,12 @@ interface SitemapRecord {
100
139
  }>;
101
140
  /** Raw payload for fields we don't promote to first-class columns. */
102
141
  raw?: unknown;
142
+ /** Number of URLs observed in this feedpath at last snapshot. */
143
+ urlCount?: number;
144
+ /** Stable hash of the sorted normalized loc list at last snapshot. */
145
+ contentHash?: string;
146
+ /** Adaptive cadence state owned by `sitemapPolicy`. */
147
+ schedule?: ScheduleState;
103
148
  }
104
149
  interface SitemapIndex {
105
150
  version: 1;
@@ -114,6 +159,57 @@ interface SitemapHistoryDoc {
114
159
  }
115
160
  declare function sitemapIndexKey(ctx: TenantCtx): string;
116
161
  declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
162
+ declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
163
+ declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
164
+ /** Parsed URL entry from a sitemap XML. */
165
+ interface ParsedUrl {
166
+ loc: string;
167
+ /** ISO-8601 lastmod from the sitemap, if present. */
168
+ lastmod?: string;
169
+ }
170
+ /** A single URL row in the urls/index.parquet partition. */
171
+ interface SitemapUrlRecord {
172
+ feedpath: string;
173
+ feedpathHash: string;
174
+ urlHash: string;
175
+ loc: string;
176
+ lastmod?: string;
177
+ firstSeenAt: number;
178
+ lastSeenAt: number;
179
+ /** Set when the URL has been removed. Null/undefined = currently live. */
180
+ removedAt?: number;
181
+ }
182
+ interface SnapshotUrlsResult {
183
+ added: number;
184
+ removed: number;
185
+ kept: number;
186
+ contentHash: string;
187
+ /** True when contentHash matched prior; the call performed zero writes. */
188
+ unchanged: boolean;
189
+ }
190
+ interface DeltaEntry {
191
+ feedpath: string;
192
+ feedpathHash: string;
193
+ urlHash: string;
194
+ op: 'added' | 'removed';
195
+ loc: string;
196
+ lastmod?: string;
197
+ at: number;
198
+ }
199
+ interface DateRange {
200
+ /** YYYY-MM-DD inclusive. */
201
+ from?: string;
202
+ /** YYYY-MM-DD inclusive. */
203
+ to?: string;
204
+ }
205
+ interface LoadUrlsOptions {
206
+ includeRemoved?: boolean;
207
+ }
208
+ /**
209
+ * Hash a URL list for change detection. Sorts then folds via FNV-1a so it's
210
+ * deterministic, locale-free, and cheap on Workers.
211
+ */
212
+ declare function hashUrlList(urls: readonly ParsedUrl[]): string;
117
213
  interface SitemapStore {
118
214
  /**
119
215
  * Persist a snapshot run. Updates the index + writes one immutable
@@ -124,6 +220,24 @@ interface SitemapStore {
124
220
  loadIndex: (ctx: TenantCtx) => Promise<SitemapIndex>;
125
221
  /** Fetch the latest snapshot for a feedpath, or undefined. */
126
222
  getLatest: (ctx: TenantCtx, path: string) => Promise<SitemapRecord | undefined>;
223
+ /**
224
+ * Diff incoming URLs against the prior `urls/index.parquet` partition for
225
+ * `feedpath`; on change, writes a single delta parquet under
226
+ * `urls/deltas/YYYY-MM-DD__{feedpathHash}.parquet`. Skipped (0 PUTs) when
227
+ * `contentHash` matches prior.
228
+ */
229
+ snapshotUrls: (ctx: TenantCtx, feedpath: string, urls: readonly ParsedUrl[]) => Promise<SnapshotUrlsResult>;
230
+ /** Stream live (and optionally removed) URL rows for a feedpath. */
231
+ loadUrls: (ctx: TenantCtx, feedpath: string, opts?: LoadUrlsOptions) => AsyncIterable<SitemapUrlRecord>;
232
+ /** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
233
+ loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
234
+ /**
235
+ * Fold every accumulated delta into the prior index; writes a fresh
236
+ * `urls/index.parquet` and deletes the consumed delta files.
237
+ */
238
+ compactUrls: (ctx: TenantCtx) => Promise<void>;
239
+ /** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
240
+ urlsParquetUri: (ctx: TenantCtx) => string | undefined;
127
241
  }
128
242
  interface CreateSitemapStoreOptions {
129
243
  dataSource: DataSource;
@@ -175,4 +289,4 @@ interface CreateEmptyTypesStoreOptions {
175
289
  now?: () => number;
176
290
  }
177
291
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
178
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
292
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
package/dist/entities.mjs CHANGED
@@ -1,3 +1,4 @@
1
+ import { decodeParquetToRows, encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
1
2
  const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
2
3
  function inspectionIndexKey(ctx) {
3
4
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
@@ -5,6 +6,9 @@ function inspectionIndexKey(ctx) {
5
6
  function emptyTypesKey(ctx) {
6
7
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/empty-types.json` : `u_${ctx.userId}/entities/empty-types.json`;
7
8
  }
9
+ function inspectionParquetKey(ctx) {
10
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet` : `u_${ctx.userId}/entities/inspections/index.parquet`;
11
+ }
8
12
  function inspectionHistoryKey(ctx, yearMonth) {
9
13
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}.json` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}.json`;
10
14
  }
@@ -75,15 +79,251 @@ function createInspectionStore(opts) {
75
79
  },
76
80
  async loadHistory(ctx, yearMonth) {
77
81
  return await readJson(inspectionHistoryKey(ctx, yearMonth));
82
+ },
83
+ async materialize(ctx) {
84
+ const index = await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
85
+ const rows = Object.entries(index.records).map(([urlHash, r]) => ({
86
+ urlHash,
87
+ url: r.url,
88
+ inspectedAt: r.inspectedAt,
89
+ indexStatus: r.indexStatus ?? null,
90
+ lastCrawlTime: r.lastCrawlTime ?? null,
91
+ googleCanonical: r.googleCanonical ?? null,
92
+ userCanonical: r.userCanonical ?? null,
93
+ coverageState: r.coverageState ?? null,
94
+ robotsTxtState: r.robotsTxtState ?? null,
95
+ indexingState: r.indexingState ?? null,
96
+ pageFetchState: r.pageFetchState ?? null,
97
+ mobileUsabilityVerdict: r.mobileUsabilityVerdict ?? null,
98
+ richResultsVerdict: r.richResultsVerdict ?? null,
99
+ scheduleNextAt: r.raw?.schedule?.nextAt ?? null,
100
+ scheduleConsecutiveUnchanged: r.raw?.schedule?.consecutiveUnchanged ?? null,
101
+ schedulePolicyVersion: r.raw?.schedule?.policyVersion ?? null
102
+ }));
103
+ const bytes = encodeRowsToParquetFlex(rows, {
104
+ columns: INSPECTION_PARQUET_COLUMNS,
105
+ sortKey: ["urlHash"]
106
+ });
107
+ const key = inspectionParquetKey(ctx);
108
+ await ds.write(key, bytes);
109
+ return {
110
+ key,
111
+ rowCount: rows.length,
112
+ bytes: bytes.byteLength
113
+ };
114
+ },
115
+ parquetUri(ctx) {
116
+ return ds.uri?.(inspectionParquetKey(ctx));
78
117
  }
79
118
  };
80
119
  }
120
+ const INSPECTION_PARQUET_COLUMNS = [
121
+ {
122
+ name: "urlHash",
123
+ type: "VARCHAR",
124
+ nullable: false
125
+ },
126
+ {
127
+ name: "url",
128
+ type: "VARCHAR",
129
+ nullable: false
130
+ },
131
+ {
132
+ name: "inspectedAt",
133
+ type: "VARCHAR",
134
+ nullable: false
135
+ },
136
+ {
137
+ name: "indexStatus",
138
+ type: "VARCHAR",
139
+ nullable: true
140
+ },
141
+ {
142
+ name: "lastCrawlTime",
143
+ type: "VARCHAR",
144
+ nullable: true
145
+ },
146
+ {
147
+ name: "googleCanonical",
148
+ type: "VARCHAR",
149
+ nullable: true
150
+ },
151
+ {
152
+ name: "userCanonical",
153
+ type: "VARCHAR",
154
+ nullable: true
155
+ },
156
+ {
157
+ name: "coverageState",
158
+ type: "VARCHAR",
159
+ nullable: true
160
+ },
161
+ {
162
+ name: "robotsTxtState",
163
+ type: "VARCHAR",
164
+ nullable: true
165
+ },
166
+ {
167
+ name: "indexingState",
168
+ type: "VARCHAR",
169
+ nullable: true
170
+ },
171
+ {
172
+ name: "pageFetchState",
173
+ type: "VARCHAR",
174
+ nullable: true
175
+ },
176
+ {
177
+ name: "mobileUsabilityVerdict",
178
+ type: "VARCHAR",
179
+ nullable: true
180
+ },
181
+ {
182
+ name: "richResultsVerdict",
183
+ type: "VARCHAR",
184
+ nullable: true
185
+ },
186
+ {
187
+ name: "scheduleNextAt",
188
+ type: "BIGINT",
189
+ nullable: true
190
+ },
191
+ {
192
+ name: "scheduleConsecutiveUnchanged",
193
+ type: "INTEGER",
194
+ nullable: true
195
+ },
196
+ {
197
+ name: "schedulePolicyVersion",
198
+ type: "INTEGER",
199
+ nullable: true
200
+ }
201
+ ];
81
202
  function sitemapIndexKey(ctx) {
82
203
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/index.json` : `u_${ctx.userId}/entities/sitemaps/index.json`;
83
204
  }
84
205
  function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
85
206
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json` : `u_${ctx.userId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
86
207
  }
208
+ function sitemapUrlsPrefix(ctx) {
209
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/urls` : `u_${ctx.userId}/entities/sitemaps/urls`;
210
+ }
211
+ function sitemapUrlsIndexKey(ctx) {
212
+ return `${sitemapUrlsPrefix(ctx)}/index.parquet`;
213
+ }
214
+ function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
215
+ return `${sitemapUrlsPrefix(ctx)}/deltas/${date}__${feedpathHash}.parquet`;
216
+ }
217
+ const SITEMAP_URLS_DELTA_PREFIX_RE = /\/urls\/deltas\/(\d{4}-\d{2}-\d{2})__([0-9a-f]+)\.parquet$/;
218
+ const URLS_INDEX_COLUMNS = [
219
+ {
220
+ name: "feedpath",
221
+ type: "VARCHAR",
222
+ nullable: false
223
+ },
224
+ {
225
+ name: "feedpath_hash",
226
+ type: "VARCHAR",
227
+ nullable: false
228
+ },
229
+ {
230
+ name: "url_hash",
231
+ type: "VARCHAR",
232
+ nullable: false
233
+ },
234
+ {
235
+ name: "loc",
236
+ type: "VARCHAR",
237
+ nullable: false
238
+ },
239
+ {
240
+ name: "lastmod",
241
+ type: "VARCHAR",
242
+ nullable: true
243
+ },
244
+ {
245
+ name: "first_seen_at",
246
+ type: "BIGINT",
247
+ nullable: false
248
+ },
249
+ {
250
+ name: "last_seen_at",
251
+ type: "BIGINT",
252
+ nullable: false
253
+ },
254
+ {
255
+ name: "removed_at",
256
+ type: "BIGINT",
257
+ nullable: true
258
+ }
259
+ ];
260
+ const URLS_DELTA_COLUMNS = [
261
+ {
262
+ name: "feedpath",
263
+ type: "VARCHAR",
264
+ nullable: false
265
+ },
266
+ {
267
+ name: "feedpath_hash",
268
+ type: "VARCHAR",
269
+ nullable: false
270
+ },
271
+ {
272
+ name: "url_hash",
273
+ type: "VARCHAR",
274
+ nullable: false
275
+ },
276
+ {
277
+ name: "op",
278
+ type: "VARCHAR",
279
+ nullable: false
280
+ },
281
+ {
282
+ name: "loc",
283
+ type: "VARCHAR",
284
+ nullable: false
285
+ },
286
+ {
287
+ name: "lastmod",
288
+ type: "VARCHAR",
289
+ nullable: true
290
+ },
291
+ {
292
+ name: "at",
293
+ type: "BIGINT",
294
+ nullable: false
295
+ }
296
+ ];
297
+ function rowToUrlRecord(row) {
298
+ return {
299
+ feedpath: String(row.feedpath),
300
+ feedpathHash: String(row.feedpath_hash),
301
+ urlHash: String(row.url_hash),
302
+ loc: String(row.loc),
303
+ lastmod: row.lastmod == null ? void 0 : String(row.lastmod),
304
+ firstSeenAt: Number(row.first_seen_at),
305
+ lastSeenAt: Number(row.last_seen_at),
306
+ removedAt: row.removed_at == null ? void 0 : Number(row.removed_at)
307
+ };
308
+ }
309
+ function urlRecordToRow(r) {
310
+ return {
311
+ feedpath: r.feedpath,
312
+ feedpath_hash: r.feedpathHash,
313
+ url_hash: r.urlHash,
314
+ loc: r.loc,
315
+ lastmod: r.lastmod ?? null,
316
+ first_seen_at: r.firstSeenAt,
317
+ last_seen_at: r.lastSeenAt,
318
+ removed_at: r.removedAt ?? null
319
+ };
320
+ }
321
+ function isoDate(ms) {
322
+ return new Date(ms).toISOString().slice(0, 10);
323
+ }
324
+ function hashUrlList(urls) {
325
+ return hashUrl(urls.map((u) => u.loc).sort().join("\n"));
326
+ }
87
327
  function createSitemapStore(opts) {
88
328
  const ds = opts.dataSource;
89
329
  const hash = opts.hash ?? hashUrl;
@@ -123,6 +363,218 @@ function createSitemapStore(opts) {
123
363
  },
124
364
  async getLatest(ctx, path) {
125
365
  return (await readJson(sitemapIndexKey(ctx)))?.records[hash(path)];
366
+ },
367
+ async snapshotUrls(ctx, feedpath, urls) {
368
+ const fpHash = hash(feedpath);
369
+ const contentHash = hashUrlList(urls);
370
+ const at = now();
371
+ const priorByHash = /* @__PURE__ */ new Map();
372
+ for await (const rec of this.loadUrls(ctx, feedpath, { includeRemoved: true })) priorByHash.set(rec.urlHash, rec);
373
+ const livePrior = Array.from(priorByHash.values()).filter((r) => r.removedAt == null);
374
+ if (livePrior.length > 0) {
375
+ if (hashUrl(livePrior.map((r) => String(r.loc)).sort().join("\n")) === contentHash) return {
376
+ added: 0,
377
+ removed: 0,
378
+ kept: livePrior.length,
379
+ contentHash,
380
+ unchanged: true
381
+ };
382
+ }
383
+ const incomingByHash = /* @__PURE__ */ new Map();
384
+ for (const u of urls) incomingByHash.set(hash(u.loc), u);
385
+ const deltaRows = [];
386
+ let added = 0;
387
+ let removed = 0;
388
+ let kept = 0;
389
+ const date = isoDate(at);
390
+ for (const [urlHash, u] of incomingByHash) {
391
+ const prev = priorByHash.get(urlHash);
392
+ if (!prev || prev.removedAt != null) {
393
+ added++;
394
+ deltaRows.push({
395
+ feedpath,
396
+ feedpath_hash: fpHash,
397
+ url_hash: urlHash,
398
+ op: "added",
399
+ loc: u.loc,
400
+ lastmod: u.lastmod ?? null,
401
+ at
402
+ });
403
+ } else kept++;
404
+ }
405
+ for (const [urlHash, prev] of priorByHash) {
406
+ if (prev.removedAt != null) continue;
407
+ if (!incomingByHash.has(urlHash)) {
408
+ removed++;
409
+ deltaRows.push({
410
+ feedpath,
411
+ feedpath_hash: fpHash,
412
+ url_hash: urlHash,
413
+ op: "removed",
414
+ loc: prev.loc,
415
+ lastmod: prev.lastmod ?? null,
416
+ at
417
+ });
418
+ }
419
+ }
420
+ if (deltaRows.length > 0) {
421
+ const bytes = encodeRowsToParquetFlex(deltaRows, {
422
+ columns: URLS_DELTA_COLUMNS,
423
+ sortKey: ["url_hash"]
424
+ });
425
+ await ds.write(sitemapUrlsDeltaKey(ctx, fpHash, date), bytes);
426
+ }
427
+ return {
428
+ added,
429
+ removed,
430
+ kept,
431
+ contentHash,
432
+ unchanged: false
433
+ };
434
+ },
435
+ async *loadUrls(ctx, feedpath, opts) {
436
+ const fpHash = hash(feedpath);
437
+ const includeRemoved = opts?.includeRemoved ?? false;
438
+ const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
439
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
440
+ const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
441
+ const live = /* @__PURE__ */ new Map();
442
+ const removedMap = /* @__PURE__ */ new Map();
443
+ for (const row of indexRows) {
444
+ if (row.feedpath_hash !== fpHash) continue;
445
+ const rec = rowToUrlRecord(row);
446
+ if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
447
+ else live.set(rec.urlHash, rec);
448
+ }
449
+ for (const key of deltaKeys) {
450
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
451
+ if (!m || m[2] !== fpHash) continue;
452
+ const dBytes = await ds.read(key).catch(() => void 0);
453
+ if (!dBytes) continue;
454
+ const dRows = await decodeParquetToRows(dBytes);
455
+ for (const r of dRows) {
456
+ const op = String(r.op);
457
+ const urlHash = String(r.url_hash);
458
+ const at = Number(r.at);
459
+ if (op === "added") {
460
+ const prev = live.get(urlHash) ?? removedMap.get(urlHash);
461
+ removedMap.delete(urlHash);
462
+ live.set(urlHash, {
463
+ feedpath,
464
+ feedpathHash: fpHash,
465
+ urlHash,
466
+ loc: String(r.loc),
467
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
468
+ firstSeenAt: prev?.firstSeenAt ?? at,
469
+ lastSeenAt: at
470
+ });
471
+ } else if (op === "removed") {
472
+ const prev = live.get(urlHash);
473
+ live.delete(urlHash);
474
+ if (prev) removedMap.set(urlHash, {
475
+ ...prev,
476
+ removedAt: at
477
+ });
478
+ }
479
+ }
480
+ }
481
+ for (const rec of live.values()) yield rec;
482
+ if (includeRemoved) for (const rec of removedMap.values()) yield rec;
483
+ },
484
+ async *loadDeltas(ctx, dateRange) {
485
+ const from = dateRange?.from;
486
+ const to = dateRange?.to;
487
+ const keys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
488
+ for (const key of keys) {
489
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
490
+ if (!m) continue;
491
+ const date = m[1];
492
+ if (from && date < from) continue;
493
+ if (to && date > to) continue;
494
+ const bytes = await ds.read(key).catch(() => void 0);
495
+ if (!bytes) continue;
496
+ const rows = await decodeParquetToRows(bytes);
497
+ for (const r of rows) {
498
+ const op = String(r.op);
499
+ if (op !== "added" && op !== "removed") continue;
500
+ yield {
501
+ feedpath: String(r.feedpath),
502
+ feedpathHash: String(r.feedpath_hash),
503
+ urlHash: String(r.url_hash),
504
+ op,
505
+ loc: String(r.loc),
506
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
507
+ at: Number(r.at)
508
+ };
509
+ }
510
+ }
511
+ },
512
+ async compactUrls(ctx) {
513
+ const indexKey = sitemapUrlsIndexKey(ctx);
514
+ const indexBytes = await ds.read(indexKey).catch(() => void 0);
515
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
516
+ const stateKey = (fp, u) => `${fp}::${u}`;
517
+ const live = /* @__PURE__ */ new Map();
518
+ const removed = /* @__PURE__ */ new Map();
519
+ for (const row of indexRows) {
520
+ const rec = rowToUrlRecord(row);
521
+ const k = stateKey(rec.feedpathHash, rec.urlHash);
522
+ if (rec.removedAt != null) removed.set(k, rec);
523
+ else live.set(k, rec);
524
+ }
525
+ const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
526
+ const consumed = [];
527
+ for (const key of deltaKeys) {
528
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
529
+ if (!m) continue;
530
+ const fpHash = m[2];
531
+ const bytes = await ds.read(key).catch(() => void 0);
532
+ if (!bytes) continue;
533
+ consumed.push(key);
534
+ const rows = await decodeParquetToRows(bytes);
535
+ for (const r of rows) {
536
+ const urlHash = String(r.url_hash);
537
+ const at = Number(r.at);
538
+ const k = stateKey(fpHash, urlHash);
539
+ const op = String(r.op);
540
+ if (op === "added") {
541
+ const prev = live.get(k) ?? removed.get(k);
542
+ removed.delete(k);
543
+ live.set(k, {
544
+ feedpath: String(r.feedpath),
545
+ feedpathHash: fpHash,
546
+ urlHash,
547
+ loc: String(r.loc),
548
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
549
+ firstSeenAt: prev?.firstSeenAt ?? at,
550
+ lastSeenAt: at
551
+ });
552
+ } else if (op === "removed") {
553
+ const prev = live.get(k);
554
+ live.delete(k);
555
+ if (prev) removed.set(k, {
556
+ ...prev,
557
+ removedAt: at
558
+ });
559
+ }
560
+ }
561
+ }
562
+ const merged = [...live.values(), ...removed.values()];
563
+ merged.sort((a, b) => {
564
+ if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
565
+ if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
566
+ return 0;
567
+ });
568
+ const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
569
+ columns: URLS_INDEX_COLUMNS,
570
+ sortKey: ["feedpath_hash", "url_hash"]
571
+ });
572
+ await ds.write(indexKey, bytes);
573
+ if (consumed.length > 0) await ds.delete(consumed);
574
+ },
575
+ urlsParquetUri(ctx) {
576
+ const key = sitemapUrlsIndexKey(ctx);
577
+ return ds.uri ? ds.uri(key) : void 0;
126
578
  }
127
579
  };
128
580
  }
@@ -206,4 +658,4 @@ function createEmptyTypesStore(opts) {
206
658
  }
207
659
  };
208
660
  }
209
- export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
661
+ export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
package/dist/index.d.mts CHANGED
@@ -1,6 +1,7 @@
1
1
  import { A as SyncStateDetail, B as WriteResult, C as QueryExecutor, D as SearchType, E as RunSQLOptions, F as TenantCtx, G as CompactionThresholds, H as inferLegacyTier, I as Watermark, K as enumeratePartitions, L as WatermarkFilter, M as SyncStateKind, N as SyncStateScope, O as StorageEngine, P as TableName, R as WatermarkScope, S as QueryExecuteResult, T as Row, U as inferSearchType, V as dayPartition, W as objectKey, _ as PurgeFilter, a as DataSource, b as QueryCtx, c as FileSetRef, d as LockScope, f as ManifestEntry, g as ParquetCodec, h as OptimizedQueryResult, i as DEFAULT_SEARCH_TYPE, j as SyncStateFilter, k as SyncState, l as GcCtx, m as ManifestStore, n as CompactionTier, o as EngineOptions, p as ManifestPurgeResult, r as ComparisonResult, s as ExtraResult, t as CodecCtx, u as ListLiveFilter, v as PurgeResult, w as QueryResult, x as QueryExecuteOptions, y as PurgeUrlsResult, z as WriteCtx } from "./_chunks/storage.mjs";
2
2
  import { a as createDuckDBExecutor, i as createDuckDBCodec, n as DuckDBHandle, r as canonicalEmptyParquetSchema, t as DuckDBFactory } from "./_chunks/duckdb.mjs";
3
3
  import { _ as pages, a as allTables, c as inferTable, d as TABLE_METADATA, f as countries, g as page_keywords, h as keywords, i as TableSchema, m as drizzleSchema, n as ColumnType, o as currentSchemaVersion, p as devices, r as SCHEMAS, s as dimensionToColumn, t as ColumnDef, u as DrizzleSchema } from "./_chunks/schema.mjs";
4
+ import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
4
5
  import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
5
6
  import { a as substituteNamedFiles, i as resolveToSQL, n as ResolvedQuery, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
6
7
  import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
@@ -9,4 +10,4 @@ declare function coerceRow(row: Row$1): Row$1;
9
10
  declare function coerceRows(rows: readonly Row$1[]): Row$1[];
10
11
  declare const MAX_DAY_BYTES: number;
11
12
  declare function createStorageEngine(opts: EngineOptions): StorageEngine;
12
- export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, objectKey, page_keywords, pages, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
13
+ export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, resolveToSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
package/dist/index.mjs CHANGED
@@ -5,6 +5,7 @@ import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
5
5
  import { a as createDuckDBExecutor, i as createDuckDBCodec, n as createStorageEngine, r as canonicalEmptyParquetSchema, t as MAX_DAY_BYTES } from "./_chunks/engine.mjs";
6
6
  import { createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
7
7
  import "./planner.mjs";
8
+ import { fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
8
9
  function coerceRow(row) {
9
10
  let mutated = null;
10
11
  for (const [k, v] of Object.entries(row)) if (typeof v === "bigint") {
@@ -18,4 +19,4 @@ function coerceRows(rows) {
18
19
  for (let i = 0; i < rows.length; i++) out[i] = coerceRow(rows[i]);
19
20
  return out;
20
21
  }
21
- export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, objectKey, page_keywords, pages, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
22
+ export { DEFAULT_SEARCH_TYPE, FILES_PLACEHOLDER, MAX_DAY_BYTES, SCHEMAS, TABLE_METADATA, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, resolveToSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow };
@@ -159,5 +159,38 @@ declare const topKeywords28dParquetRollup: RollupDef;
159
159
  * so downstream readers don't have to special-case first-run sites.
160
160
  */
161
161
  declare const indexingMetadataRollup: RollupDef;
162
+ /**
163
+ * Indexing-API health by day: per `inspectedAt` date, counts of indexed,
164
+ * soft-404, redirect, not-found, mobile passes, rich-results passes, and
165
+ * canonical mismatches. Sourced from the inspections parquet sidecar
166
+ * (`InspectionStore.parquetUri`), which holds the latest record per URL.
167
+ *
168
+ * Empty-payload no-op when the sidecar URI is unavailable (in-memory
169
+ * `DataSource`, or before `materialize` has run).
170
+ */
171
+ declare const indexingHealthRollup: RollupDef;
172
+ /**
173
+ * Per-day index-percent: ratio of (sitemap URLs that received GSC clicks on
174
+ * that date) / (total live sitemap URLs). Uses a DuckDB JOIN between the
175
+ * sitemap urls parquet (`SitemapStore.urlsParquetUri`) and the `pages` fact
176
+ * parquet. Total denominator is the count of live URLs in the urls index;
177
+ * numerator is per-day distinct loc count where pages.clicks > 0.
178
+ */
179
+ declare const indexPercentRollup: RollupDef;
180
+ /**
181
+ * Sitemap-health per-day series materialized from the sitemap-store JSON
182
+ * index. Each `SitemapRecord` carries `urlCount`, `errors`, `warnings`,
183
+ * `contentHash`, and `lastDownloaded`. We bucket records by the day of their
184
+ * `capturedAt` (or `lastDownloaded` fallback) and emit per-day aggregates plus
185
+ * a snapshot of per-feed stats at the most recent capture.
186
+ */
187
+ declare const sitemapHealthRollup: RollupDef;
188
+ /**
189
+ * Trailing-28-day sitemap URL changes: per-day per-feedpath {added, removed}
190
+ * counts plus rolling top-200 added and removed URLs. Streams from
191
+ * `SitemapStore.loadDeltas()` so it scales independently of how many feeds
192
+ * exist on the site.
193
+ */
194
+ declare const sitemapChanges28dRollup: RollupDef;
162
195
  declare const DEFAULT_ROLLUPS: readonly RollupDef[];
163
- export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildRollupResult, RebuildRollupsOptions, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
196
+ export { DEFAULT_ROLLUPS, ParquetRollupPointer, RebuildRollupResult, RebuildRollupsOptions, RollupCtx, RollupDef, RollupEngine, RollupEnvelope, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
package/dist/rollups.mjs CHANGED
@@ -1,5 +1,5 @@
1
- import { createIndexingMetadataStore } from "./entities.mjs";
2
1
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
2
+ import { createIndexingMetadataStore, createInspectionStore, createSitemapStore } from "./entities.mjs";
3
3
  import { MS_PER_DAY } from "gscdump";
4
4
  function rollupPrefix(ctx) {
5
5
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -335,12 +335,212 @@ const indexingMetadataRollup = {
335
335
  };
336
336
  }
337
337
  };
338
/**
 * Render `s` as a single-quoted SQL string literal.
 *
 * Embedded single quotes are doubled (`'` becomes `''`) so the value cannot
 * terminate the literal early.
 *
 * @param s Raw text to embed in a SQL statement.
 * @returns The quoted literal, including the surrounding quotes.
 */
function sqlString(s) {
	const escaped = s.split("'").join("''");
	return "'" + escaped + "'";
}
341
/**
 * Rollup `indexing_health`: per-day counters computed from the inspections
 * parquet file whose URI is supplied by `InspectionStore.parquetUri`.
 *
 * Rows are grouped by the first ten characters of `inspectedAt` (the calendar
 * date) and limited to dates on or after `utcDateMinusDays(builtAt, 90)`.
 * When the store yields no parquet URI the payload is `{ days: [] }` and no
 * SQL is executed.
 */
const indexingHealthRollup = {
	id: "indexing_health",
	windowDays: 90,
	async build({ engine, ctx, dataSource, builtAt }) {
		const inspections = createInspectionStore({ dataSource });
		const sidecarUri = inspections.parquetUri(ctx);
		if (!sidecarUri) return { days: [] };
		const since = utcDateMinusDays(builtAt, 90);
		const query = `
			SELECT
			substr(inspectedAt, 1, 10) AS date,
			COUNT(*)::BIGINT AS total_urls,
			SUM(CASE WHEN indexStatus = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS indexed_count,
			SUM(CASE WHEN pageFetchState = 'SOFT_404' THEN 1 ELSE 0 END)::BIGINT AS soft_404,
			SUM(CASE WHEN pageFetchState = 'REDIRECT_ERROR' THEN 1 ELSE 0 END)::BIGINT AS redirect,
			SUM(CASE WHEN pageFetchState = 'NOT_FOUND' THEN 1 ELSE 0 END)::BIGINT AS not_found,
			SUM(CASE WHEN mobileUsabilityVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS mobile_passes,
			SUM(CASE WHEN richResultsVerdict = 'PASS' THEN 1 ELSE 0 END)::BIGINT AS rich_results_passes,
			SUM(CASE WHEN userCanonical IS NOT NULL AND googleCanonical IS NOT NULL AND userCanonical <> googleCanonical THEN 1 ELSE 0 END)::BIGINT AS canonical_mismatches
			FROM read_parquet(${sqlString(sidecarUri)})
			WHERE substr(inspectedAt, 1, 10) >= '${since}'
			GROUP BY 1
			ORDER BY 1
		`;
		// The query reads the sidecar file directly, so no file sets are bound.
		// NOTE(review): `table: "pages"` presumably only satisfies runSQL's
		// required option here — confirm against the engine.
		const result = await engine.runSQL({
			ctx,
			table: "pages",
			fileSets: {},
			sql: query
		});
		// Normalise every aggregate to a plain JS number / string.
		const toDay = (row) => ({
			date: String(row.date),
			total_urls: Number(row.total_urls),
			indexed_count: Number(row.indexed_count),
			soft_404: Number(row.soft_404),
			redirect: Number(row.redirect),
			not_found: Number(row.not_found),
			mobile_passes: Number(row.mobile_passes),
			rich_results_passes: Number(row.rich_results_passes),
			canonical_mismatches: Number(row.canonical_mismatches)
		});
		return { days: result.rows.map((row) => toDay(row)) };
	}
};
382
/**
 * Rollup `index_percent`: for each date, how many live sitemap URLs received
 * at least one click, relative to the number of live sitemap URLs.
 *
 * Two queries run in sequence: a per-day distinct count of `pages.url` joined
 * to sitemap rows whose `removed_at` is NULL, then a count of those live
 * sitemap rows. The same total is used as the denominator for every day.
 * Without a sitemap urls parquet URI the payload is
 * `{ totalSitemapUrls: 0, days: [] }` and no SQL is executed.
 */
const indexPercentRollup = {
	id: "index_percent",
	windowDays: 90,
	async build({ engine, ctx, dataSource, builtAt }) {
		const sitemapUrlsUri = createSitemapStore({ dataSource }).urlsParquetUri(ctx);
		if (!sitemapUrlsUri) {
			return {
				totalSitemapUrls: 0,
				days: []
			};
		}
		const since = utcDateMinusDays(builtAt, 90);
		const clickedPerDaySql = `
			SELECT
			p.date AS date,
			COUNT(DISTINCT p.url)::BIGINT AS clicked_urls
			FROM read_parquet({{PAGES}}, union_by_name = true) p
			INNER JOIN read_parquet(${sqlString(sitemapUrlsUri)}) s
			ON s.loc = p.url AND s.removed_at IS NULL
			WHERE p.clicks > 0 AND p.date >= '${since}'
			GROUP BY p.date
			ORDER BY p.date
		`;
		const liveUrlCountSql = `
			SELECT COUNT(*)::BIGINT AS total
			FROM read_parquet(${sqlString(sitemapUrlsUri)})
			WHERE removed_at IS NULL
		`;
		// Numerator first, then denominator — executed one after the other.
		const clickedPerDay = await engine.runSQL({
			ctx,
			table: "pages",
			fileSets: { PAGES: { table: "pages" } },
			sql: clickedPerDaySql
		});
		const liveUrlCount = await engine.runSQL({
			ctx,
			table: "pages",
			fileSets: {},
			sql: liveUrlCountSql
		});
		const totalSitemapUrls = Number(liveUrlCount.rows[0]?.total ?? 0);
		const days = clickedPerDay.rows.map((row) => {
			const clickedUrls = Number(row.clicked_urls);
			// Guard the division so an empty sitemap reports a ratio of 0.
			const ratio = totalSitemapUrls === 0 ? 0 : clickedUrls / totalSitemapUrls;
			return {
				date: String(row.date),
				clicked_urls: clickedUrls,
				total_sitemap_urls: totalSitemapUrls,
				ratio
			};
		});
		return {
			totalSitemapUrls,
			days
		};
	}
};
433
/**
 * Rollup `sitemap_health`: aggregates the records of the sitemap store index
 * into per-day totals plus a flat list of per-feed stats.
 *
 * A record's day is the first ten characters of `capturedAt`, falling back to
 * `lastDownloaded`. Records with neither timestamp, or dated before
 * `utcDateMinusDays(builtAt, 90)`, are skipped entirely (they appear in
 * neither `days` nor `feeds`).
 */
const sitemapHealthRollup = {
	id: "sitemap_health",
	windowDays: 90,
	async build({ dataSource, ctx, builtAt }) {
		const sitemapIndex = await createSitemapStore({ dataSource }).loadIndex(ctx);
		const oldestDay = utcDateMinusDays(builtAt, 90);
		const totalsByDay = new Map();
		const feeds = [];
		for (const record of Object.values(sitemapIndex.records)) {
			const stamp = record.capturedAt ?? record.lastDownloaded ?? "";
			const day = stamp.slice(0, 10);
			if (!day || day < oldestDay) continue;
			const errors = Number(record.errors ?? 0);
			const warnings = Number(record.warnings ?? 0);
			const urlCount = Number(record.urlCount ?? 0);
			// Get-or-create the day's accumulator, then fold this feed into it.
			let totals = totalsByDay.get(day);
			if (totals == null) {
				totals = {
					day,
					feeds: 0,
					total_urls: 0,
					errors: 0,
					warnings: 0
				};
				totalsByDay.set(day, totals);
			}
			totals.feeds += 1;
			totals.total_urls += urlCount;
			totals.errors += errors;
			totals.warnings += warnings;
			feeds.push({
				path: record.path,
				urlCount,
				errors,
				warnings,
				contentHash: record.contentHash ?? null,
				lastDownloaded: record.lastDownloaded ?? null,
				capturedAt: record.capturedAt
			});
		}
		// Map keys are unique days, so the comparator never sees equal values.
		const days = [...totalsByDay.values()].sort((a, b) => a.day < b.day ? -1 : 1);
		return {
			days,
			feeds
		};
	}
};
476
/**
 * Rollup `sitemap_changes_28d`: streams sitemap deltas between
 * `utcDateMinusDays(builtAt, 28)` and `utcDateMinusDays(builtAt, 0)` and
 * produces
 *  - `days`: `{ day, feedpath, added, removed }` counters, ordered by day and
 *    then feedpath;
 *  - `topAdded` / `topRemoved`: the 200 entries with the largest `at`.
 *
 * The candidate lists for `topAdded` / `topRemoved` are pruned while
 * streaming, so memory stays bounded no matter how many deltas the window
 * contains. The result is identical to sorting every delta and slicing the
 * first 200: the sort is stable, and an entry dropped by a prune already has
 * 200 entries ranked ahead of it.
 */
const sitemapChanges28dRollup = {
	id: "sitemap_changes_28d",
	windowDays: 28,
	async build({ dataSource, ctx, builtAt }) {
		const TOP_N = 200;
		const newestFirst = (a, b) => b.at - a.at;
		// Append an entry; once the buffer reaches twice the cap, keep only the
		// newest TOP_N so the buffer never exceeds 2 * TOP_N entries.
		const remember = (buffer, entry) => {
			buffer.push(entry);
			if (buffer.length >= TOP_N * 2) {
				buffer.sort(newestFirst);
				buffer.length = TOP_N;
			}
		};
		const store = createSitemapStore({ dataSource });
		const from = utcDateMinusDays(builtAt, 28);
		const to = utcDateMinusDays(builtAt, 0);
		const counts = new Map();
		const addedTop = [];
		const removedTop = [];
		for await (const d of store.loadDeltas(ctx, {
			from,
			to
		})) {
			const day = new Date(d.at).toISOString().slice(0, 10);
			// NUL separates the two key parts so (day, feedpath) pairs stay distinct.
			const k = `${day}\x00${d.feedpath}`;
			let cur = counts.get(k);
			if (cur == null) {
				cur = {
					day,
					feedpath: d.feedpath,
					added: 0,
					removed: 0
				};
				counts.set(k, cur);
			}
			const entry = {
				loc: d.loc,
				feedpath: d.feedpath,
				at: d.at
			};
			// NOTE(review): every op other than "added" is counted as a removal —
			// confirm the delta stream only emits added/removed operations.
			if (d.op === "added") {
				cur.added += 1;
				remember(addedTop, entry);
			} else {
				cur.removed += 1;
				remember(removedTop, entry);
			}
		}
		const days = Array.from(counts.values()).sort((a, b) => {
			if (a.day !== b.day) return a.day < b.day ? -1 : 1;
			return a.feedpath < b.feedpath ? -1 : 1;
		});
		addedTop.sort(newestFirst);
		removedTop.sort(newestFirst);
		return {
			days,
			topAdded: addedTop.slice(0, TOP_N),
			topRemoved: removedTop.slice(0, TOP_N)
		};
	}
};
338
534
  const DEFAULT_ROLLUPS = [
339
535
  dailyTotalsRollup,
340
536
  weeklyTotalsRollup,
341
537
  topPages28dRollup,
342
538
  topKeywords28dRollup,
343
539
  topCountries28dRollup,
344
- indexingMetadataRollup
540
+ indexingMetadataRollup,
541
+ indexingHealthRollup,
542
+ indexPercentRollup,
543
+ sitemapHealthRollup,
544
+ sitemapChanges28dRollup
345
545
  ];
346
- export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
546
+ export { DEFAULT_ROLLUPS, dailyTotalsRollup, indexPercentRollup, indexingHealthRollup, indexingMetadataRollup, rebuildRollups, rollupKey, rollupParquetKey, sitemapChanges28dRollup, sitemapHealthRollup, topCountries28dRollup, topKeywords28dParquetRollup, topKeywords28dRollup, topPages28dRollup, weeklyTotalsRollup };
@@ -0,0 +1,19 @@
1
/**
 * Scheduling cursor produced by a {@link SchedulePolicy}: when the subject is
 * next due and how it got there.
 */
+ interface ScheduleState {
2
/**
 * Instant at which the subject becomes due, in the same unit as the `now` /
 * `at` values given to the policy. The shipped policies add multiples of
 * 86 400 000 (one day in milliseconds) to those values.
 */
+ nextAt: number;
3
/** Number of observations in a row that reported `changed: false`. */
+ consecutiveUnchanged: number;
4
/** `version` of the policy that produced this state. */
+ policyVersion: number;
5
+ }
/** Strategy that decides when something should be checked again. */
6
+ interface SchedulePolicy {
7
/** Version stamped into every state this policy returns. */
+ readonly version: number;
8
/** State for a subject with no history; the first check is one cadence after `now`. */
+ initial: (now: number) => ScheduleState;
9
/**
 * Fold one observation made at `evt.at` into `prev` and return the next state.
 * In the shipped policies a `prev` whose `policyVersion` differs from
 * `version` is discarded and the unchanged counter restarts at 0.
 */
+ observe: (prev: ScheduleState, evt: {
10
+ changed: boolean;
11
+ at: number;
12
+ }) => ScheduleState;
13
/** True once `now` has reached `state.nextAt`. */
+ isDue: (state: ScheduleState, now: number) => boolean;
14
+ }
15
/**
 * Back-off policy: re-check after 1 day, after 7 days once 3 consecutive
 * observations were unchanged, and after 30 days once 7 were. Any change
 * returns to the 1-day cadence.
 */
+ declare const sitemapPolicy: SchedulePolicy;
16
/** Verdict values accepted by {@link inspectionPolicy}. */
+ type InspectionVerdict = 'PASS' | 'FAIL' | 'NEUTRAL';
17
/**
 * Constant-cadence policy chosen by verdict: 30 days for `PASS`, 7 days for
 * `FAIL`, 14 days otherwise.
 */
+ declare function inspectionPolicy(verdict: InspectionVerdict): SchedulePolicy;
18
/** Constant-cadence policy: every state is due `intervalMs` after the latest observation. */
+ declare function fixedPolicy(intervalMs: number): SchedulePolicy;
19
+ export { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy };
@@ -0,0 +1,100 @@
1
/** One day in milliseconds. */
const DAY = 86400 * 1e3;

/**
 * Shared `isDue` implementation for every policy in this module.
 *
 * @returns true once `now` has reached `state.nextAt`.
 */
function isDue(state, now) {
	return state.nextAt <= now;
}

/** Assemble a state object; keys are always emitted in this order. */
function scheduleState(nextAt, consecutiveUnchanged, policyVersion) {
	return {
		nextAt,
		consecutiveUnchanged,
		policyVersion
	};
}

/**
 * Policy whose cadence never varies: each observation schedules the next
 * check `cadenceMs` later. The unchanged streak is still tracked, and it
 * restarts at 0 on a change or when `prev` was written by another version.
 */
function constantCadencePolicy(version, cadenceMs) {
	return {
		version,
		initial(now) {
			return scheduleState(now + cadenceMs, 0, version);
		},
		observe(prev, evt) {
			const continues = prev.policyVersion === version && !evt.changed;
			const streak = continues ? prev.consecutiveUnchanged + 1 : 0;
			return scheduleState(evt.at + cadenceMs, streak, version);
		},
		isDue
	};
}

/**
 * Sitemap re-check delay for a given unchanged streak:
 * 30 days from 7 unchanged observations, 7 days from 3, otherwise 1 day.
 */
function sitemapCadenceMs(consecutiveUnchanged) {
	const days = consecutiveUnchanged >= 7 ? 30 : consecutiveUnchanged >= 3 ? 7 : 1;
	return days * DAY;
}

const SITEMAP_VERSION = 1;

/** Back-off policy for sitemaps: the delay grows while nothing changes. */
const sitemapPolicy = {
	version: SITEMAP_VERSION,
	initial(now) {
		return scheduleState(now + DAY, 0, SITEMAP_VERSION);
	},
	observe(prev, evt) {
		// A change, or a state written by another policy version, restarts the
		// streak; a streak of 0 maps to the one-day cadence.
		const restart = prev.policyVersion !== SITEMAP_VERSION || evt.changed;
		const streak = restart ? 0 : prev.consecutiveUnchanged + 1;
		return scheduleState(evt.at + sitemapCadenceMs(streak), streak, SITEMAP_VERSION);
	},
	isDue
};

const INSPECTION_VERSION = 1;

/** Re-inspection delay by verdict: PASS 30 days, FAIL 7 days, anything else 14 days. */
function inspectionCadenceMs(verdict) {
	switch (verdict) {
		case "PASS":
			return 30 * DAY;
		case "FAIL":
			return 7 * DAY;
		default:
			return 14 * DAY;
	}
}

/** Constant-cadence policy whose interval is chosen by the inspection verdict. */
function inspectionPolicy(verdict) {
	return constantCadencePolicy(INSPECTION_VERSION, inspectionCadenceMs(verdict));
}

const FIXED_VERSION = 1;

/** Constant-cadence policy with a caller-supplied interval. */
function fixedPolicy(intervalMs) {
	return constantCadencePolicy(FIXED_VERSION, intervalMs);
}
100
+ export { fixedPolicy, inspectionPolicy, sitemapPolicy };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.10.0",
4
+ "version": "0.11.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -61,6 +61,11 @@
61
61
  "import": "./dist/sql-fragments.mjs",
62
62
  "default": "./dist/sql-fragments.mjs"
63
63
  },
64
+ "./schedule": {
65
+ "types": "./dist/schedule.d.mts",
66
+ "import": "./dist/schedule.mjs",
67
+ "default": "./dist/schedule.mjs"
68
+ },
64
69
  "./entities": {
65
70
  "types": "./dist/entities.d.mts",
66
71
  "import": "./dist/entities.mjs",
@@ -164,7 +169,7 @@
164
169
  "dependencies": {
165
170
  "drizzle-orm": "^0.45.2",
166
171
  "proper-lockfile": "^4.1.2",
167
- "gscdump": "0.10.0"
172
+ "gscdump": "0.11.0"
168
173
  },
169
174
  "devDependencies": {
170
175
  "@duckdb/duckdb-wasm": "^1.32.0",