@gscdump/engine 0.9.2 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import { a as DataSource } from "./_chunks/storage.mjs";
2
+ import { ScheduleState } from "./schedule.mjs";
2
3
  import { TenantCtx } from "gscdump/contracts";
3
4
  /**
4
5
  * GSC URL inspection result fields we persist. Mirrors the
@@ -28,8 +29,17 @@ interface InspectionRecord {
28
29
  * Free-form payload for fields we don't promote to first-class columns
29
30
  * (e.g. `referringUrls`, `crawledAs`). Keeps the wire format forward-compat
30
31
  * without bumping the schema for every API addition.
32
+ *
33
+ * Recognised keys:
34
+ * - `schedule`: optional `ScheduleState` from {@link inspectionPolicy}
35
+ * governing when this URL is next due for re-inspection. Undefined on
36
+ * pre-§0 records — readers must tolerate the missing field and fall
37
+ * back to default policy on first observe.
31
38
  */
32
- raw?: unknown;
39
+ raw?: {
40
+ schedule?: ScheduleState;
41
+ [key: string]: unknown;
42
+ };
33
43
  }
34
44
  /** Wire shape persisted to disk/R2. */
35
45
  interface InspectionIndex {
@@ -44,6 +54,7 @@ interface InspectionHistoryShard {
44
54
  }
45
55
  declare function inspectionIndexKey(ctx: TenantCtx): string;
46
56
  declare function emptyTypesKey(ctx: TenantCtx): string;
57
+ declare function inspectionParquetKey(ctx: TenantCtx): string;
47
58
  declare function inspectionHistoryKey(ctx: TenantCtx, yearMonth: string): string;
48
59
  /**
49
60
  * Stable URL hash used as the index key. Short, URL-safe, deterministic.
@@ -66,6 +77,34 @@ interface InspectionStore {
66
77
  loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
67
78
  /** Read the per-month history shard if it exists. */
68
79
  loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
80
+ /**
81
+ * Snapshot the current JSON index to a parquet sidecar at
82
+ * `entities/inspections/index.parquet`. One PUT. Sorted by `urlHash` so
83
+ * DuckDB row-group stats can prune URL-keyed JOINs efficiently.
84
+ *
85
+ * Internal seam: callers don't choose JSON-vs-parquet — the store materialises
86
+ * the parquet at end-of-batch (e.g. after `indexing/complete`) and readers
87
+ * pick the format that matches their access pattern (parquet for JOINs,
88
+ * JSON for full-index scans / point lookups).
89
+ *
90
+ * Returns the parquet object key (matches {@link parquetUri} after write).
91
+ */
92
+ materialize: (ctx: TenantCtx) => Promise<{
93
+ key: string;
94
+ rowCount: number;
95
+ bytes: number;
96
+ }>;
97
+ /**
98
+ * DuckDB-resolvable URI for the materialised parquet sidecar, or
99
+ * `undefined` if the underlying `DataSource` has no native URI shape
100
+ * (in-memory tests). When defined, read paths can `read_parquet(<uri>)`
101
+ * directly without staging bytes through JS.
102
+ *
103
+ * Does not check existence — caller is responsible for ensuring
104
+ * `materialize` has run at least once. Returning a URI for a missing key
105
+ * is safe; DuckDB will surface a 404 / not-found at query time.
106
+ */
107
+ parquetUri: (ctx: TenantCtx) => string | undefined;
69
108
  }
70
109
  interface CreateInspectionStoreOptions {
71
110
  dataSource: DataSource;
@@ -100,6 +139,12 @@ interface SitemapRecord {
100
139
  }>;
101
140
  /** Raw payload for fields we don't promote to first-class columns. */
102
141
  raw?: unknown;
142
+ /** Number of URLs observed in this feedpath at last snapshot. */
143
+ urlCount?: number;
144
+ /** Stable hash of the sorted normalized loc list at last snapshot. */
145
+ contentHash?: string;
146
+ /** Adaptive cadence state owned by `sitemapPolicy`. */
147
+ schedule?: ScheduleState;
103
148
  }
104
149
  interface SitemapIndex {
105
150
  version: 1;
@@ -114,6 +159,57 @@ interface SitemapHistoryDoc {
114
159
  }
115
160
  declare function sitemapIndexKey(ctx: TenantCtx): string;
116
161
  declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
162
+ declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
163
+ declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
164
/**
 * Parsed URL entry from a sitemap XML.
 *
 * This is the input element type of `SitemapStore.snapshotUrls` and
 * `hashUrlList`.
 */
interface ParsedUrl {
  /**
   * The URL itself. The store hashes this value to key the URL, and the
   * content hash is computed over the sorted `loc` values.
   */
  loc: string;
  /** ISO-8601 lastmod from the sitemap, if present. */
  lastmod?: string;
}
170
/**
 * A single URL row in the urls/index.parquet partition.
 *
 * Also the element type streamed by `SitemapStore.loadUrls`, where the index
 * rows are combined with any not-yet-compacted delta files.
 */
interface SitemapUrlRecord {
  /** Feedpath of the sitemap the URL was listed in. */
  feedpath: string;
  /** Store hash of `feedpath`; also the hash segment of delta file names. */
  feedpathHash: string;
  /** Store hash of `loc`; identifies the URL within a feedpath. */
  urlHash: string;
  /** The URL as listed in the sitemap. */
  loc: string;
  /** `lastmod` carried by the most recent "added" event, if any. */
  lastmod?: string;
  /**
   * Timestamp of the first "added" event for this URL. It is carried over when
   * a removed URL is added again.
   * NOTE(review): presumably epoch milliseconds (snapshot timestamps are fed to
   * `new Date(...)` when naming delta files) — confirm against the store's clock.
   */
  firstSeenAt: number;
  /**
   * Timestamp of the most recent "added" event. Snapshots in which the URL is
   * merely kept write no delta row and therefore do not advance this value.
   */
  lastSeenAt: number;
  /** Set when the URL has been removed. Null/undefined = currently live. */
  removedAt?: number;
}
182
/** Outcome of one `SitemapStore.snapshotUrls` call. */
interface SnapshotUrlsResult {
  /** Incoming URLs that were not live before (never seen, or previously removed). */
  added: number;
  /** URLs that were live before and are missing from the incoming list. */
  removed: number;
  /** URLs that were live before and are still present. */
  kept: number;
  /** `hashUrlList` of the incoming URL list. */
  contentHash: string;
  /** True when contentHash matched prior; the call performed zero writes. */
  unchanged: boolean;
}
190
/** One change event read back from a sitemap URL delta file. */
interface DeltaEntry {
  /** Feedpath of the sitemap the change was observed in. */
  feedpath: string;
  /** Store hash of `feedpath`. */
  feedpathHash: string;
  /** Store hash of `loc`. */
  urlHash: string;
  /** Whether the URL appeared in, or disappeared from, the sitemap. */
  op: 'added' | 'removed';
  /** The URL the event refers to. */
  loc: string;
  /** `lastmod` recorded with the event, if any. */
  lastmod?: string;
  /** Timestamp of the snapshot that produced the event. */
  at: number;
}
199
/**
 * Optional date window for `SitemapStore.loadDeltas`.
 *
 * Each bound is compared as a plain string against the `YYYY-MM-DD` date in
 * the delta file name; an omitted bound leaves that side of the window open.
 */
interface DateRange {
  /** YYYY-MM-DD inclusive. */
  from?: string;
  /** YYYY-MM-DD inclusive. */
  to?: string;
}
205
/** Options for `SitemapStore.loadUrls`. */
interface LoadUrlsOptions {
  /**
   * When true, removed URLs are streamed after the live ones.
   * Treated as `false` when omitted.
   */
  includeRemoved?: boolean;
}
208
+ /**
209
+ * Hash a URL list for change detection. Sorts then folds via FNV-1a so it's
210
+ * deterministic, locale-free, and cheap on Workers.
211
+ */
212
+ declare function hashUrlList(urls: readonly ParsedUrl[]): string;
117
213
  interface SitemapStore {
118
214
  /**
119
215
  * Persist a snapshot run. Updates the index + writes one immutable
@@ -124,6 +220,24 @@ interface SitemapStore {
124
220
  loadIndex: (ctx: TenantCtx) => Promise<SitemapIndex>;
125
221
  /** Fetch the latest snapshot for a feedpath, or undefined. */
126
222
  getLatest: (ctx: TenantCtx, path: string) => Promise<SitemapRecord | undefined>;
223
+ /**
224
+ * Diff incoming URLs against the prior `urls/index.parquet` partition for
225
+ * `feedpath`; on change, writes a single delta parquet under
226
+ * `urls/deltas/YYYY-MM-DD__{feedpathHash}.parquet`. Skipped (0 PUTs) when
227
+ * `contentHash` matches prior.
228
+ */
229
+ snapshotUrls: (ctx: TenantCtx, feedpath: string, urls: readonly ParsedUrl[]) => Promise<SnapshotUrlsResult>;
230
+ /** Stream live (and optionally removed) URL rows for a feedpath. */
231
+ loadUrls: (ctx: TenantCtx, feedpath: string, opts?: LoadUrlsOptions) => AsyncIterable<SitemapUrlRecord>;
232
+ /** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
233
+ loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
234
+ /**
235
+ * Fold every accumulated delta into the prior index; writes a fresh
236
+ * `urls/index.parquet` and deletes the consumed delta files.
237
+ */
238
+ compactUrls: (ctx: TenantCtx) => Promise<void>;
239
+ /** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
240
+ urlsParquetUri: (ctx: TenantCtx) => string | undefined;
127
241
  }
128
242
  interface CreateSitemapStoreOptions {
129
243
  dataSource: DataSource;
@@ -175,4 +289,4 @@ interface CreateEmptyTypesStoreOptions {
175
289
  now?: () => number;
176
290
  }
177
291
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
178
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
292
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
package/dist/entities.mjs CHANGED
@@ -1,3 +1,4 @@
1
+ import { decodeParquetToRows, encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
1
2
  const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
2
3
  function inspectionIndexKey(ctx) {
3
4
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json` : `u_${ctx.userId}/entities/inspections/index.json`;
@@ -5,6 +6,9 @@ function inspectionIndexKey(ctx) {
5
6
/**
 * Object key of the tenant's `empty-types.json` document.
 *
 * The key is nested under the site when `ctx.siteId` is truthy and sits
 * directly under the user otherwise.
 */
function emptyTypesKey(ctx) {
  const root = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${root}/entities/empty-types.json`;
}
9
/**
 * Object key of the parquet sidecar of the inspection index.
 *
 * The key is nested under the site when `ctx.siteId` is truthy and sits
 * directly under the user otherwise.
 */
function inspectionParquetKey(ctx) {
  const root = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${root}/entities/inspections/index.parquet`;
}
8
12
/**
 * Object key of the inspection history shard for one `yearMonth`.
 *
 * The key is nested under the site when `ctx.siteId` is truthy and sits
 * directly under the user otherwise.
 */
function inspectionHistoryKey(ctx, yearMonth) {
  const root = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${root}/entities/inspections/history/${yearMonth}.json`;
}
@@ -75,15 +79,251 @@ function createInspectionStore(opts) {
75
79
  },
76
80
  async loadHistory(ctx, yearMonth) {
77
81
  return await readJson(inspectionHistoryKey(ctx, yearMonth));
82
+ },
83
+ async materialize(ctx) {
84
+ const index = await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
85
+ const rows = Object.entries(index.records).map(([urlHash, r]) => ({
86
+ urlHash,
87
+ url: r.url,
88
+ inspectedAt: r.inspectedAt,
89
+ indexStatus: r.indexStatus ?? null,
90
+ lastCrawlTime: r.lastCrawlTime ?? null,
91
+ googleCanonical: r.googleCanonical ?? null,
92
+ userCanonical: r.userCanonical ?? null,
93
+ coverageState: r.coverageState ?? null,
94
+ robotsTxtState: r.robotsTxtState ?? null,
95
+ indexingState: r.indexingState ?? null,
96
+ pageFetchState: r.pageFetchState ?? null,
97
+ mobileUsabilityVerdict: r.mobileUsabilityVerdict ?? null,
98
+ richResultsVerdict: r.richResultsVerdict ?? null,
99
+ scheduleNextAt: r.raw?.schedule?.nextAt ?? null,
100
+ scheduleConsecutiveUnchanged: r.raw?.schedule?.consecutiveUnchanged ?? null,
101
+ schedulePolicyVersion: r.raw?.schedule?.policyVersion ?? null
102
+ }));
103
+ const bytes = encodeRowsToParquetFlex(rows, {
104
+ columns: INSPECTION_PARQUET_COLUMNS,
105
+ sortKey: ["urlHash"]
106
+ });
107
+ const key = inspectionParquetKey(ctx);
108
+ await ds.write(key, bytes);
109
+ return {
110
+ key,
111
+ rowCount: rows.length,
112
+ bytes: bytes.byteLength
113
+ };
114
+ },
115
+ parquetUri(ctx) {
116
+ return ds.uri?.(inspectionParquetKey(ctx));
78
117
  }
79
118
  };
80
119
  }
120
/**
 * Column layout of the inspection parquet sidecar, in output order.
 * Each tuple is `[name, type, nullable]`. Only the identifying columns are
 * required; every promoted inspection field and schedule column may be null.
 */
const INSPECTION_PARQUET_COLUMNS = [
  ["urlHash", "VARCHAR", false],
  ["url", "VARCHAR", false],
  ["inspectedAt", "VARCHAR", false],
  ["indexStatus", "VARCHAR", true],
  ["lastCrawlTime", "VARCHAR", true],
  ["googleCanonical", "VARCHAR", true],
  ["userCanonical", "VARCHAR", true],
  ["coverageState", "VARCHAR", true],
  ["robotsTxtState", "VARCHAR", true],
  ["indexingState", "VARCHAR", true],
  ["pageFetchState", "VARCHAR", true],
  ["mobileUsabilityVerdict", "VARCHAR", true],
  ["richResultsVerdict", "VARCHAR", true],
  ["scheduleNextAt", "BIGINT", true],
  ["scheduleConsecutiveUnchanged", "INTEGER", true],
  ["schedulePolicyVersion", "INTEGER", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
81
202
/**
 * Object key of the tenant's sitemap index document.
 *
 * The key is nested under the site when `ctx.siteId` is truthy and sits
 * directly under the user otherwise.
 */
function sitemapIndexKey(ctx) {
  const root = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${root}/entities/sitemaps/index.json`;
}
84
205
/**
 * Object key of one sitemap history document, named
 * `{feedpathHash}__{capturedAtMs}.json`.
 *
 * The key is nested under the site when `ctx.siteId` is truthy and sits
 * directly under the user otherwise.
 */
function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
  const root = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${root}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
}
208
/**
 * Key prefix (no trailing slash) under which the sitemap URL index and its
 * delta files are stored.
 *
 * The prefix is nested under the site when `ctx.siteId` is truthy and sits
 * directly under the user otherwise.
 */
function sitemapUrlsPrefix(ctx) {
  const root = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${root}/entities/sitemaps/urls`;
}
211
/** Object key of the compacted sitemap URL index (`index.parquet` under the URLs prefix). */
function sitemapUrlsIndexKey(ctx) {
  const prefix = sitemapUrlsPrefix(ctx);
  return prefix + "/index.parquet";
}
214
/**
 * Object key of the delta file for one feedpath on one date:
 * `deltas/{date}__{feedpathHash}.parquet` under the URLs prefix.
 * The date comes first in the file name.
 */
function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
  const fileName = `${date}__${feedpathHash}.parquet`;
  return `${sitemapUrlsPrefix(ctx)}/deltas/${fileName}`;
}
217
/**
 * Matches a delta object key and captures (1) the `YYYY-MM-DD` date and
 * (2) the feedpath hash from `.../urls/deltas/{date}__{hash}.parquet`.
 *
 * The hash group accepts any single path segment rather than only lowercase
 * hex. The file name is built from whatever hash function the store was
 * configured with, so a hex-only pattern would make deltas written with a
 * different hash alphabet invisible to every reader.
 */
const SITEMAP_URLS_DELTA_PREFIX_RE = /\/urls\/deltas\/(\d{4}-\d{2}-\d{2})__([^/]+)\.parquet$/;
218
/**
 * Column layout of the sitemap URL index parquet, in output order.
 * Each tuple is `[name, type, nullable]`; only `lastmod` and `removed_at`
 * may be null.
 */
const URLS_INDEX_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["first_seen_at", "BIGINT", false],
  ["last_seen_at", "BIGINT", false],
  ["removed_at", "BIGINT", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
260
/**
 * Column layout of a sitemap URL delta parquet, in output order.
 * Each tuple is `[name, type, nullable]`; only `lastmod` may be null.
 */
const URLS_DELTA_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["op", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["at", "BIGINT", false]
].map(([name, type, nullable]) => ({ name, type, nullable }));
297
/**
 * Convert a decoded index row (snake_case columns) into a URL record
 * (camelCase fields).
 *
 * Text columns are coerced with `String`, timestamp columns with `Number`.
 * A null or undefined `lastmod` / `removed_at` becomes `undefined`.
 */
function rowToUrlRecord(row) {
  const optional = (value, convert) => (value == null ? void 0 : convert(value));
  return {
    feedpath: String(row.feedpath),
    feedpathHash: String(row.feedpath_hash),
    urlHash: String(row.url_hash),
    loc: String(row.loc),
    lastmod: optional(row.lastmod, String),
    firstSeenAt: Number(row.first_seen_at),
    lastSeenAt: Number(row.last_seen_at),
    removedAt: optional(row.removed_at, Number)
  };
}
309
/**
 * Convert a URL record (camelCase fields) into an index row (snake_case
 * columns).
 *
 * A missing `lastmod` / `removedAt` is written as `null`.
 */
function urlRecordToRow(r) {
  const { feedpath, feedpathHash, urlHash, loc, lastmod, firstSeenAt, lastSeenAt, removedAt } = r;
  return {
    feedpath,
    feedpath_hash: feedpathHash,
    url_hash: urlHash,
    loc,
    lastmod: lastmod ?? null,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    removed_at: removedAt ?? null
  };
}
321
/**
 * First ten characters of the UTC ISO-8601 rendering of `ms`,
 * i.e. `YYYY-MM-DD` for four-digit years.
 */
function isoDate(ms) {
  const stamp = new Date(ms).toISOString();
  return stamp.substring(0, 10);
}
324
/**
 * Hash a URL list for change detection: the `loc` values are sorted with the
 * default (code-unit) ordering, joined with newlines, and passed to `hashUrl`.
 * The input array itself is not reordered.
 */
function hashUrlList(urls) {
  const locs = urls.map(({ loc }) => loc);
  locs.sort();
  const joined = locs.join("\n");
  return hashUrl(joined);
}
87
327
  function createSitemapStore(opts) {
88
328
  const ds = opts.dataSource;
89
329
  const hash = opts.hash ?? hashUrl;
@@ -123,6 +363,218 @@ function createSitemapStore(opts) {
123
363
  },
124
364
  async getLatest(ctx, path) {
125
365
  return (await readJson(sitemapIndexKey(ctx)))?.records[hash(path)];
366
+ },
367
+ async snapshotUrls(ctx, feedpath, urls) {
368
+ const fpHash = hash(feedpath);
369
+ const contentHash = hashUrlList(urls);
370
+ const at = now();
371
+ const priorByHash = /* @__PURE__ */ new Map();
372
+ for await (const rec of this.loadUrls(ctx, feedpath, { includeRemoved: true })) priorByHash.set(rec.urlHash, rec);
373
+ const livePrior = Array.from(priorByHash.values()).filter((r) => r.removedAt == null);
374
+ if (livePrior.length > 0) {
375
+ if (hashUrl(livePrior.map((r) => String(r.loc)).sort().join("\n")) === contentHash) return {
376
+ added: 0,
377
+ removed: 0,
378
+ kept: livePrior.length,
379
+ contentHash,
380
+ unchanged: true
381
+ };
382
+ }
383
+ const incomingByHash = /* @__PURE__ */ new Map();
384
+ for (const u of urls) incomingByHash.set(hash(u.loc), u);
385
+ const deltaRows = [];
386
+ let added = 0;
387
+ let removed = 0;
388
+ let kept = 0;
389
+ const date = isoDate(at);
390
+ for (const [urlHash, u] of incomingByHash) {
391
+ const prev = priorByHash.get(urlHash);
392
+ if (!prev || prev.removedAt != null) {
393
+ added++;
394
+ deltaRows.push({
395
+ feedpath,
396
+ feedpath_hash: fpHash,
397
+ url_hash: urlHash,
398
+ op: "added",
399
+ loc: u.loc,
400
+ lastmod: u.lastmod ?? null,
401
+ at
402
+ });
403
+ } else kept++;
404
+ }
405
+ for (const [urlHash, prev] of priorByHash) {
406
+ if (prev.removedAt != null) continue;
407
+ if (!incomingByHash.has(urlHash)) {
408
+ removed++;
409
+ deltaRows.push({
410
+ feedpath,
411
+ feedpath_hash: fpHash,
412
+ url_hash: urlHash,
413
+ op: "removed",
414
+ loc: prev.loc,
415
+ lastmod: prev.lastmod ?? null,
416
+ at
417
+ });
418
+ }
419
+ }
420
+ if (deltaRows.length > 0) {
421
+ const bytes = encodeRowsToParquetFlex(deltaRows, {
422
+ columns: URLS_DELTA_COLUMNS,
423
+ sortKey: ["url_hash"]
424
+ });
425
+ await ds.write(sitemapUrlsDeltaKey(ctx, fpHash, date), bytes);
426
+ }
427
+ return {
428
+ added,
429
+ removed,
430
+ kept,
431
+ contentHash,
432
+ unchanged: false
433
+ };
434
+ },
435
+ async *loadUrls(ctx, feedpath, opts) {
436
+ const fpHash = hash(feedpath);
437
+ const includeRemoved = opts?.includeRemoved ?? false;
438
+ const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
439
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
440
+ const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
441
+ const live = /* @__PURE__ */ new Map();
442
+ const removedMap = /* @__PURE__ */ new Map();
443
+ for (const row of indexRows) {
444
+ if (row.feedpath_hash !== fpHash) continue;
445
+ const rec = rowToUrlRecord(row);
446
+ if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
447
+ else live.set(rec.urlHash, rec);
448
+ }
449
+ for (const key of deltaKeys) {
450
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
451
+ if (!m || m[2] !== fpHash) continue;
452
+ const dBytes = await ds.read(key).catch(() => void 0);
453
+ if (!dBytes) continue;
454
+ const dRows = await decodeParquetToRows(dBytes);
455
+ for (const r of dRows) {
456
+ const op = String(r.op);
457
+ const urlHash = String(r.url_hash);
458
+ const at = Number(r.at);
459
+ if (op === "added") {
460
+ const prev = live.get(urlHash) ?? removedMap.get(urlHash);
461
+ removedMap.delete(urlHash);
462
+ live.set(urlHash, {
463
+ feedpath,
464
+ feedpathHash: fpHash,
465
+ urlHash,
466
+ loc: String(r.loc),
467
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
468
+ firstSeenAt: prev?.firstSeenAt ?? at,
469
+ lastSeenAt: at
470
+ });
471
+ } else if (op === "removed") {
472
+ const prev = live.get(urlHash);
473
+ live.delete(urlHash);
474
+ if (prev) removedMap.set(urlHash, {
475
+ ...prev,
476
+ removedAt: at
477
+ });
478
+ }
479
+ }
480
+ }
481
+ for (const rec of live.values()) yield rec;
482
+ if (includeRemoved) for (const rec of removedMap.values()) yield rec;
483
+ },
484
+ async *loadDeltas(ctx, dateRange) {
485
+ const from = dateRange?.from;
486
+ const to = dateRange?.to;
487
+ const keys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
488
+ for (const key of keys) {
489
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
490
+ if (!m) continue;
491
+ const date = m[1];
492
+ if (from && date < from) continue;
493
+ if (to && date > to) continue;
494
+ const bytes = await ds.read(key).catch(() => void 0);
495
+ if (!bytes) continue;
496
+ const rows = await decodeParquetToRows(bytes);
497
+ for (const r of rows) {
498
+ const op = String(r.op);
499
+ if (op !== "added" && op !== "removed") continue;
500
+ yield {
501
+ feedpath: String(r.feedpath),
502
+ feedpathHash: String(r.feedpath_hash),
503
+ urlHash: String(r.url_hash),
504
+ op,
505
+ loc: String(r.loc),
506
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
507
+ at: Number(r.at)
508
+ };
509
+ }
510
+ }
511
+ },
512
+ async compactUrls(ctx) {
513
+ const indexKey = sitemapUrlsIndexKey(ctx);
514
+ const indexBytes = await ds.read(indexKey).catch(() => void 0);
515
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
516
+ const stateKey = (fp, u) => `${fp}::${u}`;
517
+ const live = /* @__PURE__ */ new Map();
518
+ const removed = /* @__PURE__ */ new Map();
519
+ for (const row of indexRows) {
520
+ const rec = rowToUrlRecord(row);
521
+ const k = stateKey(rec.feedpathHash, rec.urlHash);
522
+ if (rec.removedAt != null) removed.set(k, rec);
523
+ else live.set(k, rec);
524
+ }
525
+ const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
526
+ const consumed = [];
527
+ for (const key of deltaKeys) {
528
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
529
+ if (!m) continue;
530
+ const fpHash = m[2];
531
+ const bytes = await ds.read(key).catch(() => void 0);
532
+ if (!bytes) continue;
533
+ consumed.push(key);
534
+ const rows = await decodeParquetToRows(bytes);
535
+ for (const r of rows) {
536
+ const urlHash = String(r.url_hash);
537
+ const at = Number(r.at);
538
+ const k = stateKey(fpHash, urlHash);
539
+ const op = String(r.op);
540
+ if (op === "added") {
541
+ const prev = live.get(k) ?? removed.get(k);
542
+ removed.delete(k);
543
+ live.set(k, {
544
+ feedpath: String(r.feedpath),
545
+ feedpathHash: fpHash,
546
+ urlHash,
547
+ loc: String(r.loc),
548
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
549
+ firstSeenAt: prev?.firstSeenAt ?? at,
550
+ lastSeenAt: at
551
+ });
552
+ } else if (op === "removed") {
553
+ const prev = live.get(k);
554
+ live.delete(k);
555
+ if (prev) removed.set(k, {
556
+ ...prev,
557
+ removedAt: at
558
+ });
559
+ }
560
+ }
561
+ }
562
+ const merged = [...live.values(), ...removed.values()];
563
+ merged.sort((a, b) => {
564
+ if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
565
+ if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
566
+ return 0;
567
+ });
568
+ const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
569
+ columns: URLS_INDEX_COLUMNS,
570
+ sortKey: ["feedpath_hash", "url_hash"]
571
+ });
572
+ await ds.write(indexKey, bytes);
573
+ if (consumed.length > 0) await ds.delete(consumed);
574
+ },
575
+ urlsParquetUri(ctx) {
576
+ const key = sitemapUrlsIndexKey(ctx);
577
+ return ds.uri ? ds.uri(key) : void 0;
126
578
  }
127
579
  };
128
580
  }
@@ -206,4 +658,4 @@ function createEmptyTypesStore(opts) {
206
658
  }
207
659
  };
208
660
  }
209
- export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, sitemapHistoryKey, sitemapIndexKey };
661
+ export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
package/dist/index.d.mts CHANGED
@@ -1,6 +1,7 @@
1
- import { A as SyncStateDetail, B as WriteResult, C as QueryExecutor, D as SearchType, E as RunSQLOptions, F as TenantCtx, G as monthPartition, H as inferLegacyTier, I as Watermark, J as quarterPartition, K as objectKey, L as WatermarkFilter, M as SyncStateKind, N as SyncStateScope, O as StorageEngine, P as TableName, R as WatermarkScope, S as QueryExecuteResult, T as Row, U as inferSearchType, V as dayPartition, W as mondayOfWeek, X as CompactionThresholds, Y as weekPartition, Z as enumeratePartitions, _ as PurgeFilter, a as DataSource, b as QueryCtx, c as FileSetRef, d as LockScope, f as ManifestEntry, g as ParquetCodec, h as OptimizedQueryResult, i as DEFAULT_SEARCH_TYPE, j as SyncStateFilter, k as SyncState, l as GcCtx, m as ManifestStore, n as CompactionTier, o as EngineOptions, p as ManifestPurgeResult, q as quarterOfMonth, r as ComparisonResult, s as ExtraResult, t as CodecCtx, u as ListLiveFilter, v as PurgeResult, w as QueryResult, x as QueryExecuteOptions, y as PurgeUrlsResult, z as WriteCtx } from "./_chunks/storage.mjs";
1
+ import { A as SyncStateDetail, B as WriteResult, C as QueryExecutor, D as SearchType, E as RunSQLOptions, F as TenantCtx, G as CompactionThresholds, H as inferLegacyTier, I as Watermark, K as enumeratePartitions, L as WatermarkFilter, M as SyncStateKind, N as SyncStateScope, O as StorageEngine, P as TableName, R as WatermarkScope, S as QueryExecuteResult, T as Row, U as inferSearchType, V as dayPartition, W as objectKey, _ as PurgeFilter, a as DataSource, b as QueryCtx, c as FileSetRef, d as LockScope, f as ManifestEntry, g as ParquetCodec, h as OptimizedQueryResult, i as DEFAULT_SEARCH_TYPE, j as SyncStateFilter, k as SyncState, l as GcCtx, m as ManifestStore, n as CompactionTier, o as EngineOptions, p as ManifestPurgeResult, r as ComparisonResult, s as ExtraResult, t as CodecCtx, u as ListLiveFilter, v as PurgeResult, w as QueryResult, x as QueryExecuteOptions, y as PurgeUrlsResult, z as WriteCtx } from "./_chunks/storage.mjs";
2
2
  import { a as createDuckDBExecutor, i as createDuckDBCodec, n as DuckDBHandle, r as canonicalEmptyParquetSchema, t as DuckDBFactory } from "./_chunks/duckdb.mjs";
3
3
  import { _ as pages, a as allTables, c as inferTable, d as TABLE_METADATA, f as countries, g as page_keywords, h as keywords, i as TableSchema, m as drizzleSchema, n as ColumnType, o as currentSchemaVersion, p as devices, r as SCHEMAS, s as dimensionToColumn, t as ColumnDef, u as DrizzleSchema } from "./_chunks/schema.mjs";
4
+ import { InspectionVerdict, SchedulePolicy, ScheduleState, fixedPolicy, inspectionPolicy, sitemapPolicy } from "./schedule.mjs";
4
5
  import { GscApiRow, IngestOptions, RowAccumulator, RowAccumulatorOptions, createRowAccumulator, toPath, toSumPosition, transformGscRow } from "./ingest.mjs";
5
6
  import { a as substituteNamedFiles, i as resolveToSQL, n as ResolvedQuery, t as FILES_PLACEHOLDER } from "./_chunks/planner.mjs";
6
7
  import { bindLiterals, formatLiteral } from "./sql-bind.mjs";
@@ -9,4 +10,4 @@ declare function coerceRow(row: Row$1): Row$1;
9
10
  declare function coerceRows(rows: readonly Row$1[]): Row$1[];
10
11
  declare const MAX_DAY_BYTES: number;
11
12
  declare function createStorageEngine(opts: EngineOptions): StorageEngine;
12
- export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, formatLiteral, inferLegacyTier, inferSearchType, inferTable, keywords, mondayOfWeek, monthPartition, objectKey, page_keywords, pages, quarterOfMonth, quarterPartition, resolveToSQL, substituteNamedFiles, toPath, toSumPosition, transformGscRow, weekPartition };
13
+ export { type CodecCtx, type ColumnDef, type ColumnType, type CompactionThresholds, type CompactionTier, type ComparisonResult, DEFAULT_SEARCH_TYPE, type DataSource, type DrizzleSchema, type DuckDBFactory, type DuckDBHandle, type EngineOptions, type ExtraResult, FILES_PLACEHOLDER, type FileSetRef, type GcCtx, type GscApiRow, type IngestOptions, type InspectionVerdict, type ListLiveFilter, type LockScope, MAX_DAY_BYTES, type ManifestEntry, type ManifestPurgeResult, type ManifestStore, type OptimizedQueryResult, type ParquetCodec, type PurgeFilter, type PurgeResult, type PurgeUrlsResult, type QueryCtx, type QueryExecuteOptions, type QueryExecuteResult, type QueryExecutor, type QueryResult, type ResolvedQuery, type Row, type RowAccumulator, type RowAccumulatorOptions, type RunSQLOptions, SCHEMAS, type SchedulePolicy, type ScheduleState, type SearchType, type StorageEngine, type SyncState, type SyncStateDetail, type SyncStateFilter, type SyncStateKind, type SyncStateScope, TABLE_METADATA, type TableName, type TableSchema, type TenantCtx, type Watermark, type WatermarkFilter, type WatermarkScope, type WriteCtx, type WriteResult, allTables, bindLiterals, canonicalEmptyParquetSchema, coerceRow, coerceRows, countries, createDuckDBCodec, createDuckDBExecutor, createRowAccumulator, createStorageEngine, currentSchemaVersion, dayPartition, devices, dimensionToColumn, drizzleSchema, enumeratePartitions, fixedPolicy, formatLiteral, inferLegacyTier, inferSearchType, inferTable, inspectionPolicy, keywords, objectKey, page_keywords, pages, resolveToSQL, sitemapPolicy, substituteNamedFiles, toPath, toSumPosition, transformGscRow };