@gscdump/engine 0.25.14 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,23 @@
1
+ import { engineErrors } from "../errors.mjs";
2
+ import { isQueryError } from "gscdump/query";
1
3
  /**
   * Error describing an analyzer whose required capabilities are missing.
   *
   * Fields assigned by the constructor:
   * - `tool`: the first constructor argument.
   * - `missing`: the second constructor argument.
   * - `engineError` (added in this diff): the value returned by
   *   `engineErrors.analyzerCapabilityMissing(tool, missing)`; its `message`
   *   becomes this error's message in place of the inline template string
   *   shown on the removed line.
   */
  var AnalyzerCapabilityError = class extends Error {
2
4
  tool;
3
5
  missing;
6
+ engineError;
4
7
  constructor(tool, missing) {
5
- super(`analyzer "${tool}" requires capabilities [${missing.join(", ")}] not provided by source`);
8
+ const engineError = engineErrors.analyzerCapabilityMissing(tool, missing);
9
+ super(engineError.message);
6
10
  this.tool = tool;
7
11
  this.missing = missing;
8
12
  this.name = "AnalyzerCapabilityError";
13
+ this.engineError = engineError;
9
14
  }
10
15
  };
16
/**
 * Reports whether `err` signals an unresolvable dataset.
 *
 * Two shapes are accepted: an error carrying a `queryError` that
 * `isQueryError` accepts and whose `kind` is "unresolvable-dataset", or an
 * error whose `name` is "UnresolvableDatasetError".
 *
 * @param {unknown} err - The caught value; may be null or undefined.
 * @returns {boolean} True when either shape matches.
 */
function isUnresolvableDatasetError(err) {
  const attached = err?.queryError;
  const hasTypedKind = isQueryError(attached) && attached.kind === "unresolvable-dataset";
  return hasTypedKind || err?.name === "UnresolvableDatasetError";
}
11
21
  function sourceHas(source, cap) {
12
22
  if (cap === "executeSql") return typeof source.executeSql === "function";
13
23
  return source.capabilities[cap] === true;
@@ -28,7 +38,7 @@ async function runAnalyzerFromSource(source, params, registry) {
28
38
  try {
29
39
  plan = analyzer.build(params, buildCtx);
30
40
  } catch (err) {
31
- const rowsVariant = err?.name === "UnresolvableDatasetError" ? registry.getAnalyzerVariants(params.type)?.rows : void 0;
41
+ const rowsVariant = isUnresolvableDatasetError(err) ? registry.getAnalyzerVariants(params.type)?.rows : void 0;
32
42
  if (!rowsVariant) throw err;
33
43
  assertSatisfies(rowsVariant, source);
34
44
  analyzer = rowsVariant;
@@ -0,0 +1,664 @@
1
+ import { decodeParquetToRows, encodeRowsToParquetFlex } from "../adapters/hyparquet.mjs";
2
/**
 * Heuristically decides whether a thrown value means "the key does not exist".
 *
 * A value qualifies when it is a non-null object and either its `code` is
 * "ENOENT", its `name` is "NotFoundError", or its string `message` contains
 * "not found", "ENOENT" or "missing key" (case-insensitive, word-bounded).
 *
 * @param {unknown} e - The caught value.
 * @returns {boolean} True when the value looks like a missing-key error.
 */
function isMissingKeyError(e) {
  if (e === null || typeof e !== "object") return false;
  if (e.code === "ENOENT" || e.name === "NotFoundError") return true;
  const text = e.message;
  return typeof text === "string" && /\bnot found\b|\bENOENT\b|\bmissing key\b/i.test(text);
}
10
/**
 * Reads `key` from `ds`, resolving to `undefined` when the key is absent.
 *
 * When the data source exposes `head`, it is consulted first and an
 * `undefined` result short-circuits the read. Otherwise (or when `head`
 * reports something) `ds.read(key, undefined, signal)` is awaited and any
 * failure that `isMissingKeyError` recognises is mapped to `undefined`.
 *
 * The read is wrapped in `try`/`await` rather than chained with `.catch` so
 * that a data source whose `read` throws synchronously, or returns a plain
 * value instead of a promise, is handled the same way as an async one.
 *
 * @param {object} ds - Data source with `read` and optionally `head`.
 * @param {string} key - Storage key to read.
 * @param {AbortSignal} [signal] - Forwarded to `ds.read` as its third argument.
 * @returns {Promise<*>} Whatever `ds.read` yields, or `undefined` when missing.
 * @throws Re-throws any read failure that is not a missing-key error.
 */
async function readOptional(ds, key, signal) {
  if (ds.head) {
    if (await ds.head(key) === void 0) return void 0;
  }
  try {
    return await ds.read(key, void 0, signal);
  } catch (e) {
    if (isMissingKeyError(e)) return void 0;
    throw e;
  }
}
19
// Matches a leading `DDDD-DD-` prefix, capturing the four-digit and two-digit groups.
+ const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
20
/**
 * Builds the storage key of the inspections JSON index.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @returns {string} The key under `u_<userId>/[<siteId>/]entities/inspections/`.
 */
function inspectionIndexKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.json`;
  }
  return `u_${ctx.userId}/entities/inspections/index.json`;
}
23
/**
 * Builds the storage key of the empty-types document.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @returns {string} The key ending in `entities/empty-types.json`.
 */
function emptyTypesKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/empty-types.json`;
  }
  return `u_${ctx.userId}/entities/empty-types.json`;
}
26
/**
 * Builds the storage key of the materialized inspections Parquet file.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @returns {string} The key ending in `entities/inspections/index.parquet`.
 */
function inspectionParquetKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet`;
  }
  return `u_${ctx.userId}/entities/inspections/index.parquet`;
}
29
/**
 * Builds the key prefix under which one month's inspection history shards live.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @param {string} yearMonth - Shard bucket name, appended verbatim.
 * @returns {string} The prefix, without a trailing slash.
 */
function inspectionHistoryPrefix(ctx, yearMonth) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}`;
  }
  return `u_${ctx.userId}/entities/inspections/history/${yearMonth}`;
}
32
/**
 * Builds the key of a single inspection history shard.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope passed to `inspectionHistoryPrefix`.
 * @param {string} yearMonth - Shard bucket name.
 * @param {string} batchId - Identifier used as the shard's file name.
 * @returns {string} `<history prefix>/<batchId>.json`.
 */
function inspectionHistoryShardKey(ctx, yearMonth, batchId) {
  const prefix = inspectionHistoryPrefix(ctx, yearMonth);
  return `${prefix}/${batchId}.json`;
}
35
/**
 * Hashes a string to 16 lowercase hex characters.
 *
 * The state is held as two 32-bit words. For every UTF-16 code unit the low
 * word is XOR-ed with the unit and the state is then advanced with
 * multiplications by 435, a carry derived from the low word, and the low word
 * folded into the high word. The arithmetic (including the signed value that
 * `^=` leaves in the low word) is kept exactly as-is: `hashUrl` is the
 * default `hash` for the stores in this module, so its output ends up in
 * stored keys and any change would orphan existing data.
 *
 * @param {string} url - Text to hash.
 * @returns {string} High word then low word, each zero-padded to 8 hex digits.
 */
function hashUrl(url) {
  let high = 2166136261;
  let low = 3421674724;
  let index = 0;
  while (index < url.length) {
    // `^=` coerces `low` to a signed 32-bit integer; the carry below relies on that.
    low ^= url.charCodeAt(index);
    const nextLow = Math.imul(low, 435) >>> 0;
    const carry = Math.floor(low * 435 / 4294967296);
    high = Math.imul(high, 435) + Math.imul(low, 1) + carry >>> 0;
    low = nextLow;
    index += 1;
  }
  const toHex = (word) => (word >>> 0).toString(16).padStart(8, "0");
  return toHex(high) + toHex(low);
}
49
/** Upper bound, in bytes, for one encoded inspection history shard (5 MiB). */
const INSPECTION_HISTORY_MAX_BYTES = 5 * 1024 ** 2;
50
/**
 * Column schema handed to the Parquet encoder when the inspection index is
 * materialized. Each tuple is `[name, type, nullable]` and is expanded into a
 * `{ name, type, nullable }` descriptor, in this order.
 */
const INSPECTION_PARQUET_COLUMNS = [
  ["urlHash", "VARCHAR", false],
  ["url", "VARCHAR", false],
  ["inspectedAt", "VARCHAR", false],
  ["indexStatus", "VARCHAR", true],
  ["lastCrawlTime", "VARCHAR", true],
  ["googleCanonical", "VARCHAR", true],
  ["userCanonical", "VARCHAR", true],
  ["coverageState", "VARCHAR", true],
  ["robotsTxtState", "VARCHAR", true],
  ["indexingState", "VARCHAR", true],
  ["pageFetchState", "VARCHAR", true],
  ["mobileUsabilityVerdict", "VARCHAR", true],
  ["richResultsVerdict", "VARCHAR", true],
  ["scheduleNextAt", "BIGINT", true],
  ["scheduleConsecutiveUnchanged", "INTEGER", true],
  ["schedulePolicyVersion", "INTEGER", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
132
/**
 * Creates the store that persists URL-inspection history shards and the
 * materialized inspection Parquet file on top of `opts.dataSource`.
 *
 * @param {{dataSource: object}} opts - `dataSource` must provide `write` and
 *   `list`; reads go through `readOptional`; `uri` is optional.
 * @returns {{appendHistory: Function, loadHistory: Function, materialize: Function, parquetUri: Function}}
 */
function createInspectionStore(opts) {
  const ds = opts.dataSource;

  /** Derives the shard bucket from `record.inspectedAt`, or "unknown" when it has no `YYYY-MM-` prefix. */
  function shardFor(record) {
    const m = YEAR_MONTH_RE.exec(record.inspectedAt);
    return m ? `${m[1]}-${m[2]}` : "unknown";
  }

  /** Produces a batch id: a UUID when `crypto.randomUUID` exists, otherwise a time + random base-36 string. */
  function randomBatchId() {
    return typeof crypto !== "undefined" && "randomUUID" in crypto ? crypto.randomUUID() : `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
  }

  return {
    /**
     * Writes `records` as one JSON shard per month bucket.
     *
     * Every shard is encoded and size-checked before the first write, so an
     * oversize shard rejects the whole call without leaving part of the batch
     * persisted.
     *
     * @param {object} ctx - Key scope (`userId`, optional `siteId`).
     * @param {Array<object>} records - Records carrying `inspectedAt`.
     * @param {{batchId?: string}} [options] - Optional explicit shard id.
     * @throws {Error} When an encoded shard exceeds `INSPECTION_HISTORY_MAX_BYTES`.
     */
    async appendHistory(ctx, records, options) {
      if (records.length === 0) return;
      const batchId = options?.batchId ?? randomBatchId();
      const byMonth = /* @__PURE__ */ new Map();
      for (const r of records) {
        const month = shardFor(r);
        const bucket = byMonth.get(month);
        if (bucket) bucket.push(r);
        else byMonth.set(month, [r]);
      }
      const encoder = new TextEncoder();
      const pending = [];
      for (const [yearMonth, batch] of byMonth) {
        const bytes = encoder.encode(JSON.stringify({
          version: 1,
          records: batch
        }));
        if (bytes.byteLength > INSPECTION_HISTORY_MAX_BYTES) throw new Error(`inspection history shard exceeds ${INSPECTION_HISTORY_MAX_BYTES} bytes (got ${bytes.byteLength}); split the batch`);
        pending.push([inspectionHistoryShardKey(ctx, yearMonth, batchId), bytes]);
      }
      for (const [key, bytes] of pending) await ds.write(key, bytes);
    },

    /**
     * Loads and concatenates every shard listed under one month bucket.
     *
     * Shards that vanish between list and read are skipped; shards that fail
     * to decode are logged with `console.warn` and skipped; shards whose
     * `records` is not an array contribute nothing.
     *
     * @param {object} ctx - Key scope.
     * @param {string} yearMonth - Bucket name.
     * @returns {Promise<{version: 1, records: Array}|undefined>} `undefined`
     *   when the bucket lists no keys.
     */
    async loadHistory(ctx, yearMonth) {
      const keys = await ds.list(inspectionHistoryPrefix(ctx, yearMonth));
      if (keys.length === 0) return void 0;
      const out = [];
      const decoder = new TextDecoder();
      for (const key of keys) {
        const bytes = await readOptional(ds, key);
        if (!bytes) continue;
        let shard;
        try {
          shard = JSON.parse(decoder.decode(bytes));
        } catch (err) {
          console.warn("[inspection.loadHistory] failed to decode shard", {
            key,
            error: err.message
          });
          continue;
        }
        const records = shard?.records;
        if (!Array.isArray(records)) continue;
        // Plain loop instead of push(...records): spreading a very large
        // shard can exceed the engine's argument-count limit.
        for (const record of records) out.push(record);
      }
      return {
        version: 1,
        records: out
      };
    },

    /**
     * Encodes `rowIter` (sorted by `urlHash`) to Parquet and writes it to the
     * inspection Parquet key.
     *
     * @param {object} ctx - Key scope.
     * @param {Iterable<object>} rowIter - Rows matching `INSPECTION_PARQUET_COLUMNS`.
     * @returns {Promise<{key: string, rowCount: number, bytes: number}>}
     */
    async materialize(ctx, rowIter) {
      const rows = Array.from(rowIter);
      rows.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
      const bytes = encodeRowsToParquetFlex(rows, {
        columns: INSPECTION_PARQUET_COLUMNS,
        sortKey: ["urlHash"]
      });
      const key = inspectionParquetKey(ctx);
      await ds.write(key, bytes);
      return {
        key,
        rowCount: rows.length,
        bytes: bytes.byteLength
      };
    },

    /** Returns `ds.uri(<inspection parquet key>)`, or `undefined` when the data source has no `uri`. */
    parquetUri(ctx) {
      return ds.uri?.(inspectionParquetKey(ctx));
    }
  };
}
201
/**
 * Builds the storage key of the sitemaps JSON index.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @returns {string} The key ending in `entities/sitemaps/index.json`.
 */
function sitemapIndexKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/index.json`;
  }
  return `u_${ctx.userId}/entities/sitemaps/index.json`;
}
204
/**
 * Builds the storage key of one sitemap history snapshot.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @param {string} feedpathHash - Hash identifying the sitemap feed.
 * @param {number} capturedAtMs - Stamp appended after a double underscore.
 * @returns {string} `.../entities/sitemaps/history/<hash>__<stamp>.json`.
 */
function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
  }
  return `u_${ctx.userId}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
}
207
/**
 * Builds the key prefix under which sitemap URL indexes and deltas are stored.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @returns {string} The prefix ending in `entities/sitemaps/urls` (no trailing slash).
 */
function sitemapUrlsPrefix(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/urls`;
  }
  return `u_${ctx.userId}/entities/sitemaps/urls`;
}
210
/**
 * Builds the key prefix that holds the per-feed URL index files.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope passed to `sitemapUrlsPrefix`.
 * @returns {string} `<urls prefix>/by-feed`.
 */
function sitemapUrlsIndexPrefix(ctx) {
  const base = sitemapUrlsPrefix(ctx);
  return `${base}/by-feed`;
}
213
/**
 * Builds the storage key of one feed's compacted URL index.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope passed to `sitemapUrlsIndexPrefix`.
 * @param {string} feedpathHash - Hash identifying the sitemap feed.
 * @returns {string} `<by-feed prefix>/<feedpathHash>/index.parquet`.
 */
function sitemapUrlsIndexKey(ctx, feedpathHash) {
  const byFeed = sitemapUrlsIndexPrefix(ctx);
  return `${byFeed}/${feedpathHash}/index.parquet`;
}
216
/**
 * Builds the storage key of one feed's URL delta file for a given date.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope passed to `sitemapUrlsPrefix`.
 * @param {string} feedpathHash - Hash identifying the sitemap feed.
 * @param {string} date - Date label placed before the double underscore.
 * @returns {string} `<urls prefix>/deltas/<date>__<feedpathHash>.parquet`.
 */
function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
  const base = sitemapUrlsPrefix(ctx);
  return `${base}/deltas/${date}__${feedpathHash}.parquet`;
}
219
// Matches keys ending in `/urls/deltas/<YYYY-MM-DD>__<lowercase hex>.parquet`;
// group 1 captures the date and group 2 the hex segment.
+ const SITEMAP_URLS_DELTA_PREFIX_RE = /\/urls\/deltas\/(\d{4}-\d{2}-\d{2})__([0-9a-f]+)\.parquet$/;
220
/**
 * Column schema handed to the Parquet encoder for the compacted per-feed URL
 * index. Each tuple is `[name, type, nullable]` and is expanded into a
 * `{ name, type, nullable }` descriptor, in this order.
 */
const URLS_INDEX_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["first_seen_at", "BIGINT", false],
  ["last_seen_at", "BIGINT", false],
  ["removed_at", "BIGINT", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
262
/**
 * Column schema handed to the Parquet encoder for sitemap URL delta files.
 * Each tuple is `[name, type, nullable]` and is expanded into a
 * `{ name, type, nullable }` descriptor, in this order.
 */
const URLS_DELTA_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["op", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["at", "BIGINT", false]
].map(([name, type, nullable]) => ({ name, type, nullable }));
299
/**
 * Converts a decoded URL-index row (snake_case columns) into a camelCase
 * record. Text columns are coerced with `String`, timestamp columns with
 * `Number`; a null or undefined `lastmod` / `removed_at` becomes `undefined`.
 *
 * @param {object} row - Row with the `URLS_INDEX_COLUMNS` fields.
 * @returns {object} Record with `feedpath`, `feedpathHash`, `urlHash`, `loc`,
 *   `lastmod`, `firstSeenAt`, `lastSeenAt` and `removedAt`.
 */
function rowToUrlRecord(row) {
  const optional = (value, convert) => (value == null ? void 0 : convert(value));
  const record = {
    feedpath: String(row.feedpath),
    feedpathHash: String(row.feedpath_hash),
    urlHash: String(row.url_hash),
    loc: String(row.loc),
    lastmod: optional(row.lastmod, String),
    firstSeenAt: Number(row.first_seen_at),
    lastSeenAt: Number(row.last_seen_at),
    removedAt: optional(row.removed_at, Number)
  };
  return record;
}
311
/**
 * Converts a camelCase URL record into the snake_case row shape used when
 * encoding the URL index. Missing `lastmod` / `removedAt` are written as `null`.
 *
 * @param {object} r - Record as produced by `rowToUrlRecord`.
 * @returns {object} Row with the `URLS_INDEX_COLUMNS` fields.
 */
function urlRecordToRow(r) {
  const { feedpath, feedpathHash, urlHash, loc, lastmod, firstSeenAt, lastSeenAt, removedAt } = r;
  return {
    feedpath,
    feedpath_hash: feedpathHash,
    url_hash: urlHash,
    loc,
    lastmod: lastmod ?? null,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    removed_at: removedAt ?? null
  };
}
323
/**
 * Formats an epoch-millisecond timestamp as the first ten characters of its
 * ISO-8601 (UTC) representation, i.e. `YYYY-MM-DD` for four-digit years.
 *
 * @param {number} ms - Milliseconds since the Unix epoch.
 * @returns {string} The date portion of `toISOString()`.
 * @throws {RangeError} When `ms` does not describe a valid date.
 */
function isoDate(ms) {
  const iso = new Date(ms).toISOString();
  return iso.slice(0, 10);
}
326
/**
 * Produces an order-independent content hash for a list of sitemap URLs: the
 * `loc` values are sorted (default string order), joined with newlines and
 * hashed with `hashUrl`. The input array itself is not reordered.
 *
 * @param {Array<{loc: string}>} urls - Entries whose `loc` is hashed.
 * @returns {string} Hash of the sorted, newline-joined `loc` values.
 */
function hashUrlList(urls) {
  const locs = urls.map((entry) => entry.loc);
  locs.sort();
  return hashUrl(locs.join("\n"));
}
329
/**
 * Creates the store that persists sitemap snapshots (JSON index + history)
 * and per-feed URL membership (compacted Parquet index + dated delta files).
 *
 * @param {{dataSource: object, hash?: Function, now?: Function}} opts
 *   `dataSource` must provide `write`, `list` and `delete`; reads go through
 *   `readOptional`. `hash` defaults to `hashUrl`, `now` to `Date.now`.
 * @returns {object} Store with `writeSnapshot`, `loadIndex`, `getLatest`,
 *   `snapshotUrls`, `loadUrls`, `loadDeltas` and `compactUrls`.
 */
function createSitemapStore(opts) {
  const ds = opts.dataSource;
  const hash = opts.hash ?? hashUrl;
  const now = opts.now ?? (() => Date.now());

  /** Reads and parses a JSON document, or resolves to `undefined` when the key is absent. */
  async function readJson(key) {
    const bytes = await readOptional(ds, key);
    if (bytes === void 0) return void 0;
    return JSON.parse(new TextDecoder().decode(bytes));
  }

  /** Serializes `value` as JSON and writes it to `key`. */
  async function writeJson(key, value) {
    await ds.write(key, new TextEncoder().encode(JSON.stringify(value)));
  }

  /** Normalizes a decoded delta row back into the shape handed to the encoder. */
  function toDeltaRow(r) {
    return {
      feedpath: String(r.feedpath),
      feedpath_hash: String(r.feedpath_hash),
      url_hash: String(r.url_hash),
      op: String(r.op),
      loc: String(r.loc),
      lastmod: r.lastmod == null ? null : String(r.lastmod),
      at: Number(r.at)
    };
  }

  /** Splits decoded index rows into live / removed maps keyed by URL hash. */
  function partitionIndexRows(indexRows) {
    const live = /* @__PURE__ */ new Map();
    const removed = /* @__PURE__ */ new Map();
    for (const row of indexRows) {
      const rec = rowToUrlRecord(row);
      if (rec.removedAt != null) removed.set(rec.urlHash, rec);
      else live.set(rec.urlHash, rec);
    }
    return { live, removed };
  }

  /** Applies one "added" / "removed" delta row to the live / removed maps; other ops are ignored. */
  function applyDeltaRow(live, removed, row, feedpath, fpHash) {
    const op = String(row.op);
    const urlHash = String(row.url_hash);
    const at = Number(row.at);
    if (op === "added") {
      // A re-added URL keeps the firstSeenAt of its earlier (live or removed) record.
      const prev = live.get(urlHash) ?? removed.get(urlHash);
      removed.delete(urlHash);
      live.set(urlHash, {
        feedpath,
        feedpathHash: fpHash,
        urlHash,
        loc: String(row.loc),
        lastmod: row.lastmod == null ? void 0 : String(row.lastmod),
        firstSeenAt: prev?.firstSeenAt ?? at,
        lastSeenAt: at
      });
    } else if (op === "removed") {
      const prev = live.get(urlHash);
      live.delete(urlHash);
      if (prev) removed.set(urlHash, {
        ...prev,
        removedAt: at
      });
    }
  }

  /** Replays one feed's index plus its delta files (in sorted key order) and yields the resulting records. */
  async function* replayUrls(ctx, feedpath, loadOpts) {
    const fpHash = hash(feedpath);
    const includeRemoved = loadOpts?.includeRemoved ?? false;
    const indexBytes = await readOptional(ds, sitemapUrlsIndexKey(ctx, fpHash));
    const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
    const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
    const { live, removed } = partitionIndexRows(indexRows);
    for (const key of deltaKeys) {
      const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
      if (!m || m[2] !== fpHash) continue;
      const dBytes = await readOptional(ds, key);
      if (!dBytes) continue;
      const dRows = await decodeParquetToRows(dBytes);
      for (const r of dRows) applyDeltaRow(live, removed, r, feedpath, fpHash);
    }
    for (const rec of live.values()) yield rec;
    if (includeRemoved) for (const rec of removed.values()) yield rec;
  }

  return {
    /**
     * Stores the latest record per sitemap path in the JSON index and writes
     * one history document per record, all stamped with a single `now()`.
     *
     * @param {object} ctx - Key scope.
     * @param {Array<object>} records - Records carrying `path` and `capturedAt`.
     */
    async writeSnapshot(ctx, records) {
      if (records.length === 0) return;
      const indexKey = sitemapIndexKey(ctx);
      const index = await readJson(indexKey) ?? {
        version: 1,
        records: {}
      };
      const stamp = now();
      for (const r of records) {
        const h = hash(r.path);
        index.records[h] = r;
        await writeJson(sitemapHistoryKey(ctx, h, stamp), {
          version: 1,
          path: r.path,
          capturedAt: r.capturedAt,
          record: r
        });
      }
      await writeJson(indexKey, index);
    },

    /** Loads the sitemap JSON index, or an empty `{ version: 1, records: {} }` when none exists. */
    async loadIndex(ctx) {
      return await readJson(sitemapIndexKey(ctx)) ?? {
        version: 1,
        records: {}
      };
    },

    /** Returns the indexed record for `path`, or `undefined` when absent. */
    async getLatest(ctx, path) {
      return (await readJson(sitemapIndexKey(ctx)))?.records[hash(path)];
    },

    /**
     * Diffs `urls` against the feed's current membership and appends the
     * resulting "added" / "removed" rows to the feed's delta file for today.
     *
     * The delta key is per (UTC date, feed), so when a delta for today already
     * exists its rows are kept and the new rows are appended after them.
     * Overwriting the file would discard the earlier changes that the current
     * diff was computed on top of.
     *
     * @param {object} ctx - Key scope.
     * @param {string} feedpath - Sitemap feed path.
     * @param {Array<{loc: string, lastmod?: string}>} urls - Current URL set.
     * @returns {Promise<{added: number, removed: number, kept: number, contentHash: string, unchanged: boolean}>}
     */
    async snapshotUrls(ctx, feedpath, urls) {
      const fpHash = hash(feedpath);
      const contentHash = hashUrlList(urls);
      const at = now();
      const priorByHash = /* @__PURE__ */ new Map();
      // Uses the closure helper rather than `this.loadUrls` so the method
      // still works when detached from the store object.
      for await (const rec of replayUrls(ctx, feedpath, { includeRemoved: true })) priorByHash.set(rec.urlHash, rec);
      const livePrior = Array.from(priorByHash.values()).filter((r) => r.removedAt == null);
      if (livePrior.length > 0) {
        if (hashUrl(livePrior.map((r) => String(r.loc)).sort().join("\n")) === contentHash) return {
          added: 0,
          removed: 0,
          kept: livePrior.length,
          contentHash,
          unchanged: true
        };
      }
      const incomingByHash = /* @__PURE__ */ new Map();
      for (const u of urls) incomingByHash.set(hash(u.loc), u);
      const deltaRows = [];
      let added = 0;
      let removed = 0;
      let kept = 0;
      const date = isoDate(at);
      for (const [urlHash, u] of incomingByHash) {
        const prev = priorByHash.get(urlHash);
        if (!prev || prev.removedAt != null) {
          added++;
          deltaRows.push({
            feedpath,
            feedpath_hash: fpHash,
            url_hash: urlHash,
            op: "added",
            loc: u.loc,
            lastmod: u.lastmod ?? null,
            at
          });
        } else kept++;
      }
      for (const [urlHash, prev] of priorByHash) {
        if (prev.removedAt != null) continue;
        if (!incomingByHash.has(urlHash)) {
          removed++;
          deltaRows.push({
            feedpath,
            feedpath_hash: fpHash,
            url_hash: urlHash,
            op: "removed",
            loc: prev.loc,
            lastmod: prev.lastmod ?? null,
            at
          });
        }
      }
      if (deltaRows.length > 0) {
        const deltaKey = sitemapUrlsDeltaKey(ctx, fpHash, date);
        const existingBytes = await readOptional(ds, deltaKey);
        // Earlier rows go first so that, per URL, replay order stays chronological.
        const rows = existingBytes ? [...(await decodeParquetToRows(existingBytes)).map(toDeltaRow), ...deltaRows] : deltaRows;
        const bytes = encodeRowsToParquetFlex(rows, {
          columns: URLS_DELTA_COLUMNS,
          sortKey: ["url_hash"]
        });
        await ds.write(deltaKey, bytes);
      }
      return {
        added,
        removed,
        kept,
        contentHash,
        unchanged: false
      };
    },

    /**
     * Yields the feed's URL records: live ones first, then (when
     * `opts.includeRemoved` is true) removed ones.
     *
     * @param {object} ctx - Key scope.
     * @param {string} feedpath - Sitemap feed path.
     * @param {{includeRemoved?: boolean}} [opts]
     */
    async *loadUrls(ctx, feedpath, opts) {
      yield* replayUrls(ctx, feedpath, opts);
    },

    /**
     * Yields every "added" / "removed" delta row across all feeds, in sorted
     * key order, optionally restricted to an inclusive date range.
     *
     * @param {object} ctx - Key scope.
     * @param {{from?: string, to?: string}} [dateRange] - Bounds compared as
     *   strings against the date embedded in each delta key.
     */
    async *loadDeltas(ctx, dateRange) {
      const from = dateRange?.from;
      const to = dateRange?.to;
      const keys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
      for (const key of keys) {
        const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
        if (!m) continue;
        const date = m[1];
        if (from && date < from) continue;
        if (to && date > to) continue;
        const bytes = await readOptional(ds, key);
        if (!bytes) continue;
        const rows = await decodeParquetToRows(bytes);
        for (const r of rows) {
          const op = String(r.op);
          if (op !== "added" && op !== "removed") continue;
          yield {
            feedpath: String(r.feedpath),
            feedpathHash: String(r.feedpath_hash),
            urlHash: String(r.url_hash),
            op,
            loc: String(r.loc),
            lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
            at: Number(r.at)
          };
        }
      }
    },

    /**
     * Folds every feed's delta files into that feed's index file, then
     * deletes the delta files that were read.
     *
     * @param {object} ctx - Key scope.
     */
    async compactUrls(ctx) {
      const deltaKeys = await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`);
      const deltasByFeed = /* @__PURE__ */ new Map();
      for (const key of deltaKeys) {
        const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
        if (!m) continue;
        const list = deltasByFeed.get(m[2]) ?? [];
        list.push(key);
        deltasByFeed.set(m[2], list);
      }
      for (const [fpHash, feedDeltaKeys] of deltasByFeed) {
        const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
        const indexBytes = await readOptional(ds, indexKey);
        const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
        const { live, removed } = partitionIndexRows(indexRows);
        const consumed = [];
        for (const key of feedDeltaKeys.sort()) {
          const bytes = await readOptional(ds, key);
          if (!bytes) continue;
          consumed.push(key);
          const rows = await decodeParquetToRows(bytes);
          // The feed path is not known here, so it is taken from each row.
          for (const r of rows) applyDeltaRow(live, removed, r, String(r.feedpath), fpHash);
        }
        const merged = [...live.values(), ...removed.values()];
        merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
        const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
          columns: URLS_INDEX_COLUMNS,
          sortKey: ["feedpath_hash", "url_hash"]
        });
        await ds.write(indexKey, bytes);
        if (consumed.length > 0) await ds.delete(consumed);
      }
    }
  };
}
580
/**
 * Builds the storage key of the indexing-metadata JSON index.
 *
 * @param {{userId: *, siteId?: *}} ctx - Scope; a truthy `siteId` adds a site segment.
 * @returns {string} The key ending in `entities/indexing/index.json`.
 */
function indexingMetadataIndexKey(ctx) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/indexing/index.json`;
  }
  return `u_${ctx.userId}/entities/indexing/index.json`;
}
583
/**
 * Creates the store that keeps the latest indexing-metadata record per URL in
 * a single JSON index document.
 *
 * @param {{dataSource: object, hash?: Function}} opts - `dataSource` must
 *   provide `write`; reads go through `readOptional`. `hash` defaults to `hashUrl`.
 * @returns {{writeBatch: Function, loadIndex: Function, getLatest: Function}}
 */
function createIndexingMetadataStore(opts) {
  const { dataSource: ds } = opts;
  const hash = opts.hash ?? hashUrl;

  /** Loads the index at `key`, or a fresh empty index when the key is absent. */
  const readIndex = async (key) => {
    const raw = await readOptional(ds, key);
    if (raw !== void 0) return JSON.parse(new TextDecoder().decode(raw));
    return {
      version: 1,
      records: {}
    };
  };

  /** Upserts every record (keyed by the hash of its `url`) and rewrites the index. */
  const writeBatch = async (ctx, records) => {
    if (records.length === 0) return;
    const key = indexingMetadataIndexKey(ctx);
    const index = await readIndex(key);
    for (const entry of records) {
      index.records[hash(entry.url)] = entry;
    }
    const payload = new TextEncoder().encode(JSON.stringify(index));
    await ds.write(key, payload);
  };

  /** Returns the whole index document. */
  const loadIndex = async (ctx) => readIndex(indexingMetadataIndexKey(ctx));

  /** Returns the stored record for `url`, or `undefined` when there is none. */
  const getLatest = async (ctx, url) => {
    const index = await readIndex(indexingMetadataIndexKey(ctx));
    return index.records[hash(url)];
  };

  return { writeBatch, loadIndex, getLatest };
}
610
/**
 * Creates the store that tracks which types are marked "empty", together with
 * the time each was first marked, in a single JSON document.
 *
 * @param {{dataSource: object, now?: Function}} opts - `dataSource` must
 *   provide `write`; reads go through `readOptional`. `now` defaults to `Date.now`.
 * @returns {{load: Function, mark: Function, clear: Function}}
 */
function createEmptyTypesStore(opts) {
  const ds = opts.dataSource;
  const now = opts.now ?? (() => Date.now());

  /** Loads the document at `key`, or a fresh empty document when the key is absent. */
  async function readDoc(key) {
    const bytes = await readOptional(ds, key);
    if (bytes === void 0) return {
      version: 1,
      emptyTypes: [],
      markedAt: {}
    };
    return JSON.parse(new TextDecoder().decode(bytes));
  }

  /** Serializes `doc` as JSON and writes it to `key`. */
  async function writeDoc(key, doc) {
    await ds.write(key, new TextEncoder().encode(JSON.stringify(doc)));
  }

  return {
    /** Returns the current document. */
    async load(ctx) {
      return readDoc(emptyTypesKey(ctx));
    },

    /**
     * Adds `types` to the empty list and stamps any that have no timestamp
     * yet. Existing timestamps are kept. Writes only when something changed.
     *
     * @param {object} ctx - Key scope.
     * @param {Array<string>} types - Types to mark.
     * @param {number} [at] - Timestamp to record; defaults to `now()`.
     * @returns {Promise<object>} The resulting document.
     */
    async mark(ctx, types, at) {
      if (types.length === 0) return readDoc(emptyTypesKey(ctx));
      const key = emptyTypesKey(ctx);
      const doc = await readDoc(key);
      const stamp = at ?? now();
      let changed = false;
      for (const t of types) {
        if (!doc.emptyTypes.includes(t)) {
          doc.emptyTypes.push(t);
          changed = true;
        }
        if (doc.markedAt[t] === void 0) {
          doc.markedAt[t] = stamp;
          changed = true;
        }
      }
      if (changed) {
        doc.emptyTypes.sort();
        await writeDoc(key, doc);
      }
      return doc;
    },

    /**
     * Removes `types` from the empty list and drops their timestamps.
     *
     * The document is rewritten whenever either the list or the timestamp map
     * changed, so a timestamp without a matching list entry is removed from
     * storage too and the returned document matches what is persisted.
     *
     * @param {object} ctx - Key scope.
     * @param {Array<string>} types - Types to clear.
     * @returns {Promise<object>} The resulting document.
     */
    async clear(ctx, types) {
      if (types.length === 0) return readDoc(emptyTypesKey(ctx));
      const key = emptyTypesKey(ctx);
      const doc = await readDoc(key);
      const drop = new Set(types);
      const before = doc.emptyTypes.length;
      doc.emptyTypes = doc.emptyTypes.filter((t) => !drop.has(t));
      let changed = doc.emptyTypes.length !== before;
      for (const t of drop) {
        if (Object.prototype.hasOwnProperty.call(doc.markedAt, t)) {
          delete doc.markedAt[t];
          changed = true;
        }
      }
      if (changed) await writeDoc(key, doc);
      return doc;
    }
  };
}
664
+ export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };