@gscdump/engine 0.25.13 → 0.26.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/dispatch.mjs +12 -2
- package/dist/_chunks/entities.mjs +664 -0
- package/dist/_chunks/errors.d.mts +117 -0
- package/dist/_chunks/index.d.mts +2 -0
- package/dist/_chunks/registry.d.mts +1 -1
- package/dist/_chunks/sink.d.mts +23 -7
- package/dist/_chunks/source.mjs +5 -1
- package/dist/_chunks/storage.d.mts +1 -1
- package/dist/adapters/filesystem.mjs +3 -1
- package/dist/adapters/node.d.mts +12 -2
- package/dist/adapters/node.mjs +25 -9
- package/dist/adapters/r2-manifest.mjs +12 -7
- package/dist/analyzer/index.d.mts +2 -0
- package/dist/entities.mjs +1 -640
- package/dist/errors.d.mts +2 -0
- package/dist/errors.mjs +202 -0
- package/dist/iceberg/index.mjs +14 -14
- package/dist/index.d.mts +10 -2
- package/dist/index.mjs +11 -6
- package/dist/resolver/index.d.mts +1 -1
- package/dist/rollups.d.mts +5 -3
- package/dist/rollups.mjs +5 -5
- package/dist/sql-bind.d.mts +15 -1
- package/dist/sql-bind.mjs +32 -20
- package/package.json +9 -4
package/dist/entities.mjs
CHANGED
|
@@ -1,641 +1,2 @@
|
|
|
1
|
-
import {
|
|
2
|
-
// Matches a leading `YYYY-MM-` prefix; group 1 captures the four-digit year, group 2 the two-digit month.
const YEAR_MONTH_RE = /^(\d{4})-(\d{2})-/;
|
|
3
|
-
/**
 * Build the storage key of the JSON inspection index.
 *
 * The key is nested under the site when `ctx.siteId` is truthy, otherwise it
 * sits directly under the user.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to.
 * @returns {string} The index key.
 */
function inspectionIndexKey(ctx) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/inspections/index.json`;
}
|
|
6
|
-
/**
 * Build the storage key of the empty-types document.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to;
 *   a truthy `siteId` selects the site-level key.
 * @returns {string} The document key.
 */
function emptyTypesKey(ctx) {
  const leaf = "entities/empty-types.json";
  if (ctx.siteId) return `u_${ctx.userId}/${ctx.siteId}/${leaf}`;
  return `u_${ctx.userId}/${leaf}`;
}
|
|
9
|
-
/**
 * Build the storage key of the materialized inspection Parquet file.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to;
 *   a truthy `siteId` selects the site-level key.
 * @returns {string} The Parquet key.
 */
function inspectionParquetKey(ctx) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/inspections/index.parquet`;
}
|
|
12
|
-
/**
 * Build the key prefix under which the history shards of one month are stored.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the prefix belongs to;
 *   a truthy `siteId` selects the site-level prefix.
 * @param {string} yearMonth - Shard bucket appended verbatim as the last path segment.
 * @returns {string} The prefix, without a trailing slash.
 */
function inspectionHistoryPrefix(ctx, yearMonth) {
  if (ctx.siteId) {
    return `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}`;
  }
  return `u_${ctx.userId}/entities/inspections/history/${yearMonth}`;
}
|
|
15
|
-
/**
 * Build the storage key of a single inspection-history shard.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to.
 * @param {string} yearMonth - Month bucket of the shard.
 * @param {string} batchId - Identifier used as the shard's file name.
 * @returns {string} `<history prefix>/<batchId>.json`.
 */
function inspectionHistoryShardKey(ctx, yearMonth, batchId) {
  const prefix = inspectionHistoryPrefix(ctx, yearMonth);
  return `${prefix}/${batchId}.json`;
}
|
|
18
|
-
/**
 * Hash a string into a 16-character lowercase hexadecimal digest.
 *
 * The state is kept as two 32-bit words. For every UTF-16 code unit the low
 * word is XOR-ed with the unit and the pair is then multiplied by 435, with a
 * carry from the low word folded into the high word.
 *
 * NOTE(review): digests appear to be used as storage-key components elsewhere
 * in this module, so the arithmetic below must stay bit-for-bit stable.
 *
 * @param {string} url - Text to hash.
 * @returns {string} High word then low word, each zero-padded to 8 hex digits.
 */
function hashUrl(url) {
  const MULTIPLIER = 435;
  const WORD_SPAN = 4294967296;
  const toHexWord = (word) => (word >>> 0).toString(16).padStart(8, "0");

  let high = 2166136261;
  let low = 3421674724;
  let position = 0;
  while (position < url.length) {
    // XOR leaves `low` as a *signed* 32-bit value; the carry is deliberately
    // derived from that signed value, so it must not be normalised first.
    low ^= url.charCodeAt(position);
    const carry = Math.floor(low * MULTIPLIER / WORD_SPAN);
    const nextHigh = (Math.imul(high, MULTIPLIER) + Math.imul(low, 1) + carry) >>> 0;
    low = Math.imul(low, MULTIPLIER) >>> 0;
    high = nextHigh;
    position += 1;
  }
  return toHexWord(high) + toHexWord(low);
}
|
|
32
|
-
/** Byte limit quoted in the "inspection history shard exceeds … bytes" error (5 MiB). */
const INSPECTION_HISTORY_MAX_BYTES = 5 * 1024 * 1024;

/**
 * Column layout handed to the Parquet encoder when the inspection index is
 * materialized. Each tuple is `[name, type, nullable]` and is expanded into a
 * `{ name, type, nullable }` descriptor, in this order.
 */
const INSPECTION_PARQUET_COLUMNS = [
  ["urlHash", "VARCHAR", false],
  ["url", "VARCHAR", false],
  ["inspectedAt", "VARCHAR", false],
  ["indexStatus", "VARCHAR", true],
  ["lastCrawlTime", "VARCHAR", true],
  ["googleCanonical", "VARCHAR", true],
  ["userCanonical", "VARCHAR", true],
  ["coverageState", "VARCHAR", true],
  ["robotsTxtState", "VARCHAR", true],
  ["indexingState", "VARCHAR", true],
  ["pageFetchState", "VARCHAR", true],
  ["mobileUsabilityVerdict", "VARCHAR", true],
  ["richResultsVerdict", "VARCHAR", true],
  ["scheduleNextAt", "BIGINT", true],
  ["scheduleConsecutiveUnchanged", "INTEGER", true],
  ["schedulePolicyVersion", "INTEGER", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
|
|
115
|
-
/**
 * Create the inspection store on top of a key/bytes data source.
 *
 * @param {object} opts
 * @param {object} opts.dataSource - Backing storage. The store calls
 *   `write(key, bytes)`, `list(prefix)` and `read(key)`; `uri(key)` is optional.
 * @returns {{
 *   appendHistory: (ctx: object, records: object[], options?: { batchId?: string }) => Promise<void>,
 *   loadHistory: (ctx: object, yearMonth: string) => Promise<{ version: 1, records: object[] } | undefined>,
 *   materialize: (ctx: object, rowIter: Iterable<object>) => Promise<{ key: string, rowCount: number, bytes: number }>,
 *   parquetUri: (ctx: object) => unknown
 * }}
 */
function createInspectionStore(opts) {
  const ds = opts.dataSource;

  /** Bucket a record by the `YYYY-MM` prefix of `inspectedAt`, or "unknown" when it has none. */
  function shardFor(record) {
    const m = YEAR_MONTH_RE.exec(record.inspectedAt);
    return m ? `${m[1]}-${m[2]}` : "unknown";
  }

  /** Generate a batch id: a UUID when `crypto.randomUUID` exists, else a time+random fallback. */
  function randomBatchId() {
    return typeof crypto !== "undefined" && "randomUUID" in crypto ? crypto.randomUUID() : `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
  }

  return {
    /**
     * Append records to history, one JSON shard per month bucket.
     *
     * Every shard is encoded and size-checked before the first write, so an
     * oversized shard rejects the whole call without leaving a partial batch.
     *
     * @throws {Error} When any encoded shard exceeds `INSPECTION_HISTORY_MAX_BYTES`.
     */
    async appendHistory(ctx, records, options) {
      if (records.length === 0) return;
      const batchId = options?.batchId ?? randomBatchId();

      const byMonth = new Map();
      for (const record of records) {
        const month = shardFor(record);
        const bucket = byMonth.get(month);
        if (bucket) bucket.push(record);
        else byMonth.set(month, [record]);
      }

      const encoder = new TextEncoder();
      const pending = [];
      for (const [yearMonth, batch] of byMonth) {
        const bytes = encoder.encode(JSON.stringify({
          version: 1,
          records: batch
        }));
        if (bytes.byteLength > INSPECTION_HISTORY_MAX_BYTES) throw new Error(`inspection history shard exceeds ${INSPECTION_HISTORY_MAX_BYTES} bytes (got ${bytes.byteLength}); split the batch`);
        pending.push([inspectionHistoryShardKey(ctx, yearMonth, batchId), bytes]);
      }

      // Writes stay sequential and in bucket order, exactly as before.
      for (const [key, bytes] of pending) await ds.write(key, bytes);
    },

    /**
     * Load and concatenate every shard stored for one month.
     *
     * Shards whose read rejects are skipped silently; shards that fail to
     * decode are skipped with a warning.
     *
     * @returns `undefined` when no shard key is listed for the month.
     */
    async loadHistory(ctx, yearMonth) {
      const keys = await ds.list(inspectionHistoryPrefix(ctx, yearMonth));
      if (keys.length === 0) return void 0;
      const out = [];
      for (const key of keys) {
        const bytes = await ds.read(key).catch(() => void 0);
        if (!bytes) continue;
        let shard;
        try {
          shard = JSON.parse(new TextDecoder().decode(bytes));
        } catch (err) {
          console.warn("[inspection.loadHistory] failed to decode shard", {
            key,
            error: err.message
          });
        }
        if (!shard?.records) continue;
        // Push one by one: spreading a very large array into `push` can exceed
        // the engine's argument limit and throw a RangeError.
        for (const record of shard.records) out.push(record);
      }
      return {
        version: 1,
        records: out
      };
    },

    /**
     * Sort the rows by `urlHash`, encode them as Parquet and write the result
     * to the inspection Parquet key.
     *
     * @returns The written key, the row count and the encoded size in bytes.
     */
    async materialize(ctx, rowIter) {
      const rows = Array.from(rowIter);
      rows.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
      const bytes = encodeRowsToParquetFlex(rows, {
        columns: INSPECTION_PARQUET_COLUMNS,
        sortKey: ["urlHash"]
      });
      const key = inspectionParquetKey(ctx);
      await ds.write(key, bytes);
      return {
        key,
        rowCount: rows.length,
        bytes: bytes.byteLength
      };
    },

    /** Resolve the Parquet key through `ds.uri`; `undefined` when the data source has no `uri`. */
    parquetUri(ctx) {
      return ds.uri?.(inspectionParquetKey(ctx));
    }
  };
}
|
|
184
|
-
/**
 * Build the storage key of the JSON sitemap index.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to;
 *   a truthy `siteId` selects the site-level key.
 * @returns {string} The index key.
 */
function sitemapIndexKey(ctx) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/sitemaps/index.json`;
}
|
|
187
|
-
/**
 * Build the storage key of one sitemap history snapshot.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to;
 *   a truthy `siteId` selects the site-level key.
 * @param {string} feedpathHash - Hash identifying the sitemap.
 * @param {number} capturedAtMs - Timestamp embedded in the file name.
 * @returns {string} `<scope>/entities/sitemaps/history/<hash>__<timestamp>.json`.
 */
function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/sitemaps/history/${feedpathHash}__${capturedAtMs}.json`;
}
|
|
190
|
-
/**
 * Build the key prefix under which sitemap URL data is stored.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the prefix belongs to;
 *   a truthy `siteId` selects the site-level prefix.
 * @returns {string} The prefix, without a trailing slash.
 */
function sitemapUrlsPrefix(ctx) {
  const leaf = "entities/sitemaps/urls";
  if (ctx.siteId) return `u_${ctx.userId}/${ctx.siteId}/${leaf}`;
  return `u_${ctx.userId}/${leaf}`;
}
|
|
193
|
-
/**
 * Build the key prefix that holds the per-feed URL indexes.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the prefix belongs to.
 * @returns {string} `<sitemap urls prefix>/by-feed`.
 */
function sitemapUrlsIndexPrefix(ctx) {
  const urlsPrefix = sitemapUrlsPrefix(ctx);
  return `${urlsPrefix}/by-feed`;
}
|
|
196
|
-
/**
 * Build the storage key of the Parquet URL index of one feed.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to.
 * @param {string} feedpathHash - Hash identifying the feed.
 * @returns {string} `<by-feed prefix>/<feedpathHash>/index.parquet`.
 */
function sitemapUrlsIndexKey(ctx, feedpathHash) {
  const byFeedPrefix = sitemapUrlsIndexPrefix(ctx);
  return `${byFeedPrefix}/${feedpathHash}/index.parquet`;
}
|
|
199
|
-
/**
 * Build the storage key of the URL delta file of one feed for one date.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to.
 * @param {string} feedpathHash - Hash identifying the feed.
 * @param {string} date - Date string placed first in the file name.
 * @returns {string} `<sitemap urls prefix>/deltas/<date>__<feedpathHash>.parquet`.
 */
function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
  const urlsPrefix = sitemapUrlsPrefix(ctx);
  return `${urlsPrefix}/deltas/${date}__${feedpathHash}.parquet`;
}
|
|
202
|
-
// Matches the tail of a URL delta key, `/urls/deltas/<YYYY-MM-DD>__<hex>.parquet`; group 1 captures the date, group 2 the lowercase-hex hash.
const SITEMAP_URLS_DELTA_PREFIX_RE = /\/urls\/deltas\/(\d{4}-\d{2}-\d{2})__([0-9a-f]+)\.parquet$/;
|
|
203
|
-
/**
 * Column layout handed to the Parquet encoder for the compacted per-feed URL
 * index. Each tuple is `[name, type, nullable]` and is expanded into a
 * `{ name, type, nullable }` descriptor, in this order.
 */
const URLS_INDEX_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["first_seen_at", "BIGINT", false],
  ["last_seen_at", "BIGINT", false],
  ["removed_at", "BIGINT", true]
].map(([name, type, nullable]) => ({ name, type, nullable }));
|
|
245
|
-
/**
 * Column layout handed to the Parquet encoder for URL delta files. Each tuple
 * is `[name, type, nullable]` and is expanded into a
 * `{ name, type, nullable }` descriptor, in this order.
 */
const URLS_DELTA_COLUMNS = [
  ["feedpath", "VARCHAR", false],
  ["feedpath_hash", "VARCHAR", false],
  ["url_hash", "VARCHAR", false],
  ["op", "VARCHAR", false],
  ["loc", "VARCHAR", false],
  ["lastmod", "VARCHAR", true],
  ["at", "BIGINT", false]
].map(([name, type, nullable]) => ({ name, type, nullable }));
|
|
282
|
-
/**
 * Convert a snake_case URL index row into a camelCase URL record.
 *
 * Text columns are coerced with `String`, timestamps with `Number`. A
 * `lastmod` or `removed_at` of `null`/`undefined` becomes `undefined`; the
 * property is still present on the result.
 *
 * @param {object} row - Row as decoded from the URL index.
 * @returns {{ feedpath: string, feedpathHash: string, urlHash: string, loc: string,
 *   lastmod: string | undefined, firstSeenAt: number, lastSeenAt: number,
 *   removedAt: number | undefined }}
 */
function rowToUrlRecord(row) {
  const {
    feedpath,
    feedpath_hash: feedpathHash,
    url_hash: urlHash,
    loc,
    lastmod,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    removed_at: removedAt
  } = row;
  const record = {
    feedpath: String(feedpath),
    feedpathHash: String(feedpathHash),
    urlHash: String(urlHash),
    loc: String(loc),
    lastmod: lastmod == null ? void 0 : String(lastmod),
    firstSeenAt: Number(firstSeenAt),
    lastSeenAt: Number(lastSeenAt),
    removedAt: removedAt == null ? void 0 : Number(removedAt)
  };
  return record;
}
|
|
294
|
-
/**
 * Convert a camelCase URL record into a snake_case row for the URL index.
 *
 * A `lastmod` or `removedAt` of `null`/`undefined` is written as `null`;
 * every other value is passed through unchanged.
 *
 * @param {object} r - URL record.
 * @returns {object} Row with the keys `feedpath`, `feedpath_hash`, `url_hash`,
 *   `loc`, `lastmod`, `first_seen_at`, `last_seen_at` and `removed_at`.
 */
function urlRecordToRow(r) {
  const { feedpath, feedpathHash, urlHash, loc, lastmod, firstSeenAt, lastSeenAt, removedAt } = r;
  const row = {
    feedpath,
    feedpath_hash: feedpathHash,
    url_hash: urlHash,
    loc,
    lastmod: lastmod ?? null,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    removed_at: removedAt ?? null
  };
  return row;
}
|
|
306
|
-
/**
 * Return the first ten characters of the UTC ISO-8601 timestamp for an epoch
 * value, i.e. `YYYY-MM-DD` for four-digit years.
 *
 * @param {number} ms - Value passed to the `Date` constructor.
 * @returns {string} The date part of the ISO string.
 * @throws {RangeError} When `ms` does not produce a valid date.
 */
function isoDate(ms) {
  const iso = new Date(ms).toISOString();
  return iso.substring(0, 10);
}
|
|
309
|
-
/**
 * Compute an order-independent content hash of a URL list.
 *
 * The `loc` values are sorted with the default string sort, joined with
 * newlines and passed to `hashUrl`.
 *
 * @param {Array<{ loc: string }>} urls - Entries whose `loc` is hashed.
 * @returns {string} The digest returned by `hashUrl`.
 */
function hashUrlList(urls) {
  const locs = urls.map((entry) => entry.loc);
  locs.sort();
  return hashUrl(locs.join("\n"));
}
|
|
312
|
-
/**
 * Create the sitemap store on top of a key/bytes data source.
 *
 * Sitemap records live in a JSON index plus one JSON history file per write.
 * Sitemap URLs live in a compacted per-feed Parquet index plus dated Parquet
 * delta files holding "added" / "removed" rows; readers replay the deltas on
 * top of the index.
 *
 * @param {object} opts
 * @param {object} opts.dataSource - Backing storage. The store calls
 *   `read(key)`, `write(key, bytes)`, `list(prefix)` and `delete(keys)`.
 * @param {(value: string) => string} [opts.hash] - Hash for feed paths and URLs; defaults to `hashUrl`.
 * @param {() => number} [opts.now] - Clock; defaults to `Date.now`.
 * @returns {object} Store exposing `writeSnapshot`, `loadIndex`, `getLatest`,
 *   `snapshotUrls`, `loadUrls`, `loadDeltas` and `compactUrls`.
 */
function createSitemapStore(opts) {
  const ds = opts.dataSource;
  const hash = opts.hash ?? hashUrl;
  const now = opts.now ?? (() => Date.now());

  /** Comparator ordering URL records by `urlHash` with plain string comparison. */
  const compareByUrlHash = (left, right) => {
    if (left.urlHash < right.urlHash) return -1;
    if (left.urlHash > right.urlHash) return 1;
    return 0;
  };

  /** Read a blob, resolving to `undefined` when the read rejects. */
  const readOrUndefined = (key) => ds.read(key).catch(() => void 0);

  /** Prefix under which every delta file of the scope is listed. */
  const deltasPrefix = (ctx) => `${sitemapUrlsPrefix(ctx)}/deltas/`;

  /** Read and parse a JSON blob; a rejected read yields `undefined`, a parse error propagates. */
  async function readJson(key) {
    const parse = (bytes) => JSON.parse(new TextDecoder().decode(bytes));
    const onReadFailure = () => void 0;
    return await ds.read(key).then(parse, onReadFailure);
  }

  /** Serialize a value as JSON and write it under `key`. */
  async function writeJson(key, value) {
    const payload = new TextEncoder().encode(JSON.stringify(value));
    await ds.write(key, payload);
  }

  /** Split decoded index rows into live and removed records, each keyed by `urlHash`. */
  function splitIndexRows(indexRows) {
    const live = new Map();
    const removed = new Map();
    for (const row of indexRows) {
      const rec = rowToUrlRecord(row);
      const target = rec.removedAt != null ? removed : live;
      target.set(rec.urlHash, rec);
    }
    return { live, removed };
  }

  /**
   * Apply one delta row to the live/removed maps.
   * "added" revives or refreshes a URL and keeps its earliest `firstSeenAt`;
   * "removed" moves a live URL to the removed map; other ops are ignored.
   */
  function replayDelta(row, live, removed, fpHash, feedpathOf) {
    const op = String(row.op);
    const urlHash = String(row.url_hash);
    const at = Number(row.at);
    if (op === "added") {
      const prev = live.get(urlHash) ?? removed.get(urlHash);
      removed.delete(urlHash);
      live.set(urlHash, {
        feedpath: feedpathOf(row),
        feedpathHash: fpHash,
        urlHash,
        loc: String(row.loc),
        lastmod: row.lastmod == null ? void 0 : String(row.lastmod),
        firstSeenAt: prev?.firstSeenAt ?? at,
        lastSeenAt: at
      });
      return;
    }
    if (op === "removed") {
      const prev = live.get(urlHash);
      live.delete(urlHash);
      if (prev) {
        removed.set(urlHash, {
          ...prev,
          removedAt: at
        });
      }
    }
  }

  return {
    /**
     * Record sitemap snapshots: update the index entry of every record and
     * write one history file per record, then persist the index.
     */
    async writeSnapshot(ctx, records) {
      if (records.length === 0) return;
      const indexKey = sitemapIndexKey(ctx);
      const stored = await readJson(indexKey);
      const index = stored ?? {
        version: 1,
        records: {}
      };
      const stamp = now();
      for (const record of records) {
        const pathHash = hash(record.path);
        index.records[pathHash] = record;
        await writeJson(sitemapHistoryKey(ctx, pathHash, stamp), {
          version: 1,
          path: record.path,
          capturedAt: record.capturedAt,
          record
        });
      }
      await writeJson(indexKey, index);
    },

    /** Load the sitemap index, or an empty index when it cannot be read. */
    async loadIndex(ctx) {
      const stored = await readJson(sitemapIndexKey(ctx));
      return stored ?? {
        version: 1,
        records: {}
      };
    },

    /** Look up the latest record stored for a sitemap path; `undefined` when absent. */
    async getLatest(ctx, path) {
      const index = await readJson(sitemapIndexKey(ctx));
      return index?.records[hash(path)];
    },

    /**
     * Diff the incoming URL list of a feed against the stored state and write
     * the resulting "added"/"removed" rows as a dated delta file.
     *
     * @returns Counts of added/removed/kept URLs, the content hash of the
     *   incoming list, and `unchanged: true` when the live URL set is identical.
     */
    async snapshotUrls(ctx, feedpath, urls) {
      const fpHash = hash(feedpath);
      const contentHash = hashUrlList(urls);
      const at = now();

      const priorByHash = new Map();
      for await (const rec of this.loadUrls(ctx, feedpath, { includeRemoved: true })) {
        priorByHash.set(rec.urlHash, rec);
      }

      const livePrior = [];
      for (const rec of priorByHash.values()) {
        if (rec.removedAt == null) livePrior.push(rec);
      }
      if (livePrior.length > 0) {
        const priorLocs = livePrior.map((rec) => String(rec.loc));
        priorLocs.sort();
        if (hashUrl(priorLocs.join("\n")) === contentHash) {
          return {
            added: 0,
            removed: 0,
            kept: livePrior.length,
            contentHash,
            unchanged: true
          };
        }
      }

      const incomingByHash = new Map();
      for (const entry of urls) incomingByHash.set(hash(entry.loc), entry);

      const date = isoDate(at);
      const makeDeltaRow = (op, urlHash, source) => ({
        feedpath,
        feedpath_hash: fpHash,
        url_hash: urlHash,
        op,
        loc: source.loc,
        lastmod: source.lastmod ?? null,
        at
      });

      const deltaRows = [];
      let added = 0;
      let removed = 0;
      let kept = 0;

      for (const [urlHash, entry] of incomingByHash) {
        const prev = priorByHash.get(urlHash);
        const isLive = prev && prev.removedAt == null;
        if (isLive) {
          kept += 1;
          continue;
        }
        added += 1;
        deltaRows.push(makeDeltaRow("added", urlHash, entry));
      }

      for (const [urlHash, prev] of priorByHash) {
        if (prev.removedAt != null || incomingByHash.has(urlHash)) continue;
        removed += 1;
        deltaRows.push(makeDeltaRow("removed", urlHash, prev));
      }

      if (deltaRows.length > 0) {
        const bytes = encodeRowsToParquetFlex(deltaRows, {
          columns: URLS_DELTA_COLUMNS,
          sortKey: ["url_hash"]
        });
        await ds.write(sitemapUrlsDeltaKey(ctx, fpHash, date), bytes);
      }

      return {
        added,
        removed,
        kept,
        contentHash,
        unchanged: false
      };
    },

    /**
     * Yield the URL records of one feed: the compacted index with every delta
     * of that feed replayed in sorted key order. Live records come first;
     * removed records follow only when `opts.includeRemoved` is set.
     */
    async *loadUrls(ctx, feedpath, opts) {
      const fpHash = hash(feedpath);
      const includeRemoved = opts?.includeRemoved ?? false;
      const indexBytes = await readOrUndefined(sitemapUrlsIndexKey(ctx, fpHash));
      const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
      const deltaKeys = (await ds.list(deltasPrefix(ctx))).sort();
      const { live, removed } = splitIndexRows(indexRows);
      const feedpathOf = () => feedpath;

      for (const key of deltaKeys) {
        const match = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
        if (!match || match[2] !== fpHash) continue;
        const deltaBytes = await readOrUndefined(key);
        if (!deltaBytes) continue;
        const deltaRows = await decodeParquetToRows(deltaBytes);
        for (const row of deltaRows) replayDelta(row, live, removed, fpHash, feedpathOf);
      }

      for (const rec of live.values()) yield rec;
      if (includeRemoved) {
        for (const rec of removed.values()) yield rec;
      }
    },

    /**
     * Yield every "added"/"removed" delta row of the scope in sorted key
     * order, optionally restricted to an inclusive `from`/`to` date range.
     */
    async *loadDeltas(ctx, dateRange) {
      const from = dateRange?.from;
      const to = dateRange?.to;
      const keys = (await ds.list(deltasPrefix(ctx))).sort();
      for (const key of keys) {
        const match = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
        if (!match) continue;
        const date = match[1];
        const beforeRange = from && date < from;
        const afterRange = to && date > to;
        if (beforeRange || afterRange) continue;
        const bytes = await readOrUndefined(key);
        if (!bytes) continue;
        const rows = await decodeParquetToRows(bytes);
        for (const row of rows) {
          const op = String(row.op);
          if (op !== "added" && op !== "removed") continue;
          yield {
            feedpath: String(row.feedpath),
            feedpathHash: String(row.feedpath_hash),
            urlHash: String(row.url_hash),
            op,
            loc: String(row.loc),
            lastmod: row.lastmod == null ? void 0 : String(row.lastmod),
            at: Number(row.at)
          };
        }
      }
    },

    /**
     * Fold the delta files of every feed into that feed's Parquet index and
     * delete the delta files that were read.
     */
    async compactUrls(ctx) {
      const deltaKeys = await ds.list(deltasPrefix(ctx));
      const deltasByFeed = new Map();
      for (const key of deltaKeys) {
        const match = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
        if (!match) continue;
        const feedHash = match[2];
        const bucket = deltasByFeed.get(feedHash);
        if (bucket) bucket.push(key);
        else deltasByFeed.set(feedHash, [key]);
      }

      const feedpathOf = (row) => String(row.feedpath);
      for (const [fpHash, feedDeltaKeys] of deltasByFeed) {
        const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
        const indexBytes = await readOrUndefined(indexKey);
        const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
        const { live, removed } = splitIndexRows(indexRows);

        const consumed = [];
        for (const key of feedDeltaKeys.sort()) {
          const deltaBytes = await readOrUndefined(key);
          if (!deltaBytes) continue;
          consumed.push(key);
          const rows = await decodeParquetToRows(deltaBytes);
          for (const row of rows) replayDelta(row, live, removed, fpHash, feedpathOf);
        }

        const merged = [...live.values(), ...removed.values()];
        merged.sort(compareByUrlHash);
        const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
          columns: URLS_INDEX_COLUMNS,
          sortKey: ["feedpath_hash", "url_hash"]
        });
        await ds.write(indexKey, bytes);
        if (consumed.length > 0) await ds.delete(consumed);
      }
    }
  };
}
|
|
561
|
-
/**
 * Build the storage key of the JSON indexing-metadata index.
 *
 * @param {{ userId: unknown, siteId?: unknown }} ctx - Scope the key belongs to;
 *   a truthy `siteId` selects the site-level key.
 * @returns {string} The index key.
 */
function indexingMetadataIndexKey(ctx) {
  const scope = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}` : `u_${ctx.userId}`;
  return `${scope}/entities/indexing/index.json`;
}
|
|
564
|
-
/**
 * Create the indexing-metadata store: a single JSON index whose records are
 * keyed by the hash of their URL.
 *
 * @param {object} opts
 * @param {object} opts.dataSource - Backing storage. The store calls `read(key)` and `write(key, bytes)`.
 * @param {(value: string) => string} [opts.hash] - Hash for URLs; defaults to `hashUrl`.
 * @returns {object} Store exposing `writeBatch`, `loadIndex` and `getLatest`.
 */
function createIndexingMetadataStore(opts) {
  const ds = opts.dataSource;
  const hash = opts.hash ?? hashUrl;

  /** Read the index; a rejected read yields a fresh empty index, a parse error propagates. */
  async function readIndex(key) {
    const parse = (bytes) => JSON.parse(new TextDecoder().decode(bytes));
    const emptyIndex = () => ({
      version: 1,
      records: {}
    });
    return await ds.read(key).then(parse, emptyIndex);
  }

  return {
    /** Upsert records into the index by URL hash and persist it; no-op for an empty batch. */
    async writeBatch(ctx, records) {
      if (records.length === 0) return;
      const key = indexingMetadataIndexKey(ctx);
      const index = await readIndex(key);
      for (const record of records) {
        index.records[hash(record.url)] = record;
      }
      const payload = new TextEncoder().encode(JSON.stringify(index));
      await ds.write(key, payload);
    },

    /** Load the whole index. */
    async loadIndex(ctx) {
      return readIndex(indexingMetadataIndexKey(ctx));
    },

    /** Look up the record stored for a URL; `undefined` when absent. */
    async getLatest(ctx, url) {
      const index = await readIndex(indexingMetadataIndexKey(ctx));
      return index.records[hash(url)];
    }
  };
}
|
|
589
|
-
/**
 * Create the empty-types store: one JSON document holding a sorted list of
 * type names (`emptyTypes`) and the timestamp at which each was first marked
 * (`markedAt`).
 *
 * @param {object} opts
 * @param {object} opts.dataSource - Backing storage. The store calls `read(key)` and `write(key, bytes)`.
 * @param {() => number} [opts.now] - Clock; defaults to `Date.now`.
 * @returns {object} Store exposing `load`, `mark` and `clear`.
 */
function createEmptyTypesStore(opts) {
  const ds = opts.dataSource;
  const now = opts.now ?? (() => Date.now());

  /** Read the document; a rejected read yields a fresh empty document, a parse error propagates. */
  async function readDoc(key) {
    return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => ({
      version: 1,
      emptyTypes: [],
      markedAt: {}
    }));
  }

  /** Serialize the document as JSON and write it under `key`. */
  async function writeDoc(key, doc) {
    await ds.write(key, new TextEncoder().encode(JSON.stringify(doc)));
  }

  return {
    /** Load the current document. */
    async load(ctx) {
      return readDoc(emptyTypesKey(ctx));
    },

    /**
     * Mark types as empty. A type that is already listed keeps its existing
     * timestamp. The document is written only when something changed.
     *
     * @param {object} ctx - Scope of the document.
     * @param {string[]} types - Types to mark.
     * @param {number} [at] - Timestamp for newly marked types; defaults to `now()`.
     * @returns The resulting document.
     */
    async mark(ctx, types, at) {
      const key = emptyTypesKey(ctx);
      if (types.length === 0) return readDoc(key);
      const doc = await readDoc(key);
      const stamp = at ?? now();
      let changed = false;
      for (const t of types) {
        if (!doc.emptyTypes.includes(t)) {
          doc.emptyTypes.push(t);
          changed = true;
        }
        if (doc.markedAt[t] === void 0) {
          doc.markedAt[t] = stamp;
          changed = true;
        }
      }
      if (changed) {
        doc.emptyTypes.sort();
        await writeDoc(key, doc);
      }
      return doc;
    },

    /**
     * Remove types from the list and drop their timestamps.
     *
     * The document is written when either the list or the timestamps changed,
     * so a timestamp without a matching list entry is also removed from
     * storage (mirroring how `mark` tracks both fields).
     *
     * @param {object} ctx - Scope of the document.
     * @param {string[]} types - Types to clear.
     * @returns The resulting document.
     */
    async clear(ctx, types) {
      const key = emptyTypesKey(ctx);
      if (types.length === 0) return readDoc(key);
      const doc = await readDoc(key);
      const drop = new Set(types);
      const remaining = doc.emptyTypes.filter((t) => !drop.has(t));
      let changed = remaining.length !== doc.emptyTypes.length;
      doc.emptyTypes = remaining;
      for (const t of drop) {
        // Own-property check so inherited names such as "constructor" never count as a change.
        if (Object.prototype.hasOwnProperty.call(doc.markedAt, t)) {
          delete doc.markedAt[t];
          changed = true;
        }
      }
      if (changed) await writeDoc(key, doc);
      return doc;
    }
  };
}
|
|
1
|
+
import { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
|
|
641
2
|
export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|