@gscdump/engine 0.18.4 → 0.18.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/entities.d.mts +66 -26
- package/dist/entities.mjs +45 -63
- package/package.json +3 -3
package/dist/entities.d.mts
CHANGED
|
@@ -47,49 +47,95 @@ interface InspectionIndex {
|
|
|
47
47
|
/** Map of urlHash → InspectionRecord (latest only). */
|
|
48
48
|
records: Record<string, InspectionRecord>;
|
|
49
49
|
}
|
|
50
|
+
/**
|
|
51
|
+
* Append-only history shard, one blob per `appendHistory` call.
|
|
52
|
+
* Keyed by UUID under the month directory — retries write a new blob,
|
|
53
|
+
* never RMW an existing one. Idempotent under job retries.
|
|
54
|
+
*/
|
|
50
55
|
interface InspectionHistoryShard {
|
|
51
56
|
version: 1;
|
|
52
|
-
/**
|
|
57
|
+
/** Records persisted in this batch. */
|
|
53
58
|
records: InspectionRecord[];
|
|
54
59
|
}
|
|
55
60
|
declare function inspectionIndexKey(ctx: TenantCtx): string;
|
|
56
61
|
declare function emptyTypesKey(ctx: TenantCtx): string;
|
|
57
62
|
declare function inspectionParquetKey(ctx: TenantCtx): string;
|
|
58
|
-
|
|
63
|
+
/**
|
|
64
|
+
* Directory prefix for a month's history shards. Each shard is a UUID-keyed
|
|
65
|
+
* blob under this prefix; `appendHistory` writes one per call, `loadHistory`
|
|
66
|
+
* lists + concatenates.
|
|
67
|
+
*/
|
|
68
|
+
declare function inspectionHistoryPrefix(ctx: TenantCtx, yearMonth: string): string;
|
|
69
|
+
declare function inspectionHistoryShardKey(ctx: TenantCtx, yearMonth: string, batchId: string): string;
|
|
59
70
|
/**
|
|
60
71
|
* Stable URL hash used as the index key. Short, URL-safe, deterministic.
|
|
61
72
|
* Uses a 64-bit FNV-1a; collisions vanishingly unlikely at the scales we
|
|
62
73
|
* care about (≤100k URLs/site).
|
|
63
74
|
*/
|
|
64
75
|
declare function hashUrl(url: string): string;
|
|
76
|
+
/**
|
|
77
|
+
* Row shape for the inspections parquet sidecar. Caller-side schema for
|
|
78
|
+
* `materialize` — D1 is the source of truth in the 2026-05-19 redesign, so
|
|
79
|
+
* consumers stream rows from `url_indexing_status` and pass them in. The
|
|
80
|
+
* parquet sidecar exists for DuckDB JOIN seams; readers go through
|
|
81
|
+
* `parquetUri`.
|
|
82
|
+
*/
|
|
83
|
+
interface InspectionParquetRow {
|
|
84
|
+
urlHash: string;
|
|
85
|
+
url: string;
|
|
86
|
+
inspectedAt: string;
|
|
87
|
+
indexStatus: string | null;
|
|
88
|
+
lastCrawlTime: string | null;
|
|
89
|
+
googleCanonical: string | null;
|
|
90
|
+
userCanonical: string | null;
|
|
91
|
+
coverageState: string | null;
|
|
92
|
+
robotsTxtState: string | null;
|
|
93
|
+
indexingState: string | null;
|
|
94
|
+
pageFetchState: string | null;
|
|
95
|
+
mobileUsabilityVerdict: string | null;
|
|
96
|
+
richResultsVerdict: string | null;
|
|
97
|
+
scheduleNextAt: number | null;
|
|
98
|
+
scheduleConsecutiveUnchanged: number | null;
|
|
99
|
+
schedulePolicyVersion: number | null;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Hard cap on a single `appendHistory` shard payload. Encoded bytes >
|
|
103
|
+
* this threshold throws — the caller logs and moves on (D1 is
|
|
104
|
+
* authoritative, R2 history is a sidecar). At `URLS_PER_JOB=3` a real
|
|
105
|
+
* batch encodes to ~10 KB so the cap is purely defensive against future
|
|
106
|
+
* batch-size bumps.
|
|
107
|
+
*/
|
|
108
|
+
declare const INSPECTION_HISTORY_MAX_BYTES: number;
|
|
65
109
|
interface InspectionStore {
|
|
66
110
|
/**
|
|
67
|
-
*
|
|
68
|
-
*
|
|
111
|
+
* Append a batch of fresh inspection results as an immutable per-batch
|
|
112
|
+
* shard under `history/<YYYY-MM>/<batchId>.json`. Idempotent under job
|
|
113
|
+
* retry (caller-supplied UUID per logical batch), no read-before-write,
|
|
114
|
+
* one PUT per month-group within the batch.
|
|
115
|
+
*
|
|
116
|
+
* Throws if the encoded payload exceeds {@link INSPECTION_HISTORY_MAX_BYTES}.
|
|
69
117
|
*/
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
118
|
+
appendHistory: (ctx: TenantCtx, records: readonly InspectionRecord[], opts?: {
|
|
119
|
+
batchId?: string;
|
|
120
|
+
}) => Promise<void>;
|
|
73
121
|
/**
|
|
74
|
-
* Read
|
|
75
|
-
*
|
|
122
|
+
* Read every shard in a month directory and concatenate. Best-effort:
|
|
123
|
+
* shards that fail to decode are skipped (logged via console). Returns
|
|
124
|
+
* `undefined` if the month has no shards.
|
|
76
125
|
*/
|
|
77
|
-
loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
|
|
78
|
-
/** Read the per-month history shard if it exists. */
|
|
79
126
|
loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
|
|
80
127
|
/**
|
|
81
|
-
*
|
|
82
|
-
* `entities/inspections/index.parquet`.
|
|
83
|
-
*
|
|
128
|
+
* Encode caller-provided rows into the inspections parquet sidecar at
|
|
129
|
+
* `entities/inspections/index.parquet`. Sorted by `urlHash` so DuckDB
|
|
130
|
+
* row-group stats can prune URL-keyed JOINs efficiently. One PUT.
|
|
84
131
|
*
|
|
85
|
-
*
|
|
86
|
-
* the parquet
|
|
87
|
-
*
|
|
88
|
-
* JSON for full-index scans / point lookups).
|
|
132
|
+
* D1 is the source of truth in the 2026-05-19 redesign; this rebuilds
|
|
133
|
+
* the parquet from D1 rows the caller streams in (engine has no D1
|
|
134
|
+
* access). Triggered by `indexing/complete` post-hook.
|
|
89
135
|
*
|
|
90
136
|
* Returns the parquet object key (matches {@link parquetUri} after write).
|
|
91
137
|
*/
|
|
92
|
-
materialize: (ctx: TenantCtx) => Promise<{
|
|
138
|
+
materialize: (ctx: TenantCtx, rows: Iterable<InspectionParquetRow>) => Promise<{
|
|
93
139
|
key: string;
|
|
94
140
|
rowCount: number;
|
|
95
141
|
bytes: number;
|
|
@@ -108,12 +154,6 @@ interface InspectionStore {
|
|
|
108
154
|
}
|
|
109
155
|
interface CreateInspectionStoreOptions {
|
|
110
156
|
dataSource: DataSource;
|
|
111
|
-
/**
|
|
112
|
-
* Override the FNV hash with a callable (test seam, or to swap in
|
|
113
|
-
* SHA-256 if hash collisions become a concern at extreme scale).
|
|
114
|
-
*/
|
|
115
|
-
hash?: (url: string) => string;
|
|
116
|
-
now?: () => number;
|
|
117
157
|
}
|
|
118
158
|
declare function createInspectionStore(opts: CreateInspectionStoreOptions): InspectionStore;
|
|
119
159
|
/** GSC sitemap record we persist. Matches `Schema$WmxSitemap` but as plain JSON. */
|
|
@@ -289,4 +329,4 @@ interface CreateEmptyTypesStoreOptions {
|
|
|
289
329
|
now?: () => number;
|
|
290
330
|
}
|
|
291
331
|
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
292
|
-
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey,
|
|
332
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
package/dist/entities.mjs
CHANGED
|
@@ -9,8 +9,11 @@ function emptyTypesKey(ctx) {
|
|
|
9
9
|
function inspectionParquetKey(ctx) {
|
|
10
10
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet` : `u_${ctx.userId}/entities/inspections/index.parquet`;
|
|
11
11
|
}
|
|
12
|
-
function
|
|
13
|
-
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}
|
|
12
|
+
function inspectionHistoryPrefix(ctx, yearMonth) {
|
|
13
|
+
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}`;
|
|
14
|
+
}
|
|
15
|
+
function inspectionHistoryShardKey(ctx, yearMonth, batchId) {
|
|
16
|
+
return `${inspectionHistoryPrefix(ctx, yearMonth)}/${batchId}.json`;
|
|
14
17
|
}
|
|
15
18
|
function hashUrl(url) {
|
|
16
19
|
let hi = 2166136261;
|
|
@@ -26,6 +29,7 @@ function hashUrl(url) {
|
|
|
26
29
|
}
|
|
27
30
|
return (hi >>> 0).toString(16).padStart(8, "0") + (lo >>> 0).toString(16).padStart(8, "0");
|
|
28
31
|
}
|
|
32
|
+
const INSPECTION_HISTORY_MAX_BYTES = 5 * 1024 * 1024;
|
|
29
33
|
const INSPECTION_PARQUET_COLUMNS = [
|
|
30
34
|
{
|
|
31
35
|
name: "urlHash",
|
|
@@ -109,79 +113,57 @@ const INSPECTION_PARQUET_COLUMNS = [
|
|
|
109
113
|
}
|
|
110
114
|
];
|
|
111
115
|
function createInspectionStore(opts) {
|
|
112
|
-
const hash = opts.hash ?? hashUrl;
|
|
113
116
|
const ds = opts.dataSource;
|
|
114
|
-
async function readJson(key) {
|
|
115
|
-
return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => void 0);
|
|
116
|
-
}
|
|
117
|
-
async function writeJson(key, value) {
|
|
118
|
-
await ds.write(key, new TextEncoder().encode(JSON.stringify(value)));
|
|
119
|
-
}
|
|
120
|
-
function emptyIndex() {
|
|
121
|
-
return {
|
|
122
|
-
version: 1,
|
|
123
|
-
records: {}
|
|
124
|
-
};
|
|
125
|
-
}
|
|
126
|
-
function emptyShard() {
|
|
127
|
-
return {
|
|
128
|
-
version: 1,
|
|
129
|
-
records: []
|
|
130
|
-
};
|
|
131
|
-
}
|
|
132
117
|
function shardFor(record) {
|
|
133
118
|
const m = YEAR_MONTH_RE.exec(record.inspectedAt);
|
|
134
119
|
return m ? `${m[1]}-${m[2]}` : "unknown";
|
|
135
120
|
}
|
|
121
|
+
function randomBatchId() {
|
|
122
|
+
return typeof crypto !== "undefined" && "randomUUID" in crypto ? crypto.randomUUID() : `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
|
|
123
|
+
}
|
|
136
124
|
return {
|
|
137
|
-
async
|
|
125
|
+
async appendHistory(ctx, records, options) {
|
|
138
126
|
if (records.length === 0) return;
|
|
139
|
-
const
|
|
140
|
-
const
|
|
141
|
-
const byShard = /* @__PURE__ */ new Map();
|
|
127
|
+
const batchId = options?.batchId ?? randomBatchId();
|
|
128
|
+
const byMonth = /* @__PURE__ */ new Map();
|
|
142
129
|
for (const r of records) {
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
byShard.get(shardKey).push(r);
|
|
130
|
+
const month = shardFor(r);
|
|
131
|
+
if (!byMonth.has(month)) byMonth.set(month, []);
|
|
132
|
+
byMonth.get(month).push(r);
|
|
147
133
|
}
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
134
|
+
for (const [yearMonth, batch] of byMonth) {
|
|
135
|
+
const shard = {
|
|
136
|
+
version: 1,
|
|
137
|
+
records: batch
|
|
138
|
+
};
|
|
139
|
+
const bytes = new TextEncoder().encode(JSON.stringify(shard));
|
|
140
|
+
if (bytes.byteLength > 5242880) throw new Error(`inspection history shard exceeds ${INSPECTION_HISTORY_MAX_BYTES} bytes (got ${bytes.byteLength}); split the batch`);
|
|
141
|
+
await ds.write(inspectionHistoryShardKey(ctx, yearMonth, batchId), bytes);
|
|
154
142
|
}
|
|
155
143
|
},
|
|
156
|
-
async getLatest(ctx, url) {
|
|
157
|
-
return (await readJson(inspectionIndexKey(ctx)))?.records[hash(url)];
|
|
158
|
-
},
|
|
159
|
-
async loadIndex(ctx) {
|
|
160
|
-
return await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
|
|
161
|
-
},
|
|
162
144
|
async loadHistory(ctx, yearMonth) {
|
|
163
|
-
|
|
145
|
+
const keys = await ds.list(inspectionHistoryPrefix(ctx, yearMonth));
|
|
146
|
+
if (keys.length === 0) return void 0;
|
|
147
|
+
const out = [];
|
|
148
|
+
for (const key of keys) {
|
|
149
|
+
const bytes = await ds.read(key).catch(() => void 0);
|
|
150
|
+
if (!bytes) continue;
|
|
151
|
+
const shard = await Promise.resolve().then(() => JSON.parse(new TextDecoder().decode(bytes))).catch((err) => {
|
|
152
|
+
console.warn("[inspection.loadHistory] failed to decode shard", {
|
|
153
|
+
key,
|
|
154
|
+
error: err.message
|
|
155
|
+
});
|
|
156
|
+
});
|
|
157
|
+
if (shard?.records) out.push(...shard.records);
|
|
158
|
+
}
|
|
159
|
+
return {
|
|
160
|
+
version: 1,
|
|
161
|
+
records: out
|
|
162
|
+
};
|
|
164
163
|
},
|
|
165
|
-
async materialize(ctx) {
|
|
166
|
-
const
|
|
167
|
-
|
|
168
|
-
urlHash,
|
|
169
|
-
url: r.url,
|
|
170
|
-
inspectedAt: r.inspectedAt,
|
|
171
|
-
indexStatus: r.indexStatus ?? null,
|
|
172
|
-
lastCrawlTime: r.lastCrawlTime ?? null,
|
|
173
|
-
googleCanonical: r.googleCanonical ?? null,
|
|
174
|
-
userCanonical: r.userCanonical ?? null,
|
|
175
|
-
coverageState: r.coverageState ?? null,
|
|
176
|
-
robotsTxtState: r.robotsTxtState ?? null,
|
|
177
|
-
indexingState: r.indexingState ?? null,
|
|
178
|
-
pageFetchState: r.pageFetchState ?? null,
|
|
179
|
-
mobileUsabilityVerdict: r.mobileUsabilityVerdict ?? null,
|
|
180
|
-
richResultsVerdict: r.richResultsVerdict ?? null,
|
|
181
|
-
scheduleNextAt: r.raw?.schedule?.nextAt ?? null,
|
|
182
|
-
scheduleConsecutiveUnchanged: r.raw?.schedule?.consecutiveUnchanged ?? null,
|
|
183
|
-
schedulePolicyVersion: r.raw?.schedule?.policyVersion ?? null
|
|
184
|
-
}));
|
|
164
|
+
async materialize(ctx, rowIter) {
|
|
165
|
+
const rows = Array.from(rowIter);
|
|
166
|
+
rows.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
|
|
185
167
|
const bytes = encodeRowsToParquetFlex(rows, {
|
|
186
168
|
columns: INSPECTION_PARQUET_COLUMNS,
|
|
187
169
|
sortKey: ["urlHash"]
|
|
@@ -658,4 +640,4 @@ function createEmptyTypesStore(opts) {
|
|
|
658
640
|
}
|
|
659
641
|
};
|
|
660
642
|
}
|
|
661
|
-
export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey,
|
|
643
|
+
export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.18.4",
|
|
4
|
+
"version": "0.18.5",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -169,8 +169,8 @@
|
|
|
169
169
|
"dependencies": {
|
|
170
170
|
"drizzle-orm": "^0.45.2",
|
|
171
171
|
"proper-lockfile": "^4.1.2",
|
|
172
|
-
"gscdump": "0.18.4",
|
|
173
|
-
"
|
|
172
|
+
"@gscdump/contracts": "0.18.5",
|
|
173
|
+
"gscdump": "0.18.5"
|
|
174
174
|
},
|
|
175
175
|
"devDependencies": {
|
|
176
176
|
"@duckdb/duckdb-wasm": "^1.32.0",
|