@gscdump/engine 0.18.4 → 0.18.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -47,49 +47,95 @@ interface InspectionIndex {
47
47
  /** Map of urlHash → InspectionRecord (latest only). */
48
48
  records: Record<string, InspectionRecord>;
49
49
  }
50
+ /**
51
+ * Append-only history shard, one blob per `appendHistory` call.
52
+ * Keyed by UUID under the month directory — retries write a new blob,
53
+ * never RMW an existing one. Idempotent under job retries.
54
+ */
50
55
  interface InspectionHistoryShard {
51
56
  version: 1;
52
- /** Append-only list of inspection records for the YYYY-MM bucket. */
57
+ /** Records persisted in this batch. */
53
58
  records: InspectionRecord[];
54
59
  }
55
60
  declare function inspectionIndexKey(ctx: TenantCtx): string;
56
61
  declare function emptyTypesKey(ctx: TenantCtx): string;
57
62
  declare function inspectionParquetKey(ctx: TenantCtx): string;
58
- declare function inspectionHistoryKey(ctx: TenantCtx, yearMonth: string): string;
63
+ /**
64
+ * Directory prefix for a month's history shards. Each shard is a UUID-keyed
65
+ * blob under this prefix; `appendHistory` writes one per call, `loadHistory`
66
+ * lists + concatenates.
67
+ */
68
+ declare function inspectionHistoryPrefix(ctx: TenantCtx, yearMonth: string): string;
69
+ declare function inspectionHistoryShardKey(ctx: TenantCtx, yearMonth: string, batchId: string): string;
59
70
  /**
60
71
  * Stable URL hash used as the index key. Short, URL-safe, deterministic.
61
72
  * Uses a 64-bit FNV-1a; collisions vanishingly unlikely at the scales we
62
73
  * care about (≤100k URLs/site).
63
74
  */
64
75
  declare function hashUrl(url: string): string;
76
+ /**
77
+ * Row shape for the inspections parquet sidecar. Caller-side schema for
78
+ * `materialize` — D1 is the source of truth in the 2026-05-19 redesign, so
79
+ * consumers stream rows from `url_indexing_status` and pass them in. The
80
+ * parquet sidecar exists for DuckDB JOIN seams; readers go through
81
+ * `parquetUri`.
82
+ */
83
+ interface InspectionParquetRow {
84
+ urlHash: string;
85
+ url: string;
86
+ inspectedAt: string;
87
+ indexStatus: string | null;
88
+ lastCrawlTime: string | null;
89
+ googleCanonical: string | null;
90
+ userCanonical: string | null;
91
+ coverageState: string | null;
92
+ robotsTxtState: string | null;
93
+ indexingState: string | null;
94
+ pageFetchState: string | null;
95
+ mobileUsabilityVerdict: string | null;
96
+ richResultsVerdict: string | null;
97
+ scheduleNextAt: number | null;
98
+ scheduleConsecutiveUnchanged: number | null;
99
+ schedulePolicyVersion: number | null;
100
+ }
101
+ /**
102
+ * Hard cap on a single `appendHistory` shard payload. Encoded bytes >
103
+ * this threshold throws — the caller logs and moves on (D1 is
104
+ * authoritative, R2 history is a sidecar). At `URLS_PER_JOB=3` a real
105
+ * batch encodes to ~10 KB so the cap is purely defensive against future
106
+ * batch-size bumps.
107
+ */
108
+ declare const INSPECTION_HISTORY_MAX_BYTES: number;
65
109
  interface InspectionStore {
66
110
  /**
67
- * Persist a batch of fresh inspection results. Updates the index +
68
- * appends to the per-month history shard.
111
+ * Append a batch of fresh inspection results as an immutable per-batch
112
+ * shard under `history/<YYYY-MM>/<batchId>.json`. Idempotent under job
113
+ * retry (caller-supplied UUID per logical batch), no read-before-write,
114
+ * one PUT per month-group within the batch.
115
+ *
116
+ * Throws if the encoded payload exceeds {@link INSPECTION_HISTORY_MAX_BYTES}.
69
117
  */
70
- writeBatch: (ctx: TenantCtx, records: readonly InspectionRecord[]) => Promise<void>;
71
- /** Fetch the latest inspection record for a URL, or undefined. */
72
- getLatest: (ctx: TenantCtx, url: string) => Promise<InspectionRecord | undefined>;
118
+ appendHistory: (ctx: TenantCtx, records: readonly InspectionRecord[], opts?: {
119
+ batchId?: string;
120
+ }) => Promise<void>;
73
121
  /**
74
- * Read the full index for a site (latest record per URL). Cheap on
75
- * Workers; on big tenants the dashboard reads this once per page load.
122
+ * Read every shard in a month directory and concatenate. Best-effort:
123
+ * shards that fail to decode are skipped (logged via console). Returns
124
+ * `undefined` if the month has no shards.
76
125
  */
77
- loadIndex: (ctx: TenantCtx) => Promise<InspectionIndex>;
78
- /** Read the per-month history shard if it exists. */
79
126
  loadHistory: (ctx: TenantCtx, yearMonth: string) => Promise<InspectionHistoryShard | undefined>;
80
127
  /**
81
- * Snapshot the current JSON index to a parquet sidecar at
82
- * `entities/inspections/index.parquet`. One PUT. Sorted by `urlHash` so
83
- * DuckDB row-group stats can prune URL-keyed JOINs efficiently.
128
+ * Encode caller-provided rows into the inspections parquet sidecar at
129
+ * `entities/inspections/index.parquet`. Sorted by `urlHash` so DuckDB
130
+ * row-group stats can prune URL-keyed JOINs efficiently. One PUT.
84
131
  *
85
- * Internal seam: callers don't choose JSON-vs-parquet the store materialises
86
- * the parquet at end-of-batch (e.g. after `indexing/complete`) and readers
87
- * pick the format that matches their access pattern (parquet for JOINs,
88
- * JSON for full-index scans / point lookups).
132
+ * D1 is the source of truth in the 2026-05-19 redesign; this rebuilds
133
+ * the parquet from D1 rows the caller streams in (engine has no D1
134
+ * access). Triggered by `indexing/complete` post-hook.
89
135
  *
90
136
  * Returns the parquet object key (matches {@link parquetUri} after write).
91
137
  */
92
- materialize: (ctx: TenantCtx) => Promise<{
138
+ materialize: (ctx: TenantCtx, rows: Iterable<InspectionParquetRow>) => Promise<{
93
139
  key: string;
94
140
  rowCount: number;
95
141
  bytes: number;
@@ -108,12 +154,6 @@ interface InspectionStore {
108
154
  }
109
155
  interface CreateInspectionStoreOptions {
110
156
  dataSource: DataSource;
111
- /**
112
- * Override the FNV hash with a callable (test seam, or to swap in
113
- * SHA-256 if hash collisions become a concern at extreme scale).
114
- */
115
- hash?: (url: string) => string;
116
- now?: () => number;
117
157
  }
118
158
  declare function createInspectionStore(opts: CreateInspectionStoreOptions): InspectionStore;
119
159
  /** GSC sitemap record we persist. Matches `Schema$WmxSitemap` but as plain JSON. */
@@ -289,4 +329,4 @@ interface CreateEmptyTypesStoreOptions {
289
329
  now?: () => number;
290
330
  }
291
331
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
292
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionIndex, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
332
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
package/dist/entities.mjs CHANGED
@@ -9,8 +9,11 @@ function emptyTypesKey(ctx) {
9
9
  function inspectionParquetKey(ctx) {
10
10
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet` : `u_${ctx.userId}/entities/inspections/index.parquet`;
11
11
  }
12
- function inspectionHistoryKey(ctx, yearMonth) {
13
- return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}.json` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}.json`;
12
+ function inspectionHistoryPrefix(ctx, yearMonth) {
13
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}`;
14
+ }
15
+ function inspectionHistoryShardKey(ctx, yearMonth, batchId) {
16
+ return `${inspectionHistoryPrefix(ctx, yearMonth)}/${batchId}.json`;
14
17
  }
15
18
  function hashUrl(url) {
16
19
  let hi = 2166136261;
@@ -26,6 +29,7 @@ function hashUrl(url) {
26
29
  }
27
30
  return (hi >>> 0).toString(16).padStart(8, "0") + (lo >>> 0).toString(16).padStart(8, "0");
28
31
  }
32
+ const INSPECTION_HISTORY_MAX_BYTES = 5 * 1024 * 1024;
29
33
  const INSPECTION_PARQUET_COLUMNS = [
30
34
  {
31
35
  name: "urlHash",
@@ -109,79 +113,57 @@ const INSPECTION_PARQUET_COLUMNS = [
109
113
  }
110
114
  ];
111
115
  function createInspectionStore(opts) {
112
- const hash = opts.hash ?? hashUrl;
113
116
  const ds = opts.dataSource;
114
- async function readJson(key) {
115
- return await ds.read(key).then((bytes) => JSON.parse(new TextDecoder().decode(bytes)), () => void 0);
116
- }
117
- async function writeJson(key, value) {
118
- await ds.write(key, new TextEncoder().encode(JSON.stringify(value)));
119
- }
120
- function emptyIndex() {
121
- return {
122
- version: 1,
123
- records: {}
124
- };
125
- }
126
- function emptyShard() {
127
- return {
128
- version: 1,
129
- records: []
130
- };
131
- }
132
117
  function shardFor(record) {
133
118
  const m = YEAR_MONTH_RE.exec(record.inspectedAt);
134
119
  return m ? `${m[1]}-${m[2]}` : "unknown";
135
120
  }
121
+ function randomBatchId() {
122
+ return typeof crypto !== "undefined" && "randomUUID" in crypto ? crypto.randomUUID() : `${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
123
+ }
136
124
  return {
137
- async writeBatch(ctx, records) {
125
+ async appendHistory(ctx, records, options) {
138
126
  if (records.length === 0) return;
139
- const indexKey = inspectionIndexKey(ctx);
140
- const index = await readJson(indexKey) ?? emptyIndex();
141
- const byShard = /* @__PURE__ */ new Map();
127
+ const batchId = options?.batchId ?? randomBatchId();
128
+ const byMonth = /* @__PURE__ */ new Map();
142
129
  for (const r of records) {
143
- index.records[hash(r.url)] = r;
144
- const shardKey = shardFor(r);
145
- if (!byShard.has(shardKey)) byShard.set(shardKey, []);
146
- byShard.get(shardKey).push(r);
130
+ const month = shardFor(r);
131
+ if (!byMonth.has(month)) byMonth.set(month, []);
132
+ byMonth.get(month).push(r);
147
133
  }
148
- await writeJson(indexKey, index);
149
- for (const [yearMonth, batch] of byShard) {
150
- const histKey = inspectionHistoryKey(ctx, yearMonth);
151
- const existing = await readJson(histKey) ?? emptyShard();
152
- existing.records.push(...batch);
153
- await writeJson(histKey, existing);
134
+ for (const [yearMonth, batch] of byMonth) {
135
+ const shard = {
136
+ version: 1,
137
+ records: batch
138
+ };
139
+ const bytes = new TextEncoder().encode(JSON.stringify(shard));
140
+ if (bytes.byteLength > 5242880) throw new Error(`inspection history shard exceeds ${INSPECTION_HISTORY_MAX_BYTES} bytes (got ${bytes.byteLength}); split the batch`);
141
+ await ds.write(inspectionHistoryShardKey(ctx, yearMonth, batchId), bytes);
154
142
  }
155
143
  },
156
- async getLatest(ctx, url) {
157
- return (await readJson(inspectionIndexKey(ctx)))?.records[hash(url)];
158
- },
159
- async loadIndex(ctx) {
160
- return await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
161
- },
162
144
  async loadHistory(ctx, yearMonth) {
163
- return await readJson(inspectionHistoryKey(ctx, yearMonth));
145
+ const keys = await ds.list(inspectionHistoryPrefix(ctx, yearMonth));
146
+ if (keys.length === 0) return void 0;
147
+ const out = [];
148
+ for (const key of keys) {
149
+ const bytes = await ds.read(key).catch(() => void 0);
150
+ if (!bytes) continue;
151
+ const shard = await Promise.resolve().then(() => JSON.parse(new TextDecoder().decode(bytes))).catch((err) => {
152
+ console.warn("[inspection.loadHistory] failed to decode shard", {
153
+ key,
154
+ error: err.message
155
+ });
156
+ });
157
+ if (shard?.records) out.push(...shard.records);
158
+ }
159
+ return {
160
+ version: 1,
161
+ records: out
162
+ };
164
163
  },
165
- async materialize(ctx) {
166
- const index = await readJson(inspectionIndexKey(ctx)) ?? emptyIndex();
167
- const rows = Object.entries(index.records).map(([urlHash, r]) => ({
168
- urlHash,
169
- url: r.url,
170
- inspectedAt: r.inspectedAt,
171
- indexStatus: r.indexStatus ?? null,
172
- lastCrawlTime: r.lastCrawlTime ?? null,
173
- googleCanonical: r.googleCanonical ?? null,
174
- userCanonical: r.userCanonical ?? null,
175
- coverageState: r.coverageState ?? null,
176
- robotsTxtState: r.robotsTxtState ?? null,
177
- indexingState: r.indexingState ?? null,
178
- pageFetchState: r.pageFetchState ?? null,
179
- mobileUsabilityVerdict: r.mobileUsabilityVerdict ?? null,
180
- richResultsVerdict: r.richResultsVerdict ?? null,
181
- scheduleNextAt: r.raw?.schedule?.nextAt ?? null,
182
- scheduleConsecutiveUnchanged: r.raw?.schedule?.consecutiveUnchanged ?? null,
183
- schedulePolicyVersion: r.raw?.schedule?.policyVersion ?? null
184
- }));
164
+ async materialize(ctx, rowIter) {
165
+ const rows = Array.from(rowIter);
166
+ rows.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
185
167
  const bytes = encodeRowsToParquetFlex(rows, {
186
168
  columns: INSPECTION_PARQUET_COLUMNS,
187
169
  sortKey: ["urlHash"]
@@ -658,4 +640,4 @@ function createEmptyTypesStore(opts) {
658
640
  }
659
641
  };
660
642
  }
661
- export { createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
643
+ export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.18.4",
4
+ "version": "0.18.5",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.18.4",
173
- "@gscdump/contracts": "0.18.4"
172
+ "@gscdump/contracts": "0.18.5",
173
+ "gscdump": "0.18.5"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",