@gscdump/engine 0.26.1 → 0.26.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,6 +26,16 @@ function emptyTypesKey(ctx) {
26
26
  function inspectionParquetKey(ctx) {
27
27
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet` : `u_${ctx.userId}/entities/inspections/index.parquet`;
28
28
  }
29
+ function inspectionEventsPrefix(ctx) {
30
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/events` : `u_${ctx.userId}/entities/inspections/events`;
31
+ }
32
+ function inspectionEventKey(ctx, yearMonth, batchId) {
33
+ return `${inspectionEventsPrefix(ctx)}/${yearMonth}/${batchId}.parquet`;
34
+ }
35
+ function inspectionBaseKey(ctx) {
36
+ return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/base.parquet` : `u_${ctx.userId}/entities/inspections/base.parquet`;
37
+ }
38
+ const INSPECTION_EVENT_KEY_RE = /\/inspections\/events\/\d{4}-\d{2}\/[^/]+\.parquet$/;
29
39
  function inspectionHistoryPrefix(ctx, yearMonth) {
30
40
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}`;
31
41
  }
@@ -129,6 +139,49 @@ const INSPECTION_PARQUET_COLUMNS = [
129
139
  nullable: true
130
140
  }
131
141
  ];
142
+ const INSPECTION_EVENT_COLUMNS = [
143
+ ...INSPECTION_PARQUET_COLUMNS,
144
+ {
145
+ name: "crawlingUserAgent",
146
+ type: "VARCHAR",
147
+ nullable: true
148
+ },
149
+ {
150
+ name: "richResultsItems",
151
+ type: "VARCHAR",
152
+ nullable: true
153
+ },
154
+ {
155
+ name: "sitemaps",
156
+ type: "VARCHAR",
157
+ nullable: true
158
+ },
159
+ {
160
+ name: "referringUrls",
161
+ type: "VARCHAR",
162
+ nullable: true
163
+ },
164
+ {
165
+ name: "mobileIssues",
166
+ type: "VARCHAR",
167
+ nullable: true
168
+ },
169
+ {
170
+ name: "inspectionResultLink",
171
+ type: "VARCHAR",
172
+ nullable: true
173
+ },
174
+ {
175
+ name: "firstCheckedAt",
176
+ type: "VARCHAR",
177
+ nullable: true
178
+ },
179
+ {
180
+ name: "checkCount",
181
+ type: "INTEGER",
182
+ nullable: true
183
+ }
184
+ ];
132
185
  function createInspectionStore(opts) {
133
186
  const ds = opts.dataSource;
134
187
  function shardFor(record) {
@@ -193,6 +246,89 @@ function createInspectionStore(opts) {
193
246
  bytes: bytes.byteLength
194
247
  };
195
248
  },
249
+ async appendInspectionEvents(ctx, rows, options) {
250
+ if (rows.length === 0) return {
251
+ keys: [],
252
+ rowCount: 0
253
+ };
254
+ const batchId = options?.batchId ?? randomBatchId();
255
+ const byMonth = /* @__PURE__ */ new Map();
256
+ for (const r of rows) {
257
+ const m = YEAR_MONTH_RE.exec(r.inspectedAt);
258
+ const month = m ? `${m[1]}-${m[2]}` : "unknown";
259
+ const bucket = byMonth.get(month) ?? [];
260
+ bucket.push(r);
261
+ byMonth.set(month, bucket);
262
+ }
263
+ const keys = [];
264
+ for (const [month, batch] of byMonth) {
265
+ const bytes = encodeRowsToParquetFlex(batch, {
266
+ columns: INSPECTION_EVENT_COLUMNS,
267
+ sortKey: ["urlHash"]
268
+ });
269
+ const key = inspectionEventKey(ctx, month, batchId);
270
+ await ds.write(key, bytes);
271
+ keys.push(key);
272
+ }
273
+ return {
274
+ keys,
275
+ rowCount: rows.length
276
+ };
277
+ },
278
+ async compactInspections(ctx) {
279
+ const eventKeys = (await ds.list(`${inspectionEventsPrefix(ctx)}/`)).filter((k) => INSPECTION_EVENT_KEY_RE.test(k));
280
+ if (eventKeys.length === 0) return {
281
+ baseRowCount: 0,
282
+ eventsFolded: 0,
283
+ eventFilesDeleted: 0
284
+ };
285
+ const baseKey = inspectionBaseKey(ctx);
286
+ const baseBytes = await readOptional(ds, baseKey);
287
+ const baseRows = baseBytes ? await decodeParquetToRows(baseBytes) : [];
288
+ const latest = /* @__PURE__ */ new Map();
289
+ const earliestChecked = /* @__PURE__ */ new Map();
290
+ const consider = (row) => {
291
+ const h = String(row.urlHash);
292
+ const prev = latest.get(h);
293
+ if (!prev || String(row.inspectedAt ?? "") > String(prev.inspectedAt ?? "")) latest.set(h, row);
294
+ const fc = row.firstCheckedAt;
295
+ if (fc != null) {
296
+ const fcStr = String(fc);
297
+ const cur = earliestChecked.get(h);
298
+ if (cur === void 0 || fcStr < cur) earliestChecked.set(h, fcStr);
299
+ }
300
+ };
301
+ for (const row of baseRows) consider(row);
302
+ let eventsFolded = 0;
303
+ const consumed = [];
304
+ for (const key of eventKeys.sort()) {
305
+ const bytes = await readOptional(ds, key);
306
+ if (!bytes) continue;
307
+ consumed.push(key);
308
+ const rows = await decodeParquetToRows(bytes);
309
+ for (const row of rows) {
310
+ consider(row);
311
+ eventsFolded++;
312
+ }
313
+ }
314
+ const merged = [];
315
+ for (const [h, row] of latest) {
316
+ const fc = earliestChecked.get(h);
317
+ if (fc !== void 0) row.firstCheckedAt = fc;
318
+ merged.push(row);
319
+ }
320
+ const bytes = encodeRowsToParquetFlex(merged, {
321
+ columns: INSPECTION_EVENT_COLUMNS,
322
+ sortKey: ["urlHash"]
323
+ });
324
+ await ds.write(baseKey, bytes);
325
+ if (consumed.length > 0) await ds.delete(consumed);
326
+ return {
327
+ baseRowCount: merged.length,
328
+ eventsFolded,
329
+ eventFilesDeleted: consumed.length
330
+ };
331
+ },
196
332
  parquetUri(ctx) {
197
333
  return ds.uri?.(inspectionParquetKey(ctx));
198
334
  }
@@ -661,4 +797,4 @@ function createEmptyTypesStore(opts) {
661
797
  }
662
798
  };
663
799
  }
664
- export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
800
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -1,4 +1,4 @@
1
- import { ColumnDef, ColumnType, Row, TableName, TableSchema, TableSchema as TableSchema$1 } from "@gscdump/contracts";
1
+ import { ColumnDef as ColumnDef$1, ColumnType, Row, TableName, TableSchema, TableSchema as TableSchema$1 } from "@gscdump/contracts";
2
2
  declare const pages: import("drizzle-orm/pg-core").PgTableWithColumns<{
3
3
  name: "pages";
4
4
  schema: undefined;
@@ -2259,4 +2259,4 @@ declare function naturalKeyColumns(table: TableName): readonly string[];
2259
2259
  */
2260
2260
  declare function dedupeByNaturalKey(table: TableName, rows: readonly Row[]): Row[];
2261
2261
  declare function dimensionToColumn(dim: string, _table: TableName): string;
2262
- export { type ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
2262
+ export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
@@ -1,6 +1,6 @@
1
1
  import { DataSource } from "./_chunks/storage.mjs";
2
2
  import { ScheduleState } from "./schedule.mjs";
3
- import { TenantCtx } from "@gscdump/contracts";
3
+ import { ColumnDef, TenantCtx } from "@gscdump/contracts";
4
4
  /**
5
5
  * GSC URL inspection result fields we persist. Mirrors the
6
6
  * `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
@@ -60,6 +60,16 @@ interface InspectionHistoryShard {
60
60
  declare function inspectionIndexKey(ctx: TenantCtx): string;
61
61
  declare function emptyTypesKey(ctx: TenantCtx): string;
62
62
  declare function inspectionParquetKey(ctx: TenantCtx): string;
63
+ /** Directory prefix holding a tenant's immutable inspection-event parquets. */
64
+ declare function inspectionEventsPrefix(ctx: TenantCtx): string;
65
+ /**
66
+ * Object key for one immutable inspection-event batch, partitioned by the
67
+ * `YYYY-MM` of the records' `inspectedAt`. The `batchId` is caller-supplied so
68
+ * a job retry re-writes the SAME key (idempotent whole-file overwrite).
69
+ */
70
+ declare function inspectionEventKey(ctx: TenantCtx, yearMonth: string, batchId: string): string;
71
+ /** Compacted latest-per-url base produced by `compactInspections`. */
72
+ declare function inspectionBaseKey(ctx: TenantCtx): string;
63
73
  /**
64
74
  * Directory prefix for a month's history shards. Each shard is a UUID-keyed
65
75
  * blob under this prefix; `appendHistory` writes one per call, `loadHistory`
@@ -99,6 +109,36 @@ interface InspectionParquetRow {
99
109
  scheduleConsecutiveUnchanged: number | null;
100
110
  schedulePolicyVersion: number | null;
101
111
  }
112
+ /**
113
+ * Row shape for the append-only inspection-event store. Superset of
114
+ * {@link InspectionParquetRow}: carries the full-fidelity columns the lossy
115
+ * `materialize` parquet dropped (`crawlingUserAgent`, `richResultsItems`,
116
+ * `sitemaps`, `referringUrls`, `mobileIssues`, `inspectionResultLink`,
117
+ * `firstCheckedAt`, `checkCount`). Object/array fields are persisted as JSON
118
+ * strings — read paths unpack them with DuckDB's JSON functions.
119
+ *
120
+ * `firstCheckedAt` / `checkCount` are caller-managed: the writer carries the
121
+ * earliest-seen timestamp + running observation count forward. Compaction
122
+ * preserves the EARLIEST `firstCheckedAt` per url (mirrors the sitemap store's
123
+ * `firstSeenAt` preservation); every other column is taken from the
124
+ * newest-by-`inspectedAt` event.
125
+ */
126
+ interface InspectionEventRow extends InspectionParquetRow {
127
+ crawlingUserAgent: string | null;
128
+ /** JSON-encoded `RichResultsItem[]`. */
129
+ richResultsItems: string | null;
130
+ /** JSON-encoded list of sitemap URLs referencing this page. */
131
+ sitemaps: string | null;
132
+ /** JSON-encoded list of referring URLs. */
133
+ referringUrls: string | null;
134
+ /** JSON-encoded mobile-usability issues. */
135
+ mobileIssues: string | null;
136
+ inspectionResultLink: string | null;
137
+ /** ISO-8601 timestamp of the first inspection we ever recorded for this url. */
138
+ firstCheckedAt: string | null;
139
+ /** Total number of inspections recorded for this url. */
140
+ checkCount: number | null;
141
+ }
102
142
  /**
103
143
  * Hard cap on a single `appendHistory` shard payload. Encoded bytes >
104
144
  * this threshold throws — the caller logs and moves on (D1 is
@@ -141,6 +181,40 @@ interface InspectionStore {
141
181
  rowCount: number;
142
182
  bytes: number;
143
183
  }>;
184
+ /**
185
+ * Append a batch of inspection results as an immutable per-batch parquet
186
+ * under `events/<YYYY-MM>/<batchId>.parquet`, partitioned by the `YYYY-MM`
187
+ * of each row's `inspectedAt` (a batch spanning a month boundary writes one
188
+ * file per month). No read-before-write; idempotent under job retry (same
189
+ * `batchId` → same key → whole-file overwrite). Rows carry the FULL column
190
+ * set ({@link INSPECTION_EVENT_COLUMNS}); this is the append-only
191
+ * source-of-truth that supersedes {@link InspectionStore.materialize}.
192
+ *
193
+ * Returns the keys written + total row count. Empty input is a no-op.
194
+ */
195
+ appendInspectionEvents: (ctx: TenantCtx, rows: readonly InspectionEventRow[], opts?: {
196
+ batchId?: string;
197
+ }) => Promise<{
198
+ keys: string[];
199
+ rowCount: number;
200
+ }>;
201
+ /**
202
+ * Fold every outstanding event file into the `base.parquet`: latest-per-url
203
+ * by max `inspectedAt` (newest-wins), preserving the earliest non-null
204
+ * `firstCheckedAt` per url. Writes the new base then deletes the consumed
205
+ * event files — file-level only, never row-level (ADR-0002). Idempotent +
206
+ * re-runnable: a crash after the base write but before the delete just
207
+ * re-folds the same events (newest-wins makes that a no-op). A real read
208
+ * failure on the existing base propagates rather than rebuilding from events
209
+ * alone (which would drop URLs only the base held).
210
+ *
211
+ * No-op (no base rewrite) when there are zero outstanding events.
212
+ */
213
+ compactInspections: (ctx: TenantCtx) => Promise<{
214
+ baseRowCount: number;
215
+ eventsFolded: number;
216
+ eventFilesDeleted: number;
217
+ }>;
144
218
  /**
145
219
  * DuckDB-resolvable URI for the materialised parquet sidecar, or
146
220
  * `undefined` if the underlying `DataSource` has no native URI shape
@@ -156,6 +230,14 @@ interface InspectionStore {
156
230
  interface CreateInspectionStoreOptions {
157
231
  dataSource: DataSource;
158
232
  }
233
+ /**
234
+ * Column schema for the append-only inspection-event store + its compacted
235
+ * base. Superset of {@link INSPECTION_PARQUET_COLUMNS}: the 16 promoted columns
236
+ * plus the 8 full-fidelity ones the lossy `materialize` parquet dropped. The
237
+ * event files and `base.parquet` share this schema so DuckDB
238
+ * `read_parquet([...], union_by_name = true)` merges base + events cleanly.
239
+ */
240
+ declare const INSPECTION_EVENT_COLUMNS: readonly ColumnDef[];
159
241
  declare function createInspectionStore(opts: CreateInspectionStoreOptions): InspectionStore;
160
242
  /** GSC sitemap record we persist. Matches `Schema$WmxSitemap` but as plain JSON. */
161
243
  interface SitemapRecord {
@@ -331,4 +413,4 @@ interface CreateEmptyTypesStoreOptions {
331
413
  now?: () => number;
332
414
  }
333
415
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
334
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
416
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/entities.mjs CHANGED
@@ -1,2 +1,2 @@
1
- import { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
- export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
1
+ import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
2
+ export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -1,5 +1,5 @@
1
1
  import { DataSource, FileSetRef, Row as Row$1 } from "./_chunks/storage.mjs";
2
- import { ColumnDef } from "./_chunks/schema.mjs";
2
+ import { ColumnDef as ColumnDef$1 } from "./_chunks/schema.mjs";
3
3
  import { EngineError } from "./_chunks/errors.mjs";
4
4
  import { SearchType } from "gscdump/query";
5
5
  import { TenantCtx } from "@gscdump/contracts";
@@ -70,7 +70,7 @@ interface RollupDef {
70
70
  * Types map the same way as the fact-table encoder: VARCHAR / DATE go
71
71
  * through BYTE_ARRAY/UTF8; BIGINT → INT64; INTEGER → INT32; DOUBLE → DOUBLE.
72
72
  */
73
- parquetColumns?: readonly ColumnDef[];
73
+ parquetColumns?: readonly ColumnDef$1[];
74
74
  /** Sort-key column names for parquet row-group stats. Optional. */
75
75
  parquetSortKey?: readonly string[];
76
76
  /**
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.26.1",
4
+ "version": "0.26.3",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -190,8 +190,8 @@
190
190
  "drizzle-orm": "1.0.0-rc.3",
191
191
  "icebird": "^0.8.8",
192
192
  "proper-lockfile": "^4.1.2",
193
- "@gscdump/contracts": "0.26.1",
194
- "gscdump": "0.26.1"
193
+ "gscdump": "0.26.3",
194
+ "@gscdump/contracts": "0.26.3"
195
195
  },
196
196
  "devDependencies": {
197
197
  "@duckdb/duckdb-wasm": "^1.32.0",