@gscdump/engine 0.26.2 → 0.26.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/_chunks/entities.mjs +233 -1
- package/dist/_chunks/schema.d.mts +2 -2
- package/dist/entities.d.mts +113 -2
- package/dist/entities.mjs +2 -2
- package/dist/rollups.d.mts +2 -2
- package/package.json +3 -3
|
@@ -26,6 +26,16 @@ function emptyTypesKey(ctx) {
|
|
|
26
26
|
function inspectionParquetKey(ctx) {
|
|
27
27
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/index.parquet` : `u_${ctx.userId}/entities/inspections/index.parquet`;
|
|
28
28
|
}
|
|
29
|
+
function inspectionEventsPrefix(ctx) {
|
|
30
|
+
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/events` : `u_${ctx.userId}/entities/inspections/events`;
|
|
31
|
+
}
|
|
32
|
+
function inspectionEventKey(ctx, yearMonth, batchId) {
|
|
33
|
+
return `${inspectionEventsPrefix(ctx)}/${yearMonth}/${batchId}.parquet`;
|
|
34
|
+
}
|
|
35
|
+
function inspectionBaseKey(ctx) {
|
|
36
|
+
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/base.parquet` : `u_${ctx.userId}/entities/inspections/base.parquet`;
|
|
37
|
+
}
|
|
38
|
+
const INSPECTION_EVENT_KEY_RE = /\/inspections\/events\/\d{4}-\d{2}\/[^/]+\.parquet$/;
|
|
29
39
|
function inspectionHistoryPrefix(ctx, yearMonth) {
|
|
30
40
|
return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/inspections/history/${yearMonth}` : `u_${ctx.userId}/entities/inspections/history/${yearMonth}`;
|
|
31
41
|
}
|
|
@@ -129,6 +139,59 @@ const INSPECTION_PARQUET_COLUMNS = [
|
|
|
129
139
|
nullable: true
|
|
130
140
|
}
|
|
131
141
|
];
|
|
142
|
+
const INSPECTION_EVENT_COLUMNS = [
|
|
143
|
+
...INSPECTION_PARQUET_COLUMNS,
|
|
144
|
+
{
|
|
145
|
+
name: "crawlingUserAgent",
|
|
146
|
+
type: "VARCHAR",
|
|
147
|
+
nullable: true
|
|
148
|
+
},
|
|
149
|
+
{
|
|
150
|
+
name: "richResultsItems",
|
|
151
|
+
type: "VARCHAR",
|
|
152
|
+
nullable: true
|
|
153
|
+
},
|
|
154
|
+
{
|
|
155
|
+
name: "sitemaps",
|
|
156
|
+
type: "VARCHAR",
|
|
157
|
+
nullable: true
|
|
158
|
+
},
|
|
159
|
+
{
|
|
160
|
+
name: "referringUrls",
|
|
161
|
+
type: "VARCHAR",
|
|
162
|
+
nullable: true
|
|
163
|
+
},
|
|
164
|
+
{
|
|
165
|
+
name: "mobileIssues",
|
|
166
|
+
type: "VARCHAR",
|
|
167
|
+
nullable: true
|
|
168
|
+
},
|
|
169
|
+
{
|
|
170
|
+
name: "inspectionResultLink",
|
|
171
|
+
type: "VARCHAR",
|
|
172
|
+
nullable: true
|
|
173
|
+
},
|
|
174
|
+
{
|
|
175
|
+
name: "firstCheckedAt",
|
|
176
|
+
type: "VARCHAR",
|
|
177
|
+
nullable: true
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
|
+
name: "checkCount",
|
|
181
|
+
type: "INTEGER",
|
|
182
|
+
nullable: true
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
name: "nextCheckAfter",
|
|
186
|
+
type: "BIGINT",
|
|
187
|
+
nullable: true
|
|
188
|
+
},
|
|
189
|
+
{
|
|
190
|
+
name: "nextCheckPriority",
|
|
191
|
+
type: "VARCHAR",
|
|
192
|
+
nullable: true
|
|
193
|
+
}
|
|
194
|
+
];
|
|
132
195
|
function createInspectionStore(opts) {
|
|
133
196
|
const ds = opts.dataSource;
|
|
134
197
|
function shardFor(record) {
|
|
@@ -193,6 +256,89 @@ function createInspectionStore(opts) {
|
|
|
193
256
|
bytes: bytes.byteLength
|
|
194
257
|
};
|
|
195
258
|
},
|
|
259
|
+
async appendInspectionEvents(ctx, rows, options) {
|
|
260
|
+
if (rows.length === 0) return {
|
|
261
|
+
keys: [],
|
|
262
|
+
rowCount: 0
|
|
263
|
+
};
|
|
264
|
+
const batchId = options?.batchId ?? randomBatchId();
|
|
265
|
+
const byMonth = /* @__PURE__ */ new Map();
|
|
266
|
+
for (const r of rows) {
|
|
267
|
+
const m = YEAR_MONTH_RE.exec(r.inspectedAt);
|
|
268
|
+
const month = m ? `${m[1]}-${m[2]}` : "unknown";
|
|
269
|
+
const bucket = byMonth.get(month) ?? [];
|
|
270
|
+
bucket.push(r);
|
|
271
|
+
byMonth.set(month, bucket);
|
|
272
|
+
}
|
|
273
|
+
const keys = [];
|
|
274
|
+
for (const [month, batch] of byMonth) {
|
|
275
|
+
const bytes = encodeRowsToParquetFlex(batch, {
|
|
276
|
+
columns: INSPECTION_EVENT_COLUMNS,
|
|
277
|
+
sortKey: ["urlHash"]
|
|
278
|
+
});
|
|
279
|
+
const key = inspectionEventKey(ctx, month, batchId);
|
|
280
|
+
await ds.write(key, bytes);
|
|
281
|
+
keys.push(key);
|
|
282
|
+
}
|
|
283
|
+
return {
|
|
284
|
+
keys,
|
|
285
|
+
rowCount: rows.length
|
|
286
|
+
};
|
|
287
|
+
},
|
|
288
|
+
async compactInspections(ctx) {
|
|
289
|
+
const eventKeys = (await ds.list(`${inspectionEventsPrefix(ctx)}/`)).filter((k) => INSPECTION_EVENT_KEY_RE.test(k));
|
|
290
|
+
if (eventKeys.length === 0) return {
|
|
291
|
+
baseRowCount: 0,
|
|
292
|
+
eventsFolded: 0,
|
|
293
|
+
eventFilesDeleted: 0
|
|
294
|
+
};
|
|
295
|
+
const baseKey = inspectionBaseKey(ctx);
|
|
296
|
+
const baseBytes = await readOptional(ds, baseKey);
|
|
297
|
+
const baseRows = baseBytes ? await decodeParquetToRows(baseBytes) : [];
|
|
298
|
+
const latest = /* @__PURE__ */ new Map();
|
|
299
|
+
const earliestChecked = /* @__PURE__ */ new Map();
|
|
300
|
+
const consider = (row) => {
|
|
301
|
+
const h = String(row.urlHash);
|
|
302
|
+
const prev = latest.get(h);
|
|
303
|
+
if (!prev || String(row.inspectedAt ?? "") > String(prev.inspectedAt ?? "")) latest.set(h, row);
|
|
304
|
+
const fc = row.firstCheckedAt;
|
|
305
|
+
if (fc != null) {
|
|
306
|
+
const fcStr = String(fc);
|
|
307
|
+
const cur = earliestChecked.get(h);
|
|
308
|
+
if (cur === void 0 || fcStr < cur) earliestChecked.set(h, fcStr);
|
|
309
|
+
}
|
|
310
|
+
};
|
|
311
|
+
for (const row of baseRows) consider(row);
|
|
312
|
+
let eventsFolded = 0;
|
|
313
|
+
const consumed = [];
|
|
314
|
+
for (const key of eventKeys.sort()) {
|
|
315
|
+
const bytes = await readOptional(ds, key);
|
|
316
|
+
if (!bytes) continue;
|
|
317
|
+
consumed.push(key);
|
|
318
|
+
const rows = await decodeParquetToRows(bytes);
|
|
319
|
+
for (const row of rows) {
|
|
320
|
+
consider(row);
|
|
321
|
+
eventsFolded++;
|
|
322
|
+
}
|
|
323
|
+
}
|
|
324
|
+
const merged = [];
|
|
325
|
+
for (const [h, row] of latest) {
|
|
326
|
+
const fc = earliestChecked.get(h);
|
|
327
|
+
if (fc !== void 0) row.firstCheckedAt = fc;
|
|
328
|
+
merged.push(row);
|
|
329
|
+
}
|
|
330
|
+
const bytes = encodeRowsToParquetFlex(merged, {
|
|
331
|
+
columns: INSPECTION_EVENT_COLUMNS,
|
|
332
|
+
sortKey: ["urlHash"]
|
|
333
|
+
});
|
|
334
|
+
await ds.write(baseKey, bytes);
|
|
335
|
+
if (consumed.length > 0) await ds.delete(consumed);
|
|
336
|
+
return {
|
|
337
|
+
baseRowCount: merged.length,
|
|
338
|
+
eventsFolded,
|
|
339
|
+
eventFilesDeleted: consumed.length
|
|
340
|
+
};
|
|
341
|
+
},
|
|
196
342
|
parquetUri(ctx) {
|
|
197
343
|
return ds.uri?.(inspectionParquetKey(ctx));
|
|
198
344
|
}
|
|
@@ -574,6 +720,92 @@ function createSitemapStore(opts) {
|
|
|
574
720
|
await ds.write(indexKey, bytes);
|
|
575
721
|
if (consumed.length > 0) await ds.delete(consumed);
|
|
576
722
|
}
|
|
723
|
+
},
|
|
724
|
+
async reconcile(ctx, { liveFeedpaths, at: atOpt }) {
|
|
725
|
+
const at = atOpt ?? now();
|
|
726
|
+
const liveHashes = new Set(liveFeedpaths.map((fp) => hash(fp)));
|
|
727
|
+
const present = /* @__PURE__ */ new Set();
|
|
728
|
+
for (const key of await ds.list(`${sitemapUrlsIndexPrefix(ctx)}/`)) {
|
|
729
|
+
const m = /\/by-feed\/([0-9a-f]+)\/index\.parquet$/.exec(key);
|
|
730
|
+
if (m) present.add(m[1]);
|
|
731
|
+
}
|
|
732
|
+
const deltasByFeed = /* @__PURE__ */ new Map();
|
|
733
|
+
for (const key of await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)) {
|
|
734
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
735
|
+
if (!m) continue;
|
|
736
|
+
present.add(m[2]);
|
|
737
|
+
const list = deltasByFeed.get(m[2]) ?? [];
|
|
738
|
+
list.push(key);
|
|
739
|
+
deltasByFeed.set(m[2], list);
|
|
740
|
+
}
|
|
741
|
+
let feedpathsPruned = 0;
|
|
742
|
+
let urlsRemoved = 0;
|
|
743
|
+
for (const fpHash of present) {
|
|
744
|
+
if (liveHashes.has(fpHash)) continue;
|
|
745
|
+
const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
|
|
746
|
+
const indexBytes = await readOptional(ds, indexKey);
|
|
747
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
748
|
+
const live = /* @__PURE__ */ new Map();
|
|
749
|
+
const removed = /* @__PURE__ */ new Map();
|
|
750
|
+
for (const row of indexRows) {
|
|
751
|
+
const r = rowToUrlRecord(row);
|
|
752
|
+
if (r.removedAt != null) removed.set(r.urlHash, r);
|
|
753
|
+
else live.set(r.urlHash, r);
|
|
754
|
+
}
|
|
755
|
+
const consumed = [];
|
|
756
|
+
for (const key of (deltasByFeed.get(fpHash) ?? []).sort()) {
|
|
757
|
+
const bytes = await readOptional(ds, key);
|
|
758
|
+
if (!bytes) continue;
|
|
759
|
+
consumed.push(key);
|
|
760
|
+
const rows = await decodeParquetToRows(bytes);
|
|
761
|
+
for (const r of rows) {
|
|
762
|
+
const urlHash = String(r.url_hash);
|
|
763
|
+
const dat = Number(r.at);
|
|
764
|
+
if (String(r.op) === "added") {
|
|
765
|
+
const prev = live.get(urlHash) ?? removed.get(urlHash);
|
|
766
|
+
removed.delete(urlHash);
|
|
767
|
+
live.set(urlHash, {
|
|
768
|
+
feedpath: String(r.feedpath),
|
|
769
|
+
feedpathHash: fpHash,
|
|
770
|
+
urlHash,
|
|
771
|
+
loc: String(r.loc),
|
|
772
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
773
|
+
firstSeenAt: prev?.firstSeenAt ?? dat,
|
|
774
|
+
lastSeenAt: dat
|
|
775
|
+
});
|
|
776
|
+
} else if (String(r.op) === "removed") {
|
|
777
|
+
const prev = live.get(urlHash);
|
|
778
|
+
live.delete(urlHash);
|
|
779
|
+
if (prev) removed.set(urlHash, {
|
|
780
|
+
...prev,
|
|
781
|
+
removedAt: dat
|
|
782
|
+
});
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
const hadLive = live.size > 0;
|
|
787
|
+
if (!hadLive && consumed.length === 0) continue;
|
|
788
|
+
for (const [urlHash, r] of live) {
|
|
789
|
+
removed.set(urlHash, {
|
|
790
|
+
...r,
|
|
791
|
+
removedAt: at
|
|
792
|
+
});
|
|
793
|
+
urlsRemoved++;
|
|
794
|
+
}
|
|
795
|
+
const merged = [...removed.values()];
|
|
796
|
+
merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
|
|
797
|
+
const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
|
|
798
|
+
columns: URLS_INDEX_COLUMNS,
|
|
799
|
+
sortKey: ["feedpath_hash", "url_hash"]
|
|
800
|
+
});
|
|
801
|
+
await ds.write(indexKey, bytes);
|
|
802
|
+
if (consumed.length > 0) await ds.delete(consumed);
|
|
803
|
+
if (hadLive) feedpathsPruned++;
|
|
804
|
+
}
|
|
805
|
+
return {
|
|
806
|
+
feedpathsPruned,
|
|
807
|
+
urlsRemoved
|
|
808
|
+
};
|
|
577
809
|
}
|
|
578
810
|
};
|
|
579
811
|
}
|
|
@@ -661,4 +893,4 @@ function createEmptyTypesStore(opts) {
|
|
|
661
893
|
}
|
|
662
894
|
};
|
|
663
895
|
}
|
|
664
|
-
export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
|
896
|
+
export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { ColumnDef, ColumnType, Row, TableName, TableSchema, TableSchema as TableSchema$1 } from "@gscdump/contracts";
|
|
1
|
+
import { ColumnDef as ColumnDef$1, ColumnType, Row, TableName, TableSchema, TableSchema as TableSchema$1 } from "@gscdump/contracts";
|
|
2
2
|
declare const pages: import("drizzle-orm/pg-core").PgTableWithColumns<{
|
|
3
3
|
name: "pages";
|
|
4
4
|
schema: undefined;
|
|
@@ -2259,4 +2259,4 @@ declare function naturalKeyColumns(table: TableName): readonly string[];
|
|
|
2259
2259
|
*/
|
|
2260
2260
|
declare function dedupeByNaturalKey(table: TableName, rows: readonly Row[]): Row[];
|
|
2261
2261
|
declare function dimensionToColumn(dim: string, _table: TableName): string;
|
|
2262
|
-
export { type ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
|
2262
|
+
export { type ColumnDef$1 as ColumnDef, type ColumnType, DrizzleSchema, SCHEMAS, TABLE_METADATA, type TableSchema$1 as TableSchema, allTables, countries, currentSchemaVersion, dates, dedupeByNaturalKey, dimensionToColumn, drizzleSchema, hourly_pages, inferTable, naturalKeyColumns, page_queries, pages, queries, schemaFor, search_appearance, search_appearance_page_queries, search_appearance_pages, search_appearance_queries };
|
package/dist/entities.d.mts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { DataSource } from "./_chunks/storage.mjs";
|
|
2
2
|
import { ScheduleState } from "./schedule.mjs";
|
|
3
|
-
import { TenantCtx } from "@gscdump/contracts";
|
|
3
|
+
import { ColumnDef, TenantCtx } from "@gscdump/contracts";
|
|
4
4
|
/**
|
|
5
5
|
* GSC URL inspection result fields we persist. Mirrors the
|
|
6
6
|
* `searchconsole_v1.Schema$UrlInspectionResult` shape but as plain JSON
|
|
@@ -60,6 +60,16 @@ interface InspectionHistoryShard {
|
|
|
60
60
|
declare function inspectionIndexKey(ctx: TenantCtx): string;
|
|
61
61
|
declare function emptyTypesKey(ctx: TenantCtx): string;
|
|
62
62
|
declare function inspectionParquetKey(ctx: TenantCtx): string;
|
|
63
|
+
/** Directory prefix holding a tenant's immutable inspection-event parquets. */
|
|
64
|
+
declare function inspectionEventsPrefix(ctx: TenantCtx): string;
|
|
65
|
+
/**
|
|
66
|
+
* Object key for one immutable inspection-event batch, partitioned by the
|
|
67
|
+
* `YYYY-MM` of the records' `inspectedAt`. The `batchId` is caller-supplied so
|
|
68
|
+
* a job retry re-writes the SAME key (idempotent whole-file overwrite).
|
|
69
|
+
*/
|
|
70
|
+
declare function inspectionEventKey(ctx: TenantCtx, yearMonth: string, batchId: string): string;
|
|
71
|
+
/** Compacted latest-per-url base produced by `compactInspections`. */
|
|
72
|
+
declare function inspectionBaseKey(ctx: TenantCtx): string;
|
|
63
73
|
/**
|
|
64
74
|
* Directory prefix for a month's history shards. Each shard is a UUID-keyed
|
|
65
75
|
* blob under this prefix; `appendHistory` writes one per call, `loadHistory`
|
|
@@ -99,6 +109,44 @@ interface InspectionParquetRow {
|
|
|
99
109
|
scheduleConsecutiveUnchanged: number | null;
|
|
100
110
|
schedulePolicyVersion: number | null;
|
|
101
111
|
}
|
|
112
|
+
/**
|
|
113
|
+
* Row shape for the append-only inspection-event store. Superset of
|
|
114
|
+
* {@link InspectionParquetRow}: carries the full-fidelity columns the lossy
|
|
115
|
+
* `materialize` parquet dropped (`crawlingUserAgent`, `richResultsItems`,
|
|
116
|
+
* `sitemaps`, `referringUrls`, `mobileIssues`, `inspectionResultLink`,
|
|
117
|
+
* `firstCheckedAt`, `checkCount`). Object/array fields are persisted as JSON
|
|
118
|
+
* strings — read paths unpack them with DuckDB's JSON functions.
|
|
119
|
+
*
|
|
120
|
+
* `firstCheckedAt` / `checkCount` are caller-managed: the writer carries the
|
|
121
|
+
* earliest-seen timestamp + running observation count forward. Compaction
|
|
122
|
+
* preserves the EARLIEST `firstCheckedAt` per url (mirrors the sitemap store's
|
|
123
|
+
* `firstSeenAt` preservation); every other column is taken from the
|
|
124
|
+
* newest-by-`inspectedAt` event.
|
|
125
|
+
*/
|
|
126
|
+
interface InspectionEventRow extends InspectionParquetRow {
|
|
127
|
+
crawlingUserAgent: string | null;
|
|
128
|
+
/** JSON-encoded `RichResultsItem[]`. */
|
|
129
|
+
richResultsItems: string | null;
|
|
130
|
+
/** JSON-encoded list of sitemap URLs referencing this page. */
|
|
131
|
+
sitemaps: string | null;
|
|
132
|
+
/** JSON-encoded list of referring URLs. */
|
|
133
|
+
referringUrls: string | null;
|
|
134
|
+
/** JSON-encoded mobile-usability issues. */
|
|
135
|
+
mobileIssues: string | null;
|
|
136
|
+
inspectionResultLink: string | null;
|
|
137
|
+
/** ISO-8601 timestamp of the first inspection we ever recorded for this url. */
|
|
138
|
+
firstCheckedAt: string | null;
|
|
139
|
+
/** Total number of inspections recorded for this url. */
|
|
140
|
+
checkCount: number | null;
|
|
141
|
+
/**
|
|
142
|
+
* Stored next-recheck unix-seconds + priority as computed AT INSPECT TIME.
|
|
143
|
+
* Carried verbatim (NOT recomputed at read) because the scheduling policy can
|
|
144
|
+
* change over time — `__gsc/inspections` must replay the historical value to
|
|
145
|
+
* keep its frozen wire shape byte-stable.
|
|
146
|
+
*/
|
|
147
|
+
nextCheckAfter: number | null;
|
|
148
|
+
nextCheckPriority: string | null;
|
|
149
|
+
}
|
|
102
150
|
/**
|
|
103
151
|
* Hard cap on a single `appendHistory` shard payload. Encoded bytes >
|
|
104
152
|
* this threshold throws — the caller logs and moves on (D1 is
|
|
@@ -141,6 +189,40 @@ interface InspectionStore {
|
|
|
141
189
|
rowCount: number;
|
|
142
190
|
bytes: number;
|
|
143
191
|
}>;
|
|
192
|
+
/**
|
|
193
|
+
* Append a batch of inspection results as an immutable per-batch parquet
|
|
194
|
+
* under `events/<YYYY-MM>/<batchId>.parquet`, partitioned by the `YYYY-MM`
|
|
195
|
+
* of each row's `inspectedAt` (a batch spanning a month boundary writes one
|
|
196
|
+
* file per month). No read-before-write; idempotent under job retry (same
|
|
197
|
+
* `batchId` → same key → whole-file overwrite). Rows carry the FULL column
|
|
198
|
+
* set ({@link INSPECTION_EVENT_COLUMNS}); this is the append-only
|
|
199
|
+
* source-of-truth that supersedes {@link InspectionStore.materialize}.
|
|
200
|
+
*
|
|
201
|
+
* Returns the keys written + total row count. Empty input is a no-op.
|
|
202
|
+
*/
|
|
203
|
+
appendInspectionEvents: (ctx: TenantCtx, rows: readonly InspectionEventRow[], opts?: {
|
|
204
|
+
batchId?: string;
|
|
205
|
+
}) => Promise<{
|
|
206
|
+
keys: string[];
|
|
207
|
+
rowCount: number;
|
|
208
|
+
}>;
|
|
209
|
+
/**
|
|
210
|
+
* Fold every outstanding event file into the `base.parquet`: latest-per-url
|
|
211
|
+
* by max `inspectedAt` (newest-wins), preserving the earliest non-null
|
|
212
|
+
* `firstCheckedAt` per url. Writes the new base then deletes the consumed
|
|
213
|
+
* event files — file-level only, never row-level (ADR-0002). Idempotent +
|
|
214
|
+
* re-runnable: a crash after the base write but before the delete just
|
|
215
|
+
* re-folds the same events (newest-wins makes that a no-op). A real read
|
|
216
|
+
* failure on the existing base propagates rather than rebuilding from events
|
|
217
|
+
* alone (which would drop URLs only the base held).
|
|
218
|
+
*
|
|
219
|
+
* No-op (no base rewrite) when there are zero outstanding events.
|
|
220
|
+
*/
|
|
221
|
+
compactInspections: (ctx: TenantCtx) => Promise<{
|
|
222
|
+
baseRowCount: number;
|
|
223
|
+
eventsFolded: number;
|
|
224
|
+
eventFilesDeleted: number;
|
|
225
|
+
}>;
|
|
144
226
|
/**
|
|
145
227
|
* DuckDB-resolvable URI for the materialised parquet sidecar, or
|
|
146
228
|
* `undefined` if the underlying `DataSource` has no native URI shape
|
|
@@ -156,6 +238,14 @@ interface InspectionStore {
|
|
|
156
238
|
interface CreateInspectionStoreOptions {
|
|
157
239
|
dataSource: DataSource;
|
|
158
240
|
}
|
|
241
|
+
/**
|
|
242
|
+
* Column schema for the append-only inspection-event store + its compacted
|
|
243
|
+
* base. Superset of {@link INSPECTION_PARQUET_COLUMNS}: the 16 promoted columns
|
|
244
|
+
* plus the 8 full-fidelity ones the lossy `materialize` parquet dropped. The
|
|
245
|
+
* event files and `base.parquet` share this schema so DuckDB
|
|
246
|
+
* `read_parquet([...], union_by_name = true)` merges base + events cleanly.
|
|
247
|
+
*/
|
|
248
|
+
declare const INSPECTION_EVENT_COLUMNS: readonly ColumnDef[];
|
|
159
249
|
declare function createInspectionStore(opts: CreateInspectionStoreOptions): InspectionStore;
|
|
160
250
|
/** GSC sitemap record we persist. Matches `Schema$WmxSitemap` but as plain JSON. */
|
|
161
251
|
interface SitemapRecord {
|
|
@@ -229,6 +319,12 @@ interface SnapshotUrlsResult {
|
|
|
229
319
|
/** True when contentHash matched prior; the call performed zero writes. */
|
|
230
320
|
unchanged: boolean;
|
|
231
321
|
}
|
|
322
|
+
interface ReconcileResult {
|
|
323
|
+
/** Feedpaths that were absent from the live set and had their live URLs pruned. */
|
|
324
|
+
feedpathsPruned: number;
|
|
325
|
+
/** Total URL rows transitioned live → removed across pruned feedpaths. */
|
|
326
|
+
urlsRemoved: number;
|
|
327
|
+
}
|
|
232
328
|
interface DeltaEntry {
|
|
233
329
|
feedpath: string;
|
|
234
330
|
feedpathHash: string;
|
|
@@ -280,6 +376,21 @@ interface SitemapStore {
|
|
|
280
376
|
* regardless of total site URL count.
|
|
281
377
|
*/
|
|
282
378
|
compactUrls: (ctx: TenantCtx) => Promise<void>;
|
|
379
|
+
/**
|
|
380
|
+
* Site-wide convergence: mark every still-live URL whose owning feedpath is
|
|
381
|
+
* absent from `liveFeedpaths` as removed. `compactUrls`/`snapshotUrls` only
|
|
382
|
+
* prune URLs *inside* a feedpath that was re-observed; a whole feed dropped
|
|
383
|
+
* from the sitemap list (no `snapshotUrls` call) leaves its URLs frozen-live
|
|
384
|
+
* forever. This is the sidecar mirror of the D1 generation sweep: it rewrites
|
|
385
|
+
* each dropped feedpath's `by-feed/<hash>/index.parquet` with `removedAt` set
|
|
386
|
+
* and deletes its outstanding deltas (write-new-base + delete-deltas,
|
|
387
|
+
* ADR-0002). Bounded per feedpath, so memory stays flat regardless of site
|
|
388
|
+
* size. Live feedpaths are never touched.
|
|
389
|
+
*/
|
|
390
|
+
reconcile: (ctx: TenantCtx, opts: {
|
|
391
|
+
liveFeedpaths: readonly string[];
|
|
392
|
+
at?: number;
|
|
393
|
+
}) => Promise<ReconcileResult>;
|
|
283
394
|
}
|
|
284
395
|
interface CreateSitemapStoreOptions {
|
|
285
396
|
dataSource: DataSource;
|
|
@@ -331,4 +442,4 @@ interface CreateEmptyTypesStoreOptions {
|
|
|
331
442
|
now?: () => number;
|
|
332
443
|
}
|
|
333
444
|
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
334
|
-
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
|
445
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
package/dist/entities.mjs
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
import { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
|
|
2
|
-
export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
|
1
|
+
import { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix } from "./_chunks/entities.mjs";
|
|
2
|
+
export { INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
package/dist/rollups.d.mts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { DataSource, FileSetRef, Row as Row$1 } from "./_chunks/storage.mjs";
|
|
2
|
-
import { ColumnDef } from "./_chunks/schema.mjs";
|
|
2
|
+
import { ColumnDef as ColumnDef$1 } from "./_chunks/schema.mjs";
|
|
3
3
|
import { EngineError } from "./_chunks/errors.mjs";
|
|
4
4
|
import { SearchType } from "gscdump/query";
|
|
5
5
|
import { TenantCtx } from "@gscdump/contracts";
|
|
@@ -70,7 +70,7 @@ interface RollupDef {
|
|
|
70
70
|
* Types map the same way as the fact-table encoder: VARCHAR / DATE go
|
|
71
71
|
* through BYTE_ARRAY/UTF8; BIGINT → INT64; INTEGER → INT32; DOUBLE → DOUBLE.
|
|
72
72
|
*/
|
|
73
|
-
parquetColumns?: readonly ColumnDef[];
|
|
73
|
+
parquetColumns?: readonly ColumnDef$1[];
|
|
74
74
|
/** Sort-key column names for parquet row-group stats. Optional. */
|
|
75
75
|
parquetSortKey?: readonly string[];
|
|
76
76
|
/**
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.26.2",
|
|
4
|
+
"version": "0.26.4",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -190,8 +190,8 @@
|
|
|
190
190
|
"drizzle-orm": "1.0.0-rc.3",
|
|
191
191
|
"icebird": "^0.8.8",
|
|
192
192
|
"proper-lockfile": "^4.1.2",
|
|
193
|
-
"gscdump": "0.26.2",
|
|
194
|
-
"@gscdump/contracts": "0.26.2"
|
|
193
|
+
"@gscdump/contracts": "0.26.4",
|
|
194
|
+
"gscdump": "0.26.4"
|
|
195
195
|
},
|
|
196
196
|
"devDependencies": {
|
|
197
197
|
"@duckdb/duckdb-wasm": "^1.32.0",
|