@gscdump/engine 0.26.3 → 0.26.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/_chunks/engine.mjs
CHANGED
|
@@ -4,7 +4,7 @@ import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from
|
|
|
4
4
|
import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
|
|
5
5
|
import { sqlEscape } from "../sql-bind.mjs";
|
|
6
6
|
import { buildLogicalPlan } from "gscdump/query/plan";
|
|
7
|
-
import { normalizeUrl } from "gscdump";
|
|
7
|
+
import { normalizeUrl } from "gscdump/normalize";
|
|
8
8
|
async function encodeBytes(db, table, rows) {
|
|
9
9
|
const inName = db.makeTempPath("json");
|
|
10
10
|
const outName = db.makeTempPath("parquet");
|
|
@@ -180,6 +180,16 @@ const INSPECTION_EVENT_COLUMNS = [
|
|
|
180
180
|
name: "checkCount",
|
|
181
181
|
type: "INTEGER",
|
|
182
182
|
nullable: true
|
|
183
|
+
},
|
|
184
|
+
{
|
|
185
|
+
name: "nextCheckAfter",
|
|
186
|
+
type: "BIGINT",
|
|
187
|
+
nullable: true
|
|
188
|
+
},
|
|
189
|
+
{
|
|
190
|
+
name: "nextCheckPriority",
|
|
191
|
+
type: "VARCHAR",
|
|
192
|
+
nullable: true
|
|
183
193
|
}
|
|
184
194
|
];
|
|
185
195
|
function createInspectionStore(opts) {
|
|
@@ -710,6 +720,92 @@ function createSitemapStore(opts) {
|
|
|
710
720
|
await ds.write(indexKey, bytes);
|
|
711
721
|
if (consumed.length > 0) await ds.delete(consumed);
|
|
712
722
|
}
|
|
723
|
+
},
|
|
724
|
+
async reconcile(ctx, { liveFeedpaths, at: atOpt }) {
|
|
725
|
+
const at = atOpt ?? now();
|
|
726
|
+
const liveHashes = new Set(liveFeedpaths.map((fp) => hash(fp)));
|
|
727
|
+
const present = /* @__PURE__ */ new Set();
|
|
728
|
+
for (const key of await ds.list(`${sitemapUrlsIndexPrefix(ctx)}/`)) {
|
|
729
|
+
const m = /\/by-feed\/([0-9a-f]+)\/index\.parquet$/.exec(key);
|
|
730
|
+
if (m) present.add(m[1]);
|
|
731
|
+
}
|
|
732
|
+
const deltasByFeed = /* @__PURE__ */ new Map();
|
|
733
|
+
for (const key of await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)) {
|
|
734
|
+
const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
|
|
735
|
+
if (!m) continue;
|
|
736
|
+
present.add(m[2]);
|
|
737
|
+
const list = deltasByFeed.get(m[2]) ?? [];
|
|
738
|
+
list.push(key);
|
|
739
|
+
deltasByFeed.set(m[2], list);
|
|
740
|
+
}
|
|
741
|
+
let feedpathsPruned = 0;
|
|
742
|
+
let urlsRemoved = 0;
|
|
743
|
+
for (const fpHash of present) {
|
|
744
|
+
if (liveHashes.has(fpHash)) continue;
|
|
745
|
+
const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
|
|
746
|
+
const indexBytes = await readOptional(ds, indexKey);
|
|
747
|
+
const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
|
|
748
|
+
const live = /* @__PURE__ */ new Map();
|
|
749
|
+
const removed = /* @__PURE__ */ new Map();
|
|
750
|
+
for (const row of indexRows) {
|
|
751
|
+
const r = rowToUrlRecord(row);
|
|
752
|
+
if (r.removedAt != null) removed.set(r.urlHash, r);
|
|
753
|
+
else live.set(r.urlHash, r);
|
|
754
|
+
}
|
|
755
|
+
const consumed = [];
|
|
756
|
+
for (const key of (deltasByFeed.get(fpHash) ?? []).sort()) {
|
|
757
|
+
const bytes = await readOptional(ds, key);
|
|
758
|
+
if (!bytes) continue;
|
|
759
|
+
consumed.push(key);
|
|
760
|
+
const rows = await decodeParquetToRows(bytes);
|
|
761
|
+
for (const r of rows) {
|
|
762
|
+
const urlHash = String(r.url_hash);
|
|
763
|
+
const dat = Number(r.at);
|
|
764
|
+
if (String(r.op) === "added") {
|
|
765
|
+
const prev = live.get(urlHash) ?? removed.get(urlHash);
|
|
766
|
+
removed.delete(urlHash);
|
|
767
|
+
live.set(urlHash, {
|
|
768
|
+
feedpath: String(r.feedpath),
|
|
769
|
+
feedpathHash: fpHash,
|
|
770
|
+
urlHash,
|
|
771
|
+
loc: String(r.loc),
|
|
772
|
+
lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
|
|
773
|
+
firstSeenAt: prev?.firstSeenAt ?? dat,
|
|
774
|
+
lastSeenAt: dat
|
|
775
|
+
});
|
|
776
|
+
} else if (String(r.op) === "removed") {
|
|
777
|
+
const prev = live.get(urlHash);
|
|
778
|
+
live.delete(urlHash);
|
|
779
|
+
if (prev) removed.set(urlHash, {
|
|
780
|
+
...prev,
|
|
781
|
+
removedAt: dat
|
|
782
|
+
});
|
|
783
|
+
}
|
|
784
|
+
}
|
|
785
|
+
}
|
|
786
|
+
const hadLive = live.size > 0;
|
|
787
|
+
if (!hadLive && consumed.length === 0) continue;
|
|
788
|
+
for (const [urlHash, r] of live) {
|
|
789
|
+
removed.set(urlHash, {
|
|
790
|
+
...r,
|
|
791
|
+
removedAt: at
|
|
792
|
+
});
|
|
793
|
+
urlsRemoved++;
|
|
794
|
+
}
|
|
795
|
+
const merged = [...removed.values()];
|
|
796
|
+
merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
|
|
797
|
+
const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
|
|
798
|
+
columns: URLS_INDEX_COLUMNS,
|
|
799
|
+
sortKey: ["feedpath_hash", "url_hash"]
|
|
800
|
+
});
|
|
801
|
+
await ds.write(indexKey, bytes);
|
|
802
|
+
if (consumed.length > 0) await ds.delete(consumed);
|
|
803
|
+
if (hadLive) feedpathsPruned++;
|
|
804
|
+
}
|
|
805
|
+
return {
|
|
806
|
+
feedpathsPruned,
|
|
807
|
+
urlsRemoved
|
|
808
|
+
};
|
|
713
809
|
}
|
|
714
810
|
};
|
|
715
811
|
}
|
|
@@ -3,8 +3,8 @@ import { enumeratePartitions } from "./compaction.mjs";
|
|
|
3
3
|
import { escapeLike } from "../sql-fragments.mjs";
|
|
4
4
|
import "../planner.mjs";
|
|
5
5
|
import { UnresolvableDatasetError, buildLogicalComparisonPlan, buildLogicalPlan, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
|
|
6
|
-
import { normalizeUrl } from "gscdump";
|
|
7
6
|
import { PgDialect, pgTable, varchar } from "drizzle-orm/pg-core";
|
|
7
|
+
import { normalizeUrl } from "gscdump/normalize";
|
|
8
8
|
import { sql } from "drizzle-orm";
|
|
9
9
|
const DIMENSION_SURFACES = {
|
|
10
10
|
page: ["api", "stored"],
|
package/dist/adapters/node.mjs
CHANGED
|
@@ -2,9 +2,9 @@ import { engineErrors } from "../errors.mjs";
|
|
|
2
2
|
import { createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "../_chunks/engine.mjs";
|
|
3
3
|
import { createNodeDuckDBHandle, resetNodeDuckDB } from "./duckdb-node.mjs";
|
|
4
4
|
import { createFilesystemDataSource, createFilesystemManifestStore } from "./filesystem.mjs";
|
|
5
|
-
import { encodeSiteId } from "gscdump";
|
|
6
5
|
import { err, ok, unwrapResult } from "gscdump/result";
|
|
7
6
|
import path from "node:path";
|
|
7
|
+
import { encodeSiteId } from "gscdump/tenant";
|
|
8
8
|
function createNodeHarness(opts) {
|
|
9
9
|
const dataDir = opts.dataDir;
|
|
10
10
|
const userId = opts.userId ?? "local";
|
package/dist/entities.d.mts
CHANGED
|
@@ -138,6 +138,14 @@ interface InspectionEventRow extends InspectionParquetRow {
|
|
|
138
138
|
firstCheckedAt: string | null;
|
|
139
139
|
/** Total number of inspections recorded for this url. */
|
|
140
140
|
checkCount: number | null;
|
|
141
|
+
/**
|
|
142
|
+
* Stored next-recheck unix-seconds + priority as computed AT INSPECT TIME.
|
|
143
|
+
* Carried verbatim (NOT recomputed at read) because the scheduling policy can
|
|
144
|
+
* change over time — `__gsc/inspections` must replay the historical value to
|
|
145
|
+
* keep its frozen wire shape byte-stable.
|
|
146
|
+
*/
|
|
147
|
+
nextCheckAfter: number | null;
|
|
148
|
+
nextCheckPriority: string | null;
|
|
141
149
|
}
|
|
142
150
|
/**
|
|
143
151
|
* Hard cap on a single `appendHistory` shard payload. Encoded bytes >
|
|
@@ -311,6 +319,12 @@ interface SnapshotUrlsResult {
|
|
|
311
319
|
/** True when contentHash matched prior; the call performed zero writes. */
|
|
312
320
|
unchanged: boolean;
|
|
313
321
|
}
|
|
322
|
+
interface ReconcileResult {
|
|
323
|
+
/** Feedpaths that were absent from the live set and had their live URLs pruned. */
|
|
324
|
+
feedpathsPruned: number;
|
|
325
|
+
/** Total URL rows transitioned live → removed across pruned feedpaths. */
|
|
326
|
+
urlsRemoved: number;
|
|
327
|
+
}
|
|
314
328
|
interface DeltaEntry {
|
|
315
329
|
feedpath: string;
|
|
316
330
|
feedpathHash: string;
|
|
@@ -362,6 +376,21 @@ interface SitemapStore {
|
|
|
362
376
|
* regardless of total site URL count.
|
|
363
377
|
*/
|
|
364
378
|
compactUrls: (ctx: TenantCtx) => Promise<void>;
|
|
379
|
+
/**
|
|
380
|
+
* Site-wide convergence: mark every still-live URL whose owning feedpath is
|
|
381
|
+
* absent from `liveFeedpaths` as removed. `compactUrls`/`snapshotUrls` only
|
|
382
|
+
* prune URLs *inside* a feedpath that was re-observed; a whole feed dropped
|
|
383
|
+
* from the sitemap list (no `snapshotUrls` call) leaves its URLs frozen-live
|
|
384
|
+
* forever. This is the sidecar mirror of the D1 generation sweep: it rewrites
|
|
385
|
+
* each dropped feedpath's `by-feed/<hash>/index.parquet` with `removedAt` set
|
|
386
|
+
* and deletes its outstanding deltas (write-new-base + delete-deltas,
|
|
387
|
+
* ADR-0002). Bounded per feedpath, so memory stays flat regardless of site
|
|
388
|
+
* size. Live feedpaths are never touched.
|
|
389
|
+
*/
|
|
390
|
+
reconcile: (ctx: TenantCtx, opts: {
|
|
391
|
+
liveFeedpaths: readonly string[];
|
|
392
|
+
at?: number;
|
|
393
|
+
}) => Promise<ReconcileResult>;
|
|
365
394
|
}
|
|
366
395
|
interface CreateSitemapStoreOptions {
|
|
367
396
|
dataSource: DataSource;
|
|
@@ -413,4 +442,4 @@ interface CreateEmptyTypesStoreOptions {
|
|
|
413
442
|
now?: () => number;
|
|
414
443
|
}
|
|
415
444
|
declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
|
|
416
|
-
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
|
445
|
+
export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gscdump/engine",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.26.3",
|
|
4
|
+
"version": "0.26.6",
|
|
5
5
|
"description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
|
|
6
6
|
"author": {
|
|
7
7
|
"name": "Harlan Wilton",
|
|
@@ -190,8 +190,8 @@
|
|
|
190
190
|
"drizzle-orm": "1.0.0-rc.3",
|
|
191
191
|
"icebird": "^0.8.8",
|
|
192
192
|
"proper-lockfile": "^4.1.2",
|
|
193
|
-
"gscdump": "0.26.3",
|
|
194
|
-
"
|
|
193
|
+
"@gscdump/contracts": "0.26.6",
|
|
194
|
+
"gscdump": "0.26.6"
|
|
195
195
|
},
|
|
196
196
|
"devDependencies": {
|
|
197
197
|
"@duckdb/duckdb-wasm": "^1.32.0",
|