@gscdump/engine 0.26.3 → 0.26.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@ import { compactTieredImpl, dedupeOverlappingTiers, splitOverlappingTiers } from
4
4
  import { compileLogicalQueryPlan, substituteNamedFiles } from "./parquet-plan.mjs";
5
5
  import { sqlEscape } from "../sql-bind.mjs";
6
6
  import { buildLogicalPlan } from "gscdump/query/plan";
7
- import { normalizeUrl } from "gscdump";
7
+ import { normalizeUrl } from "gscdump/normalize";
8
8
  async function encodeBytes(db, table, rows) {
9
9
  const inName = db.makeTempPath("json");
10
10
  const outName = db.makeTempPath("parquet");
@@ -180,6 +180,16 @@ const INSPECTION_EVENT_COLUMNS = [
180
180
  name: "checkCount",
181
181
  type: "INTEGER",
182
182
  nullable: true
183
+ },
184
+ {
185
+ name: "nextCheckAfter",
186
+ type: "BIGINT",
187
+ nullable: true
188
+ },
189
+ {
190
+ name: "nextCheckPriority",
191
+ type: "VARCHAR",
192
+ nullable: true
183
193
  }
184
194
  ];
185
195
  function createInspectionStore(opts) {
@@ -710,6 +720,92 @@ function createSitemapStore(opts) {
710
720
  await ds.write(indexKey, bytes);
711
721
  if (consumed.length > 0) await ds.delete(consumed);
712
722
  }
723
+ },
724
+ async reconcile(ctx, { liveFeedpaths, at: atOpt }) {
725
+ const at = atOpt ?? now();
726
+ const liveHashes = new Set(liveFeedpaths.map((fp) => hash(fp)));
727
+ const present = /* @__PURE__ */ new Set();
728
+ for (const key of await ds.list(`${sitemapUrlsIndexPrefix(ctx)}/`)) {
729
+ const m = /\/by-feed\/([0-9a-f]+)\/index\.parquet$/.exec(key);
730
+ if (m) present.add(m[1]);
731
+ }
732
+ const deltasByFeed = /* @__PURE__ */ new Map();
733
+ for (const key of await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)) {
734
+ const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
735
+ if (!m) continue;
736
+ present.add(m[2]);
737
+ const list = deltasByFeed.get(m[2]) ?? [];
738
+ list.push(key);
739
+ deltasByFeed.set(m[2], list);
740
+ }
741
+ let feedpathsPruned = 0;
742
+ let urlsRemoved = 0;
743
+ for (const fpHash of present) {
744
+ if (liveHashes.has(fpHash)) continue;
745
+ const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
746
+ const indexBytes = await readOptional(ds, indexKey);
747
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
748
+ const live = /* @__PURE__ */ new Map();
749
+ const removed = /* @__PURE__ */ new Map();
750
+ for (const row of indexRows) {
751
+ const r = rowToUrlRecord(row);
752
+ if (r.removedAt != null) removed.set(r.urlHash, r);
753
+ else live.set(r.urlHash, r);
754
+ }
755
+ const consumed = [];
756
+ for (const key of (deltasByFeed.get(fpHash) ?? []).sort()) {
757
+ const bytes = await readOptional(ds, key);
758
+ if (!bytes) continue;
759
+ consumed.push(key);
760
+ const rows = await decodeParquetToRows(bytes);
761
+ for (const r of rows) {
762
+ const urlHash = String(r.url_hash);
763
+ const dat = Number(r.at);
764
+ if (String(r.op) === "added") {
765
+ const prev = live.get(urlHash) ?? removed.get(urlHash);
766
+ removed.delete(urlHash);
767
+ live.set(urlHash, {
768
+ feedpath: String(r.feedpath),
769
+ feedpathHash: fpHash,
770
+ urlHash,
771
+ loc: String(r.loc),
772
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
773
+ firstSeenAt: prev?.firstSeenAt ?? dat,
774
+ lastSeenAt: dat
775
+ });
776
+ } else if (String(r.op) === "removed") {
777
+ const prev = live.get(urlHash);
778
+ live.delete(urlHash);
779
+ if (prev) removed.set(urlHash, {
780
+ ...prev,
781
+ removedAt: dat
782
+ });
783
+ }
784
+ }
785
+ }
786
+ const hadLive = live.size > 0;
787
+ if (!hadLive && consumed.length === 0) continue;
788
+ for (const [urlHash, r] of live) {
789
+ removed.set(urlHash, {
790
+ ...r,
791
+ removedAt: at
792
+ });
793
+ urlsRemoved++;
794
+ }
795
+ const merged = [...removed.values()];
796
+ merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
797
+ const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
798
+ columns: URLS_INDEX_COLUMNS,
799
+ sortKey: ["feedpath_hash", "url_hash"]
800
+ });
801
+ await ds.write(indexKey, bytes);
802
+ if (consumed.length > 0) await ds.delete(consumed);
803
+ if (hadLive) feedpathsPruned++;
804
+ }
805
+ return {
806
+ feedpathsPruned,
807
+ urlsRemoved
808
+ };
713
809
  }
714
810
  };
715
811
  }
@@ -3,8 +3,8 @@ import { enumeratePartitions } from "./compaction.mjs";
3
3
  import { escapeLike } from "../sql-fragments.mjs";
4
4
  import "../planner.mjs";
5
5
  import { UnresolvableDatasetError, buildLogicalComparisonPlan, buildLogicalPlan, inferDataset as inferLogicalDataset, isDatasetResolvable } from "gscdump/query/plan";
6
- import { normalizeUrl } from "gscdump";
7
6
  import { PgDialect, pgTable, varchar } from "drizzle-orm/pg-core";
7
+ import { normalizeUrl } from "gscdump/normalize";
8
8
  import { sql } from "drizzle-orm";
9
9
  const DIMENSION_SURFACES = {
10
10
  page: ["api", "stored"],
@@ -2,9 +2,9 @@ import { engineErrors } from "../errors.mjs";
2
2
  import { createDuckDBCodec, createDuckDBExecutor, createStorageEngine } from "../_chunks/engine.mjs";
3
3
  import { createNodeDuckDBHandle, resetNodeDuckDB } from "./duckdb-node.mjs";
4
4
  import { createFilesystemDataSource, createFilesystemManifestStore } from "./filesystem.mjs";
5
- import { encodeSiteId } from "gscdump";
6
5
  import { err, ok, unwrapResult } from "gscdump/result";
7
6
  import path from "node:path";
7
+ import { encodeSiteId } from "gscdump/tenant";
8
8
  function createNodeHarness(opts) {
9
9
  const dataDir = opts.dataDir;
10
10
  const userId = opts.userId ?? "local";
@@ -138,6 +138,14 @@ interface InspectionEventRow extends InspectionParquetRow {
138
138
  firstCheckedAt: string | null;
139
139
  /** Total number of inspections recorded for this url. */
140
140
  checkCount: number | null;
141
+ /**
142
+ * Stored next-recheck unix-seconds + priority as computed AT INSPECT TIME.
143
+ * Carried verbatim (NOT recomputed at read) because the scheduling policy can
144
+ * change over time — `__gsc/inspections` must replay the historical value to
145
+ * keep its frozen wire shape byte-stable.
146
+ */
147
+ nextCheckAfter: number | null;
148
+ nextCheckPriority: string | null;
141
149
  }
142
150
  /**
143
151
  * Hard cap on a single `appendHistory` shard payload. Encoded bytes >
@@ -311,6 +319,12 @@ interface SnapshotUrlsResult {
311
319
  /** True when contentHash matched prior; the call performed zero writes. */
312
320
  unchanged: boolean;
313
321
  }
322
+ interface ReconcileResult {
323
+ /** Feedpaths that were absent from the live set and had their live URLs pruned. */
324
+ feedpathsPruned: number;
325
+ /** Total URL rows transitioned live → removed across pruned feedpaths. */
326
+ urlsRemoved: number;
327
+ }
314
328
  interface DeltaEntry {
315
329
  feedpath: string;
316
330
  feedpathHash: string;
@@ -362,6 +376,21 @@ interface SitemapStore {
362
376
  * regardless of total site URL count.
363
377
  */
364
378
  compactUrls: (ctx: TenantCtx) => Promise<void>;
379
+ /**
380
+ * Site-wide convergence: mark every still-live URL whose owning feedpath is
381
+ * absent from `liveFeedpaths` as removed. `compactUrls`/`snapshotUrls` only
382
+ * prune URLs *inside* a feedpath that was re-observed; a whole feed dropped
383
+ * from the sitemap list (no `snapshotUrls` call) leaves its URLs frozen-live
384
+ * forever. This is the sidecar mirror of the D1 generation sweep: it rewrites
385
+ * each dropped feedpath's `by-feed/<hash>/index.parquet` with `removedAt` set
386
+ * and deletes its outstanding deltas (write-new-base + delete-deltas,
387
+ * ADR-0002). Bounded per feedpath, so memory stays flat regardless of site
388
+ * size. Live feedpaths are never touched.
389
+ */
390
+ reconcile: (ctx: TenantCtx, opts: {
391
+ liveFeedpaths: readonly string[];
392
+ at?: number;
393
+ }) => Promise<ReconcileResult>;
365
394
  }
366
395
  interface CreateSitemapStoreOptions {
367
396
  dataSource: DataSource;
@@ -413,4 +442,4 @@ interface CreateEmptyTypesStoreOptions {
413
442
  now?: () => number;
414
443
  }
415
444
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
416
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
445
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_EVENT_COLUMNS, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionEventRow, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, ReconcileResult, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionBaseKey, inspectionEventKey, inspectionEventsPrefix, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.26.3",
4
+ "version": "0.26.6",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -190,8 +190,8 @@
190
190
  "drizzle-orm": "1.0.0-rc.3",
191
191
  "icebird": "^0.8.8",
192
192
  "proper-lockfile": "^4.1.2",
193
- "gscdump": "0.26.3",
194
- "@gscdump/contracts": "0.26.3"
193
+ "@gscdump/contracts": "0.26.6",
194
+ "gscdump": "0.26.6"
195
195
  },
196
196
  "devDependencies": {
197
197
  "@duckdb/duckdb-wasm": "^1.32.0",