@gscdump/engine 0.19.6 → 0.19.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -200,7 +200,8 @@ interface SitemapHistoryDoc {
200
200
  }
201
201
  declare function sitemapIndexKey(ctx: TenantCtx): string;
202
202
  declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
203
- declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
203
+ declare function sitemapUrlsIndexPrefix(ctx: TenantCtx): string;
204
+ declare function sitemapUrlsIndexKey(ctx: TenantCtx, feedpathHash: string): string;
204
205
  declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
205
206
  /** Parsed URL entry from a sitemap XML. */
206
207
  interface ParsedUrl {
@@ -273,12 +274,12 @@ interface SitemapStore {
273
274
  /** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
274
275
  loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
275
276
  /**
276
- * Fold every accumulated delta into the prior index; writes a fresh
277
- * `urls/index.parquet` and deletes the consumed delta files.
277
+ * Fold accumulated deltas into the prior index, one feedpath at a time:
278
+ * rewrites each touched feedpath's `by-feed/<hash>/index.parquet` and deletes
279
+ * the consumed delta files. Bounded per feedpath, so it stays within memory
280
+ * regardless of total site URL count.
278
281
  */
279
282
  compactUrls: (ctx: TenantCtx) => Promise<void>;
280
- /** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
281
- urlsParquetUri: (ctx: TenantCtx) => string | undefined;
282
283
  }
283
284
  interface CreateSitemapStoreOptions {
284
285
  dataSource: DataSource;
@@ -330,4 +331,4 @@ interface CreateEmptyTypesStoreOptions {
330
331
  now?: () => number;
331
332
  }
332
333
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
333
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
334
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/entities.mjs CHANGED
@@ -190,8 +190,11 @@ function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
190
190
  function sitemapUrlsPrefix(ctx) {
191
191
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/urls` : `u_${ctx.userId}/entities/sitemaps/urls`;
192
192
  }
193
- function sitemapUrlsIndexKey(ctx) {
194
- return `${sitemapUrlsPrefix(ctx)}/index.parquet`;
193
+ function sitemapUrlsIndexPrefix(ctx) {
194
+ return `${sitemapUrlsPrefix(ctx)}/by-feed`;
195
+ }
196
+ function sitemapUrlsIndexKey(ctx, feedpathHash) {
197
+ return `${sitemapUrlsIndexPrefix(ctx)}/${feedpathHash}/index.parquet`;
195
198
  }
196
199
  function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
197
200
  return `${sitemapUrlsPrefix(ctx)}/deltas/${date}__${feedpathHash}.parquet`;
@@ -417,13 +420,12 @@ function createSitemapStore(opts) {
417
420
  async *loadUrls(ctx, feedpath, opts) {
418
421
  const fpHash = hash(feedpath);
419
422
  const includeRemoved = opts?.includeRemoved ?? false;
420
- const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
421
- const indexRows = indexBytes ? await decodeParquetToRows(indexBytes, { filter: { feedpath_hash: { $eq: fpHash } } }) : [];
423
+ const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx, fpHash)).catch(() => void 0);
424
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
422
425
  const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
423
426
  const live = /* @__PURE__ */ new Map();
424
427
  const removedMap = /* @__PURE__ */ new Map();
425
428
  for (const row of indexRows) {
426
- if (row.feedpath_hash !== fpHash) continue;
427
429
  const rec = rowToUrlRecord(row);
428
430
  if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
429
431
  else live.set(rec.urlHash, rec);
@@ -492,71 +494,67 @@ function createSitemapStore(opts) {
492
494
  }
493
495
  },
494
496
  async compactUrls(ctx) {
495
- const indexKey = sitemapUrlsIndexKey(ctx);
496
- const indexBytes = await ds.read(indexKey).catch(() => void 0);
497
- const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
498
- const stateKey = (fp, u) => `${fp}::${u}`;
499
- const live = /* @__PURE__ */ new Map();
500
- const removed = /* @__PURE__ */ new Map();
501
- for (const row of indexRows) {
502
- const rec = rowToUrlRecord(row);
503
- const k = stateKey(rec.feedpathHash, rec.urlHash);
504
- if (rec.removedAt != null) removed.set(k, rec);
505
- else live.set(k, rec);
506
- }
507
- const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
508
- const consumed = [];
497
+ const deltaKeys = await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`);
498
+ const deltasByFeed = /* @__PURE__ */ new Map();
509
499
  for (const key of deltaKeys) {
510
500
  const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
511
501
  if (!m) continue;
512
- const fpHash = m[2];
513
- const bytes = await ds.read(key).catch(() => void 0);
514
- if (!bytes) continue;
515
- consumed.push(key);
516
- const rows = await decodeParquetToRows(bytes);
517
- for (const r of rows) {
518
- const urlHash = String(r.url_hash);
519
- const at = Number(r.at);
520
- const k = stateKey(fpHash, urlHash);
521
- const op = String(r.op);
522
- if (op === "added") {
523
- const prev = live.get(k) ?? removed.get(k);
524
- removed.delete(k);
525
- live.set(k, {
526
- feedpath: String(r.feedpath),
527
- feedpathHash: fpHash,
528
- urlHash,
529
- loc: String(r.loc),
530
- lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
531
- firstSeenAt: prev?.firstSeenAt ?? at,
532
- lastSeenAt: at
533
- });
534
- } else if (op === "removed") {
535
- const prev = live.get(k);
536
- live.delete(k);
537
- if (prev) removed.set(k, {
538
- ...prev,
539
- removedAt: at
540
- });
502
+ const list = deltasByFeed.get(m[2]) ?? [];
503
+ list.push(key);
504
+ deltasByFeed.set(m[2], list);
505
+ }
506
+ for (const [fpHash, feedDeltaKeys] of deltasByFeed) {
507
+ const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
508
+ const indexBytes = await ds.read(indexKey).catch(() => void 0);
509
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
510
+ const live = /* @__PURE__ */ new Map();
511
+ const removed = /* @__PURE__ */ new Map();
512
+ for (const row of indexRows) {
513
+ const rec = rowToUrlRecord(row);
514
+ if (rec.removedAt != null) removed.set(rec.urlHash, rec);
515
+ else live.set(rec.urlHash, rec);
516
+ }
517
+ const consumed = [];
518
+ for (const key of feedDeltaKeys.sort()) {
519
+ const bytes = await ds.read(key).catch(() => void 0);
520
+ if (!bytes) continue;
521
+ consumed.push(key);
522
+ const rows = await decodeParquetToRows(bytes);
523
+ for (const r of rows) {
524
+ const urlHash = String(r.url_hash);
525
+ const at = Number(r.at);
526
+ const op = String(r.op);
527
+ if (op === "added") {
528
+ const prev = live.get(urlHash) ?? removed.get(urlHash);
529
+ removed.delete(urlHash);
530
+ live.set(urlHash, {
531
+ feedpath: String(r.feedpath),
532
+ feedpathHash: fpHash,
533
+ urlHash,
534
+ loc: String(r.loc),
535
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
536
+ firstSeenAt: prev?.firstSeenAt ?? at,
537
+ lastSeenAt: at
538
+ });
539
+ } else if (op === "removed") {
540
+ const prev = live.get(urlHash);
541
+ live.delete(urlHash);
542
+ if (prev) removed.set(urlHash, {
543
+ ...prev,
544
+ removedAt: at
545
+ });
546
+ }
541
547
  }
542
548
  }
549
+ const merged = [...live.values(), ...removed.values()];
550
+ merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
551
+ const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
552
+ columns: URLS_INDEX_COLUMNS,
553
+ sortKey: ["feedpath_hash", "url_hash"]
554
+ });
555
+ await ds.write(indexKey, bytes);
556
+ if (consumed.length > 0) await ds.delete(consumed);
543
557
  }
544
- const merged = [...live.values(), ...removed.values()];
545
- merged.sort((a, b) => {
546
- if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
547
- if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
548
- return 0;
549
- });
550
- const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
551
- columns: URLS_INDEX_COLUMNS,
552
- sortKey: ["feedpath_hash", "url_hash"]
553
- });
554
- await ds.write(indexKey, bytes);
555
- if (consumed.length > 0) await ds.delete(consumed);
556
- },
557
- urlsParquetUri(ctx) {
558
- const key = sitemapUrlsIndexKey(ctx);
559
- return ds.uri ? ds.uri(key) : void 0;
560
558
  }
561
559
  };
562
560
  }
@@ -640,4 +638,4 @@ function createEmptyTypesStore(opts) {
640
638
  }
641
639
  };
642
640
  }
643
- export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
641
+ export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/rollups.mjs CHANGED
@@ -1,5 +1,5 @@
1
1
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
2
- import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexKey } from "./entities.mjs";
2
+ import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
3
3
  import { MS_PER_DAY } from "gscdump";
4
4
  function rollupPrefix(ctx, searchType) {
5
5
  const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -612,8 +612,8 @@ const indexPercentRollup = {
612
612
  windowDays: 90,
613
613
  sliceOrthogonal: true,
614
614
  async build({ engine, ctx, dataSource, builtAt, searchType }) {
615
- const urlsKey = sitemapUrlsIndexKey(ctx);
616
- if (!await dataSource.head?.(urlsKey)) return {
615
+ const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
616
+ if (urlsKeys.length === 0) return {
617
617
  totalSitemapUrls: 0,
618
618
  days: []
619
619
  };
@@ -633,7 +633,7 @@ const indexPercentRollup = {
633
633
  },
634
634
  URLS: {
635
635
  table: "pages",
636
- keys: [urlsKey]
636
+ keys: urlsKeys
637
637
  }
638
638
  },
639
639
  ...searchType !== void 0 ? { searchType } : {},
@@ -654,7 +654,7 @@ const indexPercentRollup = {
654
654
  table: "pages",
655
655
  fileSets: { URLS: {
656
656
  table: "pages",
657
- keys: [urlsKey]
657
+ keys: urlsKeys
658
658
  } },
659
659
  sql: `
660
660
  SELECT COUNT(*)::BIGINT AS total
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.19.6",
4
+ "version": "0.19.7",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.19.6",
173
- "@gscdump/contracts": "0.19.6"
172
+ "gscdump": "0.19.7",
173
+ "@gscdump/contracts": "0.19.7"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",