@gscdump/engine 0.19.6 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -200,7 +200,8 @@ interface SitemapHistoryDoc {
200
200
  }
201
201
  declare function sitemapIndexKey(ctx: TenantCtx): string;
202
202
  declare function sitemapHistoryKey(ctx: TenantCtx, feedpathHash: string, capturedAtMs: number): string;
203
- declare function sitemapUrlsIndexKey(ctx: TenantCtx): string;
203
+ declare function sitemapUrlsIndexPrefix(ctx: TenantCtx): string;
204
+ declare function sitemapUrlsIndexKey(ctx: TenantCtx, feedpathHash: string): string;
204
205
  declare function sitemapUrlsDeltaKey(ctx: TenantCtx, feedpathHash: string, date: string): string;
205
206
  /** Parsed URL entry from a sitemap XML. */
206
207
  interface ParsedUrl {
@@ -273,12 +274,12 @@ interface SitemapStore {
273
274
  /** Stream all delta entries within `[from, to]` (YYYY-MM-DD inclusive). */
274
275
  loadDeltas: (ctx: TenantCtx, dateRange?: DateRange) => AsyncIterable<DeltaEntry>;
275
276
  /**
276
- * Fold every accumulated delta into the prior index; writes a fresh
277
- * `urls/index.parquet` and deletes the consumed delta files.
277
+ * Fold accumulated deltas into the prior index, one feedpath at a time:
278
+ * rewrites each touched feedpath's `by-feed/<hash>/index.parquet` and deletes
279
+ * the consumed delta files. Bounded per feedpath, so it stays within memory
280
+ * regardless of total site URL count.
278
281
  */
279
282
  compactUrls: (ctx: TenantCtx) => Promise<void>;
280
- /** DuckDB-resolvable URI for the URLs index; `undefined` if backend lacks one. */
281
- urlsParquetUri: (ctx: TenantCtx) => string | undefined;
282
283
  }
283
284
  interface CreateSitemapStoreOptions {
284
285
  dataSource: DataSource;
@@ -330,4 +331,4 @@ interface CreateEmptyTypesStoreOptions {
330
331
  now?: () => number;
331
332
  }
332
333
  declare function createEmptyTypesStore(opts: CreateEmptyTypesStoreOptions): EmptyTypesStore;
333
- export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
334
+ export { CreateEmptyTypesStoreOptions, CreateIndexingMetadataStoreOptions, CreateInspectionStoreOptions, CreateSitemapStoreOptions, DateRange, DeltaEntry, EmptyTypesDoc, EmptyTypesStore, INSPECTION_HISTORY_MAX_BYTES, IndexingMetadataIndex, IndexingMetadataRecord, IndexingMetadataStore, InspectionHistoryShard, InspectionIndex, InspectionParquetRow, InspectionRecord, InspectionStore, LoadUrlsOptions, ParsedUrl, SitemapHistoryDoc, SitemapIndex, SitemapRecord, SitemapStore, SitemapUrlRecord, SnapshotUrlsResult, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
package/dist/entities.mjs CHANGED
@@ -190,8 +190,11 @@ function sitemapHistoryKey(ctx, feedpathHash, capturedAtMs) {
190
190
  function sitemapUrlsPrefix(ctx) {
191
191
  return ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/entities/sitemaps/urls` : `u_${ctx.userId}/entities/sitemaps/urls`;
192
192
  }
193
- function sitemapUrlsIndexKey(ctx) {
194
- return `${sitemapUrlsPrefix(ctx)}/index.parquet`;
193
+ function sitemapUrlsIndexPrefix(ctx) {
194
+ return `${sitemapUrlsPrefix(ctx)}/by-feed`;
195
+ }
196
+ function sitemapUrlsIndexKey(ctx, feedpathHash) {
197
+ return `${sitemapUrlsIndexPrefix(ctx)}/${feedpathHash}/index.parquet`;
195
198
  }
196
199
  function sitemapUrlsDeltaKey(ctx, feedpathHash, date) {
197
200
  return `${sitemapUrlsPrefix(ctx)}/deltas/${date}__${feedpathHash}.parquet`;
@@ -417,13 +420,12 @@ function createSitemapStore(opts) {
417
420
  async *loadUrls(ctx, feedpath, opts) {
418
421
  const fpHash = hash(feedpath);
419
422
  const includeRemoved = opts?.includeRemoved ?? false;
420
- const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx)).catch(() => void 0);
421
- const indexRows = indexBytes ? await decodeParquetToRows(indexBytes, { filter: { feedpath_hash: { $eq: fpHash } } }) : [];
423
+ const indexBytes = await ds.read(sitemapUrlsIndexKey(ctx, fpHash)).catch(() => void 0);
424
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
422
425
  const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
423
426
  const live = /* @__PURE__ */ new Map();
424
427
  const removedMap = /* @__PURE__ */ new Map();
425
428
  for (const row of indexRows) {
426
- if (row.feedpath_hash !== fpHash) continue;
427
429
  const rec = rowToUrlRecord(row);
428
430
  if (rec.removedAt != null) removedMap.set(rec.urlHash, rec);
429
431
  else live.set(rec.urlHash, rec);
@@ -492,71 +494,67 @@ function createSitemapStore(opts) {
492
494
  }
493
495
  },
494
496
  async compactUrls(ctx) {
495
- const indexKey = sitemapUrlsIndexKey(ctx);
496
- const indexBytes = await ds.read(indexKey).catch(() => void 0);
497
- const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
498
- const stateKey = (fp, u) => `${fp}::${u}`;
499
- const live = /* @__PURE__ */ new Map();
500
- const removed = /* @__PURE__ */ new Map();
501
- for (const row of indexRows) {
502
- const rec = rowToUrlRecord(row);
503
- const k = stateKey(rec.feedpathHash, rec.urlHash);
504
- if (rec.removedAt != null) removed.set(k, rec);
505
- else live.set(k, rec);
506
- }
507
- const deltaKeys = (await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`)).sort();
508
- const consumed = [];
497
+ const deltaKeys = await ds.list(`${sitemapUrlsPrefix(ctx)}/deltas/`);
498
+ const deltasByFeed = /* @__PURE__ */ new Map();
509
499
  for (const key of deltaKeys) {
510
500
  const m = SITEMAP_URLS_DELTA_PREFIX_RE.exec(key);
511
501
  if (!m) continue;
512
- const fpHash = m[2];
513
- const bytes = await ds.read(key).catch(() => void 0);
514
- if (!bytes) continue;
515
- consumed.push(key);
516
- const rows = await decodeParquetToRows(bytes);
517
- for (const r of rows) {
518
- const urlHash = String(r.url_hash);
519
- const at = Number(r.at);
520
- const k = stateKey(fpHash, urlHash);
521
- const op = String(r.op);
522
- if (op === "added") {
523
- const prev = live.get(k) ?? removed.get(k);
524
- removed.delete(k);
525
- live.set(k, {
526
- feedpath: String(r.feedpath),
527
- feedpathHash: fpHash,
528
- urlHash,
529
- loc: String(r.loc),
530
- lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
531
- firstSeenAt: prev?.firstSeenAt ?? at,
532
- lastSeenAt: at
533
- });
534
- } else if (op === "removed") {
535
- const prev = live.get(k);
536
- live.delete(k);
537
- if (prev) removed.set(k, {
538
- ...prev,
539
- removedAt: at
540
- });
502
+ const list = deltasByFeed.get(m[2]) ?? [];
503
+ list.push(key);
504
+ deltasByFeed.set(m[2], list);
505
+ }
506
+ for (const [fpHash, feedDeltaKeys] of deltasByFeed) {
507
+ const indexKey = sitemapUrlsIndexKey(ctx, fpHash);
508
+ const indexBytes = await ds.read(indexKey).catch(() => void 0);
509
+ const indexRows = indexBytes ? await decodeParquetToRows(indexBytes) : [];
510
+ const live = /* @__PURE__ */ new Map();
511
+ const removed = /* @__PURE__ */ new Map();
512
+ for (const row of indexRows) {
513
+ const rec = rowToUrlRecord(row);
514
+ if (rec.removedAt != null) removed.set(rec.urlHash, rec);
515
+ else live.set(rec.urlHash, rec);
516
+ }
517
+ const consumed = [];
518
+ for (const key of feedDeltaKeys.sort()) {
519
+ const bytes = await ds.read(key).catch(() => void 0);
520
+ if (!bytes) continue;
521
+ consumed.push(key);
522
+ const rows = await decodeParquetToRows(bytes);
523
+ for (const r of rows) {
524
+ const urlHash = String(r.url_hash);
525
+ const at = Number(r.at);
526
+ const op = String(r.op);
527
+ if (op === "added") {
528
+ const prev = live.get(urlHash) ?? removed.get(urlHash);
529
+ removed.delete(urlHash);
530
+ live.set(urlHash, {
531
+ feedpath: String(r.feedpath),
532
+ feedpathHash: fpHash,
533
+ urlHash,
534
+ loc: String(r.loc),
535
+ lastmod: r.lastmod == null ? void 0 : String(r.lastmod),
536
+ firstSeenAt: prev?.firstSeenAt ?? at,
537
+ lastSeenAt: at
538
+ });
539
+ } else if (op === "removed") {
540
+ const prev = live.get(urlHash);
541
+ live.delete(urlHash);
542
+ if (prev) removed.set(urlHash, {
543
+ ...prev,
544
+ removedAt: at
545
+ });
546
+ }
541
547
  }
542
548
  }
549
+ const merged = [...live.values(), ...removed.values()];
550
+ merged.sort((a, b) => a.urlHash < b.urlHash ? -1 : a.urlHash > b.urlHash ? 1 : 0);
551
+ const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
552
+ columns: URLS_INDEX_COLUMNS,
553
+ sortKey: ["feedpath_hash", "url_hash"]
554
+ });
555
+ await ds.write(indexKey, bytes);
556
+ if (consumed.length > 0) await ds.delete(consumed);
543
557
  }
544
- const merged = [...live.values(), ...removed.values()];
545
- merged.sort((a, b) => {
546
- if (a.feedpathHash !== b.feedpathHash) return a.feedpathHash < b.feedpathHash ? -1 : 1;
547
- if (a.urlHash !== b.urlHash) return a.urlHash < b.urlHash ? -1 : 1;
548
- return 0;
549
- });
550
- const bytes = encodeRowsToParquetFlex(merged.map(urlRecordToRow), {
551
- columns: URLS_INDEX_COLUMNS,
552
- sortKey: ["feedpath_hash", "url_hash"]
553
- });
554
- await ds.write(indexKey, bytes);
555
- if (consumed.length > 0) await ds.delete(consumed);
556
- },
557
- urlsParquetUri(ctx) {
558
- const key = sitemapUrlsIndexKey(ctx);
559
- return ds.uri ? ds.uri(key) : void 0;
560
558
  }
561
559
  };
562
560
  }
@@ -640,4 +638,4 @@ function createEmptyTypesStore(opts) {
640
638
  }
641
639
  };
642
640
  }
643
- export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey };
641
+ export { INSPECTION_HISTORY_MAX_BYTES, createEmptyTypesStore, createIndexingMetadataStore, createInspectionStore, createSitemapStore, emptyTypesKey, hashUrl, hashUrlList, indexingMetadataIndexKey, inspectionHistoryPrefix, inspectionHistoryShardKey, inspectionIndexKey, inspectionParquetKey, sitemapHistoryKey, sitemapIndexKey, sitemapUrlsDeltaKey, sitemapUrlsIndexKey, sitemapUrlsIndexPrefix };
@@ -119,7 +119,8 @@ interface RunSQLFn {
119
119
  table: TableName$1;
120
120
  fileSets: Record<string, {
121
121
  table: TableName$1;
122
- partitions: string[];
122
+ partitions?: string[];
123
+ keys?: string[];
123
124
  }>;
124
125
  sql: string;
125
126
  params: unknown[];
@@ -30,9 +30,9 @@ interface RollupEngine {
30
30
  /**
31
31
  * Read the live manifest for a (tenant, table[, searchType]) cohort —
32
32
  * cheap, no parquet decode. Builders use this to chunk a full-history scan
33
- * into byte-bounded windows so a single `runSQL` call never has to ship
34
- * more than ~14MB of decoded rows across the Workers service-binding RPC
35
- * (32MiB hard cap).
33
+ * into byte-bounded windows (see `WINDOW_BYTE_BUDGET`) so a single `runSQL`
34
+ * call never ships an oversized Arrow IPC payload across the Workers
35
+ * service-binding RPC (32MiB hard cap).
36
36
  */
37
37
  listPartitions: (opts: {
38
38
  ctx: TenantCtx;
@@ -174,8 +174,15 @@ interface RebuildRollupResult {
174
174
  }
175
175
  declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRollupResult[]>;
176
176
  /**
177
- * Target decoded-bytes budget per window. Sits well under the 28MiB executor
178
- * guard so headroom remains for SQL + result rows.
177
+ * Per-window budget, measured in *parquet* bytes (manifest `bytes`), used by
178
+ * `planRollupWindows` to chunk a full-history scan.
179
+ *
180
+ * The executor decodes a window's parquet and ships it as an Arrow IPC stream
181
+ * over the service binding; that IPC is hard-guarded at 28MiB
182
+ * (`IPC_PLACEHOLDER_BUDGET` in @gscdump/cloudflare). Parquet is compressed and
183
+ * the IPC stream is not, so a window inflates on the wire — keep this
184
+ * conservatively below the guard. Re-measure the parquet→IPC ratio against
185
+ * production and raise if headroom allows.
179
186
  */
180
187
  declare const WINDOW_BYTE_BUDGET: number;
181
188
  /**
package/dist/rollups.mjs CHANGED
@@ -1,5 +1,6 @@
1
+ import "./_chunks/storage.mjs";
1
2
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
2
- import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexKey } from "./entities.mjs";
3
+ import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
3
4
  import { MS_PER_DAY } from "gscdump";
4
5
  function rollupPrefix(ctx, searchType) {
5
6
  const base = ctx.siteId ? `u_${ctx.userId}/${ctx.siteId}/rollups` : `u_${ctx.userId}/rollups`;
@@ -111,7 +112,7 @@ function utcDateMinusDays(at, days) {
111
112
  const d = new Date(at - days * MS_PER_DAY);
112
113
  return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
113
114
  }
114
- const WINDOW_BYTE_BUDGET = 14 * 1024 * 1024;
115
+ const WINDOW_BYTE_BUDGET = 10 * 1024 * 1024;
115
116
  const DAY_RE = /^daily\/(\d{4})-(\d{2})-(\d{2})$/;
116
117
  const WEEK_RE = /^weekly\/(\d{4})-(\d{2})-(\d{2})$/;
117
118
  const MONTH_RE = /^monthly\/(\d{4})-(\d{2})$/;
@@ -612,16 +613,17 @@ const indexPercentRollup = {
612
613
  windowDays: 90,
613
614
  sliceOrthogonal: true,
614
615
  async build({ engine, ctx, dataSource, builtAt, searchType }) {
615
- const urlsKey = sitemapUrlsIndexKey(ctx);
616
- if (!await dataSource.head?.(urlsKey)) return {
616
+ const urlsKeys = await dataSource.list(sitemapUrlsIndexPrefix(ctx));
617
+ if (urlsKeys.length === 0) return {
617
618
  totalSitemapUrls: 0,
618
619
  days: []
619
620
  };
620
621
  const cutoff = utcDateMinusDays(builtAt, 90);
622
+ const factSearchType = searchType ?? "web";
621
623
  const pagesPartitions = partitionsInRange(await engine.listPartitions({
622
624
  ctx,
623
625
  table: "pages",
624
- ...searchType !== void 0 ? { searchType } : {}
626
+ searchType: factSearchType
625
627
  }), cutoff, utcDateMinusDays(builtAt, 0));
626
628
  const numerator = await engine.runSQL({
627
629
  ctx,
@@ -633,10 +635,10 @@ const indexPercentRollup = {
633
635
  },
634
636
  URLS: {
635
637
  table: "pages",
636
- keys: [urlsKey]
638
+ keys: urlsKeys
637
639
  }
638
640
  },
639
- ...searchType !== void 0 ? { searchType } : {},
641
+ searchType: factSearchType,
640
642
  sql: `
641
643
  SELECT
642
644
  p.date AS date,
@@ -654,7 +656,7 @@ const indexPercentRollup = {
654
656
  table: "pages",
655
657
  fileSets: { URLS: {
656
658
  table: "pages",
657
- keys: [urlsKey]
659
+ keys: urlsKeys
658
660
  } },
659
661
  sql: `
660
662
  SELECT COUNT(*)::BIGINT AS total
@@ -90,8 +90,8 @@ interface EngineQuerySourceOptions {
90
90
  declare function createEngineQuerySource(options: EngineQuerySourceOptions): AnalysisQuerySource;
91
91
  /**
92
92
  * Convenience: wrap a storage engine + tenant ctx in a source and dispatch.
93
- * Equivalent to
94
- * `runAnalyzerFromSource(createEngineQuerySource({ engine, ctx }), params, registry)`.
93
+ * Equivalent to wrapping `createEngineQuerySource`, with omitted searchType
94
+ * defaulted to web at this public helper boundary.
95
95
  */
96
96
  declare function runAnalyzerWithEngine(deps: {
97
97
  engine: StorageEngine;
@@ -1,4 +1,5 @@
1
1
  import { n as coerceRows } from "../_chunks/coerce.mjs";
2
+ import "../_chunks/storage.mjs";
2
3
  import { T as assertDimensionsSupported, a as pgResolverAdapter, c as getFilterDimensions, v as resolveToSQL } from "../_chunks/resolver.mjs";
3
4
  import { n as runAnalyzerFromSource } from "../_chunks/dispatch.mjs";
4
5
  var AttachedTableMissingError = class extends Error {
@@ -127,7 +128,8 @@ function createEngineQuerySource(options) {
127
128
  async function runAnalyzerWithEngine(deps, ctx, params, registry) {
128
129
  return runAnalyzerFromSource(createEngineQuerySource({
129
130
  engine: deps.engine,
130
- ctx
131
+ ctx,
132
+ searchType: params.searchType ?? "web"
131
133
  }), params, registry);
132
134
  }
133
135
  function typedQuery(state) {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.19.6",
4
+ "version": "0.20.0",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.19.6",
173
- "@gscdump/contracts": "0.19.6"
172
+ "@gscdump/contracts": "0.20.0",
173
+ "gscdump": "0.20.0"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",