@gscdump/engine 0.19.7 → 0.20.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -119,7 +119,8 @@ interface RunSQLFn {
119
119
  table: TableName$1;
120
120
  fileSets: Record<string, {
121
121
  table: TableName$1;
122
- partitions: string[];
122
+ partitions?: string[];
123
+ keys?: string[];
123
124
  }>;
124
125
  sql: string;
125
126
  params: unknown[];
@@ -30,9 +30,9 @@ interface RollupEngine {
30
30
  /**
31
31
  * Read the live manifest for a (tenant, table[, searchType]) cohort —
32
32
  * cheap, no parquet decode. Builders use this to chunk a full-history scan
33
- * into byte-bounded windows so a single `runSQL` call never has to ship
34
- * more than ~14MB of decoded rows across the Workers service-binding RPC
35
- * (32MiB hard cap).
33
+ * into byte-bounded windows (see `WINDOW_BYTE_BUDGET`) so a single `runSQL`
34
+ * call never ships an oversized Arrow IPC payload across the Workers
35
+ * service-binding RPC (32MiB hard cap).
36
36
  */
37
37
  listPartitions: (opts: {
38
38
  ctx: TenantCtx;
@@ -174,8 +174,15 @@ interface RebuildRollupResult {
174
174
  }
175
175
  declare function rebuildRollups(opts: RebuildRollupsOptions): Promise<RebuildRollupResult[]>;
176
176
  /**
177
- * Target decoded-bytes budget per window. Sits well under the 28MiB executor
178
- * guard so headroom remains for SQL + result rows.
177
+ * Per-window budget, measured in *parquet* bytes (manifest `bytes`), used by
178
+ * `planRollupWindows` to chunk a full-history scan.
179
+ *
180
+ * The executor decodes a window's parquet and ships it as an Arrow IPC stream
181
+ * over the service binding; that IPC is hard-guarded at 28MiB
182
+ * (`IPC_PLACEHOLDER_BUDGET` in @gscdump/cloudflare). Parquet is compressed and
183
+ * the IPC stream is not, so a window inflates on the wire — keep this
184
+ * conservatively below the guard. Re-measure the parquet→IPC ratio against
185
+ * production and raise if headroom allows.
179
186
  */
180
187
  declare const WINDOW_BYTE_BUDGET: number;
181
188
  /**
package/dist/rollups.mjs CHANGED
@@ -1,3 +1,4 @@
1
+ import "./_chunks/storage.mjs";
1
2
  import { encodeRowsToParquetFlex } from "./adapters/hyparquet.mjs";
2
3
  import { createIndexingMetadataStore, createSitemapStore, inspectionParquetKey, sitemapUrlsIndexPrefix } from "./entities.mjs";
3
4
  import { MS_PER_DAY } from "gscdump";
@@ -111,7 +112,7 @@ function utcDateMinusDays(at, days) {
111
112
  const d = new Date(at - days * MS_PER_DAY);
112
113
  return `${d.getUTCFullYear()}-${String(d.getUTCMonth() + 1).padStart(2, "0")}-${String(d.getUTCDate()).padStart(2, "0")}`;
113
114
  }
114
- const WINDOW_BYTE_BUDGET = 14 * 1024 * 1024;
115
+ const WINDOW_BYTE_BUDGET = 10 * 1024 * 1024;
115
116
  const DAY_RE = /^daily\/(\d{4})-(\d{2})-(\d{2})$/;
116
117
  const WEEK_RE = /^weekly\/(\d{4})-(\d{2})-(\d{2})$/;
117
118
  const MONTH_RE = /^monthly\/(\d{4})-(\d{2})$/;
@@ -618,10 +619,11 @@ const indexPercentRollup = {
618
619
  days: []
619
620
  };
620
621
  const cutoff = utcDateMinusDays(builtAt, 90);
622
+ const factSearchType = searchType ?? "web";
621
623
  const pagesPartitions = partitionsInRange(await engine.listPartitions({
622
624
  ctx,
623
625
  table: "pages",
624
- ...searchType !== void 0 ? { searchType } : {}
626
+ searchType: factSearchType
625
627
  }), cutoff, utcDateMinusDays(builtAt, 0));
626
628
  const numerator = await engine.runSQL({
627
629
  ctx,
@@ -636,7 +638,7 @@ const indexPercentRollup = {
636
638
  keys: urlsKeys
637
639
  }
638
640
  },
639
- ...searchType !== void 0 ? { searchType } : {},
641
+ searchType: factSearchType,
640
642
  sql: `
641
643
  SELECT
642
644
  p.date AS date,
@@ -90,8 +90,8 @@ interface EngineQuerySourceOptions {
90
90
  declare function createEngineQuerySource(options: EngineQuerySourceOptions): AnalysisQuerySource;
91
91
  /**
92
92
  * Convenience: wrap a storage engine + tenant ctx in a source and dispatch.
93
- * Equivalent to
94
- * `runAnalyzerFromSource(createEngineQuerySource({ engine, ctx }), params, registry)`.
93
+ * Equivalent to wrapping `createEngineQuerySource`, with omitted searchType
94
+ * defaulted to web at this public helper boundary.
95
95
  */
96
96
  declare function runAnalyzerWithEngine(deps: {
97
97
  engine: StorageEngine;
@@ -1,4 +1,5 @@
1
1
  import { n as coerceRows } from "../_chunks/coerce.mjs";
2
+ import "../_chunks/storage.mjs";
2
3
  import { T as assertDimensionsSupported, a as pgResolverAdapter, c as getFilterDimensions, v as resolveToSQL } from "../_chunks/resolver.mjs";
3
4
  import { n as runAnalyzerFromSource } from "../_chunks/dispatch.mjs";
4
5
  var AttachedTableMissingError = class extends Error {
@@ -127,7 +128,8 @@ function createEngineQuerySource(options) {
127
128
  async function runAnalyzerWithEngine(deps, ctx, params, registry) {
128
129
  return runAnalyzerFromSource(createEngineQuerySource({
129
130
  engine: deps.engine,
130
- ctx
131
+ ctx,
132
+ searchType: params.searchType ?? "web"
131
133
  }), params, registry);
132
134
  }
133
135
  function typedQuery(state) {
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@gscdump/engine",
3
3
  "type": "module",
4
- "version": "0.19.7",
4
+ "version": "0.20.1",
5
5
  "description": "Append-only Parquet/DuckDB storage engine + planner + adapters for the gscdump pipeline. Node + edge runtimes; opt-in heavy peers.",
6
6
  "author": {
7
7
  "name": "Harlan Wilton",
@@ -169,8 +169,8 @@
169
169
  "dependencies": {
170
170
  "drizzle-orm": "^0.45.2",
171
171
  "proper-lockfile": "^4.1.2",
172
- "gscdump": "0.19.7",
173
- "@gscdump/contracts": "0.19.7"
172
+ "gscdump": "0.20.1",
173
+ "@gscdump/contracts": "0.20.1"
174
174
  },
175
175
  "devDependencies": {
176
176
  "@duckdb/duckdb-wasm": "^1.32.0",
@@ -178,7 +178,7 @@
178
178
  "aws4fetch": "^1.0.20",
179
179
  "hyparquet": "^1.25.8",
180
180
  "hyparquet-writer": "^0.15.1",
181
- "tsx": "^4.22.2",
181
+ "tsx": "^4.22.3",
182
182
  "vitest": "^4.1.6"
183
183
  },
184
184
  "scripts": {