@remnic/bench 9.3.675 → 9.3.677

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -10562,12 +10562,315 @@ function isPlainObject(value) {
10562
10562
 
10563
10563
  // src/benchmark.ts
10564
10564
  import fs2 from "fs";
10565
- import path31 from "path";
10565
+ import path32 from "path";
10566
+ import { createHash as createHash9 } from "crypto";
10567
+ import { expandTildePath as expandTildePath3 } from "@remnic/core";
10568
+
10569
+ // src/judges/judge-cache.ts
10570
+ import { createHash as createHash6, randomBytes as randomBytes2 } from "crypto";
10571
+ import {
10572
+ mkdir as mkdir8,
10573
+ readFile as readFile9,
10574
+ rename as rename2,
10575
+ rm as rm3,
10576
+ writeFile as writeFile8
10577
+ } from "fs/promises";
10578
+ import path11 from "path";
10579
+ var JUDGE_CACHE_PROTOCOL_VERSION = "judge-protocol-v1";
10580
+ function stableStringify2(value) {
10581
+ if (Array.isArray(value)) {
10582
+ return `[${value.map((item) => stableStringify2(item)).join(",")}]`;
10583
+ }
10584
+ if (value !== null && typeof value === "object") {
10585
+ const record = value;
10586
+ const keys = Object.keys(record).sort();
10587
+ const body = keys.map((key) => `${JSON.stringify(key)}:${stableStringify2(record[key])}`).join(",");
10588
+ return `{${body}}`;
10589
+ }
10590
+ return JSON.stringify(value) ?? "null";
10591
+ }
10592
/**
 * On-disk cache of judge verdicts, one JSON file per key under `dir`.
 *
 * Keys are sha256 digests derived from the identifying parts of a judge
 * call (see `computeKey`). Writes go through a temp file plus rename and are
 * serialized per key; reads check an in-memory "inflight" layer first and
 * then fall back to disk.
 */
var JudgeCache = class {
  dir;
  // Per-key write serialization so concurrent writers never race a temp-file
  // rename into place for the same key. Cached entries are read straight from
  // disk, so reads remain lock-free.
  writeQueues = /* @__PURE__ */ new Map();
  // PR #1591 round-8 (cursor thread): in-memory layer for fire-and-forget
  // writes. putSafely chains cache.put onto a pendingWrites promise
  // without awaiting, so a second benchmark iteration can call get()
  // before the disk rename lands. The inflight map is populated
  // synchronously inside put() (before the first await) and cleared in
  // the finally after the write settles — closing the gap without
  // changing the byte-identical baseline (a fresh process has no
  // inflight entries).
  inflight = /* @__PURE__ */ new Map();
  // Set once mkdir has succeeded so later writes skip the mkdir call.
  cachedDirExists = false;
  /**
   * @param options `options.dir` is the cache directory; it is resolved to
   *   an absolute path here and created lazily on the first write.
   */
  constructor(options) {
    this.dir = path11.resolve(options.dir);
  }
  /** Compute the sha256-hex key for a set of parts. Pure, sync, side-effect-free. */
  computeKey(parts) {
    // Each field is hashed on its own first, so the outer hash consumes
    // fixed-length digests and field boundaries cannot be confused.
    const fieldDigest = (value) => createHash6("sha256").update(value).digest();
    return createHash6("sha256").update(fieldDigest(parts.benchmarkId)).update(fieldDigest(parts.datasetVersion)).update(fieldDigest(parts.questionId)).update(fieldDigest(parts.answerText)).update(fieldDigest(parts.judgePromptHash)).update(fieldDigest(parts.judgeModelId)).update(fieldDigest(parts.judgeParamsHash)).digest("hex");
  }
  /**
   * Read a previously-stored verdict. Returns `undefined` on miss, corrupted
   * entry, missing required field, key mismatch, or read error — never
   * throws, never fabricates.
   */
  async get(parts) {
    const key = this.computeKey(parts);
    const inflightHit = this.inflight.get(key);
    if (inflightHit !== void 0) {
      return {
        cacheHit: true,
        verdict: inflightHit.verdict,
        storedAt: inflightHit.storedAt
      };
    }
    const filePath = this.entryPath(key);
    let raw;
    try {
      raw = await readFile9(filePath, "utf8");
    } catch {
      return void 0;
    }
    const envelope = parseEnvelope(raw);
    if (envelope === void 0) return void 0;
    // The envelope records the key it was written for. A file whose recorded
    // key differs from the requested one (copied or renamed entry) must not
    // be served as a verdict for this input.
    if (envelope.key !== key) return void 0;
    return {
      cacheHit: true,
      verdict: envelope.verdict,
      storedAt: envelope.storedAt
    };
  }
  /**
   * Persist a verdict atomically: write to a temp file then rename into
   * place. Concurrent writes for the same key serialize via an in-memory
   * chain so the temp-file never lands on top of a sibling rename.
   *
   * Rejects if the underlying write fails; queue and inflight bookkeeping
   * is cleaned up either way.
   */
  async put(parts, verdict) {
    const key = this.computeKey(parts);
    const envelope = {
      storedAt: (/* @__PURE__ */ new Date()).toISOString(),
      key,
      verdict
    };
    // Must happen before the first await so get() sees it immediately.
    this.inflight.set(key, envelope);
    const prior = this.writeQueues.get(key) ?? Promise.resolve();
    const next = prior.then(() => this.writeOne(key, envelope));
    // The queued tail swallows errors so one failed write does not poison
    // later writes chained on the same key.
    const tracked = next.catch(() => void 0);
    this.writeQueues.set(key, tracked);
    try {
      await next;
    } finally {
      // Only remove entries this call still owns; a newer put for the same
      // key may have replaced them.
      if (this.writeQueues.get(key) === tracked) {
        this.writeQueues.delete(key);
      }
      if (this.inflight.get(key) === envelope) {
        this.inflight.delete(key);
      }
    }
  }
  /** Number of in-flight per-key write chains (diagnostic/test seam). */
  pendingWriteCount() {
    return this.writeQueues.size;
  }
  /**
   * Write one envelope to disk via a uniquely named temp file and rename.
   * The temp file is removed if either the write or the rename fails, so a
   * failed write does not leave `.tmp` files behind.
   */
  async writeOne(key, envelope) {
    if (!this.cachedDirExists) {
      await mkdir8(this.dir, { recursive: true });
      this.cachedDirExists = true;
    }
    const filePath = this.entryPath(key);
    const tempPath = path11.join(
      this.dir,
      `.${key}.${randomBytes2(6).toString("hex")}.tmp`
    );
    try {
      await writeFile8(tempPath, `${JSON.stringify(envelope)}\n`, "utf8");
      await rename2(tempPath, filePath);
    } catch (error) {
      // Best-effort cleanup; the original error is what the caller needs.
      await rm3(tempPath, { force: true }).catch(() => void 0);
      throw error;
    }
  }
  /** Absolute path of the JSON entry file for `key`. */
  entryPath(key) {
    return path11.join(this.dir, `${key}.json`);
  }
};
10701
+ function runJudgeWithCache(options) {
10702
+ const { judge, cache } = options;
10703
+ const keyExtras = options.keyExtras ?? {};
10704
+ const counters = {
10705
+ modelCalls: 0,
10706
+ cacheHits: 0,
10707
+ cacheMisses: 0,
10708
+ cacheWriteFailures: 0
10709
+ };
10710
+ let pendingWrites = Promise.resolve();
10711
+ const putSafely = (parts, verdict, control) => {
10712
+ if (!cache) return;
10713
+ if (control?.signal?.aborted) return;
10714
+ const write = cache.put(parts, verdict).catch(() => {
10715
+ counters.cacheWriteFailures += 1;
10716
+ });
10717
+ pendingWrites = pendingWrites.then(() => write);
10718
+ };
10719
+ const CACHE_READ_BUDGET_MS = 250;
10720
+ async function readCacheWithAbort(cache2, parts, control) {
10721
+ if (control?.signal?.aborted) return void 0;
10722
+ const read = cache2.get(parts);
10723
+ const readBudget = new Promise((resolveBudget) => {
10724
+ setTimeout(() => {
10725
+ resolveBudget(void 0);
10726
+ }, CACHE_READ_BUDGET_MS);
10727
+ });
10728
+ return Promise.race([read, readBudget]);
10729
+ }
10730
+ const cachedVerdict = (stored) => ({
10731
+ score: stored.score,
10732
+ tokens: { input: 0, output: 0 },
10733
+ latencyMs: 0,
10734
+ ...stored.model !== void 0 ? { model: stored.model } : {}
10735
+ });
10736
+ const wrapper = {
10737
+ counters,
10738
+ cache,
10739
+ drainPendingWrites: () => pendingWrites,
10740
+ async score(question, predicted, expected, control) {
10741
+ const detailed = await wrapper.scoreWithMetrics(
10742
+ question,
10743
+ predicted,
10744
+ expected,
10745
+ control
10746
+ );
10747
+ return detailed.score;
10748
+ },
10749
+ async scoreWithMetrics(question, predicted, expected, control) {
10750
+ const answerText = `${predicted}${expected}`;
10751
+ const parts = {
10752
+ benchmarkId: keyExtras.benchmarkId ?? "unknown-benchmark",
10753
+ datasetVersion: keyExtras.datasetVersion ?? "unknown-version",
10754
+ questionId: question,
10755
+ answerText,
10756
+ judgePromptHash: keyExtras.judgePromptHash ?? "unknown-prompt",
10757
+ judgeModelId: keyExtras.judgeModelId ?? "unknown-judge",
10758
+ judgeParamsHash: keyExtras.judgeParamsHash ?? "unknown-params"
10759
+ };
10760
+ if (cache) {
10761
+ let hit;
10762
+ try {
10763
+ hit = await readCacheWithAbort(cache, parts, control);
10764
+ } catch {
10765
+ hit = void 0;
10766
+ }
10767
+ if (hit) {
10768
+ counters.cacheHits += 1;
10769
+ return cachedVerdict(hit.verdict);
10770
+ }
10771
+ counters.cacheMisses += 1;
10772
+ }
10773
+ if (!judge.scoreWithMetrics) {
10774
+ counters.modelCalls += 1;
10775
+ const scoreStartedAt = Date.now();
10776
+ const scoreValue = judge.score ? await judge.score(question, predicted, expected, control) : 0;
10777
+ const synthesized = {
10778
+ score: scoreValue,
10779
+ tokens: { input: 0, output: 0 },
10780
+ latencyMs: Date.now() - scoreStartedAt,
10781
+ model: keyExtras.judgeModelId ?? void 0
10782
+ };
10783
+ putSafely(parts, synthesized, control);
10784
+ return synthesized;
10785
+ }
10786
+ counters.modelCalls += 1;
10787
+ const fresh = await judge.scoreWithMetrics(
10788
+ question,
10789
+ predicted,
10790
+ expected,
10791
+ control
10792
+ );
10793
+ putSafely(parts, fresh, control);
10794
+ return fresh;
10795
+ }
10796
+ };
10797
+ if (typeof judge.scoreBinaryPrompt === "function") {
10798
+ Object.defineProperty(wrapper, "scoreBinaryPrompt", {
10799
+ configurable: true,
10800
+ enumerable: true,
10801
+ writable: false,
10802
+ value: async function scoreBinaryPrompt(prompt, control) {
10803
+ const parts = {
10804
+ benchmarkId: keyExtras.benchmarkId ?? "unknown-benchmark",
10805
+ datasetVersion: keyExtras.datasetVersion ?? "unknown-version",
10806
+ // Binary prompts are content-sensitive: two distinct prompts of
10807
+ // the same character length would collide on the previous
10808
+ // `binary:N` key, so key on a sha256 prefix of the prompt body.
10809
+ questionId: `binary:${createHash6("sha256").update(prompt).digest("hex").slice(0, 16)}`,
10810
+ answerText: prompt,
10811
+ judgePromptHash: keyExtras.judgePromptHash ?? "unknown-prompt",
10812
+ judgeModelId: keyExtras.judgeModelId ?? "unknown-judge",
10813
+ judgeParamsHash: keyExtras.judgeParamsHash ?? "unknown-params"
10814
+ };
10815
+ if (cache) {
10816
+ let hit;
10817
+ try {
10818
+ hit = await readCacheWithAbort(cache, parts, control);
10819
+ } catch {
10820
+ hit = void 0;
10821
+ }
10822
+ if (hit) {
10823
+ counters.cacheHits += 1;
10824
+ return cachedVerdict(hit.verdict);
10825
+ }
10826
+ counters.cacheMisses += 1;
10827
+ }
10828
+ counters.modelCalls += 1;
10829
+ const fresh = await judge.scoreBinaryPrompt(prompt, control);
10830
+ putSafely(parts, fresh, control);
10831
+ return fresh;
10832
+ }
10833
+ });
10834
+ }
10835
+ return wrapper;
10836
+ }
10837
/**
 * Decode a cache entry read from disk.
 *
 * @param raw File contents expected to hold a JSON envelope.
 * @returns The parsed envelope when it is a plain object whose `storedAt`
 *   and `key` are strings and whose `verdict` passes `isBenchJudgeResult`;
 *   `undefined` for invalid JSON or any failed check.
 */
function parseEnvelope(raw) {
  let decoded;
  try {
    decoded = JSON.parse(raw);
  } catch {
    return void 0;
  }
  // Only plain (non-array, non-null) objects can be envelopes.
  const isRecord = decoded !== null && typeof decoded === "object" && !Array.isArray(decoded);
  if (!isRecord) {
    return void 0;
  }
  const hasStringFields = typeof decoded.storedAt === "string" && typeof decoded.key === "string";
  if (!hasStringFields) {
    return void 0;
  }
  return isBenchJudgeResult(decoded.verdict) ? decoded : void 0;
}
10853
/**
 * Structural check for a judge verdict.
 *
 * Accepts a plain object with a finite numeric `score`, a plain-object
 * `tokens` holding finite numeric `input` and `output`, a finite numeric
 * `latencyMs`, and a `model` that is either absent or a string.
 *
 * @param value Candidate value (typically parsed JSON).
 * @returns `true` when every check passes, otherwise `false`.
 */
function isBenchJudgeResult(value) {
  const isRecord = (candidate) => candidate !== null && typeof candidate === "object" && !Array.isArray(candidate);
  const isFiniteNumber = (candidate) => typeof candidate === "number" && Number.isFinite(candidate);
  if (!isRecord(value)) {
    return false;
  }
  const { score, tokens, latencyMs, model } = value;
  if (!isFiniteNumber(score) || !isRecord(tokens)) {
    return false;
  }
  const countsAreValid = isFiniteNumber(tokens.input) && isFiniteNumber(tokens.output);
  if (!countsAreValid || !isFiniteNumber(latencyMs)) {
    return false;
  }
  return model === void 0 || typeof model === "string";
}
10566
10869
 
10567
10870
  // src/benchmarks/published/ama-bench/runner.ts
10568
10871
  import { randomUUID as randomUUID2 } from "crypto";
10569
- import { readFile as readFile9 } from "fs/promises";
10570
- import path11 from "path";
10872
+ import { readFile as readFile10 } from "fs/promises";
10873
+ import path12 from "path";
10571
10874
 
10572
10875
  // src/benchmarks/published/ama-bench/fixture.ts
10573
10876
  var AMA_BENCH_SMOKE_FIXTURE = [
@@ -11142,10 +11445,10 @@ async function loadDataset(mode, datasetDir, limit) {
11142
11445
  return episodes;
11143
11446
  };
11144
11447
  if (datasetDir) {
11145
- const filePath = path11.join(datasetDir, "open_end_qa_set.jsonl");
11448
+ const filePath = path12.join(datasetDir, "open_end_qa_set.jsonl");
11146
11449
  let raw;
11147
11450
  try {
11148
- raw = await readFile9(filePath, "utf8");
11451
+ raw = await readFile10(filePath, "utf8");
11149
11452
  } catch (error) {
11150
11453
  throw new Error(
11151
11454
  `AMA-Bench dataset not found at ${filePath}: ${error instanceof Error ? error.message : String(error)}`
@@ -11437,8 +11740,8 @@ function isValidQaPairs(value) {
11437
11740
 
11438
11741
  // src/benchmarks/published/amemgym/runner.ts
11439
11742
  import { randomUUID as randomUUID3 } from "crypto";
11440
- import { readFile as readFile10 } from "fs/promises";
11441
- import path12 from "path";
11743
+ import { readFile as readFile11 } from "fs/promises";
11744
+ import path13 from "path";
11442
11745
 
11443
11746
  // src/benchmarks/published/amemgym/fixture.ts
11444
11747
  var AMEMGYM_SMOKE_FIXTURE = [
@@ -11967,7 +12270,7 @@ async function loadDataset2(mode, datasetDir, limit) {
11967
12270
  const datasetErrors = [];
11968
12271
  for (const filename of DATASET_FILENAMES) {
11969
12272
  try {
11970
- const raw = await readFile10(path12.join(datasetDir, filename), "utf8");
12273
+ const raw = await readFile11(path13.join(datasetDir, filename), "utf8");
11971
12274
  const parsed = parseDataset(raw, filename, normalizedLimit);
11972
12275
  return ensureDatasetProfiles(parsed);
11973
12276
  } catch (error) {
@@ -12141,8 +12444,8 @@ function normalizeRole(role) {
12141
12444
 
12142
12445
  // src/benchmarks/published/memory-arena/runner.ts
12143
12446
  import { randomUUID as randomUUID4 } from "crypto";
12144
- import { readFile as readFile11, readdir as readdir5, stat as stat3 } from "fs/promises";
12145
- import path13 from "path";
12447
+ import { readFile as readFile12, readdir as readdir5, stat as stat3 } from "fs/promises";
12448
+ import path14 from "path";
12146
12449
  import { expandTildePath as expandTildePath2 } from "@remnic/core";
12147
12450
 
12148
12451
  // src/benchmarks/published/memory-arena/fixture.ts
@@ -12469,7 +12772,7 @@ async function loadDataset3(mode, datasetDir, limit) {
12469
12772
  if (remainingLimit2 === 0) {
12470
12773
  break;
12471
12774
  }
12472
- const raw = await readFile11(path13.join(datasetDir, filename), "utf8");
12775
+ const raw = await readFile12(path14.join(datasetDir, filename), "utf8");
12473
12776
  const parsedTasks = [];
12474
12777
  raw.split("\n").forEach((line, lineIndex) => {
12475
12778
  if (line.trim().length === 0) {
@@ -12805,7 +13108,7 @@ async function loadMemoryArenaWebshopProductCatalog(datasetDir) {
12805
13108
  `MemoryArena WebShop product sidecar is ${sourceStat.size} bytes; provide a compact JSON/JSONL sidecar smaller than ${MEMORY_ARENA_WEBSHOP_PRODUCTS_MAX_BYTES} bytes instead of the full WebShop catalog.`
12806
13109
  );
12807
13110
  }
12808
- const raw = await readFile11(sourcePath, "utf8");
13111
+ const raw = await readFile12(sourcePath, "utf8");
12809
13112
  const records = parseMemoryArenaWebshopSidecarRecords(raw, sourcePath);
12810
13113
  const byAsin = /* @__PURE__ */ new Map();
12811
13114
  for (const record of records) {
@@ -12825,14 +13128,14 @@ async function loadMemoryArenaWebshopProductCatalog(datasetDir) {
12825
13128
  async function resolveMemoryArenaWebshopProductCatalogPath(datasetDir) {
12826
13129
  const configuredPath = process.env[MEMORY_ARENA_WEBSHOP_PRODUCTS_ENV]?.trim();
12827
13130
  if (configuredPath && configuredPath.length > 0) {
12828
- return path13.resolve(expandTildePath2(configuredPath));
13131
+ return path14.resolve(expandTildePath2(configuredPath));
12829
13132
  }
12830
13133
  if (datasetDir === void 0) {
12831
13134
  return void 0;
12832
13135
  }
12833
13136
  const candidatePaths = [
12834
13137
  ...MEMORY_ARENA_WEBSHOP_PRODUCT_SIDECAR_FILENAMES
12835
- ].map((filename) => path13.join(datasetDir, filename));
13138
+ ].map((filename) => path14.join(datasetDir, filename));
12836
13139
  for (const candidatePath of candidatePaths) {
12837
13140
  try {
12838
13141
  const candidateStat = await stat3(candidatePath);
@@ -14254,8 +14557,8 @@ function scoreSubtaskSuccess(scores) {
14254
14557
  import { collectTemporalLexicalCues } from "@remnic/core";
14255
14558
 
14256
14559
  // src/benchmarks/published/dataset-loader.ts
14257
- import { readFile as readFile12 } from "fs/promises";
14258
- import path14 from "path";
14560
+ import { readFile as readFile13 } from "fs/promises";
14561
+ import path15 from "path";
14259
14562
 
14260
14563
  // src/benchmarks/published/longmemeval/fixture.ts
14261
14564
  var LONG_MEM_EVAL_SMOKE_FIXTURE = [
@@ -14358,10 +14661,10 @@ async function loadDataset4(options) {
14358
14661
  const errors = [];
14359
14662
  if (options.datasetDir) {
14360
14663
  for (const filename of options.filenames) {
14361
- const abs = path14.join(options.datasetDir, filename);
14664
+ const abs = path15.join(options.datasetDir, filename);
14362
14665
  let raw;
14363
14666
  try {
14364
- raw = await readFile12(abs, "utf8");
14667
+ raw = await readFile13(abs, "utf8");
14365
14668
  } catch (error) {
14366
14669
  errors.push(
14367
14670
  `${filename}: ${error instanceof Error ? error.message : String(error)}`
@@ -16098,7 +16401,7 @@ function normalizeQaArray(value, location) {
16098
16401
  import { randomUUID as randomUUID6 } from "crypto";
16099
16402
  import { createReadStream as createReadStream2 } from "fs";
16100
16403
  import { readdir as readdir6 } from "fs/promises";
16101
- import path15 from "path";
16404
+ import path16 from "path";
16102
16405
  import { createInterface } from "readline/promises";
16103
16406
  import {
16104
16407
  asyncBufferFromFile,
@@ -16569,8 +16872,8 @@ async function listBeamDatasetFiles(datasetDir) {
16569
16872
  return directFiles;
16570
16873
  }
16571
16874
  try {
16572
- const nestedFilenames = await readdir6(path15.join(datasetDir, "data"));
16573
- return nestedFilenames.filter((filename) => isBeamDatasetFilename(filename)).map((filename) => path15.join("data", filename));
16875
+ const nestedFilenames = await readdir6(path16.join(datasetDir, "data"));
16876
+ return nestedFilenames.filter((filename) => isBeamDatasetFilename(filename)).map((filename) => path16.join("data", filename));
16574
16877
  } catch {
16575
16878
  return [];
16576
16879
  }
@@ -16597,7 +16900,7 @@ async function* iterateDatasetFiles(datasetDir, datasetFiles, limit) {
16597
16900
  let remainingLimit = limit;
16598
16901
  for (const filename of datasetFiles) {
16599
16902
  const scale = inferScaleFromFilename(filename);
16600
- const filePath = path15.join(datasetDir, filename);
16903
+ const filePath = path16.join(datasetDir, filename);
16601
16904
  const conversations = filename.endsWith(".jsonl") ? streamJsonlDataset(filePath, filename, remainingLimit) : filename.endsWith(".parquet") ? streamParquetDataset(filePath, filename, remainingLimit) : streamJsonDataset(filePath, filename, remainingLimit);
16602
16905
  for await (const conversation of conversations) {
16603
16906
  yield {
@@ -17608,9 +17911,9 @@ var StructuredLiteralParser = class {
17608
17911
  };
17609
17912
 
17610
17913
  // src/benchmarks/published/personamem/runner.ts
17611
- import { createHash as createHash6, randomUUID as randomUUID7 } from "crypto";
17612
- import { readFile as readFile13, realpath as realpath4 } from "fs/promises";
17613
- import path16 from "path";
17914
+ import { createHash as createHash7, randomUUID as randomUUID7 } from "crypto";
17915
+ import { readFile as readFile14, realpath as realpath4 } from "fs/promises";
17916
+ import path17 from "path";
17614
17917
 
17615
17918
  // src/benchmarks/published/personamem/fixture.ts
17616
17919
  var PERSONAMEM_SMOKE_FIXTURE = [
@@ -17886,10 +18189,10 @@ async function loadDataset8(mode, datasetDir, limit) {
17886
18189
  if (datasetDir) {
17887
18190
  const datasetErrors = [];
17888
18191
  for (const relativePath of DATASET_FILE_CANDIDATES) {
17889
- const datasetPath = path16.join(datasetDir, relativePath);
18192
+ const datasetPath = path17.join(datasetDir, relativePath);
17890
18193
  let raw;
17891
18194
  try {
17892
- raw = await readFile13(datasetPath, "utf8");
18195
+ raw = await readFile14(datasetPath, "utf8");
17893
18196
  } catch (error) {
17894
18197
  datasetErrors.push(
17895
18198
  `${relativePath}: ${error instanceof Error ? error.message : String(error)}`
@@ -17947,7 +18250,7 @@ async function hydrateSample(row, datasetRoot) {
17947
18250
  datasetRoot,
17948
18251
  row.chat_history_32k_link
17949
18252
  );
17950
- const chatHistoryRaw = await readFile13(chatHistoryPath, "utf8");
18253
+ const chatHistoryRaw = await readFile14(chatHistoryPath, "utf8");
17951
18254
  const chatHistory = parseChatHistory(
17952
18255
  chatHistoryRaw,
17953
18256
  row.chat_history_32k_link
@@ -18080,12 +18383,12 @@ function parseCsv(raw, limit) {
18080
18383
  return rows;
18081
18384
  }
18082
18385
  async function resolveDatasetFilePath(datasetRoot, relativePath) {
18083
- const rootPath = path16.resolve(datasetRoot);
18386
+ const rootPath = path17.resolve(datasetRoot);
18084
18387
  const rootRealPath = await realpath4(rootPath);
18085
- const candidatePath = path16.resolve(rootPath, relativePath);
18388
+ const candidatePath = path17.resolve(rootPath, relativePath);
18086
18389
  const candidateRealPath = await realpath4(candidatePath);
18087
- const relativeToRoot = path16.relative(rootRealPath, candidateRealPath);
18088
- if (relativeToRoot.startsWith("..") || path16.isAbsolute(relativeToRoot)) {
18390
+ const relativeToRoot = path17.relative(rootRealPath, candidateRealPath);
18391
+ if (relativeToRoot.startsWith("..") || path17.isAbsolute(relativeToRoot)) {
18089
18392
  throw new Error(
18090
18393
  `PersonaMem-v2 dataset file reference "${relativePath}" must stay within datasetDir.`
18091
18394
  );
@@ -18213,7 +18516,7 @@ function buildMcqPrompt(sample, seed) {
18213
18516
  function deterministicShuffle(values, seedMaterial) {
18214
18517
  return values.map((value, index) => ({
18215
18518
  value,
18216
- key: createHash6("sha256").update(`${seedMaterial}:${index}:${value}`).digest("hex"),
18519
+ key: createHash7("sha256").update(`${seedMaterial}:${index}:${value}`).digest("hex"),
18217
18520
  index
18218
18521
  })).sort((left, right) => {
18219
18522
  const byKey = left.key.localeCompare(right.key);
@@ -18413,8 +18716,8 @@ function applyLimit6(items, limit) {
18413
18716
 
18414
18717
  // src/benchmarks/published/membench/runner.ts
18415
18718
  import { randomUUID as randomUUID8 } from "crypto";
18416
- import { readFile as readFile14, readdir as readdir7 } from "fs/promises";
18417
- import path17 from "path";
18719
+ import { readFile as readFile15, readdir as readdir7 } from "fs/promises";
18720
+ import path18 from "path";
18418
18721
 
18419
18722
  // src/benchmarks/published/membench/fixture.ts
18420
18723
  var MEMBENCH_SMOKE_FIXTURE = [
@@ -18675,7 +18978,7 @@ async function loadDataset9(mode, datasetDir, limit) {
18675
18978
  let remainingLimit = normalizedLimit;
18676
18979
  for (const filename of filenames) {
18677
18980
  try {
18678
- const raw = await readFile14(path17.join(datasetDir, filename), "utf8");
18981
+ const raw = await readFile15(path18.join(datasetDir, filename), "utf8");
18679
18982
  const parsed = filename.endsWith(".jsonl") ? parseJsonlDataset(raw, filename) : parseJsonDataset(raw, filename);
18680
18983
  const limitedCases = remainingLimit === 0 ? [] : applyLimit7(parsed, remainingLimit);
18681
18984
  if (limitedCases.length > 0) {
@@ -19542,8 +19845,8 @@ function isPlainObject2(value) {
19542
19845
 
19543
19846
  // src/benchmarks/published/memoryagentbench/runner.ts
19544
19847
  import { randomUUID as randomUUID9 } from "crypto";
19545
- import { access, readFile as readFile15 } from "fs/promises";
19546
- import path18 from "path";
19848
+ import { access, readFile as readFile16 } from "fs/promises";
19849
+ import path19 from "path";
19547
19850
 
19548
19851
  // src/benchmarks/published/memoryagentbench/fixture.ts
19549
19852
  var MEMORY_AGENT_BENCH_SMOKE_FIXTURE = [
@@ -20565,7 +20868,7 @@ async function loadRecSysEntityMapping(datasetDir) {
20565
20868
  }
20566
20869
  let parsed;
20567
20870
  try {
20568
- parsed = JSON.parse(await readFile15(candidate, "utf8"));
20871
+ parsed = JSON.parse(await readFile16(candidate, "utf8"));
20569
20872
  } catch (error) {
20570
20873
  console.error(
20571
20874
  ` [WARN] MemoryAgentBench ReDial entity mapping ${candidate} is invalid JSON; trying the next candidate: ${error instanceof Error ? error.message : String(error)}`
@@ -20622,21 +20925,21 @@ function recsysEntityMappingCandidates(datasetDir) {
20622
20925
  if (!datasetDir) {
20623
20926
  return [];
20624
20927
  }
20625
- const absoluteDatasetDir = path18.resolve(datasetDir);
20928
+ const absoluteDatasetDir = path19.resolve(datasetDir);
20626
20929
  const roots = [
20627
20930
  absoluteDatasetDir,
20628
- path18.dirname(absoluteDatasetDir)
20931
+ path19.dirname(absoluteDatasetDir)
20629
20932
  ];
20630
20933
  const canonicalSuffixes = [
20631
- path18.join("processed_data", "Recsys_Redial", "entity2id.json"),
20632
- path18.join("Recsys_Redial", "entity2id.json")
20934
+ path19.join("processed_data", "Recsys_Redial", "entity2id.json"),
20935
+ path19.join("Recsys_Redial", "entity2id.json")
20633
20936
  ];
20634
20937
  const looseSuffixes = ["entity2id.json"];
20635
20938
  return [
20636
20939
  ...roots.flatMap(
20637
- (root) => canonicalSuffixes.map((suffix) => path18.join(root, suffix))
20940
+ (root) => canonicalSuffixes.map((suffix) => path19.join(root, suffix))
20638
20941
  ),
20639
- ...looseSuffixes.map((suffix) => path18.join(absoluteDatasetDir, suffix))
20942
+ ...looseSuffixes.map((suffix) => path19.join(absoluteDatasetDir, suffix))
20640
20943
  ];
20641
20944
  }
20642
20945
  async function fileExists(filePath) {
@@ -20673,7 +20976,7 @@ async function loadDataset10(mode, datasetDir, limit) {
20673
20976
  const datasetErrors = [];
20674
20977
  for (const filename of DATASET_BUNDLE_CANDIDATES) {
20675
20978
  const parsed = await tryReadDatasetFile(
20676
- path18.join(datasetDir, filename),
20979
+ path19.join(datasetDir, filename),
20677
20980
  filename,
20678
20981
  datasetErrors
20679
20982
  );
@@ -20690,7 +20993,7 @@ async function loadDataset10(mode, datasetDir, limit) {
20690
20993
  let splitData;
20691
20994
  for (const filename of splitConfig.candidates) {
20692
20995
  try {
20693
- splitData = await readDatasetFile(path18.join(datasetDir, filename), filename);
20996
+ splitData = await readDatasetFile(path19.join(datasetDir, filename), filename);
20694
20997
  break;
20695
20998
  } catch (error) {
20696
20999
  if (!isFileNotFoundError2(error)) {
@@ -20728,7 +21031,7 @@ async function loadDataset10(mode, datasetDir, limit) {
20728
21031
  return ensureDatasetItems(applyLimit8(MEMORY_AGENT_BENCH_SMOKE_FIXTURE, normalizedLimit));
20729
21032
  }
20730
21033
  async function readDatasetFile(filePath, filename) {
20731
- const raw = await readFile15(filePath, "utf8");
21034
+ const raw = await readFile16(filePath, "utf8");
20732
21035
  const parsed = filename.endsWith(".jsonl") ? parseJsonLines(raw, filename) : parseJsonArray(raw, filename);
20733
21036
  return parsed.map(
20734
21037
  (item, index) => parseMemoryAgentBenchItem(item, `${filename} item ${index + 1}`)
@@ -21339,7 +21642,7 @@ function loadCases(mode, limit) {
21339
21642
  // src/benchmarks/remnic/extraction-judge-calibration/runner.ts
21340
21643
  import { randomUUID as randomUUID11 } from "crypto";
21341
21644
  import os4 from "os";
21342
- import path19 from "path";
21645
+ import path20 from "path";
21343
21646
  import {
21344
21647
  createVerdictCache,
21345
21648
  judgeFactDurability,
@@ -21449,8 +21752,8 @@ var extractionJudgeCalibrationDefinition = {
21449
21752
  async function runExtractionJudgeCalibrationBenchmark(options) {
21450
21753
  const cases = loadCases2(options.mode, options.limit);
21451
21754
  const config = parseConfig2({
21452
- memoryDir: path19.join(os4.tmpdir(), "remnic-bench-extraction-judge"),
21453
- workspaceDir: path19.join(os4.tmpdir(), "remnic-bench-extraction-judge-workspace"),
21755
+ memoryDir: path20.join(os4.tmpdir(), "remnic-bench-extraction-judge"),
21756
+ workspaceDir: path20.join(os4.tmpdir(), "remnic-bench-extraction-judge-workspace"),
21454
21757
  openaiApiKey: "bench-test-key",
21455
21758
  extractionJudgeEnabled: true,
21456
21759
  extractionJudgeBatchSize: 4,
@@ -21999,9 +22302,9 @@ function constantAggregate2(value) {
21999
22302
 
22000
22303
  // src/benchmarks/remnic/entity-consolidation/runner.ts
22001
22304
  import os5 from "os";
22002
- import path20 from "path";
22305
+ import path21 from "path";
22003
22306
  import { randomUUID as randomUUID13 } from "crypto";
22004
- import { mkdtemp as mkdtemp3, rm as rm3 } from "fs/promises";
22307
+ import { mkdtemp as mkdtemp3, rm as rm4 } from "fs/promises";
22005
22308
  import { StorageManager } from "@remnic/core";
22006
22309
 
22007
22310
  // src/benchmarks/remnic/entity-consolidation/fixture.ts
@@ -22162,7 +22465,7 @@ function loadCases4(mode, limit) {
22162
22465
  return limited;
22163
22466
  }
22164
22467
  async function executeCase(sample) {
22165
- const tmpDir = await mkdtemp3(path20.join(os5.tmpdir(), "remnic-bench-entity-consolidation-"));
22468
+ const tmpDir = await mkdtemp3(path21.join(os5.tmpdir(), "remnic-bench-entity-consolidation-"));
22166
22469
  try {
22167
22470
  const storage = new StorageManager(tmpDir);
22168
22471
  await storage.ensureDirectories();
@@ -22170,7 +22473,7 @@ async function executeCase(sample) {
22170
22473
  const rawEntity = await storage.readEntity(canonicalName);
22171
22474
  return summarizeEntity(rawEntity, canonicalName);
22172
22475
  } finally {
22173
- await rm3(tmpDir, { recursive: true, force: true });
22476
+ await rm4(tmpDir, { recursive: true, force: true });
22174
22477
  }
22175
22478
  }
22176
22479
  async function applyScenario(storage, sample) {
@@ -22341,9 +22644,9 @@ function parseNonNegativeInt(rawValue) {
22341
22644
 
22342
22645
  // src/benchmarks/remnic/page-versioning/runner.ts
22343
22646
  import { randomUUID as randomUUID14 } from "crypto";
22344
- import { mkdir as mkdir8, mkdtemp as mkdtemp4, readFile as readFile16, rm as rm4, writeFile as writeFile8 } from "fs/promises";
22647
+ import { mkdir as mkdir9, mkdtemp as mkdtemp4, readFile as readFile17, rm as rm5, writeFile as writeFile9 } from "fs/promises";
22345
22648
  import os6 from "os";
22346
- import path21 from "path";
22649
+ import path22 from "path";
22347
22650
  import {
22348
22651
  createVersion,
22349
22652
  diffVersions,
@@ -22507,21 +22810,21 @@ function loadCases5(mode, limit) {
22507
22810
  return limited;
22508
22811
  }
22509
22812
  async function executeCase2(sample, dependencies) {
22510
- const tmpDir = await mkdtemp4(path21.join(os6.tmpdir(), "remnic-bench-page-versioning-"));
22813
+ const tmpDir = await mkdtemp4(path22.join(os6.tmpdir(), "remnic-bench-page-versioning-"));
22511
22814
  try {
22512
- const factsDir = path21.join(tmpDir, "facts");
22513
- const pagePath = path21.join(factsDir, `${sample.id}.md`);
22514
- await mkdir8(factsDir, { recursive: true });
22815
+ const factsDir = path22.join(tmpDir, "facts");
22816
+ const pagePath = path22.join(factsDir, `${sample.id}.md`);
22817
+ await mkdir9(factsDir, { recursive: true });
22515
22818
  const config = versioningConfig();
22516
22819
  switch (sample.scenario) {
22517
22820
  case "revert-flow": {
22518
- await writeFile8(pagePath, "original content", "utf-8");
22821
+ await writeFile9(pagePath, "original content", "utf-8");
22519
22822
  await dependencies.createVersion(pagePath, "original content", "write", config, void 0, void 0, tmpDir);
22520
- await writeFile8(pagePath, "modified content", "utf-8");
22823
+ await writeFile9(pagePath, "modified content", "utf-8");
22521
22824
  await dependencies.createVersion(pagePath, "modified content", "write", config, void 0, void 0, tmpDir);
22522
22825
  await dependencies.revertToVersion(pagePath, "1", config, void 0, tmpDir);
22523
22826
  const history = await dependencies.listVersions(pagePath, config, tmpDir);
22524
- const pageContent = await readFile16(pagePath, "utf-8");
22827
+ const pageContent = await readFile17(pagePath, "utf-8");
22525
22828
  const observed = await dependencies.getVersion(pagePath, "3", config, tmpDir);
22526
22829
  return {
22527
22830
  versionIds: history.versions.map((version) => version.versionId),
@@ -22534,11 +22837,11 @@ async function executeCase2(sample, dependencies) {
22534
22837
  const pruningConfig = versioningConfig({ maxVersionsPerPage: 2 });
22535
22838
  for (let index = 1; index <= 4; index += 1) {
22536
22839
  const content = `content v${index}`;
22537
- await writeFile8(pagePath, content, "utf-8");
22840
+ await writeFile9(pagePath, content, "utf-8");
22538
22841
  await dependencies.createVersion(pagePath, content, "write", pruningConfig, void 0, void 0, tmpDir);
22539
22842
  }
22540
22843
  const history = await dependencies.listVersions(pagePath, pruningConfig, tmpDir);
22541
- const pageContent = await readFile16(pagePath, "utf-8");
22844
+ const pageContent = await readFile17(pagePath, "utf-8");
22542
22845
  const prunedIds = [];
22543
22846
  for (const versionId of ["1", "2"]) {
22544
22847
  try {
@@ -22558,7 +22861,7 @@ async function executeCase2(sample, dependencies) {
22558
22861
  };
22559
22862
  }
22560
22863
  case "diff-output": {
22561
- await writeFile8(pagePath, "line 1\nline 2\nline 3", "utf-8");
22864
+ await writeFile9(pagePath, "line 1\nline 2\nline 3", "utf-8");
22562
22865
  await dependencies.createVersion(
22563
22866
  pagePath,
22564
22867
  "line 1\nline 2\nline 3",
@@ -22568,7 +22871,7 @@ async function executeCase2(sample, dependencies) {
22568
22871
  void 0,
22569
22872
  tmpDir
22570
22873
  );
22571
- await writeFile8(pagePath, "line 1\nline 2 changed\nline 3\nline 4", "utf-8");
22874
+ await writeFile9(pagePath, "line 1\nline 2 changed\nline 3\nline 4", "utf-8");
22572
22875
  await dependencies.createVersion(
22573
22876
  pagePath,
22574
22877
  "line 1\nline 2 changed\nline 3\nline 4",
@@ -22579,7 +22882,7 @@ async function executeCase2(sample, dependencies) {
22579
22882
  tmpDir
22580
22883
  );
22581
22884
  const history = await dependencies.listVersions(pagePath, config, tmpDir);
22582
- const pageContent = await readFile16(pagePath, "utf-8");
22885
+ const pageContent = await readFile17(pagePath, "utf-8");
22583
22886
  const diff = await dependencies.diffVersions(pagePath, "1", "2", config, tmpDir);
22584
22887
  const observedLines = normalizeDiffChangedLines(diff);
22585
22888
  return {
@@ -22591,7 +22894,7 @@ async function executeCase2(sample, dependencies) {
22591
22894
  }
22592
22895
  }
22593
22896
  } finally {
22594
- await rm4(tmpDir, { recursive: true, force: true });
22897
+ await rm5(tmpDir, { recursive: true, force: true });
22595
22898
  }
22596
22899
  }
22597
22900
  function isMissingPageVersionError(error, pagePath, versionId) {
@@ -24864,9 +25167,9 @@ function loadCases9(mode, limit) {
24864
25167
 
24865
25168
  // src/benchmarks/remnic/procedural-recall/runner.ts
24866
25169
  import { randomUUID as randomUUID21 } from "crypto";
24867
- import { mkdtemp as mkdtemp5, rm as rm5 } from "fs/promises";
25170
+ import { mkdtemp as mkdtemp5, rm as rm6 } from "fs/promises";
24868
25171
  import os7 from "os";
24869
- import path22 from "path";
25172
+ import path23 from "path";
24870
25173
  import {
24871
25174
  StorageManager as StorageManager2,
24872
25175
  parseConfig as parseConfig3,
@@ -24996,7 +25299,7 @@ async function runProceduralRecallBenchmark(options) {
24996
25299
  }
24997
25300
  for (const sample of e2eCases) {
24998
25301
  const startedAt = performance.now();
24999
- const dir = await mkdtemp5(path22.join(os7.tmpdir(), "remnic-bench-procedural-recall-"));
25302
+ const dir = await mkdtemp5(path23.join(os7.tmpdir(), "remnic-bench-procedural-recall-"));
25000
25303
  let section = null;
25001
25304
  try {
25002
25305
  const storage = new StorageManager2(dir);
@@ -25011,7 +25314,7 @@ ${body}`,
25011
25314
  );
25012
25315
  const config = parseConfig3({
25013
25316
  memoryDir: dir,
25014
- workspaceDir: path22.join(dir, "ws"),
25317
+ workspaceDir: path23.join(dir, "ws"),
25015
25318
  openaiApiKey: "bench-key",
25016
25319
  procedural: {
25017
25320
  enabled: sample.proceduralEnabled !== false,
@@ -25020,7 +25323,7 @@ ${body}`,
25020
25323
  });
25021
25324
  section = await buildProcedureRecallSection(storage, sample.prompt, config);
25022
25325
  } finally {
25023
- await rm5(dir, { recursive: true, force: true });
25326
+ await rm6(dir, { recursive: true, force: true });
25024
25327
  }
25025
25328
  const latencyMs = Math.round(performance.now() - startedAt);
25026
25329
  const nonNull = section !== null && section.length > 0;
@@ -25081,9 +25384,9 @@ ${body}`,
25081
25384
 
25082
25385
  // src/benchmarks/remnic/ingestion-entity-recall/runner.ts
25083
25386
  import { randomUUID as randomUUID22 } from "crypto";
25084
- import { mkdtemp as mkdtemp6, writeFile as writeFile9, rm as rm6, mkdir as mkdir9, realpath as realpath5 } from "fs/promises";
25387
+ import { mkdtemp as mkdtemp6, writeFile as writeFile10, rm as rm7, mkdir as mkdir10, realpath as realpath5 } from "fs/promises";
25085
25388
  import { tmpdir as tmpdir2 } from "os";
25086
- import path23 from "path";
25389
+ import path24 from "path";
25087
25390
 
25088
25391
  // src/ingestion-scorer.ts
25089
25392
  function normalize(value) {
@@ -25585,13 +25888,13 @@ async function runIngestionEntityRecallBenchmark(options) {
25585
25888
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
25586
25889
  }
25587
25890
  const fixture = emailFixture.generate();
25588
- const fixtureDir = await mkdtemp6(path23.join(tmpdir2(), "bench-email-"));
25891
+ const fixtureDir = await mkdtemp6(path24.join(tmpdir2(), "bench-email-"));
25589
25892
  try {
25590
25893
  await options.ingestionAdapter.reset();
25591
25894
  for (const file of fixture.files) {
25592
- const filePath = path23.join(fixtureDir, file.relativePath);
25593
- await mkdir9(path23.dirname(filePath), { recursive: true });
25594
- await writeFile9(filePath, file.content, "utf8");
25895
+ const filePath = path24.join(fixtureDir, file.relativePath);
25896
+ await mkdir10(path24.dirname(filePath), { recursive: true });
25897
+ await writeFile10(filePath, file.content, "utf8");
25595
25898
  }
25596
25899
  const { result: ingestionLog, durationMs } = await timed(
25597
25900
  async () => options.ingestionAdapter.ingest(await realpath5(fixtureDir))
@@ -25672,7 +25975,7 @@ async function runIngestionEntityRecallBenchmark(options) {
25672
25975
  ];
25673
25976
  return buildResult(options, tasks, durationMs);
25674
25977
  } finally {
25675
- await rm6(fixtureDir, { recursive: true, force: true });
25978
+ await rm7(fixtureDir, { recursive: true, force: true });
25676
25979
  }
25677
25980
  }
25678
25981
  async function buildResult(options, tasks, totalLatencyMs) {
@@ -25718,9 +26021,9 @@ async function buildResult(options, tasks, totalLatencyMs) {
25718
26021
 
25719
26022
  // src/benchmarks/remnic/ingestion-schema-completeness/runner.ts
25720
26023
  import { randomUUID as randomUUID23 } from "crypto";
25721
- import { mkdtemp as mkdtemp7, writeFile as writeFile10, rm as rm7, mkdir as mkdir10, realpath as realpath6 } from "fs/promises";
26024
+ import { mkdtemp as mkdtemp7, writeFile as writeFile11, rm as rm8, mkdir as mkdir11, realpath as realpath6 } from "fs/promises";
25722
26025
  import { tmpdir as tmpdir3 } from "os";
25723
- import path24 from "path";
26026
+ import path25 from "path";
25724
26027
  var ingestionSchemaCompletenessDefinition = {
25725
26028
  id: "ingestion-schema-completeness",
25726
26029
  title: "Ingestion: Schema Completeness",
@@ -25739,13 +26042,13 @@ async function runIngestionSchemaCompletenessBenchmark(options) {
25739
26042
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
25740
26043
  }
25741
26044
  const fixture = emailFixture.generate();
25742
- const fixtureDir = await mkdtemp7(path24.join(tmpdir3(), "bench-email-"));
26045
+ const fixtureDir = await mkdtemp7(path25.join(tmpdir3(), "bench-email-"));
25743
26046
  try {
25744
26047
  await options.ingestionAdapter.reset();
25745
26048
  for (const file of fixture.files) {
25746
- const filePath = path24.join(fixtureDir, file.relativePath);
25747
- await mkdir10(path24.dirname(filePath), { recursive: true });
25748
- await writeFile10(filePath, file.content, "utf8");
26049
+ const filePath = path25.join(fixtureDir, file.relativePath);
26050
+ await mkdir11(path25.dirname(filePath), { recursive: true });
26051
+ await writeFile11(filePath, file.content, "utf8");
25749
26052
  }
25750
26053
  const { result: ingestionLog, durationMs } = await timed(
25751
26054
  async () => options.ingestionAdapter.ingest(await realpath6(fixtureDir))
@@ -25885,15 +26188,15 @@ async function runIngestionSchemaCompletenessBenchmark(options) {
25885
26188
  }
25886
26189
  };
25887
26190
  } finally {
25888
- await rm7(fixtureDir, { recursive: true, force: true });
26191
+ await rm8(fixtureDir, { recursive: true, force: true });
25889
26192
  }
25890
26193
  }
25891
26194
 
25892
26195
  // src/benchmarks/remnic/ingestion-backlink-f1/runner.ts
25893
26196
  import { randomUUID as randomUUID24 } from "crypto";
25894
- import { mkdtemp as mkdtemp8, writeFile as writeFile11, rm as rm8, mkdir as mkdir11, realpath as realpath7 } from "fs/promises";
26197
+ import { mkdtemp as mkdtemp8, writeFile as writeFile12, rm as rm9, mkdir as mkdir12, realpath as realpath7 } from "fs/promises";
25895
26198
  import { tmpdir as tmpdir4 } from "os";
25896
- import path25 from "path";
26199
+ import path26 from "path";
25897
26200
  var ingestionBacklinkF1Definition = {
25898
26201
  id: "ingestion-backlink-f1",
25899
26202
  title: "Ingestion: Backlink F1",
@@ -25912,13 +26215,13 @@ async function runIngestionBacklinkF1Benchmark(options) {
25912
26215
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
25913
26216
  }
25914
26217
  const fixture = emailFixture.generate();
25915
- const fixtureDir = await mkdtemp8(path25.join(tmpdir4(), "bench-email-"));
26218
+ const fixtureDir = await mkdtemp8(path26.join(tmpdir4(), "bench-email-"));
25916
26219
  try {
25917
26220
  await options.ingestionAdapter.reset();
25918
26221
  for (const file of fixture.files) {
25919
- const filePath = path25.join(fixtureDir, file.relativePath);
25920
- await mkdir11(path25.dirname(filePath), { recursive: true });
25921
- await writeFile11(filePath, file.content, "utf8");
26222
+ const filePath = path26.join(fixtureDir, file.relativePath);
26223
+ await mkdir12(path26.dirname(filePath), { recursive: true });
26224
+ await writeFile12(filePath, file.content, "utf8");
25922
26225
  }
25923
26226
  const { result: ingestionLog, durationMs } = await timed(
25924
26227
  async () => options.ingestionAdapter.ingest(await realpath7(fixtureDir))
@@ -25986,15 +26289,15 @@ async function runIngestionBacklinkF1Benchmark(options) {
25986
26289
  }
25987
26290
  };
25988
26291
  } finally {
25989
- await rm8(fixtureDir, { recursive: true, force: true });
26292
+ await rm9(fixtureDir, { recursive: true, force: true });
25990
26293
  }
25991
26294
  }
25992
26295
 
25993
26296
  // src/benchmarks/remnic/ingestion-setup-friction/runner.ts
25994
26297
  import { randomUUID as randomUUID25 } from "crypto";
25995
- import { mkdtemp as mkdtemp9, writeFile as writeFile12, rm as rm9, mkdir as mkdir12, realpath as realpath8 } from "fs/promises";
26298
+ import { mkdtemp as mkdtemp9, writeFile as writeFile13, rm as rm10, mkdir as mkdir13, realpath as realpath8 } from "fs/promises";
25996
26299
  import { tmpdir as tmpdir5 } from "os";
25997
- import path26 from "path";
26300
+ import path27 from "path";
25998
26301
  var INGESTION_SETUP_FRICTION_LOWER_IS_BETTER = /* @__PURE__ */ new Set(["setup_friction", "commands_count", "prompts_count", "errors_count"]);
25999
26302
  var ingestionSetupFrictionDefinition = {
26000
26303
  id: "ingestion-setup-friction",
@@ -26014,13 +26317,13 @@ async function runIngestionSetupFrictionBenchmark(options) {
26014
26317
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
26015
26318
  }
26016
26319
  const fixture = emailFixture.generate();
26017
- const fixtureDir = await mkdtemp9(path26.join(tmpdir5(), "bench-friction-"));
26320
+ const fixtureDir = await mkdtemp9(path27.join(tmpdir5(), "bench-friction-"));
26018
26321
  try {
26019
26322
  await options.ingestionAdapter.reset();
26020
26323
  for (const file of fixture.files) {
26021
- const filePath = path26.join(fixtureDir, file.relativePath);
26022
- await mkdir12(path26.dirname(filePath), { recursive: true });
26023
- await writeFile12(filePath, file.content, "utf8");
26324
+ const filePath = path27.join(fixtureDir, file.relativePath);
26325
+ await mkdir13(path27.dirname(filePath), { recursive: true });
26326
+ await writeFile13(filePath, file.content, "utf8");
26024
26327
  }
26025
26328
  const { result: ingestionLog, durationMs } = await timed(
26026
26329
  async () => options.ingestionAdapter.ingest(await realpath8(fixtureDir))
@@ -26092,15 +26395,15 @@ async function runIngestionSetupFrictionBenchmark(options) {
26092
26395
  }
26093
26396
  };
26094
26397
  } finally {
26095
- await rm9(fixtureDir, { recursive: true, force: true });
26398
+ await rm10(fixtureDir, { recursive: true, force: true });
26096
26399
  }
26097
26400
  }
26098
26401
 
26099
26402
  // src/benchmarks/remnic/ingestion-citation-accuracy/runner.ts
26100
26403
  import { randomUUID as randomUUID26 } from "crypto";
26101
- import { mkdtemp as mkdtemp10, writeFile as writeFile13, rm as rm10, mkdir as mkdir13, realpath as realpath9 } from "fs/promises";
26404
+ import { mkdtemp as mkdtemp10, writeFile as writeFile14, rm as rm11, mkdir as mkdir14, realpath as realpath9 } from "fs/promises";
26102
26405
  import { tmpdir as tmpdir6 } from "os";
26103
- import path27 from "path";
26406
+ import path28 from "path";
26104
26407
  var CITATION_SUPPORT_THRESHOLD = 0.72;
26105
26408
  var ingestionCitationAccuracyDefinition = {
26106
26409
  id: "ingestion-citation-accuracy",
@@ -26159,10 +26462,10 @@ function resolveCitedSources(sourceRefs, seeAlso, pageRef, sourceContentMap) {
26159
26462
  return "";
26160
26463
  }
26161
26464
  for (const ref of normalizedRefs) {
26162
- const refBase = path27.basename(ref).toLowerCase();
26465
+ const refBase = path28.basename(ref).toLowerCase();
26163
26466
  let matched = false;
26164
26467
  for (const [relativePath, content] of sourceContentMap) {
26165
- if (relativePath === ref || relativePath.endsWith(ref) || path27.basename(relativePath).toLowerCase() === refBase) {
26468
+ if (relativePath === ref || relativePath.endsWith(ref) || path28.basename(relativePath).toLowerCase() === refBase) {
26166
26469
  resolved.push(content);
26167
26470
  matched = true;
26168
26471
  break;
@@ -26178,9 +26481,9 @@ function resolveCitedSources(sourceRefs, seeAlso, pageRef, sourceContentMap) {
26178
26481
  if (normalizedRefs.length > 0) {
26179
26482
  return "";
26180
26483
  }
26181
- const pageBase = path27.basename(pageRef).toLowerCase();
26484
+ const pageBase = path28.basename(pageRef).toLowerCase();
26182
26485
  for (const [relativePath, content] of sourceContentMap) {
26183
- if (path27.basename(relativePath).toLowerCase() === pageBase) {
26486
+ if (path28.basename(relativePath).toLowerCase() === pageBase) {
26184
26487
  return content;
26185
26488
  }
26186
26489
  }
@@ -26191,13 +26494,13 @@ async function runIngestionCitationAccuracyBenchmark(options) {
26191
26494
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
26192
26495
  }
26193
26496
  const fixture = emailFixture.generate();
26194
- const fixtureDir = await mkdtemp10(path27.join(tmpdir6(), "bench-citation-"));
26497
+ const fixtureDir = await mkdtemp10(path28.join(tmpdir6(), "bench-citation-"));
26195
26498
  try {
26196
26499
  await options.ingestionAdapter.reset();
26197
26500
  for (const file of fixture.files) {
26198
- const filePath = path27.join(fixtureDir, file.relativePath);
26199
- await mkdir13(path27.dirname(filePath), { recursive: true });
26200
- await writeFile13(filePath, file.content, "utf8");
26501
+ const filePath = path28.join(fixtureDir, file.relativePath);
26502
+ await mkdir14(path28.dirname(filePath), { recursive: true });
26503
+ await writeFile14(filePath, file.content, "utf8");
26201
26504
  }
26202
26505
  const benchmarkStart = performance.now();
26203
26506
  const { result: ingestionLog, durationMs: ingestionDurationMs } = await timed(
@@ -26394,7 +26697,7 @@ async function runIngestionCitationAccuracyBenchmark(options) {
26394
26697
  }
26395
26698
  };
26396
26699
  } finally {
26397
- await rm10(fixtureDir, { recursive: true, force: true });
26700
+ await rm11(fixtureDir, { recursive: true, force: true });
26398
26701
  }
26399
26702
  }
26400
26703
  function citationSupportScore(claim, citedSources) {
@@ -26582,7 +26885,7 @@ var ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS = ASSISTANT_MORNING_BRIEF_SCENARIOS.
26582
26885
 
26583
26886
  // src/benchmarks/remnic/_assistant-common/runner.ts
26584
26887
  import { randomUUID as randomUUID27 } from "crypto";
26585
- import path29 from "path";
26888
+ import path30 from "path";
26586
26889
 
26587
26890
  // src/run-seeds.ts
26588
26891
  function buildBenchmarkRunSeeds(runCount, baseSeed) {
@@ -26672,9 +26975,9 @@ function pairedDeltaConfidenceInterval(candidateValues, baselineValues, options
26672
26975
  }
26673
26976
 
26674
26977
  // src/judges/sealed-rubric.ts
26675
- import { createHash as createHash7 } from "crypto";
26978
+ import { createHash as createHash8 } from "crypto";
26676
26979
  import { appendFileSync, mkdirSync } from "fs";
26677
- import path28 from "path";
26980
+ import path29 from "path";
26678
26981
 
26679
26982
  // src/judges/sealed-prompts/assistant-rubric-v1.ts
26680
26983
  var ASSISTANT_RUBRIC_V1 = `# Assistant rubric v1 (sealed)
@@ -26781,7 +27084,7 @@ function loadSealedRubric(id = DEFAULT_ASSISTANT_RUBRIC_ID, options = {}) {
26781
27084
  if (typeof prompt !== "string" || prompt.length === 0) {
26782
27085
  throw new Error(`sealed rubric not found in registry: ${id}`);
26783
27086
  }
26784
- const sha256 = createHash7("sha256").update(prompt, "utf8").digest("hex");
27087
+ const sha256 = createHash8("sha256").update(prompt, "utf8").digest("hex");
26785
27088
  const version = parseVersionFromId(id);
26786
27089
  return { id, version, prompt, sha256 };
26787
27090
  }
@@ -26954,7 +27257,7 @@ function createSpotCheckFileLogger(options) {
26954
27257
  return { log() {
26955
27258
  } };
26956
27259
  }
26957
- const logPath = path28.join(directory, `${runId}.jsonl`);
27260
+ const logPath = path29.join(directory, `${runId}.jsonl`);
26958
27261
  let written = 0;
26959
27262
  let warnedOnWriteFailure = false;
26960
27263
  const cap = typeof sampleSize === "number" && sampleSize > 0 ? sampleSize : 5;
@@ -27043,7 +27346,7 @@ async function runAssistantBenchmark(definition, scenarios, resolved, runnerOpti
27043
27346
  const runId = buildRunId(definition.id);
27044
27347
  const spotCheckLogger = createSpotCheckFileLogger({
27045
27348
  runId,
27046
- directory: runnerOptions.spotCheckDir ?? path29.join(process.cwd(), "benchmarks", "results", "spot-checks"),
27349
+ directory: runnerOptions.spotCheckDir ?? path30.join(process.cwd(), "benchmarks", "results", "spot-checks"),
27047
27350
  sampleRate: 0.35,
27048
27351
  sampleSize: 5
27049
27352
  });
@@ -27692,9 +27995,9 @@ async function runAssistantSynthesisBenchmark(options) {
27692
27995
 
27693
27996
  // src/benchmarks/remnic/buffer-surprise-trigger/runner.ts
27694
27997
  import { randomUUID as randomUUID28 } from "crypto";
27695
- import path30 from "path";
27998
+ import path31 from "path";
27696
27999
  import os8 from "os";
27697
- import { mkdir as mkdir14, rm as rm11 } from "fs/promises";
28000
+ import { mkdir as mkdir15, rm as rm12 } from "fs/promises";
27698
28001
  import {
27699
28002
  SmartBuffer,
27700
28003
  computeSurprise,
@@ -27923,11 +28226,11 @@ function hasExplicitTopicPivotCue(text) {
27923
28226
  }
27924
28227
  async function runBufferSurpriseTriggerBenchmark(options) {
27925
28228
  const cases = loadCases10(options.mode, options.limit);
27926
- const tmpRoot = path30.join(
28229
+ const tmpRoot = path31.join(
27927
28230
  os8.tmpdir(),
27928
28231
  `remnic-bench-buffer-surprise-${randomUUID28()}`
27929
28232
  );
27930
- await mkdir14(tmpRoot, { recursive: true });
28233
+ await mkdir15(tmpRoot, { recursive: true });
27931
28234
  const tasks = [];
27932
28235
  const startedAt = performance.now();
27933
28236
  try {
@@ -27945,7 +28248,7 @@ async function runBufferSurpriseTriggerBenchmark(options) {
27945
28248
  tasks.push(buildTaskResult(caseDef, control, candidate));
27946
28249
  }
27947
28250
  } finally {
27948
- await rm11(tmpRoot, { recursive: true, force: true });
28251
+ await rm12(tmpRoot, { recursive: true, force: true });
27949
28252
  }
27950
28253
  const totalLatencyMs = Math.round(performance.now() - startedAt);
27951
28254
  const aggregates = buildAggregates2(tasks);
@@ -27992,12 +28295,12 @@ async function runBufferSurpriseTriggerBenchmark(options) {
27992
28295
  };
27993
28296
  }
27994
28297
  async function runSingleCase(caseDef, options) {
27995
- const memoryDir = path30.join(
28298
+ const memoryDir = path31.join(
27996
28299
  options.tmpRoot,
27997
28300
  `${caseDef.id}-${options.label}`
27998
28301
  );
27999
- const workspaceDir = path30.join(memoryDir, "workspace");
28000
- await mkdir14(workspaceDir, { recursive: true });
28302
+ const workspaceDir = path31.join(memoryDir, "workspace");
28303
+ await mkdir15(workspaceDir, { recursive: true });
28001
28304
  const config = parseConfig4({
28002
28305
  memoryDir,
28003
28306
  workspaceDir,
@@ -29151,8 +29454,8 @@ function finalizeBenchmarkResultConfig(result, options) {
29151
29454
  }
29152
29455
 
29153
29456
  // src/benchmark.ts
29154
- var DEFAULT_BASELINE_PATH = path31.join(process.cwd(), "benchmarks", "baseline.json");
29155
- var DEFAULT_REPORT_PATH = path31.join(process.cwd(), "benchmarks", "report.json");
29457
+ var DEFAULT_BASELINE_PATH = path32.join(process.cwd(), "benchmarks", "baseline.json");
29458
+ var DEFAULT_REPORT_PATH = path32.join(process.cwd(), "benchmarks", "report.json");
29156
29459
  var BASELINE_VERSION = 1;
29157
29460
  var DEFAULT_TOLERANCE = 10;
29158
29461
  var DEFAULT_FULL_RUN_COUNT = 5;
@@ -29214,7 +29517,33 @@ async function runBenchmark(benchmarkId, options) {
29214
29517
  const log = (message) => {
29215
29518
  console.error(` ${message}`);
29216
29519
  };
29217
- const system = !shouldGuardSystem ? options.system : createTimeoutGuardedAdapter(options.system, {
29520
+ const originalSystemJudge = options.system.judge;
29521
+ let systemJudgeMutatedInPlace = false;
29522
+ let judgeCacheCounters;
29523
+ let cachedCrossJudge;
29524
+ let crossJudgeCacheCounters;
29525
+ let primaryDrainPendingWrites;
29526
+ let crossDrainPendingWrites;
29527
+ const cacheWiring = (() => {
29528
+ if (options.noJudgeCache) {
29529
+ return void 0;
29530
+ }
29531
+ const willWrapPrimary = options.system.judge !== void 0 && (options.judgeProvider ?? null) !== null;
29532
+ const willWrapCross = options.amaBenchCrossJudge !== void 0 && (options.amaBenchCrossJudgeProvider ?? null) !== null;
29533
+ if (!willWrapPrimary && !willWrapCross) {
29534
+ return void 0;
29535
+ }
29536
+ const cacheDir = options.judgeCacheDir ? path32.resolve(expandTildePath3(options.judgeCacheDir)) : options.outputDir ? path32.join(path32.resolve(expandTildePath3(options.outputDir)), "judge-cache") : void 0;
29537
+ if (cacheDir === void 0) {
29538
+ return void 0;
29539
+ }
29540
+ return {
29541
+ cache: new JudgeCache({ dir: cacheDir }),
29542
+ willWrapPrimary,
29543
+ willWrapCross
29544
+ };
29545
+ })();
29546
+ let system = !shouldGuardSystem ? options.system : createTimeoutGuardedAdapter(options.system, {
29218
29547
  benchmarkId,
29219
29548
  ...timeoutMs !== void 0 ? { timeoutMs } : {},
29220
29549
  ...options.drainTimeoutMs !== void 0 ? { drainTimeoutMs: options.drainTimeoutMs } : {},
@@ -29245,18 +29574,118 @@ async function runBenchmark(benchmarkId, options) {
29245
29574
  }) : rawIngestionAdapter;
29246
29575
  let result;
29247
29576
  try {
29577
+ if (cacheWiring?.willWrapPrimary && system.judge !== void 0) {
29578
+ const primary = wrapJudgeWithCache({
29579
+ role: "primary",
29580
+ judge: system.judge,
29581
+ benchmarkId,
29582
+ datasetVersion: definition.meta.version,
29583
+ amaBenchJudgeProtocol: options.amaBenchJudgeProtocol ?? "default",
29584
+ provider: options.judgeProvider ?? null,
29585
+ cache: cacheWiring.cache
29586
+ });
29587
+ judgeCacheCounters = primary.counters;
29588
+ primaryDrainPendingWrites = primary.drainPendingWrites;
29589
+ try {
29590
+ system.judge = primary.judge;
29591
+ systemJudgeMutatedInPlace = system === options.system;
29592
+ } catch {
29593
+ system = createJudgeOverrideProxy(system, primary.judge);
29594
+ }
29595
+ }
29596
+ if (cacheWiring?.willWrapCross) {
29597
+ const wrapped = wrapJudgeWithCache({
29598
+ role: "cross",
29599
+ judge: options.amaBenchCrossJudge,
29600
+ benchmarkId,
29601
+ datasetVersion: definition.meta.version,
29602
+ amaBenchJudgeProtocol: options.amaBenchJudgeProtocol ?? "default",
29603
+ provider: options.amaBenchCrossJudgeProvider ?? null,
29604
+ cache: cacheWiring.cache
29605
+ });
29606
+ cachedCrossJudge = wrapped.judge;
29607
+ crossJudgeCacheCounters = wrapped.counters;
29608
+ crossDrainPendingWrites = wrapped.drainPendingWrites;
29609
+ }
29248
29610
  result = await registeredBenchmark.run({
29249
29611
  ...options,
29250
29612
  system,
29613
+ // PR #1591 P2 (thread #10): when caching is on AND a cross judge is
29614
+ // configured, hand the cached cross judge to the runner so AMA-Bench
29615
+ // cross-judge calls participate in the same content-keyed cache as
29616
+ // the primary system judge. Without this override, the runner kept
29617
+ // calling the unwrapped cross judge on every iteration.
29618
+ ...cachedCrossJudge ? { amaBenchCrossJudge: cachedCrossJudge } : {},
29251
29619
  ...ingestionAdapter ? { ingestionAdapter } : {},
29252
29620
  mode: options.mode ?? "quick",
29253
29621
  benchmark: definition
29254
29622
  });
29255
29623
  } finally {
29256
- await destroyOwnedIngestionAdapter();
29624
+ try {
29625
+ await destroyOwnedIngestionAdapter();
29626
+ } finally {
29627
+ if (systemJudgeMutatedInPlace) {
29628
+ options.system.judge = originalSystemJudge;
29629
+ }
29630
+ }
29631
+ if (primaryDrainPendingWrites) {
29632
+ await primaryDrainPendingWrites();
29633
+ }
29634
+ if (crossDrainPendingWrites) {
29635
+ await crossDrainPendingWrites();
29636
+ }
29637
+ }
29638
+ const primaryCalls = judgeCacheCounters?.modelCalls ?? 0;
29639
+ const crossCalls = crossJudgeCacheCounters?.modelCalls ?? 0;
29640
+ if (judgeCacheCounters !== void 0 || crossJudgeCacheCounters !== void 0) {
29641
+ result.cost.judgeModelCalls = primaryCalls + crossCalls;
29257
29642
  }
29258
29643
  return finalizeBenchmarkResultConfig(result, options);
29259
29644
  }
29645
+ function wrapJudgeWithCache(args) {
29646
+ const crossJudgeIdSuffix = args.role === "cross" ? "-cross" : "";
29647
+ const wrapped = runJudgeWithCache({
29648
+ judge: args.judge,
29649
+ cache: args.cache,
29650
+ keyExtras: {
29651
+ benchmarkId: `${args.benchmarkId}${crossJudgeIdSuffix}`,
29652
+ datasetVersion: args.datasetVersion,
29653
+ // Protocol identity: bench judge protocol version + the selected
29654
+ // judge protocol variant, suffixed by role so primary vs cross
29655
+ // differentiator is part of the prompt hash. Bumping
29656
+ // JUDGE_CACHE_PROTOCOL_VERSION invalidates verdicts when judge
29657
+ // prompt/parse semantics change (PR #1591, High).
29658
+ judgePromptHash: createHash9("sha256").update(JUDGE_CACHE_PROTOCOL_VERSION).update("").update(args.amaBenchJudgeProtocol).update("").update(args.role).digest("hex"),
29659
+ judgeModelId: args.provider?.model !== void 0 && args.provider.model.length > 0 ? `${args.provider.model}${crossJudgeIdSuffix}` : `unknown-${args.role}-judge`,
29660
+ // Full judge configuration, deterministically serialized (sorted
29661
+ // keys) so provider/base-url/retry changes produce fresh cache
29662
+ // keys. `role` is included so primary and cross judges never
29663
+ // share a paramsHash.
29664
+ judgeParamsHash: createHash9("sha256").update(
29665
+ stableStringify2({
29666
+ role: args.role,
29667
+ provider: args.provider
29668
+ })
29669
+ ).digest("hex")
29670
+ }
29671
+ });
29672
+ return {
29673
+ judge: wrapped,
29674
+ counters: wrapped.counters,
29675
+ drainPendingWrites: wrapped.drainPendingWrites
29676
+ };
29677
+ }
29678
+ function createJudgeOverrideProxy(adapter, judge) {
29679
+ return new Proxy(adapter, {
29680
+ get(target, prop) {
29681
+ if (prop === "judge") {
29682
+ return judge;
29683
+ }
29684
+ const value = Reflect.get(target, prop, target);
29685
+ return typeof value === "function" ? value.bind(target) : value;
29686
+ }
29687
+ });
29688
+ }
29260
29689
  function benchmarkDefinition(id) {
29261
29690
  const definition = getBenchmark(id);
29262
29691
  if (!definition) {
@@ -29293,7 +29722,7 @@ function loadBaseline(baselinePath) {
29293
29722
  return raw;
29294
29723
  }
29295
29724
  function saveBaseline(baselinePath, baseline) {
29296
- fs2.mkdirSync(path31.dirname(baselinePath), { recursive: true });
29725
+ fs2.mkdirSync(path32.dirname(baselinePath), { recursive: true });
29297
29726
  fs2.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}
29298
29727
  `);
29299
29728
  }
@@ -29523,7 +29952,7 @@ function generateReport(results, reportPath) {
29523
29952
  totalDurationMs: results.reduce((sum, result) => sum + result.totalDurationMs, 0)
29524
29953
  };
29525
29954
  if (reportPath) {
29526
- fs2.mkdirSync(path31.dirname(reportPath), { recursive: true });
29955
+ fs2.mkdirSync(path32.dirname(reportPath), { recursive: true });
29527
29956
  fs2.writeFileSync(reportPath, `${JSON.stringify(report, null, 2)}
29528
29957
  `);
29529
29958
  }
@@ -29670,7 +30099,7 @@ function getBenchmarkLowerIsBetter(benchmarkId) {
29670
30099
  }
29671
30100
 
29672
30101
  // src/integrity/sealed-qrels.ts
29673
- import { readFile as readFile17 } from "fs/promises";
30102
+ import { readFile as readFile18 } from "fs/promises";
29674
30103
  function isSealedQrelsArtifact(value) {
29675
30104
  if (!value || typeof value !== "object") {
29676
30105
  return false;
@@ -29740,7 +30169,7 @@ function parseSealedQrels(raw, options = {}) {
29740
30169
  };
29741
30170
  }
29742
30171
  async function loadSealedQrels(filePath, options = {}) {
29743
- const raw = await readFile17(filePath, "utf8");
30172
+ const raw = await readFile18(filePath, "utf8");
29744
30173
  return parseSealedQrels(raw, options);
29745
30174
  }
29746
30175
  function serializeSealedQrels(artifact) {
@@ -29860,7 +30289,7 @@ function selectFixtureVariant(variants, seed) {
29860
30289
  }
29861
30290
 
29862
30291
  // src/benchmarks/custom/loader.ts
29863
- import { readFile as readFile18 } from "fs/promises";
30292
+ import { readFile as readFile19 } from "fs/promises";
29864
30293
  import { parse as parseYaml } from "yaml";
29865
30294
  var CUSTOM_SCORING_VALUES = /* @__PURE__ */ new Set([
29866
30295
  "exact_match",
@@ -29880,7 +30309,7 @@ function parseCustomBenchmark(source) {
29880
30309
  async function loadCustomBenchmarkFile(filePath) {
29881
30310
  let source;
29882
30311
  try {
29883
- source = await readFile18(filePath, "utf8");
30312
+ source = await readFile19(filePath, "utf8");
29884
30313
  } catch (error) {
29885
30314
  throw new Error(
29886
30315
  `Failed to read custom benchmark file ${filePath}: ${formatError(error)}`
@@ -29988,15 +30417,60 @@ function formatError(error) {
29988
30417
 
29989
30418
  // src/benchmarks/custom/runner.ts
29990
30419
  import { randomUUID as randomUUID31 } from "crypto";
29991
- import path32 from "path";
30420
+ import path33 from "path";
30421
+ import { expandTildePath as expandTildePath4 } from "@remnic/core";
29992
30422
  async function runCustomBenchmarkFile(filePath, options) {
29993
30423
  const spec = await loadCustomBenchmarkFile(filePath);
29994
30424
  const benchmark = createCustomBenchmarkDefinition(spec, filePath);
29995
- return runCustomBenchmark(spec, {
30425
+ const runOptions = {
29996
30426
  ...options,
29997
30427
  mode: options.mode ?? "quick",
29998
30428
  benchmark
29999
- });
30429
+ };
30430
+ let cacheRestore;
30431
+ let cacheCounters;
30432
+ if (spec.scoring === "llm_judge" && runOptions.system.judge !== void 0 && !runOptions.noJudgeCache && (runOptions.judgeProvider ?? null) !== null) {
30433
+ const cacheDir = runOptions.judgeCacheDir ? path33.resolve(expandTildePath4(runOptions.judgeCacheDir)) : runOptions.outputDir ? path33.join(path33.resolve(expandTildePath4(runOptions.outputDir)), "judge-cache") : void 0;
30434
+ if (cacheDir !== void 0) {
30435
+ const originalJudge = runOptions.system.judge;
30436
+ const wrapped = wrapJudgeWithCache({
30437
+ role: "primary",
30438
+ judge: originalJudge,
30439
+ benchmarkId: benchmark.id,
30440
+ datasetVersion: benchmark.meta.version,
30441
+ amaBenchJudgeProtocol: runOptions.amaBenchJudgeProtocol ?? "default",
30442
+ provider: runOptions.judgeProvider ?? null,
30443
+ cache: new JudgeCache({ dir: cacheDir })
30444
+ });
30445
+ let systemJudgeMutatedInPlace = false;
30446
+ try {
30447
+ runOptions.system.judge = wrapped.judge;
30448
+ systemJudgeMutatedInPlace = true;
30449
+ } catch {
30450
+ runOptions.system = createJudgeOverrideProxy(runOptions.system, wrapped.judge);
30451
+ }
30452
+ cacheCounters = wrapped.counters;
30453
+ const needRestore = systemJudgeMutatedInPlace;
30454
+ cacheRestore = async () => {
30455
+ if (needRestore) {
30456
+ runOptions.system.judge = originalJudge;
30457
+ }
30458
+ await wrapped.drainPendingWrites();
30459
+ };
30460
+ }
30461
+ }
30462
+ let result;
30463
+ try {
30464
+ result = await runCustomBenchmark(spec, runOptions);
30465
+ } finally {
30466
+ if (cacheRestore) {
30467
+ await cacheRestore();
30468
+ }
30469
+ }
30470
+ if (cacheCounters) {
30471
+ result.cost.judgeModelCalls = cacheCounters.modelCalls;
30472
+ }
30473
+ return result;
30000
30474
  }
30001
30475
  async function runCustomBenchmark(spec, options) {
30002
30476
  if (spec.scoring === "llm_judge" && !options.system.judge) {
@@ -30157,7 +30631,7 @@ async function scoreTask(scoring, options, question, actual, expected) {
30157
30631
  }
30158
30632
  }
30159
30633
  function createCustomBenchmarkDefinition(benchmark, filePath) {
30160
- const id = `custom:${slugify(path32.basename(filePath, path32.extname(filePath)) || benchmark.name)}`;
30634
+ const id = `custom:${slugify(path33.basename(filePath, path33.extname(filePath)) || benchmark.name)}`;
30161
30635
  return {
30162
30636
  id,
30163
30637
  title: benchmark.name,
@@ -31027,9 +31501,9 @@ var chatFixture = {
31027
31501
  };
31028
31502
 
31029
31503
  // src/benchmarks/remnic/procedural-recall/ablation.ts
31030
- import { mkdir as mkdir15, mkdtemp as mkdtemp11, rm as rm12, writeFile as writeFile14, readFile as readFile19 } from "fs/promises";
31504
+ import { mkdir as mkdir16, mkdtemp as mkdtemp11, rm as rm13, writeFile as writeFile15, readFile as readFile20 } from "fs/promises";
31031
31505
  import os9 from "os";
31032
- import path33 from "path";
31506
+ import path34 from "path";
31033
31507
  import {
31034
31508
  StorageManager as StorageManager3,
31035
31509
  parseConfig as parseConfig5,
@@ -31060,7 +31534,7 @@ async function runSide(scenarios, proceduralEnabled) {
31060
31534
  const observed = [];
31061
31535
  for (const scenario of scenarios) {
31062
31536
  const dir = await mkdtemp11(
31063
- path33.join(os9.tmpdir(), "remnic-bench-proc-ablation-")
31537
+ path34.join(os9.tmpdir(), "remnic-bench-proc-ablation-")
31064
31538
  );
31065
31539
  try {
31066
31540
  const storage = new StorageManager3(dir);
@@ -31075,7 +31549,7 @@ ${body}`,
31075
31549
  );
31076
31550
  const config = parseConfig5({
31077
31551
  memoryDir: dir,
31078
- workspaceDir: path33.join(dir, "ws"),
31552
+ workspaceDir: path34.join(dir, "ws"),
31079
31553
  openaiApiKey: "bench-key",
31080
31554
  procedural: {
31081
31555
  enabled: proceduralEnabled,
@@ -31089,7 +31563,7 @@ ${body}`,
31089
31563
  );
31090
31564
  observed.push(section !== null && section.length > 0);
31091
31565
  } finally {
31092
- await rm12(dir, { recursive: true, force: true });
31566
+ await rm13(dir, { recursive: true, force: true });
31093
31567
  }
31094
31568
  }
31095
31569
  return observed;
@@ -31150,7 +31624,7 @@ async function runProceduralAblation(options) {
31150
31624
  };
31151
31625
  }
31152
31626
  async function loadAblationFixture(fixturePath) {
31153
- const raw = await readFile19(fixturePath, "utf8");
31627
+ const raw = await readFile20(fixturePath, "utf8");
31154
31628
  let parsed;
31155
31629
  try {
31156
31630
  parsed = JSON.parse(raw);
@@ -31246,9 +31720,9 @@ async function runProceduralAblationCli(args) {
31246
31720
  random: args.random,
31247
31721
  seed: args.seed
31248
31722
  });
31249
- const outDir = path33.dirname(path33.resolve(args.outPath));
31250
- await mkdir15(outDir, { recursive: true });
31251
- await writeFile14(args.outPath, JSON.stringify(artifact, null, 2) + "\n", "utf8");
31723
+ const outDir = path34.dirname(path34.resolve(args.outPath));
31724
+ await mkdir16(outDir, { recursive: true });
31725
+ await writeFile15(args.outPath, JSON.stringify(artifact, null, 2) + "\n", "utf8");
31252
31726
  return artifact;
31253
31727
  }
31254
31728