@remnic/bench 9.3.674 → 9.3.676
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +19 -0
- package/dist/index.js +625 -151
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -10562,12 +10562,315 @@ function isPlainObject(value) {
|
|
|
10562
10562
|
|
|
10563
10563
|
// src/benchmark.ts
|
|
10564
10564
|
import fs2 from "fs";
|
|
10565
|
-
import
|
|
10565
|
+
import path32 from "path";
|
|
10566
|
+
import { createHash as createHash9 } from "crypto";
|
|
10567
|
+
import { expandTildePath as expandTildePath3 } from "@remnic/core";
|
|
10568
|
+
|
|
10569
|
+
// src/judges/judge-cache.ts
|
|
10570
|
+
import { createHash as createHash6, randomBytes as randomBytes2 } from "crypto";
|
|
10571
|
+
import {
|
|
10572
|
+
mkdir as mkdir8,
|
|
10573
|
+
readFile as readFile9,
|
|
10574
|
+
rename as rename2,
|
|
10575
|
+
rm as rm3,
|
|
10576
|
+
writeFile as writeFile8
|
|
10577
|
+
} from "fs/promises";
|
|
10578
|
+
import path11 from "path";
|
|
10579
|
+
// Version label for the judge-cache protocol. Nothing in the visible portion
// of this bundle reads it; presumably it is mixed into cache keys or exported
// elsewhere so a protocol change can invalidate old entries — TODO confirm.
var JUDGE_CACHE_PROTOCOL_VERSION = "judge-protocol-v1";
|
|
10580
|
+
/**
 * Serialize `value` to JSON-like text with object keys emitted in sorted
 * order, so two structurally equal inputs always yield the same string
 * regardless of key insertion order.
 *
 * Anything `JSON.stringify` cannot represent at the leaf level (for example
 * `undefined` or a function) is rendered as the literal text `null`.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} Deterministic serialization of `value`.
 */
function stableStringify2(value) {
  const isObjectLike = value !== null && typeof value === "object";
  if (!isObjectLike) {
    // Primitives (and functions/undefined, which stringify to undefined).
    return JSON.stringify(value) ?? "null";
  }
  if (Array.isArray(value)) {
    const renderedItems = value.map((entry) => stableStringify2(entry));
    return "[" + renderedItems.join(",") + "]";
  }
  const renderedPairs = [];
  for (const name of Object.keys(value).sort()) {
    renderedPairs.push(JSON.stringify(name) + ":" + stableStringify2(value[name]));
  }
  return "{" + renderedPairs.join(",") + "}";
}
|
|
10592
|
+
/**
 * Disk-backed cache of judge verdicts: one `<key>.json` file per entry under
 * a single directory, where `<key>` is the sha256 hex digest produced by
 * `computeKey`.
 */
var JudgeCache = class {
  /** Absolute path of the directory that holds the cache entries. */
  dir;
  // Per-key write serialization so concurrent writers never race a temp-file
  // rename into place for the same key. Cached entries are read straight from
  // disk, so reads remain lock-free.
  writeQueues = /* @__PURE__ */ new Map();
  // In-memory layer for fire-and-forget writes. A caller may chain put()
  // without awaiting it, so a later get() can run before the disk rename
  // lands. The inflight map is populated synchronously inside put() (before
  // its first await) and cleared in the finally after the write settles, so a
  // fresh process never has inflight entries.
  inflight = /* @__PURE__ */ new Map();
  // True once mkdir has succeeded; reset when a write fails so that a cache
  // directory removed mid-run is recreated on the next write.
  cachedDirExists = false;
  /**
   * @param {{ dir: string }} options - `dir` is resolved to an absolute path.
   */
  constructor(options) {
    this.dir = path11.resolve(options.dir);
  }
  /**
   * Compute the sha256-hex key for a set of parts. Pure, sync,
   * side-effect-free. Each field is digested on its own before being folded
   * into the outer hash, so field boundaries cannot shift between fields.
   *
   * @param {object} parts - benchmarkId, datasetVersion, questionId,
   *   answerText, judgePromptHash, judgeModelId, judgeParamsHash.
   * @returns {string} 64-character lowercase hex digest.
   */
  computeKey(parts) {
    const fieldDigest = (value) => createHash6("sha256").update(value).digest();
    const outer = createHash6("sha256");
    outer.update(fieldDigest(parts.benchmarkId));
    outer.update(fieldDigest(parts.datasetVersion));
    outer.update(fieldDigest(parts.questionId));
    outer.update(fieldDigest(parts.answerText));
    outer.update(fieldDigest(parts.judgePromptHash));
    outer.update(fieldDigest(parts.judgeModelId));
    outer.update(fieldDigest(parts.judgeParamsHash));
    return outer.digest("hex");
  }
  /**
   * Read a previously-stored verdict. Resolves to `undefined` on miss,
   * unreadable file, unparseable or incomplete envelope, or an envelope whose
   * recorded key does not match the requested key — never fabricates a hit.
   *
   * @param {object} parts - Same shape as accepted by `computeKey`.
   * @returns {Promise<{cacheHit: true, verdict: object, storedAt: string} | undefined>}
   */
  async get(parts) {
    const key = this.computeKey(parts);
    // Serve writes that have been accepted by put() but not yet renamed into place.
    const inflightHit = this.inflight.get(key);
    if (inflightHit !== void 0) {
      return {
        cacheHit: true,
        verdict: inflightHit.verdict,
        storedAt: inflightHit.storedAt
      };
    }
    let raw;
    try {
      raw = await readFile9(this.entryPath(key), "utf8");
    } catch {
      return void 0;
    }
    const envelope = parseEnvelope(raw);
    // put() always records the key inside the envelope, so a mismatch means
    // the file was corrupted or copied under the wrong name: treat as a miss.
    if (envelope === void 0 || envelope.key !== key) return void 0;
    return {
      cacheHit: true,
      verdict: envelope.verdict,
      storedAt: envelope.storedAt
    };
  }
  /**
   * Persist a verdict atomically: write to a temp file then rename into
   * place. Concurrent writes for the same key serialize via an in-memory
   * chain so the temp-file never lands on top of a sibling rename.
   *
   * @param {object} parts - Same shape as accepted by `computeKey`.
   * @param {object} verdict - Judge result to store.
   * @returns {Promise<void>} Rejects with the underlying filesystem error if
   *   the write fails.
   */
  async put(parts, verdict) {
    const key = this.computeKey(parts);
    const envelope = {
      storedAt: (/* @__PURE__ */ new Date()).toISOString(),
      key,
      verdict
    };
    // Must happen before the first await so get() sees the entry immediately.
    this.inflight.set(key, envelope);
    const prior = this.writeQueues.get(key) ?? Promise.resolve();
    const next = prior.then(() => this.writeOne(key, envelope));
    // The queued promise swallows failures so one failed write does not
    // poison every later write chained behind it.
    const tracked = next.catch(() => void 0);
    this.writeQueues.set(key, tracked);
    try {
      await next;
    } finally {
      // Only clean up if no newer put() for this key has replaced our entries.
      if (this.writeQueues.get(key) === tracked) {
        this.writeQueues.delete(key);
      }
      if (this.inflight.get(key) === envelope) {
        this.inflight.delete(key);
      }
    }
  }
  /** Number of in-flight per-key write chains (diagnostic/test seam). */
  pendingWriteCount() {
    return this.writeQueues.size;
  }
  /**
   * Write one envelope to `<dir>/<key>.json` via a uniquely named temp file.
   * On any failure the temp file is removed (best effort) and the error is
   * rethrown.
   */
  async writeOne(key, envelope) {
    if (!this.cachedDirExists) {
      await mkdir8(this.dir, { recursive: true });
      this.cachedDirExists = true;
    }
    const filePath = this.entryPath(key);
    const tempPath = path11.join(
      this.dir,
      `.${key}.${randomBytes2(6).toString("hex")}.tmp`
    );
    try {
      // The temp write sits inside the try so a failed or partial write does
      // not leave a stray .tmp file behind.
      await writeFile8(tempPath, `${JSON.stringify(envelope)}\n`, "utf8");
      await rename2(tempPath, filePath);
    } catch (error) {
      // The directory may have disappeared since it was created; forget the
      // cached flag so the next write runs mkdir again instead of failing forever.
      this.cachedDirExists = false;
      await rm3(tempPath, { force: true }).catch(() => void 0);
      throw error;
    }
  }
  /** Absolute path of the entry file for `key`. */
  entryPath(key) {
    return path11.join(this.dir, `${key}.json`);
  }
};
|
|
10701
|
+
/**
 * Wrap a judge so its verdicts are served from, and written to, an optional
 * cache.
 *
 * The returned wrapper exposes `score`, `scoreWithMetrics`, and — only when
 * the underlying judge implements it — `scoreBinaryPrompt`. Cache hits are
 * returned with zero token counts and zero latency. Cache writes are
 * fire-and-forget; call `drainPendingWrites()` to wait for them.
 *
 * @param {object} options
 * @param {object} options.judge - Judge exposing `score` and/or
 *   `scoreWithMetrics`, optionally `scoreBinaryPrompt`.
 * @param {object} [options.cache] - Object with async `get(parts)` and
 *   `put(parts, verdict)`; when absent every call goes to the judge.
 * @param {object} [options.keyExtras] - Optional benchmarkId, datasetVersion,
 *   judgePromptHash, judgeModelId, judgeParamsHash mixed into cache keys;
 *   missing fields fall back to fixed "unknown-*" placeholders.
 * @returns {object} Wrapper with `counters`, `cache`, `drainPendingWrites`,
 *   `score`, `scoreWithMetrics` and possibly `scoreBinaryPrompt`.
 */
function runJudgeWithCache(options) {
  const { judge, cache } = options;
  const keyExtras = options.keyExtras ?? {};
  const counters = {
    modelCalls: 0,
    cacheHits: 0,
    cacheMisses: 0,
    cacheWriteFailures: 0
  };
  // Chain of all cache writes started so far; never rejects because each
  // write has its failure converted into a counter increment.
  let pendingWrites = Promise.resolve();
  const putSafely = (parts, verdict, control) => {
    if (!cache) return;
    // Do not persist results produced for a caller that has already aborted.
    if (control?.signal?.aborted) return;
    const write = cache.put(parts, verdict).catch(() => {
      counters.cacheWriteFailures += 1;
    });
    pendingWrites = pendingWrites.then(() => write);
  };
  // Upper bound on how long a cache read may delay a judge call.
  const CACHE_READ_BUDGET_MS = 250;
  // Race the cache read against the budget; resolves to undefined when the
  // caller is already aborted or the read does not finish in time.
  async function readCacheWithAbort(cache2, parts, control) {
    if (control?.signal?.aborted) return void 0;
    const read = cache2.get(parts);
    let budgetTimer;
    const readBudget = new Promise((resolveBudget) => {
      budgetTimer = setTimeout(() => {
        resolveBudget(void 0);
      }, CACHE_READ_BUDGET_MS);
    });
    try {
      return await Promise.race([read, readBudget]);
    } finally {
      // Without this every lookup leaves a live 250 ms timer behind, which
      // keeps the event loop (and the process) alive after the work is done.
      clearTimeout(budgetTimer);
    }
  }
  // Shape a stored verdict as a fresh result: no tokens spent, no latency.
  const cachedVerdict = (stored) => ({
    score: stored.score,
    tokens: { input: 0, output: 0 },
    latencyMs: 0,
    ...stored.model !== void 0 ? { model: stored.model } : {}
  });
  // Assemble the cache-key parts shared by both scoring paths.
  const buildParts = (questionId, answerText) => ({
    benchmarkId: keyExtras.benchmarkId ?? "unknown-benchmark",
    datasetVersion: keyExtras.datasetVersion ?? "unknown-version",
    questionId,
    answerText,
    judgePromptHash: keyExtras.judgePromptHash ?? "unknown-prompt",
    judgeModelId: keyExtras.judgeModelId ?? "unknown-judge",
    judgeParamsHash: keyExtras.judgeParamsHash ?? "unknown-params"
  });
  // Look up `parts` in the cache and update hit/miss counters. Only called
  // when a cache is configured. Read errors are treated as misses.
  async function lookupCached(parts, control) {
    let hit;
    try {
      hit = await readCacheWithAbort(cache, parts, control);
    } catch {
      hit = void 0;
    }
    if (hit) {
      counters.cacheHits += 1;
      return cachedVerdict(hit.verdict);
    }
    counters.cacheMisses += 1;
    return void 0;
  }
  const wrapper = {
    counters,
    cache,
    drainPendingWrites: () => pendingWrites,
    async score(question, predicted, expected, control) {
      const detailed = await wrapper.scoreWithMetrics(
        question,
        predicted,
        expected,
        control
      );
      return detailed.score;
    },
    async scoreWithMetrics(question, predicted, expected, control) {
      // NOTE(review): predicted and expected are joined with no delimiter, so
      // ("ab", "c") and ("a", "bc") share a cache key. Left unchanged here
      // because altering it would change the persisted key format.
      const answerText = `${predicted}${expected}`;
      const parts = buildParts(question, answerText);
      if (cache) {
        const cached = await lookupCached(parts, control);
        if (cached !== void 0) return cached;
      }
      counters.modelCalls += 1;
      if (!judge.scoreWithMetrics) {
        // Judge only offers a bare score: synthesize the metrics envelope,
        // measuring wall-clock latency ourselves. A judge with neither
        // method yields a score of 0.
        const scoreStartedAt = Date.now();
        const scoreValue = judge.score ? await judge.score(question, predicted, expected, control) : 0;
        const synthesized = {
          score: scoreValue,
          tokens: { input: 0, output: 0 },
          latencyMs: Date.now() - scoreStartedAt,
          model: keyExtras.judgeModelId ?? void 0
        };
        putSafely(parts, synthesized, control);
        return synthesized;
      }
      const fresh = await judge.scoreWithMetrics(
        question,
        predicted,
        expected,
        control
      );
      putSafely(parts, fresh, control);
      return fresh;
    }
  };
  // Only advertise scoreBinaryPrompt when the wrapped judge supports it, so
  // feature detection on the wrapper matches the underlying judge.
  if (typeof judge.scoreBinaryPrompt === "function") {
    Object.defineProperty(wrapper, "scoreBinaryPrompt", {
      configurable: true,
      enumerable: true,
      writable: false,
      value: async function scoreBinaryPrompt(prompt, control) {
        // Binary prompts are content-sensitive: two distinct prompts of
        // the same character length would collide on the previous
        // `binary:N` key, so key on a sha256 prefix of the prompt body.
        const parts = buildParts(
          `binary:${createHash6("sha256").update(prompt).digest("hex").slice(0, 16)}`,
          prompt
        );
        if (cache) {
          const cached = await lookupCached(parts, control);
          if (cached !== void 0) return cached;
        }
        counters.modelCalls += 1;
        const fresh = await judge.scoreBinaryPrompt(prompt, control);
        putSafely(parts, fresh, control);
        return fresh;
      }
    });
  }
  return wrapper;
}
|
|
10837
|
+
/**
 * Decode the raw text of a cache entry file.
 *
 * @param {string} raw - File contents expected to hold a JSON object.
 * @returns {object | undefined} The parsed envelope when it is a non-array
 *   object with string `storedAt`, string `key`, and a `verdict` accepted by
 *   `isBenchJudgeResult`; otherwise `undefined`. Never throws on bad JSON.
 */
function parseEnvelope(raw) {
  let decoded;
  try {
    decoded = JSON.parse(raw);
  } catch {
    return void 0;
  }
  const isRecord = decoded !== null && typeof decoded === "object" && !Array.isArray(decoded);
  if (!isRecord) {
    return void 0;
  }
  const wellFormed =
    typeof decoded.storedAt === "string" &&
    typeof decoded.key === "string" &&
    isBenchJudgeResult(decoded.verdict);
  return wellFormed ? decoded : void 0;
}
|
|
10853
|
+
/**
 * Structural check for a judge result.
 *
 * @param {unknown} value - Candidate to inspect.
 * @returns {boolean} True when `value` is a non-array object with a finite
 *   numeric `score`, a non-array object `tokens` holding finite numeric
 *   `input` and `output`, a finite numeric `latencyMs`, and a `model` that is
 *   either absent/undefined or a string.
 */
function isBenchJudgeResult(value) {
  const isRecord = (candidate) =>
    candidate !== null && typeof candidate === "object" && !Array.isArray(candidate);
  const isFiniteNumber = (candidate) =>
    typeof candidate === "number" && Number.isFinite(candidate);
  return (
    isRecord(value) &&
    isFiniteNumber(value.score) &&
    isRecord(value.tokens) &&
    isFiniteNumber(value.tokens.input) &&
    isFiniteNumber(value.tokens.output) &&
    isFiniteNumber(value.latencyMs) &&
    (value.model === void 0 || typeof value.model === "string")
  );
}
|
|
10566
10869
|
|
|
10567
10870
|
// src/benchmarks/published/ama-bench/runner.ts
|
|
10568
10871
|
import { randomUUID as randomUUID2 } from "crypto";
|
|
10569
|
-
import { readFile as
|
|
10570
|
-
import
|
|
10872
|
+
import { readFile as readFile10 } from "fs/promises";
|
|
10873
|
+
import path12 from "path";
|
|
10571
10874
|
|
|
10572
10875
|
// src/benchmarks/published/ama-bench/fixture.ts
|
|
10573
10876
|
var AMA_BENCH_SMOKE_FIXTURE = [
|
|
@@ -11142,10 +11445,10 @@ async function loadDataset(mode, datasetDir, limit) {
|
|
|
11142
11445
|
return episodes;
|
|
11143
11446
|
};
|
|
11144
11447
|
if (datasetDir) {
|
|
11145
|
-
const filePath =
|
|
11448
|
+
const filePath = path12.join(datasetDir, "open_end_qa_set.jsonl");
|
|
11146
11449
|
let raw;
|
|
11147
11450
|
try {
|
|
11148
|
-
raw = await
|
|
11451
|
+
raw = await readFile10(filePath, "utf8");
|
|
11149
11452
|
} catch (error) {
|
|
11150
11453
|
throw new Error(
|
|
11151
11454
|
`AMA-Bench dataset not found at ${filePath}: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -11437,8 +11740,8 @@ function isValidQaPairs(value) {
|
|
|
11437
11740
|
|
|
11438
11741
|
// src/benchmarks/published/amemgym/runner.ts
|
|
11439
11742
|
import { randomUUID as randomUUID3 } from "crypto";
|
|
11440
|
-
import { readFile as
|
|
11441
|
-
import
|
|
11743
|
+
import { readFile as readFile11 } from "fs/promises";
|
|
11744
|
+
import path13 from "path";
|
|
11442
11745
|
|
|
11443
11746
|
// src/benchmarks/published/amemgym/fixture.ts
|
|
11444
11747
|
var AMEMGYM_SMOKE_FIXTURE = [
|
|
@@ -11967,7 +12270,7 @@ async function loadDataset2(mode, datasetDir, limit) {
|
|
|
11967
12270
|
const datasetErrors = [];
|
|
11968
12271
|
for (const filename of DATASET_FILENAMES) {
|
|
11969
12272
|
try {
|
|
11970
|
-
const raw = await
|
|
12273
|
+
const raw = await readFile11(path13.join(datasetDir, filename), "utf8");
|
|
11971
12274
|
const parsed = parseDataset(raw, filename, normalizedLimit);
|
|
11972
12275
|
return ensureDatasetProfiles(parsed);
|
|
11973
12276
|
} catch (error) {
|
|
@@ -12141,8 +12444,8 @@ function normalizeRole(role) {
|
|
|
12141
12444
|
|
|
12142
12445
|
// src/benchmarks/published/memory-arena/runner.ts
|
|
12143
12446
|
import { randomUUID as randomUUID4 } from "crypto";
|
|
12144
|
-
import { readFile as
|
|
12145
|
-
import
|
|
12447
|
+
import { readFile as readFile12, readdir as readdir5, stat as stat3 } from "fs/promises";
|
|
12448
|
+
import path14 from "path";
|
|
12146
12449
|
import { expandTildePath as expandTildePath2 } from "@remnic/core";
|
|
12147
12450
|
|
|
12148
12451
|
// src/benchmarks/published/memory-arena/fixture.ts
|
|
@@ -12469,7 +12772,7 @@ async function loadDataset3(mode, datasetDir, limit) {
|
|
|
12469
12772
|
if (remainingLimit2 === 0) {
|
|
12470
12773
|
break;
|
|
12471
12774
|
}
|
|
12472
|
-
const raw = await
|
|
12775
|
+
const raw = await readFile12(path14.join(datasetDir, filename), "utf8");
|
|
12473
12776
|
const parsedTasks = [];
|
|
12474
12777
|
raw.split("\n").forEach((line, lineIndex) => {
|
|
12475
12778
|
if (line.trim().length === 0) {
|
|
@@ -12805,7 +13108,7 @@ async function loadMemoryArenaWebshopProductCatalog(datasetDir) {
|
|
|
12805
13108
|
`MemoryArena WebShop product sidecar is ${sourceStat.size} bytes; provide a compact JSON/JSONL sidecar smaller than ${MEMORY_ARENA_WEBSHOP_PRODUCTS_MAX_BYTES} bytes instead of the full WebShop catalog.`
|
|
12806
13109
|
);
|
|
12807
13110
|
}
|
|
12808
|
-
const raw = await
|
|
13111
|
+
const raw = await readFile12(sourcePath, "utf8");
|
|
12809
13112
|
const records = parseMemoryArenaWebshopSidecarRecords(raw, sourcePath);
|
|
12810
13113
|
const byAsin = /* @__PURE__ */ new Map();
|
|
12811
13114
|
for (const record of records) {
|
|
@@ -12825,14 +13128,14 @@ async function loadMemoryArenaWebshopProductCatalog(datasetDir) {
|
|
|
12825
13128
|
async function resolveMemoryArenaWebshopProductCatalogPath(datasetDir) {
|
|
12826
13129
|
const configuredPath = process.env[MEMORY_ARENA_WEBSHOP_PRODUCTS_ENV]?.trim();
|
|
12827
13130
|
if (configuredPath && configuredPath.length > 0) {
|
|
12828
|
-
return
|
|
13131
|
+
return path14.resolve(expandTildePath2(configuredPath));
|
|
12829
13132
|
}
|
|
12830
13133
|
if (datasetDir === void 0) {
|
|
12831
13134
|
return void 0;
|
|
12832
13135
|
}
|
|
12833
13136
|
const candidatePaths = [
|
|
12834
13137
|
...MEMORY_ARENA_WEBSHOP_PRODUCT_SIDECAR_FILENAMES
|
|
12835
|
-
].map((filename) =>
|
|
13138
|
+
].map((filename) => path14.join(datasetDir, filename));
|
|
12836
13139
|
for (const candidatePath of candidatePaths) {
|
|
12837
13140
|
try {
|
|
12838
13141
|
const candidateStat = await stat3(candidatePath);
|
|
@@ -14254,8 +14557,8 @@ function scoreSubtaskSuccess(scores) {
|
|
|
14254
14557
|
import { collectTemporalLexicalCues } from "@remnic/core";
|
|
14255
14558
|
|
|
14256
14559
|
// src/benchmarks/published/dataset-loader.ts
|
|
14257
|
-
import { readFile as
|
|
14258
|
-
import
|
|
14560
|
+
import { readFile as readFile13 } from "fs/promises";
|
|
14561
|
+
import path15 from "path";
|
|
14259
14562
|
|
|
14260
14563
|
// src/benchmarks/published/longmemeval/fixture.ts
|
|
14261
14564
|
var LONG_MEM_EVAL_SMOKE_FIXTURE = [
|
|
@@ -14358,10 +14661,10 @@ async function loadDataset4(options) {
|
|
|
14358
14661
|
const errors = [];
|
|
14359
14662
|
if (options.datasetDir) {
|
|
14360
14663
|
for (const filename of options.filenames) {
|
|
14361
|
-
const abs =
|
|
14664
|
+
const abs = path15.join(options.datasetDir, filename);
|
|
14362
14665
|
let raw;
|
|
14363
14666
|
try {
|
|
14364
|
-
raw = await
|
|
14667
|
+
raw = await readFile13(abs, "utf8");
|
|
14365
14668
|
} catch (error) {
|
|
14366
14669
|
errors.push(
|
|
14367
14670
|
`${filename}: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -16098,7 +16401,7 @@ function normalizeQaArray(value, location) {
|
|
|
16098
16401
|
import { randomUUID as randomUUID6 } from "crypto";
|
|
16099
16402
|
import { createReadStream as createReadStream2 } from "fs";
|
|
16100
16403
|
import { readdir as readdir6 } from "fs/promises";
|
|
16101
|
-
import
|
|
16404
|
+
import path16 from "path";
|
|
16102
16405
|
import { createInterface } from "readline/promises";
|
|
16103
16406
|
import {
|
|
16104
16407
|
asyncBufferFromFile,
|
|
@@ -16569,8 +16872,8 @@ async function listBeamDatasetFiles(datasetDir) {
|
|
|
16569
16872
|
return directFiles;
|
|
16570
16873
|
}
|
|
16571
16874
|
try {
|
|
16572
|
-
const nestedFilenames = await readdir6(
|
|
16573
|
-
return nestedFilenames.filter((filename) => isBeamDatasetFilename(filename)).map((filename) =>
|
|
16875
|
+
const nestedFilenames = await readdir6(path16.join(datasetDir, "data"));
|
|
16876
|
+
return nestedFilenames.filter((filename) => isBeamDatasetFilename(filename)).map((filename) => path16.join("data", filename));
|
|
16574
16877
|
} catch {
|
|
16575
16878
|
return [];
|
|
16576
16879
|
}
|
|
@@ -16597,7 +16900,7 @@ async function* iterateDatasetFiles(datasetDir, datasetFiles, limit) {
|
|
|
16597
16900
|
let remainingLimit = limit;
|
|
16598
16901
|
for (const filename of datasetFiles) {
|
|
16599
16902
|
const scale = inferScaleFromFilename(filename);
|
|
16600
|
-
const filePath =
|
|
16903
|
+
const filePath = path16.join(datasetDir, filename);
|
|
16601
16904
|
const conversations = filename.endsWith(".jsonl") ? streamJsonlDataset(filePath, filename, remainingLimit) : filename.endsWith(".parquet") ? streamParquetDataset(filePath, filename, remainingLimit) : streamJsonDataset(filePath, filename, remainingLimit);
|
|
16602
16905
|
for await (const conversation of conversations) {
|
|
16603
16906
|
yield {
|
|
@@ -17608,9 +17911,9 @@ var StructuredLiteralParser = class {
|
|
|
17608
17911
|
};
|
|
17609
17912
|
|
|
17610
17913
|
// src/benchmarks/published/personamem/runner.ts
|
|
17611
|
-
import { createHash as
|
|
17612
|
-
import { readFile as
|
|
17613
|
-
import
|
|
17914
|
+
import { createHash as createHash7, randomUUID as randomUUID7 } from "crypto";
|
|
17915
|
+
import { readFile as readFile14, realpath as realpath4 } from "fs/promises";
|
|
17916
|
+
import path17 from "path";
|
|
17614
17917
|
|
|
17615
17918
|
// src/benchmarks/published/personamem/fixture.ts
|
|
17616
17919
|
var PERSONAMEM_SMOKE_FIXTURE = [
|
|
@@ -17886,10 +18189,10 @@ async function loadDataset8(mode, datasetDir, limit) {
|
|
|
17886
18189
|
if (datasetDir) {
|
|
17887
18190
|
const datasetErrors = [];
|
|
17888
18191
|
for (const relativePath of DATASET_FILE_CANDIDATES) {
|
|
17889
|
-
const datasetPath =
|
|
18192
|
+
const datasetPath = path17.join(datasetDir, relativePath);
|
|
17890
18193
|
let raw;
|
|
17891
18194
|
try {
|
|
17892
|
-
raw = await
|
|
18195
|
+
raw = await readFile14(datasetPath, "utf8");
|
|
17893
18196
|
} catch (error) {
|
|
17894
18197
|
datasetErrors.push(
|
|
17895
18198
|
`${relativePath}: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -17947,7 +18250,7 @@ async function hydrateSample(row, datasetRoot) {
|
|
|
17947
18250
|
datasetRoot,
|
|
17948
18251
|
row.chat_history_32k_link
|
|
17949
18252
|
);
|
|
17950
|
-
const chatHistoryRaw = await
|
|
18253
|
+
const chatHistoryRaw = await readFile14(chatHistoryPath, "utf8");
|
|
17951
18254
|
const chatHistory = parseChatHistory(
|
|
17952
18255
|
chatHistoryRaw,
|
|
17953
18256
|
row.chat_history_32k_link
|
|
@@ -18080,12 +18383,12 @@ function parseCsv(raw, limit) {
|
|
|
18080
18383
|
return rows;
|
|
18081
18384
|
}
|
|
18082
18385
|
async function resolveDatasetFilePath(datasetRoot, relativePath) {
|
|
18083
|
-
const rootPath =
|
|
18386
|
+
const rootPath = path17.resolve(datasetRoot);
|
|
18084
18387
|
const rootRealPath = await realpath4(rootPath);
|
|
18085
|
-
const candidatePath =
|
|
18388
|
+
const candidatePath = path17.resolve(rootPath, relativePath);
|
|
18086
18389
|
const candidateRealPath = await realpath4(candidatePath);
|
|
18087
|
-
const relativeToRoot =
|
|
18088
|
-
if (relativeToRoot.startsWith("..") ||
|
|
18390
|
+
const relativeToRoot = path17.relative(rootRealPath, candidateRealPath);
|
|
18391
|
+
if (relativeToRoot.startsWith("..") || path17.isAbsolute(relativeToRoot)) {
|
|
18089
18392
|
throw new Error(
|
|
18090
18393
|
`PersonaMem-v2 dataset file reference "${relativePath}" must stay within datasetDir.`
|
|
18091
18394
|
);
|
|
@@ -18213,7 +18516,7 @@ function buildMcqPrompt(sample, seed) {
|
|
|
18213
18516
|
function deterministicShuffle(values, seedMaterial) {
|
|
18214
18517
|
return values.map((value, index) => ({
|
|
18215
18518
|
value,
|
|
18216
|
-
key:
|
|
18519
|
+
key: createHash7("sha256").update(`${seedMaterial}:${index}:${value}`).digest("hex"),
|
|
18217
18520
|
index
|
|
18218
18521
|
})).sort((left, right) => {
|
|
18219
18522
|
const byKey = left.key.localeCompare(right.key);
|
|
@@ -18413,8 +18716,8 @@ function applyLimit6(items, limit) {
|
|
|
18413
18716
|
|
|
18414
18717
|
// src/benchmarks/published/membench/runner.ts
|
|
18415
18718
|
import { randomUUID as randomUUID8 } from "crypto";
|
|
18416
|
-
import { readFile as
|
|
18417
|
-
import
|
|
18719
|
+
import { readFile as readFile15, readdir as readdir7 } from "fs/promises";
|
|
18720
|
+
import path18 from "path";
|
|
18418
18721
|
|
|
18419
18722
|
// src/benchmarks/published/membench/fixture.ts
|
|
18420
18723
|
var MEMBENCH_SMOKE_FIXTURE = [
|
|
@@ -18675,7 +18978,7 @@ async function loadDataset9(mode, datasetDir, limit) {
|
|
|
18675
18978
|
let remainingLimit = normalizedLimit;
|
|
18676
18979
|
for (const filename of filenames) {
|
|
18677
18980
|
try {
|
|
18678
|
-
const raw = await
|
|
18981
|
+
const raw = await readFile15(path18.join(datasetDir, filename), "utf8");
|
|
18679
18982
|
const parsed = filename.endsWith(".jsonl") ? parseJsonlDataset(raw, filename) : parseJsonDataset(raw, filename);
|
|
18680
18983
|
const limitedCases = remainingLimit === 0 ? [] : applyLimit7(parsed, remainingLimit);
|
|
18681
18984
|
if (limitedCases.length > 0) {
|
|
@@ -19542,8 +19845,8 @@ function isPlainObject2(value) {
|
|
|
19542
19845
|
|
|
19543
19846
|
// src/benchmarks/published/memoryagentbench/runner.ts
|
|
19544
19847
|
import { randomUUID as randomUUID9 } from "crypto";
|
|
19545
|
-
import { access, readFile as
|
|
19546
|
-
import
|
|
19848
|
+
import { access, readFile as readFile16 } from "fs/promises";
|
|
19849
|
+
import path19 from "path";
|
|
19547
19850
|
|
|
19548
19851
|
// src/benchmarks/published/memoryagentbench/fixture.ts
|
|
19549
19852
|
var MEMORY_AGENT_BENCH_SMOKE_FIXTURE = [
|
|
@@ -20565,7 +20868,7 @@ async function loadRecSysEntityMapping(datasetDir) {
|
|
|
20565
20868
|
}
|
|
20566
20869
|
let parsed;
|
|
20567
20870
|
try {
|
|
20568
|
-
parsed = JSON.parse(await
|
|
20871
|
+
parsed = JSON.parse(await readFile16(candidate, "utf8"));
|
|
20569
20872
|
} catch (error) {
|
|
20570
20873
|
console.error(
|
|
20571
20874
|
` [WARN] MemoryAgentBench ReDial entity mapping ${candidate} is invalid JSON; trying the next candidate: ${error instanceof Error ? error.message : String(error)}`
|
|
@@ -20622,21 +20925,21 @@ function recsysEntityMappingCandidates(datasetDir) {
|
|
|
20622
20925
|
if (!datasetDir) {
|
|
20623
20926
|
return [];
|
|
20624
20927
|
}
|
|
20625
|
-
const absoluteDatasetDir =
|
|
20928
|
+
const absoluteDatasetDir = path19.resolve(datasetDir);
|
|
20626
20929
|
const roots = [
|
|
20627
20930
|
absoluteDatasetDir,
|
|
20628
|
-
|
|
20931
|
+
path19.dirname(absoluteDatasetDir)
|
|
20629
20932
|
];
|
|
20630
20933
|
const canonicalSuffixes = [
|
|
20631
|
-
|
|
20632
|
-
|
|
20934
|
+
path19.join("processed_data", "Recsys_Redial", "entity2id.json"),
|
|
20935
|
+
path19.join("Recsys_Redial", "entity2id.json")
|
|
20633
20936
|
];
|
|
20634
20937
|
const looseSuffixes = ["entity2id.json"];
|
|
20635
20938
|
return [
|
|
20636
20939
|
...roots.flatMap(
|
|
20637
|
-
(root) => canonicalSuffixes.map((suffix) =>
|
|
20940
|
+
(root) => canonicalSuffixes.map((suffix) => path19.join(root, suffix))
|
|
20638
20941
|
),
|
|
20639
|
-
...looseSuffixes.map((suffix) =>
|
|
20942
|
+
...looseSuffixes.map((suffix) => path19.join(absoluteDatasetDir, suffix))
|
|
20640
20943
|
];
|
|
20641
20944
|
}
|
|
20642
20945
|
async function fileExists(filePath) {
|
|
@@ -20673,7 +20976,7 @@ async function loadDataset10(mode, datasetDir, limit) {
|
|
|
20673
20976
|
const datasetErrors = [];
|
|
20674
20977
|
for (const filename of DATASET_BUNDLE_CANDIDATES) {
|
|
20675
20978
|
const parsed = await tryReadDatasetFile(
|
|
20676
|
-
|
|
20979
|
+
path19.join(datasetDir, filename),
|
|
20677
20980
|
filename,
|
|
20678
20981
|
datasetErrors
|
|
20679
20982
|
);
|
|
@@ -20690,7 +20993,7 @@ async function loadDataset10(mode, datasetDir, limit) {
|
|
|
20690
20993
|
let splitData;
|
|
20691
20994
|
for (const filename of splitConfig.candidates) {
|
|
20692
20995
|
try {
|
|
20693
|
-
splitData = await readDatasetFile(
|
|
20996
|
+
splitData = await readDatasetFile(path19.join(datasetDir, filename), filename);
|
|
20694
20997
|
break;
|
|
20695
20998
|
} catch (error) {
|
|
20696
20999
|
if (!isFileNotFoundError2(error)) {
|
|
@@ -20728,7 +21031,7 @@ async function loadDataset10(mode, datasetDir, limit) {
|
|
|
20728
21031
|
return ensureDatasetItems(applyLimit8(MEMORY_AGENT_BENCH_SMOKE_FIXTURE, normalizedLimit));
|
|
20729
21032
|
}
|
|
20730
21033
|
async function readDatasetFile(filePath, filename) {
|
|
20731
|
-
const raw = await
|
|
21034
|
+
const raw = await readFile16(filePath, "utf8");
|
|
20732
21035
|
const parsed = filename.endsWith(".jsonl") ? parseJsonLines(raw, filename) : parseJsonArray(raw, filename);
|
|
20733
21036
|
return parsed.map(
|
|
20734
21037
|
(item, index) => parseMemoryAgentBenchItem(item, `${filename} item ${index + 1}`)
|
|
@@ -21339,7 +21642,7 @@ function loadCases(mode, limit) {
|
|
|
21339
21642
|
// src/benchmarks/remnic/extraction-judge-calibration/runner.ts
|
|
21340
21643
|
import { randomUUID as randomUUID11 } from "crypto";
|
|
21341
21644
|
import os4 from "os";
|
|
21342
|
-
import
|
|
21645
|
+
import path20 from "path";
|
|
21343
21646
|
import {
|
|
21344
21647
|
createVerdictCache,
|
|
21345
21648
|
judgeFactDurability,
|
|
@@ -21449,8 +21752,8 @@ var extractionJudgeCalibrationDefinition = {
|
|
|
21449
21752
|
async function runExtractionJudgeCalibrationBenchmark(options) {
|
|
21450
21753
|
const cases = loadCases2(options.mode, options.limit);
|
|
21451
21754
|
const config = parseConfig2({
|
|
21452
|
-
memoryDir:
|
|
21453
|
-
workspaceDir:
|
|
21755
|
+
memoryDir: path20.join(os4.tmpdir(), "remnic-bench-extraction-judge"),
|
|
21756
|
+
workspaceDir: path20.join(os4.tmpdir(), "remnic-bench-extraction-judge-workspace"),
|
|
21454
21757
|
openaiApiKey: "bench-test-key",
|
|
21455
21758
|
extractionJudgeEnabled: true,
|
|
21456
21759
|
extractionJudgeBatchSize: 4,
|
|
@@ -21999,9 +22302,9 @@ function constantAggregate2(value) {
|
|
|
21999
22302
|
|
|
22000
22303
|
// src/benchmarks/remnic/entity-consolidation/runner.ts
|
|
22001
22304
|
import os5 from "os";
|
|
22002
|
-
import
|
|
22305
|
+
import path21 from "path";
|
|
22003
22306
|
import { randomUUID as randomUUID13 } from "crypto";
|
|
22004
|
-
import { mkdtemp as mkdtemp3, rm as
|
|
22307
|
+
import { mkdtemp as mkdtemp3, rm as rm4 } from "fs/promises";
|
|
22005
22308
|
import { StorageManager } from "@remnic/core";
|
|
22006
22309
|
|
|
22007
22310
|
// src/benchmarks/remnic/entity-consolidation/fixture.ts
|
|
@@ -22162,7 +22465,7 @@ function loadCases4(mode, limit) {
|
|
|
22162
22465
|
return limited;
|
|
22163
22466
|
}
|
|
22164
22467
|
async function executeCase(sample) {
|
|
22165
|
-
const tmpDir = await mkdtemp3(
|
|
22468
|
+
const tmpDir = await mkdtemp3(path21.join(os5.tmpdir(), "remnic-bench-entity-consolidation-"));
|
|
22166
22469
|
try {
|
|
22167
22470
|
const storage = new StorageManager(tmpDir);
|
|
22168
22471
|
await storage.ensureDirectories();
|
|
@@ -22170,7 +22473,7 @@ async function executeCase(sample) {
|
|
|
22170
22473
|
const rawEntity = await storage.readEntity(canonicalName);
|
|
22171
22474
|
return summarizeEntity(rawEntity, canonicalName);
|
|
22172
22475
|
} finally {
|
|
22173
|
-
await
|
|
22476
|
+
await rm4(tmpDir, { recursive: true, force: true });
|
|
22174
22477
|
}
|
|
22175
22478
|
}
|
|
22176
22479
|
async function applyScenario(storage, sample) {
|
|
@@ -22341,9 +22644,9 @@ function parseNonNegativeInt(rawValue) {
|
|
|
22341
22644
|
|
|
22342
22645
|
// src/benchmarks/remnic/page-versioning/runner.ts
|
|
22343
22646
|
import { randomUUID as randomUUID14 } from "crypto";
|
|
22344
|
-
import { mkdir as
|
|
22647
|
+
import { mkdir as mkdir9, mkdtemp as mkdtemp4, readFile as readFile17, rm as rm5, writeFile as writeFile9 } from "fs/promises";
|
|
22345
22648
|
import os6 from "os";
|
|
22346
|
-
import
|
|
22649
|
+
import path22 from "path";
|
|
22347
22650
|
import {
|
|
22348
22651
|
createVersion,
|
|
22349
22652
|
diffVersions,
|
|
@@ -22507,21 +22810,21 @@ function loadCases5(mode, limit) {
|
|
|
22507
22810
|
return limited;
|
|
22508
22811
|
}
|
|
22509
22812
|
async function executeCase2(sample, dependencies) {
|
|
22510
|
-
const tmpDir = await mkdtemp4(
|
|
22813
|
+
const tmpDir = await mkdtemp4(path22.join(os6.tmpdir(), "remnic-bench-page-versioning-"));
|
|
22511
22814
|
try {
|
|
22512
|
-
const factsDir =
|
|
22513
|
-
const pagePath =
|
|
22514
|
-
await
|
|
22815
|
+
const factsDir = path22.join(tmpDir, "facts");
|
|
22816
|
+
const pagePath = path22.join(factsDir, `${sample.id}.md`);
|
|
22817
|
+
await mkdir9(factsDir, { recursive: true });
|
|
22515
22818
|
const config = versioningConfig();
|
|
22516
22819
|
switch (sample.scenario) {
|
|
22517
22820
|
case "revert-flow": {
|
|
22518
|
-
await
|
|
22821
|
+
await writeFile9(pagePath, "original content", "utf-8");
|
|
22519
22822
|
await dependencies.createVersion(pagePath, "original content", "write", config, void 0, void 0, tmpDir);
|
|
22520
|
-
await
|
|
22823
|
+
await writeFile9(pagePath, "modified content", "utf-8");
|
|
22521
22824
|
await dependencies.createVersion(pagePath, "modified content", "write", config, void 0, void 0, tmpDir);
|
|
22522
22825
|
await dependencies.revertToVersion(pagePath, "1", config, void 0, tmpDir);
|
|
22523
22826
|
const history = await dependencies.listVersions(pagePath, config, tmpDir);
|
|
22524
|
-
const pageContent = await
|
|
22827
|
+
const pageContent = await readFile17(pagePath, "utf-8");
|
|
22525
22828
|
const observed = await dependencies.getVersion(pagePath, "3", config, tmpDir);
|
|
22526
22829
|
return {
|
|
22527
22830
|
versionIds: history.versions.map((version) => version.versionId),
|
|
@@ -22534,11 +22837,11 @@ async function executeCase2(sample, dependencies) {
|
|
|
22534
22837
|
const pruningConfig = versioningConfig({ maxVersionsPerPage: 2 });
|
|
22535
22838
|
for (let index = 1; index <= 4; index += 1) {
|
|
22536
22839
|
const content = `content v${index}`;
|
|
22537
|
-
await
|
|
22840
|
+
await writeFile9(pagePath, content, "utf-8");
|
|
22538
22841
|
await dependencies.createVersion(pagePath, content, "write", pruningConfig, void 0, void 0, tmpDir);
|
|
22539
22842
|
}
|
|
22540
22843
|
const history = await dependencies.listVersions(pagePath, pruningConfig, tmpDir);
|
|
22541
|
-
const pageContent = await
|
|
22844
|
+
const pageContent = await readFile17(pagePath, "utf-8");
|
|
22542
22845
|
const prunedIds = [];
|
|
22543
22846
|
for (const versionId of ["1", "2"]) {
|
|
22544
22847
|
try {
|
|
@@ -22558,7 +22861,7 @@ async function executeCase2(sample, dependencies) {
|
|
|
22558
22861
|
};
|
|
22559
22862
|
}
|
|
22560
22863
|
case "diff-output": {
|
|
22561
|
-
await
|
|
22864
|
+
await writeFile9(pagePath, "line 1\nline 2\nline 3", "utf-8");
|
|
22562
22865
|
await dependencies.createVersion(
|
|
22563
22866
|
pagePath,
|
|
22564
22867
|
"line 1\nline 2\nline 3",
|
|
@@ -22568,7 +22871,7 @@ async function executeCase2(sample, dependencies) {
|
|
|
22568
22871
|
void 0,
|
|
22569
22872
|
tmpDir
|
|
22570
22873
|
);
|
|
22571
|
-
await
|
|
22874
|
+
await writeFile9(pagePath, "line 1\nline 2 changed\nline 3\nline 4", "utf-8");
|
|
22572
22875
|
await dependencies.createVersion(
|
|
22573
22876
|
pagePath,
|
|
22574
22877
|
"line 1\nline 2 changed\nline 3\nline 4",
|
|
@@ -22579,7 +22882,7 @@ async function executeCase2(sample, dependencies) {
|
|
|
22579
22882
|
tmpDir
|
|
22580
22883
|
);
|
|
22581
22884
|
const history = await dependencies.listVersions(pagePath, config, tmpDir);
|
|
22582
|
-
const pageContent = await
|
|
22885
|
+
const pageContent = await readFile17(pagePath, "utf-8");
|
|
22583
22886
|
const diff = await dependencies.diffVersions(pagePath, "1", "2", config, tmpDir);
|
|
22584
22887
|
const observedLines = normalizeDiffChangedLines(diff);
|
|
22585
22888
|
return {
|
|
@@ -22591,7 +22894,7 @@ async function executeCase2(sample, dependencies) {
|
|
|
22591
22894
|
}
|
|
22592
22895
|
}
|
|
22593
22896
|
} finally {
|
|
22594
|
-
await
|
|
22897
|
+
await rm5(tmpDir, { recursive: true, force: true });
|
|
22595
22898
|
}
|
|
22596
22899
|
}
|
|
22597
22900
|
function isMissingPageVersionError(error, pagePath, versionId) {
|
|
@@ -24864,9 +25167,9 @@ function loadCases9(mode, limit) {
|
|
|
24864
25167
|
|
|
24865
25168
|
// src/benchmarks/remnic/procedural-recall/runner.ts
|
|
24866
25169
|
import { randomUUID as randomUUID21 } from "crypto";
|
|
24867
|
-
import { mkdtemp as mkdtemp5, rm as
|
|
25170
|
+
import { mkdtemp as mkdtemp5, rm as rm6 } from "fs/promises";
|
|
24868
25171
|
import os7 from "os";
|
|
24869
|
-
import
|
|
25172
|
+
import path23 from "path";
|
|
24870
25173
|
import {
|
|
24871
25174
|
StorageManager as StorageManager2,
|
|
24872
25175
|
parseConfig as parseConfig3,
|
|
@@ -24996,7 +25299,7 @@ async function runProceduralRecallBenchmark(options) {
|
|
|
24996
25299
|
}
|
|
24997
25300
|
for (const sample of e2eCases) {
|
|
24998
25301
|
const startedAt = performance.now();
|
|
24999
|
-
const dir = await mkdtemp5(
|
|
25302
|
+
const dir = await mkdtemp5(path23.join(os7.tmpdir(), "remnic-bench-procedural-recall-"));
|
|
25000
25303
|
let section = null;
|
|
25001
25304
|
try {
|
|
25002
25305
|
const storage = new StorageManager2(dir);
|
|
@@ -25011,7 +25314,7 @@ ${body}`,
|
|
|
25011
25314
|
);
|
|
25012
25315
|
const config = parseConfig3({
|
|
25013
25316
|
memoryDir: dir,
|
|
25014
|
-
workspaceDir:
|
|
25317
|
+
workspaceDir: path23.join(dir, "ws"),
|
|
25015
25318
|
openaiApiKey: "bench-key",
|
|
25016
25319
|
procedural: {
|
|
25017
25320
|
enabled: sample.proceduralEnabled !== false,
|
|
@@ -25020,7 +25323,7 @@ ${body}`,
|
|
|
25020
25323
|
});
|
|
25021
25324
|
section = await buildProcedureRecallSection(storage, sample.prompt, config);
|
|
25022
25325
|
} finally {
|
|
25023
|
-
await
|
|
25326
|
+
await rm6(dir, { recursive: true, force: true });
|
|
25024
25327
|
}
|
|
25025
25328
|
const latencyMs = Math.round(performance.now() - startedAt);
|
|
25026
25329
|
const nonNull = section !== null && section.length > 0;
|
|
@@ -25081,9 +25384,9 @@ ${body}`,
|
|
|
25081
25384
|
|
|
25082
25385
|
// src/benchmarks/remnic/ingestion-entity-recall/runner.ts
|
|
25083
25386
|
import { randomUUID as randomUUID22 } from "crypto";
|
|
25084
|
-
import { mkdtemp as mkdtemp6, writeFile as
|
|
25387
|
+
import { mkdtemp as mkdtemp6, writeFile as writeFile10, rm as rm7, mkdir as mkdir10, realpath as realpath5 } from "fs/promises";
|
|
25085
25388
|
import { tmpdir as tmpdir2 } from "os";
|
|
25086
|
-
import
|
|
25389
|
+
import path24 from "path";
|
|
25087
25390
|
|
|
25088
25391
|
// src/ingestion-scorer.ts
|
|
25089
25392
|
function normalize(value) {
|
|
@@ -25585,13 +25888,13 @@ async function runIngestionEntityRecallBenchmark(options) {
|
|
|
25585
25888
|
throw new Error("ingestionAdapter is required for ingestion benchmarks");
|
|
25586
25889
|
}
|
|
25587
25890
|
const fixture = emailFixture.generate();
|
|
25588
|
-
const fixtureDir = await mkdtemp6(
|
|
25891
|
+
const fixtureDir = await mkdtemp6(path24.join(tmpdir2(), "bench-email-"));
|
|
25589
25892
|
try {
|
|
25590
25893
|
await options.ingestionAdapter.reset();
|
|
25591
25894
|
for (const file of fixture.files) {
|
|
25592
|
-
const filePath =
|
|
25593
|
-
await
|
|
25594
|
-
await
|
|
25895
|
+
const filePath = path24.join(fixtureDir, file.relativePath);
|
|
25896
|
+
await mkdir10(path24.dirname(filePath), { recursive: true });
|
|
25897
|
+
await writeFile10(filePath, file.content, "utf8");
|
|
25595
25898
|
}
|
|
25596
25899
|
const { result: ingestionLog, durationMs } = await timed(
|
|
25597
25900
|
async () => options.ingestionAdapter.ingest(await realpath5(fixtureDir))
|
|
@@ -25672,7 +25975,7 @@ async function runIngestionEntityRecallBenchmark(options) {
|
|
|
25672
25975
|
];
|
|
25673
25976
|
return buildResult(options, tasks, durationMs);
|
|
25674
25977
|
} finally {
|
|
25675
|
-
await
|
|
25978
|
+
await rm7(fixtureDir, { recursive: true, force: true });
|
|
25676
25979
|
}
|
|
25677
25980
|
}
|
|
25678
25981
|
async function buildResult(options, tasks, totalLatencyMs) {
|
|
@@ -25718,9 +26021,9 @@ async function buildResult(options, tasks, totalLatencyMs) {
|
|
|
25718
26021
|
|
|
25719
26022
|
// src/benchmarks/remnic/ingestion-schema-completeness/runner.ts
|
|
25720
26023
|
import { randomUUID as randomUUID23 } from "crypto";
|
|
25721
|
-
import { mkdtemp as mkdtemp7, writeFile as
|
|
26024
|
+
import { mkdtemp as mkdtemp7, writeFile as writeFile11, rm as rm8, mkdir as mkdir11, realpath as realpath6 } from "fs/promises";
|
|
25722
26025
|
import { tmpdir as tmpdir3 } from "os";
|
|
25723
|
-
import
|
|
26026
|
+
import path25 from "path";
|
|
25724
26027
|
var ingestionSchemaCompletenessDefinition = {
|
|
25725
26028
|
id: "ingestion-schema-completeness",
|
|
25726
26029
|
title: "Ingestion: Schema Completeness",
|
|
@@ -25739,13 +26042,13 @@ async function runIngestionSchemaCompletenessBenchmark(options) {
|
|
|
25739
26042
|
throw new Error("ingestionAdapter is required for ingestion benchmarks");
|
|
25740
26043
|
}
|
|
25741
26044
|
const fixture = emailFixture.generate();
|
|
25742
|
-
const fixtureDir = await mkdtemp7(
|
|
26045
|
+
const fixtureDir = await mkdtemp7(path25.join(tmpdir3(), "bench-email-"));
|
|
25743
26046
|
try {
|
|
25744
26047
|
await options.ingestionAdapter.reset();
|
|
25745
26048
|
for (const file of fixture.files) {
|
|
25746
|
-
const filePath =
|
|
25747
|
-
await
|
|
25748
|
-
await
|
|
26049
|
+
const filePath = path25.join(fixtureDir, file.relativePath);
|
|
26050
|
+
await mkdir11(path25.dirname(filePath), { recursive: true });
|
|
26051
|
+
await writeFile11(filePath, file.content, "utf8");
|
|
25749
26052
|
}
|
|
25750
26053
|
const { result: ingestionLog, durationMs } = await timed(
|
|
25751
26054
|
async () => options.ingestionAdapter.ingest(await realpath6(fixtureDir))
|
|
@@ -25885,15 +26188,15 @@ async function runIngestionSchemaCompletenessBenchmark(options) {
|
|
|
25885
26188
|
}
|
|
25886
26189
|
};
|
|
25887
26190
|
} finally {
|
|
25888
|
-
await
|
|
26191
|
+
await rm8(fixtureDir, { recursive: true, force: true });
|
|
25889
26192
|
}
|
|
25890
26193
|
}
|
|
25891
26194
|
|
|
25892
26195
|
// src/benchmarks/remnic/ingestion-backlink-f1/runner.ts
|
|
25893
26196
|
import { randomUUID as randomUUID24 } from "crypto";
|
|
25894
|
-
import { mkdtemp as mkdtemp8, writeFile as
|
|
26197
|
+
import { mkdtemp as mkdtemp8, writeFile as writeFile12, rm as rm9, mkdir as mkdir12, realpath as realpath7 } from "fs/promises";
|
|
25895
26198
|
import { tmpdir as tmpdir4 } from "os";
|
|
25896
|
-
import
|
|
26199
|
+
import path26 from "path";
|
|
25897
26200
|
var ingestionBacklinkF1Definition = {
|
|
25898
26201
|
id: "ingestion-backlink-f1",
|
|
25899
26202
|
title: "Ingestion: Backlink F1",
|
|
@@ -25912,13 +26215,13 @@ async function runIngestionBacklinkF1Benchmark(options) {
|
|
|
25912
26215
|
throw new Error("ingestionAdapter is required for ingestion benchmarks");
|
|
25913
26216
|
}
|
|
25914
26217
|
const fixture = emailFixture.generate();
|
|
25915
|
-
const fixtureDir = await mkdtemp8(
|
|
26218
|
+
const fixtureDir = await mkdtemp8(path26.join(tmpdir4(), "bench-email-"));
|
|
25916
26219
|
try {
|
|
25917
26220
|
await options.ingestionAdapter.reset();
|
|
25918
26221
|
for (const file of fixture.files) {
|
|
25919
|
-
const filePath =
|
|
25920
|
-
await
|
|
25921
|
-
await
|
|
26222
|
+
const filePath = path26.join(fixtureDir, file.relativePath);
|
|
26223
|
+
await mkdir12(path26.dirname(filePath), { recursive: true });
|
|
26224
|
+
await writeFile12(filePath, file.content, "utf8");
|
|
25922
26225
|
}
|
|
25923
26226
|
const { result: ingestionLog, durationMs } = await timed(
|
|
25924
26227
|
async () => options.ingestionAdapter.ingest(await realpath7(fixtureDir))
|
|
@@ -25986,15 +26289,15 @@ async function runIngestionBacklinkF1Benchmark(options) {
|
|
|
25986
26289
|
}
|
|
25987
26290
|
};
|
|
25988
26291
|
} finally {
|
|
25989
|
-
await
|
|
26292
|
+
await rm9(fixtureDir, { recursive: true, force: true });
|
|
25990
26293
|
}
|
|
25991
26294
|
}
|
|
25992
26295
|
|
|
25993
26296
|
// src/benchmarks/remnic/ingestion-setup-friction/runner.ts
|
|
25994
26297
|
import { randomUUID as randomUUID25 } from "crypto";
|
|
25995
|
-
import { mkdtemp as mkdtemp9, writeFile as
|
|
26298
|
+
import { mkdtemp as mkdtemp9, writeFile as writeFile13, rm as rm10, mkdir as mkdir13, realpath as realpath8 } from "fs/promises";
|
|
25996
26299
|
import { tmpdir as tmpdir5 } from "os";
|
|
25997
|
-
import
|
|
26300
|
+
import path27 from "path";
|
|
25998
26301
|
var INGESTION_SETUP_FRICTION_LOWER_IS_BETTER = /* @__PURE__ */ new Set(["setup_friction", "commands_count", "prompts_count", "errors_count"]);
|
|
25999
26302
|
var ingestionSetupFrictionDefinition = {
|
|
26000
26303
|
id: "ingestion-setup-friction",
|
|
@@ -26014,13 +26317,13 @@ async function runIngestionSetupFrictionBenchmark(options) {
|
|
|
26014
26317
|
throw new Error("ingestionAdapter is required for ingestion benchmarks");
|
|
26015
26318
|
}
|
|
26016
26319
|
const fixture = emailFixture.generate();
|
|
26017
|
-
const fixtureDir = await mkdtemp9(
|
|
26320
|
+
const fixtureDir = await mkdtemp9(path27.join(tmpdir5(), "bench-friction-"));
|
|
26018
26321
|
try {
|
|
26019
26322
|
await options.ingestionAdapter.reset();
|
|
26020
26323
|
for (const file of fixture.files) {
|
|
26021
|
-
const filePath =
|
|
26022
|
-
await
|
|
26023
|
-
await
|
|
26324
|
+
const filePath = path27.join(fixtureDir, file.relativePath);
|
|
26325
|
+
await mkdir13(path27.dirname(filePath), { recursive: true });
|
|
26326
|
+
await writeFile13(filePath, file.content, "utf8");
|
|
26024
26327
|
}
|
|
26025
26328
|
const { result: ingestionLog, durationMs } = await timed(
|
|
26026
26329
|
async () => options.ingestionAdapter.ingest(await realpath8(fixtureDir))
|
|
@@ -26092,15 +26395,15 @@ async function runIngestionSetupFrictionBenchmark(options) {
|
|
|
26092
26395
|
}
|
|
26093
26396
|
};
|
|
26094
26397
|
} finally {
|
|
26095
|
-
await
|
|
26398
|
+
await rm10(fixtureDir, { recursive: true, force: true });
|
|
26096
26399
|
}
|
|
26097
26400
|
}
|
|
26098
26401
|
|
|
26099
26402
|
// src/benchmarks/remnic/ingestion-citation-accuracy/runner.ts
|
|
26100
26403
|
import { randomUUID as randomUUID26 } from "crypto";
|
|
26101
|
-
import { mkdtemp as mkdtemp10, writeFile as
|
|
26404
|
+
import { mkdtemp as mkdtemp10, writeFile as writeFile14, rm as rm11, mkdir as mkdir14, realpath as realpath9 } from "fs/promises";
|
|
26102
26405
|
import { tmpdir as tmpdir6 } from "os";
|
|
26103
|
-
import
|
|
26406
|
+
import path28 from "path";
|
|
26104
26407
|
var CITATION_SUPPORT_THRESHOLD = 0.72;
|
|
26105
26408
|
var ingestionCitationAccuracyDefinition = {
|
|
26106
26409
|
id: "ingestion-citation-accuracy",
|
|
@@ -26159,10 +26462,10 @@ function resolveCitedSources(sourceRefs, seeAlso, pageRef, sourceContentMap) {
|
|
|
26159
26462
|
return "";
|
|
26160
26463
|
}
|
|
26161
26464
|
for (const ref of normalizedRefs) {
|
|
26162
|
-
const refBase =
|
|
26465
|
+
const refBase = path28.basename(ref).toLowerCase();
|
|
26163
26466
|
let matched = false;
|
|
26164
26467
|
for (const [relativePath, content] of sourceContentMap) {
|
|
26165
|
-
if (relativePath === ref || relativePath.endsWith(ref) ||
|
|
26468
|
+
if (relativePath === ref || relativePath.endsWith(ref) || path28.basename(relativePath).toLowerCase() === refBase) {
|
|
26166
26469
|
resolved.push(content);
|
|
26167
26470
|
matched = true;
|
|
26168
26471
|
break;
|
|
@@ -26178,9 +26481,9 @@ function resolveCitedSources(sourceRefs, seeAlso, pageRef, sourceContentMap) {
|
|
|
26178
26481
|
if (normalizedRefs.length > 0) {
|
|
26179
26482
|
return "";
|
|
26180
26483
|
}
|
|
26181
|
-
const pageBase =
|
|
26484
|
+
const pageBase = path28.basename(pageRef).toLowerCase();
|
|
26182
26485
|
for (const [relativePath, content] of sourceContentMap) {
|
|
26183
|
-
if (
|
|
26486
|
+
if (path28.basename(relativePath).toLowerCase() === pageBase) {
|
|
26184
26487
|
return content;
|
|
26185
26488
|
}
|
|
26186
26489
|
}
|
|
@@ -26191,13 +26494,13 @@ async function runIngestionCitationAccuracyBenchmark(options) {
|
|
|
26191
26494
|
throw new Error("ingestionAdapter is required for ingestion benchmarks");
|
|
26192
26495
|
}
|
|
26193
26496
|
const fixture = emailFixture.generate();
|
|
26194
|
-
const fixtureDir = await mkdtemp10(
|
|
26497
|
+
const fixtureDir = await mkdtemp10(path28.join(tmpdir6(), "bench-citation-"));
|
|
26195
26498
|
try {
|
|
26196
26499
|
await options.ingestionAdapter.reset();
|
|
26197
26500
|
for (const file of fixture.files) {
|
|
26198
|
-
const filePath =
|
|
26199
|
-
await
|
|
26200
|
-
await
|
|
26501
|
+
const filePath = path28.join(fixtureDir, file.relativePath);
|
|
26502
|
+
await mkdir14(path28.dirname(filePath), { recursive: true });
|
|
26503
|
+
await writeFile14(filePath, file.content, "utf8");
|
|
26201
26504
|
}
|
|
26202
26505
|
const benchmarkStart = performance.now();
|
|
26203
26506
|
const { result: ingestionLog, durationMs: ingestionDurationMs } = await timed(
|
|
@@ -26394,7 +26697,7 @@ async function runIngestionCitationAccuracyBenchmark(options) {
|
|
|
26394
26697
|
}
|
|
26395
26698
|
};
|
|
26396
26699
|
} finally {
|
|
26397
|
-
await
|
|
26700
|
+
await rm11(fixtureDir, { recursive: true, force: true });
|
|
26398
26701
|
}
|
|
26399
26702
|
}
|
|
26400
26703
|
function citationSupportScore(claim, citedSources) {
|
|
@@ -26582,7 +26885,7 @@ var ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS = ASSISTANT_MORNING_BRIEF_SCENARIOS.
|
|
|
26582
26885
|
|
|
26583
26886
|
// src/benchmarks/remnic/_assistant-common/runner.ts
|
|
26584
26887
|
import { randomUUID as randomUUID27 } from "crypto";
|
|
26585
|
-
import
|
|
26888
|
+
import path30 from "path";
|
|
26586
26889
|
|
|
26587
26890
|
// src/run-seeds.ts
|
|
26588
26891
|
function buildBenchmarkRunSeeds(runCount, baseSeed) {
|
|
@@ -26672,9 +26975,9 @@ function pairedDeltaConfidenceInterval(candidateValues, baselineValues, options
|
|
|
26672
26975
|
}
|
|
26673
26976
|
|
|
26674
26977
|
// src/judges/sealed-rubric.ts
|
|
26675
|
-
import { createHash as
|
|
26978
|
+
import { createHash as createHash8 } from "crypto";
|
|
26676
26979
|
import { appendFileSync, mkdirSync } from "fs";
|
|
26677
|
-
import
|
|
26980
|
+
import path29 from "path";
|
|
26678
26981
|
|
|
26679
26982
|
// src/judges/sealed-prompts/assistant-rubric-v1.ts
|
|
26680
26983
|
var ASSISTANT_RUBRIC_V1 = `# Assistant rubric v1 (sealed)
|
|
@@ -26781,7 +27084,7 @@ function loadSealedRubric(id = DEFAULT_ASSISTANT_RUBRIC_ID, options = {}) {
|
|
|
26781
27084
|
if (typeof prompt !== "string" || prompt.length === 0) {
|
|
26782
27085
|
throw new Error(`sealed rubric not found in registry: ${id}`);
|
|
26783
27086
|
}
|
|
26784
|
-
const sha256 =
|
|
27087
|
+
const sha256 = createHash8("sha256").update(prompt, "utf8").digest("hex");
|
|
26785
27088
|
const version = parseVersionFromId(id);
|
|
26786
27089
|
return { id, version, prompt, sha256 };
|
|
26787
27090
|
}
|
|
@@ -26954,7 +27257,7 @@ function createSpotCheckFileLogger(options) {
|
|
|
26954
27257
|
return { log() {
|
|
26955
27258
|
} };
|
|
26956
27259
|
}
|
|
26957
|
-
const logPath =
|
|
27260
|
+
const logPath = path29.join(directory, `${runId}.jsonl`);
|
|
26958
27261
|
let written = 0;
|
|
26959
27262
|
let warnedOnWriteFailure = false;
|
|
26960
27263
|
const cap = typeof sampleSize === "number" && sampleSize > 0 ? sampleSize : 5;
|
|
@@ -27043,7 +27346,7 @@ async function runAssistantBenchmark(definition, scenarios, resolved, runnerOpti
|
|
|
27043
27346
|
const runId = buildRunId(definition.id);
|
|
27044
27347
|
const spotCheckLogger = createSpotCheckFileLogger({
|
|
27045
27348
|
runId,
|
|
27046
|
-
directory: runnerOptions.spotCheckDir ??
|
|
27349
|
+
directory: runnerOptions.spotCheckDir ?? path30.join(process.cwd(), "benchmarks", "results", "spot-checks"),
|
|
27047
27350
|
sampleRate: 0.35,
|
|
27048
27351
|
sampleSize: 5
|
|
27049
27352
|
});
|
|
@@ -27692,9 +27995,9 @@ async function runAssistantSynthesisBenchmark(options) {
|
|
|
27692
27995
|
|
|
27693
27996
|
// src/benchmarks/remnic/buffer-surprise-trigger/runner.ts
|
|
27694
27997
|
import { randomUUID as randomUUID28 } from "crypto";
|
|
27695
|
-
import
|
|
27998
|
+
import path31 from "path";
|
|
27696
27999
|
import os8 from "os";
|
|
27697
|
-
import { mkdir as
|
|
28000
|
+
import { mkdir as mkdir15, rm as rm12 } from "fs/promises";
|
|
27698
28001
|
import {
|
|
27699
28002
|
SmartBuffer,
|
|
27700
28003
|
computeSurprise,
|
|
@@ -27923,11 +28226,11 @@ function hasExplicitTopicPivotCue(text) {
|
|
|
27923
28226
|
}
|
|
27924
28227
|
async function runBufferSurpriseTriggerBenchmark(options) {
|
|
27925
28228
|
const cases = loadCases10(options.mode, options.limit);
|
|
27926
|
-
const tmpRoot =
|
|
28229
|
+
const tmpRoot = path31.join(
|
|
27927
28230
|
os8.tmpdir(),
|
|
27928
28231
|
`remnic-bench-buffer-surprise-${randomUUID28()}`
|
|
27929
28232
|
);
|
|
27930
|
-
await
|
|
28233
|
+
await mkdir15(tmpRoot, { recursive: true });
|
|
27931
28234
|
const tasks = [];
|
|
27932
28235
|
const startedAt = performance.now();
|
|
27933
28236
|
try {
|
|
@@ -27945,7 +28248,7 @@ async function runBufferSurpriseTriggerBenchmark(options) {
|
|
|
27945
28248
|
tasks.push(buildTaskResult(caseDef, control, candidate));
|
|
27946
28249
|
}
|
|
27947
28250
|
} finally {
|
|
27948
|
-
await
|
|
28251
|
+
await rm12(tmpRoot, { recursive: true, force: true });
|
|
27949
28252
|
}
|
|
27950
28253
|
const totalLatencyMs = Math.round(performance.now() - startedAt);
|
|
27951
28254
|
const aggregates = buildAggregates2(tasks);
|
|
@@ -27992,12 +28295,12 @@ async function runBufferSurpriseTriggerBenchmark(options) {
|
|
|
27992
28295
|
};
|
|
27993
28296
|
}
|
|
27994
28297
|
async function runSingleCase(caseDef, options) {
|
|
27995
|
-
const memoryDir =
|
|
28298
|
+
const memoryDir = path31.join(
|
|
27996
28299
|
options.tmpRoot,
|
|
27997
28300
|
`${caseDef.id}-${options.label}`
|
|
27998
28301
|
);
|
|
27999
|
-
const workspaceDir =
|
|
28000
|
-
await
|
|
28302
|
+
const workspaceDir = path31.join(memoryDir, "workspace");
|
|
28303
|
+
await mkdir15(workspaceDir, { recursive: true });
|
|
28001
28304
|
const config = parseConfig4({
|
|
28002
28305
|
memoryDir,
|
|
28003
28306
|
workspaceDir,
|
|
@@ -29151,8 +29454,8 @@ function finalizeBenchmarkResultConfig(result, options) {
|
|
|
29151
29454
|
}
|
|
29152
29455
|
|
|
29153
29456
|
// src/benchmark.ts
|
|
29154
|
-
var DEFAULT_BASELINE_PATH =
|
|
29155
|
-
var DEFAULT_REPORT_PATH =
|
|
29457
|
+
var DEFAULT_BASELINE_PATH = path32.join(process.cwd(), "benchmarks", "baseline.json");
|
|
29458
|
+
var DEFAULT_REPORT_PATH = path32.join(process.cwd(), "benchmarks", "report.json");
|
|
29156
29459
|
var BASELINE_VERSION = 1;
|
|
29157
29460
|
var DEFAULT_TOLERANCE = 10;
|
|
29158
29461
|
var DEFAULT_FULL_RUN_COUNT = 5;
|
|
@@ -29214,7 +29517,33 @@ async function runBenchmark(benchmarkId, options) {
|
|
|
29214
29517
|
const log = (message) => {
|
|
29215
29518
|
console.error(` ${message}`);
|
|
29216
29519
|
};
|
|
29217
|
-
const
|
|
29520
|
+
const originalSystemJudge = options.system.judge;
|
|
29521
|
+
let systemJudgeMutatedInPlace = false;
|
|
29522
|
+
let judgeCacheCounters;
|
|
29523
|
+
let cachedCrossJudge;
|
|
29524
|
+
let crossJudgeCacheCounters;
|
|
29525
|
+
let primaryDrainPendingWrites;
|
|
29526
|
+
let crossDrainPendingWrites;
|
|
29527
|
+
const cacheWiring = (() => {
|
|
29528
|
+
if (options.noJudgeCache) {
|
|
29529
|
+
return void 0;
|
|
29530
|
+
}
|
|
29531
|
+
const willWrapPrimary = options.system.judge !== void 0 && (options.judgeProvider ?? null) !== null;
|
|
29532
|
+
const willWrapCross = options.amaBenchCrossJudge !== void 0 && (options.amaBenchCrossJudgeProvider ?? null) !== null;
|
|
29533
|
+
if (!willWrapPrimary && !willWrapCross) {
|
|
29534
|
+
return void 0;
|
|
29535
|
+
}
|
|
29536
|
+
const cacheDir = options.judgeCacheDir ? path32.resolve(expandTildePath3(options.judgeCacheDir)) : options.outputDir ? path32.join(path32.resolve(expandTildePath3(options.outputDir)), "judge-cache") : void 0;
|
|
29537
|
+
if (cacheDir === void 0) {
|
|
29538
|
+
return void 0;
|
|
29539
|
+
}
|
|
29540
|
+
return {
|
|
29541
|
+
cache: new JudgeCache({ dir: cacheDir }),
|
|
29542
|
+
willWrapPrimary,
|
|
29543
|
+
willWrapCross
|
|
29544
|
+
};
|
|
29545
|
+
})();
|
|
29546
|
+
let system = !shouldGuardSystem ? options.system : createTimeoutGuardedAdapter(options.system, {
|
|
29218
29547
|
benchmarkId,
|
|
29219
29548
|
...timeoutMs !== void 0 ? { timeoutMs } : {},
|
|
29220
29549
|
...options.drainTimeoutMs !== void 0 ? { drainTimeoutMs: options.drainTimeoutMs } : {},
|
|
@@ -29245,18 +29574,118 @@ async function runBenchmark(benchmarkId, options) {
|
|
|
29245
29574
|
}) : rawIngestionAdapter;
|
|
29246
29575
|
let result;
|
|
29247
29576
|
try {
|
|
29577
|
+
if (cacheWiring?.willWrapPrimary && system.judge !== void 0) {
|
|
29578
|
+
const primary = wrapJudgeWithCache({
|
|
29579
|
+
role: "primary",
|
|
29580
|
+
judge: system.judge,
|
|
29581
|
+
benchmarkId,
|
|
29582
|
+
datasetVersion: definition.meta.version,
|
|
29583
|
+
amaBenchJudgeProtocol: options.amaBenchJudgeProtocol ?? "default",
|
|
29584
|
+
provider: options.judgeProvider ?? null,
|
|
29585
|
+
cache: cacheWiring.cache
|
|
29586
|
+
});
|
|
29587
|
+
judgeCacheCounters = primary.counters;
|
|
29588
|
+
primaryDrainPendingWrites = primary.drainPendingWrites;
|
|
29589
|
+
try {
|
|
29590
|
+
system.judge = primary.judge;
|
|
29591
|
+
systemJudgeMutatedInPlace = system === options.system;
|
|
29592
|
+
} catch {
|
|
29593
|
+
system = createJudgeOverrideProxy(system, primary.judge);
|
|
29594
|
+
}
|
|
29595
|
+
}
|
|
29596
|
+
if (cacheWiring?.willWrapCross) {
|
|
29597
|
+
const wrapped = wrapJudgeWithCache({
|
|
29598
|
+
role: "cross",
|
|
29599
|
+
judge: options.amaBenchCrossJudge,
|
|
29600
|
+
benchmarkId,
|
|
29601
|
+
datasetVersion: definition.meta.version,
|
|
29602
|
+
amaBenchJudgeProtocol: options.amaBenchJudgeProtocol ?? "default",
|
|
29603
|
+
provider: options.amaBenchCrossJudgeProvider ?? null,
|
|
29604
|
+
cache: cacheWiring.cache
|
|
29605
|
+
});
|
|
29606
|
+
cachedCrossJudge = wrapped.judge;
|
|
29607
|
+
crossJudgeCacheCounters = wrapped.counters;
|
|
29608
|
+
crossDrainPendingWrites = wrapped.drainPendingWrites;
|
|
29609
|
+
}
|
|
29248
29610
|
result = await registeredBenchmark.run({
|
|
29249
29611
|
...options,
|
|
29250
29612
|
system,
|
|
29613
|
+
// PR #1591 P2 (thread #10): when caching is on AND a cross judge is
|
|
29614
|
+
// configured, hand the cached cross judge to the runner so AMA-Bench
|
|
29615
|
+
// cross-judge calls participate in the same content-keyed cache as
|
|
29616
|
+
// the primary system judge. Without this override, the runner kept
|
|
29617
|
+
// calling the unwrapped cross judge on every iteration.
|
|
29618
|
+
...cachedCrossJudge ? { amaBenchCrossJudge: cachedCrossJudge } : {},
|
|
29251
29619
|
...ingestionAdapter ? { ingestionAdapter } : {},
|
|
29252
29620
|
mode: options.mode ?? "quick",
|
|
29253
29621
|
benchmark: definition
|
|
29254
29622
|
});
|
|
29255
29623
|
} finally {
|
|
29256
|
-
|
|
29624
|
+
try {
|
|
29625
|
+
await destroyOwnedIngestionAdapter();
|
|
29626
|
+
} finally {
|
|
29627
|
+
if (systemJudgeMutatedInPlace) {
|
|
29628
|
+
options.system.judge = originalSystemJudge;
|
|
29629
|
+
}
|
|
29630
|
+
}
|
|
29631
|
+
if (primaryDrainPendingWrites) {
|
|
29632
|
+
await primaryDrainPendingWrites();
|
|
29633
|
+
}
|
|
29634
|
+
if (crossDrainPendingWrites) {
|
|
29635
|
+
await crossDrainPendingWrites();
|
|
29636
|
+
}
|
|
29637
|
+
}
|
|
29638
|
+
const primaryCalls = judgeCacheCounters?.modelCalls ?? 0;
|
|
29639
|
+
const crossCalls = crossJudgeCacheCounters?.modelCalls ?? 0;
|
|
29640
|
+
if (judgeCacheCounters !== void 0 || crossJudgeCacheCounters !== void 0) {
|
|
29641
|
+
result.cost.judgeModelCalls = primaryCalls + crossCalls;
|
|
29257
29642
|
}
|
|
29258
29643
|
return finalizeBenchmarkResultConfig(result, options);
|
|
29259
29644
|
}
|
|
29645
|
+
function wrapJudgeWithCache(args) {
|
|
29646
|
+
const crossJudgeIdSuffix = args.role === "cross" ? "-cross" : "";
|
|
29647
|
+
const wrapped = runJudgeWithCache({
|
|
29648
|
+
judge: args.judge,
|
|
29649
|
+
cache: args.cache,
|
|
29650
|
+
keyExtras: {
|
|
29651
|
+
benchmarkId: `${args.benchmarkId}${crossJudgeIdSuffix}`,
|
|
29652
|
+
datasetVersion: args.datasetVersion,
|
|
29653
|
+
// Protocol identity: bench judge protocol version + the selected
|
|
29654
|
+
// judge protocol variant, suffixed by role so primary vs cross
|
|
29655
|
+
// differentiator is part of the prompt hash. Bumping
|
|
29656
|
+
// JUDGE_CACHE_PROTOCOL_VERSION invalidates verdicts when judge
|
|
29657
|
+
// prompt/parse semantics change (PR #1591, High).
|
|
29658
|
+
judgePromptHash: createHash9("sha256").update(JUDGE_CACHE_PROTOCOL_VERSION).update("").update(args.amaBenchJudgeProtocol).update("").update(args.role).digest("hex"),
|
|
29659
|
+
judgeModelId: args.provider?.model !== void 0 && args.provider.model.length > 0 ? `${args.provider.model}${crossJudgeIdSuffix}` : `unknown-${args.role}-judge`,
|
|
29660
|
+
// Full judge configuration, deterministically serialized (sorted
|
|
29661
|
+
// keys) so provider/base-url/retry changes produce fresh cache
|
|
29662
|
+
// keys. `role` is included so primary and cross judges never
|
|
29663
|
+
// share a paramsHash.
|
|
29664
|
+
judgeParamsHash: createHash9("sha256").update(
|
|
29665
|
+
stableStringify2({
|
|
29666
|
+
role: args.role,
|
|
29667
|
+
provider: args.provider
|
|
29668
|
+
})
|
|
29669
|
+
).digest("hex")
|
|
29670
|
+
}
|
|
29671
|
+
});
|
|
29672
|
+
return {
|
|
29673
|
+
judge: wrapped,
|
|
29674
|
+
counters: wrapped.counters,
|
|
29675
|
+
drainPendingWrites: wrapped.drainPendingWrites
|
|
29676
|
+
};
|
|
29677
|
+
}
|
|
29678
|
+
function createJudgeOverrideProxy(adapter, judge) {
|
|
29679
|
+
return new Proxy(adapter, {
|
|
29680
|
+
get(target, prop) {
|
|
29681
|
+
if (prop === "judge") {
|
|
29682
|
+
return judge;
|
|
29683
|
+
}
|
|
29684
|
+
const value = Reflect.get(target, prop, target);
|
|
29685
|
+
return typeof value === "function" ? value.bind(target) : value;
|
|
29686
|
+
}
|
|
29687
|
+
});
|
|
29688
|
+
}
|
|
29260
29689
|
function benchmarkDefinition(id) {
|
|
29261
29690
|
const definition = getBenchmark(id);
|
|
29262
29691
|
if (!definition) {
|
|
@@ -29293,7 +29722,7 @@ function loadBaseline(baselinePath) {
|
|
|
29293
29722
|
return raw;
|
|
29294
29723
|
}
|
|
29295
29724
|
/**
 * Writes `baseline` to `baselinePath` as 2-space-indented JSON followed by a
 * trailing newline, creating the parent directory chain first.
 *
 * @param {string} baselinePath - Destination file path.
 * @param {*} baseline - Value to serialize with JSON.stringify.
 */
function saveBaseline(baselinePath, baseline) {
  const parentDir = path32.dirname(baselinePath);
  fs2.mkdirSync(parentDir, { recursive: true });
  // Serialize only after the directory exists, matching the original order.
  const serialized = JSON.stringify(baseline, null, 2);
  fs2.writeFileSync(baselinePath, `${serialized}\n`);
}
|
|
@@ -29523,7 +29952,7 @@ function generateReport(results, reportPath) {
|
|
|
29523
29952
|
totalDurationMs: results.reduce((sum, result) => sum + result.totalDurationMs, 0)
|
|
29524
29953
|
};
|
|
29525
29954
|
if (reportPath) {
|
|
29526
|
-
fs2.mkdirSync(
|
|
29955
|
+
fs2.mkdirSync(path32.dirname(reportPath), { recursive: true });
|
|
29527
29956
|
fs2.writeFileSync(reportPath, `${JSON.stringify(report, null, 2)}
|
|
29528
29957
|
`);
|
|
29529
29958
|
}
|
|
@@ -29670,7 +30099,7 @@ function getBenchmarkLowerIsBetter(benchmarkId) {
|
|
|
29670
30099
|
}
|
|
29671
30100
|
|
|
29672
30101
|
// src/integrity/sealed-qrels.ts
|
|
29673
|
-
import { readFile as
|
|
30102
|
+
import { readFile as readFile18 } from "fs/promises";
|
|
29674
30103
|
function isSealedQrelsArtifact(value) {
|
|
29675
30104
|
if (!value || typeof value !== "object") {
|
|
29676
30105
|
return false;
|
|
@@ -29740,7 +30169,7 @@ function parseSealedQrels(raw, options = {}) {
|
|
|
29740
30169
|
};
|
|
29741
30170
|
}
|
|
29742
30171
|
/**
 * Reads the file at `filePath` as UTF-8 text and passes it to
 * `parseSealedQrels`.
 *
 * @param {string} filePath - File to read.
 * @param {object} [options={}] - Forwarded unchanged to `parseSealedQrels`.
 * @returns {Promise<*>} Whatever `parseSealedQrels` returns for the file text.
 */
async function loadSealedQrels(filePath, options = {}) {
  return parseSealedQrels(await readFile18(filePath, "utf8"), options);
}
|
|
29746
30175
|
function serializeSealedQrels(artifact) {
|
|
@@ -29860,7 +30289,7 @@ function selectFixtureVariant(variants, seed) {
|
|
|
29860
30289
|
}
|
|
29861
30290
|
|
|
29862
30291
|
// src/benchmarks/custom/loader.ts
|
|
29863
|
-
import { readFile as
|
|
30292
|
+
import { readFile as readFile19 } from "fs/promises";
|
|
29864
30293
|
import { parse as parseYaml } from "yaml";
|
|
29865
30294
|
var CUSTOM_SCORING_VALUES = /* @__PURE__ */ new Set([
|
|
29866
30295
|
"exact_match",
|
|
@@ -29880,7 +30309,7 @@ function parseCustomBenchmark(source) {
|
|
|
29880
30309
|
async function loadCustomBenchmarkFile(filePath) {
|
|
29881
30310
|
let source;
|
|
29882
30311
|
try {
|
|
29883
|
-
source = await
|
|
30312
|
+
source = await readFile19(filePath, "utf8");
|
|
29884
30313
|
} catch (error) {
|
|
29885
30314
|
throw new Error(
|
|
29886
30315
|
`Failed to read custom benchmark file ${filePath}: ${formatError(error)}`
|
|
@@ -29988,15 +30417,60 @@ function formatError(error) {
|
|
|
29988
30417
|
|
|
29989
30418
|
// src/benchmarks/custom/runner.ts
|
|
29990
30419
|
import { randomUUID as randomUUID31 } from "crypto";
|
|
29991
|
-
import
|
|
30420
|
+
import path33 from "path";
|
|
30421
|
+
import { expandTildePath as expandTildePath4 } from "@remnic/core";
|
|
29992
30422
|
/**
 * Loads a custom benchmark spec from `filePath`, runs it, and returns the
 * result. For `llm_judge` scoring the system's judge is routed through a
 * `JudgeCache` when all of the following hold: the system has a judge,
 * `noJudgeCache` is not set, a `judgeProvider` is configured, and a cache
 * directory can be derived.
 *
 * Cache directory: `options.judgeCacheDir` if set, otherwise
 * `<options.outputDir>/judge-cache`; with neither, no caching is applied.
 *
 * @param {string} filePath - Path of the custom benchmark file.
 * @param {object} options - Run options; `mode` defaults to "quick".
 * @returns {Promise<object>} Result of `runCustomBenchmark`; when caching was
 *   applied, `cost.judgeModelCalls` is overwritten with the wrapper's
 *   `modelCalls` counter.
 */
async function runCustomBenchmarkFile(filePath, options) {
  const spec = await loadCustomBenchmarkFile(filePath);
  const benchmark = createCustomBenchmarkDefinition(spec, filePath);
  const runOptions = {
    ...options,
    mode: options.mode ?? "quick",
    benchmark
  };
  let cacheRestore;
  let cacheCounters;
  // Truthiness rather than `!== undefined`: a `null` judge must stay unwrapped
  // so it still reaches runCustomBenchmark's own `!options.system.judge`
  // guard instead of being hidden behind a truthy caching wrapper.
  const judgeCacheEligible = spec.scoring === "llm_judge" && Boolean(runOptions.system.judge) && !runOptions.noJudgeCache && (runOptions.judgeProvider ?? null) !== null;
  if (judgeCacheEligible) {
    const cacheDir = runOptions.judgeCacheDir ? path33.resolve(expandTildePath4(runOptions.judgeCacheDir)) : runOptions.outputDir ? path33.join(path33.resolve(expandTildePath4(runOptions.outputDir)), "judge-cache") : void 0;
    if (cacheDir !== void 0) {
      const originalJudge = runOptions.system.judge;
      const wrapped = wrapJudgeWithCache({
        role: "primary",
        judge: originalJudge,
        benchmarkId: benchmark.id,
        datasetVersion: benchmark.meta.version,
        amaBenchJudgeProtocol: runOptions.amaBenchJudgeProtocol ?? "default",
        provider: runOptions.judgeProvider ?? null,
        cache: new JudgeCache({ dir: cacheDir })
      });
      // `runOptions` is a shallow copy, so `runOptions.system` is the caller's
      // object. Prefer swapping the judge in place (and restoring it later);
      // if the assignment throws (e.g. a read-only `judge` property), fall
      // back to a proxy so the caller's object is left untouched.
      let systemJudgeMutatedInPlace = false;
      try {
        runOptions.system.judge = wrapped.judge;
        systemJudgeMutatedInPlace = true;
      } catch {
        runOptions.system = createJudgeOverrideProxy(runOptions.system, wrapped.judge);
      }
      cacheCounters = wrapped.counters;
      const needRestore = systemJudgeMutatedInPlace;
      cacheRestore = async () => {
        // Put the caller's judge back before waiting on cache writes.
        if (needRestore) {
          runOptions.system.judge = originalJudge;
        }
        await wrapped.drainPendingWrites();
      };
    }
  }
  let result;
  try {
    result = await runCustomBenchmark(spec, runOptions);
  } finally {
    // Runs on success and failure so the judge is always restored.
    if (cacheRestore) {
      await cacheRestore();
    }
  }
  if (cacheCounters) {
    result.cost.judgeModelCalls = cacheCounters.modelCalls;
  }
  return result;
}
|
|
30001
30475
|
async function runCustomBenchmark(spec, options) {
|
|
30002
30476
|
if (spec.scoring === "llm_judge" && !options.system.judge) {
|
|
@@ -30157,7 +30631,7 @@ async function scoreTask(scoring, options, question, actual, expected) {
|
|
|
30157
30631
|
}
|
|
30158
30632
|
}
|
|
30159
30633
|
function createCustomBenchmarkDefinition(benchmark, filePath) {
|
|
30160
|
-
const id = `custom:${slugify(
|
|
30634
|
+
const id = `custom:${slugify(path33.basename(filePath, path33.extname(filePath)) || benchmark.name)}`;
|
|
30161
30635
|
return {
|
|
30162
30636
|
id,
|
|
30163
30637
|
title: benchmark.name,
|
|
@@ -31027,9 +31501,9 @@ var chatFixture = {
|
|
|
31027
31501
|
};
|
|
31028
31502
|
|
|
31029
31503
|
// src/benchmarks/remnic/procedural-recall/ablation.ts
|
|
31030
|
-
import { mkdir as
|
|
31504
|
+
import { mkdir as mkdir16, mkdtemp as mkdtemp11, rm as rm13, writeFile as writeFile15, readFile as readFile20 } from "fs/promises";
|
|
31031
31505
|
import os9 from "os";
|
|
31032
|
-
import
|
|
31506
|
+
import path34 from "path";
|
|
31033
31507
|
import {
|
|
31034
31508
|
StorageManager as StorageManager3,
|
|
31035
31509
|
parseConfig as parseConfig5,
|
|
@@ -31060,7 +31534,7 @@ async function runSide(scenarios, proceduralEnabled) {
|
|
|
31060
31534
|
const observed = [];
|
|
31061
31535
|
for (const scenario of scenarios) {
|
|
31062
31536
|
const dir = await mkdtemp11(
|
|
31063
|
-
|
|
31537
|
+
path34.join(os9.tmpdir(), "remnic-bench-proc-ablation-")
|
|
31064
31538
|
);
|
|
31065
31539
|
try {
|
|
31066
31540
|
const storage = new StorageManager3(dir);
|
|
@@ -31075,7 +31549,7 @@ ${body}`,
|
|
|
31075
31549
|
);
|
|
31076
31550
|
const config = parseConfig5({
|
|
31077
31551
|
memoryDir: dir,
|
|
31078
|
-
workspaceDir:
|
|
31552
|
+
workspaceDir: path34.join(dir, "ws"),
|
|
31079
31553
|
openaiApiKey: "bench-key",
|
|
31080
31554
|
procedural: {
|
|
31081
31555
|
enabled: proceduralEnabled,
|
|
@@ -31089,7 +31563,7 @@ ${body}`,
|
|
|
31089
31563
|
);
|
|
31090
31564
|
observed.push(section !== null && section.length > 0);
|
|
31091
31565
|
} finally {
|
|
31092
|
-
await
|
|
31566
|
+
await rm13(dir, { recursive: true, force: true });
|
|
31093
31567
|
}
|
|
31094
31568
|
}
|
|
31095
31569
|
return observed;
|
|
@@ -31150,7 +31624,7 @@ async function runProceduralAblation(options) {
|
|
|
31150
31624
|
};
|
|
31151
31625
|
}
|
|
31152
31626
|
async function loadAblationFixture(fixturePath) {
|
|
31153
|
-
const raw = await
|
|
31627
|
+
const raw = await readFile20(fixturePath, "utf8");
|
|
31154
31628
|
let parsed;
|
|
31155
31629
|
try {
|
|
31156
31630
|
parsed = JSON.parse(raw);
|
|
@@ -31246,9 +31720,9 @@ async function runProceduralAblationCli(args) {
|
|
|
31246
31720
|
random: args.random,
|
|
31247
31721
|
seed: args.seed
|
|
31248
31722
|
});
|
|
31249
|
-
const outDir =
|
|
31250
|
-
await
|
|
31251
|
-
await
|
|
31723
|
+
const outDir = path34.dirname(path34.resolve(args.outPath));
|
|
31724
|
+
await mkdir16(outDir, { recursive: true });
|
|
31725
|
+
await writeFile15(args.outPath, JSON.stringify(artifact, null, 2) + "\n", "utf8");
|
|
31252
31726
|
return artifact;
|
|
31253
31727
|
}
|
|
31254
31728
|
|