@remnic/bench 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,99 @@
1
+ # @remnic/bench
2
+
3
+ Benchmark suite and CI regression gates for [Remnic](https://github.com/joshuaswarren/remnic) memory pipelines. Ships the runners, adapters, and results store that the `remnic bench` CLI surface drives.
4
+
5
+ `@remnic/bench` is an **optional companion** to [`@remnic/cli`](https://www.npmjs.com/package/@remnic/cli). Install it only when you need to run benchmarks, compare runs, or publish results. Memory-only users do not need it.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ # Alongside the CLI:
11
+ npm install -g @remnic/cli @remnic/bench
12
+
13
+ # Or in a project that drives benchmarks programmatically:
14
+ pnpm add @remnic/bench
15
+ ```
16
+
17
+ The CLI loads `@remnic/bench` via a computed-specifier dynamic import. If it's not installed, `remnic bench *` prints a clear install hint; the rest of the CLI keeps working.
18
+
19
+ ## What it does
20
+
21
+ - **Benchmark runners** for a growing set of memory-oriented evals: `longmemeval`, `locomo`, `memory-arena`, `amemgym`, `ama-bench`, plus a lightweight smoke fixture.
22
+ - **Stored-run management** — every `remnic bench run *` writes a timestamped JSON result under `~/.remnic/bench/results/`; `remnic bench runs list|show|delete` let you browse, inspect, and prune.
23
+ - **Baselines + regression gates** — save a run as a named baseline, compare candidates against it, gate CI on threshold violations.
24
+ - **Result export** — `remnic bench export <run> --format json|csv|html`.
25
+ - **Published feed** — `remnic bench publish --target remnic-ai` builds the tamper-evident integrity manifest consumed by remnic.ai.
26
+ - **Provider discovery** — `remnic bench providers discover` enumerates local OpenAI / Anthropic / Ollama / LiteLLM providers for adapter wiring.
27
+
28
+ ## CLI quick reference
29
+
30
+ ```bash
31
+ # List available benchmarks:
32
+ remnic bench list
33
+
34
+ # Download a dataset for a full run:
35
+ remnic bench datasets download longmemeval
36
+
37
+ # Full run on the downloaded dataset:
38
+ remnic bench run longmemeval
39
+
40
+ # 60-second smoke run on the bundled fixture:
41
+ remnic bench run --quick longmemeval
42
+
43
+ # Browse stored runs:
44
+ remnic bench runs list
45
+ remnic bench runs show <run-id> --detail
46
+
47
+ # Compare two runs:
48
+ remnic bench compare base-run candidate-run
49
+
50
+ # Save a baseline (archives the run under ~/.remnic/bench/baselines):
51
+ remnic bench baseline save dashboard-v1 candidate-run
52
+
53
+ # Gate CI against a stored run with a 2% threshold. `compare` takes run
54
+ # ids or paths, not baseline names, so use `baseline save` only for
55
+ # archival and pass the underlying run id to `compare`:
56
+ remnic bench compare candidate-run nightly-run --threshold 0.02
57
+
58
+ # Ship results to remnic.ai:
59
+ remnic bench publish --target remnic-ai
60
+ ```
61
+
62
+ Dataset markers match each runner's accepted filenames, so `datasets status` reports "downloaded" exactly when the runner will be able to load the dataset.
63
+
64
+ ## Programmatic API
65
+
66
+ ```ts
67
+ import {
68
+ listBenchmarks,
69
+ runBenchmark,
70
+ writeBenchmarkResult,
71
+ createLightweightAdapter,
72
+ createRemnicAdapter,
73
+ compareResults,
74
+ saveBenchmarkBaseline,
75
+ listBenchmarkResults,
76
+ deleteBenchmarkResults,
77
+ buildBenchmarkPublishFeed,
78
+ discoverAllProviders,
79
+ type BenchmarkResult,
80
+ type ComparisonResult,
81
+ type BenchmarkDefinition,
82
+ } from "@remnic/bench";
83
+ ```
84
+
85
+ Each runner accepts a `system` adapter — `createRemnicAdapter()` talks to a live `@remnic/core` Orchestrator; `createLightweightAdapter()` is a minimal in-memory stand-in used for CI smoke runs. Results conform to the `BenchmarkResult` schema (see `dist/index.d.ts`).
86
+
87
+ ## Agent note
88
+
89
+ If you're an AI agent extending a Remnic-based stack: **do not** import `@remnic/bench` from a base install surface (CLI, core, plugin). Optional companion packages must be loaded via computed-specifier dynamic imports with an install-hint fallback. See `packages/remnic-cli/src/optional-bench.ts` in the repo for the canonical pattern, and the à-la-carte invariant in the repo's `AGENTS.md` §44 / `CLAUDE.md` gotcha #57.
90
+
91
+ ## Related
92
+
93
+ - [`@remnic/cli`](https://www.npmjs.com/package/@remnic/cli) — the CLI that drives `remnic bench *`
94
+ - [`@remnic/core`](https://www.npmjs.com/package/@remnic/core) — the memory engine bench adapters talk to
95
+ - Source + issues: <https://github.com/joshuaswarren/remnic>
96
+
97
+ ## License
98
+
99
+ MIT. See the root [LICENSE](https://github.com/joshuaswarren/remnic/blob/main/LICENSE) file.
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { GatewayConfig, EngramAccessService } from '@remnic/core';
1
+ import { GatewayConfig, FallbackLlmRuntimeContext, FallbackLlmClient, EngramAccessService } from '@remnic/core';
2
2
 
3
3
  /**
4
4
  * Types for the ingestion benchmark tier.
@@ -907,6 +907,9 @@ declare function clampScore(value: number): number;
907
907
  interface GatewayResponderOptions {
908
908
  gatewayConfig?: GatewayConfig;
909
909
  agentId?: string;
910
+ agentDir?: string;
911
+ workspaceDir?: string;
912
+ llmFactory?: (gatewayConfig: GatewayConfig, runtimeContext: FallbackLlmRuntimeContext) => Pick<FallbackLlmClient, "chatCompletion">;
910
913
  }
911
914
  declare function createResponderFromProvider(provider: LlmProvider): BenchResponder;
912
915
  declare function createProviderBackedResponder(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchResponder;
package/dist/index.js CHANGED
@@ -731,8 +731,9 @@ var OpenAiCompatibleProvider = class {
731
731
  })
732
732
  });
733
733
  if (!response.ok) {
734
+ const errorBody = await readErrorBody(response);
734
735
  throw new Error(
735
- `OpenAI-compatible completion failed: ${response.status} ${response.statusText}`
736
+ `OpenAI-compatible completion failed: ${response.status} ${response.statusText}${errorBody ? ` \u2014 ${errorBody}` : ""}`
736
737
  );
737
738
  }
738
739
  const payload = await response.json();
@@ -801,6 +802,17 @@ var OpenAiCompatibleProvider = class {
801
802
  return `${normalizedBase}/${normalizedPath}`;
802
803
  }
803
804
  };
805
+ async function readErrorBody(response) {
806
+ try {
807
+ const text = (await response.text()).trim();
808
+ if (text.length === 0) {
809
+ return "";
810
+ }
811
+ return text.replace(/\s+/g, " ").slice(0, 400);
812
+ } catch {
813
+ return "";
814
+ }
815
+ }
804
816
  function readMessageText(payload) {
805
817
  const content = payload.choices?.[0]?.message?.content;
806
818
  if (typeof content === "string") {
@@ -1113,7 +1125,11 @@ function createGatewayResponder(options) {
1113
1125
  if (!options.gatewayConfig) {
1114
1126
  throw new Error("gateway responder requires gatewayConfig");
1115
1127
  }
1116
- const llm = new FallbackLlmClient(options.gatewayConfig);
1128
+ const runtimeContext = {
1129
+ ...options.agentDir ? { agentDir: options.agentDir } : {},
1130
+ ...options.workspaceDir ? { workspaceDir: options.workspaceDir } : {}
1131
+ };
1132
+ const llm = options.llmFactory?.(options.gatewayConfig, runtimeContext) ?? new FallbackLlmClient(options.gatewayConfig, runtimeContext);
1117
1133
  return {
1118
1134
  async respond(question, recalledText) {
1119
1135
  const startedAt = performance.now();
@@ -1241,6 +1257,7 @@ function clampNormalizedScore(value) {
1241
1257
  }
1242
1258
 
1243
1259
  // src/runtime-profiles.ts
1260
+ import path2 from "path";
1244
1261
  import { readFile } from "fs/promises";
1245
1262
  import {
1246
1263
  resolveRemnicPluginEntry
@@ -1421,7 +1438,8 @@ async function resolveBenchRuntimeProfile(options) {
1421
1438
  const fastGatewayAgentId = options.fastGatewayAgentId ?? asNonEmptyString(openclawRuntime.remnicConfig.fastGatewayAgentId);
1422
1439
  const gatewayResponder = createGatewayResponder({
1423
1440
  gatewayConfig,
1424
- agentId: gatewayAgentId
1441
+ agentId: gatewayAgentId,
1442
+ ...openclawRuntime.runtimeContext
1425
1443
  });
1426
1444
  const persistedRemnicConfig = sanitizePersistedConfig(
1427
1445
  {
@@ -1485,7 +1503,15 @@ async function loadOpenclawRuntimeConfig(filePath) {
1485
1503
  return {
1486
1504
  remnicConfig,
1487
1505
  gatewayConfig,
1488
- persistedGatewayConfig: sanitizeGatewayConfig(gatewayConfig)
1506
+ persistedGatewayConfig: sanitizeGatewayConfig(gatewayConfig),
1507
+ runtimeContext: deriveOpenclawRuntimeContext(filePath)
1508
+ };
1509
+ }
1510
+ function deriveOpenclawRuntimeContext(configPath) {
1511
+ const rootDir = path2.dirname(path2.resolve(configPath));
1512
+ return {
1513
+ agentDir: path2.join(rootDir, "agents", "main", "agent"),
1514
+ workspaceDir: path2.join(rootDir, "workspace")
1489
1515
  };
1490
1516
  }
1491
1517
  async function loadJsonObject(filePath, label) {
@@ -1604,12 +1630,12 @@ function isPlainObject(value) {
1604
1630
 
1605
1631
  // src/benchmark.ts
1606
1632
  import fs from "fs";
1607
- import path23 from "path";
1633
+ import path24 from "path";
1608
1634
 
1609
1635
  // src/benchmarks/published/ama-bench/runner.ts
1610
1636
  import { randomUUID } from "crypto";
1611
1637
  import { readFile as readFile3 } from "fs/promises";
1612
- import path3 from "path";
1638
+ import path4 from "path";
1613
1639
 
1614
1640
  // src/benchmarks/published/ama-bench/fixture.ts
1615
1641
  var AMA_BENCH_SMOKE_FIXTURE = [
@@ -1818,7 +1844,7 @@ function longestCommonSubsequence(left, right) {
1818
1844
  // src/reporter.ts
1819
1845
  import { execSync } from "child_process";
1820
1846
  import { mkdir as mkdir2, readFile as readFile2, writeFile } from "fs/promises";
1821
- import path2 from "path";
1847
+ import path3 from "path";
1822
1848
  function sanitizeFilenameSegment(value) {
1823
1849
  const sanitized = value.trim().replace(/[^a-zA-Z0-9._-]/g, "_");
1824
1850
  return sanitized.length > 0 ? sanitized : "unknown";
@@ -1827,7 +1853,7 @@ async function writeBenchmarkResult(result, outputDir) {
1827
1853
  await mkdir2(outputDir, { recursive: true });
1828
1854
  const safeRemnicVersion = sanitizeFilenameSegment(result.meta.remnicVersion);
1829
1855
  const timestamp = result.meta.timestamp.replace(/[:.]/g, "-");
1830
- const filePath = path2.join(
1856
+ const filePath = path3.join(
1831
1857
  outputDir,
1832
1858
  `${result.meta.benchmark}-v${safeRemnicVersion}-${timestamp}.json`
1833
1859
  );
@@ -1838,7 +1864,7 @@ async function getRemnicVersion() {
1838
1864
  try {
1839
1865
  const packageJson = JSON.parse(
1840
1866
  await readFile2(
1841
- path2.resolve(import.meta.dirname, "../../../package.json"),
1867
+ path3.resolve(import.meta.dirname, "../../../package.json"),
1842
1868
  "utf8"
1843
1869
  )
1844
1870
  );
@@ -1990,7 +2016,7 @@ async function loadDataset(mode, datasetDir, limit) {
1990
2016
  return episodes;
1991
2017
  };
1992
2018
  if (datasetDir) {
1993
- const filePath = path3.join(datasetDir, "open_end_qa_set.jsonl");
2019
+ const filePath = path4.join(datasetDir, "open_end_qa_set.jsonl");
1994
2020
  let raw;
1995
2021
  try {
1996
2022
  raw = await readFile3(filePath, "utf8");
@@ -2063,12 +2089,10 @@ function parseEpisode(line, lineNumber) {
2063
2089
  if (!Number.isFinite(record.num_turns) || !Number.isFinite(record.total_tokens)) {
2064
2090
  throw new Error(`${location} must include numeric num_turns and total_tokens fields.`);
2065
2091
  }
2066
- if (!isValidTrajectory(record.trajectory)) {
2067
- throw new Error(`${location} must include a trajectory array with action/observation turns.`);
2068
- }
2069
2092
  if (!isValidQaPairs(record.qa_pairs)) {
2070
2093
  throw new Error(`${location} must include a qa_pairs array with question/answer/type/question_uuid strings.`);
2071
2094
  }
2095
+ const trajectory = normalizeTrajectory(record.trajectory, location);
2072
2096
  return {
2073
2097
  episode_id: record.episode_id,
2074
2098
  task: record.task,
@@ -2077,14 +2101,45 @@ function parseEpisode(line, lineNumber) {
2077
2101
  success: record.success,
2078
2102
  num_turns: record.num_turns,
2079
2103
  total_tokens: record.total_tokens,
2080
- trajectory: record.trajectory,
2104
+ trajectory,
2081
2105
  qa_pairs: record.qa_pairs
2082
2106
  };
2083
2107
  }
2084
- function isValidTrajectory(value) {
2085
- return Array.isArray(value) && value.every(
2086
- (turn) => !!turn && typeof turn === "object" && !Array.isArray(turn) && Number.isInteger(turn.turn_idx) && typeof turn.action === "string" && typeof turn.observation === "string"
2087
- );
2108
+ function normalizeTrajectory(value, location) {
2109
+ if (!Array.isArray(value)) {
2110
+ throw new Error(`${location} must include a trajectory array with action/observation turns.`);
2111
+ }
2112
+ return value.map((turn, index) => {
2113
+ if (!turn || typeof turn !== "object" || Array.isArray(turn)) {
2114
+ throw new Error(`${location} trajectory[${index}] must be an object.`);
2115
+ }
2116
+ const record = turn;
2117
+ if (!Number.isInteger(record.turn_idx)) {
2118
+ throw new Error(`${location} trajectory[${index}] must include an integer turn_idx.`);
2119
+ }
2120
+ if (!("action" in record) || !("observation" in record)) {
2121
+ throw new Error(
2122
+ `${location} must include a trajectory array with action/observation turns.`
2123
+ );
2124
+ }
2125
+ return {
2126
+ turn_idx: record.turn_idx,
2127
+ action: normalizeTrajectoryText(record.action, `${location} trajectory[${index}].action`),
2128
+ observation: normalizeTrajectoryText(
2129
+ record.observation,
2130
+ `${location} trajectory[${index}].observation`
2131
+ )
2132
+ };
2133
+ });
2134
+ }
2135
+ function normalizeTrajectoryText(value, field) {
2136
+ if (typeof value === "string") {
2137
+ return value;
2138
+ }
2139
+ if (value === null) {
2140
+ return "";
2141
+ }
2142
+ throw new Error(`${field} must be a string or null.`);
2088
2143
  }
2089
2144
  function isValidQaPairs(value) {
2090
2145
  return Array.isArray(value) && value.every(
@@ -2095,7 +2150,7 @@ function isValidQaPairs(value) {
2095
2150
  // src/benchmarks/published/amemgym/runner.ts
2096
2151
  import { randomUUID as randomUUID2 } from "crypto";
2097
2152
  import { readFile as readFile4 } from "fs/promises";
2098
- import path4 from "path";
2153
+ import path5 from "path";
2099
2154
 
2100
2155
  // src/benchmarks/published/amemgym/fixture.ts
2101
2156
  var AMEMGYM_SMOKE_FIXTURE = [
@@ -2333,7 +2388,7 @@ async function loadDataset2(mode, datasetDir, limit) {
2333
2388
  const datasetErrors = [];
2334
2389
  for (const filename of DATASET_FILENAMES) {
2335
2390
  try {
2336
- const raw = await readFile4(path4.join(datasetDir, filename), "utf8");
2391
+ const raw = await readFile4(path5.join(datasetDir, filename), "utf8");
2337
2392
  const parsed = parseDataset(raw, filename);
2338
2393
  return ensureDatasetProfiles(applyLimit2(parsed, normalizedLimit));
2339
2394
  } catch (error) {
@@ -2418,7 +2473,7 @@ function normalizeRole(role) {
2418
2473
  // src/benchmarks/published/memory-arena/runner.ts
2419
2474
  import { randomUUID as randomUUID3 } from "crypto";
2420
2475
  import { readFile as readFile5, readdir } from "fs/promises";
2421
- import path5 from "path";
2476
+ import path6 from "path";
2422
2477
 
2423
2478
  // src/benchmarks/published/memory-arena/fixture.ts
2424
2479
  var MEMORY_ARENA_SMOKE_FIXTURE = [
@@ -2610,7 +2665,7 @@ async function loadDataset3(mode, datasetDir, limit) {
2610
2665
  if (remainingLimit2 === 0) {
2611
2666
  break;
2612
2667
  }
2613
- const raw = await readFile5(path5.join(datasetDir, filename), "utf8");
2668
+ const raw = await readFile5(path6.join(datasetDir, filename), "utf8");
2614
2669
  const parsedTasks = [];
2615
2670
  raw.split("\n").forEach((line, lineIndex) => {
2616
2671
  if (line.trim().length === 0) {
@@ -2686,9 +2741,7 @@ function parseTask(line, filename, lineNumber) {
2686
2741
  if (!Number.isInteger(record.id)) {
2687
2742
  throw new Error(`${location} must include an integer id.`);
2688
2743
  }
2689
- if (typeof record.category !== "string") {
2690
- throw new Error(`${location} must include a string category.`);
2691
- }
2744
+ const category = normalizeCategory(record.category, filename);
2692
2745
  if (!Array.isArray(record.questions) || record.questions.some((question) => typeof question !== "string")) {
2693
2746
  throw new Error(`${location} must include a questions array of strings.`);
2694
2747
  }
@@ -2701,11 +2754,23 @@ function parseTask(line, filename, lineNumber) {
2701
2754
  }
2702
2755
  return {
2703
2756
  id: record.id,
2704
- category: record.category,
2757
+ category,
2705
2758
  questions: record.questions,
2706
2759
  answers: record.answers
2707
2760
  };
2708
2761
  }
2762
+ function normalizeCategory(value, filename) {
2763
+ if (typeof value === "string" && value.trim().length > 0) {
2764
+ return value;
2765
+ }
2766
+ const inferred = filename.replace(/\.jsonl$/i, "").trim();
2767
+ if (inferred.length > 0) {
2768
+ return inferred;
2769
+ }
2770
+ throw new Error(
2771
+ `MemoryArena dataset file ${filename} must include a string category or use a filename that can be inferred as the category.`
2772
+ );
2773
+ }
2709
2774
  function answerToString(answer) {
2710
2775
  if (typeof answer === "string") {
2711
2776
  return answer;
@@ -2753,7 +2818,7 @@ function isValidArenaAnswerObject(answer) {
2753
2818
  // src/benchmarks/published/longmemeval/runner.ts
2754
2819
  import { randomUUID as randomUUID4 } from "crypto";
2755
2820
  import { readFile as readFile6 } from "fs/promises";
2756
- import path6 from "path";
2821
+ import path7 from "path";
2757
2822
 
2758
2823
  // src/benchmarks/published/longmemeval/fixture.ts
2759
2824
  var LONG_MEM_EVAL_SMOKE_FIXTURE = [
@@ -2928,7 +2993,7 @@ async function loadDataset4(mode, datasetDir, limit) {
2928
2993
  "longmemeval.json"
2929
2994
  ]) {
2930
2995
  try {
2931
- const raw = await readFile6(path6.join(datasetDir, filename), "utf8");
2996
+ const raw = await readFile6(path7.join(datasetDir, filename), "utf8");
2932
2997
  const parsed = JSON.parse(raw);
2933
2998
  return ensureDatasetItems(limit ? parsed.slice(0, limit) : parsed);
2934
2999
  } catch (error) {
@@ -2957,7 +3022,7 @@ async function loadDataset4(mode, datasetDir, limit) {
2957
3022
  // src/benchmarks/published/locomo/runner.ts
2958
3023
  import { randomUUID as randomUUID5 } from "crypto";
2959
3024
  import { readFile as readFile7 } from "fs/promises";
2960
- import path7 from "path";
3025
+ import path8 from "path";
2961
3026
 
2962
3027
  // src/benchmarks/published/locomo/fixture.ts
2963
3028
  var LOCOMO_SMOKE_FIXTURE = [
@@ -3186,7 +3251,7 @@ async function loadDataset5(mode, datasetDir, limit) {
3186
3251
  const datasetErrors = [];
3187
3252
  for (const filename of ["locomo10.json", "locomo.json"]) {
3188
3253
  try {
3189
- const raw = await readFile7(path7.join(datasetDir, filename), "utf8");
3254
+ const raw = await readFile7(path8.join(datasetDir, filename), "utf8");
3190
3255
  const parsed = parseDataset2(raw, filename);
3191
3256
  return ensureDatasetConversations(
3192
3257
  applyLimit4(parsed, normalizedLimit)
@@ -3231,29 +3296,69 @@ function parseConversation(entry, filename, index) {
3231
3296
  if (!record.conversation || typeof record.conversation !== "object" || Array.isArray(record.conversation)) {
3232
3297
  throw new Error(`${location} must include a conversation object.`);
3233
3298
  }
3234
- if (!isValidQaArray(record.qa)) {
3235
- throw new Error(
3236
- `${location} must include a qa array with question/answer/evidence/category fields.`
3237
- );
3238
- }
3299
+ const qa = normalizeQaArray(record.qa, location);
3239
3300
  return {
3240
3301
  sample_id: record.sample_id,
3241
3302
  conversation: record.conversation,
3242
- qa: record.qa,
3303
+ qa,
3243
3304
  event_summary: record.event_summary,
3244
3305
  observation: record.observation,
3245
3306
  session_summary: record.session_summary
3246
3307
  };
3247
3308
  }
3248
- function isValidQaArray(value) {
3249
- return Array.isArray(value) && value.every(isValidQa);
3309
+ function normalizeQaArray(value, location) {
3310
+ if (!Array.isArray(value)) {
3311
+ throw new Error(
3312
+ `${location} must include a qa array with question/answer/evidence/category fields.`
3313
+ );
3314
+ }
3315
+ return value.map(
3316
+ (entry, index) => normalizeQa(entry, `${location} qa[${index}]`)
3317
+ );
3250
3318
  }
3251
- function isValidQa(value) {
3319
+ function normalizeQa(value, location) {
3252
3320
  if (!value || typeof value !== "object" || Array.isArray(value)) {
3253
- return false;
3321
+ throw new Error(`${location} must be an object.`);
3254
3322
  }
3255
3323
  const record = value;
3256
- return typeof record.question === "string" && typeof record.answer === "string" && Number.isInteger(record.category) && Array.isArray(record.evidence) && record.evidence.every((item) => typeof item === "string");
3324
+ if (typeof record.question !== "string" || record.question.trim().length === 0) {
3325
+ throw new Error(`${location} must include a non-empty question string.`);
3326
+ }
3327
+ if (!Number.isInteger(record.category)) {
3328
+ throw new Error(`${location} must include an integer category.`);
3329
+ }
3330
+ if (!Array.isArray(record.evidence) || record.evidence.some((item) => typeof item !== "string")) {
3331
+ throw new Error(`${location} must include an evidence array of strings.`);
3332
+ }
3333
+ const answer = normalizeQaAnswer(record.answer, record.adversarial_answer, location);
3334
+ return {
3335
+ question: record.question,
3336
+ answer,
3337
+ evidence: record.evidence,
3338
+ category: record.category
3339
+ };
3340
+ }
3341
+ function normalizeQaAnswer(answer, adversarialAnswer, location) {
3342
+ const direct = normalizeScalarAnswer(answer);
3343
+ if (direct !== void 0) {
3344
+ return direct;
3345
+ }
3346
+ const adversarial = normalizeScalarAnswer(adversarialAnswer);
3347
+ if (adversarial !== void 0) {
3348
+ return adversarial;
3349
+ }
3350
+ throw new Error(
3351
+ `${location} must include a string or numeric answer, or an adversarial_answer fallback.`
3352
+ );
3353
+ }
3354
+ function normalizeScalarAnswer(value) {
3355
+ if (typeof value === "string" && value.trim().length > 0) {
3356
+ return value;
3357
+ }
3358
+ if (typeof value === "number" && Number.isFinite(value)) {
3359
+ return String(value);
3360
+ }
3361
+ return void 0;
3257
3362
  }
3258
3363
  function normalizeLimit4(limit) {
3259
3364
  if (limit === void 0) {
@@ -3276,7 +3381,7 @@ function applyLimit4(items, limit) {
3276
3381
  // src/benchmarks/published/beam/runner.ts
3277
3382
  import { randomUUID as randomUUID6 } from "crypto";
3278
3383
  import { readFile as readFile8, readdir as readdir2 } from "fs/promises";
3279
- import path8 from "path";
3384
+ import path9 from "path";
3280
3385
 
3281
3386
  // src/benchmarks/published/beam/fixture.ts
3282
3387
  var BEAM_SMOKE_FIXTURE = [
@@ -3574,7 +3679,7 @@ async function loadDataset6(mode, datasetDir, limit) {
3574
3679
  if (remainingLimit === 0) {
3575
3680
  break;
3576
3681
  }
3577
- const raw = await readFile8(path8.join(datasetDir, filename), "utf8");
3682
+ const raw = await readFile8(path9.join(datasetDir, filename), "utf8");
3578
3683
  const scale = inferScaleFromFilename(filename);
3579
3684
  const conversations = filename.endsWith(".jsonl") ? parseJsonlDataset(raw, filename) : parseJsonDataset(raw, filename);
3580
3685
  const limitedConversations = applyLimit5(conversations, remainingLimit);
@@ -4018,7 +4123,7 @@ var StructuredLiteralParser = class {
4018
4123
  // src/benchmarks/published/personamem/runner.ts
4019
4124
  import { randomUUID as randomUUID7 } from "crypto";
4020
4125
  import { readFile as readFile9, realpath } from "fs/promises";
4021
- import path9 from "path";
4126
+ import path10 from "path";
4022
4127
 
4023
4128
  // src/benchmarks/published/personamem/fixture.ts
4024
4129
  var PERSONAMEM_SMOKE_FIXTURE = [
@@ -4220,7 +4325,7 @@ async function loadDataset7(mode, datasetDir, limit) {
4220
4325
  if (datasetDir) {
4221
4326
  const datasetErrors = [];
4222
4327
  for (const relativePath of DATASET_FILE_CANDIDATES) {
4223
- const datasetPath = path9.join(datasetDir, relativePath);
4328
+ const datasetPath = path10.join(datasetDir, relativePath);
4224
4329
  try {
4225
4330
  const raw = await readFile9(datasetPath, "utf8");
4226
4331
  const rows = parseCsvRows(raw, relativePath, normalizedLimit);
@@ -4403,12 +4508,12 @@ function parseCsv(raw, limit) {
4403
4508
  return rows;
4404
4509
  }
4405
4510
  async function resolveDatasetFilePath(datasetRoot, relativePath) {
4406
- const rootPath = path9.resolve(datasetRoot);
4511
+ const rootPath = path10.resolve(datasetRoot);
4407
4512
  const rootRealPath = await realpath(rootPath);
4408
- const candidatePath = path9.resolve(rootPath, relativePath);
4513
+ const candidatePath = path10.resolve(rootPath, relativePath);
4409
4514
  const candidateRealPath = await realpath(candidatePath);
4410
- const relativeToRoot = path9.relative(rootRealPath, candidateRealPath);
4411
- if (relativeToRoot.startsWith("..") || path9.isAbsolute(relativeToRoot)) {
4515
+ const relativeToRoot = path10.relative(rootRealPath, candidateRealPath);
4516
+ if (relativeToRoot.startsWith("..") || path10.isAbsolute(relativeToRoot)) {
4412
4517
  throw new Error(
4413
4518
  `PersonaMem-v2 dataset file reference "${relativePath}" must stay within datasetDir.`
4414
4519
  );
@@ -4528,7 +4633,7 @@ function applyLimit6(items, limit) {
4528
4633
  // src/benchmarks/published/membench/runner.ts
4529
4634
  import { randomUUID as randomUUID8 } from "crypto";
4530
4635
  import { readFile as readFile10, readdir as readdir3 } from "fs/promises";
4531
- import path10 from "path";
4636
+ import path11 from "path";
4532
4637
 
4533
4638
  // src/benchmarks/published/membench/fixture.ts
4534
4639
  var MEMBENCH_SMOKE_FIXTURE = [
@@ -4714,7 +4819,7 @@ async function loadDataset8(mode, datasetDir, limit) {
4714
4819
  break;
4715
4820
  }
4716
4821
  try {
4717
- const raw = await readFile10(path10.join(datasetDir, filename), "utf8");
4822
+ const raw = await readFile10(path11.join(datasetDir, filename), "utf8");
4718
4823
  const parsed = filename.endsWith(".jsonl") ? parseJsonlDataset2(raw, filename) : parseJsonDataset2(raw, filename);
4719
4824
  const limitedCases = applyLimit7(parsed, remainingLimit);
4720
4825
  cases.push(...limitedCases);
@@ -5084,7 +5189,7 @@ function isPlainObject2(value) {
5084
5189
  // src/benchmarks/published/memoryagentbench/runner.ts
5085
5190
  import { randomUUID as randomUUID9 } from "crypto";
5086
5191
  import { readFile as readFile11 } from "fs/promises";
5087
- import path11 from "path";
5192
+ import path12 from "path";
5088
5193
 
5089
5194
  // src/benchmarks/published/memoryagentbench/fixture.ts
5090
5195
  var MEMORY_AGENT_BENCH_SMOKE_FIXTURE = [
@@ -5354,7 +5459,7 @@ async function loadDataset9(mode, datasetDir, limit) {
5354
5459
  const datasetErrors = [];
5355
5460
  for (const filename of DATASET_BUNDLE_CANDIDATES) {
5356
5461
  const parsed = await tryReadDatasetFile(
5357
- path11.join(datasetDir, filename),
5462
+ path12.join(datasetDir, filename),
5358
5463
  filename,
5359
5464
  datasetErrors
5360
5465
  );
@@ -5371,7 +5476,7 @@ async function loadDataset9(mode, datasetDir, limit) {
5371
5476
  let splitData;
5372
5477
  for (const filename of splitConfig.candidates) {
5373
5478
  try {
5374
- splitData = await readDatasetFile(path11.join(datasetDir, filename), filename);
5479
+ splitData = await readDatasetFile(path12.join(datasetDir, filename), filename);
5375
5480
  break;
5376
5481
  } catch (error) {
5377
5482
  datasetErrors.push(
@@ -5939,7 +6044,7 @@ function loadCases(mode, limit) {
5939
6044
  // src/benchmarks/remnic/extraction-judge-calibration/runner.ts
5940
6045
  import { randomUUID as randomUUID11 } from "crypto";
5941
6046
  import os from "os";
5942
- import path12 from "path";
6047
+ import path13 from "path";
5943
6048
  import { clearVerdictCache, judgeFactDurability, parseConfig as parseConfig2 } from "@remnic/core";
5944
6049
 
5945
6050
  // src/benchmarks/remnic/extraction-judge-calibration/fixture.ts
@@ -6045,8 +6150,8 @@ var extractionJudgeCalibrationDefinition = {
6045
6150
  async function runExtractionJudgeCalibrationBenchmark(options) {
6046
6151
  const cases = loadCases2(options.mode, options.limit);
6047
6152
  const config = parseConfig2({
6048
- memoryDir: path12.join(os.tmpdir(), "remnic-bench-extraction-judge"),
6049
- workspaceDir: path12.join(os.tmpdir(), "remnic-bench-extraction-judge-workspace"),
6153
+ memoryDir: path13.join(os.tmpdir(), "remnic-bench-extraction-judge"),
6154
+ workspaceDir: path13.join(os.tmpdir(), "remnic-bench-extraction-judge-workspace"),
6050
6155
  openaiApiKey: "bench-test-key",
6051
6156
  extractionJudgeEnabled: true,
6052
6157
  extractionJudgeBatchSize: 4,
@@ -6584,7 +6689,7 @@ function constantAggregate2(value) {
6584
6689
 
6585
6690
  // src/benchmarks/remnic/entity-consolidation/runner.ts
6586
6691
  import os2 from "os";
6587
- import path13 from "path";
6692
+ import path14 from "path";
6588
6693
  import { randomUUID as randomUUID13 } from "crypto";
6589
6694
  import { mkdtemp as mkdtemp2, rm as rm2 } from "fs/promises";
6590
6695
  import { StorageManager } from "@remnic/core";
@@ -6747,7 +6852,7 @@ function loadCases4(mode, limit) {
6747
6852
  return limited;
6748
6853
  }
6749
6854
  async function executeCase(sample) {
6750
- const tmpDir = await mkdtemp2(path13.join(os2.tmpdir(), "remnic-bench-entity-consolidation-"));
6855
+ const tmpDir = await mkdtemp2(path14.join(os2.tmpdir(), "remnic-bench-entity-consolidation-"));
6751
6856
  try {
6752
6857
  const storage = new StorageManager(tmpDir);
6753
6858
  await storage.ensureDirectories();
@@ -6926,7 +7031,7 @@ function parseNonNegativeInt(rawValue) {
6926
7031
 
6927
7032
  // src/benchmarks/remnic/page-versioning/runner.ts
6928
7033
  import os3 from "os";
6929
- import path14 from "path";
7034
+ import path15 from "path";
6930
7035
  import { randomUUID as randomUUID14 } from "crypto";
6931
7036
  import { mkdtemp as mkdtemp3, mkdir as mkdir3, readFile as readFile12, rm as rm3, writeFile as writeFile2 } from "fs/promises";
6932
7037
  import {
@@ -7081,10 +7186,10 @@ function loadCases5(mode, limit) {
7081
7186
  return limited;
7082
7187
  }
7083
7188
  async function executeCase2(sample) {
7084
- const tmpDir = await mkdtemp3(path14.join(os3.tmpdir(), "remnic-bench-page-versioning-"));
7189
+ const tmpDir = await mkdtemp3(path15.join(os3.tmpdir(), "remnic-bench-page-versioning-"));
7085
7190
  try {
7086
- const factsDir = path14.join(tmpDir, "facts");
7087
- const pagePath = path14.join(factsDir, `${sample.id}.md`);
7191
+ const factsDir = path15.join(tmpDir, "facts");
7192
+ const pagePath = path15.join(factsDir, `${sample.id}.md`);
7088
7193
  await mkdir3(factsDir, { recursive: true });
7089
7194
  const config = versioningConfig();
7090
7195
  switch (sample.scenario) {
@@ -8328,7 +8433,7 @@ async function runRetrievalDirectAnswerBenchmark(options) {
8328
8433
  import { randomUUID as randomUUID18 } from "crypto";
8329
8434
  import { mkdtemp as mkdtemp4, rm as rm4 } from "fs/promises";
8330
8435
  import os4 from "os";
8331
- import path15 from "path";
8436
+ import path16 from "path";
8332
8437
  import {
8333
8438
  StorageManager as StorageManager2,
8334
8439
  parseConfig as parseConfig3,
@@ -8449,7 +8554,7 @@ async function runProceduralRecallBenchmark(options) {
8449
8554
  const e2eCases = sliceWithBudget(e2eSource, remainingBudget).picked;
8450
8555
  for (const sample of e2eCases) {
8451
8556
  const startedAt = performance.now();
8452
- const dir = await mkdtemp4(path15.join(os4.tmpdir(), "remnic-bench-procedural-recall-"));
8557
+ const dir = await mkdtemp4(path16.join(os4.tmpdir(), "remnic-bench-procedural-recall-"));
8453
8558
  let section = null;
8454
8559
  try {
8455
8560
  const storage = new StorageManager2(dir);
@@ -8464,7 +8569,7 @@ ${body}`,
8464
8569
  );
8465
8570
  const config = parseConfig3({
8466
8571
  memoryDir: dir,
8467
- workspaceDir: path15.join(dir, "ws"),
8572
+ workspaceDir: path16.join(dir, "ws"),
8468
8573
  openaiApiKey: "bench-key",
8469
8574
  procedural: {
8470
8575
  enabled: sample.proceduralEnabled !== false,
@@ -8535,7 +8640,7 @@ ${body}`,
8535
8640
  import { randomUUID as randomUUID19 } from "crypto";
8536
8641
  import { mkdtemp as mkdtemp5, writeFile as writeFile3, rm as rm5, mkdir as mkdir4 } from "fs/promises";
8537
8642
  import { tmpdir as tmpdir2 } from "os";
8538
- import path16 from "path";
8643
+ import path17 from "path";
8539
8644
 
8540
8645
  // src/ingestion-scorer.ts
8541
8646
  function normalize(value) {
@@ -9100,12 +9205,12 @@ async function runIngestionEntityRecallBenchmark(options) {
9100
9205
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
9101
9206
  }
9102
9207
  const fixture = emailFixture.generate();
9103
- const fixtureDir = await mkdtemp5(path16.join(tmpdir2(), "bench-email-"));
9208
+ const fixtureDir = await mkdtemp5(path17.join(tmpdir2(), "bench-email-"));
9104
9209
  try {
9105
9210
  await options.ingestionAdapter.reset();
9106
9211
  for (const file of fixture.files) {
9107
- const filePath = path16.join(fixtureDir, file.relativePath);
9108
- await mkdir4(path16.dirname(filePath), { recursive: true });
9212
+ const filePath = path17.join(fixtureDir, file.relativePath);
9213
+ await mkdir4(path17.dirname(filePath), { recursive: true });
9109
9214
  await writeFile3(filePath, file.content, "utf8");
9110
9215
  }
9111
9216
  const { result: ingestionLog, durationMs } = await timed(
@@ -9181,7 +9286,7 @@ async function runIngestionEntityRecallBenchmark(options) {
9181
9286
  import { randomUUID as randomUUID20 } from "crypto";
9182
9287
  import { mkdtemp as mkdtemp6, writeFile as writeFile4, rm as rm6, mkdir as mkdir5 } from "fs/promises";
9183
9288
  import { tmpdir as tmpdir3 } from "os";
9184
- import path17 from "path";
9289
+ import path18 from "path";
9185
9290
  var ingestionSchemaCompletenessDefinition = {
9186
9291
  id: "ingestion-schema-completeness",
9187
9292
  title: "Ingestion: Schema Completeness",
@@ -9200,12 +9305,12 @@ async function runIngestionSchemaCompletenessBenchmark(options) {
9200
9305
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
9201
9306
  }
9202
9307
  const fixture = emailFixture.generate();
9203
- const fixtureDir = await mkdtemp6(path17.join(tmpdir3(), "bench-email-"));
9308
+ const fixtureDir = await mkdtemp6(path18.join(tmpdir3(), "bench-email-"));
9204
9309
  try {
9205
9310
  await options.ingestionAdapter.reset();
9206
9311
  for (const file of fixture.files) {
9207
- const filePath = path17.join(fixtureDir, file.relativePath);
9208
- await mkdir5(path17.dirname(filePath), { recursive: true });
9312
+ const filePath = path18.join(fixtureDir, file.relativePath);
9313
+ await mkdir5(path18.dirname(filePath), { recursive: true });
9209
9314
  await writeFile4(filePath, file.content, "utf8");
9210
9315
  }
9211
9316
  const { result: ingestionLog, durationMs } = await timed(
@@ -9291,7 +9396,7 @@ async function runIngestionSchemaCompletenessBenchmark(options) {
9291
9396
  import { randomUUID as randomUUID21 } from "crypto";
9292
9397
  import { mkdtemp as mkdtemp7, writeFile as writeFile5, rm as rm7, mkdir as mkdir6 } from "fs/promises";
9293
9398
  import { tmpdir as tmpdir4 } from "os";
9294
- import path18 from "path";
9399
+ import path19 from "path";
9295
9400
  var ingestionBacklinkF1Definition = {
9296
9401
  id: "ingestion-backlink-f1",
9297
9402
  title: "Ingestion: Backlink F1",
@@ -9310,12 +9415,12 @@ async function runIngestionBacklinkF1Benchmark(options) {
9310
9415
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
9311
9416
  }
9312
9417
  const fixture = emailFixture.generate();
9313
- const fixtureDir = await mkdtemp7(path18.join(tmpdir4(), "bench-email-"));
9418
+ const fixtureDir = await mkdtemp7(path19.join(tmpdir4(), "bench-email-"));
9314
9419
  try {
9315
9420
  await options.ingestionAdapter.reset();
9316
9421
  for (const file of fixture.files) {
9317
- const filePath = path18.join(fixtureDir, file.relativePath);
9318
- await mkdir6(path18.dirname(filePath), { recursive: true });
9422
+ const filePath = path19.join(fixtureDir, file.relativePath);
9423
+ await mkdir6(path19.dirname(filePath), { recursive: true });
9319
9424
  await writeFile5(filePath, file.content, "utf8");
9320
9425
  }
9321
9426
  const { result: ingestionLog, durationMs } = await timed(
@@ -9392,7 +9497,7 @@ async function runIngestionBacklinkF1Benchmark(options) {
9392
9497
  import { randomUUID as randomUUID22 } from "crypto";
9393
9498
  import { mkdtemp as mkdtemp8, writeFile as writeFile6, rm as rm8, mkdir as mkdir7 } from "fs/promises";
9394
9499
  import { tmpdir as tmpdir5 } from "os";
9395
- import path19 from "path";
9500
+ import path20 from "path";
9396
9501
  var INGESTION_SETUP_FRICTION_LOWER_IS_BETTER = /* @__PURE__ */ new Set(["setup_friction", "commands_count", "prompts_count", "errors_count"]);
9397
9502
  var ingestionSetupFrictionDefinition = {
9398
9503
  id: "ingestion-setup-friction",
@@ -9409,12 +9514,12 @@ var ingestionSetupFrictionDefinition = {
9409
9514
  };
9410
9515
  async function runIngestionSetupFrictionBenchmark(options) {
9411
9516
  const fixture = emailFixture.generate();
9412
- const fixtureDir = await mkdtemp8(path19.join(tmpdir5(), "bench-friction-"));
9517
+ const fixtureDir = await mkdtemp8(path20.join(tmpdir5(), "bench-friction-"));
9413
9518
  try {
9414
9519
  await options.ingestionAdapter.reset();
9415
9520
  for (const file of fixture.files) {
9416
- const filePath = path19.join(fixtureDir, file.relativePath);
9417
- await mkdir7(path19.dirname(filePath), { recursive: true });
9521
+ const filePath = path20.join(fixtureDir, file.relativePath);
9522
+ await mkdir7(path20.dirname(filePath), { recursive: true });
9418
9523
  await writeFile6(filePath, file.content, "utf8");
9419
9524
  }
9420
9525
  const { result: ingestionLog, durationMs } = await timed(
@@ -9494,7 +9599,7 @@ async function runIngestionSetupFrictionBenchmark(options) {
9494
9599
  import { randomUUID as randomUUID23 } from "crypto";
9495
9600
  import { mkdtemp as mkdtemp9, writeFile as writeFile7, rm as rm9, mkdir as mkdir8 } from "fs/promises";
9496
9601
  import { tmpdir as tmpdir6 } from "os";
9497
- import path20 from "path";
9602
+ import path21 from "path";
9498
9603
  var ingestionCitationAccuracyDefinition = {
9499
9604
  id: "ingestion-citation-accuracy",
9500
9605
  title: "Ingestion: Citation Accuracy",
@@ -9535,9 +9640,9 @@ function extractClaims(pages) {
9535
9640
  function resolveCitedSources(seeAlso, pageRef, sourceContentMap) {
9536
9641
  const resolved = [];
9537
9642
  for (const ref of seeAlso) {
9538
- const refBase = path20.basename(ref).toLowerCase();
9643
+ const refBase = path21.basename(ref).toLowerCase();
9539
9644
  for (const [relativePath, content] of sourceContentMap) {
9540
- if (relativePath === ref || relativePath.endsWith(ref) || path20.basename(relativePath).toLowerCase() === refBase) {
9645
+ if (relativePath === ref || relativePath.endsWith(ref) || path21.basename(relativePath).toLowerCase() === refBase) {
9541
9646
  resolved.push(content);
9542
9647
  break;
9543
9648
  }
@@ -9546,9 +9651,9 @@ function resolveCitedSources(seeAlso, pageRef, sourceContentMap) {
9546
9651
  if (resolved.length > 0) {
9547
9652
  return resolved.join("\n\n---\n\n");
9548
9653
  }
9549
- const pageBase = path20.basename(pageRef).toLowerCase();
9654
+ const pageBase = path21.basename(pageRef).toLowerCase();
9550
9655
  for (const [relativePath, content] of sourceContentMap) {
9551
- if (path20.basename(relativePath).toLowerCase() === pageBase) {
9656
+ if (path21.basename(relativePath).toLowerCase() === pageBase) {
9552
9657
  return content;
9553
9658
  }
9554
9659
  }
@@ -9559,12 +9664,12 @@ async function runIngestionCitationAccuracyBenchmark(options) {
9559
9664
  throw new Error("ingestionAdapter is required for ingestion benchmarks");
9560
9665
  }
9561
9666
  const fixture = emailFixture.generate();
9562
- const fixtureDir = await mkdtemp9(path20.join(tmpdir6(), "bench-citation-"));
9667
+ const fixtureDir = await mkdtemp9(path21.join(tmpdir6(), "bench-citation-"));
9563
9668
  try {
9564
9669
  await options.ingestionAdapter.reset();
9565
9670
  for (const file of fixture.files) {
9566
- const filePath = path20.join(fixtureDir, file.relativePath);
9567
- await mkdir8(path20.dirname(filePath), { recursive: true });
9671
+ const filePath = path21.join(fixtureDir, file.relativePath);
9672
+ await mkdir8(path21.dirname(filePath), { recursive: true });
9568
9673
  await writeFile7(filePath, file.content, "utf8");
9569
9674
  }
9570
9675
  const benchmarkStart = performance.now();
@@ -9757,7 +9862,7 @@ var ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS = ASSISTANT_MORNING_BRIEF_SCENARIOS.
9757
9862
 
9758
9863
  // src/benchmarks/remnic/_assistant-common/runner.ts
9759
9864
  import { randomUUID as randomUUID24 } from "crypto";
9760
- import path22 from "path";
9865
+ import path23 from "path";
9761
9866
 
9762
9867
  // src/run-seeds.ts
9763
9868
  function buildBenchmarkRunSeeds(runCount, baseSeed) {
@@ -9845,7 +9950,7 @@ function pairedDeltaConfidenceInterval(candidateValues, baselineValues, options
9845
9950
  // src/judges/sealed-rubric.ts
9846
9951
  import { createHash as createHash2 } from "crypto";
9847
9952
  import { appendFileSync, mkdirSync } from "fs";
9848
- import path21 from "path";
9953
+ import path22 from "path";
9849
9954
 
9850
9955
  // src/judges/sealed-prompts/assistant-rubric-v1.ts
9851
9956
  var ASSISTANT_RUBRIC_V1 = `# Assistant rubric v1 (sealed)
@@ -10095,7 +10200,7 @@ function createSpotCheckFileLogger(options) {
10095
10200
  return { log() {
10096
10201
  } };
10097
10202
  }
10098
- const logPath = path21.join(directory, `${runId}.jsonl`);
10203
+ const logPath = path22.join(directory, `${runId}.jsonl`);
10099
10204
  let written = 0;
10100
10205
  let warnedOnWriteFailure = false;
10101
10206
  const cap = typeof sampleSize === "number" && sampleSize > 0 ? sampleSize : 5;
@@ -10184,7 +10289,7 @@ async function runAssistantBenchmark(definition, scenarios, resolved, runnerOpti
10184
10289
  const runId = buildRunId(definition.id);
10185
10290
  const spotCheckLogger = createSpotCheckFileLogger({
10186
10291
  runId,
10187
- directory: runnerOptions.spotCheckDir ?? path22.join(process.cwd(), "benchmarks", "results", "spot-checks"),
10292
+ directory: runnerOptions.spotCheckDir ?? path23.join(process.cwd(), "benchmarks", "results", "spot-checks"),
10188
10293
  sampleRate: 0.35,
10189
10294
  sampleSize: 5
10190
10295
  });
@@ -10911,8 +11016,8 @@ function finalizeBenchmarkResultConfig(result, options) {
10911
11016
  }
10912
11017
 
10913
11018
  // src/benchmark.ts
10914
- var DEFAULT_BASELINE_PATH = path23.join(process.cwd(), "benchmarks", "baseline.json");
10915
- var DEFAULT_REPORT_PATH = path23.join(process.cwd(), "benchmarks", "report.json");
11019
+ var DEFAULT_BASELINE_PATH = path24.join(process.cwd(), "benchmarks", "baseline.json");
11020
+ var DEFAULT_REPORT_PATH = path24.join(process.cwd(), "benchmarks", "report.json");
10916
11021
  var BASELINE_VERSION = 1;
10917
11022
  var DEFAULT_TOLERANCE = 10;
10918
11023
  var DEFAULT_FULL_RUN_COUNT = 5;
@@ -11005,7 +11110,7 @@ function loadBaseline(baselinePath) {
11005
11110
  }
11006
11111
  }
11007
11112
  function saveBaseline(baselinePath, baseline) {
11008
- fs.mkdirSync(path23.dirname(baselinePath), { recursive: true });
11113
+ fs.mkdirSync(path24.dirname(baselinePath), { recursive: true });
11009
11114
  fs.writeFileSync(baselinePath, `${JSON.stringify(baseline, null, 2)}
11010
11115
  `);
11011
11116
  }
@@ -11216,7 +11321,7 @@ function generateReport(results, reportPath) {
11216
11321
  totalDurationMs: results.reduce((sum, result) => sum + result.totalDurationMs, 0)
11217
11322
  };
11218
11323
  if (reportPath) {
11219
- fs.mkdirSync(path23.dirname(reportPath), { recursive: true });
11324
+ fs.mkdirSync(path24.dirname(reportPath), { recursive: true });
11220
11325
  fs.writeFileSync(reportPath, `${JSON.stringify(report, null, 2)}
11221
11326
  `);
11222
11327
  }
@@ -11342,7 +11447,7 @@ function getBenchmarkLowerIsBetter(benchmarkId) {
11342
11447
  import { mkdir as mkdir9, readdir as readdir4, readFile as readFile13, unlink, writeFile as writeFile8 } from "fs/promises";
11343
11448
  import fs2 from "fs";
11344
11449
  import os5 from "os";
11345
- import path24 from "path";
11450
+ import path25 from "path";
11346
11451
 
11347
11452
  // src/integrity/contamination.ts
11348
11453
  var EMPTY_CONTAMINATION_MANIFEST = {
@@ -11405,13 +11510,13 @@ function mergeContaminationManifests(...manifests) {
11405
11510
  var BASELINE_NAME_PATTERN = /^[A-Za-z0-9_-]+$/;
11406
11511
  function defaultBenchmarkBaselineDir() {
11407
11512
  const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? os5.homedir();
11408
- return path24.join(homeDir, ".remnic", "bench", "baselines");
11513
+ return path25.join(homeDir, ".remnic", "bench", "baselines");
11409
11514
  }
11410
11515
  function defaultBenchmarkPublishPath(target) {
11411
11516
  const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? os5.homedir();
11412
11517
  switch (target) {
11413
11518
  case "remnic-ai":
11414
- return path24.join(homeDir, ".remnic", "published", "benchmarks.json");
11519
+ return path25.join(homeDir, ".remnic", "published", "benchmarks.json");
11415
11520
  }
11416
11521
  }
11417
11522
  function compareResultSummaries(left, right) {
@@ -11508,7 +11613,7 @@ async function listBenchmarkResults(outputDir) {
11508
11613
  if (!entry.isFile() || !entry.name.endsWith(".json")) {
11509
11614
  continue;
11510
11615
  }
11511
- const filePath = path24.join(outputDir, entry.name);
11616
+ const filePath = path25.join(outputDir, entry.name);
11512
11617
  try {
11513
11618
  const result = await loadBenchmarkResult(filePath);
11514
11619
  results.push(toSummary(result, filePath));
@@ -11522,7 +11627,7 @@ async function saveBenchmarkBaseline(baselineDir, name, result, source) {
11522
11627
  assertValidBaselineName(name);
11523
11628
  assertUsableBaselineDir(baselineDir);
11524
11629
  await mkdir9(baselineDir, { recursive: true });
11525
- const filePath = path24.join(baselineDir, `${name}.json`);
11630
+ const filePath = path25.join(baselineDir, `${name}.json`);
11526
11631
  const payload = {
11527
11632
  name,
11528
11633
  savedAt: (/* @__PURE__ */ new Date()).toISOString(),
@@ -11552,7 +11657,7 @@ async function listBenchmarkBaselines(baselineDir) {
11552
11657
  if (!entry.isFile() || !entry.name.endsWith(".json")) {
11553
11658
  continue;
11554
11659
  }
11555
- const filePath = path24.join(baselineDir, entry.name);
11660
+ const filePath = path25.join(baselineDir, entry.name);
11556
11661
  try {
11557
11662
  const baseline = await loadBenchmarkBaseline(filePath);
11558
11663
  baselines.push(toBaselineSummary(baseline, filePath));
@@ -11569,7 +11674,7 @@ async function resolveBenchmarkResultReference(outputDir, reference) {
11569
11674
  return exactIdMatch;
11570
11675
  }
11571
11676
  const basenameMatch = summaries.find(
11572
- (summary) => path24.basename(summary.path) === reference
11677
+ (summary) => path25.basename(summary.path) === reference
11573
11678
  );
11574
11679
  if (basenameMatch) {
11575
11680
  return basenameMatch;
@@ -11585,7 +11690,7 @@ async function resolveBenchmarkResultReference(outputDir, reference) {
11585
11690
  return void 0;
11586
11691
  }
11587
11692
  function looksLikeFilesystemPath(reference) {
11588
- return path24.isAbsolute(reference) || reference.includes("/") || reference.includes(path24.sep) || reference.endsWith(".json");
11693
+ return path25.isAbsolute(reference) || reference.includes("/") || reference.includes(path25.sep) || reference.endsWith(".json");
11589
11694
  }
11590
11695
  async function deleteBenchmarkResults(outputDir, references) {
11591
11696
  const summaries = await listBenchmarkResults(outputDir);
@@ -11593,9 +11698,9 @@ async function deleteBenchmarkResults(outputDir, references) {
11593
11698
  const missing = [];
11594
11699
  const seenPaths = /* @__PURE__ */ new Set();
11595
11700
  for (const reference of references) {
11596
- let summary = summaries.find((entry) => entry.id === reference) ?? summaries.find((entry) => path24.basename(entry.path) === reference);
11701
+ let summary = summaries.find((entry) => entry.id === reference) ?? summaries.find((entry) => path25.basename(entry.path) === reference);
11597
11702
  if (!summary && looksLikeFilesystemPath(reference)) {
11598
- const canonicalRef = path24.resolve(reference);
11703
+ const canonicalRef = path25.resolve(reference);
11599
11704
  if (seenPaths.has(canonicalRef)) {
11600
11705
  continue;
11601
11706
  }
@@ -11612,7 +11717,7 @@ async function deleteBenchmarkResults(outputDir, references) {
11612
11717
  missing.push(reference);
11613
11718
  continue;
11614
11719
  }
11615
- const canonicalPath = path24.resolve(summary.path);
11720
+ const canonicalPath = path25.resolve(summary.path);
11616
11721
  if (seenPaths.has(canonicalPath)) {
11617
11722
  continue;
11618
11723
  }
@@ -11738,7 +11843,7 @@ async function buildBenchmarkPublishFeed(outputDir, target, options = {}) {
11738
11843
  };
11739
11844
  }
11740
11845
  async function writeBenchmarkPublishFeed(feed, outputPath) {
11741
- await mkdir9(path24.dirname(outputPath), { recursive: true });
11846
+ await mkdir9(path25.dirname(outputPath), { recursive: true });
11742
11847
  await writeFile8(outputPath, `${JSON.stringify(feed, null, 2)}
11743
11848
  `);
11744
11849
  return outputPath;
@@ -12293,7 +12398,7 @@ function formatError(error) {
12293
12398
 
12294
12399
  // src/benchmarks/custom/runner.ts
12295
12400
  import { randomUUID as randomUUID25 } from "crypto";
12296
- import path25 from "path";
12401
+ import path26 from "path";
12297
12402
  async function runCustomBenchmarkFile(filePath, options) {
12298
12403
  const spec = await loadCustomBenchmarkFile(filePath);
12299
12404
  const benchmark = createCustomBenchmarkDefinition(spec, filePath);
@@ -12443,7 +12548,7 @@ async function scoreTask(scoring, options, question, actual, expected) {
12443
12548
  }
12444
12549
  }
12445
12550
  function createCustomBenchmarkDefinition(benchmark, filePath) {
12446
- const id = `custom:${slugify(path25.basename(filePath, path25.extname(filePath)) || benchmark.name)}`;
12551
+ const id = `custom:${slugify(path26.basename(filePath, path26.extname(filePath)) || benchmark.name)}`;
12447
12552
  return {
12448
12553
  id,
12449
12554
  title: benchmark.name,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@remnic/bench",
3
- "version": "1.0.0",
3
+ "version": "1.0.1",
4
4
  "description": "Retrieval latency ladder benchmarks + CI regression gates for @remnic/core",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",