@remnic/bench 1.0.1 → 9.3.515

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -44,6 +44,8 @@ interface ExtractedPage {
44
44
  frontmatter: Record<string, unknown>;
45
45
  hasExecSummary: boolean;
46
46
  hasTimeline: boolean;
47
+ /** Source corpus file references that support this page's generated claims. */
48
+ sourceRefs?: string[];
47
49
  seeAlso: string[];
48
50
  content: string;
49
51
  }
@@ -72,6 +74,8 @@ declare const REQUIRED_FRONTMATTER_FIELDS: readonly ["title", "type", "state", "
72
74
  interface Message {
73
75
  role: "user" | "assistant" | "system";
74
76
  content: string;
77
+ /** Optional source timestamp for benchmarks with historical query times. */
78
+ timestamp?: string;
75
79
  }
76
80
  interface SearchResult {
77
81
  turnIndex: number;
@@ -84,6 +88,7 @@ interface MemoryStats {
84
88
  totalMessages: number;
85
89
  totalSummaryNodes: number;
86
90
  maxDepth: number;
91
+ maxTurnIndex?: number;
87
92
  }
88
93
  interface BenchResponse {
89
94
  text: string;
@@ -94,8 +99,11 @@ interface BenchResponse {
94
99
  latencyMs: number;
95
100
  model: string;
96
101
  }
102
+ interface BenchPhaseControl {
103
+ signal?: AbortSignal;
104
+ }
97
105
  interface BenchResponder {
98
- respond(question: string, recalledText: string): Promise<BenchResponse>;
106
+ respond(question: string, recalledText: string, control?: BenchPhaseControl): Promise<BenchResponse>;
99
107
  }
100
108
  interface BenchJudgeResult {
101
109
  score: number;
@@ -107,19 +115,32 @@ interface BenchJudgeResult {
107
115
  model?: string;
108
116
  }
109
117
  interface BenchJudge {
110
- score(question: string, predicted: string, expected: string): Promise<number>;
111
- scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
118
+ score(question: string, predicted: string, expected: string, control?: BenchPhaseControl): Promise<number>;
119
+ scoreWithMetrics?(question: string, predicted: string, expected: string, control?: BenchPhaseControl): Promise<BenchJudgeResult>;
120
+ /**
121
+ * Run a benchmark-supplied yes/no judging prompt directly and return a
122
+ * normalized 0/1 score. Published benchmarks such as LongMemEval define
123
+ * their own evaluator prompt; routing those through the scalar generic
124
+ * judge prompt would change the metric contract.
125
+ */
126
+ scoreBinaryPrompt?(prompt: string, control?: BenchPhaseControl): Promise<BenchJudgeResult>;
112
127
  }
113
128
  interface BenchMemoryAdapter {
114
- store(sessionId: string, messages: Message[]): Promise<void>;
115
- recall(sessionId: string, query: string, budgetChars?: number): Promise<string>;
116
- search(query: string, limit: number, sessionId?: string): Promise<SearchResult[]>;
117
- reset(sessionId?: string): Promise<void>;
118
- getStats(sessionId?: string): Promise<MemoryStats>;
129
+ store(sessionId: string, messages: Message[], control?: BenchPhaseControl): Promise<void>;
130
+ recall(sessionId: string, query: string, budgetChars?: number, options?: BenchRecallOptions, control?: BenchPhaseControl): Promise<string>;
131
+ search(query: string, limit: number, sessionId?: string, control?: BenchPhaseControl): Promise<SearchResult[]>;
132
+ reset(sessionId?: string, control?: BenchPhaseControl): Promise<void>;
133
+ getStats(sessionId?: string, control?: BenchPhaseControl): Promise<MemoryStats>;
134
+ /** Wait for background summarization (e.g. LCM) to finish after store(). */
135
+ drain?(control?: BenchPhaseControl): Promise<void>;
119
136
  destroy(): Promise<void>;
120
137
  responder?: BenchResponder;
121
138
  judge?: BenchJudge;
122
139
  }
140
+ interface BenchRecallOptions {
141
+ /** Optional historical recall timestamp for benchmarks that expose query time. */
142
+ asOf?: string;
143
+ }
123
144
  type LlmJudge = BenchJudge;
124
145
  type MemorySystem = BenchMemoryAdapter;
125
146
 
@@ -189,11 +210,37 @@ type BenchmarkTier = "published" | "remnic" | "custom";
189
210
  type BenchmarkStatus = "ready" | "planned";
190
211
  type BenchmarkCategory = "agentic" | "retrieval" | "conversational" | "ingestion";
191
212
  type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain";
192
- type BuiltInProvider = "openai" | "anthropic" | "ollama" | "litellm";
213
+ type AmaBenchJudgeProtocol = "default" | "recommended";
214
+ /**
215
+ * Built-in LLM providers supported by the bench harness.
216
+ *
217
+ * `local-llm` targets a user-hosted OpenAI-compatible endpoint
218
+ * (llama.cpp, vLLM, LM Studio, etc.) via `--base-url`. It mirrors
219
+ * the `localLlm*` plugin config on the Remnic core side so that
220
+ * `remnic bench published --provider local-llm` actually exercises
221
+ * the same transport path as the running plugin. Issue #566 slice 5.
222
+ *
223
+ * `codex-cli` shells out to `codex exec` as an isolated benchmark-only
224
+ * responder/judge target. It is intentionally not routed through Remnic
225
+ * memory or OpenClaw gateway state.
226
+ */
227
+ type BuiltInProvider = "openai" | "anthropic" | "ollama" | "litellm" | "local-llm" | "codex-cli";
228
+ type BenchReasoningEffort = "low" | "medium" | "high" | "xhigh";
193
229
  interface ProviderConfig {
194
230
  provider: BuiltInProvider;
195
231
  model: string;
196
232
  baseUrl?: string;
233
+ apiKey?: string;
234
+ retryOptions?: {
235
+ maxAttempts?: number;
236
+ baseBackoffMs?: number;
237
+ timeoutMs?: number;
238
+ max429WaitMs?: number;
239
+ };
240
+ disableThinking?: boolean;
241
+ reasoningEffort?: BenchReasoningEffort;
242
+ responderContextBudgetChars?: number;
243
+ responderPromptBudgetChars?: number;
197
244
  }
198
245
  interface TaskTokenUsage {
199
246
  input: number;
@@ -278,13 +325,19 @@ interface BenchmarkResult {
278
325
  * Must stay below the benchmark's canary floor.
279
326
  */
280
327
  canaryScore?: number;
328
+ /** "partial" if the benchmark was interrupted; absent or "complete" otherwise. */
329
+ status?: "complete" | "partial";
330
+ /** If partial, the error that caused interruption. */
331
+ failureReason?: string;
281
332
  };
282
333
  config: {
283
334
  runtimeProfile?: BenchRuntimeProfile | null;
284
335
  systemProvider: ProviderConfig | null;
285
336
  judgeProvider: ProviderConfig | null;
337
+ internalProvider?: ProviderConfig | null;
286
338
  adapterMode: string;
287
339
  remnicConfig: Record<string, unknown>;
340
+ benchmarkOptions?: Record<string, unknown>;
288
341
  };
289
342
  cost: {
290
343
  totalTokens: number;
@@ -339,7 +392,15 @@ interface RunBenchmarkOptions {
339
392
  ingestionAdapter?: IngestionBenchAdapter;
340
393
  systemProvider?: ProviderConfig | null;
341
394
  judgeProvider?: ProviderConfig | null;
395
+ internalProvider?: ProviderConfig | null;
342
396
  remnicConfig?: Record<string, unknown>;
397
+ benchmarkOptions?: Record<string, unknown>;
398
+ drainTimeoutMs?: number;
399
+ amaBenchJudgeProtocol?: AmaBenchJudgeProtocol;
400
+ amaBenchCrossJudge?: BenchJudge;
401
+ amaBenchCrossJudgeProvider?: ProviderConfig | null;
402
+ /** Called after each task completes for progress logging and partial result tracking. */
403
+ onTaskComplete?: (task: TaskResult, completedCount: number, totalCount?: number) => void;
343
404
  }
344
405
  interface ResolvedRunBenchmarkOptions extends RunBenchmarkOptions {
345
406
  mode: BenchmarkMode;
@@ -457,13 +518,70 @@ interface FixtureGenerator {
457
518
 
458
519
  interface RemnicAdapterOptions {
459
520
  configOverrides?: Record<string, unknown>;
521
+ memoryDir?: string;
460
522
  preserveRuntimeDefaults?: boolean;
461
523
  responder?: BenchResponder;
462
524
  judge?: BenchJudge;
525
+ drainTimeoutMs?: number;
526
+ replayExtractionMode?: "await" | "background" | "skip";
527
+ replaySourceValidAtMode?: "historical" | "batch";
528
+ sandboxDir?: string;
463
529
  }
464
530
  declare const createLightweightAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
465
531
  declare const createRemnicAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
466
532
 
533
+ interface TimeoutGuardOptions {
534
+ benchmarkId: string;
535
+ timeoutMs?: number;
536
+ drainTimeoutMs?: number;
537
+ logProgress?: boolean;
538
+ log?: (message: string) => void;
539
+ onTimeout?: (phase: string) => void | Promise<void>;
540
+ }
541
+ interface TimeoutGuardConfig {
542
+ remnicConfig?: Record<string, unknown>;
543
+ systemProvider?: ProviderConfig | null;
544
+ judgeProvider?: ProviderConfig | null;
545
+ }
546
+ declare function resolveBenchmarkPhaseTimeoutMs(config: TimeoutGuardConfig): number | undefined;
547
+ declare function resolveBenchmarkProgressLogging(remnicConfig?: Record<string, unknown>): boolean;
548
+ declare function createTimeoutGuardedAdapter(adapter: BenchMemoryAdapter, options: TimeoutGuardOptions): BenchMemoryAdapter;
549
+
550
+ interface SyntheticEmailIngestionAdapterOptions {
551
+ system?: BenchMemoryAdapter;
552
+ }
553
+ /**
554
+ * Isolated ingestion adapter for the synthetic email fixture benchmarks.
555
+ *
556
+ * It writes the raw source corpus through the benchmark's Remnic memory
557
+ * adapter when one is supplied, then exposes the extracted fixture graph in
558
+ * the IngestionBenchAdapter shape expected by the scoring tier. This keeps
559
+ * the ingestion benchmarks runnable in isolated benchmark jobs without
560
+ * touching a production Remnic instance.
561
+ */
562
+ declare function createSyntheticEmailIngestionAdapter(options?: SyntheticEmailIngestionAdapterOptions): IngestionBenchAdapter;
563
+
564
+ declare const MEMORY_EVAL_PUBLIC_LINE: "Agent memory without evals is vibes with a database.";
565
+ type MemoryEvalDimensionId = "repeated_context_reduction" | "unnecessary_clarification_reduction" | "retrieval_correctness" | "stale_memory_harm" | "scope_respect" | "ask_when_needed" | "act_when_enough_context" | "personalization_quality";
566
+ type MemoryEvalCategory = "context-efficiency" | "retrieval-quality" | "boundary-respect" | "action-confidence" | "personalization";
567
+ interface MemoryEvalMetric {
568
+ name: string;
569
+ higherIsBetter: boolean;
570
+ description: string;
571
+ }
572
+ interface MemoryEvalDimension {
573
+ id: MemoryEvalDimensionId;
574
+ question: string;
575
+ category: MemoryEvalCategory;
576
+ metrics: readonly MemoryEvalMetric[];
577
+ quickBenchmarkIds: readonly string[];
578
+ fullModeGuidance: string;
579
+ }
580
+ declare const MEMORY_EVAL_DIMENSIONS: readonly MemoryEvalDimension[];
581
+ declare function listMemoryEvalDimensions(): readonly MemoryEvalDimension[];
582
+ declare function getMemoryEvalDimension(id: MemoryEvalDimensionId): MemoryEvalDimension;
583
+ declare function listMemoryEvalBenchmarkIds(): string[];
584
+
467
585
  /**
468
586
  * Minimal LLM provider contract for the bench engine.
469
587
  */
@@ -473,6 +591,7 @@ interface CompletionOpts {
473
591
  temperature?: number;
474
592
  maxTokens?: number;
475
593
  headers?: Record<string, string>;
594
+ signal?: AbortSignal;
476
595
  }
477
596
  interface CompletionResult {
478
597
  text: string;
@@ -496,6 +615,26 @@ interface ProviderBaseConfig {
496
615
  baseUrl?: string;
497
616
  apiKey?: string;
498
617
  headers?: Record<string, string>;
618
+ retryOptions?: {
619
+ maxAttempts?: number;
620
+ baseBackoffMs?: number;
621
+ timeoutMs?: number;
622
+ max429WaitMs?: number;
623
+ };
624
+ /** Suppress thinking/reasoning tokens for thinking-capable models (Qwen 3.5, Gemma 4, DeepSeek). */
625
+ disableThinking?: boolean;
626
+ /**
627
+ * Optional answering-only memory-context budget. Benchmark artifacts keep the
628
+ * full recalled text, but provider-backed responders may receive this compact
629
+ * deterministic view to avoid transport-specific prompt stalls.
630
+ */
631
+ responderContextBudgetChars?: number;
632
+ /**
633
+ * Optional answering-only question/protocol budget. This keeps the original
634
+ * benchmark question and artifact unchanged while shortening repeated harness
635
+ * instructions for slow transport-backed responders such as Codex CLI.
636
+ */
637
+ responderPromptBudgetChars?: number;
499
638
  }
500
639
  interface OpenAiCompatibleProviderConfig extends ProviderBaseConfig {
501
640
  provider?: "openai" | "litellm";
@@ -507,12 +646,46 @@ interface AnthropicProviderConfig extends ProviderBaseConfig {
507
646
  interface OllamaProviderConfig extends ProviderBaseConfig {
508
647
  provider?: "ollama";
509
648
  }
649
+ /**
650
+ * `local-llm` targets a user-hosted OpenAI-compatible endpoint
651
+ * (llama.cpp, vLLM, LM Studio, etc.). `baseUrl` is required at the
652
+ * CLI layer — it mirrors the plugin's `localLlmUrl` config and is
653
+ * what tells the bench which local server to talk to. The transport
654
+ * is intentionally OpenAI-compatible: `/v1/chat/completions` +
655
+ * `/v1/models`. Issue #566 slice 5.
656
+ */
657
+ interface LocalLlmProviderConfig extends ProviderBaseConfig {
658
+ provider?: "local-llm";
659
+ baseUrl: string;
660
+ }
661
+ interface CodexCliProviderConfig extends ProviderBaseConfig {
662
+ provider?: "codex-cli";
663
+ /** Codex CLI model reasoning effort. Bench CLI defaults this to xhigh. */
664
+ reasoningEffort?: BenchReasoningEffort;
665
+ /** Optional executable override for tests or non-standard Codex CLI installs. */
666
+ executable?: string;
667
+ /**
668
+ * Optional diagnostics artifact directory. When set, the provider writes
669
+ * per-call metadata that helps debug slow benchmark completions without
670
+ * depending on transient temp workspaces.
671
+ */
672
+ diagnosticsDir?: string;
673
+ /**
674
+ * `metadata` stores hashes/counts only. `full` additionally stores the full
675
+ * benchmark prompt and should only be used for isolated benchmark datasets.
676
+ */
677
+ diagnosticsMode?: "metadata" | "full";
678
+ }
510
679
  type ProviderFactoryConfig = (OpenAiCompatibleProviderConfig & {
511
680
  provider: "openai" | "litellm";
512
681
  }) | (AnthropicProviderConfig & {
513
682
  provider: "anthropic";
514
683
  }) | (OllamaProviderConfig & {
515
684
  provider: "ollama";
685
+ }) | (LocalLlmProviderConfig & {
686
+ provider: "local-llm";
687
+ }) | (CodexCliProviderConfig & {
688
+ provider: "codex-cli";
516
689
  });
517
690
  interface ProviderDiscoveryResult {
518
691
  provider: BuiltInProvider;
@@ -629,6 +802,15 @@ declare const BENCHMARK_RESULT_SCHEMA: {
629
802
  readonly baseUrl: {
630
803
  readonly type: "string";
631
804
  };
805
+ readonly reasoningEffort: {
806
+ readonly type: "string";
807
+ };
808
+ readonly responderContextBudgetChars: {
809
+ readonly type: "number";
810
+ };
811
+ readonly responderPromptBudgetChars: {
812
+ readonly type: "number";
813
+ };
632
814
  };
633
815
  }];
634
816
  };
@@ -648,6 +830,31 @@ declare const BENCHMARK_RESULT_SCHEMA: {
648
830
  readonly baseUrl: {
649
831
  readonly type: "string";
650
832
  };
833
+ readonly reasoningEffort: {
834
+ readonly type: "string";
835
+ };
836
+ };
837
+ }];
838
+ };
839
+ readonly internalProvider: {
840
+ readonly anyOf: readonly [{
841
+ readonly type: "null";
842
+ }, {
843
+ readonly type: "object";
844
+ readonly required: readonly ["provider", "model"];
845
+ readonly properties: {
846
+ readonly provider: {
847
+ readonly type: "string";
848
+ };
849
+ readonly model: {
850
+ readonly type: "string";
851
+ };
852
+ readonly baseUrl: {
853
+ readonly type: "string";
854
+ };
855
+ readonly reasoningEffort: {
856
+ readonly type: "string";
857
+ };
651
858
  };
652
859
  }];
653
860
  };
@@ -752,11 +959,315 @@ declare const BENCHMARK_RESULT_SCHEMA: {
752
959
  };
753
960
  };
754
961
 
962
+ declare const BENCHMARK_REPRO_MANIFEST_FILENAME = "MANIFEST.json";
963
+ declare const BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION = 1;
964
+ interface BenchmarkReproManifestFile {
965
+ path: string;
966
+ kind: "file" | "symlink";
967
+ sizeBytes: number;
968
+ sha256: string;
969
+ target?: string;
970
+ }
971
+ interface BenchmarkReproManifestDataset {
972
+ benchmark: string;
973
+ status: "not-provided" | "missing" | "hashed";
974
+ path?: string;
975
+ realpath?: string;
976
+ fileCount: number;
977
+ totalBytes: number;
978
+ sha256?: string;
979
+ files: BenchmarkReproManifestFile[];
980
+ }
981
+ interface BenchmarkReproManifestResult {
982
+ path: string;
983
+ sha256: string;
984
+ sizeBytes: number;
985
+ resultId: string;
986
+ benchmark: string;
987
+ mode: BenchmarkMode;
988
+ gitSha: string;
989
+ runCount: number;
990
+ seeds: number[];
991
+ taskCount: number;
992
+ configHash: string;
993
+ }
994
+ interface BenchmarkReproManifest {
995
+ schemaVersion: number;
996
+ generatedAt: string;
997
+ run: {
998
+ id: string;
999
+ mode?: BenchmarkMode;
1000
+ selectedBenchmarks: string[];
1001
+ runtimeProfiles: string[];
1002
+ selectedWorkItems: Array<{
1003
+ benchmark: string;
1004
+ runtimeProfile: string;
1005
+ }>;
1006
+ limit?: number;
1007
+ seed?: number;
1008
+ };
1009
+ git: {
1010
+ commit: string;
1011
+ shortCommit: string;
1012
+ dirty: boolean;
1013
+ dirtyEntryCount: number;
1014
+ };
1015
+ command: {
1016
+ cwd: string;
1017
+ argv: string[];
1018
+ envKeys: string[];
1019
+ };
1020
+ environment: {
1021
+ platform: NodeJS.Platform;
1022
+ arch: string;
1023
+ nodeVersion: string;
1024
+ hostname: string;
1025
+ packageManager?: string;
1026
+ };
1027
+ qmd?: {
1028
+ configDir?: string;
1029
+ cacheDir?: string;
1030
+ collections: string[];
1031
+ };
1032
+ configFiles: Array<{
1033
+ label: string;
1034
+ path: string;
1035
+ sha256?: string;
1036
+ sizeBytes?: number;
1037
+ missing?: boolean;
1038
+ }>;
1039
+ datasets: BenchmarkReproManifestDataset[];
1040
+ results: BenchmarkReproManifestResult[];
1041
+ artifactHash: string;
1042
+ }
1043
+ interface BuildBenchmarkReproManifestOptions {
1044
+ resultPaths?: string[];
1045
+ runId?: string;
1046
+ selectedBenchmarks?: string[];
1047
+ runtimeProfiles?: string[];
1048
+ selectedWorkItems?: Array<{
1049
+ benchmark: string;
1050
+ runtimeProfile: string;
1051
+ }>;
1052
+ mode?: BenchmarkMode;
1053
+ limit?: number;
1054
+ seed?: number;
1055
+ datasetDirs?: Record<string, string | undefined>;
1056
+ command?: {
1057
+ cwd?: string;
1058
+ argv?: string[];
1059
+ env?: NodeJS.ProcessEnv;
1060
+ envKeys?: string[];
1061
+ };
1062
+ configFiles?: Array<{
1063
+ label: string;
1064
+ path?: string;
1065
+ }>;
1066
+ qmd?: {
1067
+ configDir?: string;
1068
+ cacheDir?: string;
1069
+ collections?: string[];
1070
+ };
1071
+ }
1072
+ declare function buildBenchmarkReproManifest(resultsDir: string, options?: BuildBenchmarkReproManifestOptions): Promise<BenchmarkReproManifest>;
1073
+ declare function writeBenchmarkReproManifest(resultsDir: string, options?: BuildBenchmarkReproManifestOptions): Promise<string>;
1074
+
1075
+ /**
1076
+ * Public leaderboard artifact schema for published benchmarks.
1077
+ *
1078
+ * `BenchmarkArtifact` is deliberately flatter and more opinionated than
1079
+ * the internal `BenchmarkResult`. The goal is a stable, versioned payload
1080
+ * that Remnic.ai and third-party leaderboard consumers can rely on
1081
+ * without digging into every per-task field the internal runner captures.
1082
+ *
1083
+ * One artifact is written per run to
1084
+ * docs/benchmarks/results/<iso-date>-<benchmark>-<model>-<gitShaShort>.json
1085
+ * (gitignored during development; promoted per-release by slice 6).
1086
+ *
1087
+ * Any breaking change to the artifact shape requires a `schemaVersion`
1088
+ * bump. The companion `buildBenchmarkArtifact()` and
1089
+ * `writeBenchmarkArtifact()` functions in this file emit the current
1090
+ * version; `parseBenchmarkArtifact()` rejects unknown versions.
1091
+ */
1092
+
1093
+ /**
1094
+ * Current artifact schema version. Bump when the serialized shape
1095
+ * changes in a way that breaks existing leaderboard consumers.
1096
+ *
1097
+ * History:
1098
+ * 1 — initial schema (issue #566).
1099
+ */
1100
+ declare const BENCHMARK_ARTIFACT_SCHEMA_VERSION: 1;
1101
+ /** Identifiers of published-benchmark runners that can emit public artifacts. */
1102
+ declare const PUBLISHED_BENCHMARK_ARTIFACT_IDS: readonly ["ama-bench", "memory-arena", "amemgym", "longmemeval", "locomo", "beam", "personamem", "memoryagentbench", "membench"];
1103
+ /** Identifier of a published-benchmark runner. */
1104
+ type PublishedBenchmarkId = (typeof PUBLISHED_BENCHMARK_ARTIFACT_IDS)[number];
1105
+ interface BenchmarkArtifactSystem {
1106
+ /** Short product name, e.g. "remnic". */
1107
+ name: string;
1108
+ /** Semver of `@remnic/core` at run time. */
1109
+ version: string;
1110
+ /** Short git SHA of the repository producing the artifact. */
1111
+ gitSha: string;
1112
+ }
1113
+ interface BenchmarkArtifactEnvironment {
1114
+ /** Node.js version reported by `process.version` at run time. */
1115
+ node: string;
1116
+ /** `process.platform` at run time (linux/darwin/win32/...). */
1117
+ os: string;
1118
+ /** Optional CPU architecture (arm64/x64/...). */
1119
+ arch?: string;
1120
+ }
1121
+ interface BenchmarkArtifactPerTaskScore {
1122
+ /** Runner-assigned task ID (stable across reruns). */
1123
+ taskId: string;
1124
+ /** Task-level scores keyed by metric name (e.g. f1, llm_judge). */
1125
+ scores: Record<string, number>;
1126
+ /** Optional task category / bucket for group-by reports. */
1127
+ category?: string;
1128
+ }
1129
+ interface BenchmarkArtifact {
1130
+ /** Artifact schema version. See `BENCHMARK_ARTIFACT_SCHEMA_VERSION`. */
1131
+ schemaVersion: typeof BENCHMARK_ARTIFACT_SCHEMA_VERSION;
1132
+ /** Benchmark identifier, e.g. "longmemeval" or "locomo". */
1133
+ benchmarkId: PublishedBenchmarkId;
1134
+ /**
1135
+ * Dataset version the runner evaluated against. Free-form string so
1136
+ * runners can record the HuggingFace revision, filename, or
1137
+ * upstream dataset tag.
1138
+ */
1139
+ datasetVersion: string;
1140
+ system: BenchmarkArtifactSystem;
1141
+ /** Evaluator model ID (e.g. "gpt-4o-mini"). */
1142
+ model: string;
1143
+ /** RNG / selection seed used for this run. */
1144
+ seed: number;
1145
+ /** Aggregate metric means keyed by metric name. */
1146
+ metrics: Record<string, number>;
1147
+ /** Per-task score breakdown. Arbitrary-length; safe to truncate for public pages. */
1148
+ perTaskScores: BenchmarkArtifactPerTaskScore[];
1149
+ /** ISO-8601 timestamp of run start. */
1150
+ startedAt: string;
1151
+ /** ISO-8601 timestamp of run finish. */
1152
+ finishedAt: string;
1153
+ /** Total wall-clock duration in milliseconds. */
1154
+ durationMs: number;
1155
+ env: BenchmarkArtifactEnvironment;
1156
+ /** Optional explanatory note (e.g. "--limit 100"). Never contains PII. */
1157
+ note?: string;
1158
+ }
1159
+ /** Input to `buildBenchmarkArtifact()` beyond what `BenchmarkResult` already carries. */
1160
+ interface BuildBenchmarkArtifactInput {
1161
+ benchmarkId: PublishedBenchmarkId;
1162
+ datasetVersion: string;
1163
+ model: string;
1164
+ seed: number;
1165
+ startedAt: string;
1166
+ finishedAt: string;
1167
+ result: BenchmarkResult;
1168
+ /** Optional category extractor for `perTaskScores[].category`. */
1169
+ categoryFor?: (task: TaskResult) => string | undefined;
1170
+ /** Optional free-form note (e.g. `"--limit 100"`). */
1171
+ note?: string;
1172
+ }
1173
+ /**
1174
+ * Build a `BenchmarkArtifact` from a runner's `BenchmarkResult`.
1175
+ * Aggregates metrics to their `.mean` for public consumption; preserves
1176
+ * per-task scores verbatim. The result is sort-stable: metric keys are
1177
+ * emitted in sorted order and perTaskScores preserves runner order.
1178
+ */
1179
+ declare function buildBenchmarkArtifact(input: BuildBenchmarkArtifactInput): BenchmarkArtifact;
1180
+ /**
1181
+ * Build the canonical on-disk filename for an artifact. Filename shape:
1182
+ * <iso-date>-<benchmark>-<model>-<gitShaShort>.json
1183
+ * where iso-date is the startedAt date (YYYY-MM-DD) and gitShaShort is
1184
+ * the first 7 chars of system.gitSha (or "unknown" if absent).
1185
+ *
1186
+ * Every segment that contributes to the filename is sanitized through
1187
+ * `sanitizeSegment()` so it cannot contain `/`, `..`, NUL, or any other
1188
+ * path-separator characters — preventing a malicious artifact input
1189
+ * from directing `writeBenchmarkArtifact()` outside of `outputDir`.
1190
+ */
1191
+ declare function buildBenchmarkArtifactFilename(artifact: BenchmarkArtifact): string;
1192
+ /** Serialize an artifact to deterministic JSON (sorted keys, indented). */
1193
+ declare function serializeBenchmarkArtifact(artifact: BenchmarkArtifact): string;
1194
+ /** Compute SHA-256 of the canonical JSON serialization of the artifact. */
1195
+ declare function hashBenchmarkArtifact(artifact: BenchmarkArtifact): string;
1196
+ interface WriteBenchmarkArtifactResult {
1197
+ path: string;
1198
+ filename: string;
1199
+ sha256: string;
1200
+ bytes: number;
1201
+ }
1202
+ /**
1203
+ * Write the artifact to `<outputDir>/<filename>` and return the resulting
1204
+ * path, filename, SHA-256 of the canonical serialization, and byte count.
1205
+ * Creates `outputDir` recursively if needed.
1206
+ *
1207
+ * Belt-and-suspenders: even though `buildBenchmarkArtifactFilename()`
1208
+ * sanitizes every segment, this function also verifies the resolved
1209
+ * target stays inside `outputDir`. Any path-traversal attempt throws
1210
+ * before the write occurs.
1211
+ */
1212
+ declare function writeBenchmarkArtifact(artifact: BenchmarkArtifact, outputDir: string): Promise<WriteBenchmarkArtifactResult>;
1213
+ /**
1214
+ * Parse + validate a BenchmarkArtifact from raw JSON. Throws on version
1215
+ * mismatch, missing required fields, or structural errors. Keep this in
1216
+ * sync with the `BenchmarkArtifact` interface — every new required
1217
+ * field needs a matching check here and a `schemaVersion` bump.
1218
+ */
1219
+ declare function parseBenchmarkArtifact(raw: string): BenchmarkArtifact;
1220
+ /** Read + parse + re-hash an artifact file. Handy for `verify-artifact` CLI. */
1221
+ declare function loadBenchmarkArtifact(filePath: string): Promise<{
1222
+ artifact: BenchmarkArtifact;
1223
+ sha256: string;
1224
+ bytes: number;
1225
+ }>;
1226
+
755
1227
  declare function createAnthropicProvider(config: AnthropicProviderConfig): LlmProvider;
756
1228
 
1229
+ interface CodexCliRunRequest {
1230
+ executable: string;
1231
+ args: string[];
1232
+ input: string;
1233
+ outputPath: string;
1234
+ workspacePath: string;
1235
+ timeoutMs?: number;
1236
+ signal?: AbortSignal;
1237
+ env: NodeJS.ProcessEnv;
1238
+ }
1239
+ interface CodexCliRunResult {
1240
+ status: number | null;
1241
+ signal: NodeJS.Signals | null;
1242
+ stdout: string;
1243
+ stderr: string;
1244
+ outputText: string;
1245
+ }
1246
+ interface CodexCliProviderDeps {
1247
+ runCodexCli?: (request: CodexCliRunRequest) => Promise<CodexCliRunResult>;
1248
+ runCodexVersion?: (executable: string, env: NodeJS.ProcessEnv) => Promise<{
1249
+ status: number | null;
1250
+ stderr: string;
1251
+ }>;
1252
+ }
1253
+ declare function createCodexCliProvider(config: CodexCliProviderConfig, deps?: CodexCliProviderDeps): LlmProvider;
1254
+
1255
+ /**
1256
+ * Result enrichment and JSON writing helpers.
1257
+ */
1258
+
1259
+ declare function redactBenchmarkResultSecrets<T>(value: T): T;
1260
+ declare function writeBenchmarkResult(result: BenchmarkResult, outputDir: string): Promise<string>;
1261
+ declare function getRemnicVersion(): Promise<string>;
1262
+
1263
+ interface DiscoverAllProvidersOptions {
1264
+ includeCodexCli?: boolean;
1265
+ }
757
1266
  declare function createProvider(config: ProviderFactoryConfig): LlmProvider;
758
- declare function discoverAllProviders(): Promise<ProviderDiscoveryResult[]>;
1267
+ declare function discoverAllProviders(options?: DiscoverAllProvidersOptions): Promise<ProviderDiscoveryResult[]>;
759
1268
 
1269
+ type BenchmarkAnswerMode = "default" | "strict" | "agentic-memory";
1270
+ type BenchmarkAnswerFormat = "auto" | "choice-letter" | "choice-number" | "instruction" | "short" | "short-with-specifics" | "structured";
760
1271
  interface BenchmarkAnswerResult {
761
1272
  finalAnswer: string;
762
1273
  recalledText: string;
@@ -768,12 +1279,145 @@ interface BenchmarkAnswerResult {
768
1279
  };
769
1280
  model?: string;
770
1281
  }
1282
+ interface BenchmarkQuestionContext {
1283
+ benchmark?: string;
1284
+ domain?: string;
1285
+ task?: string;
1286
+ taskType?: string;
1287
+ qaType?: string;
1288
+ }
771
1289
  declare function answerBenchmarkQuestion(options: {
772
1290
  question: string;
773
1291
  recalledText: string;
774
1292
  responder?: BenchResponder;
1293
+ answerMode?: BenchmarkAnswerMode;
1294
+ answerFormat?: BenchmarkAnswerFormat;
1295
+ questionContext?: BenchmarkQuestionContext;
1296
+ retryUnknownWithEvidence?: boolean;
775
1297
  }): Promise<BenchmarkAnswerResult>;
776
1298
 
1299
+ interface LeaderboardArtifactWrite {
1300
+ benchmark: string;
1301
+ path: string;
1302
+ format: string;
1303
+ records: number;
1304
+ }
1305
+ interface AmaBenchLeaderboardRow {
1306
+ episode_id: number | string;
1307
+ answer_list: string[];
1308
+ }
1309
+ declare function writeLeaderboardArtifactsForResult(result: BenchmarkResult, outputDir: string): Promise<LeaderboardArtifactWrite[]>;
1310
+ declare function buildAmaBenchLeaderboardRows(result: BenchmarkResult): AmaBenchLeaderboardRow[];
1311
+ declare function serializeJsonl(rows: readonly AmaBenchLeaderboardRow[]): string;
1312
+
1313
+ type AmaBenchDiagnosticRecallMode = "remnic-full" | "explicit-evidence-only" | "oracle-trajectory";
1314
+ type AmaBenchDiagnosticAnswererMode = "normal" | "strong";
1315
+ interface AmaBenchDiagnosticVariant {
1316
+ id: string;
1317
+ label: string;
1318
+ recallMode: AmaBenchDiagnosticRecallMode;
1319
+ answererMode: AmaBenchDiagnosticAnswererMode;
1320
+ description: string;
1321
+ }
1322
+ declare const AMA_BENCH_DIAGNOSTIC_VARIANTS: readonly AmaBenchDiagnosticVariant[];
1323
+ declare function selectAmaBenchDiagnosticVariants(options?: {
1324
+ ids?: string[];
1325
+ includeStrong?: boolean;
1326
+ }): AmaBenchDiagnosticVariant[];
1327
+ interface AmaBenchDiagnosticAdapterOptions {
1328
+ strongResponder?: BenchResponder;
1329
+ }
1330
+ declare function createAmaBenchDiagnosticAdapter(base: BenchMemoryAdapter, variant: AmaBenchDiagnosticVariant, options?: AmaBenchDiagnosticAdapterOptions): BenchMemoryAdapter;
1331
+ declare function buildOracleTrajectoryRecall(messages: readonly Message[], budgetChars?: number): string;
1332
+ declare function extractMarkdownSectionsByTitle(markdown: string, allowedTitles: readonly string[]): string;
1333
+ interface AmaBenchDiagnosticTaskRow {
1334
+ variantId: string;
1335
+ taskId: string;
1336
+ episodeId?: string | number;
1337
+ domain: string;
1338
+ qaType: string;
1339
+ taskType: string;
1340
+ scores: Record<string, number>;
1341
+ unknownLike: boolean;
1342
+ recalledLength: number;
1343
+ answeredLength: number;
1344
+ recallSections: string[];
1345
+ responderModel?: string;
1346
+ judgeModel?: string;
1347
+ crossJudgeModel?: string;
1348
+ crossJudgeScore?: number;
1349
+ evidence?: AmaBenchDiagnosticTaskEvidence;
1350
+ }
1351
+ interface AmaBenchDiagnosticTaskEvidence {
1352
+ question: string;
1353
+ expected: string;
1354
+ actual: string;
1355
+ recalledText: string;
1356
+ truncatedFields?: string[];
1357
+ }
1358
+ interface AmaBenchDiagnosticBreakdown {
1359
+ key: string;
1360
+ taskCount: number;
1361
+ unknownLikeRate: number;
1362
+ scoreMeans: Record<string, number>;
1363
+ scoreCounts: Record<string, number>;
1364
+ }
1365
+ interface AmaBenchDiagnosticVariantSummary {
1366
+ variant: AmaBenchDiagnosticVariant;
1367
+ usesFullRemnicRecallProcess: boolean;
1368
+ isPrimaryFullSystemScore: boolean;
1369
+ taskCount: number;
1370
+ unknownLikeRate: number;
1371
+ scoreMeans: Record<string, number>;
1372
+ scoreCounts: Record<string, number>;
1373
+ byDomain: AmaBenchDiagnosticBreakdown[];
1374
+ byQaType: AmaBenchDiagnosticBreakdown[];
1375
+ byDomainAndQaType: AmaBenchDiagnosticBreakdown[];
1376
+ tasks: AmaBenchDiagnosticTaskRow[];
1377
+ }
1378
+ interface SanitizedDiagnosticProvider {
1379
+ provider: string;
1380
+ model: string;
1381
+ baseUrl?: string;
1382
+ reasoningEffort?: string;
1383
+ }
1384
+ interface AmaBenchDiagnosticMatrixArtifact {
1385
+ schemaVersion: 1;
1386
+ benchmark: "ama-bench";
1387
+ generatedAt: string;
1388
+ mode: BenchmarkMode;
1389
+ config: {
1390
+ runtimeProfile?: string;
1391
+ adapterMode?: string;
1392
+ datasetDir?: string;
1393
+ limit?: number;
1394
+ seed?: number;
1395
+ systemProvider?: SanitizedDiagnosticProvider | null;
1396
+ judgeProvider?: SanitizedDiagnosticProvider | null;
1397
+ internalProvider?: SanitizedDiagnosticProvider | null;
1398
+ amaBenchCrossJudgeProvider?: SanitizedDiagnosticProvider | null;
1399
+ strongSystemProvider?: SanitizedDiagnosticProvider | null;
1400
+ variantIds?: string[];
1401
+ includeTaskEvidence?: boolean;
1402
+ taskEvidenceMaxChars?: number;
1403
+ };
1404
+ variants: AmaBenchDiagnosticVariantSummary[];
1405
+ }
1406
+ interface AmaBenchDiagnosticRunContext {
1407
+ runtimeProfile?: string;
1408
+ hasResponder?: boolean;
1409
+ includeTaskEvidence?: boolean;
1410
+ taskEvidenceMaxChars?: number;
1411
+ }
1412
+ declare function buildAmaBenchDiagnosticVariantSummary(variant: AmaBenchDiagnosticVariant, result: BenchmarkResult, context?: AmaBenchDiagnosticRunContext): AmaBenchDiagnosticVariantSummary;
1413
+ declare function buildAmaBenchDiagnosticMatrixArtifact(args: {
1414
+ mode: BenchmarkMode;
1415
+ config?: AmaBenchDiagnosticMatrixArtifact["config"];
1416
+ variants: AmaBenchDiagnosticVariantSummary[];
1417
+ generatedAt?: string;
1418
+ }): AmaBenchDiagnosticMatrixArtifact;
1419
+ declare function isAmaBenchUnknownLikeAnswer(answer: string): boolean;
1420
+
777
1421
  /**
778
1422
  * Sealed LLM-judge rubric loader, invocation, and score parser for the
779
1423
  * Assistant bench tier.
@@ -911,77 +1555,215 @@ interface GatewayResponderOptions {
911
1555
  workspaceDir?: string;
912
1556
  llmFactory?: (gatewayConfig: GatewayConfig, runtimeContext: FallbackLlmRuntimeContext) => Pick<FallbackLlmClient, "chatCompletion">;
913
1557
  }
914
- declare function createResponderFromProvider(provider: LlmProvider): BenchResponder;
1558
+ interface ProviderResponderOptions {
1559
+ contextBudgetChars?: number;
1560
+ promptBudgetChars?: number;
1561
+ }
1562
+ declare function createResponderFromProvider(provider: LlmProvider, options?: ProviderResponderOptions): BenchResponder;
915
1563
  declare function createProviderBackedResponder(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchResponder;
916
1564
  declare function createProviderBackedJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchJudge;
1565
+ declare function createProviderBackedAmaBenchRecommendedJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchJudge;
917
1566
  declare function createStructuredJudgeFromProvider(provider: LlmProvider): StructuredJudge;
918
1567
  declare function createProviderBackedStructuredJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): StructuredJudge;
919
1568
  declare function createGatewayResponder(options: GatewayResponderOptions): BenchResponder;
920
1569
 
921
1570
  declare function createLiteLlmProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
922
1571
 
923
- declare function createOllamaProvider(config: OllamaProviderConfig): LlmProvider;
924
-
925
1572
  /**
926
- * Minimal OpenAI-compatible provider for phase 1 bench execution.
1573
+ * Local-LLM bench provider issue #566 slice 5.
1574
+ *
1575
+ * Talks to a user-hosted OpenAI-compatible endpoint (llama.cpp,
1576
+ * vLLM, LM Studio, etc.) using the exact same wire contract
1577
+ * (`/v1/chat/completions` + `/v1/models`) that the Remnic core
1578
+ * `LocalLlmClient` uses. The goal is transport-level parity:
1579
+ * anything `remnic bench published --provider local-llm` can reach
1580
+ * is something the running plugin can also reach.
1581
+ *
1582
+ * Why this is a distinct provider from `openai`:
1583
+ *
1584
+ * - The OpenAI-compatible provider treats `baseUrl` as optional
1585
+ * and defaults to `https://api.openai.com/v1`. That default is
1586
+ * wrong for local servers, and silently falling through to it
1587
+ * violates CLAUDE.md rule 51 (reject invalid user input).
1588
+ * - `local-llm` REQUIRES `baseUrl` at the CLI boundary so the
1589
+ * user must explicitly point at their server. A missing
1590
+ * base URL is a user error, not a default.
1591
+ * - Discovery for `local-llm` is reserved for the future — the
1592
+ * built-in `discoverAllProviders` probe does not assume a
1593
+ * local-llm URL is reachable. Users opt in with `--base-url`.
1594
+ *
1595
+ * See `packages/remnic-core/src/summarizer.ts` for the core-side
1596
+ * `LocalLlmClient` invocation pattern and
1597
+ * `packages/plugin-openclaw/openclaw.plugin.json` for the
1598
+ * `localLlmUrl` / `localLlmModel` config that this provider
1599
+ * mirrors.
927
1600
  */
928
1601
 
929
- declare function createOpenAiCompatibleProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
930
-
931
- type BenchModelSource = "plugin" | "gateway";
932
- interface ResolveBenchRuntimeProfileOptions {
933
- runtimeProfile?: BenchRuntimeProfile;
934
- remnicConfigPath?: string;
935
- openclawConfigPath?: string;
936
- modelSource?: BenchModelSource;
937
- gatewayAgentId?: string;
938
- fastGatewayAgentId?: string;
939
- systemProvider?: BuiltInProvider;
940
- systemModel?: string;
941
- systemBaseUrl?: string;
942
- judgeProvider?: BuiltInProvider;
943
- judgeModel?: string;
944
- judgeBaseUrl?: string;
945
- }
946
- interface ResolvedBenchRuntimeProfile {
947
- profile: BenchRuntimeProfile;
948
- remnicConfig: Record<string, unknown>;
949
- effectiveRemnicConfig: Record<string, unknown>;
950
- adapterOptions: {
951
- configOverrides: Record<string, unknown>;
952
- preserveRuntimeDefaults?: boolean;
953
- responder?: BenchResponder;
954
- judge?: BenchJudge;
955
- };
956
- systemProvider: ProviderConfig | null;
957
- judgeProvider: ProviderConfig | null;
958
- }
959
- declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
960
-
961
- /**
962
- * Published benchmark registry for @remnic/bench phase 1.
963
- */
1602
+ declare function createLocalLlmProvider(config: LocalLlmProviderConfig): LlmProvider;
964
1603
 
965
- declare function listBenchmarks(): BenchmarkDefinition[];
966
- declare function getBenchmark(id: string): BenchmarkDefinition | undefined;
1604
+ declare function createOllamaProvider(config: OllamaProviderConfig): LlmProvider;
967
1605
 
968
1606
  /**
969
- * Result enrichment and JSON writing helpers.
1607
+ * Minimal OpenAI-compatible provider for phase 1 bench execution.
970
1608
  */
971
1609
 
972
- declare function writeBenchmarkResult(result: BenchmarkResult, outputDir: string): Promise<string>;
1610
+ declare function createOpenAiCompatibleProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
973
1611
 
974
1612
  /**
975
- * Seed-sequence generation for benchmark runs.
1613
+ * Shared types for the Assistant bench tier.
976
1614
  *
977
- * Factored out of `benchmark.ts` so individual runners can reuse it without
978
- * triggering a circular import through `benchmark.ts -> registry.ts ->
979
- * runner.ts -> benchmark.ts`.
980
- */
981
- declare function buildBenchmarkRunSeeds(runCount: number, baseSeed?: number): number[];
982
-
983
- /**
984
- * Public benchmark execution helpers.
1615
+ * Every Assistant benchmark shares the same shape:
1616
+ * - A synthetic memory graph (facts, stances, entities) the agent may read.
1617
+ * - A scenario prompt given to the agent.
1618
+ * - A sealed-rubric judge pass that scores the agent's output along
1619
+ * identity_accuracy / stance_coherence / novelty / calibration.
1620
+ *
1621
+ * The goal is reviewability: each benchmark folder ships a small fixture.ts
1622
+ * that returns `AssistantScenario` values, and the runner wires the shared
1623
+ * multi-run + bootstrap-CI infrastructure around them.
1624
+ */
1625
+
1626
+ interface AssistantMemoryFact {
1627
+ id: string;
1628
+ summary: string;
1629
+ /**
1630
+ * Free-form tags (topic, entity) used to render the memory-graph summary
1631
+ * that is handed to the judge. Not shown to the agent.
1632
+ */
1633
+ tags?: string[];
1634
+ }
1635
+ interface AssistantStance {
1636
+ topic: string;
1637
+ position: string;
1638
+ }
1639
+ interface AssistantMemoryGraph {
1640
+ userHandle: string;
1641
+ userRole: string;
1642
+ /** Fixed scenario date shown to the agent and judge for reproducible temporal reasoning. */
1643
+ currentDate?: string;
1644
+ facts: AssistantMemoryFact[];
1645
+ stances: AssistantStance[];
1646
+ openThreads: string[];
1647
+ }
1648
+ interface AssistantScenario {
1649
+ id: string;
1650
+ title: string;
1651
+ scenarioPrompt: string;
1652
+ memoryGraph: AssistantMemoryGraph;
1653
+ /**
1654
+ * Small label describing what the scenario is meant to exercise. Useful in
1655
+ * dashboards for filtering. Never exposed to the agent.
1656
+ */
1657
+ focus: string;
1658
+ }
1659
+ /**
1660
+ * Minimal agent contract for the Assistant tier. The agent receives the
1661
+ * scenario prompt plus a pre-rendered memory view (analogous to what the
1662
+ * Remnic recall stack would hand to a downstream chat model), and returns
1663
+ * its final answer text.
1664
+ */
1665
+ interface AssistantAgent {
1666
+ respond(request: {
1667
+ scenarioId: string;
1668
+ prompt: string;
1669
+ memoryView: string;
1670
+ seed: number;
1671
+ runIndex: number;
1672
+ runCount: number;
1673
+ }): Promise<string>;
1674
+ }
1675
+ interface AssistantRunnerOptions {
1676
+ agent: AssistantAgent;
1677
+ judge: StructuredJudge | undefined;
1678
+ rubricId?: string;
1679
+ /**
1680
+ * Directory where per-run spot-check JSONL files are appended. Defaults to
1681
+ * `<cwd>/benchmarks/results/spot-checks`.
1682
+ */
1683
+ spotCheckDir?: string;
1684
+ /**
1685
+ * Seed array for deterministic multi-run scheduling. When omitted the
1686
+ * benchmark runner picks a fresh seed array via `buildBenchmarkRunSeeds`.
1687
+ */
1688
+ seeds?: number[];
1689
+ /**
1690
+ * Override used by tests and CLI smoke runs to cap iterations. Must be
1691
+ * `>= 1`. The production contract is `>= 5` per the issue spec.
1692
+ */
1693
+ runCount?: number;
1694
+ /**
1695
+ * Random-number factory for bootstrap sampling. Injected in tests.
1696
+ */
1697
+ random?: () => number;
1698
+ }
1699
+
1700
+ type BenchModelSource = "plugin" | "gateway";
1701
+ interface ResolveBenchRuntimeProfileOptions {
1702
+ runtimeProfile?: BenchRuntimeProfile;
1703
+ remnicConfigPath?: string;
1704
+ openclawConfigPath?: string;
1705
+ modelSource?: BenchModelSource;
1706
+ gatewayAgentId?: string;
1707
+ fastGatewayAgentId?: string;
1708
+ systemProvider?: BuiltInProvider;
1709
+ systemModel?: string;
1710
+ systemBaseUrl?: string;
1711
+ systemApiKey?: string;
1712
+ systemCodexReasoningEffort?: ProviderConfig["reasoningEffort"];
1713
+ systemResponderContextBudgetChars?: number;
1714
+ systemResponderPromptBudgetChars?: number;
1715
+ judgeProvider?: BuiltInProvider;
1716
+ judgeModel?: string;
1717
+ judgeBaseUrl?: string;
1718
+ judgeApiKey?: string;
1719
+ judgeCodexReasoningEffort?: ProviderConfig["reasoningEffort"];
1720
+ internalProvider?: BuiltInProvider;
1721
+ internalModel?: string;
1722
+ internalBaseUrl?: string;
1723
+ internalApiKey?: string;
1724
+ internalDisableThinking?: boolean;
1725
+ internalCodexReasoningEffort?: ProviderConfig["reasoningEffort"];
1726
+ lcmObserveConcurrency?: number;
1727
+ requestTimeout?: number;
1728
+ drainTimeout?: number;
1729
+ max429WaitMs?: number;
1730
+ disableThinking?: boolean;
1731
+ }
1732
+ interface ResolvedBenchRuntimeProfile {
1733
+ profile: BenchRuntimeProfile;
1734
+ remnicConfig: Record<string, unknown>;
1735
+ effectiveRemnicConfig: Record<string, unknown>;
1736
+ adapterOptions: {
1737
+ configOverrides: Record<string, unknown>;
1738
+ preserveRuntimeDefaults?: boolean;
1739
+ responder?: BenchResponder;
1740
+ judge?: BenchJudge;
1741
+ drainTimeoutMs?: number;
1742
+ };
1743
+ systemProvider: ProviderConfig | null;
1744
+ judgeProvider: ProviderConfig | null;
1745
+ internalProvider: ProviderConfig | null;
1746
+ }
1747
+ declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
1748
+
1749
+ /**
1750
+ * Published benchmark registry for @remnic/bench phase 1.
1751
+ */
1752
+
1753
+ declare function listBenchmarks(): BenchmarkDefinition[];
1754
+ declare function getBenchmark(id: string): BenchmarkDefinition | undefined;
1755
+
1756
+ /**
1757
+ * Seed-sequence generation for benchmark runs.
1758
+ *
1759
+ * Factored out of `benchmark.ts` so individual runners can reuse it without
1760
+ * triggering a circular import through `benchmark.ts -> registry.ts ->
1761
+ * runner.ts -> benchmark.ts`.
1762
+ */
1763
+ declare function buildBenchmarkRunSeeds(runCount: number, baseSeed?: number): number[];
1764
+
1765
+ /**
1766
+ * Public benchmark execution helpers.
985
1767
  */
986
1768
 
987
1769
  declare function resolveBenchmarkRunCount(mode: BenchmarkMode, requestedIterations?: number): number;
@@ -1190,6 +1972,113 @@ declare function buildBenchmarkPublishFeed(outputDir: string, target: BenchmarkP
1190
1972
  declare function writeBenchmarkPublishFeed(feed: PublishedBenchmarkFeed, outputPath: string): Promise<string>;
1191
1973
  declare function renderBenchmarkResultExport(result: BenchmarkResult, format: BenchmarkExportFormat): string;
1192
1974
 
1975
+ interface HaystackTurn {
1976
+ role: "user" | "assistant";
1977
+ content: string;
1978
+ }
1979
+ interface LongMemEvalItem {
1980
+ question_id: string | number;
1981
+ question_type: string;
1982
+ question: string;
1983
+ answer: string;
1984
+ question_date: string;
1985
+ haystack_dates: string[];
1986
+ haystack_session_ids: string[];
1987
+ haystack_sessions: HaystackTurn[][];
1988
+ answer_session_ids: string[];
1989
+ }
1990
+
1991
+ interface LoCoMoQA {
1992
+ question: string;
1993
+ answer: string;
1994
+ evidence: string[];
1995
+ category: number;
1996
+ }
1997
+ interface LoCoMoConversation {
1998
+ sample_id: string;
1999
+ conversation: Record<string, unknown>;
2000
+ qa: LoCoMoQA[];
2001
+ event_summary?: unknown;
2002
+ observation?: unknown;
2003
+ session_summary?: unknown;
2004
+ }
2005
+
2006
+ /**
2007
+ * Shared dataset loader helpers for the published LongMemEval + LoCoMo
2008
+ * benchmark runners. Wraps the fs probe + JSON parse + fallback logic
2009
+ * previously duplicated inside each runner's `loadDataset` function.
2010
+ *
2011
+ * Contract:
2012
+ *
2013
+ * - When `datasetDir` is defined, loaders probe the known canonical
2014
+ * filenames in order. The first readable file wins. If none are
2015
+ * readable, the result is `{ source: "missing", errors }`.
2016
+ * - When `datasetDir` is undefined (or resolves to `missing`) and
2017
+ * `mode === "quick"`, loaders return the bundled smoke fixture with
2018
+ * source `"smoke"` so the caller can surface a clear log message.
2019
+ * - When `mode === "full"` and no dataset is found, loaders return
2020
+ * `{ source: "missing", errors }` and callers must throw — full mode
2021
+ * never silently falls back to the smoke fixture.
2022
+ *
2023
+ * `scripts/bench/fetch-datasets.sh` documents the expected filenames; keep
2024
+ * them in sync when adding new variants.
2025
+ */
2026
+
2027
+ /** Canonical LongMemEval-S filenames probed by the loader, in priority order. */
2028
+ declare const LONG_MEM_EVAL_DATASET_FILENAMES: readonly string[];
2029
+ /** Canonical LoCoMo-10 filenames probed by the loader, in priority order. */
2030
+ declare const LOCOMO_DATASET_FILENAMES: readonly string[];
2031
+ type DatasetSource = "dataset" | "smoke" | "missing";
2032
+ interface LoadedDataset<T> {
2033
+ source: DatasetSource;
2034
+ /** Filename relative to `datasetDir` when source === "dataset". */
2035
+ filename?: string;
2036
+ items: T[];
2037
+ /** Parse/read errors encountered while probing candidate filenames. */
2038
+ errors: string[];
2039
+ }
2040
+ interface LoadDatasetOptions {
2041
+ mode: BenchmarkMode;
2042
+ datasetDir?: string;
2043
+ limit?: number;
2044
+ }
2045
+ /** Load LongMemEval-S from disk, falling back to the smoke fixture in quick mode. */
2046
+ declare function loadLongMemEvalS(options: LoadDatasetOptions): Promise<LoadedDataset<LongMemEvalItem>>;
2047
+ /**
2048
+ * Load LoCoMo-10 from disk, falling back to the smoke fixture in quick mode.
2049
+ *
2050
+ * `parseFile` is optional — callers that need richer structural
2051
+ * normalization (e.g. the LoCoMo runner's QA answer coercion) can pass
2052
+ * their own parser. When omitted, a minimal parser is used that only
2053
+ * asserts the top-level array + sample_id shape.
2054
+ */
2055
+ declare function loadLoCoMo10(options: LoadDatasetOptions & {
2056
+ parseFile?: (raw: string, filename: string) => LoCoMoConversation[];
2057
+ }): Promise<LoadedDataset<LoCoMoConversation>>;
2058
+ /**
2059
+ * Build a friendly "dataset missing" error message that links operators to
2060
+ * the fetch script. Callers use this when `mode === "full"` and the probe
2061
+ * returned `source: "missing"`.
2062
+ */
2063
+ declare function formatMissingDatasetError(benchmark: "longmemeval" | "locomo", datasetDir: string | undefined, filenames: readonly string[], errors: readonly string[]): string;
2064
+
2065
+ /**
2066
+ * BEAM runner migrated into @remnic/bench for phase 2.
2067
+ */
2068
+
2069
+ interface BeamDatasetPreview {
2070
+ source: "dataset" | "smoke" | "missing";
2071
+ files: string[];
2072
+ items: number;
2073
+ tasks: number;
2074
+ errors: string[];
2075
+ }
2076
+ declare function loadBeamDatasetPreview(options: {
2077
+ mode: BenchmarkMode;
2078
+ datasetDir?: string;
2079
+ limit?: number;
2080
+ }): Promise<BeamDatasetPreview>;
2081
+
1193
2082
  /**
1194
2083
  * Hash verification utilities used by the benchmark integrity pipeline.
1195
2084
  *
@@ -1230,8 +2119,12 @@ declare function hashBytes(value: Uint8Array): string;
1230
2119
  /**
1231
2120
  * Canonicalize a JSON-serializable value so equivalent payloads produce the
1232
2121
  * same digest regardless of key insertion order.
2122
+ *
2123
+ * `space` matches the third argument of `JSON.stringify` — pass `2` (or any
2124
+ * positive integer / indent string) when you want a pretty-printed output
2125
+ * that is still byte-stable across runs. Default is compact output.
1233
2126
  */
1234
- declare function canonicalJsonStringify(value: unknown): string;
2127
+ declare function canonicalJsonStringify(value: unknown, space?: string | number): string;
1235
2128
  declare function hashCanonicalJson(value: unknown): string;
1236
2129
  declare function isSha256Hex(value: unknown): value is string;
1237
2130
  declare function assertSha256Hex(value: unknown, label: string): string;
@@ -1284,9 +2177,10 @@ declare function loadSealKeyFromEnv(envName: string): Buffer | null;
1284
2177
  * }
1285
2178
  * ```
1286
2179
  *
1287
- * `sealHash` is computed over the canonical JSON of `envelope` so two qrels
1288
- * files encrypted with the same key produce distinct `sealHash` values only
1289
- * when their plaintext differs.
2180
+ * `sealHash` is computed over the canonical JSON of `envelope`, including the
2181
+ * random IV and ciphertext. It identifies the sealed envelope artifact, not
2182
+ * the plaintext qrels content. Use `envelope.plaintextHash` when stable
2183
+ * plaintext identity is required across independently sealed artifacts.
1290
2184
  */
1291
2185
 
1292
2186
  interface SealedQrelsArtifact {
@@ -1393,7 +2287,7 @@ interface SeededRng {
1393
2287
  * Deterministic 32-bit PRNG. Mulberry32 is small, fast, and sufficient for
1394
2288
  * shuffling benchmark tasks. Do NOT use for cryptographic operations.
1395
2289
  */
1396
- declare function createSeededRng(seed: number): SeededRng;
2290
+ declare function createSeededRng$1(seed: number): SeededRng;
1397
2291
  /**
1398
2292
  * Fisher-Yates shuffle using a seeded PRNG. Returns a new array.
1399
2293
  */
@@ -1576,89 +2470,6 @@ declare const chatFixture: FixtureGenerator;
1576
2470
  declare const SEALED_PROMPT_REGISTRY: Readonly<Record<string, string>>;
1577
2471
  declare const DEFAULT_ASSISTANT_RUBRIC_ID = "assistant-rubric-v1";
1578
2472
 
1579
- /**
1580
- * Shared types for the Assistant bench tier.
1581
- *
1582
- * Every Assistant benchmark shares the same shape:
1583
- * - A synthetic memory graph (facts, stances, entities) the agent may read.
1584
- * - A scenario prompt given to the agent.
1585
- * - A sealed-rubric judge pass that scores the agent's output along
1586
- * identity_accuracy / stance_coherence / novelty / calibration.
1587
- *
1588
- * The goal is reviewability: each benchmark folder ships a small fixture.ts
1589
- * that returns `AssistantScenario` values, and the runner wires the shared
1590
- * multi-run + bootstrap-CI infrastructure around them.
1591
- */
1592
-
1593
- interface AssistantMemoryFact {
1594
- id: string;
1595
- summary: string;
1596
- /**
1597
- * Free-form tags (topic, entity) used to render the memory-graph summary
1598
- * that is handed to the judge. Not shown to the agent.
1599
- */
1600
- tags?: string[];
1601
- }
1602
- interface AssistantStance {
1603
- topic: string;
1604
- position: string;
1605
- }
1606
- interface AssistantMemoryGraph {
1607
- userHandle: string;
1608
- userRole: string;
1609
- facts: AssistantMemoryFact[];
1610
- stances: AssistantStance[];
1611
- openThreads: string[];
1612
- }
1613
- interface AssistantScenario {
1614
- id: string;
1615
- title: string;
1616
- scenarioPrompt: string;
1617
- memoryGraph: AssistantMemoryGraph;
1618
- /**
1619
- * Small label describing what the scenario is meant to exercise. Useful in
1620
- * dashboards for filtering. Never exposed to the agent.
1621
- */
1622
- focus: string;
1623
- }
1624
- /**
1625
- * Minimal agent contract for the Assistant tier. The agent receives the
1626
- * scenario prompt plus a pre-rendered memory view (analogous to what the
1627
- * Remnic recall stack would hand to a downstream chat model), and returns
1628
- * its final answer text.
1629
- */
1630
- interface AssistantAgent {
1631
- respond(request: {
1632
- scenarioId: string;
1633
- prompt: string;
1634
- memoryView: string;
1635
- }): Promise<string>;
1636
- }
1637
- interface AssistantRunnerOptions {
1638
- agent: AssistantAgent;
1639
- judge: StructuredJudge | undefined;
1640
- rubricId?: string;
1641
- /**
1642
- * Directory where per-run spot-check JSONL files are appended. Defaults to
1643
- * `<cwd>/benchmarks/results/spot-checks`.
1644
- */
1645
- spotCheckDir?: string;
1646
- /**
1647
- * Seed array for deterministic multi-run scheduling. When omitted the
1648
- * benchmark runner picks a fresh seed array via `buildBenchmarkRunSeeds`.
1649
- */
1650
- seeds?: number[];
1651
- /**
1652
- * Override used by tests and CLI smoke runs to cap iterations. Must be
1653
- * `>= 1`. The production contract is `>= 5` per the issue spec.
1654
- */
1655
- runCount?: number;
1656
- /**
1657
- * Random-number factory for bootstrap sampling. Injected in tests.
1658
- */
1659
- random?: () => number;
1660
- }
1661
-
1662
2473
  /**
1663
2474
  * Shared runner scaffolding for the Assistant bench tier.
1664
2475
  *
@@ -1757,4 +2568,598 @@ declare const ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS: AssistantScenario[];
1757
2568
  declare const assistantSynthesisDefinition: BenchmarkDefinition;
1758
2569
  declare function runAssistantSynthesisBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
1759
2570
 
1760
- export { ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkPublishFeedOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ASSISTANT_RUBRIC_ID, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type FixtureGenerator, type FixtureOutput, type 
FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, type LlmJudge, type LlmProvider, type LoadSealedQrelsOptions, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type MultipleChoiceQuestion, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, type PersonalizationRetrievalCase, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TokenUsage, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildBenchmarkPublishFeed, buildBenchmarkRunSeeds, buildJudgePayload, 
buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createAnthropicProvider, createCanaryAdapter, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createOllamaProvider, createOpenAiCompatibleProvider, createProvider, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, f1Score, generateReport, getBenchmark, getBenchmarkLowerIsBetter, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, llmJudgeScore, llmJudgeScoreDetailed, loadBaseline, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, 
runAssistantSynthesisBenchmark, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectFixtureVariant, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkPublishFeed, writeBenchmarkResult, zeroScores };
2571
+ interface ProceduralRecallE2eCase {
2572
+ id: string;
2573
+ prompt: string;
2574
+ procedurePreamble: string;
2575
+ procedureSteps: Array<{
2576
+ order: number;
2577
+ intent: string;
2578
+ }>;
2579
+ procedureTags: string[];
2580
+ /** When true, `buildProcedureRecallSection` should return non-null markdown. */
2581
+ expectNonNullSection: boolean;
2582
+ proceduralEnabled?: boolean;
2583
+ }
2584
+
2585
+ /**
2586
+ * Scenario shape for the ablation harness. Shares the core fields of
2586
+ * `ProceduralRecallE2eCase` but uses `expectMatch` in place of
2587
+ * `expectNonNullSection`, so downstream fixtures can be expressed in either vocabulary.
2589
+ */
2590
+ interface ProceduralAblationScenario {
2591
+ id: string;
2592
+ prompt: string;
2593
+ procedurePreamble: string;
2594
+ procedureSteps: Array<{
2595
+ order: number;
2596
+ intent: string;
2597
+ }>;
2598
+ procedureTags: string[];
2599
+ /**
2600
+ * True when the prompt should recall the procedure. False for distractor /
2601
+ * non-task-initiation prompts where we expect the gate to reject.
2602
+ */
2603
+ expectMatch: boolean;
2604
+ }
2605
+ interface ProceduralAblationPerCase {
2606
+ id: string;
2607
+ prompt: string;
2608
+ expectMatch: boolean;
2609
+ onMatched: boolean;
2610
+ offMatched: boolean;
2611
+ onScore: number;
2612
+ offScore: number;
2613
+ }
2614
+ interface ProceduralAblationArtifact {
2615
+ schemaVersion: 1;
2616
+ fixture: {
2617
+ path: string | null;
2618
+ scenarioCount: number;
2619
+ };
2620
+ onScore: number;
2621
+ offScore: number;
2622
+ lift: number;
2623
+ confidenceInterval: ConfidenceInterval;
2624
+ perCase: ProceduralAblationPerCase[];
2625
+ generatedAt: string;
2626
+ }
2627
+ /**
2628
+ * Convert the existing `ProceduralRecallE2eCase` fixture into
2629
+ * ablation-scenario shape. The ablation ALWAYS sweeps procedural on and off,
2630
+ * so `expectMatch` must reflect what the prompt + procedure pair should do
2631
+ * WHEN PROCEDURAL IS ON — not what the original row's `proceduralEnabled`
2632
+ * flag produced.
2633
+ *
2634
+ * Gate-control rows in the e2e fixture (where `proceduralEnabled=false`
2635
+ * produces `expectNonNullSection=false` only because of the gate, not the
2636
+ * content) are excluded here: their ON-side outcome is content-dependent and
2637
+ * not something this mapper can label correctly without re-running
2638
+ * `buildProcedureRecallSection`. Callers that need those rows should write
2639
+ * the scenario directly with an explicit `expectMatch`.
2640
+ */
2641
+ declare function fixtureToAblationScenarios(fixture: ProceduralRecallE2eCase[]): ProceduralAblationScenario[];
2642
+ /**
2643
+ * Default bootstrap seed used when no `random` / `seed` override is supplied.
2644
+ * Fixing this makes CI bounds reproducible across CLI invocations — flaky CI
2645
+ * bounds would break artifact-based comparisons and saved baselines.
2646
+ */
2647
+ declare const DEFAULT_ABLATION_BOOTSTRAP_SEED = 1919249774;
2648
+ /**
2649
+ * Mulberry32 seeded RNG. Inlined (and re-used from tests) so callers can get a
2650
+ * deterministic default without needing an external dependency.
2651
+ */
2652
+ declare function createSeededRandom(seed: number): () => number;
2653
+ interface RunProceduralAblationOptions {
2654
+ scenarios: ProceduralAblationScenario[];
2655
+ /** Path the ablation was loaded from (echoed back into the artifact). */
2656
+ fixturePath?: string | null;
2657
+ /** Bootstrap iterations for CI on the paired delta (default: 1_000). */
2658
+ bootstrapIterations?: number;
2659
+ /**
2660
+ * Seeded RNG for the bootstrap. Defaults to
2661
+ * `createSeededRandom(DEFAULT_ABLATION_BOOTSTRAP_SEED)` so CI bounds are
2662
+ * deterministic across repeated CLI invocations. Pass `Math.random`
2663
+ * explicitly to opt into non-deterministic sampling.
2664
+ */
2665
+ random?: () => number;
2666
+ /**
2667
+ * Convenience alternative to `random`: if provided (and `random` is not),
2668
+ * a seeded mulberry32 RNG is built from this integer.
2669
+ */
2670
+ seed?: number;
2671
+ }
2672
+ /**
2673
+ * Pure entrypoint — accepts a scenario list and returns the artifact. Reads
2674
+ * and writes are isolated to the StorageManager temp directories the sides
2675
+ * create and remove internally.
2676
+ */
2677
+ declare function runProceduralAblation(options: RunProceduralAblationOptions): Promise<ProceduralAblationArtifact>;
2678
+ /**
2679
+ * Load a scenario list from a JSON file. Validates the JSON is an object with
2680
+ * a `scenarios` array (or a bare array) and each entry has the required
2681
+ * fields. Rejects invalid input per CLAUDE.md rule 51 rather than silently
2682
+ * defaulting.
2683
+ */
2684
+ declare function loadAblationFixture(fixturePath: string): Promise<ProceduralAblationScenario[]>;
2685
+ /**
2686
+ * CLI entrypoint. Resolves `--fixture <path>` (defaults to the built-in e2e
2687
+ * fixture converted to ablation scenarios when unset) and writes the artifact
2688
+ * to `--out <path>`. Validates inputs per CLAUDE.md rules 14 / 17 / 51.
2689
+ */
2690
+ interface RunProceduralAblationCliArgs {
2691
+ fixturePath: string | null;
2692
+ outPath: string;
2693
+ bootstrapIterations?: number;
2694
+ random?: () => number;
2695
+ /**
2696
+ * Optional seed for the bootstrap RNG. When omitted the harness uses
2697
+ * `DEFAULT_ABLATION_BOOTSTRAP_SEED` so CLI runs are reproducible by
2698
+ * default.
2699
+ */
2700
+ seed?: number;
2701
+ }
2702
+ declare function runProceduralAblationCli(args: RunProceduralAblationCliArgs): Promise<ProceduralAblationArtifact>;
2703
+
2704
+ /**
2705
+ * Real-fixture procedural-recall scenarios (issue #567 PR 2/5).
2706
+ *
2707
+ * 20 synthetic but realistic scenarios grouped across four categories:
2708
+ *
2709
+ * - exact-re-run prompt matches a stored procedure near-verbatim;
2710
+ * should recall when procedural is on.
2711
+ * - parameter-variation prompt references the same intent with different
2712
+ * nouns (service name, environment, ticket id);
2713
+ * should recall on overlap + intent compatibility.
2714
+ * - decomposition prompt starts a multi-step task whose steps match
2715
+ * a stored runbook; should recall.
2716
+ * - distractor-rejection prompt looks task-like but the stored procedure
2717
+ * is unrelated — the gate should REJECT (expectMatch
2718
+ * = false).
2719
+ *
2720
+ * All scenarios are deterministic and use ONLY token-overlap + intent
2721
+ * classification semantics (no LLM calls). The deterministic stub LLM
2722
+ * requirement from #567 is satisfied because `buildProcedureRecallSection`
2723
+ * is a pure function of storage + prompt + config. A human runbook for
2724
+ * exercising the gpt-4o-mini path lives in docs/benchmarks/procedural-recall.md.
2725
+ *
2726
+ * Scenarios are synthetic (no personal data), per CLAUDE.md public-repo
2727
+ * privacy policy.
2728
+ */
2729
+
2730
+ type ProceduralRealScenarioCategory = "exact-re-run" | "parameter-variation" | "decomposition" | "distractor-rejection";
2731
+ interface ProceduralRealScenario extends ProceduralAblationScenario {
2732
+ category: ProceduralRealScenarioCategory;
2733
+ notes?: string;
2734
+ }
2735
+ declare const PROCEDURAL_REAL_SCENARIOS: ProceduralRealScenario[];
2736
+ /** Built-in smoke slice (first scenario from each category). */
2737
+ declare const PROCEDURAL_REAL_SCENARIOS_SMOKE: ProceduralRealScenario[];
2738
+
2739
+ /**
2740
+ * Types for the ADAM-style memory-extraction attack harness.
2741
+ *
2742
+ * See docs/security/memory-extraction-threat-model.md for the threat model
2743
+ * this harness probes. The harness targets the modeled read-path surfaces
2744
+ * enumerated in §4 of that document (recall / memory_search /
2745
+ * memory_entities_list / …), driven in-process against a seeded memory
2746
+ * fixture so tests do not need a running daemon.
2747
+ */
2748
+ /**
2749
+ * Attacker knowledge tier the harness simulates.
2750
+ *
2751
+ * These correspond to the T1/T2/T3 tiers in the threat model (§3):
2752
+ *
2753
+ * - `zero-knowledge` — attacker has no prior information about the memory
2754
+ * contents, must probe purely from seed vocabulary. Roughly T1 with a
2755
+ * minimal token (or T2 on a newly-provisioned namespace).
2756
+ * - `same-namespace` — attacker holds a valid token for the same namespace as
2757
+ * the seeded memories and may use entity-graph enumeration as a side channel.
2758
+ * This is the primary T2 tier.
2759
+ * - `cross-namespace` — attacker holds a valid token for a *different*
2760
+ * namespace but attempts to leak memories from the victim namespace via
2761
+ * shared-namespace auto-promotion or debug snapshots (T3).
2762
+ */
2763
+ type AttackerMode = "zero-knowledge" | "same-namespace" | "cross-namespace";
2764
+ /**
2765
+ * A single seeded memory the harness treats as ground truth.
2766
+ *
2767
+ * Ground-truth labelling is intentionally coarse: `tokens` is the set of
2768
+ * salient lowercase tokens that uniquely identify this memory to a human.
2769
+ * A recovered query transcript is considered to have leaked the memory if
2770
+ * the response contains a substring that covers a configurable fraction of
2771
+ * these tokens (see `recoveryTokenOverlap`).
2772
+ */
2773
+ interface SeededMemory {
2774
+ /** Stable identifier. */
2775
+ id: string;
2776
+ /** Raw memory text as it would be stored. */
2777
+ content: string;
2778
+ /**
2779
+ * Category bucket (fact / preference / decision / entity / …). Mirrors the
2780
+ * buckets the threat model lists in §2 (Assets). Used by the harness only
2781
+ * for reporting; not used in the attack loop itself.
2782
+ */
2783
+ category: "fact" | "preference" | "decision" | "entity" | "other";
2784
+ /** Namespace the memory lives in. */
2785
+ namespace: string;
2786
+ /**
2787
+ * Optional set of salient tokens that define "the attacker recovered this
2788
+ * memory". If omitted, defaults to all alphanumeric tokens of length > 2 in
2789
+ * `content`.
2790
+ */
2791
+ tokens?: string[];
2792
+ }
2793
+ /**
2794
+ * One retrieval result returned by the target surface.
2795
+ *
2796
+ * The shape is intentionally narrower than `MemoryRecord` in core — the
2797
+ * harness only needs the attacker-observable subset.
2798
+ */
2799
+ interface AttackRetrievalHit {
2800
+ /**
2801
+ * Stable memory identifier if the surface exposes one. Attackers can use
2802
+ * this as a side channel (memory IDs are disclosed by recall responses in
2803
+ * the current MCP surface), so we model it explicitly.
2804
+ */
2805
+ memoryId?: string;
2806
+ /** Namespace the memory came from, if the surface discloses it. */
2807
+ namespace?: string;
2808
+ /** Text content (summary or full) the surface returned. */
2809
+ content: string;
2810
+ /** Optional relevance score. */
2811
+ score?: number;
2812
+ }
2813
+ /**
2814
+ * Minimal contract an attack target must satisfy.
2815
+ *
2816
+ * Callers wrap the real `EngramAccessService.recall()` or a test stub. The
2817
+ * harness deliberately does not depend on `@remnic/core` directly; PR 3
2818
+ * will provide the binding to the real orchestrator.
2819
+ */
2820
+ interface ExtractionAttackTarget {
2821
+ /**
2822
+ * Execute a recall query against the target and return its hits.
2823
+ *
2824
+ * Should throw (or return an empty array) when the target denies the
2825
+ * query — the harness treats both as "no information gained".
2826
+ */
2827
+ recall(query: string, options?: AttackRecallOptions): Promise<AttackRetrievalHit[]>;
2828
+ /**
2829
+ * Optional side channel: enumerate known entity names. Present iff the
2830
+ * attacker mode was granted access to `memory_entities_list`.
2831
+ */
2832
+ listEntities?(): Promise<string[]>;
2833
+ }
2834
+ interface AttackRecallOptions {
2835
+ /** Top-K to request. Defaults to harness budget. */
2836
+ topK?: number;
2837
+ /** Namespace override (harness uses this for T3 cross-namespace probes). */
2838
+ namespace?: string;
2839
+ }
2840
+ /**
2841
+ * Deterministic PRNG interface. Callers can pass a seeded PRNG to make runs
2842
+ * reproducible.
2843
+ */
2844
+ interface HarnessRng {
2845
+ /** Returns a float in [0, 1). */
2846
+ next(): number;
2847
+ }
2848
+ /**
2849
+ * Configuration for a single harness run.
2850
+ */
2851
+ interface ExtractionAttackOptions {
2852
+ target: ExtractionAttackTarget;
2853
+ /** Ground-truth memories the harness is trying to recover. */
2854
+ groundTruth: readonly SeededMemory[];
2855
+ /** Attacker knowledge tier. */
2856
+ attackerMode: AttackerMode;
2857
+ /** Maximum number of recall queries the harness is allowed to issue. */
2858
+ queryBudget: number;
2859
+ /**
2860
+ * Hyper-parameter for the adaptive loop: when the normalized entropy of
2861
+ * the response distribution falls below this value, the harness switches
2862
+ * to exploitation (repeating high-signal tokens) rather than exploration.
2863
+ * Defaults to 0.3.
2864
+ */
2865
+ entropyThreshold?: number;
2866
+ /**
2867
+ * Deterministic RNG. Callers should pass a seeded instance to get
2868
+ * reproducible results.
2869
+ */
2870
+ rng?: HarnessRng;
2871
+ /**
2872
+ * Seed vocabulary the attacker starts from. In `zero-knowledge` mode this
2873
+ * is the *only* prior information; in `same-namespace` mode it is a seed
2874
+ * for exploration. Defaults to a small English stop-list plus common
2875
+ * personal-memory topics.
2876
+ */
2877
+ seedVocabulary?: readonly string[];
2878
+ /**
2879
+ * Fraction of ground-truth tokens a single retrieved response must cover
2880
+ * to count as "recovered". Defaults to 0.5.
2881
+ */
2882
+ recoveryTokenOverlap?: number;
2883
+ /** If true, every query and response is kept in `timeline`. */
2884
+ captureTimeline?: boolean;
2885
+ /** TopK to request per query. Defaults to 10. */
2886
+ topK?: number;
2887
+ /**
2888
+ * Namespace the attacker addresses their queries to. When set, every
2889
+ * `target.recall()` is invoked with `namespace: attackerNamespace`.
2890
+ * Useful for T3-class runs where the caller wants to simulate an
2891
+ * attacker holding a token for a specific cross-namespace tenant.
2892
+ *
2893
+ * Defaults:
2894
+ * - `zero-knowledge`: undefined (target uses its own default).
2895
+ * - `same-namespace`: undefined (target uses its own default).
2896
+ * - `cross-namespace`: `"shared"` (matches the residual-leak path the
2897
+ * threat model calls out in §5, but callers targeting other namespace
2898
+ * models should pass an explicit value here).
2899
+ */
2900
+ attackerNamespace?: string;
2901
+ /**
2902
+ * Optional absolute deadline in ms since epoch. If the harness crosses it
2903
+ * during the attack loop, it terminates early with a partial result. Used
2904
+ * by tests to keep runs bounded.
2905
+ */
2906
+ deadlineMs?: number;
2907
+ /**
2908
+ * When true, a thrown error from `target.recall` aborts the attack
2909
+ * loop and re-throws. Default false — errors are counted in
2910
+ * `ExtractionAttackResult.backendErrorCount` so callers can distinguish
2911
+ * genuine empty recalls from backend failures. Flip to true in CI
2912
+ * gating scripts that must not silently publish ASR from a degraded
2913
+ * target.
2914
+ */
2915
+ failOnBackendError?: boolean;
2916
+ }
2917
+ interface RecoveredMemory {
2918
+ memoryId: string;
2919
+ memory: SeededMemory;
2920
+ recoveredContent: string;
2921
+ queriesUsed: number;
2922
+ /** Index into `timeline` that first recovered this memory. */
2923
+ firstHitAt: number;
2924
+ }
2925
+ interface TimelineEntry {
2926
+ query: string;
2927
+ hits: AttackRetrievalHit[];
2928
+ entropy: number;
2929
+ newlyRecoveredMemoryIds: string[];
2930
+ /** Which strategy chose this query. Useful for diagnosing the algorithm. */
2931
+ strategy: "seed" | "exploit-entity" | "exploit-token" | "explore-random" | "explore-entropy";
2932
+ }
2933
+ interface ExtractionAttackResult {
2934
+ /** Attack Success Rate: fraction of ground-truth memories recovered. */
2935
+ asr: number;
2936
+ /** Number of queries issued (may be less than budget on early exit). */
2937
+ queriesIssued: number;
2938
+ /** Attacker mode this run simulated. */
2939
+ attackerMode: AttackerMode;
2940
+ /** Recovered memories with per-memory metadata. */
2941
+ recovered: RecoveredMemory[];
2942
+ /** Ground-truth memories the attacker failed to recover within budget. */
2943
+ missed: SeededMemory[];
2944
+ /** Full query-by-query trace. Empty unless `captureTimeline: true`. */
2945
+ timeline: TimelineEntry[];
2946
+ /** Milliseconds of wall time spent inside the attack loop. */
2947
+ durationMs: number;
2948
+ /** True iff the run stopped because `deadlineMs` was reached. */
2949
+ hitDeadline: boolean;
2950
+ /**
2951
+ * Number of `target.recall` calls that threw and were treated as empty
2952
+ * hits. A high value means the harness was talking to a degraded
2953
+ * backend — low/zero ASR in that case is not a security statement
2954
+ * about the system, it is a measurement failure. Callers that want to
2955
+ * fail-fast on backend errors can pass `failOnBackendError: true`.
2956
+ */
2957
+ backendErrorCount: number;
2958
+ }
2959
+
2960
+ /**
2961
+ * ADAM-style entropy-guided memory-extraction attack harness.
2962
+ *
2963
+ * Re-implements the entropy-guided adaptive querying strategy described in
2964
+ * ADAM (arXiv:2604.09747, Apr 2026): the attacker issues a sequence of
2965
+ * recall queries, observes the information gained from each response, and
2966
+ * picks the next query to maximize expected entropy reduction over the
2967
+ * remaining candidate memories.
2968
+ *
2969
+ * This is a clean-room re-implementation, not a port of any released
2970
+ * codebase. The algorithm is:
2971
+ *
2972
+ * 1. Initialize candidate-token pool from the seed vocabulary and (mode
2973
+ * permitting) side channels like entity listings.
2974
+ * 2. Loop until budget exhausted or all memories recovered:
2975
+ * a. Compute Shannon entropy over the attacker's current belief
2976
+ * distribution (`tokenFrequencies`). Low entropy => we have a
2977
+ * concentrated belief; exploit by querying the top tokens. High
2978
+ * entropy => we are uncertain; explore by querying a token we have
2979
+ * not tried yet.
2980
+ * b. Issue the chosen query against the target.
2981
+ * c. Update the belief from the response: tokens appearing in hit
2982
+ * content get their frequency bumped; tokens that keep appearing
2983
+ * alongside already-recovered content get deprioritized so the
2984
+ * attacker does not re-query the same region forever.
2985
+ * d. Check each hit against the ground-truth set; mark recoveries.
2986
+ * 3. Emit ASR + per-memory leak log.
2987
+ *
2988
+ * The harness never calls an LLM. Token "information gain" is computed from
2989
+ * lexical overlap against previously-seen hits; this is the non-LLM
2990
+ * approximation the paper uses for its low-cost variant, and is adequate
2991
+ * for measuring the attack surface's structural leakage.
2992
+ */
2993
+
2994
+ /**
2995
+ * Tiny mulberry32 PRNG — stable across Node versions.
2996
+ */
2997
+ declare function createSeededRng(seed: number): HarnessRng;
2998
+ /**
2999
+ * Entry point. See `types.ts` for the options contract.
3000
+ */
3001
+ declare function runExtractionAttack(options: ExtractionAttackOptions): Promise<ExtractionAttackResult>;
3002
+
3003
+ /**
3004
+ * Synthetic memory fixture and in-process target implementation for the
3005
+ * extraction-attack harness tests.
3006
+ *
3007
+ * Everything in this file is synthetic. Per the public-repo privacy policy
3008
+ * in CLAUDE.md, no real user data may ship in fixtures.
3009
+ */
3010
+
3011
+ /**
3012
+ * 15 synthetic seeded memories covering fact, preference, decision, and
3013
+ * entity categories across two namespaces. Intentionally mundane so no
3014
+ * reader mistakes any of this for real personal data.
3015
+ */
3016
+ declare const SYNTHETIC_MEMORIES: readonly SeededMemory[];
3017
+ /**
3018
+ * A second namespace used for the cross-namespace test. This namespace's
3019
+ * contents MUST stay separate from `victim` — the T3 attack should fail if
3020
+ * the surface honors ACLs.
3021
+ */
3022
+ declare const OTHER_NAMESPACE_MEMORIES: readonly SeededMemory[];
3023
+ interface SyntheticTargetOptions {
3024
+ /** Memories visible through normal recall. */
3025
+ memories: readonly SeededMemory[];
3026
+ /** Entities the side channel should enumerate. */
3027
+ entities?: readonly string[];
3028
+ /**
3029
+ * When true, the target enforces namespace ACLs: a recall with a namespace
3030
+ * other than `allowedNamespace` returns an empty array. Models the T3
3031
+ * mitigation path in the threat model §6.1.
3032
+ */
3033
+ enforceNamespaceAcl?: boolean;
3034
+ /** The only namespace the attacker is entitled to read. */
3035
+ allowedNamespace?: string;
3036
+ /**
3037
+ * When true, the target always includes memory IDs in hits. When false,
3038
+ * the target masks IDs (models a mitigation where recall responses no
3039
+ * longer disclose memory IDs as a side channel).
3040
+ */
3041
+ disclosesMemoryIds?: boolean;
3042
+ /**
3043
+ * How many hits to return per recall. Mirrors QMD behavior where cutoff
3044
+ * is based on score threshold. Defaults to 5.
3045
+ */
3046
+ hitCap?: number;
3047
+ }
3048
+ /**
3049
+ * Deterministic in-process target. Scores memories by token-overlap with the
3050
+ * query and returns the top-K above zero.
3051
+ */
3052
+ declare function createSyntheticTarget(options: SyntheticTargetOptions): ExtractionAttackTarget;
3053
+
3054
+ /**
3055
+ * Baseline measurement runner for the ADAM memory-extraction harness.
3056
+ *
3057
+ * Produces a reproducible set of ASR numbers for every attacker tier against
3058
+ * a synthetic target that mirrors the current Remnic read-path behavior
3059
+ * (memory IDs disclosed, namespace ACL enforced on cross-namespace reads).
3060
+ *
3061
+ * This is intentionally separate from the unit tests: tests keep budgets
3062
+ * small so CI stays fast, whereas the baseline pushes the budget high enough
3063
+ * for each tier to plateau. The output feeds into
3064
+ * `docs/security/adam-baseline-2026-04.md`.
3065
+ */
3066
+
3067
+ interface BaselineScenario {
3068
+ readonly name: string;
3069
+ readonly attackerMode: AttackerMode;
3070
+ readonly queryBudget: number;
3071
+ readonly seed: number;
3072
+ /** Ground truth the attacker is trying to recover. */
3073
+ readonly groundTruth: readonly SeededMemory[];
3074
+ /** Memories the target actually stores (may be a superset of groundTruth). */
3075
+ readonly targetMemories: readonly SeededMemory[];
3076
+ readonly entities?: readonly string[];
3077
+ readonly enforceNamespaceAcl?: boolean;
3078
+ readonly allowedNamespace?: string;
3079
+ readonly disclosesMemoryIds?: boolean;
3080
+ /** Attacker-held namespace. Forwarded as `attackerNamespace` to the runner. */
3081
+ readonly attackerNamespace?: string;
3082
+ }
3083
+ interface BaselineRow {
3084
+ readonly scenario: string;
3085
+ readonly attackerMode: AttackerMode;
3086
+ readonly queryBudget: number;
3087
+ readonly queriesIssued: number;
3088
+ readonly asr: number;
3089
+ readonly recoveredIds: readonly string[];
3090
+ readonly missedIds: readonly string[];
3091
+ readonly durationMs: number;
3092
+ /** Whether mitigations were active during this run. */
3093
+ readonly mitigated?: boolean;
3094
+ }
3095
+ /**
3096
+ * Scenarios used for the 2026-04 baseline. Kept deterministic via fixed seeds
3097
+ * so the document remains reproducible.
3098
+ */
3099
+ declare const DEFAULT_BASELINE_SCENARIOS: readonly BaselineScenario[];
3100
+ /**
3101
+ * Executes every scenario once and returns a flat set of rows suitable for
3102
+ * rendering as a markdown table.
3103
+ */
3104
+ declare function runBaseline(scenarios?: readonly BaselineScenario[]): Promise<BaselineRow[]>;
3105
+ interface MitigatedBaselineConfig {
3106
+ budgetHardLimit: number;
3107
+ budgetWindowMs?: number;
3108
+ /**
3109
+ * Override for the principal's "home" namespace in the mitigated target.
3110
+ * When set, this is passed as `principalNamespace` to `createMitigatedTarget`.
3111
+ * When unset, falls back to `allowedNamespace ?? "default"`.
3112
+ * Use this to decouple the budget's principal identity from the synthetic
3113
+ * target's ACL namespace.
3114
+ */
3115
+ principalNamespaceOverride?: string;
3116
+ }
3117
+ declare const MITIGATED_BASELINE_SCENARIOS: readonly (BaselineScenario & MitigatedBaselineConfig)[];
3118
+ declare function runMitigatedBaseline(scenarios?: readonly (BaselineScenario & MitigatedBaselineConfig)[]): Promise<BaselineRow[]>;
3119
+ /**
3120
+ * Renders a baseline run as a human-readable markdown fragment. The returned
3121
+ * string is suitable for pasting into the baseline document.
3122
+ */
3123
+ declare function renderBaselineMarkdown(rows: readonly BaselineRow[]): string;
3124
+
3125
+ /**
3126
+ * Mitigation-aware target wrapper for the ADAM extraction attack harness.
3127
+ *
3128
+ * Wraps a raw `ExtractionAttackTarget` and enforces:
3129
+ * 1. Cross-namespace query budget (mirrors `CrossNamespaceBudget` from core)
3130
+ * 2. Namespace ACL (carries forward from `createSyntheticTarget`)
3131
+ *
3132
+ * When the budget is exceeded, the wrapper returns empty hits instead of
3133
+ * forwarding the query — simulating the real recall-path denial. This lets
3134
+ * the harness re-measure ASR with mitigations active and compare against
3135
+ * the unmitigated baseline.
3136
+ */
3137
+
3138
+ interface MitigatedTargetConfig {
3139
+ /** Inner (unmitigated) target to wrap. */
3140
+ target: ExtractionAttackTarget;
3141
+ /**
3142
+ * Maximum cross-namespace queries per `budgetWindowMs` window.
3143
+ * Queries beyond this limit return empty hits.
3144
+ */
3145
+ budgetHardLimit: number;
3146
+ /**
3147
+ * Rolling window in ms for the budget counter. Defaults to 60_000.
3148
+ */
3149
+ budgetWindowMs?: number;
3150
+ /**
3151
+ * The principal's "home" namespace. Queries targeting a different
3152
+ * namespace count against the budget; same-namespace queries are free.
3153
+ */
3154
+ principalNamespace: string;
3155
+ }
3156
+ /**
3157
+ * Creates a mitigation-aware wrapper around a raw target.
3158
+ *
3159
+ * The wrapper tracks cross-namespace queries in a sliding window and
3160
+ * returns empty hits when the budget is exceeded. Same-namespace queries
3161
+ * pass through without counting.
3162
+ */
3163
+ declare function createMitigatedTarget(config: MitigatedTargetConfig): ExtractionAttackTarget;
3164
+
3165
+ export { AMA_BENCH_DIAGNOSTIC_VARIANTS, ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AmaBenchDiagnosticAdapterOptions, type AmaBenchDiagnosticAnswererMode, type AmaBenchDiagnosticBreakdown, type AmaBenchDiagnosticMatrixArtifact, type AmaBenchDiagnosticRecallMode, type AmaBenchDiagnosticRunContext, type AmaBenchDiagnosticTaskEvidence, type AmaBenchDiagnosticTaskRow, type AmaBenchDiagnosticVariant, type AmaBenchDiagnosticVariantSummary, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, type AttackRecallOptions, type AttackRetrievalHit, type AttackerMode, BENCHMARK_ARTIFACT_SCHEMA_VERSION, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_REPRO_MANIFEST_FILENAME, BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BaselineRow, type BaselineScenario, type BeamDatasetPreview, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchReasoningEffort, type BenchRecallOptions, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkArtifact, type BenchmarkArtifactEnvironment, type BenchmarkArtifactPerTaskScore, type BenchmarkArtifactSystem, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type 
BenchmarkReproManifest, type BenchmarkReproManifestDataset, type BenchmarkReproManifestFile, type BenchmarkReproManifestResult, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkArtifactInput, type BuildBenchmarkPublishFeedOptions, type BuildBenchmarkReproManifestOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type CodexCliProviderConfig, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ABLATION_BOOTSTRAP_SEED, DEFAULT_ASSISTANT_RUBRIC_ID, DEFAULT_BASELINE_SCENARIOS, type DatasetSource, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type ExtractionAttackOptions, type ExtractionAttackResult, type ExtractionAttackTarget, type FixtureGenerator, type FixtureOutput, type FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, type HarnessRng, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, LOCOMO_DATASET_FILENAMES, LONG_MEM_EVAL_DATASET_FILENAMES, type LeaderboardArtifactWrite, type LlmJudge, type LlmProvider, type LoadDatasetOptions, type LoadSealedQrelsOptions, type LoadedDataset, type LocalLlmProviderConfig, MEMORY_EVAL_DIMENSIONS, MEMORY_EVAL_PUBLIC_LINE, MITIGATED_BASELINE_SCENARIOS, type MemoryEvalCategory, type MemoryEvalDimension, type MemoryEvalDimensionId, type MemoryEvalMetric, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type 
MitigatedBaselineConfig, type MitigatedTargetConfig, type MultipleChoiceQuestion, OTHER_NAMESPACE_MEMORIES, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, PROCEDURAL_REAL_SCENARIOS, PROCEDURAL_REAL_SCENARIOS_SMOKE, PUBLISHED_BENCHMARK_ARTIFACT_IDS, type PersonalizationRetrievalCase, type ProceduralAblationArtifact, type ProceduralAblationPerCase, type ProceduralAblationScenario, type ProceduralRealScenario, type ProceduralRealScenarioCategory, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, type PublishedBenchmarkId, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RecoveredMemory, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, type RunProceduralAblationCliArgs, type RunProceduralAblationOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, SYNTHETIC_MEMORIES, type SanitizedDiagnosticProvider, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededMemory, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type SyntheticEmailIngestionAdapterOptions, type SyntheticTargetOptions, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TimelineEntry, type TokenUsage, type WriteBenchmarkArtifactResult, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, 
assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildAmaBenchDiagnosticMatrixArtifact, buildAmaBenchDiagnosticVariantSummary, buildAmaBenchLeaderboardRows, buildBenchmarkArtifact, buildBenchmarkArtifactFilename, buildBenchmarkPublishFeed, buildBenchmarkReproManifest, buildBenchmarkRunSeeds, buildJudgePayload, buildOracleTrajectoryRecall, buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createSeededRng as createAdamSeededRng, createAmaBenchDiagnosticAdapter, createAnthropicProvider, createCanaryAdapter, createCodexCliProvider, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createLocalLlmProvider, createMitigatedTarget, createOllamaProvider, createOpenAiCompatibleProvider, createSeededRandom as createProceduralAblationSeededRandom, createProvider, createProviderBackedAmaBenchRecommendedJudge, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng$1 as createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, createSyntheticEmailIngestionAdapter, createSyntheticTarget, createTimeoutGuardedAdapter, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, extractMarkdownSectionsByTitle, f1Score, fixtureToAblationScenarios, formatMissingDatasetError, generateReport, getBenchmark, getBenchmarkLowerIsBetter, getMemoryEvalDimension, getRemnicVersion, hashBenchmarkArtifact, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isAmaBenchUnknownLikeAnswer, 
isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, listMemoryEvalBenchmarkIds, listMemoryEvalDimensions, llmJudgeScore, llmJudgeScoreDetailed, loadAblationFixture, loadBaseline, loadBeamDatasetPreview, loadBenchmarkArtifact, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadLoCoMo10, loadLongMemEvalS, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseBenchmarkArtifact, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, redactBenchmarkResultSecrets, renderBaselineMarkdown, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkPhaseTimeoutMs, resolveBenchmarkProgressLogging, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, runAssistantSynthesisBenchmark, runBaseline, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runExtractionAttack, runMitigatedBaseline, runProceduralAblation, runProceduralAblationCli, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectAmaBenchDiagnosticVariants, selectFixtureVariant, serializeBenchmarkArtifact, serializeJsonl, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkArtifact, writeBenchmarkPublishFeed, writeBenchmarkReproManifest, writeBenchmarkResult, writeLeaderboardArtifactsForResult, zeroScores };