@remnic/bench 1.0.1 → 9.3.515
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +103 -0
- package/baselines/procedural-recall-baseline.json +198 -0
- package/dist/index.d.ts +1556 -151
- package/dist/index.js +24292 -6309
- package/package.json +13 -6
package/dist/index.d.ts
CHANGED
|
@@ -44,6 +44,8 @@ interface ExtractedPage {
|
|
|
44
44
|
frontmatter: Record<string, unknown>;
|
|
45
45
|
hasExecSummary: boolean;
|
|
46
46
|
hasTimeline: boolean;
|
|
47
|
+
/** Source corpus file references that support this page's generated claims. */
|
|
48
|
+
sourceRefs?: string[];
|
|
47
49
|
seeAlso: string[];
|
|
48
50
|
content: string;
|
|
49
51
|
}
|
|
@@ -72,6 +74,8 @@ declare const REQUIRED_FRONTMATTER_FIELDS: readonly ["title", "type", "state", "
|
|
|
72
74
|
interface Message {
|
|
73
75
|
role: "user" | "assistant" | "system";
|
|
74
76
|
content: string;
|
|
77
|
+
/** Optional source timestamp for benchmarks with historical query times. */
|
|
78
|
+
timestamp?: string;
|
|
75
79
|
}
|
|
76
80
|
interface SearchResult {
|
|
77
81
|
turnIndex: number;
|
|
@@ -84,6 +88,7 @@ interface MemoryStats {
|
|
|
84
88
|
totalMessages: number;
|
|
85
89
|
totalSummaryNodes: number;
|
|
86
90
|
maxDepth: number;
|
|
91
|
+
maxTurnIndex?: number;
|
|
87
92
|
}
|
|
88
93
|
interface BenchResponse {
|
|
89
94
|
text: string;
|
|
@@ -94,8 +99,11 @@ interface BenchResponse {
|
|
|
94
99
|
latencyMs: number;
|
|
95
100
|
model: string;
|
|
96
101
|
}
|
|
102
|
+
interface BenchPhaseControl {
|
|
103
|
+
signal?: AbortSignal;
|
|
104
|
+
}
|
|
97
105
|
interface BenchResponder {
|
|
98
|
-
respond(question: string, recalledText: string): Promise<BenchResponse>;
|
|
106
|
+
respond(question: string, recalledText: string, control?: BenchPhaseControl): Promise<BenchResponse>;
|
|
99
107
|
}
|
|
100
108
|
interface BenchJudgeResult {
|
|
101
109
|
score: number;
|
|
@@ -107,19 +115,32 @@ interface BenchJudgeResult {
|
|
|
107
115
|
model?: string;
|
|
108
116
|
}
|
|
109
117
|
interface BenchJudge {
|
|
110
|
-
score(question: string, predicted: string, expected: string): Promise<number>;
|
|
111
|
-
scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
|
|
118
|
+
score(question: string, predicted: string, expected: string, control?: BenchPhaseControl): Promise<number>;
|
|
119
|
+
scoreWithMetrics?(question: string, predicted: string, expected: string, control?: BenchPhaseControl): Promise<BenchJudgeResult>;
|
|
120
|
+
/**
|
|
121
|
+
* Run a benchmark-supplied yes/no judging prompt directly and return a
|
|
122
|
+
* normalized 0/1 score. Published benchmarks such as LongMemEval define
|
|
123
|
+
* their own evaluator prompt; routing those through the scalar generic
|
|
124
|
+
* judge prompt would change the metric contract.
|
|
125
|
+
*/
|
|
126
|
+
scoreBinaryPrompt?(prompt: string, control?: BenchPhaseControl): Promise<BenchJudgeResult>;
|
|
112
127
|
}
|
|
113
128
|
interface BenchMemoryAdapter {
|
|
114
|
-
store(sessionId: string, messages: Message[]): Promise<void>;
|
|
115
|
-
recall(sessionId: string, query: string, budgetChars?: number): Promise<string>;
|
|
116
|
-
search(query: string, limit: number, sessionId?: string): Promise<SearchResult[]>;
|
|
117
|
-
reset(sessionId?: string): Promise<void>;
|
|
118
|
-
getStats(sessionId?: string): Promise<MemoryStats>;
|
|
129
|
+
store(sessionId: string, messages: Message[], control?: BenchPhaseControl): Promise<void>;
|
|
130
|
+
recall(sessionId: string, query: string, budgetChars?: number, options?: BenchRecallOptions, control?: BenchPhaseControl): Promise<string>;
|
|
131
|
+
search(query: string, limit: number, sessionId?: string, control?: BenchPhaseControl): Promise<SearchResult[]>;
|
|
132
|
+
reset(sessionId?: string, control?: BenchPhaseControl): Promise<void>;
|
|
133
|
+
getStats(sessionId?: string, control?: BenchPhaseControl): Promise<MemoryStats>;
|
|
134
|
+
/** Wait for background summarization (e.g. LCM) to finish after store(). */
|
|
135
|
+
drain?(control?: BenchPhaseControl): Promise<void>;
|
|
119
136
|
destroy(): Promise<void>;
|
|
120
137
|
responder?: BenchResponder;
|
|
121
138
|
judge?: BenchJudge;
|
|
122
139
|
}
|
|
140
|
+
interface BenchRecallOptions {
|
|
141
|
+
/** Optional historical recall timestamp for benchmarks that expose query time. */
|
|
142
|
+
asOf?: string;
|
|
143
|
+
}
|
|
123
144
|
type LlmJudge = BenchJudge;
|
|
124
145
|
type MemorySystem = BenchMemoryAdapter;
|
|
125
146
|
|
|
@@ -189,11 +210,37 @@ type BenchmarkTier = "published" | "remnic" | "custom";
|
|
|
189
210
|
type BenchmarkStatus = "ready" | "planned";
|
|
190
211
|
type BenchmarkCategory = "agentic" | "retrieval" | "conversational" | "ingestion";
|
|
191
212
|
type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain";
|
|
192
|
-
type
|
|
213
|
+
type AmaBenchJudgeProtocol = "default" | "recommended";
|
|
214
|
+
/**
|
|
215
|
+
* Built-in LLM providers supported by the bench harness.
|
|
216
|
+
*
|
|
217
|
+
* `local-llm` targets a user-hosted OpenAI-compatible endpoint
|
|
218
|
+
* (llama.cpp, vLLM, LM Studio, etc.) via `--base-url`. It mirrors
|
|
219
|
+
* the `localLlm*` plugin config on the Remnic core side so that
|
|
220
|
+
* `remnic bench published --provider local-llm` actually exercises
|
|
221
|
+
* the same transport path as the running plugin. Issue #566 slice 5.
|
|
222
|
+
*
|
|
223
|
+
* `codex-cli` shells out to `codex exec` as an isolated benchmark-only
|
|
224
|
+
* responder/judge target. It is intentionally not routed through Remnic
|
|
225
|
+
* memory or OpenClaw gateway state.
|
|
226
|
+
*/
|
|
227
|
+
type BuiltInProvider = "openai" | "anthropic" | "ollama" | "litellm" | "local-llm" | "codex-cli";
|
|
228
|
+
type BenchReasoningEffort = "low" | "medium" | "high" | "xhigh";
|
|
193
229
|
interface ProviderConfig {
|
|
194
230
|
provider: BuiltInProvider;
|
|
195
231
|
model: string;
|
|
196
232
|
baseUrl?: string;
|
|
233
|
+
apiKey?: string;
|
|
234
|
+
retryOptions?: {
|
|
235
|
+
maxAttempts?: number;
|
|
236
|
+
baseBackoffMs?: number;
|
|
237
|
+
timeoutMs?: number;
|
|
238
|
+
max429WaitMs?: number;
|
|
239
|
+
};
|
|
240
|
+
disableThinking?: boolean;
|
|
241
|
+
reasoningEffort?: BenchReasoningEffort;
|
|
242
|
+
responderContextBudgetChars?: number;
|
|
243
|
+
responderPromptBudgetChars?: number;
|
|
197
244
|
}
|
|
198
245
|
interface TaskTokenUsage {
|
|
199
246
|
input: number;
|
|
@@ -278,13 +325,19 @@ interface BenchmarkResult {
|
|
|
278
325
|
* Must stay below the benchmark's canary floor.
|
|
279
326
|
*/
|
|
280
327
|
canaryScore?: number;
|
|
328
|
+
/** "partial" if the benchmark was interrupted; absent or "complete" otherwise. */
|
|
329
|
+
status?: "complete" | "partial";
|
|
330
|
+
/** If partial, the error that caused interruption. */
|
|
331
|
+
failureReason?: string;
|
|
281
332
|
};
|
|
282
333
|
config: {
|
|
283
334
|
runtimeProfile?: BenchRuntimeProfile | null;
|
|
284
335
|
systemProvider: ProviderConfig | null;
|
|
285
336
|
judgeProvider: ProviderConfig | null;
|
|
337
|
+
internalProvider?: ProviderConfig | null;
|
|
286
338
|
adapterMode: string;
|
|
287
339
|
remnicConfig: Record<string, unknown>;
|
|
340
|
+
benchmarkOptions?: Record<string, unknown>;
|
|
288
341
|
};
|
|
289
342
|
cost: {
|
|
290
343
|
totalTokens: number;
|
|
@@ -339,7 +392,15 @@ interface RunBenchmarkOptions {
|
|
|
339
392
|
ingestionAdapter?: IngestionBenchAdapter;
|
|
340
393
|
systemProvider?: ProviderConfig | null;
|
|
341
394
|
judgeProvider?: ProviderConfig | null;
|
|
395
|
+
internalProvider?: ProviderConfig | null;
|
|
342
396
|
remnicConfig?: Record<string, unknown>;
|
|
397
|
+
benchmarkOptions?: Record<string, unknown>;
|
|
398
|
+
drainTimeoutMs?: number;
|
|
399
|
+
amaBenchJudgeProtocol?: AmaBenchJudgeProtocol;
|
|
400
|
+
amaBenchCrossJudge?: BenchJudge;
|
|
401
|
+
amaBenchCrossJudgeProvider?: ProviderConfig | null;
|
|
402
|
+
/** Called after each task completes for progress logging and partial result tracking. */
|
|
403
|
+
onTaskComplete?: (task: TaskResult, completedCount: number, totalCount?: number) => void;
|
|
343
404
|
}
|
|
344
405
|
interface ResolvedRunBenchmarkOptions extends RunBenchmarkOptions {
|
|
345
406
|
mode: BenchmarkMode;
|
|
@@ -457,13 +518,70 @@ interface FixtureGenerator {
|
|
|
457
518
|
|
|
458
519
|
interface RemnicAdapterOptions {
|
|
459
520
|
configOverrides?: Record<string, unknown>;
|
|
521
|
+
memoryDir?: string;
|
|
460
522
|
preserveRuntimeDefaults?: boolean;
|
|
461
523
|
responder?: BenchResponder;
|
|
462
524
|
judge?: BenchJudge;
|
|
525
|
+
drainTimeoutMs?: number;
|
|
526
|
+
replayExtractionMode?: "await" | "background" | "skip";
|
|
527
|
+
replaySourceValidAtMode?: "historical" | "batch";
|
|
528
|
+
sandboxDir?: string;
|
|
463
529
|
}
|
|
464
530
|
declare const createLightweightAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
|
|
465
531
|
declare const createRemnicAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
|
|
466
532
|
|
|
533
|
+
interface TimeoutGuardOptions {
|
|
534
|
+
benchmarkId: string;
|
|
535
|
+
timeoutMs?: number;
|
|
536
|
+
drainTimeoutMs?: number;
|
|
537
|
+
logProgress?: boolean;
|
|
538
|
+
log?: (message: string) => void;
|
|
539
|
+
onTimeout?: (phase: string) => void | Promise<void>;
|
|
540
|
+
}
|
|
541
|
+
interface TimeoutGuardConfig {
|
|
542
|
+
remnicConfig?: Record<string, unknown>;
|
|
543
|
+
systemProvider?: ProviderConfig | null;
|
|
544
|
+
judgeProvider?: ProviderConfig | null;
|
|
545
|
+
}
|
|
546
|
+
declare function resolveBenchmarkPhaseTimeoutMs(config: TimeoutGuardConfig): number | undefined;
|
|
547
|
+
declare function resolveBenchmarkProgressLogging(remnicConfig?: Record<string, unknown>): boolean;
|
|
548
|
+
declare function createTimeoutGuardedAdapter(adapter: BenchMemoryAdapter, options: TimeoutGuardOptions): BenchMemoryAdapter;
|
|
549
|
+
|
|
550
|
+
interface SyntheticEmailIngestionAdapterOptions {
|
|
551
|
+
system?: BenchMemoryAdapter;
|
|
552
|
+
}
|
|
553
|
+
/**
|
|
554
|
+
* Isolated ingestion adapter for the synthetic email fixture benchmarks.
|
|
555
|
+
*
|
|
556
|
+
* It writes the raw source corpus through the benchmark's Remnic memory
|
|
557
|
+
* adapter when one is supplied, then exposes the extracted fixture graph in
|
|
558
|
+
* the IngestionBenchAdapter shape expected by the scoring tier. This keeps
|
|
559
|
+
* the ingestion benchmarks runnable in isolated benchmark jobs without
|
|
560
|
+
* touching a production Remnic instance.
|
|
561
|
+
*/
|
|
562
|
+
declare function createSyntheticEmailIngestionAdapter(options?: SyntheticEmailIngestionAdapterOptions): IngestionBenchAdapter;
|
|
563
|
+
|
|
564
|
+
declare const MEMORY_EVAL_PUBLIC_LINE: "Agent memory without evals is vibes with a database.";
|
|
565
|
+
type MemoryEvalDimensionId = "repeated_context_reduction" | "unnecessary_clarification_reduction" | "retrieval_correctness" | "stale_memory_harm" | "scope_respect" | "ask_when_needed" | "act_when_enough_context" | "personalization_quality";
|
|
566
|
+
type MemoryEvalCategory = "context-efficiency" | "retrieval-quality" | "boundary-respect" | "action-confidence" | "personalization";
|
|
567
|
+
interface MemoryEvalMetric {
|
|
568
|
+
name: string;
|
|
569
|
+
higherIsBetter: boolean;
|
|
570
|
+
description: string;
|
|
571
|
+
}
|
|
572
|
+
interface MemoryEvalDimension {
|
|
573
|
+
id: MemoryEvalDimensionId;
|
|
574
|
+
question: string;
|
|
575
|
+
category: MemoryEvalCategory;
|
|
576
|
+
metrics: readonly MemoryEvalMetric[];
|
|
577
|
+
quickBenchmarkIds: readonly string[];
|
|
578
|
+
fullModeGuidance: string;
|
|
579
|
+
}
|
|
580
|
+
declare const MEMORY_EVAL_DIMENSIONS: readonly MemoryEvalDimension[];
|
|
581
|
+
declare function listMemoryEvalDimensions(): readonly MemoryEvalDimension[];
|
|
582
|
+
declare function getMemoryEvalDimension(id: MemoryEvalDimensionId): MemoryEvalDimension;
|
|
583
|
+
declare function listMemoryEvalBenchmarkIds(): string[];
|
|
584
|
+
|
|
467
585
|
/**
|
|
468
586
|
* Minimal LLM provider contract for the bench engine.
|
|
469
587
|
*/
|
|
@@ -473,6 +591,7 @@ interface CompletionOpts {
|
|
|
473
591
|
temperature?: number;
|
|
474
592
|
maxTokens?: number;
|
|
475
593
|
headers?: Record<string, string>;
|
|
594
|
+
signal?: AbortSignal;
|
|
476
595
|
}
|
|
477
596
|
interface CompletionResult {
|
|
478
597
|
text: string;
|
|
@@ -496,6 +615,26 @@ interface ProviderBaseConfig {
|
|
|
496
615
|
baseUrl?: string;
|
|
497
616
|
apiKey?: string;
|
|
498
617
|
headers?: Record<string, string>;
|
|
618
|
+
retryOptions?: {
|
|
619
|
+
maxAttempts?: number;
|
|
620
|
+
baseBackoffMs?: number;
|
|
621
|
+
timeoutMs?: number;
|
|
622
|
+
max429WaitMs?: number;
|
|
623
|
+
};
|
|
624
|
+
/** Suppress thinking/reasoning tokens for thinking-capable models (Qwen 3.5, Gemma 4, DeepSeek). */
|
|
625
|
+
disableThinking?: boolean;
|
|
626
|
+
/**
|
|
627
|
+
* Optional answering-only memory-context budget. Benchmark artifacts keep the
|
|
628
|
+
* full recalled text, but provider-backed responders may receive this compact
|
|
629
|
+
* deterministic view to avoid transport-specific prompt stalls.
|
|
630
|
+
*/
|
|
631
|
+
responderContextBudgetChars?: number;
|
|
632
|
+
/**
|
|
633
|
+
* Optional answering-only question/protocol budget. This keeps the original
|
|
634
|
+
* benchmark question and artifact unchanged while shortening repeated harness
|
|
635
|
+
* instructions for slow transport-backed responders such as Codex CLI.
|
|
636
|
+
*/
|
|
637
|
+
responderPromptBudgetChars?: number;
|
|
499
638
|
}
|
|
500
639
|
interface OpenAiCompatibleProviderConfig extends ProviderBaseConfig {
|
|
501
640
|
provider?: "openai" | "litellm";
|
|
@@ -507,12 +646,46 @@ interface AnthropicProviderConfig extends ProviderBaseConfig {
|
|
|
507
646
|
interface OllamaProviderConfig extends ProviderBaseConfig {
|
|
508
647
|
provider?: "ollama";
|
|
509
648
|
}
|
|
649
|
+
/**
|
|
650
|
+
* `local-llm` targets a user-hosted OpenAI-compatible endpoint
|
|
651
|
+
* (llama.cpp, vLLM, LM Studio, etc.). `baseUrl` is required at the
|
|
652
|
+
* CLI layer — it mirrors the plugin's `localLlmUrl` config and is
|
|
653
|
+
* what tells the bench which local server to talk to. The transport
|
|
654
|
+
* is intentionally OpenAI-compatible: `/v1/chat/completions` +
|
|
655
|
+
* `/v1/models`. Issue #566 slice 5.
|
|
656
|
+
*/
|
|
657
|
+
interface LocalLlmProviderConfig extends ProviderBaseConfig {
|
|
658
|
+
provider?: "local-llm";
|
|
659
|
+
baseUrl: string;
|
|
660
|
+
}
|
|
661
|
+
interface CodexCliProviderConfig extends ProviderBaseConfig {
|
|
662
|
+
provider?: "codex-cli";
|
|
663
|
+
/** Codex CLI model reasoning effort. Bench CLI defaults this to xhigh. */
|
|
664
|
+
reasoningEffort?: BenchReasoningEffort;
|
|
665
|
+
/** Optional executable override for tests or non-standard Codex CLI installs. */
|
|
666
|
+
executable?: string;
|
|
667
|
+
/**
|
|
668
|
+
* Optional diagnostics artifact directory. When set, the provider writes
|
|
669
|
+
* per-call metadata that helps debug slow benchmark completions without
|
|
670
|
+
* depending on transient temp workspaces.
|
|
671
|
+
*/
|
|
672
|
+
diagnosticsDir?: string;
|
|
673
|
+
/**
|
|
674
|
+
* `metadata` stores hashes/counts only. `full` additionally stores the full
|
|
675
|
+
* benchmark prompt and should only be used for isolated benchmark datasets.
|
|
676
|
+
*/
|
|
677
|
+
diagnosticsMode?: "metadata" | "full";
|
|
678
|
+
}
|
|
510
679
|
type ProviderFactoryConfig = (OpenAiCompatibleProviderConfig & {
|
|
511
680
|
provider: "openai" | "litellm";
|
|
512
681
|
}) | (AnthropicProviderConfig & {
|
|
513
682
|
provider: "anthropic";
|
|
514
683
|
}) | (OllamaProviderConfig & {
|
|
515
684
|
provider: "ollama";
|
|
685
|
+
}) | (LocalLlmProviderConfig & {
|
|
686
|
+
provider: "local-llm";
|
|
687
|
+
}) | (CodexCliProviderConfig & {
|
|
688
|
+
provider: "codex-cli";
|
|
516
689
|
});
|
|
517
690
|
interface ProviderDiscoveryResult {
|
|
518
691
|
provider: BuiltInProvider;
|
|
@@ -629,6 +802,15 @@ declare const BENCHMARK_RESULT_SCHEMA: {
|
|
|
629
802
|
readonly baseUrl: {
|
|
630
803
|
readonly type: "string";
|
|
631
804
|
};
|
|
805
|
+
readonly reasoningEffort: {
|
|
806
|
+
readonly type: "string";
|
|
807
|
+
};
|
|
808
|
+
readonly responderContextBudgetChars: {
|
|
809
|
+
readonly type: "number";
|
|
810
|
+
};
|
|
811
|
+
readonly responderPromptBudgetChars: {
|
|
812
|
+
readonly type: "number";
|
|
813
|
+
};
|
|
632
814
|
};
|
|
633
815
|
}];
|
|
634
816
|
};
|
|
@@ -648,6 +830,31 @@ declare const BENCHMARK_RESULT_SCHEMA: {
|
|
|
648
830
|
readonly baseUrl: {
|
|
649
831
|
readonly type: "string";
|
|
650
832
|
};
|
|
833
|
+
readonly reasoningEffort: {
|
|
834
|
+
readonly type: "string";
|
|
835
|
+
};
|
|
836
|
+
};
|
|
837
|
+
}];
|
|
838
|
+
};
|
|
839
|
+
readonly internalProvider: {
|
|
840
|
+
readonly anyOf: readonly [{
|
|
841
|
+
readonly type: "null";
|
|
842
|
+
}, {
|
|
843
|
+
readonly type: "object";
|
|
844
|
+
readonly required: readonly ["provider", "model"];
|
|
845
|
+
readonly properties: {
|
|
846
|
+
readonly provider: {
|
|
847
|
+
readonly type: "string";
|
|
848
|
+
};
|
|
849
|
+
readonly model: {
|
|
850
|
+
readonly type: "string";
|
|
851
|
+
};
|
|
852
|
+
readonly baseUrl: {
|
|
853
|
+
readonly type: "string";
|
|
854
|
+
};
|
|
855
|
+
readonly reasoningEffort: {
|
|
856
|
+
readonly type: "string";
|
|
857
|
+
};
|
|
651
858
|
};
|
|
652
859
|
}];
|
|
653
860
|
};
|
|
@@ -752,11 +959,315 @@ declare const BENCHMARK_RESULT_SCHEMA: {
|
|
|
752
959
|
};
|
|
753
960
|
};
|
|
754
961
|
|
|
962
|
+
declare const BENCHMARK_REPRO_MANIFEST_FILENAME = "MANIFEST.json";
|
|
963
|
+
declare const BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION = 1;
|
|
964
|
+
interface BenchmarkReproManifestFile {
|
|
965
|
+
path: string;
|
|
966
|
+
kind: "file" | "symlink";
|
|
967
|
+
sizeBytes: number;
|
|
968
|
+
sha256: string;
|
|
969
|
+
target?: string;
|
|
970
|
+
}
|
|
971
|
+
interface BenchmarkReproManifestDataset {
|
|
972
|
+
benchmark: string;
|
|
973
|
+
status: "not-provided" | "missing" | "hashed";
|
|
974
|
+
path?: string;
|
|
975
|
+
realpath?: string;
|
|
976
|
+
fileCount: number;
|
|
977
|
+
totalBytes: number;
|
|
978
|
+
sha256?: string;
|
|
979
|
+
files: BenchmarkReproManifestFile[];
|
|
980
|
+
}
|
|
981
|
+
interface BenchmarkReproManifestResult {
|
|
982
|
+
path: string;
|
|
983
|
+
sha256: string;
|
|
984
|
+
sizeBytes: number;
|
|
985
|
+
resultId: string;
|
|
986
|
+
benchmark: string;
|
|
987
|
+
mode: BenchmarkMode;
|
|
988
|
+
gitSha: string;
|
|
989
|
+
runCount: number;
|
|
990
|
+
seeds: number[];
|
|
991
|
+
taskCount: number;
|
|
992
|
+
configHash: string;
|
|
993
|
+
}
|
|
994
|
+
interface BenchmarkReproManifest {
|
|
995
|
+
schemaVersion: number;
|
|
996
|
+
generatedAt: string;
|
|
997
|
+
run: {
|
|
998
|
+
id: string;
|
|
999
|
+
mode?: BenchmarkMode;
|
|
1000
|
+
selectedBenchmarks: string[];
|
|
1001
|
+
runtimeProfiles: string[];
|
|
1002
|
+
selectedWorkItems: Array<{
|
|
1003
|
+
benchmark: string;
|
|
1004
|
+
runtimeProfile: string;
|
|
1005
|
+
}>;
|
|
1006
|
+
limit?: number;
|
|
1007
|
+
seed?: number;
|
|
1008
|
+
};
|
|
1009
|
+
git: {
|
|
1010
|
+
commit: string;
|
|
1011
|
+
shortCommit: string;
|
|
1012
|
+
dirty: boolean;
|
|
1013
|
+
dirtyEntryCount: number;
|
|
1014
|
+
};
|
|
1015
|
+
command: {
|
|
1016
|
+
cwd: string;
|
|
1017
|
+
argv: string[];
|
|
1018
|
+
envKeys: string[];
|
|
1019
|
+
};
|
|
1020
|
+
environment: {
|
|
1021
|
+
platform: NodeJS.Platform;
|
|
1022
|
+
arch: string;
|
|
1023
|
+
nodeVersion: string;
|
|
1024
|
+
hostname: string;
|
|
1025
|
+
packageManager?: string;
|
|
1026
|
+
};
|
|
1027
|
+
qmd?: {
|
|
1028
|
+
configDir?: string;
|
|
1029
|
+
cacheDir?: string;
|
|
1030
|
+
collections: string[];
|
|
1031
|
+
};
|
|
1032
|
+
configFiles: Array<{
|
|
1033
|
+
label: string;
|
|
1034
|
+
path: string;
|
|
1035
|
+
sha256?: string;
|
|
1036
|
+
sizeBytes?: number;
|
|
1037
|
+
missing?: boolean;
|
|
1038
|
+
}>;
|
|
1039
|
+
datasets: BenchmarkReproManifestDataset[];
|
|
1040
|
+
results: BenchmarkReproManifestResult[];
|
|
1041
|
+
artifactHash: string;
|
|
1042
|
+
}
|
|
1043
|
+
interface BuildBenchmarkReproManifestOptions {
|
|
1044
|
+
resultPaths?: string[];
|
|
1045
|
+
runId?: string;
|
|
1046
|
+
selectedBenchmarks?: string[];
|
|
1047
|
+
runtimeProfiles?: string[];
|
|
1048
|
+
selectedWorkItems?: Array<{
|
|
1049
|
+
benchmark: string;
|
|
1050
|
+
runtimeProfile: string;
|
|
1051
|
+
}>;
|
|
1052
|
+
mode?: BenchmarkMode;
|
|
1053
|
+
limit?: number;
|
|
1054
|
+
seed?: number;
|
|
1055
|
+
datasetDirs?: Record<string, string | undefined>;
|
|
1056
|
+
command?: {
|
|
1057
|
+
cwd?: string;
|
|
1058
|
+
argv?: string[];
|
|
1059
|
+
env?: NodeJS.ProcessEnv;
|
|
1060
|
+
envKeys?: string[];
|
|
1061
|
+
};
|
|
1062
|
+
configFiles?: Array<{
|
|
1063
|
+
label: string;
|
|
1064
|
+
path?: string;
|
|
1065
|
+
}>;
|
|
1066
|
+
qmd?: {
|
|
1067
|
+
configDir?: string;
|
|
1068
|
+
cacheDir?: string;
|
|
1069
|
+
collections?: string[];
|
|
1070
|
+
};
|
|
1071
|
+
}
|
|
1072
|
+
declare function buildBenchmarkReproManifest(resultsDir: string, options?: BuildBenchmarkReproManifestOptions): Promise<BenchmarkReproManifest>;
|
|
1073
|
+
declare function writeBenchmarkReproManifest(resultsDir: string, options?: BuildBenchmarkReproManifestOptions): Promise<string>;
|
|
1074
|
+
|
|
1075
|
+
/**
|
|
1076
|
+
* Public leaderboard artifact schema for published benchmarks.
|
|
1077
|
+
*
|
|
1078
|
+
* `BenchmarkArtifact` is deliberately flatter and more opinionated than
|
|
1079
|
+
* the internal `BenchmarkResult`. The goal is a stable, versioned payload
|
|
1080
|
+
* that Remnic.ai and third-party leaderboard consumers can rely on
|
|
1081
|
+
* without digging into every per-task field the internal runner captures.
|
|
1082
|
+
*
|
|
1083
|
+
* One artifact is written per run to
|
|
1084
|
+
* docs/benchmarks/results/<iso-date>-<benchmark>-<model>-<gitShaShort>.json
|
|
1085
|
+
* (gitignored during development; promoted per-release by slice 6).
|
|
1086
|
+
*
|
|
1087
|
+
* Any breaking change to the artifact shape requires a `schemaVersion`
|
|
1088
|
+
* bump. The companion `buildBenchmarkArtifact()` and
|
|
1089
|
+
* `writeBenchmarkArtifact()` functions in this file emit the current
|
|
1090
|
+
* version; `parseBenchmarkArtifact()` rejects unknown versions.
|
|
1091
|
+
*/
|
|
1092
|
+
|
|
1093
|
+
/**
|
|
1094
|
+
* Current artifact schema version. Bump when the serialized shape
|
|
1095
|
+
* changes in a way that breaks existing leaderboard consumers.
|
|
1096
|
+
*
|
|
1097
|
+
* History:
|
|
1098
|
+
* 1 — initial schema (issue #566).
|
|
1099
|
+
*/
|
|
1100
|
+
declare const BENCHMARK_ARTIFACT_SCHEMA_VERSION: 1;
|
|
1101
|
+
/** Identifiers of published-benchmark runners that can emit public artifacts. */
|
|
1102
|
+
declare const PUBLISHED_BENCHMARK_ARTIFACT_IDS: readonly ["ama-bench", "memory-arena", "amemgym", "longmemeval", "locomo", "beam", "personamem", "memoryagentbench", "membench"];
|
|
1103
|
+
/** Identifier of a published-benchmark runner. */
|
|
1104
|
+
type PublishedBenchmarkId = (typeof PUBLISHED_BENCHMARK_ARTIFACT_IDS)[number];
|
|
1105
|
+
interface BenchmarkArtifactSystem {
|
|
1106
|
+
/** Short product name, e.g. "remnic". */
|
|
1107
|
+
name: string;
|
|
1108
|
+
/** Semver of `@remnic/core` at run time. */
|
|
1109
|
+
version: string;
|
|
1110
|
+
/** Short git SHA of the repository producing the artifact. */
|
|
1111
|
+
gitSha: string;
|
|
1112
|
+
}
|
|
1113
|
+
interface BenchmarkArtifactEnvironment {
|
|
1114
|
+
/** Node.js version reported by `process.version` at run time. */
|
|
1115
|
+
node: string;
|
|
1116
|
+
/** `process.platform` at run time (linux/darwin/win32/...). */
|
|
1117
|
+
os: string;
|
|
1118
|
+
/** Optional CPU architecture (arm64/x64/...). */
|
|
1119
|
+
arch?: string;
|
|
1120
|
+
}
|
|
1121
|
+
interface BenchmarkArtifactPerTaskScore {
|
|
1122
|
+
/** Runner-assigned task ID (stable across reruns). */
|
|
1123
|
+
taskId: string;
|
|
1124
|
+
/** Task-level scores keyed by metric name (e.g. f1, llm_judge). */
|
|
1125
|
+
scores: Record<string, number>;
|
|
1126
|
+
/** Optional task category / bucket for group-by reports. */
|
|
1127
|
+
category?: string;
|
|
1128
|
+
}
|
|
1129
|
+
interface BenchmarkArtifact {
|
|
1130
|
+
/** Artifact schema version. See `BENCHMARK_ARTIFACT_SCHEMA_VERSION`. */
|
|
1131
|
+
schemaVersion: typeof BENCHMARK_ARTIFACT_SCHEMA_VERSION;
|
|
1132
|
+
/** Benchmark identifier, e.g. "longmemeval" or "locomo". */
|
|
1133
|
+
benchmarkId: PublishedBenchmarkId;
|
|
1134
|
+
/**
|
|
1135
|
+
* Dataset version the runner evaluated against. Free-form string so
|
|
1136
|
+
* runners can record the HuggingFace revision, filename, or
|
|
1137
|
+
* upstream dataset tag.
|
|
1138
|
+
*/
|
|
1139
|
+
datasetVersion: string;
|
|
1140
|
+
system: BenchmarkArtifactSystem;
|
|
1141
|
+
/** Evaluator model ID (e.g. "gpt-4o-mini"). */
|
|
1142
|
+
model: string;
|
|
1143
|
+
/** RNG / selection seed used for this run. */
|
|
1144
|
+
seed: number;
|
|
1145
|
+
/** Aggregate metric means keyed by metric name. */
|
|
1146
|
+
metrics: Record<string, number>;
|
|
1147
|
+
/** Per-task score breakdown. Arbitrary-length; safe to truncate for public pages. */
|
|
1148
|
+
perTaskScores: BenchmarkArtifactPerTaskScore[];
|
|
1149
|
+
/** ISO-8601 timestamp of run start. */
|
|
1150
|
+
startedAt: string;
|
|
1151
|
+
/** ISO-8601 timestamp of run finish. */
|
|
1152
|
+
finishedAt: string;
|
|
1153
|
+
/** Total wall-clock duration in milliseconds. */
|
|
1154
|
+
durationMs: number;
|
|
1155
|
+
env: BenchmarkArtifactEnvironment;
|
|
1156
|
+
/** Optional explanatory note (e.g. "--limit 100"). Never contains PII. */
|
|
1157
|
+
note?: string;
|
|
1158
|
+
}
|
|
1159
|
+
/** Input to `buildBenchmarkArtifact()` beyond what `BenchmarkResult` already carries. */
|
|
1160
|
+
interface BuildBenchmarkArtifactInput {
|
|
1161
|
+
benchmarkId: PublishedBenchmarkId;
|
|
1162
|
+
datasetVersion: string;
|
|
1163
|
+
model: string;
|
|
1164
|
+
seed: number;
|
|
1165
|
+
startedAt: string;
|
|
1166
|
+
finishedAt: string;
|
|
1167
|
+
result: BenchmarkResult;
|
|
1168
|
+
/** Optional category extractor for `perTaskScores[].category`. */
|
|
1169
|
+
categoryFor?: (task: TaskResult) => string | undefined;
|
|
1170
|
+
/** Optional free-form note (e.g. `"--limit 100"`). */
|
|
1171
|
+
note?: string;
|
|
1172
|
+
}
|
|
1173
|
+
/**
|
|
1174
|
+
* Build a `BenchmarkArtifact` from a runner's `BenchmarkResult`.
|
|
1175
|
+
* Aggregates metrics to their `.mean` for public consumption; preserves
|
|
1176
|
+
* per-task scores verbatim. The result is sort-stable: metric keys are
|
|
1177
|
+
* emitted in sorted order and perTaskScores preserves runner order.
|
|
1178
|
+
*/
|
|
1179
|
+
declare function buildBenchmarkArtifact(input: BuildBenchmarkArtifactInput): BenchmarkArtifact;
|
|
1180
|
+
/**
|
|
1181
|
+
* Build the canonical on-disk filename for an artifact. Filename shape:
|
|
1182
|
+
* <iso-date>-<benchmark>-<model>-<gitShaShort>.json
|
|
1183
|
+
* where iso-date is the startedAt date (YYYY-MM-DD) and gitShaShort is
|
|
1184
|
+
* the first 7 chars of system.gitSha (or "unknown" if absent).
|
|
1185
|
+
*
|
|
1186
|
+
* Every segment that contributes to the filename is sanitized through
|
|
1187
|
+
* `sanitizeSegment()` so it cannot contain `/`, `..`, NUL, or any other
|
|
1188
|
+
* path-separator characters — preventing a malicious artifact input
|
|
1189
|
+
* from directing `writeBenchmarkArtifact()` outside of `outputDir`.
|
|
1190
|
+
*/
|
|
1191
|
+
declare function buildBenchmarkArtifactFilename(artifact: BenchmarkArtifact): string;
|
|
1192
|
+
/** Serialize an artifact to deterministic JSON (sorted keys, indented). */
|
|
1193
|
+
declare function serializeBenchmarkArtifact(artifact: BenchmarkArtifact): string;
|
|
1194
|
+
/** Compute SHA-256 of the canonical JSON serialization of the artifact. */
|
|
1195
|
+
declare function hashBenchmarkArtifact(artifact: BenchmarkArtifact): string;
|
|
1196
|
+
interface WriteBenchmarkArtifactResult {
|
|
1197
|
+
path: string;
|
|
1198
|
+
filename: string;
|
|
1199
|
+
sha256: string;
|
|
1200
|
+
bytes: number;
|
|
1201
|
+
}
|
|
1202
|
+
/**
|
|
1203
|
+
* Write the artifact to `<outputDir>/<filename>` and return the resulting
|
|
1204
|
+
* path, filename, SHA-256 of the canonical serialization, and byte count.
|
|
1205
|
+
* Creates `outputDir` recursively if needed.
|
|
1206
|
+
*
|
|
1207
|
+
* Belt-and-suspenders: even though `buildBenchmarkArtifactFilename()`
|
|
1208
|
+
* sanitizes every segment, this function also verifies the resolved
|
|
1209
|
+
* target stays inside `outputDir`. Any path-traversal attempt throws
|
|
1210
|
+
* before the write occurs.
|
|
1211
|
+
*/
|
|
1212
|
+
declare function writeBenchmarkArtifact(artifact: BenchmarkArtifact, outputDir: string): Promise<WriteBenchmarkArtifactResult>;
|
|
1213
|
+
/**
|
|
1214
|
+
* Parse + validate a BenchmarkArtifact from raw JSON. Throws on version
|
|
1215
|
+
* mismatch, missing required fields, or structural errors. Keep this in
|
|
1216
|
+
* sync with the `BenchmarkArtifact` interface — every new required
|
|
1217
|
+
* field needs a matching check here and a `schemaVersion` bump.
|
|
1218
|
+
*/
|
|
1219
|
+
declare function parseBenchmarkArtifact(raw: string): BenchmarkArtifact;
|
|
1220
|
+
/** Read + parse + re-hash an artifact file. Handy for `verify-artifact` CLI. */
|
|
1221
|
+
declare function loadBenchmarkArtifact(filePath: string): Promise<{
|
|
1222
|
+
artifact: BenchmarkArtifact;
|
|
1223
|
+
sha256: string;
|
|
1224
|
+
bytes: number;
|
|
1225
|
+
}>;
|
|
1226
|
+
|
|
755
1227
|
declare function createAnthropicProvider(config: AnthropicProviderConfig): LlmProvider;
|
|
756
1228
|
|
|
1229
|
+
interface CodexCliRunRequest {
|
|
1230
|
+
executable: string;
|
|
1231
|
+
args: string[];
|
|
1232
|
+
input: string;
|
|
1233
|
+
outputPath: string;
|
|
1234
|
+
workspacePath: string;
|
|
1235
|
+
timeoutMs?: number;
|
|
1236
|
+
signal?: AbortSignal;
|
|
1237
|
+
env: NodeJS.ProcessEnv;
|
|
1238
|
+
}
|
|
1239
|
+
interface CodexCliRunResult {
|
|
1240
|
+
status: number | null;
|
|
1241
|
+
signal: NodeJS.Signals | null;
|
|
1242
|
+
stdout: string;
|
|
1243
|
+
stderr: string;
|
|
1244
|
+
outputText: string;
|
|
1245
|
+
}
|
|
1246
|
+
interface CodexCliProviderDeps {
|
|
1247
|
+
runCodexCli?: (request: CodexCliRunRequest) => Promise<CodexCliRunResult>;
|
|
1248
|
+
runCodexVersion?: (executable: string, env: NodeJS.ProcessEnv) => Promise<{
|
|
1249
|
+
status: number | null;
|
|
1250
|
+
stderr: string;
|
|
1251
|
+
}>;
|
|
1252
|
+
}
|
|
1253
|
+
declare function createCodexCliProvider(config: CodexCliProviderConfig, deps?: CodexCliProviderDeps): LlmProvider;
|
|
1254
|
+
|
|
1255
|
+
/**
|
|
1256
|
+
* Result enrichment and JSON writing helpers.
|
|
1257
|
+
*/
|
|
1258
|
+
|
|
1259
|
+
declare function redactBenchmarkResultSecrets<T>(value: T): T;
|
|
1260
|
+
declare function writeBenchmarkResult(result: BenchmarkResult, outputDir: string): Promise<string>;
|
|
1261
|
+
declare function getRemnicVersion(): Promise<string>;
|
|
1262
|
+
|
|
1263
|
+
interface DiscoverAllProvidersOptions {
|
|
1264
|
+
includeCodexCli?: boolean;
|
|
1265
|
+
}
|
|
757
1266
|
declare function createProvider(config: ProviderFactoryConfig): LlmProvider;
|
|
758
|
-
declare function discoverAllProviders(): Promise<ProviderDiscoveryResult[]>;
|
|
1267
|
+
declare function discoverAllProviders(options?: DiscoverAllProvidersOptions): Promise<ProviderDiscoveryResult[]>;
|
|
759
1268
|
|
|
1269
|
+
type BenchmarkAnswerMode = "default" | "strict" | "agentic-memory";
|
|
1270
|
+
type BenchmarkAnswerFormat = "auto" | "choice-letter" | "choice-number" | "instruction" | "short" | "short-with-specifics" | "structured";
|
|
760
1271
|
interface BenchmarkAnswerResult {
|
|
761
1272
|
finalAnswer: string;
|
|
762
1273
|
recalledText: string;
|
|
@@ -768,12 +1279,145 @@ interface BenchmarkAnswerResult {
|
|
|
768
1279
|
};
|
|
769
1280
|
model?: string;
|
|
770
1281
|
}
|
|
1282
|
+
interface BenchmarkQuestionContext {
|
|
1283
|
+
benchmark?: string;
|
|
1284
|
+
domain?: string;
|
|
1285
|
+
task?: string;
|
|
1286
|
+
taskType?: string;
|
|
1287
|
+
qaType?: string;
|
|
1288
|
+
}
|
|
771
1289
|
declare function answerBenchmarkQuestion(options: {
|
|
772
1290
|
question: string;
|
|
773
1291
|
recalledText: string;
|
|
774
1292
|
responder?: BenchResponder;
|
|
1293
|
+
answerMode?: BenchmarkAnswerMode;
|
|
1294
|
+
answerFormat?: BenchmarkAnswerFormat;
|
|
1295
|
+
questionContext?: BenchmarkQuestionContext;
|
|
1296
|
+
retryUnknownWithEvidence?: boolean;
|
|
775
1297
|
}): Promise<BenchmarkAnswerResult>;
|
|
776
1298
|
|
|
1299
|
+
interface LeaderboardArtifactWrite {
|
|
1300
|
+
benchmark: string;
|
|
1301
|
+
path: string;
|
|
1302
|
+
format: string;
|
|
1303
|
+
records: number;
|
|
1304
|
+
}
|
|
1305
|
+
interface AmaBenchLeaderboardRow {
|
|
1306
|
+
episode_id: number | string;
|
|
1307
|
+
answer_list: string[];
|
|
1308
|
+
}
|
|
1309
|
+
declare function writeLeaderboardArtifactsForResult(result: BenchmarkResult, outputDir: string): Promise<LeaderboardArtifactWrite[]>;
|
|
1310
|
+
declare function buildAmaBenchLeaderboardRows(result: BenchmarkResult): AmaBenchLeaderboardRow[];
|
|
1311
|
+
declare function serializeJsonl(rows: readonly AmaBenchLeaderboardRow[]): string;
|
|
1312
|
+
|
|
1313
|
+
type AmaBenchDiagnosticRecallMode = "remnic-full" | "explicit-evidence-only" | "oracle-trajectory";
|
|
1314
|
+
type AmaBenchDiagnosticAnswererMode = "normal" | "strong";
|
|
1315
|
+
interface AmaBenchDiagnosticVariant {
|
|
1316
|
+
id: string;
|
|
1317
|
+
label: string;
|
|
1318
|
+
recallMode: AmaBenchDiagnosticRecallMode;
|
|
1319
|
+
answererMode: AmaBenchDiagnosticAnswererMode;
|
|
1320
|
+
description: string;
|
|
1321
|
+
}
|
|
1322
|
+
declare const AMA_BENCH_DIAGNOSTIC_VARIANTS: readonly AmaBenchDiagnosticVariant[];
|
|
1323
|
+
declare function selectAmaBenchDiagnosticVariants(options?: {
|
|
1324
|
+
ids?: string[];
|
|
1325
|
+
includeStrong?: boolean;
|
|
1326
|
+
}): AmaBenchDiagnosticVariant[];
|
|
1327
|
+
interface AmaBenchDiagnosticAdapterOptions {
|
|
1328
|
+
strongResponder?: BenchResponder;
|
|
1329
|
+
}
|
|
1330
|
+
declare function createAmaBenchDiagnosticAdapter(base: BenchMemoryAdapter, variant: AmaBenchDiagnosticVariant, options?: AmaBenchDiagnosticAdapterOptions): BenchMemoryAdapter;
|
|
1331
|
+
declare function buildOracleTrajectoryRecall(messages: readonly Message[], budgetChars?: number): string;
|
|
1332
|
+
declare function extractMarkdownSectionsByTitle(markdown: string, allowedTitles: readonly string[]): string;
|
|
1333
|
+
interface AmaBenchDiagnosticTaskRow {
|
|
1334
|
+
variantId: string;
|
|
1335
|
+
taskId: string;
|
|
1336
|
+
episodeId?: string | number;
|
|
1337
|
+
domain: string;
|
|
1338
|
+
qaType: string;
|
|
1339
|
+
taskType: string;
|
|
1340
|
+
scores: Record<string, number>;
|
|
1341
|
+
unknownLike: boolean;
|
|
1342
|
+
recalledLength: number;
|
|
1343
|
+
answeredLength: number;
|
|
1344
|
+
recallSections: string[];
|
|
1345
|
+
responderModel?: string;
|
|
1346
|
+
judgeModel?: string;
|
|
1347
|
+
crossJudgeModel?: string;
|
|
1348
|
+
crossJudgeScore?: number;
|
|
1349
|
+
evidence?: AmaBenchDiagnosticTaskEvidence;
|
|
1350
|
+
}
|
|
1351
|
+
interface AmaBenchDiagnosticTaskEvidence {
|
|
1352
|
+
question: string;
|
|
1353
|
+
expected: string;
|
|
1354
|
+
actual: string;
|
|
1355
|
+
recalledText: string;
|
|
1356
|
+
truncatedFields?: string[];
|
|
1357
|
+
}
|
|
1358
|
+
interface AmaBenchDiagnosticBreakdown {
|
|
1359
|
+
key: string;
|
|
1360
|
+
taskCount: number;
|
|
1361
|
+
unknownLikeRate: number;
|
|
1362
|
+
scoreMeans: Record<string, number>;
|
|
1363
|
+
scoreCounts: Record<string, number>;
|
|
1364
|
+
}
|
|
1365
|
+
interface AmaBenchDiagnosticVariantSummary {
|
|
1366
|
+
variant: AmaBenchDiagnosticVariant;
|
|
1367
|
+
usesFullRemnicRecallProcess: boolean;
|
|
1368
|
+
isPrimaryFullSystemScore: boolean;
|
|
1369
|
+
taskCount: number;
|
|
1370
|
+
unknownLikeRate: number;
|
|
1371
|
+
scoreMeans: Record<string, number>;
|
|
1372
|
+
scoreCounts: Record<string, number>;
|
|
1373
|
+
byDomain: AmaBenchDiagnosticBreakdown[];
|
|
1374
|
+
byQaType: AmaBenchDiagnosticBreakdown[];
|
|
1375
|
+
byDomainAndQaType: AmaBenchDiagnosticBreakdown[];
|
|
1376
|
+
tasks: AmaBenchDiagnosticTaskRow[];
|
|
1377
|
+
}
|
|
1378
|
+
interface SanitizedDiagnosticProvider {
|
|
1379
|
+
provider: string;
|
|
1380
|
+
model: string;
|
|
1381
|
+
baseUrl?: string;
|
|
1382
|
+
reasoningEffort?: string;
|
|
1383
|
+
}
|
|
1384
|
+
interface AmaBenchDiagnosticMatrixArtifact {
|
|
1385
|
+
schemaVersion: 1;
|
|
1386
|
+
benchmark: "ama-bench";
|
|
1387
|
+
generatedAt: string;
|
|
1388
|
+
mode: BenchmarkMode;
|
|
1389
|
+
config: {
|
|
1390
|
+
runtimeProfile?: string;
|
|
1391
|
+
adapterMode?: string;
|
|
1392
|
+
datasetDir?: string;
|
|
1393
|
+
limit?: number;
|
|
1394
|
+
seed?: number;
|
|
1395
|
+
systemProvider?: SanitizedDiagnosticProvider | null;
|
|
1396
|
+
judgeProvider?: SanitizedDiagnosticProvider | null;
|
|
1397
|
+
internalProvider?: SanitizedDiagnosticProvider | null;
|
|
1398
|
+
amaBenchCrossJudgeProvider?: SanitizedDiagnosticProvider | null;
|
|
1399
|
+
strongSystemProvider?: SanitizedDiagnosticProvider | null;
|
|
1400
|
+
variantIds?: string[];
|
|
1401
|
+
includeTaskEvidence?: boolean;
|
|
1402
|
+
taskEvidenceMaxChars?: number;
|
|
1403
|
+
};
|
|
1404
|
+
variants: AmaBenchDiagnosticVariantSummary[];
|
|
1405
|
+
}
|
|
1406
|
+
interface AmaBenchDiagnosticRunContext {
|
|
1407
|
+
runtimeProfile?: string;
|
|
1408
|
+
hasResponder?: boolean;
|
|
1409
|
+
includeTaskEvidence?: boolean;
|
|
1410
|
+
taskEvidenceMaxChars?: number;
|
|
1411
|
+
}
|
|
1412
|
+
declare function buildAmaBenchDiagnosticVariantSummary(variant: AmaBenchDiagnosticVariant, result: BenchmarkResult, context?: AmaBenchDiagnosticRunContext): AmaBenchDiagnosticVariantSummary;
|
|
1413
|
+
declare function buildAmaBenchDiagnosticMatrixArtifact(args: {
|
|
1414
|
+
mode: BenchmarkMode;
|
|
1415
|
+
config?: AmaBenchDiagnosticMatrixArtifact["config"];
|
|
1416
|
+
variants: AmaBenchDiagnosticVariantSummary[];
|
|
1417
|
+
generatedAt?: string;
|
|
1418
|
+
}): AmaBenchDiagnosticMatrixArtifact;
|
|
1419
|
+
declare function isAmaBenchUnknownLikeAnswer(answer: string): boolean;
|
|
1420
|
+
|
|
777
1421
|
/**
|
|
778
1422
|
* Sealed LLM-judge rubric loader, invocation, and score parser for the
|
|
779
1423
|
* Assistant bench tier.
|
|
@@ -911,77 +1555,215 @@ interface GatewayResponderOptions {
|
|
|
911
1555
|
workspaceDir?: string;
|
|
912
1556
|
llmFactory?: (gatewayConfig: GatewayConfig, runtimeContext: FallbackLlmRuntimeContext) => Pick<FallbackLlmClient, "chatCompletion">;
|
|
913
1557
|
}
|
|
914
|
-
|
|
1558
|
+
interface ProviderResponderOptions {
|
|
1559
|
+
contextBudgetChars?: number;
|
|
1560
|
+
promptBudgetChars?: number;
|
|
1561
|
+
}
|
|
1562
|
+
declare function createResponderFromProvider(provider: LlmProvider, options?: ProviderResponderOptions): BenchResponder;
|
|
915
1563
|
declare function createProviderBackedResponder(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchResponder;
|
|
916
1564
|
declare function createProviderBackedJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchJudge;
|
|
1565
|
+
declare function createProviderBackedAmaBenchRecommendedJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchJudge;
|
|
917
1566
|
declare function createStructuredJudgeFromProvider(provider: LlmProvider): StructuredJudge;
|
|
918
1567
|
declare function createProviderBackedStructuredJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): StructuredJudge;
|
|
919
1568
|
declare function createGatewayResponder(options: GatewayResponderOptions): BenchResponder;
|
|
920
1569
|
|
|
921
1570
|
declare function createLiteLlmProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
|
|
922
1571
|
|
|
923
|
-
declare function createOllamaProvider(config: OllamaProviderConfig): LlmProvider;
|
|
924
|
-
|
|
925
1572
|
/**
|
|
926
|
-
*
|
|
1573
|
+
* Local-LLM bench provider — issue #566 slice 5.
|
|
1574
|
+
*
|
|
1575
|
+
* Talks to a user-hosted OpenAI-compatible endpoint (llama.cpp,
|
|
1576
|
+
* vLLM, LM Studio, etc.) using the exact same wire contract
|
|
1577
|
+
* (`/v1/chat/completions` + `/v1/models`) that the Remnic core
|
|
1578
|
+
* `LocalLlmClient` uses. The goal is transport-level parity:
|
|
1579
|
+
* anything `remnic bench published --provider local-llm` can reach
|
|
1580
|
+
* is something the running plugin can also reach.
|
|
1581
|
+
*
|
|
1582
|
+
* Why this is a distinct provider from `openai`:
|
|
1583
|
+
*
|
|
1584
|
+
* - The OpenAI-compatible provider treats `baseUrl` as optional
|
|
1585
|
+
* and defaults to `https://api.openai.com/v1`. That default is
|
|
1586
|
+
* wrong for local servers, and silently falling through to it
|
|
1587
|
+
* violates CLAUDE.md rule 51 (reject invalid user input).
|
|
1588
|
+
* - `local-llm` REQUIRES `baseUrl` at the CLI boundary so the
|
|
1589
|
+
* user must explicitly point at their server. A missing
|
|
1590
|
+
* base URL is a user error, not a default.
|
|
1591
|
+
* - Discovery for `local-llm` is reserved for the future — the
|
|
1592
|
+
* built-in `discoverAllProviders` probe does not assume a
|
|
1593
|
+
* local-llm URL is reachable. Users opt in with `--base-url`.
|
|
1594
|
+
*
|
|
1595
|
+
* See `packages/remnic-core/src/summarizer.ts` for the core-side
|
|
1596
|
+
* `LocalLlmClient` invocation pattern and
|
|
1597
|
+
* `packages/plugin-openclaw/openclaw.plugin.json` for the
|
|
1598
|
+
* `localLlmUrl` / `localLlmModel` config that this provider
|
|
1599
|
+
* mirrors.
|
|
927
1600
|
*/
|
|
928
1601
|
|
|
929
|
-
declare function
|
|
930
|
-
|
|
931
|
-
type BenchModelSource = "plugin" | "gateway";
|
|
932
|
-
interface ResolveBenchRuntimeProfileOptions {
|
|
933
|
-
runtimeProfile?: BenchRuntimeProfile;
|
|
934
|
-
remnicConfigPath?: string;
|
|
935
|
-
openclawConfigPath?: string;
|
|
936
|
-
modelSource?: BenchModelSource;
|
|
937
|
-
gatewayAgentId?: string;
|
|
938
|
-
fastGatewayAgentId?: string;
|
|
939
|
-
systemProvider?: BuiltInProvider;
|
|
940
|
-
systemModel?: string;
|
|
941
|
-
systemBaseUrl?: string;
|
|
942
|
-
judgeProvider?: BuiltInProvider;
|
|
943
|
-
judgeModel?: string;
|
|
944
|
-
judgeBaseUrl?: string;
|
|
945
|
-
}
|
|
946
|
-
interface ResolvedBenchRuntimeProfile {
|
|
947
|
-
profile: BenchRuntimeProfile;
|
|
948
|
-
remnicConfig: Record<string, unknown>;
|
|
949
|
-
effectiveRemnicConfig: Record<string, unknown>;
|
|
950
|
-
adapterOptions: {
|
|
951
|
-
configOverrides: Record<string, unknown>;
|
|
952
|
-
preserveRuntimeDefaults?: boolean;
|
|
953
|
-
responder?: BenchResponder;
|
|
954
|
-
judge?: BenchJudge;
|
|
955
|
-
};
|
|
956
|
-
systemProvider: ProviderConfig | null;
|
|
957
|
-
judgeProvider: ProviderConfig | null;
|
|
958
|
-
}
|
|
959
|
-
declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
|
|
960
|
-
|
|
961
|
-
/**
|
|
962
|
-
* Published benchmark registry for @remnic/bench phase 1.
|
|
963
|
-
*/
|
|
1602
|
+
declare function createLocalLlmProvider(config: LocalLlmProviderConfig): LlmProvider;
|
|
964
1603
|
|
|
965
|
-
declare function
|
|
966
|
-
declare function getBenchmark(id: string): BenchmarkDefinition | undefined;
|
|
1604
|
+
declare function createOllamaProvider(config: OllamaProviderConfig): LlmProvider;
|
|
967
1605
|
|
|
968
1606
|
/**
|
|
969
|
-
*
|
|
1607
|
+
* Minimal OpenAI-compatible provider for phase 1 bench execution.
|
|
970
1608
|
*/
|
|
971
1609
|
|
|
972
|
-
declare function
|
|
1610
|
+
declare function createOpenAiCompatibleProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
|
|
973
1611
|
|
|
974
1612
|
/**
|
|
975
|
-
*
|
|
1613
|
+
* Shared types for the Assistant bench tier.
|
|
976
1614
|
*
|
|
977
|
-
*
|
|
978
|
-
*
|
|
979
|
-
*
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
*
|
|
1615
|
+
* Every Assistant benchmark shares the same shape:
|
|
1616
|
+
* - A synthetic memory graph (facts, stances, entities) the agent may read.
|
|
1617
|
+
* - A scenario prompt given to the agent.
|
|
1618
|
+
* - A sealed-rubric judge pass that scores the agent's output along
|
|
1619
|
+
* identity_accuracy / stance_coherence / novelty / calibration.
|
|
1620
|
+
*
|
|
1621
|
+
* The goal is reviewability: each benchmark folder ships a small fixture.ts
|
|
1622
|
+
* that returns `AssistantScenario` values, and the runner wires the shared
|
|
1623
|
+
* multi-run + bootstrap-CI infrastructure around them.
|
|
1624
|
+
*/
|
|
1625
|
+
|
|
1626
|
+
interface AssistantMemoryFact {
|
|
1627
|
+
id: string;
|
|
1628
|
+
summary: string;
|
|
1629
|
+
/**
|
|
1630
|
+
* Free-form tags (topic, entity) used to render the memory-graph summary
|
|
1631
|
+
* that is handed to the judge. Not shown to the agent.
|
|
1632
|
+
*/
|
|
1633
|
+
tags?: string[];
|
|
1634
|
+
}
|
|
1635
|
+
interface AssistantStance {
|
|
1636
|
+
topic: string;
|
|
1637
|
+
position: string;
|
|
1638
|
+
}
|
|
1639
|
+
interface AssistantMemoryGraph {
|
|
1640
|
+
userHandle: string;
|
|
1641
|
+
userRole: string;
|
|
1642
|
+
/** Fixed scenario date shown to the agent and judge for reproducible temporal reasoning. */
|
|
1643
|
+
currentDate?: string;
|
|
1644
|
+
facts: AssistantMemoryFact[];
|
|
1645
|
+
stances: AssistantStance[];
|
|
1646
|
+
openThreads: string[];
|
|
1647
|
+
}
|
|
1648
|
+
interface AssistantScenario {
|
|
1649
|
+
id: string;
|
|
1650
|
+
title: string;
|
|
1651
|
+
scenarioPrompt: string;
|
|
1652
|
+
memoryGraph: AssistantMemoryGraph;
|
|
1653
|
+
/**
|
|
1654
|
+
* Small label describing what the scenario is meant to exercise. Useful in
|
|
1655
|
+
* dashboards for filtering. Never exposed to the agent.
|
|
1656
|
+
*/
|
|
1657
|
+
focus: string;
|
|
1658
|
+
}
|
|
1659
|
+
/**
|
|
1660
|
+
* Minimal agent contract for the Assistant tier. The agent receives the
|
|
1661
|
+
* scenario prompt plus a pre-rendered memory view (analogous to what the
|
|
1662
|
+
* Remnic recall stack would hand to a downstream chat model), and returns
|
|
1663
|
+
* its final answer text.
|
|
1664
|
+
*/
|
|
1665
|
+
interface AssistantAgent {
|
|
1666
|
+
respond(request: {
|
|
1667
|
+
scenarioId: string;
|
|
1668
|
+
prompt: string;
|
|
1669
|
+
memoryView: string;
|
|
1670
|
+
seed: number;
|
|
1671
|
+
runIndex: number;
|
|
1672
|
+
runCount: number;
|
|
1673
|
+
}): Promise<string>;
|
|
1674
|
+
}
|
|
1675
|
+
interface AssistantRunnerOptions {
|
|
1676
|
+
agent: AssistantAgent;
|
|
1677
|
+
judge: StructuredJudge | undefined;
|
|
1678
|
+
rubricId?: string;
|
|
1679
|
+
/**
|
|
1680
|
+
* Directory where per-run spot-check JSONL files are appended. Defaults to
|
|
1681
|
+
* `<cwd>/benchmarks/results/spot-checks`.
|
|
1682
|
+
*/
|
|
1683
|
+
spotCheckDir?: string;
|
|
1684
|
+
/**
|
|
1685
|
+
* Seed array for deterministic multi-run scheduling. When omitted the
|
|
1686
|
+
* benchmark runner picks a fresh seed array via `buildBenchmarkRunSeeds`.
|
|
1687
|
+
*/
|
|
1688
|
+
seeds?: number[];
|
|
1689
|
+
/**
|
|
1690
|
+
* Override used by tests and CLI smoke runs to cap iterations. Must be
|
|
1691
|
+
* `>= 1`. The production contract is `>= 5` per the issue spec.
|
|
1692
|
+
*/
|
|
1693
|
+
runCount?: number;
|
|
1694
|
+
/**
|
|
1695
|
+
* Random-number factory for bootstrap sampling. Injected in tests.
|
|
1696
|
+
*/
|
|
1697
|
+
random?: () => number;
|
|
1698
|
+
}
|
|
1699
|
+
|
|
1700
|
+
type BenchModelSource = "plugin" | "gateway";
|
|
1701
|
+
interface ResolveBenchRuntimeProfileOptions {
|
|
1702
|
+
runtimeProfile?: BenchRuntimeProfile;
|
|
1703
|
+
remnicConfigPath?: string;
|
|
1704
|
+
openclawConfigPath?: string;
|
|
1705
|
+
modelSource?: BenchModelSource;
|
|
1706
|
+
gatewayAgentId?: string;
|
|
1707
|
+
fastGatewayAgentId?: string;
|
|
1708
|
+
systemProvider?: BuiltInProvider;
|
|
1709
|
+
systemModel?: string;
|
|
1710
|
+
systemBaseUrl?: string;
|
|
1711
|
+
systemApiKey?: string;
|
|
1712
|
+
systemCodexReasoningEffort?: ProviderConfig["reasoningEffort"];
|
|
1713
|
+
systemResponderContextBudgetChars?: number;
|
|
1714
|
+
systemResponderPromptBudgetChars?: number;
|
|
1715
|
+
judgeProvider?: BuiltInProvider;
|
|
1716
|
+
judgeModel?: string;
|
|
1717
|
+
judgeBaseUrl?: string;
|
|
1718
|
+
judgeApiKey?: string;
|
|
1719
|
+
judgeCodexReasoningEffort?: ProviderConfig["reasoningEffort"];
|
|
1720
|
+
internalProvider?: BuiltInProvider;
|
|
1721
|
+
internalModel?: string;
|
|
1722
|
+
internalBaseUrl?: string;
|
|
1723
|
+
internalApiKey?: string;
|
|
1724
|
+
internalDisableThinking?: boolean;
|
|
1725
|
+
internalCodexReasoningEffort?: ProviderConfig["reasoningEffort"];
|
|
1726
|
+
lcmObserveConcurrency?: number;
|
|
1727
|
+
requestTimeout?: number;
|
|
1728
|
+
drainTimeout?: number;
|
|
1729
|
+
max429WaitMs?: number;
|
|
1730
|
+
disableThinking?: boolean;
|
|
1731
|
+
}
|
|
1732
|
+
interface ResolvedBenchRuntimeProfile {
|
|
1733
|
+
profile: BenchRuntimeProfile;
|
|
1734
|
+
remnicConfig: Record<string, unknown>;
|
|
1735
|
+
effectiveRemnicConfig: Record<string, unknown>;
|
|
1736
|
+
adapterOptions: {
|
|
1737
|
+
configOverrides: Record<string, unknown>;
|
|
1738
|
+
preserveRuntimeDefaults?: boolean;
|
|
1739
|
+
responder?: BenchResponder;
|
|
1740
|
+
judge?: BenchJudge;
|
|
1741
|
+
drainTimeoutMs?: number;
|
|
1742
|
+
};
|
|
1743
|
+
systemProvider: ProviderConfig | null;
|
|
1744
|
+
judgeProvider: ProviderConfig | null;
|
|
1745
|
+
internalProvider: ProviderConfig | null;
|
|
1746
|
+
}
|
|
1747
|
+
declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
|
|
1748
|
+
|
|
1749
|
+
/**
|
|
1750
|
+
* Published benchmark registry for @remnic/bench phase 1.
|
|
1751
|
+
*/
|
|
1752
|
+
|
|
1753
|
+
declare function listBenchmarks(): BenchmarkDefinition[];
|
|
1754
|
+
declare function getBenchmark(id: string): BenchmarkDefinition | undefined;
|
|
1755
|
+
|
|
1756
|
+
/**
|
|
1757
|
+
* Seed-sequence generation for benchmark runs.
|
|
1758
|
+
*
|
|
1759
|
+
* Factored out of `benchmark.ts` so individual runners can reuse it without
|
|
1760
|
+
* triggering a circular import through `benchmark.ts -> registry.ts ->
|
|
1761
|
+
* runner.ts -> benchmark.ts`.
|
|
1762
|
+
*/
|
|
1763
|
+
declare function buildBenchmarkRunSeeds(runCount: number, baseSeed?: number): number[];
|
|
1764
|
+
|
|
1765
|
+
/**
|
|
1766
|
+
* Public benchmark execution helpers.
|
|
985
1767
|
*/
|
|
986
1768
|
|
|
987
1769
|
declare function resolveBenchmarkRunCount(mode: BenchmarkMode, requestedIterations?: number): number;
|
|
@@ -1190,6 +1972,113 @@ declare function buildBenchmarkPublishFeed(outputDir: string, target: BenchmarkP
|
|
|
1190
1972
|
declare function writeBenchmarkPublishFeed(feed: PublishedBenchmarkFeed, outputPath: string): Promise<string>;
|
|
1191
1973
|
declare function renderBenchmarkResultExport(result: BenchmarkResult, format: BenchmarkExportFormat): string;
|
|
1192
1974
|
|
|
1975
|
+
interface HaystackTurn {
|
|
1976
|
+
role: "user" | "assistant";
|
|
1977
|
+
content: string;
|
|
1978
|
+
}
|
|
1979
|
+
interface LongMemEvalItem {
|
|
1980
|
+
question_id: string | number;
|
|
1981
|
+
question_type: string;
|
|
1982
|
+
question: string;
|
|
1983
|
+
answer: string;
|
|
1984
|
+
question_date: string;
|
|
1985
|
+
haystack_dates: string[];
|
|
1986
|
+
haystack_session_ids: string[];
|
|
1987
|
+
haystack_sessions: HaystackTurn[][];
|
|
1988
|
+
answer_session_ids: string[];
|
|
1989
|
+
}
|
|
1990
|
+
|
|
1991
|
+
interface LoCoMoQA {
|
|
1992
|
+
question: string;
|
|
1993
|
+
answer: string;
|
|
1994
|
+
evidence: string[];
|
|
1995
|
+
category: number;
|
|
1996
|
+
}
|
|
1997
|
+
interface LoCoMoConversation {
|
|
1998
|
+
sample_id: string;
|
|
1999
|
+
conversation: Record<string, unknown>;
|
|
2000
|
+
qa: LoCoMoQA[];
|
|
2001
|
+
event_summary?: unknown;
|
|
2002
|
+
observation?: unknown;
|
|
2003
|
+
session_summary?: unknown;
|
|
2004
|
+
}
|
|
2005
|
+
|
|
2006
|
+
/**
|
|
2007
|
+
* Shared dataset loader helpers for the published LongMemEval + LoCoMo
|
|
2008
|
+
* benchmark runners. Wraps the fs probe + JSON parse + fallback logic
|
|
2009
|
+
* previously duplicated inside each runner's `loadDataset` function.
|
|
2010
|
+
*
|
|
2011
|
+
* Contract:
|
|
2012
|
+
*
|
|
2013
|
+
* - When `datasetDir` is defined, loaders probe the known canonical
|
|
2014
|
+
* filenames in order. The first readable file wins. If none are
|
|
2015
|
+
* readable, the result is `{ source: "missing", errors }`.
|
|
2016
|
+
* - When `datasetDir` is undefined (or resolves to `missing`) and
|
|
2017
|
+
* `mode === "quick"`, loaders return the bundled smoke fixture with
|
|
2018
|
+
* source `"smoke"` so the caller can surface a clear log message.
|
|
2019
|
+
* - When `mode === "full"` and no dataset is found, loaders return
|
|
2020
|
+
* `{ source: "missing", errors }` and callers must throw — full mode
|
|
2021
|
+
* never silently falls back to the smoke fixture.
|
|
2022
|
+
*
|
|
2023
|
+
* `scripts/bench/fetch-datasets.sh` documents the expected filenames; keep
|
|
2024
|
+
* them in sync when adding new variants.
|
|
2025
|
+
*/
|
|
2026
|
+
|
|
2027
|
+
/** Canonical LongMemEval-S filenames probed by the loader, in priority order. */
|
|
2028
|
+
declare const LONG_MEM_EVAL_DATASET_FILENAMES: readonly string[];
|
|
2029
|
+
/** Canonical LoCoMo-10 filenames probed by the loader, in priority order. */
|
|
2030
|
+
declare const LOCOMO_DATASET_FILENAMES: readonly string[];
|
|
2031
|
+
type DatasetSource = "dataset" | "smoke" | "missing";
|
|
2032
|
+
interface LoadedDataset<T> {
|
|
2033
|
+
source: DatasetSource;
|
|
2034
|
+
/** Filename relative to `datasetDir` when source === "dataset". */
|
|
2035
|
+
filename?: string;
|
|
2036
|
+
items: T[];
|
|
2037
|
+
/** Parse/read errors encountered while probing candidate filenames. */
|
|
2038
|
+
errors: string[];
|
|
2039
|
+
}
|
|
2040
|
+
interface LoadDatasetOptions {
|
|
2041
|
+
mode: BenchmarkMode;
|
|
2042
|
+
datasetDir?: string;
|
|
2043
|
+
limit?: number;
|
|
2044
|
+
}
|
|
2045
|
+
/** Load LongMemEval-S from disk, falling back to the smoke fixture in quick mode. */
|
|
2046
|
+
declare function loadLongMemEvalS(options: LoadDatasetOptions): Promise<LoadedDataset<LongMemEvalItem>>;
|
|
2047
|
+
/**
|
|
2048
|
+
* Load LoCoMo-10 from disk, falling back to the smoke fixture in quick mode.
|
|
2049
|
+
*
|
|
2050
|
+
* `parseFile` is optional — callers that need richer structural
|
|
2051
|
+
* normalization (e.g. the LoCoMo runner's QA answer coercion) can pass
|
|
2052
|
+
* their own parser. When omitted, a minimal parser is used that only
|
|
2053
|
+
* asserts the top-level array + sample_id shape.
|
|
2054
|
+
*/
|
|
2055
|
+
declare function loadLoCoMo10(options: LoadDatasetOptions & {
|
|
2056
|
+
parseFile?: (raw: string, filename: string) => LoCoMoConversation[];
|
|
2057
|
+
}): Promise<LoadedDataset<LoCoMoConversation>>;
|
|
2058
|
+
/**
|
|
2059
|
+
* Build a friendly "dataset missing" error message that links operators to
|
|
2060
|
+
* the fetch script. Callers use this when `mode === "full"` and the probe
|
|
2061
|
+
* returned `source: "missing"`.
|
|
2062
|
+
*/
|
|
2063
|
+
declare function formatMissingDatasetError(benchmark: "longmemeval" | "locomo", datasetDir: string | undefined, filenames: readonly string[], errors: readonly string[]): string;
|
|
2064
|
+
|
|
2065
|
+
/**
|
|
2066
|
+
* BEAM runner migrated into @remnic/bench for phase 2.
|
|
2067
|
+
*/
|
|
2068
|
+
|
|
2069
|
+
interface BeamDatasetPreview {
|
|
2070
|
+
source: "dataset" | "smoke" | "missing";
|
|
2071
|
+
files: string[];
|
|
2072
|
+
items: number;
|
|
2073
|
+
tasks: number;
|
|
2074
|
+
errors: string[];
|
|
2075
|
+
}
|
|
2076
|
+
declare function loadBeamDatasetPreview(options: {
|
|
2077
|
+
mode: BenchmarkMode;
|
|
2078
|
+
datasetDir?: string;
|
|
2079
|
+
limit?: number;
|
|
2080
|
+
}): Promise<BeamDatasetPreview>;
|
|
2081
|
+
|
|
1193
2082
|
/**
|
|
1194
2083
|
* Hash verification utilities used by the benchmark integrity pipeline.
|
|
1195
2084
|
*
|
|
@@ -1230,8 +2119,12 @@ declare function hashBytes(value: Uint8Array): string;
|
|
|
1230
2119
|
/**
|
|
1231
2120
|
* Canonicalize a JSON-serializable value so equivalent payloads produce the
|
|
1232
2121
|
* same digest regardless of key insertion order.
|
|
2122
|
+
*
|
|
2123
|
+
* `space` matches the third argument of `JSON.stringify` — pass `2` (or any
|
|
2124
|
+
* positive integer / indent string) when you want a pretty-printed output
|
|
2125
|
+
* that is still byte-stable across runs. Default is compact output.
|
|
1233
2126
|
*/
|
|
1234
|
-
declare function canonicalJsonStringify(value: unknown): string;
|
|
2127
|
+
declare function canonicalJsonStringify(value: unknown, space?: string | number): string;
|
|
1235
2128
|
declare function hashCanonicalJson(value: unknown): string;
|
|
1236
2129
|
declare function isSha256Hex(value: unknown): value is string;
|
|
1237
2130
|
declare function assertSha256Hex(value: unknown, label: string): string;
|
|
@@ -1284,9 +2177,10 @@ declare function loadSealKeyFromEnv(envName: string): Buffer | null;
|
|
|
1284
2177
|
* }
|
|
1285
2178
|
* ```
|
|
1286
2179
|
*
|
|
1287
|
-
* `sealHash` is computed over the canonical JSON of `envelope
|
|
1288
|
-
*
|
|
1289
|
-
*
|
|
2180
|
+
* `sealHash` is computed over the canonical JSON of `envelope`, including the
|
|
2181
|
+
* random IV and ciphertext. It identifies the sealed envelope artifact, not
|
|
2182
|
+
* the plaintext qrels content. Use `envelope.plaintextHash` when stable
|
|
2183
|
+
* plaintext identity is required across independently sealed artifacts.
|
|
1290
2184
|
*/
|
|
1291
2185
|
|
|
1292
2186
|
interface SealedQrelsArtifact {
|
|
@@ -1393,7 +2287,7 @@ interface SeededRng {
|
|
|
1393
2287
|
* Deterministic 32-bit PRNG. Mulberry32 is small, fast, and sufficient for
|
|
1394
2288
|
* shuffling benchmark tasks. Do NOT use for cryptographic operations.
|
|
1395
2289
|
*/
|
|
1396
|
-
declare function createSeededRng(seed: number): SeededRng;
|
|
2290
|
+
declare function createSeededRng$1(seed: number): SeededRng;
|
|
1397
2291
|
/**
|
|
1398
2292
|
* Fisher-Yates shuffle using a seeded PRNG. Returns a new array.
|
|
1399
2293
|
*/
|
|
@@ -1576,89 +2470,6 @@ declare const chatFixture: FixtureGenerator;
|
|
|
1576
2470
|
declare const SEALED_PROMPT_REGISTRY: Readonly<Record<string, string>>;
|
|
1577
2471
|
declare const DEFAULT_ASSISTANT_RUBRIC_ID = "assistant-rubric-v1";
|
|
1578
2472
|
|
|
1579
|
-
/**
|
|
1580
|
-
* Shared types for the Assistant bench tier.
|
|
1581
|
-
*
|
|
1582
|
-
* Every Assistant benchmark shares the same shape:
|
|
1583
|
-
* - A synthetic memory graph (facts, stances, entities) the agent may read.
|
|
1584
|
-
* - A scenario prompt given to the agent.
|
|
1585
|
-
* - A sealed-rubric judge pass that scores the agent's output along
|
|
1586
|
-
* identity_accuracy / stance_coherence / novelty / calibration.
|
|
1587
|
-
*
|
|
1588
|
-
* The goal is reviewability: each benchmark folder ships a small fixture.ts
|
|
1589
|
-
* that returns `AssistantScenario` values, and the runner wires the shared
|
|
1590
|
-
* multi-run + bootstrap-CI infrastructure around them.
|
|
1591
|
-
*/
|
|
1592
|
-
|
|
1593
|
-
interface AssistantMemoryFact {
|
|
1594
|
-
id: string;
|
|
1595
|
-
summary: string;
|
|
1596
|
-
/**
|
|
1597
|
-
* Free-form tags (topic, entity) used to render the memory-graph summary
|
|
1598
|
-
* that is handed to the judge. Not shown to the agent.
|
|
1599
|
-
*/
|
|
1600
|
-
tags?: string[];
|
|
1601
|
-
}
|
|
1602
|
-
interface AssistantStance {
|
|
1603
|
-
topic: string;
|
|
1604
|
-
position: string;
|
|
1605
|
-
}
|
|
1606
|
-
interface AssistantMemoryGraph {
|
|
1607
|
-
userHandle: string;
|
|
1608
|
-
userRole: string;
|
|
1609
|
-
facts: AssistantMemoryFact[];
|
|
1610
|
-
stances: AssistantStance[];
|
|
1611
|
-
openThreads: string[];
|
|
1612
|
-
}
|
|
1613
|
-
interface AssistantScenario {
|
|
1614
|
-
id: string;
|
|
1615
|
-
title: string;
|
|
1616
|
-
scenarioPrompt: string;
|
|
1617
|
-
memoryGraph: AssistantMemoryGraph;
|
|
1618
|
-
/**
|
|
1619
|
-
* Small label describing what the scenario is meant to exercise. Useful in
|
|
1620
|
-
* dashboards for filtering. Never exposed to the agent.
|
|
1621
|
-
*/
|
|
1622
|
-
focus: string;
|
|
1623
|
-
}
|
|
1624
|
-
/**
|
|
1625
|
-
* Minimal agent contract for the Assistant tier. The agent receives the
|
|
1626
|
-
* scenario prompt plus a pre-rendered memory view (analogous to what the
|
|
1627
|
-
* Remnic recall stack would hand to a downstream chat model), and returns
|
|
1628
|
-
* its final answer text.
|
|
1629
|
-
*/
|
|
1630
|
-
interface AssistantAgent {
|
|
1631
|
-
respond(request: {
|
|
1632
|
-
scenarioId: string;
|
|
1633
|
-
prompt: string;
|
|
1634
|
-
memoryView: string;
|
|
1635
|
-
}): Promise<string>;
|
|
1636
|
-
}
|
|
1637
|
-
interface AssistantRunnerOptions {
|
|
1638
|
-
agent: AssistantAgent;
|
|
1639
|
-
judge: StructuredJudge | undefined;
|
|
1640
|
-
rubricId?: string;
|
|
1641
|
-
/**
|
|
1642
|
-
* Directory where per-run spot-check JSONL files are appended. Defaults to
|
|
1643
|
-
* `<cwd>/benchmarks/results/spot-checks`.
|
|
1644
|
-
*/
|
|
1645
|
-
spotCheckDir?: string;
|
|
1646
|
-
/**
|
|
1647
|
-
* Seed array for deterministic multi-run scheduling. When omitted the
|
|
1648
|
-
* benchmark runner picks a fresh seed array via `buildBenchmarkRunSeeds`.
|
|
1649
|
-
*/
|
|
1650
|
-
seeds?: number[];
|
|
1651
|
-
/**
|
|
1652
|
-
* Override used by tests and CLI smoke runs to cap iterations. Must be
|
|
1653
|
-
* `>= 1`. The production contract is `>= 5` per the issue spec.
|
|
1654
|
-
*/
|
|
1655
|
-
runCount?: number;
|
|
1656
|
-
/**
|
|
1657
|
-
* Random-number factory for bootstrap sampling. Injected in tests.
|
|
1658
|
-
*/
|
|
1659
|
-
random?: () => number;
|
|
1660
|
-
}
|
|
1661
|
-
|
|
1662
2473
|
/**
|
|
1663
2474
|
* Shared runner scaffolding for the Assistant bench tier.
|
|
1664
2475
|
*
|
|
@@ -1757,4 +2568,598 @@ declare const ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS: AssistantScenario[];
|
|
|
1757
2568
|
declare const assistantSynthesisDefinition: BenchmarkDefinition;
|
|
1758
2569
|
declare function runAssistantSynthesisBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
1759
2570
|
|
|
1760
|
-
|
|
2571
|
+
interface ProceduralRecallE2eCase {
|
|
2572
|
+
id: string;
|
|
2573
|
+
prompt: string;
|
|
2574
|
+
procedurePreamble: string;
|
|
2575
|
+
procedureSteps: Array<{
|
|
2576
|
+
order: number;
|
|
2577
|
+
intent: string;
|
|
2578
|
+
}>;
|
|
2579
|
+
procedureTags: string[];
|
|
2580
|
+
/** When true, `buildProcedureRecallSection` should return non-null markdown. */
|
|
2581
|
+
expectNonNullSection: boolean;
|
|
2582
|
+
proceduralEnabled?: boolean;
|
|
2583
|
+
}
|
|
2584
|
+
|
|
2585
|
+
/**
|
|
2586
|
+
* Scenario shape for the ablation harness. A superset of
|
|
2587
|
+
* `ProceduralRecallE2eCase` with an `expectMatch` alias so downstream fixtures
|
|
2588
|
+
* can be expressed in either vocabulary.
|
|
2589
|
+
*/
|
|
2590
|
+
interface ProceduralAblationScenario {
|
|
2591
|
+
id: string;
|
|
2592
|
+
prompt: string;
|
|
2593
|
+
procedurePreamble: string;
|
|
2594
|
+
procedureSteps: Array<{
|
|
2595
|
+
order: number;
|
|
2596
|
+
intent: string;
|
|
2597
|
+
}>;
|
|
2598
|
+
procedureTags: string[];
|
|
2599
|
+
/**
|
|
2600
|
+
* True when the prompt should recall the procedure. False for distractor /
|
|
2601
|
+
* non-task-initiation prompts where we expect the gate to reject.
|
|
2602
|
+
*/
|
|
2603
|
+
expectMatch: boolean;
|
|
2604
|
+
}
|
|
2605
|
+
interface ProceduralAblationPerCase {
|
|
2606
|
+
id: string;
|
|
2607
|
+
prompt: string;
|
|
2608
|
+
expectMatch: boolean;
|
|
2609
|
+
onMatched: boolean;
|
|
2610
|
+
offMatched: boolean;
|
|
2611
|
+
onScore: number;
|
|
2612
|
+
offScore: number;
|
|
2613
|
+
}
|
|
2614
|
+
interface ProceduralAblationArtifact {
|
|
2615
|
+
schemaVersion: 1;
|
|
2616
|
+
fixture: {
|
|
2617
|
+
path: string | null;
|
|
2618
|
+
scenarioCount: number;
|
|
2619
|
+
};
|
|
2620
|
+
onScore: number;
|
|
2621
|
+
offScore: number;
|
|
2622
|
+
lift: number;
|
|
2623
|
+
confidenceInterval: ConfidenceInterval;
|
|
2624
|
+
perCase: ProceduralAblationPerCase[];
|
|
2625
|
+
generatedAt: string;
|
|
2626
|
+
}
|
|
2627
|
+
/**
|
|
2628
|
+
* Convert the existing `ProceduralRecallE2eCase` fixture into
|
|
2629
|
+
* ablation-scenario shape. The ablation ALWAYS sweeps procedural on and off,
|
|
2630
|
+
* so `expectMatch` must reflect what the prompt + procedure pair should do
|
|
2631
|
+
* WHEN PROCEDURAL IS ON — not what the original row's `proceduralEnabled`
|
|
2632
|
+
* flag produced.
|
|
2633
|
+
*
|
|
2634
|
+
* Gate-control rows in the e2e fixture (where `proceduralEnabled=false`
|
|
2635
|
+
* produces `expectNonNullSection=false` only because of the gate, not the
|
|
2636
|
+
* content) are excluded here: their ON-side outcome is content-dependent and
|
|
2637
|
+
* not something this mapper can label correctly without re-running
|
|
2638
|
+
* `buildProcedureRecallSection`. Callers that need those rows should write
|
|
2639
|
+
* the scenario directly with an explicit `expectMatch`.
|
|
2640
|
+
*/
|
|
2641
|
+
declare function fixtureToAblationScenarios(fixture: ProceduralRecallE2eCase[]): ProceduralAblationScenario[];
|
|
2642
|
+
/**
|
|
2643
|
+
* Default bootstrap seed used when no `random` / `seed` override is supplied.
|
|
2644
|
+
* Fixing this makes CI bounds reproducible across CLI invocations — flaky CI
|
|
2645
|
+
* bounds would break artifact-based comparisons and saved baselines.
|
|
2646
|
+
*/
|
|
2647
|
+
declare const DEFAULT_ABLATION_BOOTSTRAP_SEED = 1919249774;
|
|
2648
|
+
/**
|
|
2649
|
+
* Mulberry32 seeded RNG. Inlined (and re-used from tests) so callers can get a
|
|
2650
|
+
* deterministic default without needing an external dependency.
|
|
2651
|
+
*/
|
|
2652
|
+
declare function createSeededRandom(seed: number): () => number;
|
|
2653
|
+
interface RunProceduralAblationOptions {
|
|
2654
|
+
scenarios: ProceduralAblationScenario[];
|
|
2655
|
+
/** Path the ablation was loaded from (echoed back into the artifact). */
|
|
2656
|
+
fixturePath?: string | null;
|
|
2657
|
+
/** Bootstrap iterations for CI on the paired delta (default: 1_000). */
|
|
2658
|
+
bootstrapIterations?: number;
|
|
2659
|
+
/**
|
|
2660
|
+
* Seeded RNG for the bootstrap. Defaults to
|
|
2661
|
+
* `createSeededRandom(DEFAULT_ABLATION_BOOTSTRAP_SEED)` so CI bounds are
|
|
2662
|
+
* deterministic across repeated CLI invocations. Pass `Math.random`
|
|
2663
|
+
* explicitly to opt into non-deterministic sampling.
|
|
2664
|
+
*/
|
|
2665
|
+
random?: () => number;
|
|
2666
|
+
/**
|
|
2667
|
+
* Convenience alternative to `random`: if provided (and `random` is not),
|
|
2668
|
+
* a seeded mulberry32 RNG is built from this integer.
|
|
2669
|
+
*/
|
|
2670
|
+
seed?: number;
|
|
2671
|
+
}
|
|
2672
|
+
/**
|
|
2673
|
+
* Pure entrypoint — accepts a scenario list and returns the artifact. Reads
|
|
2674
|
+
* and writes are isolated to the StorageManager temp directories the sides
|
|
2675
|
+
* create and remove internally.
|
|
2676
|
+
*/
|
|
2677
|
+
declare function runProceduralAblation(options: RunProceduralAblationOptions): Promise<ProceduralAblationArtifact>;
|
|
2678
|
+
/**
|
|
2679
|
+
* Load a scenario list from a JSON file. Validates the JSON is an object with
|
|
2680
|
+
* a `scenarios` array (or a bare array) and each entry has the required
|
|
2681
|
+
* fields. Rejects invalid input per CLAUDE.md rule 51 rather than silently
|
|
2682
|
+
* defaulting.
|
|
2683
|
+
*/
|
|
2684
|
+
declare function loadAblationFixture(fixturePath: string): Promise<ProceduralAblationScenario[]>;
|
|
2685
|
+
/**
|
|
2686
|
+
* CLI entrypoint. Resolves `--fixture <path>` (defaults to the built-in e2e
|
|
2687
|
+
* fixture converted to ablation scenarios when unset) and writes the artifact
|
|
2688
|
+
* to `--out <path>`. Validates inputs per CLAUDE.md rules 14 / 17 / 51.
|
|
2689
|
+
*/
|
|
2690
|
+
interface RunProceduralAblationCliArgs {
|
|
2691
|
+
fixturePath: string | null;
|
|
2692
|
+
outPath: string;
|
|
2693
|
+
bootstrapIterations?: number;
|
|
2694
|
+
random?: () => number;
|
|
2695
|
+
/**
|
|
2696
|
+
* Optional seed for the bootstrap RNG. When omitted the harness uses
|
|
2697
|
+
* `DEFAULT_ABLATION_BOOTSTRAP_SEED` so CLI runs are reproducible by
|
|
2698
|
+
* default.
|
|
2699
|
+
*/
|
|
2700
|
+
seed?: number;
|
|
2701
|
+
}
|
|
2702
|
+
declare function runProceduralAblationCli(args: RunProceduralAblationCliArgs): Promise<ProceduralAblationArtifact>;
|
|
2703
|
+
|
|
2704
|
+
/**
|
|
2705
|
+
* Real-fixture procedural-recall scenarios (issue #567 PR 2/5).
|
|
2706
|
+
*
|
|
2707
|
+
* 20 synthetic but realistic scenarios grouped across four categories:
|
|
2708
|
+
*
|
|
2709
|
+
* - exact-re-run prompt matches a stored procedure near-verbatim;
|
|
2710
|
+
* should recall when procedural is on.
|
|
2711
|
+
* - parameter-variation prompt references the same intent with different
|
|
2712
|
+
* nouns (service name, environment, ticket id);
|
|
2713
|
+
* should recall on overlap + intent compatibility.
|
|
2714
|
+
* - decomposition prompt starts a multi-step task whose steps match
|
|
2715
|
+
* a stored runbook; should recall.
|
|
2716
|
+
* - distractor-rejection prompt looks task-like but the stored procedure
|
|
2717
|
+
* is unrelated — the gate should REJECT (expectMatch
|
|
2718
|
+
* = false).
|
|
2719
|
+
*
|
|
2720
|
+
* All scenarios are deterministic and use ONLY token-overlap + intent
|
|
2721
|
+
* classification semantics (no LLM calls). The deterministic stub LLM
|
|
2722
|
+
* requirement from #567 is satisfied because `buildProcedureRecallSection`
|
|
2723
|
+
* is a pure function of storage + prompt + config. A human runbook for
|
|
2724
|
+
* exercising the gpt-4o-mini path lives in docs/benchmarks/procedural-recall.md.
|
|
2725
|
+
*
|
|
2726
|
+
* Scenarios are synthetic (no personal data), per CLAUDE.md public-repo
|
|
2727
|
+
* privacy policy.
|
|
2728
|
+
*/
|
|
2729
|
+
|
|
2730
|
+
type ProceduralRealScenarioCategory = "exact-re-run" | "parameter-variation" | "decomposition" | "distractor-rejection";
|
|
2731
|
+
interface ProceduralRealScenario extends ProceduralAblationScenario {
|
|
2732
|
+
category: ProceduralRealScenarioCategory;
|
|
2733
|
+
notes?: string;
|
|
2734
|
+
}
|
|
2735
|
+
declare const PROCEDURAL_REAL_SCENARIOS: ProceduralRealScenario[];
|
|
2736
|
+
/** Built-in smoke slice (first scenario from each category). */
|
|
2737
|
+
declare const PROCEDURAL_REAL_SCENARIOS_SMOKE: ProceduralRealScenario[];
|
|
2738
|
+
|
|
2739
|
+
/**
|
|
2740
|
+
* Types for the ADAM-style memory-extraction attack harness.
|
|
2741
|
+
*
|
|
2742
|
+
* See docs/security/memory-extraction-threat-model.md for the threat model
|
|
2743
|
+
* this harness probes. The harness targets the modeled read-path surfaces
|
|
2744
|
+
* enumerated in §4 of that document (recall / memory_search /
|
|
2745
|
+
* memory_entities_list / …), driven in-process against a seeded memory
|
|
2746
|
+
* fixture so tests do not need a running daemon.
|
|
2747
|
+
*/
|
|
2748
|
+
/**
|
|
2749
|
+
* Attacker knowledge tier the harness simulates.
|
|
2750
|
+
*
|
|
2751
|
+
* These correspond to the T1/T2/T3 tiers in the threat model (§3):
|
|
2752
|
+
*
|
|
2753
|
+
* - `zero-knowledge` — attacker has no prior information about the memory
|
|
2754
|
+
* contents, must probe purely from seed vocabulary. Roughly T1 with a
|
|
2755
|
+
* minimal token (or T2 on a newly-provisioned namespace).
|
|
2756
|
+
* - `same-namespace` — attacker holds a valid token for the same namespace as
|
|
2757
|
+
* the seeded memories and may use entity-graph enumeration as a side channel.
|
|
2758
|
+
* This is the primary T2 tier.
|
|
2759
|
+
* - `cross-namespace` — attacker holds a valid token for a *different*
|
|
2760
|
+
* namespace but attempts to leak memories from the victim namespace via
|
|
2761
|
+
* shared-namespace auto-promotion or debug snapshots (T3).
|
|
2762
|
+
*/
|
|
2763
|
+
type AttackerMode = "zero-knowledge" | "same-namespace" | "cross-namespace";
|
|
2764
|
+
/**
|
|
2765
|
+
* A single seeded memory the harness treats as ground truth.
|
|
2766
|
+
*
|
|
2767
|
+
* Ground-truth labelling is intentionally coarse: `tokens` is the set of
|
|
2768
|
+
* salient lowercase tokens that uniquely identify this memory to a human.
|
|
2769
|
+
* A recovered query transcript is considered to have leaked the memory if
|
|
2770
|
+
* the response contains a substring that covers a configurable fraction of
|
|
2771
|
+
* these tokens (see `recoveryTokenOverlap`).
|
|
2772
|
+
*/
|
|
2773
|
+
interface SeededMemory {
|
|
2774
|
+
/** Stable identifier. */
|
|
2775
|
+
id: string;
|
|
2776
|
+
/** Raw memory text as it would be stored. */
|
|
2777
|
+
content: string;
|
|
2778
|
+
/**
|
|
2779
|
+
* Category bucket (fact / preference / decision / entity / …). Mirrors the
|
|
2780
|
+
* buckets the threat model lists in §2 (Assets). Used by the harness only
|
|
2781
|
+
* for reporting; not used in the attack loop itself.
|
|
2782
|
+
*/
|
|
2783
|
+
category: "fact" | "preference" | "decision" | "entity" | "other";
|
|
2784
|
+
/** Namespace the memory lives in. */
|
|
2785
|
+
namespace: string;
|
|
2786
|
+
/**
|
|
2787
|
+
* Optional set of salient tokens that define "the attacker recovered this
|
|
2788
|
+
* memory". If omitted, defaults to all alphanumeric tokens of length > 2 in
|
|
2789
|
+
* `content`.
|
|
2790
|
+
*/
|
|
2791
|
+
tokens?: string[];
|
|
2792
|
+
}
|
|
2793
|
+
/**
|
|
2794
|
+
* One retrieval result returned by the target surface.
|
|
2795
|
+
*
|
|
2796
|
+
* The shape is intentionally narrower than `MemoryRecord` in core — the
|
|
2797
|
+
* harness only needs the attacker-observable subset.
|
|
2798
|
+
*/
|
|
2799
|
+
interface AttackRetrievalHit {
|
|
2800
|
+
/**
|
|
2801
|
+
* Stable memory identifier if the surface exposes one. Attackers can use
|
|
2802
|
+
* this as a side channel (memory IDs are disclosed by recall responses in
|
|
2803
|
+
* the current MCP surface), so we model it explicitly.
|
|
2804
|
+
*/
|
|
2805
|
+
memoryId?: string;
|
|
2806
|
+
/** Namespace the memory came from, if the surface discloses it. */
|
|
2807
|
+
namespace?: string;
|
|
2808
|
+
/** Text content (summary or full) the surface returned. */
|
|
2809
|
+
content: string;
|
|
2810
|
+
/** Optional relevance score. */
|
|
2811
|
+
score?: number;
|
|
2812
|
+
}
|
|
2813
|
+
/**
|
|
2814
|
+
* Minimal contract an attack target must satisfy.
|
|
2815
|
+
*
|
|
2816
|
+
* Callers wrap the real `EngramAccessService.recall()` or a test stub. The
|
|
2817
|
+
* harness deliberately does not depend on `@remnic/core` directly; PR 3
|
|
2818
|
+
* will provide the binding to the real orchestrator.
|
|
2819
|
+
*/
|
|
2820
|
+
interface ExtractionAttackTarget {
|
|
2821
|
+
/**
|
|
2822
|
+
* Execute a recall query against the target and return its hits.
|
|
2823
|
+
*
|
|
2824
|
+
* Should throw (or return an empty array) when the target denies the
|
|
2825
|
+
* query — the harness treats both as "no information gained".
|
|
2826
|
+
*/
|
|
2827
|
+
recall(query: string, options?: AttackRecallOptions): Promise<AttackRetrievalHit[]>;
|
|
2828
|
+
/**
|
|
2829
|
+
* Optional side channel: enumerate known entity names. Present iff the
|
|
2830
|
+
* attacker mode was granted access to `memory_entities_list`.
|
|
2831
|
+
*/
|
|
2832
|
+
listEntities?(): Promise<string[]>;
|
|
2833
|
+
}
|
|
2834
|
+
interface AttackRecallOptions {
|
|
2835
|
+
/** Top-K to request. Defaults to harness budget. */
|
|
2836
|
+
topK?: number;
|
|
2837
|
+
/** Namespace override (harness uses this for T3 cross-namespace probes). */
|
|
2838
|
+
namespace?: string;
|
|
2839
|
+
}
|
|
2840
|
+
/**
|
|
2841
|
+
* Deterministic PRNG interface. Callers can pass a seeded PRNG to make runs
|
|
2842
|
+
* reproducible.
|
|
2843
|
+
*/
|
|
2844
|
+
interface HarnessRng {
|
|
2845
|
+
/** Returns a float in [0, 1). */
|
|
2846
|
+
next(): number;
|
|
2847
|
+
}
|
|
2848
|
+
/**
|
|
2849
|
+
* Configuration for a single harness run.
|
|
2850
|
+
*/
|
|
2851
|
+
interface ExtractionAttackOptions {
|
|
2852
|
+
target: ExtractionAttackTarget;
|
|
2853
|
+
/** Ground-truth memories the harness is trying to recover. */
|
|
2854
|
+
groundTruth: readonly SeededMemory[];
|
|
2855
|
+
/** Attacker knowledge tier. */
|
|
2856
|
+
attackerMode: AttackerMode;
|
|
2857
|
+
/** Maximum number of recall queries the harness is allowed to issue. */
|
|
2858
|
+
queryBudget: number;
|
|
2859
|
+
/**
|
|
2860
|
+
* Hyper-parameter for the adaptive loop: when the normalized entropy of
|
|
2861
|
+
* the response distribution falls below this value, the harness switches
|
|
2862
|
+
* to exploitation (repeating high-signal tokens) rather than exploration.
|
|
2863
|
+
* Defaults to 0.3.
|
|
2864
|
+
*/
|
|
2865
|
+
entropyThreshold?: number;
|
|
2866
|
+
/**
|
|
2867
|
+
* Deterministic RNG. Callers should pass a seeded instance to get
|
|
2868
|
+
* reproducible results.
|
|
2869
|
+
*/
|
|
2870
|
+
rng?: HarnessRng;
|
|
2871
|
+
/**
|
|
2872
|
+
* Seed vocabulary the attacker starts from. In `zero-knowledge` mode this
|
|
2873
|
+
* is the *only* prior information; in `same-namespace` mode it is a seed
|
|
2874
|
+
* for exploration. Defaults to a small English stop-list plus common
|
|
2875
|
+
* personal-memory topics.
|
|
2876
|
+
*/
|
|
2877
|
+
seedVocabulary?: readonly string[];
|
|
2878
|
+
/**
|
|
2879
|
+
* Fraction of ground-truth tokens a single retrieved response must cover
|
|
2880
|
+
* to count as "recovered". Defaults to 0.5.
|
|
2881
|
+
*/
|
|
2882
|
+
recoveryTokenOverlap?: number;
|
|
2883
|
+
/** If true, every query and response is kept in `timeline`. */
|
|
2884
|
+
captureTimeline?: boolean;
|
|
2885
|
+
/** Top-K to request per query. Defaults to 10. */
|
|
2886
|
+
topK?: number;
|
|
2887
|
+
/**
|
|
2888
|
+
* Namespace the attacker addresses their queries to. When set, every
|
|
2889
|
+
* `target.recall()` is invoked with `namespace: attackerNamespace`.
|
|
2890
|
+
* Useful for T3-class runs where the caller wants to simulate an
|
|
2891
|
+
* attacker holding a token for a specific cross-namespace tenant.
|
|
2892
|
+
*
|
|
2893
|
+
* Defaults:
|
|
2894
|
+
* - `zero-knowledge`: undefined (target uses its own default).
|
|
2895
|
+
* - `same-namespace`: undefined (target uses its own default).
|
|
2896
|
+
* - `cross-namespace`: `"shared"` (matches the residual-leak path the
|
|
2897
|
+
* threat model calls out in §5, but callers targeting other namespace
|
|
2898
|
+
* models should pass an explicit value here).
|
|
2899
|
+
*/
|
|
2900
|
+
attackerNamespace?: string;
|
|
2901
|
+
/**
|
|
2902
|
+
* Optional absolute deadline in ms since epoch. If the harness crosses it
|
|
2903
|
+
* during the attack loop, it terminates early with a partial result. Used
|
|
2904
|
+
* by tests to keep runs bounded.
|
|
2905
|
+
*/
|
|
2906
|
+
deadlineMs?: number;
|
|
2907
|
+
/**
|
|
2908
|
+
* When true, a thrown error from `target.recall` aborts the attack
|
|
2909
|
+
* loop and re-throws. Default false — errors are counted in
|
|
2910
|
+
* `ExtractionAttackResult.backendErrorCount` so callers can distinguish
|
|
2911
|
+
* genuine empty recalls from backend failures. Flip to true in CI
|
|
2912
|
+
* gating scripts that must not silently publish ASR from a degraded
|
|
2913
|
+
* target.
|
|
2914
|
+
*/
|
|
2915
|
+
failOnBackendError?: boolean;
|
|
2916
|
+
}
|
|
2917
|
+
interface RecoveredMemory {
|
|
2918
|
+
memoryId: string;
|
|
2919
|
+
memory: SeededMemory;
|
|
2920
|
+
recoveredContent: string;
|
|
2921
|
+
queriesUsed: number;
|
|
2922
|
+
/** Index into `timeline` that first recovered this memory. */
|
|
2923
|
+
firstHitAt: number;
|
|
2924
|
+
}
|
|
2925
|
+
interface TimelineEntry {
|
|
2926
|
+
query: string;
|
|
2927
|
+
hits: AttackRetrievalHit[];
|
|
2928
|
+
entropy: number;
|
|
2929
|
+
newlyRecoveredMemoryIds: string[];
|
|
2930
|
+
/** Which strategy chose this query. Useful for diagnosing the algorithm. */
|
|
2931
|
+
strategy: "seed" | "exploit-entity" | "exploit-token" | "explore-random" | "explore-entropy";
|
|
2932
|
+
}
|
|
2933
|
+
interface ExtractionAttackResult {
|
|
2934
|
+
/** Attack Success Rate: fraction of ground-truth memories recovered. */
|
|
2935
|
+
asr: number;
|
|
2936
|
+
/** Number of queries issued (may be less than budget on early exit). */
|
|
2937
|
+
queriesIssued: number;
|
|
2938
|
+
/** Attacker mode this run simulated. */
|
|
2939
|
+
attackerMode: AttackerMode;
|
|
2940
|
+
/** Recovered memories with per-memory metadata. */
|
|
2941
|
+
recovered: RecoveredMemory[];
|
|
2942
|
+
/** Ground-truth memories the attacker failed to recover within budget. */
|
|
2943
|
+
missed: SeededMemory[];
|
|
2944
|
+
/** Full query-by-query trace. Empty unless `captureTimeline: true`. */
|
|
2945
|
+
timeline: TimelineEntry[];
|
|
2946
|
+
/** Milliseconds of wall time spent inside the attack loop. */
|
|
2947
|
+
durationMs: number;
|
|
2948
|
+
/** True iff the run stopped because `deadlineMs` was reached. */
|
|
2949
|
+
hitDeadline: boolean;
|
|
2950
|
+
/**
|
|
2951
|
+
* Number of `target.recall` calls that threw and were treated as empty
|
|
2952
|
+
* hits. A high value means the harness was talking to a degraded
|
|
2953
|
+
* backend — low/zero ASR in that case is not a security statement
|
|
2954
|
+
* about the system, it is a measurement failure. Callers that want to
|
|
2955
|
+
* fail-fast on backend errors can pass `failOnBackendError: true`.
|
|
2956
|
+
*/
|
|
2957
|
+
backendErrorCount: number;
|
|
2958
|
+
}
|
|
2959
|
+
|
|
2960
|
+
/**
|
|
2961
|
+
* ADAM-style entropy-guided memory-extraction attack harness.
|
|
2962
|
+
*
|
|
2963
|
+
* Re-implements the entropy-guided adaptive querying strategy described in
|
|
2964
|
+
* ADAM (arXiv:2604.09747, Apr 2026): the attacker issues a sequence of
|
|
2965
|
+
* recall queries, observes the information gained from each response, and
|
|
2966
|
+
* picks the next query to maximize expected entropy reduction over the
|
|
2967
|
+
* remaining candidate memories.
|
|
2968
|
+
*
|
|
2969
|
+
* This is a clean-room re-implementation, not a port of any released
|
|
2970
|
+
* codebase. The algorithm is:
|
|
2971
|
+
*
|
|
2972
|
+
* 1. Initialize candidate-token pool from the seed vocabulary and (mode
|
|
2973
|
+
* permitting) side channels like entity listings.
|
|
2974
|
+
* 2. Loop until budget exhausted or all memories recovered:
|
|
2975
|
+
* a. Compute Shannon entropy over the attacker's current belief
|
|
2976
|
+
* distribution (`tokenFrequencies`). Low entropy => we have a
|
|
2977
|
+
* concentrated belief; exploit by querying the top tokens. High
|
|
2978
|
+
* entropy => we are uncertain; explore by querying a token we have
|
|
2979
|
+
* not tried yet.
|
|
2980
|
+
* b. Issue the chosen query against the target.
|
|
2981
|
+
* c. Update the belief from the response: tokens appearing in hit
|
|
2982
|
+
* content get their frequency bumped; tokens that keep appearing
|
|
2983
|
+
* alongside already-recovered content get deprioritized so the
|
|
2984
|
+
* attacker does not re-query the same region forever.
|
|
2985
|
+
* d. Check each hit against the ground-truth set; mark recoveries.
|
|
2986
|
+
* 3. Emit ASR + per-memory leak log.
|
|
2987
|
+
*
|
|
2988
|
+
* The harness never calls an LLM. Token "information gain" is computed from
|
|
2989
|
+
* lexical overlap against previously-seen hits; this is the non-LLM
|
|
2990
|
+
* approximation the paper uses for its low-cost variant, and is adequate
|
|
2991
|
+
* for measuring the attack surface's structural leakage.
|
|
2992
|
+
*/
|
|
2993
|
+
|
|
2994
|
+
/**
|
|
2995
|
+
* Tiny mulberry32 PRNG — stable across Node versions.
|
|
2996
|
+
*/
|
|
2997
|
+
declare function createSeededRng(seed: number): HarnessRng;
|
|
2998
|
+
/**
|
|
2999
|
+
* Entry point. See `types.ts` for the options contract.
|
|
3000
|
+
*/
|
|
3001
|
+
declare function runExtractionAttack(options: ExtractionAttackOptions): Promise<ExtractionAttackResult>;
|
|
3002
|
+
|
|
3003
|
+
/**
|
|
3004
|
+
* Synthetic memory fixture and in-process target implementation for the
|
|
3005
|
+
* extraction-attack harness tests.
|
|
3006
|
+
*
|
|
3007
|
+
* Everything in this file is synthetic. Per the public-repo privacy policy
|
|
3008
|
+
* in CLAUDE.md, no real user data may ship in fixtures.
|
|
3009
|
+
*/
|
|
3010
|
+
|
|
3011
|
+
/**
|
|
3012
|
+
* 15 synthetic seeded memories covering fact, preference, decision, and
|
|
3013
|
+
* entity categories across two namespaces. Intentionally mundane so no
|
|
3014
|
+
* reader mistakes any of this for real personal data.
|
|
3015
|
+
*/
|
|
3016
|
+
declare const SYNTHETIC_MEMORIES: readonly SeededMemory[];
|
|
3017
|
+
/**
|
|
3018
|
+
* A second namespace used for the cross-namespace test. This namespace's
|
|
3019
|
+
* contents MUST stay separate from `victim` — the T3 attack should fail if
|
|
3020
|
+
* the surface honors ACLs.
|
|
3021
|
+
*/
|
|
3022
|
+
declare const OTHER_NAMESPACE_MEMORIES: readonly SeededMemory[];
|
|
3023
|
+
interface SyntheticTargetOptions {
|
|
3024
|
+
/** Memories visible through normal recall. */
|
|
3025
|
+
memories: readonly SeededMemory[];
|
|
3026
|
+
/** Entities the side channel should enumerate. */
|
|
3027
|
+
entities?: readonly string[];
|
|
3028
|
+
/**
|
|
3029
|
+
* When true, the target enforces namespace ACLs: a recall with a namespace
|
|
3030
|
+
* other than `allowedNamespace` returns an empty array. Models the T3
|
|
3031
|
+
* mitigation path in the threat model §6.1.
|
|
3032
|
+
*/
|
|
3033
|
+
enforceNamespaceAcl?: boolean;
|
|
3034
|
+
/** The only namespace the attacker is entitled to read. */
|
|
3035
|
+
allowedNamespace?: string;
|
|
3036
|
+
/**
|
|
3037
|
+
* When true, the target always includes memory IDs in hits. When false,
|
|
3038
|
+
* the target masks IDs (models a mitigation where recall responses no
|
|
3039
|
+
* longer disclose memory IDs as a side channel).
|
|
3040
|
+
*/
|
|
3041
|
+
disclosesMemoryIds?: boolean;
|
|
3042
|
+
/**
|
|
3043
|
+
* How many hits to return per recall. Mirrors QMD behavior where cutoff
|
|
3044
|
+
* is based on score threshold. Defaults to 5.
|
|
3045
|
+
*/
|
|
3046
|
+
hitCap?: number;
|
|
3047
|
+
}
|
|
3048
|
+
/**
|
|
3049
|
+
* Deterministic in-process target. Scores memories by token-overlap with the
|
|
3050
|
+
* query and returns the top-K above zero.
|
|
3051
|
+
*/
|
|
3052
|
+
declare function createSyntheticTarget(options: SyntheticTargetOptions): ExtractionAttackTarget;
|
|
3053
|
+
|
|
3054
|
+
/**
|
|
3055
|
+
* Baseline measurement runner for the ADAM memory-extraction harness.
|
|
3056
|
+
*
|
|
3057
|
+
* Produces a reproducible set of ASR numbers for every attacker tier against
|
|
3058
|
+
* a synthetic target that mirrors the current Remnic read-path behavior
|
|
3059
|
+
* (memory IDs disclosed, namespace ACL enforced on cross-namespace reads).
|
|
3060
|
+
*
|
|
3061
|
+
* This is intentionally separate from the unit tests: tests keep budgets
|
|
3062
|
+
* small so CI stays fast, whereas the baseline pushes the budget high enough
|
|
3063
|
+
* for each tier to plateau. The output feeds into
|
|
3064
|
+
* `docs/security/adam-baseline-2026-04.md`.
|
|
3065
|
+
*/
|
|
3066
|
+
|
|
3067
|
+
interface BaselineScenario {
|
|
3068
|
+
readonly name: string;
|
|
3069
|
+
readonly attackerMode: AttackerMode;
|
|
3070
|
+
readonly queryBudget: number;
|
|
3071
|
+
readonly seed: number;
|
|
3072
|
+
/** Ground truth the attacker is trying to recover. */
|
|
3073
|
+
readonly groundTruth: readonly SeededMemory[];
|
|
3074
|
+
/** Memories the target actually stores (may be a superset of groundTruth). */
|
|
3075
|
+
readonly targetMemories: readonly SeededMemory[];
|
|
3076
|
+
readonly entities?: readonly string[];
|
|
3077
|
+
readonly enforceNamespaceAcl?: boolean;
|
|
3078
|
+
readonly allowedNamespace?: string;
|
|
3079
|
+
readonly disclosesMemoryIds?: boolean;
|
|
3080
|
+
/** Attacker-held namespace. Forwarded as `attackerNamespace` to the runner. */
|
|
3081
|
+
readonly attackerNamespace?: string;
|
|
3082
|
+
}
|
|
3083
|
+
interface BaselineRow {
|
|
3084
|
+
readonly scenario: string;
|
|
3085
|
+
readonly attackerMode: AttackerMode;
|
|
3086
|
+
readonly queryBudget: number;
|
|
3087
|
+
readonly queriesIssued: number;
|
|
3088
|
+
readonly asr: number;
|
|
3089
|
+
readonly recoveredIds: readonly string[];
|
|
3090
|
+
readonly missedIds: readonly string[];
|
|
3091
|
+
readonly durationMs: number;
|
|
3092
|
+
/** Whether mitigations were active during this run. */
|
|
3093
|
+
readonly mitigated?: boolean;
|
|
3094
|
+
}
|
|
3095
|
+
/**
|
|
3096
|
+
* Scenarios used for the 2026-04 baseline. Kept deterministic via fixed seeds
|
|
3097
|
+
* so the document remains reproducible.
|
|
3098
|
+
*/
|
|
3099
|
+
declare const DEFAULT_BASELINE_SCENARIOS: readonly BaselineScenario[];
|
|
3100
|
+
/**
|
|
3101
|
+
* Executes every scenario once and returns a flat set of rows suitable for
|
|
3102
|
+
* rendering as a markdown table.
|
|
3103
|
+
*/
|
|
3104
|
+
declare function runBaseline(scenarios?: readonly BaselineScenario[]): Promise<BaselineRow[]>;
|
|
3105
|
+
interface MitigatedBaselineConfig {
|
|
3106
|
+
budgetHardLimit: number;
|
|
3107
|
+
budgetWindowMs?: number;
|
|
3108
|
+
/**
|
|
3109
|
+
* Override for the principal's "home" namespace in the mitigated target.
|
|
3110
|
+
* When set, this is passed as `principalNamespace` to `createMitigatedTarget`.
|
|
3111
|
+
* When unset, falls back to `allowedNamespace ?? "default"`.
|
|
3112
|
+
* Use this to decouple the budget's principal identity from the synthetic
|
|
3113
|
+
* target's ACL namespace.
|
|
3114
|
+
*/
|
|
3115
|
+
principalNamespaceOverride?: string;
|
|
3116
|
+
}
|
|
3117
|
+
declare const MITIGATED_BASELINE_SCENARIOS: readonly (BaselineScenario & MitigatedBaselineConfig)[];
|
|
3118
|
+
declare function runMitigatedBaseline(scenarios?: readonly (BaselineScenario & MitigatedBaselineConfig)[]): Promise<BaselineRow[]>;
|
|
3119
|
+
/**
|
|
3120
|
+
* Renders a baseline run as a human-readable markdown fragment. The returned
|
|
3121
|
+
* string is suitable for pasting into the baseline document.
|
|
3122
|
+
*/
|
|
3123
|
+
declare function renderBaselineMarkdown(rows: readonly BaselineRow[]): string;
|
|
3124
|
+
|
|
3125
|
+
/**
|
|
3126
|
+
* Mitigation-aware target wrapper for the ADAM extraction attack harness.
|
|
3127
|
+
*
|
|
3128
|
+
* Wraps a raw `ExtractionAttackTarget` and enforces:
|
|
3129
|
+
* 1. Cross-namespace query budget (mirrors `CrossNamespaceBudget` from core)
|
|
3130
|
+
* 2. Namespace ACL (carries forward from `createSyntheticTarget`)
|
|
3131
|
+
*
|
|
3132
|
+
* When the budget is exceeded, the wrapper returns empty hits instead of
|
|
3133
|
+
* forwarding the query — simulating the real recall-path denial. This lets
|
|
3134
|
+
* the harness re-measure ASR with mitigations active and compare against
|
|
3135
|
+
* the unmitigated baseline.
|
|
3136
|
+
*/
|
|
3137
|
+
|
|
3138
|
+
interface MitigatedTargetConfig {
|
|
3139
|
+
/** Inner (unmitigated) target to wrap. */
|
|
3140
|
+
target: ExtractionAttackTarget;
|
|
3141
|
+
/**
|
|
3142
|
+
* Maximum cross-namespace queries per `budgetWindowMs` window.
|
|
3143
|
+
* Queries beyond this limit return empty hits.
|
|
3144
|
+
*/
|
|
3145
|
+
budgetHardLimit: number;
|
|
3146
|
+
/**
|
|
3147
|
+
* Rolling window in ms for the budget counter. Defaults to 60_000.
|
|
3148
|
+
*/
|
|
3149
|
+
budgetWindowMs?: number;
|
|
3150
|
+
/**
|
|
3151
|
+
* The principal's "home" namespace. Queries targeting a different
|
|
3152
|
+
* namespace count against the budget; same-namespace queries are free.
|
|
3153
|
+
*/
|
|
3154
|
+
principalNamespace: string;
|
|
3155
|
+
}
|
|
3156
|
+
/**
|
|
3157
|
+
* Creates a mitigation-aware wrapper around a raw target.
|
|
3158
|
+
*
|
|
3159
|
+
* The wrapper tracks cross-namespace queries in a sliding window and
|
|
3160
|
+
* returns empty hits when the budget is exceeded. Same-namespace queries
|
|
3161
|
+
* pass through without counting.
|
|
3162
|
+
*/
|
|
3163
|
+
declare function createMitigatedTarget(config: MitigatedTargetConfig): ExtractionAttackTarget;
|
|
3164
|
+
|
|
3165
|
+
export { AMA_BENCH_DIAGNOSTIC_VARIANTS, ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AmaBenchDiagnosticAdapterOptions, type AmaBenchDiagnosticAnswererMode, type AmaBenchDiagnosticBreakdown, type AmaBenchDiagnosticMatrixArtifact, type AmaBenchDiagnosticRecallMode, type AmaBenchDiagnosticRunContext, type AmaBenchDiagnosticTaskEvidence, type AmaBenchDiagnosticTaskRow, type AmaBenchDiagnosticVariant, type AmaBenchDiagnosticVariantSummary, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, type AttackRecallOptions, type AttackRetrievalHit, type AttackerMode, BENCHMARK_ARTIFACT_SCHEMA_VERSION, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_REPRO_MANIFEST_FILENAME, BENCHMARK_REPRO_MANIFEST_SCHEMA_VERSION, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BaselineRow, type BaselineScenario, type BeamDatasetPreview, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchReasoningEffort, type BenchRecallOptions, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkArtifact, type BenchmarkArtifactEnvironment, type BenchmarkArtifactPerTaskScore, type BenchmarkArtifactSystem, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type 
BenchmarkReproManifest, type BenchmarkReproManifestDataset, type BenchmarkReproManifestFile, type BenchmarkReproManifestResult, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkArtifactInput, type BuildBenchmarkPublishFeedOptions, type BuildBenchmarkReproManifestOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type CodexCliProviderConfig, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ABLATION_BOOTSTRAP_SEED, DEFAULT_ASSISTANT_RUBRIC_ID, DEFAULT_BASELINE_SCENARIOS, type DatasetSource, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type ExtractionAttackOptions, type ExtractionAttackResult, type ExtractionAttackTarget, type FixtureGenerator, type FixtureOutput, type FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, type HarnessRng, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, LOCOMO_DATASET_FILENAMES, LONG_MEM_EVAL_DATASET_FILENAMES, type LeaderboardArtifactWrite, type LlmJudge, type LlmProvider, type LoadDatasetOptions, type LoadSealedQrelsOptions, type LoadedDataset, type LocalLlmProviderConfig, MEMORY_EVAL_DIMENSIONS, MEMORY_EVAL_PUBLIC_LINE, MITIGATED_BASELINE_SCENARIOS, type MemoryEvalCategory, type MemoryEvalDimension, type MemoryEvalDimensionId, type MemoryEvalMetric, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type 
MitigatedBaselineConfig, type MitigatedTargetConfig, type MultipleChoiceQuestion, OTHER_NAMESPACE_MEMORIES, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, PROCEDURAL_REAL_SCENARIOS, PROCEDURAL_REAL_SCENARIOS_SMOKE, PUBLISHED_BENCHMARK_ARTIFACT_IDS, type PersonalizationRetrievalCase, type ProceduralAblationArtifact, type ProceduralAblationPerCase, type ProceduralAblationScenario, type ProceduralRealScenario, type ProceduralRealScenarioCategory, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, type PublishedBenchmarkId, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RecoveredMemory, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, type RunProceduralAblationCliArgs, type RunProceduralAblationOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, SYNTHETIC_MEMORIES, type SanitizedDiagnosticProvider, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededMemory, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type SyntheticEmailIngestionAdapterOptions, type SyntheticTargetOptions, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TimelineEntry, type TokenUsage, type WriteBenchmarkArtifactResult, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, 
assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildAmaBenchDiagnosticMatrixArtifact, buildAmaBenchDiagnosticVariantSummary, buildAmaBenchLeaderboardRows, buildBenchmarkArtifact, buildBenchmarkArtifactFilename, buildBenchmarkPublishFeed, buildBenchmarkReproManifest, buildBenchmarkRunSeeds, buildJudgePayload, buildOracleTrajectoryRecall, buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createSeededRng as createAdamSeededRng, createAmaBenchDiagnosticAdapter, createAnthropicProvider, createCanaryAdapter, createCodexCliProvider, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createLocalLlmProvider, createMitigatedTarget, createOllamaProvider, createOpenAiCompatibleProvider, createSeededRandom as createProceduralAblationSeededRandom, createProvider, createProviderBackedAmaBenchRecommendedJudge, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng$1 as createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, createSyntheticEmailIngestionAdapter, createSyntheticTarget, createTimeoutGuardedAdapter, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, extractMarkdownSectionsByTitle, f1Score, fixtureToAblationScenarios, formatMissingDatasetError, generateReport, getBenchmark, getBenchmarkLowerIsBetter, getMemoryEvalDimension, getRemnicVersion, hashBenchmarkArtifact, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isAmaBenchUnknownLikeAnswer, 
isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, listMemoryEvalBenchmarkIds, listMemoryEvalDimensions, llmJudgeScore, llmJudgeScoreDetailed, loadAblationFixture, loadBaseline, loadBeamDatasetPreview, loadBenchmarkArtifact, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadLoCoMo10, loadLongMemEvalS, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseBenchmarkArtifact, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, redactBenchmarkResultSecrets, renderBaselineMarkdown, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkPhaseTimeoutMs, resolveBenchmarkProgressLogging, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, runAssistantSynthesisBenchmark, runBaseline, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runExtractionAttack, runMitigatedBaseline, runProceduralAblation, runProceduralAblationCli, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectAmaBenchDiagnosticVariants, selectFixtureVariant, serializeBenchmarkArtifact, serializeJsonl, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkArtifact, writeBenchmarkPublishFeed, writeBenchmarkReproManifest, writeBenchmarkResult, writeLeaderboardArtifactsForResult, zeroScores };
|