@remnic/bench 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/LICENSE +21 -0
  2. package/dist/index.d.ts +1757 -0
  3. package/dist/index.js +13468 -0
  4. package/package.json +46 -0
@@ -0,0 +1,1757 @@
1
+ import { GatewayConfig, EngramAccessService } from '@remnic/core';
2
+
3
+ /**
4
+ * Types for the ingestion benchmark tier.
5
+ */
6
+ type GoldEntityType = "person" | "org" | "project" | "topic" | "event" | "location";
7
+ /** Entity record of a gold graph; instances are listed in `GoldGraph.entities`. */ interface GoldEntity {
8
+ id: string;
9
+ name: string;
10
+ /** One of the `GoldEntityType` string literals. */ type: GoldEntityType;
11
+ /** Optional list of alternative names. NOTE(review): presumably consulted when matching against `ExtractedEntity.name` — confirm against the scorer. */ aliases?: string[];
12
+ }
13
+ /** Link record of a gold graph (listed in `GoldGraph.links`); carries `source`, `target`, and a `relation` label. */ interface GoldLink {
14
+ /** NOTE(review): the declaration only says `string`; presumably a `GoldEntity.id` or name — confirm. */ source: string;
15
+ target: string;
16
+ relation: string;
17
+ /** Mandatory flag (unlike `ExtractedLink`, which has no such member). NOTE(review): presumably means the reverse direction also counts — confirm. */ bidirectional: boolean;
18
+ }
19
+ /** Page expectation record of a gold graph (listed in `GoldGraph.pages`). */ interface GoldPage {
20
+ title: string;
21
+ /** NOTE(review): presumably frontmatter keys (compare `REQUIRED_FRONTMATTER_FIELDS` and `ExtractedPage.frontmatter`) — confirm. */ requiredFields: string[];
22
+ /** NOTE(review): presumably compared with `ExtractedPage.hasTimeline` — confirm. */ expectTimeline: boolean;
23
+ /** NOTE(review): presumably compared with `ExtractedPage.hasExecSummary` — confirm. */ expectExecSummary: boolean;
24
+ /** NOTE(review): presumably compared with `ExtractedPage.seeAlso` — confirm. */ expectSeeAlso: string[];
25
+ }
26
+ /** Bundle of the three gold collections (entities, links, pages). Has the same three member names as `MemoryGraph`, which holds the extracted counterparts; also used as `FixtureOutput.goldGraph`. */ interface GoldGraph {
27
+ entities: GoldEntity[];
28
+ links: GoldLink[];
29
+ pages: GoldPage[];
30
+ }
31
+ interface ExtractedEntity {
32
+ name: string;
33
+ type: string;
34
+ sourceFile: string;
35
+ }
36
+ interface ExtractedLink {
37
+ source: string;
38
+ target: string;
39
+ relation: string;
40
+ }
41
+ interface ExtractedPage {
42
+ path: string;
43
+ title: string;
44
+ frontmatter: Record<string, unknown>;
45
+ hasExecSummary: boolean;
46
+ hasTimeline: boolean;
47
+ seeAlso: string[];
48
+ content: string;
49
+ }
50
+ /** Bundle of extracted entities, links, and pages; this is the resolved value of `IngestionBenchAdapter.getMemoryGraph()`. */ interface MemoryGraph {
51
+ entities: ExtractedEntity[];
52
+ links: ExtractedLink[];
53
+ pages: ExtractedPage[];
54
+ }
55
+ /** Resolved value of `IngestionBenchAdapter.ingest()`: three string lists plus a duration. */ interface IngestionLog {
56
+ commandsIssued: string[];
57
+ promptsShown: string[];
58
+ errors: string[];
59
+ /** NOTE(review): unit is presumably milliseconds, per the `Ms` suffix — confirm. */ durationMs: number;
60
+ }
61
+ /** Adapter contract for the ingestion tier; every method returns a promise. Accepted as the optional `RunBenchmarkOptions.ingestionAdapter`. */ interface IngestionBenchAdapter {
62
+ /** Takes an input directory path and resolves with an `IngestionLog`. */ ingest(inputDir: string): Promise<IngestionLog>;
63
+ /** Resolves with a `MemoryGraph`. NOTE(review): presumably reflects the state produced by a prior `ingest` call — confirm. */ getMemoryGraph(): Promise<MemoryGraph>;
64
+ reset(): Promise<void>;
65
+ destroy(): Promise<void>;
66
+ }
67
+ declare const REQUIRED_FRONTMATTER_FIELDS: readonly ["title", "type", "state", "created", "see-also"];
68
+
69
+ /**
70
+ * Shared adapter contract for benchmarks running against Remnic memory systems.
71
+ */
72
+ interface Message {
73
+ role: "user" | "assistant" | "system";
74
+ content: string;
75
+ }
76
+ interface SearchResult {
77
+ turnIndex: number;
78
+ role: string;
79
+ snippet: string;
80
+ sessionId: string;
81
+ score?: number;
82
+ }
83
+ interface MemoryStats {
84
+ totalMessages: number;
85
+ totalSummaryNodes: number;
86
+ maxDepth: number;
87
+ }
88
+ interface BenchResponse {
89
+ text: string;
90
+ tokens: {
91
+ input: number;
92
+ output: number;
93
+ };
94
+ latencyMs: number;
95
+ model: string;
96
+ }
97
+ interface BenchResponder {
98
+ respond(question: string, recalledText: string): Promise<BenchResponse>;
99
+ }
100
+ interface BenchJudgeResult {
101
+ score: number;
102
+ tokens: {
103
+ input: number;
104
+ output: number;
105
+ };
106
+ latencyMs: number;
107
+ model?: string;
108
+ }
109
+ /** Judge contract. `score` is mandatory and resolves to a bare number; `scoreWithMetrics` is optional and resolves to a `BenchJudgeResult` (score plus token counts, latency, and an optional model name). */ interface BenchJudge {
110
+ score(question: string, predicted: string, expected: string): Promise<number>;
111
+ /** Optional: callers must check for its presence before invoking it. */ scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
112
+ }
113
+ /** Adapter contract for a memory system under benchmark; passed as the mandatory `RunBenchmarkOptions.system`. All methods return promises. */ interface BenchMemoryAdapter {
114
+ /** Stores a list of `Message` objects under a session id. */ store(sessionId: string, messages: Message[]): Promise<void>;
115
+ /** Resolves with a single string. `budgetChars` is optional; NOTE(review): presumably a character cap on the returned text — confirm. */ recall(sessionId: string, query: string, budgetChars?: number): Promise<string>;
116
+ /** Resolves with `SearchResult` entries. `sessionId` is optional; NOTE(review): behavior when omitted is not visible here. */ search(query: string, limit: number, sessionId?: string): Promise<SearchResult[]>;
117
+ reset(sessionId?: string): Promise<void>;
118
+ getStats(sessionId?: string): Promise<MemoryStats>;
119
+ destroy(): Promise<void>;
120
+ /** Optional responder attached to the adapter. */ responder?: BenchResponder;
121
+ /** Optional judge attached to the adapter. */ judge?: BenchJudge;
122
+ }
123
+ /** Plain alias of `BenchJudge`. NOTE(review): presumably kept as an alternative/legacy name — confirm. */ type LlmJudge = BenchJudge;
124
+ /** Plain alias of `BenchMemoryAdapter`. NOTE(review): presumably kept as an alternative/legacy name — confirm. */ type MemorySystem = BenchMemoryAdapter;
125
+
126
+ /**
127
+ * Integrity-facing additions to `BenchmarkResult.meta`.
128
+ *
129
+ * These fields are required for every published result and checked by the
130
+ * publishing pipeline. See `docs/bench/integrity.md` for the rotation policy.
131
+ */
132
+ declare const BENCHMARK_SPLIT_TYPES: readonly ["public", "holdout"];
133
+ type BenchmarkSplitType = (typeof BENCHMARK_SPLIT_TYPES)[number];
134
+ /** Integrity fields: `splitType` and the three hashes are mandatory here, `canaryScore` is optional. The same five keys are declared as optional members of `BenchmarkResult.meta`. */ interface BenchmarkIntegrityMeta {
135
+ /**
136
+ * Which dataset split produced this result. Public leaderboard scores
137
+ * only accept `holdout` results; `public` results are for self-reporting
138
+ * and iteration.
139
+ */
140
+ splitType: BenchmarkSplitType;
141
+ /** SHA-256 of the sealed qrels artifact used by the judge. */
142
+ qrelsSealedHash: string;
143
+ /** SHA-256 of the rendered judge prompt (post-template expansion). */
144
+ judgePromptHash: string;
145
+ /** SHA-256 of the dataset payload as served to the runner. */
146
+ datasetHash: string;
147
+ /**
148
+ * Score that the canary adapter achieved on the same benchmark during the
149
+ * audit run that produced this result. Must stay below the benchmark's floor.
150
+ * Omitted only during the canary's own run.
151
+ */
152
+ canaryScore?: number;
153
+ }
154
+ declare const INTEGRITY_META_FIELDS: readonly ["splitType", "qrelsSealedHash", "judgePromptHash", "datasetHash"];
155
+ declare const BENCHMARK_INTEGRITY_META_SCHEMA: {
156
+ readonly type: "object";
157
+ readonly required: readonly ["splitType", "qrelsSealedHash", "judgePromptHash", "datasetHash"];
158
+ readonly properties: {
159
+ readonly splitType: {
160
+ readonly type: "string";
161
+ readonly enum: readonly ["public", "holdout"];
162
+ };
163
+ readonly qrelsSealedHash: {
164
+ readonly type: "string";
165
+ readonly pattern: "^[0-9a-f]{64}$";
166
+ };
167
+ readonly judgePromptHash: {
168
+ readonly type: "string";
169
+ readonly pattern: "^[0-9a-f]{64}$";
170
+ };
171
+ readonly datasetHash: {
172
+ readonly type: "string";
173
+ readonly pattern: "^[0-9a-f]{64}$";
174
+ };
175
+ readonly canaryScore: {
176
+ readonly type: "number";
177
+ };
178
+ };
179
+ };
180
+ /** Type guard: a `true` result narrows `value` to `BenchmarkIntegrityMeta`. NOTE(review): presumably checks the keys in `INTEGRITY_META_FIELDS`; the implementation is not visible in this declaration file. */ declare function integrityMetaIsComplete(value: unknown): value is BenchmarkIntegrityMeta;
181
+ /**
182
+ * Throws a descriptive error listing every missing or malformed integrity
183
+ * field; on normal return, `value` is narrowed to `BenchmarkIntegrityMeta`. Used by the publishing pipeline.
184
+ */
185
+ declare function assertIntegrityMetaPresent(value: unknown): asserts value is BenchmarkIntegrityMeta;
186
+
187
+ type BenchmarkMode = "full" | "quick";
188
+ type BenchmarkTier = "published" | "remnic" | "custom";
189
+ type BenchmarkStatus = "ready" | "planned";
190
+ type BenchmarkCategory = "agentic" | "retrieval" | "conversational" | "ingestion";
191
+ type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain";
192
+ type BuiltInProvider = "openai" | "anthropic" | "ollama" | "litellm";
193
+ interface ProviderConfig {
194
+ provider: BuiltInProvider;
195
+ model: string;
196
+ baseUrl?: string;
197
+ }
198
+ interface TaskTokenUsage {
199
+ input: number;
200
+ output: number;
201
+ }
202
+ interface TaskResult {
203
+ taskId: string;
204
+ question: string;
205
+ expected: string;
206
+ actual: string;
207
+ scores: Record<string, number>;
208
+ latencyMs: number;
209
+ tokens: TaskTokenUsage;
210
+ details?: Record<string, unknown>;
211
+ }
212
+ interface MetricAggregate {
213
+ mean: number;
214
+ median: number;
215
+ stdDev: number;
216
+ min: number;
217
+ max: number;
218
+ }
219
+ type AggregateMetrics = Record<string, MetricAggregate>;
220
+ interface ConfidenceInterval {
221
+ lower: number;
222
+ upper: number;
223
+ level: number;
224
+ }
225
+ type EffectSizeInterpretation = "negligible" | "small" | "medium" | "large";
226
+ interface EffectSizeSummary {
227
+ cohensD: number;
228
+ interpretation: EffectSizeInterpretation;
229
+ }
230
+ interface ComparisonMetricDelta {
231
+ baseline: number;
232
+ candidate: number;
233
+ delta: number;
234
+ percentChange: number;
235
+ effectSize: EffectSizeSummary;
236
+ ciOnDelta?: ConfidenceInterval;
237
+ }
238
+ interface ComparisonResult {
239
+ benchmark: string;
240
+ metricDeltas: Record<string, ComparisonMetricDelta>;
241
+ verdict: "pass" | "regression" | "improvement";
242
+ }
243
+ interface StatisticalReport {
244
+ confidenceIntervals: Record<string, ConfidenceInterval>;
245
+ bootstrapSamples: number;
246
+ effectSizes?: Record<string, EffectSizeSummary>;
247
+ pairedComparison?: {
248
+ baselineId: string;
249
+ pValue: number;
250
+ ciOnDelta: ConfidenceInterval;
251
+ };
252
+ }
253
+ interface BenchmarkResult {
254
+ meta: {
255
+ id: string;
256
+ benchmark: string;
257
+ benchmarkTier: BenchmarkTier;
258
+ version: string;
259
+ remnicVersion: string;
260
+ gitSha: string;
261
+ timestamp: string;
262
+ mode: BenchmarkMode;
263
+ runCount: number;
264
+ seeds: number[];
265
+ /**
266
+ * Which dataset split produced this result. Public leaderboard scores
267
+ * only accept `holdout`; `public` is for self-reporting and iteration.
268
+ */
269
+ splitType?: BenchmarkSplitType;
270
+ /** SHA-256 of the sealed qrels artifact used by the judge. */
271
+ qrelsSealedHash?: string;
272
+ /** SHA-256 of the rendered judge prompt (post-template expansion). */
273
+ judgePromptHash?: string;
274
+ /** SHA-256 of the dataset payload as served to the runner. */
275
+ datasetHash?: string;
276
+ /**
277
+ * Canary-adapter score from the audit run that produced this result.
278
+ * Must stay below the benchmark's canary floor.
279
+ */
280
+ canaryScore?: number;
281
+ };
282
+ config: {
283
+ runtimeProfile?: BenchRuntimeProfile | null;
284
+ systemProvider: ProviderConfig | null;
285
+ judgeProvider: ProviderConfig | null;
286
+ adapterMode: string;
287
+ remnicConfig: Record<string, unknown>;
288
+ };
289
+ cost: {
290
+ totalTokens: number;
291
+ inputTokens: number;
292
+ outputTokens: number;
293
+ estimatedCostUsd: number;
294
+ totalLatencyMs: number;
295
+ meanQueryLatencyMs: number;
296
+ };
297
+ results: {
298
+ tasks: TaskResult[];
299
+ aggregates: AggregateMetrics;
300
+ statistics?: StatisticalReport;
301
+ };
302
+ environment: {
303
+ os: string;
304
+ nodeVersion: string;
305
+ hardware?: string;
306
+ };
307
+ }
308
+ /** Descriptive metadata of a benchmark; used as `BenchmarkDefinition.meta`. Only `citation` and `integrity` are optional. */ interface BenchmarkMeta {
309
+ name: string;
310
+ version: string;
311
+ description: string;
312
+ /** One of the `BenchmarkCategory` string literals. */ category: BenchmarkCategory;
313
+ citation?: string;
314
+ /**
315
+ * Optional integrity metadata declared on the benchmark itself (as opposed
316
+ * to each result). When set, the publishing pipeline pins result-time
317
+ * integrity hashes against these values.
318
+ */
319
+ integrity?: BenchmarkIntegrityMeta;
320
+ }
321
+
322
+ interface BenchmarkDefinition {
323
+ id: string;
324
+ title: string;
325
+ tier: BenchmarkTier;
326
+ status: BenchmarkStatus;
327
+ runnerAvailable: boolean;
328
+ meta: BenchmarkMeta;
329
+ }
330
+ interface RunBenchmarkOptions {
331
+ mode?: BenchmarkMode;
332
+ datasetDir?: string;
333
+ outputDir?: string;
334
+ limit?: number;
335
+ seed?: number;
336
+ adapterMode?: string;
337
+ runtimeProfile?: BenchRuntimeProfile | null;
338
+ system: BenchMemoryAdapter;
339
+ ingestionAdapter?: IngestionBenchAdapter;
340
+ systemProvider?: ProviderConfig | null;
341
+ judgeProvider?: ProviderConfig | null;
342
+ remnicConfig?: Record<string, unknown>;
343
+ }
344
+ /** `RunBenchmarkOptions` with `mode` made mandatory (it is optional on the base interface) and a `benchmark` definition added. */ interface ResolvedRunBenchmarkOptions extends RunBenchmarkOptions {
345
+ mode: BenchmarkMode;
346
+ benchmark: BenchmarkDefinition;
347
+ }
348
+ type BenchTier = "exact_match" | "category_match" | "keyword_overlap" | "high_confidence" | "semantic_search" | "full_search" | "no_results";
349
+ interface TierDetail {
350
+ tier: BenchTier;
351
+ latencyMs: number;
352
+ resultsCount: number;
353
+ }
354
+ interface ExplainResult {
355
+ query: string;
356
+ tiersUsed: BenchTier[];
357
+ tierResults: TierDetail[];
358
+ durationMs: number;
359
+ totalDurationMs: number;
360
+ }
361
+ interface RecallMetrics {
362
+ query: string;
363
+ latencyMs: number;
364
+ tiersUsed: BenchTier[];
365
+ throughput: number;
366
+ resultsCount: number;
367
+ totalDurationMs: number;
368
+ tierDetails: TierDetail[];
369
+ }
370
+ interface BenchmarkReport {
371
+ timestamp: string;
372
+ queries: Array<{
373
+ query: string;
374
+ tiersUsed: BenchTier[];
375
+ durationMs: number;
376
+ resultsCount: number;
377
+ throughput: number;
378
+ tierDetails: TierDetail[];
379
+ }>;
380
+ totalDurationMs: number;
381
+ }
382
+ interface BenchmarkSuiteResult {
383
+ results: RecallMetrics[];
384
+ report: BenchmarkReport;
385
+ totalDurationMs: number;
386
+ regressions: RegressionDetail[];
387
+ }
388
+ interface SavedBaseline {
389
+ version: number;
390
+ timestamp: string;
391
+ metrics: Record<string, number>;
392
+ }
393
+ /** Return type of `checkRegression`: an overall `passed` flag plus per-metric `RegressionDetail` entries. */ interface RegressionGateResult {
394
+ passed: boolean;
395
+ regressions: RegressionDetail[];
396
+ }
397
+ /** Per-metric comparison entry; appears in `RegressionGateResult.regressions` and `BenchmarkSuiteResult.regressions`. */ interface RegressionDetail {
398
+ metric: string;
399
+ currentValue: number;
400
+ baselineValue: number;
401
+ /** NOTE(review): whether this is an absolute or relative tolerance is not visible here — confirm against `checkRegression`. */ tolerance: number;
402
+ /** Each entry carries its own pass flag, so the list can presumably include passing metrics — confirm. */ passed: boolean;
403
+ }
404
+ interface BenchConfig {
405
+ queries?: string[];
406
+ iterations?: number;
407
+ regressionTolerance?: number;
408
+ baselinePath?: string;
409
+ reportPath?: string;
410
+ seed?: number;
411
+ explain?: boolean;
412
+ }
413
+
414
+ /**
415
+ * Custom benchmark schema types.
416
+ */
417
+
418
+ type CustomBenchmarkScoring = "exact_match" | "f1" | "rouge_l" | "llm_judge";
419
+ interface CustomBenchmarkTask {
420
+ question: string;
421
+ expected: string;
422
+ tags?: string[];
423
+ }
424
+ interface CustomBenchmarkSpec {
425
+ name: string;
426
+ description?: string;
427
+ version?: string;
428
+ category?: BenchmarkCategory;
429
+ citation?: string;
430
+ scoring: CustomBenchmarkScoring;
431
+ tasks: CustomBenchmarkTask[];
432
+ }
433
+
434
+ /**
435
+ * Shared types for inbox fixture generators.
436
+ */
437
+
438
+ interface GeneratedFile {
439
+ relativePath: string;
440
+ content: string;
441
+ }
442
+ interface FixtureOutput {
443
+ id: string;
444
+ description: string;
445
+ files: GeneratedFile[];
446
+ goldGraph: GoldGraph;
447
+ }
448
+ interface FixtureGenerator {
449
+ id: string;
450
+ description: string;
451
+ generate(): FixtureOutput;
452
+ }
453
+
454
+ /**
455
+ * Package-owned Remnic adapters used by the phase-1 benchmark CLI surface.
456
+ */
457
+
458
+ interface RemnicAdapterOptions {
459
+ configOverrides?: Record<string, unknown>;
460
+ preserveRuntimeDefaults?: boolean;
461
+ responder?: BenchResponder;
462
+ judge?: BenchJudge;
463
+ }
464
+ declare const createLightweightAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
465
+ declare const createRemnicAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
466
+
467
+ /**
468
+ * Minimal LLM provider contract for the bench engine.
469
+ */
470
+
471
+ interface CompletionOpts {
472
+ systemPrompt?: string;
473
+ temperature?: number;
474
+ maxTokens?: number;
475
+ headers?: Record<string, string>;
476
+ }
477
+ interface CompletionResult {
478
+ text: string;
479
+ tokens: {
480
+ input: number;
481
+ output: number;
482
+ };
483
+ latencyMs: number;
484
+ model: string;
485
+ }
486
+ interface DiscoveredModel {
487
+ id: string;
488
+ name: string;
489
+ contextLength: number;
490
+ capabilities: ("completion" | "embedding" | "vision")[];
491
+ quantization?: string;
492
+ parameterCount?: string;
493
+ }
494
+ interface ProviderBaseConfig {
495
+ model: string;
496
+ baseUrl?: string;
497
+ apiKey?: string;
498
+ headers?: Record<string, string>;
499
+ }
500
+ interface OpenAiCompatibleProviderConfig extends ProviderBaseConfig {
501
+ provider?: "openai" | "litellm";
502
+ }
503
+ interface AnthropicProviderConfig extends ProviderBaseConfig {
504
+ provider?: "anthropic";
505
+ anthropicVersion?: string;
506
+ }
507
+ interface OllamaProviderConfig extends ProviderBaseConfig {
508
+ provider?: "ollama";
509
+ }
510
+ /** Parameter type of `createProvider`. A union discriminated on `provider`: each member is one of the provider config interfaces intersected with a mandatory `provider` literal (the interfaces themselves declare `provider` as optional). */ type ProviderFactoryConfig = (OpenAiCompatibleProviderConfig & {
511
+ provider: "openai" | "litellm";
512
+ }) | (AnthropicProviderConfig & {
513
+ provider: "anthropic";
514
+ }) | (OllamaProviderConfig & {
515
+ provider: "ollama";
516
+ });
517
+ interface ProviderDiscoveryResult {
518
+ provider: BuiltInProvider;
519
+ models: DiscoveredModel[];
520
+ }
521
+ interface TokenUsage {
522
+ inputTokens: number;
523
+ outputTokens: number;
524
+ totalTokens: number;
525
+ }
526
+ /** Provider contract returned by the `create*Provider` factories. `complete`, `getUsage`, and `resetUsage` are mandatory; `embed` and `discover` are optional. */ interface LlmProvider {
527
+ id: string;
528
+ name: string;
529
+ /** One of the `BuiltInProvider` string literals. */ provider: BuiltInProvider;
530
+ /** Takes a prompt and optional `CompletionOpts`; resolves with a `CompletionResult`. */ complete(prompt: string, opts?: CompletionOpts): Promise<CompletionResult>;
531
+ /** Optional. Resolves with one `number[]` per entry of the result; NOTE(review): presumably one vector per input text, in order — confirm. */ embed?(texts: string[]): Promise<number[][]>;
532
+ /** Optional. Resolves with a list of `DiscoveredModel`. */ discover?(): Promise<DiscoveredModel[]>;
533
+ /** Synchronous. NOTE(review): presumably cumulative token usage since the last `resetUsage` — confirm. */ getUsage(): TokenUsage;
534
+ resetUsage(): void;
535
+ }
536
+
537
+ /**
538
+ * JSON schema contract for BenchmarkResult payloads.
539
+ */
540
+ declare const BENCHMARK_RESULT_SCHEMA: {
541
+ readonly type: "object";
542
+ readonly required: readonly ["meta", "config", "cost", "results", "environment"];
543
+ readonly properties: {
544
+ readonly meta: {
545
+ readonly type: "object";
546
+ readonly required: readonly ["id", "benchmark", "benchmarkTier", "version", "remnicVersion", "gitSha", "timestamp", "mode", "runCount", "seeds"];
547
+ readonly properties: {
548
+ readonly id: {
549
+ readonly type: "string";
550
+ };
551
+ readonly benchmark: {
552
+ readonly type: "string";
553
+ };
554
+ readonly benchmarkTier: {
555
+ readonly type: "string";
556
+ readonly enum: readonly ["published", "remnic", "custom"];
557
+ };
558
+ readonly version: {
559
+ readonly type: "string";
560
+ };
561
+ readonly remnicVersion: {
562
+ readonly type: "string";
563
+ };
564
+ readonly gitSha: {
565
+ readonly type: "string";
566
+ };
567
+ readonly timestamp: {
568
+ readonly type: "string";
569
+ };
570
+ readonly mode: {
571
+ readonly type: "string";
572
+ readonly enum: readonly ["full", "quick"];
573
+ };
574
+ readonly runCount: {
575
+ readonly type: "number";
576
+ };
577
+ readonly seeds: {
578
+ readonly type: "array";
579
+ readonly items: {
580
+ readonly type: "number";
581
+ };
582
+ };
583
+ readonly splitType: {
584
+ readonly type: "string";
585
+ readonly enum: readonly ["public", "holdout"];
586
+ };
587
+ readonly qrelsSealedHash: {
588
+ readonly type: "string";
589
+ readonly pattern: "^[0-9a-f]{64}$";
590
+ };
591
+ readonly judgePromptHash: {
592
+ readonly type: "string";
593
+ readonly pattern: "^[0-9a-f]{64}$";
594
+ };
595
+ readonly datasetHash: {
596
+ readonly type: "string";
597
+ readonly pattern: "^[0-9a-f]{64}$";
598
+ };
599
+ readonly canaryScore: {
600
+ readonly type: "number";
601
+ };
602
+ };
603
+ };
604
+ readonly config: {
605
+ readonly type: "object";
606
+ readonly required: readonly ["systemProvider", "judgeProvider", "adapterMode", "remnicConfig"];
607
+ readonly properties: {
608
+ readonly runtimeProfile: {
609
+ readonly anyOf: readonly [{
610
+ readonly type: "null";
611
+ }, {
612
+ readonly type: "string";
613
+ readonly enum: readonly ["baseline", "real", "openclaw-chain"];
614
+ }];
615
+ };
616
+ readonly systemProvider: {
617
+ readonly anyOf: readonly [{
618
+ readonly type: "null";
619
+ }, {
620
+ readonly type: "object";
621
+ readonly required: readonly ["provider", "model"];
622
+ readonly properties: {
623
+ readonly provider: {
624
+ readonly type: "string";
625
+ };
626
+ readonly model: {
627
+ readonly type: "string";
628
+ };
629
+ readonly baseUrl: {
630
+ readonly type: "string";
631
+ };
632
+ };
633
+ }];
634
+ };
635
+ readonly judgeProvider: {
636
+ readonly anyOf: readonly [{
637
+ readonly type: "null";
638
+ }, {
639
+ readonly type: "object";
640
+ readonly required: readonly ["provider", "model"];
641
+ readonly properties: {
642
+ readonly provider: {
643
+ readonly type: "string";
644
+ };
645
+ readonly model: {
646
+ readonly type: "string";
647
+ };
648
+ readonly baseUrl: {
649
+ readonly type: "string";
650
+ };
651
+ };
652
+ }];
653
+ };
654
+ readonly adapterMode: {
655
+ readonly type: "string";
656
+ };
657
+ readonly remnicConfig: {
658
+ readonly type: "object";
659
+ };
660
+ };
661
+ };
662
+ readonly cost: {
663
+ readonly type: "object";
664
+ readonly required: readonly ["totalTokens", "inputTokens", "outputTokens", "estimatedCostUsd", "totalLatencyMs", "meanQueryLatencyMs"];
665
+ readonly properties: {
666
+ readonly totalTokens: {
667
+ readonly type: "number";
668
+ };
669
+ readonly inputTokens: {
670
+ readonly type: "number";
671
+ };
672
+ readonly outputTokens: {
673
+ readonly type: "number";
674
+ };
675
+ readonly estimatedCostUsd: {
676
+ readonly type: "number";
677
+ };
678
+ readonly totalLatencyMs: {
679
+ readonly type: "number";
680
+ };
681
+ readonly meanQueryLatencyMs: {
682
+ readonly type: "number";
683
+ };
684
+ };
685
+ };
686
+ readonly results: {
687
+ readonly type: "object";
688
+ readonly required: readonly ["tasks", "aggregates"];
689
+ readonly properties: {
690
+ readonly tasks: {
691
+ readonly type: "array";
692
+ readonly items: {
693
+ readonly type: "object";
694
+ readonly required: readonly ["taskId", "question", "expected", "actual", "scores", "latencyMs", "tokens"];
695
+ readonly properties: {
696
+ readonly taskId: {
697
+ readonly type: "string";
698
+ };
699
+ readonly question: {
700
+ readonly type: "string";
701
+ };
702
+ readonly expected: {
703
+ readonly type: "string";
704
+ };
705
+ readonly actual: {
706
+ readonly type: "string";
707
+ };
708
+ readonly scores: {
709
+ readonly type: "object";
710
+ };
711
+ readonly latencyMs: {
712
+ readonly type: "number";
713
+ };
714
+ readonly tokens: {
715
+ readonly type: "object";
716
+ readonly required: readonly ["input", "output"];
717
+ readonly properties: {
718
+ readonly input: {
719
+ readonly type: "number";
720
+ };
721
+ readonly output: {
722
+ readonly type: "number";
723
+ };
724
+ };
725
+ };
726
+ };
727
+ };
728
+ };
729
+ readonly aggregates: {
730
+ readonly type: "object";
731
+ };
732
+ readonly statistics: {
733
+ readonly type: "object";
734
+ };
735
+ };
736
+ };
737
+ readonly environment: {
738
+ readonly type: "object";
739
+ readonly required: readonly ["os", "nodeVersion"];
740
+ readonly properties: {
741
+ readonly os: {
742
+ readonly type: "string";
743
+ };
744
+ readonly nodeVersion: {
745
+ readonly type: "string";
746
+ };
747
+ readonly hardware: {
748
+ readonly type: "string";
749
+ };
750
+ };
751
+ };
752
+ };
753
+ };
754
+
755
+ declare function createAnthropicProvider(config: AnthropicProviderConfig): LlmProvider;
756
+
757
+ declare function createProvider(config: ProviderFactoryConfig): LlmProvider;
758
+ declare function discoverAllProviders(): Promise<ProviderDiscoveryResult[]>;
759
+
760
+ interface BenchmarkAnswerResult {
761
+ finalAnswer: string;
762
+ recalledText: string;
763
+ answeredText: string;
764
+ latencyMs: number;
765
+ tokens: {
766
+ input: number;
767
+ output: number;
768
+ };
769
+ model?: string;
770
+ }
771
+ declare function answerBenchmarkQuestion(options: {
772
+ question: string;
773
+ recalledText: string;
774
+ responder?: BenchResponder;
775
+ }): Promise<BenchmarkAnswerResult>;
776
+
777
+ /**
778
+ * Sealed LLM-judge rubric loader, invocation, and score parser for the
779
+ * Assistant bench tier.
780
+ *
781
+ * Sealing contract:
782
+ * 1. The rubric prompt lives in the in-process registry
783
+ * (`sealed-prompts/index.ts`) and is never exposed to the
784
+ * system-under-test.
785
+ * 2. The rubric text's SHA-256 digest is embedded into every run result so
786
+ * any change to the prompt is detectable by consumers of the bench feed.
787
+ * 3. Rotations are additive — add a new registry entry and a matching
788
+ * `.md` mirror, do not edit old ones.
789
+ */
790
+ declare const ASSISTANT_RUBRIC_DIMENSIONS: readonly ["identity_accuracy", "stance_coherence", "novelty", "calibration"];
791
+ type AssistantRubricDimension = (typeof ASSISTANT_RUBRIC_DIMENSIONS)[number];
792
+ type AssistantRubricScores = Record<AssistantRubricDimension, number>;
793
+ /** Return type of `loadSealedRubric`; also the `rubric` argument of `buildJudgePayload` and `runSealedJudge`. */ interface SealedRubric {
794
+ id: string;
795
+ version: string;
796
+ /** The rubric text. */ prompt: string;
797
+ /** SHA-256 digest. NOTE(review): presumably computed over `prompt` and hex-encoded — confirm. */ sha256: string;
798
+ }
799
+ interface SealedJudgeInput {
800
+ taskId: string;
801
+ scenario: string;
802
+ memorySummary: string;
803
+ assistantOutput: string;
804
+ }
805
+ /** Resolved value of `runSealedJudge`; also the first argument of `SpotCheckLogger.log`. */ interface SealedJudgeDecision {
806
+ taskId: string;
807
+ /** NOTE(review): presumably copied from `SealedRubric.id` — confirm. */ rubricId: string;
808
+ /** NOTE(review): presumably copied from `SealedRubric.sha256` — confirm. */ rubricSha256: string;
809
+ /** One number per `AssistantRubricDimension`. */ scores: AssistantRubricScores;
810
+ notes: string;
811
+ rawResponse: string;
812
+ /** NOTE(review): presumably mirrors the `ok` flag returned by `parseRubricResponse` — confirm. */ parseOk: boolean;
813
+ }
814
+ /**
815
+ * Rich structured-judge contract for the Assistant tier. Unlike
816
+ * `BenchJudge.score()`, which returns a scalar, structured judges return the
817
+ * raw JSON response text so we can parse the full multi-dimension rubric.
818
+ */
819
+ interface StructuredJudge {
820
+ evaluate(request: {
821
+ system: string;
822
+ user: string;
823
+ rubricId: string;
824
+ taskId: string;
825
+ }): Promise<string>;
826
+ }
827
+ interface SpotCheckLogger {
828
+ log(decision: SealedJudgeDecision, context: SealedJudgeInput): void;
829
+ }
830
+ /**
831
+ * Load a sealed rubric prompt from the in-process registry by id.
832
+ *
833
+ * The returned object captures the canonical text and a SHA-256 digest which
834
+ * callers are expected to store in benchmark results so reviewers can verify
835
+ * the exact rubric text used for a given run.
836
+ */
837
+ declare function loadSealedRubric(id?: string, options?: {
838
+ registry?: Readonly<Record<string, string>>;
839
+ }): SealedRubric;
840
+ /**
841
+ * Verify that a registered rubric still matches an expected digest. Useful in
842
+ * tests and in CI gates that want to catch accidental edits to sealed text.
843
+ */
844
+ declare function verifyRubricDigest(expectedSha256: string, options?: {
845
+ id?: string;
846
+ registry?: Readonly<Record<string, string>>;
847
+ }): boolean;
848
+ /**
849
+ * Build the judge message payload for a single task. Keeps the rubric prompt
850
+ * on the system side of the conversation and the task-specific substitutions
851
+ * in a user message so the judge never leaks rubric text back into the SUT
852
+ * path.
853
+ */
854
+ declare function buildJudgePayload(rubric: SealedRubric, input: SealedJudgeInput): {
855
+ system: string;
856
+ user: string;
857
+ };
858
+ /**
859
+ * Invoke a structured judge with the sealed rubric and parse the response.
860
+ *
861
+ * When `judge` is `undefined` we return a parse_error decision with all-zero
862
+ * scores so the caller can still complete the benchmark with a visible signal
863
+ * that the judge was missing. NOTE(review): `SealedJudgeDecision` declares no status field, so "parse_error" presumably corresponds to `parseOk: false` — confirm.
864
+ */
865
+ declare function runSealedJudge(judge: StructuredJudge | undefined, rubric: SealedRubric, input: SealedJudgeInput, options?: {
866
+ spotCheckLogger?: SpotCheckLogger;
867
+ }): Promise<SealedJudgeDecision>;
868
+ /**
869
+ * Parse a judge response string as rubric JSON. Exported for unit tests and
870
+ * for judge adapters that return the raw response directly.
871
+ */
872
+ declare function parseRubricResponse(raw: string): {
873
+ scores: AssistantRubricScores;
874
+ notes: string;
875
+ ok: boolean;
876
+ };
877
+ /**
878
+ * Spot-check logger that appends selected judge decisions to a JSONL file.
879
+ * The caller controls the `runId` to keep logs grouped per-run.
880
+ *
881
+ * Logging is a diagnostic side effect, so any filesystem error (non-writable
882
+ * directory, path conflict with an existing file, mid-run ENOSPC, etc.) is
883
+ * caught and downgraded to a one-time `console.warn` rather than aborting
884
+ * the benchmark run. We also fail-safe the `mkdirSync` at construction
885
+ * time: if the directory cannot be created, we return a no-op logger so
886
+ * callers can still run the benchmark end-to-end.
887
+ */
888
+ declare function createSpotCheckFileLogger(options: {
889
+ runId: string;
890
+ directory: string;
891
+ sampleRate?: number;
892
+ random?: () => number;
893
+ sampleSize?: number;
894
+ }): SpotCheckLogger;
895
+ /**
896
+ * Create a deterministic spot-check logger useful in tests: always picks the
897
+ * first `sampleSize` decisions regardless of random draw.
898
+ */
899
+ declare function createDeterministicSpotCheckLogger(options: {
900
+ runId: string;
901
+ directory: string;
902
+ sampleSize?: number;
903
+ }): SpotCheckLogger;
904
+ declare function zeroScores(): AssistantRubricScores;
905
+ declare function clampScore(value: number): number;
906
+
907
+ interface GatewayResponderOptions {
908
+ gatewayConfig?: GatewayConfig;
909
+ agentId?: string;
910
+ }
911
+ declare function createResponderFromProvider(provider: LlmProvider): BenchResponder;
912
+ declare function createProviderBackedResponder(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchResponder;
913
+ declare function createProviderBackedJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchJudge;
914
+ declare function createStructuredJudgeFromProvider(provider: LlmProvider): StructuredJudge;
915
+ declare function createProviderBackedStructuredJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): StructuredJudge;
916
+ declare function createGatewayResponder(options: GatewayResponderOptions): BenchResponder;
917
+
918
+ declare function createLiteLlmProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
919
+
920
+ declare function createOllamaProvider(config: OllamaProviderConfig): LlmProvider;
921
+
922
+ /**
923
+ * Minimal OpenAI-compatible provider for phase 1 bench execution.
924
+ */
925
+
926
+ declare function createOpenAiCompatibleProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
927
+
928
+ type BenchModelSource = "plugin" | "gateway";
929
+ interface ResolveBenchRuntimeProfileOptions {
930
+ runtimeProfile?: BenchRuntimeProfile;
931
+ remnicConfigPath?: string;
932
+ openclawConfigPath?: string;
933
+ modelSource?: BenchModelSource;
934
+ gatewayAgentId?: string;
935
+ fastGatewayAgentId?: string;
936
+ systemProvider?: BuiltInProvider;
937
+ systemModel?: string;
938
+ systemBaseUrl?: string;
939
+ judgeProvider?: BuiltInProvider;
940
+ judgeModel?: string;
941
+ judgeBaseUrl?: string;
942
+ }
943
+ interface ResolvedBenchRuntimeProfile {
944
+ profile: BenchRuntimeProfile;
945
+ remnicConfig: Record<string, unknown>;
946
+ effectiveRemnicConfig: Record<string, unknown>;
947
+ adapterOptions: {
948
+ configOverrides: Record<string, unknown>;
949
+ preserveRuntimeDefaults?: boolean;
950
+ responder?: BenchResponder;
951
+ judge?: BenchJudge;
952
+ };
953
+ systemProvider: ProviderConfig | null;
954
+ judgeProvider: ProviderConfig | null;
955
+ }
956
+ declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
957
+
958
+ /**
959
+ * Published benchmark registry for @remnic/bench phase 1.
960
+ */
961
+
962
+ declare function listBenchmarks(): BenchmarkDefinition[];
963
+ declare function getBenchmark(id: string): BenchmarkDefinition | undefined;
964
+
965
+ /**
966
+ * Result enrichment and JSON writing helpers.
967
+ */
968
+
969
+ declare function writeBenchmarkResult(result: BenchmarkResult, outputDir: string): Promise<string>;
970
+
971
+ /**
972
+ * Seed-sequence generation for benchmark runs.
973
+ *
974
+ * Factored out of `benchmark.ts` so individual runners can reuse it without
975
+ * triggering a circular import through `benchmark.ts -> registry.ts ->
976
+ * runner.ts -> benchmark.ts`. Returns a `number[]`; `baseSeed` is optional. NOTE(review): presumably yields `runCount` seeds — confirm.
977
+ */
978
+ declare function buildBenchmarkRunSeeds(runCount: number, baseSeed?: number): number[];
979
+
980
+ /**
981
+ * Public benchmark execution helpers.
982
+ */
983
+
984
+ declare function resolveBenchmarkRunCount(mode: BenchmarkMode, requestedIterations?: number): number;
985
+
986
+ declare function orchestrateBenchmarkRuns<T>(mode: BenchmarkMode, executeRun: (seed: number, runIndex: number) => Promise<T>, requestedIterations?: number, baseSeed?: number): Promise<{
987
+ runCount: number;
988
+ seeds: number[];
989
+ runs: T[];
990
+ }>;
991
+ declare function runBenchmark(benchmarkId: string, options: RunBenchmarkOptions): Promise<BenchmarkResult>;
992
+ declare function loadBaseline(baselinePath?: string): SavedBaseline | undefined;
993
+ declare function saveBaseline(baselinePath: string, baseline: SavedBaseline): void;
994
+ declare function runExplain(service: EngramAccessService, query: string): Promise<ExplainResult>;
995
+ declare function runBenchSuite(service: EngramAccessService, config?: BenchConfig): Promise<BenchmarkSuiteResult>;
996
+ declare function checkRegression(metrics: Record<string, number>, baseline: SavedBaseline | undefined, tolerance: number): RegressionGateResult;
997
+ declare function generateReport(results: RecallMetrics[], reportPath?: string): BenchmarkReport;
998
+
999
+ /**
1000
+ * Shared scoring utilities for bench runners.
1001
+ */
1002
+
1003
+ declare function exactMatch(predicted: string, expected: string | number | unknown): number;
1004
+ declare function f1Score(predicted: string, expected: string | number | unknown): number;
1005
+ declare function rougeL(predicted: string, expected: string | number | unknown): number;
1006
+ declare function recallAtK(retrieved: string[], relevant: string[], k: number): number;
1007
+ declare function precisionAtK(retrieved: string[], relevant: string[], k: number): number;
1008
+ declare function containsAnswer(predicted: string, expected: string | number | unknown): number;
1009
+ declare function llmJudgeScore(judge: {
1010
+ score(question: string, predicted: string, expected: string): Promise<number>;
1011
+ scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
1012
+ } | undefined, question: string, predicted: string, expected: string): Promise<number>;
1013
+ declare function llmJudgeScoreDetailed(judge: {
1014
+ score(question: string, predicted: string, expected: string): Promise<number>;
1015
+ scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
1016
+ } | undefined, question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
1017
+ declare function timed<T>(fn: () => Promise<T>): Promise<{
1018
+ result: T;
1019
+ durationMs: number;
1020
+ }>;
1021
+ declare function aggregateTaskScores(metricsList: Array<Record<string, number>>): AggregateMetrics;
1022
+
1023
+ interface BootstrapOptions {
1024
+ iterations?: number;
1025
+ level?: number;
1026
+ random?: () => number;
1027
+ }
1028
+ declare function bootstrapMeanConfidenceInterval(values: number[], options?: BootstrapOptions): ConfidenceInterval;
1029
+ declare function pairedDeltaConfidenceInterval(candidateValues: number[], baselineValues: number[], options?: BootstrapOptions): ConfidenceInterval;
1030
+
1031
+ declare function cohensD(candidateValues: number[], baselineValues: number[]): number;
1032
+ declare function interpretEffectSize(cohensDValue: number): EffectSizeInterpretation;
1033
+
1034
+ declare function compareResults(baseline: BenchmarkResult, candidate: BenchmarkResult, threshold?: number, lowerIsBetter?: ReadonlySet<string>): ComparisonResult;
1035
+ declare function getBenchmarkLowerIsBetter(benchmarkId: string): ReadonlySet<string>;
1036
+
1037
+ /**
1038
+ * Dataset-contamination guard.
1039
+ *
1040
+ * Published benchmark results carry a `datasetHash` in `BenchmarkResult.meta`
1041
+ * so the publishing pipeline can reject results whose dataset hash is known
1042
+ * to appear in an LLM's training corpus. The contamination list starts empty
1043
+ * and is extended as new contamination reports arrive.
1044
+ *
1045
+ * Entries are SHA-256 hex digests. Callers pass a `ContaminationManifest`
1046
+ * rather than a bare array so provenance / justification can be attached
1047
+ * alongside the hash. This keeps the audit trail visible when a result is
1048
+ * rejected.
1049
+ */
1050
+ interface ContaminationEntry {
1051
+ /** SHA-256 of the dataset payload as published. */
1052
+ datasetHash: string;
1053
+ /** Human-readable reason the dataset is considered contaminated. */
1054
+ reason: string;
1055
+ /** Optional citation / URL documenting the contamination report. */
1056
+ reference?: string;
1057
+ /** ISO-8601 timestamp when the entry was added. */
1058
+ addedAt: string;
1059
+ }
1060
+ interface ContaminationManifest {
1061
+ version: 1;
1062
+ entries: ContaminationEntry[];
1063
+ }
1064
+ interface ContaminationCheckResult {
1065
+ /** The dataset hash examined. */
1066
+ datasetHash: string;
1067
+ /** True when the dataset hash is NOT present on the contamination list. */
1068
+ clean: boolean;
1069
+ /** When `clean === false`, the matching manifest entry. */
1070
+ matched?: ContaminationEntry;
1071
+ }
1072
+ /**
1073
+ * Start with an empty list; upstream tooling populates this as contamination
1074
+ * reports surface. Keeping the default list empty avoids hard-coding public
1075
+ * values that could become stale.
1076
+ */
1077
+ declare const EMPTY_CONTAMINATION_MANIFEST: ContaminationManifest;
1078
+ declare function isContaminationManifest(value: unknown): value is ContaminationManifest;
1079
+ declare function isContaminationEntry(value: unknown): value is ContaminationEntry;
1080
+ declare function checkDatasetContamination(datasetHash: string, manifest?: ContaminationManifest): ContaminationCheckResult;
1081
+ /**
1082
+ * Merge an additional contamination entry into an existing manifest. Duplicate
1083
+ * hashes are collapsed (first-write wins) so manifests can be safely merged
1084
+ * across sources without ballooning.
1085
+ */
1086
+ declare function addContaminationEntry(manifest: ContaminationManifest, entry: ContaminationEntry): ContaminationManifest;
1087
+ declare function mergeContaminationManifests(...manifests: ContaminationManifest[]): ContaminationManifest;
1088
+
1089
+ interface StoredBenchmarkResultSummary {
1090
+ id: string;
1091
+ path: string;
1092
+ benchmark: string;
1093
+ timestamp: string;
1094
+ mode: BenchmarkMode;
1095
+ }
1096
+ interface StoredBenchmarkBaseline {
1097
+ name: string;
1098
+ savedAt: string;
1099
+ result: BenchmarkResult;
1100
+ source?: {
1101
+ id: string;
1102
+ path: string;
1103
+ };
1104
+ }
1105
+ interface StoredBenchmarkBaselineSummary {
1106
+ name: string;
1107
+ path: string;
1108
+ benchmark: string;
1109
+ timestamp: string;
1110
+ resultId: string;
1111
+ resultTimestamp: string;
1112
+ mode: BenchmarkMode;
1113
+ }
1114
+ type BenchmarkExportFormat = "json" | "csv" | "html";
1115
+ type BenchmarkPublishTarget = "remnic-ai";
1116
+ interface PublishedBenchmarkFeedEntry {
1117
+ benchmark: string;
1118
+ benchmarkTier: BenchmarkResult["meta"]["benchmarkTier"];
1119
+ resultId: string;
1120
+ timestamp: string;
1121
+ mode: BenchmarkMode;
1122
+ remnicVersion: string;
1123
+ gitSha: string;
1124
+ taskCount: number;
1125
+ aggregateMetrics: BenchmarkResult["results"]["aggregates"];
1126
+ cost: BenchmarkResult["cost"];
1127
+ environment: BenchmarkResult["environment"];
1128
+ integrity: {
1129
+ splitType: NonNullable<BenchmarkResult["meta"]["splitType"]>;
1130
+ qrelsSealedHash: string;
1131
+ judgePromptHash: string;
1132
+ datasetHash: string;
1133
+ canaryScore?: number;
1134
+ };
1135
+ }
1136
+ interface BuildBenchmarkPublishFeedOptions {
1137
+ /**
1138
+ * Contamination manifest applied to every candidate result. A result whose
1139
+ * `datasetHash` matches an entry is dropped from the published feed.
1140
+ * Defaults to the empty manifest.
1141
+ */
1142
+ contaminationManifest?: ContaminationManifest;
1143
+ }
1144
+ interface PublishedBenchmarkFeed {
1145
+ target: BenchmarkPublishTarget;
1146
+ generatedAt: string;
1147
+ benchmarks: PublishedBenchmarkFeedEntry[];
1148
+ /**
1149
+ * Records for every candidate result that was considered but dropped from
1150
+ * this feed because of an integrity concern. Exposed so tooling can surface
1151
+ * the dropped runs without grep-ing logs.
1152
+ */
1153
+ skipped?: PublishSkipRecord[];
1154
+ }
1155
+ declare function defaultBenchmarkBaselineDir(): string;
1156
+ declare function defaultBenchmarkPublishPath(target: BenchmarkPublishTarget): string;
1157
+ declare function loadBenchmarkResult(filePath: string): Promise<BenchmarkResult>;
1158
+ declare function listBenchmarkResults(outputDir: string): Promise<StoredBenchmarkResultSummary[]>;
1159
+ declare function saveBenchmarkBaseline(baselineDir: string, name: string, result: BenchmarkResult, source?: {
1160
+ id: string;
1161
+ path: string;
1162
+ }): Promise<string>;
1163
+ declare function loadBenchmarkBaseline(filePath: string): Promise<StoredBenchmarkBaseline>;
1164
+ declare function listBenchmarkBaselines(baselineDir: string): Promise<StoredBenchmarkBaselineSummary[]>;
1165
+ declare function resolveBenchmarkResultReference(outputDir: string, reference: string): Promise<StoredBenchmarkResultSummary | undefined>;
1166
+ declare function deleteBenchmarkResults(outputDir: string, references: string[]): Promise<{
1167
+ deleted: StoredBenchmarkResultSummary[];
1168
+ missing: string[];
1169
+ }>;
1170
+ /**
1171
+ * Throws if the result is missing any required integrity field. Called
1172
+ * explicitly by tooling (e.g. `remnic bench publish --strict`) that needs to
1173
+ * surface integrity gaps as errors rather than silently skipping the run.
1174
+ * The feed builder uses `isResultPublishable` below to filter non-fatal
1175
+ * conditions (public split, missing integrity) so a single bad result does
1176
+ * not block publishing older, valid holdout runs.
1177
+ */
1178
+ declare function assertPublishableIntegrity(result: BenchmarkResult, target: BenchmarkPublishTarget): void;
1179
+ type PublishSkipReason = "missing-integrity" | "non-holdout-split" | "contaminated-dataset";
1180
+ interface PublishSkipRecord {
1181
+ resultId: string;
1182
+ path: string;
1183
+ reason: PublishSkipReason;
1184
+ detail: string;
1185
+ }
1186
+ declare function buildBenchmarkPublishFeed(outputDir: string, target: BenchmarkPublishTarget, options?: BuildBenchmarkPublishFeedOptions): Promise<PublishedBenchmarkFeed>;
1187
+ declare function writeBenchmarkPublishFeed(feed: PublishedBenchmarkFeed, outputPath: string): Promise<string>;
1188
+ declare function renderBenchmarkResultExport(result: BenchmarkResult, format: BenchmarkExportFormat): string;
1189
+
1190
+ /**
1191
+ * Hash verification utilities used by the benchmark integrity pipeline.
1192
+ *
1193
+ * These helpers produce deterministic SHA-256 digests for sealed artifacts:
1194
+ * qrels payloads, judge prompts, dataset files, and encrypted seals. They are
1195
+ * intentionally simple and rely only on Node's built-in crypto module so the
1196
+ * bench package can verify seals without additional dependencies.
1197
+ *
1198
+ * Rules of the road:
1199
+ * - Hashes are lowercase hex strings. Always compare with `safeHexEqual` (length-checked, constant-time).
1200
+ * - Structured inputs are serialized with sorted keys so equivalent objects
1201
+ * produce identical digests. This aligns with CLAUDE.md gotcha #38.
1202
+ * - The AES-GCM seal helpers use 256-bit keys and 96-bit IVs; they are a
1203
+ * thin interface so CI + tests can exercise the flow without reaching for
1204
+ * a KMS. Production deployments should wire a real key-management backend.
1205
+ */
1206
+ declare const INTEGRITY_HASH_ALGORITHM: "sha256";
1207
+ declare const INTEGRITY_CIPHER_ALGORITHM: "aes-256-gcm";
1208
+ interface SealedArtifact {
1209
+ /** Version marker for the seal envelope. */
1210
+ version: 1;
1211
+ /** Symmetric cipher identifier. */
1212
+ algorithm: typeof INTEGRITY_CIPHER_ALGORITHM;
1213
+ /** Base64-encoded 96-bit IV. */
1214
+ iv: string;
1215
+ /** Base64-encoded 128-bit auth tag. */
1216
+ tag: string;
1217
+ /** Base64-encoded ciphertext. */
1218
+ ciphertext: string;
1219
+ /**
1220
+ * SHA-256 of the plaintext payload. Verified after decryption as a
1221
+ * defence-in-depth check against silent key rotation or ciphertext drift.
1222
+ */
1223
+ plaintextHash: string;
1224
+ }
1225
+ declare function hashString(value: string): string;
1226
+ declare function hashBytes(value: Uint8Array): string;
1227
+ /**
1228
+ * Canonicalize a JSON-serializable value so equivalent payloads produce the
1229
+ * same digest regardless of key insertion order.
1230
+ */
1231
+ declare function canonicalJsonStringify(value: unknown): string;
1232
+ declare function hashCanonicalJson(value: unknown): string;
1233
+ declare function isSha256Hex(value: unknown): value is string;
1234
+ declare function assertSha256Hex(value: unknown, label: string): string;
1235
+ /**
1236
+ * Constant-time equality check for hex digests. Returns `false` when inputs
1237
+ * differ in length — `timingSafeEqual` would otherwise throw.
1238
+ */
1239
+ declare function safeHexEqual(expected: string, actual: string): boolean;
1240
+ /**
1241
+ * Encrypt a plaintext payload with AES-256-GCM, returning a seal envelope.
1242
+ * The caller owns the key. A 96-bit IV is drawn from `crypto.randomBytes`
1243
+ * for each call — never reuse an IV with the same key.
1244
+ */
1245
+ declare function sealPayload(plaintext: string, key: Buffer): SealedArtifact;
1246
+ declare function openSeal(seal: SealedArtifact, key: Buffer): string;
1247
+ /**
1248
+ * Load a 32-byte AES key from an environment variable. The variable must
1249
+ * contain a base64-encoded 256-bit key. Returns `null` when unset so callers
1250
+ * can degrade gracefully in environments without a key-management backend.
1251
+ *
1252
+ * The input is validated against a strict base64 pattern before decoding
1253
+ * because Node's `Buffer.from(x, "base64")` silently ignores non-base64
1254
+ * characters and never throws — accepting a malformed key would surface
1255
+ * only later as an opaque decryption or hash mismatch error.
1256
+ */
1257
+ declare function loadSealKeyFromEnv(envName: string): Buffer | null;
1258
+
1259
+ /**
1260
+ * Sealed qrels loader.
1261
+ *
1262
+ * The threat model (see `docs/bench/integrity.md`) is that the runner-side
1263
+ * adapter never sees ground-truth answers: they live only inside the judge /
1264
+ * scorer process. This module enforces that boundary by:
1265
+ *
1266
+ * 1. Loading a sealed qrels artifact from disk.
1267
+ * 2. Verifying its declared SHA-256 hash against the expected value pinned
1268
+ * in the benchmark's metadata (the "seal hash").
1269
+ * 3. Decrypting the payload only when a caller provides the correct seal
1270
+ * key. Callers that only need the seal hash (e.g. the runner emitting
1271
+ * `BenchmarkResult.meta.qrelsSealedHash`) never receive plaintext.
1272
+ *
1273
+ * The artifact format is JSON:
1274
+ *
1275
+ * ```json
1276
+ * {
1277
+ * "benchmark": "<benchmark-id>",
1278
+ * "version": 1,
1279
+ * "sealHash": "<sha256-of-canonical-envelope-json>",
1280
+ * "envelope": { SealedArtifact }
1281
+ * }
1282
+ * ```
1283
+ *
1284
+ * `sealHash` is computed over the canonical JSON of `envelope` (IV, auth tag,
1285
+ * ciphertext, and plaintext hash), so it pins one specific sealed artifact:
1286
+ * re-sealing even identical plaintext draws a fresh IV and changes `sealHash`.
1287
+ */
1288
+
1289
+ interface SealedQrelsArtifact {
1290
+ benchmark: string;
1291
+ version: 1;
1292
+ sealHash: string;
1293
+ envelope: SealedArtifact;
1294
+ }
1295
+ interface SealedQrelsHandle {
1296
+ benchmark: string;
1297
+ sealHash: string;
1298
+ /**
1299
+ * Returns the decrypted qrels payload (typed `unknown`; narrow before use). Callers must pass the
1300
+ * seal key explicitly; the handle never caches plaintext.
1301
+ */
1302
+ unseal(key: Buffer): unknown;
1303
+ }
1304
+ interface LoadSealedQrelsOptions {
1305
+ /**
1306
+ * Expected seal hash pinned at benchmark registration. If provided the
1307
+ * loader rejects the artifact when the computed hash does not match.
1308
+ */
1309
+ expectedSealHash?: string;
1310
+ /**
1311
+ * Benchmark ID the artifact must declare. When omitted, any benchmark ID
1312
+ * is accepted so tooling can inspect unknown artifacts.
1313
+ */
1314
+ expectedBenchmarkId?: string;
1315
+ }
1316
+ declare function isSealedQrelsArtifact(value: unknown): value is SealedQrelsArtifact;
1317
+ declare function computeSealHash(envelope: SealedArtifact): string;
1318
+ declare function parseSealedQrels(raw: string, options?: LoadSealedQrelsOptions): SealedQrelsHandle;
1319
+ declare function loadSealedQrels(filePath: string, options?: LoadSealedQrelsOptions): Promise<SealedQrelsHandle>;
1320
+ /**
1321
+ * Serialize a sealed qrels artifact to the canonical on-disk shape. Useful
1322
+ * for tooling that authors new qrels files.
1323
+ */
1324
+ declare function serializeSealedQrels(artifact: SealedQrelsArtifact): string;
1325
+
1326
+ /**
1327
+ * Canary adapter for exploit-detection runs.
1328
+ *
1329
+ * A canary adapter never actually solves a benchmark task. It returns a
1330
+ * deterministic, deliberately-wrong response to every query so that the
1331
+ * exploit-audit workflow can measure how much score a benchmark assigns to
1332
+ * a do-nothing system. If the canary scores above the configured floor
1333
+ * (default `0.1`) on any benchmark, the benchmark is flagged as exploitable
1334
+ * and demoted until fixed.
1335
+ *
1336
+ * This adapter must never be used in production bench runs; it exists only
1337
+ * for the `bench-exploit-audit` CI workflow.
1338
+ */
1339
+
1340
+ /** The fixed reply the canary returns for every `recall`. */
1341
+ declare const CANARY_FIXED_RECALL = "__remnic_canary_response__";
1342
+ /** The score floor the canary must NOT exceed for any benchmark. */
1343
+ declare const CANARY_SCORE_FLOOR = 0.1;
1344
+ interface CanaryAdapterOptions {
1345
+ /**
1346
+ * Override the response string used by `recall`. Useful for running two
1347
+ * canary variants side-by-side (e.g. empty string vs fixed string).
1348
+ */
1349
+ response?: string;
1350
+ /**
1351
+ * If true, `search` returns an empty array instead of a single fake hit.
1352
+ * Some benchmarks rely on the retrieval surface; keeping the default
1353
+ * "one fake hit" covers retrieval-style scorers too.
1354
+ */
1355
+ emptySearch?: boolean;
1356
+ }
1357
+ declare function createCanaryAdapter(options?: CanaryAdapterOptions): BenchMemoryAdapter;
1358
+ interface CanaryFloorCheck {
1359
+ benchmark: string;
1360
+ score: number;
1361
+ floor: number;
1362
+ passed: boolean;
1363
+ }
1364
+ /**
1365
+ * Compare a canary score against the configured floor. Returns a structured
1366
+ * result rather than throwing so callers can aggregate failures across an
1367
+ * entire benchmark suite before reporting.
1368
+ */
1369
+ declare function assertCanaryUnderFloor(benchmark: string, score: number, floor?: number): CanaryFloorCheck;
1370
+
1371
+ /**
1372
+ * Randomization helpers for the integrity pipeline.
1373
+ *
1374
+ * These helpers remove position-in-prompt and fixture-layout exploits:
1375
+ * - `shuffleTasks` randomizes task order per run so a memorized task position
1376
+ * cannot be exploited.
1377
+ * - `rotateDistractors` rotates multiple-choice answer positions and the set
1378
+ * of distractors so answer-position memorization is defeated.
1379
+ * - `selectFixtureVariant` picks a variant by seed so each run exercises a
1380
+ * different fixture graph layout.
1381
+ *
1382
+ * All helpers are seeded. A seeded mulberry32 PRNG gives deterministic,
1383
+ * reproducible shuffles that do not rely on `Math.random`.
1384
+ */
1385
+ interface SeededRng {
1386
+ /** Returns a pseudo-random number in `[0, 1)`. */
1387
+ next(): number;
1388
+ }
1389
+ /**
1390
+ * Deterministic 32-bit PRNG. Mulberry32 is small, fast, and sufficient for
1391
+ * shuffling benchmark tasks. Do NOT use for cryptographic operations.
1392
+ */
1393
+ declare function createSeededRng(seed: number): SeededRng;
1394
+ /**
1395
+ * Fisher-Yates shuffle using a seeded PRNG. Returns a new array.
1396
+ */
1397
+ declare function shuffleTasks<T>(tasks: readonly T[], seed: number): T[];
1398
+ interface MultipleChoiceQuestion<T> {
1399
+ /** Correct answer. Need not appear in `distractors`; it is added to the choice pool and de-duplicated. */
1400
+ correct: T;
1401
+ /** Distractor pool. The correct answer may or may not be present. */
1402
+ distractors: readonly T[];
1403
+ }
1404
+ interface RotatedChoices<T> {
1405
+ /** The choices in rotated order. */
1406
+ choices: T[];
1407
+ /** The index of the correct answer in `choices`. */
1408
+ correctIndex: number;
1409
+ }
1410
+ /**
1411
+ * Rotate the distractor set and answer position for a multiple-choice
1412
+ * question. The full choice pool is `[correct, ...distractors]` with
1413
+ * duplicates removed; the pool is shuffled and the correct-answer index is
1414
+ * reported back to the caller. Callers re-score against `correctIndex`.
1415
+ */
1416
+ declare function rotateDistractors<T>(question: MultipleChoiceQuestion<T>, seed: number): RotatedChoices<T>;
1417
+ interface FixtureVariant<T> {
1418
+ id: string;
1419
+ value: T;
1420
+ }
1421
+ /**
1422
+ * Pick one fixture variant by seed. Stable: the same seed always returns the
1423
+ * same variant index for a given variant list length.
1424
+ */
1425
+ declare function selectFixtureVariant<T>(variants: readonly FixtureVariant<T>[], seed: number): FixtureVariant<T>;
1426
+
1427
+ /**
1428
+ * YAML custom benchmark loader.
1429
+ */
1430
+
1431
+ declare function parseCustomBenchmark(source: string): CustomBenchmarkSpec;
1432
+ declare function loadCustomBenchmarkFile(filePath: string): Promise<CustomBenchmarkSpec>;
1433
+
1434
+ /**
1435
+ * Custom benchmark runner.
1436
+ */
1437
+
1438
+ declare function runCustomBenchmarkFile(filePath: string, options: RunBenchmarkOptions): Promise<BenchmarkResult>;
1439
+
1440
+ type SchemaTierName = "clean" | "dirty";
1441
+ interface SchemaTierPageFrontmatter {
1442
+ title?: string;
1443
+ type?: string;
1444
+ state?: string;
1445
+ created?: string;
1446
+ seeAlso?: string[];
1447
+ timeline?: string[];
1448
+ }
1449
+ interface SchemaTierPage {
1450
+ id: string;
1451
+ owner: string;
1452
+ namespace: string;
1453
+ canonicalTitle: string;
1454
+ title: string;
1455
+ type: string;
1456
+ createdAt: string;
1457
+ aliases: string[];
1458
+ body: string;
1459
+ frontmatter: SchemaTierPageFrontmatter;
1460
+ seeAlso: string[];
1461
+ timeline: string[];
1462
+ dirtySignals: string[];
1463
+ }
1464
+ interface PersonalizationRetrievalCase {
1465
+ id: string;
1466
+ query: string;
1467
+ expectedPageIds: string[];
1468
+ expectedNamespace: string;
1469
+ expectedOwner: string;
1470
+ }
1471
+ interface TemporalRetrievalCase {
1472
+ id: string;
1473
+ query: string;
1474
+ window: {
1475
+ start: string;
1476
+ end: string;
1477
+ };
1478
+ expectedPageIds: string[];
1479
+ }
1480
+ interface AbstentionRetrievalCase {
1481
+ id: string;
1482
+ query: string;
1483
+ reason: "missing_fact" | "cross_tenant" | "hallucination_bait";
1484
+ }
1485
+ interface SchemaTierCorpus {
1486
+ pages: SchemaTierPage[];
1487
+ }
1488
+ interface SchemaTierFixture {
1489
+ seed: number;
1490
+ clean: SchemaTierCorpus;
1491
+ dirty: SchemaTierCorpus;
1492
+ personalizationCases: PersonalizationRetrievalCase[];
1493
+ temporalCases: TemporalRetrievalCase[];
1494
+ abstentionCases: AbstentionRetrievalCase[];
1495
+ }
1496
+ declare function buildSchemaTierFixture(seed?: number): SchemaTierFixture;
1497
+ declare function buildSchemaTierSmokeFixture(seed?: number): SchemaTierFixture;
1498
+ declare const SCHEMA_TIER_FIXTURE: SchemaTierFixture;
1499
+ declare const SCHEMA_TIER_SMOKE_FIXTURE: SchemaTierFixture;
1500
+
1501
+ /**
1502
+ * Scoring utilities for ingestion benchmarks.
1503
+ */
1504
+
1505
+ declare function matchEntity(extracted: ExtractedEntity, gold: GoldEntity): boolean;
1506
+ declare function entityRecall(extracted: ExtractedEntity[], gold: GoldEntity[]): {
1507
+ overall: number;
1508
+ byType: Record<string, number>;
1509
+ };
1510
+ declare function linkMatches(extracted: ExtractedLink, gold: GoldLink): boolean;
1511
+ declare function backlinkF1(extracted: ExtractedLink[], gold: GoldLink[]): {
1512
+ precision: number;
1513
+ recall: number;
1514
+ f1: number;
1515
+ };
1516
+ declare function schemaCompleteness(pages: ExtractedPage[], goldPages: GoldPage[], requiredFields: readonly string[]): {
1517
+ overall: number;
1518
+ fieldCoverage: Record<string, number>;
1519
+ };
1520
+
1521
+ /**
1522
+ * Synthetic email fixture generator.
1523
+ *
1524
+ * Produces a well-formed mbox file (~10-12 messages across 5 threads) covering
1525
+ * the entities defined in email-gold.ts. All data is entirely synthetic — no
1526
+ * real PII is present.
1527
+ */
1528
+
1529
+ declare const emailFixture: FixtureGenerator;
1530
+
1531
+ /**
1532
+ * Synthetic project-folder fixture for ingestion benchmarks.
1533
+ *
1534
+ * Generates a nested directory of markdown, JSON, and text files simulating
1535
+ * a project workspace. All names, organisations, and content are entirely fictional.
1536
+ */
1537
+
1538
+ declare const projectFolderFixture: FixtureGenerator;
1539
+
1540
+ /**
1541
+ * Synthetic calendar ICS fixture for ingestion benchmarks.
1542
+ *
1543
+ * Generates a VCALENDAR file with recurring and one-off events.
1544
+ * All names, organisations, and content are entirely fictional.
1545
+ */
1546
+
1547
+ declare const calendarFixture: FixtureGenerator;
1548
+
1549
+ /**
1550
+ * Synthetic chat transcript fixture for ingestion benchmarks.
1551
+ *
1552
+ * Generates a Slack-style JSON transcript across three channels and one DM.
1553
+ * All names, organisations, and content are entirely fictional.
1554
+ */
1555
+
1556
+ declare const chatFixture: FixtureGenerator;
1557
+
1558
+ /**
1559
+ * Sealed rubric prompt registry.
1560
+ *
1561
+ * The canonical form of each sealed rubric prompt is a frozen string literal
1562
+ * in this registry. The matching `.md` file in this directory is a
1563
+ * human-readable mirror kept for reviewers — the `.md` is never loaded at
1564
+ * runtime. This keeps bundling trivial (no filesystem assets) while still
1565
+ * letting reviewers audit rubric text as prose.
1566
+ *
1567
+ * Rotation policy:
1568
+ * - Never edit an existing entry in place.
1569
+ * - Add a new key (`assistant-rubric-v2`, etc.) and ship a matching `.md`.
1570
+ * - Keep the old entry available so historical benchmark results remain
1571
+ * reproducible.
1572
+ */
1573
+ declare const SEALED_PROMPT_REGISTRY: Readonly<Record<string, string>>;
1574
+ declare const DEFAULT_ASSISTANT_RUBRIC_ID = "assistant-rubric-v1";
1575
+
1576
+ /**
1577
+ * Shared types for the Assistant bench tier.
1578
+ *
1579
+ * Every Assistant benchmark shares the same shape:
1580
+ * - A synthetic memory graph (facts, stances, entities) the agent may read.
1581
+ * - A scenario prompt given to the agent.
1582
+ * - A sealed-rubric judge pass that scores the agent's output along
1583
+ * identity_accuracy / stance_coherence / novelty / calibration.
1584
+ *
1585
+ * The goal is reviewability: each benchmark folder ships a small fixture.ts
1586
+ * that returns `AssistantScenario` values, and the runner wires the shared
1587
+ * multi-run + bootstrap-CI infrastructure around them.
1588
+ */
1589
+
1590
+ interface AssistantMemoryFact {
1591
+ id: string;
1592
+ summary: string;
1593
+ /**
1594
+ * Free-form tags (topic, entity) used to render the memory-graph summary
1595
+ * that is handed to the judge. Not shown to the agent.
1596
+ */
1597
+ tags?: string[];
1598
+ }
1599
+ interface AssistantStance {
1600
+ topic: string;
1601
+ position: string;
1602
+ }
1603
+ interface AssistantMemoryGraph {
1604
+ userHandle: string;
1605
+ userRole: string;
1606
+ facts: AssistantMemoryFact[];
1607
+ stances: AssistantStance[];
1608
+ openThreads: string[];
1609
+ }
1610
+ interface AssistantScenario {
1611
+ id: string;
1612
+ title: string;
1613
+ scenarioPrompt: string;
1614
+ memoryGraph: AssistantMemoryGraph;
1615
+ /**
1616
+ * Small label describing what the scenario is meant to exercise. Useful in
1617
+ * dashboards for filtering. Never exposed to the agent.
1618
+ */
1619
+ focus: string;
1620
+ }
1621
+ /**
1622
+ * Minimal agent contract for the Assistant tier. The agent receives the
1623
+ * scenario prompt plus a pre-rendered memory view (analogous to what the
1624
+ * Remnic recall stack would hand to a downstream chat model), and returns
1625
+ * its final answer text.
1626
+ */
1627
+ interface AssistantAgent {
1628
+ respond(request: {
1629
+ scenarioId: string;
1630
+ prompt: string;
1631
+ memoryView: string;
1632
+ }): Promise<string>;
1633
+ }
1634
+ interface AssistantRunnerOptions {
1635
+ agent: AssistantAgent;
1636
+ judge: StructuredJudge | undefined;
1637
+ rubricId?: string;
1638
+ /**
1639
+ * Directory where per-run spot-check JSONL files are appended. Defaults to
1640
+ * `<cwd>/benchmarks/results/spot-checks`.
1641
+ */
1642
+ spotCheckDir?: string;
1643
+ /**
1644
+ * Seed array for deterministic multi-run scheduling. When omitted the
1645
+ * benchmark runner picks a fresh seed array via `buildBenchmarkRunSeeds`.
1646
+ */
1647
+ seeds?: number[];
1648
+ /**
1649
+ * Override used by tests and CLI smoke runs to cap iterations. Must be
1650
+ * `>= 1`. The production contract is `>= 5` per the issue spec.
1651
+ */
1652
+ runCount?: number;
1653
+ /**
1654
+ * Random-number factory for bootstrap sampling. Injected in tests.
1655
+ */
1656
+ random?: () => number;
1657
+ }
1658
+
1659
+ /**
1660
+ * Shared runner scaffolding for the Assistant bench tier.
1661
+ *
1662
+ * Builds the `BenchmarkResult` shape used by the existing dashboard, but with
1663
+ * per-dimension rubric scores (identity_accuracy, stance_coherence, novelty,
1664
+ * calibration) and bootstrap 95% confidence intervals attached. Each
1665
+ * scenario is executed `runCount` times (default 5) and the per-run means
1666
+ * feed the bootstrap so the dashboard can render error bars.
1667
+ */
1668
+
1669
+ declare function runAssistantBenchmark(definition: BenchmarkDefinition, scenarios: AssistantScenario[], resolved: ResolvedRunBenchmarkOptions, runnerOptions: AssistantRunnerOptions): Promise<BenchmarkResult>;
1670
+ declare function renderMemoryViewForAgent(graph: AssistantMemoryGraph): string;
1671
+ declare function renderMemorySummaryForJudge(graph: AssistantMemoryGraph): string;
1672
+
1673
+ /**
1674
+ * Default assistant agent + judge wiring for the Assistant bench tier.
1675
+ *
1676
+ * The assistant tier is designed to be driven by a real provider-backed agent
1677
+ * and a provider-backed structured judge, but we must also run deterministic
1678
+ * smoke tests under `--test` and in CI without network access.
1679
+ *
1680
+ * This module provides:
1681
+ * - `resolveAssistantAgent()` — returns an `AssistantAgent` built from the
1682
+ * injected `resolved.remnicConfig.assistantAgent` hook if present, else
1683
+ * falls back to a deterministic agent that stringifies the memory view.
1684
+ * - `resolveStructuredJudge()` — mirror for the structured judge.
1685
+ *
1686
+ * Injection happens through `remnicConfig` because that field is already the
1687
+ * benchmark-framework's pass-through channel for runner-specific config. The
1688
+ * CLI will set it; tests set it directly on the options record.
1689
+ */
1690
+
1691
+ /** Config key `"assistantAgent"`: the `remnicConfig` property through which the assistant agent hook is injected. */ declare const ASSISTANT_AGENT_CONFIG_KEY = "assistantAgent";
1692
+ /** Config key `"assistantJudge"`; presumably the `remnicConfig` property for the injected structured judge — confirm against the implementation. */ declare const ASSISTANT_JUDGE_CONFIG_KEY = "assistantJudge";
1693
+ /** Config key `"assistantSeeds"`; presumably the `remnicConfig` property read by `resolveAssistantSeeds` — confirm. */ declare const ASSISTANT_SEEDS_CONFIG_KEY = "assistantSeeds";
1694
+ /** Config key `"assistantSpotCheckDir"`; presumably the `remnicConfig` property read by `resolveAssistantSpotCheckDir` — confirm. */ declare const ASSISTANT_SPOT_CHECK_DIR_KEY = "assistantSpotCheckDir";
1695
+ /** Config key `"assistantRubricId"`; presumably the `remnicConfig` property read by `resolveAssistantRubricId` — confirm. */ declare const ASSISTANT_RUBRIC_ID_KEY = "assistantRubricId";
1696
+ /** Returns the `AssistantAgent` to use: built from the injected `resolved.remnicConfig.assistantAgent` hook when present, otherwise a deterministic agent that stringifies the memory view. */ declare function resolveAssistantAgent(resolved: ResolvedRunBenchmarkOptions): AssistantAgent;
1697
+ /** Structured-judge counterpart of `resolveAssistantAgent`. Unlike the agent resolver, the declared return type allows `undefined`; the conditions under which that happens are not visible in this declaration file. */ declare function resolveStructuredJudge(resolved: ResolvedRunBenchmarkOptions): StructuredJudge | undefined;
1698
+ /** Returns a list of numeric seeds from the resolved options, or `undefined`. NOTE(review): presumably sourced from `remnicConfig` — confirm. */ declare function resolveAssistantSeeds(resolved: ResolvedRunBenchmarkOptions): number[] | undefined;
1699
+ /** Returns a spot-check directory as a string from the resolved options, or `undefined`. NOTE(review): presumably sourced from `remnicConfig` — confirm. */ declare function resolveAssistantSpotCheckDir(resolved: ResolvedRunBenchmarkOptions): string | undefined;
1700
+ /** Returns a rubric id string from the resolved options, or `undefined`. NOTE(review): presumably sourced from `remnicConfig` — confirm. */ declare function resolveAssistantRubricId(resolved: ResolvedRunBenchmarkOptions): string | undefined;
1701
+
1702
+ /** Scenario list for the morning-brief assistant bench (per the name). */ declare const ASSISTANT_MORNING_BRIEF_SCENARIOS: AssistantScenario[];
1703
+ /** Smoke-test scenario list for the morning-brief assistant bench. NOTE(review): presumably a reduced set for deterministic smoke runs — confirm. */ declare const ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS: AssistantScenario[];
1704
+
1705
+ /**
1706
+ * Assistant bench: proactive morning brief.
1707
+ *
1708
+ * Exercises whether the assistant can surface what the user should know and
1709
+ * act on first when they sit down in the morning. Scored by a sealed rubric
1710
+ * along identity_accuracy, stance_coherence, novelty, and calibration.
1711
+ */
1712
+
1713
+ /** `BenchmarkDefinition` for the proactive morning-brief assistant bench. */ declare const assistantMorningBriefDefinition: BenchmarkDefinition;
1714
+ /** Runs the proactive morning-brief assistant bench with the resolved run options; resolves to a `BenchmarkResult`. */ declare function runAssistantMorningBriefBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
1715
+
1716
+ /** Scenario list for the meeting-prep assistant bench (per the name). */ declare const ASSISTANT_MEETING_PREP_SCENARIOS: AssistantScenario[];
1717
+ /** Smoke-test scenario list for the meeting-prep assistant bench. NOTE(review): presumably a reduced set for deterministic smoke runs — confirm. */ declare const ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS: AssistantScenario[];
1718
+
1719
+ /**
1720
+ * Assistant bench: meeting prep.
1721
+ *
1722
+ * Given an upcoming meeting and attendees, generate a prep brief. Judged on
1723
+ * attendee-context accuracy, topic recall, and open-thread surfacing.
1724
+ */
1725
+
1726
+ /** `BenchmarkDefinition` for the meeting-prep assistant bench. */ declare const assistantMeetingPrepDefinition: BenchmarkDefinition;
1727
+ /** Runs the meeting-prep assistant bench with the resolved run options; resolves to a `BenchmarkResult`. */ declare function runAssistantMeetingPrepBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
1728
+
1729
+ /** Scenario list for the next-best-action assistant bench (per the name). */ declare const ASSISTANT_NEXT_BEST_ACTION_SCENARIOS: AssistantScenario[];
1730
+ /** Smoke-test scenario list for the next-best-action assistant bench. NOTE(review): presumably a reduced set for deterministic smoke runs — confirm. */ declare const ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS: AssistantScenario[];
1731
+
1732
+ /**
1733
+ * Assistant bench: next-best-action.
1734
+ *
1735
+ * Given current state, what should the user do next? Judged on grounding in
1736
+ * the memory graph (not generic advice) and on calibration — abstaining on
1737
+ * weak-evidence questions rather than confidently inventing answers.
1738
+ */
1739
+
1740
+ /** `BenchmarkDefinition` for the next-best-action assistant bench. */ declare const assistantNextBestActionDefinition: BenchmarkDefinition;
1741
+ /** Runs the next-best-action assistant bench with the resolved run options; resolves to a `BenchmarkResult`. */ declare function runAssistantNextBestActionBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
1742
+
1743
+ /** Scenario list for the multi-document synthesis assistant bench (per the name). */ declare const ASSISTANT_SYNTHESIS_SCENARIOS: AssistantScenario[];
1744
+ /** Smoke-test scenario list for the synthesis assistant bench. NOTE(review): presumably a reduced set for deterministic smoke runs — confirm. */ declare const ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS: AssistantScenario[];
1745
+
1746
+ /**
1747
+ * Assistant bench: multi-document synthesis with stance.
1748
+ *
1749
+ * "What does the brain think about X?" — the agent must integrate across
1750
+ * multiple memory items and reflect the user's previously-expressed stance,
1751
+ * rather than regurgitating the single top-k chunk.
1752
+ */
1753
+
1754
+ /** `BenchmarkDefinition` for the multi-document synthesis-with-stance assistant bench. */ declare const assistantSynthesisDefinition: BenchmarkDefinition;
1755
+ /** Runs the multi-document synthesis assistant bench with the resolved run options; resolves to a `BenchmarkResult`. */ declare function runAssistantSynthesisBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
1756
+
1757
+ export { ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkPublishFeedOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ASSISTANT_RUBRIC_ID, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type FixtureGenerator, type FixtureOutput, type 
FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, type LlmJudge, type LlmProvider, type LoadSealedQrelsOptions, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type MultipleChoiceQuestion, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, type PersonalizationRetrievalCase, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TokenUsage, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildBenchmarkPublishFeed, buildBenchmarkRunSeeds, buildJudgePayload, 
buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createAnthropicProvider, createCanaryAdapter, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createOllamaProvider, createOpenAiCompatibleProvider, createProvider, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, f1Score, generateReport, getBenchmark, getBenchmarkLowerIsBetter, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, llmJudgeScore, llmJudgeScoreDetailed, loadBaseline, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, 
runAssistantSynthesisBenchmark, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectFixtureVariant, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkPublishFeed, writeBenchmarkResult, zeroScores };