@remnic/bench 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/dist/index.d.ts +1757 -0
- package/dist/index.js +13468 -0
- package/package.json +46 -0
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,1757 @@
|
|
|
1
|
+
import { GatewayConfig, EngramAccessService } from '@remnic/core';
|
|
2
|
+
|
|
3
|
+
/**
 * Types for the ingestion benchmark tier.
 */
/** Closed set of entity categories accepted on a {@link GoldEntity}. */
type GoldEntityType = "person" | "org" | "project" | "topic" | "event" | "location";
/** One entity entry of a {@link GoldGraph}. */
interface GoldEntity {
    id: string;
    name: string;
    type: GoldEntityType;
    /** Optional list of alternative names. */
    aliases?: string[];
}
/** One link entry of a {@link GoldGraph}. */
interface GoldLink {
    /** NOTE(review): presumably refers to a `GoldEntity` by id or name — confirm against the scorer. */
    source: string;
    target: string;
    relation: string;
    bidirectional: boolean;
}
/** One page entry of a {@link GoldGraph}. */
interface GoldPage {
    title: string;
    requiredFields: string[];
    expectTimeline: boolean;
    expectExecSummary: boolean;
    expectSeeAlso: string[];
}
/** Bundle of gold entities, links and pages. */
interface GoldGraph {
    entities: GoldEntity[];
    links: GoldLink[];
    pages: GoldPage[];
}
|
|
31
|
+
/** Entity entry of a {@link MemoryGraph}; `type` is a free-form string here. */
interface ExtractedEntity {
    name: string;
    type: string;
    sourceFile: string;
}
/** Link entry of a {@link MemoryGraph}. */
interface ExtractedLink {
    source: string;
    target: string;
    relation: string;
}
/** Page entry of a {@link MemoryGraph}. */
interface ExtractedPage {
    path: string;
    title: string;
    /** Frontmatter key/value pairs; values are untyped and must be narrowed before use. */
    frontmatter: Record<string, unknown>;
    hasExecSummary: boolean;
    hasTimeline: boolean;
    seeAlso: string[];
    content: string;
}
/** Bundle of extracted entities, links and pages. */
interface MemoryGraph {
    entities: ExtractedEntity[];
    links: ExtractedLink[];
    pages: ExtractedPage[];
}
|
|
55
|
+
/** Log shape resolved by `IngestionBenchAdapter.ingest`. */
interface IngestionLog {
    commandsIssued: string[];
    promptsShown: string[];
    errors: string[];
    /** Duration in milliseconds. */
    durationMs: number;
}
|
|
61
|
+
/** Adapter contract for ingestion benchmarks. */
interface IngestionBenchAdapter {
    /**
     * @param inputDir Directory handed to the adapter.
     * @returns The log for this ingestion.
     */
    ingest(inputDir: string): Promise<IngestionLog>;
    /** Resolves with the adapter's memory graph. */
    getMemoryGraph(): Promise<MemoryGraph>;
    reset(): Promise<void>;
    destroy(): Promise<void>;
}
/** Read-only tuple of frontmatter field names. */
declare const REQUIRED_FRONTMATTER_FIELDS: readonly ["title", "type", "state", "created", "see-also"];
|
|
68
|
+
|
|
69
|
+
/**
 * Shared adapter contract for benchmarks running against Remnic memory systems.
 */
/** A single chat message. */
interface Message {
    role: "user" | "assistant" | "system";
    content: string;
}
/** One hit returned by `BenchMemoryAdapter.search`. */
interface SearchResult {
    turnIndex: number;
    role: string;
    snippet: string;
    sessionId: string;
    /** Optional score for the hit. */
    score?: number;
}
/** Counters returned by `BenchMemoryAdapter.getStats`. */
interface MemoryStats {
    totalMessages: number;
    totalSummaryNodes: number;
    maxDepth: number;
}
|
|
88
|
+
/** Result resolved by {@link BenchResponder.respond}. */
interface BenchResponse {
    text: string;
    /** Input/output token counts. */
    tokens: {
        input: number;
        output: number;
    };
    latencyMs: number;
    model: string;
}
/** Produces a {@link BenchResponse} from a question and recalled text. */
interface BenchResponder {
    respond(question: string, recalledText: string): Promise<BenchResponse>;
}
/** Result resolved by {@link BenchJudge.scoreWithMetrics}. */
interface BenchJudgeResult {
    score: number;
    /** Input/output token counts. */
    tokens: {
        input: number;
        output: number;
    };
    latencyMs: number;
    model?: string;
}
/** Judge contract: scores a predicted answer against an expected one. */
interface BenchJudge {
    /** Resolves with a numeric score only. */
    score(question: string, predicted: string, expected: string): Promise<number>;
    /** Optional variant that resolves with the score plus token, latency and model details. */
    scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
}
|
|
113
|
+
/** Memory adapter contract used by the benchmarks. */
interface BenchMemoryAdapter {
    store(sessionId: string, messages: Message[]): Promise<void>;
    /**
     * @param budgetChars Optional budget; presumably a character limit on the returned text — TODO confirm.
     * @returns The recalled text.
     */
    recall(sessionId: string, query: string, budgetChars?: number): Promise<string>;
    search(query: string, limit: number, sessionId?: string): Promise<SearchResult[]>;
    reset(sessionId?: string): Promise<void>;
    getStats(sessionId?: string): Promise<MemoryStats>;
    destroy(): Promise<void>;
    /** Optional responder attached to the adapter. */
    responder?: BenchResponder;
    /** Optional judge attached to the adapter. */
    judge?: BenchJudge;
}
/** Alias of {@link BenchJudge}. */
type LlmJudge = BenchJudge;
/** Alias of {@link BenchMemoryAdapter}. */
type MemorySystem = BenchMemoryAdapter;
|
|
125
|
+
|
|
126
|
+
/**
 * Integrity-facing additions to `BenchmarkResult.meta`.
 *
 * These fields are required for every published result and checked by the
 * publishing pipeline. See `docs/bench/integrity.md` for the rotation policy.
 */
declare const BENCHMARK_SPLIT_TYPES: readonly ["public", "holdout"];
/** Union of the entries of {@link BENCHMARK_SPLIT_TYPES}. */
type BenchmarkSplitType = (typeof BENCHMARK_SPLIT_TYPES)[number];
/** Integrity fields attached to a benchmark result. */
interface BenchmarkIntegrityMeta {
    /**
     * Which dataset split produced this result. Public leaderboard scores
     * only accept `holdout` results; `public` results are for self-reporting
     * and iteration.
     */
    splitType: BenchmarkSplitType;
    /** SHA-256 of the sealed qrels artifact used by the judge. */
    qrelsSealedHash: string;
    /** SHA-256 of the rendered judge prompt (post-template expansion). */
    judgePromptHash: string;
    /** SHA-256 of the dataset payload as served to the runner. */
    datasetHash: string;
    /**
     * Score that the canary adapter achieved on the same benchmark during the
     * audit run that produced this result. Must stay below the benchmark's
     * floor. Omitted only during the canary's own run.
     */
    canaryScore?: number;
}
|
|
154
|
+
/** Read-only tuple of integrity field names; `canaryScore` is not listed. */
declare const INTEGRITY_META_FIELDS: readonly ["splitType", "qrelsSealedHash", "judgePromptHash", "datasetHash"];
/**
 * Schema object for the integrity metadata. The three hash properties share
 * the pattern `^[0-9a-f]{64}$`, and `canaryScore` is not in `required`.
 */
declare const BENCHMARK_INTEGRITY_META_SCHEMA: {
    readonly type: "object";
    readonly required: readonly ["splitType", "qrelsSealedHash", "judgePromptHash", "datasetHash"];
    readonly properties: {
        readonly splitType: {
            readonly type: "string";
            readonly enum: readonly ["public", "holdout"];
        };
        readonly qrelsSealedHash: {
            readonly type: "string";
            readonly pattern: "^[0-9a-f]{64}$";
        };
        readonly judgePromptHash: {
            readonly type: "string";
            readonly pattern: "^[0-9a-f]{64}$";
        };
        readonly datasetHash: {
            readonly type: "string";
            readonly pattern: "^[0-9a-f]{64}$";
        };
        readonly canaryScore: {
            readonly type: "number";
        };
    };
};
|
|
180
|
+
/**
 * Type guard: when it returns `true`, `value` is narrowed to
 * `BenchmarkIntegrityMeta`.
 */
declare function integrityMetaIsComplete(value: unknown): value is BenchmarkIntegrityMeta;
/**
 * Throw a descriptive error listing every missing or malformed integrity
 * field. Used by the publishing pipeline.
 *
 * On normal return, `value` is narrowed to `BenchmarkIntegrityMeta`.
 */
declare function assertIntegrityMetaPresent(value: unknown): asserts value is BenchmarkIntegrityMeta;
|
|
186
|
+
|
|
187
|
+
type BenchmarkMode = "full" | "quick";
type BenchmarkTier = "published" | "remnic" | "custom";
type BenchmarkStatus = "ready" | "planned";
type BenchmarkCategory = "agentic" | "retrieval" | "conversational" | "ingestion";
type BenchRuntimeProfile = "baseline" | "real" | "openclaw-chain";
/** Provider identifiers built into the package. */
type BuiltInProvider = "openai" | "anthropic" | "ollama" | "litellm";
/** Provider selection: provider id, model, and an optional base URL. */
interface ProviderConfig {
    provider: BuiltInProvider;
    model: string;
    baseUrl?: string;
}
|
|
198
|
+
/** Input/output token counts for one task. */
interface TaskTokenUsage {
    input: number;
    output: number;
}
/** Result record for one benchmark task. */
interface TaskResult {
    taskId: string;
    question: string;
    expected: string;
    actual: string;
    /** Numeric scores keyed by name. */
    scores: Record<string, number>;
    latencyMs: number;
    tokens: TaskTokenUsage;
    /** Optional free-form details; values are untyped. */
    details?: Record<string, unknown>;
}
|
|
212
|
+
/** Summary statistics for one metric. */
interface MetricAggregate {
    mean: number;
    median: number;
    stdDev: number;
    min: number;
    max: number;
}
/** {@link MetricAggregate} values keyed by metric name. */
type AggregateMetrics = Record<string, MetricAggregate>;
/** Interval with lower and upper bounds and a `level`. */
interface ConfidenceInterval {
    lower: number;
    upper: number;
    level: number;
}
type EffectSizeInterpretation = "negligible" | "small" | "medium" | "large";
/** Cohen's d value together with its interpretation label. */
interface EffectSizeSummary {
    cohensD: number;
    interpretation: EffectSizeInterpretation;
}
/** Baseline-versus-candidate comparison for one metric. */
interface ComparisonMetricDelta {
    baseline: number;
    candidate: number;
    delta: number;
    percentChange: number;
    effectSize: EffectSizeSummary;
    ciOnDelta?: ConfidenceInterval;
}
/** Comparison result for one benchmark. */
interface ComparisonResult {
    benchmark: string;
    /** Per-metric deltas keyed by metric name. */
    metricDeltas: Record<string, ComparisonMetricDelta>;
    verdict: "pass" | "regression" | "improvement";
}
/** Statistical report attached to a benchmark result. */
interface StatisticalReport {
    confidenceIntervals: Record<string, ConfidenceInterval>;
    bootstrapSamples: number;
    effectSizes?: Record<string, EffectSizeSummary>;
    pairedComparison?: {
        baselineId: string;
        pValue: number;
        ciOnDelta: ConfidenceInterval;
    };
}
|
|
253
|
+
/**
 * Full benchmark result payload with five sections: `meta`, `config`, `cost`,
 * `results` and `environment`. The integrity fields on `meta` are optional.
 */
interface BenchmarkResult {
    meta: {
        id: string;
        benchmark: string;
        benchmarkTier: BenchmarkTier;
        version: string;
        remnicVersion: string;
        gitSha: string;
        timestamp: string;
        mode: BenchmarkMode;
        runCount: number;
        seeds: number[];
        /**
         * Which dataset split produced this result. Public leaderboard scores
         * only accept `holdout`; `public` is for self-reporting and iteration.
         */
        splitType?: BenchmarkSplitType;
        /** SHA-256 of the sealed qrels artifact used by the judge. */
        qrelsSealedHash?: string;
        /** SHA-256 of the rendered judge prompt (post-template expansion). */
        judgePromptHash?: string;
        /** SHA-256 of the dataset payload as served to the runner. */
        datasetHash?: string;
        /**
         * Canary-adapter score from the audit run that produced this result.
         * Must stay below the benchmark's canary floor.
         */
        canaryScore?: number;
    };
    config: {
        runtimeProfile?: BenchRuntimeProfile | null;
        systemProvider: ProviderConfig | null;
        judgeProvider: ProviderConfig | null;
        adapterMode: string;
        remnicConfig: Record<string, unknown>;
    };
    cost: {
        totalTokens: number;
        inputTokens: number;
        outputTokens: number;
        estimatedCostUsd: number;
        totalLatencyMs: number;
        meanQueryLatencyMs: number;
    };
    results: {
        tasks: TaskResult[];
        aggregates: AggregateMetrics;
        statistics?: StatisticalReport;
    };
    environment: {
        os: string;
        nodeVersion: string;
        hardware?: string;
    };
}
|
|
308
|
+
/** Descriptive metadata for a benchmark. */
interface BenchmarkMeta {
    name: string;
    version: string;
    description: string;
    category: BenchmarkCategory;
    citation?: string;
    /**
     * Optional integrity metadata declared on the benchmark itself (as opposed
     * to each result). When set, the publishing pipeline pins result-time
     * integrity hashes against these values.
     */
    integrity?: BenchmarkIntegrityMeta;
}

/** Benchmark entry: id, title, tier, status, runner availability and {@link BenchmarkMeta}. */
interface BenchmarkDefinition {
    id: string;
    title: string;
    tier: BenchmarkTier;
    status: BenchmarkStatus;
    runnerAvailable: boolean;
    meta: BenchmarkMeta;
}
|
|
330
|
+
/** Options for running a benchmark; only `system` is required. */
interface RunBenchmarkOptions {
    mode?: BenchmarkMode;
    datasetDir?: string;
    outputDir?: string;
    limit?: number;
    seed?: number;
    adapterMode?: string;
    runtimeProfile?: BenchRuntimeProfile | null;
    /** Memory adapter under test. */
    system: BenchMemoryAdapter;
    ingestionAdapter?: IngestionBenchAdapter;
    systemProvider?: ProviderConfig | null;
    judgeProvider?: ProviderConfig | null;
    remnicConfig?: Record<string, unknown>;
}
/** {@link RunBenchmarkOptions} with `mode` made required and the benchmark definition attached. */
interface ResolvedRunBenchmarkOptions extends RunBenchmarkOptions {
    mode: BenchmarkMode;
    benchmark: BenchmarkDefinition;
}
|
|
348
|
+
/** Tier labels recorded in {@link TierDetail} and the `tiersUsed` lists. */
type BenchTier = "exact_match" | "category_match" | "keyword_overlap" | "high_confidence" | "semantic_search" | "full_search" | "no_results";
/** Latency and result count for one tier. */
interface TierDetail {
    tier: BenchTier;
    latencyMs: number;
    resultsCount: number;
}
/** Per-query breakdown by tier. */
interface ExplainResult {
    query: string;
    tiersUsed: BenchTier[];
    tierResults: TierDetail[];
    durationMs: number;
    totalDurationMs: number;
}
/** Metrics for one query. */
interface RecallMetrics {
    query: string;
    latencyMs: number;
    tiersUsed: BenchTier[];
    throughput: number;
    resultsCount: number;
    totalDurationMs: number;
    tierDetails: TierDetail[];
}
/** Timestamped report with one entry per query. */
interface BenchmarkReport {
    timestamp: string;
    queries: Array<{
        query: string;
        tiersUsed: BenchTier[];
        durationMs: number;
        resultsCount: number;
        throughput: number;
        tierDetails: TierDetail[];
    }>;
    totalDurationMs: number;
}
|
|
382
|
+
/** Suite result: per-query metrics, the report, total duration and regression details. */
interface BenchmarkSuiteResult {
    results: RecallMetrics[];
    report: BenchmarkReport;
    totalDurationMs: number;
    regressions: RegressionDetail[];
}
|
|
388
|
+
/** Stored baseline: a version, a timestamp and numeric metrics keyed by name. */
interface SavedBaseline {
    version: number;
    timestamp: string;
    metrics: Record<string, number>;
}
/** Regression gate result: overall pass flag plus per-metric details. */
interface RegressionGateResult {
    passed: boolean;
    regressions: RegressionDetail[];
}
/** Comparison of one metric's current value against its baseline. */
interface RegressionDetail {
    metric: string;
    currentValue: number;
    baselineValue: number;
    tolerance: number;
    passed: boolean;
}
|
|
404
|
+
/** Bench configuration; every field is optional. */
interface BenchConfig {
    queries?: string[];
    iterations?: number;
    regressionTolerance?: number;
    baselinePath?: string;
    reportPath?: string;
    seed?: number;
    explain?: boolean;
}
|
|
413
|
+
|
|
414
|
+
/**
 * Custom benchmark schema types.
 */

/** Scoring identifiers accepted by a custom benchmark spec. */
type CustomBenchmarkScoring = "exact_match" | "f1" | "rouge_l" | "llm_judge";
/** One task of a custom benchmark: a question, its expected answer and optional tags. */
interface CustomBenchmarkTask {
    question: string;
    expected: string;
    tags?: string[];
}
|
|
424
|
+
/** Custom benchmark spec; `name`, `scoring` and `tasks` are required. */
interface CustomBenchmarkSpec {
    name: string;
    description?: string;
    version?: string;
    category?: BenchmarkCategory;
    citation?: string;
    scoring: CustomBenchmarkScoring;
    tasks: CustomBenchmarkTask[];
}
|
|
433
|
+
|
|
434
|
+
/**
 * Shared types for inbox fixture generators.
 */

/** A generated file: its relative path and its text content. */
interface GeneratedFile {
    relativePath: string;
    content: string;
}
|
|
442
|
+
/** Output of a fixture generator: generated files plus the matching gold graph. */
interface FixtureOutput {
    id: string;
    description: string;
    files: GeneratedFile[];
    goldGraph: GoldGraph;
}
/** Fixture generator with a synchronous `generate()`. */
interface FixtureGenerator {
    id: string;
    description: string;
    generate(): FixtureOutput;
}
|
|
453
|
+
|
|
454
|
+
/**
 * Package-owned Remnic adapters used by the phase-1 benchmark CLI surface.
 */

/** Options accepted by the adapter factories below; every field is optional. */
interface RemnicAdapterOptions {
    configOverrides?: Record<string, unknown>;
    preserveRuntimeDefaults?: boolean;
    responder?: BenchResponder;
    judge?: BenchJudge;
}
/** Factory resolving to a `BenchMemoryAdapter`. */
declare const createLightweightAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
/** Factory resolving to a `BenchMemoryAdapter`. */
declare const createRemnicAdapter: (options?: RemnicAdapterOptions) => Promise<BenchMemoryAdapter>;
|
|
466
|
+
|
|
467
|
+
/**
 * Minimal LLM provider contract for the bench engine.
 */

/** Optional settings for a completion call. */
interface CompletionOpts {
    systemPrompt?: string;
    temperature?: number;
    maxTokens?: number;
    headers?: Record<string, string>;
}
/** Result of a completion call. */
interface CompletionResult {
    text: string;
    /** Input/output token counts. */
    tokens: {
        input: number;
        output: number;
    };
    latencyMs: number;
    model: string;
}
/** Description of one discovered model. */
interface DiscoveredModel {
    id: string;
    name: string;
    contextLength: number;
    capabilities: ("completion" | "embedding" | "vision")[];
    quantization?: string;
    parameterCount?: string;
}
|
|
494
|
+
/** Fields shared by every provider config. */
interface ProviderBaseConfig {
    model: string;
    baseUrl?: string;
    apiKey?: string;
    headers?: Record<string, string>;
}
/** Config for the `openai` and `litellm` providers. */
interface OpenAiCompatibleProviderConfig extends ProviderBaseConfig {
    provider?: "openai" | "litellm";
}
/** Config for the `anthropic` provider. */
interface AnthropicProviderConfig extends ProviderBaseConfig {
    provider?: "anthropic";
    anthropicVersion?: string;
}
/** Config for the `ollama` provider. */
interface OllamaProviderConfig extends ProviderBaseConfig {
    provider?: "ollama";
}
/**
 * Union of the three provider configs with `provider` made required, so the
 * union can be discriminated on `provider`.
 */
type ProviderFactoryConfig = (OpenAiCompatibleProviderConfig & {
    provider: "openai" | "litellm";
}) | (AnthropicProviderConfig & {
    provider: "anthropic";
}) | (OllamaProviderConfig & {
    provider: "ollama";
});
|
|
517
|
+
/** Discovered models for one provider. */
interface ProviderDiscoveryResult {
    provider: BuiltInProvider;
    models: DiscoveredModel[];
}
|
|
521
|
+
/** Token counters returned by `LlmProvider.getUsage`. */
interface TokenUsage {
    inputTokens: number;
    outputTokens: number;
    totalTokens: number;
}
|
|
526
|
+
/** LLM provider contract; `embed` and `discover` are optional capabilities. */
interface LlmProvider {
    id: string;
    name: string;
    provider: BuiltInProvider;
    complete(prompt: string, opts?: CompletionOpts): Promise<CompletionResult>;
    /** Optional: resolves with one numeric vector per entry of `texts` (presumably — TODO confirm ordering). */
    embed?(texts: string[]): Promise<number[][]>;
    /** Optional: resolves with a list of discovered models. */
    discover?(): Promise<DiscoveredModel[]>;
    getUsage(): TokenUsage;
    resetUsage(): void;
}
|
|
536
|
+
|
|
537
|
+
/**
 * JSON schema contract for BenchmarkResult payloads.
 *
 * The integrity properties on `meta` and `config.runtimeProfile` are declared
 * under `properties` but are not listed in the matching `required` tuples.
 */
declare const BENCHMARK_RESULT_SCHEMA: {
    readonly type: "object";
    readonly required: readonly ["meta", "config", "cost", "results", "environment"];
    readonly properties: {
        readonly meta: {
            readonly type: "object";
            readonly required: readonly ["id", "benchmark", "benchmarkTier", "version", "remnicVersion", "gitSha", "timestamp", "mode", "runCount", "seeds"];
            readonly properties: {
                readonly id: { readonly type: "string" };
                readonly benchmark: { readonly type: "string" };
                readonly benchmarkTier: {
                    readonly type: "string";
                    readonly enum: readonly ["published", "remnic", "custom"];
                };
                readonly version: { readonly type: "string" };
                readonly remnicVersion: { readonly type: "string" };
                readonly gitSha: { readonly type: "string" };
                readonly timestamp: { readonly type: "string" };
                readonly mode: {
                    readonly type: "string";
                    readonly enum: readonly ["full", "quick"];
                };
                readonly runCount: { readonly type: "number" };
                readonly seeds: {
                    readonly type: "array";
                    readonly items: { readonly type: "number" };
                };
                readonly splitType: {
                    readonly type: "string";
                    readonly enum: readonly ["public", "holdout"];
                };
                readonly qrelsSealedHash: {
                    readonly type: "string";
                    readonly pattern: "^[0-9a-f]{64}$";
                };
                readonly judgePromptHash: {
                    readonly type: "string";
                    readonly pattern: "^[0-9a-f]{64}$";
                };
                readonly datasetHash: {
                    readonly type: "string";
                    readonly pattern: "^[0-9a-f]{64}$";
                };
                readonly canaryScore: { readonly type: "number" };
            };
        };
        readonly config: {
            readonly type: "object";
            readonly required: readonly ["systemProvider", "judgeProvider", "adapterMode", "remnicConfig"];
            readonly properties: {
                readonly runtimeProfile: {
                    readonly anyOf: readonly [{
                        readonly type: "null";
                    }, {
                        readonly type: "string";
                        readonly enum: readonly ["baseline", "real", "openclaw-chain"];
                    }];
                };
                readonly systemProvider: {
                    readonly anyOf: readonly [{
                        readonly type: "null";
                    }, {
                        readonly type: "object";
                        readonly required: readonly ["provider", "model"];
                        readonly properties: {
                            readonly provider: { readonly type: "string" };
                            readonly model: { readonly type: "string" };
                            readonly baseUrl: { readonly type: "string" };
                        };
                    }];
                };
                readonly judgeProvider: {
                    readonly anyOf: readonly [{
                        readonly type: "null";
                    }, {
                        readonly type: "object";
                        readonly required: readonly ["provider", "model"];
                        readonly properties: {
                            readonly provider: { readonly type: "string" };
                            readonly model: { readonly type: "string" };
                            readonly baseUrl: { readonly type: "string" };
                        };
                    }];
                };
                readonly adapterMode: { readonly type: "string" };
                readonly remnicConfig: { readonly type: "object" };
            };
        };
        readonly cost: {
            readonly type: "object";
            readonly required: readonly ["totalTokens", "inputTokens", "outputTokens", "estimatedCostUsd", "totalLatencyMs", "meanQueryLatencyMs"];
            readonly properties: {
                readonly totalTokens: { readonly type: "number" };
                readonly inputTokens: { readonly type: "number" };
                readonly outputTokens: { readonly type: "number" };
                readonly estimatedCostUsd: { readonly type: "number" };
                readonly totalLatencyMs: { readonly type: "number" };
                readonly meanQueryLatencyMs: { readonly type: "number" };
            };
        };
        readonly results: {
            readonly type: "object";
            readonly required: readonly ["tasks", "aggregates"];
            readonly properties: {
                readonly tasks: {
                    readonly type: "array";
                    readonly items: {
                        readonly type: "object";
                        readonly required: readonly ["taskId", "question", "expected", "actual", "scores", "latencyMs", "tokens"];
                        readonly properties: {
                            readonly taskId: { readonly type: "string" };
                            readonly question: { readonly type: "string" };
                            readonly expected: { readonly type: "string" };
                            readonly actual: { readonly type: "string" };
                            readonly scores: { readonly type: "object" };
                            readonly latencyMs: { readonly type: "number" };
                            readonly tokens: {
                                readonly type: "object";
                                readonly required: readonly ["input", "output"];
                                readonly properties: {
                                    readonly input: { readonly type: "number" };
                                    readonly output: { readonly type: "number" };
                                };
                            };
                        };
                    };
                };
                readonly aggregates: { readonly type: "object" };
                readonly statistics: { readonly type: "object" };
            };
        };
        readonly environment: {
            readonly type: "object";
            readonly required: readonly ["os", "nodeVersion"];
            readonly properties: {
                readonly os: { readonly type: "string" };
                readonly nodeVersion: { readonly type: "string" };
                readonly hardware: { readonly type: "string" };
            };
        };
    };
};
|
|
754
|
+
|
|
755
|
+
/** Builds an `LlmProvider` from an Anthropic provider config. */
declare function createAnthropicProvider(config: AnthropicProviderConfig): LlmProvider;

/** Builds an `LlmProvider` from any `ProviderFactoryConfig` variant. */
declare function createProvider(config: ProviderFactoryConfig): LlmProvider;
/** Resolves with one `ProviderDiscoveryResult` per entry; takes no arguments. */
declare function discoverAllProviders(): Promise<ProviderDiscoveryResult[]>;
|
|
759
|
+
|
|
760
|
+
/** Result resolved by `answerBenchmarkQuestion`. */
interface BenchmarkAnswerResult {
    finalAnswer: string;
    recalledText: string;
    answeredText: string;
    latencyMs: number;
    /** Input/output token counts. */
    tokens: {
        input: number;
        output: number;
    };
    model?: string;
}
|
|
771
|
+
/**
 * Resolves with a `BenchmarkAnswerResult` for a question and recalled text.
 *
 * @param options.responder Optional responder; behaviour when omitted is not
 *   visible from this declaration.
 */
declare function answerBenchmarkQuestion(options: {
    question: string;
    recalledText: string;
    responder?: BenchResponder;
}): Promise<BenchmarkAnswerResult>;
|
|
776
|
+
|
|
777
|
+
/**
 * Sealed LLM-judge rubric loader, invocation, and score parser for the
 * Assistant bench tier.
 *
 * Sealing contract:
 *   1. The rubric prompt lives in the in-process registry
 *      (`sealed-prompts/index.ts`) and is never exposed to the
 *      system-under-test.
 *   2. The rubric text's SHA-256 digest is embedded into every run result so
 *      any change to the prompt is detectable by consumers of the bench feed.
 *   3. Rotations are additive — add a new registry entry and a matching
 *      `.md` mirror, do not edit old ones.
 */
declare const ASSISTANT_RUBRIC_DIMENSIONS: readonly ["identity_accuracy", "stance_coherence", "novelty", "calibration"];
/** Union of the entries of {@link ASSISTANT_RUBRIC_DIMENSIONS}. */
type AssistantRubricDimension = (typeof ASSISTANT_RUBRIC_DIMENSIONS)[number];
/** One numeric score per rubric dimension. */
type AssistantRubricScores = Record<AssistantRubricDimension, number>;
|
|
793
|
+
/** A rubric: its id, version, prompt text and the `sha256` stored with it. */
interface SealedRubric {
    id: string;
    version: string;
    prompt: string;
    sha256: string;
}
/** Per-task input handed to the sealed judge. */
interface SealedJudgeInput {
    taskId: string;
    scenario: string;
    memorySummary: string;
    assistantOutput: string;
}
|
|
805
|
+
/** Judge decision for one task, including the raw response and a `parseOk` flag. */
interface SealedJudgeDecision {
    taskId: string;
    rubricId: string;
    rubricSha256: string;
    scores: AssistantRubricScores;
    notes: string;
    rawResponse: string;
    parseOk: boolean;
}
|
|
814
|
+
/**
 * Rich structured-judge contract for the Assistant tier. Unlike
 * `BenchJudge.score()`, which returns a scalar, structured judges return the
 * raw JSON response text so we can parse the full multi-dimension rubric.
 */
interface StructuredJudge {
    /**
     * @param request System and user message text plus the rubric and task ids.
     * @returns The judge's response text.
     */
    evaluate(request: {
        system: string;
        user: string;
        rubricId: string;
        taskId: string;
    }): Promise<string>;
}
|
|
827
|
+
interface SpotCheckLogger {
|
|
828
|
+
log(decision: SealedJudgeDecision, context: SealedJudgeInput): void;
|
|
829
|
+
}
|
|
830
|
+
/**
|
|
831
|
+
* Load a sealed rubric prompt from the in-process registry by id.
|
|
832
|
+
*
|
|
833
|
+
* The returned object captures the canonical text and a SHA-256 digest which
|
|
834
|
+
* callers are expected to store in benchmark results so reviewers can verify
|
|
835
|
+
* the exact rubric text used for a given run.
|
|
836
|
+
*/
|
|
837
|
+
declare function loadSealedRubric(id?: string, options?: {
|
|
838
|
+
registry?: Readonly<Record<string, string>>;
|
|
839
|
+
}): SealedRubric;
|
|
840
|
+
/**
|
|
841
|
+
* Verify that a registered rubric still matches an expected digest. Useful in
|
|
842
|
+
* tests and in CI gates that want to catch accidental edits to sealed text.
|
|
843
|
+
*/
|
|
844
|
+
declare function verifyRubricDigest(expectedSha256: string, options?: {
|
|
845
|
+
id?: string;
|
|
846
|
+
registry?: Readonly<Record<string, string>>;
|
|
847
|
+
}): boolean;
|
|
848
|
+
/**
|
|
849
|
+
* Build the judge message payload for a single task. Keeps the rubric prompt
|
|
850
|
+
* on the system side of the conversation and the task-specific substitutions
|
|
851
|
+
* in a user message so the judge never leaks rubric text back into the SUT
|
|
852
|
+
* path.
|
|
853
|
+
*/
|
|
854
|
+
declare function buildJudgePayload(rubric: SealedRubric, input: SealedJudgeInput): {
|
|
855
|
+
system: string;
|
|
856
|
+
user: string;
|
|
857
|
+
};
|
|
858
|
+
/**
|
|
859
|
+
* Invoke a structured judge with the sealed rubric and parse the response.
|
|
860
|
+
*
|
|
861
|
+
* When `judge` is `undefined` we return a parse_error decision with all-zero
|
|
862
|
+
* scores so the caller can still complete the benchmark with a visible signal
|
|
863
|
+
* that the judge was missing.
|
|
864
|
+
*/
|
|
865
|
+
declare function runSealedJudge(judge: StructuredJudge | undefined, rubric: SealedRubric, input: SealedJudgeInput, options?: {
|
|
866
|
+
spotCheckLogger?: SpotCheckLogger;
|
|
867
|
+
}): Promise<SealedJudgeDecision>;
|
|
868
|
+
/**
|
|
869
|
+
* Parse a judge response string as rubric JSON. Exported for unit tests and
|
|
870
|
+
* for judge adapters that return the raw response directly.
|
|
871
|
+
*/
|
|
872
|
+
declare function parseRubricResponse(raw: string): {
|
|
873
|
+
scores: AssistantRubricScores;
|
|
874
|
+
notes: string;
|
|
875
|
+
ok: boolean;
|
|
876
|
+
};
|
|
877
|
+
/**
|
|
878
|
+
* Spot-check logger that appends selected judge decisions to a JSONL file.
|
|
879
|
+
* The caller controls the `runId` to keep logs grouped per-run.
|
|
880
|
+
*
|
|
881
|
+
* Logging is a diagnostic side effect, so any filesystem error (non-writable
|
|
882
|
+
* directory, path conflict with an existing file, mid-run ENOSPC, etc.) is
|
|
883
|
+
* caught and downgraded to a one-time `console.warn` rather than aborting
|
|
884
|
+
* the benchmark run. We also fail-safe the `mkdirSync` at construction
|
|
885
|
+
* time: if the directory cannot be created, we return a no-op logger so
|
|
886
|
+
* callers can still run the benchmark end-to-end.
|
|
887
|
+
*/
|
|
888
|
+
declare function createSpotCheckFileLogger(options: {
|
|
889
|
+
runId: string;
|
|
890
|
+
directory: string;
|
|
891
|
+
sampleRate?: number;
|
|
892
|
+
random?: () => number;
|
|
893
|
+
sampleSize?: number;
|
|
894
|
+
}): SpotCheckLogger;
|
|
895
|
+
/**
|
|
896
|
+
* Create a deterministic spot-check logger useful in tests: always picks the
|
|
897
|
+
* first `sampleSize` decisions regardless of random draw.
|
|
898
|
+
*/
|
|
899
|
+
declare function createDeterministicSpotCheckLogger(options: {
|
|
900
|
+
runId: string;
|
|
901
|
+
directory: string;
|
|
902
|
+
sampleSize?: number;
|
|
903
|
+
}): SpotCheckLogger;
|
|
904
|
+
/** Returns an `AssistantRubricScores` record. NOTE(review): presumably every rubric dimension is 0 (the "all-zero scores" used when the judge is missing) — confirm in the implementation. */
declare function zeroScores(): AssistantRubricScores;
|
|
905
|
+
declare function clampScore(value: number): number;
|
|
906
|
+
|
|
907
|
+
interface GatewayResponderOptions {
|
|
908
|
+
gatewayConfig?: GatewayConfig;
|
|
909
|
+
agentId?: string;
|
|
910
|
+
}
|
|
911
|
+
declare function createResponderFromProvider(provider: LlmProvider): BenchResponder;
|
|
912
|
+
declare function createProviderBackedResponder(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchResponder;
|
|
913
|
+
declare function createProviderBackedJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): BenchJudge;
|
|
914
|
+
declare function createStructuredJudgeFromProvider(provider: LlmProvider): StructuredJudge;
|
|
915
|
+
declare function createProviderBackedStructuredJudge(config: ProviderFactoryConfig, providerInstance?: LlmProvider): StructuredJudge;
|
|
916
|
+
declare function createGatewayResponder(options: GatewayResponderOptions): BenchResponder;
|
|
917
|
+
|
|
918
|
+
declare function createLiteLlmProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
|
|
919
|
+
|
|
920
|
+
declare function createOllamaProvider(config: OllamaProviderConfig): LlmProvider;
|
|
921
|
+
|
|
922
|
+
/**
|
|
923
|
+
* Minimal OpenAI-compatible provider for phase 1 bench execution.
|
|
924
|
+
*/
|
|
925
|
+
|
|
926
|
+
declare function createOpenAiCompatibleProvider(config: OpenAiCompatibleProviderConfig): LlmProvider;
|
|
927
|
+
|
|
928
|
+
type BenchModelSource = "plugin" | "gateway";
|
|
929
|
+
interface ResolveBenchRuntimeProfileOptions {
|
|
930
|
+
runtimeProfile?: BenchRuntimeProfile;
|
|
931
|
+
remnicConfigPath?: string;
|
|
932
|
+
openclawConfigPath?: string;
|
|
933
|
+
modelSource?: BenchModelSource;
|
|
934
|
+
gatewayAgentId?: string;
|
|
935
|
+
fastGatewayAgentId?: string;
|
|
936
|
+
systemProvider?: BuiltInProvider;
|
|
937
|
+
systemModel?: string;
|
|
938
|
+
systemBaseUrl?: string;
|
|
939
|
+
judgeProvider?: BuiltInProvider;
|
|
940
|
+
judgeModel?: string;
|
|
941
|
+
judgeBaseUrl?: string;
|
|
942
|
+
}
|
|
943
|
+
interface ResolvedBenchRuntimeProfile {
|
|
944
|
+
profile: BenchRuntimeProfile;
|
|
945
|
+
remnicConfig: Record<string, unknown>;
|
|
946
|
+
effectiveRemnicConfig: Record<string, unknown>;
|
|
947
|
+
adapterOptions: {
|
|
948
|
+
configOverrides: Record<string, unknown>;
|
|
949
|
+
preserveRuntimeDefaults?: boolean;
|
|
950
|
+
responder?: BenchResponder;
|
|
951
|
+
judge?: BenchJudge;
|
|
952
|
+
};
|
|
953
|
+
systemProvider: ProviderConfig | null;
|
|
954
|
+
judgeProvider: ProviderConfig | null;
|
|
955
|
+
}
|
|
956
|
+
declare function resolveBenchRuntimeProfile(options: ResolveBenchRuntimeProfileOptions): Promise<ResolvedBenchRuntimeProfile>;
|
|
957
|
+
|
|
958
|
+
/**
|
|
959
|
+
* Published benchmark registry for @remnic/bench phase 1.
|
|
960
|
+
*/
|
|
961
|
+
|
|
962
|
+
declare function listBenchmarks(): BenchmarkDefinition[];
|
|
963
|
+
declare function getBenchmark(id: string): BenchmarkDefinition | undefined;
|
|
964
|
+
|
|
965
|
+
/**
|
|
966
|
+
* Result enrichment and JSON writing helpers.
|
|
967
|
+
*/
|
|
968
|
+
|
|
969
|
+
declare function writeBenchmarkResult(result: BenchmarkResult, outputDir: string): Promise<string>;
|
|
970
|
+
|
|
971
|
+
/**
|
|
972
|
+
* Seed-sequence generation for benchmark runs.
|
|
973
|
+
*
|
|
974
|
+
* Factored out of `benchmark.ts` so individual runners can reuse it without
|
|
975
|
+
* triggering a circular import through `benchmark.ts -> registry.ts ->
|
|
976
|
+
* runner.ts -> benchmark.ts`.
|
|
977
|
+
*/
|
|
978
|
+
declare function buildBenchmarkRunSeeds(runCount: number, baseSeed?: number): number[];
|
|
979
|
+
|
|
980
|
+
/**
|
|
981
|
+
* Public benchmark execution helpers.
|
|
982
|
+
*/
|
|
983
|
+
|
|
984
|
+
declare function resolveBenchmarkRunCount(mode: BenchmarkMode, requestedIterations?: number): number;
|
|
985
|
+
|
|
986
|
+
declare function orchestrateBenchmarkRuns<T>(mode: BenchmarkMode, executeRun: (seed: number, runIndex: number) => Promise<T>, requestedIterations?: number, baseSeed?: number): Promise<{
|
|
987
|
+
runCount: number;
|
|
988
|
+
seeds: number[];
|
|
989
|
+
runs: T[];
|
|
990
|
+
}>;
|
|
991
|
+
declare function runBenchmark(benchmarkId: string, options: RunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
992
|
+
declare function loadBaseline(baselinePath?: string): SavedBaseline | undefined;
|
|
993
|
+
declare function saveBaseline(baselinePath: string, baseline: SavedBaseline): void;
|
|
994
|
+
declare function runExplain(service: EngramAccessService, query: string): Promise<ExplainResult>;
|
|
995
|
+
declare function runBenchSuite(service: EngramAccessService, config?: BenchConfig): Promise<BenchmarkSuiteResult>;
|
|
996
|
+
declare function checkRegression(metrics: Record<string, number>, baseline: SavedBaseline | undefined, tolerance: number): RegressionGateResult;
|
|
997
|
+
declare function generateReport(results: RecallMetrics[], reportPath?: string): BenchmarkReport;
|
|
998
|
+
|
|
999
|
+
/**
|
|
1000
|
+
* Shared scoring utilities for bench runners.
|
|
1001
|
+
*/
|
|
1002
|
+
|
|
1003
|
+
declare function exactMatch(predicted: string, expected: string | number | unknown): number;
|
|
1004
|
+
declare function f1Score(predicted: string, expected: string | number | unknown): number;
|
|
1005
|
+
declare function rougeL(predicted: string, expected: string | number | unknown): number;
|
|
1006
|
+
declare function recallAtK(retrieved: string[], relevant: string[], k: number): number;
|
|
1007
|
+
declare function precisionAtK(retrieved: string[], relevant: string[], k: number): number;
|
|
1008
|
+
declare function containsAnswer(predicted: string, expected: string | number | unknown): number;
|
|
1009
|
+
declare function llmJudgeScore(judge: {
|
|
1010
|
+
score(question: string, predicted: string, expected: string): Promise<number>;
|
|
1011
|
+
scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
|
|
1012
|
+
} | undefined, question: string, predicted: string, expected: string): Promise<number>;
|
|
1013
|
+
declare function llmJudgeScoreDetailed(judge: {
|
|
1014
|
+
score(question: string, predicted: string, expected: string): Promise<number>;
|
|
1015
|
+
scoreWithMetrics?(question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
|
|
1016
|
+
} | undefined, question: string, predicted: string, expected: string): Promise<BenchJudgeResult>;
|
|
1017
|
+
declare function timed<T>(fn: () => Promise<T>): Promise<{
|
|
1018
|
+
result: T;
|
|
1019
|
+
durationMs: number;
|
|
1020
|
+
}>;
|
|
1021
|
+
declare function aggregateTaskScores(metricsList: Array<Record<string, number>>): AggregateMetrics;
|
|
1022
|
+
|
|
1023
|
+
interface BootstrapOptions {
|
|
1024
|
+
iterations?: number;
|
|
1025
|
+
level?: number;
|
|
1026
|
+
random?: () => number;
|
|
1027
|
+
}
|
|
1028
|
+
declare function bootstrapMeanConfidenceInterval(values: number[], options?: BootstrapOptions): ConfidenceInterval;
|
|
1029
|
+
declare function pairedDeltaConfidenceInterval(candidateValues: number[], baselineValues: number[], options?: BootstrapOptions): ConfidenceInterval;
|
|
1030
|
+
|
|
1031
|
+
declare function cohensD(candidateValues: number[], baselineValues: number[]): number;
|
|
1032
|
+
declare function interpretEffectSize(cohensDValue: number): EffectSizeInterpretation;
|
|
1033
|
+
|
|
1034
|
+
declare function compareResults(baseline: BenchmarkResult, candidate: BenchmarkResult, threshold?: number, lowerIsBetter?: ReadonlySet<string>): ComparisonResult;
|
|
1035
|
+
declare function getBenchmarkLowerIsBetter(benchmarkId: string): ReadonlySet<string>;
|
|
1036
|
+
|
|
1037
|
+
/**
|
|
1038
|
+
* Dataset-contamination guard.
|
|
1039
|
+
*
|
|
1040
|
+
* Published benchmark results carry a `datasetHash` in `BenchmarkResult.meta`
|
|
1041
|
+
* so the publishing pipeline can reject results whose dataset hash is known
|
|
1042
|
+
* to appear in an LLM's training corpus. The contamination list starts empty
|
|
1043
|
+
* and is extended as new contamination reports arrive.
|
|
1044
|
+
*
|
|
1045
|
+
* Entries are SHA-256 hex digests. Callers pass a `ContaminationManifest`
|
|
1046
|
+
* rather than a bare array so provenance / justification can be attached
|
|
1047
|
+
* alongside the hash. This keeps the audit trail visible when a result is
|
|
1048
|
+
* rejected.
|
|
1049
|
+
*/
|
|
1050
|
+
interface ContaminationEntry {
|
|
1051
|
+
/** SHA-256 of the dataset payload as published. */
|
|
1052
|
+
datasetHash: string;
|
|
1053
|
+
/** Human-readable reason the dataset is considered contaminated. */
|
|
1054
|
+
reason: string;
|
|
1055
|
+
/** Optional citation / URL documenting the contamination report. */
|
|
1056
|
+
reference?: string;
|
|
1057
|
+
/** ISO-8601 timestamp when the entry was added. */
|
|
1058
|
+
addedAt: string;
|
|
1059
|
+
}
|
|
1060
|
+
interface ContaminationManifest {
|
|
1061
|
+
version: 1;
|
|
1062
|
+
entries: ContaminationEntry[];
|
|
1063
|
+
}
|
|
1064
|
+
interface ContaminationCheckResult {
|
|
1065
|
+
/** The dataset hash examined. */
|
|
1066
|
+
datasetHash: string;
|
|
1067
|
+
/** True when the dataset hash is NOT present on the contamination list. */
|
|
1068
|
+
clean: boolean;
|
|
1069
|
+
/** When `clean === false`, the matching manifest entry. */
|
|
1070
|
+
matched?: ContaminationEntry;
|
|
1071
|
+
}
|
|
1072
|
+
/**
|
|
1073
|
+
* Start with an empty list; upstream tooling populates this as contamination
|
|
1074
|
+
* reports surface. Keeping the default list empty avoids hard-coding public
|
|
1075
|
+
* values that could become stale.
|
|
1076
|
+
*/
|
|
1077
|
+
declare const EMPTY_CONTAMINATION_MANIFEST: ContaminationManifest;
|
|
1078
|
+
declare function isContaminationManifest(value: unknown): value is ContaminationManifest;
|
|
1079
|
+
declare function isContaminationEntry(value: unknown): value is ContaminationEntry;
|
|
1080
|
+
declare function checkDatasetContamination(datasetHash: string, manifest?: ContaminationManifest): ContaminationCheckResult;
|
|
1081
|
+
/**
|
|
1082
|
+
* Merge an additional contamination entry into an existing manifest. Duplicate
|
|
1083
|
+
* hashes are collapsed (first-write wins) so manifests can be safely merged
|
|
1084
|
+
* across sources without ballooning.
|
|
1085
|
+
*/
|
|
1086
|
+
declare function addContaminationEntry(manifest: ContaminationManifest, entry: ContaminationEntry): ContaminationManifest;
|
|
1087
|
+
declare function mergeContaminationManifests(...manifests: ContaminationManifest[]): ContaminationManifest;
|
|
1088
|
+
|
|
1089
|
+
interface StoredBenchmarkResultSummary {
|
|
1090
|
+
id: string;
|
|
1091
|
+
path: string;
|
|
1092
|
+
benchmark: string;
|
|
1093
|
+
timestamp: string;
|
|
1094
|
+
mode: BenchmarkMode;
|
|
1095
|
+
}
|
|
1096
|
+
interface StoredBenchmarkBaseline {
|
|
1097
|
+
name: string;
|
|
1098
|
+
savedAt: string;
|
|
1099
|
+
result: BenchmarkResult;
|
|
1100
|
+
source?: {
|
|
1101
|
+
id: string;
|
|
1102
|
+
path: string;
|
|
1103
|
+
};
|
|
1104
|
+
}
|
|
1105
|
+
interface StoredBenchmarkBaselineSummary {
|
|
1106
|
+
name: string;
|
|
1107
|
+
path: string;
|
|
1108
|
+
benchmark: string;
|
|
1109
|
+
timestamp: string;
|
|
1110
|
+
resultId: string;
|
|
1111
|
+
resultTimestamp: string;
|
|
1112
|
+
mode: BenchmarkMode;
|
|
1113
|
+
}
|
|
1114
|
+
type BenchmarkExportFormat = "json" | "csv" | "html";
|
|
1115
|
+
type BenchmarkPublishTarget = "remnic-ai";
|
|
1116
|
+
interface PublishedBenchmarkFeedEntry {
|
|
1117
|
+
benchmark: string;
|
|
1118
|
+
benchmarkTier: BenchmarkResult["meta"]["benchmarkTier"];
|
|
1119
|
+
resultId: string;
|
|
1120
|
+
timestamp: string;
|
|
1121
|
+
mode: BenchmarkMode;
|
|
1122
|
+
remnicVersion: string;
|
|
1123
|
+
gitSha: string;
|
|
1124
|
+
taskCount: number;
|
|
1125
|
+
aggregateMetrics: BenchmarkResult["results"]["aggregates"];
|
|
1126
|
+
cost: BenchmarkResult["cost"];
|
|
1127
|
+
environment: BenchmarkResult["environment"];
|
|
1128
|
+
integrity: {
|
|
1129
|
+
splitType: NonNullable<BenchmarkResult["meta"]["splitType"]>;
|
|
1130
|
+
qrelsSealedHash: string;
|
|
1131
|
+
judgePromptHash: string;
|
|
1132
|
+
datasetHash: string;
|
|
1133
|
+
canaryScore?: number;
|
|
1134
|
+
};
|
|
1135
|
+
}
|
|
1136
|
+
interface BuildBenchmarkPublishFeedOptions {
|
|
1137
|
+
/**
|
|
1138
|
+
* Contamination manifest applied to every candidate result. A result whose
|
|
1139
|
+
* `datasetHash` matches an entry is dropped from the published feed.
|
|
1140
|
+
* Defaults to the empty manifest.
|
|
1141
|
+
*/
|
|
1142
|
+
contaminationManifest?: ContaminationManifest;
|
|
1143
|
+
}
|
|
1144
|
+
interface PublishedBenchmarkFeed {
|
|
1145
|
+
target: BenchmarkPublishTarget;
|
|
1146
|
+
generatedAt: string;
|
|
1147
|
+
benchmarks: PublishedBenchmarkFeedEntry[];
|
|
1148
|
+
/**
|
|
1149
|
+
* Records for every candidate result that was considered but dropped from
|
|
1150
|
+
* this feed because of an integrity concern. Exposed so tooling can surface
|
|
1151
|
+
* the dropped runs without grep-ing logs.
|
|
1152
|
+
*/
|
|
1153
|
+
skipped?: PublishSkipRecord[];
|
|
1154
|
+
}
|
|
1155
|
+
declare function defaultBenchmarkBaselineDir(): string;
|
|
1156
|
+
declare function defaultBenchmarkPublishPath(target: BenchmarkPublishTarget): string;
|
|
1157
|
+
declare function loadBenchmarkResult(filePath: string): Promise<BenchmarkResult>;
|
|
1158
|
+
declare function listBenchmarkResults(outputDir: string): Promise<StoredBenchmarkResultSummary[]>;
|
|
1159
|
+
declare function saveBenchmarkBaseline(baselineDir: string, name: string, result: BenchmarkResult, source?: {
|
|
1160
|
+
id: string;
|
|
1161
|
+
path: string;
|
|
1162
|
+
}): Promise<string>;
|
|
1163
|
+
declare function loadBenchmarkBaseline(filePath: string): Promise<StoredBenchmarkBaseline>;
|
|
1164
|
+
declare function listBenchmarkBaselines(baselineDir: string): Promise<StoredBenchmarkBaselineSummary[]>;
|
|
1165
|
+
declare function resolveBenchmarkResultReference(outputDir: string, reference: string): Promise<StoredBenchmarkResultSummary | undefined>;
|
|
1166
|
+
declare function deleteBenchmarkResults(outputDir: string, references: string[]): Promise<{
|
|
1167
|
+
deleted: StoredBenchmarkResultSummary[];
|
|
1168
|
+
missing: string[];
|
|
1169
|
+
}>;
|
|
1170
|
+
/**
|
|
1171
|
+
* Throws if the result is missing any required integrity field. Called
|
|
1172
|
+
* explicitly by tooling (e.g. `remnic bench publish --strict`) that needs to
|
|
1173
|
+
* surface integrity gaps as errors rather than silently skipping the run.
|
|
1174
|
+
* The feed builder uses `isResultPublishable` below to filter non-fatal
|
|
1175
|
+
* conditions (public split, missing integrity) so a single bad result does
|
|
1176
|
+
* not block publishing older, valid holdout runs.
|
|
1177
|
+
*/
|
|
1178
|
+
declare function assertPublishableIntegrity(result: BenchmarkResult, target: BenchmarkPublishTarget): void;
|
|
1179
|
+
type PublishSkipReason = "missing-integrity" | "non-holdout-split" | "contaminated-dataset";
|
|
1180
|
+
interface PublishSkipRecord {
|
|
1181
|
+
resultId: string;
|
|
1182
|
+
path: string;
|
|
1183
|
+
reason: PublishSkipReason;
|
|
1184
|
+
detail: string;
|
|
1185
|
+
}
|
|
1186
|
+
declare function buildBenchmarkPublishFeed(outputDir: string, target: BenchmarkPublishTarget, options?: BuildBenchmarkPublishFeedOptions): Promise<PublishedBenchmarkFeed>;
|
|
1187
|
+
declare function writeBenchmarkPublishFeed(feed: PublishedBenchmarkFeed, outputPath: string): Promise<string>;
|
|
1188
|
+
declare function renderBenchmarkResultExport(result: BenchmarkResult, format: BenchmarkExportFormat): string;
|
|
1189
|
+
|
|
1190
|
+
/**
|
|
1191
|
+
* Hash verification utilities used by the benchmark integrity pipeline.
|
|
1192
|
+
*
|
|
1193
|
+
* These helpers produce deterministic SHA-256 digests for sealed artifacts:
|
|
1194
|
+
* qrels payloads, judge prompts, dataset files, and encrypted seals. They are
|
|
1195
|
+
* intentionally simple and rely only on Node's built-in crypto module so the
|
|
1196
|
+
* bench package can verify seals without additional dependencies.
|
|
1197
|
+
*
|
|
1198
|
+
* Rules of the road:
|
|
1199
|
+
* - Hashes are lowercase hex strings. Always compare with `safeHexEqual` (constant-time, and length-safe where raw `timingSafeEqual` would throw).
|
|
1200
|
+
* - Structured inputs are serialized with sorted keys so equivalent objects
|
|
1201
|
+
* produce identical digests. This aligns with CLAUDE.md gotcha #38.
|
|
1202
|
+
* - The AES-GCM seal helpers use 256-bit keys and 96-bit IVs; they are a
|
|
1203
|
+
* thin interface so CI + tests can exercise the flow without reaching for
|
|
1204
|
+
* a KMS. Production deployments should wire a real key-management backend.
|
|
1205
|
+
*/
|
|
1206
|
+
declare const INTEGRITY_HASH_ALGORITHM: "sha256";
|
|
1207
|
+
declare const INTEGRITY_CIPHER_ALGORITHM: "aes-256-gcm";
|
|
1208
|
+
interface SealedArtifact {
|
|
1209
|
+
/** Version marker for the seal envelope. */
|
|
1210
|
+
version: 1;
|
|
1211
|
+
/** Symmetric cipher identifier. */
|
|
1212
|
+
algorithm: typeof INTEGRITY_CIPHER_ALGORITHM;
|
|
1213
|
+
/** Base64-encoded 96-bit IV. */
|
|
1214
|
+
iv: string;
|
|
1215
|
+
/** Base64-encoded 128-bit auth tag. */
|
|
1216
|
+
tag: string;
|
|
1217
|
+
/** Base64-encoded ciphertext. */
|
|
1218
|
+
ciphertext: string;
|
|
1219
|
+
/**
|
|
1220
|
+
* SHA-256 of the plaintext payload. Verified after decryption as a
|
|
1221
|
+
* defence-in-depth check against silent key rotation or ciphertext drift.
|
|
1222
|
+
*/
|
|
1223
|
+
plaintextHash: string;
|
|
1224
|
+
}
|
|
1225
|
+
/** Returns the SHA-256 digest of `value` as a lowercase hex string. */
declare function hashString(value: string): string;
|
|
1226
|
+
/** Returns the SHA-256 digest of the raw bytes in `value` as a lowercase hex string. */
declare function hashBytes(value: Uint8Array): string;
|
|
1227
|
+
/**
|
|
1228
|
+
* Canonicalize a JSON-serializable value so equivalent payloads produce the
|
|
1229
|
+
* same digest regardless of key insertion order.
|
|
1230
|
+
*/
|
|
1231
|
+
declare function canonicalJsonStringify(value: unknown): string;
|
|
1232
|
+
declare function hashCanonicalJson(value: unknown): string;
|
|
1233
|
+
declare function isSha256Hex(value: unknown): value is string;
|
|
1234
|
+
declare function assertSha256Hex(value: unknown, label: string): string;
|
|
1235
|
+
/**
|
|
1236
|
+
* Constant-time equality check for hex digests. Returns `false` when inputs
|
|
1237
|
+
* differ in length — `timingSafeEqual` would otherwise throw.
|
|
1238
|
+
*/
|
|
1239
|
+
declare function safeHexEqual(expected: string, actual: string): boolean;
|
|
1240
|
+
/**
|
|
1241
|
+
* Encrypt a plaintext payload with AES-256-GCM, returning a seal envelope.
|
|
1242
|
+
* The caller owns the key. A 96-bit IV is drawn from `crypto.randomBytes`
|
|
1243
|
+
* for each call — never reuse an IV with the same key.
|
|
1244
|
+
*/
|
|
1245
|
+
declare function sealPayload(plaintext: string, key: Buffer): SealedArtifact;
|
|
1246
|
+
/**
 * Counterpart to `sealPayload`: takes a seal envelope and the key, and returns a string.
 * NOTE(review): presumably the decrypted plaintext, checked against `plaintextHash`
 * and throwing on a wrong key or a tampered envelope — confirm in the implementation.
 */
declare function openSeal(seal: SealedArtifact, key: Buffer): string;
|
|
1247
|
+
/**
|
|
1248
|
+
* Load a 32-byte AES key from an environment variable. The variable must
|
|
1249
|
+
* contain a base64-encoded 256-bit key. Returns `null` when unset so callers
|
|
1250
|
+
* can degrade gracefully in environments without a key-management backend.
|
|
1251
|
+
*
|
|
1252
|
+
* The input is validated against a strict base64 pattern before decoding
|
|
1253
|
+
* because Node's `Buffer.from(x, "base64")` silently ignores non-base64
|
|
1254
|
+
* characters and never throws — accepting a malformed key would surface
|
|
1255
|
+
* only later as an opaque decryption or hash mismatch error.
|
|
1256
|
+
*/
|
|
1257
|
+
declare function loadSealKeyFromEnv(envName: string): Buffer | null;
|
|
1258
|
+
|
|
1259
|
+
/**
|
|
1260
|
+
* Sealed qrels loader.
|
|
1261
|
+
*
|
|
1262
|
+
* The threat model (see `docs/bench/integrity.md`) is that the runner-side
|
|
1263
|
+
* adapter never sees ground-truth answers: they live only inside the judge /
|
|
1264
|
+
* scorer process. This module enforces that boundary by:
|
|
1265
|
+
*
|
|
1266
|
+
* 1. Loading a sealed qrels artifact from disk.
|
|
1267
|
+
* 2. Verifying its declared SHA-256 hash against the expected value pinned
|
|
1268
|
+
* in the benchmark's metadata (the "seal hash").
|
|
1269
|
+
* 3. Decrypting the payload only when a caller provides the correct seal
|
|
1270
|
+
* key. Callers that only need the seal hash (e.g. the runner emitting
|
|
1271
|
+
* `BenchmarkResult.meta.qrelsSealedHash`) never receive plaintext.
|
|
1272
|
+
*
|
|
1273
|
+
* The artifact format is JSON:
|
|
1274
|
+
*
|
|
1275
|
+
* ```json
|
|
1276
|
+
* {
|
|
1277
|
+
* "benchmark": "<benchmark-id>",
|
|
1278
|
+
* "version": 1,
|
|
1279
|
+
* "sealHash": "<sha256-of-envelope-without-sealHash>",
|
|
1280
|
+
* "envelope": { SealedArtifact }
|
|
1281
|
+
* }
|
|
1282
|
+
* ```
|
|
1283
|
+
*
|
|
1284
|
+
* `sealHash` is computed over the canonical JSON of `envelope`, so it pins the
|
|
1285
|
+
* exact sealed envelope (IV, tag, ciphertext, plaintext hash). An envelope from
|
|
1286
|
+
* `sealPayload` carries a fresh random IV, so re-sealing identical plaintext yields a different `sealHash`.
|
|
1287
|
+
*/
|
|
1288
|
+
|
|
1289
|
+
interface SealedQrelsArtifact {
|
|
1290
|
+
benchmark: string;
|
|
1291
|
+
version: 1;
|
|
1292
|
+
sealHash: string;
|
|
1293
|
+
envelope: SealedArtifact;
|
|
1294
|
+
}
|
|
1295
|
+
interface SealedQrelsHandle {
|
|
1296
|
+
benchmark: string;
|
|
1297
|
+
sealHash: string;
|
|
1298
|
+
/**
|
|
1299
|
+
* Returns the decrypted qrels payload, typed `unknown`. Callers must pass the
|
|
1300
|
+
* seal key explicitly; the handle never caches plaintext.
|
|
1301
|
+
*/
|
|
1302
|
+
unseal(key: Buffer): unknown;
|
|
1303
|
+
}
|
|
1304
|
+
interface LoadSealedQrelsOptions {
|
|
1305
|
+
/**
|
|
1306
|
+
* Expected seal hash pinned at benchmark registration. If provided the
|
|
1307
|
+
* loader rejects the artifact when the computed hash does not match.
|
|
1308
|
+
*/
|
|
1309
|
+
expectedSealHash?: string;
|
|
1310
|
+
/**
|
|
1311
|
+
* Benchmark ID the artifact must declare. When omitted, any benchmark ID
|
|
1312
|
+
* is accepted so tooling can inspect unknown artifacts.
|
|
1313
|
+
*/
|
|
1314
|
+
expectedBenchmarkId?: string;
|
|
1315
|
+
}
|
|
1316
|
+
declare function isSealedQrelsArtifact(value: unknown): value is SealedQrelsArtifact;
|
|
1317
|
+
declare function computeSealHash(envelope: SealedArtifact): string;
|
|
1318
|
+
declare function parseSealedQrels(raw: string, options?: LoadSealedQrelsOptions): SealedQrelsHandle;
|
|
1319
|
+
declare function loadSealedQrels(filePath: string, options?: LoadSealedQrelsOptions): Promise<SealedQrelsHandle>;
|
|
1320
|
+
/**
|
|
1321
|
+
* Serialize a sealed qrels artifact to the canonical on-disk shape. Useful
|
|
1322
|
+
* for tooling that authors new qrels files.
|
|
1323
|
+
*/
|
|
1324
|
+
declare function serializeSealedQrels(artifact: SealedQrelsArtifact): string;
|
|
1325
|
+
|
|
1326
|
+
/**
|
|
1327
|
+
* Canary adapter for exploit-detection runs.
|
|
1328
|
+
*
|
|
1329
|
+
* A canary adapter never actually solves a benchmark task. It returns a
|
|
1330
|
+
* deterministic, deliberately-wrong response to every query so that the
|
|
1331
|
+
* exploit-audit workflow can measure how much score a benchmark assigns to
|
|
1332
|
+
* a do-nothing system. If the canary scores above the configured floor
|
|
1333
|
+
* (default `0.1`) on any benchmark, the benchmark is flagged as exploitable
|
|
1334
|
+
* and demoted until fixed.
|
|
1335
|
+
*
|
|
1336
|
+
* This adapter must never be used in production bench runs; it exists only
|
|
1337
|
+
* for the `bench-exploit-audit` CI workflow.
|
|
1338
|
+
*/
|
|
1339
|
+
|
|
1340
|
+
/** The fixed reply the canary returns for every `recall`. */
|
|
1341
|
+
declare const CANARY_FIXED_RECALL = "__remnic_canary_response__";
|
|
1342
|
+
/** The score floor the canary must NOT exceed for any benchmark. */
|
|
1343
|
+
declare const CANARY_SCORE_FLOOR = 0.1;
|
|
1344
|
+
interface CanaryAdapterOptions {
|
|
1345
|
+
/**
|
|
1346
|
+
* Override the response string used by `recall`. Useful for running two
|
|
1347
|
+
* canary variants side-by-side (e.g. empty string vs fixed string).
|
|
1348
|
+
*/
|
|
1349
|
+
response?: string;
|
|
1350
|
+
/**
|
|
1351
|
+
* If true, `search` returns an empty array instead of a single fake hit.
|
|
1352
|
+
* Some benchmarks rely on the retrieval surface; keeping the default
|
|
1353
|
+
* "one fake hit" covers retrieval-style scorers too.
|
|
1354
|
+
*/
|
|
1355
|
+
emptySearch?: boolean;
|
|
1356
|
+
}
|
|
1357
|
+
declare function createCanaryAdapter(options?: CanaryAdapterOptions): BenchMemoryAdapter;
|
|
1358
|
+
interface CanaryFloorCheck {
|
|
1359
|
+
benchmark: string;
|
|
1360
|
+
score: number;
|
|
1361
|
+
floor: number;
|
|
1362
|
+
passed: boolean;
|
|
1363
|
+
}
|
|
1364
|
+
/**
|
|
1365
|
+
* Compare a canary score against the configured floor. Returns a structured
|
|
1366
|
+
* result rather than throwing so callers can aggregate failures across an
|
|
1367
|
+
* entire benchmark suite before reporting.
|
|
1368
|
+
*/
|
|
1369
|
+
declare function assertCanaryUnderFloor(benchmark: string, score: number, floor?: number): CanaryFloorCheck;
|
|
1370
|
+
|
|
1371
|
+
/**
|
|
1372
|
+
* Randomization helpers for the integrity pipeline.
|
|
1373
|
+
*
|
|
1374
|
+
* These helpers remove position-in-prompt and fixture-layout exploits:
|
|
1375
|
+
* - `shuffleTasks` randomizes task order per run so a memorized task position
|
|
1376
|
+
* cannot be exploited.
|
|
1377
|
+
* - `rotateDistractors` rotates multiple-choice answer positions and the set
|
|
1378
|
+
* of distractors so answer-position memorization is defeated.
|
|
1379
|
+
* - `selectFixtureVariant` picks a variant by seed so each run exercises a
|
|
1380
|
+
* different fixture graph layout.
|
|
1381
|
+
*
|
|
1382
|
+
* All helpers are seeded. A seeded mulberry32 PRNG gives deterministic,
|
|
1383
|
+
* reproducible shuffles that do not rely on `Math.random`.
|
|
1384
|
+
*/
|
|
1385
|
+
interface SeededRng {
|
|
1386
|
+
/** Returns a pseudo-random number in `[0, 1)`. */
|
|
1387
|
+
next(): number;
|
|
1388
|
+
}
|
|
1389
|
+
/**
|
|
1390
|
+
* Deterministic 32-bit PRNG. Mulberry32 is small, fast, and sufficient for
|
|
1391
|
+
* shuffling benchmark tasks. Do NOT use for cryptographic operations.
|
|
1392
|
+
*/
|
|
1393
|
+
declare function createSeededRng(seed: number): SeededRng;
|
|
1394
|
+
/**
|
|
1395
|
+
* Fisher-Yates shuffle using a seeded PRNG. Returns a new array.
|
|
1396
|
+
*/
|
|
1397
|
+
declare function shuffleTasks<T>(tasks: readonly T[], seed: number): T[];
|
|
1398
|
+
interface MultipleChoiceQuestion<T> {
|
|
1399
|
+
/** Correct answer. Need not appear in `distractors`; `rotateDistractors` builds the pool as `[correct, ...distractors]`. */
|
|
1400
|
+
correct: T;
|
|
1401
|
+
/** Distractor pool. The correct answer may or may not be present. */
|
|
1402
|
+
distractors: readonly T[];
|
|
1403
|
+
}
|
|
1404
|
+
interface RotatedChoices<T> {
|
|
1405
|
+
/** The choices in rotated order. */
|
|
1406
|
+
choices: T[];
|
|
1407
|
+
/** The index of the correct answer in `choices`. */
|
|
1408
|
+
correctIndex: number;
|
|
1409
|
+
}
|
|
1410
|
+
/**
|
|
1411
|
+
* Rotate the distractor set and answer position for a multiple-choice
|
|
1412
|
+
* question. The full choice pool is `[correct, ...distractors]` with
|
|
1413
|
+
* duplicates removed; the pool is shuffled and the correct-answer index is
|
|
1414
|
+
* reported back to the caller. Callers re-score against `correctIndex`.
|
|
1415
|
+
*/
|
|
1416
|
+
declare function rotateDistractors<T>(question: MultipleChoiceQuestion<T>, seed: number): RotatedChoices<T>;
|
|
1417
|
+
interface FixtureVariant<T> {
|
|
1418
|
+
id: string;
|
|
1419
|
+
value: T;
|
|
1420
|
+
}
|
|
1421
|
+
/**
|
|
1422
|
+
* Pick one fixture variant by seed. Stable: the same seed always returns the
|
|
1423
|
+
* same variant index for a given variant list length.
|
|
1424
|
+
*/
|
|
1425
|
+
declare function selectFixtureVariant<T>(variants: readonly FixtureVariant<T>[], seed: number): FixtureVariant<T>;
|
|
1426
|
+
|
|
1427
|
+
/**
|
|
1428
|
+
* YAML custom benchmark loader.
|
|
1429
|
+
*/
|
|
1430
|
+
|
|
1431
|
+
declare function parseCustomBenchmark(source: string): CustomBenchmarkSpec;
|
|
1432
|
+
declare function loadCustomBenchmarkFile(filePath: string): Promise<CustomBenchmarkSpec>;
|
|
1433
|
+
|
|
1434
|
+
/**
|
|
1435
|
+
* Custom benchmark runner.
|
|
1436
|
+
*/
|
|
1437
|
+
|
|
1438
|
+
declare function runCustomBenchmarkFile(filePath: string, options: RunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
1439
|
+
|
|
1440
|
+
type SchemaTierName = "clean" | "dirty";
|
|
1441
|
+
interface SchemaTierPageFrontmatter {
|
|
1442
|
+
title?: string;
|
|
1443
|
+
type?: string;
|
|
1444
|
+
state?: string;
|
|
1445
|
+
created?: string;
|
|
1446
|
+
seeAlso?: string[];
|
|
1447
|
+
timeline?: string[];
|
|
1448
|
+
}
|
|
1449
|
+
interface SchemaTierPage {
|
|
1450
|
+
id: string;
|
|
1451
|
+
owner: string;
|
|
1452
|
+
namespace: string;
|
|
1453
|
+
canonicalTitle: string;
|
|
1454
|
+
title: string;
|
|
1455
|
+
type: string;
|
|
1456
|
+
createdAt: string;
|
|
1457
|
+
aliases: string[];
|
|
1458
|
+
body: string;
|
|
1459
|
+
frontmatter: SchemaTierPageFrontmatter;
|
|
1460
|
+
seeAlso: string[];
|
|
1461
|
+
timeline: string[];
|
|
1462
|
+
dirtySignals: string[];
|
|
1463
|
+
}
|
|
1464
|
+
interface PersonalizationRetrievalCase {
|
|
1465
|
+
id: string;
|
|
1466
|
+
query: string;
|
|
1467
|
+
expectedPageIds: string[];
|
|
1468
|
+
expectedNamespace: string;
|
|
1469
|
+
expectedOwner: string;
|
|
1470
|
+
}
|
|
1471
|
+
interface TemporalRetrievalCase {
|
|
1472
|
+
id: string;
|
|
1473
|
+
query: string;
|
|
1474
|
+
window: {
|
|
1475
|
+
start: string;
|
|
1476
|
+
end: string;
|
|
1477
|
+
};
|
|
1478
|
+
expectedPageIds: string[];
|
|
1479
|
+
}
|
|
1480
|
+
interface AbstentionRetrievalCase {
|
|
1481
|
+
id: string;
|
|
1482
|
+
query: string;
|
|
1483
|
+
reason: "missing_fact" | "cross_tenant" | "hallucination_bait";
|
|
1484
|
+
}
|
|
1485
|
+
interface SchemaTierCorpus {
|
|
1486
|
+
pages: SchemaTierPage[];
|
|
1487
|
+
}
|
|
1488
|
+
interface SchemaTierFixture {
|
|
1489
|
+
seed: number;
|
|
1490
|
+
clean: SchemaTierCorpus;
|
|
1491
|
+
dirty: SchemaTierCorpus;
|
|
1492
|
+
personalizationCases: PersonalizationRetrievalCase[];
|
|
1493
|
+
temporalCases: TemporalRetrievalCase[];
|
|
1494
|
+
abstentionCases: AbstentionRetrievalCase[];
|
|
1495
|
+
}
|
|
1496
|
+
declare function buildSchemaTierFixture(seed?: number): SchemaTierFixture;
|
|
1497
|
+
declare function buildSchemaTierSmokeFixture(seed?: number): SchemaTierFixture;
|
|
1498
|
+
declare const SCHEMA_TIER_FIXTURE: SchemaTierFixture;
|
|
1499
|
+
declare const SCHEMA_TIER_SMOKE_FIXTURE: SchemaTierFixture;
|
|
1500
|
+
|
|
1501
|
+
/**
|
|
1502
|
+
* Scoring utilities for ingestion benchmarks.
|
|
1503
|
+
*/
|
|
1504
|
+
|
|
1505
|
+
declare function matchEntity(extracted: ExtractedEntity, gold: GoldEntity): boolean;
|
|
1506
|
+
declare function entityRecall(extracted: ExtractedEntity[], gold: GoldEntity[]): {
|
|
1507
|
+
overall: number;
|
|
1508
|
+
byType: Record<string, number>;
|
|
1509
|
+
};
|
|
1510
|
+
declare function linkMatches(extracted: ExtractedLink, gold: GoldLink): boolean;
|
|
1511
|
+
declare function backlinkF1(extracted: ExtractedLink[], gold: GoldLink[]): {
|
|
1512
|
+
precision: number;
|
|
1513
|
+
recall: number;
|
|
1514
|
+
f1: number;
|
|
1515
|
+
};
|
|
1516
|
+
declare function schemaCompleteness(pages: ExtractedPage[], goldPages: GoldPage[], requiredFields: readonly string[]): {
|
|
1517
|
+
overall: number;
|
|
1518
|
+
fieldCoverage: Record<string, number>;
|
|
1519
|
+
};
|
|
1520
|
+
|
|
1521
|
+
/**
|
|
1522
|
+
* Synthetic email fixture generator.
|
|
1523
|
+
*
|
|
1524
|
+
* Produces a well-formed mbox file (~10-12 messages across 5 threads) covering
|
|
1525
|
+
* the entities defined in email-gold.ts. All data is entirely synthetic — no
|
|
1526
|
+
* real PII is present.
|
|
1527
|
+
*/
|
|
1528
|
+
|
|
1529
|
+
declare const emailFixture: FixtureGenerator;
|
|
1530
|
+
|
|
1531
|
+
/**
|
|
1532
|
+
* Synthetic project-folder fixture for ingestion benchmarks.
|
|
1533
|
+
*
|
|
1534
|
+
* Generates a nested directory of markdown, JSON, and text files simulating
|
|
1535
|
+
* a project workspace. All names, organisations, and content are entirely fictional.
|
|
1536
|
+
*/
|
|
1537
|
+
|
|
1538
|
+
declare const projectFolderFixture: FixtureGenerator;
|
|
1539
|
+
|
|
1540
|
+
/**
|
|
1541
|
+
* Synthetic calendar ICS fixture for ingestion benchmarks.
|
|
1542
|
+
*
|
|
1543
|
+
* Generates a VCALENDAR file with recurring and one-off events.
|
|
1544
|
+
* All names, organisations, and content are entirely fictional.
|
|
1545
|
+
*/
|
|
1546
|
+
|
|
1547
|
+
declare const calendarFixture: FixtureGenerator;
|
|
1548
|
+
|
|
1549
|
+
/**
|
|
1550
|
+
* Synthetic chat transcript fixture for ingestion benchmarks.
|
|
1551
|
+
*
|
|
1552
|
+
* Generates a Slack-style JSON transcript across three channels and one DM.
|
|
1553
|
+
* All names, organisations, and content are entirely fictional.
|
|
1554
|
+
*/
|
|
1555
|
+
|
|
1556
|
+
declare const chatFixture: FixtureGenerator;
|
|
1557
|
+
|
|
1558
|
+
/**
|
|
1559
|
+
* Sealed rubric prompt registry.
|
|
1560
|
+
*
|
|
1561
|
+
* The canonical form of each sealed rubric prompt is a frozen string literal
|
|
1562
|
+
* in this registry. The matching `.md` file in this directory is a
|
|
1563
|
+
* human-readable mirror kept for reviewers — the `.md` is never loaded at
|
|
1564
|
+
* runtime. This keeps bundling trivial (no filesystem assets) while still
|
|
1565
|
+
* letting reviewers audit rubric text as prose.
|
|
1566
|
+
*
|
|
1567
|
+
* Rotation policy:
|
|
1568
|
+
* - Never edit an existing entry in place.
|
|
1569
|
+
* - Add a new key (`assistant-rubric-v2`, etc.) and ship a matching `.md`.
|
|
1570
|
+
* - Keep the old entry available so historical benchmark results remain
|
|
1571
|
+
* reproducible.
|
|
1572
|
+
*/
|
|
1573
|
+
declare const SEALED_PROMPT_REGISTRY: Readonly<Record<string, string>>;
|
|
1574
|
+
declare const DEFAULT_ASSISTANT_RUBRIC_ID = "assistant-rubric-v1";
|
|
1575
|
+
|
|
1576
|
+
/**
|
|
1577
|
+
* Shared types for the Assistant bench tier.
|
|
1578
|
+
*
|
|
1579
|
+
* Every Assistant benchmark shares the same shape:
|
|
1580
|
+
* - A synthetic memory graph (facts, stances, entities) the agent may read.
|
|
1581
|
+
* - A scenario prompt given to the agent.
|
|
1582
|
+
* - A sealed-rubric judge pass that scores the agent's output along
|
|
1583
|
+
* identity_accuracy / stance_coherence / novelty / calibration.
|
|
1584
|
+
*
|
|
1585
|
+
* The goal is reviewability: each benchmark folder ships a small fixture.ts
|
|
1586
|
+
* that returns `AssistantScenario` values, and the runner wires the shared
|
|
1587
|
+
* multi-run + bootstrap-CI infrastructure around them.
|
|
1588
|
+
*/
|
|
1589
|
+
|
|
1590
|
+
interface AssistantMemoryFact {
|
|
1591
|
+
id: string;
|
|
1592
|
+
summary: string;
|
|
1593
|
+
/**
|
|
1594
|
+
* Free-form tags (topic, entity) used to render the memory-graph summary
|
|
1595
|
+
* that is handed to the judge. Not shown to the agent.
|
|
1596
|
+
*/
|
|
1597
|
+
tags?: string[];
|
|
1598
|
+
}
|
|
1599
|
+
interface AssistantStance {
|
|
1600
|
+
topic: string;
|
|
1601
|
+
position: string;
|
|
1602
|
+
}
|
|
1603
|
+
interface AssistantMemoryGraph {
|
|
1604
|
+
userHandle: string;
|
|
1605
|
+
userRole: string;
|
|
1606
|
+
facts: AssistantMemoryFact[];
|
|
1607
|
+
stances: AssistantStance[];
|
|
1608
|
+
openThreads: string[];
|
|
1609
|
+
}
|
|
1610
|
+
interface AssistantScenario {
|
|
1611
|
+
id: string;
|
|
1612
|
+
title: string;
|
|
1613
|
+
scenarioPrompt: string;
|
|
1614
|
+
memoryGraph: AssistantMemoryGraph;
|
|
1615
|
+
/**
|
|
1616
|
+
* Small label describing what the scenario is meant to exercise. Useful in
|
|
1617
|
+
* dashboards for filtering. Never exposed to the agent.
|
|
1618
|
+
*/
|
|
1619
|
+
focus: string;
|
|
1620
|
+
}
|
|
1621
|
+
/**
|
|
1622
|
+
* Minimal agent contract for the Assistant tier. The agent receives the
|
|
1623
|
+
* scenario prompt plus a pre-rendered memory view (analogous to what the
|
|
1624
|
+
* Remnic recall stack would hand to a downstream chat model), and returns
|
|
1625
|
+
* its final answer text.
|
|
1626
|
+
*/
|
|
1627
|
+
interface AssistantAgent {
|
|
1628
|
+
respond(request: {
|
|
1629
|
+
scenarioId: string;
|
|
1630
|
+
prompt: string;
|
|
1631
|
+
memoryView: string;
|
|
1632
|
+
}): Promise<string>;
|
|
1633
|
+
}
|
|
1634
|
+
interface AssistantRunnerOptions {
|
|
1635
|
+
agent: AssistantAgent;
|
|
1636
|
+
judge: StructuredJudge | undefined;
|
|
1637
|
+
rubricId?: string;
|
|
1638
|
+
/**
|
|
1639
|
+
* Directory where per-run spot-check JSONL files are appended. Defaults to
|
|
1640
|
+
* `<cwd>/benchmarks/results/spot-checks`.
|
|
1641
|
+
*/
|
|
1642
|
+
spotCheckDir?: string;
|
|
1643
|
+
/**
|
|
1644
|
+
* Seed array for deterministic multi-run scheduling. When omitted the
|
|
1645
|
+
* benchmark runner picks a fresh seed array via `buildBenchmarkRunSeeds`.
|
|
1646
|
+
*/
|
|
1647
|
+
seeds?: number[];
|
|
1648
|
+
/**
|
|
1649
|
+
* Override used by tests and CLI smoke runs to cap iterations. Must be
|
|
1650
|
+
* `>= 1`. The production contract is `>= 5` per the issue spec.
|
|
1651
|
+
*/
|
|
1652
|
+
runCount?: number;
|
|
1653
|
+
/**
|
|
1654
|
+
* Random-number factory for bootstrap sampling. Injected in tests.
|
|
1655
|
+
*/
|
|
1656
|
+
random?: () => number;
|
|
1657
|
+
}
|
|
1658
|
+
|
|
1659
|
+
/**
|
|
1660
|
+
* Shared runner scaffolding for the Assistant bench tier.
|
|
1661
|
+
*
|
|
1662
|
+
* Builds the `BenchmarkResult` shape used by the existing dashboard, but with
|
|
1663
|
+
* per-dimension rubric scores (identity_accuracy, stance_coherence, novelty,
|
|
1664
|
+
* calibration) and bootstrap 95% confidence intervals attached. Each
|
|
1665
|
+
* scenario is executed `runCount` times (default 5) and the per-run means
|
|
1666
|
+
* feed the bootstrap so the dashboard can render error bars.
|
|
1667
|
+
*/
|
|
1668
|
+
|
|
1669
|
+
declare function runAssistantBenchmark(definition: BenchmarkDefinition, scenarios: AssistantScenario[], resolved: ResolvedRunBenchmarkOptions, runnerOptions: AssistantRunnerOptions): Promise<BenchmarkResult>;
|
|
1670
|
+
declare function renderMemoryViewForAgent(graph: AssistantMemoryGraph): string;
|
|
1671
|
+
declare function renderMemorySummaryForJudge(graph: AssistantMemoryGraph): string;
|
|
1672
|
+
|
|
1673
|
+
/**
|
|
1674
|
+
* Default assistant agent + judge wiring for the Assistant bench tier.
|
|
1675
|
+
*
|
|
1676
|
+
* The assistant tier is designed to be driven by a real provider-backed agent
|
|
1677
|
+
* and a provider-backed structured judge, but we must also run deterministic
|
|
1678
|
+
* smoke tests under `--test` and in CI without network access.
|
|
1679
|
+
*
|
|
1680
|
+
* This module provides:
|
|
1681
|
+
* - `resolveAssistantAgent()` — returns an `AssistantAgent` built from the
|
|
1682
|
+
* injected `resolved.remnicConfig.assistantAgent` hook if present, else
|
|
1683
|
+
* falls back to a deterministic agent that stringifies the memory view.
|
|
1684
|
+
* - `resolveStructuredJudge()` — the equivalent resolver for the structured judge; it may return `undefined`.
|
|
1685
|
+
*
|
|
1686
|
+
* Injection happens through `remnicConfig` because that field is already the
|
|
1687
|
+
* benchmark framework's pass-through channel for runner-specific config. The
|
|
1688
|
+
* CLI will set it; tests set it directly on the options record.
|
|
1689
|
+
*/
|
|
1690
|
+
|
|
1691
|
+
declare const ASSISTANT_AGENT_CONFIG_KEY = "assistantAgent";
|
|
1692
|
+
declare const ASSISTANT_JUDGE_CONFIG_KEY = "assistantJudge";
|
|
1693
|
+
declare const ASSISTANT_SEEDS_CONFIG_KEY = "assistantSeeds";
|
|
1694
|
+
declare const ASSISTANT_SPOT_CHECK_DIR_KEY = "assistantSpotCheckDir";
|
|
1695
|
+
declare const ASSISTANT_RUBRIC_ID_KEY = "assistantRubricId";
|
|
1696
|
+
declare function resolveAssistantAgent(resolved: ResolvedRunBenchmarkOptions): AssistantAgent;
|
|
1697
|
+
declare function resolveStructuredJudge(resolved: ResolvedRunBenchmarkOptions): StructuredJudge | undefined;
|
|
1698
|
+
declare function resolveAssistantSeeds(resolved: ResolvedRunBenchmarkOptions): number[] | undefined;
|
|
1699
|
+
declare function resolveAssistantSpotCheckDir(resolved: ResolvedRunBenchmarkOptions): string | undefined;
|
|
1700
|
+
declare function resolveAssistantRubricId(resolved: ResolvedRunBenchmarkOptions): string | undefined;
|
|
1701
|
+
|
|
1702
|
+
declare const ASSISTANT_MORNING_BRIEF_SCENARIOS: AssistantScenario[];
|
|
1703
|
+
declare const ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS: AssistantScenario[];
|
|
1704
|
+
|
|
1705
|
+
/**
|
|
1706
|
+
* Assistant bench: proactive morning brief.
|
|
1707
|
+
*
|
|
1708
|
+
* Exercises whether the assistant can surface what the user should know and
|
|
1709
|
+
* act on first when they sit down in the morning. Scored by a sealed rubric
|
|
1710
|
+
* along identity_accuracy, stance_coherence, novelty, and calibration.
|
|
1711
|
+
*/
|
|
1712
|
+
|
|
1713
|
+
declare const assistantMorningBriefDefinition: BenchmarkDefinition;
|
|
1714
|
+
declare function runAssistantMorningBriefBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
1715
|
+
|
|
1716
|
+
declare const ASSISTANT_MEETING_PREP_SCENARIOS: AssistantScenario[];
|
|
1717
|
+
declare const ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS: AssistantScenario[];
|
|
1718
|
+
|
|
1719
|
+
/**
|
|
1720
|
+
* Assistant bench: meeting prep.
|
|
1721
|
+
*
|
|
1722
|
+
* Given an upcoming meeting and attendees, generate a prep brief. Judged on
|
|
1723
|
+
* attendee-context accuracy, topic recall, and open-thread surfacing.
|
|
1724
|
+
*/
|
|
1725
|
+
|
|
1726
|
+
declare const assistantMeetingPrepDefinition: BenchmarkDefinition;
|
|
1727
|
+
declare function runAssistantMeetingPrepBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
1728
|
+
|
|
1729
|
+
declare const ASSISTANT_NEXT_BEST_ACTION_SCENARIOS: AssistantScenario[];
|
|
1730
|
+
declare const ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS: AssistantScenario[];
|
|
1731
|
+
|
|
1732
|
+
/**
|
|
1733
|
+
* Assistant bench: next-best-action.
|
|
1734
|
+
*
|
|
1735
|
+
* Given current state, what should the user do next? Judged on grounding in
|
|
1736
|
+
* the memory graph (not generic advice) and on calibration — abstaining on
|
|
1737
|
+
* weak-evidence questions rather than confidently inventing answers.
|
|
1738
|
+
*/
|
|
1739
|
+
|
|
1740
|
+
declare const assistantNextBestActionDefinition: BenchmarkDefinition;
|
|
1741
|
+
declare function runAssistantNextBestActionBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
1742
|
+
|
|
1743
|
+
declare const ASSISTANT_SYNTHESIS_SCENARIOS: AssistantScenario[];
|
|
1744
|
+
declare const ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS: AssistantScenario[];
|
|
1745
|
+
|
|
1746
|
+
/**
|
|
1747
|
+
* Assistant bench: multi-document synthesis with stance.
|
|
1748
|
+
*
|
|
1749
|
+
* "What does the brain think about X?" — the agent must integrate across
|
|
1750
|
+
* multiple memory items and reflect the user's previously-expressed stance,
|
|
1751
|
+
* rather than regurgitating the single top-k chunk.
|
|
1752
|
+
*/
|
|
1753
|
+
|
|
1754
|
+
declare const assistantSynthesisDefinition: BenchmarkDefinition;
|
|
1755
|
+
declare function runAssistantSynthesisBenchmark(options: ResolvedRunBenchmarkOptions): Promise<BenchmarkResult>;
|
|
1756
|
+
|
|
1757
|
+
export { ASSISTANT_AGENT_CONFIG_KEY, ASSISTANT_JUDGE_CONFIG_KEY, ASSISTANT_MEETING_PREP_SCENARIOS, ASSISTANT_MEETING_PREP_SMOKE_SCENARIOS, ASSISTANT_MORNING_BRIEF_SCENARIOS, ASSISTANT_MORNING_BRIEF_SMOKE_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SCENARIOS, ASSISTANT_NEXT_BEST_ACTION_SMOKE_SCENARIOS, ASSISTANT_RUBRIC_DIMENSIONS, ASSISTANT_RUBRIC_ID_KEY, ASSISTANT_SEEDS_CONFIG_KEY, ASSISTANT_SPOT_CHECK_DIR_KEY, ASSISTANT_SYNTHESIS_SCENARIOS, ASSISTANT_SYNTHESIS_SMOKE_SCENARIOS, type AbstentionRetrievalCase, type AggregateMetrics, type AnthropicProviderConfig, type AssistantAgent, type AssistantMemoryFact, type AssistantMemoryGraph, type AssistantRubricDimension, type AssistantRubricScores, type AssistantRunnerOptions, type AssistantScenario, type AssistantStance, BENCHMARK_INTEGRITY_META_SCHEMA, BENCHMARK_RESULT_SCHEMA, BENCHMARK_SPLIT_TYPES, type BenchConfig, type BenchJudge, type BenchJudgeResult, type BenchMemoryAdapter, type BenchModelSource, type BenchResponder, type BenchResponse, type BenchRuntimeProfile, type BenchTier, type BenchmarkCategory, type BenchmarkDefinition, type BenchmarkIntegrityMeta, type BenchmarkMeta, type BenchmarkMode, type BenchmarkReport, type BenchmarkResult, type BenchmarkSplitType, type BenchmarkStatus, type BenchmarkSuiteResult, type BenchmarkTier, type BuildBenchmarkPublishFeedOptions, type BuiltInProvider, CANARY_FIXED_RECALL, CANARY_SCORE_FLOOR, type CanaryAdapterOptions, type CanaryFloorCheck, type ComparisonMetricDelta, type ComparisonResult, type CompletionOpts, type CompletionResult, type ConfidenceInterval, type ContaminationCheckResult, type ContaminationEntry, type ContaminationManifest, type CustomBenchmarkScoring, type CustomBenchmarkSpec, type CustomBenchmarkTask, DEFAULT_ASSISTANT_RUBRIC_ID, type DiscoveredModel, EMPTY_CONTAMINATION_MANIFEST, type EffectSizeInterpretation, type EffectSizeSummary, type ExplainResult, type ExtractedEntity, type ExtractedLink, type ExtractedPage, type FixtureGenerator, type FixtureOutput, type 
FixtureVariant, type GeneratedFile, type GoldEntity, type GoldEntityType, type GoldGraph, type GoldLink, type GoldPage, INTEGRITY_CIPHER_ALGORITHM, INTEGRITY_HASH_ALGORITHM, INTEGRITY_META_FIELDS, type IngestionBenchAdapter, type IngestionLog, type LlmJudge, type LlmProvider, type LoadSealedQrelsOptions, type MemoryGraph, type MemoryStats, type MemorySystem, type Message, type MetricAggregate, type MultipleChoiceQuestion, type OllamaProviderConfig, type OpenAiCompatibleProviderConfig, type PersonalizationRetrievalCase, type ProviderBaseConfig, type ProviderConfig, type ProviderDiscoveryResult, type ProviderFactoryConfig, type PublishSkipReason, type PublishSkipRecord, type PublishedBenchmarkFeed, type PublishedBenchmarkFeedEntry, REQUIRED_FRONTMATTER_FIELDS, type RecallMetrics, type RegressionDetail, type RegressionGateResult, type RemnicAdapterOptions, type ResolveBenchRuntimeProfileOptions, type ResolvedBenchRuntimeProfile, type ResolvedRunBenchmarkOptions, type RotatedChoices, type RunBenchmarkOptions, SCHEMA_TIER_FIXTURE, SCHEMA_TIER_SMOKE_FIXTURE, SEALED_PROMPT_REGISTRY, type SavedBaseline, type SchemaTierCorpus, type SchemaTierFixture, type SchemaTierName, type SchemaTierPage, type SchemaTierPageFrontmatter, type SealedArtifact, type SealedJudgeDecision, type SealedJudgeInput, type SealedQrelsArtifact, type SealedQrelsHandle, type SealedRubric, type SearchResult, type SeededRng, type SpotCheckLogger, type StatisticalReport, type StructuredJudge, type TaskResult, type TaskTokenUsage, type TemporalRetrievalCase, type TierDetail, type TokenUsage, addContaminationEntry, aggregateTaskScores, answerBenchmarkQuestion, assertCanaryUnderFloor, assertIntegrityMetaPresent, assertPublishableIntegrity, assertSha256Hex, assistantMeetingPrepDefinition, assistantMorningBriefDefinition, assistantNextBestActionDefinition, assistantSynthesisDefinition, backlinkF1, bootstrapMeanConfidenceInterval, buildBenchmarkPublishFeed, buildBenchmarkRunSeeds, buildJudgePayload, 
buildSchemaTierFixture, buildSchemaTierSmokeFixture, calendarFixture, canonicalJsonStringify, chatFixture, checkDatasetContamination, checkRegression, clampScore, cohensD, compareResults, computeSealHash, containsAnswer, createAnthropicProvider, createCanaryAdapter, createDeterministicSpotCheckLogger, createGatewayResponder, createLightweightAdapter, createLiteLlmProvider, createOllamaProvider, createOpenAiCompatibleProvider, createProvider, createProviderBackedJudge, createProviderBackedResponder, createProviderBackedStructuredJudge, createRemnicAdapter, createResponderFromProvider, createSeededRng, createSpotCheckFileLogger, createStructuredJudgeFromProvider, defaultBenchmarkBaselineDir, defaultBenchmarkPublishPath, deleteBenchmarkResults, discoverAllProviders, emailFixture, entityRecall, exactMatch, f1Score, generateReport, getBenchmark, getBenchmarkLowerIsBetter, hashBytes, hashCanonicalJson, hashString, integrityMetaIsComplete, interpretEffectSize, isContaminationEntry, isContaminationManifest, isSealedQrelsArtifact, isSha256Hex, linkMatches, listBenchmarkBaselines, listBenchmarkResults, listBenchmarks, llmJudgeScore, llmJudgeScoreDetailed, loadBaseline, loadBenchmarkBaseline, loadBenchmarkResult, loadCustomBenchmarkFile, loadSealKeyFromEnv, loadSealedQrels, loadSealedRubric, matchEntity, mergeContaminationManifests, openSeal, orchestrateBenchmarkRuns, pairedDeltaConfidenceInterval, parseCustomBenchmark, parseRubricResponse, parseSealedQrels, precisionAtK, projectFolderFixture, recallAtK, renderBenchmarkResultExport, renderMemorySummaryForJudge, renderMemoryViewForAgent, resolveAssistantAgent, resolveAssistantRubricId, resolveAssistantSeeds, resolveAssistantSpotCheckDir, resolveBenchRuntimeProfile, resolveBenchmarkResultReference, resolveBenchmarkRunCount, resolveStructuredJudge, rotateDistractors, rougeL, runAssistantBenchmark, runAssistantMeetingPrepBenchmark, runAssistantMorningBriefBenchmark, runAssistantNextBestActionBenchmark, 
runAssistantSynthesisBenchmark, runBenchSuite, runBenchmark, runCustomBenchmarkFile, runExplain, runSealedJudge, safeHexEqual, saveBaseline, saveBenchmarkBaseline, schemaCompleteness, sealPayload, selectFixtureVariant, serializeSealedQrels, shuffleTasks, timed, verifyRubricDigest, writeBenchmarkPublishFeed, writeBenchmarkResult, zeroScores };
|