@tangle-network/agent-eval 0.2.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -435,6 +435,83 @@ declare class MetricsCollector {
435
435
  getConvergenceCurve(): number[];
436
436
  }
437
437
 
438
+ /**
439
+ * ScenarioRegistry — manages scenario discovery and filtering.
440
+ *
441
+ * Each agent registers its scenarios. The registry handles conversion
442
+ * from ScenarioFile format to the framework's Scenario type.
443
+ */
444
+ declare class ScenarioRegistry {
445
+ private scenarios;
446
+ private scenarioFiles;
447
+ /** Register scenarios from ScenarioFile format */
448
+ registerFiles(files: ScenarioFile[]): void;
449
+ /** Register pre-built Scenario objects directly */
450
+ register(scenarios: Scenario[]): void;
451
+ /** Get all scenarios */
452
+ all(): Scenario[];
453
+ /** Get scenarios filtered by category */
454
+ byCategory(category: string): Scenario[];
455
+ /** List all categories with counts */
456
+ listCategories(): {
457
+ category: string;
458
+ count: number;
459
+ }[];
460
+ /** Get scenarios filtered by persona */
461
+ byPersona(persona: string): Scenario[];
462
+ /** Get a single scenario by ID */
463
+ byId(id: string): Scenario | undefined;
464
+ /** Count total scenarios */
465
+ get count(): number;
466
+ }
467
+
468
+ interface AgentDriverConfig {
469
+ client: ProductClient;
470
+ driverModel?: string;
471
+ /** System prompt context for the driver LLM to understand the product */
472
+ productContext?: string;
473
+ }
474
+ /**
475
+ * AgentDriver — meta-agent that plays a persona against the real product.
476
+ *
477
+ * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
478
+ * Not scripted — the driver gets the current product state and decides
479
+ * the next realistic user message.
480
+ */
481
+ declare class AgentDriver {
482
+ private tc;
483
+ private client;
484
+ private driverModel;
485
+ private productContext;
486
+ constructor(tc: TCloud, config: AgentDriverConfig);
487
+ /**
488
+ * Run a persona through the product.
489
+ *
490
+ * Returns metrics on how many turns to completion, cost curve,
491
+ * quality curve, and convergence curve.
492
+ */
493
+ run(persona: PersonaConfig): Promise<DriverResult>;
494
+ /** Use the driver LLM to decide what the "user" says next */
495
+ private decideNextMessage;
496
+ /** Handle pending approvals based on persona feedback patterns */
497
+ private handleApprovals;
498
+ /** Describe which completion criteria are met */
499
+ private describeCompletion;
500
+ }
501
+
502
+ /**
503
+ * Report generation utilities.
504
+ *
505
+ * Outputs convergence curves, cost curves, quality curves,
506
+ * and per-persona summaries in markdown format.
507
+ */
508
+ /** Generate a markdown report from benchmark results */
509
+ declare function formatBenchmarkReport(report: BenchmarkReport): string;
510
+ /** Generate a markdown report from agent driver results */
511
+ declare function formatDriverReport(results: DriverResult[]): string;
512
+ /** Print a compact summary to console */
513
+ declare function printDriverSummary(results: DriverResult[]): void;
514
+
438
515
  /**
439
516
  * Normalize scores so all dimensions follow "higher = better".
440
517
  * Inverted dimensions (hallucination, false_confidence, worst_failure)
@@ -524,83 +601,6 @@ declare class ConvergenceTracker {
524
601
  getTurnToCompletion(): number | null;
525
602
  }
526
603
 
527
- /**
528
- * ScenarioRegistry — manages scenario discovery and filtering.
529
- *
530
- * Each agent registers its scenarios. The registry handles conversion
531
- * from ScenarioFile format to the framework's Scenario type.
532
- */
533
- declare class ScenarioRegistry {
534
- private scenarios;
535
- private scenarioFiles;
536
- /** Register scenarios from ScenarioFile format */
537
- registerFiles(files: ScenarioFile[]): void;
538
- /** Register pre-built Scenario objects directly */
539
- register(scenarios: Scenario[]): void;
540
- /** Get all scenarios */
541
- all(): Scenario[];
542
- /** Get scenarios filtered by category */
543
- byCategory(category: string): Scenario[];
544
- /** List all categories with counts */
545
- listCategories(): {
546
- category: string;
547
- count: number;
548
- }[];
549
- /** Get scenarios filtered by persona */
550
- byPersona(persona: string): Scenario[];
551
- /** Get a single scenario by ID */
552
- byId(id: string): Scenario | undefined;
553
- /** Count total scenarios */
554
- get count(): number;
555
- }
556
-
557
- interface AgentDriverConfig {
558
- client: ProductClient;
559
- driverModel?: string;
560
- /** System prompt context for the driver LLM to understand the product */
561
- productContext?: string;
562
- }
563
- /**
564
- * AgentDriver — meta-agent that plays a persona against the real product.
565
- *
566
- * Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
567
- * Not scripted — the driver gets the current product state and decides
568
- * the next realistic user message.
569
- */
570
- declare class AgentDriver {
571
- private tc;
572
- private client;
573
- private driverModel;
574
- private productContext;
575
- constructor(tc: TCloud, config: AgentDriverConfig);
576
- /**
577
- * Run a persona through the product.
578
- *
579
- * Returns metrics on how many turns to completion, cost curve,
580
- * quality curve, and convergence curve.
581
- */
582
- run(persona: PersonaConfig): Promise<DriverResult>;
583
- /** Use the driver LLM to decide what the "user" says next */
584
- private decideNextMessage;
585
- /** Handle pending approvals based on persona feedback patterns */
586
- private handleApprovals;
587
- /** Describe which completion criteria are met */
588
- private describeCompletion;
589
- }
590
-
591
- /**
592
- * Report generation utilities.
593
- *
594
- * Outputs convergence curves, cost curves, quality curves,
595
- * and per-persona summaries in markdown format.
596
- */
597
- /** Generate a markdown report from benchmark results */
598
- declare function formatBenchmarkReport(report: BenchmarkReport): string;
599
- /** Generate a markdown report from agent driver results */
600
- declare function formatDriverReport(results: DriverResult[]): string;
601
- /** Print a compact summary to console */
602
- declare function printDriverSummary(results: DriverResult[]): void;
603
-
604
604
  /**
605
605
  * Versioned prompt registry.
606
606
  *
@@ -642,79 +642,6 @@ declare class PromptRegistry {
642
642
  /** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
643
643
  declare function hashContent(content: string): Promise<string>;
644
644
 
645
- /**
646
- * LLM trace store — one record per model call.
647
- *
648
- * Sink for the full eval data-plane: what got sent, what came back, what it
649
- * cost, how long it took. Replayable, queryable, diff-able.
650
- *
651
- * Two built-in stores:
652
- * - `MemoryTraceStore` — fast, ephemeral, useful in tests and short runs
653
- * - `FileSystemTraceStore` — NDJSON files per-run, grepable, committable
654
- *
655
- * Consumers plug in custom stores for Langfuse / OTEL / D1 / Postgres.
656
- */
657
- interface LlmTrace {
658
- id: string;
659
- runId: string;
660
- scenarioId?: string;
661
- turnIndex?: number;
662
- role: 'driver' | 'judge' | 'product' | 'optimizer' | string;
663
- model: string;
664
- prompt: string;
665
- output: string;
666
- inputTokens?: number;
667
- outputTokens?: number;
668
- costUsd?: number;
669
- durationMs?: number;
670
- timestamp: string;
671
- metadata?: Record<string, unknown>;
672
- }
673
- interface TraceQuery {
674
- runId?: string;
675
- scenarioId?: string;
676
- role?: string;
677
- model?: string;
678
- sinceMs?: number;
679
- limit?: number;
680
- }
681
- interface TraceStore {
682
- record(trace: LlmTrace): Promise<void>;
683
- query(query: TraceQuery): Promise<LlmTrace[]>;
684
- count(query?: TraceQuery): Promise<number>;
685
- }
686
- declare class MemoryTraceStore implements TraceStore {
687
- private traces;
688
- record(trace: LlmTrace): Promise<void>;
689
- query(query: TraceQuery): Promise<LlmTrace[]>;
690
- count(query?: TraceQuery): Promise<number>;
691
- /** Clear the store — test helper. */
692
- reset(): void;
693
- private filter;
694
- }
695
- interface FileSystemTraceStoreOptions {
696
- dir: string;
697
- /** Max file size before rolling to a new segment (default 32 MB). */
698
- rolloverBytes?: number;
699
- /** Function to write the file — defaults to node:fs/promises.appendFile */
700
- append?: (path: string, data: string) => Promise<void>;
701
- read?: (path: string) => Promise<string>;
702
- list?: (dir: string) => Promise<string[]>;
703
- stat?: (path: string) => Promise<{
704
- size: number;
705
- }>;
706
- mkdir?: (dir: string) => Promise<void>;
707
- }
708
- declare class FileSystemTraceStore implements TraceStore {
709
- private readonly opts;
710
- constructor(opts: FileSystemTraceStoreOptions);
711
- record(trace: LlmTrace): Promise<void>;
712
- query(query: TraceQuery): Promise<LlmTrace[]>;
713
- count(query?: TraceQuery): Promise<number>;
714
- private segments;
715
- private currentSegment;
716
- }
717
-
718
645
  /**
719
646
  * Anti-slop quality judge.
720
647
  *
@@ -787,7 +714,7 @@ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSl
787
714
  * returns a `ValidationResult` with pass/fail + 0..1 score + structured
788
715
  * issues.
789
716
  */
790
- interface Artifact {
717
+ interface Artifact$1 {
791
718
  /** Logical kind — validators type-guard on this */
792
719
  kind: 'file' | 'json' | 'text' | 'binary' | string;
793
720
  /** Filesystem-style path, optional */
@@ -803,7 +730,7 @@ interface ValidationContext {
803
730
  scenarioId: string;
804
731
  turnIndex?: number;
805
732
  /** Prior artifacts for multi-artifact scenarios */
806
- priorArtifacts?: Artifact[];
733
+ priorArtifacts?: Artifact$1[];
807
734
  /** Free-form hints the validator uses for domain-specific checks */
808
735
  hints?: Record<string, unknown>;
809
736
  }
@@ -827,7 +754,7 @@ interface ArtifactValidator {
827
754
  /** Optional description for human-facing reports. */
828
755
  description?: string;
829
756
  /** Called once per artifact; validators are expected to be pure + idempotent. */
830
- validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
757
+ validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
831
758
  }
832
759
  /**
833
760
  * Run every validator on the same artifact; aggregate pass as AND, score as
@@ -938,7 +865,7 @@ interface RunConfig {
938
865
  seed?: number;
939
866
  metadata?: Record<string, unknown>;
940
867
  }
941
- interface Run {
868
+ interface Run$1 {
942
869
  id: string;
943
870
  experimentId: string;
944
871
  name?: string;
@@ -959,9 +886,9 @@ interface ExperimentStore {
959
886
  saveExperiment(exp: Experiment): Promise<void>;
960
887
  getExperiment(id: string): Promise<Experiment | null>;
961
888
  listExperiments(): Promise<Experiment[]>;
962
- saveRun(run: Run): Promise<void>;
963
- getRun(id: string): Promise<Run | null>;
964
- listRuns(experimentId: string): Promise<Run[]>;
889
+ saveRun(run: Run$1): Promise<void>;
890
+ getRun(id: string): Promise<Run$1 | null>;
891
+ listRuns(experimentId: string): Promise<Run$1[]>;
965
892
  }
966
893
  declare class InMemoryExperimentStore implements ExperimentStore {
967
894
  private readonly experiments;
@@ -969,15 +896,15 @@ declare class InMemoryExperimentStore implements ExperimentStore {
969
896
  saveExperiment(exp: Experiment): Promise<void>;
970
897
  getExperiment(id: string): Promise<Experiment | null>;
971
898
  listExperiments(): Promise<Experiment[]>;
972
- saveRun(run: Run): Promise<void>;
973
- getRun(id: string): Promise<Run | null>;
974
- listRuns(experimentId: string): Promise<Run[]>;
899
+ saveRun(run: Run$1): Promise<void>;
900
+ getRun(id: string): Promise<Run$1 | null>;
901
+ listRuns(experimentId: string): Promise<Run$1[]>;
975
902
  }
976
903
  declare class ExperimentTracker {
977
904
  private readonly store;
978
905
  constructor(store: ExperimentStore);
979
906
  startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
980
- startRun(config: RunConfig): Promise<Run>;
907
+ startRun(config: RunConfig): Promise<Run$1>;
981
908
  completeRun(runId: string, report: BenchmarkReport): Promise<void>;
982
909
  failRun(runId: string, error: string): Promise<void>;
983
910
  /**
@@ -1080,6 +1007,9 @@ interface PairwiseComparison {
1080
1007
  variantA: string;
1081
1008
  variantB: string;
1082
1009
  pValue: number;
1010
+ /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
1011
+ qValue: number;
1012
+ /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
1083
1013
  significant: boolean;
1084
1014
  meanDelta: number;
1085
1015
  }
@@ -1184,4 +1114,3022 @@ declare class DualAgentBench {
1184
1114
  run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
1185
1115
  }
1186
1116
 
1187
- export { AgentDriver, type AgentDriverConfig, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type ArtifactResult, type ArtifactValidator, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type CheckResult, type CollectedArtifacts, type CompletionCriterion, ConvergenceTracker, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EvalResult, type ExecutorConfig, type Experiment, type ExperimentStore, ExperimentTracker, type FeedbackPattern, FileSystemTraceStore, type FileSystemTraceStoreOptions, InMemoryExperimentStore, InMemoryWorkspaceInspector, type InspectorContext, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgeRubric, type JudgeScore, type LlmTrace, MODEL_PRICING, MemoryTraceStore, MetricsCollector, type OptimizationConfig, type OptimizationResult, type PairwiseComparison, type PersonaConfig, ProductClient, type ProductClientConfig, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, type RouteMap, type RubricDimension, type Run, type RunConfig, type RunDiff, type Scenario, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type SlopCategory, type TestResult, TokenCounter, type TraceQuery, type TraceStore, type Turn, type TurnMetrics, type TurnResult, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, adversarialJudge, analyzeAntiSlop, byteLengthRange, codeExecutionJudge, cohensD, coherenceJudge, composeValidators, confidenceInterval, containsAll, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, defaultJudges, estimateCost, estimateTokens, executeScenario, fileContains, fileExists, formatBenchmarkReport, formatDriverReport, hashContent, interRaterReliability, jsonHasKeys, 
mannWhitneyU, normalizeScores, pairedTTest, partialCredit, printDriverSummary, regexMatch, rowCount, rowWhere, runAssertions, runE2EWorkflow, weightedMean, wilcoxonSignedRank };
1117
+ /**
1118
+ * TraceSchema v1 — the canonical data model for agent-eval.
1119
+ *
1120
+ * Every score, every failure class, every pipeline in the framework is
1121
+ * a view over this data. Shape it once, live with it.
1122
+ *
1123
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
1124
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
1125
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
1126
+ * entities that OTEL leaves as free-form attributes.
1127
+ */
1128
+ declare const TRACE_SCHEMA_VERSION = "1.0.0";
1129
+ type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
1130
+ interface BudgetSpec {
1131
+ tokens?: number;
1132
+ wallMs?: number;
1133
+ calls?: number;
1134
+ usd?: number;
1135
+ }
1136
+ interface RunOutcome {
1137
+ score?: number;
1138
+ pass?: boolean;
1139
+ failureClass?: FailureClass;
1140
+ notes?: string;
1141
+ }
1142
+ /**
1143
+ * Layer — optional classification in a nested build workflow.
1144
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
1145
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
1146
+ * `app-runtime`: a run of the generated agent against a domain scenario.
1147
+ * `meta`: any meta-eval (judge replay, correlation analysis).
1148
+ */
1149
+ type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
1150
+ interface Run {
1151
+ runId: string;
1152
+ scenarioId: string;
1153
+ variantId?: string;
1154
+ datasetVersion?: string;
1155
+ /** Git SHA of agent code at run time. */
1156
+ codeSha?: string;
1157
+ /** Hash of the prompt template + any system prompt. */
1158
+ promptSha?: string;
1159
+ /** Model id + date + system-prompt hash, concatenated. */
1160
+ modelFingerprint?: string;
1161
+ seed?: number;
1162
+ /** Arbitrary environment markers (shell, docker version, tz). */
1163
+ envFingerprint?: Record<string, string>;
1164
+ /** Version of the redaction rules applied to this run. */
1165
+ redactionVersion?: string;
1166
+ /** Parent run in a nested build workflow. A builder run's children are
1167
+ * app-build runs; those children are app-runtime runs. */
1168
+ parentRunId?: string;
1169
+ /** Stable project identifier — groups runs across chats + sessions. */
1170
+ projectId?: string;
1171
+ /** Chat/conversation identifier within a project. */
1172
+ chatId?: string;
1173
+ /** Layer classification — hint for aggregation; not enforced. */
1174
+ layer?: RunLayer;
1175
+ startedAt: number;
1176
+ endedAt?: number;
1177
+ status: RunStatus;
1178
+ outcome?: RunOutcome;
1179
+ budget?: BudgetSpec;
1180
+ /** Free-form labels for downstream grouping. */
1181
+ tags?: Record<string, string>;
1182
+ }
1183
+ type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
1184
+ type SpanStatus = 'ok' | 'error';
1185
+ interface SpanBase {
1186
+ spanId: string;
1187
+ parentSpanId?: string;
1188
+ runId: string;
1189
+ kind: SpanKind;
1190
+ name: string;
1191
+ startedAt: number;
1192
+ endedAt?: number;
1193
+ status?: SpanStatus;
1194
+ error?: string;
1195
+ /** Anything not covered by typed fields. Kept deliberately free-form. */
1196
+ attributes?: Record<string, unknown>;
1197
+ }
1198
+ interface Message {
1199
+ role: 'system' | 'user' | 'assistant' | 'tool';
1200
+ content: string;
1201
+ tokens?: number;
1202
+ /** Multi-modal content descriptors; blobs themselves live in Artifacts. */
1203
+ images?: Array<{
1204
+ artifactId?: string;
1205
+ url?: string;
1206
+ mime?: string;
1207
+ }>;
1208
+ }
1209
+ interface LlmSpan extends SpanBase {
1210
+ kind: 'llm';
1211
+ model: string;
1212
+ messages: Message[];
1213
+ output?: string;
1214
+ inputTokens?: number;
1215
+ outputTokens?: number;
1216
+ cachedTokens?: number;
1217
+ reasoningTokens?: number;
1218
+ costUsd?: number;
1219
+ finishReason?: string;
1220
+ }
1221
+ interface ToolSpan extends SpanBase {
1222
+ kind: 'tool';
1223
+ toolName: string;
1224
+ args: unknown;
1225
+ result?: unknown;
1226
+ latencyMs?: number;
1227
+ }
1228
+ interface RetrievalSpan extends SpanBase {
1229
+ kind: 'retrieval';
1230
+ query: string;
1231
+ hits: Array<{
1232
+ docId: string;
1233
+ score: number;
1234
+ content?: string;
1235
+ }>;
1236
+ }
1237
+ interface JudgeSpan extends SpanBase {
1238
+ kind: 'judge';
1239
+ judgeId: string;
1240
+ /** Span this judgment applies to. */
1241
+ targetSpanId: string;
1242
+ dimension: string;
1243
+ /** Numeric score (free-range; interpretation up to the judge). */
1244
+ score: number;
1245
+ rationale?: string;
1246
+ evidence?: string;
1247
+ }
1248
+ interface SandboxSpan extends SpanBase {
1249
+ kind: 'sandbox';
1250
+ image?: string;
1251
+ command?: string;
1252
+ exitCode?: number;
1253
+ testsTotal?: number;
1254
+ testsPassed?: number;
1255
+ stdoutHash?: string;
1256
+ stderrHash?: string;
1257
+ /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
1258
+ wallMs?: number;
1259
+ }
1260
+ interface GenericSpan extends SpanBase {
1261
+ kind: 'agent' | 'custom';
1262
+ }
1263
+ type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
1264
+ type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
1265
+ interface TraceEvent {
1266
+ eventId: string;
1267
+ runId: string;
1268
+ spanId?: string;
1269
+ kind: EventKind;
1270
+ timestamp: number;
1271
+ payload: Record<string, unknown>;
1272
+ }
1273
+ interface BudgetLedgerEntry {
1274
+ runId: string;
1275
+ dimension: keyof BudgetSpec;
1276
+ limit: number;
1277
+ consumed: number;
1278
+ remaining: number;
1279
+ timestamp: number;
1280
+ breached: boolean;
1281
+ /** Span that triggered this entry, if any. */
1282
+ spanId?: string;
1283
+ }
1284
+ interface Artifact {
1285
+ artifactId: string;
1286
+ runId: string;
1287
+ spanId?: string;
1288
+ contentType: string;
1289
+ sizeBytes: number;
1290
+ /** sha256 in hex. */
1291
+ hash: string;
1292
+ /** External storage URL (R2, S3, filesystem path). */
1293
+ storageUrl?: string;
1294
+ /** Inline content for small blobs — keep under ~64KB. */
1295
+ inlineContent?: string;
1296
+ }
1297
+ type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
1298
+ declare const FAILURE_CLASSES: readonly FailureClass[];
1299
+ declare function isLlmSpan(s: Span): s is LlmSpan;
1300
+ declare function isToolSpan(s: Span): s is ToolSpan;
1301
+ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
1302
+ declare function isJudgeSpan(s: Span): s is JudgeSpan;
1303
+ declare function isSandboxSpan(s: Span): s is SandboxSpan;
1304
+
1305
+ interface RunFilter {
1306
+ scenarioId?: string;
1307
+ variantId?: string;
1308
+ status?: RunStatus;
1309
+ since?: number;
1310
+ until?: number;
1311
+ tag?: {
1312
+ key: string;
1313
+ value: string;
1314
+ };
1315
+ parentRunId?: string;
1316
+ projectId?: string;
1317
+ chatId?: string;
1318
+ layer?: RunLayer;
1319
+ }
1320
+ interface SpanFilter {
1321
+ runId?: string;
1322
+ parentSpanId?: string;
1323
+ kind?: SpanKind;
1324
+ name?: string;
1325
+ toolName?: string;
1326
+ judgeId?: string;
1327
+ since?: number;
1328
+ until?: number;
1329
+ }
1330
+ interface EventFilter {
1331
+ runId?: string;
1332
+ spanId?: string;
1333
+ kind?: EventKind;
1334
+ since?: number;
1335
+ until?: number;
1336
+ }
1337
+ interface TraceStore {
1338
+ appendRun(run: Run): Promise<void>;
1339
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1340
+ appendSpan(span: Span): Promise<void>;
1341
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1342
+ appendEvent(event: TraceEvent): Promise<void>;
1343
+ appendArtifact(artifact: Artifact): Promise<void>;
1344
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1345
+ getRun(runId: string): Promise<Run | undefined>;
1346
+ listRuns(filter?: RunFilter): Promise<Run[]>;
1347
+ spans(filter?: SpanFilter): Promise<Span[]>;
1348
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
1349
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
1350
+ artifacts(runId: string): Promise<Artifact[]>;
1351
+ }
1352
+ declare class InMemoryTraceStore implements TraceStore {
1353
+ private runs;
1354
+ private allSpans;
1355
+ private allEvents;
1356
+ private allArtifacts;
1357
+ private allBudget;
1358
+ appendRun(run: Run): Promise<void>;
1359
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1360
+ appendSpan(span: Span): Promise<void>;
1361
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1362
+ appendEvent(event: TraceEvent): Promise<void>;
1363
+ appendArtifact(artifact: Artifact): Promise<void>;
1364
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1365
+ getRun(runId: string): Promise<Run | undefined>;
1366
+ listRuns(filter?: RunFilter): Promise<Run[]>;
1367
+ spans(filter?: SpanFilter): Promise<Span[]>;
1368
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
1369
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
1370
+ artifacts(runId: string): Promise<Artifact[]>;
1371
+ }
1372
+ interface FileSystemTraceStoreOptions {
1373
+ dir: string;
1374
+ /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
1375
+ maxBytes?: number;
1376
+ }
1377
+ declare class FileSystemTraceStore implements TraceStore {
1378
+ private dir;
1379
+ private maxBytes;
1380
+ /** Lazy in-memory index for queries — populated on first read. */
1381
+ private index?;
1382
+ private loaded;
1383
+ constructor(options: FileSystemTraceStoreOptions);
1384
+ private ensureDir;
1385
+ private append;
1386
+ private insertInto;
1387
+ private load;
1388
+ appendRun(run: Run): Promise<void>;
1389
+ updateRun(runId: string, patch: Partial<Run>): Promise<void>;
1390
+ appendSpan(span: Span): Promise<void>;
1391
+ updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
1392
+ appendEvent(event: TraceEvent): Promise<void>;
1393
+ appendArtifact(artifact: Artifact): Promise<void>;
1394
+ appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
1395
+ getRun(runId: string): Promise<Run | undefined>;
1396
+ listRuns(filter?: RunFilter): Promise<Run[]>;
1397
+ spans(filter?: SpanFilter): Promise<Span[]>;
1398
+ events(filter?: EventFilter): Promise<TraceEvent[]>;
1399
+ budget(runId: string): Promise<BudgetLedgerEntry[]>;
1400
+ artifacts(runId: string): Promise<Artifact[]>;
1401
+ }
1402
+
1403
+ /**
1404
+ * TraceEmitter — hierarchical span builder that auto-parents using an
1405
+ * internal stack. One emitter per Run; emitters do NOT share state.
1406
+ *
1407
+ * Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
1408
+ * return a `SpanHandle` with `.end()` / `.fail()` so callers don't
1409
+ * have to thread spanIds manually. For async workflows that can't use
1410
+ * the stack (e.g. fan-out parallel calls), pass `parentSpanId`
1411
+ * explicitly.
1412
+ */
1413
+
1414
+ interface SpanHandle<S extends Span = Span> {
1415
+ span: S;
1416
+ end(patch?: Partial<S>): Promise<void>;
1417
+ fail(error: string | Error, patch?: Partial<S>): Promise<void>;
1418
+ }
1419
+ interface TraceEmitterOptions {
1420
+ runId?: string;
1421
+ /** Inject a clock for deterministic tests. */
1422
+ now?: () => number;
1423
+ /** Inject an id generator for deterministic tests. */
1424
+ id?: () => string;
1425
+ }
1426
+ declare class TraceEmitter {
1427
+ private store;
1428
+ private stack;
1429
+ private _runId;
1430
+ private now;
1431
+ private id;
1432
+ constructor(store: TraceStore, options?: TraceEmitterOptions);
1433
+ get runId(): string;
1434
+ startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
1435
+ endRun(outcome?: RunOutcome): Promise<void>;
1436
+ abortRun(reason: string): Promise<void>;
1437
+ span<S extends Span = Span>(init: {
1438
+ kind: SpanKind;
1439
+ name: string;
1440
+ parentSpanId?: string;
1441
+ attributes?: Record<string, unknown>;
1442
+ } & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
1443
+ private handle;
1444
+ private pop;
1445
+ llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
1446
+ tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
1447
+ retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
1448
+ recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
1449
+ sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
1450
+ emit(event: {
1451
+ kind: EventKind;
1452
+ spanId?: string;
1453
+ payload?: Record<string, unknown>;
1454
+ }): Promise<TraceEvent>;
1455
+ recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
1456
+ timestamp?: number;
1457
+ }): Promise<BudgetLedgerEntry>;
1458
+ recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
1459
+ /**
1460
+ * Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
1461
+ * Returns the fn's return value. Use this for the 95% case.
1462
+ */
1463
+ within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
1464
+ }
1465
+ /** Helper to build an LLM span handle args object from a provider-shaped response. */
1466
+ declare function llmSpanFromProvider(args: {
1467
+ name?: string;
1468
+ model: string;
1469
+ messages: Message[];
1470
+ output: string;
1471
+ usage?: {
1472
+ inputTokens?: number;
1473
+ outputTokens?: number;
1474
+ cachedTokens?: number;
1475
+ reasoningTokens?: number;
1476
+ };
1477
+ costUsd?: number;
1478
+ finishReason?: string;
1479
+ }): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
1480
+
1481
+ /**
1482
+ * Typed query helpers over TraceStore.
1483
+ *
1484
+ * Not a full SQL engine — a minimal, composable set of operators that
1485
+ * cover the canned-pipeline use cases. For ad-hoc analytics, persist to
1486
+ * NDJSON and point DuckDB at it; the schema is stable so external SQL
1487
+ * tooling works out of the box.
1488
+ */
1489
+
1490
+ declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
1491
+ declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
1492
+ declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
1493
+ declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
1494
+ /** Group spans by any key selector. */
1495
+ declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
1496
+ /** Hash tool arguments to an orderless-key-stable string for de-duplication. */
1497
+ declare function argHash(args: unknown): string;
1498
+ /** Sum an LLM-span array into aggregate token + cost. */
1499
+ declare function aggregateLlm(spans: LlmSpan[]): {
1500
+ inputTokens: number;
1501
+ outputTokens: number;
1502
+ cachedTokens: number;
1503
+ costUsd: number;
1504
+ };
1505
+ /** Pick the outcome's failure class when present, else derive 'success' from run status. */
1506
+ declare function runFailureClass(run: Run): FailureClass;
1507
+
1508
+ /**
1509
+ * Redaction — remove PII / secrets from trace payloads before persist.
1510
+ *
1511
+ * Pre-persistence rules mean raw traces in storage are already scrubbed.
1512
+ * Unredacted variants (for debugging / post-mortems) live in a separate
1513
+ * storage layer with stricter access controls; this module only covers
1514
+ * the default scrub-then-persist path.
1515
+ *
1516
+ * Rules compose: pass an array of `RedactionRule`, each is applied in
1517
+ * order. Strings that match get replaced with a tagged sentinel so the
1518
+ * eval framework can count how many redactions happened per run
1519
+ * (surfaced via `redaction_applied` events).
1520
+ */
1521
+ interface RedactionRule {
1522
+ id: string;
1523
+ pattern: RegExp;
1524
+ /** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
1525
+ replacement?: string;
1526
+ }
1527
+ interface RedactionReport {
1528
+ redactionCount: number;
1529
+ byRule: Record<string, number>;
1530
+ }
1531
+ /** OWASP / common-sense defaults — extend per-domain. */
1532
+ declare const DEFAULT_REDACTION_RULES: RedactionRule[];
1533
+ declare const REDACTION_VERSION = "1.0.0";
1534
+ /**
1535
+ * Redact a single string. Returns the new string and a per-rule count of
1536
+ * how many substitutions fired.
1537
+ */
1538
+ declare function redactString(input: string, rules?: RedactionRule[]): {
1539
+ output: string;
1540
+ report: RedactionReport;
1541
+ };
1542
+ /**
1543
+ * Walk a JSON-ish value applying `redactString` to every string leaf.
1544
+ * Arrays and plain objects are recursed; other types pass through
1545
+ * untouched. Circular references throw — traces should be tree-shaped.
1546
+ */
1547
+ declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
1548
+ value: unknown;
1549
+ report: RedactionReport;
1550
+ };
1551
+
1552
+ /**
1553
+ * OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
1554
+ * traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
1555
+ *
1556
+ * Wire format only. We do NOT depend on the @opentelemetry SDK — that
1557
+ * would drag in polyfills incompatible with Workers/Edge. Consumers
1558
+ * push the JSON to their collector of choice via HTTP.
1559
+ *
1560
+ * Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
1561
+ */
1562
+
1563
+ declare const OTEL_AGENT_EVAL_SCOPE: {
1564
+ name: string;
1565
+ version: string;
1566
+ };
1567
+ interface OtlpSpan {
1568
+ traceId: string;
1569
+ spanId: string;
1570
+ parentSpanId?: string;
1571
+ name: string;
1572
+ kind: number;
1573
+ startTimeUnixNano: string;
1574
+ endTimeUnixNano: string;
1575
+ attributes: Array<{
1576
+ key: string;
1577
+ value: {
1578
+ stringValue?: string;
1579
+ intValue?: string;
1580
+ doubleValue?: number;
1581
+ boolValue?: boolean;
1582
+ };
1583
+ }>;
1584
+ events?: Array<{
1585
+ timeUnixNano: string;
1586
+ name: string;
1587
+ attributes?: OtlpSpan['attributes'];
1588
+ }>;
1589
+ status?: {
1590
+ code: number;
1591
+ message?: string;
1592
+ };
1593
+ }
1594
+ interface OtlpResourceSpans {
1595
+ resource: {
1596
+ attributes: OtlpSpan['attributes'];
1597
+ };
1598
+ scopeSpans: Array<{
1599
+ scope: typeof OTEL_AGENT_EVAL_SCOPE;
1600
+ spans: OtlpSpan[];
1601
+ }>;
1602
+ }
1603
+ interface OtlpExport {
1604
+ resourceSpans: OtlpResourceSpans[];
1605
+ }
1606
+ /** Export a single run's spans + events in OTLP/JSON. */
1607
+ declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
1608
+
1609
+ /**
1610
+ * SandboxHarness — executes a scenario in an isolated environment and
1611
+ * emits a rich SandboxSpan into the trace.
1612
+ *
1613
+ * Two built-in drivers:
1614
+ * - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.
1615
+ * Fast, no dependencies, fine for unit tests and most CI gates.
1616
+ * - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;
1617
+ * shells out to `docker run`. Stronger isolation, slower startup.
1618
+ *
1619
+ * Consumers implement `SandboxDriver` for custom backends (Firecracker,
1620
+ * Cloudflare sandbox product, etc.). The harness doesn't care which.
1621
+ */
1622
+
1623
+ interface HarnessConfig {
1624
+ /** Setup command (e.g. "pnpm install"). Non-zero exit fails the run. */
1625
+ setupCommand?: string;
1626
+ /** Run command (e.g. "pnpm build"). */
1627
+ runCommand?: string;
1628
+ /** Test command (e.g. "pnpm test --run"). Drives the test count + pass count. */
1629
+ testCommand?: string;
1630
+ /** Absolute cwd for the subprocess driver. Ignored by docker driver. */
1631
+ cwd?: string;
1632
+ /** Max wall-clock per phase in ms. Default 10 minutes. */
1633
+ timeoutMs?: number;
1634
+ /** Image for the docker driver. */
1635
+ image?: string;
1636
+ /** Extra env vars (validated; shell-escaped). */
1637
+ env?: Record<string, string>;
1638
+ /** Parser for the test output — maps stdout/stderr/exit code → pass count. */
1639
+ testParser?: TestOutputParser;
1640
+ }
1641
+ interface TestOutputParser {
1642
+ id: string;
1643
+ parse(stdout: string, stderr: string, exitCode: number): {
1644
+ testsTotal: number;
1645
+ testsPassed: number;
1646
+ } | undefined;
1647
+ }
1648
+ interface SandboxResult {
1649
+ phase: 'setup' | 'run' | 'test';
1650
+ exitCode: number;
1651
+ stdout: string;
1652
+ stderr: string;
1653
+ wallMs: number;
1654
+ testsTotal?: number;
1655
+ testsPassed?: number;
1656
+ }
1657
+ interface SandboxDriver {
1658
+ id: string;
1659
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1660
+ }
1661
+ /** Vitest default summary line: "Tests X passed | Y failed". */
1662
+ declare const vitestTestParser: TestOutputParser;
1663
+ /** Pytest default: "collected N items" + " X passed, Y failed". */
1664
+ declare const pytestTestParser: TestOutputParser;
1665
+ /** Jest: "Tests: X passed, Y total" (and optional failed). */
1666
+ declare const jestTestParser: TestOutputParser;
1667
+ /** Composite parser — tries a list of parsers in order. */
1668
+ declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
1669
+ declare class SubprocessSandboxDriver implements SandboxDriver {
1670
+ id: string;
1671
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1672
+ }
1673
+ declare class DockerSandboxDriver implements SandboxDriver {
1674
+ id: string;
1675
+ exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
1676
+ }
1677
+ interface SandboxHarnessResult {
1678
+ passed: boolean;
1679
+ setup?: SandboxResult;
1680
+ run?: SandboxResult;
1681
+ test?: SandboxResult;
1682
+ totalWallMs: number;
1683
+ /** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */
1684
+ score: number;
1685
+ }
1686
+ declare class SandboxHarness {
1687
+ private driver;
1688
+ constructor(driver?: SandboxDriver);
1689
+ run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
1690
+ }
1691
+
1692
+ /**
1693
+ * TestGradedScenario — a scenario whose score comes from a test suite.
1694
+ *
1695
+ * This is the SWE-bench pattern generalized. The scenario ships:
1696
+ * - fixture data (setup instructions)
1697
+ * - a test command the harness runs
1698
+ * - optional assertion overrides
1699
+ *
1700
+ * The runner emits a run, delegates to SandboxHarness, records the
1701
+ * outcome, and returns a structured verdict. Consumers bind their own
1702
+ * agent execution to this contract.
1703
+ */
1704
+
1705
+ interface TestGradedScenario {
1706
+ id: string;
1707
+ description?: string;
1708
+ harness: HarnessConfig;
1709
+ /** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
1710
+ passThreshold?: number;
1711
+ /** Provenance for dataset tracking. */
1712
+ datasetVersion?: string;
1713
+ /** Free-form tags (difficulty, category, etc.). */
1714
+ tags?: Record<string, string>;
1715
+ }
1716
+ interface TestGradedRunOptions {
1717
+ variantId?: string;
1718
+ driver?: SandboxDriver;
1719
+ /** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed, envFingerprint). */
1720
+ provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
1721
+ }
1722
+ interface TestGradedRunResult {
1723
+ runId: string;
1724
+ scenario: TestGradedScenario;
1725
+ harness: SandboxHarnessResult;
1726
+ pass: boolean;
1727
+ score: number;
1728
+ failureClass?: FailureClass;
1729
+ }
1730
+ declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
1731
+
1732
+ /**
1733
+ * BudgetGuard — enforces token / wall-clock / call / $ caps, records
1734
+ * a ledger entry on every decrement, emits `budget_breach` + throws
1735
+ * `BudgetBreachError` when a cap is hit.
1736
+ *
1737
+ * Wraps a TraceEmitter. The emitter persists ledger entries + breach
1738
+ * events so the classifier, pipelines, and reports can all read
1739
+ * budget state from the trace corpus — no separate accounting.
1740
+ */
1741
+
1742
+ declare class BudgetBreachError extends Error {
1743
+ dimension: keyof BudgetSpec;
1744
+ limit: number;
1745
+ attempted: number;
1746
+ constructor(dimension: keyof BudgetSpec, limit: number, attempted: number);
1747
+ }
1748
+ declare class BudgetGuard {
1749
+ private consumed;
1750
+ private emitter;
1751
+ private budget;
1752
+ private startedAt;
1753
+ constructor(emitter: TraceEmitter, budget: BudgetSpec, now?: () => number);
1754
+ /** Record consumption. Throws `BudgetBreachError` if any dimension exceeds its cap. */
1755
+ charge(delta: Partial<Record<keyof BudgetSpec, number>>, spanId?: string): Promise<void>;
1756
+ /** Convenience: advance wall-clock budget based on elapsed wall time. */
1757
+ tickWall(nowMs: number, spanId?: string): Promise<void>;
1758
+ get state(): Record<keyof BudgetSpec, number>;
1759
+ }
1760
+
1761
+ /**
1762
+ * Failure taxonomy — canonical classes + a default classifier.
1763
+ *
1764
+ * Every failed run should end up in a named class. The classifier here
1765
+ * is rule-based (fast, deterministic); an LLM fallback can be added by
1766
+ * the consumer for novel cases and trained into the rule base over time.
1767
+ *
1768
+ * Consumers call `classifyFailure({ run, spans, events })` and persist the
1769
+ * returned class as `Run.outcome.failureClass`.
1770
+ */
1771
+
1772
+ interface FailureContext {
1773
+ run: Run;
1774
+ spans: Span[];
1775
+ events: TraceEvent[];
1776
+ }
1777
+ interface FailureClassification {
1778
+ failureClass: FailureClass;
1779
+ reason: string;
1780
+ triggerSpanId?: string;
1781
+ triggerEventId?: string;
1782
+ }
1783
+ /** Ordered rules — first match wins. */
1784
+ interface FailureRule {
1785
+ id: string;
1786
+ match: (ctx: FailureContext) => {
1787
+ failureClass: FailureClass;
1788
+ reason: string;
1789
+ triggerSpanId?: string;
1790
+ triggerEventId?: string;
1791
+ } | null;
1792
+ }
1793
+ declare const DEFAULT_RULES: FailureRule[];
1794
+ /** Classify the failure mode of a run using an ordered rule list. */
1795
+ declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
1796
+
1797
+ /**
1798
+ * Trajectory — ordered, structured view over a run's spans.
1799
+ *
1800
+ * A pure function `buildTrajectory(store, runId) → Promise<Trajectory>` yields
1801
+ * a topologically ordered list of `TrajectoryStep` with parent-child
1802
+ * grouping collapsed into a single line-of-agent-work. Separate
1803
+ * analyzers (stuck-loop detection, waste ratio) live in
1804
+ * `pipelines/` and consume the trajectory.
1805
+ */
1806
+
1807
+ interface TrajectoryStep {
1808
+ index: number;
1809
+ span: Span;
1810
+ /** Depth in the span tree from the root. 0 = top-level. */
1811
+ depth: number;
1812
+ /** Events attached to this span. */
1813
+ events: TraceEvent[];
1814
+ }
1815
+ interface Trajectory {
1816
+ runId: string;
1817
+ steps: TrajectoryStep[];
1818
+ llmTurns: number;
1819
+ toolCalls: number;
1820
+ judgeVerdicts: number;
1821
+ retrievals: number;
1822
+ totalDurationMs: number;
1823
+ }
1824
+ declare function buildTrajectory(store: TraceStore, runId: string): Promise<Trajectory>;
1825
+
1826
+ /**
1827
+ * Tool-use metrics — derived purely from trace data.
1828
+ *
1829
+ * No scoring assumptions: consumers supply optional ground-truth tool
1830
+ * selections per turn + optional "information used downstream" signals.
1831
+ * Without those, we still compute descriptive metrics (error rate,
1832
+ * retry rate, duplicate-call rate) that are useful on their own.
1833
+ */
1834
+
1835
+ interface ToolUseMetrics {
1836
+ runId: string;
1837
+ totalCalls: number;
1838
+ byTool: Record<string, ToolStats>;
1839
+ errorRate: number;
1840
+ /** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
1841
+ duplicateRate: number;
1842
+ /** Ratio of error calls followed by ≥1 retry on same tool. */
1843
+ retryRate: number;
1844
+ /** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
1845
+ selectionAccuracy?: number;
1846
+ }
1847
+ interface ToolStats {
1848
+ calls: number;
1849
+ errors: number;
1850
+ avgLatencyMs: number;
1851
+ duplicates: number;
1852
+ }
1853
+ interface ToolUseOptions {
1854
+ /** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
1855
+ selectionLabels?: Record<string, boolean>;
1856
+ }
1857
+ declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
1858
+
1859
+ /**
1860
+ * StuckLoopView — detects when an agent calls the same tool with the
1861
+ * same (or structurally similar) arguments ≥ N times in a short window.
1862
+ *
1863
+ * Rationale: agents that loop are the number-one production failure
1864
+ * mode on long-horizon flows. The view returns (runId, toolName,
1865
+ * argHash, occurrences, windowMs) for each detected loop plus a
1866
+ * fraction of runs affected.
1867
+ */
1868
+
1869
+ interface StuckLoopFinding {
1870
+ runId: string;
1871
+ toolName: string;
1872
+ argHash: string;
1873
+ occurrences: number;
1874
+ spanIds: string[];
1875
+ /** Milliseconds between first and last call in the loop. */
1876
+ windowMs: number;
1877
+ }
1878
+ interface StuckLoopReport {
1879
+ findings: StuckLoopFinding[];
1880
+ affectedRunRatio: number;
1881
+ totalRuns: number;
1882
+ }
1883
+ interface StuckLoopOptions {
1884
+ /** Minimum call count to flag a loop (default 3). */
1885
+ minOccurrences?: number;
1886
+ /** Filter to a specific runId; omit to scan the entire corpus. */
1887
+ runId?: string;
1888
+ }
1889
+ declare function stuckLoopView(store: TraceStore, options?: StuckLoopOptions): Promise<StuckLoopReport>;
1890
+
1891
+ /**
1892
+ * ToolWasteView — fraction of tool calls whose results weren't used
1893
+ * downstream. Without a "used" signal we fall back to structural
1894
+ * proxies: error calls, duplicate calls, and tool calls followed by
1895
+ * zero subsequent LLM spans are all considered waste.
1896
+ *
1897
+ * Consumers can pass a `usageOracle` that inspects a tool span and
1898
+ * returns true iff the tool's result appears in a later LLM message,
1899
+ * artifact, or state mutation — that's the canonical definition; the
1900
+ * default heuristic is a reasonable fallback.
1901
+ */
1902
+
1903
+ interface ToolWasteFinding {
1904
+ runId: string;
1905
+ wastedCalls: number;
1906
+ totalCalls: number;
1907
+ wasteRate: number;
1908
+ }
1909
+ interface ToolWasteReport {
1910
+ byRun: ToolWasteFinding[];
1911
+ overallWasteRate: number;
1912
+ }
1913
+ interface ToolWasteOptions {
1914
+ runId?: string;
1915
+ usageOracle?: (tool: ToolSpan, later: {
1916
+ llm: Awaited<ReturnType<typeof llmSpans>>;
1917
+ }) => boolean;
1918
+ }
1919
+ declare function toolWasteView(store: TraceStore, options?: ToolWasteOptions): Promise<ToolWasteReport>;
1920
+
1921
+ /**
1922
+ * BudgetBreachView — aggregates breach events across the corpus.
1923
+ *
1924
+ * Answers: which dimensions get hit most often? Which scenarios are
1925
+ * underbudgeted? Which variants trigger the most breaches?
1926
+ */
1927
+
1928
+ interface BudgetBreachFinding {
1929
+ runId: string;
1930
+ scenarioId: string;
1931
+ variantId?: string;
1932
+ dimension: keyof BudgetSpec;
1933
+ limit: number;
1934
+ consumed: number;
1935
+ excessRatio: number;
1936
+ timestamp: number;
1937
+ }
1938
+ interface BudgetBreachReport {
1939
+ findings: BudgetBreachFinding[];
1940
+ byDimension: Record<string, number>;
1941
+ byScenario: Record<string, number>;
1942
+ byVariant: Record<string, number>;
1943
+ totalRuns: number;
1944
+ breachedRunRatio: number;
1945
+ }
1946
+ declare function budgetBreachView(store: TraceStore, options?: {
1947
+ scenarioId?: string;
1948
+ variantId?: string;
1949
+ }): Promise<BudgetBreachReport>;
1950
+
1951
+ /**
1952
+ * FailureClusterView — groups failed runs by (failureClass, triggerTool,
1953
+ * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
1954
+ *
1955
+ * Each cluster includes: N runs, scenarios affected, representative
1956
+ * error message, a proposed mitigation hint (rule → action table).
1957
+ */
1958
+
1959
+ interface FailureCluster {
1960
+ failureClass: FailureClass;
1961
+ /** Tool name when the trigger was a tool span, else undefined. */
1962
+ toolName?: string;
1963
+ /** First 16 chars of argHash — clusters similar args. */
1964
+ argPrefix?: string;
1965
+ runCount: number;
1966
+ scenarioIds: string[];
1967
+ exampleError?: string;
1968
+ exampleRunId: string;
1969
+ }
1970
+ interface FailureClusterReport {
1971
+ clusters: FailureCluster[];
1972
+ totalFailures: number;
1973
+ totalRuns: number;
1974
+ }
1975
+ declare function failureClusterView(store: TraceStore, options?: {
1976
+ rules?: FailureRule[];
1977
+ minClusterSize?: number;
1978
+ }): Promise<FailureClusterReport>;
1979
+
1980
+ /**
1981
+ * JudgeAgreementView — pairwise agreement between judges across the
1982
+ * corpus, grouped by dimension.
1983
+ *
1984
+ * Output drives two workflows:
1985
+ * - Judge robustness audit: "does Claude agree with GPT at κ ≥ 0.6?"
1986
+ * - Calibration tracking: κ vs golden human labels over time (by
1987
+ * providing a `humanGoldenJudgeId`).
1988
+ */
1989
+
1990
+ interface JudgePair {
1991
+ judgeA: string;
1992
+ judgeB: string;
1993
+ dimension: string;
1994
+ /** Number of (targetSpanId, dimension) tuples both judges scored. */
1995
+ commonItems: number;
1996
+ pearson: number;
1997
+ krippendorff: number;
1998
+ }
1999
+ interface JudgeAgreementReport {
2000
+ pairs: JudgePair[];
2001
+ dimensions: string[];
2002
+ judgeIds: string[];
2003
+ }
2004
+ declare function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport>;
2005
+
2006
+ /**
2007
+ * FirstDivergenceView — aligns two trajectories by step index, reports
2008
+ * the first step where they differ.
2009
+ *
2010
+ * "Differ" is configurable — default is (kind, toolName if tool, model
2011
+ * if llm). Use this view to attribute "why is variant B better?" to a
2012
+ * specific step rather than an aggregate mean delta.
2013
+ */
2014
+
2015
+ interface DivergenceReport {
2016
+ runA: string;
2017
+ runB: string;
2018
+ firstDivergenceIndex: number | null;
2019
+ aStep?: TrajectoryStep;
2020
+ bStep?: TrajectoryStep;
2021
+ reason?: string;
2022
+ /** Common prefix length (steps that matched). */
2023
+ commonPrefixLen: number;
2024
+ }
2025
+ interface DivergenceOptions {
2026
+ /** Returns true if two steps are considered equal. Default: kind + tool/model match. */
2027
+ stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
2028
+ }
2029
+ declare function firstDivergenceView(store: TraceStore, runA: string, runB: string, options?: DivergenceOptions): Promise<DivergenceReport>;
2030
+
2031
+ /**
2032
+ * Baseline regression detection.
2033
+ *
2034
+ * Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
2035
+ * to: "is this run measurably worse than baseline?" — with enough
2036
+ * statistical rigor to distinguish noise from drift.
2037
+ *
2038
+ * Uses:
2039
+ * - Welch's t-test (unequal variance) for per-metric mean comparison
2040
+ * - Cohen's d for effect size magnitude
2041
+ * - IQR for stability flag (unstable samples can't be trusted for comparisons)
2042
+ *
2043
+ * Returns a structured verdict: improved | regressed | stable | unstable.
2044
+ */
2045
+ interface MetricSamples {
2046
+ /** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
2047
+ metric: string;
2048
+ /** Whether higher values are better. */
2049
+ higherIsBetter: boolean;
2050
+ baseline: number[];
2051
+ candidate: number[];
2052
+ }
2053
+ interface MetricVerdict {
2054
+ metric: string;
2055
+ baselineMean: number;
2056
+ candidateMean: number;
2057
+ delta: number;
2058
+ cohensD: number;
2059
+ welchT: number;
2060
+ welchDf: number;
2061
+ welchP: number;
2062
+ stable: boolean;
2063
+ /** IQR of the combined samples — used as a rough stability indicator. */
2064
+ iqr: number;
2065
+ verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
2066
+ }
2067
+ interface BaselineReport {
2068
+ metrics: MetricVerdict[];
2069
+ /** True if any critical metric regressed. */
2070
+ hasRegression: boolean;
2071
+ /** True if any metric is unstable (too noisy to judge). */
2072
+ hasUnstable: boolean;
2073
+ }
2074
+ interface BaselineOptions {
2075
+ /** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
2076
+ effectThreshold?: number;
2077
+ /** p-value threshold for statistical significance (default 0.05). */
2078
+ alpha?: number;
2079
+ /** IQR/mean ratio above which samples are flagged unstable (default 0.30). */
2080
+ unstableCvThreshold?: number;
2081
+ }
2082
+ /**
2083
+ * Compare candidate samples against baseline per metric. Verdict logic:
2084
+ * - unstable: IQR/|mean| > threshold on either set — not enough signal
2085
+ * - improved: meaningful effect in the "better" direction AND p < alpha
2086
+ * - regressed: meaningful effect in the "worse" direction AND p < alpha
2087
+ * - stable: otherwise (no significant change)
2088
+ */
2089
+ declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
2090
+ /** Inter-quartile range; 0 when the sample has no spread. */
2091
+ declare function iqr(xs: number[]): number;
2092
+ /**
2093
+ * Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
2094
+ * CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
2095
+ * when df is large.
2096
+ */
2097
+ declare function welchsTTest(a: number[], b: number[]): {
2098
+ t: number;
2099
+ df: number;
2100
+ p: number;
2101
+ };
2102
+
2103
+ /**
2104
+ * RegressionView — compares a candidate slice to a baseline slice on a
2105
+ * named metric. Delegates the statistics (Welch's t-test, Cohen's d,
2106
+ * IQR stability) to `baseline.ts`.
2107
+ *
2108
+ * This is the entry point for CI regression gates: "given runs tagged
2109
+ * release=A and release=B, did any metric regress?"
2110
+ */
2111
+
2112
+ interface RegressionSpec {
2113
+ metric: string;
2114
+ higherIsBetter: boolean;
2115
+ /** Extract a scalar from a run. Default extractors handle common metrics. */
2116
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
2117
+ }
2118
+ interface RegressionOptions extends BaselineOptions {
2119
+ baseline: RunFilter;
2120
+ candidate: RunFilter;
2121
+ }
2122
+ declare function regressionView(store: TraceStore, metrics: RegressionSpec[], options: RegressionOptions): Promise<BaselineReport>;
2123
+
2124
+ /**
2125
+ * SLO gates — quantified pass/fail primitives beyond score thresholds.
2126
+ *
2127
+ * Lifted from ADC's sandbox eval suite. Each SLO defines a metric, a
2128
+ * threshold, and a severity (critical | warning). Critical breaches fail
2129
+ * the eval; warnings are reported but don't gate CI. Margin is the
2130
+ * ratio of actual to threshold for histogramming "how close are we?"
2131
+ *
2132
+ * Consumers assemble their own SLO arrays; DEFAULT_AGENT_SLOS covers
2133
+ * the generic agent flow (provision, first token, pass rate, cost).
2134
+ */
2135
+ type SloSeverity = 'critical' | 'warning';
2136
+ type SloComparator = 'lte' | 'gte';
2137
+ interface Slo {
2138
+ /** Stable identifier — must be unique within an SLO set. */
2139
+ id: string;
2140
+ /** Human description, shown in reports. */
2141
+ description: string;
2142
+ /** Metric key looked up in the candidate record. */
2143
+ metric: string;
2144
+ /** Whether the metric should stay below (lte) or above (gte) threshold. */
2145
+ comparator: SloComparator;
2146
+ /** Threshold value. */
2147
+ threshold: number;
2148
+ severity: SloSeverity;
2149
+ }
2150
+ interface SloCheckResult {
2151
+ slo: Slo;
2152
+ actual: number | undefined;
2153
+ passed: boolean;
2154
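+ /** actual/threshold for lte, threshold/actual for gte. >1 means safe margin; <1 means breach. 0 when actual is missing. NOTE(review): with the ratios as written, a passing metric yields a value ≤1, which contradicts ">1 means safe margin" — confirm against the implementation whether the ratios are actually inverted (threshold/actual for lte, actual/threshold for gte). */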
2155
+ margin: number;
2156
+ detail: string;
2157
+ }
2158
+ interface SloReport {
2159
+ results: SloCheckResult[];
2160
+ passedCritical: boolean;
2161
+ criticalBreaches: SloCheckResult[];
2162
+ warnings: SloCheckResult[];
2163
+ }
2164
+ /**
2165
+ * Evaluate an SLO set against a candidate metrics object. Missing metrics
2166
+ * count as breaches — if you declared it, you must measure it.
2167
+ */
2168
+ declare function checkSlos(metrics: Record<string, number>, slos: Slo[]): SloReport;
2169
+ /** Reference SLO set for agent-style evals. Tune per-product by cloning + overriding. */
2170
+ declare const DEFAULT_AGENT_SLOS: Slo[];
2171
+
2172
+ /**
2173
+ * Declarative oracles — ground-truth assertions without an LLM.
2174
+ *
2175
+ * Lifted from browser-agent-driver's _oracle.mjs. When you know the
2176
+ * expected outcome exactly (a URL, a text fragment, a JSON shape), you
2177
+ * don't need an LLM judge — you need a regex. These oracles are
2178
+ * composable pass/fail checks over an observation bundle.
2179
+ *
2180
+ * Each oracle returns { pass, detail, evidence? } and has a short
2181
+ * `id` for reporting. `evaluateOracles` runs a batch and aggregates.
2182
+ */
2183
+ interface OracleObservation {
2184
+ /** Final observable text output from the agent (response, page snapshot, stdout). */
2185
+ text?: string;
2186
+ /** Final URL — for browser-style scenarios. */
2187
+ url?: string;
2188
+ /** Any structured JSON the agent produced. */
2189
+ json?: unknown;
2190
+ /** Free-form context used by custom oracles. */
2191
+ context?: Record<string, unknown>;
2192
+ }
2193
+ interface OracleResult {
2194
+ id: string;
2195
+ pass: boolean;
2196
+ detail: string;
2197
+ evidence?: string;
2198
+ }
2199
+ interface Oracle {
2200
+ id: string;
2201
+ check(obs: OracleObservation): OracleResult;
2202
+ }
2203
+ declare function textInSnapshot(needle: string, opts?: {
2204
+ caseSensitive?: boolean;
2205
+ }): Oracle;
2206
+ declare function urlContains(fragment: string): Oracle;
2207
+ declare function jsonShape(expected: Record<string, unknown>): Oracle;
2208
+ declare function regexMatches(pattern: RegExp): Oracle;
2209
+ /**
2210
+ * Anti-bot detector — distinguishes genuine failures from blocked navigation
2211
+ * (cloudflare, recaptcha, etc). Returns an Oracle that PASSES when no block
2212
+ * marker is present; on block, detail names the blocker so runners can tag
2213
+ * results as "blocked" rather than "failed". Lifted from browser-agent-driver.
2214
+ */
2215
+ declare function notBlocked(): Oracle;
2216
+ interface OracleReport {
2217
+ results: OracleResult[];
2218
+ pass: boolean;
2219
+ passCount: number;
2220
+ failCount: number;
2221
+ /** 0-1 ratio of oracles passed. */
2222
+ score: number;
2223
+ }
2224
+ /** Run all oracles against one observation and aggregate. */
2225
+ declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): OracleReport;
2226
+
2227
+ /**
2228
+ * Cost tracker — token + USD accounting per scenario and per run.
2229
+ *
2230
+ * Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
2231
+ * optimizer needs to know "is the quality gain worth the cost delta?",
2232
+ * and every dashboard needs dollars-per-completed-task. MODEL_PRICING
2233
+ * from metrics.ts stays authoritative for estimate math; this module
2234
+ * adds the aggregation + per-scenario roll-up that was duplicated
2235
+ * across 4 verticals.
2236
+ */
2237
+ interface TokenSpec {
2238
+ inputTokens: number;
2239
+ outputTokens: number;
2240
+ cachedTokens?: number;
2241
+ reasoningTokens?: number;
2242
+ }
2243
+ interface CostEntry extends TokenSpec {
2244
+ scenarioId: string;
2245
+ model: string;
2246
+ /** Override estimate with an observed cost (e.g. from provider response). */
2247
+ actualCostUsd?: number;
2248
+ timestamp: number;
2249
+ /** Free-form tags (variant id, round #, etc.). */
2250
+ tags?: Record<string, string>;
2251
+ }
2252
+ interface ScenarioCost {
2253
+ scenarioId: string;
2254
+ entries: CostEntry[];
2255
+ totalInputTokens: number;
2256
+ totalOutputTokens: number;
2257
+ totalCachedTokens: number;
2258
+ totalCostUsd: number;
2259
+ /** Pass flag — set by consumer via markOutcome; used for cost-per-completed-task. */
2260
+ completed?: boolean;
2261
+ }
2262
+ declare class CostTracker {
2263
+ private byScenario;
2264
+ record(entry: Omit<CostEntry, 'timestamp'> & {
2265
+ timestamp?: number;
2266
+ }): CostEntry;
2267
+ markOutcome(scenarioId: string, completed: boolean): void;
2268
+ get(scenarioId: string): ScenarioCost | undefined;
2269
+ list(): ScenarioCost[];
2270
+ summary(): CostSummary;
2271
+ }
2272
+ interface CostSummary {
2273
+ scenarioCount: number;
2274
+ completedCount: number;
2275
+ totalInputTokens: number;
2276
+ totalOutputTokens: number;
2277
+ totalCostUsd: number;
2278
+ avgCostPerScenarioUsd: number;
2279
+ /** Total USD / completed scenarios — null when nothing completed. */
2280
+ costPerCompletedTaskUsd: number | null;
2281
+ }
2282
+
2283
+ /**
2284
+ * Pareto frontier — multi-objective optimization over candidate runs.
2285
+ *
2286
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
2287
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
2288
+ * ttfb), you rarely have a single "winner" — you have a set of
2289
+ * non-dominated candidates. This module exposes:
2290
+ *
2291
+ * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
2292
+ * - `dominates`: does A dominate B across all objectives?
2293
+ *
2294
+ * Each objective is declared with a direction: 'maximize' (higher=better)
2295
+ * or 'minimize' (lower=better). Candidates are any object; pass an
2296
+ * `objective(candidate)` accessor.
2297
+ */
2298
+ type Direction = 'maximize' | 'minimize';
2299
+ interface Objective<T> {
2300
+ /** Stable label used in reports. */
2301
+ name: string;
2302
+ direction: Direction;
2303
+ value: (candidate: T) => number;
2304
+ }
2305
+ interface ParetoResult<T> {
2306
+ frontier: T[];
2307
+ dominated: T[];
2308
+ /** Dominance map: each entry pairs a frontier candidate (`dominator`) with the candidates it dominates (`dominated`). */
2309
+ dominanceMap: Array<{
2310
+ dominator: T;
2311
+ dominated: T[];
2312
+ }>;
2313
+ }
2314
+ /** Does candidate A Pareto-dominate B — no worse on every objective and strictly better on at least one? */
2315
+ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
2316
+ /**
2317
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
2318
+ * objective are excluded (can't rank them). A candidate enters the frontier
2319
+ * iff no other candidate dominates it.
2320
+ */
2321
+ declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
2322
+
2323
+ /**
2324
+ * Series convergence — detects whether a sequence of scalar measurements
2325
+ * is stabilizing, drifting, or noisy.
2326
+ *
2327
+ * Lifted from ADC convergence.ts. The per-turn `ConvergenceTracker` is
2328
+ * about progress *within* a single run; this module is about drift
2329
+ * *across* runs (e.g. "are my nightly eval scores stabilizing?").
2330
+ *
2331
+ * Three signals:
2332
+ * - stabilized: last K values have low variance (< epsilon) — done
2333
+ * - drifting: recent trend is monotonic and beyond noise — regressing or improving
2334
+ * - noisy: neither — keep iterating, but flag as untrustworthy for gating
2335
+ */
2336
+ interface SeriesConvergenceOptions {
2337
+ /** Window size for "recent" analysis (default 5). */
2338
+ window?: number;
2339
+ /** Coefficient-of-variation threshold below which the window is stabilized (default 0.05 = 5%). */
2340
+ stableCv?: number;
2341
+ /** Minimum monotone run length to call drift (default 3). */
2342
+ driftRun?: number;
2343
+ }
2344
+ interface SeriesConvergenceResult {
2345
+ state: 'stabilized' | 'drifting-up' | 'drifting-down' | 'noisy' | 'insufficient-data';
2346
+ windowMean: number;
2347
+ windowCv: number;
2348
+ /** Longest monotonic run at the tail of the series (positive for up, negative for down). */
2349
+ tailRun: number;
2350
+ /** True when n ≥ window AND windowCv ≤ stableCv. */
2351
+ stable: boolean;
2352
+ }
2353
+ declare function analyzeSeries(values: number[], options?: SeriesConvergenceOptions): SeriesConvergenceResult;
2354
+
2355
+ /**
2356
+ * State continuity scoring — measures how well a resumed/handed-off agent
2357
+ * preserves prior work.
2358
+ *
2359
+ * Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
2360
+ * session 1's work, the key question is: did it preserve key artifacts,
2361
+ * or start over and lose context? Each `ContinuityCheck` inspects one
2362
+ * aspect (file preserved, key count grew, status advanced) and yields
2363
+ * 0-1 credit; the aggregate is the simple mean.
2364
+ *
2365
+ * Generic over any "snapshot" shape — pass your own checks.
2366
+ */
2367
+ interface ContinuitySnapshotPair<T> {
2368
+ before: T;
2369
+ after: T;
2370
+ }
2371
+ interface ContinuityCheck<T> {
2372
+ /** Stable identifier; shown in the report. */
2373
+ id: string;
2374
+ /** Description of what this check measures. */
2375
+ description: string;
2376
+ /** Returns 0..1 credit for this dimension (1 = fully preserved/improved). */
2377
+ score: (pair: ContinuitySnapshotPair<T>) => number;
2378
+ }
2379
+ interface ContinuityCheckResult {
2380
+ id: string;
2381
+ description: string;
2382
+ score: number;
2383
+ pass: boolean;
2384
+ }
2385
+ interface ContinuityReport {
2386
+ results: ContinuityCheckResult[];
2387
+ /** Mean of per-check scores, in 0..1. */
2388
+ overallScore: number;
2389
+ /** True iff ALL checks scored ≥ passThreshold. */
2390
+ pass: boolean;
2391
+ }
2392
+ declare function scoreContinuity<T>(pair: ContinuitySnapshotPair<T>, checks: ContinuityCheck<T>[], options?: {
2393
+ passThreshold?: number;
2394
+ }): ContinuityReport;
2395
+ /** Common check: a required key in a record exists and equals the prior value. */
2396
+ declare function keyPreserved<T extends Record<string, unknown>>(key: keyof T & string): ContinuityCheck<T>;
2397
+ /** Common check: a collection (array) grew or stayed the same size. */
2398
+ declare function collectionPreserved<T, K extends keyof T & string>(key: K, minRatio?: number): ContinuityCheck<T>;
2399
+ /** Common check: a status field advanced in an expected order. */
2400
+ declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
2401
+
2402
+ /**
2403
+ * Dataset — versioned, sliceable, content-hashed scenario collection.
2404
+ *
2405
+ * Scenarios stop being ephemeral arrays and become first-class
2406
+ * artifacts. Every Dataset carries:
2407
+ * - content hash (sha256 over canonicalized scenario array)
2408
+ * - provenance (contributor, createdAt, sourceUrl)
2409
+ * - split labels (train | dev | test | holdout)
2410
+ * - difficulty tiers (easy | medium | hard | extreme)
2411
+ * - tags (free-form, per-scenario)
2412
+ *
2413
+ * `Dataset.slice({ difficulty, split, includeHoldout, seed })` returns a
2414
+ * deterministic, reproducible subset. Holdout slices are locked: you
2415
+ * can read them but mutation (`add` / `remove`) throws, which prevents "oh I'll just
2416
+ * tweak that one scenario" contamination drift.
2417
+ */
2418
+ type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
2419
+ type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
2420
+ interface DatasetScenario {
2421
+ id: string;
2422
+ /** Arbitrary payload; the framework doesn't interpret it. */
2423
+ payload: unknown;
2424
+ split?: DatasetSplit;
2425
+ difficulty?: DatasetDifficulty;
2426
+ /** Canary token that MUST NOT round-trip through a correct agent output. */
2427
+ canary?: string;
2428
+ tags?: Record<string, string>;
2429
+ }
2430
+ interface DatasetProvenance {
2431
+ contributor?: string;
2432
+ createdAt: string;
2433
+ sourceUrl?: string;
2434
+ license?: string;
2435
+ description?: string;
2436
+ /** Monotonic human-readable version (e.g. "2026.04.20"). */
2437
+ version: string;
2438
+ }
2439
+ interface DatasetManifest {
2440
+ name: string;
2441
+ provenance: DatasetProvenance;
2442
+ /** sha256 hex over canonicalized scenarios. */
2443
+ contentHash: string;
2444
+ scenarioCount: number;
2445
+ splitCounts: Record<DatasetSplit, number>;
2446
+ }
2447
+ interface SliceOptions {
2448
+ split?: DatasetSplit;
2449
+ difficulty?: DatasetDifficulty;
2450
+ /** Number of scenarios (random sample, seeded). Omit to take all that match. */
2451
+ limit?: number;
2452
+ seed?: number;
2453
+ /** Predicate narrowing. Applied after split/difficulty filters. */
2454
+ filter?: (scenario: DatasetScenario) => boolean;
2455
+ /** If true, include scenarios marked as holdout. Default false. */
2456
+ includeHoldout?: boolean;
2457
+ }
2458
+ /** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
2459
+ declare class HoldoutLockedError extends Error {
2460
+ constructor(datasetName: string);
2461
+ }
2462
+ declare class Dataset {
2463
+ readonly name: string;
2464
+ readonly provenance: DatasetProvenance;
2465
+ private scenarios;
2466
+ private locked;
2467
+ constructor(init: {
2468
+ name: string;
2469
+ provenance: DatasetProvenance;
2470
+ scenarios: DatasetScenario[];
2471
+ locked?: boolean;
2472
+ });
2473
+ /** All scenarios. Readonly — callers must go through `slice` or `clone`. */
2474
+ all(): readonly DatasetScenario[];
2475
+ get size(): number;
2476
+ /**
2477
+ * Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
2478
+ * the same arguments always produce the same slice across machines.
2479
+ */
2480
+ slice(options?: SliceOptions): DatasetScenario[];
2481
+ /**
2482
+ * Assemble the manifest (name + provenance + content hash + counts).
2483
+ * Content hash is deterministic over canonicalized scenarios.
2484
+ */
2485
+ manifest(): Promise<DatasetManifest>;
2486
+ /** Fresh unlocked copy — for post-release forks when mutation is needed. */
2487
+ clone(overrides?: Partial<{
2488
+ name: string;
2489
+ version: string;
2490
+ }>): Dataset;
2491
+ lock(): void;
2492
+ add(scenario: DatasetScenario): void;
2493
+ remove(scenarioId: string): void;
2494
+ /**
2495
+ * Stable JSON-Lines serialization — deterministic byte-for-byte.
2496
+ * Write to disk for contamination-verifiable archives.
2497
+ */
2498
+ toJsonl(): string;
2499
+ static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
2500
+ }
2501
+ declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
2502
+
2503
+ /**
2504
+ * ContaminationGuard — ensures held-out scenarios don't leak into
2505
+ * training/prompt paths, and flags model memorization.
2506
+ *
2507
+ * Three probes:
2508
+ * 1. `checkCanaries(output, scenarios)` — if a scenario carries a
2509
+ * canary token, it MUST NOT appear in the agent's output.
2510
+ * Canaries are strings that are statistically impossible to
2511
+ * reconstruct from the scenario description alone — so if they
2512
+ * echo back, the model memorized them.
2513
+ * 2. `canaryLeakView(store)` — cross-corpus view of every run whose
2514
+ * output contained a canary, with the offending scenario + run.
2515
+ * 3. `HoldoutAuditor` — wraps a set of holdout scenarios and emits a structured error
2516
+ * on any code path that reads holdout scenarios but doesn't flag
2517
+ * `purpose: 'evaluation'`. Keeps engineers honest.
2518
+ */
2519
+
2520
+ interface CanaryLeak {
2521
+ scenarioId: string;
2522
+ canary: string;
2523
+ runId?: string;
2524
+ evidence: string;
2525
+ }
2526
+ declare function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[];
2527
+ /**
2528
+ * Scan the LLM-output history in a corpus; returns every case where a
2529
+ * canary from a known scenario appeared in agent output. Pass the full
2530
+ * set of scenarios whose canaries you care about (typically the whole
2531
+ * held-out slice).
2532
+ */
2533
+ declare function canaryLeakView(store: TraceStore, scenarios: DatasetScenario[]): Promise<CanaryLeak[]>;
2534
+ declare class HoldoutAuditor {
2535
+ private scenarios;
2536
+ private accessLog;
2537
+ constructor(scenarios: DatasetScenario[]);
2538
+ /** Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws. */
2539
+ get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario;
2540
+ getAccessLog(): ReadonlyArray<{
2541
+ scenarioId: string;
2542
+ purpose: string;
2543
+ at: number;
2544
+ }>;
2545
+ }
2546
+
2547
+ /**
2548
+ * Red-team battery — adversarial scenario corpus with per-category
2549
+ * scorers.
2550
+ *
2551
+ * Every scenario is a `DatasetScenario` with a `redTeam: {...}` payload
2552
+ * that declares: category, expected refusal, attack surface. Scorers
2553
+ * inspect the agent's output + trajectory and return a PASS/FAIL per
2554
+ * category. Aggregated via `redTeamReport`.
2555
+ *
2556
+ * The built-in corpus is intentionally conservative — 40 cases
2557
+ * covering the most-seen attack patterns. Extend by passing `extraCases` to `redTeamDataset`.
2558
+ */
2559
+
2560
+ type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
2561
+ interface RedTeamPayload {
2562
+ category: RedTeamCategory;
2563
+ /** The hostile input the agent sees. */
2564
+ input: string;
2565
+ /**
2566
+ * What a safe agent should do: "refuse" means explicit refusal,
2567
+ * "ignore" means process benignly without executing the attack.
2568
+ */
2569
+ expected: 'refuse' | 'ignore';
2570
+ /** Substring(s) that MUST NOT appear in output. */
2571
+ forbiddenStrings?: string[];
2572
+ /** Tool names the agent MUST NOT call for this scenario. */
2573
+ forbiddenTools?: string[];
2574
+ }
2575
+ interface RedTeamCase extends DatasetScenario {
2576
+ payload: RedTeamPayload;
2577
+ }
2578
+ interface RedTeamFinding {
2579
+ scenarioId: string;
2580
+ category: RedTeamCategory;
2581
+ passed: boolean;
2582
+ reason: string;
2583
+ evidence?: string;
2584
+ }
2585
+ interface RedTeamReport {
2586
+ findings: RedTeamFinding[];
2587
+ passRateByCategory: Record<RedTeamCategory, number>;
2588
+ overallPassRate: number;
2589
+ }
2590
+ /** Ship a canonical, small corpus. Consumers extend it by passing `extraCases` to `redTeamDataset`. */
2591
+ declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
2592
+ declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
2593
+ /**
2594
+ * Score a single agent response against the red-team case.
2595
+ * `toolCalls` should be the names of tools the agent invoked during the run.
2596
+ */
2597
+ declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
2598
+ /** Aggregate red-team findings into per-category pass rates. */
2599
+ declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
2600
+ /**
2601
+ * Extract the tool-call names from a corpus run — convenience for the
2602
+ * common pipeline (run the scenario → score the run).
2603
+ */
2604
+ declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
2605
+
2606
+ /**
2607
+ * Power analysis + multiple-comparison correction.
2608
+ *
2609
+ * Two jobs:
2610
+ * 1. Before running: `requiredSampleSize({ effect, alpha, power })`
2611
+ * returns the N per arm needed to detect a given effect size.
2612
+ * 2. After running: `benjaminiHochberg(pValues, fdr)` and
2613
+ * `bonferroni(pValues, alpha)` correct for multiple pairwise tests
2614
+ * so PromptOptimizer's "significant" flag is statistically honest.
2615
+ *
2616
+ * Fixes the correctness bug in 0.2's PromptOptimizer which applied
2617
+ * alpha directly across n*(n-1)/2 pairwise tests without correction —
2618
+ * dramatically inflating false-positive rate when variants ≥ 3.
2619
+ */
2620
+ /**
2621
+ * Required N per arm for a two-sample comparison at target effect size,
2622
+ * alpha, and power. Uses the normal-approximation formula:
2623
+ *
2624
+ * n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
2625
+ *
2626
+ * where d is Cohen's d. Returns Infinity for effect ≤ 0.
2627
+ */
2628
+ declare function requiredSampleSize(opts: {
2629
+ effect: number;
2630
+ alpha?: number;
2631
+ power?: number;
2632
+ twoSided?: boolean;
2633
+ }): number;
2634
+ /** Bonferroni adjustment: multiply every p-value by the number of tests, clamp at 1. */
2635
+ declare function bonferroni(pValues: number[], alpha?: number): {
2636
+ adjusted: number[];
2637
+ significant: boolean[];
2638
+ };
2639
+ /**
2640
+ * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
2641
+ * significance at the target FDR. Properly handles ties and preserves
2642
+ * monotonicity of q-values.
2643
+ */
2644
+ declare function benjaminiHochberg(pValues: number[], fdr?: number): {
2645
+ qValues: number[];
2646
+ significant: boolean[];
2647
+ };
2648
+
2649
+ /**
2650
+ * Behavior DSL — pytest-style assertions over a run's trajectory.
2651
+ *
2652
+ * Shape:
2653
+ * expectAgent(store, runId).toCall('search').withArgs({ q: /.+/ })
2654
+ * expectAgent(store, runId).toRefuse()
2655
+ * expectAgent(store, runId).toOutputMatch(/confirmed/i)
2656
+ * expectAgent(store, runId).toRespectBudget('tokens')
2657
+ * expectAgent(store, runId).toCompleteWithin({ wallMs: 30_000 })
2658
+ *
2659
+ * Each matcher returns an `Expectation` with `.check() → MatcherResult`
2660
+ * so the DSL is composable with suite runners — you can collect all
2661
+ * expectations into a report instead of throwing on first failure.
2662
+ */
2663
+
2664
+ interface MatcherResult {
2665
+ ok: boolean;
2666
+ detail: string;
2667
+ evidence?: string;
2668
+ }
2669
+ interface Expectation {
2670
+ /** Human-facing label; used in reports. */
2671
+ label: string;
2672
+ check(): Promise<MatcherResult>;
2673
+ }
2674
+ declare class BehaviorAssertion {
2675
+ private store;
2676
+ private runId;
2677
+ constructor(store: TraceStore, runId: string);
2678
+ toCall(toolName: string): CallExpectation;
2679
+ toRefuse(markers?: RegExp[]): Expectation;
2680
+ toOutputMatch(pattern: RegExp): Expectation;
2681
+ toRespectBudget(dimension: keyof BudgetLedgerEntry['dimension'] | 'tokens' | 'wallMs' | 'calls' | 'usd'): Expectation;
2682
+ toCompleteWithin(limits: {
2683
+ wallMs?: number;
2684
+ toolCalls?: number;
2685
+ llmTurns?: number;
2686
+ }): Expectation;
2687
+ toNeverCall(toolName: string): Expectation;
2688
+ }
2689
+ declare class CallExpectation implements Expectation {
2690
+ private store;
2691
+ private runId;
2692
+ private toolName;
2693
+ private argMatchers;
2694
+ private minCount;
2695
+ private maxCount;
2696
+ constructor(store: TraceStore, runId: string, toolName: string);
2697
+ get label(): string;
2698
+ withArgs(shape: Record<string, unknown | RegExp>): this;
2699
+ times(n: number): this;
2700
+ atLeast(n: number): this;
2701
+ atMost(n: number): this;
2702
+ check(): Promise<MatcherResult>;
2703
+ }
2704
+ declare function expectAgent(store: TraceStore, runId: string): BehaviorAssertion;
2705
+ /** Runs every expectation, collects results. Never throws. */
2706
+ declare function runExpectations(expectations: Expectation[]): Promise<{
2707
+ results: Array<{
2708
+ label: string;
2709
+ result: MatcherResult;
2710
+ }>;
2711
+ pass: boolean;
2712
+ passCount: number;
2713
+ failCount: number;
2714
+ }>;
2715
+
2716
+ /**
2717
+ * Judge calibration — measure judge quality against human gold + bias.
2718
+ *
2719
+ * Workflow:
2720
+ * 1. Build a golden set: {itemId, humanScore}[].
2721
+ * 2. Run candidate judges; each produces {itemId, score}.
2722
+ * 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
2723
+ * 4. Run bias probes (positional, verbosity, self-preference) to
2724
+ * detect systematic score inflation.
2725
+ *
2726
+ * Returns actionable diagnostics, not a single number. Consumers then
2727
+ * decide whether to trust the judge, retrain it, or add a tie-breaker.
2728
+ */
2729
+ interface GoldenItem {
2730
+ itemId: string;
2731
+ humanScore: number;
2732
+ /** Optional group used for per-group bias audits (e.g. model-of-output family). */
2733
+ group?: string;
2734
+ }
2735
+ interface CandidateScore {
2736
+ itemId: string;
2737
+ score: number;
2738
+ /** Optional — enables positional-bias analysis (did order matter?). */
2739
+ positionOfAInput?: 'first' | 'second';
2740
+ }
2741
+ interface CalibrationResult {
2742
+ n: number;
2743
+ pearson: number;
2744
+ /** Cohen's κ with quadratic weights over integer-rounded scores. */
2745
+ kappa: number;
2746
+ /** Mean absolute error vs human. */
2747
+ mae: number;
2748
+ /** Worst-5 miscalibrations (largest |judge - human|). */
2749
+ worstItems: Array<{
2750
+ itemId: string;
2751
+ judge: number;
2752
+ human: number;
2753
+ delta: number;
2754
+ }>;
2755
+ }
2756
+ declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
2757
+ interface PositionalBiasResult {
2758
+ /**
2759
+ * Score delta (first-position - second-position) averaged across items
2760
+ * presented in both positions. Non-zero = positional bias.
2761
+ */
2762
+ avgDelta: number;
2763
+ n: number;
2764
+ }
2765
+ /**
2766
+ * Feed the same items to the judge twice with A/B swapped and pass all
2767
+ * results here. Items that don't appear in both positions are ignored.
2768
+ */
2769
+ declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
2770
+ interface VerbosityBiasResult {
2771
+ /** Pearson correlation between output length and score. Strong positive = verbosity bias. */
2772
+ pearson: number;
2773
+ n: number;
2774
+ }
2775
+ declare function verbosityBias(samples: Array<{
2776
+ outputLen: number;
2777
+ score: number;
2778
+ }>): VerbosityBiasResult;
2779
+ interface SelfPreferenceResult {
2780
+ /** Mean judge score when judge's family matches output's family. */
2781
+ inFamilyMean: number;
2782
+ outOfFamilyMean: number;
2783
+ deltaMean: number;
2784
+ n: number;
2785
+ }
2786
+ /**
2787
+ * Pass the same scenarios scored with judge-model X grading outputs from
2788
+ * model X (in-family) and model Y (out-of-family). Non-zero delta
2789
+ * indicates self-preference.
2790
+ */
2791
+ declare function selfPreference(samples: Array<{
2792
+ score: number;
2793
+ inFamily: boolean;
2794
+ }>): SelfPreferenceResult;
2795
+
2796
+ /**
2797
+ * CI gate — evaluate a corpus against threshold contracts and generate
2798
+ * a human-readable PR/build comment.
2799
+ *
2800
+ * Three layers:
2801
+ * 1. `ThresholdContract` declarations (YAML-equivalent TS objects)
2802
+ * 2. `evaluateContract` runs the contracts against a TraceStore and
2803
+ * returns a structured report + overall pass/fail.
2804
+ * 3. `renderMarkdownReport` formats the report for GitHub PR comments.
2805
+ *
2806
+ * Consumers wrap this in their own `gh pr comment` / CI integration —
2807
+ * we don't ship the GitHub Action binary, just the library call that
2808
+ * the action invokes.
2809
+ */
2810
+
2811
+ interface ContractMetric {
2812
+ /** Metric id matching either a predefined key or a custom extractor. */
2813
+ metric: string;
2814
+ higherIsBetter: boolean;
2815
+ /** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
2816
+ maxRegression?: number;
2817
+ /** Optional extractor if the metric isn't in the default set. */
2818
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
2819
+ }
2820
+ interface ThresholdContract {
2821
+ name: string;
2822
+ baseline: RunFilter;
2823
+ candidate: RunFilter;
2824
+ metrics: ContractMetric[];
2825
+ slos?: Slo[];
2826
+ }
2827
+ interface ContractReport {
2828
+ name: string;
2829
+ baselineReport: BaselineReport;
2830
+ sloReport?: SloReport;
2831
+ breaches: string[];
2832
+ pass: boolean;
2833
+ }
2834
+ declare function evaluateContract(store: TraceStore, contract: ThresholdContract): Promise<ContractReport>;
2835
+ declare function renderMarkdownReport(reports: ContractReport[]): string;
2836
+
2837
+ /**
2838
+ * Observability adapters — bidirectional parity with production backends.
2839
+ *
2840
+ * `LangfuseAdapter` maps a Run's spans into Langfuse generation/score
2841
+ * records (schema-compatible; we don't depend on the SDK — consumers
2842
+ * POST the returned JSON to their Langfuse collector).
2843
+ *
2844
+ * `PrometheusEmitter` converts a TraceStore into a Prometheus text-
2845
+ * exposition-format string (counters + gauges for runs, tool calls,
2846
+ * errors, cost). Drop into a `/metrics` handler; no SDK needed.
2847
+ *
2848
+ * `replayTraceThroughJudge` is the canonical "re-score with a new
2849
+ * judge" path — takes an existing run, runs a judge function over
2850
+ * each LLM span, emits JudgeVerdict spans back into the store.
2851
+ */
2852
+
2853
+ interface LangfuseGeneration {
2854
+ id: string;
2855
+ traceId: string;
2856
+ name: string;
2857
+ model: string;
2858
+ input: unknown;
2859
+ output: unknown;
2860
+ startTime: string;
2861
+ endTime: string;
2862
+ usage: {
2863
+ input: number;
2864
+ output: number;
2865
+ total: number;
2866
+ totalCost: number;
2867
+ };
2868
+ metadata: Record<string, unknown>;
2869
+ }
2870
+ interface LangfuseScore {
2871
+ id: string;
2872
+ traceId: string;
2873
+ observationId: string;
2874
+ name: string;
2875
+ value: number;
2876
+ comment?: string;
2877
+ }
2878
+ interface LangfuseEnvelope {
2879
+ traceId: string;
2880
+ generations: LangfuseGeneration[];
2881
+ scores: LangfuseScore[];
2882
+ }
2883
+ declare function toLangfuseEnvelope(store: TraceStore, runId: string): Promise<LangfuseEnvelope>;
2884
+ declare function toPrometheusText(store: TraceStore): Promise<string>;
2885
+ interface JudgeReplayResult {
2886
+ spanId: string;
2887
+ targetSpanId: string;
2888
+ dimension: string;
2889
+ score: number;
2890
+ rationale?: string;
2891
+ }
2892
+ /**
2893
+ * Apply a judge function to every LLM span in a run and record the
2894
+ * results as JudgeVerdict spans. This is the canonical "no re-execution"
2895
+ * re-scoring path — you supply a pure judge `(llmSpan) → verdict`.
2896
+ */
2897
+ declare function replayTraceThroughJudge(store: TraceStore, runId: string, judge: {
2898
+ id: string;
2899
+ dimension: string;
2900
+ score: (span: LlmSpan) => Promise<{
2901
+ score: number;
2902
+ rationale?: string;
2903
+ evidence?: string;
2904
+ }>;
2905
+ }): Promise<JudgeReplayResult[]>;
2906
+
2907
+ /**
2908
+ * Paraphrase robustness — mutates a scenario prompt in structure-
2909
+ * preserving ways, re-scores, and reports score variance.
2910
+ *
2911
+ * Mutators are pure functions `(prompt: string, seed: number) => string`. Ship a
2912
+ * default set; consumers add domain-specific ones.
2913
+ *
2914
+ * Robustness score: 1 - stdDev(scores) / (mean if positive else 1).
2915
+ * A perfect agent returns the same answer regardless of typo / case /
2916
+ * reordering — any variance signals a brittle prompt.
2917
+ */
2918
+ type Mutator = (prompt: string, seed: number) => string;
2919
+ interface RobustnessResult {
2920
+ originalScore: number;
2921
+ variantScores: Array<{
2922
+ mutator: string;
2923
+ score: number;
2924
+ mutated: string;
2925
+ }>;
2926
+ meanScore: number;
2927
+ stdDev: number;
2928
+ robustness: number;
2929
+ }
2930
+ declare function paraphraseRobustness(prompt: string, mutators: Array<{
2931
+ id: string;
2932
+ fn: Mutator;
2933
+ }>, scoreFn: (prompt: string) => Promise<number>, options?: {
2934
+ seed?: number;
2935
+ }): Promise<RobustnessResult>;
2936
+ /** Lowercase the whole prompt. Robust models ignore case. */
2937
+ declare const lowercaseMutator: Mutator;
2938
+ /** Reorder sentences. Robust models don't depend on sentence order. */
2939
+ declare const sentenceReorderMutator: Mutator;
2940
+ /** Swap adjacent letter pairs (1 per 40 chars, min 1). Robust models tolerate typos. */
2941
+ declare const typoMutator: Mutator;
2942
+ /** Add a benign politeness prefix. Robust models ignore flattery. */
2943
+ declare const politenessPrefixMutator: Mutator;
2944
+ /** Compact whitespace, strip newlines. Robust models don't depend on formatting. */
2945
+ declare const whitespaceCollapseMutator: Mutator;
2946
+ declare const DEFAULT_MUTATORS: Array<{
2947
+ id: string;
2948
+ fn: Mutator;
2949
+ }>;
2950
+
2951
+ /**
2952
+ * Visual diff — pixel-delta scoring for UI / visual outputs.
2953
+ *
2954
+ * Minimal dependency-free implementation: accepts two decoded RGBA pixel
2955
+ * arrays + width/height and returns a Δ ratio + max channel delta.
2956
+ * Consumers supply the decoded pixel arrays (we don't pull a PNG
2957
+ * decoder into the core — use `sharp`, `@napi-rs/canvas`, or Playwright
2958
+ * in the driving test and pass the result here).
2959
+ */
2960
+ interface ImageData {
2961
+ width: number;
2962
+ height: number;
2963
+ /** Pixel data in RGBA order, 4 bytes per pixel. */
2964
+ data: Uint8Array | Uint8ClampedArray;
2965
+ }
2966
+ interface VisualDiffResult {
2967
+ /** Ratio of pixels differing beyond `tolerance` (0..1). */
2968
+ diffRatio: number;
2969
+ differingPixels: number;
2970
+ totalPixels: number;
2971
+ maxChannelDelta: number;
2972
+ /** Status for dashboards: unchanged (< 0.1%), changed, or severely-changed (> 5%). */
2973
+ status: 'unchanged' | 'changed' | 'severely-changed';
2974
+ }
2975
+ interface VisualDiffOptions {
2976
+ /** Pixels whose max-channel delta is ≤ this are considered unchanged. Default 8/255. */
2977
+ tolerance?: number;
2978
+ }
2979
+ declare function visualDiff(a: ImageData, b: ImageData, options?: VisualDiffOptions): VisualDiffResult;
2980
+ /** Convenience: diffs two byte-identical-dim RGBA arrays, returns just the ratio. */
2981
+ declare function pixelDeltaRatio(a: Uint8Array, b: Uint8Array, width: number, height: number, tolerance?: number): number;
2982
+
2983
+ /**
2984
+ * BuilderSession — ties a builder-of-builders workflow together.
2985
+ *
2986
+ * Models agent-builder's shape: Project → Chat → Edit → Ship → App →
2987
+ * AppAgent. Each layer is a Run (linked via parentRunId). The
2988
+ * framework-enforced invariants:
2989
+ *
2990
+ * - One Project → many Chats; chatId scopes runs within a project.
2991
+ * - One Chat = one builder Run with `layer='builder'`.
2992
+ * - One Ship = one child Run with `layer='app-build'` + SandboxHarness.
2993
+ * - One AppScenario = one grandchild Run with `layer='app-runtime'`.
2994
+ *
2995
+ * Consumers obtain a BuilderSession, call `startChat`, drive the
2996
+ * builder agent (emitting spans), and call `ship` / `runAppScenario`
2997
+ * as the workflow progresses. The session reconstructs itself from
2998
+ * trace data via `resumeBuilderSession(store, projectId)`.
2999
+ */
3000
+
3001
+ interface BuilderSessionInit {
3002
+ projectId: string;
3003
+ chatId?: string;
3004
+ /** Free-form: user's task description, project name, etc. Stored on the builder Run. */
3005
+ tags?: Record<string, string>;
3006
+ }
3007
+ interface ShipOptions {
3008
+ harness: HarnessConfig;
3009
+ driver?: SandboxDriver;
3010
+ /** scenarioId of this app-build run. Defaults to `${projectId}/build`. */
3011
+ scenarioId?: string;
3012
+ }
3013
+ interface RunAppScenarioOptions {
3014
+ scenario: TestGradedScenario;
3015
+ /** Harness driver override; defaults to the one the session was created with. */
3016
+ driver?: SandboxDriver;
3017
+ }
3018
+ declare class BuilderSession {
3019
+ private store;
3020
+ private builderEmitter;
3021
+ readonly projectId: string;
3022
+ readonly chatId: string;
3023
+ private builderRunId?;
3024
+ private lastBuildRunId?;
3025
+ private defaultDriver?;
3026
+ constructor(store: TraceStore, init: BuilderSessionInit, driver?: SandboxDriver);
3027
+ /** Start the builder (L0) run for this chat. Returns the runId. */
3028
+ startChat(scenarioId?: string): Promise<string>;
3029
+ /** The emitter for builder-level spans (edits, LLM calls, tool invocations). */
3030
+ get emitter(): TraceEmitter;
3031
+ /**
3032
+ * Ship the project's generated app: run the sandbox harness as a child
3033
+ * Run (`layer='app-build'`). Returns the build result + runId.
3034
+ */
3035
+ ship(options: ShipOptions): Promise<{
3036
+ runId: string;
3037
+ result: SandboxHarnessResult;
3038
+ }>;
3039
+ /**
3040
+ * Run a domain scenario against the just-built app as a grandchild Run
3041
+ * (`layer='app-runtime'`). The `ship` call should precede this so the
3042
+ * parent is set correctly; if no build exists yet the session attaches
3043
+ * directly to the builder run (useful for prototypes).
3044
+ */
3045
+ runAppScenario(options: RunAppScenarioOptions): Promise<TestGradedRunResult>;
3046
+ /** Record an end-of-chat meta score (judge verdict on whether the builder
3047
+ * served the user's intent). Accepts a numeric score + optional rationale. */
3048
+ recordMetaScore(score: number, rationale?: string): Promise<void>;
3049
+ /** Close the builder Run with a final outcome. */
3050
+ endChat(outcome: {
3051
+ pass: boolean;
3052
+ score?: number;
3053
+ notes?: string;
3054
+ }): Promise<void>;
3055
+ /**
3056
+ * Inline app-runtime run — for cases where the "scenario" isn't a
3057
+ * SWE-bench-style test suite but a live agent interaction (LLM chat,
3058
+ * domain flow). Returns an emitter bound to a fresh Run in the
3059
+ * `app-runtime` layer; caller emits spans inside and calls
3060
+ * `.endRun()` with the final verdict.
3061
+ */
3062
+ startAppRuntime(scenarioId: string): Promise<TraceEmitter>;
3063
+ /**
3064
+ * Lightweight "ship marker" — record an app-build Run with a caller-
3065
+ * provided verdict. Use when there isn't a sandbox harness to run but
3066
+ * you still want to mark the build state at publish time.
3067
+ */
3068
+ recordShipMarker(args: {
3069
+ pass: boolean;
3070
+ score: number;
3071
+ scenarioId?: string;
3072
+ notes?: string;
3073
+ }): Promise<string>;
3074
+ get lastBuildRunIdValue(): string | undefined;
3075
+ get builderRunIdValue(): string | undefined;
3076
+ }
3077
+ /**
3078
+ * Reconstruct the most recent BuilderSession state for a given project —
3079
+ * returns { projectId, chatRuns, lastBuilderRun, lastBuildRun, lastAppRuntimeRuns }. For chat-first UIs
3080
+ * this is how a resumed session finds its place in the edit history.
3081
+ */
3082
+ declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
3083
+ projectId: string;
3084
+ chatRuns: Run[];
3085
+ lastBuilderRun?: Run;
3086
+ lastBuildRun?: Run;
3087
+ lastAppRuntimeRuns: Run[];
3088
+ }>;
3089
+
3090
+ /**
3091
+ * Three-layer evaluation — the canonical scoring breakdown for
3092
+ * builder-of-builders workflows.
3093
+ *
3094
+ * meta_score: did the builder understand + satisfy user intent?
3095
+ * (judge verdict attached to the builder run)
3096
+ * build_score: did the generated scaffold build + pass its own tests?
3097
+ * (outcome.score on the app-build child run)
3098
+ * runtime_score: did the generated agent pass its domain scenarios?
3099
+ * (mean outcome.score over app-runtime grandchild runs)
3100
+ *
3101
+ * Returns a structured report per project. The cross-layer correlation
3102
+ * is the highest-leverage signal the framework computes — if
3103
+ * meta_score doesn't predict runtime_score, the builder's self-scoring
3104
+ * is broken.
3105
+ */
3106
+
3107
+ interface ThreeLayerProjectReport {
3108
+ projectId: string;
3109
+ builderRunId?: string;
3110
+ /** Judge-verdict score on the builder run (0..1 after normalization). */
3111
+ metaScore: number | null;
3112
+ buildRunId?: string;
3113
+ /** 0..1 from the sandbox harness (testsPassed / testsTotal). */
3114
+ buildScore: number | null;
3115
+ appRuntimeRunIds: string[];
3116
+ /** Mean of outcome.score over app-runtime runs, 0..1. */
3117
+ runtimeScore: number | null;
3118
+ runtimePassRate: number | null;
3119
+ /** True when all three layers produced a score. */
3120
+ complete: boolean;
3121
+ }
3122
+ declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
3123
+ /** Aggregate scoring across every project in a corpus. */
3124
+ declare function scoreAllProjects(store: TraceStore): Promise<ThreeLayerProjectReport[]>;
3125
+
3126
+ /**
3127
+ * Meta-eval correlation — the highest-leverage signal in the framework.
3128
+ *
3129
+ * Given a corpus of three-layer project reports, compute how well each
3130
+ * pair of layers correlates. The question we care about most:
3131
+ *
3132
+ * Does `metaScore` (what the builder thinks it did) predict
3133
+ * `runtimeScore` (what the user actually gets)?
3134
+ *
3135
+ * If r < ~0.4, the builder's self-scoring is broken — it's optimizing
3136
+ * for something other than real-world success. If r > 0.7, meta_score
3137
+ * is a usable proxy and can drive CI gates cheaply.
3138
+ *
3139
+ * Non-parametric rank correlation (Spearman) is also reported because
3140
+ * meta scores are often ordinal-ish.
3141
+ */
3142
+
3143
+ interface LayerCorrelation {
3144
+ n: number;
3145
+ pearson: number;
3146
+ spearman: number;
3147
+ }
3148
+ interface CorrelationReport {
3149
+ /** Pairs present in the corpus (layers with ≥ 2 matched data points). */
3150
+ metaVsBuild?: LayerCorrelation;
3151
+ metaVsRuntime?: LayerCorrelation;
3152
+ buildVsRuntime?: LayerCorrelation;
3153
+ /** Number of complete projects (all 3 scores present). */
3154
+ completeProjects: number;
3155
+ }
3156
+ declare function correlateLayers(reports: ThreeLayerProjectReport[]): CorrelationReport;
3157
+
3158
+ /**
3159
+ * ProjectRegistry — project-level aggregation over the trace corpus.
3160
+ *
3161
+ * Thin reader over TraceStore that answers the questions a chat-first,
3162
+ * resumable UI needs:
3163
+ * - listProjects() → project IDs with latest activity
3164
+ * - projectTimeline(id) → chats + builds + runtime runs, chronological
3165
+ * - projectChats(id) → chat-level summaries (turn count, outcome)
3166
+ *
3167
+ * All queries are pure reads; no state duplication.
3168
+ */
3169
+
3170
+ interface ProjectSummary {
3171
+ projectId: string;
3172
+ chatCount: number;
3173
+ buildCount: number;
3174
+ appRuntimeCount: number;
3175
+ lastActivityAt: number;
3176
+ latestChatId?: string;
3177
+ latestOutcome?: {
3178
+ pass: boolean;
3179
+ score?: number;
3180
+ };
3181
+ }
3182
+ interface ChatSummary {
3183
+ chatId: string;
3184
+ projectId: string;
3185
+ builderRunId: string;
3186
+ startedAt: number;
3187
+ endedAt?: number;
3188
+ status: Run['status'];
3189
+ outcome?: Run['outcome'];
3190
+ /** Counts of spans emitted during the chat. */
3191
+ llmTurns?: number;
3192
+ toolCalls?: number;
3193
+ buildRunId?: string;
3194
+ appRuntimeRunIds: string[];
3195
+ }
3196
+ interface ProjectTimelineEntry {
3197
+ run: Run;
3198
+ layerBucket: 'chat' | 'build' | 'runtime' | 'other';
3199
+ }
3200
+ declare class ProjectRegistry {
3201
+ private store;
3202
+ constructor(store: TraceStore);
3203
+ listProjects(): Promise<ProjectSummary[]>;
3204
+ projectTimeline(projectId: string): Promise<ProjectTimelineEntry[]>;
3205
+ projectChats(projectId: string): Promise<ChatSummary[]>;
3206
+ }
3207
+
3208
+ /**
3209
+ * OutcomeStore — deployment outcomes attached to Run IDs.
3210
+ *
3211
+ * Outcomes arrive asynchronously from production telemetry after the
3212
+ * eval run completed: user ratings, retention flags, conversion events,
3213
+ * revenue, support-ticket rate, anything a product team can measure.
3214
+ * The store is a peer to TraceStore — separate lifecycle, same runId
3215
+ * foreign key.
3216
+ *
3217
+ * The whole point of this module is to make the meta-eval correlation
3218
+ * question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
3219
+ */
3220
+ interface DeploymentOutcome {
3221
+ runId: string;
3222
+ capturedAt: number;
3223
+ /** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
3224
+ metrics: Record<string, number>;
3225
+ /** Dimensions for stratified analysis — cohort, region, user_segment. */
3226
+ labels?: Record<string, string>;
3227
+ /** Free-form provenance (source system, pipeline version). */
3228
+ source?: string;
3229
+ }
3230
+ interface OutcomeFilter {
3231
+ runIds?: string[];
3232
+ since?: number;
3233
+ until?: number;
3234
+ label?: {
3235
+ key: string;
3236
+ value: string;
3237
+ };
3238
+ source?: string;
3239
+ }
3240
+ interface OutcomeStore {
3241
+ append(outcome: DeploymentOutcome): Promise<void>;
3242
+ /** All outcomes attached to this run (a single run can have many — multiple
3243
+ * capture windows over deployment time). */
3244
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
3245
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3246
+ }
3247
+ declare class InMemoryOutcomeStore implements OutcomeStore {
3248
+ private items;
3249
+ append(outcome: DeploymentOutcome): Promise<void>;
3250
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
3251
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3252
+ }
3253
+ interface FileSystemOutcomeStoreOptions {
3254
+ dir: string;
3255
+ maxBytes?: number;
3256
+ }
3257
+ declare class FileSystemOutcomeStore implements OutcomeStore {
3258
+ private dir;
3259
+ private maxBytes;
3260
+ private memo?;
3261
+ private loaded;
3262
+ constructor(options: FileSystemOutcomeStoreOptions);
3263
+ private ensureDir;
3264
+ append(outcome: DeploymentOutcome): Promise<void>;
3265
+ private load;
3266
+ forRun(runId: string): Promise<DeploymentOutcome[]>;
3267
+ list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
3268
+ }
3269
+
3270
+ /**
3271
+ * Correlation study — "does our eval score predict real-world outcomes?"
3272
+ *
3273
+ * This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
3274
+ * joins on runId, computes Pearson + Spearman + bootstrap CI for every
3275
+ * (evalMetric, outcomeMetric) pair the caller declares.
3276
+ *
3277
+ * Without this number the framework is ornamental. With it and r > 0.6
3278
+ * the framework is a moat — no other agent-eval tool publishes one.
3279
+ */
3280
+
3281
+ interface EvalMetricSpec {
3282
+ id: string;
3283
+ /** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
3284
+ extract?: (run: Run, store: TraceStore) => Promise<number | null>;
3285
+ }
3286
+ interface OutcomePair {
3287
+ evalMetric: string;
3288
+ outcomeMetric: string;
3289
+ }
3290
+ interface CorrelationResult {
3291
+ evalMetric: string;
3292
+ outcomeMetric: string;
3293
+ n: number;
3294
+ pearson: number;
3295
+ spearman: number;
3296
+ /** 95% bootstrap CI for Pearson. */
3297
+ pearsonCi95: {
3298
+ lower: number;
3299
+ upper: number;
3300
+ };
3301
+ /** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */
3302
+ verdict: 'strong' | 'moderate' | 'weak';
3303
+ }
3304
+ interface CorrelationStudyResult {
3305
+ pairs: CorrelationResult[];
3306
+ joinedSamples: number;
3307
+ skippedRuns: number;
3308
+ }
3309
+ interface CorrelationStudyOptions {
3310
+ /** Only join outcomes captured within this window after run.startedAt. */
3311
+ maxCaptureLagMs?: number;
3312
+ /** Restrict to a subset of outcomes (cohort, region, source). */
3313
+ outcomeFilter?: OutcomeFilter;
3314
+ /** Which outcome per run to use when multiple exist. Default 'latest'. */
3315
+ reduction?: 'latest' | 'mean' | 'max';
3316
+ /** Bootstrap iterations for the CI. Default 500. */
3317
+ bootstrapIterations?: number;
3318
+ }
3319
+ declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
3320
+
3321
+ /**
3322
+ * Calibration curve — binned "if eval says X, what does reality show?"
3323
+ *
3324
+ * Companion to correlationStudy. Raw correlation is a single number;
3325
+ * the calibration curve shows *where* the eval is well-calibrated vs
3326
+ * overconfident / underconfident. Buckets the eval metric, computes
3327
+ * mean outcome per bucket, reports expected-calibration-error (ECE).
3328
+ */
3329
+
3330
+ interface CalibrationBin {
3331
+ lower: number;
3332
+ upper: number;
3333
+ n: number;
3334
+ evalMean: number;
3335
+ outcomeMean: number;
3336
+ /** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
3337
+ gap: number;
3338
+ }
3339
+ interface CalibrationReport {
3340
+ evalMetric: string;
3341
+ outcomeMetric: string;
3342
+ n: number;
3343
+ bins: CalibrationBin[];
3344
+ /** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
3345
+ ece: number;
3346
+ /** Max bin gap — the worst single bin; an upper bound on `ece`, which is a weighted mean of the bin gaps. */
3347
+ maxGap: number;
3348
+ }
3349
+ interface CalibrationOptions {
3350
+ bins?: number;
3351
+ /** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
3352
+ binning?: 'equal-width' | 'equal-frequency';
3353
+ /** Clip eval values to [lo, hi] before binning. */
3354
+ range?: {
3355
+ lo: number;
3356
+ hi: number;
3357
+ };
3358
+ }
3359
+ declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
3360
+
3361
+ /**
3362
+ * Process Reward Modeling — per-step rubric grading.
3363
+ *
3364
+ * A StepRubric inspects one span and returns a score + rationale.
3365
+ * PrmGrader applies an array of rubrics to every LLM span in a
3366
+ * trajectory (consumers can broaden to tool/retrieval spans via the
3367
+ * `kinds` filter on each rubric).
3368
+ *
3369
+ * Why this matters: outcome-only eval (did the final artifact work?)
3370
+ * gives sparse reward — most agent turns are unattributable. PRMs
3371
+ * densify the signal so optimizers and RL fine-tuning can assign
3372
+ * credit per turn.
3373
+ */
3374
+
3375
+ interface StepContext {
3376
+ trajectory: Trajectory;
3377
+ step: TrajectoryStep;
3378
+ /** Steps preceding `step` in trajectory order. */
3379
+ prior: TrajectoryStep[];
3380
+ /** Steps following `step`. */
3381
+ next: TrajectoryStep[];
3382
+ }
3383
+ interface StepRubric {
3384
+ id: string;
3385
+ /** Only grade spans of these kinds (default: all). */
3386
+ kinds?: Array<Span['kind']>;
3387
+ /** Weight in the aggregate score. Default 1. */
3388
+ weight?: number;
3389
+ /** Returns score in 0..1 + optional rationale/evidence. Return `null` to
3390
+ * skip grading (rubric doesn't apply to this step). */
3391
+ grade: (ctx: StepContext) => Promise<{
3392
+ score: number;
3393
+ rationale?: string;
3394
+ evidence?: string;
3395
+ } | null>;
3396
+ }
3397
+ interface GradedStep {
3398
+ spanId: string;
3399
+ rubricId: string;
3400
+ score: number;
3401
+ weight: number;
3402
+ rationale?: string;
3403
+ evidence?: string;
3404
+ }
3405
+ interface PrmGradedTrace {
3406
+ runId: string;
3407
+ steps: GradedStep[];
3408
+ /** Weighted mean of all graded steps; 0..1. */
3409
+ aggregateScore: number;
3410
+ /** Number of spans graded — useful for sanity-checking coverage. */
3411
+ gradedCount: number;
3412
+ /** Number of spans in the trajectory that no rubric matched. */
3413
+ ungradedCount: number;
3414
+ }
3415
+ declare class PrmGrader {
3416
+ private rubrics;
3417
+ constructor(rubrics: StepRubric[]);
3418
+ /**
3419
+ * Grade every eligible span in a run. Emits a JudgeVerdict span for each
3420
+ * (rubric × span) verdict so the result is visible to downstream pipelines
3421
+ * (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
3422
+ */
3423
+ grade(store: TraceStore, runId: string): Promise<PrmGradedTrace>;
3424
+ }
3425
+ /** Helper: reports whether a judge span is a verdict that PRM emitted, so downstream
3426
+ * pipelines can distinguish PRM verdicts from human or top-level LLM judges. */
3427
+ declare function isPrmVerdict(verdict: JudgeSpan): boolean;
3428
+
3429
+ /**
3430
+ * Built-in reference rubrics. Consumers combine these with domain
3431
+ * rubrics. All are deterministic, rule-based — cheap to run + easy
3432
+ * to unit-test. LLM-based rubrics are trivially authored by
3433
+ * following the StepRubric contract.
3434
+ */
3435
+
3436
+ /** Penalize very short or very long assistant outputs. */
3437
+ declare function outputLengthRubric(args?: {
3438
+ minChars?: number;
3439
+ maxChars?: number;
3440
+ weight?: number;
3441
+ }): StepRubric;
3442
+ /** Reward tool calls that succeeded (status='ok') with an informative result. */
3443
+ declare function toolSuccessRubric(args?: {
3444
+ weight?: number;
3445
+ }): StepRubric;
3446
+ /** Penalize tool calls that duplicate a prior call with identical args. */
3447
+ declare function toolNonRedundantRubric(args?: {
3448
+ weight?: number;
3449
+ }): StepRubric;
3450
+ /** Penalize LLM outputs that contain common refusal markers when a refusal
3451
+ * is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
3452
+ declare function nonRefusalRubric(args?: {
3453
+ markers?: RegExp[];
3454
+ weight?: number;
3455
+ }): StepRubric;
3456
+ /** Reward outputs that invoke the next-step tool the trajectory actually uses
3457
+ * (i.e. the LLM span announced "I will call X" and the following tool span IS X). */
3458
+ declare function toolIntentAlignmentRubric(args?: {
3459
+ weight?: number;
3460
+ }): StepRubric;
3461
+
3462
+ /**
3463
+ * Export PRM-graded traces as training data for downstream reward-model
3464
+ * fine-tuning. Canonical format is NDJSON serialized from `PrmTrainingSample`
3465
+ * records (runId, spanId, rubricId, score, context, …) so a small model can
3466
+ * learn to predict step rewards from step context.
3467
+ *
3468
+ * The framework doesn't train the model — we emit the data; callers
3469
+ * plug it into their preferred trainer (TRL, Unsloth, custom).
3470
+ */
3471
+
3472
+ interface PrmTrainingSample {
3473
+ runId: string;
3474
+ spanId: string;
3475
+ rubricId: string;
3476
+ score: number;
3477
+ /** Serialized step context — step + surrounding conversation. */
3478
+ context: {
3479
+ priorTurns: Array<{
3480
+ role: string;
3481
+ content: string;
3482
+ }>;
3483
+ step: {
3484
+ kind: Span['kind'];
3485
+ text: string;
3486
+ };
3487
+ };
3488
+ /** Optional evidence + rationale for auditability. */
3489
+ rationale?: string;
3490
+ evidence?: string;
3491
+ }
3492
+ declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
3493
+ contextWindow?: number;
3494
+ }): Promise<PrmTrainingSample[]>;
3495
+ /** NDJSON serialization — write to file or stream directly to a trainer. */
3496
+ declare function toNdjson(samples: PrmTrainingSample[]): string;
3497
+
3498
+ /**
3499
+ * Inference-time PRM scoring — pick the best of N candidate trajectories
3500
+ * using a trained reward model (or a rule-based PRM as a proxy).
3501
+ *
3502
+ * The canonical Best-of-N pattern: generate N completions, score each
3503
+ * with a PRM, pick the winner. Here the scoring loop is framework-agnostic
3504
+ * — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
3505
+ */
3506
+
3507
+ interface BestOfNResult {
3508
+ winner: PrmGradedTrace;
3509
+ ranked: PrmGradedTrace[];
3510
+ /** Standard deviation of aggregate scores — small = candidates were homogeneous. */
3511
+ stdDev: number;
3512
+ }
3513
+ declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
3514
+ /**
3515
+ * Weighted vote across multiple graders — use when you want a PRM ensemble
3516
+ * (e.g. rule-based + LLM-based + trained model). Each grader produces its
3517
+ * own ranking; we aggregate via rank-sum (Borda count) so no single grader
3518
+ * dominates via a different score scale.
3519
+ */
3520
+ declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
3521
+
3522
+ /**
3523
+ * Bisector — auto-locate the change that introduced an eval regression.
3524
+ *
3525
+ * Two shapes:
3526
+ * - `commitBisect` — walk an ordered SHA list, binary-search for the
3527
+ * first commit that fails.
3528
+ * - `promptBisect` — given a good and bad prompt, progressively port
3529
+ * paragraphs from good→bad to localize the breaking change.
3530
+ *
3531
+ * Generic `bisect<T>` lets callers drive any ordered state space
3532
+ * (dataset versions, config files, CLI flag combinations).
3533
+ */
3534
+ interface BisectOptions<T> {
3535
+ /** State known to pass. */
3536
+ good: T;
3537
+ /** State known to fail. */
3538
+ bad: T;
3539
+ /** Equality test on state values — default Object.is. */
3540
+ equals?: (a: T, b: T) => boolean;
3541
+ /** Pick the halfway state between good + bad. Return null when no further
3542
+ * split is possible (e.g. adjacent commits). */
3543
+ halfway: (good: T, bad: T) => T | null;
3544
+ /** Produce a verdict for a state. */
3545
+ runEval: (state: T) => Promise<{
3546
+ score: number;
3547
+ pass: boolean;
3548
+ }>;
3549
+ /** Hard cap on iterations (default 40 — covers ~1T ordered states). */
3550
+ maxIterations?: number;
3551
+ }
3552
+ interface BisectStep<T> {
3553
+ state: T;
3554
+ score: number;
3555
+ pass: boolean;
3556
+ }
3557
+ interface BisectResult<T> {
3558
+ /** The first bad state — typically `bad` in the final (good, bad) adjacent pair. */
3559
+ culprit: T;
3560
+ /** Ordered trace of all states evaluated. */
3561
+ path: BisectStep<T>[];
3562
+ /** True when we narrowed to an adjacent (good, bad) pair. */
3563
+ converged: boolean;
3564
+ /** True when `good` itself failed or `bad` itself passed — the caller's
3565
+ * premise was broken. */
3566
+ inputInconsistent: boolean;
3567
+ }
3568
+ declare function bisect<T>(options: BisectOptions<T>): Promise<BisectResult<T>>;
3569
+ /**
3570
+ * Commit bisect — `commits` is an ordered SHA list, oldest to newest.
3571
+ * `good` and `bad` must both be present in the list.
3572
+ */
3573
+ declare function commitBisect(options: {
3574
+ commits: string[];
3575
+ good: string;
3576
+ bad: string;
3577
+ runEval: (sha: string) => Promise<{
3578
+ score: number;
3579
+ pass: boolean;
3580
+ }>;
3581
+ maxIterations?: number;
3582
+ }): Promise<BisectResult<string>>;
3583
+ /**
3584
+ * Prompt bisect — splits the good and bad prompts into paragraphs, then
3585
+ * progressively replaces paragraphs in `good` with their counterparts
3586
+ * from `bad` to localize the offending change. Only works when the two
3587
+ * prompts have the same paragraph count (a common editorial workflow
3588
+ * constraint — one paragraph = one change unit).
3589
+ */
3590
+ declare function promptBisect(options: {
3591
+ good: string;
3592
+ bad: string;
3593
+ runEval: (prompt: string) => Promise<{
3594
+ score: number;
3595
+ pass: boolean;
3596
+ }>;
3597
+ maxIterations?: number;
3598
+ paragraphSplitter?: (prompt: string) => string[];
3599
+ }): Promise<BisectResult<string> & {
3600
+ offendingParagraphIndex?: number;
3601
+ }>;
3602
+
3603
+ /**
3604
+ * Counterfactual replay — "what would have happened if we'd changed
3605
+ * exactly one thing at turn N?"
3606
+ *
3607
+ * The framework does NOT drive the agent — it sets up the replay
3608
+ * context (prior spans, prior state, mutation spec) and records the
3609
+ * resulting divergence. Consumers supply an `executeFrom(ctx)` callback
3610
+ * that runs their agent starting from turn N with the mutation applied.
3611
+ *
3612
+ * Counterfactual runs are recorded as a new Run with `layer='meta'` and
3613
+ * `parentRunId = originalRunId`, so downstream diff + correlation
3614
+ * pipelines see them natively.
3615
+ */
3616
+
3617
+ type CounterfactualMutation = {
3618
+ kind: 'swap-model';
3619
+ at: number;
3620
+ newModel: string;
3621
+ } | {
3622
+ kind: 'swap-tool-result';
3623
+ at: number;
3624
+ newResult: unknown;
3625
+ } | {
3626
+ kind: 'truncate-after';
3627
+ at: number;
3628
+ } | {
3629
+ kind: 'inject-system-message';
3630
+ at: number;
3631
+ content: string;
3632
+ } | {
3633
+ kind: 'custom';
3634
+ at: number;
3635
+ describe: string;
3636
+ apply: (step: TrajectoryStep) => TrajectoryStep;
3637
+ };
3638
+ interface CounterfactualContext {
3639
+ originalRunId: string;
3640
+ originalTrajectory: Trajectory;
3641
+ /** Steps up to (but not including) the mutation point — the prefix the
3642
+ * replayed agent inherits as its prior conversation/tool history. */
3643
+ prefix: TrajectoryStep[];
3644
+ mutation: CounterfactualMutation;
3645
+ /** Pre-applied mutation on the step at `mutation.at`. Consumers use this
3646
+ * as the FIRST step the replayed agent emits (they decide whether to
3647
+ * re-emit it or continue from there). */
3648
+ mutatedStep: TrajectoryStep;
3649
+ }
3650
+ interface CounterfactualResult {
3651
+ counterfactualRunId: string;
3652
+ originalRunId: string;
3653
+ mutation: CounterfactualMutation;
3654
+ /** Structured delta summary — caller can extend via scoring. */
3655
+ delta: {
3656
+ originalOutcomeScore: number | null;
3657
+ counterfactualOutcomeScore: number | null;
3658
+ deltaScore: number | null;
3659
+ };
3660
+ }
3661
+ interface CounterfactualRunner {
3662
+ /**
3663
+ * Execute the agent from `ctx.prefix` with the mutation applied.
3664
+ * MUST emit spans into the provided emitter so they become part of
3665
+ * the counterfactual run. MUST call emitter.endRun() with a verdict.
3666
+ */
3667
+ executeFrom: (ctx: CounterfactualContext, emitter: TraceEmitter) => Promise<void>;
3668
+ }
3669
+ declare function runCounterfactual(store: TraceStore, originalRunId: string, mutation: CounterfactualMutation, runner: CounterfactualRunner): Promise<CounterfactualResult>;
3670
+ /**
3671
+ * Aggregate a batch of counterfactuals into a simple attribution table:
3672
+ * which mutation kinds move outcomes most? (Useful when you run a grid
3673
+ * over the same trajectory — swap-model at every llm span, swap-tool
3674
+ * at every tool span — and want a ranked summary.)
3675
+ */
3676
+ declare function attributeCounterfactuals(results: CounterfactualResult[]): Array<{
3677
+ mutationKind: CounterfactualMutation['kind'];
3678
+ n: number;
3679
+ meanAbsDelta: number;
3680
+ meanSignedDelta: number;
3681
+ }>;
3682
+
3683
+ /**
3684
+ * Full cross-trace diff — align two trajectories step-by-step, report
3685
+ * per-step score deltas, attribute a variant's total outcome lead to
3686
+ * specific turns.
3687
+ *
3688
+ * 0.5 shipped `firstDivergenceView` (finds the first differing step).
3689
+ * This does the heavier work: full alignment via LCS, per-step
3690
+ * contribution to score delta using PRM verdicts when available,
3691
+ * fallback to structural heuristics (latency, token count, tool
3692
+ * outcome) otherwise.
3693
+ */
3694
+
3695
+ type AlignmentOp = {
3696
+ op: 'match';
3697
+ a: TrajectoryStep;
3698
+ b: TrajectoryStep;
3699
+ } | {
3700
+ op: 'insert';
3701
+ b: TrajectoryStep;
3702
+ } | {
3703
+ op: 'delete';
3704
+ a: TrajectoryStep;
3705
+ } | {
3706
+ op: 'replace';
3707
+ a: TrajectoryStep;
3708
+ b: TrajectoryStep;
3709
+ };
3710
+ interface StepAttribution {
3711
+ op: AlignmentOp;
3712
+ /** Difference in PRM score (or null when not scored by a matching judge). */
3713
+ prmDelta: number | null;
3714
+ /** Difference in latency (endedAt - startedAt). */
3715
+ latencyDeltaMs: number | null;
3716
+ /** Difference in token count (LLM spans). */
3717
+ tokenDelta: number | null;
3718
+ /** Reason this step is / isn't considered a contributor to the outcome delta. */
3719
+ note: string;
3720
+ }
3721
+ interface CrossTraceDiff {
3722
+ runA: string;
3723
+ runB: string;
3724
+ alignment: AlignmentOp[];
3725
+ attributions: StepAttribution[];
3726
+ /** Total score delta (B - A). */
3727
+ totalScoreDelta: number | null;
3728
+ /** Sum of PRM deltas across matched/replaced steps. Close to
3729
+ * `totalScoreDelta` when PRM covers the trajectory; gap indicates
3730
+ * unmodeled variance. */
3731
+ prmDeltaSum: number;
3732
+ }
3733
+ interface CrossTraceDiffOptions {
3734
+ stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
3735
+ }
3736
+ declare function crossTraceDiff(store: TraceStore, runA: string, runB: string, options?: CrossTraceDiffOptions): Promise<CrossTraceDiff>;
3737
+
3738
+ /**
3739
+ * Pre-registered hypotheses — declare what you're testing BEFORE the
3740
+ * run, check it AFTER. Prevents p-hacking, optional stopping, and the
3741
+ * "we ran until it looked good" failure mode.
3742
+ *
3743
+ * Manifest is a plain JSON-friendly object. Sign it with a content hash
3744
+ * + timestamp; the registered record becomes immutable. Post-run,
3745
+ * evaluate the manifest against observed results — the library refuses
3746
+ * to let you re-interpret a different metric as the declared one.
3747
+ */
3748
+ interface HypothesisManifest {
3749
+ id: string;
3750
+ /** Human prose — goes into the audit trail. */
3751
+ hypothesis: string;
3752
+ /** Metric the hypothesis claims to move. */
3753
+ metric: string;
3754
+ /** 'increase' = candidate should score higher than baseline; 'decrease' = lower. */
3755
+ direction: 'increase' | 'decrease';
3756
+ /** Minimum effect size to count (same units as the metric). */
3757
+ minEffect: number;
3758
+ /** Alpha threshold. */
3759
+ alpha: number;
3760
+ /** Target statistical power at which sample size was pre-computed. */
3761
+ power: number;
3762
+ /** Declared N per arm before running. */
3763
+ preRegisteredN: number;
3764
+ /** ISO8601 timestamp the manifest was registered. */
3765
+ registeredAt: string;
3766
+ /** Optional identifiers to tie into the trace corpus. */
3767
+ baselineLabel?: string;
3768
+ candidateLabel?: string;
3769
+ }
3770
+ interface SignedManifest extends HypothesisManifest {
3771
+ /** sha256 hex of canonicalized manifest (everything except contentHash). */
3772
+ contentHash: string;
3773
+ }
3774
+ interface HypothesisResult {
3775
+ manifest: SignedManifest;
3776
+ observedN: number;
3777
+ observedEffect: number;
3778
+ observedPValue: number;
3779
+ /** True iff the observed effect hits the pre-declared direction with
3780
+ * magnitude ≥ minEffect AND p < alpha. NOTE(review): `rejectionReasons` also lists 'undersampled', so presumably observedN must reach preRegisteredN as well — confirm. */
3781
+ confirmed: boolean;
3782
+ /** Enumerated reasons the hypothesis was rejected (each a machine-tag). */
3783
+ rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
3784
+ notes?: string;
3785
+ }
3786
+ declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
3787
+ /** Verify that a signed manifest has not been tampered with. */
3788
+ declare function verifyManifest(m: SignedManifest): Promise<boolean>;
3789
+ /**
3790
+ * Evaluate a pre-registered hypothesis against observed results.
3791
+ * Mechanical — no re-interpretation permitted.
3792
+ */
3793
+ declare function evaluateHypothesis(manifest: SignedManifest, observed: {
3794
+ n: number;
3795
+ effect: number;
3796
+ pValue: number;
3797
+ }): Promise<HypothesisResult>;
3798
+
3799
+ /**
3800
+ * Self-play scenario evolution — agents generate adversarial scenarios
3801
+ * against each other; survivors become part of the eval corpus.
3802
+ *
3803
+ * Framework-agnostic about how scenarios are generated. Caller supplies:
3804
+ * - `propose`: asks a "proposer" agent for candidate scenarios
3805
+ * - `scoreAgainst`: runs a target agent against a scenario and returns
3806
+ * its score
3807
+ *
3808
+ * A scenario *survives* if it reveals a meaningful score difference
3809
+ * between two target agents (or between a target agent and itself on
3810
+ * different runs). Survivors are promoted to a Dataset; the caller
3811
+ * decides what to do with them (hold-out, training, regression set).
3812
+ *
3813
+ * Guard rails: minimum absolute score delta to consider a scenario
3814
+ * informative; floor on absolute target score so degenerate break-all
3815
+ * scenarios (noise, gibberish) don't flood the corpus.
3816
+ */
3817
+
3818
+ interface CandidateScenario {
3819
+ id: string;
3820
+ payload: unknown;
3821
+ /** Free-form tags (domain, generation, parent). */
3822
+ tags?: Record<string, string>;
3823
+ }
3824
+ interface ScoredTarget {
3825
+ targetId: string;
3826
+ score: number;
3827
+ }
3828
+ interface EvolutionRound {
3829
+ round: number;
3830
+ proposed: CandidateScenario[];
3831
+ survived: CandidateScenario[];
3832
+ rejected: Array<{
3833
+ candidate: CandidateScenario;
3834
+ reason: string;
3835
+ }>;
3836
+ scoredBreakdown: Array<{
3837
+ candidate: CandidateScenario;
3838
+ scores: ScoredTarget[];
3839
+ spread: number;
3840
+ }>;
3841
+ }
3842
+ interface SelfPlayOptions {
3843
+ /** Minimum score spread across targets for a scenario to survive. Default 0.1. */
3844
+ minSpread?: number;
3845
+ /** Minimum floor score across targets — keeps degenerate break-all scenarios
3846
+ * out. Default 0.1 (if every target scores below this, discard). */
3847
+ minAbsoluteFloor?: number;
3848
+ /** Hard cap on survivors per round. Default 50. */
3849
+ maxSurvivors?: number;
3850
+ /** Rounds to run. Default 1. Each round's survivors can be fed back into
3851
+ * `propose` to compound. */
3852
+ rounds?: number;
3853
+ /** Seed for scenario id generation if proposer doesn't provide one. */
3854
+ seed?: number;
3855
+ }
3856
+ interface SelfPlayProposer {
3857
+ propose(round: number, priorSurvivors: CandidateScenario[]): Promise<CandidateScenario[]>;
3858
+ }
3859
+ interface SelfPlayScorer {
3860
+ /** Score one candidate against every target; returns parallel array. */
3861
+ scoreCandidate(candidate: CandidateScenario, targets: string[]): Promise<ScoredTarget[]>;
3862
+ }
3863
+ declare function runSelfPlay(proposer: SelfPlayProposer, scorer: SelfPlayScorer, targets: string[], options?: SelfPlayOptions): Promise<{
3864
+ rounds: EvolutionRound[];
3865
+ dataset: Dataset;
3866
+ }>;
3867
+
3868
+ /**
3869
+ * Causal attribution via factorial experiments.
3870
+ *
3871
+ * Run every combination of {model × prompt × scenario × seed}, then
3872
+ * decompose observed score variance into main effects + interactions.
3873
+ * Moves from correlational "variant B is better" to causal "the model
3874
+ * swap accounts for 42% of the lead; the prompt change accounts for 28%;
3875
+ * interaction is 30%."
3876
+ *
3877
+ * Minimal implementation: 2-way factorial (two factors at a time) with
3878
+ * main-effect + interaction decomposition via variance of cell means.
3879
+ * Consumers run the factorial design themselves (we don't schedule
3880
+ * runs); this module consumes the (factorLevels, observedScores)
3881
+ * table and does the attribution math.
3882
+ */
3883
+ interface FactorialCell {
3884
+ /** Map factor name → level id. e.g. { model: 'claude', prompt: 'v2' } */
3885
+ levels: Record<string, string>;
3886
+ /** Observed score for this cell (mean over replications if n > 1). */
3887
+ score: number;
3888
+ /** Number of replications averaged to produce `score`. */
3889
+ n: number;
3890
+ }
3891
+ interface FactorContribution { // Element type of CausalAttributionReport.mainEffects.
3892
+ factor: string; // Name of the factor this entry describes.
3893
+ /** Variance attributed to this factor's main effect, as a fraction of total. */
3894
+ shareOfVariance: number;
3895
+ /** Range of cell means across levels of this factor. */
3896
+ range: number;
3897
+ }
3898
+ interface InteractionContribution { // Element type of CausalAttributionReport.interactions.
3899
+ factors: [string, string]; // The two factors whose interaction this entry describes.
3900
+ shareOfVariance: number; // Presumably a fraction of total variance, like FactorContribution.shareOfVariance — confirm.
3901
+ }
3902
+ interface CausalAttributionReport { // Result type returned by causalAttribution.
3903
+ totalVariance: number; // Total score variance being attributed — NOTE(review): exact estimator is not visible from this declaration.
3904
+ mainEffects: FactorContribution[]; // Main-effect contributions, one FactorContribution per entry.
3905
+ interactions: InteractionContribution[]; // Two-factor interaction contributions.
3906
+ /** Residual = variance unexplained by main effects + modeled interactions. */
3907
+ residualShare: number;
3908
+ /** Sanity check: sum of the shares; expected to equal 1 within floating-point error. */
3909
+ sharesSum: number;
3910
+ }
3911
+ declare function causalAttribution(cells: FactorialCell[]): CausalAttributionReport; // Consumes the factorial table of cells and returns the variance attribution report; it does not schedule the runs itself.
3912
+
3913
+ /**
3914
+ * Active learning — agent-as-scenario-author.
3915
+ *
3916
+ * Analyzes an existing Dataset + trace corpus for coverage gaps and
3917
+ * weak spots, returns a prioritized list of *synthesis targets*:
3918
+ * (gap description, existing-neighbor examples, suggested direction).
3919
+ *
3920
+ * Does NOT call an LLM itself — the proposer agent is caller-supplied.
3921
+ * This module's job is to identify WHERE new scenarios would compound
3922
+ * the most information, not to author them.
3923
+ *
3924
+ * Gaps we detect:
3925
+ * - dimensions with high score variance (unstable, need more data)
3926
+ * - dimensions with low coverage count (undersampled)
3927
+ * - failure classes with clusters (systematic weakness)
3928
+ * - difficulty bins with no coverage
3929
+ */
3930
+
3931
+ type SynthesisReason = 'high-variance' | 'undersampled' | 'failure-cluster' | 'difficulty-gap'; // Kinds of gap reported: high score variance, low coverage count, clustered failure class, uncovered difficulty bin.
3932
+ interface SynthesisTarget { // Element type of the list that proposeSynthesisTargets resolves to.
3933
+ reason: SynthesisReason; // Which kind of gap this target represents.
3934
+ description: string; // Description of the gap.
3935
+ /** Existing scenarios that are closest to the gap; caller feeds these to
3936
+ * their LLM proposer as few-shot examples. */
3937
+ neighbors: DatasetScenario[];
3938
+ /** Suggested direction — e.g. "harder variants", "edge cases of X", "failure class Y". */
3939
+ direction: string;
3940
+ /** Priority score in the range 0..1 — higher means a more information-dense gap. */
3941
+ priority: number;
3942
+ }
3943
+ interface ActiveLearningOptions { // Options for proposeSynthesisTargets; every field is optional and the defaults are not visible in this declaration.
3944
+ /** Minimum scenarios per difficulty band to count as "covered". */
3945
+ minPerBand?: number;
3946
+ /** Variance threshold above which a scenario's dimension is "unstable". */
3947
+ varianceThreshold?: number;
3948
+ /** Maximum number of synthesis targets returned. */
3949
+ topK?: number;
3950
+ }
3951
+ declare function proposeSynthesisTargets(dataset: Dataset, traceStore: TraceStore, options?: ActiveLearningOptions): Promise<SynthesisTarget[]>; // Analyzes the dataset and trace store for coverage gaps and resolves to synthesis targets; does not call an LLM itself.
3952
+
3953
+ /**
3954
+ * Reward-model export — the productizable wrapper around PRM training
3955
+ * data. Takes a TraceStore + PrmGrader, produces an embeddable
3956
+ * inference scorer that customers plug into their own agent stack.
3957
+ *
3958
+ * Two export forms:
3959
+ * - `exportRewardModel(store, grader, runIds)` — serializes the (step-context,
3960
+ * score) corpus to a framework-agnostic payload. Customer fine-tunes
3961
+ * their own model; we ship the scaffolding.
3962
+ * - `loadScorerFromGrader(grader)` — a zero-deps "reward model"
3963
+ * that literally replays the trained rubric at inference time. Works
3964
+ * as a reference baseline + deterministic fallback.
3965
+ */
3966
+
3967
+ interface ExportedRewardModel { // Payload that exportRewardModel resolves to.
3968
+ /** Version of the export format. Bump when payload shape changes. */
3969
+ version: '1.0';
3970
+ /** Metadata about the training corpus. */
3971
+ metadata: {
3972
+ nTraces: number; // Presumably the number of traces in the corpus — confirm.
3973
+ nSamples: number; // Presumably the number of training samples in `trainingNdjson` — confirm.
3974
+ rubrics: string[]; // Presumably identifiers of the rubrics that were applied — confirm.
3975
+ exportedAt: string; // Presumably an ISO 8601 timestamp of the export — confirm.
3976
+ /** Mean reward across training corpus — use as sanity check at load. */
3977
+ meanReward: number;
3978
+ };
3979
+ /** NDJSON training payload suitable for most fine-tuning frameworks. */
3980
+ trainingNdjson: string;
3981
+ }
3982
+ declare function exportRewardModel(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<ExportedRewardModel>; // Serializes the (step-context, score) corpus to a framework-agnostic payload; presumably limited to the runs named in `runIds` — confirm.
3983
+ /**
3984
+ * Zero-deps inference scorer — apply a grader to a trajectory and return
3985
+ * its aggregate score. This is the "reward model" customers embed when
3986
+ * they don't want to (or can't) fine-tune one. Deterministic + portable.
3987
+ */
3988
+ interface InferenceScorer {
3989
+ /** Score a completed trajectory. Higher is better. Resolves asynchronously. */
3990
+ score(trajectory: Trajectory, store: TraceStore): Promise<number>;
3991
+ metadata: { // Descriptive metadata about the scorer.
3992
+ rubrics: string[]; // Presumably identifiers of the rubrics the scorer applies — confirm.
3993
+ deterministic: true; // Literal `true`: the type only admits deterministic scorers.
3994
+ };
3995
+ }
3996
+ declare function loadScorerFromGrader(grader: PrmGrader): InferenceScorer; // Builds an InferenceScorer from a PrmGrader; synchronous.
3997
+ /**
3998
+ * Replay a trace corpus through a scorer — produces the canonical
3999
+ * "what would this reward model have said about every run?" table.
4000
+ * Callers use this to validate a trained model against the training
4001
+ * corpus (expect high agreement; drift indicates overfitting).
4002
+ * Resolves to an array of rows, each carrying a run id and its scores.
4003
+ */
4003
+ declare function replayScorerOverCorpus(store: TraceStore, scorer: InferenceScorer, runIds: string[]): Promise<Array<{
4004
+ runId: string; // Id of the run this row describes.
4005
+ score: number; // Score for the run, presumably from `scorer` — confirm.
4006
+ outcomeScore: number | null; // Presumably null when no outcome score is available for the run — confirm.
4007
+ }>>;
4008
+
4009
+ /**
4010
+ * Governance reporting — shared types.
4011
+ *
4012
+ * The framework collects a `GovernanceContext` (traces + outcomes +
4013
+ * dataset manifests + red-team results + judge calibration) and each
4014
+ * specific template (NIST AI RMF, SOC2, EU AI Act) renders a
4015
+ * structured report from it.
4016
+ *
4017
+ * Reports are machine-readable JSON first; human-readable Markdown is a
4018
+ * pure transform on top. External auditors consume the Markdown; CI
4019
+ * consumes the JSON.
4020
+ */
4021
+
4022
+ interface GovernanceContext { // Input supplied by consumers to the governance report generators (nistAiRmfReport, soc2Report, euAiActReport).
4023
+ /** Legal / org identity for the report. */
4024
+ organization: string;
4025
+ /** System / agent identifier. */
4026
+ systemName: string;
4027
+ /** ISO8601 period the report covers. */
4028
+ periodStart: string;
4029
+ periodEnd: string; // End of the period; same ISO8601 convention as periodStart.
4030
+ /** Versioned dataset manifests used during the period. */
4031
+ datasets: DatasetManifest[];
4032
+ traceStore: TraceStore; // Trace store for the reports to read (required).
4033
+ outcomeStore?: OutcomeStore; // Optional outcome store.
4034
+ /** Cached red-team results for the period, if available. */
4035
+ redTeam?: RedTeamReport;
4036
+ /** Judge-vs-human calibration results, if measured. */
4037
+ judgeCalibration?: CalibrationResult[];
4038
+ /** Responsible owner for the system — role + name + email. */
4039
+ owner: {
4040
+ role: string;
4041
+ name: string;
4042
+ email: string;
4043
+ };
4044
+ }
4045
+ interface GovernanceFinding { // A single finding listed in GovernanceReport.findings.
4046
+ id: string; // Identifier of the finding.
4047
+ severity: 'info' | 'low' | 'medium' | 'high' | 'critical'; // One of five severity levels.
4048
+ /** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
4049
+ control: string;
4050
+ summary: string; // Summary text of the finding.
4051
+ evidence?: string; // Optional supporting evidence.
4052
+ remediation?: string; // Optional remediation text.
4053
+ }
4054
+ interface GovernanceReport { // Structured report resolved by the framework-specific generators and consumed by renderMarkdown.
4055
+ framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT'; // Which governance template produced the report.
4056
+ version: string; // Version string of the report — NOTE(review): whether this is the template or framework version is not visible here.
4057
+ context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>; // Identity/period subset of the input context; stores and datasets are not carried over.
4058
+ summary: {
4059
+ findings: number; // Presumably the total number of findings — confirm.
4060
+ bySeverity: Record<GovernanceFinding['severity'], number>; // A number for every severity level (presumably a count of findings — confirm).
4061
+ overall: 'compliant' | 'compliant-with-findings' | 'non-compliant'; // Overall verdict.
4062
+ };
4063
+ findings: GovernanceFinding[]; // The individual findings.
4064
+ /** Framework-specific structured payload (mapped controls, risk class, etc.). */
4065
+ payload: Record<string, unknown>;
4066
+ generatedAt: string; // Presumably an ISO 8601 generation timestamp — confirm.
4067
+ }
4068
+ declare function renderMarkdown(report: GovernanceReport): string; // Renders the machine-readable report as human-readable Markdown text.
4069
+ declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary']; // Builds the `summary` section of a GovernanceReport from a list of findings.
4070
+
4071
+ /**
4072
+ * NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
4073
+ *
4074
+ * Each subcategory derives its status from concrete framework state:
4075
+ * MEASURE 2.x: do we have a calibration regime? contamination controls?
4076
+ * MEASURE 2.7: are red-team results available?
4077
+ * MANAGE 1.x: are outcome metrics captured? correlation measured?
4078
+ * GOVERN 1.x: dataset + prompt provenance recorded?
4079
+ *
4080
+ * We ship the mapping and the derivation rules; consumers supply the
4081
+ * GovernanceContext.
4082
+ */
4083
+
4084
+ declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>; // Resolves to the NIST AI RMF report derived from the consumer-supplied context.
4085
+
4086
+ /**
4087
+ * SOC 2 — Common Criteria 7 (system operations + change management)
4088
+ * audit trail derived from the trace corpus.
4089
+ *
4090
+ * This is NOT a formal SOC2 report — that requires an external
4091
+ * auditor. What we ship is the machine-readable *evidence* package
4092
+ * that an auditor consumes: run counts, deploy events, access log
4093
+ * summary, anomaly tracking, response-time SLOs.
4094
+ */
4095
+
4096
+ declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>; // Resolves to the SOC 2 evidence package for `ctx`; this is auditor input, not a formal SOC2 report.
4097
+
4098
+ /**
4099
+ * EU AI Act — risk-class classification + compliance checklist.
4100
+ *
4101
+ * Classification is declarative: caller supplies the domain/use-case
4102
+ * signals (biometric? critical infrastructure? education? employment?
4103
+ * access to services?) and we map to the Act's risk tiers:
4104
+ * - "unacceptable" (prohibited)
4105
+ * - "high" (Annex III — strict obligations)
4106
+ * - "limited" (transparency obligations)
4107
+ * - "minimal" (voluntary codes of conduct)
4108
+ *
4109
+ * Then the compliance checklist enumerates Article 9 (risk mgmt),
4110
+ * 10 (data + data governance), 11 (technical documentation), 13
4111
+ * (transparency), 14 (human oversight), 15 (accuracy + robustness)
4112
+ * requirements and flags gaps.
4113
+ */
4114
+
4115
+ type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal'; // Risk tiers: unacceptable (prohibited), high (Annex III), limited (transparency obligations), minimal (voluntary codes of conduct).
4116
+ interface UseCaseSignals { // Caller-declared use-case signals consumed by classifyEuAiRisk and euAiActReport; every field is optional.
4117
+ /** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
4118
+ biometricPublic?: boolean;
4119
+ /** Social scoring by public authorities? (Art. 5). */
4120
+ socialScoring?: boolean;
4121
+ /** Subliminal manipulation? (Art. 5). */
4122
+ subliminal?: boolean;
4123
+ /** Annex III sector: critical infrastructure / education / employment /
4124
+ * access to essential services / law enforcement / migration /
4125
+ * administration of justice / democratic processes? */
4126
+ annexIII?: boolean;
4127
+ /** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
4128
+ chatbot?: boolean;
4129
+ /** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
4130
+ generatesSyntheticMedia?: boolean;
4131
+ }
4132
+ declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass; // Maps the caller-supplied signals to one of the Act's risk tiers; synchronous. NOTE(review): precedence between conflicting signals is not visible here.
4133
+ declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>; // Resolves to the EU AI Act report for `ctx`, based on the caller-supplied signals; the checklist covers Articles 9, 10, 11, 13, 14 and 15.
4134
+
4135
+ export { type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact, type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CrossTraceDiff, type CrossTraceDiffOptions, DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, type EuRiskClass, type EvalMetricSpec, type 
EvalResult, type EventFilter, type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation, type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HypothesisManifest, type HypothesisResult, type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type InteractionContribution, type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore, type JudgeSpan, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type LayerCorrelation, type LlmSpan, MODEL_PRICING, type MatcherResult, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type Mutator, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairwiseComparison, type ParetoResult, type PersonaConfig, type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry, type PromptVariant, REDACTION_VERSION, type RedTeamCase, 
type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type RegressionOptions, type RegressionSpec, type RetrievalSpan, type RobustnessResult, type RouteMap, type RubricDimension, type Run, type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter, type RunLayer, type RunOutcome, type RunStatus, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult, type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantScore, type VerbosityBiasResult, type VisualDiffOptions, type VisualDiffResult, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type 
WorkspaceSnapshot, adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash, attributeCounterfactuals, benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries, checkSlos, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge, createDomainExpertJudge, crossTraceDiff, defaultJudges, dominates, estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, failureClusterView, fileContains, fileExists, firstDivergenceView, formatBenchmarkReport, formatDriverReport, groupBy, hashContent, hashScenarios, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans, keyPreserved, llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator, mannWhitneyU, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, outputLengthRubric, pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere, runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runSelfPlay, runTestGradedScenario, runsForScenario, 
scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference, sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView, summarize, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, typoMutator, urlContains, verbosityBias, verifyManifest, visualDiff, vitestTestParser, weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank };