@tangle-network/agent-eval 0.2.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +3110 -162
- package/dist/index.js +5107 -230
- package/dist/index.js.map +1 -1
- package/package.json +11 -11
package/dist/index.d.ts
CHANGED
|
@@ -435,6 +435,83 @@ declare class MetricsCollector {
|
|
|
435
435
|
getConvergenceCurve(): number[];
|
|
436
436
|
}
|
|
437
437
|
|
|
438
|
+
/**
|
|
439
|
+
* ScenarioRegistry — manages scenario discovery and filtering.
|
|
440
|
+
*
|
|
441
|
+
* Each agent registers its scenarios. The registry handles conversion
|
|
442
|
+
* from ScenarioFile format to the framework's Scenario type.
|
|
443
|
+
*/
|
|
444
|
+
declare class ScenarioRegistry {
|
|
445
|
+
private scenarios;
|
|
446
|
+
private scenarioFiles;
|
|
447
|
+
/** Register scenarios from ScenarioFile format */
|
|
448
|
+
registerFiles(files: ScenarioFile[]): void;
|
|
449
|
+
/** Register pre-built Scenario objects directly */
|
|
450
|
+
register(scenarios: Scenario[]): void;
|
|
451
|
+
/** Get all scenarios */
|
|
452
|
+
all(): Scenario[];
|
|
453
|
+
/** Get scenarios filtered by category */
|
|
454
|
+
byCategory(category: string): Scenario[];
|
|
455
|
+
/** List all categories with counts */
|
|
456
|
+
listCategories(): {
|
|
457
|
+
category: string;
|
|
458
|
+
count: number;
|
|
459
|
+
}[];
|
|
460
|
+
/** Get scenarios filtered by persona */
|
|
461
|
+
byPersona(persona: string): Scenario[];
|
|
462
|
+
/** Get a single scenario by ID */
|
|
463
|
+
byId(id: string): Scenario | undefined;
|
|
464
|
+
/** Count total scenarios */
|
|
465
|
+
get count(): number;
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
interface AgentDriverConfig {
|
|
469
|
+
client: ProductClient;
|
|
470
|
+
driverModel?: string;
|
|
471
|
+
/** System prompt context for the driver LLM to understand the product */
|
|
472
|
+
productContext?: string;
|
|
473
|
+
}
|
|
474
|
+
/**
|
|
475
|
+
* AgentDriver — meta-agent that plays a persona against the real product.
|
|
476
|
+
*
|
|
477
|
+
* Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
|
|
478
|
+
* Not scripted — the driver gets the current product state and decides
|
|
479
|
+
* the next realistic user message.
|
|
480
|
+
*/
|
|
481
|
+
declare class AgentDriver {
|
|
482
|
+
private tc;
|
|
483
|
+
private client;
|
|
484
|
+
private driverModel;
|
|
485
|
+
private productContext;
|
|
486
|
+
constructor(tc: TCloud, config: AgentDriverConfig);
|
|
487
|
+
/**
|
|
488
|
+
* Run a persona through the product.
|
|
489
|
+
*
|
|
490
|
+
* Returns metrics on how many turns to completion, cost curve,
|
|
491
|
+
* quality curve, and convergence curve.
|
|
492
|
+
*/
|
|
493
|
+
run(persona: PersonaConfig): Promise<DriverResult>;
|
|
494
|
+
/** Use the driver LLM to decide what the "user" says next */
|
|
495
|
+
private decideNextMessage;
|
|
496
|
+
/** Handle pending approvals based on persona feedback patterns */
|
|
497
|
+
private handleApprovals;
|
|
498
|
+
/** Describe which completion criteria are met */
|
|
499
|
+
private describeCompletion;
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
/**
|
|
503
|
+
* Report generation utilities.
|
|
504
|
+
*
|
|
505
|
+
* Outputs convergence curves, cost curves, quality curves,
|
|
506
|
+
* and per-persona summaries in markdown format.
|
|
507
|
+
*/
|
|
508
|
+
/** Generate a markdown report from benchmark results */
|
|
509
|
+
declare function formatBenchmarkReport(report: BenchmarkReport): string;
|
|
510
|
+
/** Generate a markdown report from agent driver results */
|
|
511
|
+
declare function formatDriverReport(results: DriverResult[]): string;
|
|
512
|
+
/** Print a compact summary to console */
|
|
513
|
+
declare function printDriverSummary(results: DriverResult[]): void;
|
|
514
|
+
|
|
438
515
|
/**
|
|
439
516
|
* Normalize scores so all dimensions follow "higher = better".
|
|
440
517
|
* Inverted dimensions (hallucination, false_confidence, worst_failure)
|
|
@@ -524,83 +601,6 @@ declare class ConvergenceTracker {
|
|
|
524
601
|
getTurnToCompletion(): number | null;
|
|
525
602
|
}
|
|
526
603
|
|
|
527
|
-
/**
|
|
528
|
-
* ScenarioRegistry — manages scenario discovery and filtering.
|
|
529
|
-
*
|
|
530
|
-
* Each agent registers its scenarios. The registry handles conversion
|
|
531
|
-
* from ScenarioFile format to the framework's Scenario type.
|
|
532
|
-
*/
|
|
533
|
-
declare class ScenarioRegistry {
|
|
534
|
-
private scenarios;
|
|
535
|
-
private scenarioFiles;
|
|
536
|
-
/** Register scenarios from ScenarioFile format */
|
|
537
|
-
registerFiles(files: ScenarioFile[]): void;
|
|
538
|
-
/** Register pre-built Scenario objects directly */
|
|
539
|
-
register(scenarios: Scenario[]): void;
|
|
540
|
-
/** Get all scenarios */
|
|
541
|
-
all(): Scenario[];
|
|
542
|
-
/** Get scenarios filtered by category */
|
|
543
|
-
byCategory(category: string): Scenario[];
|
|
544
|
-
/** List all categories with counts */
|
|
545
|
-
listCategories(): {
|
|
546
|
-
category: string;
|
|
547
|
-
count: number;
|
|
548
|
-
}[];
|
|
549
|
-
/** Get scenarios filtered by persona */
|
|
550
|
-
byPersona(persona: string): Scenario[];
|
|
551
|
-
/** Get a single scenario by ID */
|
|
552
|
-
byId(id: string): Scenario | undefined;
|
|
553
|
-
/** Count total scenarios */
|
|
554
|
-
get count(): number;
|
|
555
|
-
}
|
|
556
|
-
|
|
557
|
-
interface AgentDriverConfig {
|
|
558
|
-
client: ProductClient;
|
|
559
|
-
driverModel?: string;
|
|
560
|
-
/** System prompt context for the driver LLM to understand the product */
|
|
561
|
-
productContext?: string;
|
|
562
|
-
}
|
|
563
|
-
/**
|
|
564
|
-
* AgentDriver — meta-agent that plays a persona against the real product.
|
|
565
|
-
*
|
|
566
|
-
* Uses a driver LLM (Claude/GPT-4o) to decide what to say each turn.
|
|
567
|
-
* Not scripted — the driver gets the current product state and decides
|
|
568
|
-
* the next realistic user message.
|
|
569
|
-
*/
|
|
570
|
-
declare class AgentDriver {
|
|
571
|
-
private tc;
|
|
572
|
-
private client;
|
|
573
|
-
private driverModel;
|
|
574
|
-
private productContext;
|
|
575
|
-
constructor(tc: TCloud, config: AgentDriverConfig);
|
|
576
|
-
/**
|
|
577
|
-
* Run a persona through the product.
|
|
578
|
-
*
|
|
579
|
-
* Returns metrics on how many turns to completion, cost curve,
|
|
580
|
-
* quality curve, and convergence curve.
|
|
581
|
-
*/
|
|
582
|
-
run(persona: PersonaConfig): Promise<DriverResult>;
|
|
583
|
-
/** Use the driver LLM to decide what the "user" says next */
|
|
584
|
-
private decideNextMessage;
|
|
585
|
-
/** Handle pending approvals based on persona feedback patterns */
|
|
586
|
-
private handleApprovals;
|
|
587
|
-
/** Describe which completion criteria are met */
|
|
588
|
-
private describeCompletion;
|
|
589
|
-
}
|
|
590
|
-
|
|
591
|
-
/**
|
|
592
|
-
* Report generation utilities.
|
|
593
|
-
*
|
|
594
|
-
* Outputs convergence curves, cost curves, quality curves,
|
|
595
|
-
* and per-persona summaries in markdown format.
|
|
596
|
-
*/
|
|
597
|
-
/** Generate a markdown report from benchmark results */
|
|
598
|
-
declare function formatBenchmarkReport(report: BenchmarkReport): string;
|
|
599
|
-
/** Generate a markdown report from agent driver results */
|
|
600
|
-
declare function formatDriverReport(results: DriverResult[]): string;
|
|
601
|
-
/** Print a compact summary to console */
|
|
602
|
-
declare function printDriverSummary(results: DriverResult[]): void;
|
|
603
|
-
|
|
604
604
|
/**
|
|
605
605
|
* Versioned prompt registry.
|
|
606
606
|
*
|
|
@@ -642,79 +642,6 @@ declare class PromptRegistry {
|
|
|
642
642
|
/** SHA-256(content) → first 12 hex chars. Stable across runtimes. */
|
|
643
643
|
declare function hashContent(content: string): Promise<string>;
|
|
644
644
|
|
|
645
|
-
/**
|
|
646
|
-
* LLM trace store — one record per model call.
|
|
647
|
-
*
|
|
648
|
-
* Sink for the full eval data-plane: what got sent, what came back, what it
|
|
649
|
-
* cost, how long it took. Replayable, queryable, diff-able.
|
|
650
|
-
*
|
|
651
|
-
* Two built-in stores:
|
|
652
|
-
* - `MemoryTraceStore` — fast, ephemeral, useful in tests and short runs
|
|
653
|
-
* - `FileSystemTraceStore` — NDJSON files per-run, grepable, committable
|
|
654
|
-
*
|
|
655
|
-
* Consumers plug in custom stores for Langfuse / OTEL / D1 / Postgres.
|
|
656
|
-
*/
|
|
657
|
-
interface LlmTrace {
|
|
658
|
-
id: string;
|
|
659
|
-
runId: string;
|
|
660
|
-
scenarioId?: string;
|
|
661
|
-
turnIndex?: number;
|
|
662
|
-
role: 'driver' | 'judge' | 'product' | 'optimizer' | string;
|
|
663
|
-
model: string;
|
|
664
|
-
prompt: string;
|
|
665
|
-
output: string;
|
|
666
|
-
inputTokens?: number;
|
|
667
|
-
outputTokens?: number;
|
|
668
|
-
costUsd?: number;
|
|
669
|
-
durationMs?: number;
|
|
670
|
-
timestamp: string;
|
|
671
|
-
metadata?: Record<string, unknown>;
|
|
672
|
-
}
|
|
673
|
-
interface TraceQuery {
|
|
674
|
-
runId?: string;
|
|
675
|
-
scenarioId?: string;
|
|
676
|
-
role?: string;
|
|
677
|
-
model?: string;
|
|
678
|
-
sinceMs?: number;
|
|
679
|
-
limit?: number;
|
|
680
|
-
}
|
|
681
|
-
interface TraceStore {
|
|
682
|
-
record(trace: LlmTrace): Promise<void>;
|
|
683
|
-
query(query: TraceQuery): Promise<LlmTrace[]>;
|
|
684
|
-
count(query?: TraceQuery): Promise<number>;
|
|
685
|
-
}
|
|
686
|
-
declare class MemoryTraceStore implements TraceStore {
|
|
687
|
-
private traces;
|
|
688
|
-
record(trace: LlmTrace): Promise<void>;
|
|
689
|
-
query(query: TraceQuery): Promise<LlmTrace[]>;
|
|
690
|
-
count(query?: TraceQuery): Promise<number>;
|
|
691
|
-
/** Clear the store — test helper. */
|
|
692
|
-
reset(): void;
|
|
693
|
-
private filter;
|
|
694
|
-
}
|
|
695
|
-
interface FileSystemTraceStoreOptions {
|
|
696
|
-
dir: string;
|
|
697
|
-
/** Max file size before rolling to a new segment (default 32 MB). */
|
|
698
|
-
rolloverBytes?: number;
|
|
699
|
-
/** Function to write the file — defaults to node:fs/promises.appendFile */
|
|
700
|
-
append?: (path: string, data: string) => Promise<void>;
|
|
701
|
-
read?: (path: string) => Promise<string>;
|
|
702
|
-
list?: (dir: string) => Promise<string[]>;
|
|
703
|
-
stat?: (path: string) => Promise<{
|
|
704
|
-
size: number;
|
|
705
|
-
}>;
|
|
706
|
-
mkdir?: (dir: string) => Promise<void>;
|
|
707
|
-
}
|
|
708
|
-
declare class FileSystemTraceStore implements TraceStore {
|
|
709
|
-
private readonly opts;
|
|
710
|
-
constructor(opts: FileSystemTraceStoreOptions);
|
|
711
|
-
record(trace: LlmTrace): Promise<void>;
|
|
712
|
-
query(query: TraceQuery): Promise<LlmTrace[]>;
|
|
713
|
-
count(query?: TraceQuery): Promise<number>;
|
|
714
|
-
private segments;
|
|
715
|
-
private currentSegment;
|
|
716
|
-
}
|
|
717
|
-
|
|
718
645
|
/**
|
|
719
646
|
* Anti-slop quality judge.
|
|
720
647
|
*
|
|
@@ -787,7 +714,7 @@ declare function analyzeAntiSlop(outputs: string[], config: Omit<Required<AntiSl
|
|
|
787
714
|
* returns a `ValidationResult` with pass/fail + 0..1 score + structured
|
|
788
715
|
* issues.
|
|
789
716
|
*/
|
|
790
|
-
interface Artifact {
|
|
717
|
+
interface Artifact$1 {
|
|
791
718
|
/** Logical kind — validators type-guard on this */
|
|
792
719
|
kind: 'file' | 'json' | 'text' | 'binary' | string;
|
|
793
720
|
/** Filesystem-style path, optional */
|
|
@@ -803,7 +730,7 @@ interface ValidationContext {
|
|
|
803
730
|
scenarioId: string;
|
|
804
731
|
turnIndex?: number;
|
|
805
732
|
/** Prior artifacts for multi-artifact scenarios */
|
|
806
|
-
priorArtifacts?: Artifact[];
|
|
733
|
+
priorArtifacts?: Artifact$1[];
|
|
807
734
|
/** Free-form hints the validator uses for domain-specific checks */
|
|
808
735
|
hints?: Record<string, unknown>;
|
|
809
736
|
}
|
|
@@ -827,7 +754,7 @@ interface ArtifactValidator {
|
|
|
827
754
|
/** Optional description for human-facing reports. */
|
|
828
755
|
description?: string;
|
|
829
756
|
/** Called once per artifact; validators are expected to be pure + idempotent. */
|
|
830
|
-
validate(artifact: Artifact, context: ValidationContext): Promise<ValidationResult>;
|
|
757
|
+
validate(artifact: Artifact$1, context: ValidationContext): Promise<ValidationResult>;
|
|
831
758
|
}
|
|
832
759
|
/**
|
|
833
760
|
* Run every validator on the same artifact; aggregate pass as AND, score as
|
|
@@ -938,7 +865,7 @@ interface RunConfig {
|
|
|
938
865
|
seed?: number;
|
|
939
866
|
metadata?: Record<string, unknown>;
|
|
940
867
|
}
|
|
941
|
-
interface Run {
|
|
868
|
+
interface Run$1 {
|
|
942
869
|
id: string;
|
|
943
870
|
experimentId: string;
|
|
944
871
|
name?: string;
|
|
@@ -959,9 +886,9 @@ interface ExperimentStore {
|
|
|
959
886
|
saveExperiment(exp: Experiment): Promise<void>;
|
|
960
887
|
getExperiment(id: string): Promise<Experiment | null>;
|
|
961
888
|
listExperiments(): Promise<Experiment[]>;
|
|
962
|
-
saveRun(run: Run): Promise<void>;
|
|
963
|
-
getRun(id: string): Promise<Run | null>;
|
|
964
|
-
listRuns(experimentId: string): Promise<Run[]>;
|
|
889
|
+
saveRun(run: Run$1): Promise<void>;
|
|
890
|
+
getRun(id: string): Promise<Run$1 | null>;
|
|
891
|
+
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
965
892
|
}
|
|
966
893
|
declare class InMemoryExperimentStore implements ExperimentStore {
|
|
967
894
|
private readonly experiments;
|
|
@@ -969,15 +896,15 @@ declare class InMemoryExperimentStore implements ExperimentStore {
|
|
|
969
896
|
saveExperiment(exp: Experiment): Promise<void>;
|
|
970
897
|
getExperiment(id: string): Promise<Experiment | null>;
|
|
971
898
|
listExperiments(): Promise<Experiment[]>;
|
|
972
|
-
saveRun(run: Run): Promise<void>;
|
|
973
|
-
getRun(id: string): Promise<Run | null>;
|
|
974
|
-
listRuns(experimentId: string): Promise<Run[]>;
|
|
899
|
+
saveRun(run: Run$1): Promise<void>;
|
|
900
|
+
getRun(id: string): Promise<Run$1 | null>;
|
|
901
|
+
listRuns(experimentId: string): Promise<Run$1[]>;
|
|
975
902
|
}
|
|
976
903
|
declare class ExperimentTracker {
|
|
977
904
|
private readonly store;
|
|
978
905
|
constructor(store: ExperimentStore);
|
|
979
906
|
startExperiment(name: string, metadata?: Record<string, unknown>): Promise<Experiment>;
|
|
980
|
-
startRun(config: RunConfig): Promise<Run>;
|
|
907
|
+
startRun(config: RunConfig): Promise<Run$1>;
|
|
981
908
|
completeRun(runId: string, report: BenchmarkReport): Promise<void>;
|
|
982
909
|
failRun(runId: string, error: string): Promise<void>;
|
|
983
910
|
/**
|
|
@@ -1080,6 +1007,9 @@ interface PairwiseComparison {
|
|
|
1080
1007
|
variantA: string;
|
|
1081
1008
|
variantB: string;
|
|
1082
1009
|
pValue: number;
|
|
1010
|
+
/** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
|
|
1011
|
+
qValue: number;
|
|
1012
|
+
/** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
|
|
1083
1013
|
significant: boolean;
|
|
1084
1014
|
meanDelta: number;
|
|
1085
1015
|
}
|
|
@@ -1184,4 +1114,3022 @@ declare class DualAgentBench {
|
|
|
1184
1114
|
run(config: DualAgentBenchConfig): Promise<DualAgentReport>;
|
|
1185
1115
|
}
|
|
1186
1116
|
|
|
1187
|
-
|
|
1117
|
+
/**
|
|
1118
|
+
* TraceSchema v1 — the canonical data model for agent-eval.
|
|
1119
|
+
*
|
|
1120
|
+
* Every score, every failure class, every pipeline in the framework is
|
|
1121
|
+
* a view over this data. Shape it once, live with it.
|
|
1122
|
+
*
|
|
1123
|
+
* Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
|
|
1124
|
+
* but extended with agent-specific span kinds (llm, tool, retrieval,
|
|
1125
|
+
* judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
|
|
1126
|
+
* entities that OTEL leaves as free-form attributes.
|
|
1127
|
+
*/
|
|
1128
|
+
declare const TRACE_SCHEMA_VERSION = "1.0.0";
|
|
1129
|
+
type RunStatus = 'running' | 'completed' | 'failed' | 'aborted';
|
|
1130
|
+
interface BudgetSpec {
|
|
1131
|
+
tokens?: number;
|
|
1132
|
+
wallMs?: number;
|
|
1133
|
+
calls?: number;
|
|
1134
|
+
usd?: number;
|
|
1135
|
+
}
|
|
1136
|
+
interface RunOutcome {
|
|
1137
|
+
score?: number;
|
|
1138
|
+
pass?: boolean;
|
|
1139
|
+
failureClass?: FailureClass;
|
|
1140
|
+
notes?: string;
|
|
1141
|
+
}
|
|
1142
|
+
/**
|
|
1143
|
+
* Layer — optional classification in a nested build workflow.
|
|
1144
|
+
* `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
|
|
1145
|
+
* `app-build`: sandbox harness that compiled + tested the generated scaffold.
|
|
1146
|
+
* `app-runtime`: a run of the generated agent against a domain scenario.
|
|
1147
|
+
* `meta`: any meta-eval (judge replay, correlation analysis).
|
|
1148
|
+
*/
|
|
1149
|
+
type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
|
|
1150
|
+
interface Run {
|
|
1151
|
+
runId: string;
|
|
1152
|
+
scenarioId: string;
|
|
1153
|
+
variantId?: string;
|
|
1154
|
+
datasetVersion?: string;
|
|
1155
|
+
/** Git SHA of agent code at run time. */
|
|
1156
|
+
codeSha?: string;
|
|
1157
|
+
/** Hash of the prompt template + any system prompt. */
|
|
1158
|
+
promptSha?: string;
|
|
1159
|
+
/** Model id + date + system-prompt hash, concatenated. */
|
|
1160
|
+
modelFingerprint?: string;
|
|
1161
|
+
seed?: number;
|
|
1162
|
+
/** Arbitrary environment markers (shell, docker version, tz). */
|
|
1163
|
+
envFingerprint?: Record<string, string>;
|
|
1164
|
+
/** Version of the redaction rules applied to this run. */
|
|
1165
|
+
redactionVersion?: string;
|
|
1166
|
+
/** Parent run in a nested build workflow. A builder run's children are
|
|
1167
|
+
* app-build runs; those children are app-runtime runs. */
|
|
1168
|
+
parentRunId?: string;
|
|
1169
|
+
/** Stable project identifier — groups runs across chats + sessions. */
|
|
1170
|
+
projectId?: string;
|
|
1171
|
+
/** Chat/conversation identifier within a project. */
|
|
1172
|
+
chatId?: string;
|
|
1173
|
+
/** Layer classification — hint for aggregation; not enforced. */
|
|
1174
|
+
layer?: RunLayer;
|
|
1175
|
+
startedAt: number;
|
|
1176
|
+
endedAt?: number;
|
|
1177
|
+
status: RunStatus;
|
|
1178
|
+
outcome?: RunOutcome;
|
|
1179
|
+
budget?: BudgetSpec;
|
|
1180
|
+
/** Free-form labels for downstream grouping. */
|
|
1181
|
+
tags?: Record<string, string>;
|
|
1182
|
+
}
|
|
1183
|
+
type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom';
|
|
1184
|
+
type SpanStatus = 'ok' | 'error';
|
|
1185
|
+
interface SpanBase {
|
|
1186
|
+
spanId: string;
|
|
1187
|
+
parentSpanId?: string;
|
|
1188
|
+
runId: string;
|
|
1189
|
+
kind: SpanKind;
|
|
1190
|
+
name: string;
|
|
1191
|
+
startedAt: number;
|
|
1192
|
+
endedAt?: number;
|
|
1193
|
+
status?: SpanStatus;
|
|
1194
|
+
error?: string;
|
|
1195
|
+
/** Anything not covered by typed fields. Kept deliberately free-form. */
|
|
1196
|
+
attributes?: Record<string, unknown>;
|
|
1197
|
+
}
|
|
1198
|
+
interface Message {
|
|
1199
|
+
role: 'system' | 'user' | 'assistant' | 'tool';
|
|
1200
|
+
content: string;
|
|
1201
|
+
tokens?: number;
|
|
1202
|
+
/** Multi-modal content descriptors; blobs themselves live in Artifacts. */
|
|
1203
|
+
images?: Array<{
|
|
1204
|
+
artifactId?: string;
|
|
1205
|
+
url?: string;
|
|
1206
|
+
mime?: string;
|
|
1207
|
+
}>;
|
|
1208
|
+
}
|
|
1209
|
+
interface LlmSpan extends SpanBase {
|
|
1210
|
+
kind: 'llm';
|
|
1211
|
+
model: string;
|
|
1212
|
+
messages: Message[];
|
|
1213
|
+
output?: string;
|
|
1214
|
+
inputTokens?: number;
|
|
1215
|
+
outputTokens?: number;
|
|
1216
|
+
cachedTokens?: number;
|
|
1217
|
+
reasoningTokens?: number;
|
|
1218
|
+
costUsd?: number;
|
|
1219
|
+
finishReason?: string;
|
|
1220
|
+
}
|
|
1221
|
+
interface ToolSpan extends SpanBase {
|
|
1222
|
+
kind: 'tool';
|
|
1223
|
+
toolName: string;
|
|
1224
|
+
args: unknown;
|
|
1225
|
+
result?: unknown;
|
|
1226
|
+
latencyMs?: number;
|
|
1227
|
+
}
|
|
1228
|
+
interface RetrievalSpan extends SpanBase {
|
|
1229
|
+
kind: 'retrieval';
|
|
1230
|
+
query: string;
|
|
1231
|
+
hits: Array<{
|
|
1232
|
+
docId: string;
|
|
1233
|
+
score: number;
|
|
1234
|
+
content?: string;
|
|
1235
|
+
}>;
|
|
1236
|
+
}
|
|
1237
|
+
interface JudgeSpan extends SpanBase {
|
|
1238
|
+
kind: 'judge';
|
|
1239
|
+
judgeId: string;
|
|
1240
|
+
/** Span this judgment applies to. */
|
|
1241
|
+
targetSpanId: string;
|
|
1242
|
+
dimension: string;
|
|
1243
|
+
/** Numeric score (free-range; interpretation up to the judge). */
|
|
1244
|
+
score: number;
|
|
1245
|
+
rationale?: string;
|
|
1246
|
+
evidence?: string;
|
|
1247
|
+
}
|
|
1248
|
+
interface SandboxSpan extends SpanBase {
|
|
1249
|
+
kind: 'sandbox';
|
|
1250
|
+
image?: string;
|
|
1251
|
+
command?: string;
|
|
1252
|
+
exitCode?: number;
|
|
1253
|
+
testsTotal?: number;
|
|
1254
|
+
testsPassed?: number;
|
|
1255
|
+
stdoutHash?: string;
|
|
1256
|
+
stderrHash?: string;
|
|
1257
|
+
/** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
|
|
1258
|
+
wallMs?: number;
|
|
1259
|
+
}
|
|
1260
|
+
interface GenericSpan extends SpanBase {
|
|
1261
|
+
kind: 'agent' | 'custom';
|
|
1262
|
+
}
|
|
1263
|
+
type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan;
|
|
1264
|
+
type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom';
|
|
1265
|
+
interface TraceEvent {
|
|
1266
|
+
eventId: string;
|
|
1267
|
+
runId: string;
|
|
1268
|
+
spanId?: string;
|
|
1269
|
+
kind: EventKind;
|
|
1270
|
+
timestamp: number;
|
|
1271
|
+
payload: Record<string, unknown>;
|
|
1272
|
+
}
|
|
1273
|
+
interface BudgetLedgerEntry {
|
|
1274
|
+
runId: string;
|
|
1275
|
+
dimension: keyof BudgetSpec;
|
|
1276
|
+
limit: number;
|
|
1277
|
+
consumed: number;
|
|
1278
|
+
remaining: number;
|
|
1279
|
+
timestamp: number;
|
|
1280
|
+
breached: boolean;
|
|
1281
|
+
/** Span that triggered this entry, if any. */
|
|
1282
|
+
spanId?: string;
|
|
1283
|
+
}
|
|
1284
|
+
interface Artifact {
|
|
1285
|
+
artifactId: string;
|
|
1286
|
+
runId: string;
|
|
1287
|
+
spanId?: string;
|
|
1288
|
+
contentType: string;
|
|
1289
|
+
sizeBytes: number;
|
|
1290
|
+
/** sha256 in hex. */
|
|
1291
|
+
hash: string;
|
|
1292
|
+
/** External storage URL (R2, S3, filesystem path). */
|
|
1293
|
+
storageUrl?: string;
|
|
1294
|
+
/** Inline content for small blobs — keep under ~64KB. */
|
|
1295
|
+
inlineContent?: string;
|
|
1296
|
+
}
|
|
1297
|
+
type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
|
|
1298
|
+
declare const FAILURE_CLASSES: readonly FailureClass[];
|
|
1299
|
+
declare function isLlmSpan(s: Span): s is LlmSpan;
|
|
1300
|
+
declare function isToolSpan(s: Span): s is ToolSpan;
|
|
1301
|
+
declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
|
|
1302
|
+
declare function isJudgeSpan(s: Span): s is JudgeSpan;
|
|
1303
|
+
declare function isSandboxSpan(s: Span): s is SandboxSpan;
|
|
1304
|
+
|
|
1305
|
+
interface RunFilter {
|
|
1306
|
+
scenarioId?: string;
|
|
1307
|
+
variantId?: string;
|
|
1308
|
+
status?: RunStatus;
|
|
1309
|
+
since?: number;
|
|
1310
|
+
until?: number;
|
|
1311
|
+
tag?: {
|
|
1312
|
+
key: string;
|
|
1313
|
+
value: string;
|
|
1314
|
+
};
|
|
1315
|
+
parentRunId?: string;
|
|
1316
|
+
projectId?: string;
|
|
1317
|
+
chatId?: string;
|
|
1318
|
+
layer?: RunLayer;
|
|
1319
|
+
}
|
|
1320
|
+
interface SpanFilter {
|
|
1321
|
+
runId?: string;
|
|
1322
|
+
parentSpanId?: string;
|
|
1323
|
+
kind?: SpanKind;
|
|
1324
|
+
name?: string;
|
|
1325
|
+
toolName?: string;
|
|
1326
|
+
judgeId?: string;
|
|
1327
|
+
since?: number;
|
|
1328
|
+
until?: number;
|
|
1329
|
+
}
|
|
1330
|
+
interface EventFilter {
|
|
1331
|
+
runId?: string;
|
|
1332
|
+
spanId?: string;
|
|
1333
|
+
kind?: EventKind;
|
|
1334
|
+
since?: number;
|
|
1335
|
+
until?: number;
|
|
1336
|
+
}
|
|
1337
|
+
interface TraceStore {
|
|
1338
|
+
appendRun(run: Run): Promise<void>;
|
|
1339
|
+
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
1340
|
+
appendSpan(span: Span): Promise<void>;
|
|
1341
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1342
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
1343
|
+
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1344
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1345
|
+
getRun(runId: string): Promise<Run | undefined>;
|
|
1346
|
+
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1347
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1348
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1349
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1350
|
+
artifacts(runId: string): Promise<Artifact[]>;
|
|
1351
|
+
}
|
|
1352
|
+
declare class InMemoryTraceStore implements TraceStore {
|
|
1353
|
+
private runs;
|
|
1354
|
+
private allSpans;
|
|
1355
|
+
private allEvents;
|
|
1356
|
+
private allArtifacts;
|
|
1357
|
+
private allBudget;
|
|
1358
|
+
appendRun(run: Run): Promise<void>;
|
|
1359
|
+
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
1360
|
+
appendSpan(span: Span): Promise<void>;
|
|
1361
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1362
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
1363
|
+
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1364
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1365
|
+
getRun(runId: string): Promise<Run | undefined>;
|
|
1366
|
+
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1367
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1368
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1369
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1370
|
+
artifacts(runId: string): Promise<Artifact[]>;
|
|
1371
|
+
}
|
|
1372
|
+
interface FileSystemTraceStoreOptions {
|
|
1373
|
+
dir: string;
|
|
1374
|
+
/** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
|
|
1375
|
+
maxBytes?: number;
|
|
1376
|
+
}
|
|
1377
|
+
declare class FileSystemTraceStore implements TraceStore {
|
|
1378
|
+
private dir;
|
|
1379
|
+
private maxBytes;
|
|
1380
|
+
/** Lazy in-memory index for queries — populated on first read. */
|
|
1381
|
+
private index?;
|
|
1382
|
+
private loaded;
|
|
1383
|
+
constructor(options: FileSystemTraceStoreOptions);
|
|
1384
|
+
private ensureDir;
|
|
1385
|
+
private append;
|
|
1386
|
+
private insertInto;
|
|
1387
|
+
private load;
|
|
1388
|
+
appendRun(run: Run): Promise<void>;
|
|
1389
|
+
updateRun(runId: string, patch: Partial<Run>): Promise<void>;
|
|
1390
|
+
appendSpan(span: Span): Promise<void>;
|
|
1391
|
+
updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
|
|
1392
|
+
appendEvent(event: TraceEvent): Promise<void>;
|
|
1393
|
+
appendArtifact(artifact: Artifact): Promise<void>;
|
|
1394
|
+
appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
|
|
1395
|
+
getRun(runId: string): Promise<Run | undefined>;
|
|
1396
|
+
listRuns(filter?: RunFilter): Promise<Run[]>;
|
|
1397
|
+
spans(filter?: SpanFilter): Promise<Span[]>;
|
|
1398
|
+
events(filter?: EventFilter): Promise<TraceEvent[]>;
|
|
1399
|
+
budget(runId: string): Promise<BudgetLedgerEntry[]>;
|
|
1400
|
+
artifacts(runId: string): Promise<Artifact[]>;
|
|
1401
|
+
}
|
|
1402
|
+
|
|
1403
|
+
/**
|
|
1404
|
+
* TraceEmitter — hierarchical span builder that auto-parents using an
|
|
1405
|
+
* internal stack. One emitter per Run; emitters do NOT share state.
|
|
1406
|
+
*
|
|
1407
|
+
* Convenience methods (`llm`, `tool`, `retrieval`, `judge`, `sandbox`)
|
|
1408
|
+
* return a `SpanHandle` with `.end()` / `.fail()` so callers don't
|
|
1409
|
+
* have to thread spanIds manually. For async workflows that can't use
|
|
1410
|
+
* the stack (e.g. fan-out parallel calls), pass `parentSpanId`
|
|
1411
|
+
* explicitly.
|
|
1412
|
+
*/
|
|
1413
|
+
|
|
1414
|
+
interface SpanHandle<S extends Span = Span> {
|
|
1415
|
+
span: S;
|
|
1416
|
+
end(patch?: Partial<S>): Promise<void>;
|
|
1417
|
+
fail(error: string | Error, patch?: Partial<S>): Promise<void>;
|
|
1418
|
+
}
|
|
1419
|
+
interface TraceEmitterOptions {
|
|
1420
|
+
runId?: string;
|
|
1421
|
+
/** Inject a clock for deterministic tests. */
|
|
1422
|
+
now?: () => number;
|
|
1423
|
+
/** Inject an id generator for deterministic tests. */
|
|
1424
|
+
id?: () => string;
|
|
1425
|
+
}
|
|
1426
|
+
declare class TraceEmitter {
|
|
1427
|
+
private store;
|
|
1428
|
+
private stack;
|
|
1429
|
+
private _runId;
|
|
1430
|
+
private now;
|
|
1431
|
+
private id;
|
|
1432
|
+
constructor(store: TraceStore, options?: TraceEmitterOptions);
|
|
1433
|
+
get runId(): string;
|
|
1434
|
+
startRun(run: Omit<Run, 'runId' | 'startedAt' | 'status'>): Promise<Run>;
|
|
1435
|
+
endRun(outcome?: RunOutcome): Promise<void>;
|
|
1436
|
+
abortRun(reason: string): Promise<void>;
|
|
1437
|
+
span<S extends Span = Span>(init: {
|
|
1438
|
+
kind: SpanKind;
|
|
1439
|
+
name: string;
|
|
1440
|
+
parentSpanId?: string;
|
|
1441
|
+
attributes?: Record<string, unknown>;
|
|
1442
|
+
} & Partial<Omit<S, 'spanId' | 'runId' | 'startedAt' | 'kind' | 'name'>>): Promise<SpanHandle<S>>;
|
|
1443
|
+
private handle;
|
|
1444
|
+
private pop;
|
|
1445
|
+
llm(init: Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<LlmSpan>>;
|
|
1446
|
+
tool(init: Omit<ToolSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<ToolSpan>>;
|
|
1447
|
+
retrieval(init: Omit<RetrievalSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<RetrievalSpan>>;
|
|
1448
|
+
recordJudge(verdict: Omit<JudgeSpan, 'spanId' | 'runId' | 'kind' | 'startedAt' | 'endedAt'>): Promise<JudgeSpan>;
|
|
1449
|
+
sandbox(init: Omit<SandboxSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>): Promise<SpanHandle<SandboxSpan>>;
|
|
1450
|
+
emit(event: {
|
|
1451
|
+
kind: EventKind;
|
|
1452
|
+
spanId?: string;
|
|
1453
|
+
payload?: Record<string, unknown>;
|
|
1454
|
+
}): Promise<TraceEvent>;
|
|
1455
|
+
recordBudget(entry: Omit<BudgetLedgerEntry, 'runId' | 'timestamp'> & {
|
|
1456
|
+
timestamp?: number;
|
|
1457
|
+
}): Promise<BudgetLedgerEntry>;
|
|
1458
|
+
recordArtifact(artifact: Omit<Artifact, 'artifactId' | 'runId'>): Promise<Artifact>;
|
|
1459
|
+
/**
|
|
1460
|
+
* Runs `fn` inside a span; auto-ends on success, auto-fails on throw.
|
|
1461
|
+
* Returns the fn's return value. Use this for the 95% case.
|
|
1462
|
+
*/
|
|
1463
|
+
within<T>(init: Parameters<TraceEmitter['span']>[0], fn: (handle: SpanHandle) => Promise<T>): Promise<T>;
|
|
1464
|
+
}
|
|
1465
|
+
/** Helper to build an LLM span handle args object from a provider-shaped response. */
|
|
1466
|
+
declare function llmSpanFromProvider(args: {
|
|
1467
|
+
name?: string;
|
|
1468
|
+
model: string;
|
|
1469
|
+
messages: Message[];
|
|
1470
|
+
output: string;
|
|
1471
|
+
usage?: {
|
|
1472
|
+
inputTokens?: number;
|
|
1473
|
+
outputTokens?: number;
|
|
1474
|
+
cachedTokens?: number;
|
|
1475
|
+
reasoningTokens?: number;
|
|
1476
|
+
};
|
|
1477
|
+
costUsd?: number;
|
|
1478
|
+
finishReason?: string;
|
|
1479
|
+
}): Omit<LlmSpan, 'spanId' | 'runId' | 'kind' | 'startedAt'>;
|
|
1480
|
+
|
|
1481
|
+
/**
|
|
1482
|
+
* Typed query helpers over TraceStore.
|
|
1483
|
+
*
|
|
1484
|
+
* Not a full SQL engine — a minimal, composable set of operators that
|
|
1485
|
+
* cover the canned-pipeline use cases. For ad-hoc analytics, persist to
|
|
1486
|
+
* NDJSON and point DuckDB at it; the schema is stable so external SQL
|
|
1487
|
+
* tooling works out of the box.
|
|
1488
|
+
*/
|
|
1489
|
+
|
|
1490
|
+
declare function runsForScenario(store: TraceStore, scenarioId: string): Promise<Run[]>;
|
|
1491
|
+
declare function llmSpans(store: TraceStore, runId?: string): Promise<LlmSpan[]>;
|
|
1492
|
+
declare function toolSpans(store: TraceStore, runId?: string, toolName?: string): Promise<ToolSpan[]>;
|
|
1493
|
+
declare function judgeSpans(store: TraceStore, runId?: string): Promise<JudgeSpan[]>;
|
|
1494
|
+
/** Group spans by any key selector. */
|
|
1495
|
+
declare function groupBy<T, K extends string | number>(items: T[], key: (t: T) => K): Map<K, T[]>;
|
|
1496
|
+
/** Hash tool arguments to an orderless-key-stable string for de-duplication. */
|
|
1497
|
+
declare function argHash(args: unknown): string;
|
|
1498
|
+
/** Sum an LLM-span array into aggregate token + cost. */
|
|
1499
|
+
declare function aggregateLlm(spans: LlmSpan[]): {
|
|
1500
|
+
inputTokens: number;
|
|
1501
|
+
outputTokens: number;
|
|
1502
|
+
cachedTokens: number;
|
|
1503
|
+
costUsd: number;
|
|
1504
|
+
};
|
|
1505
|
+
/** Pick the outcome's failure class when present, else derive 'success' from run status. */
|
|
1506
|
+
declare function runFailureClass(run: Run): FailureClass;
|
|
1507
|
+
|
|
1508
|
+
/**
|
|
1509
|
+
* Redaction — remove PII / secrets from trace payloads before persist.
|
|
1510
|
+
*
|
|
1511
|
+
* Pre-persistence rules mean raw traces in storage are already scrubbed.
|
|
1512
|
+
* Unredacted variants (for debugging / post-mortems) live in a separate
|
|
1513
|
+
* storage layer with stricter access controls; this module only covers
|
|
1514
|
+
* the default scrub-then-persist path.
|
|
1515
|
+
*
|
|
1516
|
+
* Rules compose: pass an array of `RedactionRule`, each is applied in
|
|
1517
|
+
* order. Strings that match get replaced with a tagged sentinel so the
|
|
1518
|
+
* eval framework can count how many redactions happened per run
|
|
1519
|
+
* (surfaced via `redaction_applied` events).
|
|
1520
|
+
*/
|
|
1521
|
+
interface RedactionRule {
|
|
1522
|
+
id: string;
|
|
1523
|
+
pattern: RegExp;
|
|
1524
|
+
/** Replacement — e.g. '[PII:email]'. Defaults to `[redacted:{id}]`. */
|
|
1525
|
+
replacement?: string;
|
|
1526
|
+
}
|
|
1527
|
+
interface RedactionReport {
|
|
1528
|
+
redactionCount: number;
|
|
1529
|
+
byRule: Record<string, number>;
|
|
1530
|
+
}
|
|
1531
|
+
/** OWASP / common-sense defaults — extend per-domain. */
|
|
1532
|
+
declare const DEFAULT_REDACTION_RULES: RedactionRule[];
|
|
1533
|
+
declare const REDACTION_VERSION = "1.0.0";
|
|
1534
|
+
/**
|
|
1535
|
+
* Redact a single string. Returns the new string and a per-rule count of
|
|
1536
|
+
* how many substitutions fired.
|
|
1537
|
+
*/
|
|
1538
|
+
declare function redactString(input: string, rules?: RedactionRule[]): {
|
|
1539
|
+
output: string;
|
|
1540
|
+
report: RedactionReport;
|
|
1541
|
+
};
|
|
1542
|
+
/**
|
|
1543
|
+
* Walk a JSON-ish value applying `redactString` to every string leaf.
|
|
1544
|
+
* Arrays and plain objects are recursed; other types pass through
|
|
1545
|
+
* untouched. Circular references throw — traces should be tree-shaped.
|
|
1546
|
+
*/
|
|
1547
|
+
declare function redactValue(value: unknown, rules?: RedactionRule[], report?: RedactionReport): {
|
|
1548
|
+
value: unknown;
|
|
1549
|
+
report: RedactionReport;
|
|
1550
|
+
};
|
|
1551
|
+
|
|
1552
|
+
/**
|
|
1553
|
+
* OpenTelemetry JSON export — maps TraceSchema v1 to OTLP/JSON so
|
|
1554
|
+
* traces render natively in Jaeger / Honeycomb / Langfuse / Grafana.
|
|
1555
|
+
*
|
|
1556
|
+
* Wire format only. We do NOT depend on the @opentelemetry SDK — that
|
|
1557
|
+
* would drag in polyfills incompatible with Workers/Edge. Consumers
|
|
1558
|
+
* push the JSON to their collector of choice via HTTP.
|
|
1559
|
+
*
|
|
1560
|
+
* Reference: OTLP 1.3.2 (ResourceSpans / ScopeSpans / Span).
|
|
1561
|
+
*/
|
|
1562
|
+
|
|
1563
|
+
declare const OTEL_AGENT_EVAL_SCOPE: {
|
|
1564
|
+
name: string;
|
|
1565
|
+
version: string;
|
|
1566
|
+
};
|
|
1567
|
+
interface OtlpSpan {
|
|
1568
|
+
traceId: string;
|
|
1569
|
+
spanId: string;
|
|
1570
|
+
parentSpanId?: string;
|
|
1571
|
+
name: string;
|
|
1572
|
+
kind: number;
|
|
1573
|
+
startTimeUnixNano: string;
|
|
1574
|
+
endTimeUnixNano: string;
|
|
1575
|
+
attributes: Array<{
|
|
1576
|
+
key: string;
|
|
1577
|
+
value: {
|
|
1578
|
+
stringValue?: string;
|
|
1579
|
+
intValue?: string;
|
|
1580
|
+
doubleValue?: number;
|
|
1581
|
+
boolValue?: boolean;
|
|
1582
|
+
};
|
|
1583
|
+
}>;
|
|
1584
|
+
events?: Array<{
|
|
1585
|
+
timeUnixNano: string;
|
|
1586
|
+
name: string;
|
|
1587
|
+
attributes?: OtlpSpan['attributes'];
|
|
1588
|
+
}>;
|
|
1589
|
+
status?: {
|
|
1590
|
+
code: number;
|
|
1591
|
+
message?: string;
|
|
1592
|
+
};
|
|
1593
|
+
}
|
|
1594
|
+
interface OtlpResourceSpans {
|
|
1595
|
+
resource: {
|
|
1596
|
+
attributes: OtlpSpan['attributes'];
|
|
1597
|
+
};
|
|
1598
|
+
scopeSpans: Array<{
|
|
1599
|
+
scope: typeof OTEL_AGENT_EVAL_SCOPE;
|
|
1600
|
+
spans: OtlpSpan[];
|
|
1601
|
+
}>;
|
|
1602
|
+
}
|
|
1603
|
+
interface OtlpExport {
|
|
1604
|
+
resourceSpans: OtlpResourceSpans[];
|
|
1605
|
+
}
|
|
1606
|
+
/** Export a single run's spans + events in OTLP/JSON. */
|
|
1607
|
+
declare function exportRunAsOtlp(store: TraceStore, runId: string, resourceAttrs?: Record<string, string | number | boolean>): Promise<OtlpExport>;
|
|
1608
|
+
|
|
1609
|
+
/**
|
|
1610
|
+
* SandboxHarness — executes a scenario in an isolated environment and
|
|
1611
|
+
* emits a rich SandboxSpan into the trace.
|
|
1612
|
+
*
|
|
1613
|
+
* Two built-in drivers:
|
|
1614
|
+
* - `SubprocessSandboxDriver` — spawn in a local cwd with env vars.
|
|
1615
|
+
* Fast, no dependencies, fine for unit tests and most CI gates.
|
|
1616
|
+
* - `DockerSandboxDriver` — lifted from tangle-router's sandbox path;
|
|
1617
|
+
* shells out to `docker run`. Stronger isolation, slower startup.
|
|
1618
|
+
*
|
|
1619
|
+
* Consumers implement `SandboxDriver` for custom backends (Firecracker,
|
|
1620
|
+
* Cloudflare sandbox product, etc.). The harness doesn't care which.
|
|
1621
|
+
*/
|
|
1622
|
+
|
|
1623
|
+
interface HarnessConfig {
|
|
1624
|
+
/** Setup command (e.g. "pnpm install"). Non-zero exit fails the run. */
|
|
1625
|
+
setupCommand?: string;
|
|
1626
|
+
/** Run command (e.g. "pnpm build"). */
|
|
1627
|
+
runCommand?: string;
|
|
1628
|
+
/** Test command (e.g. "pnpm test --run"). Drives the test count + pass count. */
|
|
1629
|
+
testCommand?: string;
|
|
1630
|
+
/** Absolute cwd for the subprocess driver. Ignored by docker driver. */
|
|
1631
|
+
cwd?: string;
|
|
1632
|
+
/** Max wall-clock per phase in ms. Default 10 minutes. */
|
|
1633
|
+
timeoutMs?: number;
|
|
1634
|
+
/** Image for the docker driver. */
|
|
1635
|
+
image?: string;
|
|
1636
|
+
/** Extra env vars (validated; shell-escaped). */
|
|
1637
|
+
env?: Record<string, string>;
|
|
1638
|
+
/** Parser for the test output — maps stdout/stderr/exit code → pass count. */
|
|
1639
|
+
testParser?: TestOutputParser;
|
|
1640
|
+
}
|
|
1641
|
+
interface TestOutputParser {
|
|
1642
|
+
id: string;
|
|
1643
|
+
parse(stdout: string, stderr: string, exitCode: number): {
|
|
1644
|
+
testsTotal: number;
|
|
1645
|
+
testsPassed: number;
|
|
1646
|
+
} | undefined;
|
|
1647
|
+
}
|
|
1648
|
+
interface SandboxResult {
|
|
1649
|
+
phase: 'setup' | 'run' | 'test';
|
|
1650
|
+
exitCode: number;
|
|
1651
|
+
stdout: string;
|
|
1652
|
+
stderr: string;
|
|
1653
|
+
wallMs: number;
|
|
1654
|
+
testsTotal?: number;
|
|
1655
|
+
testsPassed?: number;
|
|
1656
|
+
}
|
|
1657
|
+
interface SandboxDriver {
|
|
1658
|
+
id: string;
|
|
1659
|
+
exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
|
|
1660
|
+
}
|
|
1661
|
+
/** Vitest default summary line: "Tests X passed | Y failed". */
|
|
1662
|
+
declare const vitestTestParser: TestOutputParser;
|
|
1663
|
+
/** Pytest default: "collected N items" + " X passed, Y failed". */
|
|
1664
|
+
declare const pytestTestParser: TestOutputParser;
|
|
1665
|
+
/** Jest: "Tests: X passed, Y total" (and optional failed). */
|
|
1666
|
+
declare const jestTestParser: TestOutputParser;
|
|
1667
|
+
/** Composite parser — tries a list of parsers in order. */
|
|
1668
|
+
declare function composeParsers(...parsers: TestOutputParser[]): TestOutputParser;
|
|
1669
|
+
declare class SubprocessSandboxDriver implements SandboxDriver {
|
|
1670
|
+
id: string;
|
|
1671
|
+
exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
|
|
1672
|
+
}
|
|
1673
|
+
declare class DockerSandboxDriver implements SandboxDriver {
|
|
1674
|
+
id: string;
|
|
1675
|
+
exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise<SandboxResult>;
|
|
1676
|
+
}
|
|
1677
|
+
interface SandboxHarnessResult {
|
|
1678
|
+
passed: boolean;
|
|
1679
|
+
setup?: SandboxResult;
|
|
1680
|
+
run?: SandboxResult;
|
|
1681
|
+
test?: SandboxResult;
|
|
1682
|
+
totalWallMs: number;
|
|
1683
|
+
/** Final score — 0 when no tests; otherwise testsPassed/testsTotal. */
|
|
1684
|
+
score: number;
|
|
1685
|
+
}
|
|
1686
|
+
declare class SandboxHarness {
|
|
1687
|
+
private driver;
|
|
1688
|
+
constructor(driver?: SandboxDriver);
|
|
1689
|
+
run(config: HarnessConfig, emitter: TraceEmitter): Promise<SandboxHarnessResult>;
|
|
1690
|
+
}
|
|
1691
|
+
|
|
1692
|
+
/**
|
|
1693
|
+
* TestGradedScenario — a scenario whose score comes from a test suite.
|
|
1694
|
+
*
|
|
1695
|
+
* This is the SWE-bench pattern generalized. The scenario ships:
|
|
1696
|
+
* - fixture data (setup instructions)
|
|
1697
|
+
* - a test command the harness runs
|
|
1698
|
+
* - optional assertion overrides
|
|
1699
|
+
*
|
|
1700
|
+
* The runner emits a run, delegates to SandboxHarness, records the
|
|
1701
|
+
* outcome, and returns a structured verdict. Consumers bind their own
|
|
1702
|
+
* agent execution to this contract.
|
|
1703
|
+
*/
|
|
1704
|
+
|
|
1705
|
+
interface TestGradedScenario {
|
|
1706
|
+
id: string;
|
|
1707
|
+
description?: string;
|
|
1708
|
+
harness: HarnessConfig;
|
|
1709
|
+
/** Optional pass threshold in 0..1 (default 1.0 = all tests must pass). */
|
|
1710
|
+
passThreshold?: number;
|
|
1711
|
+
/** Provenance for dataset tracking. */
|
|
1712
|
+
datasetVersion?: string;
|
|
1713
|
+
/** Free-form tags (difficulty, category, etc.). */
|
|
1714
|
+
tags?: Record<string, string>;
|
|
1715
|
+
}
|
|
1716
|
+
interface TestGradedRunOptions {
|
|
1717
|
+
variantId?: string;
|
|
1718
|
+
driver?: SandboxDriver;
|
|
1719
|
+
/** Metadata recorded on the Run (codeSha, promptSha, modelFingerprint, seed, envFingerprint). */
|
|
1720
|
+
provenance?: Pick<Run, 'codeSha' | 'promptSha' | 'modelFingerprint' | 'seed' | 'envFingerprint'>;
|
|
1721
|
+
}
|
|
1722
|
+
interface TestGradedRunResult {
|
|
1723
|
+
runId: string;
|
|
1724
|
+
scenario: TestGradedScenario;
|
|
1725
|
+
harness: SandboxHarnessResult;
|
|
1726
|
+
pass: boolean;
|
|
1727
|
+
score: number;
|
|
1728
|
+
failureClass?: FailureClass;
|
|
1729
|
+
}
|
|
1730
|
+
declare function runTestGradedScenario(scenario: TestGradedScenario, store: TraceStore, options?: TestGradedRunOptions): Promise<TestGradedRunResult>;
|
|
1731
|
+
|
|
1732
|
+
/**
|
|
1733
|
+
* BudgetGuard — enforces token / wall-clock / call / $ caps, records
|
|
1734
|
+
* a ledger entry on every decrement, emits `budget_breach` + throws
|
|
1735
|
+
* `BudgetBreachError` when a cap is hit.
|
|
1736
|
+
*
|
|
1737
|
+
* Wraps a TraceEmitter. The emitter persists ledger entries + breach
|
|
1738
|
+
* events so the classifier, pipelines, and reports can all read
|
|
1739
|
+
* budget state from the trace corpus — no separate accounting.
|
|
1740
|
+
*/
|
|
1741
|
+
|
|
1742
|
+
declare class BudgetBreachError extends Error {
|
|
1743
|
+
dimension: keyof BudgetSpec;
|
|
1744
|
+
limit: number;
|
|
1745
|
+
attempted: number;
|
|
1746
|
+
constructor(dimension: keyof BudgetSpec, limit: number, attempted: number);
|
|
1747
|
+
}
|
|
1748
|
+
declare class BudgetGuard {
|
|
1749
|
+
private consumed;
|
|
1750
|
+
private emitter;
|
|
1751
|
+
private budget;
|
|
1752
|
+
private startedAt;
|
|
1753
|
+
constructor(emitter: TraceEmitter, budget: BudgetSpec, now?: () => number);
|
|
1754
|
+
/** Record consumption. Throws `BudgetBreachError` if any dimension exceeds its cap. */
|
|
1755
|
+
charge(delta: Partial<Record<keyof BudgetSpec, number>>, spanId?: string): Promise<void>;
|
|
1756
|
+
/** Convenience: advance wall-clock budget based on elapsed wall time. */
|
|
1757
|
+
tickWall(nowMs: number, spanId?: string): Promise<void>;
|
|
1758
|
+
get state(): Record<keyof BudgetSpec, number>;
|
|
1759
|
+
}
|
|
1760
|
+
|
|
1761
|
+
/**
|
|
1762
|
+
* Failure taxonomy — canonical classes + a default classifier.
|
|
1763
|
+
*
|
|
1764
|
+
* Every failed run should end up in a named class. The classifier here
|
|
1765
|
+
* is rule-based (fast, deterministic); an LLM fallback can be added by
|
|
1766
|
+
* the consumer for novel cases and trained into the rule base over time.
|
|
1767
|
+
*
|
|
1768
|
+
* Consumers call `classifyFailure(run, spans, events)` and persist the
|
|
1769
|
+
* returned class as `Run.outcome.failureClass`.
|
|
1770
|
+
*/
|
|
1771
|
+
|
|
1772
|
+
interface FailureContext {
|
|
1773
|
+
run: Run;
|
|
1774
|
+
spans: Span[];
|
|
1775
|
+
events: TraceEvent[];
|
|
1776
|
+
}
|
|
1777
|
+
interface FailureClassification {
|
|
1778
|
+
failureClass: FailureClass;
|
|
1779
|
+
reason: string;
|
|
1780
|
+
triggerSpanId?: string;
|
|
1781
|
+
triggerEventId?: string;
|
|
1782
|
+
}
|
|
1783
|
+
/** Ordered rules — first match wins. */
|
|
1784
|
+
interface FailureRule {
|
|
1785
|
+
id: string;
|
|
1786
|
+
match: (ctx: FailureContext) => {
|
|
1787
|
+
failureClass: FailureClass;
|
|
1788
|
+
reason: string;
|
|
1789
|
+
triggerSpanId?: string;
|
|
1790
|
+
triggerEventId?: string;
|
|
1791
|
+
} | null;
|
|
1792
|
+
}
|
|
1793
|
+
declare const DEFAULT_RULES: FailureRule[];
|
|
1794
|
+
/** Classify the failure mode of a run using an ordered rule list. */
|
|
1795
|
+
declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
|
|
1796
|
+
|
|
1797
|
+
/**
|
|
1798
|
+
* Trajectory — ordered, structured view over a run's spans.
|
|
1799
|
+
*
|
|
1800
|
+
* A pure function `buildTrajectory(store, runId) → Trajectory` returns
|
|
1801
|
+
* a topologically ordered list of `TrajectoryStep` with parent-child
|
|
1802
|
+
* grouping collapsed into a single line-of-agent-work. Separate
|
|
1803
|
+
* analyzers (stuck-loop detection, waste ratio) live in
|
|
1804
|
+
* `pipelines/` and consume the trajectory.
|
|
1805
|
+
*/
|
|
1806
|
+
|
|
1807
|
+
interface TrajectoryStep {
|
|
1808
|
+
index: number;
|
|
1809
|
+
span: Span;
|
|
1810
|
+
/** Depth in the span tree from the root. 0 = top-level. */
|
|
1811
|
+
depth: number;
|
|
1812
|
+
/** Events attached to this span. */
|
|
1813
|
+
events: TraceEvent[];
|
|
1814
|
+
}
|
|
1815
|
+
interface Trajectory {
|
|
1816
|
+
runId: string;
|
|
1817
|
+
steps: TrajectoryStep[];
|
|
1818
|
+
llmTurns: number;
|
|
1819
|
+
toolCalls: number;
|
|
1820
|
+
judgeVerdicts: number;
|
|
1821
|
+
retrievals: number;
|
|
1822
|
+
totalDurationMs: number;
|
|
1823
|
+
}
|
|
1824
|
+
declare function buildTrajectory(store: TraceStore, runId: string): Promise<Trajectory>;
|
|
1825
|
+
|
|
1826
|
+
/**
|
|
1827
|
+
* Tool-use metrics — derived purely from trace data.
|
|
1828
|
+
*
|
|
1829
|
+
* No scoring assumptions: consumers supply optional ground-truth tool
|
|
1830
|
+
* selections per turn + optional "information used downstream" signals.
|
|
1831
|
+
* Without those, we still compute descriptive metrics (error rate,
|
|
1832
|
+
* retry rate, duplicate-call rate) that are useful on their own.
|
|
1833
|
+
*/
|
|
1834
|
+
|
|
1835
|
+
interface ToolUseMetrics {
|
|
1836
|
+
runId: string;
|
|
1837
|
+
totalCalls: number;
|
|
1838
|
+
byTool: Record<string, ToolStats>;
|
|
1839
|
+
errorRate: number;
|
|
1840
|
+
/** Ratio of calls with identical (toolName, argHash) already seen earlier in the same run. */
|
|
1841
|
+
duplicateRate: number;
|
|
1842
|
+
/** Ratio of error calls followed by ≥1 retry on the same tool. */
|
|
1843
|
+
retryRate: number;
|
|
1844
|
+
/** Optional: of the calls the agent made, the fraction the evaluator marked as "correct selection". */
|
|
1845
|
+
selectionAccuracy?: number;
|
|
1846
|
+
}
|
|
1847
|
+
interface ToolStats {
|
|
1848
|
+
calls: number;
|
|
1849
|
+
errors: number;
|
|
1850
|
+
avgLatencyMs: number;
|
|
1851
|
+
duplicates: number;
|
|
1852
|
+
}
|
|
1853
|
+
interface ToolUseOptions {
|
|
1854
|
+
/** Map of spanId → whether the evaluator judged the tool selection correct. Optional. */
|
|
1855
|
+
selectionLabels?: Record<string, boolean>;
|
|
1856
|
+
}
|
|
1857
|
+
declare function computeToolUseMetrics(store: TraceStore, runId: string, options?: ToolUseOptions): Promise<ToolUseMetrics>;
|
|
1858
|
+
|
|
1859
|
+
/**
|
|
1860
|
+
* StuckLoopView — detects when an agent calls the same tool with the
|
|
1861
|
+
* same (or structurally similar) arguments ≥ N times in a short window.
|
|
1862
|
+
*
|
|
1863
|
+
* Rationale: agents that loop are the number-one production failure
|
|
1864
|
+
* mode on long-horizon flows. The view returns (runId, toolName,
|
|
1865
|
+
* argHash, occurrences, windowMs) for each detected loop plus a
|
|
1866
|
+
* fraction of runs affected.
|
|
1867
|
+
*/
|
|
1868
|
+
|
|
1869
|
+
interface StuckLoopFinding {
|
|
1870
|
+
runId: string;
|
|
1871
|
+
toolName: string;
|
|
1872
|
+
argHash: string;
|
|
1873
|
+
occurrences: number;
|
|
1874
|
+
spanIds: string[];
|
|
1875
|
+
/** Milliseconds between first and last call in the loop. */
|
|
1876
|
+
windowMs: number;
|
|
1877
|
+
}
|
|
1878
|
+
interface StuckLoopReport {
|
|
1879
|
+
findings: StuckLoopFinding[];
|
|
1880
|
+
affectedRunRatio: number;
|
|
1881
|
+
totalRuns: number;
|
|
1882
|
+
}
|
|
1883
|
+
interface StuckLoopOptions {
|
|
1884
|
+
/** Minimum call count to flag a loop (default 3). */
|
|
1885
|
+
minOccurrences?: number;
|
|
1886
|
+
/** Filter to a specific runId; omit to scan the entire corpus. */
|
|
1887
|
+
runId?: string;
|
|
1888
|
+
}
|
|
1889
|
+
declare function stuckLoopView(store: TraceStore, options?: StuckLoopOptions): Promise<StuckLoopReport>;
|
|
1890
|
+
|
|
1891
|
+
/**
|
|
1892
|
+
* ToolWasteView — fraction of tool calls whose results weren't used
|
|
1893
|
+
* downstream. Without a "used" signal we fall back to structural
|
|
1894
|
+
* proxies: error calls, duplicate calls, and tool calls followed by
|
|
1895
|
+
* zero subsequent LLM spans are all considered waste.
|
|
1896
|
+
*
|
|
1897
|
+
* Consumers can pass a `usageOracle` that inspects a tool span and
|
|
1898
|
+
* returns true iff the tool's result appears in a later LLM message,
|
|
1899
|
+
* artifact, or state mutation — that's the canonical definition; the
|
|
1900
|
+
* default heuristic is a reasonable fallback.
|
|
1901
|
+
*/
|
|
1902
|
+
|
|
1903
|
+
interface ToolWasteFinding {
|
|
1904
|
+
runId: string;
|
|
1905
|
+
wastedCalls: number;
|
|
1906
|
+
totalCalls: number;
|
|
1907
|
+
wasteRate: number;
|
|
1908
|
+
}
|
|
1909
|
+
interface ToolWasteReport {
|
|
1910
|
+
byRun: ToolWasteFinding[];
|
|
1911
|
+
overallWasteRate: number;
|
|
1912
|
+
}
|
|
1913
|
+
interface ToolWasteOptions {
|
|
1914
|
+
runId?: string;
|
|
1915
|
+
usageOracle?: (tool: ToolSpan, later: {
|
|
1916
|
+
llm: Awaited<ReturnType<typeof llmSpans>>;
|
|
1917
|
+
}) => boolean;
|
|
1918
|
+
}
|
|
1919
|
+
declare function toolWasteView(store: TraceStore, options?: ToolWasteOptions): Promise<ToolWasteReport>;
|
|
1920
|
+
|
|
1921
|
+
/**
|
|
1922
|
+
* BudgetBreachView — aggregates breach events across the corpus.
|
|
1923
|
+
*
|
|
1924
|
+
* Answers: which dimensions get hit most often? Which scenarios are
|
|
1925
|
+
* underbudgeted? Which variants trigger the most breaches?
|
|
1926
|
+
*/
|
|
1927
|
+
|
|
1928
|
+
interface BudgetBreachFinding {
|
|
1929
|
+
runId: string;
|
|
1930
|
+
scenarioId: string;
|
|
1931
|
+
variantId?: string;
|
|
1932
|
+
dimension: keyof BudgetSpec;
|
|
1933
|
+
limit: number;
|
|
1934
|
+
consumed: number;
|
|
1935
|
+
excessRatio: number;
|
|
1936
|
+
timestamp: number;
|
|
1937
|
+
}
|
|
1938
|
+
interface BudgetBreachReport {
|
|
1939
|
+
findings: BudgetBreachFinding[];
|
|
1940
|
+
byDimension: Record<string, number>;
|
|
1941
|
+
byScenario: Record<string, number>;
|
|
1942
|
+
byVariant: Record<string, number>;
|
|
1943
|
+
totalRuns: number;
|
|
1944
|
+
breachedRunRatio: number;
|
|
1945
|
+
}
|
|
1946
|
+
declare function budgetBreachView(store: TraceStore, options?: {
|
|
1947
|
+
scenarioId?: string;
|
|
1948
|
+
variantId?: string;
|
|
1949
|
+
}): Promise<BudgetBreachReport>;
|
|
1950
|
+
|
|
1951
|
+
/**
|
|
1952
|
+
* FailureClusterView — groups failed runs by (failureClass, triggerTool,
|
|
1953
|
+
* argHash-prefix) so weekly reviews can prioritize the top-N clusters.
|
|
1954
|
+
*
|
|
1955
|
+
* Each cluster includes: N runs, scenarios affected, representative
|
|
1956
|
+
* error message, a proposed mitigation hint (rule → action table).
|
|
1957
|
+
*/
|
|
1958
|
+
|
|
1959
|
+
interface FailureCluster {
|
|
1960
|
+
failureClass: FailureClass;
|
|
1961
|
+
/** Tool name when the trigger was a tool span, else undefined. */
|
|
1962
|
+
toolName?: string;
|
|
1963
|
+
/** First 16 chars of argHash — clusters similar args. */
|
|
1964
|
+
argPrefix?: string;
|
|
1965
|
+
runCount: number;
|
|
1966
|
+
scenarioIds: string[];
|
|
1967
|
+
exampleError?: string;
|
|
1968
|
+
exampleRunId: string;
|
|
1969
|
+
}
|
|
1970
|
+
interface FailureClusterReport {
|
|
1971
|
+
clusters: FailureCluster[];
|
|
1972
|
+
totalFailures: number;
|
|
1973
|
+
totalRuns: number;
|
|
1974
|
+
}
|
|
1975
|
+
declare function failureClusterView(store: TraceStore, options?: {
|
|
1976
|
+
rules?: FailureRule[];
|
|
1977
|
+
minClusterSize?: number;
|
|
1978
|
+
}): Promise<FailureClusterReport>;
|
|
1979
|
+
|
|
1980
|
+
/**
|
|
1981
|
+
* JudgeAgreementView — pairwise agreement between judges across the
|
|
1982
|
+
* corpus, grouped by dimension.
|
|
1983
|
+
*
|
|
1984
|
+
* Output drives two workflows:
|
|
1985
|
+
* - Judge robustness audit: "does Claude agree with GPT at κ ≥ 0.6?"
|
|
1986
|
+
* - Calibration tracking: κ vs golden human labels over time (by
|
|
1987
|
+
* providing a `humanGoldenJudgeId`).
|
|
1988
|
+
*/
|
|
1989
|
+
|
|
1990
|
+
interface JudgePair {
|
|
1991
|
+
judgeA: string;
|
|
1992
|
+
judgeB: string;
|
|
1993
|
+
dimension: string;
|
|
1994
|
+
/** Number of (targetSpanId, dimension) tuples both judges scored. */
|
|
1995
|
+
commonItems: number;
|
|
1996
|
+
pearson: number;
|
|
1997
|
+
krippendorff: number;
|
|
1998
|
+
}
|
|
1999
|
+
interface JudgeAgreementReport {
|
|
2000
|
+
pairs: JudgePair[];
|
|
2001
|
+
dimensions: string[];
|
|
2002
|
+
judgeIds: string[];
|
|
2003
|
+
}
|
|
2004
|
+
declare function judgeAgreementView(store: TraceStore): Promise<JudgeAgreementReport>;
|
|
2005
|
+
|
|
2006
|
+
/**
|
|
2007
|
+
* FirstDivergenceView — aligns two trajectories by step index, reports
|
|
2008
|
+
* the first step where they differ.
|
|
2009
|
+
*
|
|
2010
|
+
* "Differ" is configurable — default is (kind, toolName if tool, model
|
|
2011
|
+
* if llm). Use this view to attribute "why is variant B better?" to a
|
|
2012
|
+
* specific step rather than an aggregate mean delta.
|
|
2013
|
+
*/
|
|
2014
|
+
|
|
2015
|
+
interface DivergenceReport {
|
|
2016
|
+
runA: string;
|
|
2017
|
+
runB: string;
|
|
2018
|
+
firstDivergenceIndex: number | null;
|
|
2019
|
+
aStep?: TrajectoryStep;
|
|
2020
|
+
bStep?: TrajectoryStep;
|
|
2021
|
+
reason?: string;
|
|
2022
|
+
/** Common prefix length (steps that matched). */
|
|
2023
|
+
commonPrefixLen: number;
|
|
2024
|
+
}
|
|
2025
|
+
interface DivergenceOptions {
|
|
2026
|
+
/** Returns true if two steps are considered equal. Default: kind + tool/model match. */
|
|
2027
|
+
stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
|
|
2028
|
+
}
|
|
2029
|
+
declare function firstDivergenceView(store: TraceStore, runA: string, runB: string, options?: DivergenceOptions): Promise<DivergenceReport>;
|
|
2030
|
+
|
|
2031
|
+
/**
|
|
2032
|
+
* Baseline regression detection.
|
|
2033
|
+
*
|
|
2034
|
+
* Lifted from ADC baseline.ts. Every promotion-blocking signal boils down
|
|
2035
|
+
* to: "is this run measurably worse than baseline?" — with enough
|
|
2036
|
+
* statistical rigor to distinguish noise from drift.
|
|
2037
|
+
*
|
|
2038
|
+
* Uses:
|
|
2039
|
+
* - Welch's t-test (unequal variance) for per-metric mean comparison
|
|
2040
|
+
* - Cohen's d for effect size magnitude
|
|
2041
|
+
* - IQR for stability flag (unstable samples can't be trusted for comparisons)
|
|
2042
|
+
*
|
|
2043
|
+
* Returns a structured verdict: improved | regressed | stable | unstable.
|
|
2044
|
+
*/
|
|
2045
|
+
interface MetricSamples {
|
|
2046
|
+
/** Stable metric key (e.g. "overallScore", "firstTokenMs"). */
|
|
2047
|
+
metric: string;
|
|
2048
|
+
/** Whether higher values are better. */
|
|
2049
|
+
higherIsBetter: boolean;
|
|
2050
|
+
baseline: number[];
|
|
2051
|
+
candidate: number[];
|
|
2052
|
+
}
|
|
2053
|
+
interface MetricVerdict {
|
|
2054
|
+
metric: string;
|
|
2055
|
+
baselineMean: number;
|
|
2056
|
+
candidateMean: number;
|
|
2057
|
+
delta: number;
|
|
2058
|
+
cohensD: number;
|
|
2059
|
+
welchT: number;
|
|
2060
|
+
welchDf: number;
|
|
2061
|
+
welchP: number;
|
|
2062
|
+
stable: boolean;
|
|
2063
|
+
/** IQR of the combined samples — used as a rough stability indicator. */
|
|
2064
|
+
iqr: number;
|
|
2065
|
+
verdict: 'improved' | 'regressed' | 'stable' | 'unstable';
|
|
2066
|
+
}
|
|
2067
|
+
interface BaselineReport {
|
|
2068
|
+
metrics: MetricVerdict[];
|
|
2069
|
+
/** True if any critical metric regressed. */
|
|
2070
|
+
hasRegression: boolean;
|
|
2071
|
+
/** True if any metric is unstable (too noisy to judge). */
|
|
2072
|
+
hasUnstable: boolean;
|
|
2073
|
+
}
|
|
2074
|
+
interface BaselineOptions {
|
|
2075
|
+
/** Effect size threshold for meaningful delta (default 0.5 — medium effect). */
|
|
2076
|
+
effectThreshold?: number;
|
|
2077
|
+
/** p-value threshold for statistical significance (default 0.05). */
|
|
2078
|
+
alpha?: number;
|
|
2079
|
+
/** IQR/|mean| ratio above which samples are flagged unstable (default 0.30). */
|
|
2080
|
+
unstableCvThreshold?: number;
|
|
2081
|
+
}
|
|
2082
|
+
/**
|
|
2083
|
+
* Compare candidate samples against baseline per metric. Verdict logic:
|
|
2084
|
+
* - unstable: IQR/|mean| > threshold on either set — not enough signal
|
|
2085
|
+
* - improved: meaningful effect in the "better" direction AND p < alpha
|
|
2086
|
+
* - regressed: meaningful effect in the "worse" direction AND p < alpha
|
|
2087
|
+
* - stable: otherwise (no significant change)
|
|
2088
|
+
*/
|
|
2089
|
+
declare function compareToBaseline(samples: MetricSamples[], options?: BaselineOptions): BaselineReport;
|
|
2090
|
+
/** Inter-quartile range; 0 when the sample has no spread. */
|
|
2091
|
+
declare function iqr(xs: number[]): number;
|
|
2092
|
+
/**
|
|
2093
|
+
* Welch's t-test — unequal-variance two-sample t. Uses the same Student-t
|
|
2094
|
+
* CDF as `pairedTTest` (via incomplete beta); falls back to normal tail
|
|
2095
|
+
* when df is large.
|
|
2096
|
+
*/
|
|
2097
|
+
declare function welchsTTest(a: number[], b: number[]): {
|
|
2098
|
+
t: number;
|
|
2099
|
+
df: number;
|
|
2100
|
+
p: number;
|
|
2101
|
+
};
|
|
2102
|
+
|
|
2103
|
+
/**
|
|
2104
|
+
* RegressionView — compares a candidate slice to a baseline slice on a
|
|
2105
|
+
* named metric. Delegates the statistics (Welch's t-test, Cohen's d,
|
|
2106
|
+
* IQR stability) to `baseline.ts`.
|
|
2107
|
+
*
|
|
2108
|
+
* This is the entry point for CI regression gates: "given runs tagged
|
|
2109
|
+
* release=A and release=B, did any metric regress?"
|
|
2110
|
+
*/
|
|
2111
|
+
|
|
2112
|
+
interface RegressionSpec {
|
|
2113
|
+
metric: string;
|
|
2114
|
+
higherIsBetter: boolean;
|
|
2115
|
+
/** Extract a scalar from a run. Default extractors handle common metrics. */
|
|
2116
|
+
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
2117
|
+
}
|
|
2118
|
+
interface RegressionOptions extends BaselineOptions {
|
|
2119
|
+
baseline: RunFilter;
|
|
2120
|
+
candidate: RunFilter;
|
|
2121
|
+
}
|
|
2122
|
+
declare function regressionView(store: TraceStore, metrics: RegressionSpec[], options: RegressionOptions): Promise<BaselineReport>;
|
|
2123
|
+
|
|
2124
|
+
/**
|
|
2125
|
+
* SLO gates — quantified pass/fail primitives beyond score thresholds.
|
|
2126
|
+
*
|
|
2127
|
+
* Lifted from ADC's sandbox eval suite. Each SLO defines a metric, a
|
|
2128
|
+
* threshold, and a severity (critical | warning). Critical breaches fail
|
|
2129
|
+
* the eval; warnings are reported but don't gate CI. Margin is the
|
|
2130
|
+
* ratio of actual to threshold for histogramming "how close are we?"
|
|
2131
|
+
*
|
|
2132
|
+
* Consumers assemble their own SLO arrays; DEFAULT_AGENT_SLOS covers
|
|
2133
|
+
* the generic agent flow (provision, first token, pass rate, cost).
|
|
2134
|
+
*/
|
|
2135
|
+
type SloSeverity = 'critical' | 'warning';
|
|
2136
|
+
type SloComparator = 'lte' | 'gte';
|
|
2137
|
+
interface Slo {
|
|
2138
|
+
/** Stable identifier — must be unique within an SLO set. */
|
|
2139
|
+
id: string;
|
|
2140
|
+
/** Human description, shown in reports. */
|
|
2141
|
+
description: string;
|
|
2142
|
+
/** Metric key looked up in the candidate record. */
|
|
2143
|
+
metric: string;
|
|
2144
|
+
/** Whether the metric should stay below (lte) or above (gte) threshold. */
|
|
2145
|
+
comparator: SloComparator;
|
|
2146
|
+
/** Threshold value. */
|
|
2147
|
+
threshold: number;
|
|
2148
|
+
severity: SloSeverity;
|
|
2149
|
+
}
|
|
2150
|
+
interface SloCheckResult {
|
|
2151
|
+
slo: Slo;
|
|
2152
|
+
actual: number | undefined;
|
|
2153
|
+
passed: boolean;
|
|
2154
|
+
/** Safety margin: threshold/actual for lte, actual/threshold for gte. >1 means safe margin; <1 means breach. 0 when actual is missing. */
|
|
2155
|
+
margin: number;
|
|
2156
|
+
detail: string;
|
|
2157
|
+
}
|
|
2158
|
+
interface SloReport {
|
|
2159
|
+
results: SloCheckResult[];
|
|
2160
|
+
passedCritical: boolean;
|
|
2161
|
+
criticalBreaches: SloCheckResult[];
|
|
2162
|
+
warnings: SloCheckResult[];
|
|
2163
|
+
}
|
|
2164
|
+
/**
|
|
2165
|
+
* Evaluate an SLO set against a candidate metrics object. Missing metrics
|
|
2166
|
+
* count as breaches — if you declared it, you must measure it.
|
|
2167
|
+
*/
|
|
2168
|
+
declare function checkSlos(metrics: Record<string, number>, slos: Slo[]): SloReport;
|
|
2169
|
+
/** Reference SLO set for agent-style evals. Tune per-product by cloning + overriding. */
|
|
2170
|
+
declare const DEFAULT_AGENT_SLOS: Slo[];
|
|
2171
|
+
|
|
2172
|
+
/**
|
|
2173
|
+
* Declarative oracles — ground-truth assertions without an LLM.
|
|
2174
|
+
*
|
|
2175
|
+
* Lifted from browser-agent-driver's _oracle.mjs. When you know the
|
|
2176
|
+
* expected outcome exactly (a URL, a text fragment, a JSON shape), you
|
|
2177
|
+
* don't need an LLM judge — you need a regex. These oracles are
|
|
2178
|
+
* composable pass/fail checks over an observation bundle.
|
|
2179
|
+
*
|
|
2180
|
+
* Each oracle returns { pass, detail, evidence? } and has a short
|
|
2181
|
+
* `id` for reporting. `evaluateOracles` runs a batch and aggregates.
|
|
2182
|
+
*/
|
|
2183
|
+
interface OracleObservation {
|
|
2184
|
+
/** Final observable text output from the agent (response, page snapshot, stdout). */
|
|
2185
|
+
text?: string;
|
|
2186
|
+
/** Final URL — for browser-style scenarios. */
|
|
2187
|
+
url?: string;
|
|
2188
|
+
/** Any structured JSON the agent produced. */
|
|
2189
|
+
json?: unknown;
|
|
2190
|
+
/** Free-form context used by custom oracles. */
|
|
2191
|
+
context?: Record<string, unknown>;
|
|
2192
|
+
}
|
|
2193
|
+
interface OracleResult {
|
|
2194
|
+
id: string;
|
|
2195
|
+
pass: boolean;
|
|
2196
|
+
detail: string;
|
|
2197
|
+
evidence?: string;
|
|
2198
|
+
}
|
|
2199
|
+
interface Oracle {
|
|
2200
|
+
id: string;
|
|
2201
|
+
check(obs: OracleObservation): OracleResult;
|
|
2202
|
+
}
|
|
2203
|
+
declare function textInSnapshot(needle: string, opts?: {
|
|
2204
|
+
caseSensitive?: boolean;
|
|
2205
|
+
}): Oracle;
|
|
2206
|
+
declare function urlContains(fragment: string): Oracle;
|
|
2207
|
+
declare function jsonShape(expected: Record<string, unknown>): Oracle;
|
|
2208
|
+
declare function regexMatches(pattern: RegExp): Oracle;
|
|
2209
|
+
/**
|
|
2210
|
+
* Anti-bot detector — distinguishes genuine failures from blocked navigation
|
|
2211
|
+
* (cloudflare, recaptcha, etc). Returns an Oracle that PASSES when no block
|
|
2212
|
+
* marker is present; on block, detail names the blocker so runners can tag
|
|
2213
|
+
* results as "blocked" rather than "failed". Lifted from browser-agent-driver.
|
|
2214
|
+
*/
|
|
2215
|
+
declare function notBlocked(): Oracle;
|
|
2216
|
+
interface OracleReport {
|
|
2217
|
+
results: OracleResult[];
|
|
2218
|
+
pass: boolean;
|
|
2219
|
+
passCount: number;
|
|
2220
|
+
failCount: number;
|
|
2221
|
+
/** 0-1 ratio of oracles passed. */
|
|
2222
|
+
score: number;
|
|
2223
|
+
}
|
|
2224
|
+
/** Run all oracles against one observation and aggregate. */
|
|
2225
|
+
declare function evaluateOracles(obs: OracleObservation, oracles: Oracle[]): OracleReport;
|
|
2226
|
+
|
|
2227
|
+
/**
|
|
2228
|
+
* Cost tracker — token + USD accounting per scenario and per run.
|
|
2229
|
+
*
|
|
2230
|
+
* Lifted from tax/legal metrics.ts + tangle-router UsageEvent. Every
|
|
2231
|
+
* optimizer needs to know "is the quality gain worth the cost delta?",
|
|
2232
|
+
* and every dashboard needs dollars-per-completed-task. MODEL_PRICING
|
|
2233
|
+
* from metrics.ts stays authoritative for estimate math; this module
|
|
2234
|
+
* adds the aggregation + per-scenario roll-up that was duplicated
|
|
2235
|
+
* across 4 verticals.
|
|
2236
|
+
*/
|
|
2237
|
+
interface TokenSpec {
|
|
2238
|
+
inputTokens: number;
|
|
2239
|
+
outputTokens: number;
|
|
2240
|
+
cachedTokens?: number;
|
|
2241
|
+
reasoningTokens?: number;
|
|
2242
|
+
}
|
|
2243
|
+
interface CostEntry extends TokenSpec {
|
|
2244
|
+
scenarioId: string;
|
|
2245
|
+
model: string;
|
|
2246
|
+
/** Override estimate with an observed cost (e.g. from provider response). */
|
|
2247
|
+
actualCostUsd?: number;
|
|
2248
|
+
timestamp: number;
|
|
2249
|
+
/** Free-form tags (variant id, round #, etc.). */
|
|
2250
|
+
tags?: Record<string, string>;
|
|
2251
|
+
}
|
|
2252
|
+
interface ScenarioCost {
|
|
2253
|
+
scenarioId: string;
|
|
2254
|
+
entries: CostEntry[];
|
|
2255
|
+
totalInputTokens: number;
|
|
2256
|
+
totalOutputTokens: number;
|
|
2257
|
+
totalCachedTokens: number;
|
|
2258
|
+
totalCostUsd: number;
|
|
2259
|
+
/** Pass flag — set by consumer via markOutcome; used for cost-per-completed-task. */
|
|
2260
|
+
completed?: boolean;
|
|
2261
|
+
}
|
|
2262
|
+
/**
 * Accumulates `CostEntry` records and exposes them per scenario
 * (`ScenarioCost`) and as a run-wide roll-up (`CostSummary`).
 */
declare class CostTracker {
|
|
2263
|
+
private byScenario;
|
|
2264
|
+
/**
 * Record one cost entry and return it as a full `CostEntry`.
 * `timestamp` is optional on input but required on the returned entry —
 * presumably defaulted by the tracker when omitted; confirm in the implementation.
 */
record(entry: Omit<CostEntry, 'timestamp'> & {
|
|
2265
|
+
timestamp?: number;
|
|
2266
|
+
}): CostEntry;
|
|
2267
|
+
/** Set the pass flag for a scenario; `ScenarioCost.completed` documents it as feeding cost-per-completed-task. */
markOutcome(scenarioId: string, completed: boolean): void;
|
|
2268
|
+
/** Look up the roll-up for one scenario; `undefined` when there is none for that id. */
get(scenarioId: string): ScenarioCost | undefined;
|
|
2269
|
+
/** Return the per-scenario roll-ups as an array. */
list(): ScenarioCost[];
|
|
2270
|
+
/** Return the aggregate `CostSummary`. */
summary(): CostSummary;
|
|
2271
|
+
}
|
|
2272
|
+
interface CostSummary {
|
|
2273
|
+
scenarioCount: number;
|
|
2274
|
+
completedCount: number;
|
|
2275
|
+
totalInputTokens: number;
|
|
2276
|
+
totalOutputTokens: number;
|
|
2277
|
+
totalCostUsd: number;
|
|
2278
|
+
avgCostPerScenarioUsd: number;
|
|
2279
|
+
/** Total USD / completed scenarios — null when nothing completed. */
|
|
2280
|
+
costPerCompletedTaskUsd: number | null;
|
|
2281
|
+
}
|
|
2282
|
+
|
|
2283
|
+
/**
|
|
2284
|
+
* Pareto frontier — multi-objective optimization over candidate runs.
|
|
2285
|
+
*
|
|
2286
|
+
* Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
|
|
2287
|
+
* trading off (cost, latency, quality) or (passRate, tokenBudget,
|
|
2288
|
+
* ttfb), you rarely have a single "winner" — you have a set of
|
|
2289
|
+
* non-dominated candidates. This module exposes:
|
|
2290
|
+
*
|
|
2291
|
+
* - `paretoFrontier`: filter a set of candidates to the non-dominated ones
|
|
2292
|
+
* - `dominates`: does A dominate B across all objectives?
|
|
2293
|
+
*
|
|
2294
|
+
* Each objective is declared with a direction: 'maximize' (higher=better)
|
|
2295
|
+
* or 'minimize' (lower=better). Candidates are any object; pass an
|
|
2296
|
+
* `objective(candidate)` accessor.
|
|
2297
|
+
*/
|
|
2298
|
+
type Direction = 'maximize' | 'minimize';
|
|
2299
|
+
interface Objective<T> {
|
|
2300
|
+
/** Stable label used in reports. */
|
|
2301
|
+
name: string;
|
|
2302
|
+
direction: Direction;
|
|
2303
|
+
value: (candidate: T) => number;
|
|
2304
|
+
}
|
|
2305
|
+
/** Result of `paretoFrontier`: the candidates split into frontier and dominated sets, plus who dominates whom. */
interface ParetoResult<T> {
|
|
2306
|
+
/** Candidates on the non-dominated frontier. */
frontier: T[];
|
|
2307
|
+
/** Candidates not on the frontier — presumably every ranked candidate that some other candidate dominates; confirm. */
dominated: T[];
|
|
2308
|
+
/** Dominance map: each entry pairs a frontier member (`dominator`) with the candidates it dominates (`dominated`). */
dominanceMap: Array<{
|
|
2309
|
+
dominator: T;
|
|
2310
|
+
dominated: T[];
|
|
2311
|
+
}>;
|
|
2312
|
+
}
|
|
2314
|
+
/** Does candidate A weakly dominate B — strictly better on at least one objective and no worse on any? */
|
|
2315
|
+
declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
|
|
2316
|
+
/**
|
|
2317
|
+
* Compute the non-dominated frontier. Candidates with NaN/Infinity on any
|
|
2318
|
+
* objective are excluded (can't rank them). A candidate enters the frontier
|
|
2319
|
+
* iff no other candidate dominates it.
|
|
2320
|
+
*/
|
|
2321
|
+
declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
|
|
2322
|
+
|
|
2323
|
+
/**
|
|
2324
|
+
* Series convergence — detects whether a sequence of scalar measurements
|
|
2325
|
+
* is stabilizing, drifting, or noisy.
|
|
2326
|
+
*
|
|
2327
|
+
* Lifted from ADC convergence.ts. The per-turn `ConvergenceTracker` is
|
|
2328
|
+
* about progress *within* a single run; this module is about drift
|
|
2329
|
+
* *across* runs (e.g. "are my nightly eval scores stabilizing?").
|
|
2330
|
+
*
|
|
2331
|
+
* Three signals:
|
|
2332
|
+
* - stabilized: last K values have low variance (< epsilon) — done
|
|
2333
|
+
* - drifting: recent trend is monotonic and beyond noise — regressing or improving
|
|
2334
|
+
* - noisy: neither — keep iterating, but flag as untrustworthy for gating
|
|
2335
|
+
*/
|
|
2336
|
+
interface SeriesConvergenceOptions {
|
|
2337
|
+
/** Window size for "recent" analysis (default 5). */
|
|
2338
|
+
window?: number;
|
|
2339
|
+
/** Coefficient-of-variation threshold below which the window is stabilized (default 0.05 = 5%). */
|
|
2340
|
+
stableCv?: number;
|
|
2341
|
+
/** Minimum monotone run length to call drift (default 3). */
|
|
2342
|
+
driftRun?: number;
|
|
2343
|
+
}
|
|
2344
|
+
interface SeriesConvergenceResult {
|
|
2345
|
+
state: 'stabilized' | 'drifting-up' | 'drifting-down' | 'noisy' | 'insufficient-data';
|
|
2346
|
+
windowMean: number;
|
|
2347
|
+
windowCv: number;
|
|
2348
|
+
/** Longest monotonic run at the tail of the series (positive for up, negative for down). */
|
|
2349
|
+
tailRun: number;
|
|
2350
|
+
/** True when n ≥ window AND windowCv ≤ stableCv. */
|
|
2351
|
+
stable: boolean;
|
|
2352
|
+
}
|
|
2353
|
+
declare function analyzeSeries(values: number[], options?: SeriesConvergenceOptions): SeriesConvergenceResult;
|
|
2354
|
+
|
|
2355
|
+
/**
|
|
2356
|
+
* State continuity scoring — measures how well a resumed/handed-off agent
|
|
2357
|
+
* preserves prior work.
|
|
2358
|
+
*
|
|
2359
|
+
* Lifted from tax-agent's run-resume-eval.ts. When session 2 continues
|
|
2360
|
+
* session 1's work, the key question is: did it preserve key artifacts,
|
|
2361
|
+
* or start over and lose context? Each `ContinuityCheck` inspects one
|
|
2362
|
+
* aspect (file preserved, key count grew, status advanced) and yields
|
|
2363
|
+
* 0-1 credit; the aggregate is the simple mean.
|
|
2364
|
+
*
|
|
2365
|
+
* Generic over any "snapshot" shape — pass your own checks.
|
|
2366
|
+
*/
|
|
2367
|
+
interface ContinuitySnapshotPair<T> {
|
|
2368
|
+
before: T;
|
|
2369
|
+
after: T;
|
|
2370
|
+
}
|
|
2371
|
+
interface ContinuityCheck<T> {
|
|
2372
|
+
/** Stable identifier; shown in the report. */
|
|
2373
|
+
id: string;
|
|
2374
|
+
/** Description of what this check measures. */
|
|
2375
|
+
description: string;
|
|
2376
|
+
/** Returns 0..1 credit for this dimension (1 = fully preserved/improved). */
|
|
2377
|
+
score: (pair: ContinuitySnapshotPair<T>) => number;
|
|
2378
|
+
}
|
|
2379
|
+
interface ContinuityCheckResult {
|
|
2380
|
+
id: string;
|
|
2381
|
+
description: string;
|
|
2382
|
+
score: number;
|
|
2383
|
+
pass: boolean;
|
|
2384
|
+
}
|
|
2385
|
+
interface ContinuityReport {
|
|
2386
|
+
results: ContinuityCheckResult[];
|
|
2387
|
+
/** Mean of per-check scores, in 0..1. */
|
|
2388
|
+
overallScore: number;
|
|
2389
|
+
/** True iff ALL checks scored ≥ passThreshold. */
|
|
2390
|
+
pass: boolean;
|
|
2391
|
+
}
|
|
2392
|
+
declare function scoreContinuity<T>(pair: ContinuitySnapshotPair<T>, checks: ContinuityCheck<T>[], options?: {
|
|
2393
|
+
passThreshold?: number;
|
|
2394
|
+
}): ContinuityReport;
|
|
2395
|
+
/** Common check: a required key in a record exists and equals the prior value. */
|
|
2396
|
+
declare function keyPreserved<T extends Record<string, unknown>>(key: keyof T & string): ContinuityCheck<T>;
|
|
2397
|
+
/** Common check: a collection (array) grew or stayed the same size. */
|
|
2398
|
+
declare function collectionPreserved<T, K extends keyof T & string>(key: K, minRatio?: number): ContinuityCheck<T>;
|
|
2399
|
+
/** Common check: a status field advanced in an expected order. */
|
|
2400
|
+
declare function statusAdvanced<T extends Record<string, unknown>>(key: keyof T & string, progression: readonly string[]): ContinuityCheck<T>;
|
|
2401
|
+
|
|
2402
|
+
/**
|
|
2403
|
+
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
2404
|
+
*
|
|
2405
|
+
* Scenarios stop being ephemeral arrays and become first-class
|
|
2406
|
+
* artifacts. Every Dataset carries:
|
|
2407
|
+
* - content hash (sha256 over canonicalized scenario array)
|
|
2408
|
+
* - provenance (contributor, createdAt, sourceUrl)
|
|
2409
|
+
* - split labels (train | dev | test | holdout)
|
|
2410
|
+
* - difficulty tiers (easy | medium | hard | extreme)
|
|
2411
|
+
* - tags (free-form, per-scenario)
|
|
2412
|
+
*
|
|
2413
|
+
* `Dataset.slice({ difficulty, split, includeHoldout, seed })` returns a
|
|
2414
|
+
* deterministic, reproducible subset. Holdout slices are locked: you
|
|
2415
|
+
* can read them but `mutate` throws, which prevents "oh I'll just
|
|
2416
|
+
* tweak that one scenario" contamination drift.
|
|
2417
|
+
*/
|
|
2418
|
+
type DatasetSplit = 'train' | 'dev' | 'test' | 'holdout';
|
|
2419
|
+
type DatasetDifficulty = 'easy' | 'medium' | 'hard' | 'extreme';
|
|
2420
|
+
interface DatasetScenario {
|
|
2421
|
+
id: string;
|
|
2422
|
+
/** Arbitrary payload; the framework doesn't interpret it. */
|
|
2423
|
+
payload: unknown;
|
|
2424
|
+
split?: DatasetSplit;
|
|
2425
|
+
difficulty?: DatasetDifficulty;
|
|
2426
|
+
/** Canary token that MUST NOT round-trip through a correct agent output. */
|
|
2427
|
+
canary?: string;
|
|
2428
|
+
tags?: Record<string, string>;
|
|
2429
|
+
}
|
|
2430
|
+
interface DatasetProvenance {
|
|
2431
|
+
contributor?: string;
|
|
2432
|
+
createdAt: string;
|
|
2433
|
+
sourceUrl?: string;
|
|
2434
|
+
license?: string;
|
|
2435
|
+
description?: string;
|
|
2436
|
+
/** Monotonic human-readable version (e.g. "2026.04.20"). */
|
|
2437
|
+
version: string;
|
|
2438
|
+
}
|
|
2439
|
+
interface DatasetManifest {
|
|
2440
|
+
name: string;
|
|
2441
|
+
provenance: DatasetProvenance;
|
|
2442
|
+
/** sha256 hex over canonicalized scenarios. */
|
|
2443
|
+
contentHash: string;
|
|
2444
|
+
scenarioCount: number;
|
|
2445
|
+
splitCounts: Record<DatasetSplit, number>;
|
|
2446
|
+
}
|
|
2447
|
+
interface SliceOptions {
|
|
2448
|
+
split?: DatasetSplit;
|
|
2449
|
+
difficulty?: DatasetDifficulty;
|
|
2450
|
+
/** Number of scenarios (random sample, seeded). Omit to take all that match. */
|
|
2451
|
+
limit?: number;
|
|
2452
|
+
seed?: number;
|
|
2453
|
+
/** Predicate narrowing. Applied after split/difficulty filters. */
|
|
2454
|
+
filter?: (scenario: DatasetScenario) => boolean;
|
|
2455
|
+
/** If true, include scenarios marked as holdout. Default false. */
|
|
2456
|
+
includeHoldout?: boolean;
|
|
2457
|
+
}
|
|
2458
|
+
/** Locked holdouts — throws on mutate. Callers that need a mutable dataset fork it. */
|
|
2459
|
+
declare class HoldoutLockedError extends Error {
|
|
2460
|
+
constructor(datasetName: string);
|
|
2461
|
+
}
|
|
2462
|
+
declare class Dataset {
|
|
2463
|
+
readonly name: string;
|
|
2464
|
+
readonly provenance: DatasetProvenance;
|
|
2465
|
+
private scenarios;
|
|
2466
|
+
private locked;
|
|
2467
|
+
constructor(init: {
|
|
2468
|
+
name: string;
|
|
2469
|
+
provenance: DatasetProvenance;
|
|
2470
|
+
scenarios: DatasetScenario[];
|
|
2471
|
+
locked?: boolean;
|
|
2472
|
+
});
|
|
2473
|
+
/** All scenarios. Readonly — callers must go through `slice` or `clone`. */
|
|
2474
|
+
all(): readonly DatasetScenario[];
|
|
2475
|
+
get size(): number;
|
|
2476
|
+
/**
|
|
2477
|
+
* Deterministic sliced subset. Seed is REQUIRED when `limit` is set so
|
|
2478
|
+
* the same arguments always produce the same slice across machines.
|
|
2479
|
+
*/
|
|
2480
|
+
slice(options?: SliceOptions): DatasetScenario[];
|
|
2481
|
+
/**
|
|
2482
|
+
* Assemble the manifest (name + provenance + content hash + counts).
|
|
2483
|
+
* Content hash is deterministic over canonicalized scenarios.
|
|
2484
|
+
*/
|
|
2485
|
+
manifest(): Promise<DatasetManifest>;
|
|
2486
|
+
/** Fresh unlocked copy — for post-release forks when mutation is needed. */
|
|
2487
|
+
clone(overrides?: Partial<{
|
|
2488
|
+
name: string;
|
|
2489
|
+
version: string;
|
|
2490
|
+
}>): Dataset;
|
|
2491
|
+
lock(): void;
|
|
2492
|
+
add(scenario: DatasetScenario): void;
|
|
2493
|
+
remove(scenarioId: string): void;
|
|
2494
|
+
/**
|
|
2495
|
+
* Stable JSON-Lines serialization — deterministic byte-for-byte.
|
|
2496
|
+
* Write to disk for contamination-verifiable archives.
|
|
2497
|
+
*/
|
|
2498
|
+
toJsonl(): string;
|
|
2499
|
+
static fromJsonl(jsonl: string, manifest: Omit<DatasetManifest, 'contentHash' | 'scenarioCount' | 'splitCounts'>): Dataset;
|
|
2500
|
+
}
|
|
2501
|
+
declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
|
|
2502
|
+
|
|
2503
|
+
/**
|
|
2504
|
+
* ContaminationGuard — ensures held-out scenarios don't leak into
|
|
2505
|
+
* training/prompt paths, and flags model memorization.
|
|
2506
|
+
*
|
|
2507
|
+
* Three probes:
|
|
2508
|
+
* 1. `checkCanaries(output, scenarios)` — if a scenario carries a
|
|
2509
|
+
* canary token, it MUST NOT appear in the agent's output.
|
|
2510
|
+
* Canaries are strings that are statistically impossible to
|
|
2511
|
+
* reconstruct from the scenario description alone — so if they
|
|
2512
|
+
* echo back, the model memorized them.
|
|
2513
|
+
* 2. `canaryLeakView(store)` — cross-corpus view of every run whose
|
|
2514
|
+
* output contained a canary, with the offending scenario + run.
|
|
2515
|
+
* 3. `HoldoutAuditor` — wraps a Dataset and emits a structured error
|
|
2516
|
+
* on any code path that reads holdout scenarios but doesn't flag
|
|
2517
|
+
* `purpose: 'evaluation'`. Keeps engineers honest.
|
|
2518
|
+
*/
|
|
2519
|
+
|
|
2520
|
+
interface CanaryLeak {
|
|
2521
|
+
scenarioId: string;
|
|
2522
|
+
canary: string;
|
|
2523
|
+
runId?: string;
|
|
2524
|
+
evidence: string;
|
|
2525
|
+
}
|
|
2526
|
+
declare function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[];
|
|
2527
|
+
/**
|
|
2528
|
+
* Scan the LLM-output history in a corpus; returns every case where a
|
|
2529
|
+
* canary from a known scenario appeared in agent output. Pass the full
|
|
2530
|
+
* set of scenarios whose canaries you care about (typically the whole
|
|
2531
|
+
* held-out slice).
|
|
2532
|
+
*/
|
|
2533
|
+
declare function canaryLeakView(store: TraceStore, scenarios: DatasetScenario[]): Promise<CanaryLeak[]>;
|
|
2534
|
+
/**
 * Guards access to a set of holdout scenarios: callers must declare a
 * purpose when reading one, and accesses are exposed through `getAccessLog`.
 */
declare class HoldoutAuditor {
|
|
2535
|
+
private scenarios;
|
|
2536
|
+
private accessLog;
|
|
2537
|
+
constructor(scenarios: DatasetScenario[]);
|
|
2538
|
+
/**
 * Retrieve a holdout scenario for a declared purpose. Non-'evaluation' throws.
 * NOTE(review): the signature also accepts `'debugging'`, so such a call
 * type-checks but, per the sentence above, fails at runtime — confirm this is intended.
 */
get(scenarioId: string, purpose: 'evaluation' | 'debugging'): DatasetScenario;
|
|
2539
|
+
/** Read-only list of access records: scenario id, declared purpose, and `at` (presumably an epoch timestamp — confirm units). */
getAccessLog(): ReadonlyArray<{
|
|
2540
|
+
scenarioId: string;
|
|
2541
|
+
purpose: string;
|
|
2542
|
+
at: number;
|
|
2543
|
+
}>;
|
|
2544
|
+
}
|
|
2546
|
+
|
|
2547
|
+
/**
|
|
2548
|
+
* Red-team battery — adversarial scenario corpus with per-category
|
|
2549
|
+
* scorers.
|
|
2550
|
+
*
|
|
2551
|
+
* Every scenario is a `DatasetScenario` whose `payload` is a `RedTeamPayload`
|
|
2552
|
+
* that declares: category, expected refusal, attack surface. Scorers
|
|
2553
|
+
* inspect the agent's output + trajectory and return a PASS/FAIL per
|
|
2554
|
+
* category. Aggregated via `redTeamReport`.
|
|
2555
|
+
*
|
|
2556
|
+
* The built-in corpus is intentionally conservative — 40 cases
|
|
2557
|
+
* covering the most-seen attack patterns. Extend via `extendCorpus`.
|
|
2558
|
+
*/
|
|
2559
|
+
|
|
2560
|
+
type RedTeamCategory = 'prompt_injection_direct' | 'prompt_injection_indirect' | 'jailbreak_persona' | 'jailbreak_dan' | 'pii_leak' | 'permission_escalation' | 'data_exfil' | 'policy_override';
|
|
2561
|
+
interface RedTeamPayload {
|
|
2562
|
+
category: RedTeamCategory;
|
|
2563
|
+
/** The hostile input the agent sees. */
|
|
2564
|
+
input: string;
|
|
2565
|
+
/**
|
|
2566
|
+
* What a safe agent should do: "refuse" means explicit refusal,
|
|
2567
|
+
* "ignore" means process benignly without executing the attack.
|
|
2568
|
+
*/
|
|
2569
|
+
expected: 'refuse' | 'ignore';
|
|
2570
|
+
/** Substring(s) that MUST NOT appear in output. */
|
|
2571
|
+
forbiddenStrings?: string[];
|
|
2572
|
+
/** Tool names the agent MUST NOT call for this scenario. */
|
|
2573
|
+
forbiddenTools?: string[];
|
|
2574
|
+
}
|
|
2575
|
+
interface RedTeamCase extends DatasetScenario {
|
|
2576
|
+
payload: RedTeamPayload;
|
|
2577
|
+
}
|
|
2578
|
+
interface RedTeamFinding {
|
|
2579
|
+
scenarioId: string;
|
|
2580
|
+
category: RedTeamCategory;
|
|
2581
|
+
passed: boolean;
|
|
2582
|
+
reason: string;
|
|
2583
|
+
evidence?: string;
|
|
2584
|
+
}
|
|
2585
|
+
interface RedTeamReport {
|
|
2586
|
+
findings: RedTeamFinding[];
|
|
2587
|
+
passRateByCategory: Record<RedTeamCategory, number>;
|
|
2588
|
+
overallPassRate: number;
|
|
2589
|
+
}
|
|
2590
|
+
/** Ship a canonical, small corpus. Consumers extend via `extendCorpus`. */
|
|
2591
|
+
declare const DEFAULT_RED_TEAM_CORPUS: RedTeamCase[];
|
|
2592
|
+
declare function redTeamDataset(extraCases?: RedTeamCase[]): Dataset;
|
|
2593
|
+
/**
|
|
2594
|
+
* Score a single agent response against the red-team case.
|
|
2595
|
+
* `toolCalls` should be the names of tools the agent invoked during the run.
|
|
2596
|
+
*/
|
|
2597
|
+
declare function scoreRedTeamOutput(output: string, toolCalls: string[], rtCase: RedTeamCase): RedTeamFinding;
|
|
2598
|
+
/** Aggregate red-team findings into per-category pass rates. */
|
|
2599
|
+
declare function redTeamReport(findings: RedTeamFinding[]): RedTeamReport;
|
|
2600
|
+
/**
|
|
2601
|
+
* Extract the tool-call names from a corpus run — convenience for the
|
|
2602
|
+
* common pipeline (run the scenario → score the run).
|
|
2603
|
+
*/
|
|
2604
|
+
declare function toolNamesForRun(store: TraceStore, runId: string): Promise<string[]>;
|
|
2605
|
+
|
|
2606
|
+
/**
|
|
2607
|
+
* Power analysis + multiple-comparison correction.
|
|
2608
|
+
*
|
|
2609
|
+
* Two jobs:
|
|
2610
|
+
* 1. Before running: `requiredSampleSize({ effect, alpha, power })`
|
|
2611
|
+
* returns the N per arm needed to detect a given effect size.
|
|
2612
|
+
* 2. After running: `benjaminiHochberg(pValues, fdr)` and
|
|
2613
|
+
* `bonferroni(pValues, alpha)` correct for multiple pairwise tests
|
|
2614
|
+
* so PromptOptimizer's "significant" flag is statistically honest.
|
|
2615
|
+
*
|
|
2616
|
+
* Fixes the correctness bug in 0.2's PromptOptimizer which applied
|
|
2617
|
+
* alpha directly across n*(n-1)/2 pairwise tests without correction —
|
|
2618
|
+
* dramatically inflating false-positive rate when variants ≥ 3.
|
|
2619
|
+
*/
|
|
2620
|
+
/**
|
|
2621
|
+
* Required N per arm for a two-sample comparison at target effect size,
|
|
2622
|
+
* alpha, and power. Uses the normal-approximation formula:
|
|
2623
|
+
*
|
|
2624
|
+
* n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2
|
|
2625
|
+
*
|
|
2626
|
+
* where d is Cohen's d. Returns Infinity for effect ≤ 0.
|
|
2627
|
+
*/
|
|
2628
|
+
declare function requiredSampleSize(opts: {
|
|
2629
|
+
effect: number;
|
|
2630
|
+
alpha?: number;
|
|
2631
|
+
power?: number;
|
|
2632
|
+
twoSided?: boolean;
|
|
2633
|
+
}): number;
|
|
2634
|
+
/** Bonferroni adjustment: multiply every p-value by the number of tests, clamp at 1. */
|
|
2635
|
+
declare function bonferroni(pValues: number[], alpha?: number): {
|
|
2636
|
+
adjusted: number[];
|
|
2637
|
+
significant: boolean[];
|
|
2638
|
+
};
|
|
2639
|
+
/**
|
|
2640
|
+
* Benjamini–Hochberg false discovery rate. Returns adjusted q-values and
|
|
2641
|
+
* significance at the target FDR. Properly handles ties and preserves
|
|
2642
|
+
* monotonicity of q-values.
|
|
2643
|
+
*/
|
|
2644
|
+
declare function benjaminiHochberg(pValues: number[], fdr?: number): {
|
|
2645
|
+
qValues: number[];
|
|
2646
|
+
significant: boolean[];
|
|
2647
|
+
};
|
|
2648
|
+
|
|
2649
|
+
/**
|
|
2650
|
+
* Behavior DSL — pytest-style assertions over a run's trajectory.
|
|
2651
|
+
*
|
|
2652
|
+
* Shape:
|
|
2653
|
+
* expectAgent(store, runId).toCall('search').withArgs({ q: /.+/ })
|
|
2654
|
+
* expectAgent(store, runId).toRefuse()
|
|
2655
|
+
* expectAgent(store, runId).toOutputMatch(/confirmed/i)
|
|
2656
|
+
* expectAgent(store, runId).toRespectBudget('tokens')
|
|
2657
|
+
* expectAgent(store, runId).toCompleteWithin({ wallMs: 30_000 })
|
|
2658
|
+
*
|
|
2659
|
+
* Each matcher returns an `Expectation` with `.check() → MatcherResult`
|
|
2660
|
+
* so the DSL is composable with suite runners — you can collect all
|
|
2661
|
+
* expectations into a report instead of throwing on first failure.
|
|
2662
|
+
*/
|
|
2663
|
+
|
|
2664
|
+
interface MatcherResult {
|
|
2665
|
+
ok: boolean;
|
|
2666
|
+
detail: string;
|
|
2667
|
+
evidence?: string;
|
|
2668
|
+
}
|
|
2669
|
+
interface Expectation {
|
|
2670
|
+
/** Human-facing label; used in reports. */
|
|
2671
|
+
label: string;
|
|
2672
|
+
check(): Promise<MatcherResult>;
|
|
2673
|
+
}
|
|
2674
|
+
/**
 * Builder bound to one `(store, runId)` pair. Each `to…` method returns an
 * `Expectation` (or a `CallExpectation`) describing a single assertion;
 * evaluation happens through the returned object's `check()`.
 */
declare class BehaviorAssertion {
|
|
2675
|
+
private store;
|
|
2676
|
+
private runId;
|
|
2677
|
+
constructor(store: TraceStore, runId: string);
|
|
2678
|
+
/** Start an expectation about calls to `toolName`; the returned `CallExpectation` can be refined further. */
toCall(toolName: string): CallExpectation;
|
|
2679
|
+
/** Expectation that the run refused. `markers` is optional — presumably patterns that identify a refusal; confirm the default set. */
toRefuse(markers?: RegExp[]): Expectation;
|
|
2680
|
+
/** Expectation built from a regular expression — presumably matched against the run's output; confirm. */
toOutputMatch(pattern: RegExp): Expectation;
|
|
2681
|
+
/**
 * Expectation about budget for one dimension (e.g. `'tokens'`).
 * NOTE(review): `keyof BudgetLedgerEntry['dimension']` takes the property keys
 * of the `dimension` field's type. If that field is a string-literal union,
 * this resolves to the member names of `string` rather than the dimension
 * names; the explicit literals that follow look like the intended values.
 * Confirm whether `BudgetLedgerEntry['dimension']` (without `keyof`) was meant.
 */
toRespectBudget(dimension: keyof BudgetLedgerEntry['dimension'] | 'tokens' | 'wallMs' | 'calls' | 'usd'): Expectation;
|
|
2682
|
+
/** Expectation with optional limits; every field of `limits` may be omitted. */
toCompleteWithin(limits: {
|
|
2683
|
+
wallMs?: number;
|
|
2684
|
+
toolCalls?: number;
|
|
2685
|
+
llmTurns?: number;
|
|
2686
|
+
}): Expectation;
|
|
2687
|
+
/** Expectation that `toolName` is never called during the run. */
toNeverCall(toolName: string): Expectation;
|
|
2688
|
+
}
|
|
2689
|
+
/**
 * Expectation about calls to one tool within one run. The refinement
 * methods (`withArgs`, `times`, `atLeast`, `atMost`) return `this`, so they
 * chain; `check()` produces the `MatcherResult`.
 */
declare class CallExpectation implements Expectation {
|
|
2690
|
+
private store;
|
|
2691
|
+
private runId;
|
|
2692
|
+
private toolName;
|
|
2693
|
+
private argMatchers;
|
|
2694
|
+
private minCount;
|
|
2695
|
+
private maxCount;
|
|
2696
|
+
constructor(store: TraceStore, runId: string, toolName: string);
|
|
2697
|
+
/** Label required by `Expectation`; exposed here as a getter. */
get label(): string;
|
|
2698
|
+
/**
 * Add argument matchers keyed by argument name.
 * Note: `unknown | RegExp` is the same as `unknown` to the type checker;
 * the `RegExp` mention only signals that regex values are accepted.
 */
withArgs(shape: Record<string, unknown | RegExp>): this;
|
|
2699
|
+
/** Presumably requires exactly `n` matching calls — confirm against the implementation. */
times(n: number): this;
|
|
2700
|
+
/** Presumably sets the lower bound on matching calls (see private `minCount`) — confirm. */
atLeast(n: number): this;
|
|
2701
|
+
/** Presumably sets the upper bound on matching calls (see private `maxCount`) — confirm. */
atMost(n: number): this;
|
|
2702
|
+
/** Evaluate the expectation and resolve with its `MatcherResult`. */
check(): Promise<MatcherResult>;
|
|
2703
|
+
}
|
|
2704
|
+
declare function expectAgent(store: TraceStore, runId: string): BehaviorAssertion;
|
|
2705
|
+
/** Runs every expectation, collects results. Never throws. */
|
|
2706
|
+
declare function runExpectations(expectations: Expectation[]): Promise<{
|
|
2707
|
+
results: Array<{
|
|
2708
|
+
label: string;
|
|
2709
|
+
result: MatcherResult;
|
|
2710
|
+
}>;
|
|
2711
|
+
pass: boolean;
|
|
2712
|
+
passCount: number;
|
|
2713
|
+
failCount: number;
|
|
2714
|
+
}>;
|
|
2715
|
+
|
|
2716
|
+
/**
|
|
2717
|
+
* Judge calibration — measure judge quality against human gold + bias.
|
|
2718
|
+
*
|
|
2719
|
+
* Workflow:
|
|
2720
|
+
* 1. Build a golden set: {itemId, humanScore}[].
|
|
2721
|
+
* 2. Run candidate judges; each produces {itemId, score}.
|
|
2722
|
+
* 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.
|
|
2723
|
+
* 4. Run bias probes (positional, verbosity, self-preference) to
|
|
2724
|
+
* detect systematic score inflation.
|
|
2725
|
+
*
|
|
2726
|
+
* Returns actionable diagnostics, not a single number. Consumers then
|
|
2727
|
+
* decide whether to trust the judge, retrain it, or add a tie-breaker.
|
|
2728
|
+
*/
|
|
2729
|
+
interface GoldenItem {
|
|
2730
|
+
itemId: string;
|
|
2731
|
+
humanScore: number;
|
|
2732
|
+
/** Optional group used for per-group bias audits (e.g. model-of-output family). */
|
|
2733
|
+
group?: string;
|
|
2734
|
+
}
|
|
2735
|
+
interface CandidateScore {
|
|
2736
|
+
itemId: string;
|
|
2737
|
+
score: number;
|
|
2738
|
+
/** Optional — enables positional-bias analysis (did order matter?). */
|
|
2739
|
+
positionOfAInput?: 'first' | 'second';
|
|
2740
|
+
}
|
|
2741
|
+
interface CalibrationResult {
|
|
2742
|
+
n: number;
|
|
2743
|
+
pearson: number;
|
|
2744
|
+
/** Cohen's κ with quadratic weights over integer-rounded scores. */
|
|
2745
|
+
kappa: number;
|
|
2746
|
+
/** Mean absolute error vs human. */
|
|
2747
|
+
mae: number;
|
|
2748
|
+
/** Worst-5 miscalibrations (largest |judge - human|). */
|
|
2749
|
+
worstItems: Array<{
|
|
2750
|
+
itemId: string;
|
|
2751
|
+
judge: number;
|
|
2752
|
+
human: number;
|
|
2753
|
+
delta: number;
|
|
2754
|
+
}>;
|
|
2755
|
+
}
|
|
2756
|
+
declare function calibrateJudge(golden: GoldenItem[], candidate: CandidateScore[]): CalibrationResult;
|
|
2757
|
+
interface PositionalBiasResult {
|
|
2758
|
+
/**
|
|
2759
|
+
* Score delta (first-position - second-position) averaged across items
|
|
2760
|
+
* presented in both positions. Non-zero = positional bias.
|
|
2761
|
+
*/
|
|
2762
|
+
avgDelta: number;
|
|
2763
|
+
n: number;
|
|
2764
|
+
}
|
|
2765
|
+
/**
|
|
2766
|
+
* Feed the same items to the judge twice with A/B swapped and pass all
|
|
2767
|
+
* results here. Items that don't appear in both positions are ignored.
|
|
2768
|
+
*/
|
|
2769
|
+
declare function positionalBias(scores: CandidateScore[]): PositionalBiasResult;
|
|
2770
|
+
interface VerbosityBiasResult {
|
|
2771
|
+
/** Pearson correlation between output length and score. Strong positive = verbosity bias. */
|
|
2772
|
+
pearson: number;
|
|
2773
|
+
n: number;
|
|
2774
|
+
}
|
|
2775
|
+
declare function verbosityBias(samples: Array<{
|
|
2776
|
+
outputLen: number;
|
|
2777
|
+
score: number;
|
|
2778
|
+
}>): VerbosityBiasResult;
|
|
2779
|
+
interface SelfPreferenceResult {
|
|
2780
|
+
/** Mean judge score when judge's family matches output's family. */
|
|
2781
|
+
inFamilyMean: number;
|
|
2782
|
+
outOfFamilyMean: number;
|
|
2783
|
+
deltaMean: number;
|
|
2784
|
+
n: number;
|
|
2785
|
+
}
|
|
2786
|
+
/**
|
|
2787
|
+
* Pass the same scenarios scored with judge-model X grading outputs from
|
|
2788
|
+
* model X (in-family) and model Y (out-of-family). Non-zero delta
|
|
2789
|
+
* indicates self-preference.
|
|
2790
|
+
*/
|
|
2791
|
+
declare function selfPreference(samples: Array<{
|
|
2792
|
+
score: number;
|
|
2793
|
+
inFamily: boolean;
|
|
2794
|
+
}>): SelfPreferenceResult;
|
|
2795
|
+
|
|
2796
|
+
/**
|
|
2797
|
+
* CI gate — evaluate a corpus against threshold contracts and generate
|
|
2798
|
+
* a human-readable PR/build comment.
|
|
2799
|
+
*
|
|
2800
|
+
* Three layers:
|
|
2801
|
+
* 1. `ThresholdContract` declarations (YAML-equivalent TS objects)
|
|
2802
|
+
* 2. `evaluateContract` runs the contracts against a TraceStore and
|
|
2803
|
+
* returns a structured report + overall pass/fail.
|
|
2804
|
+
* 3. `renderMarkdownReport` formats the report for GitHub PR comments.
|
|
2805
|
+
*
|
|
2806
|
+
* Consumers wrap this in their own `gh pr comment` / CI integration —
|
|
2807
|
+
* we don't ship the GitHub Action binary, just the library call that
|
|
2808
|
+
* the action invokes.
|
|
2809
|
+
*/
|
|
2810
|
+
|
|
2811
|
+
interface ContractMetric {
|
|
2812
|
+
/** Metric id matching either a predefined key or a custom extractor. */
|
|
2813
|
+
metric: string;
|
|
2814
|
+
higherIsBetter: boolean;
|
|
2815
|
+
/** Max tolerated regression (e.g. 0.02 = 2pp worse than baseline). */
|
|
2816
|
+
maxRegression?: number;
|
|
2817
|
+
/** Optional extractor if the metric isn't in the default set. */
|
|
2818
|
+
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
2819
|
+
}
|
|
2820
|
+
interface ThresholdContract {
|
|
2821
|
+
name: string;
|
|
2822
|
+
baseline: RunFilter;
|
|
2823
|
+
candidate: RunFilter;
|
|
2824
|
+
metrics: ContractMetric[];
|
|
2825
|
+
slos?: Slo[];
|
|
2826
|
+
}
|
|
2827
|
+
interface ContractReport {
|
|
2828
|
+
name: string;
|
|
2829
|
+
baselineReport: BaselineReport;
|
|
2830
|
+
sloReport?: SloReport;
|
|
2831
|
+
breaches: string[];
|
|
2832
|
+
pass: boolean;
|
|
2833
|
+
}
|
|
2834
|
+
declare function evaluateContract(store: TraceStore, contract: ThresholdContract): Promise<ContractReport>;
|
|
2835
|
+
declare function renderMarkdownReport(reports: ContractReport[]): string;
|
|
2836
|
+
|
|
2837
|
+
/**
|
|
2838
|
+
* Observability adapters — bidirectional parity with production backends.
|
|
2839
|
+
*
|
|
2840
|
+
* `toLangfuseEnvelope` maps a Run's spans into Langfuse generation/score
|
|
2841
|
+
* records (schema-compatible; we don't depend on the SDK — consumers
|
|
2842
|
+
* POST the returned JSON to their Langfuse collector).
|
|
2843
|
+
*
|
|
2844
|
+
* `toPrometheusText` converts a TraceStore into a Prometheus text-
|
|
2845
|
+
* exposition-format string (counters + gauges for runs, tool calls,
|
|
2846
|
+
* errors, cost). Drop into a `/metrics` handler; no SDK needed.
|
|
2847
|
+
*
|
|
2848
|
+
* `replayTraceThroughJudge` is the canonical "re-score with a new
|
|
2849
|
+
* judge" path — takes an existing run, runs a judge function over
|
|
2850
|
+
* each LLM span, emits JudgeVerdict spans back into the store.
|
|
2851
|
+
*/
|
|
2852
|
+
|
|
2853
|
+
interface LangfuseGeneration {
|
|
2854
|
+
id: string;
|
|
2855
|
+
traceId: string;
|
|
2856
|
+
name: string;
|
|
2857
|
+
model: string;
|
|
2858
|
+
input: unknown;
|
|
2859
|
+
output: unknown;
|
|
2860
|
+
startTime: string;
|
|
2861
|
+
endTime: string;
|
|
2862
|
+
usage: {
|
|
2863
|
+
input: number;
|
|
2864
|
+
output: number;
|
|
2865
|
+
total: number;
|
|
2866
|
+
totalCost: number;
|
|
2867
|
+
};
|
|
2868
|
+
metadata: Record<string, unknown>;
|
|
2869
|
+
}
|
|
2870
|
+
interface LangfuseScore {
|
|
2871
|
+
id: string;
|
|
2872
|
+
traceId: string;
|
|
2873
|
+
observationId: string;
|
|
2874
|
+
name: string;
|
|
2875
|
+
value: number;
|
|
2876
|
+
comment?: string;
|
|
2877
|
+
}
|
|
2878
|
+
interface LangfuseEnvelope {
|
|
2879
|
+
traceId: string;
|
|
2880
|
+
generations: LangfuseGeneration[];
|
|
2881
|
+
scores: LangfuseScore[];
|
|
2882
|
+
}
|
|
2883
|
+
declare function toLangfuseEnvelope(store: TraceStore, runId: string): Promise<LangfuseEnvelope>;
|
|
2884
|
+
declare function toPrometheusText(store: TraceStore): Promise<string>;
|
|
2885
|
+
interface JudgeReplayResult {
|
|
2886
|
+
spanId: string;
|
|
2887
|
+
targetSpanId: string;
|
|
2888
|
+
dimension: string;
|
|
2889
|
+
score: number;
|
|
2890
|
+
rationale?: string;
|
|
2891
|
+
}
|
|
2892
|
+
/**
|
|
2893
|
+
* Apply a judge function to every LLM span in a run and record the
|
|
2894
|
+
* results as JudgeVerdict spans. This is the canonical "no re-execution"
|
|
2895
|
+
* re-scoring path — you supply a pure judge `(llmSpan) → verdict`.
|
|
2896
|
+
*/
|
|
2897
|
+
declare function replayTraceThroughJudge(store: TraceStore, runId: string, judge: {
|
|
2898
|
+
id: string;
|
|
2899
|
+
dimension: string;
|
|
2900
|
+
score: (span: LlmSpan) => Promise<{
|
|
2901
|
+
score: number;
|
|
2902
|
+
rationale?: string;
|
|
2903
|
+
evidence?: string;
|
|
2904
|
+
}>;
|
|
2905
|
+
}): Promise<JudgeReplayResult[]>;
|
|
2906
|
+
|
|
2907
|
+
/**
|
|
2908
|
+
* Paraphrase robustness — mutates a scenario prompt in structure-
|
|
2909
|
+
* preserving ways, re-scores, and reports score variance.
|
|
2910
|
+
*
|
|
2911
|
+
* Mutators are pure functions `(prompt: string, seed: number) => string`. Ship a
|
|
2912
|
+
* default set; consumers add domain-specific ones.
|
|
2913
|
+
*
|
|
2914
|
+
* Robustness score: 1 - stdDev(scores) / (mean if positive else 1).
|
|
2915
|
+
* A perfect agent returns the same answer regardless of typo / case /
|
|
2916
|
+
* reordering — any variance signals a brittle prompt.
|
|
2917
|
+
*/
|
|
2918
|
+
type Mutator = (prompt: string, seed: number) => string;
|
|
2919
|
+
interface RobustnessResult {
|
|
2920
|
+
originalScore: number;
|
|
2921
|
+
variantScores: Array<{
|
|
2922
|
+
mutator: string;
|
|
2923
|
+
score: number;
|
|
2924
|
+
mutated: string;
|
|
2925
|
+
}>;
|
|
2926
|
+
meanScore: number;
|
|
2927
|
+
stdDev: number;
|
|
2928
|
+
robustness: number;
|
|
2929
|
+
}
|
|
2930
|
+
declare function paraphraseRobustness(prompt: string, mutators: Array<{
|
|
2931
|
+
id: string;
|
|
2932
|
+
fn: Mutator;
|
|
2933
|
+
}>, scoreFn: (prompt: string) => Promise<number>, options?: {
|
|
2934
|
+
seed?: number;
|
|
2935
|
+
}): Promise<RobustnessResult>;
|
|
2936
|
+
/** Lowercase the whole prompt. Robust models ignore case. */
|
|
2937
|
+
declare const lowercaseMutator: Mutator;
|
|
2938
|
+
/** Reorder sentences. Robust models don't depend on sentence order. */
|
|
2939
|
+
declare const sentenceReorderMutator: Mutator;
|
|
2940
|
+
/** Swap adjacent letter pairs (1 per 40 chars, min 1). Robust models tolerate typos. */
|
|
2941
|
+
declare const typoMutator: Mutator;
|
|
2942
|
+
/** Add a benign politeness prefix. Robust models ignore flattery. */
|
|
2943
|
+
declare const politenessPrefixMutator: Mutator;
|
|
2944
|
+
/** Compact whitespace, strip newlines. Robust models don't depend on formatting. */
|
|
2945
|
+
declare const whitespaceCollapseMutator: Mutator;
|
|
2946
|
+
declare const DEFAULT_MUTATORS: Array<{
|
|
2947
|
+
id: string;
|
|
2948
|
+
fn: Mutator;
|
|
2949
|
+
}>;
|
|
2950
|
+
|
|
2951
|
+
/**
|
|
2952
|
+
* Visual diff — pixel-delta scoring for UI / visual outputs.
|
|
2953
|
+
*
|
|
2954
|
+
* Minimal dependency-free implementation: accepts two decoded RGBA images as byte
|
|
2955
|
+
* arrays + width/height and returns a Δ ratio, pixel counts, and the max channel delta.
|
|
2956
|
+
* Consumers supply the decoded pixel arrays (we don't pull a PNG
|
|
2957
|
+
* decoder into the core — use `sharp`, `@napi-rs/canvas`, or Playwright
|
|
2958
|
+
* in the driving test and pass the result here).
|
|
2959
|
+
*/
|
|
2960
|
+
interface ImageData {
|
|
2961
|
+
width: number;
|
|
2962
|
+
height: number;
|
|
2963
|
+
/** Pixel data in RGBA order, 4 bytes per pixel. */
|
|
2964
|
+
data: Uint8Array | Uint8ClampedArray;
|
|
2965
|
+
}
|
|
2966
|
+
interface VisualDiffResult {
|
|
2967
|
+
/** Ratio of pixels differing beyond `tolerance` (0..1). */
|
|
2968
|
+
diffRatio: number;
|
|
2969
|
+
differingPixels: number;
|
|
2970
|
+
totalPixels: number;
|
|
2971
|
+
maxChannelDelta: number;
|
|
2972
|
+
/** Status for dashboards: unchanged (< 0.1%), changed, or severely-changed (> 5%). */
|
|
2973
|
+
status: 'unchanged' | 'changed' | 'severely-changed';
|
|
2974
|
+
}
|
|
2975
|
+
interface VisualDiffOptions {
|
|
2976
|
+
/** Pixels whose max-channel delta is ≤ this are considered unchanged. Default 8/255. */
|
|
2977
|
+
tolerance?: number;
|
|
2978
|
+
}
|
|
2979
|
+
declare function visualDiff(a: ImageData, b: ImageData, options?: VisualDiffOptions): VisualDiffResult;
|
|
2980
|
+
/** Convenience: diffs two RGBA arrays of identical dimensions, returns just the ratio. */
|
|
2981
|
+
declare function pixelDeltaRatio(a: Uint8Array, b: Uint8Array, width: number, height: number, tolerance?: number): number;
|
|
2982
|
+
|
|
2983
|
+
/**
|
|
2984
|
+
* BuilderSession — ties a builder-of-builders workflow together.
|
|
2985
|
+
*
|
|
2986
|
+
* Models agent-builder's shape: Project → Chat → Edit → Ship → App →
|
|
2987
|
+
* AppAgent. Each layer is a Run (linked via parentRunId). The
|
|
2988
|
+
* framework-enforced invariants:
|
|
2989
|
+
*
|
|
2990
|
+
* - One Project → many Chats; chatId scopes runs within a project.
|
|
2991
|
+
* - One Chat = one builder Run with `layer='builder'`.
|
|
2992
|
+
* - One Ship = one child Run with `layer='app-build'` + SandboxHarness.
|
|
2993
|
+
* - One AppScenario = one grandchild Run with `layer='app-runtime'`.
|
|
2994
|
+
*
|
|
2995
|
+
* Consumers obtain a BuilderSession, call `startChat`, drive the
|
|
2996
|
+
* builder agent (emitting spans), and call `ship` / `runAppScenario`
|
|
2997
|
+
* as the workflow progresses. The session reconstructs itself from
|
|
2998
|
+
* trace data via `resumeBuilderSession(store, projectId)`.
|
|
2999
|
+
*/
|
|
3000
|
+
|
|
3001
|
+
interface BuilderSessionInit {
|
|
3002
|
+
projectId: string;
|
|
3003
|
+
chatId?: string;
|
|
3004
|
+
/** Free-form: user's task description, project name, etc. Stored on the builder Run. */
|
|
3005
|
+
tags?: Record<string, string>;
|
|
3006
|
+
}
|
|
3007
|
+
interface ShipOptions {
|
|
3008
|
+
harness: HarnessConfig;
|
|
3009
|
+
driver?: SandboxDriver;
|
|
3010
|
+
/** scenarioId of this app-build run. Defaults to `${projectId}/build`. */
|
|
3011
|
+
scenarioId?: string;
|
|
3012
|
+
}
|
|
3013
|
+
interface RunAppScenarioOptions {
|
|
3014
|
+
scenario: TestGradedScenario;
|
|
3015
|
+
/** Harness driver override; defaults to the one the session was created with. */
|
|
3016
|
+
driver?: SandboxDriver;
|
|
3017
|
+
}
|
|
3018
|
+
declare class BuilderSession {
|
|
3019
|
+
private store;
|
|
3020
|
+
private builderEmitter;
|
|
3021
|
+
readonly projectId: string;
|
|
3022
|
+
readonly chatId: string;
|
|
3023
|
+
private builderRunId?;
|
|
3024
|
+
private lastBuildRunId?;
|
|
3025
|
+
private defaultDriver?;
|
|
3026
|
+
constructor(store: TraceStore, init: BuilderSessionInit, driver?: SandboxDriver);
|
|
3027
|
+
/** Start the builder (L0) run for this chat. Returns the runId. */
|
|
3028
|
+
startChat(scenarioId?: string): Promise<string>;
|
|
3029
|
+
/** The emitter for builder-level spans (edits, LLM calls, tool invocations). */
|
|
3030
|
+
get emitter(): TraceEmitter;
|
|
3031
|
+
/**
|
|
3032
|
+
* Ship the project's generated app: run the sandbox harness as a child
|
|
3033
|
+
* Run (`layer='app-build'`). Returns the build result + runId.
|
|
3034
|
+
*/
|
|
3035
|
+
ship(options: ShipOptions): Promise<{
|
|
3036
|
+
runId: string;
|
|
3037
|
+
result: SandboxHarnessResult;
|
|
3038
|
+
}>;
|
|
3039
|
+
/**
|
|
3040
|
+
* Run a domain scenario against the just-built app as a grandchild Run
|
|
3041
|
+
* (`layer='app-runtime'`). The `ship` call must precede this so the
|
|
3042
|
+
* parent is set correctly; if no build exists yet the session attaches
|
|
3043
|
+
* directly to the builder run (useful for prototypes).
|
|
3044
|
+
*/
|
|
3045
|
+
runAppScenario(options: RunAppScenarioOptions): Promise<TestGradedRunResult>;
|
|
3046
|
+
/** Record an end-of-chat meta score (judge verdict on whether the builder
|
|
3047
|
+
* served the user's intent). Accepts a numeric score + optional rationale. */
|
|
3048
|
+
recordMetaScore(score: number, rationale?: string): Promise<void>;
|
|
3049
|
+
/** Close the builder Run with a final outcome. */
|
|
3050
|
+
endChat(outcome: {
|
|
3051
|
+
pass: boolean;
|
|
3052
|
+
score?: number;
|
|
3053
|
+
notes?: string;
|
|
3054
|
+
}): Promise<void>;
|
|
3055
|
+
/**
|
|
3056
|
+
* Inline app-runtime run — for cases where the "scenario" isn't a
|
|
3057
|
+
* SWE-bench-style test suite but a live agent interaction (LLM chat,
|
|
3058
|
+
* domain flow). Returns an emitter bound to a fresh Run in the
|
|
3059
|
+
* `app-runtime` layer; caller emits spans inside and calls
|
|
3060
|
+
* `.endRun()` with the final verdict.
|
|
3061
|
+
*/
|
|
3062
|
+
startAppRuntime(scenarioId: string): Promise<TraceEmitter>;
|
|
3063
|
+
/**
|
|
3064
|
+
* Lightweight "ship marker" — record an app-build Run with a caller-
|
|
3065
|
+
* provided verdict. Use when there isn't a sandbox harness to run but
|
|
3066
|
+
* you still want to mark the build state at publish time.
|
|
3067
|
+
*/
|
|
3068
|
+
recordShipMarker(args: {
|
|
3069
|
+
pass: boolean;
|
|
3070
|
+
score: number;
|
|
3071
|
+
scenarioId?: string;
|
|
3072
|
+
notes?: string;
|
|
3073
|
+
}): Promise<string>;
|
|
3074
|
+
get lastBuildRunIdValue(): string | undefined;
|
|
3075
|
+
get builderRunIdValue(): string | undefined;
|
|
3076
|
+
}
|
|
3077
|
+
/**
|
|
3078
|
+
* Reconstruct the most recent BuilderSession state for a given project —
|
|
3079
|
+
* returns { projectId, chatRuns, lastBuilderRun, lastBuildRun, lastAppRuntimeRuns }. For chat-first UIs
|
|
3080
|
+
* this is how a resumed session finds its place in the edit history.
|
|
3081
|
+
*/
|
|
3082
|
+
declare function resumeBuilderSession(store: TraceStore, projectId: string): Promise<{
|
|
3083
|
+
projectId: string;
|
|
3084
|
+
chatRuns: Run[];
|
|
3085
|
+
lastBuilderRun?: Run;
|
|
3086
|
+
lastBuildRun?: Run;
|
|
3087
|
+
lastAppRuntimeRuns: Run[];
|
|
3088
|
+
}>;
|
|
3089
|
+
|
|
3090
|
+
/**
|
|
3091
|
+
* Three-layer evaluation — the canonical scoring breakdown for
|
|
3092
|
+
* builder-of-builders workflows.
|
|
3093
|
+
*
|
|
3094
|
+
* meta_score: did the builder understand + satisfy user intent?
|
|
3095
|
+
* (judge verdict attached to the builder run)
|
|
3096
|
+
* build_score: did the generated scaffold build + pass its own tests?
|
|
3097
|
+
* (outcome.score on the app-build child run)
|
|
3098
|
+
* runtime_score: did the generated agent pass its domain scenarios?
|
|
3099
|
+
* (mean outcome.score over app-runtime grandchild runs)
|
|
3100
|
+
*
|
|
3101
|
+
* Returns a structured report per project. The cross-layer correlation
|
|
3102
|
+
* is the highest-leverage signal the framework computes — if
|
|
3103
|
+
* meta_score doesn't predict runtime_score, the builder's self-scoring
|
|
3104
|
+
* is broken.
|
|
3105
|
+
*/
|
|
3106
|
+
|
|
3107
|
+
interface ThreeLayerProjectReport {
|
|
3108
|
+
projectId: string;
|
|
3109
|
+
builderRunId?: string;
|
|
3110
|
+
/** Judge-verdict score on the builder run (0..1 after normalization). */
|
|
3111
|
+
metaScore: number | null;
|
|
3112
|
+
buildRunId?: string;
|
|
3113
|
+
/** 0..1 from the sandbox harness (testsPassed / testsTotal). */
|
|
3114
|
+
buildScore: number | null;
|
|
3115
|
+
appRuntimeRunIds: string[];
|
|
3116
|
+
/** Mean of outcome.score over app-runtime runs, 0..1. */
|
|
3117
|
+
runtimeScore: number | null;
|
|
3118
|
+
runtimePassRate: number | null;
|
|
3119
|
+
/** True when all three layers produced a score. */
|
|
3120
|
+
complete: boolean;
|
|
3121
|
+
}
|
|
3122
|
+
declare function scoreProject(store: TraceStore, projectId: string): Promise<ThreeLayerProjectReport>;
|
|
3123
|
+
/** Aggregate scoring across every project in a corpus. */
|
|
3124
|
+
declare function scoreAllProjects(store: TraceStore): Promise<ThreeLayerProjectReport[]>;
|
|
3125
|
+
|
|
3126
|
+
/**
|
|
3127
|
+
* Meta-eval correlation — the highest-leverage signal in the framework.
|
|
3128
|
+
*
|
|
3129
|
+
* Given a corpus of three-layer project reports, compute how well each
|
|
3130
|
+
* pair of layers correlates. The question we care about most:
|
|
3131
|
+
*
|
|
3132
|
+
* Does `metaScore` (what the builder thinks it did) predict
|
|
3133
|
+
* `runtimeScore` (what the user actually gets)?
|
|
3134
|
+
*
|
|
3135
|
+
* If r < ~0.4, the builder's self-scoring is broken — it's optimizing
|
|
3136
|
+
* for something other than real-world success. If r > 0.7, meta_score
|
|
3137
|
+
* is a usable proxy and can drive CI gates cheaply.
|
|
3138
|
+
*
|
|
3139
|
+
* Non-parametric rank correlation (Spearman) is also reported because
|
|
3140
|
+
* meta scores are often ordinal-ish.
|
|
3141
|
+
*/
|
|
3142
|
+
|
|
3143
|
+
interface LayerCorrelation {
|
|
3144
|
+
n: number;
|
|
3145
|
+
pearson: number;
|
|
3146
|
+
spearman: number;
|
|
3147
|
+
}
|
|
3148
|
+
interface CorrelationReport {
|
|
3149
|
+
/** Pairs present in the corpus (layers with ≥ 2 matched data points). */
|
|
3150
|
+
metaVsBuild?: LayerCorrelation;
|
|
3151
|
+
metaVsRuntime?: LayerCorrelation;
|
|
3152
|
+
buildVsRuntime?: LayerCorrelation;
|
|
3153
|
+
/** Number of complete projects (all 3 scores present). */
|
|
3154
|
+
completeProjects: number;
|
|
3155
|
+
}
|
|
3156
|
+
declare function correlateLayers(reports: ThreeLayerProjectReport[]): CorrelationReport;
|
|
3157
|
+
|
|
3158
|
+
/**
|
|
3159
|
+
* ProjectRegistry — project-level aggregation over the trace corpus.
|
|
3160
|
+
*
|
|
3161
|
+
* Thin reader over TraceStore that answers the questions a chat-first,
|
|
3162
|
+
* resumable UI needs:
|
|
3163
|
+
* - listProjects() → project IDs with latest activity
|
|
3164
|
+
* - projectTimeline(id) → chats + builds + runtime runs, chronological
|
|
3165
|
+
* - projectChats(id) → chat-level summaries (turn count, outcome)
|
|
3166
|
+
*
|
|
3167
|
+
* All queries are pure reads; no state duplication.
|
|
3168
|
+
*/
|
|
3169
|
+
|
|
3170
|
+
interface ProjectSummary {
|
|
3171
|
+
projectId: string;
|
|
3172
|
+
chatCount: number;
|
|
3173
|
+
buildCount: number;
|
|
3174
|
+
appRuntimeCount: number;
|
|
3175
|
+
lastActivityAt: number;
|
|
3176
|
+
latestChatId?: string;
|
|
3177
|
+
latestOutcome?: {
|
|
3178
|
+
pass: boolean;
|
|
3179
|
+
score?: number;
|
|
3180
|
+
};
|
|
3181
|
+
}
|
|
3182
|
+
interface ChatSummary {
|
|
3183
|
+
chatId: string;
|
|
3184
|
+
projectId: string;
|
|
3185
|
+
builderRunId: string;
|
|
3186
|
+
startedAt: number;
|
|
3187
|
+
endedAt?: number;
|
|
3188
|
+
status: Run['status'];
|
|
3189
|
+
outcome?: Run['outcome'];
|
|
3190
|
+
/** Counts of spans emitted during the chat. */
|
|
3191
|
+
llmTurns?: number;
|
|
3192
|
+
toolCalls?: number;
|
|
3193
|
+
buildRunId?: string;
|
|
3194
|
+
appRuntimeRunIds: string[];
|
|
3195
|
+
}
|
|
3196
|
+
interface ProjectTimelineEntry {
|
|
3197
|
+
run: Run;
|
|
3198
|
+
layerBucket: 'chat' | 'build' | 'runtime' | 'other';
|
|
3199
|
+
}
|
|
3200
|
+
declare class ProjectRegistry {
|
|
3201
|
+
private store;
|
|
3202
|
+
constructor(store: TraceStore);
|
|
3203
|
+
listProjects(): Promise<ProjectSummary[]>;
|
|
3204
|
+
projectTimeline(projectId: string): Promise<ProjectTimelineEntry[]>;
|
|
3205
|
+
projectChats(projectId: string): Promise<ChatSummary[]>;
|
|
3206
|
+
}
|
|
3207
|
+
|
|
3208
|
+
/**
|
|
3209
|
+
* OutcomeStore — deployment outcomes attached to Run IDs.
|
|
3210
|
+
*
|
|
3211
|
+
* Outcomes arrive asynchronously from production telemetry after the
|
|
3212
|
+
* eval run completed: user ratings, retention flags, conversion events,
|
|
3213
|
+
* revenue, support-ticket rate, anything a product team can measure.
|
|
3214
|
+
* The store is a peer to TraceStore — separate lifecycle, same runId
|
|
3215
|
+
* foreign key.
|
|
3216
|
+
*
|
|
3217
|
+
* The whole point of this module is to make the meta-eval correlation
|
|
3218
|
+
* question computable: `correlate(evalMetric, outcomeMetric) → r, ρ, n, CI`.
|
|
3219
|
+
*/
|
|
3220
|
+
interface DeploymentOutcome {
|
|
3221
|
+
runId: string;
|
|
3222
|
+
capturedAt: number;
|
|
3223
|
+
/** Numeric outcomes keyed by name — retention_7d, csat, revenue_usd, etc. */
|
|
3224
|
+
metrics: Record<string, number>;
|
|
3225
|
+
/** Dimensions for stratified analysis — cohort, region, user_segment. */
|
|
3226
|
+
labels?: Record<string, string>;
|
|
3227
|
+
/** Free-form provenance (source system, pipeline version). */
|
|
3228
|
+
source?: string;
|
|
3229
|
+
}
|
|
3230
|
+
interface OutcomeFilter {
|
|
3231
|
+
runIds?: string[];
|
|
3232
|
+
since?: number;
|
|
3233
|
+
until?: number;
|
|
3234
|
+
label?: {
|
|
3235
|
+
key: string;
|
|
3236
|
+
value: string;
|
|
3237
|
+
};
|
|
3238
|
+
source?: string;
|
|
3239
|
+
}
|
|
3240
|
+
interface OutcomeStore {
|
|
3241
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
3242
|
+
/** All outcomes attached to this run (a single run can have many — multiple
|
|
3243
|
+
* capture windows over deployment time). */
|
|
3244
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
3245
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
3246
|
+
}
|
|
3247
|
+
declare class InMemoryOutcomeStore implements OutcomeStore {
|
|
3248
|
+
private items;
|
|
3249
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
3250
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
3251
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
3252
|
+
}
|
|
3253
|
+
interface FileSystemOutcomeStoreOptions {
|
|
3254
|
+
dir: string;
|
|
3255
|
+
maxBytes?: number;
|
|
3256
|
+
}
|
|
3257
|
+
declare class FileSystemOutcomeStore implements OutcomeStore {
|
|
3258
|
+
private dir;
|
|
3259
|
+
private maxBytes;
|
|
3260
|
+
private memo?;
|
|
3261
|
+
private loaded;
|
|
3262
|
+
constructor(options: FileSystemOutcomeStoreOptions);
|
|
3263
|
+
private ensureDir;
|
|
3264
|
+
append(outcome: DeploymentOutcome): Promise<void>;
|
|
3265
|
+
private load;
|
|
3266
|
+
forRun(runId: string): Promise<DeploymentOutcome[]>;
|
|
3267
|
+
list(filter?: OutcomeFilter): Promise<DeploymentOutcome[]>;
|
|
3268
|
+
}
|
|
3269
|
+
|
|
3270
|
+
/**
|
|
3271
|
+
* Correlation study — "does our eval score predict real-world outcomes?"
|
|
3272
|
+
*
|
|
3273
|
+
* This is the load-bearing signal. Takes a TraceStore + OutcomeStore,
|
|
3274
|
+
* joins on runId, computes Pearson + Spearman + bootstrap CI for every
|
|
3275
|
+
* (evalMetric, outcomeMetric) pair the caller declares.
|
|
3276
|
+
*
|
|
3277
|
+
* Without this number the framework is ornamental. With it and r > 0.6
|
|
3278
|
+
* the framework is a moat — no other agent-eval tool publishes one.
|
|
3279
|
+
*/
|
|
3280
|
+
|
|
3281
|
+
interface EvalMetricSpec {
|
|
3282
|
+
id: string;
|
|
3283
|
+
/** Extract a scalar from a run (defaults cover score/pass/durationMs/costUsd/tokens). */
|
|
3284
|
+
extract?: (run: Run, store: TraceStore) => Promise<number | null>;
|
|
3285
|
+
}
|
|
3286
|
+
interface OutcomePair {
|
|
3287
|
+
evalMetric: string;
|
|
3288
|
+
outcomeMetric: string;
|
|
3289
|
+
}
|
|
3290
|
+
interface CorrelationResult {
|
|
3291
|
+
evalMetric: string;
|
|
3292
|
+
outcomeMetric: string;
|
|
3293
|
+
n: number;
|
|
3294
|
+
pearson: number;
|
|
3295
|
+
spearman: number;
|
|
3296
|
+
/** 95% bootstrap CI for Pearson. */
|
|
3297
|
+
pearsonCi95: {
|
|
3298
|
+
lower: number;
|
|
3299
|
+
upper: number;
|
|
3300
|
+
};
|
|
3301
|
+
/** Rough verdict: 'strong' ≥ 0.7, 'moderate' ≥ 0.4, else 'weak'. */
|
|
3302
|
+
verdict: 'strong' | 'moderate' | 'weak';
|
|
3303
|
+
}
|
|
3304
|
+
interface CorrelationStudyResult {
|
|
3305
|
+
pairs: CorrelationResult[];
|
|
3306
|
+
joinedSamples: number;
|
|
3307
|
+
skippedRuns: number;
|
|
3308
|
+
}
|
|
3309
|
+
interface CorrelationStudyOptions {
|
|
3310
|
+
/** Only join outcomes captured within this window after run.startedAt. */
|
|
3311
|
+
maxCaptureLagMs?: number;
|
|
3312
|
+
/** Restrict to a subset of outcomes (cohort, region, source). */
|
|
3313
|
+
outcomeFilter?: OutcomeFilter;
|
|
3314
|
+
/** Which outcome per run to use when multiple exist. Default 'latest'. */
|
|
3315
|
+
reduction?: 'latest' | 'mean' | 'max';
|
|
3316
|
+
/** Bootstrap iterations for the CI. Default 500. */
|
|
3317
|
+
bootstrapIterations?: number;
|
|
3318
|
+
}
|
|
3319
|
+
declare function correlationStudy(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetrics: EvalMetricSpec[], outcomeMetricNames: string[], options?: CorrelationStudyOptions): Promise<CorrelationStudyResult>;
|
|
3320
|
+
|
|
3321
|
+
/**
|
|
3322
|
+
* Calibration curve — binned "if eval says X, what does reality show?"
|
|
3323
|
+
*
|
|
3324
|
+
* Companion to correlationStudy. Raw correlation is a single number;
|
|
3325
|
+
* the calibration curve shows *where* the eval is well-calibrated vs
|
|
3326
|
+
* overconfident / underconfident. Buckets the eval metric, computes
|
|
3327
|
+
* mean outcome per bucket, reports expected-calibration-error (ECE).
|
|
3328
|
+
*/
|
|
3329
|
+
|
|
3330
|
+
interface CalibrationBin {
|
|
3331
|
+
lower: number;
|
|
3332
|
+
upper: number;
|
|
3333
|
+
n: number;
|
|
3334
|
+
evalMean: number;
|
|
3335
|
+
outcomeMean: number;
|
|
3336
|
+
/** |outcomeMean − evalMean|; contributes to ECE weighted by n/total. */
|
|
3337
|
+
gap: number;
|
|
3338
|
+
}
|
|
3339
|
+
interface CalibrationReport {
|
|
3340
|
+
evalMetric: string;
|
|
3341
|
+
outcomeMetric: string;
|
|
3342
|
+
n: number;
|
|
3343
|
+
bins: CalibrationBin[];
|
|
3344
|
+
/** Expected Calibration Error — Σ (n_i/N) × |outcomeMean_i − evalMean_i|. */
|
|
3345
|
+
ece: number;
|
|
3346
|
+
/** Max bin gap — upper bound on miscalibration. */
|
|
3347
|
+
maxGap: number;
|
|
3348
|
+
}
|
|
3349
|
+
interface CalibrationOptions {
|
|
3350
|
+
bins?: number;
|
|
3351
|
+
/** Equal-width (fixed bin edges) or equal-frequency (quantile bins). */
|
|
3352
|
+
binning?: 'equal-width' | 'equal-frequency';
|
|
3353
|
+
/** Clip eval values to [lo, hi] before binning. */
|
|
3354
|
+
range?: {
|
|
3355
|
+
lo: number;
|
|
3356
|
+
hi: number;
|
|
3357
|
+
};
|
|
3358
|
+
}
|
|
3359
|
+
declare function calibrationCurve(traceStore: TraceStore, outcomeStore: OutcomeStore, evalMetric: EvalMetricSpec, outcomeMetric: string, options?: CalibrationOptions): Promise<CalibrationReport | null>;
|
|
3360
|
+
|
|
3361
|
+
/**
|
|
3362
|
+
* Process Reward Modeling — per-step rubric grading.
|
|
3363
|
+
*
|
|
3364
|
+
* A StepRubric inspects one span and returns a score + rationale.
|
|
3365
|
+
* PrmGrader applies an array of rubrics to every LLM span in a
|
|
3366
|
+
* trajectory (consumers can broaden to tool/retrieval spans via the
|
|
3367
|
+
* `kinds` filter on each rubric).
|
|
3368
|
+
*
|
|
3369
|
+
* Why this matters: outcome-only eval (did the final artifact work?)
|
|
3370
|
+
* gives sparse reward — most agent turns are unattributable. PRMs
|
|
3371
|
+
* densify the signal so optimizers and RL fine-tuning can assign
|
|
3372
|
+
* credit per turn.
|
|
3373
|
+
*/
|
|
3374
|
+
|
|
3375
|
+
interface StepContext {
|
|
3376
|
+
trajectory: Trajectory;
|
|
3377
|
+
step: TrajectoryStep;
|
|
3378
|
+
/** Steps preceding `step` in trajectory order. */
|
|
3379
|
+
prior: TrajectoryStep[];
|
|
3380
|
+
/** Steps following `step`. */
|
|
3381
|
+
next: TrajectoryStep[];
|
|
3382
|
+
}
|
|
3383
|
+
interface StepRubric {
|
|
3384
|
+
id: string;
|
|
3385
|
+
/** Only grade spans of these kinds (default: all). */
|
|
3386
|
+
kinds?: Array<Span['kind']>;
|
|
3387
|
+
/** Weight in the aggregate score. Default 1. */
|
|
3388
|
+
weight?: number;
|
|
3389
|
+
/** Returns score in 0..1 + optional rationale/evidence. Return `null` to
|
|
3390
|
+
* skip grading (rubric doesn't apply to this step). */
|
|
3391
|
+
grade: (ctx: StepContext) => Promise<{
|
|
3392
|
+
score: number;
|
|
3393
|
+
rationale?: string;
|
|
3394
|
+
evidence?: string;
|
|
3395
|
+
} | null>;
|
|
3396
|
+
}
|
|
3397
|
+
interface GradedStep {
|
|
3398
|
+
spanId: string;
|
|
3399
|
+
rubricId: string;
|
|
3400
|
+
score: number;
|
|
3401
|
+
weight: number;
|
|
3402
|
+
rationale?: string;
|
|
3403
|
+
evidence?: string;
|
|
3404
|
+
}
|
|
3405
|
+
interface PrmGradedTrace {
|
|
3406
|
+
runId: string;
|
|
3407
|
+
steps: GradedStep[];
|
|
3408
|
+
/** Weighted mean of all graded steps; 0..1. */
|
|
3409
|
+
aggregateScore: number;
|
|
3410
|
+
/** Number of spans graded — useful for sanity-checking coverage. */
|
|
3411
|
+
gradedCount: number;
|
|
3412
|
+
/** Number of spans in the trajectory that no rubric matched. */
|
|
3413
|
+
ungradedCount: number;
|
|
3414
|
+
}
|
|
3415
|
+
declare class PrmGrader {
|
|
3416
|
+
private rubrics;
|
|
3417
|
+
constructor(rubrics: StepRubric[]);
|
|
3418
|
+
/**
|
|
3419
|
+
* Grade every eligible span in a run. Emits a JudgeVerdict span for each
|
|
3420
|
+
* (rubric × span) verdict so the result is visible to downstream pipelines
|
|
3421
|
+
* (judgeAgreementView, etc.) — PRM is just "a judge that runs per span."
|
|
3422
|
+
*/
|
|
3423
|
+
grade(store: TraceStore, runId: string): Promise<PrmGradedTrace>;
|
|
3424
|
+
}
|
|
3425
|
+
/** Helper: checks whether a JudgeVerdict span was emitted by PRM, so downstream pipelines
|
|
3426
|
+
* can distinguish PRM verdicts from human or top-level LLM judges. */
|
|
3427
|
+
declare function isPrmVerdict(verdict: JudgeSpan): boolean;
|
|
3428
|
+
|
|
3429
|
+
/**
|
|
3430
|
+
* Built-in reference rubrics. Consumers combine these with domain
|
|
3431
|
+
* rubrics. All are deterministic, rule-based — cheap to run + easy
|
|
3432
|
+
* to unit-test. LLM-based rubrics are trivially authored by
|
|
3433
|
+
* following the StepRubric contract.
|
|
3434
|
+
*/
|
|
3435
|
+
|
|
3436
|
+
/** Penalize very short or very long assistant outputs. */
|
|
3437
|
+
declare function outputLengthRubric(args?: {
|
|
3438
|
+
minChars?: number;
|
|
3439
|
+
maxChars?: number;
|
|
3440
|
+
weight?: number;
|
|
3441
|
+
}): StepRubric;
|
|
3442
|
+
/** Reward tool calls that succeeded (status='ok') with an informative result. */
|
|
3443
|
+
declare function toolSuccessRubric(args?: {
|
|
3444
|
+
weight?: number;
|
|
3445
|
+
}): StepRubric;
|
|
3446
|
+
/** Penalize tool calls that duplicate a prior call with identical args. */
|
|
3447
|
+
declare function toolNonRedundantRubric(args?: {
|
|
3448
|
+
weight?: number;
|
|
3449
|
+
}): StepRubric;
|
|
3450
|
+
/** Penalize LLM outputs that contain common refusal markers when a refusal
|
|
3451
|
+
* is NOT expected (caller inverts weight for scenarios where refusal IS expected). */
|
|
3452
|
+
declare function nonRefusalRubric(args?: {
|
|
3453
|
+
markers?: RegExp[];
|
|
3454
|
+
weight?: number;
|
|
3455
|
+
}): StepRubric;
|
|
3456
|
+
/** Reward outputs that invoke the next-step tool the trajectory actually uses
|
|
3457
|
+
* (i.e. the LLM span announced "I will call X" and the following tool span IS X). */
|
|
3458
|
+
declare function toolIntentAlignmentRubric(args?: {
|
|
3459
|
+
weight?: number;
|
|
3460
|
+
}): StepRubric;
|
|
3461
|
+
|
|
3462
|
+
/**
|
|
3463
|
+
* Export PRM-graded traces as training data for downstream reward-model
|
|
3464
|
+
* fine-tuning. Canonical format is NDJSON of
|
|
3465
|
+
* `{ trajectory_text, step_index, rubric, score }` so a small model can
|
|
3466
|
+
* learn to predict step rewards from step context.
|
|
3467
|
+
*
|
|
3468
|
+
* The framework doesn't train the model — we emit the data; callers
|
|
3469
|
+
* plug it into their preferred trainer (TRL, Unsloth, custom).
|
|
3470
|
+
*/
|
|
3471
|
+
|
|
3472
|
+
interface PrmTrainingSample {
|
|
3473
|
+
runId: string;
|
|
3474
|
+
spanId: string;
|
|
3475
|
+
rubricId: string;
|
|
3476
|
+
score: number;
|
|
3477
|
+
/** Serialized step context — step + surrounding conversation. */
|
|
3478
|
+
context: {
|
|
3479
|
+
priorTurns: Array<{
|
|
3480
|
+
role: string;
|
|
3481
|
+
content: string;
|
|
3482
|
+
}>;
|
|
3483
|
+
step: {
|
|
3484
|
+
kind: Span['kind'];
|
|
3485
|
+
text: string;
|
|
3486
|
+
};
|
|
3487
|
+
};
|
|
3488
|
+
/** Optional evidence + rationale for auditability. */
|
|
3489
|
+
rationale?: string;
|
|
3490
|
+
evidence?: string;
|
|
3491
|
+
}
|
|
3492
|
+
declare function exportTrainingData(store: TraceStore, graded: PrmGradedTrace[], options?: {
|
|
3493
|
+
contextWindow?: number;
|
|
3494
|
+
}): Promise<PrmTrainingSample[]>;
|
|
3495
|
+
/** NDJSON serialization — write to file or stream directly to a trainer. */
|
|
3496
|
+
declare function toNdjson(samples: PrmTrainingSample[]): string;
|
|
3497
|
+
|
|
3498
|
+
/**
|
|
3499
|
+
* Inference-time PRM scoring — pick the best of N candidate trajectories
|
|
3500
|
+
* using a trained reward model (or a rule-based PRM as a proxy).
|
|
3501
|
+
*
|
|
3502
|
+
* The canonical Best-of-N pattern: generate N completions, score each
|
|
3503
|
+
* with a PRM, pick the winner. Here the scoring loop is framework-agnostic
|
|
3504
|
+
* — supply a TraceStore + PrmGrader + N run IDs → get ranking + winner.
|
|
3505
|
+
*/
|
|
3506
|
+
|
|
3507
|
+
interface BestOfNResult {
|
|
3508
|
+
winner: PrmGradedTrace;
|
|
3509
|
+
ranked: PrmGradedTrace[];
|
|
3510
|
+
/** Standard deviation of aggregate scores — small = candidates were homogeneous. */
|
|
3511
|
+
stdDev: number;
|
|
3512
|
+
}
|
|
3513
|
+
declare function prmBestOfN(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<BestOfNResult>;
|
|
3514
|
+
/**
|
|
3515
|
+
* Weighted vote across multiple graders — use when you want a PRM ensemble
|
|
3516
|
+
* (e.g. rule-based + LLM-based + trained model). Each grader produces its
|
|
3517
|
+
* own ranking; we aggregate via rank-sum (Borda count) so no single grader
|
|
3518
|
+
* dominates via a different score scale.
|
|
3519
|
+
*/
|
|
3520
|
+
declare function prmEnsembleBestOfN(store: TraceStore, graders: PrmGrader[], runIds: string[]): Promise<BestOfNResult>;
|
|
3521
|
+
|
|
3522
|
+
/**
|
|
3523
|
+
* Bisector — auto-locate the change that introduced an eval regression.
|
|
3524
|
+
*
|
|
3525
|
+
* Two shapes:
|
|
3526
|
+
* - `commitBisect` — walk an ordered SHA list, binary-search for the
|
|
3527
|
+
* first commit that fails.
|
|
3528
|
+
* - `promptBisect` — given a good and bad prompt, progressively port
|
|
3529
|
+
* paragraphs from bad into good to localize the breaking change.
|
|
3530
|
+
*
|
|
3531
|
+
* Generic `bisect<T>` lets callers drive any ordered state space
|
|
3532
|
+
* (dataset versions, config files, CLI flag combinations).
|
|
3533
|
+
*/
|
|
3534
|
+
interface BisectOptions<T> {
|
|
3535
|
+
/** State known to pass. */
|
|
3536
|
+
good: T;
|
|
3537
|
+
/** State known to fail. */
|
|
3538
|
+
bad: T;
|
|
3539
|
+
/** Equality test on state values — default Object.is. */
|
|
3540
|
+
equals?: (a: T, b: T) => boolean;
|
|
3541
|
+
/** Pick the halfway state between good + bad. Return null when no further
|
|
3542
|
+
* split is possible (e.g. adjacent commits). */
|
|
3543
|
+
halfway: (good: T, bad: T) => T | null;
|
|
3544
|
+
/** Produce a verdict for a state. */
|
|
3545
|
+
runEval: (state: T) => Promise<{
|
|
3546
|
+
score: number;
|
|
3547
|
+
pass: boolean;
|
|
3548
|
+
}>;
|
|
3549
|
+
/** Hard cap on iterations (default 40 — covers ~1T ordered states). */
|
|
3550
|
+
maxIterations?: number;
|
|
3551
|
+
}
|
|
3552
|
+
interface BisectStep<T> {
|
|
3553
|
+
state: T;
|
|
3554
|
+
score: number;
|
|
3555
|
+
pass: boolean;
|
|
3556
|
+
}
|
|
3557
|
+
interface BisectResult<T> {
|
|
3558
|
+
/** The first bad state — typically `bad` in the final (good, bad) adjacent pair. */
|
|
3559
|
+
culprit: T;
|
|
3560
|
+
/** Ordered trace of all states evaluated. */
|
|
3561
|
+
path: BisectStep<T>[];
|
|
3562
|
+
/** True when we narrowed to an adjacent (good, bad) pair. */
|
|
3563
|
+
converged: boolean;
|
|
3564
|
+
/** True when `good` itself failed or `bad` itself passed — the caller's
|
|
3565
|
+
* premise was broken. */
|
|
3566
|
+
inputInconsistent: boolean;
|
|
3567
|
+
}
|
|
3568
|
+
declare function bisect<T>(options: BisectOptions<T>): Promise<BisectResult<T>>;
|
|
3569
|
+
/**
|
|
3570
|
+
* Commit bisect — `commits` is an ordered SHA list, oldest to newest.
|
|
3571
|
+
* `good` and `bad` must both be present in the list.
|
|
3572
|
+
*/
|
|
3573
|
+
declare function commitBisect(options: {
|
|
3574
|
+
commits: string[];
|
|
3575
|
+
good: string;
|
|
3576
|
+
bad: string;
|
|
3577
|
+
runEval: (sha: string) => Promise<{
|
|
3578
|
+
score: number;
|
|
3579
|
+
pass: boolean;
|
|
3580
|
+
}>;
|
|
3581
|
+
maxIterations?: number;
|
|
3582
|
+
}): Promise<BisectResult<string>>;
|
|
3583
|
+
/**
|
|
3584
|
+
* Prompt bisect — splits the good and bad prompts into paragraphs, then
|
|
3585
|
+
* progressively replaces paragraphs in `good` with their counterparts
|
|
3586
|
+
* from `bad` to localize the offending change. Only works when the two
|
|
3587
|
+
* prompts have the same paragraph count (a common editorial workflow
|
|
3588
|
+
* constraint — one paragraph = one change unit).
|
|
3589
|
+
*/
|
|
3590
|
+
declare function promptBisect(options: {
|
|
3591
|
+
good: string;
|
|
3592
|
+
bad: string;
|
|
3593
|
+
runEval: (prompt: string) => Promise<{
|
|
3594
|
+
score: number;
|
|
3595
|
+
pass: boolean;
|
|
3596
|
+
}>;
|
|
3597
|
+
maxIterations?: number;
|
|
3598
|
+
paragraphSplitter?: (prompt: string) => string[];
|
|
3599
|
+
}): Promise<BisectResult<string> & {
|
|
3600
|
+
offendingParagraphIndex?: number;
|
|
3601
|
+
}>;
|
|
3602
|
+
|
|
3603
|
+
/**
|
|
3604
|
+
* Counterfactual replay — "what would have happened if we'd changed
|
|
3605
|
+
* exactly one thing at turn N?"
|
|
3606
|
+
*
|
|
3607
|
+
* The framework does NOT drive the agent — it sets up the replay
|
|
3608
|
+
* context (prior spans, prior state, mutation spec) and records the
|
|
3609
|
+
* resulting divergence. Consumers supply an `executeFrom(ctx)` callback
|
|
3610
|
+
* that runs their agent starting from turn N with the mutation applied.
|
|
3611
|
+
*
|
|
3612
|
+
* Counterfactual runs are recorded as a new Run with `layer='meta'` and
|
|
3613
|
+
* `parentRunId = originalRunId`, so downstream diff + correlation
|
|
3614
|
+
* pipelines see them natively.
|
|
3615
|
+
*/
|
|
3616
|
+
|
|
3617
|
+
type CounterfactualMutation = {
|
|
3618
|
+
kind: 'swap-model';
|
|
3619
|
+
at: number;
|
|
3620
|
+
newModel: string;
|
|
3621
|
+
} | {
|
|
3622
|
+
kind: 'swap-tool-result';
|
|
3623
|
+
at: number;
|
|
3624
|
+
newResult: unknown;
|
|
3625
|
+
} | {
|
|
3626
|
+
kind: 'truncate-after';
|
|
3627
|
+
at: number;
|
|
3628
|
+
} | {
|
|
3629
|
+
kind: 'inject-system-message';
|
|
3630
|
+
at: number;
|
|
3631
|
+
content: string;
|
|
3632
|
+
} | {
|
|
3633
|
+
kind: 'custom';
|
|
3634
|
+
at: number;
|
|
3635
|
+
describe: string;
|
|
3636
|
+
apply: (step: TrajectoryStep) => TrajectoryStep;
|
|
3637
|
+
};
|
|
3638
|
+
interface CounterfactualContext {
|
|
3639
|
+
originalRunId: string;
|
|
3640
|
+
originalTrajectory: Trajectory;
|
|
3641
|
+
/** Steps up to (but not including) the mutation point — the prefix the
|
|
3642
|
+
* replayed agent inherits as its prior conversation/tool history. */
|
|
3643
|
+
prefix: TrajectoryStep[];
|
|
3644
|
+
mutation: CounterfactualMutation;
|
|
3645
|
+
/** Pre-applied mutation on the step at `mutation.at`. Consumers use this
|
|
3646
|
+
* as the FIRST step the replayed agent emits (they decide whether to
|
|
3647
|
+
* re-emit it or continue from there). */
|
|
3648
|
+
mutatedStep: TrajectoryStep;
|
|
3649
|
+
}
|
|
3650
|
+
interface CounterfactualResult {
|
|
3651
|
+
counterfactualRunId: string;
|
|
3652
|
+
originalRunId: string;
|
|
3653
|
+
mutation: CounterfactualMutation;
|
|
3654
|
+
/** Structured delta summary — caller can extend via scoring. */
|
|
3655
|
+
delta: {
|
|
3656
|
+
originalOutcomeScore: number | null;
|
|
3657
|
+
counterfactualOutcomeScore: number | null;
|
|
3658
|
+
deltaScore: number | null;
|
|
3659
|
+
};
|
|
3660
|
+
}
|
|
3661
|
+
interface CounterfactualRunner {
|
|
3662
|
+
/**
|
|
3663
|
+
* Execute the agent from `ctx.prefix` with the mutation applied.
|
|
3664
|
+
* MUST emit spans into the provided emitter so they become part of
|
|
3665
|
+
* the counterfactual run. MUST call emitter.endRun() with a verdict.
|
|
3666
|
+
*/
|
|
3667
|
+
executeFrom: (ctx: CounterfactualContext, emitter: TraceEmitter) => Promise<void>;
|
|
3668
|
+
}
|
|
3669
|
+
declare function runCounterfactual(store: TraceStore, originalRunId: string, mutation: CounterfactualMutation, runner: CounterfactualRunner): Promise<CounterfactualResult>;
|
|
3670
|
+
/**
|
|
3671
|
+
* Aggregate a batch of counterfactuals into a simple attribution table:
|
|
3672
|
+
* which mutation kinds move outcomes most? (Useful when you run a grid
|
|
3673
|
+
* over the same trajectory — swap-model at every llm span, swap-tool-result
|
|
3674
|
+
* at every tool span — and want a ranked summary.)
|
|
3675
|
+
*/
|
|
3676
|
+
declare function attributeCounterfactuals(results: CounterfactualResult[]): Array<{
|
|
3677
|
+
mutationKind: CounterfactualMutation['kind'];
|
|
3678
|
+
n: number;
|
|
3679
|
+
meanAbsDelta: number;
|
|
3680
|
+
meanSignedDelta: number;
|
|
3681
|
+
}>;
|
|
3682
|
+
|
|
3683
|
+
/**
|
|
3684
|
+
* Full cross-trace diff — align two trajectories step-by-step, report
|
|
3685
|
+
* per-step score deltas, attribute a variant's total outcome lead to
|
|
3686
|
+
* specific turns.
|
|
3687
|
+
*
|
|
3688
|
+
* 0.5 shipped `firstDivergenceView` (finds the first differing step).
|
|
3689
|
+
* This does the heavier work: full alignment via LCS, per-step
|
|
3690
|
+
* contribution to score delta using PRM verdicts when available,
|
|
3691
|
+
* fallback to structural heuristics (latency, token count, tool
|
|
3692
|
+
* outcome) otherwise.
|
|
3693
|
+
*/
|
|
3694
|
+
|
|
3695
|
+
type AlignmentOp = {
|
|
3696
|
+
op: 'match';
|
|
3697
|
+
a: TrajectoryStep;
|
|
3698
|
+
b: TrajectoryStep;
|
|
3699
|
+
} | {
|
|
3700
|
+
op: 'insert';
|
|
3701
|
+
b: TrajectoryStep;
|
|
3702
|
+
} | {
|
|
3703
|
+
op: 'delete';
|
|
3704
|
+
a: TrajectoryStep;
|
|
3705
|
+
} | {
|
|
3706
|
+
op: 'replace';
|
|
3707
|
+
a: TrajectoryStep;
|
|
3708
|
+
b: TrajectoryStep;
|
|
3709
|
+
};
|
|
3710
|
+
interface StepAttribution {
|
|
3711
|
+
op: AlignmentOp;
|
|
3712
|
+
/** Difference in PRM score (or null when not scored by a matching judge). */
|
|
3713
|
+
prmDelta: number | null;
|
|
3714
|
+
/** Difference in latency (endedAt - startedAt). */
|
|
3715
|
+
latencyDeltaMs: number | null;
|
|
3716
|
+
/** Difference in token count (LLM spans). */
|
|
3717
|
+
tokenDelta: number | null;
|
|
3718
|
+
/** Reason this step is / isn't considered a contributor to the outcome delta. */
|
|
3719
|
+
note: string;
|
|
3720
|
+
}
|
|
3721
|
+
interface CrossTraceDiff {
|
|
3722
|
+
runA: string;
|
|
3723
|
+
runB: string;
|
|
3724
|
+
alignment: AlignmentOp[];
|
|
3725
|
+
attributions: StepAttribution[];
|
|
3726
|
+
/** Total score delta (B - A). */
|
|
3727
|
+
totalScoreDelta: number | null;
|
|
3728
|
+
/** Sum of PRM deltas across matched/replaced steps. Close to
|
|
3729
|
+
* `totalScoreDelta` when PRM covers the trajectory; gap indicates
|
|
3730
|
+
* unmodeled variance. */
|
|
3731
|
+
prmDeltaSum: number;
|
|
3732
|
+
}
|
|
3733
|
+
interface CrossTraceDiffOptions {
|
|
3734
|
+
stepEquals?: (a: TrajectoryStep, b: TrajectoryStep) => boolean;
|
|
3735
|
+
}
|
|
3736
|
+
declare function crossTraceDiff(store: TraceStore, runA: string, runB: string, options?: CrossTraceDiffOptions): Promise<CrossTraceDiff>;
|
|
3737
|
+
|
|
3738
|
+
/**
|
|
3739
|
+
* Pre-registered hypotheses — declare what you're testing BEFORE the
|
|
3740
|
+
* run, check it AFTER. Prevents p-hacking, optional stopping, and the
|
|
3741
|
+
* "we ran until it looked good" failure mode.
|
|
3742
|
+
*
|
|
3743
|
+
* Manifest is a plain JSON-friendly object. Sign it with a content hash
|
|
3744
|
+
* + timestamp; the registered record becomes immutable. Post-run,
|
|
3745
|
+
* evaluate the manifest against observed results — the library refuses
|
|
3746
|
+
* to let you re-interpret a different metric as the declared one.
|
|
3747
|
+
*/
|
|
3748
|
+
interface HypothesisManifest {
|
|
3749
|
+
id: string;
|
|
3750
|
+
/** Human prose — goes into the audit trail. */
|
|
3751
|
+
hypothesis: string;
|
|
3752
|
+
/** Metric the hypothesis claims to move. */
|
|
3753
|
+
metric: string;
|
|
3754
|
+
/** 'increase' = candidate should score higher than baseline; 'decrease' = lower. */
|
|
3755
|
+
direction: 'increase' | 'decrease';
|
|
3756
|
+
/** Minimum effect size to count (same units as the metric). */
|
|
3757
|
+
minEffect: number;
|
|
3758
|
+
/** Alpha threshold. */
|
|
3759
|
+
alpha: number;
|
|
3760
|
+
/** Target statistical power at which sample size was pre-computed. */
|
|
3761
|
+
power: number;
|
|
3762
|
+
/** Declared N per arm before running. */
|
|
3763
|
+
preRegisteredN: number;
|
|
3764
|
+
/** ISO8601 timestamp the manifest was registered. */
|
|
3765
|
+
registeredAt: string;
|
|
3766
|
+
/** Optional identifiers to tie into the trace corpus. */
|
|
3767
|
+
baselineLabel?: string;
|
|
3768
|
+
candidateLabel?: string;
|
|
3769
|
+
}
|
|
3770
|
+
interface SignedManifest extends HypothesisManifest {
|
|
3771
|
+
/** sha256 hex of canonicalized manifest (everything except contentHash). */
|
|
3772
|
+
contentHash: string;
|
|
3773
|
+
}
|
|
3774
|
+
interface HypothesisResult {
|
|
3775
|
+
manifest: SignedManifest;
|
|
3776
|
+
observedN: number;
|
|
3777
|
+
observedEffect: number;
|
|
3778
|
+
observedPValue: number;
|
|
3779
|
+
/** True iff the observed effect hits the pre-declared direction with
|
|
3780
|
+
* magnitude ≥ minEffect AND p < alpha. */
|
|
3781
|
+
confirmed: boolean;
|
|
3782
|
+
/** Enumerated reasons the hypothesis was rejected (each a machine-tag). */
|
|
3783
|
+
rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
|
|
3784
|
+
notes?: string;
|
|
3785
|
+
}
|
|
3786
|
+
declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
|
|
3787
|
+
/** Verify that a signed manifest has not been tampered with. */
|
|
3788
|
+
declare function verifyManifest(m: SignedManifest): Promise<boolean>;
|
|
3789
|
+
/**
|
|
3790
|
+
* Evaluate a pre-registered hypothesis against observed results.
|
|
3791
|
+
* Mechanical — no re-interpretation permitted.
|
|
3792
|
+
*/
|
|
3793
|
+
declare function evaluateHypothesis(manifest: SignedManifest, observed: {
|
|
3794
|
+
n: number;
|
|
3795
|
+
effect: number;
|
|
3796
|
+
pValue: number;
|
|
3797
|
+
}): Promise<HypothesisResult>;
|
|
3798
|
+
|
|
3799
|
+
/**
|
|
3800
|
+
* Self-play scenario evolution — agents generate adversarial scenarios
|
|
3801
|
+
* against each other; survivors become part of the eval corpus.
|
|
3802
|
+
*
|
|
3803
|
+
* Framework-agnostic about how scenarios are generated. Caller supplies:
|
|
3804
|
+
* - `propose`: asks a "proposer" agent for candidate scenarios
|
|
3805
|
+
* - `scoreCandidate`: runs each target agent against a scenario and returns
|
|
3806
|
+
* its score
|
|
3807
|
+
*
|
|
3808
|
+
* A scenario *survives* if it reveals a meaningful score difference
|
|
3809
|
+
* between two target agents (or between a target agent and itself on
|
|
3810
|
+
* different runs). Survivors are promoted to a Dataset; the caller
|
|
3811
|
+
* decides what to do with them (hold-out, training, regression set).
|
|
3812
|
+
*
|
|
3813
|
+
* Guard rails: minimum absolute score delta to consider a scenario
|
|
3814
|
+
* informative; floor on absolute target score so degenerate break-all
|
|
3815
|
+
* scenarios (noise, gibberish) don't flood the corpus.
|
|
3816
|
+
*/
|
|
3817
|
+
|
|
3818
|
+
interface CandidateScenario {
|
|
3819
|
+
id: string;
|
|
3820
|
+
payload: unknown;
|
|
3821
|
+
/** Free-form tags (domain, generation, parent). */
|
|
3822
|
+
tags?: Record<string, string>;
|
|
3823
|
+
}
|
|
3824
|
+
interface ScoredTarget {
|
|
3825
|
+
targetId: string;
|
|
3826
|
+
score: number;
|
|
3827
|
+
}
|
|
3828
|
+
interface EvolutionRound {
|
|
3829
|
+
round: number;
|
|
3830
|
+
proposed: CandidateScenario[];
|
|
3831
|
+
survived: CandidateScenario[];
|
|
3832
|
+
rejected: Array<{
|
|
3833
|
+
candidate: CandidateScenario;
|
|
3834
|
+
reason: string;
|
|
3835
|
+
}>;
|
|
3836
|
+
scoredBreakdown: Array<{
|
|
3837
|
+
candidate: CandidateScenario;
|
|
3838
|
+
scores: ScoredTarget[];
|
|
3839
|
+
spread: number;
|
|
3840
|
+
}>;
|
|
3841
|
+
}
|
|
3842
|
+
interface SelfPlayOptions {
|
|
3843
|
+
/** Minimum score spread across targets for a scenario to survive. Default 0.1. */
|
|
3844
|
+
minSpread?: number;
|
|
3845
|
+
/** Minimum floor score across targets — keeps degenerate break-all scenarios
|
|
3846
|
+
* out. Default 0.1 (if every target scores below this, discard). */
|
|
3847
|
+
minAbsoluteFloor?: number;
|
|
3848
|
+
/** Hard cap on survivors per round. Default 50. */
|
|
3849
|
+
maxSurvivors?: number;
|
|
3850
|
+
/** Rounds to run. Default 1. Each round's survivors can be fed back into
|
|
3851
|
+
* `propose` to compound. */
|
|
3852
|
+
rounds?: number;
|
|
3853
|
+
/** Seed for scenario id generation if proposer doesn't provide one. */
|
|
3854
|
+
seed?: number;
|
|
3855
|
+
}
|
|
3856
|
+
interface SelfPlayProposer {
|
|
3857
|
+
propose(round: number, priorSurvivors: CandidateScenario[]): Promise<CandidateScenario[]>;
|
|
3858
|
+
}
|
|
3859
|
+
interface SelfPlayScorer {
|
|
3860
|
+
/** Score one candidate against every target; returns parallel array. */
|
|
3861
|
+
scoreCandidate(candidate: CandidateScenario, targets: string[]): Promise<ScoredTarget[]>;
|
|
3862
|
+
}
|
|
3863
|
+
declare function runSelfPlay(proposer: SelfPlayProposer, scorer: SelfPlayScorer, targets: string[], options?: SelfPlayOptions): Promise<{
|
|
3864
|
+
rounds: EvolutionRound[];
|
|
3865
|
+
dataset: Dataset;
|
|
3866
|
+
}>;
|
|
3867
|
+
|
|
3868
|
+
/**
|
|
3869
|
+
* Causal attribution via factorial experiments.
|
|
3870
|
+
*
|
|
3871
|
+
* Run every combination of {model × prompt × scenario × seed}, then
|
|
3872
|
+
* decompose observed score variance into main effects + interactions.
|
|
3873
|
+
* Moves from correlational "variant B is better" to causal "the model
|
|
3874
|
+
* swap accounts for 42% of the lead; the prompt change accounts for 28%;
|
|
3875
|
+
* interaction is 30%."
|
|
3876
|
+
*
|
|
3877
|
+
* Minimal implementation: 2-way factorial (two factors at a time) with
|
|
3878
|
+
* main-effect + interaction decomposition via variance of cell means.
|
|
3879
|
+
* Consumers run the factorial design themselves (we don't schedule
|
|
3880
|
+
* runs); this module consumes the (factorLevels, observedScores)
|
|
3881
|
+
* table and does the attribution math.
|
|
3882
|
+
*/
|
|
3883
|
+
interface FactorialCell {
|
|
3884
|
+
/** Map factor name → level id. e.g. { model: 'claude', prompt: 'v2' } */
|
|
3885
|
+
levels: Record<string, string>;
|
|
3886
|
+
/** Observed score for this cell (mean over replications if n > 1). */
|
|
3887
|
+
score: number;
|
|
3888
|
+
/** Number of replications averaged to produce `score`. */
|
|
3889
|
+
n: number;
|
|
3890
|
+
}
|
|
3891
|
+
interface FactorContribution {
|
|
3892
|
+
factor: string;
|
|
3893
|
+
/** Variance attributed to this factor's main effect, as a fraction of total. */
|
|
3894
|
+
shareOfVariance: number;
|
|
3895
|
+
/** Range of cell means across levels of this factor. */
|
|
3896
|
+
range: number;
|
|
3897
|
+
}
|
|
3898
|
+
interface InteractionContribution {
|
|
3899
|
+
factors: [string, string];
|
|
3900
|
+
shareOfVariance: number;
|
|
3901
|
+
}
|
|
3902
|
+
interface CausalAttributionReport {
|
|
3903
|
+
totalVariance: number;
|
|
3904
|
+
mainEffects: FactorContribution[];
|
|
3905
|
+
interactions: InteractionContribution[];
|
|
3906
|
+
/** Residual = variance unexplained by main effects + modeled interactions. */
|
|
3907
|
+
residualShare: number;
|
|
3908
|
+
/** Sanity: shares sum to 1 (within fp). */
|
|
3909
|
+
sharesSum: number;
|
|
3910
|
+
}
|
|
3911
|
+
declare function causalAttribution(cells: FactorialCell[]): CausalAttributionReport;
|
|
3912
|
+
|
|
3913
|
+
/**
|
|
3914
|
+
* Active learning — agent-as-scenario-author.
|
|
3915
|
+
*
|
|
3916
|
+
* Analyzes an existing Dataset + trace corpus for coverage gaps and
|
|
3917
|
+
* weak spots, returns a prioritized list of *synthesis targets*:
|
|
3918
|
+
* (gap description, existing-neighbor examples, suggested direction).
|
|
3919
|
+
*
|
|
3920
|
+
* Does NOT call an LLM itself — the proposer agent is caller-supplied.
|
|
3921
|
+
* This module's job is to identify WHERE new scenarios would compound
|
|
3922
|
+
* the most information, not to author them.
|
|
3923
|
+
*
|
|
3924
|
+
* Gaps we detect:
|
|
3925
|
+
* - dimensions with high score variance (unstable, need more data)
|
|
3926
|
+
* - dimensions with low coverage count (undersampled)
|
|
3927
|
+
* - failure classes with clusters (systematic weakness)
|
|
3928
|
+
* - difficulty bins with no coverage
|
|
3929
|
+
*/
|
|
3930
|
+
|
|
3931
|
+
type SynthesisReason = 'high-variance' | 'undersampled' | 'failure-cluster' | 'difficulty-gap';
|
|
3932
|
+
interface SynthesisTarget {
|
|
3933
|
+
reason: SynthesisReason;
|
|
3934
|
+
description: string;
|
|
3935
|
+
/** Existing scenarios that are closest to the gap; caller feeds these to
|
|
3936
|
+
* their LLM proposer as few-shot examples. */
|
|
3937
|
+
neighbors: DatasetScenario[];
|
|
3938
|
+
/** Suggested direction — e.g. "harder variants", "edge cases of X", "failure class Y". */
|
|
3939
|
+
direction: string;
|
|
3940
|
+
/** Priority score — higher = more information-dense gap. 0..1. */
|
|
3941
|
+
priority: number;
|
|
3942
|
+
}
|
|
3943
|
+
interface ActiveLearningOptions {
|
|
3944
|
+
/** Minimum scenarios per difficulty band to count as "covered". */
|
|
3945
|
+
minPerBand?: number;
|
|
3946
|
+
/** Variance threshold above which a scenario's dimension is "unstable". */
|
|
3947
|
+
varianceThreshold?: number;
|
|
3948
|
+
/** Max synthesis targets returned. */
|
|
3949
|
+
topK?: number;
|
|
3950
|
+
}
|
|
3951
|
+
declare function proposeSynthesisTargets(dataset: Dataset, traceStore: TraceStore, options?: ActiveLearningOptions): Promise<SynthesisTarget[]>;
|
|
3952
|
+
|
|
3953
|
+
/**
|
|
3954
|
+
* Reward-model export — the productizable wrapper around PRM training
|
|
3955
|
+
* data. Takes a TraceStore + PrmGrader, produces an embeddable
|
|
3956
|
+
* inference scorer that customers plug into their own agent stack.
|
|
3957
|
+
*
|
|
3958
|
+
* Two export forms:
|
|
3959
|
+
* - `exportRewardModel(store, grader, runIds)` — serializes the (step-context,
|
|
3960
|
+
* score) corpus to a framework-agnostic payload. Customer fine-tunes
|
|
3961
|
+
* their own model; we ship the scaffolding.
|
|
3962
|
+
* - `loadScorerFromGrader(grader)` — a zero-deps "reward model"
|
|
3963
|
+
* that literally replays the trained rubric at inference time. Works
|
|
3964
|
+
* as a reference baseline + deterministic fallback.
|
|
3965
|
+
*/
|
|
3966
|
+
|
|
3967
|
+
interface ExportedRewardModel {
|
|
3968
|
+
/** Version of the export format. Bump when payload shape changes. */
|
|
3969
|
+
version: '1.0';
|
|
3970
|
+
/** Metadata about the training corpus. */
|
|
3971
|
+
metadata: {
|
|
3972
|
+
nTraces: number;
|
|
3973
|
+
nSamples: number;
|
|
3974
|
+
rubrics: string[];
|
|
3975
|
+
exportedAt: string;
|
|
3976
|
+
/** Mean reward across training corpus — use as sanity check at load. */
|
|
3977
|
+
meanReward: number;
|
|
3978
|
+
};
|
|
3979
|
+
/** NDJSON training payload suitable for most fine-tuning frameworks. */
|
|
3980
|
+
trainingNdjson: string;
|
|
3981
|
+
}
|
|
3982
|
+
declare function exportRewardModel(store: TraceStore, grader: PrmGrader, runIds: string[]): Promise<ExportedRewardModel>;
|
|
3983
|
+
/**
|
|
3984
|
+
* Zero-deps inference scorer — apply a grader to a trajectory and return
|
|
3985
|
+
* its aggregate score. This is the "reward model" customers embed when
|
|
3986
|
+
* they don't want (or can't) fine-tune one. Deterministic + portable.
|
|
3987
|
+
*/
|
|
3988
|
+
interface InferenceScorer {
|
|
3989
|
+
/** Score a completed trajectory. Higher is better. */
|
|
3990
|
+
score(trajectory: Trajectory, store: TraceStore): Promise<number>;
|
|
3991
|
+
metadata: {
|
|
3992
|
+
rubrics: string[];
|
|
3993
|
+
deterministic: true;
|
|
3994
|
+
};
|
|
3995
|
+
}
|
|
3996
|
+
declare function loadScorerFromGrader(grader: PrmGrader): InferenceScorer;
|
|
3997
|
+
/**
|
|
3998
|
+
* Replay a trace corpus through a scorer — produces the canonical
|
|
3999
|
+
* "what would this reward model have said about every run?" table.
|
|
4000
|
+
* Callers use this to validate a trained model against the training
|
|
4001
|
+
* corpus (expect high agreement; drift indicates overfitting).
|
|
4002
|
+
*/
|
|
4003
|
+
declare function replayScorerOverCorpus(store: TraceStore, scorer: InferenceScorer, runIds: string[]): Promise<Array<{
|
|
4004
|
+
runId: string;
|
|
4005
|
+
score: number;
|
|
4006
|
+
outcomeScore: number | null;
|
|
4007
|
+
}>>;
|
|
4008
|
+
|
|
4009
|
+
/**
|
|
4010
|
+
* Governance reporting — shared types.
|
|
4011
|
+
*
|
|
4012
|
+
* The framework collects a `GovernanceContext` (traces + outcomes +
|
|
4013
|
+
* dataset manifests + red-team results + judge calibration) and each
|
|
4014
|
+
* specific template (NIST AI RMF, SOC2, EU AI Act) renders a
|
|
4015
|
+
* structured report from it.
|
|
4016
|
+
*
|
|
4017
|
+
* Reports are machine-readable JSON first; human-readable Markdown is a
|
|
4018
|
+
* pure transform on top. External auditors consume the Markdown; CI
|
|
4019
|
+
* consumes the JSON.
|
|
4020
|
+
*/
|
|
4021
|
+
|
|
4022
|
+
interface GovernanceContext {
|
|
4023
|
+
/** Legal / org identity for the report. */
|
|
4024
|
+
organization: string;
|
|
4025
|
+
/** System / agent identifier. */
|
|
4026
|
+
systemName: string;
|
|
4027
|
+
/** ISO8601 period the report covers. */
|
|
4028
|
+
periodStart: string;
|
|
4029
|
+
periodEnd: string;
|
|
4030
|
+
/** Versioned dataset manifests used during the period. */
|
|
4031
|
+
datasets: DatasetManifest[];
|
|
4032
|
+
traceStore: TraceStore;
|
|
4033
|
+
outcomeStore?: OutcomeStore;
|
|
4034
|
+
/** Cached red-team results for the period, if available. */
|
|
4035
|
+
redTeam?: RedTeamReport;
|
|
4036
|
+
/** Judge-vs-human calibration results, if measured. */
|
|
4037
|
+
judgeCalibration?: CalibrationResult[];
|
|
4038
|
+
/** Responsible owner for the system — role + name + email. */
|
|
4039
|
+
owner: {
|
|
4040
|
+
role: string;
|
|
4041
|
+
name: string;
|
|
4042
|
+
email: string;
|
|
4043
|
+
};
|
|
4044
|
+
}
|
|
4045
|
+
interface GovernanceFinding {
|
|
4046
|
+
id: string;
|
|
4047
|
+
severity: 'info' | 'low' | 'medium' | 'high' | 'critical';
|
|
4048
|
+
/** Control reference the finding maps to (e.g. "NIST-AI-RMF:MEASURE-2.1"). */
|
|
4049
|
+
control: string;
|
|
4050
|
+
summary: string;
|
|
4051
|
+
evidence?: string;
|
|
4052
|
+
remediation?: string;
|
|
4053
|
+
}
|
|
4054
|
+
interface GovernanceReport {
|
|
4055
|
+
framework: 'NIST-AI-RMF' | 'SOC2' | 'EU-AI-ACT';
|
|
4056
|
+
version: string;
|
|
4057
|
+
context: Pick<GovernanceContext, 'organization' | 'systemName' | 'periodStart' | 'periodEnd' | 'owner'>;
|
|
4058
|
+
summary: {
|
|
4059
|
+
findings: number;
|
|
4060
|
+
/** Count of findings per severity level. NOTE(review): the key is spelled `byeverity` — presumably a typo for `bySeverity`; confirm against the runtime object `summarize()` returns before renaming, since a rename changes the public interface. */
byeverity: Record<GovernanceFinding['severity'], number>;
|
|
4061
|
+
overall: 'compliant' | 'compliant-with-findings' | 'non-compliant';
|
|
4062
|
+
};
|
|
4063
|
+
findings: GovernanceFinding[];
|
|
4064
|
+
/** Framework-specific structured payload (mapped controls, risk class, etc.). */
|
|
4065
|
+
payload: Record<string, unknown>;
|
|
4066
|
+
generatedAt: string;
|
|
4067
|
+
}
|
|
4068
|
+
declare function renderMarkdown(report: GovernanceReport): string;
|
|
4069
|
+
declare function summarize(findings: GovernanceFinding[]): GovernanceReport['summary'];
|
|
4070
|
+
|
|
4071
|
+
/**
|
|
4072
|
+
* NIST AI RMF 1.0 — Govern / Map / Measure / Manage mapping.
|
|
4073
|
+
*
|
|
4074
|
+
* Each subcategory derives its status from concrete framework state:
|
|
4075
|
+
* MEASURE 2.x: do we have a calibration regime? contamination controls?
|
|
4076
|
+
* MEASURE 2.7: are red-team results available?
|
|
4077
|
+
* MANAGE 1.x: are outcome metrics captured? correlation measured?
|
|
4078
|
+
* GOVERN 1.x: dataset + prompt provenance recorded?
|
|
4079
|
+
*
|
|
4080
|
+
* We ship the mapping and the derivation rules; consumers supply the
|
|
4081
|
+
* GovernanceContext.
|
|
4082
|
+
*/
|
|
4083
|
+
|
|
4084
|
+
declare function nistAiRmfReport(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
4085
|
+
|
|
4086
|
+
/**
|
|
4087
|
+
* SOC 2 — Common Criteria 7 (system operations + change management)
|
|
4088
|
+
* audit trail derived from the trace corpus.
|
|
4089
|
+
*
|
|
4090
|
+
* This is NOT a formal SOC2 report — that requires an external
|
|
4091
|
+
* auditor. What we ship is the machine-readable *evidence* package
|
|
4092
|
+
* that an auditor consumes: run counts, deploy events, access log
|
|
4093
|
+
* summary, anomaly tracking, response-time SLOs.
|
|
4094
|
+
*/
|
|
4095
|
+
|
|
4096
|
+
declare function soc2Report(ctx: GovernanceContext): Promise<GovernanceReport>;
|
|
4097
|
+
|
|
4098
|
+
/**
|
|
4099
|
+
* EU AI Act — risk-class classification + compliance checklist.
|
|
4100
|
+
*
|
|
4101
|
+
* Classification is declarative: caller supplies the domain/use-case
|
|
4102
|
+
* signals (biometric? critical infrastructure? education? employment?
|
|
4103
|
+
* access to services?) and we map to the Act's risk tiers:
|
|
4104
|
+
* - "unacceptable" (prohibited)
|
|
4105
|
+
* - "high" (Annex III — strict obligations)
|
|
4106
|
+
* - "limited" (transparency obligations)
|
|
4107
|
+
* - "minimal" (voluntary codes of conduct)
|
|
4108
|
+
*
|
|
4109
|
+
* Then the compliance checklist enumerates Article 9 (risk mgmt),
|
|
4110
|
+
* 10 (data + data governance), 11 (technical documentation), 13
|
|
4111
|
+
* (transparency), 14 (human oversight), 15 (accuracy + robustness)
|
|
4112
|
+
* requirements and flags gaps.
|
|
4113
|
+
*/
|
|
4114
|
+
|
|
4115
|
+
type EuRiskClass = 'unacceptable' | 'high' | 'limited' | 'minimal';
|
|
4116
|
+
interface UseCaseSignals {
|
|
4117
|
+
/** Used for biometric identification in public spaces? (Art. 5 — unacceptable). */
|
|
4118
|
+
biometricPublic?: boolean;
|
|
4119
|
+
/** Social scoring by public authorities? (Art. 5). */
|
|
4120
|
+
socialScoring?: boolean;
|
|
4121
|
+
/** Subliminal manipulation? (Art. 5). */
|
|
4122
|
+
subliminal?: boolean;
|
|
4123
|
+
/** Annex III sector: critical infrastructure / education / employment /
|
|
4124
|
+
* access to essential services / law enforcement / migration /
|
|
4125
|
+
* administration of justice / democratic processes? */
|
|
4126
|
+
annexIII?: boolean;
|
|
4127
|
+
/** Interacts directly with natural persons (chatbot, agent)? — limited risk. */
|
|
4128
|
+
chatbot?: boolean;
|
|
4129
|
+
/** Generates synthetic media (image/audio/video/text deepfakes)? — limited risk. */
|
|
4130
|
+
generatesSyntheticMedia?: boolean;
|
|
4131
|
+
}
|
|
4132
|
+
declare function classifyEuAiRisk(signals: UseCaseSignals): EuRiskClass;
|
|
4133
|
+
declare function euAiActReport(ctx: GovernanceContext, signals: UseCaseSignals): Promise<GovernanceReport>;
|
|
4134
|
+
|
|
4135
|
+
/**
 * Aggregated public exports of this declaration file.
 *
 * Specifiers carrying the `type` modifier are exported as types only; all
 * other specifiers are exported without that modifier. Three bindings are
 * re-exported under a different public name:
 *   - `Artifact$1`    -> `ArtifactCheckArtifact`
 *   - `DEFAULT_RULES` -> `DEFAULT_FAILURE_RULES`
 *   - `Run$1`         -> `ExperimentRun`
 * NOTE(review): the `$1` suffixes look like bundler-generated names used to
 * avoid collisions with `Artifact` / `Run` — confirm against the build setup.
 */
export {
  // --- Names starting with an uppercase letter ---
  type ActiveLearningOptions, AgentDriver, type AgentDriverConfig, type AlignmentOp,
  type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact,
  type ArtifactCheck, type Artifact$1 as ArtifactCheckArtifact, type ArtifactResult,
  type ArtifactValidator,

  type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkReport,
  BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions,
  type BisectResult, type BisectStep, BudgetBreachError, type BudgetBreachFinding,
  type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec,
  BuilderSession, type BuilderSessionInit,

  type CalibrationBin, type CalibrationOptions, type CalibrationReport,
  type CalibrationResult, CallExpectation, type CanaryLeak, type CandidateScenario,
  type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult,
  type CollectedArtifacts, type CompletionCriterion, type ContinuityCheck,
  type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair,
  type ContractMetric, type ContractReport, ConvergenceTracker, type CorrelationReport,
  type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult,
  type CostEntry, type CostSummary, CostTracker, type CounterfactualContext,
  type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner,
  type CrossTraceDiff, type CrossTraceDiffOptions,

  DEFAULT_AGENT_SLOS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_MUTATORS,
  DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, Dataset, type DatasetDifficulty,
  type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit,
  type DeploymentOutcome, type Direction, type DivergenceOptions, type DivergenceReport,
  DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench,
  type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound,
  type DualAgentScenario, type DualAgentScenarioResult,

  type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter,
  type EventKind, type EvolutionRound, type ExecutorConfig, type Expectation,
  type Experiment, type Run$1 as ExperimentRun, type ExperimentStore, ExperimentTracker,
  type ExportedRewardModel,

  FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass,
  type FailureClassification, type FailureCluster, type FailureClusterReport,
  type FailureContext, type FailureRule, type FeedbackPattern, FileSystemOutcomeStore,
  type FileSystemOutcomeStoreOptions, FileSystemTraceStore,
  type FileSystemTraceStoreOptions,

  type GenericSpan, type GoldenItem, type GovernanceContext, type GovernanceFinding,
  type GovernanceReport, type GradedStep,

  type HarnessConfig, HoldoutAuditor, HoldoutLockedError, type HypothesisManifest,
  type HypothesisResult,

  type ImageData, InMemoryExperimentStore, InMemoryOutcomeStore, InMemoryTraceStore,
  InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext,
  type InteractionContribution,

  type JudgeAgreementReport, type JudgeConfig, type JudgeFn, type JudgeInput,
  type JudgePair, type JudgeReplayResult, type JudgeRubric, type JudgeScore,
  type JudgeSpan,

  type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore,
  type LayerCorrelation, type LlmSpan,

  MODEL_PRICING, type MatcherResult, type Message, type MetricSamples,
  type MetricVerdict, MetricsCollector, type Mutator,

  OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig,
  type OptimizationResult, type Oracle, type OracleObservation, type OracleReport,
  type OracleResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan,
  type OutcomeFilter, type OutcomePair, type OutcomeStore,

  type PairwiseComparison, type ParetoResult, type PersonaConfig,
  type PositionalBiasResult, type PrmGradedTrace, PrmGrader, type PrmTrainingSample,
  ProductClient, type ProductClientConfig, ProjectRegistry, type ProjectSummary,
  type ProjectTimelineEntry, type PromptHandle, PromptOptimizer, PromptRegistry,
  type PromptVariant,

  REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding,
  type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule,
  type RegressionOptions, type RegressionSpec, type RetrievalSpan,
  type RobustnessResult, type RouteMap, type RubricDimension, type Run,
  type RunAppScenarioOptions, type RunConfig, type RunDiff, type RunFilter,
  type RunLayer, type RunOutcome, type RunStatus,

  type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxResult,
  type SandboxSpan, type Scenario, type ScenarioCost, type ScenarioFile,
  ScenarioRegistry, type ScenarioResult, type ScoredTarget, type SelfPlayOptions,
  type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult,
  type SeriesConvergenceOptions, type SeriesConvergenceResult, type ShipOptions,
  type SignedManifest, type SliceOptions, type Slo, type SloCheckResult,
  type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type Span,
  type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus,
  type StepAttribution, type StepContext, type StepRubric, type StuckLoopFinding,
  type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver,
  type SynthesisReason, type SynthesisTarget,

  TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult,
  type TestGradedScenario, type TestOutputParser, type TestResult,
  type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec,
  type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions,
  type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter,
  type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory,
  type TrajectoryStep, type Turn, type TurnMetrics, type TurnResult,

  type UseCaseSignals,

  type ValidationContext, type ValidationIssue, type ValidationResult,
  type VariantScore, type VerbosityBiasResult, type VisualDiffOptions,
  type VisualDiffResult,

  type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector,
  type WorkspaceSnapshot,

  // --- Names starting with a lowercase letter (none carry the `type` modifier) ---
  adversarialJudge, aggregateLlm, analyzeAntiSlop, analyzeSeries, argHash,
  attributeCounterfactuals,

  benjaminiHochberg, bisect, bonferroni, budgetBreachView, buildTrajectory,
  byteLengthRange,

  calibrateJudge, calibrationCurve, canaryLeakView, causalAttribution, checkCanaries,
  checkSlos, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD,
  coherenceJudge, collectionPreserved, commitBisect, compareToBaseline, composeParsers,
  composeValidators, computeToolUseMetrics, confidenceInterval, containsAll,
  correlateLayers, correlationStudy, createAntiSlopJudge, createCustomJudge,
  createDomainExpertJudge, crossTraceDiff,

  defaultJudges, dominates,

  estimateCost, estimateTokens, euAiActReport, evaluateContract, evaluateHypothesis,
  evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp,
  exportTrainingData,

  failureClusterView, fileContains, fileExists, firstDivergenceView,
  formatBenchmarkReport, formatDriverReport,

  groupBy,

  hashContent, hashScenarios,

  interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan,
  isSandboxSpan, isToolSpan,

  jestTestParser, jsonHasKeys, jsonShape, judgeAgreementView, judgeSpans,

  keyPreserved,

  llmSpanFromProvider, llmSpans, loadScorerFromGrader, lowercaseMutator,

  mannWhitneyU,

  nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked,

  outputLengthRubric,

  pairedTTest, paraphraseRobustness, paretoFrontier, partialCredit, pixelDeltaRatio,
  politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN,
  prmEnsembleBestOfN, promptBisect, proposeSynthesisTargets, pytestTestParser,

  redTeamDataset, redTeamReport, redactString, redactValue, regexMatch, regexMatches,
  regressionView, renderMarkdown, renderMarkdownReport, replayScorerOverCorpus,
  replayTraceThroughJudge, requiredSampleSize, resumeBuilderSession, rowCount, rowWhere,
  runAssertions, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass,
  runSelfPlay, runTestGradedScenario, runsForScenario,

  scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, selfPreference,
  sentenceReorderMutator, signManifest, soc2Report, statusAdvanced, stuckLoopView,
  summarize,

  textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText,
  toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans,
  toolSuccessRubric, toolWasteView, typoMutator,

  urlContains,

  verbosityBias, verifyManifest, visualDiff, vitestTestParser,

  weightedMean, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank
};
|