@tangle-network/agent-eval 0.18.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1151,199 +1151,6 @@ declare class Dataset {
1151
1151
  }
1152
1152
  declare function hashScenarios(scenarios: DatasetScenario[]): Promise<string>;
1153
1153
 
1154
- /**
1155
- * Prompt optimizer — A/B test prompt variants with statistical rigor.
1156
- *
1157
- * Runs N prompt variants against a fixed scenario set, collects per-scenario
1158
- * scores via the user-provided `scoreVariant` callback, and returns:
1159
- * - per-variant mean + bootstrap CI
1160
- * - pairwise significance (Mann-Whitney, non-parametric — works on any
1161
- * score distribution, not just normal)
1162
- * - a winner (highest mean, flagged if the lead is not significant)
1163
- *
1164
- * Deliberately generic — the `scoreVariant` callback does whatever domain
1165
- * work the consumer needs (invoke the agent, judge the output, whatever),
1166
- * and returns a number per scenario. This lets the optimizer stay small +
1167
- * testable.
1168
- */
1169
- interface PromptVariant$1 {
1170
- id: string;
1171
- prompt: string;
1172
- metadata?: Record<string, unknown>;
1173
- }
1174
- interface OptimizationConfig {
1175
- variants: PromptVariant$1[];
1176
- /** How many trials per (variant, scenario) — controls CI tightness. Default 3. */
1177
- trialsPerScenario?: number;
1178
- /** Significance threshold for pairwise comparison (default 0.05). */
1179
- significanceLevel?: number;
1180
- /**
1181
- * The scoring callback. For each (variant, scenarioId, trialIndex), produce
1182
- * a score in 0..1 (or any numeric range — the optimizer only cares about
1183
- * monotonicity).
1184
- */
1185
- scoreVariant: (args: {
1186
- variant: PromptVariant$1;
1187
- scenarioId: string;
1188
- trialIndex: number;
1189
- }) => Promise<number>;
1190
- /** Scenario ids to run against. */
1191
- scenarioIds: string[];
1192
- /** Optional hook — fires after each (variant, scenario) fully scored. */
1193
- onScenarioComplete?: (info: {
1194
- variantId: string;
1195
- scenarioId: string;
1196
- scores: number[];
1197
- }) => void;
1198
- }
1199
- interface VariantScore {
1200
- variantId: string;
1201
- mean: number;
1202
- ci95: {
1203
- lower: number;
1204
- upper: number;
1205
- };
1206
- n: number;
1207
- perScenario: Record<string, {
1208
- mean: number;
1209
- n: number;
1210
- samples: number[];
1211
- }>;
1212
- }
1213
- interface PairwiseComparison {
1214
- variantA: string;
1215
- variantB: string;
1216
- pValue: number;
1217
- /** BH-FDR-corrected q-value across all n*(n-1)/2 pairwise tests. */
1218
- qValue: number;
1219
- /** True when q-value passes the FDR threshold. Prefer over raw p-value when variants > 2. */
1220
- significant: boolean;
1221
- meanDelta: number;
1222
- }
1223
- interface OptimizationResult {
1224
- winner: {
1225
- variantId: string;
1226
- /** True when the winner's lead vs every other variant is statistically significant. */
1227
- significant: boolean;
1228
- ciLowerBoundExceedsSecondMean: boolean;
1229
- };
1230
- scores: VariantScore[];
1231
- pairwise: PairwiseComparison[];
1232
- config: {
1233
- trialsPerScenario: number;
1234
- significanceLevel: number;
1235
- variants: string[];
1236
- scenarios: string[];
1237
- };
1238
- }
1239
- declare class PromptOptimizer {
1240
- run(config: OptimizationConfig): Promise<OptimizationResult>;
1241
- }
1242
-
1243
- interface RunScore {
1244
- success: number;
1245
- goalProgress: number;
1246
- repoGroundedness: number;
1247
- driftPenalty: number;
1248
- toolUseQuality: number;
1249
- patchQuality: number;
1250
- testReality: number;
1251
- finalGate: number;
1252
- reviewerBlockers: number;
1253
- costUsd: number;
1254
- wallSeconds: number;
1255
- notes?: string[];
1256
- }
1257
- interface RunScoreWeights {
1258
- success: number;
1259
- goalProgress: number;
1260
- repoGroundedness: number;
1261
- driftPenalty: number;
1262
- toolUseQuality: number;
1263
- patchQuality: number;
1264
- testReality: number;
1265
- finalGate: number;
1266
- reviewerBlockers: number;
1267
- costUsd: number;
1268
- wallSeconds: number;
1269
- }
1270
- declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
1271
- declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
1272
- declare function clamp01(value: number): number;
1273
-
1274
- interface SteeringRolePrompt {
1275
- system?: string;
1276
- append?: string;
1277
- }
1278
- interface SteeringBundle {
1279
- id: string;
1280
- coderPrompt?: string;
1281
- continuePrompt?: string;
1282
- reviewerPrompts?: Record<string, string>;
1283
- skills?: string[];
1284
- rolePrompts?: Record<string, SteeringRolePrompt>;
1285
- metadata?: Record<string, unknown>;
1286
- }
1287
- interface SteeringDelta {
1288
- coderPrompt?: string;
1289
- continuePrompt?: string;
1290
- reviewerPrompts?: Record<string, string>;
1291
- skills?: string[];
1292
- rolePrompts?: Record<string, SteeringRolePrompt>;
1293
- metadata?: Record<string, unknown>;
1294
- }
1295
- declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
1296
- declare function renderSteeringText(bundle: SteeringBundle): string;
1297
-
1298
- interface OptimizationExample {
1299
- scenarioId: string;
1300
- metadata?: Record<string, unknown>;
1301
- }
1302
- interface SteeringEvaluation {
1303
- variant: SteeringBundle;
1304
- example: OptimizationExample;
1305
- trialIndex: number;
1306
- }
1307
- interface SteeringVariantReport {
1308
- variantId: string;
1309
- bundle: SteeringBundle;
1310
- mean: number;
1311
- ci95: {
1312
- lower: number;
1313
- upper: number;
1314
- };
1315
- scenarioScores: Record<string, {
1316
- mean: number;
1317
- n: number;
1318
- samples: number[];
1319
- }>;
1320
- }
1321
- interface OptimizationLoopResult {
1322
- winner: SteeringBundle;
1323
- significant: boolean;
1324
- reports: SteeringVariantReport[];
1325
- pairwise: Array<{
1326
- variantA: string;
1327
- variantB: string;
1328
- pValue: number;
1329
- qValue: number;
1330
- significant: boolean;
1331
- meanDelta: number;
1332
- }>;
1333
- }
1334
- interface OptimizationLoopConfig {
1335
- variants: SteeringBundle[];
1336
- examples: OptimizationExample[];
1337
- evaluate: (args: SteeringEvaluation) => Promise<RunScore>;
1338
- scoreWeights?: Partial<RunScoreWeights>;
1339
- trialsPerScenario?: number;
1340
- }
1341
- declare class OptimizationLoop {
1342
- private readonly optimizer;
1343
- constructor(optimizer?: PromptOptimizer);
1344
- run(config: OptimizationLoopConfig): Promise<OptimizationLoopResult>;
1345
- }
1346
-
1347
1154
  type FeedbackArtifactType = 'text' | 'code' | 'plan' | 'research' | 'action' | 'ui' | 'decision' | 'data' | 'other';
1348
1155
  type FeedbackLabelSource = 'user' | 'judge' | 'environment' | 'metric' | 'policy' | 'system';
1349
1156
  type FeedbackLabelKind = 'approve' | 'reject' | 'select' | 'edit' | 'rank' | 'rate' | 'comment' | 'metric_outcome' | 'policy_block' | 'revision_request';
@@ -1432,10 +1239,12 @@ interface PreferenceMemoryEntry {
1432
1239
  sourceLabelId?: string;
1433
1240
  category?: string;
1434
1241
  }
1435
- interface FeedbackOptimizerRow extends OptimizationExample {
1242
+ interface FeedbackOptimizerRow {
1243
+ scenarioId: string;
1436
1244
  trajectoryId: string;
1437
1245
  labelKinds: FeedbackLabelKind[];
1438
1246
  score?: number;
1247
+ metadata?: Record<string, unknown>;
1439
1248
  }
1440
1249
  interface FeedbackReplayResult {
1441
1250
  trajectoryId: string;
@@ -2070,6 +1879,61 @@ declare class D1ExperimentStore implements ExperimentStore {
2070
1879
  listRuns(experimentId: string): Promise<Run[]>;
2071
1880
  }
2072
1881
 
1882
+ interface SteeringRolePrompt {
1883
+ system?: string;
1884
+ append?: string;
1885
+ }
1886
+ interface SteeringBundle {
1887
+ id: string;
1888
+ coderPrompt?: string;
1889
+ continuePrompt?: string;
1890
+ reviewerPrompts?: Record<string, string>;
1891
+ skills?: string[];
1892
+ rolePrompts?: Record<string, SteeringRolePrompt>;
1893
+ metadata?: Record<string, unknown>;
1894
+ }
1895
+ interface SteeringDelta {
1896
+ coderPrompt?: string;
1897
+ continuePrompt?: string;
1898
+ reviewerPrompts?: Record<string, string>;
1899
+ skills?: string[];
1900
+ rolePrompts?: Record<string, SteeringRolePrompt>;
1901
+ metadata?: Record<string, unknown>;
1902
+ }
1903
+ declare function mergeSteeringBundle(base: SteeringBundle, delta: SteeringDelta): SteeringBundle;
1904
+ declare function renderSteeringText(bundle: SteeringBundle): string;
1905
+
1906
+ interface RunScore {
1907
+ success: number;
1908
+ goalProgress: number;
1909
+ repoGroundedness: number;
1910
+ driftPenalty: number;
1911
+ toolUseQuality: number;
1912
+ patchQuality: number;
1913
+ testReality: number;
1914
+ finalGate: number;
1915
+ reviewerBlockers: number;
1916
+ costUsd: number;
1917
+ wallSeconds: number;
1918
+ notes?: string[];
1919
+ }
1920
+ interface RunScoreWeights {
1921
+ success: number;
1922
+ goalProgress: number;
1923
+ repoGroundedness: number;
1924
+ driftPenalty: number;
1925
+ toolUseQuality: number;
1926
+ patchQuality: number;
1927
+ testReality: number;
1928
+ finalGate: number;
1929
+ reviewerBlockers: number;
1930
+ costUsd: number;
1931
+ wallSeconds: number;
1932
+ }
1933
+ declare const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights;
1934
+ declare function aggregateRunScore(score: RunScore, weights?: Partial<RunScoreWeights>): number;
1935
+ declare function clamp01(value: number): number;
1936
+
2073
1937
  /**
2074
1938
  * Typed query helpers over TraceStore.
2075
1939
  *
@@ -3809,9 +3673,9 @@ declare function toolNamesForRun(store: TraceStore, runId: string): Promise<stri
3809
3673
  * returns the N per arm needed to detect a given effect size.
3810
3674
  * 2. After running: `benjaminiHochberg(pValues, fdr)` and
3811
3675
  * `bonferroni(pValues, alpha)` correct for multiple pairwise tests
3812
- * so PromptOptimizer's "significant" flag is statistically honest.
3676
+ * so pairwise variant comparisons stay statistically honest.
3813
3677
  *
3814
- * Fixes the correctness bug in 0.2's PromptOptimizer which applied
3678
+ * Fixes the correctness bug in 0.2's pairwise optimizer which applied
3815
3679
  * alpha directly across n*(n-1)/2 pairwise tests without correction —
3816
3680
  * dramatically inflating false-positive rate when variants ≥ 3.
3817
3681
  */
@@ -7505,7 +7369,7 @@ declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceRepla
7505
7369
  * 4. Repeat for N generations OR until convergence.
7506
7370
  *
7507
7371
  * Domain-agnostic. Consumers supply:
7508
- * - A seed population of `PromptVariant`s.
7372
+ * - A seed population of `EvolvableVariant`s.
7509
7373
  * - A `ScoreAdapter` that runs (variant, scenario, rep) → `TrialResult`.
7510
7374
  * - A `MutateAdapter` that produces children given trace evidence.
7511
7375
  * - Pareto `Objective<TrialAggregate>[]` defining the multi-objective vector.
@@ -7517,7 +7381,7 @@ declare function referenceReplayScenarioToRunScore(scenarioScore: ReferenceRepla
7517
7381
  * mutation primitives, persisting to disk. Those are the consumer's call.
7518
7382
  */
7519
7383
 
7520
- interface PromptVariant<P = unknown> {
7384
+ interface EvolvableVariant<P = unknown> {
7521
7385
  /** Stable id for the variant — surfaces in reports and trial results. */
7522
7386
  id: string;
7523
7387
  /** Variant payload — interpretation is the consumer's responsibility. */
@@ -7571,26 +7435,26 @@ interface VariantAggregate {
7571
7435
  }
7572
7436
  interface ScoreAdapter<P = unknown> {
7573
7437
  score(args: {
7574
- variant: PromptVariant<P>;
7438
+ variant: EvolvableVariant<P>;
7575
7439
  scenarioId: string;
7576
7440
  rep: number;
7577
7441
  }): Promise<TrialResult>;
7578
7442
  }
7579
7443
  interface MutateAdapter<P = unknown> {
7580
7444
  mutate(args: {
7581
- parent: PromptVariant<P>;
7445
+ parent: EvolvableVariant<P>;
7582
7446
  parentAggregate: VariantAggregate;
7583
7447
  topTrials: TrialResult[];
7584
7448
  bottomTrials: TrialResult[];
7585
7449
  childCount: number;
7586
7450
  generation: number;
7587
- }): Promise<PromptVariant<P>[]>;
7451
+ }): Promise<EvolvableVariant<P>[]>;
7588
7452
  }
7589
7453
  interface PromptEvolutionConfig<P = unknown> {
7590
7454
  runId: string;
7591
7455
  /** What component is being mutated — surfaces in reports + reflection prompts. */
7592
7456
  target: string;
7593
- seedVariants: PromptVariant<P>[];
7457
+ seedVariants: EvolvableVariant<P>[];
7594
7458
  scenarioIds: string[];
7595
7459
  reps: number;
7596
7460
  generations: number;
@@ -7649,7 +7513,7 @@ interface GenerationReport<P = unknown> {
7649
7513
  runId: string;
7650
7514
  target: string;
7651
7515
  generation: number;
7652
- variants: PromptVariant<P>[];
7516
+ variants: EvolvableVariant<P>[];
7653
7517
  aggregates: VariantAggregate[];
7654
7518
  /** Frontier candidates, sorted by descending crowding distance. */
7655
7519
  paretoFrontIds: string[];
@@ -7663,7 +7527,7 @@ interface PromptEvolutionResult<P = unknown> {
7663
7527
  target: string;
7664
7528
  generations: GenerationReport<P>[];
7665
7529
  /** Best variant by scalar score in the final generation. */
7666
- bestVariant: PromptVariant<P>;
7530
+ bestVariant: EvolvableVariant<P>;
7667
7531
  /** Best aggregate (matches bestVariant). */
7668
7532
  bestAggregate: VariantAggregate;
7669
7533
  }
@@ -7754,7 +7618,7 @@ declare function parseReflectionResponse(raw: string, maxProposals?: number): Re
7754
7618
 
7755
7619
  type MultiShotSplit = 'search' | 'dev' | 'holdout';
7756
7620
  type AsiSeverity = 'info' | 'warning' | 'error' | 'critical';
7757
- type MultiShotVariant<P = unknown> = PromptVariant<P>;
7621
+ type MultiShotVariant<P = unknown> = EvolvableVariant<P>;
7758
7622
  interface ActionableSideInfo {
7759
7623
  /** Stable expectation/check id when available. */
7760
7624
  expectationId?: string;
@@ -7794,7 +7658,7 @@ interface MultiShotRun {
7794
7658
  metadata?: Record<string, unknown>;
7795
7659
  }
7796
7660
  interface MultiShotRunInput<P = unknown> {
7797
- variant: PromptVariant<P>;
7661
+ variant: EvolvableVariant<P>;
7798
7662
  scenarioId: string;
7799
7663
  rep: number;
7800
7664
  split: MultiShotSplit;
@@ -7832,13 +7696,13 @@ interface MultiShotTrialResult extends TrialResult {
7832
7696
  }
7833
7697
  interface MultiShotMutateAdapter<P = unknown> {
7834
7698
  mutate(args: {
7835
- parent: PromptVariant<P>;
7699
+ parent: EvolvableVariant<P>;
7836
7700
  parentAggregate: VariantAggregate;
7837
7701
  topTrials: MultiShotTrialResult[];
7838
7702
  bottomTrials: MultiShotTrialResult[];
7839
7703
  childCount: number;
7840
7704
  generation: number;
7841
- }): Promise<PromptVariant<P>[]>;
7705
+ }): Promise<EvolvableVariant<P>[]>;
7842
7706
  }
7843
7707
  interface MultiShotGateConfig<P = unknown> {
7844
7708
  /** Search rows are optional, but enable HeldOutGate's overfit-gap check. */
@@ -7848,7 +7712,7 @@ interface MultiShotGateConfig<P = unknown> {
7848
7712
  gate: HeldOutGateConfig;
7849
7713
  /** Convert scored trajectory runs into paper-grade RunRecords. */
7850
7714
  toRunRecord(input: {
7851
- variant: PromptVariant<P>;
7715
+ variant: EvolvableVariant<P>;
7852
7716
  scenarioId: string;
7853
7717
  rep: number;
7854
7718
  split: RunSplitTag;
@@ -7859,7 +7723,7 @@ interface MultiShotGateConfig<P = unknown> {
7859
7723
  interface MultiShotOptimizationConfig<P = unknown> {
7860
7724
  runId: string;
7861
7725
  target: string;
7862
- seedVariants: PromptVariant<P>[];
7726
+ seedVariants: EvolvableVariant<P>[];
7863
7727
  searchScenarioIds: string[];
7864
7728
  reps: number;
7865
7729
  generations: number;
@@ -7884,10 +7748,10 @@ interface MultiShotGateResult {
7884
7748
  interface MultiShotOptimizationResult<P = unknown> {
7885
7749
  evolution: PromptEvolutionResult<P>;
7886
7750
  /** Best candidate on the optimizer-visible search split. */
7887
- searchBestVariant: PromptVariant<P>;
7751
+ searchBestVariant: EvolvableVariant<P>;
7888
7752
  searchBestAggregate: VariantAggregate;
7889
7753
  /** Variant callers should actually ship after optional holdout gating. */
7890
- promotedVariant: PromptVariant<P>;
7754
+ promotedVariant: EvolvableVariant<P>;
7891
7755
  promotedAggregate: VariantAggregate;
7892
7756
  /** Null when no gate was configured or the search-best candidate was the baseline. */
7893
7757
  gate: MultiShotGateResult | null;
@@ -7896,6 +7760,114 @@ declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig
7896
7760
  declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
7897
7761
  declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
7898
7762
 
7763
+ /**
7764
+ * Release confidence gate.
7765
+ *
7766
+ * This is the production-facing composition layer over the lower-level
7767
+ * primitives:
7768
+ * - Dataset manifests prove corpus/version coverage.
7769
+ * - RunRecord rows prove reproducible search/holdout outcomes.
7770
+ * - Multi-shot trace evidence carries turn counts and ASI diagnostics.
7771
+ * - HeldOutGate decisions remain the paired promotion authority.
7772
+ *
7773
+ * The gate is intentionally pure and conservative. Missing declared evidence
7774
+ * fails closed instead of being treated as a neutral zero.
7775
+ */
7776
+
7777
+ type ReleaseConfidenceStatus = 'pass' | 'warn' | 'fail';
7778
+ type ReleaseConfidenceAxisName = 'corpus' | 'quality' | 'generalization' | 'diagnostics' | 'efficiency';
7779
+ interface ReleaseTraceEvidence {
7780
+ scenarioId: string;
7781
+ candidateId?: string;
7782
+ split?: RunSplitTag;
7783
+ score?: number;
7784
+ ok?: boolean;
7785
+ turnCount?: number;
7786
+ costUsd?: number;
7787
+ durationMs?: number;
7788
+ failureMode?: string;
7789
+ asi?: ActionableSideInfo[];
7790
+ metadata?: Record<string, unknown>;
7791
+ }
7792
+ interface ReleaseConfidenceThresholds {
7793
+ /** Require a Dataset manifest or explicit scenarios. Default true. */
7794
+ requireCorpus?: boolean;
7795
+ minScenarioCount?: number;
7796
+ minSearchRuns?: number;
7797
+ minHoldoutRuns?: number;
7798
+ /** Require at least one holdout scenario/run. Default true. */
7799
+ requireHoldout?: boolean;
7800
+ minPassRate?: number;
7801
+ minMeanScore?: number;
7802
+ /** Search mean may exceed holdout mean by at most this much. */
7803
+ maxOverfitGap?: number;
7804
+ maxMeanCostUsd?: number;
7805
+ maxP95WallMs?: number;
7806
+ /** Low-score/failed rows must carry ASI. Default true. */
7807
+ requireAsiForFailures?: boolean;
7808
+ /** Score below this is considered a failure for ASI coverage. Default 0.5. */
7809
+ failureScoreThreshold?: number;
7810
+ }
7811
+ interface ReleaseConfidenceInput {
7812
+ target: string;
7813
+ candidateId?: string;
7814
+ baselineId?: string;
7815
+ dataset?: DatasetManifest;
7816
+ scenarios?: readonly DatasetScenario[];
7817
+ runs?: readonly RunRecord[];
7818
+ traces?: readonly ReleaseTraceEvidence[];
7819
+ gateDecision?: GateDecision | null;
7820
+ thresholds?: ReleaseConfidenceThresholds;
7821
+ }
7822
+ interface ReleaseConfidenceAxis {
7823
+ name: ReleaseConfidenceAxisName;
7824
+ status: ReleaseConfidenceStatus;
7825
+ score: number;
7826
+ detail: string;
7827
+ }
7828
+ interface ReleaseConfidenceIssue {
7829
+ axis: ReleaseConfidenceAxisName;
7830
+ severity: 'critical' | 'warning';
7831
+ code: string;
7832
+ detail: string;
7833
+ }
7834
+ interface ReleaseConfidenceMetrics {
7835
+ scenarioCount: number;
7836
+ searchRuns: number;
7837
+ holdoutRuns: number;
7838
+ passRate: number;
7839
+ meanScore: number;
7840
+ searchMeanScore: number;
7841
+ holdoutMeanScore: number;
7842
+ overfitGap: number;
7843
+ meanCostUsd: number;
7844
+ p95WallMs: number;
7845
+ failedRows: number;
7846
+ failuresWithAsi: number;
7847
+ singleShotTraces: number;
7848
+ multiShotTraces: number;
7849
+ splitCounts: Record<DatasetSplit, number>;
7850
+ domainCounts: Record<string, number>;
7851
+ failureModeCounts: Record<string, number>;
7852
+ responsibleSurfaceCounts: Record<string, number>;
7853
+ }
7854
+ interface ReleaseConfidenceScorecard {
7855
+ target: string;
7856
+ candidateId: string | null;
7857
+ baselineId: string | null;
7858
+ status: ReleaseConfidenceStatus;
7859
+ promote: boolean;
7860
+ axes: ReleaseConfidenceAxis[];
7861
+ issues: ReleaseConfidenceIssue[];
7862
+ metrics: ReleaseConfidenceMetrics;
7863
+ dataset: DatasetManifest | null;
7864
+ gateDecision: GateDecision | null;
7865
+ summary: string;
7866
+ }
7867
+ declare function releaseTraceEvidenceFromMultiShotTrials(trials: readonly MultiShotTrialResult[]): ReleaseTraceEvidence[];
7868
+ declare function evaluateReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
7869
+ declare function assertReleaseConfidence(input: ReleaseConfidenceInput): ReleaseConfidenceScorecard;
7870
+
7899
7871
  /**
7900
7872
  * concurrency — small primitives the evolution loop needs.
7901
7873
  *
@@ -8076,7 +8048,7 @@ interface LineageNode {
8076
8048
  * that field is part of the audit-bench convention but cheap enough to
8077
8049
  * accept any payload that mirrors it. Override by passing your own.
8078
8050
  */
8079
- type LineageKindResolver<P> = (variant: PromptVariant<P>) => LineageKind;
8051
+ type LineageKindResolver<P> = (variant: EvolvableVariant<P>) => LineageKind;
8080
8052
  /**
8081
8053
  * Persistence shape:
8082
8054
  *
@@ -8101,7 +8073,7 @@ declare class LineageRecorder<P = unknown> {
8101
8073
  private readonly kindOf;
8102
8074
  constructor(path: string, kindOf?: LineageKindResolver<P>);
8103
8075
  upsert(node: LineageNode): Promise<void>;
8104
- upsertVariant(variant: PromptVariant<P>, opts?: {
8076
+ upsertVariant(variant: EvolvableVariant<P>, opts?: {
8105
8077
  omitPayload?: boolean;
8106
8078
  }): Promise<void>;
8107
8079
  snapshot(): LineageNode[];
@@ -8300,7 +8272,7 @@ interface CodeMutationOutcome {
8300
8272
  childId?: string;
8301
8273
  /** Free-form one-liner: "tightened tool descriptions in forge-tools.ts". */
8302
8274
  description?: string;
8303
- /** What the runner was trying to fix (carried into PromptVariant.rationale). */
8275
+ /** What the runner was trying to fix (carried into EvolvableVariant.rationale). */
8304
8276
  rationale?: string;
8305
8277
  /** Caller-defined diff payload. Mapped into the variant's payload by
8306
8278
  * `toVariantPayload`; agent-eval treats it as opaque. */
@@ -8317,7 +8289,7 @@ interface CodeMutationOutcome {
8317
8289
  }
8318
8290
  type CodeMutationRunner<T, P> = (args: {
8319
8291
  slot: PoolSlot<T>;
8320
- parent: PromptVariant<P>;
8292
+ parent: EvolvableVariant<P>;
8321
8293
  parentAggregate: VariantAggregate;
8322
8294
  topTrials: TrialResult[];
8323
8295
  bottomTrials: TrialResult[];
@@ -8332,15 +8304,15 @@ interface CreateSandboxCodeMutatorOpts<T, P> {
8332
8304
  * encode the diff however they want (file map, patch string, branch
8333
8305
  * ref, snapshot id) without agent-eval taking a stance.
8334
8306
  */
8335
- toVariantPayload(outcome: CodeMutationOutcome, parent: PromptVariant<P>): P;
8307
+ toVariantPayload(outcome: CodeMutationOutcome, parent: EvolvableVariant<P>): P;
8336
8308
  /** Optional telemetry sinks. */
8337
8309
  mutationTelemetry?: MutationTelemetry;
8338
8310
  costLedger?: CostLedger;
8339
8311
  lineage?: LineageRecorder<P>;
8340
8312
  /** Override id generation. Default: `${parent.id}.g${generation}.code.${i}`. */
8341
- childIdFor?(parent: PromptVariant<P>, generation: number, index: number): string;
8313
+ childIdFor?(parent: EvolvableVariant<P>, generation: number, index: number): string;
8342
8314
  /** Default label for the variant (visible in reports). */
8343
- labelFor?(outcome: CodeMutationOutcome, parent: PromptVariant<P>, generation: number, index: number): string;
8315
+ labelFor?(outcome: CodeMutationOutcome, parent: EvolvableVariant<P>, generation: number, index: number): string;
8344
8316
  }
8345
8317
  declare function createSandboxCodeMutator<T, P>(opts: CreateSandboxCodeMutatorOpts<T, P>): MutateAdapter<P>;
8346
8318
 
@@ -8543,4 +8515,4 @@ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): P
8543
8515
  candidateSamples: number;
8544
8516
  }>;
8545
8517
 
8546
- export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type 
CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type PromptVariant as EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, 
type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type 
IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type OptimizationConfig, type OptimizationExample, OptimizationLoop, type OptimizationLoopConfig, type OptimizationLoopResult, type OptimizationResult, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type 
OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, type PairwiseComparison, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptOptimizer, PromptRegistry, type TrialResult as PromptTrialResult, type PromptVariant$1 as PromptVariant, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type 
ReflectionProposal, type RegressionOptions, type RegressionSpec, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringEvaluation, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type 
SteeringVariantReport, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VariantScore, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, 
classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, 
localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, 
summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, trialTraceFromMultiShotTrial, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
8518
/**
 * Export list for this declaration file: a single `export { ... }` statement
 * that names every symbol the module exposes.
 *
 * - Specifiers prefixed with `type` (e.g. `type ActionExecutionPolicy`) are
 *   type-only exports; specifiers without the modifier (e.g. `AgentDriver`,
 *   `BENCHMARK_SPLIT_SEED`, `bisect`) are exported as-is.
 * - Some local bindings are renamed on the way out:
 *   `Artifact$1 as Artifact`, `Artifact as ArtifactCheckArtifact`,
 *   `DEFAULT_RULES as DEFAULT_FAILURE_RULES`, `Run as ExperimentRun`,
 *   `Run$1 as Run`, `TrialResult as PromptTrialResult`,
 *   `deterministicSplit as benchmarkDeterministicSplit`,
 *   `index as benchmarks`, `precision as goldenPrecision`.
 *
 * NOTE(review): the `$1` suffixes and the flat ordering suggest this list is
 * emitted by a declaration bundler rather than written by hand - confirm, and
 * if so change the source modules instead of editing this statement.
 */
+ export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type 
CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, 
type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type 
IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type 
Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type 
ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type 
StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, 
coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, 
lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, 
summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, trialTraceFromMultiShotTrial, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };