@tangle-network/agent-eval 0.19.1 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +244 -4
- package/dist/index.js +317 -14
- package/dist/index.js.map +1 -1
- package/docs/knowledge-readiness.md +84 -0
- package/docs/multi-shot-optimization.md +7 -0
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -692,7 +692,7 @@ interface Artifact$1 {
|
|
|
692
692
|
/** Inline content for small blobs — keep under ~64KB. */
|
|
693
693
|
inlineContent?: string;
|
|
694
694
|
}
|
|
695
|
-
type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'unknown';
|
|
695
|
+
type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'missing_user_data' | 'missing_domain_data' | 'missing_codebase_context' | 'missing_runtime_context' | 'missing_credentials' | 'stale_external_data' | 'bad_retrieval' | 'insufficient_evidence' | 'contradictory_evidence' | 'ambiguous_user_intent' | 'knowledge_readiness_blocked' | 'unknown';
|
|
696
696
|
declare const FAILURE_CLASSES: readonly FailureClass[];
|
|
697
697
|
declare function isLlmSpan(s: Span): s is LlmSpan;
|
|
698
698
|
declare function isToolSpan(s: Span): s is ToolSpan;
|
|
@@ -905,6 +905,8 @@ interface ControlEvalResult {
|
|
|
905
905
|
evidence?: string;
|
|
906
906
|
/** True when the result came from deterministic state, not LLM judgment. */
|
|
907
907
|
objective?: boolean;
|
|
908
|
+
/** Structured details for downstream control policies and reports. */
|
|
909
|
+
metadata?: Record<string, unknown>;
|
|
908
910
|
}
|
|
909
911
|
interface ControlBudget {
|
|
910
912
|
maxSteps: number;
|
|
@@ -1050,6 +1052,90 @@ declare function objectiveEval(input: Omit<ControlEvalResult, 'objective'>): Con
|
|
|
1050
1052
|
declare function subjectiveEval(input: Omit<ControlEvalResult, 'objective'>): ControlEvalResult;
|
|
1051
1053
|
declare function allCriticalPassed(evals: ControlEvalResult[]): boolean;
|
|
1052
1054
|
|
|
1055
|
+
type KnowledgeRequirementCategory = 'user_specific' | 'company_specific' | 'domain_specific' | 'codebase_specific' | 'market_specific' | 'regulatory' | 'tool_api' | 'credential_or_secret' | 'runtime_environment' | 'preference' | 'historical_context';
|
|
1056
|
+
type KnowledgeAcquisitionMode = 'ask_user' | 'search_web' | 'query_connector' | 'inspect_repo' | 'run_command' | 'infer_low_confidence' | 'not_available';
|
|
1057
|
+
type KnowledgeImportance = 'blocking' | 'high' | 'medium' | 'low';
|
|
1058
|
+
type KnowledgeFreshness = 'static' | 'monthly' | 'weekly' | 'daily' | 'realtime';
|
|
1059
|
+
type KnowledgeSensitivity = 'public' | 'private' | 'secret';
|
|
1060
|
+
type KnowledgeFallbackPolicy = 'block' | 'ask' | 'continue_with_caveat' | 'use_default';
|
|
1061
|
+
interface KnowledgeRequirement {
|
|
1062
|
+
id: string;
|
|
1063
|
+
description: string;
|
|
1064
|
+
requiredFor: string[];
|
|
1065
|
+
category: KnowledgeRequirementCategory;
|
|
1066
|
+
acquisitionMode: KnowledgeAcquisitionMode;
|
|
1067
|
+
importance: KnowledgeImportance;
|
|
1068
|
+
freshness: KnowledgeFreshness;
|
|
1069
|
+
sensitivity: KnowledgeSensitivity;
|
|
1070
|
+
confidenceNeeded: number;
|
|
1071
|
+
currentConfidence: number;
|
|
1072
|
+
evidenceIds: string[];
|
|
1073
|
+
fallbackPolicy: KnowledgeFallbackPolicy;
|
|
1074
|
+
metadata?: Record<string, unknown>;
|
|
1075
|
+
}
|
|
1076
|
+
interface KnowledgeBundle {
|
|
1077
|
+
taskId: string;
|
|
1078
|
+
requirements: KnowledgeRequirement[];
|
|
1079
|
+
evidenceIds: string[];
|
|
1080
|
+
claimIds: string[];
|
|
1081
|
+
wikiPageIds: string[];
|
|
1082
|
+
userAnswers: Record<string, string>;
|
|
1083
|
+
missing: KnowledgeRequirement[];
|
|
1084
|
+
readinessScore: number;
|
|
1085
|
+
metadata?: Record<string, unknown>;
|
|
1086
|
+
}
|
|
1087
|
+
type KnowledgeRecommendedAction = 'run_agent' | 'ask_user' | 'collect_web_data' | 'query_connectors' | 'inspect_repo' | 'build_domain_wiki' | 'continue_with_caveat' | 'abort_or_rescope';
|
|
1088
|
+
interface KnowledgeReadinessReport {
|
|
1089
|
+
taskId: string;
|
|
1090
|
+
readinessScore: number;
|
|
1091
|
+
blockingMissingRequirements: KnowledgeRequirement[];
|
|
1092
|
+
nonBlockingGaps: KnowledgeRequirement[];
|
|
1093
|
+
recommendedAction: KnowledgeRecommendedAction;
|
|
1094
|
+
bundle: KnowledgeBundle;
|
|
1095
|
+
severity: ControlSeverity;
|
|
1096
|
+
reason: string;
|
|
1097
|
+
}
|
|
1098
|
+
interface UserQuestion {
|
|
1099
|
+
id: string;
|
|
1100
|
+
question: string;
|
|
1101
|
+
reason: string;
|
|
1102
|
+
requirementId: string;
|
|
1103
|
+
importance: KnowledgeImportance;
|
|
1104
|
+
answerType: 'free_text' | 'select_one' | 'multi_select' | 'file_upload' | 'credential' | 'url';
|
|
1105
|
+
defaultIfSkipped?: string;
|
|
1106
|
+
impactIfUnknown: string;
|
|
1107
|
+
options?: string[];
|
|
1108
|
+
metadata?: Record<string, unknown>;
|
|
1109
|
+
}
|
|
1110
|
+
interface DataAcquisitionPlan {
|
|
1111
|
+
id: string;
|
|
1112
|
+
requirementIds: string[];
|
|
1113
|
+
mode: Exclude<KnowledgeAcquisitionMode, 'not_available' | 'infer_low_confidence'> | 'build_domain_wiki';
|
|
1114
|
+
description: string;
|
|
1115
|
+
priority: KnowledgeImportance;
|
|
1116
|
+
expectedEvidenceIds?: string[];
|
|
1117
|
+
questions?: UserQuestion[];
|
|
1118
|
+
metadata?: Record<string, unknown>;
|
|
1119
|
+
}
|
|
1120
|
+
type KnowledgeResponsibleSurface = 'knowledge-requirements' | 'data-acquisition' | 'retrieval-policy' | 'user-question-policy';
|
|
1121
|
+
|
|
1122
|
+
interface ScoreKnowledgeReadinessOptions {
|
|
1123
|
+
taskId: string;
|
|
1124
|
+
requirements: KnowledgeRequirement[];
|
|
1125
|
+
evidenceIds?: string[];
|
|
1126
|
+
claimIds?: string[];
|
|
1127
|
+
wikiPageIds?: string[];
|
|
1128
|
+
userAnswers?: Record<string, string>;
|
|
1129
|
+
metadata?: Record<string, unknown>;
|
|
1130
|
+
}
|
|
1131
|
+
declare function scoreKnowledgeReadiness(options: ScoreKnowledgeReadinessOptions): KnowledgeReadinessReport;
|
|
1132
|
+
declare function blockingKnowledgeEval(report: KnowledgeReadinessReport, options?: {
|
|
1133
|
+
id?: string;
|
|
1134
|
+
minimumScore?: number;
|
|
1135
|
+
}): ControlEvalResult;
|
|
1136
|
+
declare function userQuestionsForKnowledgeGaps(gaps: KnowledgeRequirement[]): UserQuestion[];
|
|
1137
|
+
declare function acquisitionPlansForKnowledgeGaps(gaps: KnowledgeRequirement[]): DataAcquisitionPlan[];
|
|
1138
|
+
|
|
1053
1139
|
/**
|
|
1054
1140
|
* Dataset — versioned, sliceable, content-hashed scenario collection.
|
|
1055
1141
|
*
|
|
@@ -1076,6 +1162,16 @@ interface DatasetScenario {
|
|
|
1076
1162
|
difficulty?: DatasetDifficulty;
|
|
1077
1163
|
/** Canary token that MUST NOT round-trip through a correct agent output. */
|
|
1078
1164
|
canary?: string;
|
|
1165
|
+
/**
|
|
1166
|
+
* Behavioral-canary forbidden pattern. A string OR a serialized regex
|
|
1167
|
+
* (`/.../flags`) that the agent under test MUST NOT emit. Used by
|
|
1168
|
+
* {@link import('./canary').checkBehavioralCanary | checkBehavioralCanary},
|
|
1169
|
+
* which inverts the contamination-style semantic: presence in the
|
|
1170
|
+
* agent output is a LEAK / failure, not a positive signal.
|
|
1171
|
+
*
|
|
1172
|
+
* Falls back to {@link canary} when omitted.
|
|
1173
|
+
*/
|
|
1174
|
+
forbiddenPattern?: string;
|
|
1079
1175
|
tags?: Record<string, string>;
|
|
1080
1176
|
}
|
|
1081
1177
|
interface DatasetProvenance {
|
|
@@ -3043,6 +3139,14 @@ interface FailureCluster {
|
|
|
3043
3139
|
toolName?: string;
|
|
3044
3140
|
/** First 16 chars of argHash — clusters similar args. */
|
|
3045
3141
|
argPrefix?: string;
|
|
3142
|
+
/**
|
|
3143
|
+
* Source dimension when the trigger was a judge span (e.g. `'format'`,
|
|
3144
|
+
* `'safety'`, `'correctness'`). Lets cross-template aggregators
|
|
3145
|
+
* group failures by the dimension that fired without overloading
|
|
3146
|
+
* `argPrefix`. Optional — legacy clusters without this field
|
|
3147
|
+
* deserialize cleanly.
|
|
3148
|
+
*/
|
|
3149
|
+
dimension?: string;
|
|
3046
3150
|
runCount: number;
|
|
3047
3151
|
scenarioIds: string[];
|
|
3048
3152
|
exampleError?: string;
|
|
@@ -3586,6 +3690,42 @@ interface CanaryLeak {
|
|
|
3586
3690
|
evidence: string;
|
|
3587
3691
|
}
|
|
3588
3692
|
declare function checkCanaries(output: string, scenarios: DatasetScenario[]): CanaryLeak[];
|
|
3693
|
+
/**
|
|
3694
|
+
* Behavioral canary: tests the AGENT, not the eval grep.
|
|
3695
|
+
*
|
|
3696
|
+
* - `forbiddenPattern` PRESENT in output ⇒ AGENT EMITTED BAD CONTENT ⇒ LEAK.
|
|
3697
|
+
* - `forbiddenPattern` ABSENT from output ⇒ AGENT HELD THE LINE ⇒ PASS.
|
|
3698
|
+
*
|
|
3699
|
+
* Use when running known-bad-prompt scenarios against the agent under
|
|
3700
|
+
* test and you want to know if the agent misbehaved. The classical
|
|
3701
|
+
* {@link checkCanaries} / {@link import('./canary').runCanaries | runCanaries}
|
|
3702
|
+
* test whether the eval check fires when the bad output is forced
|
|
3703
|
+
* into the eval flow — different question, different answer.
|
|
3704
|
+
*
|
|
3705
|
+
* Pattern resolution order (first match wins):
|
|
3706
|
+
* 1. `scenario.forbiddenPattern` — if it parses as `/body/flags`,
|
|
3707
|
+
* treated as a regex; otherwise a literal substring.
|
|
3708
|
+
* 2. `scenario.canary` — literal substring fallback so the helper
|
|
3709
|
+
* works on existing scenario fixtures.
|
|
3710
|
+
*
|
|
3711
|
+
* Returns `null` when nothing forbidden was found OR the scenario
|
|
3712
|
+
* declared no pattern.
|
|
3713
|
+
*/
|
|
3714
|
+
declare function checkBehavioralCanary(output: string, scenario: DatasetScenario): CanaryLeak | null;
|
|
3715
|
+
/**
|
|
3716
|
+
* Behavioral canary over many (scenario, output) pairs. Sibling to
|
|
3717
|
+
* {@link import('./canary').runCanaries | runCanaries} — same idea
|
|
3718
|
+
* (run-many → report) but the question being answered is "did the
|
|
3719
|
+
* AGENT misbehave?" rather than "did the EVAL grep fire?".
|
|
3720
|
+
*
|
|
3721
|
+
* Returns one `CanaryLeak` per pair where the agent's output
|
|
3722
|
+
* contained its scenario's `forbiddenPattern` (or `canary` fallback).
|
|
3723
|
+
*/
|
|
3724
|
+
declare function runBehavioralCanaries(cases: Array<{
|
|
3725
|
+
scenario: DatasetScenario;
|
|
3726
|
+
output: string;
|
|
3727
|
+
runId?: string;
|
|
3728
|
+
}>): CanaryLeak[];
|
|
3589
3729
|
/**
|
|
3590
3730
|
* Scan the LLM-output history in a corpus; returns every case where a
|
|
3591
3731
|
* canary from a known scenario appeared in agent output. Pass the full
|
|
@@ -4009,6 +4149,69 @@ declare const DEFAULT_MUTATORS: Array<{
|
|
|
4009
4149
|
id: string;
|
|
4010
4150
|
fn: Mutator;
|
|
4011
4151
|
}>;
|
|
4152
|
+
interface ParaphraseRobustnessScenarioInput {
|
|
4153
|
+
scenarios: Array<{
|
|
4154
|
+
id: string;
|
|
4155
|
+
userTurns: string[];
|
|
4156
|
+
}>;
|
|
4157
|
+
/**
|
|
4158
|
+
* Mutators applied to every user turn in every scenario. Each
|
|
4159
|
+
* scenario is paraphrased `reps` times per mutator (so `reps` × `scenarios`
|
|
4160
|
+
* × `mutators` total paraphrased runs).
|
|
4161
|
+
*/
|
|
4162
|
+
mutators: Array<{
|
|
4163
|
+
name: string;
|
|
4164
|
+
mutator: (text: string) => string;
|
|
4165
|
+
}>;
|
|
4166
|
+
/**
|
|
4167
|
+
* Run a (possibly mutated) scenario and return its score in [0,1].
|
|
4168
|
+
* Called once for the original turns of each scenario, and once per
|
|
4169
|
+
* (scenario × mutator × rep) for the paraphrased variants.
|
|
4170
|
+
*/
|
|
4171
|
+
runScenario: (args: {
|
|
4172
|
+
id: string;
|
|
4173
|
+
userTurns: string[];
|
|
4174
|
+
}) => Promise<{
|
|
4175
|
+
score: number;
|
|
4176
|
+
}>;
|
|
4177
|
+
/** Times to repeat each (scenario × mutator) pair. Default 1. */
|
|
4178
|
+
reps?: number;
|
|
4179
|
+
}
|
|
4180
|
+
interface ParaphraseRobustnessScenarioResult {
|
|
4181
|
+
/**
|
|
4182
|
+
* Aggregate robustness: `mean(paraphrased) / mean(original)`,
|
|
4183
|
+
* clipped to `[0, 1]`. `1` = paraphrasing didn't degrade the agent;
|
|
4184
|
+
* `0` = paraphrasing destroyed it (or original was 0).
|
|
4185
|
+
*/
|
|
4186
|
+
score: number;
|
|
4187
|
+
perScenario: Array<{
|
|
4188
|
+
id: string;
|
|
4189
|
+
originalScore: number;
|
|
4190
|
+
paraphrasedMean: number;
|
|
4191
|
+
/** Per-mutator delta (paraphrased − original); negative = mutator hurt. */
|
|
4192
|
+
deltas: Record<string, number>;
|
|
4193
|
+
}>;
|
|
4194
|
+
mutators: string[];
|
|
4195
|
+
}
|
|
4196
|
+
/**
|
|
4197
|
+
* Multi-turn convenience wrapper around {@link paraphraseRobustness}.
|
|
4198
|
+
*
|
|
4199
|
+
* Consumers with a list of multi-turn scenarios were hand-wrapping the
|
|
4200
|
+
* single-prompt runner per scenario; this iterates for them. Mutators
|
|
4201
|
+
* are applied to every user turn (mutator runs once per turn with a
|
|
4202
|
+
* stable seed derived from the rep index).
|
|
4203
|
+
*
|
|
4204
|
+
* Contract:
|
|
4205
|
+
* - Calls `runScenario` once per scenario with the original `userTurns` to
|
|
4206
|
+
* establish the baseline `originalScore`.
|
|
4207
|
+
* - For each `(scenario, mutator, rep)` combination, builds a
|
|
4208
|
+
* mutated copy of `userTurns` (every turn passed through
|
|
4209
|
+
* `mutator.mutator`) and calls `runScenario` again.
|
|
4210
|
+
* - Aggregates per-scenario means, then computes the overall
|
|
4211
|
+
* `mean(paraphrasedMean) / mean(originalScore)`, clipped to
|
|
4212
|
+
* `[0, 1]`. If every original score is 0 the aggregate is 0.
|
|
4213
|
+
*/
|
|
4214
|
+
declare function paraphraseRobustnessScenarios(args: ParaphraseRobustnessScenarioInput): Promise<ParaphraseRobustnessScenarioResult>;
|
|
4012
4215
|
|
|
4013
4216
|
/**
|
|
4014
4217
|
* Visual diff — pixel-delta scoring for UI / visual outputs.
|
|
@@ -4848,9 +5051,30 @@ interface HypothesisManifest {
|
|
|
4848
5051
|
baselineLabel?: string;
|
|
4849
5052
|
candidateLabel?: string;
|
|
4850
5053
|
}
|
|
5054
|
+
/**
|
|
5055
|
+
* Identifier for the hashing scheme used to produce `contentHash`.
|
|
5056
|
+
*
|
|
5057
|
+
* `'sha256-content'` — sha256 hex over the canonicalized manifest with
|
|
5058
|
+
* the `contentHash` and `algo` fields stripped. This is what
|
|
5059
|
+
* `signManifest` produces today.
|
|
5060
|
+
*
|
|
5061
|
+
* Held as a string union so future schemes can be added without
|
|
5062
|
+
* breaking parsers; legacy SignedManifest values written before this
|
|
5063
|
+
* field existed will deserialize cleanly because the field is optional.
|
|
5064
|
+
*/
|
|
5065
|
+
type SignedManifestAlgo = 'sha256-content';
|
|
4851
5066
|
interface SignedManifest extends HypothesisManifest {
|
|
4852
|
-
/** sha256 hex of canonicalized manifest (everything except contentHash). */
|
|
5067
|
+
/** sha256 hex of canonicalized manifest (everything except contentHash and algo). */
|
|
4853
5068
|
contentHash: string;
|
|
5069
|
+
/**
|
|
5070
|
+
* Algorithm string describing how `contentHash` was produced.
|
|
5071
|
+
*
|
|
5072
|
+
* Optional on the type so legacy serialized manifests (pre-`algo`)
|
|
5073
|
+
* still parse, but ALWAYS populated by {@link signManifest}.
|
|
5074
|
+
* Consumers that want to enforce a known algorithm should reject
|
|
5075
|
+
* manifests where this field is missing or unrecognized.
|
|
5076
|
+
*/
|
|
5077
|
+
algo?: SignedManifestAlgo;
|
|
4854
5078
|
}
|
|
4855
5079
|
interface HypothesisResult {
|
|
4856
5080
|
manifest: SignedManifest;
|
|
@@ -4864,8 +5088,24 @@ interface HypothesisResult {
|
|
|
4864
5088
|
rejectionReasons: Array<'wrong_direction' | 'effect_too_small' | 'not_significant' | 'undersampled'>;
|
|
4865
5089
|
notes?: string;
|
|
4866
5090
|
}
|
|
5091
|
+
/**
|
|
5092
|
+
* Sign a manifest with a SHA-256 content hash.
|
|
5093
|
+
*
|
|
5094
|
+
* The hash covers the canonicalized manifest with the `contentHash`
|
|
5095
|
+
* and `algo` fields stripped; this lets verifiers re-sign the rest and
|
|
5096
|
+
* compare. Returned manifest always carries `algo: 'sha256-content'`
|
|
5097
|
+
* so downstream consumers can identify the scheme; legacy serialized
|
|
5098
|
+
* manifests without `algo` still verify because it is stripped before
|
|
5099
|
+
* hashing on both sides.
|
|
5100
|
+
*/
|
|
4867
5101
|
declare function signManifest(m: HypothesisManifest): Promise<SignedManifest>;
|
|
4868
|
-
/**
|
|
5102
|
+
/**
|
|
5103
|
+
* Verify that a signed manifest has not been tampered with.
|
|
5104
|
+
*
|
|
5105
|
+
* Strips `contentHash` and `algo` before re-signing so legacy manifests
|
|
5106
|
+
* (written before `algo` was emitted) verify identically to current
|
|
5107
|
+
* ones.
|
|
5108
|
+
*/
|
|
4869
5109
|
declare function verifyManifest(m: SignedManifest): Promise<boolean>;
|
|
4870
5110
|
/**
|
|
4871
5111
|
* Evaluate a pre-registered hypothesis against observed results.
|
|
@@ -8515,4 +8755,4 @@ declare function judgeReplayGate<TOutput>(args: JudgeReplayGateArgs<TOutput>): P
|
|
|
8515
8755
|
candidateSamples: number;
|
|
8516
8756
|
}>;
|
|
8517
8757
|
|
|
8518
|
-
export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type 
CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type FailureContext, type FailureMode, 
type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, type IntentMatchOptions, type 
IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type 
Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type 
ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SliceOptions, type Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type 
StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, analyzeAntiSlop, analyzeSeries, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, 
coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, 
lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, runKeywordCoverageJudgeUrl, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, 
summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, trialTraceFromMultiShotTrial, typoMutator, urlContains, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
|
|
8758
|
+
export { type ActionExecutionPolicy, type ActionPolicyDecision, type ActionableSideInfo, type ActiveLearningOptions, type AdapterRun, AgentDriver, type AgentDriverConfig, type AlignmentOp, type AntiSlopConfig, type AntiSlopIssue, type AntiSlopReport, type Artifact$1 as Artifact, type ArtifactCheck, type Artifact as ArtifactCheckArtifact, type ArtifactResult, type ArtifactValidator, type AsiSeverity, AxGepaSteeringOptimizer, type AxSteeringOptimizerConfig, BENCHMARK_SPLIT_SEED, type BaselineOptions, type BaselineReport, BehaviorAssertion, type BenchmarkAdapter, type BenchmarkDatasetItem, type BenchmarkEvaluation, type BenchmarkReport, BenchmarkRunner, type BenchmarkRunnerConfig, type BestOfNResult, type BisectOptions, type BisectResult, type BisectStep, type BootstrapOptions, type BootstrapResult, BudgetBreachError, type BudgetBreachFinding, type BudgetBreachReport, BudgetGuard, type BudgetLedgerEntry, type BudgetSpec, BuilderSession, type BuilderSessionInit, type CalibrationBin, type CalibrationOptions, type CalibrationReport, type CalibrationResult, CallExpectation, type CanaryAlert, type CanaryKind, type CanaryLeak, type CanaryOptions, type CanaryReport, type CanarySeverity, type CandidateScenario, type CandidateScore, type CausalAttributionReport, type ChatSummary, type CheckResult, type CodeMutationOutcome, type CodeMutationRunner, type CollectedArtifacts, type CommandRunner, type CompletionCriterion, type CompositePolicy, type ConceptComplexity, type ConceptFinding, type ConceptSpec, type ConceptWeightStrategy, type ContinuityCheck, type ContinuityCheckResult, type ContinuityReport, type ContinuitySnapshotPair, type ContractMetric, type ContractReport, type ControlActionFailureMode, type ControlActionOutcome, type ControlBudget, type ControlContext, type ControlDecision, type ControlEvalResult, type ControlRunResult, type ControlRuntimeConfig, type ControlRuntimeError, type ControlSeverity, type ControlStep, type ControlStopPolicies, ConvergenceTracker, type 
CorrelationReport, type CorrelationResult, type CorrelationStudyOptions, type CorrelationStudyResult, type CostEntry, CostLedger, type CostLedgerGeneration, type CostLedgerSnapshot, type CostSummary, CostTracker, type CounterfactualContext, type CounterfactualMutation, type CounterfactualResult, type CounterfactualRunner, type CreateCompositeMutatorOpts, type CreateDefaultReviewerOptions, type CreateSandboxCodeMutatorOpts, type CreateSandboxPoolOpts, type CrossTraceDiff, type CrossTraceDiffOptions, D1ExperimentStore, type D1ExperimentStoreOptions, type D1Like, type D1PreparedStatementLike, DEFAULT_AGENT_SLOS, DEFAULT_COMPLEXITY_WEIGHTS, DEFAULT_RULES as DEFAULT_FAILURE_RULES, DEFAULT_FINDERS, DEFAULT_HARNESS_OBJECTIVES, DEFAULT_MUTATION_PRIMITIVES, DEFAULT_MUTATORS, DEFAULT_REDACTION_RULES, DEFAULT_RED_TEAM_CORPUS, DEFAULT_RUN_SCORE_WEIGHTS, DEFAULT_SEVERITY_WEIGHTS, type DataAcquisitionPlan, Dataset, type DatasetDifficulty, type DatasetManifest, type DatasetProvenance, type DatasetScenario, type DatasetSplit, type DeployFamily, type DeployGateLayerInput, type DeployRunResult, type DeployRunner, type DeploymentOutcome, type DirEntry, type Direction, type DivergenceOptions, type DivergenceReport, DockerSandboxDriver, type DriverResult, type DriverState, DualAgentBench, type DualAgentBenchConfig, type DualAgentReport, type DualAgentRound, type DualAgentScenario, type DualAgentScenarioResult, ERROR_COUNT_PATTERNS, type ErrorCountPattern, type EuRiskClass, type EvalMetricSpec, type EvalResult, type EventFilter, type EventKind, type EvolutionRound, type EvolvableVariant, type ExecutorConfig, type Expectation, type Experiment, type ExperimentPlan, type ExperimentResult, type Run as ExperimentRun, type ExperimentStore, ExperimentTracker, type ExportedRewardModel, type ExtractOptions, type ExtractResult, FAILURE_CLASSES, type FactorContribution, type FactorialCell, type FailureClass, type FailureClassification, type FailureCluster, type FailureClusterReport, type 
FailureContext, type FailureMode, type FailureRule, type FeedbackArtifactType, type FeedbackAttempt, type FeedbackLabel, type FeedbackLabelKind, type FeedbackLabelSource, type FeedbackOptimizerRow, type FeedbackOutcome, type FeedbackPattern, type FeedbackReplayAdapter, type FeedbackReplayResult, type FeedbackSeverity, type FeedbackSplitPolicy, type FeedbackTask, type FeedbackTrajectory, type FeedbackTrajectoryFilter, type FeedbackTrajectoryStore, FileSystemExperimentStore, type FileSystemExperimentStoreOptions, FileSystemFeedbackTrajectoryStore, FileSystemOutcomeStore, type FileSystemOutcomeStoreOptions, FileSystemTraceStore, type FileSystemTraceStoreOptions, type Finding, type FlowAction, type FlowLayerEnv, type FlowLayerFactoryInput, type FlowRunner, type FlowRunnerStepResult, type FlowSpec, type FlowStep, type GainDistributionBin, type GainDistributionFigureSpec, type GainDistributionOptions, type GateDecision, type GateEvidence, type GenerationReport, type GenericSpan, type GoldenItem, type GoldenSeverity, type GoldenSpec, type GovernanceContext, type GovernanceFinding, type GovernanceReport, type GradedStep, type HarnessAdapter, type HarnessConfig, type HarnessExperimentConfig, type HarnessExperimentResult, type HarnessIntervention, type HarnessRunRequest, type HarnessRunResult, type HarnessScenario, type HarnessSelection, type HarnessVariant, type HarnessVariantReport, HeldOutGate, type HeldOutGateConfig, type HeldOutGateRejectionCode, HoldoutAuditor, HoldoutLockedError, type HostedJudgeConfig, type HostedJudgeDimension, type HostedJudgeRequest, type HostedJudgeResponse, type HostedRunCriticConfig, type HostedRunScoreRequest, type HostedRunScoreResponse, type HypothesisManifest, type HypothesisResult, INTENT_MATCH_JUDGE_VERSION, type ImageData, InMemoryExperimentStore, InMemoryFeedbackTrajectoryStore, InMemoryOutcomeStore, InMemoryTraceStore, InMemoryTrialCache, InMemoryWorkspaceInspector, type InferenceScorer, type InspectorContext, type IntentMatchInput, 
type IntentMatchOptions, type IntentMatchResult, type InteractionContribution, JsonlTrialCache, type JudgeAgreementReport, type JudgeConfig, type JudgeFleetOptions, type JudgeFn, type JudgeInput, type JudgePair, type JudgeReplayGateArgs, type JudgeReplayResult, type JudgeRubric, JudgeRunner, type JudgeScore, type JudgeSpan, type KeywordConceptSpec, type KeywordCoverageFinding, type KeywordCoverageOptions, type KeywordCoverageResult, type KnowledgeAcquisitionMode, type KnowledgeBundle, type KnowledgeFallbackPolicy, type KnowledgeFreshness, type KnowledgeImportance, type KnowledgeReadinessReport, type KnowledgeRecommendedAction, type KnowledgeRequirement, type KnowledgeRequirementCategory, type KnowledgeResponsibleSurface, type KnowledgeSensitivity, type LangfuseEnvelope, type LangfuseGeneration, type LangfuseScore, type Layer, type LayerCorrelation, type LayerResult, type LayerStatus, type LineageKind, type LineageKindResolver, type LineageNode, LineageRecorder, LlmCallError, type LlmCallRequest, type LlmCallResult, LlmClient, type LlmClientOptions, type LlmJsonCall, type LlmMessage, type LlmReviewerConfig, type LlmSpan, type LlmUsage, LockedJsonlAppender, MODEL_PRICING, type MatchResult, type MatcherResult, type MeasurementPolicy, type MergeOptions, type Message, type MetricSamples, type MetricVerdict, MetricsCollector, type MuffledFinder, type MuffledFinding, MultiLayerVerifier, type MultiShotGateConfig, type MultiShotGateResult, type MultiShotMutateAdapter, type MultiShotOptimizationConfig, type MultiShotOptimizationResult, type MultiShotRun, type MultiShotRunInput, type MultiShotRunner, type MultiShotScore, type MultiShotScorer, type MultiShotSplit, type MultiShotTrace, type MultiShotTrialResult, type MultiShotVariant, type MultiToolchainLayerConfig, type MutateAdapter, type MutationAttempt, type MutationChannel, MutationTelemetry, type Mutator, Mutex, NoopResearcher, OTEL_AGENT_EVAL_SCOPE, type Objective, type Oracle, type OracleObservation, type OracleReport, 
type OracleResult, type OrthogonalityInput, type OrthogonalityResult, type OtlpExport, type OtlpResourceSpans, type OtlpSpan, type OutcomeFilter, type OutcomePair, type OutcomeStore, type PairedBootstrapOptions, type PairedBootstrapResult, PairwiseSteeringOptimizer, type ParaphraseRobustnessScenarioInput, type ParaphraseRobustnessScenarioResult, type ParetoFigureSpec, type ParetoPoint, type ParetoResult, type PersonaConfig, type Playbook, type PlaybookEntry, type PoolSlot, type PositionalBiasResult, type PreferenceMemoryEntry, type PrmGradedTrace, PrmGrader, type PrmTrainingSample, ProductClient, type ProductClientConfig, type ProjectKind, ProjectRegistry, type ProjectSummary, type ProjectTimelineEntry, type PromptEvolutionConfig, type PromptEvolutionEvent, type PromptEvolutionResult, type PromptHandle, PromptRegistry, type TrialResult as PromptTrialResult, type ProposeFn, type ProposeInput, type ProposeOutput, type ProposeReviewConfig, type ProposeReviewControlAction, type ProposeReviewControlConfig, type ProposeReviewControlResult, type ProposeReviewControlState, type ProposeReviewReport, type ProposeReviewShot, type ProposedSideEffect, REDACTION_VERSION, type RedTeamCase, type RedTeamCategory, type RedTeamFinding, type RedTeamPayload, type RedTeamReport, type RedactionReport, type RedactionRule, type ReferenceMatchResult, type ReferenceReplayAdapter, type ReferenceReplayAdapterFn, type ReferenceReplayAdapterLike, type ReferenceReplayAggregate, type ReferenceReplayCandidate, type ReferenceReplayCase, type ReferenceReplayCaseRun, type ReferenceReplayExecutionScenario, type ReferenceReplayItem, type ReferenceReplayMatch, type ReferenceReplayMatchStrategy, type ReferenceReplayMatcher, type ReferenceReplayPromotionDecision, type ReferenceReplayPromotionPolicy, type ReferenceReplayRun, type ReferenceReplayRunContext, type ReferenceReplayRunOptions, type ReferenceReplayRunStore, type ReferenceReplayScenario, type ReferenceReplayScenarioScore, type ReferenceReplayScore, 
type ReferenceReplayScoreOptions, type ReferenceReplaySplit, type ReferenceReplaySplitComparison, type ReferenceReplaySteeringRowsOptions, type ReflectionContext, type ReflectionProposal, type RegressionOptions, type RegressionSpec, type ReleaseConfidenceAxis, type ReleaseConfidenceAxisName, type ReleaseConfidenceInput, type ReleaseConfidenceIssue, type ReleaseConfidenceMetrics, type ReleaseConfidenceScorecard, type ReleaseConfidenceStatus, type ReleaseConfidenceThresholds, type ReleaseTraceEvidence, type Researcher, type RetrievalSpan, type Review, type ReviewFn, type ReviewInput, type ReviewMemoryEntry, type ReviewMemoryStore, type ReviewerMemoryEntry, type ReviewerOutput, type ReviewerPromptInput, type ReviewerSoftFailDefaults, type ReviewerVerificationSummary, type RobustnessResult, type RouteMap, type RubricDimension, type Run$1 as Run, type RunAppScenarioOptions, type RunCommandInput, type RunCommandResult, type RunConfig, RunCritic, type RunCriticOptions, type RunDiff, type RunFilter, type RunJudgeMetadata, type RunLayer, type RunOutcome, type RunRecord, RunRecordValidationError, type RunScore, type RunScoreWeights, type RunSplitTag, type RunStatus, type RunTokenUsage, type RunTrace, SEMANTIC_CONCEPT_JUDGE_VERSION, type SandboxDriver, SandboxHarness, type SandboxHarnessResult, type SandboxJudgeKind, type SandboxJudgeResult, type SandboxJudgeSpec, type SandboxPool, type SandboxResult, type SandboxSpan, type ScanOptions, type Scenario, type ScenarioAggregate, type ScenarioCost, type ScenarioFile, ScenarioRegistry, type ScenarioResult, type ScoreAdapter, type ScoreKnowledgeReadinessOptions, type ScoredTarget, type SelfPlayOptions, type SelfPlayProposer, type SelfPlayScorer, type SelfPreferenceResult, type SemanticConceptJudgeInput, type SemanticConceptJudgeOptions, type SemanticConceptJudgeResult, type SeriesConvergenceOptions, type SeriesConvergenceResult, type Severity, type ShipOptions, type SignedManifest, type SignedManifestAlgo, type SliceOptions, type 
Slo, type SloCheckResult, type SloComparator, type SloReport, type SloSeverity, type SlopCategory, type SlotFactory, type Span, type SpanBase, type SpanFilter, type SpanHandle, type SpanKind, type SpanStatus, type SteeringBundle, type SteeringChange, type SteeringDelta, type SteeringOptimizationResult, type SteeringOptimizationRow, type SteeringOptimizationSelector, type SteeringOptimizerBackend, type SteeringOptimizerConfig, type SteeringRolePrompt, type StepAttribution, type StepContext, type StepRubric, type StopDecision, type StuckLoopFinding, type StuckLoopOptions, type StuckLoopReport, SubprocessSandboxDriver, type SubprocessSandboxDriverOptions, type SummaryTable, type SummaryTableOptions, type SummaryTableRow, type SynthesisReason, type SynthesisTarget, TRACE_SCHEMA_VERSION, type TestGradedRunOptions, type TestGradedRunResult, type TestGradedScenario, type TestOutputParser, type TestResult, type ThreeLayerProjectReport, type ThresholdContract, TokenCounter, type TokenSpec, type ToolSpan, type ToolStats, type ToolUseMetrics, type ToolUseOptions, type ToolWasteFinding, type ToolWasteOptions, type ToolWasteReport, TraceEmitter, type TraceEmitterOptions, type TraceEvent, type TraceStore, type Trajectory, type TrajectoryStep, type TrialAttempt, type TrialCache, TrialTelemetry, type TrialTrace, type Turn, type TurnMetrics, type TurnResult, UNIVERSAL_FINDERS, type UseCaseSignals, type UserQuestion, type ValidationContext, type ValidationIssue, type ValidationResult, type VariantAggregate, type VerbosityBiasResult, type Verdict, type Verification, type VerificationReport, type VerifyContext, type VerifyFn, type VerifyOptions, type VisualDiffOptions, type VisualDiffResult, type ViteDeployRunnerInput, type WorkflowTopology, type WorkspaceAssertion, type WorkspaceAssertionResult, type WorkspaceInspector, type WorkspaceSnapshot, type WranglerDeployRunnerInput, acquisitionPlansForKnowledgeGaps, adversarialJudge, aggregateLlm, aggregateRunScore, allCriticalPassed, 
analyzeAntiSlop, analyzeSeries, argHash, assertReleaseConfidence, assignFeedbackSplit, attributeCounterfactuals, deterministicSplit as benchmarkDeterministicSplit, index as benchmarks, benjaminiHochberg, bhAdjust, bisect, blockingKnowledgeEval, bonferroni, bootstrapCi, budgetBreachView, buildReflectionPrompt, buildReviewerPrompt, buildTrajectory, byteLengthRange, calibrateJudge, calibrationCurve, callLlm, callLlmJson, canaryLeakView, causalAttribution, checkBehavioralCanary, checkCanaries, checkSlos, clamp01, classifyEuAiRisk, classifyFailure, codeExecutionJudge, cohensD, coherenceJudge, collectionPreserved, commitBisect, compareReferenceReplay, compareToBaseline, compilerJudge, composeParsers, composeValidators, computeToolUseMetrics, confidenceInterval, containsAll, controlFailureClassFromVerification, controlRunToFeedbackTrajectory, correlateLayers, correlationStudy, createAntiSlopJudge, createCompositeMutator, createCustomJudge, createDefaultReviewer, createDomainExpertJudge, createFeedbackTrajectory, createIntentMatchJudge, createLlmReviewer, createSandboxCodeMutator, createSandboxPool, createSemanticConceptJudge, crossTraceDiff, crowdingDistance, decideReferenceReplayPromotion, decideReferenceReplayRunPromotion, defaultJudges, defaultMultiShotObjectives, defaultReferenceReplayMatcher, deployGateLayer, distillPlaybook, dominates, estimateCost, estimateTokens, euAiActReport, evaluateActionPolicy, evaluateContract, evaluateHypothesis, evaluateOracles, evaluateReleaseConfidence, executeScenario, expectAgent, exportRewardModel, exportRunAsOtlp, exportTrainingData, extractAssetUrls, extractErrorCount, failureClusterView, feedbackTrajectoriesToDatasetScenarios, feedbackTrajectoriesToOptimizerRows, feedbackTrajectoryToDatasetScenario, feedbackTrajectoryToOptimizerRow, fileContains, fileExists, findAutoMatchNoExpectation, findConstructorCwdDropped, findFallbackToPass, findLiteralTruePass, findSkipCountsAsPass, firstDivergenceView, flowLayer, formatBenchmarkReport, 
formatDriverReport, formatFindings, gainHistogram, precision as goldenPrecision, gradeSemanticStatus, groupBy, hashContent, hashScenarios, htmlContainsElement, inMemoryReferenceReplayStore, inMemoryReviewStore, interRaterReliability, iqr, isJudgeSpan, isLlmSpan, isPrmVerdict, isRetrievalSpan, isRunRecord, isSandboxSpan, isToolSpan, jestTestParser, jsonHasKeys, jsonShape, jsonlReferenceReplayStore, jsonlReviewStore, judgeAgreementView, judgeReplayGate, judgeSpans, keyPreserved, linterJudge, llmSpanFromProvider, llmSpans, loadScorerFromGrader, localCommandRunner, lowercaseMutator, mannWhitneyU, matchGoldens, mergeLayerResults, mergeSteeringBundle, multiToolchainLayer, nistAiRmfReport, nonRefusalRubric, normalizeScores, notBlocked, objectiveEval, outputLengthRubric, pairedBootstrap, pairedTTest, pairedWilcoxon, paraphraseRobustness, paraphraseRobustnessScenarios, paretoChart, paretoFrontier, paretoFrontierWithCrowding, parseFeedbackTrajectoriesJsonl, parseReflectionResponse, parseRunRecordSafe, partialCredit, passOrthogonality, pixelDeltaRatio, politenessPrefixMutator, positionalBias, printDriverSummary, prmBestOfN, prmEnsembleBestOfN, probeLlm, promptBisect, proposeSynthesisTargets, pytestTestParser, redTeamDataset, redTeamReport, redactString, redactValue, referenceReplayRunsToSteeringRows, referenceReplayScenarioToRunScore, regexMatch, regexMatches, regressionView, releaseTraceEvidenceFromMultiShotTrials, renderMarkdown, renderMarkdownReport, renderPlaybookMarkdown, renderPreferenceMemoryMarkdown, renderSteeringText, replayFeedbackTrajectories, replayFeedbackTrajectory, replayScorerOverCorpus, replayTraceThroughJudge, requiredSampleSize, resetLockedAppendersForTesting, resumeBuilderSession, roundTripRunRecord, rowCount, rowWhere, runAgentControlLoop, runAssertions, runBehavioralCanaries, runCanaries, runCounterfactual, runE2EWorkflow, runExpectations, runFailureClass, runHarnessExperiment, runIntentMatchJudge, runJudgeFleet, runKeywordCoverageJudge, 
runKeywordCoverageJudgeUrl, runMultiShotOptimization, runPromptEvolution, runProposeReview, runProposeReviewAsControlLoop, runReferenceReplay, runSelfPlay, runSemanticConceptJudge, runTestGradedScenario, runsForScenario, scalarScore, scanForMuffledGates, scoreAllProjects, scoreContinuity, scoreKnowledgeReadiness, scoreProject, scoreRedTeamOutput, scoreReferenceReplay, securityJudge, selectHarnessVariant, selfPreference, sentenceReorderMutator, serializeFeedbackTrajectoriesJsonl, signManifest, soc2Report, statusAdvanced, stopOnNoProgress, stopOnRepeatedAction, stripFencedJson, stuckLoopView, subjectiveEval, summarize, summarizeHarnessResults, summarizePreferenceMemory, summaryTable, testJudge, textInSnapshot, toLangfuseEnvelope, toNdjson, toPrometheusText, toolIntentAlignmentRubric, toolNamesForRun, toolNonRedundantRubric, toolSpans, toolSuccessRubric, toolWasteView, trialTraceFromMultiShotTrial, typoMutator, urlContains, userQuestionsForKnowledgeGaps, validateRunRecord, verbosityBias, verifyManifest, visualDiff, viteDeployRunner, vitestTestParser, weightedMean, weightedRecall, welchsTTest, whitespaceCollapseMutator, wilcoxonSignedRank, withAssignedFeedbackSplit, wranglerDeployRunner };
|