@agentv/core 2.11.4 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-REN5PS7B.js → chunk-JHER2LQ5.js} +1 -1
- package/dist/chunk-JHER2LQ5.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +174 -28
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +54 -1
- package/dist/index.d.ts +54 -1
- package/dist/index.js +174 -29
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-REN5PS7B.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1004,6 +1004,12 @@ interface TrialResult {
|
|
|
1004
1004
|
readonly scores?: readonly EvaluatorResult[];
|
|
1005
1005
|
readonly error?: string;
|
|
1006
1006
|
readonly costUsd?: number;
|
|
1007
|
+
/** Primary classification for this trial attempt */
|
|
1008
|
+
readonly executionStatus?: ExecutionStatus;
|
|
1009
|
+
/** Pipeline stage where failure occurred */
|
|
1010
|
+
readonly failureStage?: FailureStage;
|
|
1011
|
+
/** Machine-readable failure reason code */
|
|
1012
|
+
readonly failureReasonCode?: string;
|
|
1007
1013
|
}
|
|
1008
1014
|
/**
|
|
1009
1015
|
* Aggregation metadata for pass_at_k strategy.
|
|
@@ -1036,6 +1042,30 @@ interface ConfidenceIntervalAggregation {
|
|
|
1036
1042
|
* Discriminated union of trial aggregation results.
|
|
1037
1043
|
*/
|
|
1038
1044
|
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
1045
|
+
/**
|
|
1046
|
+
* Primary classification of evaluation outcome.
|
|
1047
|
+
* - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
|
|
1048
|
+
* - 'quality_failure': evaluation completed but model scored below threshold
|
|
1049
|
+
* - 'execution_error': evaluation could not complete due to infrastructure/tooling error
|
|
1050
|
+
*/
|
|
1051
|
+
type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
|
|
1052
|
+
/**
|
|
1053
|
+
* Pipeline stage where the failure occurred.
|
|
1054
|
+
*/
|
|
1055
|
+
type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
|
|
1056
|
+
/**
|
|
1057
|
+
* Structured error detail for execution failures.
|
|
1058
|
+
*/
|
|
1059
|
+
interface ExecutionError {
|
|
1060
|
+
readonly message: string;
|
|
1061
|
+
readonly stage: FailureStage;
|
|
1062
|
+
}
|
|
1063
|
+
/**
|
|
1064
|
+
* Tolerance for execution errors in an eval run.
|
|
1065
|
+
* - `true`: halt on first execution error
|
|
1066
|
+
* - `false`: never halt on errors (default)
|
|
1067
|
+
*/
|
|
1068
|
+
type FailOnError = boolean;
|
|
1039
1069
|
/**
|
|
1040
1070
|
* Evaluator scorecard for a single eval case run.
|
|
1041
1071
|
*/
|
|
@@ -1093,6 +1123,14 @@ interface EvaluationResult {
|
|
|
1093
1123
|
readonly costLimited?: boolean;
|
|
1094
1124
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
1095
1125
|
readonly budgetExceeded?: boolean;
|
|
1126
|
+
/** Primary classification: ok, quality_failure, or execution_error */
|
|
1127
|
+
readonly executionStatus: ExecutionStatus;
|
|
1128
|
+
/** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
|
|
1129
|
+
readonly failureStage?: FailureStage;
|
|
1130
|
+
/** Machine-readable failure reason code (only when executionStatus !== 'ok') */
|
|
1131
|
+
readonly failureReasonCode?: string;
|
|
1132
|
+
/** Structured error detail (only when executionStatus === 'execution_error') */
|
|
1133
|
+
readonly executionError?: ExecutionError;
|
|
1096
1134
|
}
|
|
1097
1135
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
1098
1136
|
interface EvaluatorResult {
|
|
@@ -1162,6 +1200,7 @@ type ExecutionDefaults = {
|
|
|
1162
1200
|
readonly otel_file?: string;
|
|
1163
1201
|
};
|
|
1164
1202
|
type AgentVConfig$1 = {
|
|
1203
|
+
readonly required_version?: string;
|
|
1165
1204
|
readonly guideline_patterns?: readonly string[];
|
|
1166
1205
|
readonly eval_patterns?: readonly string[];
|
|
1167
1206
|
readonly execution?: ExecutionDefaults;
|
|
@@ -1206,6 +1245,12 @@ interface CacheConfig {
|
|
|
1206
1245
|
* Returns undefined when no cache config is specified.
|
|
1207
1246
|
*/
|
|
1208
1247
|
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1248
|
+
/**
|
|
1249
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1250
|
+
* Accepts `true` or `false`.
|
|
1251
|
+
* Returns undefined when not specified.
|
|
1252
|
+
*/
|
|
1253
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1209
1254
|
|
|
1210
1255
|
/**
|
|
1211
1256
|
* Formatting mode for segment content.
|
|
@@ -1265,6 +1310,8 @@ type EvalSuiteResult = {
|
|
|
1265
1310
|
readonly metadata?: EvalMetadata;
|
|
1266
1311
|
/** Suite-level total cost budget in USD */
|
|
1267
1312
|
readonly totalBudgetUsd?: number;
|
|
1313
|
+
/** Execution error tolerance: true or false */
|
|
1314
|
+
readonly failOnError?: FailOnError;
|
|
1268
1315
|
};
|
|
1269
1316
|
/**
|
|
1270
1317
|
* Load tests and suite metadata from a single parse.
|
|
@@ -2367,6 +2414,8 @@ interface RunEvalCaseOptions {
|
|
|
2367
2414
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2368
2415
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2369
2416
|
readonly repoManager?: RepoManager;
|
|
2417
|
+
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2418
|
+
readonly evalDir?: string;
|
|
2370
2419
|
}
|
|
2371
2420
|
interface ProgressEvent {
|
|
2372
2421
|
readonly workerId: number;
|
|
@@ -2406,6 +2455,8 @@ interface RunEvaluationOptions {
|
|
|
2406
2455
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2407
2456
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2408
2457
|
readonly totalBudgetUsd?: number;
|
|
2458
|
+
/** Execution error tolerance: true halts on first error */
|
|
2459
|
+
readonly failOnError?: FailOnError;
|
|
2409
2460
|
}
|
|
2410
2461
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2411
2462
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2863,6 +2914,8 @@ interface ScriptExecutionContext {
|
|
|
2863
2914
|
readonly evalRunId: string;
|
|
2864
2915
|
readonly caseInput?: string;
|
|
2865
2916
|
readonly caseMetadata?: Record<string, unknown>;
|
|
2917
|
+
/** Directory containing the eval YAML file. Used as default cwd. */
|
|
2918
|
+
readonly evalDir?: string;
|
|
2866
2919
|
}
|
|
2867
2920
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2868
2921
|
/**
|
|
@@ -3120,4 +3173,4 @@ type AgentKernel = {
|
|
|
3120
3173
|
};
|
|
3121
3174
|
declare function createAgentKernel(): AgentKernel;
|
|
3122
3175
|
|
|
3123
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3176
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1004,6 +1004,12 @@ interface TrialResult {
|
|
|
1004
1004
|
readonly scores?: readonly EvaluatorResult[];
|
|
1005
1005
|
readonly error?: string;
|
|
1006
1006
|
readonly costUsd?: number;
|
|
1007
|
+
/** Primary classification for this trial attempt */
|
|
1008
|
+
readonly executionStatus?: ExecutionStatus;
|
|
1009
|
+
/** Pipeline stage where failure occurred */
|
|
1010
|
+
readonly failureStage?: FailureStage;
|
|
1011
|
+
/** Machine-readable failure reason code */
|
|
1012
|
+
readonly failureReasonCode?: string;
|
|
1007
1013
|
}
|
|
1008
1014
|
/**
|
|
1009
1015
|
* Aggregation metadata for pass_at_k strategy.
|
|
@@ -1036,6 +1042,30 @@ interface ConfidenceIntervalAggregation {
|
|
|
1036
1042
|
* Discriminated union of trial aggregation results.
|
|
1037
1043
|
*/
|
|
1038
1044
|
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
1045
|
+
/**
|
|
1046
|
+
* Primary classification of evaluation outcome.
|
|
1047
|
+
* - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
|
|
1048
|
+
* - 'quality_failure': evaluation completed but model scored below threshold
|
|
1049
|
+
* - 'execution_error': evaluation could not complete due to infrastructure/tooling error
|
|
1050
|
+
*/
|
|
1051
|
+
type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
|
|
1052
|
+
/**
|
|
1053
|
+
* Pipeline stage where the failure occurred.
|
|
1054
|
+
*/
|
|
1055
|
+
type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
|
|
1056
|
+
/**
|
|
1057
|
+
* Structured error detail for execution failures.
|
|
1058
|
+
*/
|
|
1059
|
+
interface ExecutionError {
|
|
1060
|
+
readonly message: string;
|
|
1061
|
+
readonly stage: FailureStage;
|
|
1062
|
+
}
|
|
1063
|
+
/**
|
|
1064
|
+
* Tolerance for execution errors in an eval run.
|
|
1065
|
+
* - `true`: halt on first execution error
|
|
1066
|
+
* - `false`: never halt on errors (default)
|
|
1067
|
+
*/
|
|
1068
|
+
type FailOnError = boolean;
|
|
1039
1069
|
/**
|
|
1040
1070
|
* Evaluator scorecard for a single eval case run.
|
|
1041
1071
|
*/
|
|
@@ -1093,6 +1123,14 @@ interface EvaluationResult {
|
|
|
1093
1123
|
readonly costLimited?: boolean;
|
|
1094
1124
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
1095
1125
|
readonly budgetExceeded?: boolean;
|
|
1126
|
+
/** Primary classification: ok, quality_failure, or execution_error */
|
|
1127
|
+
readonly executionStatus: ExecutionStatus;
|
|
1128
|
+
/** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
|
|
1129
|
+
readonly failureStage?: FailureStage;
|
|
1130
|
+
/** Machine-readable failure reason code (only when executionStatus !== 'ok') */
|
|
1131
|
+
readonly failureReasonCode?: string;
|
|
1132
|
+
/** Structured error detail (only when executionStatus === 'execution_error') */
|
|
1133
|
+
readonly executionError?: ExecutionError;
|
|
1096
1134
|
}
|
|
1097
1135
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
1098
1136
|
interface EvaluatorResult {
|
|
@@ -1162,6 +1200,7 @@ type ExecutionDefaults = {
|
|
|
1162
1200
|
readonly otel_file?: string;
|
|
1163
1201
|
};
|
|
1164
1202
|
type AgentVConfig$1 = {
|
|
1203
|
+
readonly required_version?: string;
|
|
1165
1204
|
readonly guideline_patterns?: readonly string[];
|
|
1166
1205
|
readonly eval_patterns?: readonly string[];
|
|
1167
1206
|
readonly execution?: ExecutionDefaults;
|
|
@@ -1206,6 +1245,12 @@ interface CacheConfig {
|
|
|
1206
1245
|
* Returns undefined when no cache config is specified.
|
|
1207
1246
|
*/
|
|
1208
1247
|
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1248
|
+
/**
|
|
1249
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1250
|
+
* Accepts `true` or `false`.
|
|
1251
|
+
* Returns undefined when not specified.
|
|
1252
|
+
*/
|
|
1253
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1209
1254
|
|
|
1210
1255
|
/**
|
|
1211
1256
|
* Formatting mode for segment content.
|
|
@@ -1265,6 +1310,8 @@ type EvalSuiteResult = {
|
|
|
1265
1310
|
readonly metadata?: EvalMetadata;
|
|
1266
1311
|
/** Suite-level total cost budget in USD */
|
|
1267
1312
|
readonly totalBudgetUsd?: number;
|
|
1313
|
+
/** Execution error tolerance: true or false */
|
|
1314
|
+
readonly failOnError?: FailOnError;
|
|
1268
1315
|
};
|
|
1269
1316
|
/**
|
|
1270
1317
|
* Load tests and suite metadata from a single parse.
|
|
@@ -2367,6 +2414,8 @@ interface RunEvalCaseOptions {
|
|
|
2367
2414
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2368
2415
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2369
2416
|
readonly repoManager?: RepoManager;
|
|
2417
|
+
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2418
|
+
readonly evalDir?: string;
|
|
2370
2419
|
}
|
|
2371
2420
|
interface ProgressEvent {
|
|
2372
2421
|
readonly workerId: number;
|
|
@@ -2406,6 +2455,8 @@ interface RunEvaluationOptions {
|
|
|
2406
2455
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2407
2456
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2408
2457
|
readonly totalBudgetUsd?: number;
|
|
2458
|
+
/** Execution error tolerance: true halts on first error */
|
|
2459
|
+
readonly failOnError?: FailOnError;
|
|
2409
2460
|
}
|
|
2410
2461
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2411
2462
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2863,6 +2914,8 @@ interface ScriptExecutionContext {
|
|
|
2863
2914
|
readonly evalRunId: string;
|
|
2864
2915
|
readonly caseInput?: string;
|
|
2865
2916
|
readonly caseMetadata?: Record<string, unknown>;
|
|
2917
|
+
/** Directory containing the eval YAML file. Used as default cwd. */
|
|
2918
|
+
readonly evalDir?: string;
|
|
2866
2919
|
}
|
|
2867
2920
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2868
2921
|
/**
|
|
@@ -3120,4 +3173,4 @@ type AgentKernel = {
|
|
|
3120
3173
|
};
|
|
3121
3174
|
declare function createAgentKernel(): AgentKernel;
|
|
3122
3175
|
|
|
3123
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3176
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|