@agentv/core 2.11.4 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1004,6 +1004,12 @@ interface TrialResult {
1004
1004
  readonly scores?: readonly EvaluatorResult[];
1005
1005
  readonly error?: string;
1006
1006
  readonly costUsd?: number;
1007
+ /** Primary classification for this trial attempt */
1008
+ readonly executionStatus?: ExecutionStatus;
1009
+ /** Pipeline stage where failure occurred */
1010
+ readonly failureStage?: FailureStage;
1011
+ /** Machine-readable failure reason code */
1012
+ readonly failureReasonCode?: string;
1007
1013
  }
1008
1014
  /**
1009
1015
  * Aggregation metadata for pass_at_k strategy.
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
1036
1042
  * Discriminated union of trial aggregation results.
1037
1043
  */
1038
1044
  type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
1045
+ /**
1046
+ * Primary classification of evaluation outcome.
1047
+ * - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
1048
+ * - 'quality_failure': evaluation completed but model scored below threshold
1049
+ * - 'execution_error': evaluation could not complete due to infrastructure/tooling error
1050
+ */
1051
+ type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
1052
+ /**
1053
+ * Pipeline stage where the failure occurred.
1054
+ */
1055
+ type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
1056
+ /**
1057
+ * Structured error detail for execution failures.
1058
+ */
1059
+ interface ExecutionError {
1060
+ readonly message: string;
1061
+ readonly stage: FailureStage;
1062
+ }
1039
1063
  /**
1040
1064
  * Evaluator scorecard for a single eval case run.
1041
1065
  */
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
1093
1117
  readonly costLimited?: boolean;
1094
1118
  /** Whether the evaluation was skipped due to suite-level budget exhaustion */
1095
1119
  readonly budgetExceeded?: boolean;
1120
+ /** Primary classification: ok, quality_failure, or execution_error */
1121
+ readonly executionStatus: ExecutionStatus;
1122
+ /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
1123
+ readonly failureStage?: FailureStage;
1124
+ /** Machine-readable failure reason code (only when executionStatus !== 'ok') */
1125
+ readonly failureReasonCode?: string;
1126
+ /** Structured error detail (only when executionStatus === 'execution_error') */
1127
+ readonly executionError?: ExecutionError;
1096
1128
  }
1097
1129
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
1098
1130
  interface EvaluatorResult {
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
2367
2399
  readonly typeRegistry?: EvaluatorRegistry;
2368
2400
  /** RepoManager instance for repo lifecycle (shared workspace mode) */
2369
2401
  readonly repoManager?: RepoManager;
2402
+ /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
2403
+ readonly evalDir?: string;
2370
2404
  }
2371
2405
  interface ProgressEvent {
2372
2406
  readonly workerId: number;
@@ -2863,6 +2897,8 @@ interface ScriptExecutionContext {
2863
2897
  readonly evalRunId: string;
2864
2898
  readonly caseInput?: string;
2865
2899
  readonly caseMetadata?: Record<string, unknown>;
2900
+ /** Directory containing the eval YAML file. Used as default cwd. */
2901
+ readonly evalDir?: string;
2866
2902
  }
2867
2903
  type ScriptFailureMode = 'fatal' | 'warn';
2868
2904
  /**
@@ -3120,4 +3156,4 @@ type AgentKernel = {
3120
3156
  };
3121
3157
  declare function createAgentKernel(): AgentKernel;
3122
3158
 
3123
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, 
type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3159
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -1004,6 +1004,12 @@ interface TrialResult {
1004
1004
  readonly scores?: readonly EvaluatorResult[];
1005
1005
  readonly error?: string;
1006
1006
  readonly costUsd?: number;
1007
+ /** Primary classification for this trial attempt */
1008
+ readonly executionStatus?: ExecutionStatus;
1009
+ /** Pipeline stage where failure occurred */
1010
+ readonly failureStage?: FailureStage;
1011
+ /** Machine-readable failure reason code */
1012
+ readonly failureReasonCode?: string;
1007
1013
  }
1008
1014
  /**
1009
1015
  * Aggregation metadata for pass_at_k strategy.
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
1036
1042
  * Discriminated union of trial aggregation results.
1037
1043
  */
1038
1044
  type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
1045
+ /**
1046
+ * Primary classification of evaluation outcome.
1047
+ * - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
1048
+ * - 'quality_failure': evaluation completed but model scored below threshold
1049
+ * - 'execution_error': evaluation could not complete due to infrastructure/tooling error
1050
+ */
1051
+ type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
1052
+ /**
1053
+ * Pipeline stage where the failure occurred.
1054
+ */
1055
+ type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
1056
+ /**
1057
+ * Structured error detail for execution failures.
1058
+ */
1059
+ interface ExecutionError {
1060
+ readonly message: string;
1061
+ readonly stage: FailureStage;
1062
+ }
1039
1063
  /**
1040
1064
  * Evaluator scorecard for a single eval case run.
1041
1065
  */
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
1093
1117
  readonly costLimited?: boolean;
1094
1118
  /** Whether the evaluation was skipped due to suite-level budget exhaustion */
1095
1119
  readonly budgetExceeded?: boolean;
1120
+ /** Primary classification: ok, quality_failure, or execution_error */
1121
+ readonly executionStatus: ExecutionStatus;
1122
+ /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
1123
+ readonly failureStage?: FailureStage;
1124
+ /** Machine-readable failure reason code (only when executionStatus !== 'ok') */
1125
+ readonly failureReasonCode?: string;
1126
+ /** Structured error detail (only when executionStatus === 'execution_error') */
1127
+ readonly executionError?: ExecutionError;
1096
1128
  }
1097
1129
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
1098
1130
  interface EvaluatorResult {
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
2367
2399
  readonly typeRegistry?: EvaluatorRegistry;
2368
2400
  /** RepoManager instance for repo lifecycle (shared workspace mode) */
2369
2401
  readonly repoManager?: RepoManager;
2402
+ /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
2403
+ readonly evalDir?: string;
2370
2404
  }
2371
2405
  interface ProgressEvent {
2372
2406
  readonly workerId: number;
@@ -2863,6 +2897,8 @@ interface ScriptExecutionContext {
2863
2897
  readonly evalRunId: string;
2864
2898
  readonly caseInput?: string;
2865
2899
  readonly caseMetadata?: Record<string, unknown>;
2900
+ /** Directory containing the eval YAML file. Used as default cwd. */
2901
+ readonly evalDir?: string;
2866
2902
  }
2867
2903
  type ScriptFailureMode = 'fatal' | 'warn';
2868
2904
  /**
@@ -3120,4 +3156,4 @@ type AgentKernel = {
3120
3156
  };
3121
3157
  declare function createAgentKernel(): AgentKernel;
3122
3158
 
3123
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, 
type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3159
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-REN5PS7B.js";
20
+ } from "./chunk-7HPKTRFZ.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -12847,6 +12847,16 @@ async function resolveWorkspaceTemplate(templatePath) {
12847
12847
  }
12848
12848
 
12849
12849
  // src/evaluation/workspace/script-executor.ts
12850
/**
 * Expands `{{name}}` placeholders inside each command argument using values
 * taken from the workspace script context.
 *
 * Recognized placeholders:
 *   - `workspace_path` -> context.workspacePath
 *   - `test_id`        -> context.testId
 *   - `eval_run_id`    -> context.evalRunId
 *   - `case_input`     -> context.caseInput, or "" when null/undefined
 *   - `case_metadata`  -> JSON.stringify(context.caseMetadata), or "" when falsy
 *
 * A placeholder is left verbatim when its name is not one of the above, or when
 * the corresponding context value is null/undefined.
 *
 * @param args - Command argument strings to interpolate; the array is not mutated.
 * @param context - Script context supplying the substitution values.
 * @returns A new array with the placeholders in every argument expanded.
 */
function interpolateArgs(args, context) {
  const vars = {
    workspace_path: context.workspacePath,
    test_id: context.testId,
    eval_run_id: context.evalRunId,
    case_input: context.caseInput ?? "",
    case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
  };
  return args.map(
    (arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => {
      // Only substitute names declared directly on `vars`. A plain `vars[name]`
      // lookup would also resolve inherited Object.prototype members such as
      // `toString`, `constructor` or `__proto__`, injecting their string
      // coercion into the command line instead of keeping the placeholder.
      if (!Object.prototype.hasOwnProperty.call(vars, name)) {
        return match;
      }
      return vars[name] ?? match;
    })
  );
}
12850
12860
  async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12851
12861
  const stdin = JSON.stringify({
12852
12862
  workspace_path: context.workspacePath,
@@ -12856,8 +12866,9 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12856
12866
  case_metadata: context.caseMetadata ?? null
12857
12867
  });
12858
12868
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
12859
- const cwd = config.cwd;
12860
- const commandArray = config.command ?? config.script ?? [];
12869
+ const cwd = config.cwd ?? context.evalDir;
12870
+ const rawCommand = config.command ?? config.script ?? [];
12871
+ const commandArray = interpolateArgs(rawCommand, context);
12861
12872
  const result = await execFileWithStdin(commandArray, stdin, {
12862
12873
  timeoutMs,
12863
12874
  cwd
@@ -12874,6 +12885,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12874
12885
  }
12875
12886
 
12876
12887
  // src/evaluation/orchestrator.ts
12888
/** Minimum score (inclusive) at which a completed evaluation is classified as "ok". */
var QUALITY_PASS_THRESHOLD = 0.8;

/**
 * Maps a completed evaluation's score to its quality classification.
 *
 * @param score - Score produced by a completed evaluation.
 * @returns "ok" when the score is at least QUALITY_PASS_THRESHOLD, otherwise
 *   "quality_failure" (this includes a NaN score, since the comparison is false).
 */
function classifyQualityStatus(score) {
  const meetsThreshold = score >= QUALITY_PASS_THRESHOLD;
  if (meetsThreshold) {
    return "ok";
  }
  return "quality_failure";
}
12877
12892
  function usesFileReferencePrompt(provider) {
12878
12893
  return isAgentProvider(provider) || provider.kind === "cli";
12879
12894
  }
@@ -12981,6 +12996,7 @@ async function runEvaluation(options) {
12981
12996
  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
12982
12997
  const typeRegistry = createBuiltinRegistry();
12983
12998
  const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
12999
+ const evalDir = discoveryBaseDir;
12984
13000
  await discoverAssertions(typeRegistry, discoveryBaseDir);
12985
13001
  const providerRegistry = createBuiltinProviderRegistry();
12986
13002
  await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -13076,7 +13092,8 @@ async function runEvaluation(options) {
13076
13092
  const scriptContext = {
13077
13093
  workspacePath: sharedWorkspacePath,
13078
13094
  testId: "__before_all__",
13079
- evalRunId
13095
+ evalRunId,
13096
+ evalDir
13080
13097
  };
13081
13098
  try {
13082
13099
  beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -13115,7 +13132,14 @@ async function runEvaluation(options) {
13115
13132
  answer: "",
13116
13133
  target: target.name,
13117
13134
  error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13118
- budgetExceeded: true
13135
+ budgetExceeded: true,
13136
+ executionStatus: "execution_error",
13137
+ failureStage: "setup",
13138
+ failureReasonCode: "budget_exceeded",
13139
+ executionError: {
13140
+ message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
13141
+ stage: "setup"
13142
+ }
13119
13143
  };
13120
13144
  if (onProgress) {
13121
13145
  await onProgress({
@@ -13162,7 +13186,8 @@ async function runEvaluation(options) {
13162
13186
  suiteWorkspaceFile,
13163
13187
  streamCallbacks,
13164
13188
  typeRegistry,
13165
- repoManager
13189
+ repoManager,
13190
+ evalDir
13166
13191
  };
13167
13192
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
13168
13193
  if (totalBudgetUsd !== void 0) {
@@ -13231,7 +13256,9 @@ async function runEvaluation(options) {
13231
13256
  (now ?? (() => /* @__PURE__ */ new Date()))(),
13232
13257
  outcome.reason,
13233
13258
  promptInputs,
13234
- primaryProvider
13259
+ primaryProvider,
13260
+ "agent",
13261
+ "provider_error"
13235
13262
  );
13236
13263
  results.push(errorResult);
13237
13264
  if (onResult) {
@@ -13243,7 +13270,8 @@ async function runEvaluation(options) {
13243
13270
  const scriptContext = {
13244
13271
  workspacePath: sharedWorkspacePath,
13245
13272
  testId: "__after_all__",
13246
- evalRunId
13273
+ evalRunId,
13274
+ evalDir
13247
13275
  };
13248
13276
  try {
13249
13277
  const afterAllOutput = await executeWorkspaceScript(
@@ -13373,7 +13401,14 @@ async function runBatchEvaluation(options) {
13373
13401
  availableTargets
13374
13402
  });
13375
13403
  if (providerError) {
13376
- result = { ...result, error: providerError };
13404
+ result = {
13405
+ ...result,
13406
+ error: providerError,
13407
+ executionStatus: "execution_error",
13408
+ failureStage: "agent",
13409
+ failureReasonCode: "provider_error",
13410
+ executionError: { message: providerError, stage: "agent" }
13411
+ };
13377
13412
  }
13378
13413
  } catch (error) {
13379
13414
  const errorResult = buildErrorResult(
@@ -13382,7 +13417,9 @@ async function runBatchEvaluation(options) {
13382
13417
  nowFn(),
13383
13418
  error,
13384
13419
  promptInputs,
13385
- provider
13420
+ provider,
13421
+ "evaluator",
13422
+ "evaluator_error"
13386
13423
  );
13387
13424
  results.push(errorResult);
13388
13425
  if (onResult) {
@@ -13438,7 +13475,8 @@ async function runEvalCase(options) {
13438
13475
  sharedBaselineCommit,
13439
13476
  suiteWorkspaceFile,
13440
13477
  typeRegistry: providedTypeRegistry,
13441
- repoManager
13478
+ repoManager,
13479
+ evalDir
13442
13480
  } = options;
13443
13481
  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
13444
13482
  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -13471,7 +13509,9 @@ async function runEvalCase(options) {
13471
13509
  nowFn(),
13472
13510
  new Error(`Failed to create workspace: ${message}`),
13473
13511
  promptInputs,
13474
- provider
13512
+ provider,
13513
+ "setup",
13514
+ "template_error"
13475
13515
  );
13476
13516
  }
13477
13517
  }
@@ -13491,7 +13531,9 @@ async function runEvalCase(options) {
13491
13531
  nowFn(),
13492
13532
  new Error(`Failed to materialize repos: ${message}`),
13493
13533
  promptInputs,
13494
- provider
13534
+ provider,
13535
+ "repo_setup",
13536
+ "clone_error"
13495
13537
  );
13496
13538
  }
13497
13539
  }
@@ -13501,7 +13543,8 @@ async function runEvalCase(options) {
13501
13543
  testId: evalCase.id,
13502
13544
  evalRunId: evalRunId ?? "",
13503
13545
  caseInput: evalCase.question,
13504
- caseMetadata: evalCase.metadata
13546
+ caseMetadata: evalCase.metadata,
13547
+ evalDir
13505
13548
  };
13506
13549
  try {
13507
13550
  beforeAllOutput = await executeWorkspaceScript(
@@ -13520,7 +13563,9 @@ async function runEvalCase(options) {
13520
13563
  nowFn(),
13521
13564
  new Error(`before_all script failed: ${message}`),
13522
13565
  promptInputs,
13523
- provider
13566
+ provider,
13567
+ "setup",
13568
+ "script_error"
13524
13569
  );
13525
13570
  }
13526
13571
  }
@@ -13531,7 +13576,8 @@ async function runEvalCase(options) {
13531
13576
  testId: evalCase.id,
13532
13577
  evalRunId: evalRunId ?? "",
13533
13578
  caseInput: evalCase.question,
13534
- caseMetadata: evalCase.metadata
13579
+ caseMetadata: evalCase.metadata,
13580
+ evalDir
13535
13581
  };
13536
13582
  try {
13537
13583
  beforeEachOutput = await executeWorkspaceScript(
@@ -13546,7 +13592,9 @@ async function runEvalCase(options) {
13546
13592
  nowFn(),
13547
13593
  new Error(`before_each script failed: ${message}`),
13548
13594
  promptInputs,
13549
- provider
13595
+ provider,
13596
+ "setup",
13597
+ "script_error"
13550
13598
  );
13551
13599
  }
13552
13600
  }
@@ -13587,7 +13635,9 @@ async function runEvalCase(options) {
13587
13635
  nowFn(),
13588
13636
  error,
13589
13637
  promptInputs,
13590
- provider
13638
+ provider,
13639
+ "agent",
13640
+ "provider_error"
13591
13641
  );
13592
13642
  if (workspacePath) {
13593
13643
  if (forceCleanup) {
@@ -13606,7 +13656,9 @@ async function runEvalCase(options) {
13606
13656
  nowFn(),
13607
13657
  lastError ?? new Error("Provider did not return a response"),
13608
13658
  promptInputs,
13609
- provider
13659
+ provider,
13660
+ "agent",
13661
+ "provider_error"
13610
13662
  );
13611
13663
  if (workspacePath) {
13612
13664
  if (forceCleanup) {
@@ -13662,7 +13714,8 @@ async function runEvalCase(options) {
13662
13714
  testId: evalCase.id,
13663
13715
  evalRunId: evalRunId ?? "",
13664
13716
  caseInput: evalCase.question,
13665
- caseMetadata: evalCase.metadata
13717
+ caseMetadata: evalCase.metadata,
13718
+ evalDir
13666
13719
  };
13667
13720
  try {
13668
13721
  afterEachOutput = await executeWorkspaceScript(
@@ -13698,7 +13751,18 @@ async function runEvalCase(options) {
13698
13751
  fileChanges,
13699
13752
  workspacePath
13700
13753
  });
13701
- const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
13754
+ const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
13755
+ const finalResult = providerError ? {
13756
+ ...result,
13757
+ error: providerError,
13758
+ executionStatus,
13759
+ failureStage: "agent",
13760
+ failureReasonCode: "provider_error",
13761
+ executionError: { message: providerError, stage: "agent" },
13762
+ beforeAllOutput,
13763
+ beforeEachOutput,
13764
+ afterEachOutput
13765
+ } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
13702
13766
  const isFailure = !!finalResult.error || finalResult.score < 0.5;
13703
13767
  if (workspacePath && !isSharedWorkspace) {
13704
13768
  if (forceCleanup) {
@@ -13719,7 +13783,9 @@ async function runEvalCase(options) {
13719
13783
  nowFn(),
13720
13784
  error,
13721
13785
  promptInputs,
13722
- provider
13786
+ provider,
13787
+ "evaluator",
13788
+ "evaluator_error"
13723
13789
  );
13724
13790
  if (workspacePath && !isSharedWorkspace) {
13725
13791
  if (forceCleanup) {
@@ -13757,7 +13823,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13757
13823
  verdict: trialVerdict,
13758
13824
  scores: result.scores,
13759
13825
  error: result.error,
13760
- costUsd: trialCost
13826
+ costUsd: trialCost,
13827
+ executionStatus: result.executionStatus,
13828
+ failureStage: result.failureStage,
13829
+ failureReasonCode: result.failureReasonCode
13761
13830
  };
13762
13831
  trialResults.push(trial);
13763
13832
  if (trialCost !== void 0) {
@@ -13782,12 +13851,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
13782
13851
  0
13783
13852
  );
13784
13853
  const baseResult = allResults[bestTrialIndex];
13854
+ const hasOk = trialResults.some((t) => t.executionStatus === "ok");
13855
+ const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
13856
+ const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
13857
+ const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
13858
+ const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
13859
+ const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
13785
13860
  return {
13786
13861
  ...baseResult,
13787
13862
  score,
13788
13863
  trials: trialResults,
13789
13864
  aggregation,
13790
- costLimited: costLimited || void 0
13865
+ costLimited: costLimited || void 0,
13866
+ executionStatus: aggregateExecutionStatus,
13867
+ failureStage: aggregateFailureStage,
13868
+ failureReasonCode: aggregateFailureReasonCode,
13869
+ executionError: aggregateExecutionError
13791
13870
  };
13792
13871
  }
13793
13872
  async function evaluateCandidate(options) {
@@ -13888,7 +13967,8 @@ async function evaluateCandidate(options) {
13888
13967
  scores,
13889
13968
  trace,
13890
13969
  output,
13891
- fileChanges
13970
+ fileChanges,
13971
+ executionStatus: classifyQualityStatus(score.score)
13892
13972
  };
13893
13973
  }
13894
13974
  async function runEvaluatorsForCase(options) {
@@ -14193,7 +14273,7 @@ async function invokeProvider(provider, options) {
14193
14273
  }
14194
14274
  }
14195
14275
  }
14196
- function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
14276
+ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
14197
14277
  const message = error instanceof Error ? error.message : String(error);
14198
14278
  let agentRequest;
14199
14279
  let lmRequest;
@@ -14236,7 +14316,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
14236
14316
  target: targetName,
14237
14317
  requests,
14238
14318
  input,
14239
- error: message
14319
+ error: message,
14320
+ executionStatus: "execution_error",
14321
+ failureStage,
14322
+ failureReasonCode,
14323
+ executionError: { message, stage: failureStage }
14240
14324
  };
14241
14325
  }
14242
14326
  function extractProviderError(response) {