@agentv/core 2.11.4 → 2.13.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -1004,6 +1004,12 @@ interface TrialResult {
1004
1004
  readonly scores?: readonly EvaluatorResult[];
1005
1005
  readonly error?: string;
1006
1006
  readonly costUsd?: number;
1007
+ /** Primary classification for this trial attempt */
1008
+ readonly executionStatus?: ExecutionStatus;
1009
+ /** Pipeline stage where failure occurred */
1010
+ readonly failureStage?: FailureStage;
1011
+ /** Machine-readable failure reason code */
1012
+ readonly failureReasonCode?: string;
1007
1013
  }
1008
1014
  /**
1009
1015
  * Aggregation metadata for pass_at_k strategy.
@@ -1036,6 +1042,30 @@ interface ConfidenceIntervalAggregation {
1036
1042
  * Discriminated union of trial aggregation results.
1037
1043
  */
1038
1044
  type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
1045
+ /**
1046
+ * Primary classification of evaluation outcome.
1047
+ * - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
1048
+ * - 'quality_failure': evaluation completed but model scored below threshold
1049
+ * - 'execution_error': evaluation could not complete due to infrastructure/tooling error
1050
+ */
1051
+ type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
1052
+ /**
1053
+ * Pipeline stage where the failure occurred.
1054
+ */
1055
+ type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
1056
+ /**
1057
+ * Structured error detail for execution failures.
1058
+ */
1059
+ interface ExecutionError {
1060
+ readonly message: string;
1061
+ readonly stage: FailureStage;
1062
+ }
1063
+ /**
1064
+ * Tolerance for execution errors in an eval run.
1065
+ * - `true`: halt on first execution error
1066
+ * - `false`: never halt on errors (default)
1067
+ */
1068
+ type FailOnError = boolean;
1039
1069
  /**
1040
1070
  * Evaluator scorecard for a single eval case run.
1041
1071
  */
@@ -1093,6 +1123,14 @@ interface EvaluationResult {
1093
1123
  readonly costLimited?: boolean;
1094
1124
  /** Whether the evaluation was skipped due to suite-level budget exhaustion */
1095
1125
  readonly budgetExceeded?: boolean;
1126
+ /** Primary classification: ok, quality_failure, or execution_error */
1127
+ readonly executionStatus: ExecutionStatus;
1128
+ /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
1129
+ readonly failureStage?: FailureStage;
1130
+ /** Machine-readable failure reason code (only when executionStatus !== 'ok') */
1131
+ readonly failureReasonCode?: string;
1132
+ /** Structured error detail (only when executionStatus === 'execution_error') */
1133
+ readonly executionError?: ExecutionError;
1096
1134
  }
1097
1135
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
1098
1136
  interface EvaluatorResult {
@@ -1162,6 +1200,7 @@ type ExecutionDefaults = {
1162
1200
  readonly otel_file?: string;
1163
1201
  };
1164
1202
  type AgentVConfig$1 = {
1203
+ readonly required_version?: string;
1165
1204
  readonly guideline_patterns?: readonly string[];
1166
1205
  readonly eval_patterns?: readonly string[];
1167
1206
  readonly execution?: ExecutionDefaults;
@@ -1206,6 +1245,12 @@ interface CacheConfig {
1206
1245
  * Returns undefined when no cache config is specified.
1207
1246
  */
1208
1247
  declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1248
+ /**
1249
+ * Extract `execution.fail_on_error` from parsed eval suite.
1250
+ * Accepts `true` or `false`.
1251
+ * Returns undefined when not specified.
1252
+ */
1253
+ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1209
1254
 
1210
1255
  /**
1211
1256
  * Formatting mode for segment content.
@@ -1265,6 +1310,8 @@ type EvalSuiteResult = {
1265
1310
  readonly metadata?: EvalMetadata;
1266
1311
  /** Suite-level total cost budget in USD */
1267
1312
  readonly totalBudgetUsd?: number;
1313
+ /** Execution error tolerance: true or false */
1314
+ readonly failOnError?: FailOnError;
1268
1315
  };
1269
1316
  /**
1270
1317
  * Load tests and suite metadata from a single parse.
@@ -2367,6 +2414,8 @@ interface RunEvalCaseOptions {
2367
2414
  readonly typeRegistry?: EvaluatorRegistry;
2368
2415
  /** RepoManager instance for repo lifecycle (shared workspace mode) */
2369
2416
  readonly repoManager?: RepoManager;
2417
+ /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
2418
+ readonly evalDir?: string;
2370
2419
  }
2371
2420
  interface ProgressEvent {
2372
2421
  readonly workerId: number;
@@ -2406,6 +2455,8 @@ interface RunEvaluationOptions {
2406
2455
  readonly streamCallbacks?: ProviderStreamCallbacks;
2407
2456
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
2408
2457
  readonly totalBudgetUsd?: number;
2458
+ /** Execution error tolerance: true halts on first error */
2459
+ readonly failOnError?: FailOnError;
2409
2460
  }
2410
2461
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2411
2462
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2863,6 +2914,8 @@ interface ScriptExecutionContext {
2863
2914
  readonly evalRunId: string;
2864
2915
  readonly caseInput?: string;
2865
2916
  readonly caseMetadata?: Record<string, unknown>;
2917
+ /** Directory containing the eval YAML file. Used as default cwd. */
2918
+ readonly evalDir?: string;
2866
2919
  }
2867
2920
  type ScriptFailureMode = 'fatal' | 'warn';
2868
2921
  /**
@@ -3120,4 +3173,4 @@ type AgentKernel = {
3120
3173
  };
3121
3174
  declare function createAgentKernel(): AgentKernel;
3122
3175
 
3123
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, 
type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3176
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type 
GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type 
WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, 
subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -1004,6 +1004,12 @@ interface TrialResult {
1004
1004
  readonly scores?: readonly EvaluatorResult[];
1005
1005
  readonly error?: string;
1006
1006
  readonly costUsd?: number;
1007
+ /** Primary classification for this trial attempt */
1008
+ readonly executionStatus?: ExecutionStatus;
1009
+ /** Pipeline stage where failure occurred */
1010
+ readonly failureStage?: FailureStage;
1011
+ /** Machine-readable failure reason code */
1012
+ readonly failureReasonCode?: string;
1007
1013
  }
1008
1014
  /**
1009
1015
  * Aggregation metadata for pass_at_k strategy.
@@ -1036,6 +1042,30 @@ interface ConfidenceIntervalAggregation {
1036
1042
  * Discriminated union of trial aggregation results.
1037
1043
  */
1038
1044
  type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
1045
+ /**
1046
+ * Primary classification of evaluation outcome.
1047
+ * - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
1048
+ * - 'quality_failure': evaluation completed but model scored below threshold
1049
+ * - 'execution_error': evaluation could not complete due to infrastructure/tooling error
1050
+ */
1051
+ type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
1052
+ /**
1053
+ * Pipeline stage where the failure occurred.
1054
+ */
1055
+ type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
1056
+ /**
1057
+ * Structured error detail for execution failures.
1058
+ */
1059
+ interface ExecutionError {
1060
+ readonly message: string;
1061
+ readonly stage: FailureStage;
1062
+ }
1063
+ /**
1064
+ * Tolerance for execution errors in an eval run.
1065
+ * - `true`: halt on first execution error
1066
+ * - `false`: never halt on errors (default)
1067
+ */
1068
+ type FailOnError = boolean;
1039
1069
  /**
1040
1070
  * Evaluator scorecard for a single eval case run.
1041
1071
  */
@@ -1093,6 +1123,14 @@ interface EvaluationResult {
1093
1123
  readonly costLimited?: boolean;
1094
1124
  /** Whether the evaluation was skipped due to suite-level budget exhaustion */
1095
1125
  readonly budgetExceeded?: boolean;
1126
+ /** Primary classification: ok, quality_failure, or execution_error */
1127
+ readonly executionStatus: ExecutionStatus;
1128
+ /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
1129
+ readonly failureStage?: FailureStage;
1130
+ /** Machine-readable failure reason code (only when executionStatus !== 'ok') */
1131
+ readonly failureReasonCode?: string;
1132
+ /** Structured error detail (only when executionStatus === 'execution_error') */
1133
+ readonly executionError?: ExecutionError;
1096
1134
  }
1097
1135
  type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
1098
1136
  interface EvaluatorResult {
@@ -1162,6 +1200,7 @@ type ExecutionDefaults = {
1162
1200
  readonly otel_file?: string;
1163
1201
  };
1164
1202
  type AgentVConfig$1 = {
1203
+ readonly required_version?: string;
1165
1204
  readonly guideline_patterns?: readonly string[];
1166
1205
  readonly eval_patterns?: readonly string[];
1167
1206
  readonly execution?: ExecutionDefaults;
@@ -1206,6 +1245,12 @@ interface CacheConfig {
1206
1245
  * Returns undefined when no cache config is specified.
1207
1246
  */
1208
1247
  declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1248
+ /**
1249
+ * Extract `execution.fail_on_error` from parsed eval suite.
1250
+ * Accepts `true` or `false`.
1251
+ * Returns undefined when not specified.
1252
+ */
1253
+ declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1209
1254
 
1210
1255
  /**
1211
1256
  * Formatting mode for segment content.
@@ -1265,6 +1310,8 @@ type EvalSuiteResult = {
1265
1310
  readonly metadata?: EvalMetadata;
1266
1311
  /** Suite-level total cost budget in USD */
1267
1312
  readonly totalBudgetUsd?: number;
1313
+ /** Execution error tolerance: true or false */
1314
+ readonly failOnError?: FailOnError;
1268
1315
  };
1269
1316
  /**
1270
1317
  * Load tests and suite metadata from a single parse.
@@ -2367,6 +2414,8 @@ interface RunEvalCaseOptions {
2367
2414
  readonly typeRegistry?: EvaluatorRegistry;
2368
2415
  /** RepoManager instance for repo lifecycle (shared workspace mode) */
2369
2416
  readonly repoManager?: RepoManager;
2417
+ /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
2418
+ readonly evalDir?: string;
2370
2419
  }
2371
2420
  interface ProgressEvent {
2372
2421
  readonly workerId: number;
@@ -2406,6 +2455,8 @@ interface RunEvaluationOptions {
2406
2455
  readonly streamCallbacks?: ProviderStreamCallbacks;
2407
2456
  /** Suite-level total cost budget in USD (stops dispatching when exceeded) */
2408
2457
  readonly totalBudgetUsd?: number;
2458
+ /** Execution error tolerance: true halts on first error */
2459
+ readonly failOnError?: FailOnError;
2409
2460
  }
2410
2461
  declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
2411
2462
  declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
@@ -2863,6 +2914,8 @@ interface ScriptExecutionContext {
2863
2914
  readonly evalRunId: string;
2864
2915
  readonly caseInput?: string;
2865
2916
  readonly caseMetadata?: Record<string, unknown>;
2917
+ /** Directory containing the eval YAML file. Used as default cwd. */
2918
+ readonly evalDir?: string;
2866
2919
  }
2867
2920
  type ScriptFailureMode = 'fatal' | 'warn';
2868
2921
  /**
@@ -3120,4 +3173,4 @@ type AgentKernel = {
3120
3173
  };
3121
3174
  declare function createAgentKernel(): AgentKernel;
3122
3175
 
3123
- export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, 
type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
3176
+ export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type 
GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type 
WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, 
subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };