@agentv/core 2.11.4 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-REN5PS7B.js → chunk-7HPKTRFZ.js} +1 -1
- package/dist/chunk-7HPKTRFZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +110 -26
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +37 -1
- package/dist/index.d.ts +37 -1
- package/dist/index.js +111 -27
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-REN5PS7B.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1004,6 +1004,12 @@ interface TrialResult {
|
|
|
1004
1004
|
readonly scores?: readonly EvaluatorResult[];
|
|
1005
1005
|
readonly error?: string;
|
|
1006
1006
|
readonly costUsd?: number;
|
|
1007
|
+
/** Primary classification for this trial attempt */
|
|
1008
|
+
readonly executionStatus?: ExecutionStatus;
|
|
1009
|
+
/** Pipeline stage where failure occurred */
|
|
1010
|
+
readonly failureStage?: FailureStage;
|
|
1011
|
+
/** Machine-readable failure reason code */
|
|
1012
|
+
readonly failureReasonCode?: string;
|
|
1007
1013
|
}
|
|
1008
1014
|
/**
|
|
1009
1015
|
* Aggregation metadata for pass_at_k strategy.
|
|
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
|
|
|
1036
1042
|
* Discriminated union of trial aggregation results.
|
|
1037
1043
|
*/
|
|
1038
1044
|
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
1045
|
+
/**
|
|
1046
|
+
* Primary classification of evaluation outcome.
|
|
1047
|
+
* - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
|
|
1048
|
+
* - 'quality_failure': evaluation completed but model scored below threshold
|
|
1049
|
+
* - 'execution_error': evaluation could not complete due to infrastructure/tooling error
|
|
1050
|
+
*/
|
|
1051
|
+
type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
|
|
1052
|
+
/**
|
|
1053
|
+
* Pipeline stage where the failure occurred.
|
|
1054
|
+
*/
|
|
1055
|
+
type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
|
|
1056
|
+
/**
|
|
1057
|
+
* Structured error detail for execution failures.
|
|
1058
|
+
*/
|
|
1059
|
+
interface ExecutionError {
|
|
1060
|
+
readonly message: string;
|
|
1061
|
+
readonly stage: FailureStage;
|
|
1062
|
+
}
|
|
1039
1063
|
/**
|
|
1040
1064
|
* Evaluator scorecard for a single eval case run.
|
|
1041
1065
|
*/
|
|
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
|
|
|
1093
1117
|
readonly costLimited?: boolean;
|
|
1094
1118
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
1095
1119
|
readonly budgetExceeded?: boolean;
|
|
1120
|
+
/** Primary classification: ok, quality_failure, or execution_error */
|
|
1121
|
+
readonly executionStatus: ExecutionStatus;
|
|
1122
|
+
/** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
|
|
1123
|
+
readonly failureStage?: FailureStage;
|
|
1124
|
+
/** Machine-readable failure reason code (only when executionStatus !== 'ok') */
|
|
1125
|
+
readonly failureReasonCode?: string;
|
|
1126
|
+
/** Structured error detail (only when executionStatus === 'execution_error') */
|
|
1127
|
+
readonly executionError?: ExecutionError;
|
|
1096
1128
|
}
|
|
1097
1129
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
1098
1130
|
interface EvaluatorResult {
|
|
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
|
|
|
2367
2399
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2368
2400
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2369
2401
|
readonly repoManager?: RepoManager;
|
|
2402
|
+
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2403
|
+
readonly evalDir?: string;
|
|
2370
2404
|
}
|
|
2371
2405
|
interface ProgressEvent {
|
|
2372
2406
|
readonly workerId: number;
|
|
@@ -2863,6 +2897,8 @@ interface ScriptExecutionContext {
|
|
|
2863
2897
|
readonly evalRunId: string;
|
|
2864
2898
|
readonly caseInput?: string;
|
|
2865
2899
|
readonly caseMetadata?: Record<string, unknown>;
|
|
2900
|
+
/** Directory containing the eval YAML file. Used as default cwd. */
|
|
2901
|
+
readonly evalDir?: string;
|
|
2866
2902
|
}
|
|
2867
2903
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2868
2904
|
/**
|
|
@@ -3120,4 +3156,4 @@ type AgentKernel = {
|
|
|
3120
3156
|
};
|
|
3121
3157
|
declare function createAgentKernel(): AgentKernel;
|
|
3122
3158
|
|
|
3123
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type 
IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3159
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1004,6 +1004,12 @@ interface TrialResult {
|
|
|
1004
1004
|
readonly scores?: readonly EvaluatorResult[];
|
|
1005
1005
|
readonly error?: string;
|
|
1006
1006
|
readonly costUsd?: number;
|
|
1007
|
+
/** Primary classification for this trial attempt */
|
|
1008
|
+
readonly executionStatus?: ExecutionStatus;
|
|
1009
|
+
/** Pipeline stage where failure occurred */
|
|
1010
|
+
readonly failureStage?: FailureStage;
|
|
1011
|
+
/** Machine-readable failure reason code */
|
|
1012
|
+
readonly failureReasonCode?: string;
|
|
1007
1013
|
}
|
|
1008
1014
|
/**
|
|
1009
1015
|
* Aggregation metadata for pass_at_k strategy.
|
|
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
|
|
|
1036
1042
|
* Discriminated union of trial aggregation results.
|
|
1037
1043
|
*/
|
|
1038
1044
|
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
1045
|
+
/**
|
|
1046
|
+
* Primary classification of evaluation outcome.
|
|
1047
|
+
* - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
|
|
1048
|
+
* - 'quality_failure': evaluation completed but model scored below threshold
|
|
1049
|
+
* - 'execution_error': evaluation could not complete due to infrastructure/tooling error
|
|
1050
|
+
*/
|
|
1051
|
+
type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
|
|
1052
|
+
/**
|
|
1053
|
+
* Pipeline stage where the failure occurred.
|
|
1054
|
+
*/
|
|
1055
|
+
type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
|
|
1056
|
+
/**
|
|
1057
|
+
* Structured error detail for execution failures.
|
|
1058
|
+
*/
|
|
1059
|
+
interface ExecutionError {
|
|
1060
|
+
readonly message: string;
|
|
1061
|
+
readonly stage: FailureStage;
|
|
1062
|
+
}
|
|
1039
1063
|
/**
|
|
1040
1064
|
* Evaluator scorecard for a single eval case run.
|
|
1041
1065
|
*/
|
|
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
|
|
|
1093
1117
|
readonly costLimited?: boolean;
|
|
1094
1118
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
1095
1119
|
readonly budgetExceeded?: boolean;
|
|
1120
|
+
/** Primary classification: ok, quality_failure, or execution_error */
|
|
1121
|
+
readonly executionStatus: ExecutionStatus;
|
|
1122
|
+
/** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
|
|
1123
|
+
readonly failureStage?: FailureStage;
|
|
1124
|
+
/** Machine-readable failure reason code (only when executionStatus !== 'ok') */
|
|
1125
|
+
readonly failureReasonCode?: string;
|
|
1126
|
+
/** Structured error detail (only when executionStatus === 'execution_error') */
|
|
1127
|
+
readonly executionError?: ExecutionError;
|
|
1096
1128
|
}
|
|
1097
1129
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
1098
1130
|
interface EvaluatorResult {
|
|
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
|
|
|
2367
2399
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2368
2400
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2369
2401
|
readonly repoManager?: RepoManager;
|
|
2402
|
+
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2403
|
+
readonly evalDir?: string;
|
|
2370
2404
|
}
|
|
2371
2405
|
interface ProgressEvent {
|
|
2372
2406
|
readonly workerId: number;
|
|
@@ -2863,6 +2897,8 @@ interface ScriptExecutionContext {
|
|
|
2863
2897
|
readonly evalRunId: string;
|
|
2864
2898
|
readonly caseInput?: string;
|
|
2865
2899
|
readonly caseMetadata?: Record<string, unknown>;
|
|
2900
|
+
/** Directory containing the eval YAML file. Used as default cwd. */
|
|
2901
|
+
readonly evalDir?: string;
|
|
2866
2902
|
}
|
|
2867
2903
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2868
2904
|
/**
|
|
@@ -3120,4 +3156,4 @@ type AgentKernel = {
|
|
|
3120
3156
|
};
|
|
3121
3157
|
declare function createAgentKernel(): AgentKernel;
|
|
3122
3158
|
|
|
3123
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type 
IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3159
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-REN5PS7B.js";
|
|
20
|
+
} from "./chunk-7HPKTRFZ.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -12847,6 +12847,16 @@ async function resolveWorkspaceTemplate(templatePath) {
|
|
|
12847
12847
|
}
|
|
12848
12848
|
|
|
12849
12849
|
// src/evaluation/workspace/script-executor.ts
|
|
12850
|
+
function interpolateArgs(args, context) {
|
|
12851
|
+
const vars = {
|
|
12852
|
+
workspace_path: context.workspacePath,
|
|
12853
|
+
test_id: context.testId,
|
|
12854
|
+
eval_run_id: context.evalRunId,
|
|
12855
|
+
case_input: context.caseInput ?? "",
|
|
12856
|
+
case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
|
|
12857
|
+
};
|
|
12858
|
+
return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
|
|
12859
|
+
}
|
|
12850
12860
|
async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
12851
12861
|
const stdin = JSON.stringify({
|
|
12852
12862
|
workspace_path: context.workspacePath,
|
|
@@ -12856,8 +12866,9 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12856
12866
|
case_metadata: context.caseMetadata ?? null
|
|
12857
12867
|
});
|
|
12858
12868
|
const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
|
|
12859
|
-
const cwd = config.cwd;
|
|
12860
|
-
const
|
|
12869
|
+
const cwd = config.cwd ?? context.evalDir;
|
|
12870
|
+
const rawCommand = config.command ?? config.script ?? [];
|
|
12871
|
+
const commandArray = interpolateArgs(rawCommand, context);
|
|
12861
12872
|
const result = await execFileWithStdin(commandArray, stdin, {
|
|
12862
12873
|
timeoutMs,
|
|
12863
12874
|
cwd
|
|
@@ -12874,6 +12885,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
|
|
|
12874
12885
|
}
|
|
12875
12886
|
|
|
12876
12887
|
// src/evaluation/orchestrator.ts
|
|
12888
|
+
var QUALITY_PASS_THRESHOLD = 0.8;
|
|
12889
|
+
function classifyQualityStatus(score) {
|
|
12890
|
+
return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
|
|
12891
|
+
}
|
|
12877
12892
|
function usesFileReferencePrompt(provider) {
|
|
12878
12893
|
return isAgentProvider(provider) || provider.kind === "cli";
|
|
12879
12894
|
}
|
|
@@ -12981,6 +12996,7 @@ async function runEvaluation(options) {
|
|
|
12981
12996
|
const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
|
|
12982
12997
|
const typeRegistry = createBuiltinRegistry();
|
|
12983
12998
|
const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
|
|
12999
|
+
const evalDir = discoveryBaseDir;
|
|
12984
13000
|
await discoverAssertions(typeRegistry, discoveryBaseDir);
|
|
12985
13001
|
const providerRegistry = createBuiltinProviderRegistry();
|
|
12986
13002
|
await discoverProviders(providerRegistry, discoveryBaseDir);
|
|
@@ -13076,7 +13092,8 @@ async function runEvaluation(options) {
|
|
|
13076
13092
|
const scriptContext = {
|
|
13077
13093
|
workspacePath: sharedWorkspacePath,
|
|
13078
13094
|
testId: "__before_all__",
|
|
13079
|
-
evalRunId
|
|
13095
|
+
evalRunId,
|
|
13096
|
+
evalDir
|
|
13080
13097
|
};
|
|
13081
13098
|
try {
|
|
13082
13099
|
beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
|
|
@@ -13115,7 +13132,14 @@ async function runEvaluation(options) {
|
|
|
13115
13132
|
answer: "",
|
|
13116
13133
|
target: target.name,
|
|
13117
13134
|
error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13118
|
-
budgetExceeded: true
|
|
13135
|
+
budgetExceeded: true,
|
|
13136
|
+
executionStatus: "execution_error",
|
|
13137
|
+
failureStage: "setup",
|
|
13138
|
+
failureReasonCode: "budget_exceeded",
|
|
13139
|
+
executionError: {
|
|
13140
|
+
message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
|
|
13141
|
+
stage: "setup"
|
|
13142
|
+
}
|
|
13119
13143
|
};
|
|
13120
13144
|
if (onProgress) {
|
|
13121
13145
|
await onProgress({
|
|
@@ -13162,7 +13186,8 @@ async function runEvaluation(options) {
|
|
|
13162
13186
|
suiteWorkspaceFile,
|
|
13163
13187
|
streamCallbacks,
|
|
13164
13188
|
typeRegistry,
|
|
13165
|
-
repoManager
|
|
13189
|
+
repoManager,
|
|
13190
|
+
evalDir
|
|
13166
13191
|
};
|
|
13167
13192
|
let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
|
|
13168
13193
|
if (totalBudgetUsd !== void 0) {
|
|
@@ -13231,7 +13256,9 @@ async function runEvaluation(options) {
|
|
|
13231
13256
|
(now ?? (() => /* @__PURE__ */ new Date()))(),
|
|
13232
13257
|
outcome.reason,
|
|
13233
13258
|
promptInputs,
|
|
13234
|
-
primaryProvider
|
|
13259
|
+
primaryProvider,
|
|
13260
|
+
"agent",
|
|
13261
|
+
"provider_error"
|
|
13235
13262
|
);
|
|
13236
13263
|
results.push(errorResult);
|
|
13237
13264
|
if (onResult) {
|
|
@@ -13243,7 +13270,8 @@ async function runEvaluation(options) {
|
|
|
13243
13270
|
const scriptContext = {
|
|
13244
13271
|
workspacePath: sharedWorkspacePath,
|
|
13245
13272
|
testId: "__after_all__",
|
|
13246
|
-
evalRunId
|
|
13273
|
+
evalRunId,
|
|
13274
|
+
evalDir
|
|
13247
13275
|
};
|
|
13248
13276
|
try {
|
|
13249
13277
|
const afterAllOutput = await executeWorkspaceScript(
|
|
@@ -13373,7 +13401,14 @@ async function runBatchEvaluation(options) {
|
|
|
13373
13401
|
availableTargets
|
|
13374
13402
|
});
|
|
13375
13403
|
if (providerError) {
|
|
13376
|
-
result = {
|
|
13404
|
+
result = {
|
|
13405
|
+
...result,
|
|
13406
|
+
error: providerError,
|
|
13407
|
+
executionStatus: "execution_error",
|
|
13408
|
+
failureStage: "agent",
|
|
13409
|
+
failureReasonCode: "provider_error",
|
|
13410
|
+
executionError: { message: providerError, stage: "agent" }
|
|
13411
|
+
};
|
|
13377
13412
|
}
|
|
13378
13413
|
} catch (error) {
|
|
13379
13414
|
const errorResult = buildErrorResult(
|
|
@@ -13382,7 +13417,9 @@ async function runBatchEvaluation(options) {
|
|
|
13382
13417
|
nowFn(),
|
|
13383
13418
|
error,
|
|
13384
13419
|
promptInputs,
|
|
13385
|
-
provider
|
|
13420
|
+
provider,
|
|
13421
|
+
"evaluator",
|
|
13422
|
+
"evaluator_error"
|
|
13386
13423
|
);
|
|
13387
13424
|
results.push(errorResult);
|
|
13388
13425
|
if (onResult) {
|
|
@@ -13438,7 +13475,8 @@ async function runEvalCase(options) {
|
|
|
13438
13475
|
sharedBaselineCommit,
|
|
13439
13476
|
suiteWorkspaceFile,
|
|
13440
13477
|
typeRegistry: providedTypeRegistry,
|
|
13441
|
-
repoManager
|
|
13478
|
+
repoManager,
|
|
13479
|
+
evalDir
|
|
13442
13480
|
} = options;
|
|
13443
13481
|
const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
|
|
13444
13482
|
const promptInputs = await buildPromptInputs(evalCase, formattingMode);
|
|
@@ -13471,7 +13509,9 @@ async function runEvalCase(options) {
|
|
|
13471
13509
|
nowFn(),
|
|
13472
13510
|
new Error(`Failed to create workspace: ${message}`),
|
|
13473
13511
|
promptInputs,
|
|
13474
|
-
provider
|
|
13512
|
+
provider,
|
|
13513
|
+
"setup",
|
|
13514
|
+
"template_error"
|
|
13475
13515
|
);
|
|
13476
13516
|
}
|
|
13477
13517
|
}
|
|
@@ -13491,7 +13531,9 @@ async function runEvalCase(options) {
|
|
|
13491
13531
|
nowFn(),
|
|
13492
13532
|
new Error(`Failed to materialize repos: ${message}`),
|
|
13493
13533
|
promptInputs,
|
|
13494
|
-
provider
|
|
13534
|
+
provider,
|
|
13535
|
+
"repo_setup",
|
|
13536
|
+
"clone_error"
|
|
13495
13537
|
);
|
|
13496
13538
|
}
|
|
13497
13539
|
}
|
|
@@ -13501,7 +13543,8 @@ async function runEvalCase(options) {
|
|
|
13501
13543
|
testId: evalCase.id,
|
|
13502
13544
|
evalRunId: evalRunId ?? "",
|
|
13503
13545
|
caseInput: evalCase.question,
|
|
13504
|
-
caseMetadata: evalCase.metadata
|
|
13546
|
+
caseMetadata: evalCase.metadata,
|
|
13547
|
+
evalDir
|
|
13505
13548
|
};
|
|
13506
13549
|
try {
|
|
13507
13550
|
beforeAllOutput = await executeWorkspaceScript(
|
|
@@ -13520,7 +13563,9 @@ async function runEvalCase(options) {
|
|
|
13520
13563
|
nowFn(),
|
|
13521
13564
|
new Error(`before_all script failed: ${message}`),
|
|
13522
13565
|
promptInputs,
|
|
13523
|
-
provider
|
|
13566
|
+
provider,
|
|
13567
|
+
"setup",
|
|
13568
|
+
"script_error"
|
|
13524
13569
|
);
|
|
13525
13570
|
}
|
|
13526
13571
|
}
|
|
@@ -13531,7 +13576,8 @@ async function runEvalCase(options) {
|
|
|
13531
13576
|
testId: evalCase.id,
|
|
13532
13577
|
evalRunId: evalRunId ?? "",
|
|
13533
13578
|
caseInput: evalCase.question,
|
|
13534
|
-
caseMetadata: evalCase.metadata
|
|
13579
|
+
caseMetadata: evalCase.metadata,
|
|
13580
|
+
evalDir
|
|
13535
13581
|
};
|
|
13536
13582
|
try {
|
|
13537
13583
|
beforeEachOutput = await executeWorkspaceScript(
|
|
@@ -13546,7 +13592,9 @@ async function runEvalCase(options) {
|
|
|
13546
13592
|
nowFn(),
|
|
13547
13593
|
new Error(`before_each script failed: ${message}`),
|
|
13548
13594
|
promptInputs,
|
|
13549
|
-
provider
|
|
13595
|
+
provider,
|
|
13596
|
+
"setup",
|
|
13597
|
+
"script_error"
|
|
13550
13598
|
);
|
|
13551
13599
|
}
|
|
13552
13600
|
}
|
|
@@ -13587,7 +13635,9 @@ async function runEvalCase(options) {
|
|
|
13587
13635
|
nowFn(),
|
|
13588
13636
|
error,
|
|
13589
13637
|
promptInputs,
|
|
13590
|
-
provider
|
|
13638
|
+
provider,
|
|
13639
|
+
"agent",
|
|
13640
|
+
"provider_error"
|
|
13591
13641
|
);
|
|
13592
13642
|
if (workspacePath) {
|
|
13593
13643
|
if (forceCleanup) {
|
|
@@ -13606,7 +13656,9 @@ async function runEvalCase(options) {
|
|
|
13606
13656
|
nowFn(),
|
|
13607
13657
|
lastError ?? new Error("Provider did not return a response"),
|
|
13608
13658
|
promptInputs,
|
|
13609
|
-
provider
|
|
13659
|
+
provider,
|
|
13660
|
+
"agent",
|
|
13661
|
+
"provider_error"
|
|
13610
13662
|
);
|
|
13611
13663
|
if (workspacePath) {
|
|
13612
13664
|
if (forceCleanup) {
|
|
@@ -13662,7 +13714,8 @@ async function runEvalCase(options) {
|
|
|
13662
13714
|
testId: evalCase.id,
|
|
13663
13715
|
evalRunId: evalRunId ?? "",
|
|
13664
13716
|
caseInput: evalCase.question,
|
|
13665
|
-
caseMetadata: evalCase.metadata
|
|
13717
|
+
caseMetadata: evalCase.metadata,
|
|
13718
|
+
evalDir
|
|
13666
13719
|
};
|
|
13667
13720
|
try {
|
|
13668
13721
|
afterEachOutput = await executeWorkspaceScript(
|
|
@@ -13698,7 +13751,18 @@ async function runEvalCase(options) {
|
|
|
13698
13751
|
fileChanges,
|
|
13699
13752
|
workspacePath
|
|
13700
13753
|
});
|
|
13701
|
-
const
|
|
13754
|
+
const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
|
|
13755
|
+
const finalResult = providerError ? {
|
|
13756
|
+
...result,
|
|
13757
|
+
error: providerError,
|
|
13758
|
+
executionStatus,
|
|
13759
|
+
failureStage: "agent",
|
|
13760
|
+
failureReasonCode: "provider_error",
|
|
13761
|
+
executionError: { message: providerError, stage: "agent" },
|
|
13762
|
+
beforeAllOutput,
|
|
13763
|
+
beforeEachOutput,
|
|
13764
|
+
afterEachOutput
|
|
13765
|
+
} : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
|
|
13702
13766
|
const isFailure = !!finalResult.error || finalResult.score < 0.5;
|
|
13703
13767
|
if (workspacePath && !isSharedWorkspace) {
|
|
13704
13768
|
if (forceCleanup) {
|
|
@@ -13719,7 +13783,9 @@ async function runEvalCase(options) {
|
|
|
13719
13783
|
nowFn(),
|
|
13720
13784
|
error,
|
|
13721
13785
|
promptInputs,
|
|
13722
|
-
provider
|
|
13786
|
+
provider,
|
|
13787
|
+
"evaluator",
|
|
13788
|
+
"evaluator_error"
|
|
13723
13789
|
);
|
|
13724
13790
|
if (workspacePath && !isSharedWorkspace) {
|
|
13725
13791
|
if (forceCleanup) {
|
|
@@ -13757,7 +13823,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
13757
13823
|
verdict: trialVerdict,
|
|
13758
13824
|
scores: result.scores,
|
|
13759
13825
|
error: result.error,
|
|
13760
|
-
costUsd: trialCost
|
|
13826
|
+
costUsd: trialCost,
|
|
13827
|
+
executionStatus: result.executionStatus,
|
|
13828
|
+
failureStage: result.failureStage,
|
|
13829
|
+
failureReasonCode: result.failureReasonCode
|
|
13761
13830
|
};
|
|
13762
13831
|
trialResults.push(trial);
|
|
13763
13832
|
if (trialCost !== void 0) {
|
|
@@ -13782,12 +13851,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
|
|
|
13782
13851
|
0
|
|
13783
13852
|
);
|
|
13784
13853
|
const baseResult = allResults[bestTrialIndex];
|
|
13854
|
+
const hasOk = trialResults.some((t) => t.executionStatus === "ok");
|
|
13855
|
+
const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
|
|
13856
|
+
const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
|
|
13857
|
+
const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
|
|
13858
|
+
const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
|
|
13859
|
+
const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
|
|
13785
13860
|
return {
|
|
13786
13861
|
...baseResult,
|
|
13787
13862
|
score,
|
|
13788
13863
|
trials: trialResults,
|
|
13789
13864
|
aggregation,
|
|
13790
|
-
costLimited: costLimited || void 0
|
|
13865
|
+
costLimited: costLimited || void 0,
|
|
13866
|
+
executionStatus: aggregateExecutionStatus,
|
|
13867
|
+
failureStage: aggregateFailureStage,
|
|
13868
|
+
failureReasonCode: aggregateFailureReasonCode,
|
|
13869
|
+
executionError: aggregateExecutionError
|
|
13791
13870
|
};
|
|
13792
13871
|
}
|
|
13793
13872
|
async function evaluateCandidate(options) {
|
|
@@ -13888,7 +13967,8 @@ async function evaluateCandidate(options) {
|
|
|
13888
13967
|
scores,
|
|
13889
13968
|
trace,
|
|
13890
13969
|
output,
|
|
13891
|
-
fileChanges
|
|
13970
|
+
fileChanges,
|
|
13971
|
+
executionStatus: classifyQualityStatus(score.score)
|
|
13892
13972
|
};
|
|
13893
13973
|
}
|
|
13894
13974
|
async function runEvaluatorsForCase(options) {
|
|
@@ -14193,7 +14273,7 @@ async function invokeProvider(provider, options) {
|
|
|
14193
14273
|
}
|
|
14194
14274
|
}
|
|
14195
14275
|
}
|
|
14196
|
-
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
|
|
14276
|
+
function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
|
|
14197
14277
|
const message = error instanceof Error ? error.message : String(error);
|
|
14198
14278
|
let agentRequest;
|
|
14199
14279
|
let lmRequest;
|
|
@@ -14236,7 +14316,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
|
|
|
14236
14316
|
target: targetName,
|
|
14237
14317
|
requests,
|
|
14238
14318
|
input,
|
|
14239
|
-
error: message
|
|
14319
|
+
error: message,
|
|
14320
|
+
executionStatus: "execution_error",
|
|
14321
|
+
failureStage,
|
|
14322
|
+
failureReasonCode,
|
|
14323
|
+
executionError: { message, stage: failureStage }
|
|
14240
14324
|
};
|
|
14241
14325
|
}
|
|
14242
14326
|
function extractProviderError(response) {
|