npm - @agentv/core - Versions diffs - 2.11.4 → 2.12.0 - Mend

@agentv/core 2.11.4 → 2.12.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/{chunk-REN5PS7B.js → chunk-7HPKTRFZ.js} +1 -1
package/dist/chunk-7HPKTRFZ.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +110 -26
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +37 -1
package/dist/index.d.ts +37 -1
package/dist/index.js +111 -27
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-REN5PS7B.js.map +0 -1

package/dist/index.d.cts CHANGED Viewed

@@ -1004,6 +1004,12 @@ interface TrialResult {
     readonly scores?: readonly EvaluatorResult[];
     readonly error?: string;
     readonly costUsd?: number;
+    /** Primary classification for this trial attempt */
+    readonly executionStatus?: ExecutionStatus;
+    /** Pipeline stage where failure occurred */
+    readonly failureStage?: FailureStage;
+    /** Machine-readable failure reason code */
+    readonly failureReasonCode?: string;
 }
 /**
  * Aggregation metadata for pass_at_k strategy.
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
  * Discriminated union of trial aggregation results.
  */
 type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
+/**
+ * Primary classification of evaluation outcome.
+ * - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
+ * - 'quality_failure': evaluation completed but model scored below threshold
+ * - 'execution_error': evaluation could not complete due to infrastructure/tooling error
+ */
+type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
+/**
+ * Pipeline stage where the failure occurred.
+ */
+type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
+/**
+ * Structured error detail for execution failures.
+ */
+interface ExecutionError {
+    readonly message: string;
+    readonly stage: FailureStage;
+}
 /**
  * Evaluator scorecard for a single eval case run.
  */
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
     readonly costLimited?: boolean;
     /** Whether the evaluation was skipped due to suite-level budget exhaustion */
     readonly budgetExceeded?: boolean;
+    /** Primary classification: ok, quality_failure, or execution_error */
+    readonly executionStatus: ExecutionStatus;
+    /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
+    readonly failureStage?: FailureStage;
+    /** Machine-readable failure reason code (only when executionStatus !== 'ok') */
+    readonly failureReasonCode?: string;
+    /** Structured error detail (only when executionStatus === 'execution_error') */
+    readonly executionError?: ExecutionError;
 }
 type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
 interface EvaluatorResult {
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
     readonly typeRegistry?: EvaluatorRegistry;
     /** RepoManager instance for repo lifecycle (shared workspace mode) */
     readonly repoManager?: RepoManager;
+    /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
+    readonly evalDir?: string;
 }
 interface ProgressEvent {
     readonly workerId: number;
@@ -2863,6 +2897,8 @@ interface ScriptExecutionContext {
     readonly evalRunId: string;
     readonly caseInput?: string;
     readonly caseMetadata?: Record<string, unknown>;
+    /** Directory containing the eval YAML file. Used as default cwd. */
+    readonly evalDir?: string;
 }
 type ScriptFailureMode = 'fatal' | 'warn';
 /**
@@ -3120,4 +3156,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, 
type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
+export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };

package/dist/index.d.ts CHANGED Viewed

@@ -1004,6 +1004,12 @@ interface TrialResult {
     readonly scores?: readonly EvaluatorResult[];
     readonly error?: string;
     readonly costUsd?: number;
+    /** Primary classification for this trial attempt */
+    readonly executionStatus?: ExecutionStatus;
+    /** Pipeline stage where failure occurred */
+    readonly failureStage?: FailureStage;
+    /** Machine-readable failure reason code */
+    readonly failureReasonCode?: string;
 }
 /**
  * Aggregation metadata for pass_at_k strategy.
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
  * Discriminated union of trial aggregation results.
  */
 type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
+/**
+ * Primary classification of evaluation outcome.
+ * - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
+ * - 'quality_failure': evaluation completed but model scored below threshold
+ * - 'execution_error': evaluation could not complete due to infrastructure/tooling error
+ */
+type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
+/**
+ * Pipeline stage where the failure occurred.
+ */
+type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
+/**
+ * Structured error detail for execution failures.
+ */
+interface ExecutionError {
+    readonly message: string;
+    readonly stage: FailureStage;
+}
 /**
  * Evaluator scorecard for a single eval case run.
  */
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
     readonly costLimited?: boolean;
     /** Whether the evaluation was skipped due to suite-level budget exhaustion */
     readonly budgetExceeded?: boolean;
+    /** Primary classification: ok, quality_failure, or execution_error */
+    readonly executionStatus: ExecutionStatus;
+    /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
+    readonly failureStage?: FailureStage;
+    /** Machine-readable failure reason code (only when executionStatus !== 'ok') */
+    readonly failureReasonCode?: string;
+    /** Structured error detail (only when executionStatus === 'execution_error') */
+    readonly executionError?: ExecutionError;
 }
 type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
 interface EvaluatorResult {
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
     readonly typeRegistry?: EvaluatorRegistry;
     /** RepoManager instance for repo lifecycle (shared workspace mode) */
     readonly repoManager?: RepoManager;
+    /** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
+    readonly evalDir?: string;
 }
 interface ProgressEvent {
     readonly workerId: number;
@@ -2863,6 +2897,8 @@ interface ScriptExecutionContext {
     readonly evalRunId: string;
     readonly caseInput?: string;
     readonly caseMetadata?: Record<string, unknown>;
+    /** Directory containing the eval YAML file. Used as default cwd. */
+    readonly evalDir?: string;
 }
 type ScriptFailureMode = 'fatal' | 'warn';
 /**
@@ -3120,4 +3156,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, 
type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, 
avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, 
toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
+export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };

package/dist/index.js CHANGED Viewed

@@ -17,7 +17,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-REN5PS7B.js";
+} from "./chunk-7HPKTRFZ.js";
 import {
   OtlpJsonFileExporter
 } from "./chunk-HFSYZHGF.js";
@@ -12847,6 +12847,16 @@ async function resolveWorkspaceTemplate(templatePath) {
 }
 // src/evaluation/workspace/script-executor.ts
+function interpolateArgs(args, context) {
+  const vars = {
+    workspace_path: context.workspacePath,
+    test_id: context.testId,
+    eval_run_id: context.evalRunId,
+    case_input: context.caseInput ?? "",
+    case_metadata: context.caseMetadata ? JSON.stringify(context.caseMetadata) : ""
+  };
+  return args.map((arg) => arg.replace(/\{\{(\w+)\}\}/g, (match, name) => vars[name] ?? match));
+}
 async function executeWorkspaceScript(config, context, failureMode = "fatal") {
   const stdin = JSON.stringify({
     workspace_path: context.workspacePath,
@@ -12856,8 +12866,9 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
     case_metadata: context.caseMetadata ?? null
   });
   const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
-  const cwd = config.cwd;
-  const commandArray = config.command ?? config.script ?? [];
+  const cwd = config.cwd ?? context.evalDir;
+  const rawCommand = config.command ?? config.script ?? [];
+  const commandArray = interpolateArgs(rawCommand, context);
   const result = await execFileWithStdin(commandArray, stdin, {
     timeoutMs,
     cwd
@@ -12874,6 +12885,10 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
 }
 // src/evaluation/orchestrator.ts
+var QUALITY_PASS_THRESHOLD = 0.8;
+function classifyQualityStatus(score) {
+  return score >= QUALITY_PASS_THRESHOLD ? "ok" : "quality_failure";
+}
 function usesFileReferencePrompt(provider) {
   return isAgentProvider(provider) || provider.kind === "cli";
 }
@@ -12981,6 +12996,7 @@ async function runEvaluation(options) {
   const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
   const typeRegistry = createBuiltinRegistry();
   const discoveryBaseDir = evalFilePath ? path37.dirname(path37.resolve(evalFilePath)) : process.cwd();
+  const evalDir = discoveryBaseDir;
   await discoverAssertions(typeRegistry, discoveryBaseDir);
   const providerRegistry = createBuiltinProviderRegistry();
   await discoverProviders(providerRegistry, discoveryBaseDir);
@@ -13076,7 +13092,8 @@ async function runEvaluation(options) {
     const scriptContext = {
       workspacePath: sharedWorkspacePath,
       testId: "__before_all__",
-      evalRunId
+      evalRunId,
+      evalDir
     };
     try {
       beforeAllOutput = await executeWorkspaceScript(suiteWorkspace.before_all, scriptContext);
@@ -13115,7 +13132,14 @@ async function runEvaluation(options) {
           answer: "",
           target: target.name,
           error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
-          budgetExceeded: true
+          budgetExceeded: true,
+          executionStatus: "execution_error",
+          failureStage: "setup",
+          failureReasonCode: "budget_exceeded",
+          executionError: {
+            message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
+            stage: "setup"
+          }
         };
         if (onProgress) {
           await onProgress({
@@ -13162,7 +13186,8 @@ async function runEvaluation(options) {
           suiteWorkspaceFile,
           streamCallbacks,
           typeRegistry,
-          repoManager
+          repoManager,
+          evalDir
         };
         let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
         if (totalBudgetUsd !== void 0) {
@@ -13231,7 +13256,9 @@ async function runEvaluation(options) {
         (now ?? (() => /* @__PURE__ */ new Date()))(),
         outcome.reason,
         promptInputs,
-        primaryProvider
+        primaryProvider,
+        "agent",
+        "provider_error"
       );
       results.push(errorResult);
       if (onResult) {
@@ -13243,7 +13270,8 @@ async function runEvaluation(options) {
     const scriptContext = {
       workspacePath: sharedWorkspacePath,
       testId: "__after_all__",
-      evalRunId
+      evalRunId,
+      evalDir
     };
     try {
       const afterAllOutput = await executeWorkspaceScript(
@@ -13373,7 +13401,14 @@ async function runBatchEvaluation(options) {
         availableTargets
       });
       if (providerError) {
-        result = { ...result, error: providerError };
+        result = {
+          ...result,
+          error: providerError,
+          executionStatus: "execution_error",
+          failureStage: "agent",
+          failureReasonCode: "provider_error",
+          executionError: { message: providerError, stage: "agent" }
+        };
       }
     } catch (error) {
       const errorResult = buildErrorResult(
@@ -13382,7 +13417,9 @@ async function runBatchEvaluation(options) {
         nowFn(),
         error,
         promptInputs,
-        provider
+        provider,
+        "evaluator",
+        "evaluator_error"
       );
       results.push(errorResult);
       if (onResult) {
@@ -13438,7 +13475,8 @@ async function runEvalCase(options) {
     sharedBaselineCommit,
     suiteWorkspaceFile,
     typeRegistry: providedTypeRegistry,
-    repoManager
+    repoManager,
+    evalDir
   } = options;
   const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
@@ -13471,7 +13509,9 @@ async function runEvalCase(options) {
           nowFn(),
           new Error(`Failed to create workspace: ${message}`),
           promptInputs,
-          provider
+          provider,
+          "setup",
+          "template_error"
         );
       }
     }
@@ -13491,7 +13531,9 @@ async function runEvalCase(options) {
           nowFn(),
           new Error(`Failed to materialize repos: ${message}`),
           promptInputs,
-          provider
+          provider,
+          "repo_setup",
+          "clone_error"
         );
       }
     }
@@ -13501,7 +13543,8 @@ async function runEvalCase(options) {
         testId: evalCase.id,
         evalRunId: evalRunId ?? "",
         caseInput: evalCase.question,
-        caseMetadata: evalCase.metadata
+        caseMetadata: evalCase.metadata,
+        evalDir
       };
       try {
         beforeAllOutput = await executeWorkspaceScript(
@@ -13520,7 +13563,9 @@ async function runEvalCase(options) {
           nowFn(),
           new Error(`before_all script failed: ${message}`),
           promptInputs,
-          provider
+          provider,
+          "setup",
+          "script_error"
         );
       }
     }
@@ -13531,7 +13576,8 @@ async function runEvalCase(options) {
       testId: evalCase.id,
       evalRunId: evalRunId ?? "",
       caseInput: evalCase.question,
-      caseMetadata: evalCase.metadata
+      caseMetadata: evalCase.metadata,
+      evalDir
     };
     try {
       beforeEachOutput = await executeWorkspaceScript(
@@ -13546,7 +13592,9 @@ async function runEvalCase(options) {
         nowFn(),
         new Error(`before_each script failed: ${message}`),
         promptInputs,
-        provider
+        provider,
+        "setup",
+        "script_error"
       );
     }
   }
@@ -13587,7 +13635,9 @@ async function runEvalCase(options) {
         nowFn(),
         error,
         promptInputs,
-        provider
+        provider,
+        "agent",
+        "provider_error"
       );
       if (workspacePath) {
         if (forceCleanup) {
@@ -13606,7 +13656,9 @@ async function runEvalCase(options) {
       nowFn(),
       lastError ?? new Error("Provider did not return a response"),
       promptInputs,
-      provider
+      provider,
+      "agent",
+      "provider_error"
     );
     if (workspacePath) {
       if (forceCleanup) {
@@ -13662,7 +13714,8 @@ async function runEvalCase(options) {
       testId: evalCase.id,
       evalRunId: evalRunId ?? "",
       caseInput: evalCase.question,
-      caseMetadata: evalCase.metadata
+      caseMetadata: evalCase.metadata,
+      evalDir
     };
     try {
       afterEachOutput = await executeWorkspaceScript(
@@ -13698,7 +13751,18 @@ async function runEvalCase(options) {
       fileChanges,
       workspacePath
     });
-    const finalResult = providerError ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput };
+    const executionStatus = providerError ? "execution_error" : classifyQualityStatus(result.score);
+    const finalResult = providerError ? {
+      ...result,
+      error: providerError,
+      executionStatus,
+      failureStage: "agent",
+      failureReasonCode: "provider_error",
+      executionError: { message: providerError, stage: "agent" },
+      beforeAllOutput,
+      beforeEachOutput,
+      afterEachOutput
+    } : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
     const isFailure = !!finalResult.error || finalResult.score < 0.5;
     if (workspacePath && !isSharedWorkspace) {
       if (forceCleanup) {
@@ -13719,7 +13783,9 @@ async function runEvalCase(options) {
       nowFn(),
       error,
       promptInputs,
-      provider
+      provider,
+      "evaluator",
+      "evaluator_error"
     );
     if (workspacePath && !isSharedWorkspace) {
       if (forceCleanup) {
@@ -13757,7 +13823,10 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
       verdict: trialVerdict,
       scores: result.scores,
       error: result.error,
-      costUsd: trialCost
+      costUsd: trialCost,
+      executionStatus: result.executionStatus,
+      failureStage: result.failureStage,
+      failureReasonCode: result.failureReasonCode
     };
     trialResults.push(trial);
     if (trialCost !== void 0) {
@@ -13782,12 +13851,22 @@ async function runEvalCaseWithTrials(options, trialsConfig) {
     0
   );
   const baseResult = allResults[bestTrialIndex];
+  const hasOk = trialResults.some((t) => t.executionStatus === "ok");
+  const allExecutionError = trialResults.length > 0 && trialResults.every((t) => t.executionStatus === "execution_error");
+  const aggregateExecutionStatus = hasOk ? "ok" : allExecutionError ? "execution_error" : "quality_failure";
+  const aggregateFailureStage = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureStage;
+  const aggregateFailureReasonCode = aggregateExecutionStatus === "ok" ? void 0 : baseResult.failureReasonCode;
+  const aggregateExecutionError = aggregateExecutionStatus === "execution_error" ? baseResult.executionError : void 0;
   return {
     ...baseResult,
     score,
     trials: trialResults,
     aggregation,
-    costLimited: costLimited || void 0
+    costLimited: costLimited || void 0,
+    executionStatus: aggregateExecutionStatus,
+    failureStage: aggregateFailureStage,
+    failureReasonCode: aggregateFailureReasonCode,
+    executionError: aggregateExecutionError
   };
 }
 async function evaluateCandidate(options) {
@@ -13888,7 +13967,8 @@ async function evaluateCandidate(options) {
     scores,
     trace,
     output,
-    fileChanges
+    fileChanges,
+    executionStatus: classifyQualityStatus(score.score)
   };
 }
 async function runEvaluatorsForCase(options) {
@@ -14193,7 +14273,7 @@ async function invokeProvider(provider, options) {
     }
   }
 }
-function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
+function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider, failureStage, failureReasonCode) {
   const message = error instanceof Error ? error.message : String(error);
   let agentRequest;
   let lmRequest;
@@ -14236,7 +14316,11 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     target: targetName,
     requests,
     input,
-    error: message
+    error: message,
+    executionStatus: "execution_error",
+    failureStage,
+    failureReasonCode,
+    executionError: { message, stage: failureStage }
   };
 }
 function extractProviderError(response) {