@agentv/core 2.12.0 → 2.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7HPKTRFZ.js → chunk-JHER2LQ5.js} +1 -1
- package/dist/chunk-JHER2LQ5.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +64 -2
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +18 -1
- package/dist/index.d.ts +18 -1
- package/dist/index.js +64 -3
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-7HPKTRFZ.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1060,6 +1060,12 @@ interface ExecutionError {
|
|
|
1060
1060
|
readonly message: string;
|
|
1061
1061
|
readonly stage: FailureStage;
|
|
1062
1062
|
}
|
|
1063
|
+
/**
|
|
1064
|
+
* Tolerance for execution errors in an eval run.
|
|
1065
|
+
* - `true`: halt on first execution error
|
|
1066
|
+
* - `false`: never halt on errors (default)
|
|
1067
|
+
*/
|
|
1068
|
+
type FailOnError = boolean;
|
|
1063
1069
|
/**
|
|
1064
1070
|
* Evaluator scorecard for a single eval case run.
|
|
1065
1071
|
*/
|
|
@@ -1194,6 +1200,7 @@ type ExecutionDefaults = {
|
|
|
1194
1200
|
readonly otel_file?: string;
|
|
1195
1201
|
};
|
|
1196
1202
|
type AgentVConfig$1 = {
|
|
1203
|
+
readonly required_version?: string;
|
|
1197
1204
|
readonly guideline_patterns?: readonly string[];
|
|
1198
1205
|
readonly eval_patterns?: readonly string[];
|
|
1199
1206
|
readonly execution?: ExecutionDefaults;
|
|
@@ -1238,6 +1245,12 @@ interface CacheConfig {
|
|
|
1238
1245
|
* Returns undefined when no cache config is specified.
|
|
1239
1246
|
*/
|
|
1240
1247
|
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1248
|
+
/**
|
|
1249
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1250
|
+
* Accepts `true` or `false`.
|
|
1251
|
+
* Returns undefined when not specified.
|
|
1252
|
+
*/
|
|
1253
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1241
1254
|
|
|
1242
1255
|
/**
|
|
1243
1256
|
* Formatting mode for segment content.
|
|
@@ -1297,6 +1310,8 @@ type EvalSuiteResult = {
|
|
|
1297
1310
|
readonly metadata?: EvalMetadata;
|
|
1298
1311
|
/** Suite-level total cost budget in USD */
|
|
1299
1312
|
readonly totalBudgetUsd?: number;
|
|
1313
|
+
/** Execution error tolerance: true or false */
|
|
1314
|
+
readonly failOnError?: FailOnError;
|
|
1300
1315
|
};
|
|
1301
1316
|
/**
|
|
1302
1317
|
* Load tests and suite metadata from a single parse.
|
|
@@ -2440,6 +2455,8 @@ interface RunEvaluationOptions {
|
|
|
2440
2455
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2441
2456
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2442
2457
|
readonly totalBudgetUsd?: number;
|
|
2458
|
+
/** Execution error tolerance: true halts on first error */
|
|
2459
|
+
readonly failOnError?: FailOnError;
|
|
2443
2460
|
}
|
|
2444
2461
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2445
2462
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -3156,4 +3173,4 @@ type AgentKernel = {
|
|
|
3156
3173
|
};
|
|
3157
3174
|
declare function createAgentKernel(): AgentKernel;
|
|
3158
3175
|
|
|
3159
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3176
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type 
GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type 
WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, 
subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1060,6 +1060,12 @@ interface ExecutionError {
|
|
|
1060
1060
|
readonly message: string;
|
|
1061
1061
|
readonly stage: FailureStage;
|
|
1062
1062
|
}
|
|
1063
|
+
/**
|
|
1064
|
+
* Tolerance for execution errors in an eval run.
|
|
1065
|
+
* - `true`: halt on first execution error
|
|
1066
|
+
* - `false`: never halt on errors (default)
|
|
1067
|
+
*/
|
|
1068
|
+
type FailOnError = boolean;
|
|
1063
1069
|
/**
|
|
1064
1070
|
* Evaluator scorecard for a single eval case run.
|
|
1065
1071
|
*/
|
|
@@ -1194,6 +1200,7 @@ type ExecutionDefaults = {
|
|
|
1194
1200
|
readonly otel_file?: string;
|
|
1195
1201
|
};
|
|
1196
1202
|
type AgentVConfig$1 = {
|
|
1203
|
+
readonly required_version?: string;
|
|
1197
1204
|
readonly guideline_patterns?: readonly string[];
|
|
1198
1205
|
readonly eval_patterns?: readonly string[];
|
|
1199
1206
|
readonly execution?: ExecutionDefaults;
|
|
@@ -1238,6 +1245,12 @@ interface CacheConfig {
|
|
|
1238
1245
|
* Returns undefined when no cache config is specified.
|
|
1239
1246
|
*/
|
|
1240
1247
|
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1248
|
+
/**
|
|
1249
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1250
|
+
* Accepts `true` or `false`.
|
|
1251
|
+
* Returns undefined when not specified.
|
|
1252
|
+
*/
|
|
1253
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1241
1254
|
|
|
1242
1255
|
/**
|
|
1243
1256
|
* Formatting mode for segment content.
|
|
@@ -1297,6 +1310,8 @@ type EvalSuiteResult = {
|
|
|
1297
1310
|
readonly metadata?: EvalMetadata;
|
|
1298
1311
|
/** Suite-level total cost budget in USD */
|
|
1299
1312
|
readonly totalBudgetUsd?: number;
|
|
1313
|
+
/** Execution error tolerance: true or false */
|
|
1314
|
+
readonly failOnError?: FailOnError;
|
|
1300
1315
|
};
|
|
1301
1316
|
/**
|
|
1302
1317
|
* Load tests and suite metadata from a single parse.
|
|
@@ -2440,6 +2455,8 @@ interface RunEvaluationOptions {
|
|
|
2440
2455
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2441
2456
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2442
2457
|
readonly totalBudgetUsd?: number;
|
|
2458
|
+
/** Execution error tolerance: true halts on first error */
|
|
2459
|
+
readonly failOnError?: FailOnError;
|
|
2443
2460
|
}
|
|
2444
2461
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2445
2462
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -3156,4 +3173,4 @@ type AgentKernel = {
|
|
|
3156
3173
|
};
|
|
3157
3174
|
declare function createAgentKernel(): AgentKernel;
|
|
3158
3175
|
|
|
3159
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type 
IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, 
type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, 
subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3176
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type 
GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type 
WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, 
subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -17,7 +17,7 @@ import {
|
|
|
17
17
|
readTextFile,
|
|
18
18
|
resolveFileReference,
|
|
19
19
|
resolveTargetDefinition
|
|
20
|
-
} from "./chunk-7HPKTRFZ.js";
|
|
20
|
+
} from "./chunk-JHER2LQ5.js";
|
|
21
21
|
import {
|
|
22
22
|
OtlpJsonFileExporter
|
|
23
23
|
} from "./chunk-HFSYZHGF.js";
|
|
@@ -396,6 +396,11 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
396
396
|
continue;
|
|
397
397
|
}
|
|
398
398
|
const config = parsed;
|
|
399
|
+
const requiredVersion = parsed.required_version;
|
|
400
|
+
if (requiredVersion !== void 0 && typeof requiredVersion !== "string") {
|
|
401
|
+
logWarning(`Invalid required_version in ${configPath}, expected string`);
|
|
402
|
+
continue;
|
|
403
|
+
}
|
|
399
404
|
const guidelinePatterns = config.guideline_patterns;
|
|
400
405
|
if (guidelinePatterns !== void 0 && !Array.isArray(guidelinePatterns)) {
|
|
401
406
|
logWarning(`Invalid guideline_patterns in ${configPath}, expected array`);
|
|
@@ -419,6 +424,7 @@ async function loadConfig(evalFilePath, repoRoot) {
|
|
|
419
424
|
configPath
|
|
420
425
|
);
|
|
421
426
|
return {
|
|
427
|
+
required_version: requiredVersion,
|
|
422
428
|
guideline_patterns: guidelinePatterns,
|
|
423
429
|
eval_patterns: evalPatterns,
|
|
424
430
|
execution: executionDefaults
|
|
@@ -562,6 +568,22 @@ function extractTotalBudgetUsd(suite) {
|
|
|
562
568
|
);
|
|
563
569
|
return void 0;
|
|
564
570
|
}
|
|
571
|
+
function extractFailOnError(suite) {
|
|
572
|
+
const execution = suite.execution;
|
|
573
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
574
|
+
return void 0;
|
|
575
|
+
}
|
|
576
|
+
const executionObj = execution;
|
|
577
|
+
const raw = executionObj.fail_on_error ?? executionObj.failOnError;
|
|
578
|
+
if (raw === void 0 || raw === null) {
|
|
579
|
+
return void 0;
|
|
580
|
+
}
|
|
581
|
+
if (typeof raw === "boolean") {
|
|
582
|
+
return raw;
|
|
583
|
+
}
|
|
584
|
+
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
585
|
+
return void 0;
|
|
586
|
+
}
|
|
565
587
|
function parseExecutionDefaults(raw, configPath) {
|
|
566
588
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
567
589
|
return void 0;
|
|
@@ -2757,13 +2779,15 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2757
2779
|
}
|
|
2758
2780
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
2759
2781
|
const metadata = parseMetadata(parsed);
|
|
2782
|
+
const failOnError = extractFailOnError(parsed);
|
|
2760
2783
|
return {
|
|
2761
2784
|
tests,
|
|
2762
2785
|
trials: extractTrialsConfig(parsed),
|
|
2763
2786
|
targets: extractTargetsFromSuite(parsed),
|
|
2764
2787
|
cacheConfig: extractCacheConfig(parsed),
|
|
2765
2788
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2766
|
-
...metadata !== void 0 && { metadata }
|
|
2789
|
+
...metadata !== void 0 && { metadata },
|
|
2790
|
+
...failOnError !== void 0 && { failOnError }
|
|
2767
2791
|
};
|
|
2768
2792
|
}
|
|
2769
2793
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -12921,7 +12945,8 @@ async function runEvaluation(options) {
|
|
|
12921
12945
|
cleanupWorkspaces,
|
|
12922
12946
|
trials,
|
|
12923
12947
|
streamCallbacks,
|
|
12924
|
-
totalBudgetUsd
|
|
12948
|
+
totalBudgetUsd,
|
|
12949
|
+
failOnError
|
|
12925
12950
|
} = options;
|
|
12926
12951
|
let useCache = options.useCache;
|
|
12927
12952
|
if (trials && trials.count > 1 && useCache) {
|
|
@@ -13117,6 +13142,7 @@ async function runEvaluation(options) {
|
|
|
13117
13142
|
let beforeAllOutputAttached = false;
|
|
13118
13143
|
let cumulativeBudgetCost = 0;
|
|
13119
13144
|
let budgetExhausted = false;
|
|
13145
|
+
let failOnErrorTriggered = false;
|
|
13120
13146
|
const promises = filteredEvalCases.map(
|
|
13121
13147
|
(evalCase) => limit(async () => {
|
|
13122
13148
|
const workerId = nextWorkerId++;
|
|
@@ -13155,6 +13181,37 @@ async function runEvaluation(options) {
|
|
|
13155
13181
|
}
|
|
13156
13182
|
return budgetResult;
|
|
13157
13183
|
}
|
|
13184
|
+
if (failOnError === true && failOnErrorTriggered) {
|
|
13185
|
+
const errorMsg = "Halted: execution error encountered with fail_on_error enabled";
|
|
13186
|
+
const haltResult = {
|
|
13187
|
+
timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
|
|
13188
|
+
testId: evalCase.id,
|
|
13189
|
+
dataset: evalCase.dataset,
|
|
13190
|
+
score: 0,
|
|
13191
|
+
hits: [],
|
|
13192
|
+
misses: [],
|
|
13193
|
+
answer: "",
|
|
13194
|
+
target: target.name,
|
|
13195
|
+
error: errorMsg,
|
|
13196
|
+
executionStatus: "execution_error",
|
|
13197
|
+
failureStage: "setup",
|
|
13198
|
+
failureReasonCode: "error_threshold_exceeded",
|
|
13199
|
+
executionError: { message: errorMsg, stage: "setup" }
|
|
13200
|
+
};
|
|
13201
|
+
if (onProgress) {
|
|
13202
|
+
await onProgress({
|
|
13203
|
+
workerId,
|
|
13204
|
+
testId: evalCase.id,
|
|
13205
|
+
status: "failed",
|
|
13206
|
+
completedAt: Date.now(),
|
|
13207
|
+
error: haltResult.error
|
|
13208
|
+
});
|
|
13209
|
+
}
|
|
13210
|
+
if (onResult) {
|
|
13211
|
+
await onResult(haltResult);
|
|
13212
|
+
}
|
|
13213
|
+
return haltResult;
|
|
13214
|
+
}
|
|
13158
13215
|
if (onProgress) {
|
|
13159
13216
|
await onProgress({
|
|
13160
13217
|
workerId,
|
|
@@ -13207,6 +13264,9 @@ async function runEvaluation(options) {
|
|
|
13207
13264
|
}
|
|
13208
13265
|
}
|
|
13209
13266
|
}
|
|
13267
|
+
if (failOnError === true && result.executionStatus === "execution_error") {
|
|
13268
|
+
failOnErrorTriggered = true;
|
|
13269
|
+
}
|
|
13210
13270
|
if (beforeAllOutput && !beforeAllOutputAttached) {
|
|
13211
13271
|
result = { ...result, beforeAllOutput };
|
|
13212
13272
|
beforeAllOutputAttached = true;
|
|
@@ -15268,6 +15328,7 @@ export {
|
|
|
15268
15328
|
executeWorkspaceScript,
|
|
15269
15329
|
explorationRatio,
|
|
15270
15330
|
extractCacheConfig,
|
|
15331
|
+
extractFailOnError,
|
|
15271
15332
|
extractJsonBlob,
|
|
15272
15333
|
extractTargetFromSuite,
|
|
15273
15334
|
extractTargetsFromSuite,
|