@agentv/core 2.11.2 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-REN5PS7B.js → chunk-7HPKTRFZ.js} +1 -1
- package/dist/chunk-7HPKTRFZ.js.map +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +136 -30
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +62 -2
- package/dist/index.d.ts +62 -2
- package/dist/index.js +137 -31
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-REN5PS7B.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -1004,6 +1004,12 @@ interface TrialResult {
|
|
|
1004
1004
|
readonly scores?: readonly EvaluatorResult[];
|
|
1005
1005
|
readonly error?: string;
|
|
1006
1006
|
readonly costUsd?: number;
|
|
1007
|
+
/** Primary classification for this trial attempt */
|
|
1008
|
+
readonly executionStatus?: ExecutionStatus;
|
|
1009
|
+
/** Pipeline stage where failure occurred */
|
|
1010
|
+
readonly failureStage?: FailureStage;
|
|
1011
|
+
/** Machine-readable failure reason code */
|
|
1012
|
+
readonly failureReasonCode?: string;
|
|
1007
1013
|
}
|
|
1008
1014
|
/**
|
|
1009
1015
|
* Aggregation metadata for pass_at_k strategy.
|
|
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
|
|
|
1036
1042
|
* Discriminated union of trial aggregation results.
|
|
1037
1043
|
*/
|
|
1038
1044
|
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
1045
|
+
/**
|
|
1046
|
+
* Primary classification of evaluation outcome.
|
|
1047
|
+
* - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
|
|
1048
|
+
* - 'quality_failure': evaluation completed but model scored below threshold
|
|
1049
|
+
* - 'execution_error': evaluation could not complete due to infrastructure/tooling error
|
|
1050
|
+
*/
|
|
1051
|
+
type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
|
|
1052
|
+
/**
|
|
1053
|
+
* Pipeline stage where the failure occurred.
|
|
1054
|
+
*/
|
|
1055
|
+
type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
|
|
1056
|
+
/**
|
|
1057
|
+
* Structured error detail for execution failures.
|
|
1058
|
+
*/
|
|
1059
|
+
interface ExecutionError {
|
|
1060
|
+
readonly message: string;
|
|
1061
|
+
readonly stage: FailureStage;
|
|
1062
|
+
}
|
|
1039
1063
|
/**
|
|
1040
1064
|
* Evaluator scorecard for a single eval case run.
|
|
1041
1065
|
*/
|
|
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
|
|
|
1093
1117
|
readonly costLimited?: boolean;
|
|
1094
1118
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
1095
1119
|
readonly budgetExceeded?: boolean;
|
|
1120
|
+
/** Primary classification: ok, quality_failure, or execution_error */
|
|
1121
|
+
readonly executionStatus: ExecutionStatus;
|
|
1122
|
+
/** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
|
|
1123
|
+
readonly failureStage?: FailureStage;
|
|
1124
|
+
/** Machine-readable failure reason code (only when executionStatus !== 'ok') */
|
|
1125
|
+
readonly failureReasonCode?: string;
|
|
1126
|
+
/** Structured error detail (only when executionStatus === 'execution_error') */
|
|
1127
|
+
readonly executionError?: ExecutionError;
|
|
1096
1128
|
}
|
|
1097
1129
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
1098
1130
|
interface EvaluatorResult {
|
|
@@ -2305,7 +2337,7 @@ declare class RepoManager {
|
|
|
2305
2337
|
* Creates on first access, fetches updates on subsequent calls.
|
|
2306
2338
|
* Returns the absolute path to the cache directory.
|
|
2307
2339
|
*/
|
|
2308
|
-
ensureCache(source: RepoSource, depth?: number): Promise<string>;
|
|
2340
|
+
ensureCache(source: RepoSource, depth?: number, resolve?: 'remote' | 'local'): Promise<string>;
|
|
2309
2341
|
/**
|
|
2310
2342
|
* Clone a repo from cache into the workspace at the configured path.
|
|
2311
2343
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
|
|
|
2367
2399
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2368
2400
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2369
2401
|
readonly repoManager?: RepoManager;
|
|
2402
|
+
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2403
|
+
readonly evalDir?: string;
|
|
2370
2404
|
}
|
|
2371
2405
|
interface ProgressEvent {
|
|
2372
2406
|
readonly workerId: number;
|
|
@@ -2628,14 +2662,30 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
2628
2662
|
maxRetries: z.ZodOptional<z.ZodNumber>;
|
|
2629
2663
|
/** Agent timeout in milliseconds (default: 120000) */
|
|
2630
2664
|
agentTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
2665
|
+
/** Enable verbose logging */
|
|
2666
|
+
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
2667
|
+
/** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
|
|
2668
|
+
traceFile: z.ZodOptional<z.ZodString>;
|
|
2669
|
+
/** Always keep temp workspaces after eval */
|
|
2670
|
+
keepWorkspaces: z.ZodOptional<z.ZodBoolean>;
|
|
2671
|
+
/** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
|
|
2672
|
+
otelFile: z.ZodOptional<z.ZodString>;
|
|
2631
2673
|
}, "strip", z.ZodTypeAny, {
|
|
2674
|
+
verbose?: boolean | undefined;
|
|
2632
2675
|
workers?: number | undefined;
|
|
2633
2676
|
maxRetries?: number | undefined;
|
|
2634
2677
|
agentTimeoutMs?: number | undefined;
|
|
2678
|
+
keepWorkspaces?: boolean | undefined;
|
|
2679
|
+
traceFile?: string | undefined;
|
|
2680
|
+
otelFile?: string | undefined;
|
|
2635
2681
|
}, {
|
|
2682
|
+
verbose?: boolean | undefined;
|
|
2636
2683
|
workers?: number | undefined;
|
|
2637
2684
|
maxRetries?: number | undefined;
|
|
2638
2685
|
agentTimeoutMs?: number | undefined;
|
|
2686
|
+
keepWorkspaces?: boolean | undefined;
|
|
2687
|
+
traceFile?: string | undefined;
|
|
2688
|
+
otelFile?: string | undefined;
|
|
2639
2689
|
}>>;
|
|
2640
2690
|
/** Output settings */
|
|
2641
2691
|
output: z.ZodOptional<z.ZodObject<{
|
|
@@ -2682,9 +2732,13 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
2682
2732
|
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2683
2733
|
} | undefined;
|
|
2684
2734
|
execution?: {
|
|
2735
|
+
verbose?: boolean | undefined;
|
|
2685
2736
|
workers?: number | undefined;
|
|
2686
2737
|
maxRetries?: number | undefined;
|
|
2687
2738
|
agentTimeoutMs?: number | undefined;
|
|
2739
|
+
keepWorkspaces?: boolean | undefined;
|
|
2740
|
+
traceFile?: string | undefined;
|
|
2741
|
+
otelFile?: string | undefined;
|
|
2688
2742
|
} | undefined;
|
|
2689
2743
|
cache?: {
|
|
2690
2744
|
enabled?: boolean | undefined;
|
|
@@ -2700,9 +2754,13 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
2700
2754
|
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2701
2755
|
} | undefined;
|
|
2702
2756
|
execution?: {
|
|
2757
|
+
verbose?: boolean | undefined;
|
|
2703
2758
|
workers?: number | undefined;
|
|
2704
2759
|
maxRetries?: number | undefined;
|
|
2705
2760
|
agentTimeoutMs?: number | undefined;
|
|
2761
|
+
keepWorkspaces?: boolean | undefined;
|
|
2762
|
+
traceFile?: string | undefined;
|
|
2763
|
+
otelFile?: string | undefined;
|
|
2706
2764
|
} | undefined;
|
|
2707
2765
|
cache?: {
|
|
2708
2766
|
enabled?: boolean | undefined;
|
|
@@ -2839,6 +2897,8 @@ interface ScriptExecutionContext {
|
|
|
2839
2897
|
readonly evalRunId: string;
|
|
2840
2898
|
readonly caseInput?: string;
|
|
2841
2899
|
readonly caseMetadata?: Record<string, unknown>;
|
|
2900
|
+
/** Directory containing the eval YAML file. Used as default cwd. */
|
|
2901
|
+
readonly evalDir?: string;
|
|
2842
2902
|
}
|
|
2843
2903
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2844
2904
|
/**
|
|
@@ -3096,4 +3156,4 @@ type AgentKernel = {
|
|
|
3096
3156
|
};
|
|
3097
3157
|
declare function createAgentKernel(): AgentKernel;
|
|
3098
3158
|
|
|
3099
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3159
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -1004,6 +1004,12 @@ interface TrialResult {
|
|
|
1004
1004
|
readonly scores?: readonly EvaluatorResult[];
|
|
1005
1005
|
readonly error?: string;
|
|
1006
1006
|
readonly costUsd?: number;
|
|
1007
|
+
/** Primary classification for this trial attempt */
|
|
1008
|
+
readonly executionStatus?: ExecutionStatus;
|
|
1009
|
+
/** Pipeline stage where failure occurred */
|
|
1010
|
+
readonly failureStage?: FailureStage;
|
|
1011
|
+
/** Machine-readable failure reason code */
|
|
1012
|
+
readonly failureReasonCode?: string;
|
|
1007
1013
|
}
|
|
1008
1014
|
/**
|
|
1009
1015
|
* Aggregation metadata for pass_at_k strategy.
|
|
@@ -1036,6 +1042,24 @@ interface ConfidenceIntervalAggregation {
|
|
|
1036
1042
|
* Discriminated union of trial aggregation results.
|
|
1037
1043
|
*/
|
|
1038
1044
|
type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation;
|
|
1045
|
+
/**
|
|
1046
|
+
* Primary classification of evaluation outcome.
|
|
1047
|
+
* - 'ok': evaluation completed, score reflects model quality (score >= 0.8)
|
|
1048
|
+
* - 'quality_failure': evaluation completed but model scored below threshold
|
|
1049
|
+
* - 'execution_error': evaluation could not complete due to infrastructure/tooling error
|
|
1050
|
+
*/
|
|
1051
|
+
type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error';
|
|
1052
|
+
/**
|
|
1053
|
+
* Pipeline stage where the failure occurred.
|
|
1054
|
+
*/
|
|
1055
|
+
type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown';
|
|
1056
|
+
/**
|
|
1057
|
+
* Structured error detail for execution failures.
|
|
1058
|
+
*/
|
|
1059
|
+
interface ExecutionError {
|
|
1060
|
+
readonly message: string;
|
|
1061
|
+
readonly stage: FailureStage;
|
|
1062
|
+
}
|
|
1039
1063
|
/**
|
|
1040
1064
|
* Evaluator scorecard for a single eval case run.
|
|
1041
1065
|
*/
|
|
@@ -1093,6 +1117,14 @@ interface EvaluationResult {
|
|
|
1093
1117
|
readonly costLimited?: boolean;
|
|
1094
1118
|
/** Whether the evaluation was skipped due to suite-level budget exhaustion */
|
|
1095
1119
|
readonly budgetExceeded?: boolean;
|
|
1120
|
+
/** Primary classification: ok, quality_failure, or execution_error */
|
|
1121
|
+
readonly executionStatus: ExecutionStatus;
|
|
1122
|
+
/** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
|
|
1123
|
+
readonly failureStage?: FailureStage;
|
|
1124
|
+
/** Machine-readable failure reason code (only when executionStatus !== 'ok') */
|
|
1125
|
+
readonly failureReasonCode?: string;
|
|
1126
|
+
/** Structured error detail (only when executionStatus === 'execution_error') */
|
|
1127
|
+
readonly executionError?: ExecutionError;
|
|
1096
1128
|
}
|
|
1097
1129
|
type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip';
|
|
1098
1130
|
interface EvaluatorResult {
|
|
@@ -2305,7 +2337,7 @@ declare class RepoManager {
|
|
|
2305
2337
|
* Creates on first access, fetches updates on subsequent calls.
|
|
2306
2338
|
* Returns the absolute path to the cache directory.
|
|
2307
2339
|
*/
|
|
2308
|
-
ensureCache(source: RepoSource, depth?: number): Promise<string>;
|
|
2340
|
+
ensureCache(source: RepoSource, depth?: number, resolve?: 'remote' | 'local'): Promise<string>;
|
|
2309
2341
|
/**
|
|
2310
2342
|
* Clone a repo from cache into the workspace at the configured path.
|
|
2311
2343
|
* Handles checkout, ref resolution, ancestor walking, shallow clone, sparse checkout.
|
|
@@ -2367,6 +2399,8 @@ interface RunEvalCaseOptions {
|
|
|
2367
2399
|
readonly typeRegistry?: EvaluatorRegistry;
|
|
2368
2400
|
/** RepoManager instance for repo lifecycle (shared workspace mode) */
|
|
2369
2401
|
readonly repoManager?: RepoManager;
|
|
2402
|
+
/** Directory containing the eval YAML file. Used as default cwd for workspace scripts. */
|
|
2403
|
+
readonly evalDir?: string;
|
|
2370
2404
|
}
|
|
2371
2405
|
interface ProgressEvent {
|
|
2372
2406
|
readonly workerId: number;
|
|
@@ -2628,14 +2662,30 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
2628
2662
|
maxRetries: z.ZodOptional<z.ZodNumber>;
|
|
2629
2663
|
/** Agent timeout in milliseconds (default: 120000) */
|
|
2630
2664
|
agentTimeoutMs: z.ZodOptional<z.ZodNumber>;
|
|
2665
|
+
/** Enable verbose logging */
|
|
2666
|
+
verbose: z.ZodOptional<z.ZodBoolean>;
|
|
2667
|
+
/** Write human-readable trace JSONL to this path (supports {timestamp} placeholder) */
|
|
2668
|
+
traceFile: z.ZodOptional<z.ZodString>;
|
|
2669
|
+
/** Always keep temp workspaces after eval */
|
|
2670
|
+
keepWorkspaces: z.ZodOptional<z.ZodBoolean>;
|
|
2671
|
+
/** Write OTLP JSON trace to this path (supports {timestamp} placeholder) */
|
|
2672
|
+
otelFile: z.ZodOptional<z.ZodString>;
|
|
2631
2673
|
}, "strip", z.ZodTypeAny, {
|
|
2674
|
+
verbose?: boolean | undefined;
|
|
2632
2675
|
workers?: number | undefined;
|
|
2633
2676
|
maxRetries?: number | undefined;
|
|
2634
2677
|
agentTimeoutMs?: number | undefined;
|
|
2678
|
+
keepWorkspaces?: boolean | undefined;
|
|
2679
|
+
traceFile?: string | undefined;
|
|
2680
|
+
otelFile?: string | undefined;
|
|
2635
2681
|
}, {
|
|
2682
|
+
verbose?: boolean | undefined;
|
|
2636
2683
|
workers?: number | undefined;
|
|
2637
2684
|
maxRetries?: number | undefined;
|
|
2638
2685
|
agentTimeoutMs?: number | undefined;
|
|
2686
|
+
keepWorkspaces?: boolean | undefined;
|
|
2687
|
+
traceFile?: string | undefined;
|
|
2688
|
+
otelFile?: string | undefined;
|
|
2639
2689
|
}>>;
|
|
2640
2690
|
/** Output settings */
|
|
2641
2691
|
output: z.ZodOptional<z.ZodObject<{
|
|
@@ -2682,9 +2732,13 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
2682
2732
|
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2683
2733
|
} | undefined;
|
|
2684
2734
|
execution?: {
|
|
2735
|
+
verbose?: boolean | undefined;
|
|
2685
2736
|
workers?: number | undefined;
|
|
2686
2737
|
maxRetries?: number | undefined;
|
|
2687
2738
|
agentTimeoutMs?: number | undefined;
|
|
2739
|
+
keepWorkspaces?: boolean | undefined;
|
|
2740
|
+
traceFile?: string | undefined;
|
|
2741
|
+
otelFile?: string | undefined;
|
|
2688
2742
|
} | undefined;
|
|
2689
2743
|
cache?: {
|
|
2690
2744
|
enabled?: boolean | undefined;
|
|
@@ -2700,9 +2754,13 @@ declare const AgentVConfigSchema: z.ZodObject<{
|
|
|
2700
2754
|
format?: "yaml" | "jsonl" | "json" | "xml" | undefined;
|
|
2701
2755
|
} | undefined;
|
|
2702
2756
|
execution?: {
|
|
2757
|
+
verbose?: boolean | undefined;
|
|
2703
2758
|
workers?: number | undefined;
|
|
2704
2759
|
maxRetries?: number | undefined;
|
|
2705
2760
|
agentTimeoutMs?: number | undefined;
|
|
2761
|
+
keepWorkspaces?: boolean | undefined;
|
|
2762
|
+
traceFile?: string | undefined;
|
|
2763
|
+
otelFile?: string | undefined;
|
|
2706
2764
|
} | undefined;
|
|
2707
2765
|
cache?: {
|
|
2708
2766
|
enabled?: boolean | undefined;
|
|
@@ -2839,6 +2897,8 @@ interface ScriptExecutionContext {
|
|
|
2839
2897
|
readonly evalRunId: string;
|
|
2840
2898
|
readonly caseInput?: string;
|
|
2841
2899
|
readonly caseMetadata?: Record<string, unknown>;
|
|
2900
|
+
/** Directory containing the eval YAML file. Used as default cwd. */
|
|
2901
|
+
readonly evalDir?: string;
|
|
2842
2902
|
}
|
|
2843
2903
|
type ScriptFailureMode = 'fatal' | 'warn';
|
|
2844
2904
|
/**
|
|
@@ -3096,4 +3156,4 @@ type AgentKernel = {
|
|
|
3096
3156
|
};
|
|
3097
3157
|
declare function createAgentKernel(): AgentKernel;
|
|
3098
3158
|
|
|
3099
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3159
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|