@agentv/core 3.13.0 → 3.13.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-4XWPXNQM.js → chunk-ZB3AUPES.js} +1 -3
- package/dist/chunk-ZB3AUPES.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +0 -2
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +65 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +21 -22
- package/dist/index.d.ts +21 -22
- package/dist/index.js +65 -31
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-4XWPXNQM.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
473
473
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
474
474
|
*/
|
|
475
475
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
476
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "
|
|
476
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
477
477
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
478
478
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
479
479
|
/**
|
|
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
|
|
|
576
576
|
};
|
|
577
577
|
type CodeEvaluatorConfig = {
|
|
578
578
|
readonly name: string;
|
|
579
|
-
readonly type: 'code-
|
|
579
|
+
readonly type: 'code-grader';
|
|
580
580
|
readonly command: readonly string[];
|
|
581
581
|
/** @deprecated Use `command` instead */
|
|
582
582
|
readonly script?: readonly string[];
|
|
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
|
|
|
606
606
|
};
|
|
607
607
|
type LlmGraderEvaluatorConfig = {
|
|
608
608
|
readonly name: string;
|
|
609
|
-
readonly type: 'llm-grader'
|
|
609
|
+
readonly type: 'llm-grader';
|
|
610
610
|
/** Text prompt (inline or file path) or executable script config */
|
|
611
611
|
readonly prompt?: string | PromptScriptConfig;
|
|
612
612
|
readonly promptPath?: string;
|
|
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
|
|
|
678
678
|
readonly type: 'code-grader';
|
|
679
679
|
readonly path: string;
|
|
680
680
|
readonly cwd?: string;
|
|
681
|
-
} | {
|
|
682
|
-
readonly type: 'code-judge';
|
|
683
|
-
readonly path: string;
|
|
684
|
-
readonly cwd?: string;
|
|
685
681
|
} | {
|
|
686
682
|
readonly type: 'llm-grader';
|
|
687
683
|
readonly prompt?: string;
|
|
688
684
|
readonly promptPath?: string;
|
|
689
685
|
readonly model?: string;
|
|
690
|
-
} | {
|
|
691
|
-
readonly type: 'llm-judge';
|
|
692
|
-
readonly prompt?: string;
|
|
693
|
-
readonly promptPath?: string;
|
|
694
|
-
readonly model?: string;
|
|
695
686
|
} | {
|
|
696
687
|
readonly type: 'threshold';
|
|
697
688
|
readonly threshold: number;
|
|
@@ -1310,6 +1301,12 @@ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
|
1310
1301
|
* Returns undefined when not specified.
|
|
1311
1302
|
*/
|
|
1312
1303
|
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1304
|
+
/**
|
|
1305
|
+
* Extract `execution.threshold` from parsed eval suite.
|
|
1306
|
+
* Accepts a number in [0, 1] range.
|
|
1307
|
+
* Returns undefined when not specified.
|
|
1308
|
+
*/
|
|
1309
|
+
declare function extractThreshold(suite: JsonObject): number | undefined;
|
|
1313
1310
|
|
|
1314
1311
|
/**
|
|
1315
1312
|
* Formatting mode for segment content.
|
|
@@ -1372,6 +1369,8 @@ type EvalSuiteResult = {
|
|
|
1372
1369
|
readonly totalBudgetUsd?: number;
|
|
1373
1370
|
/** Execution error tolerance: true or false */
|
|
1374
1371
|
readonly failOnError?: FailOnError;
|
|
1372
|
+
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
|
|
1373
|
+
readonly threshold?: number;
|
|
1375
1374
|
};
|
|
1376
1375
|
/**
|
|
1377
1376
|
* Load tests and suite metadata from a single parse.
|
|
@@ -2117,7 +2116,7 @@ interface CodeEvaluatorOptions {
|
|
|
2117
2116
|
readonly target?: TargetAccessConfig;
|
|
2118
2117
|
}
|
|
2119
2118
|
declare class CodeEvaluator implements Evaluator {
|
|
2120
|
-
readonly kind = "code-
|
|
2119
|
+
readonly kind = "code-grader";
|
|
2121
2120
|
private readonly command;
|
|
2122
2121
|
private readonly cwd?;
|
|
2123
2122
|
private readonly agentTimeoutMs?;
|
|
@@ -2852,7 +2851,7 @@ interface EvalTestInput {
|
|
|
2852
2851
|
readonly expectedOutput?: string;
|
|
2853
2852
|
/** @deprecated Use `expectedOutput` instead */
|
|
2854
2853
|
readonly expected_output?: string;
|
|
2855
|
-
/** Assertion
|
|
2854
|
+
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
2856
2855
|
readonly assert?: readonly AssertEntry[];
|
|
2857
2856
|
/** Arbitrary metadata */
|
|
2858
2857
|
readonly metadata?: Record<string, unknown>;
|
|
@@ -2862,7 +2861,7 @@ interface EvalTestInput {
|
|
|
2862
2861
|
* Matches the YAML `assert` block structure.
|
|
2863
2862
|
*/
|
|
2864
2863
|
interface EvalAssertionInput {
|
|
2865
|
-
/** Assertion type (e.g., 'contains', 'llm-
|
|
2864
|
+
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
2866
2865
|
readonly type: string;
|
|
2867
2866
|
/** Display name */
|
|
2868
2867
|
readonly name?: string;
|
|
@@ -2872,9 +2871,9 @@ interface EvalAssertionInput {
|
|
|
2872
2871
|
readonly weight?: number;
|
|
2873
2872
|
/** Whether this assertion is required to pass */
|
|
2874
2873
|
readonly required?: boolean | number;
|
|
2875
|
-
/** Prompt file for
|
|
2874
|
+
/** Prompt file for llm_grader */
|
|
2876
2875
|
readonly prompt?: string;
|
|
2877
|
-
/** Script for
|
|
2876
|
+
/** Script for code_grader */
|
|
2878
2877
|
readonly script?: string | readonly string[];
|
|
2879
2878
|
/** Additional config passed to the assertion */
|
|
2880
2879
|
readonly config?: Record<string, unknown>;
|
|
@@ -3568,17 +3567,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3568
3567
|
* Convention-based discovery of custom assertion scripts.
|
|
3569
3568
|
*
|
|
3570
3569
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3571
|
-
* them as code
|
|
3572
|
-
* extension) becomes the
|
|
3570
|
+
* them as code graders in the registry. The file name (without
|
|
3571
|
+
* extension) becomes the grader type name.
|
|
3573
3572
|
*
|
|
3574
3573
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
|
3575
3574
|
*/
|
|
3576
3575
|
|
|
3577
3576
|
/**
|
|
3578
3577
|
* Discover custom assertion scripts from `.agentv/assertions/` and register
|
|
3579
|
-
* them as
|
|
3578
|
+
* them as grader types in the registry.
|
|
3580
3579
|
*
|
|
3581
|
-
* @param registry - The
|
|
3580
|
+
* @param registry - The grader registry to register discovered assertions into
|
|
3582
3581
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
3583
3582
|
* @returns Names of discovered assertion types
|
|
3584
3583
|
*/
|
|
@@ -3609,4 +3608,4 @@ type AgentKernel = {
|
|
|
3609
3608
|
};
|
|
3610
3609
|
declare function createAgentKernel(): AgentKernel;
|
|
3611
3610
|
|
|
3612
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
3611
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.d.ts
CHANGED
|
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
473
473
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
474
474
|
*/
|
|
475
475
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
476
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "
|
|
476
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
477
477
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
478
478
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
479
479
|
/**
|
|
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
|
|
|
576
576
|
};
|
|
577
577
|
type CodeEvaluatorConfig = {
|
|
578
578
|
readonly name: string;
|
|
579
|
-
readonly type: 'code-
|
|
579
|
+
readonly type: 'code-grader';
|
|
580
580
|
readonly command: readonly string[];
|
|
581
581
|
/** @deprecated Use `command` instead */
|
|
582
582
|
readonly script?: readonly string[];
|
|
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
|
|
|
606
606
|
};
|
|
607
607
|
type LlmGraderEvaluatorConfig = {
|
|
608
608
|
readonly name: string;
|
|
609
|
-
readonly type: 'llm-grader'
|
|
609
|
+
readonly type: 'llm-grader';
|
|
610
610
|
/** Text prompt (inline or file path) or executable script config */
|
|
611
611
|
readonly prompt?: string | PromptScriptConfig;
|
|
612
612
|
readonly promptPath?: string;
|
|
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
|
|
|
678
678
|
readonly type: 'code-grader';
|
|
679
679
|
readonly path: string;
|
|
680
680
|
readonly cwd?: string;
|
|
681
|
-
} | {
|
|
682
|
-
readonly type: 'code-judge';
|
|
683
|
-
readonly path: string;
|
|
684
|
-
readonly cwd?: string;
|
|
685
681
|
} | {
|
|
686
682
|
readonly type: 'llm-grader';
|
|
687
683
|
readonly prompt?: string;
|
|
688
684
|
readonly promptPath?: string;
|
|
689
685
|
readonly model?: string;
|
|
690
|
-
} | {
|
|
691
|
-
readonly type: 'llm-judge';
|
|
692
|
-
readonly prompt?: string;
|
|
693
|
-
readonly promptPath?: string;
|
|
694
|
-
readonly model?: string;
|
|
695
686
|
} | {
|
|
696
687
|
readonly type: 'threshold';
|
|
697
688
|
readonly threshold: number;
|
|
@@ -1310,6 +1301,12 @@ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
|
1310
1301
|
* Returns undefined when not specified.
|
|
1311
1302
|
*/
|
|
1312
1303
|
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1304
|
+
/**
|
|
1305
|
+
* Extract `execution.threshold` from parsed eval suite.
|
|
1306
|
+
* Accepts a number in [0, 1] range.
|
|
1307
|
+
* Returns undefined when not specified.
|
|
1308
|
+
*/
|
|
1309
|
+
declare function extractThreshold(suite: JsonObject): number | undefined;
|
|
1313
1310
|
|
|
1314
1311
|
/**
|
|
1315
1312
|
* Formatting mode for segment content.
|
|
@@ -1372,6 +1369,8 @@ type EvalSuiteResult = {
|
|
|
1372
1369
|
readonly totalBudgetUsd?: number;
|
|
1373
1370
|
/** Execution error tolerance: true or false */
|
|
1374
1371
|
readonly failOnError?: FailOnError;
|
|
1372
|
+
/** Suite-level quality threshold (0-1) — suite fails if mean score is below */
|
|
1373
|
+
readonly threshold?: number;
|
|
1375
1374
|
};
|
|
1376
1375
|
/**
|
|
1377
1376
|
* Load tests and suite metadata from a single parse.
|
|
@@ -2117,7 +2116,7 @@ interface CodeEvaluatorOptions {
|
|
|
2117
2116
|
readonly target?: TargetAccessConfig;
|
|
2118
2117
|
}
|
|
2119
2118
|
declare class CodeEvaluator implements Evaluator {
|
|
2120
|
-
readonly kind = "code-
|
|
2119
|
+
readonly kind = "code-grader";
|
|
2121
2120
|
private readonly command;
|
|
2122
2121
|
private readonly cwd?;
|
|
2123
2122
|
private readonly agentTimeoutMs?;
|
|
@@ -2852,7 +2851,7 @@ interface EvalTestInput {
|
|
|
2852
2851
|
readonly expectedOutput?: string;
|
|
2853
2852
|
/** @deprecated Use `expectedOutput` instead */
|
|
2854
2853
|
readonly expected_output?: string;
|
|
2855
|
-
/** Assertion
|
|
2854
|
+
/** Assertion graders — accepts factory functions, config objects, or inline functions */
|
|
2856
2855
|
readonly assert?: readonly AssertEntry[];
|
|
2857
2856
|
/** Arbitrary metadata */
|
|
2858
2857
|
readonly metadata?: Record<string, unknown>;
|
|
@@ -2862,7 +2861,7 @@ interface EvalTestInput {
|
|
|
2862
2861
|
* Matches the YAML `assert` block structure.
|
|
2863
2862
|
*/
|
|
2864
2863
|
interface EvalAssertionInput {
|
|
2865
|
-
/** Assertion type (e.g., 'contains', 'llm-
|
|
2864
|
+
/** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
|
|
2866
2865
|
readonly type: string;
|
|
2867
2866
|
/** Display name */
|
|
2868
2867
|
readonly name?: string;
|
|
@@ -2872,9 +2871,9 @@ interface EvalAssertionInput {
|
|
|
2872
2871
|
readonly weight?: number;
|
|
2873
2872
|
/** Whether this assertion is required to pass */
|
|
2874
2873
|
readonly required?: boolean | number;
|
|
2875
|
-
/** Prompt file for
|
|
2874
|
+
/** Prompt file for llm_grader */
|
|
2876
2875
|
readonly prompt?: string;
|
|
2877
|
-
/** Script for
|
|
2876
|
+
/** Script for code_grader */
|
|
2878
2877
|
readonly script?: string | readonly string[];
|
|
2879
2878
|
/** Additional config passed to the assertion */
|
|
2880
2879
|
readonly config?: Record<string, unknown>;
|
|
@@ -3568,17 +3567,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3568
3567
|
* Convention-based discovery of custom assertion scripts.
|
|
3569
3568
|
*
|
|
3570
3569
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3571
|
-
* them as code
|
|
3572
|
-
* extension) becomes the
|
|
3570
|
+
* them as code graders in the registry. The file name (without
|
|
3571
|
+
* extension) becomes the grader type name.
|
|
3573
3572
|
*
|
|
3574
3573
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
|
3575
3574
|
*/
|
|
3576
3575
|
|
|
3577
3576
|
/**
|
|
3578
3577
|
* Discover custom assertion scripts from `.agentv/assertions/` and register
|
|
3579
|
-
* them as
|
|
3578
|
+
* them as grader types in the registry.
|
|
3580
3579
|
*
|
|
3581
|
-
* @param registry - The
|
|
3580
|
+
* @param registry - The grader registry to register discovered assertions into
|
|
3582
3581
|
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
3583
3582
|
* @returns Names of discovered assertion types
|
|
3584
3583
|
*/
|
|
@@ -3609,4 +3608,4 @@ type AgentKernel = {
|
|
|
3609
3608
|
};
|
|
3610
3609
|
declare function createAgentKernel(): AgentKernel;
|
|
3611
3610
|
|
|
3612
|
-
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
|
3611
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|
package/dist/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
readTextFile,
|
|
20
20
|
resolveFileReference,
|
|
21
21
|
resolveTargetDefinition
|
|
22
|
-
} from "./chunk-
|
|
22
|
+
} from "./chunk-ZB3AUPES.js";
|
|
23
23
|
import {
|
|
24
24
|
AgentvProvider
|
|
25
25
|
} from "./chunk-W5YDZWT4.js";
|
|
@@ -601,6 +601,22 @@ function extractFailOnError(suite) {
|
|
|
601
601
|
logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
|
|
602
602
|
return void 0;
|
|
603
603
|
}
|
|
604
|
+
function extractThreshold(suite) {
|
|
605
|
+
const execution = suite.execution;
|
|
606
|
+
if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
|
|
607
|
+
return void 0;
|
|
608
|
+
}
|
|
609
|
+
const executionObj = execution;
|
|
610
|
+
const raw = executionObj.threshold;
|
|
611
|
+
if (raw === void 0 || raw === null) {
|
|
612
|
+
return void 0;
|
|
613
|
+
}
|
|
614
|
+
if (typeof raw === "number" && raw >= 0 && raw <= 1) {
|
|
615
|
+
return raw;
|
|
616
|
+
}
|
|
617
|
+
logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
|
|
618
|
+
return void 0;
|
|
619
|
+
}
|
|
604
620
|
function parseExecutionDefaults(raw, configPath) {
|
|
605
621
|
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
|
606
622
|
return void 0;
|
|
@@ -728,6 +744,9 @@ var ANSI_RESET4 = "\x1B[0m";
|
|
|
728
744
|
function normalizeEvaluatorType(type) {
|
|
729
745
|
return type.replace(/_/g, "-");
|
|
730
746
|
}
|
|
747
|
+
function isDeprecatedJudgeType(type) {
|
|
748
|
+
return type === "code-judge" || type === "llm-judge";
|
|
749
|
+
}
|
|
731
750
|
async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
|
|
732
751
|
const execution = rawEvalCase.execution;
|
|
733
752
|
const executionObject = isJsonObject2(execution) ? execution : void 0;
|
|
@@ -790,6 +809,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
790
809
|
const rawName = asString(rawEvaluator.name);
|
|
791
810
|
const rawType = rawEvaluator.type;
|
|
792
811
|
const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
|
|
812
|
+
if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
|
|
813
|
+
logWarning2(
|
|
814
|
+
`Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
|
|
815
|
+
);
|
|
816
|
+
continue;
|
|
817
|
+
}
|
|
793
818
|
const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
|
|
794
819
|
if (typeof typeValue !== "string") {
|
|
795
820
|
logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
|
|
@@ -822,7 +847,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
822
847
|
});
|
|
823
848
|
continue;
|
|
824
849
|
}
|
|
825
|
-
if (typeValue === "code-grader"
|
|
850
|
+
if (typeValue === "code-grader") {
|
|
826
851
|
let command;
|
|
827
852
|
if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
|
|
828
853
|
console.warn(
|
|
@@ -932,7 +957,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
932
957
|
continue;
|
|
933
958
|
}
|
|
934
959
|
const aggregatorType = asString(rawAggregator.type);
|
|
935
|
-
|
|
960
|
+
const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
|
|
961
|
+
if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
|
|
962
|
+
logWarning2(
|
|
963
|
+
`Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
|
|
964
|
+
);
|
|
965
|
+
continue;
|
|
966
|
+
}
|
|
967
|
+
if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
|
|
936
968
|
logWarning2(
|
|
937
969
|
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
938
970
|
);
|
|
@@ -967,7 +999,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
967
999
|
continue;
|
|
968
1000
|
}
|
|
969
1001
|
let aggregator;
|
|
970
|
-
if (
|
|
1002
|
+
if (normalizedAggregatorType === "weighted_average") {
|
|
971
1003
|
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
972
1004
|
const parsedWeights = {};
|
|
973
1005
|
if (weights) {
|
|
@@ -981,7 +1013,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
981
1013
|
type: "weighted_average",
|
|
982
1014
|
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
983
1015
|
};
|
|
984
|
-
} else if (
|
|
1016
|
+
} else if (normalizedAggregatorType === "code-grader") {
|
|
985
1017
|
const aggregatorPath = asString(rawAggregator.path);
|
|
986
1018
|
if (!aggregatorPath) {
|
|
987
1019
|
logWarning2(
|
|
@@ -994,7 +1026,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
|
|
|
994
1026
|
path: aggregatorPath,
|
|
995
1027
|
cwd: searchRoots[0]
|
|
996
1028
|
};
|
|
997
|
-
} else if (
|
|
1029
|
+
} else if (normalizedAggregatorType === "threshold") {
|
|
998
1030
|
const thresholdValue = rawAggregator.threshold;
|
|
999
1031
|
if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
|
|
1000
1032
|
logWarning2(
|
|
@@ -1742,10 +1774,15 @@ function coerceEvaluator(candidate, contextId) {
|
|
|
1742
1774
|
return void 0;
|
|
1743
1775
|
}
|
|
1744
1776
|
const normalized = normalizeEvaluatorType(candidate);
|
|
1777
|
+
if (isDeprecatedJudgeType(normalized)) {
|
|
1778
|
+
throw new Error(
|
|
1779
|
+
`Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
|
|
1780
|
+
);
|
|
1781
|
+
}
|
|
1745
1782
|
if (isEvaluatorKind(normalized)) {
|
|
1746
1783
|
return normalized;
|
|
1747
1784
|
}
|
|
1748
|
-
logWarning2(`Unknown
|
|
1785
|
+
logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
|
|
1749
1786
|
return void 0;
|
|
1750
1787
|
}
|
|
1751
1788
|
function asString(value) {
|
|
@@ -2729,6 +2766,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2729
2766
|
const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
|
|
2730
2767
|
const metadata = parseMetadata(parsed);
|
|
2731
2768
|
const failOnError = extractFailOnError(parsed);
|
|
2769
|
+
const threshold = extractThreshold(parsed);
|
|
2732
2770
|
return {
|
|
2733
2771
|
tests,
|
|
2734
2772
|
trials: extractTrialsConfig(parsed),
|
|
@@ -2737,7 +2775,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
|
|
|
2737
2775
|
cacheConfig: extractCacheConfig(parsed),
|
|
2738
2776
|
totalBudgetUsd: extractTotalBudgetUsd(parsed),
|
|
2739
2777
|
...metadata !== void 0 && { metadata },
|
|
2740
|
-
...failOnError !== void 0 && { failOnError }
|
|
2778
|
+
...failOnError !== void 0 && { failOnError },
|
|
2779
|
+
...threshold !== void 0 && { threshold }
|
|
2741
2780
|
};
|
|
2742
2781
|
}
|
|
2743
2782
|
var loadEvalSuite = loadTestSuite;
|
|
@@ -3178,9 +3217,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
3178
3217
|
case "ends_with":
|
|
3179
3218
|
return `Output ends with '${entry.value}'`;
|
|
3180
3219
|
case "llm-grader":
|
|
3181
|
-
case "llm_grader":
|
|
3182
|
-
case "llm-judge":
|
|
3183
|
-
case "llm_judge": {
|
|
3220
|
+
case "llm_grader": {
|
|
3184
3221
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
3185
3222
|
return null;
|
|
3186
3223
|
}
|
|
@@ -3193,9 +3230,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
3193
3230
|
return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
|
|
3194
3231
|
}
|
|
3195
3232
|
case "code-grader":
|
|
3196
|
-
case "code_grader":
|
|
3197
|
-
case "code-judge":
|
|
3198
|
-
case "code_judge": {
|
|
3233
|
+
case "code_grader": {
|
|
3199
3234
|
const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
|
|
3200
3235
|
const desc = typeof entry.description === "string" ? entry.description : void 0;
|
|
3201
3236
|
return codeGraderInstruction(graderName, desc);
|
|
@@ -3226,7 +3261,7 @@ function assertionToNaturalLanguage(entry) {
|
|
|
3226
3261
|
}
|
|
3227
3262
|
}
|
|
3228
3263
|
function assertionToNaturalLanguageList(entry) {
|
|
3229
|
-
if (entry.type === "llm-grader" || entry.type === "llm_grader"
|
|
3264
|
+
if (entry.type === "llm-grader" || entry.type === "llm_grader") {
|
|
3230
3265
|
if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
|
|
3231
3266
|
return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
|
|
3232
3267
|
}
|
|
@@ -10083,7 +10118,7 @@ function toCamelCaseDeep(obj) {
|
|
|
10083
10118
|
// src/evaluation/evaluators/code-evaluator.ts
|
|
10084
10119
|
var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
|
|
10085
10120
|
var CodeEvaluator = class {
|
|
10086
|
-
kind = "code-
|
|
10121
|
+
kind = "code-grader";
|
|
10087
10122
|
command;
|
|
10088
10123
|
cwd;
|
|
10089
10124
|
agentTimeoutMs;
|
|
@@ -10102,7 +10137,7 @@ var CodeEvaluator = class {
|
|
|
10102
10137
|
if (outputForPayload) {
|
|
10103
10138
|
const serialized = JSON.stringify(outputForPayload);
|
|
10104
10139
|
if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
|
|
10105
|
-
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-
|
|
10140
|
+
const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
|
|
10106
10141
|
outputPath = join(tmpDir, "output.json");
|
|
10107
10142
|
await writeFile6(outputPath, serialized);
|
|
10108
10143
|
outputForPayload = null;
|
|
@@ -10360,7 +10395,7 @@ var LlmGraderEvaluator = class {
|
|
|
10360
10395
|
return this.evaluateWithDelegatedAgent(context, graderProvider);
|
|
10361
10396
|
}
|
|
10362
10397
|
const config = context.evaluator;
|
|
10363
|
-
if (
|
|
10398
|
+
if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
|
|
10364
10399
|
return this.evaluateWithRubrics(context, graderProvider, config.rubrics);
|
|
10365
10400
|
}
|
|
10366
10401
|
return this.evaluateFreeform(context, graderProvider);
|
|
@@ -10545,7 +10580,7 @@ ${context.fileChanges}`;
|
|
|
10545
10580
|
const systemPrompt = this.buildAgentSystemPrompt(context);
|
|
10546
10581
|
const userPrompt = this.buildAgentUserPrompt(context);
|
|
10547
10582
|
const config = context.evaluator;
|
|
10548
|
-
const rubrics = config?.type === "llm-grader"
|
|
10583
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10549
10584
|
const fsTools = createFilesystemTools(workspacePath);
|
|
10550
10585
|
const evaluatorRawRequest = {
|
|
10551
10586
|
mode: "built-in",
|
|
@@ -10641,7 +10676,7 @@ ${context.fileChanges}`;
|
|
|
10641
10676
|
};
|
|
10642
10677
|
}
|
|
10643
10678
|
const config = context.evaluator;
|
|
10644
|
-
const rubrics = config?.type === "llm-grader"
|
|
10679
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10645
10680
|
const details = {
|
|
10646
10681
|
mode: modeLabel,
|
|
10647
10682
|
grader_target: provider.targetName
|
|
@@ -10681,7 +10716,7 @@ ${context.fileChanges}`;
|
|
|
10681
10716
|
*/
|
|
10682
10717
|
buildAgentSystemPrompt(context) {
|
|
10683
10718
|
const config = context.evaluator;
|
|
10684
|
-
const rubrics = config?.type === "llm-grader"
|
|
10719
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10685
10720
|
const parts = [
|
|
10686
10721
|
"You are an expert evaluator with access to the workspace filesystem.",
|
|
10687
10722
|
"Use the provided tools to investigate the workspace and verify the criteria are met.",
|
|
@@ -10712,7 +10747,7 @@ ${context.fileChanges}`;
|
|
|
10712
10747
|
return substituteVariables(this.evaluatorTemplate, variables);
|
|
10713
10748
|
}
|
|
10714
10749
|
const config = context.evaluator;
|
|
10715
|
-
const rubrics = config?.type === "llm-grader"
|
|
10750
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10716
10751
|
const parts = [
|
|
10717
10752
|
"Evaluate the candidate answer by investigating the workspace.",
|
|
10718
10753
|
"",
|
|
@@ -10755,7 +10790,7 @@ ${context.fileChanges}`;
|
|
|
10755
10790
|
buildDelegatedPrompt(context) {
|
|
10756
10791
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
10757
10792
|
const config = context.evaluator;
|
|
10758
|
-
const rubrics = config?.type === "llm-grader"
|
|
10793
|
+
const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
|
|
10759
10794
|
if (this.evaluatorTemplate) {
|
|
10760
10795
|
const variables = {
|
|
10761
10796
|
[TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
|
|
@@ -11252,10 +11287,8 @@ var CompositeEvaluator = class {
|
|
|
11252
11287
|
const aggregator = this.config.aggregator;
|
|
11253
11288
|
switch (aggregator.type) {
|
|
11254
11289
|
case "code-grader":
|
|
11255
|
-
case "code-judge":
|
|
11256
11290
|
return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
|
|
11257
11291
|
case "llm-grader":
|
|
11258
|
-
case "llm-judge":
|
|
11259
11292
|
return this.runLlmAggregator(results, context, aggregator);
|
|
11260
11293
|
case "threshold":
|
|
11261
11294
|
return this.runThreshold(results, aggregator.threshold);
|
|
@@ -13677,7 +13710,7 @@ var endsWithFactory = (config) => {
|
|
|
13677
13710
|
};
|
|
13678
13711
|
function createBuiltinRegistry() {
|
|
13679
13712
|
const registry = new EvaluatorRegistry();
|
|
13680
|
-
registry.register("llm-grader", llmGraderFactory).register("
|
|
13713
|
+
registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
|
|
13681
13714
|
const fn = config[INLINE_ASSERT_FN];
|
|
13682
13715
|
if (!fn) {
|
|
13683
13716
|
throw new Error(
|
|
@@ -16395,7 +16428,7 @@ function filterEvalCases(evalCases, filter) {
|
|
|
16395
16428
|
return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
|
|
16396
16429
|
}
|
|
16397
16430
|
function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
|
|
16398
|
-
const llmGrader = overrides?.["llm-grader"] ??
|
|
16431
|
+
const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
|
|
16399
16432
|
resolveGraderProvider: async (context) => {
|
|
16400
16433
|
if (context.graderProvider) {
|
|
16401
16434
|
return context.graderProvider;
|
|
@@ -17239,10 +17272,10 @@ var OtelTraceExporter = class {
|
|
|
17239
17272
|
}
|
|
17240
17273
|
if (result.scores) {
|
|
17241
17274
|
for (const score of result.scores) {
|
|
17242
|
-
rootSpan.addEvent(`agentv.
|
|
17243
|
-
"agentv.
|
|
17244
|
-
"agentv.
|
|
17245
|
-
...score.verdict ? { "agentv.
|
|
17275
|
+
rootSpan.addEvent(`agentv.grader.${score.name}`, {
|
|
17276
|
+
"agentv.grader.score": score.score,
|
|
17277
|
+
"agentv.grader.type": score.type,
|
|
17278
|
+
...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
|
|
17246
17279
|
});
|
|
17247
17280
|
}
|
|
17248
17281
|
}
|
|
@@ -17628,6 +17661,7 @@ export {
|
|
|
17628
17661
|
extractTargetFromSuite,
|
|
17629
17662
|
extractTargetsFromSuite,
|
|
17630
17663
|
extractTargetsFromTestCase,
|
|
17664
|
+
extractThreshold,
|
|
17631
17665
|
extractTrialsConfig,
|
|
17632
17666
|
extractWorkersFromSuite,
|
|
17633
17667
|
fileExists,
|