@agentv/core 3.13.0 → 3.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
473
473
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
474
474
  */
475
475
  declare function isTestMessage(value: unknown): value is TestMessage;
476
- declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
476
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
477
477
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
478
478
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
479
479
  /**
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
576
576
  };
577
577
  type CodeEvaluatorConfig = {
578
578
  readonly name: string;
579
- readonly type: 'code-judge' | 'code-grader';
579
+ readonly type: 'code-grader';
580
580
  readonly command: readonly string[];
581
581
  /** @deprecated Use `command` instead */
582
582
  readonly script?: readonly string[];
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
606
606
  };
607
607
  type LlmGraderEvaluatorConfig = {
608
608
  readonly name: string;
609
- readonly type: 'llm-grader' | 'llm-judge';
609
+ readonly type: 'llm-grader';
610
610
  /** Text prompt (inline or file path) or executable script config */
611
611
  readonly prompt?: string | PromptScriptConfig;
612
612
  readonly promptPath?: string;
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
678
678
  readonly type: 'code-grader';
679
679
  readonly path: string;
680
680
  readonly cwd?: string;
681
- } | {
682
- readonly type: 'code-judge';
683
- readonly path: string;
684
- readonly cwd?: string;
685
681
  } | {
686
682
  readonly type: 'llm-grader';
687
683
  readonly prompt?: string;
688
684
  readonly promptPath?: string;
689
685
  readonly model?: string;
690
- } | {
691
- readonly type: 'llm-judge';
692
- readonly prompt?: string;
693
- readonly promptPath?: string;
694
- readonly model?: string;
695
686
  } | {
696
687
  readonly type: 'threshold';
697
688
  readonly threshold: number;
@@ -1310,6 +1301,12 @@ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1310
1301
  * Returns undefined when not specified.
1311
1302
  */
1312
1303
  declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1304
+ /**
1305
+ * Extract `execution.threshold` from parsed eval suite.
1306
+ * Accepts a number in [0, 1] range.
1307
+ * Returns undefined when not specified.
1308
+ */
1309
+ declare function extractThreshold(suite: JsonObject): number | undefined;
1313
1310
 
1314
1311
  /**
1315
1312
  * Formatting mode for segment content.
@@ -1372,6 +1369,8 @@ type EvalSuiteResult = {
1372
1369
  readonly totalBudgetUsd?: number;
1373
1370
  /** Execution error tolerance: true or false */
1374
1371
  readonly failOnError?: FailOnError;
1372
+ /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
1373
+ readonly threshold?: number;
1375
1374
  };
1376
1375
  /**
1377
1376
  * Load tests and suite metadata from a single parse.
@@ -2117,7 +2116,7 @@ interface CodeEvaluatorOptions {
2117
2116
  readonly target?: TargetAccessConfig;
2118
2117
  }
2119
2118
  declare class CodeEvaluator implements Evaluator {
2120
- readonly kind = "code-judge";
2119
+ readonly kind = "code-grader";
2121
2120
  private readonly command;
2122
2121
  private readonly cwd?;
2123
2122
  private readonly agentTimeoutMs?;
@@ -2852,7 +2851,7 @@ interface EvalTestInput {
2852
2851
  readonly expectedOutput?: string;
2853
2852
  /** @deprecated Use `expectedOutput` instead */
2854
2853
  readonly expected_output?: string;
2855
- /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
2854
+ /** Assertion graders — accepts factory functions, config objects, or inline functions */
2856
2855
  readonly assert?: readonly AssertEntry[];
2857
2856
  /** Arbitrary metadata */
2858
2857
  readonly metadata?: Record<string, unknown>;
@@ -2862,7 +2861,7 @@ interface EvalTestInput {
2862
2861
  * Matches the YAML `assert` block structure.
2863
2862
  */
2864
2863
  interface EvalAssertionInput {
2865
- /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2864
+ /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
2866
2865
  readonly type: string;
2867
2866
  /** Display name */
2868
2867
  readonly name?: string;
@@ -2872,9 +2871,9 @@ interface EvalAssertionInput {
2872
2871
  readonly weight?: number;
2873
2872
  /** Whether this assertion is required to pass */
2874
2873
  readonly required?: boolean | number;
2875
- /** Prompt file for llm_judge */
2874
+ /** Prompt file for llm_grader */
2876
2875
  readonly prompt?: string;
2877
- /** Script for code_judge */
2876
+ /** Script for code_grader */
2878
2877
  readonly script?: string | readonly string[];
2879
2878
  /** Additional config passed to the assertion */
2880
2879
  readonly config?: Record<string, unknown>;
@@ -3568,17 +3567,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3568
3567
  * Convention-based discovery of custom assertion scripts.
3569
3568
  *
3570
3569
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3571
- * them as code-judge evaluators in the registry. The file name (without
3572
- * extension) becomes the evaluator type name.
3570
+ * them as code graders in the registry. The file name (without
3571
+ * extension) becomes the grader type name.
3573
3572
  *
3574
3573
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
3575
3574
  */
3576
3575
 
3577
3576
  /**
3578
3577
  * Discover custom assertion scripts from `.agentv/assertions/` and register
3579
- * them as evaluator types in the registry.
3578
+ * them as grader types in the registry.
3580
3579
  *
3581
- * @param registry - The evaluator registry to register discovered assertions into
3580
+ * @param registry - The grader registry to register discovered assertions into
3582
3581
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
3583
3582
  * @returns Names of discovered assertion types
3584
3583
  */
@@ -3609,4 +3608,4 @@ type AgentKernel = {
3609
3608
  };
3610
3609
  declare function createAgentKernel(): AgentKernel;
3611
3610
 
3612
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
3611
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.d.ts CHANGED
@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
473
473
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
474
474
  */
475
475
  declare function isTestMessage(value: unknown): value is TestMessage;
476
- declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
476
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
477
477
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
478
478
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
479
479
  /**
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
576
576
  };
577
577
  type CodeEvaluatorConfig = {
578
578
  readonly name: string;
579
- readonly type: 'code-judge' | 'code-grader';
579
+ readonly type: 'code-grader';
580
580
  readonly command: readonly string[];
581
581
  /** @deprecated Use `command` instead */
582
582
  readonly script?: readonly string[];
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
606
606
  };
607
607
  type LlmGraderEvaluatorConfig = {
608
608
  readonly name: string;
609
- readonly type: 'llm-grader' | 'llm-judge';
609
+ readonly type: 'llm-grader';
610
610
  /** Text prompt (inline or file path) or executable script config */
611
611
  readonly prompt?: string | PromptScriptConfig;
612
612
  readonly promptPath?: string;
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
678
678
  readonly type: 'code-grader';
679
679
  readonly path: string;
680
680
  readonly cwd?: string;
681
- } | {
682
- readonly type: 'code-judge';
683
- readonly path: string;
684
- readonly cwd?: string;
685
681
  } | {
686
682
  readonly type: 'llm-grader';
687
683
  readonly prompt?: string;
688
684
  readonly promptPath?: string;
689
685
  readonly model?: string;
690
- } | {
691
- readonly type: 'llm-judge';
692
- readonly prompt?: string;
693
- readonly promptPath?: string;
694
- readonly model?: string;
695
686
  } | {
696
687
  readonly type: 'threshold';
697
688
  readonly threshold: number;
@@ -1310,6 +1301,12 @@ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
1310
1301
  * Returns undefined when not specified.
1311
1302
  */
1312
1303
  declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
1304
+ /**
1305
+ * Extract `execution.threshold` from parsed eval suite.
1306
+ * Accepts a number in [0, 1] range.
1307
+ * Returns undefined when not specified.
1308
+ */
1309
+ declare function extractThreshold(suite: JsonObject): number | undefined;
1313
1310
 
1314
1311
  /**
1315
1312
  * Formatting mode for segment content.
@@ -1372,6 +1369,8 @@ type EvalSuiteResult = {
1372
1369
  readonly totalBudgetUsd?: number;
1373
1370
  /** Execution error tolerance: true or false */
1374
1371
  readonly failOnError?: FailOnError;
1372
+ /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
1373
+ readonly threshold?: number;
1375
1374
  };
1376
1375
  /**
1377
1376
  * Load tests and suite metadata from a single parse.
@@ -2117,7 +2116,7 @@ interface CodeEvaluatorOptions {
2117
2116
  readonly target?: TargetAccessConfig;
2118
2117
  }
2119
2118
  declare class CodeEvaluator implements Evaluator {
2120
- readonly kind = "code-judge";
2119
+ readonly kind = "code-grader";
2121
2120
  private readonly command;
2122
2121
  private readonly cwd?;
2123
2122
  private readonly agentTimeoutMs?;
@@ -2852,7 +2851,7 @@ interface EvalTestInput {
2852
2851
  readonly expectedOutput?: string;
2853
2852
  /** @deprecated Use `expectedOutput` instead */
2854
2853
  readonly expected_output?: string;
2855
- /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
2854
+ /** Assertion graders — accepts factory functions, config objects, or inline functions */
2856
2855
  readonly assert?: readonly AssertEntry[];
2857
2856
  /** Arbitrary metadata */
2858
2857
  readonly metadata?: Record<string, unknown>;
@@ -2862,7 +2861,7 @@ interface EvalTestInput {
2862
2861
  * Matches the YAML `assert` block structure.
2863
2862
  */
2864
2863
  interface EvalAssertionInput {
2865
- /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2864
+ /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
2866
2865
  readonly type: string;
2867
2866
  /** Display name */
2868
2867
  readonly name?: string;
@@ -2872,9 +2871,9 @@ interface EvalAssertionInput {
2872
2871
  readonly weight?: number;
2873
2872
  /** Whether this assertion is required to pass */
2874
2873
  readonly required?: boolean | number;
2875
- /** Prompt file for llm_judge */
2874
+ /** Prompt file for llm_grader */
2876
2875
  readonly prompt?: string;
2877
- /** Script for code_judge */
2876
+ /** Script for code_grader */
2878
2877
  readonly script?: string | readonly string[];
2879
2878
  /** Additional config passed to the assertion */
2880
2879
  readonly config?: Record<string, unknown>;
@@ -3568,17 +3567,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3568
3567
  * Convention-based discovery of custom assertion scripts.
3569
3568
  *
3570
3569
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3571
- * them as code-judge evaluators in the registry. The file name (without
3572
- * extension) becomes the evaluator type name.
3570
+ * them as code graders in the registry. The file name (without
3571
+ * extension) becomes the grader type name.
3573
3572
  *
3574
3573
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
3575
3574
  */
3576
3575
 
3577
3576
  /**
3578
3577
  * Discover custom assertion scripts from `.agentv/assertions/` and register
3579
- * them as evaluator types in the registry.
3578
+ * them as grader types in the registry.
3580
3579
  *
3581
- * @param registry - The evaluator registry to register discovered assertions into
3580
+ * @param registry - The grader registry to register discovered assertions into
3582
3581
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
3583
3582
  * @returns Names of discovered assertion types
3584
3583
  */
@@ -3609,4 +3608,4 @@ type AgentKernel = {
3609
3608
  };
3610
3609
  declare function createAgentKernel(): AgentKernel;
3611
3610
 
3612
- export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
3611
+ export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
package/dist/index.js CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  readTextFile,
20
20
  resolveFileReference,
21
21
  resolveTargetDefinition
22
- } from "./chunk-4XWPXNQM.js";
22
+ } from "./chunk-ZB3AUPES.js";
23
23
  import {
24
24
  AgentvProvider
25
25
  } from "./chunk-W5YDZWT4.js";
@@ -601,6 +601,22 @@ function extractFailOnError(suite) {
601
601
  logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
602
602
  return void 0;
603
603
  }
604
+ function extractThreshold(suite) {
605
+ const execution = suite.execution;
606
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
607
+ return void 0;
608
+ }
609
+ const executionObj = execution;
610
+ const raw = executionObj.threshold;
611
+ if (raw === void 0 || raw === null) {
612
+ return void 0;
613
+ }
614
+ if (typeof raw === "number" && raw >= 0 && raw <= 1) {
615
+ return raw;
616
+ }
617
+ logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
618
+ return void 0;
619
+ }
604
620
  function parseExecutionDefaults(raw, configPath) {
605
621
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
606
622
  return void 0;
@@ -728,6 +744,9 @@ var ANSI_RESET4 = "\x1B[0m";
728
744
  function normalizeEvaluatorType(type) {
729
745
  return type.replace(/_/g, "-");
730
746
  }
747
+ function isDeprecatedJudgeType(type) {
748
+ return type === "code-judge" || type === "llm-judge";
749
+ }
731
750
  async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
732
751
  const execution = rawEvalCase.execution;
733
752
  const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -790,6 +809,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
790
809
  const rawName = asString(rawEvaluator.name);
791
810
  const rawType = rawEvaluator.type;
792
811
  const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
812
+ if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
813
+ logWarning2(
814
+ `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
815
+ );
816
+ continue;
817
+ }
793
818
  const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
794
819
  if (typeof typeValue !== "string") {
795
820
  logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -822,7 +847,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
822
847
  });
823
848
  continue;
824
849
  }
825
- if (typeValue === "code-grader" || typeValue === "code-judge") {
850
+ if (typeValue === "code-grader") {
826
851
  let command;
827
852
  if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
828
853
  console.warn(
@@ -932,7 +957,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
932
957
  continue;
933
958
  }
934
959
  const aggregatorType = asString(rawAggregator.type);
935
- if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
960
+ const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
961
+ if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
962
+ logWarning2(
963
+ `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
964
+ );
965
+ continue;
966
+ }
967
+ if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
936
968
  logWarning2(
937
969
  `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
938
970
  );
@@ -967,7 +999,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
967
999
  continue;
968
1000
  }
969
1001
  let aggregator;
970
- if (aggregatorType === "weighted_average") {
1002
+ if (normalizedAggregatorType === "weighted_average") {
971
1003
  const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
972
1004
  const parsedWeights = {};
973
1005
  if (weights) {
@@ -981,7 +1013,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
981
1013
  type: "weighted_average",
982
1014
  ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
983
1015
  };
984
- } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
1016
+ } else if (normalizedAggregatorType === "code-grader") {
985
1017
  const aggregatorPath = asString(rawAggregator.path);
986
1018
  if (!aggregatorPath) {
987
1019
  logWarning2(
@@ -994,7 +1026,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
994
1026
  path: aggregatorPath,
995
1027
  cwd: searchRoots[0]
996
1028
  };
997
- } else if (aggregatorType === "threshold") {
1029
+ } else if (normalizedAggregatorType === "threshold") {
998
1030
  const thresholdValue = rawAggregator.threshold;
999
1031
  if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
1000
1032
  logWarning2(
@@ -1742,10 +1774,15 @@ function coerceEvaluator(candidate, contextId) {
1742
1774
  return void 0;
1743
1775
  }
1744
1776
  const normalized = normalizeEvaluatorType(candidate);
1777
+ if (isDeprecatedJudgeType(normalized)) {
1778
+ throw new Error(
1779
+ `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
1780
+ );
1781
+ }
1745
1782
  if (isEvaluatorKind(normalized)) {
1746
1783
  return normalized;
1747
1784
  }
1748
- logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
1785
+ logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
1749
1786
  return void 0;
1750
1787
  }
1751
1788
  function asString(value) {
@@ -2729,6 +2766,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2729
2766
  const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
2730
2767
  const metadata = parseMetadata(parsed);
2731
2768
  const failOnError = extractFailOnError(parsed);
2769
+ const threshold = extractThreshold(parsed);
2732
2770
  return {
2733
2771
  tests,
2734
2772
  trials: extractTrialsConfig(parsed),
@@ -2737,7 +2775,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2737
2775
  cacheConfig: extractCacheConfig(parsed),
2738
2776
  totalBudgetUsd: extractTotalBudgetUsd(parsed),
2739
2777
  ...metadata !== void 0 && { metadata },
2740
- ...failOnError !== void 0 && { failOnError }
2778
+ ...failOnError !== void 0 && { failOnError },
2779
+ ...threshold !== void 0 && { threshold }
2741
2780
  };
2742
2781
  }
2743
2782
  var loadEvalSuite = loadTestSuite;
@@ -3178,9 +3217,7 @@ function assertionToNaturalLanguage(entry) {
3178
3217
  case "ends_with":
3179
3218
  return `Output ends with '${entry.value}'`;
3180
3219
  case "llm-grader":
3181
- case "llm_grader":
3182
- case "llm-judge":
3183
- case "llm_judge": {
3220
+ case "llm_grader": {
3184
3221
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
3185
3222
  return null;
3186
3223
  }
@@ -3193,9 +3230,7 @@ function assertionToNaturalLanguage(entry) {
3193
3230
  return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
3194
3231
  }
3195
3232
  case "code-grader":
3196
- case "code_grader":
3197
- case "code-judge":
3198
- case "code_judge": {
3233
+ case "code_grader": {
3199
3234
  const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
3200
3235
  const desc = typeof entry.description === "string" ? entry.description : void 0;
3201
3236
  return codeGraderInstruction(graderName, desc);
@@ -3226,7 +3261,7 @@ function assertionToNaturalLanguage(entry) {
3226
3261
  }
3227
3262
  }
3228
3263
  function assertionToNaturalLanguageList(entry) {
3229
- if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
3264
+ if (entry.type === "llm-grader" || entry.type === "llm_grader") {
3230
3265
  if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
3231
3266
  return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
3232
3267
  }
@@ -10083,7 +10118,7 @@ function toCamelCaseDeep(obj) {
10083
10118
  // src/evaluation/evaluators/code-evaluator.ts
10084
10119
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
10085
10120
  var CodeEvaluator = class {
10086
- kind = "code-judge";
10121
+ kind = "code-grader";
10087
10122
  command;
10088
10123
  cwd;
10089
10124
  agentTimeoutMs;
@@ -10102,7 +10137,7 @@ var CodeEvaluator = class {
10102
10137
  if (outputForPayload) {
10103
10138
  const serialized = JSON.stringify(outputForPayload);
10104
10139
  if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
10105
- const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-judge-"));
10140
+ const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
10106
10141
  outputPath = join(tmpDir, "output.json");
10107
10142
  await writeFile6(outputPath, serialized);
10108
10143
  outputForPayload = null;
@@ -10360,7 +10395,7 @@ var LlmGraderEvaluator = class {
10360
10395
  return this.evaluateWithDelegatedAgent(context, graderProvider);
10361
10396
  }
10362
10397
  const config = context.evaluator;
10363
- if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
10398
+ if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
10364
10399
  return this.evaluateWithRubrics(context, graderProvider, config.rubrics);
10365
10400
  }
10366
10401
  return this.evaluateFreeform(context, graderProvider);
@@ -10545,7 +10580,7 @@ ${context.fileChanges}`;
10545
10580
  const systemPrompt = this.buildAgentSystemPrompt(context);
10546
10581
  const userPrompt = this.buildAgentUserPrompt(context);
10547
10582
  const config = context.evaluator;
10548
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10583
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10549
10584
  const fsTools = createFilesystemTools(workspacePath);
10550
10585
  const evaluatorRawRequest = {
10551
10586
  mode: "built-in",
@@ -10641,7 +10676,7 @@ ${context.fileChanges}`;
10641
10676
  };
10642
10677
  }
10643
10678
  const config = context.evaluator;
10644
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10679
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10645
10680
  const details = {
10646
10681
  mode: modeLabel,
10647
10682
  grader_target: provider.targetName
@@ -10681,7 +10716,7 @@ ${context.fileChanges}`;
10681
10716
  */
10682
10717
  buildAgentSystemPrompt(context) {
10683
10718
  const config = context.evaluator;
10684
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10719
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10685
10720
  const parts = [
10686
10721
  "You are an expert evaluator with access to the workspace filesystem.",
10687
10722
  "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -10712,7 +10747,7 @@ ${context.fileChanges}`;
10712
10747
  return substituteVariables(this.evaluatorTemplate, variables);
10713
10748
  }
10714
10749
  const config = context.evaluator;
10715
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10750
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10716
10751
  const parts = [
10717
10752
  "Evaluate the candidate answer by investigating the workspace.",
10718
10753
  "",
@@ -10755,7 +10790,7 @@ ${context.fileChanges}`;
10755
10790
  buildDelegatedPrompt(context) {
10756
10791
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10757
10792
  const config = context.evaluator;
10758
- const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
10793
+ const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
10759
10794
  if (this.evaluatorTemplate) {
10760
10795
  const variables = {
10761
10796
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
@@ -11252,10 +11287,8 @@ var CompositeEvaluator = class {
11252
11287
  const aggregator = this.config.aggregator;
11253
11288
  switch (aggregator.type) {
11254
11289
  case "code-grader":
11255
- case "code-judge":
11256
11290
  return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
11257
11291
  case "llm-grader":
11258
- case "llm-judge":
11259
11292
  return this.runLlmAggregator(results, context, aggregator);
11260
11293
  case "threshold":
11261
11294
  return this.runThreshold(results, aggregator.threshold);
@@ -13677,7 +13710,7 @@ var endsWithFactory = (config) => {
13677
13710
  };
13678
13711
  function createBuiltinRegistry() {
13679
13712
  const registry = new EvaluatorRegistry();
13680
- registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
13713
+ registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
13681
13714
  const fn = config[INLINE_ASSERT_FN];
13682
13715
  if (!fn) {
13683
13716
  throw new Error(
@@ -16395,7 +16428,7 @@ function filterEvalCases(evalCases, filter) {
16395
16428
  return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
16396
16429
  }
16397
16430
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
16398
- const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
16431
+ const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
16399
16432
  resolveGraderProvider: async (context) => {
16400
16433
  if (context.graderProvider) {
16401
16434
  return context.graderProvider;
@@ -17239,10 +17272,10 @@ var OtelTraceExporter = class {
17239
17272
  }
17240
17273
  if (result.scores) {
17241
17274
  for (const score of result.scores) {
17242
- rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
17243
- "agentv.evaluator.score": score.score,
17244
- "agentv.evaluator.type": score.type,
17245
- ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
17275
+ rootSpan.addEvent(`agentv.grader.${score.name}`, {
17276
+ "agentv.grader.score": score.score,
17277
+ "agentv.grader.type": score.type,
17278
+ ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
17246
17279
  });
17247
17280
  }
17248
17281
  }
@@ -17628,6 +17661,7 @@ export {
17628
17661
  extractTargetFromSuite,
17629
17662
  extractTargetsFromSuite,
17630
17663
  extractTargetsFromTestCase,
17664
+ extractThreshold,
17631
17665
  extractTrialsConfig,
17632
17666
  extractWorkersFromSuite,
17633
17667
  fileExists,