npm - @agentv/core - Versions diffs - 3.13.0 → 3.13.2 - Mend

@agentv/core 3.13.0 → 3.13.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/dist/{chunk-4XWPXNQM.js → chunk-ZB3AUPES.js} +1 -3
package/dist/chunk-ZB3AUPES.js.map +1 -0
package/dist/evaluation/validation/index.cjs +0 -2
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +65 -32
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +21 -22
package/dist/index.d.ts +21 -22
package/dist/index.js +65 -31
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-4XWPXNQM.js.map +0 -1

package/dist/index.d.cts CHANGED

@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 /**
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'code-judge' | 'code-grader';
+    readonly type: 'code-grader';
     readonly command: readonly string[];
     /** @deprecated Use `command` instead */
     readonly script?: readonly string[];
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
 };
 type LlmGraderEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'llm-grader' | 'llm-judge';
+    readonly type: 'llm-grader';
     /** Text prompt (inline or file path) or executable script config */
     readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
     readonly type: 'code-grader';
     readonly path: string;
     readonly cwd?: string;
-} | {
-    readonly type: 'code-judge';
-    readonly path: string;
-    readonly cwd?: string;
 } | {
     readonly type: 'llm-grader';
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly model?: string;
-} | {
-    readonly type: 'llm-judge';
-    readonly prompt?: string;
-    readonly promptPath?: string;
-    readonly model?: string;
 } | {
     readonly type: 'threshold';
     readonly threshold: number;
@@ -1310,6 +1301,12 @@ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
  * Returns undefined when not specified.
  */
 declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
+/**
+ * Extract `execution.threshold` from parsed eval suite.
+ * Accepts a number in [0, 1] range.
+ * Returns undefined when not specified.
+ */
+declare function extractThreshold(suite: JsonObject): number | undefined;
 /**
  * Formatting mode for segment content.
@@ -1372,6 +1369,8 @@ type EvalSuiteResult = {
     readonly totalBudgetUsd?: number;
     /** Execution error tolerance: true or false */
     readonly failOnError?: FailOnError;
+    /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
+    readonly threshold?: number;
 };
 /**
  * Load tests and suite metadata from a single parse.
@@ -2117,7 +2116,7 @@ interface CodeEvaluatorOptions {
     readonly target?: TargetAccessConfig;
 }
 declare class CodeEvaluator implements Evaluator {
-    readonly kind = "code-judge";
+    readonly kind = "code-grader";
     private readonly command;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
@@ -2852,7 +2851,7 @@ interface EvalTestInput {
     readonly expectedOutput?: string;
     /** @deprecated Use `expectedOutput` instead */
     readonly expected_output?: string;
-    /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
+    /** Assertion graders — accepts factory functions, config objects, or inline functions */
     readonly assert?: readonly AssertEntry[];
     /** Arbitrary metadata */
     readonly metadata?: Record<string, unknown>;
@@ -2862,7 +2861,7 @@ interface EvalTestInput {
  * Matches the YAML `assert` block structure.
  */
 interface EvalAssertionInput {
-    /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
+    /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
     readonly type: string;
     /** Display name */
     readonly name?: string;
@@ -2872,9 +2871,9 @@ interface EvalAssertionInput {
     readonly weight?: number;
     /** Whether this assertion is required to pass */
     readonly required?: boolean | number;
-    /** Prompt file for llm_judge */
+    /** Prompt file for llm_grader */
     readonly prompt?: string;
-    /** Script for code_judge */
+    /** Script for code_grader */
     readonly script?: string | readonly string[];
     /** Additional config passed to the assertion */
     readonly config?: Record<string, unknown>;
@@ -3568,17 +3567,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
  * Convention-based discovery of custom assertion scripts.
  *
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
- * them as code-judge evaluators in the registry. The file name (without
- * extension) becomes the evaluator type name.
+ * them as code graders in the registry. The file name (without
+ * extension) becomes the grader type name.
  *
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
  */
 /**
  * Discover custom assertion scripts from `.agentv/assertions/` and register
- * them as evaluator types in the registry.
+ * them as grader types in the registry.
  *
- * @param registry - The evaluator registry to register discovered assertions into
+ * @param registry - The grader registry to register discovered assertions into
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
  * @returns Names of discovered assertion types
  */
@@ -3609,4 +3608,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, 
loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };

package/dist/index.d.ts CHANGED

@@ -473,7 +473,7 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 /**
@@ -576,7 +576,7 @@ type WorkspaceConfig = {
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'code-judge' | 'code-grader';
+    readonly type: 'code-grader';
     readonly command: readonly string[];
     /** @deprecated Use `command` instead */
     readonly script?: readonly string[];
@@ -606,7 +606,7 @@ type PromptScriptConfig = {
 };
 type LlmGraderEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'llm-grader' | 'llm-judge';
+    readonly type: 'llm-grader';
     /** Text prompt (inline or file path) or executable script config */
     readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
@@ -678,20 +678,11 @@ type CompositeAggregatorConfig = {
     readonly type: 'code-grader';
     readonly path: string;
     readonly cwd?: string;
-} | {
-    readonly type: 'code-judge';
-    readonly path: string;
-    readonly cwd?: string;
 } | {
     readonly type: 'llm-grader';
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly model?: string;
-} | {
-    readonly type: 'llm-judge';
-    readonly prompt?: string;
-    readonly promptPath?: string;
-    readonly model?: string;
 } | {
     readonly type: 'threshold';
     readonly threshold: number;
@@ -1310,6 +1301,12 @@ declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
  * Returns undefined when not specified.
  */
 declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
+/**
+ * Extract `execution.threshold` from parsed eval suite.
+ * Accepts a number in [0, 1] range.
+ * Returns undefined when not specified.
+ */
+declare function extractThreshold(suite: JsonObject): number | undefined;
 /**
  * Formatting mode for segment content.
@@ -1372,6 +1369,8 @@ type EvalSuiteResult = {
     readonly totalBudgetUsd?: number;
     /** Execution error tolerance: true or false */
     readonly failOnError?: FailOnError;
+    /** Suite-level quality threshold (0-1) — suite fails if mean score is below */
+    readonly threshold?: number;
 };
 /**
  * Load tests and suite metadata from a single parse.
@@ -2117,7 +2116,7 @@ interface CodeEvaluatorOptions {
     readonly target?: TargetAccessConfig;
 }
 declare class CodeEvaluator implements Evaluator {
-    readonly kind = "code-judge";
+    readonly kind = "code-grader";
     private readonly command;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
@@ -2852,7 +2851,7 @@ interface EvalTestInput {
     readonly expectedOutput?: string;
     /** @deprecated Use `expectedOutput` instead */
     readonly expected_output?: string;
-    /** Assertion evaluators — accepts factory functions, config objects, or inline functions */
+    /** Assertion graders — accepts factory functions, config objects, or inline functions */
     readonly assert?: readonly AssertEntry[];
     /** Arbitrary metadata */
     readonly metadata?: Record<string, unknown>;
@@ -2862,7 +2861,7 @@ interface EvalTestInput {
  * Matches the YAML `assert` block structure.
  */
 interface EvalAssertionInput {
-    /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
+    /** Assertion type (e.g., 'contains', 'llm-grader', 'code-grader') */
     readonly type: string;
     /** Display name */
     readonly name?: string;
@@ -2872,9 +2871,9 @@ interface EvalAssertionInput {
     readonly weight?: number;
     /** Whether this assertion is required to pass */
     readonly required?: boolean | number;
-    /** Prompt file for llm_judge */
+    /** Prompt file for llm_grader */
     readonly prompt?: string;
-    /** Script for code_judge */
+    /** Script for code_grader */
     readonly script?: string | readonly string[];
     /** Additional config passed to the assertion */
     readonly config?: Record<string, unknown>;
@@ -3568,17 +3567,17 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
  * Convention-based discovery of custom assertion scripts.
  *
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
- * them as code-judge evaluators in the registry. The file name (without
- * extension) becomes the evaluator type name.
+ * them as code graders in the registry. The file name (without
+ * extension) becomes the grader type name.
  *
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
  */
 /**
  * Discover custom assertion scripts from `.agentv/assertions/` and register
- * them as evaluator types in the registry.
+ * them as grader types in the registry.
  *
- * @param registry - The evaluator registry to register discovered assertions into
+ * @param registry - The grader registry to register discovered assertions into
  * @param baseDir - The base directory to search from (typically project root or eval file dir)
  * @returns Names of discovered assertion types
  */
@@ -3609,4 +3608,4 @@ type AgentKernel = {
 };
 declare function createAgentKernel(): AgentKernel;
-export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, 
loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
+export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionEntry, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type 
FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type LocalPathValidationError, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OpenAIResolvedConfig, type OpenRouterResolvedConfig, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiCliResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, 
TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractThreshold, extractTrialsConfig, extractWorkersFromSuite, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, 
loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };

package/dist/index.js CHANGED Viewed

@@ -19,7 +19,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-4XWPXNQM.js";
+} from "./chunk-ZB3AUPES.js";
 import {
   AgentvProvider
 } from "./chunk-W5YDZWT4.js";
@@ -601,6 +601,22 @@ function extractFailOnError(suite) {
   logWarning(`Invalid execution.fail_on_error: ${raw}. Must be true or false. Ignoring.`);
   return void 0;
 }
+function extractThreshold(suite) {
+  const execution = suite.execution;
+  if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
+    return void 0;
+  }
+  const executionObj = execution;
+  const raw = executionObj.threshold;
+  if (raw === void 0 || raw === null) {
+    return void 0;
+  }
+  if (typeof raw === "number" && raw >= 0 && raw <= 1) {
+    return raw;
+  }
+  logWarning(`Invalid execution.threshold: ${raw}. Must be a number between 0 and 1. Ignoring.`);
+  return void 0;
+}
 function parseExecutionDefaults(raw, configPath) {
   if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
     return void 0;
@@ -728,6 +744,9 @@ var ANSI_RESET4 = "\x1B[0m";
 function normalizeEvaluatorType(type) {
   return type.replace(/_/g, "-");
 }
+function isDeprecatedJudgeType(type) {
+  return type === "code-judge" || type === "llm-judge";
+}
 async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
   const execution = rawEvalCase.execution;
   const executionObject = isJsonObject2(execution) ? execution : void 0;
@@ -790,6 +809,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
     const rawName = asString(rawEvaluator.name);
     const rawType = rawEvaluator.type;
     const typeValue = typeof rawType === "string" ? normalizeEvaluatorType(rawType) : rawType;
+    if (typeof typeValue === "string" && isDeprecatedJudgeType(typeValue)) {
+      logWarning2(
+        `Skipping evaluator '${rawName ?? "<unnamed>"}' in '${evalId}': '${rawType}' is deprecated. Use '${typeValue.replace("-judge", "-grader")}' instead`
+      );
+      continue;
+    }
     const isCustomType = typeof typeValue === "string" && !isEvaluatorKind(typeValue);
     if (typeof typeValue !== "string") {
       logWarning2(`Skipping evaluator with invalid type in '${evalId}'`);
@@ -822,7 +847,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
       });
       continue;
     }
-    if (typeValue === "code-grader" || typeValue === "code-judge") {
+    if (typeValue === "code-grader") {
       let command;
       if (rawEvaluator.script !== void 0 && rawEvaluator.command === void 0) {
         console.warn(
@@ -932,7 +957,14 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       const aggregatorType = asString(rawAggregator.type);
-      if (aggregatorType !== "weighted_average" && aggregatorType !== "code-grader" && aggregatorType !== "code-judge" && aggregatorType !== "llm-grader" && aggregatorType !== "llm-judge" && aggregatorType !== "threshold") {
+      const normalizedAggregatorType = typeof aggregatorType === "string" ? aggregatorType === "weighted_average" || aggregatorType === "threshold" ? aggregatorType : normalizeEvaluatorType(aggregatorType) : aggregatorType;
+      if (typeof normalizedAggregatorType === "string" && isDeprecatedJudgeType(normalizedAggregatorType)) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': aggregator type '${aggregatorType}' is deprecated. Use '${normalizedAggregatorType.replace("-judge", "-grader")}' instead`
+        );
+        continue;
+      }
+      if (normalizedAggregatorType !== "weighted_average" && normalizedAggregatorType !== "code-grader" && normalizedAggregatorType !== "llm-grader" && normalizedAggregatorType !== "threshold") {
         logWarning2(
           `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
         );
@@ -967,7 +999,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
         continue;
       }
       let aggregator;
-      if (aggregatorType === "weighted_average") {
+      if (normalizedAggregatorType === "weighted_average") {
         const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
         const parsedWeights = {};
         if (weights) {
@@ -981,7 +1013,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
           type: "weighted_average",
           ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
         };
-      } else if (aggregatorType === "code-grader" || aggregatorType === "code-judge") {
+      } else if (normalizedAggregatorType === "code-grader") {
         const aggregatorPath = asString(rawAggregator.path);
         if (!aggregatorPath) {
           logWarning2(
@@ -994,7 +1026,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
           path: aggregatorPath,
           cwd: searchRoots[0]
         };
-      } else if (aggregatorType === "threshold") {
+      } else if (normalizedAggregatorType === "threshold") {
         const thresholdValue = rawAggregator.threshold;
         if (typeof thresholdValue !== "number" || thresholdValue < 0 || thresholdValue > 1) {
           logWarning2(
@@ -1742,10 +1774,15 @@ function coerceEvaluator(candidate, contextId) {
     return void 0;
   }
   const normalized = normalizeEvaluatorType(candidate);
+  if (isDeprecatedJudgeType(normalized)) {
+    throw new Error(
+      `Unsupported grader '${candidate}' in ${contextId}. Use '${normalized.replace("-judge", "-grader")}' instead.`
+    );
+  }
   if (isEvaluatorKind(normalized)) {
     return normalized;
   }
-  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
+  logWarning2(`Unknown grader '${candidate}' in ${contextId}, falling back to default`);
   return void 0;
 }
 function asString(value) {
@@ -2729,6 +2766,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
   const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options);
   const metadata = parseMetadata(parsed);
   const failOnError = extractFailOnError(parsed);
+  const threshold = extractThreshold(parsed);
   return {
     tests,
     trials: extractTrialsConfig(parsed),
@@ -2737,7 +2775,8 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
     cacheConfig: extractCacheConfig(parsed),
     totalBudgetUsd: extractTotalBudgetUsd(parsed),
     ...metadata !== void 0 && { metadata },
-    ...failOnError !== void 0 && { failOnError }
+    ...failOnError !== void 0 && { failOnError },
+    ...threshold !== void 0 && { threshold }
   };
 }
 var loadEvalSuite = loadTestSuite;
@@ -3178,9 +3217,7 @@ function assertionToNaturalLanguage(entry) {
     case "ends_with":
       return `Output ends with '${entry.value}'`;
     case "llm-grader":
-    case "llm_grader":
-    case "llm-judge":
-    case "llm_judge": {
+    case "llm_grader": {
       if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
         return null;
       }
@@ -3193,9 +3230,7 @@ function assertionToNaturalLanguage(entry) {
       return tools ? `Agent called tools in order: ${tools}` : "Agent followed expected tool trajectory";
     }
     case "code-grader":
-    case "code_grader":
-    case "code-judge":
-    case "code_judge": {
+    case "code_grader": {
       const graderName = entry.name ?? deriveGraderNameFromCommand(entry.command) ?? "code-grader";
       const desc = typeof entry.description === "string" ? entry.description : void 0;
       return codeGraderInstruction(graderName, desc);
@@ -3226,7 +3261,7 @@ function assertionToNaturalLanguage(entry) {
   }
 }
 function assertionToNaturalLanguageList(entry) {
-  if (entry.type === "llm-grader" || entry.type === "llm_grader" || entry.type === "llm-judge" || entry.type === "llm_judge") {
+  if (entry.type === "llm-grader" || entry.type === "llm_grader") {
     if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) {
       return entry.rubrics.map((r) => r.outcome ?? r.criteria ?? r.id).filter((s) => typeof s === "string");
     }
@@ -10083,7 +10118,7 @@ function toCamelCaseDeep(obj) {
 // src/evaluation/evaluators/code-evaluator.ts
 var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
 var CodeEvaluator = class {
-  kind = "code-judge";
+  kind = "code-grader";
   command;
   cwd;
   agentTimeoutMs;
@@ -10102,7 +10137,7 @@ var CodeEvaluator = class {
     if (outputForPayload) {
       const serialized = JSON.stringify(outputForPayload);
       if (serialized.length > FILE_BACKED_OUTPUT_THRESHOLD) {
-        const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-judge-"));
+        const tmpDir = await mkdtemp2(join(tmpdir2(), "agentv-grader-"));
         outputPath = join(tmpDir, "output.json");
         await writeFile6(outputPath, serialized);
         outputForPayload = null;
@@ -10360,7 +10395,7 @@ var LlmGraderEvaluator = class {
       return this.evaluateWithDelegatedAgent(context, graderProvider);
     }
     const config = context.evaluator;
-    if ((config?.type === "llm-grader" || config?.type === "llm-judge") && config.rubrics && config.rubrics.length > 0) {
+    if (config?.type === "llm-grader" && config.rubrics && config.rubrics.length > 0) {
       return this.evaluateWithRubrics(context, graderProvider, config.rubrics);
     }
     return this.evaluateFreeform(context, graderProvider);
@@ -10545,7 +10580,7 @@ ${context.fileChanges}`;
     const systemPrompt = this.buildAgentSystemPrompt(context);
     const userPrompt = this.buildAgentUserPrompt(context);
     const config = context.evaluator;
-    const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
+    const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
     const fsTools = createFilesystemTools(workspacePath);
     const evaluatorRawRequest = {
       mode: "built-in",
@@ -10641,7 +10676,7 @@ ${context.fileChanges}`;
         };
       }
       const config = context.evaluator;
-      const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
+      const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
       const details = {
         mode: modeLabel,
         grader_target: provider.targetName
@@ -10681,7 +10716,7 @@ ${context.fileChanges}`;
    */
   buildAgentSystemPrompt(context) {
     const config = context.evaluator;
-    const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
+    const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
     const parts = [
       "You are an expert evaluator with access to the workspace filesystem.",
       "Use the provided tools to investigate the workspace and verify the criteria are met.",
@@ -10712,7 +10747,7 @@ ${context.fileChanges}`;
       return substituteVariables(this.evaluatorTemplate, variables);
     }
     const config = context.evaluator;
-    const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
+    const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
     const parts = [
       "Evaluate the candidate answer by investigating the workspace.",
       "",
@@ -10755,7 +10790,7 @@ ${context.fileChanges}`;
   buildDelegatedPrompt(context) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const config = context.evaluator;
-    const rubrics = config?.type === "llm-grader" || config?.type === "llm-judge" ? config.rubrics : void 0;
+    const rubrics = config?.type === "llm-grader" ? config.rubrics : void 0;
     if (this.evaluatorTemplate) {
       const variables = {
         [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
@@ -11252,10 +11287,8 @@ var CompositeEvaluator = class {
     const aggregator = this.config.aggregator;
     switch (aggregator.type) {
       case "code-grader":
-      case "code-judge":
         return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
       case "llm-grader":
-      case "llm-judge":
         return this.runLlmAggregator(results, context, aggregator);
       case "threshold":
         return this.runThreshold(results, aggregator.threshold);
@@ -13677,7 +13710,7 @@ var endsWithFactory = (config) => {
 };
 function createBuiltinRegistry() {
   const registry = new EvaluatorRegistry();
-  registry.register("llm-grader", llmGraderFactory).register("llm-judge", llmGraderFactory).register("code-grader", codeFactory).register("code-judge", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
+  registry.register("llm-grader", llmGraderFactory).register("code-grader", codeFactory).register("composite", compositeFactory).register("tool-trajectory", toolTrajectoryFactory).register("field-accuracy", fieldAccuracyFactory).register("latency", latencyFactory).register("cost", costFactory).register("token-usage", tokenUsageFactory).register("execution-metrics", executionMetricsFactory).register("skill-trigger", skillTriggerFactory).register("contains", containsFactory).register("contains-any", containsAnyFactory).register("contains-all", containsAllFactory).register("icontains", icontainsFactory).register("icontains-any", icontainsAnyFactory).register("icontains-all", icontainsAllFactory).register("starts-with", startsWithFactory).register("ends-with", endsWithFactory).register("regex", regexFactory).register("is-json", isJsonFactory).register("equals", equalsFactory).register("inline-assert", (config) => {
     const fn = config[INLINE_ASSERT_FN];
     if (!fn) {
       throw new Error(
@@ -16395,7 +16428,7 @@ function filterEvalCases(evalCases, filter) {
   return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter));
 }
 function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
-  const llmGrader = overrides?.["llm-grader"] ?? overrides?.["llm-judge"] ?? new LlmGraderEvaluator({
+  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
     resolveGraderProvider: async (context) => {
       if (context.graderProvider) {
         return context.graderProvider;
@@ -17239,10 +17272,10 @@ var OtelTraceExporter = class {
         }
         if (result.scores) {
           for (const score of result.scores) {
-            rootSpan.addEvent(`agentv.evaluator.${score.name}`, {
-              "agentv.evaluator.score": score.score,
-              "agentv.evaluator.type": score.type,
-              ...score.verdict ? { "agentv.evaluator.verdict": score.verdict } : {}
+            rootSpan.addEvent(`agentv.grader.${score.name}`, {
+              "agentv.grader.score": score.score,
+              "agentv.grader.type": score.type,
+              ...score.verdict ? { "agentv.grader.verdict": score.verdict } : {}
             });
           }
         }
@@ -17628,6 +17661,7 @@ export {
   extractTargetFromSuite,
   extractTargetsFromSuite,
   extractTargetsFromTestCase,
+  extractThreshold,
   extractTrialsConfig,
   extractWorkersFromSuite,
   fileExists,