npm - @agentv/core - Versions diffs - 2.13.0 → 2.14.1 - Mend

@agentv/core 2.13.0 → 2.14.1

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-JHER2LQ5.js → chunk-N55K52OO.js} +15 -15
package/dist/chunk-N55K52OO.js.map +1 -0
package/dist/evaluation/validation/index.cjs +25 -24
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +12 -11
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +184 -158
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +40 -40
package/dist/index.d.ts +40 -40
package/dist/index.js +172 -146
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-JHER2LQ5.js.map +0 -1

package/dist/index.d.cts CHANGED

@@ -257,7 +257,7 @@ interface TraceComputeResult {
     readonly endTime?: string;
 }
 /**
- * Argument matching mode for tool_trajectory expected items.
+ * Argument matching mode for tool-trajectory expected items.
  * - 'exact': bidirectional deep equality, no extra keys allowed (default)
  * - 'superset': actual args must contain all expected keys (extras OK)
  * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
@@ -265,11 +265,11 @@ interface TraceComputeResult {
  */
 type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
 /**
- * Configuration for tool_trajectory evaluator.
+ * Configuration for tool-trajectory evaluator.
  */
 interface ToolTrajectoryEvaluatorConfig {
     readonly name: string;
-    readonly type: 'tool_trajectory';
+    readonly type: 'tool-trajectory';
     /** Matching mode */
     readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
     /** Minimum call counts per tool (for any_order mode) */
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 /**
- * Configuration for enabling target access in code_judge evaluators.
+ * Configuration for enabling target access in code-judge evaluators.
  * When present, the runtime will start a local proxy server that allows
  * the script to invoke configured targets without direct credential access.
  */
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'code';
+    readonly type: 'code-judge';
     readonly command: readonly string[];
     /** @deprecated Use `command` instead */
     readonly script?: readonly string[];
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
     readonly required?: boolean | number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
-    /** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
+    /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
     readonly config?: JsonObject;
     /** When present, enables target access via local proxy */
     readonly target?: TargetAccessConfig;
 };
 /**
  * Executable prompt template configuration.
- * Matches code_judge pattern for consistency.
+ * Matches code-judge pattern for consistency.
  */
 type PromptScriptConfig = {
     /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'llm_judge';
+    readonly type: 'llm-judge';
     /** Text prompt (inline or file path) or executable script config */
     readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
     /** Resolved absolute path for prompt file (used for text template prompts) */
     readonly resolvedPromptPath?: string;
-    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    /** Resolved script array for executable prompts (matches code-judge pattern) */
     readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
     readonly weights?: Record<string, number>;
 } | {
-    readonly type: 'code_judge';
+    readonly type: 'code-judge';
     readonly path: string;
     readonly cwd?: string;
 } | {
-    readonly type: 'llm_judge';
+    readonly type: 'llm-judge';
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly model?: string;
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
 };
 /**
  * Match type for field accuracy evaluation.
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
  */
 type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -682,11 +682,11 @@ type FieldConfig = {
     readonly formats?: readonly string[];
 };
 /**
- * Configuration for the field_accuracy evaluator.
+ * Configuration for the field-accuracy evaluator.
  */
 type FieldAccuracyEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'field_accuracy';
+    readonly type: 'field-accuracy';
     /** Fields to compare between candidate and expected */
     readonly fields: readonly FieldConfig[];
     /** Strategy for combining field scores (default: weighted_average) */
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
     readonly negate?: boolean;
 };
 /**
- * Configuration for the token_usage evaluator.
+ * Configuration for the token-usage evaluator.
  * Checks provider-reported token usage against configured limits.
  */
 type TokenUsageEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'token_usage';
+    readonly type: 'token-usage';
     /** Maximum allowed total tokens (input + output + cached, when present) */
     readonly max_total?: number;
     /** Maximum allowed input tokens (prompt) */
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
     readonly negate?: boolean;
 };
 /**
- * Configuration for the execution_metrics evaluator.
+ * Configuration for the execution-metrics evaluator.
  * Provides declarative threshold-based checks on execution metrics.
  * Only specified thresholds are checked; omitted ones are ignored.
  */
 type ExecutionMetricsEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'execution_metrics';
+    readonly type: 'execution-metrics';
     /** Maximum allowed number of tool calls */
     readonly max_tool_calls?: number;
     /** Maximum allowed number of LLM calls (assistant messages) */
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
     readonly negate?: boolean;
 };
 /**
- * Configuration for the agent_judge evaluator.
+ * Configuration for the agent-judge evaluator.
  * Runs an agentic investigation loop to audit workspaces and verify criteria.
  * Two modes:
  * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
  */
 type AgentJudgeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'agent_judge';
+    readonly type: 'agent-judge';
     /** Custom evaluation prompt (inline text or file path) */
     readonly prompt?: string;
     readonly promptPath?: string;
     /** Resolved absolute path for prompt file */
     readonly resolvedPromptPath?: string;
-    /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
+    /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
     readonly rubrics?: readonly RubricItem[];
     /** Maximum agent steps for built-in mode (default 10, max 50) */
     readonly max_steps?: number;
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
  */
 type ContainsAnyEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'contains_any';
+    readonly type: 'contains-any';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
  */
 type ContainsAllEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'contains_all';
+    readonly type: 'contains-all';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
  */
 type IcontainsAnyEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'icontains_any';
+    readonly type: 'icontains-any';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
  */
 type IcontainsAllEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'icontains_all';
+    readonly type: 'icontains-all';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
  */
 type StartsWithEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'starts_with';
+    readonly type: 'starts-with';
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
  */
 type EndsWithEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'ends_with';
+    readonly type: 'ends-with';
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
  */
 type IsJsonEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'is_json';
+    readonly type: 'is-json';
     readonly weight?: number;
     readonly required?: boolean | number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
@@ -1915,7 +1915,7 @@ interface CodeEvaluatorOptions {
     readonly target?: TargetAccessConfig;
 }
 declare class CodeEvaluator implements Evaluator {
-    readonly kind = "code";
+    readonly kind = "code-judge";
     private readonly command;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
@@ -1970,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
  * Score is proportional: hits.length / (hits.length + misses.length)
  */
 declare class ExecutionMetricsEvaluator implements Evaluator {
-    readonly kind = "execution_metrics";
+    readonly kind = "execution-metrics";
     private readonly config;
     constructor(options: ExecutionMetricsEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -1986,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
  */
 declare class FieldAccuracyEvaluator implements Evaluator {
-    readonly kind = "field_accuracy";
+    readonly kind = "field-accuracy";
     private readonly config;
     constructor(options: FieldAccuracyEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -2091,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
 }>;
 declare class LlmJudgeEvaluator implements Evaluator {
-    readonly kind = "llm_judge";
+    readonly kind = "llm-judge";
     private readonly resolveJudgeProvider;
     private readonly maxOutputTokens?;
     private readonly temperature?;
@@ -2138,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
     readonly judgeTargetProvider?: Provider;
 }
 declare class AgentJudgeEvaluator implements Evaluator {
-    readonly kind = "agent_judge";
+    readonly kind = "agent-judge";
     private readonly resolveJudgeProvider;
     private readonly maxSteps;
     private readonly temperature;
@@ -2200,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
  * Uses tokenUsage from the evaluation context.
  */
 declare class TokenUsageEvaluator implements Evaluator {
-    readonly kind = "token_usage";
+    readonly kind = "token-usage";
     private readonly config;
     constructor(options: TokenUsageEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -2210,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
     readonly config: ToolTrajectoryEvaluatorConfig;
 }
 declare class ToolTrajectoryEvaluator implements Evaluator {
-    readonly kind = "tool_trajectory";
+    readonly kind = "tool-trajectory";
     private readonly config;
     constructor(options: ToolTrajectoryEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -2335,7 +2335,7 @@ declare class EvaluatorRegistry {
 }
 /**
  * Adapter that wraps a synchronous assertion function as an Evaluator.
- * Used for deterministic assertions (contains, regex, is_json, equals).
+ * Used for deterministic assertions (contains, regex, is-json, equals).
  */
 declare class DeterministicAssertionEvaluator implements Evaluator {
     private readonly assertFn;
@@ -2383,7 +2383,7 @@ interface RunEvalCaseOptions {
     readonly provider: Provider;
     readonly target: ResolvedTarget;
     readonly evaluators: Partial<Record<string, Evaluator>> & {
-        readonly llm_judge: Evaluator;
+        readonly 'llm-judge': Evaluator;
     };
     readonly now?: () => Date;
     readonly maxRetries?: number;
@@ -2524,7 +2524,7 @@ interface EvalTestInput {
  * Matches the YAML `assert` block structure.
  */
 interface EvalAssertionInput {
-    /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
+    /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
     readonly type: string;
     /** Display name */
     readonly name?: string;
@@ -3152,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
  * Convention-based discovery of custom assertion scripts.
  *
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
- * them as code_judge evaluators in the registry. The file name (without
+ * them as code-judge evaluators in the registry. The file name (without
  * extension) becomes the evaluator type name.
  *
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml

package/dist/index.d.ts CHANGED

@@ -257,7 +257,7 @@ interface TraceComputeResult {
     readonly endTime?: string;
 }
 /**
- * Argument matching mode for tool_trajectory expected items.
+ * Argument matching mode for tool-trajectory expected items.
  * - 'exact': bidirectional deep equality, no extra keys allowed (default)
  * - 'superset': actual args must contain all expected keys (extras OK)
  * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
@@ -265,11 +265,11 @@ interface TraceComputeResult {
  */
 type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
 /**
- * Configuration for tool_trajectory evaluator.
+ * Configuration for tool-trajectory evaluator.
  */
 interface ToolTrajectoryEvaluatorConfig {
     readonly name: string;
-    readonly type: 'tool_trajectory';
+    readonly type: 'tool-trajectory';
     /** Matching mode */
     readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
     /** Minimum call counts per tool (for any_order mode) */
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
  */
 declare function isTestMessage(value: unknown): value is TestMessage;
-declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
+declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
 type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
 declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
 /**
- * Configuration for enabling target access in code_judge evaluators.
+ * Configuration for enabling target access in code-judge evaluators.
  * When present, the runtime will start a local proxy server that allows
  * the script to invoke configured targets without direct credential access.
  */
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
 };
 type CodeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'code';
+    readonly type: 'code-judge';
     readonly command: readonly string[];
     /** @deprecated Use `command` instead */
     readonly script?: readonly string[];
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
     readonly required?: boolean | number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
     readonly negate?: boolean;
-    /** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
+    /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
     readonly config?: JsonObject;
     /** When present, enables target access via local proxy */
     readonly target?: TargetAccessConfig;
 };
 /**
  * Executable prompt template configuration.
- * Matches code_judge pattern for consistency.
+ * Matches code-judge pattern for consistency.
  */
 type PromptScriptConfig = {
     /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
 };
 type LlmJudgeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'llm_judge';
+    readonly type: 'llm-judge';
     /** Text prompt (inline or file path) or executable script config */
     readonly prompt?: string | PromptScriptConfig;
     readonly promptPath?: string;
     /** Resolved absolute path for prompt file (used for text template prompts) */
     readonly resolvedPromptPath?: string;
-    /** Resolved script array for executable prompts (matches code_judge pattern) */
+    /** Resolved script array for executable prompts (matches code-judge pattern) */
     readonly resolvedPromptScript?: readonly string[];
     readonly rubrics?: readonly RubricItem[];
     readonly weight?: number;
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
     readonly type: 'weighted_average';
     readonly weights?: Record<string, number>;
 } | {
-    readonly type: 'code_judge';
+    readonly type: 'code-judge';
     readonly path: string;
     readonly cwd?: string;
 } | {
-    readonly type: 'llm_judge';
+    readonly type: 'llm-judge';
     readonly prompt?: string;
     readonly promptPath?: string;
     readonly model?: string;
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
 };
 /**
  * Match type for field accuracy evaluation.
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
  */
 type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -682,11 +682,11 @@ type FieldConfig = {
     readonly formats?: readonly string[];
 };
 /**
- * Configuration for the field_accuracy evaluator.
+ * Configuration for the field-accuracy evaluator.
  */
 type FieldAccuracyEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'field_accuracy';
+    readonly type: 'field-accuracy';
     /** Fields to compare between candidate and expected */
     readonly fields: readonly FieldConfig[];
     /** Strategy for combining field scores (default: weighted_average) */
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
     readonly negate?: boolean;
 };
 /**
- * Configuration for the token_usage evaluator.
+ * Configuration for the token-usage evaluator.
  * Checks provider-reported token usage against configured limits.
  */
 type TokenUsageEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'token_usage';
+    readonly type: 'token-usage';
     /** Maximum allowed total tokens (input + output + cached, when present) */
     readonly max_total?: number;
     /** Maximum allowed input tokens (prompt) */
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
     readonly negate?: boolean;
 };
 /**
- * Configuration for the execution_metrics evaluator.
+ * Configuration for the execution-metrics evaluator.
  * Provides declarative threshold-based checks on execution metrics.
  * Only specified thresholds are checked; omitted ones are ignored.
  */
 type ExecutionMetricsEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'execution_metrics';
+    readonly type: 'execution-metrics';
     /** Maximum allowed number of tool calls */
     readonly max_tool_calls?: number;
     /** Maximum allowed number of LLM calls (assistant messages) */
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
     readonly negate?: boolean;
 };
 /**
- * Configuration for the agent_judge evaluator.
+ * Configuration for the agent-judge evaluator.
  * Runs an agentic investigation loop to audit workspaces and verify criteria.
  * Two modes:
  * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
  */
 type AgentJudgeEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'agent_judge';
+    readonly type: 'agent-judge';
     /** Custom evaluation prompt (inline text or file path) */
     readonly prompt?: string;
     readonly promptPath?: string;
     /** Resolved absolute path for prompt file */
     readonly resolvedPromptPath?: string;
-    /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
+    /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
     readonly rubrics?: readonly RubricItem[];
     /** Maximum agent steps for built-in mode (default 10, max 50) */
     readonly max_steps?: number;
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
  */
 type ContainsAnyEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'contains_any';
+    readonly type: 'contains-any';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
  */
 type ContainsAllEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'contains_all';
+    readonly type: 'contains-all';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
  */
 type IcontainsAnyEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'icontains_any';
+    readonly type: 'icontains-any';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
  */
 type IcontainsAllEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'icontains_all';
+    readonly type: 'icontains-all';
     readonly value: readonly string[];
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
  */
 type StartsWithEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'starts_with';
+    readonly type: 'starts-with';
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
  */
 type EndsWithEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'ends_with';
+    readonly type: 'ends-with';
     readonly value: string;
     readonly weight?: number;
     readonly required?: boolean | number;
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
  */
 type IsJsonEvaluatorConfig = {
     readonly name: string;
-    readonly type: 'is_json';
+    readonly type: 'is-json';
     readonly weight?: number;
     readonly required?: boolean | number;
     /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
@@ -1915,7 +1915,7 @@ interface CodeEvaluatorOptions {
     readonly target?: TargetAccessConfig;
 }
 declare class CodeEvaluator implements Evaluator {
-    readonly kind = "code";
+    readonly kind = "code-judge";
     private readonly command;
     private readonly cwd?;
     private readonly agentTimeoutMs?;
@@ -1970,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
  * Score is proportional: hits.length / (hits.length + misses.length)
  */
 declare class ExecutionMetricsEvaluator implements Evaluator {
-    readonly kind = "execution_metrics";
+    readonly kind = "execution-metrics";
     private readonly config;
     constructor(options: ExecutionMetricsEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -1986,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
  */
 declare class FieldAccuracyEvaluator implements Evaluator {
-    readonly kind = "field_accuracy";
+    readonly kind = "field-accuracy";
     private readonly config;
     constructor(options: FieldAccuracyEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -2091,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
 }>;
 declare class LlmJudgeEvaluator implements Evaluator {
-    readonly kind = "llm_judge";
+    readonly kind = "llm-judge";
     private readonly resolveJudgeProvider;
     private readonly maxOutputTokens?;
     private readonly temperature?;
@@ -2138,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
     readonly judgeTargetProvider?: Provider;
 }
 declare class AgentJudgeEvaluator implements Evaluator {
-    readonly kind = "agent_judge";
+    readonly kind = "agent-judge";
     private readonly resolveJudgeProvider;
     private readonly maxSteps;
     private readonly temperature;
@@ -2200,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
  * Uses tokenUsage from the evaluation context.
  */
 declare class TokenUsageEvaluator implements Evaluator {
-    readonly kind = "token_usage";
+    readonly kind = "token-usage";
     private readonly config;
     constructor(options: TokenUsageEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -2210,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
     readonly config: ToolTrajectoryEvaluatorConfig;
 }
 declare class ToolTrajectoryEvaluator implements Evaluator {
-    readonly kind = "tool_trajectory";
+    readonly kind = "tool-trajectory";
     private readonly config;
     constructor(options: ToolTrajectoryEvaluatorOptions);
     evaluate(context: EvaluationContext): EvaluationScore;
@@ -2335,7 +2335,7 @@ declare class EvaluatorRegistry {
 }
 /**
  * Adapter that wraps a synchronous assertion function as an Evaluator.
- * Used for deterministic assertions (contains, regex, is_json, equals).
+ * Used for deterministic assertions (contains, regex, is-json, equals).
  */
 declare class DeterministicAssertionEvaluator implements Evaluator {
     private readonly assertFn;
@@ -2383,7 +2383,7 @@ interface RunEvalCaseOptions {
     readonly provider: Provider;
     readonly target: ResolvedTarget;
     readonly evaluators: Partial<Record<string, Evaluator>> & {
-        readonly llm_judge: Evaluator;
+        readonly 'llm-judge': Evaluator;
     };
     readonly now?: () => Date;
     readonly maxRetries?: number;
@@ -2524,7 +2524,7 @@ interface EvalTestInput {
  * Matches the YAML `assert` block structure.
  */
 interface EvalAssertionInput {
-    /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
+    /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
     readonly type: string;
     /** Display name */
     readonly name?: string;
@@ -3152,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
  * Convention-based discovery of custom assertion scripts.
  *
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
- * them as code_judge evaluators in the registry. The file name (without
+ * them as code-judge evaluators in the registry. The file name (without
  * extension) becomes the evaluator type name.
  *
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml