@agentv/core 2.13.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -257,7 +257,7 @@ interface TraceComputeResult {
257
257
  readonly endTime?: string;
258
258
  }
259
259
  /**
260
- * Argument matching mode for tool_trajectory expected items.
260
+ * Argument matching mode for tool-trajectory expected items.
261
261
  * - 'exact': bidirectional deep equality, no extra keys allowed (default)
262
262
  * - 'superset': actual args must contain all expected keys (extras OK)
263
263
  * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
@@ -265,11 +265,11 @@ interface TraceComputeResult {
265
265
  */
266
266
  type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
267
267
  /**
268
- * Configuration for tool_trajectory evaluator.
268
+ * Configuration for tool-trajectory evaluator.
269
269
  */
270
270
  interface ToolTrajectoryEvaluatorConfig {
271
271
  readonly name: string;
272
- readonly type: 'tool_trajectory';
272
+ readonly type: 'tool-trajectory';
273
273
  /** Matching mode */
274
274
  readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
275
275
  /** Minimum call counts per tool (for any_order mode) */
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
453
453
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
454
454
  */
455
455
  declare function isTestMessage(value: unknown): value is TestMessage;
456
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
456
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
457
457
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
458
458
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
459
459
  /**
460
- * Configuration for enabling target access in code_judge evaluators.
460
+ * Configuration for enabling target access in code-judge evaluators.
461
461
  * When present, the runtime will start a local proxy server that allows
462
462
  * the script to invoke configured targets without direct credential access.
463
463
  */
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
539
539
  };
540
540
  type CodeEvaluatorConfig = {
541
541
  readonly name: string;
542
- readonly type: 'code';
542
+ readonly type: 'code-judge';
543
543
  readonly command: readonly string[];
544
544
  /** @deprecated Use `command` instead */
545
545
  readonly script?: readonly string[];
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
550
550
  readonly required?: boolean | number;
551
551
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
552
552
  readonly negate?: boolean;
553
- /** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
553
+ /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
554
554
  readonly config?: JsonObject;
555
555
  /** When present, enables target access via local proxy */
556
556
  readonly target?: TargetAccessConfig;
557
557
  };
558
558
  /**
559
559
  * Executable prompt template configuration.
560
- * Matches code_judge pattern for consistency.
560
+ * Matches code-judge pattern for consistency.
561
561
  */
562
562
  type PromptScriptConfig = {
563
563
  /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
569
569
  };
570
570
  type LlmJudgeEvaluatorConfig = {
571
571
  readonly name: string;
572
- readonly type: 'llm_judge';
572
+ readonly type: 'llm-judge';
573
573
  /** Text prompt (inline or file path) or executable script config */
574
574
  readonly prompt?: string | PromptScriptConfig;
575
575
  readonly promptPath?: string;
576
576
  /** Resolved absolute path for prompt file (used for text template prompts) */
577
577
  readonly resolvedPromptPath?: string;
578
- /** Resolved script array for executable prompts (matches code_judge pattern) */
578
+ /** Resolved script array for executable prompts (matches code-judge pattern) */
579
579
  readonly resolvedPromptScript?: readonly string[];
580
580
  readonly rubrics?: readonly RubricItem[];
581
581
  readonly weight?: number;
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
630
630
  readonly type: 'weighted_average';
631
631
  readonly weights?: Record<string, number>;
632
632
  } | {
633
- readonly type: 'code_judge';
633
+ readonly type: 'code-judge';
634
634
  readonly path: string;
635
635
  readonly cwd?: string;
636
636
  } | {
637
- readonly type: 'llm_judge';
637
+ readonly type: 'llm-judge';
638
638
  readonly prompt?: string;
639
639
  readonly promptPath?: string;
640
640
  readonly model?: string;
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
654
654
  };
655
655
  /**
656
656
  * Match type for field accuracy evaluation.
657
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
657
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
658
658
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
659
659
  */
660
660
  type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -682,11 +682,11 @@ type FieldConfig = {
682
682
  readonly formats?: readonly string[];
683
683
  };
684
684
  /**
685
- * Configuration for the field_accuracy evaluator.
685
+ * Configuration for the field-accuracy evaluator.
686
686
  */
687
687
  type FieldAccuracyEvaluatorConfig = {
688
688
  readonly name: string;
689
- readonly type: 'field_accuracy';
689
+ readonly type: 'field-accuracy';
690
690
  /** Fields to compare between candidate and expected */
691
691
  readonly fields: readonly FieldConfig[];
692
692
  /** Strategy for combining field scores (default: weighted_average) */
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
725
725
  readonly negate?: boolean;
726
726
  };
727
727
  /**
728
- * Configuration for the token_usage evaluator.
728
+ * Configuration for the token-usage evaluator.
729
729
  * Checks provider-reported token usage against configured limits.
730
730
  */
731
731
  type TokenUsageEvaluatorConfig = {
732
732
  readonly name: string;
733
- readonly type: 'token_usage';
733
+ readonly type: 'token-usage';
734
734
  /** Maximum allowed total tokens (input + output + cached, when present) */
735
735
  readonly max_total?: number;
736
736
  /** Maximum allowed input tokens (prompt) */
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
743
743
  readonly negate?: boolean;
744
744
  };
745
745
  /**
746
- * Configuration for the execution_metrics evaluator.
746
+ * Configuration for the execution-metrics evaluator.
747
747
  * Provides declarative threshold-based checks on execution metrics.
748
748
  * Only specified thresholds are checked; omitted ones are ignored.
749
749
  */
750
750
  type ExecutionMetricsEvaluatorConfig = {
751
751
  readonly name: string;
752
- readonly type: 'execution_metrics';
752
+ readonly type: 'execution-metrics';
753
753
  /** Maximum allowed number of tool calls */
754
754
  readonly max_tool_calls?: number;
755
755
  /** Maximum allowed number of LLM calls (assistant messages) */
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
770
770
  readonly negate?: boolean;
771
771
  };
772
772
  /**
773
- * Configuration for the agent_judge evaluator.
773
+ * Configuration for the agent-judge evaluator.
774
774
  * Runs an agentic investigation loop to audit workspaces and verify criteria.
775
775
  * Two modes:
776
776
  * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
778
778
  */
779
779
  type AgentJudgeEvaluatorConfig = {
780
780
  readonly name: string;
781
- readonly type: 'agent_judge';
781
+ readonly type: 'agent-judge';
782
782
  /** Custom evaluation prompt (inline text or file path) */
783
783
  readonly prompt?: string;
784
784
  readonly promptPath?: string;
785
785
  /** Resolved absolute path for prompt file */
786
786
  readonly resolvedPromptPath?: string;
787
- /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
787
+ /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
788
788
  readonly rubrics?: readonly RubricItem[];
789
789
  /** Maximum agent steps for built-in mode (default 10, max 50) */
790
790
  readonly max_steps?: number;
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
816
816
  */
817
817
  type ContainsAnyEvaluatorConfig = {
818
818
  readonly name: string;
819
- readonly type: 'contains_any';
819
+ readonly type: 'contains-any';
820
820
  readonly value: readonly string[];
821
821
  readonly weight?: number;
822
822
  readonly required?: boolean | number;
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
829
829
  */
830
830
  type ContainsAllEvaluatorConfig = {
831
831
  readonly name: string;
832
- readonly type: 'contains_all';
832
+ readonly type: 'contains-all';
833
833
  readonly value: readonly string[];
834
834
  readonly weight?: number;
835
835
  readonly required?: boolean | number;
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
855
855
  */
856
856
  type IcontainsAnyEvaluatorConfig = {
857
857
  readonly name: string;
858
- readonly type: 'icontains_any';
858
+ readonly type: 'icontains-any';
859
859
  readonly value: readonly string[];
860
860
  readonly weight?: number;
861
861
  readonly required?: boolean | number;
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
868
868
  */
869
869
  type IcontainsAllEvaluatorConfig = {
870
870
  readonly name: string;
871
- readonly type: 'icontains_all';
871
+ readonly type: 'icontains-all';
872
872
  readonly value: readonly string[];
873
873
  readonly weight?: number;
874
874
  readonly required?: boolean | number;
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
881
881
  */
882
882
  type StartsWithEvaluatorConfig = {
883
883
  readonly name: string;
884
- readonly type: 'starts_with';
884
+ readonly type: 'starts-with';
885
885
  readonly value: string;
886
886
  readonly weight?: number;
887
887
  readonly required?: boolean | number;
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
894
894
  */
895
895
  type EndsWithEvaluatorConfig = {
896
896
  readonly name: string;
897
- readonly type: 'ends_with';
897
+ readonly type: 'ends-with';
898
898
  readonly value: string;
899
899
  readonly weight?: number;
900
900
  readonly required?: boolean | number;
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
922
922
  */
923
923
  type IsJsonEvaluatorConfig = {
924
924
  readonly name: string;
925
- readonly type: 'is_json';
925
+ readonly type: 'is-json';
926
926
  readonly weight?: number;
927
927
  readonly required?: boolean | number;
928
928
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
@@ -1915,7 +1915,7 @@ interface CodeEvaluatorOptions {
1915
1915
  readonly target?: TargetAccessConfig;
1916
1916
  }
1917
1917
  declare class CodeEvaluator implements Evaluator {
1918
- readonly kind = "code";
1918
+ readonly kind = "code-judge";
1919
1919
  private readonly command;
1920
1920
  private readonly cwd?;
1921
1921
  private readonly agentTimeoutMs?;
@@ -1970,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
1970
1970
  * Score is proportional: hits.length / (hits.length + misses.length)
1971
1971
  */
1972
1972
  declare class ExecutionMetricsEvaluator implements Evaluator {
1973
- readonly kind = "execution_metrics";
1973
+ readonly kind = "execution-metrics";
1974
1974
  private readonly config;
1975
1975
  constructor(options: ExecutionMetricsEvaluatorOptions);
1976
1976
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -1986,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
1986
1986
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
1987
1987
  */
1988
1988
  declare class FieldAccuracyEvaluator implements Evaluator {
1989
- readonly kind = "field_accuracy";
1989
+ readonly kind = "field-accuracy";
1990
1990
  private readonly config;
1991
1991
  constructor(options: FieldAccuracyEvaluatorOptions);
1992
1992
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2091,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2091
2091
  }>;
2092
2092
 
2093
2093
  declare class LlmJudgeEvaluator implements Evaluator {
2094
- readonly kind = "llm_judge";
2094
+ readonly kind = "llm-judge";
2095
2095
  private readonly resolveJudgeProvider;
2096
2096
  private readonly maxOutputTokens?;
2097
2097
  private readonly temperature?;
@@ -2138,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
2138
2138
  readonly judgeTargetProvider?: Provider;
2139
2139
  }
2140
2140
  declare class AgentJudgeEvaluator implements Evaluator {
2141
- readonly kind = "agent_judge";
2141
+ readonly kind = "agent-judge";
2142
2142
  private readonly resolveJudgeProvider;
2143
2143
  private readonly maxSteps;
2144
2144
  private readonly temperature;
@@ -2200,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
2200
2200
  * Uses tokenUsage from the evaluation context.
2201
2201
  */
2202
2202
  declare class TokenUsageEvaluator implements Evaluator {
2203
- readonly kind = "token_usage";
2203
+ readonly kind = "token-usage";
2204
2204
  private readonly config;
2205
2205
  constructor(options: TokenUsageEvaluatorOptions);
2206
2206
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2210,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
2210
2210
  readonly config: ToolTrajectoryEvaluatorConfig;
2211
2211
  }
2212
2212
  declare class ToolTrajectoryEvaluator implements Evaluator {
2213
- readonly kind = "tool_trajectory";
2213
+ readonly kind = "tool-trajectory";
2214
2214
  private readonly config;
2215
2215
  constructor(options: ToolTrajectoryEvaluatorOptions);
2216
2216
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2335,7 +2335,7 @@ declare class EvaluatorRegistry {
2335
2335
  }
2336
2336
  /**
2337
2337
  * Adapter that wraps a synchronous assertion function as an Evaluator.
2338
- * Used for deterministic assertions (contains, regex, is_json, equals).
2338
+ * Used for deterministic assertions (contains, regex, is-json, equals).
2339
2339
  */
2340
2340
  declare class DeterministicAssertionEvaluator implements Evaluator {
2341
2341
  private readonly assertFn;
@@ -2383,7 +2383,7 @@ interface RunEvalCaseOptions {
2383
2383
  readonly provider: Provider;
2384
2384
  readonly target: ResolvedTarget;
2385
2385
  readonly evaluators: Partial<Record<string, Evaluator>> & {
2386
- readonly llm_judge: Evaluator;
2386
+ readonly 'llm-judge': Evaluator;
2387
2387
  };
2388
2388
  readonly now?: () => Date;
2389
2389
  readonly maxRetries?: number;
@@ -2524,7 +2524,7 @@ interface EvalTestInput {
2524
2524
  * Matches the YAML `assert` block structure.
2525
2525
  */
2526
2526
  interface EvalAssertionInput {
2527
- /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
2527
+ /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2528
2528
  readonly type: string;
2529
2529
  /** Display name */
2530
2530
  readonly name?: string;
@@ -3152,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3152
3152
  * Convention-based discovery of custom assertion scripts.
3153
3153
  *
3154
3154
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3155
- * them as code_judge evaluators in the registry. The file name (without
3155
+ * them as code-judge evaluators in the registry. The file name (without
3156
3156
  * extension) becomes the evaluator type name.
3157
3157
  *
3158
3158
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
package/dist/index.d.ts CHANGED
@@ -257,7 +257,7 @@ interface TraceComputeResult {
257
257
  readonly endTime?: string;
258
258
  }
259
259
  /**
260
- * Argument matching mode for tool_trajectory expected items.
260
+ * Argument matching mode for tool-trajectory expected items.
261
261
  * - 'exact': bidirectional deep equality, no extra keys allowed (default)
262
262
  * - 'superset': actual args must contain all expected keys (extras OK)
263
263
  * - 'subset': actual args must be a subset of expected keys (no unexpected keys)
@@ -265,11 +265,11 @@ interface TraceComputeResult {
265
265
  */
266
266
  type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
267
267
  /**
268
- * Configuration for tool_trajectory evaluator.
268
+ * Configuration for tool-trajectory evaluator.
269
269
  */
270
270
  interface ToolTrajectoryEvaluatorConfig {
271
271
  readonly name: string;
272
- readonly type: 'tool_trajectory';
272
+ readonly type: 'tool-trajectory';
273
273
  /** Matching mode */
274
274
  readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
275
275
  /** Minimum call counts per tool (for any_order mode) */
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
453
453
  * - Either content (string or array of objects) OR tool_calls (for assistant messages)
454
454
  */
455
455
  declare function isTestMessage(value: unknown): value is TestMessage;
456
- declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage", "execution_metrics", "agent_judge", "contains", "contains_any", "contains_all", "icontains", "icontains_any", "icontains_all", "starts_with", "ends_with", "regex", "is_json", "equals", "rubrics"];
456
+ declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
457
457
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
458
458
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
459
459
  /**
460
- * Configuration for enabling target access in code_judge evaluators.
460
+ * Configuration for enabling target access in code-judge evaluators.
461
461
  * When present, the runtime will start a local proxy server that allows
462
462
  * the script to invoke configured targets without direct credential access.
463
463
  */
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
539
539
  };
540
540
  type CodeEvaluatorConfig = {
541
541
  readonly name: string;
542
- readonly type: 'code';
542
+ readonly type: 'code-judge';
543
543
  readonly command: readonly string[];
544
544
  /** @deprecated Use `command` instead */
545
545
  readonly script?: readonly string[];
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
550
550
  readonly required?: boolean | number;
551
551
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
552
552
  readonly negate?: boolean;
553
- /** Pass-through configuration for the code_judge (any unrecognized YAML properties) */
553
+ /** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
554
554
  readonly config?: JsonObject;
555
555
  /** When present, enables target access via local proxy */
556
556
  readonly target?: TargetAccessConfig;
557
557
  };
558
558
  /**
559
559
  * Executable prompt template configuration.
560
- * Matches code_judge pattern for consistency.
560
+ * Matches code-judge pattern for consistency.
561
561
  */
562
562
  type PromptScriptConfig = {
563
563
  /** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
569
569
  };
570
570
  type LlmJudgeEvaluatorConfig = {
571
571
  readonly name: string;
572
- readonly type: 'llm_judge';
572
+ readonly type: 'llm-judge';
573
573
  /** Text prompt (inline or file path) or executable script config */
574
574
  readonly prompt?: string | PromptScriptConfig;
575
575
  readonly promptPath?: string;
576
576
  /** Resolved absolute path for prompt file (used for text template prompts) */
577
577
  readonly resolvedPromptPath?: string;
578
- /** Resolved script array for executable prompts (matches code_judge pattern) */
578
+ /** Resolved script array for executable prompts (matches code-judge pattern) */
579
579
  readonly resolvedPromptScript?: readonly string[];
580
580
  readonly rubrics?: readonly RubricItem[];
581
581
  readonly weight?: number;
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
630
630
  readonly type: 'weighted_average';
631
631
  readonly weights?: Record<string, number>;
632
632
  } | {
633
- readonly type: 'code_judge';
633
+ readonly type: 'code-judge';
634
634
  readonly path: string;
635
635
  readonly cwd?: string;
636
636
  } | {
637
- readonly type: 'llm_judge';
637
+ readonly type: 'llm-judge';
638
638
  readonly prompt?: string;
639
639
  readonly promptPath?: string;
640
640
  readonly model?: string;
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
654
654
  };
655
655
  /**
656
656
  * Match type for field accuracy evaluation.
657
- * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
657
+ * Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
658
658
  * See examples/features/document-extraction/fuzzy_match.ts for an example.
659
659
  */
660
660
  type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
@@ -682,11 +682,11 @@ type FieldConfig = {
682
682
  readonly formats?: readonly string[];
683
683
  };
684
684
  /**
685
- * Configuration for the field_accuracy evaluator.
685
+ * Configuration for the field-accuracy evaluator.
686
686
  */
687
687
  type FieldAccuracyEvaluatorConfig = {
688
688
  readonly name: string;
689
- readonly type: 'field_accuracy';
689
+ readonly type: 'field-accuracy';
690
690
  /** Fields to compare between candidate and expected */
691
691
  readonly fields: readonly FieldConfig[];
692
692
  /** Strategy for combining field scores (default: weighted_average) */
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
725
725
  readonly negate?: boolean;
726
726
  };
727
727
  /**
728
- * Configuration for the token_usage evaluator.
728
+ * Configuration for the token-usage evaluator.
729
729
  * Checks provider-reported token usage against configured limits.
730
730
  */
731
731
  type TokenUsageEvaluatorConfig = {
732
732
  readonly name: string;
733
- readonly type: 'token_usage';
733
+ readonly type: 'token-usage';
734
734
  /** Maximum allowed total tokens (input + output + cached, when present) */
735
735
  readonly max_total?: number;
736
736
  /** Maximum allowed input tokens (prompt) */
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
743
743
  readonly negate?: boolean;
744
744
  };
745
745
  /**
746
- * Configuration for the execution_metrics evaluator.
746
+ * Configuration for the execution-metrics evaluator.
747
747
  * Provides declarative threshold-based checks on execution metrics.
748
748
  * Only specified thresholds are checked; omitted ones are ignored.
749
749
  */
750
750
  type ExecutionMetricsEvaluatorConfig = {
751
751
  readonly name: string;
752
- readonly type: 'execution_metrics';
752
+ readonly type: 'execution-metrics';
753
753
  /** Maximum allowed number of tool calls */
754
754
  readonly max_tool_calls?: number;
755
755
  /** Maximum allowed number of LLM calls (assistant messages) */
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
770
770
  readonly negate?: boolean;
771
771
  };
772
772
  /**
773
- * Configuration for the agent_judge evaluator.
773
+ * Configuration for the agent-judge evaluator.
774
774
  * Runs an agentic investigation loop to audit workspaces and verify criteria.
775
775
  * Two modes:
776
776
  * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
778
778
  */
779
779
  type AgentJudgeEvaluatorConfig = {
780
780
  readonly name: string;
781
- readonly type: 'agent_judge';
781
+ readonly type: 'agent-judge';
782
782
  /** Custom evaluation prompt (inline text or file path) */
783
783
  readonly prompt?: string;
784
784
  readonly promptPath?: string;
785
785
  /** Resolved absolute path for prompt file */
786
786
  readonly resolvedPromptPath?: string;
787
- /** Rubric items for structured evaluation (reuses llm_judge rubric infra) */
787
+ /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
788
788
  readonly rubrics?: readonly RubricItem[];
789
789
  /** Maximum agent steps for built-in mode (default 10, max 50) */
790
790
  readonly max_steps?: number;
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
816
816
  */
817
817
  type ContainsAnyEvaluatorConfig = {
818
818
  readonly name: string;
819
- readonly type: 'contains_any';
819
+ readonly type: 'contains-any';
820
820
  readonly value: readonly string[];
821
821
  readonly weight?: number;
822
822
  readonly required?: boolean | number;
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
829
829
  */
830
830
  type ContainsAllEvaluatorConfig = {
831
831
  readonly name: string;
832
- readonly type: 'contains_all';
832
+ readonly type: 'contains-all';
833
833
  readonly value: readonly string[];
834
834
  readonly weight?: number;
835
835
  readonly required?: boolean | number;
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
855
855
  */
856
856
  type IcontainsAnyEvaluatorConfig = {
857
857
  readonly name: string;
858
- readonly type: 'icontains_any';
858
+ readonly type: 'icontains-any';
859
859
  readonly value: readonly string[];
860
860
  readonly weight?: number;
861
861
  readonly required?: boolean | number;
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
868
868
  */
869
869
  type IcontainsAllEvaluatorConfig = {
870
870
  readonly name: string;
871
- readonly type: 'icontains_all';
871
+ readonly type: 'icontains-all';
872
872
  readonly value: readonly string[];
873
873
  readonly weight?: number;
874
874
  readonly required?: boolean | number;
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
881
881
  */
882
882
  type StartsWithEvaluatorConfig = {
883
883
  readonly name: string;
884
- readonly type: 'starts_with';
884
+ readonly type: 'starts-with';
885
885
  readonly value: string;
886
886
  readonly weight?: number;
887
887
  readonly required?: boolean | number;
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
894
894
  */
895
895
  type EndsWithEvaluatorConfig = {
896
896
  readonly name: string;
897
- readonly type: 'ends_with';
897
+ readonly type: 'ends-with';
898
898
  readonly value: string;
899
899
  readonly weight?: number;
900
900
  readonly required?: boolean | number;
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
922
922
  */
923
923
  type IsJsonEvaluatorConfig = {
924
924
  readonly name: string;
925
- readonly type: 'is_json';
925
+ readonly type: 'is-json';
926
926
  readonly weight?: number;
927
927
  readonly required?: boolean | number;
928
928
  /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
@@ -1915,7 +1915,7 @@ interface CodeEvaluatorOptions {
1915
1915
  readonly target?: TargetAccessConfig;
1916
1916
  }
1917
1917
  declare class CodeEvaluator implements Evaluator {
1918
- readonly kind = "code";
1918
+ readonly kind = "code-judge";
1919
1919
  private readonly command;
1920
1920
  private readonly cwd?;
1921
1921
  private readonly agentTimeoutMs?;
@@ -1970,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
1970
1970
  * Score is proportional: hits.length / (hits.length + misses.length)
1971
1971
  */
1972
1972
  declare class ExecutionMetricsEvaluator implements Evaluator {
1973
- readonly kind = "execution_metrics";
1973
+ readonly kind = "execution-metrics";
1974
1974
  private readonly config;
1975
1975
  constructor(options: ExecutionMetricsEvaluatorOptions);
1976
1976
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -1986,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
1986
1986
  * with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
1987
1987
  */
1988
1988
  declare class FieldAccuracyEvaluator implements Evaluator {
1989
- readonly kind = "field_accuracy";
1989
+ readonly kind = "field-accuracy";
1990
1990
  private readonly config;
1991
1991
  constructor(options: FieldAccuracyEvaluatorOptions);
1992
1992
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2091,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
2091
2091
  }>;
2092
2092
 
2093
2093
  declare class LlmJudgeEvaluator implements Evaluator {
2094
- readonly kind = "llm_judge";
2094
+ readonly kind = "llm-judge";
2095
2095
  private readonly resolveJudgeProvider;
2096
2096
  private readonly maxOutputTokens?;
2097
2097
  private readonly temperature?;
@@ -2138,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
2138
2138
  readonly judgeTargetProvider?: Provider;
2139
2139
  }
2140
2140
  declare class AgentJudgeEvaluator implements Evaluator {
2141
- readonly kind = "agent_judge";
2141
+ readonly kind = "agent-judge";
2142
2142
  private readonly resolveJudgeProvider;
2143
2143
  private readonly maxSteps;
2144
2144
  private readonly temperature;
@@ -2200,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
2200
2200
  * Uses tokenUsage from the evaluation context.
2201
2201
  */
2202
2202
  declare class TokenUsageEvaluator implements Evaluator {
2203
- readonly kind = "token_usage";
2203
+ readonly kind = "token-usage";
2204
2204
  private readonly config;
2205
2205
  constructor(options: TokenUsageEvaluatorOptions);
2206
2206
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2210,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
2210
2210
  readonly config: ToolTrajectoryEvaluatorConfig;
2211
2211
  }
2212
2212
  declare class ToolTrajectoryEvaluator implements Evaluator {
2213
- readonly kind = "tool_trajectory";
2213
+ readonly kind = "tool-trajectory";
2214
2214
  private readonly config;
2215
2215
  constructor(options: ToolTrajectoryEvaluatorOptions);
2216
2216
  evaluate(context: EvaluationContext): EvaluationScore;
@@ -2335,7 +2335,7 @@ declare class EvaluatorRegistry {
2335
2335
  }
2336
2336
  /**
2337
2337
  * Adapter that wraps a synchronous assertion function as an Evaluator.
2338
- * Used for deterministic assertions (contains, regex, is_json, equals).
2338
+ * Used for deterministic assertions (contains, regex, is-json, equals).
2339
2339
  */
2340
2340
  declare class DeterministicAssertionEvaluator implements Evaluator {
2341
2341
  private readonly assertFn;
@@ -2383,7 +2383,7 @@ interface RunEvalCaseOptions {
2383
2383
  readonly provider: Provider;
2384
2384
  readonly target: ResolvedTarget;
2385
2385
  readonly evaluators: Partial<Record<string, Evaluator>> & {
2386
- readonly llm_judge: Evaluator;
2386
+ readonly 'llm-judge': Evaluator;
2387
2387
  };
2388
2388
  readonly now?: () => Date;
2389
2389
  readonly maxRetries?: number;
@@ -2524,7 +2524,7 @@ interface EvalTestInput {
2524
2524
  * Matches the YAML `assert` block structure.
2525
2525
  */
2526
2526
  interface EvalAssertionInput {
2527
- /** Assertion type (e.g., 'contains', 'llm_judge', 'code_judge') */
2527
+ /** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
2528
2528
  readonly type: string;
2529
2529
  /** Display name */
2530
2530
  readonly name?: string;
@@ -3152,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
3152
3152
  * Convention-based discovery of custom assertion scripts.
3153
3153
  *
3154
3154
  * Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
3155
- * them as code_judge evaluators in the registry. The file name (without
3155
+ * them as code-judge evaluators in the registry. The file name (without
3156
3156
  * extension) becomes the evaluator type name.
3157
3157
  *
3158
3158
  * Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml