@agentv/core 2.13.0 → 2.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-JHER2LQ5.js → chunk-N55K52OO.js} +15 -15
- package/dist/chunk-N55K52OO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +25 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +12 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +184 -158
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +40 -40
- package/dist/index.d.ts +40 -40
- package/dist/index.js +172 -146
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-JHER2LQ5.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -257,7 +257,7 @@ interface TraceComputeResult {
|
|
|
257
257
|
readonly endTime?: string;
|
|
258
258
|
}
|
|
259
259
|
/**
|
|
260
|
-
* Argument matching mode for
|
|
260
|
+
* Argument matching mode for tool-trajectory expected items.
|
|
261
261
|
* - 'exact': bidirectional deep equality, no extra keys allowed (default)
|
|
262
262
|
* - 'superset': actual args must contain all expected keys (extras OK)
|
|
263
263
|
* - 'subset': actual args must be a subset of expected keys (no unexpected keys)
|
|
@@ -265,11 +265,11 @@ interface TraceComputeResult {
|
|
|
265
265
|
*/
|
|
266
266
|
type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
|
|
267
267
|
/**
|
|
268
|
-
* Configuration for
|
|
268
|
+
* Configuration for tool-trajectory evaluator.
|
|
269
269
|
*/
|
|
270
270
|
interface ToolTrajectoryEvaluatorConfig {
|
|
271
271
|
readonly name: string;
|
|
272
|
-
readonly type: '
|
|
272
|
+
readonly type: 'tool-trajectory';
|
|
273
273
|
/** Matching mode */
|
|
274
274
|
readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
|
|
275
275
|
/** Minimum call counts per tool (for any_order mode) */
|
|
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
453
453
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
454
454
|
*/
|
|
455
455
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
456
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["
|
|
456
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
|
|
457
457
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
458
458
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
459
459
|
/**
|
|
460
|
-
* Configuration for enabling target access in
|
|
460
|
+
* Configuration for enabling target access in code-judge evaluators.
|
|
461
461
|
* When present, the runtime will start a local proxy server that allows
|
|
462
462
|
* the script to invoke configured targets without direct credential access.
|
|
463
463
|
*/
|
|
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
|
|
|
539
539
|
};
|
|
540
540
|
type CodeEvaluatorConfig = {
|
|
541
541
|
readonly name: string;
|
|
542
|
-
readonly type: 'code';
|
|
542
|
+
readonly type: 'code-judge';
|
|
543
543
|
readonly command: readonly string[];
|
|
544
544
|
/** @deprecated Use `command` instead */
|
|
545
545
|
readonly script?: readonly string[];
|
|
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
|
|
|
550
550
|
readonly required?: boolean | number;
|
|
551
551
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
552
552
|
readonly negate?: boolean;
|
|
553
|
-
/** Pass-through configuration for the
|
|
553
|
+
/** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
|
|
554
554
|
readonly config?: JsonObject;
|
|
555
555
|
/** When present, enables target access via local proxy */
|
|
556
556
|
readonly target?: TargetAccessConfig;
|
|
557
557
|
};
|
|
558
558
|
/**
|
|
559
559
|
* Executable prompt template configuration.
|
|
560
|
-
* Matches
|
|
560
|
+
* Matches code-judge pattern for consistency.
|
|
561
561
|
*/
|
|
562
562
|
type PromptScriptConfig = {
|
|
563
563
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
|
|
|
569
569
|
};
|
|
570
570
|
type LlmJudgeEvaluatorConfig = {
|
|
571
571
|
readonly name: string;
|
|
572
|
-
readonly type: '
|
|
572
|
+
readonly type: 'llm-judge';
|
|
573
573
|
/** Text prompt (inline or file path) or executable script config */
|
|
574
574
|
readonly prompt?: string | PromptScriptConfig;
|
|
575
575
|
readonly promptPath?: string;
|
|
576
576
|
/** Resolved absolute path for prompt file (used for text template prompts) */
|
|
577
577
|
readonly resolvedPromptPath?: string;
|
|
578
|
-
/** Resolved script array for executable prompts (matches
|
|
578
|
+
/** Resolved script array for executable prompts (matches code-judge pattern) */
|
|
579
579
|
readonly resolvedPromptScript?: readonly string[];
|
|
580
580
|
readonly rubrics?: readonly RubricItem[];
|
|
581
581
|
readonly weight?: number;
|
|
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
|
|
|
630
630
|
readonly type: 'weighted_average';
|
|
631
631
|
readonly weights?: Record<string, number>;
|
|
632
632
|
} | {
|
|
633
|
-
readonly type: '
|
|
633
|
+
readonly type: 'code-judge';
|
|
634
634
|
readonly path: string;
|
|
635
635
|
readonly cwd?: string;
|
|
636
636
|
} | {
|
|
637
|
-
readonly type: '
|
|
637
|
+
readonly type: 'llm-judge';
|
|
638
638
|
readonly prompt?: string;
|
|
639
639
|
readonly promptPath?: string;
|
|
640
640
|
readonly model?: string;
|
|
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
|
|
|
654
654
|
};
|
|
655
655
|
/**
|
|
656
656
|
* Match type for field accuracy evaluation.
|
|
657
|
-
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a
|
|
657
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
|
|
658
658
|
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
659
659
|
*/
|
|
660
660
|
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
@@ -682,11 +682,11 @@ type FieldConfig = {
|
|
|
682
682
|
readonly formats?: readonly string[];
|
|
683
683
|
};
|
|
684
684
|
/**
|
|
685
|
-
* Configuration for the
|
|
685
|
+
* Configuration for the field-accuracy evaluator.
|
|
686
686
|
*/
|
|
687
687
|
type FieldAccuracyEvaluatorConfig = {
|
|
688
688
|
readonly name: string;
|
|
689
|
-
readonly type: '
|
|
689
|
+
readonly type: 'field-accuracy';
|
|
690
690
|
/** Fields to compare between candidate and expected */
|
|
691
691
|
readonly fields: readonly FieldConfig[];
|
|
692
692
|
/** Strategy for combining field scores (default: weighted_average) */
|
|
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
|
|
|
725
725
|
readonly negate?: boolean;
|
|
726
726
|
};
|
|
727
727
|
/**
|
|
728
|
-
* Configuration for the
|
|
728
|
+
* Configuration for the token-usage evaluator.
|
|
729
729
|
* Checks provider-reported token usage against configured limits.
|
|
730
730
|
*/
|
|
731
731
|
type TokenUsageEvaluatorConfig = {
|
|
732
732
|
readonly name: string;
|
|
733
|
-
readonly type: '
|
|
733
|
+
readonly type: 'token-usage';
|
|
734
734
|
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
735
735
|
readonly max_total?: number;
|
|
736
736
|
/** Maximum allowed input tokens (prompt) */
|
|
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
|
|
|
743
743
|
readonly negate?: boolean;
|
|
744
744
|
};
|
|
745
745
|
/**
|
|
746
|
-
* Configuration for the
|
|
746
|
+
* Configuration for the execution-metrics evaluator.
|
|
747
747
|
* Provides declarative threshold-based checks on execution metrics.
|
|
748
748
|
* Only specified thresholds are checked; omitted ones are ignored.
|
|
749
749
|
*/
|
|
750
750
|
type ExecutionMetricsEvaluatorConfig = {
|
|
751
751
|
readonly name: string;
|
|
752
|
-
readonly type: '
|
|
752
|
+
readonly type: 'execution-metrics';
|
|
753
753
|
/** Maximum allowed number of tool calls */
|
|
754
754
|
readonly max_tool_calls?: number;
|
|
755
755
|
/** Maximum allowed number of LLM calls (assistant messages) */
|
|
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
770
770
|
readonly negate?: boolean;
|
|
771
771
|
};
|
|
772
772
|
/**
|
|
773
|
-
* Configuration for the
|
|
773
|
+
* Configuration for the agent-judge evaluator.
|
|
774
774
|
* Runs an agentic investigation loop to audit workspaces and verify criteria.
|
|
775
775
|
* Two modes:
|
|
776
776
|
* - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
|
|
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
778
778
|
*/
|
|
779
779
|
type AgentJudgeEvaluatorConfig = {
|
|
780
780
|
readonly name: string;
|
|
781
|
-
readonly type: '
|
|
781
|
+
readonly type: 'agent-judge';
|
|
782
782
|
/** Custom evaluation prompt (inline text or file path) */
|
|
783
783
|
readonly prompt?: string;
|
|
784
784
|
readonly promptPath?: string;
|
|
785
785
|
/** Resolved absolute path for prompt file */
|
|
786
786
|
readonly resolvedPromptPath?: string;
|
|
787
|
-
/** Rubric items for structured evaluation (reuses
|
|
787
|
+
/** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
|
|
788
788
|
readonly rubrics?: readonly RubricItem[];
|
|
789
789
|
/** Maximum agent steps for built-in mode (default 10, max 50) */
|
|
790
790
|
readonly max_steps?: number;
|
|
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
|
|
|
816
816
|
*/
|
|
817
817
|
type ContainsAnyEvaluatorConfig = {
|
|
818
818
|
readonly name: string;
|
|
819
|
-
readonly type: '
|
|
819
|
+
readonly type: 'contains-any';
|
|
820
820
|
readonly value: readonly string[];
|
|
821
821
|
readonly weight?: number;
|
|
822
822
|
readonly required?: boolean | number;
|
|
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
829
829
|
*/
|
|
830
830
|
type ContainsAllEvaluatorConfig = {
|
|
831
831
|
readonly name: string;
|
|
832
|
-
readonly type: '
|
|
832
|
+
readonly type: 'contains-all';
|
|
833
833
|
readonly value: readonly string[];
|
|
834
834
|
readonly weight?: number;
|
|
835
835
|
readonly required?: boolean | number;
|
|
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
|
|
|
855
855
|
*/
|
|
856
856
|
type IcontainsAnyEvaluatorConfig = {
|
|
857
857
|
readonly name: string;
|
|
858
|
-
readonly type: '
|
|
858
|
+
readonly type: 'icontains-any';
|
|
859
859
|
readonly value: readonly string[];
|
|
860
860
|
readonly weight?: number;
|
|
861
861
|
readonly required?: boolean | number;
|
|
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
868
868
|
*/
|
|
869
869
|
type IcontainsAllEvaluatorConfig = {
|
|
870
870
|
readonly name: string;
|
|
871
|
-
readonly type: '
|
|
871
|
+
readonly type: 'icontains-all';
|
|
872
872
|
readonly value: readonly string[];
|
|
873
873
|
readonly weight?: number;
|
|
874
874
|
readonly required?: boolean | number;
|
|
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
881
881
|
*/
|
|
882
882
|
type StartsWithEvaluatorConfig = {
|
|
883
883
|
readonly name: string;
|
|
884
|
-
readonly type: '
|
|
884
|
+
readonly type: 'starts-with';
|
|
885
885
|
readonly value: string;
|
|
886
886
|
readonly weight?: number;
|
|
887
887
|
readonly required?: boolean | number;
|
|
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
|
|
|
894
894
|
*/
|
|
895
895
|
type EndsWithEvaluatorConfig = {
|
|
896
896
|
readonly name: string;
|
|
897
|
-
readonly type: '
|
|
897
|
+
readonly type: 'ends-with';
|
|
898
898
|
readonly value: string;
|
|
899
899
|
readonly weight?: number;
|
|
900
900
|
readonly required?: boolean | number;
|
|
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
|
|
|
922
922
|
*/
|
|
923
923
|
type IsJsonEvaluatorConfig = {
|
|
924
924
|
readonly name: string;
|
|
925
|
-
readonly type: '
|
|
925
|
+
readonly type: 'is-json';
|
|
926
926
|
readonly weight?: number;
|
|
927
927
|
readonly required?: boolean | number;
|
|
928
928
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
@@ -1915,7 +1915,7 @@ interface CodeEvaluatorOptions {
|
|
|
1915
1915
|
readonly target?: TargetAccessConfig;
|
|
1916
1916
|
}
|
|
1917
1917
|
declare class CodeEvaluator implements Evaluator {
|
|
1918
|
-
readonly kind = "code";
|
|
1918
|
+
readonly kind = "code-judge";
|
|
1919
1919
|
private readonly command;
|
|
1920
1920
|
private readonly cwd?;
|
|
1921
1921
|
private readonly agentTimeoutMs?;
|
|
@@ -1970,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
|
|
|
1970
1970
|
* Score is proportional: hits.length / (hits.length + misses.length)
|
|
1971
1971
|
*/
|
|
1972
1972
|
declare class ExecutionMetricsEvaluator implements Evaluator {
|
|
1973
|
-
readonly kind = "
|
|
1973
|
+
readonly kind = "execution-metrics";
|
|
1974
1974
|
private readonly config;
|
|
1975
1975
|
constructor(options: ExecutionMetricsEvaluatorOptions);
|
|
1976
1976
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -1986,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
|
|
|
1986
1986
|
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
1987
1987
|
*/
|
|
1988
1988
|
declare class FieldAccuracyEvaluator implements Evaluator {
|
|
1989
|
-
readonly kind = "
|
|
1989
|
+
readonly kind = "field-accuracy";
|
|
1990
1990
|
private readonly config;
|
|
1991
1991
|
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1992
1992
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2091,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2091
2091
|
}>;
|
|
2092
2092
|
|
|
2093
2093
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
2094
|
-
readonly kind = "
|
|
2094
|
+
readonly kind = "llm-judge";
|
|
2095
2095
|
private readonly resolveJudgeProvider;
|
|
2096
2096
|
private readonly maxOutputTokens?;
|
|
2097
2097
|
private readonly temperature?;
|
|
@@ -2138,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
|
|
|
2138
2138
|
readonly judgeTargetProvider?: Provider;
|
|
2139
2139
|
}
|
|
2140
2140
|
declare class AgentJudgeEvaluator implements Evaluator {
|
|
2141
|
-
readonly kind = "
|
|
2141
|
+
readonly kind = "agent-judge";
|
|
2142
2142
|
private readonly resolveJudgeProvider;
|
|
2143
2143
|
private readonly maxSteps;
|
|
2144
2144
|
private readonly temperature;
|
|
@@ -2200,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
2200
2200
|
* Uses tokenUsage from the evaluation context.
|
|
2201
2201
|
*/
|
|
2202
2202
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
2203
|
-
readonly kind = "
|
|
2203
|
+
readonly kind = "token-usage";
|
|
2204
2204
|
private readonly config;
|
|
2205
2205
|
constructor(options: TokenUsageEvaluatorOptions);
|
|
2206
2206
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2210,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
|
|
|
2210
2210
|
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
2211
2211
|
}
|
|
2212
2212
|
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
2213
|
-
readonly kind = "
|
|
2213
|
+
readonly kind = "tool-trajectory";
|
|
2214
2214
|
private readonly config;
|
|
2215
2215
|
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
2216
2216
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2335,7 +2335,7 @@ declare class EvaluatorRegistry {
|
|
|
2335
2335
|
}
|
|
2336
2336
|
/**
|
|
2337
2337
|
* Adapter that wraps a synchronous assertion function as an Evaluator.
|
|
2338
|
-
* Used for deterministic assertions (contains, regex,
|
|
2338
|
+
* Used for deterministic assertions (contains, regex, is-json, equals).
|
|
2339
2339
|
*/
|
|
2340
2340
|
declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
2341
2341
|
private readonly assertFn;
|
|
@@ -2383,7 +2383,7 @@ interface RunEvalCaseOptions {
|
|
|
2383
2383
|
readonly provider: Provider;
|
|
2384
2384
|
readonly target: ResolvedTarget;
|
|
2385
2385
|
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
2386
|
-
readonly
|
|
2386
|
+
readonly 'llm-judge': Evaluator;
|
|
2387
2387
|
};
|
|
2388
2388
|
readonly now?: () => Date;
|
|
2389
2389
|
readonly maxRetries?: number;
|
|
@@ -2524,7 +2524,7 @@ interface EvalTestInput {
|
|
|
2524
2524
|
* Matches the YAML `assert` block structure.
|
|
2525
2525
|
*/
|
|
2526
2526
|
interface EvalAssertionInput {
|
|
2527
|
-
/** Assertion type (e.g., 'contains', '
|
|
2527
|
+
/** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
|
|
2528
2528
|
readonly type: string;
|
|
2529
2529
|
/** Display name */
|
|
2530
2530
|
readonly name?: string;
|
|
@@ -3152,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3152
3152
|
* Convention-based discovery of custom assertion scripts.
|
|
3153
3153
|
*
|
|
3154
3154
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3155
|
-
* them as
|
|
3155
|
+
* them as code-judge evaluators in the registry. The file name (without
|
|
3156
3156
|
* extension) becomes the evaluator type name.
|
|
3157
3157
|
*
|
|
3158
3158
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
package/dist/index.d.ts
CHANGED
|
@@ -257,7 +257,7 @@ interface TraceComputeResult {
|
|
|
257
257
|
readonly endTime?: string;
|
|
258
258
|
}
|
|
259
259
|
/**
|
|
260
|
-
* Argument matching mode for
|
|
260
|
+
* Argument matching mode for tool-trajectory expected items.
|
|
261
261
|
* - 'exact': bidirectional deep equality, no extra keys allowed (default)
|
|
262
262
|
* - 'superset': actual args must contain all expected keys (extras OK)
|
|
263
263
|
* - 'subset': actual args must be a subset of expected keys (no unexpected keys)
|
|
@@ -265,11 +265,11 @@ interface TraceComputeResult {
|
|
|
265
265
|
*/
|
|
266
266
|
type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
|
|
267
267
|
/**
|
|
268
|
-
* Configuration for
|
|
268
|
+
* Configuration for tool-trajectory evaluator.
|
|
269
269
|
*/
|
|
270
270
|
interface ToolTrajectoryEvaluatorConfig {
|
|
271
271
|
readonly name: string;
|
|
272
|
-
readonly type: '
|
|
272
|
+
readonly type: 'tool-trajectory';
|
|
273
273
|
/** Matching mode */
|
|
274
274
|
readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
|
|
275
275
|
/** Minimum call counts per tool (for any_order mode) */
|
|
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
453
453
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
454
454
|
*/
|
|
455
455
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
456
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["
|
|
456
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
|
|
457
457
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
458
458
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
459
459
|
/**
|
|
460
|
-
* Configuration for enabling target access in
|
|
460
|
+
* Configuration for enabling target access in code-judge evaluators.
|
|
461
461
|
* When present, the runtime will start a local proxy server that allows
|
|
462
462
|
* the script to invoke configured targets without direct credential access.
|
|
463
463
|
*/
|
|
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
|
|
|
539
539
|
};
|
|
540
540
|
type CodeEvaluatorConfig = {
|
|
541
541
|
readonly name: string;
|
|
542
|
-
readonly type: 'code';
|
|
542
|
+
readonly type: 'code-judge';
|
|
543
543
|
readonly command: readonly string[];
|
|
544
544
|
/** @deprecated Use `command` instead */
|
|
545
545
|
readonly script?: readonly string[];
|
|
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
|
|
|
550
550
|
readonly required?: boolean | number;
|
|
551
551
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
552
552
|
readonly negate?: boolean;
|
|
553
|
-
/** Pass-through configuration for the
|
|
553
|
+
/** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
|
|
554
554
|
readonly config?: JsonObject;
|
|
555
555
|
/** When present, enables target access via local proxy */
|
|
556
556
|
readonly target?: TargetAccessConfig;
|
|
557
557
|
};
|
|
558
558
|
/**
|
|
559
559
|
* Executable prompt template configuration.
|
|
560
|
-
* Matches
|
|
560
|
+
* Matches code-judge pattern for consistency.
|
|
561
561
|
*/
|
|
562
562
|
type PromptScriptConfig = {
|
|
563
563
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
|
|
|
569
569
|
};
|
|
570
570
|
type LlmJudgeEvaluatorConfig = {
|
|
571
571
|
readonly name: string;
|
|
572
|
-
readonly type: '
|
|
572
|
+
readonly type: 'llm-judge';
|
|
573
573
|
/** Text prompt (inline or file path) or executable script config */
|
|
574
574
|
readonly prompt?: string | PromptScriptConfig;
|
|
575
575
|
readonly promptPath?: string;
|
|
576
576
|
/** Resolved absolute path for prompt file (used for text template prompts) */
|
|
577
577
|
readonly resolvedPromptPath?: string;
|
|
578
|
-
/** Resolved script array for executable prompts (matches
|
|
578
|
+
/** Resolved script array for executable prompts (matches code-judge pattern) */
|
|
579
579
|
readonly resolvedPromptScript?: readonly string[];
|
|
580
580
|
readonly rubrics?: readonly RubricItem[];
|
|
581
581
|
readonly weight?: number;
|
|
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
|
|
|
630
630
|
readonly type: 'weighted_average';
|
|
631
631
|
readonly weights?: Record<string, number>;
|
|
632
632
|
} | {
|
|
633
|
-
readonly type: '
|
|
633
|
+
readonly type: 'code-judge';
|
|
634
634
|
readonly path: string;
|
|
635
635
|
readonly cwd?: string;
|
|
636
636
|
} | {
|
|
637
|
-
readonly type: '
|
|
637
|
+
readonly type: 'llm-judge';
|
|
638
638
|
readonly prompt?: string;
|
|
639
639
|
readonly promptPath?: string;
|
|
640
640
|
readonly model?: string;
|
|
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
|
|
|
654
654
|
};
|
|
655
655
|
/**
|
|
656
656
|
* Match type for field accuracy evaluation.
|
|
657
|
-
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a
|
|
657
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
|
|
658
658
|
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
659
659
|
*/
|
|
660
660
|
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
@@ -682,11 +682,11 @@ type FieldConfig = {
|
|
|
682
682
|
readonly formats?: readonly string[];
|
|
683
683
|
};
|
|
684
684
|
/**
|
|
685
|
-
* Configuration for the
|
|
685
|
+
* Configuration for the field-accuracy evaluator.
|
|
686
686
|
*/
|
|
687
687
|
type FieldAccuracyEvaluatorConfig = {
|
|
688
688
|
readonly name: string;
|
|
689
|
-
readonly type: '
|
|
689
|
+
readonly type: 'field-accuracy';
|
|
690
690
|
/** Fields to compare between candidate and expected */
|
|
691
691
|
readonly fields: readonly FieldConfig[];
|
|
692
692
|
/** Strategy for combining field scores (default: weighted_average) */
|
|
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
|
|
|
725
725
|
readonly negate?: boolean;
|
|
726
726
|
};
|
|
727
727
|
/**
|
|
728
|
-
* Configuration for the
|
|
728
|
+
* Configuration for the token-usage evaluator.
|
|
729
729
|
* Checks provider-reported token usage against configured limits.
|
|
730
730
|
*/
|
|
731
731
|
type TokenUsageEvaluatorConfig = {
|
|
732
732
|
readonly name: string;
|
|
733
|
-
readonly type: '
|
|
733
|
+
readonly type: 'token-usage';
|
|
734
734
|
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
735
735
|
readonly max_total?: number;
|
|
736
736
|
/** Maximum allowed input tokens (prompt) */
|
|
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
|
|
|
743
743
|
readonly negate?: boolean;
|
|
744
744
|
};
|
|
745
745
|
/**
|
|
746
|
-
* Configuration for the
|
|
746
|
+
* Configuration for the execution-metrics evaluator.
|
|
747
747
|
* Provides declarative threshold-based checks on execution metrics.
|
|
748
748
|
* Only specified thresholds are checked; omitted ones are ignored.
|
|
749
749
|
*/
|
|
750
750
|
type ExecutionMetricsEvaluatorConfig = {
|
|
751
751
|
readonly name: string;
|
|
752
|
-
readonly type: '
|
|
752
|
+
readonly type: 'execution-metrics';
|
|
753
753
|
/** Maximum allowed number of tool calls */
|
|
754
754
|
readonly max_tool_calls?: number;
|
|
755
755
|
/** Maximum allowed number of LLM calls (assistant messages) */
|
|
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
770
770
|
readonly negate?: boolean;
|
|
771
771
|
};
|
|
772
772
|
/**
|
|
773
|
-
* Configuration for the
|
|
773
|
+
* Configuration for the agent-judge evaluator.
|
|
774
774
|
* Runs an agentic investigation loop to audit workspaces and verify criteria.
|
|
775
775
|
* Two modes:
|
|
776
776
|
* - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
|
|
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
778
778
|
*/
|
|
779
779
|
type AgentJudgeEvaluatorConfig = {
|
|
780
780
|
readonly name: string;
|
|
781
|
-
readonly type: '
|
|
781
|
+
readonly type: 'agent-judge';
|
|
782
782
|
/** Custom evaluation prompt (inline text or file path) */
|
|
783
783
|
readonly prompt?: string;
|
|
784
784
|
readonly promptPath?: string;
|
|
785
785
|
/** Resolved absolute path for prompt file */
|
|
786
786
|
readonly resolvedPromptPath?: string;
|
|
787
|
-
/** Rubric items for structured evaluation (reuses
|
|
787
|
+
/** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
|
|
788
788
|
readonly rubrics?: readonly RubricItem[];
|
|
789
789
|
/** Maximum agent steps for built-in mode (default 10, max 50) */
|
|
790
790
|
readonly max_steps?: number;
|
|
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
|
|
|
816
816
|
*/
|
|
817
817
|
type ContainsAnyEvaluatorConfig = {
|
|
818
818
|
readonly name: string;
|
|
819
|
-
readonly type: '
|
|
819
|
+
readonly type: 'contains-any';
|
|
820
820
|
readonly value: readonly string[];
|
|
821
821
|
readonly weight?: number;
|
|
822
822
|
readonly required?: boolean | number;
|
|
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
829
829
|
*/
|
|
830
830
|
type ContainsAllEvaluatorConfig = {
|
|
831
831
|
readonly name: string;
|
|
832
|
-
readonly type: '
|
|
832
|
+
readonly type: 'contains-all';
|
|
833
833
|
readonly value: readonly string[];
|
|
834
834
|
readonly weight?: number;
|
|
835
835
|
readonly required?: boolean | number;
|
|
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
|
|
|
855
855
|
*/
|
|
856
856
|
type IcontainsAnyEvaluatorConfig = {
|
|
857
857
|
readonly name: string;
|
|
858
|
-
readonly type: '
|
|
858
|
+
readonly type: 'icontains-any';
|
|
859
859
|
readonly value: readonly string[];
|
|
860
860
|
readonly weight?: number;
|
|
861
861
|
readonly required?: boolean | number;
|
|
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
868
868
|
*/
|
|
869
869
|
type IcontainsAllEvaluatorConfig = {
|
|
870
870
|
readonly name: string;
|
|
871
|
-
readonly type: '
|
|
871
|
+
readonly type: 'icontains-all';
|
|
872
872
|
readonly value: readonly string[];
|
|
873
873
|
readonly weight?: number;
|
|
874
874
|
readonly required?: boolean | number;
|
|
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
881
881
|
*/
|
|
882
882
|
type StartsWithEvaluatorConfig = {
|
|
883
883
|
readonly name: string;
|
|
884
|
-
readonly type: '
|
|
884
|
+
readonly type: 'starts-with';
|
|
885
885
|
readonly value: string;
|
|
886
886
|
readonly weight?: number;
|
|
887
887
|
readonly required?: boolean | number;
|
|
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
|
|
|
894
894
|
*/
|
|
895
895
|
type EndsWithEvaluatorConfig = {
|
|
896
896
|
readonly name: string;
|
|
897
|
-
readonly type: '
|
|
897
|
+
readonly type: 'ends-with';
|
|
898
898
|
readonly value: string;
|
|
899
899
|
readonly weight?: number;
|
|
900
900
|
readonly required?: boolean | number;
|
|
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
|
|
|
922
922
|
*/
|
|
923
923
|
type IsJsonEvaluatorConfig = {
|
|
924
924
|
readonly name: string;
|
|
925
|
-
readonly type: '
|
|
925
|
+
readonly type: 'is-json';
|
|
926
926
|
readonly weight?: number;
|
|
927
927
|
readonly required?: boolean | number;
|
|
928
928
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
@@ -1915,7 +1915,7 @@ interface CodeEvaluatorOptions {
|
|
|
1915
1915
|
readonly target?: TargetAccessConfig;
|
|
1916
1916
|
}
|
|
1917
1917
|
declare class CodeEvaluator implements Evaluator {
|
|
1918
|
-
readonly kind = "code";
|
|
1918
|
+
readonly kind = "code-judge";
|
|
1919
1919
|
private readonly command;
|
|
1920
1920
|
private readonly cwd?;
|
|
1921
1921
|
private readonly agentTimeoutMs?;
|
|
@@ -1970,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
|
|
|
1970
1970
|
* Score is proportional: hits.length / (hits.length + misses.length)
|
|
1971
1971
|
*/
|
|
1972
1972
|
declare class ExecutionMetricsEvaluator implements Evaluator {
|
|
1973
|
-
readonly kind = "
|
|
1973
|
+
readonly kind = "execution-metrics";
|
|
1974
1974
|
private readonly config;
|
|
1975
1975
|
constructor(options: ExecutionMetricsEvaluatorOptions);
|
|
1976
1976
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -1986,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
|
|
|
1986
1986
|
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
1987
1987
|
*/
|
|
1988
1988
|
declare class FieldAccuracyEvaluator implements Evaluator {
|
|
1989
|
-
readonly kind = "
|
|
1989
|
+
readonly kind = "field-accuracy";
|
|
1990
1990
|
private readonly config;
|
|
1991
1991
|
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1992
1992
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2091,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2091
2091
|
}>;
|
|
2092
2092
|
|
|
2093
2093
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
2094
|
-
readonly kind = "
|
|
2094
|
+
readonly kind = "llm-judge";
|
|
2095
2095
|
private readonly resolveJudgeProvider;
|
|
2096
2096
|
private readonly maxOutputTokens?;
|
|
2097
2097
|
private readonly temperature?;
|
|
@@ -2138,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
|
|
|
2138
2138
|
readonly judgeTargetProvider?: Provider;
|
|
2139
2139
|
}
|
|
2140
2140
|
declare class AgentJudgeEvaluator implements Evaluator {
|
|
2141
|
-
readonly kind = "
|
|
2141
|
+
readonly kind = "agent-judge";
|
|
2142
2142
|
private readonly resolveJudgeProvider;
|
|
2143
2143
|
private readonly maxSteps;
|
|
2144
2144
|
private readonly temperature;
|
|
@@ -2200,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
2200
2200
|
* Uses tokenUsage from the evaluation context.
|
|
2201
2201
|
*/
|
|
2202
2202
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
2203
|
-
readonly kind = "
|
|
2203
|
+
readonly kind = "token-usage";
|
|
2204
2204
|
private readonly config;
|
|
2205
2205
|
constructor(options: TokenUsageEvaluatorOptions);
|
|
2206
2206
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2210,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
|
|
|
2210
2210
|
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
2211
2211
|
}
|
|
2212
2212
|
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
2213
|
-
readonly kind = "
|
|
2213
|
+
readonly kind = "tool-trajectory";
|
|
2214
2214
|
private readonly config;
|
|
2215
2215
|
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
2216
2216
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2335,7 +2335,7 @@ declare class EvaluatorRegistry {
|
|
|
2335
2335
|
}
|
|
2336
2336
|
/**
|
|
2337
2337
|
* Adapter that wraps a synchronous assertion function as an Evaluator.
|
|
2338
|
-
* Used for deterministic assertions (contains, regex,
|
|
2338
|
+
* Used for deterministic assertions (contains, regex, is-json, equals).
|
|
2339
2339
|
*/
|
|
2340
2340
|
declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
2341
2341
|
private readonly assertFn;
|
|
@@ -2383,7 +2383,7 @@ interface RunEvalCaseOptions {
|
|
|
2383
2383
|
readonly provider: Provider;
|
|
2384
2384
|
readonly target: ResolvedTarget;
|
|
2385
2385
|
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
2386
|
-
readonly
|
|
2386
|
+
readonly 'llm-judge': Evaluator;
|
|
2387
2387
|
};
|
|
2388
2388
|
readonly now?: () => Date;
|
|
2389
2389
|
readonly maxRetries?: number;
|
|
@@ -2524,7 +2524,7 @@ interface EvalTestInput {
|
|
|
2524
2524
|
* Matches the YAML `assert` block structure.
|
|
2525
2525
|
*/
|
|
2526
2526
|
interface EvalAssertionInput {
|
|
2527
|
-
/** Assertion type (e.g., 'contains', '
|
|
2527
|
+
/** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
|
|
2528
2528
|
readonly type: string;
|
|
2529
2529
|
/** Display name */
|
|
2530
2530
|
readonly name?: string;
|
|
@@ -3152,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3152
3152
|
* Convention-based discovery of custom assertion scripts.
|
|
3153
3153
|
*
|
|
3154
3154
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3155
|
-
* them as
|
|
3155
|
+
* them as code-judge evaluators in the registry. The file name (without
|
|
3156
3156
|
* extension) becomes the evaluator type name.
|
|
3157
3157
|
*
|
|
3158
3158
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|