@agentv/core 2.12.0 → 2.14.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7HPKTRFZ.js → chunk-N55K52OO.js} +15 -15
- package/dist/chunk-N55K52OO.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +25 -24
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +12 -11
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +248 -160
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +58 -41
- package/dist/index.d.ts +58 -41
- package/dist/index.js +235 -148
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-7HPKTRFZ.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -257,7 +257,7 @@ interface TraceComputeResult {
|
|
|
257
257
|
readonly endTime?: string;
|
|
258
258
|
}
|
|
259
259
|
/**
|
|
260
|
-
* Argument matching mode for
|
|
260
|
+
* Argument matching mode for tool-trajectory expected items.
|
|
261
261
|
* - 'exact': bidirectional deep equality, no extra keys allowed (default)
|
|
262
262
|
* - 'superset': actual args must contain all expected keys (extras OK)
|
|
263
263
|
* - 'subset': actual args must be a subset of expected keys (no unexpected keys)
|
|
@@ -265,11 +265,11 @@ interface TraceComputeResult {
|
|
|
265
265
|
*/
|
|
266
266
|
type ArgsMatchMode = 'exact' | 'ignore' | 'subset' | 'superset';
|
|
267
267
|
/**
|
|
268
|
-
* Configuration for
|
|
268
|
+
* Configuration for tool-trajectory evaluator.
|
|
269
269
|
*/
|
|
270
270
|
interface ToolTrajectoryEvaluatorConfig {
|
|
271
271
|
readonly name: string;
|
|
272
|
-
readonly type: '
|
|
272
|
+
readonly type: 'tool-trajectory';
|
|
273
273
|
/** Matching mode */
|
|
274
274
|
readonly mode: 'any_order' | 'in_order' | 'exact' | 'subset' | 'superset';
|
|
275
275
|
/** Minimum call counts per tool (for any_order mode) */
|
|
@@ -453,11 +453,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
453
453
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
454
454
|
*/
|
|
455
455
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
456
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["
|
|
456
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "agent-judge", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics"];
|
|
457
457
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
458
458
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
459
459
|
/**
|
|
460
|
-
* Configuration for enabling target access in
|
|
460
|
+
* Configuration for enabling target access in code-judge evaluators.
|
|
461
461
|
* When present, the runtime will start a local proxy server that allows
|
|
462
462
|
* the script to invoke configured targets without direct credential access.
|
|
463
463
|
*/
|
|
@@ -539,7 +539,7 @@ type WorkspaceConfig = {
|
|
|
539
539
|
};
|
|
540
540
|
type CodeEvaluatorConfig = {
|
|
541
541
|
readonly name: string;
|
|
542
|
-
readonly type: 'code';
|
|
542
|
+
readonly type: 'code-judge';
|
|
543
543
|
readonly command: readonly string[];
|
|
544
544
|
/** @deprecated Use `command` instead */
|
|
545
545
|
readonly script?: readonly string[];
|
|
@@ -550,14 +550,14 @@ type CodeEvaluatorConfig = {
|
|
|
550
550
|
readonly required?: boolean | number;
|
|
551
551
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
552
552
|
readonly negate?: boolean;
|
|
553
|
-
/** Pass-through configuration for the
|
|
553
|
+
/** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
|
|
554
554
|
readonly config?: JsonObject;
|
|
555
555
|
/** When present, enables target access via local proxy */
|
|
556
556
|
readonly target?: TargetAccessConfig;
|
|
557
557
|
};
|
|
558
558
|
/**
|
|
559
559
|
* Executable prompt template configuration.
|
|
560
|
-
* Matches
|
|
560
|
+
* Matches code-judge pattern for consistency.
|
|
561
561
|
*/
|
|
562
562
|
type PromptScriptConfig = {
|
|
563
563
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
@@ -569,13 +569,13 @@ type PromptScriptConfig = {
|
|
|
569
569
|
};
|
|
570
570
|
type LlmJudgeEvaluatorConfig = {
|
|
571
571
|
readonly name: string;
|
|
572
|
-
readonly type: '
|
|
572
|
+
readonly type: 'llm-judge';
|
|
573
573
|
/** Text prompt (inline or file path) or executable script config */
|
|
574
574
|
readonly prompt?: string | PromptScriptConfig;
|
|
575
575
|
readonly promptPath?: string;
|
|
576
576
|
/** Resolved absolute path for prompt file (used for text template prompts) */
|
|
577
577
|
readonly resolvedPromptPath?: string;
|
|
578
|
-
/** Resolved script array for executable prompts (matches
|
|
578
|
+
/** Resolved script array for executable prompts (matches code-judge pattern) */
|
|
579
579
|
readonly resolvedPromptScript?: readonly string[];
|
|
580
580
|
readonly rubrics?: readonly RubricItem[];
|
|
581
581
|
readonly weight?: number;
|
|
@@ -630,11 +630,11 @@ type CompositeAggregatorConfig = {
|
|
|
630
630
|
readonly type: 'weighted_average';
|
|
631
631
|
readonly weights?: Record<string, number>;
|
|
632
632
|
} | {
|
|
633
|
-
readonly type: '
|
|
633
|
+
readonly type: 'code-judge';
|
|
634
634
|
readonly path: string;
|
|
635
635
|
readonly cwd?: string;
|
|
636
636
|
} | {
|
|
637
|
-
readonly type: '
|
|
637
|
+
readonly type: 'llm-judge';
|
|
638
638
|
readonly prompt?: string;
|
|
639
639
|
readonly promptPath?: string;
|
|
640
640
|
readonly model?: string;
|
|
@@ -654,7 +654,7 @@ type CompositeEvaluatorConfig = {
|
|
|
654
654
|
};
|
|
655
655
|
/**
|
|
656
656
|
* Match type for field accuracy evaluation.
|
|
657
|
-
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a
|
|
657
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
|
|
658
658
|
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
659
659
|
*/
|
|
660
660
|
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
@@ -682,11 +682,11 @@ type FieldConfig = {
|
|
|
682
682
|
readonly formats?: readonly string[];
|
|
683
683
|
};
|
|
684
684
|
/**
|
|
685
|
-
* Configuration for the
|
|
685
|
+
* Configuration for the field-accuracy evaluator.
|
|
686
686
|
*/
|
|
687
687
|
type FieldAccuracyEvaluatorConfig = {
|
|
688
688
|
readonly name: string;
|
|
689
|
-
readonly type: '
|
|
689
|
+
readonly type: 'field-accuracy';
|
|
690
690
|
/** Fields to compare between candidate and expected */
|
|
691
691
|
readonly fields: readonly FieldConfig[];
|
|
692
692
|
/** Strategy for combining field scores (default: weighted_average) */
|
|
@@ -725,12 +725,12 @@ type CostEvaluatorConfig = {
|
|
|
725
725
|
readonly negate?: boolean;
|
|
726
726
|
};
|
|
727
727
|
/**
|
|
728
|
-
* Configuration for the
|
|
728
|
+
* Configuration for the token-usage evaluator.
|
|
729
729
|
* Checks provider-reported token usage against configured limits.
|
|
730
730
|
*/
|
|
731
731
|
type TokenUsageEvaluatorConfig = {
|
|
732
732
|
readonly name: string;
|
|
733
|
-
readonly type: '
|
|
733
|
+
readonly type: 'token-usage';
|
|
734
734
|
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
735
735
|
readonly max_total?: number;
|
|
736
736
|
/** Maximum allowed input tokens (prompt) */
|
|
@@ -743,13 +743,13 @@ type TokenUsageEvaluatorConfig = {
|
|
|
743
743
|
readonly negate?: boolean;
|
|
744
744
|
};
|
|
745
745
|
/**
|
|
746
|
-
* Configuration for the
|
|
746
|
+
* Configuration for the execution-metrics evaluator.
|
|
747
747
|
* Provides declarative threshold-based checks on execution metrics.
|
|
748
748
|
* Only specified thresholds are checked; omitted ones are ignored.
|
|
749
749
|
*/
|
|
750
750
|
type ExecutionMetricsEvaluatorConfig = {
|
|
751
751
|
readonly name: string;
|
|
752
|
-
readonly type: '
|
|
752
|
+
readonly type: 'execution-metrics';
|
|
753
753
|
/** Maximum allowed number of tool calls */
|
|
754
754
|
readonly max_tool_calls?: number;
|
|
755
755
|
/** Maximum allowed number of LLM calls (assistant messages) */
|
|
@@ -770,7 +770,7 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
770
770
|
readonly negate?: boolean;
|
|
771
771
|
};
|
|
772
772
|
/**
|
|
773
|
-
* Configuration for the
|
|
773
|
+
* Configuration for the agent-judge evaluator.
|
|
774
774
|
* Runs an agentic investigation loop to audit workspaces and verify criteria.
|
|
775
775
|
* Two modes:
|
|
776
776
|
* - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
|
|
@@ -778,13 +778,13 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
778
778
|
*/
|
|
779
779
|
type AgentJudgeEvaluatorConfig = {
|
|
780
780
|
readonly name: string;
|
|
781
|
-
readonly type: '
|
|
781
|
+
readonly type: 'agent-judge';
|
|
782
782
|
/** Custom evaluation prompt (inline text or file path) */
|
|
783
783
|
readonly prompt?: string;
|
|
784
784
|
readonly promptPath?: string;
|
|
785
785
|
/** Resolved absolute path for prompt file */
|
|
786
786
|
readonly resolvedPromptPath?: string;
|
|
787
|
-
/** Rubric items for structured evaluation (reuses
|
|
787
|
+
/** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
|
|
788
788
|
readonly rubrics?: readonly RubricItem[];
|
|
789
789
|
/** Maximum agent steps for built-in mode (default 10, max 50) */
|
|
790
790
|
readonly max_steps?: number;
|
|
@@ -816,7 +816,7 @@ type ContainsEvaluatorConfig = {
|
|
|
816
816
|
*/
|
|
817
817
|
type ContainsAnyEvaluatorConfig = {
|
|
818
818
|
readonly name: string;
|
|
819
|
-
readonly type: '
|
|
819
|
+
readonly type: 'contains-any';
|
|
820
820
|
readonly value: readonly string[];
|
|
821
821
|
readonly weight?: number;
|
|
822
822
|
readonly required?: boolean | number;
|
|
@@ -829,7 +829,7 @@ type ContainsAnyEvaluatorConfig = {
|
|
|
829
829
|
*/
|
|
830
830
|
type ContainsAllEvaluatorConfig = {
|
|
831
831
|
readonly name: string;
|
|
832
|
-
readonly type: '
|
|
832
|
+
readonly type: 'contains-all';
|
|
833
833
|
readonly value: readonly string[];
|
|
834
834
|
readonly weight?: number;
|
|
835
835
|
readonly required?: boolean | number;
|
|
@@ -855,7 +855,7 @@ type IcontainsEvaluatorConfig = {
|
|
|
855
855
|
*/
|
|
856
856
|
type IcontainsAnyEvaluatorConfig = {
|
|
857
857
|
readonly name: string;
|
|
858
|
-
readonly type: '
|
|
858
|
+
readonly type: 'icontains-any';
|
|
859
859
|
readonly value: readonly string[];
|
|
860
860
|
readonly weight?: number;
|
|
861
861
|
readonly required?: boolean | number;
|
|
@@ -868,7 +868,7 @@ type IcontainsAnyEvaluatorConfig = {
|
|
|
868
868
|
*/
|
|
869
869
|
type IcontainsAllEvaluatorConfig = {
|
|
870
870
|
readonly name: string;
|
|
871
|
-
readonly type: '
|
|
871
|
+
readonly type: 'icontains-all';
|
|
872
872
|
readonly value: readonly string[];
|
|
873
873
|
readonly weight?: number;
|
|
874
874
|
readonly required?: boolean | number;
|
|
@@ -881,7 +881,7 @@ type IcontainsAllEvaluatorConfig = {
|
|
|
881
881
|
*/
|
|
882
882
|
type StartsWithEvaluatorConfig = {
|
|
883
883
|
readonly name: string;
|
|
884
|
-
readonly type: '
|
|
884
|
+
readonly type: 'starts-with';
|
|
885
885
|
readonly value: string;
|
|
886
886
|
readonly weight?: number;
|
|
887
887
|
readonly required?: boolean | number;
|
|
@@ -894,7 +894,7 @@ type StartsWithEvaluatorConfig = {
|
|
|
894
894
|
*/
|
|
895
895
|
type EndsWithEvaluatorConfig = {
|
|
896
896
|
readonly name: string;
|
|
897
|
-
readonly type: '
|
|
897
|
+
readonly type: 'ends-with';
|
|
898
898
|
readonly value: string;
|
|
899
899
|
readonly weight?: number;
|
|
900
900
|
readonly required?: boolean | number;
|
|
@@ -922,7 +922,7 @@ type RegexEvaluatorConfig = {
|
|
|
922
922
|
*/
|
|
923
923
|
type IsJsonEvaluatorConfig = {
|
|
924
924
|
readonly name: string;
|
|
925
|
-
readonly type: '
|
|
925
|
+
readonly type: 'is-json';
|
|
926
926
|
readonly weight?: number;
|
|
927
927
|
readonly required?: boolean | number;
|
|
928
928
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
@@ -1060,6 +1060,12 @@ interface ExecutionError {
|
|
|
1060
1060
|
readonly message: string;
|
|
1061
1061
|
readonly stage: FailureStage;
|
|
1062
1062
|
}
|
|
1063
|
+
/**
|
|
1064
|
+
* Tolerance for execution errors in an eval run.
|
|
1065
|
+
* - `true`: halt on first execution error
|
|
1066
|
+
* - `false`: never halt on errors (default)
|
|
1067
|
+
*/
|
|
1068
|
+
type FailOnError = boolean;
|
|
1063
1069
|
/**
|
|
1064
1070
|
* Evaluator scorecard for a single eval case run.
|
|
1065
1071
|
*/
|
|
@@ -1194,6 +1200,7 @@ type ExecutionDefaults = {
|
|
|
1194
1200
|
readonly otel_file?: string;
|
|
1195
1201
|
};
|
|
1196
1202
|
type AgentVConfig$1 = {
|
|
1203
|
+
readonly required_version?: string;
|
|
1197
1204
|
readonly guideline_patterns?: readonly string[];
|
|
1198
1205
|
readonly eval_patterns?: readonly string[];
|
|
1199
1206
|
readonly execution?: ExecutionDefaults;
|
|
@@ -1238,6 +1245,12 @@ interface CacheConfig {
|
|
|
1238
1245
|
* Returns undefined when no cache config is specified.
|
|
1239
1246
|
*/
|
|
1240
1247
|
declare function extractCacheConfig(suite: JsonObject): CacheConfig | undefined;
|
|
1248
|
+
/**
|
|
1249
|
+
* Extract `execution.fail_on_error` from parsed eval suite.
|
|
1250
|
+
* Accepts `true` or `false`.
|
|
1251
|
+
* Returns undefined when not specified.
|
|
1252
|
+
*/
|
|
1253
|
+
declare function extractFailOnError(suite: JsonObject): FailOnError | undefined;
|
|
1241
1254
|
|
|
1242
1255
|
/**
|
|
1243
1256
|
* Formatting mode for segment content.
|
|
@@ -1297,6 +1310,8 @@ type EvalSuiteResult = {
|
|
|
1297
1310
|
readonly metadata?: EvalMetadata;
|
|
1298
1311
|
/** Suite-level total cost budget in USD */
|
|
1299
1312
|
readonly totalBudgetUsd?: number;
|
|
1313
|
+
/** Execution error tolerance: true or false */
|
|
1314
|
+
readonly failOnError?: FailOnError;
|
|
1300
1315
|
};
|
|
1301
1316
|
/**
|
|
1302
1317
|
* Load tests and suite metadata from a single parse.
|
|
@@ -1900,7 +1915,7 @@ interface CodeEvaluatorOptions {
|
|
|
1900
1915
|
readonly target?: TargetAccessConfig;
|
|
1901
1916
|
}
|
|
1902
1917
|
declare class CodeEvaluator implements Evaluator {
|
|
1903
|
-
readonly kind = "code";
|
|
1918
|
+
readonly kind = "code-judge";
|
|
1904
1919
|
private readonly command;
|
|
1905
1920
|
private readonly cwd?;
|
|
1906
1921
|
private readonly agentTimeoutMs?;
|
|
@@ -1955,7 +1970,7 @@ interface ExecutionMetricsEvaluatorOptions {
|
|
|
1955
1970
|
* Score is proportional: hits.length / (hits.length + misses.length)
|
|
1956
1971
|
*/
|
|
1957
1972
|
declare class ExecutionMetricsEvaluator implements Evaluator {
|
|
1958
|
-
readonly kind = "
|
|
1973
|
+
readonly kind = "execution-metrics";
|
|
1959
1974
|
private readonly config;
|
|
1960
1975
|
constructor(options: ExecutionMetricsEvaluatorOptions);
|
|
1961
1976
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -1971,7 +1986,7 @@ interface FieldAccuracyEvaluatorOptions {
|
|
|
1971
1986
|
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
1972
1987
|
*/
|
|
1973
1988
|
declare class FieldAccuracyEvaluator implements Evaluator {
|
|
1974
|
-
readonly kind = "
|
|
1989
|
+
readonly kind = "field-accuracy";
|
|
1975
1990
|
private readonly config;
|
|
1976
1991
|
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1977
1992
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2076,7 +2091,7 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2076
2091
|
}>;
|
|
2077
2092
|
|
|
2078
2093
|
declare class LlmJudgeEvaluator implements Evaluator {
|
|
2079
|
-
readonly kind = "
|
|
2094
|
+
readonly kind = "llm-judge";
|
|
2080
2095
|
private readonly resolveJudgeProvider;
|
|
2081
2096
|
private readonly maxOutputTokens?;
|
|
2082
2097
|
private readonly temperature?;
|
|
@@ -2123,7 +2138,7 @@ interface AgentJudgeEvaluatorOptions {
|
|
|
2123
2138
|
readonly judgeTargetProvider?: Provider;
|
|
2124
2139
|
}
|
|
2125
2140
|
declare class AgentJudgeEvaluator implements Evaluator {
|
|
2126
|
-
readonly kind = "
|
|
2141
|
+
readonly kind = "agent-judge";
|
|
2127
2142
|
private readonly resolveJudgeProvider;
|
|
2128
2143
|
private readonly maxSteps;
|
|
2129
2144
|
private readonly temperature;
|
|
@@ -2185,7 +2200,7 @@ interface TokenUsageEvaluatorOptions {
|
|
|
2185
2200
|
* Uses tokenUsage from the evaluation context.
|
|
2186
2201
|
*/
|
|
2187
2202
|
declare class TokenUsageEvaluator implements Evaluator {
|
|
2188
|
-
readonly kind = "
|
|
2203
|
+
readonly kind = "token-usage";
|
|
2189
2204
|
private readonly config;
|
|
2190
2205
|
constructor(options: TokenUsageEvaluatorOptions);
|
|
2191
2206
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2195,7 +2210,7 @@ interface ToolTrajectoryEvaluatorOptions {
|
|
|
2195
2210
|
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
2196
2211
|
}
|
|
2197
2212
|
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
2198
|
-
readonly kind = "
|
|
2213
|
+
readonly kind = "tool-trajectory";
|
|
2199
2214
|
private readonly config;
|
|
2200
2215
|
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
2201
2216
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
@@ -2320,7 +2335,7 @@ declare class EvaluatorRegistry {
|
|
|
2320
2335
|
}
|
|
2321
2336
|
/**
|
|
2322
2337
|
* Adapter that wraps a synchronous assertion function as an Evaluator.
|
|
2323
|
-
* Used for deterministic assertions (contains, regex,
|
|
2338
|
+
* Used for deterministic assertions (contains, regex, is-json, equals).
|
|
2324
2339
|
*/
|
|
2325
2340
|
declare class DeterministicAssertionEvaluator implements Evaluator {
|
|
2326
2341
|
private readonly assertFn;
|
|
@@ -2368,7 +2383,7 @@ interface RunEvalCaseOptions {
|
|
|
2368
2383
|
readonly provider: Provider;
|
|
2369
2384
|
readonly target: ResolvedTarget;
|
|
2370
2385
|
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
2371
|
-
readonly
|
|
2386
|
+
readonly 'llm-judge': Evaluator;
|
|
2372
2387
|
};
|
|
2373
2388
|
readonly now?: () => Date;
|
|
2374
2389
|
readonly maxRetries?: number;
|
|
@@ -2440,6 +2455,8 @@ interface RunEvaluationOptions {
|
|
|
2440
2455
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
2441
2456
|
/** Suite-level total cost budget in USD (stops dispatching when exceeded) */
|
|
2442
2457
|
readonly totalBudgetUsd?: number;
|
|
2458
|
+
/** Execution error tolerance: true halts on first error */
|
|
2459
|
+
readonly failOnError?: FailOnError;
|
|
2443
2460
|
}
|
|
2444
2461
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2445
2462
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
@@ -2507,7 +2524,7 @@ interface EvalTestInput {
|
|
|
2507
2524
|
* Matches the YAML `assert` block structure.
|
|
2508
2525
|
*/
|
|
2509
2526
|
interface EvalAssertionInput {
|
|
2510
|
-
/** Assertion type (e.g., 'contains', '
|
|
2527
|
+
/** Assertion type (e.g., 'contains', 'llm-judge', 'code-judge') */
|
|
2511
2528
|
readonly type: string;
|
|
2512
2529
|
/** Display name */
|
|
2513
2530
|
readonly name?: string;
|
|
@@ -3135,7 +3152,7 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3135
3152
|
* Convention-based discovery of custom assertion scripts.
|
|
3136
3153
|
*
|
|
3137
3154
|
* Scans `.agentv/assertions/` for TypeScript/JavaScript files and registers
|
|
3138
|
-
* them as
|
|
3155
|
+
* them as code-judge evaluators in the registry. The file name (without
|
|
3139
3156
|
* extension) becomes the evaluator type name.
|
|
3140
3157
|
*
|
|
3141
3158
|
* Example: `.agentv/assertions/sentiment.ts` → type "sentiment" in EVAL.yaml
|
|
@@ -3156,4 +3173,4 @@ type AgentKernel = {
|
|
|
3156
3173
|
};
|
|
3157
3174
|
declare function createAgentKernel(): AgentKernel;
|
|
3158
3175
|
|
|
3159
|
-
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|
|
3176
|
+
export { AgentJudgeEvaluator, type AgentJudgeEvaluatorConfig, type AgentJudgeEvaluatorOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResetConfig, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceScriptConfig, assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, getWorkspacePath, initializeBaseline, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, mergeExecutionMetrics, negateScore, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, trimBaselineResult };
|