@agentv/core 2.19.0 → 3.0.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-5CJVBBGG.js +7 -0
- package/dist/agentv-provider-5CJVBBGG.js.map +1 -0
- package/dist/{chunk-ACTIPQZ3.js → chunk-CASGWWOU.js} +56 -20
- package/dist/chunk-CASGWWOU.js.map +1 -0
- package/dist/chunk-XBGLLO22.js +65 -0
- package/dist/chunk-XBGLLO22.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +31 -14
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +19 -10
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +4690 -3406
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +362 -137
- package/dist/index.d.ts +362 -137
- package/dist/index.js +7316 -6147
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
- package/dist/chunk-ACTIPQZ3.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -8,7 +8,7 @@ interface ChatMessage {
|
|
|
8
8
|
readonly name?: string;
|
|
9
9
|
}
|
|
10
10
|
type ChatPrompt = readonly ChatMessage[];
|
|
11
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
11
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
|
|
12
12
|
/** Callbacks for real-time observability during provider execution */
|
|
13
13
|
interface ProviderStreamCallbacks {
|
|
14
14
|
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
@@ -144,6 +144,8 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
|
144
144
|
interface TargetDefinition {
|
|
145
145
|
readonly name: string;
|
|
146
146
|
readonly provider: ProviderKind | string;
|
|
147
|
+
readonly grader_target?: string | undefined;
|
|
148
|
+
/** @deprecated Use `grader_target` instead */
|
|
147
149
|
readonly judge_target?: string | undefined;
|
|
148
150
|
readonly workers?: number | undefined;
|
|
149
151
|
readonly provider_batching?: boolean | undefined;
|
|
@@ -463,11 +465,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
463
465
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
464
466
|
*/
|
|
465
467
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
466
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "
|
|
468
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
467
469
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
468
470
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
469
471
|
/**
|
|
470
|
-
* Configuration for enabling target access in code-judge evaluators.
|
|
472
|
+
* Configuration for enabling target access in code-grader evaluators.
|
|
471
473
|
* When present, the runtime will start a local proxy server that allows
|
|
472
474
|
* the script to invoke configured targets without direct credential access.
|
|
473
475
|
*/
|
|
@@ -566,7 +568,7 @@ type WorkspaceConfig = {
|
|
|
566
568
|
};
|
|
567
569
|
type CodeEvaluatorConfig = {
|
|
568
570
|
readonly name: string;
|
|
569
|
-
readonly type: 'code-judge';
|
|
571
|
+
readonly type: 'code-judge' | 'code-grader';
|
|
570
572
|
readonly command: readonly string[];
|
|
571
573
|
/** @deprecated Use `command` instead */
|
|
572
574
|
readonly script?: readonly string[];
|
|
@@ -577,14 +579,14 @@ type CodeEvaluatorConfig = {
|
|
|
577
579
|
readonly required?: boolean | number;
|
|
578
580
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
579
581
|
readonly negate?: boolean;
|
|
580
|
-
/** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
|
|
582
|
+
/** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
|
|
581
583
|
readonly config?: JsonObject;
|
|
582
584
|
/** When present, enables target access via local proxy */
|
|
583
585
|
readonly target?: TargetAccessConfig;
|
|
584
586
|
};
|
|
585
587
|
/**
|
|
586
588
|
* Executable prompt template configuration.
|
|
587
|
-
* Matches code-judge pattern for consistency.
|
|
589
|
+
* Matches code-grader pattern for consistency.
|
|
588
590
|
*/
|
|
589
591
|
type PromptScriptConfig = {
|
|
590
592
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
@@ -594,24 +596,32 @@ type PromptScriptConfig = {
|
|
|
594
596
|
/** Pass-through configuration for the prompt template */
|
|
595
597
|
readonly config?: Record<string, unknown>;
|
|
596
598
|
};
|
|
597
|
-
type LlmJudgeEvaluatorConfig = {
|
|
599
|
+
type LlmGraderEvaluatorConfig = {
|
|
598
600
|
readonly name: string;
|
|
599
|
-
readonly type: 'llm-judge';
|
|
601
|
+
readonly type: 'llm-grader' | 'llm-judge';
|
|
600
602
|
/** Text prompt (inline or file path) or executable script config */
|
|
601
603
|
readonly prompt?: string | PromptScriptConfig;
|
|
602
604
|
readonly promptPath?: string;
|
|
603
605
|
/** Resolved absolute path for prompt file (used for text template prompts) */
|
|
604
606
|
readonly resolvedPromptPath?: string;
|
|
605
|
-
/** Resolved script array for executable prompts (matches code-judge pattern) */
|
|
607
|
+
/** Resolved script array for executable prompts (matches code-grader pattern) */
|
|
606
608
|
readonly resolvedPromptScript?: readonly string[];
|
|
607
609
|
readonly rubrics?: readonly RubricItem[];
|
|
608
610
|
readonly weight?: number;
|
|
609
611
|
readonly required?: boolean | number;
|
|
610
612
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
611
613
|
readonly negate?: boolean;
|
|
614
|
+
/** Optional target override for this grader (uses a named LLM target from targets.yaml). */
|
|
615
|
+
readonly target?: string;
|
|
612
616
|
/** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
|
|
613
617
|
readonly config?: Record<string, unknown>;
|
|
618
|
+
/** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. */
|
|
619
|
+
readonly max_steps?: number;
|
|
620
|
+
/** Temperature override for grader calls */
|
|
621
|
+
readonly temperature?: number;
|
|
614
622
|
};
|
|
623
|
+
/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
|
|
624
|
+
type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
|
|
615
625
|
/**
|
|
616
626
|
* Score range definition for analytic rubric scoring.
|
|
617
627
|
* Each range maps an integer score band (0-10) to an outcome description.
|
|
@@ -623,7 +633,7 @@ type ScoreRange = {
|
|
|
623
633
|
readonly outcome: string;
|
|
624
634
|
};
|
|
625
635
|
/**
|
|
626
|
-
* Rubric item for LLM judge evaluation.
|
|
636
|
+
* Rubric item for LLM grader evaluation.
|
|
627
637
|
* Supports two modes:
|
|
628
638
|
* - Checklist mode: boolean satisfied/not-satisfied with `outcome`
|
|
629
639
|
* - Score-range mode: 0-10 integer scoring with `score_ranges`
|
|
@@ -648,7 +658,7 @@ type RubricItem = {
|
|
|
648
658
|
readonly required_min_score?: number;
|
|
649
659
|
/**
|
|
650
660
|
* Score range definitions for analytic rubric scoring.
|
|
651
|
-
* When present, the judge outputs an integer 0-10 score per criterion.
|
|
661
|
+
* When present, the grader outputs an integer 0-10 score per criterion.
|
|
652
662
|
* Ranges must be non-overlapping and cover 0-10 inclusive.
|
|
653
663
|
*/
|
|
654
664
|
readonly score_ranges?: readonly ScoreRange[];
|
|
@@ -656,10 +666,19 @@ type RubricItem = {
|
|
|
656
666
|
type CompositeAggregatorConfig = {
|
|
657
667
|
readonly type: 'weighted_average';
|
|
658
668
|
readonly weights?: Record<string, number>;
|
|
669
|
+
} | {
|
|
670
|
+
readonly type: 'code-grader';
|
|
671
|
+
readonly path: string;
|
|
672
|
+
readonly cwd?: string;
|
|
659
673
|
} | {
|
|
660
674
|
readonly type: 'code-judge';
|
|
661
675
|
readonly path: string;
|
|
662
676
|
readonly cwd?: string;
|
|
677
|
+
} | {
|
|
678
|
+
readonly type: 'llm-grader';
|
|
679
|
+
readonly prompt?: string;
|
|
680
|
+
readonly promptPath?: string;
|
|
681
|
+
readonly model?: string;
|
|
663
682
|
} | {
|
|
664
683
|
readonly type: 'llm-judge';
|
|
665
684
|
readonly prompt?: string;
|
|
@@ -672,7 +691,7 @@ type CompositeAggregatorConfig = {
|
|
|
672
691
|
type CompositeEvaluatorConfig = {
|
|
673
692
|
readonly name: string;
|
|
674
693
|
readonly type: 'composite';
|
|
675
|
-
readonly
|
|
694
|
+
readonly assertions: readonly EvaluatorConfig[];
|
|
676
695
|
readonly aggregator: CompositeAggregatorConfig;
|
|
677
696
|
readonly weight?: number;
|
|
678
697
|
readonly required?: boolean | number;
|
|
@@ -681,7 +700,7 @@ type CompositeEvaluatorConfig = {
|
|
|
681
700
|
};
|
|
682
701
|
/**
|
|
683
702
|
* Match type for field accuracy evaluation.
|
|
684
|
-
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
|
|
703
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-grader evaluator.
|
|
685
704
|
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
686
705
|
*/
|
|
687
706
|
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
@@ -796,34 +815,6 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
796
815
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
797
816
|
readonly negate?: boolean;
|
|
798
817
|
};
|
|
799
|
-
/**
|
|
800
|
-
* Configuration for the agent-judge evaluator.
|
|
801
|
-
* Runs an agentic investigation loop to audit workspaces and verify criteria.
|
|
802
|
-
* Two modes:
|
|
803
|
-
* - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
|
|
804
|
-
* - Judge target: Delegates to an external agent provider via Provider.invoke()
|
|
805
|
-
*/
|
|
806
|
-
type AgentJudgeEvaluatorConfig = {
|
|
807
|
-
readonly name: string;
|
|
808
|
-
readonly type: 'agent-judge';
|
|
809
|
-
/** Custom evaluation prompt (inline text or file path) */
|
|
810
|
-
readonly prompt?: string;
|
|
811
|
-
readonly promptPath?: string;
|
|
812
|
-
/** Resolved absolute path for prompt file */
|
|
813
|
-
readonly resolvedPromptPath?: string;
|
|
814
|
-
/** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
|
|
815
|
-
readonly rubrics?: readonly RubricItem[];
|
|
816
|
-
/** Maximum agent steps for built-in mode (default 10, max 50) */
|
|
817
|
-
readonly max_steps?: number;
|
|
818
|
-
/** Temperature for built-in mode (default 0) */
|
|
819
|
-
readonly temperature?: number;
|
|
820
|
-
/** Target name — delegates agent loop to this provider instead of built-in mode */
|
|
821
|
-
readonly target?: string;
|
|
822
|
-
readonly weight?: number;
|
|
823
|
-
readonly required?: boolean | number;
|
|
824
|
-
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
825
|
-
readonly negate?: boolean;
|
|
826
|
-
};
|
|
827
818
|
/**
|
|
828
819
|
* Configuration for the contains assertion evaluator.
|
|
829
820
|
* Checks whether the candidate output contains a specified substring.
|
|
@@ -981,7 +972,34 @@ type RubricsEvaluatorConfig = {
|
|
|
981
972
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
982
973
|
readonly negate?: boolean;
|
|
983
974
|
};
|
|
984
|
-
|
|
975
|
+
/**
|
|
976
|
+
* Configuration for the skill-trigger evaluator.
|
|
977
|
+
* Detects whether the agent invoked a named Claude Code skill as its first tool call.
|
|
978
|
+
* Mirrors the post-hoc fallback detection in skill-creator's run_eval.py.
|
|
979
|
+
*/
|
|
980
|
+
type SkillTriggerEvaluatorConfig = {
|
|
981
|
+
readonly name: string;
|
|
982
|
+
readonly type: 'skill-trigger';
|
|
983
|
+
/** The skill name to check for (case-sensitive substring match) */
|
|
984
|
+
readonly skill: string;
|
|
985
|
+
/** Whether the skill is expected to trigger (default: true) */
|
|
986
|
+
readonly should_trigger?: boolean;
|
|
987
|
+
readonly weight?: number;
|
|
988
|
+
readonly required?: boolean | number;
|
|
989
|
+
readonly negate?: boolean;
|
|
990
|
+
};
|
|
991
|
+
/**
|
|
992
|
+
* Configuration for the inline-assert evaluator.
|
|
993
|
+
* Wraps an AssertFn for in-process evaluation via the evaluate() API.
|
|
994
|
+
*/
|
|
995
|
+
type InlineAssertEvaluatorConfig = {
|
|
996
|
+
readonly name: string;
|
|
997
|
+
readonly type: 'inline-assert';
|
|
998
|
+
readonly weight?: number;
|
|
999
|
+
readonly required?: boolean | number;
|
|
1000
|
+
readonly negate?: boolean;
|
|
1001
|
+
};
|
|
1002
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
985
1003
|
/**
|
|
986
1004
|
* Eval test definition sourced from AgentV specs.
|
|
987
1005
|
*/
|
|
@@ -999,7 +1017,7 @@ interface EvalTest {
|
|
|
999
1017
|
readonly file_paths: readonly string[];
|
|
1000
1018
|
readonly criteria: string;
|
|
1001
1019
|
readonly evaluator?: EvaluatorKind;
|
|
1002
|
-
readonly
|
|
1020
|
+
readonly assertions?: readonly EvaluatorConfig[];
|
|
1003
1021
|
/** Workspace configuration (merged from suite-level and case-level) */
|
|
1004
1022
|
readonly workspace?: WorkspaceConfig;
|
|
1005
1023
|
/** Arbitrary metadata passed to workspace scripts via stdin */
|
|
@@ -1172,15 +1190,15 @@ interface EvaluatorResult {
|
|
|
1172
1190
|
readonly rawRequest?: JsonObject;
|
|
1173
1191
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
1174
1192
|
readonly scores?: readonly EvaluatorResult[];
|
|
1175
|
-
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
1193
|
+
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1176
1194
|
readonly details?: JsonObject;
|
|
1177
1195
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1178
1196
|
readonly tokenUsage?: TokenUsage;
|
|
1179
|
-
/** Wall-clock duration of this
|
|
1197
|
+
/** Wall-clock duration of this grader execution in milliseconds. */
|
|
1180
1198
|
readonly durationMs?: number;
|
|
1181
|
-
/** ISO 8601 UTC timestamp when this
|
|
1199
|
+
/** ISO 8601 UTC timestamp when this grader started executing. */
|
|
1182
1200
|
readonly startedAt?: string;
|
|
1183
|
-
/** ISO 8601 UTC timestamp when this
|
|
1201
|
+
/** ISO 8601 UTC timestamp when this grader finished executing. */
|
|
1184
1202
|
readonly endedAt?: string;
|
|
1185
1203
|
}
|
|
1186
1204
|
/**
|
|
@@ -1314,7 +1332,7 @@ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): P
|
|
|
1314
1332
|
/**
|
|
1315
1333
|
* Detect file format by extension.
|
|
1316
1334
|
*/
|
|
1317
|
-
declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
|
|
1335
|
+
declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
|
|
1318
1336
|
|
|
1319
1337
|
type LoadOptions = {
|
|
1320
1338
|
readonly verbose?: boolean;
|
|
@@ -1366,6 +1384,83 @@ declare function loadTestById(evalFilePath: string, repoRoot: URL | string, eval
|
|
|
1366
1384
|
/** @deprecated Use `loadTestById` instead */
|
|
1367
1385
|
declare const loadEvalCaseById: typeof loadTestById;
|
|
1368
1386
|
|
|
1387
|
+
/**
|
|
1388
|
+
* Raw Agent Skills evals.json schema.
|
|
1389
|
+
* @see https://agentskills.io/skill-creation/evaluating-skills
|
|
1390
|
+
*/
|
|
1391
|
+
interface AgentSkillsEvalsFile {
|
|
1392
|
+
readonly skill_name?: string;
|
|
1393
|
+
readonly evals: readonly AgentSkillsEvalCase[];
|
|
1394
|
+
}
|
|
1395
|
+
interface AgentSkillsEvalCase {
|
|
1396
|
+
readonly id: number;
|
|
1397
|
+
readonly prompt: string;
|
|
1398
|
+
readonly expected_output?: string;
|
|
1399
|
+
readonly files?: readonly string[];
|
|
1400
|
+
readonly assertions?: readonly string[];
|
|
1401
|
+
}
|
|
1402
|
+
/**
|
|
1403
|
+
* Detect whether a JSON file is in Agent Skills evals.json format.
|
|
1404
|
+
* Returns true if the parsed content has an `evals` array.
|
|
1405
|
+
*/
|
|
1406
|
+
declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEvalsFile;
|
|
1407
|
+
/**
|
|
1408
|
+
* Parse already-loaded Agent Skills evals data into EvalTest[].
|
|
1409
|
+
* Exported for testing without file I/O.
|
|
1410
|
+
*/
|
|
1411
|
+
declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
|
|
1412
|
+
|
|
1413
|
+
/**
|
|
1414
|
+
* EVAL.yaml → evals.json transpiler.
|
|
1415
|
+
*
|
|
1416
|
+
* Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
|
|
1417
|
+
* for consumption by the skill-creator pipeline.
|
|
1418
|
+
*
|
|
1419
|
+
* Handles both `assertions:` (current) and `assert:` (deprecated alias).
|
|
1420
|
+
*/
|
|
1421
|
+
interface EvalsJsonCase {
|
|
1422
|
+
id: number;
|
|
1423
|
+
prompt: string;
|
|
1424
|
+
expected_output?: string;
|
|
1425
|
+
files?: string[];
|
|
1426
|
+
should_trigger?: boolean;
|
|
1427
|
+
assertions: string[];
|
|
1428
|
+
}
|
|
1429
|
+
interface EvalsJsonFile {
|
|
1430
|
+
skill_name: string;
|
|
1431
|
+
evals: EvalsJsonCase[];
|
|
1432
|
+
}
|
|
1433
|
+
/**
|
|
1434
|
+
* Result of transpiling a single EVAL.yaml.
|
|
1435
|
+
* May produce multiple evals.json files (one per skill).
|
|
1436
|
+
*/
|
|
1437
|
+
interface TranspileResult {
|
|
1438
|
+
/** Map from skill_name → EvalsJsonFile */
|
|
1439
|
+
files: Map<string, EvalsJsonFile>;
|
|
1440
|
+
/** Warning messages accumulated during transpilation */
|
|
1441
|
+
warnings: string[];
|
|
1442
|
+
}
|
|
1443
|
+
/**
|
|
1444
|
+
* Transpile a parsed EVAL.yaml object into one or more evals.json objects.
|
|
1445
|
+
*
|
|
1446
|
+
* @param suite Parsed YAML object (already loaded, no file I/O here)
|
|
1447
|
+
* @param source Source identifier for error messages (e.g. file path)
|
|
1448
|
+
*/
|
|
1449
|
+
declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
|
|
1450
|
+
/**
|
|
1451
|
+
* Transpile an EVAL.yaml file into one or more evals.json objects.
|
|
1452
|
+
* Returns a map from output filename → JSON content.
|
|
1453
|
+
*
|
|
1454
|
+
* @param evalYamlPath Absolute path to the EVAL.yaml file
|
|
1455
|
+
*/
|
|
1456
|
+
declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
|
|
1457
|
+
/**
|
|
1458
|
+
* Determine the output filename(s) for a transpile result.
|
|
1459
|
+
* Single skill → "evals.json"
|
|
1460
|
+
* Multiple skills → "<skill>.evals.json"
|
|
1461
|
+
*/
|
|
1462
|
+
declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
|
|
1463
|
+
|
|
1369
1464
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
1370
1465
|
/**
|
|
1371
1466
|
* Normalize line endings to LF (\n).
|
|
@@ -1613,87 +1708,112 @@ interface VSCodeResolvedConfig {
|
|
|
1613
1708
|
readonly workspaceTemplate?: string;
|
|
1614
1709
|
readonly timeoutMs?: number;
|
|
1615
1710
|
}
|
|
1711
|
+
interface AgentVResolvedConfig {
|
|
1712
|
+
readonly model: string;
|
|
1713
|
+
readonly temperature: number;
|
|
1714
|
+
}
|
|
1616
1715
|
type ResolvedTarget = {
|
|
1617
1716
|
readonly kind: 'azure';
|
|
1618
1717
|
readonly name: string;
|
|
1619
|
-
readonly judgeTarget?: string;
|
|
1718
|
+
readonly graderTarget?: string;
|
|
1620
1719
|
readonly workers?: number;
|
|
1621
1720
|
readonly providerBatching?: boolean;
|
|
1622
1721
|
readonly config: AzureResolvedConfig;
|
|
1623
1722
|
} | {
|
|
1624
1723
|
readonly kind: 'anthropic';
|
|
1625
1724
|
readonly name: string;
|
|
1626
|
-
readonly judgeTarget?: string;
|
|
1725
|
+
readonly graderTarget?: string;
|
|
1627
1726
|
readonly workers?: number;
|
|
1628
1727
|
readonly providerBatching?: boolean;
|
|
1629
1728
|
readonly config: AnthropicResolvedConfig;
|
|
1630
1729
|
} | {
|
|
1631
1730
|
readonly kind: 'gemini';
|
|
1632
1731
|
readonly name: string;
|
|
1633
|
-
readonly judgeTarget?: string;
|
|
1732
|
+
readonly graderTarget?: string;
|
|
1634
1733
|
readonly workers?: number;
|
|
1635
1734
|
readonly providerBatching?: boolean;
|
|
1636
1735
|
readonly config: GeminiResolvedConfig;
|
|
1637
1736
|
} | {
|
|
1638
1737
|
readonly kind: 'codex';
|
|
1639
1738
|
readonly name: string;
|
|
1640
|
-
readonly judgeTarget?: string;
|
|
1739
|
+
readonly graderTarget?: string;
|
|
1641
1740
|
readonly workers?: number;
|
|
1642
1741
|
readonly providerBatching?: boolean;
|
|
1643
1742
|
readonly config: CodexResolvedConfig;
|
|
1644
1743
|
} | {
|
|
1645
1744
|
readonly kind: 'copilot-sdk';
|
|
1646
1745
|
readonly name: string;
|
|
1647
|
-
readonly judgeTarget?: string;
|
|
1746
|
+
readonly graderTarget?: string;
|
|
1648
1747
|
readonly workers?: number;
|
|
1649
1748
|
readonly providerBatching?: boolean;
|
|
1650
1749
|
readonly config: CopilotSdkResolvedConfig;
|
|
1651
1750
|
} | {
|
|
1652
1751
|
readonly kind: 'copilot-cli';
|
|
1653
1752
|
readonly name: string;
|
|
1654
|
-
readonly judgeTarget?: string;
|
|
1753
|
+
readonly graderTarget?: string;
|
|
1655
1754
|
readonly workers?: number;
|
|
1656
1755
|
readonly providerBatching?: boolean;
|
|
1657
1756
|
readonly config: CopilotCliResolvedConfig;
|
|
1658
1757
|
} | {
|
|
1659
1758
|
readonly kind: 'pi-coding-agent';
|
|
1660
1759
|
readonly name: string;
|
|
1661
|
-
readonly judgeTarget?: string;
|
|
1760
|
+
readonly graderTarget?: string;
|
|
1662
1761
|
readonly workers?: number;
|
|
1663
1762
|
readonly providerBatching?: boolean;
|
|
1664
1763
|
readonly config: PiCodingAgentResolvedConfig;
|
|
1665
1764
|
} | {
|
|
1666
1765
|
readonly kind: 'pi-agent-sdk';
|
|
1667
1766
|
readonly name: string;
|
|
1668
|
-
readonly judgeTarget?: string;
|
|
1767
|
+
readonly graderTarget?: string;
|
|
1669
1768
|
readonly workers?: number;
|
|
1670
1769
|
readonly providerBatching?: boolean;
|
|
1671
1770
|
readonly config: PiAgentSdkResolvedConfig;
|
|
1672
1771
|
} | {
|
|
1673
1772
|
readonly kind: 'claude';
|
|
1674
1773
|
readonly name: string;
|
|
1675
|
-
readonly judgeTarget?: string;
|
|
1774
|
+
readonly graderTarget?: string;
|
|
1775
|
+
readonly workers?: number;
|
|
1776
|
+
readonly providerBatching?: boolean;
|
|
1777
|
+
readonly config: ClaudeResolvedConfig;
|
|
1778
|
+
} | {
|
|
1779
|
+
readonly kind: 'claude-cli';
|
|
1780
|
+
readonly name: string;
|
|
1781
|
+
readonly graderTarget?: string;
|
|
1782
|
+
readonly workers?: number;
|
|
1783
|
+
readonly providerBatching?: boolean;
|
|
1784
|
+
readonly config: ClaudeResolvedConfig;
|
|
1785
|
+
} | {
|
|
1786
|
+
readonly kind: 'claude-sdk';
|
|
1787
|
+
readonly name: string;
|
|
1788
|
+
readonly graderTarget?: string;
|
|
1676
1789
|
readonly workers?: number;
|
|
1677
1790
|
readonly providerBatching?: boolean;
|
|
1678
1791
|
readonly config: ClaudeResolvedConfig;
|
|
1679
1792
|
} | {
|
|
1680
1793
|
readonly kind: 'mock';
|
|
1681
1794
|
readonly name: string;
|
|
1682
|
-
readonly judgeTarget?: string;
|
|
1795
|
+
readonly graderTarget?: string;
|
|
1683
1796
|
readonly workers?: number;
|
|
1684
1797
|
readonly providerBatching?: boolean;
|
|
1685
1798
|
readonly config: MockResolvedConfig;
|
|
1686
1799
|
} | {
|
|
1687
1800
|
readonly kind: 'vscode' | 'vscode-insiders';
|
|
1688
1801
|
readonly name: string;
|
|
1689
|
-
readonly judgeTarget?: string;
|
|
1802
|
+
readonly graderTarget?: string;
|
|
1690
1803
|
readonly workers?: number;
|
|
1691
1804
|
readonly providerBatching?: boolean;
|
|
1692
1805
|
readonly config: VSCodeResolvedConfig;
|
|
1806
|
+
} | {
|
|
1807
|
+
readonly kind: 'agentv';
|
|
1808
|
+
readonly name: string;
|
|
1809
|
+
readonly graderTarget?: string;
|
|
1810
|
+
readonly workers?: number;
|
|
1811
|
+
readonly providerBatching?: boolean;
|
|
1812
|
+
readonly config: AgentVResolvedConfig;
|
|
1693
1813
|
} | {
|
|
1694
1814
|
readonly kind: 'cli';
|
|
1695
1815
|
readonly name: string;
|
|
1696
|
-
readonly judgeTarget?: string;
|
|
1816
|
+
readonly graderTarget?: string;
|
|
1697
1817
|
readonly workers?: number;
|
|
1698
1818
|
readonly providerBatching?: boolean;
|
|
1699
1819
|
readonly config: CliResolvedConfig;
|
|
@@ -1845,7 +1965,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
|
|
|
1845
1965
|
|
|
1846
1966
|
/**
|
|
1847
1967
|
* Function to resolve a target name to a provider.
|
|
1848
|
-
* Used by code judges to support target override.
|
|
1968
|
+
* Used by code graders to support target override.
|
|
1849
1969
|
*/
|
|
1850
1970
|
type TargetResolver = (targetName: string) => Provider | undefined;
|
|
1851
1971
|
interface EvaluationContext {
|
|
@@ -1861,6 +1981,8 @@ interface EvaluationContext {
|
|
|
1861
1981
|
readonly chatPrompt?: ChatPrompt;
|
|
1862
1982
|
};
|
|
1863
1983
|
readonly now: Date;
|
|
1984
|
+
readonly graderProvider?: Provider;
|
|
1985
|
+
/** @deprecated Use `graderProvider` instead */
|
|
1864
1986
|
readonly judgeProvider?: Provider;
|
|
1865
1987
|
readonly evaluatorTemplateOverride?: string;
|
|
1866
1988
|
readonly evaluator?: EvaluatorConfig;
|
|
@@ -1878,9 +2000,9 @@ interface EvaluationContext {
|
|
|
1878
2000
|
readonly startTime?: string;
|
|
1879
2001
|
/** ISO 8601 timestamp when execution ended */
|
|
1880
2002
|
readonly endTime?: string;
|
|
1881
|
-
/** Resolver for target override in code judges */
|
|
2003
|
+
/** Resolver for target override in code graders */
|
|
1882
2004
|
readonly targetResolver?: TargetResolver;
|
|
1883
|
-
/** List of available target names for code judges */
|
|
2005
|
+
/** List of available target names for code graders */
|
|
1884
2006
|
readonly availableTargets?: readonly string[];
|
|
1885
2007
|
/** Unified diff of file changes from workspace (when workspace_template is configured) */
|
|
1886
2008
|
readonly fileChanges?: string;
|
|
@@ -1896,7 +2018,7 @@ interface EvaluationScore {
|
|
|
1896
2018
|
readonly reasoning?: string;
|
|
1897
2019
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1898
2020
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1899
|
-
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
2021
|
+
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1900
2022
|
readonly details?: JsonObject;
|
|
1901
2023
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1902
2024
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -1912,7 +2034,7 @@ interface ChildEvaluatorResult {
|
|
|
1912
2034
|
readonly reasoning?: string;
|
|
1913
2035
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1914
2036
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1915
|
-
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
2037
|
+
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1916
2038
|
readonly details?: JsonObject;
|
|
1917
2039
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1918
2040
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -2071,12 +2193,18 @@ declare class LatencyEvaluator implements Evaluator {
|
|
|
2071
2193
|
* Custom evaluators can override this via evaluatorTemplate option.
|
|
2072
2194
|
*/
|
|
2073
2195
|
declare const DEFAULT_EVALUATOR_TEMPLATE: string;
|
|
2074
|
-
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
2075
|
-
interface LlmJudgeEvaluatorOptions {
|
|
2076
|
-
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
2196
|
+
type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
2197
|
+
interface LlmGraderEvaluatorOptions {
|
|
2198
|
+
readonly resolveGraderProvider: GraderProviderResolver;
|
|
2199
|
+
/** @deprecated Use `resolveGraderProvider` instead. */
|
|
2200
|
+
readonly resolveJudgeProvider?: GraderProviderResolver;
|
|
2077
2201
|
readonly maxOutputTokens?: number;
|
|
2078
2202
|
readonly temperature?: number;
|
|
2079
2203
|
readonly evaluatorTemplate?: string;
|
|
2204
|
+
readonly maxSteps?: number;
|
|
2205
|
+
readonly graderTargetProvider?: Provider;
|
|
2206
|
+
/** @deprecated Use `graderTargetProvider` instead. */
|
|
2207
|
+
readonly judgeTargetProvider?: Provider;
|
|
2080
2208
|
}
|
|
2081
2209
|
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
2082
2210
|
score: z.ZodNumber;
|
|
@@ -2125,13 +2253,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2125
2253
|
overall_reasoning: string;
|
|
2126
2254
|
}>;
|
|
2127
2255
|
|
|
2128
|
-
declare class LlmJudgeEvaluator implements Evaluator {
|
|
2129
|
-
readonly kind = "llm-judge";
|
|
2130
|
-
private readonly resolveJudgeProvider;
|
|
2256
|
+
declare class LlmGraderEvaluator implements Evaluator {
|
|
2257
|
+
readonly kind = "llm-grader";
|
|
2258
|
+
private readonly resolveGraderProvider;
|
|
2131
2259
|
private readonly maxOutputTokens?;
|
|
2132
2260
|
private readonly temperature?;
|
|
2133
2261
|
private readonly evaluatorTemplate?;
|
|
2134
|
-
|
|
2262
|
+
private readonly maxSteps;
|
|
2263
|
+
private readonly graderTargetProvider?;
|
|
2264
|
+
constructor(options: LlmGraderEvaluatorOptions);
|
|
2135
2265
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2136
2266
|
private evaluateFreeform;
|
|
2137
2267
|
private evaluateWithRubrics;
|
|
@@ -2140,6 +2270,43 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
2140
2270
|
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
2141
2271
|
*/
|
|
2142
2272
|
private evaluateWithScoreRanges;
|
|
2273
|
+
/**
|
|
2274
|
+
* Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
|
|
2275
|
+
*/
|
|
2276
|
+
private evaluateBuiltIn;
|
|
2277
|
+
/**
|
|
2278
|
+
* Grader target mode: Delegates to an explicit graderTargetProvider via Provider.invoke().
|
|
2279
|
+
*/
|
|
2280
|
+
private evaluateWithGraderTarget;
|
|
2281
|
+
/**
|
|
2282
|
+
* Delegate mode: resolved provider is an agent provider — send prompt via invoke().
|
|
2283
|
+
*/
|
|
2284
|
+
private evaluateWithDelegatedAgent;
|
|
2285
|
+
/**
|
|
2286
|
+
* Shared implementation for grader_target and delegate modes.
|
|
2287
|
+
* Both invoke a provider and parse the agent result from the response.
|
|
2288
|
+
*/
|
|
2289
|
+
private evaluateWithDelegate;
|
|
2290
|
+
/**
|
|
2291
|
+
* Build system prompt for built-in agent mode.
|
|
2292
|
+
* Includes output format instructions.
|
|
2293
|
+
*/
|
|
2294
|
+
private buildAgentSystemPrompt;
|
|
2295
|
+
/**
|
|
2296
|
+
* Build user prompt for built-in agent mode.
|
|
2297
|
+
* Uses custom template if provided, otherwise builds default prompt.
|
|
2298
|
+
*/
|
|
2299
|
+
private buildAgentUserPrompt;
|
|
2300
|
+
/**
|
|
2301
|
+
* Build the full evaluation prompt for delegate mode (agent providers).
|
|
2302
|
+
* Combines task context, criteria, candidate info, and output format instructions.
|
|
2303
|
+
*/
|
|
2304
|
+
private buildDelegatedPrompt;
|
|
2305
|
+
/**
|
|
2306
|
+
* Parse the agent's response text into an EvaluationScore.
|
|
2307
|
+
* Supports both freeform and rubric modes.
|
|
2308
|
+
*/
|
|
2309
|
+
private parseAgentResult;
|
|
2143
2310
|
/**
|
|
2144
2311
|
* Build prompt for score-range rubric evaluation.
|
|
2145
2312
|
*/
|
|
@@ -2165,67 +2332,40 @@ declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSch
|
|
|
2165
2332
|
*/
|
|
2166
2333
|
declare function buildScoreRangeOutputSchema(): string;
|
|
2167
2334
|
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
|
|
2176
|
-
|
|
2177
|
-
|
|
2178
|
-
|
|
2179
|
-
|
|
2180
|
-
|
|
2181
|
-
|
|
2182
|
-
|
|
2183
|
-
|
|
2184
|
-
|
|
2185
|
-
* Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
|
|
2186
|
-
*/
|
|
2187
|
-
private evaluateBuiltIn;
|
|
2188
|
-
/**
|
|
2189
|
-
* Judge target mode: Delegates to an external agent provider via Provider.invoke().
|
|
2190
|
-
*/
|
|
2191
|
-
private evaluateWithJudgeTarget;
|
|
2192
|
-
/**
|
|
2193
|
-
* Parse the agent's response text into an EvaluationScore.
|
|
2194
|
-
* Supports both freeform and rubric modes.
|
|
2195
|
-
*/
|
|
2196
|
-
private parseResult;
|
|
2197
|
-
/**
|
|
2198
|
-
* Build system prompt for built-in mode.
|
|
2199
|
-
* Includes output format instructions.
|
|
2200
|
-
*/
|
|
2201
|
-
private buildSystemPrompt;
|
|
2202
|
-
/**
|
|
2203
|
-
* Build user prompt for built-in mode.
|
|
2204
|
-
* Uses custom template if provided, otherwise builds default prompt.
|
|
2205
|
-
*/
|
|
2206
|
-
private buildUserPrompt;
|
|
2207
|
-
/**
|
|
2208
|
-
* Build the full evaluation prompt for judge target mode (delegation).
|
|
2209
|
-
* Combines task context, criteria, candidate info, and output format instructions.
|
|
2210
|
-
*/
|
|
2211
|
-
private buildDelegatedPrompt;
|
|
2335
|
+
/**
|
|
2336
|
+
* Built-in skill-trigger evaluator.
|
|
2337
|
+
*
|
|
2338
|
+
* Detects whether the agent invoked a named Claude Code skill as its first tool call.
|
|
2339
|
+
* Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
|
|
2340
|
+
* - Only the FIRST tool call matters.
|
|
2341
|
+
* - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
|
|
2342
|
+
* - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
|
|
2343
|
+
* - Any other tool as first call means the skill was not triggered.
|
|
2344
|
+
* - Supports negative cases via should_trigger: false.
|
|
2345
|
+
*/
|
|
2346
|
+
|
|
2347
|
+
declare class SkillTriggerEvaluator implements Evaluator {
|
|
2348
|
+
readonly kind = "skill-trigger";
|
|
2349
|
+
private readonly config;
|
|
2350
|
+
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2351
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2212
2352
|
}
|
|
2213
2353
|
|
|
2214
|
-
interface LlmJudgePromptAssembly {
|
|
2354
|
+
interface LlmGraderPromptAssembly {
|
|
2215
2355
|
systemPrompt: string;
|
|
2216
2356
|
userPrompt: string;
|
|
2217
2357
|
responseSchema: string;
|
|
2218
2358
|
mode: 'freeform' | 'checklist' | 'score_range';
|
|
2219
2359
|
}
|
|
2220
|
-
declare function assembleLlmJudgePrompt(input: {
|
|
2360
|
+
declare function assembleLlmGraderPrompt(input: {
|
|
2221
2361
|
evalCase: EvalTest;
|
|
2222
2362
|
candidate: string;
|
|
2223
2363
|
promptInputs: PromptInputs;
|
|
2224
|
-
evaluatorConfig?: LlmJudgeEvaluatorConfig;
|
|
2364
|
+
evaluatorConfig?: LlmGraderEvaluatorConfig;
|
|
2225
2365
|
output?: readonly Message[];
|
|
2226
2366
|
fileChanges?: string;
|
|
2227
2367
|
evaluatorTemplateOverride?: string;
|
|
2228
|
-
}): LlmJudgePromptAssembly;
|
|
2368
|
+
}): LlmGraderPromptAssembly;
|
|
2229
2369
|
|
|
2230
2370
|
interface TokenUsageEvaluatorOptions {
|
|
2231
2371
|
readonly config: TokenUsageEvaluatorConfig;
|
|
@@ -2322,18 +2462,22 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
|
|
|
2322
2462
|
* Contains shared resources needed by evaluator instances.
|
|
2323
2463
|
*/
|
|
2324
2464
|
interface EvaluatorDispatchContext {
|
|
2325
|
-
/** Shared LLM judge provider (resolved at suite level) */
|
|
2465
|
+
/** Shared LLM grader provider (resolved at suite level) */
|
|
2466
|
+
readonly graderProvider?: Provider;
|
|
2467
|
+
/** @deprecated Use `graderProvider` instead */
|
|
2326
2468
|
readonly judgeProvider?: Provider;
|
|
2327
2469
|
/** Function to resolve target names to providers */
|
|
2328
2470
|
readonly targetResolver?: TargetResolver;
|
|
2329
|
-
/** Available target names for code judges */
|
|
2471
|
+
/** Available target names for code graders */
|
|
2330
2472
|
readonly availableTargets?: readonly string[];
|
|
2331
2473
|
/** Agent timeout in ms */
|
|
2332
2474
|
readonly agentTimeoutMs?: number;
|
|
2333
2475
|
/** Directory containing the eval file (for composite member resolution) */
|
|
2334
2476
|
readonly evalFileDir?: string;
|
|
2335
|
-
/** Shared LLM judge evaluator instance */
|
|
2336
|
-
readonly llmJudge: Evaluator;
|
|
2477
|
+
/** Shared LLM grader evaluator instance */
|
|
2478
|
+
readonly llmGrader: Evaluator;
|
|
2479
|
+
/** @deprecated Use `llmGrader` instead */
|
|
2480
|
+
readonly llmJudge?: Evaluator;
|
|
2337
2481
|
/** Reference to the registry itself (for composite evaluators that need to create children) */
|
|
2338
2482
|
readonly registry: EvaluatorRegistry;
|
|
2339
2483
|
}
|
|
@@ -2341,8 +2485,8 @@ interface EvaluatorDispatchContext {
|
|
|
2341
2485
|
* Factory function that creates an Evaluator instance from a config.
|
|
2342
2486
|
*
|
|
2343
2487
|
* Factory functions handle all type-specific initialization logic:
|
|
2344
|
-
* - Reading prompt files for LLM judges
|
|
2345
|
-
* - Resolving script paths for code judges
|
|
2488
|
+
* - Reading prompt files for LLM graders
|
|
2489
|
+
* - Resolving script paths for code graders
|
|
2346
2490
|
* - Creating adapter evaluators for deterministic assertions
|
|
2347
2491
|
*/
|
|
2348
2492
|
type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
|
|
@@ -2404,7 +2548,7 @@ interface RunEvalCaseOptions {
|
|
|
2404
2548
|
readonly provider: Provider;
|
|
2405
2549
|
readonly target: ResolvedTarget;
|
|
2406
2550
|
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
2407
|
-
readonly 'llm-judge': Evaluator;
|
|
2551
|
+
readonly 'llm-grader': Evaluator;
|
|
2408
2552
|
};
|
|
2409
2553
|
readonly now?: () => Date;
|
|
2410
2554
|
readonly maxRetries?: number;
|
|
@@ -2412,10 +2556,10 @@ interface RunEvalCaseOptions {
|
|
|
2412
2556
|
readonly cache?: EvaluationCache;
|
|
2413
2557
|
readonly useCache?: boolean;
|
|
2414
2558
|
readonly signal?: AbortSignal;
|
|
2415
|
-
readonly judgeProvider?: Provider;
|
|
2416
|
-
/** Resolver for target override in code judges */
|
|
2559
|
+
readonly graderProvider?: Provider;
|
|
2560
|
+
/** Resolver for target override in code graders */
|
|
2417
2561
|
readonly targetResolver?: (name: string) => Provider | undefined;
|
|
2418
|
-
/** List of available target names for code judges */
|
|
2562
|
+
/** List of available target names for code graders */
|
|
2419
2563
|
readonly availableTargets?: readonly string[];
|
|
2420
2564
|
/** Unique identifier for the evaluation run (used for workspace management) */
|
|
2421
2565
|
readonly evalRunId?: string;
|
|
@@ -2498,10 +2642,44 @@ interface RunEvaluationOptions {
|
|
|
2498
2642
|
readonly retainOnSuccess?: 'keep' | 'cleanup';
|
|
2499
2643
|
/** Retention policy override for failed cases */
|
|
2500
2644
|
readonly retainOnFailure?: 'keep' | 'cleanup';
|
|
2645
|
+
/** CLI override: grader target name (e.g., "agentv" or a target from targets.yaml) */
|
|
2646
|
+
readonly graderTarget?: string;
|
|
2647
|
+
/** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */
|
|
2648
|
+
readonly model?: string;
|
|
2501
2649
|
}
|
|
2502
2650
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2503
2651
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
2504
2652
|
|
|
2653
|
+
/**
|
|
2654
|
+
* Types for inline assertion functions used in the evaluate() API.
|
|
2655
|
+
*
|
|
2656
|
+
* Inline functions are the escape hatch for custom evaluation logic
|
|
2657
|
+
* that doesn't fit a built-in evaluator type. For built-in assertions
|
|
2658
|
+
* (contains, regex, is-json, etc.), use config objects instead:
|
|
2659
|
+
*
|
|
2660
|
+
* assert: [{ type: 'contains', value: 'hello' }]
|
|
2661
|
+
*
|
|
2662
|
+
* Inline functions are for custom logic:
|
|
2663
|
+
*
|
|
2664
|
+
* assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
|
|
2665
|
+
*/
|
|
2666
|
+
/** Context passed to inline assertion functions */
|
|
2667
|
+
interface AssertContext {
|
|
2668
|
+
readonly input: string;
|
|
2669
|
+
readonly output: string;
|
|
2670
|
+
readonly expectedOutput?: string;
|
|
2671
|
+
readonly criteria?: string;
|
|
2672
|
+
readonly metadata?: Record<string, unknown>;
|
|
2673
|
+
}
|
|
2674
|
+
/** Result from an inline assertion function */
|
|
2675
|
+
interface AssertResult {
|
|
2676
|
+
readonly name: string;
|
|
2677
|
+
readonly score: number;
|
|
2678
|
+
readonly metadata?: Record<string, unknown>;
|
|
2679
|
+
}
|
|
2680
|
+
/** Inline assertion function signature */
|
|
2681
|
+
type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
|
|
2682
|
+
|
|
2505
2683
|
/**
|
|
2506
2684
|
* Programmatic API for running evaluations.
|
|
2507
2685
|
*
|
|
@@ -2509,7 +2687,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
2509
2687
|
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
2510
2688
|
* translation between file-based and programmatic usage.
|
|
2511
2689
|
*
|
|
2512
|
-
* @example Inline tests
|
|
2690
|
+
* @example Inline tests with config objects
|
|
2513
2691
|
* ```typescript
|
|
2514
2692
|
* import { evaluate } from '@agentv/core';
|
|
2515
2693
|
*
|
|
@@ -2518,7 +2696,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
2518
2696
|
* {
|
|
2519
2697
|
* id: 'capital',
|
|
2520
2698
|
* input: 'What is the capital of France?',
|
|
2521
|
-
* expected_output: 'Paris',
|
|
2699
|
+
* expectedOutput: 'Paris',
|
|
2522
2700
|
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
2523
2701
|
* },
|
|
2524
2702
|
* ],
|
|
@@ -2528,6 +2706,27 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
2528
2706
|
* console.log(results.summary.passed, 'passed');
|
|
2529
2707
|
* ```
|
|
2530
2708
|
*
|
|
2709
|
+
* @example Inline tests with task function and custom assertion
|
|
2710
|
+
* ```typescript
|
|
2711
|
+
* import { evaluate } from '@agentv/core';
|
|
2712
|
+
*
|
|
2713
|
+
* const { summary } = await evaluate({
|
|
2714
|
+
* tests: [
|
|
2715
|
+
* {
|
|
2716
|
+
* id: 'echo',
|
|
2717
|
+
* input: 'hello',
|
|
2718
|
+
* expectedOutput: 'Echo: hello',
|
|
2719
|
+
* assert: [
|
|
2720
|
+
* { type: 'contains', value: 'hello' },
|
|
2721
|
+
* { type: 'equals' },
|
|
2722
|
+
* ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
|
|
2723
|
+
* ],
|
|
2724
|
+
* },
|
|
2725
|
+
* ],
|
|
2726
|
+
* task: async (input) => `Echo: ${input}`,
|
|
2727
|
+
* });
|
|
2728
|
+
* ```
|
|
2729
|
+
*
|
|
2531
2730
|
* @example File-based
|
|
2532
2731
|
* ```typescript
|
|
2533
2732
|
* const results = await evaluate({
|
|
@@ -2553,10 +2752,12 @@ interface EvalTestInput {
|
|
|
2553
2752
|
role: string;
|
|
2554
2753
|
content: string;
|
|
2555
2754
|
}[];
|
|
2556
|
-
/** Expected reference output */
|
|
2755
|
+
/** Expected reference output (camelCase preferred) */
|
|
2756
|
+
readonly expectedOutput?: string;
|
|
2757
|
+
/** @deprecated Use `expectedOutput` instead */
|
|
2557
2758
|
readonly expected_output?: string;
|
|
2558
|
-
/** Assertion evaluators */
|
|
2559
|
-
readonly assert?: readonly EvalAssertionInput[];
|
|
2759
|
+
/** Assertion evaluators — accepts factory functions, config objects, or inline functions */
|
|
2760
|
+
readonly assert?: readonly AssertEntry[];
|
|
2560
2761
|
/** Arbitrary metadata */
|
|
2561
2762
|
readonly metadata?: Record<string, unknown>;
|
|
2562
2763
|
}
|
|
@@ -2592,6 +2793,8 @@ interface EvalAssertionInput {
|
|
|
2592
2793
|
/** Additional properties */
|
|
2593
2794
|
readonly [key: string]: unknown;
|
|
2594
2795
|
}
|
|
2796
|
+
/** Assert entry: inline function or config object */
|
|
2797
|
+
type AssertEntry = AssertFn | EvalAssertionInput;
|
|
2595
2798
|
/**
|
|
2596
2799
|
* Configuration for `evaluate()`.
|
|
2597
2800
|
* Accepts either inline tests or a spec file path.
|
|
@@ -2603,8 +2806,10 @@ interface EvalConfig {
|
|
|
2603
2806
|
readonly specFile?: string;
|
|
2604
2807
|
/** Target provider configuration */
|
|
2605
2808
|
readonly target?: TargetDefinition;
|
|
2809
|
+
/** Custom task function — mutually exclusive with target */
|
|
2810
|
+
readonly task?: (input: string) => string | Promise<string>;
|
|
2606
2811
|
/** Suite-level assertions applied to all tests */
|
|
2607
|
-
readonly assert?: readonly EvalAssertionInput[];
|
|
2812
|
+
readonly assert?: readonly AssertEntry[];
|
|
2608
2813
|
/** Filter tests by ID pattern (glob supported) */
|
|
2609
2814
|
readonly filter?: string;
|
|
2610
2815
|
/** Maximum concurrent workers (default: 3) */
|
|
@@ -3298,9 +3503,29 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3298
3503
|
*/
|
|
3299
3504
|
declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
|
|
3300
3505
|
|
|
3506
|
+
/**
|
|
3507
|
+
* Convention-based discovery of custom grader scripts.
|
|
3508
|
+
*
|
|
3509
|
+
* Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
|
|
3510
|
+
* files and registers them as code-grader evaluators in the registry. The file name
|
|
3511
|
+
* (without extension) becomes the evaluator type name.
|
|
3512
|
+
*
|
|
3513
|
+
* Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
|
|
3514
|
+
*/
|
|
3515
|
+
|
|
3516
|
+
/**
|
|
3517
|
+
* Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
|
|
3518
|
+
* and register them as evaluator types in the registry.
|
|
3519
|
+
*
|
|
3520
|
+
* @param registry - The evaluator registry to register discovered graders into
|
|
3521
|
+
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
3522
|
+
* @returns Names of discovered grader types
|
|
3523
|
+
*/
|
|
3524
|
+
declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
|
|
3525
|
+
|
|
3301
3526
|
type AgentKernel = {
|
|
3302
3527
|
status: string;
|
|
3303
3528
|
};
|
|
3304
3529
|
declare function createAgentKernel(): AgentKernel;
|
|
3305
3530
|
|
|
3306
|
-
export { type AcquireWorkspaceOptions,
|
|
3531
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type 
FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type 
ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, 
mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|