@agentv/core 2.18.4 → 3.0.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agentv-provider-5CJVBBGG.js +7 -0
- package/dist/agentv-provider-5CJVBBGG.js.map +1 -0
- package/dist/{chunk-V42NUK73.js → chunk-CASGWWOU.js} +56 -20
- package/dist/chunk-CASGWWOU.js.map +1 -0
- package/dist/chunk-XBGLLO22.js +65 -0
- package/dist/chunk-XBGLLO22.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +31 -14
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +19 -10
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +5188 -3879
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +377 -137
- package/dist/index.d.ts +377 -137
- package/dist/index.js +6974 -5780
- package/dist/index.js.map +1 -1
- package/package.json +3 -2
- package/dist/chunk-V42NUK73.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -8,12 +8,17 @@ interface ChatMessage {
|
|
|
8
8
|
readonly name?: string;
|
|
9
9
|
}
|
|
10
10
|
type ChatPrompt = readonly ChatMessage[];
|
|
11
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
11
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'copilot-sdk' | 'copilot-cli' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' | 'claude-cli' | 'claude-sdk' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders' | 'agentv';
|
|
12
12
|
/** Callbacks for real-time observability during provider execution */
|
|
13
13
|
interface ProviderStreamCallbacks {
|
|
14
14
|
onToolCallStart?: (toolName: string, toolCallId?: string) => void;
|
|
15
15
|
onToolCallEnd?: (toolName: string, input: unknown, output: unknown, durationMs: number, toolCallId?: string) => void;
|
|
16
16
|
onLlmCallEnd?: (model: string, tokenUsage?: ProviderTokenUsage) => void;
|
|
17
|
+
/** Returns active OTel span IDs for Braintrust trace bridging (optional) */
|
|
18
|
+
getActiveSpanIds?: () => {
|
|
19
|
+
parentSpanId: string;
|
|
20
|
+
rootSpanId: string;
|
|
21
|
+
} | null;
|
|
17
22
|
}
|
|
18
23
|
interface ProviderRequest {
|
|
19
24
|
readonly question: string;
|
|
@@ -36,6 +41,11 @@ interface ProviderRequest {
|
|
|
36
41
|
readonly captureFileChanges?: boolean;
|
|
37
42
|
/** Real-time observability callbacks (optional) */
|
|
38
43
|
readonly streamCallbacks?: ProviderStreamCallbacks;
|
|
44
|
+
/** Braintrust span IDs for trace-claude-code plugin (optional) */
|
|
45
|
+
readonly braintrustSpanIds?: {
|
|
46
|
+
readonly parentSpanId: string;
|
|
47
|
+
readonly rootSpanId: string;
|
|
48
|
+
};
|
|
39
49
|
}
|
|
40
50
|
/**
|
|
41
51
|
* A tool call within an output message.
|
|
@@ -134,6 +144,8 @@ type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
|
134
144
|
interface TargetDefinition {
|
|
135
145
|
readonly name: string;
|
|
136
146
|
readonly provider: ProviderKind | string;
|
|
147
|
+
readonly grader_target?: string | undefined;
|
|
148
|
+
/** @deprecated Use `grader_target` instead */
|
|
137
149
|
readonly judge_target?: string | undefined;
|
|
138
150
|
readonly workers?: number | undefined;
|
|
139
151
|
readonly provider_batching?: boolean | undefined;
|
|
@@ -453,11 +465,11 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
453
465
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
454
466
|
*/
|
|
455
467
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
456
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "
|
|
468
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code-grader", "llm-grader", "code-judge", "llm-judge", "rubric", "composite", "tool-trajectory", "field-accuracy", "latency", "cost", "token-usage", "execution-metrics", "skill-trigger", "contains", "contains-any", "contains-all", "icontains", "icontains-any", "icontains-all", "starts-with", "ends-with", "regex", "is-json", "equals", "rubrics", "inline-assert"];
|
|
457
469
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
458
470
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
459
471
|
/**
|
|
460
|
-
* Configuration for enabling target access in code-judge evaluators.
|
|
472
|
+
* Configuration for enabling target access in code-grader evaluators.
|
|
461
473
|
* When present, the runtime will start a local proxy server that allows
|
|
462
474
|
* the script to invoke configured targets without direct credential access.
|
|
463
475
|
*/
|
|
@@ -556,7 +568,7 @@ type WorkspaceConfig = {
|
|
|
556
568
|
};
|
|
557
569
|
type CodeEvaluatorConfig = {
|
|
558
570
|
readonly name: string;
|
|
559
|
-
readonly type: 'code-judge';
|
|
571
|
+
readonly type: 'code-judge' | 'code-grader';
|
|
560
572
|
readonly command: readonly string[];
|
|
561
573
|
/** @deprecated Use `command` instead */
|
|
562
574
|
readonly script?: readonly string[];
|
|
@@ -567,14 +579,14 @@ type CodeEvaluatorConfig = {
|
|
|
567
579
|
readonly required?: boolean | number;
|
|
568
580
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
569
581
|
readonly negate?: boolean;
|
|
570
|
-
/** Pass-through configuration for the code-judge (any unrecognized YAML properties) */
|
|
582
|
+
/** Pass-through configuration for the code-grader (any unrecognized YAML properties) */
|
|
571
583
|
readonly config?: JsonObject;
|
|
572
584
|
/** When present, enables target access via local proxy */
|
|
573
585
|
readonly target?: TargetAccessConfig;
|
|
574
586
|
};
|
|
575
587
|
/**
|
|
576
588
|
* Executable prompt template configuration.
|
|
577
|
-
* Matches code-judge pattern for consistency.
|
|
589
|
+
* Matches code-grader pattern for consistency.
|
|
578
590
|
*/
|
|
579
591
|
type PromptScriptConfig = {
|
|
580
592
|
/** Command array to execute (e.g., ["bun", "run", "template.ts"]) */
|
|
@@ -584,24 +596,32 @@ type PromptScriptConfig = {
|
|
|
584
596
|
/** Pass-through configuration for the prompt template */
|
|
585
597
|
readonly config?: Record<string, unknown>;
|
|
586
598
|
};
|
|
587
|
-
type LlmJudgeEvaluatorConfig = {
|
|
599
|
+
type LlmGraderEvaluatorConfig = {
|
|
588
600
|
readonly name: string;
|
|
589
|
-
readonly type: 'llm-judge';
|
|
601
|
+
readonly type: 'llm-grader' | 'llm-judge';
|
|
590
602
|
/** Text prompt (inline or file path) or executable script config */
|
|
591
603
|
readonly prompt?: string | PromptScriptConfig;
|
|
592
604
|
readonly promptPath?: string;
|
|
593
605
|
/** Resolved absolute path for prompt file (used for text template prompts) */
|
|
594
606
|
readonly resolvedPromptPath?: string;
|
|
595
|
-
/** Resolved script array for executable prompts (matches code-judge pattern) */
|
|
607
|
+
/** Resolved script array for executable prompts (matches code-grader pattern) */
|
|
596
608
|
readonly resolvedPromptScript?: readonly string[];
|
|
597
609
|
readonly rubrics?: readonly RubricItem[];
|
|
598
610
|
readonly weight?: number;
|
|
599
611
|
readonly required?: boolean | number;
|
|
600
612
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
601
613
|
readonly negate?: boolean;
|
|
614
|
+
/** Optional target override for this grader (uses a named LLM target from targets.yaml). */
|
|
615
|
+
readonly target?: string;
|
|
602
616
|
/** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */
|
|
603
617
|
readonly config?: Record<string, unknown>;
|
|
618
|
+
/** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. */
|
|
619
|
+
readonly max_steps?: number;
|
|
620
|
+
/** Temperature override for grader calls */
|
|
621
|
+
readonly temperature?: number;
|
|
604
622
|
};
|
|
623
|
+
/** @deprecated Use `LlmGraderEvaluatorConfig` instead */
|
|
624
|
+
type LlmJudgeEvaluatorConfig = LlmGraderEvaluatorConfig;
|
|
605
625
|
/**
|
|
606
626
|
* Score range definition for analytic rubric scoring.
|
|
607
627
|
* Each range maps an integer score band (0-10) to an outcome description.
|
|
@@ -613,7 +633,7 @@ type ScoreRange = {
|
|
|
613
633
|
readonly outcome: string;
|
|
614
634
|
};
|
|
615
635
|
/**
|
|
616
|
-
* Rubric item for LLM
|
|
636
|
+
* Rubric item for LLM grader evaluation.
|
|
617
637
|
* Supports two modes:
|
|
618
638
|
* - Checklist mode: boolean satisfied/not-satisfied with `outcome`
|
|
619
639
|
* - Score-range mode: 0-10 integer scoring with `score_ranges`
|
|
@@ -638,7 +658,7 @@ type RubricItem = {
|
|
|
638
658
|
readonly required_min_score?: number;
|
|
639
659
|
/**
|
|
640
660
|
* Score range definitions for analytic rubric scoring.
|
|
641
|
-
* When present, the
|
|
661
|
+
* When present, the grader outputs an integer 0-10 score per criterion.
|
|
642
662
|
* Ranges must be non-overlapping and cover 0-10 inclusive.
|
|
643
663
|
*/
|
|
644
664
|
readonly score_ranges?: readonly ScoreRange[];
|
|
@@ -646,10 +666,19 @@ type RubricItem = {
|
|
|
646
666
|
type CompositeAggregatorConfig = {
|
|
647
667
|
readonly type: 'weighted_average';
|
|
648
668
|
readonly weights?: Record<string, number>;
|
|
669
|
+
} | {
|
|
670
|
+
readonly type: 'code-grader';
|
|
671
|
+
readonly path: string;
|
|
672
|
+
readonly cwd?: string;
|
|
649
673
|
} | {
|
|
650
674
|
readonly type: 'code-judge';
|
|
651
675
|
readonly path: string;
|
|
652
676
|
readonly cwd?: string;
|
|
677
|
+
} | {
|
|
678
|
+
readonly type: 'llm-grader';
|
|
679
|
+
readonly prompt?: string;
|
|
680
|
+
readonly promptPath?: string;
|
|
681
|
+
readonly model?: string;
|
|
653
682
|
} | {
|
|
654
683
|
readonly type: 'llm-judge';
|
|
655
684
|
readonly prompt?: string;
|
|
@@ -662,7 +691,7 @@ type CompositeAggregatorConfig = {
|
|
|
662
691
|
type CompositeEvaluatorConfig = {
|
|
663
692
|
readonly name: string;
|
|
664
693
|
readonly type: 'composite';
|
|
665
|
-
readonly
|
|
694
|
+
readonly assertions: readonly EvaluatorConfig[];
|
|
666
695
|
readonly aggregator: CompositeAggregatorConfig;
|
|
667
696
|
readonly weight?: number;
|
|
668
697
|
readonly required?: boolean | number;
|
|
@@ -671,7 +700,7 @@ type CompositeEvaluatorConfig = {
|
|
|
671
700
|
};
|
|
672
701
|
/**
|
|
673
702
|
* Match type for field accuracy evaluation.
|
|
674
|
-
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-judge evaluator.
|
|
703
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code-grader evaluator.
|
|
675
704
|
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
676
705
|
*/
|
|
677
706
|
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
@@ -786,34 +815,6 @@ type ExecutionMetricsEvaluatorConfig = {
|
|
|
786
815
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
787
816
|
readonly negate?: boolean;
|
|
788
817
|
};
|
|
789
|
-
/**
|
|
790
|
-
* Configuration for the agent-judge evaluator.
|
|
791
|
-
* Runs an agentic investigation loop to audit workspaces and verify criteria.
|
|
792
|
-
* Two modes:
|
|
793
|
-
* - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools
|
|
794
|
-
* - Judge target: Delegates to an external agent provider via Provider.invoke()
|
|
795
|
-
*/
|
|
796
|
-
type AgentJudgeEvaluatorConfig = {
|
|
797
|
-
readonly name: string;
|
|
798
|
-
readonly type: 'agent-judge';
|
|
799
|
-
/** Custom evaluation prompt (inline text or file path) */
|
|
800
|
-
readonly prompt?: string;
|
|
801
|
-
readonly promptPath?: string;
|
|
802
|
-
/** Resolved absolute path for prompt file */
|
|
803
|
-
readonly resolvedPromptPath?: string;
|
|
804
|
-
/** Rubric items for structured evaluation (reuses llm-judge rubric infra) */
|
|
805
|
-
readonly rubrics?: readonly RubricItem[];
|
|
806
|
-
/** Maximum agent steps for built-in mode (default 10, max 50) */
|
|
807
|
-
readonly max_steps?: number;
|
|
808
|
-
/** Temperature for built-in mode (default 0) */
|
|
809
|
-
readonly temperature?: number;
|
|
810
|
-
/** Target name — delegates agent loop to this provider instead of built-in mode */
|
|
811
|
-
readonly target?: string;
|
|
812
|
-
readonly weight?: number;
|
|
813
|
-
readonly required?: boolean | number;
|
|
814
|
-
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
815
|
-
readonly negate?: boolean;
|
|
816
|
-
};
|
|
817
818
|
/**
|
|
818
819
|
* Configuration for the contains assertion evaluator.
|
|
819
820
|
* Checks whether the candidate output contains a specified substring.
|
|
@@ -971,7 +972,34 @@ type RubricsEvaluatorConfig = {
|
|
|
971
972
|
/** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */
|
|
972
973
|
readonly negate?: boolean;
|
|
973
974
|
};
|
|
974
|
-
|
|
975
|
+
/**
|
|
976
|
+
* Configuration for the skill-trigger evaluator.
|
|
977
|
+
* Detects whether the agent invoked a named Claude Code skill as its first tool call.
|
|
978
|
+
* Mirrors the post-hoc fallback detection in skill-creator's run_eval.py.
|
|
979
|
+
*/
|
|
980
|
+
type SkillTriggerEvaluatorConfig = {
|
|
981
|
+
readonly name: string;
|
|
982
|
+
readonly type: 'skill-trigger';
|
|
983
|
+
/** The skill name to check for (case-sensitive substring match) */
|
|
984
|
+
readonly skill: string;
|
|
985
|
+
/** Whether the skill is expected to trigger (default: true) */
|
|
986
|
+
readonly should_trigger?: boolean;
|
|
987
|
+
readonly weight?: number;
|
|
988
|
+
readonly required?: boolean | number;
|
|
989
|
+
readonly negate?: boolean;
|
|
990
|
+
};
|
|
991
|
+
/**
|
|
992
|
+
* Configuration for the inline-assert evaluator.
|
|
993
|
+
* Wraps an AssertFn for in-process evaluation via the evaluate() API.
|
|
994
|
+
*/
|
|
995
|
+
type InlineAssertEvaluatorConfig = {
|
|
996
|
+
readonly name: string;
|
|
997
|
+
readonly type: 'inline-assert';
|
|
998
|
+
readonly weight?: number;
|
|
999
|
+
readonly required?: boolean | number;
|
|
1000
|
+
readonly negate?: boolean;
|
|
1001
|
+
};
|
|
1002
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmGraderEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig | IcontainsEvaluatorConfig | IcontainsAnyEvaluatorConfig | IcontainsAllEvaluatorConfig | StartsWithEvaluatorConfig | EndsWithEvaluatorConfig | RegexEvaluatorConfig | IsJsonEvaluatorConfig | EqualsEvaluatorConfig | RubricsEvaluatorConfig | InlineAssertEvaluatorConfig;
|
|
975
1003
|
/**
|
|
976
1004
|
* Eval test definition sourced from AgentV specs.
|
|
977
1005
|
*/
|
|
@@ -989,7 +1017,7 @@ interface EvalTest {
|
|
|
989
1017
|
readonly file_paths: readonly string[];
|
|
990
1018
|
readonly criteria: string;
|
|
991
1019
|
readonly evaluator?: EvaluatorKind;
|
|
992
|
-
readonly
|
|
1020
|
+
readonly assertions?: readonly EvaluatorConfig[];
|
|
993
1021
|
/** Workspace configuration (merged from suite-level and case-level) */
|
|
994
1022
|
readonly workspace?: WorkspaceConfig;
|
|
995
1023
|
/** Arbitrary metadata passed to workspace scripts via stdin */
|
|
@@ -1162,15 +1190,15 @@ interface EvaluatorResult {
|
|
|
1162
1190
|
readonly rawRequest?: JsonObject;
|
|
1163
1191
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
1164
1192
|
readonly scores?: readonly EvaluatorResult[];
|
|
1165
|
-
/** Optional structured details from code
|
|
1193
|
+
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts). */
|
|
1166
1194
|
readonly details?: JsonObject;
|
|
1167
1195
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1168
1196
|
readonly tokenUsage?: TokenUsage;
|
|
1169
|
-
/** Wall-clock duration of this
|
|
1197
|
+
/** Wall-clock duration of this grader execution in milliseconds. */
|
|
1170
1198
|
readonly durationMs?: number;
|
|
1171
|
-
/** ISO 8601 UTC timestamp when this
|
|
1199
|
+
/** ISO 8601 UTC timestamp when this grader started executing. */
|
|
1172
1200
|
readonly startedAt?: string;
|
|
1173
|
-
/** ISO 8601 UTC timestamp when this
|
|
1201
|
+
/** ISO 8601 UTC timestamp when this grader finished executing. */
|
|
1174
1202
|
readonly endedAt?: string;
|
|
1175
1203
|
}
|
|
1176
1204
|
/**
|
|
@@ -1304,7 +1332,7 @@ declare function buildPromptInputs(testCase: EvalTest, mode?: FormattingMode): P
|
|
|
1304
1332
|
/**
|
|
1305
1333
|
* Detect file format by extension.
|
|
1306
1334
|
*/
|
|
1307
|
-
declare function detectFormat(filePath: string): 'yaml' | 'jsonl';
|
|
1335
|
+
declare function detectFormat(filePath: string): 'yaml' | 'jsonl' | 'agent-skills-json';
|
|
1308
1336
|
|
|
1309
1337
|
type LoadOptions = {
|
|
1310
1338
|
readonly verbose?: boolean;
|
|
@@ -1356,6 +1384,83 @@ declare function loadTestById(evalFilePath: string, repoRoot: URL | string, eval
|
|
|
1356
1384
|
/** @deprecated Use `loadTestById` instead */
|
|
1357
1385
|
declare const loadEvalCaseById: typeof loadTestById;
|
|
1358
1386
|
|
|
1387
|
+
/**
|
|
1388
|
+
* Raw Agent Skills evals.json schema.
|
|
1389
|
+
* @see https://agentskills.io/skill-creation/evaluating-skills
|
|
1390
|
+
*/
|
|
1391
|
+
interface AgentSkillsEvalsFile {
|
|
1392
|
+
readonly skill_name?: string;
|
|
1393
|
+
readonly evals: readonly AgentSkillsEvalCase[];
|
|
1394
|
+
}
|
|
1395
|
+
interface AgentSkillsEvalCase {
|
|
1396
|
+
readonly id: number;
|
|
1397
|
+
readonly prompt: string;
|
|
1398
|
+
readonly expected_output?: string;
|
|
1399
|
+
readonly files?: readonly string[];
|
|
1400
|
+
readonly assertions?: readonly string[];
|
|
1401
|
+
}
|
|
1402
|
+
/**
|
|
1403
|
+
* Detect whether a JSON file is in Agent Skills evals.json format.
|
|
1404
|
+
* Returns true if the parsed content has an `evals` array.
|
|
1405
|
+
*/
|
|
1406
|
+
declare function isAgentSkillsFormat(parsed: unknown): parsed is AgentSkillsEvalsFile;
|
|
1407
|
+
/**
|
|
1408
|
+
* Parse already-loaded Agent Skills evals data into EvalTest[].
|
|
1409
|
+
* Exported for testing without file I/O.
|
|
1410
|
+
*/
|
|
1411
|
+
declare function parseAgentSkillsEvals(parsed: unknown, source?: string, baseDir?: string): readonly EvalTest[];
|
|
1412
|
+
|
|
1413
|
+
/**
|
|
1414
|
+
* EVAL.yaml → evals.json transpiler.
|
|
1415
|
+
*
|
|
1416
|
+
* Converts an AgentV EVAL.yaml file into Agent Skills evals.json format
|
|
1417
|
+
* for consumption by the skill-creator pipeline.
|
|
1418
|
+
*
|
|
1419
|
+
* Handles both `assertions:` (current) and `assert:` (deprecated alias).
|
|
1420
|
+
*/
|
|
1421
|
+
interface EvalsJsonCase {
|
|
1422
|
+
id: number;
|
|
1423
|
+
prompt: string;
|
|
1424
|
+
expected_output?: string;
|
|
1425
|
+
files?: string[];
|
|
1426
|
+
should_trigger?: boolean;
|
|
1427
|
+
assertions: string[];
|
|
1428
|
+
}
|
|
1429
|
+
interface EvalsJsonFile {
|
|
1430
|
+
skill_name: string;
|
|
1431
|
+
evals: EvalsJsonCase[];
|
|
1432
|
+
}
|
|
1433
|
+
/**
|
|
1434
|
+
* Result of transpiling a single EVAL.yaml.
|
|
1435
|
+
* May produce multiple evals.json files (one per skill).
|
|
1436
|
+
*/
|
|
1437
|
+
interface TranspileResult {
|
|
1438
|
+
/** Map from skill_name → EvalsJsonFile */
|
|
1439
|
+
files: Map<string, EvalsJsonFile>;
|
|
1440
|
+
/** Warning messages accumulated during transpilation */
|
|
1441
|
+
warnings: string[];
|
|
1442
|
+
}
|
|
1443
|
+
/**
|
|
1444
|
+
* Transpile a parsed EVAL.yaml object into one or more evals.json objects.
|
|
1445
|
+
*
|
|
1446
|
+
* @param suite Parsed YAML object (already loaded, no file I/O here)
|
|
1447
|
+
* @param source Source identifier for error messages (e.g. file path)
|
|
1448
|
+
*/
|
|
1449
|
+
declare function transpileEvalYaml(suite: unknown, source?: string): TranspileResult;
|
|
1450
|
+
/**
|
|
1451
|
+
* Transpile an EVAL.yaml file into one or more evals.json objects.
|
|
1452
|
+
* Returns a map from output filename → JSON content.
|
|
1453
|
+
*
|
|
1454
|
+
* @param evalYamlPath Absolute path to the EVAL.yaml file
|
|
1455
|
+
*/
|
|
1456
|
+
declare function transpileEvalYamlFile(evalYamlPath: string): TranspileResult;
|
|
1457
|
+
/**
|
|
1458
|
+
* Determine the output filename(s) for a transpile result.
|
|
1459
|
+
* Single skill → "evals.json"
|
|
1460
|
+
* Multiple skills → "<skill>.evals.json"
|
|
1461
|
+
*/
|
|
1462
|
+
declare function getOutputFilenames(result: TranspileResult): Map<string, string>;
|
|
1463
|
+
|
|
1359
1464
|
declare function fileExists(filePath: string): Promise<boolean>;
|
|
1360
1465
|
/**
|
|
1361
1466
|
* Normalize line endings to LF (\n).
|
|
@@ -1603,87 +1708,112 @@ interface VSCodeResolvedConfig {
|
|
|
1603
1708
|
readonly workspaceTemplate?: string;
|
|
1604
1709
|
readonly timeoutMs?: number;
|
|
1605
1710
|
}
|
|
1711
|
+
interface AgentVResolvedConfig {
|
|
1712
|
+
readonly model: string;
|
|
1713
|
+
readonly temperature: number;
|
|
1714
|
+
}
|
|
1606
1715
|
type ResolvedTarget = {
|
|
1607
1716
|
readonly kind: 'azure';
|
|
1608
1717
|
readonly name: string;
|
|
1609
|
-
readonly
|
|
1718
|
+
readonly graderTarget?: string;
|
|
1610
1719
|
readonly workers?: number;
|
|
1611
1720
|
readonly providerBatching?: boolean;
|
|
1612
1721
|
readonly config: AzureResolvedConfig;
|
|
1613
1722
|
} | {
|
|
1614
1723
|
readonly kind: 'anthropic';
|
|
1615
1724
|
readonly name: string;
|
|
1616
|
-
readonly
|
|
1725
|
+
readonly graderTarget?: string;
|
|
1617
1726
|
readonly workers?: number;
|
|
1618
1727
|
readonly providerBatching?: boolean;
|
|
1619
1728
|
readonly config: AnthropicResolvedConfig;
|
|
1620
1729
|
} | {
|
|
1621
1730
|
readonly kind: 'gemini';
|
|
1622
1731
|
readonly name: string;
|
|
1623
|
-
readonly
|
|
1732
|
+
readonly graderTarget?: string;
|
|
1624
1733
|
readonly workers?: number;
|
|
1625
1734
|
readonly providerBatching?: boolean;
|
|
1626
1735
|
readonly config: GeminiResolvedConfig;
|
|
1627
1736
|
} | {
|
|
1628
1737
|
readonly kind: 'codex';
|
|
1629
1738
|
readonly name: string;
|
|
1630
|
-
readonly
|
|
1739
|
+
readonly graderTarget?: string;
|
|
1631
1740
|
readonly workers?: number;
|
|
1632
1741
|
readonly providerBatching?: boolean;
|
|
1633
1742
|
readonly config: CodexResolvedConfig;
|
|
1634
1743
|
} | {
|
|
1635
1744
|
readonly kind: 'copilot-sdk';
|
|
1636
1745
|
readonly name: string;
|
|
1637
|
-
readonly
|
|
1746
|
+
readonly graderTarget?: string;
|
|
1638
1747
|
readonly workers?: number;
|
|
1639
1748
|
readonly providerBatching?: boolean;
|
|
1640
1749
|
readonly config: CopilotSdkResolvedConfig;
|
|
1641
1750
|
} | {
|
|
1642
1751
|
readonly kind: 'copilot-cli';
|
|
1643
1752
|
readonly name: string;
|
|
1644
|
-
readonly
|
|
1753
|
+
readonly graderTarget?: string;
|
|
1645
1754
|
readonly workers?: number;
|
|
1646
1755
|
readonly providerBatching?: boolean;
|
|
1647
1756
|
readonly config: CopilotCliResolvedConfig;
|
|
1648
1757
|
} | {
|
|
1649
1758
|
readonly kind: 'pi-coding-agent';
|
|
1650
1759
|
readonly name: string;
|
|
1651
|
-
readonly
|
|
1760
|
+
readonly graderTarget?: string;
|
|
1652
1761
|
readonly workers?: number;
|
|
1653
1762
|
readonly providerBatching?: boolean;
|
|
1654
1763
|
readonly config: PiCodingAgentResolvedConfig;
|
|
1655
1764
|
} | {
|
|
1656
1765
|
readonly kind: 'pi-agent-sdk';
|
|
1657
1766
|
readonly name: string;
|
|
1658
|
-
readonly
|
|
1767
|
+
readonly graderTarget?: string;
|
|
1659
1768
|
readonly workers?: number;
|
|
1660
1769
|
readonly providerBatching?: boolean;
|
|
1661
1770
|
readonly config: PiAgentSdkResolvedConfig;
|
|
1662
1771
|
} | {
|
|
1663
1772
|
readonly kind: 'claude';
|
|
1664
1773
|
readonly name: string;
|
|
1665
|
-
readonly
|
|
1774
|
+
readonly graderTarget?: string;
|
|
1775
|
+
readonly workers?: number;
|
|
1776
|
+
readonly providerBatching?: boolean;
|
|
1777
|
+
readonly config: ClaudeResolvedConfig;
|
|
1778
|
+
} | {
|
|
1779
|
+
readonly kind: 'claude-cli';
|
|
1780
|
+
readonly name: string;
|
|
1781
|
+
readonly graderTarget?: string;
|
|
1782
|
+
readonly workers?: number;
|
|
1783
|
+
readonly providerBatching?: boolean;
|
|
1784
|
+
readonly config: ClaudeResolvedConfig;
|
|
1785
|
+
} | {
|
|
1786
|
+
readonly kind: 'claude-sdk';
|
|
1787
|
+
readonly name: string;
|
|
1788
|
+
readonly graderTarget?: string;
|
|
1666
1789
|
readonly workers?: number;
|
|
1667
1790
|
readonly providerBatching?: boolean;
|
|
1668
1791
|
readonly config: ClaudeResolvedConfig;
|
|
1669
1792
|
} | {
|
|
1670
1793
|
readonly kind: 'mock';
|
|
1671
1794
|
readonly name: string;
|
|
1672
|
-
readonly
|
|
1795
|
+
readonly graderTarget?: string;
|
|
1673
1796
|
readonly workers?: number;
|
|
1674
1797
|
readonly providerBatching?: boolean;
|
|
1675
1798
|
readonly config: MockResolvedConfig;
|
|
1676
1799
|
} | {
|
|
1677
1800
|
readonly kind: 'vscode' | 'vscode-insiders';
|
|
1678
1801
|
readonly name: string;
|
|
1679
|
-
readonly
|
|
1802
|
+
readonly graderTarget?: string;
|
|
1680
1803
|
readonly workers?: number;
|
|
1681
1804
|
readonly providerBatching?: boolean;
|
|
1682
1805
|
readonly config: VSCodeResolvedConfig;
|
|
1806
|
+
} | {
|
|
1807
|
+
readonly kind: 'agentv';
|
|
1808
|
+
readonly name: string;
|
|
1809
|
+
readonly graderTarget?: string;
|
|
1810
|
+
readonly workers?: number;
|
|
1811
|
+
readonly providerBatching?: boolean;
|
|
1812
|
+
readonly config: AgentVResolvedConfig;
|
|
1683
1813
|
} | {
|
|
1684
1814
|
readonly kind: 'cli';
|
|
1685
1815
|
readonly name: string;
|
|
1686
|
-
readonly
|
|
1816
|
+
readonly graderTarget?: string;
|
|
1687
1817
|
readonly workers?: number;
|
|
1688
1818
|
readonly providerBatching?: boolean;
|
|
1689
1819
|
readonly config: CliResolvedConfig;
|
|
@@ -1835,7 +1965,7 @@ declare function resolveAndCreateProvider(definition: TargetDefinition, env?: En
|
|
|
1835
1965
|
|
|
1836
1966
|
/**
|
|
1837
1967
|
* Function to resolve a target name to a provider.
|
|
1838
|
-
* Used by code
|
|
1968
|
+
* Used by code graders to support target override.
|
|
1839
1969
|
*/
|
|
1840
1970
|
type TargetResolver = (targetName: string) => Provider | undefined;
|
|
1841
1971
|
interface EvaluationContext {
|
|
@@ -1851,6 +1981,8 @@ interface EvaluationContext {
|
|
|
1851
1981
|
readonly chatPrompt?: ChatPrompt;
|
|
1852
1982
|
};
|
|
1853
1983
|
readonly now: Date;
|
|
1984
|
+
readonly graderProvider?: Provider;
|
|
1985
|
+
/** @deprecated Use `graderProvider` instead */
|
|
1854
1986
|
readonly judgeProvider?: Provider;
|
|
1855
1987
|
readonly evaluatorTemplateOverride?: string;
|
|
1856
1988
|
readonly evaluator?: EvaluatorConfig;
|
|
@@ -1868,9 +2000,9 @@ interface EvaluationContext {
|
|
|
1868
2000
|
readonly startTime?: string;
|
|
1869
2001
|
/** ISO 8601 timestamp when execution ended */
|
|
1870
2002
|
readonly endTime?: string;
|
|
1871
|
-
/** Resolver for target override in code
|
|
2003
|
+
/** Resolver for target override in code graders */
|
|
1872
2004
|
readonly targetResolver?: TargetResolver;
|
|
1873
|
-
/** List of available target names for code
|
|
2005
|
+
/** List of available target names for code graders */
|
|
1874
2006
|
readonly availableTargets?: readonly string[];
|
|
1875
2007
|
/** Unified diff of file changes from workspace (when workspace_template is configured) */
|
|
1876
2008
|
readonly fileChanges?: string;
|
|
@@ -1886,7 +2018,7 @@ interface EvaluationScore {
|
|
|
1886
2018
|
readonly reasoning?: string;
|
|
1887
2019
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1888
2020
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1889
|
-
/** Optional structured details from code
|
|
2021
|
+
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1890
2022
|
readonly details?: JsonObject;
|
|
1891
2023
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1892
2024
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -1902,7 +2034,7 @@ interface ChildEvaluatorResult {
|
|
|
1902
2034
|
readonly reasoning?: string;
|
|
1903
2035
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1904
2036
|
readonly scores?: readonly ChildEvaluatorResult[];
|
|
1905
|
-
/** Optional structured details from code
|
|
2037
|
+
/** Optional structured details from code graders (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1906
2038
|
readonly details?: JsonObject;
|
|
1907
2039
|
/** Token usage from LLM calls made by this evaluator (optional). */
|
|
1908
2040
|
readonly tokenUsage?: TokenUsage;
|
|
@@ -2061,12 +2193,18 @@ declare class LatencyEvaluator implements Evaluator {
|
|
|
2061
2193
|
* Custom evaluators can override this via evaluatorTemplate option.
|
|
2062
2194
|
*/
|
|
2063
2195
|
declare const DEFAULT_EVALUATOR_TEMPLATE: string;
|
|
2064
|
-
type
|
|
2065
|
-
interface
|
|
2066
|
-
readonly
|
|
2196
|
+
type GraderProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
2197
|
+
interface LlmGraderEvaluatorOptions {
|
|
2198
|
+
readonly resolveGraderProvider: GraderProviderResolver;
|
|
2199
|
+
/** @deprecated Use `resolveGraderProvider` instead. */
|
|
2200
|
+
readonly resolveJudgeProvider?: GraderProviderResolver;
|
|
2067
2201
|
readonly maxOutputTokens?: number;
|
|
2068
2202
|
readonly temperature?: number;
|
|
2069
2203
|
readonly evaluatorTemplate?: string;
|
|
2204
|
+
readonly maxSteps?: number;
|
|
2205
|
+
readonly graderTargetProvider?: Provider;
|
|
2206
|
+
/** @deprecated Use `graderTargetProvider` instead. */
|
|
2207
|
+
readonly judgeTargetProvider?: Provider;
|
|
2070
2208
|
}
|
|
2071
2209
|
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
2072
2210
|
score: z.ZodNumber;
|
|
@@ -2115,13 +2253,15 @@ declare const rubricEvaluationSchema: z.ZodObject<{
|
|
|
2115
2253
|
overall_reasoning: string;
|
|
2116
2254
|
}>;
|
|
2117
2255
|
|
|
2118
|
-
declare class LlmJudgeEvaluator implements Evaluator {
|
|
2119
|
-
readonly kind = "llm-judge";
|
|
2120
|
-
private readonly
|
|
2256
|
+
declare class LlmGraderEvaluator implements Evaluator {
|
|
2257
|
+
readonly kind = "llm-grader";
|
|
2258
|
+
private readonly resolveGraderProvider;
|
|
2121
2259
|
private readonly maxOutputTokens?;
|
|
2122
2260
|
private readonly temperature?;
|
|
2123
2261
|
private readonly evaluatorTemplate?;
|
|
2124
|
-
|
|
2262
|
+
private readonly maxSteps;
|
|
2263
|
+
private readonly graderTargetProvider?;
|
|
2264
|
+
constructor(options: LlmGraderEvaluatorOptions);
|
|
2125
2265
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
2126
2266
|
private evaluateFreeform;
|
|
2127
2267
|
private evaluateWithRubrics;
|
|
@@ -2130,6 +2270,43 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
2130
2270
|
* Each criterion is scored 0-10 and normalized to 0-1.
|
|
2131
2271
|
*/
|
|
2132
2272
|
private evaluateWithScoreRanges;
|
|
2273
|
+
/**
|
|
2274
|
+
* Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
|
|
2275
|
+
*/
|
|
2276
|
+
private evaluateBuiltIn;
|
|
2277
|
+
/**
|
|
2278
|
+
* Grader target mode: Delegates to an explicit graderTargetProvider via Provider.invoke().
|
|
2279
|
+
*/
|
|
2280
|
+
private evaluateWithGraderTarget;
|
|
2281
|
+
/**
|
|
2282
|
+
* Delegate mode: resolved provider is an agent provider — send prompt via invoke().
|
|
2283
|
+
*/
|
|
2284
|
+
private evaluateWithDelegatedAgent;
|
|
2285
|
+
/**
|
|
2286
|
+
* Shared implementation for grader_target and delegate modes.
|
|
2287
|
+
* Both invoke a provider and parse the agent result from the response.
|
|
2288
|
+
*/
|
|
2289
|
+
private evaluateWithDelegate;
|
|
2290
|
+
/**
|
|
2291
|
+
* Build system prompt for built-in agent mode.
|
|
2292
|
+
* Includes output format instructions.
|
|
2293
|
+
*/
|
|
2294
|
+
private buildAgentSystemPrompt;
|
|
2295
|
+
/**
|
|
2296
|
+
* Build user prompt for built-in agent mode.
|
|
2297
|
+
* Uses custom template if provided, otherwise builds default prompt.
|
|
2298
|
+
*/
|
|
2299
|
+
private buildAgentUserPrompt;
|
|
2300
|
+
/**
|
|
2301
|
+
* Build the full evaluation prompt for delegate mode (agent providers).
|
|
2302
|
+
* Combines task context, criteria, candidate info, and output format instructions.
|
|
2303
|
+
*/
|
|
2304
|
+
private buildDelegatedPrompt;
|
|
2305
|
+
/**
|
|
2306
|
+
* Parse the agent's response text into an EvaluationScore.
|
|
2307
|
+
* Supports both freeform and rubric modes.
|
|
2308
|
+
*/
|
|
2309
|
+
private parseAgentResult;
|
|
2133
2310
|
/**
|
|
2134
2311
|
* Build prompt for score-range rubric evaluation.
|
|
2135
2312
|
*/
|
|
@@ -2155,67 +2332,40 @@ declare function calculateRubricScore(result: z.infer<typeof rubricEvaluationSch
|
|
|
2155
2332
|
*/
|
|
2156
2333
|
declare function buildScoreRangeOutputSchema(): string;
|
|
2157
2334
|
|
|
2158
|
-
|
|
2159
|
-
|
|
2160
|
-
|
|
2161
|
-
|
|
2162
|
-
|
|
2163
|
-
|
|
2164
|
-
|
|
2165
|
-
|
|
2166
|
-
|
|
2167
|
-
|
|
2168
|
-
|
|
2169
|
-
|
|
2170
|
-
|
|
2171
|
-
|
|
2172
|
-
|
|
2173
|
-
|
|
2174
|
-
|
|
2175
|
-
* Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools.
|
|
2176
|
-
*/
|
|
2177
|
-
private evaluateBuiltIn;
|
|
2178
|
-
/**
|
|
2179
|
-
* Judge target mode: Delegates to an external agent provider via Provider.invoke().
|
|
2180
|
-
*/
|
|
2181
|
-
private evaluateWithJudgeTarget;
|
|
2182
|
-
/**
|
|
2183
|
-
* Parse the agent's response text into an EvaluationScore.
|
|
2184
|
-
* Supports both freeform and rubric modes.
|
|
2185
|
-
*/
|
|
2186
|
-
private parseResult;
|
|
2187
|
-
/**
|
|
2188
|
-
* Build system prompt for built-in mode.
|
|
2189
|
-
* Includes output format instructions.
|
|
2190
|
-
*/
|
|
2191
|
-
private buildSystemPrompt;
|
|
2192
|
-
/**
|
|
2193
|
-
* Build user prompt for built-in mode.
|
|
2194
|
-
* Uses custom template if provided, otherwise builds default prompt.
|
|
2195
|
-
*/
|
|
2196
|
-
private buildUserPrompt;
|
|
2197
|
-
/**
|
|
2198
|
-
* Build the full evaluation prompt for judge target mode (delegation).
|
|
2199
|
-
* Combines task context, criteria, candidate info, and output format instructions.
|
|
2200
|
-
*/
|
|
2201
|
-
private buildDelegatedPrompt;
|
|
2335
|
+
/**
|
|
2336
|
+
* Built-in skill-trigger evaluator.
|
|
2337
|
+
*
|
|
2338
|
+
* Detects whether the agent invoked a named Claude Code skill as its first tool call.
|
|
2339
|
+
* Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
|
|
2340
|
+
* - Only the FIRST tool call matters.
|
|
2341
|
+
* - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
|
|
2342
|
+
* - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
|
|
2343
|
+
* - Any other tool as first call means the skill was not triggered.
|
|
2344
|
+
* - Supports negative cases via should_trigger: false.
|
|
2345
|
+
*/
|
|
2346
|
+
|
|
2347
|
+
declare class SkillTriggerEvaluator implements Evaluator {
|
|
2348
|
+
readonly kind = "skill-trigger";
|
|
2349
|
+
private readonly config;
|
|
2350
|
+
constructor(config: SkillTriggerEvaluatorConfig);
|
|
2351
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
2202
2352
|
}
|
|
2203
2353
|
|
|
2204
|
-
interface
|
|
2354
|
+
interface LlmGraderPromptAssembly {
|
|
2205
2355
|
systemPrompt: string;
|
|
2206
2356
|
userPrompt: string;
|
|
2207
2357
|
responseSchema: string;
|
|
2208
2358
|
mode: 'freeform' | 'checklist' | 'score_range';
|
|
2209
2359
|
}
|
|
2210
|
-
declare function
|
|
2360
|
+
declare function assembleLlmGraderPrompt(input: {
|
|
2211
2361
|
evalCase: EvalTest;
|
|
2212
2362
|
candidate: string;
|
|
2213
2363
|
promptInputs: PromptInputs;
|
|
2214
|
-
evaluatorConfig?:
|
|
2364
|
+
evaluatorConfig?: LlmGraderEvaluatorConfig;
|
|
2215
2365
|
output?: readonly Message[];
|
|
2216
2366
|
fileChanges?: string;
|
|
2217
2367
|
evaluatorTemplateOverride?: string;
|
|
2218
|
-
}):
|
|
2368
|
+
}): LlmGraderPromptAssembly;
|
|
2219
2369
|
|
|
2220
2370
|
interface TokenUsageEvaluatorOptions {
|
|
2221
2371
|
readonly config: TokenUsageEvaluatorConfig;
|
|
@@ -2312,18 +2462,22 @@ declare function runEqualsAssertion(output: string, value: string): AssertionRes
|
|
|
2312
2462
|
* Contains shared resources needed by evaluator instances.
|
|
2313
2463
|
*/
|
|
2314
2464
|
interface EvaluatorDispatchContext {
|
|
2315
|
-
/** Shared LLM
|
|
2465
|
+
/** Shared LLM grader provider (resolved at suite level) */
|
|
2466
|
+
readonly graderProvider?: Provider;
|
|
2467
|
+
/** @deprecated Use `graderProvider` instead */
|
|
2316
2468
|
readonly judgeProvider?: Provider;
|
|
2317
2469
|
/** Function to resolve target names to providers */
|
|
2318
2470
|
readonly targetResolver?: TargetResolver;
|
|
2319
|
-
/** Available target names for code
|
|
2471
|
+
/** Available target names for code graders */
|
|
2320
2472
|
readonly availableTargets?: readonly string[];
|
|
2321
2473
|
/** Agent timeout in ms */
|
|
2322
2474
|
readonly agentTimeoutMs?: number;
|
|
2323
2475
|
/** Directory containing the eval file (for composite member resolution) */
|
|
2324
2476
|
readonly evalFileDir?: string;
|
|
2325
|
-
/** Shared LLM
|
|
2326
|
-
readonly
|
|
2477
|
+
/** Shared LLM grader evaluator instance */
|
|
2478
|
+
readonly llmGrader: Evaluator;
|
|
2479
|
+
/** @deprecated Use `llmGrader` instead */
|
|
2480
|
+
readonly llmJudge?: Evaluator;
|
|
2327
2481
|
/** Reference to the registry itself (for composite evaluators that need to create children) */
|
|
2328
2482
|
readonly registry: EvaluatorRegistry;
|
|
2329
2483
|
}
|
|
@@ -2331,8 +2485,8 @@ interface EvaluatorDispatchContext {
|
|
|
2331
2485
|
* Factory function that creates an Evaluator instance from a config.
|
|
2332
2486
|
*
|
|
2333
2487
|
* Factory functions handle all type-specific initialization logic:
|
|
2334
|
-
* - Reading prompt files for LLM
|
|
2335
|
-
* - Resolving script paths for code
|
|
2488
|
+
* - Reading prompt files for LLM graders
|
|
2489
|
+
* - Resolving script paths for code graders
|
|
2336
2490
|
* - Creating adapter evaluators for deterministic assertions
|
|
2337
2491
|
*/
|
|
2338
2492
|
type EvaluatorFactoryFn = (config: EvaluatorConfig, context: EvaluatorDispatchContext) => Evaluator | Promise<Evaluator>;
|
|
@@ -2394,7 +2548,7 @@ interface RunEvalCaseOptions {
|
|
|
2394
2548
|
readonly provider: Provider;
|
|
2395
2549
|
readonly target: ResolvedTarget;
|
|
2396
2550
|
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
2397
|
-
readonly 'llm-
|
|
2551
|
+
readonly 'llm-grader': Evaluator;
|
|
2398
2552
|
};
|
|
2399
2553
|
readonly now?: () => Date;
|
|
2400
2554
|
readonly maxRetries?: number;
|
|
@@ -2402,10 +2556,10 @@ interface RunEvalCaseOptions {
|
|
|
2402
2556
|
readonly cache?: EvaluationCache;
|
|
2403
2557
|
readonly useCache?: boolean;
|
|
2404
2558
|
readonly signal?: AbortSignal;
|
|
2405
|
-
readonly
|
|
2406
|
-
/** Resolver for target override in code
|
|
2559
|
+
readonly graderProvider?: Provider;
|
|
2560
|
+
/** Resolver for target override in code graders */
|
|
2407
2561
|
readonly targetResolver?: (name: string) => Provider | undefined;
|
|
2408
|
-
/** List of available target names for code
|
|
2562
|
+
/** List of available target names for code graders */
|
|
2409
2563
|
readonly availableTargets?: readonly string[];
|
|
2410
2564
|
/** Unique identifier for the evaluation run (used for workspace management) */
|
|
2411
2565
|
readonly evalRunId?: string;
|
|
@@ -2488,10 +2642,44 @@ interface RunEvaluationOptions {
|
|
|
2488
2642
|
readonly retainOnSuccess?: 'keep' | 'cleanup';
|
|
2489
2643
|
/** Retention policy override for failed cases */
|
|
2490
2644
|
readonly retainOnFailure?: 'keep' | 'cleanup';
|
|
2645
|
+
/** CLI override: grader target name (e.g., "agentv" or a target from targets.yaml) */
|
|
2646
|
+
readonly graderTarget?: string;
|
|
2647
|
+
/** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */
|
|
2648
|
+
readonly model?: string;
|
|
2491
2649
|
}
|
|
2492
2650
|
declare function runEvaluation(options: RunEvaluationOptions): Promise<readonly EvaluationResult[]>;
|
|
2493
2651
|
declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationResult>;
|
|
2494
2652
|
|
|
2653
|
+
/**
|
|
2654
|
+
* Types for inline assertion functions used in the evaluate() API.
|
|
2655
|
+
*
|
|
2656
|
+
* Inline functions are the escape hatch for custom evaluation logic
|
|
2657
|
+
* that doesn't fit a built-in evaluator type. For built-in assertions
|
|
2658
|
+
* (contains, regex, is-json, etc.), use config objects instead:
|
|
2659
|
+
*
|
|
2660
|
+
* assert: [{ type: 'contains', value: 'hello' }]
|
|
2661
|
+
*
|
|
2662
|
+
* Inline functions are for custom logic:
|
|
2663
|
+
*
|
|
2664
|
+
* assert: [({ output }) => ({ name: 'len', score: output.length > 5 ? 1 : 0 })]
|
|
2665
|
+
*/
|
|
2666
|
+
/** Context passed to inline assertion functions */
|
|
2667
|
+
interface AssertContext {
|
|
2668
|
+
readonly input: string;
|
|
2669
|
+
readonly output: string;
|
|
2670
|
+
readonly expectedOutput?: string;
|
|
2671
|
+
readonly criteria?: string;
|
|
2672
|
+
readonly metadata?: Record<string, unknown>;
|
|
2673
|
+
}
|
|
2674
|
+
/** Result from an inline assertion function */
|
|
2675
|
+
interface AssertResult {
|
|
2676
|
+
readonly name: string;
|
|
2677
|
+
readonly score: number;
|
|
2678
|
+
readonly metadata?: Record<string, unknown>;
|
|
2679
|
+
}
|
|
2680
|
+
/** Inline assertion function signature */
|
|
2681
|
+
type AssertFn = (ctx: AssertContext) => AssertResult | Promise<AssertResult>;
|
|
2682
|
+
|
|
2495
2683
|
/**
|
|
2496
2684
|
* Programmatic API for running evaluations.
|
|
2497
2685
|
*
|
|
@@ -2499,7 +2687,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
2499
2687
|
* instead of a CLI. The config shape mirrors the YAML structure for easy
|
|
2500
2688
|
* translation between file-based and programmatic usage.
|
|
2501
2689
|
*
|
|
2502
|
-
* @example Inline tests
|
|
2690
|
+
* @example Inline tests with config objects
|
|
2503
2691
|
* ```typescript
|
|
2504
2692
|
* import { evaluate } from '@agentv/core';
|
|
2505
2693
|
*
|
|
@@ -2508,7 +2696,7 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
2508
2696
|
* {
|
|
2509
2697
|
* id: 'capital',
|
|
2510
2698
|
* input: 'What is the capital of France?',
|
|
2511
|
-
*
|
|
2699
|
+
* expectedOutput: 'Paris',
|
|
2512
2700
|
* assert: [{ type: 'contains', value: 'Paris' }],
|
|
2513
2701
|
* },
|
|
2514
2702
|
* ],
|
|
@@ -2518,6 +2706,27 @@ declare function runEvalCase(options: RunEvalCaseOptions): Promise<EvaluationRes
|
|
|
2518
2706
|
* console.log(results.summary.passed, 'passed');
|
|
2519
2707
|
* ```
|
|
2520
2708
|
*
|
|
2709
|
+
* @example Inline tests with task function and custom assertion
|
|
2710
|
+
* ```typescript
|
|
2711
|
+
* import { evaluate } from '@agentv/core';
|
|
2712
|
+
*
|
|
2713
|
+
* const { summary } = await evaluate({
|
|
2714
|
+
* tests: [
|
|
2715
|
+
* {
|
|
2716
|
+
* id: 'echo',
|
|
2717
|
+
* input: 'hello',
|
|
2718
|
+
* expectedOutput: 'Echo: hello',
|
|
2719
|
+
* assert: [
|
|
2720
|
+
* { type: 'contains', value: 'hello' },
|
|
2721
|
+
* { type: 'equals' },
|
|
2722
|
+
* ({ output }) => ({ name: 'custom', score: output.length > 0 ? 1 : 0 }),
|
|
2723
|
+
* ],
|
|
2724
|
+
* },
|
|
2725
|
+
* ],
|
|
2726
|
+
* task: async (input) => `Echo: ${input}`,
|
|
2727
|
+
* });
|
|
2728
|
+
* ```
|
|
2729
|
+
*
|
|
2521
2730
|
* @example File-based
|
|
2522
2731
|
* ```typescript
|
|
2523
2732
|
* const results = await evaluate({
|
|
@@ -2543,10 +2752,12 @@ interface EvalTestInput {
|
|
|
2543
2752
|
role: string;
|
|
2544
2753
|
content: string;
|
|
2545
2754
|
}[];
|
|
2546
|
-
/** Expected reference output */
|
|
2755
|
+
/** Expected reference output (camelCase preferred) */
|
|
2756
|
+
readonly expectedOutput?: string;
|
|
2757
|
+
/** @deprecated Use `expectedOutput` instead */
|
|
2547
2758
|
readonly expected_output?: string;
|
|
2548
|
-
/** Assertion evaluators */
|
|
2549
|
-
readonly assert?: readonly
|
|
2759
|
+
/** Assertion evaluators — accepts factory functions, config objects, or inline functions */
|
|
2760
|
+
readonly assert?: readonly AssertEntry[];
|
|
2550
2761
|
/** Arbitrary metadata */
|
|
2551
2762
|
readonly metadata?: Record<string, unknown>;
|
|
2552
2763
|
}
|
|
@@ -2582,6 +2793,8 @@ interface EvalAssertionInput {
|
|
|
2582
2793
|
/** Additional properties */
|
|
2583
2794
|
readonly [key: string]: unknown;
|
|
2584
2795
|
}
|
|
2796
|
+
/** Assert entry: inline function or config object */
|
|
2797
|
+
type AssertEntry = AssertFn | EvalAssertionInput;
|
|
2585
2798
|
/**
|
|
2586
2799
|
* Configuration for `evaluate()`.
|
|
2587
2800
|
* Accepts either inline tests or a spec file path.
|
|
@@ -2593,8 +2806,10 @@ interface EvalConfig {
|
|
|
2593
2806
|
readonly specFile?: string;
|
|
2594
2807
|
/** Target provider configuration */
|
|
2595
2808
|
readonly target?: TargetDefinition;
|
|
2809
|
+
/** Custom task function — mutually exclusive with target */
|
|
2810
|
+
readonly task?: (input: string) => string | Promise<string>;
|
|
2596
2811
|
/** Suite-level assertions applied to all tests */
|
|
2597
|
-
readonly assert?: readonly
|
|
2812
|
+
readonly assert?: readonly AssertEntry[];
|
|
2598
2813
|
/** Filter tests by ID pattern (glob supported) */
|
|
2599
2814
|
readonly filter?: string;
|
|
2600
2815
|
/** Maximum concurrent workers (default: 3) */
|
|
@@ -3207,6 +3422,11 @@ declare class OtelStreamingObserver {
|
|
|
3207
3422
|
onLlmCall(model: string, tokenUsage?: ProviderTokenUsage): void;
|
|
3208
3423
|
/** Finalize root span with score/verdict after evaluation completes */
|
|
3209
3424
|
finalizeEvalCase(score: number, error?: string): void;
|
|
3425
|
+
/** Return the active eval span's trace ID and span ID for Braintrust trace bridging */
|
|
3426
|
+
getActiveSpanIds(): {
|
|
3427
|
+
parentSpanId: string;
|
|
3428
|
+
rootSpanId: string;
|
|
3429
|
+
} | null;
|
|
3210
3430
|
/** Get ProviderStreamCallbacks for passing to providers */
|
|
3211
3431
|
getStreamCallbacks(): ProviderStreamCallbacks;
|
|
3212
3432
|
}
|
|
@@ -3283,9 +3503,29 @@ declare function createBuiltinRegistry(): EvaluatorRegistry;
|
|
|
3283
3503
|
*/
|
|
3284
3504
|
declare function discoverAssertions(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
|
|
3285
3505
|
|
|
3506
|
+
/**
|
|
3507
|
+
* Convention-based discovery of custom grader scripts.
|
|
3508
|
+
*
|
|
3509
|
+
* Scans `.agentv/graders/` (and legacy `.agentv/judges/`) for TypeScript/JavaScript
|
|
3510
|
+
* files and registers them as code-grader evaluators in the registry. The file name
|
|
3511
|
+
* (without extension) becomes the evaluator type name.
|
|
3512
|
+
*
|
|
3513
|
+
* Example: `.agentv/graders/custom-grader.ts` → type "custom-grader" in EVAL.yaml
|
|
3514
|
+
*/
|
|
3515
|
+
|
|
3516
|
+
/**
|
|
3517
|
+
* Discover custom grader scripts from `.agentv/graders/` (and legacy `.agentv/judges/`)
|
|
3518
|
+
* and register them as evaluator types in the registry.
|
|
3519
|
+
*
|
|
3520
|
+
* @param registry - The evaluator registry to register discovered graders into
|
|
3521
|
+
* @param baseDir - The base directory to search from (typically project root or eval file dir)
|
|
3522
|
+
* @returns Names of discovered grader types
|
|
3523
|
+
*/
|
|
3524
|
+
declare function discoverGraders(registry: EvaluatorRegistry, baseDir: string): Promise<string[]>;
|
|
3525
|
+
|
|
3286
3526
|
type AgentKernel = {
|
|
3287
3527
|
status: string;
|
|
3288
3528
|
};
|
|
3289
3529
|
declare function createAgentKernel(): AgentKernel;
|
|
3290
3530
|
|
|
3291
|
-
export { type AcquireWorkspaceOptions,
|
|
3531
|
+
export { type AcquireWorkspaceOptions, type AgentKernel, type AgentVConfig$1 as AgentVConfig, type AgentVResolvedConfig, type AgentVConfig as AgentVTsConfig, type AnthropicResolvedConfig, type ArgsMatchMode, type AssertContext, type AssertEntry, type AssertFn, type AssertResult, type AssertionResult, type AssistantTestMessage, type AzureResolvedConfig, type CacheConfig, type ChildEvaluatorResult, type ClaudeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, type ConfidenceIntervalAggregation, type ContainsAllEvaluatorConfig, type ContainsAnyEvaluatorConfig, type ContainsEvaluatorConfig, type CopilotCliResolvedConfig, type CopilotSdkResolvedConfig, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EVALUATOR_TEMPLATE, DEFAULT_EVAL_PATTERNS, DEFAULT_EXPLORATION_TOOLS, DeterministicAssertionEvaluator, type EndsWithEvaluatorConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EqualsEvaluatorConfig, type EvalAssertionInput, type EvalCase, type EvalConfig, type EvalMetadata, type EvalRunResult, type EvalSuiteResult, type EvalSummary, type EvalTest, type EvalTestInput, type EvalsJsonCase, type EvalsJsonFile, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorDispatchContext, type EvaluatorFactory, type EvaluatorFactoryFn, type EvaluatorKind, EvaluatorRegistry, type EvaluatorResult, type ExecutionDefaults, type ExecutionError, type ExecutionMetrics, ExecutionMetricsEvaluator, type ExecutionMetricsEvaluatorConfig, type ExecutionMetricsEvaluatorOptions, type ExecutionStatus, type FailOnError, type FailureStage, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type 
FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type IcontainsAllEvaluatorConfig, type IcontainsAnyEvaluatorConfig, type IcontainsEvaluatorConfig, type InlineAssertEvaluatorConfig, type IsJsonEvaluatorConfig, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmGraderEvaluator, type LlmGraderEvaluatorConfig, type LlmGraderEvaluatorOptions, type LlmGraderPromptAssembly, LlmGraderEvaluator as LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmGraderEvaluatorOptions as LlmJudgeEvaluatorOptions, type LlmGraderPromptAssembly as LlmJudgePromptAssembly, type MeanAggregation, type Message, type MockResolvedConfig, OTEL_BACKEND_PRESETS, type OtelBackendPreset, type OtelExportOptions, OtelStreamingObserver, OtelTraceExporter, OtlpJsonFileExporter, type OutputMessage, type PassAtKAggregation, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type PoolSlot, type ProgressEvent, type PromptInputs, type PromptScriptConfig, type Provider, type ProviderFactoryFn, type ProviderKind, ProviderRegistry, type ProviderRequest, type ProviderResponse, type ProviderStreamCallbacks, type ProviderTokenUsage, type RegexEvaluatorConfig, type RepoCheckout, type RepoClone, type RepoConfig, RepoManager, type RepoSource, type ResolvedTarget, type ResolvedWorkspaceTemplate, ResponseCache, type RubricItem, type RubricsEvaluatorConfig, type RunEvalCaseOptions, type RunEvaluationOptions, type ScoreRange, type ScriptExecutionContext, SimpleTraceFileExporter, SkillTriggerEvaluator, type SkillTriggerEvaluatorConfig, type StartsWithEvaluatorConfig, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, TemplateNotDirectoryError, TemplateNotFoundError, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type 
ToolCall, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceComputeResult, type TraceSummary, type TranspileResult, type TrialAggregation, type TrialResult, type TrialStrategy, type TrialsConfig, type UserTestMessage, type VSCodeResolvedConfig, type WorkspaceConfig, WorkspaceCreationError, type WorkspaceHookConfig, type WorkspaceHooksConfig, WorkspacePoolManager, type WorkspaceScriptConfig, assembleLlmGraderPrompt, assembleLlmGraderPrompt as assembleLlmJudgePrompt, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildRubricOutputSchema, buildScoreRangeOutputSchema, buildSearchRoots, calculateRubricScore, captureFileChanges, clampScore, cleanupEvalWorkspaces, cleanupWorkspace, computeTraceSummary, computeWorkspaceFingerprint, consumeClaudeLogEntries, consumeCodexLogEntries, consumeCopilotCliLogEntries, consumeCopilotSdkLogEntries, consumePiLogEntries, createAgentKernel, createBuiltinProviderRegistry, createBuiltinRegistry, createProvider, createTempWorkspace, deepEqual, defineConfig, detectFormat, discoverAssertions, discoverGraders, discoverGraders as discoverJudges, discoverProviders, ensureVSCodeSubagents, evaluate, executeScript, executeWorkspaceScript, explorationRatio, extractCacheConfig, extractFailOnError, extractJsonBlob, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, extractTrialsConfig, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getAgentvHome, getHitCount, getOutputFilenames, getSubagentsRoot, getTraceStateRoot, getWorkspacePath, getWorkspacePoolRoot, getWorkspacesRoot, initializeBaseline, isAgentSkillsFormat, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadConfig, loadEvalCaseById, loadEvalCases, loadEvalSuite, loadTestById, loadTestSuite, loadTests, loadTsConfig, 
mergeExecutionMetrics, negateScore, normalizeLineEndings, parseAgentSkillsEvals, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, resolveWorkspaceTemplate, rubricEvaluationSchema, runContainsAllAssertion, runContainsAnyAssertion, runContainsAssertion, runEndsWithAssertion, runEqualsAssertion, runEvalCase, runEvaluation, runIcontainsAllAssertion, runIcontainsAnyAssertion, runIcontainsAssertion, runIsJsonAssertion, runRegexAssertion, runStartsWithAssertion, scoreToVerdict, shouldEnableCache, shouldSkipCacheForTemperature, subscribeToClaudeLogEntries, subscribeToCodexLogEntries, subscribeToCopilotCliLogEntries, subscribeToCopilotSdkLogEntries, subscribeToPiLogEntries, substituteVariables, toCamelCaseDeep, toSnakeCaseDeep, tokensPerTool, transpileEvalYaml, transpileEvalYamlFile, trimBaselineResult };
|