@agentv/core 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
204
204
  declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
205
205
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
206
206
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
207
+ /**
208
+ * Configuration for enabling target access in code_judge evaluators.
209
+ * When present, the runtime will start a local proxy server that allows
210
+ * the script to invoke configured targets without direct credential access.
211
+ */
212
+ type TargetAccessConfig = {
213
+ /** Maximum number of target invocations allowed per execution (default: 50) */
214
+ readonly max_calls?: number;
215
+ };
207
216
  type CodeEvaluatorConfig = {
208
217
  readonly name: string;
209
218
  readonly type: 'code';
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
214
223
  readonly weight?: number;
215
224
  /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
216
225
  readonly config?: JsonObject;
226
+ /** When present, enables target access for the script via local proxy */
227
+ readonly target?: TargetAccessConfig;
217
228
  };
218
229
  type LlmJudgeEvaluatorConfig = {
219
230
  readonly name: string;
@@ -343,7 +354,6 @@ interface EvalCase {
343
354
  readonly guideline_paths: readonly string[];
344
355
  readonly guideline_patterns?: readonly string[];
345
356
  readonly file_paths: readonly string[];
346
- readonly code_snippets: readonly string[];
347
357
  readonly expected_outcome: string;
348
358
  readonly evaluator?: EvaluatorKind;
349
359
  readonly evaluators?: readonly EvaluatorConfig[];
@@ -383,6 +393,8 @@ interface EvaluatorResult {
383
393
  readonly rawRequest?: JsonObject;
384
394
  readonly evaluatorProviderRequest?: JsonObject;
385
395
  readonly evaluatorResults?: readonly EvaluatorResult[];
396
+ /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
397
+ readonly details?: JsonObject;
386
398
  }
387
399
  /**
388
400
  * Convenience accessor matching the Python hit_count property.
@@ -566,10 +578,6 @@ interface TargetDefinition {
566
578
  * - 'lm': Embedded file content with XML tags (for language model providers)
567
579
  */
568
580
  type FormattingMode = 'agent' | 'lm';
569
- /**
570
- * Extract fenced code blocks from AgentV user segments.
571
- */
572
- declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
573
581
 
574
582
  /**
575
583
  * Build prompt inputs by consolidating user request context and guideline content.
@@ -962,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
962
970
  declare function createProvider(target: ResolvedTarget): Provider;
963
971
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
964
972
 
973
+ /**
974
+ * Function to resolve a target name to a provider.
975
+ * Used by code judges to support target override.
976
+ */
977
+ type TargetResolver = (targetName: string) => Provider | undefined;
965
978
  interface EvaluationContext {
966
979
  readonly evalCase: EvalCase;
967
980
  readonly candidate: string;
@@ -982,6 +995,10 @@ interface EvaluationContext {
982
995
  readonly outputMessages?: readonly OutputMessage[];
983
996
  /** Lightweight summary of trace events (if available) */
984
997
  readonly traceSummary?: TraceSummary;
998
+ /** Resolver for target override in code judges */
999
+ readonly targetResolver?: TargetResolver;
1000
+ /** List of available target names for code judges */
1001
+ readonly availableTargets?: readonly string[];
985
1002
  }
986
1003
  interface EvaluationScore {
987
1004
  readonly score: number;
@@ -992,6 +1009,8 @@ interface EvaluationScore {
992
1009
  readonly reasoning?: string;
993
1010
  readonly evaluatorRawRequest?: JsonObject;
994
1011
  readonly evaluatorResults?: readonly ChildEvaluatorResult[];
1012
+ /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1013
+ readonly details?: JsonObject;
995
1014
  }
996
1015
  interface ChildEvaluatorResult {
997
1016
  readonly name: string;
@@ -1004,37 +1023,37 @@ interface ChildEvaluatorResult {
1004
1023
  readonly reasoning?: string;
1005
1024
  readonly evaluatorRawRequest?: JsonObject;
1006
1025
  readonly evaluatorResults?: readonly ChildEvaluatorResult[];
1026
+ /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1027
+ readonly details?: JsonObject;
1007
1028
  }
1008
1029
  interface Evaluator {
1009
1030
  readonly kind: string;
1010
1031
  evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
1011
1032
  }
1012
- type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
1013
- interface LlmJudgeEvaluatorOptions {
1014
- readonly resolveJudgeProvider: JudgeProviderResolver;
1015
- readonly maxOutputTokens?: number;
1016
- readonly temperature?: number;
1017
- readonly evaluatorTemplate?: string;
1018
- }
1019
- declare class LlmJudgeEvaluator implements Evaluator {
1020
- readonly kind = "llm_judge";
1021
- private readonly resolveJudgeProvider;
1022
- private readonly maxOutputTokens?;
1023
- private readonly temperature?;
1024
- private readonly evaluatorTemplate?;
1025
- constructor(options: LlmJudgeEvaluatorOptions);
1026
- evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1027
- private evaluateFreeform;
1028
- private evaluateWithRubrics;
1029
- private buildRubricPrompt;
1030
- private runWithRetry;
1033
+ interface EvaluatorFactory {
1034
+ create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
1031
1035
  }
1036
+
1037
+ declare function scoreToVerdict(score: number): EvaluationVerdict;
1038
+ declare function clampScore(value: number): number;
1039
+ declare function extractJsonBlob(text: string): string | undefined;
1040
+ declare function parseJsonFromText(text: string): unknown;
1041
+ declare function isNonEmptyString(value: unknown): value is string;
1042
+ declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
1043
+ /**
1044
+ * Deep equality check for two values.
1045
+ * Handles primitives, arrays, and plain objects.
1046
+ */
1047
+ declare function deepEqual(a: unknown, b: unknown): boolean;
1048
+
1032
1049
  interface CodeEvaluatorOptions {
1033
1050
  readonly script: readonly string[];
1034
1051
  readonly cwd?: string;
1035
1052
  readonly agentTimeoutMs?: number;
1036
1053
  /** Pass-through configuration from YAML (any unrecognized properties) */
1037
1054
  readonly config?: Record<string, unknown>;
1055
+ /** Target access config - when present, enables target invocation for the script */
1056
+ readonly target?: TargetAccessConfig;
1038
1057
  }
1039
1058
  declare class CodeEvaluator implements Evaluator {
1040
1059
  readonly kind = "code";
@@ -1042,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
1042
1061
  private readonly cwd?;
1043
1062
  private readonly agentTimeoutMs?;
1044
1063
  private readonly config?;
1064
+ private readonly target?;
1045
1065
  constructor(options: CodeEvaluatorOptions);
1046
1066
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1047
1067
  }
1048
- interface ToolTrajectoryEvaluatorOptions {
1049
- readonly config: ToolTrajectoryEvaluatorConfig;
1068
+ declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
1069
+
1070
+ interface CompositeEvaluatorOptions {
1071
+ readonly config: CompositeEvaluatorConfig;
1072
+ readonly evaluatorFactory: EvaluatorFactory;
1073
+ readonly cwd?: string;
1050
1074
  }
1051
- declare class ToolTrajectoryEvaluator implements Evaluator {
1052
- readonly kind = "tool_trajectory";
1075
+ declare class CompositeEvaluator implements Evaluator {
1076
+ readonly kind = "composite";
1053
1077
  private readonly config;
1054
- constructor(options: ToolTrajectoryEvaluatorOptions);
1078
+ private readonly evaluatorFactory;
1079
+ private readonly cwd?;
1080
+ constructor(options: CompositeEvaluatorOptions);
1081
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1082
+ private aggregate;
1083
+ private runWeightedAverage;
1084
+ private runCodeAggregator;
1085
+ private runLlmAggregator;
1086
+ }
1087
+
1088
+ interface CostEvaluatorOptions {
1089
+ readonly config: CostEvaluatorConfig;
1090
+ }
1091
+ /**
1092
+ * Evaluator that checks execution cost against a budget.
1093
+ * Uses traceSummary.costUsd from the evaluation context.
1094
+ */
1095
+ declare class CostEvaluator implements Evaluator {
1096
+ readonly kind = "cost";
1097
+ private readonly config;
1098
+ constructor(options: CostEvaluatorOptions);
1055
1099
  evaluate(context: EvaluationContext): EvaluationScore;
1056
- /**
1057
- * Extract tool calls from output messages.
1058
- */
1059
- private extractToolCallsFromMessages;
1060
- /**
1061
- * Build a summary from extracted tool calls.
1062
- */
1063
- private buildSummary;
1064
- private evaluateAnyOrder;
1065
- private evaluateInOrder;
1066
- private evaluateExact;
1067
1100
  }
1101
+
1068
1102
  interface FieldAccuracyEvaluatorOptions {
1069
1103
  readonly config: FieldAccuracyEvaluatorConfig;
1070
1104
  }
@@ -1103,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
1103
1137
  */
1104
1138
  private aggregateResults;
1105
1139
  }
1106
- interface EvaluatorFactory {
1107
- create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
1108
- }
1109
- interface CompositeEvaluatorOptions {
1110
- readonly config: CompositeEvaluatorConfig;
1111
- readonly evaluatorFactory: EvaluatorFactory;
1112
- readonly cwd?: string;
1113
- }
1114
- declare class CompositeEvaluator implements Evaluator {
1115
- readonly kind = "composite";
1116
- private readonly config;
1117
- private readonly evaluatorFactory;
1118
- private readonly cwd?;
1119
- constructor(options: CompositeEvaluatorOptions);
1120
- evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1121
- private aggregate;
1122
- private runWeightedAverage;
1123
- private runCodeAggregator;
1124
- private runLlmAggregator;
1125
- }
1140
+
1126
1141
  interface LatencyEvaluatorOptions {
1127
1142
  readonly config: LatencyEvaluatorConfig;
1128
1143
  }
@@ -1136,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
1136
1151
  constructor(options: LatencyEvaluatorOptions);
1137
1152
  evaluate(context: EvaluationContext): EvaluationScore;
1138
1153
  }
1139
- interface CostEvaluatorOptions {
1140
- readonly config: CostEvaluatorConfig;
1154
+
1155
+ type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
1156
+ interface LlmJudgeEvaluatorOptions {
1157
+ readonly resolveJudgeProvider: JudgeProviderResolver;
1158
+ readonly maxOutputTokens?: number;
1159
+ readonly temperature?: number;
1160
+ readonly evaluatorTemplate?: string;
1161
+ }
1162
+ declare const freeformEvaluationSchema: z.ZodObject<{
1163
+ score: z.ZodNumber;
1164
+ hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1165
+ misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1166
+ reasoning: z.ZodOptional<z.ZodString>;
1167
+ }, "strip", z.ZodTypeAny, {
1168
+ score: number;
1169
+ hits?: string[] | undefined;
1170
+ misses?: string[] | undefined;
1171
+ reasoning?: string | undefined;
1172
+ }, {
1173
+ score: number;
1174
+ hits?: string[] | undefined;
1175
+ misses?: string[] | undefined;
1176
+ reasoning?: string | undefined;
1177
+ }>;
1178
+
1179
+ declare class LlmJudgeEvaluator implements Evaluator {
1180
+ readonly kind = "llm_judge";
1181
+ private readonly resolveJudgeProvider;
1182
+ private readonly maxOutputTokens?;
1183
+ private readonly temperature?;
1184
+ private readonly evaluatorTemplate?;
1185
+ constructor(options: LlmJudgeEvaluatorOptions);
1186
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1187
+ private evaluateFreeform;
1188
+ private evaluateWithRubrics;
1189
+ private buildRubricPrompt;
1190
+ private runWithRetry;
1141
1191
  }
1142
1192
  /**
1143
- * Evaluator that checks execution cost against a budget.
1144
- * Uses traceSummary.costUsd from the evaluation context.
1193
+ * Build the mandatory output schema that all evaluators must follow.
1194
+ * This schema is always appended to the evaluator template.
1145
1195
  */
1146
- declare class CostEvaluator implements Evaluator {
1147
- readonly kind = "cost";
1148
- private readonly config;
1149
- constructor(options: CostEvaluatorOptions);
1150
- evaluate(context: EvaluationContext): EvaluationScore;
1151
- }
1196
+ declare function buildOutputSchema(): string;
1197
+
1152
1198
  interface TokenUsageEvaluatorOptions {
1153
1199
  readonly config: TokenUsageEvaluatorConfig;
1154
1200
  }
@@ -1163,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
1163
1209
  evaluate(context: EvaluationContext): EvaluationScore;
1164
1210
  }
1165
1211
 
1212
+ interface ToolTrajectoryEvaluatorOptions {
1213
+ readonly config: ToolTrajectoryEvaluatorConfig;
1214
+ }
1215
+ declare class ToolTrajectoryEvaluator implements Evaluator {
1216
+ readonly kind = "tool_trajectory";
1217
+ private readonly config;
1218
+ constructor(options: ToolTrajectoryEvaluatorOptions);
1219
+ evaluate(context: EvaluationContext): EvaluationScore;
1220
+ /**
1221
+ * Extract tool calls from output messages.
1222
+ */
1223
+ private extractToolCallsFromMessages;
1224
+ /**
1225
+ * Build a summary from extracted tool calls.
1226
+ */
1227
+ private buildSummary;
1228
+ private evaluateAnyOrder;
1229
+ private evaluateInOrder;
1230
+ private evaluateExact;
1231
+ }
1232
+
1166
1233
  type MaybePromise<T> = T | Promise<T>;
1167
1234
  interface EvaluationCache {
1168
1235
  get(key: string): MaybePromise<ProviderResponse | undefined>;
@@ -1182,6 +1249,10 @@ interface RunEvalCaseOptions {
1182
1249
  readonly useCache?: boolean;
1183
1250
  readonly signal?: AbortSignal;
1184
1251
  readonly judgeProvider?: Provider;
1252
+ /** Resolver for target override in code judges */
1253
+ readonly targetResolver?: (name: string) => Provider | undefined;
1254
+ /** List of available target names for code judges */
1255
+ readonly availableTargets?: readonly string[];
1185
1256
  }
1186
1257
  interface ProgressEvent {
1187
1258
  readonly workerId: number;
@@ -1230,4 +1301,4 @@ type AgentKernel = {
1230
1301
  };
1231
1302
  declare function createAgentKernel(): AgentKernel;
1232
1303
 
1233
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
1304
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
package/dist/index.d.ts CHANGED
@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
204
204
  declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
205
205
  type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
206
206
  declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
207
+ /**
208
+ * Configuration for enabling target access in code_judge evaluators.
209
+ * When present, the runtime will start a local proxy server that allows
210
+ * the script to invoke configured targets without direct credential access.
211
+ */
212
+ type TargetAccessConfig = {
213
+ /** Maximum number of target invocations allowed per execution (default: 50) */
214
+ readonly max_calls?: number;
215
+ };
207
216
  type CodeEvaluatorConfig = {
208
217
  readonly name: string;
209
218
  readonly type: 'code';
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
214
223
  readonly weight?: number;
215
224
  /** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
216
225
  readonly config?: JsonObject;
226
+ /** When present, enables target access for the script via local proxy */
227
+ readonly target?: TargetAccessConfig;
217
228
  };
218
229
  type LlmJudgeEvaluatorConfig = {
219
230
  readonly name: string;
@@ -343,7 +354,6 @@ interface EvalCase {
343
354
  readonly guideline_paths: readonly string[];
344
355
  readonly guideline_patterns?: readonly string[];
345
356
  readonly file_paths: readonly string[];
346
- readonly code_snippets: readonly string[];
347
357
  readonly expected_outcome: string;
348
358
  readonly evaluator?: EvaluatorKind;
349
359
  readonly evaluators?: readonly EvaluatorConfig[];
@@ -383,6 +393,8 @@ interface EvaluatorResult {
383
393
  readonly rawRequest?: JsonObject;
384
394
  readonly evaluatorProviderRequest?: JsonObject;
385
395
  readonly evaluatorResults?: readonly EvaluatorResult[];
396
+ /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
397
+ readonly details?: JsonObject;
386
398
  }
387
399
  /**
388
400
  * Convenience accessor matching the Python hit_count property.
@@ -566,10 +578,6 @@ interface TargetDefinition {
566
578
  * - 'lm': Embedded file content with XML tags (for language model providers)
567
579
  */
568
580
  type FormattingMode = 'agent' | 'lm';
569
- /**
570
- * Extract fenced code blocks from AgentV user segments.
571
- */
572
- declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
573
581
 
574
582
  /**
575
583
  * Build prompt inputs by consolidating user request context and guideline content.
@@ -962,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
962
970
  declare function createProvider(target: ResolvedTarget): Provider;
963
971
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
964
972
 
973
+ /**
974
+ * Function to resolve a target name to a provider.
975
+ * Used by code judges to support target override.
976
+ */
977
+ type TargetResolver = (targetName: string) => Provider | undefined;
965
978
  interface EvaluationContext {
966
979
  readonly evalCase: EvalCase;
967
980
  readonly candidate: string;
@@ -982,6 +995,10 @@ interface EvaluationContext {
982
995
  readonly outputMessages?: readonly OutputMessage[];
983
996
  /** Lightweight summary of trace events (if available) */
984
997
  readonly traceSummary?: TraceSummary;
998
+ /** Resolver for target override in code judges */
999
+ readonly targetResolver?: TargetResolver;
1000
+ /** List of available target names for code judges */
1001
+ readonly availableTargets?: readonly string[];
985
1002
  }
986
1003
  interface EvaluationScore {
987
1004
  readonly score: number;
@@ -992,6 +1009,8 @@ interface EvaluationScore {
992
1009
  readonly reasoning?: string;
993
1010
  readonly evaluatorRawRequest?: JsonObject;
994
1011
  readonly evaluatorResults?: readonly ChildEvaluatorResult[];
1012
+ /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1013
+ readonly details?: JsonObject;
995
1014
  }
996
1015
  interface ChildEvaluatorResult {
997
1016
  readonly name: string;
@@ -1004,37 +1023,37 @@ interface ChildEvaluatorResult {
1004
1023
  readonly reasoning?: string;
1005
1024
  readonly evaluatorRawRequest?: JsonObject;
1006
1025
  readonly evaluatorResults?: readonly ChildEvaluatorResult[];
1026
+ /** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
1027
+ readonly details?: JsonObject;
1007
1028
  }
1008
1029
  interface Evaluator {
1009
1030
  readonly kind: string;
1010
1031
  evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
1011
1032
  }
1012
- type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
1013
- interface LlmJudgeEvaluatorOptions {
1014
- readonly resolveJudgeProvider: JudgeProviderResolver;
1015
- readonly maxOutputTokens?: number;
1016
- readonly temperature?: number;
1017
- readonly evaluatorTemplate?: string;
1018
- }
1019
- declare class LlmJudgeEvaluator implements Evaluator {
1020
- readonly kind = "llm_judge";
1021
- private readonly resolveJudgeProvider;
1022
- private readonly maxOutputTokens?;
1023
- private readonly temperature?;
1024
- private readonly evaluatorTemplate?;
1025
- constructor(options: LlmJudgeEvaluatorOptions);
1026
- evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1027
- private evaluateFreeform;
1028
- private evaluateWithRubrics;
1029
- private buildRubricPrompt;
1030
- private runWithRetry;
1033
+ interface EvaluatorFactory {
1034
+ create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
1031
1035
  }
1036
+
1037
+ declare function scoreToVerdict(score: number): EvaluationVerdict;
1038
+ declare function clampScore(value: number): number;
1039
+ declare function extractJsonBlob(text: string): string | undefined;
1040
+ declare function parseJsonFromText(text: string): unknown;
1041
+ declare function isNonEmptyString(value: unknown): value is string;
1042
+ declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
1043
+ /**
1044
+ * Deep equality check for two values.
1045
+ * Handles primitives, arrays, and plain objects.
1046
+ */
1047
+ declare function deepEqual(a: unknown, b: unknown): boolean;
1048
+
1032
1049
  interface CodeEvaluatorOptions {
1033
1050
  readonly script: readonly string[];
1034
1051
  readonly cwd?: string;
1035
1052
  readonly agentTimeoutMs?: number;
1036
1053
  /** Pass-through configuration from YAML (any unrecognized properties) */
1037
1054
  readonly config?: Record<string, unknown>;
1055
+ /** Target access config - when present, enables target invocation for the script */
1056
+ readonly target?: TargetAccessConfig;
1038
1057
  }
1039
1058
  declare class CodeEvaluator implements Evaluator {
1040
1059
  readonly kind = "code";
@@ -1042,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
1042
1061
  private readonly cwd?;
1043
1062
  private readonly agentTimeoutMs?;
1044
1063
  private readonly config?;
1064
+ private readonly target?;
1045
1065
  constructor(options: CodeEvaluatorOptions);
1046
1066
  evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1047
1067
  }
1048
- interface ToolTrajectoryEvaluatorOptions {
1049
- readonly config: ToolTrajectoryEvaluatorConfig;
1068
+ declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
1069
+
1070
+ interface CompositeEvaluatorOptions {
1071
+ readonly config: CompositeEvaluatorConfig;
1072
+ readonly evaluatorFactory: EvaluatorFactory;
1073
+ readonly cwd?: string;
1050
1074
  }
1051
- declare class ToolTrajectoryEvaluator implements Evaluator {
1052
- readonly kind = "tool_trajectory";
1075
+ declare class CompositeEvaluator implements Evaluator {
1076
+ readonly kind = "composite";
1053
1077
  private readonly config;
1054
- constructor(options: ToolTrajectoryEvaluatorOptions);
1078
+ private readonly evaluatorFactory;
1079
+ private readonly cwd?;
1080
+ constructor(options: CompositeEvaluatorOptions);
1081
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1082
+ private aggregate;
1083
+ private runWeightedAverage;
1084
+ private runCodeAggregator;
1085
+ private runLlmAggregator;
1086
+ }
1087
+
1088
+ interface CostEvaluatorOptions {
1089
+ readonly config: CostEvaluatorConfig;
1090
+ }
1091
+ /**
1092
+ * Evaluator that checks execution cost against a budget.
1093
+ * Uses traceSummary.costUsd from the evaluation context.
1094
+ */
1095
+ declare class CostEvaluator implements Evaluator {
1096
+ readonly kind = "cost";
1097
+ private readonly config;
1098
+ constructor(options: CostEvaluatorOptions);
1055
1099
  evaluate(context: EvaluationContext): EvaluationScore;
1056
- /**
1057
- * Extract tool calls from output messages.
1058
- */
1059
- private extractToolCallsFromMessages;
1060
- /**
1061
- * Build a summary from extracted tool calls.
1062
- */
1063
- private buildSummary;
1064
- private evaluateAnyOrder;
1065
- private evaluateInOrder;
1066
- private evaluateExact;
1067
1100
  }
1101
+
1068
1102
  interface FieldAccuracyEvaluatorOptions {
1069
1103
  readonly config: FieldAccuracyEvaluatorConfig;
1070
1104
  }
@@ -1103,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
1103
1137
  */
1104
1138
  private aggregateResults;
1105
1139
  }
1106
- interface EvaluatorFactory {
1107
- create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
1108
- }
1109
- interface CompositeEvaluatorOptions {
1110
- readonly config: CompositeEvaluatorConfig;
1111
- readonly evaluatorFactory: EvaluatorFactory;
1112
- readonly cwd?: string;
1113
- }
1114
- declare class CompositeEvaluator implements Evaluator {
1115
- readonly kind = "composite";
1116
- private readonly config;
1117
- private readonly evaluatorFactory;
1118
- private readonly cwd?;
1119
- constructor(options: CompositeEvaluatorOptions);
1120
- evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1121
- private aggregate;
1122
- private runWeightedAverage;
1123
- private runCodeAggregator;
1124
- private runLlmAggregator;
1125
- }
1140
+
1126
1141
  interface LatencyEvaluatorOptions {
1127
1142
  readonly config: LatencyEvaluatorConfig;
1128
1143
  }
@@ -1136,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
1136
1151
  constructor(options: LatencyEvaluatorOptions);
1137
1152
  evaluate(context: EvaluationContext): EvaluationScore;
1138
1153
  }
1139
- interface CostEvaluatorOptions {
1140
- readonly config: CostEvaluatorConfig;
1154
+
1155
+ type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
1156
+ interface LlmJudgeEvaluatorOptions {
1157
+ readonly resolveJudgeProvider: JudgeProviderResolver;
1158
+ readonly maxOutputTokens?: number;
1159
+ readonly temperature?: number;
1160
+ readonly evaluatorTemplate?: string;
1161
+ }
1162
+ declare const freeformEvaluationSchema: z.ZodObject<{
1163
+ score: z.ZodNumber;
1164
+ hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1165
+ misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1166
+ reasoning: z.ZodOptional<z.ZodString>;
1167
+ }, "strip", z.ZodTypeAny, {
1168
+ score: number;
1169
+ hits?: string[] | undefined;
1170
+ misses?: string[] | undefined;
1171
+ reasoning?: string | undefined;
1172
+ }, {
1173
+ score: number;
1174
+ hits?: string[] | undefined;
1175
+ misses?: string[] | undefined;
1176
+ reasoning?: string | undefined;
1177
+ }>;
1178
+
1179
+ declare class LlmJudgeEvaluator implements Evaluator {
1180
+ readonly kind = "llm_judge";
1181
+ private readonly resolveJudgeProvider;
1182
+ private readonly maxOutputTokens?;
1183
+ private readonly temperature?;
1184
+ private readonly evaluatorTemplate?;
1185
+ constructor(options: LlmJudgeEvaluatorOptions);
1186
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
1187
+ private evaluateFreeform;
1188
+ private evaluateWithRubrics;
1189
+ private buildRubricPrompt;
1190
+ private runWithRetry;
1141
1191
  }
1142
1192
  /**
1143
- * Evaluator that checks execution cost against a budget.
1144
- * Uses traceSummary.costUsd from the evaluation context.
1193
+ * Build the mandatory output schema that all evaluators must follow.
1194
+ * This schema is always appended to the evaluator template.
1145
1195
  */
1146
- declare class CostEvaluator implements Evaluator {
1147
- readonly kind = "cost";
1148
- private readonly config;
1149
- constructor(options: CostEvaluatorOptions);
1150
- evaluate(context: EvaluationContext): EvaluationScore;
1151
- }
1196
+ declare function buildOutputSchema(): string;
1197
+
1152
1198
  interface TokenUsageEvaluatorOptions {
1153
1199
  readonly config: TokenUsageEvaluatorConfig;
1154
1200
  }
@@ -1163,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
1163
1209
  evaluate(context: EvaluationContext): EvaluationScore;
1164
1210
  }
1165
1211
 
1212
+ interface ToolTrajectoryEvaluatorOptions {
1213
+ readonly config: ToolTrajectoryEvaluatorConfig;
1214
+ }
1215
+ declare class ToolTrajectoryEvaluator implements Evaluator {
1216
+ readonly kind = "tool_trajectory";
1217
+ private readonly config;
1218
+ constructor(options: ToolTrajectoryEvaluatorOptions);
1219
+ evaluate(context: EvaluationContext): EvaluationScore;
1220
+ /**
1221
+ * Extract tool calls from output messages.
1222
+ */
1223
+ private extractToolCallsFromMessages;
1224
+ /**
1225
+ * Build a summary from extracted tool calls.
1226
+ */
1227
+ private buildSummary;
1228
+ private evaluateAnyOrder;
1229
+ private evaluateInOrder;
1230
+ private evaluateExact;
1231
+ }
1232
+
1166
1233
  type MaybePromise<T> = T | Promise<T>;
1167
1234
  interface EvaluationCache {
1168
1235
  get(key: string): MaybePromise<ProviderResponse | undefined>;
@@ -1182,6 +1249,10 @@ interface RunEvalCaseOptions {
1182
1249
  readonly useCache?: boolean;
1183
1250
  readonly signal?: AbortSignal;
1184
1251
  readonly judgeProvider?: Provider;
1252
+ /** Resolver for target override in code judges */
1253
+ readonly targetResolver?: (name: string) => Provider | undefined;
1254
+ /** List of available target names for code judges */
1255
+ readonly availableTargets?: readonly string[];
1185
1256
  }
1186
1257
  interface ProgressEvent {
1187
1258
  readonly workerId: number;
@@ -1230,4 +1301,4 @@ type AgentKernel = {
1230
1301
  };
1231
1302
  declare function createAgentKernel(): AgentKernel;
1232
1303
 
1233
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
1304
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };