@agentv/core 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1457 -1121
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +142 -71
- package/dist/index.d.ts +142 -71
- package/dist/index.js +1295 -968
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/dist/index.d.cts
CHANGED
|
@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
|
|
|
204
204
|
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
205
205
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
206
206
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
207
|
+
/**
|
|
208
|
+
* Configuration for enabling target access in code_judge evaluators.
|
|
209
|
+
* When present, the runtime will start a local proxy server that allows
|
|
210
|
+
* the script to invoke configured targets without direct credential access.
|
|
211
|
+
*/
|
|
212
|
+
type TargetAccessConfig = {
|
|
213
|
+
/** Maximum number of target invocations allowed per execution (default: 50) */
|
|
214
|
+
readonly max_calls?: number;
|
|
215
|
+
};
|
|
207
216
|
type CodeEvaluatorConfig = {
|
|
208
217
|
readonly name: string;
|
|
209
218
|
readonly type: 'code';
|
|
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
|
|
|
214
223
|
readonly weight?: number;
|
|
215
224
|
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
216
225
|
readonly config?: JsonObject;
|
|
226
|
+
/** When present, enables target access for the script via local proxy */
|
|
227
|
+
readonly target?: TargetAccessConfig;
|
|
217
228
|
};
|
|
218
229
|
type LlmJudgeEvaluatorConfig = {
|
|
219
230
|
readonly name: string;
|
|
@@ -343,7 +354,6 @@ interface EvalCase {
|
|
|
343
354
|
readonly guideline_paths: readonly string[];
|
|
344
355
|
readonly guideline_patterns?: readonly string[];
|
|
345
356
|
readonly file_paths: readonly string[];
|
|
346
|
-
readonly code_snippets: readonly string[];
|
|
347
357
|
readonly expected_outcome: string;
|
|
348
358
|
readonly evaluator?: EvaluatorKind;
|
|
349
359
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
@@ -383,6 +393,8 @@ interface EvaluatorResult {
|
|
|
383
393
|
readonly rawRequest?: JsonObject;
|
|
384
394
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
385
395
|
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
396
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
397
|
+
readonly details?: JsonObject;
|
|
386
398
|
}
|
|
387
399
|
/**
|
|
388
400
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -566,10 +578,6 @@ interface TargetDefinition {
|
|
|
566
578
|
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
567
579
|
*/
|
|
568
580
|
type FormattingMode = 'agent' | 'lm';
|
|
569
|
-
/**
|
|
570
|
-
* Extract fenced code blocks from AgentV user segments.
|
|
571
|
-
*/
|
|
572
|
-
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
573
581
|
|
|
574
582
|
/**
|
|
575
583
|
* Build prompt inputs by consolidating user request context and guideline content.
|
|
@@ -962,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
|
|
|
962
970
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
963
971
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
964
972
|
|
|
973
|
+
/**
|
|
974
|
+
* Function to resolve a target name to a provider.
|
|
975
|
+
* Used by code judges to support target override.
|
|
976
|
+
*/
|
|
977
|
+
type TargetResolver = (targetName: string) => Provider | undefined;
|
|
965
978
|
interface EvaluationContext {
|
|
966
979
|
readonly evalCase: EvalCase;
|
|
967
980
|
readonly candidate: string;
|
|
@@ -982,6 +995,10 @@ interface EvaluationContext {
|
|
|
982
995
|
readonly outputMessages?: readonly OutputMessage[];
|
|
983
996
|
/** Lightweight summary of trace events (if available) */
|
|
984
997
|
readonly traceSummary?: TraceSummary;
|
|
998
|
+
/** Resolver for target override in code judges */
|
|
999
|
+
readonly targetResolver?: TargetResolver;
|
|
1000
|
+
/** List of available target names for code judges */
|
|
1001
|
+
readonly availableTargets?: readonly string[];
|
|
985
1002
|
}
|
|
986
1003
|
interface EvaluationScore {
|
|
987
1004
|
readonly score: number;
|
|
@@ -992,6 +1009,8 @@ interface EvaluationScore {
|
|
|
992
1009
|
readonly reasoning?: string;
|
|
993
1010
|
readonly evaluatorRawRequest?: JsonObject;
|
|
994
1011
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
1012
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1013
|
+
readonly details?: JsonObject;
|
|
995
1014
|
}
|
|
996
1015
|
interface ChildEvaluatorResult {
|
|
997
1016
|
readonly name: string;
|
|
@@ -1004,37 +1023,37 @@ interface ChildEvaluatorResult {
|
|
|
1004
1023
|
readonly reasoning?: string;
|
|
1005
1024
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1006
1025
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
1026
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1027
|
+
readonly details?: JsonObject;
|
|
1007
1028
|
}
|
|
1008
1029
|
interface Evaluator {
|
|
1009
1030
|
readonly kind: string;
|
|
1010
1031
|
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
1011
1032
|
}
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
1015
|
-
readonly maxOutputTokens?: number;
|
|
1016
|
-
readonly temperature?: number;
|
|
1017
|
-
readonly evaluatorTemplate?: string;
|
|
1018
|
-
}
|
|
1019
|
-
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1020
|
-
readonly kind = "llm_judge";
|
|
1021
|
-
private readonly resolveJudgeProvider;
|
|
1022
|
-
private readonly maxOutputTokens?;
|
|
1023
|
-
private readonly temperature?;
|
|
1024
|
-
private readonly evaluatorTemplate?;
|
|
1025
|
-
constructor(options: LlmJudgeEvaluatorOptions);
|
|
1026
|
-
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1027
|
-
private evaluateFreeform;
|
|
1028
|
-
private evaluateWithRubrics;
|
|
1029
|
-
private buildRubricPrompt;
|
|
1030
|
-
private runWithRetry;
|
|
1033
|
+
interface EvaluatorFactory {
|
|
1034
|
+
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
1031
1035
|
}
|
|
1036
|
+
|
|
1037
|
+
declare function scoreToVerdict(score: number): EvaluationVerdict;
|
|
1038
|
+
declare function clampScore(value: number): number;
|
|
1039
|
+
declare function extractJsonBlob(text: string): string | undefined;
|
|
1040
|
+
declare function parseJsonFromText(text: string): unknown;
|
|
1041
|
+
declare function isNonEmptyString(value: unknown): value is string;
|
|
1042
|
+
declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
|
|
1043
|
+
/**
|
|
1044
|
+
* Deep equality check for two values.
|
|
1045
|
+
* Handles primitives, arrays, and plain objects.
|
|
1046
|
+
*/
|
|
1047
|
+
declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
1048
|
+
|
|
1032
1049
|
interface CodeEvaluatorOptions {
|
|
1033
1050
|
readonly script: readonly string[];
|
|
1034
1051
|
readonly cwd?: string;
|
|
1035
1052
|
readonly agentTimeoutMs?: number;
|
|
1036
1053
|
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1037
1054
|
readonly config?: Record<string, unknown>;
|
|
1055
|
+
/** Target access config - when present, enables target invocation for the script */
|
|
1056
|
+
readonly target?: TargetAccessConfig;
|
|
1038
1057
|
}
|
|
1039
1058
|
declare class CodeEvaluator implements Evaluator {
|
|
1040
1059
|
readonly kind = "code";
|
|
@@ -1042,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
|
|
|
1042
1061
|
private readonly cwd?;
|
|
1043
1062
|
private readonly agentTimeoutMs?;
|
|
1044
1063
|
private readonly config?;
|
|
1064
|
+
private readonly target?;
|
|
1045
1065
|
constructor(options: CodeEvaluatorOptions);
|
|
1046
1066
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1047
1067
|
}
|
|
1048
|
-
|
|
1049
|
-
|
|
1068
|
+
declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
|
|
1069
|
+
|
|
1070
|
+
interface CompositeEvaluatorOptions {
|
|
1071
|
+
readonly config: CompositeEvaluatorConfig;
|
|
1072
|
+
readonly evaluatorFactory: EvaluatorFactory;
|
|
1073
|
+
readonly cwd?: string;
|
|
1050
1074
|
}
|
|
1051
|
-
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
1052
|
-
readonly kind = "tool_trajectory";
|
|
1075
|
+
declare class CompositeEvaluator implements Evaluator {
|
|
1076
|
+
readonly kind = "composite";
|
|
1053
1077
|
private readonly config;
|
|
1054
|
-
|
|
1078
|
+
private readonly evaluatorFactory;
|
|
1079
|
+
private readonly cwd?;
|
|
1080
|
+
constructor(options: CompositeEvaluatorOptions);
|
|
1081
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1082
|
+
private aggregate;
|
|
1083
|
+
private runWeightedAverage;
|
|
1084
|
+
private runCodeAggregator;
|
|
1085
|
+
private runLlmAggregator;
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
interface CostEvaluatorOptions {
|
|
1089
|
+
readonly config: CostEvaluatorConfig;
|
|
1090
|
+
}
|
|
1091
|
+
/**
|
|
1092
|
+
* Evaluator that checks execution cost against a budget.
|
|
1093
|
+
* Uses traceSummary.costUsd from the evaluation context.
|
|
1094
|
+
*/
|
|
1095
|
+
declare class CostEvaluator implements Evaluator {
|
|
1096
|
+
readonly kind = "cost";
|
|
1097
|
+
private readonly config;
|
|
1098
|
+
constructor(options: CostEvaluatorOptions);
|
|
1055
1099
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1056
|
-
/**
|
|
1057
|
-
* Extract tool calls from output messages.
|
|
1058
|
-
*/
|
|
1059
|
-
private extractToolCallsFromMessages;
|
|
1060
|
-
/**
|
|
1061
|
-
* Build a summary from extracted tool calls.
|
|
1062
|
-
*/
|
|
1063
|
-
private buildSummary;
|
|
1064
|
-
private evaluateAnyOrder;
|
|
1065
|
-
private evaluateInOrder;
|
|
1066
|
-
private evaluateExact;
|
|
1067
1100
|
}
|
|
1101
|
+
|
|
1068
1102
|
interface FieldAccuracyEvaluatorOptions {
|
|
1069
1103
|
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1070
1104
|
}
|
|
@@ -1103,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
|
|
|
1103
1137
|
*/
|
|
1104
1138
|
private aggregateResults;
|
|
1105
1139
|
}
|
|
1106
|
-
|
|
1107
|
-
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
1108
|
-
}
|
|
1109
|
-
interface CompositeEvaluatorOptions {
|
|
1110
|
-
readonly config: CompositeEvaluatorConfig;
|
|
1111
|
-
readonly evaluatorFactory: EvaluatorFactory;
|
|
1112
|
-
readonly cwd?: string;
|
|
1113
|
-
}
|
|
1114
|
-
declare class CompositeEvaluator implements Evaluator {
|
|
1115
|
-
readonly kind = "composite";
|
|
1116
|
-
private readonly config;
|
|
1117
|
-
private readonly evaluatorFactory;
|
|
1118
|
-
private readonly cwd?;
|
|
1119
|
-
constructor(options: CompositeEvaluatorOptions);
|
|
1120
|
-
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1121
|
-
private aggregate;
|
|
1122
|
-
private runWeightedAverage;
|
|
1123
|
-
private runCodeAggregator;
|
|
1124
|
-
private runLlmAggregator;
|
|
1125
|
-
}
|
|
1140
|
+
|
|
1126
1141
|
interface LatencyEvaluatorOptions {
|
|
1127
1142
|
readonly config: LatencyEvaluatorConfig;
|
|
1128
1143
|
}
|
|
@@ -1136,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
|
|
|
1136
1151
|
constructor(options: LatencyEvaluatorOptions);
|
|
1137
1152
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1138
1153
|
}
|
|
1139
|
-
|
|
1140
|
-
|
|
1154
|
+
|
|
1155
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
1156
|
+
interface LlmJudgeEvaluatorOptions {
|
|
1157
|
+
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
1158
|
+
readonly maxOutputTokens?: number;
|
|
1159
|
+
readonly temperature?: number;
|
|
1160
|
+
readonly evaluatorTemplate?: string;
|
|
1161
|
+
}
|
|
1162
|
+
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
1163
|
+
score: z.ZodNumber;
|
|
1164
|
+
hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1165
|
+
misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1166
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
1167
|
+
}, "strip", z.ZodTypeAny, {
|
|
1168
|
+
score: number;
|
|
1169
|
+
hits?: string[] | undefined;
|
|
1170
|
+
misses?: string[] | undefined;
|
|
1171
|
+
reasoning?: string | undefined;
|
|
1172
|
+
}, {
|
|
1173
|
+
score: number;
|
|
1174
|
+
hits?: string[] | undefined;
|
|
1175
|
+
misses?: string[] | undefined;
|
|
1176
|
+
reasoning?: string | undefined;
|
|
1177
|
+
}>;
|
|
1178
|
+
|
|
1179
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1180
|
+
readonly kind = "llm_judge";
|
|
1181
|
+
private readonly resolveJudgeProvider;
|
|
1182
|
+
private readonly maxOutputTokens?;
|
|
1183
|
+
private readonly temperature?;
|
|
1184
|
+
private readonly evaluatorTemplate?;
|
|
1185
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
1186
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1187
|
+
private evaluateFreeform;
|
|
1188
|
+
private evaluateWithRubrics;
|
|
1189
|
+
private buildRubricPrompt;
|
|
1190
|
+
private runWithRetry;
|
|
1141
1191
|
}
|
|
1142
1192
|
/**
|
|
1143
|
-
* Evaluator that checks execution cost against a budget.
|
|
1144
|
-
* Uses traceSummary.costUsd from the evaluation context.
|
|
1193
|
+
* Build the mandatory output schema that all evaluators must follow.
|
|
1194
|
+
* This schema is always appended to the evaluator template.
|
|
1145
1195
|
*/
|
|
1146
|
-
declare class CostEvaluator implements Evaluator {
|
|
1147
|
-
|
|
1148
|
-
private readonly config;
|
|
1149
|
-
constructor(options: CostEvaluatorOptions);
|
|
1150
|
-
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1151
|
-
}
|
|
1196
|
+
declare function buildOutputSchema(): string;
|
|
1197
|
+
|
|
1152
1198
|
interface TokenUsageEvaluatorOptions {
|
|
1153
1199
|
readonly config: TokenUsageEvaluatorConfig;
|
|
1154
1200
|
}
|
|
@@ -1163,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
|
|
|
1163
1209
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1164
1210
|
}
|
|
1165
1211
|
|
|
1212
|
+
interface ToolTrajectoryEvaluatorOptions {
|
|
1213
|
+
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
1214
|
+
}
|
|
1215
|
+
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
1216
|
+
readonly kind = "tool_trajectory";
|
|
1217
|
+
private readonly config;
|
|
1218
|
+
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
1219
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1220
|
+
/**
|
|
1221
|
+
* Extract tool calls from output messages.
|
|
1222
|
+
*/
|
|
1223
|
+
private extractToolCallsFromMessages;
|
|
1224
|
+
/**
|
|
1225
|
+
* Build a summary from extracted tool calls.
|
|
1226
|
+
*/
|
|
1227
|
+
private buildSummary;
|
|
1228
|
+
private evaluateAnyOrder;
|
|
1229
|
+
private evaluateInOrder;
|
|
1230
|
+
private evaluateExact;
|
|
1231
|
+
}
|
|
1232
|
+
|
|
1166
1233
|
type MaybePromise<T> = T | Promise<T>;
|
|
1167
1234
|
interface EvaluationCache {
|
|
1168
1235
|
get(key: string): MaybePromise<ProviderResponse | undefined>;
|
|
@@ -1182,6 +1249,10 @@ interface RunEvalCaseOptions {
|
|
|
1182
1249
|
readonly useCache?: boolean;
|
|
1183
1250
|
readonly signal?: AbortSignal;
|
|
1184
1251
|
readonly judgeProvider?: Provider;
|
|
1252
|
+
/** Resolver for target override in code judges */
|
|
1253
|
+
readonly targetResolver?: (name: string) => Provider | undefined;
|
|
1254
|
+
/** List of available target names for code judges */
|
|
1255
|
+
readonly availableTargets?: readonly string[];
|
|
1185
1256
|
}
|
|
1186
1257
|
interface ProgressEvent {
|
|
1187
1258
|
readonly workerId: number;
|
|
@@ -1230,4 +1301,4 @@ type AgentKernel = {
|
|
|
1230
1301
|
};
|
|
1231
1302
|
declare function createAgentKernel(): AgentKernel;
|
|
1232
1303
|
|
|
1233
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type 
VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio,
|
|
1304
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type 
UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|
package/dist/index.d.ts
CHANGED
|
@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
|
|
|
204
204
|
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
205
205
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
206
206
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
207
|
+
/**
|
|
208
|
+
* Configuration for enabling target access in code_judge evaluators.
|
|
209
|
+
* When present, the runtime will start a local proxy server that allows
|
|
210
|
+
* the script to invoke configured targets without direct credential access.
|
|
211
|
+
*/
|
|
212
|
+
type TargetAccessConfig = {
|
|
213
|
+
/** Maximum number of target invocations allowed per execution (default: 50) */
|
|
214
|
+
readonly max_calls?: number;
|
|
215
|
+
};
|
|
207
216
|
type CodeEvaluatorConfig = {
|
|
208
217
|
readonly name: string;
|
|
209
218
|
readonly type: 'code';
|
|
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
|
|
|
214
223
|
readonly weight?: number;
|
|
215
224
|
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
216
225
|
readonly config?: JsonObject;
|
|
226
|
+
/** When present, enables target access for the script via local proxy */
|
|
227
|
+
readonly target?: TargetAccessConfig;
|
|
217
228
|
};
|
|
218
229
|
type LlmJudgeEvaluatorConfig = {
|
|
219
230
|
readonly name: string;
|
|
@@ -343,7 +354,6 @@ interface EvalCase {
|
|
|
343
354
|
readonly guideline_paths: readonly string[];
|
|
344
355
|
readonly guideline_patterns?: readonly string[];
|
|
345
356
|
readonly file_paths: readonly string[];
|
|
346
|
-
readonly code_snippets: readonly string[];
|
|
347
357
|
readonly expected_outcome: string;
|
|
348
358
|
readonly evaluator?: EvaluatorKind;
|
|
349
359
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
@@ -383,6 +393,8 @@ interface EvaluatorResult {
|
|
|
383
393
|
readonly rawRequest?: JsonObject;
|
|
384
394
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
385
395
|
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
396
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
397
|
+
readonly details?: JsonObject;
|
|
386
398
|
}
|
|
387
399
|
/**
|
|
388
400
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -566,10 +578,6 @@ interface TargetDefinition {
|
|
|
566
578
|
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
567
579
|
*/
|
|
568
580
|
type FormattingMode = 'agent' | 'lm';
|
|
569
|
-
/**
|
|
570
|
-
* Extract fenced code blocks from AgentV user segments.
|
|
571
|
-
*/
|
|
572
|
-
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
573
581
|
|
|
574
582
|
/**
|
|
575
583
|
* Build prompt inputs by consolidating user request context and guideline content.
|
|
@@ -962,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
|
|
|
962
970
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
963
971
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
964
972
|
|
|
973
|
+
/**
|
|
974
|
+
* Function to resolve a target name to a provider.
|
|
975
|
+
* Used by code judges to support target override.
|
|
976
|
+
*/
|
|
977
|
+
type TargetResolver = (targetName: string) => Provider | undefined;
|
|
965
978
|
interface EvaluationContext {
|
|
966
979
|
readonly evalCase: EvalCase;
|
|
967
980
|
readonly candidate: string;
|
|
@@ -982,6 +995,10 @@ interface EvaluationContext {
|
|
|
982
995
|
readonly outputMessages?: readonly OutputMessage[];
|
|
983
996
|
/** Lightweight summary of trace events (if available) */
|
|
984
997
|
readonly traceSummary?: TraceSummary;
|
|
998
|
+
/** Resolver for target override in code judges */
|
|
999
|
+
readonly targetResolver?: TargetResolver;
|
|
1000
|
+
/** List of available target names for code judges */
|
|
1001
|
+
readonly availableTargets?: readonly string[];
|
|
985
1002
|
}
|
|
986
1003
|
interface EvaluationScore {
|
|
987
1004
|
readonly score: number;
|
|
@@ -992,6 +1009,8 @@ interface EvaluationScore {
|
|
|
992
1009
|
readonly reasoning?: string;
|
|
993
1010
|
readonly evaluatorRawRequest?: JsonObject;
|
|
994
1011
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
1012
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1013
|
+
readonly details?: JsonObject;
|
|
995
1014
|
}
|
|
996
1015
|
interface ChildEvaluatorResult {
|
|
997
1016
|
readonly name: string;
|
|
@@ -1004,37 +1023,37 @@ interface ChildEvaluatorResult {
|
|
|
1004
1023
|
readonly reasoning?: string;
|
|
1005
1024
|
readonly evaluatorRawRequest?: JsonObject;
|
|
1006
1025
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
1026
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1027
|
+
readonly details?: JsonObject;
|
|
1007
1028
|
}
|
|
1008
1029
|
interface Evaluator {
|
|
1009
1030
|
readonly kind: string;
|
|
1010
1031
|
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
1011
1032
|
}
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
1015
|
-
readonly maxOutputTokens?: number;
|
|
1016
|
-
readonly temperature?: number;
|
|
1017
|
-
readonly evaluatorTemplate?: string;
|
|
1018
|
-
}
|
|
1019
|
-
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1020
|
-
readonly kind = "llm_judge";
|
|
1021
|
-
private readonly resolveJudgeProvider;
|
|
1022
|
-
private readonly maxOutputTokens?;
|
|
1023
|
-
private readonly temperature?;
|
|
1024
|
-
private readonly evaluatorTemplate?;
|
|
1025
|
-
constructor(options: LlmJudgeEvaluatorOptions);
|
|
1026
|
-
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1027
|
-
private evaluateFreeform;
|
|
1028
|
-
private evaluateWithRubrics;
|
|
1029
|
-
private buildRubricPrompt;
|
|
1030
|
-
private runWithRetry;
|
|
1033
|
+
interface EvaluatorFactory {
|
|
1034
|
+
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
1031
1035
|
}
|
|
1036
|
+
|
|
1037
|
+
declare function scoreToVerdict(score: number): EvaluationVerdict;
|
|
1038
|
+
declare function clampScore(value: number): number;
|
|
1039
|
+
declare function extractJsonBlob(text: string): string | undefined;
|
|
1040
|
+
declare function parseJsonFromText(text: string): unknown;
|
|
1041
|
+
declare function isNonEmptyString(value: unknown): value is string;
|
|
1042
|
+
declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
|
|
1043
|
+
/**
|
|
1044
|
+
* Deep equality check for two values.
|
|
1045
|
+
* Handles primitives, arrays, and plain objects.
|
|
1046
|
+
*/
|
|
1047
|
+
declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
1048
|
+
|
|
1032
1049
|
interface CodeEvaluatorOptions {
|
|
1033
1050
|
readonly script: readonly string[];
|
|
1034
1051
|
readonly cwd?: string;
|
|
1035
1052
|
readonly agentTimeoutMs?: number;
|
|
1036
1053
|
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1037
1054
|
readonly config?: Record<string, unknown>;
|
|
1055
|
+
/** Target access config - when present, enables target invocation for the script */
|
|
1056
|
+
readonly target?: TargetAccessConfig;
|
|
1038
1057
|
}
|
|
1039
1058
|
declare class CodeEvaluator implements Evaluator {
|
|
1040
1059
|
readonly kind = "code";
|
|
@@ -1042,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
|
|
|
1042
1061
|
private readonly cwd?;
|
|
1043
1062
|
private readonly agentTimeoutMs?;
|
|
1044
1063
|
private readonly config?;
|
|
1064
|
+
private readonly target?;
|
|
1045
1065
|
constructor(options: CodeEvaluatorOptions);
|
|
1046
1066
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1047
1067
|
}
|
|
1048
|
-
|
|
1049
|
-
|
|
1068
|
+
declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
|
|
1069
|
+
|
|
1070
|
+
interface CompositeEvaluatorOptions {
|
|
1071
|
+
readonly config: CompositeEvaluatorConfig;
|
|
1072
|
+
readonly evaluatorFactory: EvaluatorFactory;
|
|
1073
|
+
readonly cwd?: string;
|
|
1050
1074
|
}
|
|
1051
|
-
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
1052
|
-
readonly kind = "tool_trajectory";
|
|
1075
|
+
declare class CompositeEvaluator implements Evaluator {
|
|
1076
|
+
readonly kind = "composite";
|
|
1053
1077
|
private readonly config;
|
|
1054
|
-
|
|
1078
|
+
private readonly evaluatorFactory;
|
|
1079
|
+
private readonly cwd?;
|
|
1080
|
+
constructor(options: CompositeEvaluatorOptions);
|
|
1081
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1082
|
+
private aggregate;
|
|
1083
|
+
private runWeightedAverage;
|
|
1084
|
+
private runCodeAggregator;
|
|
1085
|
+
private runLlmAggregator;
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
interface CostEvaluatorOptions {
|
|
1089
|
+
readonly config: CostEvaluatorConfig;
|
|
1090
|
+
}
|
|
1091
|
+
/**
|
|
1092
|
+
* Evaluator that checks execution cost against a budget.
|
|
1093
|
+
* Uses traceSummary.costUsd from the evaluation context.
|
|
1094
|
+
*/
|
|
1095
|
+
declare class CostEvaluator implements Evaluator {
|
|
1096
|
+
readonly kind = "cost";
|
|
1097
|
+
private readonly config;
|
|
1098
|
+
constructor(options: CostEvaluatorOptions);
|
|
1055
1099
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1056
|
-
/**
|
|
1057
|
-
* Extract tool calls from output messages.
|
|
1058
|
-
*/
|
|
1059
|
-
private extractToolCallsFromMessages;
|
|
1060
|
-
/**
|
|
1061
|
-
* Build a summary from extracted tool calls.
|
|
1062
|
-
*/
|
|
1063
|
-
private buildSummary;
|
|
1064
|
-
private evaluateAnyOrder;
|
|
1065
|
-
private evaluateInOrder;
|
|
1066
|
-
private evaluateExact;
|
|
1067
1100
|
}
|
|
1101
|
+
|
|
1068
1102
|
interface FieldAccuracyEvaluatorOptions {
|
|
1069
1103
|
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1070
1104
|
}
|
|
@@ -1103,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
|
|
|
1103
1137
|
*/
|
|
1104
1138
|
private aggregateResults;
|
|
1105
1139
|
}
|
|
1106
|
-
|
|
1107
|
-
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
1108
|
-
}
|
|
1109
|
-
interface CompositeEvaluatorOptions {
|
|
1110
|
-
readonly config: CompositeEvaluatorConfig;
|
|
1111
|
-
readonly evaluatorFactory: EvaluatorFactory;
|
|
1112
|
-
readonly cwd?: string;
|
|
1113
|
-
}
|
|
1114
|
-
declare class CompositeEvaluator implements Evaluator {
|
|
1115
|
-
readonly kind = "composite";
|
|
1116
|
-
private readonly config;
|
|
1117
|
-
private readonly evaluatorFactory;
|
|
1118
|
-
private readonly cwd?;
|
|
1119
|
-
constructor(options: CompositeEvaluatorOptions);
|
|
1120
|
-
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1121
|
-
private aggregate;
|
|
1122
|
-
private runWeightedAverage;
|
|
1123
|
-
private runCodeAggregator;
|
|
1124
|
-
private runLlmAggregator;
|
|
1125
|
-
}
|
|
1140
|
+
|
|
1126
1141
|
interface LatencyEvaluatorOptions {
|
|
1127
1142
|
readonly config: LatencyEvaluatorConfig;
|
|
1128
1143
|
}
|
|
@@ -1136,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
|
|
|
1136
1151
|
constructor(options: LatencyEvaluatorOptions);
|
|
1137
1152
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1138
1153
|
}
|
|
1139
|
-
|
|
1140
|
-
|
|
1154
|
+
|
|
1155
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
1156
|
+
interface LlmJudgeEvaluatorOptions {
|
|
1157
|
+
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
1158
|
+
readonly maxOutputTokens?: number;
|
|
1159
|
+
readonly temperature?: number;
|
|
1160
|
+
readonly evaluatorTemplate?: string;
|
|
1161
|
+
}
|
|
1162
|
+
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
1163
|
+
score: z.ZodNumber;
|
|
1164
|
+
hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1165
|
+
misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1166
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
1167
|
+
}, "strip", z.ZodTypeAny, {
|
|
1168
|
+
score: number;
|
|
1169
|
+
hits?: string[] | undefined;
|
|
1170
|
+
misses?: string[] | undefined;
|
|
1171
|
+
reasoning?: string | undefined;
|
|
1172
|
+
}, {
|
|
1173
|
+
score: number;
|
|
1174
|
+
hits?: string[] | undefined;
|
|
1175
|
+
misses?: string[] | undefined;
|
|
1176
|
+
reasoning?: string | undefined;
|
|
1177
|
+
}>;
|
|
1178
|
+
|
|
1179
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1180
|
+
readonly kind = "llm_judge";
|
|
1181
|
+
private readonly resolveJudgeProvider;
|
|
1182
|
+
private readonly maxOutputTokens?;
|
|
1183
|
+
private readonly temperature?;
|
|
1184
|
+
private readonly evaluatorTemplate?;
|
|
1185
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
1186
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1187
|
+
private evaluateFreeform;
|
|
1188
|
+
private evaluateWithRubrics;
|
|
1189
|
+
private buildRubricPrompt;
|
|
1190
|
+
private runWithRetry;
|
|
1141
1191
|
}
|
|
1142
1192
|
/**
|
|
1143
|
-
* Evaluator that checks execution cost against a budget.
|
|
1144
|
-
* Uses traceSummary.costUsd from the evaluation context.
|
|
1193
|
+
* Build the mandatory output schema that all evaluators must follow.
|
|
1194
|
+
* This schema is always appended to the evaluator template.
|
|
1145
1195
|
*/
|
|
1146
|
-
declare class CostEvaluator implements Evaluator {
|
|
1147
|
-
|
|
1148
|
-
private readonly config;
|
|
1149
|
-
constructor(options: CostEvaluatorOptions);
|
|
1150
|
-
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1151
|
-
}
|
|
1196
|
+
declare function buildOutputSchema(): string;
|
|
1197
|
+
|
|
1152
1198
|
interface TokenUsageEvaluatorOptions {
|
|
1153
1199
|
readonly config: TokenUsageEvaluatorConfig;
|
|
1154
1200
|
}
|
|
@@ -1163,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
|
|
|
1163
1209
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1164
1210
|
}
|
|
1165
1211
|
|
|
1212
|
+
interface ToolTrajectoryEvaluatorOptions {
|
|
1213
|
+
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
1214
|
+
}
|
|
1215
|
+
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
1216
|
+
readonly kind = "tool_trajectory";
|
|
1217
|
+
private readonly config;
|
|
1218
|
+
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
1219
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1220
|
+
/**
|
|
1221
|
+
* Extract tool calls from output messages.
|
|
1222
|
+
*/
|
|
1223
|
+
private extractToolCallsFromMessages;
|
|
1224
|
+
/**
|
|
1225
|
+
* Build a summary from extracted tool calls.
|
|
1226
|
+
*/
|
|
1227
|
+
private buildSummary;
|
|
1228
|
+
private evaluateAnyOrder;
|
|
1229
|
+
private evaluateInOrder;
|
|
1230
|
+
private evaluateExact;
|
|
1231
|
+
}
|
|
1232
|
+
|
|
1166
1233
|
type MaybePromise<T> = T | Promise<T>;
|
|
1167
1234
|
interface EvaluationCache {
|
|
1168
1235
|
get(key: string): MaybePromise<ProviderResponse | undefined>;
|
|
@@ -1182,6 +1249,10 @@ interface RunEvalCaseOptions {
|
|
|
1182
1249
|
readonly useCache?: boolean;
|
|
1183
1250
|
readonly signal?: AbortSignal;
|
|
1184
1251
|
readonly judgeProvider?: Provider;
|
|
1252
|
+
/** Resolver for target override in code judges */
|
|
1253
|
+
readonly targetResolver?: (name: string) => Provider | undefined;
|
|
1254
|
+
/** List of available target names for code judges */
|
|
1255
|
+
readonly availableTargets?: readonly string[];
|
|
1185
1256
|
}
|
|
1186
1257
|
interface ProgressEvent {
|
|
1187
1258
|
readonly workerId: number;
|
|
@@ -1230,4 +1301,4 @@ type AgentKernel = {
|
|
|
1230
1301
|
};
|
|
1231
1302
|
declare function createAgentKernel(): AgentKernel;
|
|
1232
1303
|
|
|
1233
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type 
VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio,
|
|
1304
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type 
UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|