@agentv/core 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-IBTKEEOT.js → chunk-KDEP4I7G.js} +44 -1
- package/dist/chunk-KDEP4I7G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +1 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +1641 -1138
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +157 -100
- package/dist/index.d.ts +157 -100
- package/dist/index.js +1451 -997
- package/dist/index.js.map +1 -1
- package/package.json +4 -1
- package/dist/chunk-IBTKEEOT.js.map +0 -1
package/dist/index.d.ts
CHANGED
|
@@ -204,6 +204,15 @@ declare function isTestMessage(value: unknown): value is TestMessage;
|
|
|
204
204
|
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
205
205
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
206
206
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
207
|
+
/**
|
|
208
|
+
* Configuration for enabling target access in code_judge evaluators.
|
|
209
|
+
* When present, the runtime will start a local proxy server that allows
|
|
210
|
+
* the script to invoke configured targets without direct credential access.
|
|
211
|
+
*/
|
|
212
|
+
type TargetAccessConfig = {
|
|
213
|
+
/** Maximum number of target invocations allowed per execution (default: 50) */
|
|
214
|
+
readonly max_calls?: number;
|
|
215
|
+
};
|
|
207
216
|
type CodeEvaluatorConfig = {
|
|
208
217
|
readonly name: string;
|
|
209
218
|
readonly type: 'code';
|
|
@@ -214,6 +223,8 @@ type CodeEvaluatorConfig = {
|
|
|
214
223
|
readonly weight?: number;
|
|
215
224
|
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
216
225
|
readonly config?: JsonObject;
|
|
226
|
+
/** When present, enables target access for the script via local proxy */
|
|
227
|
+
readonly target?: TargetAccessConfig;
|
|
217
228
|
};
|
|
218
229
|
type LlmJudgeEvaluatorConfig = {
|
|
219
230
|
readonly name: string;
|
|
@@ -343,7 +354,6 @@ interface EvalCase {
|
|
|
343
354
|
readonly guideline_paths: readonly string[];
|
|
344
355
|
readonly guideline_patterns?: readonly string[];
|
|
345
356
|
readonly file_paths: readonly string[];
|
|
346
|
-
readonly code_snippets: readonly string[];
|
|
347
357
|
readonly expected_outcome: string;
|
|
348
358
|
readonly evaluator?: EvaluatorKind;
|
|
349
359
|
readonly evaluators?: readonly EvaluatorConfig[];
|
|
@@ -383,6 +393,8 @@ interface EvaluatorResult {
|
|
|
383
393
|
readonly rawRequest?: JsonObject;
|
|
384
394
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
385
395
|
readonly evaluatorResults?: readonly EvaluatorResult[];
|
|
396
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts). */
|
|
397
|
+
readonly details?: JsonObject;
|
|
386
398
|
}
|
|
387
399
|
/**
|
|
388
400
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -396,7 +408,7 @@ interface ChatMessage {
|
|
|
396
408
|
readonly name?: string;
|
|
397
409
|
}
|
|
398
410
|
type ChatPrompt = readonly ChatMessage[];
|
|
399
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
411
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
400
412
|
interface ProviderRequest {
|
|
401
413
|
readonly question: string;
|
|
402
414
|
readonly systemPrompt?: string;
|
|
@@ -566,10 +578,6 @@ interface TargetDefinition {
|
|
|
566
578
|
* - 'lm': Embedded file content with XML tags (for language model providers)
|
|
567
579
|
*/
|
|
568
580
|
type FormattingMode = 'agent' | 'lm';
|
|
569
|
-
/**
|
|
570
|
-
* Extract fenced code blocks from AgentV user segments.
|
|
571
|
-
*/
|
|
572
|
-
declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
|
|
573
581
|
|
|
574
582
|
/**
|
|
575
583
|
* Build prompt inputs by consolidating user request context and guideline content.
|
|
@@ -805,6 +813,13 @@ interface PiCodingAgentResolvedConfig {
|
|
|
805
813
|
readonly logFormat?: 'summary' | 'json';
|
|
806
814
|
readonly systemPrompt?: string;
|
|
807
815
|
}
|
|
816
|
+
interface PiAgentSdkResolvedConfig {
|
|
817
|
+
readonly provider?: string;
|
|
818
|
+
readonly model?: string;
|
|
819
|
+
readonly apiKey?: string;
|
|
820
|
+
readonly timeoutMs?: number;
|
|
821
|
+
readonly systemPrompt?: string;
|
|
822
|
+
}
|
|
808
823
|
interface ClaudeCodeResolvedConfig {
|
|
809
824
|
readonly executable: string;
|
|
810
825
|
readonly model?: string;
|
|
@@ -863,6 +878,13 @@ type ResolvedTarget = {
|
|
|
863
878
|
readonly workers?: number;
|
|
864
879
|
readonly providerBatching?: boolean;
|
|
865
880
|
readonly config: PiCodingAgentResolvedConfig;
|
|
881
|
+
} | {
|
|
882
|
+
readonly kind: 'pi-agent-sdk';
|
|
883
|
+
readonly name: string;
|
|
884
|
+
readonly judgeTarget?: string;
|
|
885
|
+
readonly workers?: number;
|
|
886
|
+
readonly providerBatching?: boolean;
|
|
887
|
+
readonly config: PiAgentSdkResolvedConfig;
|
|
866
888
|
} | {
|
|
867
889
|
readonly kind: 'claude-code';
|
|
868
890
|
readonly name: string;
|
|
@@ -948,6 +970,11 @@ declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener
|
|
|
948
970
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
949
971
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
950
972
|
|
|
973
|
+
/**
|
|
974
|
+
* Function to resolve a target name to a provider.
|
|
975
|
+
* Used by code judges to support target override.
|
|
976
|
+
*/
|
|
977
|
+
type TargetResolver = (targetName: string) => Provider | undefined;
|
|
951
978
|
interface EvaluationContext {
|
|
952
979
|
readonly evalCase: EvalCase;
|
|
953
980
|
readonly candidate: string;
|
|
@@ -968,6 +995,10 @@ interface EvaluationContext {
|
|
|
968
995
|
readonly outputMessages?: readonly OutputMessage[];
|
|
969
996
|
/** Lightweight summary of trace events (if available) */
|
|
970
997
|
readonly traceSummary?: TraceSummary;
|
|
998
|
+
/** Resolver for target override in code judges */
|
|
999
|
+
readonly targetResolver?: TargetResolver;
|
|
1000
|
+
/** List of available target names for code judges */
|
|
1001
|
+
readonly availableTargets?: readonly string[];
|
|
971
1002
|
}
|
|
972
1003
|
interface EvaluationScore {
|
|
973
1004
|
readonly score: number;
|
|
@@ -978,6 +1009,8 @@ interface EvaluationScore {
|
|
|
978
1009
|
readonly reasoning?: string;
|
|
979
1010
|
readonly evaluatorRawRequest?: JsonObject;
|
|
980
1011
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
1012
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1013
|
+
readonly details?: JsonObject;
|
|
981
1014
|
}
|
|
982
1015
|
interface ChildEvaluatorResult {
|
|
983
1016
|
readonly name: string;
|
|
@@ -990,37 +1023,37 @@ interface ChildEvaluatorResult {
|
|
|
990
1023
|
readonly reasoning?: string;
|
|
991
1024
|
readonly evaluatorRawRequest?: JsonObject;
|
|
992
1025
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
1026
|
+
/** Optional structured details from code judges (e.g., TP/TN/FP/FN counts, alignments). */
|
|
1027
|
+
readonly details?: JsonObject;
|
|
993
1028
|
}
|
|
994
1029
|
interface Evaluator {
|
|
995
1030
|
readonly kind: string;
|
|
996
1031
|
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
997
1032
|
}
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
1001
|
-
readonly maxOutputTokens?: number;
|
|
1002
|
-
readonly temperature?: number;
|
|
1003
|
-
readonly evaluatorTemplate?: string;
|
|
1004
|
-
}
|
|
1005
|
-
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1006
|
-
readonly kind = "llm_judge";
|
|
1007
|
-
private readonly resolveJudgeProvider;
|
|
1008
|
-
private readonly maxOutputTokens?;
|
|
1009
|
-
private readonly temperature?;
|
|
1010
|
-
private readonly evaluatorTemplate?;
|
|
1011
|
-
constructor(options: LlmJudgeEvaluatorOptions);
|
|
1012
|
-
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1013
|
-
private evaluateFreeform;
|
|
1014
|
-
private evaluateWithRubrics;
|
|
1015
|
-
private buildRubricPrompt;
|
|
1016
|
-
private runWithRetry;
|
|
1033
|
+
interface EvaluatorFactory {
|
|
1034
|
+
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
1017
1035
|
}
|
|
1036
|
+
|
|
1037
|
+
declare function scoreToVerdict(score: number): EvaluationVerdict;
|
|
1038
|
+
declare function clampScore(value: number): number;
|
|
1039
|
+
declare function extractJsonBlob(text: string): string | undefined;
|
|
1040
|
+
declare function parseJsonFromText(text: string): unknown;
|
|
1041
|
+
declare function isNonEmptyString(value: unknown): value is string;
|
|
1042
|
+
declare function parseJsonSafe(payload: string): Record<string, unknown> | undefined;
|
|
1043
|
+
/**
|
|
1044
|
+
* Deep equality check for two values.
|
|
1045
|
+
* Handles primitives, arrays, and plain objects.
|
|
1046
|
+
*/
|
|
1047
|
+
declare function deepEqual(a: unknown, b: unknown): boolean;
|
|
1048
|
+
|
|
1018
1049
|
interface CodeEvaluatorOptions {
|
|
1019
1050
|
readonly script: readonly string[];
|
|
1020
1051
|
readonly cwd?: string;
|
|
1021
1052
|
readonly agentTimeoutMs?: number;
|
|
1022
1053
|
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1023
1054
|
readonly config?: Record<string, unknown>;
|
|
1055
|
+
/** Target access config - when present, enables target invocation for the script */
|
|
1056
|
+
readonly target?: TargetAccessConfig;
|
|
1024
1057
|
}
|
|
1025
1058
|
declare class CodeEvaluator implements Evaluator {
|
|
1026
1059
|
readonly kind = "code";
|
|
@@ -1028,29 +1061,44 @@ declare class CodeEvaluator implements Evaluator {
|
|
|
1028
1061
|
private readonly cwd?;
|
|
1029
1062
|
private readonly agentTimeoutMs?;
|
|
1030
1063
|
private readonly config?;
|
|
1064
|
+
private readonly target?;
|
|
1031
1065
|
constructor(options: CodeEvaluatorOptions);
|
|
1032
1066
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1033
1067
|
}
|
|
1034
|
-
|
|
1035
|
-
|
|
1068
|
+
declare function executeScript(scriptPath: readonly string[] | string, input: string, agentTimeoutMs?: number, cwd?: string, env?: Record<string, string>): Promise<string>;
|
|
1069
|
+
|
|
1070
|
+
interface CompositeEvaluatorOptions {
|
|
1071
|
+
readonly config: CompositeEvaluatorConfig;
|
|
1072
|
+
readonly evaluatorFactory: EvaluatorFactory;
|
|
1073
|
+
readonly cwd?: string;
|
|
1036
1074
|
}
|
|
1037
|
-
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
1038
|
-
readonly kind = "tool_trajectory";
|
|
1075
|
+
declare class CompositeEvaluator implements Evaluator {
|
|
1076
|
+
readonly kind = "composite";
|
|
1039
1077
|
private readonly config;
|
|
1040
|
-
|
|
1078
|
+
private readonly evaluatorFactory;
|
|
1079
|
+
private readonly cwd?;
|
|
1080
|
+
constructor(options: CompositeEvaluatorOptions);
|
|
1081
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1082
|
+
private aggregate;
|
|
1083
|
+
private runWeightedAverage;
|
|
1084
|
+
private runCodeAggregator;
|
|
1085
|
+
private runLlmAggregator;
|
|
1086
|
+
}
|
|
1087
|
+
|
|
1088
|
+
interface CostEvaluatorOptions {
|
|
1089
|
+
readonly config: CostEvaluatorConfig;
|
|
1090
|
+
}
|
|
1091
|
+
/**
|
|
1092
|
+
* Evaluator that checks execution cost against a budget.
|
|
1093
|
+
* Uses traceSummary.costUsd from the evaluation context.
|
|
1094
|
+
*/
|
|
1095
|
+
declare class CostEvaluator implements Evaluator {
|
|
1096
|
+
readonly kind = "cost";
|
|
1097
|
+
private readonly config;
|
|
1098
|
+
constructor(options: CostEvaluatorOptions);
|
|
1041
1099
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1042
|
-
/**
|
|
1043
|
-
* Extract tool calls from output messages.
|
|
1044
|
-
*/
|
|
1045
|
-
private extractToolCallsFromMessages;
|
|
1046
|
-
/**
|
|
1047
|
-
* Build a summary from extracted tool calls.
|
|
1048
|
-
*/
|
|
1049
|
-
private buildSummary;
|
|
1050
|
-
private evaluateAnyOrder;
|
|
1051
|
-
private evaluateInOrder;
|
|
1052
|
-
private evaluateExact;
|
|
1053
1100
|
}
|
|
1101
|
+
|
|
1054
1102
|
interface FieldAccuracyEvaluatorOptions {
|
|
1055
1103
|
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1056
1104
|
}
|
|
@@ -1089,26 +1137,7 @@ declare class FieldAccuracyEvaluator implements Evaluator {
|
|
|
1089
1137
|
*/
|
|
1090
1138
|
private aggregateResults;
|
|
1091
1139
|
}
|
|
1092
|
-
|
|
1093
|
-
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
1094
|
-
}
|
|
1095
|
-
interface CompositeEvaluatorOptions {
|
|
1096
|
-
readonly config: CompositeEvaluatorConfig;
|
|
1097
|
-
readonly evaluatorFactory: EvaluatorFactory;
|
|
1098
|
-
readonly cwd?: string;
|
|
1099
|
-
}
|
|
1100
|
-
declare class CompositeEvaluator implements Evaluator {
|
|
1101
|
-
readonly kind = "composite";
|
|
1102
|
-
private readonly config;
|
|
1103
|
-
private readonly evaluatorFactory;
|
|
1104
|
-
private readonly cwd?;
|
|
1105
|
-
constructor(options: CompositeEvaluatorOptions);
|
|
1106
|
-
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1107
|
-
private aggregate;
|
|
1108
|
-
private runWeightedAverage;
|
|
1109
|
-
private runCodeAggregator;
|
|
1110
|
-
private runLlmAggregator;
|
|
1111
|
-
}
|
|
1140
|
+
|
|
1112
1141
|
interface LatencyEvaluatorOptions {
|
|
1113
1142
|
readonly config: LatencyEvaluatorConfig;
|
|
1114
1143
|
}
|
|
@@ -1122,19 +1151,50 @@ declare class LatencyEvaluator implements Evaluator {
|
|
|
1122
1151
|
constructor(options: LatencyEvaluatorOptions);
|
|
1123
1152
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1124
1153
|
}
|
|
1125
|
-
|
|
1126
|
-
|
|
1154
|
+
|
|
1155
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
1156
|
+
interface LlmJudgeEvaluatorOptions {
|
|
1157
|
+
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
1158
|
+
readonly maxOutputTokens?: number;
|
|
1159
|
+
readonly temperature?: number;
|
|
1160
|
+
readonly evaluatorTemplate?: string;
|
|
1161
|
+
}
|
|
1162
|
+
declare const freeformEvaluationSchema: z.ZodObject<{
|
|
1163
|
+
score: z.ZodNumber;
|
|
1164
|
+
hits: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1165
|
+
misses: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
1166
|
+
reasoning: z.ZodOptional<z.ZodString>;
|
|
1167
|
+
}, "strip", z.ZodTypeAny, {
|
|
1168
|
+
score: number;
|
|
1169
|
+
hits?: string[] | undefined;
|
|
1170
|
+
misses?: string[] | undefined;
|
|
1171
|
+
reasoning?: string | undefined;
|
|
1172
|
+
}, {
|
|
1173
|
+
score: number;
|
|
1174
|
+
hits?: string[] | undefined;
|
|
1175
|
+
misses?: string[] | undefined;
|
|
1176
|
+
reasoning?: string | undefined;
|
|
1177
|
+
}>;
|
|
1178
|
+
|
|
1179
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
1180
|
+
readonly kind = "llm_judge";
|
|
1181
|
+
private readonly resolveJudgeProvider;
|
|
1182
|
+
private readonly maxOutputTokens?;
|
|
1183
|
+
private readonly temperature?;
|
|
1184
|
+
private readonly evaluatorTemplate?;
|
|
1185
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
1186
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
1187
|
+
private evaluateFreeform;
|
|
1188
|
+
private evaluateWithRubrics;
|
|
1189
|
+
private buildRubricPrompt;
|
|
1190
|
+
private runWithRetry;
|
|
1127
1191
|
}
|
|
1128
1192
|
/**
|
|
1129
|
-
*
|
|
1130
|
-
*
|
|
1193
|
+
* Build the mandatory output schema that all evaluators must follow.
|
|
1194
|
+
* This schema is always appended to the evaluator template.
|
|
1131
1195
|
*/
|
|
1132
|
-
declare class CostEvaluator implements Evaluator {
|
|
1133
|
-
|
|
1134
|
-
private readonly config;
|
|
1135
|
-
constructor(options: CostEvaluatorOptions);
|
|
1136
|
-
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1137
|
-
}
|
|
1196
|
+
declare function buildOutputSchema(): string;
|
|
1197
|
+
|
|
1138
1198
|
interface TokenUsageEvaluatorOptions {
|
|
1139
1199
|
readonly config: TokenUsageEvaluatorConfig;
|
|
1140
1200
|
}
|
|
@@ -1149,6 +1209,27 @@ declare class TokenUsageEvaluator implements Evaluator {
|
|
|
1149
1209
|
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1150
1210
|
}
|
|
1151
1211
|
|
|
1212
|
+
interface ToolTrajectoryEvaluatorOptions {
|
|
1213
|
+
readonly config: ToolTrajectoryEvaluatorConfig;
|
|
1214
|
+
}
|
|
1215
|
+
declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
1216
|
+
readonly kind = "tool_trajectory";
|
|
1217
|
+
private readonly config;
|
|
1218
|
+
constructor(options: ToolTrajectoryEvaluatorOptions);
|
|
1219
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1220
|
+
/**
|
|
1221
|
+
* Extract tool calls from output messages.
|
|
1222
|
+
*/
|
|
1223
|
+
private extractToolCallsFromMessages;
|
|
1224
|
+
/**
|
|
1225
|
+
* Build a summary from extracted tool calls.
|
|
1226
|
+
*/
|
|
1227
|
+
private buildSummary;
|
|
1228
|
+
private evaluateAnyOrder;
|
|
1229
|
+
private evaluateInOrder;
|
|
1230
|
+
private evaluateExact;
|
|
1231
|
+
}
|
|
1232
|
+
|
|
1152
1233
|
type MaybePromise<T> = T | Promise<T>;
|
|
1153
1234
|
interface EvaluationCache {
|
|
1154
1235
|
get(key: string): MaybePromise<ProviderResponse | undefined>;
|
|
@@ -1168,6 +1249,10 @@ interface RunEvalCaseOptions {
|
|
|
1168
1249
|
readonly useCache?: boolean;
|
|
1169
1250
|
readonly signal?: AbortSignal;
|
|
1170
1251
|
readonly judgeProvider?: Provider;
|
|
1252
|
+
/** Resolver for target override in code judges */
|
|
1253
|
+
readonly targetResolver?: (name: string) => Provider | undefined;
|
|
1254
|
+
/** List of available target names for code judges */
|
|
1255
|
+
readonly availableTargets?: readonly string[];
|
|
1171
1256
|
}
|
|
1172
1257
|
interface ProgressEvent {
|
|
1173
1258
|
readonly workerId: number;
|
|
@@ -1211,37 +1296,9 @@ interface GenerateRubricsOptions {
|
|
|
1211
1296
|
*/
|
|
1212
1297
|
declare function generateRubrics(options: GenerateRubricsOptions): Promise<readonly RubricItem[]>;
|
|
1213
1298
|
|
|
1214
|
-
/**
|
|
1215
|
-
* Payload received by code judges via stdin.
|
|
1216
|
-
* All properties use camelCase for TypeScript ergonomics.
|
|
1217
|
-
*/
|
|
1218
|
-
interface CodeJudgePayload {
|
|
1219
|
-
readonly question: string;
|
|
1220
|
-
readonly expectedOutcome: string;
|
|
1221
|
-
readonly expectedMessages: readonly JsonObject[];
|
|
1222
|
-
readonly referenceAnswer?: string;
|
|
1223
|
-
readonly candidateAnswer: string;
|
|
1224
|
-
readonly outputMessages?: readonly OutputMessage[] | null;
|
|
1225
|
-
readonly guidelineFiles: readonly string[];
|
|
1226
|
-
readonly inputFiles: readonly string[];
|
|
1227
|
-
readonly inputMessages: readonly TestMessage[];
|
|
1228
|
-
readonly traceSummary?: TraceSummary | null;
|
|
1229
|
-
readonly config?: JsonObject | null;
|
|
1230
|
-
}
|
|
1231
|
-
/**
|
|
1232
|
-
* Parse stdin JSON (snake_case) into typed camelCase object.
|
|
1233
|
-
* Use this in TypeScript code judges to get type-safe, idiomatic input.
|
|
1234
|
-
*/
|
|
1235
|
-
declare function parseCodeJudgePayload(payload: string): CodeJudgePayload;
|
|
1236
|
-
/**
|
|
1237
|
-
* Convenience helper that reads stdin and parses it.
|
|
1238
|
-
* Equivalent to: parseCodeJudgePayload(readFileSync(0, 'utf8'))
|
|
1239
|
-
*/
|
|
1240
|
-
declare function readCodeJudgePayload(): CodeJudgePayload;
|
|
1241
|
-
|
|
1242
1299
|
type AgentKernel = {
|
|
1243
1300
|
status: string;
|
|
1244
1301
|
};
|
|
1245
1302
|
declare function createAgentKernel(): AgentKernel;
|
|
1246
1303
|
|
|
1247
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type
|
|
1304
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetAccessConfig, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type 
UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildOutputSchema, buildPromptInputs, buildSearchRoots, clampScore, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, deepEqual, ensureVSCodeSubagents, executeScript, explorationRatio, extractJsonBlob, fileExists, findGitRoot, freeformEvaluationSchema, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isNonEmptyString, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, parseJsonFromText, parseJsonSafe, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreToVerdict, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|