@agentv/core 1.5.0 → 2.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -77
- package/dist/{chunk-E2VSU4WZ.js → chunk-KDEP4I7G.js} +116 -1
- package/dist/chunk-KDEP4I7G.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +2 -0
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +1 -1
- package/dist/index.cjs +2715 -675
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +207 -10
- package/dist/index.d.ts +207 -10
- package/dist/index.js +2491 -570
- package/dist/index.js.map +1 -1
- package/package.json +8 -2
- package/dist/chunk-E2VSU4WZ.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -201,17 +201,19 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
201
201
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
202
202
|
*/
|
|
203
203
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
204
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
|
|
204
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
205
205
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
206
206
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
207
207
|
type CodeEvaluatorConfig = {
|
|
208
208
|
readonly name: string;
|
|
209
209
|
readonly type: 'code';
|
|
210
|
-
readonly script: string;
|
|
210
|
+
readonly script: readonly string[];
|
|
211
211
|
readonly resolvedScriptPath?: string;
|
|
212
212
|
readonly cwd?: string;
|
|
213
213
|
readonly resolvedCwd?: string;
|
|
214
214
|
readonly weight?: number;
|
|
215
|
+
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
216
|
+
readonly config?: JsonObject;
|
|
215
217
|
};
|
|
216
218
|
type LlmJudgeEvaluatorConfig = {
|
|
217
219
|
readonly name: string;
|
|
@@ -247,7 +249,85 @@ type CompositeEvaluatorConfig = {
|
|
|
247
249
|
readonly aggregator: CompositeAggregatorConfig;
|
|
248
250
|
readonly weight?: number;
|
|
249
251
|
};
|
|
250
|
-
|
|
252
|
+
/**
|
|
253
|
+
* Match type for field accuracy evaluation.
|
|
254
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
|
|
255
|
+
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
256
|
+
*/
|
|
257
|
+
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
258
|
+
/**
|
|
259
|
+
* Aggregation strategy for combining field scores.
|
|
260
|
+
*/
|
|
261
|
+
type FieldAggregationType = 'weighted_average' | 'all_or_nothing';
|
|
262
|
+
/**
|
|
263
|
+
* Configuration for a single field to evaluate.
|
|
264
|
+
*/
|
|
265
|
+
type FieldConfig = {
|
|
266
|
+
/** Dot-notation path to the field (e.g., "invoice.vendor.name" or "items[0].amount") */
|
|
267
|
+
readonly path: string;
|
|
268
|
+
/** Match strategy for this field */
|
|
269
|
+
readonly match: FieldMatchType;
|
|
270
|
+
/** Whether this field is required (missing required fields count as failures) */
|
|
271
|
+
readonly required?: boolean;
|
|
272
|
+
/** Weight for aggregation (default: 1.0) */
|
|
273
|
+
readonly weight?: number;
|
|
274
|
+
/** Tolerance for numeric matching (absolute value unless relative is true) */
|
|
275
|
+
readonly tolerance?: number;
|
|
276
|
+
/** Whether tolerance is relative (percentage) vs absolute */
|
|
277
|
+
readonly relative?: boolean;
|
|
278
|
+
/** Date formats to try when parsing (default: common formats) */
|
|
279
|
+
readonly formats?: readonly string[];
|
|
280
|
+
};
|
|
281
|
+
/**
|
|
282
|
+
* Configuration for the field_accuracy evaluator.
|
|
283
|
+
*/
|
|
284
|
+
type FieldAccuracyEvaluatorConfig = {
|
|
285
|
+
readonly name: string;
|
|
286
|
+
readonly type: 'field_accuracy';
|
|
287
|
+
/** Fields to compare between candidate and expected */
|
|
288
|
+
readonly fields: readonly FieldConfig[];
|
|
289
|
+
/** Strategy for combining field scores (default: weighted_average) */
|
|
290
|
+
readonly aggregation?: FieldAggregationType;
|
|
291
|
+
readonly weight?: number;
|
|
292
|
+
};
|
|
293
|
+
/**
|
|
294
|
+
* Configuration for the latency evaluator.
|
|
295
|
+
* Checks execution duration against a threshold.
|
|
296
|
+
*/
|
|
297
|
+
type LatencyEvaluatorConfig = {
|
|
298
|
+
readonly name: string;
|
|
299
|
+
readonly type: 'latency';
|
|
300
|
+
/** Maximum allowed duration in milliseconds */
|
|
301
|
+
readonly threshold: number;
|
|
302
|
+
readonly weight?: number;
|
|
303
|
+
};
|
|
304
|
+
/**
|
|
305
|
+
* Configuration for the cost evaluator.
|
|
306
|
+
* Checks execution cost against a budget.
|
|
307
|
+
*/
|
|
308
|
+
type CostEvaluatorConfig = {
|
|
309
|
+
readonly name: string;
|
|
310
|
+
readonly type: 'cost';
|
|
311
|
+
/** Maximum allowed cost in USD */
|
|
312
|
+
readonly budget: number;
|
|
313
|
+
readonly weight?: number;
|
|
314
|
+
};
|
|
315
|
+
/**
|
|
316
|
+
* Configuration for the token_usage evaluator.
|
|
317
|
+
* Checks provider-reported token usage against configured limits.
|
|
318
|
+
*/
|
|
319
|
+
type TokenUsageEvaluatorConfig = {
|
|
320
|
+
readonly name: string;
|
|
321
|
+
readonly type: 'token_usage';
|
|
322
|
+
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
323
|
+
readonly max_total?: number;
|
|
324
|
+
/** Maximum allowed input tokens (prompt) */
|
|
325
|
+
readonly max_input?: number;
|
|
326
|
+
/** Maximum allowed output tokens (completion) */
|
|
327
|
+
readonly max_output?: number;
|
|
328
|
+
readonly weight?: number;
|
|
329
|
+
};
|
|
330
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
|
|
251
331
|
/**
|
|
252
332
|
* Eval case definition sourced from AgentV specs.
|
|
253
333
|
*/
|
|
@@ -282,7 +362,6 @@ interface EvaluationResult {
|
|
|
282
362
|
readonly candidateAnswer: string;
|
|
283
363
|
readonly target: string;
|
|
284
364
|
readonly reasoning?: string;
|
|
285
|
-
readonly rawAspects?: readonly string[];
|
|
286
365
|
readonly agentProviderRequest?: JsonObject;
|
|
287
366
|
readonly lmProviderRequest?: JsonObject;
|
|
288
367
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
@@ -317,7 +396,7 @@ interface ChatMessage {
|
|
|
317
396
|
readonly name?: string;
|
|
318
397
|
}
|
|
319
398
|
type ChatPrompt = readonly ChatMessage[];
|
|
320
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
399
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
321
400
|
interface ProviderRequest {
|
|
322
401
|
readonly question: string;
|
|
323
402
|
readonly systemPrompt?: string;
|
|
@@ -726,6 +805,23 @@ interface PiCodingAgentResolvedConfig {
|
|
|
726
805
|
readonly logFormat?: 'summary' | 'json';
|
|
727
806
|
readonly systemPrompt?: string;
|
|
728
807
|
}
|
|
808
|
+
interface PiAgentSdkResolvedConfig {
|
|
809
|
+
readonly provider?: string;
|
|
810
|
+
readonly model?: string;
|
|
811
|
+
readonly apiKey?: string;
|
|
812
|
+
readonly timeoutMs?: number;
|
|
813
|
+
readonly systemPrompt?: string;
|
|
814
|
+
}
|
|
815
|
+
interface ClaudeCodeResolvedConfig {
|
|
816
|
+
readonly executable: string;
|
|
817
|
+
readonly model?: string;
|
|
818
|
+
readonly systemPrompt?: string;
|
|
819
|
+
readonly args?: readonly string[];
|
|
820
|
+
readonly cwd?: string;
|
|
821
|
+
readonly timeoutMs?: number;
|
|
822
|
+
readonly logDir?: string;
|
|
823
|
+
readonly logFormat?: 'summary' | 'json';
|
|
824
|
+
}
|
|
729
825
|
interface MockResolvedConfig {
|
|
730
826
|
readonly response?: string;
|
|
731
827
|
readonly delayMs?: number;
|
|
@@ -774,6 +870,20 @@ type ResolvedTarget = {
|
|
|
774
870
|
readonly workers?: number;
|
|
775
871
|
readonly providerBatching?: boolean;
|
|
776
872
|
readonly config: PiCodingAgentResolvedConfig;
|
|
873
|
+
} | {
|
|
874
|
+
readonly kind: 'pi-agent-sdk';
|
|
875
|
+
readonly name: string;
|
|
876
|
+
readonly judgeTarget?: string;
|
|
877
|
+
readonly workers?: number;
|
|
878
|
+
readonly providerBatching?: boolean;
|
|
879
|
+
readonly config: PiAgentSdkResolvedConfig;
|
|
880
|
+
} | {
|
|
881
|
+
readonly kind: 'claude-code';
|
|
882
|
+
readonly name: string;
|
|
883
|
+
readonly judgeTarget?: string;
|
|
884
|
+
readonly workers?: number;
|
|
885
|
+
readonly providerBatching?: boolean;
|
|
886
|
+
readonly config: ClaudeCodeResolvedConfig;
|
|
777
887
|
} | {
|
|
778
888
|
readonly kind: 'mock';
|
|
779
889
|
readonly name: string;
|
|
@@ -839,6 +949,16 @@ type PiLogListener = (entry: PiLogEntry) => void;
|
|
|
839
949
|
declare function consumePiLogEntries(): PiLogEntry[];
|
|
840
950
|
declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
|
|
841
951
|
|
|
952
|
+
type ClaudeCodeLogEntry = {
|
|
953
|
+
readonly filePath: string;
|
|
954
|
+
readonly evalCaseId?: string;
|
|
955
|
+
readonly targetName: string;
|
|
956
|
+
readonly attempt?: number;
|
|
957
|
+
};
|
|
958
|
+
type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
|
|
959
|
+
declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
|
|
960
|
+
declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
|
|
961
|
+
|
|
842
962
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
843
963
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
844
964
|
|
|
@@ -870,7 +990,6 @@ interface EvaluationScore {
|
|
|
870
990
|
readonly misses: readonly string[];
|
|
871
991
|
readonly expectedAspectCount: number;
|
|
872
992
|
readonly reasoning?: string;
|
|
873
|
-
readonly rawAspects?: readonly string[];
|
|
874
993
|
readonly evaluatorRawRequest?: JsonObject;
|
|
875
994
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
876
995
|
}
|
|
@@ -911,15 +1030,18 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
911
1030
|
private runWithRetry;
|
|
912
1031
|
}
|
|
913
1032
|
interface CodeEvaluatorOptions {
|
|
914
|
-
readonly script: string;
|
|
1033
|
+
readonly script: readonly string[];
|
|
915
1034
|
readonly cwd?: string;
|
|
916
1035
|
readonly agentTimeoutMs?: number;
|
|
1036
|
+
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1037
|
+
readonly config?: Record<string, unknown>;
|
|
917
1038
|
}
|
|
918
1039
|
declare class CodeEvaluator implements Evaluator {
|
|
919
1040
|
readonly kind = "code";
|
|
920
1041
|
private readonly script;
|
|
921
1042
|
private readonly cwd?;
|
|
922
1043
|
private readonly agentTimeoutMs?;
|
|
1044
|
+
private readonly config?;
|
|
923
1045
|
constructor(options: CodeEvaluatorOptions);
|
|
924
1046
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
925
1047
|
}
|
|
@@ -943,6 +1065,44 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
943
1065
|
private evaluateInOrder;
|
|
944
1066
|
private evaluateExact;
|
|
945
1067
|
}
|
|
1068
|
+
interface FieldAccuracyEvaluatorOptions {
|
|
1069
|
+
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1070
|
+
}
|
|
1071
|
+
/**
|
|
1072
|
+
* FieldAccuracyEvaluator compares extracted structured data against expected values
|
|
1073
|
+
* with configurable matching strategies (exact, numeric_tolerance, date).
|
|
1074
|
+
*/
|
|
1075
|
+
declare class FieldAccuracyEvaluator implements Evaluator {
|
|
1076
|
+
readonly kind = "field_accuracy";
|
|
1077
|
+
private readonly config;
|
|
1078
|
+
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1079
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1080
|
+
/**
|
|
1081
|
+
* Extract expected data from expected_messages array.
|
|
1082
|
+
* Looks for the last assistant message with content.
|
|
1083
|
+
*/
|
|
1084
|
+
private extractExpectedData;
|
|
1085
|
+
/**
|
|
1086
|
+
* Evaluate a single field against the expected value.
|
|
1087
|
+
*/
|
|
1088
|
+
private evaluateField;
|
|
1089
|
+
/**
|
|
1090
|
+
* Exact equality comparison.
|
|
1091
|
+
*/
|
|
1092
|
+
private compareExact;
|
|
1093
|
+
/**
|
|
1094
|
+
* Numeric comparison with absolute or relative tolerance.
|
|
1095
|
+
*/
|
|
1096
|
+
private compareNumericTolerance;
|
|
1097
|
+
/**
|
|
1098
|
+
* Date comparison with format normalization.
|
|
1099
|
+
*/
|
|
1100
|
+
private compareDate;
|
|
1101
|
+
/**
|
|
1102
|
+
* Aggregate field results using configured strategy.
|
|
1103
|
+
*/
|
|
1104
|
+
private aggregateResults;
|
|
1105
|
+
}
|
|
946
1106
|
interface EvaluatorFactory {
|
|
947
1107
|
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
948
1108
|
}
|
|
@@ -963,6 +1123,45 @@ declare class CompositeEvaluator implements Evaluator {
|
|
|
963
1123
|
private runCodeAggregator;
|
|
964
1124
|
private runLlmAggregator;
|
|
965
1125
|
}
|
|
1126
|
+
interface LatencyEvaluatorOptions {
|
|
1127
|
+
readonly config: LatencyEvaluatorConfig;
|
|
1128
|
+
}
|
|
1129
|
+
/**
|
|
1130
|
+
* Evaluator that checks execution duration against a threshold.
|
|
1131
|
+
* Uses traceSummary.durationMs from the evaluation context.
|
|
1132
|
+
*/
|
|
1133
|
+
declare class LatencyEvaluator implements Evaluator {
|
|
1134
|
+
readonly kind = "latency";
|
|
1135
|
+
private readonly config;
|
|
1136
|
+
constructor(options: LatencyEvaluatorOptions);
|
|
1137
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1138
|
+
}
|
|
1139
|
+
interface CostEvaluatorOptions {
|
|
1140
|
+
readonly config: CostEvaluatorConfig;
|
|
1141
|
+
}
|
|
1142
|
+
/**
|
|
1143
|
+
* Evaluator that checks execution cost against a budget.
|
|
1144
|
+
* Uses traceSummary.costUsd from the evaluation context.
|
|
1145
|
+
*/
|
|
1146
|
+
declare class CostEvaluator implements Evaluator {
|
|
1147
|
+
readonly kind = "cost";
|
|
1148
|
+
private readonly config;
|
|
1149
|
+
constructor(options: CostEvaluatorOptions);
|
|
1150
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1151
|
+
}
|
|
1152
|
+
interface TokenUsageEvaluatorOptions {
|
|
1153
|
+
readonly config: TokenUsageEvaluatorConfig;
|
|
1154
|
+
}
|
|
1155
|
+
/**
|
|
1156
|
+
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1157
|
+
* Uses traceSummary.tokenUsage from the evaluation context.
|
|
1158
|
+
*/
|
|
1159
|
+
declare class TokenUsageEvaluator implements Evaluator {
|
|
1160
|
+
readonly kind = "token_usage";
|
|
1161
|
+
private readonly config;
|
|
1162
|
+
constructor(options: TokenUsageEvaluatorOptions);
|
|
1163
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1164
|
+
}
|
|
966
1165
|
|
|
967
1166
|
type MaybePromise<T> = T | Promise<T>;
|
|
968
1167
|
interface EvaluationCache {
|
|
@@ -979,7 +1178,6 @@ interface RunEvalCaseOptions {
|
|
|
979
1178
|
readonly now?: () => Date;
|
|
980
1179
|
readonly maxRetries?: number;
|
|
981
1180
|
readonly agentTimeoutMs?: number;
|
|
982
|
-
readonly promptDumpDir?: string;
|
|
983
1181
|
readonly cache?: EvaluationCache;
|
|
984
1182
|
readonly useCache?: boolean;
|
|
985
1183
|
readonly signal?: AbortSignal;
|
|
@@ -1003,7 +1201,6 @@ interface RunEvaluationOptions {
|
|
|
1003
1201
|
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
1004
1202
|
readonly maxRetries?: number;
|
|
1005
1203
|
readonly agentTimeoutMs?: number;
|
|
1006
|
-
readonly promptDumpDir?: string;
|
|
1007
1204
|
readonly cache?: EvaluationCache;
|
|
1008
1205
|
readonly useCache?: boolean;
|
|
1009
1206
|
readonly now?: () => Date;
|
|
@@ -1033,4 +1230,4 @@ type AgentKernel = {
|
|
|
1033
1230
|
};
|
|
1034
1231
|
declare function createAgentKernel(): AgentKernel;
|
|
1035
1232
|
|
|
1036
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|
|
1233
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type 
VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|
package/dist/index.d.ts
CHANGED
|
@@ -201,17 +201,19 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
201
201
|
* - Either content (string or array of objects) OR tool_calls (for assistant messages)
|
|
202
202
|
*/
|
|
203
203
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
204
|
-
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory"];
|
|
204
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code_judge", "llm_judge", "rubric", "composite", "tool_trajectory", "field_accuracy", "latency", "cost", "token_usage"];
|
|
205
205
|
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
206
206
|
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
207
207
|
type CodeEvaluatorConfig = {
|
|
208
208
|
readonly name: string;
|
|
209
209
|
readonly type: 'code';
|
|
210
|
-
readonly script: string;
|
|
210
|
+
readonly script: readonly string[];
|
|
211
211
|
readonly resolvedScriptPath?: string;
|
|
212
212
|
readonly cwd?: string;
|
|
213
213
|
readonly resolvedCwd?: string;
|
|
214
214
|
readonly weight?: number;
|
|
215
|
+
/** Pass-through configuration for the code_judge script (any unrecognized YAML properties) */
|
|
216
|
+
readonly config?: JsonObject;
|
|
215
217
|
};
|
|
216
218
|
type LlmJudgeEvaluatorConfig = {
|
|
217
219
|
readonly name: string;
|
|
@@ -247,7 +249,85 @@ type CompositeEvaluatorConfig = {
|
|
|
247
249
|
readonly aggregator: CompositeAggregatorConfig;
|
|
248
250
|
readonly weight?: number;
|
|
249
251
|
};
|
|
250
|
-
|
|
252
|
+
/**
|
|
253
|
+
* Match type for field accuracy evaluation.
|
|
254
|
+
* Note: For fuzzy string matching (Levenshtein, Jaro-Winkler, etc.), use a code_judge evaluator.
|
|
255
|
+
* See examples/features/document-extraction/fuzzy_match.ts for an example.
|
|
256
|
+
*/
|
|
257
|
+
type FieldMatchType = 'exact' | 'numeric_tolerance' | 'date';
|
|
258
|
+
/**
|
|
259
|
+
* Aggregation strategy for combining field scores.
|
|
260
|
+
*/
|
|
261
|
+
type FieldAggregationType = 'weighted_average' | 'all_or_nothing';
|
|
262
|
+
/**
|
|
263
|
+
* Configuration for a single field to evaluate.
|
|
264
|
+
*/
|
|
265
|
+
type FieldConfig = {
|
|
266
|
+
/** Dot-notation path to the field (e.g., "invoice.vendor.name" or "items[0].amount") */
|
|
267
|
+
readonly path: string;
|
|
268
|
+
/** Match strategy for this field */
|
|
269
|
+
readonly match: FieldMatchType;
|
|
270
|
+
/** Whether this field is required (missing required fields count as failures) */
|
|
271
|
+
readonly required?: boolean;
|
|
272
|
+
/** Weight for aggregation (default: 1.0) */
|
|
273
|
+
readonly weight?: number;
|
|
274
|
+
/** Tolerance for numeric matching (absolute value unless relative is true) */
|
|
275
|
+
readonly tolerance?: number;
|
|
276
|
+
/** Whether tolerance is relative (percentage) vs absolute */
|
|
277
|
+
readonly relative?: boolean;
|
|
278
|
+
/** Date formats to try when parsing (default: common formats) */
|
|
279
|
+
readonly formats?: readonly string[];
|
|
280
|
+
};
|
|
281
|
+
/**
|
|
282
|
+
* Configuration for the field_accuracy evaluator.
|
|
283
|
+
*/
|
|
284
|
+
type FieldAccuracyEvaluatorConfig = {
|
|
285
|
+
readonly name: string;
|
|
286
|
+
readonly type: 'field_accuracy';
|
|
287
|
+
/** Fields to compare between candidate and expected */
|
|
288
|
+
readonly fields: readonly FieldConfig[];
|
|
289
|
+
/** Strategy for combining field scores (default: weighted_average) */
|
|
290
|
+
readonly aggregation?: FieldAggregationType;
|
|
291
|
+
readonly weight?: number;
|
|
292
|
+
};
|
|
293
|
+
/**
|
|
294
|
+
* Configuration for the latency evaluator.
|
|
295
|
+
* Checks execution duration against a threshold.
|
|
296
|
+
*/
|
|
297
|
+
type LatencyEvaluatorConfig = {
|
|
298
|
+
readonly name: string;
|
|
299
|
+
readonly type: 'latency';
|
|
300
|
+
/** Maximum allowed duration in milliseconds */
|
|
301
|
+
readonly threshold: number;
|
|
302
|
+
readonly weight?: number;
|
|
303
|
+
};
|
|
304
|
+
/**
|
|
305
|
+
* Configuration for the cost evaluator.
|
|
306
|
+
* Checks execution cost against a budget.
|
|
307
|
+
*/
|
|
308
|
+
type CostEvaluatorConfig = {
|
|
309
|
+
readonly name: string;
|
|
310
|
+
readonly type: 'cost';
|
|
311
|
+
/** Maximum allowed cost in USD */
|
|
312
|
+
readonly budget: number;
|
|
313
|
+
readonly weight?: number;
|
|
314
|
+
};
|
|
315
|
+
/**
|
|
316
|
+
* Configuration for the token_usage evaluator.
|
|
317
|
+
* Checks provider-reported token usage against configured limits.
|
|
318
|
+
*/
|
|
319
|
+
type TokenUsageEvaluatorConfig = {
|
|
320
|
+
readonly name: string;
|
|
321
|
+
readonly type: 'token_usage';
|
|
322
|
+
/** Maximum allowed total tokens (input + output + cached, when present) */
|
|
323
|
+
readonly max_total?: number;
|
|
324
|
+
/** Maximum allowed input tokens (prompt) */
|
|
325
|
+
readonly max_input?: number;
|
|
326
|
+
/** Maximum allowed output tokens (completion) */
|
|
327
|
+
readonly max_output?: number;
|
|
328
|
+
readonly weight?: number;
|
|
329
|
+
};
|
|
330
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig | CompositeEvaluatorConfig | ToolTrajectoryEvaluatorConfig | FieldAccuracyEvaluatorConfig | LatencyEvaluatorConfig | CostEvaluatorConfig | TokenUsageEvaluatorConfig;
|
|
251
331
|
/**
|
|
252
332
|
* Eval case definition sourced from AgentV specs.
|
|
253
333
|
*/
|
|
@@ -282,7 +362,6 @@ interface EvaluationResult {
|
|
|
282
362
|
readonly candidateAnswer: string;
|
|
283
363
|
readonly target: string;
|
|
284
364
|
readonly reasoning?: string;
|
|
285
|
-
readonly rawAspects?: readonly string[];
|
|
286
365
|
readonly agentProviderRequest?: JsonObject;
|
|
287
366
|
readonly lmProviderRequest?: JsonObject;
|
|
288
367
|
readonly evaluatorProviderRequest?: JsonObject;
|
|
@@ -317,7 +396,7 @@ interface ChatMessage {
|
|
|
317
396
|
readonly name?: string;
|
|
318
397
|
}
|
|
319
398
|
type ChatPrompt = readonly ChatMessage[];
|
|
320
|
-
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
399
|
+
type ProviderKind = 'azure' | 'anthropic' | 'gemini' | 'codex' | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude-code' | 'cli' | 'mock' | 'vscode' | 'vscode-insiders';
|
|
321
400
|
interface ProviderRequest {
|
|
322
401
|
readonly question: string;
|
|
323
402
|
readonly systemPrompt?: string;
|
|
@@ -726,6 +805,23 @@ interface PiCodingAgentResolvedConfig {
|
|
|
726
805
|
readonly logFormat?: 'summary' | 'json';
|
|
727
806
|
readonly systemPrompt?: string;
|
|
728
807
|
}
|
|
808
|
+
interface PiAgentSdkResolvedConfig {
|
|
809
|
+
readonly provider?: string;
|
|
810
|
+
readonly model?: string;
|
|
811
|
+
readonly apiKey?: string;
|
|
812
|
+
readonly timeoutMs?: number;
|
|
813
|
+
readonly systemPrompt?: string;
|
|
814
|
+
}
|
|
815
|
+
interface ClaudeCodeResolvedConfig {
|
|
816
|
+
readonly executable: string;
|
|
817
|
+
readonly model?: string;
|
|
818
|
+
readonly systemPrompt?: string;
|
|
819
|
+
readonly args?: readonly string[];
|
|
820
|
+
readonly cwd?: string;
|
|
821
|
+
readonly timeoutMs?: number;
|
|
822
|
+
readonly logDir?: string;
|
|
823
|
+
readonly logFormat?: 'summary' | 'json';
|
|
824
|
+
}
|
|
729
825
|
interface MockResolvedConfig {
|
|
730
826
|
readonly response?: string;
|
|
731
827
|
readonly delayMs?: number;
|
|
@@ -774,6 +870,20 @@ type ResolvedTarget = {
|
|
|
774
870
|
readonly workers?: number;
|
|
775
871
|
readonly providerBatching?: boolean;
|
|
776
872
|
readonly config: PiCodingAgentResolvedConfig;
|
|
873
|
+
} | {
|
|
874
|
+
readonly kind: 'pi-agent-sdk';
|
|
875
|
+
readonly name: string;
|
|
876
|
+
readonly judgeTarget?: string;
|
|
877
|
+
readonly workers?: number;
|
|
878
|
+
readonly providerBatching?: boolean;
|
|
879
|
+
readonly config: PiAgentSdkResolvedConfig;
|
|
880
|
+
} | {
|
|
881
|
+
readonly kind: 'claude-code';
|
|
882
|
+
readonly name: string;
|
|
883
|
+
readonly judgeTarget?: string;
|
|
884
|
+
readonly workers?: number;
|
|
885
|
+
readonly providerBatching?: boolean;
|
|
886
|
+
readonly config: ClaudeCodeResolvedConfig;
|
|
777
887
|
} | {
|
|
778
888
|
readonly kind: 'mock';
|
|
779
889
|
readonly name: string;
|
|
@@ -839,6 +949,16 @@ type PiLogListener = (entry: PiLogEntry) => void;
|
|
|
839
949
|
declare function consumePiLogEntries(): PiLogEntry[];
|
|
840
950
|
declare function subscribeToPiLogEntries(listener: PiLogListener): () => void;
|
|
841
951
|
|
|
952
|
+
type ClaudeCodeLogEntry = {
|
|
953
|
+
readonly filePath: string;
|
|
954
|
+
readonly evalCaseId?: string;
|
|
955
|
+
readonly targetName: string;
|
|
956
|
+
readonly attempt?: number;
|
|
957
|
+
};
|
|
958
|
+
type ClaudeCodeLogListener = (entry: ClaudeCodeLogEntry) => void;
|
|
959
|
+
declare function consumeClaudeCodeLogEntries(): ClaudeCodeLogEntry[];
|
|
960
|
+
declare function subscribeToClaudeCodeLogEntries(listener: ClaudeCodeLogListener): () => void;
|
|
961
|
+
|
|
842
962
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
843
963
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
844
964
|
|
|
@@ -870,7 +990,6 @@ interface EvaluationScore {
|
|
|
870
990
|
readonly misses: readonly string[];
|
|
871
991
|
readonly expectedAspectCount: number;
|
|
872
992
|
readonly reasoning?: string;
|
|
873
|
-
readonly rawAspects?: readonly string[];
|
|
874
993
|
readonly evaluatorRawRequest?: JsonObject;
|
|
875
994
|
readonly evaluatorResults?: readonly ChildEvaluatorResult[];
|
|
876
995
|
}
|
|
@@ -911,15 +1030,18 @@ declare class LlmJudgeEvaluator implements Evaluator {
|
|
|
911
1030
|
private runWithRetry;
|
|
912
1031
|
}
|
|
913
1032
|
interface CodeEvaluatorOptions {
|
|
914
|
-
readonly script: string;
|
|
1033
|
+
readonly script: readonly string[];
|
|
915
1034
|
readonly cwd?: string;
|
|
916
1035
|
readonly agentTimeoutMs?: number;
|
|
1036
|
+
/** Pass-through configuration from YAML (any unrecognized properties) */
|
|
1037
|
+
readonly config?: Record<string, unknown>;
|
|
917
1038
|
}
|
|
918
1039
|
declare class CodeEvaluator implements Evaluator {
|
|
919
1040
|
readonly kind = "code";
|
|
920
1041
|
private readonly script;
|
|
921
1042
|
private readonly cwd?;
|
|
922
1043
|
private readonly agentTimeoutMs?;
|
|
1044
|
+
private readonly config?;
|
|
923
1045
|
constructor(options: CodeEvaluatorOptions);
|
|
924
1046
|
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
925
1047
|
}
|
|
@@ -943,6 +1065,44 @@ declare class ToolTrajectoryEvaluator implements Evaluator {
|
|
|
943
1065
|
private evaluateInOrder;
|
|
944
1066
|
private evaluateExact;
|
|
945
1067
|
}
|
|
1068
|
+
interface FieldAccuracyEvaluatorOptions {
|
|
1069
|
+
readonly config: FieldAccuracyEvaluatorConfig;
|
|
1070
|
+
}
|
|
1071
|
+
/**
|
|
1072
|
+
* FieldAccuracyEvaluator compares extracted structured data against expected values
|
|
1073
|
+
* with configurable matching strategies (exact, fuzzy, numeric_tolerance, date).
|
|
1074
|
+
*/
|
|
1075
|
+
declare class FieldAccuracyEvaluator implements Evaluator {
|
|
1076
|
+
readonly kind = "field_accuracy";
|
|
1077
|
+
private readonly config;
|
|
1078
|
+
constructor(options: FieldAccuracyEvaluatorOptions);
|
|
1079
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1080
|
+
/**
|
|
1081
|
+
* Extract expected data from expected_messages array.
|
|
1082
|
+
* Looks for the last assistant message with content.
|
|
1083
|
+
*/
|
|
1084
|
+
private extractExpectedData;
|
|
1085
|
+
/**
|
|
1086
|
+
* Evaluate a single field against the expected value.
|
|
1087
|
+
*/
|
|
1088
|
+
private evaluateField;
|
|
1089
|
+
/**
|
|
1090
|
+
* Exact equality comparison.
|
|
1091
|
+
*/
|
|
1092
|
+
private compareExact;
|
|
1093
|
+
/**
|
|
1094
|
+
* Numeric comparison with absolute or relative tolerance.
|
|
1095
|
+
*/
|
|
1096
|
+
private compareNumericTolerance;
|
|
1097
|
+
/**
|
|
1098
|
+
* Date comparison with format normalization.
|
|
1099
|
+
*/
|
|
1100
|
+
private compareDate;
|
|
1101
|
+
/**
|
|
1102
|
+
* Aggregate field results using configured strategy.
|
|
1103
|
+
*/
|
|
1104
|
+
private aggregateResults;
|
|
1105
|
+
}
|
|
946
1106
|
interface EvaluatorFactory {
|
|
947
1107
|
create(config: EvaluatorConfig, context: EvaluationContext): Evaluator;
|
|
948
1108
|
}
|
|
@@ -963,6 +1123,45 @@ declare class CompositeEvaluator implements Evaluator {
|
|
|
963
1123
|
private runCodeAggregator;
|
|
964
1124
|
private runLlmAggregator;
|
|
965
1125
|
}
|
|
1126
|
+
interface LatencyEvaluatorOptions {
|
|
1127
|
+
readonly config: LatencyEvaluatorConfig;
|
|
1128
|
+
}
|
|
1129
|
+
/**
|
|
1130
|
+
* Evaluator that checks execution duration against a threshold.
|
|
1131
|
+
* Uses traceSummary.durationMs from the evaluation context.
|
|
1132
|
+
*/
|
|
1133
|
+
declare class LatencyEvaluator implements Evaluator {
|
|
1134
|
+
readonly kind = "latency";
|
|
1135
|
+
private readonly config;
|
|
1136
|
+
constructor(options: LatencyEvaluatorOptions);
|
|
1137
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1138
|
+
}
|
|
1139
|
+
interface CostEvaluatorOptions {
|
|
1140
|
+
readonly config: CostEvaluatorConfig;
|
|
1141
|
+
}
|
|
1142
|
+
/**
|
|
1143
|
+
* Evaluator that checks execution cost against a budget.
|
|
1144
|
+
* Uses traceSummary.costUsd from the evaluation context.
|
|
1145
|
+
*/
|
|
1146
|
+
declare class CostEvaluator implements Evaluator {
|
|
1147
|
+
readonly kind = "cost";
|
|
1148
|
+
private readonly config;
|
|
1149
|
+
constructor(options: CostEvaluatorOptions);
|
|
1150
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1151
|
+
}
|
|
1152
|
+
interface TokenUsageEvaluatorOptions {
|
|
1153
|
+
readonly config: TokenUsageEvaluatorConfig;
|
|
1154
|
+
}
|
|
1155
|
+
/**
|
|
1156
|
+
* Evaluator that checks provider-reported token usage against configured limits.
|
|
1157
|
+
* Uses traceSummary.tokenUsage from the evaluation context.
|
|
1158
|
+
*/
|
|
1159
|
+
declare class TokenUsageEvaluator implements Evaluator {
|
|
1160
|
+
readonly kind = "token_usage";
|
|
1161
|
+
private readonly config;
|
|
1162
|
+
constructor(options: TokenUsageEvaluatorOptions);
|
|
1163
|
+
evaluate(context: EvaluationContext): EvaluationScore;
|
|
1164
|
+
}
|
|
966
1165
|
|
|
967
1166
|
type MaybePromise<T> = T | Promise<T>;
|
|
968
1167
|
interface EvaluationCache {
|
|
@@ -979,7 +1178,6 @@ interface RunEvalCaseOptions {
|
|
|
979
1178
|
readonly now?: () => Date;
|
|
980
1179
|
readonly maxRetries?: number;
|
|
981
1180
|
readonly agentTimeoutMs?: number;
|
|
982
|
-
readonly promptDumpDir?: string;
|
|
983
1181
|
readonly cache?: EvaluationCache;
|
|
984
1182
|
readonly useCache?: boolean;
|
|
985
1183
|
readonly signal?: AbortSignal;
|
|
@@ -1003,7 +1201,6 @@ interface RunEvaluationOptions {
|
|
|
1003
1201
|
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
1004
1202
|
readonly maxRetries?: number;
|
|
1005
1203
|
readonly agentTimeoutMs?: number;
|
|
1006
|
-
readonly promptDumpDir?: string;
|
|
1007
1204
|
readonly cache?: EvaluationCache;
|
|
1008
1205
|
readonly useCache?: boolean;
|
|
1009
1206
|
readonly now?: () => Date;
|
|
@@ -1033,4 +1230,4 @@ type AgentKernel = {
|
|
|
1033
1230
|
};
|
|
1034
1231
|
declare function createAgentKernel(): AgentKernel;
|
|
1035
1232
|
|
|
1036
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, 
normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|
|
1233
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type ChildEvaluatorResult, type ClaudeCodeResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CompositeAggregatorConfig, CompositeEvaluator, type CompositeEvaluatorConfig, type CompositeEvaluatorOptions, CostEvaluator, type CostEvaluatorConfig, type CostEvaluatorOptions, DEFAULT_EXPLORATION_TOOLS, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type EvaluationVerdict, type Evaluator, type EvaluatorConfig, type EvaluatorFactory, type EvaluatorKind, type EvaluatorResult, type ExecutionMetrics, FieldAccuracyEvaluator, type FieldAccuracyEvaluatorConfig, type FieldAccuracyEvaluatorOptions, type FieldAggregationType, type FieldConfig, type FieldMatchType, type GeminiResolvedConfig, type GenerateRubricsOptions, type JsonObject, type JsonPrimitive, type JsonValue, LatencyEvaluator, type LatencyEvaluatorConfig, type LatencyEvaluatorOptions, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type PiAgentSdkResolvedConfig, type PiCodingAgentResolvedConfig, type ProgressEvent, type PromptInputs, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ProviderTokenUsage, type ResolvedTarget, type RubricItem, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type TokenUsage, TokenUsageEvaluator, type TokenUsageEvaluatorConfig, type TokenUsageEvaluatorOptions, type ToolTestMessage, ToolTrajectoryEvaluator, type ToolTrajectoryEvaluatorConfig, type ToolTrajectoryEvaluatorOptions, type ToolTrajectoryExpectedItem, type TraceSummary, type UserTestMessage, type 
VSCodeResolvedConfig, avgToolDurationMs, buildDirectoryChain, buildPromptInputs, buildSearchRoots, computeTraceSummary, consumeClaudeCodeLogEntries, consumeCodexLogEntries, consumePiLogEntries, createAgentKernel, createProvider, ensureVSCodeSubagents, explorationRatio, extractCodeBlocks, fileExists, findGitRoot, generateRubrics, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, mergeExecutionMetrics, normalizeLineEndings, readJsonFile, readTargetDefinitions, readTestSuiteMetadata, readTextFile, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, subscribeToClaudeCodeLogEntries, subscribeToCodexLogEntries, subscribeToPiLogEntries, tokensPerTool };
|