@agentv/core 0.2.11 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-P4GOYWYH.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1519 -396
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +107 -63
- package/dist/index.d.ts +107 -63
- package/dist/index.js +1519 -395
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/dist/chunk-P4GOYWYH.js.map +0 -1
- package/dist/chunk-XXNQA4EW.js +0 -140
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
75
75
|
* Guard validating raw test messages.
|
|
76
76
|
*/
|
|
77
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
78
|
-
declare const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
79
|
+
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
80
|
+
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
81
|
+
type CodeEvaluatorConfig = {
|
|
82
|
+
readonly name: string;
|
|
83
|
+
readonly type: "code";
|
|
84
|
+
readonly script: string;
|
|
85
|
+
readonly resolvedScriptPath?: string;
|
|
86
|
+
readonly cwd?: string;
|
|
87
|
+
readonly resolvedCwd?: string;
|
|
88
|
+
};
|
|
89
|
+
type LlmJudgeEvaluatorConfig = {
|
|
90
|
+
readonly name: string;
|
|
91
|
+
readonly type: "llm_judge";
|
|
92
|
+
readonly prompt?: string;
|
|
93
|
+
readonly promptPath?: string;
|
|
94
|
+
readonly model?: string;
|
|
95
|
+
};
|
|
96
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
91
97
|
/**
|
|
92
98
|
* Test case definition sourced from AgentV specs.
|
|
93
99
|
*/
|
|
@@ -103,7 +109,8 @@ interface EvalCase {
|
|
|
103
109
|
readonly file_paths: readonly string[];
|
|
104
110
|
readonly code_snippets: readonly string[];
|
|
105
111
|
readonly outcome: string;
|
|
106
|
-
readonly
|
|
112
|
+
readonly evaluator?: EvaluatorKind;
|
|
113
|
+
readonly evaluators?: readonly EvaluatorConfig[];
|
|
107
114
|
}
|
|
108
115
|
/**
|
|
109
116
|
* Evaluator scorecard for a single test case run.
|
|
@@ -121,7 +128,18 @@ interface EvaluationResult {
|
|
|
121
128
|
readonly reasoning?: string;
|
|
122
129
|
readonly raw_aspects?: readonly string[];
|
|
123
130
|
readonly raw_request?: JsonObject;
|
|
124
|
-
readonly
|
|
131
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
132
|
+
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
|
+
}
|
|
134
|
+
interface EvaluatorResult {
|
|
135
|
+
readonly name: string;
|
|
136
|
+
readonly type: EvaluatorKind;
|
|
137
|
+
readonly score: number;
|
|
138
|
+
readonly hits: readonly string[];
|
|
139
|
+
readonly misses: readonly string[];
|
|
140
|
+
readonly reasoning?: string;
|
|
141
|
+
readonly raw_request?: JsonObject;
|
|
142
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
125
143
|
}
|
|
126
144
|
/**
|
|
127
145
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
177
195
|
}>;
|
|
178
196
|
|
|
179
197
|
type ChatPrompt = AxChatRequest["chatPrompt"];
|
|
180
|
-
type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
|
|
198
|
+
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
181
199
|
interface ProviderRequest {
|
|
182
200
|
readonly prompt: string;
|
|
183
201
|
readonly guidelines?: string;
|
|
184
202
|
readonly guideline_patterns?: readonly string[];
|
|
185
203
|
readonly chatPrompt?: ChatPrompt;
|
|
186
|
-
readonly
|
|
204
|
+
readonly inputFiles?: readonly string[];
|
|
187
205
|
readonly evalCaseId?: string;
|
|
188
206
|
readonly attempt?: number;
|
|
189
207
|
readonly maxOutputTokens?: number;
|
|
@@ -242,6 +260,12 @@ interface GeminiResolvedConfig {
|
|
|
242
260
|
readonly temperature?: number;
|
|
243
261
|
readonly maxOutputTokens?: number;
|
|
244
262
|
}
|
|
263
|
+
interface CodexResolvedConfig {
|
|
264
|
+
readonly executable: string;
|
|
265
|
+
readonly args?: readonly string[];
|
|
266
|
+
readonly cwd?: string;
|
|
267
|
+
readonly timeoutMs?: number;
|
|
268
|
+
}
|
|
245
269
|
interface MockResolvedConfig {
|
|
246
270
|
readonly response?: string;
|
|
247
271
|
readonly delayMs?: number;
|
|
@@ -255,6 +279,24 @@ interface VSCodeResolvedConfig {
|
|
|
255
279
|
readonly subagentRoot?: string;
|
|
256
280
|
readonly workspaceTemplate?: string;
|
|
257
281
|
}
|
|
282
|
+
type CliHealthcheck = {
|
|
283
|
+
readonly type: "http";
|
|
284
|
+
readonly url: string;
|
|
285
|
+
readonly timeoutMs?: number;
|
|
286
|
+
} | {
|
|
287
|
+
readonly type: "command";
|
|
288
|
+
readonly commandTemplate: string;
|
|
289
|
+
readonly timeoutMs?: number;
|
|
290
|
+
readonly cwd?: string;
|
|
291
|
+
};
|
|
292
|
+
interface CliResolvedConfig {
|
|
293
|
+
readonly commandTemplate: string;
|
|
294
|
+
readonly filesFormat?: string;
|
|
295
|
+
readonly cwd?: string;
|
|
296
|
+
readonly env?: Record<string, string>;
|
|
297
|
+
readonly timeoutMs?: number;
|
|
298
|
+
readonly healthcheck?: CliHealthcheck;
|
|
299
|
+
}
|
|
258
300
|
type ResolvedTarget = {
|
|
259
301
|
readonly kind: "azure";
|
|
260
302
|
readonly name: string;
|
|
@@ -276,6 +318,13 @@ type ResolvedTarget = {
|
|
|
276
318
|
readonly workers?: number;
|
|
277
319
|
readonly providerBatching?: boolean;
|
|
278
320
|
readonly config: GeminiResolvedConfig;
|
|
321
|
+
} | {
|
|
322
|
+
readonly kind: "codex";
|
|
323
|
+
readonly name: string;
|
|
324
|
+
readonly judgeTarget?: string;
|
|
325
|
+
readonly workers?: number;
|
|
326
|
+
readonly providerBatching?: boolean;
|
|
327
|
+
readonly config: CodexResolvedConfig;
|
|
279
328
|
} | {
|
|
280
329
|
readonly kind: "mock";
|
|
281
330
|
readonly name: string;
|
|
@@ -290,6 +339,13 @@ type ResolvedTarget = {
|
|
|
290
339
|
readonly workers?: number;
|
|
291
340
|
readonly providerBatching?: boolean;
|
|
292
341
|
readonly config: VSCodeResolvedConfig;
|
|
342
|
+
} | {
|
|
343
|
+
readonly kind: "cli";
|
|
344
|
+
readonly name: string;
|
|
345
|
+
readonly judgeTarget?: string;
|
|
346
|
+
readonly workers?: number;
|
|
347
|
+
readonly providerBatching?: boolean;
|
|
348
|
+
readonly config: CliResolvedConfig;
|
|
293
349
|
};
|
|
294
350
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
|
295
351
|
|
|
@@ -317,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
|
|
|
317
373
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
318
374
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
319
375
|
|
|
320
|
-
interface
|
|
321
|
-
readonly score: number;
|
|
322
|
-
readonly hits: readonly string[];
|
|
323
|
-
readonly misses: readonly string[];
|
|
324
|
-
readonly hitCount: number;
|
|
325
|
-
readonly totalAspects: number;
|
|
326
|
-
readonly rawAspects: readonly string[];
|
|
327
|
-
}
|
|
328
|
-
/**
|
|
329
|
-
* Extract individual evaluation aspects from the expected assistant response.
|
|
330
|
-
*/
|
|
331
|
-
declare function extractAspects(expectedResponse: string): readonly string[];
|
|
332
|
-
/**
|
|
333
|
-
* Determine which aspects were covered by the candidate response.
|
|
334
|
-
*/
|
|
335
|
-
declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
|
|
336
|
-
/**
|
|
337
|
-
* Determine which aspects were not satisfied by the candidate response.
|
|
338
|
-
*/
|
|
339
|
-
declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
|
|
340
|
-
/**
|
|
341
|
-
* Evaluate the candidate response against the expected aspects.
|
|
342
|
-
*/
|
|
343
|
-
declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
|
|
344
|
-
/**
|
|
345
|
-
* Detect common error-prefixed outputs from providers.
|
|
346
|
-
*/
|
|
347
|
-
declare function isErrorLike(text: string | undefined | null): boolean;
|
|
348
|
-
|
|
349
|
-
interface GradeContext {
|
|
376
|
+
interface EvaluationContext {
|
|
350
377
|
readonly evalCase: EvalCase;
|
|
351
378
|
readonly candidate: string;
|
|
352
379
|
readonly target: ResolvedTarget;
|
|
@@ -355,40 +382,55 @@ interface GradeContext {
|
|
|
355
382
|
readonly promptInputs: {
|
|
356
383
|
readonly request: string;
|
|
357
384
|
readonly guidelines: string;
|
|
385
|
+
readonly systemMessage?: string;
|
|
358
386
|
};
|
|
359
387
|
readonly now: Date;
|
|
360
388
|
readonly judgeProvider?: Provider;
|
|
389
|
+
readonly systemPrompt?: string;
|
|
390
|
+
readonly evaluator?: EvaluatorConfig;
|
|
391
|
+
readonly judgeModel?: string;
|
|
361
392
|
}
|
|
362
|
-
interface
|
|
393
|
+
interface EvaluationScore {
|
|
363
394
|
readonly score: number;
|
|
364
395
|
readonly hits: readonly string[];
|
|
365
396
|
readonly misses: readonly string[];
|
|
366
397
|
readonly expectedAspectCount: number;
|
|
367
398
|
readonly reasoning?: string;
|
|
368
399
|
readonly rawAspects?: readonly string[];
|
|
369
|
-
readonly
|
|
400
|
+
readonly evaluatorRawRequest?: JsonObject;
|
|
370
401
|
}
|
|
371
|
-
interface
|
|
402
|
+
interface Evaluator {
|
|
372
403
|
readonly kind: string;
|
|
373
|
-
|
|
374
|
-
}
|
|
375
|
-
declare class HeuristicGrader implements Grader {
|
|
376
|
-
readonly kind = "heuristic";
|
|
377
|
-
grade(context: GradeContext): GradeResult;
|
|
404
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
378
405
|
}
|
|
379
|
-
type JudgeProviderResolver = (context:
|
|
380
|
-
interface
|
|
406
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
407
|
+
interface LlmJudgeEvaluatorOptions {
|
|
381
408
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
382
409
|
readonly maxOutputTokens?: number;
|
|
383
410
|
readonly temperature?: number;
|
|
411
|
+
readonly customPrompt?: string;
|
|
384
412
|
}
|
|
385
|
-
declare class
|
|
413
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
386
414
|
readonly kind = "llm_judge";
|
|
387
415
|
private readonly resolveJudgeProvider;
|
|
388
416
|
private readonly maxOutputTokens?;
|
|
389
417
|
private readonly temperature?;
|
|
390
|
-
|
|
391
|
-
|
|
418
|
+
private readonly customPrompt?;
|
|
419
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
420
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
421
|
+
}
|
|
422
|
+
interface CodeEvaluatorOptions {
|
|
423
|
+
readonly script: string;
|
|
424
|
+
readonly cwd?: string;
|
|
425
|
+
readonly agentTimeoutMs?: number;
|
|
426
|
+
}
|
|
427
|
+
declare class CodeEvaluator implements Evaluator {
|
|
428
|
+
readonly kind = "code";
|
|
429
|
+
private readonly script;
|
|
430
|
+
private readonly cwd?;
|
|
431
|
+
private readonly agentTimeoutMs?;
|
|
432
|
+
constructor(options: CodeEvaluatorOptions);
|
|
433
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
392
434
|
}
|
|
393
435
|
|
|
394
436
|
type MaybePromise<T> = T | Promise<T>;
|
|
@@ -400,7 +442,9 @@ interface RunEvalCaseOptions {
|
|
|
400
442
|
readonly evalCase: EvalCase;
|
|
401
443
|
readonly provider: Provider;
|
|
402
444
|
readonly target: ResolvedTarget;
|
|
403
|
-
readonly
|
|
445
|
+
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
446
|
+
readonly llm_judge: Evaluator;
|
|
447
|
+
};
|
|
404
448
|
readonly now?: () => Date;
|
|
405
449
|
readonly maxRetries?: number;
|
|
406
450
|
readonly agentTimeoutMs?: number;
|
|
@@ -425,7 +469,7 @@ interface RunEvaluationOptions {
|
|
|
425
469
|
readonly targets?: readonly TargetDefinition[];
|
|
426
470
|
readonly env?: EnvLookup;
|
|
427
471
|
readonly providerFactory?: (target: ResolvedTarget) => Provider;
|
|
428
|
-
readonly
|
|
472
|
+
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
429
473
|
readonly maxRetries?: number;
|
|
430
474
|
readonly agentTimeoutMs?: number;
|
|
431
475
|
readonly promptDumpDir?: string;
|
|
@@ -446,4 +490,4 @@ type AgentKernel = {
|
|
|
446
490
|
};
|
|
447
491
|
declare function createAgentKernel(): AgentKernel;
|
|
448
492
|
|
|
449
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type
|
|
493
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
|
package/dist/index.d.ts
CHANGED
|
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
75
75
|
* Guard validating raw test messages.
|
|
76
76
|
*/
|
|
77
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
78
|
-
declare const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
79
|
+
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
80
|
+
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
81
|
+
type CodeEvaluatorConfig = {
|
|
82
|
+
readonly name: string;
|
|
83
|
+
readonly type: "code";
|
|
84
|
+
readonly script: string;
|
|
85
|
+
readonly resolvedScriptPath?: string;
|
|
86
|
+
readonly cwd?: string;
|
|
87
|
+
readonly resolvedCwd?: string;
|
|
88
|
+
};
|
|
89
|
+
type LlmJudgeEvaluatorConfig = {
|
|
90
|
+
readonly name: string;
|
|
91
|
+
readonly type: "llm_judge";
|
|
92
|
+
readonly prompt?: string;
|
|
93
|
+
readonly promptPath?: string;
|
|
94
|
+
readonly model?: string;
|
|
95
|
+
};
|
|
96
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
91
97
|
/**
|
|
92
98
|
* Test case definition sourced from AgentV specs.
|
|
93
99
|
*/
|
|
@@ -103,7 +109,8 @@ interface EvalCase {
|
|
|
103
109
|
readonly file_paths: readonly string[];
|
|
104
110
|
readonly code_snippets: readonly string[];
|
|
105
111
|
readonly outcome: string;
|
|
106
|
-
readonly
|
|
112
|
+
readonly evaluator?: EvaluatorKind;
|
|
113
|
+
readonly evaluators?: readonly EvaluatorConfig[];
|
|
107
114
|
}
|
|
108
115
|
/**
|
|
109
116
|
* Evaluator scorecard for a single test case run.
|
|
@@ -121,7 +128,18 @@ interface EvaluationResult {
|
|
|
121
128
|
readonly reasoning?: string;
|
|
122
129
|
readonly raw_aspects?: readonly string[];
|
|
123
130
|
readonly raw_request?: JsonObject;
|
|
124
|
-
readonly
|
|
131
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
132
|
+
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
|
+
}
|
|
134
|
+
interface EvaluatorResult {
|
|
135
|
+
readonly name: string;
|
|
136
|
+
readonly type: EvaluatorKind;
|
|
137
|
+
readonly score: number;
|
|
138
|
+
readonly hits: readonly string[];
|
|
139
|
+
readonly misses: readonly string[];
|
|
140
|
+
readonly reasoning?: string;
|
|
141
|
+
readonly raw_request?: JsonObject;
|
|
142
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
125
143
|
}
|
|
126
144
|
/**
|
|
127
145
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
177
195
|
}>;
|
|
178
196
|
|
|
179
197
|
type ChatPrompt = AxChatRequest["chatPrompt"];
|
|
180
|
-
type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
|
|
198
|
+
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
181
199
|
interface ProviderRequest {
|
|
182
200
|
readonly prompt: string;
|
|
183
201
|
readonly guidelines?: string;
|
|
184
202
|
readonly guideline_patterns?: readonly string[];
|
|
185
203
|
readonly chatPrompt?: ChatPrompt;
|
|
186
|
-
readonly
|
|
204
|
+
readonly inputFiles?: readonly string[];
|
|
187
205
|
readonly evalCaseId?: string;
|
|
188
206
|
readonly attempt?: number;
|
|
189
207
|
readonly maxOutputTokens?: number;
|
|
@@ -242,6 +260,12 @@ interface GeminiResolvedConfig {
|
|
|
242
260
|
readonly temperature?: number;
|
|
243
261
|
readonly maxOutputTokens?: number;
|
|
244
262
|
}
|
|
263
|
+
interface CodexResolvedConfig {
|
|
264
|
+
readonly executable: string;
|
|
265
|
+
readonly args?: readonly string[];
|
|
266
|
+
readonly cwd?: string;
|
|
267
|
+
readonly timeoutMs?: number;
|
|
268
|
+
}
|
|
245
269
|
interface MockResolvedConfig {
|
|
246
270
|
readonly response?: string;
|
|
247
271
|
readonly delayMs?: number;
|
|
@@ -255,6 +279,24 @@ interface VSCodeResolvedConfig {
|
|
|
255
279
|
readonly subagentRoot?: string;
|
|
256
280
|
readonly workspaceTemplate?: string;
|
|
257
281
|
}
|
|
282
|
+
type CliHealthcheck = {
|
|
283
|
+
readonly type: "http";
|
|
284
|
+
readonly url: string;
|
|
285
|
+
readonly timeoutMs?: number;
|
|
286
|
+
} | {
|
|
287
|
+
readonly type: "command";
|
|
288
|
+
readonly commandTemplate: string;
|
|
289
|
+
readonly timeoutMs?: number;
|
|
290
|
+
readonly cwd?: string;
|
|
291
|
+
};
|
|
292
|
+
interface CliResolvedConfig {
|
|
293
|
+
readonly commandTemplate: string;
|
|
294
|
+
readonly filesFormat?: string;
|
|
295
|
+
readonly cwd?: string;
|
|
296
|
+
readonly env?: Record<string, string>;
|
|
297
|
+
readonly timeoutMs?: number;
|
|
298
|
+
readonly healthcheck?: CliHealthcheck;
|
|
299
|
+
}
|
|
258
300
|
type ResolvedTarget = {
|
|
259
301
|
readonly kind: "azure";
|
|
260
302
|
readonly name: string;
|
|
@@ -276,6 +318,13 @@ type ResolvedTarget = {
|
|
|
276
318
|
readonly workers?: number;
|
|
277
319
|
readonly providerBatching?: boolean;
|
|
278
320
|
readonly config: GeminiResolvedConfig;
|
|
321
|
+
} | {
|
|
322
|
+
readonly kind: "codex";
|
|
323
|
+
readonly name: string;
|
|
324
|
+
readonly judgeTarget?: string;
|
|
325
|
+
readonly workers?: number;
|
|
326
|
+
readonly providerBatching?: boolean;
|
|
327
|
+
readonly config: CodexResolvedConfig;
|
|
279
328
|
} | {
|
|
280
329
|
readonly kind: "mock";
|
|
281
330
|
readonly name: string;
|
|
@@ -290,6 +339,13 @@ type ResolvedTarget = {
|
|
|
290
339
|
readonly workers?: number;
|
|
291
340
|
readonly providerBatching?: boolean;
|
|
292
341
|
readonly config: VSCodeResolvedConfig;
|
|
342
|
+
} | {
|
|
343
|
+
readonly kind: "cli";
|
|
344
|
+
readonly name: string;
|
|
345
|
+
readonly judgeTarget?: string;
|
|
346
|
+
readonly workers?: number;
|
|
347
|
+
readonly providerBatching?: boolean;
|
|
348
|
+
readonly config: CliResolvedConfig;
|
|
293
349
|
};
|
|
294
350
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
|
295
351
|
|
|
@@ -317,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
|
|
|
317
373
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
318
374
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
319
375
|
|
|
320
|
-
interface
|
|
321
|
-
readonly score: number;
|
|
322
|
-
readonly hits: readonly string[];
|
|
323
|
-
readonly misses: readonly string[];
|
|
324
|
-
readonly hitCount: number;
|
|
325
|
-
readonly totalAspects: number;
|
|
326
|
-
readonly rawAspects: readonly string[];
|
|
327
|
-
}
|
|
328
|
-
/**
|
|
329
|
-
* Extract individual evaluation aspects from the expected assistant response.
|
|
330
|
-
*/
|
|
331
|
-
declare function extractAspects(expectedResponse: string): readonly string[];
|
|
332
|
-
/**
|
|
333
|
-
* Determine which aspects were covered by the candidate response.
|
|
334
|
-
*/
|
|
335
|
-
declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
|
|
336
|
-
/**
|
|
337
|
-
* Determine which aspects were not satisfied by the candidate response.
|
|
338
|
-
*/
|
|
339
|
-
declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
|
|
340
|
-
/**
|
|
341
|
-
* Evaluate the candidate response against the expected aspects.
|
|
342
|
-
*/
|
|
343
|
-
declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
|
|
344
|
-
/**
|
|
345
|
-
* Detect common error-prefixed outputs from providers.
|
|
346
|
-
*/
|
|
347
|
-
declare function isErrorLike(text: string | undefined | null): boolean;
|
|
348
|
-
|
|
349
|
-
interface GradeContext {
|
|
376
|
+
interface EvaluationContext {
|
|
350
377
|
readonly evalCase: EvalCase;
|
|
351
378
|
readonly candidate: string;
|
|
352
379
|
readonly target: ResolvedTarget;
|
|
@@ -355,40 +382,55 @@ interface GradeContext {
|
|
|
355
382
|
readonly promptInputs: {
|
|
356
383
|
readonly request: string;
|
|
357
384
|
readonly guidelines: string;
|
|
385
|
+
readonly systemMessage?: string;
|
|
358
386
|
};
|
|
359
387
|
readonly now: Date;
|
|
360
388
|
readonly judgeProvider?: Provider;
|
|
389
|
+
readonly systemPrompt?: string;
|
|
390
|
+
readonly evaluator?: EvaluatorConfig;
|
|
391
|
+
readonly judgeModel?: string;
|
|
361
392
|
}
|
|
362
|
-
interface
|
|
393
|
+
interface EvaluationScore {
|
|
363
394
|
readonly score: number;
|
|
364
395
|
readonly hits: readonly string[];
|
|
365
396
|
readonly misses: readonly string[];
|
|
366
397
|
readonly expectedAspectCount: number;
|
|
367
398
|
readonly reasoning?: string;
|
|
368
399
|
readonly rawAspects?: readonly string[];
|
|
369
|
-
readonly
|
|
400
|
+
readonly evaluatorRawRequest?: JsonObject;
|
|
370
401
|
}
|
|
371
|
-
interface
|
|
402
|
+
interface Evaluator {
|
|
372
403
|
readonly kind: string;
|
|
373
|
-
|
|
374
|
-
}
|
|
375
|
-
declare class HeuristicGrader implements Grader {
|
|
376
|
-
readonly kind = "heuristic";
|
|
377
|
-
grade(context: GradeContext): GradeResult;
|
|
404
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
378
405
|
}
|
|
379
|
-
type JudgeProviderResolver = (context:
|
|
380
|
-
interface
|
|
406
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
407
|
+
interface LlmJudgeEvaluatorOptions {
|
|
381
408
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
382
409
|
readonly maxOutputTokens?: number;
|
|
383
410
|
readonly temperature?: number;
|
|
411
|
+
readonly customPrompt?: string;
|
|
384
412
|
}
|
|
385
|
-
declare class
|
|
413
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
386
414
|
readonly kind = "llm_judge";
|
|
387
415
|
private readonly resolveJudgeProvider;
|
|
388
416
|
private readonly maxOutputTokens?;
|
|
389
417
|
private readonly temperature?;
|
|
390
|
-
|
|
391
|
-
|
|
418
|
+
private readonly customPrompt?;
|
|
419
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
420
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
421
|
+
}
|
|
422
|
+
interface CodeEvaluatorOptions {
|
|
423
|
+
readonly script: string;
|
|
424
|
+
readonly cwd?: string;
|
|
425
|
+
readonly agentTimeoutMs?: number;
|
|
426
|
+
}
|
|
427
|
+
declare class CodeEvaluator implements Evaluator {
|
|
428
|
+
readonly kind = "code";
|
|
429
|
+
private readonly script;
|
|
430
|
+
private readonly cwd?;
|
|
431
|
+
private readonly agentTimeoutMs?;
|
|
432
|
+
constructor(options: CodeEvaluatorOptions);
|
|
433
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
392
434
|
}
|
|
393
435
|
|
|
394
436
|
type MaybePromise<T> = T | Promise<T>;
|
|
@@ -400,7 +442,9 @@ interface RunEvalCaseOptions {
|
|
|
400
442
|
readonly evalCase: EvalCase;
|
|
401
443
|
readonly provider: Provider;
|
|
402
444
|
readonly target: ResolvedTarget;
|
|
403
|
-
readonly
|
|
445
|
+
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
446
|
+
readonly llm_judge: Evaluator;
|
|
447
|
+
};
|
|
404
448
|
readonly now?: () => Date;
|
|
405
449
|
readonly maxRetries?: number;
|
|
406
450
|
readonly agentTimeoutMs?: number;
|
|
@@ -425,7 +469,7 @@ interface RunEvaluationOptions {
|
|
|
425
469
|
readonly targets?: readonly TargetDefinition[];
|
|
426
470
|
readonly env?: EnvLookup;
|
|
427
471
|
readonly providerFactory?: (target: ResolvedTarget) => Provider;
|
|
428
|
-
readonly
|
|
472
|
+
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
429
473
|
readonly maxRetries?: number;
|
|
430
474
|
readonly agentTimeoutMs?: number;
|
|
431
475
|
readonly promptDumpDir?: string;
|
|
@@ -446,4 +490,4 @@ type AgentKernel = {
|
|
|
446
490
|
};
|
|
447
491
|
declare function createAgentKernel(): AgentKernel;
|
|
448
492
|
|
|
449
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type
|
|
493
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
|