@agentv/core 0.2.8 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-XXNQA4EW.js → chunk-NL7K4CAK.js} +5 -1
- package/dist/chunk-NL7K4CAK.js.map +1 -0
- package/dist/evaluation/validation/index.cjs +186 -1
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js +183 -2
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +1701 -324
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -63
- package/dist/index.d.ts +121 -63
- package/dist/index.js +1710 -327
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
- package/dist/chunk-XXNQA4EW.js.map +0 -1
package/dist/index.d.cts
CHANGED
|
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
75
75
|
* Guard validating raw test messages.
|
|
76
76
|
*/
|
|
77
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
78
|
-
declare const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
79
|
+
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
80
|
+
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
81
|
+
type CodeEvaluatorConfig = {
|
|
82
|
+
readonly name: string;
|
|
83
|
+
readonly type: "code";
|
|
84
|
+
readonly script: string;
|
|
85
|
+
readonly resolvedScriptPath?: string;
|
|
86
|
+
readonly cwd?: string;
|
|
87
|
+
readonly resolvedCwd?: string;
|
|
88
|
+
};
|
|
89
|
+
type LlmJudgeEvaluatorConfig = {
|
|
90
|
+
readonly name: string;
|
|
91
|
+
readonly type: "llm_judge";
|
|
92
|
+
readonly prompt?: string;
|
|
93
|
+
readonly promptPath?: string;
|
|
94
|
+
readonly model?: string;
|
|
95
|
+
};
|
|
96
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
91
97
|
/**
|
|
92
98
|
* Test case definition sourced from AgentV specs.
|
|
93
99
|
*/
|
|
@@ -103,7 +109,8 @@ interface EvalCase {
|
|
|
103
109
|
readonly file_paths: readonly string[];
|
|
104
110
|
readonly code_snippets: readonly string[];
|
|
105
111
|
readonly outcome: string;
|
|
106
|
-
readonly
|
|
112
|
+
readonly evaluator?: EvaluatorKind;
|
|
113
|
+
readonly evaluators?: readonly EvaluatorConfig[];
|
|
107
114
|
}
|
|
108
115
|
/**
|
|
109
116
|
* Evaluator scorecard for a single test case run.
|
|
@@ -121,7 +128,18 @@ interface EvaluationResult {
|
|
|
121
128
|
readonly reasoning?: string;
|
|
122
129
|
readonly raw_aspects?: readonly string[];
|
|
123
130
|
readonly raw_request?: JsonObject;
|
|
124
|
-
readonly
|
|
131
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
132
|
+
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
|
+
}
|
|
134
|
+
interface EvaluatorResult {
|
|
135
|
+
readonly name: string;
|
|
136
|
+
readonly type: EvaluatorKind;
|
|
137
|
+
readonly score: number;
|
|
138
|
+
readonly hits: readonly string[];
|
|
139
|
+
readonly misses: readonly string[];
|
|
140
|
+
readonly reasoning?: string;
|
|
141
|
+
readonly raw_request?: JsonObject;
|
|
142
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
125
143
|
}
|
|
126
144
|
/**
|
|
127
145
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
177
195
|
}>;
|
|
178
196
|
|
|
179
197
|
type ChatPrompt = AxChatRequest["chatPrompt"];
|
|
180
|
-
type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
|
|
198
|
+
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
181
199
|
interface ProviderRequest {
|
|
182
200
|
readonly prompt: string;
|
|
183
201
|
readonly guidelines?: string;
|
|
184
202
|
readonly guideline_patterns?: readonly string[];
|
|
185
203
|
readonly chatPrompt?: ChatPrompt;
|
|
186
|
-
readonly
|
|
204
|
+
readonly inputFiles?: readonly string[];
|
|
187
205
|
readonly evalCaseId?: string;
|
|
188
206
|
readonly attempt?: number;
|
|
189
207
|
readonly maxOutputTokens?: number;
|
|
@@ -202,6 +220,15 @@ interface Provider {
|
|
|
202
220
|
readonly kind: ProviderKind;
|
|
203
221
|
readonly targetName: string;
|
|
204
222
|
invoke(request: ProviderRequest): Promise<ProviderResponse>;
|
|
223
|
+
/**
|
|
224
|
+
* Optional capability marker for provider-managed batching (single session handling multiple requests).
|
|
225
|
+
*/
|
|
226
|
+
readonly supportsBatch?: boolean;
|
|
227
|
+
/**
|
|
228
|
+
* Optional batch invocation hook. When defined alongside supportsBatch=true,
|
|
229
|
+
* the orchestrator may send multiple requests in a single provider session.
|
|
230
|
+
*/
|
|
231
|
+
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
205
232
|
}
|
|
206
233
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
207
234
|
interface TargetDefinition {
|
|
@@ -233,6 +260,12 @@ interface GeminiResolvedConfig {
|
|
|
233
260
|
readonly temperature?: number;
|
|
234
261
|
readonly maxOutputTokens?: number;
|
|
235
262
|
}
|
|
263
|
+
interface CodexResolvedConfig {
|
|
264
|
+
readonly executable: string;
|
|
265
|
+
readonly args?: readonly string[];
|
|
266
|
+
readonly cwd?: string;
|
|
267
|
+
readonly timeoutMs?: number;
|
|
268
|
+
}
|
|
236
269
|
interface MockResolvedConfig {
|
|
237
270
|
readonly response?: string;
|
|
238
271
|
readonly delayMs?: number;
|
|
@@ -246,36 +279,73 @@ interface VSCodeResolvedConfig {
|
|
|
246
279
|
readonly subagentRoot?: string;
|
|
247
280
|
readonly workspaceTemplate?: string;
|
|
248
281
|
}
|
|
282
|
+
type CliHealthcheck = {
|
|
283
|
+
readonly type: "http";
|
|
284
|
+
readonly url: string;
|
|
285
|
+
readonly timeoutMs?: number;
|
|
286
|
+
} | {
|
|
287
|
+
readonly type: "command";
|
|
288
|
+
readonly commandTemplate: string;
|
|
289
|
+
readonly timeoutMs?: number;
|
|
290
|
+
readonly cwd?: string;
|
|
291
|
+
};
|
|
292
|
+
interface CliResolvedConfig {
|
|
293
|
+
readonly commandTemplate: string;
|
|
294
|
+
readonly filesFormat?: string;
|
|
295
|
+
readonly cwd?: string;
|
|
296
|
+
readonly env?: Record<string, string>;
|
|
297
|
+
readonly timeoutMs?: number;
|
|
298
|
+
readonly healthcheck?: CliHealthcheck;
|
|
299
|
+
}
|
|
249
300
|
type ResolvedTarget = {
|
|
250
301
|
readonly kind: "azure";
|
|
251
302
|
readonly name: string;
|
|
252
303
|
readonly judgeTarget?: string;
|
|
253
304
|
readonly workers?: number;
|
|
305
|
+
readonly providerBatching?: boolean;
|
|
254
306
|
readonly config: AzureResolvedConfig;
|
|
255
307
|
} | {
|
|
256
308
|
readonly kind: "anthropic";
|
|
257
309
|
readonly name: string;
|
|
258
310
|
readonly judgeTarget?: string;
|
|
259
311
|
readonly workers?: number;
|
|
312
|
+
readonly providerBatching?: boolean;
|
|
260
313
|
readonly config: AnthropicResolvedConfig;
|
|
261
314
|
} | {
|
|
262
315
|
readonly kind: "gemini";
|
|
263
316
|
readonly name: string;
|
|
264
317
|
readonly judgeTarget?: string;
|
|
265
318
|
readonly workers?: number;
|
|
319
|
+
readonly providerBatching?: boolean;
|
|
266
320
|
readonly config: GeminiResolvedConfig;
|
|
321
|
+
} | {
|
|
322
|
+
readonly kind: "codex";
|
|
323
|
+
readonly name: string;
|
|
324
|
+
readonly judgeTarget?: string;
|
|
325
|
+
readonly workers?: number;
|
|
326
|
+
readonly providerBatching?: boolean;
|
|
327
|
+
readonly config: CodexResolvedConfig;
|
|
267
328
|
} | {
|
|
268
329
|
readonly kind: "mock";
|
|
269
330
|
readonly name: string;
|
|
270
331
|
readonly judgeTarget?: string;
|
|
271
332
|
readonly workers?: number;
|
|
333
|
+
readonly providerBatching?: boolean;
|
|
272
334
|
readonly config: MockResolvedConfig;
|
|
273
335
|
} | {
|
|
274
336
|
readonly kind: "vscode" | "vscode-insiders";
|
|
275
337
|
readonly name: string;
|
|
276
338
|
readonly judgeTarget?: string;
|
|
277
339
|
readonly workers?: number;
|
|
340
|
+
readonly providerBatching?: boolean;
|
|
278
341
|
readonly config: VSCodeResolvedConfig;
|
|
342
|
+
} | {
|
|
343
|
+
readonly kind: "cli";
|
|
344
|
+
readonly name: string;
|
|
345
|
+
readonly judgeTarget?: string;
|
|
346
|
+
readonly workers?: number;
|
|
347
|
+
readonly providerBatching?: boolean;
|
|
348
|
+
readonly config: CliResolvedConfig;
|
|
279
349
|
};
|
|
280
350
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
|
281
351
|
|
|
@@ -303,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
|
|
|
303
373
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
304
374
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
305
375
|
|
|
306
|
-
interface
|
|
307
|
-
readonly score: number;
|
|
308
|
-
readonly hits: readonly string[];
|
|
309
|
-
readonly misses: readonly string[];
|
|
310
|
-
readonly hitCount: number;
|
|
311
|
-
readonly totalAspects: number;
|
|
312
|
-
readonly rawAspects: readonly string[];
|
|
313
|
-
}
|
|
314
|
-
/**
|
|
315
|
-
* Extract individual evaluation aspects from the expected assistant response.
|
|
316
|
-
*/
|
|
317
|
-
declare function extractAspects(expectedResponse: string): readonly string[];
|
|
318
|
-
/**
|
|
319
|
-
* Determine which aspects were covered by the candidate response.
|
|
320
|
-
*/
|
|
321
|
-
declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
|
|
322
|
-
/**
|
|
323
|
-
* Determine which aspects were not satisfied by the candidate response.
|
|
324
|
-
*/
|
|
325
|
-
declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
|
|
326
|
-
/**
|
|
327
|
-
* Evaluate the candidate response against the expected aspects.
|
|
328
|
-
*/
|
|
329
|
-
declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
|
|
330
|
-
/**
|
|
331
|
-
* Detect common error-prefixed outputs from providers.
|
|
332
|
-
*/
|
|
333
|
-
declare function isErrorLike(text: string | undefined | null): boolean;
|
|
334
|
-
|
|
335
|
-
interface GradeContext {
|
|
376
|
+
interface EvaluationContext {
|
|
336
377
|
readonly evalCase: EvalCase;
|
|
337
378
|
readonly candidate: string;
|
|
338
379
|
readonly target: ResolvedTarget;
|
|
@@ -341,40 +382,55 @@ interface GradeContext {
|
|
|
341
382
|
readonly promptInputs: {
|
|
342
383
|
readonly request: string;
|
|
343
384
|
readonly guidelines: string;
|
|
385
|
+
readonly systemMessage?: string;
|
|
344
386
|
};
|
|
345
387
|
readonly now: Date;
|
|
346
388
|
readonly judgeProvider?: Provider;
|
|
389
|
+
readonly systemPrompt?: string;
|
|
390
|
+
readonly evaluator?: EvaluatorConfig;
|
|
391
|
+
readonly judgeModel?: string;
|
|
347
392
|
}
|
|
348
|
-
interface
|
|
393
|
+
interface EvaluationScore {
|
|
349
394
|
readonly score: number;
|
|
350
395
|
readonly hits: readonly string[];
|
|
351
396
|
readonly misses: readonly string[];
|
|
352
397
|
readonly expectedAspectCount: number;
|
|
353
398
|
readonly reasoning?: string;
|
|
354
399
|
readonly rawAspects?: readonly string[];
|
|
355
|
-
readonly
|
|
400
|
+
readonly evaluatorRawRequest?: JsonObject;
|
|
356
401
|
}
|
|
357
|
-
interface
|
|
402
|
+
interface Evaluator {
|
|
358
403
|
readonly kind: string;
|
|
359
|
-
|
|
360
|
-
}
|
|
361
|
-
declare class HeuristicGrader implements Grader {
|
|
362
|
-
readonly kind = "heuristic";
|
|
363
|
-
grade(context: GradeContext): GradeResult;
|
|
404
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
364
405
|
}
|
|
365
|
-
type JudgeProviderResolver = (context:
|
|
366
|
-
interface
|
|
406
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
407
|
+
interface LlmJudgeEvaluatorOptions {
|
|
367
408
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
368
409
|
readonly maxOutputTokens?: number;
|
|
369
410
|
readonly temperature?: number;
|
|
411
|
+
readonly customPrompt?: string;
|
|
370
412
|
}
|
|
371
|
-
declare class
|
|
413
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
372
414
|
readonly kind = "llm_judge";
|
|
373
415
|
private readonly resolveJudgeProvider;
|
|
374
416
|
private readonly maxOutputTokens?;
|
|
375
417
|
private readonly temperature?;
|
|
376
|
-
|
|
377
|
-
|
|
418
|
+
private readonly customPrompt?;
|
|
419
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
420
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
421
|
+
}
|
|
422
|
+
interface CodeEvaluatorOptions {
|
|
423
|
+
readonly script: string;
|
|
424
|
+
readonly cwd?: string;
|
|
425
|
+
readonly agentTimeoutMs?: number;
|
|
426
|
+
}
|
|
427
|
+
declare class CodeEvaluator implements Evaluator {
|
|
428
|
+
readonly kind = "code";
|
|
429
|
+
private readonly script;
|
|
430
|
+
private readonly cwd?;
|
|
431
|
+
private readonly agentTimeoutMs?;
|
|
432
|
+
constructor(options: CodeEvaluatorOptions);
|
|
433
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
378
434
|
}
|
|
379
435
|
|
|
380
436
|
type MaybePromise<T> = T | Promise<T>;
|
|
@@ -386,7 +442,9 @@ interface RunEvalCaseOptions {
|
|
|
386
442
|
readonly evalCase: EvalCase;
|
|
387
443
|
readonly provider: Provider;
|
|
388
444
|
readonly target: ResolvedTarget;
|
|
389
|
-
readonly
|
|
445
|
+
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
446
|
+
readonly llm_judge: Evaluator;
|
|
447
|
+
};
|
|
390
448
|
readonly now?: () => Date;
|
|
391
449
|
readonly maxRetries?: number;
|
|
392
450
|
readonly agentTimeoutMs?: number;
|
|
@@ -411,7 +469,7 @@ interface RunEvaluationOptions {
|
|
|
411
469
|
readonly targets?: readonly TargetDefinition[];
|
|
412
470
|
readonly env?: EnvLookup;
|
|
413
471
|
readonly providerFactory?: (target: ResolvedTarget) => Provider;
|
|
414
|
-
readonly
|
|
472
|
+
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
415
473
|
readonly maxRetries?: number;
|
|
416
474
|
readonly agentTimeoutMs?: number;
|
|
417
475
|
readonly promptDumpDir?: string;
|
|
@@ -432,4 +490,4 @@ type AgentKernel = {
|
|
|
432
490
|
};
|
|
433
491
|
declare function createAgentKernel(): AgentKernel;
|
|
434
492
|
|
|
435
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type
|
|
493
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
|
package/dist/index.d.ts
CHANGED
|
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
|
|
|
75
75
|
* Guard validating raw test messages.
|
|
76
76
|
*/
|
|
77
77
|
declare function isTestMessage(value: unknown): value is TestMessage;
|
|
78
|
-
declare const
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
78
|
+
declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
|
|
79
|
+
type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
|
|
80
|
+
declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
|
|
81
|
+
type CodeEvaluatorConfig = {
|
|
82
|
+
readonly name: string;
|
|
83
|
+
readonly type: "code";
|
|
84
|
+
readonly script: string;
|
|
85
|
+
readonly resolvedScriptPath?: string;
|
|
86
|
+
readonly cwd?: string;
|
|
87
|
+
readonly resolvedCwd?: string;
|
|
88
|
+
};
|
|
89
|
+
type LlmJudgeEvaluatorConfig = {
|
|
90
|
+
readonly name: string;
|
|
91
|
+
readonly type: "llm_judge";
|
|
92
|
+
readonly prompt?: string;
|
|
93
|
+
readonly promptPath?: string;
|
|
94
|
+
readonly model?: string;
|
|
95
|
+
};
|
|
96
|
+
type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
|
|
91
97
|
/**
|
|
92
98
|
* Test case definition sourced from AgentV specs.
|
|
93
99
|
*/
|
|
@@ -103,7 +109,8 @@ interface EvalCase {
|
|
|
103
109
|
readonly file_paths: readonly string[];
|
|
104
110
|
readonly code_snippets: readonly string[];
|
|
105
111
|
readonly outcome: string;
|
|
106
|
-
readonly
|
|
112
|
+
readonly evaluator?: EvaluatorKind;
|
|
113
|
+
readonly evaluators?: readonly EvaluatorConfig[];
|
|
107
114
|
}
|
|
108
115
|
/**
|
|
109
116
|
* Evaluator scorecard for a single test case run.
|
|
@@ -121,7 +128,18 @@ interface EvaluationResult {
|
|
|
121
128
|
readonly reasoning?: string;
|
|
122
129
|
readonly raw_aspects?: readonly string[];
|
|
123
130
|
readonly raw_request?: JsonObject;
|
|
124
|
-
readonly
|
|
131
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
132
|
+
readonly evaluator_results?: readonly EvaluatorResult[];
|
|
133
|
+
}
|
|
134
|
+
interface EvaluatorResult {
|
|
135
|
+
readonly name: string;
|
|
136
|
+
readonly type: EvaluatorKind;
|
|
137
|
+
readonly score: number;
|
|
138
|
+
readonly hits: readonly string[];
|
|
139
|
+
readonly misses: readonly string[];
|
|
140
|
+
readonly reasoning?: string;
|
|
141
|
+
readonly raw_request?: JsonObject;
|
|
142
|
+
readonly evaluator_raw_request?: JsonObject;
|
|
125
143
|
}
|
|
126
144
|
/**
|
|
127
145
|
* Convenience accessor matching the Python hit_count property.
|
|
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
|
|
|
177
195
|
}>;
|
|
178
196
|
|
|
179
197
|
type ChatPrompt = AxChatRequest["chatPrompt"];
|
|
180
|
-
type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
|
|
198
|
+
type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
|
|
181
199
|
interface ProviderRequest {
|
|
182
200
|
readonly prompt: string;
|
|
183
201
|
readonly guidelines?: string;
|
|
184
202
|
readonly guideline_patterns?: readonly string[];
|
|
185
203
|
readonly chatPrompt?: ChatPrompt;
|
|
186
|
-
readonly
|
|
204
|
+
readonly inputFiles?: readonly string[];
|
|
187
205
|
readonly evalCaseId?: string;
|
|
188
206
|
readonly attempt?: number;
|
|
189
207
|
readonly maxOutputTokens?: number;
|
|
@@ -202,6 +220,15 @@ interface Provider {
|
|
|
202
220
|
readonly kind: ProviderKind;
|
|
203
221
|
readonly targetName: string;
|
|
204
222
|
invoke(request: ProviderRequest): Promise<ProviderResponse>;
|
|
223
|
+
/**
|
|
224
|
+
* Optional capability marker for provider-managed batching (single session handling multiple requests).
|
|
225
|
+
*/
|
|
226
|
+
readonly supportsBatch?: boolean;
|
|
227
|
+
/**
|
|
228
|
+
* Optional batch invocation hook. When defined alongside supportsBatch=true,
|
|
229
|
+
* the orchestrator may send multiple requests in a single provider session.
|
|
230
|
+
*/
|
|
231
|
+
invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
|
|
205
232
|
}
|
|
206
233
|
type EnvLookup = Readonly<Record<string, string | undefined>>;
|
|
207
234
|
interface TargetDefinition {
|
|
@@ -233,6 +260,12 @@ interface GeminiResolvedConfig {
|
|
|
233
260
|
readonly temperature?: number;
|
|
234
261
|
readonly maxOutputTokens?: number;
|
|
235
262
|
}
|
|
263
|
+
interface CodexResolvedConfig {
|
|
264
|
+
readonly executable: string;
|
|
265
|
+
readonly args?: readonly string[];
|
|
266
|
+
readonly cwd?: string;
|
|
267
|
+
readonly timeoutMs?: number;
|
|
268
|
+
}
|
|
236
269
|
interface MockResolvedConfig {
|
|
237
270
|
readonly response?: string;
|
|
238
271
|
readonly delayMs?: number;
|
|
@@ -246,36 +279,73 @@ interface VSCodeResolvedConfig {
|
|
|
246
279
|
readonly subagentRoot?: string;
|
|
247
280
|
readonly workspaceTemplate?: string;
|
|
248
281
|
}
|
|
282
|
+
type CliHealthcheck = {
|
|
283
|
+
readonly type: "http";
|
|
284
|
+
readonly url: string;
|
|
285
|
+
readonly timeoutMs?: number;
|
|
286
|
+
} | {
|
|
287
|
+
readonly type: "command";
|
|
288
|
+
readonly commandTemplate: string;
|
|
289
|
+
readonly timeoutMs?: number;
|
|
290
|
+
readonly cwd?: string;
|
|
291
|
+
};
|
|
292
|
+
interface CliResolvedConfig {
|
|
293
|
+
readonly commandTemplate: string;
|
|
294
|
+
readonly filesFormat?: string;
|
|
295
|
+
readonly cwd?: string;
|
|
296
|
+
readonly env?: Record<string, string>;
|
|
297
|
+
readonly timeoutMs?: number;
|
|
298
|
+
readonly healthcheck?: CliHealthcheck;
|
|
299
|
+
}
|
|
249
300
|
type ResolvedTarget = {
|
|
250
301
|
readonly kind: "azure";
|
|
251
302
|
readonly name: string;
|
|
252
303
|
readonly judgeTarget?: string;
|
|
253
304
|
readonly workers?: number;
|
|
305
|
+
readonly providerBatching?: boolean;
|
|
254
306
|
readonly config: AzureResolvedConfig;
|
|
255
307
|
} | {
|
|
256
308
|
readonly kind: "anthropic";
|
|
257
309
|
readonly name: string;
|
|
258
310
|
readonly judgeTarget?: string;
|
|
259
311
|
readonly workers?: number;
|
|
312
|
+
readonly providerBatching?: boolean;
|
|
260
313
|
readonly config: AnthropicResolvedConfig;
|
|
261
314
|
} | {
|
|
262
315
|
readonly kind: "gemini";
|
|
263
316
|
readonly name: string;
|
|
264
317
|
readonly judgeTarget?: string;
|
|
265
318
|
readonly workers?: number;
|
|
319
|
+
readonly providerBatching?: boolean;
|
|
266
320
|
readonly config: GeminiResolvedConfig;
|
|
321
|
+
} | {
|
|
322
|
+
readonly kind: "codex";
|
|
323
|
+
readonly name: string;
|
|
324
|
+
readonly judgeTarget?: string;
|
|
325
|
+
readonly workers?: number;
|
|
326
|
+
readonly providerBatching?: boolean;
|
|
327
|
+
readonly config: CodexResolvedConfig;
|
|
267
328
|
} | {
|
|
268
329
|
readonly kind: "mock";
|
|
269
330
|
readonly name: string;
|
|
270
331
|
readonly judgeTarget?: string;
|
|
271
332
|
readonly workers?: number;
|
|
333
|
+
readonly providerBatching?: boolean;
|
|
272
334
|
readonly config: MockResolvedConfig;
|
|
273
335
|
} | {
|
|
274
336
|
readonly kind: "vscode" | "vscode-insiders";
|
|
275
337
|
readonly name: string;
|
|
276
338
|
readonly judgeTarget?: string;
|
|
277
339
|
readonly workers?: number;
|
|
340
|
+
readonly providerBatching?: boolean;
|
|
278
341
|
readonly config: VSCodeResolvedConfig;
|
|
342
|
+
} | {
|
|
343
|
+
readonly kind: "cli";
|
|
344
|
+
readonly name: string;
|
|
345
|
+
readonly judgeTarget?: string;
|
|
346
|
+
readonly workers?: number;
|
|
347
|
+
readonly providerBatching?: boolean;
|
|
348
|
+
readonly config: CliResolvedConfig;
|
|
279
349
|
};
|
|
280
350
|
declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
|
|
281
351
|
|
|
@@ -303,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
|
|
|
303
373
|
declare function createProvider(target: ResolvedTarget): Provider;
|
|
304
374
|
declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
|
|
305
375
|
|
|
306
|
-
interface
|
|
307
|
-
readonly score: number;
|
|
308
|
-
readonly hits: readonly string[];
|
|
309
|
-
readonly misses: readonly string[];
|
|
310
|
-
readonly hitCount: number;
|
|
311
|
-
readonly totalAspects: number;
|
|
312
|
-
readonly rawAspects: readonly string[];
|
|
313
|
-
}
|
|
314
|
-
/**
|
|
315
|
-
* Extract individual evaluation aspects from the expected assistant response.
|
|
316
|
-
*/
|
|
317
|
-
declare function extractAspects(expectedResponse: string): readonly string[];
|
|
318
|
-
/**
|
|
319
|
-
* Determine which aspects were covered by the candidate response.
|
|
320
|
-
*/
|
|
321
|
-
declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
|
|
322
|
-
/**
|
|
323
|
-
* Determine which aspects were not satisfied by the candidate response.
|
|
324
|
-
*/
|
|
325
|
-
declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
|
|
326
|
-
/**
|
|
327
|
-
* Evaluate the candidate response against the expected aspects.
|
|
328
|
-
*/
|
|
329
|
-
declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
|
|
330
|
-
/**
|
|
331
|
-
* Detect common error-prefixed outputs from providers.
|
|
332
|
-
*/
|
|
333
|
-
declare function isErrorLike(text: string | undefined | null): boolean;
|
|
334
|
-
|
|
335
|
-
interface GradeContext {
|
|
376
|
+
interface EvaluationContext {
|
|
336
377
|
readonly evalCase: EvalCase;
|
|
337
378
|
readonly candidate: string;
|
|
338
379
|
readonly target: ResolvedTarget;
|
|
@@ -341,40 +382,55 @@ interface GradeContext {
|
|
|
341
382
|
readonly promptInputs: {
|
|
342
383
|
readonly request: string;
|
|
343
384
|
readonly guidelines: string;
|
|
385
|
+
readonly systemMessage?: string;
|
|
344
386
|
};
|
|
345
387
|
readonly now: Date;
|
|
346
388
|
readonly judgeProvider?: Provider;
|
|
389
|
+
readonly systemPrompt?: string;
|
|
390
|
+
readonly evaluator?: EvaluatorConfig;
|
|
391
|
+
readonly judgeModel?: string;
|
|
347
392
|
}
|
|
348
|
-
interface
|
|
393
|
+
interface EvaluationScore {
|
|
349
394
|
readonly score: number;
|
|
350
395
|
readonly hits: readonly string[];
|
|
351
396
|
readonly misses: readonly string[];
|
|
352
397
|
readonly expectedAspectCount: number;
|
|
353
398
|
readonly reasoning?: string;
|
|
354
399
|
readonly rawAspects?: readonly string[];
|
|
355
|
-
readonly
|
|
400
|
+
readonly evaluatorRawRequest?: JsonObject;
|
|
356
401
|
}
|
|
357
|
-
interface
|
|
402
|
+
interface Evaluator {
|
|
358
403
|
readonly kind: string;
|
|
359
|
-
|
|
360
|
-
}
|
|
361
|
-
declare class HeuristicGrader implements Grader {
|
|
362
|
-
readonly kind = "heuristic";
|
|
363
|
-
grade(context: GradeContext): GradeResult;
|
|
404
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
|
|
364
405
|
}
|
|
365
|
-
type JudgeProviderResolver = (context:
|
|
366
|
-
interface
|
|
406
|
+
type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
|
|
407
|
+
interface LlmJudgeEvaluatorOptions {
|
|
367
408
|
readonly resolveJudgeProvider: JudgeProviderResolver;
|
|
368
409
|
readonly maxOutputTokens?: number;
|
|
369
410
|
readonly temperature?: number;
|
|
411
|
+
readonly customPrompt?: string;
|
|
370
412
|
}
|
|
371
|
-
declare class
|
|
413
|
+
declare class LlmJudgeEvaluator implements Evaluator {
|
|
372
414
|
readonly kind = "llm_judge";
|
|
373
415
|
private readonly resolveJudgeProvider;
|
|
374
416
|
private readonly maxOutputTokens?;
|
|
375
417
|
private readonly temperature?;
|
|
376
|
-
|
|
377
|
-
|
|
418
|
+
private readonly customPrompt?;
|
|
419
|
+
constructor(options: LlmJudgeEvaluatorOptions);
|
|
420
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
421
|
+
}
|
|
422
|
+
interface CodeEvaluatorOptions {
|
|
423
|
+
readonly script: string;
|
|
424
|
+
readonly cwd?: string;
|
|
425
|
+
readonly agentTimeoutMs?: number;
|
|
426
|
+
}
|
|
427
|
+
declare class CodeEvaluator implements Evaluator {
|
|
428
|
+
readonly kind = "code";
|
|
429
|
+
private readonly script;
|
|
430
|
+
private readonly cwd?;
|
|
431
|
+
private readonly agentTimeoutMs?;
|
|
432
|
+
constructor(options: CodeEvaluatorOptions);
|
|
433
|
+
evaluate(context: EvaluationContext): Promise<EvaluationScore>;
|
|
378
434
|
}
|
|
379
435
|
|
|
380
436
|
type MaybePromise<T> = T | Promise<T>;
|
|
@@ -386,7 +442,9 @@ interface RunEvalCaseOptions {
|
|
|
386
442
|
readonly evalCase: EvalCase;
|
|
387
443
|
readonly provider: Provider;
|
|
388
444
|
readonly target: ResolvedTarget;
|
|
389
|
-
readonly
|
|
445
|
+
readonly evaluators: Partial<Record<string, Evaluator>> & {
|
|
446
|
+
readonly llm_judge: Evaluator;
|
|
447
|
+
};
|
|
390
448
|
readonly now?: () => Date;
|
|
391
449
|
readonly maxRetries?: number;
|
|
392
450
|
readonly agentTimeoutMs?: number;
|
|
@@ -411,7 +469,7 @@ interface RunEvaluationOptions {
|
|
|
411
469
|
readonly targets?: readonly TargetDefinition[];
|
|
412
470
|
readonly env?: EnvLookup;
|
|
413
471
|
readonly providerFactory?: (target: ResolvedTarget) => Provider;
|
|
414
|
-
readonly
|
|
472
|
+
readonly evaluators?: Partial<Record<string, Evaluator>>;
|
|
415
473
|
readonly maxRetries?: number;
|
|
416
474
|
readonly agentTimeoutMs?: number;
|
|
417
475
|
readonly promptDumpDir?: string;
|
|
@@ -432,4 +490,4 @@ type AgentKernel = {
|
|
|
432
490
|
};
|
|
433
491
|
declare function createAgentKernel(): AgentKernel;
|
|
434
492
|
|
|
435
|
-
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type
|
|
493
|
+
export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
|