@agentv/core 0.2.8 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
75
75
  * Guard validating raw test messages.
76
76
  */
77
77
  declare function isTestMessage(value: unknown): value is TestMessage;
78
- declare const GRADER_KIND_VALUES: readonly ["heuristic", "llm_judge"];
79
- /**
80
- * Supported grader implementations.
81
- */
82
- declare const GRADER_KINDS: readonly ["heuristic", "llm_judge"];
83
- /**
84
- * Grader identifiers available to the pipeline.
85
- */
86
- type GraderKind = (typeof GRADER_KIND_VALUES)[number];
87
- /**
88
- * Guard validating grader identifiers.
89
- */
90
- declare function isGraderKind(value: unknown): value is GraderKind;
78
+ declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
79
+ type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
80
+ declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
81
+ type CodeEvaluatorConfig = {
82
+ readonly name: string;
83
+ readonly type: "code";
84
+ readonly script: string;
85
+ readonly resolvedScriptPath?: string;
86
+ readonly cwd?: string;
87
+ readonly resolvedCwd?: string;
88
+ };
89
+ type LlmJudgeEvaluatorConfig = {
90
+ readonly name: string;
91
+ readonly type: "llm_judge";
92
+ readonly prompt?: string;
93
+ readonly promptPath?: string;
94
+ readonly model?: string;
95
+ };
96
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
91
97
  /**
92
98
  * Test case definition sourced from AgentV specs.
93
99
  */
@@ -103,7 +109,8 @@ interface EvalCase {
103
109
  readonly file_paths: readonly string[];
104
110
  readonly code_snippets: readonly string[];
105
111
  readonly outcome: string;
106
- readonly grader: GraderKind;
112
+ readonly evaluator?: EvaluatorKind;
113
+ readonly evaluators?: readonly EvaluatorConfig[];
107
114
  }
108
115
  /**
109
116
  * Evaluator scorecard for a single test case run.
@@ -121,7 +128,18 @@ interface EvaluationResult {
121
128
  readonly reasoning?: string;
122
129
  readonly raw_aspects?: readonly string[];
123
130
  readonly raw_request?: JsonObject;
124
- readonly grader_raw_request?: JsonObject;
131
+ readonly evaluator_raw_request?: JsonObject;
132
+ readonly evaluator_results?: readonly EvaluatorResult[];
133
+ }
134
+ interface EvaluatorResult {
135
+ readonly name: string;
136
+ readonly type: EvaluatorKind;
137
+ readonly score: number;
138
+ readonly hits: readonly string[];
139
+ readonly misses: readonly string[];
140
+ readonly reasoning?: string;
141
+ readonly raw_request?: JsonObject;
142
+ readonly evaluator_raw_request?: JsonObject;
125
143
  }
126
144
  /**
127
145
  * Convenience accessor matching the Python hit_count property.
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
177
195
  }>;
178
196
 
179
197
  type ChatPrompt = AxChatRequest["chatPrompt"];
180
- type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
198
+ type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
181
199
  interface ProviderRequest {
182
200
  readonly prompt: string;
183
201
  readonly guidelines?: string;
184
202
  readonly guideline_patterns?: readonly string[];
185
203
  readonly chatPrompt?: ChatPrompt;
186
- readonly attachments?: readonly string[];
204
+ readonly inputFiles?: readonly string[];
187
205
  readonly evalCaseId?: string;
188
206
  readonly attempt?: number;
189
207
  readonly maxOutputTokens?: number;
@@ -202,6 +220,15 @@ interface Provider {
202
220
  readonly kind: ProviderKind;
203
221
  readonly targetName: string;
204
222
  invoke(request: ProviderRequest): Promise<ProviderResponse>;
223
+ /**
224
+ * Optional capability marker for provider-managed batching (single session handling multiple requests).
225
+ */
226
+ readonly supportsBatch?: boolean;
227
+ /**
228
+ * Optional batch invocation hook. When defined alongside supportsBatch=true,
229
+ * the orchestrator may send multiple requests in a single provider session.
230
+ */
231
+ invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
205
232
  }
206
233
  type EnvLookup = Readonly<Record<string, string | undefined>>;
207
234
  interface TargetDefinition {
@@ -233,6 +260,12 @@ interface GeminiResolvedConfig {
233
260
  readonly temperature?: number;
234
261
  readonly maxOutputTokens?: number;
235
262
  }
263
+ interface CodexResolvedConfig {
264
+ readonly executable: string;
265
+ readonly args?: readonly string[];
266
+ readonly cwd?: string;
267
+ readonly timeoutMs?: number;
268
+ }
236
269
  interface MockResolvedConfig {
237
270
  readonly response?: string;
238
271
  readonly delayMs?: number;
@@ -246,36 +279,73 @@ interface VSCodeResolvedConfig {
246
279
  readonly subagentRoot?: string;
247
280
  readonly workspaceTemplate?: string;
248
281
  }
282
+ type CliHealthcheck = {
283
+ readonly type: "http";
284
+ readonly url: string;
285
+ readonly timeoutMs?: number;
286
+ } | {
287
+ readonly type: "command";
288
+ readonly commandTemplate: string;
289
+ readonly timeoutMs?: number;
290
+ readonly cwd?: string;
291
+ };
292
+ interface CliResolvedConfig {
293
+ readonly commandTemplate: string;
294
+ readonly filesFormat?: string;
295
+ readonly cwd?: string;
296
+ readonly env?: Record<string, string>;
297
+ readonly timeoutMs?: number;
298
+ readonly healthcheck?: CliHealthcheck;
299
+ }
249
300
  type ResolvedTarget = {
250
301
  readonly kind: "azure";
251
302
  readonly name: string;
252
303
  readonly judgeTarget?: string;
253
304
  readonly workers?: number;
305
+ readonly providerBatching?: boolean;
254
306
  readonly config: AzureResolvedConfig;
255
307
  } | {
256
308
  readonly kind: "anthropic";
257
309
  readonly name: string;
258
310
  readonly judgeTarget?: string;
259
311
  readonly workers?: number;
312
+ readonly providerBatching?: boolean;
260
313
  readonly config: AnthropicResolvedConfig;
261
314
  } | {
262
315
  readonly kind: "gemini";
263
316
  readonly name: string;
264
317
  readonly judgeTarget?: string;
265
318
  readonly workers?: number;
319
+ readonly providerBatching?: boolean;
266
320
  readonly config: GeminiResolvedConfig;
321
+ } | {
322
+ readonly kind: "codex";
323
+ readonly name: string;
324
+ readonly judgeTarget?: string;
325
+ readonly workers?: number;
326
+ readonly providerBatching?: boolean;
327
+ readonly config: CodexResolvedConfig;
267
328
  } | {
268
329
  readonly kind: "mock";
269
330
  readonly name: string;
270
331
  readonly judgeTarget?: string;
271
332
  readonly workers?: number;
333
+ readonly providerBatching?: boolean;
272
334
  readonly config: MockResolvedConfig;
273
335
  } | {
274
336
  readonly kind: "vscode" | "vscode-insiders";
275
337
  readonly name: string;
276
338
  readonly judgeTarget?: string;
277
339
  readonly workers?: number;
340
+ readonly providerBatching?: boolean;
278
341
  readonly config: VSCodeResolvedConfig;
342
+ } | {
343
+ readonly kind: "cli";
344
+ readonly name: string;
345
+ readonly judgeTarget?: string;
346
+ readonly workers?: number;
347
+ readonly providerBatching?: boolean;
348
+ readonly config: CliResolvedConfig;
279
349
  };
280
350
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
281
351
 
@@ -303,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
303
373
  declare function createProvider(target: ResolvedTarget): Provider;
304
374
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
305
375
 
306
- interface HeuristicScore {
307
- readonly score: number;
308
- readonly hits: readonly string[];
309
- readonly misses: readonly string[];
310
- readonly hitCount: number;
311
- readonly totalAspects: number;
312
- readonly rawAspects: readonly string[];
313
- }
314
- /**
315
- * Extract individual evaluation aspects from the expected assistant response.
316
- */
317
- declare function extractAspects(expectedResponse: string): readonly string[];
318
- /**
319
- * Determine which aspects were covered by the candidate response.
320
- */
321
- declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
322
- /**
323
- * Determine which aspects were not satisfied by the candidate response.
324
- */
325
- declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
326
- /**
327
- * Evaluate the candidate response against the expected aspects.
328
- */
329
- declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
330
- /**
331
- * Detect common error-prefixed outputs from providers.
332
- */
333
- declare function isErrorLike(text: string | undefined | null): boolean;
334
-
335
- interface GradeContext {
376
+ interface EvaluationContext {
336
377
  readonly evalCase: EvalCase;
337
378
  readonly candidate: string;
338
379
  readonly target: ResolvedTarget;
@@ -341,40 +382,55 @@ interface GradeContext {
341
382
  readonly promptInputs: {
342
383
  readonly request: string;
343
384
  readonly guidelines: string;
385
+ readonly systemMessage?: string;
344
386
  };
345
387
  readonly now: Date;
346
388
  readonly judgeProvider?: Provider;
389
+ readonly systemPrompt?: string;
390
+ readonly evaluator?: EvaluatorConfig;
391
+ readonly judgeModel?: string;
347
392
  }
348
- interface GradeResult {
393
+ interface EvaluationScore {
349
394
  readonly score: number;
350
395
  readonly hits: readonly string[];
351
396
  readonly misses: readonly string[];
352
397
  readonly expectedAspectCount: number;
353
398
  readonly reasoning?: string;
354
399
  readonly rawAspects?: readonly string[];
355
- readonly graderRawRequest?: JsonObject;
400
+ readonly evaluatorRawRequest?: JsonObject;
356
401
  }
357
- interface Grader {
402
+ interface Evaluator {
358
403
  readonly kind: string;
359
- grade(context: GradeContext): Promise<GradeResult> | GradeResult;
360
- }
361
- declare class HeuristicGrader implements Grader {
362
- readonly kind = "heuristic";
363
- grade(context: GradeContext): GradeResult;
404
+ evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
364
405
  }
365
- type JudgeProviderResolver = (context: GradeContext) => Promise<Provider | undefined>;
366
- interface QualityGraderOptions {
406
+ type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
407
+ interface LlmJudgeEvaluatorOptions {
367
408
  readonly resolveJudgeProvider: JudgeProviderResolver;
368
409
  readonly maxOutputTokens?: number;
369
410
  readonly temperature?: number;
411
+ readonly customPrompt?: string;
370
412
  }
371
- declare class QualityGrader implements Grader {
413
+ declare class LlmJudgeEvaluator implements Evaluator {
372
414
  readonly kind = "llm_judge";
373
415
  private readonly resolveJudgeProvider;
374
416
  private readonly maxOutputTokens?;
375
417
  private readonly temperature?;
376
- constructor(options: QualityGraderOptions);
377
- grade(context: GradeContext): Promise<GradeResult>;
418
+ private readonly customPrompt?;
419
+ constructor(options: LlmJudgeEvaluatorOptions);
420
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
421
+ }
422
+ interface CodeEvaluatorOptions {
423
+ readonly script: string;
424
+ readonly cwd?: string;
425
+ readonly agentTimeoutMs?: number;
426
+ }
427
+ declare class CodeEvaluator implements Evaluator {
428
+ readonly kind = "code";
429
+ private readonly script;
430
+ private readonly cwd?;
431
+ private readonly agentTimeoutMs?;
432
+ constructor(options: CodeEvaluatorOptions);
433
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
378
434
  }
379
435
 
380
436
  type MaybePromise<T> = T | Promise<T>;
@@ -386,7 +442,9 @@ interface RunEvalCaseOptions {
386
442
  readonly evalCase: EvalCase;
387
443
  readonly provider: Provider;
388
444
  readonly target: ResolvedTarget;
389
- readonly graders: Partial<Record<string, Grader>>;
445
+ readonly evaluators: Partial<Record<string, Evaluator>> & {
446
+ readonly llm_judge: Evaluator;
447
+ };
390
448
  readonly now?: () => Date;
391
449
  readonly maxRetries?: number;
392
450
  readonly agentTimeoutMs?: number;
@@ -411,7 +469,7 @@ interface RunEvaluationOptions {
411
469
  readonly targets?: readonly TargetDefinition[];
412
470
  readonly env?: EnvLookup;
413
471
  readonly providerFactory?: (target: ResolvedTarget) => Provider;
414
- readonly graders?: Partial<Record<string, Grader>>;
472
+ readonly evaluators?: Partial<Record<string, Evaluator>>;
415
473
  readonly maxRetries?: number;
416
474
  readonly agentTimeoutMs?: number;
417
475
  readonly promptDumpDir?: string;
@@ -432,4 +490,4 @@ type AgentKernel = {
432
490
  };
433
491
  declare function createAgentKernel(): AgentKernel;
434
492
 
435
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationResult, GRADER_KINDS, type GeminiResolvedConfig, type GradeContext, type GradeResult, type Grader, type GraderKind, HeuristicGrader, type HeuristicScore, type JsonObject, type JsonPrimitive, type JsonValue, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, QualityGrader, type QualityGraderOptions, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, calculateHits, calculateMisses, createAgentKernel, createProvider, ensureVSCodeSubagents, extractAspects, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isErrorLike, isGraderKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreCandidateResponse };
493
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
package/dist/index.d.ts CHANGED
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
75
75
  * Guard validating raw test messages.
76
76
  */
77
77
  declare function isTestMessage(value: unknown): value is TestMessage;
78
- declare const GRADER_KIND_VALUES: readonly ["heuristic", "llm_judge"];
79
- /**
80
- * Supported grader implementations.
81
- */
82
- declare const GRADER_KINDS: readonly ["heuristic", "llm_judge"];
83
- /**
84
- * Grader identifiers available to the pipeline.
85
- */
86
- type GraderKind = (typeof GRADER_KIND_VALUES)[number];
87
- /**
88
- * Guard validating grader identifiers.
89
- */
90
- declare function isGraderKind(value: unknown): value is GraderKind;
78
+ declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
79
+ type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
80
+ declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
81
+ type CodeEvaluatorConfig = {
82
+ readonly name: string;
83
+ readonly type: "code";
84
+ readonly script: string;
85
+ readonly resolvedScriptPath?: string;
86
+ readonly cwd?: string;
87
+ readonly resolvedCwd?: string;
88
+ };
89
+ type LlmJudgeEvaluatorConfig = {
90
+ readonly name: string;
91
+ readonly type: "llm_judge";
92
+ readonly prompt?: string;
93
+ readonly promptPath?: string;
94
+ readonly model?: string;
95
+ };
96
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
91
97
  /**
92
98
  * Test case definition sourced from AgentV specs.
93
99
  */
@@ -103,7 +109,8 @@ interface EvalCase {
103
109
  readonly file_paths: readonly string[];
104
110
  readonly code_snippets: readonly string[];
105
111
  readonly outcome: string;
106
- readonly grader: GraderKind;
112
+ readonly evaluator?: EvaluatorKind;
113
+ readonly evaluators?: readonly EvaluatorConfig[];
107
114
  }
108
115
  /**
109
116
  * Evaluator scorecard for a single test case run.
@@ -121,7 +128,18 @@ interface EvaluationResult {
121
128
  readonly reasoning?: string;
122
129
  readonly raw_aspects?: readonly string[];
123
130
  readonly raw_request?: JsonObject;
124
- readonly grader_raw_request?: JsonObject;
131
+ readonly evaluator_raw_request?: JsonObject;
132
+ readonly evaluator_results?: readonly EvaluatorResult[];
133
+ }
134
+ interface EvaluatorResult {
135
+ readonly name: string;
136
+ readonly type: EvaluatorKind;
137
+ readonly score: number;
138
+ readonly hits: readonly string[];
139
+ readonly misses: readonly string[];
140
+ readonly reasoning?: string;
141
+ readonly raw_request?: JsonObject;
142
+ readonly evaluator_raw_request?: JsonObject;
125
143
  }
126
144
  /**
127
145
  * Convenience accessor matching the Python hit_count property.
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
177
195
  }>;
178
196
 
179
197
  type ChatPrompt = AxChatRequest["chatPrompt"];
180
- type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
198
+ type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
181
199
  interface ProviderRequest {
182
200
  readonly prompt: string;
183
201
  readonly guidelines?: string;
184
202
  readonly guideline_patterns?: readonly string[];
185
203
  readonly chatPrompt?: ChatPrompt;
186
- readonly attachments?: readonly string[];
204
+ readonly inputFiles?: readonly string[];
187
205
  readonly evalCaseId?: string;
188
206
  readonly attempt?: number;
189
207
  readonly maxOutputTokens?: number;
@@ -202,6 +220,15 @@ interface Provider {
202
220
  readonly kind: ProviderKind;
203
221
  readonly targetName: string;
204
222
  invoke(request: ProviderRequest): Promise<ProviderResponse>;
223
+ /**
224
+ * Optional capability marker for provider-managed batching (single session handling multiple requests).
225
+ */
226
+ readonly supportsBatch?: boolean;
227
+ /**
228
+ * Optional batch invocation hook. When defined alongside supportsBatch=true,
229
+ * the orchestrator may send multiple requests in a single provider session.
230
+ */
231
+ invokeBatch?(requests: readonly ProviderRequest[]): Promise<readonly ProviderResponse[]>;
205
232
  }
206
233
  type EnvLookup = Readonly<Record<string, string | undefined>>;
207
234
  interface TargetDefinition {
@@ -233,6 +260,12 @@ interface GeminiResolvedConfig {
233
260
  readonly temperature?: number;
234
261
  readonly maxOutputTokens?: number;
235
262
  }
263
+ interface CodexResolvedConfig {
264
+ readonly executable: string;
265
+ readonly args?: readonly string[];
266
+ readonly cwd?: string;
267
+ readonly timeoutMs?: number;
268
+ }
236
269
  interface MockResolvedConfig {
237
270
  readonly response?: string;
238
271
  readonly delayMs?: number;
@@ -246,36 +279,73 @@ interface VSCodeResolvedConfig {
246
279
  readonly subagentRoot?: string;
247
280
  readonly workspaceTemplate?: string;
248
281
  }
282
+ type CliHealthcheck = {
283
+ readonly type: "http";
284
+ readonly url: string;
285
+ readonly timeoutMs?: number;
286
+ } | {
287
+ readonly type: "command";
288
+ readonly commandTemplate: string;
289
+ readonly timeoutMs?: number;
290
+ readonly cwd?: string;
291
+ };
292
+ interface CliResolvedConfig {
293
+ readonly commandTemplate: string;
294
+ readonly filesFormat?: string;
295
+ readonly cwd?: string;
296
+ readonly env?: Record<string, string>;
297
+ readonly timeoutMs?: number;
298
+ readonly healthcheck?: CliHealthcheck;
299
+ }
249
300
  type ResolvedTarget = {
250
301
  readonly kind: "azure";
251
302
  readonly name: string;
252
303
  readonly judgeTarget?: string;
253
304
  readonly workers?: number;
305
+ readonly providerBatching?: boolean;
254
306
  readonly config: AzureResolvedConfig;
255
307
  } | {
256
308
  readonly kind: "anthropic";
257
309
  readonly name: string;
258
310
  readonly judgeTarget?: string;
259
311
  readonly workers?: number;
312
+ readonly providerBatching?: boolean;
260
313
  readonly config: AnthropicResolvedConfig;
261
314
  } | {
262
315
  readonly kind: "gemini";
263
316
  readonly name: string;
264
317
  readonly judgeTarget?: string;
265
318
  readonly workers?: number;
319
+ readonly providerBatching?: boolean;
266
320
  readonly config: GeminiResolvedConfig;
321
+ } | {
322
+ readonly kind: "codex";
323
+ readonly name: string;
324
+ readonly judgeTarget?: string;
325
+ readonly workers?: number;
326
+ readonly providerBatching?: boolean;
327
+ readonly config: CodexResolvedConfig;
267
328
  } | {
268
329
  readonly kind: "mock";
269
330
  readonly name: string;
270
331
  readonly judgeTarget?: string;
271
332
  readonly workers?: number;
333
+ readonly providerBatching?: boolean;
272
334
  readonly config: MockResolvedConfig;
273
335
  } | {
274
336
  readonly kind: "vscode" | "vscode-insiders";
275
337
  readonly name: string;
276
338
  readonly judgeTarget?: string;
277
339
  readonly workers?: number;
340
+ readonly providerBatching?: boolean;
278
341
  readonly config: VSCodeResolvedConfig;
342
+ } | {
343
+ readonly kind: "cli";
344
+ readonly name: string;
345
+ readonly judgeTarget?: string;
346
+ readonly workers?: number;
347
+ readonly providerBatching?: boolean;
348
+ readonly config: CliResolvedConfig;
279
349
  };
280
350
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
281
351
 
@@ -303,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
303
373
  declare function createProvider(target: ResolvedTarget): Provider;
304
374
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
305
375
 
306
- interface HeuristicScore {
307
- readonly score: number;
308
- readonly hits: readonly string[];
309
- readonly misses: readonly string[];
310
- readonly hitCount: number;
311
- readonly totalAspects: number;
312
- readonly rawAspects: readonly string[];
313
- }
314
- /**
315
- * Extract individual evaluation aspects from the expected assistant response.
316
- */
317
- declare function extractAspects(expectedResponse: string): readonly string[];
318
- /**
319
- * Determine which aspects were covered by the candidate response.
320
- */
321
- declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
322
- /**
323
- * Determine which aspects were not satisfied by the candidate response.
324
- */
325
- declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
326
- /**
327
- * Evaluate the candidate response against the expected aspects.
328
- */
329
- declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
330
- /**
331
- * Detect common error-prefixed outputs from providers.
332
- */
333
- declare function isErrorLike(text: string | undefined | null): boolean;
334
-
335
- interface GradeContext {
376
+ interface EvaluationContext {
336
377
  readonly evalCase: EvalCase;
337
378
  readonly candidate: string;
338
379
  readonly target: ResolvedTarget;
@@ -341,40 +382,55 @@ interface GradeContext {
341
382
  readonly promptInputs: {
342
383
  readonly request: string;
343
384
  readonly guidelines: string;
385
+ readonly systemMessage?: string;
344
386
  };
345
387
  readonly now: Date;
346
388
  readonly judgeProvider?: Provider;
389
+ readonly systemPrompt?: string;
390
+ readonly evaluator?: EvaluatorConfig;
391
+ readonly judgeModel?: string;
347
392
  }
348
- interface GradeResult {
393
+ interface EvaluationScore {
349
394
  readonly score: number;
350
395
  readonly hits: readonly string[];
351
396
  readonly misses: readonly string[];
352
397
  readonly expectedAspectCount: number;
353
398
  readonly reasoning?: string;
354
399
  readonly rawAspects?: readonly string[];
355
- readonly graderRawRequest?: JsonObject;
400
+ readonly evaluatorRawRequest?: JsonObject;
356
401
  }
357
- interface Grader {
402
+ interface Evaluator {
358
403
  readonly kind: string;
359
- grade(context: GradeContext): Promise<GradeResult> | GradeResult;
360
- }
361
- declare class HeuristicGrader implements Grader {
362
- readonly kind = "heuristic";
363
- grade(context: GradeContext): GradeResult;
404
+ evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
364
405
  }
365
- type JudgeProviderResolver = (context: GradeContext) => Promise<Provider | undefined>;
366
- interface QualityGraderOptions {
406
+ type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
407
+ interface LlmJudgeEvaluatorOptions {
367
408
  readonly resolveJudgeProvider: JudgeProviderResolver;
368
409
  readonly maxOutputTokens?: number;
369
410
  readonly temperature?: number;
411
+ readonly customPrompt?: string;
370
412
  }
371
- declare class QualityGrader implements Grader {
413
+ declare class LlmJudgeEvaluator implements Evaluator {
372
414
  readonly kind = "llm_judge";
373
415
  private readonly resolveJudgeProvider;
374
416
  private readonly maxOutputTokens?;
375
417
  private readonly temperature?;
376
- constructor(options: QualityGraderOptions);
377
- grade(context: GradeContext): Promise<GradeResult>;
418
+ private readonly customPrompt?;
419
+ constructor(options: LlmJudgeEvaluatorOptions);
420
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
421
+ }
422
+ interface CodeEvaluatorOptions {
423
+ readonly script: string;
424
+ readonly cwd?: string;
425
+ readonly agentTimeoutMs?: number;
426
+ }
427
+ declare class CodeEvaluator implements Evaluator {
428
+ readonly kind = "code";
429
+ private readonly script;
430
+ private readonly cwd?;
431
+ private readonly agentTimeoutMs?;
432
+ constructor(options: CodeEvaluatorOptions);
433
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
378
434
  }
379
435
 
380
436
  type MaybePromise<T> = T | Promise<T>;
@@ -386,7 +442,9 @@ interface RunEvalCaseOptions {
386
442
  readonly evalCase: EvalCase;
387
443
  readonly provider: Provider;
388
444
  readonly target: ResolvedTarget;
389
- readonly graders: Partial<Record<string, Grader>>;
445
+ readonly evaluators: Partial<Record<string, Evaluator>> & {
446
+ readonly llm_judge: Evaluator;
447
+ };
390
448
  readonly now?: () => Date;
391
449
  readonly maxRetries?: number;
392
450
  readonly agentTimeoutMs?: number;
@@ -411,7 +469,7 @@ interface RunEvaluationOptions {
411
469
  readonly targets?: readonly TargetDefinition[];
412
470
  readonly env?: EnvLookup;
413
471
  readonly providerFactory?: (target: ResolvedTarget) => Provider;
414
- readonly graders?: Partial<Record<string, Grader>>;
472
+ readonly evaluators?: Partial<Record<string, Evaluator>>;
415
473
  readonly maxRetries?: number;
416
474
  readonly agentTimeoutMs?: number;
417
475
  readonly promptDumpDir?: string;
@@ -432,4 +490,4 @@ type AgentKernel = {
432
490
  };
433
491
  declare function createAgentKernel(): AgentKernel;
434
492
 
435
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationResult, GRADER_KINDS, type GeminiResolvedConfig, type GradeContext, type GradeResult, type Grader, type GraderKind, HeuristicGrader, type HeuristicScore, type JsonObject, type JsonPrimitive, type JsonValue, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, QualityGrader, type QualityGraderOptions, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, calculateHits, calculateMisses, createAgentKernel, createProvider, ensureVSCodeSubagents, extractAspects, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isErrorLike, isGraderKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreCandidateResponse };
493
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type CodexResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };