@agentv/core 0.2.11 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
75
75
  * Guard validating raw test messages.
76
76
  */
77
77
  declare function isTestMessage(value: unknown): value is TestMessage;
78
- declare const GRADER_KIND_VALUES: readonly ["heuristic", "llm_judge"];
79
- /**
80
- * Supported grader implementations.
81
- */
82
- declare const GRADER_KINDS: readonly ["heuristic", "llm_judge"];
83
- /**
84
- * Grader identifiers available to the pipeline.
85
- */
86
- type GraderKind = (typeof GRADER_KIND_VALUES)[number];
87
- /**
88
- * Guard validating grader identifiers.
89
- */
90
- declare function isGraderKind(value: unknown): value is GraderKind;
78
+ declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
79
+ type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
80
+ declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
81
+ type CodeEvaluatorConfig = {
82
+ readonly name: string;
83
+ readonly type: "code";
84
+ readonly script: string;
85
+ readonly resolvedScriptPath?: string;
86
+ readonly cwd?: string;
87
+ readonly resolvedCwd?: string;
88
+ };
89
+ type LlmJudgeEvaluatorConfig = {
90
+ readonly name: string;
91
+ readonly type: "llm_judge";
92
+ readonly prompt?: string;
93
+ readonly promptPath?: string;
94
+ readonly model?: string;
95
+ };
96
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
91
97
  /**
92
98
  * Test case definition sourced from AgentV specs.
93
99
  */
@@ -103,7 +109,8 @@ interface EvalCase {
103
109
  readonly file_paths: readonly string[];
104
110
  readonly code_snippets: readonly string[];
105
111
  readonly outcome: string;
106
- readonly grader: GraderKind;
112
+ readonly evaluator?: EvaluatorKind;
113
+ readonly evaluators?: readonly EvaluatorConfig[];
107
114
  }
108
115
  /**
109
116
  * Evaluator scorecard for a single test case run.
@@ -121,7 +128,18 @@ interface EvaluationResult {
121
128
  readonly reasoning?: string;
122
129
  readonly raw_aspects?: readonly string[];
123
130
  readonly raw_request?: JsonObject;
124
- readonly grader_raw_request?: JsonObject;
131
+ readonly evaluator_raw_request?: JsonObject;
132
+ readonly evaluator_results?: readonly EvaluatorResult[];
133
+ }
134
+ interface EvaluatorResult {
135
+ readonly name: string;
136
+ readonly type: EvaluatorKind;
137
+ readonly score: number;
138
+ readonly hits: readonly string[];
139
+ readonly misses: readonly string[];
140
+ readonly reasoning?: string;
141
+ readonly raw_request?: JsonObject;
142
+ readonly evaluator_raw_request?: JsonObject;
125
143
  }
126
144
  /**
127
145
  * Convenience accessor matching the Python hit_count property.
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
177
195
  }>;
178
196
 
179
197
  type ChatPrompt = AxChatRequest["chatPrompt"];
180
- type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
198
+ type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
181
199
  interface ProviderRequest {
182
200
  readonly prompt: string;
183
201
  readonly guidelines?: string;
184
202
  readonly guideline_patterns?: readonly string[];
185
203
  readonly chatPrompt?: ChatPrompt;
186
- readonly attachments?: readonly string[];
204
+ readonly inputFiles?: readonly string[];
187
205
  readonly evalCaseId?: string;
188
206
  readonly attempt?: number;
189
207
  readonly maxOutputTokens?: number;
@@ -242,6 +260,12 @@ interface GeminiResolvedConfig {
242
260
  readonly temperature?: number;
243
261
  readonly maxOutputTokens?: number;
244
262
  }
263
+ interface CodexResolvedConfig {
264
+ readonly executable: string;
265
+ readonly args?: readonly string[];
266
+ readonly cwd?: string;
267
+ readonly timeoutMs?: number;
268
+ }
245
269
  interface MockResolvedConfig {
246
270
  readonly response?: string;
247
271
  readonly delayMs?: number;
@@ -255,6 +279,24 @@ interface VSCodeResolvedConfig {
255
279
  readonly subagentRoot?: string;
256
280
  readonly workspaceTemplate?: string;
257
281
  }
282
+ type CliHealthcheck = {
283
+ readonly type: "http";
284
+ readonly url: string;
285
+ readonly timeoutMs?: number;
286
+ } | {
287
+ readonly type: "command";
288
+ readonly commandTemplate: string;
289
+ readonly timeoutMs?: number;
290
+ readonly cwd?: string;
291
+ };
292
+ interface CliResolvedConfig {
293
+ readonly commandTemplate: string;
294
+ readonly filesFormat?: string;
295
+ readonly cwd?: string;
296
+ readonly env?: Record<string, string>;
297
+ readonly timeoutMs?: number;
298
+ readonly healthcheck?: CliHealthcheck;
299
+ }
258
300
  type ResolvedTarget = {
259
301
  readonly kind: "azure";
260
302
  readonly name: string;
@@ -276,6 +318,13 @@ type ResolvedTarget = {
276
318
  readonly workers?: number;
277
319
  readonly providerBatching?: boolean;
278
320
  readonly config: GeminiResolvedConfig;
321
+ } | {
322
+ readonly kind: "codex";
323
+ readonly name: string;
324
+ readonly judgeTarget?: string;
325
+ readonly workers?: number;
326
+ readonly providerBatching?: boolean;
327
+ readonly config: CodexResolvedConfig;
279
328
  } | {
280
329
  readonly kind: "mock";
281
330
  readonly name: string;
@@ -290,6 +339,13 @@ type ResolvedTarget = {
290
339
  readonly workers?: number;
291
340
  readonly providerBatching?: boolean;
292
341
  readonly config: VSCodeResolvedConfig;
342
+ } | {
343
+ readonly kind: "cli";
344
+ readonly name: string;
345
+ readonly judgeTarget?: string;
346
+ readonly workers?: number;
347
+ readonly providerBatching?: boolean;
348
+ readonly config: CliResolvedConfig;
293
349
  };
294
350
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
295
351
 
@@ -317,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
317
373
  declare function createProvider(target: ResolvedTarget): Provider;
318
374
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
319
375
 
320
- interface HeuristicScore {
321
- readonly score: number;
322
- readonly hits: readonly string[];
323
- readonly misses: readonly string[];
324
- readonly hitCount: number;
325
- readonly totalAspects: number;
326
- readonly rawAspects: readonly string[];
327
- }
328
- /**
329
- * Extract individual evaluation aspects from the expected assistant response.
330
- */
331
- declare function extractAspects(expectedResponse: string): readonly string[];
332
- /**
333
- * Determine which aspects were covered by the candidate response.
334
- */
335
- declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
336
- /**
337
- * Determine which aspects were not satisfied by the candidate response.
338
- */
339
- declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
340
- /**
341
- * Evaluate the candidate response against the expected aspects.
342
- */
343
- declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
344
- /**
345
- * Detect common error-prefixed outputs from providers.
346
- */
347
- declare function isErrorLike(text: string | undefined | null): boolean;
348
-
349
- interface GradeContext {
376
+ interface EvaluationContext {
350
377
  readonly evalCase: EvalCase;
351
378
  readonly candidate: string;
352
379
  readonly target: ResolvedTarget;
@@ -355,40 +382,55 @@ interface GradeContext {
355
382
  readonly promptInputs: {
356
383
  readonly request: string;
357
384
  readonly guidelines: string;
385
+ readonly systemMessage?: string;
358
386
  };
359
387
  readonly now: Date;
360
388
  readonly judgeProvider?: Provider;
389
+ readonly systemPrompt?: string;
390
+ readonly evaluator?: EvaluatorConfig;
391
+ readonly judgeModel?: string;
361
392
  }
362
- interface GradeResult {
393
+ interface EvaluationScore {
363
394
  readonly score: number;
364
395
  readonly hits: readonly string[];
365
396
  readonly misses: readonly string[];
366
397
  readonly expectedAspectCount: number;
367
398
  readonly reasoning?: string;
368
399
  readonly rawAspects?: readonly string[];
369
- readonly graderRawRequest?: JsonObject;
400
+ readonly evaluatorRawRequest?: JsonObject;
370
401
  }
371
- interface Grader {
402
+ interface Evaluator {
372
403
  readonly kind: string;
373
- grade(context: GradeContext): Promise<GradeResult> | GradeResult;
374
- }
375
- declare class HeuristicGrader implements Grader {
376
- readonly kind = "heuristic";
377
- grade(context: GradeContext): GradeResult;
404
+ evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
378
405
  }
379
- type JudgeProviderResolver = (context: GradeContext) => Promise<Provider | undefined>;
380
- interface QualityGraderOptions {
406
+ type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
407
+ interface LlmJudgeEvaluatorOptions {
381
408
  readonly resolveJudgeProvider: JudgeProviderResolver;
382
409
  readonly maxOutputTokens?: number;
383
410
  readonly temperature?: number;
411
+ readonly customPrompt?: string;
384
412
  }
385
- declare class QualityGrader implements Grader {
413
+ declare class LlmJudgeEvaluator implements Evaluator {
386
414
  readonly kind = "llm_judge";
387
415
  private readonly resolveJudgeProvider;
388
416
  private readonly maxOutputTokens?;
389
417
  private readonly temperature?;
390
- constructor(options: QualityGraderOptions);
391
- grade(context: GradeContext): Promise<GradeResult>;
418
+ private readonly customPrompt?;
419
+ constructor(options: LlmJudgeEvaluatorOptions);
420
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
421
+ }
422
+ interface CodeEvaluatorOptions {
423
+ readonly script: string;
424
+ readonly cwd?: string;
425
+ readonly agentTimeoutMs?: number;
426
+ }
427
+ declare class CodeEvaluator implements Evaluator {
428
+ readonly kind = "code";
429
+ private readonly script;
430
+ private readonly cwd?;
431
+ private readonly agentTimeoutMs?;
432
+ constructor(options: CodeEvaluatorOptions);
433
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
392
434
  }
393
435
 
394
436
  type MaybePromise<T> = T | Promise<T>;
@@ -400,7 +442,9 @@ interface RunEvalCaseOptions {
400
442
  readonly evalCase: EvalCase;
401
443
  readonly provider: Provider;
402
444
  readonly target: ResolvedTarget;
403
- readonly graders: Partial<Record<string, Grader>>;
445
+ readonly evaluators: Partial<Record<string, Evaluator>> & {
446
+ readonly llm_judge: Evaluator;
447
+ };
404
448
  readonly now?: () => Date;
405
449
  readonly maxRetries?: number;
406
450
  readonly agentTimeoutMs?: number;
@@ -425,7 +469,7 @@ interface RunEvaluationOptions {
425
469
  readonly targets?: readonly TargetDefinition[];
426
470
  readonly env?: EnvLookup;
427
471
  readonly providerFactory?: (target: ResolvedTarget) => Provider;
428
- readonly graders?: Partial<Record<string, Grader>>;
472
+ readonly evaluators?: Partial<Record<string, Evaluator>>;
429
473
  readonly maxRetries?: number;
430
474
  readonly agentTimeoutMs?: number;
431
475
  readonly promptDumpDir?: string;
@@ -446,4 +490,4 @@ type AgentKernel = {
446
490
  };
447
491
  declare function createAgentKernel(): AgentKernel;
448
492
 
449
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationResult, GRADER_KINDS, type GeminiResolvedConfig, type GradeContext, type GradeResult, type Grader, type GraderKind, HeuristicGrader, type HeuristicScore, type JsonObject, type JsonPrimitive, type JsonValue, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, QualityGrader, type QualityGraderOptions, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, calculateHits, calculateMisses, createAgentKernel, createProvider, ensureVSCodeSubagents, extractAspects, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isErrorLike, isGraderKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreCandidateResponse };
493
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };
package/dist/index.d.ts CHANGED
@@ -75,19 +75,25 @@ declare function isJsonValue(value: unknown): value is JsonValue;
75
75
  * Guard validating raw test messages.
76
76
  */
77
77
  declare function isTestMessage(value: unknown): value is TestMessage;
78
- declare const GRADER_KIND_VALUES: readonly ["heuristic", "llm_judge"];
79
- /**
80
- * Supported grader implementations.
81
- */
82
- declare const GRADER_KINDS: readonly ["heuristic", "llm_judge"];
83
- /**
84
- * Grader identifiers available to the pipeline.
85
- */
86
- type GraderKind = (typeof GRADER_KIND_VALUES)[number];
87
- /**
88
- * Guard validating grader identifiers.
89
- */
90
- declare function isGraderKind(value: unknown): value is GraderKind;
78
+ declare const EVALUATOR_KIND_VALUES: readonly ["code", "llm_judge"];
79
+ type EvaluatorKind = (typeof EVALUATOR_KIND_VALUES)[number];
80
+ declare function isEvaluatorKind(value: unknown): value is EvaluatorKind;
81
+ type CodeEvaluatorConfig = {
82
+ readonly name: string;
83
+ readonly type: "code";
84
+ readonly script: string;
85
+ readonly resolvedScriptPath?: string;
86
+ readonly cwd?: string;
87
+ readonly resolvedCwd?: string;
88
+ };
89
+ type LlmJudgeEvaluatorConfig = {
90
+ readonly name: string;
91
+ readonly type: "llm_judge";
92
+ readonly prompt?: string;
93
+ readonly promptPath?: string;
94
+ readonly model?: string;
95
+ };
96
+ type EvaluatorConfig = CodeEvaluatorConfig | LlmJudgeEvaluatorConfig;
91
97
  /**
92
98
  * Test case definition sourced from AgentV specs.
93
99
  */
@@ -103,7 +109,8 @@ interface EvalCase {
103
109
  readonly file_paths: readonly string[];
104
110
  readonly code_snippets: readonly string[];
105
111
  readonly outcome: string;
106
- readonly grader: GraderKind;
112
+ readonly evaluator?: EvaluatorKind;
113
+ readonly evaluators?: readonly EvaluatorConfig[];
107
114
  }
108
115
  /**
109
116
  * Evaluator scorecard for a single test case run.
@@ -121,7 +128,18 @@ interface EvaluationResult {
121
128
  readonly reasoning?: string;
122
129
  readonly raw_aspects?: readonly string[];
123
130
  readonly raw_request?: JsonObject;
124
- readonly grader_raw_request?: JsonObject;
131
+ readonly evaluator_raw_request?: JsonObject;
132
+ readonly evaluator_results?: readonly EvaluatorResult[];
133
+ }
134
+ interface EvaluatorResult {
135
+ readonly name: string;
136
+ readonly type: EvaluatorKind;
137
+ readonly score: number;
138
+ readonly hits: readonly string[];
139
+ readonly misses: readonly string[];
140
+ readonly reasoning?: string;
141
+ readonly raw_request?: JsonObject;
142
+ readonly evaluator_raw_request?: JsonObject;
125
143
  }
126
144
  /**
127
145
  * Convenience accessor matching the Python hit_count property.
@@ -177,13 +195,13 @@ declare function resolveFileReference(rawValue: string, searchRoots: readonly st
177
195
  }>;
178
196
 
179
197
  type ChatPrompt = AxChatRequest["chatPrompt"];
180
- type ProviderKind = "azure" | "anthropic" | "gemini" | "mock" | "vscode" | "vscode-insiders";
198
+ type ProviderKind = "azure" | "anthropic" | "gemini" | "codex" | "cli" | "mock" | "vscode" | "vscode-insiders";
181
199
  interface ProviderRequest {
182
200
  readonly prompt: string;
183
201
  readonly guidelines?: string;
184
202
  readonly guideline_patterns?: readonly string[];
185
203
  readonly chatPrompt?: ChatPrompt;
186
- readonly attachments?: readonly string[];
204
+ readonly inputFiles?: readonly string[];
187
205
  readonly evalCaseId?: string;
188
206
  readonly attempt?: number;
189
207
  readonly maxOutputTokens?: number;
@@ -242,6 +260,12 @@ interface GeminiResolvedConfig {
242
260
  readonly temperature?: number;
243
261
  readonly maxOutputTokens?: number;
244
262
  }
263
+ interface CodexResolvedConfig {
264
+ readonly executable: string;
265
+ readonly args?: readonly string[];
266
+ readonly cwd?: string;
267
+ readonly timeoutMs?: number;
268
+ }
245
269
  interface MockResolvedConfig {
246
270
  readonly response?: string;
247
271
  readonly delayMs?: number;
@@ -255,6 +279,24 @@ interface VSCodeResolvedConfig {
255
279
  readonly subagentRoot?: string;
256
280
  readonly workspaceTemplate?: string;
257
281
  }
282
+ type CliHealthcheck = {
283
+ readonly type: "http";
284
+ readonly url: string;
285
+ readonly timeoutMs?: number;
286
+ } | {
287
+ readonly type: "command";
288
+ readonly commandTemplate: string;
289
+ readonly timeoutMs?: number;
290
+ readonly cwd?: string;
291
+ };
292
+ interface CliResolvedConfig {
293
+ readonly commandTemplate: string;
294
+ readonly filesFormat?: string;
295
+ readonly cwd?: string;
296
+ readonly env?: Record<string, string>;
297
+ readonly timeoutMs?: number;
298
+ readonly healthcheck?: CliHealthcheck;
299
+ }
258
300
  type ResolvedTarget = {
259
301
  readonly kind: "azure";
260
302
  readonly name: string;
@@ -276,6 +318,13 @@ type ResolvedTarget = {
276
318
  readonly workers?: number;
277
319
  readonly providerBatching?: boolean;
278
320
  readonly config: GeminiResolvedConfig;
321
+ } | {
322
+ readonly kind: "codex";
323
+ readonly name: string;
324
+ readonly judgeTarget?: string;
325
+ readonly workers?: number;
326
+ readonly providerBatching?: boolean;
327
+ readonly config: CodexResolvedConfig;
279
328
  } | {
280
329
  readonly kind: "mock";
281
330
  readonly name: string;
@@ -290,6 +339,13 @@ type ResolvedTarget = {
290
339
  readonly workers?: number;
291
340
  readonly providerBatching?: boolean;
292
341
  readonly config: VSCodeResolvedConfig;
342
+ } | {
343
+ readonly kind: "cli";
344
+ readonly name: string;
345
+ readonly judgeTarget?: string;
346
+ readonly workers?: number;
347
+ readonly providerBatching?: boolean;
348
+ readonly config: CliResolvedConfig;
293
349
  };
294
350
  declare function resolveTargetDefinition(definition: TargetDefinition, env?: EnvLookup): ResolvedTarget;
295
351
 
@@ -317,36 +373,7 @@ declare function ensureVSCodeSubagents(options: EnsureSubagentsOptions): Promise
317
373
  declare function createProvider(target: ResolvedTarget): Provider;
318
374
  declare function resolveAndCreateProvider(definition: TargetDefinition, env?: EnvLookup): Provider;
319
375
 
320
- interface HeuristicScore {
321
- readonly score: number;
322
- readonly hits: readonly string[];
323
- readonly misses: readonly string[];
324
- readonly hitCount: number;
325
- readonly totalAspects: number;
326
- readonly rawAspects: readonly string[];
327
- }
328
- /**
329
- * Extract individual evaluation aspects from the expected assistant response.
330
- */
331
- declare function extractAspects(expectedResponse: string): readonly string[];
332
- /**
333
- * Determine which aspects were covered by the candidate response.
334
- */
335
- declare function calculateHits(candidateResponse: string, expectedAspects: readonly string[]): readonly string[];
336
- /**
337
- * Determine which aspects were not satisfied by the candidate response.
338
- */
339
- declare function calculateMisses(candidateResponse: string, expectedAspects: readonly string[], resolvedHits?: readonly string[]): readonly string[];
340
- /**
341
- * Evaluate the candidate response against the expected aspects.
342
- */
343
- declare function scoreCandidateResponse(candidateResponse: string, expectedAspects: readonly string[]): HeuristicScore;
344
- /**
345
- * Detect common error-prefixed outputs from providers.
346
- */
347
- declare function isErrorLike(text: string | undefined | null): boolean;
348
-
349
- interface GradeContext {
376
+ interface EvaluationContext {
350
377
  readonly evalCase: EvalCase;
351
378
  readonly candidate: string;
352
379
  readonly target: ResolvedTarget;
@@ -355,40 +382,55 @@ interface GradeContext {
355
382
  readonly promptInputs: {
356
383
  readonly request: string;
357
384
  readonly guidelines: string;
385
+ readonly systemMessage?: string;
358
386
  };
359
387
  readonly now: Date;
360
388
  readonly judgeProvider?: Provider;
389
+ readonly systemPrompt?: string;
390
+ readonly evaluator?: EvaluatorConfig;
391
+ readonly judgeModel?: string;
361
392
  }
362
- interface GradeResult {
393
+ interface EvaluationScore {
363
394
  readonly score: number;
364
395
  readonly hits: readonly string[];
365
396
  readonly misses: readonly string[];
366
397
  readonly expectedAspectCount: number;
367
398
  readonly reasoning?: string;
368
399
  readonly rawAspects?: readonly string[];
369
- readonly graderRawRequest?: JsonObject;
400
+ readonly evaluatorRawRequest?: JsonObject;
370
401
  }
371
- interface Grader {
402
+ interface Evaluator {
372
403
  readonly kind: string;
373
- grade(context: GradeContext): Promise<GradeResult> | GradeResult;
374
- }
375
- declare class HeuristicGrader implements Grader {
376
- readonly kind = "heuristic";
377
- grade(context: GradeContext): GradeResult;
404
+ evaluate(context: EvaluationContext): Promise<EvaluationScore> | EvaluationScore;
378
405
  }
379
- type JudgeProviderResolver = (context: GradeContext) => Promise<Provider | undefined>;
380
- interface QualityGraderOptions {
406
+ type JudgeProviderResolver = (context: EvaluationContext) => Promise<Provider | undefined>;
407
+ interface LlmJudgeEvaluatorOptions {
381
408
  readonly resolveJudgeProvider: JudgeProviderResolver;
382
409
  readonly maxOutputTokens?: number;
383
410
  readonly temperature?: number;
411
+ readonly customPrompt?: string;
384
412
  }
385
- declare class QualityGrader implements Grader {
413
+ declare class LlmJudgeEvaluator implements Evaluator {
386
414
  readonly kind = "llm_judge";
387
415
  private readonly resolveJudgeProvider;
388
416
  private readonly maxOutputTokens?;
389
417
  private readonly temperature?;
390
- constructor(options: QualityGraderOptions);
391
- grade(context: GradeContext): Promise<GradeResult>;
418
+ private readonly customPrompt?;
419
+ constructor(options: LlmJudgeEvaluatorOptions);
420
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
421
+ }
422
+ interface CodeEvaluatorOptions {
423
+ readonly script: string;
424
+ readonly cwd?: string;
425
+ readonly agentTimeoutMs?: number;
426
+ }
427
+ declare class CodeEvaluator implements Evaluator {
428
+ readonly kind = "code";
429
+ private readonly script;
430
+ private readonly cwd?;
431
+ private readonly agentTimeoutMs?;
432
+ constructor(options: CodeEvaluatorOptions);
433
+ evaluate(context: EvaluationContext): Promise<EvaluationScore>;
392
434
  }
393
435
 
394
436
  type MaybePromise<T> = T | Promise<T>;
@@ -400,7 +442,9 @@ interface RunEvalCaseOptions {
400
442
  readonly evalCase: EvalCase;
401
443
  readonly provider: Provider;
402
444
  readonly target: ResolvedTarget;
403
- readonly graders: Partial<Record<string, Grader>>;
445
+ readonly evaluators: Partial<Record<string, Evaluator>> & {
446
+ readonly llm_judge: Evaluator;
447
+ };
404
448
  readonly now?: () => Date;
405
449
  readonly maxRetries?: number;
406
450
  readonly agentTimeoutMs?: number;
@@ -425,7 +469,7 @@ interface RunEvaluationOptions {
425
469
  readonly targets?: readonly TargetDefinition[];
426
470
  readonly env?: EnvLookup;
427
471
  readonly providerFactory?: (target: ResolvedTarget) => Provider;
428
- readonly graders?: Partial<Record<string, Grader>>;
472
+ readonly evaluators?: Partial<Record<string, Evaluator>>;
429
473
  readonly maxRetries?: number;
430
474
  readonly agentTimeoutMs?: number;
431
475
  readonly promptDumpDir?: string;
@@ -446,4 +490,4 @@ type AgentKernel = {
446
490
  };
447
491
  declare function createAgentKernel(): AgentKernel;
448
492
 
449
- export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationResult, GRADER_KINDS, type GeminiResolvedConfig, type GradeContext, type GradeResult, type Grader, type GraderKind, HeuristicGrader, type HeuristicScore, type JsonObject, type JsonPrimitive, type JsonValue, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, QualityGrader, type QualityGraderOptions, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, calculateHits, calculateMisses, createAgentKernel, createProvider, ensureVSCodeSubagents, extractAspects, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isErrorLike, isGraderKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation, scoreCandidateResponse };
493
+ export { type AgentKernel, type AnthropicResolvedConfig, type AssistantTestMessage, type AzureResolvedConfig, type CliResolvedConfig, CodeEvaluator, type CodeEvaluatorConfig, type CodeEvaluatorOptions, type EnsureSubagentsOptions, type EnsureSubagentsResult, type EnvLookup, type EvalCase, type EvaluationCache, type EvaluationContext, type EvaluationResult, type EvaluationScore, type Evaluator, type EvaluatorConfig, type EvaluatorKind, type EvaluatorResult, type GeminiResolvedConfig, type JsonObject, type JsonPrimitive, type JsonValue, LlmJudgeEvaluator, type LlmJudgeEvaluatorConfig, type LlmJudgeEvaluatorOptions, type MockResolvedConfig, type ProgressEvent, type Provider, type ProviderKind, type ProviderRequest, type ProviderResponse, type ResolvedTarget, type RunEvalCaseOptions, type RunEvaluationOptions, type SystemTestMessage, TEST_MESSAGE_ROLES, type TargetDefinition, type TestMessage, type TestMessageContent, type TestMessageRole, type ToolTestMessage, type UserTestMessage, type VSCodeResolvedConfig, buildDirectoryChain, buildPromptInputs, buildSearchRoots, createAgentKernel, createProvider, ensureVSCodeSubagents, extractCodeBlocks, fileExists, findGitRoot, getHitCount, isEvaluatorKind, isGuidelineFile, isJsonObject, isJsonValue, isTestMessage, isTestMessageRole, listTargetNames, loadEvalCases, readTargetDefinitions, resolveAndCreateProvider, resolveFileReference, resolveTargetDefinition, runEvalCase, runEvaluation };