@agtlantis/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +198 -0
- package/LICENSE +21 -0
- package/README.md +496 -0
- package/dist/cli.js +4709 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +3998 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2738 -0
- package/dist/index.d.ts +2738 -0
- package/dist/index.js +3868 -0
- package/dist/index.js.map +1 -0
- package/package.json +101 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,2738 @@
|
|
|
1
|
+
import { Provider, ProviderType, ProviderPricing } from '@agtlantis/core';
|
|
2
|
+
export { ANTHROPIC_PRICING, CostResult, DEFAULT_PRICING_CONFIG, FilePromptRepositoryOptions, FileSource, FileSourceBase64, FileSourceData, FileSourceDisplayInfo, FileSourcePath, FileSourceUrl, FileSystem, FoundFileSource, GOOGLE_PRICING, ModelPricing, OPENAI_PRICING, PricingConfig, PromptRepository, PromptTemplate, ResolveOptions, calculateCostFromUsage, compileTemplate, createFilePromptRepository, getFileSourceDisplayInfo, getFileSourcesDisplayInfo, inferMediaType, isFileSource, isFileSourceBase64, isFileSourceData, isFileSourcePath, isFileSourceUrl, resolveFileSource, resolveFileSourcesInInput, scanForFileSources } from '@agtlantis/core';
|
|
3
|
+
export { MockCall, MockProvider, mock } from '@agtlantis/core/testing';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
|
|
6
|
+
/**
 * Simplified token usage type for eval package.
 *
 * This is a subset of AI SDK's LanguageModelUsage that only includes
 * the properties eval actually tracks. The cost-helpers module handles
 * conversion when calling @agtlantis/core's pricing calculator.
 *
 * @example
 * ```typescript
 * const usage: EvalTokenUsage = {
 *   inputTokens: 100,
 *   outputTokens: 50,
 *   totalTokens: 150,
 * }
 * ```
 */
interface EvalTokenUsage {
    /** Number of input (prompt) tokens */
    inputTokens: number;
    /** Number of output (completion) tokens */
    outputTokens: number;
    /** Total tokens (input + output) */
    totalTokens: number;
}
|
|
30
|
+
/**
 * Simplified agent configuration for evaluation.
 * Only requires fields needed for eval purposes.
 *
 * For agents from `ai-agents` package with full AgentConfig,
 * use `toEvalAgent()` adapter to convert them.
 */
interface EvalAgentConfig {
    /** Agent name for identification */
    name: string;
    /** Agent description (used by Judge for context) */
    description?: string;
    /** Additional custom fields */
    [key: string]: unknown;
}
/**
 * Agent prompt template.
 * Pairs a versioned system prompt with a function that renders the user
 * prompt from a typed input.
 */
interface AgentPrompt<TInput> {
    /** Prompt unique ID for version tracking */
    id: string;
    /** Version string (e.g., "1.0.0") */
    version: string;
    /** System prompt */
    system: string;
    /** User template string (for serialization/history) */
    userTemplate?: string;
    /** User prompt builder function */
    renderUserPrompt: (input: TInput) => string;
    /** Additional custom fields */
    [key: string]: unknown;
}
|
|
62
|
+
/**
 * Base metadata type shared by all LLM-using components (Agent, Judge, Improver).
 * Provides consistent structure for tracking token usage and model information.
 *
 * @example
 * ```typescript
 * const metadata: ComponentMetadata = {
 *   tokenUsage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
 *   model: 'gpt-4o',
 * }
 * ```
 */
interface ComponentMetadata {
    /** Token usage from the LLM call (AI SDK LanguageModelUsage format) */
    tokenUsage?: EvalTokenUsage;
    /** Model identifier used for the LLM call */
    model?: string;
    /** Additional custom fields */
    [key: string]: unknown;
}
/**
 * Agent execution result metadata.
 * Extends ComponentMetadata with agent-specific fields.
 */
interface AgentMetadata extends ComponentMetadata {
    /** Prompt version used for execution */
    promptVersion?: string;
    /** Execution duration in milliseconds */
    duration?: number;
}
/**
 * Judge evaluation metadata.
 * Tracks token usage and model for cost calculation.
 * (Intentionally adds no fields beyond ComponentMetadata; the distinct name
 * keeps Judge results self-describing.)
 */
interface JudgeMetadata extends ComponentMetadata {
}
/**
 * Improver analysis metadata.
 * Tracks token usage and model for cost calculation.
 * (Intentionally adds no fields beyond ComponentMetadata; the distinct name
 * keeps Improver results self-describing.)
 */
interface ImproverMetadata extends ComponentMetadata {
}
/**
 * Agent execution result.
 */
interface AgentResult<TOutput> {
    /** The agent's typed output */
    result: TOutput;
    /** Optional execution metadata (token usage, model, duration, ...) */
    metadata?: AgentMetadata;
}
|
|
111
|
+
/**
 * Simplified Agent interface for evaluation.
 *
 * @example
 * ```typescript
 * // Direct implementation
 * const myAgent: EvalAgent<string, string> = {
 *   config: { name: 'MyAgent', description: 'A simple agent' },
 *   prompt: { id: 'prompt-1', version: '1.0.0', system: '...', renderUserPrompt: (input) => input },
 *   execute: async (input) => ({ result: `Processed: ${input}` })
 * }
 *
 * // Or adapt from full ai-agents Agent
 * const evalAgent = toEvalAgent(fullAgent)
 * ```
 */
interface EvalAgent<TInput, TOutput> {
    /** Agent configuration (name, description, custom fields) */
    readonly config: EvalAgentConfig;
    /** Versioned prompt used by the agent */
    readonly prompt: AgentPrompt<TInput>;
    /** Run the agent on one input and return its result plus optional metadata */
    execute(input: TInput, options?: unknown): Promise<AgentResult<TOutput>>;
}
/**
 * Full AgentConfig interface (compatible with ai-agents package).
 * Used for type-safe adaptation.
 */
interface FullAgentConfig {
    name: string;
    role: 'generator' | 'analyzer' | 'validator' | 'enhancer';
    streaming: 'required' | 'optional' | 'none';
    execution: 'batch' | 'realtime';
    conversation?: 'single-turn' | 'multi-turn';
    description?: string;
    [key: string]: unknown;
}
/**
 * Full Agent interface (compatible with ai-agents package).
 * Used for type-safe adaptation. Unlike EvalAgent, execute() here always
 * returns metadata with required duration and promptVersion.
 */
interface FullAgent<TInput, TOutput> {
    readonly config: FullAgentConfig;
    readonly prompt: AgentPrompt<TInput>;
    execute(input: TInput, options?: unknown): Promise<{
        result: TOutput;
        metadata: {
            duration: number;
            promptVersion: string;
            tokenUsage?: EvalTokenUsage;
            model?: string;
            retryCount?: number;
            traceId?: string;
            [key: string]: unknown;
        };
    }>;
}
/**
 * Adapts a full Agent (from ai-agents) to EvalAgent for evaluation.
 * Extracts only the fields needed for evaluation.
 *
 * @example
 * ```typescript
 * import { scenarioGenerator } from './agents/mce'
 *
 * const evalAgent = toEvalAgent(scenarioGenerator)
 * const suite = createEvalSuite({ agent: evalAgent, ... })
 * ```
 */
declare function toEvalAgent<TInput, TOutput>(agent: FullAgent<TInput, TOutput>): EvalAgent<TInput, TOutput>;
|
|
178
|
+
/**
 * Metadata for file content.
 */
interface FileContentMetadata {
    /** File size in bytes */
    size?: number;
    /** Full resolved path (for loaded files) */
    fullPath?: string;
    /** Whether the content was created inline (not from disk) */
    inline?: boolean;
    /** Additional custom metadata */
    [key: string]: unknown;
}
/**
 * A text file's path and contents, used as evaluation context.
 */
interface FileContent {
    /** File path (relative or absolute) - used as identifier */
    path: string;
    /** File content as string (text files only for Phase 5.3) */
    content: string;
    /** Optional MIME type hint (defaults to 'text/plain') */
    mediaType?: string;
    /** Optional encoding (defaults to 'utf-8') */
    encoding?: BufferEncoding;
    /** Optional metadata (e.g., original size, full path, etc.) */
    metadata?: FileContentMetadata;
}
/**
 * A single evaluation test case with a typed input.
 */
interface TestCase<TInput> {
    /** Optional stable identifier for the test case */
    id?: string;
    /** Input passed to the agent under test */
    input: TInput;
    /** Optional tags for filtering/grouping */
    tags?: string[];
    /** Human-readable description of the case */
    description?: string;
    /** Optional expected output, available to the judge for comparison */
    expectedOutput?: unknown;
    /**
     * Optional file context for agent and judge (Phase 5.3).
     * Files are passed to Judge for evaluation context.
     * For Agent access, include files in the input type directly.
     *
     * @deprecated Use FileSource in input directly for flexible file handling
     */
    files?: FileContent[];
}
|
|
218
|
+
/**
 * Per-test performance metrics.
 */
interface MetricsResult {
    /** Latency in milliseconds */
    latencyMs: number;
    /** Token usage for the call */
    tokenUsage: EvalTokenUsage;
}
/**
 * A single evaluation criterion the judge scores against.
 */
interface Criterion {
    /** Unique criterion identifier (referenced by Verdict.criterionId) */
    id: string;
    /** Short display name */
    name: string;
    /** What the criterion measures (given to the judge) */
    description: string;
    /** Optional relative weight of this criterion */
    weight?: number;
}
/**
 * Zod error issue - minimal type compatible with ZodError.errors.
 * Using `readonly` and rest index to be compatible with Zod's discriminated union.
 */
type ZodIssue = {
    readonly code: string;
    readonly path: readonly (string | number)[];
    readonly message: string;
};
/**
 * Result of programmatic schema validation.
 */
interface SchemaValidationResult {
    /** Whether the output matches the schema */
    valid: boolean;
    /** Validation errors if invalid (Zod issue format) */
    errors?: readonly ZodIssue[];
    /** Human-readable error summary */
    errorSummary?: string;
}
/**
 * Validator function type for programmatic validation.
 * Returns validation result with binary pass/fail outcome.
 */
type ValidatorFn = (output: unknown) => SchemaValidationResult;
/**
 * Extended criterion with optional programmatic validator.
 * Validators run before LLM evaluation with binary scoring (0 or 100).
 *
 * @example
 * ```typescript
 * import { z } from 'zod'
 * import { schema } from '@agtlantis/eval'
 *
 * const criterion = schema({
 *   schema: z.object({ name: z.string() }),
 *   weight: 2,
 * })
 * ```
 */
interface ValidatorCriterion extends Criterion {
    /**
     * Optional programmatic validator.
     * If provided and fails, score is automatically 0.
     * If provided and passes, score is automatically 100.
     */
    validator?: ValidatorFn;
}
/**
 * The judge's scored outcome for one criterion.
 */
interface Verdict {
    /** ID of the criterion this verdict applies to */
    criterionId: string;
    /** Score awarded for the criterion */
    score: number;
    /** Judge's explanation for the score */
    reasoning: string;
    /** Whether this criterion was considered passed */
    passed: boolean;
}
|
|
282
|
+
/**
 * Raw result of executing one test case (before judging).
 */
interface TestResult<TInput, TOutput> {
    /** The test case that was executed */
    testCase: TestCase<TInput>;
    /** Agent output for the case */
    output: TOutput;
    /** Latency and token metrics for the run */
    metrics: MetricsResult;
    /** Execution error, if the agent call failed */
    error?: Error;
}
/**
 * Test result augmented with the judge's verdicts and aggregate score.
 */
interface TestResultWithVerdict<TInput, TOutput> extends TestResult<TInput, TOutput> {
    /** Per-criterion verdicts from the judge */
    verdicts: Verdict[];
    /** Aggregate score across criteria */
    overallScore: number;
    /** Whether the test passed overall */
    passed: boolean;
    /** Judge metadata for cost tracking */
    judgeMetadata?: JudgeMetadata;
}
/**
 * Statistics from running the same test multiple times.
 * Used to measure consistency and reliability of LLM-based agents.
 */
interface IterationStats {
    /** Total number of iterations run */
    iterations: number;
    /** Score from each iteration */
    scores: number[];
    /** Average score across all iterations */
    mean: number;
    /** Standard deviation (lower = more consistent) */
    stdDev: number;
    /** Lowest score achieved */
    min: number;
    /** Highest score achieved */
    max: number;
    /** Pass rate as decimal (0-1, e.g., 0.67 = 67%) */
    passRate: number;
    /** Number of iterations that passed */
    passCount: number;
}
/**
 * Extended iteration statistics for multi-turn tests.
 * Includes turn-count metrics and termination type distribution.
 *
 * @example
 * ```typescript
 * if (hasMultiTurnIterationData(result)) {
 *   console.log(`Average turns: ${result.multiTurnIterationStats.avgTurns}`)
 *   console.log(`Termination types: ${JSON.stringify(result.multiTurnIterationStats.terminationCounts)}`)
 * }
 * ```
 */
interface MultiTurnIterationStats extends IterationStats {
    /** Average number of turns across all iterations */
    avgTurns: number;
    /** Minimum turns in any iteration */
    minTurns: number;
    /** Maximum turns in any iteration */
    maxTurns: number;
    /** Distribution of termination types across iterations (e.g., { condition: 2, maxTurns: 1 }) */
    terminationCounts: Record<string, number>;
}
|
|
339
|
+
/**
 * Discriminator for eval result types.
 * Used for exhaustive pattern matching on result variants.
 */
type EvalResultKind = 'single-turn' | 'single-turn-iterated' | 'multi-turn' | 'multi-turn-iterated';
/**
 * Properties present when test ran with multiple iterations.
 * Extracted as a separate interface for composition.
 */
interface IterationData<TInput, TOutput> {
    /** Aggregated statistics across all iterations */
    iterationStats: IterationStats;
    /** Individual results from each iteration */
    iterationResults: TestResultWithVerdict<TInput, TOutput>[];
}
/**
 * Single conversation entry in multi-turn tests.
 */
interface ConversationEntry<TInput, TOutput> {
    /** Turn number (1-based) */
    turn: number;
    /** Input provided for this turn */
    input: TInput;
    /** Output from agent (undefined if execution failed) */
    output: TOutput | undefined;
    /** Agent execution metadata */
    metadata?: AgentMetadata;
}
/**
 * Termination info for multi-turn tests.
 * Compatible with TerminationCheckResult from multi-turn module.
 */
interface TerminationInfo {
    /** Whether the conversation terminated */
    terminated: boolean;
    /** Human-readable reason for termination */
    reason: string;
    /** Type of termination (condition, maxTurns, error, exhausted) */
    terminationType?: string;
    /** The condition that caused termination (if applicable) */
    matchedCondition?: unknown;
}
/**
 * Properties present for multi-turn test results.
 * Extracted as a separate interface for composition.
 */
interface MultiTurnData<TInput, TOutput> {
    /** Full conversation history */
    conversationHistory: ConversationEntry<TInput, TOutput>[];
    /** Total turns executed */
    totalTurns: number;
    /** Human-readable termination reason */
    terminationReason: string;
    /** Full termination check result */
    termination: TerminationInfo;
}
|
|
395
|
+
/**
 * Single-turn test result with single iteration (base case).
 * No iteration stats, no multi-turn data.
 */
interface SingleTurnResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput> {
    readonly kind: 'single-turn';
}
/**
 * Single-turn test result with multiple iterations.
 * Has iteration stats but no multi-turn data.
 */
interface SingleTurnIteratedResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput>, IterationData<TInput, TOutput> {
    readonly kind: 'single-turn-iterated';
}
/**
 * Multi-turn test result with single iteration.
 * Has multi-turn data but no iteration stats.
 */
interface MultiTurnResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput>, MultiTurnData<TInput, TOutput> {
    readonly kind: 'multi-turn';
}
/**
 * Multi-turn test result with multiple iterations.
 * Has both multi-turn data and iteration stats.
 */
interface MultiTurnIteratedResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput>, IterationData<TInput, TOutput>, MultiTurnData<TInput, TOutput> {
    readonly kind: 'multi-turn-iterated';
    /** Multi-turn specific iteration statistics */
    multiTurnIterationStats: MultiTurnIterationStats;
}
/**
 * Unified eval result type - discriminated union of all result kinds.
 *
 * Use pattern matching on `kind` for exhaustive handling:
 * @example
 * ```typescript
 * switch (result.kind) {
 *   case 'single-turn':
 *     // No iteration stats, no multi-turn data
 *     break
 *   case 'single-turn-iterated':
 *     console.log(result.iterationStats.mean) // Type-safe
 *     break
 *   case 'multi-turn':
 *     console.log(result.conversationHistory) // Type-safe
 *     break
 *   case 'multi-turn-iterated':
 *     console.log(result.multiTurnIterationStats.avgTurns) // Type-safe
 *     break
 * }
 * ```
 */
type EvalTestResult<TInput, TOutput> = SingleTurnResult<TInput, TOutput> | SingleTurnIteratedResult<TInput, TOutput> | MultiTurnResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>;
|
|
448
|
+
/**
 * Check if result is from a single-turn test (either iterated or not).
 * Type guard narrowing the EvalTestResult union via its `kind` discriminant.
 *
 * @example
 * ```typescript
 * if (isSingleTurnResult(result)) {
 *   // result is SingleTurnResult | SingleTurnIteratedResult
 *   console.log('Single turn test')
 * }
 * ```
 */
declare function isSingleTurnResult<TInput, TOutput>(result: EvalTestResult<TInput, TOutput>): result is SingleTurnResult<TInput, TOutput> | SingleTurnIteratedResult<TInput, TOutput>;
/**
 * Check if result is from a multi-turn test (either iterated or not).
 * Type guard narrowing the EvalTestResult union via its `kind` discriminant.
 *
 * @example
 * ```typescript
 * if (isMultiTurnResult(result)) {
 *   // result is MultiTurnResult | MultiTurnIteratedResult
 *   console.log(`Turns: ${result.totalTurns}`) // Type-safe
 *   for (const entry of result.conversationHistory) { // Type-safe
 *     console.log(`Turn ${entry.turn}: ${entry.input}`)
 *   }
 * }
 * ```
 */
declare function isMultiTurnResult<TInput, TOutput>(result: EvalTestResult<TInput, TOutput>): result is MultiTurnResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>;
/**
 * Check if result has iteration data (multiple iterations ran).
 * Type guard narrowing the EvalTestResult union via its `kind` discriminant.
 *
 * @example
 * ```typescript
 * if (isIteratedResult(result)) {
 *   // result is SingleTurnIteratedResult | MultiTurnIteratedResult
 *   console.log(`Mean score: ${result.iterationStats.mean}`) // Type-safe
 *   console.log(`Pass rate: ${result.iterationStats.passRate}`) // Type-safe
 * }
 * ```
 */
declare function isIteratedResult<TInput, TOutput>(result: EvalTestResult<TInput, TOutput>): result is SingleTurnIteratedResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>;
|
|
488
|
+
|
|
489
|
+
/**
 * Context passed to JudgePrompt.renderUserPrompt().
 */
interface JudgeContext {
    /** Description of the agent under evaluation */
    agentDescription: string;
    /** Input that was given to the agent */
    input: unknown;
    /** Output produced by the agent */
    output: unknown;
    /** Criteria the judge must score */
    criteria: Criterion[];
    /** Optional file context for evaluation */
    files?: FileContent[];
}
/**
 * Context for evaluating agent output.
 *
 * @example
 * ```typescript
 * const result = await judge.evaluate({
 *   input: { query: 'Hello' },
 *   output: { response: 'Hi there!' },
 *   agentDescription: 'A friendly chatbot',
 *   files: [{ path: 'context.md', content: '...' }],
 * })
 * ```
 */
interface EvalContext {
    input: unknown;
    output: unknown;
    agentDescription: string;
    files?: FileContent[];
}
/**
 * Outcome of a judge evaluation.
 */
interface JudgeResult {
    /** Per-criterion verdicts */
    verdicts: Verdict[];
    /** Aggregate score across criteria */
    overallScore: number;
    /** Whether the evaluation passed overall */
    passed: boolean;
    /** Token usage / model metadata for cost tracking */
    metadata?: JudgeMetadata;
}
/**
 * Versioned prompt used by the judge LLM.
 */
interface JudgePrompt {
    id: string;
    version: string;
    system: string;
    renderUserPrompt: (context: JudgeContext) => string;
}
/**
 * Configuration for creating a Judge.
 */
interface JudgeConfig {
    /** LLM provider used for judging */
    provider: Provider;
    /** Optional custom judge prompt (a default is used otherwise — TODO confirm) */
    prompt?: JudgePrompt;
    /** Criteria to score against */
    criteria: Criterion[];
    /** Minimum overall score to count as passed — TODO confirm semantics */
    passThreshold?: number;
    /** Model name for cost tracking (e.g., 'gpt-4o', 'gemini-2.5-flash') */
    model?: string;
}
/**
 * LLM-as-Judge evaluator interface.
 *
 * @example
 * ```typescript
 * const judge = createJudge({ llm, prompt, criteria })
 *
 * const result = await judge.evaluate({
 *   input: { query: 'What is 2+2?' },
 *   output: { answer: '4' },
 *   agentDescription: 'A math tutor agent',
 *   files: [{ path: 'reference.md', content: '...' }],
 * })
 * ```
 */
interface Judge {
    evaluate(context: EvalContext): Promise<JudgeResult>;
}
|
|
556
|
+
|
|
557
|
+
/**
 * Metrics aggregated across all test results.
 */
interface AggregatedMetrics {
    /** Average latency per test in milliseconds */
    avgLatencyMs: number;
    /** Total tokens consumed across all tests */
    totalTokens: number;
    /** Estimated total cost, when pricing is configured */
    totalEstimatedCost?: number;
}
/**
 * Pure data interface - use utility functions for operations.
 *
 * @example
 * ```typescript
 * for (const suggestion of report.suggestions) {
 *   console.log(suggestionDiff(suggestion))
 *   console.log(suggestionPreview(suggestion))
 *   suggestion.approved = true
 * }
 *
 * const newPrompt = applyPromptSuggestions(agent.prompt, report.suggestions)
 * ```
 */
interface Suggestion {
    type: 'system_prompt' | 'user_prompt' | 'parameters';
    priority: 'high' | 'medium' | 'low';
    currentValue: string;
    suggestedValue: string;
    reasoning: string;
    expectedImprovement: string;
    approved?: boolean;
    modified?: boolean;
}
/**
 * Result of an Improver analysis run.
 */
interface ImproveResult {
    /** Proposed prompt/parameter changes */
    suggestions: Suggestion[];
    /** Token usage / model metadata for cost tracking */
    metadata?: ImproverMetadata;
}
/**
 * Context passed to ImproverPrompt.renderUserPrompt().
 * NOTE(review): uses `any` type arguments; `unknown` would be stricter but
 * would be a breaking change for existing callers.
 */
interface ImproverContext {
    agentPrompt: AgentPrompt<any>;
    evaluatedResults: EvalTestResult<any, any>[];
    aggregatedMetrics: AggregatedMetrics;
}
/**
 * Versioned prompt used by the improver LLM.
 */
interface ImproverPrompt {
    id: string;
    version: string;
    system: string;
    renderUserPrompt: (context: ImproverContext) => string;
}
/**
 * Configuration for creating an Improver.
 */
interface ImproverConfig {
    /** LLM provider used for improvement analysis */
    provider: Provider;
    /** Optional custom improver prompt */
    prompt?: ImproverPrompt;
    /** Model name for cost tracking (e.g., 'gpt-4o', 'gemini-2.5-flash') */
    model?: string;
}
/**
 * Analyzes evaluated results and suggests prompt improvements.
 */
interface Improver {
    improve(agentPrompt: AgentPrompt<any>, results: EvalTestResult<any, any>[]): Promise<ImproveResult>;
}
|
|
610
|
+
|
|
611
|
+
/** Cost breakdown by component (Agent, Judge, Improver) */
interface CostBreakdown {
    agent?: number;
    judge?: number;
    improver?: number;
    total?: number;
}
/** Cost summary aggregated across all test results */
interface CostSummary {
    total: number;
    byComponent: {
        agent: number;
        judge: number;
        improver?: number;
    };
}
/** Per-test metrics including a computed cost breakdown */
interface MetricsWithCost {
    latencyMs: number;
    tokenUsage: EvalTokenUsage;
    costBreakdown: CostBreakdown;
}
/** Test result with cost breakdown, returned by addCostsToResults() */
interface TestResultWithCost<TInput, TOutput> {
    testCase: {
        id?: string;
        input: TInput;
        tags?: string[];
        description?: string;
        expectedOutput?: unknown;
    };
    output: TOutput;
    metrics: MetricsWithCost;
    error?: Error;
    verdicts: Array<{
        criterionId: string;
        score: number;
        reasoning: string;
        passed: boolean;
    }>;
    overallScore: number;
    passed: boolean;
}
/** Pricing configuration for eval */
interface EvalPricingConfig {
    /** Provider-specific pricing overrides. Key is provider name (e.g., 'google', 'openai'), value is model pricing. */
    providerPricing?: Partial<Record<ProviderType, ProviderPricing>>;
}
/** Minimal result interface compatible with TestResultWithVerdict and TestResultWithIteration */
interface ResultForCostCalculation<TInput, TOutput> {
    testCase: {
        id?: string;
        input: TInput;
        tags?: string[];
        description?: string;
        expectedOutput?: unknown;
    };
    output: TOutput;
    metrics: {
        latencyMs: number;
        tokenUsage: EvalTokenUsage;
    };
    error?: Error;
    verdicts: Array<{
        criterionId: string;
        score: number;
        reasoning: string;
        passed: boolean;
    }>;
    overallScore: number;
    passed: boolean;
    /** Agent token usage/model info used to price the agent's share of the cost */
    agentMetadata?: {
        tokenUsage?: EvalTokenUsage;
        model?: string;
        provider?: string;
    };
    /** Judge token usage/model info used to price the judge's share of the cost */
    judgeMetadata?: {
        tokenUsage?: EvalTokenUsage;
        model?: string;
        provider?: string;
    };
}
/** Minimal report shape accepted by calculateReportCosts() */
interface ReportForCostCalculation<TInput, TOutput> {
    results: ResultForCostCalculation<TInput, TOutput>[];
}
/** Compute the per-component cost breakdown for one result */
declare function calculateResultCost<TInput, TOutput>(result: ResultForCostCalculation<TInput, TOutput>, config?: EvalPricingConfig): CostBreakdown;
/** Aggregate component costs across all results in a report */
declare function calculateReportCosts<TInput, TOutput>(report: ReportForCostCalculation<TInput, TOutput>, config?: EvalPricingConfig): CostSummary;
/** Add cost breakdown to each result. Returns new array (does not mutate original). */
declare function addCostsToResults<TInput, TOutput>(results: ResultForCostCalculation<TInput, TOutput>[], config?: EvalPricingConfig): TestResultWithCost<TInput, TOutput>[];
|
|
699
|
+
|
|
700
|
+
/**
 * Reporter interface for saving/logging evaluation reports.
 *
 * @example
 * ```typescript
 * const reporter = createJsonReporter('./reports')
 * reporter.save(report, 'my-test') // → ./reports/my-test-1736691234567.json
 * ```
 */
interface Reporter<TInput = unknown, TOutput = unknown> {
    /** Save report to file, returns file path (optional - not all reporters save files) */
    save?(report: EvalReport<TInput, TOutput>, name: string): string;
    /** Log report to console (optional) */
    log?(report: EvalReport<TInput, TOutput>): void;
}
/**
 * Common options for file-based reporters.
 */
interface FileReporterOptions {
    /** Output directory (created if missing) */
    outputDir: string;
    /** Pricing config for cost calculation */
    pricing?: EvalPricingConfig;
    /** Add timestamp to filename (default: true) */
    addTimestamp?: boolean;
}
/**
 * Verbosity level for console output.
 */
type LogVerbosity = 'summary' | 'detailed' | 'full';
/**
 * Options for ConsoleReporter.
 */
interface ConsoleReporterOptions {
    /** Verbosity level (default: 'summary') */
    verbosity?: LogVerbosity;
    /** Pricing config for cost display */
    pricing?: EvalPricingConfig;
}
/**
 * Top-level summary statistics for an evaluation run.
 */
interface ReportSummary {
    /** Number of test cases executed */
    totalTests: number;
    /** Number of tests that passed */
    passed: number;
    /** Number of tests that failed */
    failed: number;
    /** Average overall score across tests */
    avgScore: number;
    /** Aggregated performance metrics */
    metrics: AggregatedMetrics;
    /** Number of iterations run per test case (only present when iterations > 1) */
    iterations?: number;
    /** Average standard deviation across all tests */
    avgStdDev?: number;
    /** Average pass rate across all tests */
    avgPassRate?: number;
    /** Cost summary (set by CLI or manually via calculateReportCosts) */
    costSummary?: CostSummary;
}
/**
 * Evaluation report data.
 * Pure data interface - use utility functions for operations.
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases)
 *
 * // Convert to markdown
 * const markdown = reportToMarkdown(report)
 *
 * // Save to file
 * await saveReportMarkdown(report, './reports/eval-report.md')
 * ```
 */
interface EvalReport<TInput, TOutput> {
    summary: ReportSummary;
    /** Results - may include iteration stats when iterations > 1 */
    results: EvalTestResult<TInput, TOutput>[];
    /** Improvement suggestions produced for this run */
    suggestions: Suggestion[];
    /** When the report was generated */
    generatedAt: Date;
    /** Agent prompt version the run was evaluated against */
    promptVersion: string;
}
/**
 * Options for markdown report generation.
 */
interface ReportMarkdownOptions {
    /** Include passed test details (default: false, collapsed) */
    expandPassedTests?: boolean;
    /** Include raw JSON output (default: false) */
    includeRawOutput?: boolean;
    /** Max length for output preview (default: 200) */
    outputPreviewLength?: number;
}
|
|
788
|
+
/**
 * Result of comparing two evaluation reports.
 * Useful for tracking improvements across prompt versions.
 *
 * @example
 * ```typescript
 * const beforeReport = await suite.run(testCases)
 * const afterReport = await suite.withAgent(improvedAgent).run(testCases)
 * const comparison = compareReports(beforeReport, afterReport)
 *
 * console.log(`Score delta: ${comparison.scoreDelta}`)
 * console.log(`Improved tests: ${comparison.improved.join(', ')}`)
 * ```
 */
interface ReportComparison {
    /** Change in average score (positive = improvement) */
    scoreDelta: number;
    /** Change in pass rate (positive = improvement) */
    passRateDelta: number;
    /** Changes in performance metrics */
    metricsDelta: {
        /** Change in average latency (ms); negative means faster */
        latencyMs: number;
        /** Change in total token usage; negative means cheaper */
        tokenUsage: number;
    };
    /** Test IDs that improved (score increased) */
    improved: string[];
    /** Test IDs that regressed (score decreased) */
    regressed: string[];
    /** Test IDs that were removed (in before but not in after) */
    removed: string[];
}
|
|
821
|
+
|
|
822
|
+
/**
 * Options for running test cases.
 */
interface RunOptions {
    /** Maximum number of concurrent test case executions. Defaults to 1 (sequential). */
    concurrency?: number;
    /** Stop execution after the first test failure. Defaults to false. */
    stopOnFirstFailure?: boolean;
    /** AbortSignal for cancelling execution */
    signal?: AbortSignal;
    /**
     * Number of times to run each test case. Defaults to 1.
     * When > 1, results include iteration statistics (mean, stdDev, passRate).
     */
    iterations?: number;
}
/**
 * Context required for executing a single test case.
 * @internal
 */
interface ExecuteContext<TInput, TOutput> {
    /** The agent under evaluation */
    agent: EvalAgent<TInput, TOutput>;
    /** Judge used to score the agent's output */
    judge: Judge;
    /** Human-readable description of the agent, given to the Judge */
    agentDescription: string;
}
|
|
847
|
+
/**
 * Executes a single test case and returns the result with verdict.
 *
 * Flow:
 * 1. Execute agent with test input
 * 2. Measure execution latency
 * 3. Collect token usage from agent metadata
 * 4. Evaluate output using Judge
 * 5. Return combined result with verdicts
 *
 * @param testCase - The test case to execute
 * @param context - Agent, judge, and agent description used for the run
 * @param signal - Optional AbortSignal for cancellation
 * @returns The single-turn result including score, pass/fail, and verdicts
 *
 * @example
 * ```typescript
 * const result = await executeTestCase(
 *   { id: 'test-1', input: { query: 'Hello' } },
 *   { agent: myAgent, judge: myJudge, agentDescription: 'A friendly bot' }
 * )
 *
 * console.log(result.passed) // true/false
 * console.log(result.overallScore) // 0-100
 * console.log(result.verdicts) // Verdict[]
 * ```
 */
declare function executeTestCase<TInput, TOutput>(testCase: TestCase<TInput>, context: ExecuteContext<TInput, TOutput>, signal?: AbortSignal): Promise<SingleTurnResult<TInput, TOutput>>;
/**
 * Runs multiple test cases with configurable concurrency.
 *
 * Features:
 * - Parallel execution with concurrency limit
 * - Stop on first failure option
 * - AbortSignal support for cancellation
 *
 * @param testCases - Test cases to run
 * @param context - Agent, judge, and agent description used for each run
 * @param options - Concurrency, stop-on-failure, signal, and iteration options
 * @returns One result per executed test case
 *
 * @example
 * ```typescript
 * const results = await runWithConcurrency(
 *   testCases,
 *   { agent: myAgent, judge: myJudge, agentDescription: 'Test agent' },
 *   { concurrency: 5, stopOnFirstFailure: false }
 * )
 *
 * console.log(`Passed: ${results.filter(r => r.passed).length}`)
 * console.log(`Failed: ${results.filter(r => !r.passed).length}`)
 * ```
 */
declare function runWithConcurrency<TInput, TOutput>(testCases: TestCase<TInput>[], context: ExecuteContext<TInput, TOutput>, options?: RunOptions): Promise<EvalTestResult<TInput, TOutput>[]>;
|
|
891
|
+
|
|
892
|
+
/**
 * Configuration for creating an EvalSuite.
 *
 * @example
 * ```typescript
 * const suite = createEvalSuite({
 *   agent: myAgent,
 *   judge: myJudge,
 *   agentDescription: 'Recommends career paths based on student profiles',
 * })
 * ```
 */
interface EvalSuiteConfig<TInput, TOutput> {
    /** The agent to evaluate */
    agent: EvalAgent<TInput, TOutput>;
    /** Human-readable description of what the agent does (used by Judge) */
    agentDescription?: string;
    /** Judge instance for evaluating agent outputs */
    judge: Judge;
    /** Improver instance for generating prompt improvement suggestions (optional) */
    improver?: Improver;
}
/**
 * Evaluation suite for running test cases against an agent.
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases, { concurrency: 3 })
 * console.log(reportToMarkdown(report))
 *
 * // Test with a different agent
 * const newReport = await suite.withAgent(improvedAgent).run(testCases)
 * ```
 */
interface EvalSuite<TInput, TOutput> {
    /**
     * Run test cases and generate an evaluation report.
     *
     * @param testCases - Test cases to run
     * @param options - Run options (concurrency, stopOnFirstFailure, signal)
     * @returns Evaluation report with results, summary, and suggestions
     */
    run(testCases: TestCase<TInput>[], options?: RunOptions): Promise<EvalReport<TInput, TOutput>>;
    /**
     * Create a new suite with a different agent.
     * Useful for A/B testing or testing prompt improvements.
     *
     * @param agent - New agent to use
     * @returns New EvalSuite instance with the updated agent
     */
    withAgent(agent: EvalAgent<TInput, TOutput>): EvalSuite<TInput, TOutput>;
}
/**
 * Create an evaluation suite for testing an agent.
 *
 * The suite orchestrates test execution, evaluation, and optional
 * prompt improvement suggestions.
 *
 * @param config - Agent, judge, and optional improver/description
 * @returns An EvalSuite bound to the given agent and judge
 *
 * @example
 * ```typescript
 * const suite = createEvalSuite({
 *   agent: scenarioGenerator,
 *   agentDescription: 'Recommends majors based on student profiles',
 *   judge: createJudge({
 *     llm: openaiClient,
 *     prompt: defaultJudgePrompt,
 *     criteria: [accuracy(), relevance()],
 *   }),
 * })
 *
 * const report = await suite.run(testCases, { concurrency: 3 })
 * ```
 */
declare function createEvalSuite<TInput, TOutput>(config: EvalSuiteConfig<TInput, TOutput>): EvalSuite<TInput, TOutput>;
|
|
966
|
+
|
|
967
|
+
/**
 * Iteration statistics utilities for repeated test execution.
 *
 * These functions aggregate results from running the same test multiple times,
 * providing statistical metrics like mean, standard deviation, and pass rate.
 */

/**
 * Calculate iteration statistics from multiple test results.
 *
 * @param results - Results from running the same test multiple times
 * @returns Aggregated statistics including mean, stdDev, and passRate
 *
 * @example
 * ```typescript
 * const stats = calculateIterationStats([
 *   { overallScore: 85, passed: true, ... },
 *   { overallScore: 90, passed: true, ... },
 *   { overallScore: 80, passed: true, ... },
 * ])
 * // stats.mean = 85
 * // stats.stdDev ≈ 4.08
 * // stats.passRate = 1.0
 * ```
 */
declare function calculateIterationStats(results: TestResultWithVerdict<unknown, unknown>[]): IterationStats;
/**
 * Calculate multi-turn specific iteration statistics.
 *
 * Extends base iteration stats with turn counts and termination type distribution.
 * Used when aggregating multiple iterations of multi-turn tests.
 *
 * @param results - Results from running the same multi-turn test multiple times
 * @returns Extended statistics including avgTurns, min/max turns, and terminationCounts
 *
 * @example
 * ```typescript
 * const stats = calculateMultiTurnIterationStats(results)
 * // stats.avgTurns = 4.2
 * // stats.minTurns = 3
 * // stats.maxTurns = 6
 * // stats.terminationCounts = { condition: 2, maxTurns: 1 }
 * ```
 */
declare function calculateMultiTurnIterationStats<TInput, TOutput>(results: (MultiTurnResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>)[]): MultiTurnIterationStats;
/**
 * Select the result closest to the mean score.
 * Used to pick a "representative" result for displaying verdicts/reasoning.
 *
 * The function preserves the full type of the input array, so if you pass
 * `TestResultWithIteration[]`, you get back `TestResultWithIteration`.
 *
 * @param results - Array of results to choose from (must not be empty)
 * @param mean - The mean score to compare against
 * @returns The result with overallScore closest to mean
 * @throws Error if results array is empty
 */
declare function selectRepresentativeResult<TInput, TOutput, T extends TestResultWithVerdict<TInput, TOutput> = TestResultWithVerdict<TInput, TOutput>>(results: T[], mean: number): T;
/**
 * Aggregate results from multiple iteration runs into iterated result types.
 *
 * Takes N arrays of results (one per iteration) and groups them by test case,
 * calculating iteration statistics for each test case.
 *
 * For multi-turn tests, returns MultiTurnIteratedResult with multi-turn specific
 * statistics like average turns, min/max turns, and termination type distribution.
 *
 * For single-turn tests, returns SingleTurnIteratedResult with base iteration stats.
 *
 * @param allIterationResults - Array of arrays: outer = iterations, inner = test cases
 * @returns Aggregated results with iteration statistics
 *
 * @example
 * ```typescript
 * // 3 iterations, 2 test cases each
 * const allResults = [
 *   [testCase1_iter1, testCase2_iter1], // iteration 1
 *   [testCase1_iter2, testCase2_iter2], // iteration 2
 *   [testCase1_iter3, testCase2_iter3], // iteration 3
 * ]
 *
 * const aggregated = aggregateIterationResults(allResults)
 * // aggregated[0] = testCase1 with stats from iter1, iter2, iter3
 * // aggregated[1] = testCase2 with stats from iter1, iter2, iter3
 *
 * // For multi-turn tests:
 * // aggregated[0].kind === 'multi-turn-iterated'
 * // aggregated[0].multiTurnIterationStats = { avgTurns, minTurns, maxTurns, terminationCounts }
 * ```
 */
declare function aggregateIterationResults<TInput, TOutput>(allIterationResults: EvalTestResult<TInput, TOutput>[][]): (SingleTurnIteratedResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>)[];
/**
 * Calculate average standard deviation across multiple test results.
 * Used for report summary.
 *
 * @param results - Eval results (only iterated results have stats)
 * @returns Average stdDev across all iterated tests, or undefined if no iteration data
 */
declare function calculateAvgStdDev<TInput, TOutput>(results: EvalTestResult<TInput, TOutput>[]): number | undefined;
/**
 * Calculate average pass rate across multiple test results.
 * Used for report summary.
 *
 * @param results - Eval results (only iterated results have stats)
 * @returns Average passRate across all iterated tests, or undefined if no iteration data
 */
declare function calculateAvgPassRate<TInput, TOutput>(results: EvalTestResult<TInput, TOutput>[]): number | undefined;
|
|
1074
|
+
|
|
1075
|
+
/**
 * Error codes for agent-eval operations.
 * Grouped by subsystem: LLM transport, parsing, prompts, schema, filesystem.
 */
declare enum EvalErrorCode {
    LLM_API_ERROR = "LLM_API_ERROR",
    LLM_RATE_LIMIT = "LLM_RATE_LIMIT",
    LLM_TIMEOUT = "LLM_TIMEOUT",
    JSON_PARSE_ERROR = "JSON_PARSE_ERROR",
    VERDICT_PARSE_ERROR = "VERDICT_PARSE_ERROR",
    TEMPLATE_COMPILE_ERROR = "TEMPLATE_COMPILE_ERROR",
    AGENT_EXECUTION_ERROR = "AGENT_EXECUTION_ERROR",
    INVALID_CONFIG = "INVALID_CONFIG",
    MISSING_API_KEY = "MISSING_API_KEY",
    PROMPT_NOT_FOUND = "PROMPT_NOT_FOUND",
    PROMPT_INVALID_FORMAT = "PROMPT_INVALID_FORMAT",
    PROMPT_WRITE_ERROR = "PROMPT_WRITE_ERROR",
    PROMPT_READ_ERROR = "PROMPT_READ_ERROR",
    SUGGESTION_APPLY_ERROR = "SUGGESTION_APPLY_ERROR",
    SCHEMA_VALIDATION_ERROR = "SCHEMA_VALIDATION_ERROR",
    SCHEMA_GENERATION_ERROR = "SCHEMA_GENERATION_ERROR",
    FILE_READ_ERROR = "FILE_READ_ERROR",
    FILE_WRITE_ERROR = "FILE_WRITE_ERROR",
    FILE_TOO_LARGE = "FILE_TOO_LARGE",
    CONCURRENT_MODIFICATION = "CONCURRENT_MODIFICATION",
    UNKNOWN_ERROR = "UNKNOWN_ERROR"
}
/**
 * Constructor options for EvalError.
 */
interface EvalErrorOptions {
    /** Machine-readable error code */
    code: EvalErrorCode;
    /** Underlying error, if this wraps another failure */
    cause?: Error;
    /** Arbitrary structured context attached for debugging */
    context?: Record<string, unknown>;
}
/**
 * Custom error class for agent-eval operations.
 * Provides structured error information including error code and optional context.
 */
declare class EvalError extends Error {
    readonly code: EvalErrorCode;
    readonly cause?: Error;
    readonly context?: Record<string, unknown>;
    constructor(message: string, options: EvalErrorOptions);
    /**
     * Creates an EvalError from an unknown error with a specific code.
     */
    static from(error: unknown, code: EvalErrorCode, context?: Record<string, unknown>): EvalError;
    /** Serializes the error (code, message, context) for logging/JSON output */
    toJSON(): Record<string, unknown>;
}
|
|
1121
|
+
|
|
1122
|
+
/**
 * Creates an LLM-as-Judge evaluator.
 *
 * @param config - Judge configuration (provider, prompt, criteria, threshold)
 * @returns A Judge that scores agent outputs against the configured criteria
 *
 * @example
 * ```typescript
 * import { createJudge, defaultJudgePrompt, accuracy, consistency } from '@agtlantis/eval'
 * import { createGoogleProvider } from '@agtlantis/core'
 *
 * const provider = createGoogleProvider({ apiKey }).withDefaultModel('gemini-2.5-flash')
 *
 * const judge = createJudge({
 *   provider,
 *   prompt: defaultJudgePrompt,
 *   criteria: [accuracy(), consistency()],
 *   passThreshold: 70,
 * })
 *
 * const result = await judge.evaluate({
 *   input: { query: 'What is 2+2?' },
 *   output: { answer: '4' },
 *   agentDescription: 'A math tutor agent',
 *   files: [{ path: 'reference.md', content: '...' }],
 * })
 *
 * console.log(result.overallScore) // e.g., 85
 * console.log(result.passed) // true
 * ```
 */
declare function createJudge(config: JudgeConfig): Judge;
|
|
1151
|
+
|
|
1152
|
+
/**
 * Options for the schema validation criterion.
 */
interface SchemaOptions<T> extends CriterionOptions {
    /** Zod schema the agent output must satisfy */
    schema: z.ZodType<T>;
    /** Use unique IDs when using multiple validators */
    id?: string;
    /** Human-readable name shown in verdicts */
    name?: string;
    /** Description of what the schema enforces */
    description?: string;
}
/**
 * Creates a schema validation criterion using Zod.
 *
 * Performs PROGRAMMATIC validation (not LLM-based).
 * Scoring is binary: 100 if validation passes, 0 if it fails.
 *
 * @param options - Zod schema plus optional id/name/description/weight
 * @returns A ValidatorCriterion usable alongside LLM-based criteria
 *
 * @example
 * ```typescript
 * import { z } from 'zod'
 * import { schema, createJudge, accuracy, defaultJudgePrompt } from '@agtlantis/eval'
 *
 * const RecipeSchema = z.object({
 *   name: z.string(),
 *   ingredients: z.array(z.object({
 *     name: z.string(),
 *     amount: z.string(),
 *   })),
 *   steps: z.array(z.string()).min(1),
 * })
 *
 * const judge = createJudge({
 *   llm: openaiClient,
 *   prompt: defaultJudgePrompt,
 *   criteria: [
 *     schema({ schema: RecipeSchema, weight: 2 }),
 *     accuracy(),
 *   ],
 * })
 * ```
 */
declare function schema<T>(options: SchemaOptions<T>): ValidatorCriterion;
|
|
1190
|
+
|
|
1191
|
+
/**
 * Common options accepted by every built-in criterion factory.
 */
interface CriterionOptions {
    /** Relative weight of this criterion when computing the overall score */
    weight?: number;
}
/**
 * Evaluates whether the agent's output is factually accurate
 * and free from errors or hallucinations.
 *
 * @param options - Optional weight override
 */
declare function accuracy(options?: CriterionOptions): Criterion;
/**
 * Evaluates whether the agent's output is internally consistent
 * and doesn't contradict itself or the provided context.
 *
 * @param options - Optional weight override
 */
declare function consistency(options?: CriterionOptions): Criterion;
/**
 * Evaluates whether the agent's output is relevant to the input
 * and addresses the user's needs appropriately.
 *
 * @param options - Optional weight override
 */
declare function relevance(options?: CriterionOptions): Criterion;
|
|
1209
|
+
|
|
1210
|
+
/**
 * Converts an evaluation report to Markdown format.
 *
 * @param report - The report to render
 * @param options - Markdown rendering options (expanded tests, raw output, preview length)
 * @returns The report rendered as a Markdown string
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases)
 * const markdown = reportToMarkdown(report)
 * console.log(markdown)
 * ```
 */
declare function reportToMarkdown<TInput, TOutput>(report: EvalReport<TInput, TOutput>, options?: ReportMarkdownOptions): string;
/**
 * Saves an evaluation report as a Markdown file.
 *
 * @param report - The report to save
 * @param path - Destination file path
 * @param options - Markdown rendering options
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases)
 * await saveReportMarkdown(report, './reports/eval-2024-01.md')
 * ```
 */
declare function saveReportMarkdown<TInput, TOutput>(report: EvalReport<TInput, TOutput>, path: string, options?: ReportMarkdownOptions): Promise<void>;
/**
 * Compares two evaluation reports and returns the differences.
 * Useful for tracking improvements across prompt versions.
 *
 * @param before - The baseline report
 * @param after - The report to compare against the baseline
 * @returns Score/pass-rate/metric deltas plus improved/regressed/removed test IDs
 *
 * @example
 * ```typescript
 * const beforeReport = await suite.run(testCases)
 * // ... apply improvements ...
 * const afterReport = await suite.withAgent(improvedAgent).run(testCases)
 *
 * const comparison = compareReports(beforeReport, afterReport)
 * console.log(`Score improved by ${comparison.scoreDelta} points`)
 * console.log(`Tests improved: ${comparison.improved.join(', ')}`)
 * console.log(`Tests regressed: ${comparison.regressed.join(', ')}`)
 * ```
 */
declare function compareReports<TInput, TOutput>(before: EvalReport<TInput, TOutput>, after: EvalReport<TInput, TOutput>): ReportComparison;
|
|
1248
|
+
|
|
1249
|
+
/**
 * Reporter that saves EvalReport as JSON.
 *
 * @example
 * ```typescript
 * const reporter = new JsonReporter({ outputDir: './reports' })
 * reporter.save(report, 'my-test') // -> ./reports/my-test-1736691234567.json
 *
 * // Without timestamp
 * const fixedReporter = new JsonReporter({
 *   outputDir: './reports',
 *   addTimestamp: false,
 * })
 * fixedReporter.save(report, 'round-1') // -> ./reports/round-1.json
 * ```
 */
declare class JsonReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly outputDir;
    private readonly pricing?;
    private readonly addTimestamp;
    constructor(options: FileReporterOptions);
    /** Writes the report as JSON and returns the written file path */
    save(report: EvalReport<TInput, TOutput>, name: string): string;
}
|
|
1272
|
+
|
|
1273
|
+
/**
 * Options for MarkdownReporter (file options plus markdown rendering options).
 */
interface MarkdownReporterOptions extends FileReporterOptions {
    /** Markdown generation options */
    markdown?: ReportMarkdownOptions;
}
/**
 * Reporter that saves EvalReport as Markdown.
 *
 * @example
 * ```typescript
 * const reporter = new MarkdownReporter({ outputDir: './reports' })
 * reporter.save(report, 'my-test') // -> ./reports/my-test-1736691234567.md
 *
 * // With expanded passed tests
 * const detailedReporter = new MarkdownReporter({
 *   outputDir: './reports',
 *   markdown: { expandPassedTests: true },
 * })
 * ```
 */
declare class MarkdownReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly outputDir;
    private readonly addTimestamp;
    private readonly markdownOptions;
    constructor(options: MarkdownReporterOptions);
    /** Writes the report as Markdown and returns the written file path */
    save(report: EvalReport<TInput, TOutput>, name: string): string;
}
|
|
1299
|
+
|
|
1300
|
+
/**
 * Reporter that logs EvalReport to console.
 *
 * @example
 * ```typescript
 * const reporter = new ConsoleReporter({ verbosity: 'detailed' })
 * reporter.log(report) // Logs to console
 *
 * // With cost display
 * const costReporter = new ConsoleReporter({
 *   verbosity: 'summary',
 *   pricing: GOOGLE_PRICING,
 * })
 * ```
 */
declare class ConsoleReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly verbosity;
    private readonly pricing?;
    constructor(options?: ConsoleReporterOptions);
    /** Logs the report at the configured verbosity level */
    log(report: EvalReport<TInput, TOutput>): void;
    private logCostIfAvailable;
}
|
|
1322
|
+
|
|
1323
|
+
/**
 * Combines multiple reporters to save/log to multiple outputs.
 *
 * @example
 * ```typescript
 * const reporter = new CompositeReporter([
 *   new JsonReporter({ outputDir: './reports' }),
 *   new ConsoleReporter({ verbosity: 'detailed' }),
 * ])
 * reporter.save(report, 'my-test') // saves JSON + logs to console
 * ```
 */
declare class CompositeReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly reporters;
    constructor(reporters: Reporter<TInput, TOutput>[]);
    /**
     * Saves to all reporters that support saving.
     * Returns the first successful file path (usually JsonReporter).
     */
    save(report: EvalReport<TInput, TOutput>, name: string): string;
    /** Delegates logging to every wrapped reporter that supports it */
    log(report: EvalReport<TInput, TOutput>): void;
}
|
|
1345
|
+
|
|
1346
|
+
/**
 * Create a JSON reporter.
 *
 * @example
 * ```typescript
 * const reporter = createJsonReporter('./reports')
 * reporter.save(report, 'my-test') // → ./reports/my-test-1736691234567.json
 * ```
 */
declare function createJsonReporter<TInput = unknown, TOutput = unknown>(outputDir: string, options?: Omit<FileReporterOptions, 'outputDir'>): JsonReporter<TInput, TOutput>;
/**
 * Create a Markdown reporter.
 *
 * @example
 * ```typescript
 * const reporter = createMarkdownReporter('./reports')
 * reporter.save(report, 'my-test') // → ./reports/my-test-1736691234567.md
 * ```
 */
declare function createMarkdownReporter<TInput = unknown, TOutput = unknown>(outputDir: string, options?: Omit<MarkdownReporterOptions, 'outputDir'>): MarkdownReporter<TInput, TOutput>;
/**
 * Create a console reporter.
 *
 * @example
 * ```typescript
 * const reporter = createConsoleReporter({ verbosity: 'detailed' })
 * reporter.log(report) // Logs to console
 * ```
 */
declare function createConsoleReporter<TInput = unknown, TOutput = unknown>(options?: ConsoleReporterOptions): ConsoleReporter<TInput, TOutput>;
/**
 * Create a composite reporter from multiple reporters.
 *
 * @example
 * ```typescript
 * const reporter = createCompositeReporter([
 *   createJsonReporter('./reports'),
 *   createConsoleReporter({ verbosity: 'summary' }),
 * ])
 * ```
 */
declare function createCompositeReporter<TInput = unknown, TOutput = unknown>(reporters: Reporter<TInput, TOutput>[]): CompositeReporter<TInput, TOutput>;
/**
 * Convenience: Create JSON + Console reporter combo.
 *
 * @example
 * ```typescript
 * const reporter = createDefaultReporter('./reports', {
 *   pricing: GOOGLE_PRICING,
 *   verbosity: 'summary',
 * })
 * reporter.save(report, 'my-test') // saves JSON + logs to console
 * ```
 */
declare function createDefaultReporter<TInput = unknown, TOutput = unknown>(outputDir: string, options?: {
    pricing?: EvalPricingConfig;
    verbosity?: LogVerbosity;
    addTimestamp?: boolean;
}): CompositeReporter<TInput, TOutput>;
|
|
1405
|
+
|
|
1406
|
+
/**
 * Options for creating a report runner.
 */
interface ReportRunnerOptions {
    /** Directory where reports will be saved */
    outputDir: string;
    /** Pricing config for cost calculation */
    pricing?: EvalPricingConfig;
    /** Verbosity level for console output (false to disable logging) */
    verbosity?: LogVerbosity | false;
}
/**
 * Result returned by the report runner.
 */
interface ReportRunnerResult<TInput, TOutput> {
    /** The generated evaluation report */
    report: EvalReport<TInput, TOutput>;
    /** Path where the report was saved */
    savedPath: string;
}
/**
 * Creates a runner that automatically logs and saves reports.
 *
 * @param options - Runner configuration
 * @returns A function that runs the suite and handles reporting
 *
 * @example
 * ```typescript
 * import { createReportRunner, GOOGLE_PRICING } from '@agtlantis/eval'
 *
 * const run = createReportRunner({
 *   outputDir: './reports',
 *   pricing: GOOGLE_PRICING,
 *   verbosity: 'detailed',
 * })
 *
 * const { report, savedPath } = await run(suite, testCases, 'my-evaluation')
 * // Logs to console and saves to ./reports/my-evaluation-{timestamp}.json
 * console.log(`Saved to: ${savedPath}`)
 * ```
 */
declare function createReportRunner(options: ReportRunnerOptions): <TInput, TOutput>(suite: EvalSuite<TInput, TOutput>, testCases: TestCase<TInput>[], name: string) => Promise<ReportRunnerResult<TInput, TOutput>>;
|
|
1448
|
+
|
|
1449
|
+
/** Storage abstraction for testability - allows injecting mock storage */
interface HistoryStorage {
    readFile: (path: string) => Promise<string>;
    writeFile: (path: string, content: string) => Promise<void>;
    exists: (path: string) => boolean;
    mkdir: (path: string, options?: {
        recursive?: boolean;
    }) => Promise<string | undefined | void>;
}
/** Default storage backed by the real filesystem */
declare const defaultHistoryStorage: HistoryStorage;
/**
 * A mutable improvement session that accumulates rounds of prompt improvement
 * and can persist its history.
 */
interface ImprovementSession {
    /** Unique identifier for this session */
    readonly sessionId: string;
    /** Accumulated history of rounds (read-only view) */
    readonly history: Readonly<ImprovementHistory>;
    /** Whether the session is configured with a path it can save to */
    readonly canSave: boolean;
    /** Record a completed round together with the prompt it produced */
    addRound(roundResult: RoundResult, updatedPrompt: SerializedPrompt): void;
    /** Mark the session finished, recording why it terminated */
    complete(terminationReason: string): void;
    /** Persist the history to the configured path */
    save(): Promise<void>;
    /** Flush any pending auto-save writes */
    flush(): Promise<void>;
}
/**
 * Configuration for creating or resuming an improvement session.
 */
interface SessionConfig {
    /** File path where history is persisted */
    path?: string;
    /** Automatically save after each round */
    autoSave?: boolean;
    /** Storage backend (defaults to the real filesystem) */
    storage?: HistoryStorage;
    /** Called when an auto-save write fails */
    onAutoSaveError?: (error: Error) => void;
}
/** @throws EvalError with PROMPT_INVALID_FORMAT if userTemplate is missing */
declare function serializePrompt<TInput>(prompt: AgentPrompt<TInput>): SerializedPrompt;
/** Reconstructs renderUserPrompt using compileTemplate. */
declare function deserializePrompt<TInput>(serialized: SerializedPrompt): AgentPrompt<TInput>;
/** @throws EvalError with PROMPT_INVALID_FORMAT if prompt lacks userTemplate */
declare function createSession<TInput>(initialPrompt: AgentPrompt<TInput>, config?: SessionConfig): ImprovementSession;
/** Resume from a history file. Clears completion status to allow adding new rounds. */
declare function resumeSession(path: string, config?: Omit<SessionConfig, 'path'>): Promise<ImprovementSession>;
/** Save history to JSON file. Creates parent directories if needed. */
declare function saveHistory(history: ImprovementHistory, path: string, storage?: HistoryStorage): Promise<void>;
/** Load history from a JSON file previously written by saveHistory. */
declare function loadHistory(path: string, storage?: HistoryStorage): Promise<ImprovementHistory>;
|
|
1485
|
+
|
|
1486
|
+
/** Terminate when average score reaches threshold */
|
|
1487
|
+
interface TargetScoreCondition {
|
|
1488
|
+
type: 'targetScore';
|
|
1489
|
+
/** Score threshold (0-100) */
|
|
1490
|
+
threshold: number;
|
|
1491
|
+
}
|
|
1492
|
+
/** Terminate after N rounds */
|
|
1493
|
+
interface MaxRoundsCondition {
|
|
1494
|
+
type: 'maxRounds';
|
|
1495
|
+
/** Maximum number of improvement rounds */
|
|
1496
|
+
count: number;
|
|
1497
|
+
}
|
|
1498
|
+
/** Terminate when score doesn't improve for N consecutive rounds */
|
|
1499
|
+
interface NoImprovementCondition {
|
|
1500
|
+
type: 'noImprovement';
|
|
1501
|
+
/** Number of consecutive rounds without improvement */
|
|
1502
|
+
consecutiveRounds: number;
|
|
1503
|
+
/** Minimum score delta to count as improvement (default: 0) */
|
|
1504
|
+
minDelta?: number;
|
|
1505
|
+
}
|
|
1506
|
+
/** Terminate when total cost exceeds budget */
|
|
1507
|
+
interface MaxCostCondition {
|
|
1508
|
+
type: 'maxCost';
|
|
1509
|
+
/** Maximum cost in USD */
|
|
1510
|
+
maxUSD: number;
|
|
1511
|
+
}
|
|
1512
|
+
/** Custom condition with user-defined check function */
|
|
1513
|
+
interface CustomCycleCondition {
|
|
1514
|
+
type: 'custom';
|
|
1515
|
+
/** Function to check if termination condition is met */
|
|
1516
|
+
check: (ctx: CycleContext) => boolean | Promise<boolean>;
|
|
1517
|
+
/** Human-readable description (for debugging/logging) */
|
|
1518
|
+
description?: string;
|
|
1519
|
+
}
|
|
1520
|
+
/** Discriminated union of termination conditions. Uses OR semantics - first match triggers. */
|
|
1521
|
+
type CycleTerminationCondition = TargetScoreCondition | MaxRoundsCondition | NoImprovementCondition | MaxCostCondition | CustomCycleCondition;
|
|
1522
|
+
/** Context available to termination condition checks */
|
|
1523
|
+
interface CycleContext {
|
|
1524
|
+
/** Current round number (1-indexed) */
|
|
1525
|
+
currentRound: number;
|
|
1526
|
+
/** Average score from the latest round */
|
|
1527
|
+
latestScore: number;
|
|
1528
|
+
/** Score history from all previous rounds */
|
|
1529
|
+
previousScores: number[];
|
|
1530
|
+
/** Total accumulated cost in USD */
|
|
1531
|
+
totalCost: number;
|
|
1532
|
+
/** Full history of completed rounds */
|
|
1533
|
+
history: RoundResult[];
|
|
1534
|
+
}
|
|
1535
|
+
/** Result when cycle should continue (no termination) */
|
|
1536
|
+
interface CycleContinueResult {
|
|
1537
|
+
terminated: false;
|
|
1538
|
+
reason: string;
|
|
1539
|
+
/** Not present when not terminated (for type safety with discriminated union) */
|
|
1540
|
+
matchedCondition?: never;
|
|
1541
|
+
}
|
|
1542
|
+
/** Result when cycle should terminate */
|
|
1543
|
+
interface CycleTerminatedResult {
|
|
1544
|
+
terminated: true;
|
|
1545
|
+
matchedCondition: CycleTerminationCondition;
|
|
1546
|
+
reason: string;
|
|
1547
|
+
}
|
|
1548
|
+
type CycleTerminationResult = CycleContinueResult | CycleTerminatedResult;
|
|
1549
|
+
/**
|
|
1550
|
+
* Data yielded after each improvement round for Human-in-the-Loop (HITL) control.
|
|
1551
|
+
* The AsyncGenerator yields this after each round, allowing inspection and decision.
|
|
1552
|
+
*/
|
|
1553
|
+
interface RoundYield {
|
|
1554
|
+
/** Result of the completed round */
|
|
1555
|
+
roundResult: RoundResult;
|
|
1556
|
+
/** Current cycle context */
|
|
1557
|
+
context: CycleContext;
|
|
1558
|
+
/** Suggestions awaiting approval */
|
|
1559
|
+
pendingSuggestions: Suggestion[];
|
|
1560
|
+
/** Termination check result (use isCycleTerminated() to check if terminated) */
|
|
1561
|
+
terminationCheck: CycleTerminationResult;
|
|
1562
|
+
}
|
|
1563
|
+
/** Decision from the caller after reviewing a round */
|
|
1564
|
+
interface RoundDecision {
|
|
1565
|
+
/** Action to take */
|
|
1566
|
+
action: 'continue' | 'stop' | 'rollback';
|
|
1567
|
+
/** Target round for rollback (required if action is 'rollback') */
|
|
1568
|
+
rollbackToRound?: number;
|
|
1569
|
+
/** Suggestions approved by user (optional override) */
|
|
1570
|
+
approvedSuggestions?: Suggestion[];
|
|
1571
|
+
}
|
|
1572
|
+
/** Cost breakdown for a single round */
|
|
1573
|
+
interface RoundCost {
|
|
1574
|
+
/** Agent LLM cost in USD */
|
|
1575
|
+
agent: number;
|
|
1576
|
+
/** Judge LLM cost in USD */
|
|
1577
|
+
judge: number;
|
|
1578
|
+
/** Improver LLM cost in USD */
|
|
1579
|
+
improver: number;
|
|
1580
|
+
/** Total cost in USD */
|
|
1581
|
+
total: number;
|
|
1582
|
+
}
|
|
1583
|
+
/** Result of a single improvement round */
|
|
1584
|
+
interface RoundResult {
|
|
1585
|
+
/** Round number (1-indexed) */
|
|
1586
|
+
round: number;
|
|
1587
|
+
/** When this round completed */
|
|
1588
|
+
completedAt: Date;
|
|
1589
|
+
/** Full evaluation report */
|
|
1590
|
+
report: EvalReport<unknown, unknown>;
|
|
1591
|
+
/** All suggestions generated by improver */
|
|
1592
|
+
suggestionsGenerated: Suggestion[];
|
|
1593
|
+
/** Suggestions that were approved/applied */
|
|
1594
|
+
suggestionsApproved: Suggestion[];
|
|
1595
|
+
/** Prompt snapshot at start of this round (for rollback) */
|
|
1596
|
+
promptSnapshot: SerializedPrompt;
|
|
1597
|
+
/** Prompt version after applying suggestions */
|
|
1598
|
+
promptVersionAfter: string;
|
|
1599
|
+
/** Cost breakdown for this round */
|
|
1600
|
+
cost: RoundCost;
|
|
1601
|
+
/** Score change from previous round (null for first round) */
|
|
1602
|
+
scoreDelta: number | null;
|
|
1603
|
+
}
|
|
1604
|
+
/**
|
|
1605
|
+
* Serialized prompt for JSON storage.
|
|
1606
|
+
* Note: renderUserPrompt cannot be serialized; use compileTemplate(userTemplate) to reconstruct.
|
|
1607
|
+
*/
|
|
1608
|
+
interface SerializedPrompt {
|
|
1609
|
+
/** Prompt unique ID */
|
|
1610
|
+
id: string;
|
|
1611
|
+
/** Version string (e.g., "1.0.0") */
|
|
1612
|
+
version: string;
|
|
1613
|
+
/** System prompt */
|
|
1614
|
+
system: string;
|
|
1615
|
+
/** User prompt template (Mustache format) */
|
|
1616
|
+
userTemplate: string;
|
|
1617
|
+
/** Additional custom fields from AgentPrompt */
|
|
1618
|
+
customFields?: Record<string, unknown>;
|
|
1619
|
+
}
|
|
1620
|
+
/** Serialized round result for JSON storage */
|
|
1621
|
+
interface SerializedRoundResult {
|
|
1622
|
+
/** Round number (1-indexed) */
|
|
1623
|
+
round: number;
|
|
1624
|
+
/** Completion timestamp (ISO 8601) */
|
|
1625
|
+
completedAt: string;
|
|
1626
|
+
/** Average score from this round */
|
|
1627
|
+
avgScore: number;
|
|
1628
|
+
/** Number of passed tests */
|
|
1629
|
+
passed: number;
|
|
1630
|
+
/** Number of failed tests */
|
|
1631
|
+
failed: number;
|
|
1632
|
+
/** Total number of tests */
|
|
1633
|
+
totalTests: number;
|
|
1634
|
+
/** All suggestions generated */
|
|
1635
|
+
suggestionsGenerated: Suggestion[];
|
|
1636
|
+
/** Suggestions that were approved/applied */
|
|
1637
|
+
suggestionsApproved: Suggestion[];
|
|
1638
|
+
/** Prompt snapshot at start of this round */
|
|
1639
|
+
promptSnapshot: SerializedPrompt;
|
|
1640
|
+
/** Prompt version after applying suggestions */
|
|
1641
|
+
promptVersionAfter: string;
|
|
1642
|
+
/** Cost breakdown */
|
|
1643
|
+
cost: RoundCost;
|
|
1644
|
+
/** Score change from previous round */
|
|
1645
|
+
scoreDelta: number | null;
|
|
1646
|
+
}
|
|
1647
|
+
/**
|
|
1648
|
+
* Improvement cycle history (JSON file schema v1.1.0).
|
|
1649
|
+
* Includes promptSnapshot per round for rollback support.
|
|
1650
|
+
*/
|
|
1651
|
+
interface ImprovementHistory {
|
|
1652
|
+
/** Schema version for migration compatibility */
|
|
1653
|
+
schemaVersion: '1.1.0';
|
|
1654
|
+
/** Unique session identifier */
|
|
1655
|
+
sessionId: string;
|
|
1656
|
+
/** Session start timestamp (ISO 8601) */
|
|
1657
|
+
startedAt: string;
|
|
1658
|
+
/** Session completion timestamp (ISO 8601, if completed) */
|
|
1659
|
+
completedAt?: string;
|
|
1660
|
+
/** Initial prompt before any improvements */
|
|
1661
|
+
initialPrompt: SerializedPrompt;
|
|
1662
|
+
/** Current/latest prompt */
|
|
1663
|
+
currentPrompt: SerializedPrompt;
|
|
1664
|
+
/** All completed rounds */
|
|
1665
|
+
rounds: SerializedRoundResult[];
|
|
1666
|
+
/** Reason for termination (if completed) */
|
|
1667
|
+
terminationReason?: string;
|
|
1668
|
+
/** Total accumulated cost in USD */
|
|
1669
|
+
totalCost: number;
|
|
1670
|
+
}
|
|
1671
|
+
/** History persistence configuration */
|
|
1672
|
+
interface HistoryConfig {
|
|
1673
|
+
/** Path to save history JSON */
|
|
1674
|
+
path: string;
|
|
1675
|
+
/** Auto-save after each round (default: true) */
|
|
1676
|
+
autoSave?: boolean;
|
|
1677
|
+
}
|
|
1678
|
+
/** Configuration for running an improvement cycle */
|
|
1679
|
+
interface ImprovementCycleConfig<TInput, TOutput> {
|
|
1680
|
+
/** Factory function to create agent with given prompt */
|
|
1681
|
+
createAgent: (prompt: AgentPrompt<TInput>) => EvalAgent<TInput, TOutput>;
|
|
1682
|
+
/** Starting prompt for improvements */
|
|
1683
|
+
initialPrompt: AgentPrompt<TInput>;
|
|
1684
|
+
/** Test cases to evaluate against */
|
|
1685
|
+
testCases: TestCase<TInput>[];
|
|
1686
|
+
/** Judge for evaluation */
|
|
1687
|
+
judge: Judge;
|
|
1688
|
+
/** Improver for generating suggestions */
|
|
1689
|
+
improver: Improver;
|
|
1690
|
+
/** Termination conditions (OR semantics) */
|
|
1691
|
+
terminateWhen: CycleTerminationCondition[];
|
|
1692
|
+
/** Optional configuration */
|
|
1693
|
+
options?: ImprovementCycleOptions;
|
|
1694
|
+
}
|
|
1695
|
+
/** Optional configuration for improvement cycle */
|
|
1696
|
+
interface ImprovementCycleOptions {
|
|
1697
|
+
/** Options passed to eval suite run */
|
|
1698
|
+
runOptions?: {
|
|
1699
|
+
concurrency?: number;
|
|
1700
|
+
iterations?: number;
|
|
1701
|
+
};
|
|
1702
|
+
/** How to bump version on each improvement */
|
|
1703
|
+
versionBump?: 'major' | 'minor' | 'patch';
|
|
1704
|
+
/** Pricing configuration for cost calculation */
|
|
1705
|
+
pricingConfig?: EvalPricingConfig;
|
|
1706
|
+
/** History persistence settings */
|
|
1707
|
+
history?: HistoryConfig;
|
|
1708
|
+
/** Description for agent (passed to judge) */
|
|
1709
|
+
agentDescription?: string;
|
|
1710
|
+
/** Existing session to resume (preserves session ID and accumulated state) */
|
|
1711
|
+
session?: ImprovementSession;
|
|
1712
|
+
}
|
|
1713
|
+
/** Final result of an improvement cycle */
|
|
1714
|
+
interface ImprovementCycleResult<TInput, TOutput> {
|
|
1715
|
+
/** Final improved prompt */
|
|
1716
|
+
finalPrompt: AgentPrompt<TInput>;
|
|
1717
|
+
/** All completed rounds */
|
|
1718
|
+
rounds: RoundResult[];
|
|
1719
|
+
/** Reason for termination */
|
|
1720
|
+
terminationReason: string;
|
|
1721
|
+
/** Total cost in USD */
|
|
1722
|
+
totalCost: number;
|
|
1723
|
+
/** Saved history (if persistence was enabled) */
|
|
1724
|
+
history?: ImprovementHistory;
|
|
1725
|
+
}
|
|
1726
|
+
declare function isTargetScoreCondition(condition: CycleTerminationCondition): condition is TargetScoreCondition;
|
|
1727
|
+
declare function isMaxRoundsCondition(condition: CycleTerminationCondition): condition is MaxRoundsCondition;
|
|
1728
|
+
declare function isNoImprovementCondition(condition: CycleTerminationCondition): condition is NoImprovementCondition;
|
|
1729
|
+
declare function isMaxCostCondition(condition: CycleTerminationCondition): condition is MaxCostCondition;
|
|
1730
|
+
declare function isCustomCycleCondition(condition: CycleTerminationCondition): condition is CustomCycleCondition;
|
|
1731
|
+
declare function isCycleTerminated(result: CycleTerminationResult): result is CycleTerminatedResult;
|
|
1732
|
+
|
|
1733
|
+
/**
|
|
1734
|
+
* Options for saving an ImprovementCycleResult as JSON.
|
|
1735
|
+
*
|
|
1736
|
+
* Supports two modes:
|
|
1737
|
+
* - **Auto mode**: Provide `outputDir` and `name` to create a timestamped subdirectory
|
|
1738
|
+
* - **Explicit mode**: Provide `directory` to use an existing directory directly
|
|
1739
|
+
*/
|
|
1740
|
+
interface SaveCycleJsonOptions {
|
|
1741
|
+
/** Base output directory (creates {name}-{timestamp}/ subdirectory) */
|
|
1742
|
+
outputDir?: string;
|
|
1743
|
+
/** Cycle name (used for folder name with timestamp) */
|
|
1744
|
+
name?: string;
|
|
1745
|
+
/** Use this exact directory path (no timestamp suffix added) */
|
|
1746
|
+
directory?: string;
|
|
1747
|
+
/** Whether to save individual round reports (default: true) */
|
|
1748
|
+
saveRounds?: boolean;
|
|
1749
|
+
}
|
|
1750
|
+
/**
|
|
1751
|
+
* Saves an ImprovementCycleResult to JSON files.
|
|
1752
|
+
*
|
|
1753
|
+
* Creates a directory containing:
|
|
1754
|
+
* - `cycle-summary.json`: Structured cycle summary
|
|
1755
|
+
* - `round-{n}-report.json`: Individual round reports (if saveRounds=true)
|
|
1756
|
+
*
|
|
1757
|
+
* @example Auto mode (creates timestamped directory)
|
|
1758
|
+
* ```typescript
|
|
1759
|
+
* const dir = saveCycleJson(result, {
|
|
1760
|
+
* outputDir: './reports',
|
|
1761
|
+
* name: 'my-agent',
|
|
1762
|
+
* })
|
|
1763
|
+
* // -> ./reports/my-agent-1736691234567/
|
|
1764
|
+
* ```
|
|
1765
|
+
*
|
|
1766
|
+
* @example Explicit mode (uses existing directory)
|
|
1767
|
+
* ```typescript
|
|
1768
|
+
* const dir = saveCycleJson(result, {
|
|
1769
|
+
* directory: './reports/my-existing-dir',
|
|
1770
|
+
* })
|
|
1771
|
+
* // -> ./reports/my-existing-dir/
|
|
1772
|
+
* ```
|
|
1773
|
+
*/
|
|
1774
|
+
declare function saveCycleJson<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, options: SaveCycleJsonOptions): string;
|
|
1775
|
+
|
|
1776
|
+
/**
|
|
1777
|
+
* Options for logging an ImprovementCycleResult to console.
|
|
1778
|
+
*/
|
|
1779
|
+
interface LogCycleOptions {
|
|
1780
|
+
/** Verbosity level for per-round details */
|
|
1781
|
+
verbosity?: LogVerbosity;
|
|
1782
|
+
/** Show per-round details (default: false, summary only) */
|
|
1783
|
+
showRounds?: boolean;
|
|
1784
|
+
}
|
|
1785
|
+
/**
|
|
1786
|
+
* Logs an ImprovementCycleResult to the console.
|
|
1787
|
+
*
|
|
1788
|
+
* Shows cycle summary including round count, termination reason, total cost,
|
|
1789
|
+
* and score progression. Optionally shows per-round details.
|
|
1790
|
+
*
|
|
1791
|
+
* @param result - The improvement cycle result to log
|
|
1792
|
+
* @param options - Logging options
|
|
1793
|
+
*
|
|
1794
|
+
* @example
|
|
1795
|
+
* ```typescript
|
|
1796
|
+
* import { logCycle } from '@agtlantis/eval'
|
|
1797
|
+
*
|
|
1798
|
+
* const result = await runImprovementCycleAuto(config)
|
|
1799
|
+
* logCycle(result, { verbosity: 'detailed', showRounds: true })
|
|
1800
|
+
* ```
|
|
1801
|
+
*/
|
|
1802
|
+
declare function logCycle<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, options?: LogCycleOptions): void;
|
|
1803
|
+
|
|
1804
|
+
/**
|
|
1805
|
+
* Options for generating cycle markdown.
|
|
1806
|
+
*/
|
|
1807
|
+
interface CycleMarkdownOptions {
|
|
1808
|
+
/** Include full per-round details (default: true) */
|
|
1809
|
+
includeRoundDetails?: boolean;
|
|
1810
|
+
/** Show prompt evolution - initial vs final (default: false) */
|
|
1811
|
+
showPromptEvolution?: boolean;
|
|
1812
|
+
}
|
|
1813
|
+
/**
|
|
1814
|
+
* Converts an ImprovementCycleResult to markdown.
|
|
1815
|
+
*
|
|
1816
|
+
* Generates a comprehensive report including:
|
|
1817
|
+
* - Summary table (rounds, termination, cost, scores)
|
|
1818
|
+
* - Score progression table
|
|
1819
|
+
* - Per-round details (optional)
|
|
1820
|
+
* - Prompt evolution (optional)
|
|
1821
|
+
*
|
|
1822
|
+
* @param result - The improvement cycle result
|
|
1823
|
+
* @param options - Markdown generation options
|
|
1824
|
+
* @returns Markdown string
|
|
1825
|
+
*
|
|
1826
|
+
* @example
|
|
1827
|
+
* ```typescript
|
|
1828
|
+
* import { cycleToMarkdown } from '@agtlantis/eval'
|
|
1829
|
+
*
|
|
1830
|
+
* const result = await runImprovementCycleAuto(config)
|
|
1831
|
+
* const markdown = cycleToMarkdown(result, {
|
|
1832
|
+
* includeRoundDetails: true,
|
|
1833
|
+
* showPromptEvolution: true,
|
|
1834
|
+
* })
|
|
1835
|
+
* ```
|
|
1836
|
+
*/
|
|
1837
|
+
declare function cycleToMarkdown<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, options?: CycleMarkdownOptions): string;
|
|
1838
|
+
/**
|
|
1839
|
+
* Saves an ImprovementCycleResult as markdown.
|
|
1840
|
+
*
|
|
1841
|
+
* @param result - The improvement cycle result
|
|
1842
|
+
* @param filePath - Path to save the markdown file
|
|
1843
|
+
* @param options - Markdown generation options
|
|
1844
|
+
*
|
|
1845
|
+
* @example
|
|
1846
|
+
* ```typescript
|
|
1847
|
+
* import { saveCycleMarkdown } from '@agtlantis/eval'
|
|
1848
|
+
*
|
|
1849
|
+
* const result = await runImprovementCycleAuto(config)
|
|
1850
|
+
* saveCycleMarkdown(result, './reports/cycle-report.md', {
|
|
1851
|
+
* includeRoundDetails: true,
|
|
1852
|
+
* })
|
|
1853
|
+
* ```
|
|
1854
|
+
*/
|
|
1855
|
+
declare function saveCycleMarkdown<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, filePath: string, options?: CycleMarkdownOptions): void;
|
|
1856
|
+
|
|
1857
|
+
/**
|
|
1858
|
+
* Generates a unified diff string for a suggestion.
|
|
1859
|
+
*
|
|
1860
|
+
* @example
|
|
1861
|
+
* ```typescript
|
|
1862
|
+
* const diff = suggestionDiff(suggestion)
|
|
1863
|
+
* console.log(diff)
|
|
1864
|
+
* // - Old value here
|
|
1865
|
+
* // + New value here
|
|
1866
|
+
* ```
|
|
1867
|
+
*/
|
|
1868
|
+
declare function suggestionDiff(suggestion: Suggestion): string;
|
|
1869
|
+
/**
|
|
1870
|
+
* Generates a preview of what the suggestion would look like when applied.
|
|
1871
|
+
*
|
|
1872
|
+
* @example
|
|
1873
|
+
* ```typescript
|
|
1874
|
+
* const preview = suggestionPreview(suggestion)
|
|
1875
|
+
* console.log(preview)
|
|
1876
|
+
* ```
|
|
1877
|
+
*/
|
|
1878
|
+
declare function suggestionPreview(suggestion: Suggestion): string;
|
|
1879
|
+
/**
|
|
1880
|
+
* Formats a suggestion as a compact summary string.
|
|
1881
|
+
*
|
|
1882
|
+
* @example
|
|
1883
|
+
* ```typescript
|
|
1884
|
+
* console.log(suggestionSummary(suggestion))
|
|
1885
|
+
* // [HIGH] system_prompt: Improve clarity in instructions
|
|
1886
|
+
* ```
|
|
1887
|
+
*/
|
|
1888
|
+
declare function suggestionSummary(suggestion: Suggestion): string;
|
|
1889
|
+
/**
|
|
1890
|
+
* Options for applying suggestions to a prompt.
|
|
1891
|
+
*/
|
|
1892
|
+
interface ApplyPromptSuggestionsOptions {
|
|
1893
|
+
/**
|
|
1894
|
+
* Version bump type for semver.
|
|
1895
|
+
* - 'major': 1.0.0 → 2.0.0 (breaking changes)
|
|
1896
|
+
* - 'minor': 1.0.0 → 1.1.0 (new features)
|
|
1897
|
+
* - 'patch': 1.0.0 → 1.0.1 (bug fixes)
|
|
1898
|
+
*/
|
|
1899
|
+
bumpVersion?: 'major' | 'minor' | 'patch';
|
|
1900
|
+
}
|
|
1901
|
+
/**
|
|
1902
|
+
* Result of applying suggestions to a prompt.
|
|
1903
|
+
*/
|
|
1904
|
+
interface ApplySuggestionsResult<TInput, TOutput = unknown> {
|
|
1905
|
+
/** The updated prompt with suggestions applied */
|
|
1906
|
+
prompt: AgentPrompt<TInput>;
|
|
1907
|
+
/** Number of suggestions that were successfully applied */
|
|
1908
|
+
appliedCount: number;
|
|
1909
|
+
/** Suggestions that could not be applied (currentValue not found) */
|
|
1910
|
+
skipped: Array<{
|
|
1911
|
+
suggestion: Suggestion;
|
|
1912
|
+
reason: string;
|
|
1913
|
+
}>;
|
|
1914
|
+
}
|
|
1915
|
+
/**
|
|
1916
|
+
* Bumps a semver version string.
|
|
1917
|
+
*
|
|
1918
|
+
* @example
|
|
1919
|
+
* ```typescript
|
|
1920
|
+
* bumpVersion('1.0.0', 'major') // '2.0.0'
|
|
1921
|
+
* bumpVersion('1.0.0', 'minor') // '1.1.0'
|
|
1922
|
+
* bumpVersion('1.0.0', 'patch') // '1.0.1'
|
|
1923
|
+
* bumpVersion('1.2.3', 'minor') // '1.3.0'
|
|
1924
|
+
* ```
|
|
1925
|
+
*/
|
|
1926
|
+
declare function bumpVersion(version: string, bump: 'major' | 'minor' | 'patch'): string;
|
|
1927
|
+
/**
|
|
1928
|
+
* Applies approved suggestions to an AgentPrompt and returns a new prompt.
|
|
1929
|
+
*
|
|
1930
|
+
* This function:
|
|
1931
|
+
* - Only applies suggestions where `approved === true`
|
|
1932
|
+
* - For `system_prompt`: replaces `currentValue` in `prompt.system`
|
|
1933
|
+
* - For `user_prompt`: requires `prompt.userTemplate` field, updates it and regenerates `renderUserPrompt`
|
|
1934
|
+
* - For `parameters`: applies to custom fields in the prompt
|
|
1935
|
+
* - Optionally bumps the version (major/minor/patch)
|
|
1936
|
+
*
|
|
1937
|
+
* **Important behaviors:**
|
|
1938
|
+
* - Only the **first occurrence** of `currentValue` is replaced (not all occurrences)
|
|
1939
|
+
* - Special characters like `$&`, `$1` in `suggestedValue` are preserved as-is (no regex interpretation)
|
|
1940
|
+
*
|
|
1941
|
+
* @example
|
|
1942
|
+
* ```typescript
|
|
1943
|
+
* // Apply approved suggestions with minor version bump
|
|
1944
|
+
* const result = applyPromptSuggestions(
|
|
1945
|
+
* currentPrompt,
|
|
1946
|
+
* suggestions.filter(s => s.approved),
|
|
1947
|
+
* { bumpVersion: 'minor' }
|
|
1948
|
+
* )
|
|
1949
|
+
*
|
|
1950
|
+
* console.log(result.prompt.version) // '1.1.0'
|
|
1951
|
+
* console.log(`Applied ${result.appliedCount} suggestions`)
|
|
1952
|
+
*
|
|
1953
|
+
* if (result.skipped.length > 0) {
|
|
1954
|
+
* console.warn('Skipped suggestions:', result.skipped)
|
|
1955
|
+
* }
|
|
1956
|
+
* ```
|
|
1957
|
+
*
|
|
1958
|
+
* @throws {EvalError} with code SUGGESTION_APPLY_ERROR if:
|
|
1959
|
+
* - A `user_prompt` suggestion is applied but prompt lacks `userTemplate` field
|
|
1960
|
+
* - Version format is invalid when bumpVersion is specified
|
|
1961
|
+
*/
|
|
1962
|
+
declare function applyPromptSuggestions<TInput, TOutput = unknown>(currentPrompt: AgentPrompt<TInput>, suggestions: Suggestion[], options?: ApplyPromptSuggestionsOptions): ApplySuggestionsResult<TInput, TOutput>;
|
|
1963
|
+
|
|
1964
|
+
/**
|
|
1965
|
+
* Creates an LLM-based prompt improver.
|
|
1966
|
+
*
|
|
1967
|
+
* Analyzes test results and suggests improvements to the agent's prompt,
|
|
1968
|
+
* focusing on low-scoring criteria with actionable suggestions.
|
|
1969
|
+
*
|
|
1970
|
+
* @example
|
|
1971
|
+
* ```typescript
|
|
1972
|
+
* import { createImprover, defaultImproverPrompt } from '@agtlantis/eval'
|
|
1973
|
+
* import { createGoogleProvider } from '@agtlantis/core'
|
|
1974
|
+
*
|
|
1975
|
+
* const provider = createGoogleProvider({ apiKey }).withDefaultModel('gemini-2.5-flash')
|
|
1976
|
+
*
|
|
1977
|
+
* const improver = createImprover({
|
|
1978
|
+
* provider,
|
|
1979
|
+
* prompt: defaultImproverPrompt,
|
|
1980
|
+
* })
|
|
1981
|
+
*
|
|
1982
|
+
* const { suggestions } = await improver.improve(agent.prompt, evaluatedResults)
|
|
1983
|
+
*
|
|
1984
|
+
* for (const suggestion of suggestions) {
|
|
1985
|
+
* console.log(suggestionDiff(suggestion))
|
|
1986
|
+
* suggestion.approved = true
|
|
1987
|
+
* }
|
|
1988
|
+
*
|
|
1989
|
+
* const newPrompt = applyPromptSuggestions(agent.prompt, suggestions)
|
|
1990
|
+
* ```
|
|
1991
|
+
*/
|
|
1992
|
+
declare function createImprover(config: ImproverConfig): Improver;
|
|
1993
|
+
|
|
1994
|
+
/**
|
|
1995
|
+
* Configuration for creating a mock agent.
|
|
1996
|
+
*/
|
|
1997
|
+
interface MockAgentConfig<TInput, TOutput> {
|
|
1998
|
+
/** Name for the mock agent */
|
|
1999
|
+
name?: string;
|
|
2000
|
+
/** Description for the mock agent */
|
|
2001
|
+
description?: string;
|
|
2002
|
+
/** Response to return from execute() */
|
|
2003
|
+
response?: TOutput;
|
|
2004
|
+
/** Token usage to include in metadata */
|
|
2005
|
+
tokenUsage?: EvalTokenUsage;
|
|
2006
|
+
/** Delay in ms before returning response */
|
|
2007
|
+
delay?: number;
|
|
2008
|
+
/** If true, throw an error instead of returning response */
|
|
2009
|
+
shouldError?: boolean;
|
|
2010
|
+
/** Custom error message when shouldError is true */
|
|
2011
|
+
errorMessage?: string;
|
|
2012
|
+
/** Custom execute function for more control */
|
|
2013
|
+
executeFn?: (input: TInput) => Promise<{
|
|
2014
|
+
result: TOutput;
|
|
2015
|
+
metadata?: {
|
|
2016
|
+
tokenUsage?: EvalTokenUsage;
|
|
2017
|
+
};
|
|
2018
|
+
}>;
|
|
2019
|
+
}
|
|
2020
|
+
/**
|
|
2021
|
+
* Creates a mock agent for testing purposes.
|
|
2022
|
+
*
|
|
2023
|
+
* @example
|
|
2024
|
+
* ```typescript
|
|
2025
|
+
* // Basic usage
|
|
2026
|
+
* const agent = createMockAgent<{ query: string }, { answer: string }>({
|
|
2027
|
+
* response: { answer: 'Hello!' },
|
|
2028
|
+
* })
|
|
2029
|
+
*
|
|
2030
|
+
* // With delay and token usage
|
|
2031
|
+
* const agent = createMockAgent({
|
|
2032
|
+
* response: { answer: 'Response' },
|
|
2033
|
+
* delay: 100,
|
|
2034
|
+
* tokenUsage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
|
|
2035
|
+
* })
|
|
2036
|
+
*
|
|
2037
|
+
* // Error testing
|
|
2038
|
+
* const agent = createMockAgent({
|
|
2039
|
+
* shouldError: true,
|
|
2040
|
+
* errorMessage: 'Agent failed',
|
|
2041
|
+
* })
|
|
2042
|
+
* ```
|
|
2043
|
+
*/
|
|
2044
|
+
declare function createMockAgent<TInput, TOutput>(config?: MockAgentConfig<TInput, TOutput>): EvalAgent<TInput, TOutput>;
|
|
2045
|
+
/**
|
|
2046
|
+
* Configuration for creating a mock judge.
|
|
2047
|
+
*/
|
|
2048
|
+
interface MockJudgeConfig {
|
|
2049
|
+
/** Overall score to return (0-100) */
|
|
2050
|
+
score?: number;
|
|
2051
|
+
/** Whether the evaluation passed */
|
|
2052
|
+
passed?: boolean;
|
|
2053
|
+
/** Verdicts to return */
|
|
2054
|
+
verdicts?: Verdict[];
|
|
2055
|
+
/** Metadata to return (for cost tracking tests) */
|
|
2056
|
+
metadata?: JudgeResult['metadata'];
|
|
2057
|
+
/** If true, throw an error instead of returning result */
|
|
2058
|
+
shouldError?: boolean;
|
|
2059
|
+
/** Custom error message when shouldError is true */
|
|
2060
|
+
errorMessage?: string;
|
|
2061
|
+
/** Custom evaluate function for more control */
|
|
2062
|
+
evaluateFn?: (context: EvalContext) => Promise<JudgeResult>;
|
|
2063
|
+
}
|
|
2064
|
+
/**
|
|
2065
|
+
* Creates a mock judge for testing purposes.
|
|
2066
|
+
*
|
|
2067
|
+
* @example
|
|
2068
|
+
* ```typescript
|
|
2069
|
+
* // Basic usage - passing test
|
|
2070
|
+
* const judge = createMockJudge({
|
|
2071
|
+
* score: 85,
|
|
2072
|
+
* passed: true,
|
|
2073
|
+
* })
|
|
2074
|
+
*
|
|
2075
|
+
* // Custom verdicts
|
|
2076
|
+
* const judge = createMockJudge({
|
|
2077
|
+
* verdicts: [
|
|
2078
|
+
* { criterionId: 'accuracy', score: 90, reasoning: 'Good', passed: true },
|
|
2079
|
+
* { criterionId: 'clarity', score: 80, reasoning: 'Clear', passed: true },
|
|
2080
|
+
* ],
|
|
2081
|
+
* score: 85,
|
|
2082
|
+
* passed: true,
|
|
2083
|
+
* })
|
|
2084
|
+
*
|
|
2085
|
+
* // Failing test
|
|
2086
|
+
* const judge = createMockJudge({
|
|
2087
|
+
* score: 40,
|
|
2088
|
+
* passed: false,
|
|
2089
|
+
* })
|
|
2090
|
+
*
|
|
2091
|
+
* // Error testing
|
|
2092
|
+
* const judge = createMockJudge({
|
|
2093
|
+
* shouldError: true,
|
|
2094
|
+
* errorMessage: 'Judge failed to evaluate',
|
|
2095
|
+
* })
|
|
2096
|
+
* ```
|
|
2097
|
+
*/
|
|
2098
|
+
declare function createMockJudge(config?: MockJudgeConfig): Judge;
|
|
2099
|
+
/**
|
|
2100
|
+
* Configuration for creating a mock improver.
|
|
2101
|
+
*/
|
|
2102
|
+
interface MockImproverConfig {
|
|
2103
|
+
/** Suggestions to return */
|
|
2104
|
+
suggestions?: Suggestion[];
|
|
2105
|
+
/** If true, throw an error instead of returning suggestions */
|
|
2106
|
+
shouldError?: boolean;
|
|
2107
|
+
/** Custom error message when shouldError is true */
|
|
2108
|
+
errorMessage?: string;
|
|
2109
|
+
/** Custom improve function for more control */
|
|
2110
|
+
improveFn?: (agentPrompt: AgentPrompt<any>, results: TestResultWithVerdict<any, any>[]) => Promise<ImproveResult>;
|
|
2111
|
+
}
|
|
2112
|
+
/**
|
|
2113
|
+
* Creates a mock improver for testing purposes.
|
|
2114
|
+
*
|
|
2115
|
+
* @example
|
|
2116
|
+
* ```typescript
|
|
2117
|
+
* // Basic usage
|
|
2118
|
+
* const improver = createMockImprover({
|
|
2119
|
+
* suggestions: [
|
|
2120
|
+
* {
|
|
2121
|
+
* type: 'system_prompt',
|
|
2122
|
+
* priority: 'high',
|
|
2123
|
+
* currentValue: 'Old prompt',
|
|
2124
|
+
* suggestedValue: 'New prompt',
|
|
2125
|
+
* reasoning: 'Better clarity',
|
|
2126
|
+
* expectedImprovement: '10% improvement',
|
|
2127
|
+
* },
|
|
2128
|
+
* ],
|
|
2129
|
+
* })
|
|
2130
|
+
*
|
|
2131
|
+
* // Empty suggestions
|
|
2132
|
+
* const improver = createMockImprover({ suggestions: [] })
|
|
2133
|
+
*
|
|
2134
|
+
* // Error testing
|
|
2135
|
+
* const improver = createMockImprover({
|
|
2136
|
+
* shouldError: true,
|
|
2137
|
+
* errorMessage: 'Improver failed',
|
|
2138
|
+
* })
|
|
2139
|
+
* ```
|
|
2140
|
+
*/
|
|
2141
|
+
declare function createMockImprover(config?: MockImproverConfig): Improver;
|
|
2142
|
+
|
|
2143
|
+
type TerminationCondition<TInput = unknown, TOutput = unknown> = MaxTurnsCondition | FieldSetCondition | FieldValueCondition | CustomCondition<TInput, TOutput>;
|
|
2144
|
+
interface MaxTurnsCondition {
|
|
2145
|
+
type: 'maxTurns';
|
|
2146
|
+
/** Safety limit - terminates after this many turns */
|
|
2147
|
+
count: number;
|
|
2148
|
+
}
|
|
2149
|
+
interface FieldsCondition {
|
|
2150
|
+
/** Dot notation for nested access (e.g., "result.recommendation") */
|
|
2151
|
+
fieldPath: string;
|
|
2152
|
+
}
|
|
2153
|
+
interface FieldSetCondition extends FieldsCondition {
|
|
2154
|
+
type: 'fieldSet';
|
|
2155
|
+
}
|
|
2156
|
+
interface FieldValueCondition extends FieldsCondition {
|
|
2157
|
+
type: 'fieldValue';
|
|
2158
|
+
expectedValue: unknown;
|
|
2159
|
+
}
|
|
2160
|
+
interface CustomCondition<TInput = unknown, TOutput = unknown> {
|
|
2161
|
+
type: 'custom';
|
|
2162
|
+
/** Sync or async check function (e.g., for LLM-based conditions) */
|
|
2163
|
+
check: (context: ConversationContext<TInput, TOutput>) => boolean | Promise<boolean>;
|
|
2164
|
+
/** For debugging/logging */
|
|
2165
|
+
description?: string;
|
|
2166
|
+
}
|
|
2167
|
+
type TerminationType = 'condition' | 'maxTurns' | 'error' | 'exhausted';
|
|
2168
|
+
interface ContinueResult {
|
|
2169
|
+
terminated: false;
|
|
2170
|
+
reason: string;
|
|
2171
|
+
terminationType?: never;
|
|
2172
|
+
matchedCondition?: never;
|
|
2173
|
+
}
|
|
2174
|
+
interface TerminatedResult {
|
|
2175
|
+
terminated: true;
|
|
2176
|
+
terminationType: TerminationType;
|
|
2177
|
+
matchedCondition?: TerminationCondition<unknown, unknown>;
|
|
2178
|
+
reason: string;
|
|
2179
|
+
}
|
|
2180
|
+
type TerminationCheckResult = ContinueResult | TerminatedResult;
|
|
2181
|
+
interface ConversationContext<TInput, TOutput = unknown> {
|
|
2182
|
+
currentTurn: number;
|
|
2183
|
+
history: Array<{
|
|
2184
|
+
turn: number;
|
|
2185
|
+
input: TInput;
|
|
2186
|
+
output: TOutput | undefined;
|
|
2187
|
+
metadata?: AgentMetadata;
|
|
2188
|
+
}>;
|
|
2189
|
+
lastOutput?: TOutput;
|
|
2190
|
+
}
|
|
2191
|
+
interface FollowUpInput<TInput, TOutput = unknown> {
|
|
2192
|
+
/**
|
|
2193
|
+
* Input for this follow-up turn.
|
|
2194
|
+
* Can be static, dynamic (sync), or async (for AI-generated inputs via aiUser()).
|
|
2195
|
+
*/
|
|
2196
|
+
input: TInput | ((context: ConversationContext<TInput, TOutput>) => TInput) | ((context: ConversationContext<TInput, TOutput>) => Promise<TInput>);
|
|
2197
|
+
/** For debugging/reports */
|
|
2198
|
+
description?: string;
|
|
2199
|
+
/**
|
|
2200
|
+
* Repeat count (default: 1).
|
|
2201
|
+
* Use Infinity to repeat until termination (must be last followUpInput).
|
|
2202
|
+
*/
|
|
2203
|
+
turns?: number;
|
|
2204
|
+
}
|
|
2205
|
+
interface MultiTurnTestCase<TInput, TOutput = unknown> extends TestCase<TInput> {
|
|
2206
|
+
multiTurn: {
|
|
2207
|
+
/** Inputs for 2nd turn onwards (first turn uses TestCase.input) */
|
|
2208
|
+
followUpInputs?: FollowUpInput<TInput, TOutput>[];
|
|
2209
|
+
/** Any condition triggers termination (OR logic) */
|
|
2210
|
+
terminateWhen: TerminationCondition<TInput, TOutput>[];
|
|
2211
|
+
/** Safety limit (default: 10). Uses min of this and any maxTurns condition. */
|
|
2212
|
+
maxTurns?: number;
|
|
2213
|
+
/** Pass/fail when condition met (default: 'pass') */
|
|
2214
|
+
onConditionMet?: 'pass' | 'fail';
|
|
2215
|
+
/** Pass/fail when maxTurns reached (default: 'fail') */
|
|
2216
|
+
onMaxTurnsReached?: 'pass' | 'fail';
|
|
2217
|
+
};
|
|
2218
|
+
}
|
|
2219
|
+
interface MultiTurnTestResult<TInput, TOutput> extends Omit<TestResultWithVerdict<TInput, TOutput>, 'output'> {
|
|
2220
|
+
output: TOutput | undefined;
|
|
2221
|
+
conversationHistory: Array<{
|
|
2222
|
+
turn: number;
|
|
2223
|
+
input: TInput;
|
|
2224
|
+
output: TOutput | undefined;
|
|
2225
|
+
metadata?: AgentMetadata;
|
|
2226
|
+
}>;
|
|
2227
|
+
termination: TerminationCheckResult;
|
|
2228
|
+
totalTurns: number;
|
|
2229
|
+
}
|
|
2230
|
+
declare function isMaxTurnsCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is MaxTurnsCondition;
|
|
2231
|
+
declare function isFieldSetCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is FieldSetCondition;
|
|
2232
|
+
declare function isFieldValueCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is FieldValueCondition;
|
|
2233
|
+
declare function isCustomCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is CustomCondition<TInput, TOutput>;
|
|
2234
|
+
declare function isMultiTurnTestCase<TInput, TOutput = unknown>(testCase: TestCase<TInput>): testCase is MultiTurnTestCase<TInput, TOutput>;
|
|
2235
|
+
declare function isTerminated(result: TerminationCheckResult): result is TerminatedResult;
|
|
2236
|
+
|
|
2237
|
+
/** Access a nested field value using dot notation (e.g., "result.recommendation"). */
|
|
2238
|
+
declare function getFieldValue(obj: unknown, fieldPath: string): unknown;
|
|
2239
|
+
declare function checkCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>, context: ConversationContext<TInput, TOutput>): Promise<TerminationCheckResult>;
|
|
2240
|
+
/** Check all termination conditions (OR relationship). Returns on first termination. */
|
|
2241
|
+
declare function checkTermination<TInput, TOutput>(conditions: TerminationCondition<TInput, TOutput>[], context: ConversationContext<TInput, TOutput>): Promise<TerminationCheckResult>;
|
|
2242
|
+
|
|
2243
|
+
interface NaturalLanguageConditionOptions {
|
|
2244
|
+
/** Provider to use for evaluation */
|
|
2245
|
+
provider: Provider;
|
|
2246
|
+
/** Prompt describing the termination criteria (e.g., "Has the user's question been fully answered?") */
|
|
2247
|
+
prompt: string;
|
|
2248
|
+
/** Optional system prompt override */
|
|
2249
|
+
systemPrompt?: string;
|
|
2250
|
+
}
|
|
2251
|
+
/** LLM-based termination condition. Asks the LLM to evaluate the termination criteria. */
|
|
2252
|
+
declare function naturalLanguage<TInput = unknown, TOutput = unknown>(options: NaturalLanguageConditionOptions): CustomCondition<TInput, TOutput>;
|
|
2253
|
+
/** Terminates when ALL sub-conditions are met (AND logic). */
|
|
2254
|
+
declare function and$1<TInput = unknown, TOutput = unknown>(...conditions: TerminationCondition<TInput, TOutput>[]): CustomCondition<TInput, TOutput>;
|
|
2255
|
+
/** Terminates when ANY sub-condition is met (OR logic). Useful for nested composites. */
|
|
2256
|
+
declare function or$1<TInput = unknown, TOutput = unknown>(...conditions: TerminationCondition<TInput, TOutput>[]): CustomCondition<TInput, TOutput>;
|
|
2257
|
+
/** Inverts another condition (NOT logic). */
|
|
2258
|
+
declare function not$1<TInput = unknown, TOutput = unknown>(condition: TerminationCondition<TInput, TOutput>): CustomCondition<TInput, TOutput>;
|
|
2259
|
+
/** Terminates after a specified number of turns. Convenience wrapper for use in composites. */
|
|
2260
|
+
declare function afterTurns<TInput = unknown, TOutput = unknown>(count: number): CustomCondition<TInput, TOutput>;
|
|
2261
|
+
/** Terminates when a field matches a specific value. Convenience wrapper for composites. */
|
|
2262
|
+
declare function fieldEquals<TInput = unknown, TOutput = unknown>(fieldPath: string, expectedValue: unknown): CustomCondition<TInput, TOutput>;
|
|
2263
|
+
/** Terminates when a field is set (not null/undefined). Convenience wrapper for composites. */
|
|
2264
|
+
declare function fieldIsSet<TInput = unknown, TOutput = unknown>(fieldPath: string): CustomCondition<TInput, TOutput>;
|
|
2265
|
+
|
|
2266
|
+
interface MultiTurnExecuteContext<TInput, TOutput> {
|
|
2267
|
+
agent: EvalAgent<TInput, TOutput>;
|
|
2268
|
+
judge: Judge;
|
|
2269
|
+
agentDescription: string;
|
|
2270
|
+
}
|
|
2271
|
+
interface MultiTurnExecuteOptions {
|
|
2272
|
+
signal?: AbortSignal;
|
|
2273
|
+
}
|
|
2274
|
+
declare function executeMultiTurnTestCase<TInput, TOutput>(testCase: MultiTurnTestCase<TInput, TOutput>, context: MultiTurnExecuteContext<TInput, TOutput>, options?: MultiTurnExecuteOptions): Promise<MultiTurnTestResult<TInput, TOutput>>;
|
|
2275
|
+
|
|
2276
|
+
interface AIUserOptions<TInput, TOutput> {
|
|
2277
|
+
/** Provider for generating user responses */
|
|
2278
|
+
provider: Provider;
|
|
2279
|
+
/** System prompt (string or function for dynamic personas). Uses default if not provided. */
|
|
2280
|
+
systemPrompt?: string | ((context: ConversationContext<TInput, TOutput>) => string);
|
|
2281
|
+
/** Custom history formatter. Default: JSON-based "User: {input}\nAssistant: {output}" format. */
|
|
2282
|
+
formatHistory?: (context: ConversationContext<TInput, TOutput>) => string;
|
|
2283
|
+
/** Convert LLM text response to TInput. Has access to full context for structured input building. */
|
|
2284
|
+
buildInput: (llmResponse: string, context: ConversationContext<TInput, TOutput>) => TInput;
|
|
2285
|
+
}
|
|
2286
|
+
/**
|
|
2287
|
+
* Creates an async function that generates user inputs using an LLM for multi-turn testing.
|
|
2288
|
+
*
|
|
2289
|
+
* @example
|
|
2290
|
+
* ```typescript
|
|
2291
|
+
* aiUser({
|
|
2292
|
+
* provider: openai,
|
|
2293
|
+
* systemPrompt: 'You are a friendly customer.',
|
|
2294
|
+
* buildInput: (response, ctx) => ({ message: response }),
|
|
2295
|
+
* })
|
|
2296
|
+
* ```
|
|
2297
|
+
*/
|
|
2298
|
+
declare function aiUser<TInput, TOutput>(options: AIUserOptions<TInput, TOutput>): (context: ConversationContext<TInput, TOutput>) => Promise<TInput>;
|
|
2299
|
+
|
|
2300
|
+
/**
|
|
2301
|
+
* CLI Configuration Types
|
|
2302
|
+
*
|
|
2303
|
+
* Defines the configuration schema for `agent-eval.config.ts` files.
|
|
2304
|
+
* Use `defineConfig()` helper for type inference and IDE autocompletion.
|
|
2305
|
+
*/
|
|
2306
|
+
|
|
2307
|
+
/**
|
|
2308
|
+
* LLM provider configuration.
|
|
2309
|
+
* API keys fall back to OPENAI_API_KEY or GOOGLE_API_KEY env vars.
|
|
2310
|
+
*/
|
|
2311
|
+
interface LLMConfig {
|
|
2312
|
+
/** LLM provider */
|
|
2313
|
+
provider: 'openai' | 'gemini';
|
|
2314
|
+
/** API key (optional - falls back to environment variable) */
|
|
2315
|
+
apiKey?: string;
|
|
2316
|
+
/** Default model to use */
|
|
2317
|
+
defaultModel?: string;
|
|
2318
|
+
/**
|
|
2319
|
+
* OpenAI reasoning effort (o1/o3 models only)
|
|
2320
|
+
* @see https://platform.openai.com/docs/guides/reasoning
|
|
2321
|
+
*/
|
|
2322
|
+
reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
|
|
2323
|
+
/**
|
|
2324
|
+
* Default response format
|
|
2325
|
+
* @see https://platform.openai.com/docs/guides/structured-outputs
|
|
2326
|
+
*/
|
|
2327
|
+
defaultResponseFormat?: {
|
|
2328
|
+
type: 'json_object' | 'text';
|
|
2329
|
+
};
|
|
2330
|
+
}
|
|
2331
|
+
interface CLIJudgeConfig {
|
|
2332
|
+
/**
|
|
2333
|
+
* LLM configuration for judge.
|
|
2334
|
+
* If not specified, uses the main `llm` config.
|
|
2335
|
+
*/
|
|
2336
|
+
llm?: LLMConfig;
|
|
2337
|
+
/**
|
|
2338
|
+
* Evaluation criteria.
|
|
2339
|
+
* Use built-in criteria factories like `accuracy()`, `relevance()`,
|
|
2340
|
+
* or define custom criteria objects.
|
|
2341
|
+
*/
|
|
2342
|
+
criteria: Array<Criterion | ValidatorCriterion>;
|
|
2343
|
+
/**
|
|
2344
|
+
* Score threshold for passing (0-100).
|
|
2345
|
+
* @default 70
|
|
2346
|
+
*/
|
|
2347
|
+
passThreshold?: number;
|
|
2348
|
+
/**
|
|
2349
|
+
* Custom judge prompt.
|
|
2350
|
+
* If not specified, uses the default judge prompt.
|
|
2351
|
+
*/
|
|
2352
|
+
prompt?: JudgePrompt;
|
|
2353
|
+
}
|
|
2354
|
+
interface CLIImproverConfig {
|
|
2355
|
+
/**
|
|
2356
|
+
* LLM configuration for improver.
|
|
2357
|
+
* If not specified, uses the main `llm` config.
|
|
2358
|
+
*/
|
|
2359
|
+
llm?: LLMConfig;
|
|
2360
|
+
/**
|
|
2361
|
+
* Custom improver prompt.
|
|
2362
|
+
* If not specified, uses the default improver prompt.
|
|
2363
|
+
*/
|
|
2364
|
+
prompt?: ImproverPrompt;
|
|
2365
|
+
}
|
|
2366
|
+
interface OutputConfig {
|
|
2367
|
+
/**
|
|
2368
|
+
* Directory for report output.
|
|
2369
|
+
* @default './reports'
|
|
2370
|
+
*/
|
|
2371
|
+
dir?: string;
|
|
2372
|
+
/**
|
|
2373
|
+
* Custom filename pattern.
|
|
2374
|
+
* Supports `{timestamp}` placeholder.
|
|
2375
|
+
* @default 'eval-{timestamp}.md'
|
|
2376
|
+
*/
|
|
2377
|
+
filename?: string;
|
|
2378
|
+
/**
|
|
2379
|
+
* Include verbose details in console output.
|
|
2380
|
+
* @default false
|
|
2381
|
+
*/
|
|
2382
|
+
verbose?: boolean;
|
|
2383
|
+
}
|
|
2384
|
+
interface RunConfig {
|
|
2385
|
+
/**
|
|
2386
|
+
* Number of concurrent test executions.
|
|
2387
|
+
* @default 1
|
|
2388
|
+
*/
|
|
2389
|
+
concurrency?: number;
|
|
2390
|
+
/**
|
|
2391
|
+
* Number of iterations per test case (for statistical analysis).
|
|
2392
|
+
* @default 1
|
|
2393
|
+
*/
|
|
2394
|
+
iterations?: number;
|
|
2395
|
+
/**
|
|
2396
|
+
* Stop execution on first test failure.
|
|
2397
|
+
* @default false
|
|
2398
|
+
*/
|
|
2399
|
+
stopOnFirstFailure?: boolean;
|
|
2400
|
+
}
|
|
2401
|
+
interface CLISingleTurnTestCase<TInput> extends TestCase<TInput> {
|
|
2402
|
+
/** Test case must NOT have multiTurn field */
|
|
2403
|
+
multiTurn?: never;
|
|
2404
|
+
}
|
|
2405
|
+
interface CLIMultiTurnTestCase<TInput, TOutput = unknown> extends TestCase<TInput> {
|
|
2406
|
+
/** Multi-turn configuration */
|
|
2407
|
+
multiTurn: {
|
|
2408
|
+
/**
|
|
2409
|
+
* Inputs for 2nd turn onwards.
|
|
2410
|
+
* First turn uses `input` field.
|
|
2411
|
+
*/
|
|
2412
|
+
followUpInputs?: FollowUpInput<TInput, TOutput>[];
|
|
2413
|
+
/**
|
|
2414
|
+
* Termination conditions (OR relationship).
|
|
2415
|
+
* Any one triggers termination.
|
|
2416
|
+
*/
|
|
2417
|
+
terminateWhen: TerminationCondition<TInput, TOutput>[];
|
|
2418
|
+
/**
|
|
2419
|
+
* Safety limit: maximum turns.
|
|
2420
|
+
* @default 10
|
|
2421
|
+
*/
|
|
2422
|
+
maxTurns?: number;
|
|
2423
|
+
/**
|
|
2424
|
+
* Outcome when termination condition is met.
|
|
2425
|
+
* @default 'pass'
|
|
2426
|
+
*/
|
|
2427
|
+
onConditionMet?: 'pass' | 'fail';
|
|
2428
|
+
/**
|
|
2429
|
+
* Outcome when maxTurns is reached.
|
|
2430
|
+
* @default 'fail'
|
|
2431
|
+
*/
|
|
2432
|
+
onMaxTurnsReached?: 'pass' | 'fail';
|
|
2433
|
+
};
|
|
2434
|
+
}
|
|
2435
|
+
type CLITestCase<TInput, TOutput = unknown> = CLISingleTurnTestCase<TInput> | CLIMultiTurnTestCase<TInput, TOutput>;
|
|
2436
|
+
/**
|
|
2437
|
+
* Main evaluation configuration for CLI.
|
|
2438
|
+
* @typeParam TInput - Agent input type
|
|
2439
|
+
* @typeParam TOutput - Agent output type
|
|
2440
|
+
*/
|
|
2441
|
+
interface EvalConfig<TInput = unknown, TOutput = unknown> {
|
|
2442
|
+
/**
|
|
2443
|
+
* Human-readable name for this evaluation.
|
|
2444
|
+
*/
|
|
2445
|
+
name?: string;
|
|
2446
|
+
/**
|
|
2447
|
+
* Description of what the agent does.
|
|
2448
|
+
* Used by Judge for evaluation context.
|
|
2449
|
+
*/
|
|
2450
|
+
agentDescription?: string;
|
|
2451
|
+
/**
|
|
2452
|
+
* The agent to evaluate.
|
|
2453
|
+
*/
|
|
2454
|
+
agent: EvalAgent<TInput, TOutput>;
|
|
2455
|
+
/**
|
|
2456
|
+
* LLM configuration (shared by Judge and Improver unless overridden).
|
|
2457
|
+
*/
|
|
2458
|
+
llm: LLMConfig;
|
|
2459
|
+
/**
|
|
2460
|
+
* Judge configuration for evaluating agent outputs.
|
|
2461
|
+
*/
|
|
2462
|
+
judge: CLIJudgeConfig;
|
|
2463
|
+
/**
|
|
2464
|
+
* Improver configuration for prompt improvement suggestions.
|
|
2465
|
+
* Optional - if not specified, no improvements are generated.
|
|
2466
|
+
*/
|
|
2467
|
+
improver?: CLIImproverConfig;
|
|
2468
|
+
/**
|
|
2469
|
+
* Test cases to run (inline TypeScript definition).
|
|
2470
|
+
* Can mix single-turn and multi-turn test cases.
|
|
2471
|
+
*
|
|
2472
|
+
* Either `testCases` or `include` must be provided.
|
|
2473
|
+
* - Use `testCases` for inline TypeScript test case definitions
|
|
2474
|
+
* - Use `include` for YAML-based test case files
|
|
2475
|
+
*/
|
|
2476
|
+
testCases?: CLITestCase<TInput, TOutput>[];
|
|
2477
|
+
/**
|
|
2478
|
+
* Output configuration for reports.
|
|
2479
|
+
*/
|
|
2480
|
+
output?: OutputConfig;
|
|
2481
|
+
/**
|
|
2482
|
+
* Run configuration for test execution.
|
|
2483
|
+
*/
|
|
2484
|
+
run?: RunConfig;
|
|
2485
|
+
/**
|
|
2486
|
+
* Pricing configuration for cost calculation.
|
|
2487
|
+
* If provided, cost breakdown will be included in test metrics.
|
|
2488
|
+
*
|
|
2489
|
+
* @example
|
|
2490
|
+
* ```typescript
|
|
2491
|
+
* pricing: {
|
|
2492
|
+
* openai: { 'gpt-4o': { inputPricePerMillion: 2.5, outputPricePerMillion: 10 } },
|
|
2493
|
+
* fallback: { inputPricePerMillion: 1.0, outputPricePerMillion: 3.0 },
|
|
2494
|
+
* }
|
|
2495
|
+
* ```
|
|
2496
|
+
*/
|
|
2497
|
+
pricing?: EvalPricingConfig;
|
|
2498
|
+
/**
|
|
2499
|
+
* Glob patterns to discover YAML eval files.
|
|
2500
|
+
* Required when using YAML-based test cases instead of inline testCases.
|
|
2501
|
+
*
|
|
2502
|
+
* @example
|
|
2503
|
+
* ```typescript
|
|
2504
|
+
* include: ['evals/booking/*.eval.yaml']
|
|
2505
|
+
* ```
|
|
2506
|
+
*/
|
|
2507
|
+
include?: string[];
|
|
2508
|
+
/**
|
|
2509
|
+
* Agent registry for YAML file references.
|
|
2510
|
+
* YAML files reference agents by name (e.g., `agent: booking-agent`).
|
|
2511
|
+
*
|
|
2512
|
+
* @example
|
|
2513
|
+
* ```typescript
|
|
2514
|
+
* agents: {
|
|
2515
|
+
* 'booking-agent': bookingAgent,
|
|
2516
|
+
* 'qa-agent': qaAgent,
|
|
2517
|
+
* }
|
|
2518
|
+
* ```
|
|
2519
|
+
*/
|
|
2520
|
+
agents?: Record<string, EvalAgent<unknown, unknown>>;
|
|
2521
|
+
}
|
|
2522
|
+
/** Identity function for type inference and IDE autocompletion. */
|
|
2523
|
+
declare function defineConfig<TInput = unknown, TOutput = unknown>(config: EvalConfig<TInput, TOutput>): EvalConfig<TInput, TOutput>;
|
|
2524
|
+
|
|
2525
|
+
interface DiscoverOptions {
|
|
2526
|
+
/** Override config include patterns (CLI --include) */
|
|
2527
|
+
include?: string[];
|
|
2528
|
+
/** Base directory for glob patterns (defaults to process.cwd()) */
|
|
2529
|
+
cwd?: string;
|
|
2530
|
+
/** Ignore patterns (default excludes node_modules) */
|
|
2531
|
+
ignore?: string[];
|
|
2532
|
+
}
|
|
2533
|
+
/** Discover YAML eval files matching glob patterns. CLI patterns override config. */
|
|
2534
|
+
declare function discoverEvalFiles(config: Pick<EvalConfig, 'include'>, options?: DiscoverOptions): Promise<string[]>;
|
|
2535
|
+
|
|
2536
|
+
/** Terminates when the average score reaches or exceeds threshold. */
|
|
2537
|
+
declare function targetScore(threshold: number): TargetScoreCondition;
|
|
2538
|
+
/** Terminates after completing the specified number of rounds. */
|
|
2539
|
+
declare function maxRounds(count: number): MaxRoundsCondition;
|
|
2540
|
+
/** Terminates when score hasn't improved for N consecutive rounds. */
|
|
2541
|
+
declare function noImprovement(consecutiveRounds: number, minDelta?: number): NoImprovementCondition;
|
|
2542
|
+
/** Terminates when total accumulated cost reaches or exceeds the budget. */
|
|
2543
|
+
declare function maxCost(maxUSD: number): MaxCostCondition;
|
|
2544
|
+
/** Custom termination condition with arbitrary logic. Supports async checks. */
|
|
2545
|
+
declare function customCondition(check: (ctx: CycleContext) => boolean | Promise<boolean>, description?: string): CustomCycleCondition;
|
|
2546
|
+
/** All conditions must be met for termination. Short-circuits on first false. */
|
|
2547
|
+
declare function and(...conditions: CycleTerminationCondition[]): CustomCycleCondition;
|
|
2548
|
+
/** Any condition being met causes termination. Short-circuits on first true. */
|
|
2549
|
+
declare function or(...conditions: CycleTerminationCondition[]): CustomCycleCondition;
|
|
2550
|
+
/** Invert a condition's result. Terminates when inner condition does NOT terminate. */
|
|
2551
|
+
declare function not(condition: CycleTerminationCondition): CustomCycleCondition;
|
|
2552
|
+
/** Dispatches to the appropriate check function based on condition type. */
|
|
2553
|
+
declare function checkCycleCondition(condition: CycleTerminationCondition, context: CycleContext): Promise<CycleTerminationResult>;
|
|
2554
|
+
/** Check all conditions with OR semantics - first match wins. */
|
|
2555
|
+
declare function checkCycleTermination(conditions: CycleTerminationCondition[], context: CycleContext): Promise<CycleTerminationResult>;
|
|
2556
|
+
|
|
2557
|
+
/**
|
|
2558
|
+
* Run an improvement cycle as an AsyncGenerator for Human-in-the-Loop control.
|
|
2559
|
+
* Yields after each round for decision-making (continue, stop, or rollback).
|
|
2560
|
+
*/
|
|
2561
|
+
declare function runImprovementCycle<TInput, TOutput>(config: ImprovementCycleConfig<TInput, TOutput>): AsyncGenerator<RoundYield, ImprovementCycleResult<TInput, TOutput>, RoundDecision | undefined>;
|
|
2562
|
+
/**
|
|
2563
|
+
* Run improvement cycle with automatic approval of all suggestions.
|
|
2564
|
+
* Continues until a termination condition is met.
|
|
2565
|
+
*/
|
|
2566
|
+
declare function runImprovementCycleAuto<TInput, TOutput>(config: ImprovementCycleConfig<TInput, TOutput>): Promise<ImprovementCycleResult<TInput, TOutput>>;
|
|
2567
|
+
|
|
2568
|
+
/**
|
|
2569
|
+
* Options for random selection.
|
|
2570
|
+
*/
|
|
2571
|
+
interface RandomOptions {
|
|
2572
|
+
/** Seed for reproducible random selection */
|
|
2573
|
+
seed?: number;
|
|
2574
|
+
}
|
|
2575
|
+
/**
|
|
2576
|
+
* Immutable collection for managing and selecting test cases.
|
|
2577
|
+
* Provides fluent API for filtering, sampling, and accessing test cases.
|
|
2578
|
+
*
|
|
2579
|
+
* ## Immutability
|
|
2580
|
+
* - All selection methods (`filter`, `first`, `random`, etc.) return **new collections**
|
|
2581
|
+
* - Chaining creates intermediate collections without modifying the original
|
|
2582
|
+
* - Internal array is frozen with `Object.freeze()` to prevent accidental mutation
|
|
2583
|
+
* - `toArray()` returns a **mutable copy** for consumer convenience
|
|
2584
|
+
*
|
|
2585
|
+
* @example
|
|
2586
|
+
* ```typescript
|
|
2587
|
+
* import { TestCaseCollection, createEvalSuite } from '@agtlantis/eval'
|
|
2588
|
+
*
|
|
2589
|
+
* const cases = TestCaseCollection.from([
|
|
2590
|
+
* { id: 'basic', input: { query: 'Hello' } },
|
|
2591
|
+
* { id: 'complex', input: { query: 'Explain quantum computing' } },
|
|
2592
|
+
* { id: 'edge', input: { query: '' } },
|
|
2593
|
+
* ])
|
|
2594
|
+
*
|
|
2595
|
+
* // Development: quick feedback
|
|
2596
|
+
* await suite.run(cases.minimal().toArray())
|
|
2597
|
+
*
|
|
2598
|
+
* // CI: full coverage
|
|
2599
|
+
* await suite.run(cases.all().toArray())
|
|
2600
|
+
*
|
|
2601
|
+
* // Debugging specific case
|
|
2602
|
+
* await suite.run(cases.byIds(['edge']).toArray())
|
|
2603
|
+
*
|
|
2604
|
+
* // Chaining: filter then sample
|
|
2605
|
+
* const filtered = cases.filter(tc => tc.tags?.includes('fast')).random(3).toArray()
|
|
2606
|
+
* ```
|
|
2607
|
+
*/
|
|
2608
|
+
declare class TestCaseCollection<TInput> {
|
|
2609
|
+
private readonly cases;
|
|
2610
|
+
private constructor();
|
|
2611
|
+
/**
|
|
2612
|
+
* Create a collection from an array of test cases.
|
|
2613
|
+
*/
|
|
2614
|
+
static from<T>(cases: TestCase<T>[]): TestCaseCollection<T>;
|
|
2615
|
+
/**
|
|
2616
|
+
* Create an empty collection.
|
|
2617
|
+
*/
|
|
2618
|
+
static empty<T>(): TestCaseCollection<T>;
|
|
2619
|
+
/**
|
|
2620
|
+
* Number of test cases in the collection.
|
|
2621
|
+
*/
|
|
2622
|
+
get length(): number;
|
|
2623
|
+
/**
|
|
2624
|
+
* Whether the collection is empty.
|
|
2625
|
+
*/
|
|
2626
|
+
get isEmpty(): boolean;
|
|
2627
|
+
/**
|
|
2628
|
+
* Returns all test cases.
|
|
2629
|
+
* Returns `this` since the collection is immutable (frozen array).
|
|
2630
|
+
* Useful as explicit starting point in chains.
|
|
2631
|
+
*/
|
|
2632
|
+
all(): TestCaseCollection<TInput>;
|
|
2633
|
+
/**
|
|
2634
|
+
* Returns the first N test cases (default: 1).
|
|
2635
|
+
* Useful for cost-controlled testing during development.
|
|
2636
|
+
*/
|
|
2637
|
+
minimal(count?: number): TestCaseCollection<TInput>;
|
|
2638
|
+
/**
|
|
2639
|
+
* Returns the first N test cases.
|
|
2640
|
+
*/
|
|
2641
|
+
first(count: number): TestCaseCollection<TInput>;
|
|
2642
|
+
/**
|
|
2643
|
+
* Returns the last N test cases (default: 1).
|
|
2644
|
+
* Preserves original order (earlier cases first).
|
|
2645
|
+
*/
|
|
2646
|
+
last(count?: number): TestCaseCollection<TInput>;
|
|
2647
|
+
/**
|
|
2648
|
+
* Returns N random test cases.
|
|
2649
|
+
*
|
|
2650
|
+
* @param count - Number of cases to select
|
|
2651
|
+
* @param options - Optional seed for reproducibility
|
|
2652
|
+
*
|
|
2653
|
+
* @example
|
|
2654
|
+
* ```typescript
|
|
2655
|
+
* // Different each time
|
|
2656
|
+
* collection.random(5)
|
|
2657
|
+
*
|
|
2658
|
+
* // Same result with same seed
|
|
2659
|
+
* collection.random(5, { seed: 42 })
|
|
2660
|
+
* ```
|
|
2661
|
+
*/
|
|
2662
|
+
random(count: number, options?: RandomOptions): TestCaseCollection<TInput>;
|
|
2663
|
+
/**
|
|
2664
|
+
* Filter test cases by predicate.
|
|
2665
|
+
*/
|
|
2666
|
+
filter(predicate: (testCase: TestCase<TInput>) => boolean): TestCaseCollection<TInput>;
|
|
2667
|
+
/**
|
|
2668
|
+
* Find test case by ID.
|
|
2669
|
+
* Returns collection with single case or empty collection.
|
|
2670
|
+
*/
|
|
2671
|
+
byId(id: string): TestCaseCollection<TInput>;
|
|
2672
|
+
/**
|
|
2673
|
+
* Find test cases by multiple IDs.
|
|
2674
|
+
* Preserves order of provided IDs (first occurrence).
|
|
2675
|
+
* Skips non-existent IDs. Duplicate IDs in input are deduplicated.
|
|
2676
|
+
*
|
|
2677
|
+
* @example
|
|
2678
|
+
* ```typescript
|
|
2679
|
+
* collection.byIds(['a', 'b', 'a']) // returns [case-a, case-b] (deduplicated)
|
|
2680
|
+
* collection.byIds(['b', 'a']) // returns [case-b, case-a] (order preserved)
|
|
2681
|
+
* ```
|
|
2682
|
+
*/
|
|
2683
|
+
byIds(ids: string[]): TestCaseCollection<TInput>;
|
|
2684
|
+
/**
|
|
2685
|
+
* Get test case by ID.
|
|
2686
|
+
* Returns undefined if not found.
|
|
2687
|
+
*/
|
|
2688
|
+
get(id: string): TestCase<TInput> | undefined;
|
|
2689
|
+
/**
|
|
2690
|
+
* Get test case by index.
|
|
2691
|
+
* Supports negative indices (e.g., -1 for last item).
|
|
2692
|
+
* Returns undefined if index is out of bounds.
|
|
2693
|
+
*/
|
|
2694
|
+
at(index: number): TestCase<TInput> | undefined;
|
|
2695
|
+
/**
|
|
2696
|
+
* Convert to array.
|
|
2697
|
+
* Returns a mutable copy of the internal array.
|
|
2698
|
+
*/
|
|
2699
|
+
toArray(): TestCase<TInput>[];
|
|
2700
|
+
/**
|
|
2701
|
+
* Iterator support for for...of loops and spread operator.
|
|
2702
|
+
*/
|
|
2703
|
+
[Symbol.iterator](): Iterator<TestCase<TInput>>;
|
|
2704
|
+
}
|
|
2705
|
+
/**
|
|
2706
|
+
* Create a single test case with auto-generated ID if not provided.
|
|
2707
|
+
*
|
|
2708
|
+
* Auto-generated IDs use a global counter: `test-1`, `test-2`, etc.
|
|
2709
|
+
*
|
|
2710
|
+
* @param input - The test case input data
|
|
2711
|
+
* @param id - Optional custom ID (uses auto-generated if omitted)
|
|
2712
|
+
* @returns A TestCase object
|
|
2713
|
+
*
|
|
2714
|
+
* @example
|
|
2715
|
+
* ```typescript
|
|
2716
|
+
* const case1 = testCase({ name: 'Alice' }) // id: 'test-1'
|
|
2717
|
+
* const case2 = testCase({ name: 'Bob' }) // id: 'test-2'
|
|
2718
|
+
* const case3 = testCase({ name: 'Charlie' }, 'custom-id') // id: 'custom-id'
|
|
2719
|
+
* ```
|
|
2720
|
+
*
|
|
2721
|
+
* @remarks
|
|
2722
|
+
* The global counter increments on every call. For deterministic IDs,
|
|
2723
|
+
* provide an explicit ID or use `testCases()` with a prefix.
|
|
2724
|
+
*/
|
|
2725
|
+
declare function testCase<TInput>(input: TInput, id?: string): TestCase<TInput>;
|
|
2726
|
+
/**
|
|
2727
|
+
* Create multiple test cases from inputs.
|
|
2728
|
+
* Auto-generates IDs with optional prefix.
|
|
2729
|
+
*
|
|
2730
|
+
* @example
|
|
2731
|
+
* ```typescript
|
|
2732
|
+
* const cases = testCases([{ name: 'Alice' }, { name: 'Bob' }], 'greet')
|
|
2733
|
+
* // Results in: [{ id: 'greet-0', input: {...} }, { id: 'greet-1', input: {...} }]
|
|
2734
|
+
* ```
|
|
2735
|
+
*/
|
|
2736
|
+
declare function testCases<TInput>(inputs: TInput[], prefix?: string): TestCase<TInput>[];
|
|
2737
|
+
|
|
2738
|
+
export { type AIUserOptions, type AgentMetadata, type AgentPrompt, type AgentResult, type AggregatedMetrics, type ApplyPromptSuggestionsOptions, type ApplySuggestionsResult, type CLIImproverConfig, type CLIJudgeConfig, type CLIMultiTurnTestCase, type CLISingleTurnTestCase, type CLITestCase, type ComponentMetadata, CompositeReporter, ConsoleReporter, type ConsoleReporterOptions, type ContinueResult, type ConversationContext, type ConversationEntry, type CostBreakdown, type CostSummary, type Criterion, type CriterionOptions, type CustomCondition, type CustomCycleCondition, type CycleContext, type CycleContinueResult, type CycleMarkdownOptions, type CycleTerminatedResult, type CycleTerminationCondition, type CycleTerminationResult, type DiscoverOptions, type EvalAgent, type EvalAgentConfig, type EvalConfig, type EvalContext, EvalError, EvalErrorCode, type EvalErrorOptions, type EvalPricingConfig, type EvalReport, type EvalResultKind, type EvalSuite, type EvalSuiteConfig, type EvalTestResult, type EvalTokenUsage, type ExecuteContext, type FieldSetCondition, type FieldValueCondition, type FieldsCondition, type FileContent, type FileContentMetadata, type FileReporterOptions, type FollowUpInput, type HistoryConfig, type HistoryStorage, type ImproveResult, type ImprovementCycleConfig, type ImprovementCycleOptions, type ImprovementCycleResult, type ImprovementHistory, type ImprovementSession, type Improver, type ImproverConfig, type ImproverContext, type ImproverMetadata, type ImproverPrompt, type IterationData, type IterationStats, JsonReporter, type Judge, type JudgeConfig, type JudgeContext, type JudgeMetadata, type JudgePrompt, type JudgeResult, type LLMConfig, type LogCycleOptions, type LogVerbosity, MarkdownReporter, type MarkdownReporterOptions, type MaxCostCondition, type MaxRoundsCondition, type MaxTurnsCondition, type MetricsResult, type MetricsWithCost, type MockAgentConfig, type MockImproverConfig, type MockJudgeConfig, type MultiTurnData, type 
MultiTurnExecuteContext, type MultiTurnExecuteOptions, type MultiTurnIteratedResult, type MultiTurnIterationStats, type MultiTurnResult, type MultiTurnTestCase, type MultiTurnTestResult, type NaturalLanguageConditionOptions, type NoImprovementCondition, type OutputConfig, type RandomOptions, type ReportComparison, type ReportMarkdownOptions, type ReportRunnerOptions, type ReportSummary, type Reporter, type RoundCost, type RoundDecision, type RoundResult, type RoundYield, type RunConfig, type RunOptions, type SaveCycleJsonOptions, type SchemaOptions, type SchemaValidationResult, type SerializedPrompt, type SerializedRoundResult, type SessionConfig, type SingleTurnIteratedResult, type SingleTurnResult, type Suggestion, type TargetScoreCondition, type TerminatedResult, type TerminationCheckResult, type TerminationCondition, type TerminationInfo, type TestCase, TestCaseCollection, type TestResult, type TestResultWithCost, type TestResultWithVerdict, type ValidatorCriterion, type ValidatorFn, type Verdict, type ZodIssue, accuracy, addCostsToResults, afterTurns, aggregateIterationResults, aiUser, and$1 as and, applyPromptSuggestions, bumpVersion, calculateAvgPassRate, calculateAvgStdDev, calculateIterationStats, calculateMultiTurnIterationStats, calculateReportCosts, calculateResultCost, checkCondition, checkCycleCondition, checkCycleTermination, checkTermination, compareReports, consistency, createCompositeReporter, createConsoleReporter, createDefaultReporter, createEvalSuite, createImprover, createJsonReporter, createJudge, createMarkdownReporter, createMockAgent, createMockImprover, createMockJudge, createReportRunner, createSession, customCondition, and as cycleAnd, not as cycleNot, or as cycleOr, cycleToMarkdown, defaultHistoryStorage, defineConfig, deserializePrompt, discoverEvalFiles, executeMultiTurnTestCase, executeTestCase, fieldEquals, fieldIsSet, getFieldValue, isCustomCondition, isCustomCycleCondition, isCycleTerminated, isFieldSetCondition, 
isFieldValueCondition, isIteratedResult, isMaxCostCondition, isMaxRoundsCondition, isMaxTurnsCondition, isMultiTurnResult, isMultiTurnTestCase, isNoImprovementCondition, isSingleTurnResult, isTargetScoreCondition, isTerminated, loadHistory, logCycle, maxCost, maxRounds, naturalLanguage, noImprovement, not$1 as not, or$1 as or, relevance, reportToMarkdown, resumeSession, runImprovementCycle, runImprovementCycleAuto, runWithConcurrency, saveCycleJson, saveCycleMarkdown, saveHistory, saveReportMarkdown, schema, selectRepresentativeResult, serializePrompt, suggestionDiff, suggestionPreview, suggestionSummary, targetScore, testCase, testCases, toEvalAgent };
|