@agtlantis/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2738 @@
1
+ import { Provider, ProviderType, ProviderPricing } from '@agtlantis/core';
2
+ export { ANTHROPIC_PRICING, CostResult, DEFAULT_PRICING_CONFIG, FilePromptRepositoryOptions, FileSource, FileSourceBase64, FileSourceData, FileSourceDisplayInfo, FileSourcePath, FileSourceUrl, FileSystem, FoundFileSource, GOOGLE_PRICING, ModelPricing, OPENAI_PRICING, PricingConfig, PromptRepository, PromptTemplate, ResolveOptions, calculateCostFromUsage, compileTemplate, createFilePromptRepository, getFileSourceDisplayInfo, getFileSourcesDisplayInfo, inferMediaType, isFileSource, isFileSourceBase64, isFileSourceData, isFileSourcePath, isFileSourceUrl, resolveFileSource, resolveFileSourcesInInput, scanForFileSources } from '@agtlantis/core';
3
+ export { MockCall, MockProvider, mock } from '@agtlantis/core/testing';
4
+ import { z } from 'zod';
5
+
6
/**
 * Simplified token usage type for eval package.
 *
 * This is a subset of AI SDK's LanguageModelUsage that only includes
 * the properties eval actually tracks. The cost-helpers module handles
 * conversion when calling @agtlantis/core's pricing calculator.
 *
 * @example
 * ```typescript
 * const usage: EvalTokenUsage = {
 *   inputTokens: 100,
 *   outputTokens: 50,
 *   totalTokens: 150,
 * }
 * ```
 */
interface EvalTokenUsage {
    /** Number of input (prompt) tokens */
    inputTokens: number;
    /** Number of output (completion) tokens */
    outputTokens: number;
    /** Total tokens (input + output, as in the example above) */
    totalTokens: number;
}
30
/**
 * Simplified agent configuration for evaluation.
 * Only requires fields needed for eval purposes.
 *
 * For agents from `ai-agents` package with full AgentConfig,
 * use `toEvalAgent()` adapter to convert them.
 */
interface EvalAgentConfig {
    /** Agent name for identification */
    name: string;
    /** Agent description (used by Judge for context) */
    description?: string;
    /** Additional custom fields */
    [key: string]: unknown;
}
/**
 * Agent prompt template.
 *
 * Identified by `id` + `version` so reports can record exactly which
 * prompt revision produced a result.
 */
interface AgentPrompt<TInput> {
    /** Prompt unique ID for version tracking */
    id: string;
    /** Version string (e.g., "1.0.0") */
    version: string;
    /** System prompt */
    system: string;
    /** User template string (for serialization/history) */
    userTemplate?: string;
    /** User prompt builder function */
    renderUserPrompt: (input: TInput) => string;
    /** Additional custom fields */
    [key: string]: unknown;
}
62
/**
 * Base metadata type shared by all LLM-using components (Agent, Judge, Improver).
 * Provides consistent structure for tracking token usage and model information.
 *
 * @example
 * ```typescript
 * const metadata: ComponentMetadata = {
 *   tokenUsage: { inputTokens: 100, outputTokens: 50, totalTokens: 150 },
 *   model: 'gpt-4o',
 * }
 * ```
 */
interface ComponentMetadata {
    /** Token usage from the LLM call (AI SDK LanguageModelUsage format) */
    tokenUsage?: EvalTokenUsage;
    /** Model identifier used for the LLM call */
    model?: string;
    /** Additional custom fields */
    [key: string]: unknown;
}
/**
 * Agent execution result metadata.
 * Extends ComponentMetadata with agent-specific fields.
 */
interface AgentMetadata extends ComponentMetadata {
    /** Prompt version used for execution */
    promptVersion?: string;
    /** Execution duration in milliseconds */
    duration?: number;
}
/**
 * Judge evaluation metadata.
 * Tracks token usage and model for cost calculation.
 * Adds no fields of its own; the distinct name documents intent at use sites.
 */
interface JudgeMetadata extends ComponentMetadata {
}
/**
 * Improver analysis metadata.
 * Tracks token usage and model for cost calculation.
 * Adds no fields of its own; the distinct name documents intent at use sites.
 */
interface ImproverMetadata extends ComponentMetadata {
}
104
/**
 * Agent execution result.
 */
interface AgentResult<TOutput> {
    /** The agent's output value */
    result: TOutput;
    /** Execution metadata (token usage, model, duration), when available */
    metadata?: AgentMetadata;
}
/**
 * Simplified Agent interface for evaluation.
 *
 * @example
 * ```typescript
 * // Direct implementation
 * const myAgent: EvalAgent<string, string> = {
 *   config: { name: 'MyAgent', description: 'A simple agent' },
 *   prompt: { id: 'prompt-1', version: '1.0.0', system: '...', renderUserPrompt: (input) => input },
 *   execute: async (input) => ({ result: `Processed: ${input}` })
 * }
 *
 * // Or adapt from full ai-agents Agent
 * const evalAgent = toEvalAgent(fullAgent)
 * ```
 */
interface EvalAgent<TInput, TOutput> {
    /** Minimal agent configuration (name, optional description) */
    readonly config: EvalAgentConfig;
    /** Prompt the agent runs with; its version is recorded in reports */
    readonly prompt: AgentPrompt<TInput>;
    /** Run the agent on one input. `options` shape is implementation-defined. */
    execute(input: TInput, options?: unknown): Promise<AgentResult<TOutput>>;
}
132
/**
 * Full AgentConfig interface (compatible with ai-agents package).
 * Used for type-safe adaptation.
 */
interface FullAgentConfig {
    name: string;
    role: 'generator' | 'analyzer' | 'validator' | 'enhancer';
    streaming: 'required' | 'optional' | 'none';
    execution: 'batch' | 'realtime';
    conversation?: 'single-turn' | 'multi-turn';
    description?: string;
    [key: string]: unknown;
}
/**
 * Full Agent interface (compatible with ai-agents package).
 * Used for type-safe adaptation. Note that unlike EvalAgent, `metadata`
 * is required here and always carries `duration` and `promptVersion`.
 */
interface FullAgent<TInput, TOutput> {
    readonly config: FullAgentConfig;
    readonly prompt: AgentPrompt<TInput>;
    execute(input: TInput, options?: unknown): Promise<{
        result: TOutput;
        metadata: {
            duration: number;
            promptVersion: string;
            tokenUsage?: EvalTokenUsage;
            model?: string;
            retryCount?: number;
            traceId?: string;
            [key: string]: unknown;
        };
    }>;
}
/**
 * Adapts a full Agent (from ai-agents) to EvalAgent for evaluation.
 * Extracts only the fields needed for evaluation.
 *
 * @example
 * ```typescript
 * import { scenarioGenerator } from './agents/mce'
 *
 * const evalAgent = toEvalAgent(scenarioGenerator)
 * const suite = createEvalSuite({ agent: evalAgent, ... })
 * ```
 */
declare function toEvalAgent<TInput, TOutput>(agent: FullAgent<TInput, TOutput>): EvalAgent<TInput, TOutput>;
178
/**
 * Metadata for file content.
 */
interface FileContentMetadata {
    /** File size in bytes */
    size?: number;
    /** Full resolved path (for loaded files) */
    fullPath?: string;
    /** Whether the content was created inline (not from disk) */
    inline?: boolean;
    /** Additional custom metadata */
    [key: string]: unknown;
}
/**
 * A text file passed as context to the Judge (see TestCase.files).
 */
interface FileContent {
    /** File path (relative or absolute) - used as identifier */
    path: string;
    /** File content as string (text files only for Phase 5.3) */
    content: string;
    /** Optional MIME type hint (defaults to 'text/plain') */
    mediaType?: string;
    /** Optional encoding (defaults to 'utf-8') */
    encoding?: BufferEncoding;
    /** Optional metadata (e.g., original size, full path, etc.) */
    metadata?: FileContentMetadata;
}
/**
 * One evaluation test case: the input to run plus optional identification,
 * tagging, and expected-output hints for the Judge.
 */
interface TestCase<TInput> {
    /** Stable identifier used in reports and comparisons */
    id?: string;
    /** Input passed to the agent's execute() */
    input: TInput;
    /** Free-form tags for filtering/grouping */
    tags?: string[];
    /** Human-readable description of what this case checks */
    description?: string;
    /** Reference output the Judge may compare against */
    expectedOutput?: unknown;
    /**
     * Optional file context for agent and judge (Phase 5.3).
     * Files are passed to Judge for evaluation context.
     * For Agent access, include files in the input type directly.
     *
     * @deprecated Use FileSource in input directly for flexible file handling
     */
    files?: FileContent[];
}
218
/**
 * Per-test performance metrics captured during execution.
 */
interface MetricsResult {
    /** Wall-clock execution latency in milliseconds */
    latencyMs: number;
    /** Token usage for the test run */
    tokenUsage: EvalTokenUsage;
}
/**
 * A single evaluation criterion scored by the Judge.
 */
interface Criterion {
    /** Unique criterion identifier (referenced by Verdict.criterionId) */
    id: string;
    /** Short display name */
    name: string;
    /** What the Judge should assess for this criterion */
    description: string;
    /** Relative weight when aggregating the overall score */
    weight?: number;
}
/**
 * Zod error issue - minimal type compatible with ZodError.errors.
 * Using `readonly` and rest index to be compatible with Zod's discriminated union.
 */
type ZodIssue = {
    readonly code: string;
    readonly path: readonly (string | number)[];
    readonly message: string;
};
/**
 * Result of programmatic schema validation.
 */
interface SchemaValidationResult {
    /** Whether the output matches the schema */
    valid: boolean;
    /** Validation errors if invalid (Zod issue format) */
    errors?: readonly ZodIssue[];
    /** Human-readable error summary */
    errorSummary?: string;
}
/**
 * Validator function type for programmatic validation.
 * Returns validation result with binary pass/fail outcome.
 */
type ValidatorFn = (output: unknown) => SchemaValidationResult;
/**
 * Extended criterion with optional programmatic validator.
 * Validators run before LLM evaluation with binary scoring (0 or 100).
 *
 * @example
 * ```typescript
 * import { z } from 'zod'
 * import { schema } from '@agtlantis/eval'
 *
 * const criterion = schema({
 *   schema: z.object({ name: z.string() }),
 *   weight: 2,
 * })
 * ```
 */
interface ValidatorCriterion extends Criterion {
    /**
     * Optional programmatic validator.
     * If provided and fails, score is automatically 0.
     * If provided and passes, score is automatically 100.
     */
    validator?: ValidatorFn;
}
/**
 * The Judge's scored verdict for a single criterion.
 */
interface Verdict {
    /** Criterion this verdict refers to (matches Criterion.id) */
    criterionId: string;
    /** Score for this criterion (validator-backed criteria are exactly 0 or 100) */
    score: number;
    /** Judge's explanation for the score */
    reasoning: string;
    /** Whether this criterion is considered passed */
    passed: boolean;
}
282
/**
 * Raw result of executing one test case, before judging.
 */
interface TestResult<TInput, TOutput> {
    /** The test case that was executed */
    testCase: TestCase<TInput>;
    /** Agent output */
    output: TOutput;
    /** Latency and token usage for the run */
    metrics: MetricsResult;
    /** Execution error, if the run failed */
    error?: Error;
}
/**
 * Test result augmented with the Judge's evaluation.
 */
interface TestResultWithVerdict<TInput, TOutput> extends TestResult<TInput, TOutput> {
    /** Per-criterion verdicts from the Judge */
    verdicts: Verdict[];
    /** Aggregated score across criteria */
    overallScore: number;
    /** Whether the test passed overall */
    passed: boolean;
    /** Judge metadata for cost tracking */
    judgeMetadata?: JudgeMetadata;
}
/**
 * Statistics from running the same test multiple times.
 * Used to measure consistency and reliability of LLM-based agents.
 */
interface IterationStats {
    /** Total number of iterations run */
    iterations: number;
    /** Score from each iteration */
    scores: number[];
    /** Average score across all iterations */
    mean: number;
    /** Standard deviation (lower = more consistent) */
    stdDev: number;
    /** Lowest score achieved */
    min: number;
    /** Highest score achieved */
    max: number;
    /** Pass rate as decimal (0-1, e.g., 0.67 = 67%) */
    passRate: number;
    /** Number of iterations that passed */
    passCount: number;
}
317
/**
 * Extended iteration statistics for multi-turn tests.
 * Includes turn-count metrics and termination type distribution.
 *
 * @example
 * ```typescript
 * if (hasMultiTurnIterationData(result)) {
 *   console.log(`Average turns: ${result.multiTurnIterationStats.avgTurns}`)
 *   console.log(`Termination types: ${JSON.stringify(result.multiTurnIterationStats.terminationCounts)}`)
 * }
 * ```
 */
interface MultiTurnIterationStats extends IterationStats {
    /** Average number of turns across all iterations */
    avgTurns: number;
    /** Minimum turns in any iteration */
    minTurns: number;
    /** Maximum turns in any iteration */
    maxTurns: number;
    /** Distribution of termination types across iterations (e.g., { condition: 2, maxTurns: 1 }) */
    terminationCounts: Record<string, number>;
}
/**
 * Discriminator for eval result types.
 * Used for exhaustive pattern matching on result variants.
 */
type EvalResultKind = 'single-turn' | 'single-turn-iterated' | 'multi-turn' | 'multi-turn-iterated';
/**
 * Properties present when test ran with multiple iterations.
 * Extracted as a separate interface for composition.
 */
interface IterationData<TInput, TOutput> {
    /** Aggregated statistics across all iterations */
    iterationStats: IterationStats;
    /** Individual results from each iteration */
    iterationResults: TestResultWithVerdict<TInput, TOutput>[];
}
/**
 * Single conversation entry in multi-turn tests.
 */
interface ConversationEntry<TInput, TOutput> {
    /** Turn number (1-based) */
    turn: number;
    /** Input provided for this turn */
    input: TInput;
    /** Output from agent (undefined if execution failed) */
    output: TOutput | undefined;
    /** Agent execution metadata */
    metadata?: AgentMetadata;
}
/**
 * Termination info for multi-turn tests.
 * Compatible with TerminationCheckResult from multi-turn module.
 */
interface TerminationInfo {
    /** Whether the conversation terminated */
    terminated: boolean;
    /** Human-readable reason for termination */
    reason: string;
    /** Type of termination (condition, maxTurns, error, exhausted) */
    terminationType?: string;
    /** The condition that caused termination (if applicable) */
    matchedCondition?: unknown;
}
/**
 * Properties present for multi-turn test results.
 * Extracted as a separate interface for composition.
 */
interface MultiTurnData<TInput, TOutput> {
    /** Full conversation history */
    conversationHistory: ConversationEntry<TInput, TOutput>[];
    /** Total turns executed */
    totalTurns: number;
    /** Human-readable termination reason */
    terminationReason: string;
    /** Full termination check result */
    termination: TerminationInfo;
}
/**
 * Single-turn test result with single iteration (base case).
 * No iteration stats, no multi-turn data.
 */
interface SingleTurnResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput> {
    readonly kind: 'single-turn';
}
/**
 * Single-turn test result with multiple iterations.
 * Has iteration stats but no multi-turn data.
 */
interface SingleTurnIteratedResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput>, IterationData<TInput, TOutput> {
    readonly kind: 'single-turn-iterated';
}
/**
 * Multi-turn test result with single iteration.
 * Has multi-turn data but no iteration stats.
 */
interface MultiTurnResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput>, MultiTurnData<TInput, TOutput> {
    readonly kind: 'multi-turn';
}
/**
 * Multi-turn test result with multiple iterations.
 * Has both multi-turn data and iteration stats.
 */
interface MultiTurnIteratedResult<TInput, TOutput> extends TestResultWithVerdict<TInput, TOutput>, IterationData<TInput, TOutput>, MultiTurnData<TInput, TOutput> {
    readonly kind: 'multi-turn-iterated';
    /** Multi-turn specific iteration statistics */
    multiTurnIterationStats: MultiTurnIterationStats;
}
/**
 * Unified eval result type - discriminated union of all result kinds.
 *
 * Use pattern matching on `kind` for exhaustive handling:
 * @example
 * ```typescript
 * switch (result.kind) {
 *   case 'single-turn':
 *     // No iteration stats, no multi-turn data
 *     break
 *   case 'single-turn-iterated':
 *     console.log(result.iterationStats.mean) // Type-safe
 *     break
 *   case 'multi-turn':
 *     console.log(result.conversationHistory) // Type-safe
 *     break
 *   case 'multi-turn-iterated':
 *     console.log(result.multiTurnIterationStats.avgTurns) // Type-safe
 *     break
 * }
 * ```
 */
type EvalTestResult<TInput, TOutput> = SingleTurnResult<TInput, TOutput> | SingleTurnIteratedResult<TInput, TOutput> | MultiTurnResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>;
448
/**
 * Check if result is from a single-turn test (either iterated or not).
 *
 * @example
 * ```typescript
 * if (isSingleTurnResult(result)) {
 *   // result is SingleTurnResult | SingleTurnIteratedResult
 *   console.log('Single turn test')
 * }
 * ```
 */
declare function isSingleTurnResult<TInput, TOutput>(result: EvalTestResult<TInput, TOutput>): result is SingleTurnResult<TInput, TOutput> | SingleTurnIteratedResult<TInput, TOutput>;
/**
 * Check if result is from a multi-turn test (either iterated or not).
 *
 * @example
 * ```typescript
 * if (isMultiTurnResult(result)) {
 *   // result is MultiTurnResult | MultiTurnIteratedResult
 *   console.log(`Turns: ${result.totalTurns}`) // Type-safe
 *   for (const entry of result.conversationHistory) { // Type-safe
 *     console.log(`Turn ${entry.turn}: ${entry.input}`)
 *   }
 * }
 * ```
 */
declare function isMultiTurnResult<TInput, TOutput>(result: EvalTestResult<TInput, TOutput>): result is MultiTurnResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>;
/**
 * Check if result has iteration data (multiple iterations ran).
 *
 * @example
 * ```typescript
 * if (isIteratedResult(result)) {
 *   // result is SingleTurnIteratedResult | MultiTurnIteratedResult
 *   console.log(`Mean score: ${result.iterationStats.mean}`) // Type-safe
 *   console.log(`Pass rate: ${result.iterationStats.passRate}`) // Type-safe
 * }
 * ```
 */
declare function isIteratedResult<TInput, TOutput>(result: EvalTestResult<TInput, TOutput>): result is SingleTurnIteratedResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>;
488
+
489
/**
 * Context passed to JudgePrompt.renderUserPrompt().
 */
interface JudgeContext {
    /** Human-readable description of the agent under evaluation */
    agentDescription: string;
    /** Input that was given to the agent */
    input: unknown;
    /** Output the agent produced */
    output: unknown;
    /** Criteria the Judge scores against */
    criteria: Criterion[];
    /** Optional file context from the test case */
    files?: FileContent[];
}
/**
 * Context for evaluating agent output.
 *
 * @example
 * ```typescript
 * const result = await judge.evaluate({
 *   input: { query: 'Hello' },
 *   output: { response: 'Hi there!' },
 *   agentDescription: 'A friendly chatbot',
 *   files: [{ path: 'context.md', content: '...' }],
 * })
 * ```
 */
interface EvalContext {
    /** Input that was given to the agent */
    input: unknown;
    /** Output the agent produced */
    output: unknown;
    /** Human-readable description of the agent under evaluation */
    agentDescription: string;
    /** Optional file context for evaluation */
    files?: FileContent[];
}
/**
 * Outcome of a single Judge evaluation.
 */
interface JudgeResult {
    /** Per-criterion verdicts */
    verdicts: Verdict[];
    /** Aggregated score across criteria */
    overallScore: number;
    /** Whether the evaluation passed overall */
    passed: boolean;
    /** Token usage / model info for cost tracking */
    metadata?: JudgeMetadata;
}
/**
 * Prompt template driving the LLM judge, versioned like AgentPrompt.
 */
interface JudgePrompt {
    id: string;
    version: string;
    system: string;
    renderUserPrompt: (context: JudgeContext) => string;
}
/**
 * Configuration for an LLM-as-Judge evaluator.
 */
interface JudgeConfig {
    /** LLM provider used to run evaluations */
    provider: Provider;
    /** Custom judge prompt (presumably a default is used when omitted — confirm in createJudge) */
    prompt?: JudgePrompt;
    /** Criteria to score against */
    criteria: Criterion[];
    /** Minimum overall score required to pass (default not visible here — TODO confirm) */
    passThreshold?: number;
    /** Model name for cost tracking (e.g., 'gpt-4o', 'gemini-2.5-flash') */
    model?: string;
}
/**
 * LLM-as-Judge evaluator interface.
 *
 * @example
 * ```typescript
 * const judge = createJudge({ llm, prompt, criteria })
 *
 * const result = await judge.evaluate({
 *   input: { query: 'What is 2+2?' },
 *   output: { answer: '4' },
 *   agentDescription: 'A math tutor agent',
 *   files: [{ path: 'reference.md', content: '...' }],
 * })
 * ```
 */
interface Judge {
    evaluate(context: EvalContext): Promise<JudgeResult>;
}
556
+
557
/**
 * Metrics aggregated across all test results in a report.
 */
interface AggregatedMetrics {
    /** Average latency per test in milliseconds */
    avgLatencyMs: number;
    /** Sum of tokens across all tests */
    totalTokens: number;
    /** Total estimated cost, when pricing info is available */
    totalEstimatedCost?: number;
}
/**
 * A prompt-improvement suggestion produced by the Improver.
 * Pure data interface - use utility functions for operations.
 *
 * @example
 * ```typescript
 * for (const suggestion of report.suggestions) {
 *   console.log(suggestionDiff(suggestion))
 *   console.log(suggestionPreview(suggestion))
 *   suggestion.approved = true
 * }
 *
 * const newPrompt = applyPromptSuggestions(agent.prompt, report.suggestions)
 * ```
 */
interface Suggestion {
    /** Which part of the prompt/config the suggestion targets */
    type: 'system_prompt' | 'user_prompt' | 'parameters';
    priority: 'high' | 'medium' | 'low';
    /** Current value of the targeted section */
    currentValue: string;
    /** Proposed replacement value */
    suggestedValue: string;
    /** Why the Improver suggests this change */
    reasoning: string;
    /** What improvement the change is expected to bring */
    expectedImprovement: string;
    /** Set by a reviewer to accept the suggestion */
    approved?: boolean;
    /** Set when the suggested value was edited after generation */
    modified?: boolean;
}
/**
 * Result of an Improver analysis.
 */
interface ImproveResult {
    suggestions: Suggestion[];
    metadata?: ImproverMetadata;
}
/**
 * Context passed to ImproverPrompt.renderUserPrompt().
 */
interface ImproverContext {
    agentPrompt: AgentPrompt<any>;
    evaluatedResults: EvalTestResult<any, any>[];
    aggregatedMetrics: AggregatedMetrics;
}
/**
 * Prompt template driving the Improver LLM, versioned like AgentPrompt.
 */
interface ImproverPrompt {
    id: string;
    version: string;
    system: string;
    renderUserPrompt: (context: ImproverContext) => string;
}
/**
 * Configuration for creating an Improver.
 */
interface ImproverConfig {
    /** LLM provider used to generate suggestions */
    provider: Provider;
    /** Custom improver prompt (presumably a default is used when omitted — confirm in factory) */
    prompt?: ImproverPrompt;
    /** Model name for cost tracking (e.g., 'gpt-4o', 'gemini-2.5-flash') */
    model?: string;
}
/**
 * Generates prompt-improvement suggestions from evaluation results.
 */
interface Improver {
    improve(agentPrompt: AgentPrompt<any>, results: EvalTestResult<any, any>[]): Promise<ImproveResult>;
}
610
+
611
/** Cost breakdown by component (Agent, Judge, Improver) */
interface CostBreakdown {
    /** Estimated agent cost */
    agent?: number;
    /** Estimated judge cost */
    judge?: number;
    /** Estimated improver cost */
    improver?: number;
    /** Sum of the component costs */
    total?: number;
}
/** Cost summary aggregated across all test results */
interface CostSummary {
    /** Total estimated cost for the report */
    total: number;
    /** Per-component totals (improver only present when an Improver ran) */
    byComponent: {
        agent: number;
        judge: number;
        improver?: number;
    };
}
/**
 * MetricsResult extended with a per-result cost breakdown.
 */
interface MetricsWithCost {
    latencyMs: number;
    tokenUsage: EvalTokenUsage;
    costBreakdown: CostBreakdown;
}
/** Test result with cost breakdown, returned by addCostsToResults() */
interface TestResultWithCost<TInput, TOutput> {
    testCase: {
        id?: string;
        input: TInput;
        tags?: string[];
        description?: string;
        expectedOutput?: unknown;
    };
    output: TOutput;
    metrics: MetricsWithCost;
    error?: Error;
    verdicts: Array<{
        criterionId: string;
        score: number;
        reasoning: string;
        passed: boolean;
    }>;
    overallScore: number;
    passed: boolean;
}
/** Pricing configuration for eval */
interface EvalPricingConfig {
    /** Provider-specific pricing overrides. Key is provider name (e.g., 'google', 'openai'), value is model pricing. */
    providerPricing?: Partial<Record<ProviderType, ProviderPricing>>;
}
/** Minimal result interface compatible with TestResultWithVerdict and TestResultWithIteration */
interface ResultForCostCalculation<TInput, TOutput> {
    testCase: {
        id?: string;
        input: TInput;
        tags?: string[];
        description?: string;
        expectedOutput?: unknown;
    };
    output: TOutput;
    metrics: {
        latencyMs: number;
        tokenUsage: EvalTokenUsage;
    };
    error?: Error;
    verdicts: Array<{
        criterionId: string;
        score: number;
        reasoning: string;
        passed: boolean;
    }>;
    overallScore: number;
    passed: boolean;
    /** Agent-side usage/model info used to price the agent call */
    agentMetadata?: {
        tokenUsage?: EvalTokenUsage;
        model?: string;
        provider?: string;
    };
    /** Judge-side usage/model info used to price the judge call */
    judgeMetadata?: {
        tokenUsage?: EvalTokenUsage;
        model?: string;
        provider?: string;
    };
}
/** Minimal report shape accepted by calculateReportCosts(). */
interface ReportForCostCalculation<TInput, TOutput> {
    results: ResultForCostCalculation<TInput, TOutput>[];
}
/** Compute the cost breakdown for a single result from its token usage and model info. */
declare function calculateResultCost<TInput, TOutput>(result: ResultForCostCalculation<TInput, TOutput>, config?: EvalPricingConfig): CostBreakdown;
/** Aggregate per-result costs into a report-wide summary. */
declare function calculateReportCosts<TInput, TOutput>(report: ReportForCostCalculation<TInput, TOutput>, config?: EvalPricingConfig): CostSummary;
/** Add cost breakdown to each result. Returns new array (does not mutate original). */
declare function addCostsToResults<TInput, TOutput>(results: ResultForCostCalculation<TInput, TOutput>[], config?: EvalPricingConfig): TestResultWithCost<TInput, TOutput>[];
699
+
700
/**
 * Reporter interface for saving/logging evaluation reports.
 *
 * @example
 * ```typescript
 * const reporter = createJsonReporter('./reports')
 * reporter.save(report, 'my-test') // → ./reports/my-test-1736691234567.json
 * ```
 */
interface Reporter<TInput = unknown, TOutput = unknown> {
    /** Save report to file, returns file path (optional - not all reporters save files) */
    save?(report: EvalReport<TInput, TOutput>, name: string): string;
    /** Log report to console (optional) */
    log?(report: EvalReport<TInput, TOutput>): void;
}
/**
 * Common options for file-based reporters.
 */
interface FileReporterOptions {
    /** Output directory (created if missing) */
    outputDir: string;
    /** Pricing config for cost calculation */
    pricing?: EvalPricingConfig;
    /** Add timestamp to filename (default: true) */
    addTimestamp?: boolean;
}
/**
 * Verbosity level for console output.
 */
type LogVerbosity = 'summary' | 'detailed' | 'full';
/**
 * Options for ConsoleReporter.
 */
interface ConsoleReporterOptions {
    /** Verbosity level (default: 'summary') */
    verbosity?: LogVerbosity;
    /** Pricing config for cost display */
    pricing?: EvalPricingConfig;
}
/**
 * High-level summary of one evaluation run.
 */
interface ReportSummary {
    /** Number of test cases in the run */
    totalTests: number;
    /** Count of passing tests */
    passed: number;
    /** Count of failing tests */
    failed: number;
    /** Average overall score across tests */
    avgScore: number;
    /** Aggregated latency/token/cost metrics */
    metrics: AggregatedMetrics;
    /** Number of iterations run per test case (only present when iterations > 1) */
    iterations?: number;
    /** Average standard deviation across all tests */
    avgStdDev?: number;
    /** Average pass rate across all tests */
    avgPassRate?: number;
    /** Cost summary (set by CLI or manually via calculateReportCosts) */
    costSummary?: CostSummary;
}
/**
 * Evaluation report data.
 * Pure data interface - use utility functions for operations.
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases)
 *
 * // Convert to markdown
 * const markdown = reportToMarkdown(report)
 *
 * // Save to file
 * await saveReportMarkdown(report, './reports/eval-report.md')
 * ```
 */
interface EvalReport<TInput, TOutput> {
    /** Aggregate summary over all results */
    summary: ReportSummary;
    /** Results - may include iteration stats when iterations > 1 */
    results: EvalTestResult<TInput, TOutput>[];
    /** Improver suggestions (empty when no Improver was configured — confirm in suite) */
    suggestions: Suggestion[];
    /** When the report was generated */
    generatedAt: Date;
    /** Agent prompt version the run was executed against */
    promptVersion: string;
}
/**
 * Options for markdown report generation.
 */
interface ReportMarkdownOptions {
    /** Include passed test details (default: false, collapsed) */
    expandPassedTests?: boolean;
    /** Include raw JSON output (default: false) */
    includeRawOutput?: boolean;
    /** Max length for output preview (default: 200) */
    outputPreviewLength?: number;
}
/**
 * Result of comparing two evaluation reports.
 * Useful for tracking improvements across prompt versions.
 *
 * @example
 * ```typescript
 * const beforeReport = await suite.run(testCases)
 * const afterReport = await suite.withAgent(improvedAgent).run(testCases)
 * const comparison = compareReports(beforeReport, afterReport)
 *
 * console.log(`Score delta: ${comparison.scoreDelta}`)
 * console.log(`Improved tests: ${comparison.improved.join(', ')}`)
 * ```
 */
interface ReportComparison {
    /** Change in average score (positive = improvement) */
    scoreDelta: number;
    /** Change in pass rate (positive = improvement) */
    passRateDelta: number;
    /** Changes in performance metrics */
    metricsDelta: {
        /** Change in average latency (ms) */
        latencyMs: number;
        /** Change in total token usage */
        tokenUsage: number;
    };
    /** Test IDs that improved (score increased) */
    improved: string[];
    /** Test IDs that regressed (score decreased) */
    regressed: string[];
    /** Test IDs that were removed (in before but not in after) */
    removed: string[];
}
821
+
822
/**
 * Options for running test cases.
 */
interface RunOptions {
    /** Maximum number of concurrent test case executions. Defaults to 1 (sequential). */
    concurrency?: number;
    /** Stop execution after the first test failure. Defaults to false. */
    stopOnFirstFailure?: boolean;
    /** AbortSignal for cancelling execution */
    signal?: AbortSignal;
    /**
     * Number of times to run each test case. Defaults to 1.
     * When > 1, results include iteration statistics (mean, stdDev, passRate).
     */
    iterations?: number;
}
/**
 * Context required for executing a single test case.
 * @internal
 */
interface ExecuteContext<TInput, TOutput> {
    /** Agent under test */
    agent: EvalAgent<TInput, TOutput>;
    /** Judge used to score the agent's output */
    judge: Judge;
    /** Description of the agent passed to the Judge for context */
    agentDescription: string;
}
/**
 * Executes a single test case and returns the result with verdict.
 *
 * Flow:
 * 1. Execute agent with test input
 * 2. Measure execution latency
 * 3. Collect token usage from agent metadata
 * 4. Evaluate output using Judge
 * 5. Return combined result with verdicts
 *
 * @example
 * ```typescript
 * const result = await executeTestCase(
 *   { id: 'test-1', input: { query: 'Hello' } },
 *   { agent: myAgent, judge: myJudge, agentDescription: 'A friendly bot' }
 * )
 *
 * console.log(result.passed) // true/false
 * console.log(result.overallScore) // 0-100
 * console.log(result.verdicts) // Verdict[]
 * ```
 */
declare function executeTestCase<TInput, TOutput>(testCase: TestCase<TInput>, context: ExecuteContext<TInput, TOutput>, signal?: AbortSignal): Promise<SingleTurnResult<TInput, TOutput>>;
/**
 * Runs multiple test cases with configurable concurrency.
 *
 * Features:
 * - Parallel execution with concurrency limit
 * - Stop on first failure option
 * - AbortSignal support for cancellation
 *
 * @example
 * ```typescript
 * const results = await runWithConcurrency(
 *   testCases,
 *   { agent: myAgent, judge: myJudge, agentDescription: 'Test agent' },
 *   { concurrency: 5, stopOnFirstFailure: false }
 * )
 *
 * console.log(`Passed: ${results.filter(r => r.passed).length}`)
 * console.log(`Failed: ${results.filter(r => !r.passed).length}`)
 * ```
 */
declare function runWithConcurrency<TInput, TOutput>(testCases: TestCase<TInput>[], context: ExecuteContext<TInput, TOutput>, options?: RunOptions): Promise<EvalTestResult<TInput, TOutput>[]>;
891
+
892
/**
 * Configuration for creating an EvalSuite.
 *
 * @example
 * ```typescript
 * const suite = createEvalSuite({
 *   agent: myAgent,
 *   judge: myJudge,
 *   agentDescription: 'Recommends career paths based on student profiles',
 * })
 * ```
 */
interface EvalSuiteConfig<TInput, TOutput> {
    /** The agent to evaluate */
    agent: EvalAgent<TInput, TOutput>;
    /** Human-readable description of what the agent does (used by Judge) */
    agentDescription?: string;
    /** Judge instance for evaluating agent outputs */
    judge: Judge;
    /** Improver instance for generating prompt improvement suggestions (optional) */
    improver?: Improver;
}
/**
 * Evaluation suite for running test cases against an agent.
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases, { concurrency: 3 })
 * console.log(reportToMarkdown(report))
 *
 * // Test with a different agent
 * const newReport = await suite.withAgent(improvedAgent).run(testCases)
 * ```
 */
interface EvalSuite<TInput, TOutput> {
    /**
     * Run test cases and generate an evaluation report.
     *
     * @param testCases - Test cases to run
     * @param options - Run options (concurrency, stopOnFirstFailure, signal)
     * @returns Evaluation report with results, summary, and suggestions
     */
    run(testCases: TestCase<TInput>[], options?: RunOptions): Promise<EvalReport<TInput, TOutput>>;
    /**
     * Create a new suite with a different agent.
     * Useful for A/B testing or testing prompt improvements.
     *
     * @param agent - New agent to use
     * @returns New EvalSuite instance with the updated agent
     */
    withAgent(agent: EvalAgent<TInput, TOutput>): EvalSuite<TInput, TOutput>;
}
944
+ /**
945
+ * Create an evaluation suite for testing an agent.
946
+ *
947
+ * The suite orchestrates test execution, evaluation, and optional
948
+ * prompt improvement suggestions.
949
+ *
950
+ * @example
951
+ * ```typescript
952
+ * const suite = createEvalSuite({
953
+ * agent: scenarioGenerator,
954
+ * agentDescription: 'Recommends majors based on student profiles',
955
+ * judge: createJudge({
956
+ * llm: openaiClient,
957
+ * prompt: defaultJudgePrompt,
958
+ * criteria: [accuracy(), relevance()],
959
+ * }),
960
+ * })
961
+ *
962
+ * const report = await suite.run(testCases, { concurrency: 3 })
963
+ * ```
964
+ */
965
+ declare function createEvalSuite<TInput, TOutput>(config: EvalSuiteConfig<TInput, TOutput>): EvalSuite<TInput, TOutput>;
966
+
967
/**
 * Iteration statistics utilities for repeated test execution.
 *
 * These functions aggregate results from running the same test multiple times,
 * providing statistical metrics like mean, standard deviation, and pass rate.
 */

/**
 * Calculate iteration statistics from multiple test results.
 *
 * @param results - Results from running the same test multiple times
 * @returns Aggregated statistics including mean, stdDev, and passRate
 *
 * @example
 * ```typescript
 * const stats = calculateIterationStats([
 *   { overallScore: 85, passed: true, ... },
 *   { overallScore: 90, passed: true, ... },
 *   { overallScore: 80, passed: true, ... },
 * ])
 * // stats.mean = 85
 * // stats.stdDev ≈ 4.08
 * // stats.passRate = 1.0
 * ```
 */
declare function calculateIterationStats(results: TestResultWithVerdict<unknown, unknown>[]): IterationStats;
/**
 * Calculate multi-turn specific iteration statistics.
 *
 * Extends base iteration stats with turn counts and termination type distribution.
 * Used when aggregating multiple iterations of multi-turn tests.
 *
 * @param results - Results from running the same multi-turn test multiple times
 * @returns Extended statistics including avgTurns, min/max turns, and terminationCounts
 *
 * @example
 * ```typescript
 * const stats = calculateMultiTurnIterationStats(results)
 * // stats.avgTurns = 4.2
 * // stats.minTurns = 3
 * // stats.maxTurns = 6
 * // stats.terminationCounts = { condition: 2, maxTurns: 1 }
 * ```
 */
declare function calculateMultiTurnIterationStats<TInput, TOutput>(results: (MultiTurnResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>)[]): MultiTurnIterationStats;
/**
 * Select the result closest to the mean score.
 * Used to pick a "representative" result for displaying verdicts/reasoning.
 *
 * The function preserves the full type of the input array, so if you pass
 * `TestResultWithIteration[]`, you get back `TestResultWithIteration`.
 *
 * @param results - Array of results to choose from (must not be empty)
 * @param mean - The mean score to compare against
 * @returns The result with overallScore closest to mean
 * @throws Error if results array is empty
 */
declare function selectRepresentativeResult<TInput, TOutput, T extends TestResultWithVerdict<TInput, TOutput> = TestResultWithVerdict<TInput, TOutput>>(results: T[], mean: number): T;
/**
 * Aggregate results from multiple iteration runs into iterated result types.
 *
 * Takes N arrays of results (one per iteration) and groups them by test case,
 * calculating iteration statistics for each test case.
 *
 * For multi-turn tests, returns MultiTurnIteratedResult with multi-turn specific
 * statistics like average turns, min/max turns, and termination type distribution.
 *
 * For single-turn tests, returns SingleTurnIteratedResult with base iteration stats.
 *
 * @param allIterationResults - Array of arrays: outer = iterations, inner = test cases
 * @returns Aggregated results with iteration statistics
 *
 * @example
 * ```typescript
 * // 3 iterations, 2 test cases each
 * const allResults = [
 *   [testCase1_iter1, testCase2_iter1], // iteration 1
 *   [testCase1_iter2, testCase2_iter2], // iteration 2
 *   [testCase1_iter3, testCase2_iter3], // iteration 3
 * ]
 *
 * const aggregated = aggregateIterationResults(allResults)
 * // aggregated[0] = testCase1 with stats from iter1, iter2, iter3
 * // aggregated[1] = testCase2 with stats from iter1, iter2, iter3
 *
 * // For multi-turn tests:
 * // aggregated[0].kind === 'multi-turn-iterated'
 * // aggregated[0].multiTurnIterationStats = { avgTurns, minTurns, maxTurns, terminationCounts }
 * ```
 */
declare function aggregateIterationResults<TInput, TOutput>(allIterationResults: EvalTestResult<TInput, TOutput>[][]): (SingleTurnIteratedResult<TInput, TOutput> | MultiTurnIteratedResult<TInput, TOutput>)[];
/**
 * Calculate average standard deviation across multiple test results.
 * Used for report summary.
 *
 * @param results - Eval results (only iterated results have stats)
 * @returns Average stdDev across all iterated tests, or undefined if no iteration data
 */
declare function calculateAvgStdDev<TInput, TOutput>(results: EvalTestResult<TInput, TOutput>[]): number | undefined;
/**
 * Calculate average pass rate across multiple test results.
 * Used for report summary.
 *
 * @param results - Eval results (only iterated results have stats)
 * @returns Average passRate across all iterated tests, or undefined if no iteration data
 */
declare function calculateAvgPassRate<TInput, TOutput>(results: EvalTestResult<TInput, TOutput>[]): number | undefined;
+
1075
/**
 * Error codes for agent-eval operations.
 * Grouped by area: LLM transport, parsing, configuration, prompt storage,
 * schema handling, file I/O, and a catch-all UNKNOWN_ERROR.
 */
declare enum EvalErrorCode {
    LLM_API_ERROR = "LLM_API_ERROR",
    LLM_RATE_LIMIT = "LLM_RATE_LIMIT",
    LLM_TIMEOUT = "LLM_TIMEOUT",
    JSON_PARSE_ERROR = "JSON_PARSE_ERROR",
    VERDICT_PARSE_ERROR = "VERDICT_PARSE_ERROR",
    TEMPLATE_COMPILE_ERROR = "TEMPLATE_COMPILE_ERROR",
    AGENT_EXECUTION_ERROR = "AGENT_EXECUTION_ERROR",
    INVALID_CONFIG = "INVALID_CONFIG",
    MISSING_API_KEY = "MISSING_API_KEY",
    PROMPT_NOT_FOUND = "PROMPT_NOT_FOUND",
    PROMPT_INVALID_FORMAT = "PROMPT_INVALID_FORMAT",
    PROMPT_WRITE_ERROR = "PROMPT_WRITE_ERROR",
    PROMPT_READ_ERROR = "PROMPT_READ_ERROR",
    SUGGESTION_APPLY_ERROR = "SUGGESTION_APPLY_ERROR",
    SCHEMA_VALIDATION_ERROR = "SCHEMA_VALIDATION_ERROR",
    SCHEMA_GENERATION_ERROR = "SCHEMA_GENERATION_ERROR",
    FILE_READ_ERROR = "FILE_READ_ERROR",
    FILE_WRITE_ERROR = "FILE_WRITE_ERROR",
    FILE_TOO_LARGE = "FILE_TOO_LARGE",
    CONCURRENT_MODIFICATION = "CONCURRENT_MODIFICATION",
    UNKNOWN_ERROR = "UNKNOWN_ERROR"
}
/** Options accepted by the EvalError constructor. */
interface EvalErrorOptions {
    /** Machine-readable error code */
    code: EvalErrorCode;
    /** Underlying error that caused this one, if any */
    cause?: Error;
    /** Arbitrary structured context attached for debugging/logging */
    context?: Record<string, unknown>;
}
/**
 * Custom error class for agent-eval operations.
 * Provides structured error information including error code and optional context.
 */
declare class EvalError extends Error {
    /** Machine-readable error code */
    readonly code: EvalErrorCode;
    /** Underlying error that caused this one, if any */
    readonly cause?: Error;
    /** Structured context attached at construction time */
    readonly context?: Record<string, unknown>;
    constructor(message: string, options: EvalErrorOptions);
    /**
     * Creates an EvalError from an unknown error with a specific code.
     */
    static from(error: unknown, code: EvalErrorCode, context?: Record<string, unknown>): EvalError;
    /** Plain-object representation suitable for JSON serialization. */
    toJSON(): Record<string, unknown>;
}
+
1122
/**
 * Creates an LLM-as-Judge evaluator.
 *
 * @param config - Judge configuration (provider, prompt, criteria, threshold)
 * @returns A Judge that evaluates agent outputs against the configured criteria
 *
 * @example
 * ```typescript
 * import { createJudge, defaultJudgePrompt, accuracy, consistency } from 'agent-eval'
 * import { createGoogleProvider } from '@agtlantis/core'
 *
 * const provider = createGoogleProvider({ apiKey }).withDefaultModel('gemini-2.5-flash')
 *
 * const judge = createJudge({
 *   provider,
 *   prompt: defaultJudgePrompt,
 *   criteria: [accuracy(), consistency()],
 *   passThreshold: 70,
 * })
 *
 * const result = await judge.evaluate({
 *   input: { query: 'What is 2+2?' },
 *   output: { answer: '4' },
 *   agentDescription: 'A math tutor agent',
 *   files: [{ path: 'reference.md', content: '...' }],
 * })
 *
 * console.log(result.overallScore) // e.g., 85
 * console.log(result.passed) // true
 * ```
 */
declare function createJudge(config: JudgeConfig): Judge;
+
1152
/** Options for the schema() validation criterion. */
interface SchemaOptions<T> extends CriterionOptions {
    /** Zod schema the agent output must satisfy */
    schema: z.ZodType<T>;
    /** Use unique IDs when using multiple validators */
    id?: string;
    /** Display name for this criterion */
    name?: string;
    /** Human-readable description of what is validated */
    description?: string;
}
/**
 * Creates a schema validation criterion using Zod.
 *
 * Performs PROGRAMMATIC validation (not LLM-based).
 * Scoring is binary: 100 if validation passes, 0 if it fails.
 *
 * @param options - Zod schema plus optional id/name/description/weight
 * @returns A ValidatorCriterion usable in a Judge's criteria list
 *
 * @example
 * ```typescript
 * import { z } from 'zod'
 * import { schema, createJudge, accuracy, defaultJudgePrompt } from '@agtlantis/eval'
 *
 * const RecipeSchema = z.object({
 *   name: z.string(),
 *   ingredients: z.array(z.object({
 *     name: z.string(),
 *     amount: z.string(),
 *   })),
 *   steps: z.array(z.string()).min(1),
 * })
 *
 * const judge = createJudge({
 *   llm: openaiClient,
 *   prompt: defaultJudgePrompt,
 *   criteria: [
 *     schema({ schema: RecipeSchema, weight: 2 }),
 *     accuracy(),
 *   ],
 * })
 * ```
 */
declare function schema<T>(options: SchemaOptions<T>): ValidatorCriterion;
+
1191
/** Common options shared by all built-in criteria. */
interface CriterionOptions {
    /** Relative weight of this criterion when computing the overall score */
    weight?: number;
}
/**
 * Evaluates whether the agent's output is factually accurate
 * and free from errors or hallucinations.
 */
declare function accuracy(options?: CriterionOptions): Criterion;
/**
 * Evaluates whether the agent's output is internally consistent
 * and doesn't contradict itself or the provided context.
 */
declare function consistency(options?: CriterionOptions): Criterion;
/**
 * Evaluates whether the agent's output is relevant to the input
 * and addresses the user's needs appropriately.
 */
declare function relevance(options?: CriterionOptions): Criterion;
+
1210
/**
 * Converts an evaluation report to Markdown format.
 *
 * @param report - The evaluation report to render
 * @param options - Markdown generation options
 * @returns The report rendered as a Markdown string
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases)
 * const markdown = reportToMarkdown(report)
 * console.log(markdown)
 * ```
 */
declare function reportToMarkdown<TInput, TOutput>(report: EvalReport<TInput, TOutput>, options?: ReportMarkdownOptions): string;
/**
 * Saves an evaluation report as a Markdown file.
 *
 * @param report - The evaluation report to save
 * @param path - Destination file path
 * @param options - Markdown generation options
 *
 * @example
 * ```typescript
 * const report = await suite.run(testCases)
 * await saveReportMarkdown(report, './reports/eval-2024-01.md')
 * ```
 */
declare function saveReportMarkdown<TInput, TOutput>(report: EvalReport<TInput, TOutput>, path: string, options?: ReportMarkdownOptions): Promise<void>;
/**
 * Compares two evaluation reports and returns the differences.
 * Useful for tracking improvements across prompt versions.
 *
 * @param before - Baseline report
 * @param after - Report to compare against the baseline
 * @returns Comparison including score delta and improved/regressed tests
 *
 * @example
 * ```typescript
 * const beforeReport = await suite.run(testCases)
 * // ... apply improvements ...
 * const afterReport = await suite.withAgent(improvedAgent).run(testCases)
 *
 * const comparison = compareReports(beforeReport, afterReport)
 * console.log(`Score improved by ${comparison.scoreDelta} points`)
 * console.log(`Tests improved: ${comparison.improved.join(', ')}`)
 * console.log(`Tests regressed: ${comparison.regressed.join(', ')}`)
 * ```
 */
declare function compareReports<TInput, TOutput>(before: EvalReport<TInput, TOutput>, after: EvalReport<TInput, TOutput>): ReportComparison;
+
1249
/**
 * Reporter that saves EvalReport as JSON.
 *
 * @example
 * ```typescript
 * const reporter = new JsonReporter({ outputDir: './reports' })
 * reporter.save(report, 'my-test') // -> ./reports/my-test-1736691234567.json
 *
 * // Without timestamp
 * const fixedReporter = new JsonReporter({
 *   outputDir: './reports',
 *   addTimestamp: false,
 * })
 * fixedReporter.save(report, 'round-1') // -> ./reports/round-1.json
 * ```
 */
declare class JsonReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly outputDir;
    private readonly pricing?;
    private readonly addTimestamp;
    constructor(options: FileReporterOptions);
    /** Saves the report and returns the written file path. */
    save(report: EvalReport<TInput, TOutput>, name: string): string;
}
+
1273
/** Options for MarkdownReporter, extending the common file reporter options. */
interface MarkdownReporterOptions extends FileReporterOptions {
    /** Markdown generation options */
    markdown?: ReportMarkdownOptions;
}
/**
 * Reporter that saves EvalReport as Markdown.
 *
 * @example
 * ```typescript
 * const reporter = new MarkdownReporter({ outputDir: './reports' })
 * reporter.save(report, 'my-test') // -> ./reports/my-test-1736691234567.md
 *
 * // With expanded passed tests
 * const detailedReporter = new MarkdownReporter({
 *   outputDir: './reports',
 *   markdown: { expandPassedTests: true },
 * })
 * ```
 */
declare class MarkdownReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly outputDir;
    private readonly addTimestamp;
    private readonly markdownOptions;
    constructor(options: MarkdownReporterOptions);
    /** Saves the report and returns the written file path. */
    save(report: EvalReport<TInput, TOutput>, name: string): string;
}
+
1300
/**
 * Reporter that logs EvalReport to console.
 *
 * @example
 * ```typescript
 * const reporter = new ConsoleReporter({ verbosity: 'detailed' })
 * reporter.log(report) // Logs to console
 *
 * // With cost display
 * const costReporter = new ConsoleReporter({
 *   verbosity: 'summary',
 *   pricing: GOOGLE_PRICING,
 * })
 * ```
 */
declare class ConsoleReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly verbosity;
    private readonly pricing?;
    constructor(options?: ConsoleReporterOptions);
    /** Logs the report to the console at the configured verbosity. */
    log(report: EvalReport<TInput, TOutput>): void;
    private logCostIfAvailable;
}
+
1323
/**
 * Combines multiple reporters to save/log to multiple outputs.
 *
 * @example
 * ```typescript
 * const reporter = new CompositeReporter([
 *   new JsonReporter({ outputDir: './reports' }),
 *   new ConsoleReporter({ verbosity: 'detailed' }),
 * ])
 * reporter.save(report, 'my-test') // Saves JSON + logs to console
 * ```
 */
declare class CompositeReporter<TInput = unknown, TOutput = unknown> implements Reporter<TInput, TOutput> {
    private readonly reporters;
    constructor(reporters: Reporter<TInput, TOutput>[]);
    /**
     * Saves to all reporters that support saving.
     * Returns the first successful file path (usually JsonReporter).
     */
    save(report: EvalReport<TInput, TOutput>, name: string): string;
    /** Logs via all reporters that support logging. */
    log(report: EvalReport<TInput, TOutput>): void;
}
+
1346
/**
 * Create a JSON reporter.
 *
 * @example
 * ```typescript
 * const reporter = createJsonReporter('./reports')
 * reporter.save(report, 'my-test') // → ./reports/my-test-1736691234567.json
 * ```
 */
declare function createJsonReporter<TInput = unknown, TOutput = unknown>(outputDir: string, options?: Omit<FileReporterOptions, 'outputDir'>): JsonReporter<TInput, TOutput>;
/**
 * Create a Markdown reporter.
 *
 * @example
 * ```typescript
 * const reporter = createMarkdownReporter('./reports')
 * reporter.save(report, 'my-test') // → ./reports/my-test-1736691234567.md
 * ```
 */
declare function createMarkdownReporter<TInput = unknown, TOutput = unknown>(outputDir: string, options?: Omit<MarkdownReporterOptions, 'outputDir'>): MarkdownReporter<TInput, TOutput>;
/**
 * Create a console reporter.
 *
 * @example
 * ```typescript
 * const reporter = createConsoleReporter({ verbosity: 'detailed' })
 * reporter.log(report) // Logs to console
 * ```
 */
declare function createConsoleReporter<TInput = unknown, TOutput = unknown>(options?: ConsoleReporterOptions): ConsoleReporter<TInput, TOutput>;
/**
 * Create a composite reporter from multiple reporters.
 *
 * @example
 * ```typescript
 * const reporter = createCompositeReporter([
 *   createJsonReporter('./reports'),
 *   createConsoleReporter({ verbosity: 'summary' }),
 * ])
 * ```
 */
declare function createCompositeReporter<TInput = unknown, TOutput = unknown>(reporters: Reporter<TInput, TOutput>[]): CompositeReporter<TInput, TOutput>;
/**
 * Convenience: Create JSON + Console reporter combo.
 *
 * @example
 * ```typescript
 * const reporter = createDefaultReporter('./reports', {
 *   pricing: GOOGLE_PRICING,
 *   verbosity: 'summary',
 * })
 * reporter.save(report, 'my-test') // Saves JSON + logs to console
 * ```
 */
declare function createDefaultReporter<TInput = unknown, TOutput = unknown>(outputDir: string, options?: {
    pricing?: EvalPricingConfig;
    verbosity?: LogVerbosity;
    addTimestamp?: boolean;
}): CompositeReporter<TInput, TOutput>;
+
1406
/**
 * Options for creating a report runner.
 */
interface ReportRunnerOptions {
    /** Directory where reports will be saved */
    outputDir: string;
    /** Pricing config for cost calculation */
    pricing?: EvalPricingConfig;
    /** Verbosity level for console output (false to disable logging) */
    verbosity?: LogVerbosity | false;
}
/**
 * Result returned by the report runner.
 */
interface ReportRunnerResult<TInput, TOutput> {
    /** The generated evaluation report */
    report: EvalReport<TInput, TOutput>;
    /** Path where the report was saved */
    savedPath: string;
}
/**
 * Creates a runner that automatically logs and saves reports.
 *
 * @param options - Runner configuration
 * @returns A function that runs the suite and handles reporting
 *
 * @example
 * ```typescript
 * import { createReportRunner, GOOGLE_PRICING } from '@agtlantis/eval'
 *
 * const run = createReportRunner({
 *   outputDir: './reports',
 *   pricing: GOOGLE_PRICING,
 *   verbosity: 'detailed',
 * })
 *
 * const { report, savedPath } = await run(suite, testCases, 'my-evaluation')
 * // Logs to console and saves to ./reports/my-evaluation-{timestamp}.json
 * console.log(`Saved to: ${savedPath}`)
 * ```
 */
declare function createReportRunner(options: ReportRunnerOptions): <TInput, TOutput>(suite: EvalSuite<TInput, TOutput>, testCases: TestCase<TInput>[], name: string) => Promise<ReportRunnerResult<TInput, TOutput>>;
+
1449
/** Storage abstraction for testability - allows injecting mock storage */
interface HistoryStorage {
    /** Read a file's contents as a string */
    readFile: (path: string) => Promise<string>;
    /** Write string content to a file */
    writeFile: (path: string, content: string) => Promise<void>;
    /** Synchronously check whether a path exists */
    exists: (path: string) => boolean;
    /** Create a directory (optionally recursively) */
    mkdir: (path: string, options?: {
        recursive?: boolean;
    }) => Promise<string | undefined | void>;
}
/** Default HistoryStorage used when none is injected (implementation not visible here — presumably Node fs-backed). */
declare const defaultHistoryStorage: HistoryStorage;
/** An in-progress improvement session that accumulates round results and can persist them. */
interface ImprovementSession {
    /** Unique session identifier */
    readonly sessionId: string;
    /** Accumulated history (read-only view) */
    readonly history: Readonly<ImprovementHistory>;
    /** Whether this session is configured so save() can persist (i.e. a path is available) */
    readonly canSave: boolean;
    /** Record a completed round and the prompt it produced */
    addRound(roundResult: RoundResult, updatedPrompt: SerializedPrompt): void;
    /** Mark the session completed with the given termination reason */
    complete(terminationReason: string): void;
    /** Persist the history */
    save(): Promise<void>;
    /** Flush any pending writes */
    flush(): Promise<void>;
}
/** Configuration for creating or resuming an ImprovementSession. */
interface SessionConfig {
    /** Path of the history JSON file */
    path?: string;
    /** Auto-save after changes */
    autoSave?: boolean;
    /** Storage backend (defaults to defaultHistoryStorage) */
    storage?: HistoryStorage;
    /** Called when an auto-save fails */
    onAutoSaveError?: (error: Error) => void;
}
/** @throws EvalError with PROMPT_INVALID_FORMAT if userTemplate is missing */
declare function serializePrompt<TInput>(prompt: AgentPrompt<TInput>): SerializedPrompt;
/** Reconstructs renderUserPrompt using compileTemplate. */
declare function deserializePrompt<TInput>(serialized: SerializedPrompt): AgentPrompt<TInput>;
/** @throws EvalError with PROMPT_INVALID_FORMAT if prompt lacks userTemplate */
declare function createSession<TInput>(initialPrompt: AgentPrompt<TInput>, config?: SessionConfig): ImprovementSession;
/** Resume from a history file. Clears completion status to allow adding new rounds. */
declare function resumeSession(path: string, config?: Omit<SessionConfig, 'path'>): Promise<ImprovementSession>;
/** Save history to JSON file. Creates parent directories if needed. */
declare function saveHistory(history: ImprovementHistory, path: string, storage?: HistoryStorage): Promise<void>;
/** Load history from a JSON file. */
declare function loadHistory(path: string, storage?: HistoryStorage): Promise<ImprovementHistory>;
+
1486
/** Terminate when average score reaches threshold */
interface TargetScoreCondition {
    type: 'targetScore';
    /** Score threshold (0-100) */
    threshold: number;
}
/** Terminate after N rounds */
interface MaxRoundsCondition {
    type: 'maxRounds';
    /** Maximum number of improvement rounds */
    count: number;
}
/** Terminate when score doesn't improve for N consecutive rounds */
interface NoImprovementCondition {
    type: 'noImprovement';
    /** Number of consecutive rounds without improvement */
    consecutiveRounds: number;
    /** Minimum score delta to count as improvement (default: 0) */
    minDelta?: number;
}
/** Terminate when total cost exceeds budget */
interface MaxCostCondition {
    type: 'maxCost';
    /** Maximum cost in USD */
    maxUSD: number;
}
/** Custom condition with user-defined check function */
interface CustomCycleCondition {
    type: 'custom';
    /** Function to check if termination condition is met */
    check: (ctx: CycleContext) => boolean | Promise<boolean>;
    /** Human-readable description (for debugging/logging) */
    description?: string;
}
/** Discriminated union of termination conditions. Uses OR semantics - first match triggers. */
type CycleTerminationCondition = TargetScoreCondition | MaxRoundsCondition | NoImprovementCondition | MaxCostCondition | CustomCycleCondition;
/** Context available to termination condition checks */
interface CycleContext {
    /** Current round number (1-indexed) */
    currentRound: number;
    /** Average score from the latest round */
    latestScore: number;
    /** Score history from all previous rounds */
    previousScores: number[];
    /** Total accumulated cost in USD */
    totalCost: number;
    /** Full history of completed rounds */
    history: RoundResult[];
}
/** Result when cycle should continue (no termination) */
interface CycleContinueResult {
    terminated: false;
    reason: string;
    /** Not present when not terminated (for type safety with discriminated union) */
    matchedCondition?: never;
}
/** Result when cycle should terminate */
interface CycleTerminatedResult {
    terminated: true;
    matchedCondition: CycleTerminationCondition;
    reason: string;
}
/** Discriminated union over `terminated`; narrow with isCycleTerminated(). */
type CycleTerminationResult = CycleContinueResult | CycleTerminatedResult;
/**
 * Data yielded after each improvement round for Human-in-the-Loop (HITL) control.
 * The AsyncGenerator yields this after each round, allowing inspection and decision.
 */
interface RoundYield {
    /** Result of the completed round */
    roundResult: RoundResult;
    /** Current cycle context */
    context: CycleContext;
    /** Suggestions awaiting approval */
    pendingSuggestions: Suggestion[];
    /** Termination check result (use isCycleTerminated() to check if terminated) */
    terminationCheck: CycleTerminationResult;
}
/** Decision from the caller after reviewing a round */
interface RoundDecision {
    /** Action to take */
    action: 'continue' | 'stop' | 'rollback';
    /** Target round for rollback (required if action is 'rollback') */
    rollbackToRound?: number;
    /** Suggestions approved by user (optional override) */
    approvedSuggestions?: Suggestion[];
}
/** Cost breakdown for a single round */
interface RoundCost {
    /** Agent LLM cost in USD */
    agent: number;
    /** Judge LLM cost in USD */
    judge: number;
    /** Improver LLM cost in USD */
    improver: number;
    /** Total cost in USD */
    total: number;
}
/** Result of a single improvement round */
interface RoundResult {
    /** Round number (1-indexed) */
    round: number;
    /** When this round completed */
    completedAt: Date;
    /** Full evaluation report */
    report: EvalReport<unknown, unknown>;
    /** All suggestions generated by improver */
    suggestionsGenerated: Suggestion[];
    /** Suggestions that were approved/applied */
    suggestionsApproved: Suggestion[];
    /** Prompt snapshot at start of this round (for rollback) */
    promptSnapshot: SerializedPrompt;
    /** Prompt version after applying suggestions */
    promptVersionAfter: string;
    /** Cost breakdown for this round */
    cost: RoundCost;
    /** Score change from previous round (null for first round) */
    scoreDelta: number | null;
}
/**
 * Serialized prompt for JSON storage.
 * Note: renderUserPrompt cannot be serialized; use compileTemplate(userTemplate) to reconstruct.
 */
interface SerializedPrompt {
    /** Prompt unique ID */
    id: string;
    /** Version string (e.g., "1.0.0") */
    version: string;
    /** System prompt */
    system: string;
    /** User prompt template (Mustache format) */
    userTemplate: string;
    /** Additional custom fields from AgentPrompt */
    customFields?: Record<string, unknown>;
}
/** Serialized round result for JSON storage */
interface SerializedRoundResult {
    /** Round number (1-indexed) */
    round: number;
    /** Completion timestamp (ISO 8601) */
    completedAt: string;
    /** Average score from this round */
    avgScore: number;
    /** Number of passed tests */
    passed: number;
    /** Number of failed tests */
    failed: number;
    /** Total number of tests */
    totalTests: number;
    /** All suggestions generated */
    suggestionsGenerated: Suggestion[];
    /** Suggestions that were approved/applied */
    suggestionsApproved: Suggestion[];
    /** Prompt snapshot at start of this round */
    promptSnapshot: SerializedPrompt;
    /** Prompt version after applying suggestions */
    promptVersionAfter: string;
    /** Cost breakdown */
    cost: RoundCost;
    /** Score change from previous round */
    scoreDelta: number | null;
}
/**
 * Improvement cycle history (JSON file schema v1.1.0).
 * Includes promptSnapshot per round for rollback support.
 */
interface ImprovementHistory {
    /** Schema version for migration compatibility */
    schemaVersion: '1.1.0';
    /** Unique session identifier */
    sessionId: string;
    /** Session start timestamp (ISO 8601) */
    startedAt: string;
    /** Session completion timestamp (ISO 8601, if completed) */
    completedAt?: string;
    /** Initial prompt before any improvements */
    initialPrompt: SerializedPrompt;
    /** Current/latest prompt */
    currentPrompt: SerializedPrompt;
    /** All completed rounds */
    rounds: SerializedRoundResult[];
    /** Reason for termination (if completed) */
    terminationReason?: string;
    /** Total accumulated cost in USD */
    totalCost: number;
}
/** History persistence configuration */
interface HistoryConfig {
    /** Path to save history JSON */
    path: string;
    /** Auto-save after each round (default: true) */
    autoSave?: boolean;
}
/** Configuration for running an improvement cycle */
interface ImprovementCycleConfig<TInput, TOutput> {
    /** Factory function to create agent with given prompt */
    createAgent: (prompt: AgentPrompt<TInput>) => EvalAgent<TInput, TOutput>;
    /** Starting prompt for improvements */
    initialPrompt: AgentPrompt<TInput>;
    /** Test cases to evaluate against */
    testCases: TestCase<TInput>[];
    /** Judge for evaluation */
    judge: Judge;
    /** Improver for generating suggestions */
    improver: Improver;
    /** Termination conditions (OR semantics) */
    terminateWhen: CycleTerminationCondition[];
    /** Optional configuration */
    options?: ImprovementCycleOptions;
}
/** Optional configuration for improvement cycle */
interface ImprovementCycleOptions {
    /** Options passed to eval suite run */
    runOptions?: {
        concurrency?: number;
        iterations?: number;
    };
    /** How to bump version on each improvement */
    versionBump?: 'major' | 'minor' | 'patch';
    /** Pricing configuration for cost calculation */
    pricingConfig?: EvalPricingConfig;
    /** History persistence settings */
    history?: HistoryConfig;
    /** Description for agent (passed to judge) */
    agentDescription?: string;
    /** Existing session to resume (preserves session ID and accumulated state) */
    session?: ImprovementSession;
}
+ /** Final result of an improvement cycle */
1714
+ interface ImprovementCycleResult<TInput, TOutput> {
1715
+ /** Final improved prompt */
1716
+ finalPrompt: AgentPrompt<TInput>;
1717
+ /** All completed rounds */
1718
+ rounds: RoundResult[];
1719
+ /** Reason for termination */
1720
+ terminationReason: string;
1721
+ /** Total cost in USD */
1722
+ totalCost: number;
1723
+ /** Saved history (if persistence was enabled) */
1724
+ history?: ImprovementHistory;
1725
+ }
1726
+ declare function isTargetScoreCondition(condition: CycleTerminationCondition): condition is TargetScoreCondition;
1727
+ declare function isMaxRoundsCondition(condition: CycleTerminationCondition): condition is MaxRoundsCondition;
1728
+ declare function isNoImprovementCondition(condition: CycleTerminationCondition): condition is NoImprovementCondition;
1729
+ declare function isMaxCostCondition(condition: CycleTerminationCondition): condition is MaxCostCondition;
1730
+ declare function isCustomCycleCondition(condition: CycleTerminationCondition): condition is CustomCycleCondition;
1731
+ declare function isCycleTerminated(result: CycleTerminationResult): result is CycleTerminatedResult;
1732
+
1733
/**
 * Options for saving an ImprovementCycleResult as JSON.
 *
 * Supports two modes:
 * - **Auto mode**: Provide `outputDir` and `name` to create a timestamped subdirectory
 * - **Explicit mode**: Provide `directory` to use an existing directory directly
 */
interface SaveCycleJsonOptions {
    /** Base output directory (creates {name}-{timestamp}/ subdirectory) */
    outputDir?: string;
    /** Cycle name (used for folder name with timestamp) */
    name?: string;
    /** Use this exact directory path (no timestamp suffix added) */
    directory?: string;
    /** Whether to save individual round reports (default: true) */
    saveRounds?: boolean;
}
/**
 * Saves an ImprovementCycleResult to JSON files.
 *
 * Creates a directory containing:
 * - `cycle-summary.json`: Structured cycle summary
 * - `round-{n}-report.json`: Individual round reports (if saveRounds=true)
 *
 * @param result - The improvement cycle result to persist
 * @param options - Output location and content options
 * @returns The path of the directory the files were written to
 *
 * @example Auto mode (creates timestamped directory)
 * ```typescript
 * const dir = saveCycleJson(result, {
 *   outputDir: './reports',
 *   name: 'my-agent',
 * })
 * // -> ./reports/my-agent-1736691234567/
 * ```
 *
 * @example Explicit mode (uses existing directory)
 * ```typescript
 * const dir = saveCycleJson(result, {
 *   directory: './reports/my-existing-dir',
 * })
 * // -> ./reports/my-existing-dir/
 * ```
 */
declare function saveCycleJson<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, options: SaveCycleJsonOptions): string;
1775
+
1776
/**
 * Options for logging an ImprovementCycleResult to console.
 */
interface LogCycleOptions {
    /** Verbosity level for per-round details */
    verbosity?: LogVerbosity;
    /** Show per-round details (default: false, summary only) */
    showRounds?: boolean;
}
/**
 * Logs an ImprovementCycleResult to the console.
 *
 * Shows cycle summary including round count, termination reason, total cost,
 * and score progression. Optionally shows per-round details.
 *
 * @param result - The improvement cycle result to log
 * @param options - Logging options
 *
 * @example
 * ```typescript
 * import { logCycle } from '@agtlantis/eval'
 *
 * const result = await runImprovementCycleAuto(config)
 * logCycle(result, { verbosity: 'detailed', showRounds: true })
 * ```
 */
declare function logCycle<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, options?: LogCycleOptions): void;
1803
+
1804
/**
 * Options for generating cycle markdown.
 */
interface CycleMarkdownOptions {
    /** Include full per-round details (default: true) */
    includeRoundDetails?: boolean;
    /** Show prompt evolution - initial vs final (default: false) */
    showPromptEvolution?: boolean;
}
/**
 * Converts an ImprovementCycleResult to markdown.
 *
 * Generates a comprehensive report including:
 * - Summary table (rounds, termination, cost, scores)
 * - Score progression table
 * - Per-round details (optional)
 * - Prompt evolution (optional)
 *
 * @param result - The improvement cycle result
 * @param options - Markdown generation options
 * @returns Markdown string
 *
 * @example
 * ```typescript
 * import { cycleToMarkdown } from '@agtlantis/eval'
 *
 * const result = await runImprovementCycleAuto(config)
 * const markdown = cycleToMarkdown(result, {
 *   includeRoundDetails: true,
 *   showPromptEvolution: true,
 * })
 * ```
 */
declare function cycleToMarkdown<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, options?: CycleMarkdownOptions): string;
/**
 * Saves an ImprovementCycleResult as markdown.
 *
 * @param result - The improvement cycle result
 * @param filePath - Path to save the markdown file
 * @param options - Markdown generation options
 *
 * @example
 * ```typescript
 * import { saveCycleMarkdown } from '@agtlantis/eval'
 *
 * const result = await runImprovementCycleAuto(config)
 * saveCycleMarkdown(result, './reports/cycle-report.md', {
 *   includeRoundDetails: true,
 * })
 * ```
 */
declare function saveCycleMarkdown<TInput, TOutput>(result: ImprovementCycleResult<TInput, TOutput>, filePath: string, options?: CycleMarkdownOptions): void;
1856
+
1857
/**
 * Generates a unified diff string for a suggestion.
 *
 * @param suggestion - The suggestion whose current/suggested values are diffed
 * @returns A unified-diff-style string (`-` old lines, `+` new lines)
 *
 * @example
 * ```typescript
 * const diff = suggestionDiff(suggestion)
 * console.log(diff)
 * // - Old value here
 * // + New value here
 * ```
 */
declare function suggestionDiff(suggestion: Suggestion): string;
/**
 * Generates a preview of what the suggestion would look like when applied.
 *
 * @param suggestion - The suggestion to preview
 * @returns A human-readable preview string
 *
 * @example
 * ```typescript
 * const preview = suggestionPreview(suggestion)
 * console.log(preview)
 * ```
 */
declare function suggestionPreview(suggestion: Suggestion): string;
/**
 * Formats a suggestion as a compact summary string.
 *
 * @param suggestion - The suggestion to summarize
 * @returns A one-line summary: priority, type, and short reasoning
 *
 * @example
 * ```typescript
 * console.log(suggestionSummary(suggestion))
 * // [HIGH] system_prompt: Improve clarity in instructions
 * ```
 */
declare function suggestionSummary(suggestion: Suggestion): string;
/**
 * Options for applying suggestions to a prompt.
 */
interface ApplyPromptSuggestionsOptions {
    /**
     * Version bump type for semver.
     * - 'major': 1.0.0 → 2.0.0 (breaking changes)
     * - 'minor': 1.0.0 → 1.1.0 (new features)
     * - 'patch': 1.0.0 → 1.0.1 (bug fixes)
     */
    bumpVersion?: 'major' | 'minor' | 'patch';
}
/**
 * Result of applying suggestions to a prompt.
 *
 * NOTE(review): the `TOutput` type parameter is not referenced by any member
 * below — presumably kept for signature symmetry with other eval types; confirm
 * before removing, as dropping it would break explicit instantiations.
 */
interface ApplySuggestionsResult<TInput, TOutput = unknown> {
    /** The updated prompt with suggestions applied */
    prompt: AgentPrompt<TInput>;
    /** Number of suggestions that were successfully applied */
    appliedCount: number;
    /** Suggestions that could not be applied (currentValue not found) */
    skipped: Array<{
        suggestion: Suggestion;
        reason: string;
    }>;
}
/**
 * Bumps a semver version string.
 *
 * @param version - A semver string such as '1.2.3'
 * @param bump - Which component to increment (lower components reset to 0)
 * @returns The bumped version string
 *
 * @example
 * ```typescript
 * bumpVersion('1.0.0', 'major') // '2.0.0'
 * bumpVersion('1.0.0', 'minor') // '1.1.0'
 * bumpVersion('1.0.0', 'patch') // '1.0.1'
 * bumpVersion('1.2.3', 'minor') // '1.3.0'
 * ```
 */
declare function bumpVersion(version: string, bump: 'major' | 'minor' | 'patch'): string;
/**
 * Applies approved suggestions to an AgentPrompt and returns a new prompt.
 *
 * This function:
 * - Only applies suggestions where `approved === true`
 * - For `system_prompt`: replaces `currentValue` in `prompt.system`
 * - For `user_prompt`: requires `prompt.userTemplate` field, updates it and regenerates `renderUserPrompt`
 * - For `parameters`: applies to custom fields in the prompt
 * - Optionally bumps the version (major/minor/patch)
 *
 * **Important behaviors:**
 * - Only the **first occurrence** of `currentValue` is replaced (not all occurrences)
 * - Special characters like `$&`, `$1` in `suggestedValue` are preserved as-is (no regex interpretation)
 *
 * @example
 * ```typescript
 * // Apply approved suggestions with minor version bump
 * const result = applyPromptSuggestions(
 *   currentPrompt,
 *   suggestions.filter(s => s.approved),
 *   { bumpVersion: 'minor' }
 * )
 *
 * console.log(result.prompt.version) // '1.1.0'
 * console.log(`Applied ${result.appliedCount} suggestions`)
 *
 * if (result.skipped.length > 0) {
 *   console.warn('Skipped suggestions:', result.skipped)
 * }
 * ```
 *
 * @throws {EvalError} with code SUGGESTION_APPLY_ERROR if:
 * - A `user_prompt` suggestion is applied but prompt lacks `userTemplate` field
 * - Version format is invalid when bumpVersion is specified
 */
declare function applyPromptSuggestions<TInput, TOutput = unknown>(currentPrompt: AgentPrompt<TInput>, suggestions: Suggestion[], options?: ApplyPromptSuggestionsOptions): ApplySuggestionsResult<TInput, TOutput>;
1963
+
1964
/**
 * Creates an LLM-based prompt improver.
 *
 * Analyzes test results and suggests improvements to the agent's prompt,
 * focusing on low-scoring criteria with actionable suggestions.
 *
 * @param config - Provider and prompt configuration for the improver
 * @returns An {@link Improver} whose `improve()` yields prompt suggestions
 *
 * @example
 * ```typescript
 * import { createImprover, defaultImproverPrompt } from '@agtlantis/eval'
 * import { createGoogleProvider } from '@agtlantis/core'
 *
 * const provider = createGoogleProvider({ apiKey }).withDefaultModel('gemini-2.5-flash')
 *
 * const improver = createImprover({
 *   provider,
 *   prompt: defaultImproverPrompt,
 * })
 *
 * const { suggestions } = await improver.improve(agent.prompt, evaluatedResults)
 *
 * for (const suggestion of suggestions) {
 *   console.log(suggestionDiff(suggestion))
 *   suggestion.approved = true
 * }
 *
 * const newPrompt = applyPromptSuggestions(agent.prompt, suggestions)
 * ```
 */
declare function createImprover(config: ImproverConfig): Improver;
1993
+
1994
/**
 * Configuration for creating a mock agent.
 * All fields are optional; an unconfigured mock resolves with `response` as-is.
 */
interface MockAgentConfig<TInput, TOutput> {
    /** Name for the mock agent */
    name?: string;
    /** Description for the mock agent */
    description?: string;
    /** Response to return from execute() */
    response?: TOutput;
    /** Token usage to include in metadata */
    tokenUsage?: EvalTokenUsage;
    /** Delay in ms before returning response */
    delay?: number;
    /** If true, throw an error instead of returning response */
    shouldError?: boolean;
    /** Custom error message when shouldError is true */
    errorMessage?: string;
    /** Custom execute function for more control (takes precedence over the fields above) */
    executeFn?: (input: TInput) => Promise<{
        result: TOutput;
        metadata?: {
            tokenUsage?: EvalTokenUsage;
        };
    }>;
}
/**
 * Creates a mock agent for testing purposes.
 *
 * @param config - Optional behavior configuration for the mock
 * @returns An {@link EvalAgent} stub usable anywhere a real agent is expected
 *
 * @example
 * ```typescript
 * // Basic usage
 * const agent = createMockAgent<{ query: string }, { answer: string }>({
 *   response: { answer: 'Hello!' },
 * })
 *
 * // With delay and token usage
 * const agent = createMockAgent({
 *   response: { answer: 'Response' },
 *   delay: 100,
 *   tokenUsage: { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
 * })
 *
 * // Error testing
 * const agent = createMockAgent({
 *   shouldError: true,
 *   errorMessage: 'Agent failed',
 * })
 * ```
 */
declare function createMockAgent<TInput, TOutput>(config?: MockAgentConfig<TInput, TOutput>): EvalAgent<TInput, TOutput>;
2045
/**
 * Configuration for creating a mock judge.
 */
interface MockJudgeConfig {
    /** Overall score to return (0-100) */
    score?: number;
    /** Whether the evaluation passed */
    passed?: boolean;
    /** Verdicts to return */
    verdicts?: Verdict[];
    /** Metadata to return (for cost tracking tests) */
    metadata?: JudgeResult['metadata'];
    /** If true, throw an error instead of returning result */
    shouldError?: boolean;
    /** Custom error message when shouldError is true */
    errorMessage?: string;
    /** Custom evaluate function for more control (takes precedence over the fields above) */
    evaluateFn?: (context: EvalContext) => Promise<JudgeResult>;
}
/**
 * Creates a mock judge for testing purposes.
 *
 * @param config - Optional behavior configuration for the mock
 * @returns A {@link Judge} stub usable anywhere a real judge is expected
 *
 * @example
 * ```typescript
 * // Basic usage - passing test
 * const judge = createMockJudge({
 *   score: 85,
 *   passed: true,
 * })
 *
 * // Custom verdicts
 * const judge = createMockJudge({
 *   verdicts: [
 *     { criterionId: 'accuracy', score: 90, reasoning: 'Good', passed: true },
 *     { criterionId: 'clarity', score: 80, reasoning: 'Clear', passed: true },
 *   ],
 *   score: 85,
 *   passed: true,
 * })
 *
 * // Failing test
 * const judge = createMockJudge({
 *   score: 40,
 *   passed: false,
 * })
 *
 * // Error testing
 * const judge = createMockJudge({
 *   shouldError: true,
 *   errorMessage: 'Judge failed to evaluate',
 * })
 * ```
 */
declare function createMockJudge(config?: MockJudgeConfig): Judge;
2099
/**
 * Configuration for creating a mock improver.
 */
interface MockImproverConfig {
    /** Suggestions to return */
    suggestions?: Suggestion[];
    /** If true, throw an error instead of returning suggestions */
    shouldError?: boolean;
    /** Custom error message when shouldError is true */
    errorMessage?: string;
    /**
     * Custom improve function for more control (takes precedence over the fields above).
     * NOTE(review): `any` here presumably lets one mock serve agents of any
     * input/output type; `unknown` would be stricter — confirm before tightening.
     */
    improveFn?: (agentPrompt: AgentPrompt<any>, results: TestResultWithVerdict<any, any>[]) => Promise<ImproveResult>;
}
/**
 * Creates a mock improver for testing purposes.
 *
 * @param config - Optional behavior configuration for the mock
 * @returns An {@link Improver} stub usable anywhere a real improver is expected
 *
 * @example
 * ```typescript
 * // Basic usage
 * const improver = createMockImprover({
 *   suggestions: [
 *     {
 *       type: 'system_prompt',
 *       priority: 'high',
 *       currentValue: 'Old prompt',
 *       suggestedValue: 'New prompt',
 *       reasoning: 'Better clarity',
 *       expectedImprovement: '10% improvement',
 *     },
 *   ],
 * })
 *
 * // Empty suggestions
 * const improver = createMockImprover({ suggestions: [] })
 *
 * // Error testing
 * const improver = createMockImprover({
 *   shouldError: true,
 *   errorMessage: 'Improver failed',
 * })
 * ```
 */
declare function createMockImprover(config?: MockImproverConfig): Improver;
2142
+
2143
/** Union of all per-conversation termination conditions for multi-turn tests. */
type TerminationCondition<TInput = unknown, TOutput = unknown> = MaxTurnsCondition | FieldSetCondition | FieldValueCondition | CustomCondition<TInput, TOutput>;
/** Terminates the conversation after a fixed number of turns. */
interface MaxTurnsCondition {
    type: 'maxTurns';
    /** Safety limit - terminates after this many turns */
    count: number;
}
/** Shared shape for conditions that inspect a field of the agent output. */
interface FieldsCondition {
    /** Dot notation for nested access (e.g., "result.recommendation") */
    fieldPath: string;
}
/** Terminates when the field at `fieldPath` is set (not null/undefined). */
interface FieldSetCondition extends FieldsCondition {
    type: 'fieldSet';
}
/** Terminates when the field at `fieldPath` equals `expectedValue`. */
interface FieldValueCondition extends FieldsCondition {
    type: 'fieldValue';
    expectedValue: unknown;
}
/** Arbitrary user-defined termination condition. */
interface CustomCondition<TInput = unknown, TOutput = unknown> {
    type: 'custom';
    /** Sync or async check function (e.g., for LLM-based conditions) */
    check: (context: ConversationContext<TInput, TOutput>) => boolean | Promise<boolean>;
    /** For debugging/logging */
    description?: string;
}
/** Why a conversation ended: matched condition, turn limit, error, or inputs exhausted. */
type TerminationType = 'condition' | 'maxTurns' | 'error' | 'exhausted';
/** Check outcome: conversation continues (no condition matched). */
interface ContinueResult {
    terminated: false;
    reason: string;
    terminationType?: never;
    matchedCondition?: never;
}
/** Check outcome: conversation terminated, with the cause and matched condition. */
interface TerminatedResult {
    terminated: true;
    terminationType: TerminationType;
    matchedCondition?: TerminationCondition<unknown, unknown>;
    reason: string;
}
/** Discriminated union over `terminated`; narrow with {@link isTerminated}. */
type TerminationCheckResult = ContinueResult | TerminatedResult;
2181
/** Snapshot of a multi-turn conversation passed to condition checks and input builders. */
interface ConversationContext<TInput, TOutput = unknown> {
    /** 1-based (presumably) index of the turn being evaluated — TODO confirm indexing */
    currentTurn: number;
    /** All turns so far, in order */
    history: Array<{
        turn: number;
        input: TInput;
        output: TOutput | undefined;
        metadata?: AgentMetadata;
    }>;
    /** Output of the most recent turn, if any */
    lastOutput?: TOutput;
}
/** Describes one follow-up turn (2nd turn onward) of a multi-turn test case. */
interface FollowUpInput<TInput, TOutput = unknown> {
    /**
     * Input for this follow-up turn.
     * Can be static, dynamic (sync), or async (for AI-generated inputs via aiUser()).
     */
    input: TInput | ((context: ConversationContext<TInput, TOutput>) => TInput) | ((context: ConversationContext<TInput, TOutput>) => Promise<TInput>);
    /** For debugging/reports */
    description?: string;
    /**
     * Repeat count (default: 1).
     * Use Infinity to repeat until termination (must be last followUpInput).
     */
    turns?: number;
}
/** A test case extended with multi-turn conversation configuration. */
interface MultiTurnTestCase<TInput, TOutput = unknown> extends TestCase<TInput> {
    multiTurn: {
        /** Inputs for 2nd turn onwards (first turn uses TestCase.input) */
        followUpInputs?: FollowUpInput<TInput, TOutput>[];
        /** Any condition triggers termination (OR logic) */
        terminateWhen: TerminationCondition<TInput, TOutput>[];
        /** Safety limit (default: 10). Uses min of this and any maxTurns condition. */
        maxTurns?: number;
        /** Pass/fail when condition met (default: 'pass') */
        onConditionMet?: 'pass' | 'fail';
        /** Pass/fail when maxTurns reached (default: 'fail') */
        onMaxTurnsReached?: 'pass' | 'fail';
    };
}
/**
 * Result of a multi-turn test run. `output` is the final turn's output and may
 * be undefined (e.g. when the conversation terminated on an error).
 */
interface MultiTurnTestResult<TInput, TOutput> extends Omit<TestResultWithVerdict<TInput, TOutput>, 'output'> {
    /** Final output (undefined when no turn completed successfully) */
    output: TOutput | undefined;
    /** Every turn executed, in order */
    conversationHistory: Array<{
        turn: number;
        input: TInput;
        output: TOutput | undefined;
        metadata?: AgentMetadata;
    }>;
    /** How and why the conversation ended */
    termination: TerminationCheckResult;
    /** Total number of turns executed */
    totalTurns: number;
}
/** Type guard: narrows a termination condition to {@link MaxTurnsCondition}. */
declare function isMaxTurnsCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is MaxTurnsCondition;
/** Type guard: narrows a termination condition to {@link FieldSetCondition}. */
declare function isFieldSetCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is FieldSetCondition;
/** Type guard: narrows a termination condition to {@link FieldValueCondition}. */
declare function isFieldValueCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is FieldValueCondition;
/** Type guard: narrows a termination condition to {@link CustomCondition}. */
declare function isCustomCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>): condition is CustomCondition<TInput, TOutput>;
/** Type guard: true when a test case carries multi-turn configuration. */
declare function isMultiTurnTestCase<TInput, TOutput = unknown>(testCase: TestCase<TInput>): testCase is MultiTurnTestCase<TInput, TOutput>;
/** Type guard: true when the check result indicates termination. */
declare function isTerminated(result: TerminationCheckResult): result is TerminatedResult;
2236
+
2237
/** Access a nested field value using dot notation (e.g., "result.recommendation"). */
declare function getFieldValue(obj: unknown, fieldPath: string): unknown;
/** Evaluate a single termination condition against the conversation context. */
declare function checkCondition<TInput, TOutput>(condition: TerminationCondition<TInput, TOutput>, context: ConversationContext<TInput, TOutput>): Promise<TerminationCheckResult>;
/** Check all termination conditions (OR relationship). Returns on first termination. */
declare function checkTermination<TInput, TOutput>(conditions: TerminationCondition<TInput, TOutput>[], context: ConversationContext<TInput, TOutput>): Promise<TerminationCheckResult>;
2242
+
2243
/** Options for the LLM-evaluated {@link naturalLanguage} termination condition. */
interface NaturalLanguageConditionOptions {
    /** Provider to use for evaluation */
    provider: Provider;
    /** Prompt describing the termination criteria (e.g., "Has the user's question been fully answered?") */
    prompt: string;
    /** Optional system prompt override */
    systemPrompt?: string;
}
/** LLM-based termination condition. Asks the LLM to evaluate the termination criteria. */
declare function naturalLanguage<TInput = unknown, TOutput = unknown>(options: NaturalLanguageConditionOptions): CustomCondition<TInput, TOutput>;
/** Terminates when ALL sub-conditions are met (AND logic). (The `$1` suffix is presumably bundler-generated to avoid an export-name collision.) */
declare function and$1<TInput = unknown, TOutput = unknown>(...conditions: TerminationCondition<TInput, TOutput>[]): CustomCondition<TInput, TOutput>;
/** Terminates when ANY sub-condition is met (OR logic). Useful for nested composites. */
declare function or$1<TInput = unknown, TOutput = unknown>(...conditions: TerminationCondition<TInput, TOutput>[]): CustomCondition<TInput, TOutput>;
/** Inverts another condition (NOT logic). */
declare function not$1<TInput = unknown, TOutput = unknown>(condition: TerminationCondition<TInput, TOutput>): CustomCondition<TInput, TOutput>;
/** Terminates after a specified number of turns. Convenience wrapper for use in composites. */
declare function afterTurns<TInput = unknown, TOutput = unknown>(count: number): CustomCondition<TInput, TOutput>;
/** Terminates when a field matches a specific value. Convenience wrapper for composites. */
declare function fieldEquals<TInput = unknown, TOutput = unknown>(fieldPath: string, expectedValue: unknown): CustomCondition<TInput, TOutput>;
/** Terminates when a field is set (not null/undefined). Convenience wrapper for composites. */
declare function fieldIsSet<TInput = unknown, TOutput = unknown>(fieldPath: string): CustomCondition<TInput, TOutput>;
2265
+
2266
/** Collaborators needed to execute a multi-turn test case. */
interface MultiTurnExecuteContext<TInput, TOutput> {
    /** Agent under test */
    agent: EvalAgent<TInput, TOutput>;
    /** Judge used to evaluate the conversation */
    judge: Judge;
    /** Agent description passed to the judge for context */
    agentDescription: string;
}
/** Execution options for a multi-turn test case run. */
interface MultiTurnExecuteOptions {
    /** Abort signal to cancel the run */
    signal?: AbortSignal;
}
/** Runs a multi-turn test case to completion and returns the full conversation result. */
declare function executeMultiTurnTestCase<TInput, TOutput>(testCase: MultiTurnTestCase<TInput, TOutput>, context: MultiTurnExecuteContext<TInput, TOutput>, options?: MultiTurnExecuteOptions): Promise<MultiTurnTestResult<TInput, TOutput>>;
2275
+
2276
/** Options for {@link aiUser}, the LLM-simulated user for multi-turn tests. */
interface AIUserOptions<TInput, TOutput> {
    /** Provider for generating user responses */
    provider: Provider;
    /** System prompt (string or function for dynamic personas). Uses default if not provided. */
    systemPrompt?: string | ((context: ConversationContext<TInput, TOutput>) => string);
    /** Custom history formatter. Default: JSON-based "User: {input}\nAssistant: {output}" format. */
    formatHistory?: (context: ConversationContext<TInput, TOutput>) => string;
    /** Convert LLM text response to TInput. Has access to full context for structured input building. */
    buildInput: (llmResponse: string, context: ConversationContext<TInput, TOutput>) => TInput;
}
/**
 * Creates an async function that generates user inputs using an LLM for multi-turn testing.
 *
 * @param options - Provider, persona, and input-building configuration
 * @returns An async input factory suitable for {@link FollowUpInput.input}
 *
 * @example
 * ```typescript
 * aiUser({
 *   provider: openai,
 *   systemPrompt: 'You are a friendly customer.',
 *   buildInput: (response, ctx) => ({ message: response }),
 * })
 * ```
 */
declare function aiUser<TInput, TOutput>(options: AIUserOptions<TInput, TOutput>): (context: ConversationContext<TInput, TOutput>) => Promise<TInput>;
2299
+
2300
/**
 * CLI Configuration Types
 *
 * Defines the configuration schema for `agent-eval.config.ts` files.
 * Use `defineConfig()` helper for type inference and IDE autocompletion.
 */

/**
 * LLM provider configuration.
 * API keys fall back to OPENAI_API_KEY or GOOGLE_API_KEY env vars.
 */
interface LLMConfig {
    /** LLM provider */
    provider: 'openai' | 'gemini';
    /** API key (optional - falls back to environment variable) */
    apiKey?: string;
    /** Default model to use */
    defaultModel?: string;
    /**
     * OpenAI reasoning effort (o1/o3 models only)
     * @see https://platform.openai.com/docs/guides/reasoning
     */
    reasoningEffort?: 'minimal' | 'low' | 'medium' | 'high';
    /**
     * Default response format
     * @see https://platform.openai.com/docs/guides/structured-outputs
     */
    defaultResponseFormat?: {
        type: 'json_object' | 'text';
    };
}
2331
/** Judge section of the CLI configuration. */
interface CLIJudgeConfig {
    /**
     * LLM configuration for judge.
     * If not specified, uses the main `llm` config.
     */
    llm?: LLMConfig;
    /**
     * Evaluation criteria.
     * Use built-in criteria factories like `accuracy()`, `relevance()`,
     * or define custom criteria objects.
     */
    criteria: Array<Criterion | ValidatorCriterion>;
    /**
     * Score threshold for passing (0-100).
     * @default 70
     */
    passThreshold?: number;
    /**
     * Custom judge prompt.
     * If not specified, uses the default judge prompt.
     */
    prompt?: JudgePrompt;
}
/** Improver section of the CLI configuration. */
interface CLIImproverConfig {
    /**
     * LLM configuration for improver.
     * If not specified, uses the main `llm` config.
     */
    llm?: LLMConfig;
    /**
     * Custom improver prompt.
     * If not specified, uses the default improver prompt.
     */
    prompt?: ImproverPrompt;
}
2366
/** Report output section of the CLI configuration. */
interface OutputConfig {
    /**
     * Directory for report output.
     * @default './reports'
     */
    dir?: string;
    /**
     * Custom filename pattern.
     * Supports `{timestamp}` placeholder.
     * @default 'eval-{timestamp}.md'
     */
    filename?: string;
    /**
     * Include verbose details in console output.
     * @default false
     */
    verbose?: boolean;
}
/** Test execution section of the CLI configuration. */
interface RunConfig {
    /**
     * Number of concurrent test executions.
     * @default 1
     */
    concurrency?: number;
    /**
     * Number of iterations per test case (for statistical analysis).
     * @default 1
     */
    iterations?: number;
    /**
     * Stop execution on first test failure.
     * @default false
     */
    stopOnFirstFailure?: boolean;
}
2401
/** Single-turn CLI test case; `multiTurn?: never` makes the union discriminable. */
interface CLISingleTurnTestCase<TInput> extends TestCase<TInput> {
    /** Test case must NOT have multiTurn field */
    multiTurn?: never;
}
/** Multi-turn CLI test case; mirrors {@link MultiTurnTestCase}'s `multiTurn` shape. */
interface CLIMultiTurnTestCase<TInput, TOutput = unknown> extends TestCase<TInput> {
    /** Multi-turn configuration */
    multiTurn: {
        /**
         * Inputs for 2nd turn onwards.
         * First turn uses `input` field.
         */
        followUpInputs?: FollowUpInput<TInput, TOutput>[];
        /**
         * Termination conditions (OR relationship).
         * Any one triggers termination.
         */
        terminateWhen: TerminationCondition<TInput, TOutput>[];
        /**
         * Safety limit: maximum turns.
         * @default 10
         */
        maxTurns?: number;
        /**
         * Outcome when termination condition is met.
         * @default 'pass'
         */
        onConditionMet?: 'pass' | 'fail';
        /**
         * Outcome when maxTurns is reached.
         * @default 'fail'
         */
        onMaxTurnsReached?: 'pass' | 'fail';
    };
}
/** A CLI test case: either single-turn or multi-turn (discriminated on `multiTurn`). */
type CLITestCase<TInput, TOutput = unknown> = CLISingleTurnTestCase<TInput> | CLIMultiTurnTestCase<TInput, TOutput>;
2436
+ /**
2437
+ * Main evaluation configuration for CLI.
2438
+ * @typeParam TInput - Agent input type
2439
+ * @typeParam TOutput - Agent output type
2440
+ */
2441
+ interface EvalConfig<TInput = unknown, TOutput = unknown> {
2442
+ /**
2443
+ * Human-readable name for this evaluation.
2444
+ */
2445
+ name?: string;
2446
+ /**
2447
+ * Description of what the agent does.
2448
+ * Used by Judge for evaluation context.
2449
+ */
2450
+ agentDescription?: string;
2451
+ /**
2452
+ * The agent to evaluate.
2453
+ */
2454
+ agent: EvalAgent<TInput, TOutput>;
2455
+ /**
2456
+ * LLM configuration (shared by Judge and Improver unless overridden).
2457
+ */
2458
+ llm: LLMConfig;
2459
+ /**
2460
+ * Judge configuration for evaluating agent outputs.
2461
+ */
2462
+ judge: CLIJudgeConfig;
2463
+ /**
2464
+ * Improver configuration for prompt improvement suggestions.
2465
+ * Optional - if not specified, no improvements are generated.
2466
+ */
2467
+ improver?: CLIImproverConfig;
2468
+ /**
2469
+ * Test cases to run (inline TypeScript definition).
2470
+ * Can mix single-turn and multi-turn test cases.
2471
+ *
2472
+ * Either `testCases` or `include` must be provided.
2473
+ * - Use `testCases` for inline TypeScript test case definitions
2474
+ * - Use `include` for YAML-based test case files
2475
+ */
2476
+ testCases?: CLITestCase<TInput, TOutput>[];
2477
+ /**
2478
+ * Output configuration for reports.
2479
+ */
2480
+ output?: OutputConfig;
2481
+ /**
2482
+ * Run configuration for test execution.
2483
+ */
2484
+ run?: RunConfig;
2485
+ /**
2486
+ * Pricing configuration for cost calculation.
2487
+ * If provided, cost breakdown will be included in test metrics.
2488
+ *
2489
+ * @example
2490
+ * ```typescript
2491
+ * pricing: {
2492
+ * openai: { 'gpt-4o': { inputPricePerMillion: 2.5, outputPricePerMillion: 10 } },
2493
+ * fallback: { inputPricePerMillion: 1.0, outputPricePerMillion: 3.0 },
2494
+ * }
2495
+ * ```
2496
+ */
2497
+ pricing?: EvalPricingConfig;
2498
+ /**
2499
+ * Glob patterns to discover YAML eval files.
2500
+ * Required when using YAML-based test cases instead of inline testCases.
2501
+ *
2502
+ * @example
2503
+ * ```typescript
2504
+ * include: ['evals/booking/*.eval.yaml']
2505
+ * ```
2506
+ */
2507
+ include?: string[];
2508
+ /**
2509
+ * Agent registry for YAML file references.
2510
+ * YAML files reference agents by name (e.g., `agent: booking-agent`).
2511
+ *
2512
+ * @example
2513
+ * ```typescript
2514
+ * agents: {
2515
+ * 'booking-agent': bookingAgent,
2516
+ * 'qa-agent': qaAgent,
2517
+ * }
2518
+ * ```
2519
+ */
2520
+ agents?: Record<string, EvalAgent<unknown, unknown>>;
2521
+ }
2522
/**
 * Identity function for type inference and IDE autocompletion.
 *
 * Wrap an eval configuration in `defineConfig` to get full type checking
 * and editor hints; the value is returned unchanged at runtime.
 *
 * @param config - The evaluation configuration to type-check
 * @returns The same `config` object, unmodified
 */
declare function defineConfig<TInput = unknown, TOutput = unknown>(config: EvalConfig<TInput, TOutput>): EvalConfig<TInput, TOutput>;
2524
+
2525
/**
 * Options controlling how YAML eval files are discovered on disk.
 * Passed to {@link discoverEvalFiles}.
 */
interface DiscoverOptions {
    /** Override config include patterns (CLI --include) */
    include?: string[];
    /** Base directory for glob patterns (defaults to process.cwd()) */
    cwd?: string;
    /** Ignore patterns (default excludes node_modules) */
    ignore?: string[];
}
2533
/**
 * Discover YAML eval files matching glob patterns.
 *
 * Patterns supplied via `options.include` (e.g. from the CLI) take
 * precedence over the config's own `include` patterns.
 *
 * @param config - Eval config providing the default `include` glob patterns
 * @param options - Optional overrides: patterns, base directory, ignore list
 * @returns Promise resolving to the matched eval file paths
 */
declare function discoverEvalFiles(config: Pick<EvalConfig, 'include'>, options?: DiscoverOptions): Promise<string[]>;
2535
+
2536
/**
 * Terminates when the average score reaches or exceeds threshold.
 * @param threshold - Average score at which the cycle should stop
 */
declare function targetScore(threshold: number): TargetScoreCondition;
/**
 * Terminates after completing the specified number of rounds.
 * @param count - Maximum number of rounds to run
 */
declare function maxRounds(count: number): MaxRoundsCondition;
/**
 * Terminates when score hasn't improved for N consecutive rounds.
 * @param consecutiveRounds - Number of stagnant rounds that triggers termination
 * @param minDelta - Minimum score increase counted as an improvement
 */
declare function noImprovement(consecutiveRounds: number, minDelta?: number): NoImprovementCondition;
/**
 * Terminates when total accumulated cost reaches or exceeds the budget.
 * @param maxUSD - Budget in US dollars
 */
declare function maxCost(maxUSD: number): MaxCostCondition;
/**
 * Custom termination condition with arbitrary logic. Supports async checks.
 * @param check - Predicate evaluated against the current cycle context
 * @param description - Human-readable label for reports/logs
 */
declare function customCondition(check: (ctx: CycleContext) => boolean | Promise<boolean>, description?: string): CustomCycleCondition;
/** All conditions must be met for termination. Short-circuits on first false. */
declare function and(...conditions: CycleTerminationCondition[]): CustomCycleCondition;
/** Any condition being met causes termination. Short-circuits on first true. */
declare function or(...conditions: CycleTerminationCondition[]): CustomCycleCondition;
/** Invert a condition's result. Terminates when inner condition does NOT terminate. */
declare function not(condition: CycleTerminationCondition): CustomCycleCondition;
/**
 * Dispatches to the appropriate check function based on condition type.
 * @returns Result indicating whether the condition requests termination
 */
declare function checkCycleCondition(condition: CycleTerminationCondition, context: CycleContext): Promise<CycleTerminationResult>;
/**
 * Check all conditions with OR semantics - first match wins.
 * @returns Result of the first condition that requests termination, if any
 */
declare function checkCycleTermination(conditions: CycleTerminationCondition[], context: CycleContext): Promise<CycleTerminationResult>;
2556
+
2557
/**
 * Run an improvement cycle as an AsyncGenerator for Human-in-the-Loop control.
 * Yields after each round for decision-making (continue, stop, or rollback).
 *
 * @param config - Cycle configuration (agent, judge, improver, termination conditions)
 * @returns AsyncGenerator yielding a {@link RoundYield} per round, accepting an
 *          optional {@link RoundDecision} via `next()`, and returning the final
 *          {@link ImprovementCycleResult} when the cycle ends
 */
declare function runImprovementCycle<TInput, TOutput>(config: ImprovementCycleConfig<TInput, TOutput>): AsyncGenerator<RoundYield, ImprovementCycleResult<TInput, TOutput>, RoundDecision | undefined>;
/**
 * Run improvement cycle with automatic approval of all suggestions.
 * Continues until a termination condition is met.
 *
 * @param config - Cycle configuration (agent, judge, improver, termination conditions)
 * @returns Promise resolving to the completed cycle's result
 */
declare function runImprovementCycleAuto<TInput, TOutput>(config: ImprovementCycleConfig<TInput, TOutput>): Promise<ImprovementCycleResult<TInput, TOutput>>;
2567
+
2568
/**
 * Options for random selection.
 * Used by {@link TestCaseCollection.random}.
 */
interface RandomOptions {
    /** Seed for reproducible random selection */
    seed?: number;
}
2575
/**
 * Immutable collection for managing and selecting test cases.
 * Provides fluent API for filtering, sampling, and accessing test cases.
 *
 * ## Immutability
 * - All selection methods (`filter`, `first`, `random`, etc.) return **new collections**
 * - Chaining creates intermediate collections without modifying the original
 * - Internal array is frozen with `Object.freeze()` to prevent accidental mutation
 * - `toArray()` returns a **mutable copy** for consumer convenience
 *
 * @example
 * ```typescript
 * import { TestCaseCollection, createEvalSuite } from '@agtlantis/eval'
 *
 * const cases = TestCaseCollection.from([
 *   { id: 'basic', input: { query: 'Hello' } },
 *   { id: 'complex', input: { query: 'Explain quantum computing' } },
 *   { id: 'edge', input: { query: '' } },
 * ])
 *
 * // Development: quick feedback
 * await suite.run(cases.minimal().toArray())
 *
 * // CI: full coverage
 * await suite.run(cases.all().toArray())
 *
 * // Debugging specific case
 * await suite.run(cases.byIds(['edge']).toArray())
 *
 * // Chaining: filter then sample
 * const filtered = cases.filter(tc => tc.tags?.includes('fast')).random(3).toArray()
 * ```
 */
declare class TestCaseCollection<TInput> {
    /** Backing store of test cases; frozen to enforce immutability. */
    private readonly cases;
    /** Private: use the static factories {@link from} / {@link empty} instead. */
    private constructor();
    /**
     * Create a collection from an array of test cases.
     */
    static from<T>(cases: TestCase<T>[]): TestCaseCollection<T>;
    /**
     * Create an empty collection.
     */
    static empty<T>(): TestCaseCollection<T>;
    /**
     * Number of test cases in the collection.
     */
    get length(): number;
    /**
     * Whether the collection is empty.
     */
    get isEmpty(): boolean;
    /**
     * Returns all test cases.
     * Returns `this` since the collection is immutable (frozen array).
     * Useful as explicit starting point in chains.
     */
    all(): TestCaseCollection<TInput>;
    /**
     * Returns the first N test cases (default: 1).
     * Useful for cost-controlled testing during development.
     */
    minimal(count?: number): TestCaseCollection<TInput>;
    /**
     * Returns the first N test cases.
     */
    first(count: number): TestCaseCollection<TInput>;
    /**
     * Returns the last N test cases (default: 1).
     * Preserves original order (earlier cases first).
     */
    last(count?: number): TestCaseCollection<TInput>;
    /**
     * Returns N random test cases.
     *
     * @param count - Number of cases to select
     * @param options - Optional seed for reproducibility
     *
     * @example
     * ```typescript
     * // Different each time
     * collection.random(5)
     *
     * // Same result with same seed
     * collection.random(5, { seed: 42 })
     * ```
     */
    random(count: number, options?: RandomOptions): TestCaseCollection<TInput>;
    /**
     * Filter test cases by predicate.
     */
    filter(predicate: (testCase: TestCase<TInput>) => boolean): TestCaseCollection<TInput>;
    /**
     * Find test case by ID.
     * Returns collection with single case or empty collection.
     */
    byId(id: string): TestCaseCollection<TInput>;
    /**
     * Find test cases by multiple IDs.
     * Preserves order of provided IDs (first occurrence).
     * Skips non-existent IDs. Duplicate IDs in input are deduplicated.
     *
     * @example
     * ```typescript
     * collection.byIds(['a', 'b', 'a']) // returns [case-a, case-b] (deduplicated)
     * collection.byIds(['b', 'a']) // returns [case-b, case-a] (order preserved)
     * ```
     */
    byIds(ids: string[]): TestCaseCollection<TInput>;
    /**
     * Get test case by ID.
     * Returns undefined if not found.
     */
    get(id: string): TestCase<TInput> | undefined;
    /**
     * Get test case by index.
     * Supports negative indices (e.g., -1 for last item).
     * Returns undefined if index is out of bounds.
     */
    at(index: number): TestCase<TInput> | undefined;
    /**
     * Convert to array.
     * Returns a mutable copy of the internal array.
     */
    toArray(): TestCase<TInput>[];
    /**
     * Iterator support for for...of loops and spread operator.
     */
    [Symbol.iterator](): Iterator<TestCase<TInput>>;
}
2705
/**
 * Create a single test case with auto-generated ID if not provided.
 *
 * Auto-generated IDs use a global counter: `test-1`, `test-2`, etc.
 *
 * @param input - The test case input data
 * @param id - Optional custom ID (uses auto-generated if omitted)
 * @returns A TestCase object
 *
 * @example
 * ```typescript
 * const case1 = testCase({ name: 'Alice' }) // id: 'test-1'
 * const case2 = testCase({ name: 'Bob' }) // id: 'test-2'
 * const case3 = testCase({ name: 'Charlie' }, 'custom-id') // id: 'custom-id'
 * ```
 *
 * @remarks
 * The global counter increments on every call. For deterministic IDs,
 * provide an explicit ID or use `testCases()` with a prefix.
 */
declare function testCase<TInput>(input: TInput, id?: string): TestCase<TInput>;
2726
/**
 * Create multiple test cases from inputs.
 * Auto-generates IDs with optional prefix.
 *
 * @param inputs - Input data for each test case, one per element
 * @param prefix - Optional ID prefix; IDs take the form `<prefix>-<index>`
 * @returns Array of TestCase objects in the same order as `inputs`
 *
 * @example
 * ```typescript
 * const cases = testCases([{ name: 'Alice' }, { name: 'Bob' }], 'greet')
 * // Results in: [{ id: 'greet-0', input: {...} }, { id: 'greet-1', input: {...} }]
 * ```
 */
declare function testCases<TInput>(inputs: TInput[], prefix?: string): TestCase<TInput>[];
2737
+
2738
+ export { type AIUserOptions, type AgentMetadata, type AgentPrompt, type AgentResult, type AggregatedMetrics, type ApplyPromptSuggestionsOptions, type ApplySuggestionsResult, type CLIImproverConfig, type CLIJudgeConfig, type CLIMultiTurnTestCase, type CLISingleTurnTestCase, type CLITestCase, type ComponentMetadata, CompositeReporter, ConsoleReporter, type ConsoleReporterOptions, type ContinueResult, type ConversationContext, type ConversationEntry, type CostBreakdown, type CostSummary, type Criterion, type CriterionOptions, type CustomCondition, type CustomCycleCondition, type CycleContext, type CycleContinueResult, type CycleMarkdownOptions, type CycleTerminatedResult, type CycleTerminationCondition, type CycleTerminationResult, type DiscoverOptions, type EvalAgent, type EvalAgentConfig, type EvalConfig, type EvalContext, EvalError, EvalErrorCode, type EvalErrorOptions, type EvalPricingConfig, type EvalReport, type EvalResultKind, type EvalSuite, type EvalSuiteConfig, type EvalTestResult, type EvalTokenUsage, type ExecuteContext, type FieldSetCondition, type FieldValueCondition, type FieldsCondition, type FileContent, type FileContentMetadata, type FileReporterOptions, type FollowUpInput, type HistoryConfig, type HistoryStorage, type ImproveResult, type ImprovementCycleConfig, type ImprovementCycleOptions, type ImprovementCycleResult, type ImprovementHistory, type ImprovementSession, type Improver, type ImproverConfig, type ImproverContext, type ImproverMetadata, type ImproverPrompt, type IterationData, type IterationStats, JsonReporter, type Judge, type JudgeConfig, type JudgeContext, type JudgeMetadata, type JudgePrompt, type JudgeResult, type LLMConfig, type LogCycleOptions, type LogVerbosity, MarkdownReporter, type MarkdownReporterOptions, type MaxCostCondition, type MaxRoundsCondition, type MaxTurnsCondition, type MetricsResult, type MetricsWithCost, type MockAgentConfig, type MockImproverConfig, type MockJudgeConfig, type MultiTurnData, type 
MultiTurnExecuteContext, type MultiTurnExecuteOptions, type MultiTurnIteratedResult, type MultiTurnIterationStats, type MultiTurnResult, type MultiTurnTestCase, type MultiTurnTestResult, type NaturalLanguageConditionOptions, type NoImprovementCondition, type OutputConfig, type RandomOptions, type ReportComparison, type ReportMarkdownOptions, type ReportRunnerOptions, type ReportSummary, type Reporter, type RoundCost, type RoundDecision, type RoundResult, type RoundYield, type RunConfig, type RunOptions, type SaveCycleJsonOptions, type SchemaOptions, type SchemaValidationResult, type SerializedPrompt, type SerializedRoundResult, type SessionConfig, type SingleTurnIteratedResult, type SingleTurnResult, type Suggestion, type TargetScoreCondition, type TerminatedResult, type TerminationCheckResult, type TerminationCondition, type TerminationInfo, type TestCase, TestCaseCollection, type TestResult, type TestResultWithCost, type TestResultWithVerdict, type ValidatorCriterion, type ValidatorFn, type Verdict, type ZodIssue, accuracy, addCostsToResults, afterTurns, aggregateIterationResults, aiUser, and$1 as and, applyPromptSuggestions, bumpVersion, calculateAvgPassRate, calculateAvgStdDev, calculateIterationStats, calculateMultiTurnIterationStats, calculateReportCosts, calculateResultCost, checkCondition, checkCycleCondition, checkCycleTermination, checkTermination, compareReports, consistency, createCompositeReporter, createConsoleReporter, createDefaultReporter, createEvalSuite, createImprover, createJsonReporter, createJudge, createMarkdownReporter, createMockAgent, createMockImprover, createMockJudge, createReportRunner, createSession, customCondition, and as cycleAnd, not as cycleNot, or as cycleOr, cycleToMarkdown, defaultHistoryStorage, defineConfig, deserializePrompt, discoverEvalFiles, executeMultiTurnTestCase, executeTestCase, fieldEquals, fieldIsSet, getFieldValue, isCustomCondition, isCustomCycleCondition, isCycleTerminated, isFieldSetCondition, 
isFieldValueCondition, isIteratedResult, isMaxCostCondition, isMaxRoundsCondition, isMaxTurnsCondition, isMultiTurnResult, isMultiTurnTestCase, isNoImprovementCondition, isSingleTurnResult, isTargetScoreCondition, isTerminated, loadHistory, logCycle, maxCost, maxRounds, naturalLanguage, noImprovement, not$1 as not, or$1 as or, relevance, reportToMarkdown, resumeSession, runImprovementCycle, runImprovementCycleAuto, runWithConcurrency, saveCycleJson, saveCycleMarkdown, saveHistory, saveReportMarkdown, schema, selectRepresentativeResult, serializePrompt, suggestionDiff, suggestionPreview, suggestionSummary, targetScore, testCase, testCases, toEvalAgent };