@agtlantis/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,3868 @@
1
+ // src/core/runner.ts
2
+ import { resolveFileSourcesInInput as resolveFileSourcesInInput2 } from "@agtlantis/core";
3
+
4
+ // src/multi-turn/types.ts
5
/**
 * Reports whether a termination condition is of the "maxTurns" variety.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when `condition.type` is exactly "maxTurns".
 */
function isMaxTurnsCondition(condition) {
  const { type } = condition;
  return type === "maxTurns";
}
8
/**
 * Reports whether a termination condition is of the "fieldSet" variety.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when `condition.type` is exactly "fieldSet".
 */
function isFieldSetCondition(condition) {
  const { type } = condition;
  return type === "fieldSet";
}
11
/**
 * Reports whether a termination condition is of the "fieldValue" variety.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when `condition.type` is exactly "fieldValue".
 */
function isFieldValueCondition(condition) {
  const { type } = condition;
  return type === "fieldValue";
}
14
/**
 * Reports whether a termination condition is of the "custom" variety.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when `condition.type` is exactly "custom".
 */
function isCustomCondition(condition) {
  const { type } = condition;
  return type === "custom";
}
17
/**
 * Reports whether a test case carries a `multiTurn` property (own or
 * inherited), regardless of that property's value.
 *
 * @param {object} testCase2 - Test case to inspect; must be an object.
 * @returns {boolean} True when the `multiTurn` key is present.
 */
function isMultiTurnTestCase(testCase2) {
  return Reflect.has(testCase2, "multiTurn");
}
20
/**
 * Reports whether a termination-check result signals termination.
 *
 * @param {{ terminated?: unknown }} result - Termination-check result.
 * @returns {boolean} True only when `result.terminated` is the boolean `true`.
 */
function isTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
23
+
24
+ // src/core/errors.ts
25
/**
 * Error codes attached to EvalError instances. Every key maps to a string
 * equal to its own name.
 */
var EvalErrorCode = Object.fromEntries(
  [
    "LLM_API_ERROR",
    "LLM_RATE_LIMIT",
    "LLM_TIMEOUT",
    "JSON_PARSE_ERROR",
    "VERDICT_PARSE_ERROR",
    "TEMPLATE_COMPILE_ERROR",
    "AGENT_EXECUTION_ERROR",
    "INVALID_CONFIG",
    "MISSING_API_KEY",
    "PROMPT_NOT_FOUND",
    "PROMPT_INVALID_FORMAT",
    "PROMPT_WRITE_ERROR",
    "PROMPT_READ_ERROR",
    "SUGGESTION_APPLY_ERROR",
    "SCHEMA_VALIDATION_ERROR",
    "SCHEMA_GENERATION_ERROR",
    "FILE_READ_ERROR",
    "FILE_WRITE_ERROR",
    "FILE_TOO_LARGE",
    "CONCURRENT_MODIFICATION",
    "UNKNOWN_ERROR"
  ].map((codeName) => [codeName, codeName])
);
49
/**
 * Error type used throughout the evaluation library. Carries a machine
 * readable `code`, an optional underlying `cause`, and free-form `context`.
 */
var EvalError = class _EvalError extends Error {
  code;
  cause;
  context;

  /**
   * @param {string} message - Human readable description.
   * @param {{ code: string, cause?: Error, context?: object }} options -
   *   Required options bag; `code`, `cause` and `context` are copied onto
   *   the instance.
   */
  constructor(message, options) {
    super(message);
    Object.assign(this, {
      name: "EvalError",
      code: options.code,
      cause: options.cause,
      context: options.context
    });
    // captureStackTrace is engine specific, so probe for it before calling.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _EvalError);
    }
  }

  /**
   * Wraps an arbitrary thrown value in an EvalError with the given code.
   * Values that already are EvalError instances are returned unchanged
   * (their code and context are NOT overwritten).
   *
   * @param {unknown} error - Thrown value to wrap.
   * @param {string} code - Code for the newly created error.
   * @param {object} [context] - Context for the newly created error.
   * @returns {EvalError} The original or the wrapping error.
   */
  static from(error, code, context) {
    if (error instanceof _EvalError) {
      return error;
    }
    let cause = error;
    if (!(cause instanceof Error)) {
      cause = new Error(String(error));
    }
    return new _EvalError(cause.message, { code, cause, context });
  }

  /**
   * Plain-object view of the error; the cause is reduced to its message.
   *
   * @returns {{ name: string, message: string, code: string, context: unknown, cause: (string|undefined) }}
   */
  toJSON() {
    const { name, message, code, context, cause } = this;
    return { name, message, code, context, cause: cause?.message };
  }
};
83
+
84
+ // src/multi-turn/termination.ts
85
/**
 * Reads a nested value by following a dot-separated path.
 *
 * @param {unknown} obj - Root value to read from.
 * @param {string} fieldPath - Dot-separated property path, e.g. "a.b.c".
 * @returns {unknown} The value at the path, or `undefined` when the root
 *   or any intermediate value is null/undefined or not an object.
 */
function getFieldValue(obj, fieldPath) {
  if (obj == null) {
    return void 0;
  }
  let node = obj;
  for (const key of fieldPath.split(".")) {
    // Stop descending as soon as the current node cannot hold properties.
    if (node == null || typeof node !== "object") {
      return void 0;
    }
    node = node[key];
  }
  return node;
}
102
/**
 * Reports whether a value is neither `null` nor `undefined`.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} False only for `null` and `undefined`.
 */
function isSet(value) {
  return !(value === null || value === void 0);
}
105
/**
 * Evaluates a "maxTurns" condition against the current turn number.
 *
 * @param {{ count: number }} condition - Condition holding the turn limit.
 * @param {{ currentTurn: number }} context - Current conversation context.
 * @returns {object} A terminated result (type "maxTurns") once
 *   `currentTurn >= count`, otherwise a non-terminated progress result.
 */
function checkMaxTurns(condition, context) {
  const { currentTurn } = context;
  const { count } = condition;
  if (!(currentTurn >= count)) {
    return {
      terminated: false,
      reason: `Turn ${currentTurn} of ${count}`
    };
  }
  return {
    terminated: true,
    terminationType: "maxTurns",
    matchedCondition: condition,
    reason: `Maximum turns reached (${count})`
  };
}
120
/**
 * Evaluates a "fieldSet" condition: terminates when the field at
 * `condition.fieldPath` of the last output is neither null nor undefined.
 *
 * @param {{ fieldPath: string }} condition - Condition naming the field.
 * @param {{ lastOutput: unknown }} context - Current conversation context.
 * @returns {object} Terminated result (type "condition") when the field is
 *   set, otherwise a non-terminated result.
 */
function checkFieldSet(condition, context) {
  const { fieldPath } = condition;
  const value = getFieldValue(context.lastOutput, fieldPath);
  if (!isSet(value)) {
    return {
      terminated: false,
      reason: `Field "${fieldPath}" is not set`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${fieldPath}" is set (value: ${JSON.stringify(value)})`
  };
}
136
/**
 * Evaluates a "fieldValue" condition: terminates when the field at
 * `condition.fieldPath` of the last output is strictly equal (`===`) to
 * `condition.expectedValue`.
 *
 * @param {{ fieldPath: string, expectedValue: unknown }} condition
 * @param {{ lastOutput: unknown }} context - Current conversation context.
 * @returns {object} Terminated result (type "condition") on a match,
 *   otherwise a non-terminated result that reports the actual value.
 */
function checkFieldValue(condition, context) {
  const { fieldPath, expectedValue } = condition;
  const actual = getFieldValue(context.lastOutput, fieldPath);
  if (actual !== expectedValue) {
    return {
      terminated: false,
      reason: `Field "${fieldPath}" does not equal expected value (got: ${JSON.stringify(actual)})`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${fieldPath}" equals expected value`
  };
}
152
/**
 * Evaluates a "custom" condition by awaiting its `check` callback.
 *
 * A callback that throws or rejects does not propagate: it yields a
 * non-terminated result whose reason carries the failure message.
 *
 * @param {{ check: Function, description?: string }} condition
 * @param {object} context - Conversation context handed to `check`.
 * @returns {Promise<object>} Terminated result (type "condition") when the
 *   callback resolves truthy, otherwise a non-terminated result.
 */
async function checkCustom(condition, context) {
  const label = condition.description ?? "Custom condition";
  let met;
  try {
    met = await condition.check(context);
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    return {
      terminated: false,
      reason: `${label} failed: ${detail}`
    };
  }
  if (!met) {
    return {
      terminated: false,
      reason: `${label} not met`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `${label} met`
  };
}
176
/**
 * Dispatches a termination condition to the checker for its `type`.
 *
 * @param {{ type: string }} condition - Condition to evaluate.
 * @param {object} context - Current conversation context.
 * @returns {Promise<object>} The result of the matching checker.
 * @throws {EvalError} With code UNKNOWN_ERROR when the type is not one of
 *   "maxTurns", "fieldValue", "fieldSet" or "custom".
 */
async function checkCondition(condition, context) {
  switch (condition.type) {
    case "maxTurns":
      return checkMaxTurns(condition, context);
    case "fieldValue":
      return checkFieldValue(condition, context);
    case "fieldSet":
      return checkFieldSet(condition, context);
    case "custom":
      return checkCustom(condition, context);
    default: {
      const unknownCondition = condition;
      throw new EvalError(`Unknown condition type: ${JSON.stringify(unknownCondition)}`, {
        code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
        context: { condition: unknownCondition }
      });
    }
  }
}
195
/**
 * Evaluates termination conditions one after another, in order, and
 * returns the first result that signals termination.
 *
 * @param {object[]} conditions - Conditions to evaluate (OR semantics).
 * @param {object} context - Current conversation context.
 * @returns {Promise<object>} The first terminated result, or a
 *   non-terminated result when the list is empty or nothing matched.
 */
async function checkTermination(conditions, context) {
  if (conditions.length === 0) {
    return { terminated: false, reason: "No termination conditions specified" };
  }
  for (const candidate of conditions) {
    // Sequential on purpose: later conditions are skipped once one matches.
    const outcome = await checkCondition(candidate, context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return { terminated: false, reason: "No termination conditions met" };
}
213
+
214
+ // src/utils/json.ts
215
/**
 * Shortens a string to at most `maxLength` characters, appending "..."
 * when anything was cut off.
 *
 * @param {string|null|undefined} str - Text to shorten.
 * @param {number} maxLength - Maximum number of characters kept.
 * @returns {string} "" for falsy input, the input itself when it already
 *   fits, otherwise the first `maxLength` characters followed by "...".
 */
function truncate(str, maxLength) {
  if (!str) {
    return "";
  }
  return str.length > maxLength ? `${str.slice(0, maxLength)}...` : str;
}
224
+
225
+ // src/utils/condition-composites.ts
226
/**
 * Builds a check callback that is true only when every condition reports
 * termination. Conditions are evaluated in order and evaluation stops at
 * the first one that does not terminate.
 *
 * @param {object[]} conditions - Conditions that must all be met.
 * @param {Function} checkFn - `(condition, context) => result` evaluator.
 * @returns {(context: object) => Promise<boolean>} The combined check.
 */
function createAndCheck(conditions, checkFn) {
  return async (context) => {
    for (const member of conditions) {
      const { terminated } = await checkFn(member, context);
      if (!terminated) {
        return false;
      }
    }
    return true;
  };
}
237
/**
 * Builds a check callback that is true as soon as any condition reports
 * termination. Conditions are evaluated in order and evaluation stops at
 * the first match.
 *
 * @param {object[]} conditions - Conditions of which one must be met.
 * @param {Function} checkFn - `(condition, context) => result` evaluator.
 * @returns {(context: object) => Promise<boolean>} The combined check.
 */
function createOrCheck(conditions, checkFn) {
  return async (context) => {
    for (const member of conditions) {
      const { terminated } = await checkFn(member, context);
      if (terminated) {
        return true;
      }
    }
    return false;
  };
}
248
/**
 * Builds a check callback that inverts a single condition.
 *
 * @param {object} condition - Condition to negate.
 * @param {Function} checkFn - `(condition, context) => result` evaluator.
 * @returns {(context: object) => Promise<boolean>} True when the wrapped
 *   condition does NOT report termination.
 */
function createNotCheck(condition, checkFn) {
  return async (context) => {
    const { terminated } = await checkFn(condition, context);
    return !terminated;
  };
}
254
/**
 * Renders a label for a composite condition from its member types.
 *
 * @param {string} type - Composite name, e.g. "and" or "or".
 * @param {{ type: string }[]} conditions - Member conditions.
 * @returns {string} `type(t1, t2, ...)`, or a note that the composite
 *   never terminates when there are no members.
 */
function formatCompositeDescription(type, conditions) {
  if (conditions.length === 0) {
    return `${type}() - empty, never terminates`;
  }
  const memberTypes = conditions.map((member) => member.type);
  return `${type}(${memberTypes.join(", ")})`;
}
260
+
261
+ // src/multi-turn/conditions.ts
262
/**
 * Creates a custom termination condition that asks an LLM whether a
 * natural-language condition is met.
 *
 * The check sends the condition, the serialized conversation history and
 * the last output to the provider, and terminates when the lower-cased,
 * trimmed reply starts with "yes".
 *
 * @param {{ provider: object, prompt: string, systemPrompt?: string }} options
 *   `provider` must expose `simpleExecution(fn)` returning an object with
 *   an async `result()`; `prompt` is the condition text; `systemPrompt`
 *   replaces the built-in system prompt when given.
 * @returns {{ type: "custom", check: Function, description: string }}
 *   Custom condition. `check` throws the provider error when execution
 *   fails and a generic Error when it is canceled.
 */
function naturalLanguage(options) {
  const { provider, prompt, systemPrompt } = options;
  const defaultSystemPrompt = `You are an assistant that evaluates whether a conversation should terminate.
Analyze the conversation history and determine if the specified condition is met.
Respond with ONLY "yes" or "no" - nothing else.`;

  const renderTurn = (h) => `Turn ${h.turn}:
Input: ${JSON.stringify(h.input)}
Output: ${JSON.stringify(h.output)}`;

  const check = async (context) => {
    const historyText = context.history.map(renderTurn).join("\n\n");
    const userPrompt = `## Termination Condition
${prompt}

## Conversation History
${historyText || "(No history yet)"}

## Current Turn
Turn: ${context.currentTurn}
Last Output: ${JSON.stringify(context.lastOutput)}

Should the conversation terminate based on the condition above? Answer "yes" or "no" only.`;
    const execution = provider.simpleExecution(async (session) => {
      const generated = await session.generateText({
        messages: [
          { role: "system", content: systemPrompt ?? defaultSystemPrompt },
          { role: "user", content: userPrompt }
        ]
      });
      return generated.text;
    });
    const outcome = await execution.result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    // A reply equal to "yes" also starts with "yes", so one test covers both.
    return outcome.value.toLowerCase().trim().startsWith("yes");
  };

  return {
    type: "custom",
    check,
    description: `NL: ${truncate(prompt, 50)}`
  };
}
306
/**
 * Combines conditions into one custom condition that terminates only
 * when all of them terminate. With no arguments the result never
 * terminates.
 *
 * @param {...object} conditions - Conditions that must all be met.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function and(...conditions) {
  const hasMembers = conditions.length !== 0;
  return {
    type: "custom",
    check: hasMembers ? createAndCheck(conditions, checkCondition) : () => false,
    description: formatCompositeDescription("and", conditions)
  };
}
320
/**
 * Combines conditions into one custom condition that terminates as soon
 * as any of them terminates. With no arguments the result never
 * terminates.
 *
 * @param {...object} conditions - Conditions of which one must be met.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function or(...conditions) {
  const hasMembers = conditions.length !== 0;
  return {
    type: "custom",
    check: hasMembers ? createOrCheck(conditions, checkCondition) : () => false,
    description: formatCompositeDescription("or", conditions)
  };
}
334
/**
 * Wraps a condition in a custom condition that terminates exactly when
 * the wrapped condition does not.
 *
 * @param {{ type: string }} condition - Condition to negate.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function not(condition) {
  const check = createNotCheck(condition, checkCondition);
  const description = `not(${condition.type})`;
  return { type: "custom", check, description };
}
341
/**
 * Creates a custom condition that terminates once the current turn
 * number has reached `count`.
 *
 * @param {number} count - Turn number from which the condition is met.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function afterTurns(count) {
  const check = ({ currentTurn }) => currentTurn >= count;
  return {
    type: "custom",
    check,
    description: `afterTurns(${count})`
  };
}
348
/**
 * Creates a custom condition that terminates when the field at
 * `fieldPath` of the last output strictly equals `expectedValue`.
 * Evaluation is delegated to the built-in "fieldValue" condition.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @param {unknown} expectedValue - Value compared with `===`.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function fieldEquals(fieldPath, expectedValue) {
  const delegate = { type: "fieldValue", fieldPath, expectedValue };
  return {
    type: "custom",
    check: async (context) => {
      // Fresh object per call so callers never share one condition instance.
      const { terminated } = await checkCondition({ ...delegate }, context);
      return terminated;
    },
    description: `fieldEquals(${fieldPath}, ${JSON.stringify(expectedValue)})`
  };
}
361
/**
 * Creates a custom condition that terminates when the field at
 * `fieldPath` of the last output is neither null nor undefined.
 * Evaluation is delegated to the built-in "fieldSet" condition.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function fieldIsSet(fieldPath) {
  const check = async (context) => {
    const { terminated } = await checkCondition({ type: "fieldSet", fieldPath }, context);
    return terminated;
  };
  return {
    type: "custom",
    check,
    description: `fieldIsSet(${fieldPath})`
  };
}
371
+
372
+ // src/multi-turn/runner.ts
373
+ import { resolveFileSourcesInInput } from "@agtlantis/core";
374
// Turn limit applied when a test case does not set `multiTurn.maxTurns`.
var DEFAULT_MAX_TURNS = 10;
// Outcome applied when a termination condition matched and the test case
// does not set `multiTurn.onConditionMet`.
var DEFAULT_ON_CONDITION_MET = "pass";
// Outcome applied when the turn limit was reached and the test case does
// not set `multiTurn.onMaxTurnsReached`.
var DEFAULT_ON_MAX_TURNS_REACHED = "fail";
377
/**
 * Sums token usage records field by field.
 *
 * @param {{ inputTokens: number, outputTokens: number, totalTokens: number }[]} usages
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 *   A new record holding the totals (all zeros for an empty list).
 */
function aggregateTokenUsage(usages) {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  for (const usage of usages) {
    inputTokens += usage.inputTokens;
    outputTokens += usage.outputTokens;
    totalTokens += usage.totalTokens;
  }
  return { inputTokens, outputTokens, totalTokens };
}
387
/**
 * Derives the turn limit for a run: the smaller of the first top-level
 * "maxTurns" condition's count and the safety limit, or the safety limit
 * alone when no such condition exists.
 *
 * @param {{ type: string, count?: number }[]} conditions - Termination conditions.
 * @param {number} safetyLimit - Upper bound that is never exceeded.
 * @returns {number} The effective maximum number of turns.
 */
function getEffectiveMaxTurns(conditions, safetyLimit) {
  const turnCap = conditions.find(({ type }) => type === "maxTurns");
  return turnCap ? Math.min(turnCap.count, safetyLimit) : safetyLimit;
}
394
/**
 * Produces the concrete input for a follow-up turn.
 *
 * @param {{ input: unknown }} followUpInput - Follow-up definition whose
 *   `input` is either a static value or a (possibly async) function of
 *   the conversation context.
 * @param {object} context - Conversation context passed to a function input.
 * @returns {Promise<unknown>} The static value, or the function's
 *   (awaited) result.
 */
async function resolveInput(followUpInput, context) {
  const { input } = followUpInput;
  if (typeof input !== "function") {
    return input;
  }
  // Returning from an async function adopts a promise result automatically.
  return input(context);
}
402
/**
 * Assembles the conversation context handed to conditions and dynamic
 * inputs.
 *
 * @param {number} currentTurn - Turn number the context describes.
 * @param {{ output: unknown }[]} history - Completed turns so far.
 * @returns {{ currentTurn: number, history: object[], lastOutput: unknown }}
 *   `lastOutput` is the output of the final history entry, or `undefined`
 *   when the history is empty.
 */
function buildContext(currentTurn, history) {
  let lastOutput = void 0;
  if (history.length > 0) {
    lastOutput = history[history.length - 1].output;
  }
  return { currentTurn, history, lastOutput };
}
409
/**
 * Finds the follow-up definition responsible for a given follow-up index,
 * honoring each definition's `turns` repeat count (default 1).
 *
 * @param {{ turns?: number }[]} followUpInputs - Ordered follow-up definitions.
 * @param {number} followUpIndex - Zero-based index of the follow-up turn.
 * @returns {object|null} The matching definition, or `null` when the
 *   index lies beyond all definitions.
 */
function getFollowUpInput(followUpInputs, followUpIndex) {
  let offset = 0;
  for (const entry of followUpInputs) {
    const span = entry.turns ?? 1;
    // A non-finite span (e.g. Infinity) claims every index from here on.
    const unbounded = !Number.isFinite(span);
    if ((unbounded && followUpIndex >= offset) || followUpIndex < offset + span) {
      return entry;
    }
    offset += span;
  }
  return null;
}
423
/**
 * Validates the `turns` setting of every follow-up definition.
 *
 * Definitions without `turns` are accepted as-is. Otherwise `turns` must
 * be a number >= 1 (Infinity allowed), and an infinite `turns` is only
 * allowed on the last definition because later ones could never run.
 *
 * NaN is rejected explicitly: `NaN < 1` is false, so a plain range check
 * would let it through and it would then behave like an unbounded repeat.
 *
 * @param {{ turns?: number, description?: string }[]} followUpInputs
 * @returns {void}
 * @throws {EvalError} With code INVALID_CONFIG when a definition is invalid.
 */
function validateFollowUpInputs(followUpInputs) {
  const lastIndex = followUpInputs.length - 1;
  for (let i = 0; i < followUpInputs.length; i++) {
    const followUp = followUpInputs[i];
    const { turns } = followUp;
    if (turns === void 0) {
      continue;
    }
    if (typeof turns !== "number" || Number.isNaN(turns) || turns < 1) {
      throw new EvalError("turns must be a positive number or Infinity", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: {
          description: followUp.description,
          turns
        }
      });
    }
    // Only Infinity can reach this branch now that NaN is rejected above.
    if (!Number.isFinite(turns) && i < lastIndex) {
      throw new EvalError(
        "turns: Infinity must be the last followUpInput (subsequent items would be unreachable)",
        {
          code: "INVALID_CONFIG" /* INVALID_CONFIG */,
          context: {
            description: followUp.description,
            position: i,
            totalItems: followUpInputs.length
          }
        }
      );
    }
  }
}
453
/**
 * Determines the agent input for a given turn.
 *
 * Turn 1 uses the test case's own input. Later turns map to follow-up
 * definitions, with turn 2 being follow-up index 0.
 *
 * @param {number} turn - One-based turn number.
 * @param {unknown} testCaseInput - Input used for the first turn.
 * @param {object[]} followUpInputs - Ordered follow-up definitions.
 * @param {object[]} conversationHistory - Turns completed so far.
 * @returns {Promise<{ type: "success", input: unknown } | { type: "exhausted" }>}
 *   The resolved input, or "exhausted" when no follow-up covers the turn.
 */
async function getTurnInput(turn, testCaseInput, followUpInputs, conversationHistory) {
  if (turn === 1) {
    return { type: "success", input: testCaseInput };
  }
  const followUp = getFollowUpInput(followUpInputs, turn - 2);
  if (!followUp) {
    return { type: "exhausted" };
  }
  const input = await resolveInput(followUp, buildContext(turn, conversationHistory));
  return { type: "success", input };
}
466
/**
 * Reports whether a single-turn result is the file-resolution failure
 * marker rather than a regular turn outcome.
 *
 * @param {object} result - Value returned by the single-turn executor.
 * @returns {boolean} True when `result.type` is "fileResolutionError".
 */
function isFileResolutionError(result) {
  if (!("type" in result)) {
    return false;
  }
  return result.type === "fileResolutionError";
}
469
/**
 * Runs one conversation turn: resolves file sources in the input, then
 * executes the agent and measures how long the execution took.
 *
 * @param {unknown} input - Turn input, possibly containing file sources.
 * @param {{ execute: Function, config: { name: string } }} agent - Agent under test.
 * @param {string} testCaseId - Identifier recorded in error context.
 * @param {number} turn - One-based turn number, used in messages.
 * @returns {Promise<object>} Either `{ type: "fileResolutionError", reason }`
 *   when file resolution fails, or `{ output, metadata, latencyMs, error }`
 *   where `error` is an EvalError if the agent threw and undefined otherwise.
 */
async function executeSingleTurn(input, agent, testCaseId, turn) {
  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput(input, { basePath: process.cwd() });
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    return {
      type: "fileResolutionError",
      reason: `FileSource resolution failed on turn ${turn}: ${detail}`
    };
  }

  const outcome = { output: void 0, metadata: void 0, latencyMs: 0, error: void 0 };
  const startedAt = performance.now();
  try {
    const agentResult = await agent.execute(resolvedInput);
    outcome.output = agentResult.result;
    outcome.metadata = agentResult.metadata;
  } catch (cause) {
    // Agent failures are reported in the result instead of being thrown.
    outcome.error = EvalError.from(cause, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, {
      testCaseId,
      turn,
      agentName: agent.config.name
    });
  }
  outcome.latencyMs = performance.now() - startedAt;
  return outcome;
}
499
/**
 * Maps how a conversation ended onto a pass/fail contribution.
 *
 * @param {{ terminated: boolean, terminationType?: string }} termination
 * @param {string} onConditionMet - "pass" makes condition-based endings pass.
 * @param {string} onMaxTurnsReached - "pass" makes turn-limit endings pass.
 * @returns {boolean} False for "error" and "exhausted" endings; for
 *   "maxTurns"/"condition" endings whether the matching policy is "pass";
 *   true when not terminated or the type is unrecognized.
 */
function determinePassFromTermination(termination, onConditionMet, onMaxTurnsReached) {
  if (!isTerminated(termination)) {
    return true;
  }
  const kind = termination.terminationType;
  if (kind === "error" || kind === "exhausted") {
    return false;
  }
  if (kind === "maxTurns") {
    return onMaxTurnsReached === "pass";
  }
  if (kind === "condition") {
    return onConditionMet === "pass";
  }
  return true;
}
515
/**
 * Runs a multi-turn test case: executes the agent turn by turn until a
 * termination condition matches, the follow-up inputs run out, an error
 * occurs or the turn limit is reached, then has the judge evaluate the
 * final output.
 *
 * @param {object} testCase2 - Test case with `input`, `multiTurn` and
 *   optionally `id` and `files`.
 * @param {{ agent: object, judge: object, agentDescription: unknown }} context
 * @param {{ signal?: AbortSignal }} [options] - Optional abort signal,
 *   checked at the start of every turn.
 * @returns {Promise<object>} Result with the final output, aggregated
 *   metrics, judge verdicts, conversation history and termination info.
 * @throws {EvalError} AGENT_EXECUTION_ERROR (context.reason "aborted")
 *   when the signal is aborted before a turn starts; INVALID_CONFIG when
 *   the follow-up inputs fail validation.
 */
async function executeMultiTurnTestCase(testCase2, context, options) {
  const { agent, judge, agentDescription } = context;
  const { multiTurn } = testCase2;
  const signal = options?.signal;
  // The turn limit never exceeds multiTurn.maxTurns (or the default).
  const maxTurns = getEffectiveMaxTurns(
    multiTurn.terminateWhen,
    multiTurn.maxTurns ?? DEFAULT_MAX_TURNS
  );
  const onConditionMet = multiTurn.onConditionMet ?? DEFAULT_ON_CONDITION_MET;
  const onMaxTurnsReached = multiTurn.onMaxTurnsReached ?? DEFAULT_ON_MAX_TURNS_REACHED;
  const followUpInputs = multiTurn.followUpInputs ?? [];
  validateFollowUpInputs(followUpInputs);
  const conversationHistory = [];
  const tokenUsages = [];
  let totalLatencyMs = 0;
  // Stays in this state if the loop body never runs (maxTurns < 1).
  let termination = {
    terminated: false,
    reason: "Execution not started"
  };
  for (let turn = 1; turn <= maxTurns; turn++) {
    if (signal?.aborted) {
      throw new EvalError("Multi-turn test execution aborted", {
        code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
        context: { testCaseId: testCase2.id, turn, reason: "aborted" }
      });
    }
    const inputResult = await getTurnInput(
      turn,
      testCase2.input,
      followUpInputs,
      conversationHistory
    );
    if (inputResult.type === "exhausted") {
      termination = {
        terminated: true,
        terminationType: "exhausted",
        reason: "All follow-up inputs exhausted"
      };
      break;
    }
    const input = inputResult.input;
    const turnResult = await executeSingleTurn(input, agent, testCase2.id ?? "unknown", turn);
    // A file-resolution failure ends the run without recording the turn.
    if (isFileResolutionError(turnResult)) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: turnResult.reason
      };
      break;
    }
    const {
      output: agentOutput,
      metadata: agentMetadata,
      latencyMs,
      error: agentError
    } = turnResult;
    totalLatencyMs += latencyMs;
    const turnUsage = agentMetadata?.tokenUsage ?? {
      inputTokens: 0,
      outputTokens: 0,
      totalTokens: 0
    };
    tokenUsages.push(turnUsage);
    // The turn is recorded before the error check, so a failed agent turn
    // still appears in the history (with an undefined output).
    conversationHistory.push({
      turn,
      input,
      output: agentOutput,
      metadata: agentMetadata
    });
    if (agentError) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: `Agent execution failed on turn ${turn}: ${agentError.message}`
      };
      break;
    }
    const ctx = buildContext(turn, conversationHistory);
    termination = await checkTermination(multiTurn.terminateWhen, ctx);
    if (termination.terminated) {
      break;
    }
    // Explicit conditions are checked first; the turn limit is the fallback.
    if (turn >= maxTurns) {
      termination = {
        terminated: true,
        terminationType: "maxTurns",
        matchedCondition: { type: "maxTurns", count: maxTurns },
        reason: `Maximum turns reached (${maxTurns})`
      };
      break;
    }
  }
  const aggregatedTokenUsage = aggregateTokenUsage(tokenUsages);
  const metrics = {
    latencyMs: totalLatencyMs,
    tokenUsage: aggregatedTokenUsage
  };
  const lastTurn = conversationHistory[conversationHistory.length - 1];
  const finalOutput = lastTurn?.output;
  // NOTE(review): the judge runs for every ending, including "error" and
  // "exhausted", where `passed` is false regardless of its verdict; the
  // abort signal is also not re-checked here. Confirm both are intended.
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output: finalOutput,
    agentDescription,
    files: testCase2.files
  });
  const passedTermination = determinePassFromTermination(
    termination,
    onConditionMet,
    onMaxTurnsReached
  );
  // Both the termination policy and the judge must agree for a pass.
  const passed = passedTermination && judgeResult.passed;
  return {
    testCase: testCase2,
    output: finalOutput,
    metrics,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed,
    judgeMetadata: judgeResult.metadata,
    conversationHistory,
    termination,
    totalTurns: conversationHistory.length
  };
}
639
+
640
+ // src/multi-turn/ai-user.ts
641
// System prompt used by aiUser() when the caller supplies no `systemPrompt`.
// The text is sent to the model verbatim, so edits here change behavior.
var DEFAULT_SYSTEM_PROMPT = `You are simulating a realistic user in a conversation with an AI assistant.

## Your Role
Generate natural, context-appropriate user messages based on the conversation history.

## Guidelines

1. **Stay in Character**: Respond as a real user would - with natural language, occasional typos, or casual phrasing when appropriate.

2. **Be Goal-Oriented**: Users have objectives. Pursue them logically based on the conversation context:
- If the assistant asks a question, provide a reasonable answer
- If clarification is needed, ask for it naturally
- If a task is progressing, guide it toward completion

3. **React Appropriately**: Respond to what the assistant says:
- Acknowledge when the assistant is helpful
- Express confusion if the response is unclear
- Correct misunderstandings if they occur

4. **Keep It Realistic**: Real users:
- Don't always provide perfect information upfront
- May change their mind or add requirements
- Sometimes need time to think or decide

## Output Format
Respond with ONLY the user's message. No additional formatting, explanation, or meta-commentary.`;
667
/**
 * Creates a dynamic follow-up input generator that lets an LLM play the
 * user: given the conversation so far it produces the next user message
 * and converts it into agent input via `buildInput`.
 *
 * @param {object} options
 * @param {object} options.provider - Exposes `simpleExecution(fn)` whose
 *   return value has an async `result()`.
 * @param {string|Function} [options.systemPrompt] - Fixed prompt, or a
 *   function of the context; the built-in default is used when omitted.
 * @param {Function} [options.formatHistory] - Custom history renderer.
 * @param {Function} options.buildInput - `(responseText, context) => input`.
 * @returns {(context: object) => Promise<unknown>} Input generator. It
 *   throws the provider error when execution fails and a generic Error
 *   when it is canceled.
 */
function aiUser(options) {
  const { provider, systemPrompt, formatHistory, buildInput } = options;
  const renderTurn = (h, i) => `[Turn ${i + 1}]
User: ${JSON.stringify(h.input)}
Assistant: ${JSON.stringify(h.output)}`;
  const defaultFormatHistory = (ctx) => ctx.history.map(renderTurn).join("\n\n");
  const renderHistory = formatHistory ?? defaultFormatHistory;

  return async (context) => {
    const historyText = renderHistory(context);
    let resolvedSystemPrompt;
    if (typeof systemPrompt === "function") {
      resolvedSystemPrompt = systemPrompt(context);
    } else {
      resolvedSystemPrompt = systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
    }
    let userPrompt;
    if (historyText) {
      userPrompt = `## Conversation History
${historyText}

## Your Task
Generate the next user message based on the conversation above:`;
    } else {
      userPrompt = `## Your Task
This is the start of a new conversation. Generate an appropriate opening message from the user:`;
    }
    const execution = provider.simpleExecution(async (session) => {
      const generated = await session.generateText({
        messages: [
          { role: "system", content: resolvedSystemPrompt },
          { role: "user", content: userPrompt }
        ]
      });
      return generated.text;
    });
    const outcome = await execution.result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    return buildInput(outcome.value, context);
  };
}
700
+
701
+ // src/utils/semaphore.ts
702
/**
 * Creates a counting semaphore that lets at most `limit` holders run at
 * the same time. Waiters are served in the order they asked.
 *
 * @param {number} limit - Maximum number of concurrent holders.
 * @returns {{ acquire: () => Promise<void>, release: () => void }}
 *   `acquire` resolves once a slot is available; `release` gives a slot
 *   back, handing it straight to the oldest waiter if there is one.
 */
function createSemaphore(limit) {
  let active = 0;
  const queue = [];

  const acquire = async () => {
    if (active < limit) {
      active += 1;
      return;
    }
    return new Promise((wake) => {
      queue.push(wake);
    });
  };

  const release = () => {
    active -= 1;
    const wake = queue.shift();
    if (!wake) {
      return;
    }
    // The freed slot is transferred to the waiter before it resumes.
    active += 1;
    wake();
  };

  return { acquire, release };
}
725
+
726
+ // src/core/constants.ts
727
// Shared scoring constants for evaluation results.
var SCORE = {
  /** Minimum possible score */
  MIN: 0,
  /** Maximum possible score */
  MAX: 100,
  /** Default threshold for passing evaluation */
  DEFAULT_PASS_THRESHOLD: 70,
  /** Threshold for majority-based pass determination (50%) */
  MAJORITY_PASS_THRESHOLD: 0.5
};
// Token usage record with every counter at zero; used as the fallback
// when no usage was reported. The object is shared, so treat it as
// read-only.
var ZERO_TOKEN_USAGE = {
  inputTokens: 0,
  outputTokens: 0,
  totalTokens: 0
};
742
+
743
+ // src/core/runner.ts
744
/**
 * Runs a single-turn test case: resolves file sources in the input,
 * executes the agent, and has the judge evaluate the output.
 *
 * File-resolution and agent failures do not throw; they produce a failed
 * result (score 0, no verdicts) that carries the error, and the judge is
 * skipped.
 *
 * @param {object} testCase2 - Test case with `input` and optionally `id`/`files`.
 * @param {{ agent: object, judge: object, agentDescription: unknown }} context
 * @param {AbortSignal} [signal] - Checked before the run and again before judging.
 * @returns {Promise<object>} A result of kind "single-turn".
 * @throws {EvalError} AGENT_EXECUTION_ERROR (context.reason "aborted")
 *   when the signal is aborted at either checkpoint.
 */
async function executeTestCase(testCase2, context, signal) {
  const { agent, judge, agentDescription } = context;
  const abortedError = (message) => new EvalError(message, {
    code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
    context: { testCaseId: testCase2.id, reason: "aborted" }
  });

  if (signal?.aborted) {
    throw abortedError("Test execution aborted");
  }

  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput2(testCase2.input, {
      basePath: process.cwd()
    });
  } catch (cause) {
    const fileError = EvalError.from(cause, "FILE_READ_ERROR" /* FILE_READ_ERROR */, {
      testCaseId: testCase2.id,
      agentName: agent.config.name
    });
    return createFailedResult(testCase2, fileError);
  }

  let output;
  let tokenUsage = ZERO_TOKEN_USAGE;
  let error;
  const startedAt = performance.now();
  try {
    const agentResult = await agent.execute(resolvedInput);
    output = agentResult.result;
    const reportedUsage = agentResult.metadata?.tokenUsage;
    if (reportedUsage) {
      tokenUsage = reportedUsage;
    }
  } catch (cause) {
    error = EvalError.from(cause, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, {
      testCaseId: testCase2.id,
      agentName: agent.config.name
    });
    output = void 0;
  }
  const latencyMs = performance.now() - startedAt;

  const base = {
    kind: "single-turn",
    testCase: testCase2,
    output,
    metrics: { latencyMs, tokenUsage },
    error
  };

  if (error) {
    return {
      ...base,
      verdicts: [],
      overallScore: 0,
      passed: false,
      judgeMetadata: void 0
    };
  }

  // Second checkpoint: avoid a judge call when the run was aborted meanwhile.
  if (signal?.aborted) {
    throw abortedError("Test execution aborted before evaluation");
  }

  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output,
    agentDescription,
    files: testCase2.files
  });
  return {
    ...base,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: judgeResult.passed,
    judgeMetadata: judgeResult.metadata
  };
}
815
/**
 * Builds the single-turn result for a test case that failed before the
 * agent produced any output.
 *
 * @param {object} testCase2 - The test case that failed.
 * @param {Error} error - The failure to attach to the result.
 * @returns {object} A "single-turn" result with no output, zeroed
 *   metrics, no verdicts, score 0 and `passed: false`.
 */
function createFailedResult(testCase2, error) {
  const metrics = { latencyMs: 0, tokenUsage: ZERO_TOKEN_USAGE };
  return {
    kind: "single-turn",
    testCase: testCase2,
    output: void 0,
    metrics,
    error,
    verdicts: [],
    overallScore: 0,
    passed: false,
    judgeMetadata: void 0
  };
}
828
/**
 * Converts the raw multi-turn executor result into the public
 * "multi-turn" result shape, surfacing the termination reason as a
 * top-level field.
 *
 * @param {object} result - Raw multi-turn execution result.
 * @returns {object} Result of kind "multi-turn".
 */
function toMultiTurnResult(result) {
  const {
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    termination
  } = result;
  return {
    kind: "multi-turn",
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    terminationReason: termination.reason,
    termination
  };
}
844
/**
 * Runs test cases with bounded concurrency.
 *
 * @param {object[]} testCases2 - Test cases to execute.
 * @param {object} context - Agent/judge context handed to each execution.
 * @param {object} [options]
 * @param {number} [options.concurrency=1] - Maximum parallel executions;
 *   must be a number >= 1.
 * @param {boolean} [options.stopOnFirstFailure=false] - Stop scheduling
 *   and abort in-flight work once a test case does not pass.
 * @param {AbortSignal} [options.signal] - External cancellation signal.
 * @returns {Promise<object[]>} Results of the executions that completed,
 *   in test-case order; cases skipped after a stop are omitted.
 * @throws {EvalError} INVALID_CONFIG when `concurrency` is not >= 1.
 * @throws {Error} The first non-abort error thrown by an execution.
 */
async function runWithConcurrency(testCases2, context, options = {}) {
  const { concurrency = 1, stopOnFirstFailure = false, signal } = options;
  // Written as a negated ">=" so that NaN is rejected too: `NaN < 1` is
  // false, and a NaN limit would make the semaphore never grant a slot,
  // leaving the whole run hanging forever.
  if (!(concurrency >= 1)) {
    throw new EvalError("Concurrency must be at least 1", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { concurrency }
    });
  }
  if (testCases2.length === 0) {
    return [];
  }
  const semaphore = createSemaphore(concurrency);
  const results = [];
  let shouldStop = false;
  let firstError;
  // Internal controller so that both external aborts and internal stops
  // reach in-flight executions through one signal.
  const internalAbort = new AbortController();
  const propagateExternalAbort = () => {
    shouldStop = true;
    internalAbort.abort();
  };
  signal?.addEventListener("abort", propagateExternalAbort);
  // An already-aborted signal fires no event, so handle it explicitly.
  if (signal?.aborted) {
    shouldStop = true;
  }
  try {
    const executeOne = async (testCase2, index) => {
      if (shouldStop) return;
      await semaphore.acquire();
      try {
        // Re-check: a stop may have been requested while waiting for a slot.
        if (shouldStop) return;
        const result = await executeTestCaseByType(testCase2, context, internalAbort.signal);
        results[index] = result;
        if (stopOnFirstFailure && !result.passed) {
          shouldStop = true;
          internalAbort.abort();
        }
      } catch (e) {
        // Abort errors are a consequence of stopping, not a failure to report.
        if (!firstError && !isAbortError(e)) {
          firstError = e instanceof Error ? e : new Error(String(e));
        }
        shouldStop = true;
        internalAbort.abort();
      } finally {
        semaphore.release();
      }
    };
    const promises = testCases2.map((tc, i) => executeOne(tc, i));
    await Promise.allSettled(promises);
    if (firstError) {
      throw firstError;
    }
    // Skipped cases leave holes in `results`; drop them.
    return results.filter((r) => r !== void 0);
  } finally {
    signal?.removeEventListener("abort", propagateExternalAbort);
  }
}
900
/**
 * Reports whether a thrown value represents a cancellation rather than a
 * genuine failure.
 *
 * @param {unknown} e - Thrown value to classify.
 * @returns {boolean} True for a DOMException named "AbortError" or an
 *   EvalError whose `context.reason` is "aborted".
 */
function isAbortError(e) {
  if (e instanceof DOMException && e.name === "AbortError") {
    return true;
  }
  return e instanceof EvalError && e.context?.reason === "aborted";
}
903
/**
 * Executes a test case with the runner matching its shape: the
 * multi-turn runner when it has a `multiTurn` property, otherwise the
 * single-turn runner.
 *
 * @param {object} testCase2 - Test case to execute.
 * @param {object} context - Agent/judge context.
 * @param {AbortSignal} [signal] - Cancellation signal passed to the runner.
 * @returns {Promise<object>} A "multi-turn" or "single-turn" result.
 */
async function executeTestCaseByType(testCase2, context, signal) {
  if (!isMultiTurnTestCase(testCase2)) {
    return executeTestCase(testCase2, context, signal);
  }
  const rawResult = await executeMultiTurnTestCase(testCase2, context, { signal });
  return toMultiTurnResult(rawResult);
}
910
+
911
+ // src/core/types.ts
912
/**
 * Adapts an agent to the minimal shape the evaluator needs, keeping only
 * the name/description config, the prompt, and an `execute` that returns
 * just `result` and `metadata`.
 *
 * @param {{ config: { name: string, description?: string }, prompt: unknown, execute: Function }} agent
 * @returns {{ config: { name: string, description?: string }, prompt: unknown, execute: Function }}
 */
function toEvalAgent(agent) {
  const { name, description } = agent.config;
  return {
    config: { name, description },
    prompt: agent.prompt,
    async execute(input, options) {
      // Called as a method so the wrapped agent keeps its own `this`.
      const raw = await agent.execute(input, options);
      return { result: raw.result, metadata: raw.metadata };
    }
  };
}
928
/**
 * Reports whether a result came from a single-turn test case, iterated
 * or not.
 *
 * @param {{ kind: string }} result - Evaluation result.
 * @returns {boolean} True for kinds "single-turn" and "single-turn-iterated".
 */
function isSingleTurnResult(result) {
  const { kind } = result;
  return ["single-turn", "single-turn-iterated"].includes(kind);
}
931
/**
 * Reports whether a result came from a multi-turn test case, iterated or
 * not.
 *
 * @param {{ kind: string }} result - Evaluation result.
 * @returns {boolean} True for kinds "multi-turn" and "multi-turn-iterated".
 */
function isMultiTurnResult(result) {
  const { kind } = result;
  return ["multi-turn", "multi-turn-iterated"].includes(kind);
}
934
/**
 * Reports whether a result aggregates several iterations of a test case.
 *
 * @param {{ kind: string }} result - Evaluation result.
 * @returns {boolean} True for kinds "single-turn-iterated" and
 *   "multi-turn-iterated".
 */
function isIteratedResult(result) {
  const { kind } = result;
  return ["single-turn-iterated", "multi-turn-iterated"].includes(kind);
}
937
+
938
+ // src/core/iteration.ts
939
/**
 * Summarizes the scores and pass outcomes of repeated runs of one test
 * case.
 *
 * @param {{ overallScore: number, passed: boolean }[]} results - One entry per iteration.
 * @returns {{ iterations: number, scores: number[], mean: number, stdDev: number, min: number, max: number, passRate: number, passCount: number }}
 *   Statistics over the iterations; `stdDev` is the population standard
 *   deviation (divides by N). All fields are zero/empty for no results.
 */
function calculateIterationStats(results) {
  const iterations = results.length;
  if (iterations === 0) {
    return {
      iterations: 0,
      scores: [],
      mean: 0,
      stdDev: 0,
      min: 0,
      max: 0,
      passRate: 0,
      passCount: 0
    };
  }
  const scores = results.map(({ overallScore }) => overallScore);
  const passCount = results.filter(({ passed }) => passed).length;

  let total = 0;
  for (const score of scores) {
    total += score;
  }
  const mean = total / scores.length;

  let squaredDeviations = 0;
  for (const score of scores) {
    squaredDeviations += Math.pow(score - mean, 2);
  }
  const stdDev = Math.sqrt(squaredDeviations / scores.length);

  return {
    iterations,
    scores,
    mean,
    stdDev,
    min: Math.min(...scores),
    max: Math.max(...scores),
    passRate: passCount / iterations,
    passCount
  };
}
968
/**
 * Extends the base iteration statistics with turn-count figures and a tally
 * of termination types.
 *
 * Results whose `termination.terminationType` is falsy are left out of the
 * tally. Turn figures are 0 when there are no results.
 *
 * @param {Array<object>} results - Multi-turn results with `totalTurns` and `termination`.
 * @returns {object} Base stats plus `avgTurns`, `minTurns`, `maxTurns`, `terminationCounts`.
 */
function calculateMultiTurnIterationStats(results) {
  const baseStats = calculateIterationStats(results);
  const turnCounts = results.map((run) => run.totalTurns);
  const hasTurns = turnCounts.length > 0;
  const terminationCounts = results.reduce((tally, run) => {
    const kind = run.termination.terminationType;
    if (kind) {
      tally[kind] = (tally[kind] || 0) + 1;
    }
    return tally;
  }, {});
  let turnTotal = 0;
  for (const count of turnCounts) {
    turnTotal += count;
  }
  return {
    ...baseStats,
    avgTurns: hasTurns ? turnTotal / turnCounts.length : 0,
    minTurns: hasTurns ? Math.min(...turnCounts) : 0,
    maxTurns: hasTurns ? Math.max(...turnCounts) : 0,
    terminationCounts
  };
}
986
/**
 * Picks the result whose `overallScore` lies closest to `mean`.
 *
 * On a tie the earliest result wins, because a later one only replaces the
 * current pick when it is strictly closer.
 *
 * @param {Array<{ overallScore: number }>} results - Candidates; must not be empty.
 * @param {number} mean - Target score.
 * @returns {object} The closest result.
 * @throws {EvalError} When `results` is empty.
 */
function selectRepresentativeResult(results, mean) {
  if (results.length === 0) {
    throw new EvalError("Cannot select representative result from empty array", {
      code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */
    });
  }
  let closest = results[0];
  for (let i = 1; i < results.length; i++) {
    const candidate = results[i];
    const closestDiff = Math.abs(closest.overallScore - mean);
    const candidateDiff = Math.abs(candidate.overallScore - mean);
    if (candidateDiff < closestDiff) {
      closest = candidate;
    }
  }
  return closest;
}
998
/**
 * Collapses per-iteration result lists into one aggregated result per test case.
 *
 * For test index `i`, the i-th result of every iteration is gathered, its
 * statistics are computed, and the run closest to the mean supplies the
 * representative fields. The aggregated score is the mean, and `passed` is
 * decided by comparing the pass rate with `SCORE.MAJORITY_PASS_THRESHOLD`.
 * If any gathered run is a multi-turn result, the aggregate is emitted as
 * "multi-turn-iterated" with the extra conversation fields.
 *
 * @param {Array<Array<object>>} allIterationResults - One result list per iteration;
 *   the first list's length determines how many test cases are aggregated.
 * @returns {Array<object>} Aggregated results, one per test case.
 */
function aggregateIterationResults(allIterationResults) {
  if (allIterationResults.length === 0) {
    return [];
  }
  const testCount = allIterationResults[0].length;
  const aggregated = [];
  for (let caseIndex = 0; caseIndex < testCount; caseIndex++) {
    const runsForCase = allIterationResults.map((iteration) => iteration[caseIndex]);
    const stats = calculateIterationStats(runsForCase);
    const representative = selectRepresentativeResult(runsForCase, stats.mean);
    // Fields common to both aggregate kinds, in the order they appear after `kind`.
    const sharedFields = {
      testCase: representative.testCase,
      output: representative.output,
      metrics: representative.metrics,
      verdicts: representative.verdicts,
      error: representative.error,
      overallScore: stats.mean,
      passed: stats.passRate >= SCORE.MAJORITY_PASS_THRESHOLD,
      iterationStats: stats,
      iterationResults: runsForCase
    };
    const multiTurnRuns = runsForCase.filter((run) => isMultiTurnResult(run));
    if (multiTurnRuns.length === 0) {
      aggregated.push({ kind: "single-turn-iterated", ...sharedFields });
      continue;
    }
    aggregated.push({
      kind: "multi-turn-iterated",
      ...sharedFields,
      conversationHistory: representative.conversationHistory,
      totalTurns: representative.totalTurns,
      terminationReason: representative.terminationReason,
      termination: representative.termination,
      multiTurnIterationStats: calculateMultiTurnIterationStats(multiTurnRuns)
    });
  }
  return aggregated;
}
1051
/**
 * Keeps only the results whose `kind` marks them as iterated aggregates.
 *
 * @param {Array<{ kind: string }>} results - Results to filter.
 * @returns {Array<object>} New array holding the "single-turn-iterated" and
 *   "multi-turn-iterated" entries in their original order.
 */
function filterIteratedResults(results) {
  const iterated = [];
  for (const candidate of results) {
    if (candidate.kind === "single-turn-iterated" || candidate.kind === "multi-turn-iterated") {
      iterated.push(candidate);
    }
  }
  return iterated;
}
1056
/**
 * Averages one figure from `iterationStats` across the iterated results.
 *
 * @param {Array<object>} results - Mixed results; only iterated ones contribute.
 * @param {(stats: object) => number} selector - Picks the figure to average.
 * @returns {number | undefined} The average, or `undefined` when no iterated
 *   result is present.
 */
function averageIterationStat(results, selector) {
  const iterated = filterIteratedResults(results);
  const count = iterated.length;
  if (count === 0) {
    return undefined;
  }
  let total = 0;
  for (const entry of iterated) {
    total += selector(entry.iterationStats);
  }
  return total / count;
}
1064
/**
 * Averages `iterationStats.stdDev` over the iterated results.
 *
 * @param {Array<object>} results - Results to inspect.
 * @returns {number | undefined} Whatever `averageIterationStat` yields for `stdDev`.
 */
function calculateAvgStdDev(results) {
  const pickStdDev = ({ stdDev }) => stdDev;
  return averageIterationStat(results, pickStdDev);
}
1067
/**
 * Averages `iterationStats.passRate` over the iterated results.
 *
 * @param {Array<object>} results - Results to inspect.
 * @returns {number | undefined} Whatever `averageIterationStat` yields for `passRate`.
 */
function calculateAvgPassRate(results) {
  const pickPassRate = ({ passRate }) => passRate;
  return averageIterationStat(results, pickPassRate);
}
1070
+
1071
+ // src/core/suite.ts
1072
/**
 * Aggregates latency and token usage across results.
 *
 * @param {Array<object>} results - Results carrying `metrics.latencyMs` and
 *   `metrics.tokenUsage.totalTokens`.
 * @returns {{ avgLatencyMs: number, totalTokens: number }} Mean latency and the
 *   token sum; both 0 for an empty input.
 */
function calculateAggregatedMetrics(results) {
  const count = results.length;
  if (count === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  const latencySum = sumBy(results, ({ metrics }) => metrics.latencyMs);
  const totalTokens = sumBy(results, ({ metrics }) => metrics.tokenUsage.totalTokens);
  return { avgLatencyMs: latencySum / count, totalTokens };
}
1083
/**
 * Adds up the numbers a selector extracts from each item.
 *
 * @param {Array<unknown>} items - Items to walk, in order.
 * @param {(item: unknown) => number} selector - Extracts the number to add.
 * @returns {number} The running total, starting from 0.
 */
function sumBy(items, selector) {
  let total = 0;
  for (const item of items) {
    total += selector(item);
  }
  return total;
}
1086
/**
 * Builds the report summary for a list of results.
 *
 * The iteration-specific fields (`iterations`, `avgStdDev`, `avgPassRate`)
 * are only attached when `iterations` is greater than 1.
 *
 * @param {Array<object>} results - Evaluated results.
 * @param {number} [iterations] - Number of iterations that were run.
 * @returns {object} Summary with counts, average score and aggregated metrics.
 */
function calculateSummary(results, iterations) {
  const totalTests = results.length;
  const metrics = calculateAggregatedMetrics(results);
  let passed = 0;
  for (const entry of results) {
    if (entry.passed) {
      passed += 1;
    }
  }
  const avgScore = totalTests > 0 ? sumBy(results, (entry) => entry.overallScore) / totalTests : 0;
  const summary = {
    totalTests,
    passed,
    failed: totalTests - passed,
    avgScore,
    metrics
  };
  if (iterations && iterations > 1) {
    summary.iterations = iterations;
    summary.avgStdDev = calculateAvgStdDev(results);
    summary.avgPassRate = calculateAvgPassRate(results);
  }
  return summary;
}
1106
/**
 * Creates an evaluation suite bound to an agent, a judge and an optional improver.
 *
 * The description given to the judge is `agentDescription`, falling back to
 * the agent's configured description and then its name.
 *
 * @param {object} config - `{ agent, agentDescription?, judge, improver? }`.
 * @returns {{ run: Function, withAgent: Function }} The suite.
 */
function createEvalSuite(config) {
  const { agent, agentDescription, judge, improver } = config;
  const description = agentDescription ?? agent.config.description ?? agent.config.name;
  return {
    /**
     * Runs every test case (optionally for several iterations) and assembles a report.
     * Suggestions come from the improver when one is configured, otherwise the list is empty.
     */
    async run(testCases2, options) {
      const iterations = options?.iterations ?? 1;
      validateIterations(iterations);
      const executeContext = { agent, judge, agentDescription: description };
      let results;
      if (iterations <= 1) {
        results = await runWithConcurrency(testCases2, executeContext, options);
      } else {
        results = await runMultipleIterations(testCases2, executeContext, options, iterations);
      }
      const summary = calculateSummary(results, iterations > 1 ? iterations : undefined);
      let suggestions = [];
      if (improver) {
        const improvement = await improver.improve(agent.prompt, results);
        suggestions = improvement.suggestions;
      }
      return {
        summary,
        results,
        suggestions,
        generatedAt: /* @__PURE__ */ new Date(),
        promptVersion: agent.prompt.version
      };
    },
    /**
     * Returns a new suite with the same config but another agent. The explicit
     * description is cleared so the new agent's own description/name is used.
     */
    withAgent(newAgent) {
      return createEvalSuite({
        ...config,
        agent: newAgent,
        agentDescription: undefined
      });
    }
  };
}
1135
/**
 * Ensures the iteration count is an integer of at least 1.
 *
 * @param {number} iterations - Requested number of iterations.
 * @returns {void}
 * @throws {EvalError} With code "INVALID_CONFIG" when the value is below 1 or
 *   not an integer.
 */
function validateIterations(iterations) {
  const belowMinimum = iterations < 1;
  const isAcceptable = !belowMinimum && Number.isInteger(iterations);
  if (isAcceptable) {
    return;
  }
  throw new EvalError(
    `Invalid iterations value: ${iterations}. Must be a positive integer.`,
    { code: "INVALID_CONFIG" /* INVALID_CONFIG */, context: { iterations } }
  );
}
1143
/**
 * Runs the whole test-case list `iterations` times, one pass after another,
 * and aggregates the per-pass results.
 *
 * Each pass receives a fresh copy of `options` with `iterations` cleared.
 *
 * @param {Array<object>} testCases2 - Test cases to run on every pass.
 * @param {object} executeContext - Context forwarded to `runWithConcurrency`.
 * @param {object} [options] - Run options copied into every pass.
 * @param {number} iterations - Number of passes.
 * @returns {Promise<Array<object>>} Output of `aggregateIterationResults`.
 */
async function runMultipleIterations(testCases2, executeContext, options, iterations) {
  const resultsPerPass = [];
  for (let pass = 0; pass < iterations; pass++) {
    const passOptions = { ...options, iterations: undefined };
    // Passes are awaited one at a time, not started in parallel.
    resultsPerPass.push(await runWithConcurrency(testCases2, executeContext, passOptions));
  }
  return aggregateIterationResults(resultsPerPass);
}
1155
+
1156
+ // src/index.ts
1157
+ import {
1158
+ resolveFileSource,
1159
+ resolveFileSourcesInInput as resolveFileSourcesInInput3,
1160
+ scanForFileSources,
1161
+ getFileSourceDisplayInfo,
1162
+ getFileSourcesDisplayInfo as getFileSourcesDisplayInfo2,
1163
+ inferMediaType,
1164
+ isFileSource,
1165
+ isFileSourcePath,
1166
+ isFileSourceData,
1167
+ isFileSourceBase64,
1168
+ isFileSourceUrl
1169
+ } from "@agtlantis/core";
1170
+
1171
+ // src/judge/llm-judge.ts
1172
+ import { Output } from "ai";
1173
+ import { z } from "zod";
1174
+
1175
+ // src/judge/prompts/default.ts
1176
/**
 * Built-in judge prompt: a fixed system message plus a renderer for the user
 * message, which embeds the agent description, the JSON-serialised input and
 * output, an optional reference-file section and the list of criteria.
 */
const defaultJudgePrompt = {
  id: "default-judge",
  version: "2.0.0",
  system: `You are an expert evaluator specializing in assessing AI Agent outputs.

Your role is to fairly and thoroughly evaluate the agent's output against the provided criteria.

## Evaluation Principles

1. **Scoring**: Assign a score between 0-100 for each criterion
- 90-100: Exceptional - Exceeds expectations with no significant issues
- 70-89: Good - Meets expectations with minor issues
- 50-69: Acceptable - Partially meets expectations, notable issues present
- 30-49: Poor - Falls short of expectations, significant issues
- 0-29: Failing - Does not meet minimum requirements

2. **Reasoning**: Always provide specific, evidence-based reasoning
- Quote or reference specific parts of the output
- Explain both strengths and weaknesses
- Be constructive and actionable in feedback

3. **Objectivity**: Evaluate based solely on the criteria provided
- Avoid personal preferences or unstated requirements
- Consider the agent's intended purpose and context
- Weight severity of issues proportionally

## Response Format

You MUST respond with valid JSON only. No additional text or explanation outside the JSON structure.

{
"verdicts": [
{
"criterionId": "criterion-id",
"score": 0-100,
"reasoning": "Detailed explanation with specific evidence from the output",
"passed": true/false
}
]
}`,
  renderUserPrompt: (ctx) => {
    const fileSection = buildFileSection(ctx.files);
    const inputJson = JSON.stringify(ctx.input, null, 2);
    const outputJson = JSON.stringify(ctx.output, null, 2);
    // One bullet per criterion; a missing weight is shown as 1.
    const criteriaList = ctx.criteria
      .map((c) => `- **${c.name}** (id: ${c.id}, weight: ${c.weight ?? 1}): ${c.description}`)
      .join("\n");
    const rendered = `
## Agent Under Evaluation
${ctx.agentDescription}

## Input Provided to Agent
\`\`\`json
${inputJson}
\`\`\`
${fileSection}
## Agent Output
\`\`\`json
${outputJson}
\`\`\`

## Evaluation Criteria
${criteriaList}

Please evaluate the agent's output against each criterion listed above.`;
    return rendered.trim();
  }
};
1238
/**
 * Renders the "Reference Files" part of the judge's user prompt.
 *
 * @param {Array<{ path: string, content: string }>} [files] - Files to embed.
 * @returns {string} An empty string when there are no files; otherwise a
 *   heading followed by one fenced block per file, wrapped in newlines.
 */
function buildFileSection(files) {
  if (!files || files.length === 0) {
    return "";
  }
  const fence = "```";
  const fileBlocks = files.map((f) => [`### ${f.path}`, fence, `${f.content}`, fence].join("\n"));
  return `\n## Reference Files\n${fileBlocks.join("\n\n")}\n`;
}
1250
+
1251
+ // src/judge/llm-judge.ts
1252
/**
 * Copies the three token counters from a usage object, substituting 0 for
 * any that is `null` or `undefined`.
 *
 * @param {{ inputTokens?: number, outputTokens?: number, totalTokens?: number }} usage
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 */
function toEvalTokenUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return {
    inputTokens: inputTokens ?? 0,
    outputTokens: outputTokens ?? 0,
    totalTokens: totalTokens ?? 0
  };
}
1259
/**
 * Checks whether a criterion carries a callable `validator`.
 *
 * @param {object} criterion - Criterion definition.
 * @returns {boolean} `true` when a `validator` property exists and is a function.
 */
function hasValidator(criterion) {
  if (!("validator" in criterion)) {
    return false;
  }
  return typeof criterion.validator === "function";
}
1262
/**
 * Zod schema for the structured judge response: a `verdicts` array whose
 * entries hold a criterion id, a score bounded by `SCORE.MIN`/`SCORE.MAX`,
 * the reasoning text and an optional explicit `passed` flag.
 */
const JudgeResponseSchema = z.object({
  verdicts: z.array(
    z.object({
      criterionId: z.string(),
      score: z
        .number()
        .min(SCORE.MIN)
        .max(SCORE.MAX),
      reasoning: z.string(),
      passed: z.boolean().optional()
    })
  )
});
1272
/**
 * Verifies that every expected criterion id has at least one verdict.
 *
 * @param {Array<{ criterionId: string }>} verdicts - Verdicts returned by the judge.
 * @param {string[]} criteriaIds - Ids that must be covered.
 * @returns {void}
 * @throws {EvalError} With code "VERDICT_PARSE_ERROR" when any id is missing;
 *   the context lists the missing and the provided ids.
 */
function validateAllCriteriaHaveVerdicts(verdicts, criteriaIds) {
  const providedIds = new Set();
  for (const verdict of verdicts) {
    providedIds.add(verdict.criterionId);
  }
  const missingIds = criteriaIds.filter((id) => !providedIds.has(id));
  if (missingIds.length === 0) {
    return;
  }
  throw new EvalError("Judge response missing verdicts for some criteria", {
    code: "VERDICT_PARSE_ERROR" /* VERDICT_PARSE_ERROR */,
    context: { missingCriteriaIds: missingIds, providedIds: [...providedIds] }
  });
}
1282
/**
 * Computes the weighted average of verdict scores, rounded to two decimals.
 *
 * @param {Array<{ criterionId: string, score: number }>} verdicts - Verdicts to combine.
 * @param {Map<string, number>} criteriaWeights - Weight per criterion id; ids
 *   without an entry count with weight 1.
 * @returns {number} The rounded weighted average, or 0 when the total weight is 0.
 */
function calculateOverallScore(verdicts, criteriaWeights) {
  let weightTotal = 0;
  let weightedScoreTotal = 0;
  for (const { criterionId, score } of verdicts) {
    const weight = criteriaWeights.get(criterionId) ?? 1;
    weightedScoreTotal += score * weight;
    weightTotal += weight;
  }
  if (weightTotal === 0) {
    return 0;
  }
  const average = weightedScoreTotal / weightTotal;
  return Math.round(average * 100) / 100;
}
1295
/**
 * Runs every programmatic validator against the output and converts each
 * outcome into a verdict: score 100 / passed when valid, score 0 / failed
 * otherwise. The failure reasoning includes the validator's `errorSummary`
 * or a generic fallback text.
 *
 * @param {Array<object>} validatorCriteria - Criteria with `id`, `name` and `validator`.
 * @param {unknown} output - Agent output handed to each validator.
 * @returns {Array<{ criterionId: string, score: number, reasoning: string, passed: boolean }>}
 */
function runValidatorCriteria(validatorCriteria, output) {
  const verdicts = [];
  for (const criterion of validatorCriteria) {
    const outcome = criterion.validator(output);
    const passed = Boolean(outcome.valid);
    const reasoning = passed
      ? `${criterion.name} \uD1B5\uACFC`
      : `${criterion.name} \uC2E4\uD328:\n${outcome.errorSummary ?? "\uC720\uD6A8\uC131 \uAC80\uC99D \uC624\uB958"}`;
    verdicts.push({
      criterionId: criterion.id,
      score: passed ? 100 : 0,
      reasoning,
      passed
    });
  }
  return verdicts;
}
1315
/**
 * Asks the LLM judge to score the output and normalises its verdicts.
 *
 * The prompt's system text and rendered user prompt are sent through
 * `provider.simpleExecution`, requesting structured output that matches
 * `JudgeResponseSchema`. Every expected criterion must receive a verdict.
 * Verdicts for ids that were never requested, and repeated verdicts for an id
 * already seen, are dropped so they cannot distort the weighted overall score
 * (unknown ids would otherwise be counted with the default weight).
 *
 * @param {object} provider - Object exposing `simpleExecution(fn)`.
 * @param {object} prompt - Judge prompt with `id`, `version`, `system`, `renderUserPrompt`.
 * @param {object} context - Context passed to `prompt.renderUserPrompt`.
 * @param {string[]} llmCriteriaIds - Ids of the criteria the LLM must judge.
 * @param {number} passThreshold - Score at or above which a verdict passes when
 *   the LLM did not state `passed` itself.
 * @returns {Promise<{ verdicts: Array<object>, usage: object }>} Normalised
 *   verdicts (first verdict per expected id, in response order) and LLM usage.
 * @throws {EvalError} "LLM_API_ERROR" when the execution fails or is cancelled;
 *   a verdict-parse error when an expected criterion has no verdict.
 */
async function runLLMEvaluation(provider, prompt, context, llmCriteriaIds, passThreshold) {
  const messages = [
    { role: "system", content: prompt.system },
    { role: "user", content: prompt.renderUserPrompt(context) }
  ];
  let response;
  let usage;
  try {
    const execution = provider.simpleExecution(async (session) => {
      const result = await session.generateText({
        messages,
        output: Output.object({ schema: JudgeResponseSchema })
      });
      return result.output;
    });
    const executionResult = await execution.result();
    if (executionResult.status !== "succeeded") {
      throw executionResult.status === "failed" ? executionResult.error : new Error("Execution was canceled");
    }
    response = executionResult.value;
    usage = executionResult.summary.totalLLMUsage;
  } catch (cause) {
    throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
      promptId: prompt.id,
      promptVersion: prompt.version
    });
  }
  validateAllCriteriaHaveVerdicts(response.verdicts, llmCriteriaIds);
  const expectedIds = new Set(llmCriteriaIds);
  const acceptedIds = new Set();
  const verdicts = [];
  for (const v of response.verdicts) {
    // Ignore verdicts for criteria that were not requested, and duplicates.
    if (!expectedIds.has(v.criterionId) || acceptedIds.has(v.criterionId)) {
      continue;
    }
    acceptedIds.add(v.criterionId);
    verdicts.push({
      criterionId: v.criterionId,
      score: v.score,
      reasoning: v.reasoning,
      passed: v.passed ?? v.score >= passThreshold
    });
  }
  return { verdicts, usage };
}
1351
/**
 * Creates a judge that scores agent output against a list of criteria.
 *
 * Criteria carrying a validator function are checked programmatically; the
 * rest are sent to the LLM in one evaluation call. All verdicts are combined
 * into a weighted overall score, which is compared with `passThreshold`.
 *
 * @param {object} config - `{ provider, prompt?, criteria, passThreshold?, model? }`.
 *   `prompt` defaults to `defaultJudgePrompt`, `passThreshold` to
 *   `SCORE.DEFAULT_PASS_THRESHOLD`.
 * @returns {{ evaluate: (evalContext: object) => Promise<object> }} The judge.
 */
function createJudge(config) {
  const {
    provider,
    prompt = defaultJudgePrompt,
    criteria,
    passThreshold = SCORE.DEFAULT_PASS_THRESHOLD,
    model
  } = config;
  const criteriaWeights = /* @__PURE__ */ new Map();
  const validatorCriteria = [];
  const llmCriteria = [];
  const llmCriteriaIds = [];
  for (const criterion of criteria) {
    criteriaWeights.set(criterion.id, criterion.weight ?? 1);
    if (hasValidator(criterion)) {
      validatorCriteria.push(criterion);
      continue;
    }
    llmCriteria.push(criterion);
    llmCriteriaIds.push(criterion.id);
  }
  return {
    async evaluate(evalContext) {
      const { input, output, agentDescription, files } = evalContext;
      const validatorVerdicts = runValidatorCriteria(validatorCriteria, output);
      // The LLM is only consulted when at least one criterion needs it.
      const llmResult = llmCriteria.length > 0
        ? await runLLMEvaluation(
            provider,
            prompt,
            { agentDescription, input, output, criteria: llmCriteria, files },
            llmCriteriaIds,
            passThreshold
          )
        : undefined;
      const llmVerdicts = llmResult ? llmResult.verdicts : [];
      const llmUsage = llmResult ? llmResult.usage : undefined;
      const verdicts = [...validatorVerdicts, ...llmVerdicts];
      const overallScore = calculateOverallScore(verdicts, criteriaWeights);
      return {
        verdicts,
        overallScore,
        passed: overallScore >= passThreshold,
        // Metadata is only reported when the LLM call returned usage data.
        metadata: llmUsage ? { tokenUsage: toEvalTokenUsage(llmUsage), model } : undefined
      };
    }
  };
}
1409
+
1410
+ // src/judge/criteria/validate-schema.ts
1411
/**
 * Formats validation issues as a bullet list, one issue per line.
 *
 * Issues are read from `error.issues` and, when that is absent, from the
 * legacy `error.errors` alias, so error objects exposing only one of the two
 * are both handled. An error exposing neither yields an empty string.
 *
 * @param {{ issues?: Array<object>, errors?: Array<object> }} error - The validation error.
 * @returns {string} Lines of the form `- path.to.field: message`; the path
 *   prefix is omitted for issues with an empty path.
 */
function formatZodErrors(error) {
  const issues = error.issues ?? error.errors ?? [];
  return issues.map((issue) => {
    // String() keeps symbol path segments from throwing inside join().
    const location = issue.path.length > 0 ? `${issue.path.map(String).join(".")}: ` : "";
    return `- ${location}${issue.message}`;
  }).join("\n");
}
1417
/**
 * Builds a programmatic criterion that validates the agent output with a
 * schema exposing `safeParse` (a Zod schema).
 *
 * On failure the validator reports the schema's issues, read from
 * `error.issues` with the legacy `error.errors` alias as a fallback, so
 * `errors` is populated regardless of which property the error object exposes.
 *
 * @param {object} options - `{ schema, id?, weight?, name?, description? }`.
 * @returns {object} Criterion with `id`, `name`, `description`, `weight` and a
 *   `validator(output)` returning `{ valid: true }` or
 *   `{ valid: false, errors, errorSummary }`.
 */
function schema(options) {
  const { schema: schema2, id, weight, name, description } = options;
  return {
    id: id ?? "schema-validation",
    name: name ?? "\uC2A4\uD0A4\uB9C8 \uC720\uD6A8\uC131",
    description: description ?? "\uCD9C\uB825\uC774 \uC9C0\uC815\uB41C \uC2A4\uD0A4\uB9C8(Zod)\uB97C \uC900\uC218\uD558\uB294\uC9C0 \uD504\uB85C\uADF8\uB798\uBC0D \uBC29\uC2DD\uC73C\uB85C \uAC80\uC99D\uD569\uB2C8\uB2E4.",
    weight,
    validator: (output) => {
      const result = schema2.safeParse(output);
      if (result.success) {
        return { valid: true };
      }
      const issues = result.error.issues ?? result.error.errors ?? [];
      return {
        valid: false,
        errors: issues,
        // Both property names are supplied so the formatter finds the issues
        // whichever of the two it reads.
        errorSummary: formatZodErrors({ issues, errors: issues })
      };
    }
  };
}
1437
+
1438
+ // src/judge/criteria/index.ts
1439
/**
 * Creates the built-in "accuracy" criterion.
 *
 * @param {{ weight?: number }} [options] - Optional weight for the criterion.
 * @returns {{ id: string, name: string, description: string, weight: number | undefined }}
 */
function accuracy(options) {
  const weight = options?.weight;
  const description = "Evaluates whether the output is factually correct, free from errors, and avoids hallucinations. Check for incorrect facts, made-up information, or misrepresentation of the input data.";
  return { id: "accuracy", name: "Accuracy", description, weight };
}
1447
/**
 * Creates the built-in "consistency" criterion.
 *
 * @param {{ weight?: number }} [options] - Optional weight for the criterion.
 * @returns {{ id: string, name: string, description: string, weight: number | undefined }}
 */
function consistency(options) {
  const weight = options?.weight;
  const description = "Evaluates whether the output is internally coherent and logically consistent. Check for self-contradictions, conflicting statements, or logical inconsistencies within the response.";
  return { id: "consistency", name: "Consistency", description, weight };
}
1455
/**
 * Creates the built-in "relevance" criterion.
 *
 * @param {{ weight?: number }} [options] - Optional weight for the criterion.
 * @returns {{ id: string, name: string, description: string, weight: number | undefined }}
 */
function relevance(options) {
  const weight = options?.weight;
  const description = "Evaluates whether the output directly addresses the input and fulfills the user intent. Check for off-topic content, missing key requirements, or responses that fail to answer the actual question.";
  return { id: "relevance", name: "Relevance", description, weight };
}
1463
+
1464
+ // src/reporter/markdown.ts
1465
+ import { writeFile } from "fs/promises";
1466
+ import { getFileSourcesDisplayInfo } from "@agtlantis/core";
1467
// Marker placed next to passing tests and verdicts in the Markdown report.
const PASS_ICON = "\u2705";
// Marker placed next to failing tests and verdicts in the Markdown report.
const FAIL_ICON = "\u274C";
// Sort rank per suggestion priority; a lower rank is listed first.
const PRIORITY_ORDER = {
  high: 0,
  medium: 1,
  low: 2
};
1470
/**
 * Renders an evaluation report as a Markdown document.
 *
 * The document has a summary table, the failed tests, the passed tests
 * (collapsed in a `<details>` element unless `expandPassedTests` is set) and
 * the improvement suggestions ordered by priority. Suggestions whose priority
 * is not listed in `PRIORITY_ORDER` are placed after all known priorities
 * instead of producing a NaN comparison during sorting.
 *
 * @param {object} report - `{ summary, results, suggestions, generatedAt, promptVersion }`.
 * @param {object} [options] - Rendering options.
 * @param {boolean} [options.expandPassedTests=false] - Show passed tests uncollapsed.
 * @param {boolean} [options.includeRawOutput=false] - Forwarded to `formatTestResult`.
 * @param {number} [options.outputPreviewLength=200] - Forwarded to `formatTestResult`.
 * @returns {string} The Markdown text.
 */
function reportToMarkdown(report, options = {}) {
  const {
    expandPassedTests = false,
    includeRawOutput = false,
    outputPreviewLength = 200
  } = options;
  const { summary, results, suggestions, generatedAt, promptVersion } = report;
  const passRate = summary.totalTests > 0 ? (summary.passed / summary.totalTests * 100).toFixed(1) : "0.0";
  const renderResult = (result) => formatTestResult(result, outputPreviewLength, includeRawOutput);
  const lines = [];
  lines.push("# Evaluation Report");
  lines.push("");
  lines.push(`> Generated: ${generatedAt.toISOString()}`);
  lines.push(`> Prompt Version: ${promptVersion}`);
  lines.push("");
  lines.push("## Summary");
  lines.push("");
  lines.push(`| Metric | Value |`);
  lines.push(`|--------|-------|`);
  lines.push(`| Total Tests | ${summary.totalTests} |`);
  if (summary.iterations && summary.iterations > 1) {
    lines.push(`| **Iterations** | **${summary.iterations}** |`);
  }
  lines.push(`| Passed | ${summary.passed} (${passRate}%) |`);
  lines.push(`| Failed | ${summary.failed} |`);
  if (summary.avgStdDev !== undefined) {
    lines.push(
      `| Average Score | ${summary.avgScore.toFixed(1)} \xB1 ${summary.avgStdDev.toFixed(1)} |`
    );
  } else {
    lines.push(`| Average Score | ${summary.avgScore.toFixed(1)} |`);
  }
  if (summary.avgPassRate !== undefined) {
    lines.push(`| Avg Pass Rate | ${(summary.avgPassRate * 100).toFixed(1)}% |`);
  }
  lines.push(`| Avg Latency | ${summary.metrics.avgLatencyMs.toFixed(0)}ms |`);
  lines.push(`| Total Tokens | ${summary.metrics.totalTokens} |`);
  if (summary.costSummary?.total !== undefined) {
    lines.push(`| Est. Cost | $${summary.costSummary.total.toFixed(4)} |`);
  }
  lines.push("");
  const failedResults = results.filter((r) => !r.passed);
  if (failedResults.length > 0) {
    lines.push(`## ${FAIL_ICON} Failed Tests`);
    lines.push("");
    for (const result of failedResults) {
      lines.push(renderResult(result));
    }
  }
  const passedResults = results.filter((r) => r.passed);
  if (passedResults.length > 0) {
    lines.push(`## ${PASS_ICON} Passed Tests`);
    lines.push("");
    if (!expandPassedTests) {
      lines.push("<details>");
      lines.push("<summary>Click to expand passed tests</summary>");
      lines.push("");
    }
    for (const result of passedResults) {
      lines.push(renderResult(result));
    }
    if (!expandPassedTests) {
      lines.push("</details>");
      lines.push("");
    }
  }
  if (suggestions.length > 0) {
    lines.push("## \u{1F4A1} Improvement Suggestions");
    lines.push("");
    // Unknown priorities rank after every known one; the sort is stable, so
    // suggestions with equal rank keep their original relative order.
    const unknownRank = Object.keys(PRIORITY_ORDER).length;
    const rankOf = (suggestion) => {
      const rank = PRIORITY_ORDER[suggestion.priority];
      return typeof rank === "number" ? rank : unknownRank;
    };
    const sortedSuggestions = [...suggestions].sort((a, b) => rankOf(a) - rankOf(b));
    for (const suggestion of sortedSuggestions) {
      lines.push(formatSuggestion(suggestion));
    }
  }
  return lines.join("\n");
}
1549
/**
 * Renders the report as Markdown and writes it to a file as UTF-8.
 *
 * @param {object} report - Report to render.
 * @param {string} path3 - Destination file path.
 * @param {object} [options] - Rendering options forwarded to `reportToMarkdown`.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function saveReportMarkdown(report, path3, options) {
  await writeFile(path3, reportToMarkdown(report, options), "utf-8");
}
1553
/**
 * Serialises a value as pretty-printed JSON inside a fenced `json` block.
 *
 * `JSON.stringify` returns `undefined` for values JSON cannot represent
 * (`undefined`, functions, symbols). Those are rendered as the text
 * "undefined" so the block content is always a string, both when it is
 * handed to `truncate` and when it is emitted as a line.
 *
 * @param {unknown} value - Value to serialise.
 * @param {number} [maxLength] - When given, the JSON text is passed through
 *   `truncate` with this limit.
 * @returns {string[]} Three lines: opening fence, content, closing fence.
 */
function jsonCodeBlock(value, maxLength) {
  const json = JSON.stringify(value, null, 2) ?? "undefined";
  const content = maxLength !== undefined ? truncate(json, maxLength) : json;
  return ["```json", content, "```"];
}
1558
/**
 * Maps a pass flag to the corresponding report icon.
 *
 * @param {boolean} passed - Whether the item passed.
 * @returns {string} `PASS_ICON` for a truthy flag, otherwise `FAIL_ICON`.
 */
function passFailIcon(passed) {
  if (passed) {
    return PASS_ICON;
  }
  return FAIL_ICON;
}
1561
/**
 * Renders one test result as a Markdown section.
 *
 * The section contains, in order: a heading with id and score, the optional
 * description, attached files, multi-turn details, iteration tables, either
 * the conversation history or the single input/output pair, the verdict list
 * and, when requested, the raw output in a collapsed block.
 *
 * @param {object} result - Result to render.
 * @param {number} previewLength - Length limit handed to the input/output renderers.
 * @param {boolean} includeRaw - Whether to append the untruncated raw output.
 * @returns {string} The section text, lines joined with "\n".
 */
function formatTestResult(result, previewLength, includeRaw) {
  const { testCase, iterationStats, conversationHistory } = result;
  const testId = testCase.id ?? "unnamed";
  const scoreDisplay = iterationStats
    ? `${result.overallScore.toFixed(1)} \xB1 ${iterationStats.stdDev.toFixed(1)}`
    : result.overallScore.toFixed(1);
  const lines = [`### ${testId} (Score: ${scoreDisplay})`, ""];
  if (testCase.description) {
    lines.push(`> ${testCase.description}`, "");
  }
  const fileDisplayInfos = getFileSourcesDisplayInfo(testCase.input);
  if (fileDisplayInfos.length > 0) {
    lines.push("**Files:**");
    for (const info of fileDisplayInfos) {
      const namePrefix = info.filename ? `${info.filename} - ` : "";
      lines.push(`- ${namePrefix}${info.source}: ${info.description} (${info.mediaType})`);
    }
    lines.push("");
  }
  if (result.totalTurns !== undefined) {
    lines.push(
      `**Multi-turn:** ${result.totalTurns} turns | Termination: ${result.terminationReason ?? "unknown"}`,
      ""
    );
  }
  if (result.multiTurnIterationStats) {
    lines.push(...formatMultiTurnIterationStats(result.multiTurnIterationStats));
  }
  if (iterationStats && result.iterationResults) {
    lines.push(...formatIterationResults(iterationStats, result.iterationResults));
  }
  // A non-empty conversation history replaces the single input/output pair.
  const hasHistory = Boolean(conversationHistory && conversationHistory.length > 0);
  if (hasHistory) {
    lines.push(...formatConversationHistory(conversationHistory, previewLength));
  } else {
    lines.push(...formatSingleTurnInputOutput(testCase.input, result.output, previewLength));
  }
  lines.push("**Verdicts:**");
  for (const verdict of result.verdicts) {
    const icon = passFailIcon(verdict.passed);
    lines.push(`- ${icon} **${verdict.criterionId}**: ${verdict.score} - ${verdict.reasoning}`);
  }
  lines.push("");
  if (includeRaw) {
    lines.push("<details>", "<summary>Raw Output</summary>", "");
    lines.push(...jsonCodeBlock(result.output));
    lines.push("</details>", "");
  }
  return lines.join("\n");
}
1616
/**
 * Renders the multi-turn iteration statistics as a small Markdown table.
 *
 * @param {{ terminationCounts: Object<string, number>, avgTurns: number,
 *   minTurns: number, maxTurns: number }} stats - Statistics to show.
 * @returns {string[]} Markdown lines, ending with an empty line. The
 *   termination cell reads "none" when no termination type was counted.
 */
function formatMultiTurnIterationStats(stats) {
  const parts = [];
  for (const [type, count] of Object.entries(stats.terminationCounts)) {
    parts.push(`${type}: ${count}`);
  }
  const terminationSummary = parts.length > 0 ? parts.join(", ") : "none";
  const lines = ["**Multi-turn Iteration Statistics:**", "", "| Metric | Value |", "|--------|-------|"];
  lines.push(`| Avg Turns | ${stats.avgTurns.toFixed(1)} |`);
  lines.push(`| Min/Max Turns | ${stats.minTurns} / ${stats.maxTurns} |`);
  lines.push(`| Termination Distribution | ${terminationSummary} |`);
  lines.push("");
  return lines;
}
1629
/**
 * Renders a per-iteration table followed by a one-line statistics summary.
 *
 * @param {{ mean: number, stdDev: number, min: number, max: number, passRate: number }} stats
 * @param {Array<{ overallScore: number, passed: boolean, metrics: { latencyMs: number } }>} results
 * @returns {string[]} Markdown lines, ending with an empty line.
 */
function formatIterationResults(stats, results) {
  const lines = [
    "**Iteration Results:**",
    "",
    "| # | Score | Passed | Latency |",
    "|---|-------|--------|---------|"
  ];
  let rowNumber = 0;
  for (const iter of results) {
    rowNumber += 1;
    const score = iter.overallScore.toFixed(1);
    const latency = iter.metrics.latencyMs.toFixed(0);
    lines.push(`| ${rowNumber} | ${score} | ${passFailIcon(iter.passed)} | ${latency}ms |`);
  }
  const spread = `${stats.mean.toFixed(1)} \xB1 ${stats.stdDev.toFixed(1)}`;
  const range = `min: ${stats.min.toFixed(0)}, max: ${stats.max.toFixed(0)}`;
  const passRate = `pass rate: ${(stats.passRate * 100).toFixed(0)}%`;
  lines.push("", `**Stats:** ${spread} (${range}, ${passRate})`, "");
  return lines;
}
1648
/**
 * Renders each conversation turn as a collapsed block showing its input and output.
 *
 * @param {Array<{ turn: number, input: unknown, output: unknown }>} history - Turns in order.
 * @param {number} previewLength - Length limit handed to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines: a heading, a blank line, then one
 *   `<details>` section per turn.
 */
function formatConversationHistory(history, previewLength) {
  const turnSections = history.flatMap((turn) => [
    "<details>",
    `<summary>Turn ${turn.turn}</summary>`,
    "",
    "**Input:**",
    ...jsonCodeBlock(turn.input, previewLength),
    "",
    "**Output:**",
    ...jsonCodeBlock(turn.output, previewLength),
    "</details>",
    ""
  ]);
  return ["**Conversation History:**", "", ...turnSections];
}
1664
/**
 * Renders the input and output of a single-turn test as two labelled JSON blocks.
 *
 * @param {unknown} input - Test input.
 * @param {unknown} output - Agent output.
 * @param {number} previewLength - Length limit handed to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines, each block followed by an empty line.
 */
function formatSingleTurnInputOutput(input, output, previewLength) {
  const lines = ["**Input:**"];
  lines.push(...jsonCodeBlock(input, previewLength));
  lines.push("", "**Output:**");
  lines.push(...jsonCodeBlock(output, previewLength));
  lines.push("");
  return lines;
}
1674
/**
 * Renders one improvement suggestion as a Markdown section with a diff block.
 *
 * Every line of `currentValue` is prefixed with "- " and every line of
 * `suggestedValue` with "+ ". Priorities without a dedicated icon get a
 * neutral white circle.
 *
 * @param {{ priority: string, type: string, reasoning: string,
 *   expectedImprovement: string, currentValue: string, suggestedValue: string }} suggestion
 * @returns {string} The section text, lines joined with "\n".
 */
function formatSuggestion(suggestion) {
  const iconByPriority = { high: "\u{1F534}", medium: "\u{1F7E1}", low: "\u{1F7E2}" };
  const priorityIcon = iconByPriority[suggestion.priority] ?? "\u26AA";
  const prefixLines = (text, marker) => text.split("\n").map((line) => `${marker} ${line}`).join("\n");
  return [
    `### ${priorityIcon} [${suggestion.priority.toUpperCase()}] ${suggestion.type}`,
    "",
    `**Reasoning:** ${suggestion.reasoning}`,
    "",
    `**Expected Improvement:** ${suggestion.expectedImprovement}`,
    "",
    "**Diff:**",
    "```diff",
    prefixLines(suggestion.currentValue, "-"),
    prefixLines(suggestion.suggestedValue, "+"),
    "```",
    ""
  ].join("\n");
}
1691
/**
 * Compares two reports and summarises what changed from `before` to `after`.
 *
 * Tests are matched through the id-to-score maps built by `buildScoreMap`.
 * A test present in both reports is listed as improved or regressed when its
 * score went up or down; tests only present in `before` are listed as removed.
 *
 * @param {object} before - Baseline report.
 * @param {object} after - Report to compare against the baseline.
 * @returns {{ scoreDelta: number, passRateDelta: number,
 *   metricsDelta: { latencyMs: number, tokenUsage: number },
 *   improved: string[], regressed: string[], removed: string[] }}
 */
function compareReports(before, after) {
  const passRateOf = ({ totalTests, passed }) => (totalTests > 0 ? passed / totalTests : 0);
  const beforeSummary = before.summary;
  const afterSummary = after.summary;
  const scoreDelta = afterSummary.avgScore - beforeSummary.avgScore;
  const beforePassRate = passRateOf(beforeSummary);
  const afterPassRate = passRateOf(afterSummary);
  const metricsDelta = {
    latencyMs: afterSummary.metrics.avgLatencyMs - beforeSummary.metrics.avgLatencyMs,
    tokenUsage: afterSummary.metrics.totalTokens - beforeSummary.metrics.totalTokens
  };
  const beforeScores = buildScoreMap(before.results);
  const afterScores = buildScoreMap(after.results);
  const improved = [];
  const regressed = [];
  afterScores.forEach((afterScore, id) => {
    const beforeScore = beforeScores.get(id);
    if (beforeScore === undefined) {
      // The test is new in `after`; there is nothing to compare it with.
      return;
    }
    if (afterScore > beforeScore) {
      improved.push(id);
    } else if (afterScore < beforeScore) {
      regressed.push(id);
    }
  });
  const removed = [];
  for (const id of beforeScores.keys()) {
    if (!afterScores.has(id)) {
      removed.push(id);
    }
  }
  return {
    scoreDelta,
    passRateDelta: afterPassRate - beforePassRate,
    metricsDelta,
    improved,
    regressed,
    removed
  };
}
1723
/**
 * Indexes overall scores by test-case id.
 *
 * Results without an id share the key "unnamed"; when a key occurs more than
 * once, the score of the last occurrence is kept.
 *
 * @param {Array<{ testCase: { id?: string }, overallScore: number }>} results
 * @returns {Map<string, number>} Map from test id to overall score.
 */
function buildScoreMap(results) {
  const entries = results.map(({ testCase, overallScore }) => [testCase.id ?? "unnamed", overallScore]);
  return new Map(entries);
}
1730
+
1731
+ // src/reporter/json-reporter.ts
1732
+ import { writeFileSync } from "fs";
1733
+
1734
+ // src/reporter/cost-helpers.ts
1735
+ import {
1736
+ calculateCostFromUsage
1737
+ } from "@agtlantis/core";
1738
/**
 * Copies the three token counters of a usage object into a new object.
 *
 * @param {{ inputTokens: number, outputTokens: number, totalTokens: number }} usage
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 */
function toLanguageModelUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
1745
// Maps provider names as they appear in metadata to the provider key used
// for pricing lookups ("gemini" is priced under "google").
const PROVIDER_MAPPING = {
  gemini: "google",
  openai: "openai",
  anthropic: "anthropic",
  google: "google"
};
1751
/**
 * Guesses the provider from a model name prefix.
 *
 * @param {string} [model] - Model identifier.
 * @returns {string} "openai" for names starting with "gpt-", "o1" or "o3",
 *   "anthropic" for "claude-", and "google" for "gemini-", for any other name
 *   and for a missing model.
 */
function detectProvider(model) {
  if (!model) {
    return "google";
  }
  const prefixTable = [
    ["gpt-", "openai"],
    ["o1", "openai"],
    ["o3", "openai"],
    ["gemini-", "google"],
    ["claude-", "anthropic"]
  ];
  const match = prefixTable.find(([prefix]) => model.startsWith(prefix));
  return match ? match[1] : "google";
}
1764
/**
 * Translates a provider name through `PROVIDER_MAPPING`.
 *
 * @param {string} [provider] - Provider name from metadata.
 * @returns {string} "google" for a missing provider, the mapped name when one
 *   exists, otherwise the name unchanged.
 */
function normalizeProvider(provider) {
  if (!provider) {
    return "google";
  }
  const mapped = PROVIDER_MAPPING[provider];
  return mapped ?? provider;
}
1768
/**
 * Computes the cost of one component (agent or judge) from its token usage.
 *
 * The provider is the normalised explicit provider when one is given,
 * otherwise it is guessed from the model name. Provider-specific pricing is
 * looked up in `config.providerPricing`.
 *
 * @param {object} [tokenUsage] - Token counters; no cost is computed without them.
 * @param {string} [model] - Model identifier; "unknown" is used when missing.
 * @param {string} [provider] - Explicit provider name.
 * @param {object} [config] - Cost configuration with optional `providerPricing`.
 * @returns {number | undefined} The `total` reported by `calculateCostFromUsage`,
 *   or `undefined` when `tokenUsage` is falsy.
 */
function calculateComponentCost(tokenUsage, model, provider, config) {
  if (!tokenUsage) {
    return undefined;
  }
  const normalizedProvider = provider ? normalizeProvider(provider) : detectProvider(model);
  const pricingForProvider = config?.providerPricing?.[normalizedProvider];
  const { total } = calculateCostFromUsage(
    toLanguageModelUsage(tokenUsage),
    model ?? "unknown",
    normalizedProvider,
    pricingForProvider
  );
  return total;
}
1780
/**
 * Adds a `total` to a per-component cost object.
 *
 * @param {{ agent?: number, judge?: number, improver?: number }} costs - Component costs;
 *   missing ones count as 0.
 * @returns {object} A copy of `costs` with `total` set to the sum, or to
 *   `undefined` when the sum is not greater than 0.
 */
function buildCostBreakdown(costs) {
  const { agent = null, judge = null, improver = null } = costs;
  const sum = (agent ?? 0) + (judge ?? 0) + (improver ?? 0);
  const total = sum > 0 ? sum : undefined;
  return { ...costs, total };
}
1787
/**
 * Computes the agent and judge cost of one result.
 *
 * The judge cost is only computed when the result carries judge token usage.
 *
 * @param {object} result - Result with `metrics.tokenUsage` and optional
 *   `agentMetadata` / `judgeMetadata`.
 * @param {object} [config] - Cost configuration forwarded to `calculateComponentCost`.
 * @returns {object} Output of `buildCostBreakdown` for `{ agent, judge }`.
 */
function calculateResultCost(result, config) {
  const { metrics, agentMetadata, judgeMetadata } = result;
  const agent = calculateComponentCost(
    metrics.tokenUsage,
    agentMetadata?.model,
    agentMetadata?.provider,
    config
  );
  let judge;
  if (judgeMetadata?.tokenUsage) {
    judge = calculateComponentCost(
      judgeMetadata.tokenUsage,
      judgeMetadata.model,
      judgeMetadata.provider,
      config
    );
  }
  return buildCostBreakdown({ agent, judge });
}
1805
/**
 * Sums the agent and judge costs over every result of a report.
 *
 * @param {{ results: Array<object> }} report - Report whose results are costed.
 * @param {object} [config] - Cost configuration forwarded to `calculateResultCost`.
 * @returns {{ total: number, byComponent: { agent: number, judge: number } }}
 *   Per-component sums (missing costs count as 0) and their total.
 */
function calculateReportCosts(report, config) {
  const byComponent = report.results.reduce(
    (sums, result) => {
      const { agent, judge } = calculateResultCost(result, config);
      return { agent: sums.agent + (agent ?? 0), judge: sums.judge + (judge ?? 0) };
    },
    { agent: 0, judge: 0 }
  );
  return {
    total: byComponent.agent + byComponent.judge,
    byComponent
  };
}
1821
/**
 * Returns new result objects whose metrics include a cost breakdown.
 *
 * Only testCase, output, metrics (latencyMs, tokenUsage, costBreakdown), error,
 * verdicts, overallScore and passed are carried over to each returned object.
 *
 * @param {object[]} results - Evaluated test results.
 * @param {object | undefined} config - Pricing config passed to calculateResultCost.
 * @returns {object[]} One new object per input result.
 */
function addCostsToResults(results, config) {
  return results.map((result) => {
    const costBreakdown = calculateResultCost(result, config);
    const { testCase, output, metrics, error, verdicts, overallScore, passed } = result;
    return {
      testCase,
      output,
      metrics: {
        latencyMs: metrics.latencyMs,
        tokenUsage: metrics.tokenUsage,
        costBreakdown
      },
      error,
      verdicts,
      overallScore,
      passed
    };
  });
}
1840
+
1841
+ // src/reporter/format-utils.ts
1842
+ import { mkdirSync } from "fs";
1843
+ import path from "path";
1844
/**
 * Formats a score delta for display with an explicit sign and one decimal.
 *
 * @param {number | null | undefined} delta - Score change; null or undefined
 *   means "no delta available".
 * @returns {string} "-" when no delta is available, otherwise e.g. "+1.5" or "-2.0".
 */
function formatScoreDelta(delta) {
  // `== null` also covers undefined, which would otherwise crash on toFixed().
  if (delta == null) {
    return "-";
  }
  const sign = delta >= 0 ? "+" : "";
  return `${sign}${delta.toFixed(1)}`;
}
1851
/**
 * Ensures the output directory exists and builds the path of a report file.
 *
 * @param {string} outputDir - Directory to create (recursively) if missing.
 * @param {string} name - Base file name without extension.
 * @param {string} extension - File extension without the dot.
 * @param {boolean} addTimestamp - When truthy, `-<Date.now()>` is appended to the name.
 * @returns {string} `outputDir` joined with the resulting file name.
 */
function buildOutputPath(outputDir, name, extension, addTimestamp) {
  mkdirSync(outputDir, { recursive: true });
  const stem = addTimestamp ? `${name}-${Date.now()}` : `${name}`;
  return path.join(outputDir, `${stem}.${extension}`);
}
1856
/**
 * Converts Date instances to ISO-8601 strings and passes everything else through.
 *
 * @param {*} value - A Date or any other value.
 * @returns {*} `value.toISOString()` for Dates, otherwise `value` unchanged.
 */
function toISOStringIfDate(value) {
  if (value instanceof Date) {
    return value.toISOString();
  }
  return value;
}
1859
+
1860
+ // src/reporter/json-reporter.ts
1861
/**
 * Reporter that writes an eval report to a pretty-printed JSON file.
 *
 * Options: `outputDir` (target directory), `pricing` (when set, a `costs`
 * section computed by calculateReportCosts is included) and `addTimestamp`
 * (defaults to true; forwarded to buildOutputPath).
 */
var JsonReporter = class {
  outputDir;
  pricing;
  addTimestamp;
  constructor(options) {
    this.outputDir = options.outputDir;
    this.pricing = options.pricing;
    this.addTimestamp = options.addTimestamp ?? true;
  }
  /**
   * Serializes the report and writes it to disk.
   *
   * @param {object} report - Report with summary, results, suggestions,
   *   generatedAt and promptVersion.
   * @param {string} name - Base file name (without extension).
   * @returns {string} Path of the written file.
   */
  save(report, name) {
    const filepath = buildOutputPath(this.outputDir, name, "json", this.addTimestamp);
    const costs = this.pricing ? calculateReportCosts(report, this.pricing) : void 0;
    const output = {
      summary: report.summary,
      results: report.results,
      suggestions: report.suggestions,
      // generatedAt may already be an ISO string (the cycle writers in this file
      // handle that case the same way), so only Date instances are converted.
      generatedAt: toISOStringIfDate(report.generatedAt),
      promptVersion: report.promptVersion,
      ...costs && { costs }
    };
    writeFileSync(filepath, JSON.stringify(output, null, 2));
    return filepath;
  }
};
1885
+
1886
+ // src/reporter/markdown-reporter.ts
1887
+ import { writeFileSync as writeFileSync2 } from "fs";
1888
/**
 * Reporter that renders an eval report with reportToMarkdown and writes it to
 * a `.md` file.
 *
 * Options: `outputDir`, `addTimestamp` (defaults to true) and `markdown`
 * (options forwarded to reportToMarkdown; defaults to an empty object).
 */
var MarkdownReporter = class {
  outputDir;
  addTimestamp;
  markdownOptions;
  constructor(options) {
    const { outputDir, addTimestamp, markdown } = options;
    this.outputDir = outputDir;
    this.addTimestamp = addTimestamp ?? true;
    this.markdownOptions = markdown ?? {};
  }
  /**
   * @param {object} report - Report to render.
   * @param {string} name - Base file name (without extension).
   * @returns {string} Path of the written markdown file.
   */
  save(report, name) {
    const target = buildOutputPath(this.outputDir, name, "md", this.addTimestamp);
    const rendered = reportToMarkdown(report, this.markdownOptions);
    writeFileSync2(target, rendered);
    return target;
  }
};
1904
+
1905
+ // src/reporter/console-reporter.ts
1906
/**
 * Renders an agent output as text for console display.
 * Strings are shown as-is; other values are JSON-encoded so structured outputs
 * do not collapse to "[object Object]". Values JSON cannot encode (undefined,
 * functions, cyclic structures, BigInt) fall back to String().
 */
function formatOutputForConsole(output) {
  if (typeof output === "string") {
    return output;
  }
  try {
    return JSON.stringify(output) ?? String(output);
  } catch {
    return String(output);
  }
}

/**
 * Reporter that prints an eval report to the console.
 *
 * Options: `verbosity` ("summary" by default; any other value also lists each
 * test, and "full" adds truncated input/output) and `pricing` (when set, the
 * total cost from calculateReportCosts is printed).
 */
var ConsoleReporter = class {
  verbosity;
  pricing;
  constructor(options = {}) {
    this.verbosity = options.verbosity ?? "summary";
    this.pricing = options.pricing;
  }
  /**
   * Prints the report header, optional per-test lines and optional cost.
   *
   * @param {object} report - Report with `summary` and `results`.
   */
  log(report) {
    const { summary } = report;
    const passRate = summary.totalTests > 0 ? summary.passed / summary.totalTests : 0;
    console.log(`\n\u{1F4CA} Eval Report: ${summary.totalTests} tests`);
    console.log(` Score: ${summary.avgScore.toFixed(1)} | Pass Rate: ${(passRate * 100).toFixed(0)}%`);
    if (this.verbosity === "summary") {
      this.logCostIfAvailable(report);
      return;
    }
    console.log("");
    for (const result of report.results) {
      const testId = result.testCase.id || "unknown";
      const status = result.passed ? "\u2713" : "\u2717";
      console.log(` ${status} [${testId}] Score: ${result.overallScore.toFixed(1)}`);
      if (this.verbosity === "full") {
        console.log(` Input: ${truncate(JSON.stringify(result.testCase.input), 80)}`);
        console.log(` Output: ${truncate(formatOutputForConsole(result.output), 80)}`);
      }
    }
    this.logCostIfAvailable(report);
  }
  /** Prints the total report cost when pricing was configured. */
  logCostIfAvailable(report) {
    if (this.pricing) {
      const costs = calculateReportCosts(report, this.pricing);
      console.log(`\n\u{1F4B0} Cost: $${costs.total.toFixed(4)}`);
    }
  }
};
1943
+
1944
+ // src/reporter/composite-reporter.ts
1945
/**
 * Reporter that fans a report out to several reporters.
 */
var CompositeReporter = class {
  constructor(reporters) {
    this.reporters = reporters;
  }
  /**
   * Saves to all reporters that support saving, and logs on every reporter
   * that supports logging (log-only reporters included).
   * Returns the first successful file path (usually JsonReporter).
   *
   * @param {object} report - Report to persist.
   * @param {string} name - Base file name passed to each reporter.
   * @returns {string} Path returned by the first reporter that saved.
   * @throws {Error} When no reporter saved successfully; the message lists the
   *   individual failures and `cause` holds the first one.
   */
  save(report, name) {
    const errors = [];
    let firstPath;
    for (const reporter of this.reporters) {
      if (!reporter.save) {
        reporter.log?.(report);
        continue;
      }
      try {
        const savedPath = reporter.save(report, name);
        if (!firstPath) firstPath = savedPath;
      } catch (error) {
        errors.push({
          // Null-prototype reporters have no constructor to name.
          reporter: reporter.constructor?.name ?? "UnknownReporter",
          error
        });
      }
      reporter.log?.(report);
    }
    if (!firstPath) {
      // Anything can be thrown, so do not assume `.message` exists.
      const describe = (e) => `${e.reporter}: ${e.error instanceof Error ? e.error.message : String(e.error)}`;
      const details = errors.length > 0 ? errors.map(describe).join(", ") : "No reporters support save()";
      const failureOptions = errors.length > 0 ? { cause: errors[0].error } : void 0;
      throw new Error(`No reporter saved successfully. ${details}`, failureOptions);
    }
    return firstPath;
  }
  /** Logs the report on every reporter that implements `log`. */
  log(report) {
    for (const reporter of this.reporters) {
      reporter.log?.(report);
    }
  }
};
1984
+
1985
+ // src/reporter/factory.ts
1986
/**
 * Creates a JsonReporter for the given directory.
 *
 * @param {string} outputDir - Directory reports are written to.
 * @param {object} [options] - Extra JsonReporter options; they are spread
 *   after `outputDir`, so a key of the same name takes precedence.
 * @returns {JsonReporter}
 */
function createJsonReporter(outputDir, options) {
  const reporterOptions = { outputDir, ...options };
  return new JsonReporter(reporterOptions);
}
1989
/**
 * Creates a MarkdownReporter for the given directory.
 *
 * @param {string} outputDir - Directory reports are written to.
 * @param {object} [options] - Extra MarkdownReporter options; they are spread
 *   after `outputDir`, so a key of the same name takes precedence.
 * @returns {MarkdownReporter}
 */
function createMarkdownReporter(outputDir, options) {
  const reporterOptions = { outputDir, ...options };
  return new MarkdownReporter(reporterOptions);
}
1992
/**
 * Creates a ConsoleReporter.
 *
 * @param {object} [options] - Passed unchanged to the ConsoleReporter constructor.
 * @returns {ConsoleReporter}
 */
function createConsoleReporter(options) {
  const reporter = new ConsoleReporter(options);
  return reporter;
}
1995
/**
 * Creates a CompositeReporter over the given reporters.
 *
 * @param {object[]} reporters - Reporters to combine.
 * @returns {CompositeReporter}
 */
function createCompositeReporter(reporters) {
  const composite = new CompositeReporter(reporters);
  return composite;
}
1998
/**
 * Creates the default reporter: a JsonReporter plus a ConsoleReporter wrapped
 * in a CompositeReporter.
 *
 * @param {string} outputDir - Directory the JSON report is written to.
 * @param {object} [options] - Optional `pricing`, `addTimestamp` and `verbosity`.
 * @returns {CompositeReporter}
 */
function createDefaultReporter(outputDir, options) {
  const jsonReporter = new JsonReporter({
    outputDir,
    pricing: options?.pricing,
    addTimestamp: options?.addTimestamp
  });
  const consoleReporter = new ConsoleReporter({
    verbosity: options?.verbosity,
    pricing: options?.pricing
  });
  return new CompositeReporter([jsonReporter, consoleReporter]);
}
2011
+
2012
+ // src/reporter/runner.ts
2013
/**
 * Builds a function that runs a suite, logs the report and saves it as JSON.
 *
 * @param {object} options - `outputDir`, optional `pricing`, and `verbosity`
 *   (`false` disables console output; a falsy non-false value means "summary").
 * @returns {(suite: object, testCases: object[], name: string) => Promise<{report: object, savedPath: string}>}
 */
function createReportRunner(options) {
  const { outputDir, pricing, verbosity } = options;
  const jsonReporter = new JsonReporter({ outputDir, pricing });
  let consoleReporter = null;
  if (verbosity !== false) {
    consoleReporter = new ConsoleReporter({ verbosity: verbosity || "summary", pricing });
  }
  return async (suite, testCases2, name) => {
    const report = await suite.run(testCases2);
    if (consoleReporter) {
      consoleReporter.log(report);
    }
    const savedPath = jsonReporter.save(report, name);
    return { report, savedPath };
  };
}
2024
+
2025
+ // src/reporter/cycle-json.ts
2026
+ import { writeFileSync as writeFileSync3, mkdirSync as mkdirSync2 } from "fs";
2027
+ import path2 from "path";
2028
/**
 * Writes an improvement-cycle result to a directory as JSON files.
 *
 * @param {object} result - Cycle result with `rounds`.
 * @param {object} options - Either `directory`, or both `outputDir` and `name`;
 *   `saveRounds` (default true) controls whether per-round reports are written.
 * @returns {string} The directory the files were written to.
 */
function saveCycleJson(result, options) {
  const { saveRounds = true } = options;
  const cycleDir = resolveCycleDirectory(options.outputDir, options.name, options.directory);
  mkdirSync2(cycleDir, { recursive: true });
  saveCycleSummary(cycleDir, result);
  if (saveRounds) {
    saveRoundReports(cycleDir, result.rounds);
  }
  return cycleDir;
}
2038
/**
 * Picks the directory a cycle is saved to.
 *
 * @param {string | undefined} outputDir - Parent directory for a generated folder.
 * @param {string | undefined} name - Folder name prefix; `-<Date.now()>` is appended.
 * @param {string | undefined} directory - Explicit directory; wins when truthy.
 * @returns {string} `directory`, or `outputDir/name-<timestamp>`.
 * @throws {Error} When neither `directory` nor both `outputDir` and `name` are given.
 */
function resolveCycleDirectory(outputDir, name, directory) {
  if (directory) {
    return directory;
  }
  if (!outputDir || !name) {
    throw new Error('saveCycleJson requires either "directory" or both "outputDir" and "name"');
  }
  const folder = `${name}-${Date.now()}`;
  return path2.join(outputDir, folder);
}
2047
/**
 * Writes `cycle-summary.json` into the cycle directory.
 *
 * The summary holds one compact entry per round plus the termination reason,
 * total cost, round count and the first/last average scores (null when there
 * are no rounds).
 *
 * @param {string} cycleDir - Existing directory to write into.
 * @param {object} result - Cycle result with `rounds`, `terminationReason`, `totalCost`.
 */
function saveCycleSummary(cycleDir, result) {
  const summarizeRound = (round) => ({
    round: round.round,
    completedAt: toISOStringIfDate(round.completedAt),
    score: round.report.summary.avgScore,
    scoreDelta: round.scoreDelta,
    cost: round.cost,
    // Only the counts are kept here, not the suggestions themselves.
    suggestionsGenerated: round.suggestionsGenerated.length,
    suggestionsApproved: round.suggestionsApproved.length,
    promptVersionAfter: round.promptVersionAfter
  });
  const summaryPath = path2.join(cycleDir, "cycle-summary.json");
  const roundSummaries = result.rounds.map((round) => summarizeRound(round));
  const firstRound = result.rounds[0];
  const lastRound = result.rounds[result.rounds.length - 1];
  const summary = {
    rounds: roundSummaries,
    terminationReason: result.terminationReason,
    totalCost: result.totalCost,
    roundCount: result.rounds.length,
    initialScore: firstRound?.report.summary.avgScore ?? null,
    finalScore: lastRound?.report.summary.avgScore ?? null
  };
  writeFileSync3(summaryPath, JSON.stringify(summary, null, 2));
}
2068
/**
 * Writes one `round-<n>-report.json` file per round into the cycle directory.
 *
 * @param {string} cycleDir - Existing directory to write into.
 * @param {object[]} rounds - Rounds of the cycle; each file contains the full
 *   report, suggestions, prompt snapshot, cost and score delta of its round.
 */
function saveRoundReports(cycleDir, rounds) {
  for (const round of rounds) {
    const { report } = round;
    const fileName = `round-${round.round}-report.json`;
    const payload = {
      round: round.round,
      completedAt: toISOStringIfDate(round.completedAt),
      // Copy the report so generatedAt can be normalized to an ISO string.
      report: { ...report, generatedAt: toISOStringIfDate(report.generatedAt) },
      suggestionsGenerated: round.suggestionsGenerated,
      suggestionsApproved: round.suggestionsApproved,
      promptSnapshot: round.promptSnapshot,
      cost: round.cost,
      scoreDelta: round.scoreDelta
    };
    writeFileSync3(path2.join(cycleDir, fileName), JSON.stringify(payload, null, 2));
  }
}
2087
+
2088
+ // src/reporter/cycle-console.ts
2089
/**
 * Prints a summary of an improvement cycle to the console.
 *
 * @param {object} result - Cycle result with `rounds`, `terminationReason`, `totalCost`.
 * @param {object} [options] - `verbosity` (default "summary") for per-round
 *   output and `showRounds` (default false) to print each round's report.
 */
function logCycle(result, options = {}) {
  const { verbosity = "summary", showRounds = false } = options;
  console.log("\n\u{1F504} Improvement Cycle Complete");
  const { rounds } = result;
  console.log(` Rounds: ${rounds.length}`);
  console.log(` Termination: ${result.terminationReason}`);
  console.log(` Total Cost: $${result.totalCost.toFixed(4)}`);
  if (rounds.length > 0) {
    const firstScore = rounds[0].report.summary.avgScore;
    const lastScore = rounds[rounds.length - 1].report.summary.avgScore;
    const change = formatScoreDelta(lastScore - firstScore);
    console.log(` Score: ${firstScore.toFixed(1)} -> ${lastScore.toFixed(1)} (${change})`);
  }
  if (!showRounds) {
    return;
  }
  // Per-round reports are printed without pricing information.
  const roundReporter = new ConsoleReporter({ verbosity });
  for (const round of rounds) {
    console.log(`\n-- Round ${round.round} --`);
    roundReporter.log(round.report);
  }
}
2110
+
2111
+ // src/reporter/cycle-markdown.ts
2112
+ import { writeFileSync as writeFileSync4 } from "fs";
2113
/**
 * Renders an improvement-cycle result as a markdown document.
 *
 * Sections: summary table, score progression table, optional per-round
 * reports (via reportToMarkdown) and an optional initial/final prompt comparison.
 *
 * @param {object} result - Cycle result with `rounds`, `terminationReason`,
 *   `totalCost` and `finalPrompt`.
 * @param {object} [options] - `includeRoundDetails` (default true) and
 *   `showPromptEvolution` (default false).
 * @returns {string} Markdown text with lines joined by "\n".
 */
function cycleToMarkdown(result, options = {}) {
  const { includeRoundDetails = true, showPromptEvolution = false } = options;
  const out = [];
  out.push(
    "# Improvement Cycle Report",
    "",
    "## Summary",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    `| Rounds | ${result.rounds.length} |`,
    `| Termination | ${result.terminationReason} |`,
    `| Total Cost | $${result.totalCost.toFixed(4)} |`
  );
  if (result.rounds.length > 0) {
    const initialScore = result.rounds[0].report.summary.avgScore;
    const finalScore = result.rounds[result.rounds.length - 1].report.summary.avgScore;
    out.push(
      `| Initial Score | ${initialScore.toFixed(1)} |`,
      `| Final Score | ${finalScore.toFixed(1)} |`,
      `| Improvement | ${formatScoreDelta(finalScore - initialScore)} |`
    );
  }
  out.push(
    "",
    "## Score Progression",
    "",
    "| Round | Score | Delta | Cost |",
    "|-------|-------|-------|------|"
  );
  for (const round of result.rounds) {
    const delta = formatScoreDelta(round.scoreDelta);
    const score = round.report.summary.avgScore.toFixed(1);
    const cost = round.cost.total.toFixed(4);
    out.push(`| ${round.round} | ${score} | ${delta} | $${cost} |`);
  }
  out.push("");
  if (includeRoundDetails) {
    out.push("## Round Details", "");
    for (const round of result.rounds) {
      out.push(`### Round ${round.round}`, "", reportToMarkdown(round.report), "");
    }
  }
  if (showPromptEvolution && result.rounds.length > 0) {
    out.push(
      "## Prompt Evolution",
      "",
      "### Initial Prompt",
      "",
      "```",
      result.rounds[0].promptSnapshot.userTemplate,
      "```",
      "",
      "### Final Prompt",
      "",
      "```"
    );
    const finalPrompt = result.finalPrompt;
    // Only a string userTemplate can be shown; otherwise a placeholder is printed.
    const hasTemplate = "userTemplate" in finalPrompt && typeof finalPrompt.userTemplate === "string";
    out.push(hasTemplate ? finalPrompt.userTemplate : "[Compiled prompt - template not available]");
    out.push("```");
  }
  return out.join("\n");
}
2176
/**
 * Renders a cycle result with cycleToMarkdown and writes it to a file.
 *
 * @param {object} result - Cycle result to render.
 * @param {string} filePath - Destination file path.
 * @param {object} [options] - Options forwarded to cycleToMarkdown.
 */
function saveCycleMarkdown(result, filePath, options) {
  writeFileSync4(filePath, cycleToMarkdown(result, options));
}
2180
+
2181
+ // src/improver/utils.ts
2182
+ import { compileTemplate } from "@agtlantis/core";
2183
/**
 * Builds a simple diff-style text for a suggestion: a two-line header, a blank
 * line, every current line prefixed with "- ", then every suggested line
 * prefixed with "+ ". No line matching is performed.
 *
 * @param {{type: string, currentValue: string, suggestedValue: string}} suggestion
 * @returns {string} The diff text joined by "\n".
 */
function suggestionDiff(suggestion) {
  const removed = suggestion.currentValue.split("\n").map((line) => `- ${line}`);
  const added = suggestion.suggestedValue.split("\n").map((line) => `+ ${line}`);
  const header = [`--- ${suggestion.type} (current)`, `+++ ${suggestion.type} (suggested)`, ""];
  return [...header, ...removed, ...added].join("\n");
}
2198
/**
 * Builds a multi-line, human-readable preview of a suggestion.
 *
 * @param {object} suggestion - Suggestion with type, priority, reasoning,
 *   expectedImprovement, currentValue and suggestedValue.
 * @returns {string} Preview text joined by "\n".
 */
function suggestionPreview(suggestion) {
  const { type, priority, reasoning, expectedImprovement, currentValue, suggestedValue } = suggestion;
  return [
    `=== Suggestion Preview ===`,
    `Type: ${type}`,
    `Priority: ${priority}`,
    ``,
    `Reasoning: ${reasoning}`,
    ``,
    `Expected Improvement: ${expectedImprovement}`,
    ``,
    `--- Current Value ---`,
    currentValue,
    ``,
    `--- Suggested Value ---`,
    suggestedValue
  ].join("\n");
}
2215
/**
 * Builds a one-line summary: "[PRIORITY] type: reasoning", with the reasoning
 * passed through truncate(…, 60).
 *
 * @param {{priority: string, type: string, reasoning: string}} suggestion
 * @returns {string}
 */
function suggestionSummary(suggestion) {
  const tag = suggestion.priority.toUpperCase();
  const shortReasoning = truncate(suggestion.reasoning, 60);
  return `[${tag}] ${suggestion.type}: ${shortReasoning}`;
}
2219
/**
 * Replaces the first match of `search` in `str` with `replacement`, inserted
 * literally.
 *
 * @param {string} str - Text to modify.
 * @param {string | RegExp} search - Pattern handed to String.prototype.replace.
 * @param {string} replacement - Text inserted verbatim.
 * @returns {string} The modified text.
 */
function safeReplace(str, search, replacement) {
  // A replacer function keeps "$&", "$1", "$$" etc. in the replacement from
  // being interpreted as special replacement patterns.
  const insertLiterally = () => replacement;
  return str.replace(search, insertLiterally);
}
2222
/**
 * Increments one component of an "x.y.z" version string.
 *
 * Each component is read with parseInt, so trailing non-digits in a component
 * are ignored rather than rejected.
 *
 * @param {string} version - Version with exactly three dot-separated parts.
 * @param {"major" | "minor" | "patch"} bump - Component to increment; lower
 *   components are reset to 0.
 * @returns {string} The bumped version.
 * @throws {EvalError} With code SUGGESTION_APPLY_ERROR when the version does not
 *   have three numeric parts, or when `bump` is not a known bump type.
 */
function bumpVersion(version, bump) {
  const parts = version.split(".").map((n) => Number.parseInt(n, 10));
  if (parts.length !== 3 || parts.some((part) => Number.isNaN(part))) {
    throw new EvalError(
      `Invalid version format: "${version}". Expected semver format (x.y.z)`,
      {
        code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
        context: { version, expectedFormat: "x.y.z" }
      }
    );
  }
  const [major, minor, patch] = parts;
  switch (bump) {
    case "major":
      return `${major + 1}.0.0`;
    case "minor":
      return `${major}.${minor + 1}.0`;
    case "patch":
      return `${major}.${minor}.${patch + 1}`;
    default:
      // Without this branch an unknown bump type returned undefined, which
      // callers would then store as the prompt version.
      throw new EvalError(
        `Invalid bump type: "${bump}". Expected "major", "minor" or "patch"`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: { version, bump }
        }
      );
  }
}
2243
/**
 * Applies every approved suggestion to a prompt, in order.
 *
 * @param {object} currentPrompt - Prompt to improve; it is not mutated.
 * @param {object[]} suggestions - Suggestions; only those with a truthy
 *   `approved` flag are applied.
 * @param {object} [options] - `bumpVersion` ("major" | "minor" | "patch") bumps
 *   the version of `currentPrompt` when at least one suggestion was applied.
 * @returns {{prompt: object, appliedCount: number, skipped: object[]}} The
 *   resulting prompt (the original object when nothing was approved), the number
 *   of applied suggestions, and the suggestions that could not be applied with
 *   their reasons.
 */
function applyPromptSuggestions(currentPrompt, suggestions, options) {
  const approved = suggestions.filter((s) => s.approved);
  if (approved.length === 0) {
    return { prompt: currentPrompt, appliedCount: 0, skipped: [] };
  }
  const skipped = [];
  let working = { ...currentPrompt };
  let appliedCount = 0;
  for (const suggestion of approved) {
    const outcome = applySingleSuggestion(working, suggestion);
    if (!outcome.success) {
      skipped.push({ suggestion, reason: outcome.reason });
      continue;
    }
    // Each suggestion is applied on top of the previously applied ones.
    working = outcome.prompt;
    appliedCount += 1;
  }
  if (options?.bumpVersion && appliedCount > 0) {
    const version = bumpVersion(currentPrompt.version, options.bumpVersion);
    working = { ...working, version };
  }
  return { prompt: working, appliedCount, skipped };
}
2276
// Prompt fields that "parameters" suggestions never modify; applySingleSuggestion
// skips these keys when searching for the value to replace.
var AGENT_PROMPT_CORE_FIELDS = ["id", "version", "system", "renderUserPrompt", "userTemplate"];
2283
/**
 * Applies one suggestion to a prompt and reports whether it could be applied.
 *
 * - "system_prompt": replaces the first occurrence of currentValue in `prompt.system`.
 * - "user_prompt": replaces it in `prompt.userTemplate` and recompiles
 *   `renderUserPrompt` from the new template.
 * - "parameters": replaces it in the first non-core string field containing it.
 *
 * @param {object} prompt - Prompt to modify; a modified copy is returned.
 * @param {object} suggestion - Suggestion with type, currentValue, suggestedValue.
 * @returns {{success: true, prompt: object} | {success: false, reason: string}}
 * @throws {EvalError} With code SUGGESTION_APPLY_ERROR when a "user_prompt"
 *   suggestion targets a prompt without a string `userTemplate`.
 */
function applySingleSuggestion(prompt, suggestion) {
  const { type, currentValue, suggestedValue } = suggestion;

  if (type === "system_prompt") {
    if (prompt.system.includes(currentValue)) {
      return {
        success: true,
        prompt: { ...prompt, system: safeReplace(prompt.system, currentValue, suggestedValue) }
      };
    }
    return {
      success: false,
      reason: `currentValue not found in system prompt: "${truncate(currentValue, 50)}"`
    };
  }

  if (type === "user_prompt") {
    const userTemplate = prompt.userTemplate;
    if (typeof userTemplate !== "string") {
      throw new EvalError(
        `Cannot apply user_prompt suggestion: prompt does not have a userTemplate field. The renderUserPrompt is a function and cannot be modified directly.`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: {
            suggestionType: type,
            hasUserTemplate: "userTemplate" in prompt
          }
        }
      );
    }
    if (!userTemplate.includes(currentValue)) {
      return {
        success: false,
        reason: `currentValue not found in userTemplate: "${truncate(currentValue, 50)}"`
      };
    }
    const newTemplate = safeReplace(userTemplate, currentValue, suggestedValue);
    return {
      success: true,
      prompt: {
        ...prompt,
        userTemplate: newTemplate,
        // Keep the render function in sync with the edited template.
        renderUserPrompt: compileTemplate(newTemplate, prompt.id)
      }
    };
  }

  if (type === "parameters") {
    // Only the first matching non-core string field is changed.
    const match = Object.entries(prompt).find(
      ([key, value]) => !AGENT_PROMPT_CORE_FIELDS.includes(key) && typeof value === "string" && value.includes(currentValue)
    );
    if (!match) {
      return {
        success: false,
        reason: `currentValue not found in any parameter field: "${truncate(currentValue, 50)}"`
      };
    }
    const [key, value] = match;
    return {
      success: true,
      prompt: { ...prompt, [key]: safeReplace(value, currentValue, suggestedValue) }
    };
  }

  return {
    success: false,
    reason: `Unknown suggestion type: ${type}`
  };
}
2377
+
2378
+ // src/improver/llm-improver.ts
2379
+ import { Output as Output2 } from "ai";
2380
+ import { z as z2 } from "zod";
2381
+
2382
+ // src/improver/prompts/default.ts
2383
+ var defaultImproverPrompt = { // Default improver prompt; createImprover falls back to it when config.prompt is omitted.
2384
+ id: "default-improver",
2385
+ version: "2.0.0",
2386
+ system: `You are an expert prompt engineer specializing in optimizing AI Agent prompts.
2387
+
2388
+ Your role is to analyze test results and evaluation feedback to propose targeted improvements.
2389
+
2390
+ ## Improvement Principles
2391
+
2392
+ 1. **Focus on Impact**: Prioritize changes that address the lowest-scoring criteria
2393
+ - Target specific failure patterns, not general improvements
2394
+ - One well-crafted change is better than many superficial ones
2395
+
2396
+ 2. **Be Specific and Actionable**: Provide concrete changes, not vague suggestions
2397
+ - Show exact text to add, modify, or remove
2398
+ - Explain the mechanism by which the change will help
2399
+
2400
+ 3. **Consider Trade-offs**: Evaluate side effects of each change
2401
+ - Will this fix break other test cases?
2402
+ - Does it increase prompt length/cost significantly?
2403
+ - Could it introduce new failure modes?
2404
+
2405
+ 4. **Maintain Prompt Quality**: Preserve clarity and structure
2406
+ - Keep prompts readable and maintainable
2407
+ - Avoid over-engineering or excessive constraints
2408
+ - Ensure changes align with the agent's core purpose
2409
+
2410
+ ## Suggestion Priority Levels
2411
+ - **high**: Critical issues causing test failures, should be addressed immediately
2412
+ - **medium**: Issues affecting quality scores, recommended for next iteration
2413
+ - **low**: Minor optimizations, nice-to-have improvements
2414
+
2415
+ ## Response Format
2416
+
2417
+ You MUST respond with valid JSON only. No additional text outside the JSON structure.
2418
+
2419
+ {
2420
+ "suggestions": [
2421
+ {
2422
+ "type": "system_prompt" | "user_prompt" | "parameters",
2423
+ "priority": "high" | "medium" | "low",
2424
+ "currentValue": "The specific text or value being changed",
2425
+ "suggestedValue": "The proposed replacement text or value",
2426
+ "reasoning": "Why this change addresses the identified issue",
2427
+ "expectedImprovement": "Predicted impact on scores and behavior"
2428
+ }
2429
+ ]
2430
+ }`, // End of the system message: role, principles, priority levels and the JSON response shape.
2431
+ renderUserPrompt: (ctx) => { // Builds the user message; reads ctx.agentPrompt, ctx.evaluatedResults and ctx.aggregatedMetrics.
2432
+ const failedDetails = buildFailedCaseDetails(ctx.evaluatedResults); // Markdown section for failed / low-score cases.
2433
+ return `
2434
+ ## Current Agent Prompt
2435
+
2436
+ ### System Prompt
2437
+ \`\`\`
2438
+ ${ctx.agentPrompt.system}
2439
+ \`\`\`
2440
+
2441
+ ## Test Results Summary
2442
+ - Total tests: ${ctx.evaluatedResults.length}
2443
+ - Passed: ${ctx.evaluatedResults.filter((r) => r.passed).length}
2444
+ - Failed: ${ctx.evaluatedResults.filter((r) => !r.passed).length}
2445
+
2446
+ ## Performance Metrics
2447
+ - Average latency: ${ctx.aggregatedMetrics.avgLatencyMs}ms
2448
+ - Total tokens used: ${ctx.aggregatedMetrics.totalTokens}
2449
+
2450
+ ## Failed/Low-Score Cases Details
2451
+ ${failedDetails}
2452
+
2453
+ Based on the above results, please propose specific prompt improvements.`.trim(); // trim() drops the template's leading newline.
2454
+ }
2455
+ };
2456
/**
 * Renders a markdown section for every result that failed or scored below 70.
 *
 * @param {object[]} results - Evaluated results with testCase, output,
 *   overallScore, passed and verdicts.
 * @returns {string} One block per selected result (id, truncated input and
 *   output, per-criterion verdicts), or a fixed "(None - ...)" line when no
 *   result qualifies.
 */
function buildFailedCaseDetails(results) {
  const needsAttention = results.filter((r) => !r.passed || r.overallScore < 70);
  if (needsAttention.length === 0) {
    return "(None - all tests passed with acceptable scores)";
  }
  const sections = needsAttention.map((r) => {
    const heading = `### ${r.testCase.id ?? "unnamed"} (Score: ${r.overallScore})`;
    const input = `**Input:** ${truncate(JSON.stringify(r.testCase.input), 200)}`;
    const output = `**Output:** ${truncate(JSON.stringify(r.output), 200)}`;
    const verdictLines = r.verdicts.map((v) => `- ${v.criterionId}: ${v.score}/100 - ${v.reasoning}`).join("\n");
    // The leading empty entry reproduces the newline that starts each section.
    return ["", heading, input, output, "**Evaluation:**", verdictLines].join("\n");
  });
  return sections.join("\n");
}
2470
+
2471
+ // src/improver/llm-improver.ts
2472
/**
 * Converts an LLM usage record into the eval token-usage shape.
 *
 * @param {{inputTokens?: number, outputTokens?: number, totalTokens?: number}} usage
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 *   The same counts with null/undefined values replaced by 0.
 */
function toEvalTokenUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return {
    inputTokens: inputTokens ?? 0,
    outputTokens: outputTokens ?? 0,
    totalTokens: totalTokens ?? 0
  };
}
2479
// Zod schema for the improver's structured LLM response: a list of suggestions
// with the same fields the default improver prompt asks the model to return.
var ImproverResponseSchema = z2.object({
  suggestions: z2.array(
    z2.object({
      // Which part of the agent prompt the suggestion targets.
      type: z2.enum(["system_prompt", "user_prompt", "parameters"]),
      priority: z2.enum(["high", "medium", "low"]),
      // Text to replace, and the text to replace it with.
      currentValue: z2.string(),
      suggestedValue: z2.string(),
      reasoning: z2.string(),
      expectedImprovement: z2.string()
    })
  )
});
2491
/**
 * Aggregates latency and token usage over a set of results.
 *
 * @param {object[]} results - Results with `metrics.latencyMs` and an optional
 *   `metrics.tokenUsage.totalTokens`.
 * @returns {{avgLatencyMs: number, totalTokens: number}} Rounded mean latency and
 *   summed token count; both 0 for an empty list.
 */
function aggregateMetrics(results) {
  if (results.length === 0) {
    return {
      avgLatencyMs: 0,
      totalTokens: 0
    };
  }
  let totalLatency = 0;
  let totalTokens = 0;
  for (const { metrics } of results) {
    totalLatency += metrics.latencyMs;
    // Token usage is optional elsewhere in this file (calculateComponentCost
    // returns early without it), so a missing record counts as 0 tokens.
    totalTokens += metrics.tokenUsage?.totalTokens ?? 0;
  }
  return {
    avgLatencyMs: Math.round(totalLatency / results.length),
    totalTokens
  };
}
2509
/**
 * Creates an LLM-backed improver.
 *
 * @param {object} config - `provider` (used via `simpleExecution`), optional
 *   `prompt` (defaults to defaultImproverPrompt) and optional `model`, which is
 *   only echoed in the returned metadata.
 * @returns {{improve: Function}} Object whose `improve(agentPrompt, results)`
 *   resolves to `{ suggestions, metadata }`. Every suggestion has `approved` and
 *   `modified` set to undefined; `metadata` is undefined when no usage was reported.
 *   Failures are rethrown via EvalError.from with code LLM_API_ERROR.
 */
function createImprover(config) {
  const { provider, prompt = defaultImproverPrompt, model } = config;

  const improve = async (agentPrompt, results) => {
    const context = {
      agentPrompt,
      evaluatedResults: results,
      aggregatedMetrics: aggregateMetrics(results)
    };
    const messages = [
      { role: "system", content: prompt.system },
      { role: "user", content: prompt.renderUserPrompt(context) }
    ];

    let response;
    let llmUsage;
    try {
      const execution = provider.simpleExecution(async (session) => {
        const { output } = await session.generateText({
          messages,
          output: Output2.object({ schema: ImproverResponseSchema })
        });
        return output;
      });
      const outcome = await execution.result();
      if (outcome.status === "failed") {
        throw outcome.error;
      }
      if (outcome.status !== "succeeded") {
        throw new Error("Execution was canceled");
      }
      response = outcome.value;
      llmUsage = outcome.summary.totalLLMUsage;
    } catch (cause) {
      // Errors thrown above for failed/canceled executions are wrapped here too.
      throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
        promptId: prompt.id,
        promptVersion: prompt.version
      });
    }

    const suggestions = response.suggestions.map((s) => ({
      ...s,
      approved: void 0,
      modified: void 0
    }));
    let metadata;
    if (llmUsage) {
      metadata = { tokenUsage: toEvalTokenUsage2(llmUsage), model };
    }
    return { suggestions, metadata };
  };

  return { improve };
}
2554
+
2555
+ // src/index.ts
2556
+ import { mock, MockProvider } from "@agtlantis/core/testing";
2557
+
2558
+ // src/testing/mock-agent.ts
2559
/**
 * Creates a mock agent for tests.
 *
 * @param {object} [config] - Optional overrides: `name`, `description`,
 *   `response` (returned as `result`), `tokenUsage`, `delay` in ms,
 *   `shouldError` / `errorMessage`, and `executeFn`. When `executeFn` is given
 *   it fully replaces execution, so delay and shouldError are not applied.
 * @returns {{config: object, prompt: object, execute: Function}}
 */
function createMockAgent(config = {}) {
  const {
    name = "MockAgent",
    description = "A mock agent for testing",
    response = {},
    tokenUsage = { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
    delay = 0,
    shouldError = false,
    errorMessage = "Mock agent execution failed",
    executeFn
  } = config;

  const prompt = {
    id: "mock-prompt",
    version: "1.0.0",
    system: "You are a mock agent",
    renderUserPrompt: (input) => JSON.stringify(input)
  };

  const execute = async (input) => {
    if (executeFn) {
      return executeFn(input);
    }
    if (delay > 0) {
      await new Promise((done) => {
        setTimeout(done, delay);
      });
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { result: response, metadata: { tokenUsage } };
  };

  return { config: { name, description }, prompt, execute };
}
2595
/**
 * Creates a mock judge for tests.
 *
 * @param {object} [config] - Optional overrides: `score` (default 80), `passed`
 *   (default true), `verdicts`, `metadata`, `shouldError` / `errorMessage`, and
 *   `evaluateFn`, which fully replaces evaluation when given.
 * @returns {{evaluate: Function}} Judge whose `evaluate(context)` resolves to
 *   `{ verdicts, overallScore, passed, metadata }`.
 */
function createMockJudge(config = {}) {
  const defaultVerdicts = [
    { criterionId: "default", score: 80, reasoning: "Default verdict", passed: true }
  ];
  const {
    score = 80,
    passed = true,
    verdicts = defaultVerdicts,
    metadata,
    shouldError = false,
    errorMessage = "Mock judge evaluation failed",
    evaluateFn
  } = config;

  const evaluate = async (context) => {
    if (evaluateFn) {
      return evaluateFn(context);
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { verdicts, overallScore: score, passed, metadata };
  };

  return { evaluate };
}
2624
/**
 * Creates a mock improver for tests.
 *
 * @param {object} [config] - Optional overrides: `suggestions` (default []),
 *   `shouldError` / `errorMessage`, and `improveFn`, which fully replaces
 *   `improve` when given.
 * @returns {{improve: Function}} Improver whose `improve(agentPrompt, results)`
 *   resolves to `{ suggestions }`.
 */
function createMockImprover(config = {}) {
  const {
    suggestions = [],
    shouldError = false,
    errorMessage = "Mock improver failed",
    improveFn
  } = config;

  const improve = async (agentPrompt, results) => {
    if (improveFn) {
      return improveFn(agentPrompt, results);
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { suggestions };
  };

  return { improve };
}
2643
+
2644
+ // src/index.ts
2645
+ import {
2646
+ compileTemplate as compileTemplate3,
2647
+ createFilePromptRepository
2648
+ } from "@agtlantis/core";
2649
+ import {
2650
+ calculateCostFromUsage as calculateCostFromUsage3,
2651
+ OPENAI_PRICING,
2652
+ GOOGLE_PRICING,
2653
+ ANTHROPIC_PRICING,
2654
+ DEFAULT_PRICING_CONFIG
2655
+ } from "@agtlantis/core";
2656
+
2657
+ // src/cli/config/types.ts
2658
/**
 * Identity helper for declaring an eval config; the config is returned
 * unchanged, with no validation or copying.
 *
 * @param {object} config - Eval configuration object.
 * @returns {object} The same `config` reference.
 */
function defineConfig(config) {
  const declared = config;
  return declared;
}
2661
+
2662
+ // src/cli/config/loader.ts
2663
+ import { existsSync } from "fs";
2664
+ import { resolve, extname } from "path";
2665
+ import { pathToFileURL } from "url";
2666
+ import { bundleRequire } from "bundle-require";
2667
+ import fg from "fast-glob";
2668
+
2669
+ // src/cli/config/schema.ts
2670
+ import { z as z3 } from "zod";
2671
// LLM connection settings; reused by the judge and improver schemas below.
var llmConfigSchema = z3.object({
  provider: z3.enum(["openai", "gemini"], {
    errorMap: () => ({ message: "provider must be 'openai' or 'gemini'" })
  }),
  apiKey: z3.string().optional(),
  defaultModel: z3.string().optional(),
  reasoningEffort: z3.enum(["minimal", "low", "medium", "high"]).optional(),
  defaultResponseFormat: z3
    .object({ type: z3.enum(["json_object", "text"]) })
    .optional()
});

// A single evaluation criterion; weight and validator are optional.
var criterionSchema = z3.object({
  id: z3.string().min(1, "Criterion id is required"),
  name: z3.string().min(1, "Criterion name is required"),
  description: z3.string().min(1, "Criterion description is required"),
  weight: z3.number().positive().optional(),
  validator: z3.function().optional()
});

// Judge settings: at least one criterion, pass threshold within 0-100.
var judgeConfigSchema = z3.object({
  llm: llmConfigSchema.optional(),
  criteria: z3.array(criterionSchema).min(1, "At least one criterion is required"),
  passThreshold: z3.number().min(0).max(100).optional(),
  prompt: z3.any().optional()
});

// Improver settings; the whole section is optional.
var improverConfigSchema = z3
  .object({
    llm: llmConfigSchema.optional(),
    prompt: z3.any().optional()
  })
  .optional();

// Report output settings; the whole section is optional.
var outputConfigSchema = z3
  .object({
    dir: z3.string().optional(),
    filename: z3.string().optional(),
    verbose: z3.boolean().optional()
  })
  .optional();

// Run settings; numeric values must be positive integers.
var runConfigSchema = z3
  .object({
    concurrency: z3.number().int().positive().optional(),
    iterations: z3.number().int().positive().optional(),
    stopOnFirstFailure: z3.boolean().optional()
  })
  .optional();

// Multi-turn termination conditions, distinguished by their literal `type`.
var maxTurnsConditionSchema = z3.object({
  type: z3.literal("maxTurns"),
  count: z3.number().int().positive()
});
var fieldSetConditionSchema = z3.object({
  type: z3.literal("fieldSet"),
  fieldPath: z3.string().min(1)
});
var fieldValueConditionSchema = z3.object({
  type: z3.literal("fieldValue"),
  fieldPath: z3.string().min(1),
  expectedValue: z3.unknown()
});
var customConditionSchema = z3.object({
  type: z3.literal("custom"),
  check: z3.function(),
  description: z3.string().optional()
});
var terminationConditionSchema = z3.union([
  maxTurnsConditionSchema,
  fieldSetConditionSchema,
  fieldValueConditionSchema,
  customConditionSchema
]);

// One follow-up input of a multi-turn test case.
var followUpInputSchema = z3.object({
  input: z3.unknown(),
  description: z3.string().optional(),
  turns: z3.number().optional()
});

// Multi-turn settings; at least one termination condition is required.
var multiTurnConfigSchema = z3.object({
  followUpInputs: z3.array(followUpInputSchema).optional(),
  terminateWhen: z3
    .array(terminationConditionSchema)
    .min(1, "At least one termination condition is required"),
  maxTurns: z3.number().int().positive().optional(),
  onConditionMet: z3.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z3.enum(["pass", "fail"]).optional()
});

// A test case; `multiTurn` is optional.
var testCaseSchema = z3.object({
  id: z3.string().optional(),
  input: z3.unknown(),
  tags: z3.array(z3.string()).optional(),
  description: z3.string().optional(),
  expectedOutput: z3.unknown().optional(),
  files: z3.array(z3.any()).optional(),
  multiTurn: multiTurnConfigSchema.optional()
});

// Structural check of an agent: config, prompt and an execute function.
var agentSchema = z3.object({
  config: z3.object({
    name: z3.string(),
    description: z3.string().optional()
  }),
  prompt: z3.object({
    id: z3.string(),
    version: z3.string(),
    system: z3.string(),
    renderUserPrompt: z3.function()
  }),
  execute: z3.function()
});
2769
/**
 * Top-level eval configuration.
 *
 * Beyond the per-field rules, the refinement requires at least one source of
 * test cases: a non-empty `testCases` array or a non-empty `include` array.
 * A failure of that rule is reported on the `testCases` path.
 */
var evalConfigSchema = z3
  .object({
    name: z3.string().optional(),
    agentDescription: z3.string().optional(),
    agent: agentSchema,
    llm: llmConfigSchema,
    judge: judgeConfigSchema,
    improver: improverConfigSchema,
    testCases: z3.array(testCaseSchema).optional(),
    output: outputConfigSchema,
    run: runConfigSchema,
    include: z3
      .array(z3.string().min(1, "Include pattern cannot be empty"))
      .min(1, "Include array must have at least one pattern")
      .optional(),
    agents: z3.record(z3.string(), agentSchema).optional()
  })
  .refine(
    // A missing array and an empty array both count as "not provided".
    (data) => Boolean(data.testCases?.length) || Boolean(data.include?.length),
    {
      message: "Either testCases or include must be provided. Use testCases for inline TypeScript tests, or include for YAML file discovery.",
      path: ["testCases"]
    }
  );
2792
+
2793
+ // src/cli/config/loader.ts
2794
/**
 * Error raised for configuration problems.
 * Carries a string `code` and an optional `context` value alongside the message.
 */
var ConfigError = class extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Error code stored on `this.code`.
   * @param {*} [context] - Extra details stored on `this.context`.
   */
  constructor(message, code, context) {
    super(message);
    Object.assign(this, { code, context, name: "ConfigError" });
  }
};
2802
/**
 * Resolve the eval files matched by the configured glob patterns.
 *
 * Patterns come from `options.include` when present, otherwise from
 * `config.include`. The matches returned by `fg` are sorted before returning.
 *
 * @param {object} config - Object whose `include` supplies fallback patterns.
 * @param {object} [options] - Optional `include`, `cwd` and `ignore` overrides.
 * @returns {Promise<string[]>} Sorted list of matches (requested as absolute paths).
 * @throws {ConfigError} Code `CONFIG_NO_INCLUDE_PATTERN` when no pattern is available.
 */
async function discoverEvalFiles(config, options = {}) {
  const patterns = options.include ?? config.include;
  const hasPatterns = Boolean(patterns) && patterns.length !== 0;
  if (!hasPatterns) {
    throw new ConfigError(
      `No include patterns specified.

Add an include field to your config:
include: ['evals/**/*.eval.yaml']

Or use the --include CLI option:
npx agent-eval --include "evals/**/*.eval.yaml"`,
      "CONFIG_NO_INCLUDE_PATTERN"
    );
  }

  const globOptions = {
    absolute: true,
    cwd: options.cwd ?? process.cwd(),
    ignore: options.ignore ?? ["**/node_modules/**"],
    onlyFiles: true,
    dot: false,
    followSymbolicLinks: false,
    unique: true,
    suppressErrors: false
  };
  const matches = await fg(patterns, globOptions);
  return matches.sort();
}
2830
+
2831
+ // src/improvement-cycle/types.ts
2832
/** True when the condition's `type` is "targetScore". */
function isTargetScoreCondition(condition) {
  const { type } = condition;
  return type === "targetScore";
}

/** True when the condition's `type` is "maxRounds". */
function isMaxRoundsCondition(condition) {
  const { type } = condition;
  return type === "maxRounds";
}

/** True when the condition's `type` is "noImprovement". */
function isNoImprovementCondition(condition) {
  const { type } = condition;
  return type === "noImprovement";
}

/** True when the condition's `type` is "maxCost". */
function isMaxCostCondition(condition) {
  const { type } = condition;
  return type === "maxCost";
}

/** True when the condition's `type` is "custom". */
function isCustomCycleCondition(condition) {
  const { type } = condition;
  return type === "custom";
}

/** True only when `result.terminated` is exactly `true` (truthy values do not count). */
function isCycleTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
2850
+
2851
+ // src/improvement-cycle/conditions.ts
2852
/**
 * Build a "targetScore" termination condition.
 *
 * @param {number} threshold - Finite score in the inclusive range 0..100.
 * @returns {{type: "targetScore", threshold: number}}
 * @throws {EvalError} INVALID_CONFIG when the threshold is not finite or is out of range.
 */
function targetScore(threshold) {
  const reject = (message) => {
    throw new EvalError(message, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { threshold }
    });
  };
  if (!Number.isFinite(threshold)) {
    reject("threshold must be a finite number");
  }
  const inRange = threshold >= 0 && threshold <= 100;
  if (!inRange) {
    reject("threshold must be between 0 and 100");
  }
  return { type: "targetScore", threshold };
}
2867
/**
 * Build a "maxRounds" termination condition.
 *
 * @param {number} count - Positive integer number of rounds.
 * @returns {{type: "maxRounds", count: number}}
 * @throws {EvalError} INVALID_CONFIG when `count` is not an integer >= 1.
 */
function maxRounds(count) {
  const isPositiveInteger = Number.isInteger(count) && count >= 1;
  if (isPositiveInteger) {
    return { type: "maxRounds", count };
  }
  throw new EvalError("count must be a positive integer", {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { count }
  });
}
2876
/**
 * Build a "noImprovement" termination condition.
 *
 * @param {number} consecutiveRounds - Positive integer count of rounds.
 * @param {number} [minDelta] - Optional non-negative finite number; the key is
 *   only present on the result when a value was supplied.
 * @returns {{type: "noImprovement", consecutiveRounds: number, minDelta?: number}}
 * @throws {EvalError} INVALID_CONFIG when either argument is invalid.
 */
function noImprovement(consecutiveRounds, minDelta) {
  const roundsValid = Number.isInteger(consecutiveRounds) && consecutiveRounds >= 1;
  if (!roundsValid) {
    throw new EvalError("consecutiveRounds must be a positive integer", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { consecutiveRounds }
    });
  }

  const hasMinDelta = minDelta !== void 0;
  if (hasMinDelta && !(Number.isFinite(minDelta) && minDelta >= 0)) {
    throw new EvalError("minDelta must be a non-negative finite number", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { minDelta }
    });
  }

  const condition = { type: "noImprovement", consecutiveRounds };
  if (hasMinDelta) {
    condition.minDelta = minDelta;
  }
  return condition;
}
2895
/**
 * Build a "maxCost" termination condition.
 *
 * @param {number} maxUSD - Positive finite limit.
 * @returns {{type: "maxCost", maxUSD: number}}
 * @throws {EvalError} INVALID_CONFIG when `maxUSD` is not a finite number > 0.
 */
function maxCost(maxUSD) {
  const isPositiveFinite = Number.isFinite(maxUSD) && maxUSD > 0;
  if (isPositiveFinite) {
    return { type: "maxCost", maxUSD };
  }
  throw new EvalError("maxUSD must be a positive finite number", {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { maxUSD }
  });
}
2904
/**
 * Build a "custom" termination condition around a caller-supplied check.
 *
 * @param {Function} check - Stored as-is on the condition.
 * @param {string} [description] - Only added to the result when not `undefined`.
 * @returns {{type: "custom", check: Function, description?: string}}
 */
function customCondition(check, description) {
  const condition = { type: "custom", check };
  if (description !== void 0) {
    condition.description = description;
  }
  return condition;
}
2911
/**
 * Combine cycle conditions into one "custom" condition built with `createAndCheck`.
 * With no conditions the resulting check always returns `false`.
 *
 * @param {...object} conditions - Conditions to combine.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function and2(...conditions) {
  const check = conditions.length === 0
    ? () => false
    : createAndCheck(conditions, checkCycleCondition);
  return {
    type: "custom",
    check,
    description: formatCompositeDescription("and", conditions)
  };
}
2925
/**
 * Combine cycle conditions into one "custom" condition built with `createOrCheck`.
 * With no conditions the resulting check always returns `false`.
 *
 * @param {...object} conditions - Conditions to combine.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function or2(...conditions) {
  const check = conditions.length === 0
    ? () => false
    : createOrCheck(conditions, checkCycleCondition);
  return {
    type: "custom",
    check,
    description: formatCompositeDescription("or", conditions)
  };
}
2939
/**
 * Wrap a cycle condition in a "custom" condition built with `createNotCheck`.
 * The description is `not(<type of the wrapped condition>)`.
 *
 * @param {object} condition - Condition to wrap.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function not2(condition) {
  const check = createNotCheck(condition, checkCycleCondition);
  const description = `not(${condition.type})`;
  return { type: "custom", check, description };
}
2946
/**
 * Evaluate a "targetScore" condition: terminates once `ctx.latestScore`
 * is greater than or equal to `condition.threshold`.
 *
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}}
 */
function checkTargetScore(condition, ctx) {
  const { threshold } = condition;
  const { latestScore } = ctx;
  if (latestScore < threshold || !(latestScore >= threshold)) {
    return {
      terminated: false,
      reason: `Score ${latestScore} below target ${threshold}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Target score ${threshold} reached (current: ${latestScore})`
  };
}
2959
/**
 * Evaluate a "maxRounds" condition: terminates once `ctx.currentRound`
 * is greater than or equal to `condition.count`.
 *
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}}
 */
function checkMaxRounds(condition, ctx) {
  const limitReached = ctx.currentRound >= condition.count;
  return limitReached
    ? {
        terminated: true,
        matchedCondition: condition,
        reason: `Maximum rounds reached (${condition.count})`
      }
    : {
        terminated: false,
        reason: `Round ${ctx.currentRound} of ${condition.count}`
      };
}
2972
/**
 * Evaluate a "noImprovement" condition.
 *
 * Walks `ctx.history` from the newest round backwards and counts the rounds
 * whose `scoreDelta` is at most `minDelta` (default 0). Counting stops at the
 * first round with a `null` delta or a delta above `minDelta`. Terminates when
 * the count reaches `consecutiveRounds`.
 *
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}}
 */
function checkNoImprovement(condition, ctx) {
  const { consecutiveRounds, minDelta = 0 } = condition;
  const { history } = ctx;

  let stalled = 0;
  while (stalled < history.length) {
    const delta = history[history.length - 1 - stalled].scoreDelta;
    if (delta === null || !(delta <= minDelta)) {
      break;
    }
    stalled += 1;
  }

  if (stalled >= consecutiveRounds) {
    const suffix = stalled === 1 ? "" : "s";
    return {
      terminated: true,
      matchedCondition: condition,
      reason: `No improvement for ${stalled} consecutive round${suffix}`
    };
  }
  const roundWord = stalled === 1 ? "round" : "rounds";
  return {
    terminated: false,
    reason: `${stalled} ${roundWord} without improvement (need ${consecutiveRounds})`
  };
}
2998
/**
 * Evaluate a "maxCost" condition: terminates once `ctx.totalCost`
 * is greater than or equal to `condition.maxUSD`. Amounts in the reason
 * string are formatted with two decimals.
 *
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}}
 */
function checkMaxCost(condition, ctx) {
  const overLimit = ctx.totalCost >= condition.maxUSD;
  if (!overLimit) {
    return {
      terminated: false,
      reason: `Cost $${ctx.totalCost.toFixed(2)} under limit $${condition.maxUSD.toFixed(2)}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Cost limit exceeded ($${ctx.totalCost.toFixed(2)} >= $${condition.maxUSD.toFixed(2)})`
  };
}
3011
/**
 * Evaluate a "custom" condition by awaiting `condition.check(ctx)`.
 *
 * A truthy result terminates. If the check throws or rejects, the error is not
 * propagated: the result is non-terminating and the reason carries the message.
 *
 * @returns {Promise<{terminated: boolean, matchedCondition?: object, reason: string}>}
 */
async function checkCustomCondition(condition, ctx) {
  const label = condition.description ?? "Custom condition";

  let shouldTerminate;
  try {
    shouldTerminate = await condition.check(ctx);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return {
      terminated: false,
      reason: `${label} check failed: ${message}`
    };
  }

  return shouldTerminate
    ? { terminated: true, matchedCondition: condition, reason: `${label} met` }
    : { terminated: false, reason: `${label} not met` };
}
3034
/**
 * Evaluate a single cycle termination condition by dispatching on its type.
 *
 * The type guards are tried in a fixed order (targetScore, maxRounds,
 * noImprovement, maxCost, custom); the first match decides which checker runs.
 *
 * @param {object} condition - Condition to evaluate.
 * @param {object} context - Cycle context passed through to the checker.
 * @returns {Promise<{terminated: boolean, matchedCondition?: object, reason: string}>}
 * @throws {EvalError} UNKNOWN_ERROR when no guard recognises the condition.
 */
async function checkCycleCondition(condition, context) {
  const dispatch = [
    [isTargetScoreCondition, checkTargetScore],
    [isMaxRoundsCondition, checkMaxRounds],
    [isNoImprovementCondition, checkNoImprovement],
    [isMaxCostCondition, checkMaxCost],
    [isCustomCycleCondition, checkCustomCondition]
  ];
  for (const [matches, evaluate] of dispatch) {
    if (matches(condition)) {
      return evaluate(condition, context);
    }
  }

  const _exhaustive = condition;
  throw new EvalError(`Unknown condition type: ${JSON.stringify(_exhaustive)}`, {
    code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
    context: { condition: _exhaustive }
  });
}
3056
/**
 * Evaluate termination conditions in order and return the first terminating result.
 *
 * Conditions are checked one at a time; later conditions are not evaluated
 * once one has terminated.
 *
 * @param {object[]} conditions - Conditions to evaluate.
 * @param {object} context - Cycle context passed to each check.
 * @returns {Promise<{terminated: boolean, matchedCondition?: object, reason: string}>}
 */
async function checkCycleTermination(conditions, context) {
  const notTerminated = (reason) => ({ terminated: false, reason });

  if (conditions.length === 0) {
    return notTerminated("No termination conditions specified");
  }
  for (const candidate of conditions) {
    const outcome = await checkCycleCondition(candidate, context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return notTerminated("No termination conditions met");
}
3074
+
3075
+ // src/improvement-cycle/runner.ts
3076
+ import { calculateCostFromUsage as calculateCostFromUsage2 } from "@agtlantis/core";
3077
+
3078
+ // src/improvement-cycle/history.ts
3079
+ import crypto from "crypto";
3080
+ import { existsSync as existsSync2 } from "fs";
3081
+ import { mkdir, readFile, writeFile as writeFile2 } from "fs/promises";
3082
+ import { dirname } from "path";
3083
+ import { compileTemplate as compileTemplate2 } from "@agtlantis/core";
3084
/**
 * Default storage backend for history files, backed by the Node `fs` APIs
 * imported at the top of this module. Text is read and written as UTF-8.
 */
var defaultHistoryStorage = {
  readFile(path3) {
    return readFile(path3, "utf-8");
  },
  writeFile(path3, content) {
    return writeFile2(path3, content, "utf-8");
  },
  exists: existsSync2,
  mkdir(path3, options) {
    return mkdir(path3, options);
  }
};
3090
/** True when `prompt.userTemplate` is a string. */
function hasUserTemplate(prompt) {
  const { userTemplate } = prompt;
  return typeof userTemplate === "string";
}

/**
 * Convert a prompt into its serializable form.
 *
 * `renderUserPrompt` is dropped. Any property other than `id`, `version`,
 * `system`, `userTemplate` and `renderUserPrompt` is collected under
 * `customFields`, which is omitted when there are no such properties.
 *
 * @param {object} prompt - Prompt carrying a string `userTemplate`.
 * @returns {{id: *, version: *, system: *, userTemplate: string, customFields?: object}}
 * @throws {EvalError} PROMPT_INVALID_FORMAT when `userTemplate` is not a string.
 */
function serializePrompt(prompt) {
  if (!hasUserTemplate(prompt)) {
    throw new EvalError("Cannot serialize prompt: userTemplate field is required", {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId: prompt.id }
    });
  }

  const { id, version, system, userTemplate, renderUserPrompt, ...extras } = prompt;
  const serialized = { id, version, system, userTemplate };
  if (Object.keys(extras).length > 0) {
    serialized.customFields = extras;
  }
  return serialized;
}
3111
/**
 * Check that a deserialized prompt has the expected shape.
 *
 * `id`, `version`, `system` and `userTemplate` must be strings (checked in
 * that order; the first failure is reported) and `renderUserPrompt` must be
 * a function.
 *
 * @param {object} obj - Candidate prompt.
 * @param {*} promptId - Identifier placed in the error context.
 * @throws {EvalError} PROMPT_INVALID_FORMAT on the first violation.
 */
function validateDeserializedPrompt(obj, promptId) {
  const stringFields = ["id", "version", "system", "userTemplate"];
  const badField = stringFields.find((name) => typeof obj[name] !== "string");
  if (badField !== void 0) {
    throw new EvalError(`Invalid deserialized prompt: ${badField} must be a string`, {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId, field: badField, actual: typeof obj[badField] }
    });
  }

  const renderType = typeof obj.renderUserPrompt;
  if (renderType !== "function") {
    throw new EvalError("Invalid deserialized prompt: renderUserPrompt must be a function", {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId, actual: renderType }
    });
  }
}
3128
/**
 * Rebuild a prompt from its serialized form.
 *
 * Compiles `userTemplate` into `renderUserPrompt` via `compileTemplate2`,
 * merges `customFields` back onto the prompt (the core fields win on a name
 * clash because they are spread last) and validates the result.
 *
 * @param {object} serialized - Output of `serializePrompt`.
 * @returns {object} Prompt with `renderUserPrompt` restored.
 * @throws {EvalError} TEMPLATE_COMPILE_ERROR when the template fails to compile.
 */
function deserializePrompt(serialized) {
  const { id, version, system, userTemplate, customFields } = serialized;

  const compile = () => {
    try {
      return compileTemplate2(userTemplate, id);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new EvalError(`Failed to compile userTemplate: ${message}`, {
        code: "TEMPLATE_COMPILE_ERROR" /* TEMPLATE_COMPILE_ERROR */,
        context: { promptId: id, userTemplate }
      });
    }
  };
  const renderUserPrompt = compile();

  const prompt = {
    ...customFields,
    id,
    version,
    system,
    userTemplate,
    renderUserPrompt
  };
  validateDeserializedPrompt(prompt, id);
  return prompt;
}
3151
/**
 * Flatten a round result into the shape stored in the history file.
 *
 * Only the summary numbers of the report are kept, and `completedAt` is
 * converted to an ISO-8601 string.
 *
 * @param {object} result - Round result with a `report.summary` and a Date `completedAt`.
 * @returns {object} Serializable round record.
 */
function serializeRoundResult(result) {
  const {
    round,
    completedAt,
    report,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  } = result;
  const { avgScore, passed, failed, totalTests } = report.summary;

  return {
    round,
    completedAt: completedAt.toISOString(),
    avgScore,
    passed,
    failed,
    totalTests,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  };
}
3168
/**
 * Validate the shape of a parsed history object.
 *
 * Checks, in order: the value is a non-null object, `schemaVersion` is
 * exactly "1.1.0", every required field is present, and the fields whose
 * shape the rest of this module relies on have a usable type (`rounds` is an
 * array, `totalCost` is a finite number, both prompts are non-null objects).
 * The type checks stop a malformed file from passing validation and then
 * failing later with an unrelated TypeError.
 *
 * @param {*} data - Parsed JSON content of a history file.
 * @returns {void}
 * @throws {EvalError} SCHEMA_VALIDATION_ERROR on the first violation found.
 */
function validateHistorySchema(data) {
  if (typeof data !== "object" || data === null) {
    throw new EvalError("Invalid history: not an object", {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */
    });
  }
  const h = data;
  if (h.schemaVersion !== "1.1.0") {
    throw new EvalError(`Unsupported schema version: ${String(h.schemaVersion)}`, {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { schemaVersion: h.schemaVersion }
    });
  }
  const requiredFields = [
    "sessionId",
    "startedAt",
    "initialPrompt",
    "currentPrompt",
    "rounds",
    "totalCost"
  ];
  for (const field of requiredFields) {
    if (!(field in h)) {
      throw new EvalError(`Invalid history: missing field "${field}"`, {
        code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
        context: { missingField: field }
      });
    }
  }

  // Presence alone is not enough: later code indexes into these values.
  const isObject = (value) => typeof value === "object" && value !== null;
  const typeRules = [
    ["initialPrompt", isObject, "an object"],
    ["currentPrompt", isObject, "an object"],
    ["rounds", Array.isArray, "an array"],
    ["totalCost", Number.isFinite, "a finite number"]
  ];
  for (const [field, isValid, expected] of typeRules) {
    if (!isValid(h[field])) {
      throw new EvalError(`Invalid history: field "${field}" must be ${expected}`, {
        code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
        context: { field, actual: h[field] === null ? "null" : typeof h[field] }
      });
    }
  }
}
3198
/**
 * In-memory improvement session wrapping a history object, with optional
 * persistence to `config.path`.
 *
 * Recognised config keys: `path`, `autoSave`, `storage`, `onAutoSaveError`.
 * Saves are serialized: each save starts only after the previous one settled.
 */
var ImprovementSessionImpl = class {
  _history;
  _isUpdating = false;
  _savePromise = Promise.resolve();
  config;

  /**
   * @param {object} history - Initial history object.
   * @param {object} [config] - Session options; `autoSave` defaults to `false`.
   */
  constructor(history, config = {}) {
    this._history = history;
    // The default is applied last so an explicit `autoSave: undefined`
    // cannot overwrite it.
    this.config = {
      ...config,
      autoSave: config.autoSave ?? false
    };
  }

  /** Identifier stored in the history. */
  get sessionId() {
    return this._history.sessionId;
  }

  /** Current history object (replaced, not mutated, on every update). */
  get history() {
    return this._history;
  }

  /** True when a `path` is configured, i.e. `save()` can write somewhere. */
  get canSave() {
    return this.config.path !== void 0;
  }

  /**
   * Append a round and record the prompt that results from it.
   * Triggers a background save when `autoSave` is on and a path is set.
   *
   * @param {object} roundResult - Round result; `cost.total` is added to `totalCost`.
   * @param {object} updatedPrompt - Serialized prompt stored as `currentPrompt`.
   * @throws {EvalError} CONCURRENT_MODIFICATION if an update is in progress,
   *   INVALID_CONFIG if the session is already completed.
   */
  addRound(roundResult, updatedPrompt) {
    if (this._isUpdating) {
      throw new EvalError("Session is being updated", {
        code: "CONCURRENT_MODIFICATION" /* CONCURRENT_MODIFICATION */,
        context: { sessionId: this.sessionId }
      });
    }
    if (this._history.completedAt) {
      throw new EvalError("Cannot add round to completed session", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    this._isUpdating = true;
    try {
      const serializedRound = serializeRoundResult(roundResult);
      this._history = {
        ...this._history,
        currentPrompt: updatedPrompt,
        rounds: [...this._history.rounds, serializedRound],
        totalCost: this._history.totalCost + roundResult.cost.total
      };
      if (this.config.autoSave && this.canSave) {
        this.save().catch((err) => this.handleAutoSaveError(err));
      }
    } finally {
      this._isUpdating = false;
    }
  }

  /**
   * Mark the session as completed with the given reason.
   * Triggers a background save when `autoSave` is on and a path is set.
   *
   * @param {string} terminationReason - Stored on the history.
   */
  complete(terminationReason) {
    this._history = {
      ...this._history,
      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
      terminationReason
    };
    if (this.config.autoSave && this.canSave) {
      this.save().catch((err) => this.handleAutoSaveError(err));
    }
  }

  /**
   * Report a failed background save to `config.onAutoSaveError`, or to
   * `console.error` when no handler is configured.
   */
  handleAutoSaveError(error) {
    const err = error instanceof Error ? error : new Error(String(error));
    if (this.config.onAutoSaveError) {
      this.config.onAutoSaveError(err);
    } else {
      console.error("Auto-save failed:", err);
    }
  }

  /**
   * Persist the history to `config.path`.
   *
   * The write is queued behind any save still in flight and uses whatever the
   * history is when the write actually starts. A failure of an earlier save
   * does not prevent this one from running.
   *
   * @returns {Promise<void>} Settles with the outcome of this save.
   * @throws {EvalError} INVALID_CONFIG when no path is configured.
   */
  async save() {
    if (!this.config.path) {
      throw new EvalError("Cannot save: no path configured", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    // Swallow the previous outcome before chaining: its caller has already
    // been given that rejection, and chaining directly with `.then` would
    // skip this write and every later one after a single failure.
    const attempt = this._savePromise
      .catch(() => void 0)
      .then(async () => {
        await saveHistory(this._history, this.config.path, this.config.storage);
      });
    this._savePromise = attempt;
    return attempt;
  }

  /** Wait for the most recently queued save; rejects if that save failed. */
  async flush() {
    return this._savePromise;
  }
};
3282
/**
 * Start a new improvement session for the given prompt.
 *
 * The serialized prompt is stored as both the initial and the current prompt;
 * the history starts with no rounds, zero cost and a fresh random session id.
 *
 * @param {object} initialPrompt - Prompt accepted by `serializePrompt`.
 * @param {object} [config] - Passed through to the session constructor.
 * @returns {object} New `ImprovementSessionImpl`.
 */
function createSession(initialPrompt, config) {
  const snapshot = serializePrompt(initialPrompt);
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
  return new ImprovementSessionImpl(
    {
      schemaVersion: "1.1.0",
      sessionId: crypto.randomUUID(),
      startedAt,
      initialPrompt: snapshot,
      currentPrompt: snapshot,
      rounds: [],
      totalCost: 0
    },
    config
  );
}
3295
/**
 * Load a saved history and reopen it as a live session.
 *
 * `completedAt` and `terminationReason` are reset to `undefined` so that new
 * rounds can be added, and the session is bound to `path3` for later saves.
 *
 * @param {string} path3 - History file location.
 * @param {object} [config] - Session options; `config.storage` is also used for loading.
 * @returns {Promise<object>} Reopened `ImprovementSessionImpl`.
 */
async function resumeSession(path3, config) {
  const loaded = await loadHistory(path3, config?.storage);
  const sessionConfig = { ...config, path: path3 };
  return new ImprovementSessionImpl(
    { ...loaded, completedAt: undefined, terminationReason: undefined },
    sessionConfig
  );
}
3304
/**
 * Write a history object to `path3` as pretty-printed JSON (2-space indent).
 *
 * The parent directory is created recursively when it is not ".", "/" or
 * empty and `storage.exists` reports it missing.
 *
 * @param {object} history - History to persist.
 * @param {string} path3 - Destination file.
 * @param {object} [storage] - Storage backend; defaults to `defaultHistoryStorage`.
 * @throws {EvalError} FILE_WRITE_ERROR wrapping any failure that is not already an EvalError.
 */
async function saveHistory(history, path3, storage = defaultHistoryStorage) {
  try {
    const parentDir = dirname(path3);
    const isTrivialDir = !parentDir || parentDir === "." || parentDir === "/";
    if (!isTrivialDir && !storage.exists(parentDir)) {
      await storage.mkdir(parentDir, { recursive: true });
    }
    const payload = JSON.stringify(history, null, 2);
    await storage.writeFile(path3, payload);
  } catch (error) {
    if (!(error instanceof EvalError)) {
      throw EvalError.from(error, "FILE_WRITE_ERROR" /* FILE_WRITE_ERROR */, { path: path3 });
    }
    throw error;
  }
}
3316
/**
 * Read, parse and validate a history file.
 *
 * @param {string} path3 - History file location.
 * @param {object} [storage] - Storage backend; defaults to `defaultHistoryStorage`.
 * @returns {Promise<object>} Parsed history that passed `validateHistorySchema`.
 * @throws {EvalError} FILE_READ_ERROR when the file is missing or any
 *   non-EvalError failure occurs (including invalid JSON); EvalErrors raised
 *   during validation are rethrown unchanged.
 */
async function loadHistory(path3, storage = defaultHistoryStorage) {
  try {
    const fileExists = storage.exists(path3);
    if (!fileExists) {
      throw new EvalError(`History file not found: ${path3}`, {
        code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
        context: { path: path3 }
      });
    }
    const parsed = JSON.parse(await storage.readFile(path3));
    validateHistorySchema(parsed);
    return parsed;
  } catch (error) {
    if (!(error instanceof EvalError)) {
      throw EvalError.from(error, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { path: path3 });
    }
    throw error;
  }
}
3333
+
3334
+ // src/improvement-cycle/runner.ts
3335
/**
 * Build the mutable state object used by the improvement cycle.
 *
 * When an existing session is supplied, the round counter, previous scores and
 * total cost are seeded from its history; `completedRounds` always starts empty.
 *
 * @param {object} initialPrompt - Prompt the cycle starts from.
 * @param {object} [existingSession] - Session being resumed, if any.
 * @returns {{currentPrompt: object, currentRound: number, previousScores: number[], totalCost: number, completedRounds: object[]}}
 */
function initializeCycleState(initialPrompt, existingSession) {
  const priorRounds = existingSession ? existingSession.history.rounds : [];
  const priorCost = existingSession ? existingSession.history.totalCost : 0;
  return {
    currentPrompt: initialPrompt,
    currentRound: priorRounds.length,
    previousScores: priorRounds.map((entry) => entry.avgScore),
    totalCost: priorCost,
    completedRounds: []
  };
}
3345
/**
 * Difference between the current score and the most recent previous score.
 *
 * @param {number} currentScore - Score of the round just run.
 * @param {number[]} previousScores - Earlier scores, oldest first.
 * @returns {number|null} `null` when there is no previous score.
 */
function calculateScoreDelta(currentScore, previousScores) {
  const lastIndex = previousScores.length - 1;
  return previousScores.length === 0 ? null : currentScore - previousScores[lastIndex];
}
3352
/**
 * Build the context object handed to termination checks.
 *
 * `previousScores` is a copy of the state's array; `history` is the state's
 * `completedRounds` array itself (same reference).
 *
 * @param {object} state - Cycle state.
 * @param {number} currentScore - Score of the round just run.
 * @returns {{currentRound: number, latestScore: number, previousScores: number[], totalCost: number, history: object[]}}
 */
function buildCycleContext(state, currentScore) {
  const { currentRound, previousScores, totalCost, completedRounds } = state;
  return {
    currentRound,
    latestScore: currentScore,
    previousScores: Array.from(previousScores),
    totalCost,
    history: completedRounds
  };
}
3361
/**
 * Assemble the result record for the round that just ran.
 *
 * `suggestionsApproved` starts empty and `promptVersionAfter` starts as the
 * current prompt version; `completedAt` is the time of this call.
 *
 * @param {object} state - Cycle state (reads `currentRound` and `currentPrompt.version`).
 * @param {object} report - Evaluation report for the round.
 * @param {object} improveResult - Improver output (reads `suggestions`).
 * @param {object} cost - Cost breakdown for the round.
 * @param {number|null} scoreDelta - Change against the previous score.
 * @param {object} promptSnapshot - Serialized prompt used for the round.
 * @returns {object} Round result.
 */
function createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot) {
  const { currentRound, currentPrompt } = state;
  const completedAt = /* @__PURE__ */ new Date();
  return {
    round: currentRound,
    report,
    completedAt,
    suggestionsGenerated: improveResult.suggestions,
    // Will be updated after decision
    suggestionsApproved: [],
    promptSnapshot,
    promptVersionAfter: currentPrompt.version,
    cost,
    scoreDelta
  };
}
3375
/**
 * Finish the cycle: record the last round, complete the session, wait for
 * pending saves and build the final cycle result.
 *
 * The termination reason is `conditionReason` when a condition fired,
 * otherwise the fixed text "User requested stop".
 *
 * @returns {Promise<{rounds: object[], finalPrompt: object, terminationReason: string, totalCost: number, history: object}>}
 */
async function handleStopDecision(state, session, roundResult, promptSnapshot, terminatedByCondition, conditionReason) {
  let terminationReason = "User requested stop";
  if (terminatedByCondition) {
    terminationReason = conditionReason;
  }

  session.addRound(roundResult, promptSnapshot);
  session.complete(terminationReason);
  await session.flush();
  state.completedRounds.push(roundResult);

  const { history } = session;
  return {
    rounds: state.completedRounds,
    finalPrompt: deserializePrompt(history.currentPrompt),
    terminationReason,
    totalCost: state.totalCost,
    history
  };
}
3389
/**
 * Roll the cycle state back to the prompt used in an earlier completed round.
 *
 * Restores `state.currentPrompt` from that round's prompt snapshot and
 * truncates `state.previousScores` to the first `rollbackToRound - 1` entries.
 *
 * @param {object} state - Cycle state; mutated in place.
 * @param {number} rollbackToRound - 1-based position in `state.completedRounds`.
 * @returns {void}
 * @throws {Error} When `rollbackToRound` does not identify a completed round.
 */
function handleRollbackDecision(state, rollbackToRound) {
  const targetRoundIndex = rollbackToRound - 1;
  // A fractional or NaN index passes a plain range comparison but does not
  // address any array element, so it is rejected here with the same error.
  const isValidIndex =
    Number.isInteger(targetRoundIndex) &&
    targetRoundIndex >= 0 &&
    targetRoundIndex < state.completedRounds.length;
  if (!isValidIndex) {
    throw new Error(`Cannot rollback to round ${rollbackToRound}: round not found`);
  }
  const targetRound = state.completedRounds[targetRoundIndex];
  state.currentPrompt = deserializePrompt(targetRound.promptSnapshot);
  state.previousScores = state.previousScores.slice(0, targetRoundIndex);
}
3398
/**
 * Apply a "continue" decision: record which suggestions were approved, apply
 * them to the current prompt (if any), and store the round in the session and
 * in `state.completedRounds`.
 *
 * @param {object} state - Cycle state; `currentPrompt` is replaced when suggestions are applied.
 * @param {object} session - Session receiving the round.
 * @param {object} roundResult - Result of the round just run (not mutated).
 * @param {object[]} approvedSuggestions - Suggestions to apply.
 * @param {string} versionBump - Passed to `applyPromptSuggestions` as `bumpVersion`.
 * @returns {object} Copy of the round result with approvals and final prompt version.
 */
function handleContinueDecision(state, session, roundResult, approvedSuggestions, versionBump) {
  const finalRound = { ...roundResult, suggestionsApproved: approvedSuggestions };

  const hasApprovals = approvedSuggestions.length > 0;
  if (hasApprovals) {
    const { prompt } = applyPromptSuggestions(state.currentPrompt, approvedSuggestions, {
      bumpVersion: versionBump
    });
    state.currentPrompt = prompt;
    finalRound.promptVersionAfter = state.currentPrompt.version;
  }

  session.addRound(finalRound, serializePrompt(state.currentPrompt));
  state.completedRounds.push(finalRound);
  return finalRound;
}
3415
/**
 * Run one evaluation round with the current prompt.
 *
 * Creates the agent for the current prompt, runs the eval suite over the test
 * cases, asks the improver (when configured) for suggestions and computes the
 * round cost. Without an improver the suggestions list is empty.
 *
 * @param {object} config - Cycle config (`createAgent`, `judge`, `improver`, `testCases`, `options`).
 * @param {object} state - Cycle state (reads `currentPrompt`).
 * @param {object} [pricingConfig] - Passed to `calculateRoundCost`.
 * @returns {Promise<{report: object, improveResult: object, cost: object}>}
 */
async function executeRound(config, state, pricingConfig) {
  const { createAgent, judge, improver, testCases: testCases2, options = {} } = config;

  const suite = createEvalSuite({
    agent: createAgent(state.currentPrompt),
    judge,
    agentDescription: options.agentDescription
  });
  const report = await suite.run(testCases2, options.runOptions);

  let improveResult;
  if (improver) {
    improveResult = await improver.improve(state.currentPrompt, report.results);
  } else {
    improveResult = { suggestions: [] };
  }

  const cost = calculateRoundCost(report, improveResult, pricingConfig);
  return { report, improveResult, cost };
}
3428
/**
 * Guess the provider from a model name prefix.
 *
 * "claude-" maps to anthropic; "gpt-", "o1" and "o3" map to openai;
 * "gemini-" maps to google. A missing or unrecognised model name falls back
 * to anthropic.
 *
 * @param {string} [model] - Model identifier.
 * @returns {"anthropic"|"openai"|"google"}
 */
function detectProviderForImprover(model) {
  const fallback = "anthropic";
  if (!model) {
    return fallback;
  }
  const prefixRules = [
    ["claude-", "anthropic"],
    ["gpt-", "openai"],
    ["o1", "openai"],
    ["o3", "openai"],
    ["gemini-", "google"]
  ];
  const match = prefixRules.find(([prefix]) => model.startsWith(prefix));
  return match ? match[1] : fallback;
}
3435
/**
 * Copy the three token counters from a usage record into a fresh object,
 * dropping any other properties.
 *
 * @param {object} usage - Record with `inputTokens`, `outputTokens`, `totalTokens`.
 * @returns {{inputTokens: *, outputTokens: *, totalTokens: *}}
 */
function toLanguageModelUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
3442
/**
 * Cost of the improver call for one round.
 *
 * Returns 0 when the improve result carries no `metadata.tokenUsage`.
 * Otherwise the provider is derived from the model name (the literal
 * "unknown" is used when no model is recorded) and the total reported by
 * `calculateCostFromUsage2` is returned.
 *
 * @param {object} improveResult - Improver output.
 * @param {object} [pricingConfig] - Optional config with `providerPricing`.
 * @returns {number}
 */
function calculateImproverCost(improveResult, pricingConfig) {
  const { metadata } = improveResult;
  const usage = metadata?.tokenUsage;
  if (!usage) {
    return 0;
  }
  const model = metadata?.model ?? "unknown";
  const provider = detectProviderForImprover(model);
  const providerPricing = pricingConfig?.providerPricing?.[provider];
  const { total } = calculateCostFromUsage2(
    toLanguageModelUsage2(usage),
    model,
    provider,
    providerPricing
  );
  return total;
}
3456
/**
 * Cost breakdown for one round: agent, judge, improver and their total.
 *
 * Report costs are only computed when a pricing config is supplied; otherwise
 * agent and judge are 0. A missing component in the report costs counts as 0.
 *
 * @param {object} report - Evaluation report.
 * @param {object} improveResult - Improver output.
 * @param {object} [pricingConfig] - Pricing configuration.
 * @returns {{agent: number, judge: number, improver: number, total: number}}
 */
function calculateRoundCost(report, improveResult, pricingConfig) {
  let reportCosts = { total: 0, byComponent: { agent: 0, judge: 0 } };
  if (pricingConfig) {
    reportCosts = calculateReportCosts(report, pricingConfig);
  }
  const improver = calculateImproverCost(improveResult, pricingConfig);
  const { agent, judge } = reportCosts.byComponent;
  return {
    agent: agent ?? 0,
    judge: judge ?? 0,
    improver,
    total: reportCosts.total + improver
  };
}
3466
/**
 * Interactive improvement cycle, exposed as an async generator.
 *
 * Each iteration runs one round, then yields
 * `{ roundResult, pendingSuggestions, terminationCheck, context }` and waits
 * for the caller's decision passed to `next()`:
 * - no decision or `action: "stop"`: the round is recorded, the session is
 *   completed and the final cycle result is returned;
 * - `action: "rollback"` with a `rollbackToRound`: state is rolled back and
 *   the loop runs again without recording the round;
 * - anything else: treated as "continue" with `approvedSuggestions`
 *   (default empty).
 *
 * If anything inside the loop throws, the session is completed with an
 * "Error: ..." reason and the error is rethrown.
 *
 * @param {object} config - Cycle config (`initialPrompt`, `terminateWhen`, `options`, plus the fields read by `executeRound`).
 */
async function* runImprovementCycle(config) {
  const { initialPrompt, terminateWhen = [], options = {} } = config;
  const {
    pricingConfig,
    versionBump = "patch",
    history: historyConfig,
    session: existingSession
  } = options;

  const newSessionConfig = historyConfig
    ? { path: historyConfig.path, autoSave: historyConfig.autoSave }
    : void 0;
  const session = existingSession ?? createSession(initialPrompt, newSessionConfig);
  const state = initializeCycleState(initialPrompt, existingSession);

  try {
    for (;;) {
      state.currentRound += 1;

      const { report, improveResult, cost } = await executeRound(config, state, pricingConfig);
      state.totalCost += cost.total;

      const currentScore = report.summary.avgScore;
      const scoreDelta = calculateScoreDelta(currentScore, state.previousScores);
      const promptSnapshot = serializePrompt(state.currentPrompt);
      const roundResult = createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot);

      // The context is built before the current score is appended, so its
      // previousScores does not include this round.
      const context = buildCycleContext(state, currentScore);
      state.previousScores.push(currentScore);

      const terminationCheck = await checkCycleTermination(terminateWhen, context);
      const pendingSuggestions = improveResult.suggestions.map((suggestion) => ({
        ...suggestion,
        approved: false
      }));

      const decision = yield {
        roundResult,
        pendingSuggestions,
        terminationCheck,
        context
      };

      const wantsStop = !decision || decision.action === "stop";
      if (wantsStop) {
        return await handleStopDecision(
          state,
          session,
          roundResult,
          promptSnapshot,
          terminationCheck.terminated,
          terminationCheck.reason
        );
      }

      const wantsRollback = decision.action === "rollback" && decision.rollbackToRound !== void 0;
      if (wantsRollback) {
        handleRollbackDecision(state, decision.rollbackToRound);
        continue;
      }

      handleContinueDecision(
        state,
        session,
        roundResult,
        decision.approvedSuggestions ?? [],
        versionBump
      );
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    session.complete(`Error: ${errorMessage}`);
    throw error;
  }
}
3525
/**
 * Drive `runImprovementCycle` to completion without user interaction.
 *
 * After each round: stop if the termination check fired, otherwise continue
 * with every pending suggestion marked as approved.
 *
 * @param {object} config - Same config accepted by `runImprovementCycle`.
 * @returns {Promise<object>} The value returned by the cycle generator.
 */
async function runImprovementCycleAuto(config) {
  const cycle = runImprovementCycle(config);
  let step = await cycle.next();
  while (!step.done) {
    const { terminationCheck, pendingSuggestions } = step.value;
    const decision = terminationCheck.terminated
      ? { action: "stop" }
      : {
          action: "continue",
          approvedSuggestions: pendingSuggestions.map((suggestion) => ({
            ...suggestion,
            approved: true
          }))
        };
    step = await cycle.next(decision);
  }
  return step.value;
}
3544
+
3545
+ // src/core/test-case-collection.ts
3546
+ var TestCaseCollection = class _TestCaseCollection {
3547
+ cases;
3548
+ constructor(cases) {
3549
+ this.cases = Object.freeze([...cases]);
3550
+ }
3551
+ // ============================================================================
3552
+ // Static Factories
3553
+ // ============================================================================
3554
+ /**
3555
+ * Create a collection from an array of test cases.
3556
+ */
3557
+ static from(cases) {
3558
+ return new _TestCaseCollection(cases);
3559
+ }
3560
+ /**
3561
+ * Create an empty collection.
3562
+ */
3563
+ static empty() {
3564
+ return new _TestCaseCollection([]);
3565
+ }
3566
+ // ============================================================================
3567
+ // Properties
3568
+ // ============================================================================
3569
+ /**
3570
+ * Number of test cases in the collection.
3571
+ */
3572
+ get length() {
3573
+ return this.cases.length;
3574
+ }
3575
+ /**
3576
+ * Whether the collection is empty.
3577
+ */
3578
+ get isEmpty() {
3579
+ return this.cases.length === 0;
3580
+ }
3581
+ // ============================================================================
3582
+ // Selection Methods (return new TestCaseCollection - chainable)
3583
+ // ============================================================================
3584
+ /**
3585
+ * Returns all test cases.
3586
+ * Returns `this` since the collection is immutable (frozen array).
3587
+ * Useful as explicit starting point in chains.
3588
+ */
3589
+ all() {
3590
+ return this;
3591
+ }
3592
+ /**
3593
+ * Returns the first N test cases (default: 1).
3594
+ * Useful for cost-controlled testing during development.
3595
+ */
3596
+ minimal(count = 1) {
3597
+ return this.first(count);
3598
+ }
3599
+ /**
3600
+ * Returns the first N test cases.
3601
+ */
3602
+ first(count) {
3603
+ if (count <= 0) {
3604
+ return _TestCaseCollection.empty();
3605
+ }
3606
+ return new _TestCaseCollection([...this.cases.slice(0, count)]);
3607
+ }
3608
+ /**
3609
+ * Returns the last N test cases (default: 1).
3610
+ * Preserves original order (earlier cases first).
3611
+ */
3612
+ last(count = 1) {
3613
+ if (count <= 0) {
3614
+ return _TestCaseCollection.empty();
3615
+ }
3616
+ const startIndex = Math.max(0, this.cases.length - count);
3617
+ return new _TestCaseCollection([...this.cases.slice(startIndex)]);
3618
+ }
3619
+ /**
3620
+ * Returns N random test cases.
3621
+ *
3622
+ * @param count - Number of cases to select
3623
+ * @param options - Optional seed for reproducibility
3624
+ *
3625
+ * @example
3626
+ * ```typescript
3627
+ * // Different each time
3628
+ * collection.random(5)
3629
+ *
3630
+ * // Same result with same seed
3631
+ * collection.random(5, { seed: 42 })
3632
+ * ```
3633
+ */
3634
+ random(count, options) {
3635
+ if (count <= 0 || this.cases.length === 0) {
3636
+ return _TestCaseCollection.empty();
3637
+ }
3638
+ const actualCount = Math.min(count, this.cases.length);
3639
+ const indices = [...Array(this.cases.length).keys()];
3640
+ const rng = options?.seed !== void 0 ? createSeededRng(options.seed) : Math.random;
3641
+ for (let i = indices.length - 1; i > 0; i--) {
3642
+ const j = Math.floor(rng() * (i + 1));
3643
+ [indices[i], indices[j]] = [indices[j], indices[i]];
3644
+ }
3645
+ const selected = indices.slice(0, actualCount).map((i) => this.cases[i]);
3646
+ return new _TestCaseCollection([...selected]);
3647
+ }
3648
+ /**
3649
+ * Filter test cases by predicate.
3650
+ */
3651
+ filter(predicate) {
3652
+ return new _TestCaseCollection([...this.cases.filter(predicate)]);
3653
+ }
3654
+ /**
3655
+ * Find test case by ID.
3656
+ * Returns collection with single case or empty collection.
3657
+ */
3658
+ byId(id) {
3659
+ const found = this.cases.find((tc) => tc.id === id);
3660
+ return found ? new _TestCaseCollection([found]) : _TestCaseCollection.empty();
3661
+ }
3662
+ /**
3663
+ * Find test cases by multiple IDs.
3664
+ * Preserves order of provided IDs (first occurrence).
3665
+ * Skips non-existent IDs. Duplicate IDs in input are deduplicated.
3666
+ *
3667
+ * @example
3668
+ * ```typescript
3669
+ * collection.byIds(['a', 'b', 'a']) // returns [case-a, case-b] (deduplicated)
3670
+ * collection.byIds(['b', 'a']) // returns [case-b, case-a] (order preserved)
3671
+ * ```
3672
+ */
3673
+ byIds(ids) {
3674
+ const uniqueIds = [...new Set(ids)];
3675
+ const idSet = new Set(uniqueIds);
3676
+ const idToCase = /* @__PURE__ */ new Map();
3677
+ for (const tc of this.cases) {
3678
+ if (tc.id && idSet.has(tc.id) && !idToCase.has(tc.id)) {
3679
+ idToCase.set(tc.id, tc);
3680
+ }
3681
+ }
3682
+ const result = uniqueIds.map((id) => idToCase.get(id)).filter((tc) => tc !== void 0);
3683
+ return new _TestCaseCollection(result);
3684
+ }
3685
+ // ============================================================================
3686
+ // Access Methods
3687
+ // ============================================================================
3688
+ /**
3689
+ * Get test case by ID.
3690
+ * Returns undefined if not found.
3691
+ */
3692
+ get(id) {
3693
+ return this.cases.find((tc) => tc.id === id);
3694
+ }
3695
+ /**
3696
+ * Get test case by index.
3697
+ * Supports negative indices (e.g., -1 for last item).
3698
+ * Returns undefined if index is out of bounds.
3699
+ */
3700
+ at(index) {
3701
+ const normalizedIndex = index < 0 ? this.cases.length + index : index;
3702
+ if (normalizedIndex < 0 || normalizedIndex >= this.cases.length) {
3703
+ return void 0;
3704
+ }
3705
+ return this.cases[normalizedIndex];
3706
+ }
3707
+ // ============================================================================
3708
+ // Conversion Methods
3709
+ // ============================================================================
3710
+ /**
3711
+ * Convert to array.
3712
+ * Returns a mutable copy of the internal array.
3713
+ */
3714
+ toArray() {
3715
+ return [...this.cases];
3716
+ }
3717
+ // ============================================================================
3718
+ // Iterator Support
3719
+ // ============================================================================
3720
+ /**
3721
+ * Iterator support for for...of loops and spread operator.
3722
+ */
3723
+ [Symbol.iterator]() {
3724
+ return this.cases[Symbol.iterator]();
3725
+ }
3726
+ };
3727
// Module-wide counter behind the auto-generated `test-N` IDs.
var autoIdCounter = 0;
/**
 * Builds a single test case object.
 *
 * When `id` is `undefined` or `null`, an ID of the form `test-N` is generated
 * from a module-wide counter; the counter only advances when an ID is
 * actually generated. Any other `id` (including an empty string) is used as-is.
 *
 * @param input - Input payload for the test case
 * @param id - Optional explicit ID
 * @returns An object of the shape `{ id, input }`.
 */
function testCase(input, id) {
  let resolvedId = id;
  if (resolvedId === undefined || resolvedId === null) {
    autoIdCounter += 1;
    resolvedId = `test-${autoIdCounter}`;
  }
  return { id: resolvedId, input };
}
3734
/**
 * Builds one test case per input, with IDs of the form `<prefix>-<index>`
 * where the index is the input's zero-based position.
 *
 * @param inputs - Array of input payloads
 * @param prefix - ID prefix (default: "case")
 * @returns An array of `{ id, input }` objects, in input order.
 */
function testCases(inputs, prefix = "case") {
  const toCase = (input, index) => {
    const id = `${prefix}-${index}`;
    return { id, input };
  };
  return inputs.map(toCase);
}
3740
/**
 * Creates a deterministic pseudo-random number generator.
 *
 * Each call of the returned function advances a 32-bit integer state by a
 * fixed increment, scrambles it with xor-shifts and 32-bit multiplications
 * (`Math.imul`), and scales the unsigned 32-bit result by 2^32.
 *
 * @param seed - Initial state; the same seed produces the same sequence
 * @returns A zero-argument function returning a number in [0, 1).
 */
function createSeededRng(seed) {
  const STATE_INCREMENT = 1831565813;
  const UINT32_RANGE = 4294967296; // 2^32
  let state = seed;
  return () => {
    // `| 0` wraps the running state back into a signed 32-bit integer.
    state = (state + STATE_INCREMENT) | 0;
    const folded = state ^ (state >>> 15);
    let t = Math.imul(folded, state | 1);
    const mixed = Math.imul(t ^ (t >>> 7), t | 61);
    t ^= t + mixed;
    // `>>> 0` reinterprets the result as unsigned before scaling.
    const unsigned = (t ^ (t >>> 14)) >>> 0;
    return unsigned / UINT32_RANGE;
  };
}
3749
+ export {
3750
+ ANTHROPIC_PRICING,
3751
+ CompositeReporter,
3752
+ ConsoleReporter,
3753
+ DEFAULT_PRICING_CONFIG,
3754
+ EvalError,
3755
+ EvalErrorCode,
3756
+ GOOGLE_PRICING,
3757
+ JsonReporter,
3758
+ MarkdownReporter,
3759
+ MockProvider,
3760
+ OPENAI_PRICING,
3761
+ TestCaseCollection,
3762
+ accuracy,
3763
+ addCostsToResults,
3764
+ afterTurns,
3765
+ aggregateIterationResults,
3766
+ aiUser,
3767
+ and,
3768
+ applyPromptSuggestions,
3769
+ bumpVersion,
3770
+ calculateAvgPassRate,
3771
+ calculateAvgStdDev,
3772
+ calculateCostFromUsage3 as calculateCostFromUsage,
3773
+ calculateIterationStats,
3774
+ calculateMultiTurnIterationStats,
3775
+ calculateReportCosts,
3776
+ calculateResultCost,
3777
+ checkCondition,
3778
+ checkCycleCondition,
3779
+ checkCycleTermination,
3780
+ checkTermination,
3781
+ compareReports,
3782
+ compileTemplate3 as compileTemplate,
3783
+ consistency,
3784
+ createCompositeReporter,
3785
+ createConsoleReporter,
3786
+ createDefaultReporter,
3787
+ createEvalSuite,
3788
+ createFilePromptRepository,
3789
+ createImprover,
3790
+ createJsonReporter,
3791
+ createJudge,
3792
+ createMarkdownReporter,
3793
+ createMockAgent,
3794
+ createMockImprover,
3795
+ createMockJudge,
3796
+ createReportRunner,
3797
+ createSession,
3798
+ customCondition,
3799
+ and2 as cycleAnd,
3800
+ not2 as cycleNot,
3801
+ or2 as cycleOr,
3802
+ cycleToMarkdown,
3803
+ defaultHistoryStorage,
3804
+ defineConfig,
3805
+ deserializePrompt,
3806
+ discoverEvalFiles,
3807
+ executeMultiTurnTestCase,
3808
+ executeTestCase,
3809
+ fieldEquals,
3810
+ fieldIsSet,
3811
+ getFieldValue,
3812
+ getFileSourceDisplayInfo,
3813
+ getFileSourcesDisplayInfo2 as getFileSourcesDisplayInfo,
3814
+ inferMediaType,
3815
+ isCustomCondition,
3816
+ isCustomCycleCondition,
3817
+ isCycleTerminated,
3818
+ isFieldSetCondition,
3819
+ isFieldValueCondition,
3820
+ isFileSource,
3821
+ isFileSourceBase64,
3822
+ isFileSourceData,
3823
+ isFileSourcePath,
3824
+ isFileSourceUrl,
3825
+ isIteratedResult,
3826
+ isMaxCostCondition,
3827
+ isMaxRoundsCondition,
3828
+ isMaxTurnsCondition,
3829
+ isMultiTurnResult,
3830
+ isMultiTurnTestCase,
3831
+ isNoImprovementCondition,
3832
+ isSingleTurnResult,
3833
+ isTargetScoreCondition,
3834
+ isTerminated,
3835
+ loadHistory,
3836
+ logCycle,
3837
+ maxCost,
3838
+ maxRounds,
3839
+ mock,
3840
+ naturalLanguage,
3841
+ noImprovement,
3842
+ not,
3843
+ or,
3844
+ relevance,
3845
+ reportToMarkdown,
3846
+ resolveFileSource,
3847
+ resolveFileSourcesInInput3 as resolveFileSourcesInInput,
3848
+ resumeSession,
3849
+ runImprovementCycle,
3850
+ runImprovementCycleAuto,
3851
+ runWithConcurrency,
3852
+ saveCycleJson,
3853
+ saveCycleMarkdown,
3854
+ saveHistory,
3855
+ saveReportMarkdown,
3856
+ scanForFileSources,
3857
+ schema,
3858
+ selectRepresentativeResult,
3859
+ serializePrompt,
3860
+ suggestionDiff,
3861
+ suggestionPreview,
3862
+ suggestionSummary,
3863
+ targetScore,
3864
+ testCase,
3865
+ testCases,
3866
+ toEvalAgent
3867
+ };
3868
+ //# sourceMappingURL=index.js.map