@agtlantis/eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js ADDED
@@ -0,0 +1,4709 @@
1
+ #!/usr/bin/env node
2
+
3
+ // ../../../../node_modules/.pnpm/cac@6.7.14/node_modules/cac/dist/index.mjs
4
+ import { EventEmitter } from "events";
5
/**
 * Coerces a value into an array.
 *
 * @param {*} any - Value to normalize.
 * @returns {Array} `[]` for null/undefined, the same array instance when
 *   `any` is already an array, otherwise a one-element array wrapping it.
 */
function toArr(any) {
  if (any == null) {
    return [];
  }
  if (Array.isArray(any)) {
    return any;
  }
  return [any];
}
8
/**
 * Coerces a raw flag value and stores it on the output object.
 *
 * Coercion rules, in priority order:
 *  - key listed in `opts.string`: null/undefined/`true` become "", anything
 *    else is stringified;
 *  - value already a boolean: kept as is;
 *  - key listed in `opts.boolean`: "false"/"true" map to the booleans; any
 *    other value is pushed onto `out._` (as a number when numeric) and the
 *    flag becomes the value's truthiness;
 *  - otherwise: numeric-looking values become numbers, the rest stay as is.
 *
 * A key that already holds a value is turned into (or extended as) an array.
 *
 * @param {object} out - Parse result being built; must have an `_` array.
 * @param {string} key - Option name.
 * @param {*} val - Raw value for the option.
 * @param {{string: string[], boolean: string[]}} opts - Type hints.
 */
function toVal(out, key, val, opts) {
  // `n * 0 === 0` is false for NaN and +/-Infinity, so only finite numbers convert.
  const numericOrRaw = (raw) => {
    const n = +raw;
    return n * 0 === 0 ? n : raw;
  };

  let next;
  if (opts.string.includes(key)) {
    next = val == null || val === true ? "" : String(val);
  } else if (typeof val === "boolean") {
    next = val;
  } else if (opts.boolean.includes(key)) {
    if (val === "false") {
      next = false;
    } else if (val === "true") {
      next = true;
    } else {
      // The token after a boolean flag is a positional argument, not the flag's value.
      out._.push(numericOrRaw(val));
      next = !!val;
    }
  } else {
    next = numericOrRaw(val);
  }

  const previous = out[key];
  if (previous == null) {
    out[key] = next;
  } else if (Array.isArray(previous)) {
    out[key] = previous.concat(next);
  } else {
    out[key] = [previous, next];
  }
}
12
/**
 * Minimal argv parser (bundled `mri`).
 *
 * @param {string[]} [args] - Raw argument tokens.
 * @param {object} [opts] - Parser options. Reads `alias`, `string`,
 *   `boolean`, `default` and `unknown`. NOTE: `opts` is mutated in place
 *   (aliases are expanded, `string`/`boolean` are normalized to arrays).
 * @returns {object} Parsed flags keyed by name, with positional arguments in
 *   `_`. When `opts.unknown` is set and an unknown flag is met, the return
 *   value of `opts.unknown(flag)` is returned instead.
 */
function mri2(args, opts) {
  args = args || [];
  opts = opts || {};

  const out = { _: [] };
  const len = args.length;
  const hasAlias = opts.alias !== void 0;
  const strict = opts.unknown !== void 0;
  const hasDefaults = opts.default !== void 0;

  let i = 0;
  let j = 0;
  let idx = 0;
  let key;
  let list;
  let name;

  opts.alias = opts.alias || {};
  opts.string = toArr(opts.string);
  opts.boolean = toArr(opts.boolean);

  if (hasAlias) {
    // Make the alias table symmetric: every name maps to all of its siblings.
    for (key in opts.alias) {
      list = opts.alias[key] = toArr(opts.alias[key]);
      for (i = 0; i < list.length; i++) {
        const siblings = list.concat(key);
        opts.alias[list[i]] = siblings;
        siblings.splice(i, 1);
      }
    }
  }

  // Aliases of boolean/string options inherit the same type hint.
  for (i = opts.boolean.length; i-- > 0; ) {
    list = opts.alias[opts.boolean[i]] || [];
    for (j = list.length; j-- > 0; ) {
      opts.boolean.push(list[j]);
    }
  }
  for (i = opts.string.length; i-- > 0; ) {
    list = opts.alias[opts.string[i]] || [];
    for (j = list.length; j-- > 0; ) {
      opts.string.push(list[j]);
    }
  }

  if (hasDefaults) {
    // A default's runtime type ("string"/"boolean") doubles as a type hint.
    for (key in opts.default) {
      name = typeof opts.default[key];
      list = opts.alias[key] = opts.alias[key] || [];
      if (opts[name] !== void 0) {
        opts[name].push(key);
        for (i = 0; i < list.length; i++) {
          opts[name].push(list[i]);
        }
      }
    }
  }

  const known = strict ? Object.keys(opts.alias) : [];

  for (i = 0; i < len; i++) {
    const arg = args[i];

    if (arg === "--") {
      // Everything after a bare `--` is positional.
      out._ = out._.concat(args.slice(++i));
      break;
    }

    // Count leading dashes (char code 45).
    for (j = 0; j < arg.length; j++) {
      if (arg.charCodeAt(j) !== 45) {
        break;
      }
    }

    if (j === 0) {
      out._.push(arg);
    } else if (arg.substring(j, j + 3) === "no-") {
      name = arg.substring(j + 3);
      if (strict && !known.includes(name)) {
        return opts.unknown(arg);
      }
      out[name] = false;
    } else {
      // Find `=` (char code 61), if any, to split name from inline value.
      for (idx = j + 1; idx < arg.length; idx++) {
        if (arg.charCodeAt(idx) === 61) {
          break;
        }
      }
      name = arg.substring(j, idx);
      // No inline value: consume the next token unless it is missing or looks like a flag.
      const val = arg.substring(++idx) || (i + 1 === len || ("" + args[i + 1]).charCodeAt(0) === 45 || args[++i]);
      // `--name` is one flag; with any other dash count each character is its own flag.
      const names = j === 2 ? [name] : name;
      for (idx = 0; idx < names.length; idx++) {
        name = names[idx];
        if (strict && !known.includes(name)) {
          return opts.unknown("-".repeat(j) + name);
        }
        // Only the last flag of a group receives the value; earlier ones are `true`.
        toVal(out, name, idx + 1 < names.length || val, opts);
      }
    }
  }

  if (hasDefaults) {
    for (key in opts.default) {
      if (out[key] === void 0) {
        out[key] = opts.default[key];
      }
    }
  }

  if (hasAlias) {
    // Mirror every parsed value onto its aliases (drains the alias lists).
    for (key in out) {
      list = opts.alias[key] || [];
      while (list.length > 0) {
        out[list.shift()] = out[key];
      }
    }
  }

  return out;
}
100
/**
 * Drops everything from the first `<` or `[` onward (when at least one
 * character follows it) and trims the remainder.
 *
 * @param {string} v - Raw command/option spec, e.g. "build <entry>".
 * @returns {string} The spec without its bracketed tail.
 */
const removeBrackets = (v) => {
  const withoutBracketTail = v.replace(/[<[].+/, "");
  return withoutBracketTail.trim();
};
101
/**
 * Extracts the `<required>` and `[optional]` argument placeholders from a
 * command spec. All angled placeholders come first, followed by all square
 * ones; a leading `...` marks a placeholder as variadic and is stripped.
 *
 * @param {string} v - Raw command spec.
 * @returns {{required: boolean, value: string, variadic: boolean}[]}
 */
const findAllBrackets = (v) => {
  const describe = ([token, inner]) => {
    const variadic = inner.startsWith("...");
    return {
      required: token.startsWith("<"),
      value: variadic ? inner.slice(3) : inner,
      variadic,
    };
  };

  const collect = (pattern) => {
    const found = [];
    for (let match = pattern.exec(v); match; match = pattern.exec(v)) {
      found.push(describe(match));
    }
    return found;
  };

  // Fresh regex literals per call: global regexes carry `lastIndex` state.
  return [...collect(/<([^>]+)>/g), ...collect(/\[([^\]]+)\]/g)];
};
128
/**
 * Builds the `alias` / `boolean` configuration passed to the argv parser
 * from a list of option definitions.
 *
 * A negated boolean option is NOT registered as boolean when another option
 * sharing one of its names has a boolean `required` property (i.e. takes a
 * value), so the value-taking variant is still parsed with its value.
 *
 * @param {Array<{names: string[], isBoolean?: boolean, negated?: boolean, required?: boolean}>} options
 * @returns {{alias: Object<string, string[]>, boolean: string[]}}
 */
const getMriOptions = (options) => {
  const alias = {};
  const boolean = [];

  options.forEach((option, index) => {
    const primary = option.names[0];
    if (option.names.length > 1) {
      alias[primary] = option.names.slice(1);
    }
    if (!option.isBoolean) {
      return;
    }
    if (option.negated) {
      const hasValueTakingTwin = options.some((other, otherIndex) => {
        if (otherIndex === index) {
          return false;
        }
        const sharesName = other.names.some((name) => option.names.includes(name));
        return sharesName && typeof other.required === "boolean";
      });
      if (hasValueTakingTwin) {
        return;
      }
    }
    boolean.push(primary);
  });

  return { alias, boolean };
};
149
/**
 * Returns the longest item of an array (by `.length`).
 *
 * Implemented as a linear scan so the caller's array is never reordered,
 * and ties resolve deterministically to the first longest item. (The
 * previous sort-based version mutated its input and used a comparator
 * that never returned 0.)
 *
 * @param {Array<string>} arr - Items to compare.
 * @returns {string|undefined} The longest item, or `undefined` for an empty array.
 */
const findLongest = (arr) => {
  let longest;
  let seen = false;
  for (const item of arr) {
    if (!seen || item.length > longest.length) {
      longest = item;
      seen = true;
    }
  }
  return longest;
};
154
/**
 * Pads a string on the right with spaces up to `length` characters.
 * Strings already at least that long are returned unchanged.
 *
 * @param {string} str - Text to pad.
 * @param {number} length - Target width.
 * @returns {string}
 */
const padRight = (str, length) => str.padEnd(length, " ");
157
/**
 * Converts `lower-lower` boundaries to camelCase ("foo-bar" -> "fooBar").
 * Only a hyphen directly between two lowercase ASCII letters is rewritten,
 * and matches do not overlap, so "a-b-c" becomes "aB-c".
 *
 * @param {string} input
 * @returns {string}
 */
const camelcase = (input) =>
  input.replace(/([a-z])-([a-z])/g, (_match, before, after) => `${before}${after.toUpperCase()}`);
162
/**
 * Assigns `val` at the nested path `keys` inside `obj`, creating
 * intermediate containers as needed (an array when the next key is a
 * non-negative number, otherwise a plain object).
 *
 * Hardened against prototype pollution, since the keys originate from
 * command-line flags:
 *  - any path containing `__proto__` is ignored entirely;
 *  - only OWN properties are followed while descending, so inherited
 *    members such as `constructor` are never walked into. A fresh own
 *    container is created in their place.
 *
 * @param {object} obj - Object to mutate.
 * @param {string[]} keys - Path segments, outermost first.
 * @param {*} val - Value stored at the final segment.
 */
const setDotProp = (obj, keys, val) => {
  if (keys.includes("__proto__")) {
    return;
  }
  const hasOwn = (target, key) => Object.prototype.hasOwnProperty.call(target, key);
  const lastIndex = keys.length - 1;
  let target = obj;

  for (let i = 0; i <= lastIndex; ++i) {
    const key = keys[i];
    if (i === lastIndex) {
      target[key] = val;
      return;
    }
    const existing = hasOwn(target, key) ? target[key] : void 0;
    if (existing != null) {
      target = existing;
      continue;
    }
    const nextKey = keys[i + 1];
    const needsObject = nextKey.includes(".") || !(+nextKey > -1);
    target[key] = needsObject ? {} : [];
    target = target[key];
  }
};
172
/**
 * Applies array-type transforms to parsed options in place.
 *
 * For every entry of `transforms` flagged `shouldTransform`, the matching
 * value in `obj` is wrapped into an array (arrays are flattened one level
 * by `concat`) and, when a `transformFunction` is supplied, each element is
 * mapped through it.
 *
 * @param {object} obj - Parsed options (mutated).
 * @param {Object<string, {shouldTransform?: boolean, transformFunction?: Function}>} transforms
 */
const setByType = (obj, transforms) => {
  Object.keys(transforms).forEach((key) => {
    const { shouldTransform, transformFunction } = transforms[key];
    if (!shouldTransform) {
      return;
    }
    const asArray = [].concat(obj[key]);
    obj[key] = typeof transformFunction === "function" ? asArray.map(transformFunction) : asArray;
  });
};
183
/**
 * Returns the final path segment of `input`, treating both `/` and `\` as
 * separators.
 *
 * @param {string} input - A file path.
 * @returns {string} The trailing segment, or "" when the input is empty or
 *   ends with a separator.
 */
const getFileName = (input) => {
  const match = /([^\\\/]+)$/.exec(input);
  if (!match) {
    return "";
  }
  return match[1];
};
187
/**
 * Camel-cases only the first dot-separated segment of an option name;
 * the remaining segments are kept verbatim.
 *
 * @param {string} name - Option name, possibly dotted.
 * @returns {string}
 */
const camelcaseOptionName = (name) => {
  const [head, ...rest] = name.split(".");
  return [camelcase(head), ...rest].join(".");
};
192
/**
 * Error type thrown for CLI usage problems (unknown options, missing
 * arguments or option values).
 */
class CACError extends Error {
  /**
   * @param {string} message - Human-readable description.
   */
  constructor(message) {
    super(message);
    this.name = this.constructor.name;
    const canCapture = typeof Error.captureStackTrace === "function";
    if (canCapture) {
      // Starts the recorded trace at the throw site rather than this constructor.
      Error.captureStackTrace(this, this.constructor);
      return;
    }
    this.stack = new Error(message).stack;
  }
}
203
/**
 * A single CLI option parsed from a raw spec such as
 * "-o, --out-dir <dir>" or "--no-color".
 *
 * Derived fields:
 *  - `names`: every flag name without dashes or a `no-` prefix, camel-cased
 *    and sorted by ascending length;
 *  - `name`: the last entry of `names`;
 *  - `negated`: true when any flag started with `no-` (the default then
 *    becomes `true` unless one was configured);
 *  - `required`: true for `<value>`, false for `[value]`; when neither
 *    bracket is present `isBoolean` is set instead.
 */
class Option {
  /**
   * @param {string} rawName - Raw option spec.
   * @param {string} description - Help text.
   * @param {object} [config] - Option config (shallow-copied); `default` is read here.
   */
  constructor(rawName, description, config) {
    this.rawName = rawName;
    this.description = description;
    this.config = Object.assign({}, config);

    // ".*" (dot-nested wildcard) is not part of the option's name.
    const spec = rawName.replace(/\.\*/g, "");
    this.negated = false;

    const toName = (part) => {
      const bare = part.trim().replace(/^-{1,2}/, "");
      if (!bare.startsWith("no-")) {
        return camelcaseOptionName(bare);
      }
      this.negated = true;
      return camelcaseOptionName(bare.replace(/^no-/, ""));
    };

    this.names = removeBrackets(spec)
      .split(",")
      .map((part) => toName(part))
      .sort((a, b) => a.length > b.length ? 1 : -1);
    this.name = this.names[this.names.length - 1];

    if (this.negated && this.config.default == null) {
      this.config.default = true;
    }

    if (spec.includes("<")) {
      this.required = true;
    } else if (spec.includes("[")) {
      this.required = false;
    } else {
      this.isBoolean = true;
    }
  }
}
231
// Default argv used when `parse()` is called without arguments.
const processArgs = process.argv;
// "<platform>-<arch> node-<version>", printed next to the CLI version.
const platformInfo = [`${process.platform}-${process.arch}`, `node-${process.version}`].join(" ");
233
/**
 * One CLI command: its name, positional-argument placeholders, options,
 * examples and action callback. Configuration methods return `this` so
 * they can be chained.
 */
class Command {
  /**
   * @param {string} rawName - Command spec, e.g. "build <entry> [...files]".
   * @param {string} description - Help text.
   * @param {object} [config] - Reads/writes `allowUnknownOptions` and `ignoreOptionDefaultValue`.
   * @param {object} cli2 - Owning CLI instance.
   */
  constructor(rawName, description, config = {}, cli2) {
    this.rawName = rawName;
    this.description = description;
    this.config = config;
    this.cli = cli2;
    this.options = [];
    this.aliasNames = [];
    this.name = removeBrackets(rawName);
    this.args = findAllBrackets(rawName);
    this.examples = [];
  }

  /** Overrides the usage line shown in help output. */
  usage(text) {
    this.usageText = text;
    return this;
  }

  /** Disables the unknown-option check for this command. */
  allowUnknownOptions() {
    this.config.allowUnknownOptions = true;
    return this;
  }

  /** Prevents option defaults from being injected into parsed options. */
  ignoreOptionDefaultValue() {
    this.config.ignoreOptionDefaultValue = true;
    return this;
  }

  /** Stores the version string and registers the version flag(s). */
  version(version, customFlags = "-v, --version") {
    this.versionNumber = version;
    this.option(customFlags, "Display version number");
    return this;
  }

  /** Adds a help example (a string, or a function receiving the CLI name). */
  example(example) {
    this.examples.push(example);
    return this;
  }

  /** Registers an option on this command. */
  option(rawName, description, config) {
    this.options.push(new Option(rawName, description, config));
    return this;
  }

  /** Adds an alternative name for this command. */
  alias(name) {
    this.aliasNames.push(name);
    return this;
  }

  /** Sets the callback run when this command is matched. */
  action(callback) {
    this.commandAction = callback;
    return this;
  }

  /** @returns {boolean} Whether `name` is this command's name or one of its aliases. */
  isMatched(name) {
    if (this.name === name) {
      return true;
    }
    return this.aliasNames.includes(name);
  }

  /** True for the unnamed command or one aliased as "!". */
  get isDefaultCommand() {
    return this.name === "" || this.aliasNames.includes("!");
  }

  /** True when this instance is the CLI's global command. */
  get isGlobalCommand() {
    return this instanceof GlobalCommand;
  }

  /**
   * Looks up an option by name; only the part before the first "." is compared.
   * @returns {Option|undefined}
   */
  hasOption(name) {
    const [root] = name.split(".");
    return this.options.find((option) => option.names.includes(root));
  }

  /** Prints the help text for this command via `console.log`. */
  outputHelp() {
    const { name, commands } = this.cli;
    const { versionNumber, options: globalOptions, helpCallback } = this.cli.globalCommand;

    const versionSuffix = versionNumber ? `/${versionNumber}` : "";
    let sections = [
      { body: `${name}${versionSuffix}` },
      { title: "Usage", body: ` $ ${name} ${this.usageText || this.rawName}` },
    ];

    const listsCommands = (this.isGlobalCommand || this.isDefaultCommand) && commands.length > 0;
    if (listsCommands) {
      const longestCommandName = findLongest(commands.map((command) => command.rawName));
      const commandLines = commands.map((command) => {
        return ` ${padRight(command.rawName, longestCommandName.length)} ${command.description}`;
      });
      const helpHints = commands.map((command) => {
        const suffix = command.name === "" ? "" : ` ${command.name}`;
        return ` $ ${name}${suffix} --help`;
      });
      sections.push({ title: "Commands", body: commandLines.join("\n") });
      sections.push({
        title: `For more info, run any command with the \`--help\` flag`,
        body: helpHints.join("\n"),
      });
    }

    let options = this.isGlobalCommand ? globalOptions : [...this.options, ...globalOptions || []];
    if (!this.isGlobalCommand && !this.isDefaultCommand) {
      // The version flag is only advertised on the global/default help screen.
      options = options.filter((option) => option.name !== "version");
    }
    if (options.length > 0) {
      const longestOptionName = findLongest(options.map((option) => option.rawName));
      const optionLines = options.map((option) => {
        const defaultNote = option.config.default === void 0 ? "" : `(default: ${option.config.default})`;
        return ` ${padRight(option.rawName, longestOptionName.length)} ${option.description} ${defaultNote}`;
      });
      sections.push({ title: "Options", body: optionLines.join("\n") });
    }

    if (this.examples.length > 0) {
      const exampleLines = this.examples.map((example) => {
        return typeof example === "function" ? example(name) : example;
      });
      sections.push({ title: "Examples", body: exampleLines.join("\n") });
    }

    if (helpCallback) {
      // The callback may return replacement sections, or nothing to keep them.
      sections = helpCallback(sections) || sections;
    }

    const rendered = sections.map((section) => {
      return section.title ? `${section.title}:\n${section.body}` : section.body;
    });
    console.log(rendered.join("\n\n"));
  }

  /** Prints "<name>/<version> <platform info>" when a version was registered. */
  outputVersion() {
    const { name } = this.cli;
    const { versionNumber } = this.cli.globalCommand;
    if (!versionNumber) {
      return;
    }
    console.log(`${name}/${versionNumber} ${platformInfo}`);
  }

  /** @throws {CACError} When fewer positional args were parsed than required placeholders. */
  checkRequiredArgs() {
    const requiredCount = this.args.filter((arg) => arg.required).length;
    if (this.cli.args.length < requiredCount) {
      throw new CACError(`missing required args for command \`${this.rawName}\``);
    }
  }

  /** @throws {CACError} On the first parsed option unknown to both this and the global command. */
  checkUnknownOptions() {
    const { options, globalCommand } = this.cli;
    if (this.config.allowUnknownOptions) {
      return;
    }
    for (const name of Object.keys(options)) {
      const isKnown = name === "--" || this.hasOption(name) || globalCommand.hasOption(name);
      if (!isKnown) {
        throw new CACError(`Unknown option \`${name.length > 1 ? `--${name}` : `-${name}`}\``);
      }
    }
  }

  /** @throws {CACError} When a value-requiring option was given without a value. */
  checkOptionValue() {
    const { options: parsedOptions, globalCommand } = this.cli;
    const allOptions = [...globalCommand.options, ...this.options];
    for (const option of allOptions) {
      if (!option.required) {
        continue;
      }
      const value = parsedOptions[option.name.split(".")[0]];
      // `false` is legitimate when a `--no-<name>` twin exists.
      const hasNegated = allOptions.some((o) => o.negated && o.names.includes(option.name));
      if (value === true || (value === false && !hasNegated)) {
        throw new CACError(`option \`${option.rawName}\` value is missing`);
      }
    }
  }
}
393
/**
 * The implicit command that owns CLI-wide options, version and help
 * settings. It is registered under the placeholder name "@@global@@".
 */
class GlobalCommand extends Command {
  /**
   * @param {object} cli2 - Owning CLI instance.
   */
  constructor(cli2) {
    const rawName = "@@global@@";
    const description = "";
    super(rawName, description, {}, cli2);
  }
}
398
// Bundler helper alias; kept because the merges below rely on Object.assign semantics.
const __assign = Object.assign;

/**
 * The CLI application: a registry of commands plus the global command,
 * with argv parsing and dispatch. Emits `command:<name>`, `command:!`
 * (default command) and `command:*` (no command matched) events.
 */
class CAC extends EventEmitter {
  /**
   * @param {string} [name] - Program name shown in help; when empty it is
   *   derived from argv[1] during `parse()`.
   */
  constructor(name = "") {
    super();
    this.name = name;
    this.commands = [];
    this.rawArgs = [];
    this.args = [];
    this.options = {};
    this.globalCommand = new GlobalCommand(this);
    this.globalCommand.usage("<command> [options]");
  }

  /** Sets the global usage line. */
  usage(text) {
    this.globalCommand.usage(text);
    return this;
  }

  /**
   * Registers and returns a new command.
   * @returns {Command}
   */
  command(rawName, description, config) {
    const created = new Command(rawName, description || "", config, this);
    created.globalCommand = this.globalCommand;
    this.commands.push(created);
    return created;
  }

  /** Registers a global option. */
  option(rawName, description, config) {
    this.globalCommand.option(rawName, description, config);
    return this;
  }

  /** Enables `-h, --help`; `callback` may post-process the help sections. */
  help(callback) {
    this.globalCommand.option("-h, --help", "Display this message");
    this.globalCommand.helpCallback = callback;
    this.showHelpOnExit = true;
    return this;
  }

  /** Enables the version flag(s). */
  version(version, customFlags = "-v, --version") {
    this.globalCommand.version(version, customFlags);
    this.showVersionOnExit = true;
    return this;
  }

  /** Adds a global help example. */
  example(example) {
    this.globalCommand.example(example);
    return this;
  }

  /** Prints help for the matched command, or global help when none matched. */
  outputHelp() {
    const target = this.matchedCommand ? this.matchedCommand : this.globalCommand;
    target.outputHelp();
  }

  /** Prints the version line. */
  outputVersion() {
    this.globalCommand.outputVersion();
  }

  /** Stores the parse result and, when truthy, the matched command and its name. */
  setParsedInfo({ args, options }, matchedCommand, matchedCommandName) {
    this.args = args;
    this.options = options;
    if (matchedCommand) {
      this.matchedCommand = matchedCommand;
    }
    if (matchedCommandName) {
      this.matchedCommandName = matchedCommandName;
    }
    return this;
  }

  /** Clears the matched command state. */
  unsetMatchedCommand() {
    this.matchedCommand = void 0;
    this.matchedCommandName = void 0;
  }

  /**
   * Parses argv, selects the matching command and (unless disabled, or
   * help/version output was produced) runs its action.
   *
   * @param {string[]} [argv] - Full argv; the first two entries are skipped.
   * @param {{run?: boolean}} [options] - Pass `run: false` to parse only.
   * @returns {{args: Array, options: object}}
   */
  parse(argv = processArgs, { run = true } = {}) {
    this.rawArgs = argv;
    if (!this.name) {
      this.name = argv[1] ? getFileName(argv[1]) : "cli";
    }

    const userArgs = () => argv.slice(2);
    let shouldParse = true;

    // Pass 1: commands matched by their name or alias (first positional arg).
    for (const command of this.commands) {
      const parsed = this.mri(userArgs(), command);
      const commandName = parsed.args[0];
      if (!command.isMatched(commandName)) {
        continue;
      }
      shouldParse = false;
      const parsedInfo = __assign(__assign({}, parsed), {
        args: parsed.args.slice(1)
      });
      this.setParsedInfo(parsedInfo, command, commandName);
      this.emit(`command:${commandName}`, command);
    }

    // Pass 2: fall back to the default (unnamed) command.
    if (shouldParse) {
      for (const command of this.commands) {
        if (command.name !== "") {
          continue;
        }
        shouldParse = false;
        this.setParsedInfo(this.mri(userArgs(), command), command);
        this.emit(`command:!`, command);
      }
    }

    // Pass 3: no command at all; parse with global options only.
    if (shouldParse) {
      this.setParsedInfo(this.mri(userArgs()));
    }

    if (this.options.help && this.showHelpOnExit) {
      this.outputHelp();
      run = false;
      this.unsetMatchedCommand();
    }
    if (this.options.version && this.showVersionOnExit && this.matchedCommandName == null) {
      this.outputVersion();
      run = false;
      this.unsetMatchedCommand();
    }

    const parsedArgv = { args: this.args, options: this.options };
    if (run) {
      this.runMatchedCommand();
    }
    if (!this.matchedCommand && this.args[0]) {
      this.emit("command:*");
    }
    return parsedArgv;
  }

  /**
   * Parses argv against the global options plus those of `command`.
   *
   * @param {string[]} argv - Arguments without the node/script prefix.
   * @param {Command} [command] - Command whose options also apply.
   * @returns {{args: Array, options: object}} Positional args and options;
   *   `options["--"]` holds everything after a bare `--`.
   */
  mri(argv, command) {
    const cliOptions = [
      ...this.globalCommand.options,
      ...(command ? command.options : [])
    ];
    const mriOptions = getMriOptions(cliOptions);

    let argsAfterDoubleDashes = [];
    const doubleDashesIndex = argv.indexOf("--");
    if (doubleDashesIndex > -1) {
      argsAfterDoubleDashes = argv.slice(doubleDashesIndex + 1);
      argv = argv.slice(0, doubleDashesIndex);
    }

    const raw = mri2(argv, mriOptions);
    // Re-key every parsed flag with its camel-cased name.
    const parsed = Object.keys(raw).reduce((res, name) => {
      return __assign(__assign({}, res), {
        [camelcaseOptionName(name)]: raw[name]
      });
    }, { _: [] });

    const args = parsed._;
    const options = {
      "--": argsAfterDoubleDashes
    };

    const ignoreDefault = command && command.config.ignoreOptionDefaultValue
      ? command.config.ignoreOptionDefaultValue
      : this.globalCommand.config.ignoreOptionDefaultValue;

    const transforms = /* @__PURE__ */ Object.create(null);
    for (const cliOption of cliOptions) {
      if (!ignoreDefault && cliOption.config.default !== void 0) {
        for (const name of cliOption.names) {
          options[name] = cliOption.config.default;
        }
      }
      // `type: [fn]` marks an array-valued option; the first definition per name wins.
      if (Array.isArray(cliOption.config.type) && transforms[cliOption.name] === void 0) {
        const transform = /* @__PURE__ */ Object.create(null);
        transform["shouldTransform"] = true;
        transform["transformFunction"] = cliOption.config.type[0];
        transforms[cliOption.name] = transform;
      }
    }

    for (const key of Object.keys(parsed)) {
      if (key === "_") {
        continue;
      }
      setDotProp(options, key.split("."), parsed[key]);
      setByType(options, transforms);
    }

    return { args, options };
  }

  /**
   * Validates the parsed input against the matched command and invokes
   * its action with one argument per placeholder followed by the options
   * object. Does nothing when no command (or no action) is matched.
   *
   * @returns {*} Whatever the action returns.
   * @throws {CACError} From the command's validation checks.
   */
  runMatchedCommand() {
    const { args, options, matchedCommand: command } = this;
    if (!command || !command.commandAction) {
      return;
    }
    command.checkUnknownOptions();
    command.checkOptionValue();
    command.checkRequiredArgs();

    // A variadic placeholder receives all remaining positional args.
    const actionArgs = command.args.map((arg, index) => {
      return arg.variadic ? args.slice(index) : args[index];
    });
    actionArgs.push(options);
    return command.commandAction.apply(this, actionArgs);
  }
}
586
/**
 * Factory for a CLI application.
 *
 * @param {string} [name] - Program name.
 * @returns {CAC}
 */
const cac = (name = "") => {
  return new CAC(name);
};
// Default export of the bundled module.
const dist_default = cac;
588
+
589
+ // src/cli/config/types.ts
590
/**
 * Tells whether a test case carries a defined `multiTurn` property.
 *
 * @param {object} testCase2 - Test case to inspect.
 * @returns {boolean} False when the key is absent or explicitly `undefined`.
 */
function isMultiTurnConfig(testCase2) {
  if (!("multiTurn" in testCase2)) {
    return false;
  }
  return testCase2.multiTurn !== void 0;
}
593
+
594
+ // src/cli/config/schema.ts
595
+ import { z } from "zod";
596
+
597
+ // src/core/errors.ts
598
/**
 * Error carrying a machine-readable `code`, an optional underlying `cause`
 * and free-form `context`.
 */
const EvalError = class _EvalError extends Error {
  code;
  cause;
  context;

  /**
   * @param {string} message - Human-readable description.
   * @param {{code: string, cause?: Error, context?: object}} options
   */
  constructor(message, options) {
    super(message);
    this.name = "EvalError";
    const { code, cause, context } = options;
    this.code = code;
    this.cause = cause;
    this.context = context;
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _EvalError);
    }
  }

  /**
   * Wraps an unknown thrown value in an EvalError with the given code.
   * An existing EvalError is returned unchanged (its code is NOT replaced);
   * non-Error values are stringified into the cause.
   */
  static from(error, code, context) {
    if (error instanceof _EvalError) {
      return error;
    }
    let cause = error;
    if (!(cause instanceof Error)) {
      cause = new Error(String(error));
    }
    return new _EvalError(cause.message, { code, cause, context });
  }

  /** Serializable summary; the cause is reduced to its message. */
  toJSON() {
    const { name, message, code, context } = this;
    return {
      name,
      message,
      code,
      context,
      cause: this.cause?.message
    };
  }
};
632
+
633
+ // src/cli/config/schema.ts
634
// LLM connection settings.
const llmConfigSchema = z.object({
  provider: z.enum(["openai", "gemini"], {
    errorMap: () => ({
      message: "provider must be 'openai' or 'gemini'"
    })
  }),
  apiKey: z.string().optional(),
  defaultModel: z.string().optional(),
  reasoningEffort: z.enum(["minimal", "low", "medium", "high"]).optional(),
  defaultResponseFormat: z.object({
    type: z.enum(["json_object", "text"])
  }).optional()
});

// One judging criterion.
const criterionSchema = z.object({
  id: z.string().min(1, "Criterion id is required"),
  name: z.string().min(1, "Criterion name is required"),
  description: z.string().min(1, "Criterion description is required"),
  weight: z.number().positive().optional(),
  validator: z.function().optional()
});

// Judge settings; at least one criterion is mandatory.
const judgeConfigSchema = z.object({
  llm: llmConfigSchema.optional(),
  criteria: z.array(criterionSchema).min(1, "At least one criterion is required"),
  passThreshold: z.number().min(0).max(100).optional(),
  prompt: z.any().optional()
});

// Optional improver settings.
const improverConfigSchema = z.object({
  llm: llmConfigSchema.optional(),
  prompt: z.any().optional()
}).optional();

// Optional report output settings.
const outputConfigSchema = z.object({
  dir: z.string().optional(),
  filename: z.string().optional(),
  verbose: z.boolean().optional()
}).optional();

// Optional run settings.
const runConfigSchema = z.object({
  concurrency: z.number().int().positive().optional(),
  iterations: z.number().int().positive().optional(),
  stopOnFirstFailure: z.boolean().optional()
}).optional();

// Multi-turn termination conditions, distinguished by their `type` literal.
const maxTurnsConditionSchema = z.object({
  type: z.literal("maxTurns"),
  count: z.number().int().positive()
});
const fieldSetConditionSchema = z.object({
  type: z.literal("fieldSet"),
  fieldPath: z.string().min(1)
});
const fieldValueConditionSchema = z.object({
  type: z.literal("fieldValue"),
  fieldPath: z.string().min(1),
  expectedValue: z.unknown()
});
const customConditionSchema = z.object({
  type: z.literal("custom"),
  check: z.function(),
  description: z.string().optional()
});
const terminationConditionSchema = z.union([
  maxTurnsConditionSchema,
  fieldSetConditionSchema,
  fieldValueConditionSchema,
  customConditionSchema
]);

// A follow-up input entry of a multi-turn test.
const followUpInputSchema = z.object({
  input: z.unknown(),
  description: z.string().optional(),
  turns: z.number().optional()
});

// Multi-turn settings of a test case; at least one termination condition is mandatory.
const multiTurnConfigSchema = z.object({
  followUpInputs: z.array(followUpInputSchema).optional(),
  terminateWhen: z.array(terminationConditionSchema).min(1, "At least one termination condition is required"),
  maxTurns: z.number().int().positive().optional(),
  onConditionMet: z.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z.enum(["pass", "fail"]).optional()
});

// A test case declared inline in the config.
const testCaseSchema = z.object({
  id: z.string().optional(),
  input: z.unknown(),
  tags: z.array(z.string()).optional(),
  description: z.string().optional(),
  expectedOutput: z.unknown().optional(),
  files: z.array(z.any()).optional(),
  multiTurn: multiTurnConfigSchema.optional()
});

// The agent under evaluation.
const agentSchema = z.object({
  config: z.object({
    name: z.string(),
    description: z.string().optional()
  }),
  prompt: z.object({
    id: z.string(),
    version: z.string(),
    system: z.string(),
    renderUserPrompt: z.function()
  }),
  execute: z.function()
});

// Top-level config; it must provide inline test cases and/or include patterns.
const evalConfigSchema = z.object({
  name: z.string().optional(),
  agentDescription: z.string().optional(),
  agent: agentSchema,
  llm: llmConfigSchema,
  judge: judgeConfigSchema,
  improver: improverConfigSchema,
  testCases: z.array(testCaseSchema).optional(),
  output: outputConfigSchema,
  run: runConfigSchema,
  include: z
    .array(z.string().min(1, "Include pattern cannot be empty"))
    .min(1, "Include array must have at least one pattern")
    .optional(),
  agents: z.record(z.string(), agentSchema).optional()
}).refine(
  (data) => {
    const testCaseCount = data.testCases?.length ?? 0;
    const includeCount = data.include?.length ?? 0;
    return testCaseCount > 0 || includeCount > 0;
  },
  {
    message: "Either testCases or include must be provided. Use testCases for inline TypeScript tests, or include for YAML file discovery.",
    path: ["testCases"]
  }
);
755
/**
 * Validates a loaded configuration object against `evalConfigSchema`.
 *
 * @param {unknown} config - Value exported by the config file.
 * @returns {object} The data produced by the schema parse.
 * @throws {EvalError} With code "INVALID_CONFIG" and one line per issue.
 */
function validateConfig(config) {
  const result = evalConfigSchema.safeParse(config);
  if (result.success) {
    return result.data;
  }
  const lines = result.error.issues.map(({ path: issuePath, message }) => {
    const location = issuePath.join(".");
    if (location) {
      return ` - ${location}: ${message}`;
    }
    return ` - ${message}`;
  });
  throw new EvalError(`Invalid configuration:\n${lines.join("\n")}`, {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */
  });
}
769
+
770
+ // src/cli/config/loader.ts
771
+ import { existsSync } from "fs";
772
+ import { resolve, extname } from "path";
773
+ import { pathToFileURL } from "url";
774
+ import { bundleRequire } from "bundle-require";
775
+ import fg from "fast-glob";
776
/**
 * Error raised while locating, loading or validating the config file.
 */
class ConfigError extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Machine-readable code, e.g. "CONFIG_NOT_FOUND".
   * @param {object} [context] - Extra details such as the resolved path.
   */
  constructor(message, code, context) {
    super(message);
    Object.assign(this, { code, context, name: "ConfigError" });
  }
}
784
// Config file looked up when no path is given.
const DEFAULT_CONFIG_FILE = "agent-eval.config.ts";
// Extensions `loadConfig` accepts.
const SUPPORTED_EXTENSIONS = [".ts", ".mts", ".cts", ".js", ".mjs", ".cjs"];

/**
 * Resolves a config path against a working directory.
 *
 * @param {string} [configPath] - Defaults to `DEFAULT_CONFIG_FILE`.
 * @param {string} [cwd] - Defaults to `process.cwd()`.
 * @returns {string} Absolute path.
 */
function resolveConfigPath(configPath = DEFAULT_CONFIG_FILE, cwd = process.cwd()) {
  const absolutePath = resolve(cwd, configPath);
  return absolutePath;
}
789
/**
 * Loads and validates an eval config file.
 *
 * TypeScript files (.ts/.mts/.cts) are bundled on the fly with
 * `bundleRequire`; JavaScript files are loaded with a dynamic `import()`.
 * The module's `default` export is used when present, otherwise the module
 * namespace itself.
 *
 * NOTE: the object returned is the one exported by the config file, not
 * the output of the schema parse; validation only gates the return.
 *
 * @param {string} configPath - Path resolved against `process.cwd()`.
 * @returns {Promise<object>} The exported configuration object.
 * @throws {ConfigError} With code CONFIG_NOT_FOUND, CONFIG_LOAD_ERROR,
 *   CONFIG_NO_DEFAULT_EXPORT or CONFIG_VALIDATION_ERROR. Load and validation
 *   failures carry the original error on `cause`, so its stack is not lost.
 */
async function loadConfig(configPath) {
  const absolutePath = resolve(process.cwd(), configPath);

  // ConfigError's constructor takes no cause, so it is attached afterwards.
  const withCause = (configError, cause) => {
    configError.cause = cause;
    return configError;
  };

  if (!existsSync(absolutePath)) {
    throw new ConfigError(
      [
        `Config file not found: ${configPath}`,
        "",
        `Create an ${DEFAULT_CONFIG_FILE} file or specify a path:`,
        "npx agent-eval run ./path/to/config.ts"
      ].join("\n"),
      "CONFIG_NOT_FOUND",
      { path: absolutePath }
    );
  }

  const ext = extname(absolutePath).toLowerCase();
  if (!SUPPORTED_EXTENSIONS.includes(ext)) {
    throw new ConfigError(
      `Unsupported config file extension: ${ext}\nSupported extensions: ${SUPPORTED_EXTENSIONS.join(", ")}`,
      "CONFIG_LOAD_ERROR",
      { path: absolutePath, extension: ext }
    );
  }

  let mod;
  try {
    if (ext === ".ts" || ext === ".mts" || ext === ".cts") {
      const result = await bundleRequire({
        filepath: absolutePath,
        format: "esm",
        esbuildOptions: { sourcemap: "inline" }
      });
      mod = result.mod;
    } else {
      // Dynamic import needs a file:// URL for absolute paths.
      const fileUrl = pathToFileURL(absolutePath).href;
      mod = await import(fileUrl);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw withCause(
      new ConfigError(
        [
          `Failed to load config file: ${configPath}`,
          "",
          `Error: ${message}`,
          "",
          "Make sure the file is valid TypeScript/JavaScript and has no syntax errors."
        ].join("\n"),
        "CONFIG_LOAD_ERROR",
        { path: absolutePath, originalError: message }
      ),
      error
    );
  }

  const config = "default" in mod ? mod.default : mod;
  if (!config || typeof config !== "object") {
    throw new ConfigError(
      [
        "Config file must export a default configuration object.",
        "",
        "Example:",
        "import { defineConfig } from '@agtlantis/eval'",
        "export default defineConfig({ ... })"
      ].join("\n"),
      "CONFIG_NO_DEFAULT_EXPORT",
      { path: absolutePath }
    );
  }

  try {
    validateConfig(config);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw withCause(
      new ConfigError(
        message,
        "CONFIG_VALIDATION_ERROR",
        { path: absolutePath }
      ),
      error
    );
  }

  return config;
}
859
/**
 * Resolves the config path (falling back to the default file name and the
 * current working directory) and loads the config from it.
 *
 * @param {string} [configPath] - Config file path.
 * @param {string} [cwd] - Directory the path is resolved against.
 * @returns {Promise<object>} The loaded configuration.
 */
async function loadConfigWithDefaults(configPath, cwd) {
  return loadConfig(resolveConfigPath(configPath, cwd));
}
863
/**
 * Finds eval files matching the include patterns.
 *
 * @param {{include?: string[]}} config - Loaded config; supplies the
 *   patterns when `options.include` is not given.
 * @param {{include?: string[], cwd?: string, ignore?: string[]}} [options]
 *   `cwd` defaults to `process.cwd()`, `ignore` to the node_modules glob.
 * @returns {Promise<string[]>} Sorted absolute file paths.
 * @throws {ConfigError} Code CONFIG_NO_INCLUDE_PATTERN when no patterns are available.
 */
async function discoverEvalFiles(config, options = {}) {
  const patterns = options.include ?? config.include;
  const hasPatterns = Boolean(patterns) && patterns.length !== 0;
  if (!hasPatterns) {
    throw new ConfigError(
      [
        "No include patterns specified.",
        "",
        "Add an include field to your config:",
        "include: ['evals/**/*.eval.yaml']",
        "",
        "Or use the --include CLI option:",
        'npx agent-eval --include "evals/**/*.eval.yaml"'
      ].join("\n"),
      "CONFIG_NO_INCLUDE_PATTERN"
    );
  }

  const globOptions = {
    absolute: true,
    cwd: options.cwd ?? process.cwd(),
    ignore: options.ignore ?? ["**/node_modules/**"],
    onlyFiles: true,
    dot: false,
    followSymbolicLinks: false,
    unique: true,
    suppressErrors: false
  };
  const matches = await fg(patterns, globOptions);
  return matches.sort();
}
891
+
892
+ // src/cli/yaml/schema.ts
893
+ import { z as z2 } from "zod";
894
// Optional per-case expectations.
const yamlExpectationSchema = z2.object({
  minTurns: z2.number().int().positive().optional(),
  maxTurns: z2.number().int().positive().optional(),
  minScore: z2.number().min(0).max(100).optional()
});

// Termination condition; it must name a field or give a natural-language description.
const yamlTerminationConditionSchema = z2
  .object({
    field: z2.string().min(1).optional(),
    equals: z2.unknown().optional(),
    naturalLanguage: z2.string().min(1).optional()
  })
  .refine(
    (data) => data.field !== void 0 || data.naturalLanguage !== void 0,
    { message: "Either field or naturalLanguage must be specified" }
  );

// A persona definition.
const yamlPersonaSchema = z2.object({
  name: z2.string().min(1, "Persona name is required"),
  description: z2.string().optional(),
  systemPrompt: z2.string().min(1, "Persona systemPrompt is required")
});

// The file-level `defaults` section.
const yamlTestCaseDefaultsSchema = z2.object({
  maxTurns: z2.number().int().positive().optional(),
  endWhen: yamlTerminationConditionSchema.optional(),
  onConditionMet: z2.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z2.enum(["pass", "fail"]).optional(),
  tags: z2.array(z2.string()).optional()
});

// One test case; `persona` is either a non-empty string or an inline persona object.
const yamlTestCaseSchema = z2.object({
  id: z2.string().min(1, "Test case id is required"),
  name: z2.string().optional(),
  description: z2.string().optional(),
  tags: z2.array(z2.string()).optional(),
  input: z2.record(z2.unknown()),
  persona: z2.union([z2.string().min(1), yamlPersonaSchema]).optional(),
  maxTurns: z2.number().int().positive().optional(),
  endWhen: yamlTerminationConditionSchema.optional(),
  onConditionMet: z2.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z2.enum(["pass", "fail"]).optional(),
  expectedOutput: z2.record(z2.unknown()).optional(),
  expect: yamlExpectationSchema.optional()
});

// A whole YAML eval file; at least one case is mandatory.
const yamlEvalFileSchema = z2.object({
  agent: z2.string().min(1, "Agent name is required"),
  name: z2.string().optional(),
  description: z2.string().optional(),
  defaults: yamlTestCaseDefaultsSchema.optional(),
  personas: z2.record(yamlPersonaSchema).optional(),
  cases: z2.array(yamlTestCaseSchema).min(1, "At least one test case is required")
});
940
/**
 * Validates parsed YAML content against `yamlEvalFileSchema`.
 *
 * @param {unknown} content - Parsed YAML document.
 * @returns The schema's parsed data when validation succeeds.
 * @throws {EvalError} With code "INVALID_CONFIG" and one line per schema
 *   issue (prefixed by the dotted issue path when the path is non-empty).
 */
function validateYamlEvalFile(content) {
  const outcome = yamlEvalFileSchema.safeParse(content);
  if (outcome.success) {
    return outcome.data;
  }
  const lines = [];
  for (const issue of outcome.error.issues) {
    const location = issue.path.join(".");
    lines.push(location ? ` - ${location}: ${issue.message}` : ` - ${issue.message}`);
  }
  const errors = lines.join("\n");
  throw new EvalError(`Invalid YAML eval file:\n${errors}`, {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */
  });
}
954
+
955
+ // src/cli/yaml/loader.ts
956
+ import { existsSync as existsSync2 } from "fs";
957
+ import { readFile } from "fs/promises";
958
+ import { isAbsolute, resolve as resolve2 } from "path";
959
+ import { parse as parseYaml } from "yaml";
960
+
961
+ // src/multi-turn/types.ts
962
/** Type guard: true when the condition's `type` tag is "maxTurns". */
function isMaxTurnsCondition(condition) {
  const { type } = condition;
  return type === "maxTurns";
}
965
/** Type guard: true when the condition's `type` tag is "fieldSet". */
function isFieldSetCondition(condition) {
  const { type } = condition;
  return type === "fieldSet";
}
968
/** Type guard: true when the condition's `type` tag is "fieldValue". */
function isFieldValueCondition(condition) {
  const { type } = condition;
  return type === "fieldValue";
}
971
/** Type guard: true when the condition's `type` tag is "custom". */
function isCustomCondition(condition) {
  const { type } = condition;
  return type === "custom";
}
974
/**
 * Type guard: true when the test case has a `multiTurn` property
 * (own or inherited), regardless of its value.
 */
function isMultiTurnTestCase(testCase2) {
  return Reflect.has(testCase2, "multiTurn");
}
977
/** True only when `result.terminated` is exactly `true` (not merely truthy). */
function isTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
980
+
981
+ // src/multi-turn/termination.ts
982
/**
 * Reads a nested value by dot-separated path.
 *
 * @param {unknown} obj - Root value to read from.
 * @param {string} fieldPath - Path such as "result.status".
 * @returns The value at the path, or `undefined` when the root or any
 *   intermediate value is null/undefined or is not an object.
 */
function getFieldValue(obj, fieldPath) {
  if (obj == null) {
    return void 0;
  }
  const segments = fieldPath.split(".");
  let cursor = obj;
  for (let depth = 0; depth < segments.length; depth++) {
    // Stop as soon as the value cannot be descended into.
    if (cursor == null || typeof cursor !== "object") {
      return void 0;
    }
    cursor = cursor[segments[depth]];
  }
  return cursor;
}
999
/** True for every value except `null` and `undefined` (so 0, "" and false count as set). */
function isSet(value) {
  return value != null;
}
1002
/**
 * Evaluates a "maxTurns" condition against the current turn.
 *
 * @param {{count: number}} condition - The maxTurns condition.
 * @param {{currentTurn: number}} context - Conversation context.
 * @returns A terminated result (type "maxTurns") once `currentTurn` reaches
 *   `condition.count`, otherwise a non-terminated progress result.
 */
function checkMaxTurns(condition, context) {
  const { count } = condition;
  const { currentTurn } = context;
  if (currentTurn < count) {
    return {
      terminated: false,
      reason: `Turn ${currentTurn} of ${count}`
    };
  }
  if (currentTurn >= count) {
    return {
      terminated: true,
      terminationType: "maxTurns",
      matchedCondition: condition,
      reason: `Maximum turns reached (${count})`
    };
  }
  // Reached only when the comparison is undefined (e.g. NaN operands).
  return {
    terminated: false,
    reason: `Turn ${currentTurn} of ${count}`
  };
}
1017
/**
 * Evaluates a "fieldSet" condition: terminates when the field at
 * `condition.fieldPath` in the last output is neither null nor undefined.
 *
 * @param {{fieldPath: string}} condition - The fieldSet condition.
 * @param {{lastOutput: unknown}} context - Conversation context.
 * @returns A termination result with a human-readable reason.
 */
function checkFieldSet(condition, context) {
  const { fieldPath } = condition;
  const current = getFieldValue(context.lastOutput, fieldPath);
  if (!isSet(current)) {
    return {
      terminated: false,
      reason: `Field "${fieldPath}" is not set`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${fieldPath}" is set (value: ${JSON.stringify(current)})`
  };
}
1033
/**
 * Evaluates a "fieldValue" condition: terminates when the field at
 * `condition.fieldPath` in the last output is strictly equal (`===`) to
 * `condition.expectedValue`.
 *
 * @param {{fieldPath: string, expectedValue: unknown}} condition
 * @param {{lastOutput: unknown}} context - Conversation context.
 * @returns A termination result with a human-readable reason.
 */
function checkFieldValue(condition, context) {
  const { fieldPath, expectedValue } = condition;
  const actual = getFieldValue(context.lastOutput, fieldPath);
  if (actual !== expectedValue) {
    return {
      terminated: false,
      reason: `Field "${fieldPath}" does not equal expected value (got: ${JSON.stringify(actual)})`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${fieldPath}" equals expected value`
  };
}
1049
/**
 * Evaluates a "custom" condition by awaiting its `check(context)` callback.
 *
 * A throwing or rejecting callback does not propagate: it is reported as a
 * non-terminated result whose reason carries the error message.
 *
 * @param {{check: Function, description?: string}} condition
 * @param {object} context - Conversation context passed to the callback.
 * @returns {Promise<object>} Termination result.
 */
async function checkCustom(condition, context) {
  const label = condition.description ?? "Custom condition";
  let met;
  try {
    met = await condition.check(context);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return {
      terminated: false,
      reason: `${label} failed: ${detail}`
    };
  }
  if (!met) {
    return {
      terminated: false,
      reason: `${label} not met`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `${label} met`
  };
}
1073
/**
 * Dispatches a termination condition to the checker matching its `type` tag
 * ("maxTurns", "fieldValue", "fieldSet" or "custom").
 *
 * @param {{type: string}} condition - Condition to evaluate.
 * @param {object} context - Conversation context.
 * @returns {Promise<object>} The checker's termination result.
 * @throws {EvalError} With code "UNKNOWN_ERROR" for an unrecognised type.
 */
async function checkCondition(condition, context) {
  switch (condition.type) {
    case "maxTurns":
      return checkMaxTurns(condition, context);
    case "fieldValue":
      return checkFieldValue(condition, context);
    case "fieldSet":
      return checkFieldSet(condition, context);
    case "custom":
      return checkCustom(condition, context);
    default: {
      const unknownCondition = condition;
      throw new EvalError(`Unknown condition type: ${JSON.stringify(unknownCondition)}`, {
        code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
        context: { condition: unknownCondition }
      });
    }
  }
}
1092
/**
 * Evaluates conditions one at a time, in order, and returns the first result
 * that reports termination.
 *
 * @param {Array<object>} conditions - Termination conditions (OR semantics).
 * @param {object} context - Conversation context.
 * @returns {Promise<object>} The first terminated result, or a
 *   non-terminated result when the list is empty or nothing matched.
 */
async function checkTermination(conditions, context) {
  if (conditions.length === 0) {
    return {
      terminated: false,
      reason: "No termination conditions specified"
    };
  }
  for (let index = 0; index < conditions.length; index++) {
    const outcome = await checkCondition(conditions[index], context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return {
    terminated: false,
    reason: "No termination conditions met"
  };
}
1110
+
1111
+ // src/utils/json.ts
1112
/**
 * Shortens a string for display.
 *
 * @param {string} str - Text to shorten; any falsy value yields "".
 * @param {number} maxLength - Number of characters kept before the ellipsis.
 * @returns {string} `str` unchanged when it fits, otherwise its first
 *   `maxLength` characters followed by "...".
 */
function truncate(str, maxLength) {
  if (!str) {
    return "";
  }
  const fits = str.length <= maxLength;
  return fits ? str : str.slice(0, maxLength) + "...";
}
1121
+
1122
+ // src/multi-turn/conditions.ts
1123
/**
 * Builds a custom termination condition that asks an LLM whether the
 * conversation should end.
 *
 * The check renders the condition prompt, the turn history and the current
 * turn into a user message, sends it through `provider.simpleExecution`,
 * and terminates when the lower-cased, trimmed reply starts with "yes".
 *
 * @param {object} options
 * @param {object} options.provider - Provider exposing `simpleExecution`.
 * @param {string} options.prompt - Natural-language termination condition.
 * @param {string} [options.systemPrompt] - Replaces the built-in system prompt.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function naturalLanguage(options) {
  const { provider, prompt, systemPrompt } = options;
  const defaultSystemPrompt = [
    "You are an assistant that evaluates whether a conversation should terminate.",
    "Analyze the conversation history and determine if the specified condition is met.",
    'Respond with ONLY "yes" or "no" - nothing else.'
  ].join("\n");

  // Renders one history entry as a three-line block.
  const renderTurn = (h) =>
    [
      `Turn ${h.turn}:`,
      `Input: ${JSON.stringify(h.input)}`,
      `Output: ${JSON.stringify(h.output)}`
    ].join("\n");

  const check = async (context) => {
    const historyText = context.history.map(renderTurn).join("\n\n");
    const userPrompt = [
      "## Termination Condition",
      `${prompt}`,
      "",
      "## Conversation History",
      `${historyText || "(No history yet)"}`,
      "",
      "## Current Turn",
      `Turn: ${context.currentTurn}`,
      `Last Output: ${JSON.stringify(context.lastOutput)}`,
      "",
      'Should the conversation terminate based on the condition above? Answer "yes" or "no" only.'
    ].join("\n");
    const messages = [
      { role: "system", content: systemPrompt ?? defaultSystemPrompt },
      { role: "user", content: userPrompt }
    ];
    const execution = provider.simpleExecution(async (session) => {
      const reply = await session.generateText({ messages });
      return reply.text;
    });
    const outcome = await execution.result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    // A reply equal to "yes" also starts with "yes", so one test covers both.
    return outcome.value.toLowerCase().trim().startsWith("yes");
  };

  return {
    type: "custom",
    check,
    description: `NL: ${truncate(prompt, 50)}`
  };
}
1167
/**
 * Builds a custom condition that terminates when the field at `fieldPath`
 * equals `expectedValue`, by delegating to a "fieldValue" condition check.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @param {unknown} expectedValue - Value the field must equal.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function fieldEquals(fieldPath, expectedValue) {
  const check = async (context) => {
    const delegated = { type: "fieldValue", fieldPath, expectedValue };
    const { terminated } = await checkCondition(delegated, context);
    return terminated;
  };
  const description = `fieldEquals(${fieldPath}, ${JSON.stringify(expectedValue)})`;
  return { type: "custom", check, description };
}
1180
/**
 * Builds a custom condition that terminates when the field at `fieldPath`
 * is set, by delegating to a "fieldSet" condition check.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function fieldIsSet(fieldPath) {
  const check = async (context) => {
    const { terminated } = await checkCondition({ type: "fieldSet", fieldPath }, context);
    return terminated;
  };
  return { type: "custom", check, description: `fieldIsSet(${fieldPath})` };
}
1190
+
1191
+ // src/multi-turn/runner.ts
1192
+ import { resolveFileSourcesInInput } from "@agtlantis/core";
1193
// Fallbacks used by the multi-turn runner when a test case leaves the
// corresponding `multiTurn` option unset.
var DEFAULT_MAX_TURNS = 10; // safety limit on the number of turns
var DEFAULT_ON_CONDITION_MET = "pass"; // outcome when a termination condition matches
var DEFAULT_ON_MAX_TURNS_REACHED = "fail"; // outcome when the turn limit is hit
1196
/**
 * Sums per-turn token usage records into a single total.
 *
 * A record that lacks one of the counters (or is itself null/undefined)
 * contributes 0 for that counter instead of turning the total into NaN.
 *
 * @param {Array<{inputTokens?: number, outputTokens?: number, totalTokens?: number}>} usages
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 *   A new object; all zeros for an empty list.
 */
function aggregateTokenUsage(usages) {
  const totals = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
  for (const usage of usages) {
    totals.inputTokens += usage?.inputTokens ?? 0;
    totals.outputTokens += usage?.outputTokens ?? 0;
    totals.totalTokens += usage?.totalTokens ?? 0;
  }
  return totals;
}
1206
/**
 * Computes the turn limit actually enforced by the runner.
 *
 * @param {Array<{type: string, count?: number}>} conditions - Termination conditions.
 * @param {number} safetyLimit - Upper bound that always applies.
 * @returns {number} The smaller of `safetyLimit` and the count of the first
 *   "maxTurns" condition, or `safetyLimit` when there is no such condition.
 */
function getEffectiveMaxTurns(conditions, safetyLimit) {
  for (const candidate of conditions) {
    if (candidate.type === "maxTurns") {
      return Math.min(candidate.count, safetyLimit);
    }
  }
  return safetyLimit;
}
1213
/**
 * Produces the concrete input for a follow-up turn.
 *
 * @param {{input: unknown}} followUpInput - Either a static input value or
 *   a function `(context) => input` that may return a Promise.
 * @param {object} context - Conversation context handed to the function.
 * @returns {Promise<unknown>} The static value, or the function's result.
 */
async function resolveInput(followUpInput, context) {
  const { input: source } = followUpInput;
  if (typeof source !== "function") {
    return source;
  }
  const produced = source(context);
  if (produced instanceof Promise) {
    return await produced;
  }
  return produced;
}
1221
/**
 * Assembles the conversation context handed to input generators and
 * termination checks.
 *
 * @param {number} currentTurn - Turn number the context describes.
 * @param {Array<{output: unknown}>} history - Turn records so far (same array, not copied).
 * @returns {{currentTurn: number, history: Array, lastOutput: unknown}}
 *   `lastOutput` is the output of the final history entry, or `undefined`
 *   when the history is empty.
 */
function buildContext(currentTurn, history) {
  let lastOutput = void 0;
  if (history.length > 0) {
    lastOutput = history[history.length - 1].output;
  }
  return { currentTurn, history, lastOutput };
}
1228
/**
 * Finds the follow-up entry that serves a given follow-up slot.
 *
 * Each entry covers `turns` consecutive slots (1 when unset). An entry with
 * a non-finite `turns` covers every slot from its starting offset onward.
 *
 * @param {Array<{turns?: number}>} followUpInputs - Ordered follow-up entries.
 * @param {number} followUpIndex - Zero-based slot (0 is the second turn).
 * @returns The matching entry, or `null` when the entries are exhausted.
 */
function getFollowUpInput(followUpInputs, followUpIndex) {
  let offset = 0;
  for (let position = 0; position < followUpInputs.length; position++) {
    const candidate = followUpInputs[position];
    const span = candidate.turns ?? 1;
    const unbounded = !Number.isFinite(span);
    if ((unbounded && followUpIndex >= offset) || followUpIndex < offset + span) {
      return candidate;
    }
    offset += span;
  }
  return null;
}
1242
/**
 * Validates the `turns` setting of every follow-up entry.
 *
 * Entries without `turns` are always valid. Otherwise `turns` must be a
 * number of at least 1 (Infinity allowed), and an Infinity entry must be
 * the last one because anything after it could never be reached.
 *
 * NaN is rejected explicitly: `NaN < 1` is false, so without that check it
 * would slip through and later be handled like an infinite repeat.
 *
 * @param {Array<{turns?: number, description?: string}>} followUpInputs
 * @returns {void}
 * @throws {EvalError} With code "INVALID_CONFIG" on the first invalid entry.
 */
function validateFollowUpInputs(followUpInputs) {
  const lastIndex = followUpInputs.length - 1;
  followUpInputs.forEach((followUp, index) => {
    const { turns, description } = followUp;
    if (turns === void 0) {
      return;
    }
    if (typeof turns !== "number" || Number.isNaN(turns) || turns < 1) {
      throw new EvalError("turns must be a positive number or Infinity", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { description, turns }
      });
    }
    if (!Number.isFinite(turns) && index < lastIndex) {
      throw new EvalError(
        "turns: Infinity must be the last followUpInput (subsequent items would be unreachable)",
        {
          code: "INVALID_CONFIG" /* INVALID_CONFIG */,
          context: {
            description,
            position: index,
            totalItems: followUpInputs.length
          }
        }
      );
    }
  });
}
1272
/**
 * Determines the input for a given turn.
 *
 * Turn 1 always uses the test case's own input. Later turns use the
 * follow-up entry for slot `turn - 2`, resolved against the current history.
 *
 * @param {number} turn - One-based turn number.
 * @param {unknown} testCaseInput - Input for the first turn.
 * @param {Array<object>} followUpInputs - Ordered follow-up entries.
 * @param {Array<object>} conversationHistory - Turn records so far.
 * @returns {Promise<{type: "success", input: unknown} | {type: "exhausted"}>}
 */
async function getTurnInput(turn, testCaseInput, followUpInputs, conversationHistory) {
  if (turn === 1) {
    return { type: "success", input: testCaseInput };
  }
  const selected = getFollowUpInput(followUpInputs, turn - 2);
  if (!selected) {
    return { type: "exhausted" };
  }
  const turnContext = buildContext(turn, conversationHistory);
  const input = await resolveInput(selected, turnContext);
  return { type: "success", input };
}
1285
/**
 * Tells a file-resolution failure apart from a normal turn result: true
 * only when `result` has a `type` property equal to "fileResolutionError".
 */
function isFileResolutionError(result) {
  if (!("type" in result)) {
    return false;
  }
  return result.type === "fileResolutionError";
}
1288
/**
 * Runs one agent turn.
 *
 * File sources in the input are resolved first (relative to the process
 * working directory); a failure there is returned as a
 * `fileResolutionError` result instead of being thrown. The agent call is
 * then timed, and an agent failure is captured in `error` rather than
 * propagated.
 *
 * @param {unknown} input - Turn input, possibly containing file sources.
 * @param {{execute: Function, config: {name: string}}} agent - Agent under test.
 * @param {string} testCaseId - Recorded in the error context.
 * @param {number} turn - One-based turn number.
 * @returns {Promise<object>} Either `{ type: "fileResolutionError", reason }`
 *   or `{ output, metadata, latencyMs, error }`.
 */
async function executeSingleTurn(input, agent, testCaseId, turn) {
  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput(input, {
      basePath: process.cwd()
    });
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    return {
      type: "fileResolutionError",
      reason: `FileSource resolution failed on turn ${turn}: ${detail}`
    };
  }

  const turnResult = { output: void 0, metadata: void 0, latencyMs: 0, error: void 0 };
  const startedAt = performance.now();
  try {
    const agentResult = await agent.execute(resolvedInput);
    turnResult.output = agentResult.result;
    turnResult.metadata = agentResult.metadata;
  } catch (cause) {
    turnResult.error = EvalError.from(cause, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, {
      testCaseId,
      turn,
      agentName: agent.config.name
    });
  }
  // Latency covers the agent call only, whether it succeeded or failed.
  turnResult.latencyMs = performance.now() - startedAt;
  return turnResult;
}
1318
/**
 * Maps how a conversation ended onto a pass/fail verdict.
 *
 * @param {{terminated: boolean, terminationType?: string}} termination
 * @param {"pass"|"fail"} onConditionMet - Verdict when a condition matched.
 * @param {"pass"|"fail"} onMaxTurnsReached - Verdict when the turn limit hit.
 * @returns {boolean} `true` when not terminated or for an unrecognised
 *   termination type; `false` for "error" and "exhausted"; otherwise the
 *   configured verdict for "maxTurns" / "condition".
 */
function determinePassFromTermination(termination, onConditionMet, onMaxTurnsReached) {
  if (termination.terminated !== true) {
    return true;
  }
  const kind = termination.terminationType;
  if (kind === "error" || kind === "exhausted") {
    return false;
  }
  if (kind === "maxTurns") {
    return onMaxTurnsReached === "pass";
  }
  if (kind === "condition") {
    return onConditionMet === "pass";
  }
  return true;
}
1334
/**
 * Runs a multi-turn test case end to end and judges the final output.
 *
 * Per turn: check for abort, obtain the turn input, execute the agent,
 * record usage and history, then evaluate the termination conditions. The
 * loop stops on exhausted follow-ups, a file-resolution or agent error, a
 * matched condition, or the effective turn limit. The judge then evaluates
 * the last recorded output; the case passes only when both the termination
 * verdict and the judge verdict pass.
 *
 * @param {object} testCase2 - Test case carrying a `multiTurn` section.
 * @param {{agent: object, judge: object, agentDescription: unknown}} context
 * @param {{signal?: AbortSignal}} [options] - Optional abort signal.
 * @returns {Promise<object>} Result with output, metrics, verdicts, score,
 *   pass flag, judge metadata, conversation history, termination and turn count.
 * @throws {EvalError} When the signal is aborted at the start of a turn, or
 *   when the follow-up inputs are invalid.
 */
async function executeMultiTurnTestCase(testCase2, context, options) {
  const { agent, judge, agentDescription } = context;
  const settings = testCase2.multiTurn;
  const abortSignal = options?.signal;
  const turnLimit = getEffectiveMaxTurns(
    settings.terminateWhen,
    settings.maxTurns ?? DEFAULT_MAX_TURNS
  );
  const onConditionMet = settings.onConditionMet ?? DEFAULT_ON_CONDITION_MET;
  const onMaxTurnsReached = settings.onMaxTurnsReached ?? DEFAULT_ON_MAX_TURNS_REACHED;
  const followUps = settings.followUpInputs ?? [];
  validateFollowUpInputs(followUps);

  const conversationHistory = [];
  const usagePerTurn = [];
  let elapsedMs = 0;
  let termination = {
    terminated: false,
    reason: "Execution not started"
  };

  for (let turn = 1; turn <= turnLimit; turn++) {
    if (abortSignal?.aborted) {
      throw new EvalError("Multi-turn test execution aborted", {
        code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
        context: { testCaseId: testCase2.id, turn, reason: "aborted" }
      });
    }

    const nextInput = await getTurnInput(turn, testCase2.input, followUps, conversationHistory);
    if (nextInput.type === "exhausted") {
      termination = {
        terminated: true,
        terminationType: "exhausted",
        reason: "All follow-up inputs exhausted"
      };
      break;
    }

    const input = nextInput.input;
    const turnResult = await executeSingleTurn(input, agent, testCase2.id ?? "unknown", turn);
    if (isFileResolutionError(turnResult)) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: turnResult.reason
      };
      break;
    }

    elapsedMs += turnResult.latencyMs;
    usagePerTurn.push(
      turnResult.metadata?.tokenUsage ?? { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    );
    // The turn is recorded even when the agent failed, so it counts in totalTurns.
    conversationHistory.push({
      turn,
      input,
      output: turnResult.output,
      metadata: turnResult.metadata
    });

    if (turnResult.error) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: `Agent execution failed on turn ${turn}: ${turnResult.error.message}`
      };
      break;
    }

    termination = await checkTermination(
      settings.terminateWhen,
      buildContext(turn, conversationHistory)
    );
    if (termination.terminated) {
      break;
    }

    if (turn >= turnLimit) {
      termination = {
        terminated: true,
        terminationType: "maxTurns",
        matchedCondition: { type: "maxTurns", count: turnLimit },
        reason: `Maximum turns reached (${turnLimit})`
      };
      break;
    }
  }

  const metrics = {
    latencyMs: elapsedMs,
    tokenUsage: aggregateTokenUsage(usagePerTurn)
  };
  const finalOutput = conversationHistory[conversationHistory.length - 1]?.output;
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output: finalOutput,
    agentDescription,
    files: testCase2.files
  });
  const terminationPassed = determinePassFromTermination(
    termination,
    onConditionMet,
    onMaxTurnsReached
  );

  return {
    testCase: testCase2,
    output: finalOutput,
    metrics,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: terminationPassed && judgeResult.passed,
    judgeMetadata: judgeResult.metadata,
    conversationHistory,
    termination,
    totalTurns: conversationHistory.length
  };
}
1458
+
1459
+ // src/multi-turn/ai-user.ts
1460
/**
 * System prompt used by `aiUser` when the caller supplies none. It tells the
 * model to play a realistic user and to reply with the user message only.
 */
var DEFAULT_SYSTEM_PROMPT = `You are simulating a realistic user in a conversation with an AI assistant.

## Your Role
Generate natural, context-appropriate user messages based on the conversation history.

## Guidelines

1. **Stay in Character**: Respond as a real user would - with natural language, occasional typos, or casual phrasing when appropriate.

2. **Be Goal-Oriented**: Users have objectives. Pursue them logically based on the conversation context:
- If the assistant asks a question, provide a reasonable answer
- If clarification is needed, ask for it naturally
- If a task is progressing, guide it toward completion

3. **React Appropriately**: Respond to what the assistant says:
- Acknowledge when the assistant is helpful
- Express confusion if the response is unclear
- Correct misunderstandings if they occur

4. **Keep It Realistic**: Real users:
- Don't always provide perfect information upfront
- May change their mind or add requirements
- Sometimes need time to think or decide

## Output Format
Respond with ONLY the user's message. No additional formatting, explanation, or meta-commentary.`;
1486
/**
 * Creates an input generator that has an LLM play the user.
 *
 * The returned async function formats the history, resolves the system
 * prompt (string, function of the context, or the built-in default), asks
 * the provider for the next user message and converts the reply with
 * `buildInput`.
 *
 * @param {object} options
 * @param {object} options.provider - Provider exposing `simpleExecution`.
 * @param {string|Function} [options.systemPrompt] - Prompt or `(context) => prompt`.
 * @param {Function} [options.formatHistory] - Custom `(context) => string` formatter.
 * @param {Function} options.buildInput - `(responseText, context) => input`.
 * @returns {(context: object) => Promise<unknown>} Follow-up input generator.
 */
function aiUser(options) {
  const { provider, systemPrompt, formatHistory, buildInput } = options;

  // Default rendering: one block per turn, numbered by position in the history.
  const defaultFormatHistory = (ctx) =>
    ctx.history
      .map((h, i) =>
        [
          `[Turn ${i + 1}]`,
          `User: ${JSON.stringify(h.input)}`,
          `Assistant: ${JSON.stringify(h.output)}`
        ].join("\n")
      )
      .join("\n\n");

  return async (context) => {
    const render = formatHistory ?? defaultFormatHistory;
    const historyText = render(context);

    let resolvedSystemPrompt;
    if (typeof systemPrompt === "function") {
      resolvedSystemPrompt = systemPrompt(context);
    } else {
      resolvedSystemPrompt = systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
    }

    let userPrompt;
    if (historyText) {
      userPrompt = [
        "## Conversation History",
        `${historyText}`,
        "",
        "## Your Task",
        "Generate the next user message based on the conversation above:"
      ].join("\n");
    } else {
      userPrompt = [
        "## Your Task",
        "This is the start of a new conversation. Generate an appropriate opening message from the user:"
      ].join("\n");
    }

    const messages = [
      { role: "system", content: resolvedSystemPrompt },
      { role: "user", content: userPrompt }
    ];
    const execution = provider.simpleExecution(async (session) => {
      const reply = await session.generateText({ messages });
      return reply.text;
    });
    const outcome = await execution.result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    return buildInput(outcome.value, context);
  };
}
1519
+
1520
+ // src/cli/yaml/loader.ts
1521
/**
 * Reads, parses and (unless skipped) validates a YAML eval file.
 *
 * @param {string} path3 - Absolute path, or a path relative to `basePath`.
 * @param {object} [options]
 * @param {string} [options.basePath] - Base for relative paths; defaults to
 *   the process working directory.
 * @param {boolean} [options.skipValidation=false] - Return the parsed YAML
 *   without schema validation.
 * @returns {Promise<unknown>} The validated eval file, or the raw parse result.
 * @throws {EvalError} "FILE_READ_ERROR" when the file is missing or
 *   unreadable, "INVALID_CONFIG" when the YAML cannot be parsed or fails
 *   validation.
 */
async function loadYamlEvalFile(path3, options = {}) {
  const { basePath = process.cwd(), skipValidation = false } = options;
  const absolutePath = isAbsolute(path3) ? path3 : resolve2(basePath, path3);
  const location = { path: path3, absolutePath };

  if (!existsSync2(absolutePath)) {
    throw new EvalError(`YAML eval file not found: ${absolutePath}`, {
      code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
      context: { ...location }
    });
  }

  let rawText;
  try {
    rawText = await readFile(absolutePath, "utf-8");
  } catch (error) {
    throw EvalError.from(error, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { ...location });
  }

  let document;
  try {
    document = parseYaml(rawText);
  } catch (error) {
    const isErrorObject = error instanceof Error;
    const message = isErrorObject ? error.message : String(error);
    throw new EvalError(`Failed to parse YAML: ${message}`, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { ...location },
      cause: isErrorObject ? error : void 0
    });
  }

  return skipValidation ? document : validateYamlEvalFile(document);
}
1555
/**
 * Loads several YAML eval files one after another, in the given order.
 * The first failure aborts the whole load.
 *
 * @param {Iterable<string>} paths - File paths to load.
 * @param {object} [options] - Passed through to `loadYamlEvalFile`.
 * @returns {Promise<Array<{path: string, content: unknown}>>}
 */
async function loadYamlEvalFiles(paths, options = {}) {
  const loaded = [];
  for (const filePath of paths) {
    loaded.push({
      path: filePath,
      content: await loadYamlEvalFile(filePath, options)
    });
  }
  return loaded;
}
1563
/**
 * Converts every YAML case into a runtime test case.
 *
 * Each case is first merged with the file-level defaults, then converted
 * to a multi-turn or a simple test case depending on the merged result.
 *
 * @param {{defaults?: object, personas?: object, cases: Array<object>}} yaml
 * @param {object} context - Conversion context used for multi-turn cases.
 * @returns {Array<object>} Test cases in the same order as `yaml.cases`.
 */
function convertToTestCases(yaml, context) {
  const { defaults, personas, cases } = yaml;
  const convertOne = (rawCase) => {
    const merged = mergeWithDefaults(rawCase, defaults);
    return isMultiTurnCase(merged)
      ? convertToMultiTurnTestCase(merged, personas, context)
      : convertToSimpleTestCase(merged);
  };
  return cases.map((rawCase) => convertOne(rawCase));
}
1573
/**
 * Applies file-level defaults to a single YAML test case.
 *
 * Scalar settings fall back to the default only when the case leaves them
 * null/undefined. Tags are concatenated, defaults first.
 *
 * @param {object} testCase2 - YAML test case.
 * @param {object} [defaults] - File-level defaults.
 * @returns {object} The same object when there are no defaults, otherwise
 *   a new merged object (whose `tags` is always an array).
 */
function mergeWithDefaults(testCase2, defaults) {
  if (!defaults) {
    return testCase2;
  }
  const fallback = (key) => testCase2[key] ?? defaults[key];
  const defaultTags = defaults.tags ?? [];
  const ownTags = testCase2.tags ?? [];
  return Object.assign({}, testCase2, {
    maxTurns: fallback("maxTurns"),
    endWhen: fallback("endWhen"),
    onConditionMet: fallback("onConditionMet"),
    onMaxTurnsReached: fallback("onMaxTurnsReached"),
    tags: [...defaultTags, ...ownTags]
  });
}
1586
/**
 * A YAML case is treated as multi-turn when it defines a persona or an
 * `endWhen` condition (anything other than `undefined` counts).
 */
function isMultiTurnCase(testCase2) {
  const { persona, endWhen } = testCase2;
  return !(persona === void 0 && endWhen === void 0);
}
1589
/**
 * Resolves a test case's persona reference.
 *
 * @param {string|object|undefined} ref - Persona name, inline persona
 *   object, or `undefined` when the case has no persona.
 * @param {Record<string, object>} [personas] - Named personas from the file.
 * @returns {object|undefined} `undefined` for no reference, the inline
 *   object as-is, or the named persona.
 * @throws {EvalError} With code "INVALID_CONFIG" when a named persona is
 *   not defined in `personas`.
 */
function resolvePersona(ref, personas) {
  if (ref === void 0) {
    return void 0;
  }
  if (typeof ref === "object") {
    return ref;
  }
  // Only own keys count: the `in` operator would also accept inherited names
  // such as "toString" or "constructor" and return a function as the persona.
  const isDefined = personas != null && Object.prototype.hasOwnProperty.call(personas, ref);
  if (!isDefined) {
    throw new EvalError(`Persona not found: "${ref}"`, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: {
        personaRef: ref,
        availablePersonas: personas ? Object.keys(personas) : []
      }
    });
  }
  return personas[ref];
}
1607
/**
 * Converts a YAML `endWhen` entry into a runtime termination condition.
 *
 * Precedence: `naturalLanguage` wins when present; otherwise `field` with
 * `equals` becomes an equality check, and `field` alone an is-set check.
 *
 * @param {{naturalLanguage?: string, field?: string, equals?: unknown}} condition
 * @param {object} provider - Provider used by natural-language conditions.
 * @returns {object} The runtime condition.
 * @throws {EvalError} With code "INVALID_CONFIG" when neither
 *   `naturalLanguage` nor `field` is usable.
 */
function convertTerminationCondition(condition, provider) {
  const { naturalLanguage: nlPrompt, field, equals } = condition;
  if (nlPrompt) {
    return naturalLanguage({ provider, prompt: nlPrompt });
  }
  if (!field) {
    throw new EvalError("Invalid termination condition: no field or naturalLanguage specified", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { condition }
    });
  }
  return equals !== void 0 ? fieldEquals(field, equals) : fieldIsSet(field);
}
1625
/**
 * Converts a merged YAML case into a single-turn test case.
 * The description is taken from `name`, falling back to `description`.
 *
 * @param {object} merged - YAML case after defaults were applied.
 * @returns {{id: string, description?: string, tags?: string[], input: object, expectedOutput?: object}}
 */
function convertToSimpleTestCase(merged) {
  const { id, name, description, tags, input, expectedOutput } = merged;
  return {
    id,
    description: name ?? description,
    tags,
    input,
    expectedOutput
  };
}
1634
/**
 * Converts a merged YAML case into a multi-turn test case.
 *
 * An `endWhen` entry becomes the single termination condition. When a
 * persona is present, one AI-user generator is created and reused for
 * every follow-up turn (turns 2..maxTurns).
 *
 * @param {object} merged - YAML case after defaults were applied.
 * @param {Record<string, object>} [personas] - Named personas from the file.
 * @param {{provider: object, buildInput?: Function, formatHistory?: Function}} context
 * @returns {object} Test case with a populated `multiTurn` section.
 */
function convertToMultiTurnTestCase(merged, personas, context) {
  const { provider, buildInput, formatHistory } = context;
  const persona = resolvePersona(merged.persona, personas);

  const terminateWhen = merged.endWhen
    ? [convertTerminationCondition(merged.endWhen, provider)]
    : [];

  const followUpInputs = [];
  if (persona) {
    // Without a custom builder the reply is wrapped as `{ message }`.
    const wrapAsMessage = (response) => ({ message: response });
    const generateUserTurn = aiUser({
      provider,
      systemPrompt: persona.systemPrompt,
      formatHistory,
      buildInput: buildInput ?? wrapAsMessage
    });
    const turnBudget = merged.maxTurns ?? 10;
    // The first turn uses the case's own input, so follow-ups start at turn 2.
    for (let turnNumber = 2; turnNumber - 2 < turnBudget - 1; turnNumber++) {
      followUpInputs.push({
        input: generateUserTurn,
        description: `AI User (${persona.name}) - Turn ${turnNumber}`
      });
    }
  }

  const multiTurn = {
    followUpInputs,
    terminateWhen,
    maxTurns: merged.maxTurns ?? 10,
    onConditionMet: merged.onConditionMet ?? "pass",
    onMaxTurnsReached: merged.onMaxTurnsReached ?? "fail"
  };
  return {
    id: merged.id,
    description: merged.name ?? merged.description,
    tags: merged.tags,
    input: merged.input,
    expectedOutput: merged.expectedOutput,
    multiTurn
  };
}
1673
+
1674
+ // src/cli/utils/env.ts
1675
+ import { existsSync as existsSync3 } from "fs";
1676
+ import { readFile as readFile2 } from "fs/promises";
1677
+ import { resolve as resolve3 } from "path";
1678
/**
 * Loads variables from an env file into `process.env`, best-effort.
 *
 * Variables that already exist in the environment are left untouched. A
 * missing file is a no-op, and read or parse failures are swallowed so that
 * a broken env file never stops the CLI.
 *
 * @param {string} [filePath=".env"] - Env file path, resolved against `cwd`.
 * @param {string} [cwd] - Base directory; defaults to the process working directory.
 * @returns {Promise<void>}
 */
async function loadEnvFile(filePath = ".env", cwd = process.cwd()) {
  const absolutePath = resolve3(cwd, filePath);
  if (!existsSync3(absolutePath)) {
    return;
  }
  try {
    const parsed = parseEnvFile(await readFile2(absolutePath, "utf-8"));
    Object.entries(parsed)
      .filter(([key]) => process.env[key] === void 0)
      .forEach(([key, value]) => {
        process.env[key] = value;
      });
  } catch {
    // Deliberately ignored: loading the env file is optional.
  }
}
1694
/**
 * Parses the text of a dotenv-style file into key/value pairs.
 *
 * Blank lines, `#` comment lines, lines without `=` and lines with an empty
 * key are skipped. Keys and values are trimmed, and one pair of matching
 * surrounding quotes (single or double) is removed from the value.
 *
 * The escape sequences `\n`, `\r`, `\t` and `\\` are decoded in a single
 * left-to-right pass. Decoding them with chained replacements is wrong for
 * an escaped backslash followed by `n`, `r` or `t`: `C:\\new` would become
 * `C:\` + newline + `ew` instead of `C:\new`.
 *
 * @param {string} content - Raw file contents.
 * @returns {Record<string, string>} Parsed variables; a later duplicate key wins.
 */
function parseEnvFile(content) {
  const ESCAPES = { n: "\n", r: "\r", t: "\t", "\\": "\\" };
  const result = {};
  for (const line of content.split("\n")) {
    const trimmed = line.trim();
    if (!trimmed || trimmed.startsWith("#")) {
      continue;
    }
    const eqIndex = trimmed.indexOf("=");
    if (eqIndex === -1) {
      continue;
    }
    const key = trimmed.slice(0, eqIndex).trim();
    if (!key) {
      continue;
    }
    let value = trimmed.slice(eqIndex + 1).trim();
    const doubleQuoted = value.startsWith('"') && value.endsWith('"');
    const singleQuoted = value.startsWith("'") && value.endsWith("'");
    if (doubleQuoted || singleQuoted) {
      value = value.slice(1, -1);
    }
    if (value.includes("\\")) {
      // One pass, so the output of one escape is never re-read as another.
      value = value.replace(/\\([nrt\\])/g, (match, code) => ESCAPES[code]);
    }
    result[key] = value;
  }
  return result;
}
1720
+
1721
+ // src/cli/utils/provider-factory.ts
1722
+ import {
1723
+ createOpenAIProvider,
1724
+ createGoogleProvider
1725
+ } from "@agtlantis/core";
1726
+ import { mock } from "@agtlantis/core/testing";
1727
+
1728
+ // src/cli/constants.ts
1729
/** Shared CLI constants. */
var CLI_DEFAULTS = {
  // Number of "═" characters in a console divider line.
  DIVIDER_WIDTH: 60,
  // Score assigned to every criterion when running in mock mode.
  MOCK_DEFAULT_SCORE: 85
};
1735
+
1736
+ // src/cli/output/colors.ts
1737
/** ANSI SGR escape sequences, keyed by the names accepted by `c()`. */
var colors = {
  // Attributes
  reset: "\x1B[0m",
  bold: "\x1B[1m",
  dim: "\x1B[2m",
  // Foreground colours
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  blue: "\x1B[34m",
  cyan: "\x1B[36m",
  gray: "\x1B[90m"
};
1748
// Colour output is used only when stdout is a TTY and NO_COLOR is unset or
// empty. Evaluated once at module load; the value is truthy/falsy, not
// necessarily a boolean (isTTY is undefined when stdout is not a terminal).
var isColorSupported = process.stdout.isTTY && !process.env.NO_COLOR;
1749
/**
 * Wraps text in the ANSI sequence for `color` followed by a reset, or
 * returns the text unchanged when colour output is not supported.
 *
 * @param {string} color - Key of the `colors` table.
 * @param {string} text - Text to colourise.
 * @returns {string}
 */
function c(color, text) {
  if (!isColorSupported) {
    return text;
  }
  const open = colors[color];
  const close = colors.reset;
  return `${open}${text}${close}`;
}
1752
+
1753
+ // src/cli/output/console.ts
1754
/** Prints the CLI banner (name and tagline) framed by blank lines. */
function printBanner() {
  const title = c("cyan", " agent-eval");
  const tagline = c("dim", " LLM-as-Judge AI Agent Evaluation");
  console.log();
  console.log(title);
  console.log(tagline);
  console.log();
}
1760
/**
 * Prints a dimmed progress line to stdout.
 *
 * @param {string} message - Progress text.
 */
function printProgress(message) {
  const line = c("dim", ` ${message}`);
  console.log(line);
}
1763
/**
 * Prints the evaluation summary to stdout: test counts with pass rate,
 * average score, token and latency metrics, and optionally the wall-clock
 * duration and per-test details.
 *
 * @param {{summary: object, results: Array<object>}} report - Evaluation report.
 * @param {object} [options]
 * @param {boolean} [options.verbose] - Also print each test result.
 * @param {number} [options.duration] - Total duration in ms; omitted when undefined.
 */
function printSummary(report, options = {}) {
  const { summary, results } = report;
  const { verbose, duration } = options;

  const hasTests = summary.totalTests > 0;
  const passRate = hasTests ? (summary.passed / summary.totalTests * 100).toFixed(1) : "0.0";
  const divider = "\u2550".repeat(CLI_DEFAULTS.DIVIDER_WIDTH);
  const rule = () => console.log(c("cyan", divider));
  const blank = () => console.log();
  const label = (text) => c("bold", text);

  blank();
  rule();
  console.log(c("bold", " Evaluation Results"));
  rule();
  blank();

  console.log(` ${label("Total Tests:")} ${summary.totalTests}`);
  console.log(` ${label("Passed:")} ${c("green", String(summary.passed))} (${passRate}%)`);
  const failedText = summary.failed > 0 ? c("red", String(summary.failed)) : "0";
  console.log(` ${label("Failed:")} ${failedText}`);
  console.log(` ${label("Average Score:")} ${summary.avgScore.toFixed(1)}/100`);
  blank();

  console.log(` ${label("Total Tokens:")} ${formatNumber(summary.metrics.totalTokens)}`);
  console.log(` ${label("Avg Latency:")} ${formatMs(summary.metrics.avgLatencyMs)}`);
  if (duration !== void 0) {
    blank();
    console.log(` ${label("Duration:")} ${formatMs(duration)}`);
  }

  blank();
  rule();
  if (verbose && results.length > 0) {
    printVerboseResults(results);
  }
}
1800
/**
 * Prints one entry per test result: pass/fail marker, test id, overall
 * score and, when the result carries `criteriaScores`, each criterion score.
 *
 * @param {Iterable<object>} results - Test results to list.
 */
function printVerboseResults(results) {
  console.log();
  console.log(c("bold", " Test Results:"));
  console.log();
  for (const entry of results) {
    const marker = entry.passed ? c("green", "\u2713 PASS") : c("red", "\u2717 FAIL");
    // Results without a usable test case id are listed as "unknown".
    const hasId = "testCase" in entry && entry.testCase?.id;
    const testId = hasId ? entry.testCase.id : "unknown";
    console.log(` ${marker} ${testId}`);
    console.log(` Score: ${entry.overallScore.toFixed(1)}/100`);
    if ("criteriaScores" in entry && entry.criteriaScores) {
      for (const criterion of entry.criteriaScores) {
        console.log(` ${c("dim", criterion.criterionId)}: ${criterion.score.toFixed(1)}`);
      }
    }
    console.log();
  }
}
1818
/**
 * Prints an error to stderr: a red header followed by the error message,
 * each separated by blank lines.
 *
 * @param {{message: string}} error - Error to report.
 */
function printError(error) {
  const header = c("red", " \u2717 Error:");
  console.error();
  console.error(header);
  console.error();
  console.error(` ${error.message}`);
  console.error();
}
1825
/**
 * Formats a number using en-US conventions (comma thousands separators).
 *
 * @param {number} num - Value to format.
 * @returns {string}
 */
function formatNumber(num) {
  const locale = "en-US";
  return num.toLocaleString(locale);
}
1828
/**
 * Formats a duration in milliseconds for display.
 *
 * Values that round to less than 1000 are shown as whole milliseconds
 * ("250ms"); everything else as seconds with two decimals ("1.50s"). The
 * unit is chosen from the rounded value, so 999.6 is shown as "1.00s"
 * instead of "1000ms".
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string}
 */
function formatMs(ms) {
  const wholeMs = ms.toFixed(0);
  if (Number(wholeMs) < 1e3) {
    return `${wholeMs}ms`;
  }
  return `${(ms / 1e3).toFixed(2)}s`;
}
1834
+
1835
+ // src/cli/utils/provider-factory.ts
1836
/**
 * Reads the API key for a provider from the environment.
 *
 * @param {string} provider - "openai" selects OPENAI_API_KEY; any other
 *   value selects GOOGLE_API_KEY.
 * @returns {string|undefined} The key, or `undefined` when the variable is unset.
 */
function getApiKeyFromEnv(provider) {
  const variable = provider === "openai" ? "OPENAI_API_KEY" : "GOOGLE_API_KEY";
  return process.env[variable];
}
1842
/**
 * Creates an LLM provider from `config.llm`.
 *
 * The API key comes from the config, falling back to the environment.
 * "openai" yields an OpenAI provider (default model "gpt-4o-mini"); any
 * other provider value yields a Google provider (default model
 * "gemini-1.5-flash").
 *
 * @param {{llm: {provider: string, apiKey?: string, defaultModel?: string}}} config
 * @returns {object} Provider with its default model applied.
 * @throws {ConfigError} "CONFIG_VALIDATION_ERROR" when no API key is available.
 */
function createProviderFromConfig(config) {
  const { llm } = config;
  const isOpenAI = llm.provider === "openai";
  const apiKey = llm.apiKey ?? getApiKeyFromEnv(llm.provider);

  if (!apiKey) {
    const envVar = isOpenAI ? "OPENAI_API_KEY" : "GOOGLE_API_KEY";
    const message = [
      `API key not found for ${llm.provider}.`,
      "",
      `Set the ${envVar} environment variable or provide apiKey in config.`
    ].join("\n");
    throw new ConfigError(message, "CONFIG_VALIDATION_ERROR");
  }

  if (isOpenAI) {
    const openai = createOpenAIProvider({ apiKey });
    return openai.withDefaultModel(llm.defaultModel ?? "gpt-4o-mini");
  }
  const google = createGoogleProvider({ apiKey });
  return google.withDefaultModel(llm.defaultModel ?? "gemini-1.5-flash");
}
1863
/**
 * Creates the providers used for the agent, the judge and the improver.
 *
 * With `options.mock` set, one mock provider serves all three roles and
 * answers with a passing verdict for every configured judge criterion.
 * Otherwise the judge and improver reuse the main provider unless they
 * declare their own `llm` settings.
 *
 * @param {object} config - Evaluation config (`llm`, `judge`, optional `improver`).
 * @param {{ mock?: boolean }} options - CLI options.
 * @returns {{ mainProvider: object, judgeProvider: object, improverProvider: object }}
 */
function initializeProviders(config, options) {
  if (!options.mock) {
    const mainProvider = createProviderFromConfig(config);
    // A role-specific llm block overrides the shared one; otherwise share.
    const providerFor = (llm) => llm ? createProviderFromConfig({ ...config, llm }) : mainProvider;
    const judgeProvider = providerFor(config.judge.llm);
    const improverProvider = providerFor(config.improver?.llm);
    return { mainProvider, judgeProvider, improverProvider };
  }
  printProgress("Using mock Provider (--mock mode)");
  const verdicts = config.judge.criteria.map(({ id }) => ({
    criterionId: id,
    score: CLI_DEFAULTS.MOCK_DEFAULT_SCORE,
    reasoning: "Mock evaluation - test mode",
    passed: true
  }));
  const mockProvider = mock.provider(mock.json({ verdicts }));
  return {
    mainProvider: mockProvider,
    judgeProvider: mockProvider,
    improverProvider: mockProvider
  };
}
1880
+
1881
+ // src/cli/output/report.ts
1882
+ import { mkdir, writeFile as writeFile2 } from "fs/promises";
1883
+ import { existsSync as existsSync4 } from "fs";
1884
+ import { join } from "path";
1885
+
1886
+ // src/reporter/markdown.ts
1887
+ import { writeFile } from "fs/promises";
1888
+ import { getFileSourcesDisplayInfo } from "@agtlantis/core";
1889
// Status glyphs used in report headings and verdict bullets.
var PASS_ICON = "\u{2705}";
var FAIL_ICON = "\u{274C}";
// Sort rank per suggestion priority; lower ranks are listed first.
var PRIORITY_ORDER = Object.fromEntries(
  ["high", "medium", "low"].map((level, rank) => [level, rank])
);
1892
/**
 * Renders an evaluation report as a Markdown document.
 *
 * Sections, in order: title and metadata, summary table, failed tests,
 * passed tests (collapsed in a <details> element unless `expandPassedTests`
 * is set) and improvement suggestions sorted by priority.
 *
 * Suggestions whose priority is not listed in PRIORITY_ORDER are sorted after
 * all known priorities. Previously they made the comparator return NaN, which
 * left the resulting order unspecified.
 *
 * @param {object} report - Report with `summary`, `results`, `suggestions`,
 *   `generatedAt` (a Date) and `promptVersion`.
 * @param {object} [options]
 * @param {boolean} [options.expandPassedTests=false] - Show passed tests inline.
 * @param {boolean} [options.includeRawOutput=false] - Append raw output per test.
 * @param {number} [options.outputPreviewLength=200] - Preview length for input/output.
 * @returns {string} The Markdown text.
 */
function reportToMarkdown(report, options = {}) {
  const {
    expandPassedTests = false,
    includeRawOutput = false,
    outputPreviewLength = 200
  } = options;
  const { summary, results, suggestions, generatedAt, promptVersion } = report;
  const passRate = summary.totalTests > 0 ? (summary.passed / summary.totalTests * 100).toFixed(1) : "0.0";
  const renderResult = (result) => formatTestResult(result, outputPreviewLength, includeRawOutput);
  const lines = [
    "# Evaluation Report",
    "",
    `> Generated: ${generatedAt.toISOString()}`,
    `> Prompt Version: ${promptVersion}`,
    "",
    "## Summary",
    "",
    `| Metric | Value |`,
    `|--------|-------|`,
    `| Total Tests | ${summary.totalTests} |`
  ];
  if (summary.iterations && summary.iterations > 1) {
    lines.push(`| **Iterations** | **${summary.iterations}** |`);
  }
  lines.push(`| Passed | ${summary.passed} (${passRate}%) |`);
  lines.push(`| Failed | ${summary.failed} |`);
  if (summary.avgStdDev !== void 0) {
    lines.push(
      `| Average Score | ${summary.avgScore.toFixed(1)} \xB1 ${summary.avgStdDev.toFixed(1)} |`
    );
  } else {
    lines.push(`| Average Score | ${summary.avgScore.toFixed(1)} |`);
  }
  if (summary.avgPassRate !== void 0) {
    lines.push(`| Avg Pass Rate | ${(summary.avgPassRate * 100).toFixed(1)}% |`);
  }
  lines.push(`| Avg Latency | ${summary.metrics.avgLatencyMs.toFixed(0)}ms |`);
  lines.push(`| Total Tokens | ${summary.metrics.totalTokens} |`);
  if (summary.costSummary?.total !== void 0) {
    lines.push(`| Est. Cost | $${summary.costSummary.total.toFixed(4)} |`);
  }
  lines.push("");
  const failedResults = results.filter((r) => !r.passed);
  if (failedResults.length > 0) {
    lines.push(`## ${FAIL_ICON} Failed Tests`);
    lines.push("");
    for (const result of failedResults) {
      lines.push(renderResult(result));
    }
  }
  const passedResults = results.filter((r) => r.passed);
  if (passedResults.length > 0) {
    lines.push(`## ${PASS_ICON} Passed Tests`);
    lines.push("");
    if (expandPassedTests) {
      for (const result of passedResults) {
        lines.push(renderResult(result));
      }
    } else {
      lines.push("<details>");
      lines.push("<summary>Click to expand passed tests</summary>");
      lines.push("");
      for (const result of passedResults) {
        lines.push(renderResult(result));
      }
      lines.push("</details>");
      lines.push("");
    }
  }
  if (suggestions.length > 0) {
    lines.push("## \u{1F4A1} Improvement Suggestions");
    lines.push("");
    // Unknown priorities rank last; equal ranks keep their input order.
    const rankOf = (suggestion) => {
      const rank = PRIORITY_ORDER[suggestion.priority];
      return typeof rank === "number" ? rank : Number.MAX_SAFE_INTEGER;
    };
    const sortedSuggestions = [...suggestions].sort((a, b) => rankOf(a) - rankOf(b));
    for (const suggestion of sortedSuggestions) {
      lines.push(formatSuggestion(suggestion));
    }
  }
  return lines.join("\n");
}
1971
/**
 * Serialises a value as pretty-printed JSON and wraps it in a fenced
 * Markdown code block, returned as separate lines.
 *
 * `JSON.stringify` returns `undefined` for inputs such as `undefined` or a
 * function; those fall back to `String(value)` so the fence body (and the
 * `truncate` call) always receives a string. Failed tests reach this
 * function with an `undefined` output.
 *
 * @param {unknown} value - Value to render.
 * @param {number} [maxLength] - When given, the text is passed through `truncate`.
 * @returns {string[]} Opening fence, content, closing fence.
 */
function jsonCodeBlock(value, maxLength) {
  const json = JSON.stringify(value, null, 2) ?? String(value);
  const content = maxLength !== void 0 ? truncate(json, maxLength) : json;
  return ["```json", content, "```"];
}
1976
/**
 * Picks the status glyph for a pass/fail flag.
 *
 * @param {unknown} passed - Truthy for a pass.
 * @returns {string} PASS_ICON when `passed` is truthy, FAIL_ICON otherwise.
 */
function passFailIcon(passed) {
  if (passed) {
    return PASS_ICON;
  }
  return FAIL_ICON;
}
1979
/**
 * Renders one test result as a Markdown section: heading with score,
 * optional description, referenced files, multi-turn and iteration details,
 * the input/output (or conversation history), verdicts and, on request, the
 * raw output.
 *
 * @param {object} result - Test result to render.
 * @param {number} previewLength - Max length passed on for input/output previews.
 * @param {boolean} includeRaw - Whether to append the raw output block.
 * @returns {string} The section as newline-joined Markdown.
 */
function formatTestResult(result, previewLength, includeRaw) {
  const { testCase } = result;
  const section = [];
  const heading = testCase.id ?? "unnamed";
  const baseScore = result.overallScore.toFixed(1);
  // Iterated results also show the spread across iterations.
  const scoreDisplay = result.iterationStats
    ? `${baseScore} \xB1 ${result.iterationStats.stdDev.toFixed(1)}`
    : baseScore;
  section.push(`### ${heading} (Score: ${scoreDisplay})`, "");
  if (testCase.description) {
    section.push(`> ${testCase.description}`, "");
  }
  const fileInfos = getFileSourcesDisplayInfo(testCase.input);
  if (fileInfos.length > 0) {
    section.push("**Files:**");
    for (const { filename, source, description, mediaType } of fileInfos) {
      const namePrefix = filename ? `${filename} - ` : "";
      section.push(`- ${namePrefix}${source}: ${description} (${mediaType})`);
    }
    section.push("");
  }
  if (result.totalTurns !== void 0) {
    const reason = result.terminationReason ?? "unknown";
    section.push(`**Multi-turn:** ${result.totalTurns} turns | Termination: ${reason}`, "");
  }
  if (result.multiTurnIterationStats) {
    section.push(...formatMultiTurnIterationStats(result.multiTurnIterationStats));
  }
  if (result.iterationStats && result.iterationResults) {
    section.push(...formatIterationResults(result.iterationStats, result.iterationResults));
  }
  const hasHistory = result.conversationHistory && result.conversationHistory.length > 0;
  const exchange = hasHistory
    ? formatConversationHistory(result.conversationHistory, previewLength)
    : formatSingleTurnInputOutput(testCase.input, result.output, previewLength);
  section.push(...exchange);
  section.push("**Verdicts:**");
  for (const { passed, criterionId, score, reasoning } of result.verdicts) {
    section.push(`- ${passFailIcon(passed)} **${criterionId}**: ${score} - ${reasoning}`);
  }
  section.push("");
  if (includeRaw) {
    section.push(
      "<details>",
      "<summary>Raw Output</summary>",
      "",
      ...jsonCodeBlock(result.output),
      "</details>",
      ""
    );
  }
  return section.join("\n");
}
2034
/**
 * Renders multi-turn iteration statistics as a Markdown table.
 *
 * @param {{ terminationCounts: Record<string, number>, avgTurns: number,
 *   minTurns: number, maxTurns: number }} stats
 * @returns {string[]} Markdown lines, ending with a blank line.
 */
function formatMultiTurnIterationStats(stats) {
  const parts = [];
  for (const [type, count] of Object.entries(stats.terminationCounts)) {
    parts.push(`${type}: ${count}`);
  }
  const terminationSummary = parts.length > 0 ? parts.join(", ") : "none";
  const rows = [
    `| Avg Turns | ${stats.avgTurns.toFixed(1)} |`,
    `| Min/Max Turns | ${stats.minTurns} / ${stats.maxTurns} |`,
    `| Termination Distribution | ${terminationSummary} |`
  ];
  return [
    "**Multi-turn Iteration Statistics:**",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    ...rows,
    ""
  ];
}
2047
/**
 * Renders the per-iteration score table followed by a one-line statistics
 * summary.
 *
 * @param {{ mean: number, stdDev: number, min: number, max: number, passRate: number }} stats
 * @param {Array<{ overallScore: number, passed: boolean, metrics: { latencyMs: number } }>} results
 * @returns {string[]} Markdown lines, ending with a blank line.
 */
function formatIterationResults(stats, results) {
  const rows = results.map((iter, idx) => {
    const score = iter.overallScore.toFixed(1);
    const latency = iter.metrics.latencyMs.toFixed(0);
    return `| ${idx + 1} | ${score} | ${passFailIcon(iter.passed)} | ${latency}ms |`;
  });
  const passPercent = (stats.passRate * 100).toFixed(0);
  const statsLine = `**Stats:** ${stats.mean.toFixed(1)} \xB1 ${stats.stdDev.toFixed(1)} (min: ${stats.min.toFixed(0)}, max: ${stats.max.toFixed(0)}, pass rate: ${passPercent}%)`;
  return [
    "**Iteration Results:**",
    "",
    "| # | Score | Passed | Latency |",
    "|---|-------|--------|---------|",
    ...rows,
    "",
    statsLine,
    ""
  ];
}
2066
/**
 * Renders a multi-turn conversation as one collapsible <details> block per
 * turn, each showing that turn's input and output as JSON.
 *
 * @param {Iterable<{ turn: number, input: unknown, output: unknown }>} history
 * @param {number} previewLength - Max length passed on for each JSON preview.
 * @returns {string[]} Markdown lines.
 */
function formatConversationHistory(history, previewLength) {
  const rendered = ["**Conversation History:**", ""];
  for (const { turn, input, output } of history) {
    rendered.push(
      "<details>",
      `<summary>Turn ${turn}</summary>`,
      "",
      "**Input:**",
      ...jsonCodeBlock(input, previewLength),
      "",
      "**Output:**",
      ...jsonCodeBlock(output, previewLength),
      "</details>",
      ""
    );
  }
  return rendered;
}
2082
/**
 * Renders the input and output of a single-turn test as two labelled JSON
 * code blocks.
 *
 * @param {unknown} input - Test case input.
 * @param {unknown} output - Agent output.
 * @param {number} previewLength - Max length passed on for each JSON preview.
 * @returns {string[]} Markdown lines, ending with a blank line.
 */
function formatSingleTurnInputOutput(input, output, previewLength) {
  const inputBlock = jsonCodeBlock(input, previewLength);
  const outputBlock = jsonCodeBlock(output, previewLength);
  return ["**Input:**"].concat(inputBlock, ["", "**Output:**"], outputBlock, [""]);
}
2092
/**
 * Renders an improvement suggestion as a Markdown section with its
 * reasoning, expected improvement and a diff of current vs. suggested value.
 *
 * @param {{ priority: string, type: string, reasoning: string,
 *   expectedImprovement: string, currentValue: string, suggestedValue: string }} suggestion
 * @returns {string} Newline-joined Markdown, ending with a newline.
 */
function formatSuggestion(suggestion) {
  const iconByPriority = { high: "\u{1F534}", medium: "\u{1F7E1}", low: "\u{1F7E2}" };
  const priorityIcon = iconByPriority[suggestion.priority] ?? "\u26AA";
  // Prefix every line of a multi-line value with the diff marker.
  const markLines = (text, marker) => text.split("\n").map((line) => `${marker} ${line}`).join("\n");
  return [
    `### ${priorityIcon} [${suggestion.priority.toUpperCase()}] ${suggestion.type}`,
    "",
    `**Reasoning:** ${suggestion.reasoning}`,
    "",
    `**Expected Improvement:** ${suggestion.expectedImprovement}`,
    "",
    "**Diff:**",
    "```diff",
    markLines(suggestion.currentValue, "-"),
    markLines(suggestion.suggestedValue, "+"),
    "```",
    ""
  ].join("\n");
}
2109
+
2110
+ // src/cli/output/report.ts
2111
/**
 * Renders a report to Markdown and writes it to disk.
 *
 * The target directory is created with `recursive: true`, which succeeds
 * when the directory already exists, so no separate existence check is made
 * (a check followed by a create leaves a window for races).
 *
 * @param {object} report - Report to render.
 * @param {object} [options]
 * @param {string} [options.dir="./reports"] - Output directory.
 * @param {string} [options.filename] - File name; defaults to a timestamped name.
 * @param {object} [options.markdown] - Options forwarded to the Markdown renderer.
 * @returns {Promise<string>} Path of the written file.
 */
async function generateReport(report, options = {}) {
  const {
    dir = "./reports",
    filename = generateFilename(),
    markdown
  } = options;
  await mkdir(dir, { recursive: true });
  const content = reportToMarkdown(report, markdown);
  const outputPath = join(dir, filename);
  await writeFile2(outputPath, content, "utf-8");
  return outputPath;
}
2125
/**
 * Builds a report file name from the current UTC time, e.g.
 * "eval-2024-01-31T12-34-56.md" (colons replaced, milliseconds dropped).
 *
 * @returns {string} The generated file name.
 */
function generateFilename() {
  const iso = (/* @__PURE__ */ new Date()).toISOString();
  // Drop the trailing ".sssZ" and make the time portion file-system friendly.
  const withoutMillis = iso.slice(0, iso.lastIndexOf("."));
  const timestamp = withoutMillis.split(":").join("-");
  return `eval-${timestamp}.md`;
}
2129
+
2130
+ // src/core/runner.ts
2131
+ import { resolveFileSourcesInInput as resolveFileSourcesInInput2 } from "@agtlantis/core";
2132
+
2133
+ // src/utils/semaphore.ts
2134
/**
 * Creates a counting semaphore that lets at most `limit` holders run at once.
 *
 * `acquire()` resolves immediately while a slot is free; otherwise the caller
 * is queued and resumed, first come first served, by a later `release()`.
 *
 * @param {number} limit - Maximum number of concurrent holders.
 * @returns {{ acquire(): Promise<void>, release(): void }}
 */
function createSemaphore(limit) {
  const queue = [];
  let active = 0;
  return {
    async acquire() {
      const hasFreeSlot = active < limit;
      if (!hasFreeSlot) {
        return new Promise((wake) => {
          queue.push(wake);
        });
      }
      active += 1;
    },
    release() {
      active -= 1;
      const wake = queue.shift();
      if (!wake) {
        return;
      }
      // Hand the freed slot straight to the longest-waiting caller.
      active += 1;
      wake();
    }
  };
}
2157
+
2158
+ // src/core/constants.ts
2159
// Scoring bounds and pass thresholds.
var SCORE = {
  /** Lowest score a criterion can receive. */
  MIN: 0,
  /** Highest score a criterion can receive. */
  MAX: 100,
  /** Score at or above which an evaluation passes by default. */
  DEFAULT_PASS_THRESHOLD: 70,
  /** Fraction of passing iterations (50%) needed to pass by majority. */
  MAJORITY_PASS_THRESHOLD: 1 / 2
};
// Token usage value used when no usage was reported.
var ZERO_TOKEN_USAGE = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
2174
+
2175
+ // src/core/runner.ts
2176
/**
 * Runs one single-turn test case: resolves file sources in the input,
 * executes the agent, then has the judge evaluate the output.
 *
 * File-resolution and agent failures are returned as failed results (score 0,
 * no verdicts) rather than thrown. Judge failures propagate to the caller.
 *
 * @param {object} testCase2 - Test case with `input` and optional `id` / `files`.
 * @param {{ agent: object, judge: object, agentDescription: string }} context
 * @param {AbortSignal} [signal] - Checked before execution and before judging.
 * @returns {Promise<object>} A result with kind "single-turn".
 * @throws {EvalError} With context reason "aborted" when the signal is aborted.
 */
async function executeTestCase(testCase2, context, signal) {
  const { agent, judge, agentDescription } = context;
  const throwIfAborted = (message) => {
    if (signal?.aborted) {
      throw new EvalError(message, {
        code: "AGENT_EXECUTION_ERROR",
        context: { testCaseId: testCase2.id, reason: "aborted" }
      });
    }
  };
  const errorContext = () => ({
    testCaseId: testCase2.id,
    agentName: agent.config.name
  });
  throwIfAborted("Test execution aborted");
  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput2(testCase2.input, {
      basePath: process.cwd()
    });
  } catch (cause) {
    return createFailedResult(
      testCase2,
      EvalError.from(cause, "FILE_READ_ERROR", errorContext())
    );
  }
  const startedAt = performance.now();
  let output;
  let tokenUsage = ZERO_TOKEN_USAGE;
  let error;
  try {
    const agentResult = await agent.execute(resolvedInput);
    output = agentResult.result;
    tokenUsage = agentResult.metadata?.tokenUsage || tokenUsage;
  } catch (cause) {
    error = EvalError.from(cause, "AGENT_EXECUTION_ERROR", errorContext());
    output = void 0;
  }
  // Latency covers the agent call only, not file resolution or judging.
  const latencyMs = performance.now() - startedAt;
  const base = {
    kind: "single-turn",
    testCase: testCase2,
    output,
    metrics: { latencyMs, tokenUsage },
    error
  };
  if (error) {
    return {
      ...base,
      verdicts: [],
      overallScore: 0,
      passed: false,
      judgeMetadata: void 0
    };
  }
  throwIfAborted("Test execution aborted before evaluation");
  // The judge receives the original (unresolved) test case input.
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output,
    agentDescription,
    files: testCase2.files
  });
  return {
    ...base,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: judgeResult.passed,
    judgeMetadata: judgeResult.metadata
  };
}
2247
/**
 * Builds the single-turn result for a test case that failed before the agent
 * produced output: no output, zero metrics, score 0, not passed.
 *
 * @param {object} testCase2 - The test case that failed.
 * @param {Error} error - The failure to attach to the result.
 * @returns {object} A failed result with kind "single-turn".
 */
function createFailedResult(testCase2, error) {
  const metrics = { latencyMs: 0, tokenUsage: ZERO_TOKEN_USAGE };
  const failure = { verdicts: [], overallScore: 0, passed: false, judgeMetadata: void 0 };
  return {
    kind: "single-turn",
    testCase: testCase2,
    output: void 0,
    metrics,
    error,
    ...failure
  };
}
2260
/**
 * Converts a multi-turn execution result into the common result shape,
 * tagging it with kind "multi-turn" and exposing the termination reason at
 * the top level.
 *
 * @param {object} result - Result produced by the multi-turn executor.
 * @returns {object} Result with kind "multi-turn".
 */
function toMultiTurnResult(result) {
  const {
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    termination
  } = result;
  return {
    kind: "multi-turn",
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    terminationReason: termination.reason,
    termination
  };
}
2276
/**
 * Executes test cases with a bounded number running at the same time.
 *
 * Results are stored by test-case index and returned in that order, with
 * entries for test cases that never ran left out. An abort of the external
 * signal, a failing result when `stopOnFirstFailure` is set, or a thrown
 * error all stop further test cases from starting and abort the signal
 * passed to running ones.
 *
 * @param {object[]} testCases2 - Test cases to run.
 * @param {object} context - Execution context forwarded to each test case.
 * @param {object} [options]
 * @param {number} [options.concurrency=1] - Max test cases in flight; must be >= 1.
 * @param {boolean} [options.stopOnFirstFailure=false] - Stop after the first failed result.
 * @param {AbortSignal} [options.signal] - External cancellation signal.
 * @returns {Promise<object[]>} Results of the test cases that completed.
 * @throws {EvalError} INVALID_CONFIG when `concurrency` is not a number >= 1.
 * @throws {Error} The first non-abort error thrown by a test case.
 */
async function runWithConcurrency(testCases2, context, options = {}) {
  const { concurrency = 1, stopOnFirstFailure = false, signal } = options;
  // `!(x >= 1)` also rejects NaN. `x < 1` let NaN through, after which the
  // semaphore never granted a slot and the run hung forever.
  if (!(concurrency >= 1)) {
    throw new EvalError("Concurrency must be at least 1", {
      code: "INVALID_CONFIG",
      context: { concurrency }
    });
  }
  if (testCases2.length === 0) {
    return [];
  }
  const semaphore = createSemaphore(concurrency);
  const results = [];
  let shouldStop = false;
  let firstError;
  const internalAbort = new AbortController();
  const propagateExternalAbort = () => {
    shouldStop = true;
    internalAbort.abort();
  };
  signal?.addEventListener("abort", propagateExternalAbort);
  if (signal?.aborted) {
    // Already aborted: the listener will not fire, so stop up front.
    shouldStop = true;
  }
  try {
    const executeOne = async (testCase2, index) => {
      if (shouldStop) return;
      await semaphore.acquire();
      try {
        // Re-check: the run may have been stopped while this task was queued.
        if (shouldStop) return;
        const result = await executeTestCaseByType(testCase2, context, internalAbort.signal);
        results[index] = result;
        if (stopOnFirstFailure && !result.passed) {
          shouldStop = true;
          internalAbort.abort();
        }
      } catch (e) {
        // Abort errors are a consequence of stopping, not a failure to report.
        if (!firstError && !isAbortError(e)) {
          firstError = e instanceof Error ? e : new Error(String(e));
        }
        shouldStop = true;
        internalAbort.abort();
      } finally {
        semaphore.release();
      }
    };
    const promises = testCases2.map((tc, i) => executeOne(tc, i));
    await Promise.allSettled(promises);
    if (firstError) {
      throw firstError;
    }
    return results.filter((r) => r !== void 0);
  } finally {
    signal?.removeEventListener("abort", propagateExternalAbort);
  }
}
2332
/**
 * Tells whether a thrown value represents a cancellation.
 *
 * Matches any error object named "AbortError" (the DOMException raised by
 * AbortSignal-aware APIs as well as plain `Error` subclasses that use that
 * name) and EvalError instances whose context reason is "aborted". Checking
 * the name instead of `instanceof DOMException` also avoids depending on the
 * `DOMException` global being defined.
 *
 * @param {unknown} e - The caught value.
 * @returns {boolean} True when `e` signals an abort.
 */
function isAbortError(e) {
  if (e === null || typeof e !== "object") {
    return false;
  }
  if (e.name === "AbortError") {
    return true;
  }
  return e instanceof EvalError && e.context?.reason === "aborted";
}
2335
/**
 * Dispatches a test case to the single-turn or multi-turn executor and
 * returns the result in the common result shape.
 *
 * @param {object} testCase2 - Test case to execute.
 * @param {object} context - Execution context.
 * @param {AbortSignal} [signal] - Cancellation signal.
 * @returns {Promise<object>} The test result.
 */
async function executeTestCaseByType(testCase2, context, signal) {
  if (!isMultiTurnTestCase(testCase2)) {
    return executeTestCase(testCase2, context, signal);
  }
  const rawResult = await executeMultiTurnTestCase(testCase2, context, { signal });
  return toMultiTurnResult(rawResult);
}
2342
+
2343
+ // src/core/types.ts
2344
/**
 * Tells whether a result came from a multi-turn test, iterated or not.
 *
 * @param {{ kind: string }} result
 * @returns {boolean}
 */
function isMultiTurnResult(result) {
  const multiTurnKinds = ["multi-turn", "multi-turn-iterated"];
  return multiTurnKinds.includes(result.kind);
}
2347
+
2348
+ // src/core/iteration.ts
2349
/**
 * Computes score statistics over the iterations of one test case: mean,
 * population standard deviation, min, max and pass rate.
 *
 * @param {Array<{ overallScore: number, passed: boolean }>} results
 * @returns {{ iterations: number, scores: number[], mean: number, stdDev: number,
 *   min: number, max: number, passRate: number, passCount: number }}
 *   All-zero statistics when `results` is empty.
 */
function calculateIterationStats(results) {
  const iterations = results.length;
  if (iterations === 0) {
    return { iterations: 0, scores: [], mean: 0, stdDev: 0, min: 0, max: 0, passRate: 0, passCount: 0 };
  }
  const scores = [];
  let passCount = 0;
  for (const run of results) {
    scores.push(run.overallScore);
    if (run.passed) {
      passCount += 1;
    }
  }
  let total = 0;
  for (const score of scores) {
    total += score;
  }
  const mean = total / scores.length;
  let squaredDeviations = 0;
  for (const score of scores) {
    squaredDeviations += Math.pow(score - mean, 2);
  }
  // Divides by N (population variance), not N - 1.
  const stdDev = Math.sqrt(squaredDeviations / scores.length);
  return {
    iterations,
    scores,
    mean,
    stdDev,
    min: Math.min(...scores),
    max: Math.max(...scores),
    passRate: passCount / iterations,
    passCount
  };
}
2378
/**
 * Extends the base iteration statistics with turn counts (average, min, max)
 * and a tally of termination types across iterations.
 *
 * @param {Array<{ totalTurns: number, termination: { terminationType?: string } }>} results
 * @returns {object} Base statistics plus `avgTurns`, `minTurns`, `maxTurns`
 *   and `terminationCounts`.
 */
function calculateMultiTurnIterationStats(results) {
  const baseStats = calculateIterationStats(results);
  const turns = results.map((run) => run.totalTurns);
  const terminationCounts = {};
  for (const run of results) {
    const type = run.termination.terminationType;
    // Iterations without a termination type are not counted.
    if (!type) {
      continue;
    }
    terminationCounts[type] = (terminationCounts[type] || 0) + 1;
  }
  const hasTurns = turns.length > 0;
  let turnTotal = 0;
  for (const count of turns) {
    turnTotal += count;
  }
  return {
    ...baseStats,
    avgTurns: hasTurns ? turnTotal / turns.length : 0,
    minTurns: hasTurns ? Math.min(...turns) : 0,
    maxTurns: hasTurns ? Math.max(...turns) : 0,
    terminationCounts
  };
}
2396
/**
 * Picks the result whose overall score is closest to the given mean; on a
 * tie the earliest result wins.
 *
 * @param {Array<{ overallScore: number }>} results - Non-empty list of results.
 * @param {number} mean - Target score.
 * @returns {object} The closest result.
 * @throws {EvalError} UNKNOWN_ERROR when `results` is empty.
 */
function selectRepresentativeResult(results, mean) {
  if (results.length === 0) {
    throw new EvalError("Cannot select representative result from empty array", {
      code: "UNKNOWN_ERROR"
    });
  }
  const distance = (candidate) => Math.abs(candidate.overallScore - mean);
  let closest = results[0];
  for (let i = 1; i < results.length; i++) {
    const candidate = results[i];
    // Strictly closer only, so earlier results win ties.
    if (distance(candidate) < distance(closest)) {
      closest = candidate;
    }
  }
  return closest;
}
2408
/**
 * Collapses the results of several iterations into one aggregated result per
 * test case.
 *
 * Test cases are matched by position across iterations. The aggregated score
 * is the mean over iterations, `passed` is decided by majority pass rate, and
 * the remaining fields are copied from the iteration closest to the mean.
 * A test case with any multi-turn iteration yields a "multi-turn-iterated"
 * result, otherwise a "single-turn-iterated" one.
 *
 * @param {object[][]} allIterationResults - One result array per iteration.
 * @returns {object[]} One aggregated result per test case of the first iteration.
 */
function aggregateIterationResults(allIterationResults) {
  if (allIterationResults.length === 0) {
    return [];
  }
  const testCount = allIterationResults[0].length;
  return Array.from({ length: testCount }, (_, testIndex) => {
    const runs = allIterationResults.map((iteration) => iteration[testIndex]);
    const stats = calculateIterationStats(runs);
    const representative = selectRepresentativeResult(runs, stats.mean);
    const multiTurnRuns = runs.filter((run) => isMultiTurnResult(run));
    const isMultiTurn = multiTurnRuns.length > 0;
    const common = {
      kind: isMultiTurn ? "multi-turn-iterated" : "single-turn-iterated",
      testCase: representative.testCase,
      output: representative.output,
      metrics: representative.metrics,
      verdicts: representative.verdicts,
      error: representative.error,
      overallScore: stats.mean,
      passed: stats.passRate >= SCORE.MAJORITY_PASS_THRESHOLD,
      iterationStats: stats,
      iterationResults: runs
    };
    if (!isMultiTurn) {
      return common;
    }
    return {
      ...common,
      conversationHistory: representative.conversationHistory,
      totalTurns: representative.totalTurns,
      terminationReason: representative.terminationReason,
      termination: representative.termination,
      multiTurnIterationStats: calculateMultiTurnIterationStats(multiTurnRuns)
    };
  });
}
2461
/**
 * Keeps only the results that aggregate several iterations.
 *
 * @param {Array<{ kind: string }>} results
 * @returns {object[]} Results of kind "single-turn-iterated" or "multi-turn-iterated".
 */
function filterIteratedResults(results) {
  const iteratedKinds = ["single-turn-iterated", "multi-turn-iterated"];
  return results.filter(({ kind }) => iteratedKinds.includes(kind));
}
2466
/**
 * Averages one iteration statistic over all iterated results.
 *
 * @param {object[]} results - Mixed results; non-iterated ones are ignored.
 * @param {(stats: object) => number} selector - Picks the statistic to average.
 * @returns {number | undefined} The average, or undefined when there are no
 *   iterated results.
 */
function averageIterationStat(results, selector) {
  const iterated = filterIteratedResults(results);
  const count = iterated.length;
  if (count === 0) {
    return void 0;
  }
  let total = 0;
  for (const { iterationStats } of iterated) {
    total += selector(iterationStats);
  }
  return total / count;
}
2474
/**
 * Averages the score standard deviation over all iterated results.
 *
 * @param {object[]} results
 * @returns {number | undefined} Undefined when no result is iterated.
 */
function calculateAvgStdDev(results) {
  const pickStdDev = ({ stdDev }) => stdDev;
  return averageIterationStat(results, pickStdDev);
}
2477
/**
 * Averages the per-test pass rate over all iterated results.
 *
 * @param {object[]} results
 * @returns {number | undefined} Undefined when no result is iterated.
 */
function calculateAvgPassRate(results) {
  const pickPassRate = ({ passRate }) => passRate;
  return averageIterationStat(results, pickPassRate);
}
2480
+
2481
+ // src/core/suite.ts
2482
/**
 * Aggregates per-test metrics into the average latency and the total number
 * of tokens used.
 *
 * @param {Array<{ metrics: { latencyMs: number, tokenUsage: { totalTokens: number } } }>} results
 * @returns {{ avgLatencyMs: number, totalTokens: number }} Zeros for an empty list.
 */
function calculateAggregatedMetrics(results) {
  const count = results.length;
  if (count === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  let latencyTotal = 0;
  for (const { metrics } of results) {
    latencyTotal += metrics.latencyMs;
  }
  let totalTokens = 0;
  for (const { metrics } of results) {
    totalTokens += metrics.tokenUsage.totalTokens;
  }
  return { avgLatencyMs: latencyTotal / count, totalTokens };
}
2493
/**
 * Sums the values a selector extracts from each item.
 *
 * @template T
 * @param {T[]} items
 * @param {(item: T) => number} selector
 * @returns {number} The total; 0 for an empty list.
 */
function sumBy(items, selector) {
  let total = 0;
  items.forEach((item) => {
    total += selector(item);
  });
  return total;
}
2496
/**
 * Builds the report summary: test counts, average score and aggregated
 * metrics. When more than one iteration was run, the iteration count and the
 * averaged standard deviation and pass rate are added.
 *
 * @param {Array<{ passed: boolean, overallScore: number }>} results
 * @param {number} [iterations] - Number of iterations that were run.
 * @returns {object} The summary.
 */
function calculateSummary(results, iterations) {
  const metrics = calculateAggregatedMetrics(results);
  const totalTests = results.length;
  let passed = 0;
  let scoreTotal = 0;
  for (const result of results) {
    if (result.passed) {
      passed += 1;
    }
    scoreTotal += result.overallScore;
  }
  const summary = {
    totalTests,
    passed,
    failed: totalTests - passed,
    avgScore: totalTests > 0 ? scoreTotal / totalTests : 0,
    metrics
  };
  if (iterations && iterations > 1) {
    summary.iterations = iterations;
    summary.avgStdDev = calculateAvgStdDev(results);
    summary.avgPassRate = calculateAvgPassRate(results);
  }
  return summary;
}
2516
/**
 * Creates an evaluation suite bound to an agent, a judge and an optional
 * improver.
 *
 * `run` executes the test cases (once, or aggregated over several
 * iterations), summarises the results and, when an improver is configured,
 * collects its suggestions. `withAgent` returns a new suite for a different
 * agent, dropping any explicit agent description.
 *
 * @param {object} config - Suite config with `agent`, `judge`, and optional
 *   `agentDescription` / `improver`.
 * @returns {{ run: Function, withAgent: Function }}
 */
function createEvalSuite(config) {
  const { agent, agentDescription, judge, improver } = config;
  const description = agentDescription ?? agent.config.description ?? agent.config.name;
  return {
    async run(testCases2, options) {
      const iterations = options?.iterations ?? 1;
      validateIterations(iterations);
      const executeContext = { agent, judge, agentDescription: description };
      const isIterated = iterations > 1;
      let results;
      if (iterations <= 1) {
        results = await runWithConcurrency(testCases2, executeContext, options);
      } else {
        results = await runMultipleIterations(testCases2, executeContext, options, iterations);
      }
      const summary = calculateSummary(results, isIterated ? iterations : void 0);
      let suggestions = [];
      if (improver) {
        const improvement = await improver.improve(agent.prompt, results);
        suggestions = improvement.suggestions;
      }
      return {
        summary,
        results,
        suggestions,
        generatedAt: /* @__PURE__ */ new Date(),
        promptVersion: agent.prompt.version
      };
    },
    withAgent(newAgent) {
      // The description is reset so it is derived from the new agent's config.
      return createEvalSuite({ ...config, agent: newAgent, agentDescription: void 0 });
    }
  };
}
2545
/**
 * Ensures the iteration count is a positive integer.
 *
 * @param {number} iterations - Requested number of iterations.
 * @throws {EvalError} INVALID_CONFIG when the value is not an integer >= 1.
 */
function validateIterations(iterations) {
  const isPositiveInteger = Number.isInteger(iterations) && iterations >= 1;
  if (isPositiveInteger) {
    return;
  }
  throw new EvalError(
    `Invalid iterations value: ${iterations}. Must be a positive integer.`,
    { code: "INVALID_CONFIG", context: { iterations } }
  );
}
2553
/**
 * Runs the whole set of test cases `iterations` times, one iteration after
 * another, and aggregates the per-iteration results per test case.
 *
 * @param {object[]} testCases2 - Test cases to run.
 * @param {object} executeContext - Execution context for each run.
 * @param {object} [options] - Run options; `iterations` is cleared before
 *   they are forwarded.
 * @param {number} iterations - Number of times to run the test cases.
 * @returns {Promise<object[]>} Aggregated results, one per test case.
 */
async function runMultipleIterations(testCases2, executeContext, options, iterations) {
  const collected = [];
  let remaining = iterations;
  while (remaining > 0) {
    const runOptions = { ...options, iterations: void 0 };
    collected.push(await runWithConcurrency(testCases2, executeContext, runOptions));
    remaining -= 1;
  }
  return aggregateIterationResults(collected);
}
2565
+
2566
+ // src/index.ts
2567
+ import {
2568
+ resolveFileSource,
2569
+ resolveFileSourcesInInput as resolveFileSourcesInInput3,
2570
+ scanForFileSources,
2571
+ getFileSourceDisplayInfo,
2572
+ getFileSourcesDisplayInfo as getFileSourcesDisplayInfo2,
2573
+ inferMediaType,
2574
+ isFileSource,
2575
+ isFileSourcePath,
2576
+ isFileSourceData,
2577
+ isFileSourceBase64,
2578
+ isFileSourceUrl
2579
+ } from "@agtlantis/core";
2580
+
2581
+ // src/judge/llm-judge.ts
2582
+ import { Output } from "ai";
2583
+ import { z as z3 } from "zod";
2584
+
2585
+ // src/judge/prompts/default.ts
2586
// Default judge prompt: a fixed system prompt describing the scoring rubric
// and the required JSON response shape, plus a renderer that builds the user
// prompt from the evaluation context.
var defaultJudgePrompt = {
  id: "default-judge",
  version: "2.0.0",
  system: `You are an expert evaluator specializing in assessing AI Agent outputs.

Your role is to fairly and thoroughly evaluate the agent's output against the provided criteria.

## Evaluation Principles

1. **Scoring**: Assign a score between 0-100 for each criterion
- 90-100: Exceptional - Exceeds expectations with no significant issues
- 70-89: Good - Meets expectations with minor issues
- 50-69: Acceptable - Partially meets expectations, notable issues present
- 30-49: Poor - Falls short of expectations, significant issues
- 0-29: Failing - Does not meet minimum requirements

2. **Reasoning**: Always provide specific, evidence-based reasoning
- Quote or reference specific parts of the output
- Explain both strengths and weaknesses
- Be constructive and actionable in feedback

3. **Objectivity**: Evaluate based solely on the criteria provided
- Avoid personal preferences or unstated requirements
- Consider the agent's intended purpose and context
- Weight severity of issues proportionally

## Response Format

You MUST respond with valid JSON only. No additional text or explanation outside the JSON structure.

{
"verdicts": [
{
"criterionId": "criterion-id",
"score": 0-100,
"reasoning": "Detailed explanation with specific evidence from the output",
"passed": true/false
}
]
}`,
  /**
   * Builds the user prompt: agent description, input, optional reference
   * files, output and the list of criteria to evaluate.
   */
  renderUserPrompt: (ctx) => {
    const fileSection = buildFileSection(ctx.files);
    const describeCriterion = (criterion) => `- **${criterion.name}** (id: ${criterion.id}, weight: ${criterion.weight ?? 1}): ${criterion.description}`;
    const prompt = `
## Agent Under Evaluation
${ctx.agentDescription}

## Input Provided to Agent
\`\`\`json
${JSON.stringify(ctx.input, null, 2)}
\`\`\`
${fileSection}
## Agent Output
\`\`\`json
${JSON.stringify(ctx.output, null, 2)}
\`\`\`

## Evaluation Criteria
${ctx.criteria.map(describeCriterion).join("\n")}

Please evaluate the agent's output against each criterion listed above.`;
    return prompt.trim();
  }
};
2648
/**
 * Renders the "Reference Files" section of the judge prompt, one fenced
 * block per file.
 *
 * @param {Array<{ path: string, content: string }>} [files]
 * @returns {string} The section text, or "" when there are no files.
 */
function buildFileSection(files) {
  if (!files || files.length === 0) {
    return "";
  }
  const fence = "```";
  const blocks = files.map(({ path, content }) => `### ${path}\n${fence}\n${content}\n${fence}`);
  return `\n## Reference Files\n${blocks.join("\n\n")}\n`;
}
2660
+
2661
+ // src/judge/llm-judge.ts
2662
/**
 * Normalises a provider usage record, replacing missing (null or undefined)
 * counters with 0.
 *
 * @param {{ inputTokens?: number, outputTokens?: number, totalTokens?: number }} usage
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 */
function toEvalTokenUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return {
    inputTokens: inputTokens ?? 0,
    outputTokens: outputTokens ?? 0,
    totalTokens: totalTokens ?? 0
  };
}
2669
/**
 * Tells whether a criterion carries a programmatic validator function.
 *
 * @param {object} criterion
 * @returns {boolean} True when `criterion.validator` exists and is a function.
 */
function hasValidator(criterion) {
  if (!("validator" in criterion)) {
    return false;
  }
  return typeof criterion.validator === "function";
}
2672
// Shape the judge LLM must return: one verdict per criterion with a bounded
// score, its reasoning and an optional explicit pass flag.
var JudgeResponseSchema = z3.object({
  verdicts: z3.array(
    z3.object({
      criterionId: z3.string(),
      // Scores outside [SCORE.MIN, SCORE.MAX] are rejected.
      score: z3.number().min(SCORE.MIN).max(SCORE.MAX),
      reasoning: z3.string(),
      // May be omitted by the model.
      passed: z3.boolean().optional()
    })
  )
});
2682
/**
 * Verifies that the judge returned a verdict for every expected criterion.
 *
 * @param {Array<{ criterionId: string }>} verdicts - Verdicts returned by the judge.
 * @param {string[]} criteriaIds - Criterion ids that must be covered.
 * @throws {EvalError} VERDICT_PARSE_ERROR listing the missing and provided ids.
 */
function validateAllCriteriaHaveVerdicts(verdicts, criteriaIds) {
  const seen = /* @__PURE__ */ new Set();
  for (const verdict of verdicts) {
    seen.add(verdict.criterionId);
  }
  const missing = criteriaIds.filter((id) => !seen.has(id));
  if (missing.length === 0) {
    return;
  }
  throw new EvalError("Judge response missing verdicts for some criteria", {
    code: "VERDICT_PARSE_ERROR",
    context: { missingCriteriaIds: missing, providedIds: Array.from(seen) }
  });
}
2692
/**
 * Computes the weighted average of verdict scores, rounded to two decimals.
 *
 * @param {Array<{ criterionId: string, score: number }>} verdicts
 * @param {Map<string, number>} criteriaWeights - Weight per criterion id;
 *   criteria without an entry weigh 1.
 * @returns {number} The weighted average, or 0 when the total weight is 0.
 */
function calculateOverallScore(verdicts, criteriaWeights) {
  const totals = { weight: 0, weighted: 0 };
  verdicts.forEach(({ criterionId, score }) => {
    const weight = criteriaWeights.get(criterionId) ?? 1;
    totals.weighted += score * weight;
    totals.weight += weight;
  });
  if (totals.weight === 0) {
    return 0;
  }
  const average = totals.weighted / totals.weight;
  return Math.round(average * 100) / 100;
}
2705
/**
 * Evaluates validator-based criteria against the agent output. A valid
 * outcome scores 100 and passes; an invalid one scores 0 and fails, with the
 * validator's error summary (or a default message) in the reasoning.
 *
 * @param {Array<{ id: string, name: string, validator: (output: unknown) =>
 *   { valid: boolean, errorSummary?: string } }>} validatorCriteria
 * @param {unknown} output - Agent output to validate.
 * @returns {Array<{ criterionId: string, score: number, reasoning: string, passed: boolean }>}
 */
function runValidatorCriteria(validatorCriteria, output) {
  const toVerdict = (criterion) => {
    const outcome = criterion.validator(output);
    const passed = Boolean(outcome.valid);
    const reasoning = passed
      ? `${criterion.name} \uD1B5\uACFC`
      : `${criterion.name} \uC2E4\uD328:\n${outcome.errorSummary ?? "\uC720\uD6A8\uC131 \uAC80\uC99D \uC624\uB958"}`;
    return {
      criterionId: criterion.id,
      score: passed ? 100 : 0,
      reasoning,
      passed
    };
  };
  return validatorCriteria.map((criterion) => toVerdict(criterion));
}
2725
/**
 * Asks the judge model to score the LLM-evaluated criteria.
 *
 * @param {object} provider - Provider exposing `simpleExecution(fn)`.
 * @param {object} prompt - Judge prompt with `system`, `renderUserPrompt`, `id`, `version`.
 * @param {object} context - Context passed to `prompt.renderUserPrompt`.
 * @param {string[]} llmCriteriaIds - Ids that must each receive a verdict.
 * @param {number} passThreshold - Score at or above which a verdict passes when the model gave no `passed` flag.
 * @returns {Promise<{verdicts: object[], usage: *}>} Normalized verdicts and the execution's LLM usage.
 * @throws {EvalError} LLM_API_ERROR when the execution fails or is canceled;
 *   whatever `validateAllCriteriaHaveVerdicts` throws when verdicts are missing.
 */
async function runLLMEvaluation(provider, prompt, context, llmCriteriaIds, passThreshold) {
  // The user prompt is rendered outside the try block, so rendering errors are not wrapped.
  const systemMessage = { role: "system", content: prompt.system };
  const userMessage = { role: "user", content: prompt.renderUserPrompt(context) };
  const messages = [systemMessage, userMessage];

  const askJudge = async (session) => {
    const { output } = await session.generateText({
      messages,
      output: Output.object({ schema: JudgeResponseSchema })
    });
    return output;
  };

  let response;
  let usage;
  try {
    const outcome = await provider.simpleExecution(askJudge).result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    response = outcome.value;
    usage = outcome.summary.totalLLMUsage;
  } catch (cause) {
    throw EvalError.from(cause, "LLM_API_ERROR", {
      promptId: prompt.id,
      promptVersion: prompt.version
    });
  }

  validateAllCriteriaHaveVerdicts(response.verdicts, llmCriteriaIds);
  const verdicts = response.verdicts.map(({ criterionId, score, reasoning, passed }) => ({
    criterionId,
    score,
    reasoning,
    passed: passed ?? score >= passThreshold
  }));
  return { verdicts, usage };
}
2761
/**
 * Builds a judge that scores agent output against a set of criteria.
 * Criteria carrying a validator function are checked programmatically; the
 * rest are sent to the LLM in a single evaluation call.
 *
 * @param {object} config - Judge configuration.
 * @param {object} config.provider - LLM provider used for non-validator criteria.
 * @param {object} [config.prompt] - Judge prompt; defaults to `defaultJudgePrompt`.
 * @param {object[]} config.criteria - Criteria to evaluate (each with `id`, optional `weight`).
 * @param {number} [config.passThreshold] - Passing score; defaults to `SCORE.DEFAULT_PASS_THRESHOLD`.
 * @param {string} [config.model] - Model name echoed into result metadata.
 * @returns {{evaluate: Function}} Judge exposing an async `evaluate(evalContext)`.
 */
function createJudge(config) {
  const {
    provider,
    prompt = defaultJudgePrompt,
    criteria,
    passThreshold = SCORE.DEFAULT_PASS_THRESHOLD,
    model
  } = config;

  // Partition criteria once, up front, and remember each criterion's weight.
  const criteriaWeights = new Map();
  const validatorCriteria = [];
  const llmCriteria = [];
  const llmCriteriaIds = [];
  for (const criterion of criteria) {
    criteriaWeights.set(criterion.id, criterion.weight ?? 1);
    if (!hasValidator(criterion)) {
      llmCriteria.push(criterion);
      llmCriteriaIds.push(criterion.id);
      continue;
    }
    validatorCriteria.push(criterion);
  }

  return {
    async evaluate(evalContext) {
      const { input, output, agentDescription, files } = evalContext;
      const validatorVerdicts = runValidatorCriteria(validatorCriteria, output);

      let llmVerdicts = [];
      let llmUsage;
      // Skip the LLM round-trip entirely when every criterion is validator-based.
      if (llmCriteria.length > 0) {
        const judged = await runLLMEvaluation(
          provider,
          prompt,
          { agentDescription, input, output, criteria: llmCriteria, files },
          llmCriteriaIds,
          passThreshold
        );
        llmVerdicts = judged.verdicts;
        llmUsage = judged.usage;
      }

      const verdicts = [...validatorVerdicts, ...llmVerdicts];
      const overallScore = calculateOverallScore(verdicts, criteriaWeights);
      return {
        verdicts,
        overallScore,
        passed: overallScore >= passThreshold,
        metadata: llmUsage ? { tokenUsage: toEvalTokenUsage(llmUsage), model } : undefined
      };
    }
  };
}
2819
+
2820
+ // src/reporter/json-reporter.ts
2821
+ import { writeFileSync } from "fs";
2822
+
2823
+ // src/reporter/cost-helpers.ts
2824
+ import {
2825
+ calculateCostFromUsage
2826
+ } from "@agtlantis/core";
2827
/**
 * Projects an eval token-usage record onto the three counters the cost
 * calculator reads, dropping any other properties.
 *
 * @param {{inputTokens: number, outputTokens: number, totalTokens: number}} usage - Source usage record.
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}} Fresh object with only the counters.
 */
function toLanguageModelUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
2834
/**
 * Maps provider names as they appear in result metadata to the provider keys
 * used for pricing lookup ("gemini" is an alias of "google").
 */
var PROVIDER_MAPPING = Object.fromEntries([
  ["gemini", "google"],
  ["openai", "openai"],
  ["anthropic", "anthropic"],
  ["google", "google"]
]);
2840
/**
 * Guesses the pricing provider from a model name prefix.
 *
 * @param {string} [model] - Model identifier such as "gpt-4o" or "claude-3-opus".
 * @returns {"openai"|"google"|"anthropic"} Detected provider; "google" when the
 *   model is missing or matches no known prefix.
 */
function detectProvider(model) {
  if (!model) {
    return "google";
  }
  const prefixRules = [
    [["gpt-", "o1", "o3"], "openai"],
    [["gemini-"], "google"],
    [["claude-"], "anthropic"]
  ];
  for (const [prefixes, provider] of prefixRules) {
    if (prefixes.some((prefix) => model.startsWith(prefix))) {
      return provider;
    }
  }
  return "google";
}
2853
/**
 * Normalizes a provider name to its pricing key.
 *
 * @param {string} [provider] - Provider name from metadata.
 * @returns {string} Mapped key from `PROVIDER_MAPPING`, the input unchanged when
 *   it has no mapping, or "google" when no provider was given.
 */
function normalizeProvider(provider) {
  return provider ? PROVIDER_MAPPING[provider] ?? provider : "google";
}
2857
/**
 * Prices one component's token usage.
 *
 * @param {object} [tokenUsage] - Token counters; when missing no cost is computed.
 * @param {string} [model] - Model name; "unknown" is passed to the calculator when absent.
 * @param {string} [provider] - Explicit provider; when absent it is detected from the model name.
 * @param {object} [config] - Optional pricing config with a `providerPricing` map.
 * @returns {number|undefined} The `total` reported by `calculateCostFromUsage`, or undefined without usage.
 */
function calculateComponentCost(tokenUsage, model, provider, config) {
  if (!tokenUsage) {
    return undefined;
  }
  let providerKey;
  if (provider) {
    providerKey = normalizeProvider(provider);
  } else {
    providerKey = detectProvider(model);
  }
  const pricing = config?.providerPricing?.[providerKey];
  const { total } = calculateCostFromUsage(
    toLanguageModelUsage(tokenUsage),
    model ?? "unknown",
    providerKey,
    pricing
  );
  return total;
}
2869
/**
 * Adds a `total` to a per-component cost record.
 *
 * @param {{agent?: number, judge?: number, improver?: number}} costs - Component costs; missing entries count as 0.
 * @returns {object} Copy of `costs` plus `total`, which is undefined unless the sum is positive.
 */
function buildCostBreakdown(costs) {
  const agent = costs.agent ?? 0;
  const judge = costs.judge ?? 0;
  const improver = costs.improver ?? 0;
  const sum = agent + judge + improver;
  const breakdown = { ...costs };
  breakdown.total = sum > 0 ? sum : undefined;
  return breakdown;
}
2876
/**
 * Computes the agent and judge cost of a single evaluated result.
 *
 * @param {object} result - Result with `metrics.tokenUsage` and optional `agentMetadata` / `judgeMetadata`.
 * @param {object} [config] - Optional pricing config forwarded to the component calculator.
 * @returns {object} Cost breakdown produced by `buildCostBreakdown`.
 */
function calculateResultCost(result, config) {
  const { metrics, agentMetadata, judgeMetadata } = result;
  const agent = calculateComponentCost(
    metrics.tokenUsage,
    agentMetadata?.model,
    agentMetadata?.provider,
    config
  );
  let judge;
  // The judge is priced only when it actually reported token usage.
  if (judgeMetadata?.tokenUsage) {
    judge = calculateComponentCost(
      judgeMetadata.tokenUsage,
      judgeMetadata.model,
      judgeMetadata.provider,
      config
    );
  }
  return buildCostBreakdown({ agent, judge });
}
2894
/**
 * Sums agent and judge costs over every result in a report.
 *
 * @param {{results: object[]}} report - Report whose results are priced individually.
 * @param {object} [config] - Optional pricing config forwarded to `calculateResultCost`.
 * @returns {{total: number, byComponent: {agent: number, judge: number}}} Totals; components without a cost count as 0.
 */
function calculateReportCosts(report, config) {
  const byComponent = { agent: 0, judge: 0 };
  for (const entry of report.results) {
    const { agent, judge } = calculateResultCost(entry, config);
    byComponent.agent += agent ?? 0;
    byComponent.judge += judge ?? 0;
  }
  return { total: byComponent.agent + byComponent.judge, byComponent };
}
2910
+
2911
+ // src/reporter/format-utils.ts
2912
+ import { mkdirSync } from "fs";
2913
+ import path from "path";
2914
+
2915
+ // src/reporter/markdown-reporter.ts
2916
+ import { writeFileSync as writeFileSync2 } from "fs";
2917
+
2918
+ // src/reporter/cycle-json.ts
2919
+ import { writeFileSync as writeFileSync3, mkdirSync as mkdirSync2 } from "fs";
2920
+ import path2 from "path";
2921
+
2922
+ // src/reporter/cycle-markdown.ts
2923
+ import { writeFileSync as writeFileSync4 } from "fs";
2924
+
2925
+ // src/improver/utils.ts
2926
+ import { compileTemplate } from "@agtlantis/core";
2927
/**
 * Replaces the first match of `search` with `replacement` taken literally.
 * Supplying the replacement through a function prevents `String.prototype.replace`
 * from interpreting `$&`, `$1`, `$$` and similar patterns inside it.
 *
 * @param {string} str - Text to search in.
 * @param {string|RegExp} search - Pattern handed to `replace`.
 * @param {string} replacement - Literal replacement text.
 * @returns {string} The resulting string.
 */
function safeReplace(str, search, replacement) {
  const literal = function () {
    return replacement;
  };
  return str.replace(search, literal);
}
2930
/**
 * Increments one component of an `x.y.z` version string.
 *
 * Every component must consist solely of decimal digits. Lenient parsing used
 * to accept inputs such as "1a.2.3" or "1.2.3-beta" and silently rewrite them,
 * and an unrecognized bump kind produced `undefined` as the new version.
 *
 * @param {string} version - Current version in `x.y.z` form.
 * @param {"major"|"minor"|"patch"} bump - Component to increment; lower components reset to 0.
 * @returns {string} The bumped version.
 * @throws {EvalError} SUGGESTION_APPLY_ERROR when `version` is not `x.y.z` or `bump` is not a known kind.
 */
function bumpVersion(version, bump) {
  const segments = typeof version === "string" ? version.split(".") : [];
  const wellFormed = segments.length === 3 && segments.every((segment) => /^\d+$/.test(segment));
  if (!wellFormed) {
    throw new EvalError(
      `Invalid version format: "${version}". Expected semver format (x.y.z)`,
      {
        code: "SUGGESTION_APPLY_ERROR",
        context: { version, expectedFormat: "x.y.z" }
      }
    );
  }
  const [major, minor, patch] = segments.map(Number);
  switch (bump) {
    case "major":
      return `${major + 1}.0.0`;
    case "minor":
      return `${major}.${minor + 1}.0`;
    case "patch":
      return `${major}.${minor}.${patch + 1}`;
    default:
      // Never hand back undefined: callers store the result as the prompt version.
      throw new EvalError(
        `Invalid bump type: "${bump}". Expected "major", "minor", or "patch"`,
        {
          code: "SUGGESTION_APPLY_ERROR",
          context: { version, bump }
        }
      );
  }
}
2951
/**
 * Applies every approved suggestion to a prompt, in order.
 *
 * @param {object} currentPrompt - Prompt to improve; it is not mutated.
 * @param {object[]} suggestions - Candidate suggestions; only those with a truthy `approved` are used.
 * @param {{bumpVersion?: string}} [options] - When `bumpVersion` is set and at least one
 *   suggestion applied, the version is bumped from `currentPrompt.version`.
 * @returns {{prompt: object, appliedCount: number, skipped: Array<{suggestion: object, reason: string}>}}
 *   The resulting prompt, how many suggestions applied, and the ones that could not be applied.
 */
function applyPromptSuggestions(currentPrompt, suggestions, options) {
  const approved = suggestions.filter((candidate) => candidate.approved);
  if (approved.length === 0) {
    // Nothing to do: hand back the very same prompt object.
    return { prompt: currentPrompt, appliedCount: 0, skipped: [] };
  }

  const skipped = [];
  let appliedCount = 0;
  let working = { ...currentPrompt };
  for (const suggestion of approved) {
    const outcome = applySingleSuggestion(working, suggestion);
    if (!outcome.success) {
      skipped.push({ suggestion, reason: outcome.reason });
      continue;
    }
    working = outcome.prompt;
    appliedCount += 1;
  }

  if (options?.bumpVersion && appliedCount > 0) {
    working = {
      ...working,
      version: bumpVersion(currentPrompt.version, options.bumpVersion)
    };
  }
  return { prompt: working, appliedCount, skipped };
}
2984
/**
 * Prompt properties that "parameters" suggestions must never rewrite; every
 * other string-valued property on a prompt is treated as a tunable parameter.
 */
var AGENT_PROMPT_CORE_FIELDS = ["id", "version", "system", "renderUserPrompt", "userTemplate"];
2991
/**
 * Applies one suggestion to a prompt without mutating the input prompt.
 *
 * - "system_prompt": replaces the first occurrence of `currentValue` in `prompt.system`.
 * - "user_prompt": replaces it in `prompt.userTemplate` and recompiles `renderUserPrompt`.
 * - "parameters": replaces it in the first non-core string property that contains it.
 *
 * @param {object} prompt - Prompt to modify.
 * @param {{type: string, currentValue: string, suggestedValue: string}} suggestion - Change to apply.
 * @returns {{success: true, prompt: object}|{success: false, reason: string}} Outcome of the attempt.
 * @throws {EvalError} SUGGESTION_APPLY_ERROR for a "user_prompt" suggestion when
 *   the prompt has no string `userTemplate`.
 */
function applySingleSuggestion(prompt, suggestion) {
  const { type, currentValue, suggestedValue } = suggestion;
  const notApplied = (reason) => ({ success: false, reason });

  if (type === "system_prompt") {
    if (!prompt.system.includes(currentValue)) {
      return notApplied(`currentValue not found in system prompt: "${truncate(currentValue, 50)}"`);
    }
    const system = safeReplace(prompt.system, currentValue, suggestedValue);
    return { success: true, prompt: { ...prompt, system } };
  }

  if (type === "user_prompt") {
    const template = prompt.userTemplate;
    if (typeof template !== "string") {
      throw new EvalError(
        `Cannot apply user_prompt suggestion: prompt does not have a userTemplate field. The renderUserPrompt is a function and cannot be modified directly.`,
        {
          code: "SUGGESTION_APPLY_ERROR",
          context: {
            suggestionType: type,
            hasUserTemplate: "userTemplate" in prompt
          }
        }
      );
    }
    if (!template.includes(currentValue)) {
      return notApplied(`currentValue not found in userTemplate: "${truncate(currentValue, 50)}"`);
    }
    const userTemplate = safeReplace(template, currentValue, suggestedValue);
    return {
      success: true,
      prompt: {
        ...prompt,
        userTemplate,
        // Keep the renderer in sync with the edited template.
        renderUserPrompt: compileTemplate(userTemplate, prompt.id)
      }
    };
  }

  if (type === "parameters") {
    const updatedPrompt = { ...prompt };
    // Only the first matching custom field is rewritten.
    const match = Object.entries(updatedPrompt).find(
      ([key, value]) =>
        !AGENT_PROMPT_CORE_FIELDS.includes(key) &&
        typeof value === "string" &&
        value.includes(currentValue)
    );
    if (!match) {
      return notApplied(`currentValue not found in any parameter field: "${truncate(currentValue, 50)}"`);
    }
    const [field, text] = match;
    updatedPrompt[field] = safeReplace(text, currentValue, suggestedValue);
    return { success: true, prompt: updatedPrompt };
  }

  return notApplied(`Unknown suggestion type: ${type}`);
}
3085
+
3086
+ // src/improver/llm-improver.ts
3087
+ import { Output as Output2 } from "ai";
3088
+ import { z as z4 } from "zod";
3089
+
3090
+ // src/improver/prompts/default.ts
3091
/**
 * Built-in prompt for the improver LLM. `system` describes the role and the
 * JSON response contract; `renderUserPrompt` turns an improver context (agent
 * prompt, evaluated results, aggregated metrics) into the user message.
 */
var defaultImproverPrompt = {
  id: "default-improver",
  version: "2.0.0",
  system: `You are an expert prompt engineer specializing in optimizing AI Agent prompts.

Your role is to analyze test results and evaluation feedback to propose targeted improvements.

## Improvement Principles

1. **Focus on Impact**: Prioritize changes that address the lowest-scoring criteria
- Target specific failure patterns, not general improvements
- One well-crafted change is better than many superficial ones

2. **Be Specific and Actionable**: Provide concrete changes, not vague suggestions
- Show exact text to add, modify, or remove
- Explain the mechanism by which the change will help

3. **Consider Trade-offs**: Evaluate side effects of each change
- Will this fix break other test cases?
- Does it increase prompt length/cost significantly?
- Could it introduce new failure modes?

4. **Maintain Prompt Quality**: Preserve clarity and structure
- Keep prompts readable and maintainable
- Avoid over-engineering or excessive constraints
- Ensure changes align with the agent's core purpose

## Suggestion Priority Levels
- **high**: Critical issues causing test failures, should be addressed immediately
- **medium**: Issues affecting quality scores, recommended for next iteration
- **low**: Minor optimizations, nice-to-have improvements

## Response Format

You MUST respond with valid JSON only. No additional text outside the JSON structure.

{
"suggestions": [
{
"type": "system_prompt" | "user_prompt" | "parameters",
"priority": "high" | "medium" | "low",
"currentValue": "The specific text or value being changed",
"suggestedValue": "The proposed replacement text or value",
"reasoning": "Why this change addresses the identified issue",
"expectedImprovement": "Predicted impact on scores and behavior"
}
]
}`,
  renderUserPrompt: (ctx) => {
    const failedDetails = buildFailedCaseDetails(ctx.evaluatedResults);
    const fence = "```";
    // One array entry per output line; interpolations stay in template
    // literals so non-string values are stringified the same way.
    const lines = [
      "## Current Agent Prompt",
      "",
      "### System Prompt",
      fence,
      `${ctx.agentPrompt.system}`,
      fence,
      "",
      "## Test Results Summary",
      `- Total tests: ${ctx.evaluatedResults.length}`,
      `- Passed: ${ctx.evaluatedResults.filter((r) => r.passed).length}`,
      `- Failed: ${ctx.evaluatedResults.filter((r) => !r.passed).length}`,
      "",
      "## Performance Metrics",
      `- Average latency: ${ctx.aggregatedMetrics.avgLatencyMs}ms`,
      `- Total tokens used: ${ctx.aggregatedMetrics.totalTokens}`,
      "",
      "## Failed/Low-Score Cases Details",
      `${failedDetails}`,
      "",
      "Based on the above results, please propose specific prompt improvements."
    ];
    return lines.join("\n").trim();
  }
};
3164
/**
 * Renders a markdown section for every result that failed or scored below 70.
 *
 * @param {object[]} results - Evaluated results (`passed`, `overallScore`, `testCase`, `output`, `verdicts`).
 * @returns {string} Concatenated sections, or a placeholder sentence when nothing needs attention.
 */
function buildFailedCaseDetails(results) {
  const sections = [];
  for (const r of results) {
    const needsAttention = !r.passed || r.overallScore < 70;
    if (!needsAttention) {
      continue;
    }
    const verdictLines = r.verdicts
      .map((v) => `- ${v.criterionId}: ${v.score}/100 - ${v.reasoning}`)
      .join("\n");
    // Each section begins with an empty line, hence the leading "" entry.
    sections.push(
      [
        "",
        `### ${r.testCase.id ?? "unnamed"} (Score: ${r.overallScore})`,
        `**Input:** ${truncate(JSON.stringify(r.testCase.input), 200)}`,
        `**Output:** ${truncate(JSON.stringify(r.output), 200)}`,
        "**Evaluation:**",
        verdictLines
      ].join("\n")
    );
  }
  if (sections.length === 0) {
    return "(None - all tests passed with acceptable scores)";
  }
  return sections.join("\n");
}
3178
+
3179
+ // src/improver/llm-improver.ts
3180
/**
 * Converts provider-reported LLM usage into the eval token-usage shape,
 * treating null or undefined counters as zero.
 *
 * @param {{inputTokens?: number, outputTokens?: number, totalTokens?: number}} usage - Usage from the provider.
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}} Usage with every counter defined.
 */
function toEvalTokenUsage2(usage) {
  const orZero = (count) => count ?? 0;
  return {
    inputTokens: orZero(usage.inputTokens),
    outputTokens: orZero(usage.outputTokens),
    totalTokens: orZero(usage.totalTokens)
  };
}
3187
/**
 * Zod schema for the structured answer requested from the improver model:
 * a list of suggestions, each naming what to change, its priority, the
 * current and proposed text, and the rationale.
 */
var ImproverResponseSchema = (() => {
  const suggestionShape = z4.object({
    type: z4.enum(["system_prompt", "user_prompt", "parameters"]),
    priority: z4.enum(["high", "medium", "low"]),
    currentValue: z4.string(),
    suggestedValue: z4.string(),
    reasoning: z4.string(),
    expectedImprovement: z4.string()
  });
  return z4.object({ suggestions: z4.array(suggestionShape) });
})();
3199
/**
 * Aggregates latency and token usage across evaluated results.
 *
 * Token usage is treated as optional per result, matching the cost helpers in
 * this file that guard against a missing `metrics.tokenUsage`: a result
 * without usage contributes 0 tokens instead of raising a TypeError.
 *
 * @param {Array<{metrics: {latencyMs: number, tokenUsage?: {totalTokens?: number}}}>} results - Evaluated results.
 * @returns {{avgLatencyMs: number, totalTokens: number}} Rounded mean latency and summed tokens
 *   (both 0 for an empty list).
 */
function aggregateMetrics(results) {
  if (results.length === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  let totalLatency = 0;
  let totalTokens = 0;
  for (const { metrics } of results) {
    totalLatency += metrics.latencyMs;
    totalTokens += metrics.tokenUsage?.totalTokens ?? 0;
  }
  return {
    avgLatencyMs: Math.round(totalLatency / results.length),
    totalTokens
  };
}
3217
/**
 * Builds an improver that asks an LLM for prompt-improvement suggestions.
 *
 * @param {object} config - Improver configuration.
 * @param {object} config.provider - Provider exposing `simpleExecution(fn)`.
 * @param {object} [config.prompt] - Improver prompt; defaults to `defaultImproverPrompt`.
 * @param {string} [config.model] - Model name echoed into result metadata.
 * @returns {{improve: Function}} Improver exposing an async `improve(agentPrompt, results)`
 *   that resolves to `{ suggestions, metadata }` and rejects with an LLM_API_ERROR
 *   `EvalError` when the execution fails or is canceled.
 */
function createImprover(config) {
  const { provider, prompt = defaultImproverPrompt, model } = config;

  return {
    async improve(agentPrompt, results) {
      const context = {
        agentPrompt,
        evaluatedResults: results,
        aggregatedMetrics: aggregateMetrics(results)
      };
      // The prompt is rendered outside the try block, so rendering errors are not wrapped.
      const systemMessage = { role: "system", content: prompt.system };
      const userMessage = { role: "user", content: prompt.renderUserPrompt(context) };
      const messages = [systemMessage, userMessage];

      const askImprover = async (session) => {
        const { output } = await session.generateText({
          messages,
          output: Output2.object({ schema: ImproverResponseSchema })
        });
        return output;
      };

      let response;
      let llmUsage;
      try {
        const outcome = await provider.simpleExecution(askImprover).result();
        if (outcome.status === "failed") {
          throw outcome.error;
        }
        if (outcome.status !== "succeeded") {
          throw new Error("Execution was canceled");
        }
        response = outcome.value;
        llmUsage = outcome.summary.totalLLMUsage;
      } catch (cause) {
        throw EvalError.from(cause, "LLM_API_ERROR", {
          promptId: prompt.id,
          promptVersion: prompt.version
        });
      }

      // Suggestions start out undecided; approval and edits come from a later review step.
      const suggestions = response.suggestions.map((suggestion) => ({
        ...suggestion,
        approved: undefined,
        modified: undefined
      }));
      const metadata = llmUsage ? { tokenUsage: toEvalTokenUsage2(llmUsage), model } : undefined;
      return { suggestions, metadata };
    }
  };
}
3262
+
3263
+ // src/index.ts
3264
+ import { mock as mock2, MockProvider } from "@agtlantis/core/testing";
3265
+ import {
3266
+ compileTemplate as compileTemplate3,
3267
+ createFilePromptRepository
3268
+ } from "@agtlantis/core";
3269
+ import {
3270
+ calculateCostFromUsage as calculateCostFromUsage3,
3271
+ OPENAI_PRICING,
3272
+ GOOGLE_PRICING,
3273
+ ANTHROPIC_PRICING,
3274
+ DEFAULT_PRICING_CONFIG
3275
+ } from "@agtlantis/core";
3276
+
3277
+ // src/improvement-cycle/types.ts
3278
/**
 * Type guard for target-score termination conditions.
 *
 * @param {{type: string}} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is "targetScore".
 */
function isTargetScoreCondition(condition) {
  const { type } = condition;
  return type === "targetScore";
}
3281
/**
 * Type guard for max-rounds termination conditions.
 *
 * @param {{type: string}} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is "maxRounds".
 */
function isMaxRoundsCondition(condition) {
  const { type } = condition;
  return type === "maxRounds";
}
3284
/**
 * Type guard for no-improvement termination conditions.
 *
 * @param {{type: string}} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is "noImprovement".
 */
function isNoImprovementCondition(condition) {
  const { type } = condition;
  return type === "noImprovement";
}
3287
/**
 * Type guard for max-cost termination conditions.
 *
 * @param {{type: string}} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is "maxCost".
 */
function isMaxCostCondition(condition) {
  const { type } = condition;
  return type === "maxCost";
}
3290
/**
 * Type guard for user-supplied termination conditions.
 *
 * @param {{type: string}} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is "custom".
 */
function isCustomCycleCondition(condition) {
  const { type } = condition;
  return type === "custom";
}
3293
+
3294
+ // src/improvement-cycle/conditions.ts
3295
/**
 * Creates a termination condition that fires once the latest score reaches a threshold.
 *
 * @param {number} threshold - Target score, a finite number between 0 and 100 inclusive.
 * @returns {{type: "targetScore", threshold: number}} The condition descriptor.
 * @throws {EvalError} INVALID_CONFIG when the threshold is not finite or out of range.
 */
function targetScore(threshold) {
  const invalid = (message) =>
    new EvalError(message, {
      code: "INVALID_CONFIG",
      context: { threshold }
    });
  if (!Number.isFinite(threshold)) {
    throw invalid("threshold must be a finite number");
  }
  const withinRange = threshold >= 0 && threshold <= 100;
  if (!withinRange) {
    throw invalid("threshold must be between 0 and 100");
  }
  return { type: "targetScore", threshold };
}
3310
/**
 * Creates a termination condition that fires after a fixed number of rounds.
 *
 * @param {number} count - Maximum number of rounds; must be an integer of at least 1.
 * @returns {{type: "maxRounds", count: number}} The condition descriptor.
 * @throws {EvalError} INVALID_CONFIG when `count` is not a positive integer.
 */
function maxRounds(count) {
  const isPositiveInteger = Number.isInteger(count) && count >= 1;
  if (isPositiveInteger) {
    return { type: "maxRounds", count };
  }
  throw new EvalError("count must be a positive integer", {
    code: "INVALID_CONFIG",
    context: { count }
  });
}
3319
/**
 * Creates a termination condition that fires after several consecutive rounds
 * without sufficient score improvement.
 *
 * @param {number} consecutiveRounds - Number of stalled rounds required; integer of at least 1.
 * @param {number} [minDelta] - Score gain at or below which a round counts as stalled;
 *   must be a non-negative finite number when provided.
 * @returns {{type: "noImprovement", consecutiveRounds: number, minDelta?: number}}
 *   The condition descriptor; `minDelta` is present only when it was supplied.
 * @throws {EvalError} INVALID_CONFIG when either argument is invalid.
 */
function noImprovement(consecutiveRounds, minDelta) {
  const invalid = (message, context) => new EvalError(message, { code: "INVALID_CONFIG", context });

  const roundsValid = Number.isInteger(consecutiveRounds) && consecutiveRounds >= 1;
  if (!roundsValid) {
    throw invalid("consecutiveRounds must be a positive integer", { consecutiveRounds });
  }

  const hasDelta = minDelta !== undefined;
  if (hasDelta && !(Number.isFinite(minDelta) && minDelta >= 0)) {
    throw invalid("minDelta must be a non-negative finite number", { minDelta });
  }

  const condition = { type: "noImprovement", consecutiveRounds };
  if (hasDelta) {
    condition.minDelta = minDelta;
  }
  return condition;
}
3338
/**
 * Creates a termination condition that fires once accumulated cost reaches a budget.
 *
 * @param {number} maxUSD - Budget in US dollars; must be a positive finite number.
 * @returns {{type: "maxCost", maxUSD: number}} The condition descriptor.
 * @throws {EvalError} INVALID_CONFIG when `maxUSD` is not a positive finite number.
 */
function maxCost(maxUSD) {
  const isPositiveFinite = Number.isFinite(maxUSD) && maxUSD > 0;
  if (isPositiveFinite) {
    return { type: "maxCost", maxUSD };
  }
  throw new EvalError("maxUSD must be a positive finite number", {
    code: "INVALID_CONFIG",
    context: { maxUSD }
  });
}
3347
/**
 * Evaluates a target-score condition against the latest round score.
 *
 * @param {{threshold: number}} condition - Target-score condition.
 * @param {{latestScore: number}} ctx - Cycle context.
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}} Check result.
 */
function checkTargetScore(condition, ctx) {
  const { threshold } = condition;
  const { latestScore } = ctx;
  return latestScore >= threshold
    ? {
        terminated: true,
        matchedCondition: condition,
        reason: `Target score ${threshold} reached (current: ${latestScore})`
      }
    : {
        terminated: false,
        reason: `Score ${latestScore} below target ${threshold}`
      };
}
3360
/**
 * Evaluates a max-rounds condition against the current round number.
 *
 * @param {{count: number}} condition - Max-rounds condition.
 * @param {{currentRound: number}} ctx - Cycle context.
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}} Check result.
 */
function checkMaxRounds(condition, ctx) {
  const { count } = condition;
  const { currentRound } = ctx;
  return currentRound >= count
    ? {
        terminated: true,
        matchedCondition: condition,
        reason: `Maximum rounds reached (${count})`
      }
    : {
        terminated: false,
        reason: `Round ${currentRound} of ${count}`
      };
}
3373
/**
 * Evaluates a no-improvement condition by counting the trailing run of rounds
 * whose score delta is at or below `minDelta`. Counting stops at the first
 * round with a larger delta or a null delta.
 *
 * @param {{consecutiveRounds: number, minDelta?: number}} condition - No-improvement condition (`minDelta` defaults to 0).
 * @param {{history: Array<{scoreDelta: number|null}>}} ctx - Cycle context with round history, oldest first.
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}} Check result.
 */
function checkNoImprovement(condition, ctx) {
  const { consecutiveRounds, minDelta = 0 } = condition;
  const { history } = ctx;
  const isStalled = (round) => round.scoreDelta !== null && round.scoreDelta <= minDelta;

  // Walk backwards from the most recent round while rounds remain stalled.
  let cursor = history.length;
  while (cursor > 0 && isStalled(history[cursor - 1])) {
    cursor -= 1;
  }
  const stalledRounds = history.length - cursor;

  if (stalledRounds < consecutiveRounds) {
    const noun = stalledRounds === 1 ? "round" : "rounds";
    return {
      terminated: false,
      reason: `${stalledRounds} ${noun} without improvement (need ${consecutiveRounds})`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `No improvement for ${stalledRounds} consecutive round${stalledRounds === 1 ? "" : "s"}`
  };
}
3399
/**
 * Evaluates a max-cost condition against the accumulated cost.
 *
 * @param {{maxUSD: number}} condition - Max-cost condition.
 * @param {{totalCost: number}} ctx - Cycle context.
 * @returns {{terminated: boolean, matchedCondition?: object, reason: string}} Check result;
 *   amounts in the reason are shown with two decimals.
 */
function checkMaxCost(condition, ctx) {
  const spent = ctx.totalCost;
  const limit = condition.maxUSD;
  if (spent < limit) {
    return {
      terminated: false,
      reason: `Cost $${spent.toFixed(2)} under limit $${limit.toFixed(2)}`
    };
  }
  if (spent >= limit) {
    return {
      terminated: true,
      matchedCondition: condition,
      reason: `Cost limit exceeded ($${spent.toFixed(2)} >= $${limit.toFixed(2)})`
    };
  }
  // Incomparable values (e.g. NaN) satisfy neither branch above.
  return {
    terminated: false,
    reason: `Cost $${spent.toFixed(2)} under limit $${limit.toFixed(2)}`
  };
}
3412
/**
 * Evaluates a user-supplied termination condition.
 * A check that throws or rejects is reported as "not terminated" with the
 * error message in the reason, so a faulty check never aborts the cycle.
 *
 * @param {{check: Function, description?: string}} condition - Custom condition.
 * @param {object} ctx - Cycle context passed to `condition.check`.
 * @returns {Promise<{terminated: boolean, matchedCondition?: object, reason: string}>} Check result.
 */
async function checkCustomCondition(condition, ctx) {
  const label = condition.description ?? "Custom condition";
  let met;
  try {
    met = await condition.check(ctx);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    return { terminated: false, reason: `${label} check failed: ${message}` };
  }
  if (!met) {
    return { terminated: false, reason: `${label} not met` };
  }
  return { terminated: true, matchedCondition: condition, reason: `${label} met` };
}
3435
/**
 * Dispatches a termination condition to the checker matching its `type`.
 *
 * @param {{type: string}} condition - Termination condition.
 * @param {object} context - Cycle context forwarded to the checker.
 * @returns {Promise<{terminated: boolean, matchedCondition?: object, reason: string}>} Check result.
 * @throws {EvalError} UNKNOWN_ERROR (as a rejection) when the type is not recognized.
 */
async function checkCycleCondition(condition, context) {
  switch (condition.type) {
    case "targetScore":
      return checkTargetScore(condition, context);
    case "maxRounds":
      return checkMaxRounds(condition, context);
    case "noImprovement":
      return checkNoImprovement(condition, context);
    case "maxCost":
      return checkMaxCost(condition, context);
    case "custom":
      return checkCustomCondition(condition, context);
    default:
      throw new EvalError(`Unknown condition type: ${JSON.stringify(condition)}`, {
        code: "UNKNOWN_ERROR",
        context: { condition }
      });
  }
}
3457
/**
 * Checks termination conditions one at a time, in order, and stops at the
 * first one that fires.
 *
 * @param {object[]} conditions - Termination conditions to evaluate.
 * @param {object} context - Cycle context forwarded to each check.
 * @returns {Promise<{terminated: boolean, matchedCondition?: object, reason: string}>}
 *   The first terminating result, or a non-terminating result explaining why none fired.
 */
async function checkCycleTermination(conditions, context) {
  if (conditions.length === 0) {
    return { terminated: false, reason: "No termination conditions specified" };
  }
  let index = 0;
  // Sequential on purpose: later conditions are not evaluated once one terminates.
  while (index < conditions.length) {
    const verdict = await checkCycleCondition(conditions[index], context);
    if (verdict.terminated) {
      return verdict;
    }
    index += 1;
  }
  return { terminated: false, reason: "No termination conditions met" };
}
3475
+
3476
+ // src/improvement-cycle/runner.ts
3477
+ import { calculateCostFromUsage as calculateCostFromUsage2 } from "@agtlantis/core";
3478
+
3479
+ // src/improvement-cycle/history.ts
3480
+ import crypto from "crypto";
3481
+ import { existsSync as existsSync5 } from "fs";
3482
+ import { mkdir as mkdir2, readFile as readFile3, writeFile as writeFile3 } from "fs/promises";
3483
+ import { dirname } from "path";
3484
+ import { compileTemplate as compileTemplate2 } from "@agtlantis/core";
3485
/**
 * Default history storage backed by the local filesystem.
 * Reads and writes use UTF-8; `exists` is the synchronous fs check.
 */
var defaultHistoryStorage = {
  readFile(filePath) {
    return readFile3(filePath, "utf-8");
  },
  writeFile(filePath, content) {
    return writeFile3(filePath, content, "utf-8");
  },
  exists: existsSync5,
  mkdir(dirPath, options) {
    return mkdir2(dirPath, options);
  }
};
3491
/**
 * Tells whether a prompt carries a string `userTemplate`, which is what makes
 * it serializable.
 *
 * @param {object} prompt - Prompt to inspect.
 * @returns {boolean} True when `prompt.userTemplate` is a string.
 */
function hasUserTemplate(prompt) {
  const { userTemplate } = prompt;
  return typeof userTemplate === "string";
}
3494
/**
 * Converts a prompt into its JSON-serializable form. The compiled
 * `renderUserPrompt` function is dropped; any property other than the core
 * ones is collected under `customFields`.
 *
 * @param {object} prompt - Prompt with `id`, `version`, `system` and a string `userTemplate`.
 * @returns {{id: *, version: *, system: *, userTemplate: string, customFields?: object}}
 *   Serialized prompt; `customFields` appears only when extra properties exist.
 * @throws {EvalError} PROMPT_INVALID_FORMAT when `userTemplate` is not a string.
 */
function serializePrompt(prompt) {
  if (!hasUserTemplate(prompt)) {
    throw new EvalError("Cannot serialize prompt: userTemplate field is required", {
      code: "PROMPT_INVALID_FORMAT",
      context: { promptId: prompt.id }
    });
  }
  // renderUserPrompt is pulled out only so that it is excluded from `extras`.
  const { id, version, system, userTemplate, renderUserPrompt, ...extras } = prompt;
  const serialized = { id, version, system, userTemplate };
  if (Object.keys(extras).length > 0) {
    serialized.customFields = extras;
  }
  return serialized;
}
3512
/**
 * Verifies that a rebuilt prompt has string `id`, `version`, `system`,
 * `userTemplate` fields and a function `renderUserPrompt`.
 *
 * @param {object} obj - Candidate prompt object.
 * @param {*} promptId - Prompt id included in error context.
 * @throws {EvalError} PROMPT_INVALID_FORMAT naming the first field with the wrong type.
 */
function validateDeserializedPrompt(obj, promptId) {
  const stringFields = ["id", "version", "system", "userTemplate"];
  const offending = stringFields.find((name) => typeof obj[name] !== "string");
  if (offending !== undefined) {
    throw new EvalError(`Invalid deserialized prompt: ${offending} must be a string`, {
      code: "PROMPT_INVALID_FORMAT",
      context: { promptId, field: offending, actual: typeof obj[offending] }
    });
  }
  const rendererType = typeof obj.renderUserPrompt;
  if (rendererType !== "function") {
    throw new EvalError("Invalid deserialized prompt: renderUserPrompt must be a function", {
      code: "PROMPT_INVALID_FORMAT",
      context: { promptId, actual: rendererType }
    });
  }
}
3529
/**
 * Rebuilds a usable prompt from its serialized form by recompiling
 * `userTemplate` into `renderUserPrompt`. Custom fields are restored at the
 * top level, and core fields always win over same-named custom fields.
 *
 * @param {{id: string, version: string, system: string, userTemplate: string, customFields?: object}} serialized - Stored prompt.
 * @returns {object} Prompt with a compiled `renderUserPrompt`.
 * @throws {EvalError} TEMPLATE_COMPILE_ERROR when the template does not compile;
 *   PROMPT_INVALID_FORMAT when the rebuilt prompt fails validation.
 */
function deserializePrompt(serialized) {
  const { id, version, system, userTemplate, customFields } = serialized;

  const compileOrThrow = () => {
    try {
      return compileTemplate2(userTemplate, id);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      throw new EvalError(`Failed to compile userTemplate: ${message}`, {
        code: "TEMPLATE_COMPILE_ERROR",
        context: { promptId: id, userTemplate }
      });
    }
  };
  const renderUserPrompt = compileOrThrow();

  // Spread custom fields first so they can never shadow a core field.
  const prompt = { ...customFields, id, version, system, userTemplate, renderUserPrompt };
  validateDeserializedPrompt(prompt, id);
  return prompt;
}
3552
/**
 * Flattens a round result into the compact record stored in session history:
 * summary numbers are lifted out of the report and the completion time is
 * converted to an ISO-8601 string.
 *
 * @param {object} result - Round result with `report.summary`, `completedAt` (Date), and round bookkeeping fields.
 * @returns {object} Serializable round record.
 */
function serializeRoundResult(result) {
  const { avgScore, passed, failed, totalTests } = result.report.summary;
  const {
    round,
    completedAt,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  } = result;
  return {
    round,
    completedAt: completedAt.toISOString(),
    avgScore,
    passed,
    failed,
    totalTests,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  };
}
3569
/**
 * Performs a shallow structural check of a parsed history file: it must be an
 * object, declare schema version "1.1.0", and contain every required
 * top-level field. Field values are not inspected.
 *
 * @param {*} data - Parsed JSON content.
 * @throws {EvalError} SCHEMA_VALIDATION_ERROR describing the first problem found.
 */
function validateHistorySchema(data) {
  const isObject = typeof data === "object" && data !== null;
  if (!isObject) {
    throw new EvalError("Invalid history: not an object", {
      code: "SCHEMA_VALIDATION_ERROR"
    });
  }

  const { schemaVersion } = data;
  if (schemaVersion !== "1.1.0") {
    throw new EvalError(`Unsupported schema version: ${String(schemaVersion)}`, {
      code: "SCHEMA_VALIDATION_ERROR",
      context: { schemaVersion }
    });
  }

  const requiredFields = ["sessionId", "startedAt", "initialPrompt", "currentPrompt", "rounds", "totalCost"];
  const missingField = requiredFields.find((name) => !(name in data));
  if (missingField !== undefined) {
    throw new EvalError(`Invalid history: missing field "${missingField}"`, {
      code: "SCHEMA_VALIDATION_ERROR",
      context: { missingField }
    });
  }
}
3599
/**
 * In-memory improvement session that accumulates round history and can
 * persist it to a configured path.
 *
 * Saves are serialized through a promise queue so writes never overlap. A
 * failed write is reported to the caller of that `save()` (or to the
 * auto-save error handler) but does not block later writes.
 */
var ImprovementSessionImpl = class {
  /** Current history snapshot; replaced, never mutated in place. */
  _history;
  /** Re-entrancy flag held for the duration of `addRound`. */
  _isUpdating = false;
  /** Tail of the save queue; settles with the most recently queued save. */
  _savePromise = Promise.resolve();
  /** Effective session configuration (`autoSave` is always a defined value). */
  config;

  /**
   * @param {object} history - Initial history record.
   * @param {object} [config] - Session options (`path`, `storage`, `autoSave`, `onAutoSaveError`).
   */
  constructor(history, config = {}) {
    this._history = history;
    // The default is applied after the spread so an explicit
    // `autoSave: undefined` cannot overwrite it.
    this.config = {
      ...config,
      autoSave: config.autoSave ?? false
    };
  }

  /** @returns {string} Identifier of this session. */
  get sessionId() {
    return this._history.sessionId;
  }

  /** @returns {object} Current history snapshot. */
  get history() {
    return this._history;
  }

  /** @returns {boolean} True when a save path is configured. */
  get canSave() {
    return this.config.path !== undefined;
  }

  /**
   * Appends a completed round and records the prompt that resulted from it.
   *
   * @param {object} roundResult - Round result; `roundResult.cost.total` is added to the running cost.
   * @param {object} updatedPrompt - Serialized prompt to store as `currentPrompt`.
   * @throws {EvalError} CONCURRENT_MODIFICATION when called re-entrantly;
   *   INVALID_CONFIG when the session is already completed.
   */
  addRound(roundResult, updatedPrompt) {
    if (this._isUpdating) {
      throw new EvalError("Session is being updated", {
        code: "CONCURRENT_MODIFICATION",
        context: { sessionId: this.sessionId }
      });
    }
    if (this._history.completedAt) {
      throw new EvalError("Cannot add round to completed session", {
        code: "INVALID_CONFIG",
        context: { sessionId: this.sessionId }
      });
    }
    this._isUpdating = true;
    try {
      const serializedRound = serializeRoundResult(roundResult);
      this._history = {
        ...this._history,
        currentPrompt: updatedPrompt,
        rounds: [...this._history.rounds, serializedRound],
        totalCost: this._history.totalCost + roundResult.cost.total
      };
      if (this.config.autoSave && this.canSave) {
        // Fire-and-forget: failures are routed to the auto-save error handler.
        this.save().catch((err) => this.handleAutoSaveError(err));
      }
    } finally {
      this._isUpdating = false;
    }
  }

  /**
   * Marks the session as completed with the given reason.
   *
   * @param {string} terminationReason - Why the cycle stopped.
   */
  complete(terminationReason) {
    this._history = {
      ...this._history,
      completedAt: new Date().toISOString(),
      terminationReason
    };
    if (this.config.autoSave && this.canSave) {
      this.save().catch((err) => this.handleAutoSaveError(err));
    }
  }

  /**
   * Reports an auto-save failure through `config.onAutoSaveError`, or to
   * `console.error` when no handler is configured.
   *
   * @param {*} error - Rejection value from the failed save.
   */
  handleAutoSaveError(error) {
    const err = error instanceof Error ? error : new Error(String(error));
    if (this.config.onAutoSaveError) {
      this.config.onAutoSaveError(err);
    } else {
      console.error("Auto-save failed:", err);
    }
  }

  /**
   * Queues a write of the current history to the configured path.
   *
   * @returns {Promise<void>} Settles when this particular write finishes.
   * @throws {EvalError} INVALID_CONFIG (as a rejection) when no path is configured.
   */
  async save() {
    if (!this.config.path) {
      throw new EvalError("Cannot save: no path configured", {
        code: "INVALID_CONFIG",
        context: { sessionId: this.sessionId }
      });
    }
    // Chain after the previous save regardless of how it settled. An earlier
    // failure was already delivered to its own caller; letting it propagate
    // here would make every later save reject without attempting a write.
    const queued = this._savePromise
      .catch(() => {})
      .then(async () => {
        await saveHistory(this._history, this.config.path, this.config.storage);
      });
    this._savePromise = queued;
    return queued;
  }

  /**
   * Waits for all queued saves.
   *
   * @returns {Promise<void>} Settles with the outcome of the most recently queued save.
   */
  async flush() {
    return this._savePromise;
  }
};
3683
/**
 * Starts a new improvement session with an empty round list.
 *
 * @param {object} initialPrompt - Prompt to serialize as both the initial and current prompt.
 * @param {object} [config] - Passed through to `ImprovementSessionImpl`.
 * @returns {ImprovementSessionImpl} A session wrapping the freshly built history.
 */
function createSession(initialPrompt, config) {
  const promptRecord = serializePrompt(initialPrompt);
  const sessionId = crypto.randomUUID();
  const startedAt = (/* @__PURE__ */ new Date()).toISOString();
  return new ImprovementSessionImpl(
    {
      schemaVersion: "1.1.0",
      sessionId,
      startedAt,
      initialPrompt: promptRecord,
      currentPrompt: promptRecord,
      rounds: [],
      totalCost: 0
    },
    config
  );
}
3696
/**
 * Loads a stored history and reopens it as a live session.
 *
 * The completion markers (`completedAt`, `terminationReason`) are cleared so
 * new rounds can be added, and the session is configured to save back to the
 * same path.
 *
 * @param {string} path3 - Location of the history file.
 * @param {object} [config] - Session config; `config.storage` is used for loading.
 * @returns {Promise<ImprovementSessionImpl>} The reopened session.
 */
async function resumeSession(path3, config) {
  const stored = await loadHistory(path3, config?.storage);
  const sessionConfig = { ...config, path: path3 };
  const reopened = { ...stored, completedAt: undefined, terminationReason: undefined };
  return new ImprovementSessionImpl(reopened, sessionConfig);
}
3705
/**
 * Writes a history object as pretty-printed JSON.
 *
 * The parent directory is created first when `storage.exists` reports it
 * missing; `"."`, `"/"` and an empty directory name are never created.
 *
 * @param {object} history - History to serialize.
 * @param {string} path3 - Destination file path.
 * @param {object} [storage=defaultHistoryStorage] - Provides `exists`, `mkdir` and `writeFile`.
 * @throws {EvalError} `EvalError`s pass through unchanged; anything else is
 *   wrapped as `FILE_WRITE_ERROR` with the path as context.
 */
async function saveHistory(history, path3, storage = defaultHistoryStorage) {
  try {
    const parentDir = dirname(path3);
    const isCreatable = Boolean(parentDir) && parentDir !== "." && parentDir !== "/";
    if (isCreatable && !storage.exists(parentDir)) {
      await storage.mkdir(parentDir, { recursive: true });
    }
    const payload = JSON.stringify(history, null, 2);
    await storage.writeFile(path3, payload);
  } catch (error) {
    if (!(error instanceof EvalError)) {
      throw EvalError.from(error, "FILE_WRITE_ERROR" /* FILE_WRITE_ERROR */, { path: path3 });
    }
    throw error;
  }
}
3717
/**
 * Reads, parses and validates a stored history file.
 *
 * @param {string} path3 - Location of the history file.
 * @param {object} [storage=defaultHistoryStorage] - Provides `exists` and `readFile`.
 * @returns {Promise<object>} The parsed history after `validateHistorySchema` accepted it.
 * @throws {EvalError} `FILE_READ_ERROR` when the file is missing; other
 *   `EvalError`s pass through unchanged and any other failure (including JSON
 *   parse errors) is wrapped as `FILE_READ_ERROR`.
 */
async function loadHistory(path3, storage = defaultHistoryStorage) {
  try {
    const isPresent = storage.exists(path3);
    if (!isPresent) {
      throw new EvalError(`History file not found: ${path3}`, {
        code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
        context: { path: path3 }
      });
    }
    const raw = await storage.readFile(path3);
    const parsed = JSON.parse(raw);
    validateHistorySchema(parsed);
    return parsed;
  } catch (error) {
    if (!(error instanceof EvalError)) {
      throw EvalError.from(error, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { path: path3 });
    }
    throw error;
  }
}
3734
+
3735
+ // src/improvement-cycle/runner.ts
3736
/**
 * Builds the mutable state object used by the improvement cycle.
 *
 * With an existing session, the round counter, score list and total cost are
 * seeded from its history; `completedRounds` always starts empty.
 *
 * @param {object} initialPrompt - Prompt the cycle starts from.
 * @param {object} [existingSession] - Session being resumed, if any.
 * @returns {{currentPrompt: object, currentRound: number, previousScores: number[], totalCost: number, completedRounds: object[]}}
 */
function initializeCycleState(initialPrompt, existingSession) {
  let currentRound = 0;
  let previousScores = [];
  let totalCost = 0;
  if (existingSession) {
    currentRound = existingSession.history.rounds.length;
    previousScores = existingSession.history.rounds.map((storedRound) => storedRound.avgScore);
    totalCost = existingSession.history.totalCost;
  }
  return {
    currentPrompt: initialPrompt,
    currentRound,
    previousScores,
    totalCost,
    completedRounds: []
  };
}
3746
/**
 * Difference between the current score and the most recent previous score.
 *
 * @param {number} currentScore - Score of the round just evaluated.
 * @param {number[]} previousScores - Earlier scores, oldest first.
 * @returns {number|null} The delta, or `null` when there is no previous score.
 */
function calculateScoreDelta(currentScore, previousScores) {
  const count = previousScores.length;
  return count === 0 ? null : currentScore - previousScores[count - 1];
}
3753
/**
 * Snapshot of cycle state handed to termination checks and to the consumer.
 *
 * `previousScores` is copied; `history` is the live `completedRounds` array.
 *
 * @param {object} state - Cycle state.
 * @param {number} currentScore - Score of the round just evaluated.
 * @returns {{currentRound: number, latestScore: number, previousScores: number[], totalCost: number, history: object[]}}
 */
function buildCycleContext(state, currentScore) {
  const { currentRound, totalCost, completedRounds } = state;
  const scoresCopy = Array.from(state.previousScores);
  return {
    currentRound,
    latestScore: currentScore,
    previousScores: scoresCopy,
    totalCost,
    history: completedRounds
  };
}
3762
/**
 * Assembles the result record for the round that was just evaluated.
 *
 * `suggestionsApproved` starts empty; it is filled in once the consumer's
 * decision is known.
 *
 * @param {object} state - Cycle state; supplies the round number and prompt version.
 * @param {object} report - Evaluation report for the round.
 * @param {object} improveResult - Improver output; its `suggestions` are recorded.
 * @param {object} cost - Cost breakdown for the round.
 * @param {number|null} scoreDelta - Change versus the previous round.
 * @param {object} promptSnapshot - Serialized prompt used in this round.
 * @returns {object} The round result.
 */
function createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot) {
  const roundNumber = state.currentRound;
  const finishedAt = /* @__PURE__ */ new Date();
  const generated = improveResult.suggestions;
  const versionAfter = state.currentPrompt.version;
  return {
    round: roundNumber,
    report,
    completedAt: finishedAt,
    suggestionsGenerated: generated,
    suggestionsApproved: [],
    promptSnapshot,
    promptVersionAfter: versionAfter,
    cost,
    scoreDelta
  };
}
3776
/**
 * Finalizes the cycle: records the last round, completes the session, waits
 * for pending saves and builds the cycle result.
 *
 * @param {object} state - Cycle state; the round is appended to `completedRounds`.
 * @param {object} session - Session to record into and complete.
 * @param {object} roundResult - Result of the final round.
 * @param {object} promptSnapshot - Serialized prompt stored with the final round.
 * @param {boolean} terminatedByCondition - Whether a termination condition fired.
 * @param {string} conditionReason - Reason used when `terminatedByCondition` is truthy.
 * @returns {Promise<object>} Rounds, final prompt, termination reason, total cost and history.
 */
async function handleStopDecision(state, session, roundResult, promptSnapshot, terminatedByCondition, conditionReason) {
  let reason = "User requested stop";
  if (terminatedByCondition) {
    reason = conditionReason;
  }
  session.addRound(roundResult, promptSnapshot);
  session.complete(reason);
  await session.flush();
  state.completedRounds.push(roundResult);

  const finalPrompt = deserializePrompt(session.history.currentPrompt);
  return {
    rounds: state.completedRounds,
    finalPrompt,
    terminationReason: reason,
    totalCost: state.totalCost,
    history: session.history
  };
}
3790
/**
 * Rolls the cycle state back to an earlier round.
 *
 * The target is looked up by its `round` number rather than by array
 * position. `state.completedRounds` only holds rounds completed in this run:
 * it starts empty on a resumed session and contains no entry for a round that
 * was itself rolled back. Position `rollbackToRound - 1` therefore does not
 * always point at the requested round.
 *
 * On success the state's prompt is restored from the target round's snapshot
 * and `previousScores` is truncated to the first `rollbackToRound - 1` entries.
 *
 * @param {object} state - Cycle state; `currentPrompt` and `previousScores` are replaced.
 * @param {number} rollbackToRound - Round number to roll back to.
 * @throws {Error} When no completed round has that number.
 */
function handleRollbackDecision(state, rollbackToRound) {
  const targetRound = state.completedRounds.find(
    (completed) => completed.round === rollbackToRound
  );
  if (!targetRound) {
    throw new Error(`Cannot rollback to round ${rollbackToRound}: round not found`);
  }
  state.currentPrompt = deserializePrompt(targetRound.promptSnapshot);
  state.previousScores = state.previousScores.slice(0, rollbackToRound - 1);
}
3799
/**
 * Applies a "continue" decision: records which suggestions were approved,
 * applies them to the current prompt (when there are any), and stores the
 * round in both the session and the cycle state.
 *
 * @param {object} state - Cycle state; `currentPrompt` and `completedRounds` are updated.
 * @param {object} session - Session receiving the round.
 * @param {object} roundResult - Result of the round just evaluated.
 * @param {object[]} approvedSuggestions - Suggestions to apply.
 * @param {string} versionBump - Passed to `applyPromptSuggestions` as `bumpVersion`.
 * @returns {object} The round result as recorded.
 */
function handleContinueDecision(state, session, roundResult, approvedSuggestions, versionBump) {
  const recordedRound = { ...roundResult, suggestionsApproved: approvedSuggestions };
  const hasApprovals = approvedSuggestions.length > 0;
  if (hasApprovals) {
    const { prompt: nextPrompt } = applyPromptSuggestions(state.currentPrompt, approvedSuggestions, {
      bumpVersion: versionBump
    });
    state.currentPrompt = nextPrompt;
    recordedRound.promptVersionAfter = state.currentPrompt.version;
  }
  session.addRound(recordedRound, serializePrompt(state.currentPrompt));
  state.completedRounds.push(recordedRound);
  return recordedRound;
}
3816
/**
 * Runs one evaluation round: builds an agent for the current prompt, runs the
 * test cases through an eval suite, asks the improver (if any) for
 * suggestions, and prices the round.
 *
 * @param {object} config - Cycle config (`createAgent`, `judge`, `improver`, `testCases`, `options`).
 * @param {object} state - Cycle state; only `currentPrompt` is read.
 * @param {object} [pricingConfig] - Pricing used for cost calculation.
 * @returns {Promise<{report: object, improveResult: object, cost: object}>}
 */
async function executeRound(config, state, pricingConfig) {
  const { createAgent, judge, improver, testCases: cases, options: roundOptions = {} } = config;
  const suite = createEvalSuite({
    agent: createAgent(state.currentPrompt),
    judge,
    agentDescription: roundOptions.agentDescription
  });
  const report = await suite.run(cases, roundOptions.runOptions);

  let improveResult;
  if (improver) {
    improveResult = await improver.improve(state.currentPrompt, report.results);
  } else {
    // Without an improver the round simply produces no suggestions.
    improveResult = { suggestions: [] };
  }
  const cost = calculateRoundCost(report, improveResult, pricingConfig);
  return { report, improveResult, cost };
}
3829
/**
 * Guesses the provider from a model name prefix.
 *
 * @param {string} [model] - Model identifier.
 * @returns {"anthropic"|"openai"|"google"} `"openai"` for `gpt-`, `o1`, `o3`
 *   prefixes, `"google"` for `gemini-`, otherwise `"anthropic"` (also used
 *   when no model is given).
 */
function detectProviderForImprover(model) {
  const fallbackProvider = "anthropic";
  if (!model || model.startsWith("claude-")) {
    return fallbackProvider;
  }
  const openaiPrefixes = ["gpt-", "o1", "o3"];
  if (openaiPrefixes.some((prefix) => model.startsWith(prefix))) {
    return "openai";
  }
  return model.startsWith("gemini-") ? "google" : fallbackProvider;
}
3836
/**
 * Copies the three token counters from a usage record into a new object.
 *
 * @param {{inputTokens: number, outputTokens: number, totalTokens: number}} usage
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 */
function toLanguageModelUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
3843
/**
 * Cost of the improver call for one round.
 *
 * @param {object} improveResult - Improver output; `metadata.tokenUsage` and
 *   `metadata.model` are read when present.
 * @param {object} [pricingConfig] - Optional pricing; `providerPricing[provider]` is used.
 * @returns {number} Total cost, or `0` when no token usage was reported.
 */
function calculateImproverCost(improveResult, pricingConfig) {
  const meta = improveResult.metadata;
  const tokenUsage = meta?.tokenUsage;
  if (!tokenUsage) {
    return 0;
  }
  const modelName = meta?.model ?? "unknown";
  const providerName = detectProviderForImprover(modelName);
  const pricing = pricingConfig?.providerPricing?.[providerName];
  const { total } = calculateCostFromUsage2(
    toLanguageModelUsage2(tokenUsage),
    modelName,
    providerName,
    pricing
  );
  return total;
}
3857
/**
 * Cost breakdown for one round (agent, judge, improver and their sum).
 *
 * Report costs are only computed when a pricing config is supplied; otherwise
 * agent and judge are counted as zero.
 *
 * @param {object} report - Evaluation report for the round.
 * @param {object} improveResult - Improver output for the round.
 * @param {object} [pricingConfig] - Pricing configuration.
 * @returns {{agent: number, judge: number, improver: number, total: number}}
 */
function calculateRoundCost(report, improveResult, pricingConfig) {
  let reportCosts = { total: 0, byComponent: { agent: 0, judge: 0 } };
  if (pricingConfig) {
    reportCosts = calculateReportCosts(report, pricingConfig);
  }
  const improver = calculateImproverCost(improveResult, pricingConfig);
  const { agent, judge } = reportCosts.byComponent;
  return {
    agent: agent ?? 0,
    judge: judge ?? 0,
    improver,
    total: reportCosts.total + improver
  };
}
3867
/**
 * Async generator driving the improvement cycle one round at a time.
 *
 * Each iteration evaluates the current prompt, yields the round result
 * together with pending suggestions and the termination check, and then acts
 * on the decision passed back through `next(decision)`:
 *  - no decision or `action: "stop"` finalizes the session and returns the cycle result;
 *  - `action: "rollback"` with `rollbackToRound` restores an earlier prompt and loops;
 *  - anything else is treated as "continue" with `approvedSuggestions` (default `[]`).
 *
 * When `options.session` is given it is used as-is; otherwise a new session is
 * created from `options.history` (`path`, `autoSave`).
 *
 * @param {object} config - Cycle configuration (`initialPrompt`, `terminateWhen`, `options`, ...).
 * @yields {{roundResult: object, pendingSuggestions: object[], terminationCheck: object, context: object}}
 * @returns {Promise<object>} The cycle result produced by `handleStopDecision`.
 * @throws Re-throws any error after marking the session completed with `Error: <message>`.
 */
async function* runImprovementCycle(config) {
  const { initialPrompt, terminateWhen: conditions = [], options: cycleOptions = {} } = config;
  const {
    pricingConfig,
    versionBump: bump = "patch",
    history: historyOptions,
    session: resumedSession
  } = cycleOptions;

  let session = resumedSession;
  if (session === undefined || session === null) {
    const sessionConfig = historyOptions
      ? { path: historyOptions.path, autoSave: historyOptions.autoSave }
      : undefined;
    session = createSession(initialPrompt, sessionConfig);
  }
  const state = initializeCycleState(initialPrompt, resumedSession);

  try {
    for (; ; ) {
      state.currentRound++;
      const { report, improveResult, cost } = await executeRound(config, state, pricingConfig);
      state.totalCost += cost.total;

      const latestScore = report.summary.avgScore;
      const delta = calculateScoreDelta(latestScore, state.previousScores);
      const snapshot = serializePrompt(state.currentPrompt);
      const roundResult = createRoundResult(state, report, improveResult, cost, delta, snapshot);
      // The context is built before the current score joins previousScores.
      const context = buildCycleContext(state, latestScore);
      state.previousScores.push(latestScore);

      const terminationCheck = await checkCycleTermination(conditions, context);
      const pendingSuggestions = improveResult.suggestions.map((suggestion) => ({
        ...suggestion,
        approved: false
      }));

      const decision = yield { roundResult, pendingSuggestions, terminationCheck, context };

      const wantsStop = !decision || decision.action === "stop";
      if (wantsStop) {
        return await handleStopDecision(
          state,
          session,
          roundResult,
          snapshot,
          terminationCheck.terminated,
          terminationCheck.reason
        );
      }

      const wantsRollback = decision.action === "rollback" && decision.rollbackToRound !== undefined;
      if (wantsRollback) {
        handleRollbackDecision(state, decision.rollbackToRound);
        continue;
      }

      handleContinueDecision(state, session, roundResult, decision.approvedSuggestions ?? [], bump);
    }
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    session.complete(`Error: ${message}`);
    throw error;
  }
}
3926
/**
 * Runs the improvement cycle without human input: every pending suggestion is
 * approved each round, and the cycle stops as soon as a termination check
 * reports `terminated`.
 *
 * @param {object} config - Cycle configuration, passed to `runImprovementCycle`.
 * @returns {Promise<object>} The cycle result returned by the generator.
 */
async function runImprovementCycleAuto(config) {
  const cycle = runImprovementCycle(config);
  let step = await cycle.next();
  while (!step.done) {
    const { terminationCheck, pendingSuggestions } = step.value;
    const decision = terminationCheck.terminated
      ? { action: "stop" }
      : {
          action: "continue",
          approvedSuggestions: pendingSuggestions.map((suggestion) => ({
            ...suggestion,
            approved: true
          }))
        };
    step = await cycle.next(decision);
  }
  return step.value;
}
3945
+
3946
+ // src/cli/commands/run.ts
3947
/**
 * Entry point of the `run` CLI command.
 *
 * Loads the environment and config, builds the judge (and improver when
 * configured), runs YAML-discovered and inline test cases, prints a summary,
 * optionally writes a report, and exits the process: code 0 when no test
 * failed, 1 on failures, on an empty run, or on any error.
 *
 * `--concurrency` and `--iterations` are validated. A value that does not
 * parse to an integer >= 1 is reported as an error instead of being passed to
 * the suite as `NaN` or a negative number.
 *
 * @param {string} configPath - Path of the eval config file.
 * @param {object} options - Parsed CLI options (`envFile`, `concurrency`,
 *   `iterations`, `verbose`, `include`, `tags`, `agent`, `report`, `output`).
 * @returns {Promise<void>} Does not return normally; the process exits.
 */
async function runCommand(configPath, options) {
  const startTime = Date.now();
  // CLI value wins when present; otherwise the config value, defaulting to 1.
  const resolveCount = (label, cliValue, configValue) => {
    if (!cliValue) {
      return configValue ?? 1;
    }
    const parsed = Number.parseInt(cliValue, 10);
    if (Number.isNaN(parsed) || parsed < 1) {
      throw new Error(`Invalid ${label}: ${cliValue}. Must be >= 1.`);
    }
    return parsed;
  };
  try {
    printBanner();
    printProgress("Loading environment...");
    await loadEnvFile(options.envFile);
    printProgress("Loading configuration...");
    const config = await loadConfigWithDefaults(configPath);
    printProgress("Initializing providers...");
    const { mainProvider, judgeProvider, improverProvider } = initializeProviders(config, options);
    const judge = createJudge({
      provider: judgeProvider,
      prompt: config.judge.prompt,
      criteria: config.judge.criteria,
      passThreshold: config.judge.passThreshold
    });
    const improver = config.improver ? createImprover({
      provider: improverProvider,
      prompt: config.improver.prompt
    }) : void 0;
    const concurrency = resolveCount("concurrency", options.concurrency, config.run?.concurrency);
    const iterations = resolveCount("iterations", options.iterations, config.run?.iterations);
    const verbose = options.verbose ?? config.output?.verbose ?? false;
    const allReports = [];
    const includePatterns = options.include ?? config.include;
    if (includePatterns && includePatterns.length > 0) {
      const yamlReports = await runYamlTests({
        config,
        options,
        includePatterns,
        mainProvider,
        judge,
        improver,
        concurrency,
        iterations
      });
      allReports.push(...yamlReports);
    }
    if (config.testCases && config.testCases.length > 0) {
      const inlineReports = await runInlineTests({
        config,
        options,
        judge,
        improver,
        concurrency,
        iterations
      });
      allReports.push(...inlineReports);
    }
    const report = mergeReports(allReports, resolvePromptVersion(config));
    if (report.summary.totalTests === 0) {
      // Nothing ran: explain which filters were active and fail the command.
      printError(
        new Error(
          "No test cases to run after filtering.\n" + (options.tags ? ` Tags filter: ${options.tags.join(", ")}
` : "") + (options.agent ? ` Agent filter: ${options.agent}
` : "")
        )
      );
      process.exit(1);
    }
    const duration = Date.now() - startTime;
    printSummary(report, { verbose, duration });
    if (options.report !== false) {
      const outputPath = await generateReport(report, {
        dir: config.output?.dir,
        filename: options.output ?? config.output?.filename
      });
      console.log(`
Report saved to: ${outputPath}`);
    }
    const hasFailures = report.summary.failed > 0;
    process.exit(hasFailures ? 1 : 0);
  } catch (error) {
    printError(error instanceof Error ? error : new Error(String(error)));
    process.exit(1);
  }
}
4024
/**
 * Discovers YAML eval files, groups them by agent and runs their test cases.
 *
 * For each agent the cases are filtered by `options.tags`, split into
 * single-turn and multi-turn cases, and each non-empty group produces one
 * report.
 *
 * @param {object} params - `config`, `options`, `includePatterns`,
 *   `mainProvider`, `judge`, `improver`, `concurrency`, `iterations`.
 * @returns {Promise<object[]>} Reports in execution order (possibly empty).
 * @throws {ConfigError} When `options.agent` matches none of the loaded files.
 */
async function runYamlTests(params) {
  const { config, options, includePatterns, mainProvider, judge, improver, concurrency, iterations } = params;
  const reports = [];

  printProgress("Discovering YAML eval files...");
  const discovered = await discoverEvalFiles(config, { include: includePatterns });
  if (discovered.length === 0) {
    printProgress("No YAML files found matching patterns");
    return reports;
  }
  printProgress(`Discovered ${discovered.length} YAML file(s)`);

  const loadedFiles = await loadYamlEvalFiles(discovered);
  let selectedFiles = loadedFiles;
  if (options.agent) {
    selectedFiles = loadedFiles.filter((entry) => entry.content.agent === options.agent);
    if (selectedFiles.length === 0) {
      const knownAgents = [...new Set(loadedFiles.map((entry) => entry.content.agent))];
      throw new ConfigError(
        `No YAML files found for agent "${options.agent}".\nAvailable agents: ${knownAgents.join(", ")}`,
        "CONFIG_VALIDATION_ERROR"
      );
    }
  }

  for (const [agentName, agentFiles] of groupYamlByAgent(selectedFiles)) {
    printProgress(`Running tests for agent: ${agentName}`);
    const agent = lookupAgent(config, agentName);
    const conversionContext = {
      provider: mainProvider,
      buildInput: (response, _ctx) => ({ message: response })
    };

    const collectedCases = [];
    for (const agentFile of agentFiles) {
      for (const converted of convertToTestCases(agentFile.content, conversionContext)) {
        collectedCases.push(converted);
      }
    }

    const taggedCases = filterByTags(collectedCases, options.tags);
    if (taggedCases.length === 0) {
      printProgress(` No matching tests after tag filter for ${agentName}`);
      continue;
    }

    const { singleTurnCases, multiTurnCases } = splitTestCases(taggedCases);
    const agentDescription = resolveAgentDescription(config, agent);
    const suite = createEvalSuite({ agent, judge, improver, agentDescription });

    if (singleTurnCases.length > 0) {
      printProgress(` Running ${singleTurnCases.length} single-turn test(s)...`);
      const singleTurnReport = await suite.run(singleTurnCases, {
        concurrency,
        iterations,
        stopOnFirstFailure: config.run?.stopOnFirstFailure
      });
      reports.push(singleTurnReport);
    }

    if (multiTurnCases.length > 0) {
      printProgress(` Running ${multiTurnCases.length} multi-turn test(s)...`);
      const multiTurnResults = await runMultiTurnCases(multiTurnCases, {
        agent,
        judge,
        agentDescription
      });
      reports.push(createMultiTurnReport(multiTurnResults, { ...config, agent }));
    }
  }
  return reports;
}
4091
/**
 * Runs the test cases declared inline in the config against `config.agent`.
 *
 * Cases are filtered by `options.tags`, split into single-turn and multi-turn
 * groups, and each non-empty group yields one report.
 *
 * @param {object} params - `config`, `options`, `judge`, `improver`, `concurrency`, `iterations`.
 * @returns {Promise<object[]>} Zero, one or two reports.
 */
async function runInlineTests(params) {
  const { config, options, judge, improver, concurrency, iterations } = params;
  const reports = [];

  const selected = filterByTags(config.testCases, options.tags);
  if (selected.length === 0) {
    return reports;
  }
  const { singleTurnCases, multiTurnCases } = splitTestCases(selected);
  const { agent } = config;

  if (singleTurnCases.length > 0) {
    printProgress(`Running ${singleTurnCases.length} inline single-turn test(s)...`);
    const suite = createEvalSuite({
      agent,
      judge,
      improver,
      agentDescription: resolveAgentDescription(config, agent)
    });
    const singleTurnReport = await suite.run(singleTurnCases, {
      concurrency,
      iterations,
      stopOnFirstFailure: config.run?.stopOnFirstFailure
    });
    reports.push(singleTurnReport);
  }

  if (multiTurnCases.length > 0) {
    printProgress(`Running ${multiTurnCases.length} inline multi-turn test(s)...`);
    const multiTurnResults = await runMultiTurnCases(multiTurnCases, {
      agent,
      judge,
      agentDescription: resolveAgentDescription(config, agent)
    });
    reports.push(createMultiTurnReport(multiTurnResults, config));
  }
  return reports;
}
4126
/**
 * Executes multi-turn test cases one after another, in order.
 *
 * @param {object[]} testCases2 - Multi-turn test cases.
 * @param {object} context - Passed to `executeMultiTurnTestCase` for every case.
 * @returns {Promise<object[]>} One result per case, in input order.
 */
async function runMultiTurnCases(testCases2, context) {
  const outcomes = [];
  for (const currentCase of testCases2) {
    // Awaited inside the loop on purpose: cases run sequentially.
    outcomes.push(await executeMultiTurnTestCase(currentCase, context));
  }
  return outcomes;
}
4137
/**
 * Builds a report object from multi-turn results.
 *
 * The estimated cost is always reported as 0 and the suggestion list is empty.
 *
 * @param {object[]} results - Results with `passed`, `overallScore`,
 *   `metrics.latencyMs` and `metrics.tokenUsage.totalTokens`.
 * @param {object} config - Used only to resolve the prompt version.
 * @returns {object} Report with `summary`, `results`, `suggestions`, `generatedAt`, `promptVersion`.
 */
function createMultiTurnReport(results, config) {
  const totalTests = results.length;
  let passed = 0;
  let scoreSum = 0;
  let latencySum = 0;
  let totalTokens = 0;
  for (const outcome of results) {
    if (outcome.passed) {
      passed += 1;
    }
    scoreSum += outcome.overallScore;
    latencySum += outcome.metrics.latencyMs;
    totalTokens += outcome.metrics.tokenUsage.totalTokens;
  }
  const hasTests = totalTests > 0;
  const summary = {
    totalTests,
    passed,
    failed: totalTests - passed,
    avgScore: hasTests ? scoreSum / totalTests : 0,
    metrics: {
      avgLatencyMs: hasTests ? latencySum / totalTests : 0,
      totalTokens,
      totalEstimatedCost: 0
    }
  };
  return {
    summary,
    results,
    suggestions: [],
    generatedAt: /* @__PURE__ */ new Date(),
    promptVersion: resolvePromptVersion(config)
  };
}
4162
/**
 * Groups loaded YAML files by the agent name in `file.content.agent`.
 *
 * @param {object[]} files - Loaded YAML eval files.
 * @returns {Map<string, object[]>} Agent name to files, in first-seen order.
 */
function groupYamlByAgent(files) {
  return files.reduce((byAgent, entry) => {
    const agentName = entry.content.agent;
    const bucket = byAgent.get(agentName);
    if (bucket) {
      bucket.push(entry);
    } else {
      byAgent.set(agentName, [entry]);
    }
    return byAgent;
  }, /* @__PURE__ */ new Map());
}
4173
/**
 * Looks up an agent by name in the `config.agents` registry.
 *
 * Only the registry's own keys count. Names that exist solely on
 * `Object.prototype` (for example `"toString"` or `"constructor"`) are
 * reported as missing instead of returning an inherited member as the agent.
 *
 * @param {object} config - Eval config; `config.agents` maps names to agents.
 * @param {string} agentName - Name referenced by a YAML eval file.
 * @returns {object} The registered agent.
 * @throws {ConfigError} `CONFIG_VALIDATION_ERROR` listing the available
 *   agents when the name is not registered.
 */
function lookupAgent(config, agentName) {
  const registry = config.agents;
  const isRegistered = Boolean(registry) && Object.prototype.hasOwnProperty.call(registry, agentName);
  if (!isRegistered) {
    const available = registry ? Object.keys(registry) : [];
    throw new ConfigError(
      `Agent "${agentName}" not found in config.agents registry.\nAvailable agents: ${available.length > 0 ? available.join(", ") : "(none)"}`,
      "CONFIG_VALIDATION_ERROR"
    );
  }
  return registry[agentName];
}
4184
/**
 * Keeps the test cases that carry at least one of the requested tags.
 *
 * @param {object[]} testCases2 - Test cases, each optionally with a `tags` array.
 * @param {string[]} [tags] - Requested tags; when missing or empty the input
 *   array is returned unchanged.
 * @returns {object[]} Matching cases; untagged cases never match a non-empty filter.
 */
function filterByTags(testCases2, tags) {
  const filterIsActive = Boolean(tags) && tags.length !== 0;
  if (!filterIsActive) {
    return testCases2;
  }
  const matchesFilter = (candidate) => {
    const ownTags = candidate.tags;
    if (!ownTags || ownTags.length === 0) {
      return false;
    }
    return ownTags.some((ownTag) => tags.includes(ownTag));
  };
  return testCases2.filter(matchesFilter);
}
4195
/**
 * Partitions test cases into single-turn and multi-turn groups using
 * `isMultiTurnConfig`, preserving input order within each group.
 *
 * @param {object[]} testCases2 - Test cases to partition.
 * @returns {{singleTurnCases: object[], multiTurnCases: object[]}}
 */
function splitTestCases(testCases2) {
  const partition = { singleTurnCases: [], multiTurnCases: [] };
  for (const candidate of testCases2) {
    const bucket = isMultiTurnConfig(candidate) ? partition.multiTurnCases : partition.singleTurnCases;
    bucket.push(candidate);
  }
  return partition;
}
4207
/**
 * Picks the agent description: the config-level `agentDescription` first,
 * then `agent.config.description`, then an empty string.
 *
 * @param {object} config - Eval config.
 * @param {object} agent - Agent with a `config` object.
 * @returns {string} The resolved description.
 */
function resolveAgentDescription(config, agent) {
  const configured = config.agentDescription;
  if (configured !== undefined && configured !== null) {
    return configured;
  }
  const fromAgent = agent.config.description;
  return fromAgent ?? "";
}
4210
/**
 * Determines the prompt version to stamp on a report.
 *
 * Uses `config.agent.prompt.version` when truthy, otherwise the version of the
 * first agent in `config.agents`, otherwise `"unknown"`.
 *
 * @param {object} config - Eval config.
 * @returns {string} The prompt version or `"unknown"`.
 */
function resolvePromptVersion(config) {
  const primaryVersion = config.agent?.prompt?.version;
  if (primaryVersion) {
    return primaryVersion;
  }
  const [firstRegistered] = config.agents ? Object.values(config.agents) : [];
  return firstRegistered?.prompt?.version || "unknown";
}
4222
/**
 * Combines several reports into one.
 *
 * No reports gives an empty report; a single report is returned as-is. For
 * two or more, counts and tokens are summed, average score and latency are
 * weighted by each report's test count, and suggestions are de-duplicated.
 *
 * @param {object[]} reports - Reports to merge.
 * @param {string} promptVersion - Version stamped on an empty or merged report.
 * @returns {object} The merged report.
 */
function mergeReports(reports, promptVersion) {
  switch (reports.length) {
    case 0:
      return createEmptyReport(promptVersion);
    case 1:
      return reports[0];
    default:
      break;
  }

  const mergedResults = [];
  const uniqueSuggestions = /* @__PURE__ */ new Map();
  const totals = { tests: 0, passed: 0, failed: 0, score: 0, latency: 0, tokens: 0, cost: 0 };

  for (const { results, suggestions, summary } of reports) {
    mergedResults.push(...results);
    deduplicateSuggestions(suggestions, uniqueSuggestions);
    totals.tests += summary.totalTests;
    totals.passed += summary.passed;
    totals.failed += summary.failed;
    // Averages are re-weighted by the number of tests behind them.
    totals.score += summary.avgScore * summary.totalTests;
    totals.latency += summary.metrics.avgLatencyMs * summary.totalTests;
    totals.tokens += summary.metrics.totalTokens;
    totals.cost += summary.metrics.totalEstimatedCost ?? 0;
  }

  const hasTests = totals.tests > 0;
  return {
    summary: {
      totalTests: totals.tests,
      passed: totals.passed,
      failed: totals.failed,
      avgScore: hasTests ? totals.score / totals.tests : 0,
      metrics: {
        avgLatencyMs: hasTests ? totals.latency / totals.tests : 0,
        totalTokens: totals.tokens,
        totalEstimatedCost: totals.cost
      }
    },
    results: mergedResults,
    suggestions: [...uniqueSuggestions.values()],
    generatedAt: /* @__PURE__ */ new Date(),
    promptVersion
  };
}
4267
/**
 * Builds a report that represents "no tests ran": all counters and metrics
 * are zero and the result and suggestion lists are empty.
 *
 * @param {string} promptVersion - Version stamped on the report.
 * @returns {object} The empty report, timestamped with the current date.
 */
function createEmptyReport(promptVersion) {
  const zeroMetrics = { avgLatencyMs: 0, totalTokens: 0, totalEstimatedCost: 0 };
  const zeroSummary = { totalTests: 0, passed: 0, failed: 0, avgScore: 0, metrics: zeroMetrics };
  return {
    summary: zeroSummary,
    results: [],
    suggestions: [],
    generatedAt: /* @__PURE__ */ new Date(),
    promptVersion
  };
}
4286
/**
 * Adds suggestions to `map`, keyed by `"<type>:<suggestedValue>"`. The first
 * suggestion seen for a key is kept; later duplicates are ignored.
 *
 * @param {object[]} suggestions - Suggestions to add.
 * @param {Map<string, object>} map - Accumulator, mutated in place.
 */
function deduplicateSuggestions(suggestions, map) {
  for (const candidate of suggestions) {
    const { type, suggestedValue } = candidate;
    const dedupeKey = `${type}:${suggestedValue}`;
    if (map.has(dedupeKey)) {
      continue;
    }
    map.set(dedupeKey, candidate);
  }
}
4294
+
4295
+ // src/cli/output/improve-report.ts
4296
/**
 * Prints the end-of-run summary of an improvement cycle to the console:
 * round count, final score, score change, total cost, final prompt version,
 * termination reason and (optionally) duration and per-round details.
 *
 * @param {object} result - Cycle result (`rounds`, `terminationReason`, `totalCost`, `finalPrompt`).
 * @param {{verbose?: boolean, duration?: number}} [options] - `duration` is in
 *   milliseconds; `verbose` adds the round history.
 */
function printImprovementSummary(result, options = {}) {
  const { rounds, terminationReason, totalCost, finalPrompt } = result;
  const { verbose, duration } = options;
  const rule = c("cyan", "\u2550".repeat(CLI_DEFAULTS.DIVIDER_WIDTH));
  const blankLine = () => console.log();

  blankLine();
  console.log(rule);
  console.log(c("bold", " Improvement Cycle Results"));
  console.log(rule);
  blankLine();

  const finalScore = getFinalScore(rounds);
  const scoreChange = getScoreChange(rounds);
  console.log(` ${c("bold", "Total Rounds:")} ${rounds.length}`);
  console.log(` ${c("bold", "Final Score:")} ${finalScore.toFixed(1)}/100`);
  console.log(` ${c("bold", "Score Change:")} ${scoreChange}`);
  console.log(` ${c("bold", "Total Cost:")} $${totalCost.toFixed(2)}`);
  console.log(` ${c("bold", "Final Version:")} ${finalPrompt.version}`);
  blankLine();
  console.log(` ${c("bold", "Termination:")} ${terminationReason}`);

  if (duration !== undefined) {
    blankLine();
    console.log(` ${c("bold", "Duration:")} ${formatDuration(duration)}`);
  }
  blankLine();
  console.log(c("cyan", "\u2550".repeat(CLI_DEFAULTS.DIVIDER_WIDTH)));

  const showDetails = verbose && rounds.length > 0;
  if (showDetails) {
    printRoundsDetail(rounds);
  }
}
4324
/**
 * Average score of the last round.
 *
 * @param {object[]} rounds - Completed rounds, oldest first.
 * @returns {number} The last round's `report.summary.avgScore`, or 0 for no rounds.
 */
function getFinalScore(rounds) {
  const count = rounds.length;
  if (count === 0) {
    return 0;
  }
  const lastRound = rounds[count - 1];
  return lastRound.report.summary.avgScore;
}
4328
/**
 * Formats the score change between the first and the last round.
 *
 * @param {object[]} rounds - Completed rounds, oldest first.
 * @returns {string} `"N/A"` for no rounds, a dimmed note for a single round,
 *   a green `+x.x` for gains, a red `-x.x` for losses, `"0.0"` for no change.
 */
function getScoreChange(rounds) {
  const count = rounds.length;
  if (count < 1) {
    return "N/A";
  }
  const scoreOf = (round) => round.report.summary.avgScore;
  const delta = scoreOf(rounds[count - 1]) - scoreOf(rounds[0]);
  if (count === 1) {
    return c("dim", "N/A (first round)");
  }
  if (delta > 0) {
    return c("green", `+${delta.toFixed(1)}`);
  }
  return delta < 0 ? c("red", `${delta.toFixed(1)}`) : "0.0";
}
4343
/**
 * Formats a millisecond duration for display.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} `"<n>ms"` below one second, `"<n.n>s"` below one minute,
 *   otherwise `"<n.n>m"`.
 */
function formatDuration(ms) {
  const ONE_SECOND = 1e3;
  const ONE_MINUTE = 6e4;
  if (ms < ONE_SECOND) {
    return `${ms}ms`;
  }
  const [divisor, unit] = ms < ONE_MINUTE ? [ONE_SECOND, "s"] : [ONE_MINUTE, "m"];
  return `${(ms / divisor).toFixed(1)}${unit}`;
}
4348
/**
 * Prints two lines per round: score (with delta when it is not `null`) and
 * cost, then the number of generated versus applied suggestions.
 *
 * @param {object[]} rounds - Completed rounds with `round`, `report`,
 *   `scoreDelta`, `cost`, `suggestionsGenerated` and `suggestionsApproved`.
 */
function printRoundsDetail(rounds) {
  console.log();
  console.log(c("bold", " Round History:"));
  console.log();
  rounds.forEach((entry) => {
    const score = entry.report.summary.avgScore.toFixed(1);
    let delta = "";
    if (entry.scoreDelta !== null) {
      const sign = entry.scoreDelta >= 0 ? "+" : "";
      delta = ` (${sign}${entry.scoreDelta.toFixed(1)})`;
    }
    const cost = `$${entry.cost.total.toFixed(2)}`;
    console.log(
      ` Round ${entry.round}: Score ${score}${delta} | Cost ${cost}`
    );
    console.log(
      ` Suggestions: ${entry.suggestionsGenerated.length} generated, ${entry.suggestionsApproved.length} applied`
    );
  });
}
4364
+
4365
+ // src/cli/commands/improve.ts
4366
/**
 * Entry point of the `improve` CLI command.
 *
 * Validates options, loads the environment and config, builds the termination
 * conditions, judge and improver, then hands over to resume mode (when
 * `options.resume` is set) or fresh mode. Any error is printed and the
 * process exits with code 1.
 *
 * @param {string} configPath - Path of the eval config file.
 * @param {object} options - Parsed CLI options.
 * @returns {Promise<void>}
 */
async function improveCommand(configPath, options) {
  const startTime = Date.now();
  try {
    printBanner();
    validateImproveOptions(options);

    printProgress("Loading environment...");
    await loadEnvFile(options.envFile);
    printProgress("Loading configuration...");
    const config = await loadConfigWithDefaults(configPath);
    printProgress("Initializing providers...");
    const { mainProvider, judgeProvider, improverProvider } = initializeProviders(config, options);

    const conditions = buildTerminationConditions(options);
    if (conditions.length === 0) {
      throw new Error(
        "At least one termination condition is required.\nUse --target-score, --max-rounds, --max-cost, or --stale-rounds"
      );
    }

    const { prompt, criteria, passThreshold } = config.judge;
    const judge = createJudge({ provider: judgeProvider, prompt, criteria, passThreshold });

    if (!config.improver) {
      throw new Error(
        "Improver configuration is required for improvement cycles.\nAdd an `improver` section to your config file."
      );
    }
    const improver = createImprover({
      provider: improverProvider,
      prompt: config.improver.prompt
    });

    // Both modes take the same argument list; only the entry point differs.
    const runMode = options.resume ? runResumeMode : runFreshMode;
    await runMode(options, config, conditions, judge, improver, mainProvider, startTime);
  } catch (error) {
    printError(error instanceof Error ? error : new Error(String(error)));
    process.exit(1);
  }
}
4424
/**
 * Ensures the improve command knows where its history lives.
 *
 * @param {object} options - Parsed CLI options.
 * @throws {Error} When neither `options.history` nor `options.resume` is set.
 */
function validateImproveOptions(options) {
  const hasHistoryTarget = Boolean(options.history) || Boolean(options.resume);
  if (hasHistoryTarget) {
    return;
  }
  throw new Error(
    "--history <path> is required to save improvement history.\nOr use --resume <path> to continue from existing history."
  );
}
4431
/**
 * Translates CLI options into termination conditions for the improvement cycle.
 *
 * The target score is parsed as a floating-point number. Integer parsing
 * would truncate `85.5` to 85 and let an out-of-range `100.5` through as 100.
 * Round counts are still parsed as integers and the cost as a float.
 *
 * @param {object} options - Parsed CLI options (`targetScore`, `maxRounds`,
 *   `maxCost`, `staleRounds`); falsy options are skipped.
 * @returns {object[]} Conditions in the order target score, max rounds,
 *   max cost, stale rounds; empty when no option was given.
 * @throws {Error} When an option does not parse or is out of range.
 */
function buildTerminationConditions(options) {
  const conditions = [];
  if (options.targetScore) {
    const score = Number.parseFloat(options.targetScore);
    if (Number.isNaN(score) || score < 0 || score > 100) {
      throw new Error(`Invalid target score: ${options.targetScore}. Must be 0-100.`);
    }
    conditions.push(targetScore(score));
  }
  if (options.maxRounds) {
    const rounds = Number.parseInt(options.maxRounds, 10);
    if (Number.isNaN(rounds) || rounds < 1) {
      throw new Error(`Invalid max rounds: ${options.maxRounds}. Must be >= 1.`);
    }
    conditions.push(maxRounds(rounds));
  }
  if (options.maxCost) {
    const cost = Number.parseFloat(options.maxCost);
    if (Number.isNaN(cost) || cost <= 0) {
      throw new Error(`Invalid max cost: ${options.maxCost}. Must be > 0.`);
    }
    conditions.push(maxCost(cost));
  }
  if (options.staleRounds) {
    const rounds = Number.parseInt(options.staleRounds, 10);
    if (Number.isNaN(rounds) || rounds < 1) {
      throw new Error(`Invalid stale-rounds: ${options.staleRounds}. Must be >= 1.`);
    }
    conditions.push(noImprovement(rounds));
  }
  return conditions;
}
4463
/**
 * Creates a factory that clones `baseAgent` with a different prompt.
 *
 * @param {object} baseAgent - Agent whose own enumerable properties are copied.
 * @returns {(prompt: object) => object} Function returning a shallow copy of
 *   the agent with `prompt` replaced.
 */
function createAgentFactory(baseAgent) {
  const withPrompt = (prompt) => {
    const variant = { ...baseAgent, prompt };
    return variant;
  };
  return withPrompt;
}
4469
/**
 * Runs a new improvement cycle from the config's agent prompt, prints the
 * summary and exits the process with code 0.
 *
 * @param {object} options - Parsed CLI options (`history`, `concurrency`, `iterations`, `verbose`).
 * @param {object} config - Loaded eval config.
 * @param {object[]} conditions - Termination conditions.
 * @param {object} judge - Judge instance.
 * @param {object} improver - Improver instance.
 * @param {object} _mainProvider - Unused.
 * @param {number} startTime - `Date.now()` value taken when the command started.
 * @throws {Error} When the config declares no test cases.
 */
async function runFreshMode(options, config, conditions, judge, improver, _mainProvider, startTime) {
  printProgress("Starting improvement cycle...");
  const cases = config.testCases ?? [];
  if (cases.length === 0) {
    throw new Error(
      "No test cases found. Add testCases to your config or use include patterns."
    );
  }

  const historyOptions = options.history ? { path: options.history, autoSave: true } : undefined;
  // CLI flags win over the config's run settings.
  const runOptions = {
    concurrency: options.concurrency ? parseInt(options.concurrency, 10) : config.run?.concurrency,
    iterations: options.iterations ? parseInt(options.iterations, 10) : config.run?.iterations
  };
  const cycleConfig = {
    createAgent: createAgentFactory(config.agent),
    initialPrompt: config.agent.prompt,
    testCases: cases,
    judge,
    improver,
    terminateWhen: conditions,
    options: {
      pricingConfig: config.pricing,
      agentDescription: config.agentDescription,
      history: historyOptions,
      runOptions
    }
  };

  printProgress(`Running with ${cases.length} test case(s)...`);
  printProgress(`Termination: ${formatConditions(conditions)}`);
  console.log();

  const result = await runImprovementCycleAuto(cycleConfig);
  const duration = Date.now() - startTime;
  printImprovementSummary(result, { verbose: options.verbose, duration });
  if (options.history) {
    console.log(`\nHistory saved to: ${options.history}`);
  }
  process.exit(0);
}
4509
/**
 * Continues an improvement cycle from a stored history file, prints the
 * summary and exits the process with code 0.
 *
 * The resumed session is passed into the cycle so its session id and
 * accumulated rounds and cost carry over; the cycle starts from the history's
 * current prompt.
 *
 * @param {object} options - Parsed CLI options (`resume`, `concurrency`, `iterations`, `verbose`).
 * @param {object} config - Loaded eval config.
 * @param {object[]} conditions - Termination conditions.
 * @param {object} judge - Judge instance.
 * @param {object} improver - Improver instance.
 * @param {object} _mainProvider - Unused.
 * @param {number} startTime - `Date.now()` value taken when the command started.
 * @throws {Error} When the config declares no test cases.
 */
async function runResumeMode(options, config, conditions, judge, improver, _mainProvider, startTime) {
  const historyPath = options.resume;
  printProgress(`Resuming from ${historyPath}...`);
  const session = await resumeSession(historyPath, { autoSave: true });
  const resumedPrompt = deserializePrompt(session.history.currentPrompt);
  printProgress(`Resumed session ${session.sessionId}`);
  printProgress(`Continuing from round ${session.history.rounds.length + 1}`);

  const cases = config.testCases ?? [];
  if (cases.length === 0) {
    throw new Error(
      "No test cases found. Add testCases to your config or use include patterns."
    );
  }

  // CLI flags win over the config's run settings.
  const runOptions = {
    concurrency: options.concurrency ? parseInt(options.concurrency, 10) : config.run?.concurrency,
    iterations: options.iterations ? parseInt(options.iterations, 10) : config.run?.iterations
  };
  const cycleConfig = {
    createAgent: createAgentFactory(config.agent),
    initialPrompt: resumedPrompt,
    testCases: cases,
    judge,
    improver,
    terminateWhen: conditions,
    options: {
      pricingConfig: config.pricing,
      agentDescription: config.agentDescription,
      history: { path: options.resume, autoSave: true },
      session,
      runOptions
    }
  };

  printProgress(`Running with ${cases.length} test case(s)...`);
  printProgress(`Termination: ${formatConditions(conditions)}`);
  console.log();

  const result = await runImprovementCycleAuto(cycleConfig);
  const duration = Date.now() - startTime;
  printImprovementSummary(result, { verbose: options.verbose, duration });
  console.log(`\nHistory saved to: ${options.resume}`);
  process.exit(0);
}
4553
/**
 * Renders termination conditions as a human-readable string joined by " OR ".
 *
 * @param {Array<object>} conditions - Condition objects discriminated by `type`
 *   (`targetScore`, `maxRounds`, `maxCost`, `noImprovement`, `custom`).
 * @returns {string} One label per condition, joined by " OR "; empty string for an empty list.
 */
function formatConditions(conditions) {
  return conditions
    .map((c2) => {
      switch (c2.type) {
        case "targetScore":
          return `score >= ${c2.threshold}`;
        case "maxRounds":
          return `max ${c2.count} rounds`;
        case "maxCost":
          return `max $${c2.maxUSD}`;
        case "noImprovement":
          return `no improvement for ${c2.consecutiveRounds} rounds`;
        case "custom":
          return c2.description ?? "custom condition";
        default:
          // Without a fallback the callback returns undefined, which join()
          // renders as an empty segment (e.g. "score >= 90 OR ").
          return `unknown condition (${String(c2.type)})`;
      }
    })
    .join(" OR ");
}
4569
+
4570
+ // src/cli/commands/rollback.ts
4571
+ import { existsSync as existsSync6 } from "fs";
4572
+ import { mkdir as mkdir3, writeFile as writeFile4 } from "fs/promises";
4573
+ import { dirname as dirname2 } from "path";
4574
/**
 * CLI handler for `rollback`: extracts one prompt snapshot from a history
 * file and writes it to `options.output`.
 *
 * Any failure is reported through `printError` and the process exits with
 * code 1, so this function does not reject.
 *
 * @param {string} historyPath - Path of the history file to load.
 * @param {object} options - Parsed CLI options (`round`, `initial`, `output`, `format`).
 * @returns {Promise<void>}
 */
async function rollbackCommand(historyPath, options) {
  try {
    printBanner();
    validateRollbackOptions(historyPath, options);

    printProgress(`Loading history from ${historyPath}...`);
    const history = await loadHistory(historyPath);

    const snapshot = extractPromptSnapshot(history, options);
    const serializedPrompt = snapshot.prompt;
    printProgress(`Extracting ${snapshot.sourceLabel}...`);

    // JSON is the fallback when no format was supplied.
    const rendered = formatPromptOutput(serializedPrompt, options.format ?? "json");
    await writeOutputFile(options.output, rendered);

    console.log();
    console.log(` Prompt extracted to: ${options.output}`);
    console.log(` Prompt ID: ${serializedPrompt.id}`);
    console.log(` Version: ${serializedPrompt.version}`);
    console.log();
  } catch (error) {
    // Normalize non-Error throwables before reporting.
    const failure = error instanceof Error ? error : new Error(String(error));
    printError(failure);
    process.exit(1);
  }
}
4595
/**
 * Selects the prompt snapshot requested by the CLI options.
 *
 * @param {object} history - Loaded history (`initialPrompt`, `rounds`).
 * @param {object} options - CLI options; `initial` selects the initial prompt,
 *   otherwise `round` is parsed as a base-10 round number.
 * @returns {{prompt: *, sourceLabel: string}} The snapshot and a label describing its origin.
 */
function extractPromptSnapshot(history, options) {
  if (!options.initial) {
    const roundNumber = parseInt(options.round, 10);
    const prompt = extractPromptFromRound(history, roundNumber);
    return { prompt, sourceLabel: `round ${roundNumber}` };
  }
  return { prompt: history.initialPrompt, sourceLabel: "initial prompt" };
}
4608
/**
 * Validates the arguments of the `rollback` command.
 *
 * @param {string} historyPath - History file path; must be non-empty.
 * @param {object} options - CLI options (`round`, `initial`, `output`, `format`).
 * @throws {Error} When the history path or `--output` is missing, when neither
 *   or both of `--round` / `--initial` are given, when the round is not a
 *   whole number of 1 or greater, or when the format is not `json` or `ts`.
 */
function validateRollbackOptions(historyPath, options) {
  if (!historyPath) {
    throw new Error("History file path is required");
  }

  const hasRound = options.round !== void 0;
  const hasInitial = options.initial === true;
  if (!hasRound && !hasInitial) {
    throw new Error("Either --round <n> or --initial is required");
  }
  if (hasRound && hasInitial) {
    throw new Error("Cannot use both --round and --initial");
  }

  if (!options.output) {
    throw new Error("--output <path> is required");
  }

  if (hasRound) {
    // parseInt alone accepts partially numeric input ("1.5" -> 1, "2abc" -> 2),
    // which would silently select a different round than the user typed.
    // Require the whole value to be a plain decimal integer first.
    const rawRound = String(options.round).trim();
    const isWholeNumber = /^\+?\d+$/.test(rawRound);
    if (!isWholeNumber || parseInt(rawRound, 10) < 1) {
      throw new Error(`Invalid round number: ${options.round}. Must be 1 or greater.`);
    }
  }

  if (options.format && !["json", "ts"].includes(options.format)) {
    throw new Error(`Invalid format: ${options.format}. Use 'json' or 'ts'`);
  }
}
4633
/**
 * Returns the prompt snapshot stored for a 1-based round number.
 *
 * @param {object} history - Loaded history with a `rounds` array.
 * @param {number} roundNumber - 1-based round number.
 * @returns {*} The `promptSnapshot` of the selected round.
 * @throws {Error} When the round number falls outside the recorded rounds.
 */
function extractPromptFromRound(history, roundNumber) {
  const rounds = history.rounds;
  const total = rounds.length;
  const position = roundNumber - 1;

  if (position < 0 || position >= total) {
    let availableRounds = "none";
    if (total > 0) {
      availableRounds = `1-${total}`;
    }
    throw new Error(
      `Round ${roundNumber} not found. Available rounds: ${availableRounds}. Use --initial for the original prompt.`
    );
  }

  return rounds[position].promptSnapshot;
}
4643
/**
 * Serializes a prompt in the requested output format.
 *
 * @param {object} prompt - Serialized prompt to render.
 * @param {string} format - `"json"` for pretty-printed JSON; any other value
 *   produces TypeScript source via `generateTypeScriptPrompt`.
 * @returns {string} The rendered file content.
 */
function formatPromptOutput(prompt, format) {
  return format === "json"
    ? JSON.stringify(prompt, null, 2)
    : generateTypeScriptPrompt(prompt);
}
4649
/**
 * Generates TypeScript source that re-declares a serialized prompt as an
 * exported `AgentPrompt` constant.
 *
 * `system` and `userTemplate` are embedded as template literals (escaped via
 * `escapeTemplateString`); `id` and `version` are embedded as single-quoted
 * string literals and repeated in the header comment.
 *
 * @param {object} prompt - Serialized prompt (`id`, `version`, `system`,
 *   `userTemplate`, optional `customFields`).
 * @returns {string} TypeScript source text ending with a newline.
 */
function generateTypeScriptPrompt(prompt) {
  // Escapes a value for use inside a single-quoted string literal, so quotes,
  // backslashes or line breaks in an id/version cannot break the generated file.
  const toSingleQuoted = (value) =>
    String(value)
      .replace(/\\/g, "\\\\")
      .replace(/'/g, "\\'")
      .replace(/\r/g, "\\r")
      .replace(/\n/g, "\\n");

  // Keeps interpolated text from closing the generated block comment early
  // or spilling onto an uncommented line.
  const toCommentText = (value) =>
    String(value).replace(/\*\//g, "*\\/").replace(/\r?\n/g, " ");

  const escapedSystem = escapeTemplateString(prompt.system);
  const escapedUserTemplate = escapeTemplateString(prompt.userTemplate);
  const customFieldsComment = prompt.customFields
    ? `\n * Custom fields: ${toCommentText(Object.keys(prompt.customFields).join(", "))}`
    : "";

  // NOTE(review): the generated file imports from 'agent-eval'; confirm this
  // matches the module name consumers actually install.
  const lines = [
    "import { compileTemplate } from 'agent-eval'",
    "import type { AgentPrompt } from 'agent-eval'",
    "",
    "/**",
    " * Extracted from improvement cycle",
    ` * Original ID: ${toCommentText(prompt.id)}`,
    ` * Version: ${toCommentText(prompt.version)}${customFieldsComment}`,
    " */",
    "export const prompt: AgentPrompt<YourInputType> = {",
    `  id: '${toSingleQuoted(prompt.id)}',`,
    `  version: '${toSingleQuoted(prompt.version)}',`,
    `  system: \`${escapedSystem}\`,`,
    `  userTemplate: \`${escapedUserTemplate}\`,`,
    // The template is emitted twice on purpose: once as data, once compiled.
    `  renderUserPrompt: compileTemplate(\`${escapedUserTemplate}\`),`,
    "}",
    ""
  ];
  return lines.join("\n");
}
4671
/**
 * Escapes text so it can be placed between backticks in generated source.
 *
 * Every backslash, backtick, and `${` sequence is prefixed with a backslash.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text that is safe inside a template literal.
 */
function escapeTemplateString(str) {
  // One pass is enough: the inserted backslashes are never re-scanned, so
  // they cannot be escaped a second time.
  return str.replace(/\\|`|\$\{/g, (token) => "\\" + token);
}
4674
/**
 * Writes `content` to `path3` as UTF-8, creating the parent directory first
 * when it is missing.
 *
 * @param {string} path3 - Destination file path.
 * @param {string} content - Text to write.
 * @returns {Promise<void>}
 */
async function writeOutputFile(path3, content) {
  const parentDir = dirname2(path3);

  // "", "." and "/" are never created; any other directory is created only
  // when it does not exist yet.
  const isTrivialDir = !parentDir || parentDir === "." || parentDir === "/";
  const mustCreate = !isTrivialDir && !existsSync6(parentDir);
  if (mustCreate) {
    await mkdir3(parentDir, { recursive: true });
  }

  await writeFile4(path3, content, "utf-8");
}
4681
+
4682
+ // src/cli/index.ts
4683
// CLI wiring: registers the `run`, `improve` and `rollback` commands on the
// cac instance and parses the process arguments. Each action awaits its
// command handler; if the handler rejects, the process exits with code 1 and
// nothing further is printed here.
var VERSION = "0.1.0";
var cli = dist_default("agent-eval");

// agent-eval run [config]
cli
  .command("run [config]", "Run evaluation suite")
  .option("-o, --output <path>", "Output path for markdown report")
  .option("-e, --env-file <path>", "Path to env file", { default: ".env" })
  .option("-v, --verbose", "Enable verbose output")
  .option("-c, --concurrency <n>", "Concurrency level")
  .option("-i, --iterations <n>", "Number of iterations per test")
  .option("--no-report", "Skip saving markdown report")
  .option("--mock", "Use mock LLM for testing (no API calls)")
  .option("--include <pattern>", "Glob patterns for YAML files (can be repeated)")
  .option("--tags <tag>", "Filter test cases by tags, OR logic (can be repeated)")
  .option("--agent <name>", "Filter to specific agent name")
  .action(async (configPath, options) => {
    try {
      await runCommand(configPath, options);
    } catch {
      process.exit(1);
    }
  });

// agent-eval improve [config]
cli
  .command("improve [config]", "Run improvement cycle on prompts")
  .option("-e, --env-file <path>", "Path to env file", { default: ".env" })
  .option("--history <path>", "Path to save history JSON")
  .option("--target-score <n>", "Target score to reach (0-100)")
  .option("--max-rounds <n>", "Maximum improvement rounds")
  .option("--max-cost <usd>", "Maximum cost in USD")
  .option("--stale-rounds <n>", "Stop after N rounds without improvement")
  .option("--resume <path>", "Resume from existing history file")
  .option("-c, --concurrency <n>", "Concurrency level")
  .option("-i, --iterations <n>", "Iterations per test")
  .option("-v, --verbose", "Enable verbose output")
  .option("--mock", "Use mock LLM for testing (no API calls)")
  .action(async (configPath, options) => {
    try {
      await improveCommand(configPath, options);
    } catch {
      process.exit(1);
    }
  });

// agent-eval rollback <history>
cli
  .command("rollback <history>", "Extract prompt from improvement history")
  .option("-r, --round <n>", "Round number to extract (1, 2, ...)")
  .option("--initial", "Extract the initial prompt (before any improvements)")
  .option("-o, --output <path>", "Output file path")
  .option("-f, --format <type>", "Output format: json or ts", { default: "json" })
  .action(async (historyPath, options) => {
    try {
      await rollbackCommand(historyPath, options);
    } catch {
      process.exit(1);
    }
  });

cli.help();
cli.version(VERSION);
cli.parse();
4709
+ //# sourceMappingURL=cli.js.map