@agtlantis/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +198 -0
- package/LICENSE +21 -0
- package/README.md +496 -0
- package/dist/cli.js +4709 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +3998 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2738 -0
- package/dist/index.d.ts +2738 -0
- package/dist/index.js +3868 -0
- package/dist/index.js.map +1 -0
- package/package.json +101 -0
package/dist/cli.js
ADDED
|
@@ -0,0 +1,4709 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// ../../../../node_modules/.pnpm/cac@6.7.14/node_modules/cac/dist/index.mjs
|
|
4
|
+
import { EventEmitter } from "events";
|
|
5
|
+
/**
 * Coerces a value into an array.
 *
 * @param {*} any - Value to normalize.
 * @returns {Array} `[]` for null/undefined, the same array instance when
 *   `any` is already an array, otherwise a one-element array wrapping it.
 */
function toArr(any) {
  if (any == null) {
    return [];
  }
  if (Array.isArray(any)) {
    return any;
  }
  return [any];
}
|
|
8
|
+
/**
 * Stores one parsed flag value on the result object, coercing it by the
 * flag's declared type and accumulating repeated flags into an array.
 *
 * @param {object} out - Result object; must carry a `_` array.
 * @param {string} key - Flag name.
 * @param {*} val - Raw value (string, boolean, ...).
 * @param {{string: string[], boolean: string[]}} opts - Declared flag types.
 */
function toVal(out, key, val, opts) {
  // Read the previous value first, before any push onto out._ below.
  const previous = out[key];

  // Numeric-looking input becomes a number; anything else stays untouched.
  const numberOrRaw = (raw) => {
    const num = +raw;
    return num * 0 === 0 ? num : raw;
  };

  let next;
  if (opts.string.indexOf(key) !== -1) {
    next = val == null || val === true ? "" : String(val);
  } else if (typeof val === "boolean") {
    next = val;
  } else if (opts.boolean.indexOf(key) !== -1) {
    if (val === "false") {
      next = false;
    } else if (val === "true") {
      next = true;
    } else {
      // A boolean flag does not consume a value: the value is kept as a
      // positional argument and the flag itself reflects its truthiness.
      out._.push(numberOrRaw(val));
      next = !!val;
    }
  } else {
    next = numberOrRaw(val);
  }

  if (previous == null) {
    out[key] = next;
  } else if (Array.isArray(previous)) {
    out[key] = previous.concat(next);
  } else {
    out[key] = [previous, next];
  }
}
|
|
12
|
+
/**
 * Minimal argv parser (bundled `mri`).
 *
 * @param {string[]} [args] - Arguments to parse.
 * @param {object} [opts] - Parser options: `alias`, `string`, `boolean`,
 *   `default`, `unknown`. NOTE: this object is normalized in place.
 * @returns {object} Parsed flags plus positional arguments under `_`, or
 *   whatever `opts.unknown` returns when an unknown flag is met.
 */
function mri2(args, opts) {
  args = args || [];
  opts = opts || {};
  const out = { _: [] };
  const total = args.length;
  const hasAliases = opts.alias !== void 0;
  const strict = opts.unknown !== void 0;
  const hasDefaults = opts.default !== void 0;

  opts.alias = opts.alias || {};
  opts.string = toArr(opts.string);
  opts.boolean = toArr(opts.boolean);

  // Make the alias table symmetric: every name maps to all of its siblings.
  if (hasAliases) {
    for (const key in opts.alias) {
      const group = opts.alias[key] = toArr(opts.alias[key]);
      for (let n = 0; n < group.length; n++) {
        (opts.alias[group[n]] = group.concat(key)).splice(n, 1);
      }
    }
  }

  // Typed flags pass their type on to their aliases.
  for (const typeKey of ["boolean", "string"]) {
    const typed = opts[typeKey];
    for (let n = typed.length - 1; n >= 0; n--) {
      const aliases = opts.alias[typed[n]] || [];
      for (let m = aliases.length - 1; m >= 0; m--) {
        typed.push(aliases[m]);
      }
    }
  }

  // A default value's runtime type also declares the flag's type.
  if (hasDefaults) {
    for (const key in opts.default) {
      const kind = typeof opts.default[key];
      const aliases = opts.alias[key] = opts.alias[key] || [];
      if (opts[kind] !== void 0) {
        opts[kind].push(key);
        for (let n = 0; n < aliases.length; n++) {
          opts[kind].push(aliases[n]);
        }
      }
    }
  }

  const known = strict ? Object.keys(opts.alias) : [];

  for (let i = 0; i < total; i++) {
    const arg = args[i];

    // Everything after a bare "--" is positional.
    if (arg === "--") {
      out._ = out._.concat(args.slice(i + 1));
      break;
    }

    // Count leading dashes (char code 45).
    let dashes = 0;
    while (dashes < arg.length && arg.charCodeAt(dashes) === 45) {
      dashes++;
    }

    if (dashes === 0) {
      out._.push(arg);
      continue;
    }

    if (arg.substring(dashes, dashes + 3) === "no-") {
      const negatedName = arg.substring(dashes + 3);
      if (strict && known.indexOf(negatedName) === -1) {
        return opts.unknown(arg);
      }
      out[negatedName] = false;
      continue;
    }

    // Locate "=" (char code 61), searching from the second name character.
    let eq = dashes + 1;
    while (eq < arg.length && arg.charCodeAt(eq) !== 61) {
      eq++;
    }
    const rawName = arg.substring(dashes, eq);
    // Inline "=value" wins; otherwise take the next argument unless it is
    // missing or looks like another flag, in which case the value is `true`.
    const val = arg.substring(eq + 1) || (i + 1 === total || ("" + args[i + 1]).charCodeAt(0) === 45 || args[++i]);
    // Exactly two dashes name one flag; any other count is a group of
    // single-character flags, iterated by string index.
    const names = dashes === 2 ? [rawName] : rawName;
    for (let n = 0; n < names.length; n++) {
      const name = names[n];
      if (strict && known.indexOf(name) === -1) {
        return opts.unknown("-".repeat(dashes) + name);
      }
      // Only the last flag of a group receives the value; the rest are true.
      toVal(out, name, n + 1 < names.length || val, opts);
    }
  }

  if (hasDefaults) {
    for (const key in opts.default) {
      if (out[key] === void 0) {
        out[key] = opts.default[key];
      }
    }
  }

  // Mirror every parsed value onto its aliases (drains the alias arrays).
  if (hasAliases) {
    for (const key in out) {
      const aliases = opts.alias[key] || [];
      while (aliases.length > 0) {
        out[aliases.shift()] = out[key];
      }
    }
  }

  return out;
}
|
|
100
|
+
// Drops everything from the first "<" or "[" onward and trims the rest.
var removeBrackets = (v) => {
  const withoutArgs = v.replace(/[<[].+/, "");
  return withoutArgs.trim();
};
|
|
101
|
+
/**
 * Extracts `<required>` and `[optional]` argument descriptors from a raw
 * command name. All angled matches come first, then all square matches.
 *
 * @param {string} v - Raw command name, e.g. "deploy <env> [...files]".
 * @returns {{required: boolean, value: string, variadic: boolean}[]}
 */
var findAllBrackets = (v) => {
  const toDescriptor = (match) => {
    const inner = match[1];
    const variadic = inner.startsWith("...");
    return {
      required: match[0].startsWith("<"),
      value: variadic ? inner.slice(3) : inner,
      variadic
    };
  };
  const angled = Array.from(v.matchAll(/<([^>]+)>/g), (m) => toDescriptor(m));
  const squared = Array.from(v.matchAll(/\[([^\]]+)\]/g), (m) => toDescriptor(m));
  return angled.concat(squared);
};
|
|
128
|
+
/**
 * Builds the `alias`/`boolean` option tables handed to the argv parser.
 *
 * @param {Array} options - Option descriptors (`names`, `isBoolean`,
 *   `negated`, `required`).
 * @returns {{alias: Object<string, string[]>, boolean: string[]}}
 */
var getMriOptions = (options) => {
  const alias = {};
  const boolean = [];
  options.forEach((option, index) => {
    if (option.names.length > 1) {
      alias[option.names[0]] = option.names.slice(1);
    }
    if (!option.isBoolean) {
      return;
    }
    if (option.negated) {
      // A negated flag is not declared boolean when another option with a
      // shared name takes a value (its `required` is set to true or false).
      const sharesNameWithValueOption = options.some((other, i) => {
        return i !== index && other.names.some((name) => option.names.includes(name)) && typeof other.required === "boolean";
      });
      if (sharesNameWithValueOption) {
        return;
      }
    }
    boolean.push(option.names[0]);
  });
  return { alias, boolean };
};
|
|
149
|
+
/**
 * Returns the element with the greatest `length`.
 *
 * Implemented as a single pass so the caller's array is left untouched
 * (the previous `sort`-based version reordered it in place and relied on a
 * comparator that never reported equality).
 *
 * @param {Array<{length: number}>} arr - Strings (or other sized values).
 * @returns {*} The first longest element, or `undefined` for an empty array.
 */
var findLongest = (arr) => {
  let longest;
  let found = false;
  for (const item of arr) {
    if (!found || item.length > longest.length) {
      longest = item;
      found = true;
    }
  }
  return longest;
};
|
|
154
|
+
// Right-pads `str` with spaces up to `length`; longer strings are returned
// unchanged.
var padRight = (str, length) => {
  return str.padEnd(length);
};
|
|
157
|
+
/**
 * Converts kebab-case to camelCase ("foo-bar" -> "fooBar").
 *
 * A dash is only collapsed when it sits between two lowercase ASCII letters.
 * The preceding letter is matched with a lookbehind so it is not consumed;
 * this lets adjacent one-letter segments convert too ("a-b-c" -> "aBC"),
 * which the previous consuming pattern left half-converted ("aB-c").
 *
 * @param {string} input - Name to convert.
 * @returns {string} The camelCased name.
 */
var camelcase = (input) => {
  return input.replace(/(?<=[a-z])-([a-z])/g, (_, letter) => {
    return letter.toUpperCase();
  });
};
|
|
162
|
+
/**
 * Assigns `val` at the nested path `keys` inside `obj`, creating the
 * intermediate containers on the way (an array when the next key looks like
 * a non-negative number, otherwise a plain object).
 *
 * Hardening: key paths originate from command-line flags, so a path that
 * contains `__proto__` is ignored, and only OWN properties are followed while
 * walking. This keeps input such as `--__proto__.x` or
 * `--constructor.prototype.x` from writing onto shared prototypes.
 *
 * @param {object} obj - Root object, mutated in place.
 * @param {string[]} keys - Path segments.
 * @param {*} val - Value stored at the final segment.
 */
var setDotProp = (obj, keys, val) => {
  if (keys.includes("__proto__")) {
    return;
  }
  const hasOwn = Object.prototype.hasOwnProperty;
  const last = keys.length - 1;
  let target = obj;
  for (let i = 0; i <= last; ++i) {
    const key = keys[i];
    if (i === last) {
      target[key] = val;
      break;
    }
    // Inherited members (e.g. `constructor`) are treated as absent.
    const existing = hasOwn.call(target, key) ? target[key] : void 0;
    if (existing != null) {
      target = existing;
      continue;
    }
    const nextKey = keys[i + 1];
    const needsObject = nextKey.includes(".") || !(+nextKey > -1);
    target = target[key] = needsObject ? {} : [];
  }
};
|
|
172
|
+
/**
 * Applies array-type transforms to option values in place: each flagged key
 * is wrapped into an array and, when a transform function is supplied, every
 * element is mapped through it.
 *
 * @param {object} obj - Options object, mutated in place.
 * @param {object} transforms - Map of key -> `{ shouldTransform,
 *   transformFunction }`.
 */
var setByType = (obj, transforms) => {
  for (const key of Object.keys(transforms)) {
    const { shouldTransform, transformFunction } = transforms[key];
    if (!shouldTransform) {
      continue;
    }
    const wrapped = [].concat(obj[key]);
    obj[key] = wrapped;
    if (typeof transformFunction === "function") {
      obj[key] = wrapped.map(transformFunction);
    }
  }
};
|
|
183
|
+
// Returns the last path segment (after the final "/" or "\"), or "" when the
// input ends with a separator or is empty.
var getFileName = (input) => {
  const match = /([^\\\/]+)$/.exec(input);
  if (!match) {
    return "";
  }
  return match[1];
};
|
|
187
|
+
// Camel-cases only the first dot-separated segment of an option name; any
// nested segments after it are kept verbatim.
var camelcaseOptionName = (name) => {
  const [head, ...rest] = name.split(".");
  return [camelcase(head), ...rest].join(".");
};
|
|
192
|
+
/**
 * Error type thrown by the CLI framework for usage problems (unknown
 * options, missing values, missing required arguments).
 */
var CACError = class extends Error {
  /**
   * @param {string} message - Human-readable description.
   */
  constructor(message) {
    super(message);
    this.name = this.constructor.name;
    if (typeof Error.captureStackTrace !== "function") {
      // Engines without captureStackTrace: borrow a stack from a fresh Error.
      this.stack = new Error(message).stack;
      return;
    }
    Error.captureStackTrace(this, this.constructor);
  }
};
|
|
203
|
+
/**
 * One command-line option parsed from a declaration such as
 * "-o, --out <dir>" or "--no-cache".
 */
var Option = class {
  /**
   * @param {string} rawName - Declaration string (flags plus optional
   *   `<value>` / `[value]` placeholder).
   * @param {string} description - Help text.
   * @param {object} [config] - Option config (e.g. `default`, `type`);
   *   shallow-copied.
   */
  constructor(rawName, description, config) {
    this.rawName = rawName;
    this.description = description;
    this.config = Object.assign({}, config);

    // ".*" suffixes are dropped before the names are derived.
    const cleaned = rawName.replace(/\.\*/g, "");
    this.negated = false;

    const parsedNames = [];
    for (const part of removeBrackets(cleaned).split(",")) {
      let name = part.trim().replace(/^-{1,2}/, "");
      if (name.startsWith("no-")) {
        this.negated = true;
        name = name.replace(/^no-/, "");
      }
      parsedNames.push(camelcaseOptionName(name));
    }
    // Shortest first; the final (longest) entry becomes the canonical name.
    this.names = parsedNames.sort((a, b) => a.length > b.length ? 1 : -1);
    this.name = this.names[this.names.length - 1];

    if (this.negated && this.config.default == null) {
      this.config.default = true;
    }

    if (cleaned.includes("<")) {
      this.required = true;
    } else if (cleaned.includes("[")) {
      this.required = false;
    } else {
      this.isBoolean = true;
    }
  }
};
|
|
231
|
+
// Process argv; used as the default `argv` of CAC#parse.
var processArgs = process.argv;
|
|
232
|
+
// Platform/arch/Node version string appended to the version output.
var platformInfo = `${process.platform}-${process.arch} node-${process.version}`;
|
|
233
|
+
/**
 * A CLI command: its declaration, options, aliases, examples and action,
 * plus help/version output and pre-run validation.
 */
var Command = class {
  /**
   * @param {string} rawName - Declaration, e.g. "build <entry> [...files]".
   * @param {string} description - Help text.
   * @param {object} [config] - Command config (stored by reference).
   * @param {object} cli2 - Owning CLI instance.
   */
  constructor(rawName, description, config = {}, cli2) {
    this.rawName = rawName;
    this.description = description;
    this.config = config;
    this.cli = cli2;
    this.options = [];
    this.aliasNames = [];
    this.name = removeBrackets(rawName);
    this.args = findAllBrackets(rawName);
    this.examples = [];
  }

  /** Overrides the usage line shown in help. */
  usage(text) {
    this.usageText = text;
    return this;
  }

  /** Disables the unknown-option check for this command. */
  allowUnknownOptions() {
    this.config.allowUnknownOptions = true;
    return this;
  }

  /** Makes parsing skip option default values for this command. */
  ignoreOptionDefaultValue() {
    this.config.ignoreOptionDefaultValue = true;
    return this;
  }

  /** Records the version number and registers the version flag(s). */
  version(version, customFlags = "-v, --version") {
    this.versionNumber = version;
    this.option(customFlags, "Display version number");
    return this;
  }

  /** Adds a help example (a string, or a function receiving the CLI name). */
  example(example) {
    this.examples.push(example);
    return this;
  }

  /** Registers an option on this command. */
  option(rawName, description, config) {
    this.options.push(new Option(rawName, description, config));
    return this;
  }

  /** Adds an alternative name for this command. */
  alias(name) {
    this.aliasNames.push(name);
    return this;
  }

  /** Sets the callback invoked when this command runs. */
  action(callback) {
    this.commandAction = callback;
    return this;
  }

  /** True when `name` equals the command name or one of its aliases. */
  isMatched(name) {
    return this.name === name || this.aliasNames.includes(name);
  }

  get isDefaultCommand() {
    return this.name === "" || this.aliasNames.includes("!");
  }

  get isGlobalCommand() {
    return this instanceof GlobalCommand;
  }

  /**
   * Looks up an option by name (only the segment before the first dot).
   * @returns {object|undefined} The matching option, if any.
   */
  hasOption(name) {
    const [root] = name.split(".");
    return this.options.find((option) => option.names.includes(root));
  }

  /** Prints the help text for this command to stdout. */
  outputHelp() {
    const { name, commands } = this.cli;
    const {
      versionNumber,
      options: globalOptions,
      helpCallback
    } = this.cli.globalCommand;

    let sections = [
      { body: `${name}${versionNumber ? `/${versionNumber}` : ""}` },
      { title: "Usage", body: ` $ ${name} ${this.usageText || this.rawName}` }
    ];

    const showCommands = (this.isGlobalCommand || this.isDefaultCommand) && commands.length > 0;
    if (showCommands) {
      const longestCommandName = findLongest(commands.map((command) => command.rawName));
      const commandLines = commands.map((command) => {
        return ` ${padRight(command.rawName, longestCommandName.length)} ${command.description}`;
      });
      const helpHints = commands.map((command) => ` $ ${name}${command.name === "" ? "" : ` ${command.name}`} --help`);
      sections.push({
        title: "Commands",
        body: commandLines.join("\n")
      });
      sections.push({
        title: `For more info, run any command with the \`--help\` flag`,
        body: helpHints.join("\n")
      });
    }

    let options = this.isGlobalCommand ? globalOptions : [...this.options, ...globalOptions || []];
    if (!this.isGlobalCommand && !this.isDefaultCommand) {
      // The version flag is only listed on the global/default help.
      options = options.filter((option) => option.name !== "version");
    }
    if (options.length > 0) {
      const longestOptionName = findLongest(options.map((option) => option.rawName));
      const optionLines = options.map((option) => {
        return ` ${padRight(option.rawName, longestOptionName.length)} ${option.description} ${option.config.default === void 0 ? "" : `(default: ${option.config.default})`}`;
      });
      sections.push({
        title: "Options",
        body: optionLines.join("\n")
      });
    }

    if (this.examples.length > 0) {
      const exampleLines = this.examples.map((example) => {
        return typeof example === "function" ? example(name) : example;
      });
      sections.push({
        title: "Examples",
        body: exampleLines.join("\n")
      });
    }

    if (helpCallback) {
      sections = helpCallback(sections) || sections;
    }

    const rendered = sections.map((section) => {
      return section.title ? `${section.title}:
${section.body}` : section.body;
    });
    console.log(rendered.join("\n\n"));
  }

  /** Prints "<name>/<version> <platform info>" when a version is set. */
  outputVersion() {
    const { name } = this.cli;
    const { versionNumber } = this.cli.globalCommand;
    if (!versionNumber) {
      return;
    }
    console.log(`${name}/${versionNumber} ${platformInfo}`);
  }

  /** @throws {CACError} When fewer positional args than required were given. */
  checkRequiredArgs() {
    const minimalArgsCount = this.args.filter((arg) => arg.required).length;
    if (this.cli.args.length >= minimalArgsCount) {
      return;
    }
    throw new CACError(`missing required args for command \`${this.rawName}\``);
  }

  /** @throws {CACError} On the first parsed option unknown to this command and the global command. */
  checkUnknownOptions() {
    const { options, globalCommand } = this.cli;
    if (this.config.allowUnknownOptions) {
      return;
    }
    for (const name of Object.keys(options)) {
      const isKnown = name === "--" || this.hasOption(name) || globalCommand.hasOption(name);
      if (!isKnown) {
        throw new CACError(`Unknown option \`${name.length > 1 ? `--${name}` : `-${name}`}\``);
      }
    }
  }

  /** @throws {CACError} When an option declared with `<value>` was given without one. */
  checkOptionValue() {
    const { options: parsedOptions, globalCommand } = this.cli;
    const options = [...globalCommand.options, ...this.options];
    for (const option of options) {
      const value = parsedOptions[option.name.split(".")[0]];
      if (!option.required) {
        continue;
      }
      const hasNegated = options.some((o) => o.negated && o.names.includes(option.name));
      // `true` means the flag appeared bare; `false` is only acceptable when
      // a matching negated option exists.
      if (value === true || value === false && !hasNegated) {
        throw new CACError(`option \`${option.rawName}\` value is missing`);
      }
    }
  }
};
|
|
393
|
+
/**
 * Pseudo-command that holds the CLI-wide options; it is registered under the
 * reserved name "@@global@@".
 */
var GlobalCommand = class extends Command {
  /**
   * @param {object} cli2 - Owning CLI instance.
   */
  constructor(cli2) {
    const emptyConfig = {};
    super("@@global@@", "", emptyConfig, cli2);
  }
};
|
|
398
|
+
// Local alias of Object.assign, used for object merging in CAC#parse and CAC#mri.
var __assign = Object.assign;
|
|
399
|
+
/**
 * CLI application: registers commands and global options, parses argv,
 * prints help/version and dispatches to the matched command's action.
 */
var CAC = class extends EventEmitter {
  /**
   * @param {string} [name] - Program name shown in help; when empty it is
   *   derived from argv[1] during parse().
   */
  constructor(name = "") {
    super();
    this.name = name;
    this.commands = [];
    this.rawArgs = [];
    this.args = [];
    this.options = {};
    this.globalCommand = new GlobalCommand(this);
    this.globalCommand.usage("<command> [options]");
  }

  /** Sets the global usage line. */
  usage(text) {
    this.globalCommand.usage(text);
    return this;
  }

  /**
   * Registers a command.
   * @returns {object} The new Command (not the CLI), for chaining.
   */
  command(rawName, description, config) {
    const created = new Command(rawName, description || "", config, this);
    created.globalCommand = this.globalCommand;
    this.commands.push(created);
    return created;
  }

  /** Registers a global option. */
  option(rawName, description, config) {
    this.globalCommand.option(rawName, description, config);
    return this;
  }

  /** Enables "-h, --help"; `callback` may rewrite the help sections. */
  help(callback) {
    this.globalCommand.option("-h, --help", "Display this message");
    this.globalCommand.helpCallback = callback;
    this.showHelpOnExit = true;
    return this;
  }

  /** Enables the version flag(s) and records the version number. */
  version(version, customFlags = "-v, --version") {
    this.globalCommand.version(version, customFlags);
    this.showVersionOnExit = true;
    return this;
  }

  /** Adds a global help example. */
  example(example) {
    this.globalCommand.example(example);
    return this;
  }

  /** Prints help for the matched command, or global help when none matched. */
  outputHelp() {
    const target = this.matchedCommand ? this.matchedCommand : this.globalCommand;
    target.outputHelp();
  }

  /** Prints the version line. */
  outputVersion() {
    this.globalCommand.outputVersion();
  }

  /** Stores the parse result and, when given, the matched command and name. */
  setParsedInfo({ args, options }, matchedCommand, matchedCommandName) {
    this.args = args;
    this.options = options;
    if (matchedCommand) {
      this.matchedCommand = matchedCommand;
    }
    if (matchedCommandName) {
      this.matchedCommandName = matchedCommandName;
    }
    return this;
  }

  /** Clears the matched command state. */
  unsetMatchedCommand() {
    this.matchedCommand = void 0;
    this.matchedCommandName = void 0;
  }

  /**
   * Parses argv, resolves the matching command and (unless disabled) runs it.
   *
   * @param {string[]} [argv] - Full argv; the first two entries are skipped.
   * @param {{run?: boolean}} [options] - Pass `run: false` to only parse.
   * @returns {{args: Array, options: object}} The parsed arguments.
   */
  parse(argv = processArgs, {
    run = true
  } = {}) {
    this.rawArgs = argv;
    if (!this.name) {
      this.name = argv[1] ? getFileName(argv[1]) : "cli";
    }

    let shouldParse = true;

    // Pass 1: commands matched by name or alias.
    for (const command of this.commands) {
      const parsed = this.mri(argv.slice(2), command);
      const commandName = parsed.args[0];
      if (!command.isMatched(commandName)) {
        continue;
      }
      shouldParse = false;
      const parsedInfo = __assign(__assign({}, parsed), {
        args: parsed.args.slice(1)
      });
      this.setParsedInfo(parsedInfo, command, commandName);
      this.emit(`command:${commandName}`, command);
    }

    // Pass 2: fall back to commands declared with an empty name.
    if (shouldParse) {
      for (const command of this.commands) {
        if (command.name !== "") {
          continue;
        }
        shouldParse = false;
        const parsed = this.mri(argv.slice(2), command);
        this.setParsedInfo(parsed, command);
        this.emit(`command:!`, command);
      }
    }

    // Pass 3: no command at all; parse against the global options only.
    if (shouldParse) {
      const parsed = this.mri(argv.slice(2));
      this.setParsedInfo(parsed);
    }

    if (this.options.help && this.showHelpOnExit) {
      this.outputHelp();
      run = false;
      this.unsetMatchedCommand();
    }
    if (this.options.version && this.showVersionOnExit && this.matchedCommandName == null) {
      this.outputVersion();
      run = false;
      this.unsetMatchedCommand();
    }

    const parsedArgv = { args: this.args, options: this.options };
    if (run) {
      this.runMatchedCommand();
    }
    if (!this.matchedCommand && this.args[0]) {
      this.emit("command:*");
    }
    return parsedArgv;
  }

  /**
   * Runs the argv parser with the global options plus `command`'s options.
   *
   * @param {string[]} argv - Arguments without the node/script entries.
   * @param {object} [command] - Command whose options should be included.
   * @returns {{args: Array, options: object}}
   */
  mri(argv, command) {
    const cliOptions = [
      ...this.globalCommand.options,
      ...command ? command.options : []
    ];
    const mriOptions = getMriOptions(cliOptions);

    // Arguments after "--" are kept aside under options["--"].
    let argsAfterDoubleDashes = [];
    const doubleDashesIndex = argv.indexOf("--");
    if (doubleDashesIndex > -1) {
      argsAfterDoubleDashes = argv.slice(doubleDashesIndex + 1);
      argv = argv.slice(0, doubleDashesIndex);
    }

    const rawParsed = mri2(argv, mriOptions);
    const parsed = Object.keys(rawParsed).reduce((res, name) => {
      return __assign(__assign({}, res), {
        [camelcaseOptionName(name)]: rawParsed[name]
      });
    }, { _: [] });

    const args = parsed._;
    const options = {
      "--": argsAfterDoubleDashes
    };

    const ignoreDefault = command && command.config.ignoreOptionDefaultValue ? command.config.ignoreOptionDefaultValue : this.globalCommand.config.ignoreOptionDefaultValue;

    const transforms = /* @__PURE__ */ Object.create(null);
    for (const cliOption of cliOptions) {
      if (!ignoreDefault && cliOption.config.default !== void 0) {
        for (const name of cliOption.names) {
          options[name] = cliOption.config.default;
        }
      }
      // An array `type` marks the option as list-valued; its first entry is
      // used as the per-item transform. The first declaration per name wins.
      if (Array.isArray(cliOption.config.type) && transforms[cliOption.name] === void 0) {
        const transform = /* @__PURE__ */ Object.create(null);
        transform["shouldTransform"] = true;
        transform["transformFunction"] = cliOption.config.type[0];
        transforms[cliOption.name] = transform;
      }
    }

    for (const key of Object.keys(parsed)) {
      if (key === "_") {
        continue;
      }
      setDotProp(options, key.split("."), parsed[key]);
      setByType(options, transforms);
    }

    return {
      args,
      options
    };
  }

  /**
   * Validates and invokes the matched command's action with its positional
   * arguments followed by the options object.
   *
   * @returns {*} The action's return value, or undefined when no command or
   *   no action is set.
   */
  runMatchedCommand() {
    const { args, options, matchedCommand: command } = this;
    if (!command || !command.commandAction) {
      return;
    }
    command.checkUnknownOptions();
    command.checkOptionValue();
    command.checkRequiredArgs();

    // A variadic placeholder receives every argument from its index onward.
    const actionArgs = command.args.map((arg, index) => {
      return arg.variadic ? args.slice(index) : args[index];
    });
    actionArgs.push(options);
    return command.commandAction.apply(this, actionArgs);
  }
};
|
|
586
|
+
// Factory: creates a new CAC instance with the given program name.
var cac = (name = "") => new CAC(name);
|
|
587
|
+
// Alias of the `cac` factory under the bundled module's default-export name.
var dist_default = cac;
|
|
588
|
+
|
|
589
|
+
// src/cli/config/types.ts
|
|
590
|
+
/**
 * Tells whether a test case carries a multi-turn configuration.
 *
 * @param {object} testCase2 - Test case to inspect.
 * @returns {boolean} True when `multiTurn` is present and not undefined.
 */
function isMultiTurnConfig(testCase2) {
  if (!("multiTurn" in testCase2)) {
    return false;
  }
  return testCase2.multiTurn !== void 0;
}
|
|
593
|
+
|
|
594
|
+
// src/cli/config/schema.ts
|
|
595
|
+
import { z } from "zod";
|
|
596
|
+
|
|
597
|
+
// src/core/errors.ts
|
|
598
|
+
/**
 * Error type for the evaluation package, carrying a machine-readable `code`,
 * an optional underlying `cause` and free-form `context`.
 */
var EvalError = class _EvalError extends Error {
  code;
  cause;
  context;
  /**
   * @param {string} message - Human-readable description.
   * @param {{code?: string, cause?: Error, context?: object}} [options] -
   *   Error details. May be omitted; the fields then stay undefined instead
   *   of the constructor failing with a TypeError.
   */
  constructor(message, options) {
    super(message);
    const { code, cause, context } = options ?? {};
    this.name = "EvalError";
    this.code = code;
    this.cause = cause;
    this.context = context;
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _EvalError);
    }
  }
  /**
   * Creates an EvalError from an unknown error with a specific code.
   * An existing EvalError is returned unchanged; any other value is wrapped
   * (non-Error values are converted with String()).
   */
  static from(error, code, context) {
    if (error instanceof _EvalError) {
      return error;
    }
    const cause = error instanceof Error ? error : new Error(String(error));
    return new _EvalError(cause.message, { code, cause, context });
  }
  /**
   * Serializable view of the error; `cause` is reduced to its message.
   */
  toJSON() {
    return {
      name: this.name,
      message: this.message,
      code: this.code,
      context: this.context,
      cause: this.cause?.message
    };
  }
};
|
|
632
|
+
|
|
633
|
+
// src/cli/config/schema.ts
|
|
634
|
+
// Schema for LLM client settings: the provider is mandatory, the rest optional.
var llmConfigSchema = z.object({
  provider: z.enum(["openai", "gemini"], {
    // Replace zod's generic enum message with a direct hint.
    errorMap: () => {
      return { message: "provider must be 'openai' or 'gemini'" };
    }
  }),
  apiKey: z.string().optional(),
  defaultModel: z.string().optional(),
  reasoningEffort: z.enum(["minimal", "low", "medium", "high"]).optional(),
  defaultResponseFormat: z.object({ type: z.enum(["json_object", "text"]) }).optional()
});
|
|
647
|
+
// One evaluation criterion; id, name and description must be non-empty.
var criterionSchema = z.object({
  id: z.string().min(1, "Criterion id is required"),
  name: z.string().min(1, "Criterion name is required"),
  description: z.string().min(1, "Criterion description is required"),
  weight: z.number().positive().optional(),
  validator: z.function().optional()
});

// Judge settings: at least one criterion; passThreshold is bounded to 0-100.
var judgeConfigSchema = z.object({
  llm: llmConfigSchema.optional(),
  criteria: z.array(criterionSchema).min(1, "At least one criterion is required"),
  passThreshold: z.number().min(0).max(100).optional(),
  prompt: z.any().optional()
});

// Improver settings; the whole section may be omitted.
var improverConfigSchema = z.object({
  llm: llmConfigSchema.optional(),
  prompt: z.any().optional()
}).optional();

// Output settings; the whole section may be omitted.
var outputConfigSchema = z.object({
  dir: z.string().optional(),
  filename: z.string().optional(),
  verbose: z.boolean().optional()
}).optional();

// Run settings; counts must be positive integers. The section may be omitted.
var runConfigSchema = z.object({
  concurrency: z.number().int().positive().optional(),
  iterations: z.number().int().positive().optional(),
  stopOnFirstFailure: z.boolean().optional()
}).optional();
|
|
674
|
+
// Termination condition variants, distinguished by their literal `type`.
var maxTurnsConditionSchema = z.object({
  type: z.literal("maxTurns"),
  count: z.number().int().positive()
});

var fieldSetConditionSchema = z.object({
  type: z.literal("fieldSet"),
  fieldPath: z.string().min(1)
});

var fieldValueConditionSchema = z.object({
  type: z.literal("fieldValue"),
  fieldPath: z.string().min(1),
  expectedValue: z.unknown()
});

var customConditionSchema = z.object({
  type: z.literal("custom"),
  check: z.function(),
  description: z.string().optional()
});

// Any one of the four termination condition variants above.
var terminationConditionSchema = z.union([
  maxTurnsConditionSchema,
  fieldSetConditionSchema,
  fieldValueConditionSchema,
  customConditionSchema
]);

// A follow-up input of a multi-turn test case.
var followUpInputSchema = z.object({
  input: z.unknown(),
  description: z.string().optional(),
  turns: z.number().optional()
});

// Multi-turn settings; at least one termination condition is mandatory.
var multiTurnConfigSchema = z.object({
  followUpInputs: z.array(followUpInputSchema).optional(),
  terminateWhen: z.array(terminationConditionSchema).min(1, "At least one termination condition is required"),
  maxTurns: z.number().int().positive().optional(),
  onConditionMet: z.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z.enum(["pass", "fail"]).optional()
});

// A single test case; `input` accepts any value, `multiTurn` is optional.
var testCaseSchema = z.object({
  id: z.string().optional(),
  input: z.unknown(),
  tags: z.array(z.string()).optional(),
  description: z.string().optional(),
  expectedOutput: z.unknown().optional(),
  files: z.array(z.any()).optional(),
  multiTurn: multiTurnConfigSchema.optional()
});
|
|
719
|
+
// Shape of an agent under evaluation: config metadata, a prompt definition
// and an `execute` function.
var agentSchema = z.object({
  config: z.object({
    name: z.string(),
    description: z.string().optional()
  }),
  prompt: z.object({
    id: z.string(),
    version: z.string(),
    system: z.string(),
    renderUserPrompt: z.function()
  }),
  execute: z.function()
});

// Top-level evaluation config. On top of the per-field rules, the refinement
// requires a non-empty `testCases` or a non-empty `include`.
var evalConfigSchema = z.object({
  name: z.string().optional(),
  agentDescription: z.string().optional(),
  agent: agentSchema,
  llm: llmConfigSchema,
  judge: judgeConfigSchema,
  improver: improverConfigSchema,
  testCases: z.array(testCaseSchema).optional(),
  output: outputConfigSchema,
  run: runConfigSchema,
  include: z.array(z.string().min(1, "Include pattern cannot be empty")).min(1, "Include array must have at least one pattern").optional(),
  agents: z.record(z.string(), agentSchema).optional()
}).refine(
  (data) => {
    const testCaseCount = data.testCases?.length ?? 0;
    const includeCount = data.include?.length ?? 0;
    return testCaseCount > 0 || includeCount > 0;
  },
  {
    message: "Either testCases or include must be provided. Use testCases for inline TypeScript tests, or include for YAML file discovery.",
    // The issue is reported on the `testCases` path.
    path: ["testCases"]
  }
);
|
|
755
|
+
/**
 * Validates a loaded configuration object against the eval config schema.
 *
 * @param {*} config - Candidate configuration.
 * @returns {object} The parsed configuration data.
 * @throws {EvalError} With code "INVALID_CONFIG" and one bullet line per
 *   schema issue when validation fails.
 */
function validateConfig(config) {
  const result = evalConfigSchema.safeParse(config);
  if (result.success) {
    return result.data;
  }
  const lines = [];
  for (const issue of result.error.issues) {
    const path3 = issue.path.join(".");
    // Root-level issues have an empty path and are listed without a prefix.
    lines.push(path3 ? ` - ${path3}: ${issue.message}` : ` - ${issue.message}`);
  }
  const errors = lines.join("\n");
  throw new EvalError(`Invalid configuration:
${errors}`, {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */
  });
}
|
|
769
|
+
|
|
770
|
+
// src/cli/config/loader.ts
|
|
771
|
+
import { existsSync } from "fs";
|
|
772
|
+
import { resolve, extname } from "path";
|
|
773
|
+
import { pathToFileURL } from "url";
|
|
774
|
+
import { bundleRequire } from "bundle-require";
|
|
775
|
+
import fg from "fast-glob";
|
|
776
|
+
/**
 * Error raised while locating or loading a configuration file.
 */
var ConfigError = class extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} code - Machine-readable code, e.g. "CONFIG_NOT_FOUND".
   * @param {object} [context] - Extra details such as the resolved path.
   */
  constructor(message, code, context) {
    super(message);
    Object.assign(this, { code, context, name: "ConfigError" });
  }
};
|
|
784
|
+
// Config file name used when no explicit path is given.
var DEFAULT_CONFIG_FILE = "agent-eval.config.ts";
|
|
785
|
+
// Config file extensions accepted by loadConfig (compared in lower case).
var SUPPORTED_EXTENSIONS = [".ts", ".mts", ".cts", ".js", ".mjs", ".cjs"];
|
|
786
|
+
/**
 * Resolves a config path to an absolute path.
 *
 * @param {string} [configPath] - Config file path; defaults to
 *   DEFAULT_CONFIG_FILE.
 * @param {string} [cwd] - Base directory; defaults to process.cwd().
 * @returns {string} The absolute path.
 */
function resolveConfigPath(configPath = DEFAULT_CONFIG_FILE, cwd = process.cwd()) {
  const absolutePath = resolve(cwd, configPath);
  return absolutePath;
}
|
|
789
|
+
/**
 * Loads and validates an eval config file.
 *
 * TypeScript files (.ts/.mts/.cts) are loaded through `bundleRequire`; other
 * supported extensions are loaded with a native dynamic `import()`.
 *
 * @param {string} configPath - Path to the config file, resolved against `process.cwd()`.
 * @returns {Promise<object>} The module's default export (or the module itself when it has none).
 * @throws {ConfigError} CONFIG_NOT_FOUND, CONFIG_LOAD_ERROR, CONFIG_NO_DEFAULT_EXPORT, or CONFIG_VALIDATION_ERROR.
 */
async function loadConfig(configPath) {
  const absolutePath = resolve(process.cwd(), configPath);
  const fileExists = existsSync(absolutePath);
  if (!fileExists) {
    throw new ConfigError(
      `Config file not found: ${configPath}

Create an ${DEFAULT_CONFIG_FILE} file or specify a path:
npx agent-eval run ./path/to/config.ts`,
      "CONFIG_NOT_FOUND",
      { path: absolutePath }
    );
  }

  const ext = extname(absolutePath).toLowerCase();
  if (!SUPPORTED_EXTENSIONS.includes(ext)) {
    throw new ConfigError(
      `Unsupported config file extension: ${ext}
Supported extensions: ${SUPPORTED_EXTENSIONS.join(", ")}`,
      "CONFIG_LOAD_ERROR",
      { path: absolutePath, extension: ext }
    );
  }

  const isTypeScript = [".ts", ".mts", ".cts"].includes(ext);
  let mod;
  try {
    if (isTypeScript) {
      const bundled = await bundleRequire({
        filepath: absolutePath,
        format: "esm",
        esbuildOptions: { sourcemap: "inline" }
      });
      mod = bundled.mod;
    } else {
      mod = await import(pathToFileURL(absolutePath).href);
    }
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new ConfigError(
      `Failed to load config file: ${configPath}

Error: ${reason}

Make sure the file is valid TypeScript/JavaScript and has no syntax errors.`,
      "CONFIG_LOAD_ERROR",
      { path: absolutePath, originalError: reason }
    );
  }

  // Prefer the default export; fall back to the module namespace itself.
  const config = "default" in mod ? mod.default : mod;
  const isObject = Boolean(config) && typeof config === "object";
  if (!isObject) {
    throw new ConfigError(
      `Config file must export a default configuration object.

Example:
import { defineConfig } from '@agtlantis/eval'
export default defineConfig({ ... })`,
      "CONFIG_NO_DEFAULT_EXPORT",
      { path: absolutePath }
    );
  }

  try {
    // Validation is a gate only: the loaded object is returned, not the parsed data.
    validateConfig(config);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new ConfigError(reason, "CONFIG_VALIDATION_ERROR", { path: absolutePath });
  }
  return config;
}
|
|
859
|
+
/**
 * Resolves the config path (applying `resolveConfigPath` defaults) and loads it.
 *
 * @param {string} [configPath] - Config path; defaulting is handled by `resolveConfigPath`.
 * @param {string} [cwd] - Base directory; defaulting is handled by `resolveConfigPath`.
 * @returns {Promise<object>} The loaded configuration.
 */
async function loadConfigWithDefaults(configPath, cwd) {
  return loadConfig(resolveConfigPath(configPath, cwd));
}
|
|
863
|
+
/**
 * Finds eval files matching the include patterns.
 *
 * @param {object} config - Loaded config; `config.include` is used when `options.include` is nullish.
 * @param {object} [options] - Optional `include`, `cwd` and `ignore` overrides.
 * @returns {Promise<string[]>} Absolute file paths, sorted.
 * @throws {ConfigError} CONFIG_NO_INCLUDE_PATTERN when no patterns are available.
 */
async function discoverEvalFiles(config, options = {}) {
  const patterns = options.include ?? config.include;
  const hasPatterns = !(!patterns || patterns.length === 0);
  if (!hasPatterns) {
    throw new ConfigError(
      `No include patterns specified.

Add an include field to your config:
include: ['evals/**/*.eval.yaml']

Or use the --include CLI option:
npx agent-eval --include "evals/**/*.eval.yaml"`,
      "CONFIG_NO_INCLUDE_PATTERN"
    );
  }

  const globOptions = {
    absolute: true,
    cwd: options.cwd ?? process.cwd(),
    ignore: options.ignore ?? ["**/node_modules/**"],
    onlyFiles: true,
    dot: false,
    followSymbolicLinks: false,
    unique: true,
    suppressErrors: false
  };
  const matches = await fg(patterns, globOptions);
  // Sorted in place with the default comparator, as before.
  matches.sort();
  return matches;
}
|
|
891
|
+
|
|
892
|
+
// src/cli/yaml/schema.ts
|
|
893
|
+
import { z as z2 } from "zod";
|
|
894
|
+
// Optional expectations attached to a YAML test case.
var yamlExpectationSchema = z2.object({
  minTurns: z2.number().int().positive().optional(),
  maxTurns: z2.number().int().positive().optional(),
  minScore: z2.number().min(0).max(100).optional()
});

// Termination condition: at least one of `field` or `naturalLanguage` must be present.
var yamlTerminationConditionSchema = z2.object({
  field: z2.string().min(1).optional(),
  equals: z2.unknown().optional(),
  naturalLanguage: z2.string().min(1).optional()
}).refine((data) => data.field !== void 0 || data.naturalLanguage !== void 0, {
  message: "Either field or naturalLanguage must be specified"
});

// Persona definition; `name` and `systemPrompt` are required.
var yamlPersonaSchema = z2.object({
  name: z2.string().min(1, "Persona name is required"),
  description: z2.string().optional(),
  systemPrompt: z2.string().min(1, "Persona systemPrompt is required")
});

// File-level defaults that individual test cases may override.
var yamlTestCaseDefaultsSchema = z2.object({
  maxTurns: z2.number().int().positive().optional(),
  endWhen: yamlTerminationConditionSchema.optional(),
  onConditionMet: z2.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z2.enum(["pass", "fail"]).optional(),
  tags: z2.array(z2.string()).optional()
});

// A single test case. `persona` is either a persona name or an inline persona object.
var yamlTestCaseSchema = z2.object({
  id: z2.string().min(1, "Test case id is required"),
  name: z2.string().optional(),
  description: z2.string().optional(),
  tags: z2.array(z2.string()).optional(),
  // Explicit string key schema, matching the two-argument z.record form used by the eval config schema.
  input: z2.record(z2.string(), z2.unknown()),
  persona: z2.union([z2.string().min(1), yamlPersonaSchema]).optional(),
  maxTurns: z2.number().int().positive().optional(),
  endWhen: yamlTerminationConditionSchema.optional(),
  onConditionMet: z2.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z2.enum(["pass", "fail"]).optional(),
  expectedOutput: z2.record(z2.string(), z2.unknown()).optional(),
  expect: yamlExpectationSchema.optional()
});

// Top-level shape of a YAML eval file; at least one case is required.
var yamlEvalFileSchema = z2.object({
  agent: z2.string().min(1, "Agent name is required"),
  name: z2.string().optional(),
  description: z2.string().optional(),
  defaults: yamlTestCaseDefaultsSchema.optional(),
  personas: z2.record(z2.string(), yamlPersonaSchema).optional(),
  cases: z2.array(yamlTestCaseSchema).min(1, "At least one test case is required")
});
|
|
940
|
+
/**
 * Validates parsed YAML content against `yamlEvalFileSchema`.
 *
 * @param {unknown} content - Parsed YAML document.
 * @returns {object} The data produced by the schema's `safeParse`.
 * @throws {EvalError} Code "INVALID_CONFIG"; the message lists every schema issue, one per line.
 */
function validateYamlEvalFile(content) {
  const outcome = yamlEvalFileSchema.safeParse(content);
  if (outcome.success) {
    return outcome.data;
  }
  const describeIssue = (issue) => {
    const where = issue.path.join(".");
    return where ? ` - ${where}: ${issue.message}` : ` - ${issue.message}`;
  };
  const details = outcome.error.issues.map(describeIssue).join("\n");
  throw new EvalError(`Invalid YAML eval file:\n${details}`, {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */
  });
}
|
|
954
|
+
|
|
955
|
+
// src/cli/yaml/loader.ts
|
|
956
|
+
import { existsSync as existsSync2 } from "fs";
|
|
957
|
+
import { readFile } from "fs/promises";
|
|
958
|
+
import { isAbsolute, resolve as resolve2 } from "path";
|
|
959
|
+
import { parse as parseYaml } from "yaml";
|
|
960
|
+
|
|
961
|
+
// src/multi-turn/types.ts
|
|
962
|
+
/** Returns true when the condition's `type` is "maxTurns". */
function isMaxTurnsCondition(condition) {
  const { type } = condition;
  return type === "maxTurns";
}
|
|
965
|
+
/** Returns true when the condition's `type` is "fieldSet". */
function isFieldSetCondition(condition) {
  const { type } = condition;
  return type === "fieldSet";
}
|
|
968
|
+
/** Returns true when the condition's `type` is "fieldValue". */
function isFieldValueCondition(condition) {
  const { type } = condition;
  return type === "fieldValue";
}
|
|
971
|
+
/** Returns true when the condition's `type` is "custom". */
function isCustomCondition(condition) {
  const { type } = condition;
  return type === "custom";
}
|
|
974
|
+
/** Returns true when the test case has a `multiTurn` property (own or inherited). */
function isMultiTurnTestCase(testCase2) {
  return Reflect.has(testCase2, "multiTurn");
}
|
|
977
|
+
/** Returns true only when `result.terminated` is strictly `true`. */
function isTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
|
|
980
|
+
|
|
981
|
+
// src/multi-turn/termination.ts
|
|
982
|
+
/**
 * Reads a nested value using a dot-separated path.
 *
 * @param {unknown} obj - Root value to read from.
 * @param {string} fieldPath - Path such as "a.b.c".
 * @returns {unknown} The value at the path, or `undefined` when a step cannot be
 *   followed (null/undefined or non-object intermediate value).
 */
function getFieldValue(obj, fieldPath) {
  if (obj == null) {
    return void 0;
  }
  let cursor = obj;
  for (const segment of fieldPath.split(".")) {
    // Only objects (including arrays) can be descended into.
    const canDescend = cursor != null && typeof cursor === "object";
    if (!canDescend) {
      return void 0;
    }
    cursor = cursor[segment];
  }
  return cursor;
}
|
|
999
|
+
/** Returns true for any value other than `null` or `undefined`. */
function isSet(value) {
  return value != null;
}
|
|
1002
|
+
/**
 * Evaluates a "maxTurns" condition against the current turn.
 *
 * @param {{count: number}} condition - The maxTurns condition.
 * @param {{currentTurn: number}} context - Conversation context.
 * @returns {object} Terminated result once `currentTurn >= count`, otherwise a progress result.
 */
function checkMaxTurns(condition, context) {
  const limitReached = context.currentTurn >= condition.count;
  if (!limitReached) {
    return {
      terminated: false,
      reason: `Turn ${context.currentTurn} of ${condition.count}`
    };
  }
  return {
    terminated: true,
    terminationType: "maxTurns",
    matchedCondition: condition,
    reason: `Maximum turns reached (${condition.count})`
  };
}
|
|
1017
|
+
/**
 * Evaluates a "fieldSet" condition: terminates when the field at
 * `condition.fieldPath` in the last output is neither null nor undefined.
 *
 * @param {{fieldPath: string}} condition - The fieldSet condition.
 * @param {{lastOutput: unknown}} context - Conversation context.
 * @returns {object} Termination result with a descriptive reason.
 */
function checkFieldSet(condition, context) {
  const { fieldPath } = condition;
  const observed = getFieldValue(context.lastOutput, fieldPath);
  if (!isSet(observed)) {
    return {
      terminated: false,
      reason: `Field "${fieldPath}" is not set`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${fieldPath}" is set (value: ${JSON.stringify(observed)})`
  };
}
|
|
1033
|
+
/**
 * Evaluates a "fieldValue" condition: terminates when the field at
 * `condition.fieldPath` strictly equals `condition.expectedValue`.
 *
 * The comparison is `===`, so object and array expected values only match by identity.
 *
 * @param {{fieldPath: string, expectedValue: unknown}} condition - The fieldValue condition.
 * @param {{lastOutput: unknown}} context - Conversation context.
 * @returns {object} Termination result with a descriptive reason.
 */
function checkFieldValue(condition, context) {
  const { fieldPath } = condition;
  const observed = getFieldValue(context.lastOutput, fieldPath);
  if (observed === condition.expectedValue) {
    return {
      terminated: true,
      terminationType: "condition",
      matchedCondition: condition,
      reason: `Field "${fieldPath}" equals expected value`
    };
  }
  return {
    terminated: false,
    reason: `Field "${fieldPath}" does not equal expected value (got: ${JSON.stringify(observed)})`
  };
}
|
|
1049
|
+
/**
 * Evaluates a custom condition by awaiting `condition.check(context)`.
 *
 * A throwing or rejecting check does not propagate: it yields a non-terminated
 * result whose reason carries the error message.
 *
 * @param {{check: Function, description?: string}} condition - The custom condition.
 * @param {object} context - Conversation context passed to `check`.
 * @returns {Promise<object>} Termination result.
 */
async function checkCustom(condition, context) {
  const label = condition.description ?? "Custom condition";
  let conditionMet;
  try {
    conditionMet = await condition.check(context);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return {
      terminated: false,
      reason: `${label} failed: ${detail}`
    };
  }
  if (!conditionMet) {
    return {
      terminated: false,
      reason: `${label} not met`
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `${label} met`
  };
}
|
|
1073
|
+
/**
 * Dispatches a termination condition to the checker matching its `type`.
 *
 * @param {{type: string}} condition - One of maxTurns, fieldValue, fieldSet, custom.
 * @param {object} context - Conversation context.
 * @returns {Promise<object>} Termination result from the matching checker.
 * @throws {EvalError} Code "UNKNOWN_ERROR" for an unrecognised condition type.
 */
async function checkCondition(condition, context) {
  switch (condition.type) {
    case "maxTurns":
      return checkMaxTurns(condition, context);
    case "fieldValue":
      return checkFieldValue(condition, context);
    case "fieldSet":
      return checkFieldSet(condition, context);
    case "custom":
      return checkCustom(condition, context);
    default:
      throw new EvalError(`Unknown condition type: ${JSON.stringify(condition)}`, {
        code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
        context: { condition }
      });
  }
}
|
|
1092
|
+
/**
 * Checks conditions in order and returns the first terminated result.
 *
 * @param {object[]} conditions - Termination conditions, evaluated sequentially.
 * @param {object} context - Conversation context.
 * @returns {Promise<object>} The first terminated result, or a non-terminated result
 *   when the list is empty or nothing matched.
 */
async function checkTermination(conditions, context) {
  const total = conditions.length;
  if (total === 0) {
    return {
      terminated: false,
      reason: "No termination conditions specified"
    };
  }
  // Evaluated one at a time so that later conditions are skipped after a match.
  for (const candidate of conditions) {
    const outcome = await checkCondition(candidate, context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return {
    terminated: false,
    reason: "No termination conditions met"
  };
}
|
|
1110
|
+
|
|
1111
|
+
// src/utils/json.ts
|
|
1112
|
+
/**
 * Shortens a string to at most `maxLength` characters, appending "..." when cut.
 *
 * @param {string} str - Text to shorten; falsy input yields "".
 * @param {number} maxLength - Number of characters to keep before the ellipsis.
 * @returns {string} The original string when it fits, otherwise the cut string plus "...".
 */
function truncate(str, maxLength) {
  if (!str) {
    return "";
  }
  const fits = str.length <= maxLength;
  return fits ? str : `${str.slice(0, maxLength)}...`;
}
|
|
1121
|
+
|
|
1122
|
+
// src/multi-turn/conditions.ts
|
|
1123
|
+
/**
 * Builds a custom termination condition that asks an LLM whether the
 * conversation should end.
 *
 * @param {object} options
 * @param {object} options.provider - Provider exposing `simpleExecution`.
 * @param {string} options.prompt - Natural-language description of the stop condition.
 * @param {string} [options.systemPrompt] - Overrides the built-in system prompt.
 * @returns {{type: "custom", check: Function, description: string}} A custom condition
 *   whose `check` resolves to true when the model's answer starts with "yes"
 *   (case-insensitive, after trimming).
 */
function naturalLanguage(options) {
  const { provider, prompt, systemPrompt } = options;
  const defaultSystemPrompt = `You are an assistant that evaluates whether a conversation should terminate.
Analyze the conversation history and determine if the specified condition is met.
Respond with ONLY "yes" or "no" - nothing else.`;

  const renderTurn = (h) => `Turn ${h.turn}:
Input: ${JSON.stringify(h.input)}
Output: ${JSON.stringify(h.output)}`;

  const check = async (context) => {
    const historyText = context.history.map(renderTurn).join("\n\n");
    const userPrompt = `## Termination Condition
${prompt}

## Conversation History
${historyText || "(No history yet)"}

## Current Turn
Turn: ${context.currentTurn}
Last Output: ${JSON.stringify(context.lastOutput)}

Should the conversation terminate based on the condition above? Answer "yes" or "no" only.`;
    const messages = [
      { role: "system", content: systemPrompt ?? defaultSystemPrompt },
      { role: "user", content: userPrompt }
    ];
    const execution = provider.simpleExecution(async (session) => {
      const { text } = await session.generateText({ messages });
      return text;
    });
    const executionResult = await execution.result();
    if (executionResult.status !== "succeeded") {
      if (executionResult.status === "failed") {
        throw executionResult.error;
      }
      throw new Error("Execution was canceled");
    }
    // An exact "yes" is already covered by the prefix check.
    return executionResult.value.toLowerCase().trim().startsWith("yes");
  };

  return {
    type: "custom",
    check,
    description: `NL: ${truncate(prompt, 50)}`
  };
}
|
|
1167
|
+
/**
 * Builds a custom condition that is met when the field at `fieldPath`
 * equals `expectedValue`, delegating to the "fieldValue" check.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @param {unknown} expectedValue - Value to compare against.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function fieldEquals(fieldPath, expectedValue) {
  const check = async (context) => {
    const delegate = { type: "fieldValue", fieldPath, expectedValue };
    const { terminated } = await checkCondition(delegate, context);
    return terminated;
  };
  return {
    type: "custom",
    check,
    description: `fieldEquals(${fieldPath}, ${JSON.stringify(expectedValue)})`
  };
}
|
|
1180
|
+
/**
 * Builds a custom condition that is met when the field at `fieldPath`
 * is set, delegating to the "fieldSet" check.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @returns {{type: "custom", check: Function, description: string}}
 */
function fieldIsSet(fieldPath) {
  const check = async (context) => {
    const { terminated } = await checkCondition({ type: "fieldSet", fieldPath }, context);
    return terminated;
  };
  return {
    type: "custom",
    check,
    description: `fieldIsSet(${fieldPath})`
  };
}
|
|
1190
|
+
|
|
1191
|
+
// src/multi-turn/runner.ts
|
|
1192
|
+
import { resolveFileSourcesInInput } from "@agtlantis/core";
|
|
1193
|
+
// Fallback turn limit used by executeMultiTurnTestCase when multiTurn.maxTurns is not provided.
var DEFAULT_MAX_TURNS = 10;
|
|
1194
|
+
// Fallback outcome when a termination condition is met and multiTurn.onConditionMet is not provided.
var DEFAULT_ON_CONDITION_MET = "pass";
|
|
1195
|
+
// Fallback outcome when the turn limit is reached and multiTurn.onMaxTurnsReached is not provided.
var DEFAULT_ON_MAX_TURNS_REACHED = "fail";
|
|
1196
|
+
/**
 * Sums token usage across turns.
 *
 * @param {Array<{inputTokens: number, outputTokens: number, totalTokens: number}>} usages
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}} Totals (all zero for an empty list).
 */
function aggregateTokenUsage(usages) {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  for (const usage of usages) {
    inputTokens += usage.inputTokens;
    outputTokens += usage.outputTokens;
    totalTokens += usage.totalTokens;
  }
  return { inputTokens, outputTokens, totalTokens };
}
|
|
1206
|
+
/**
 * Computes the turn limit: the first "maxTurns" condition's count capped by
 * the safety limit, or the safety limit when no such condition exists.
 *
 * @param {Array<{type: string, count?: number}>} conditions - Termination conditions.
 * @param {number} safetyLimit - Upper bound on turns.
 * @returns {number} Effective maximum number of turns.
 */
function getEffectiveMaxTurns(conditions, safetyLimit) {
  for (const candidate of conditions) {
    if (candidate.type === "maxTurns") {
      return Math.min(candidate.count, safetyLimit);
    }
  }
  return safetyLimit;
}
|
|
1213
|
+
/**
 * Resolves a follow-up input, which is either a static value or a
 * (possibly async) function of the conversation context.
 *
 * @param {{input: unknown}} followUpInput - Follow-up definition.
 * @param {object} context - Conversation context passed to function inputs.
 * @returns {Promise<unknown>} The resolved input value.
 */
async function resolveInput(followUpInput, context) {
  const source = followUpInput.input;
  if (typeof source !== "function") {
    return source;
  }
  const produced = source(context);
  if (produced instanceof Promise) {
    return await produced;
  }
  return produced;
}
|
|
1221
|
+
/**
 * Builds the conversation context handed to conditions and dynamic inputs.
 *
 * @param {number} currentTurn - Turn number.
 * @param {Array<{output: unknown}>} history - Turns completed so far.
 * @returns {{currentTurn: number, history: Array, lastOutput: unknown}} Context whose
 *   `lastOutput` is the latest history entry's output, or `undefined` with no history.
 */
function buildContext(currentTurn, history) {
  const lastIndex = history.length - 1;
  const lastOutput = lastIndex >= 0 ? history[lastIndex].output : void 0;
  return { currentTurn, history, lastOutput };
}
|
|
1228
|
+
/**
 * Finds the follow-up definition that covers a given follow-up index.
 *
 * Each entry covers `turns` consecutive indexes (default 1); an entry with a
 * non-finite `turns` covers every index from its start onwards.
 *
 * @param {Array<{turns?: number}>} followUpInputs - Ordered follow-up definitions.
 * @param {number} followUpIndex - Zero-based index of the follow-up turn.
 * @returns {object|null} The covering entry, or `null` when the list is exhausted.
 */
function getFollowUpInput(followUpInputs, followUpIndex) {
  let windowStart = 0;
  for (let i = 0; i < followUpInputs.length; i += 1) {
    const entry = followUpInputs[i];
    const span = entry.turns ?? 1;
    const coversRest = !Number.isFinite(span) && followUpIndex >= windowStart;
    const insideWindow = followUpIndex < windowStart + span;
    if (coversRest || insideWindow) {
      return entry;
    }
    windowStart += span;
  }
  return null;
}
|
|
1242
|
+
/**
 * Validates the `turns` setting of every follow-up input.
 *
 * Entries without `turns` are accepted. Otherwise `turns` must be a number
 * >= 1 (or Infinity), and an Infinity entry must be the last item because
 * anything after it could never be reached.
 *
 * @param {Array<{turns?: number, description?: string}>} followUpInputs - Follow-ups to validate.
 * @returns {void}
 * @throws {EvalError} Code "INVALID_CONFIG" when an entry is invalid.
 */
function validateFollowUpInputs(followUpInputs) {
  for (let i = 0; i < followUpInputs.length; i++) {
    const followUp = followUpInputs[i];
    const { turns } = followUp;
    if (turns === void 0) {
      continue;
    }
    // NaN must be rejected explicitly: `NaN < 1` is false, so it would otherwise
    // slip through and be treated like an unbounded (Infinity) repeat.
    if (typeof turns !== "number" || Number.isNaN(turns) || turns < 1) {
      throw new EvalError("turns must be a positive number or Infinity", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: {
          description: followUp.description,
          turns
        }
      });
    }
    const isLast = i === followUpInputs.length - 1;
    if (!Number.isFinite(turns) && !isLast) {
      throw new EvalError(
        "turns: Infinity must be the last followUpInput (subsequent items would be unreachable)",
        {
          code: "INVALID_CONFIG" /* INVALID_CONFIG */,
          context: {
            description: followUp.description,
            position: i,
            totalItems: followUpInputs.length
          }
        }
      );
    }
  }
}
|
|
1272
|
+
/**
 * Determines the input for a turn.
 *
 * Turn 1 uses the test case input; later turns use the follow-up covering
 * index `turn - 2`, resolved against the conversation so far.
 *
 * @param {number} turn - One-based turn number.
 * @param {unknown} testCaseInput - Input for the first turn.
 * @param {Array<object>} followUpInputs - Ordered follow-up definitions.
 * @param {Array<object>} conversationHistory - Turns completed so far.
 * @returns {Promise<{type: "success", input: unknown}|{type: "exhausted"}>}
 */
async function getTurnInput(turn, testCaseInput, followUpInputs, conversationHistory) {
  const isOpeningTurn = turn === 1;
  if (isOpeningTurn) {
    return { type: "success", input: testCaseInput };
  }
  const followUp = getFollowUpInput(followUpInputs, turn - 2);
  if (followUp) {
    const input = await resolveInput(followUp, buildContext(turn, conversationHistory));
    return { type: "success", input };
  }
  return { type: "exhausted" };
}
|
|
1285
|
+
/** Returns true when the turn result is a `fileResolutionError` marker object. */
function isFileResolutionError(result) {
  if (!("type" in result)) {
    return false;
  }
  return result.type === "fileResolutionError";
}
|
|
1288
|
+
/**
 * Runs one agent turn: resolves file sources in the input, executes the
 * agent, and measures the agent call's latency.
 *
 * @param {unknown} input - Turn input, possibly containing file sources.
 * @param {object} agent - Agent exposing `execute(input)` and `config.name`.
 * @param {string} testCaseId - Identifier recorded on agent errors.
 * @param {number} turn - One-based turn number.
 * @returns {Promise<object>} Either `{ type: "fileResolutionError", reason }` or
 *   `{ output, metadata, latencyMs, error }`, where `error` is set when the agent threw.
 */
async function executeSingleTurn(input, agent, testCaseId, turn) {
  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput(input, { basePath: process.cwd() });
  } catch (e) {
    const detail = e instanceof Error ? e.message : String(e);
    return {
      type: "fileResolutionError",
      reason: `FileSource resolution failed on turn ${turn}: ${detail}`
    };
  }

  const turnOutcome = { output: void 0, metadata: void 0, latencyMs: 0, error: void 0 };
  // Latency covers only the agent call, not file resolution.
  const startedAt = performance.now();
  try {
    const { result, metadata } = await agent.execute(resolvedInput);
    turnOutcome.output = result;
    turnOutcome.metadata = metadata;
  } catch (e) {
    turnOutcome.error = EvalError.from(e, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, {
      testCaseId,
      turn,
      agentName: agent.config.name
    });
  }
  turnOutcome.latencyMs = performance.now() - startedAt;
  return turnOutcome;
}
|
|
1318
|
+
/**
 * Maps a termination result to a pass/fail verdict.
 *
 * @param {object} termination - Termination result.
 * @param {"pass"|"fail"} onConditionMet - Outcome when a condition ended the run.
 * @param {"pass"|"fail"} onMaxTurnsReached - Outcome when the turn limit ended the run.
 * @returns {boolean} `false` for "error"/"exhausted"; the configured outcome for
 *   "maxTurns"/"condition"; `true` when not terminated or for any other type.
 */
function determinePassFromTermination(termination, onConditionMet, onMaxTurnsReached) {
  if (!isTerminated(termination)) {
    return true;
  }
  const kind = termination.terminationType;
  if (kind === "error" || kind === "exhausted") {
    return false;
  }
  if (kind === "maxTurns") {
    return onMaxTurnsReached === "pass";
  }
  if (kind === "condition") {
    return onConditionMet === "pass";
  }
  return true;
}
|
|
1334
|
+
/**
 * Runs a multi-turn test case: executes the agent turn by turn until a
 * termination condition fires, follow-ups run out, an error occurs, or the
 * turn limit is hit; then judges the final output.
 *
 * @param {object} testCase2 - Test case with `input`, `multiTurn`, and optional `id` / `files`.
 * @param {object} context - Provides `agent`, `judge` and `agentDescription`.
 * @param {object} [options] - Optional `signal` (checked at the start of each turn).
 * @returns {Promise<object>} Result with output, metrics, verdicts, score, pass flag,
 *   conversation history, termination details and total turn count.
 * @throws {EvalError} When follow-up inputs are invalid or the signal is aborted.
 */
async function executeMultiTurnTestCase(testCase2, context, options) {
  const { agent, judge, agentDescription } = context;
  const { multiTurn } = testCase2;
  const signal = options?.signal;
  const maxTurns = getEffectiveMaxTurns(
    multiTurn.terminateWhen,
    multiTurn.maxTurns ?? DEFAULT_MAX_TURNS
  );
  const onConditionMet = multiTurn.onConditionMet ?? DEFAULT_ON_CONDITION_MET;
  const onMaxTurnsReached = multiTurn.onMaxTurnsReached ?? DEFAULT_ON_MAX_TURNS_REACHED;
  const followUpInputs = multiTurn.followUpInputs ?? [];
  validateFollowUpInputs(followUpInputs);

  const conversationHistory = [];
  const tokenUsages = [];
  let totalLatencyMs = 0;
  let termination = {
    terminated: false,
    reason: "Execution not started"
  };

  let turn = 1;
  while (turn <= maxTurns) {
    if (signal?.aborted) {
      throw new EvalError("Multi-turn test execution aborted", {
        code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
        context: { testCaseId: testCase2.id, turn, reason: "aborted" }
      });
    }

    const inputResult = await getTurnInput(
      turn,
      testCase2.input,
      followUpInputs,
      conversationHistory
    );
    if (inputResult.type === "exhausted") {
      termination = {
        terminated: true,
        terminationType: "exhausted",
        reason: "All follow-up inputs exhausted"
      };
      break;
    }

    const input = inputResult.input;
    const turnResult = await executeSingleTurn(input, agent, testCase2.id ?? "unknown", turn);
    if (isFileResolutionError(turnResult)) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: turnResult.reason
      };
      break;
    }

    const agentMetadata = turnResult.metadata;
    totalLatencyMs += turnResult.latencyMs;
    // Turns without reported usage count as zero tokens.
    tokenUsages.push(
      agentMetadata?.tokenUsage ?? { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    );
    // The turn is recorded even when the agent failed, so history reflects every attempt.
    conversationHistory.push({
      turn,
      input,
      output: turnResult.output,
      metadata: agentMetadata
    });

    const agentError = turnResult.error;
    if (agentError) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: `Agent execution failed on turn ${turn}: ${agentError.message}`
      };
      break;
    }

    termination = await checkTermination(
      multiTurn.terminateWhen,
      buildContext(turn, conversationHistory)
    );
    if (termination.terminated) {
      break;
    }
    if (turn >= maxTurns) {
      termination = {
        terminated: true,
        terminationType: "maxTurns",
        matchedCondition: { type: "maxTurns", count: maxTurns },
        reason: `Maximum turns reached (${maxTurns})`
      };
      break;
    }
    turn += 1;
  }

  const metrics = {
    latencyMs: totalLatencyMs,
    tokenUsage: aggregateTokenUsage(tokenUsages)
  };
  const finalOutput = conversationHistory[conversationHistory.length - 1]?.output;
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output: finalOutput,
    agentDescription,
    files: testCase2.files
  });
  const passedTermination = determinePassFromTermination(
    termination,
    onConditionMet,
    onMaxTurnsReached
  );
  return {
    testCase: testCase2,
    output: finalOutput,
    metrics,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: passedTermination && judgeResult.passed,
    judgeMetadata: judgeResult.metadata,
    conversationHistory,
    termination,
    totalTurns: conversationHistory.length
  };
}
|
|
1458
|
+
|
|
1459
|
+
// src/multi-turn/ai-user.ts
|
|
1460
|
+
// System prompt used by aiUser when the caller does not supply one.
var DEFAULT_SYSTEM_PROMPT = [
  "You are simulating a realistic user in a conversation with an AI assistant.",
  "",
  "## Your Role",
  "Generate natural, context-appropriate user messages based on the conversation history.",
  "",
  "## Guidelines",
  "",
  "1. **Stay in Character**: Respond as a real user would - with natural language, occasional typos, or casual phrasing when appropriate.",
  "",
  "2. **Be Goal-Oriented**: Users have objectives. Pursue them logically based on the conversation context:",
  "- If the assistant asks a question, provide a reasonable answer",
  "- If clarification is needed, ask for it naturally",
  "- If a task is progressing, guide it toward completion",
  "",
  "3. **React Appropriately**: Respond to what the assistant says:",
  "- Acknowledge when the assistant is helpful",
  "- Express confusion if the response is unclear",
  "- Correct misunderstandings if they occur",
  "",
  "4. **Keep It Realistic**: Real users:",
  "- Don't always provide perfect information upfront",
  "- May change their mind or add requirements",
  "- Sometimes need time to think or decide",
  "",
  "## Output Format",
  "Respond with ONLY the user's message. No additional formatting, explanation, or meta-commentary."
].join("\n");
|
|
1486
|
+
/**
 * Creates a dynamic follow-up input generator that asks an LLM to play the user.
 *
 * @param {object} options
 * @param {object} options.provider - Provider exposing `simpleExecution`.
 * @param {string|Function} [options.systemPrompt] - Static prompt or function of the context;
 *   falls back to `DEFAULT_SYSTEM_PROMPT` when nullish.
 * @param {Function} [options.formatHistory] - Custom history formatter.
 * @param {Function} options.buildInput - Converts the generated text (and context) into agent input.
 * @returns {Function} Async function `(context) => input` for use as a follow-up input.
 */
function aiUser(options) {
  const { provider, systemPrompt, formatHistory, buildInput } = options;

  const renderTurn = (h, i) => `[Turn ${i + 1}]
User: ${JSON.stringify(h.input)}
Assistant: ${JSON.stringify(h.output)}`;
  const defaultFormatHistory = (ctx) => ctx.history.map(renderTurn).join("\n\n");

  return async (context) => {
    const format = formatHistory ?? defaultFormatHistory;
    const historyText = format(context);

    let resolvedSystemPrompt;
    if (typeof systemPrompt === "function") {
      resolvedSystemPrompt = systemPrompt(context);
    } else {
      resolvedSystemPrompt = systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
    }

    let userPrompt;
    if (historyText) {
      userPrompt = `## Conversation History
${historyText}

## Your Task
Generate the next user message based on the conversation above:`;
    } else {
      userPrompt = `## Your Task
This is the start of a new conversation. Generate an appropriate opening message from the user:`;
    }

    const messages = [
      { role: "system", content: resolvedSystemPrompt },
      { role: "user", content: userPrompt }
    ];
    const execution = provider.simpleExecution(async (session) => {
      const { text } = await session.generateText({ messages });
      return text;
    });
    const executionResult = await execution.result();
    if (executionResult.status !== "succeeded") {
      if (executionResult.status === "failed") {
        throw executionResult.error;
      }
      throw new Error("Execution was canceled");
    }
    return buildInput(executionResult.value, context);
  };
}
|
|
1519
|
+
|
|
1520
|
+
// src/cli/yaml/loader.ts
|
|
1521
|
+
/**
 * Reads, parses and (unless skipped) validates a YAML eval file.
 *
 * @param {string} path3 - File path; relative paths are resolved against `options.basePath`.
 * @param {object} [options]
 * @param {string} [options.basePath=process.cwd()] - Base directory for relative paths.
 * @param {boolean} [options.skipValidation=false] - Return the parsed YAML without schema validation.
 * @returns {Promise<object>} Validated eval file content, or the raw parsed YAML when validation is skipped.
 * @throws {EvalError} FILE_READ_ERROR when the file is missing or unreadable; INVALID_CONFIG when parsing fails.
 */
async function loadYamlEvalFile(path3, options = {}) {
  const { basePath = process.cwd(), skipValidation = false } = options;

  let absolutePath = path3;
  if (!isAbsolute(path3)) {
    absolutePath = resolve2(basePath, path3);
  }
  const locations = { path: path3, absolutePath };

  if (!existsSync2(absolutePath)) {
    throw new EvalError(`YAML eval file not found: ${absolutePath}`, {
      code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
      context: { ...locations }
    });
  }

  let content;
  try {
    content = await readFile(absolutePath, "utf-8");
  } catch (error) {
    throw EvalError.from(error, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { ...locations });
  }

  let parsed;
  try {
    parsed = parseYaml(content);
  } catch (error) {
    const isError = error instanceof Error;
    const detail = isError ? error.message : String(error);
    throw new EvalError(`Failed to parse YAML: ${detail}`, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { ...locations },
      cause: isError ? error : void 0
    });
  }

  return skipValidation ? parsed : validateYamlEvalFile(parsed);
}
|
|
1555
|
+
/**
 * Loads several YAML eval files one after another.
 *
 * @param {string[]} paths - File paths to load, in order.
 * @param {object} [options] - Options forwarded to `loadYamlEvalFile`.
 * @returns {Promise<Array<{path: string, content: object}>>} One entry per path, in input order.
 */
async function loadYamlEvalFiles(paths, options = {}) {
  const loaded = [];
  // Sequential on purpose: each file is awaited before the next one starts.
  for (const filePath of paths) {
    loaded.push({ path: filePath, content: await loadYamlEvalFile(filePath, options) });
  }
  return loaded;
}
|
|
1563
|
+
/**
 * Converts the cases of a validated YAML eval file into test case objects.
 *
 * File-level defaults are merged into each case first; the merged case is then
 * converted as multi-turn or simple depending on `isMultiTurnCase`.
 *
 * @param {object} yaml - Validated eval file with `cases` and optional `defaults` / `personas`.
 * @param {object} context - Conversion context forwarded to multi-turn conversion.
 * @returns {Array<object>} Converted test cases, in file order.
 */
function convertToTestCases(yaml, context) {
  const { defaults, personas, cases } = yaml;
  const convertOne = (rawCase) => {
    const merged = mergeWithDefaults(rawCase, defaults);
    return isMultiTurnCase(merged)
      ? convertToMultiTurnTestCase(merged, personas, context)
      : convertToSimpleTestCase(merged);
  };
  return cases.map((testCase2) => convertOne(testCase2));
}
|
|
1573
|
+
/**
 * Applies document-level defaults to one YAML test case.
 *
 * The four multi-turn settings fall back to the default only when the case
 * value is null or undefined. Tags are concatenated, defaults first.
 *
 * @param {object} testCase2 - The YAML test case.
 * @param {object} [defaults] - Document-level defaults; when falsy the case is returned as-is.
 * @returns {object} The original case (no defaults) or a new merged object.
 */
function mergeWithDefaults(testCase2, defaults) {
  if (!defaults) {
    return testCase2;
  }
  const inherited = {};
  for (const key of ["maxTurns", "endWhen", "onConditionMet", "onMaxTurnsReached"]) {
    inherited[key] = testCase2[key] ?? defaults[key];
  }
  const defaultTags = defaults.tags ?? [];
  const caseTags = testCase2.tags ?? [];
  return { ...testCase2, ...inherited, tags: [...defaultTags, ...caseTags] };
}
|
|
1586
|
+
/**
 * Tells whether a (merged) YAML case is a multi-turn case.
 *
 * @param {object} testCase2 - The YAML test case.
 * @returns {boolean} True when `persona` or `endWhen` is anything other than undefined.
 */
function isMultiTurnCase(testCase2) {
  const { persona, endWhen } = testCase2;
  return !(persona === void 0 && endWhen === void 0);
}
|
|
1589
|
+
/**
 * Resolves a persona reference from a YAML case.
 *
 * @param {string|object|undefined} ref - Undefined (no persona), an inline
 *   persona object (returned as-is), or the name of an entry in `personas`.
 * @param {object} [personas] - Named personas declared in the YAML document.
 * @returns {object|undefined} The persona, or undefined when `ref` is undefined.
 * @throws {EvalError} `INVALID_CONFIG` when a named persona is not declared.
 */
function resolvePersona(ref, personas) {
  if (ref === void 0) {
    return void 0;
  }
  if (typeof ref === "object") {
    return ref;
  }
  // Own-property check only: the `in` operator would also match inherited keys
  // such as "toString" or "constructor" and hand back a prototype member.
  const isDeclared = Boolean(personas) && Object.prototype.hasOwnProperty.call(personas, ref);
  if (!isDeclared) {
    throw new EvalError(`Persona not found: "${ref}"`, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: {
        personaRef: ref,
        availablePersonas: personas ? Object.keys(personas) : []
      }
    });
  }
  return personas[ref];
}
|
|
1607
|
+
/**
 * Turns a YAML `endWhen` condition into a runtime termination condition.
 *
 * A truthy `naturalLanguage` wins; otherwise a truthy `field` yields
 * `fieldEquals` (when `equals` is not undefined) or `fieldIsSet`.
 *
 * @param {{ naturalLanguage?: string, field?: string, equals?: * }} condition
 * @param {*} provider - Passed to `naturalLanguage` conditions.
 * @returns {*} The value returned by `naturalLanguage`, `fieldEquals` or `fieldIsSet`.
 * @throws {EvalError} `INVALID_CONFIG` when neither `naturalLanguage` nor `field` is set.
 */
function convertTerminationCondition(condition, provider) {
  const prompt = condition.naturalLanguage;
  if (prompt) {
    return naturalLanguage({ provider, prompt });
  }
  const field = condition.field;
  if (!field) {
    throw new EvalError("Invalid termination condition: no field or naturalLanguage specified", {
      code: "INVALID_CONFIG",
      context: { condition }
    });
  }
  const expected = condition.equals;
  return expected !== void 0 ? fieldEquals(field, expected) : fieldIsSet(field);
}
|
|
1625
|
+
/**
 * Maps a merged YAML case onto a single-turn test case.
 *
 * @param {object} merged - YAML case after defaults were applied.
 * @returns {{ id: *, description: *, tags: *, input: *, expectedOutput: * }}
 *   `description` is the case `name` when present, otherwise its `description`.
 */
function convertToSimpleTestCase(merged) {
  const { id, tags, input, expectedOutput } = merged;
  const description = merged.name ?? merged.description;
  return { id, description, tags, input, expectedOutput };
}
|
|
1634
|
+
/**
 * Maps a merged YAML case onto a multi-turn test case.
 *
 * When the case has a persona, one `aiUser` input is created and reused for
 * every follow-up turn (turns 2..maxTurns). Without a persona there are no
 * follow-up inputs.
 *
 * @param {object} merged - YAML case after defaults were applied.
 * @param {object} [personas] - Named personas from the YAML document.
 * @param {{ provider: *, buildInput?: Function, formatHistory?: Function }} context
 * @returns {object} Test case with a `multiTurn` section. `maxTurns` defaults
 *   to 10, `onConditionMet` to "pass" and `onMaxTurnsReached` to "fail".
 * @throws {EvalError} Propagated from `resolvePersona` / `convertTerminationCondition`.
 */
function convertToMultiTurnTestCase(merged, personas, context) {
  const { provider, buildInput, formatHistory } = context;
  const persona = resolvePersona(merged.persona, personas);
  const terminateWhen = merged.endWhen
    ? [convertTerminationCondition(merged.endWhen, provider)]
    : [];
  const turnLimit = merged.maxTurns ?? 10;

  const followUpInputs = [];
  if (persona) {
    const simulatedUser = aiUser({
      provider,
      systemPrompt: persona.systemPrompt,
      formatHistory,
      // Without a custom builder the AI user's reply is wrapped as { message }.
      buildInput: buildInput ?? ((response) => ({ message: response }))
    });
    // Turn 1 is the case's own input, so follow-ups are numbered from 2.
    for (let turn = 2; turn - 1 < turnLimit; turn++) {
      followUpInputs.push({
        input: simulatedUser,
        description: `AI User (${persona.name}) - Turn ${turn}`
      });
    }
  }

  return {
    id: merged.id,
    description: merged.name ?? merged.description,
    tags: merged.tags,
    input: merged.input,
    expectedOutput: merged.expectedOutput,
    multiTurn: {
      followUpInputs,
      terminateWhen,
      maxTurns: turnLimit,
      onConditionMet: merged.onConditionMet ?? "pass",
      onMaxTurnsReached: merged.onMaxTurnsReached ?? "fail"
    }
  };
}
|
|
1673
|
+
|
|
1674
|
+
// src/cli/utils/env.ts
|
|
1675
|
+
import { existsSync as existsSync3 } from "fs";
|
|
1676
|
+
import { readFile as readFile2 } from "fs/promises";
|
|
1677
|
+
import { resolve as resolve3 } from "path";
|
|
1678
|
+
/**
 * Loads variables from an env file into `process.env` on a best-effort basis.
 *
 * Variables that are already defined in `process.env` are left untouched. A
 * missing file is a no-op, and read/parse failures are ignored.
 *
 * @param {string} [filePath=".env"] - Env file path, resolved against `cwd`.
 * @param {string} [cwd=process.cwd()] - Base directory for `filePath`.
 * @returns {Promise<void>}
 */
async function loadEnvFile(filePath = ".env", cwd = process.cwd()) {
  const envPath = resolve3(cwd, filePath);
  if (!existsSync3(envPath)) {
    return;
  }
  try {
    const parsed = parseEnvFile(await readFile2(envPath, "utf-8"));
    for (const [name, value] of Object.entries(parsed)) {
      const alreadyDefined = process.env[name] !== void 0;
      if (!alreadyDefined) {
        process.env[name] = value;
      }
    }
  } catch {
    // Failures while reading or parsing the file are ignored here.
  }
}
|
|
1694
|
+
/**
 * Parses `KEY=VALUE` lines from the text of an env file.
 *
 * Rules, per line (after trimming):
 * - blank lines and lines starting with `#` are skipped;
 * - lines without `=` or with an empty key are skipped;
 * - the key is everything before the first `=`, the value everything after it;
 * - one pair of matching surrounding quotes (`"` or `'`) is removed;
 * - the escape sequences `\n`, `\r`, `\t` and `\\` are decoded.
 *
 * @param {string} content - Raw file contents.
 * @returns {Record<string, string>} Parsed variables; later duplicates win.
 */
function parseEnvFile(content) {
  const ESCAPE_VALUES = { n: "\n", r: "\r", t: "\t", "\\": "\\" };
  const result = {};
  for (const rawLine of content.split("\n")) {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) {
      continue;
    }
    const eqIndex = line.indexOf("=");
    if (eqIndex === -1) {
      continue;
    }
    const key = line.slice(0, eqIndex).trim();
    if (!key) {
      continue;
    }
    let value = line.slice(eqIndex + 1).trim();
    const quote = value[0];
    if ((quote === '"' || quote === "'") && value.endsWith(quote)) {
      value = value.slice(1, -1);
    }
    // Decode in a single pass. Chained replaces would treat the second
    // backslash of an escaped backslash as the start of a new escape, turning
    // `C:\\new` into "C:\" + newline + "ew" instead of `C:\new`.
    value = value.replace(/\\([nrt\\])/g, (_match, escaped) => ESCAPE_VALUES[escaped]);
    result[key] = value;
  }
  return result;
}
|
|
1720
|
+
|
|
1721
|
+
// src/cli/utils/provider-factory.ts
|
|
1722
|
+
import {
|
|
1723
|
+
createOpenAIProvider,
|
|
1724
|
+
createGoogleProvider
|
|
1725
|
+
} from "@agtlantis/core";
|
|
1726
|
+
import { mock } from "@agtlantis/core/testing";
|
|
1727
|
+
|
|
1728
|
+
// src/cli/constants.ts
|
|
1729
|
+
// Default values shared by the CLI output and provider helpers.
var CLI_DEFAULTS = {
  // Number of "═" characters in a console divider line.
  DIVIDER_WIDTH: 60,
  // Score given to every criterion when the CLI runs in mock mode.
  MOCK_DEFAULT_SCORE: 85
};
|
|
1735
|
+
|
|
1736
|
+
// src/cli/output/colors.ts
|
|
1737
|
+
// ANSI SGR escape sequences keyed by the colour/style name accepted by `c`.
var colors = Object.fromEntries(
  [
    ["reset", 0],
    ["bold", 1],
    ["dim", 2],
    ["green", 32],
    ["red", 31],
    ["yellow", 33],
    ["blue", 34],
    ["cyan", 36],
    ["gray", 90]
  ].map(([name, sgrCode]) => [name, `\x1B[${sgrCode}m`])
);
// Colours are emitted only when stdout is a TTY and NO_COLOR is unset or empty.
var isColorSupported = process.stdout.isTTY && !process.env.NO_COLOR;
|
|
1749
|
+
/**
 * Wraps text in the ANSI sequence for `color`, followed by a reset.
 *
 * @param {string} color - Key of the `colors` table.
 * @param {string} text - Text to colourise.
 * @returns {string} The wrapped text, or `text` unchanged when colours are disabled.
 */
function c(color, text) {
  if (!isColorSupported) {
    return text;
  }
  const open = colors[color];
  const close = colors.reset;
  return `${open}${text}${close}`;
}
|
|
1752
|
+
|
|
1753
|
+
// src/cli/output/console.ts
|
|
1754
|
+
/**
 * Prints the CLI banner: a blank line, the tool name, the tagline, a blank line.
 */
function printBanner() {
  const title = c("cyan", " agent-eval");
  const tagline = c("dim", " LLM-as-Judge AI Agent Evaluation");
  console.log();
  console.log(title);
  console.log(tagline);
  console.log();
}
|
|
1760
|
+
/**
 * Prints a dimmed progress line.
 *
 * @param {string} message - Text to show, prefixed with one space.
 */
function printProgress(message) {
  const line = ` ${message}`;
  console.log(c("dim", line));
}
|
|
1763
|
+
/**
 * Prints the evaluation summary block to stdout.
 *
 * @param {{ summary: object, results: object[] }} report - Evaluation report;
 *   reads `totalTests`, `passed`, `failed`, `avgScore` and
 *   `metrics.totalTokens` / `metrics.avgLatencyMs` from `summary`.
 * @param {{ verbose?: boolean, duration?: number }} [options] - `duration`
 *   adds a "Duration" row when defined; `verbose` appends per-test results
 *   when there is at least one result.
 */
function printSummary(report, options = {}) {
  const { summary, results } = report;
  const { verbose, duration } = options;
  const passRate = summary.totalTests > 0 ? (summary.passed / summary.totalTests * 100).toFixed(1) : "0.0";
  const divider = "\u2550".repeat(CLI_DEFAULTS.DIVIDER_WIDTH);

  const printDivider = () => console.log(c("cyan", divider));
  const printRow = (label, value) => console.log(` ${c("bold", label)} ${value}`);

  console.log();
  printDivider();
  console.log(c("bold", " Evaluation Results"));
  printDivider();
  console.log();
  printRow("Total Tests:", summary.totalTests);
  printRow("Passed:", `${c("green", String(summary.passed))} (${passRate}%)`);
  printRow("Failed:", summary.failed > 0 ? c("red", String(summary.failed)) : "0");
  printRow("Average Score:", `${summary.avgScore.toFixed(1)}/100`);
  console.log();
  printRow("Total Tokens:", formatNumber(summary.metrics.totalTokens));
  printRow("Avg Latency:", formatMs(summary.metrics.avgLatencyMs));
  if (duration !== void 0) {
    console.log();
    printRow("Duration:", formatMs(duration));
  }
  console.log();
  printDivider();

  if (verbose && results.length > 0) {
    printVerboseResults(results);
  }
}
|
|
1800
|
+
/**
 * Prints one entry per test result: pass/fail status, test id, overall score
 * and, when present, the per-criterion scores.
 *
 * @param {Iterable<object>} results - Test results; `testCase.id` falls back
 *   to "unknown" when missing or falsy.
 */
function printVerboseResults(results) {
  console.log();
  console.log(c("bold", " Test Results:"));
  console.log();
  for (const entry of results) {
    let status;
    if (entry.passed) {
      status = c("green", "\u2713 PASS");
    } else {
      status = c("red", "\u2717 FAIL");
    }
    const testId = ("testCase" in entry && entry.testCase?.id) || "unknown";
    console.log(` ${status} ${testId}`);
    console.log(` Score: ${entry.overallScore.toFixed(1)}/100`);
    if ("criteriaScores" in entry && entry.criteriaScores) {
      for (const criterion of entry.criteriaScores) {
        console.log(` ${c("dim", criterion.criterionId)}: ${criterion.score.toFixed(1)}`);
      }
    }
    console.log();
  }
}
|
|
1818
|
+
/**
 * Prints an error to stderr: a red "Error:" heading and the message, each
 * surrounded by blank lines.
 *
 * @param {{ message: string }} error - Error whose message is shown.
 */
function printError(error) {
  const heading = c("red", " \u2717 Error:");
  console.error();
  console.error(heading);
  console.error();
  console.error(` ${error.message}`);
  console.error();
}
|
|
1825
|
+
/**
 * Formats a number using the en-US locale (e.g. thousands separators).
 *
 * @param {number} num - Value to format.
 * @returns {string} Locale-formatted text.
 */
function formatNumber(num) {
  const locale = "en-US";
  return num.toLocaleString(locale);
}
|
|
1828
|
+
/**
 * Formats a duration given in milliseconds.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} Whole milliseconds ("340ms") below one second, otherwise
 *   seconds with two decimals ("1.50s").
 */
function formatMs(ms) {
  const msPerSecond = 1e3;
  return ms < msPerSecond
    ? `${ms.toFixed(0)}ms`
    : `${(ms / msPerSecond).toFixed(2)}s`;
}
|
|
1834
|
+
|
|
1835
|
+
// src/cli/utils/provider-factory.ts
|
|
1836
|
+
/**
 * Reads the API key for a provider from the environment.
 *
 * @param {string} provider - "openai" selects OPENAI_API_KEY; any other value
 *   selects GOOGLE_API_KEY.
 * @returns {string|undefined} The key, or undefined when the variable is unset.
 */
function getApiKeyFromEnv(provider) {
  const variableName = provider === "openai" ? "OPENAI_API_KEY" : "GOOGLE_API_KEY";
  return process.env[variableName];
}
|
|
1842
|
+
/**
 * Builds an LLM provider from `config.llm`.
 *
 * The API key comes from `llm.apiKey`, falling back to the provider's
 * environment variable. "openai" creates an OpenAI provider (default model
 * "gpt-4o-mini"); any other provider value creates a Google provider (default
 * model "gemini-1.5-flash").
 *
 * @param {{ llm: { provider: string, apiKey?: string, defaultModel?: string } }} config
 * @returns {*} The provider returned by `withDefaultModel`.
 * @throws {ConfigError} `CONFIG_VALIDATION_ERROR` when no API key is available.
 */
function createProviderFromConfig(config) {
  const { llm } = config;
  const isOpenAI = llm.provider === "openai";
  const apiKey = llm.apiKey ?? getApiKeyFromEnv(llm.provider);

  if (!apiKey) {
    const envVar = isOpenAI ? "OPENAI_API_KEY" : "GOOGLE_API_KEY";
    throw new ConfigError(
      `API key not found for ${llm.provider}.\n\nSet the ${envVar} environment variable or provide apiKey in config.`,
      "CONFIG_VALIDATION_ERROR"
    );
  }

  const createProvider = isOpenAI ? createOpenAIProvider : createGoogleProvider;
  const fallbackModel = isOpenAI ? "gpt-4o-mini" : "gemini-1.5-flash";
  return createProvider({ apiKey }).withDefaultModel(llm.defaultModel ?? fallbackModel);
}
|
|
1863
|
+
/**
 * Creates the main, judge and improver providers for a CLI run.
 *
 * In mock mode a single mock provider is shared by all three roles; it answers
 * with one passing verdict per judge criterion. Otherwise the judge and
 * improver get their own provider only when they declare an `llm` section,
 * and reuse the main provider when they do not.
 *
 * @param {object} config - Eval config (`llm`, `judge`, optional `improver`).
 * @param {{ mock?: boolean }} options - CLI options.
 * @returns {{ mainProvider: *, judgeProvider: *, improverProvider: * }}
 * @throws {ConfigError} Propagated from `createProviderFromConfig`.
 */
function initializeProviders(config, options) {
  if (options.mock) {
    printProgress("Using mock Provider (--mock mode)");
    const verdicts = config.judge.criteria.map((criterion) => {
      return {
        criterionId: criterion.id,
        score: CLI_DEFAULTS.MOCK_DEFAULT_SCORE,
        reasoning: "Mock evaluation - test mode",
        passed: true
      };
    });
    const shared = mock.provider(mock.json({ verdicts }));
    return { mainProvider: shared, judgeProvider: shared, improverProvider: shared };
  }

  const mainProvider = createProviderFromConfig(config);
  // A role-specific `llm` section overrides only the `llm` part of the config.
  const providerFor = (llm) => (llm ? createProviderFromConfig({ ...config, llm }) : mainProvider);
  const judgeProvider = providerFor(config.judge.llm);
  const improverProvider = providerFor(config.improver?.llm);
  return { mainProvider, judgeProvider, improverProvider };
}
|
|
1880
|
+
|
|
1881
|
+
// src/cli/output/report.ts
|
|
1882
|
+
import { mkdir, writeFile as writeFile2 } from "fs/promises";
|
|
1883
|
+
import { existsSync as existsSync4 } from "fs";
|
|
1884
|
+
import { join } from "path";
|
|
1885
|
+
|
|
1886
|
+
// src/reporter/markdown.ts
|
|
1887
|
+
import { writeFile } from "fs/promises";
|
|
1888
|
+
import { getFileSourcesDisplayInfo } from "@agtlantis/core";
|
|
1889
|
+
// Icons used for passed / failed items in the markdown report.
var PASS_ICON = "\u2705";
var FAIL_ICON = "\u274C";
// Sort rank per suggestion priority; lower values are listed first.
var PRIORITY_ORDER = {
  high: 0,
  medium: 1,
  low: 2
};
|
|
1892
|
+
/**
 * Renders an evaluation report as a markdown document.
 *
 * Sections, in order: title and metadata, summary table, failed tests, passed
 * tests (collapsed in a `<details>` element unless `expandPassedTests`), and
 * improvement suggestions sorted by `PRIORITY_ORDER`.
 *
 * @param {object} report - Reads `summary`, `results`, `suggestions`,
 *   `generatedAt` (needs `toISOString()`) and `promptVersion`.
 * @param {{ expandPassedTests?: boolean, includeRawOutput?: boolean, outputPreviewLength?: number }} [options]
 *   Defaults: `false`, `false`, `200`.
 * @returns {string} The markdown text, lines joined with "\n".
 */
function reportToMarkdown(report, options = {}) {
  const {
    expandPassedTests = false,
    includeRawOutput = false,
    outputPreviewLength = 200
  } = options;
  const { summary, results, suggestions, generatedAt, promptVersion } = report;
  const passRate = summary.totalTests > 0 ? (summary.passed / summary.totalTests * 100).toFixed(1) : "0.0";
  const renderResult = (result) => formatTestResult(result, outputPreviewLength, includeRawOutput);

  // Header and the first rows of the summary table.
  const lines = [
    "# Evaluation Report",
    "",
    `> Generated: ${generatedAt.toISOString()}`,
    `> Prompt Version: ${promptVersion}`,
    "",
    "## Summary",
    "",
    `| Metric | Value |`,
    `|--------|-------|`,
    `| Total Tests | ${summary.totalTests} |`
  ];
  if (summary.iterations && summary.iterations > 1) {
    lines.push(`| **Iterations** | **${summary.iterations}** |`);
  }
  lines.push(
    `| Passed | ${summary.passed} (${passRate}%) |`,
    `| Failed | ${summary.failed} |`
  );
  const averageScore = summary.avgScore.toFixed(1);
  lines.push(
    summary.avgStdDev !== void 0
      ? `| Average Score | ${averageScore} \xB1 ${summary.avgStdDev.toFixed(1)} |`
      : `| Average Score | ${averageScore} |`
  );
  if (summary.avgPassRate !== void 0) {
    lines.push(`| Avg Pass Rate | ${(summary.avgPassRate * 100).toFixed(1)}% |`);
  }
  lines.push(
    `| Avg Latency | ${summary.metrics.avgLatencyMs.toFixed(0)}ms |`,
    `| Total Tokens | ${summary.metrics.totalTokens} |`
  );
  if (summary.costSummary?.total !== void 0) {
    lines.push(`| Est. Cost | $${summary.costSummary.total.toFixed(4)} |`);
  }
  lines.push("");

  // Failed tests are always shown in full.
  const failedResults = results.filter((r) => !r.passed);
  if (failedResults.length > 0) {
    lines.push(`## ${FAIL_ICON} Failed Tests`, "");
    for (const result of failedResults) {
      lines.push(renderResult(result));
    }
  }

  const passedResults = results.filter((r) => r.passed);
  if (passedResults.length > 0) {
    lines.push(`## ${PASS_ICON} Passed Tests`, "");
    if (!expandPassedTests) {
      lines.push("<details>", "<summary>Click to expand passed tests</summary>", "");
    }
    for (const result of passedResults) {
      lines.push(renderResult(result));
    }
    if (!expandPassedTests) {
      lines.push("</details>", "");
    }
  }

  if (suggestions.length > 0) {
    lines.push("## \u{1F4A1} Improvement Suggestions", "");
    // Sort a copy so the report's own array is left in its original order.
    const byPriority = [...suggestions].sort(
      (a, b) => PRIORITY_ORDER[a.priority] - PRIORITY_ORDER[b.priority]
    );
    for (const suggestion of byPriority) {
      lines.push(formatSuggestion(suggestion));
    }
  }
  return lines.join("\n");
}
|
|
1971
|
+
/**
 * Renders a value as the lines of a fenced ```json code block.
 *
 * @param {*} value - Value to serialise with two-space indentation.
 * @param {number} [maxLength] - When defined, the JSON text is passed through
 *   `truncate` with this length before being embedded.
 * @returns {string[]} Three entries: opening fence, content, closing fence.
 */
function jsonCodeBlock(value, maxLength) {
  // JSON.stringify returns undefined (not a string) for values it cannot
  // serialise, such as undefined or a function. Failed results carry
  // `output: undefined`, so fall back to readable text instead of handing a
  // non-string to `truncate` or to the fenced block.
  const json = JSON.stringify(value, null, 2) ?? "undefined";
  const content = maxLength !== void 0 ? truncate(json, maxLength) : json;
  return ["```json", content, "```"];
}
|
|
1976
|
+
/**
 * Picks the report icon for a pass/fail flag.
 *
 * @param {*} passed - Truthy for a pass.
 * @returns {string} `PASS_ICON` when truthy, otherwise `FAIL_ICON`.
 */
function passFailIcon(passed) {
  if (passed) {
    return PASS_ICON;
  }
  return FAIL_ICON;
}
|
|
1979
|
+
/**
 * Renders one test result as a markdown section.
 *
 * Layout: heading with id and score, optional description, attached files,
 * multi-turn summary, iteration statistics, conversation history (or the
 * single-turn input/output when there is no history), verdicts and, when
 * requested, the raw output.
 *
 * @param {object} result - Test result; `testCase.id` falls back to "unnamed".
 * @param {number} previewLength - Maximum length passed to the input/output renderers.
 * @param {boolean} includeRaw - When truthy, appends the raw output in a `<details>` element.
 * @returns {string} The section, lines joined with "\n".
 */
function formatTestResult(result, previewLength, includeRaw) {
  const { testCase, iterationStats, conversationHistory } = result;
  const lines = [];

  // Heading: iterated results show the score as "mean ± std-dev".
  const testId = testCase.id ?? "unnamed";
  const score = result.overallScore.toFixed(1);
  const scoreDisplay = iterationStats ? `${score} \xB1 ${iterationStats.stdDev.toFixed(1)}` : score;
  lines.push(`### ${testId} (Score: ${scoreDisplay})`, "");

  if (testCase.description) {
    lines.push(`> ${testCase.description}`, "");
  }

  const fileDisplayInfos = getFileSourcesDisplayInfo(testCase.input);
  if (fileDisplayInfos.length > 0) {
    lines.push("**Files:**");
    for (const file of fileDisplayInfos) {
      const namePrefix = file.filename ? `${file.filename} - ` : "";
      lines.push(`- ${namePrefix}${file.source}: ${file.description} (${file.mediaType})`);
    }
    lines.push("");
  }

  if (result.totalTurns !== void 0) {
    lines.push(
      `**Multi-turn:** ${result.totalTurns} turns | Termination: ${result.terminationReason ?? "unknown"}`,
      ""
    );
  }
  if (result.multiTurnIterationStats) {
    lines.push(...formatMultiTurnIterationStats(result.multiTurnIterationStats));
  }
  if (iterationStats && result.iterationResults) {
    lines.push(...formatIterationResults(iterationStats, result.iterationResults));
  }

  const hasHistory = conversationHistory && conversationHistory.length > 0;
  lines.push(
    ...(hasHistory
      ? formatConversationHistory(conversationHistory, previewLength)
      : formatSingleTurnInputOutput(testCase.input, result.output, previewLength))
  );

  lines.push("**Verdicts:**");
  for (const { passed, criterionId, score: verdictScore, reasoning } of result.verdicts) {
    lines.push(`- ${passFailIcon(passed)} **${criterionId}**: ${verdictScore} - ${reasoning}`);
  }
  lines.push("");

  if (includeRaw) {
    lines.push(
      "<details>",
      "<summary>Raw Output</summary>",
      "",
      ...jsonCodeBlock(result.output),
      "</details>",
      ""
    );
  }
  return lines.join("\n");
}
|
|
2034
|
+
/**
 * Renders multi-turn iteration statistics as a markdown table.
 *
 * @param {{ terminationCounts: Record<string, number>, avgTurns: number, minTurns: number, maxTurns: number }} stats
 * @returns {string[]} Markdown lines ending with an empty line. The
 *   termination distribution reads "none" when there are no counts.
 */
function formatMultiTurnIterationStats(stats) {
  const distribution = Object.entries(stats.terminationCounts)
    .map(([type, count]) => `${type}: ${count}`)
    .join(", ");
  const terminationSummary = distribution || "none";
  const rows = [
    `| Avg Turns | ${stats.avgTurns.toFixed(1)} |`,
    `| Min/Max Turns | ${stats.minTurns} / ${stats.maxTurns} |`,
    `| Termination Distribution | ${terminationSummary} |`
  ];
  return [
    "**Multi-turn Iteration Statistics:**",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    ...rows,
    ""
  ];
}
|
|
2047
|
+
/**
 * Renders per-iteration results as a markdown table followed by a stats line.
 *
 * @param {{ mean: number, stdDev: number, min: number, max: number, passRate: number }} stats
 * @param {object[]} results - Iteration results; each needs `overallScore`,
 *   `passed` and `metrics.latencyMs`.
 * @returns {string[]} Markdown lines ending with an empty line.
 */
function formatIterationResults(stats, results) {
  const lines = [
    "**Iteration Results:**",
    "",
    "| # | Score | Passed | Latency |",
    "|---|-------|--------|---------|"
  ];
  results.forEach((iteration, position) => {
    const cells = [
      position + 1,
      iteration.overallScore.toFixed(1),
      passFailIcon(iteration.passed),
      `${iteration.metrics.latencyMs.toFixed(0)}ms`
    ];
    lines.push(`| ${cells.join(" | ")} |`);
  });
  const passPercent = (stats.passRate * 100).toFixed(0);
  lines.push(
    "",
    `**Stats:** ${stats.mean.toFixed(1)} \xB1 ${stats.stdDev.toFixed(1)} (min: ${stats.min.toFixed(0)}, max: ${stats.max.toFixed(0)}, pass rate: ${passPercent}%)`,
    ""
  );
  return lines;
}
|
|
2066
|
+
/**
 * Renders a multi-turn conversation as one collapsible block per turn.
 *
 * @param {Iterable<{ turn: *, input: *, output: * }>} history - Conversation turns.
 * @param {number} previewLength - Maximum length passed to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines.
 */
function formatConversationHistory(history, previewLength) {
  const lines = ["**Conversation History:**", ""];
  for (const { turn, input, output } of history) {
    lines.push(
      "<details>",
      `<summary>Turn ${turn}</summary>`,
      "",
      "**Input:**",
      ...jsonCodeBlock(input, previewLength),
      "",
      "**Output:**",
      ...jsonCodeBlock(output, previewLength),
      "</details>",
      ""
    );
  }
  return lines;
}
|
|
2082
|
+
/**
 * Renders the input and output of a single-turn test as two JSON code blocks.
 *
 * @param {*} input - Test input.
 * @param {*} output - Agent output.
 * @param {number} previewLength - Maximum length passed to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines ending with an empty line.
 */
function formatSingleTurnInputOutput(input, output, previewLength) {
  const inputBlock = jsonCodeBlock(input, previewLength);
  const outputBlock = jsonCodeBlock(output, previewLength);
  return ["**Input:**", ...inputBlock, "", "**Output:**", ...outputBlock, ""];
}
|
|
2092
|
+
/**
 * Renders one improvement suggestion as a markdown section with a diff block.
 *
 * @param {{ priority: string, type: string, reasoning: string, expectedImprovement: string, currentValue: string, suggestedValue: string }} suggestion
 * @returns {string} Markdown text; every line of `currentValue` is prefixed
 *   with "- " and every line of `suggestedValue` with "+ ".
 */
function formatSuggestion(suggestion) {
  const { priority, type, reasoning, expectedImprovement, currentValue, suggestedValue } = suggestion;
  const iconsByPriority = { high: "\u{1F534}", medium: "\u{1F7E1}", low: "\u{1F7E2}" };
  const priorityIcon = iconsByPriority[priority] ?? "\u26AA";
  // Prefix every line of a multi-line value with the diff marker.
  const asDiffLines = (marker, text) => text.split("\n").map((line) => `${marker} ${line}`).join("\n");
  return [
    `### ${priorityIcon} [${priority.toUpperCase()}] ${type}`,
    "",
    `**Reasoning:** ${reasoning}`,
    "",
    `**Expected Improvement:** ${expectedImprovement}`,
    "",
    "**Diff:**",
    "```diff",
    asDiffLines("-", currentValue),
    asDiffLines("+", suggestedValue),
    "```",
    ""
  ].join("\n");
}
|
|
2109
|
+
|
|
2110
|
+
// src/cli/output/report.ts
|
|
2111
|
+
/**
 * Renders a report as markdown and writes it to disk.
 *
 * @param {object} report - Evaluation report passed to `reportToMarkdown`.
 * @param {{ dir?: string, filename?: string, markdown?: object }} [options]
 *   `dir` defaults to "./reports", `filename` to a timestamped name from
 *   `generateFilename()`; `markdown` is forwarded to `reportToMarkdown`.
 * @returns {Promise<string>} Path of the written file (`dir` joined with `filename`).
 * @throws Rejects with the underlying file-system error when the directory or
 *   file cannot be created.
 */
async function generateReport(report, options = {}) {
  const {
    dir = "./reports",
    filename = generateFilename(),
    markdown
  } = options;
  // `recursive: true` already succeeds when the directory exists, so no
  // separate existence check is needed (and none can go stale in between).
  await mkdir(dir, { recursive: true });
  const content = reportToMarkdown(report, markdown);
  const outputPath = join(dir, filename);
  await writeFile2(outputPath, content, "utf-8");
  return outputPath;
}
|
|
2125
|
+
/**
 * Builds a timestamped report filename from the current time.
 *
 * The ISO timestamp has its colons replaced by dashes and its trailing
 * milliseconds and "Z" removed, e.g. "eval-2024-01-31T12-30-45.md".
 *
 * @returns {string} The filename.
 */
function generateFilename() {
  const isoNow = (/* @__PURE__ */ new Date()).toISOString();
  const withoutColons = isoNow.replace(/:/g, "-");
  const timestamp = withoutColons.replace(/\.\d{3}Z$/, "");
  return `eval-${timestamp}.md`;
}
|
|
2129
|
+
|
|
2130
|
+
// src/core/runner.ts
|
|
2131
|
+
import { resolveFileSourcesInInput as resolveFileSourcesInInput2 } from "@agtlantis/core";
|
|
2132
|
+
|
|
2133
|
+
// src/utils/semaphore.ts
|
|
2134
|
+
/**
 * Creates a counting semaphore for limiting concurrent async work.
 *
 * @param {number} limit - Maximum number of holders at any time.
 * @returns {{ acquire: () => Promise<void>, release: () => void }} `acquire`
 *   resolves once a slot is available; `release` frees a slot and passes it
 *   to the longest-waiting caller, if any.
 */
function createSemaphore(limit) {
  let activeCount = 0;
  const pendingResolvers = [];
  return {
    async acquire() {
      if (activeCount < limit) {
        activeCount += 1;
        return;
      }
      // No free slot: park the caller until `release` hands one over.
      return new Promise((wake) => {
        pendingResolvers.push(wake);
      });
    },
    release() {
      activeCount -= 1;
      const wakeNext = pendingResolvers.shift();
      if (wakeNext) {
        // The freed slot goes straight to the next waiter.
        activeCount += 1;
        wakeNext();
      }
    }
  };
}
|
|
2157
|
+
|
|
2158
|
+
// src/core/constants.ts
|
|
2159
|
+
// Score-related constants shared by the evaluation core.
var SCORE = {
  // Lowest score a test can receive.
  MIN: 0,
  // Highest score a test can receive.
  MAX: 100,
  // Pass threshold used when none is configured.
  DEFAULT_PASS_THRESHOLD: 70,
  // Pass-rate threshold (50%) for majority-based pass decisions.
  MAJORITY_PASS_THRESHOLD: 0.5
};
|
|
2169
|
+
// Token usage reported when an agent run provides no usage metadata.
var ZERO_TOKEN_USAGE = { inputTokens: 0, outputTokens: 0, totalTokens: 0 };
|
|
2174
|
+
|
|
2175
|
+
// src/core/runner.ts
|
|
2176
|
+
/**
 * Runs one single-turn test case: resolves file sources in the input,
 * executes the agent, then asks the judge to evaluate the output.
 *
 * Failures while resolving files or executing the agent are returned as a
 * failed result (score 0, no verdicts) instead of being thrown. Errors from
 * the judge propagate to the caller.
 *
 * @param {object} testCase2 - Test case (`id`, `input`, optional `files`).
 * @param {{ agent: object, judge: object, agentDescription: * }} context
 * @param {AbortSignal} [signal] - Checked before execution and before evaluation.
 * @returns {Promise<object>} A result with `kind: "single-turn"`.
 * @throws {EvalError} `AGENT_EXECUTION_ERROR` with `context.reason === "aborted"`
 *   when the signal is aborted at either checkpoint.
 */
async function executeTestCase(testCase2, context, signal) {
  const { agent, judge, agentDescription } = context;
  const abortedError = (message) => new EvalError(message, {
    code: "AGENT_EXECUTION_ERROR",
    context: { testCaseId: testCase2.id, reason: "aborted" }
  });
  const errorContext = () => ({ testCaseId: testCase2.id, agentName: agent.config.name });

  if (signal?.aborted) {
    throw abortedError("Test execution aborted");
  }

  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput2(testCase2.input, {
      basePath: process.cwd()
    });
  } catch (fileFailure) {
    return createFailedResult(testCase2, EvalError.from(fileFailure, "FILE_READ_ERROR", errorContext()));
  }

  // Latency covers the agent call only, not file resolution or judging.
  const startedAt = performance.now();
  let output;
  let tokenUsage = ZERO_TOKEN_USAGE;
  let error;
  try {
    const agentResult = await agent.execute(resolvedInput);
    output = agentResult.result;
    tokenUsage = agentResult.metadata?.tokenUsage || tokenUsage;
  } catch (agentFailure) {
    error = EvalError.from(agentFailure, "AGENT_EXECUTION_ERROR", errorContext());
    output = void 0;
  }
  const latencyMs = performance.now() - startedAt;

  const base = {
    kind: "single-turn",
    testCase: testCase2,
    output,
    metrics: { latencyMs, tokenUsage },
    error
  };

  if (error) {
    return { ...base, verdicts: [], overallScore: 0, passed: false, judgeMetadata: void 0 };
  }
  if (signal?.aborted) {
    throw abortedError("Test execution aborted before evaluation");
  }

  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output,
    agentDescription,
    files: testCase2.files
  });
  return {
    ...base,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: judgeResult.passed,
    judgeMetadata: judgeResult.metadata
  };
}
|
|
2247
|
+
/**
 * Builds the single-turn result for a test that failed before the agent ran.
 *
 * @param {object} testCase2 - The test case that failed.
 * @param {Error} error - The failure to attach to the result.
 * @returns {object} A failed result: no output, zero latency and token usage,
 *   no verdicts, score 0.
 */
function createFailedResult(testCase2, error) {
  const emptyMetrics = { latencyMs: 0, tokenUsage: ZERO_TOKEN_USAGE };
  return {
    kind: "single-turn",
    testCase: testCase2,
    output: void 0,
    metrics: emptyMetrics,
    error,
    verdicts: [],
    overallScore: 0,
    passed: false,
    judgeMetadata: void 0
  };
}
|
|
2260
|
+
/**
 * Converts the outcome of a multi-turn execution into a `kind: "multi-turn"`
 * result, copying the shared fields and exposing `termination.reason` as
 * `terminationReason`.
 *
 * @param {object} result - Multi-turn execution outcome; must carry a `termination` object.
 * @returns {object} The runner result.
 */
function toMultiTurnResult(result) {
  const {
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    termination
  } = result;
  return {
    kind: "multi-turn",
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    terminationReason: termination.reason,
    termination
  };
}
|
|
2276
|
+
/**
 * Runs test cases with a bounded number of concurrent executions.
 *
 * Stops scheduling further work (and aborts the signal given to running
 * tests) when the external signal aborts, when a test throws, or when
 * `stopOnFirstFailure` is set and a test does not pass.
 *
 * @param {object[]} testCases2 - Test cases to run.
 * @param {object} context - Forwarded to `executeTestCaseByType`.
 * @param {{ concurrency?: number, stopOnFirstFailure?: boolean, signal?: AbortSignal }} [options]
 *   `concurrency` defaults to 1.
 * @returns {Promise<object[]>} Results of the tests that completed, in input
 *   order; tests that never ran are omitted.
 * @throws {EvalError} `INVALID_CONFIG` when `concurrency` is below 1.
 * @throws {Error} The first non-abort error thrown by a test execution.
 */
async function runWithConcurrency(testCases2, context, options = {}) {
  const { concurrency = 1, stopOnFirstFailure = false, signal } = options;
  if (concurrency < 1) {
    throw new EvalError("Concurrency must be at least 1", {
      code: "INVALID_CONFIG",
      context: { concurrency }
    });
  }
  if (testCases2.length === 0) {
    return [];
  }

  const semaphore = createSemaphore(concurrency);
  const internalAbort = new AbortController();
  const results = [];
  let shouldStop = false;
  let firstError;

  // Stops scheduling new tests and signals running ones to abort.
  const halt = () => {
    shouldStop = true;
    internalAbort.abort();
  };

  const runOne = async (testCase2, index) => {
    if (shouldStop) return;
    await semaphore.acquire();
    try {
      // Re-check: the run may have been halted while waiting for a slot.
      if (shouldStop) return;
      const outcome = await executeTestCaseByType(testCase2, context, internalAbort.signal);
      results[index] = outcome;
      if (stopOnFirstFailure && !outcome.passed) {
        halt();
      }
    } catch (failure) {
      // Abort-induced errors are a consequence of halting, not a cause.
      if (!firstError && !isAbortError(failure)) {
        firstError = failure instanceof Error ? failure : new Error(String(failure));
      }
      halt();
    } finally {
      semaphore.release();
    }
  };

  signal?.addEventListener("abort", halt);
  if (signal?.aborted) {
    shouldStop = true;
  }
  try {
    await Promise.allSettled(testCases2.map((testCase2, index) => runOne(testCase2, index)));
    if (firstError) {
      throw firstError;
    }
    // Slots of tests that never ran are left unset; drop them.
    return results.filter((entry) => entry !== void 0);
  } finally {
    signal?.removeEventListener("abort", halt);
  }
}
|
|
2332
|
+
/**
 * Tells whether a caught value represents a cancellation rather than a failure.
 *
 * A value counts as an abort when it is a DOMException named "AbortError", or an
 * EvalError whose `context.reason` equals "aborted".
 *
 * @param {unknown} e - The caught value.
 * @returns {boolean} True when the value signals an abort.
 */
function isAbortError(e) {
  if (e instanceof DOMException && e.name === "AbortError") {
    return true;
  }
  return e instanceof EvalError && e.context?.reason === "aborted";
}
|
|
2335
|
+
/**
 * Runs one test case through the executor that matches its shape.
 *
 * Multi-turn test cases go through executeMultiTurnTestCase and the outcome is
 * converted with toMultiTurnResult; everything else goes through executeTestCase.
 *
 * @param {object} testCase2 - The test case to execute.
 * @param {object} context - Execution context forwarded to the executor.
 * @param {AbortSignal} [signal] - Abort signal forwarded to the executor.
 * @returns {Promise<object>} The result produced by the selected executor.
 */
async function executeTestCaseByType(testCase2, context, signal) {
  if (!isMultiTurnTestCase(testCase2)) {
    return executeTestCase(testCase2, context, signal);
  }
  const rawMultiTurn = await executeMultiTurnTestCase(testCase2, context, { signal });
  return toMultiTurnResult(rawMultiTurn);
}
|
|
2342
|
+
|
|
2343
|
+
// src/core/types.ts
|
|
2344
|
+
/**
 * Checks whether a result was produced by a multi-turn run.
 *
 * @param {{ kind: string }} result - A test result.
 * @returns {boolean} True for kind "multi-turn" or "multi-turn-iterated".
 */
function isMultiTurnResult(result) {
  const { kind } = result;
  return kind === "multi-turn" || kind === "multi-turn-iterated";
}
|
|
2347
|
+
|
|
2348
|
+
// src/core/iteration.ts
|
|
2349
|
+
/**
 * Summarises the scores of repeated runs of one test case.
 *
 * The standard deviation is the population form (divides by the number of
 * scores). An empty input yields an all-zero summary.
 *
 * @param {Array<{ overallScore: number, passed: boolean }>} results - One result per iteration.
 * @returns {{ iterations: number, scores: number[], mean: number, stdDev: number,
 *   min: number, max: number, passRate: number, passCount: number }} Aggregate statistics.
 */
function calculateIterationStats(results) {
  const iterations = results.length;
  if (iterations === 0) {
    return {
      iterations: 0,
      scores: [],
      mean: 0,
      stdDev: 0,
      min: 0,
      max: 0,
      passRate: 0,
      passCount: 0
    };
  }
  const scores = results.map(({ overallScore }) => overallScore);
  let passCount = 0;
  for (const outcome of results) {
    if (outcome.passed) {
      passCount += 1;
    }
  }
  let scoreTotal = 0;
  for (const score of scores) {
    scoreTotal += score;
  }
  const mean = scoreTotal / scores.length;
  let squaredDeviationTotal = 0;
  for (const score of scores) {
    squaredDeviationTotal += Math.pow(score - mean, 2);
  }
  const stdDev = Math.sqrt(squaredDeviationTotal / scores.length);
  return {
    iterations,
    scores,
    mean,
    stdDev,
    min: Math.min(...scores),
    max: Math.max(...scores),
    passRate: passCount / iterations,
    passCount
  };
}
|
|
2378
|
+
/**
 * Extends the base iteration statistics with multi-turn specific figures.
 *
 * Adds the average, minimum and maximum of `totalTurns`, plus a tally of
 * `termination.terminationType` values (results with a falsy type are not tallied).
 *
 * @param {Array<object>} results - Multi-turn results, one per iteration.
 * @returns {object} Base statistics plus avgTurns, minTurns, maxTurns and terminationCounts.
 */
function calculateMultiTurnIterationStats(results) {
  const baseStats = calculateIterationStats(results);
  const turns = results.map(({ totalTurns }) => totalTurns);
  const terminationCounts = {};
  results.forEach(({ termination }) => {
    const type = termination.terminationType;
    if (!type) {
      return;
    }
    terminationCounts[type] = (terminationCounts[type] || 0) + 1;
  });
  const hasTurns = turns.length > 0;
  let turnTotal = 0;
  for (const count of turns) {
    turnTotal += count;
  }
  return {
    ...baseStats,
    avgTurns: hasTurns ? turnTotal / turns.length : 0,
    minTurns: hasTurns ? Math.min(...turns) : 0,
    maxTurns: hasTurns ? Math.max(...turns) : 0,
    terminationCounts
  };
}
|
|
2396
|
+
/**
 * Picks the result whose overall score lies closest to the given mean.
 *
 * On a tie the earlier result wins, because a later one only replaces the
 * current pick when it is strictly closer.
 *
 * @param {Array<{ overallScore: number }>} results - Candidate results.
 * @param {number} mean - The score to get close to.
 * @returns {object} The closest result.
 * @throws {EvalError} When `results` is empty.
 */
function selectRepresentativeResult(results, mean) {
  if (results.length === 0) {
    throw new EvalError("Cannot select representative result from empty array", {
      code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */
    });
  }
  let closest = results[0];
  for (let index = 1; index < results.length; index++) {
    const candidate = results[index];
    const closestDiff = Math.abs(closest.overallScore - mean);
    const candidateDiff = Math.abs(candidate.overallScore - mean);
    if (candidateDiff < closestDiff) {
      closest = candidate;
    }
  }
  return closest;
}
|
|
2408
|
+
/**
 * Collapses the results of several iterations into one aggregated result per
 * test-case position.
 *
 * For every position the per-iteration results are gathered, summarised with
 * calculateIterationStats, and a representative result (closest to the mean
 * score) supplies the descriptive fields. The aggregated score is the mean and
 * `passed` is decided by the pass rate against SCORE.MAJORITY_PASS_THRESHOLD.
 *
 * Iterations are not assumed to have the same length: a position missing from
 * an iteration is ignored instead of being dereferenced, and positions that
 * exist only in later iterations are still aggregated.
 *
 * @param {Array<Array<object>>} allIterationResults - One result list per iteration.
 * @returns {Array<object>} Results of kind "single-turn-iterated" or "multi-turn-iterated".
 */
function aggregateIterationResults(allIterationResults) {
  if (allIterationResults.length === 0) {
    return [];
  }
  // Use the longest iteration so results are not dropped just because the
  // first iteration happened to be shorter.
  const testCount = Math.max(...allIterationResults.map((iteration) => iteration.length));
  const aggregated = [];
  for (let i = 0; i < testCount; i++) {
    // A shorter iteration has no entry at this position; skip it rather than
    // passing `undefined` into the statistics helpers.
    const resultsForTestCase = allIterationResults
      .map((iteration) => iteration[i])
      .filter((r) => r !== void 0);
    if (resultsForTestCase.length === 0) {
      continue;
    }
    const stats = calculateIterationStats(resultsForTestCase);
    const passedByMajority = stats.passRate >= SCORE.MAJORITY_PASS_THRESHOLD;
    const multiTurnResults = resultsForTestCase.filter((r) => isMultiTurnResult(r));
    const isMultiTurn = multiTurnResults.length > 0;
    // The multi-turn fields below only exist on multi-turn results, so the
    // representative is drawn from those when any are present.
    const representative = selectRepresentativeResult(
      isMultiTurn ? multiTurnResults : resultsForTestCase,
      stats.mean
    );
    const sharedFields = {
      testCase: representative.testCase,
      output: representative.output,
      metrics: representative.metrics,
      verdicts: representative.verdicts,
      error: representative.error,
      overallScore: stats.mean,
      passed: passedByMajority,
      iterationStats: stats,
      iterationResults: resultsForTestCase
    };
    if (isMultiTurn) {
      aggregated.push({
        kind: "multi-turn-iterated",
        ...sharedFields,
        conversationHistory: representative.conversationHistory,
        totalTurns: representative.totalTurns,
        terminationReason: representative.terminationReason,
        termination: representative.termination,
        multiTurnIterationStats: calculateMultiTurnIterationStats(multiTurnResults)
      });
    } else {
      aggregated.push({
        kind: "single-turn-iterated",
        ...sharedFields
      });
    }
  }
  return aggregated;
}
|
|
2461
|
+
/**
 * Keeps only results that aggregate several iterations.
 *
 * @param {Array<{ kind: string }>} results - Mixed results.
 * @returns {Array<object>} Results of kind "single-turn-iterated" or "multi-turn-iterated".
 */
function filterIteratedResults(results) {
  const iteratedKinds = ["single-turn-iterated", "multi-turn-iterated"];
  return results.filter((r) => iteratedKinds.includes(r.kind));
}
|
|
2466
|
+
/**
 * Averages one iteration statistic over all iterated results.
 *
 * @param {Array<object>} results - Mixed results; non-iterated ones are ignored.
 * @param {(stats: object) => number} selector - Picks the statistic from `iterationStats`.
 * @returns {number | undefined} The average, or undefined when there are no iterated results.
 */
function averageIterationStat(results, selector) {
  const iterated = filterIteratedResults(results);
  const count = iterated.length;
  if (count === 0) {
    return void 0;
  }
  let total = 0;
  for (const entry of iterated) {
    total += selector(entry.iterationStats);
  }
  return total / count;
}
|
|
2474
|
+
/**
 * Averages the per-test standard deviation across iterated results.
 *
 * @param {Array<object>} results - Mixed results.
 * @returns {number | undefined} Average `stdDev`, or undefined without iterated results.
 */
function calculateAvgStdDev(results) {
  const pickStdDev = (stats) => stats.stdDev;
  return averageIterationStat(results, pickStdDev);
}
|
|
2477
|
+
/**
 * Averages the per-test pass rate across iterated results.
 *
 * @param {Array<object>} results - Mixed results.
 * @returns {number | undefined} Average `passRate`, or undefined without iterated results.
 */
function calculateAvgPassRate(results) {
  const pickPassRate = (stats) => stats.passRate;
  return averageIterationStat(results, pickPassRate);
}
|
|
2480
|
+
|
|
2481
|
+
// src/core/suite.ts
|
|
2482
|
+
/**
 * Computes suite-level latency and token figures.
 *
 * @param {Array<{ metrics: { latencyMs: number, tokenUsage: { totalTokens: number } } }>} results
 *   Test results carrying metrics.
 * @returns {{ avgLatencyMs: number, totalTokens: number }} Mean latency and summed tokens
 *   (both 0 for an empty input).
 */
function calculateAggregatedMetrics(results) {
  const resultCount = results.length;
  if (resultCount === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  const latencyOf = (r) => r.metrics.latencyMs;
  const tokensOf = (r) => r.metrics.tokenUsage.totalTokens;
  const totalLatencyMs = sumBy(results, latencyOf);
  const totalTokens = sumBy(results, tokensOf);
  return { avgLatencyMs: totalLatencyMs / resultCount, totalTokens };
}
|
|
2493
|
+
/**
 * Adds up a numeric value derived from every item.
 *
 * @param {Array<any>} items - Items to visit, in order.
 * @param {(item: any) => number} selector - Produces the number to add for an item.
 * @returns {number} The total (0 for an empty input).
 */
function sumBy(items, selector) {
  let total = 0;
  for (const item of items) {
    total += selector(item);
  }
  return total;
}
|
|
2496
|
+
/**
 * Builds the summary section of an evaluation report.
 *
 * @param {Array<object>} results - Final (possibly aggregated) results.
 * @param {number} [iterations] - Iteration count; the iteration fields are only
 *   added when this is greater than 1.
 * @returns {object} totalTests, passed, failed, avgScore and metrics, plus
 *   iterations, avgStdDev and avgPassRate for multi-iteration runs.
 */
function calculateSummary(results, iterations) {
  const metrics = calculateAggregatedMetrics(results);
  const totalTests = results.length;
  const passedCount = results.filter((r) => r.passed).length;
  const avgScore = totalTests > 0 ? sumBy(results, (r) => r.overallScore) / totalTests : 0;
  const summary = {
    totalTests,
    passed: passedCount,
    failed: totalTests - passedCount,
    avgScore,
    metrics
  };
  if (iterations && iterations > 1) {
    summary.iterations = iterations;
    summary.avgStdDev = calculateAvgStdDev(results);
    summary.avgPassRate = calculateAvgPassRate(results);
  }
  return summary;
}
|
|
2516
|
+
/**
 * Creates an evaluation suite bound to an agent, a judge and an optional improver.
 *
 * The agent description handed to the judge is `config.agentDescription`, falling
 * back to the agent's configured description and then its name.
 *
 * @param {{ agent: object, agentDescription?: string, judge: object, improver?: object }} config
 *   Suite configuration.
 * @returns {{ run: Function, withAgent: Function }} The suite.
 */
function createEvalSuite(config) {
  const { agent, agentDescription, judge, improver } = config;
  const description = agentDescription ?? agent.config.description ?? agent.config.name;
  return {
    /**
     * Executes the test cases (once, or `options.iterations` times) and builds a report.
     */
    async run(testCases2, options) {
      const iterations = options?.iterations ?? 1;
      validateIterations(iterations);
      const executeContext = { agent, judge, agentDescription: description };
      let results;
      if (iterations <= 1) {
        results = await runWithConcurrency(testCases2, executeContext, options);
      } else {
        results = await runMultipleIterations(testCases2, executeContext, options, iterations);
      }
      const summary = calculateSummary(results, iterations > 1 ? iterations : void 0);
      let suggestions = [];
      if (improver) {
        const improvement = await improver.improve(agent.prompt, results);
        suggestions = improvement.suggestions;
      }
      return {
        summary,
        results,
        suggestions,
        generatedAt: /* @__PURE__ */ new Date(),
        promptVersion: agent.prompt.version
      };
    },
    /**
     * Returns a new suite for another agent; the explicit description is cleared
     * so it is derived from the new agent's config.
     */
    withAgent(newAgent) {
      return createEvalSuite({
        ...config,
        agent: newAgent,
        agentDescription: void 0
      });
    }
  };
}
|
|
2545
|
+
/**
 * Ensures the iteration count is an integer of at least 1.
 *
 * @param {number} iterations - Requested number of iterations.
 * @returns {void}
 * @throws {EvalError} With code "INVALID_CONFIG" when the value is not a positive integer.
 */
function validateIterations(iterations) {
  const isPositiveInteger = Number.isInteger(iterations) && iterations >= 1;
  if (isPositiveInteger) {
    return;
  }
  throw new EvalError(
    `Invalid iterations value: ${iterations}. Must be a positive integer.`,
    { code: "INVALID_CONFIG" /* INVALID_CONFIG */, context: { iterations } }
  );
}
|
|
2553
|
+
/**
 * Runs the whole test-case list `iterations` times, one pass after another, and
 * aggregates the per-iteration results.
 *
 * Each pass receives a copy of `options` with `iterations` cleared.
 *
 * @param {Array<object>} testCases2 - Test cases to execute.
 * @param {object} executeContext - Agent, judge and description for execution.
 * @param {object} [options] - Run options forwarded to every pass.
 * @param {number} iterations - Number of passes.
 * @returns {Promise<Array<object>>} Aggregated results.
 */
async function runMultipleIterations(testCases2, executeContext, options, iterations) {
  const allIterationResults = [];
  while (allIterationResults.length < iterations) {
    const singlePassOptions = { ...options, iterations: void 0 };
    const passResults = await runWithConcurrency(testCases2, executeContext, singlePassOptions);
    allIterationResults.push(passResults);
  }
  return aggregateIterationResults(allIterationResults);
}
|
|
2565
|
+
|
|
2566
|
+
// src/index.ts
|
|
2567
|
+
import {
|
|
2568
|
+
resolveFileSource,
|
|
2569
|
+
resolveFileSourcesInInput as resolveFileSourcesInInput3,
|
|
2570
|
+
scanForFileSources,
|
|
2571
|
+
getFileSourceDisplayInfo,
|
|
2572
|
+
getFileSourcesDisplayInfo as getFileSourcesDisplayInfo2,
|
|
2573
|
+
inferMediaType,
|
|
2574
|
+
isFileSource,
|
|
2575
|
+
isFileSourcePath,
|
|
2576
|
+
isFileSourceData,
|
|
2577
|
+
isFileSourceBase64,
|
|
2578
|
+
isFileSourceUrl
|
|
2579
|
+
} from "@agtlantis/core";
|
|
2580
|
+
|
|
2581
|
+
// src/judge/llm-judge.ts
|
|
2582
|
+
import { Output } from "ai";
|
|
2583
|
+
import { z as z3 } from "zod";
|
|
2584
|
+
|
|
2585
|
+
// src/judge/prompts/default.ts
|
|
2586
|
+
/**
 * Built-in judge prompt: a fixed system message describing the scoring rubric and
 * JSON response format, plus a renderer that lays out the agent description,
 * input, optional reference files, output and criteria for the user message.
 */
var defaultJudgePrompt = {
  id: "default-judge",
  version: "2.0.0",
  system: `You are an expert evaluator specializing in assessing AI Agent outputs.

Your role is to fairly and thoroughly evaluate the agent's output against the provided criteria.

## Evaluation Principles

1. **Scoring**: Assign a score between 0-100 for each criterion
- 90-100: Exceptional - Exceeds expectations with no significant issues
- 70-89: Good - Meets expectations with minor issues
- 50-69: Acceptable - Partially meets expectations, notable issues present
- 30-49: Poor - Falls short of expectations, significant issues
- 0-29: Failing - Does not meet minimum requirements

2. **Reasoning**: Always provide specific, evidence-based reasoning
- Quote or reference specific parts of the output
- Explain both strengths and weaknesses
- Be constructive and actionable in feedback

3. **Objectivity**: Evaluate based solely on the criteria provided
- Avoid personal preferences or unstated requirements
- Consider the agent's intended purpose and context
- Weight severity of issues proportionally

## Response Format

You MUST respond with valid JSON only. No additional text or explanation outside the JSON structure.

{
"verdicts": [
{
"criterionId": "criterion-id",
"score": 0-100,
"reasoning": "Detailed explanation with specific evidence from the output",
"passed": true/false
}
]
}`,
  renderUserPrompt: (ctx) => {
    const referenceFiles = buildFileSection(ctx.files);
    // One bullet per criterion; a missing weight is shown as 1.
    const describeCriterion = (criterion) => `- **${criterion.name}** (id: ${criterion.id}, weight: ${criterion.weight ?? 1}): ${criterion.description}`;
    return `
## Agent Under Evaluation
${ctx.agentDescription}

## Input Provided to Agent
\`\`\`json
${JSON.stringify(ctx.input, null, 2)}
\`\`\`
${referenceFiles}
## Agent Output
\`\`\`json
${JSON.stringify(ctx.output, null, 2)}
\`\`\`

## Evaluation Criteria
${ctx.criteria.map((criterion) => describeCriterion(criterion)).join("\n")}

Please evaluate the agent's output against each criterion listed above.`.trim();
  }
};
|
|
2648
|
+
/**
 * Renders the "Reference Files" part of the judge prompt.
 *
 * @param {Array<{ path: string, content: string }>} [files] - Files to show.
 * @returns {string} An empty string when there are no files; otherwise a section
 *   that starts and ends with a newline, with one fenced block per file.
 */
function buildFileSection(files) {
  if (!files || files.length === 0) {
    return "";
  }
  const fence = "```";
  const renderedFiles = files.map((f) => [`### ${f.path}`, fence, `${f.content}`, fence].join("\n"));
  return `\n## Reference Files\n${renderedFiles.join("\n\n")}\n`;
}
|
|
2660
|
+
|
|
2661
|
+
// src/judge/llm-judge.ts
|
|
2662
|
+
/**
 * Converts provider usage data into the eval token-usage shape.
 *
 * @param {{ inputTokens?: number, outputTokens?: number, totalTokens?: number }} usage
 *   Usage as reported by the provider.
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 *   The same counts with null/undefined replaced by 0.
 */
function toEvalTokenUsage(usage) {
  const orZero = (count) => count ?? 0;
  return {
    inputTokens: orZero(usage.inputTokens),
    outputTokens: orZero(usage.outputTokens),
    totalTokens: orZero(usage.totalTokens)
  };
}
|
|
2669
|
+
/**
 * Tells whether a criterion is checked programmatically.
 *
 * @param {object} criterion - An evaluation criterion.
 * @returns {boolean} True when the criterion has a `validator` property that is a function.
 */
function hasValidator(criterion) {
  if (!("validator" in criterion)) {
    return false;
  }
  return typeof criterion.validator === "function";
}
|
|
2672
|
+
// Schema for the structured judge response: an object with a `verdicts` array.
// Each verdict has a string `criterionId`, a numeric `score` bounded by
// SCORE.MIN and SCORE.MAX, a string `reasoning`, and an optional boolean `passed`.
var JudgeResponseSchema = z3.object({
|
|
2673
|
+
verdicts: z3.array(
|
|
2674
|
+
z3.object({
|
|
2675
|
+
criterionId: z3.string(),
|
|
2676
|
+
score: z3.number().min(SCORE.MIN).max(SCORE.MAX),
|
|
2677
|
+
reasoning: z3.string(),
|
|
2678
|
+
passed: z3.boolean().optional()
|
|
2679
|
+
})
|
|
2680
|
+
)
|
|
2681
|
+
});
|
|
2682
|
+
/**
 * Verifies that the judge returned a verdict for every expected criterion.
 *
 * @param {Array<{ criterionId: string }>} verdicts - Verdicts returned by the judge.
 * @param {string[]} criteriaIds - Ids that must each have a verdict.
 * @returns {void}
 * @throws {EvalError} With code "VERDICT_PARSE_ERROR"; the context lists the
 *   missing ids and the ids that were provided.
 */
function validateAllCriteriaHaveVerdicts(verdicts, criteriaIds) {
  const providedIds = new Set();
  for (const verdict of verdicts) {
    providedIds.add(verdict.criterionId);
  }
  const missingIds = criteriaIds.filter((id) => !providedIds.has(id));
  if (missingIds.length === 0) {
    return;
  }
  throw new EvalError("Judge response missing verdicts for some criteria", {
    code: "VERDICT_PARSE_ERROR" /* VERDICT_PARSE_ERROR */,
    context: { missingCriteriaIds: missingIds, providedIds: [...providedIds] }
  });
}
|
|
2692
|
+
/**
 * Computes the weighted average of verdict scores, rounded to two decimals.
 *
 * @param {Array<{ criterionId: string, score: number }>} verdicts - Scored verdicts.
 * @param {Map<string, number>} criteriaWeights - Weight per criterion id; ids
 *   without an entry count with weight 1.
 * @returns {number} The weighted average, or 0 when the weights sum to 0.
 */
function calculateOverallScore(verdicts, criteriaWeights) {
  const totals = { weighted: 0, weight: 0 };
  verdicts.forEach((verdict) => {
    const weight = criteriaWeights.get(verdict.criterionId) ?? 1;
    totals.weighted += verdict.score * weight;
    totals.weight += weight;
  });
  if (totals.weight === 0) {
    return 0;
  }
  const average = totals.weighted / totals.weight;
  return Math.round(average * 100) / 100;
}
|
|
2705
|
+
/**
 * Runs every programmatic criterion against the agent output.
 *
 * A valid result scores 100 and passes; an invalid one scores 0, fails, and its
 * reasoning carries the validator's `errorSummary` (or a default message) on a
 * second line.
 *
 * @param {Array<{ id: string, name: string, validator: (output: any) => { valid: boolean, errorSummary?: string } }>} validatorCriteria
 *   Criteria that carry a validator function.
 * @param {any} output - The agent output to validate.
 * @returns {Array<{ criterionId: string, score: number, reasoning: string, passed: boolean }>}
 *   One verdict per criterion, in input order.
 */
function runValidatorCriteria(validatorCriteria, output) {
  const verdicts = [];
  for (const criterion of validatorCriteria) {
    const result = criterion.validator(output);
    if (!result.valid) {
      verdicts.push({
        criterionId: criterion.id,
        score: 0,
        reasoning: `${criterion.name} \uC2E4\uD328:
${result.errorSummary ?? "\uC720\uD6A8\uC131 \uAC80\uC99D \uC624\uB958"}`,
        passed: false
      });
      continue;
    }
    verdicts.push({
      criterionId: criterion.id,
      score: 100,
      reasoning: `${criterion.name} \uD1B5\uACFC`,
      passed: true
    });
  }
  return verdicts;
}
|
|
2725
|
+
/**
 * Asks the LLM judge to score the output against the LLM-evaluated criteria.
 *
 * The system and user messages are sent through `provider.simpleExecution`,
 * requesting structured output that matches JudgeResponseSchema. Any failure
 * inside the execution (including a non-succeeded status) is rethrown via
 * EvalError.from with code "LLM_API_ERROR".
 *
 * @param {object} provider - Provider exposing `simpleExecution`.
 * @param {{ id: string, version: string, system: string, renderUserPrompt: Function }} prompt
 *   Judge prompt.
 * @param {object} context - Values passed to `prompt.renderUserPrompt`.
 * @param {string[]} llmCriteriaIds - Criterion ids that must each receive a verdict.
 * @param {number} passThreshold - Score at or above which a verdict passes when the
 *   model did not state `passed` itself.
 * @returns {Promise<{ verdicts: Array<object>, usage: object }>} Verdicts and LLM usage.
 * @throws {EvalError} On execution failure, or when verdicts are missing.
 */
async function runLLMEvaluation(provider, prompt, context, llmCriteriaIds, passThreshold) {
  const systemMessage = { role: "system", content: prompt.system };
  const userMessage = { role: "user", content: prompt.renderUserPrompt(context) };
  const messages = [systemMessage, userMessage];
  let response;
  let usage;
  try {
    const execution = provider.simpleExecution(async (session) => {
      const generated = await session.generateText({
        messages,
        output: Output.object({ schema: JudgeResponseSchema })
      });
      return generated.output;
    });
    const outcome = await execution.result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      throw new Error("Execution was canceled");
    }
    response = outcome.value;
    usage = outcome.summary.totalLLMUsage;
  } catch (cause) {
    throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
      promptId: prompt.id,
      promptVersion: prompt.version
    });
  }
  validateAllCriteriaHaveVerdicts(response.verdicts, llmCriteriaIds);
  const verdicts = response.verdicts.map((verdict) => {
    // Trust an explicit `passed` from the model; otherwise derive it from the score.
    const passed = verdict.passed ?? verdict.score >= passThreshold;
    return {
      criterionId: verdict.criterionId,
      score: verdict.score,
      reasoning: verdict.reasoning,
      passed
    };
  });
  return { verdicts, usage };
}
|
|
2761
|
+
/**
 * Creates a judge that combines programmatic validators with LLM scoring.
 *
 * Criteria are split once at creation: those with a validator function are
 * checked locally, the rest are sent to the LLM. Every criterion's weight
 * (default 1) feeds the overall weighted score.
 *
 * @param {{ provider: object, prompt?: object, criteria: Array<object>,
 *   passThreshold?: number, model?: string }} config - Judge configuration.
 * @returns {{ evaluate: (evalContext: object) => Promise<object> }} The judge.
 */
function createJudge(config) {
  const {
    provider,
    prompt = defaultJudgePrompt,
    criteria,
    passThreshold = SCORE.DEFAULT_PASS_THRESHOLD,
    model
  } = config;
  const criteriaWeights = /* @__PURE__ */ new Map();
  const validatorCriteria = [];
  const llmCriteria = [];
  const llmCriteriaIds = [];
  for (const criterion of criteria) {
    criteriaWeights.set(criterion.id, criterion.weight ?? 1);
    if (hasValidator(criterion)) {
      validatorCriteria.push(criterion);
      continue;
    }
    llmCriteria.push(criterion);
    llmCriteriaIds.push(criterion.id);
  }
  /**
   * Evaluates one agent output; the LLM is only called when LLM criteria exist.
   */
  async function evaluate(evalContext) {
    const { input, output, agentDescription, files } = evalContext;
    const validatorVerdicts = runValidatorCriteria(validatorCriteria, output);
    let llmVerdicts = [];
    let llmUsage;
    if (llmCriteria.length > 0) {
      const llmResult = await runLLMEvaluation(
        provider,
        prompt,
        { agentDescription, input, output, criteria: llmCriteria, files },
        llmCriteriaIds,
        passThreshold
      );
      llmVerdicts = llmResult.verdicts;
      llmUsage = llmResult.usage;
    }
    const verdicts = [...validatorVerdicts, ...llmVerdicts];
    const overallScore = calculateOverallScore(verdicts, criteriaWeights);
    // Metadata is only reported when the LLM was actually used.
    const metadata = llmUsage ? { tokenUsage: toEvalTokenUsage(llmUsage), model } : void 0;
    return {
      verdicts,
      overallScore,
      passed: overallScore >= passThreshold,
      metadata
    };
  }
  return { evaluate };
}
|
|
2819
|
+
|
|
2820
|
+
// src/reporter/json-reporter.ts
|
|
2821
|
+
import { writeFileSync } from "fs";
|
|
2822
|
+
|
|
2823
|
+
// src/reporter/cost-helpers.ts
|
|
2824
|
+
import {
|
|
2825
|
+
calculateCostFromUsage
|
|
2826
|
+
} from "@agtlantis/core";
|
|
2827
|
+
/**
 * Copies the three token counters into a fresh usage object.
 *
 * @param {{ inputTokens: number, outputTokens: number, totalTokens: number }} usage
 *   Eval token usage.
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 *   A new object with only those three fields.
 */
function toLanguageModelUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
|
|
2834
|
+
// Lookup table used by normalizeProvider to translate a provider name into the
// name used for pricing lookups. "gemini" maps to "google"; the other entries
// map to themselves.
var PROVIDER_MAPPING = {
|
|
2835
|
+
gemini: "google",
|
|
2836
|
+
openai: "openai",
|
|
2837
|
+
anthropic: "anthropic",
|
|
2838
|
+
google: "google"
|
|
2839
|
+
};
|
|
2840
|
+
/**
 * Guesses the provider from a model name prefix.
 *
 * "gpt-", "o1" and "o3" map to "openai", "gemini-" to "google" and "claude-" to
 * "anthropic". A missing or unrecognised model name falls back to "google".
 *
 * @param {string} [model] - Model identifier.
 * @returns {string} The detected provider name.
 */
function detectProvider(model) {
  if (!model) return "google";
  const prefixesByProvider = [
    ["openai", ["gpt-", "o1", "o3"]],
    ["google", ["gemini-"]],
    ["anthropic", ["claude-"]]
  ];
  for (const [providerName, prefixes] of prefixesByProvider) {
    if (prefixes.some((prefix) => model.startsWith(prefix))) {
      return providerName;
    }
  }
  return "google";
}
|
|
2853
|
+
/**
 * Translates a provider name through PROVIDER_MAPPING.
 *
 * Only the table's own keys are consulted, so a name that happens to match an
 * inherited Object.prototype member (e.g. "constructor") is returned unchanged
 * instead of resolving to that inherited member.
 *
 * @param {string} [provider] - Provider name as reported by metadata.
 * @returns {string} The mapped name, the input itself when it has no mapping, or
 *   "google" when no provider is given.
 */
function normalizeProvider(provider) {
  if (!provider) return "google";
  const isKnownAlias = Object.prototype.hasOwnProperty.call(PROVIDER_MAPPING, provider);
  return isKnownAlias ? PROVIDER_MAPPING[provider] : provider;
}
|
|
2857
|
+
/**
 * Computes the cost of one component (agent or judge) from its token usage.
 *
 * The provider is taken from `provider` when given (normalised), otherwise it is
 * detected from the model name. Provider-specific pricing from
 * `config.providerPricing` is forwarded to calculateCostFromUsage.
 *
 * @param {object} [tokenUsage] - Token counters; when missing no cost is computed.
 * @param {string} [model] - Model name ("unknown" is passed on when absent).
 * @param {string} [provider] - Provider name.
 * @param {{ providerPricing?: object }} [config] - Pricing configuration.
 * @returns {number | undefined} The `total` of the cost result, or undefined
 *   without token usage.
 */
function calculateComponentCost(tokenUsage, model, provider, config) {
  if (!tokenUsage) return void 0;
  let providerKey;
  if (provider) {
    providerKey = normalizeProvider(provider);
  } else {
    providerKey = detectProvider(model);
  }
  const pricing = config?.providerPricing?.[providerKey];
  const cost = calculateCostFromUsage(
    toLanguageModelUsage(tokenUsage),
    model ?? "unknown",
    providerKey,
    pricing
  );
  return cost.total;
}
|
|
2869
|
+
/**
 * Adds a `total` to a set of per-component costs.
 *
 * @param {{ agent?: number, judge?: number, improver?: number }} costs - Component
 *   costs; missing ones count as 0.
 * @returns {object} The input fields plus `total`, which is undefined unless the
 *   sum is greater than 0.
 */
function buildCostBreakdown(costs) {
  const components = [costs.agent, costs.judge, costs.improver];
  const total = components.reduce((sum, cost) => sum + (cost ?? 0), 0);
  const breakdown = { ...costs };
  breakdown.total = total > 0 ? total : void 0;
  return breakdown;
}
|
|
2876
|
+
/**
 * Computes the agent and judge cost of a single test result.
 *
 * The judge cost is only computed when the judge metadata carries token usage.
 *
 * @param {object} result - Result with `metrics.tokenUsage` and optional
 *   `agentMetadata` / `judgeMetadata`.
 * @param {object} [config] - Pricing configuration forwarded to the cost helper.
 * @returns {object} Breakdown with `agent`, `judge` and `total`.
 */
function calculateResultCost(result, config) {
  const agentMeta = result.agentMetadata;
  const agent = calculateComponentCost(
    result.metrics.tokenUsage,
    agentMeta?.model,
    agentMeta?.provider,
    config
  );
  let judge = void 0;
  if (result.judgeMetadata?.tokenUsage) {
    const judgeMeta = result.judgeMetadata;
    judge = calculateComponentCost(judgeMeta.tokenUsage, judgeMeta.model, judgeMeta.provider, config);
  }
  return buildCostBreakdown({ agent, judge });
}
|
|
2894
|
+
/**
 * Sums agent and judge costs over every result in a report.
 *
 * @param {{ results: Iterable<object> }} report - The evaluation report.
 * @param {object} [config] - Pricing configuration forwarded per result.
 * @returns {{ total: number, byComponent: { agent: number, judge: number } }}
 *   Overall and per-component totals (missing costs count as 0).
 */
function calculateReportCosts(report, config) {
  const byComponent = { agent: 0, judge: 0 };
  for (const result of report.results) {
    const { agent, judge } = calculateResultCost(result, config);
    byComponent.agent += agent ?? 0;
    byComponent.judge += judge ?? 0;
  }
  return {
    total: byComponent.agent + byComponent.judge,
    byComponent
  };
}
|
|
2910
|
+
|
|
2911
|
+
// src/reporter/format-utils.ts
|
|
2912
|
+
import { mkdirSync } from "fs";
|
|
2913
|
+
import path from "path";
|
|
2914
|
+
|
|
2915
|
+
// src/reporter/markdown-reporter.ts
|
|
2916
|
+
import { writeFileSync as writeFileSync2 } from "fs";
|
|
2917
|
+
|
|
2918
|
+
// src/reporter/cycle-json.ts
|
|
2919
|
+
import { writeFileSync as writeFileSync3, mkdirSync as mkdirSync2 } from "fs";
|
|
2920
|
+
import path2 from "path";
|
|
2921
|
+
|
|
2922
|
+
// src/reporter/cycle-markdown.ts
|
|
2923
|
+
import { writeFileSync as writeFileSync4 } from "fs";
|
|
2924
|
+
|
|
2925
|
+
// src/improver/utils.ts
|
|
2926
|
+
import { compileTemplate } from "@agtlantis/core";
|
|
2927
|
+
/**
 * Replaces `search` in `str` with `replacement` taken literally.
 *
 * The replacement is supplied through a function so that "$"-sequences in it are
 * not interpreted as special replacement patterns.
 *
 * @param {string} str - Text to modify.
 * @param {string} search - Text to look for (first occurrence when a string).
 * @param {string} replacement - Text to insert verbatim.
 * @returns {string} The modified text.
 */
function safeReplace(str, search, replacement) {
  const literalReplacement = () => replacement;
  return str.replace(search, literalReplacement);
}
|
|
2930
|
+
/**
 * Increments one part of an "x.y.z" version string.
 *
 * @param {string} version - Current version; must have exactly three numeric parts.
 * @param {"major" | "minor" | "patch"} bump - Which part to increment; lower
 *   parts are reset to 0.
 * @returns {string} The bumped version.
 * @throws {EvalError} With code "SUGGESTION_APPLY_ERROR" when the version is not
 *   in x.y.z form, or when `bump` is not one of the three supported values
 *   (previously an unknown value silently produced `undefined`).
 */
function bumpVersion(version, bump) {
  const parts = version.split(".").map((n) => Number.parseInt(n, 10));
  if (parts.length !== 3 || parts.some((part) => Number.isNaN(part))) {
    throw new EvalError(
      `Invalid version format: "${version}". Expected semver format (x.y.z)`,
      {
        code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
        context: { version, expectedFormat: "x.y.z" }
      }
    );
  }
  const [major, minor, patch] = parts;
  switch (bump) {
    case "major":
      return `${major + 1}.0.0`;
    case "minor":
      return `${major}.${minor + 1}.0`;
    case "patch":
      return `${major}.${minor}.${patch + 1}`;
    default:
      // Fail loudly so an unsupported bump type cannot turn the version into `undefined`.
      throw new EvalError(
        `Invalid bump type: "${bump}". Expected "major", "minor" or "patch"`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: { bump, expected: ["major", "minor", "patch"] }
        }
      );
  }
}
|
|
2951
|
+
/**
 * Applies every approved suggestion to a prompt, one after another.
 *
 * Suggestions that cannot be applied are collected in `skipped` with a reason.
 * When `options.bumpVersion` is set and at least one suggestion was applied, the
 * version is bumped from the ORIGINAL prompt's version.
 *
 * @param {object} currentPrompt - The prompt to improve (not mutated).
 * @param {Array<object>} suggestions - Suggestions; only those with `approved` are used.
 * @param {{ bumpVersion?: "major" | "minor" | "patch" }} [options] - Apply options.
 * @returns {{ prompt: object, appliedCount: number, skipped: Array<{ suggestion: object, reason: string }> }}
 *   The resulting prompt and bookkeeping. Without approved suggestions the
 *   original prompt object is returned as is.
 */
function applyPromptSuggestions(currentPrompt, suggestions, options) {
  const approved = suggestions.filter((s) => s.approved);
  if (approved.length === 0) {
    return { prompt: currentPrompt, appliedCount: 0, skipped: [] };
  }
  let workingPrompt = { ...currentPrompt };
  const skipped = [];
  let appliedCount = 0;
  approved.forEach((suggestion) => {
    const outcome = applySingleSuggestion(workingPrompt, suggestion);
    if (!outcome.success) {
      skipped.push({ suggestion, reason: outcome.reason });
      return;
    }
    workingPrompt = outcome.prompt;
    appliedCount += 1;
  });
  if (options?.bumpVersion && appliedCount > 0) {
    const version = bumpVersion(currentPrompt.version, options.bumpVersion);
    workingPrompt = { ...workingPrompt, version };
  }
  return { prompt: workingPrompt, appliedCount, skipped };
}
|
|
2984
|
+
// Prompt keys that applySingleSuggestion skips when it searches for a field to
// update for a "parameters" suggestion.
var AGENT_PROMPT_CORE_FIELDS = [
|
|
2985
|
+
"id",
|
|
2986
|
+
"version",
|
|
2987
|
+
"system",
|
|
2988
|
+
"renderUserPrompt",
|
|
2989
|
+
"userTemplate"
|
|
2990
|
+
];
|
|
2991
|
+
/**
 * Applies one suggestion to a prompt and reports whether it succeeded.
 *
 * - "system_prompt": replaces `currentValue` inside `prompt.system`.
 * - "user_prompt": replaces it inside `prompt.userTemplate` and recompiles
 *   `renderUserPrompt` from the new template.
 * - "parameters": replaces it in the first non-core string field that contains it.
 *
 * @param {object} prompt - The prompt to change (a modified copy is returned).
 * @param {{ type: string, currentValue: string, suggestedValue: string }} suggestion
 *   The change to apply.
 * @returns {{ success: true, prompt: object } | { success: false, reason: string }}
 *   The updated prompt, or the reason the suggestion was not applicable.
 * @throws {EvalError} With code "SUGGESTION_APPLY_ERROR" for a "user_prompt"
 *   suggestion when the prompt has no string `userTemplate`.
 */
function applySingleSuggestion(prompt, suggestion) {
  const suggestionType = suggestion.type;
  if (suggestionType === "system_prompt") {
    if (!prompt.system.includes(suggestion.currentValue)) {
      return {
        success: false,
        reason: `currentValue not found in system prompt: "${truncate(suggestion.currentValue, 50)}"`
      };
    }
    const system = safeReplace(prompt.system, suggestion.currentValue, suggestion.suggestedValue);
    return { success: true, prompt: { ...prompt, system } };
  }
  if (suggestionType === "user_prompt") {
    const userTemplate = prompt.userTemplate;
    if (typeof userTemplate !== "string") {
      throw new EvalError(
        `Cannot apply user_prompt suggestion: prompt does not have a userTemplate field. The renderUserPrompt is a function and cannot be modified directly.`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: {
            suggestionType: suggestion.type,
            hasUserTemplate: "userTemplate" in prompt
          }
        }
      );
    }
    if (!userTemplate.includes(suggestion.currentValue)) {
      return {
        success: false,
        reason: `currentValue not found in userTemplate: "${truncate(suggestion.currentValue, 50)}"`
      };
    }
    const newTemplate = safeReplace(userTemplate, suggestion.currentValue, suggestion.suggestedValue);
    return {
      success: true,
      prompt: {
        ...prompt,
        userTemplate: newTemplate,
        renderUserPrompt: compileTemplate(newTemplate, prompt.id)
      }
    };
  }
  if (suggestionType === "parameters") {
    // Only the first matching custom string field is updated.
    const match = Object.entries(prompt).find(
      ([key, value]) => !AGENT_PROMPT_CORE_FIELDS.includes(key) && typeof value === "string" && value.includes(suggestion.currentValue)
    );
    if (!match) {
      return {
        success: false,
        reason: `currentValue not found in any parameter field: "${truncate(suggestion.currentValue, 50)}"`
      };
    }
    const [fieldName, fieldValue] = match;
    return {
      success: true,
      prompt: {
        ...prompt,
        [fieldName]: safeReplace(fieldValue, suggestion.currentValue, suggestion.suggestedValue)
      }
    };
  }
  return {
    success: false,
    reason: `Unknown suggestion type: ${suggestion.type}`
  };
}
|
|
3085
|
+
|
|
3086
|
+
// src/improver/llm-improver.ts
|
|
3087
|
+
import { Output as Output2 } from "ai";
|
|
3088
|
+
import { z as z4 } from "zod";
|
|
3089
|
+
|
|
3090
|
+
// src/improver/prompts/default.ts
|
|
3091
|
+
/**
 * Built-in improver prompt: a fixed system message with improvement principles,
 * priority levels and the JSON response format, plus a renderer that shows the
 * current system prompt, pass/fail counts, aggregate metrics and the details of
 * failed or low-scoring cases.
 */
var defaultImproverPrompt = {
  id: "default-improver",
  version: "2.0.0",
  system: `You are an expert prompt engineer specializing in optimizing AI Agent prompts.

Your role is to analyze test results and evaluation feedback to propose targeted improvements.

## Improvement Principles

1. **Focus on Impact**: Prioritize changes that address the lowest-scoring criteria
- Target specific failure patterns, not general improvements
- One well-crafted change is better than many superficial ones

2. **Be Specific and Actionable**: Provide concrete changes, not vague suggestions
- Show exact text to add, modify, or remove
- Explain the mechanism by which the change will help

3. **Consider Trade-offs**: Evaluate side effects of each change
- Will this fix break other test cases?
- Does it increase prompt length/cost significantly?
- Could it introduce new failure modes?

4. **Maintain Prompt Quality**: Preserve clarity and structure
- Keep prompts readable and maintainable
- Avoid over-engineering or excessive constraints
- Ensure changes align with the agent's core purpose

## Suggestion Priority Levels
- **high**: Critical issues causing test failures, should be addressed immediately
- **medium**: Issues affecting quality scores, recommended for next iteration
- **low**: Minor optimizations, nice-to-have improvements

## Response Format

You MUST respond with valid JSON only. No additional text outside the JSON structure.

{
"suggestions": [
{
"type": "system_prompt" | "user_prompt" | "parameters",
"priority": "high" | "medium" | "low",
"currentValue": "The specific text or value being changed",
"suggestedValue": "The proposed replacement text or value",
"reasoning": "Why this change addresses the identified issue",
"expectedImprovement": "Predicted impact on scores and behavior"
}
]
}`,
  renderUserPrompt: (ctx) => {
    const lowScoreDetails = buildFailedCaseDetails(ctx.evaluatedResults);
    // Counts the evaluated results that satisfy a predicate.
    const countWhere = (predicate) => ctx.evaluatedResults.filter(predicate).length;
    return `
## Current Agent Prompt

### System Prompt
\`\`\`
${ctx.agentPrompt.system}
\`\`\`

## Test Results Summary
- Total tests: ${ctx.evaluatedResults.length}
- Passed: ${countWhere((r) => r.passed)}
- Failed: ${countWhere((r) => !r.passed)}

## Performance Metrics
- Average latency: ${ctx.aggregatedMetrics.avgLatencyMs}ms
- Total tokens used: ${ctx.aggregatedMetrics.totalTokens}

## Failed/Low-Score Cases Details
${lowScoreDetails}

Based on the above results, please propose specific prompt improvements.`.trim();
  }
};
|
|
3164
|
+
/**
 * Renders a Markdown section for every result that failed or scored below 70.
 *
 * @param {Array<object>} results - Evaluated results; each entry is read for
 *   `passed`, `overallScore`, `testCase`, `output` and `verdicts`.
 * @returns {string} The per-case sections joined by newlines, or a fixed
 *   placeholder sentence when no result qualifies.
 */
function buildFailedCaseDetails(results) {
  const needsAttention = (r) => !r.passed || r.overallScore < 70;
  const describeVerdict = (v) => `- ${v.criterionId}: ${v.score}/100 - ${v.reasoning}`;
  // Each section starts with an empty line so the heading is preceded by "\n".
  const describeCase = (r) => [
    "",
    `### ${r.testCase.id ?? "unnamed"} (Score: ${r.overallScore})`,
    `**Input:** ${truncate(JSON.stringify(r.testCase.input), 200)}`,
    `**Output:** ${truncate(JSON.stringify(r.output), 200)}`,
    "**Evaluation:**",
    r.verdicts.map((v) => describeVerdict(v)).join("\n")
  ].join("\n");

  const flagged = results.filter((r) => needsAttention(r));
  if (flagged.length === 0) {
    return "(None - all tests passed with acceptable scores)";
  }
  return flagged.map((r) => describeCase(r)).join("\n");
}
|
|
3178
|
+
|
|
3179
|
+
// src/improver/llm-improver.ts
|
|
3180
|
+
/**
 * Copies the three token counters from `usage`, substituting 0 for any that
 * is null or undefined.
 *
 * @param {object} usage - Object read for `inputTokens`, `outputTokens`, `totalTokens`.
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 */
function toEvalTokenUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return {
    inputTokens: inputTokens ?? 0,
    outputTokens: outputTokens ?? 0,
    totalTokens: totalTokens ?? 0
  };
}
|
|
3187
|
+
/**
 * Schema for the improver LLM's structured response: an object holding a
 * `suggestions` array whose entries carry a type, a priority and four
 * free-text fields.
 */
var ImproverResponseSchema = (() => {
  const suggestionSchema = z4.object({
    type: z4.enum(["system_prompt", "user_prompt", "parameters"]),
    priority: z4.enum(["high", "medium", "low"]),
    currentValue: z4.string(),
    suggestedValue: z4.string(),
    reasoning: z4.string(),
    expectedImprovement: z4.string()
  });
  return z4.object({
    suggestions: z4.array(suggestionSchema)
  });
})();
|
|
3199
|
+
/**
 * Summarises latency and token usage across evaluated results.
 *
 * @param {Array<object>} results - Entries read for `metrics.latencyMs` and
 *   `metrics.tokenUsage.totalTokens`.
 * @returns {{avgLatencyMs: number, totalTokens: number}} Rounded mean latency
 *   and the token sum; both are 0 for an empty list.
 */
function aggregateMetrics(results) {
  const count = results.length;
  if (count === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  const sums = results.reduce(
    (acc, { metrics }) => {
      acc.latency += metrics.latencyMs;
      acc.tokens += metrics.tokenUsage.totalTokens;
      return acc;
    },
    { latency: 0, tokens: 0 }
  );
  return {
    avgLatencyMs: Math.round(sums.latency / count),
    totalTokens: sums.tokens
  };
}
|
|
3217
|
+
/**
 * Builds an improver whose `improve` method asks an LLM for prompt suggestions.
 *
 * @param {object} config - Read for `provider`, `prompt` (defaults to
 *   `defaultImproverPrompt`) and `model`.
 * @returns {{improve: Function}} Object exposing
 *   `improve(agentPrompt, results)`, which resolves to
 *   `{ suggestions, metadata }`. `metadata` is undefined when the execution
 *   summary reports no LLM usage.
 */
function createImprover(config) {
  const { provider, prompt = defaultImproverPrompt, model } = config;

  // Runs one structured generation and unwraps the execution outcome.
  const requestSuggestions = async (messages) => {
    const execution = provider.simpleExecution(async (session) => {
      const generated = await session.generateText({
        messages,
        output: Output2.object({ schema: ImproverResponseSchema })
      });
      return generated.output;
    });
    const outcome = await execution.result();
    if (outcome.status !== "succeeded") {
      throw outcome.status === "failed" ? outcome.error : new Error("Execution was canceled");
    }
    return { response: outcome.value, llmUsage: outcome.summary.totalLLMUsage };
  };

  return {
    async improve(agentPrompt, results) {
      const context = {
        agentPrompt,
        evaluatedResults: results,
        aggregatedMetrics: aggregateMetrics(results)
      };
      const messages = [
        { role: "system", content: prompt.system },
        { role: "user", content: prompt.renderUserPrompt(context) }
      ];

      let response;
      let llmUsage;
      try {
        ({ response, llmUsage } = await requestSuggestions(messages));
      } catch (cause) {
        // Any failure of the LLM call is wrapped with the prompt identity.
        throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
          promptId: prompt.id,
          promptVersion: prompt.version
        });
      }

      // Review fields start out unset on every suggestion.
      const suggestions = response.suggestions.map((s) => ({
        ...s,
        approved: void 0,
        modified: void 0
      }));
      const metadata = llmUsage ? { tokenUsage: toEvalTokenUsage2(llmUsage), model } : void 0;
      return { suggestions, metadata };
    }
  };
}
|
|
3262
|
+
|
|
3263
|
+
// src/index.ts
|
|
3264
|
+
import { mock as mock2, MockProvider } from "@agtlantis/core/testing";
|
|
3265
|
+
import {
|
|
3266
|
+
compileTemplate as compileTemplate3,
|
|
3267
|
+
createFilePromptRepository
|
|
3268
|
+
} from "@agtlantis/core";
|
|
3269
|
+
import {
|
|
3270
|
+
calculateCostFromUsage as calculateCostFromUsage3,
|
|
3271
|
+
OPENAI_PRICING,
|
|
3272
|
+
GOOGLE_PRICING,
|
|
3273
|
+
ANTHROPIC_PRICING,
|
|
3274
|
+
DEFAULT_PRICING_CONFIG
|
|
3275
|
+
} from "@agtlantis/core";
|
|
3276
|
+
|
|
3277
|
+
// src/improvement-cycle/types.ts
|
|
3278
|
+
/**
 * @param {object} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is `"targetScore"`.
 */
function isTargetScoreCondition(condition) {
  const { type } = condition;
  return type === "targetScore";
}
|
|
3281
|
+
/**
 * @param {object} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is `"maxRounds"`.
 */
function isMaxRoundsCondition(condition) {
  const { type } = condition;
  return type === "maxRounds";
}
|
|
3284
|
+
/**
 * @param {object} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is `"noImprovement"`.
 */
function isNoImprovementCondition(condition) {
  const { type } = condition;
  return type === "noImprovement";
}
|
|
3287
|
+
/**
 * @param {object} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is `"maxCost"`.
 */
function isMaxCostCondition(condition) {
  const { type } = condition;
  return type === "maxCost";
}
|
|
3290
|
+
/**
 * @param {object} condition - Cycle termination condition.
 * @returns {boolean} True when `condition.type` is `"custom"`.
 */
function isCustomCycleCondition(condition) {
  const { type } = condition;
  return type === "custom";
}
|
|
3293
|
+
|
|
3294
|
+
// src/improvement-cycle/conditions.ts
|
|
3295
|
+
/**
 * Creates a `targetScore` termination condition.
 *
 * @param {number} threshold - Finite score in the inclusive range 0..100.
 * @returns {{type: "targetScore", threshold: number}}
 * @throws {EvalError} With code `INVALID_CONFIG` when `threshold` is not
 *   finite or lies outside 0..100.
 */
function targetScore(threshold) {
  const invalid = (message) => new EvalError(message, {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { threshold }
  });
  if (!Number.isFinite(threshold)) {
    throw invalid("threshold must be a finite number");
  }
  if (threshold < 0 || threshold > 100) {
    throw invalid("threshold must be between 0 and 100");
  }
  return { type: "targetScore", threshold };
}
|
|
3310
|
+
/**
 * Creates a `maxRounds` termination condition.
 *
 * @param {number} count - Positive integer.
 * @returns {{type: "maxRounds", count: number}}
 * @throws {EvalError} With code `INVALID_CONFIG` when `count` is not an
 *   integer or is below 1.
 */
function maxRounds(count) {
  const isPositiveInteger = Number.isInteger(count) && count >= 1;
  if (!isPositiveInteger) {
    throw new EvalError("count must be a positive integer", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { count }
    });
  }
  return { type: "maxRounds", count };
}
|
|
3319
|
+
/**
 * Creates a `noImprovement` termination condition.
 *
 * @param {number} consecutiveRounds - Positive integer.
 * @param {number} [minDelta] - Optional non-negative finite number; the key
 *   is omitted from the result when this argument is undefined.
 * @returns {{type: "noImprovement", consecutiveRounds: number, minDelta?: number}}
 * @throws {EvalError} With code `INVALID_CONFIG` when either argument is invalid.
 */
function noImprovement(consecutiveRounds, minDelta) {
  const roundsValid = Number.isInteger(consecutiveRounds) && consecutiveRounds >= 1;
  if (!roundsValid) {
    throw new EvalError("consecutiveRounds must be a positive integer", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { consecutiveRounds }
    });
  }
  const hasMinDelta = minDelta !== void 0;
  if (hasMinDelta && !(Number.isFinite(minDelta) && minDelta >= 0)) {
    throw new EvalError("minDelta must be a non-negative finite number", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { minDelta }
    });
  }
  const condition = { type: "noImprovement", consecutiveRounds };
  if (hasMinDelta) {
    condition.minDelta = minDelta;
  }
  return condition;
}
|
|
3338
|
+
/**
 * Creates a `maxCost` termination condition.
 *
 * @param {number} maxUSD - Positive finite number.
 * @returns {{type: "maxCost", maxUSD: number}}
 * @throws {EvalError} With code `INVALID_CONFIG` when `maxUSD` is not finite
 *   or is not greater than 0.
 */
function maxCost(maxUSD) {
  const isPositiveFinite = Number.isFinite(maxUSD) && maxUSD > 0;
  if (!isPositiveFinite) {
    throw new EvalError("maxUSD must be a positive finite number", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { maxUSD }
    });
  }
  return { type: "maxCost", maxUSD };
}
|
|
3347
|
+
/**
 * Evaluates a `targetScore` condition against the latest score.
 *
 * @param {object} condition - Read for `threshold`.
 * @param {object} ctx - Read for `latestScore`.
 * @returns {object} `{ terminated, reason }`, plus `matchedCondition` when
 *   the latest score is at least the threshold.
 */
function checkTargetScore(condition, ctx) {
  const { threshold } = condition;
  const { latestScore } = ctx;
  const reached = latestScore >= threshold;
  if (!reached) {
    return {
      terminated: false,
      reason: `Score ${latestScore} below target ${threshold}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Target score ${threshold} reached (current: ${latestScore})`
  };
}
|
|
3360
|
+
/**
 * Evaluates a `maxRounds` condition against the current round number.
 *
 * @param {object} condition - Read for `count`.
 * @param {object} ctx - Read for `currentRound`.
 * @returns {object} `{ terminated, reason }`, plus `matchedCondition` when
 *   the current round is at least `count`.
 */
function checkMaxRounds(condition, ctx) {
  const { count } = condition;
  const { currentRound } = ctx;
  const limitHit = currentRound >= count;
  if (!limitHit) {
    return {
      terminated: false,
      reason: `Round ${currentRound} of ${count}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Maximum rounds reached (${count})`
  };
}
|
|
3373
|
+
/**
 * Evaluates a `noImprovement` condition by counting, from the newest history
 * entry backwards, the rounds whose `scoreDelta` is at most `minDelta`.
 * Counting stops at the first entry with a null delta or a larger delta.
 *
 * @param {object} condition - Read for `consecutiveRounds` and `minDelta`
 *   (defaults to 0).
 * @param {object} ctx - Read for `history`, a list of rounds with `scoreDelta`.
 * @returns {object} `{ terminated, reason }`, plus `matchedCondition` when
 *   the count reaches `consecutiveRounds`.
 */
function checkNoImprovement(condition, ctx) {
  const { consecutiveRounds, minDelta = 0 } = condition;
  const { history } = ctx;

  let stalled = 0;
  let cursor = history.length - 1;
  while (cursor >= 0) {
    const { scoreDelta } = history[cursor];
    if (scoreDelta === null || !(scoreDelta <= minDelta)) {
      break;
    }
    stalled++;
    cursor--;
  }

  if (stalled >= consecutiveRounds) {
    const plural = stalled === 1 ? "" : "s";
    return {
      terminated: true,
      matchedCondition: condition,
      reason: `No improvement for ${stalled} consecutive round${plural}`
    };
  }
  const roundWord = stalled === 1 ? "round" : "rounds";
  return {
    terminated: false,
    reason: `${stalled} ${roundWord} without improvement (need ${consecutiveRounds})`
  };
}
|
|
3399
|
+
/**
 * Evaluates a `maxCost` condition against the accumulated cost.
 *
 * @param {object} condition - Read for `maxUSD`.
 * @param {object} ctx - Read for `totalCost`.
 * @returns {object} `{ terminated, reason }`, plus `matchedCondition` when
 *   the total cost is at least `maxUSD`. Amounts in `reason` use two decimals.
 */
function checkMaxCost(condition, ctx) {
  const { maxUSD } = condition;
  const { totalCost } = ctx;
  const overBudget = totalCost >= maxUSD;
  if (!overBudget) {
    return {
      terminated: false,
      reason: `Cost $${totalCost.toFixed(2)} under limit $${maxUSD.toFixed(2)}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Cost limit exceeded ($${totalCost.toFixed(2)} >= $${maxUSD.toFixed(2)})`
  };
}
|
|
3412
|
+
/**
 * Evaluates a user-supplied condition by awaiting `condition.check(ctx)`.
 * A throwing or rejecting check is reported as "not terminated" with the
 * error message in `reason` rather than being propagated.
 *
 * @param {object} condition - Read for `check` and the optional `description`
 *   (defaults to "Custom condition").
 * @param {object} ctx - Passed through to `condition.check`.
 * @returns {Promise<object>} `{ terminated, reason }`, plus `matchedCondition`
 *   when the check result is truthy.
 */
async function checkCustomCondition(condition, ctx) {
  const label = condition.description ?? "Custom condition";
  let met;
  try {
    met = await condition.check(ctx);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return {
      terminated: false,
      reason: `${label} check failed: ${detail}`
    };
  }
  if (!met) {
    return {
      terminated: false,
      reason: `${label} not met`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `${label} met`
  };
}
|
|
3435
|
+
/**
 * Routes a termination condition to the checker matching its type.
 *
 * @param {object} condition - One of the cycle termination conditions.
 * @param {object} context - Cycle context handed to the checker.
 * @returns {Promise<object>} The checker's result.
 * @throws {EvalError} With code `UNKNOWN_ERROR` when no type guard matches.
 */
async function checkCycleCondition(condition, context) {
  // Guards are tried in this order; the first match decides the checker.
  const dispatch = [
    [isTargetScoreCondition, checkTargetScore],
    [isMaxRoundsCondition, checkMaxRounds],
    [isNoImprovementCondition, checkNoImprovement],
    [isMaxCostCondition, checkMaxCost],
    [isCustomCycleCondition, checkCustomCondition]
  ];
  for (const [matches, evaluate] of dispatch) {
    if (matches(condition)) {
      return evaluate(condition, context);
    }
  }
  throw new EvalError(`Unknown condition type: ${JSON.stringify(condition)}`, {
    code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
    context: { condition }
  });
}
|
|
3457
|
+
/**
 * Checks conditions one after another and returns the first result that
 * reports termination.
 *
 * @param {Array<object>} conditions - Termination conditions, checked in order.
 * @param {object} context - Cycle context passed to each check.
 * @returns {Promise<object>} The first terminating result, or a
 *   non-terminated result whose reason says whether the list was empty or
 *   no condition was met.
 */
async function checkCycleTermination(conditions, context) {
  const total = conditions.length;
  if (total === 0) {
    return { terminated: false, reason: "No termination conditions specified" };
  }
  for (let index = 0; index < total; index += 1) {
    const outcome = await checkCycleCondition(conditions[index], context);
    if (outcome.terminated) {
      return outcome;
    }
  }
  return { terminated: false, reason: "No termination conditions met" };
}
|
|
3475
|
+
|
|
3476
|
+
// src/improvement-cycle/runner.ts
|
|
3477
|
+
import { calculateCostFromUsage as calculateCostFromUsage2 } from "@agtlantis/core";
|
|
3478
|
+
|
|
3479
|
+
// src/improvement-cycle/history.ts
|
|
3480
|
+
import crypto from "crypto";
|
|
3481
|
+
import { existsSync as existsSync5 } from "fs";
|
|
3482
|
+
import { mkdir as mkdir2, readFile as readFile3, writeFile as writeFile3 } from "fs/promises";
|
|
3483
|
+
import { dirname } from "path";
|
|
3484
|
+
import { compileTemplate as compileTemplate2 } from "@agtlantis/core";
|
|
3485
|
+
/**
 * Default storage backend for history files, backed by Node's `fs` APIs.
 * Reads and writes use UTF-8; `exists` is the synchronous `existsSync`.
 */
var defaultHistoryStorage = {
  readFile(path3) {
    return readFile3(path3, "utf-8");
  },
  writeFile(path3, content) {
    return writeFile3(path3, content, "utf-8");
  },
  exists: existsSync5,
  mkdir(path3, options) {
    return mkdir2(path3, options);
  }
};
|
|
3491
|
+
/**
 * @param {object} prompt - Prompt object to inspect.
 * @returns {boolean} True when `prompt.userTemplate` is a string.
 */
function hasUserTemplate(prompt) {
  const { userTemplate } = prompt;
  return typeof userTemplate === "string";
}
|
|
3494
|
+
/**
 * Converts a prompt into a plain, function-free snapshot.
 *
 * @param {object} prompt - Prompt with `id`, `version`, `system`,
 *   `userTemplate` and optionally further fields.
 * @returns {object} `{ id, version, system, userTemplate }`, plus
 *   `customFields` holding any remaining properties when there are some.
 *   `renderUserPrompt` is never included.
 * @throws {EvalError} With code `PROMPT_INVALID_FORMAT` when the prompt has
 *   no string `userTemplate`.
 */
function serializePrompt(prompt) {
  if (!hasUserTemplate(prompt)) {
    throw new EvalError("Cannot serialize prompt: userTemplate field is required", {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId: prompt.id }
    });
  }
  // renderUserPrompt is pulled out only so that it does not end up in `extras`.
  const { id, version, system, userTemplate, renderUserPrompt, ...extras } = prompt;
  const snapshot = { id, version, system, userTemplate };
  if (Object.keys(extras).length > 0) {
    snapshot.customFields = extras;
  }
  return snapshot;
}
|
|
3512
|
+
/**
 * Verifies the shape of a freshly deserialized prompt.
 *
 * @param {object} obj - Candidate prompt.
 * @param {string} promptId - Identifier included in the error context.
 * @returns {void}
 * @throws {EvalError} With code `PROMPT_INVALID_FORMAT` when `id`, `version`,
 *   `system` or `userTemplate` is not a string, or when `renderUserPrompt`
 *   is not a function.
 */
function validateDeserializedPrompt(obj, promptId) {
  const invalid = (message, context) => new EvalError(message, {
    code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
    context
  });
  for (const field of ["id", "version", "system", "userTemplate"]) {
    const actual = typeof obj[field];
    if (actual !== "string") {
      throw invalid(`Invalid deserialized prompt: ${field} must be a string`, { promptId, field, actual });
    }
  }
  const renderType = typeof obj.renderUserPrompt;
  if (renderType !== "function") {
    throw invalid("Invalid deserialized prompt: renderUserPrompt must be a function", {
      promptId,
      actual: renderType
    });
  }
}
|
|
3529
|
+
/**
 * Rebuilds a usable prompt from its serialized snapshot by compiling
 * `userTemplate` into a `renderUserPrompt` function.
 *
 * @param {object} serialized - Snapshot with `id`, `version`, `system`,
 *   `userTemplate` and optional `customFields`.
 * @returns {object} Prompt containing the custom fields, the four core
 *   fields (which win over same-named custom fields) and `renderUserPrompt`.
 * @throws {EvalError} With code `TEMPLATE_COMPILE_ERROR` when compilation
 *   fails; validation errors from `validateDeserializedPrompt` also propagate.
 */
function deserializePrompt(serialized) {
  const { id, version, system, userTemplate, customFields } = serialized;

  const compileRenderer = () => {
    try {
      return compileTemplate2(userTemplate, id);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      throw new EvalError(`Failed to compile userTemplate: ${detail}`, {
        code: "TEMPLATE_COMPILE_ERROR" /* TEMPLATE_COMPILE_ERROR */,
        context: { promptId: id, userTemplate }
      });
    }
  };

  const renderUserPrompt = compileRenderer();
  // Custom fields are spread first so they cannot override the core fields.
  const prompt = { ...customFields, id, version, system, userTemplate, renderUserPrompt };
  validateDeserializedPrompt(prompt, id);
  return prompt;
}
|
|
3552
|
+
/**
 * Flattens a round result into the JSON-friendly record stored in history.
 * Only the summary numbers of the report are kept, and `completedAt` is
 * converted to an ISO string.
 *
 * @param {object} result - Round result with `report.summary`, a `Date` in
 *   `completedAt`, and the suggestion, prompt, cost and score fields.
 * @returns {object} Serialized round record.
 */
function serializeRoundResult(result) {
  const { avgScore, passed, failed, totalTests } = result.report.summary;
  const {
    round,
    completedAt,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  } = result;
  return {
    round,
    completedAt: completedAt.toISOString(),
    avgScore,
    passed,
    failed,
    totalTests,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  };
}
|
|
3569
|
+
/**
 * Performs a structural check on parsed history data: it must be a non-null
 * object, declare schema version "1.1.0" and contain every required key.
 * Only key presence is checked, not the values.
 *
 * @param {unknown} data - Parsed history candidate.
 * @returns {void}
 * @throws {EvalError} With code `SCHEMA_VALIDATION_ERROR` on the first
 *   violation found.
 */
function validateHistorySchema(data) {
  if (data === null || typeof data !== "object") {
    throw new EvalError("Invalid history: not an object", {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */
    });
  }
  const { schemaVersion } = data;
  if (schemaVersion !== "1.1.0") {
    throw new EvalError(`Unsupported schema version: ${String(schemaVersion)}`, {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { schemaVersion }
    });
  }
  const required = ["sessionId", "startedAt", "initialPrompt", "currentPrompt", "rounds", "totalCost"];
  const missingField = required.find((field) => !(field in data));
  if (missingField !== void 0) {
    throw new EvalError(`Invalid history: missing field "${missingField}"`, {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { missingField }
    });
  }
}
|
|
3599
|
+
/**
 * In-memory improvement session that accumulates round records and can
 * persist its history through `saveHistory`.
 *
 * Saves are serialized through `_savePromise`, so writes never overlap.
 */
var ImprovementSessionImpl = class {
  _history;
  _isUpdating = false;
  _savePromise = Promise.resolve();
  config;
  /**
   * @param {object} history - Initial history record.
   * @param {object} [config] - Optional `path`, `autoSave`, `storage` and
   *   `onAutoSaveError` settings.
   */
  constructor(history, config = {}) {
    this._history = history;
    this.config = {
      ...config,
      // Applied after the spread so that an explicit `autoSave: undefined`
      // still resolves to the documented default of false.
      autoSave: config.autoSave ?? false
    };
  }
  /** Identifier stored in the history record. */
  get sessionId() {
    return this._history.sessionId;
  }
  /** Current history record; replaced (not mutated) on every update. */
  get history() {
    return this._history;
  }
  /** True when a target path is configured. */
  get canSave() {
    return this.config.path !== void 0;
  }
  /**
   * Appends a round, updates the current prompt and adds the round's total
   * cost to the running total. Triggers a background save when auto-save is
   * enabled and a path is configured.
   *
   * @param {object} roundResult - Round result; serialized before storage.
   * @param {object} updatedPrompt - Serialized prompt to store as current.
   * @throws {EvalError} `CONCURRENT_MODIFICATION` while another update is in
   *   progress, `INVALID_CONFIG` when the session is already completed.
   */
  addRound(roundResult, updatedPrompt) {
    if (this._isUpdating) {
      throw new EvalError("Session is being updated", {
        code: "CONCURRENT_MODIFICATION" /* CONCURRENT_MODIFICATION */,
        context: { sessionId: this.sessionId }
      });
    }
    if (this._history.completedAt) {
      throw new EvalError("Cannot add round to completed session", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    this._isUpdating = true;
    try {
      const serializedRound = serializeRoundResult(roundResult);
      this._history = {
        ...this._history,
        currentPrompt: updatedPrompt,
        rounds: [...this._history.rounds, serializedRound],
        totalCost: this._history.totalCost + roundResult.cost.total
      };
      if (this.config.autoSave && this.canSave) {
        // Fire-and-forget; failures are routed to handleAutoSaveError.
        this.save().catch((err) => this.handleAutoSaveError(err));
      }
    } finally {
      this._isUpdating = false;
    }
  }
  /**
   * Marks the session as completed with the current timestamp and the given
   * reason, then auto-saves when enabled.
   *
   * @param {string} terminationReason - Reason stored in the history.
   */
  complete(terminationReason) {
    this._history = {
      ...this._history,
      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
      terminationReason
    };
    if (this.config.autoSave && this.canSave) {
      this.save().catch((err) => this.handleAutoSaveError(err));
    }
  }
  /**
   * Reports an auto-save failure to `config.onAutoSaveError` when provided,
   * otherwise logs it with `console.error`.
   *
   * @param {unknown} error - Failure; non-Error values are wrapped.
   */
  handleAutoSaveError(error) {
    const err = error instanceof Error ? error : new Error(String(error));
    if (this.config.onAutoSaveError) {
      this.config.onAutoSaveError(err);
    } else {
      console.error("Auto-save failed:", err);
    }
  }
  /**
   * Queues a write of the history as it stands when the write starts.
   *
   * @returns {Promise<void>} Settles with the outcome of this write.
   * @throws {EvalError} `INVALID_CONFIG` when no path is configured.
   */
  async save() {
    if (!this.config.path) {
      throw new EvalError("Cannot save: no path configured", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    // Wait for the previous save to settle, but do not inherit its failure:
    // that error was already delivered to whoever requested that save, and
    // chaining with a bare `.then` would skip this and every later write.
    const pending = this._savePromise.catch(() => {}).then(async () => {
      await saveHistory(this._history, this.config.path, this.config.storage);
    });
    this._savePromise = pending;
    return pending;
  }
  /**
   * @returns {Promise<void>} The promise of the most recently queued save
   *   (already resolved when nothing was ever saved).
   */
  async flush() {
    return this._savePromise;
  }
};
|
|
3683
|
+
/**
 * Starts a fresh improvement session for the given prompt.
 *
 * @param {object} initialPrompt - Prompt to serialize as both the initial
 *   and the current prompt.
 * @param {object} [config] - Session configuration passed to the session.
 * @returns {ImprovementSessionImpl} Session with schema version "1.1.0", a
 *   random UUID, the current timestamp, no rounds and zero cost.
 */
function createSession(initialPrompt, config) {
  const snapshot = serializePrompt(initialPrompt);
  return new ImprovementSessionImpl(
    {
      schemaVersion: "1.1.0",
      sessionId: crypto.randomUUID(),
      startedAt: (/* @__PURE__ */ new Date()).toISOString(),
      initialPrompt: snapshot,
      currentPrompt: snapshot,
      rounds: [],
      totalCost: 0
    },
    config
  );
}
|
|
3696
|
+
/**
 * Loads a history file and reopens it as an active session by clearing
 * `completedAt` and `terminationReason`.
 *
 * @param {string} path3 - History file path; also used as the save path.
 * @param {object} [config] - Session configuration; `storage` is used for
 *   loading and `path` is always overridden by `path3`.
 * @returns {Promise<ImprovementSessionImpl>} The resumed session.
 */
async function resumeSession(path3, config) {
  const loaded = await loadHistory(path3, config?.storage);
  const sessionConfig = { ...config, path: path3 };
  const reopened = { ...loaded, completedAt: void 0, terminationReason: void 0 };
  return new ImprovementSessionImpl(reopened, sessionConfig);
}
|
|
3705
|
+
/**
 * Writes the history as pretty-printed JSON, creating the parent directory
 * first when it is a real directory path that does not exist yet.
 *
 * @param {object} history - History record to serialize.
 * @param {string} path3 - Destination file path.
 * @param {object} [storage] - Storage backend with `exists`, `mkdir` and
 *   `writeFile`; defaults to `defaultHistoryStorage`.
 * @returns {Promise<void>}
 * @throws {EvalError} Existing EvalErrors are rethrown unchanged; any other
 *   failure is wrapped with code `FILE_WRITE_ERROR`.
 */
async function saveHistory(history, path3, storage = defaultHistoryStorage) {
  try {
    const parent = dirname(path3);
    // "", "." and "/" never need to be created.
    const isCreatable = Boolean(parent) && parent !== "." && parent !== "/";
    if (isCreatable && !storage.exists(parent)) {
      await storage.mkdir(parent, { recursive: true });
    }
    const payload = JSON.stringify(history, null, 2);
    await storage.writeFile(path3, payload);
  } catch (error) {
    throw error instanceof EvalError
      ? error
      : EvalError.from(error, "FILE_WRITE_ERROR" /* FILE_WRITE_ERROR */, { path: path3 });
  }
}
|
|
3717
|
+
/**
 * Reads, parses and validates a history file.
 *
 * @param {string} path3 - History file path.
 * @param {object} [storage] - Storage backend with `exists` and `readFile`;
 *   defaults to `defaultHistoryStorage`.
 * @returns {Promise<object>} The parsed history.
 * @throws {EvalError} `FILE_READ_ERROR` when the file is missing; EvalErrors
 *   raised during validation are rethrown unchanged; any other failure
 *   (including invalid JSON) is wrapped with code `FILE_READ_ERROR`.
 */
async function loadHistory(path3, storage = defaultHistoryStorage) {
  try {
    const found = storage.exists(path3);
    if (!found) {
      throw new EvalError(`History file not found: ${path3}`, {
        code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
        context: { path: path3 }
      });
    }
    const raw = await storage.readFile(path3);
    const parsed = JSON.parse(raw);
    validateHistorySchema(parsed);
    return parsed;
  } catch (error) {
    throw error instanceof EvalError
      ? error
      : EvalError.from(error, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { path: path3 });
  }
}
|
|
3734
|
+
|
|
3735
|
+
// src/improvement-cycle/runner.ts
|
|
3736
|
+
/**
 * Builds the mutable state object used by the improvement cycle.
 *
 * @param {object} initialPrompt - Prompt the cycle starts from.
 * @param {object} [existingSession] - Session being resumed. When given, the
 *   round counter, previous scores and total cost are seeded from its
 *   history; otherwise they start at 0 / empty.
 * @returns {object} State with `currentPrompt`, `currentRound`,
 *   `previousScores`, `totalCost` and an empty `completedRounds` list.
 */
function initializeCycleState(initialPrompt, existingSession) {
  const fresh = {
    currentPrompt: initialPrompt,
    currentRound: 0,
    previousScores: [],
    totalCost: 0,
    completedRounds: []
  };
  if (!existingSession) {
    return fresh;
  }
  const { rounds, totalCost } = existingSession.history;
  return {
    ...fresh,
    currentRound: rounds.length,
    previousScores: rounds.map((r) => r.avgScore),
    totalCost
  };
}
|
|
3746
|
+
/**
 * Computes the change relative to the most recent previous score.
 *
 * @param {number} currentScore - Score of the current round.
 * @param {number[]} previousScores - Earlier scores, oldest first.
 * @returns {number|null} `currentScore` minus the last previous score, or
 *   null when there is no previous score.
 */
function calculateScoreDelta(currentScore, previousScores) {
  const priorCount = previousScores.length;
  return priorCount === 0 ? null : currentScore - previousScores[priorCount - 1];
}
|
|
3753
|
+
/**
 * Creates the context object handed to termination checks.
 *
 * @param {object} state - Cycle state.
 * @param {number} currentScore - Score of the round just evaluated.
 * @returns {object} Context with `currentRound`, `latestScore`, a copy of
 *   `previousScores`, `totalCost` and `history` (the state's
 *   `completedRounds` array itself, not a copy).
 */
function buildCycleContext(state, currentScore) {
  const { currentRound, previousScores, totalCost, completedRounds } = state;
  return {
    currentRound,
    latestScore: currentScore,
    previousScores: Array.from(previousScores),
    totalCost,
    history: completedRounds
  };
}
|
|
3762
|
+
/**
 * Assembles the result record for the round that was just executed.
 *
 * @param {object} state - Cycle state; supplies the round number and the
 *   current prompt version.
 * @param {object} report - Evaluation report of the round.
 * @param {object} improveResult - Improver output; its `suggestions` become
 *   `suggestionsGenerated`.
 * @param {object} cost - Cost breakdown of the round.
 * @param {number|null} scoreDelta - Change versus the previous round.
 * @param {object} promptSnapshot - Serialized prompt used in this round.
 * @returns {object} Round result stamped with the current time.
 */
function createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot) {
  const { currentRound, currentPrompt } = state;
  const { suggestions } = improveResult;
  return {
    round: currentRound,
    report,
    completedAt: /* @__PURE__ */ new Date(),
    suggestionsGenerated: suggestions,
    // Will be updated after decision
    suggestionsApproved: [],
    promptSnapshot,
    promptVersionAfter: currentPrompt.version,
    cost,
    scoreDelta
  };
}
|
|
3776
|
+
/**
 * Finalizes the cycle: records the last round, completes the session, waits
 * for pending saves and builds the cycle result.
 *
 * @param {object} state - Cycle state; the round is appended to
 *   `completedRounds`.
 * @param {object} session - Session providing `addRound`, `complete`,
 *   `flush` and `history`.
 * @param {object} roundResult - Result of the final round.
 * @param {object} promptSnapshot - Serialized prompt stored with the round.
 * @param {boolean} terminatedByCondition - Whether a termination condition fired.
 * @param {string} conditionReason - Reason used when a condition fired;
 *   otherwise the reason is "User requested stop".
 * @returns {Promise<object>} `{ rounds, finalPrompt, terminationReason,
 *   totalCost, history }`.
 */
async function handleStopDecision(state, session, roundResult, promptSnapshot, terminatedByCondition, conditionReason) {
  let reason = "User requested stop";
  if (terminatedByCondition) {
    reason = conditionReason;
  }

  session.addRound(roundResult, promptSnapshot);
  session.complete(reason);
  await session.flush();
  state.completedRounds.push(roundResult);

  const finalPrompt = deserializePrompt(session.history.currentPrompt);
  return {
    rounds: state.completedRounds,
    finalPrompt,
    terminationReason: reason,
    totalCost: state.totalCost,
    history: session.history
  };
}
|
|
3790
|
+
/**
 * Restores the prompt that was evaluated in an earlier round and trims the
 * score history so the next delta is computed against the round before it.
 *
 * The round is located by its `round` number rather than by array position:
 * `completedRounds` can have gaps (a round that ended in a rollback is never
 * appended) and starts empty when a session is resumed, so position
 * `rollbackToRound - 1` does not reliably hold that round.
 *
 * @param {object} state - Cycle state; `currentPrompt` and `previousScores`
 *   are updated in place. Left untouched when the round is not found.
 * @param {number} rollbackToRound - 1-based number of the round to return to.
 * @returns {void}
 * @throws {Error} When no completed round carries that number.
 */
function handleRollbackDecision(state, rollbackToRound) {
  const targetRound = state.completedRounds.find((r) => r.round === rollbackToRound);
  if (targetRound === void 0) {
    throw new Error(`Cannot rollback to round ${rollbackToRound}: round not found`);
  }
  state.currentPrompt = deserializePrompt(targetRound.promptSnapshot);
  state.previousScores = state.previousScores.slice(0, rollbackToRound - 1);
}
|
|
3799
|
+
/**
 * Records the round and, when suggestions were approved, applies them to the
 * current prompt before continuing.
 *
 * @param {object} state - Cycle state; `currentPrompt` may be replaced and
 *   the round is appended to `completedRounds`.
 * @param {object} session - Session receiving the round via `addRound`.
 * @param {object} roundResult - Result of the round; not mutated.
 * @param {Array<object>} approvedSuggestions - Suggestions to apply.
 * @param {string} versionBump - Passed to `applyPromptSuggestions` as `bumpVersion`.
 * @returns {object} Copy of `roundResult` with the approved suggestions and,
 *   if any were applied, the new prompt version.
 */
function handleContinueDecision(state, session, roundResult, approvedSuggestions, versionBump) {
  const recorded = { ...roundResult, suggestionsApproved: approvedSuggestions };

  const hasApprovals = approvedSuggestions.length > 0;
  if (hasApprovals) {
    const { prompt } = applyPromptSuggestions(state.currentPrompt, approvedSuggestions, {
      bumpVersion: versionBump
    });
    state.currentPrompt = prompt;
    recorded.promptVersionAfter = state.currentPrompt.version;
  }

  // The snapshot stored with the session reflects the prompt after changes.
  session.addRound(recorded, serializePrompt(state.currentPrompt));
  state.completedRounds.push(recorded);
  return recorded;
}
|
|
3816
|
+
/**
 * Runs one evaluation round: builds the agent from the current prompt, runs
 * the eval suite, asks the improver (when configured) for suggestions and
 * computes the round cost.
 *
 * @param {object} config - Read for `createAgent`, `judge`, `improver`,
 *   `testCases` and `options` (`agentDescription`, `runOptions`).
 * @param {object} state - Cycle state; supplies `currentPrompt`.
 * @param {object} [pricingConfig] - Pricing passed to the cost calculation.
 * @returns {Promise<{report: object, improveResult: object, cost: object}>}
 *   `improveResult` is `{ suggestions: [] }` when no improver is configured.
 */
async function executeRound(config, state, pricingConfig) {
  const { createAgent, judge, improver, testCases: testCases2, options = {} } = config;

  const suite = createEvalSuite({
    agent: createAgent(state.currentPrompt),
    judge,
    agentDescription: options.agentDescription
  });
  const report = await suite.run(testCases2, options.runOptions);

  let improveResult = { suggestions: [] };
  if (improver) {
    improveResult = await improver.improve(state.currentPrompt, report.results);
  }

  return {
    report,
    improveResult,
    cost: calculateRoundCost(report, improveResult, pricingConfig)
  };
}
|
|
3829
|
+
/**
 * Guesses the LLM provider from a model name so the improver's token usage
 * can be priced.
 *
 * Recognised prefixes: `claude-` (anthropic), `gpt-` and the o-series
 * `o<digit>` such as `o1`, `o3` or `o4-mini` (openai), `gemini-` (google).
 * Matching any `o<digit>` prefix keeps newer o-series names from falling
 * through to the default.
 *
 * @param {string} [model] - Model identifier.
 * @returns {"anthropic"|"openai"|"google"} Detected provider; "anthropic"
 *   when the model is missing or unrecognised.
 */
function detectProviderForImprover(model) {
  if (!model) return "anthropic";
  if (model.startsWith("claude-")) return "anthropic";
  if (model.startsWith("gpt-") || /^o\d/.test(model)) return "openai";
  if (model.startsWith("gemini-")) return "google";
  return "anthropic";
}
|
|
3836
|
+
/**
 * Copies the three token counters of `usage` into a new object, unchanged.
 *
 * @param {object} usage - Object read for `inputTokens`, `outputTokens`, `totalTokens`.
 * @returns {{inputTokens: *, outputTokens: *, totalTokens: *}}
 */
function toLanguageModelUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
|
|
3843
|
+
/**
 * Prices the improver's token usage.
 *
 * @param {object} improveResult - Improver output; read for
 *   `metadata.tokenUsage` and `metadata.model` ("unknown" when absent).
 * @param {object} [pricingConfig] - Optional pricing; the entry of
 *   `providerPricing` for the detected provider is forwarded.
 * @returns {number} The `total` reported by the cost calculation, or 0 when
 *   no token usage was recorded.
 */
function calculateImproverCost(improveResult, pricingConfig) {
  const { metadata } = improveResult;
  const usage = metadata?.tokenUsage;
  if (!usage) {
    return 0;
  }
  const model = metadata?.model ?? "unknown";
  const provider = detectProviderForImprover(model);
  const { total } = calculateCostFromUsage2(
    toLanguageModelUsage2(usage),
    model,
    provider,
    pricingConfig?.providerPricing?.[provider]
  );
  return total;
}
|
|
3857
|
+
/**
 * Combines the report's agent/judge costs with the improver cost.
 *
 * @param {object} report - Evaluation report of the round.
 * @param {object} improveResult - Improver output of the round.
 * @param {object} [pricingConfig] - Pricing configuration; without it the
 *   report costs are treated as zero.
 * @returns {{agent: number, judge: number, improver: number, total: number}}
 */
function calculateRoundCost(report, improveResult, pricingConfig) {
  let reportCosts = { total: 0, byComponent: { agent: 0, judge: 0 } };
  if (pricingConfig) {
    reportCosts = calculateReportCosts(report, pricingConfig);
  }
  const improver = calculateImproverCost(improveResult, pricingConfig);
  const { byComponent, total: reportTotal } = reportCosts;
  return {
    agent: byComponent.agent ?? 0,
    judge: byComponent.judge ?? 0,
    improver,
    total: reportTotal + improver
  };
}
|
|
3867
|
+
/**
 * Async generator driving the evaluate, suggest, decide loop.
 *
 * Each iteration runs one round and yields `{ roundResult,
 * pendingSuggestions, terminationCheck, context }`. The value sent back via
 * `next(decision)` selects what happens next:
 * - no decision or `action: "stop"`: finalize and return the cycle result;
 * - `action: "rollback"` with `rollbackToRound`: restore that round's prompt
 *   and run another round without recording the current one;
 * - anything else: apply `approvedSuggestions` (default none) and continue.
 *
 * @param {object} config - Read for `initialPrompt`, `terminateWhen`
 *   (default `[]`) and `options` (`pricingConfig`, `versionBump` defaulting
 *   to "patch", `history`, `session`); also forwarded to `executeRound`.
 * @returns {AsyncGenerator<object, object, object>} Yields per-round data and
 *   returns the final cycle result.
 * @throws Any error raised during a round is rethrown after the session has
 *   been completed with an `Error: ...` reason.
 */
async function* runImprovementCycle(config) {
  const { initialPrompt, terminateWhen = [], options = {} } = config;
  const { pricingConfig, versionBump = "patch", history: historyConfig, session: existingSession } = options;

  // Reuse the supplied session, or start a new one (optionally persisted).
  let session = existingSession;
  if (session == null) {
    const sessionConfig = historyConfig
      ? { path: historyConfig.path, autoSave: historyConfig.autoSave }
      : void 0;
    session = createSession(initialPrompt, sessionConfig);
  }
  const state = initializeCycleState(initialPrompt, existingSession);

  try {
    for (;;) {
      state.currentRound++;
      const { report, improveResult, cost } = await executeRound(config, state, pricingConfig);
      state.totalCost += cost.total;

      const currentScore = report.summary.avgScore;
      const scoreDelta = calculateScoreDelta(currentScore, state.previousScores);
      const promptSnapshot = serializePrompt(state.currentPrompt);
      const roundResult = createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot);
      // The context is captured before the current score joins previousScores.
      const context = buildCycleContext(state, currentScore);
      state.previousScores.push(currentScore);

      const terminationCheck = await checkCycleTermination(terminateWhen, context);
      const pendingSuggestions = improveResult.suggestions.map((suggestion) => ({
        ...suggestion,
        approved: false
      }));

      const decision = yield { roundResult, pendingSuggestions, terminationCheck, context };

      const wantsStop = !decision || decision.action === "stop";
      if (wantsStop) {
        return await handleStopDecision(
          state,
          session,
          roundResult,
          promptSnapshot,
          terminationCheck.terminated,
          terminationCheck.reason
        );
      }

      const wantsRollback = decision.action === "rollback" && decision.rollbackToRound !== void 0;
      if (wantsRollback) {
        handleRollbackDecision(state, decision.rollbackToRound);
        continue;
      }

      handleContinueDecision(state, session, roundResult, decision.approvedSuggestions ?? [], versionBump);
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    session.complete(`Error: ${errorMessage}`);
    throw error;
  }
}
|
|
3926
|
+
/**
 * Runs `runImprovementCycle` to completion without human input.
 *
 * After every yielded round it answers "stop" when the round's termination
 * check reports `terminated`, and otherwise "continue" with every pending
 * suggestion marked as approved.
 *
 * @param config - Cycle configuration, forwarded unchanged to `runImprovementCycle`.
 * @returns The generator's final return value.
 */
async function runImprovementCycleAuto(config) {
  const cycle = runImprovementCycle(config);
  let step = await cycle.next();
  while (!step.done) {
    const { terminationCheck, pendingSuggestions } = step.value;
    const decision = terminationCheck.terminated
      ? { action: "stop" }
      : {
          action: "continue",
          approvedSuggestions: pendingSuggestions.map((s) => ({ ...s, approved: true }))
        };
    step = await cycle.next(decision);
  }
  return step.value;
}
|
|
3945
|
+
|
|
3946
|
+
// src/cli/commands/run.ts
|
|
3947
|
+
/**
 * CLI `run` command: loads environment and config, runs YAML and inline
 * test cases, prints a summary, optionally writes a report, then exits.
 *
 * Exit codes: 1 when no test ran, when any test failed, or when an error was
 * thrown; 0 otherwise. This function always ends by calling `process.exit`.
 *
 * @param configPath - Path passed to `loadConfigWithDefaults`.
 * @param options - Parsed CLI options (`envFile`, `concurrency`, `iterations`,
 *   `verbose`, `include`, `tags`, `agent`, `report`, `output`).
 */
async function runCommand(configPath, options) {
  const startTime = Date.now();
  try {
    printBanner();
    printProgress("Loading environment...");
    await loadEnvFile(options.envFile);
    printProgress("Loading configuration...");
    const config = await loadConfigWithDefaults(configPath);
    printProgress("Initializing providers...");
    const providers = initializeProviders(config, options);
    const { mainProvider, judgeProvider, improverProvider } = providers;

    const judge = createJudge({
      provider: judgeProvider,
      prompt: config.judge.prompt,
      criteria: config.judge.criteria,
      passThreshold: config.judge.passThreshold
    });
    let improver;
    if (config.improver) {
      improver = createImprover({
        provider: improverProvider,
        prompt: config.improver.prompt
      });
    }

    // CLI flags win over config values; both settings default to 1.
    let concurrency;
    if (options.concurrency) {
      concurrency = parseInt(options.concurrency, 10);
    } else {
      concurrency = config.run?.concurrency ?? 1;
    }
    let iterations;
    if (options.iterations) {
      iterations = parseInt(options.iterations, 10);
    } else {
      iterations = config.run?.iterations ?? 1;
    }
    const verbose = options.verbose ?? config.output?.verbose ?? false;

    const sharedParams = { config, options, judge, improver, concurrency, iterations };
    const collected = [];

    const includePatterns = options.include ?? config.include;
    if (includePatterns && includePatterns.length > 0) {
      const fromYaml = await runYamlTests({ ...sharedParams, includePatterns, mainProvider });
      collected.push(...fromYaml);
    }
    if (config.testCases && config.testCases.length > 0) {
      const fromInline = await runInlineTests({ ...sharedParams });
      collected.push(...fromInline);
    }

    const report = mergeReports(collected, resolvePromptVersion(config));
    if (report.summary.totalTests === 0) {
      let message = "No test cases to run after filtering.\n";
      if (options.tags) {
        message += ` Tags filter: ${options.tags.join(", ")}\n`;
      }
      if (options.agent) {
        message += ` Agent filter: ${options.agent}\n`;
      }
      printError(new Error(message));
      process.exit(1);
    }

    const duration = Date.now() - startTime;
    printSummary(report, { verbose, duration });

    // Report generation is on unless explicitly disabled (`report === false`).
    if (options.report !== false) {
      const outputPath = await generateReport(report, {
        dir: config.output?.dir,
        filename: options.output ?? config.output?.filename
      });
      console.log(`\nReport saved to: ${outputPath}`);
    }

    process.exit(report.summary.failed > 0 ? 1 : 0);
  } catch (error) {
    printError(error instanceof Error ? error : new Error(String(error)));
    process.exit(1);
  }
}
|
|
4024
|
+
/**
 * Discovers YAML eval files, groups them by agent and runs their test cases.
 *
 * For every agent group the cases are filtered by `options.tags`, split into
 * single-turn and multi-turn cases, and each non-empty set contributes one
 * report to the returned array.
 *
 * @param params - `{ config, options, includePatterns, mainProvider, judge, improver, concurrency, iterations }`.
 * @returns Array of reports (empty when no file matches the include patterns).
 * @throws ConfigError when `options.agent` is set and no loaded file targets that agent.
 */
async function runYamlTests(params) {
  const {
    config,
    options,
    includePatterns,
    mainProvider,
    judge,
    improver,
    concurrency,
    iterations
  } = params;
  const reports = [];

  printProgress("Discovering YAML eval files...");
  const filePaths = await discoverEvalFiles(config, { include: includePatterns });
  if (filePaths.length === 0) {
    printProgress("No YAML files found matching patterns");
    return reports;
  }
  printProgress(`Discovered ${filePaths.length} YAML file(s)`);

  const yamlFiles = await loadYamlEvalFiles(filePaths);
  let selectedFiles = yamlFiles;
  if (options.agent) {
    selectedFiles = yamlFiles.filter((f) => f.content.agent === options.agent);
  }
  if (options.agent && selectedFiles.length === 0) {
    const knownAgents = [...new Set(yamlFiles.map((f) => f.content.agent))];
    throw new ConfigError(
      `No YAML files found for agent "${options.agent}".\nAvailable agents: ${knownAgents.join(", ")}`,
      "CONFIG_VALIDATION_ERROR"
    );
  }

  for (const [agentName, agentFiles] of groupYamlByAgent(selectedFiles)) {
    printProgress(`Running tests for agent: ${agentName}`);
    const agent = lookupAgent(config, agentName);
    const yamlContext = {
      provider: mainProvider,
      buildInput: (response, _ctx) => ({ message: response })
    };

    const agentCases = [];
    for (const file of agentFiles) {
      agentCases.push(...convertToTestCases(file.content, yamlContext));
    }

    const taggedCases = filterByTags(agentCases, options.tags);
    if (taggedCases.length === 0) {
      printProgress(` No matching tests after tag filter for ${agentName}`);
      continue;
    }

    const { singleTurnCases, multiTurnCases } = splitTestCases(taggedCases);
    const suite = createEvalSuite({
      agent,
      judge,
      improver,
      agentDescription: resolveAgentDescription(config, agent)
    });

    if (singleTurnCases.length > 0) {
      printProgress(` Running ${singleTurnCases.length} single-turn test(s)...`);
      const singleTurnReport = await suite.run(singleTurnCases, {
        concurrency,
        iterations,
        stopOnFirstFailure: config.run?.stopOnFirstFailure
      });
      reports.push(singleTurnReport);
    }

    if (multiTurnCases.length > 0) {
      printProgress(` Running ${multiTurnCases.length} multi-turn test(s)...`);
      const multiTurnResults = await runMultiTurnCases(multiTurnCases, {
        agent,
        judge,
        agentDescription: resolveAgentDescription(config, agent)
      });
      // Override `agent` so the report's prompt version is resolved against this agent.
      reports.push(createMultiTurnReport(multiTurnResults, { ...config, agent }));
    }
  }
  return reports;
}
|
|
4091
|
+
/**
 * Runs the test cases declared inline in `config.testCases` against `config.agent`.
 *
 * Cases are filtered by `options.tags`, then split into single-turn cases
 * (run through an eval suite) and multi-turn cases (run one by one).
 *
 * @param params - `{ config, options, judge, improver, concurrency, iterations }`.
 * @returns Array with up to two reports; empty when no case survives the tag filter.
 */
async function runInlineTests(params) {
  const { config, options, judge, improver, concurrency, iterations } = params;
  const reports = [];

  const selected = filterByTags(config.testCases, options.tags);
  if (selected.length === 0) {
    return reports;
  }

  const { singleTurnCases, multiTurnCases } = splitTestCases(selected);

  if (singleTurnCases.length > 0) {
    printProgress(`Running ${singleTurnCases.length} inline single-turn test(s)...`);
    const suite = createEvalSuite({
      agent: config.agent,
      judge,
      improver,
      agentDescription: resolveAgentDescription(config, config.agent)
    });
    const singleTurnReport = await suite.run(singleTurnCases, {
      concurrency,
      iterations,
      stopOnFirstFailure: config.run?.stopOnFirstFailure
    });
    reports.push(singleTurnReport);
  }

  if (multiTurnCases.length > 0) {
    printProgress(`Running ${multiTurnCases.length} inline multi-turn test(s)...`);
    const multiTurnResults = await runMultiTurnCases(multiTurnCases, {
      agent: config.agent,
      judge,
      agentDescription: resolveAgentDescription(config, config.agent)
    });
    reports.push(createMultiTurnReport(multiTurnResults, config));
  }

  return reports;
}
|
|
4126
|
+
/**
 * Executes multi-turn test cases one after another (each is awaited before the next starts).
 *
 * @param testCases2 - Multi-turn test cases to execute.
 * @param context - Execution context forwarded to `executeMultiTurnTestCase`.
 * @returns Results in the same order as the input cases.
 */
async function runMultiTurnCases(testCases2, context) {
  const collected = [];
  for (const current of testCases2) {
    collected.push(await executeMultiTurnTestCase(current, context));
  }
  return collected;
}
|
|
4137
|
+
/**
 * Aggregates multi-turn results into a report object.
 *
 * Reads `passed`, `overallScore`, `metrics.latencyMs` and
 * `metrics.tokenUsage.totalTokens` from every result. Averages are 0 when
 * there are no results. `totalEstimatedCost` is always reported as 0 and
 * `suggestions` is always empty.
 *
 * @param results - Multi-turn test results.
 * @param config - Config passed to `resolvePromptVersion` for the `promptVersion` field.
 */
function createMultiTurnReport(results, config) {
  const totalTests = results.length;
  let passed = 0;
  let scoreSum = 0;
  let latencySum = 0;
  let totalTokens = 0;
  for (const entry of results) {
    if (entry.passed) {
      passed += 1;
    }
    scoreSum += entry.overallScore;
    latencySum += entry.metrics.latencyMs;
    totalTokens += entry.metrics.tokenUsage.totalTokens;
  }
  const hasResults = totalTests > 0;
  return {
    summary: {
      totalTests,
      passed,
      failed: totalTests - passed,
      avgScore: hasResults ? scoreSum / totalTests : 0,
      metrics: {
        avgLatencyMs: hasResults ? latencySum / totalTests : 0,
        totalTokens,
        totalEstimatedCost: 0
      }
    },
    results,
    suggestions: [],
    generatedAt: new Date(),
    promptVersion: resolvePromptVersion(config)
  };
}
|
|
4162
|
+
/**
 * Groups loaded YAML files by their `content.agent` value.
 *
 * @param files - Loaded YAML files, each exposing `content.agent`.
 * @returns A Map from agent name to the files naming it, in first-seen order.
 */
function groupYamlByAgent(files) {
  const byAgent = new Map();
  for (const file of files) {
    const key = file.content.agent;
    const bucket = byAgent.get(key);
    if (bucket === undefined) {
      byAgent.set(key, [file]);
    } else {
      bucket.push(file);
    }
  }
  return byAgent;
}
|
|
4173
|
+
/**
 * Resolves an agent by name from the `config.agents` registry.
 *
 * Only the registry's own keys are considered, so names that happen to match
 * inherited object members (for example "toString" or "constructor") are
 * reported as missing instead of returning a prototype function.
 *
 * @param config - Loaded configuration; `config.agents` may be absent.
 * @param agentName - Name of the agent to look up.
 * @returns The registered agent.
 * @throws ConfigError (code "CONFIG_VALIDATION_ERROR") when the agent is not registered.
 */
function lookupAgent(config, agentName) {
  const registry = config.agents;
  const isRegistered = Boolean(registry) && Object.prototype.hasOwnProperty.call(registry, agentName);
  if (!isRegistered) {
    const available = registry ? Object.keys(registry) : [];
    const availableLabel = available.length > 0 ? available.join(", ") : "(none)";
    throw new ConfigError(
      `Agent "${agentName}" not found in config.agents registry.\nAvailable agents: ${availableLabel}`,
      "CONFIG_VALIDATION_ERROR"
    );
  }
  return registry[agentName];
}
|
|
4184
|
+
/**
 * Keeps the test cases that carry at least one of the requested tags.
 *
 * @param testCases2 - Test cases, each with an optional `tags` array.
 * @param tags - Requested tags. When missing or empty, the input array itself is returned.
 * @returns The filtered cases; untagged cases are dropped whenever a tag filter is active.
 */
function filterByTags(testCases2, tags) {
  const filterActive = Boolean(tags) && tags.length !== 0;
  if (!filterActive) {
    return testCases2;
  }
  const matchesAnyTag = (candidate) => {
    const ownTags = candidate.tags;
    if (!ownTags || ownTags.length === 0) {
      return false;
    }
    return ownTags.some((tag) => tags.includes(tag));
  };
  return testCases2.filter(matchesAnyTag);
}
|
|
4195
|
+
/**
 * Partitions test cases into single-turn and multi-turn lists, using
 * `isMultiTurnConfig` as the discriminator. Input order is kept in both lists.
 *
 * @param testCases2 - Test cases to partition.
 * @returns `{ singleTurnCases, multiTurnCases }`.
 */
function splitTestCases(testCases2) {
  const singleTurnCases = [];
  const multiTurnCases = [];
  for (const candidate of testCases2) {
    const bucket = isMultiTurnConfig(candidate) ? multiTurnCases : singleTurnCases;
    bucket.push(candidate);
  }
  return { singleTurnCases, multiTurnCases };
}
|
|
4207
|
+
/**
 * Picks the agent description: `config.agentDescription` first, then
 * `agent.config.description`, then an empty string. Only `null`/`undefined`
 * trigger the fallback.
 *
 * @param config - Loaded configuration.
 * @param agent - Agent object; only read when the config has no description.
 */
function resolveAgentDescription(config, agent) {
  const fromConfig = config.agentDescription;
  if (fromConfig !== null && fromConfig !== undefined) {
    return fromConfig;
  }
  const fromAgent = agent.config.description;
  if (fromAgent !== null && fromAgent !== undefined) {
    return fromAgent;
  }
  return "";
}
|
|
4210
|
+
/**
 * Determines the prompt version to stamp on a report.
 *
 * Order of preference: `config.agent.prompt.version`, then the prompt version
 * of the first entry in `config.agents`, then the literal "unknown".
 * Falsy versions are skipped.
 *
 * @param config - Loaded configuration.
 * @returns The resolved version, or "unknown".
 */
function resolvePromptVersion(config) {
  const primaryVersion = config.agent?.prompt?.version;
  if (primaryVersion) {
    return primaryVersion;
  }
  const [firstAgent] = config.agents ? Object.values(config.agents) : [];
  return firstAgent?.prompt?.version || "unknown";
}
|
|
4222
|
+
/**
 * Combines several reports into one.
 *
 * - No reports: returns an empty report from `createEmptyReport`.
 * - Exactly one report: that report is returned as-is (its own `promptVersion` is kept).
 * - Otherwise: results are concatenated, suggestions de-duplicated, counters
 *   summed, and averages recomputed weighted by each report's `totalTests`.
 *
 * @param reports - Reports to merge.
 * @param promptVersion - Version stamped on the merged (or empty) report.
 */
function mergeReports(reports, promptVersion) {
  if (reports.length === 0) {
    return createEmptyReport(promptVersion);
  }
  if (reports.length === 1) {
    return reports[0];
  }

  const results = [];
  const uniqueSuggestions = new Map();
  const totals = { tests: 0, passed: 0, failed: 0, score: 0, latency: 0, tokens: 0, cost: 0 };

  for (const { results: partResults, suggestions, summary } of reports) {
    results.push(...partResults);
    deduplicateSuggestions(suggestions, uniqueSuggestions);
    totals.tests += summary.totalTests;
    totals.passed += summary.passed;
    totals.failed += summary.failed;
    // Averages are converted back to sums so the merged average is test-count weighted.
    totals.score += summary.avgScore * summary.totalTests;
    totals.latency += summary.metrics.avgLatencyMs * summary.totalTests;
    totals.tokens += summary.metrics.totalTokens;
    totals.cost += summary.metrics.totalEstimatedCost ?? 0;
  }

  const hasTests = totals.tests > 0;
  return {
    summary: {
      totalTests: totals.tests,
      passed: totals.passed,
      failed: totals.failed,
      avgScore: hasTests ? totals.score / totals.tests : 0,
      metrics: {
        avgLatencyMs: hasTests ? totals.latency / totals.tests : 0,
        totalTokens: totals.tokens,
        totalEstimatedCost: totals.cost
      }
    },
    results,
    suggestions: [...uniqueSuggestions.values()],
    generatedAt: new Date(),
    promptVersion
  };
}
|
|
4267
|
+
/**
 * Creates a report with zeroed counters and metrics and no results or suggestions.
 *
 * @param promptVersion - Version stored on the report.
 * @returns A new report object whose `generatedAt` is the current time.
 */
function createEmptyReport(promptVersion) {
  const metrics = { avgLatencyMs: 0, totalTokens: 0, totalEstimatedCost: 0 };
  const summary = { totalTests: 0, passed: 0, failed: 0, avgScore: 0, metrics };
  return {
    summary,
    results: [],
    suggestions: [],
    generatedAt: new Date(),
    promptVersion
  };
}
|
|
4286
|
+
/**
 * Adds suggestions to `map`, keyed by "<type>:<suggestedValue>".
 * The first suggestion seen for a key wins; later duplicates are ignored.
 *
 * @param suggestions - Suggestions to register.
 * @param map - Map mutated in place.
 */
function deduplicateSuggestions(suggestions, map) {
  for (const suggestion of suggestions) {
    const { type, suggestedValue } = suggestion;
    const key = `${type}:${suggestedValue}`;
    if (map.has(key)) {
      continue;
    }
    map.set(key, suggestion);
  }
}
|
|
4294
|
+
|
|
4295
|
+
// src/cli/output/improve-report.ts
|
|
4296
|
+
/**
 * Prints the end-of-cycle summary to stdout: round count, final score,
 * score change, total cost, final prompt version and termination reason.
 *
 * @param result - Cycle result; reads `rounds`, `terminationReason`, `totalCost`, `finalPrompt`.
 * @param options - `{ verbose, duration }`. The duration line is printed only when
 *   `duration` is defined; the per-round history only when `verbose` is truthy and rounds exist.
 */
function printImprovementSummary(result, options = {}) {
  const { rounds, terminationReason, totalCost, finalPrompt } = result;
  const { verbose, duration } = options;
  const divider = "\u2550".repeat(CLI_DEFAULTS.DIVIDER_WIDTH);
  const printField = (label, value) => {
    console.log(` ${c("bold", label)} ${value}`);
  };

  console.log();
  console.log(c("cyan", divider));
  console.log(c("bold", " Improvement Cycle Results"));
  console.log(c("cyan", divider));
  console.log();

  const finalScore = getFinalScore(rounds);
  const scoreChange = getScoreChange(rounds);
  printField("Total Rounds:", rounds.length);
  printField("Final Score:", `${finalScore.toFixed(1)}/100`);
  printField("Score Change:", scoreChange);
  printField("Total Cost:", `$${totalCost.toFixed(2)}`);
  printField("Final Version:", finalPrompt.version);
  console.log();
  printField("Termination:", terminationReason);

  if (duration !== void 0) {
    console.log();
    printField("Duration:", formatDuration(duration));
  }

  console.log();
  console.log(c("cyan", divider));

  if (verbose && rounds.length > 0) {
    printRoundsDetail(rounds);
  }
}
|
|
4324
|
+
/**
 * Returns the average score of the last round, or 0 when there are no rounds.
 *
 * @param rounds - Round results, each exposing `report.summary.avgScore`.
 */
function getFinalScore(rounds) {
  const roundCount = rounds.length;
  if (roundCount === 0) {
    return 0;
  }
  const lastRound = rounds[roundCount - 1];
  return lastRound.report.summary.avgScore;
}
|
|
4328
|
+
/**
 * Formats the score difference between the first and the last round.
 *
 * @param rounds - Round results, each exposing `report.summary.avgScore`.
 * @returns "N/A" for no rounds, a dimmed "N/A (first round)" for a single
 *   round, a green "+x.x" for gains, a red "-x.x" for losses, and "0.0" otherwise.
 */
function getScoreChange(rounds) {
  const roundCount = rounds.length;
  if (roundCount < 1) {
    return "N/A";
  }
  const initialScore = rounds[0].report.summary.avgScore;
  const latestScore = rounds[roundCount - 1].report.summary.avgScore;
  const delta = latestScore - initialScore;
  if (roundCount === 1) {
    return c("dim", "N/A (first round)");
  }
  if (delta > 0) {
    return c("green", `+${delta.toFixed(1)}`);
  }
  if (delta < 0) {
    // toFixed already carries the minus sign.
    return c("red", `${delta.toFixed(1)}`);
  }
  return "0.0";
}
|
|
4343
|
+
/**
 * Formats a millisecond duration for display.
 *
 * @param ms - Duration in milliseconds.
 * @returns "<n>ms" below one second, "<n.n>s" below one minute, "<n.n>m" otherwise.
 */
function formatDuration(ms) {
  const MS_PER_SECOND = 1e3;
  const MS_PER_MINUTE = 6e4;
  return ms < MS_PER_SECOND
    ? `${ms}ms`
    : ms < MS_PER_MINUTE
      ? `${(ms / MS_PER_SECOND).toFixed(1)}s`
      : `${(ms / MS_PER_MINUTE).toFixed(1)}m`;
}
|
|
4348
|
+
/**
 * Prints one two-line entry per round: score (with delta when `scoreDelta`
 * is not null) and cost, followed by generated/applied suggestion counts.
 *
 * @param rounds - Round results to list.
 */
function printRoundsDetail(rounds) {
  console.log();
  console.log(c("bold", " Round History:"));
  console.log();
  for (const entry of rounds) {
    const scoreStr = entry.report.summary.avgScore.toFixed(1);
    let deltaStr = "";
    if (entry.scoreDelta !== null) {
      // Non-negative deltas get an explicit plus sign.
      const sign = entry.scoreDelta >= 0 ? "+" : "";
      deltaStr = ` (${sign}${entry.scoreDelta.toFixed(1)})`;
    }
    const costStr = `$${entry.cost.total.toFixed(2)}`;
    console.log(` Round ${entry.round}: Score ${scoreStr}${deltaStr} | Cost ${costStr}`);
    console.log(
      ` Suggestions: ${entry.suggestionsGenerated.length} generated, ${entry.suggestionsApproved.length} applied`
    );
  }
}
|
|
4364
|
+
|
|
4365
|
+
// src/cli/commands/improve.ts
|
|
4366
|
+
/**
 * CLI `improve` command: validates options, loads environment and config,
 * builds the judge, improver and termination conditions, then hands over to
 * resume mode (when `options.resume` is set) or fresh mode.
 *
 * Any thrown error is printed and the process exits with code 1.
 *
 * @param configPath - Path passed to `loadConfigWithDefaults`.
 * @param options - Parsed CLI options.
 */
async function improveCommand(configPath, options) {
  const startTime = Date.now();
  try {
    printBanner();
    validateImproveOptions(options);
    printProgress("Loading environment...");
    await loadEnvFile(options.envFile);
    printProgress("Loading configuration...");
    const config = await loadConfigWithDefaults(configPath);
    printProgress("Initializing providers...");
    const providers = initializeProviders(config, options);
    const { mainProvider, judgeProvider, improverProvider } = providers;

    const conditions = buildTerminationConditions(options);
    if (conditions.length === 0) {
      throw new Error(
        "At least one termination condition is required.\nUse --target-score, --max-rounds, --max-cost, or --stale-rounds"
      );
    }

    const judge = createJudge({
      provider: judgeProvider,
      prompt: config.judge.prompt,
      criteria: config.judge.criteria,
      passThreshold: config.judge.passThreshold
    });

    if (!config.improver) {
      throw new Error(
        "Improver configuration is required for improvement cycles.\nAdd an `improver` section to your config file."
      );
    }
    const improver = createImprover({
      provider: improverProvider,
      prompt: config.improver.prompt
    });

    // Both modes share one signature, so select the runner and call it once.
    const runMode = options.resume ? runResumeMode : runFreshMode;
    await runMode(options, config, conditions, judge, improver, mainProvider, startTime);
  } catch (error) {
    printError(error instanceof Error ? error : new Error(String(error)));
    process.exit(1);
  }
}
|
|
4424
|
+
/**
 * Ensures the improve command knows where its history lives.
 *
 * @param options - Parsed CLI options; at least one of `history` or `resume` must be truthy.
 * @throws Error when neither option is provided.
 */
function validateImproveOptions(options) {
  if (options.history || options.resume) {
    return;
  }
  throw new Error(
    "--history <path> is required to save improvement history.\nOr use --resume <path> to continue from existing history."
  );
}
|
|
4431
|
+
/**
 * Translates CLI flags into termination conditions for the improvement cycle.
 *
 * An option counts as provided whenever it is not `null`/`undefined`. The
 * argument parser may deliver numeric flags as numbers, so a value of `0`
 * must reach validation instead of being skipped as falsy.
 *
 * @param options - Parsed CLI options (`targetScore`, `maxRounds`, `maxCost`, `staleRounds`).
 * @returns Conditions in the order: target score, max rounds, max cost, stale rounds.
 * @throws Error when a provided value is not a number or is out of range.
 */
function buildTerminationConditions(options) {
  const isProvided = (value) => value !== undefined && value !== null;
  const conditions = [];

  if (isProvided(options.targetScore)) {
    const score = parseInt(options.targetScore, 10);
    if (Number.isNaN(score) || score < 0 || score > 100) {
      throw new Error(`Invalid target score: ${options.targetScore}. Must be 0-100.`);
    }
    conditions.push(targetScore(score));
  }

  if (isProvided(options.maxRounds)) {
    const rounds = parseInt(options.maxRounds, 10);
    if (Number.isNaN(rounds) || rounds < 1) {
      throw new Error(`Invalid max rounds: ${options.maxRounds}. Must be >= 1.`);
    }
    conditions.push(maxRounds(rounds));
  }

  if (isProvided(options.maxCost)) {
    const cost = parseFloat(options.maxCost);
    if (Number.isNaN(cost) || cost <= 0) {
      throw new Error(`Invalid max cost: ${options.maxCost}. Must be > 0.`);
    }
    conditions.push(maxCost(cost));
  }

  if (isProvided(options.staleRounds)) {
    const rounds = parseInt(options.staleRounds, 10);
    if (Number.isNaN(rounds) || rounds < 1) {
      throw new Error(`Invalid stale-rounds: ${options.staleRounds}. Must be >= 1.`);
    }
    conditions.push(noImprovement(rounds));
  }

  return conditions;
}
|
|
4463
|
+
/**
 * Returns a factory that clones `baseAgent` (shallow copy) with its `prompt` replaced.
 *
 * @param baseAgent - Agent whose own properties are copied into every produced agent.
 * @returns A function `(prompt) => agent`.
 */
function createAgentFactory(baseAgent) {
  const withPrompt = (prompt) => {
    const agent = { ...baseAgent, prompt };
    return agent;
  };
  return withPrompt;
}
|
|
4469
|
+
/**
 * Starts a new improvement cycle from `config.agent.prompt`, prints the
 * summary and exits the process with code 0.
 *
 * @param options - Parsed CLI options (`history`, `concurrency`, `iterations`, `verbose`).
 * @param config - Loaded configuration; `config.testCases` must be non-empty.
 * @param conditions - Termination conditions for the cycle.
 * @param judge - Judge instance.
 * @param improver - Improver instance.
 * @param _mainProvider - Unused.
 * @param startTime - Epoch milliseconds at command start, used for the duration.
 * @throws Error when the config contains no test cases.
 */
async function runFreshMode(options, config, conditions, judge, improver, _mainProvider, startTime) {
  printProgress("Starting improvement cycle...");

  const testCases2 = config.testCases ?? [];
  if (testCases2.length === 0) {
    throw new Error(
      "No test cases found. Add testCases to your config or use include patterns."
    );
  }

  let historyOptions;
  if (options.history) {
    historyOptions = { path: options.history, autoSave: true };
  }
  // CLI flags take precedence over the config's run settings.
  const runOptions = {
    concurrency: options.concurrency ? parseInt(options.concurrency, 10) : config.run?.concurrency,
    iterations: options.iterations ? parseInt(options.iterations, 10) : config.run?.iterations
  };

  const cycleConfig = {
    createAgent: createAgentFactory(config.agent),
    initialPrompt: config.agent.prompt,
    testCases: testCases2,
    judge,
    improver,
    terminateWhen: conditions,
    options: {
      pricingConfig: config.pricing,
      agentDescription: config.agentDescription,
      history: historyOptions,
      runOptions
    }
  };

  printProgress(`Running with ${testCases2.length} test case(s)...`);
  printProgress(`Termination: ${formatConditions(conditions)}`);
  console.log();

  const result = await runImprovementCycleAuto(cycleConfig);
  const duration = Date.now() - startTime;
  printImprovementSummary(result, { verbose: options.verbose, duration });

  if (options.history) {
    console.log(`\nHistory saved to: ${options.history}`);
  }
  process.exit(0);
}
|
|
4509
|
+
/**
 * Continues an improvement cycle from the history file at `options.resume`,
 * prints the summary and exits the process with code 0.
 *
 * The resumed session's current prompt becomes the cycle's initial prompt,
 * and the session itself is passed along to the cycle.
 *
 * @param options - Parsed CLI options (`resume`, `concurrency`, `iterations`, `verbose`).
 * @param config - Loaded configuration; `config.testCases` must be non-empty.
 * @param conditions - Termination conditions for the cycle.
 * @param judge - Judge instance.
 * @param improver - Improver instance.
 * @param _mainProvider - Unused.
 * @param startTime - Epoch milliseconds at command start, used for the duration.
 * @throws Error when the config contains no test cases.
 */
async function runResumeMode(options, config, conditions, judge, improver, _mainProvider, startTime) {
  const historyPath = options.resume;
  printProgress(`Resuming from ${historyPath}...`);
  const session = await resumeSession(historyPath, { autoSave: true });
  const currentPrompt = deserializePrompt(session.history.currentPrompt);
  printProgress(`Resumed session ${session.sessionId}`);
  printProgress(`Continuing from round ${session.history.rounds.length + 1}`);

  const testCases2 = config.testCases ?? [];
  if (testCases2.length === 0) {
    throw new Error(
      "No test cases found. Add testCases to your config or use include patterns."
    );
  }

  // CLI flags take precedence over the config's run settings.
  const runOptions = {
    concurrency: options.concurrency ? parseInt(options.concurrency, 10) : config.run?.concurrency,
    iterations: options.iterations ? parseInt(options.iterations, 10) : config.run?.iterations
  };

  const cycleConfig = {
    createAgent: createAgentFactory(config.agent),
    initialPrompt: currentPrompt,
    testCases: testCases2,
    judge,
    improver,
    terminateWhen: conditions,
    options: {
      pricingConfig: config.pricing,
      agentDescription: config.agentDescription,
      history: { path: historyPath, autoSave: true },
      // Pass the resumed session to preserve session ID and accumulated state
      session,
      runOptions
    }
  };

  printProgress(`Running with ${testCases2.length} test case(s)...`);
  printProgress(`Termination: ${formatConditions(conditions)}`);
  console.log();

  const result = await runImprovementCycleAuto(cycleConfig);
  const duration = Date.now() - startTime;
  printImprovementSummary(result, { verbose: options.verbose, duration });
  console.log(`\nHistory saved to: ${options.resume}`);
  process.exit(0);
}
|
|
4553
|
+
/**
 * Renders termination conditions as a human-readable string joined by " OR ".
 *
 * Known types: "targetScore", "maxRounds", "maxCost", "noImprovement" and
 * "custom" (which uses its `description` when present). Any other type falls
 * back to its description or type name, so the output never contains an
 * empty segment.
 *
 * @param conditions - Condition descriptors, each with a `type` field.
 * @returns The joined description; an empty string for an empty list.
 */
function formatConditions(conditions) {
  const describe = (condition) => {
    switch (condition.type) {
      case "targetScore":
        return `score >= ${condition.threshold}`;
      case "maxRounds":
        return `max ${condition.count} rounds`;
      case "maxCost":
        return `max $${condition.maxUSD}`;
      case "noImprovement":
        return `no improvement for ${condition.consecutiveRounds} rounds`;
      case "custom":
        return condition.description ?? "custom condition";
      default:
        // Without this branch the entry would be undefined and join() would print nothing for it.
        return condition.description ?? String(condition.type ?? "unknown condition");
    }
  };
  return conditions.map(describe).join(" OR ");
}
|
|
4569
|
+
|
|
4570
|
+
// src/cli/commands/rollback.ts
|
|
4571
|
+
import { existsSync as existsSync6 } from "fs";
|
|
4572
|
+
import { mkdir as mkdir3, writeFile as writeFile4 } from "fs/promises";
|
|
4573
|
+
import { dirname as dirname2 } from "path";
|
|
4574
|
+
/**
 * CLI `rollback` command: extracts a prompt snapshot from a history file and
 * writes it to `options.output` as JSON (default) or TypeScript.
 *
 * Any thrown error is printed and the process exits with code 1.
 *
 * @param historyPath - Path of the history file to read.
 * @param options - Parsed CLI options (`round`, `initial`, `output`, `format`).
 */
async function rollbackCommand(historyPath, options) {
  try {
    printBanner();
    validateRollbackOptions(historyPath, options);

    printProgress(`Loading history from ${historyPath}...`);
    const history = await loadHistory(historyPath);

    const snapshot = extractPromptSnapshot(history, options);
    const serializedPrompt = snapshot.prompt;
    printProgress(`Extracting ${snapshot.sourceLabel}...`);

    const format = options.format ?? "json";
    const output = formatPromptOutput(serializedPrompt, format);
    await writeOutputFile(options.output, output);

    const summaryLines = [
      ` Prompt extracted to: ${options.output}`,
      ` Prompt ID: ${serializedPrompt.id}`,
      ` Version: ${serializedPrompt.version}`
    ];
    console.log();
    for (const line of summaryLines) {
      console.log(line);
    }
    console.log();
  } catch (error) {
    printError(error instanceof Error ? error : new Error(String(error)));
    process.exit(1);
  }
}
|
|
4595
|
+
/**
 * Selects which prompt to extract from a history.
 *
 * @param history - Loaded history (`initialPrompt`, `rounds`).
 * @param options - When `initial` is truthy the initial prompt is used;
 *   otherwise `round` is parsed as a base-10 integer and that round's snapshot is used.
 * @returns `{ prompt, sourceLabel }`, where the label describes the source for progress output.
 */
function extractPromptSnapshot(history, options) {
  if (!options.initial) {
    const roundNumber = parseInt(options.round, 10);
    return {
      prompt: extractPromptFromRound(history, roundNumber),
      sourceLabel: `round ${roundNumber}`
    };
  }
  return {
    prompt: history.initialPrompt,
    sourceLabel: "initial prompt"
  };
}
|
|
4608
|
+
/**
 * Validates the arguments of the rollback command.
 *
 * Checks, in order: a history path is given; exactly one of `--round` /
 * `--initial` is used; `--output` is set; the round parses to an integer
 * >= 1; and the format, when given, is "json" or "ts".
 *
 * @param historyPath - Path of the history file.
 * @param options - Parsed CLI options (`round`, `initial`, `output`, `format`).
 * @throws Error describing the first violated rule.
 */
function validateRollbackOptions(historyPath, options) {
  const reject = (message) => {
    throw new Error(message);
  };

  if (!historyPath) {
    reject("History file path is required");
  }

  const hasRound = options.round !== void 0;
  const hasInitial = options.initial === true;
  if (!hasRound && !hasInitial) {
    reject("Either --round <n> or --initial is required");
  }
  if (hasRound && hasInitial) {
    reject("Cannot use both --round and --initial");
  }
  if (!options.output) {
    reject("--output <path> is required");
  }

  if (hasRound) {
    const parsedRound = parseInt(options.round, 10);
    const isValidRound = Number.isInteger(parsedRound) && parsedRound >= 1;
    if (!isValidRound) {
      reject(`Invalid round number: ${options.round}. Must be 1 or greater.`);
    }
  }

  const allowedFormats = ["json", "ts"];
  if (options.format && !allowedFormats.includes(options.format)) {
    reject(`Invalid format: ${options.format}. Use 'json' or 'ts'`);
  }
}
|
|
4633
|
+
/**
 * Returns the prompt snapshot stored for a given 1-based round number.
 *
 * @param {{ rounds: Array<{ promptSnapshot: * }> }} history - Improvement history.
 * @param {number} roundNumber - 1-based round number.
 * @returns {*} The `promptSnapshot` of the requested round.
 * @throws {Error} When the round number does not identify a stored round
 *   (out of range, fractional, or NaN). The message lists the available range.
 */
function extractPromptFromRound(history, roundNumber) {
  const roundCount = history.rounds.length;
  const roundIndex = roundNumber - 1;
  // NaN and fractional values compare false against both bounds, so they must be
  // rejected explicitly; otherwise the lookup below yields undefined and crashes
  // with an unhelpful TypeError instead of the "not found" message.
  const isStoredRound = Number.isInteger(roundIndex) && roundIndex >= 0 && roundIndex < roundCount;
  if (!isStoredRound) {
    const availableRounds = roundCount > 0 ? `1-${roundCount}` : "none";
    throw new Error(
      `Round ${roundNumber} not found. Available rounds: ${availableRounds}. Use --initial for the original prompt.`
    );
  }
  return history.rounds[roundIndex].promptSnapshot;
}
|
|
4643
|
+
/**
 * Serializes a prompt for writing to disk.
 *
 * @param {object} prompt - The prompt to serialize.
 * @param {string} format - `"json"` yields 2-space pretty-printed JSON; any
 *   other value yields the TypeScript module from `generateTypeScriptPrompt`.
 * @returns {string} The serialized prompt.
 */
function formatPromptOutput(prompt, format) {
  return format === "json"
    ? JSON.stringify(prompt, null, 2)
    : generateTypeScriptPrompt(prompt);
}
|
|
4649
|
+
/**
 * Renders a prompt as the source text of a TypeScript module exporting
 * `prompt: AgentPrompt<YourInputType>`.
 *
 * `system` and `userTemplate` are embedded as template literals (escaped via
 * `escapeTemplateString`); `id` and `version` are embedded as single-quoted
 * string literals and repeated in the header comment.
 *
 * @param {object} prompt - Prompt with `id`, `version`, `system`,
 *   `userTemplate` and optionally `customFields` (only its keys are listed,
 *   in the header comment).
 * @returns {string} TypeScript source text ending with a newline.
 */
function generateTypeScriptPrompt(prompt) {
  // Escape a value for a single-quoted TS string literal. Without this, an id
  // or version containing a quote, backslash or line break yields a file that
  // does not compile.
  const toSingleQuoted = (value) =>
    String(value)
      .replace(/\\/g, "\\\\")
      .replace(/'/g, "\\'")
      .replace(/\r/g, "\\r")
      .replace(/\n/g, "\\n");
  // Prevent a value from closing the surrounding block comment early.
  const toCommentSafe = (value) => String(value).replace(/\*\//g, "*\\/");

  const escapedSystem = escapeTemplateString(prompt.system);
  const escapedUserTemplate = escapeTemplateString(prompt.userTemplate);
  const customFieldsComment = prompt.customFields ? `
 * Custom fields: ${toCommentSafe(Object.keys(prompt.customFields).join(", "))}` : "";
  return `import { compileTemplate } from 'agent-eval'
import type { AgentPrompt } from 'agent-eval'

/**
 * Extracted from improvement cycle
 * Original ID: ${toCommentSafe(prompt.id)}
 * Version: ${toCommentSafe(prompt.version)}${customFieldsComment}
 */
export const prompt: AgentPrompt<YourInputType> = {
  id: '${toSingleQuoted(prompt.id)}',
  version: '${toSingleQuoted(prompt.version)}',
  system: \`${escapedSystem}\`,
  userTemplate: \`${escapedUserTemplate}\`,
  renderUserPrompt: compileTemplate(\`${escapedUserTemplate}\`),
}
`;
}
|
|
4671
|
+
/**
 * Escapes text so it can be placed between backticks in generated source.
 *
 * A backslash is inserted before every backslash, every backtick, and every
 * `${` sequence.
 *
 * @param {string} str - Raw text.
 * @returns {string} Text safe to embed inside a template literal.
 */
function escapeTemplateString(str) {
  // One pass over the three special sequences; "$&" re-inserts the match
  // after the added backslash.
  return str.replace(/[\\`]|\$\{/g, "\\$&");
}
|
|
4674
|
+
/**
 * Writes `content` to `path3` as UTF-8, creating the parent directory first
 * when it is missing.
 *
 * @param {string} path3 - Destination file path.
 * @param {string} content - Text to write.
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function writeOutputFile(path3, content) {
  const parentDir = dirname2(path3);
  // An empty parent, the current directory and the filesystem root never need creating.
  const isTrivialParent = !parentDir || parentDir === "." || parentDir === "/";
  if (!isTrivialParent && !existsSync6(parentDir)) {
    await mkdir3(parentDir, { recursive: true });
  }
  await writeFile4(path3, content, "utf-8");
}
|
|
4681
|
+
|
|
4682
|
+
// src/cli/index.ts
|
|
4683
|
+
// Version string reported by `--version`.
var VERSION = "0.1.0";
var cli = dist_default("agent-eval");

// Each action below exits with status 1 when its command rejects and prints
// nothing itself. NOTE(review): this relies on the command having reported the
// error already — confirm for runCommand and improveCommand.

// `run [config]`: execute the evaluation suite.
cli
  .command("run [config]", "Run evaluation suite")
  .option("-o, --output <path>", "Output path for markdown report")
  .option("-e, --env-file <path>", "Path to env file", { default: ".env" })
  .option("-v, --verbose", "Enable verbose output")
  .option("-c, --concurrency <n>", "Concurrency level")
  .option("-i, --iterations <n>", "Number of iterations per test")
  .option("--no-report", "Skip saving markdown report")
  .option("--mock", "Use mock LLM for testing (no API calls)")
  .option("--include <pattern>", "Glob patterns for YAML files (can be repeated)")
  .option("--tags <tag>", "Filter test cases by tags, OR logic (can be repeated)")
  .option("--agent <name>", "Filter to specific agent name")
  .action(async (configPath, options) => {
    try {
      await runCommand(configPath, options);
    } catch {
      process.exit(1);
    }
  });

// `improve [config]`: run the prompt improvement cycle.
cli
  .command("improve [config]", "Run improvement cycle on prompts")
  .option("-e, --env-file <path>", "Path to env file", { default: ".env" })
  .option("--history <path>", "Path to save history JSON")
  .option("--target-score <n>", "Target score to reach (0-100)")
  .option("--max-rounds <n>", "Maximum improvement rounds")
  .option("--max-cost <usd>", "Maximum cost in USD")
  .option("--stale-rounds <n>", "Stop after N rounds without improvement")
  .option("--resume <path>", "Resume from existing history file")
  .option("-c, --concurrency <n>", "Concurrency level")
  .option("-i, --iterations <n>", "Iterations per test")
  .option("-v, --verbose", "Enable verbose output")
  .option("--mock", "Use mock LLM for testing (no API calls)")
  .action(async (configPath, options) => {
    try {
      await improveCommand(configPath, options);
    } catch {
      process.exit(1);
    }
  });

// `rollback <history>`: extract a prompt from a saved improvement history.
cli
  .command("rollback <history>", "Extract prompt from improvement history")
  .option("-r, --round <n>", "Round number to extract (1, 2, ...)")
  .option("--initial", "Extract the initial prompt (before any improvements)")
  .option("-o, --output <path>", "Output file path")
  .option("-f, --format <type>", "Output format: json or ts", { default: "json" })
  .action(async (historyPath, options) => {
    try {
      await rollbackCommand(historyPath, options);
    } catch {
      process.exit(1);
    }
  });

// Register the built-in help and version output, then parse the command line.
cli.help();
cli.version(VERSION);
cli.parse();
|
|
4709
|
+
//# sourceMappingURL=cli.js.map
|