@agtlantis/eval 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +198 -0
- package/LICENSE +21 -0
- package/README.md +496 -0
- package/dist/cli.js +4709 -0
- package/dist/cli.js.map +1 -0
- package/dist/index.cjs +3998 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2738 -0
- package/dist/index.d.ts +2738 -0
- package/dist/index.js +3868 -0
- package/dist/index.js.map +1 -0
- package/package.json +101 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,3868 @@
|
|
|
1
|
+
// src/core/runner.ts
|
|
2
|
+
import { resolveFileSourcesInInput as resolveFileSourcesInInput2 } from "@agtlantis/core";
|
|
3
|
+
|
|
4
|
+
// src/multi-turn/types.ts
|
|
5
|
+
/**
 * Type guard for the "maxTurns" termination condition variant.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when the condition's `type` is exactly "maxTurns".
 */
function isMaxTurnsCondition(condition) {
  const { type } = condition;
  return type === "maxTurns";
}
|
|
8
|
+
/**
 * Type guard for the "fieldSet" termination condition variant.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when the condition's `type` is exactly "fieldSet".
 */
function isFieldSetCondition(condition) {
  const { type } = condition;
  return type === "fieldSet";
}
|
|
11
|
+
/**
 * Type guard for the "fieldValue" termination condition variant.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when the condition's `type` is exactly "fieldValue".
 */
function isFieldValueCondition(condition) {
  const { type } = condition;
  return type === "fieldValue";
}
|
|
14
|
+
/**
 * Type guard for the "custom" termination condition variant.
 *
 * @param {{ type: string }} condition - Termination condition to inspect.
 * @returns {boolean} True when the condition's `type` is exactly "custom".
 */
function isCustomCondition(condition) {
  const { type } = condition;
  return type === "custom";
}
|
|
17
|
+
/**
 * Reports whether a test case carries a `multiTurn` property (own or inherited).
 *
 * @param {object} testCase2 - Test case object to inspect.
 * @returns {boolean} True when the "multiTurn" key is present on the object.
 * @throws {TypeError} When `testCase2` is not an object.
 */
function isMultiTurnTestCase(testCase2) {
  return Reflect.has(testCase2, "multiTurn");
}
|
|
20
|
+
/**
 * Type guard telling a terminated check result apart from a continuing one.
 *
 * @param {{ terminated: unknown }} result - Termination check result.
 * @returns {boolean} True only when `result.terminated` is the boolean `true`.
 */
function isTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
|
|
23
|
+
|
|
24
|
+
// src/core/errors.ts
|
|
25
|
+
/**
 * String enum of the error codes an EvalError can carry.
 * Every member's value is identical to its key, and members are listed in
 * declaration order.
 */
var EvalErrorCode = Object.fromEntries(
  [
    "LLM_API_ERROR",
    "LLM_RATE_LIMIT",
    "LLM_TIMEOUT",
    "JSON_PARSE_ERROR",
    "VERDICT_PARSE_ERROR",
    "TEMPLATE_COMPILE_ERROR",
    "AGENT_EXECUTION_ERROR",
    "INVALID_CONFIG",
    "MISSING_API_KEY",
    "PROMPT_NOT_FOUND",
    "PROMPT_INVALID_FORMAT",
    "PROMPT_WRITE_ERROR",
    "PROMPT_READ_ERROR",
    "SUGGESTION_APPLY_ERROR",
    "SCHEMA_VALIDATION_ERROR",
    "SCHEMA_GENERATION_ERROR",
    "FILE_READ_ERROR",
    "FILE_WRITE_ERROR",
    "FILE_TOO_LARGE",
    "CONCURRENT_MODIFICATION",
    "UNKNOWN_ERROR",
  ].map((codeName) => [codeName, codeName])
);
|
|
49
|
+
/**
 * Error type carrying a machine-readable `code`, an optional underlying
 * `cause`, and an optional free-form `context` object.
 */
var EvalError = class _EvalError extends Error {
  code;
  cause;
  context;

  /**
   * @param {string} message - Human-readable description.
   * @param {{ code: string, cause?: Error, context?: object }} options - Required
   *   options bag; omitting it throws a TypeError.
   */
  constructor(message, options) {
    super(message);
    this.name = "EvalError";
    const { code, cause, context } = options;
    this.code = code;
    this.cause = cause;
    this.context = context;
    // Where available, drop the constructor frame from the captured stack.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, _EvalError);
    }
  }

  /**
   * Creates an EvalError from an unknown error with a specific code.
   * A value that is already an EvalError is returned untouched, so `code` and
   * `context` are ignored in that case. Non-Error values are stringified into
   * a new Error that becomes the cause.
   *
   * @param {unknown} error - Thrown value to wrap.
   * @param {string} code - Error code for the wrapper.
   * @param {object} [context] - Extra context for the wrapper.
   * @returns {EvalError} The original EvalError or a new wrapper.
   */
  static from(error, code, context) {
    if (error instanceof _EvalError) {
      return error;
    }
    let cause = error;
    if (!(cause instanceof Error)) {
      cause = new Error(String(error));
    }
    return new _EvalError(cause.message, { code, cause, context });
  }

  /**
   * JSON-friendly snapshot; the cause is reduced to its message.
   *
   * @returns {{ name: string, message: string, code: string, context: unknown, cause: (string|undefined) }}
   */
  toJSON() {
    const { name, message, code, context, cause } = this;
    return { name, message, code, context, cause: cause?.message };
  }
};
|
|
83
|
+
|
|
84
|
+
// src/multi-turn/termination.ts
|
|
85
|
+
/**
 * Reads a nested value using a dot-separated path such as "result.status".
 * Each path segment is used as a property key, so array indices work too
 * ("items.0").
 *
 * @param {unknown} obj - Root value to read from.
 * @param {string} fieldPath - Dot-separated property path.
 * @returns {unknown} The value at the path, or `undefined` when the root or
 *   any intermediate value is null/undefined or is not an object.
 */
function getFieldValue(obj, fieldPath) {
  if (obj == null) {
    return undefined;
  }
  let cursor = obj;
  for (const key of fieldPath.split(".")) {
    // Only objects can be descended into; primitives and functions end the walk.
    if (cursor == null || typeof cursor !== "object") {
      return undefined;
    }
    cursor = cursor[key];
  }
  return cursor;
}
|
|
102
|
+
/**
 * Reports whether a value is "set", i.e. neither `null` nor `undefined`.
 * Falsy values such as 0, "" and false count as set.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} False only for null and undefined.
 */
function isSet(value) {
  return !(value === null || value === undefined);
}
|
|
105
|
+
/**
 * Evaluates a "maxTurns" condition against the current turn number.
 *
 * @param {{ type: "maxTurns", count: number }} condition - Turn limit.
 * @param {{ currentTurn: number }} context - Conversation context.
 * @returns {object} A terminated result (terminationType "maxTurns") once
 *   `currentTurn >= count`, otherwise a non-terminated progress result.
 */
function checkMaxTurns(condition, context) {
  const limitReached = context.currentTurn >= condition.count;
  if (!limitReached) {
    return {
      terminated: false,
      reason: `Turn ${context.currentTurn} of ${condition.count}`,
    };
  }
  return {
    terminated: true,
    terminationType: "maxTurns",
    matchedCondition: condition,
    reason: `Maximum turns reached (${condition.count})`,
  };
}
|
|
120
|
+
/**
 * Evaluates a "fieldSet" condition: terminates when the field at
 * `condition.fieldPath` in the last output is neither null nor undefined.
 *
 * @param {{ type: "fieldSet", fieldPath: string }} condition - Field to probe.
 * @param {{ lastOutput: unknown }} context - Conversation context.
 * @returns {object} Terminated result (terminationType "condition") when the
 *   field is set, otherwise a non-terminated result.
 */
function checkFieldSet(condition, context) {
  const observed = getFieldValue(context.lastOutput, condition.fieldPath);
  if (observed === null || observed === undefined) {
    return {
      terminated: false,
      reason: `Field "${condition.fieldPath}" is not set`,
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `Field "${condition.fieldPath}" is set (value: ${JSON.stringify(observed)})`,
  };
}
|
|
136
|
+
/**
 * Evaluates a "fieldValue" condition: terminates when the field at
 * `condition.fieldPath` in the last output is strictly equal (`===`) to
 * `condition.expectedValue`.
 *
 * @param {{ type: "fieldValue", fieldPath: string, expectedValue: unknown }} condition
 * @param {{ lastOutput: unknown }} context - Conversation context.
 * @returns {object} Terminated result (terminationType "condition") on a
 *   match, otherwise a non-terminated result quoting the actual value.
 */
function checkFieldValue(condition, context) {
  const actual = getFieldValue(context.lastOutput, condition.fieldPath);
  return actual === condition.expectedValue
    ? {
        terminated: true,
        terminationType: "condition",
        matchedCondition: condition,
        reason: `Field "${condition.fieldPath}" equals expected value`,
      }
    : {
        terminated: false,
        reason: `Field "${condition.fieldPath}" does not equal expected value (got: ${JSON.stringify(actual)})`,
      };
}
|
|
152
|
+
/**
 * Evaluates a "custom" condition by awaiting its `check(context)` callback.
 * A callback that throws or rejects is not propagated: it is reported as a
 * non-terminated result whose reason carries the failure message.
 *
 * @param {{ type: "custom", check: Function, description?: string }} condition
 * @param {object} context - Conversation context handed to the callback.
 * @returns {Promise<object>} Terminated result (terminationType "condition")
 *   when the callback yields a truthy value, otherwise a non-terminated result.
 */
async function checkCustom(condition, context) {
  const label = condition.description ?? "Custom condition";
  let verdict;
  try {
    verdict = await condition.check(context);
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    return {
      terminated: false,
      reason: `${label} failed: ${detail}`,
    };
  }
  if (!verdict) {
    return {
      terminated: false,
      reason: `${label} not met`,
    };
  }
  return {
    terminated: true,
    terminationType: "condition",
    matchedCondition: condition,
    reason: `${label} met`,
  };
}
|
|
176
|
+
/**
 * Dispatches a termination condition to the checker matching its `type`.
 *
 * @param {{ type: string }} condition - Condition to evaluate.
 * @param {object} context - Conversation context passed to the checker.
 * @returns {Promise<object>} The checker's termination result.
 * @throws {EvalError} With code UNKNOWN_ERROR when `type` is not one of
 *   "maxTurns", "fieldValue", "fieldSet" or "custom".
 */
async function checkCondition(condition, context) {
  switch (condition.type) {
    case "maxTurns":
      return checkMaxTurns(condition, context);
    case "fieldValue":
      return checkFieldValue(condition, context);
    case "fieldSet":
      return checkFieldSet(condition, context);
    case "custom":
      return checkCustom(condition, context);
    default:
      break;
  }
  const unknownCondition = condition;
  throw new EvalError(`Unknown condition type: ${JSON.stringify(unknownCondition)}`, {
    code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
    context: { condition: unknownCondition },
  });
}
|
|
195
|
+
/**
 * Evaluates termination conditions one after another (OR semantics) and
 * returns the first result that reports termination. Later conditions are not
 * evaluated once one matches.
 *
 * @param {Array<object>} conditions - Conditions in priority order.
 * @param {object} context - Conversation context.
 * @returns {Promise<object>} The first terminated result, or a non-terminated
 *   result whose reason says whether the list was empty or nothing matched.
 */
async function checkTermination(conditions, context) {
  const nothingToCheck = conditions.length === 0;
  if (!nothingToCheck) {
    for (const candidate of conditions) {
      const outcome = await checkCondition(candidate, context);
      if (outcome.terminated) {
        return outcome;
      }
    }
  }
  return {
    terminated: false,
    reason: nothingToCheck
      ? "No termination conditions specified"
      : "No termination conditions met",
  };
}
|
|
213
|
+
|
|
214
|
+
// src/utils/json.ts
|
|
215
|
+
/**
 * Shortens a string for display.
 *
 * @param {string} str - Text to shorten; any falsy value yields "".
 * @param {number} maxLength - Number of leading characters to keep.
 * @returns {string} `str` unchanged when it fits, otherwise its first
 *   `maxLength` characters followed by "..." (so the result is up to three
 *   characters longer than `maxLength`).
 */
function truncate(str, maxLength) {
  if (!str) {
    return "";
  }
  const fits = str.length <= maxLength;
  return fits ? str : str.slice(0, maxLength) + "...";
}
|
|
224
|
+
|
|
225
|
+
// src/utils/condition-composites.ts
|
|
226
|
+
/**
 * Builds an async predicate that is true only when every condition
 * terminates. Conditions are checked sequentially and evaluation stops at the
 * first one that does not terminate; an empty list yields true.
 *
 * @param {Array<object>} conditions - Conditions that must all match.
 * @param {Function} checkFn - `(condition, context) => result` evaluator
 *   returning (or resolving to) an object with a `terminated` flag.
 * @returns {(context: object) => Promise<boolean>} The combined predicate.
 */
function createAndCheck(conditions, checkFn) {
  return async (context) => {
    for (const member of conditions) {
      const { terminated } = await checkFn(member, context);
      if (!terminated) {
        return false;
      }
    }
    return true;
  };
}
|
|
237
|
+
/**
 * Builds an async predicate that is true as soon as any condition terminates.
 * Conditions are checked sequentially and evaluation stops at the first
 * match; an empty list yields false.
 *
 * @param {Array<object>} conditions - Conditions of which one must match.
 * @param {Function} checkFn - `(condition, context) => result` evaluator
 *   returning (or resolving to) an object with a `terminated` flag.
 * @returns {(context: object) => Promise<boolean>} The combined predicate.
 */
function createOrCheck(conditions, checkFn) {
  return async (context) => {
    for (const member of conditions) {
      const { terminated } = await checkFn(member, context);
      if (terminated) {
        return true;
      }
    }
    return false;
  };
}
|
|
248
|
+
/**
 * Builds an async predicate that inverts a single condition.
 *
 * @param {object} condition - Condition to negate.
 * @param {Function} checkFn - `(condition, context) => result` evaluator
 *   returning (or resolving to) an object with a `terminated` flag.
 * @returns {(context: object) => Promise<boolean>} True when the wrapped
 *   condition did not terminate.
 */
function createNotCheck(condition, checkFn) {
  return async (context) => {
    const { terminated } = await checkFn(condition, context);
    return !terminated;
  };
}
|
|
254
|
+
/**
 * Formats the description of a composite condition, e.g.
 * "and(maxTurns, custom)".
 *
 * @param {string} type - Composite operator name.
 * @param {Array<{ type: string }>} conditions - Child conditions.
 * @returns {string} `type(childType, ...)`, or a note that the composite is
 *   empty and never terminates.
 */
function formatCompositeDescription(type, conditions) {
  if (conditions.length === 0) {
    return `${type}() - empty, never terminates`;
  }
  const childTypes = conditions.map((child) => child.type);
  return `${type}(${childTypes.join(", ")})`;
}
|
|
260
|
+
|
|
261
|
+
// src/multi-turn/conditions.ts
|
|
262
|
+
/**
 * Creates a "custom" termination condition that asks an LLM whether the
 * conversation should stop.
 *
 * The model receives the condition text, the serialized conversation history
 * and the current turn, and is instructed to answer "yes" or "no". The answer
 * counts as "yes" when, after any leading punctuation or whitespace, it
 * starts with the whole word "yes" (case-insensitive). That accepts `Yes.`,
 * `"yes"` and `**yes**`, but no longer accepts words that merely begin with
 * those letters, such as "yesterday".
 *
 * @param {object} options
 * @param {object} options.provider - Object exposing `simpleExecution(fn)`;
 *   `fn` receives a session with `generateText({ messages })`.
 * @param {string} options.prompt - Natural-language termination condition.
 * @param {string} [options.systemPrompt] - Overrides the default system prompt.
 * @returns {{ type: "custom", check: Function, description: string }}
 *   The `check` callback rejects when the execution fails or is canceled.
 */
function naturalLanguage(options) {
  const { provider, prompt, systemPrompt } = options;
  const defaultSystemPrompt = `You are an assistant that evaluates whether a conversation should terminate.
Analyze the conversation history and determine if the specified condition is met.
Respond with ONLY "yes" or "no" - nothing else.`;
  // Optional leading non-word characters (quotes, markdown, spaces), then "yes" as a whole word.
  const affirmativePattern = /^\W*yes\b/i;
  return {
    type: "custom",
    check: async (context) => {
      const historyText = context.history.map(
        (h) => `Turn ${h.turn}:
Input: ${JSON.stringify(h.input)}
Output: ${JSON.stringify(h.output)}`
      ).join("\n\n");
      const userPrompt = `## Termination Condition
${prompt}

## Conversation History
${historyText || "(No history yet)"}

## Current Turn
Turn: ${context.currentTurn}
Last Output: ${JSON.stringify(context.lastOutput)}

Should the conversation terminate based on the condition above? Answer "yes" or "no" only.`;
      const execution = provider.simpleExecution(async (session) => {
        const result = await session.generateText({
          messages: [
            { role: "system", content: systemPrompt ?? defaultSystemPrompt },
            { role: "user", content: userPrompt },
          ],
        });
        return result.text;
      });
      const executionResult = await execution.result();
      if (executionResult.status !== "succeeded") {
        throw executionResult.status === "failed"
          ? executionResult.error
          : new Error("Execution was canceled");
      }
      return affirmativePattern.test(executionResult.value);
    },
    description: `NL: ${truncate(prompt, 50)}`,
  };
}
|
|
306
|
+
/**
 * Combines conditions into one "custom" condition that terminates only when
 * all of them terminate. Called with no conditions, the result never
 * terminates.
 *
 * @param {...object} conditions - Conditions that must all match.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function and(...conditions) {
  const isEmpty = conditions.length === 0;
  const check = isEmpty ? () => false : createAndCheck(conditions, checkCondition);
  const description = formatCompositeDescription("and", isEmpty ? [] : conditions);
  return { type: "custom", check, description };
}
|
|
320
|
+
/**
 * Combines conditions into one "custom" condition that terminates as soon as
 * any of them terminates. Called with no conditions, the result never
 * terminates.
 *
 * @param {...object} conditions - Conditions of which one must match.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function or(...conditions) {
  const isEmpty = conditions.length === 0;
  const check = isEmpty ? () => false : createOrCheck(conditions, checkCondition);
  const description = formatCompositeDescription("or", isEmpty ? [] : conditions);
  return { type: "custom", check, description };
}
|
|
334
|
+
/**
 * Wraps a condition in a "custom" condition that terminates exactly when the
 * wrapped condition does not.
 *
 * @param {{ type: string }} condition - Condition to negate.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function not(condition) {
  const check = createNotCheck(condition, checkCondition);
  const description = `not(${condition.type})`;
  return { type: "custom", check, description };
}
|
|
341
|
+
/**
 * Creates a "custom" condition that matches once the current turn number has
 * reached `count`. Being a custom condition, it can be nested inside
 * and()/or()/not().
 *
 * @param {number} count - Turn number from which the condition matches.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function afterTurns(count) {
  const check = (context) => context.currentTurn >= count;
  const description = `afterTurns(${count})`;
  return { type: "custom", check, description };
}
|
|
348
|
+
/**
 * Creates a "custom" condition that matches when the field at `fieldPath` in
 * the last output equals `expectedValue`. The comparison is delegated to a
 * "fieldValue" condition evaluated through checkCondition.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @param {unknown} expectedValue - Value the field must equal.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function fieldEquals(fieldPath, expectedValue) {
  const check = async (context) => {
    const delegate = { type: "fieldValue", fieldPath, expectedValue };
    const { terminated } = await checkCondition(delegate, context);
    return terminated;
  };
  return {
    type: "custom",
    check,
    description: `fieldEquals(${fieldPath}, ${JSON.stringify(expectedValue)})`,
  };
}
|
|
361
|
+
/**
 * Creates a "custom" condition that matches when the field at `fieldPath` in
 * the last output is set. The test is delegated to a "fieldSet" condition
 * evaluated through checkCondition.
 *
 * @param {string} fieldPath - Dot-separated path into the last output.
 * @returns {{ type: "custom", check: Function, description: string }}
 */
function fieldIsSet(fieldPath) {
  const check = async (context) => {
    const delegate = { type: "fieldSet", fieldPath };
    const { terminated } = await checkCondition(delegate, context);
    return terminated;
  };
  return {
    type: "custom",
    check,
    description: `fieldIsSet(${fieldPath})`,
  };
}
|
|
371
|
+
|
|
372
|
+
// src/multi-turn/runner.ts
|
|
373
|
+
import { resolveFileSourcesInInput } from "@agtlantis/core";
|
|
374
|
+
// Safety limit on the number of turns, used when a test case sets no `multiTurn.maxTurns`.
var DEFAULT_MAX_TURNS = 10;
// Outcome used when a termination condition matches and `multiTurn.onConditionMet` is unset.
var DEFAULT_ON_CONDITION_MET = "pass";
// Outcome used when the turn limit is reached and `multiTurn.onMaxTurnsReached` is unset.
var DEFAULT_ON_MAX_TURNS_REACHED = "fail";
|
|
377
|
+
/**
 * Sums per-turn token usage records into a single total.
 *
 * @param {Array<{ inputTokens: number, outputTokens: number, totalTokens: number }>} usages
 * @returns {{ inputTokens: number, outputTokens: number, totalTokens: number }}
 *   A new object with field-wise sums; all zeros for an empty list.
 */
function aggregateTokenUsage(usages) {
  let inputTokens = 0;
  let outputTokens = 0;
  let totalTokens = 0;
  usages.forEach((usage) => {
    inputTokens = inputTokens + usage.inputTokens;
    outputTokens = outputTokens + usage.outputTokens;
    totalTokens = totalTokens + usage.totalTokens;
  });
  return { inputTokens, outputTokens, totalTokens };
}
|
|
387
|
+
/**
 * Determines how many turns may run at most: the smaller of the safety limit
 * and the `count` of the first top-level "maxTurns" condition, if one exists.
 *
 * @param {Array<{ type: string, count?: number }>} conditions - Termination conditions.
 * @param {number} safetyLimit - Upper bound that always applies.
 * @returns {number} The effective turn limit.
 */
function getEffectiveMaxTurns(conditions, safetyLimit) {
  const explicitLimit = conditions.find((candidate) => candidate.type === "maxTurns");
  return explicitLimit ? Math.min(explicitLimit.count, safetyLimit) : safetyLimit;
}
|
|
394
|
+
/**
 * Produces the concrete input for a follow-up turn. A static `input` is
 * returned as is; a function `input` is called with the conversation context
 * and its (possibly asynchronous) result is used.
 *
 * @param {{ input: unknown }} followUpInput - Follow-up definition.
 * @param {object} context - Conversation context passed to input functions.
 * @returns {Promise<unknown>} The resolved input value.
 */
async function resolveInput(followUpInput, context) {
  const { input } = followUpInput;
  if (typeof input !== "function") {
    return input;
  }
  return await input(context);
}
|
|
402
|
+
/**
 * Assembles the conversation context handed to termination checks and
 * dynamic follow-up inputs.
 *
 * @param {number} currentTurn - Turn number the context describes.
 * @param {Array<{ output: unknown }>} history - Completed turns so far.
 * @returns {{ currentTurn: number, history: Array, lastOutput: unknown }}
 *   `lastOutput` is the most recent turn's output, or undefined when the
 *   history is empty.
 */
function buildContext(currentTurn, history) {
  let lastOutput;
  if (history.length > 0) {
    lastOutput = history[history.length - 1].output;
  }
  return { currentTurn, history, lastOutput };
}
|
|
409
|
+
/**
 * Finds the follow-up definition that covers a given follow-up position.
 * Each definition occupies `turns` consecutive positions (1 when omitted); a
 * non-finite `turns` covers every position from its start onwards.
 *
 * @param {Array<{ turns?: number }>} followUpInputs - Ordered definitions.
 * @param {number} followUpIndex - Zero-based follow-up position.
 * @returns {object|null} The covering definition, or null when the position
 *   lies beyond all definitions.
 */
function getFollowUpInput(followUpInputs, followUpIndex) {
  let windowStart = 0;
  for (const candidate of followUpInputs) {
    const span = candidate.turns ?? 1;
    const windowEnd = windowStart + span;
    const coversRemainder = !Number.isFinite(span) && followUpIndex >= windowStart;
    if (coversRemainder || followUpIndex < windowEnd) {
      return candidate;
    }
    windowStart = windowEnd;
  }
  return null;
}
|
|
423
|
+
/**
 * Validates the `turns` setting of every follow-up input.
 *
 * An omitted `turns` is always accepted. Otherwise it must be a number that
 * is at least 1, with Infinity allowed. NaN is rejected explicitly: it slips
 * past a plain `< 1` comparison and would then be handled like Infinity
 * because it is not finite. An infinite `turns` is only allowed on the last
 * entry, since any later entry could never be reached.
 *
 * @param {Array<{ turns?: number, description?: string }>} followUpInputs
 * @returns {void}
 * @throws {EvalError} With code INVALID_CONFIG on the first invalid entry.
 */
function validateFollowUpInputs(followUpInputs) {
  for (let i = 0; i < followUpInputs.length; i++) {
    const followUp = followUpInputs[i];
    const { turns } = followUp;
    if (turns === undefined) {
      continue;
    }
    const isValidCount = typeof turns === "number" && !Number.isNaN(turns) && turns >= 1;
    if (!isValidCount) {
      throw new EvalError("turns must be a positive number or Infinity", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: {
          description: followUp.description,
          turns,
        },
      });
    }
    // After the check above, a non-finite value can only be +Infinity.
    if (!Number.isFinite(turns) && i < followUpInputs.length - 1) {
      throw new EvalError(
        "turns: Infinity must be the last followUpInput (subsequent items would be unreachable)",
        {
          code: "INVALID_CONFIG" /* INVALID_CONFIG */,
          context: {
            description: followUp.description,
            position: i,
            totalItems: followUpInputs.length,
          },
        }
      );
    }
  }
}
|
|
453
|
+
/**
 * Determines the input for a given turn. Turn 1 uses the test case's own
 * input; later turns use the follow-up definition at position `turn - 2`,
 * resolved against the conversation so far.
 *
 * @param {number} turn - One-based turn number.
 * @param {unknown} testCaseInput - Input of the test case (used on turn 1).
 * @param {Array<object>} followUpInputs - Ordered follow-up definitions.
 * @param {Array<object>} conversationHistory - Completed turns so far.
 * @returns {Promise<{ type: "success", input: unknown } | { type: "exhausted" }>}
 *   "exhausted" when no follow-up covers the turn.
 */
async function getTurnInput(turn, testCaseInput, followUpInputs, conversationHistory) {
  if (turn === 1) {
    return { type: "success", input: testCaseInput };
  }
  const matchingFollowUp = getFollowUpInput(followUpInputs, turn - 2);
  if (!matchingFollowUp) {
    return { type: "exhausted" };
  }
  const turnContext = buildContext(turn, conversationHistory);
  return { type: "success", input: await resolveInput(matchingFollowUp, turnContext) };
}
|
|
466
|
+
/**
 * Type guard for the outcome executeSingleTurn returns when file sources in
 * the input could not be resolved.
 *
 * @param {object} result - Outcome returned by executeSingleTurn.
 * @returns {boolean} True when `result.type` is "fileResolutionError".
 */
function isFileResolutionError(result) {
  if (!Reflect.has(result, "type")) {
    return false;
  }
  return result.type === "fileResolutionError";
}
|
|
469
|
+
/**
 * Runs one conversation turn: resolves file sources in the input (relative to
 * the current working directory), executes the agent and measures how long
 * the agent call took.
 *
 * Neither failure mode throws. A file-resolution failure is returned as
 * `{ type: "fileResolutionError", reason }`; an agent failure is returned in
 * the `error` field as an EvalError with code AGENT_EXECUTION_ERROR.
 *
 * @param {unknown} input - Turn input, possibly containing file sources.
 * @param {{ execute: Function, config: { name: string } }} agent - Agent under test.
 * @param {string} testCaseId - Identifier recorded in the error context.
 * @param {number} turn - One-based turn number.
 * @returns {Promise<object>} Either the file-resolution error object or
 *   `{ output, metadata, latencyMs, error }`.
 */
async function executeSingleTurn(input, agent, testCaseId, turn) {
  let preparedInput;
  try {
    preparedInput = await resolveFileSourcesInInput(input, { basePath: process.cwd() });
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    return {
      type: "fileResolutionError",
      reason: `FileSource resolution failed on turn ${turn}: ${detail}`,
    };
  }

  // Latency covers only the agent call, not file resolution.
  const startedAt = performance.now();
  let output;
  let metadata;
  let error;
  try {
    const response = await agent.execute(preparedInput);
    output = response.result;
    metadata = response.metadata;
  } catch (failure) {
    const errorContext = { testCaseId, turn, agentName: agent.config.name };
    error = EvalError.from(failure, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, errorContext);
  }
  const latencyMs = performance.now() - startedAt;
  return { output, metadata, latencyMs, error };
}
|
|
499
|
+
/**
 * Translates how a conversation ended into a pass/fail verdict.
 *
 * - not terminated: pass
 * - "error" or "exhausted": fail
 * - "maxTurns": pass only when `onMaxTurnsReached` is "pass"
 * - "condition": pass only when `onConditionMet` is "pass"
 * - any other termination type: pass
 *
 * @param {{ terminated: boolean, terminationType?: string }} termination
 * @param {string} onConditionMet - Configured outcome for a matched condition.
 * @param {string} onMaxTurnsReached - Configured outcome for the turn limit.
 * @returns {boolean} Whether the termination itself counts as passing.
 */
function determinePassFromTermination(termination, onConditionMet, onMaxTurnsReached) {
  if (termination.terminated !== true) {
    return true;
  }
  const kind = termination.terminationType;
  if (kind === "error" || kind === "exhausted") {
    return false;
  }
  if (kind === "maxTurns") {
    return onMaxTurnsReached === "pass";
  }
  if (kind === "condition") {
    return onConditionMet === "pass";
  }
  return true;
}
|
|
515
|
+
/**
 * Runs a multi-turn test case: executes the agent turn by turn until a
 * termination condition matches, follow-up inputs run out, an error occurs or
 * the turn limit is reached, then lets the judge evaluate the last output.
 *
 * The result passes only when both the termination outcome (per
 * `onConditionMet` / `onMaxTurnsReached`) and the judge pass. The judge is
 * invoked for every way the loop can end, including error terminations.
 *
 * @param {object} testCase2 - Test case with `input`, `multiTurn` and
 *   optional `id` / `files`.
 * @param {{ agent: object, judge: object, agentDescription: unknown }} context
 * @param {{ signal?: AbortSignal }} [options] - The signal is checked at the
 *   start of every turn.
 * @returns {Promise<object>} Result with output, metrics, verdicts, score,
 *   pass flag, conversation history and termination details.
 * @throws {EvalError} AGENT_EXECUTION_ERROR when aborted; INVALID_CONFIG for
 *   invalid follow-up inputs.
 */
async function executeMultiTurnTestCase(testCase2, context, options) {
  const { agent, judge, agentDescription } = context;
  const { multiTurn } = testCase2;
  const signal = options?.signal;

  const turnLimit = getEffectiveMaxTurns(
    multiTurn.terminateWhen,
    multiTurn.maxTurns ?? DEFAULT_MAX_TURNS
  );
  const onConditionMet = multiTurn.onConditionMet ?? DEFAULT_ON_CONDITION_MET;
  const onMaxTurnsReached = multiTurn.onMaxTurnsReached ?? DEFAULT_ON_MAX_TURNS_REACHED;
  const followUpInputs = multiTurn.followUpInputs ?? [];
  validateFollowUpInputs(followUpInputs);

  const conversationHistory = [];
  const usagePerTurn = [];
  let elapsedMs = 0;
  let termination = {
    terminated: false,
    reason: "Execution not started",
  };

  let turn = 1;
  while (turn <= turnLimit) {
    if (signal?.aborted) {
      throw new EvalError("Multi-turn test execution aborted", {
        code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
        context: { testCaseId: testCase2.id, turn, reason: "aborted" },
      });
    }

    const nextInput = await getTurnInput(turn, testCase2.input, followUpInputs, conversationHistory);
    if (nextInput.type === "exhausted") {
      termination = {
        terminated: true,
        terminationType: "exhausted",
        reason: "All follow-up inputs exhausted",
      };
      break;
    }

    const { input } = nextInput;
    const turnOutcome = await executeSingleTurn(input, agent, testCase2.id ?? "unknown", turn);
    if (isFileResolutionError(turnOutcome)) {
      // The turn never reached the agent, so nothing is recorded in the history.
      termination = {
        terminated: true,
        terminationType: "error",
        reason: turnOutcome.reason,
      };
      break;
    }

    const {
      output: agentOutput,
      metadata: agentMetadata,
      latencyMs,
      error: agentError,
    } = turnOutcome;
    elapsedMs += latencyMs;
    usagePerTurn.push(
      agentMetadata?.tokenUsage ?? { inputTokens: 0, outputTokens: 0, totalTokens: 0 }
    );
    // A failed turn is still recorded before the loop stops.
    conversationHistory.push({
      turn,
      input,
      output: agentOutput,
      metadata: agentMetadata,
    });

    if (agentError) {
      termination = {
        terminated: true,
        terminationType: "error",
        reason: `Agent execution failed on turn ${turn}: ${agentError.message}`,
      };
      break;
    }

    termination = await checkTermination(
      multiTurn.terminateWhen,
      buildContext(turn, conversationHistory)
    );
    if (termination.terminated) {
      break;
    }

    if (turn >= turnLimit) {
      termination = {
        terminated: true,
        terminationType: "maxTurns",
        matchedCondition: { type: "maxTurns", count: turnLimit },
        reason: `Maximum turns reached (${turnLimit})`,
      };
      break;
    }
    turn += 1;
  }

  const metrics = {
    latencyMs: elapsedMs,
    tokenUsage: aggregateTokenUsage(usagePerTurn),
  };
  const finalOutput = conversationHistory[conversationHistory.length - 1]?.output;
  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output: finalOutput,
    agentDescription,
    files: testCase2.files,
  });
  const terminationPassed = determinePassFromTermination(
    termination,
    onConditionMet,
    onMaxTurnsReached
  );

  return {
    testCase: testCase2,
    output: finalOutput,
    metrics,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: terminationPassed && judgeResult.passed,
    judgeMetadata: judgeResult.metadata,
    conversationHistory,
    termination,
    totalTurns: conversationHistory.length,
  };
}
|
|
639
|
+
|
|
640
|
+
// src/multi-turn/ai-user.ts
|
|
641
|
+
var DEFAULT_SYSTEM_PROMPT = `You are simulating a realistic user in a conversation with an AI assistant.
|
|
642
|
+
|
|
643
|
+
## Your Role
|
|
644
|
+
Generate natural, context-appropriate user messages based on the conversation history.
|
|
645
|
+
|
|
646
|
+
## Guidelines
|
|
647
|
+
|
|
648
|
+
1. **Stay in Character**: Respond as a real user would - with natural language, occasional typos, or casual phrasing when appropriate.
|
|
649
|
+
|
|
650
|
+
2. **Be Goal-Oriented**: Users have objectives. Pursue them logically based on the conversation context:
|
|
651
|
+
- If the assistant asks a question, provide a reasonable answer
|
|
652
|
+
- If clarification is needed, ask for it naturally
|
|
653
|
+
- If a task is progressing, guide it toward completion
|
|
654
|
+
|
|
655
|
+
3. **React Appropriately**: Respond to what the assistant says:
|
|
656
|
+
- Acknowledge when the assistant is helpful
|
|
657
|
+
- Express confusion if the response is unclear
|
|
658
|
+
- Correct misunderstandings if they occur
|
|
659
|
+
|
|
660
|
+
4. **Keep It Realistic**: Real users:
|
|
661
|
+
- Don't always provide perfect information upfront
|
|
662
|
+
- May change their mind or add requirements
|
|
663
|
+
- Sometimes need time to think or decide
|
|
664
|
+
|
|
665
|
+
## Output Format
|
|
666
|
+
Respond with ONLY the user's message. No additional formatting, explanation, or meta-commentary.`;
|
|
667
|
+
/**
 * Creates a dynamic follow-up input function that lets an LLM play the user.
 *
 * On each call the returned function formats the conversation history, asks
 * the model for the next user message and converts the model's text into an
 * agent input through `buildInput`.
 *
 * @param {object} options
 * @param {object} options.provider - Object exposing `simpleExecution(fn)`;
 *   `fn` receives a session with `generateText({ messages })`.
 * @param {string|Function} [options.systemPrompt] - Fixed prompt or
 *   `(context) => prompt`; DEFAULT_SYSTEM_PROMPT is used when omitted.
 * @param {Function} [options.formatHistory] - `(context) => string`; by
 *   default each turn is rendered as "[Turn N]" with JSON input and output.
 * @param {Function} options.buildInput - `(responseText, context) => input`.
 * @returns {(context: object) => Promise<unknown>} Input generator that
 *   rejects when the execution fails or is canceled.
 */
function aiUser(options) {
  const { provider, systemPrompt, formatHistory, buildInput } = options;

  const renderTurn = (entry, index) => `[Turn ${index + 1}]
User: ${JSON.stringify(entry.input)}
Assistant: ${JSON.stringify(entry.output)}`;
  const defaultFormatHistory = (ctx) => ctx.history.map(renderTurn).join("\n\n");

  return async (context) => {
    const renderHistory = formatHistory ?? defaultFormatHistory;
    const historyText = renderHistory(context);

    let resolvedSystemPrompt;
    if (typeof systemPrompt === "function") {
      resolvedSystemPrompt = systemPrompt(context);
    } else {
      resolvedSystemPrompt = systemPrompt ?? DEFAULT_SYSTEM_PROMPT;
    }

    let userPrompt;
    if (historyText) {
      userPrompt = `## Conversation History
${historyText}

## Your Task
Generate the next user message based on the conversation above:`;
    } else {
      // Empty history: ask for an opening message instead.
      userPrompt = `## Your Task
This is the start of a new conversation. Generate an appropriate opening message from the user:`;
    }

    const execution = provider.simpleExecution(async (session) => {
      const { text } = await session.generateText({
        messages: [
          { role: "system", content: resolvedSystemPrompt },
          { role: "user", content: userPrompt },
        ],
      });
      return text;
    });
    const executionResult = await execution.result();
    if (executionResult.status !== "succeeded") {
      throw executionResult.status === "failed"
        ? executionResult.error
        : new Error("Execution was canceled");
    }
    return buildInput(executionResult.value, context);
  };
}
|
|
700
|
+
|
|
701
|
+
// src/utils/semaphore.ts
|
|
702
|
+
/**
 * Creates a counting semaphore that allows at most `limit` holders at once.
 *
 * `acquire()` resolves immediately while a slot is free; otherwise the caller
 * queues and is resumed in FIFO order. `release()` hands the slot straight to
 * the oldest waiter when there is one and frees it otherwise. A surplus
 * `release()` (no matching `acquire()`) is ignored instead of driving the
 * counter negative, which would otherwise let more than `limit` holders in.
 *
 * @param {number} limit - Maximum number of concurrent holders.
 * @returns {{ acquire: () => Promise<void>, release: () => void }}
 */
function createSemaphore(limit) {
  let running = 0;
  const waiting = [];
  return {
    async acquire() {
      if (running < limit) {
        running++;
        return;
      }
      return new Promise((resolve2) => {
        waiting.push(resolve2);
      });
    },
    release() {
      const next = waiting.shift();
      if (next) {
        // The slot passes directly to the waiter, so `running` is unchanged.
        next();
        return;
      }
      if (running > 0) {
        running--;
      }
    },
  };
}
|
|
725
|
+
|
|
726
|
+
// src/core/constants.ts
|
|
727
|
+
// Numeric constants for evaluation scoring: the score range, the default pass
// threshold and the majority threshold.
var SCORE = {
  /** Minimum possible score */
  MIN: 0,
  /** Maximum possible score */
  MAX: 100,
  /** Default threshold for passing evaluation */
  DEFAULT_PASS_THRESHOLD: 70,
  /** Threshold for majority-based pass determination (50%) */
  MAJORITY_PASS_THRESHOLD: 0.5
};
|
|
737
|
+
// Token-usage record with every counter at zero. executeTestCase starts from it
// when the agent reports no usage, and createFailedResult reports it for runs
// that never reached the agent.
var ZERO_TOKEN_USAGE = {
  inputTokens: 0,
  outputTokens: 0,
  totalTokens: 0
};
|
|
742
|
+
|
|
743
|
+
// src/core/runner.ts
|
|
744
|
+
/**
 * Runs a single-turn test case: resolves file sources in the input, executes
 * the agent, then has the judge evaluate the output.
 *
 * File-resolution and agent failures do not throw; they produce a failed
 * result (`passed: false`, `overallScore: 0`, empty verdicts) carrying the
 * EvalError, and the judge is not called. When the agent reports no token
 * usage, the result gets its own zeroed usage object rather than a reference
 * to the shared ZERO_TOKEN_USAGE constant, so callers that mutate a result's
 * metrics cannot change the constant.
 *
 * @param {object} testCase2 - Test case with `input` and optional `id` / `files`.
 * @param {{ agent: object, judge: object, agentDescription: unknown }} context
 * @param {AbortSignal} [signal] - Checked before the agent runs and again
 *   before the judge runs.
 * @returns {Promise<object>} Result object with `kind: "single-turn"`.
 * @throws {EvalError} AGENT_EXECUTION_ERROR when the signal is aborted.
 */
async function executeTestCase(testCase2, context, signal) {
  const { agent, judge, agentDescription } = context;
  if (signal?.aborted) {
    throw new EvalError("Test execution aborted", {
      code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
      context: { testCaseId: testCase2.id, reason: "aborted" },
    });
  }

  let resolvedInput;
  try {
    resolvedInput = await resolveFileSourcesInInput2(testCase2.input, {
      basePath: process.cwd(),
    });
  } catch (e) {
    const error2 = EvalError.from(e, "FILE_READ_ERROR" /* FILE_READ_ERROR */, {
      testCaseId: testCase2.id,
      agentName: agent.config.name,
    });
    return createFailedResult(testCase2, error2);
  }

  // Latency covers only the agent call, not file resolution or judging.
  const startTime = performance.now();
  let output;
  // Fresh copy per result: never hand out the shared constant itself.
  let tokenUsage = { ...ZERO_TOKEN_USAGE };
  let error;
  try {
    const agentResult = await agent.execute(resolvedInput);
    output = agentResult.result;
    if (agentResult.metadata?.tokenUsage) {
      tokenUsage = agentResult.metadata.tokenUsage;
    }
  } catch (e) {
    error = EvalError.from(e, "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */, {
      testCaseId: testCase2.id,
      agentName: agent.config.name,
    });
    // Discard any output captured before the failure.
    output = undefined;
  }
  const latencyMs = performance.now() - startTime;
  const metrics = { latencyMs, tokenUsage };
  const testResult = { testCase: testCase2, output, metrics, error };

  if (error) {
    return {
      kind: "single-turn",
      ...testResult,
      verdicts: [],
      overallScore: 0,
      passed: false,
      judgeMetadata: undefined,
    };
  }

  if (signal?.aborted) {
    throw new EvalError("Test execution aborted before evaluation", {
      code: "AGENT_EXECUTION_ERROR" /* AGENT_EXECUTION_ERROR */,
      context: { testCaseId: testCase2.id, reason: "aborted" },
    });
  }

  const judgeResult = await judge.evaluate({
    input: testCase2.input,
    output,
    agentDescription,
    files: testCase2.files,
  });
  return {
    kind: "single-turn",
    ...testResult,
    verdicts: judgeResult.verdicts,
    overallScore: judgeResult.overallScore,
    passed: judgeResult.passed,
    judgeMetadata: judgeResult.metadata,
  };
}
|
|
815
|
+
/**
 * Builds the single-turn result recorded for a test case that failed before
 * the agent produced any output.
 *
 * @param testCase2 - The test case that could not be executed.
 * @param error - The error explaining the failure.
 * @returns A failed "single-turn" result with zeroed metrics, no verdicts and a score of 0.
 */
function createFailedResult(testCase2, error) {
  // Nothing ran, so latency and token usage are reported as zero.
  const emptyMetrics = { latencyMs: 0, tokenUsage: ZERO_TOKEN_USAGE };
  const failedResult = {
    kind: "single-turn",
    testCase: testCase2,
    output: void 0,
    metrics: emptyMetrics,
    error,
    verdicts: [],
    overallScore: 0,
    passed: false,
    judgeMetadata: void 0
  };
  return failedResult;
}
|
|
828
|
+
/**
 * Converts the raw outcome of a multi-turn execution into the "multi-turn"
 * result shape used by the runner.
 *
 * @param result - Multi-turn execution outcome; must carry a `termination` object.
 * @returns A new result object tagged `kind: "multi-turn"`, with
 *   `terminationReason` copied from `termination.reason`.
 */
function toMultiTurnResult(result) {
  const {
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    termination
  } = result;
  return {
    kind: "multi-turn",
    testCase,
    output,
    metrics,
    verdicts,
    overallScore,
    passed,
    judgeMetadata,
    conversationHistory,
    totalTurns,
    // Surfaced at the top level in addition to the full termination object.
    terminationReason: termination.reason,
    termination
  };
}
|
|
844
|
+
/**
 * Executes test cases with a bounded number of concurrent executions.
 *
 * Results are stored by the index of their test case and holes are removed
 * before returning, so when the run stops early the returned array is
 * shorter than `testCases2` and only contains completed test cases.
 *
 * @param testCases2 - Test cases to execute.
 * @param context - Execution context forwarded to `executeTestCaseByType`.
 * @param options - `concurrency` (default 1), `stopOnFirstFailure`
 *   (default false) and an optional external abort `signal`.
 * @returns The results of every test case that completed.
 * @throws EvalError with code INVALID_CONFIG when `concurrency` is NaN or below 1.
 * @throws The first non-abort error raised by a test case execution.
 */
async function runWithConcurrency(testCases2, context, options = {}) {
  const { concurrency = 1, stopOnFirstFailure = false, signal } = options;
  // `NaN < 1` is false, so NaN has to be rejected explicitly; otherwise it
  // would be handed to the semaphore as its limit.
  if (Number.isNaN(concurrency) || concurrency < 1) {
    throw new EvalError("Concurrency must be at least 1", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { concurrency }
    });
  }
  if (testCases2.length === 0) {
    return [];
  }
  const semaphore = createSemaphore(concurrency);
  const results = [];
  let shouldStop = false;
  let firstError;
  // Internal controller so that a failure or an external abort can cancel
  // the executions that are already in flight.
  const internalAbort = new AbortController();
  const propagateExternalAbort = () => {
    shouldStop = true;
    internalAbort.abort();
  };
  signal?.addEventListener("abort", propagateExternalAbort);
  if (signal?.aborted) {
    // The "abort" event will not fire again for an already-aborted signal.
    shouldStop = true;
  }
  try {
    const executeOne = async (testCase2, index) => {
      if (shouldStop) return;
      await semaphore.acquire();
      try {
        // Re-check: the run may have been stopped while waiting for a slot.
        if (shouldStop) return;
        const result = await executeTestCaseByType(testCase2, context, internalAbort.signal);
        results[index] = result;
        if (stopOnFirstFailure && !result.passed) {
          shouldStop = true;
          internalAbort.abort();
        }
      } catch (e) {
        // Abort errors are a consequence of stopping, not a failure to report.
        if (!firstError && !isAbortError(e)) {
          firstError = e instanceof Error ? e : new Error(String(e));
        }
        shouldStop = true;
        internalAbort.abort();
      } finally {
        semaphore.release();
      }
    };
    const promises = testCases2.map((tc, i) => executeOne(tc, i));
    await Promise.allSettled(promises);
    if (firstError) {
      throw firstError;
    }
    // Drop the slots of test cases that never ran or never finished.
    return results.filter((r) => r !== void 0);
  } finally {
    signal?.removeEventListener("abort", propagateExternalAbort);
  }
}
|
|
900
|
+
/**
 * Tells whether a thrown value represents a cancellation rather than a
 * genuine failure.
 *
 * Recognised forms:
 * - any `Error` (including `DOMException`) whose `name` is "AbortError";
 * - an `EvalError` whose `context.reason` is "aborted".
 *
 * @param e - The caught value.
 * @returns `true` when `e` represents an abort.
 */
function isAbortError(e) {
  // Not every abort is a DOMException (Node's own AbortError is a plain
  // Error subclass), so match on the error name.
  if (e instanceof Error && e.name === "AbortError") {
    return true;
  }
  // Guarded lookup: referencing a missing DOMException global would throw.
  if (typeof DOMException !== "undefined" && e instanceof DOMException && e.name === "AbortError") {
    return true;
  }
  return e instanceof EvalError && e.context?.reason === "aborted";
}
|
|
903
|
+
/**
 * Dispatches a test case to the single-turn or multi-turn executor.
 *
 * @param testCase2 - The test case to run.
 * @param context - Execution context passed through to the executor.
 * @param signal - Abort signal passed through to the executor.
 * @returns The single-turn result, or the multi-turn outcome converted with
 *   `toMultiTurnResult`.
 */
async function executeTestCaseByType(testCase2, context, signal) {
  if (!isMultiTurnTestCase(testCase2)) {
    return executeTestCase(testCase2, context, signal);
  }
  const conversationOutcome = await executeMultiTurnTestCase(testCase2, context, { signal });
  return toMultiTurnResult(conversationOutcome);
}
|
|
910
|
+
|
|
911
|
+
// src/core/types.ts
|
|
912
|
+
/**
 * Wraps an agent so that only the fields the evaluator relies on are exposed.
 *
 * @param agent - Agent providing `config`, `prompt` and `execute`.
 * @returns An object with `config` reduced to `name` and `description`, the
 *   same `prompt`, and an `execute` that returns only `result` and `metadata`.
 */
function toEvalAgent(agent) {
  const { name, description } = agent.config;
  const execute = async (input, options) => {
    const { result, metadata } = await agent.execute(input, options);
    return { result, metadata };
  };
  return {
    config: { name, description },
    prompt: agent.prompt,
    execute
  };
}
|
|
928
|
+
/**
 * Checks whether a result came from a single-turn test case, iterated or not.
 *
 * @param result - Result object with a `kind` discriminator.
 * @returns `true` for the kinds "single-turn" and "single-turn-iterated".
 */
function isSingleTurnResult(result) {
  const singleTurnKinds = ["single-turn", "single-turn-iterated"];
  return singleTurnKinds.includes(result.kind);
}
|
|
931
|
+
/**
 * Checks whether a result came from a multi-turn test case, iterated or not.
 *
 * @param result - Result object with a `kind` discriminator.
 * @returns `true` for the kinds "multi-turn" and "multi-turn-iterated".
 */
function isMultiTurnResult(result) {
  const multiTurnKinds = ["multi-turn", "multi-turn-iterated"];
  return multiTurnKinds.includes(result.kind);
}
|
|
934
|
+
/**
 * Checks whether a result aggregates several iterations of one test case.
 *
 * @param result - Result object with a `kind` discriminator.
 * @returns `true` for the kinds "single-turn-iterated" and "multi-turn-iterated".
 */
function isIteratedResult(result) {
  const iteratedKinds = ["single-turn-iterated", "multi-turn-iterated"];
  return iteratedKinds.includes(result.kind);
}
|
|
937
|
+
|
|
938
|
+
// src/core/iteration.ts
|
|
939
|
+
/**
 * Summarises the scores of repeated runs of one test case.
 *
 * @param results - Per-iteration results exposing `overallScore` and `passed`.
 * @returns Iteration count, the raw scores, their mean, population standard
 *   deviation, min, max, and the pass count / pass rate. Every numeric field
 *   is 0 (and `scores` is empty) when `results` is empty.
 */
function calculateIterationStats(results) {
  const iterations = results.length;
  if (iterations === 0) {
    return {
      iterations: 0,
      scores: [],
      mean: 0,
      stdDev: 0,
      min: 0,
      max: 0,
      passRate: 0,
      passCount: 0
    };
  }
  const scores = [];
  let passCount = 0;
  for (const result of results) {
    scores.push(result.overallScore);
    if (result.passed) {
      passCount += 1;
    }
  }
  let scoreTotal = 0;
  for (const score of scores) {
    scoreTotal = scoreTotal + score;
  }
  const mean = scoreTotal / scores.length;
  // Population variance: divide by N, not N - 1.
  let squaredDeviationTotal = 0;
  for (const score of scores) {
    squaredDeviationTotal = squaredDeviationTotal + Math.pow(score - mean, 2);
  }
  const stdDev = Math.sqrt(squaredDeviationTotal / scores.length);
  return {
    iterations,
    scores,
    mean,
    stdDev,
    min: Math.min(...scores),
    max: Math.max(...scores),
    passRate: passCount / iterations,
    passCount
  };
}
|
|
968
|
+
/**
 * Extends the generic iteration statistics with multi-turn specific figures.
 *
 * @param results - Multi-turn results exposing `totalTurns` and
 *   `termination.terminationType`.
 * @returns The fields of `calculateIterationStats(results)` plus `avgTurns`,
 *   `minTurns`, `maxTurns` (all 0 for empty input) and `terminationCounts`,
 *   a tally of results per termination type.
 */
function calculateMultiTurnIterationStats(results) {
  const baseStats = calculateIterationStats(results);
  const turnCounts = results.map((r) => r.totalTurns);
  const terminationCounts = {};
  for (const { termination } of results) {
    const kind = termination.terminationType;
    // Results without a termination type are left out of the tally.
    if (!kind) {
      continue;
    }
    terminationCounts[kind] = (terminationCounts[kind] || 0) + 1;
  }
  const hasTurns = turnCounts.length > 0;
  const totalTurns = turnCounts.reduce((a, b) => a + b, 0);
  return {
    ...baseStats,
    avgTurns: hasTurns ? totalTurns / turnCounts.length : 0,
    minTurns: hasTurns ? Math.min(...turnCounts) : 0,
    maxTurns: hasTurns ? Math.max(...turnCounts) : 0,
    terminationCounts
  };
}
|
|
986
|
+
/**
 * Picks the result whose score lies closest to the given mean.
 *
 * @param results - Candidate results exposing `overallScore`.
 * @param mean - The value to get closest to.
 * @returns The closest result; on a tie the earliest candidate wins.
 * @throws EvalError with code UNKNOWN_ERROR when `results` is empty.
 */
function selectRepresentativeResult(results, mean) {
  if (results.length === 0) {
    throw new EvalError("Cannot select representative result from empty array", {
      code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */
    });
  }
  let best = results[0];
  let bestDistance = Math.abs(best.overallScore - mean);
  for (let i = 1; i < results.length; i++) {
    const candidate = results[i];
    const distance = Math.abs(candidate.overallScore - mean);
    // Strict comparison keeps the earlier candidate on ties.
    if (distance < bestDistance) {
      best = candidate;
      bestDistance = distance;
    }
  }
  return best;
}
|
|
998
|
+
/**
 * Collapses the results of several iterations into one aggregated result per
 * test case position.
 *
 * The i-th entries of every iteration are treated as runs of the same test
 * case. For each position this computes iteration statistics, picks the run
 * closest to the mean as the representative, reports the mean as
 * `overallScore`, and passes when the pass rate reaches
 * `SCORE.MAJORITY_PASS_THRESHOLD`.
 *
 * Iterations may have different lengths (a run that stops early returns only
 * the completed test cases). Positions missing from an iteration are ignored
 * for that iteration instead of crashing on `undefined`.
 * NOTE(review): results are matched by position only; if an early-stopped
 * iteration dropped an entry in the middle, later positions no longer refer
 * to the same test case. Confirm against the caller.
 *
 * @param allIterationResults - One result array per iteration.
 * @returns Aggregated results of kind "single-turn-iterated" or
 *   "multi-turn-iterated"; an empty array when there are no iterations.
 */
function aggregateIterationResults(allIterationResults) {
  if (allIterationResults.length === 0) {
    return [];
  }
  // Use the longest iteration so no position is lost when the first
  // iteration happens to be the shortest one.
  const testCount = Math.max(...allIterationResults.map((iteration) => iteration.length));
  const aggregated = [];
  for (let i = 0; i < testCount; i++) {
    const resultsForTestCase = allIterationResults
      .map((iteration) => iteration[i])
      .filter((r) => r !== void 0);
    if (resultsForTestCase.length === 0) {
      continue;
    }
    const stats = calculateIterationStats(resultsForTestCase);
    const representative = selectRepresentativeResult(resultsForTestCase, stats.mean);
    const multiTurnResults = resultsForTestCase.filter((r) => isMultiTurnResult(r));
    // Fields shared by both aggregated kinds; details come from the
    // representative run, score and verdict from the statistics.
    const shared = {
      testCase: representative.testCase,
      output: representative.output,
      metrics: representative.metrics,
      verdicts: representative.verdicts,
      error: representative.error,
      overallScore: stats.mean,
      passed: stats.passRate >= SCORE.MAJORITY_PASS_THRESHOLD,
      iterationStats: stats,
      iterationResults: resultsForTestCase
    };
    if (multiTurnResults.length > 0) {
      aggregated.push({
        kind: "multi-turn-iterated",
        ...shared,
        conversationHistory: representative.conversationHistory,
        totalTurns: representative.totalTurns,
        terminationReason: representative.terminationReason,
        termination: representative.termination,
        multiTurnIterationStats: calculateMultiTurnIterationStats(multiTurnResults)
      });
    } else {
      aggregated.push({
        kind: "single-turn-iterated",
        ...shared
      });
    }
  }
  return aggregated;
}
|
|
1051
|
+
/**
 * Keeps only the results that aggregate several iterations.
 *
 * @param results - Results with a `kind` discriminator.
 * @returns A new array holding the "single-turn-iterated" and
 *   "multi-turn-iterated" results, in their original order.
 */
function filterIteratedResults(results) {
  const iteratedKinds = new Set(["single-turn-iterated", "multi-turn-iterated"]);
  return results.filter(({ kind }) => iteratedKinds.has(kind));
}
|
|
1056
|
+
/**
 * Averages one iteration statistic over all iterated results.
 *
 * @param results - Mixed results; non-iterated ones are ignored.
 * @param selector - Extracts the number to average from `iterationStats`.
 * @returns The average, or `undefined` when there are no iterated results.
 */
function averageIterationStat(results, selector) {
  const iterated = filterIteratedResults(results);
  const count = iterated.length;
  if (count === 0) {
    return void 0;
  }
  let total = 0;
  for (const { iterationStats } of iterated) {
    total = total + selector(iterationStats);
  }
  return total / count;
}
|
|
1064
|
+
/**
 * Average score standard deviation across the iterated results.
 *
 * @param results - Mixed results; non-iterated ones are ignored.
 * @returns The average `stdDev`, or `undefined` without iterated results.
 */
function calculateAvgStdDev(results) {
  const pickStdDev = ({ stdDev }) => stdDev;
  return averageIterationStat(results, pickStdDev);
}
|
|
1067
|
+
/**
 * Average pass rate across the iterated results.
 *
 * @param results - Mixed results; non-iterated ones are ignored.
 * @returns The average `passRate`, or `undefined` without iterated results.
 */
function calculateAvgPassRate(results) {
  const pickPassRate = ({ passRate }) => passRate;
  return averageIterationStat(results, pickPassRate);
}
|
|
1070
|
+
|
|
1071
|
+
// src/core/suite.ts
|
|
1072
|
+
/**
 * Aggregates latency and token usage over a set of results.
 *
 * @param results - Results exposing `metrics.latencyMs` and
 *   `metrics.tokenUsage.totalTokens`.
 * @returns `avgLatencyMs` (mean latency) and `totalTokens` (sum); both 0 for
 *   an empty input.
 */
function calculateAggregatedMetrics(results) {
  const count = results.length;
  if (count === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  const latencySum = sumBy(results, ({ metrics }) => metrics.latencyMs);
  const totalTokens = sumBy(results, ({ metrics }) => metrics.tokenUsage.totalTokens);
  const avgLatencyMs = latencySum / count;
  return { avgLatencyMs, totalTokens };
}
|
|
1083
|
+
/**
 * Sums the numbers produced by `selector` for each item.
 *
 * @param items - Array of items.
 * @param selector - Maps an item to the number to add.
 * @returns The total, or 0 for an empty array.
 */
function sumBy(items, selector) {
  let total = 0;
  for (const item of items) {
    total = total + selector(item);
  }
  return total;
}
|
|
1086
|
+
/**
 * Builds the report summary for a set of results.
 *
 * @param results - Results exposing `passed`, `overallScore` and `metrics`.
 * @param iterations - Number of iterations that were run, if any.
 * @returns Totals, pass/fail counts, average score and aggregated metrics.
 *   When `iterations` is greater than 1 the summary also carries
 *   `iterations`, `avgStdDev` and `avgPassRate`.
 */
function calculateSummary(results, iterations) {
  const metrics = calculateAggregatedMetrics(results);
  const totalTests = results.length;
  const passed = results.filter((r) => r.passed).length;
  let avgScore = 0;
  if (totalTests > 0) {
    avgScore = sumBy(results, (r) => r.overallScore) / totalTests;
  }
  const summary = {
    totalTests,
    passed,
    failed: totalTests - passed,
    avgScore,
    metrics
  };
  // Iteration statistics only make sense when every test ran more than once.
  if (iterations && iterations > 1) {
    summary.iterations = iterations;
    summary.avgStdDev = calculateAvgStdDev(results);
    summary.avgPassRate = calculateAvgPassRate(results);
  }
  return summary;
}
|
|
1106
|
+
/**
 * Creates an evaluation suite bound to an agent, a judge and an optional
 * improver.
 *
 * @param config - `agent`, `judge`, optional `agentDescription` and optional
 *   `improver`.
 * @returns A suite with:
 *   - `run(testCases, options)`: executes the test cases (once, or
 *     `options.iterations` times) and resolves to a report with `summary`,
 *     `results`, `suggestions`, `generatedAt` and `promptVersion`;
 *   - `withAgent(newAgent)`: a new suite with the same config but another agent.
 */
function createEvalSuite(config) {
  const { agent, agentDescription, judge, improver } = config;
  // Falls back to the agent's own description, then to its name.
  const description = agentDescription ?? agent.config.description ?? agent.config.name;

  async function run(testCases2, options) {
    const iterations = options?.iterations ?? 1;
    validateIterations(iterations);
    const executeContext = { agent, judge, agentDescription: description };
    const isIterated = iterations > 1;
    let results;
    if (isIterated) {
      results = await runMultipleIterations(testCases2, executeContext, options, iterations);
    } else {
      results = await runWithConcurrency(testCases2, executeContext, options);
    }
    const summary = calculateSummary(results, isIterated ? iterations : void 0);
    let suggestions = [];
    if (improver) {
      const improvement = await improver.improve(agent.prompt, results);
      suggestions = improvement.suggestions;
    }
    return {
      summary,
      results,
      suggestions,
      generatedAt: /* @__PURE__ */ new Date(),
      promptVersion: agent.prompt.version
    };
  }

  function withAgent(newAgent) {
    // The explicit description is cleared so it is derived from the new agent.
    return createEvalSuite({
      ...config,
      agent: newAgent,
      agentDescription: void 0
    });
  }

  return { run, withAgent };
}
|
|
1135
|
+
/**
 * Ensures the iteration count is a positive integer.
 *
 * @param iterations - The requested number of iterations.
 * @throws EvalError with code INVALID_CONFIG when `iterations` is not an
 *   integer or is below 1.
 */
function validateIterations(iterations) {
  const isPositiveInteger = Number.isInteger(iterations) && iterations >= 1;
  if (isPositiveInteger) {
    return;
  }
  throw new EvalError(
    `Invalid iterations value: ${iterations}. Must be a positive integer.`,
    { code: "INVALID_CONFIG" /* INVALID_CONFIG */, context: { iterations } }
  );
}
|
|
1143
|
+
/**
 * Runs the whole test-case list several times, one iteration after another,
 * and aggregates the per-iteration results.
 *
 * @param testCases2 - Test cases to execute in every iteration.
 * @param executeContext - Execution context forwarded to `runWithConcurrency`.
 * @param options - Run options; `iterations` is cleared before forwarding.
 * @param iterations - How many times to run the list.
 * @returns The output of `aggregateIterationResults` for all iterations.
 */
async function runMultipleIterations(testCases2, executeContext, options, iterations) {
  const perIterationOptions = { ...options, iterations: void 0 };
  const allIterationResults = [];
  let completed = 0;
  while (completed < iterations) {
    // Iterations are awaited one at a time, never in parallel.
    allIterationResults.push(
      await runWithConcurrency(testCases2, executeContext, { ...perIterationOptions })
    );
    completed += 1;
  }
  return aggregateIterationResults(allIterationResults);
}
|
|
1155
|
+
|
|
1156
|
+
// src/index.ts
|
|
1157
|
+
import {
|
|
1158
|
+
resolveFileSource,
|
|
1159
|
+
resolveFileSourcesInInput as resolveFileSourcesInInput3,
|
|
1160
|
+
scanForFileSources,
|
|
1161
|
+
getFileSourceDisplayInfo,
|
|
1162
|
+
getFileSourcesDisplayInfo as getFileSourcesDisplayInfo2,
|
|
1163
|
+
inferMediaType,
|
|
1164
|
+
isFileSource,
|
|
1165
|
+
isFileSourcePath,
|
|
1166
|
+
isFileSourceData,
|
|
1167
|
+
isFileSourceBase64,
|
|
1168
|
+
isFileSourceUrl
|
|
1169
|
+
} from "@agtlantis/core";
|
|
1170
|
+
|
|
1171
|
+
// src/judge/llm-judge.ts
|
|
1172
|
+
import { Output } from "ai";
|
|
1173
|
+
import { z } from "zod";
|
|
1174
|
+
|
|
1175
|
+
// src/judge/prompts/default.ts
|
|
1176
|
+
/**
 * Built-in judge prompt; `createJudge` falls back to it when the config does
 * not provide a `prompt`.
 *
 * - `id` / `version`: identify the prompt (reported in error context by
 *   `runLLMEvaluation`).
 * - `system`: system message describing the scoring scale, the reasoning and
 *   objectivity rules, and the JSON response format.
 * - `renderUserPrompt(ctx)`: builds the user message from
 *   `ctx.agentDescription`, `ctx.input`, `ctx.output`, `ctx.criteria` and the
 *   optional `ctx.files`.
 */
var defaultJudgePrompt = {
  id: "default-judge",
  version: "2.0.0",
  // The template text below is sent verbatim; its whitespace is part of the prompt.
  system: `You are an expert evaluator specializing in assessing AI Agent outputs.

Your role is to fairly and thoroughly evaluate the agent's output against the provided criteria.

## Evaluation Principles

1. **Scoring**: Assign a score between 0-100 for each criterion
- 90-100: Exceptional - Exceeds expectations with no significant issues
- 70-89: Good - Meets expectations with minor issues
- 50-69: Acceptable - Partially meets expectations, notable issues present
- 30-49: Poor - Falls short of expectations, significant issues
- 0-29: Failing - Does not meet minimum requirements

2. **Reasoning**: Always provide specific, evidence-based reasoning
- Quote or reference specific parts of the output
- Explain both strengths and weaknesses
- Be constructive and actionable in feedback

3. **Objectivity**: Evaluate based solely on the criteria provided
- Avoid personal preferences or unstated requirements
- Consider the agent's intended purpose and context
- Weight severity of issues proportionally

## Response Format

You MUST respond with valid JSON only. No additional text or explanation outside the JSON structure.

{
"verdicts": [
{
"criterionId": "criterion-id",
"score": 0-100,
"reasoning": "Detailed explanation with specific evidence from the output",
"passed": true/false
}
]
}`,
  // Input and output are embedded as pretty-printed JSON; each criterion is
  // listed with its id and weight (weight defaults to 1 when unset).
  renderUserPrompt: (ctx) => {
    // Empty string when the test case has no reference files.
    const fileSection = buildFileSection(ctx.files);
    return `
## Agent Under Evaluation
${ctx.agentDescription}

## Input Provided to Agent
\`\`\`json
${JSON.stringify(ctx.input, null, 2)}
\`\`\`
${fileSection}
## Agent Output
\`\`\`json
${JSON.stringify(ctx.output, null, 2)}
\`\`\`

## Evaluation Criteria
${ctx.criteria.map((c) => `- **${c.name}** (id: ${c.id}, weight: ${c.weight ?? 1}): ${c.description}`).join("\n")}

Please evaluate the agent's output against each criterion listed above.`.trim();
  }
};
|
|
1238
|
+
/**
 * Renders the "Reference Files" section of the judge's user prompt.
 *
 * @param files - Optional array of `{ path, content }` entries.
 * @returns An empty string when there are no files; otherwise a Markdown
 *   section with one heading and one fenced code block per file.
 */
function buildFileSection(files) {
  if (!files) {
    return "";
  }
  if (files.length === 0) {
    return "";
  }
  const fence = "```";
  const renderedFiles = files.map((f) => `### ${f.path}\n${fence}\n${f.content}\n${fence}`);
  return `\n## Reference Files\n${renderedFiles.join("\n\n")}\n`;
}
|
|
1250
|
+
|
|
1251
|
+
// src/judge/llm-judge.ts
|
|
1252
|
+
/**
 * Normalises a usage record so that every token counter is defined.
 *
 * @param usage - Object with optional `inputTokens`, `outputTokens` and
 *   `totalTokens`.
 * @returns The three counters, with `null`/`undefined` replaced by 0.
 */
function toEvalTokenUsage(usage) {
  const orZero = (count) => count ?? 0;
  const inputTokens = orZero(usage.inputTokens);
  const outputTokens = orZero(usage.outputTokens);
  const totalTokens = orZero(usage.totalTokens);
  return { inputTokens, outputTokens, totalTokens };
}
|
|
1259
|
+
/**
 * Tells whether a criterion is checked programmatically rather than by the LLM.
 *
 * @param criterion - Criterion object.
 * @returns `true` when the criterion has a `validator` property that is a function.
 */
function hasValidator(criterion) {
  if (!("validator" in criterion)) {
    return false;
  }
  return typeof criterion.validator === "function";
}
|
|
1262
|
+
// Zod schema for the structured answer requested from the judge model:
// a list of verdicts, each with the criterion id, a score bounded by
// SCORE.MIN / SCORE.MAX, the reasoning, and an optional explicit pass flag.
var JudgeResponseSchema = z.object({
  verdicts: z
    .object({
      criterionId: z.string(),
      score: z.number().min(SCORE.MIN).max(SCORE.MAX),
      reasoning: z.string(),
      passed: z.boolean().optional()
    })
    .array()
});
|
|
1272
|
+
/**
 * Verifies that the judge returned a verdict for every expected criterion.
 *
 * @param verdicts - Verdicts exposing `criterionId`.
 * @param criteriaIds - Ids of the criteria that must be covered.
 * @throws EvalError with code VERDICT_PARSE_ERROR listing the missing and
 *   the provided ids when at least one criterion has no verdict.
 */
function validateAllCriteriaHaveVerdicts(verdicts, criteriaIds) {
  const providedIds = new Set();
  for (const { criterionId } of verdicts) {
    providedIds.add(criterionId);
  }
  const missingIds = criteriaIds.filter((id) => !providedIds.has(id));
  if (missingIds.length === 0) {
    return;
  }
  throw new EvalError("Judge response missing verdicts for some criteria", {
    code: "VERDICT_PARSE_ERROR" /* VERDICT_PARSE_ERROR */,
    context: { missingCriteriaIds: missingIds, providedIds: [...providedIds] }
  });
}
|
|
1282
|
+
/**
 * Computes the weighted average score of a set of verdicts.
 *
 * @param verdicts - Verdicts exposing `criterionId` and `score`.
 * @param criteriaWeights - Map from criterion id to weight; criteria missing
 *   from the map weigh 1.
 * @returns The weighted average rounded to two decimals, or 0 when the total
 *   weight is 0 (including when there are no verdicts).
 */
function calculateOverallScore(verdicts, criteriaWeights) {
  let weightTotal = 0;
  let weightedScoreTotal = 0;
  for (const { criterionId, score } of verdicts) {
    const weight = criteriaWeights.get(criterionId) ?? 1;
    weightedScoreTotal += score * weight;
    weightTotal += weight;
  }
  if (weightTotal === 0) {
    return 0;
  }
  const average = weightedScoreTotal / weightTotal;
  // Keep two decimals.
  return Math.round(average * 100) / 100;
}
|
|
1295
|
+
/**
 * Evaluates the programmatic (validator-based) criteria against an output.
 *
 * @param validatorCriteria - Criteria exposing `id`, `name` and `validator`.
 * @param output - The agent output handed to each validator.
 * @returns One verdict per criterion: score 100 / passed when the validator
 *   reports `valid`, otherwise score 0 / failed with the validator's
 *   `errorSummary` (or a default message) in the reasoning.
 */
function runValidatorCriteria(validatorCriteria, output) {
  const verdicts = [];
  for (const criterion of validatorCriteria) {
    const outcome = criterion.validator(output);
    const isValid = Boolean(outcome.valid);
    let reasoning;
    if (isValid) {
      reasoning = `${criterion.name} \uD1B5\uACFC`;
    } else {
      const details = outcome.errorSummary ?? "\uC720\uD6A8\uC131 \uAC80\uC99D \uC624\uB958";
      reasoning = `${criterion.name} \uC2E4\uD328:\n${details}`;
    }
    verdicts.push({
      criterionId: criterion.id,
      score: isValid ? 100 : 0,
      reasoning,
      passed: isValid
    });
  }
  return verdicts;
}
|
|
1315
|
+
/**
 * Asks the judge model to score the LLM-evaluated criteria.
 *
 * @param provider - Provider exposing `simpleExecution(fn)`, whose returned
 *   execution resolves through `result()`.
 * @param prompt - Judge prompt (`system`, `renderUserPrompt`, `id`, `version`).
 * @param context - Context handed to `prompt.renderUserPrompt`.
 * @param llmCriteriaIds - Ids that must each receive a verdict.
 * @param passThreshold - Score from which a verdict passes when the model
 *   gave no explicit `passed` flag.
 * @returns `{ verdicts, usage }`, where `usage` is the execution's total LLM usage.
 * @throws EvalError with code LLM_API_ERROR when the execution fails or is canceled.
 * @throws EvalError with code VERDICT_PARSE_ERROR when a criterion has no verdict.
 */
async function runLLMEvaluation(provider, prompt, context, llmCriteriaIds, passThreshold) {
  const systemMessage = { role: "system", content: prompt.system };
  const userMessage = { role: "user", content: prompt.renderUserPrompt(context) };
  const messages = [systemMessage, userMessage];
  let response;
  let usage;
  try {
    const execution = provider.simpleExecution(async (session) => {
      const generation = await session.generateText({
        messages,
        output: Output.object({ schema: JudgeResponseSchema })
      });
      return generation.output;
    });
    const outcome = await execution.result();
    if (outcome.status === "failed") {
      throw outcome.error;
    }
    if (outcome.status !== "succeeded") {
      // Any other non-success status is reported as a cancellation.
      throw new Error("Execution was canceled");
    }
    response = outcome.value;
    usage = outcome.summary.totalLLMUsage;
  } catch (cause) {
    throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
      promptId: prompt.id,
      promptVersion: prompt.version
    });
  }
  validateAllCriteriaHaveVerdicts(response.verdicts, llmCriteriaIds);
  const verdicts = response.verdicts.map(({ criterionId, score, reasoning, passed }) => ({
    criterionId,
    score,
    reasoning,
    // The model's explicit flag wins; otherwise derive it from the score.
    passed: passed ?? score >= passThreshold
  }));
  return { verdicts, usage };
}
|
|
1351
|
+
/**
 * Creates a judge that scores an agent output against a list of criteria.
 *
 * Criteria with a `validator` function are checked programmatically; all
 * others are sent to the LLM in a single evaluation call.
 *
 * @param config - `provider`, `criteria`, optional `prompt` (defaults to
 *   `defaultJudgePrompt`), optional `passThreshold` (defaults to
 *   `SCORE.DEFAULT_PASS_THRESHOLD`) and optional `model`.
 * @returns An object whose `evaluate({ input, output, agentDescription, files })`
 *   resolves to `{ verdicts, overallScore, passed, metadata }`. `metadata`
 *   (token usage and model) is only set when the LLM reported usage.
 */
function createJudge(config) {
  const {
    provider,
    prompt = defaultJudgePrompt,
    criteria,
    passThreshold = SCORE.DEFAULT_PASS_THRESHOLD,
    model
  } = config;
  const criteriaWeights = /* @__PURE__ */ new Map();
  const validatorCriteria = [];
  const llmCriteria = [];
  for (const criterion of criteria) {
    // Unweighted criteria count with weight 1.
    criteriaWeights.set(criterion.id, criterion.weight ?? 1);
    const bucket = hasValidator(criterion) ? validatorCriteria : llmCriteria;
    bucket.push(criterion);
  }
  const llmCriteriaIds = llmCriteria.map((criterion) => criterion.id);

  async function evaluate(evalContext) {
    const { input, output, agentDescription, files } = evalContext;
    const validatorVerdicts = runValidatorCriteria(validatorCriteria, output);
    // The LLM is only called when at least one criterion needs it.
    const llmResult = llmCriteria.length > 0
      ? await runLLMEvaluation(
          provider,
          prompt,
          { agentDescription, input, output, criteria: llmCriteria, files },
          llmCriteriaIds,
          passThreshold
        )
      : { verdicts: [], usage: void 0 };
    const verdicts = [...validatorVerdicts, ...llmResult.verdicts];
    const overallScore = calculateOverallScore(verdicts, criteriaWeights);
    const metadata = llmResult.usage
      ? { tokenUsage: toEvalTokenUsage(llmResult.usage), model }
      : void 0;
    return {
      verdicts,
      overallScore,
      passed: overallScore >= passThreshold,
      metadata
    };
  }

  return { evaluate };
}
|
|
1409
|
+
|
|
1410
|
+
// src/judge/criteria/validate-schema.ts
|
|
1411
|
+
/**
 * Formats the issues of a Zod validation error as a bullet list.
 *
 * @param error - Zod error object. Issues are read from `error.issues`,
 *   falling back to the `error.errors` alias when `issues` is absent.
 * @returns One "- path: message" line per issue (the "path: " part is
 *   omitted for root-level issues), joined with newlines.
 */
function formatZodErrors(error) {
  // `issues` is the canonical field; `errors` is kept as a fallback alias.
  const issues = error.issues ?? error.errors ?? [];
  return issues.map((issue) => {
    const location = issue.path.length > 0 ? `${issue.path.join(".")}: ` : "";
    return `- ${location}${issue.message}`;
  }).join("\n");
}
|
|
1417
|
+
/**
 * Creates a programmatic criterion that validates the agent output against a
 * Zod schema.
 *
 * @param options - `schema` (object with `safeParse`), plus optional `id`,
 *   `name`, `description` and `weight` overrides.
 * @returns A criterion whose `validator(output)` returns `{ valid: true }` on
 *   success, or `{ valid: false, errors, errorSummary }` on failure, where
 *   `errors` holds the raw Zod issues and `errorSummary` their formatted list.
 */
function schema(options) {
  const { schema: schema2, id, weight, name, description } = options;
  return {
    id: id ?? "schema-validation",
    name: name ?? "\uC2A4\uD0A4\uB9C8 \uC720\uD6A8\uC131",
    description: description ?? "\uCD9C\uB825\uC774 \uC9C0\uC815\uB41C \uC2A4\uD0A4\uB9C8(Zod)\uB97C \uC900\uC218\uD558\uB294\uC9C0 \uD504\uB85C\uADF8\uB798\uBC0D \uBC29\uC2DD\uC73C\uB85C \uAC80\uC99D\uD569\uB2C8\uB2E4.",
    weight,
    validator: (output) => {
      const result = schema2.safeParse(output);
      if (result.success) {
        return { valid: true };
      }
      return {
        valid: false,
        // `issues` is the canonical field; `errors` is kept as a fallback alias.
        errors: result.error.issues ?? result.error.errors,
        errorSummary: formatZodErrors(result.error)
      };
    }
  };
}
|
|
1437
|
+
|
|
1438
|
+
// src/judge/criteria/index.ts
|
|
1439
|
+
/**
 * Built-in LLM criterion: factual correctness of the output.
 *
 * @param options - Optional `{ weight }`.
 * @returns The "accuracy" criterion; `weight` is `undefined` when not given.
 */
function accuracy(options) {
  const criterion = {
    id: "accuracy",
    name: "Accuracy",
    description: "Evaluates whether the output is factually correct, free from errors, and avoids hallucinations. Check for incorrect facts, made-up information, or misrepresentation of the input data.",
    weight: void 0
  };
  criterion.weight = options?.weight;
  return criterion;
}
|
|
1447
|
+
/**
 * Built-in LLM criterion: internal coherence of the output.
 *
 * @param options - Optional `{ weight }`.
 * @returns The "consistency" criterion; `weight` is `undefined` when not given.
 */
function consistency(options) {
  const criterion = {
    id: "consistency",
    name: "Consistency",
    description: "Evaluates whether the output is internally coherent and logically consistent. Check for self-contradictions, conflicting statements, or logical inconsistencies within the response.",
    weight: void 0
  };
  criterion.weight = options?.weight;
  return criterion;
}
|
|
1455
|
+
/**
 * Built-in LLM criterion: how directly the output addresses the input.
 *
 * @param options - Optional `{ weight }`.
 * @returns The "relevance" criterion; `weight` is `undefined` when not given.
 */
function relevance(options) {
  const criterion = {
    id: "relevance",
    name: "Relevance",
    description: "Evaluates whether the output directly addresses the input and fulfills the user intent. Check for off-topic content, missing key requirements, or responses that fail to answer the actual question.",
    weight: void 0
  };
  criterion.weight = options?.weight;
  return criterion;
}
|
|
1463
|
+
|
|
1464
|
+
// src/reporter/markdown.ts
|
|
1465
|
+
import { writeFile } from "fs/promises";
|
|
1466
|
+
import { getFileSourcesDisplayInfo } from "@agtlantis/core";
|
|
1467
|
+
// Icon for passing results and the "Passed Tests" heading in the Markdown report.
var PASS_ICON = "\u2705";
// Icon for failing results and the "Failed Tests" heading in the Markdown report.
var FAIL_ICON = "\u274C";
// Sort rank per suggestion priority; lower ranks are listed first in the report.
var PRIORITY_ORDER = { high: 0, medium: 1, low: 2 };
|
|
1470
|
+
/**
 * Renders an evaluation report as a Markdown document.
 *
 * Sections, in order: title and generation info, a summary table, failed
 * tests, passed tests (collapsed in a `<details>` element unless
 * `expandPassedTests` is set) and improvement suggestions sorted by priority.
 *
 * @param report - `{ summary, results, suggestions, generatedAt, promptVersion }`.
 * @param options - `expandPassedTests` (default false), `includeRawOutput`
 *   (default false) and `outputPreviewLength` (default 200); the last two are
 *   forwarded to `formatTestResult`.
 * @returns The Markdown text.
 */
function reportToMarkdown(report, options = {}) {
  const {
    expandPassedTests = false,
    includeRawOutput = false,
    outputPreviewLength = 200
  } = options;
  const { summary, results, suggestions, generatedAt, promptVersion } = report;
  const renderResult = (result) => formatTestResult(result, outputPreviewLength, includeRawOutput);
  const passRate = summary.totalTests > 0 ? (summary.passed / summary.totalTests * 100).toFixed(1) : "0.0";
  const lines = [
    "# Evaluation Report",
    "",
    `> Generated: ${generatedAt.toISOString()}`,
    `> Prompt Version: ${promptVersion}`,
    "",
    "## Summary",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    `| Total Tests | ${summary.totalTests} |`
  ];
  if (summary.iterations && summary.iterations > 1) {
    lines.push(`| **Iterations** | **${summary.iterations}** |`);
  }
  lines.push(`| Passed | ${summary.passed} (${passRate}%) |`);
  lines.push(`| Failed | ${summary.failed} |`);
  if (summary.avgStdDev !== void 0) {
    lines.push(
      `| Average Score | ${summary.avgScore.toFixed(1)} \xB1 ${summary.avgStdDev.toFixed(1)} |`
    );
  } else {
    lines.push(`| Average Score | ${summary.avgScore.toFixed(1)} |`);
  }
  if (summary.avgPassRate !== void 0) {
    lines.push(`| Avg Pass Rate | ${(summary.avgPassRate * 100).toFixed(1)}% |`);
  }
  lines.push(`| Avg Latency | ${summary.metrics.avgLatencyMs.toFixed(0)}ms |`);
  lines.push(`| Total Tokens | ${summary.metrics.totalTokens} |`);
  if (summary.costSummary?.total !== void 0) {
    lines.push(`| Est. Cost | $${summary.costSummary.total.toFixed(4)} |`);
  }
  lines.push("");
  const failedResults = results.filter((r) => !r.passed);
  if (failedResults.length > 0) {
    lines.push(`## ${FAIL_ICON} Failed Tests`);
    lines.push("");
    for (const result of failedResults) {
      lines.push(renderResult(result));
    }
  }
  const passedResults = results.filter((r) => r.passed);
  if (passedResults.length > 0) {
    lines.push(`## ${PASS_ICON} Passed Tests`);
    lines.push("");
    // Passed tests are collapsed by default to keep the report focused on failures.
    if (!expandPassedTests) {
      lines.push("<details>");
      lines.push("<summary>Click to expand passed tests</summary>");
      lines.push("");
    }
    for (const result of passedResults) {
      lines.push(renderResult(result));
    }
    if (!expandPassedTests) {
      lines.push("</details>");
      lines.push("");
    }
  }
  if (suggestions.length > 0) {
    lines.push("## \u{1F4A1} Improvement Suggestions");
    lines.push("");
    // Unknown priorities rank last; without the fallback the comparator
    // would return NaN and the resulting order would be unspecified.
    const priorityRank = (suggestion) => PRIORITY_ORDER[suggestion.priority] ?? Number.MAX_SAFE_INTEGER;
    const sortedSuggestions = [...suggestions].sort(
      (a, b) => priorityRank(a) - priorityRank(b)
    );
    for (const suggestion of sortedSuggestions) {
      lines.push(formatSuggestion(suggestion));
    }
  }
  return lines.join("\n");
}
|
|
1549
|
+
/**
 * Renders a report as Markdown and writes it to disk as UTF-8.
 *
 * @param report - The evaluation report to render.
 * @param path3 - Destination file path.
 * @param options - Rendering options forwarded to `reportToMarkdown`.
 * @returns A promise that resolves once the file has been written.
 */
async function saveReportMarkdown(report, path3, options) {
  const renderedReport = reportToMarkdown(report, options);
  await writeFile(path3, renderedReport, "utf-8");
}
|
|
1553
|
+
/**
 * Produces the lines of a fenced ```json code block for a value.
 *
 * @param value - Value serialised with `JSON.stringify(value, null, 2)`.
 * @param maxLength - When provided, the JSON text is passed through
 *   `truncate` with this limit.
 * @returns Three entries: the opening fence, the JSON text, the closing fence.
 */
function jsonCodeBlock(value, maxLength) {
  let content = JSON.stringify(value, null, 2);
  if (maxLength !== void 0) {
    content = truncate(content, maxLength);
  }
  return ["```json", content, "```"];
}
|
|
1558
|
+
/**
 * Chooses the report icon for a pass/fail state.
 *
 * @param passed - Truthy for a pass.
 * @returns `PASS_ICON` when `passed` is truthy, otherwise `FAIL_ICON`.
 */
function passFailIcon(passed) {
  if (passed) {
    return PASS_ICON;
  }
  return FAIL_ICON;
}
|
|
1561
|
+
/**
 * Renders one test result as a Markdown section.
 *
 * The section contains, in order: a heading with the test id and score, the
 * optional description, attached file info, multi-turn details, iteration
 * tables, either the conversation history or the single-turn input/output,
 * the verdict list, and optionally a collapsible raw-output block.
 *
 * @param {object} result - Test result to render.
 * @param {number} previewLength - Length limit forwarded to the input/output formatters.
 * @param {boolean} includeRaw - When truthy, appends the untruncated output.
 * @returns {string} Markdown text joined with newlines.
 */
function formatTestResult(result, previewLength, includeRaw) {
  const out = [];
  const { testCase } = result;
  const testId = testCase.id ?? "unnamed";

  // Show the spread next to the score only when iteration statistics exist.
  let scoreDisplay = result.overallScore.toFixed(1);
  if (result.iterationStats) {
    scoreDisplay = `${scoreDisplay} \xB1 ${result.iterationStats.stdDev.toFixed(1)}`;
  }
  out.push(`### ${testId} (Score: ${scoreDisplay})`, "");

  if (testCase.description) {
    out.push(`> ${testCase.description}`, "");
  }

  const fileDisplayInfos = getFileSourcesDisplayInfo(testCase.input);
  if (fileDisplayInfos.length > 0) {
    out.push("**Files:**");
    for (const info of fileDisplayInfos) {
      const namePrefix = info.filename ? `${info.filename} - ` : "";
      out.push(`- ${namePrefix}${info.source}: ${info.description} (${info.mediaType})`);
    }
    out.push("");
  }

  if (result.totalTurns !== void 0) {
    const reason = result.terminationReason ?? "unknown";
    out.push(`**Multi-turn:** ${result.totalTurns} turns | Termination: ${reason}`, "");
  }
  if (result.multiTurnIterationStats) {
    out.push(...formatMultiTurnIterationStats(result.multiTurnIterationStats));
  }
  if (result.iterationStats && result.iterationResults) {
    out.push(...formatIterationResults(result.iterationStats, result.iterationResults));
  }

  // A non-empty conversation history replaces the single-turn input/output view.
  const history = result.conversationHistory;
  if (history && history.length > 0) {
    out.push(...formatConversationHistory(history, previewLength));
  } else {
    out.push(...formatSingleTurnInputOutput(testCase.input, result.output, previewLength));
  }

  out.push("**Verdicts:**");
  for (const verdict of result.verdicts) {
    out.push(
      `- ${passFailIcon(verdict.passed)} **${verdict.criterionId}**: ${verdict.score} - ${verdict.reasoning}`
    );
  }
  out.push("");

  if (includeRaw) {
    out.push("<details>", "<summary>Raw Output</summary>", "");
    out.push(...jsonCodeBlock(result.output));
    out.push("</details>", "");
  }
  return out.join("\n");
}
|
|
1616
|
+
/**
 * Renders multi-turn iteration statistics as a Markdown table.
 *
 * @param {object} stats - Object with `avgTurns`, `minTurns`, `maxTurns` and a
 *   `terminationCounts` object whose entries are listed as `type: count`.
 * @returns {string[]} Markdown lines, ending with an empty line.
 */
function formatMultiTurnIterationStats(stats) {
  const pairs = [];
  for (const [type, count] of Object.entries(stats.terminationCounts)) {
    pairs.push(`${type}: ${count}`);
  }
  // An empty distribution is reported as "none".
  const terminationSummary = pairs.join(", ") || "none";

  const table = [];
  table.push("**Multi-turn Iteration Statistics:**");
  table.push("");
  table.push("| Metric | Value |");
  table.push("|--------|-------|");
  table.push(`| Avg Turns | ${stats.avgTurns.toFixed(1)} |`);
  table.push(`| Min/Max Turns | ${stats.minTurns} / ${stats.maxTurns} |`);
  table.push(`| Termination Distribution | ${terminationSummary} |`);
  table.push("");
  return table;
}
|
|
1629
|
+
/**
 * Renders per-iteration results as a Markdown table followed by a stats line.
 *
 * @param {object} stats - Aggregate with `mean`, `stdDev`, `min`, `max`, `passRate`.
 * @param {Array<object>} results - Iterations with `overallScore`, `passed`
 *   and `metrics.latencyMs`.
 * @returns {string[]} Markdown lines, ending with an empty line.
 */
function formatIterationResults(stats, results) {
  // Rows are numbered from 1.
  const rows = results.map(
    (iter, idx) =>
      `| ${idx + 1} | ${iter.overallScore.toFixed(1)} | ${passFailIcon(iter.passed)} | ${iter.metrics.latencyMs.toFixed(0)}ms |`
  );
  const statsLine = `**Stats:** ${stats.mean.toFixed(1)} \xB1 ${stats.stdDev.toFixed(1)} (min: ${stats.min.toFixed(0)}, max: ${stats.max.toFixed(0)}, pass rate: ${(stats.passRate * 100).toFixed(0)}%)`;
  return [
    "**Iteration Results:**",
    "",
    "| # | Score | Passed | Latency |",
    "|---|-------|--------|---------|",
    ...rows,
    "",
    statsLine,
    ""
  ];
}
|
|
1648
|
+
/**
 * Renders a conversation history as one collapsible `<details>` block per turn.
 *
 * @param {Iterable<object>} history - Turns with `turn`, `input` and `output`.
 * @param {number} previewLength - Length limit forwarded to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines, starting with the section heading.
 */
function formatConversationHistory(history, previewLength) {
  const out = ["**Conversation History:**", ""];
  for (const turn of history) {
    out.push(
      "<details>",
      `<summary>Turn ${turn.turn}</summary>`,
      "",
      "**Input:**",
      ...jsonCodeBlock(turn.input, previewLength),
      "",
      "**Output:**",
      ...jsonCodeBlock(turn.output, previewLength),
      "</details>",
      ""
    );
  }
  return out;
}
|
|
1664
|
+
/**
 * Renders the input and output of a single-turn test as two JSON code blocks.
 *
 * @param {*} input - Test input.
 * @param {*} output - Test output.
 * @param {number} previewLength - Length limit forwarded to `jsonCodeBlock`.
 * @returns {string[]} Markdown lines, ending with an empty line.
 */
function formatSingleTurnInputOutput(input, output, previewLength) {
  const inputBlock = jsonCodeBlock(input, previewLength);
  const outputBlock = jsonCodeBlock(output, previewLength);
  const out = ["**Input:**"];
  out.push(...inputBlock);
  out.push("", "**Output:**");
  out.push(...outputBlock);
  out.push("");
  return out;
}
|
|
1674
|
+
/**
 * Renders an improvement suggestion as a Markdown section with a diff block.
 *
 * @param {object} suggestion - Suggestion with `priority`, `type`, `reasoning`,
 *   `expectedImprovement`, `currentValue` and `suggestedValue`.
 * @returns {string} Markdown text joined with newlines.
 */
function formatSuggestion(suggestion) {
  const iconByPriority = { high: "\u{1F534}", medium: "\u{1F7E1}", low: "\u{1F7E2}" };
  // Priorities missing from the table get a neutral white circle.
  const priorityIcon = iconByPriority[suggestion.priority] ?? "\u26AA";
  const heading = `### ${priorityIcon} [${suggestion.priority.toUpperCase()}] ${suggestion.type}`;

  // Every line of the old value is prefixed with "- ", every new line with "+ ".
  const removed = suggestion.currentValue.split("\n").map((line) => `- ${line}`).join("\n");
  const added = suggestion.suggestedValue.split("\n").map((line) => `+ ${line}`).join("\n");

  return [
    heading,
    "",
    `**Reasoning:** ${suggestion.reasoning}`,
    "",
    `**Expected Improvement:** ${suggestion.expectedImprovement}`,
    "",
    "**Diff:**",
    "```diff",
    removed,
    added,
    "```",
    ""
  ].join("\n");
}
|
|
1691
|
+
/**
 * Compares two reports and summarizes what changed between them.
 *
 * @param {object} before - Baseline report (`summary` and `results`).
 * @param {object} after - Report to compare against the baseline.
 * @returns {object} `scoreDelta`, `passRateDelta`, `metricsDelta`
 *   (`latencyMs`, `tokenUsage`), plus the ids of tests that `improved`,
 *   `regressed`, or were `removed` (present before, absent after).
 */
function compareReports(before, after) {
  // A report with no tests has a pass rate of 0 rather than NaN.
  const passRateOf = (summary) =>
    summary.totalTests > 0 ? summary.passed / summary.totalTests : 0;

  const prev = before.summary;
  const next = after.summary;
  const beforeScores = buildScoreMap(before.results);
  const afterScores = buildScoreMap(after.results);

  const improved = [];
  const regressed = [];
  afterScores.forEach((afterScore, id) => {
    const beforeScore = beforeScores.get(id);
    if (beforeScore === void 0) {
      // Test only exists in the newer report; nothing to compare against.
      return;
    }
    if (afterScore > beforeScore) {
      improved.push(id);
    } else if (afterScore < beforeScore) {
      regressed.push(id);
    }
  });

  const removed = [];
  for (const id of beforeScores.keys()) {
    if (!afterScores.has(id)) {
      removed.push(id);
    }
  }

  return {
    scoreDelta: next.avgScore - prev.avgScore,
    passRateDelta: passRateOf(next) - passRateOf(prev),
    metricsDelta: {
      latencyMs: next.metrics.avgLatencyMs - prev.metrics.avgLatencyMs,
      tokenUsage: next.metrics.totalTokens - prev.metrics.totalTokens
    },
    improved,
    regressed,
    removed
  };
}
|
|
1723
|
+
/**
 * Indexes overall scores by test case id.
 *
 * Results without an id share the key "unnamed"; when a key repeats, the
 * later score overwrites the earlier one.
 *
 * @param {Iterable<object>} results - Results with `testCase.id` and `overallScore`.
 * @returns {Map<string, number>} Score per test id.
 */
function buildScoreMap(results) {
  const entries = Array.from(results, (result) => [
    result.testCase.id ?? "unnamed",
    result.overallScore
  ]);
  return new Map(entries);
}
|
|
1730
|
+
|
|
1731
|
+
// src/reporter/json-reporter.ts
|
|
1732
|
+
import { writeFileSync } from "fs";
|
|
1733
|
+
|
|
1734
|
+
// src/reporter/cost-helpers.ts
|
|
1735
|
+
import {
|
|
1736
|
+
calculateCostFromUsage
|
|
1737
|
+
} from "@agtlantis/core";
|
|
1738
|
+
/**
 * Copies the three token counters of a usage record into a fresh object.
 *
 * @param {object} usage - Object with `inputTokens`, `outputTokens`, `totalTokens`.
 * @returns {object} New object holding only those three fields.
 */
function toLanguageModelUsage(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
|
|
1745
|
+
/**
 * Maps provider names to the provider key used for pricing lookups.
 * "gemini" is an alias for "google"; the other entries map to themselves.
 */
var PROVIDER_MAPPING = {
  gemini: "google", // alias
  openai: "openai",
  anthropic: "anthropic",
  google: "google"
};
|
|
1751
|
+
/**
 * Guesses the provider from a model name prefix.
 *
 * OpenAI is detected for `gpt-`, `chatgpt-` and any o-series name (`o`
 * followed by a digit: o1, o3, o4-mini, ...), so newer reasoning models are
 * not priced as Google models. Anthropic is detected for `claude-`. Every
 * other name, including `gemini-` and a missing model, yields "google".
 *
 * @param {string} [model] - Model identifier.
 * @returns {"openai"|"anthropic"|"google"} Detected provider key.
 */
function detectProvider(model) {
  if (!model) return "google";
  if (/^(?:gpt-|chatgpt-|o\d)/.test(model)) {
    return "openai";
  }
  if (model.startsWith("claude-")) {
    return "anthropic";
  }
  // "gemini-*" and unrecognised names share the default provider.
  return "google";
}
|
|
1764
|
+
/**
 * Resolves a provider name through `PROVIDER_MAPPING`.
 *
 * @param {string} [provider] - Provider name.
 * @returns {string} "google" when the argument is falsy, the mapped name when
 *   one exists, otherwise the argument unchanged.
 */
function normalizeProvider(provider) {
  if (provider) {
    const mapped = PROVIDER_MAPPING[provider];
    return mapped ?? provider;
  }
  return "google";
}
|
|
1768
|
+
/**
 * Computes the cost of one component (for example agent or judge) from its
 * token usage.
 *
 * @param {object} [tokenUsage] - Token counters; when falsy no cost is computed.
 * @param {string} [model] - Model name; "unknown" is passed on when nullish.
 * @param {string} [provider] - Explicit provider; when falsy the provider is
 *   detected from the model name instead.
 * @param {object} [config] - Pricing config; `providerPricing[provider]` is
 *   forwarded to `calculateCostFromUsage`.
 * @returns {number|undefined} The `total` of the calculated cost, or
 *   `undefined` without token usage.
 */
function calculateComponentCost(tokenUsage, model, provider, config) {
  if (!tokenUsage) {
    return void 0;
  }
  let resolvedProvider;
  if (provider) {
    resolvedProvider = normalizeProvider(provider);
  } else {
    resolvedProvider = detectProvider(model);
  }
  const pricing = config?.providerPricing?.[resolvedProvider];
  const usage = toLanguageModelUsage(tokenUsage);
  const { total } = calculateCostFromUsage(usage, model ?? "unknown", resolvedProvider, pricing);
  return total;
}
|
|
1780
|
+
/**
 * Adds a `total` to a per-component cost record.
 *
 * @param {object} costs - Optional `agent`, `judge` and `improver` costs;
 *   nullish entries count as 0.
 * @returns {object} Copy of `costs` with `total` set to the sum, or to
 *   `undefined` when the sum is not greater than zero.
 */
function buildCostBreakdown(costs) {
  const sum = ["agent", "judge", "improver"].reduce(
    (acc, component) => acc + (costs[component] ?? 0),
    0
  );
  return Object.assign({}, costs, { total: sum > 0 ? sum : void 0 });
}
|
|
1787
|
+
/**
 * Computes the agent and judge cost of a single test result.
 *
 * @param {object} result - Result with `metrics.tokenUsage` and optional
 *   `agentMetadata` / `judgeMetadata`.
 * @param {object} [config] - Pricing config forwarded to `calculateComponentCost`.
 * @returns {object} Breakdown produced by `buildCostBreakdown` from the
 *   `agent` and `judge` costs.
 */
function calculateResultCost(result, config) {
  const { metrics, agentMetadata, judgeMetadata } = result;
  const agent = calculateComponentCost(
    metrics.tokenUsage,
    agentMetadata?.model,
    agentMetadata?.provider,
    config
  );
  // The judge cost stays undefined unless the judge reported token usage.
  let judge;
  if (judgeMetadata?.tokenUsage) {
    judge = calculateComponentCost(
      judgeMetadata.tokenUsage,
      judgeMetadata.model,
      judgeMetadata.provider,
      config
    );
  }
  return buildCostBreakdown({ agent, judge });
}
|
|
1805
|
+
/**
 * Sums agent and judge costs over every result of a report.
 *
 * @param {object} report - Report whose `results` are costed one by one.
 * @param {object} [config] - Pricing config forwarded to `calculateResultCost`.
 * @returns {{ total: number, byComponent: { agent: number, judge: number } }}
 *   Totals; components without a cost contribute 0.
 */
function calculateReportCosts(report, config) {
  const byComponent = { agent: 0, judge: 0 };
  for (const result of report.results) {
    const { agent, judge } = calculateResultCost(result, config);
    byComponent.agent += agent ?? 0;
    byComponent.judge += judge ?? 0;
  }
  return {
    total: byComponent.agent + byComponent.judge,
    byComponent
  };
}
|
|
1821
|
+
/**
 * Returns copies of the results whose metrics carry a `costBreakdown`.
 *
 * Only the fields listed below are copied onto each new result object.
 *
 * @param {Array<object>} results - Results to enrich.
 * @param {object} [config] - Pricing config forwarded to `calculateResultCost`.
 * @returns {Array<object>} New result objects with `testCase`, `output`,
 *   `metrics` (`latencyMs`, `tokenUsage`, `costBreakdown`), `error`,
 *   `verdicts`, `overallScore` and `passed`.
 */
function addCostsToResults(results, config) {
  return results.map((result) => {
    const costBreakdown = calculateResultCost(result, config);
    const { testCase, output, metrics, error, verdicts, overallScore, passed } = result;
    const { latencyMs, tokenUsage } = metrics;
    return {
      testCase,
      output,
      metrics: { latencyMs, tokenUsage, costBreakdown },
      error,
      verdicts,
      overallScore,
      passed
    };
  });
}
|
|
1840
|
+
|
|
1841
|
+
// src/reporter/format-utils.ts
|
|
1842
|
+
import { mkdirSync } from "fs";
|
|
1843
|
+
import path from "path";
|
|
1844
|
+
/**
 * Formats a score difference with an explicit sign and one decimal place.
 *
 * @param {number|null|undefined} delta - Score change; a missing value
 *   (`null` or `undefined`) is rendered as "-" instead of throwing.
 * @returns {string} For example "+1.5", "-2.0", "+0.0" or "-".
 */
function formatScoreDelta(delta) {
  // `== null` deliberately matches both null and undefined.
  if (delta == null) {
    return "-";
  }
  // Negative numbers already carry their own minus sign from toFixed().
  const sign = delta >= 0 ? "+" : "";
  return `${sign}${delta.toFixed(1)}`;
}
|
|
1851
|
+
/**
 * Ensures the output directory exists and builds the path of a file in it.
 *
 * @param {string} outputDir - Directory, created recursively.
 * @param {string} name - Base file name without extension.
 * @param {string} extension - File extension without the leading dot.
 * @param {boolean} addTimestamp - When truthy, `-<Date.now()>` is appended to
 *   the base name.
 * @returns {string} `outputDir` joined with the resulting file name.
 */
function buildOutputPath(outputDir, name, extension, addTimestamp) {
  mkdirSync(outputDir, { recursive: true });
  const stem = addTimestamp ? `${name}-${Date.now()}` : `${name}`;
  return path.join(outputDir, `${stem}.${extension}`);
}
|
|
1856
|
+
/**
 * Converts `Date` instances to ISO 8601 strings and returns anything else as is.
 *
 * @param {*} value - Possibly a `Date`.
 * @returns {*} ISO string for dates, otherwise the original value.
 */
function toISOStringIfDate(value) {
  if (value instanceof Date) {
    return value.toISOString();
  }
  return value;
}
|
|
1859
|
+
|
|
1860
|
+
// src/reporter/json-reporter.ts
|
|
1861
|
+
/**
 * Reporter that writes an eval report to a JSON file.
 */
var JsonReporter = class {
  outputDir;
  pricing;
  addTimestamp;
  /**
   * @param {object} options
   * @param {string} options.outputDir - Directory the file is written to.
   * @param {object} [options.pricing] - When set, cost totals are added to the file.
   * @param {boolean} [options.addTimestamp] - Defaults to `true` when nullish.
   */
  constructor(options) {
    this.outputDir = options.outputDir;
    this.pricing = options.pricing;
    this.addTimestamp = options.addTimestamp ?? true;
  }
  /**
   * Serializes the report (two-space indented JSON) and writes it to disk.
   *
   * @param {object} report - Report to save.
   * @param {string} name - Base file name without extension.
   * @returns {string} Path of the written file.
   */
  save(report, name) {
    const filepath = buildOutputPath(this.outputDir, name, "json", this.addTimestamp);
    const costs = this.pricing ? calculateReportCosts(report, this.pricing) : void 0;
    const output = {
      summary: report.summary,
      results: report.results,
      suggestions: report.suggestions,
      // A report restored from JSON carries generatedAt as a string; only
      // real Date instances are converted, so both shapes can be saved.
      generatedAt: toISOStringIfDate(report.generatedAt),
      promptVersion: report.promptVersion,
      ...costs && { costs }
    };
    writeFileSync(filepath, JSON.stringify(output, null, 2));
    return filepath;
  }
};
|
|
1885
|
+
|
|
1886
|
+
// src/reporter/markdown-reporter.ts
|
|
1887
|
+
import { writeFileSync as writeFileSync2 } from "fs";
|
|
1888
|
+
/**
 * Reporter that writes an eval report to a Markdown file.
 */
var MarkdownReporter = class {
  outputDir;
  addTimestamp;
  markdownOptions;
  /**
   * @param {object} options
   * @param {string} options.outputDir - Directory the file is written to.
   * @param {boolean} [options.addTimestamp] - Defaults to `true` when nullish.
   * @param {object} [options.markdown] - Options for `reportToMarkdown`;
   *   defaults to an empty object when nullish.
   */
  constructor(options) {
    const { outputDir, addTimestamp, markdown } = options;
    this.outputDir = outputDir;
    this.addTimestamp = addTimestamp ?? true;
    this.markdownOptions = markdown ?? {};
  }
  /**
   * Renders the report and writes it to disk.
   *
   * @param {object} report - Report to save.
   * @param {string} name - Base file name without extension.
   * @returns {string} Path of the written file.
   */
  save(report, name) {
    const target = buildOutputPath(this.outputDir, name, "md", this.addTimestamp);
    writeFileSync2(target, reportToMarkdown(report, this.markdownOptions));
    return target;
  }
};
|
|
1904
|
+
|
|
1905
|
+
// src/reporter/console-reporter.ts
|
|
1906
|
+
/**
 * Reporter that prints an eval report to the console.
 */
var ConsoleReporter = class {
  verbosity;
  pricing;
  /**
   * @param {object} [options]
   * @param {string} [options.verbosity] - "summary" (default when nullish)
   *   prints only the header; any other value also prints one line per test,
   *   and "full" additionally prints each test's input and output.
   * @param {object} [options.pricing] - When set, the total cost is printed.
   */
  constructor(options = {}) {
    this.verbosity = options.verbosity ?? "summary";
    this.pricing = options.pricing;
  }
  /**
   * Prints the report according to the configured verbosity.
   *
   * @param {object} report - Report with `summary` and `results`.
   */
  log(report) {
    const { summary } = report;
    const passRate = summary.totalTests > 0 ? summary.passed / summary.totalTests : 0;
    console.log(`\n\u{1F4CA} Eval Report: ${summary.totalTests} tests`);
    console.log(` Score: ${summary.avgScore.toFixed(1)} | Pass Rate: ${(passRate * 100).toFixed(0)}%`);
    if (this.verbosity === "summary") {
      this.logCostIfAvailable(report);
      return;
    }
    // Always hands truncate() a string: JSON.stringify returns undefined for
    // undefined/function values and throws on cycles, so fall back to String().
    const toJsonText = (value) => {
      try {
        return JSON.stringify(value) ?? String(value);
      } catch {
        return String(value);
      }
    };
    // Strings are shown verbatim; everything else is shown as JSON so object
    // outputs no longer print as "[object Object]".
    const toOutputText = (value) => (typeof value === "string" ? value : toJsonText(value));
    console.log("");
    for (const result of report.results) {
      const testId = result.testCase.id || "unknown";
      const status = result.passed ? "\u2713" : "\u2717";
      console.log(` ${status} [${testId}] Score: ${result.overallScore.toFixed(1)}`);
      if (this.verbosity === "full") {
        console.log(` Input: ${truncate(toJsonText(result.testCase.input), 80)}`);
        console.log(` Output: ${truncate(toOutputText(result.output), 80)}`);
      }
    }
    this.logCostIfAvailable(report);
  }
  /**
   * Prints the total cost when pricing was configured; otherwise does nothing.
   *
   * @param {object} report - Report passed to `calculateReportCosts`.
   */
  logCostIfAvailable(report) {
    if (this.pricing) {
      const costs = calculateReportCosts(report, this.pricing);
      console.log(`\n\u{1F4B0} Cost: $${costs.total.toFixed(4)}`);
    }
  }
};
|
|
1943
|
+
|
|
1944
|
+
// src/reporter/composite-reporter.ts
|
|
1945
|
+
/**
 * Reporter that fans a report out to several underlying reporters.
 */
var CompositeReporter = class {
  /**
   * @param {Array<object>} reporters - Reporters exposing `save` and/or `log`.
   */
  constructor(reporters) {
    this.reporters = reporters;
  }
  /**
   * Saves to all reporters that support saving and calls `log` on every
   * reporter that has one.
   * Returns the first successful file path (usually JsonReporter).
   *
   * A failing `save` does not stop the remaining reporters; failures are only
   * reported when no reporter produced a path.
   *
   * @param {object} report - Report to save.
   * @param {string} name - Base file name forwarded to each `save`.
   * @returns {string} First truthy path returned by a reporter's `save`.
   * @throws {Error} When no reporter saved successfully; the message lists
   *   each failure, or states that no reporter supports `save()`.
   */
  save(report, name) {
    const errors = [];
    let firstPath;
    for (const reporter of this.reporters) {
      if (reporter.save) {
        try {
          const savedPath = reporter.save(report, name);
          if (!firstPath) firstPath = savedPath;
        } catch (error) {
          errors.push({
            // Reporters without a usable constructor must not make the
            // error bookkeeping itself throw.
            reporter: reporter.constructor?.name ?? "UnknownReporter",
            error
          });
        }
      }
      reporter.log?.(report);
    }
    if (!firstPath) {
      // Thrown values are not guaranteed to be Error instances.
      const describe = (error) => error instanceof Error ? error.message : String(error);
      const details = errors.length > 0 ? errors.map((e) => `${e.reporter}: ${describe(e.error)}`).join(", ") : "No reporters support save()";
      throw new Error(`No reporter saved successfully. ${details}`);
    }
    return firstPath;
  }
  /**
   * Calls `log` on every reporter that implements it.
   *
   * @param {object} report - Report to log.
   */
  log(report) {
    for (const reporter of this.reporters) {
      reporter.log?.(report);
    }
  }
};
|
|
1984
|
+
|
|
1985
|
+
// src/reporter/factory.ts
|
|
1986
|
+
/**
 * Creates a `JsonReporter` for the given directory.
 *
 * @param {string} outputDir - Output directory.
 * @param {object} [options] - Extra reporter options; spread after
 *   `outputDir`, so its keys take precedence.
 * @returns {JsonReporter} New reporter instance.
 */
function createJsonReporter(outputDir, options) {
  const config = { outputDir, ...options };
  return new JsonReporter(config);
}
|
|
1989
|
+
/**
 * Creates a `MarkdownReporter` for the given directory.
 *
 * @param {string} outputDir - Output directory.
 * @param {object} [options] - Extra reporter options; spread after
 *   `outputDir`, so its keys take precedence.
 * @returns {MarkdownReporter} New reporter instance.
 */
function createMarkdownReporter(outputDir, options) {
  const config = { outputDir, ...options };
  return new MarkdownReporter(config);
}
|
|
1992
|
+
/**
 * Creates a `ConsoleReporter`.
 *
 * @param {object} [options] - Passed to the `ConsoleReporter` constructor unchanged.
 * @returns {ConsoleReporter} New reporter instance.
 */
function createConsoleReporter(options) {
  const reporter = new ConsoleReporter(options);
  return reporter;
}
|
|
1995
|
+
/**
 * Creates a `CompositeReporter` wrapping the given reporters.
 *
 * @param {Array<object>} reporters - Passed to the constructor unchanged.
 * @returns {CompositeReporter} New reporter instance.
 */
function createCompositeReporter(reporters) {
  const composite = new CompositeReporter(reporters);
  return composite;
}
|
|
1998
|
+
/**
 * Creates the default reporter: a `CompositeReporter` holding a
 * `JsonReporter` followed by a `ConsoleReporter`.
 *
 * @param {string} outputDir - Directory for the JSON reporter.
 * @param {object} [options] - Optional `pricing` (given to both reporters),
 *   `addTimestamp` (JSON reporter) and `verbosity` (console reporter).
 * @returns {CompositeReporter} New composite reporter.
 */
function createDefaultReporter(outputDir, options) {
  const pricing = options?.pricing;
  const json = new JsonReporter({
    outputDir,
    pricing,
    addTimestamp: options?.addTimestamp
  });
  const consoleOut = new ConsoleReporter({
    verbosity: options?.verbosity,
    pricing
  });
  return new CompositeReporter([json, consoleOut]);
}
|
|
2011
|
+
|
|
2012
|
+
// src/reporter/runner.ts
|
|
2013
|
+
/**
 * Creates a function that runs a suite, logs the report and saves it as JSON.
 *
 * @param {object} options
 * @param {string} options.outputDir - Directory for the JSON report.
 * @param {object} [options.pricing] - Pricing config given to both reporters.
 * @param {string|false} [options.verbosity] - `false` disables console
 *   output; any other falsy value falls back to "summary".
 * @returns {Function} Async `(suite, testCases2, name)` that resolves to
 *   `{ report, savedPath }`.
 */
function createReportRunner(options) {
  const { outputDir, pricing, verbosity } = options;
  const jsonReporter = new JsonReporter({ outputDir, pricing });
  let consoleReporter = null;
  if (verbosity !== false) {
    consoleReporter = new ConsoleReporter({ verbosity: verbosity || "summary", pricing });
  }
  return async (suite, testCases2, name) => {
    const report = await suite.run(testCases2);
    // Logging happens before the file is written.
    if (consoleReporter) {
      consoleReporter.log(report);
    }
    return { report, savedPath: jsonReporter.save(report, name) };
  };
}
|
|
2024
|
+
|
|
2025
|
+
// src/reporter/cycle-json.ts
|
|
2026
|
+
import { writeFileSync as writeFileSync3, mkdirSync as mkdirSync2 } from "fs";
|
|
2027
|
+
import path2 from "path";
|
|
2028
|
+
/**
 * Writes the JSON artifacts of an improvement cycle into one directory.
 *
 * @param {object} result - Cycle result; its `rounds` are saved when requested.
 * @param {object} options
 * @param {string} [options.directory] - Exact target directory.
 * @param {string} [options.outputDir] - Parent directory, used with `name`.
 * @param {string} [options.name] - Directory name prefix, used with `outputDir`.
 * @param {boolean} [options.saveRounds=true] - Whether per-round reports are written.
 * @returns {string} The directory the files were written to.
 */
function saveCycleJson(result, options) {
  const { saveRounds = true } = options;
  const cycleDir = resolveCycleDirectory(options.outputDir, options.name, options.directory);
  mkdirSync2(cycleDir, { recursive: true });
  saveCycleSummary(cycleDir, result);
  if (saveRounds) {
    saveRoundReports(cycleDir, result.rounds);
  }
  return cycleDir;
}
|
|
2038
|
+
/**
 * Determines the directory for cycle output.
 *
 * @param {string} [outputDir] - Parent directory.
 * @param {string} [name] - Name prefix of the generated directory.
 * @param {string} [directory] - Explicit directory; wins when truthy.
 * @returns {string} `directory`, or `outputDir/<name>-<Date.now()>`.
 * @throws {Error} When neither `directory` nor both `outputDir` and `name` are given.
 */
function resolveCycleDirectory(outputDir, name, directory) {
  if (directory) {
    return directory;
  }
  if (!outputDir || !name) {
    throw new Error('saveCycleJson requires either "directory" or both "outputDir" and "name"');
  }
  const folder = `${name}-${Date.now()}`;
  return path2.join(outputDir, folder);
}
|
|
2047
|
+
/**
 * Writes `cycle-summary.json` with a condensed view of every round.
 *
 * @param {string} cycleDir - Existing directory to write into.
 * @param {object} result - Cycle result with `rounds`, `terminationReason`
 *   and `totalCost`.
 */
function saveCycleSummary(cycleDir, result) {
  const { rounds } = result;
  // Each round is reduced to scalar fields; suggestion lists become counts.
  const roundSummaries = rounds.map((round) => {
    return {
      round: round.round,
      completedAt: toISOStringIfDate(round.completedAt),
      score: round.report.summary.avgScore,
      scoreDelta: round.scoreDelta,
      cost: round.cost,
      suggestionsGenerated: round.suggestionsGenerated.length,
      suggestionsApproved: round.suggestionsApproved.length,
      promptVersionAfter: round.promptVersionAfter
    };
  });
  const firstRound = rounds[0];
  const lastRound = rounds[rounds.length - 1];
  const summary = {
    rounds: roundSummaries,
    terminationReason: result.terminationReason,
    totalCost: result.totalCost,
    roundCount: rounds.length,
    // Scores are null when the cycle has no rounds.
    initialScore: firstRound?.report.summary.avgScore ?? null,
    finalScore: lastRound?.report.summary.avgScore ?? null
  };
  writeFileSync3(path2.join(cycleDir, "cycle-summary.json"), JSON.stringify(summary, null, 2));
}
|
|
2068
|
+
/**
 * Writes one `round-<n>-report.json` file per round.
 *
 * @param {string} cycleDir - Existing directory to write into.
 * @param {Iterable<object>} rounds - Rounds to serialize.
 */
function saveRoundReports(cycleDir, rounds) {
  for (const round of rounds) {
    const { report } = round;
    const payload = {
      round: round.round,
      completedAt: toISOStringIfDate(round.completedAt),
      // Copy the report, replacing generatedAt with its ISO string form.
      report: Object.assign({}, report, {
        generatedAt: toISOStringIfDate(report.generatedAt)
      }),
      suggestionsGenerated: round.suggestionsGenerated,
      suggestionsApproved: round.suggestionsApproved,
      promptSnapshot: round.promptSnapshot,
      cost: round.cost,
      scoreDelta: round.scoreDelta
    };
    const target = path2.join(cycleDir, `round-${round.round}-report.json`);
    writeFileSync3(target, JSON.stringify(payload, null, 2));
  }
}
|
|
2087
|
+
|
|
2088
|
+
// src/reporter/cycle-console.ts
|
|
2089
|
+
/**
 * Prints a summary of an improvement cycle to the console.
 *
 * @param {object} result - Cycle result with `rounds`, `terminationReason`
 *   and `totalCost`.
 * @param {object} [options]
 * @param {string} [options.verbosity="summary"] - Verbosity for per-round output.
 * @param {boolean} [options.showRounds=false] - When truthy, every round's
 *   report is logged through a `ConsoleReporter`.
 */
function logCycle(result, options = {}) {
  const { verbosity = "summary", showRounds = false } = options;
  const { rounds } = result;
  console.log("\n\u{1F504} Improvement Cycle Complete");
  console.log(` Rounds: ${rounds.length}`);
  console.log(` Termination: ${result.terminationReason}`);
  console.log(` Total Cost: $${result.totalCost.toFixed(4)}`);
  if (rounds.length > 0) {
    const firstScore = rounds[0].report.summary.avgScore;
    const lastScore = rounds[rounds.length - 1].report.summary.avgScore;
    const change = formatScoreDelta(lastScore - firstScore);
    console.log(` Score: ${firstScore.toFixed(1)} -> ${lastScore.toFixed(1)} (${change})`);
  }
  if (!showRounds) {
    return;
  }
  const consoleReporter = new ConsoleReporter({ verbosity });
  for (const round of rounds) {
    console.log(`\n-- Round ${round.round} --`);
    consoleReporter.log(round.report);
  }
}
|
|
2110
|
+
|
|
2111
|
+
// src/reporter/cycle-markdown.ts
|
|
2112
|
+
import { writeFileSync as writeFileSync4 } from "fs";
|
|
2113
|
+
/**
 * Renders an improvement cycle as a Markdown document.
 *
 * Sections: summary table, score progression table, optional per-round
 * details, and optional prompt evolution (initial and final prompt).
 *
 * @param {object} result - Cycle result with `rounds`, `terminationReason`,
 *   `totalCost` and `finalPrompt`.
 * @param {object} [options]
 * @param {boolean} [options.includeRoundDetails=true] - Embed each round's report.
 * @param {boolean} [options.showPromptEvolution=false] - Append the prompt
 *   evolution section when there is at least one round.
 * @returns {string} Markdown text joined with newlines.
 */
function cycleToMarkdown(result, options = {}) {
  const { includeRoundDetails = true, showPromptEvolution = false } = options;
  const { rounds } = result;
  const md = [
    "# Improvement Cycle Report",
    "",
    "## Summary",
    "",
    "| Metric | Value |",
    "|--------|-------|",
    `| Rounds | ${rounds.length} |`,
    `| Termination | ${result.terminationReason} |`,
    `| Total Cost | $${result.totalCost.toFixed(4)} |`
  ];
  if (rounds.length > 0) {
    const first = rounds[0].report.summary.avgScore;
    const last = rounds[rounds.length - 1].report.summary.avgScore;
    md.push(
      `| Initial Score | ${first.toFixed(1)} |`,
      `| Final Score | ${last.toFixed(1)} |`,
      `| Improvement | ${formatScoreDelta(last - first)} |`
    );
  }

  md.push("", "## Score Progression", "", "| Round | Score | Delta | Cost |", "|-------|-------|-------|------|");
  for (const round of rounds) {
    const delta = formatScoreDelta(round.scoreDelta);
    md.push(
      `| ${round.round} | ${round.report.summary.avgScore.toFixed(1)} | ${delta} | $${round.cost.total.toFixed(4)} |`
    );
  }
  md.push("");

  if (includeRoundDetails) {
    md.push("## Round Details", "");
    for (const round of rounds) {
      md.push(`### Round ${round.round}`, "", reportToMarkdown(round.report), "");
    }
  }

  if (showPromptEvolution && rounds.length > 0) {
    md.push("## Prompt Evolution", "");
    md.push("### Initial Prompt", "", "```", rounds[0].promptSnapshot.userTemplate, "```", "");
    md.push("### Final Prompt", "", "```");
    const finalPrompt = result.finalPrompt;
    // Only a string template can be shown; otherwise print a placeholder.
    const hasTemplate = "userTemplate" in finalPrompt && typeof finalPrompt.userTemplate === "string";
    md.push(hasTemplate ? finalPrompt.userTemplate : "[Compiled prompt - template not available]");
    md.push("```");
  }
  return md.join("\n");
}
|
|
2176
|
+
/**
 * Renders an improvement cycle to Markdown and writes it to a file.
 *
 * @param {object} result - Cycle result passed to `cycleToMarkdown`.
 * @param {string} filePath - Destination file path.
 * @param {object} [options] - Rendering options forwarded to `cycleToMarkdown`.
 */
function saveCycleMarkdown(result, filePath, options) {
  writeFileSync4(filePath, cycleToMarkdown(result, options));
}
|
|
2180
|
+
|
|
2181
|
+
// src/improver/utils.ts
|
|
2182
|
+
import { compileTemplate } from "@agtlantis/core";
|
|
2183
|
+
/**
 * Builds a diff-like text for a suggestion: two header lines, an empty line,
 * every current line prefixed with "- ", then every suggested line prefixed
 * with "+ ".
 *
 * @param {object} suggestion - Suggestion with `type`, `currentValue` and
 *   `suggestedValue`.
 * @returns {string} Diff text joined with newlines.
 */
function suggestionDiff(suggestion) {
  const removed = suggestion.currentValue.split("\n").map((line) => `- ${line}`);
  const added = suggestion.suggestedValue.split("\n").map((line) => `+ ${line}`);
  return [
    `--- ${suggestion.type} (current)`,
    `+++ ${suggestion.type} (suggested)`,
    "",
    ...removed,
    ...added
  ].join("\n");
}
|
|
2198
|
+
/**
 * Builds a plain-text preview of a suggestion showing its metadata followed
 * by the full current and suggested values.
 *
 * @param {object} suggestion - Suggestion with `type`, `priority`,
 *   `reasoning`, `expectedImprovement`, `currentValue` and `suggestedValue`.
 * @returns {string} Preview text joined with newlines.
 */
function suggestionPreview(suggestion) {
  const { type, priority, reasoning, expectedImprovement, currentValue, suggestedValue } = suggestion;
  return [
    `=== Suggestion Preview ===`,
    `Type: ${type}`,
    `Priority: ${priority}`,
    ``,
    `Reasoning: ${reasoning}`,
    ``,
    `Expected Improvement: ${expectedImprovement}`,
    ``,
    `--- Current Value ---`,
    currentValue,
    ``,
    `--- Suggested Value ---`,
    suggestedValue
  ].join("\n");
}
|
|
2215
|
+
/**
 * Builds a one-line summary: `[PRIORITY] type: reasoning`, with the reasoning
 * passed through `truncate` with a limit of 60.
 *
 * @param {object} suggestion - Suggestion with `priority`, `type` and `reasoning`.
 * @returns {string} Summary line.
 */
function suggestionSummary(suggestion) {
  const priorityLabel = suggestion.priority.toUpperCase();
  const shortReasoning = truncate(suggestion.reasoning, 60);
  return `[${priorityLabel}] ${suggestion.type}: ${shortReasoning}`;
}
|
|
2219
|
+
/**
 * Replaces `search` in `str` with `replacement` taken literally.
 *
 * The replacement is supplied through a callback so that `$`-sequences such
 * as `$&` or `$1` are inserted as plain text instead of being interpreted as
 * replacement patterns.
 *
 * @param {string} str - Text to search in.
 * @param {string|RegExp} search - Pattern handed to `String.prototype.replace`.
 * @param {string} replacement - Text inserted verbatim.
 * @returns {string} The resulting text.
 */
function safeReplace(str, search, replacement) {
  const insertLiteral = () => replacement;
  return str.replace(search, insertLiteral);
}
|
|
2222
|
+
/**
 * Increments one component of an `x.y.z` version string.
 *
 * @param {string} version - Version made of three dot-separated integers.
 * @param {"major"|"minor"|"patch"} bump - Component to increment; lower
 *   components are reset to 0.
 * @returns {string} The bumped version.
 * @throws {EvalError} When `version` does not have three numeric parts, or
 *   when `bump` is not one of the supported values (previously this case
 *   silently returned `undefined`).
 */
function bumpVersion(version, bump) {
  const parts = version.split(".").map((n) => Number.parseInt(n, 10));
  if (parts.length !== 3 || parts.some((part) => Number.isNaN(part))) {
    throw new EvalError(
      `Invalid version format: "${version}". Expected semver format (x.y.z)`,
      {
        code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
        context: { version, expectedFormat: "x.y.z" }
      }
    );
  }
  const [major, minor, patch] = parts;
  switch (bump) {
    case "major":
      return `${major + 1}.0.0`;
    case "minor":
      return `${major}.${minor + 1}.0`;
    case "patch":
      return `${major}.${minor}.${patch + 1}`;
    default:
      // Fail loudly instead of handing the caller an undefined version.
      throw new EvalError(
        `Invalid bump type: "${bump}". Expected "major", "minor", or "patch"`,
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: { version, bump }
        }
      );
  }
}
|
|
2243
|
+
/**
 * Applies every approved suggestion to a prompt, one after another.
 *
 * @param {object} currentPrompt - Prompt to start from; it is shallow-copied
 *   before suggestions are applied.
 * @param {Array<object>} suggestions - Only entries with a truthy `approved`
 *   are considered.
 * @param {object} [options]
 * @param {string} [options.bumpVersion] - When truthy and at least one
 *   suggestion was applied, the version is bumped from
 *   `currentPrompt.version` with this bump type.
 * @returns {{ prompt: object, appliedCount: number, skipped: Array<object> }}
 *   The resulting prompt, the number of applied suggestions, and the
 *   suggestions that could not be applied together with the reason. Without
 *   approved suggestions the original prompt object is returned unchanged.
 */
function applyPromptSuggestions(currentPrompt, suggestions, options) {
  const approved = suggestions.filter((candidate) => candidate.approved);
  if (approved.length === 0) {
    return { prompt: currentPrompt, appliedCount: 0, skipped: [] };
  }

  const skipped = [];
  let appliedCount = 0;
  let working = { ...currentPrompt };
  for (const suggestion of approved) {
    // Each suggestion sees the prompt as modified by the previous ones.
    const outcome = applySingleSuggestion(working, suggestion);
    if (!outcome.success) {
      skipped.push({ suggestion, reason: outcome.reason });
      continue;
    }
    working = outcome.prompt;
    appliedCount++;
  }

  const bumpType = options?.bumpVersion;
  if (bumpType && appliedCount > 0) {
    working = Object.assign({}, working, {
      version: bumpVersion(currentPrompt.version, bumpType)
    });
  }
  return { prompt: working, appliedCount, skipped };
}
|
|
2276
|
+
/**
 * Names of the core prompt fields. Keys listed here are skipped when a
 * "parameters" suggestion searches the prompt for a field to update.
 */
var AGENT_PROMPT_CORE_FIELDS = ["id", "version", "system", "renderUserPrompt", "userTemplate"];
|
|
2283
|
+
/**
 * Applies one suggestion to a prompt and reports whether it could be applied.
 * The prompt passed in is never mutated; a shallow copy carries the change.
 *
 * @param {object} prompt - Agent prompt. `system` is read for "system_prompt"
 *   suggestions, `userTemplate` for "user_prompt" suggestions, and every
 *   non-core string field for "parameters" suggestions.
 * @param {object} suggestion - Provides `type`, `currentValue` and `suggestedValue`.
 * @returns {{success: true, prompt: object} | {success: false, reason: string}}
 * @throws {EvalError} SUGGESTION_APPLY_ERROR when a "user_prompt" suggestion is
 *   applied to a prompt whose `userTemplate` is not a string.
 */
function applySingleSuggestion(prompt, suggestion) {
  const { type, currentValue, suggestedValue } = suggestion;

  // All three "not found" failures share one message shape.
  const notFound = (where) => ({
    success: false,
    reason: `currentValue not found in ${where}: "${truncate(currentValue, 50)}"`
  });

  if (type === "system_prompt") {
    if (!prompt.system.includes(currentValue)) {
      return notFound("system prompt");
    }
    const system = safeReplace(prompt.system, currentValue, suggestedValue);
    return { success: true, prompt: { ...prompt, system } };
  }

  if (type === "user_prompt") {
    const template = prompt.userTemplate;
    if (typeof template !== "string") {
      throw new EvalError(
        "Cannot apply user_prompt suggestion: prompt does not have a userTemplate field. The renderUserPrompt is a function and cannot be modified directly.",
        {
          code: "SUGGESTION_APPLY_ERROR" /* SUGGESTION_APPLY_ERROR */,
          context: {
            suggestionType: type,
            hasUserTemplate: "userTemplate" in prompt
          }
        }
      );
    }
    if (!template.includes(currentValue)) {
      return notFound("userTemplate");
    }
    const rewritten = safeReplace(template, currentValue, suggestedValue);
    return {
      success: true,
      prompt: {
        ...prompt,
        userTemplate: rewritten,
        // The render function is recompiled so it stays in sync with the template.
        renderUserPrompt: compileTemplate(rewritten, prompt.id)
      }
    };
  }

  if (type === "parameters") {
    const updated = { ...prompt };
    // Only the first matching non-core string field is rewritten.
    const match = Object.entries(updated).find(
      ([key, value]) =>
        !AGENT_PROMPT_CORE_FIELDS.includes(key) &&
        typeof value === "string" &&
        value.includes(currentValue)
    );
    if (match === undefined) {
      return notFound("any parameter field");
    }
    const [key, value] = match;
    updated[key] = safeReplace(value, currentValue, suggestedValue);
    return { success: true, prompt: updated };
  }

  return {
    success: false,
    reason: `Unknown suggestion type: ${type}`
  };
}
|
|
2377
|
+
|
|
2378
|
+
// src/improver/llm-improver.ts
|
|
2379
|
+
import { Output as Output2 } from "ai";
|
|
2380
|
+
import { z as z2 } from "zod";
|
|
2381
|
+
|
|
2382
|
+
// src/improver/prompts/default.ts
|
|
2383
|
+
/**
 * Built-in prompt used by the improver when the caller supplies none.
 *
 * `system` instructs the model to answer with a JSON list of suggestions.
 * `renderUserPrompt(ctx)` renders the agent prompt under review, a pass/fail
 * summary, aggregated metrics and the details of failed or low-scoring cases.
 *
 * The rendered prompt includes the agent's `userTemplate` whenever it is a
 * string. Suggestions of type "user_prompt" can only be applied when their
 * `currentValue` is an exact substring of that template, so the model has to
 * see it to propose one that applies.
 */
var defaultImproverPrompt = {
  id: "default-improver",
  version: "2.0.0",
  system: `You are an expert prompt engineer specializing in optimizing AI Agent prompts.

Your role is to analyze test results and evaluation feedback to propose targeted improvements.

## Improvement Principles

1. **Focus on Impact**: Prioritize changes that address the lowest-scoring criteria
- Target specific failure patterns, not general improvements
- One well-crafted change is better than many superficial ones

2. **Be Specific and Actionable**: Provide concrete changes, not vague suggestions
- Show exact text to add, modify, or remove
- Explain the mechanism by which the change will help

3. **Consider Trade-offs**: Evaluate side effects of each change
- Will this fix break other test cases?
- Does it increase prompt length/cost significantly?
- Could it introduce new failure modes?

4. **Maintain Prompt Quality**: Preserve clarity and structure
- Keep prompts readable and maintainable
- Avoid over-engineering or excessive constraints
- Ensure changes align with the agent's core purpose

## Suggestion Priority Levels
- **high**: Critical issues causing test failures, should be addressed immediately
- **medium**: Issues affecting quality scores, recommended for next iteration
- **low**: Minor optimizations, nice-to-have improvements

## Response Format

You MUST respond with valid JSON only. No additional text outside the JSON structure.

{
"suggestions": [
{
"type": "system_prompt" | "user_prompt" | "parameters",
"priority": "high" | "medium" | "low",
"currentValue": "The specific text or value being changed",
"suggestedValue": "The proposed replacement text or value",
"reasoning": "Why this change addresses the identified issue",
"expectedImprovement": "Predicted impact on scores and behavior"
}
]
}`,
  renderUserPrompt: (ctx) => {
    const failedDetails = buildFailedCaseDetails(ctx.evaluatedResults);
    const { userTemplate } = ctx.agentPrompt;
    // Empty when the prompt has no string template, which leaves the rendered
    // text exactly as it was before this section existed.
    const userTemplateSection = typeof userTemplate === "string" ? `
### User Prompt Template
\`\`\`
${userTemplate}
\`\`\`
` : "";
    return `
## Current Agent Prompt

### System Prompt
\`\`\`
${ctx.agentPrompt.system}
\`\`\`
${userTemplateSection}
## Test Results Summary
- Total tests: ${ctx.evaluatedResults.length}
- Passed: ${ctx.evaluatedResults.filter((r) => r.passed).length}
- Failed: ${ctx.evaluatedResults.filter((r) => !r.passed).length}

## Performance Metrics
- Average latency: ${ctx.aggregatedMetrics.avgLatencyMs}ms
- Total tokens used: ${ctx.aggregatedMetrics.totalTokens}

## Failed/Low-Score Cases Details
${failedDetails}

Based on the above results, please propose specific prompt improvements.`.trim();
  }
};
|
|
2456
|
+
/**
 * Renders a markdown section for every result that failed or scored below 70.
 *
 * @param {Array<object>} results - Evaluated results; each is read for
 *   `passed`, `overallScore`, `testCase`, `output` and `verdicts`.
 * @returns {string} One block per selected result joined with newlines, or a
 *   fixed placeholder sentence when nothing was selected.
 */
function buildFailedCaseDetails(results) {
  const needsAttention = results.filter((r) => !r.passed || r.overallScore < 70);
  if (needsAttention.length === 0) {
    return "(None - all tests passed with acceptable scores)";
  }

  const renderCase = (r) => {
    const heading = `### ${r.testCase.id ?? "unnamed"} (Score: ${r.overallScore})`;
    const input = `**Input:** ${truncate(JSON.stringify(r.testCase.input), 200)}`;
    const output = `**Output:** ${truncate(JSON.stringify(r.output), 200)}`;
    const verdictLines = r.verdicts
      .map((v) => `- ${v.criterionId}: ${v.score}/100 - ${v.reasoning}`)
      .join("\n");
    // The leading empty entry keeps the blank line that precedes each heading.
    return ["", heading, input, output, "**Evaluation:**", verdictLines].join("\n");
  };

  return needsAttention.map(renderCase).join("\n");
}
|
|
2470
|
+
|
|
2471
|
+
// src/improver/llm-improver.ts
|
|
2472
|
+
/**
 * Normalizes a provider usage record into a usage object whose three counters
 * are always present.
 *
 * Missing `inputTokens` / `outputTokens` become 0. A missing `totalTokens`
 * falls back to the sum of the two, so a record that reports only the
 * components no longer yields a total of 0.
 *
 * @param {{inputTokens?: number, outputTokens?: number, totalTokens?: number}} usage
 * @returns {{inputTokens: number, outputTokens: number, totalTokens: number}}
 */
function toEvalTokenUsage2(usage) {
  const inputTokens = usage.inputTokens ?? 0;
  const outputTokens = usage.outputTokens ?? 0;
  return {
    inputTokens,
    outputTokens,
    totalTokens: usage.totalTokens ?? inputTokens + outputTokens
  };
}
|
|
2479
|
+
// Shape the improver model must return: a `suggestions` array in which every
// entry carries a type, a priority and four free-text fields.
var ImproverResponseSchema = z2.object({
  suggestions: z2
    .object({
      type: z2.enum(["system_prompt", "user_prompt", "parameters"]),
      priority: z2.enum(["high", "medium", "low"]),
      currentValue: z2.string(),
      suggestedValue: z2.string(),
      reasoning: z2.string(),
      expectedImprovement: z2.string()
    })
    .array()
});
|
|
2491
|
+
/**
 * Summarizes latency and token usage over a list of evaluated results.
 *
 * @param {Array<{metrics: {latencyMs: number, tokenUsage: {totalTokens: number}}}>} results
 * @returns {{avgLatencyMs: number, totalTokens: number}} Rounded mean latency
 *   and summed tokens; both are 0 for an empty list.
 */
function aggregateMetrics(results) {
  const count = results.length;
  if (count === 0) {
    return { avgLatencyMs: 0, totalTokens: 0 };
  }
  const sums = results.reduce(
    (acc, { metrics }) => {
      acc.latency += metrics.latencyMs;
      acc.tokens += metrics.tokenUsage.totalTokens;
      return acc;
    },
    { latency: 0, tokens: 0 }
  );
  return {
    avgLatencyMs: Math.round(sums.latency / count),
    totalTokens: sums.tokens
  };
}
|
|
2509
|
+
/**
 * Builds an improver that asks an LLM for prompt-improvement suggestions.
 *
 * @param {object} config
 * @param {object} config.provider - Must expose `simpleExecution(fn)` returning
 *   an execution whose `result()` resolves to `{status, value, summary, error}`.
 * @param {object} [config.prompt] - Improver prompt; defaults to `defaultImproverPrompt`.
 * @param {*} [config.model] - Passed through into the returned metadata.
 * @returns {{improve: Function}} `improve(agentPrompt, results)` resolves to
 *   `{suggestions, metadata}`; every suggestion gets `approved` and `modified`
 *   set to undefined, and `metadata` is undefined when no usage was reported.
 */
function createImprover(config) {
  const { provider, model } = config;
  const improverPrompt = config.prompt === undefined ? defaultImproverPrompt : config.prompt;

  const improve = async (agentPrompt, results) => {
    const renderContext = {
      agentPrompt,
      evaluatedResults: results,
      aggregatedMetrics: aggregateMetrics(results)
    };
    const messages = [
      { role: "system", content: improverPrompt.system },
      { role: "user", content: improverPrompt.renderUserPrompt(renderContext) }
    ];

    let parsed;
    let usage;
    try {
      const outcome = await provider
        .simpleExecution(async (session) => {
          const generated = await session.generateText({
            messages,
            output: Output2.object({ schema: ImproverResponseSchema })
          });
          return generated.output;
        })
        .result();
      if (outcome.status !== "succeeded") {
        // Thrown inside the try on purpose: the catch below wraps it like any
        // other provider failure.
        throw outcome.status === "failed" ? outcome.error : new Error("Execution was canceled");
      }
      parsed = outcome.value;
      usage = outcome.summary.totalLLMUsage;
    } catch (cause) {
      throw EvalError.from(cause, "LLM_API_ERROR" /* LLM_API_ERROR */, {
        promptId: improverPrompt.id,
        promptVersion: improverPrompt.version
      });
    }

    const suggestions = parsed.suggestions.map((entry) => ({
      ...entry,
      approved: void 0,
      modified: void 0
    }));
    const metadata = usage ? { tokenUsage: toEvalTokenUsage2(usage), model } : void 0;
    return { suggestions, metadata };
  };

  return { improve };
}
|
|
2554
|
+
|
|
2555
|
+
// src/index.ts
|
|
2556
|
+
import { mock, MockProvider } from "@agtlantis/core/testing";
|
|
2557
|
+
|
|
2558
|
+
// src/testing/mock-agent.ts
|
|
2559
|
+
/**
 * Creates an in-memory agent for tests.
 *
 * @param {object} [config]
 * @param {string} [config.name="MockAgent"]
 * @param {string} [config.description="A mock agent for testing"]
 * @param {*} [config.response={}] - Value returned as `result`.
 * @param {object} [config.tokenUsage] - Usage reported in `metadata`.
 * @param {number} [config.delay=0] - Milliseconds to wait before answering.
 * @param {boolean} [config.shouldError=false] - Reject with `errorMessage`.
 * @param {string} [config.errorMessage="Mock agent execution failed"]
 * @param {Function} [config.executeFn] - When given, it replaces the whole
 *   default execution, so `delay` and `shouldError` are not consulted.
 * @returns {{config: object, prompt: object, execute: Function}}
 */
function createMockAgent(config = {}) {
  const {
    name = "MockAgent",
    description = "A mock agent for testing",
    response = {},
    tokenUsage = { inputTokens: 10, outputTokens: 20, totalTokens: 30 },
    delay = 0,
    shouldError = false,
    errorMessage = "Mock agent execution failed",
    executeFn
  } = config;

  const runDefault = async () => {
    if (delay > 0) {
      await new Promise((done) => setTimeout(done, delay));
    }
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { result: response, metadata: { tokenUsage } };
  };

  return {
    config: { name, description },
    prompt: {
      id: "mock-prompt",
      version: "1.0.0",
      system: "You are a mock agent",
      renderUserPrompt: (input) => JSON.stringify(input)
    },
    execute: async (input) => (executeFn ? executeFn(input) : runDefault())
  };
}
|
|
2595
|
+
/**
 * Creates an in-memory judge for tests.
 *
 * @param {object} [config]
 * @param {number} [config.score=80] - Returned as `overallScore`.
 * @param {boolean} [config.passed=true]
 * @param {Array<object>} [config.verdicts] - Defaults to one passing "default" verdict.
 * @param {*} [config.metadata] - Returned untouched.
 * @param {boolean} [config.shouldError=false] - Reject with `errorMessage`.
 * @param {string} [config.errorMessage="Mock judge evaluation failed"]
 * @param {Function} [config.evaluateFn] - When given, it replaces the default
 *   evaluation entirely.
 * @returns {{evaluate: Function}}
 */
function createMockJudge(config = {}) {
  const {
    score = 80,
    passed = true,
    verdicts = [{ criterionId: "default", score: 80, reasoning: "Default verdict", passed: true }],
    metadata,
    shouldError = false,
    errorMessage = "Mock judge evaluation failed",
    evaluateFn
  } = config;

  const evaluateDefault = () => {
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { verdicts, overallScore: score, passed, metadata };
  };

  return {
    evaluate: async (context) => (evaluateFn ? evaluateFn(context) : evaluateDefault())
  };
}
|
|
2624
|
+
/**
 * Creates an in-memory improver for tests.
 *
 * @param {object} [config]
 * @param {Array<object>} [config.suggestions=[]] - Returned as-is by `improve`.
 * @param {boolean} [config.shouldError=false] - Reject with `errorMessage`.
 * @param {string} [config.errorMessage="Mock improver failed"]
 * @param {Function} [config.improveFn] - When given, it replaces the default
 *   behaviour entirely.
 * @returns {{improve: Function}}
 */
function createMockImprover(config = {}) {
  const {
    suggestions = [],
    shouldError = false,
    errorMessage = "Mock improver failed",
    improveFn
  } = config;

  const improveDefault = () => {
    if (shouldError) {
      throw new Error(errorMessage);
    }
    return { suggestions };
  };

  return {
    improve: async (agentPrompt, results) =>
      improveFn ? improveFn(agentPrompt, results) : improveDefault()
  };
}
|
|
2643
|
+
|
|
2644
|
+
// src/index.ts
|
|
2645
|
+
import {
|
|
2646
|
+
compileTemplate as compileTemplate3,
|
|
2647
|
+
createFilePromptRepository
|
|
2648
|
+
} from "@agtlantis/core";
|
|
2649
|
+
import {
|
|
2650
|
+
calculateCostFromUsage as calculateCostFromUsage3,
|
|
2651
|
+
OPENAI_PRICING,
|
|
2652
|
+
GOOGLE_PRICING,
|
|
2653
|
+
ANTHROPIC_PRICING,
|
|
2654
|
+
DEFAULT_PRICING_CONFIG
|
|
2655
|
+
} from "@agtlantis/core";
|
|
2656
|
+
|
|
2657
|
+
// src/cli/config/types.ts
|
|
2658
|
+
/**
 * Identity helper for authoring a config object: the argument is returned
 * unchanged, with no validation or copying.
 *
 * @param {object} config
 * @returns {object} The very same object.
 */
function defineConfig(config) { return config; }
|
|
2661
|
+
|
|
2662
|
+
// src/cli/config/loader.ts
|
|
2663
|
+
import { existsSync } from "fs";
|
|
2664
|
+
import { resolve, extname } from "path";
|
|
2665
|
+
import { pathToFileURL } from "url";
|
|
2666
|
+
import { bundleRequire } from "bundle-require";
|
|
2667
|
+
import fg from "fast-glob";
|
|
2668
|
+
|
|
2669
|
+
// src/cli/config/schema.ts
|
|
2670
|
+
import { z as z3 } from "zod";
|
|
2671
|
+
// LLM connection settings. Only `provider` is required.
var llmConfigSchema = z3.object({
  provider: z3.enum(["openai", "gemini"], {
    errorMap: () => ({ message: "provider must be 'openai' or 'gemini'" })
  }),
  apiKey: z3.string().optional(),
  defaultModel: z3.string().optional(),
  reasoningEffort: z3.enum(["minimal", "low", "medium", "high"]).optional(),
  defaultResponseFormat: z3
    .object({ type: z3.enum(["json_object", "text"]) })
    .optional()
});

// One judging criterion: three non-empty strings plus optional weight/validator.
var criterionSchema = z3.object({
  id: z3.string().min(1, "Criterion id is required"),
  name: z3.string().min(1, "Criterion name is required"),
  description: z3.string().min(1, "Criterion description is required"),
  weight: z3.number().positive().optional(),
  validator: z3.function().optional()
});

// Judge settings; at least one criterion must be listed.
var judgeConfigSchema = z3.object({
  llm: llmConfigSchema.optional(),
  criteria: z3.array(criterionSchema).min(1, "At least one criterion is required"),
  passThreshold: z3.number().min(0).max(100).optional(),
  prompt: z3.any().optional()
});

// Improver settings; the whole section may be omitted.
var improverConfigSchema = z3
  .object({
    llm: llmConfigSchema.optional(),
    prompt: z3.any().optional()
  })
  .optional();

// Report output settings; the whole section may be omitted.
var outputConfigSchema = z3
  .object({
    dir: z3.string().optional(),
    filename: z3.string().optional(),
    verbose: z3.boolean().optional()
  })
  .optional();

// Run settings; the numeric knobs must be positive integers when present.
var runConfigSchema = z3
  .object({
    concurrency: z3.number().int().positive().optional(),
    iterations: z3.number().int().positive().optional(),
    stopOnFirstFailure: z3.boolean().optional()
  })
  .optional();
|
|
2711
|
+
// Termination condition: stop after `count` turns.
var maxTurnsConditionSchema = z3.object({
  type: z3.literal("maxTurns"),
  count: z3.number().int().positive()
});

// Termination condition keyed on a non-empty `fieldPath`.
var fieldSetConditionSchema = z3.object({
  type: z3.literal("fieldSet"),
  fieldPath: z3.string().min(1)
});

// Termination condition pairing a `fieldPath` with an `expectedValue`.
var fieldValueConditionSchema = z3.object({
  type: z3.literal("fieldValue"),
  fieldPath: z3.string().min(1),
  expectedValue: z3.unknown()
});

// Termination condition backed by a user-supplied `check` function.
var customConditionSchema = z3.object({
  type: z3.literal("custom"),
  check: z3.function(),
  description: z3.string().optional()
});

// Any one of the four condition shapes above.
var terminationConditionSchema = z3.union([
  maxTurnsConditionSchema,
  fieldSetConditionSchema,
  fieldValueConditionSchema,
  customConditionSchema
]);

// A follow-up input used by a multi-turn test case.
var followUpInputSchema = z3.object({
  input: z3.unknown(),
  description: z3.string().optional(),
  turns: z3.number().optional()
});

// Multi-turn settings; `terminateWhen` needs at least one condition.
var multiTurnConfigSchema = z3.object({
  followUpInputs: z3.array(followUpInputSchema).optional(),
  terminateWhen: z3
    .array(terminationConditionSchema)
    .min(1, "At least one termination condition is required"),
  maxTurns: z3.number().int().positive().optional(),
  onConditionMet: z3.enum(["pass", "fail"]).optional(),
  onMaxTurnsReached: z3.enum(["pass", "fail"]).optional()
});
|
|
2747
|
+
// A single test case. Every key is optional in practice because `input` is
// typed as unknown.
var testCaseSchema = z3.object({
  id: z3.string().optional(),
  input: z3.unknown(),
  tags: z3.array(z3.string()).optional(),
  description: z3.string().optional(),
  expectedOutput: z3.unknown().optional(),
  files: z3.array(z3.any()).optional(),
  multiTurn: multiTurnConfigSchema.optional()
});

// Structural check for an agent: config, prompt and an `execute` function.
var agentSchema = z3.object({
  config: z3.object({
    name: z3.string(),
    description: z3.string().optional()
  }),
  prompt: z3.object({
    id: z3.string(),
    version: z3.string(),
    system: z3.string(),
    renderUserPrompt: z3.function()
  }),
  execute: z3.function()
});

// Top-level eval config. The refinement rejects configs that provide neither
// inline `testCases` nor `include` patterns.
var evalConfigSchema = z3
  .object({
    name: z3.string().optional(),
    agentDescription: z3.string().optional(),
    agent: agentSchema,
    llm: llmConfigSchema,
    judge: judgeConfigSchema,
    improver: improverConfigSchema,
    testCases: z3.array(testCaseSchema).optional(),
    output: outputConfigSchema,
    run: runConfigSchema,
    include: z3
      .array(z3.string().min(1, "Include pattern cannot be empty"))
      .min(1, "Include array must have at least one pattern")
      .optional(),
    agents: z3.record(z3.string(), agentSchema).optional()
  })
  .refine(
    (data) => {
      const inlineCount = data.testCases?.length ?? 0;
      const patternCount = data.include?.length ?? 0;
      return inlineCount > 0 || patternCount > 0;
    },
    {
      message: "Either testCases or include must be provided. Use testCases for inline TypeScript tests, or include for YAML file discovery.",
      path: ["testCases"]
    }
  );
|
|
2792
|
+
|
|
2793
|
+
// src/cli/config/loader.ts
|
|
2794
|
+
/**
 * Error raised for configuration problems. On top of the message it carries a
 * machine-readable `code` and an optional `context` value.
 */
var ConfigError = class extends Error {
  /**
   * @param {string} message
   * @param {string} code
   * @param {*} [context]
   */
  constructor(message, code, context) {
    super(message);
    Object.assign(this, { code, context, name: "ConfigError" });
  }
};
|
|
2802
|
+
/**
 * Expands include patterns into a sorted list of absolute file paths.
 *
 * @param {{include?: string[]}} config - Source of patterns unless overridden.
 * @param {object} [options]
 * @param {string[]} [options.include] - Overrides `config.include`.
 * @param {string} [options.cwd] - Base directory; defaults to `process.cwd()`.
 * @param {string[]} [options.ignore] - Defaults to `["**\/node_modules/**"]`.
 * @returns {Promise<string[]>} Matching files in sorted order.
 * @throws {ConfigError} CONFIG_NO_INCLUDE_PATTERN when no pattern is available.
 */
async function discoverEvalFiles(config, options = {}) {
  const patterns = options.include ?? config.include;
  const hasPatterns = Boolean(patterns) && patterns.length !== 0;
  if (!hasPatterns) {
    throw new ConfigError(
      `No include patterns specified.

Add an include field to your config:
include: ['evals/**/*.eval.yaml']

Or use the --include CLI option:
npx agent-eval --include "evals/**/*.eval.yaml"`,
      "CONFIG_NO_INCLUDE_PATTERN"
    );
  }

  const globOptions = {
    absolute: true,
    cwd: options.cwd ?? process.cwd(),
    ignore: options.ignore ?? ["**/node_modules/**"],
    onlyFiles: true,
    dot: false,
    followSymbolicLinks: false,
    unique: true,
    suppressErrors: false
  };
  const matches = await fg(patterns, globOptions);
  matches.sort();
  return matches;
}
|
|
2830
|
+
|
|
2831
|
+
// src/improvement-cycle/types.ts
|
|
2832
|
+
/** True when the cycle condition is tagged "targetScore". */
function isTargetScoreCondition(condition) {
  const { type } = condition;
  return type === "targetScore";
}

/** True when the cycle condition is tagged "maxRounds". */
function isMaxRoundsCondition(condition) {
  const { type } = condition;
  return type === "maxRounds";
}

/** True when the cycle condition is tagged "noImprovement". */
function isNoImprovementCondition(condition) {
  const { type } = condition;
  return type === "noImprovement";
}

/** True when the cycle condition is tagged "maxCost". */
function isMaxCostCondition(condition) {
  const { type } = condition;
  return type === "maxCost";
}

/** True when the cycle condition is tagged "custom". */
function isCustomCycleCondition(condition) {
  const { type } = condition;
  return type === "custom";
}

/** True only when `result.terminated` is exactly `true` (truthy values do not count). */
function isCycleTerminated(result) {
  const { terminated } = result;
  return terminated === true;
}
|
|
2850
|
+
|
|
2851
|
+
// src/improvement-cycle/conditions.ts
|
|
2852
|
+
/**
 * Builds a "targetScore" cycle condition.
 *
 * @param {number} threshold - Finite number between 0 and 100 inclusive.
 * @returns {{type: "targetScore", threshold: number}}
 * @throws {EvalError} INVALID_CONFIG when the threshold is not finite or is out of range.
 */
function targetScore(threshold) {
  const invalid = (message) =>
    new EvalError(message, {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { threshold }
    });

  if (!Number.isFinite(threshold)) {
    throw invalid("threshold must be a finite number");
  }
  const inRange = threshold >= 0 && threshold <= 100;
  if (!inRange) {
    throw invalid("threshold must be between 0 and 100");
  }
  return { type: "targetScore", threshold };
}
|
|
2867
|
+
/**
 * Builds a "maxRounds" cycle condition.
 *
 * @param {number} count - Positive integer.
 * @returns {{type: "maxRounds", count: number}}
 * @throws {EvalError} INVALID_CONFIG when `count` is not a positive integer.
 */
function maxRounds(count) {
  const isPositiveInteger = Number.isInteger(count) && count >= 1;
  if (isPositiveInteger) {
    return { type: "maxRounds", count };
  }
  throw new EvalError("count must be a positive integer", {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { count }
  });
}
|
|
2876
|
+
/**
 * Builds a "noImprovement" cycle condition.
 *
 * @param {number} consecutiveRounds - Positive integer.
 * @param {number} [minDelta] - Non-negative finite number; the key is left off
 *   the result when the argument is undefined.
 * @returns {{type: "noImprovement", consecutiveRounds: number, minDelta?: number}}
 * @throws {EvalError} INVALID_CONFIG when either argument is invalid.
 */
function noImprovement(consecutiveRounds, minDelta) {
  const roundsValid = Number.isInteger(consecutiveRounds) && consecutiveRounds >= 1;
  if (!roundsValid) {
    throw new EvalError("consecutiveRounds must be a positive integer", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { consecutiveRounds }
    });
  }

  const hasMinDelta = minDelta !== void 0;
  if (hasMinDelta && (!Number.isFinite(minDelta) || minDelta < 0)) {
    throw new EvalError("minDelta must be a non-negative finite number", {
      code: "INVALID_CONFIG" /* INVALID_CONFIG */,
      context: { minDelta }
    });
  }

  const condition = { type: "noImprovement", consecutiveRounds };
  if (hasMinDelta) {
    condition.minDelta = minDelta;
  }
  return condition;
}
|
|
2895
|
+
/**
 * Builds a "maxCost" cycle condition.
 *
 * @param {number} maxUSD - Positive finite number.
 * @returns {{type: "maxCost", maxUSD: number}}
 * @throws {EvalError} INVALID_CONFIG when `maxUSD` is not finite or not above zero.
 */
function maxCost(maxUSD) {
  const isValid = Number.isFinite(maxUSD) && !(maxUSD <= 0);
  if (isValid) {
    return { type: "maxCost", maxUSD };
  }
  throw new EvalError("maxUSD must be a positive finite number", {
    code: "INVALID_CONFIG" /* INVALID_CONFIG */,
    context: { maxUSD }
  });
}
|
|
2904
|
+
/**
 * Wraps a user-supplied check into a "custom" cycle condition.
 *
 * @param {Function} check - Stored as-is on the condition.
 * @param {string} [description] - Included only when not undefined.
 * @returns {{type: "custom", check: Function, description?: string}}
 */
function customCondition(check, description) {
  const condition = { type: "custom", check };
  if (description !== void 0) {
    condition.description = description;
  }
  return condition;
}
|
|
2911
|
+
/**
 * Combines cycle conditions into one "custom" condition via `createAndCheck`.
 * With no arguments the resulting check always returns false.
 *
 * @param {...object} conditions
 * @returns {{type: "custom", check: Function, description: string}}
 */
function and2(...conditions) {
  const describe = (items) => formatCompositeDescription("and", items);
  if (conditions.length > 0) {
    return {
      type: "custom",
      check: createAndCheck(conditions, checkCycleCondition),
      description: describe(conditions)
    };
  }
  return {
    type: "custom",
    check: () => false,
    description: describe([])
  };
}
|
|
2925
|
+
/**
 * Combines cycle conditions into one "custom" condition via `createOrCheck`.
 * With no arguments the resulting check always returns false.
 *
 * @param {...object} conditions
 * @returns {{type: "custom", check: Function, description: string}}
 */
function or2(...conditions) {
  const describe = (items) => formatCompositeDescription("or", items);
  if (conditions.length > 0) {
    return {
      type: "custom",
      check: createOrCheck(conditions, checkCycleCondition),
      description: describe(conditions)
    };
  }
  return {
    type: "custom",
    check: () => false,
    description: describe([])
  };
}
|
|
2939
|
+
/**
 * Wraps a cycle condition into a "custom" condition built with
 * `createNotCheck`; the description is `not(<type>)`.
 *
 * @param {object} condition
 * @returns {{type: "custom", check: Function, description: string}}
 */
function not2(condition) {
  const check = createNotCheck(condition, checkCycleCondition);
  const description = `not(${condition.type})`;
  return { type: "custom", check, description };
}
|
|
2946
|
+
/**
 * Evaluates a "targetScore" condition against the latest score.
 *
 * @param {{threshold: number}} condition
 * @param {{latestScore: number}} ctx
 * @returns {object} `{terminated: true, matchedCondition, reason}` once the
 *   score is at or above the threshold, otherwise `{terminated: false, reason}`.
 */
function checkTargetScore(condition, ctx) {
  const reached = ctx.latestScore >= condition.threshold;
  if (!reached) {
    return {
      terminated: false,
      reason: `Score ${ctx.latestScore} below target ${condition.threshold}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Target score ${condition.threshold} reached (current: ${ctx.latestScore})`
  };
}
|
|
2959
|
+
/**
 * Evaluates a "maxRounds" condition against the current round number.
 *
 * @param {{count: number}} condition
 * @param {{currentRound: number}} ctx
 * @returns {object} Terminated once `currentRound` is at or above `count`.
 */
function checkMaxRounds(condition, ctx) {
  const exhausted = ctx.currentRound >= condition.count;
  if (!exhausted) {
    return {
      terminated: false,
      reason: `Round ${ctx.currentRound} of ${condition.count}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Maximum rounds reached (${condition.count})`
  };
}
|
|
2972
|
+
/**
 * Evaluates a "noImprovement" condition by counting how many of the most
 * recent rounds had a `scoreDelta` at or below `minDelta`.
 *
 * Counting walks the history backwards and stops at the first round whose
 * delta is `null` or is above `minDelta` (which defaults to 0).
 *
 * @param {{consecutiveRounds: number, minDelta?: number}} condition
 * @param {{history: Array<{scoreDelta: number|null}>}} ctx
 * @returns {object} Terminated once the trailing streak reaches `consecutiveRounds`.
 */
function checkNoImprovement(condition, ctx) {
  const required = condition.consecutiveRounds;
  const tolerance = condition.minDelta === void 0 ? 0 : condition.minDelta;
  const rounds = ctx.history;

  let cursor = rounds.length - 1;
  while (
    cursor >= 0 &&
    rounds[cursor].scoreDelta !== null &&
    rounds[cursor].scoreDelta <= tolerance
  ) {
    cursor--;
  }
  const streak = rounds.length - 1 - cursor;

  if (streak >= required) {
    const plural = streak === 1 ? "" : "s";
    return {
      terminated: true,
      matchedCondition: condition,
      reason: `No improvement for ${streak} consecutive round${plural}`
    };
  }
  const noun = streak === 1 ? "round" : "rounds";
  return {
    terminated: false,
    reason: `${streak} ${noun} without improvement (need ${required})`
  };
}
|
|
2998
|
+
/**
 * Evaluates a "maxCost" condition against the accumulated cost.
 *
 * @param {{maxUSD: number}} condition
 * @param {{totalCost: number}} ctx
 * @returns {object} Terminated once `totalCost` is at or above `maxUSD`;
 *   both amounts are printed with two decimals in the reason.
 */
function checkMaxCost(condition, ctx) {
  const overBudget = ctx.totalCost >= condition.maxUSD;
  const spent = ctx.totalCost.toFixed(2);
  const limit = condition.maxUSD.toFixed(2);
  if (!overBudget) {
    return {
      terminated: false,
      reason: `Cost $${spent} under limit $${limit}`
    };
  }
  return {
    terminated: true,
    matchedCondition: condition,
    reason: `Cost limit exceeded ($${spent} >= $${limit})`
  };
}
|
|
3011
|
+
/**
 * Runs a custom condition's `check` and turns its outcome into a result object.
 *
 * A check that throws or rejects does not terminate the cycle; the failure
 * message is reported in `reason` instead.
 *
 * @param {{check: Function, description?: string}} condition
 * @param {object} ctx - Passed straight to `check`.
 * @returns {Promise<object>}
 */
async function checkCustomCondition(condition, ctx) {
  const label = condition.description ?? "Custom condition";

  let met;
  try {
    met = await condition.check(ctx);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    return {
      terminated: false,
      reason: `${label} check failed: ${detail}`
    };
  }

  return met
    ? { terminated: true, matchedCondition: condition, reason: `${label} met` }
    : { terminated: false, reason: `${label} not met` };
}
|
|
3034
|
+
/**
 * Dispatches a cycle condition to the checker that matches its type.
 *
 * @param {object} condition
 * @param {object} context - Forwarded to the selected checker.
 * @returns {Promise<object>} The checker's result.
 * @throws {EvalError} UNKNOWN_ERROR when no type guard recognises the condition.
 */
async function checkCycleCondition(condition, context) {
  // Guards are tried in this order; the first match wins.
  const dispatch = [
    [isTargetScoreCondition, checkTargetScore],
    [isMaxRoundsCondition, checkMaxRounds],
    [isNoImprovementCondition, checkNoImprovement],
    [isMaxCostCondition, checkMaxCost],
    [isCustomCycleCondition, checkCustomCondition]
  ];
  for (const [matches, handle] of dispatch) {
    if (matches(condition)) {
      return handle(condition, context);
    }
  }
  throw new EvalError(`Unknown condition type: ${JSON.stringify(condition)}`, {
    code: "UNKNOWN_ERROR" /* UNKNOWN_ERROR */,
    context: { condition }
  });
}
|
|
3056
|
+
/**
 * Checks conditions one after another and returns the first terminating result.
 *
 * @param {Array<object>} conditions
 * @param {object} context - Forwarded to `checkCycleCondition`.
 * @returns {Promise<object>} The first result with `terminated: true`, or a
 *   non-terminated result whose reason says whether the list was empty.
 */
async function checkCycleTermination(conditions, context) {
  // Stays on the "specified" wording only when the loop body never runs.
  let reason = "No termination conditions specified";
  for (const condition of conditions) {
    const outcome = await checkCycleCondition(condition, context);
    if (outcome.terminated) {
      return outcome;
    }
    reason = "No termination conditions met";
  }
  return { terminated: false, reason };
}
|
|
3074
|
+
|
|
3075
|
+
// src/improvement-cycle/runner.ts
|
|
3076
|
+
import { calculateCostFromUsage as calculateCostFromUsage2 } from "@agtlantis/core";
|
|
3077
|
+
|
|
3078
|
+
// src/improvement-cycle/history.ts
|
|
3079
|
+
import crypto from "crypto";
|
|
3080
|
+
import { existsSync as existsSync2 } from "fs";
|
|
3081
|
+
import { mkdir, readFile, writeFile as writeFile2 } from "fs/promises";
|
|
3082
|
+
import { dirname } from "path";
|
|
3083
|
+
import { compileTemplate as compileTemplate2 } from "@agtlantis/core";
|
|
3084
|
+
// File-system backed storage used by default for history files. Reads and
// writes go through fs/promises with UTF-8 encoding; `exists` is existsSync.
var defaultHistoryStorage = {
  readFile: (filePath) => readFile(filePath, "utf-8"),
  writeFile: (filePath, text) => writeFile2(filePath, text, "utf-8"),
  exists: existsSync2,
  mkdir: (dirPath, mkdirOptions) => mkdir(dirPath, mkdirOptions)
};
|
|
3090
|
+
/**
 * Tells whether a prompt carries a string `userTemplate`.
 *
 * @param {object} prompt
 * @returns {boolean}
 */
function hasUserTemplate(prompt) {
  const { userTemplate } = prompt;
  return typeof userTemplate === "string";
}
|
|
3093
|
+
/**
 * Converts a prompt into a plain, function-free object.
 *
 * `renderUserPrompt` is dropped. Any keys beyond the core ones are collected
 * under `customFields`, which is omitted when there are none.
 *
 * @param {object} prompt - Must have a string `userTemplate`.
 * @returns {{id: *, version: *, system: *, userTemplate: string, customFields?: object}}
 * @throws {EvalError} PROMPT_INVALID_FORMAT when `userTemplate` is not a string.
 */
function serializePrompt(prompt) {
  if (!hasUserTemplate(prompt)) {
    throw new EvalError("Cannot serialize prompt: userTemplate field is required", {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context: { promptId: prompt.id }
    });
  }

  const { id, version, system, userTemplate, renderUserPrompt, ...extras } = prompt;
  const serialized = { id, version, system, userTemplate };
  if (Object.keys(extras).length > 0) {
    serialized.customFields = extras;
  }
  return serialized;
}
|
|
3111
|
+
/**
 * Verifies that a rebuilt prompt has string `id`, `version`, `system` and
 * `userTemplate` fields plus a `renderUserPrompt` function.
 *
 * @param {object} obj - Prompt object to inspect.
 * @param {*} promptId - Recorded in the error context.
 * @returns {void}
 * @throws {EvalError} PROMPT_INVALID_FORMAT for the first check that fails;
 *   string fields are checked in the order listed above, the function last.
 */
function validateDeserializedPrompt(obj, promptId) {
  const invalid = (message, context) =>
    new EvalError(message, {
      code: "PROMPT_INVALID_FORMAT" /* PROMPT_INVALID_FORMAT */,
      context
    });

  const offending = ["id", "version", "system", "userTemplate"].find(
    (field) => typeof obj[field] !== "string"
  );
  if (offending !== undefined) {
    throw invalid(`Invalid deserialized prompt: ${offending} must be a string`, {
      promptId,
      field: offending,
      actual: typeof obj[offending]
    });
  }

  if (typeof obj.renderUserPrompt !== "function") {
    throw invalid("Invalid deserialized prompt: renderUserPrompt must be a function", {
      promptId,
      actual: typeof obj.renderUserPrompt
    });
  }
}
|
|
3128
|
+
/**
 * Rebuilds a usable prompt from its serialized form.
 *
 * `customFields` are spread back onto the result first, so the core fields
 * (`id`, `version`, `system`, `userTemplate`, `renderUserPrompt`) always win
 * over a custom field of the same name. `renderUserPrompt` is produced by
 * compiling `userTemplate` with `compileTemplate2`.
 *
 * @param serialized - Output of `serializePrompt`.
 * @returns The reconstructed prompt object.
 * @throws EvalError (TEMPLATE_COMPILE_ERROR) when the template fails to compile.
 * @throws EvalError (PROMPT_INVALID_FORMAT) when the result fails validation.
 */
function deserializePrompt(serialized) {
  const { id, version, system, userTemplate } = serialized;
  const restored = { ...serialized.customFields, id, version, system, userTemplate };
  try {
    restored.renderUserPrompt = compileTemplate2(userTemplate, id);
  } catch (cause) {
    const detail = cause instanceof Error ? cause.message : String(cause);
    throw new EvalError(`Failed to compile userTemplate: ${detail}`, {
      code: "TEMPLATE_COMPILE_ERROR" /* TEMPLATE_COMPILE_ERROR */,
      context: { promptId: id, userTemplate }
    });
  }
  validateDeserializedPrompt(restored, id);
  return restored;
}
|
|
3151
|
+
/**
 * Flattens a round result into the record kept in session history.
 *
 * Only the summary numbers of the report are retained (`avgScore`, `passed`,
 * `failed`, `totalTests`), and `completedAt` is converted to an ISO string.
 *
 * @param result - Round result with a `report.summary` and a Date `completedAt`.
 * @returns The serialized round record.
 */
function serializeRoundResult(result) {
  const { avgScore, passed, failed, totalTests } = result.report.summary;
  const {
    round,
    completedAt,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  } = result;
  return {
    round,
    completedAt: completedAt.toISOString(),
    avgScore,
    passed,
    failed,
    totalTests,
    suggestionsGenerated,
    suggestionsApproved,
    promptSnapshot,
    promptVersionAfter,
    cost,
    scoreDelta
  };
}
|
|
3168
|
+
/**
 * Validates the parsed contents of a history file.
 *
 * Checks, in order: the value is a non-null object, `schemaVersion` is
 * exactly "1.1.0", every required top-level field is present, `rounds` is an
 * array and `totalCost` is a number. The two type checks exist because the
 * session code spreads/maps `rounds` and adds to `totalCost`; a malformed
 * file would otherwise fail much later with an unrelated error (or silently
 * concatenate strings).
 *
 * @param data - Value produced by `JSON.parse`.
 * @throws EvalError (SCHEMA_VALIDATION_ERROR) on the first failed check.
 */
function validateHistorySchema(data) {
  if (typeof data !== "object" || data === null) {
    throw new EvalError("Invalid history: not an object", {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */
    });
  }
  const h = data;
  if (h.schemaVersion !== "1.1.0") {
    throw new EvalError(`Unsupported schema version: ${String(h.schemaVersion)}`, {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { schemaVersion: h.schemaVersion }
    });
  }
  const requiredFields = [
    "sessionId",
    "startedAt",
    "initialPrompt",
    "currentPrompt",
    "rounds",
    "totalCost"
  ];
  for (const field of requiredFields) {
    if (!(field in h)) {
      throw new EvalError(`Invalid history: missing field "${field}"`, {
        code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
        context: { missingField: field }
      });
    }
  }
  if (!Array.isArray(h.rounds)) {
    throw new EvalError('Invalid history: field "rounds" must be an array', {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { field: "rounds", actual: typeof h.rounds }
    });
  }
  if (typeof h.totalCost !== "number") {
    throw new EvalError('Invalid history: field "totalCost" must be a number', {
      code: "SCHEMA_VALIDATION_ERROR" /* SCHEMA_VALIDATION_ERROR */,
      context: { field: "totalCost", actual: typeof h.totalCost }
    });
  }
}
|
|
3198
|
+
/**
 * In-memory improvement session backed by an optional history file.
 *
 * The history object is treated as immutable: every change replaces
 * `_history` with a new object. Saves are serialized through `_savePromise`
 * so writes never overlap.
 */
var ImprovementSessionImpl = class {
  /** Current history snapshot. */
  _history;
  /** Re-entrancy guard for `addRound`. */
  _isUpdating = false;
  /** Tail of the save queue; `flush()` waits on it. */
  _savePromise = Promise.resolve();
  /** Effective configuration (`path`, `autoSave`, `storage`, `onAutoSaveError`, ...). */
  config;
  /**
   * @param history - Initial history object.
   * @param config - Optional settings; `autoSave` defaults to `false`.
   */
  constructor(history, config = {}) {
    this._history = history;
    // The default is applied after the spread so that an explicit
    // `autoSave: undefined` cannot overwrite it.
    this.config = {
      ...config,
      autoSave: config.autoSave ?? false
    };
  }
  /** Identifier of the session, read from the history. */
  get sessionId() {
    return this._history.sessionId;
  }
  /** Current history snapshot. */
  get history() {
    return this._history;
  }
  /** Whether a file path is configured, i.e. whether `save()` can work. */
  get canSave() {
    return this.config.path !== void 0;
  }
  /**
   * Appends a round to the history, replaces the current prompt and adds the
   * round's `cost.total` to the running total. Triggers a background save
   * when auto-save is enabled and a path is configured.
   *
   * @param roundResult - Result of the finished round.
   * @param updatedPrompt - Serialized prompt to store as `currentPrompt`.
   * @throws EvalError (CONCURRENT_MODIFICATION) if called while an update is in progress.
   * @throws EvalError (INVALID_CONFIG) if the session is already completed.
   */
  addRound(roundResult, updatedPrompt) {
    if (this._isUpdating) {
      throw new EvalError("Session is being updated", {
        code: "CONCURRENT_MODIFICATION" /* CONCURRENT_MODIFICATION */,
        context: { sessionId: this.sessionId }
      });
    }
    if (this._history.completedAt) {
      throw new EvalError("Cannot add round to completed session", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    this._isUpdating = true;
    try {
      const serializedRound = serializeRoundResult(roundResult);
      this._history = {
        ...this._history,
        currentPrompt: updatedPrompt,
        rounds: [...this._history.rounds, serializedRound],
        totalCost: this._history.totalCost + roundResult.cost.total
      };
      if (this.config.autoSave && this.canSave) {
        // Fire-and-forget; failures are routed to handleAutoSaveError.
        this.save().catch((err) => this.handleAutoSaveError(err));
      }
    } finally {
      this._isUpdating = false;
    }
  }
  /**
   * Marks the session as completed with the current time and the given
   * reason, then triggers a background save when auto-save is enabled.
   *
   * @param terminationReason - Reason stored in the history.
   */
  complete(terminationReason) {
    this._history = {
      ...this._history,
      completedAt: (/* @__PURE__ */ new Date()).toISOString(),
      terminationReason
    };
    if (this.config.autoSave && this.canSave) {
      this.save().catch((err) => this.handleAutoSaveError(err));
    }
  }
  /**
   * Reports a failed background save to `config.onAutoSaveError` when set,
   * otherwise to `console.error`. Non-Error values are wrapped in an Error.
   */
  handleAutoSaveError(error) {
    const err = error instanceof Error ? error : new Error(String(error));
    if (this.config.onAutoSaveError) {
      this.config.onAutoSaveError(err);
    } else {
      console.error("Auto-save failed:", err);
    }
  }
  /**
   * Queues a write of the history to `config.path`. The history is read when
   * the queued write actually runs, so it persists the latest snapshot.
   *
   * @returns Promise that settles with the outcome of this write.
   * @throws EvalError (INVALID_CONFIG) when no path is configured.
   */
  async save() {
    if (!this.config.path) {
      throw new EvalError("Cannot save: no path configured", {
        code: "INVALID_CONFIG" /* INVALID_CONFIG */,
        context: { sessionId: this.sessionId }
      });
    }
    const writeSnapshot = async () => {
      await saveHistory(this._history, this.config.path, this.config.storage);
    };
    // Run after the previous write settles, whether it succeeded or failed.
    // Chaining on fulfilment only would leave the queue rejected after one
    // failure, and every later save would reject without touching storage.
    const pending = this._savePromise.then(writeSnapshot, writeSnapshot);
    this._savePromise = pending;
    return pending;
  }
  /** Waits for the most recently queued save (if any) to settle. */
  async flush() {
    return this._savePromise;
  }
};
|
|
3282
|
+
/**
 * Starts a new improvement session for the given prompt.
 *
 * The prompt is serialized once and stored as both the initial and the
 * current prompt; the history begins with no rounds and zero cost, a random
 * UUID as session id and the current time as `startedAt`.
 *
 * @param initialPrompt - Prompt the session starts from.
 * @param config - Optional session configuration, passed through unchanged.
 * @returns A new `ImprovementSessionImpl`.
 */
function createSession(initialPrompt, config) {
  const promptSnapshot = serializePrompt(initialPrompt);
  return new ImprovementSessionImpl(
    {
      schemaVersion: "1.1.0",
      sessionId: crypto.randomUUID(),
      startedAt: (/* @__PURE__ */ new Date()).toISOString(),
      initialPrompt: promptSnapshot,
      currentPrompt: promptSnapshot,
      rounds: [],
      totalCost: 0
    },
    config
  );
}
|
|
3295
|
+
/**
 * Loads a saved history and reopens it as a live session.
 *
 * `completedAt` and `terminationReason` are cleared so that new rounds can
 * be added, and the session is configured to save back to the same path.
 *
 * @param path3 - Location of the history file.
 * @param config - Optional session configuration; `config.storage` is also used for loading.
 * @returns A session wrapping the reopened history.
 */
async function resumeSession(path3, config) {
  const stored = await loadHistory(path3, config?.storage);
  const sessionConfig = { ...config, path: path3 };
  const reopened = { ...stored, completedAt: void 0, terminationReason: void 0 };
  return new ImprovementSessionImpl(reopened, sessionConfig);
}
|
|
3304
|
+
/**
 * Writes a history object to `path3` as pretty-printed JSON (2-space indent).
 *
 * The parent directory is created recursively first when it is something
 * other than "", "." or "/" and the storage reports it as missing.
 *
 * @param history - History object to persist.
 * @param path3 - Destination file path.
 * @param storage - Storage backend; defaults to `defaultHistoryStorage`.
 * @throws EvalError - EvalErrors are rethrown as-is; any other failure is
 *   converted via `EvalError.from` with code FILE_WRITE_ERROR.
 */
async function saveHistory(history, path3, storage = defaultHistoryStorage) {
  try {
    const parentDir = dirname(path3);
    const needsDirCheck = Boolean(parentDir) && parentDir !== "." && parentDir !== "/";
    if (needsDirCheck && !storage.exists(parentDir)) {
      await storage.mkdir(parentDir, { recursive: true });
    }
    const json = JSON.stringify(history, null, 2);
    await storage.writeFile(path3, json);
  } catch (cause) {
    if (!(cause instanceof EvalError)) {
      throw EvalError.from(cause, "FILE_WRITE_ERROR" /* FILE_WRITE_ERROR */, { path: path3 });
    }
    throw cause;
  }
}
|
|
3316
|
+
/**
 * Reads, parses and validates a history file.
 *
 * @param path3 - Location of the history file.
 * @param storage - Storage backend; defaults to `defaultHistoryStorage`.
 * @returns The parsed history object.
 * @throws EvalError (FILE_READ_ERROR) when the file does not exist.
 * @throws EvalError - EvalErrors from validation are rethrown as-is; any
 *   other failure (read error, invalid JSON) is converted via
 *   `EvalError.from` with code FILE_READ_ERROR.
 */
async function loadHistory(path3, storage = defaultHistoryStorage) {
  try {
    const isPresent = storage.exists(path3);
    if (!isPresent) {
      throw new EvalError(`History file not found: ${path3}`, {
        code: "FILE_READ_ERROR" /* FILE_READ_ERROR */,
        context: { path: path3 }
      });
    }
    const raw = await storage.readFile(path3);
    const parsed = JSON.parse(raw);
    validateHistorySchema(parsed);
    return parsed;
  } catch (cause) {
    if (cause instanceof EvalError) {
      throw cause;
    }
    throw EvalError.from(cause, "FILE_READ_ERROR" /* FILE_READ_ERROR */, { path: path3 });
  }
}
|
|
3333
|
+
|
|
3334
|
+
// src/improvement-cycle/runner.ts
|
|
3335
|
+
/**
 * Builds the mutable state object that drives an improvement cycle.
 *
 * When an existing session is supplied, the round counter, score list and
 * running cost are seeded from its history; otherwise they start empty.
 * `completedRounds` always starts empty.
 *
 * @param initialPrompt - Prompt used for the first round of this run.
 * @param existingSession - Optional session being continued.
 * @returns `{ currentPrompt, currentRound, previousScores, totalCost, completedRounds }`.
 */
function initializeCycleState(initialPrompt, existingSession) {
  const priorRounds = existingSession ? existingSession.history.rounds : [];
  const priorCost = existingSession ? existingSession.history.totalCost : 0;
  return {
    currentPrompt: initialPrompt,
    currentRound: priorRounds.length,
    previousScores: priorRounds.map((entry) => entry.avgScore),
    totalCost: priorCost,
    completedRounds: []
  };
}
|
|
3345
|
+
/**
 * Computes the change in score relative to the most recent previous round.
 *
 * @param currentScore - Score of the round that just finished.
 * @param previousScores - Scores of earlier rounds, oldest first.
 * @returns `currentScore` minus the last previous score, or `null` when
 *   there is no previous score.
 */
function calculateScoreDelta(currentScore, previousScores) {
  const count = previousScores.length;
  return count === 0 ? null : currentScore - previousScores[count - 1];
}
|
|
3352
|
+
/**
 * Creates the context object handed to termination conditions.
 *
 * `previousScores` is a copy of the state's list; `history` is the state's
 * `completedRounds` array itself (not a copy).
 *
 * @param state - Current cycle state.
 * @param currentScore - Score of the round that just finished.
 * @returns `{ currentRound, latestScore, previousScores, totalCost, history }`.
 */
function buildCycleContext(state, currentScore) {
  const { currentRound, previousScores, totalCost, completedRounds } = state;
  return {
    currentRound,
    latestScore: currentScore,
    previousScores: Array.from(previousScores),
    totalCost,
    history: completedRounds
  };
}
|
|
3361
|
+
/**
 * Assembles the result record for the round that just ran.
 *
 * `completedAt` is the current time. `suggestionsApproved` starts empty and
 * `promptVersionAfter` starts as the current prompt's version; both are
 * filled in later once the caller's decision is known.
 *
 * @param state - Current cycle state (`currentRound`, `currentPrompt`).
 * @param report - Evaluation report for the round.
 * @param improveResult - Improver output; its `suggestions` are recorded.
 * @param cost - Cost breakdown for the round.
 * @param scoreDelta - Score change versus the previous round, or `null`.
 * @param promptSnapshot - Serialized prompt used in this round.
 * @returns The round result object.
 */
function createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot) {
  const { currentRound, currentPrompt } = state;
  const { suggestions } = improveResult;
  return {
    round: currentRound,
    report,
    completedAt: /* @__PURE__ */ new Date(),
    suggestionsGenerated: suggestions,
    // Populated once the caller has decided which suggestions to accept.
    suggestionsApproved: [],
    promptSnapshot,
    promptVersionAfter: currentPrompt.version,
    cost,
    scoreDelta
  };
}
|
|
3375
|
+
/**
 * Finalizes the cycle when the caller decides to stop.
 *
 * Records the round in the session (with the unmodified prompt snapshot),
 * marks the session complete, waits for pending saves, then returns the
 * overall cycle result.
 *
 * @param state - Current cycle state.
 * @param session - Session receiving the final round.
 * @param roundResult - Result of the last round.
 * @param promptSnapshot - Serialized prompt used in the last round.
 * @param terminatedByCondition - Whether a termination condition fired.
 * @param conditionReason - Reason reported by that condition.
 * @returns `{ rounds, finalPrompt, terminationReason, totalCost, history }`.
 */
async function handleStopDecision(state, session, roundResult, promptSnapshot, terminatedByCondition, conditionReason) {
  let reason = "User requested stop";
  if (terminatedByCondition) {
    reason = conditionReason;
  }
  session.addRound(roundResult, promptSnapshot);
  session.complete(reason);
  await session.flush();
  const { completedRounds } = state;
  completedRounds.push(roundResult);
  return {
    rounds: completedRounds,
    finalPrompt: deserializePrompt(session.history.currentPrompt),
    terminationReason: reason,
    totalCost: state.totalCost,
    history: session.history
  };
}
|
|
3389
|
+
/**
 * Restores the prompt that was used in an earlier completed round.
 *
 * The target is located by its `round` number rather than by array position.
 * `completedRounds` only holds rounds finished in the current run and skips
 * rounds that were themselves rolled back, so position `n - 1` does not hold
 * round `n` for resumed sessions or after an earlier rollback.
 *
 * Side effects on `state`: `currentPrompt` becomes the deserialized
 * `promptSnapshot` of the target round and `previousScores` is truncated to
 * its first `rollbackToRound - 1` entries.
 *
 * @param state - Current cycle state (mutated).
 * @param rollbackToRound - Number of the round to roll back to.
 * @throws Error when no completed round has that number.
 */
function handleRollbackDecision(state, rollbackToRound) {
  const targetRound = state.completedRounds.find((entry) => entry.round === rollbackToRound);
  if (targetRound === void 0) {
    throw new Error(`Cannot rollback to round ${rollbackToRound}: round not found`);
  }
  state.currentPrompt = deserializePrompt(targetRound.promptSnapshot);
  state.previousScores = state.previousScores.slice(0, rollbackToRound - 1);
}
|
|
3398
|
+
/**
 * Applies the caller's approved suggestions and records the round.
 *
 * When at least one suggestion is approved, the current prompt is replaced
 * by the result of `applyPromptSuggestions` and the round's
 * `promptVersionAfter` is updated to the new version. The round is then
 * added to the session together with a fresh snapshot of the current prompt
 * and appended to `state.completedRounds`.
 *
 * @param state - Current cycle state (mutated).
 * @param session - Session receiving the round.
 * @param roundResult - Result of the round that just ran (not mutated).
 * @param approvedSuggestions - Suggestions accepted by the caller.
 * @param versionBump - Passed to `applyPromptSuggestions` as `bumpVersion`.
 * @returns The recorded copy of the round result.
 */
function handleContinueDecision(state, session, roundResult, approvedSuggestions, versionBump) {
  const finalizedRound = { ...roundResult, suggestionsApproved: approvedSuggestions };
  const hasApprovals = approvedSuggestions.length > 0;
  if (hasApprovals) {
    const applied = applyPromptSuggestions(state.currentPrompt, approvedSuggestions, {
      bumpVersion: versionBump
    });
    state.currentPrompt = applied.prompt;
    finalizedRound.promptVersionAfter = state.currentPrompt.version;
  }
  const snapshotAfter = serializePrompt(state.currentPrompt);
  session.addRound(finalizedRound, snapshotAfter);
  state.completedRounds.push(finalizedRound);
  return finalizedRound;
}
|
|
3415
|
+
/**
 * Runs one evaluation round with the current prompt.
 *
 * Builds an agent from the prompt, evaluates the test cases through an eval
 * suite, asks the improver (if configured) for suggestions, and computes the
 * round's cost.
 *
 * @param config - Cycle configuration (`createAgent`, `judge`, `improver`, `testCases`, `options`).
 * @param state - Current cycle state; only `currentPrompt` is read.
 * @param pricingConfig - Pricing configuration forwarded to the cost calculation.
 * @returns `{ report, improveResult, cost }`; `improveResult` is
 *   `{ suggestions: [] }` when no improver is configured.
 */
async function executeRound(config, state, pricingConfig) {
  const { createAgent, judge, improver, testCases: cases, options = {} } = config;
  const { agentDescription, runOptions } = options;
  const suite = createEvalSuite({
    agent: createAgent(state.currentPrompt),
    judge,
    agentDescription
  });
  const report = await suite.run(cases, runOptions);
  let improveResult = { suggestions: [] };
  if (improver) {
    improveResult = await improver.improve(state.currentPrompt, report.results);
  }
  const cost = calculateRoundCost(report, improveResult, pricingConfig);
  return { report, improveResult, cost };
}
|
|
3428
|
+
/**
 * Guesses the provider of the improver's model from the model name prefix.
 *
 * @param model - Model identifier; may be empty or missing.
 * @returns "openai" for names starting with "gpt-", "o1" or "o3"; "google"
 *   for "gemini-"; "anthropic" for "claude-", for a missing name, and for
 *   anything unrecognized.
 */
function detectProviderForImprover(model) {
  const fallbackProvider = "anthropic";
  if (!model) {
    return fallbackProvider;
  }
  const prefixToProvider = [
    ["claude-", "anthropic"],
    ["gpt-", "openai"],
    ["o1", "openai"],
    ["o3", "openai"],
    ["gemini-", "google"]
  ];
  const match = prefixToProvider.find(([prefix]) => model.startsWith(prefix));
  return match ? match[1] : fallbackProvider;
}
|
|
3435
|
+
/**
 * Copies the three token counters out of a usage record.
 *
 * @param usage - Object carrying `inputTokens`, `outputTokens` and `totalTokens`.
 * @returns A new object containing only those three properties.
 */
function toLanguageModelUsage2(usage) {
  const { inputTokens, outputTokens, totalTokens } = usage;
  return { inputTokens, outputTokens, totalTokens };
}
|
|
3442
|
+
/**
 * Computes what the improver call cost, based on its reported token usage.
 *
 * @param improveResult - Improver output; `metadata.tokenUsage` and
 *   `metadata.model` are read when present.
 * @param pricingConfig - Optional pricing configuration; the entry under
 *   `providerPricing[provider]` is forwarded to the cost calculation.
 * @returns The `total` from `calculateCostFromUsage2`, or `0` when no token
 *   usage was reported.
 */
function calculateImproverCost(improveResult, pricingConfig) {
  const metadata = improveResult.metadata;
  const tokenUsage = metadata?.tokenUsage;
  if (!tokenUsage) {
    return 0;
  }
  const modelName = metadata?.model ?? "unknown";
  const providerName = detectProviderForImprover(modelName);
  const pricingForProvider = pricingConfig?.providerPricing?.[providerName];
  const { total } = calculateCostFromUsage2(
    toLanguageModelUsage2(tokenUsage),
    modelName,
    providerName,
    pricingForProvider
  );
  return total;
}
|
|
3456
|
+
/**
 * Combines agent, judge and improver costs for one round.
 *
 * Report costs are only calculated when a pricing configuration is given;
 * without one, agent and judge costs are zero. The improver cost is always
 * computed and added to the total.
 *
 * @param report - Evaluation report of the round.
 * @param improveResult - Improver output of the round.
 * @param pricingConfig - Optional pricing configuration.
 * @returns `{ agent, judge, improver, total }`.
 */
function calculateRoundCost(report, improveResult, pricingConfig) {
  let reportCosts = { total: 0, byComponent: { agent: 0, judge: 0 } };
  if (pricingConfig) {
    reportCosts = calculateReportCosts(report, pricingConfig);
  }
  const improver = calculateImproverCost(improveResult, pricingConfig);
  const { byComponent } = reportCosts;
  return {
    agent: byComponent.agent ?? 0,
    judge: byComponent.judge ?? 0,
    improver,
    total: reportCosts.total + improver
  };
}
|
|
3466
|
+
/**
 * Drives the improvement loop as an async generator.
 *
 * Each iteration runs one round, then yields
 * `{ roundResult, pendingSuggestions, terminationCheck, context }` and waits
 * for the caller's decision via `next(decision)`:
 *  - no decision or `action: "stop"`: the round is recorded, the session is
 *    completed and the generator returns the final cycle result;
 *  - `action: "rollback"` with `rollbackToRound` set: the prompt is restored
 *    and the loop continues without recording this round;
 *  - anything else: the approved suggestions are applied and the round is recorded.
 *
 * If anything throws, the session is completed with `Error: <message>` and
 * the error is rethrown.
 *
 * @param config - Cycle configuration (`initialPrompt`, `terminateWhen`, `options`, plus the fields `executeRound` reads).
 */
async function* runImprovementCycle(config) {
  const { initialPrompt, terminateWhen = [], options = {} } = config;
  const {
    pricingConfig,
    versionBump = "patch",
    history: historyConfig,
    session: existingSession
  } = options;
  let session = existingSession;
  if (session === null || session === void 0) {
    const sessionConfig = historyConfig
      ? { path: historyConfig.path, autoSave: historyConfig.autoSave }
      : void 0;
    session = createSession(initialPrompt, sessionConfig);
  }
  const state = initializeCycleState(initialPrompt, existingSession);
  try {
    for (;;) {
      state.currentRound++;
      const { report, improveResult, cost } = await executeRound(config, state, pricingConfig);
      state.totalCost += cost.total;
      const latestScore = report.summary.avgScore;
      const scoreDelta = calculateScoreDelta(latestScore, state.previousScores);
      const promptSnapshot = serializePrompt(state.currentPrompt);
      const roundResult = createRoundResult(state, report, improveResult, cost, scoreDelta, promptSnapshot);
      // The context is built before the new score is appended, so its
      // `previousScores` excludes the round that just finished.
      const context = buildCycleContext(state, latestScore);
      state.previousScores.push(latestScore);
      const terminationCheck = await checkCycleTermination(terminateWhen, context);
      const pendingSuggestions = improveResult.suggestions.map((suggestion) => {
        return { ...suggestion, approved: false };
      });
      const decision = yield { roundResult, pendingSuggestions, terminationCheck, context };
      const wantsStop = !decision || decision.action === "stop";
      if (wantsStop) {
        return await handleStopDecision(
          state,
          session,
          roundResult,
          promptSnapshot,
          terminationCheck.terminated,
          terminationCheck.reason
        );
      }
      const wantsRollback = decision.action === "rollback" && decision.rollbackToRound !== void 0;
      if (wantsRollback) {
        handleRollbackDecision(state, decision.rollbackToRound);
        continue;
      }
      handleContinueDecision(
        state,
        session,
        roundResult,
        decision.approvedSuggestions ?? [],
        versionBump
      );
    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error);
    session.complete(`Error: ${errorMessage}`);
    throw error;
  }
}
|
|
3525
|
+
/**
 * Runs an improvement cycle to completion without human input.
 *
 * After every round: if the termination check fired, the cycle is told to
 * stop; otherwise every pending suggestion is approved and the cycle continues.
 *
 * @param config - Configuration forwarded to `runImprovementCycle`.
 * @returns The value returned by the cycle generator when it finishes.
 */
async function runImprovementCycleAuto(config) {
  const decide = (roundYield) => {
    if (roundYield.terminationCheck.terminated) {
      return { action: "stop" };
    }
    const approvedSuggestions = roundYield.pendingSuggestions.map((suggestion) => {
      return { ...suggestion, approved: true };
    });
    return { action: "continue", approvedSuggestions };
  };
  const cycle = runImprovementCycle(config);
  let step = await cycle.next();
  for (; !step.done; step = await cycle.next(decide(step.value))) {
    // Work happens in the update clause: answer each yield with a decision.
  }
  return step.value;
}
|
|
3544
|
+
|
|
3545
|
+
// src/core/test-case-collection.ts
|
|
3546
|
+
/**
 * Immutable, chainable collection of test cases.
 * Selection methods return new collections; the underlying array is frozen.
 */
var TestCaseCollection = class _TestCaseCollection {
  cases;
  constructor(cases) {
    const snapshot = [...cases];
    this.cases = Object.freeze(snapshot);
  }
  // ============================================================================
  // Static Factories
  // ============================================================================
  /**
   * Build a collection from an array of test cases.
   */
  static from(cases) {
    return new _TestCaseCollection(cases);
  }
  /**
   * Build a collection containing no test cases.
   */
  static empty() {
    return new _TestCaseCollection([]);
  }
  // ============================================================================
  // Properties
  // ============================================================================
  /**
   * How many test cases the collection holds.
   */
  get length() {
    return this.cases.length;
  }
  /**
   * True when the collection holds no test cases.
   */
  get isEmpty() {
    return this.cases.length === 0;
  }
  // ============================================================================
  // Selection Methods (return new TestCaseCollection - chainable)
  // ============================================================================
  /**
   * Selects every test case.
   * The collection is immutable, so this simply returns `this`;
   * handy as an explicit first step in a chain.
   */
  all() {
    return this;
  }
  /**
   * Selects the first N test cases (default: 1).
   * Intended for cheap runs during development.
   */
  minimal(count = 1) {
    return this.first(count);
  }
  /**
   * Selects the first N test cases.
   */
  first(count) {
    if (!(count <= 0)) {
      return new _TestCaseCollection(this.cases.slice(0, count));
    }
    return _TestCaseCollection.empty();
  }
  /**
   * Selects the last N test cases (default: 1).
   * The original ordering is kept (earlier cases first).
   */
  last(count = 1) {
    if (!(count <= 0)) {
      const total = this.cases.length;
      const from = Math.max(0, total - count);
      return new _TestCaseCollection(this.cases.slice(from));
    }
    return _TestCaseCollection.empty();
  }
  /**
   * Selects N test cases at random.
   *
   * @param count - Number of cases to select
   * @param options - Optional seed for reproducibility
   *
   * @example
   * ```typescript
   * // Different each time
   * collection.random(5)
   *
   * // Same result with same seed
   * collection.random(5, { seed: 42 })
   * ```
   */
  random(count, options) {
    const total = this.cases.length;
    if (count <= 0 || total === 0) {
      return _TestCaseCollection.empty();
    }
    const take = Math.min(count, total);
    const order = Array.from({ length: total }, (_unused, position) => position);
    const nextRandom = options?.seed !== void 0 ? createSeededRng(options.seed) : Math.random;
    // Shuffle the whole index list from the end towards the front; each step
    // swaps the current slot with a randomly chosen slot at or before it.
    let slot = order.length - 1;
    while (slot > 0) {
      const pick = Math.floor(nextRandom() * (slot + 1));
      const held = order[slot];
      order[slot] = order[pick];
      order[pick] = held;
      slot -= 1;
    }
    const chosen = [];
    for (const position of order.slice(0, take)) {
      chosen.push(this.cases[position]);
    }
    return new _TestCaseCollection(chosen);
  }
  /**
   * Selects the test cases for which the predicate returns a truthy value.
   */
  filter(predicate) {
    const kept = this.cases.filter(predicate);
    return new _TestCaseCollection(kept);
  }
  /**
   * Selects the test case with the given ID.
   * The result holds one case, or is empty when nothing matches.
   */
  byId(id) {
    const match = this.cases.find((tc) => tc.id === id);
    if (match) {
      return new _TestCaseCollection([match]);
    }
    return _TestCaseCollection.empty();
  }
  /**
   * Selects test cases by several IDs.
   * Results follow the order of the provided IDs (first occurrence).
   * Unknown IDs are skipped and repeated IDs are only used once.
   *
   * @example
   * ```typescript
   * collection.byIds(['a', 'b', 'a'])  // returns [case-a, case-b] (deduplicated)
   * collection.byIds(['b', 'a'])       // returns [case-b, case-a] (order preserved)
   * ```
   */
  byIds(ids) {
    // A Set iterates in insertion order, i.e. first occurrence of each ID.
    const wanted = new Set(ids);
    const firstMatch = /* @__PURE__ */ new Map();
    for (const tc of this.cases) {
      const isCandidate = tc.id && wanted.has(tc.id);
      if (isCandidate && !firstMatch.has(tc.id)) {
        firstMatch.set(tc.id, tc);
      }
    }
    const ordered = [];
    for (const id of wanted) {
      const match = firstMatch.get(id);
      if (match !== void 0) {
        ordered.push(match);
      }
    }
    return new _TestCaseCollection(ordered);
  }
  // ============================================================================
  // Access Methods
  // ============================================================================
  /**
   * Look up a single test case by ID.
   * Yields undefined when there is no match.
   */
  get(id) {
    return this.cases.find((tc) => tc.id === id);
  }
  /**
   * Look up a test case by position.
   * Negative positions count from the end (-1 is the last item).
   * Yields undefined when the position is out of bounds.
   */
  at(index) {
    const total = this.cases.length;
    const position = index < 0 ? total + index : index;
    const outOfBounds = position < 0 || position >= total;
    return outOfBounds ? void 0 : this.cases[position];
  }
  // ============================================================================
  // Conversion Methods
  // ============================================================================
  /**
   * Produce a plain array.
   * The array is a mutable copy of the internal one.
   */
  toArray() {
    return [...this.cases];
  }
  // ============================================================================
  // Iterator Support
  // ============================================================================
  /**
   * Makes the collection usable in for...of loops and with the spread operator.
   */
  [Symbol.iterator]() {
    return this.cases[Symbol.iterator]();
  }
};
|
|
3727
|
+
// Module-wide counter backing the auto-generated test case IDs.
var autoIdCounter = 0;
/**
 * Wraps an input in a test case object.
 *
 * @param input - Input for the test case.
 * @param id - Optional ID; when null or undefined, the next ID of the form
 *   `test-<n>` is generated from the module-wide counter.
 * @returns `{ id, input }`.
 */
function testCase(input, id) {
  if (id === null || id === void 0) {
    autoIdCounter += 1;
    return { id: `test-${autoIdCounter}`, input };
  }
  return { id, input };
}
|
|
3734
|
+
/**
 * Wraps each input in a test case whose ID is `<prefix>-<index>`, with the
 * index starting at 0.
 *
 * @param inputs - Array of inputs.
 * @param prefix - ID prefix (default: "case").
 * @returns Array of `{ id, input }` objects in the same order as `inputs`.
 */
function testCases(inputs, prefix = "case") {
  const toCase = (input, position) => {
    return { id: `${prefix}-${position}`, input };
  };
  return inputs.map(toCase);
}
|
|
3740
|
+
/**
 * Creates a deterministic pseudo-random number generator.
 *
 * Each call of the returned function advances an internal 32-bit counter by
 * a fixed increment, scrambles it with integer multiplications and xor-shifts,
 * and scales the unsigned 32-bit result into the range [0, 1).
 *
 * @param seed - Starting value of the internal counter.
 * @returns Function producing the next number of the sequence on each call.
 */
function createSeededRng(seed) {
  let counter = seed;
  return function nextRandom() {
    // 0x6d2b79f5 === 1831565813; `| 0` wraps the sum to a signed 32-bit integer.
    counter = (counter + 0x6d2b79f5) | 0;
    let mixed = Math.imul(counter ^ (counter >>> 15), counter | 1);
    mixed = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    // `>>> 0` reinterprets the value as unsigned before dividing by 2^32.
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    return unsigned / 2 ** 32;
  };
}
|
|
3749
|
+
export {
|
|
3750
|
+
ANTHROPIC_PRICING,
|
|
3751
|
+
CompositeReporter,
|
|
3752
|
+
ConsoleReporter,
|
|
3753
|
+
DEFAULT_PRICING_CONFIG,
|
|
3754
|
+
EvalError,
|
|
3755
|
+
EvalErrorCode,
|
|
3756
|
+
GOOGLE_PRICING,
|
|
3757
|
+
JsonReporter,
|
|
3758
|
+
MarkdownReporter,
|
|
3759
|
+
MockProvider,
|
|
3760
|
+
OPENAI_PRICING,
|
|
3761
|
+
TestCaseCollection,
|
|
3762
|
+
accuracy,
|
|
3763
|
+
addCostsToResults,
|
|
3764
|
+
afterTurns,
|
|
3765
|
+
aggregateIterationResults,
|
|
3766
|
+
aiUser,
|
|
3767
|
+
and,
|
|
3768
|
+
applyPromptSuggestions,
|
|
3769
|
+
bumpVersion,
|
|
3770
|
+
calculateAvgPassRate,
|
|
3771
|
+
calculateAvgStdDev,
|
|
3772
|
+
calculateCostFromUsage3 as calculateCostFromUsage,
|
|
3773
|
+
calculateIterationStats,
|
|
3774
|
+
calculateMultiTurnIterationStats,
|
|
3775
|
+
calculateReportCosts,
|
|
3776
|
+
calculateResultCost,
|
|
3777
|
+
checkCondition,
|
|
3778
|
+
checkCycleCondition,
|
|
3779
|
+
checkCycleTermination,
|
|
3780
|
+
checkTermination,
|
|
3781
|
+
compareReports,
|
|
3782
|
+
compileTemplate3 as compileTemplate,
|
|
3783
|
+
consistency,
|
|
3784
|
+
createCompositeReporter,
|
|
3785
|
+
createConsoleReporter,
|
|
3786
|
+
createDefaultReporter,
|
|
3787
|
+
createEvalSuite,
|
|
3788
|
+
createFilePromptRepository,
|
|
3789
|
+
createImprover,
|
|
3790
|
+
createJsonReporter,
|
|
3791
|
+
createJudge,
|
|
3792
|
+
createMarkdownReporter,
|
|
3793
|
+
createMockAgent,
|
|
3794
|
+
createMockImprover,
|
|
3795
|
+
createMockJudge,
|
|
3796
|
+
createReportRunner,
|
|
3797
|
+
createSession,
|
|
3798
|
+
customCondition,
|
|
3799
|
+
and2 as cycleAnd,
|
|
3800
|
+
not2 as cycleNot,
|
|
3801
|
+
or2 as cycleOr,
|
|
3802
|
+
cycleToMarkdown,
|
|
3803
|
+
defaultHistoryStorage,
|
|
3804
|
+
defineConfig,
|
|
3805
|
+
deserializePrompt,
|
|
3806
|
+
discoverEvalFiles,
|
|
3807
|
+
executeMultiTurnTestCase,
|
|
3808
|
+
executeTestCase,
|
|
3809
|
+
fieldEquals,
|
|
3810
|
+
fieldIsSet,
|
|
3811
|
+
getFieldValue,
|
|
3812
|
+
getFileSourceDisplayInfo,
|
|
3813
|
+
getFileSourcesDisplayInfo2 as getFileSourcesDisplayInfo,
|
|
3814
|
+
inferMediaType,
|
|
3815
|
+
isCustomCondition,
|
|
3816
|
+
isCustomCycleCondition,
|
|
3817
|
+
isCycleTerminated,
|
|
3818
|
+
isFieldSetCondition,
|
|
3819
|
+
isFieldValueCondition,
|
|
3820
|
+
isFileSource,
|
|
3821
|
+
isFileSourceBase64,
|
|
3822
|
+
isFileSourceData,
|
|
3823
|
+
isFileSourcePath,
|
|
3824
|
+
isFileSourceUrl,
|
|
3825
|
+
isIteratedResult,
|
|
3826
|
+
isMaxCostCondition,
|
|
3827
|
+
isMaxRoundsCondition,
|
|
3828
|
+
isMaxTurnsCondition,
|
|
3829
|
+
isMultiTurnResult,
|
|
3830
|
+
isMultiTurnTestCase,
|
|
3831
|
+
isNoImprovementCondition,
|
|
3832
|
+
isSingleTurnResult,
|
|
3833
|
+
isTargetScoreCondition,
|
|
3834
|
+
isTerminated,
|
|
3835
|
+
loadHistory,
|
|
3836
|
+
logCycle,
|
|
3837
|
+
maxCost,
|
|
3838
|
+
maxRounds,
|
|
3839
|
+
mock,
|
|
3840
|
+
naturalLanguage,
|
|
3841
|
+
noImprovement,
|
|
3842
|
+
not,
|
|
3843
|
+
or,
|
|
3844
|
+
relevance,
|
|
3845
|
+
reportToMarkdown,
|
|
3846
|
+
resolveFileSource,
|
|
3847
|
+
resolveFileSourcesInInput3 as resolveFileSourcesInInput,
|
|
3848
|
+
resumeSession,
|
|
3849
|
+
runImprovementCycle,
|
|
3850
|
+
runImprovementCycleAuto,
|
|
3851
|
+
runWithConcurrency,
|
|
3852
|
+
saveCycleJson,
|
|
3853
|
+
saveCycleMarkdown,
|
|
3854
|
+
saveHistory,
|
|
3855
|
+
saveReportMarkdown,
|
|
3856
|
+
scanForFileSources,
|
|
3857
|
+
schema,
|
|
3858
|
+
selectRepresentativeResult,
|
|
3859
|
+
serializePrompt,
|
|
3860
|
+
suggestionDiff,
|
|
3861
|
+
suggestionPreview,
|
|
3862
|
+
suggestionSummary,
|
|
3863
|
+
targetScore,
|
|
3864
|
+
testCase,
|
|
3865
|
+
testCases,
|
|
3866
|
+
toEvalAgent
|
|
3867
|
+
};
|
|
3868
|
+
//# sourceMappingURL=index.js.map
|