@gnsx/genesys.agent.eval 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/bundle/cli.js +1149 -0
- package/dist/bundle/cli.js.map +1 -0
- package/dist/src/args.d.ts +2 -16
- package/dist/src/args.d.ts.map +1 -1
- package/dist/src/args.js +57 -207
- package/dist/src/args.js.map +1 -1
- package/dist/src/{launcher.d.ts → cli.d.ts} +1 -1
- package/dist/src/cli.d.ts.map +1 -0
- package/dist/src/{launcher.js → cli.js} +5 -11
- package/dist/src/cli.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/dist/tsup.config.d.ts +3 -0
- package/dist/tsup.config.d.ts.map +1 -0
- package/dist/tsup.config.js +13 -0
- package/dist/tsup.config.js.map +1 -0
- package/package.json +8 -4
- package/dist/src/launcher.d.ts.map +0 -1
- package/dist/src/launcher.js.map +0 -1
|
@@ -0,0 +1,1149 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/cli.ts
|
|
4
|
+
import { resolve as resolve2 } from "path";
|
|
5
|
+
|
|
6
|
+
// ../../../cli-utils/dist/self-update.js
|
|
7
|
+
import { execSync } from "child_process";
|
|
8
|
+
import { fileURLToPath } from "url";
|
|
9
|
+
/**
 * Tell whether this module is running from inside a `node_modules` tree.
 *
 * @returns {boolean} `true` when the path of the current module contains
 *   "node_modules"; `false` otherwise, or when the module URL cannot be
 *   converted to a file path.
 */
function isInstalledPackage() {
  let modulePath;
  try {
    modulePath = fileURLToPath(import.meta.url);
  } catch {
    // A module URL that is not a file URL means we cannot tell; treat as local.
    return false;
  }
  return modulePath.includes("node_modules");
}
|
|
17
|
+
/**
 * Guess which package manager should be used for a global update.
 *
 * Signals are checked in this order: the `PNPM_PACKAGE_NAME` environment
 * variable, `npm_execpath` containing "pnpm", then the script path
 * (`process.argv[1]`) containing "pnpm" or "npm".
 *
 * @returns {"pnpm" | "npm"} The detected manager; "pnpm" when nothing matches.
 */
function detectPackageManager() {
  const { PNPM_PACKAGE_NAME: pnpmPackageName, npm_execpath: npmExecPath } = process.env;
  const scriptPath = process.argv[1] || "";

  const pnpmDetected =
    Boolean(pnpmPackageName) ||
    Boolean(npmExecPath?.includes("pnpm")) ||
    scriptPath.includes("pnpm");
  if (pnpmDetected) {
    return "pnpm";
  }

  // "npm" is only tested after "pnpm" because "pnpm" contains "npm".
  return scriptPath.includes("npm") ? "npm" : "pnpm";
}
|
|
29
|
+
/**
 * Compare two dotted version strings on their first three components.
 *
 * Each component is converted with `Number`; missing or non-numeric
 * components count as 0.
 *
 * @param {string} latest - Candidate version, e.g. "1.2.3".
 * @param {string} current - Version to compare against.
 * @returns {boolean} `true` only when `latest` is strictly greater.
 */
function isNewerVersion(latest, current) {
  const toComponents = (version) => version.split(".").map(Number);
  const candidate = toComponents(latest);
  const baseline = toComponents(current);

  for (const position of [0, 1, 2]) {
    const left = candidate[position] || 0;
    const right = baseline[position] || 0;
    if (left === right) {
      continue;
    }
    // The first differing component decides the result.
    return left > right;
  }
  return false;
}
|
|
42
|
+
/**
 * Ask the npm registry whether a newer version of a package is published.
 *
 * The check is best-effort: it is skipped when not running from an installed
 * package, and any failure (network error, timeout, bad response) yields
 * `null` instead of throwing.
 *
 * @param {string} packageName - Package name to look up.
 * @param {string} currentVersion - Version currently running.
 * @returns {Promise<string | null>} The newer version string, or `null` when
 *   there is none or the check could not be completed.
 */
async function checkForUpdates(packageName, currentVersion) {
  try {
    if (!isInstalledPackage()) {
      return null;
    }

    const registryUrl = `https://registry.npmjs.org/${encodeURIComponent(packageName)}/latest`;
    // Abort the request after 10 seconds.
    const response = await fetch(registryUrl, { signal: AbortSignal.timeout(1e4) });
    if (!response.ok) {
      return null;
    }

    const { version: latestVersion } = await response.json();
    const isUpgrade =
      Boolean(latestVersion) &&
      latestVersion !== currentVersion &&
      isNewerVersion(latestVersion, currentVersion);
    return isUpgrade ? latestVersion : null;
  } catch {
    return null;
  }
}
|
|
63
|
+
/**
 * Reinstall a globally installed package at its latest version.
 *
 * Runs "remove then add" through pnpm when `packageManager` is "pnpm",
 * otherwise "uninstall then install" through npm. The commands inherit this
 * process's stdio.
 *
 * @param {string} packageName - Package to update.
 * @param {string} packageManager - "pnpm" selects pnpm; any other value selects npm.
 * @param {string} currentVersion - Version currently installed (for display).
 * @param {string} latestVersion - Version being installed (for display).
 * @returns {boolean} `true` when the command succeeded, `false` when it failed.
 */
function performUpdate(packageName, packageManager, currentVersion, latestVersion) {
  const steps = packageManager === "pnpm"
    ? [`pnpm remove -g ${packageName}`, `pnpm add -g ${packageName}@latest`]
    : [`npm uninstall -g ${packageName}`, `npm install -g ${packageName}@latest`];

  console.log(`\nUpdating ${packageName}...`);
  console.log(` Current: v${currentVersion}`);
  console.log(` Latest: v${latestVersion}\n`);

  try {
    execSync(steps.join(" && "), { stdio: "inherit" });
    console.log(`\n\u2705 Update complete! ${packageName} has been updated to v${latestVersion}.`);
    return true;
  } catch {
    console.error("\n\u274C Update failed. Please try running the commands manually:");
    // Print each step separately so the user can run them one at a time.
    for (const step of steps) {
      console.error(` ${step}`);
    }
    return false;
  }
}
|
|
88
|
+
/**
 * Register an `update` subcommand on a CLI program.
 *
 * The subcommand checks for a newer published version and, when one exists,
 * reinstalls the package globally. Its action always ends the process through
 * `process.exit`.
 *
 * @param {object} program - Program object providing chainable
 *   `command()`, `description()` and `action()` methods.
 * @param {string} packageName - Package name to check and update.
 * @param {string} currentVersion - Version currently running.
 */
function addUpdateCommand(program, packageName, currentVersion) {
  const runUpdate = async () => {
    if (!isInstalledPackage()) {
      console.log("Skipping update check - running from local development.");
      process.exit(0);
    }

    console.log("Checking for updates...");
    const latestVersion = await checkForUpdates(packageName, currentVersion);
    if (!latestVersion) {
      console.log(`You are already using the latest version (v${currentVersion}).`);
      process.exit(0);
    }

    console.log(`\n\u{1F4E6} Update available: v${currentVersion} \u2192 v${latestVersion}`);
    const packageManager = detectPackageManager();
    if (packageManager === "unknown") {
      const manualInstructions = [
        "\n\u274C Could not detect package manager. Please update manually:",
        ` pnpm remove -g ${packageName} && pnpm add -g ${packageName}@latest`,
        ` or`,
        ` npm uninstall -g ${packageName} && npm install -g ${packageName}@latest`,
      ];
      for (const line of manualInstructions) {
        console.error(line);
      }
      process.exit(1);
    }

    const updated = performUpdate(packageName, packageManager, currentVersion, latestVersion);
    process.exit(updated ? 0 : 1);
  };

  program
    .command("update")
    .description("Check for updates and install the latest version")
    .action(runUpdate);
}
|
|
114
|
+
|
|
115
|
+
// src/args.ts
|
|
116
|
+
import { Command } from "commander";
|
|
117
|
+
// Accepted values for the --format option.
var VALID_FORMATS = [
  "console",
  "json",
  "html",
];

// Accepted values for the --judge-type option.
var VALID_JUDGE_TYPES = [
  "embedding",
  "llm",
];
|
|
119
|
+
/**
 * Check whether a value is one of the supported output formats.
 *
 * @param {unknown} value - Candidate format.
 * @returns {boolean} `true` when `value` is listed in `VALID_FORMATS`.
 */
function isValidFormat(value) {
  const isSupported = VALID_FORMATS.includes(value);
  return isSupported;
}
|
|
122
|
+
/**
 * Check whether a value is one of the supported judge types.
 *
 * @param {unknown} value - Candidate judge type.
 * @returns {boolean} `true` when `value` is listed in `VALID_JUDGE_TYPES`.
 */
function isValidJudgeType(value) {
  const isSupported = VALID_JUDGE_TYPES.includes(value);
  return isSupported;
}
|
|
125
|
+
/**
 * Parse command-line arguments into the evaluation options object.
 *
 * Registers the `update` subcommand and all options, then validates
 * `--format`, `--judge-type`, `--timeout` and `--parallel`. A validation
 * failure prints a message to stderr and exits the process with code 1.
 *
 * @param {string[]} argv - User arguments (parsed with `{ from: "user" }`).
 * @param {string} version - Version reported by `-v, --version`.
 * @param {string} packageName - Package name passed to the update command.
 * @returns {object} Parsed options; `timeout` and `parallel` are integers,
 *   `help` and `version` are always `false`.
 */
function parseArgs(argv, version, packageName) {
  const exitWithError = (message) => {
    console.error(message);
    process.exit(1);
  };

  const program = new Command();
  program
    .name("genesys-eval")
    .description("Agent evaluation harness for benchmarking AI agents")
    .version(version, "-v, --version");
  addUpdateCommand(program, packageName, version);
  program
    .option("--tests <path>", "path to YAML test file", "./eval-tests.yaml")
    .option("-a, --agent <command>", "agent CLI command to test", "genesys")
    .option("--cwd <dir>", "working directory for test context", process.cwd())
    .option("-t, --timeout <secs>", "timeout per test in seconds", "120")
    .option("-o, --output <path>", "output file for results")
    .option("--format <format>", "output format: console, json, html", "console")
    .option("-p, --parallel <n>", "number of parallel test executions", "1")
    .option("--judge-type <type>", "judge type: embedding, llm", "embedding")
    .option("--judge-model <model>", "model for LLM judge", "claude-3-5-sonnet-20241022")
    .option("--judge-provider <provider>", "provider for LLM judge", "anthropic")
    .parse(argv, { from: "user" });

  const options = program.opts();

  if (!isValidFormat(options.format)) {
    exitWithError(`--format must be one of: ${VALID_FORMATS.join(", ")}`);
  }
  if (!isValidJudgeType(options.judgeType)) {
    exitWithError(`--judge-type must be one of: ${VALID_JUDGE_TYPES.join(", ")}`);
  }

  const timeout = parseInt(options.timeout, 10);
  if (Number.isNaN(timeout) || timeout < 1) {
    exitWithError("--timeout must be a positive integer");
  }

  const parallel = parseInt(options.parallel, 10);
  if (Number.isNaN(parallel) || parallel < 1) {
    exitWithError("--parallel must be a positive integer");
  }

  const { tests, agent, cwd, output, format, judgeType, judgeModel, judgeProvider } = options;
  return {
    tests,
    agent,
    cwd,
    timeout,
    output,
    format,
    parallel,
    judgeType,
    judgeModel,
    judgeProvider,
    help: false,
    version: false,
  };
}
|
|
164
|
+
|
|
165
|
+
// src/embedding-judge.ts
|
|
166
|
+
import { pipeline } from "@huggingface/transformers";
|
|
167
|
+
/**
 * Judge that scores agent output by embedding similarity.
 *
 * The expected and actual outputs are embedded with a feature-extraction
 * pipeline, and their cosine similarity (clamped to [0, 1]) is the score.
 */
var EmbeddingJudge = class {
  _config;
  _pipeline = null;
  _modelLoading = null;

  /**
   * @param {object} [config] - Overrides for the defaults
   *   (`passThreshold: 0.7`, `model: "Xenova/all-MiniLM-L6-v2"`).
   */
  constructor(config = {}) {
    this._config = {
      passThreshold: 0.7,
      model: "Xenova/all-MiniLM-L6-v2",
      ...config
    };
  }

  /**
   * Get or create the embedding pipeline.
   * Lazy loads the model on first use; concurrent callers share one load.
   *
   * A failed load is not cached: the in-flight promise is cleared so that a
   * later call starts a fresh attempt instead of re-rejecting forever.
   *
   * @returns {Promise<Function>} The loaded pipeline.
   * @throws Whatever the pipeline factory rejects with.
   */
  async getPipeline() {
    if (this._pipeline) {
      return this._pipeline;
    }
    if (!this._modelLoading) {
      this._modelLoading = pipeline(
        "feature-extraction",
        this._config.model
      );
    }
    const loading = this._modelLoading;
    try {
      this._pipeline = await loading;
      return this._pipeline;
    } catch (error) {
      // Only clear our own attempt; a newer attempt may already be in flight.
      if (this._modelLoading === loading) {
        this._modelLoading = null;
      }
      throw error;
    }
  }

  /**
   * Generate embeddings for text.
   *
   * @param text - Text to embed
   * @returns Embedding vector as a plain array (mean-pooled, normalized)
   */
  async generateEmbedding(text) {
    const pipe = await this.getPipeline();
    const output = await pipe(text, {
      pooling: "mean",
      normalize: true
    });
    return Array.from(output.data);
  }

  /**
   * Calculate cosine similarity between two vectors.
   *
   * @param {number[]} a - First vector; its length drives the iteration.
   * @param {number[]} b - Second vector.
   * @returns {number} Similarity clamped to [0, 1]; 0 when either vector has
   *   zero magnitude.
   */
  cosineSimilarity(a, b) {
    let dotProduct = 0;
    let normA = 0;
    let normB = 0;
    for (let i = 0; i < a.length; i++) {
      dotProduct += a[i] * b[i];
      normA += a[i] * a[i];
      normB += b[i] * b[i];
    }
    const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
    if (magnitude === 0) {
      return 0;
    }
    // Negative similarities are reported as 0.
    return Math.max(0, Math.min(1, dotProduct / magnitude));
  }

  /**
   * Evaluate a test case against the actual output.
   *
   * Never throws: an embedding failure is reported as a failed result with
   * score 0 and the error message in `reasoning`.
   *
   * @param test - The test case (its `expectedOutput` is embedded)
   * @param actualOutput - The actual output from the agent
   * @returns The judge result with score, reasoning and pass flag
   */
  async evaluate(test, actualOutput) {
    try {
      const expectedEmbedding = await this.generateEmbedding(test.expectedOutput);
      const actualEmbedding = await this.generateEmbedding(actualOutput);
      const score = this.cosineSimilarity(expectedEmbedding, actualEmbedding);
      let reasoning;
      if (score >= 0.9) {
        reasoning = "Very high semantic similarity - output closely matches expected content.";
      } else if (score >= 0.75) {
        reasoning = "Good semantic similarity with minor differences in meaning or detail.";
      } else if (score >= this._config.passThreshold) {
        reasoning = "Moderate similarity - core concepts match but notable differences exist.";
      } else if (score >= 0.4) {
        reasoning = "Low semantic similarity - significant differences in meaning.";
      } else {
        reasoning = "Very low similarity - output does not match expected content.";
      }
      const passed = score >= this._config.passThreshold;
      return {
        score,
        reasoning,
        passed
      };
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        reasoning: `Embedding evaluation failed: ${errorMessage}`,
        passed: false
      };
    }
  }

  /**
   * Create a judge function compatible with the TestRunner.
   *
   * @returns A function `(test, actualOutput)` resolving to
   *   `{ score, reasoning, passed }`
   */
  createEvaluator() {
    return async (test, actualOutput) => {
      const result = await this.evaluate(test, actualOutput);
      return {
        score: result.score,
        reasoning: result.reasoning,
        passed: result.passed
      };
    };
  }

  /**
   * Get the judge configuration.
   */
  get config() {
    return this._config;
  }
};
|
|
289
|
+
|
|
290
|
+
// src/judge.ts
|
|
291
|
+
import { anthropic } from "@ai-sdk/anthropic";
|
|
292
|
+
import { google } from "@ai-sdk/google";
|
|
293
|
+
import { openai } from "@ai-sdk/openai";
|
|
294
|
+
import { generateObject } from "ai";
|
|
295
|
+
import { z } from "zod";
|
|
296
|
+
// Schema for the structured verdict requested from the judge LLM.
var judgeOutputSchema = z.object({
  // Numeric verdict, constrained to the range 0..1.
  score: z
    .number()
    .min(0)
    .max(1)
    .describe("Score from 0 to 1 where 1 is perfect"),
  // Free-text justification for the score.
  reasoning: z
    .string()
    .describe("Explanation for the score"),
});
|
|
300
|
+
/**
 * Judge that asks an LLM to score agent output against an expected-output
 * description.
 */
var Judge = class {
  _config;

  /**
   * @param config - Judge settings; merged over the defaults
   *   `passThreshold: 0.7` and `temperature: 0`.
   */
  constructor(config) {
    const defaults = { passThreshold: 0.7, temperature: 0 };
    this._config = { ...defaults, ...config };
  }

  /**
   * Build the judge prompt.
   *
   * @param test - The test case (`input`, `expectedOutput`, optional `context`)
   * @param actualOutput - The actual output from the agent
   * @returns The prompt to send to the judge LLM
   */
  buildPrompt(test, actualOutput) {
    // The context section is only present when the test supplies context.
    const contextSection = test.context ? `## Additional Context
${test.context}

` : "";
    return `You are an expert evaluator assessing the quality of AI responses.

Your task is to evaluate how well the ACTUAL OUTPUT matches the EXPECTED OUTPUT description.

## Test Input
${test.input}

## Expected Output Description
${test.expectedOutput}

${contextSection}## Actual Output
${actualOutput}

## Evaluation Instructions

1. Carefully read the expected output description and the actual output
2. Score the actual output on a scale of 0.0 to 1.0 where:
- 1.0 = Perfect match, fully satisfies the expected output
- 0.8-0.9 = Good match, minor issues or omissions
- 0.6-0.7 = Partial match, significant issues but some correct elements
- 0.4-0.5 = Poor match, mostly incorrect or incomplete
- 0.0-0.3 = Very poor match, completely wrong or irrelevant

3. Provide clear reasoning for your score

Respond with a structured object containing:
- score: number from 0 to 1
- reasoning: string explaining your evaluation`;
  }

  /**
   * Get the model instance based on provider.
   *
   * @returns Model instance for the Vercel AI SDK
   * @throws {Error} When the configured provider is not supported
   */
  getModel() {
    const { provider, model } = this._config;
    if (provider === "anthropic") {
      return anthropic(model);
    }
    if (provider === "openai") {
      return openai(model);
    }
    // "gemini" is accepted as an alias for "google".
    if (provider === "google" || provider === "gemini") {
      return google(model);
    }
    throw new Error(`Unsupported judge provider: ${provider}`);
  }

  /**
   * Evaluate a test case against the actual output.
   *
   * A failure while obtaining the model or generating the verdict is
   * reported as a failed result with score 0 rather than thrown.
   *
   * @param test - The test case
   * @param actualOutput - The actual output from the agent
   * @returns The judge result with score and reasoning
   */
  async evaluate(test, actualOutput) {
    const prompt = this.buildPrompt(test, actualOutput);
    try {
      const request = {
        model: this.getModel(),
        schema: judgeOutputSchema,
        messages: [{ role: "user", content: prompt }],
        temperature: this._config.temperature
      };
      const { object: verdict } = await generateObject(request);
      const threshold = this._config.passThreshold ?? 0.7;
      return {
        score: verdict.score,
        reasoning: verdict.reasoning,
        passed: verdict.score >= threshold
      };
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        reasoning: `Judge evaluation failed: ${detail}`,
        passed: false
      };
    }
  }

  /**
   * Create a judge function compatible with the TestRunner.
   *
   * @returns A function that can be passed to the runner
   */
  createEvaluator() {
    return async (test, actualOutput) => {
      const { score, reasoning, passed } = await this.evaluate(test, actualOutput);
      return { score, reasoning, passed };
    };
  }

  /**
   * Get the judge configuration.
   */
  get config() {
    return this._config;
  }
};
|
|
430
|
+
|
|
431
|
+
// src/reporter.ts
|
|
432
|
+
import { writeFile } from "fs/promises";
|
|
433
|
+
/**
 * Render a duration for display.
 *
 * @param {number} ms - Duration in milliseconds.
 * @returns {string} `"<ms>ms"` below one second, otherwise seconds with two
 *   decimals, e.g. `"1.50s"`.
 */
function formatDuration(ms) {
  const isSubSecond = ms < 1e3;
  return isSubSecond ? `${ms}ms` : `${(ms / 1e3).toFixed(2)}s`;
}
|
|
439
|
+
/**
 * Shorten a string to at most `maxLength` characters.
 *
 * @param {string} str - Text to shorten.
 * @param {number} maxLength - Maximum length of the result.
 * @returns {string} `str` unchanged when it fits; otherwise its first
 *   `maxLength - 3` characters followed by "...".
 */
function truncate(str, maxLength) {
  const fits = str.length <= maxLength;
  return fits ? str : `${str.slice(0, maxLength - 3)}...`;
}
|
|
445
|
+
/**
 * Format one test result as a row of the console results table.
 *
 * @param {object} result - Test result (`passed`, `judgeScore`, `durationMs`,
 *   `testId` are read).
 * @param {number} maxWidth - Accepted for interface compatibility; not used.
 * @returns {string} Row of the form ` STATUS | id | score | duration`.
 */
function formatResultForConsole(result, maxWidth) {
  const columns = [
    (result.passed ? "PASS" : "FAIL").padEnd(4),
    truncate(result.testId, 20).padEnd(20),
    `${(result.judgeScore * 100).toFixed(0)}%`.padEnd(4),
    formatDuration(result.durationMs),
  ];
  return ` ${columns.join(" | ")}`;
}
|
|
452
|
+
// ANSI escape sequences for console styling, built from their SGR codes.
var colors = Object.fromEntries(
  [
    ["reset", 0],
    ["bright", 1],
    ["dim", 2],
    ["green", 32],
    ["red", 31],
    ["yellow", 33],
    ["blue", 34],
    ["cyan", 36],
  ].map(([name, code]) => [name, `\x1B[${code}m`]),
);
|
|
462
|
+
/**
 * Formats evaluation results as console text, JSON or HTML, and optionally
 * writes the formatted output to a file.
 */
var Reporter = class {
  _config;

  /**
   * @param config - Reporter settings; `format` and `outputPath` are read.
   */
  constructor(config) {
    this._config = config;
  }

  /**
   * Format results as a console table.
   *
   * @param results - Evaluation results
   * @returns Formatted string for console output
   */
  formatConsole(results) {
    const { summary } = results;
    const lines = [];
    lines.push("");
    lines.push(`${colors.bright}Evaluation Results${colors.reset}`);
    lines.push(`${colors.dim}${"=".repeat(60)}${colors.reset}`);
    lines.push("");
    lines.push(`${colors.cyan}Suite:${colors.reset} ${results.suite.name}`);
    if (results.suite.description) {
      lines.push(`${colors.cyan}Description:${colors.reset} ${results.suite.description}`);
    }
    lines.push(`${colors.cyan}Agent:${colors.reset} ${results.agent}`);
    lines.push(`${colors.cyan}Timestamp:${colors.reset} ${new Date(results.timestamp).toLocaleString()}`);
    lines.push("");
    lines.push(`${colors.bright}Summary:${colors.reset}`);
    lines.push(` ${colors.cyan}Total:${colors.reset} ${summary.total}`);
    lines.push(` ${colors.green}Passed:${colors.reset} ${summary.passed}`);
    lines.push(` ${colors.red}Failed:${colors.reset} ${summary.failed}`);
    lines.push(` ${colors.yellow}Avg Score:${colors.reset} ${(summary.avgScore * 100).toFixed(1)}%`);
    lines.push(` ${colors.dim}Duration:${colors.reset} ${formatDuration(summary.totalDurationMs)}`);
    lines.push("");
    lines.push(`${colors.bright}Test Results:${colors.reset}`);
    lines.push(` ${colors.dim}Status | ID | Score | Duration${colors.reset}`);
    lines.push(` ${colors.dim}${"-".repeat(55)}${colors.reset}`);

    // One table row per result; failures are collected for the detail section.
    const failedResults = [];
    for (const result of results.results) {
      const color = result.passed ? colors.green : colors.red;
      lines.push(`${color}${formatResultForConsole(result, 80)}${colors.reset}`);
      if (!result.passed) {
        failedResults.push(result);
      }
    }
    lines.push("");

    if (failedResults.length > 0) {
      lines.push(`${colors.bright}Failed Test Details:${colors.reset}`);
      lines.push(`${colors.dim}${"=".repeat(60)}${colors.reset}`);
      for (const result of failedResults) {
        lines.push("");
        lines.push(`${colors.red}${colors.bright}Test: ${result.testId}${colors.reset}`);
        lines.push(`${colors.dim}Input:${colors.reset} ${result.input}`);
        lines.push(`${colors.yellow}Expected:${colors.reset} ${result.expectedOutput}`);
        lines.push(`${colors.cyan}Actual:${colors.reset} ${result.actualOutput}`);
        if (result.judgeReasoning) {
          lines.push(`${colors.dim}Reasoning: ${result.judgeReasoning}${colors.reset}`);
        }
        lines.push(`${colors.dim}${"-".repeat(40)}${colors.reset}`);
      }
      lines.push("");
    }

    if (summary.failed > 0) {
      lines.push(`${colors.yellow}Some tests failed. Review the results above.${colors.reset}`);
    } else {
      lines.push(`${colors.green}All tests passed!${colors.reset}`);
    }
    lines.push("");
    return lines.join("\n");
  }

  /**
   * Format results as JSON.
   *
   * @param results - Evaluation results
   * @returns JSON string (2-space indented)
   */
  formatJson(results) {
    return JSON.stringify(results, null, 2);
  }

  /**
   * Format results as HTML.
   *
   * Suite name, agent, test ids and reasoning are HTML-escaped. A result
   * without `judgeReasoning` renders an empty reasoning cell.
   *
   * @param results - Evaluation results
   * @returns HTML string
   */
  formatHtml(results) {
    const { summary } = results;
    // judgeReasoning is optional (formatConsole guards it too); default to ""
    // so truncate/escapeHtml never receive undefined.
    const testRows = results.results.map((result) => `
<tr class="${result.passed ? "passed" : "failed"}">
<td class="status">${result.passed ? "PASS" : "FAIL"}</td>
<td class="id">${escapeHtml(result.testId)}</td>
<td class="score">${(result.judgeScore * 100).toFixed(0)}%</td>
<td class="duration">${formatDuration(result.durationMs)}</td>
<td class="reasoning">${escapeHtml(truncate(result.judgeReasoning ?? "", 100))}</td>
</tr>
`).join("\n");
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Eval Results: ${escapeHtml(results.suite.name)}</title>
<style>
* { box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
background: #f5f5f5;
}
.container {
background: white;
border-radius: 8px;
padding: 2rem;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
h1 { margin-top: 0; color: #333; }
h2 { color: #555; border-bottom: 2px solid #eee; padding-bottom: 0.5rem; }
.meta {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin-bottom: 2rem;
}
.meta-item {
background: #f8f9fa;
padding: 1rem;
border-radius: 4px;
}
.meta-label { font-weight: 600; color: #666; font-size: 0.875rem; }
.meta-value { color: #333; }
.summary {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 1rem;
margin: 2rem 0;
}
.summary-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1.5rem;
border-radius: 8px;
text-align: center;
}
.summary-card.passed { background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%); }
.summary-card.failed { background: linear-gradient(135deg, #eb3349 0%, #f45c43 100%); }
.summary-card.warning { background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); }
.summary-value { font-size: 2rem; font-weight: bold; }
.summary-label { font-size: 0.875rem; opacity: 0.9; }
table {
width: 100%;
border-collapse: collapse;
margin-top: 1rem;
}
th, td {
padding: 0.75rem;
text-align: left;
border-bottom: 1px solid #eee;
}
th {
font-weight: 600;
color: #555;
background: #f8f9fa;
}
tr.passed .status { color: #11998e; font-weight: 600; }
tr.failed .status { color: #eb3349; font-weight: 600; }
.footer {
margin-top: 2rem;
padding-top: 1rem;
border-top: 1px solid #eee;
text-align: center;
color: #666;
}
</style>
</head>
<body>
<div class="container">
<h1>Evaluation Results</h1>

<div class="meta">
<div class="meta-item">
<div class="meta-label">Suite</div>
<div class="meta-value">${escapeHtml(results.suite.name)}</div>
</div>
<div class="meta-item">
<div class="meta-label">Agent</div>
<div class="meta-value">${escapeHtml(results.agent)}</div>
</div>
<div class="meta-item">
<div class="meta-label">Timestamp</div>
<div class="meta-value">${new Date(results.timestamp).toLocaleString()}</div>
</div>
</div>

<h2>Summary</h2>
<div class="summary">
<div class="summary-card">
<div class="summary-value">${summary.total}</div>
<div class="summary-label">Total Tests</div>
</div>
<div class="summary-card ${summary.passed === summary.total ? "passed" : summary.passed === 0 ? "failed" : "warning"}">
<div class="summary-value">${summary.passed}</div>
<div class="summary-label">Passed</div>
</div>
<div class="summary-card ${summary.failed === 0 ? "passed" : "failed"}">
<div class="summary-value">${summary.failed}</div>
<div class="summary-label">Failed</div>
</div>
<div class="summary-card">
<div class="summary-value">${(summary.avgScore * 100).toFixed(1)}%</div>
<div class="summary-label">Avg Score</div>
</div>
<div class="summary-card">
<div class="summary-value">${formatDuration(summary.totalDurationMs)}</div>
<div class="summary-label">Duration</div>
</div>
</div>

<h2>Test Results</h2>
<table>
<thead>
<tr>
<th>Status</th>
<th>ID</th>
<th>Score</th>
<th>Duration</th>
<th>Reasoning</th>
</tr>
</thead>
<tbody>
${testRows}
</tbody>
</table>

<div class="footer">
<p>Generated by genesys-eval</p>
</div>
</div>
</body>
</html>`;
  }

  /**
   * Report evaluation results.
   *
   * @param results - Evaluation results
   * @returns The formatted output string; any format other than "json" or
   *   "html" is rendered as console text
   */
  report(results) {
    switch (this._config.format) {
      case "json": {
        return this.formatJson(results);
      }
      case "html": {
        return this.formatHtml(results);
      }
      case "console":
      default: {
        return this.formatConsole(results);
      }
    }
  }

  /**
   * Report results and optionally write to file.
   *
   * When `outputPath` is set, the formatted output is written there. The
   * output is printed to the console when the format is "console" or no
   * output path is set; otherwise only the output path is printed.
   *
   * @param results - Evaluation results
   */
  async reportAndSave(results) {
    const output = this.report(results);
    if (this._config.outputPath) {
      await writeFile(this._config.outputPath, output, "utf-8");
    }
    if (this._config.format === "console" || !this._config.outputPath) {
      console.log(output);
    } else {
      console.log(`Results written to: ${this._config.outputPath}`);
    }
  }

  /**
   * Get the reporter configuration.
   */
  get config() {
    return this._config;
  }
};
|
|
748
|
+
/**
 * Escape the characters that are significant in HTML text and attribute
 * values so the string can be embedded safely in markup.
 *
 * `&` is replaced first so the entities produced by the later replacements
 * are not escaped a second time.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(text) {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
751
|
+
|
|
752
|
+
// src/cli-runner.ts
|
|
753
|
+
import { spawn } from "child_process";
|
|
754
|
+
/**
 * Error raised when running an agent CLI command fails.
 *
 * Carries the command that was run, its exit code and the captured stderr.
 */
var CLIError = class extends Error {
  /**
   * @param {string} message - Human-readable description.
   * @param {string} command - Command that was run.
   * @param {number} exitCode - Exit code associated with the failure.
   * @param {string} stderr - Captured standard error output.
   */
  constructor(message, command, exitCode, stderr) {
    super(message);
    Object.assign(this, { command, exitCode, stderr, name: "CLIError" });
  }
};
|
|
763
|
+
/**
 * Split an agent command string into the executable and its arguments.
 *
 * The string is trimmed first. It is split on runs of whitespace only when
 * it contains at least one space character; otherwise the whole trimmed
 * string is the executable.
 *
 * @param {string} agentCommand - Command line, e.g. "node cli.js --flag".
 * @returns {[string, string[]]} Tuple of executable and argument list.
 */
function parseAgentCommand(agentCommand) {
  const commandLine = agentCommand.trim();
  if (commandLine.includes(" ")) {
    const [executable, ...rest] = commandLine.split(/\s+/);
    return [executable, rest];
  }
  return [commandLine, []];
}
|
|
773
|
+
/**
 * Run an agent CLI with `-p`, feeding it the prompt on stdin.
 *
 * The agent string is split into executable and arguments, `-p` is appended,
 * and the process is spawned in `options.cwd`. A shell is used when the agent
 * string has arguments or on Windows.
 *
 * @param {string} agent - Agent command, e.g. "genesys" or "node cli.js".
 * @param {string} prompt - Text written to the child's stdin.
 * @param {{cwd: string, timeout: number}} options - Working directory and
 *   timeout in milliseconds.
 * @returns {Promise<{output: string, exitCode: number, stderr: string, durationMs: number}>}
 *   Trimmed stdout and stderr, exit code (0 when none is reported) and
 *   elapsed time.
 * @throws {CLIError} (as a rejection) when the process cannot be spawned or
 *   does not finish within the timeout.
 */
async function runAgent(agent, prompt, options) {
  const startTime = Date.now();
  return new Promise((resolve3, reject) => {
    let stdout = "";
    let stderr = "";
    const [cmd, cmdArgs] = parseAgentCommand(agent);
    const isCompoundCommand = cmdArgs.length > 0;
    const useShell = isCompoundCommand || process.platform === "win32";
    const spawnArgs = [...cmdArgs, "-p"];
    const child = spawn(cmd, spawnArgs, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"],
      env: { ...process.env },
      shell: useShell
    });
    if (child.stdin) {
      // A child that exits (or never starts) before reading the prompt makes
      // the stdin stream emit "error" (e.g. EPIPE). Without a listener that
      // would crash the whole harness; the outcome is already reported via
      // the child's "error"/"close" events, so the stream error is ignored.
      child.stdin.on("error", () => {});
      child.stdin.write(prompt, "utf-8");
      child.stdin.end();
    }
    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });
    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });
    const timeout = setTimeout(() => {
      child.kill("SIGTERM");
      reject(new CLIError(
        `Command timed out after ${options.timeout}ms`,
        `${agent} -p`,
        -1,
        stderr
      ));
    }, options.timeout);
    child.on("error", (error) => {
      clearTimeout(timeout);
      reject(new CLIError(
        `Failed to spawn ${agent}: ${error.message}. Make sure the command is installed and in PATH.`,
        `${agent} -p`,
        -1,
        stderr
      ));
    });
    child.on("close", (code) => {
      clearTimeout(timeout);
      const durationMs = Date.now() - startTime;
      // If the promise was already rejected (timeout/spawn error) this is a no-op.
      resolve3({
        output: stdout.trim(),
        exitCode: code ?? 0,
        stderr: stderr.trim(),
        durationMs
      });
    });
  });
}
|
|
828
|
+
|
|
829
|
+
// src/test-loader.ts
|
|
830
|
+
import { readFile } from "fs/promises";
|
|
831
|
+
import { resolve } from "path";
|
|
832
|
+
import YAML from "yaml";
|
|
833
|
+
import { z as z2 } from "zod";
|
|
834
|
+
// Schema for one test case: non-empty id, input and expectedOutput strings,
// plus an optional context string.
var testCaseSchema = (() => {
  const requiredText = (message) => z2.string().min(1, message);
  return z2.object({
    id: requiredText("Test case ID is required"),
    input: requiredText("Test case input is required"),
    context: z2.string().optional(),
    expectedOutput: requiredText("Test case expectedOutput is required")
  });
})();
|
|
840
|
+
// Schema for a whole suite: a non-empty name, optional description and
// context strings, and at least one test case.
var testSuiteSchema = (() => {
  const optionalText = () => z2.string().optional();
  return z2.object({
    name: z2.string().min(1, "Test suite name is required"),
    description: optionalText(),
    context: optionalText(),
    tests: z2.array(testCaseSchema).min(1, "At least one test case is required")
  });
})();
|
|
846
|
+
/**
 * Error for a test suite file whose content fails validation.
 *
 * Records the file path and the list of issues supplied by the caller.
 */
var TestValidationError = class extends Error {
  constructor(message, path, issues) {
    super(message);
    Object.assign(this, { path, issues, name: "TestValidationError" });
  }
};
|
|
854
|
+
/**
 * Error for a test suite file that could not be read or parsed.
 *
 * Records the file path and the underlying error as `cause`.
 */
var TestLoadError = class extends Error {
  constructor(message, path, cause) {
    super(message);
    Object.assign(this, { path, cause, name: "TestLoadError" });
  }
};
|
|
862
|
+
/**
 * Load a YAML test suite from disk and validate it.
 *
 * @param filePath - Path to the suite file, resolved against `cwd`
 * @param cwd - Base directory for relative paths (defaults to process.cwd())
 * @returns `{ suite, path }` with the validated suite and its absolute path
 * @throws TestLoadError when the file cannot be read or is not valid YAML
 * @throws TestValidationError when the content does not match the suite
 *   schema or two test cases share an id
 */
async function loadTestSuite(filePath, cwd = process.cwd()) {
  const absolutePath = resolve(cwd, filePath);
  let content;
  try {
    content = await readFile(absolutePath, "utf-8");
  } catch (error) {
    throw new TestLoadError(
      `Failed to read test file: ${absolutePath}`,
      absolutePath,
      error
    );
  }
  let parsed;
  try {
    parsed = YAML.parse(content);
  } catch (error) {
    throw new TestLoadError(
      `Failed to parse YAML: ${error instanceof Error ? error.message : String(error)}`,
      absolutePath,
      error
    );
  }
  const result = testSuiteSchema.safeParse(parsed);
  if (!result.success) {
    throw new TestValidationError(
      `Test suite validation failed: ${result.error.message}`,
      absolutePath,
      result.error.issues
    );
  }
  const validated = result.data;
  // Single pass over the ids: `seen` holds every id met so far, `duplicates`
  // collects repeated ids in the order their first repeat appears.
  const seen = new Set();
  const duplicates = new Set();
  for (const { id } of validated.tests) {
    if (seen.has(id)) {
      duplicates.add(id);
    } else {
      seen.add(id);
    }
  }
  if (duplicates.size > 0) {
    throw new TestValidationError(
      `Duplicate test IDs found: ${[...duplicates].join(", ")}`,
      absolutePath,
      []
    );
  }
  return {
    suite: validated,
    path: absolutePath
  };
}
|
|
907
|
+
|
|
908
|
+
// src/runner.ts
|
|
909
|
+
/**
 * Execute one test case: build the prompt, run the agent, judge the output.
 *
 * Failures while running or judging do not propagate; they are turned into
 * a failed result carrying the error message.
 *
 * @param test - Test case (id, input, optional context, expectedOutput)
 * @param suite - Suite the test belongs to (its optional context is prepended)
 * @param agent - Agent command string
 * @param timeout - Timeout handed to the agent run
 * @param judge - Async function (test, output) => { score, reasoning, passed }
 * @param progress - Optional progress callbacks
 * @param index - Zero-based position of the test
 * @param total - Number of tests in the run
 * @returns The result record for this test
 */
async function runTest(test, suite, agent, timeout, judge, progress, index, total) {
  progress?.onTestStart(test.id, index, total);
  const startTime = Date.now();
  // Prompt layout: optional suite context, optional test context, then the task.
  const promptLines = [
    ...(suite.context ? ["Context:", suite.context, ""] : []),
    ...(test.context ? ["Specific Context:", test.context, ""] : []),
    "Task:",
    test.input
  ];
  const prompt = promptLines.join("\n");
  // Fields shared by the success and the failure record.
  const describeTest = () => ({
    testId: test.id,
    input: test.input,
    expectedOutput: test.expectedOutput
  });
  try {
    const agentRun = await runAgent(agent, prompt, { cwd: process.cwd(), timeout });
    const { score, reasoning, passed } = await judge(test, agentRun.output);
    const result = {
      ...describeTest(),
      actualOutput: agentRun.output,
      judgeScore: score,
      judgeReasoning: reasoning,
      durationMs: agentRun.durationMs,
      passed
    };
    progress?.onTestComplete(result, index, total);
    return result;
  } catch (error) {
    const durationMs = Date.now() - startTime;
    let errorMessage;
    if (error instanceof Error) {
      errorMessage = error.message;
    } else {
      errorMessage = String(error);
    }
    progress?.onTestError(test.id, errorMessage, index, total);
    return {
      ...describeTest(),
      actualOutput: "",
      judgeScore: 0,
      judgeReasoning: `Error: ${errorMessage}`,
      durationMs,
      passed: false,
      error: errorMessage
    };
  }
}
|
|
957
|
+
/**
 * Map `runner` over `items` with at most `concurrency` calls in flight.
 *
 * Results are returned in input order regardless of completion order.
 * A concurrency that is not a number, or is below 2 after rounding down,
 * runs the items one at a time.
 *
 * @param items - Array of inputs
 * @param concurrency - Maximum number of simultaneous runner calls
 * @param runner - Async function (item, index) => result
 * @returns Array of results, index-aligned with `items`
 */
async function runInParallel(items, concurrency, runner) {
  // Array lengths must be non-negative integers, so round fractional values
  // down and treat NaN as "no parallelism" instead of letting it throw.
  const requested = Math.floor(Number(concurrency));
  const limit = Number.isNaN(requested) ? 1 : requested;
  if (limit <= 1) {
    const sequential = [];
    for (let i = 0; i < items.length; i++) {
      sequential.push(await runner(items[i], i));
    }
    return sequential;
  }
  const results = new Array(items.length);
  // Shared cursor: each worker claims the next unprocessed index. The claim
  // happens synchronously before the await, so no index is taken twice.
  let next = 0;
  const worker = async () => {
    while (next < items.length) {
      const current = next++;
      results[current] = await runner(items[current], current);
    }
  };
  const workerCount = Math.min(limit, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
977
|
+
/**
 * Runs every test of a suite against an agent and aggregates the outcome.
 */
var TestRunner = class {
  _config;
  constructor(config) {
    this._config = config;
  }
  /**
   * Load the suite, execute its tests and summarise the results.
   *
   * @param judge - Function to evaluate agent outputs
   * @param progress - Optional progress callback
   * @returns The evaluation results (suite, agent, timestamp, per-test
   *   results and a summary)
   */
  async run(judge, progress) {
    const { suite } = await loadTestSuite(this._config.testsPath, this._config.cwd);
    const testCount = suite.tests.length;
    const results = await runInParallel(
      suite.tests,
      this._config.parallel,
      (test, index) => runTest(
        test,
        suite,
        this._config.agent,
        this._config.timeout,
        judge,
        progress,
        index,
        testCount
      )
    );
    // One pass gathers everything the summary needs.
    let totalDurationMs = 0;
    let passed = 0;
    let scoreSum = 0;
    for (const entry of results) {
      totalDurationMs += entry.durationMs;
      scoreSum += entry.judgeScore;
      if (entry.passed) {
        passed += 1;
      }
    }
    const summary = {
      total: results.length,
      passed,
      failed: results.length - passed,
      avgScore: scoreSum / results.length,
      totalDurationMs
    };
    return {
      suite,
      agent: this._config.agent,
      timestamp: new Date().toISOString(),
      results,
      summary
    };
  }
  /**
   * Get the runner configuration.
   */
  get config() {
    return this._config;
  }
};
|
|
1033
|
+
/**
 * Convenience wrapper: create a TestRunner for `config` and run it.
 *
 * @param config - Runner configuration
 * @param judge - Function to evaluate agent outputs
 * @param progress - Optional progress callback
 * @returns The evaluation results produced by the runner
 */
async function runEvaluation(config, judge, progress) {
  return new TestRunner(config).run(judge, progress);
}
|
|
1037
|
+
|
|
1038
|
+
// src/utils/package.ts
|
|
1039
|
+
import { readFileSync } from "fs";
|
|
1040
|
+
import { dirname, join } from "path";
|
|
1041
|
+
import { fileURLToPath as fileURLToPath2 } from "url";
|
|
1042
|
+
/**
 * Read the package.json located two directories above this module.
 *
 * @returns The parsed manifest, or a hard-coded name/version fallback when
 *   the file cannot be read or parsed
 */
function getPackageJson() {
  const moduleDir = dirname(fileURLToPath2(import.meta.url));
  const manifestPath = join(moduleDir, "..", "..", "package.json");
  try {
    return JSON.parse(readFileSync(manifestPath, "utf-8"));
  } catch {
    // Best effort: a missing or malformed manifest falls back to defaults.
    const fallback = {
      version: "1.0.0",
      name: "@gnsx/genesys.agent.eval"
    };
    return fallback;
  }
}
|
|
1056
|
+
|
|
1057
|
+
// src/cli.ts
|
|
1058
|
+
// ANSI escape sequences used to colour terminal output, built from their
// numeric SGR codes.
var colors2 = {};
for (const [colorName, sgrCode] of [
  ["reset", 0],
  ["bright", 1],
  ["dim", 2],
  ["green", 32],
  ["red", 31],
  ["yellow", 33],
  ["blue", 34],
  ["cyan", 36]
]) {
  colors2[colorName] = `\x1B[${sgrCode}m`;
}
|
|
1068
|
+
/**
 * Build the progress callbacks used while tests run.
 *
 * Every line is prefixed with the test's position so that output stays
 * attributable when several tests run at once.
 *
 * @param verbose - When true, a line is also printed as each test starts
 * @returns Object with onTestStart, onTestComplete and onTestError
 */
function createProgressCallback(verbose) {
  const position = (index, total) => `[${index + 1}/${total}]`;
  return {
    onTestStart(testId, index, total) {
      if (verbose) {
        console.log(`${position(index, total)} Running: ${testId}`);
      }
    },
    onTestComplete(result, index, total) {
      const status = result.passed ? "PASS" : "FAIL";
      const statusColor = result.passed ? colors2.green : colors2.red;
      // judgeScore is shown as a percentage (multiplied by 100).
      const score = `${(result.judgeScore * 100).toFixed(0)}%`;
      console.log(
        `${position(index, total)} ${statusColor}${status}${colors2.reset} - ${result.testId} (${score})`
      );
    },
    onTestError(testId, error, index, total) {
      console.error(`${position(index, total)} ERROR - ${testId}: ${error}`);
    }
  };
}
|
|
1086
|
+
/**
 * CLI entry point: parse arguments, print the run settings, evaluate the
 * suite, report the results and exit.
 *
 * The process exits with 1 when any test failed or the evaluation threw,
 * and with 0 otherwise.
 *
 * @param argv - Command-line arguments to parse
 */
async function main(argv) {
  const manifest = getPackageJson();
  const args = parseArgs(argv, manifest.version, manifest.name);
  const cwd = resolve2(args.cwd);
  const config = {
    testsPath: args.tests,
    agent: args.agent,
    cwd,
    // The CLI takes seconds; the runner works in milliseconds.
    timeout: args.timeout * 1e3,
    outputPath: args.output,
    format: args.format,
    parallel: args.parallel,
    judge: {
      provider: args.judgeProvider,
      model: args.judgeModel
    }
  };
  const label = (text) => `${colors2.cyan}${text}${colors2.reset}`;
  const banner = [
    `${label("Agent:")} ${colors2.bright}${args.agent}${colors2.reset}`,
    `${label("Judge:")} ${args.judgeType}`,
    `${label("Working directory:")} ${cwd}`,
    `${label("Test file:")} ${args.tests}`,
    `${label("Timeout:")} ${args.timeout}s per test`,
    `${label("Parallelism:")} ${args.parallel}`,
    ""
  ];
  for (const line of banner) {
    console.log(line);
  }
  // The embedding judge and the LLM judge use different pass thresholds.
  const judge = args.judgeType === "embedding"
    ? new EmbeddingJudge({ passThreshold: 0.6 })
    : new Judge({
      provider: args.judgeProvider,
      model: args.judgeModel,
      passThreshold: 0.7
    });
  const judgeEvaluator = judge.createEvaluator();
  const verbose = args.parallel > 1 || args.format === "console";
  const progress = createProgressCallback(verbose);
  try {
    const results = await runEvaluation(config, judgeEvaluator, progress);
    const reporter = new Reporter({
      format: args.format,
      outputPath: args.output
    });
    await reporter.reportAndSave(results);
    const exitCode = results.summary.failed > 0 ? 1 : 0;
    process.exit(exitCode);
  } catch (error) {
    const isError = error instanceof Error;
    console.error(`Evaluation failed: ${isError ? error.message : String(error)}`);
    if (isError) {
      const { message } = error;
      // Add a hint for the two failure causes recognisable from the message.
      if (message.includes("ENOENT")) {
        console.error(`Make sure the ${args.agent} CLI is installed and in your PATH.`);
      }
      const mentionsApiKey = ["ANTHROPIC_API_KEY", "OPENAI_API_KEY"].some((key) => message.includes(key));
      if (mentionsApiKey) {
        console.error("Set the appropriate API key environment variable for the LLM judge.");
      }
    }
    process.exit(1);
  }
}
|
|
1145
|
+
// Start the CLI with the user-supplied arguments (argv without its first two
// entries). Any rejection from main is printed and the process exits with 1.
main(process.argv.slice(2)).catch(function reportFatal(err) {
  let text;
  if (err instanceof Error) {
    text = err.message;
  } else {
    text = String(err);
  }
  console.error(text);
  process.exit(1);
});
|
|
1149
|
+
//# sourceMappingURL=cli.js.map
|