@gnsx/genesys.agent.eval 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1149 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/cli.ts
4
+ import { resolve as resolve2 } from "path";
5
+
6
+ // ../../../cli-utils/dist/self-update.js
7
+ import { execSync } from "child_process";
8
+ import { fileURLToPath } from "url";
9
// Heuristic: when running from an installed package the resolved file path
// lives under a node_modules directory; local dev checkouts do not.
function isInstalledPackage() {
  try {
    return fileURLToPath(import.meta.url).includes("node_modules");
  } catch {
    // import.meta.url may not be a file: URL (e.g. bundled/eval contexts).
    return false;
  }
}
17
+ function detectPackageManager() {
18
+ if (process.env.PNPM_PACKAGE_NAME)
19
+ return "pnpm";
20
+ if (process.env.npm_execpath?.includes("pnpm"))
21
+ return "pnpm";
22
+ const execPath = process.argv[1] || "";
23
+ if (execPath.includes("pnpm"))
24
+ return "pnpm";
25
+ if (execPath.includes("npm"))
26
+ return "npm";
27
+ return "pnpm";
28
+ }
29
/**
 * Compare two dotted version strings numerically.
 *
 * Fix: the original loop compared exactly three segments, so any segment
 * beyond the third (e.g. "1.0.0.1" vs "1.0.0") was silently ignored.
 * Now compares up to the longer of the two versions; missing segments are
 * treated as 0 (so "1.2" === "1.2.0"). Non-numeric segments (prerelease
 * tags) coerce to NaN and, via `|| 0`, are treated as 0 — same behavior
 * as the original.
 *
 * @param {string} latest - Candidate newer version, e.g. "1.0.3"
 * @param {string} current - Currently installed version, e.g. "1.0.1"
 * @returns {boolean} true iff `latest` is strictly newer than `current`
 */
function isNewerVersion(latest, current) {
  const latestParts = latest.split(".").map(Number);
  const currentParts = current.split(".").map(Number);
  const segments = Math.max(latestParts.length, currentParts.length);
  for (let i = 0; i < segments; i++) {
    const latestPart = latestParts[i] || 0;
    const currentPart = currentParts[i] || 0;
    if (latestPart > currentPart)
      return true;
    if (latestPart < currentPart)
      return false;
  }
  return false;
}
42
/**
 * Query the npm registry for the latest published version of a package.
 * Returns the latest version string only when it is strictly newer than
 * the running version; returns null in every other case (local dev run,
 * network failure, registry error, already up to date).
 */
async function checkForUpdates(packageName, currentVersion) {
  try {
    if (!isInstalledPackage()) {
      return null;
    }
    const registryUrl = `https://registry.npmjs.org/${encodeURIComponent(packageName)}/latest`;
    // 10s cap so a slow registry never stalls the CLI.
    const response = await fetch(registryUrl, {
      signal: AbortSignal.timeout(1e4)
    });
    if (!response.ok) {
      return null;
    }
    const payload = await response.json();
    const latestVersion = payload.version;
    if (!latestVersion || latestVersion === currentVersion) {
      return null;
    }
    return isNewerVersion(latestVersion, currentVersion) ? latestVersion : null;
  } catch {
    // Best-effort check: any failure is treated as "no update available".
    return null;
  }
}
63
/**
 * Run the package manager commands that reinstall this CLI at @latest.
 *
 * Fix: removed the unused `error` catch binding (the caught error was never
 * read; the failure path only prints manual recovery instructions).
 *
 * @param {string} packageName - npm package to reinstall
 * @param {"pnpm"|"npm"} packageManager - manager whose commands to run
 * @param {string} currentVersion - version currently running (display only)
 * @param {string} latestVersion - version being installed (display only)
 * @returns {boolean} true when the update commands succeeded
 */
function performUpdate(packageName, packageManager, currentVersion, latestVersion) {
  console.log(`
Updating ${packageName}...`);
  console.log(` Current: v${currentVersion}`);
  console.log(` Latest: v${latestVersion}
`);
  try {
    // Remove-then-add avoids stale bin links some managers leave behind on
    // a plain upgrade. stdio: "inherit" streams manager output to the user.
    const updateCommand = packageManager === "pnpm" ? `pnpm remove -g ${packageName} && pnpm add -g ${packageName}@latest` : `npm uninstall -g ${packageName} && npm install -g ${packageName}@latest`;
    execSync(updateCommand, { stdio: "inherit" });
    console.log(`
\u2705 Update complete! ${packageName} has been updated to v${latestVersion}.`);
    return true;
  } catch {
    console.error(`
\u274C Update failed. Please try running the commands manually:`);
    if (packageManager === "pnpm") {
      console.error(` pnpm remove -g ${packageName}`);
      console.error(` pnpm add -g ${packageName}@latest`);
    } else {
      console.error(` npm uninstall -g ${packageName}`);
      console.error(` npm install -g ${packageName}@latest`);
    }
    return false;
  }
}
88
/**
 * Register the `update` subcommand on a commander program.
 * The action checks the npm registry, detects the package manager, runs the
 * reinstall, and always terminates the process with an exit code.
 */
function addUpdateCommand(program, packageName, currentVersion) {
  const runUpdateAction = async () => {
    if (!isInstalledPackage()) {
      console.log("Skipping update check - running from local development.");
      process.exit(0);
    }
    console.log("Checking for updates...");
    const latestVersion = await checkForUpdates(packageName, currentVersion);
    if (!latestVersion) {
      console.log(`You are already using the latest version (v${currentVersion}).`);
      process.exit(0);
    }
    console.log(`
\u{1F4E6} Update available: v${currentVersion} \u2192 v${latestVersion}`);
    const packageManager = detectPackageManager();
    if (packageManager === "unknown") {
      // Print both managers' commands so the user can pick the right one.
      console.error("\n\u274C Could not detect package manager. Please update manually:");
      console.error(` pnpm remove -g ${packageName} && pnpm add -g ${packageName}@latest`);
      console.error(` or`);
      console.error(` npm uninstall -g ${packageName} && npm install -g ${packageName}@latest`);
      process.exit(1);
    }
    const success = performUpdate(packageName, packageManager, currentVersion, latestVersion);
    process.exit(success ? 0 : 1);
  };
  program.command("update").description("Check for updates and install the latest version").action(runUpdateAction);
}
114
+
115
+ // src/args.ts
116
+ import { Command } from "commander";
117
// Accepted values for the --format and --judge-type CLI options.
var VALID_FORMATS = ["console", "json", "html"];
var VALID_JUDGE_TYPES = ["embedding", "llm"];
// Guard: is `value` one of the supported output formats?
function isValidFormat(value) {
  return VALID_FORMATS.some((format) => format === value);
}
// Guard: is `value` one of the supported judge types?
function isValidJudgeType(value) {
  return VALID_JUDGE_TYPES.some((judgeType) => judgeType === value);
}
125
/**
 * Parse and validate CLI arguments.
 * Exits the process with code 1 on any invalid option value.
 *
 * @param {string[]} argv - user-style argv (no node/script prefix)
 * @param {string} version - CLI version for -v/--version
 * @param {string} packageName - npm package name used by the update command
 */
function parseArgs(argv, version, packageName) {
  const program = new Command();
  program.name("genesys-eval").description("Agent evaluation harness for benchmarking AI agents").version(version, "-v, --version");
  addUpdateCommand(program, packageName, version);
  program
    .option("--tests <path>", "path to YAML test file", "./eval-tests.yaml")
    .option("-a, --agent <command>", "agent CLI command to test", "genesys")
    .option("--cwd <dir>", "working directory for test context", process.cwd())
    .option("-t, --timeout <secs>", "timeout per test in seconds", "120")
    .option("-o, --output <path>", "output file for results")
    .option("--format <format>", "output format: console, json, html", "console")
    .option("-p, --parallel <n>", "number of parallel test executions", "1")
    .option("--judge-type <type>", "judge type: embedding, llm", "embedding")
    .option("--judge-model <model>", "model for LLM judge", "claude-3-5-sonnet-20241022")
    .option("--judge-provider <provider>", "provider for LLM judge", "anthropic")
    .parse(argv, { from: "user" });
  const opts = program.opts();
  if (!isValidFormat(opts.format)) {
    console.error(`--format must be one of: ${VALID_FORMATS.join(", ")}`);
    process.exit(1);
  }
  if (!isValidJudgeType(opts.judgeType)) {
    console.error(`--judge-type must be one of: ${VALID_JUDGE_TYPES.join(", ")}`);
    process.exit(1);
  }
  // Numeric options arrive as strings; both must be positive integers.
  const toPositiveInt = (raw, flag) => {
    const value = Number.parseInt(raw, 10);
    if (Number.isNaN(value) || value < 1) {
      console.error(`${flag} must be a positive integer`);
      process.exit(1);
    }
    return value;
  };
  const timeout = toPositiveInt(opts.timeout, "--timeout");
  const parallel = toPositiveInt(opts.parallel, "--parallel");
  return {
    tests: opts.tests,
    agent: opts.agent,
    cwd: opts.cwd,
    timeout,
    output: opts.output,
    format: opts.format,
    parallel,
    judgeType: opts.judgeType,
    judgeModel: opts.judgeModel,
    judgeProvider: opts.judgeProvider,
    help: false,
    version: false
  };
}
164
+
165
+ // src/embedding-judge.ts
166
+ import { pipeline } from "@huggingface/transformers";
167
// Judge that scores agent output by cosine similarity between sentence
// embeddings of the expected and actual text. The embedding model is lazy
// loaded on first use and shared across evaluations.
var EmbeddingJudge = class {
  _config;
  _pipeline = null;
  _modelLoading = null;
  /**
   * @param config - Optional overrides: passThreshold (default 0.7) and
   *   model id (default "Xenova/all-MiniLM-L6-v2").
   */
  constructor(config = {}) {
    const defaults = {
      passThreshold: 0.7,
      model: "Xenova/all-MiniLM-L6-v2"
    };
    this._config = { ...defaults, ...config };
  }
  /**
   * Get or create the embedding pipeline.
   * Lazy loads the model on first use; concurrent callers share one load.
   */
  async getPipeline() {
    if (this._pipeline !== null) {
      return this._pipeline;
    }
    if (this._modelLoading === null) {
      this._modelLoading = pipeline(
        "feature-extraction",
        this._config.model
      );
    }
    this._pipeline = await this._modelLoading;
    return this._pipeline;
  }
  /**
   * Generate a mean-pooled, normalized embedding for `text`.
   *
   * @param text - Text to embed
   * @returns Embedding vector as a plain number array
   */
  async generateEmbedding(text) {
    const extractor = await this.getPipeline();
    const tensor = await extractor(text, {
      pooling: "mean",
      normalize: true
    });
    return Array.from(tensor.data);
  }
  /**
   * Cosine similarity of two equal-length vectors, clamped to [0, 1].
   * Returns 0 when either vector has zero magnitude.
   */
  cosineSimilarity(a, b) {
    let dot = 0;
    let sumSqA = 0;
    let sumSqB = 0;
    for (let i = 0; i < a.length; i++) {
      dot += a[i] * b[i];
      sumSqA += a[i] * a[i];
      sumSqB += b[i] * b[i];
    }
    const magnitude = Math.sqrt(sumSqA) * Math.sqrt(sumSqB);
    if (magnitude === 0) {
      return 0;
    }
    return Math.max(0, Math.min(1, dot / magnitude));
  }
  /**
   * Evaluate a test case against the actual output.
   *
   * @param test - The test case
   * @param actualOutput - The actual output from the agent
   * @returns The judge result with score and reasoning
   */
  async evaluate(test, actualOutput) {
    try {
      const expectedVector = await this.generateEmbedding(test.expectedOutput);
      const actualVector = await this.generateEmbedding(actualOutput);
      const score = this.cosineSimilarity(expectedVector, actualVector);
      // Highest matching band wins; bands checked from strongest to weakest.
      const bands = [
        [0.9, "Very high semantic similarity - output closely matches expected content."],
        [0.75, "Good semantic similarity with minor differences in meaning or detail."],
        [this._config.passThreshold, "Moderate similarity - core concepts match but notable differences exist."],
        [0.4, "Low semantic similarity - significant differences in meaning."]
      ];
      let reasoning = "Very low similarity - output does not match expected content.";
      for (const [threshold, text] of bands) {
        if (score >= threshold) {
          reasoning = text;
          break;
        }
      }
      return {
        score,
        reasoning,
        passed: score >= this._config.passThreshold
      };
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        reasoning: `Embedding evaluation failed: ${errorMessage}`,
        passed: false
      };
    }
  }
  /**
   * Create a judge function compatible with the TestRunner.
   *
   * @returns A function that can be passed to the runner
   */
  createEvaluator() {
    return async (test, actualOutput) => {
      const { score, reasoning, passed } = await this.evaluate(test, actualOutput);
      return { score, reasoning, passed };
    };
  }
  /**
   * Get the judge configuration.
   */
  get config() {
    return this._config;
  }
};
289
+
290
+ // src/judge.ts
291
+ import { anthropic } from "@ai-sdk/anthropic";
292
+ import { google } from "@ai-sdk/google";
293
+ import { openai } from "@ai-sdk/openai";
294
+ import { generateObject } from "ai";
295
+ import { z } from "zod";
296
// Structured-output schema enforced on the LLM judge's response via
// generateObject: a score in [0, 1] plus a free-text justification.
var judgeOutputSchema = z.object({
  score: z.number().min(0).max(1).describe("Score from 0 to 1 where 1 is perfect"),
  reasoning: z.string().describe("Explanation for the score")
});
300
// LLM-as-judge: sends the test input, expected-output description, and the
// agent's actual output to a provider model and asks for a structured
// { score, reasoning } verdict.
var Judge = class {
  _config;
  /**
   * @param config - Must supply provider and model; passThreshold defaults
   *   to 0.7 and temperature to 0 (deterministic scoring).
   */
  constructor(config) {
    const defaults = {
      passThreshold: 0.7,
      temperature: 0
    };
    this._config = { ...defaults, ...config };
  }
  /**
   * Build the judge prompt.
   *
   * @param test - The test case
   * @param actualOutput - The actual output from the agent
   * @returns The prompt to send to the judge LLM
   */
  buildPrompt(test, actualOutput) {
    // Context section is included only when the test case provides one.
    const contextSection = test.context ? `## Additional Context
${test.context}

` : "";
    return `You are an expert evaluator assessing the quality of AI responses.

Your task is to evaluate how well the ACTUAL OUTPUT matches the EXPECTED OUTPUT description.

## Test Input
${test.input}

## Expected Output Description
${test.expectedOutput}

${contextSection}## Actual Output
${actualOutput}

## Evaluation Instructions

1. Carefully read the expected output description and the actual output
2. Score the actual output on a scale of 0.0 to 1.0 where:
- 1.0 = Perfect match, fully satisfies the expected output
- 0.8-0.9 = Good match, minor issues or omissions
- 0.6-0.7 = Partial match, significant issues but some correct elements
- 0.4-0.5 = Poor match, mostly incorrect or incomplete
- 0.0-0.3 = Very poor match, completely wrong or irrelevant

3. Provide clear reasoning for your score

Respond with a structured object containing:
- score: number from 0 to 1
- reasoning: string explaining your evaluation`;
  }
  /**
   * Get the model instance based on provider.
   *
   * @returns Model instance for the Vercel AI SDK
   */
  getModel() {
    const { provider, model } = this._config;
    if (provider === "anthropic") {
      return anthropic(model);
    }
    if (provider === "openai") {
      return openai(model);
    }
    // "gemini" is accepted as an alias for the Google provider.
    if (provider === "google" || provider === "gemini") {
      return google(model);
    }
    throw new Error(`Unsupported judge provider: ${provider}`);
  }
  /**
   * Evaluate a test case against the actual output.
   *
   * @param test - The test case
   * @param actualOutput - The actual output from the agent
   * @returns The judge result with score and reasoning
   */
  async evaluate(test, actualOutput) {
    const prompt = this.buildPrompt(test, actualOutput);
    try {
      const { object } = await generateObject({
        model: this.getModel(),
        schema: judgeOutputSchema,
        messages: [{ role: "user", content: prompt }],
        temperature: this._config.temperature
      });
      return {
        score: object.score,
        reasoning: object.reasoning,
        passed: object.score >= (this._config.passThreshold ?? 0.7)
      };
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        reasoning: `Judge evaluation failed: ${errorMessage}`,
        passed: false
      };
    }
  }
  /**
   * Create a judge function compatible with the TestRunner.
   *
   * @returns A function that can be passed to the runner
   */
  createEvaluator() {
    return async (test, actualOutput) => {
      const { score, reasoning, passed } = await this.evaluate(test, actualOutput);
      return { score, reasoning, passed };
    };
  }
  /**
   * Get the judge configuration.
   */
  get config() {
    return this._config;
  }
};
430
+
431
+ // src/reporter.ts
432
+ import { writeFile } from "fs/promises";
433
// Render a millisecond count for humans: whole milliseconds under one
// second, otherwise seconds with two decimal places.
function formatDuration(ms) {
  return ms < 1e3 ? `${ms}ms` : `${(ms / 1e3).toFixed(2)}s`;
}
439
// Clip a string to maxLength characters, replacing the tail with "..." when
// clipping occurs (the ellipsis counts toward the limit).
function truncate(str, maxLength) {
  return str.length <= maxLength ? str : `${str.slice(0, maxLength - 3)}...`;
}
445
// One aligned table row for a single test result:
// "PASS | id | 97% | 1.20s". NOTE(review): maxWidth is unused by the
// current layout; kept for call-site compatibility.
function formatResultForConsole(result, maxWidth) {
  const statusLabel = result.passed ? "PASS" : "FAIL";
  const scoreLabel = `${(result.judgeScore * 100).toFixed(0)}%`;
  const durationLabel = formatDuration(result.durationMs);
  const idLabel = truncate(result.testId, 20);
  return ` ${statusLabel.padEnd(4)} | ${idLabel.padEnd(20)} | ${scoreLabel.padEnd(4)} | ${durationLabel}`;
}
452
// ANSI SGR escape sequences used to colorize console report output.
var colors = {
  reset: "\x1B[0m",
  bright: "\x1B[1m",
  dim: "\x1B[2m",
  green: "\x1B[32m",
  red: "\x1B[31m",
  yellow: "\x1B[33m",
  blue: "\x1B[34m",
  cyan: "\x1B[36m"
};
462
// Formats evaluation results for console, JSON, or HTML output and
// optionally writes them to a file.
// Fix: removed two unused locals (`statusColor` in formatConsole and
// `allPassed` in formatHtml) that were computed but never read.
var Reporter = class {
  // Reporter settings: { format: "console"|"json"|"html", outputPath?: string }.
  _config;
  constructor(config) {
    this._config = config;
  }
  /**
   * Format results as a console table.
   *
   * @param results - Evaluation results
   * @returns Formatted string for console output
   */
  formatConsole(results) {
    const lines = [];
    lines.push("");
    lines.push(`${colors.bright}Evaluation Results${colors.reset}`);
    lines.push(`${colors.dim}${"=".repeat(60)}${colors.reset}`);
    lines.push("");
    lines.push(`${colors.cyan}Suite:${colors.reset} ${results.suite.name}`);
    if (results.suite.description) {
      lines.push(`${colors.cyan}Description:${colors.reset} ${results.suite.description}`);
    }
    lines.push(`${colors.cyan}Agent:${colors.reset} ${results.agent}`);
    lines.push(`${colors.cyan}Timestamp:${colors.reset} ${new Date(results.timestamp).toLocaleString()}`);
    lines.push("");
    const { summary } = results;
    lines.push(`${colors.bright}Summary:${colors.reset}`);
    lines.push(` ${colors.cyan}Total:${colors.reset} ${summary.total}`);
    lines.push(` ${colors.green}Passed:${colors.reset} ${summary.passed}`);
    lines.push(` ${colors.red}Failed:${colors.reset} ${summary.failed}`);
    lines.push(` ${colors.yellow}Avg Score:${colors.reset} ${(summary.avgScore * 100).toFixed(1)}%`);
    lines.push(` ${colors.dim}Duration:${colors.reset} ${formatDuration(summary.totalDurationMs)}`);
    lines.push("");
    lines.push(`${colors.bright}Test Results:${colors.reset}`);
    lines.push(` ${colors.dim}Status | ID | Score | Duration${colors.reset}`);
    lines.push(` ${colors.dim}${"-".repeat(55)}${colors.reset}`);
    // Collect failures while printing rows so details can follow the table.
    const failedResults = [];
    for (const result of results.results) {
      const color = result.passed ? colors.green : colors.red;
      lines.push(`${color}${formatResultForConsole(result, 80)}${colors.reset}`);
      if (!result.passed) {
        failedResults.push(result);
      }
    }
    lines.push("");
    if (failedResults.length > 0) {
      lines.push(`${colors.bright}Failed Test Details:${colors.reset}`);
      lines.push(`${colors.dim}${"=".repeat(60)}${colors.reset}`);
      for (const result of failedResults) {
        lines.push("");
        lines.push(`${colors.red}${colors.bright}Test: ${result.testId}${colors.reset}`);
        lines.push(`${colors.dim}Input:${colors.reset} ${result.input}`);
        lines.push(`${colors.yellow}Expected:${colors.reset} ${result.expectedOutput}`);
        lines.push(`${colors.cyan}Actual:${colors.reset} ${result.actualOutput}`);
        if (result.judgeReasoning) {
          lines.push(`${colors.dim}Reasoning: ${result.judgeReasoning}${colors.reset}`);
        }
        lines.push(`${colors.dim}${"-".repeat(40)}${colors.reset}`);
      }
      lines.push("");
    }
    if (summary.failed > 0) {
      lines.push(`${colors.yellow}Some tests failed. Review the results above.${colors.reset}`);
    } else {
      lines.push(`${colors.green}All tests passed!${colors.reset}`);
    }
    lines.push("");
    return lines.join("\n");
  }
  /**
   * Format results as JSON.
   *
   * @param results - Evaluation results
   * @returns JSON string
   */
  formatJson(results) {
    return JSON.stringify(results, null, 2);
  }
  /**
   * Format results as HTML.
   *
   * @param results - Evaluation results
   * @returns HTML string
   */
  formatHtml(results) {
    const { summary } = results;
    const testRows = results.results.map((result) => `
<tr class="${result.passed ? "passed" : "failed"}">
<td class="status">${result.passed ? "PASS" : "FAIL"}</td>
<td class="id">${escapeHtml(result.testId)}</td>
<td class="score">${(result.judgeScore * 100).toFixed(0)}%</td>
<td class="duration">${formatDuration(result.durationMs)}</td>
<td class="reasoning">${escapeHtml(truncate(result.judgeReasoning, 100))}</td>
</tr>
`).join("\n");
    return `<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Eval Results: ${escapeHtml(results.suite.name)}</title>
<style>
* { box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
line-height: 1.6;
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
background: #f5f5f5;
}
.container {
background: white;
border-radius: 8px;
padding: 2rem;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
h1 { margin-top: 0; color: #333; }
h2 { color: #555; border-bottom: 2px solid #eee; padding-bottom: 0.5rem; }
.meta {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin-bottom: 2rem;
}
.meta-item {
background: #f8f9fa;
padding: 1rem;
border-radius: 4px;
}
.meta-label { font-weight: 600; color: #666; font-size: 0.875rem; }
.meta-value { color: #333; }
.summary {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 1rem;
margin: 2rem 0;
}
.summary-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
padding: 1.5rem;
border-radius: 8px;
text-align: center;
}
.summary-card.passed { background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%); }
.summary-card.failed { background: linear-gradient(135deg, #eb3349 0%, #f45c43 100%); }
.summary-card.warning { background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); }
.summary-value { font-size: 2rem; font-weight: bold; }
.summary-label { font-size: 0.875rem; opacity: 0.9; }
table {
width: 100%;
border-collapse: collapse;
margin-top: 1rem;
}
th, td {
padding: 0.75rem;
text-align: left;
border-bottom: 1px solid #eee;
}
th {
font-weight: 600;
color: #555;
background: #f8f9fa;
}
tr.passed .status { color: #11998e; font-weight: 600; }
tr.failed .status { color: #eb3349; font-weight: 600; }
.footer {
margin-top: 2rem;
padding-top: 1rem;
border-top: 1px solid #eee;
text-align: center;
color: #666;
}
</style>
</head>
<body>
<div class="container">
<h1>Evaluation Results</h1>

<div class="meta">
<div class="meta-item">
<div class="meta-label">Suite</div>
<div class="meta-value">${escapeHtml(results.suite.name)}</div>
</div>
<div class="meta-item">
<div class="meta-label">Agent</div>
<div class="meta-value">${escapeHtml(results.agent)}</div>
</div>
<div class="meta-item">
<div class="meta-label">Timestamp</div>
<div class="meta-value">${new Date(results.timestamp).toLocaleString()}</div>
</div>
</div>

<h2>Summary</h2>
<div class="summary">
<div class="summary-card">
<div class="summary-value">${summary.total}</div>
<div class="summary-label">Total Tests</div>
</div>
<div class="summary-card ${summary.passed === summary.total ? "passed" : summary.passed === 0 ? "failed" : "warning"}">
<div class="summary-value">${summary.passed}</div>
<div class="summary-label">Passed</div>
</div>
<div class="summary-card ${summary.failed === 0 ? "passed" : "failed"}">
<div class="summary-value">${summary.failed}</div>
<div class="summary-label">Failed</div>
</div>
<div class="summary-card">
<div class="summary-value">${(summary.avgScore * 100).toFixed(1)}%</div>
<div class="summary-label">Avg Score</div>
</div>
<div class="summary-card">
<div class="summary-value">${formatDuration(summary.totalDurationMs)}</div>
<div class="summary-label">Duration</div>
</div>
</div>

<h2>Test Results</h2>
<table>
<thead>
<tr>
<th>Status</th>
<th>ID</th>
<th>Score</th>
<th>Duration</th>
<th>Reasoning</th>
</tr>
</thead>
<tbody>
${testRows}
</tbody>
</table>

<div class="footer">
<p>Generated by genesys-eval</p>
</div>
</div>
</body>
</html>`;
  }
  /**
   * Report evaluation results.
   *
   * @param results - Evaluation results
   * @returns The formatted output string
   */
  report(results) {
    switch (this._config.format) {
      case "json": {
        return this.formatJson(results);
      }
      case "html": {
        return this.formatHtml(results);
      }
      case "console":
      default: {
        return this.formatConsole(results);
      }
    }
  }
  /**
   * Report results and optionally write to file.
   *
   * @param results - Evaluation results
   */
  async reportAndSave(results) {
    const output = this.report(results);
    if (this._config.outputPath) {
      await writeFile(this._config.outputPath, output, "utf-8");
    }
    // Console output always prints; file formats print a pointer instead.
    if (this._config.format === "console" || !this._config.outputPath) {
      console.log(output);
    } else {
      console.log(`Results written to: ${this._config.outputPath}`);
    }
  }
  /**
   * Get the reporter configuration.
   */
  get config() {
    return this._config;
  }
};
748
// Escape the five HTML-significant characters for safe interpolation into
// markup. Ampersand is replaced first so entities are not double-escaped.
function escapeHtml(text) {
  return text
    .replaceAll("&", "&amp;")
    .replaceAll("<", "&lt;")
    .replaceAll(">", "&gt;")
    .replaceAll('"', "&quot;")
    .replaceAll("'", "&#039;");
}
751
+
752
+ // src/cli-runner.ts
753
+ import { spawn } from "child_process";
754
// Error raised when spawning or running the agent CLI fails.
// Carries the attempted command, the process exit code (-1 when the process
// never produced one, e.g. spawn failure or timeout), and captured stderr.
var CLIError = class extends Error {
  constructor(message, command, exitCode, stderr) {
    super(message);
    this.name = "CLIError";
    this.command = command;
    this.exitCode = exitCode;
    this.stderr = stderr;
  }
};
763
// Split an agent command string into [executable, args] on whitespace.
// NOTE(review): quoted arguments are not honored — `a "b c"` splits into
// three tokens; callers compensate by spawning with shell for compound
// commands.
function parseAgentCommand(agentCommand) {
  const trimmed = agentCommand.trim();
  if (!trimmed.includes(" ")) {
    return [trimmed, []];
  }
  const [executable, ...extraArgs] = trimmed.split(/\s+/);
  return [executable, extraArgs];
}
773
// Run the agent CLI once: spawn it, feed `prompt` on stdin, collect stdout/
// stderr, and resolve with { output, exitCode, stderr, durationMs }.
// Rejects with CLIError on spawn failure or timeout. Note that a non-zero
// exit code does NOT reject — the close handler resolves regardless; callers
// must inspect exitCode themselves.
async function runAgent(agent, prompt, options) {
  const startTime = Date.now();
  return new Promise((resolve3, reject) => {
    let stdout = "";
    let stderr = "";
    const [cmd, cmdArgs] = parseAgentCommand(agent);
    const isCompoundCommand = cmdArgs.length > 0;
    // Shell is used for multi-token commands and always on Windows
    // (resolves .cmd/.bat shims that spawn cannot execute directly).
    const useShell = isCompoundCommand || process.platform === "win32";
    // "-p" is appended to the user-supplied command — presumably the agent
    // CLI's non-interactive prompt/print flag; confirm against the agent's
    // CLI documentation.
    const spawnArgs = [...cmdArgs, "-p"];
    const child = spawn(cmd, spawnArgs, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"],
      env: { ...process.env },
      shell: useShell
    });
    // The prompt is delivered on stdin; closing stdin signals end-of-input.
    if (child.stdin) {
      child.stdin.write(prompt, "utf-8");
      child.stdin.end();
    }
    child.stdout?.on("data", (data) => {
      stdout += data.toString();
    });
    child.stderr?.on("data", (data) => {
      stderr += data.toString();
    });
    // Hard timeout: kill the child and reject. options.timeout is in
    // milliseconds here (the error message interpolates it as ms).
    const timeout = setTimeout(() => {
      child.kill("SIGTERM");
      reject(new CLIError(
        `Command timed out after ${options.timeout}ms`,
        `${agent} -p`,
        -1,
        stderr
      ));
    }, options.timeout);
    child.on("error", (error) => {
      clearTimeout(timeout);
      reject(new CLIError(
        `Failed to spawn ${agent}: ${error.message}. Make sure the command is installed and in PATH.`,
        `${agent} -p`,
        -1,
        stderr
      ));
    });
    // "close" may fire after a timeout rejection; the extra resolve3 call is
    // a no-op because a promise settles only once.
    child.on("close", (code) => {
      clearTimeout(timeout);
      const durationMs = Date.now() - startTime;
      resolve3({
        output: stdout.trim(),
        // code is null when the child was killed by a signal; map to 0 —
        // NOTE(review): that makes a signal-killed run look successful.
        exitCode: code ?? 0,
        stderr: stderr.trim(),
        durationMs
      });
    });
  });
}
828
+
829
+ // src/test-loader.ts
830
+ import { readFile } from "fs/promises";
831
+ import { resolve } from "path";
832
+ import YAML from "yaml";
833
+ import { z as z2 } from "zod";
834
// Zod schema for one test case entry in the YAML file: required non-empty
// id/input/expectedOutput, optional per-test context.
var testCaseSchema = z2.object({
  id: z2.string().min(1, "Test case ID is required"),
  input: z2.string().min(1, "Test case input is required"),
  context: z2.string().optional(),
  expectedOutput: z2.string().min(1, "Test case expectedOutput is required")
});
// Zod schema for the whole suite: non-empty name, optional description and
// suite-wide context, and at least one test case.
var testSuiteSchema = z2.object({
  name: z2.string().min(1, "Test suite name is required"),
  description: z2.string().optional(),
  context: z2.string().optional(),
  tests: z2.array(testCaseSchema).min(1, "At least one test case is required")
});
846
// Raised when a test file parses but fails schema validation (or contains
// duplicate test IDs). `path` is the absolute file path; `issues` carries
// the raw validator issue list for programmatic inspection.
var TestValidationError = class extends Error {
  constructor(message, path, issues) {
    super(message);
    this.name = "TestValidationError";
    this.path = path;
    this.issues = issues;
  }
};
854
// Raised when a test file cannot be read or its YAML cannot be parsed.
// `path` is the absolute file path; `cause` preserves the underlying error.
var TestLoadError = class extends Error {
  constructor(message, path, cause) {
    super(message);
    this.name = "TestLoadError";
    this.path = path;
    this.cause = cause;
  }
};
862
/**
 * Load and validate a YAML test suite from disk.
 *
 * @param filePath - Path to the YAML file (relative paths resolve from cwd)
 * @param cwd - Base directory for resolution (defaults to process.cwd())
 * @returns {{ suite, path }} validated suite plus the absolute file path
 * @throws {TestLoadError} when the file cannot be read or parsed as YAML
 * @throws {TestValidationError} on schema failure or duplicate test IDs
 */
async function loadTestSuite(filePath, cwd = process.cwd()) {
  const absolutePath = resolve(cwd, filePath);
  let rawContent;
  try {
    rawContent = await readFile(absolutePath, "utf-8");
  } catch (error) {
    throw new TestLoadError(
      `Failed to read test file: ${absolutePath}`,
      absolutePath,
      error
    );
  }
  let parsedYaml;
  try {
    parsedYaml = YAML.parse(rawContent);
  } catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    throw new TestLoadError(
      `Failed to parse YAML: ${detail}`,
      absolutePath,
      error
    );
  }
  const validation = testSuiteSchema.safeParse(parsedYaml);
  if (!validation.success) {
    throw new TestValidationError(
      `Test suite validation failed: ${validation.error.message}`,
      absolutePath,
      validation.error.issues
    );
  }
  const suite = validation.data;
  // Reject suites where the same test ID appears more than once.
  const seen = new Set();
  const duplicateIds = new Set();
  for (const { id } of suite.tests) {
    if (seen.has(id)) {
      duplicateIds.add(id);
    }
    seen.add(id);
  }
  if (duplicateIds.size > 0) {
    throw new TestValidationError(
      `Duplicate test IDs found: ${[...duplicateIds].join(", ")}`,
      absolutePath,
      []
    );
  }
  return {
    suite,
    path: absolutePath
  };
}
907
+
908
+ // src/runner.ts
909
+ async function runTest(test, suite, agent, timeout, judge, progress, index, total) {
910
+ progress?.onTestStart(test.id, index, total);
911
+ const startTime = Date.now();
912
+ const parts = [];
913
+ if (suite.context) {
914
+ parts.push("Context:", suite.context, "");
915
+ }
916
+ if (test.context) {
917
+ parts.push("Specific Context:", test.context, "");
918
+ }
919
+ parts.push("Task:", test.input);
920
+ const prompt = parts.join("\n");
921
+ try {
922
+ const response = await runAgent(
923
+ agent,
924
+ prompt,
925
+ { cwd: process.cwd(), timeout }
926
+ );
927
+ const { score, reasoning, passed } = await judge(test, response.output);
928
+ const result = {
929
+ testId: test.id,
930
+ input: test.input,
931
+ expectedOutput: test.expectedOutput,
932
+ actualOutput: response.output,
933
+ judgeScore: score,
934
+ judgeReasoning: reasoning,
935
+ durationMs: response.durationMs,
936
+ passed
937
+ };
938
+ progress?.onTestComplete(result, index, total);
939
+ return result;
940
+ } catch (error) {
941
+ const durationMs = Date.now() - startTime;
942
+ const errorMessage = error instanceof Error ? error.message : String(error);
943
+ progress?.onTestError(test.id, errorMessage, index, total);
944
+ return {
945
+ testId: test.id,
946
+ input: test.input,
947
+ expectedOutput: test.expectedOutput,
948
+ actualOutput: "",
949
+ judgeScore: 0,
950
+ judgeReasoning: `Error: ${errorMessage}`,
951
+ durationMs,
952
+ passed: false,
953
+ error: errorMessage
954
+ };
955
+ }
956
+ }
957
async function runInParallel(items, concurrency, runner) {
  // Sequential fast path: no worker machinery needed.
  if (concurrency <= 1) {
    const sequential = [];
    let position = 0;
    for (const item of items) {
      sequential.push(await runner(item, position));
      position += 1;
    }
    return sequential;
  }
  // Shared cursor: each worker repeatedly claims the next unprocessed index.
  // The check-and-increment is synchronous, so no two workers claim the same slot.
  const results = new Array(items.length);
  let cursor = 0;
  const worker = async () => {
    while (cursor < items.length) {
      const claimed = cursor++;
      results[claimed] = await runner(items[claimed], claimed);
    }
  };
  const poolSize = Math.min(concurrency, items.length);
  await Promise.all(Array.from({ length: poolSize }, () => worker()));
  return results;
}
977
var TestRunner = class {
  // Runner configuration: { testsPath, agent, cwd, timeout, parallel, ... }.
  _config;
  /**
   * @param config - Runner configuration (tests path, agent, timeout, parallelism).
   */
  constructor(config) {
    this._config = config;
  }
  /**
   * Run the evaluation.
   *
   * @param judge - Function to evaluate agent outputs
   * @param progress - Optional progress callback
   * @returns The evaluation results
   */
  async run(judge, progress) {
    const { suite } = await loadTestSuite(this._config.testsPath, this._config.cwd);
    // Execute tests with the configured degree of parallelism.
    const results = await runInParallel(
      suite.tests,
      this._config.parallel,
      async (test, index) => {
        return runTest(
          test,
          suite,
          this._config.agent,
          this._config.timeout,
          judge,
          progress,
          index,
          suite.tests.length
        );
      }
    );
    const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
    const passed = results.filter((r) => r.passed).length;
    const failed = results.length - passed;
    // Guard the average against an empty suite: dividing by zero would yield NaN
    // and corrupt the summary; report 0 instead.
    const avgScore = results.length > 0 ? results.reduce((sum, r) => sum + r.judgeScore, 0) / results.length : 0;
    const evalResults = {
      suite,
      agent: this._config.agent,
      timestamp: (/* @__PURE__ */ new Date()).toISOString(),
      results,
      summary: {
        total: results.length,
        passed,
        failed,
        avgScore,
        totalDurationMs
      }
    };
    return evalResults;
  }
  /**
   * Get the runner configuration.
   */
  get config() {
    return this._config;
  }
};
1033
/**
 * Convenience wrapper: build a TestRunner for `config` and run it.
 */
async function runEvaluation(config, judge, progress) {
  return new TestRunner(config).run(judge, progress);
}
1037
+
1038
+ // src/utils/package.ts
1039
+ import { readFileSync } from "fs";
1040
+ import { dirname, join } from "path";
1041
+ import { fileURLToPath as fileURLToPath2 } from "url";
1042
/**
 * Read this package's package.json (two levels above the compiled file,
 * i.e. dist/<bundle> -> package root). Falls back to static metadata
 * when the manifest cannot be read or parsed.
 */
function getPackageJson() {
  const selfPath = fileURLToPath2(import.meta.url);
  const packagePath = join(dirname(selfPath), "..", "..", "package.json");
  try {
    return JSON.parse(readFileSync(packagePath, "utf-8"));
  } catch {
    // Best-effort fallback (e.g. unusual install layout).
    return {
      version: "1.0.0",
      name: "@gnsx/genesys.agent.eval"
    };
  }
}
1056
+
1057
+ // src/cli.ts
1058
// ANSI terminal escape sequences used for colored CLI output below.
var colors2 = {
  reset: "\x1B[0m",   // clear all styling
  bright: "\x1B[1m",  // bold/bright
  dim: "\x1B[2m",
  green: "\x1B[32m",  // used for PASS status
  red: "\x1B[31m",    // used for FAIL status
  yellow: "\x1B[33m",
  blue: "\x1B[34m",
  cyan: "\x1B[36m"    // used for setting labels
};
1068
/**
 * Build the progress callbacks the runner invokes while tests execute.
 *
 * @param verbose - When true, also log a line as each test starts.
 * @returns Object with onTestStart / onTestComplete / onTestError handlers.
 */
function createProgressCallback(verbose) {
  return {
    onTestStart(testId, index, total) {
      if (verbose) {
        console.log(`[${index + 1}/${total}] Running: ${testId}`);
      }
    },
    onTestComplete(result, index, total) {
      // Removed dead code: a percentage string was computed here from
      // result.judgeScore but never printed.
      const status = result.passed ? "PASS" : "FAIL";
      const statusColor = result.passed ? colors2.green : colors2.red;
      console.log(`${statusColor}${status}${colors2.reset}`);
    },
    onTestError(testId, error, index, total) {
      console.error(`[${index + 1}/${total}] ERROR - ${testId}: ${error}`);
    }
  };
}
1086
/**
 * CLI entry point: parse arguments, run the evaluation, report results,
 * and exit 0 only when every test passed.
 */
async function main(argv) {
  const packageInfo = getPackageJson();
  const cliArgs = parseArgs(argv, packageInfo.version, packageInfo.name);
  const workingDir = resolve2(cliArgs.cwd);
  // Assemble the runner configuration from the parsed CLI arguments.
  const config = {
    testsPath: cliArgs.tests,
    agent: cliArgs.agent,
    cwd: workingDir,
    timeout: cliArgs.timeout * 1e3,
    // Convert to milliseconds
    outputPath: cliArgs.output,
    format: cliArgs.format,
    parallel: cliArgs.parallel,
    judge: {
      provider: cliArgs.judgeProvider,
      model: cliArgs.judgeModel
    }
  };
  // Echo the effective settings before running.
  console.log(`${colors2.cyan}Agent:${colors2.reset} ${colors2.bright}${cliArgs.agent}${colors2.reset}`);
  console.log(`${colors2.cyan}Judge:${colors2.reset} ${cliArgs.judgeType}`);
  console.log(`${colors2.cyan}Working directory:${colors2.reset} ${workingDir}`);
  console.log(`${colors2.cyan}Test file:${colors2.reset} ${cliArgs.tests}`);
  console.log(`${colors2.cyan}Timeout:${colors2.reset} ${cliArgs.timeout}s per test`);
  console.log(`${colors2.cyan}Parallelism:${colors2.reset} ${cliArgs.parallel}`);
  console.log("");
  // Select the judge implementation: embedding similarity or LLM-based.
  let judgeEvaluator;
  if (cliArgs.judgeType === "embedding") {
    judgeEvaluator = new EmbeddingJudge({ passThreshold: 0.6 }).createEvaluator();
  } else {
    judgeEvaluator = new Judge({
      provider: cliArgs.judgeProvider,
      model: cliArgs.judgeModel,
      passThreshold: 0.7
    }).createEvaluator();
  }
  const progress = createProgressCallback(cliArgs.parallel > 1 || cliArgs.format === "console");
  try {
    const results = await runEvaluation(config, judgeEvaluator, progress);
    const reporter = new Reporter({
      format: cliArgs.format,
      outputPath: cliArgs.output
    });
    await reporter.reportAndSave(results);
    // Non-zero exit when any test failed, so CI can gate on the result.
    process.exit(results.summary.failed > 0 ? 1 : 0);
  } catch (error) {
    console.error(`Evaluation failed: ${error instanceof Error ? error.message : String(error)}`);
    if (error instanceof Error) {
      if (error.message.includes("ENOENT")) {
        console.error(`Make sure the ${cliArgs.agent} CLI is installed and in your PATH.`);
      }
      if (error.message.includes("ANTHROPIC_API_KEY") || error.message.includes("OPENAI_API_KEY")) {
        console.error("Set the appropriate API key environment variable for the LLM judge.");
      }
    }
    process.exit(1);
  }
}
1145
// Entry point: run the CLI and surface any failure main itself did not handle.
main(process.argv.slice(2)).catch((error) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exit(1);
});
1149
+ //# sourceMappingURL=cli.js.map