@gnsx/genesys.agent.eval 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/README.md +9 -0
  2. package/dist/src/adapters/anthropic-adapter.d.ts +24 -0
  3. package/dist/src/adapters/anthropic-adapter.d.ts.map +1 -0
  4. package/dist/src/adapters/anthropic-adapter.js +80 -0
  5. package/dist/src/adapters/anthropic-adapter.js.map +1 -0
  6. package/dist/src/adapters/gemini-adapter.d.ts +23 -0
  7. package/dist/src/adapters/gemini-adapter.d.ts.map +1 -0
  8. package/dist/src/adapters/gemini-adapter.js +79 -0
  9. package/dist/src/adapters/gemini-adapter.js.map +1 -0
  10. package/dist/src/adapters/ollama-adapter.d.ts +28 -0
  11. package/dist/src/adapters/ollama-adapter.d.ts.map +1 -0
  12. package/dist/src/adapters/ollama-adapter.js +54 -0
  13. package/dist/src/adapters/ollama-adapter.js.map +1 -0
  14. package/dist/src/adapters/openai-adapter.d.ts +24 -0
  15. package/dist/src/adapters/openai-adapter.d.ts.map +1 -0
  16. package/dist/src/adapters/openai-adapter.js +80 -0
  17. package/dist/src/adapters/openai-adapter.js.map +1 -0
  18. package/dist/src/adapters/pi-adapter.d.ts +27 -0
  19. package/dist/src/adapters/pi-adapter.d.ts.map +1 -0
  20. package/dist/src/adapters/pi-adapter.js +136 -0
  21. package/dist/src/adapters/pi-adapter.js.map +1 -0
  22. package/dist/src/agent-adapter.d.ts +130 -0
  23. package/dist/src/agent-adapter.d.ts.map +1 -0
  24. package/dist/src/agent-adapter.js +134 -0
  25. package/dist/src/agent-adapter.js.map +1 -0
  26. package/dist/src/args.d.ts +22 -0
  27. package/dist/src/args.d.ts.map +1 -0
  28. package/dist/src/args.js +224 -0
  29. package/dist/src/args.js.map +1 -0
  30. package/dist/src/cli-runner.d.ts +39 -0
  31. package/dist/src/cli-runner.d.ts.map +1 -0
  32. package/dist/src/cli-runner.js +105 -0
  33. package/dist/src/cli-runner.js.map +1 -0
  34. package/dist/src/embedding-judge.d.ts +93 -0
  35. package/dist/src/embedding-judge.d.ts.map +1 -0
  36. package/dist/src/embedding-judge.js +160 -0
  37. package/dist/src/embedding-judge.js.map +1 -0
  38. package/dist/src/index.d.ts +15 -0
  39. package/dist/src/index.d.ts.map +1 -0
  40. package/dist/src/index.js +20 -0
  41. package/dist/src/index.js.map +1 -0
  42. package/dist/src/judge.d.ts +95 -0
  43. package/dist/src/judge.d.ts.map +1 -0
  44. package/dist/src/judge.js +189 -0
  45. package/dist/src/judge.js.map +1 -0
  46. package/dist/src/launcher.d.ts +9 -0
  47. package/dist/src/launcher.d.ts.map +1 -0
  48. package/dist/src/launcher.js +129 -0
  49. package/dist/src/launcher.js.map +1 -0
  50. package/dist/src/reporter.d.ts +86 -0
  51. package/dist/src/reporter.d.ts.map +1 -0
  52. package/dist/src/reporter.js +384 -0
  53. package/dist/src/reporter.js.map +1 -0
  54. package/dist/src/runner.d.ts +75 -0
  55. package/dist/src/runner.d.ts.map +1 -0
  56. package/dist/src/runner.js +165 -0
  57. package/dist/src/runner.js.map +1 -0
  58. package/dist/src/test-loader.d.ts +66 -0
  59. package/dist/src/test-loader.d.ts.map +1 -0
  60. package/dist/src/test-loader.js +140 -0
  61. package/dist/src/test-loader.js.map +1 -0
  62. package/dist/src/types.d.ts +161 -0
  63. package/dist/src/types.d.ts.map +1 -0
  64. package/dist/src/types.js +7 -0
  65. package/dist/src/types.js.map +1 -0
  66. package/dist/src/utils/package.d.ts +16 -0
  67. package/dist/src/utils/package.d.ts.map +1 -0
  68. package/dist/src/utils/package.js +30 -0
  69. package/dist/src/utils/package.js.map +1 -0
  70. package/dist/tsconfig.tsbuildinfo +1 -0
  71. package/examples/basic-tests.yaml +22 -0
  72. package/package.json +41 -0
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Test runner orchestrates loading, execution, and judging of test cases.
3
+ *
4
+ * @module runner
5
+ */
6
+ import type { EvalResults, RunnerConfig, TestCase, TestResult } from './types.js';
/**
 * Observer notified as each test case moves through the runner.
 *
 * Every hook receives the test's zero-based position and the suite size so a
 * reporter can render "n of total" style progress.
 */
export interface ProgressCallback {
    /**
     * Invoked before a test case begins executing.
     *
     * @param testId - Identifier of the test about to run
     * @param index - Zero-based position of the test within the suite
     * @param total - Number of tests in the suite
     */
    onTestStart(testId: string, index: number, total: number): void;
    /**
     * Invoked after the agent has responded and the judge has scored the output.
     *
     * @param result - The finished test result
     * @param index - Zero-based position of the test within the suite
     * @param total - Number of tests in the suite
     */
    onTestComplete(result: TestResult, index: number, total: number): void;
    /**
     * Invoked when executing a test throws (for example the agent run or the
     * judge fails); the runner then records the test as failed.
     *
     * @param testId - Identifier of the test that errored
     * @param error - Message describing what went wrong
     * @param index - Zero-based position of the test within the suite
     * @param total - Number of tests in the suite
     */
    onTestError(testId: string, error: string, index: number, total: number): void;
}
/**
 * Drives a complete evaluation for a single runner configuration: loads the
 * suite, executes every test, and aggregates the outcome.
 */
export declare class TestRunner {
    private _config;
    constructor(config: RunnerConfig);
    /**
     * Execute the evaluation.
     *
     * @param judge - Scores one agent output against its test case
     * @param progress - Optional observer notified as tests start and finish
     * @returns Per-test results together with an aggregate summary
     */
    run(judge: (test: TestCase, actualOutput: string) => Promise<{
        score: number;
        reasoning: string;
        passed: boolean;
    }>, progress?: ProgressCallback): Promise<EvalResults>;
    /**
     * The configuration this runner was constructed with.
     */
    get config(): RunnerConfig;
}
/**
 * One-shot helper that builds a TestRunner for `config` and runs it.
 *
 * @param config - Runner configuration
 * @param judge - Scores one agent output against its test case
 * @param progress - Optional observer notified as tests start and finish
 * @returns The evaluation results produced by the runner
 */
export declare function runEvaluation(config: RunnerConfig, judge: (test: TestCase, actualOutput: string) => Promise<{
    score: number;
    reasoning: string;
    passed: boolean;
}>, progress?: ProgressCallback): Promise<EvalResults>;
75
+ //# sourceMappingURL=runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,OAAO,KAAK,EACV,WAAW,EACX,YAAY,EACZ,QAAQ,EACR,UAAU,EAEX,MAAM,YAAY,CAAC;AAEpB;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;OAMG;IACH,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAEhE;;;;;;OAMG;IACH,cAAc,CAAC,MAAM,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAEvE;;;;;;;OAOG;IACH,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CAChF;AAmID;;GAEG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,OAAO,CAAe;gBAElB,MAAM,EAAE,YAAY;IAIhC;;;;;;OAMG;IACG,GAAG,CACP,KAAK,EAAE,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,KAAK,OAAO,CAAC;QACvD,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC,EACF,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,WAAW,CAAC;IA6CvB;;OAEG;IACH,IAAI,MAAM,IAAI,YAAY,CAEzB;CACF;AAED;;;;;;;;;GASG;AACH,wBAAsB,aAAa,CACjC,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,KAAK,OAAO,CAAC;IACvD,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;CACjB,CAAC,EACF,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,WAAW,CAAC,CAGtB"}
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Test runner orchestrates loading, execution, and judging of test cases.
3
+ *
4
+ * @module runner
5
+ */
6
+ import { runAgent } from './cli-runner.js';
7
+ import { loadTestSuite } from './test-loader.js';
/**
 * Execute one test case: prompt the agent, have the judge score the output,
 * and report progress along the way.
 *
 * The prompt is the newline-joined sequence of an optional suite-level
 * "Context:" section, an optional test-level "Specific Context:" section and
 * the "Task:" section holding the test input. The agent is started with the
 * current process working directory.
 *
 * Anything thrown inside the guarded section is converted into a failed
 * result (score 0, empty output) rather than propagated.
 * NOTE(review): the `onTestComplete` call sits inside that guarded section,
 * so an exception thrown by the callback is also reported as a test error.
 *
 * @param test - The test case to run
 * @param suite - The suite the test belongs to (supplies shared context)
 * @param agent - The agent to invoke
 * @param timeout - Timeout in milliseconds handed to the agent runner
 * @param judge - Function that scores the agent output
 * @param progress - Optional progress callback
 * @param index - Zero-based index of this test
 * @param total - Total number of tests
 * @returns The test result (successful or failed)
 */
async function runTest(test, suite, agent, timeout, judge, progress, index, total) {
    progress?.onTestStart(test.id, index, total);
    const startedAt = Date.now();
    // Optional context sections come first; each ends with an empty entry so a
    // blank line separates it from whatever follows.
    const prompt = [
        ...(suite.context ? ['Context:', suite.context, ''] : []),
        ...(test.context ? ['Specific Context:', test.context, ''] : []),
        'Task:',
        test.input,
    ].join('\n');
    try {
        const agentResponse = await runAgent(agent, prompt, { cwd: process.cwd(), timeout });
        const { score: judgeScore, reasoning: judgeReasoning, passed } = await judge(test, agentResponse.output);
        const outcome = {
            testId: test.id,
            input: test.input,
            expectedOutput: test.expectedOutput,
            actualOutput: agentResponse.output,
            judgeScore,
            judgeReasoning,
            // On success the duration is the one reported by the agent run.
            durationMs: agentResponse.durationMs,
            passed,
        };
        progress?.onTestComplete(outcome, index, total);
        return outcome;
    }
    catch (failure) {
        // On failure fall back to wall-clock time since the test started.
        const durationMs = Date.now() - startedAt;
        let errorMessage;
        if (failure instanceof Error) {
            errorMessage = failure.message;
        }
        else {
            errorMessage = String(failure);
        }
        progress?.onTestError(test.id, errorMessage, index, total);
        return {
            testId: test.id,
            input: test.input,
            expectedOutput: test.expectedOutput,
            actualOutput: '',
            judgeScore: 0,
            judgeReasoning: `Error: ${errorMessage}`,
            durationMs,
            passed: false,
            error: errorMessage,
        };
    }
}
/**
 * Run `runner` over every item with at most `concurrency` calls in flight.
 *
 * The concurrency limit is normalised before use so that configuration
 * mistakes degrade gracefully instead of throwing:
 * - fractional values are rounded down (2.5 behaves like 2),
 * - values below 1 behave like 1 (sequential execution),
 * - non-numeric values (`undefined`, `NaN`) behave like 1,
 * - values larger than the number of items (including `Infinity`) are capped
 *   at the number of items.
 *
 * @param items - Items to process
 * @param concurrency - Maximum number of concurrent executions
 * @param runner - Async function invoked as `runner(item, index)`
 * @returns Array of results in the same order as `items`
 */
async function runInParallel(items, concurrency, runner) {
    const requested = Number(concurrency);
    const workerCount = Number.isNaN(requested)
        ? 1
        : Math.max(1, Math.floor(Math.min(requested, items.length)));
    // Pre-sized so each worker can store its result at the item's own index,
    // which keeps the output order independent of completion order.
    const results = new Array(items.length);
    let nextIndex = 0;
    // Each worker repeatedly claims the next unprocessed index. Claiming is a
    // synchronous read-and-increment, so no two workers can pick the same item.
    const worker = async () => {
        while (nextIndex < items.length) {
            const currentIndex = nextIndex++;
            results[currentIndex] = await runner(items[currentIndex], currentIndex);
        }
    };
    await Promise.all(Array.from({ length: workerCount }, () => worker()));
    return results;
}
/**
 * Coordinates a full evaluation: loads the suite named by the configuration,
 * runs every test through the agent and judge, and summarises the outcome.
 */
export class TestRunner {
    _config;
    constructor(config) {
        this._config = config;
    }
    /**
     * Execute the evaluation.
     *
     * The summary's `totalDurationMs` is the sum of the individual test
     * durations and `avgScore` is the arithmetic mean of the judge scores.
     *
     * @param judge - Function that scores an agent output for a test case
     * @param progress - Optional progress callback
     * @returns The evaluation results
     */
    async run(judge, progress) {
        const loaded = await loadTestSuite(this._config.testsPath, this._config.cwd);
        const suite = loaded.suite;
        const executeOne = (test, index) => runTest(test, suite, this._config.agent, this._config.timeout, judge, progress, index, suite.tests.length);
        const results = await runInParallel(suite.tests, this._config.parallel, executeOne);
        // Accumulate all summary figures in one pass over the results.
        let totalDurationMs = 0;
        let scoreSum = 0;
        let passed = 0;
        for (const outcome of results) {
            totalDurationMs += outcome.durationMs;
            scoreSum += outcome.judgeScore;
            if (outcome.passed) {
                passed += 1;
            }
        }
        return {
            suite,
            agent: this._config.agent,
            timestamp: new Date().toISOString(),
            results,
            summary: {
                total: results.length,
                passed,
                failed: results.length - passed,
                avgScore: scoreSum / results.length,
                totalDurationMs,
            },
        };
    }
    /**
     * The configuration this runner was constructed with.
     */
    get config() {
        return this._config;
    }
}
/**
 * One-shot helper: construct a TestRunner for `config` and run it.
 *
 * @param config - Runner configuration
 * @param judge - Judge function for evaluating responses
 * @param progress - Optional progress callback
 * @returns Evaluation results
 */
export async function runEvaluation(config, judge, progress) {
    return new TestRunner(config).run(judge, progress);
}
165
+ //# sourceMappingURL=runner.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AA2CjD;;;;;;;;;;;;GAYG;AACH,KAAK,UAAU,OAAO,CACpB,IAAc,EACd,KAAgB,EAChB,KAAa,EACb,OAAe,EACf,KAIE,EACF,QAAsC,EACtC,KAAa,EACb,KAAa;IAEb,QAAQ,EAAE,WAAW,CAAC,IAAI,CAAC,EAAE,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,qCAAqC;IACrC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QAClB,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAC5C,CAAC;IACD,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QACjB,KAAK,CAAC,IAAI,CAAC,mBAAmB,EAAE,IAAI,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACpD,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IAChC,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEhC,IAAI,CAAC;QACH,gBAAgB;QAChB,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAC7B,KAAyB,EACzB,MAAM,EACN,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAChC,CAAC;QAEF,qBAAqB;QACrB,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,GAAG,MAAM,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;QAExE,MAAM,MAAM,GAAe;YACzB,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,UAAU,EAAE,KAAK;YACjB,cAAc,EAAE,SAAS;YACzB,UAAU,EAAE,QAAQ,CAAC,UAAU;YAC/B,MAAM;SACP,CAAC;QAEF,QAAQ,EAAE,cAAc,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QAC/C,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;QAC1C,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAE5E,QAAQ,EAAE,WAAW,CAAC,IAAI,CAAC,EAAE,EAAE,YAAY,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QAE3D,OAAO;YACL,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,YAAY,EAAE,EAAE;YAChB,UAAU,EAAE,CAAC;YACb,cAAc,EAAE,UAAU,YAAY,EAAE;YACxC,UAAU;YACV,MAAM,EAAE,KAAK;YACb,KAAK,EAAE,YAAY;SACpB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,KAAK,UAAU,aAAa,CAC1
B,KAAU,EACV,WAAmB,EACnB,MAA8C;IAE9C,IAAI,WAAW,IAAI,CAAC,EAAE,CAAC;QACrB,uBAAuB;QACvB,MAAM,OAAO,GAAQ,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,OAAO,CAAC,IAAI,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,4CAA4C;IAC5C,MAAM,OAAO,GAAG,IAAI,KAAK,CAAI,KAAK,CAAC,MAAM,CAAC,CAAC;IAC3C,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,YAAY,GAAG,KAAK,EAAE,CAAC;YAC7B,OAAO,CAAC,YAAY,CAAC,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,YAAY,CAAC,CAAC;QAC1E,CAAC;IACH,CAAC;IAED,gBAAgB;IAChB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;SACvD,IAAI,CAAC,IAAI,CAAC;SACV,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;IAEvB,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAE3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,OAAO,UAAU;IACb,OAAO,CAAe;IAE9B,YAAY,MAAoB;QAC9B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;IACxB,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,GAAG,CACP,KAIE,EACF,QAA2B;QAE3B,sBAAsB;QACtB,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAEhF,gBAAgB;QAChB,MAAM,OAAO,GAAG,MAAM,aAAa,CACjC,KAAK,CAAC,KAAK,EACX,IAAI,CAAC,OAAO,CAAC,QAAQ,EACrB,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;YACpB,OAAO,OAAO,CACZ,IAAI,EACJ,KAAK,EACL,IAAI,CAAC,OAAO,CAAC,KAAK,EAClB,IAAI,CAAC,OAAO,CAAC,OAAO,EACpB,KAAK,EACL,QAAQ,EACR,KAAK,EACL,KAAK,CAAC,KAAK,CAAC,MAAM,CACnB,CAAC;QACJ,CAAC,CACF,CAAC;QAEF,oBAAoB;QACpB,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QAC1E,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC;QACvC,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEpF,MAAM,WAAW,GAAgB;YAC/B,KAAK;YACL,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;YA
CzB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,OAAO;YACP,OAAO,EAAE;gBACP,KAAK,EAAE,OAAO,CAAC,MAAM;gBACrB,MAAM;gBACN,MAAM;gBACN,QAAQ;gBACR,eAAe;aAChB;SACF,CAAC;QAEF,OAAO,WAAW,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,MAAoB,EACpB,KAIE,EACF,QAA2B;IAE3B,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACtC,OAAO,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;AACrC,CAAC"}
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Test suite loading and validation from YAML files.
3
+ *
4
+ * @module test-loader
5
+ */
6
+ import { z } from 'zod';
7
+ import type { TestSuite } from './types.js';
/**
 * What `loadTestSuite` hands back after a successful load.
 */
export interface LoadResult {
    /** The validated test suite */
    suite: TestSuite;
    /** Absolute path of the file the suite was read from */
    path: string;
}
/**
 * Raised when suite data does not satisfy the schema or contains duplicate
 * test IDs.
 *
 * `path` is the absolute file path for file-based loads and `'<inline>'` when
 * in-memory data was validated. `issues` holds the zod issues for schema
 * failures and is empty for duplicate-ID failures.
 */
export declare class TestValidationError extends Error {
    readonly path: string;
    readonly issues: z.ZodIssue[];
    constructor(message: string, path: string, issues: z.ZodIssue[]);
}
/**
 * Raised when a test file cannot be read or its YAML cannot be parsed.
 *
 * `path` is the absolute file path, or `'<string>'` when the YAML came from an
 * in-memory string. `cause` carries the underlying error when one is available.
 */
export declare class TestLoadError extends Error {
    readonly path: string;
    readonly cause?: unknown | undefined;
    constructor(message: string, path: string, cause?: unknown | undefined);
}
/**
 * Read a YAML test file from disk, parse it and validate it as a test suite.
 *
 * @param filePath - Path to the YAML test file (relative or absolute)
 * @param cwd - Base directory for resolving a relative `filePath`; defaults to
 *   the current working directory of the process
 * @returns The validated suite together with the absolute path it came from
 * @throws TestLoadError if the file cannot be read or is not valid YAML
 * @throws TestValidationError if the parsed content fails schema validation
 *   or contains duplicate test IDs
 *
 * @example
 * ```typescript
 * const { suite, path } = await loadTestSuite('./tests.yaml', process.cwd());
 * console.log(`Loaded ${suite.tests.length} tests from ${path}`);
 * ```
 */
export declare function loadTestSuite(filePath: string, cwd?: string): Promise<LoadResult>;
/**
 * Validate already-parsed data as a test suite, without touching the disk.
 *
 * Besides the schema check, suites containing repeated test IDs are rejected.
 *
 * @param data - The value to validate
 * @returns The validated test suite
 * @throws TestValidationError if validation fails
 */
export declare function validateTestSuite(data: unknown): TestSuite;
/**
 * Parse YAML text and validate the result as a test suite.
 *
 * @param yamlString - YAML content to parse
 * @returns The parsed and validated test suite
 * @throws TestLoadError if the text is not valid YAML
 * @throws TestValidationError if the parsed content fails validation
 */
export declare function parseTestSuite(yamlString: string): TestSuite;
66
+ //# sourceMappingURL=test-loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-loader.d.ts","sourceRoot":"","sources":["../../src/test-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,KAAK,EAAY,SAAS,EAAE,MAAM,YAAY,CAAC;AAsBtD;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,KAAK;aAG1B,IAAI,EAAE,MAAM;aACZ,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;gBAFpC,OAAO,EAAE,MAAM,EACC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;CAKvC;AAED;;GAEG;AACH,qBAAa,aAAc,SAAQ,KAAK;aAGpB,IAAI,EAAE,MAAM;aACZ,KAAK,CAAC,EAAE,OAAO;gBAF/B,OAAO,EAAE,MAAM,EACC,IAAI,EAAE,MAAM,EACZ,KAAK,CAAC,EAAE,OAAO,YAAA;CAKlC;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,GAAG,GAAE,MAAsB,GAC1B,OAAO,CAAC,UAAU,CAAC,CAmDrB;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,OAAO,GAAG,SAAS,CAwB1D;AAED;;;;;;;GAOG;AACH,wBAAgB,cAAc,CAAC,UAAU,EAAE,MAAM,GAAG,SAAS,CAa5D"}
@@ -0,0 +1,140 @@
1
+ /**
2
+ * Test suite loading and validation from YAML files.
3
+ *
4
+ * @module test-loader
5
+ */
6
+ import { readFile } from 'node:fs/promises';
7
+ import { resolve } from 'node:path';
8
+ import YAML from 'yaml';
9
+ import { z } from 'zod';
/**
 * Build a string schema that rejects the empty string with `message`.
 */
const nonEmptyString = (message) => z.string().min(1, message);
/**
 * Schema describing one test case entry: `id`, `input` and `expectedOutput`
 * must be non-empty strings, `context` is an optional string.
 */
const testCaseSchema = z.object({
    id: nonEmptyString('Test case ID is required'),
    input: nonEmptyString('Test case input is required'),
    context: z.string().optional(),
    expectedOutput: nonEmptyString('Test case expectedOutput is required'),
});
/**
 * Schema describing a whole suite: a non-empty `name`, optional
 * `description` and `context` strings, and at least one test case.
 */
const testSuiteSchema = z.object({
    name: nonEmptyString('Test suite name is required'),
    description: z.string().optional(),
    context: z.string().optional(),
    tests: z.array(testCaseSchema).min(1, 'At least one test case is required'),
});
/**
 * Raised when suite data fails validation.
 *
 * Carries the location the data came from (`path`) and the list of
 * validation issues (`issues`) alongside the usual error message.
 */
export class TestValidationError extends Error {
    path;
    issues;
    constructor(message, path, issues) {
        super(message);
        // Assigned in the order path, issues, name.
        Object.assign(this, { path, issues, name: 'TestValidationError' });
    }
}
/**
 * Raised when test content cannot be loaded.
 *
 * Carries the location of the content (`path`) and, when available, the
 * underlying error (`cause`) alongside the usual error message.
 */
export class TestLoadError extends Error {
    path;
    cause;
    constructor(message, path, cause) {
        super(message);
        // Assigned in the order path, cause, name.
        Object.assign(this, { path, cause, name: 'TestLoadError' });
    }
}
/**
 * Load and validate a test suite from a YAML file.
 *
 * Validation (schema check and duplicate-ID check) is delegated to
 * `validateTestSuite` so file-based and in-memory validation always apply the
 * same rules; a validation failure is re-raised with the file's absolute path
 * in place of the `'<inline>'` placeholder.
 *
 * @param filePath - Path to the YAML test file (relative or absolute)
 * @param cwd - Working directory for resolving relative paths; defaults to
 *   the current working directory of the process
 * @returns The validated suite and the absolute path it was read from
 * @throws TestLoadError if the file cannot be read or is not valid YAML
 * @throws TestValidationError if the YAML content is invalid
 *
 * @example
 * ```typescript
 * const { suite, path } = await loadTestSuite('./tests.yaml', process.cwd());
 * console.log(`Loaded ${suite.tests.length} tests from ${path}`);
 * ```
 */
export async function loadTestSuite(filePath, cwd = process.cwd()) {
    const absolutePath = resolve(cwd, filePath);
    let content;
    try {
        content = await readFile(absolutePath, 'utf-8');
    }
    catch (error) {
        throw new TestLoadError(`Failed to read test file: ${absolutePath}`, absolutePath, error);
    }
    let parsed;
    try {
        parsed = YAML.parse(content);
    }
    catch (error) {
        throw new TestLoadError(`Failed to parse YAML: ${error instanceof Error ? error.message : String(error)}`, absolutePath, error);
    }
    try {
        return {
            suite: validateTestSuite(parsed),
            path: absolutePath,
        };
    }
    catch (error) {
        if (error instanceof TestValidationError) {
            // Same message and issues, but pointing at the real file.
            throw new TestValidationError(error.message, absolutePath, error.issues);
        }
        throw error;
    }
}
/**
 * Validate a test suite object without loading from file.
 *
 * Runs the schema check first and then rejects suites in which a test ID
 * appears more than once. Errors raised here carry `'<inline>'` as their path.
 *
 * @param data - The data to validate
 * @returns The validated test suite
 * @throws TestValidationError if validation fails
 */
export function validateTestSuite(data) {
    const result = testSuiteSchema.safeParse(data);
    if (!result.success) {
        throw new TestValidationError(`Test suite validation failed: ${result.error.message}`, '<inline>', result.error.issues);
    }
    const validated = result.data;
    // Single pass: `seen` tracks first occurrences, `duplicates` collects each
    // repeated ID once, in the order its first repeat is encountered.
    const seen = new Set();
    const duplicates = new Set();
    for (const { id } of validated.tests) {
        if (seen.has(id)) {
            duplicates.add(id);
        }
        else {
            seen.add(id);
        }
    }
    if (duplicates.size > 0) {
        throw new TestValidationError(`Duplicate test IDs found: ${[...duplicates].join(', ')}`, '<inline>', []);
    }
    return validated;
}
/**
 * Turn YAML text into a validated test suite.
 *
 * @param yamlString - YAML content to parse
 * @returns The parsed and validated test suite
 * @throws TestLoadError if YAML parsing fails (reported with path `'<string>'`)
 * @throws TestValidationError if validation fails
 */
export function parseTestSuite(yamlString) {
    let document;
    try {
        document = YAML.parse(yamlString);
    }
    catch (cause) {
        const reason = cause instanceof Error ? cause.message : String(cause);
        throw new TestLoadError(`Failed to parse YAML: ${reason}`, '<string>', cause);
    }
    return validateTestSuite(document);
}
140
+ //# sourceMappingURL=test-loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"test-loader.js","sourceRoot":"","sources":["../../src/test-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAEpC,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB;;GAEG;AACH,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9B,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,0BAA0B,CAAC;IACjD,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,6BAA6B,CAAC;IACvD,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,sCAAsC,CAAC;CAC1E,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,6BAA6B,CAAC;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,oCAAoC,CAAC;CAC5E,CAAC,CAAC;AAaH;;GAEG;AACH,MAAM,OAAO,mBAAoB,SAAQ,KAAK;IAG1B;IACA;IAHlB,YACE,OAAe,EACC,IAAY,EACZ,MAAoB;QAEpC,KAAK,CAAC,OAAO,CAAC,CAAC;QAHC,SAAI,GAAJ,IAAI,CAAQ;QACZ,WAAM,GAAN,MAAM,CAAc;QAGpC,IAAI,CAAC,IAAI,GAAG,qBAAqB,CAAC;IACpC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,aAAc,SAAQ,KAAK;IAGpB;IACA;IAHlB,YACE,OAAe,EACC,IAAY,EACZ,KAAe;QAE/B,KAAK,CAAC,OAAO,CAAC,CAAC;QAHC,SAAI,GAAJ,IAAI,CAAQ;QACZ,UAAK,GAAL,KAAK,CAAU;QAG/B,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB,EAChB,MAAc,OAAO,CAAC,GAAG,EAAE;IAE3B,MAAM,YAAY,GAAG,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAE5C,IAAI,OAAe,CAAC;IACpB,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAClD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,aAAa,CACrB,6BAA6B,YAAY,EAAE,EAC3C,YAAY,EACZ,KAAK,CACN,CAAC;IACJ,CAAC;IAED,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,aAAa,CACrB,yBAAyB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,E
ACjF,YAAY,EACZ,KAAK,CACN,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IACjD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,mBAAmB,CAC3B,iCAAiC,MAAM,CAAC,KAAK,CAAC,OAAO,EAAE,EACvD,YAAY,EACZ,MAAM,CAAC,KAAK,CAAC,MAAM,CACpB,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC;IAE9B,+BAA+B;IAC/B,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAW,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACvD,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,EAAU,EAAE,KAAa,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,KAAK,CAAC,CAAC;IACxF,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,mBAAmB,CAC3B,6BAA6B,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAClE,YAAY,EACZ,EAAE,CACH,CAAC;IACJ,CAAC;IAED,OAAO;QACL,KAAK,EAAE,SAAS;QAChB,IAAI,EAAE,YAAY;KACnB,CAAC;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAa;IAC7C,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IAC/C,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,mBAAmB,CAC3B,iCAAiC,MAAM,CAAC,KAAK,CAAC,OAAO,EAAE,EACvD,UAAU,EACV,MAAM,CAAC,KAAK,CAAC,MAAM,CACpB,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC;IAE9B,+BAA+B;IAC/B,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAW,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACvD,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,EAAU,EAAE,KAAa,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,KAAK,CAAC,CAAC;IACxF,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,mBAAmB,CAC3B,6BAA6B,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAClE,UAAU,EACV,EAAE,CACH,CAAC;IACJ,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,UAAkB;IAC/C,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IAClC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,aAAa,CACrB,yBAAyB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,EACjF,UAAU,EACV,KAAK,CACN,CAAC;IACJ,CAAC;IAED,OAAO,iBAAiB,CAAC,MAAM,CAAC,CAAC;AACnC,CAAC"}
@@ -0,0 +1,161 @@
1
+ /**
2
+ * Core type definitions for the agent evaluation harness.
3
+ *
4
+ * @module types
5
+ */
6
+ /**
7
+ * A single test case for evaluating an agent: an input prompt paired with a description of the expected output.
8
+ */
9
+ export interface TestCase {
10
+ /** Unique identifier for this test case */
11
+ id: string;
12
+ /** The input prompt to send to the agent */
13
+ input: string;
14
+ /** Optional additional context for this specific test case (suite-wide context is declared on TestSuite.context) */
15
+ context?: string;
16
+ /** Natural-language description of the expected output */
17
+ expectedOutput: string;
18
+ }
19
+ /**
20
+ * A named suite of test cases for evaluation, with an optional description and optional suite-wide context.
21
+ */
22
+ export interface TestSuite {
23
+ /** Name of the test suite */
24
+ name: string;
25
+ /** Optional description of what this suite tests */
26
+ description?: string;
27
+ /** Optional global context available to all tests in this suite (a single test can declare its own via TestCase.context) */
28
+ context?: string;
29
+ /** The test cases that make up this suite */
30
+ tests: TestCase[];
31
+ }
32
+ /**
33
+ * Response from running a CLI agent: its output, process exit code, stderr text, and duration.
34
+ */
35
+ export interface AgentResponse {
36
+ /** The actual output from the agent */
37
+ output: string;
38
+ /** Exit code from the process */
39
+ exitCode: number;
40
+ /** Any stderr output, as a single string */
41
+ stderr: string;
42
+ /** Duration in milliseconds */
43
+ durationMs: number;
44
+ }
45
+ /**
46
+ * Result of running a single test case: what was sent, what came back, and how the judge scored it.
47
+ */
48
+ export interface TestResult {
49
+ /** Test case identifier */
50
+ testId: string;
51
+ /** Input that was sent to the agent */
52
+ input: string;
53
+ /** Expected output description */
54
+ expectedOutput: string;
55
+ /** Actual output from the agent */
56
+ actualOutput: string;
57
+ /** Judge score from 0 to 1 */
58
+ judgeScore: number;
59
+ /** Judge's reasoning for the score */
60
+ judgeReasoning: string;
61
+ /** Duration in milliseconds */
62
+ durationMs: number;
63
+ /** Whether the test passed (judgeScore >= threshold; a threshold is declared as JudgeConfig.passThreshold) */
64
+ passed: boolean;
65
+ /** Any error that occurred during execution, as a string; optional */
66
+ error?: string;
67
+ }
68
+ /**
69
+ * Summary statistics for an evaluation run: pass/fail counts, average judge score, and total duration.
70
+ */
71
+ export interface EvalSummary {
72
+ /** Total number of tests */
73
+ total: number;
74
+ /** Number of tests that passed */
75
+ passed: number;
76
+ /** Number of tests that failed */
77
+ failed: number;
78
+ /** Average judge score across all tests */
79
+ avgScore: number;
80
+ /** Total duration in milliseconds */
81
+ totalDurationMs: number;
82
+ }
83
+ /**
84
+ * Complete results from an evaluation run: the suite, the agent tested, per-test results, and a summary.
85
+ */
86
+ export interface EvalResults {
87
+ /** The test suite that was run */
88
+ suite: TestSuite;
89
+ /** Agent name that was tested */
90
+ agent: string;
91
+ /** Timestamp of the evaluation, as a string (the format is not specified by this type) */
92
+ timestamp: string;
93
+ /** Individual test results */
94
+ results: TestResult[];
95
+ /** Summary statistics */
96
+ summary: EvalSummary;
97
+ }
98
+ /**
99
+ * Configuration for the judge LLM: which provider and model to use, plus optional scoring settings.
100
+ */
101
+ export interface JudgeConfig {
102
+ /** Provider name (e.g., 'anthropic', 'openai') */
103
+ provider: string;
104
+ /** Model identifier */
105
+ model: string;
106
+ /** Score threshold for passing (0-1, default: 0.7); optional */
107
+ passThreshold?: number;
108
+ /** Temperature for judge LLM (default: 0); optional */
109
+ temperature?: number;
110
+ }
111
+ /**
112
+ * Configuration for the test runner: what to run, where, with which limits, and how to report and judge it.
113
+ */
114
+ export interface RunnerConfig {
115
+ /** Path to test file */
116
+ testsPath: string;
117
+ /** Agent to test: 'pi', 'genesys', or any custom command */
118
+ agent: string;
119
+ /** Working directory */
120
+ cwd: string;
121
+ /** Timeout in milliseconds (default: 120000); note that Args.timeout is documented in seconds */
122
+ timeout: number;
123
+ /** Output file path (optional) */
124
+ outputPath?: string;
125
+ /** Output format: 'console', 'json', or 'html' */
126
+ format: 'console' | 'json' | 'html';
127
+ /** Number of parallel test executions */
128
+ parallel: number;
129
+ /** Judge configuration */
130
+ judge: JudgeConfig;
131
+ }
132
+ /**
133
+ * CLI argument structure: the options accepted on the command line.
134
+ */
135
+ export interface Args {
136
+ /** Path to YAML test file (NOTE(review): RunnerConfig.testsPath only says "test file" - confirm the supported format) */
137
+ tests: string;
138
+ /** Agent to test: 'pi' or 'genesys' (NOTE(review): RunnerConfig.agent also documents "any custom command" - confirm whether the CLI accepts one) */
139
+ agent: string;
140
+ /** Working directory */
141
+ cwd: string;
142
+ /** Timeout in seconds (RunnerConfig.timeout is documented in milliseconds) */
143
+ timeout: number;
144
+ /** Output file path (optional) */
145
+ output?: string;
146
+ /** Output format: 'console', 'json', or 'html' */
147
+ format: 'console' | 'json' | 'html';
148
+ /** Parallelism level */
149
+ parallel: number;
150
+ /** Judge type: 'embedding' (default) or 'llm' */
151
+ judgeType: 'embedding' | 'llm';
152
+ /** Judge model (for LLM judge) */
153
+ judgeModel: string;
154
+ /** Judge provider (for LLM judge) */
155
+ judgeProvider: string;
156
+ /** Show help */
157
+ help: boolean;
158
+ /** Show version */
159
+ version: boolean;
160
+ }
161
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,2CAA2C;IAC3C,EAAE,EAAE,MAAM,CAAC;IAEX,4CAA4C;IAC5C,KAAK,EAAE,MAAM,CAAC;IAEd,8DAA8D;IAC9D,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,6BAA6B;IAC7B,IAAI,EAAE,MAAM,CAAC;IAEb,oDAAoD;IACpD,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,0BAA0B;IAC1B,KAAK,EAAE,QAAQ,EAAE,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,uCAAuC;IACvC,MAAM,EAAE,MAAM,CAAC;IAEf,iCAAiC;IACjC,QAAQ,EAAE,MAAM,CAAC;IAEjB,wBAAwB;IACxB,MAAM,EAAE,MAAM,CAAC;IAEf,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IAEf,uCAAuC;IACvC,KAAK,EAAE,MAAM,CAAC;IAEd,kCAAkC;IAClC,cAAc,EAAE,MAAM,CAAC;IAEvB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IAErB,8BAA8B;IAC9B,UAAU,EAAE,MAAM,CAAC;IAEnB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IAEvB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;IAEnB,mDAAmD;IACnD,MAAM,EAAE,OAAO,CAAC;IAEhB,+CAA+C;IAC/C,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,4BAA4B;IAC5B,KAAK,EAAE,MAAM,CAAC;IAEd,kCAAkC;IAClC,MAAM,EAAE,MAAM,CAAC;IAEf,kCAAkC;IAClC,MAAM,EAAE,MAAM,CAAC;IAEf,2CAA2C;IAC3C,QAAQ,EAAE,MAAM,CAAC;IAEjB,qCAAqC;IACrC,eAAe,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,kCAAkC;IAClC,KAAK,EAAE,SAAS,CAAC;IAEjB,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IAEd,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAElB,8BAA8B;IAC9B,OAAO,EAAE,UAAU,EAAE,CAAC;IAEtB,yBAAyB;IACzB,OAAO,EAAE,WAAW,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,kDAAkD;IAClD,QAAQ,EAAE,MAAM,CAAC;IAEjB,uBAAuB;IACvB,KAAK,EAAE,MAAM,CAAC;IAEd,sDAAsD;IACtD,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,6CAA6C;IAC7C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;IAElB,4DAA4D;IAC5D,KAAK,EAAE,MAAM,CAAC;IAEd,wBAAwB;IACxB,GAAG,EAAE,MAAM,CAAC;IAEZ,gDAAgD;IAChD,OAAO,EAAE,MAAM,CAAC;IAEhB,kCAAkC;IAClC,UAAU,CAAC,EAAE,MAAM,CAAC
;IAEpB,oBAAoB;IACpB,MAAM,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IAEpC,yCAAyC;IACzC,QAAQ,EAAE,MAAM,CAAC;IAEjB,0BAA0B;IAC1B,KAAK,EAAE,WAAW,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,IAAI;IACnB,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;IAEd,uCAAuC;IACvC,KAAK,EAAE,MAAM,CAAC;IAEd,wBAAwB;IACxB,GAAG,EAAE,MAAM,CAAC;IAEZ,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAEhB,uBAAuB;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,oBAAoB;IACpB,MAAM,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IAEpC,wBAAwB;IACxB,QAAQ,EAAE,MAAM,CAAC;IAEjB,iDAAiD;IACjD,SAAS,EAAE,WAAW,GAAG,KAAK,CAAC;IAE/B,kCAAkC;IAClC,UAAU,EAAE,MAAM,CAAC;IAEnB,qCAAqC;IACrC,aAAa,EAAE,MAAM,CAAC;IAEtB,gBAAgB;IAChB,IAAI,EAAE,OAAO,CAAC;IAEd,mBAAmB;IACnB,OAAO,EAAE,OAAO,CAAC;CAClB"}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * Core type definitions for the agent evaluation harness.
3
+ *
4
+ * @module types
5
+ */
6
+ export {};
7
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;GAIG"}
@@ -0,0 +1,16 @@
1
+ /**
2
+ * Package metadata utilities.
3
+ *
4
+ * @module utils/package
5
+ */
6
+ /**
7
+ * Get package.json contents.
8
+ *
9
+ * @returns Package.json as an object; the declared shape guarantees string `version` and `name` fields and types every other key as `unknown`
10
+ */
11
+ export declare function getPackageJson(): {
12
+ version: string;
13
+ name: string;
14
+ [key: string]: unknown;
15
+ };
16
+ //# sourceMappingURL=package.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"package.d.ts","sourceRoot":"","sources":["../../../src/utils/package.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH;;;;GAIG;AACH,wBAAgB,cAAc,IAAI;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE,CAe1F"}