@gnsx/genesys.agent.eval 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/dist/src/adapters/anthropic-adapter.d.ts +24 -0
- package/dist/src/adapters/anthropic-adapter.d.ts.map +1 -0
- package/dist/src/adapters/anthropic-adapter.js +80 -0
- package/dist/src/adapters/anthropic-adapter.js.map +1 -0
- package/dist/src/adapters/gemini-adapter.d.ts +23 -0
- package/dist/src/adapters/gemini-adapter.d.ts.map +1 -0
- package/dist/src/adapters/gemini-adapter.js +79 -0
- package/dist/src/adapters/gemini-adapter.js.map +1 -0
- package/dist/src/adapters/ollama-adapter.d.ts +28 -0
- package/dist/src/adapters/ollama-adapter.d.ts.map +1 -0
- package/dist/src/adapters/ollama-adapter.js +54 -0
- package/dist/src/adapters/ollama-adapter.js.map +1 -0
- package/dist/src/adapters/openai-adapter.d.ts +24 -0
- package/dist/src/adapters/openai-adapter.d.ts.map +1 -0
- package/dist/src/adapters/openai-adapter.js +80 -0
- package/dist/src/adapters/openai-adapter.js.map +1 -0
- package/dist/src/adapters/pi-adapter.d.ts +27 -0
- package/dist/src/adapters/pi-adapter.d.ts.map +1 -0
- package/dist/src/adapters/pi-adapter.js +136 -0
- package/dist/src/adapters/pi-adapter.js.map +1 -0
- package/dist/src/agent-adapter.d.ts +130 -0
- package/dist/src/agent-adapter.d.ts.map +1 -0
- package/dist/src/agent-adapter.js +134 -0
- package/dist/src/agent-adapter.js.map +1 -0
- package/dist/src/args.d.ts +22 -0
- package/dist/src/args.d.ts.map +1 -0
- package/dist/src/args.js +224 -0
- package/dist/src/args.js.map +1 -0
- package/dist/src/cli-runner.d.ts +39 -0
- package/dist/src/cli-runner.d.ts.map +1 -0
- package/dist/src/cli-runner.js +105 -0
- package/dist/src/cli-runner.js.map +1 -0
- package/dist/src/embedding-judge.d.ts +93 -0
- package/dist/src/embedding-judge.d.ts.map +1 -0
- package/dist/src/embedding-judge.js +160 -0
- package/dist/src/embedding-judge.js.map +1 -0
- package/dist/src/index.d.ts +15 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +20 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/judge.d.ts +95 -0
- package/dist/src/judge.d.ts.map +1 -0
- package/dist/src/judge.js +189 -0
- package/dist/src/judge.js.map +1 -0
- package/dist/src/launcher.d.ts +9 -0
- package/dist/src/launcher.d.ts.map +1 -0
- package/dist/src/launcher.js +129 -0
- package/dist/src/launcher.js.map +1 -0
- package/dist/src/reporter.d.ts +86 -0
- package/dist/src/reporter.d.ts.map +1 -0
- package/dist/src/reporter.js +384 -0
- package/dist/src/reporter.js.map +1 -0
- package/dist/src/runner.d.ts +75 -0
- package/dist/src/runner.d.ts.map +1 -0
- package/dist/src/runner.js +165 -0
- package/dist/src/runner.js.map +1 -0
- package/dist/src/test-loader.d.ts +66 -0
- package/dist/src/test-loader.d.ts.map +1 -0
- package/dist/src/test-loader.js +140 -0
- package/dist/src/test-loader.js.map +1 -0
- package/dist/src/types.d.ts +161 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +7 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/utils/package.d.ts +16 -0
- package/dist/src/utils/package.d.ts.map +1 -0
- package/dist/src/utils/package.js +30 -0
- package/dist/src/utils/package.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/examples/basic-tests.yaml +22 -0
- package/package.json +41 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test runner orchestrates loading, execution, and judging of test cases.
|
|
3
|
+
*
|
|
4
|
+
* @module runner
|
|
5
|
+
*/
|
|
6
|
+
import type { EvalResults, RunnerConfig, TestCase, TestResult } from './types.js';
|
|
7
|
+
/**
 * Hooks a caller may supply to observe per-test progress during a run.
 */
export interface ProgressCallback {
    /**
     * Invoked when a test is about to start.
     *
     * @param testId - ID of the test being started
     * @param index - Zero-based position of the test
     * @param total - Number of tests in the run
     */
    onTestStart(testId: string, index: number, total: number): void;
    /**
     * Invoked after a test has produced a result.
     *
     * @param result - The result recorded for the test
     * @param index - Zero-based position of the test
     * @param total - Number of tests in the run
     */
    onTestComplete(result: TestResult, index: number, total: number): void;
    /**
     * Invoked when running a test ended with an error.
     *
     * @param testId - ID of the test that errored
     * @param error - Message describing the error
     * @param index - Zero-based position of the test
     * @param total - Number of tests in the run
     */
    onTestError(testId: string, error: string, index: number, total: number): void;
}
|
|
37
|
+
/**
 * Drives an evaluation run for a given runner configuration.
 */
export declare class TestRunner {
    private _config;
    constructor(config: RunnerConfig);
    /**
     * Execute the evaluation.
     *
     * @param judge - Scores an agent output for a test case
     * @param progress - Optional per-test progress hooks
     * @returns The aggregated evaluation results
     */
    run(judge: (test: TestCase, actualOutput: string) => Promise<{
        score: number;
        reasoning: string;
        passed: boolean;
    }>, progress?: ProgressCallback): Promise<EvalResults>;
    /**
     * The configuration this runner was constructed with.
     */
    get config(): RunnerConfig;
}
|
|
60
|
+
/**
 * One-shot helper: builds a TestRunner from `config` and runs it.
 *
 * @param config - Runner configuration
 * @param judge - Scores an agent output for a test case
 * @param progress - Optional per-test progress hooks
 * @returns The aggregated evaluation results
 */
export declare function runEvaluation(config: RunnerConfig, judge: (test: TestCase, actualOutput: string) => Promise<{
    score: number;
    reasoning: string;
    passed: boolean;
}>, progress?: ProgressCallback): Promise<EvalResults>;
|
|
75
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAKH,OAAO,KAAK,EACV,WAAW,EACX,YAAY,EACZ,QAAQ,EACR,UAAU,EAEX,MAAM,YAAY,CAAC;AAEpB;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAC/B;;;;;;OAMG;IACH,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAEhE;;;;;;OAMG;IACH,cAAc,CAAC,MAAM,EAAE,UAAU,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IAEvE;;;;;;;OAOG;IACH,WAAW,CAAC,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CAChF;AAmID;;GAEG;AACH,qBAAa,UAAU;IACrB,OAAO,CAAC,OAAO,CAAe;gBAElB,MAAM,EAAE,YAAY;IAIhC;;;;;;OAMG;IACG,GAAG,CACP,KAAK,EAAE,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,KAAK,OAAO,CAAC;QACvD,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC,EACF,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,WAAW,CAAC;IA6CvB;;OAEG;IACH,IAAI,MAAM,IAAI,YAAY,CAEzB;CACF;AAED;;;;;;;;;GASG;AACH,wBAAsB,aAAa,CACjC,MAAM,EAAE,YAAY,EACpB,KAAK,EAAE,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,KAAK,OAAO,CAAC;IACvD,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,OAAO,CAAC;CACjB,CAAC,EACF,QAAQ,CAAC,EAAE,gBAAgB,GAC1B,OAAO,CAAC,WAAW,CAAC,CAGtB"}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test runner orchestrates loading, execution, and judging of test cases.
|
|
3
|
+
*
|
|
4
|
+
* @module runner
|
|
5
|
+
*/
|
|
6
|
+
import { runAgent } from './cli-runner.js';
|
|
7
|
+
import { loadTestSuite } from './test-loader.js';
|
|
8
|
+
/**
 * Execute one test case: build the prompt, run the agent, judge the output.
 *
 * Any error thrown while running the agent, judging, or reporting completion
 * is converted into a failed result (score 0) instead of being rethrown.
 *
 * @param test - The test case to run
 * @param suite - The suite the test belongs to (supplies shared context)
 * @param agent - The agent passed through to `runAgent`
 * @param timeout - Timeout in milliseconds, passed through to `runAgent`
 * @param judge - Async function scoring the agent output for this test
 * @param progress - Optional progress hooks
 * @param index - Zero-based index of this test
 * @param total - Total number of tests in the run
 * @returns The test result
 */
async function runTest(test, suite, agent, timeout, judge, progress, index, total) {
    progress?.onTestStart(test.id, index, total);
    const startedAt = Date.now();
    // Prompt layout: optional suite context, optional test context, then the task.
    const sections = [
        ...(suite.context ? ['Context:', suite.context, ''] : []),
        ...(test.context ? ['Specific Context:', test.context, ''] : []),
        'Task:',
        test.input,
    ];
    const prompt = sections.join('\n');
    try {
        const agentOptions = { cwd: process.cwd(), timeout };
        const response = await runAgent(agent, prompt, agentOptions);
        const verdict = await judge(test, response.output);
        const result = {
            testId: test.id,
            input: test.input,
            expectedOutput: test.expectedOutput,
            actualOutput: response.output,
            judgeScore: verdict.score,
            judgeReasoning: verdict.reasoning,
            durationMs: response.durationMs,
            passed: verdict.passed,
        };
        progress?.onTestComplete(result, index, total);
        return result;
    }
    catch (error) {
        // Elapsed time is measured locally because no agent response is available here.
        const durationMs = Date.now() - startedAt;
        let errorMessage;
        if (error instanceof Error) {
            errorMessage = error.message;
        }
        else {
            errorMessage = String(error);
        }
        progress?.onTestError(test.id, errorMessage, index, total);
        return {
            testId: test.id,
            input: test.input,
            expectedOutput: test.expectedOutput,
            actualOutput: '',
            judgeScore: 0,
            judgeReasoning: `Error: ${errorMessage}`,
            durationMs,
            passed: false,
            error: errorMessage,
        };
    }
}
|
|
69
|
+
/**
|
|
70
|
+
* Run tests in parallel with a concurrency limit.
|
|
71
|
+
*
|
|
72
|
+
* @param tests - Tests to run
|
|
73
|
+
* @param concurrency - Number of concurrent executions
|
|
74
|
+
* @param runner - Function to run a single test
|
|
75
|
+
* @returns Array of results in the same order as tests
|
|
76
|
+
*/
|
|
77
|
+
async function runInParallel(items, concurrency, runner) {
|
|
78
|
+
if (concurrency <= 1) {
|
|
79
|
+
// Sequential execution
|
|
80
|
+
const results = [];
|
|
81
|
+
for (let i = 0; i < items.length; i++) {
|
|
82
|
+
results.push(await runner(items[i], i));
|
|
83
|
+
}
|
|
84
|
+
return results;
|
|
85
|
+
}
|
|
86
|
+
// Parallel execution with concurrency limit
|
|
87
|
+
const results = new Array(items.length);
|
|
88
|
+
let index = 0;
|
|
89
|
+
async function worker() {
|
|
90
|
+
while (index < items.length) {
|
|
91
|
+
const currentIndex = index++;
|
|
92
|
+
results[currentIndex] = await runner(items[currentIndex], currentIndex);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
// Start workers
|
|
96
|
+
const workers = Array(Math.min(concurrency, items.length))
|
|
97
|
+
.fill(null)
|
|
98
|
+
.map(() => worker());
|
|
99
|
+
await Promise.all(workers);
|
|
100
|
+
return results;
|
|
101
|
+
}
|
|
102
|
+
/**
 * Loads a test suite, runs every test through the agent and judge, and
 * aggregates the per-test results into a summary.
 */
export class TestRunner {
    _config;
    constructor(config) {
        this._config = config;
    }
    /**
     * Execute the evaluation.
     *
     * @param judge - Async function scoring an agent output for a test case
     * @param progress - Optional per-test progress hooks
     * @returns The suite, agent, timestamp, per-test results and summary
     */
    async run(judge, progress) {
        const cfg = this._config;
        const { suite } = await loadTestSuite(cfg.testsPath, cfg.cwd);
        const execute = async (test, index) => runTest(test, suite, cfg.agent, cfg.timeout, judge, progress, index, suite.tests.length);
        const results = await runInParallel(suite.tests, cfg.parallel, execute);
        // Accumulate the summary figures in one pass over the results.
        let totalDurationMs = 0;
        let passed = 0;
        let scoreSum = 0;
        for (const entry of results) {
            totalDurationMs += entry.durationMs;
            if (entry.passed) {
                passed += 1;
            }
            scoreSum += entry.judgeScore;
        }
        const failed = results.length - passed;
        const avgScore = scoreSum / results.length;
        return {
            suite,
            agent: cfg.agent,
            timestamp: new Date().toISOString(),
            results,
            summary: {
                total: results.length,
                passed,
                failed,
                avgScore,
                totalDurationMs,
            },
        };
    }
    /**
     * The configuration this runner was constructed with.
     */
    get config() {
        return this._config;
    }
}
|
|
151
|
+
/**
 * One-shot helper: builds a TestRunner from `config` and runs it.
 *
 * @param config - Runner configuration
 * @param judge - Async function scoring an agent output for a test case
 * @param progress - Optional per-test progress hooks
 * @returns Evaluation results
 */
export async function runEvaluation(config, judge, progress) {
    return new TestRunner(config).run(judge, progress);
}
|
|
165
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/runner.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,iBAAiB,CAAC;AAC3C,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AA2CjD;;;;;;;;;;;;GAYG;AACH,KAAK,UAAU,OAAO,CACpB,IAAc,EACd,KAAgB,EAChB,KAAa,EACb,OAAe,EACf,KAIE,EACF,QAAsC,EACtC,KAAa,EACb,KAAa;IAEb,QAAQ,EAAE,WAAW,CAAC,IAAI,CAAC,EAAE,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;IAE7C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,qCAAqC;IACrC,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC;QAClB,KAAK,CAAC,IAAI,CAAC,UAAU,EAAE,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IAC5C,CAAC;IACD,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;QACjB,KAAK,CAAC,IAAI,CAAC,mBAAmB,EAAE,IAAI,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACpD,CAAC;IACD,KAAK,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IAChC,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEhC,IAAI,CAAC;QACH,gBAAgB;QAChB,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAC7B,KAAyB,EACzB,MAAM,EACN,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,EAAE,OAAO,EAAE,CAChC,CAAC;QAEF,qBAAqB;QACrB,MAAM,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,GAAG,MAAM,KAAK,CAAC,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;QAExE,MAAM,MAAM,GAAe;YACzB,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,YAAY,EAAE,QAAQ,CAAC,MAAM;YAC7B,UAAU,EAAE,KAAK;YACjB,cAAc,EAAE,SAAS;YACzB,UAAU,EAAE,QAAQ,CAAC,UAAU;YAC/B,MAAM;SACP,CAAC;QAEF,QAAQ,EAAE,cAAc,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QAC/C,OAAO,MAAM,CAAC;IAChB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;QAC1C,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QAE5E,QAAQ,EAAE,WAAW,CAAC,IAAI,CAAC,EAAE,EAAE,YAAY,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QAE3D,OAAO;YACL,MAAM,EAAE,IAAI,CAAC,EAAE;YACf,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,cAAc,EAAE,IAAI,CAAC,cAAc;YACnC,YAAY,EAAE,EAAE;YAChB,UAAU,EAAE,CAAC;YACb,cAAc,EAAE,UAAU,YAAY,EAAE;YACxC,UAAU;YACV,MAAM,EAAE,KAAK;YACb,KAAK,EAAE,YAAY;SACpB,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;;GAOG;AACH,KAAK,UAAU,aAAa,CAC1B,
KAAU,EACV,WAAmB,EACnB,MAA8C;IAE9C,IAAI,WAAW,IAAI,CAAC,EAAE,CAAC;QACrB,uBAAuB;QACvB,MAAM,OAAO,GAAQ,EAAE,CAAC;QACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACtC,OAAO,CAAC,IAAI,CAAC,MAAM,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,4CAA4C;IAC5C,MAAM,OAAO,GAAG,IAAI,KAAK,CAAI,KAAK,CAAC,MAAM,CAAC,CAAC;IAC3C,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,UAAU,MAAM;QACnB,OAAO,KAAK,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC5B,MAAM,YAAY,GAAG,KAAK,EAAE,CAAC;YAC7B,OAAO,CAAC,YAAY,CAAC,GAAG,MAAM,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,EAAE,YAAY,CAAC,CAAC;QAC1E,CAAC;IACH,CAAC;IAED,gBAAgB;IAChB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,WAAW,EAAE,KAAK,CAAC,MAAM,CAAC,CAAC;SACvD,IAAI,CAAC,IAAI,CAAC;SACV,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,EAAE,CAAC,CAAC;IAEvB,MAAM,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IAE3B,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;GAEG;AACH,MAAM,OAAO,UAAU;IACb,OAAO,CAAe;IAE9B,YAAY,MAAoB;QAC9B,IAAI,CAAC,OAAO,GAAG,MAAM,CAAC;IACxB,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,GAAG,CACP,KAIE,EACF,QAA2B;QAE3B,sBAAsB;QACtB,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAEhF,gBAAgB;QAChB,MAAM,OAAO,GAAG,MAAM,aAAa,CACjC,KAAK,CAAC,KAAK,EACX,IAAI,CAAC,OAAO,CAAC,QAAQ,EACrB,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE;YACpB,OAAO,OAAO,CACZ,IAAI,EACJ,KAAK,EACL,IAAI,CAAC,OAAO,CAAC,KAAK,EAClB,IAAI,CAAC,OAAO,CAAC,OAAO,EACpB,KAAK,EACL,QAAQ,EACR,KAAK,EACL,KAAK,CAAC,KAAK,CAAC,MAAM,CACnB,CAAC;QACJ,CAAC,CACF,CAAC;QAEF,oBAAoB;QACpB,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,CAAC;QAC1E,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC;QACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,GAAG,MAAM,CAAC;QACvC,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;QAEpF,MAAM,WAAW,GAAgB;YAC/B,KAAK;YACL,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;YACz
B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,OAAO;YACP,OAAO,EAAE;gBACP,KAAK,EAAE,OAAO,CAAC,MAAM;gBACrB,MAAM;gBACN,MAAM;gBACN,QAAQ;gBACR,eAAe;aAChB;SACF,CAAC;QAEF,OAAO,WAAW,CAAC;IACrB,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED;;;;;;;;;GASG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,MAAoB,EACpB,KAIE,EACF,QAA2B;IAE3B,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,MAAM,CAAC,CAAC;IACtC,OAAO,MAAM,CAAC,GAAG,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;AACrC,CAAC"}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test suite loading and validation from YAML files.
|
|
3
|
+
*
|
|
4
|
+
* @module test-loader
|
|
5
|
+
*/
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
import type { TestSuite } from './types.js';
|
|
8
|
+
/**
 * What `loadTestSuite` resolves with.
 */
export interface LoadResult {
    /** The validated test suite */
    suite: TestSuite;
    /** Absolute path of the file the suite was read from */
    path: string;
}
|
|
17
|
+
/**
 * Raised when test suite content does not pass validation.
 * Carries the path of the offending source and the validation issues.
 */
export declare class TestValidationError extends Error {
    readonly path: string;
    readonly issues: z.ZodIssue[];
    constructor(message: string, path: string, issues: z.ZodIssue[]);
}
|
|
25
|
+
/**
 * Raised when a test file cannot be read or its YAML cannot be parsed.
 * Carries the path of the source and, when available, the underlying error.
 */
export declare class TestLoadError extends Error {
    readonly path: string;
    readonly cause?: unknown;
    constructor(message: string, path: string, cause?: unknown);
}
|
|
33
|
+
/**
 * Read a YAML test file from disk and validate it as a test suite.
 *
 * @param filePath - Path to the YAML test file (relative or absolute)
 * @param cwd - Directory against which a relative `filePath` is resolved
 * @returns The validated suite together with the absolute file path
 * @throws TestLoadError if the file cannot be read or the YAML cannot be parsed
 * @throws TestValidationError if the parsed content is not a valid test suite
 *
 * @example
 * ```typescript
 * const { suite, path } = await loadTestSuite('./tests.yaml', process.cwd());
 * console.log(`Loaded ${suite.tests.length} tests from ${path}`);
 * ```
 */
export declare function loadTestSuite(filePath: string, cwd?: string): Promise<LoadResult>;
|
|
49
|
+
/**
 * Validate an in-memory value as a test suite (no file access).
 *
 * @param data - The value to validate
 * @returns The validated test suite
 * @throws TestValidationError if validation fails
 */
export declare function validateTestSuite(data: unknown): TestSuite;
|
|
57
|
+
/**
 * Parse YAML text and validate the result as a test suite.
 *
 * @param yamlString - YAML content to parse
 * @returns The parsed and validated test suite
 * @throws TestLoadError if YAML parsing fails
 * @throws TestValidationError if validation fails
 */
export declare function parseTestSuite(yamlString: string): TestSuite;
|
|
66
|
+
//# sourceMappingURL=test-loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-loader.d.ts","sourceRoot":"","sources":["../../src/test-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,KAAK,EAAY,SAAS,EAAE,MAAM,YAAY,CAAC;AAsBtD;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,4BAA4B;IAC5B,KAAK,EAAE,SAAS,CAAC;IAEjB,qCAAqC;IACrC,IAAI,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,qBAAa,mBAAoB,SAAQ,KAAK;aAG1B,IAAI,EAAE,MAAM;aACZ,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;gBAFpC,OAAO,EAAE,MAAM,EACC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE;CAKvC;AAED;;GAEG;AACH,qBAAa,aAAc,SAAQ,KAAK;aAGpB,IAAI,EAAE,MAAM;aACZ,KAAK,CAAC,EAAE,OAAO;gBAF/B,OAAO,EAAE,MAAM,EACC,IAAI,EAAE,MAAM,EACZ,KAAK,CAAC,EAAE,OAAO,YAAA;CAKlC;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAsB,aAAa,CACjC,QAAQ,EAAE,MAAM,EAChB,GAAG,GAAE,MAAsB,GAC1B,OAAO,CAAC,UAAU,CAAC,CAmDrB;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,OAAO,GAAG,SAAS,CAwB1D;AAED;;;;;;;GAOG;AACH,wBAAgB,cAAc,CAAC,UAAU,EAAE,MAAM,GAAG,SAAS,CAa5D"}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test suite loading and validation from YAML files.
|
|
3
|
+
*
|
|
4
|
+
* @module test-loader
|
|
5
|
+
*/
|
|
6
|
+
import { readFile } from 'node:fs/promises';
|
|
7
|
+
import { resolve } from 'node:path';
|
|
8
|
+
import YAML from 'yaml';
|
|
9
|
+
import { z } from 'zod';
|
|
10
|
+
/**
 * Zod schema for one test case: non-empty `id`, `input` and
 * `expectedOutput` strings, plus an optional `context` string.
 */
const testCaseSchema = z.object({
    id: z.string().min(1, 'Test case ID is required'),
    input: z.string().min(1, 'Test case input is required'),
    context: z.optional(z.string()),
    expectedOutput: z.string().min(1, 'Test case expectedOutput is required'),
});
|
|
19
|
+
/**
 * Zod schema for a test suite: a non-empty `name`, optional `description`
 * and `context` strings, and at least one test case.
 */
const testSuiteSchema = z.object({
    name: z.string().min(1, 'Test suite name is required'),
    description: z.optional(z.string()),
    context: z.optional(z.string()),
    tests: z.array(testCaseSchema).min(1, 'At least one test case is required'),
});
|
|
28
|
+
/**
|
|
29
|
+
* Error thrown when test file validation fails.
|
|
30
|
+
*/
|
|
31
|
+
export class TestValidationError extends Error {
|
|
32
|
+
path;
|
|
33
|
+
issues;
|
|
34
|
+
constructor(message, path, issues) {
|
|
35
|
+
super(message);
|
|
36
|
+
this.path = path;
|
|
37
|
+
this.issues = issues;
|
|
38
|
+
this.name = 'TestValidationError';
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
/**
 * Raised when a test file cannot be read or its YAML cannot be parsed.
 * Records the path of the source and the underlying error, if one was given.
 */
export class TestLoadError extends Error {
    path;
    cause;
    constructor(message, path, cause) {
        super(message);
        Object.assign(this, { path, cause, name: 'TestLoadError' });
    }
}
|
|
54
|
+
/**
 * Read a YAML test file from disk and validate it as a test suite.
 *
 * Steps: resolve the path against `cwd`, read the file as UTF-8, parse the
 * YAML, validate against the suite schema, then reject duplicate test IDs.
 *
 * @param filePath - Path to the YAML test file (relative or absolute)
 * @param cwd - Directory against which a relative `filePath` is resolved
 * @returns The validated suite together with the absolute file path
 * @throws TestLoadError if the file cannot be read or the YAML cannot be parsed
 * @throws TestValidationError if the content fails schema validation or
 *   contains duplicate test IDs
 *
 * @example
 * ```typescript
 * const { suite, path } = await loadTestSuite('./tests.yaml', process.cwd());
 * console.log(`Loaded ${suite.tests.length} tests from ${path}`);
 * ```
 */
export async function loadTestSuite(filePath, cwd = process.cwd()) {
    const absolutePath = resolve(cwd, filePath);
    let content;
    try {
        content = await readFile(absolutePath, 'utf-8');
    }
    catch (error) {
        throw new TestLoadError(`Failed to read test file: ${absolutePath}`, absolutePath, error);
    }
    let parsed;
    try {
        parsed = YAML.parse(content);
    }
    catch (error) {
        throw new TestLoadError(`Failed to parse YAML: ${error instanceof Error ? error.message : String(error)}`, absolutePath, error);
    }
    const result = testSuiteSchema.safeParse(parsed);
    if (!result.success) {
        throw new TestValidationError(`Test suite validation failed: ${result.error.message}`, absolutePath, result.error.issues);
    }
    const validated = result.data;
    // Check for duplicate test IDs in a single pass. `repeated` keeps each
    // duplicated ID once, in the order its first repeat is encountered.
    const seen = new Set();
    const repeated = new Set();
    for (const { id } of validated.tests) {
        if (seen.has(id)) {
            repeated.add(id);
        }
        else {
            seen.add(id);
        }
    }
    if (repeated.size > 0) {
        throw new TestValidationError(`Duplicate test IDs found: ${[...repeated].join(', ')}`, absolutePath, []);
    }
    return {
        suite: validated,
        path: absolutePath,
    };
}
|
|
101
|
+
/**
 * Validate an in-memory value as a test suite (no file access).
 *
 * Errors raised here carry the placeholder path `'<inline>'`.
 *
 * @param data - The value to validate
 * @returns The validated test suite
 * @throws TestValidationError if schema validation fails or test IDs repeat
 */
export function validateTestSuite(data) {
    const result = testSuiteSchema.safeParse(data);
    if (!result.success) {
        throw new TestValidationError(`Test suite validation failed: ${result.error.message}`, '<inline>', result.error.issues);
    }
    const validated = result.data;
    // Check for duplicate test IDs in a single pass. `repeated` keeps each
    // duplicated ID once, in the order its first repeat is encountered.
    const seen = new Set();
    const repeated = new Set();
    for (const { id } of validated.tests) {
        if (seen.has(id)) {
            repeated.add(id);
        }
        else {
            seen.add(id);
        }
    }
    if (repeated.size > 0) {
        throw new TestValidationError(`Duplicate test IDs found: ${[...repeated].join(', ')}`, '<inline>', []);
    }
    return validated;
}
|
|
122
|
+
/**
 * Parse YAML text and validate the result as a test suite.
 *
 * A parse failure is reported with the placeholder path `'<string>'`.
 *
 * @param yamlString - YAML content to parse
 * @returns The parsed and validated test suite
 * @throws TestLoadError if YAML parsing fails
 * @throws TestValidationError if validation fails
 */
export function parseTestSuite(yamlString) {
    const parseYaml = () => {
        try {
            return YAML.parse(yamlString);
        }
        catch (error) {
            const detail = error instanceof Error ? error.message : String(error);
            throw new TestLoadError(`Failed to parse YAML: ${detail}`, '<string>', error);
        }
    };
    return validateTestSuite(parseYaml());
}
|
|
140
|
+
//# sourceMappingURL=test-loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"test-loader.js","sourceRoot":"","sources":["../../src/test-loader.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAC;AAEpC,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAIxB;;GAEG;AACH,MAAM,cAAc,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9B,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,0BAA0B,CAAC;IACjD,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,6BAA6B,CAAC;IACvD,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,sCAAsC,CAAC;CAC1E,CAAC,CAAC;AAEH;;GAEG;AACH,MAAM,eAAe,GAAG,CAAC,CAAC,MAAM,CAAC;IAC/B,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,6BAA6B,CAAC;IACtD,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAClC,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IAC9B,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,cAAc,CAAC,CAAC,GAAG,CAAC,CAAC,EAAE,oCAAoC,CAAC;CAC5E,CAAC,CAAC;AAaH;;GAEG;AACH,MAAM,OAAO,mBAAoB,SAAQ,KAAK;IAG1B;IACA;IAHlB,YACE,OAAe,EACC,IAAY,EACZ,MAAoB;QAEpC,KAAK,CAAC,OAAO,CAAC,CAAC;QAHC,SAAI,GAAJ,IAAI,CAAQ;QACZ,WAAM,GAAN,MAAM,CAAc;QAGpC,IAAI,CAAC,IAAI,GAAG,qBAAqB,CAAC;IACpC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,aAAc,SAAQ,KAAK;IAGpB;IACA;IAHlB,YACE,OAAe,EACC,IAAY,EACZ,KAAe;QAE/B,KAAK,CAAC,OAAO,CAAC,CAAC;QAHC,SAAI,GAAJ,IAAI,CAAQ;QACZ,UAAK,GAAL,KAAK,CAAU;QAG/B,IAAI,CAAC,IAAI,GAAG,eAAe,CAAC;IAC9B,CAAC;CACF;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,QAAgB,EAChB,MAAc,OAAO,CAAC,GAAG,EAAE;IAE3B,MAAM,YAAY,GAAG,OAAO,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IAE5C,IAAI,OAAe,CAAC;IACpB,IAAI,CAAC;QACH,OAAO,GAAG,MAAM,QAAQ,CAAC,YAAY,EAAE,OAAO,CAAC,CAAC;IAClD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,aAAa,CACrB,6BAA6B,YAAY,EAAE,EAC3C,YAAY,EACZ,KAAK,CACN,CAAC;IACJ,CAAC;IAED,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC/B,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,aAAa,CACrB,yBAAyB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,EAC
jF,YAAY,EACZ,KAAK,CACN,CAAC;IACJ,CAAC;IAED,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;IACjD,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,mBAAmB,CAC3B,iCAAiC,MAAM,CAAC,KAAK,CAAC,OAAO,EAAE,EACvD,YAAY,EACZ,MAAM,CAAC,KAAK,CAAC,MAAM,CACpB,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC;IAE9B,+BAA+B;IAC/B,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAW,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACvD,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,EAAU,EAAE,KAAa,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,KAAK,CAAC,CAAC;IACxF,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,mBAAmB,CAC3B,6BAA6B,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAClE,YAAY,EACZ,EAAE,CACH,CAAC;IACJ,CAAC;IAED,OAAO;QACL,KAAK,EAAE,SAAS;QAChB,IAAI,EAAE,YAAY;KACnB,CAAC;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAAa;IAC7C,MAAM,MAAM,GAAG,eAAe,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IAC/C,IAAI,CAAC,MAAM,CAAC,OAAO,EAAE,CAAC;QACpB,MAAM,IAAI,mBAAmB,CAC3B,iCAAiC,MAAM,CAAC,KAAK,CAAC,OAAO,EAAE,EACvD,UAAU,EACV,MAAM,CAAC,KAAK,CAAC,MAAM,CACpB,CAAC;IACJ,CAAC;IAED,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC;IAE9B,+BAA+B;IAC/B,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAW,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IACvD,MAAM,UAAU,GAAG,GAAG,CAAC,MAAM,CAAC,CAAC,EAAU,EAAE,KAAa,EAAE,EAAE,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC,KAAK,KAAK,CAAC,CAAC;IACxF,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,mBAAmB,CAC3B,6BAA6B,CAAC,GAAG,IAAI,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAClE,UAAU,EACV,EAAE,CACH,CAAC;IACJ,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,cAAc,CAAC,UAAkB;IAC/C,IAAI,MAAe,CAAC;IACpB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;IAClC,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,IAAI,aAAa,CACrB,yBAAyB,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,EACjF,UAAU,EACV,KAAK,CACN,CAAC;IACJ,CAAC;IAED,OAAO,iBAAiB,CAAC,MAAM,CAAC,CAAC;AACnC,CAAC"}
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for the agent evaluation harness.
|
|
3
|
+
*
|
|
4
|
+
* @module types
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* A single test case for evaluating an agent.
|
|
8
|
+
*/
|
|
9
|
+
export interface TestCase {
|
|
10
|
+
/** Unique identifier for this test case */
|
|
11
|
+
id: string;
|
|
12
|
+
/** The input prompt to send to the agent */
|
|
13
|
+
input: string;
|
|
14
|
+
/** Optional additional context for this specific test case */
|
|
15
|
+
context?: string;
|
|
16
|
+
/** Natural language description of expected output */
|
|
17
|
+
expectedOutput: string;
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* A suite of test cases for evaluation.
|
|
21
|
+
*/
|
|
22
|
+
export interface TestSuite {
|
|
23
|
+
/** Name of the test suite */
|
|
24
|
+
name: string;
|
|
25
|
+
/** Optional description of what this suite tests */
|
|
26
|
+
description?: string;
|
|
27
|
+
/** Optional global context available to all tests in this suite */
|
|
28
|
+
context?: string;
|
|
29
|
+
/** Array of test cases */
|
|
30
|
+
tests: TestCase[];
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Response from running a CLI agent.
|
|
34
|
+
*/
|
|
35
|
+
export interface AgentResponse {
|
|
36
|
+
/** The actual output from the agent */
|
|
37
|
+
output: string;
|
|
38
|
+
/** Exit code from the process */
|
|
39
|
+
exitCode: number;
|
|
40
|
+
/** Any stderr output */
|
|
41
|
+
stderr: string;
|
|
42
|
+
/** Duration in milliseconds */
|
|
43
|
+
durationMs: number;
|
|
44
|
+
}
|
|
45
|
+
/**
|
|
46
|
+
* Result of running a single test case.
|
|
47
|
+
*/
|
|
48
|
+
export interface TestResult {
|
|
49
|
+
/** Test case identifier */
|
|
50
|
+
testId: string;
|
|
51
|
+
/** Input that was sent to the agent */
|
|
52
|
+
input: string;
|
|
53
|
+
/** Expected output description */
|
|
54
|
+
expectedOutput: string;
|
|
55
|
+
/** Actual output from the agent */
|
|
56
|
+
actualOutput: string;
|
|
57
|
+
/** Judge score from 0 to 1 */
|
|
58
|
+
judgeScore: number;
|
|
59
|
+
/** Judge's reasoning for the score */
|
|
60
|
+
judgeReasoning: string;
|
|
61
|
+
/** Duration in milliseconds */
|
|
62
|
+
durationMs: number;
|
|
63
|
+
/** Whether the test passed (score >= threshold) */
|
|
64
|
+
passed: boolean;
|
|
65
|
+
/** Any error that occurred during execution */
|
|
66
|
+
error?: string;
|
|
67
|
+
}
|
|
68
|
+
/**
|
|
69
|
+
* Summary statistics for an evaluation run.
|
|
70
|
+
*/
|
|
71
|
+
export interface EvalSummary {
|
|
72
|
+
/** Total number of tests */
|
|
73
|
+
total: number;
|
|
74
|
+
/** Number of tests that passed */
|
|
75
|
+
passed: number;
|
|
76
|
+
/** Number of tests that failed */
|
|
77
|
+
failed: number;
|
|
78
|
+
/** Average judge score across all tests */
|
|
79
|
+
avgScore: number;
|
|
80
|
+
/** Total duration in milliseconds */
|
|
81
|
+
totalDurationMs: number;
|
|
82
|
+
}
|
|
83
|
+
/**
|
|
84
|
+
* Complete results from an evaluation run.
|
|
85
|
+
*/
|
|
86
|
+
export interface EvalResults {
|
|
87
|
+
/** The test suite that was run */
|
|
88
|
+
suite: TestSuite;
|
|
89
|
+
/** Agent name that was tested */
|
|
90
|
+
agent: string;
|
|
91
|
+
/** Timestamp of the evaluation */
|
|
92
|
+
timestamp: string;
|
|
93
|
+
/** Individual test results */
|
|
94
|
+
results: TestResult[];
|
|
95
|
+
/** Summary statistics */
|
|
96
|
+
summary: EvalSummary;
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Configuration for the judge LLM.
|
|
100
|
+
*/
|
|
101
|
+
export interface JudgeConfig {
|
|
102
|
+
/** Provider name (e.g., 'anthropic', 'openai') */
|
|
103
|
+
provider: string;
|
|
104
|
+
/** Model identifier */
|
|
105
|
+
model: string;
|
|
106
|
+
/** Score threshold for passing (0-1, default: 0.7) */
|
|
107
|
+
passThreshold?: number;
|
|
108
|
+
/** Temperature for judge LLM (default: 0) */
|
|
109
|
+
temperature?: number;
|
|
110
|
+
}
|
|
111
|
+
/**
|
|
112
|
+
* Configuration for the test runner.
|
|
113
|
+
*/
|
|
114
|
+
export interface RunnerConfig {
|
|
115
|
+
/** Path to test file */
|
|
116
|
+
testsPath: string;
|
|
117
|
+
/** Agent to test: 'pi', 'genesys', or any custom command */
|
|
118
|
+
agent: string;
|
|
119
|
+
/** Working directory */
|
|
120
|
+
cwd: string;
|
|
121
|
+
/** Timeout in milliseconds (default: 120000) */
|
|
122
|
+
timeout: number;
|
|
123
|
+
/** Output file path (optional) */
|
|
124
|
+
outputPath?: string;
|
|
125
|
+
/** Output format */
|
|
126
|
+
format: 'console' | 'json' | 'html';
|
|
127
|
+
/** Number of parallel test executions */
|
|
128
|
+
parallel: number;
|
|
129
|
+
/** Judge configuration */
|
|
130
|
+
judge: JudgeConfig;
|
|
131
|
+
}
|
|
132
|
+
/**
|
|
133
|
+
* CLI argument structure.
|
|
134
|
+
*/
|
|
135
|
+
export interface Args {
|
|
136
|
+
/** Path to YAML test file */
|
|
137
|
+
tests: string;
|
|
138
|
+
/** Agent to test: 'pi' or 'genesys' */
|
|
139
|
+
agent: string;
|
|
140
|
+
/** Working directory */
|
|
141
|
+
cwd: string;
|
|
142
|
+
/** Timeout in seconds */
|
|
143
|
+
timeout: number;
|
|
144
|
+
/** Output file path */
|
|
145
|
+
output?: string;
|
|
146
|
+
/** Output format */
|
|
147
|
+
format: 'console' | 'json' | 'html';
|
|
148
|
+
/** Parallelism level */
|
|
149
|
+
parallel: number;
|
|
150
|
+
/** Judge type: 'embedding' (default) or 'llm' */
|
|
151
|
+
judgeType: 'embedding' | 'llm';
|
|
152
|
+
/** Judge model (for LLM judge) */
|
|
153
|
+
judgeModel: string;
|
|
154
|
+
/** Judge provider (for LLM judge) */
|
|
155
|
+
judgeProvider: string;
|
|
156
|
+
/** Show help */
|
|
157
|
+
help: boolean;
|
|
158
|
+
/** Show version */
|
|
159
|
+
version: boolean;
|
|
160
|
+
}
|
|
161
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,QAAQ;IACvB,2CAA2C;IAC3C,EAAE,EAAE,MAAM,CAAC;IAEX,4CAA4C;IAC5C,KAAK,EAAE,MAAM,CAAC;IAEd,8DAA8D;IAC9D,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,sDAAsD;IACtD,cAAc,EAAE,MAAM,CAAC;CACxB;AAED;;GAEG;AACH,MAAM,WAAW,SAAS;IACxB,6BAA6B;IAC7B,IAAI,EAAE,MAAM,CAAC;IAEb,oDAAoD;IACpD,WAAW,CAAC,EAAE,MAAM,CAAC;IAErB,mEAAmE;IACnE,OAAO,CAAC,EAAE,MAAM,CAAC;IAEjB,0BAA0B;IAC1B,KAAK,EAAE,QAAQ,EAAE,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,aAAa;IAC5B,uCAAuC;IACvC,MAAM,EAAE,MAAM,CAAC;IAEf,iCAAiC;IACjC,QAAQ,EAAE,MAAM,CAAC;IAEjB,wBAAwB;IACxB,MAAM,EAAE,MAAM,CAAC;IAEf,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,2BAA2B;IAC3B,MAAM,EAAE,MAAM,CAAC;IAEf,uCAAuC;IACvC,KAAK,EAAE,MAAM,CAAC;IAEd,kCAAkC;IAClC,cAAc,EAAE,MAAM,CAAC;IAEvB,mCAAmC;IACnC,YAAY,EAAE,MAAM,CAAC;IAErB,8BAA8B;IAC9B,UAAU,EAAE,MAAM,CAAC;IAEnB,sCAAsC;IACtC,cAAc,EAAE,MAAM,CAAC;IAEvB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAC;IAEnB,mDAAmD;IACnD,MAAM,EAAE,OAAO,CAAC;IAEhB,+CAA+C;IAC/C,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,4BAA4B;IAC5B,KAAK,EAAE,MAAM,CAAC;IAEd,kCAAkC;IAClC,MAAM,EAAE,MAAM,CAAC;IAEf,kCAAkC;IAClC,MAAM,EAAE,MAAM,CAAC;IAEf,2CAA2C;IAC3C,QAAQ,EAAE,MAAM,CAAC;IAEjB,qCAAqC;IACrC,eAAe,EAAE,MAAM,CAAC;CACzB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,kCAAkC;IAClC,KAAK,EAAE,SAAS,CAAC;IAEjB,iCAAiC;IACjC,KAAK,EAAE,MAAM,CAAC;IAEd,kCAAkC;IAClC,SAAS,EAAE,MAAM,CAAC;IAElB,8BAA8B;IAC9B,OAAO,EAAE,UAAU,EAAE,CAAC;IAEtB,yBAAyB;IACzB,OAAO,EAAE,WAAW,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,kDAAkD;IAClD,QAAQ,EAAE,MAAM,CAAC;IAEjB,uBAAuB;IACvB,KAAK,EAAE,MAAM,CAAC;IAEd,sDAAsD;IACtD,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,6CAA6C;IAC7C,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,wBAAwB;IACxB,SAAS,EAAE,MAAM,CAAC;IAElB,4DAA4D;IAC5D,KAAK,EAAE,MAAM,CAAC;IAEd,wBAAwB;IACxB,GAAG,EAAE,MAAM,CAAC;IAEZ,gDAAgD;IAChD,OAAO,EAAE,MAAM,CAAC;IAEhB,kCAAkC;IAClC,UAAU,CAAC,EAAE,MAAM,CAAC;I
AEpB,oBAAoB;IACpB,MAAM,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IAEpC,yCAAyC;IACzC,QAAQ,EAAE,MAAM,CAAC;IAEjB,0BAA0B;IAC1B,KAAK,EAAE,WAAW,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,IAAI;IACnB,6BAA6B;IAC7B,KAAK,EAAE,MAAM,CAAC;IAEd,uCAAuC;IACvC,KAAK,EAAE,MAAM,CAAC;IAEd,wBAAwB;IACxB,GAAG,EAAE,MAAM,CAAC;IAEZ,yBAAyB;IACzB,OAAO,EAAE,MAAM,CAAC;IAEhB,uBAAuB;IACvB,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,oBAAoB;IACpB,MAAM,EAAE,SAAS,GAAG,MAAM,GAAG,MAAM,CAAC;IAEpC,wBAAwB;IACxB,QAAQ,EAAE,MAAM,CAAC;IAEjB,iDAAiD;IACjD,SAAS,EAAE,WAAW,GAAG,KAAK,CAAC;IAE/B,kCAAkC;IAClC,UAAU,EAAE,MAAM,CAAC;IAEnB,qCAAqC;IACrC,aAAa,EAAE,MAAM,CAAC;IAEtB,gBAAgB;IAChB,IAAI,EAAE,OAAO,CAAC;IAEd,mBAAmB;IACnB,OAAO,EAAE,OAAO,CAAC;CAClB"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/types.ts"],"names":[],"mappings":"AAAA;;;;GAIG"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Package metadata utilities.
|
|
3
|
+
*
|
|
4
|
+
* @module utils/package
|
|
5
|
+
*/
|
|
6
|
+
/**
|
|
7
|
+
* Get package.json contents.
|
|
8
|
+
*
|
|
9
|
+
* @returns Package.json as an object
|
|
10
|
+
*/
|
|
11
|
+
export declare function getPackageJson(): {
|
|
12
|
+
version: string;
|
|
13
|
+
name: string;
|
|
14
|
+
[key: string]: unknown;
|
|
15
|
+
};
|
|
16
|
+
//# sourceMappingURL=package.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"package.d.ts","sourceRoot":"","sources":["../../../src/utils/package.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAMH;;;;GAIG;AACH,wBAAgB,cAAc,IAAI;IAAE,OAAO,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAC;IAAC,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAA;CAAE,CAe1F"}
|