@gnsx/genesys.agent.eval 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -0
- package/dist/src/adapters/anthropic-adapter.d.ts +24 -0
- package/dist/src/adapters/anthropic-adapter.d.ts.map +1 -0
- package/dist/src/adapters/anthropic-adapter.js +80 -0
- package/dist/src/adapters/anthropic-adapter.js.map +1 -0
- package/dist/src/adapters/gemini-adapter.d.ts +23 -0
- package/dist/src/adapters/gemini-adapter.d.ts.map +1 -0
- package/dist/src/adapters/gemini-adapter.js +79 -0
- package/dist/src/adapters/gemini-adapter.js.map +1 -0
- package/dist/src/adapters/ollama-adapter.d.ts +28 -0
- package/dist/src/adapters/ollama-adapter.d.ts.map +1 -0
- package/dist/src/adapters/ollama-adapter.js +54 -0
- package/dist/src/adapters/ollama-adapter.js.map +1 -0
- package/dist/src/adapters/openai-adapter.d.ts +24 -0
- package/dist/src/adapters/openai-adapter.d.ts.map +1 -0
- package/dist/src/adapters/openai-adapter.js +80 -0
- package/dist/src/adapters/openai-adapter.js.map +1 -0
- package/dist/src/adapters/pi-adapter.d.ts +27 -0
- package/dist/src/adapters/pi-adapter.d.ts.map +1 -0
- package/dist/src/adapters/pi-adapter.js +136 -0
- package/dist/src/adapters/pi-adapter.js.map +1 -0
- package/dist/src/agent-adapter.d.ts +130 -0
- package/dist/src/agent-adapter.d.ts.map +1 -0
- package/dist/src/agent-adapter.js +134 -0
- package/dist/src/agent-adapter.js.map +1 -0
- package/dist/src/args.d.ts +22 -0
- package/dist/src/args.d.ts.map +1 -0
- package/dist/src/args.js +224 -0
- package/dist/src/args.js.map +1 -0
- package/dist/src/cli-runner.d.ts +39 -0
- package/dist/src/cli-runner.d.ts.map +1 -0
- package/dist/src/cli-runner.js +105 -0
- package/dist/src/cli-runner.js.map +1 -0
- package/dist/src/embedding-judge.d.ts +93 -0
- package/dist/src/embedding-judge.d.ts.map +1 -0
- package/dist/src/embedding-judge.js +160 -0
- package/dist/src/embedding-judge.js.map +1 -0
- package/dist/src/index.d.ts +15 -0
- package/dist/src/index.d.ts.map +1 -0
- package/dist/src/index.js +20 -0
- package/dist/src/index.js.map +1 -0
- package/dist/src/judge.d.ts +95 -0
- package/dist/src/judge.d.ts.map +1 -0
- package/dist/src/judge.js +189 -0
- package/dist/src/judge.js.map +1 -0
- package/dist/src/launcher.d.ts +9 -0
- package/dist/src/launcher.d.ts.map +1 -0
- package/dist/src/launcher.js +129 -0
- package/dist/src/launcher.js.map +1 -0
- package/dist/src/reporter.d.ts +86 -0
- package/dist/src/reporter.d.ts.map +1 -0
- package/dist/src/reporter.js +384 -0
- package/dist/src/reporter.js.map +1 -0
- package/dist/src/runner.d.ts +75 -0
- package/dist/src/runner.d.ts.map +1 -0
- package/dist/src/runner.js +165 -0
- package/dist/src/runner.js.map +1 -0
- package/dist/src/test-loader.d.ts +66 -0
- package/dist/src/test-loader.d.ts.map +1 -0
- package/dist/src/test-loader.js +140 -0
- package/dist/src/test-loader.js.map +1 -0
- package/dist/src/types.d.ts +161 -0
- package/dist/src/types.d.ts.map +1 -0
- package/dist/src/types.js +7 -0
- package/dist/src/types.js.map +1 -0
- package/dist/src/utils/package.d.ts +16 -0
- package/dist/src/utils/package.d.ts.map +1 -0
- package/dist/src/utils/package.js +30 -0
- package/dist/src/utils/package.js.map +1 -0
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/examples/basic-tests.yaml +22 -0
- package/package.json +41 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cli-runner.js","sourceRoot":"","sources":["../../src/cli-runner.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAe3C;;GAEG;AACH,MAAM,OAAO,QAAS,SAAQ,KAAK;IAGf;IACA;IACA;IAJlB,YACE,OAAe,EACC,OAAe,EACf,QAAgB,EAChB,MAAc;QAE9B,KAAK,CAAC,OAAO,CAAC,CAAC;QAJC,YAAO,GAAP,OAAO,CAAQ;QACf,aAAQ,GAAR,QAAQ,CAAQ;QAChB,WAAM,GAAN,MAAM,CAAQ;QAG9B,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC;IACzB,CAAC;CACF;AAED;;;;;;;;GAQG;AACH,SAAS,iBAAiB,CAAC,YAAoB;IAC7C,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,EAAE,CAAC;IAEpC,sCAAsC;IACtC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACvB,CAAC;IAED,4FAA4F;IAC5F,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACrB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAE5B,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;AACrB,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,KAAa,EACb,MAAc,EACd,OAAmB;IAEnB,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,iHAAiH;IAEjH,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,IAAI,MAAM,GAAG,EAAE,CAAC;QAChB,IAAI,MAAM,GAAG,EAAE,CAAC;QAEhB,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;QAEhD,2EAA2E;QAC3E,MAAM,iBAAiB,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAC7C,MAAM,QAAQ,GAAG,iBAAiB,IAAI,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC;QAEnE,oEAAoE;QACpE,MAAM,SAAS,GAAG,CAAC,GAAG,OAAO,EAAE,IAAI,CAAC,CAAC;QAErC,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,EAAE,SAAS,EAAE;YAClC,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;YAC/B,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE;YACvB,KAAK,EAAE,QAAQ;SAChB,CAAC,CAAC;QAEH,qCAAqC;QACrC,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YACnC,KAAK,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;QACpB,CAAC;QAED,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE;YACxC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE;YACxC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAA
C;QAC5B,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE;YAC9B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtB,MAAM,CAAC,IAAI,QAAQ,CACjB,2BAA2B,OAAO,CAAC,OAAO,IAAI,EAC9C,GAAG,KAAK,KAAK,EACb,CAAC,CAAC,EACF,MAAM,CACP,CAAC,CAAC;QACL,CAAC,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAEpB,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;YACtB,MAAM,CAAC,IAAI,QAAQ,CACjB,mBAAmB,KAAK,KAAK,KAAK,CAAC,OAAO,mDAAmD,EAC7F,GAAG,KAAK,KAAK,EACb,CAAC,CAAC,EACF,MAAM,CACP,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YACzB,YAAY,CAAC,OAAO,CAAC,CAAC;YACtB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAE1C,OAAO,CAAC;gBACN,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;gBACrB,QAAQ,EAAE,IAAI,IAAI,CAAC;gBACnB,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;gBACrB,UAAU;aACX,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding-based cosine similarity judge using an actual embedding model.
|
|
3
|
+
*
|
|
4
|
+
* Uses @huggingface/transformers with a lightweight local model for semantic similarity.
|
|
5
|
+
*
|
|
6
|
+
* @module embedding-judge
|
|
7
|
+
*/
|
|
8
|
+
import type { TestCase } from './types.js';
|
|
9
|
+
/**
 * Outcome of judging one agent output against a test case.
 */
export interface JudgeResult {
    /** Score in the range 0 to 1. */
    score: number;
    /** Explanation of why the score was given. */
    reasoning: string;
    /** True when the test passed, i.e. score >= threshold. */
    passed: boolean;
}
|
|
20
|
+
/**
 * Options accepted by the embedding judge.
 */
export interface EmbeddingJudgeConfig {
    /** Minimum score (0-1) needed to pass. Default: 0.7. */
    passThreshold?: number;
    /** Embedding model identifier. Default: Xenova/all-MiniLM-L6-v2 via Hugging Face. */
    model?: string;
}
|
|
29
|
+
/**
 * Judge that scores agent outputs by embedding cosine similarity.
 *
 * A local transformer model provides the semantic comparison; the model is
 * downloaded on first use and cached locally.
 */
export declare class EmbeddingJudge {
    private _config;
    private _pipeline;
    private _modelLoading;
    constructor(config?: EmbeddingJudgeConfig);
    /**
     * Returns the embedding pipeline, lazily loading the model on first use.
     */
    private getPipeline;
    /**
     * Produces the embedding for a piece of text.
     *
     * @param text - Text to embed
     * @returns Embedding vector
     */
    private generateEmbedding;
    /**
     * Computes the cosine similarity of two vectors.
     */
    private cosineSimilarity;
    /**
     * Scores the actual output against the given test case.
     *
     * @param test - The test case
     * @param actualOutput - The actual output from the agent
     * @returns The judge result with score and reasoning
     */
    evaluate(test: TestCase, actualOutput: string): Promise<JudgeResult>;
    /**
     * Builds a judge function in the shape the TestRunner accepts.
     *
     * @returns A function that can be passed to the runner
     */
    createEvaluator(): (test: TestCase, actualOutput: string) => Promise<{
        score: number;
        reasoning: string;
        passed: boolean;
    }>;
    /**
     * The judge configuration.
     */
    get config(): EmbeddingJudgeConfig;
}
|
|
79
|
+
/**
 * Factory for an embedding judge.
 *
 * @param config - Judge configuration
 * @returns A new EmbeddingJudge instance
 *
 * @example
 * ```typescript
 * const judge = createEmbeddingJudge({ passThreshold: 0.75 });
 * const result = await judge.evaluate(test, output);
 * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
 * ```
 */
export declare function createEmbeddingJudge(config?: EmbeddingJudgeConfig): EmbeddingJudge;
|
|
93
|
+
//# sourceMappingURL=embedding-judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedding-judge.d.ts","sourceRoot":"","sources":["../../src/embedding-judge.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAE3C;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,wBAAwB;IACxB,KAAK,EAAE,MAAM,CAAC;IAEd,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAElB,mDAAmD;IACnD,MAAM,EAAE,OAAO,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,sDAAsD;IACtD,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,iFAAiF;IACjF,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;GAKG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAiC;IAChD,OAAO,CAAC,SAAS,CAA0C;IAC3D,OAAO,CAAC,aAAa,CAAmD;gBAE5D,MAAM,GAAE,oBAAyB;IAQ7C;;;OAGG;YACW,WAAW;IAoBzB;;;;;OAKG;YACW,iBAAiB;IAY/B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAqBxB;;;;;;OAMG;IACG,QAAQ,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAwC1E;;;;OAIG;IACH,eAAe,IAAI,CACjB,IAAI,EAAE,QAAQ,EACd,YAAY,EAAE,MAAM,KACjB,OAAO,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;IAWF;;OAEG;IACH,IAAI,MAAM,IAAI,oBAAoB,CAEjC;CACF;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,CAAC,EAAE,oBAAoB,GAAG,cAAc,CAElF"}
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding-based cosine similarity judge using an actual embedding model.
|
|
3
|
+
*
|
|
4
|
+
* Uses @huggingface/transformers with a lightweight local model for semantic similarity.
|
|
5
|
+
*
|
|
6
|
+
* @module embedding-judge
|
|
7
|
+
*/
|
|
8
|
+
import { pipeline } from '@huggingface/transformers';
|
|
9
|
+
/**
 * Judge that scores agent outputs by embedding cosine similarity.
 *
 * The expected and actual texts are embedded with a 'feature-extraction'
 * pipeline and compared by cosine similarity. The pipeline is created lazily
 * on first use and reused for every later evaluation.
 */
export class EmbeddingJudge {
    _config;
    _pipeline = null;
    _modelLoading = null;

    /**
     * @param config - Optional overrides. Defaults are passThreshold 0.7 and
     *   model 'Xenova/all-MiniLM-L6-v2'.
     */
    constructor(config = {}) {
        this._config = {
            passThreshold: 0.7,
            model: 'Xenova/all-MiniLM-L6-v2',
            ...config,
        };
    }

    /**
     * Get or create the embedding pipeline.
     *
     * Concurrent callers share one in-flight load. When that load fails, the
     * cached promise is discarded so a later call can try again instead of
     * receiving the same rejection on every subsequent evaluation.
     *
     * @returns The loaded pipeline
     * @throws The error the pipeline factory rejected with
     */
    async getPipeline() {
        if (this._pipeline) {
            return this._pipeline;
        }
        if (!this._modelLoading) {
            this._modelLoading = pipeline('feature-extraction', this._config.model);
        }
        const loading = this._modelLoading;
        try {
            this._pipeline = await loading;
        }
        catch (error) {
            // Clear only the promise this call waited on; a newer retry may
            // already have replaced it.
            if (this._modelLoading === loading) {
                this._modelLoading = null;
            }
            throw error;
        }
        return this._pipeline;
    }

    /**
     * Generate the embedding for a piece of text.
     *
     * @param text - Text to embed
     * @returns Embedding vector as a plain array
     */
    async generateEmbedding(text) {
        const pipe = await this.getPipeline();
        const output = await pipe(text, {
            pooling: 'mean',
            normalize: true,
        });
        // output.data is an array-like; copy it into a plain array.
        return Array.from(output.data);
    }

    /**
     * Calculate the cosine similarity between two vectors, clamped to [0, 1].
     *
     * @param a - First vector
     * @param b - Second vector
     * @returns 0 when either vector has zero magnitude, otherwise the clamped
     *   cosine similarity
     * @throws Error when the vectors have different lengths (the result would
     *   otherwise silently become NaN)
     */
    cosineSimilarity(a, b) {
        if (a.length !== b.length) {
            throw new Error(`Cannot compare embeddings of different dimensions: ${a.length} vs ${b.length}`);
        }
        let dotProduct = 0;
        let normA = 0;
        let normB = 0;
        for (let i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        // Embeddings are requested normalized, but divide by the magnitudes
        // anyway so un-normalized input is still handled.
        const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
        if (magnitude === 0) {
            return 0;
        }
        return Math.max(0, Math.min(1, dotProduct / magnitude));
    }

    /**
     * Evaluate a test case against the actual output.
     *
     * Never rejects: any error raised while embedding or comparing is reported
     * as a failed result with score 0.
     *
     * @param test - The test case; its expectedOutput is embedded
     * @param actualOutput - The actual output from the agent
     * @returns The judge result with score, reasoning and pass flag
     */
    async evaluate(test, actualOutput) {
        try {
            const expectedEmbedding = await this.generateEmbedding(test.expectedOutput);
            const actualEmbedding = await this.generateEmbedding(actualOutput);
            const score = this.cosineSimilarity(expectedEmbedding, actualEmbedding);
            // Fall back to the default when the caller passed an explicit
            // `passThreshold: undefined`, which overrides the constructor default.
            const threshold = this._config.passThreshold ?? 0.7;
            let reasoning;
            if (score >= 0.9) {
                reasoning = 'Very high semantic similarity - output closely matches expected content.';
            }
            else if (score >= 0.75) {
                reasoning = 'Good semantic similarity with minor differences in meaning or detail.';
            }
            else if (score >= threshold) {
                reasoning = 'Moderate similarity - core concepts match but notable differences exist.';
            }
            else if (score >= 0.4) {
                reasoning = 'Low semantic similarity - significant differences in meaning.';
            }
            else {
                reasoning = 'Very low similarity - output does not match expected content.';
            }
            const passed = score >= threshold;
            return {
                score,
                reasoning,
                passed,
            };
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                score: 0,
                reasoning: `Embedding evaluation failed: ${errorMessage}`,
                passed: false,
            };
        }
    }

    /**
     * Create a judge function compatible with the TestRunner.
     *
     * @returns An async function (test, actualOutput) that resolves to
     *   { score, reasoning, passed }
     */
    createEvaluator() {
        return async (test, actualOutput) => {
            const result = await this.evaluate(test, actualOutput);
            return {
                score: result.score,
                reasoning: result.reasoning,
                passed: result.passed,
            };
        };
    }

    /**
     * Get the judge configuration.
     */
    get config() {
        return this._config;
    }
}
|
|
144
|
+
/**
 * Factory for an embedding judge.
 *
 * @param config - Judge configuration
 * @returns A new EmbeddingJudge instance
 *
 * @example
 * ```typescript
 * const judge = createEmbeddingJudge({ passThreshold: 0.75 });
 * const result = await judge.evaluate(test, output);
 * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
 * ```
 */
export function createEmbeddingJudge(config) {
    const judge = new EmbeddingJudge(config);
    return judge;
}
|
|
160
|
+
//# sourceMappingURL=embedding-judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"embedding-judge.js","sourceRoot":"","sources":["../../src/embedding-judge.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAkC,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AA6BrF;;;;;GAKG;AACH,MAAM,OAAO,cAAc;IACjB,OAAO,CAAiC;IACxC,SAAS,GAAqC,IAAI,CAAC;IACnD,aAAa,GAA8C,IAAI,CAAC;IAExE,YAAY,SAA+B,EAAE;QAC3C,IAAI,CAAC,OAAO,GAAG;YACb,aAAa,EAAE,GAAG;YAClB,KAAK,EAAE,yBAAyB;YAChC,GAAG,MAAM;SACV,CAAC;IACJ,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,WAAW;QACvB,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACnB,OAAO,IAAI,CAAC,SAAS,CAAC;QACxB,CAAC;QAED,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC,aAAa,CAAC;QAC5B,CAAC;QAGD,IAAI,CAAC,aAAa,GAAG,QAAQ,CAC3B,oBAAoB,EACpB,IAAI,CAAC,OAAO,CAAC,KAAK,CAE8B,CAAC;QAEnD,IAAI,CAAC,SAAS,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC;QAC1C,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED;;;;;OAKG;IACK,KAAK,CAAC,iBAAiB,CAAC,IAAY;QAC1C,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;QAEtC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE;YAC9B,OAAO,EAAE,MAAM;YACf,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,mBAAmB;QACnB,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAoB,CAAC,CAAC;IACjD,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,CAAW,EAAE,CAAW;QAC/C,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClC,UAAU,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,6DAA6D;QAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAEtD,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;YACpB,OAAO,CAAC,CAAC;QACX,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,GAAG,SAAS,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAc,EAAE,YAAoB;QACjD,IAAI,CAAC;YACH,0DAA0D;YAC1D,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAC5E,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;YAEnE,uB
AAuB;YACvB,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,iBAAiB,EAAE,eAAe,CAAC,CAAC;YAExE,qBAAqB;YACrB,IAAI,SAAiB,CAAC;YACtB,IAAI,KAAK,IAAI,GAAG,EAAE,CAAC;gBACjB,SAAS,GAAG,0EAA0E,CAAC;YACzF,CAAC;iBAAM,IAAI,KAAK,IAAI,IAAI,EAAE,CAAC;gBACzB,SAAS,GAAG,uEAAuE,CAAC;YACtF,CAAC;iBAAM,IAAI,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC/C,SAAS,GAAG,0EAA0E,CAAC;YACzF,CAAC;iBAAM,IAAI,KAAK,IAAI,GAAG,EAAE,CAAC;gBACxB,SAAS,GAAG,+DAA+D,CAAC;YAC9E,CAAC;iBAAM,CAAC;gBACN,SAAS,GAAG,+DAA+D,CAAC;YAC9E,CAAC;YAED,MAAM,MAAM,GAAG,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC;YAEnD,OAAO;gBACL,KAAK;gBACL,SAAS;gBACT,MAAM;aACP,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,OAAO;gBACL,KAAK,EAAE,CAAC;gBACR,SAAS,EAAE,gCAAgC,YAAY,EAAE;gBACzD,MAAM,EAAE,KAAK;aACd,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,eAAe;QAQb,OAAO,KAAK,EAAE,IAAI,EAAE,YAAY,EAAE,EAAE;YAClC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;YACvD,OAAO;gBACL,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAC;QACJ,CAAC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,oBAAoB,CAAC,MAA6B;IAChE,OAAO,IAAI,cAAc,CAAC,MAAM,CAAC,CAAC;AACpC,CAAC"}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Genesys Agent Eval Harness
|
|
3
|
+
*
|
|
4
|
+
* A framework for evaluating AI agents (pi and genesys) against test suites defined in YAML files.
|
|
5
|
+
*
|
|
6
|
+
* @packageDocumentation
|
|
7
|
+
*/
|
|
8
|
+
export type { AgentResponse, Args, EvalResults, EvalSummary, JudgeConfig, RunnerConfig, TestCase, TestResult, TestSuite, } from './types.js';
|
|
9
|
+
export { CLIError, runAgent, type RunOptions, } from './cli-runner.js';
|
|
10
|
+
export { createEmbeddingJudge, EmbeddingJudge, type EmbeddingJudgeConfig, type JudgeResult as EmbeddingJudgeResult, } from './embedding-judge.js';
|
|
11
|
+
export { createJudge, evaluate, Judge, type JudgeResult as LLMJudgeResult, } from './judge.js';
|
|
12
|
+
export { formatResults, type OutputFormat, Reporter, type ReporterConfig, reportResults, } from './reporter.js';
|
|
13
|
+
export { runEvaluation, type ProgressCallback, TestRunner, } from './runner.js';
|
|
14
|
+
export { loadTestSuite, type LoadResult, parseTestSuite, TestLoadError, TestValidationError, validateTestSuite, } from './test-loader.js';
|
|
15
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,YAAY,EACV,aAAa,EACb,IAAI,EACJ,WAAW,EACX,WAAW,EACX,WAAW,EACX,YAAY,EACZ,QAAQ,EACR,UAAU,EACV,SAAS,GACV,MAAM,YAAY,CAAC;AAGpB,OAAO,EACL,QAAQ,EACR,QAAQ,EACR,KAAK,UAAU,GAChB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,oBAAoB,EACpB,cAAc,EACd,KAAK,oBAAoB,EACzB,KAAK,WAAW,IAAI,oBAAoB,GACzC,MAAM,sBAAsB,CAAC;AAG9B,OAAO,EACL,WAAW,EACX,QAAQ,EACR,KAAK,EACL,KAAK,WAAW,IAAI,cAAc,GACnC,MAAM,YAAY,CAAC;AAGpB,OAAO,EACL,aAAa,EACb,KAAK,YAAY,EACjB,QAAQ,EACR,KAAK,cAAc,EACnB,aAAa,GACd,MAAM,eAAe,CAAC;AAGvB,OAAO,EACL,aAAa,EACb,KAAK,gBAAgB,EACrB,UAAU,GACX,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,aAAa,EACb,KAAK,UAAU,EACf,cAAc,EACd,aAAa,EACb,mBAAmB,EACnB,iBAAiB,GAClB,MAAM,kBAAkB,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Genesys Agent Eval Harness
|
|
3
|
+
*
|
|
4
|
+
* A framework for evaluating AI agents (pi and genesys) against test suites defined in YAML files.
|
|
5
|
+
*
|
|
6
|
+
* @packageDocumentation
|
|
7
|
+
*/
|
|
8
|
+
// Export CLI runner functionality
|
|
9
|
+
export { CLIError, runAgent, } from './cli-runner.js';
|
|
10
|
+
// Export embedding judge functionality
|
|
11
|
+
export { createEmbeddingJudge, EmbeddingJudge, } from './embedding-judge.js';
|
|
12
|
+
// Export LLM judge functionality
|
|
13
|
+
export { createJudge, evaluate, Judge, } from './judge.js';
|
|
14
|
+
// Export reporter functionality
|
|
15
|
+
export { formatResults, Reporter, reportResults, } from './reporter.js';
|
|
16
|
+
// Export runner functionality
|
|
17
|
+
export { runEvaluation, TestRunner, } from './runner.js';
|
|
18
|
+
// Export test loader functionality
|
|
19
|
+
export { loadTestSuite, parseTestSuite, TestLoadError, TestValidationError, validateTestSuite, } from './test-loader.js';
|
|
20
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAeH,kCAAkC;AAClC,OAAO,EACL,QAAQ,EACR,QAAQ,GAET,MAAM,iBAAiB,CAAC;AAEzB,uCAAuC;AACvC,OAAO,EACL,oBAAoB,EACpB,cAAc,GAGf,MAAM,sBAAsB,CAAC;AAE9B,iCAAiC;AACjC,OAAO,EACL,WAAW,EACX,QAAQ,EACR,KAAK,GAEN,MAAM,YAAY,CAAC;AAEpB,gCAAgC;AAChC,OAAO,EACL,aAAa,EAEb,QAAQ,EAER,aAAa,GACd,MAAM,eAAe,CAAC;AAEvB,8BAA8B;AAC9B,OAAO,EACL,aAAa,EAEb,UAAU,GACX,MAAM,aAAa,CAAC;AAErB,mCAAmC;AACnC,OAAO,EACL,aAAa,EAEb,cAAc,EACd,aAAa,EACb,mBAAmB,EACnB,iBAAiB,GAClB,MAAM,kBAAkB,CAAC"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-based judge for evaluating agent outputs against expected results.
|
|
3
|
+
*
|
|
4
|
+
* The judge uses a separate LLM to score how well the agent's output matches
|
|
5
|
+
* the expected output description on a scale of 0-1.
|
|
6
|
+
*
|
|
7
|
+
* @module judge
|
|
8
|
+
*/
|
|
9
|
+
import type { JudgeConfig, TestCase } from './types.js';
|
|
10
|
+
/**
 * Outcome of judging one agent output against a test case.
 */
export interface JudgeResult {
    /** Score in the range 0 to 1. */
    score: number;
    /** Explanation of why the score was given. */
    reasoning: string;
    /** True when the test passed, i.e. score >= threshold. */
    passed: boolean;
}
|
|
21
|
+
/**
 * Judge that scores agent outputs with an LLM.
 *
 * A separate LLM is prompted to compare the actual output with the expected
 * output description and to score the match on a scale of 0-1.
 */
export declare class Judge {
    private _config;
    constructor(config: JudgeConfig);
    /**
     * Builds the prompt sent to the judge LLM.
     *
     * @param test - The test case
     * @param actualOutput - The actual output from the agent
     * @returns The prompt to send to the judge LLM
     */
    private buildPrompt;
    /**
     * Resolves the model instance for the configured provider.
     *
     * @returns Model instance for the Vercel AI SDK
     */
    private getModel;
    /**
     * Scores the actual output against the given test case.
     *
     * @param test - The test case
     * @param actualOutput - The actual output from the agent
     * @returns The judge result with score and reasoning
     */
    evaluate(test: TestCase, actualOutput: string): Promise<JudgeResult>;
    /**
     * Builds a judge function in the shape the TestRunner accepts.
     *
     * @returns A function that can be passed to the runner
     */
    createEvaluator(): (test: TestCase, actualOutput: string) => Promise<{
        score: number;
        reasoning: string;
        passed: boolean;
    }>;
    /**
     * The judge configuration.
     */
    get config(): JudgeConfig;
}
|
|
67
|
+
/**
 * Factory for an LLM judge.
 *
 * @param config - Judge configuration
 * @returns A new Judge instance
 *
 * @example
 * ```typescript
 * const judge = createJudge({
 *   provider: 'anthropic',
 *   model: 'claude-3-5-sonnet-20241022',
 *   passThreshold: 0.8
 * });
 *
 * const result = await judge.evaluate(test, output);
 * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
 * ```
 */
export declare function createJudge(config: JudgeConfig): Judge;
|
|
86
|
+
/**
 * Convenience helper for a single, one-off judgment.
 *
 * @param config - Judge configuration
 * @param test - Test case
 * @param actualOutput - Actual output to evaluate
 * @returns Judge result
 */
export declare function evaluate(config: JudgeConfig, test: TestCase, actualOutput: string): Promise<JudgeResult>;
|
|
95
|
+
//# sourceMappingURL=judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../src/judge.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAQH,OAAO,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAExD;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,wBAAwB;IACxB,KAAK,EAAE,MAAM,CAAC;IAEd,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAElB,mDAAmD;IACnD,MAAM,EAAE,OAAO,CAAC;CACjB;AAeD;;;;;GAKG;AACH,qBAAa,KAAK;IAChB,OAAO,CAAC,OAAO,CAAc;gBAEjB,MAAM,EAAE,WAAW;IAQ/B;;;;;;OAMG;IACH,OAAO,CAAC,WAAW;IA+BnB;;;;OAIG;IACH,OAAO,CAAC,QAAQ;IAoBhB;;;;;;OAMG;IACG,QAAQ,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAkC1E;;;;OAIG;IACH,eAAe,IAAI,CACjB,IAAI,EAAE,QAAQ,EACd,YAAY,EAAE,MAAM,KACjB,OAAO,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;IAWF;;OAEG;IACH,IAAI,MAAM,IAAI,WAAW,CAExB;CACF;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,WAAW,GAAG,KAAK,CAEtD;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,MAAM,EAAE,WAAW,EACnB,IAAI,EAAE,QAAQ,EACd,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,WAAW,CAAC,CAGtB"}
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-based judge for evaluating agent outputs against expected results.
|
|
3
|
+
*
|
|
4
|
+
* The judge uses a separate LLM to score how well the agent's output matches
|
|
5
|
+
* the expected output description on a scale of 0-1.
|
|
6
|
+
*
|
|
7
|
+
* @module judge
|
|
8
|
+
*/
|
|
9
|
+
import { anthropic } from '@ai-sdk/anthropic';
|
|
10
|
+
import { google } from '@ai-sdk/google';
|
|
11
|
+
import { openai } from '@ai-sdk/openai';
|
|
12
|
+
import { generateObject } from 'ai';
|
|
13
|
+
import { z } from 'zod';
|
|
14
|
+
// Field schemas for the judge's structured output, named separately for readability.
const judgeScoreSchema = z.number().min(0).max(1).describe('Score from 0 to 1 where 1 is perfect');
const judgeReasoningSchema = z.string().describe('Explanation for the score');
/**
 * Zod schema for the structured object the judge LLM must return:
 * a numeric score bounded to [0, 1] and a free-text reasoning string.
 */
const judgeOutputSchema = z.object({
    score: judgeScoreSchema,
    reasoning: judgeReasoningSchema,
});
|
|
21
|
+
/**
|
|
22
|
+
* Judge that evaluates agent outputs using an LLM.
|
|
23
|
+
*
|
|
24
|
+
* The judge prompts a separate LLM to compare the actual output against the
|
|
25
|
+
* expected output description and score it on a scale of 0-1.
|
|
26
|
+
*/
|
|
27
|
+
export class Judge {
    /** Effective configuration: caller-supplied settings layered over the defaults. */
    _config;
    /**
     * Create a judge.
     *
     * @param config - Judge settings. `provider` and `model` are read by
     *   `getModel()`; `passThreshold` falls back to 0.7 and `temperature` to 0
     *   when they are missing or nullish.
     */
    constructor(config) {
        const merged = {
            passThreshold: 0.7,
            temperature: 0,
            ...config,
        };
        // Object spread also copies keys whose value is explicitly `undefined`
        // (e.g. an unset CLI option forwarded as-is), which would silently wipe
        // out the defaults above. Re-apply them for nullish values.
        merged.passThreshold = merged.passThreshold ?? 0.7;
        merged.temperature = merged.temperature ?? 0;
        this._config = merged;
    }
    /**
     * Build the judge prompt.
     *
     * The "Additional Context" section is emitted only when `test.context`
     * is truthy.
     *
     * @param test - The test case (`input`, `expectedOutput`, optional `context`)
     * @param actualOutput - The actual output from the agent
     * @returns The prompt to send to the judge LLM
     */
    buildPrompt(test, actualOutput) {
        return `You are an expert evaluator assessing the quality of AI responses.

Your task is to evaluate how well the ACTUAL OUTPUT matches the EXPECTED OUTPUT description.

## Test Input
${test.input}

## Expected Output Description
${test.expectedOutput}

${test.context ? `## Additional Context\n${test.context}\n\n` : ''}## Actual Output
${actualOutput}

## Evaluation Instructions

1. Carefully read the expected output description and the actual output
2. Score the actual output on a scale of 0.0 to 1.0 where:
- 1.0 = Perfect match, fully satisfies the expected output
- 0.8-0.9 = Good match, minor issues or omissions
- 0.6-0.7 = Partial match, significant issues but some correct elements
- 0.4-0.5 = Poor match, mostly incorrect or incomplete
- 0.0-0.3 = Very poor match, completely wrong or irrelevant

3. Provide clear reasoning for your score

Respond with a structured object containing:
- score: number from 0 to 1
- reasoning: string explaining your evaluation`;
    }
    /**
     * Get the model instance based on provider.
     *
     * `'google'` and `'gemini'` are treated as the same provider.
     *
     * @returns Model instance for the Vercel AI SDK
     * @throws {Error} If the configured provider is not one of
     *   `anthropic`, `openai`, `google` or `gemini`
     */
    getModel() {
        const { provider, model } = this._config;
        switch (provider) {
            case 'anthropic': {
                return anthropic(model);
            }
            case 'openai': {
                return openai(model);
            }
            case 'google':
            case 'gemini': {
                return google(model);
            }
            default: {
                throw new Error(`Unsupported judge provider: ${provider}`);
            }
        }
    }
    /**
     * Evaluate a test case against the actual output.
     *
     * Any error raised while resolving the model or generating the structured
     * result is converted into a failed result (score 0) instead of being
     * rethrown. Errors thrown while building the prompt are not caught.
     *
     * @param test - The test case
     * @param actualOutput - The actual output from the agent
     * @returns The judge result with score and reasoning
     */
    async evaluate(test, actualOutput) {
        const prompt = this.buildPrompt(test, actualOutput);
        try {
            const { object } = await generateObject({
                model: this.getModel(),
                schema: judgeOutputSchema,
                messages: [
                    {
                        role: 'user',
                        content: prompt,
                    },
                ],
                temperature: this._config.temperature,
            });
            // The fallback stays because `config` hands out the live object,
            // so a caller may have cleared the threshold after construction.
            const passed = object.score >= (this._config.passThreshold ?? 0.7);
            return {
                score: object.score,
                reasoning: object.reasoning,
                passed,
            };
        }
        catch (error) {
            // If structured generation fails, return a failed result
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                score: 0,
                reasoning: `Judge evaluation failed: ${errorMessage}`,
                passed: false,
            };
        }
    }
    /**
     * Create a judge function compatible with the TestRunner.
     *
     * @returns An async `(test, actualOutput)` function resolving to
     *   `{ score, reasoning, passed }`
     */
    createEvaluator() {
        return async (test, actualOutput) => {
            const result = await this.evaluate(test, actualOutput);
            return {
                score: result.score,
                reasoning: result.reasoning,
                passed: result.passed,
            };
        };
    }
    /**
     * Get the judge configuration.
     *
     * Returns the internal object itself, not a copy.
     */
    get config() {
        return this._config;
    }
}
|
|
156
|
+
/**
 * Factory for {@link Judge}: builds a new instance from the given settings.
 *
 * @param config - Judge configuration, forwarded unchanged to the constructor
 * @returns A new Judge instance
 *
 * @example
 * ```typescript
 * const judge = createJudge({
 *   provider: 'anthropic',
 *   model: 'claude-3-5-sonnet-20241022',
 *   passThreshold: 0.8
 * });
 *
 * const result = await judge.evaluate(test, output);
 * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
 * ```
 */
export function createJudge(config) {
    const judge = new Judge(config);
    return judge;
}
|
|
177
|
+
/**
 * One-shot convenience wrapper: creates a judge from `config` and
 * immediately evaluates a single test case with it.
 *
 * @param config - Judge configuration
 * @param test - Test case
 * @param actualOutput - Actual output to evaluate
 * @returns Judge result
 */
export async function evaluate(config, test, actualOutput) {
    return createJudge(config).evaluate(test, actualOutput);
}
|
|
189
|
+
//# sourceMappingURL=judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.js","sourceRoot":"","sources":["../../src/judge.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAC9C,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AACpC,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAkBxB;;GAEG;AACH,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,sCAAsC,CAAC;IAChF,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,2BAA2B,CAAC;CAC5D,CAAC,CAAC;AAOH;;;;;GAKG;AACH,MAAM,OAAO,KAAK;IACR,OAAO,CAAc;IAE7B,YAAY,MAAmB;QAC7B,IAAI,CAAC,OAAO,GAAG;YACb,aAAa,EAAE,GAAG;YAClB,WAAW,EAAE,CAAC;YACd,GAAG,MAAM;SACV,CAAC;IACJ,CAAC;IAED;;;;;;OAMG;IACK,WAAW,CAAC,IAAc,EAAE,YAAoB;QACtD,OAAO;;;;;EAKT,IAAI,CAAC,KAAK;;;EAGV,IAAI,CAAC,cAAc;;EAEnB,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,IAAI,CAAC,OAAO,MAAM,CAAC,CAAC,CAAC,EAAE;EAChE,YAAY;;;;;;;;;;;;;;;;+CAgBiC,CAAC;IAC9C,CAAC;IAED;;;;OAIG;IACK,QAAQ;QACd,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC,OAAO,CAAC;QAEzC,QAAQ,QAAQ,EAAE,CAAC;YACjB,KAAK,WAAW,CAAC,CAAC,CAAC;gBACjB,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC;YAC1B,CAAC;YACD,KAAK,QAAQ,CAAC,CAAC,CAAC;gBACd,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC;YACvB,CAAC;YACD,KAAK,QAAQ,CAAC;YACd,KAAK,QAAQ,CAAC,CAAC,CAAC;gBACd,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC;YACvB,CAAC;YACD,OAAO,CAAC,CAAC,CAAC;gBACR,MAAM,IAAI,KAAK,CAAC,+BAA+B,QAAQ,EAAE,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAc,EAAE,YAAoB;QACjD,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;QAEpD,IAAI,CAAC;YACH,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,cAAc,CAAC;gBACtC,KAAK,EAAE,IAAI,CAAC,QAAQ,EAA8D;gBAClF,MAAM,EAAE,iBAAiB;gBACzB,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE,MAAM;qBAChB;iBACF;gBACD,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW;aACtC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,aAAa,IAAI,GAAG,CAAC,CAAC;YAEnE,OAAO;gBACL,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC
3B,MAAM;aACP,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,yDAAyD;YACzD,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,OAAO;gBACL,KAAK,EAAE,CAAC;gBACR,SAAS,EAAE,4BAA4B,YAAY,EAAE;gBACrD,MAAM,EAAE,KAAK;aACd,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,eAAe;QAQb,OAAO,KAAK,EAAE,IAAI,EAAE,YAAY,EAAE,EAAE;YAClC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;YACvD,OAAO;gBACL,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAC;QACJ,CAAC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,WAAW,CAAC,MAAmB;IAC7C,OAAO,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC;AAC3B,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,MAAmB,EACnB,IAAc,EACd,YAAoB;IAEpB,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;IAClC,OAAO,KAAK,CAAC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;AAC5C,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"launcher.d.ts","sourceRoot":"","sources":["../../src/launcher.ts"],"names":[],"mappings":";AACA;;;;;GAKG"}
|