@gnsx/genesys.agent.eval 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/README.md +9 -0
  2. package/dist/src/adapters/anthropic-adapter.d.ts +24 -0
  3. package/dist/src/adapters/anthropic-adapter.d.ts.map +1 -0
  4. package/dist/src/adapters/anthropic-adapter.js +80 -0
  5. package/dist/src/adapters/anthropic-adapter.js.map +1 -0
  6. package/dist/src/adapters/gemini-adapter.d.ts +23 -0
  7. package/dist/src/adapters/gemini-adapter.d.ts.map +1 -0
  8. package/dist/src/adapters/gemini-adapter.js +79 -0
  9. package/dist/src/adapters/gemini-adapter.js.map +1 -0
  10. package/dist/src/adapters/ollama-adapter.d.ts +28 -0
  11. package/dist/src/adapters/ollama-adapter.d.ts.map +1 -0
  12. package/dist/src/adapters/ollama-adapter.js +54 -0
  13. package/dist/src/adapters/ollama-adapter.js.map +1 -0
  14. package/dist/src/adapters/openai-adapter.d.ts +24 -0
  15. package/dist/src/adapters/openai-adapter.d.ts.map +1 -0
  16. package/dist/src/adapters/openai-adapter.js +80 -0
  17. package/dist/src/adapters/openai-adapter.js.map +1 -0
  18. package/dist/src/adapters/pi-adapter.d.ts +27 -0
  19. package/dist/src/adapters/pi-adapter.d.ts.map +1 -0
  20. package/dist/src/adapters/pi-adapter.js +136 -0
  21. package/dist/src/adapters/pi-adapter.js.map +1 -0
  22. package/dist/src/agent-adapter.d.ts +130 -0
  23. package/dist/src/agent-adapter.d.ts.map +1 -0
  24. package/dist/src/agent-adapter.js +134 -0
  25. package/dist/src/agent-adapter.js.map +1 -0
  26. package/dist/src/args.d.ts +22 -0
  27. package/dist/src/args.d.ts.map +1 -0
  28. package/dist/src/args.js +224 -0
  29. package/dist/src/args.js.map +1 -0
  30. package/dist/src/cli-runner.d.ts +39 -0
  31. package/dist/src/cli-runner.d.ts.map +1 -0
  32. package/dist/src/cli-runner.js +105 -0
  33. package/dist/src/cli-runner.js.map +1 -0
  34. package/dist/src/embedding-judge.d.ts +93 -0
  35. package/dist/src/embedding-judge.d.ts.map +1 -0
  36. package/dist/src/embedding-judge.js +160 -0
  37. package/dist/src/embedding-judge.js.map +1 -0
  38. package/dist/src/index.d.ts +15 -0
  39. package/dist/src/index.d.ts.map +1 -0
  40. package/dist/src/index.js +20 -0
  41. package/dist/src/index.js.map +1 -0
  42. package/dist/src/judge.d.ts +95 -0
  43. package/dist/src/judge.d.ts.map +1 -0
  44. package/dist/src/judge.js +189 -0
  45. package/dist/src/judge.js.map +1 -0
  46. package/dist/src/launcher.d.ts +9 -0
  47. package/dist/src/launcher.d.ts.map +1 -0
  48. package/dist/src/launcher.js +129 -0
  49. package/dist/src/launcher.js.map +1 -0
  50. package/dist/src/reporter.d.ts +86 -0
  51. package/dist/src/reporter.d.ts.map +1 -0
  52. package/dist/src/reporter.js +384 -0
  53. package/dist/src/reporter.js.map +1 -0
  54. package/dist/src/runner.d.ts +75 -0
  55. package/dist/src/runner.d.ts.map +1 -0
  56. package/dist/src/runner.js +165 -0
  57. package/dist/src/runner.js.map +1 -0
  58. package/dist/src/test-loader.d.ts +66 -0
  59. package/dist/src/test-loader.d.ts.map +1 -0
  60. package/dist/src/test-loader.js +140 -0
  61. package/dist/src/test-loader.js.map +1 -0
  62. package/dist/src/types.d.ts +161 -0
  63. package/dist/src/types.d.ts.map +1 -0
  64. package/dist/src/types.js +7 -0
  65. package/dist/src/types.js.map +1 -0
  66. package/dist/src/utils/package.d.ts +16 -0
  67. package/dist/src/utils/package.d.ts.map +1 -0
  68. package/dist/src/utils/package.js +30 -0
  69. package/dist/src/utils/package.js.map +1 -0
  70. package/dist/tsconfig.tsbuildinfo +1 -0
  71. package/examples/basic-tests.yaml +22 -0
  72. package/package.json +41 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli-runner.js","sourceRoot":"","sources":["../../src/cli-runner.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,KAAK,EAAE,MAAM,oBAAoB,CAAC;AAe3C;;GAEG;AACH,MAAM,OAAO,QAAS,SAAQ,KAAK;IAGf;IACA;IACA;IAJlB,YACE,OAAe,EACC,OAAe,EACf,QAAgB,EAChB,MAAc;QAE9B,KAAK,CAAC,OAAO,CAAC,CAAC;QAJC,YAAO,GAAP,OAAO,CAAQ;QACf,aAAQ,GAAR,QAAQ,CAAQ;QAChB,WAAM,GAAN,MAAM,CAAQ;QAG9B,IAAI,CAAC,IAAI,GAAG,UAAU,CAAC;IACzB,CAAC;CACF;AAED;;;;;;;;GAQG;AACH,SAAS,iBAAiB,CAAC,YAAoB;IAC7C,MAAM,OAAO,GAAG,YAAY,CAAC,IAAI,EAAE,CAAC;IAEpC,sCAAsC;IACtC,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,OAAO,CAAC,OAAO,EAAE,EAAE,CAAC,CAAC;IACvB,CAAC;IAED,4FAA4F;IAC5F,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;IACrB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IAE5B,OAAO,CAAC,GAAG,EAAE,IAAI,CAAC,CAAC;AACrB,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,KAAa,EACb,MAAc,EACd,OAAmB;IAEnB,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAE7B,iHAAiH;IAEjH,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,IAAI,MAAM,GAAG,EAAE,CAAC;QAChB,IAAI,MAAM,GAAG,EAAE,CAAC;QAEhB,MAAM,CAAC,GAAG,EAAE,OAAO,CAAC,GAAG,iBAAiB,CAAC,KAAK,CAAC,CAAC;QAEhD,2EAA2E;QAC3E,MAAM,iBAAiB,GAAG,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC;QAC7C,MAAM,QAAQ,GAAG,iBAAiB,IAAI,OAAO,CAAC,QAAQ,KAAK,OAAO,CAAC;QAEnE,oEAAoE;QACpE,MAAM,SAAS,GAAG,CAAC,GAAG,OAAO,EAAE,IAAI,CAAC,CAAC;QAErC,MAAM,KAAK,GAAG,KAAK,CAAC,GAAG,EAAE,SAAS,EAAE;YAClC,GAAG,EAAE,OAAO,CAAC,GAAG;YAChB,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;YAC/B,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE;YACvB,KAAK,EAAE,QAAQ;SAChB,CAAC,CAAC;QAEH,qCAAqC;QACrC,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;YACnC,KAAK,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC;QACpB,CAAC;QAED,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE;YACxC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC5B,CAAC,CAAC,CAAC;QAEH,KAAK,CAAC,MAAM,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAY,EAAE,EAAE;YACxC,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,C
AAC;QAC5B,CAAC,CAAC,CAAC;QAEH,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,EAAE;YAC9B,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACtB,MAAM,CAAC,IAAI,QAAQ,CACjB,2BAA2B,OAAO,CAAC,OAAO,IAAI,EAC9C,GAAG,KAAK,KAAK,EACb,CAAC,CAAC,EACF,MAAM,CACP,CAAC,CAAC;QACL,CAAC,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAEpB,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YAC1B,YAAY,CAAC,OAAO,CAAC,CAAC;YACtB,MAAM,CAAC,IAAI,QAAQ,CACjB,mBAAmB,KAAK,KAAK,KAAK,CAAC,OAAO,mDAAmD,EAC7F,GAAG,KAAK,KAAK,EACb,CAAC,CAAC,EACF,MAAM,CACP,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,KAAK,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YACzB,YAAY,CAAC,OAAO,CAAC,CAAC;YACtB,MAAM,UAAU,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;YAE1C,OAAO,CAAC;gBACN,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;gBACrB,QAAQ,EAAE,IAAI,IAAI,CAAC;gBACnB,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE;gBACrB,UAAU;aACX,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,93 @@
1
+ /**
2
+ * Embedding-based cosine similarity judge using an actual embedding model.
3
+ *
4
+ * Uses @huggingface/transformers with a lightweight local model for semantic similarity.
5
+ *
6
+ * @module embedding-judge
7
+ */
8
+ import type { TestCase } from './types.js';
9
+ /**
10
+ * Outcome of scoring a single test case with the embedding judge.
11
+ */
12
+ export interface JudgeResult {
13
+ /** Cosine similarity of the expected and actual output embeddings, clamped to 0-1 (0 when evaluation failed) */
14
+ score: number;
15
+ /** Fixed description of the score band, or the error message when evaluation failed */
16
+ reasoning: string;
17
+ /** Whether the test passed (score >= configured passThreshold) */
18
+ passed: boolean;
19
+ }
20
+ /**
21
+ * Options accepted by the embedding judge; every field is optional.
22
+ */
23
+ export interface EmbeddingJudgeConfig {
24
+ /** Minimum score for a test to pass, compared with >= (0-1, default: 0.7) */
25
+ passThreshold?: number;
26
+ /** Model id handed to the Hugging Face feature-extraction pipeline (default: Xenova/all-MiniLM-L6-v2) */
27
+ model?: string;
28
+ }
29
+ /**
30
+ * Judge that evaluates agent outputs using embedding cosine similarity.
31
+ *
32
+ * Embeds the expected and the actual output with a feature-extraction pipeline and compares the vectors.
33
+ * The model is loaded lazily on first use and the pipeline is reused by later evaluations.
34
+ */
35
+ export declare class EmbeddingJudge {
36
+ private _config;
37
+ private _pipeline;
38
+ private _modelLoading;
39
+ constructor(config?: EmbeddingJudgeConfig);
40
+ /**
41
+ * Get or create the embedding pipeline.
42
+ * Lazy loads the model on first use; callers arriving during the load share the same pending promise.
43
+ */
44
+ private getPipeline;
45
+ /**
46
+ * Generate the embedding for a piece of text.
47
+ *
48
+ * @param text - Text to embed
49
+ * @returns Mean-pooled, normalized embedding vector as a plain array
50
+ */
51
+ private generateEmbedding;
52
+ /**
53
+ * Calculate cosine similarity between two vectors, clamped to 0-1 (0 for zero-magnitude input).
54
+ */
55
+ private cosineSimilarity;
56
+ /**
57
+ * Evaluate a test case against the actual output.
58
+ *
59
+ * @param test - The test case; its expectedOutput is the reference text
60
+ * @param actualOutput - The actual output from the agent
61
+ * @returns The judge result; errors are reported as score 0 / passed false instead of rejecting
62
+ */
63
+ evaluate(test: TestCase, actualOutput: string): Promise<JudgeResult>;
64
+ /**
65
+ * Create a judge function compatible with the TestRunner.
66
+ *
67
+ * @returns A function bound to this judge that forwards to evaluate()
68
+ */
69
+ createEvaluator(): (test: TestCase, actualOutput: string) => Promise<{
70
+ score: number;
71
+ reasoning: string;
72
+ passed: boolean;
73
+ }>;
74
+ /**
75
+ * Get the effective judge configuration (defaults merged with the constructor options).
76
+ */
77
+ get config(): EmbeddingJudgeConfig;
78
+ }
79
+ /**
80
+ * Factory that constructs an EmbeddingJudge from the given configuration.
81
+ *
82
+ * @param config - Optional judge configuration; omitted fields use the judge defaults
83
+ * @returns A new EmbeddingJudge instance
84
+ *
85
+ * @example
86
+ * ```typescript
87
+ * const judge = createEmbeddingJudge({ passThreshold: 0.75 });
88
+ * const result = await judge.evaluate(test, output);
89
+ * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
90
+ * ```
91
+ */
92
+ export declare function createEmbeddingJudge(config?: EmbeddingJudgeConfig): EmbeddingJudge;
93
+ //# sourceMappingURL=embedding-judge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedding-judge.d.ts","sourceRoot":"","sources":["../../src/embedding-judge.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAIH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAE3C;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,wBAAwB;IACxB,KAAK,EAAE,MAAM,CAAC;IAEd,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAElB,mDAAmD;IACnD,MAAM,EAAE,OAAO,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,sDAAsD;IACtD,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,iFAAiF;IACjF,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAED;;;;;GAKG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAiC;IAChD,OAAO,CAAC,SAAS,CAA0C;IAC3D,OAAO,CAAC,aAAa,CAAmD;gBAE5D,MAAM,GAAE,oBAAyB;IAQ7C;;;OAGG;YACW,WAAW;IAoBzB;;;;;OAKG;YACW,iBAAiB;IAY/B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAqBxB;;;;;;OAMG;IACG,QAAQ,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAwC1E;;;;OAIG;IACH,eAAe,IAAI,CACjB,IAAI,EAAE,QAAQ,EACd,YAAY,EAAE,MAAM,KACjB,OAAO,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;IAWF;;OAEG;IACH,IAAI,MAAM,IAAI,oBAAoB,CAEjC;CACF;AAED;;;;;;;;;;;;GAYG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,CAAC,EAAE,oBAAoB,GAAG,cAAc,CAElF"}
@@ -0,0 +1,160 @@
1
+ /**
2
+ * Embedding-based cosine similarity judge using an actual embedding model.
3
+ *
4
+ * Uses @huggingface/transformers with a lightweight local model for semantic similarity.
5
+ *
6
+ * @module embedding-judge
7
+ */
8
+ import { pipeline } from '@huggingface/transformers';
9
+ /**
10
+ * Judge that evaluates agent outputs using embedding cosine similarity.
11
+ *
12
+ * Uses a local transformer model for semantic similarity comparison.
13
+ * The model is downloaded on first use and cached locally.
14
+ */
15
+ export class EmbeddingJudge {
16
+ _config;
17
+ _pipeline = null;
18
+ _modelLoading = null;
19
+ constructor(config = {}) {
20
+ this._config = {
21
+ passThreshold: 0.7,
22
+ model: 'Xenova/all-MiniLM-L6-v2',
23
+ ...config,
24
+ };
25
+ }
26
+ /**
27
+ * Get or create the embedding pipeline.
28
+ * Lazy loads the model on first use.
29
+ */
30
+ async getPipeline() {
31
+ if (this._pipeline) {
32
+ return this._pipeline;
33
+ }
34
+ if (this._modelLoading) {
35
+ return this._modelLoading;
36
+ }
37
+ this._modelLoading = pipeline('feature-extraction', this._config.model);
38
+ this._pipeline = await this._modelLoading;
39
+ return this._pipeline;
40
+ }
41
+ /**
42
+ * Generate embeddings for text.
43
+ *
44
+ * @param text - Text to embed
45
+ * @returns Embedding vector
46
+ */
47
+ async generateEmbedding(text) {
48
+ const pipe = await this.getPipeline();
49
+ const output = await pipe(text, {
50
+ pooling: 'mean',
51
+ normalize: true,
52
+ });
53
+ // Convert to array
54
+ return Array.from(output.data);
55
+ }
56
+ /**
57
+ * Calculate cosine similarity between two vectors.
58
+ */
59
+ cosineSimilarity(a, b) {
60
+ let dotProduct = 0;
61
+ let normA = 0;
62
+ let normB = 0;
63
+ for (let i = 0; i < a.length; i++) {
64
+ dotProduct += a[i] * b[i];
65
+ normA += a[i] * a[i];
66
+ normB += b[i] * b[i];
67
+ }
68
+ // Vectors are already normalized, but calculate just in case
69
+ const magnitude = Math.sqrt(normA) * Math.sqrt(normB);
70
+ if (magnitude === 0) {
71
+ return 0;
72
+ }
73
+ return Math.max(0, Math.min(1, dotProduct / magnitude));
74
+ }
75
+ /**
76
+ * Evaluate a test case against the actual output.
77
+ *
78
+ * @param test - The test case
79
+ * @param actualOutput - The actual output from the agent
80
+ * @returns The judge result with score and reasoning
81
+ */
82
+ async evaluate(test, actualOutput) {
83
+ try {
84
+ // Generate embeddings for both expected and actual output
85
+ const expectedEmbedding = await this.generateEmbedding(test.expectedOutput);
86
+ const actualEmbedding = await this.generateEmbedding(actualOutput);
87
+ // Calculate similarity
88
+ const score = this.cosineSimilarity(expectedEmbedding, actualEmbedding);
89
+ // Generate reasoning
90
+ let reasoning;
91
+ if (score >= 0.9) {
92
+ reasoning = 'Very high semantic similarity - output closely matches expected content.';
93
+ }
94
+ else if (score >= 0.75) {
95
+ reasoning = 'Good semantic similarity with minor differences in meaning or detail.';
96
+ }
97
+ else if (score >= this._config.passThreshold) {
98
+ reasoning = 'Moderate similarity - core concepts match but notable differences exist.';
99
+ }
100
+ else if (score >= 0.4) {
101
+ reasoning = 'Low semantic similarity - significant differences in meaning.';
102
+ }
103
+ else {
104
+ reasoning = 'Very low similarity - output does not match expected content.';
105
+ }
106
+ const passed = score >= this._config.passThreshold;
107
+ return {
108
+ score,
109
+ reasoning,
110
+ passed,
111
+ };
112
+ }
113
+ catch (error) {
114
+ const errorMessage = error instanceof Error ? error.message : String(error);
115
+ return {
116
+ score: 0,
117
+ reasoning: `Embedding evaluation failed: ${errorMessage}`,
118
+ passed: false,
119
+ };
120
+ }
121
+ }
122
+ /**
123
+ * Create a judge function compatible with the TestRunner.
124
+ *
125
+ * @returns A function that can be passed to the runner
126
+ */
127
+ createEvaluator() {
128
+ return async (test, actualOutput) => {
129
+ const result = await this.evaluate(test, actualOutput);
130
+ return {
131
+ score: result.score,
132
+ reasoning: result.reasoning,
133
+ passed: result.passed,
134
+ };
135
+ };
136
+ }
137
+ /**
138
+ * Get the judge configuration.
139
+ */
140
+ get config() {
141
+ return this._config;
142
+ }
143
+ }
144
+ /**
145
+ * Create an embedding judge with the given configuration.
146
+ *
147
+ * @param config - Judge configuration
148
+ * @returns A new EmbeddingJudge instance
149
+ *
150
+ * @example
151
+ * ```typescript
152
+ * const judge = createEmbeddingJudge({ passThreshold: 0.75 });
153
+ * const result = await judge.evaluate(test, output);
154
+ * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
155
+ * ```
156
+ */
157
+ export function createEmbeddingJudge(config) {
158
+ return new EmbeddingJudge(config);
159
+ }
160
+ //# sourceMappingURL=embedding-judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embedding-judge.js","sourceRoot":"","sources":["../../src/embedding-judge.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAkC,QAAQ,EAAE,MAAM,2BAA2B,CAAC;AA6BrF;;;;;GAKG;AACH,MAAM,OAAO,cAAc;IACjB,OAAO,CAAiC;IACxC,SAAS,GAAqC,IAAI,CAAC;IACnD,aAAa,GAA8C,IAAI,CAAC;IAExE,YAAY,SAA+B,EAAE;QAC3C,IAAI,CAAC,OAAO,GAAG;YACb,aAAa,EAAE,GAAG;YAClB,KAAK,EAAE,yBAAyB;YAChC,GAAG,MAAM;SACV,CAAC;IACJ,CAAC;IAED;;;OAGG;IACK,KAAK,CAAC,WAAW;QACvB,IAAI,IAAI,CAAC,SAAS,EAAE,CAAC;YACnB,OAAO,IAAI,CAAC,SAAS,CAAC;QACxB,CAAC;QAED,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC,aAAa,CAAC;QAC5B,CAAC;QAGD,IAAI,CAAC,aAAa,GAAG,QAAQ,CAC3B,oBAAoB,EACpB,IAAI,CAAC,OAAO,CAAC,KAAK,CAE8B,CAAC;QAEnD,IAAI,CAAC,SAAS,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC;QAC1C,OAAO,IAAI,CAAC,SAAS,CAAC;IACxB,CAAC;IAED;;;;;OAKG;IACK,KAAK,CAAC,iBAAiB,CAAC,IAAY;QAC1C,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,WAAW,EAAE,CAAC;QAEtC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE;YAC9B,OAAO,EAAE,MAAM;YACf,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QAEH,mBAAmB;QACnB,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAoB,CAAC,CAAC;IACjD,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,CAAW,EAAE,CAAW;QAC/C,IAAI,UAAU,GAAG,CAAC,CAAC;QACnB,IAAI,KAAK,GAAG,CAAC,CAAC;QACd,IAAI,KAAK,GAAG,CAAC,CAAC;QAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAClC,UAAU,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YAC1B,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;YACrB,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC;QAED,6DAA6D;QAC7D,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAEtD,IAAI,SAAS,KAAK,CAAC,EAAE,CAAC;YACpB,OAAO,CAAC,CAAC;QACX,CAAC;QAED,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,UAAU,GAAG,SAAS,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAc,EAAE,YAAoB;QACjD,IAAI,CAAC;YACH,0DAA0D;YAC1D,MAAM,iBAAiB,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC;YAC5E,MAAM,eAAe,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;YAEnE,
uBAAuB;YACvB,MAAM,KAAK,GAAG,IAAI,CAAC,gBAAgB,CAAC,iBAAiB,EAAE,eAAe,CAAC,CAAC;YAExE,qBAAqB;YACrB,IAAI,SAAiB,CAAC;YACtB,IAAI,KAAK,IAAI,GAAG,EAAE,CAAC;gBACjB,SAAS,GAAG,0EAA0E,CAAC;YACzF,CAAC;iBAAM,IAAI,KAAK,IAAI,IAAI,EAAE,CAAC;gBACzB,SAAS,GAAG,uEAAuE,CAAC;YACtF,CAAC;iBAAM,IAAI,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,CAAC;gBAC/C,SAAS,GAAG,0EAA0E,CAAC;YACzF,CAAC;iBAAM,IAAI,KAAK,IAAI,GAAG,EAAE,CAAC;gBACxB,SAAS,GAAG,+DAA+D,CAAC;YAC9E,CAAC;iBAAM,CAAC;gBACN,SAAS,GAAG,+DAA+D,CAAC;YAC9E,CAAC;YAED,MAAM,MAAM,GAAG,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC;YAEnD,OAAO;gBACL,KAAK;gBACL,SAAS;gBACT,MAAM;aACP,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,OAAO;gBACL,KAAK,EAAE,CAAC;gBACR,SAAS,EAAE,gCAAgC,YAAY,EAAE;gBACzD,MAAM,EAAE,KAAK;aACd,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,eAAe;QAQb,OAAO,KAAK,EAAE,IAAI,EAAE,YAAY,EAAE,EAAE;YAClC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;YACvD,OAAO;gBACL,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAC;QACJ,CAAC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED;;;;;;;;;;;;GAYG;AACH,MAAM,UAAU,oBAAoB,CAAC,MAA6B;IAChE,OAAO,IAAI,cAAc,CAAC,MAAM,CAAC,CAAC;AACpC,CAAC"}
@@ -0,0 +1,15 @@
1
+ /**
2
+ * Genesys Agent Eval Harness
3
+ *
4
+ * A framework for evaluating AI agents (pi and genesys) against test suites defined in YAML files.
5
+ *
6
+ * @packageDocumentation
7
+ */
8
+ export type { AgentResponse, Args, EvalResults, EvalSummary, JudgeConfig, RunnerConfig, TestCase, TestResult, TestSuite, } from './types.js';
9
+ export { CLIError, runAgent, type RunOptions, } from './cli-runner.js';
10
+ export { createEmbeddingJudge, EmbeddingJudge, type EmbeddingJudgeConfig, type JudgeResult as EmbeddingJudgeResult, } from './embedding-judge.js';
11
+ export { createJudge, evaluate, Judge, type JudgeResult as LLMJudgeResult, } from './judge.js';
12
+ export { formatResults, type OutputFormat, Reporter, type ReporterConfig, reportResults, } from './reporter.js';
13
+ export { runEvaluation, type ProgressCallback, TestRunner, } from './runner.js';
14
+ export { loadTestSuite, type LoadResult, parseTestSuite, TestLoadError, TestValidationError, validateTestSuite, } from './test-loader.js';
15
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,YAAY,EACV,aAAa,EACb,IAAI,EACJ,WAAW,EACX,WAAW,EACX,WAAW,EACX,YAAY,EACZ,QAAQ,EACR,UAAU,EACV,SAAS,GACV,MAAM,YAAY,CAAC;AAGpB,OAAO,EACL,QAAQ,EACR,QAAQ,EACR,KAAK,UAAU,GAChB,MAAM,iBAAiB,CAAC;AAGzB,OAAO,EACL,oBAAoB,EACpB,cAAc,EACd,KAAK,oBAAoB,EACzB,KAAK,WAAW,IAAI,oBAAoB,GACzC,MAAM,sBAAsB,CAAC;AAG9B,OAAO,EACL,WAAW,EACX,QAAQ,EACR,KAAK,EACL,KAAK,WAAW,IAAI,cAAc,GACnC,MAAM,YAAY,CAAC;AAGpB,OAAO,EACL,aAAa,EACb,KAAK,YAAY,EACjB,QAAQ,EACR,KAAK,cAAc,EACnB,aAAa,GACd,MAAM,eAAe,CAAC;AAGvB,OAAO,EACL,aAAa,EACb,KAAK,gBAAgB,EACrB,UAAU,GACX,MAAM,aAAa,CAAC;AAGrB,OAAO,EACL,aAAa,EACb,KAAK,UAAU,EACf,cAAc,EACd,aAAa,EACb,mBAAmB,EACnB,iBAAiB,GAClB,MAAM,kBAAkB,CAAC"}
@@ -0,0 +1,20 @@
1
+ /**
2
+ * Genesys Agent Eval Harness
3
+ *
4
+ * A framework for evaluating AI agents (pi and genesys) against test suites defined in YAML files.
5
+ *
6
+ * @packageDocumentation
7
+ */
8
+ // Export CLI runner functionality
9
+ export { CLIError, runAgent, } from './cli-runner.js';
10
+ // Export embedding judge functionality
11
+ export { createEmbeddingJudge, EmbeddingJudge, } from './embedding-judge.js';
12
+ // Export LLM judge functionality
13
+ export { createJudge, evaluate, Judge, } from './judge.js';
14
+ // Export reporter functionality
15
+ export { formatResults, Reporter, reportResults, } from './reporter.js';
16
+ // Export runner functionality
17
+ export { runEvaluation, TestRunner, } from './runner.js';
18
+ // Export test loader functionality
19
+ export { loadTestSuite, parseTestSuite, TestLoadError, TestValidationError, validateTestSuite, } from './test-loader.js';
20
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAeH,kCAAkC;AAClC,OAAO,EACL,QAAQ,EACR,QAAQ,GAET,MAAM,iBAAiB,CAAC;AAEzB,uCAAuC;AACvC,OAAO,EACL,oBAAoB,EACpB,cAAc,GAGf,MAAM,sBAAsB,CAAC;AAE9B,iCAAiC;AACjC,OAAO,EACL,WAAW,EACX,QAAQ,EACR,KAAK,GAEN,MAAM,YAAY,CAAC;AAEpB,gCAAgC;AAChC,OAAO,EACL,aAAa,EAEb,QAAQ,EAER,aAAa,GACd,MAAM,eAAe,CAAC;AAEvB,8BAA8B;AAC9B,OAAO,EACL,aAAa,EAEb,UAAU,GACX,MAAM,aAAa,CAAC;AAErB,mCAAmC;AACnC,OAAO,EACL,aAAa,EAEb,cAAc,EACd,aAAa,EACb,mBAAmB,EACnB,iBAAiB,GAClB,MAAM,kBAAkB,CAAC"}
@@ -0,0 +1,95 @@
1
+ /**
2
+ * LLM-based judge for evaluating agent outputs against expected results.
3
+ *
4
+ * The judge uses a separate LLM to score how well the agent's output matches
5
+ * the expected output description on a scale of 0-1.
6
+ *
7
+ * @module judge
8
+ */
9
+ import type { JudgeConfig, TestCase } from './types.js';
10
+ /**
11
+ * Outcome of scoring a single test case with the LLM judge.
12
+ */
13
+ export interface JudgeResult {
14
+ /** Score from 0 to 1 assigned by the judge model (0 when the judge call failed) */
15
+ score: number;
16
+ /** The judge model's explanation for the score, or the error message when the judge call failed */
17
+ reasoning: string;
18
+ /** Whether the test passed (score >= configured passThreshold) */
19
+ passed: boolean;
20
+ }
21
+ /**
22
+ * Judge that evaluates agent outputs using an LLM.
23
+ *
24
+ * The judge prompts a separate LLM to compare the actual output against the
25
+ * expected output description and return a structured score on a scale of 0-1.
26
+ */
27
+ export declare class Judge {
28
+ private _config;
29
+ constructor(config: JudgeConfig);
30
+ /**
31
+ * Build the judge prompt from the test input, expected output, optional context and actual output.
32
+ *
33
+ * @param test - The test case
34
+ * @param actualOutput - The actual output from the agent
35
+ * @returns The prompt to send to the judge LLM
36
+ */
37
+ private buildPrompt;
38
+ /**
39
+ * Get the model instance for the configured provider (anthropic, openai, google or gemini).
40
+ *
41
+ * @returns Model instance for the Vercel AI SDK; throws for any other provider
42
+ */
43
+ private getModel;
44
+ /**
45
+ * Evaluate a test case against the actual output.
46
+ *
47
+ * @param test - The test case
48
+ * @param actualOutput - The actual output from the agent
49
+ * @returns The judge result; errors are reported as score 0 / passed false instead of rejecting
50
+ */
51
+ evaluate(test: TestCase, actualOutput: string): Promise<JudgeResult>;
52
+ /**
53
+ * Create a judge function compatible with the TestRunner.
54
+ *
55
+ * @returns A function bound to this judge that forwards to evaluate()
56
+ */
57
+ createEvaluator(): (test: TestCase, actualOutput: string) => Promise<{
58
+ score: number;
59
+ reasoning: string;
60
+ passed: boolean;
61
+ }>;
62
+ /**
63
+ * Get the effective judge configuration (defaults merged with the constructor options).
64
+ */
65
+ get config(): JudgeConfig;
66
+ }
67
+ /**
68
+ * Factory that constructs an LLM Judge from the given configuration.
69
+ *
70
+ * @param config - Judge configuration (provider, model and optional scoring settings)
71
+ * @returns A new Judge instance
72
+ *
73
+ * @example
74
+ * ```typescript
75
+ * const judge = createJudge({
76
+ * provider: 'anthropic',
77
+ * model: 'claude-3-5-sonnet-20241022',
78
+ * passThreshold: 0.8
79
+ * });
80
+ *
81
+ * const result = await judge.evaluate(test, output);
82
+ * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
83
+ * ```
84
+ */
85
+ export declare function createJudge(config: JudgeConfig): Judge;
86
+ /**
87
+ * Convenience wrapper for one-off judgments: builds a Judge from config and evaluates one output.
88
+ *
89
+ * @param config - Judge configuration
90
+ * @param test - Test case
91
+ * @param actualOutput - Actual output to evaluate
92
+ * @returns Judge result (same shape and semantics as Judge.evaluate)
93
+ */
94
+ export declare function evaluate(config: JudgeConfig, test: TestCase, actualOutput: string): Promise<JudgeResult>;
95
+ //# sourceMappingURL=judge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../src/judge.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAQH,OAAO,KAAK,EAAE,WAAW,EAAE,QAAQ,EAAE,MAAM,YAAY,CAAC;AAExD;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,wBAAwB;IACxB,KAAK,EAAE,MAAM,CAAC;IAEd,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAElB,mDAAmD;IACnD,MAAM,EAAE,OAAO,CAAC;CACjB;AAeD;;;;;GAKG;AACH,qBAAa,KAAK;IAChB,OAAO,CAAC,OAAO,CAAc;gBAEjB,MAAM,EAAE,WAAW;IAQ/B;;;;;;OAMG;IACH,OAAO,CAAC,WAAW;IA+BnB;;;;OAIG;IACH,OAAO,CAAC,QAAQ;IAoBhB;;;;;;OAMG;IACG,QAAQ,CAAC,IAAI,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAkC1E;;;;OAIG;IACH,eAAe,IAAI,CACjB,IAAI,EAAE,QAAQ,EACd,YAAY,EAAE,MAAM,KACjB,OAAO,CAAC;QACX,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;KACjB,CAAC;IAWF;;OAEG;IACH,IAAI,MAAM,IAAI,WAAW,CAExB;CACF;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,wBAAgB,WAAW,CAAC,MAAM,EAAE,WAAW,GAAG,KAAK,CAEtD;AAED;;;;;;;GAOG;AACH,wBAAsB,QAAQ,CAC5B,MAAM,EAAE,WAAW,EACnB,IAAI,EAAE,QAAQ,EACd,YAAY,EAAE,MAAM,GACnB,OAAO,CAAC,WAAW,CAAC,CAGtB"}
@@ -0,0 +1,189 @@
1
+ /**
2
+ * LLM-based judge for evaluating agent outputs against expected results.
3
+ *
4
+ * The judge uses a separate LLM to score how well the agent's output matches
5
+ * the expected output description on a scale of 0-1.
6
+ *
7
+ * @module judge
8
+ */
9
+ import { anthropic } from '@ai-sdk/anthropic';
10
+ import { google } from '@ai-sdk/google';
11
+ import { openai } from '@ai-sdk/openai';
12
+ import { generateObject } from 'ai';
13
+ import { z } from 'zod';
14
/**
 * Zod schema describing the structured object the judge model must return:
 * a numeric score bounded to 0..1 and a free-text explanation.
 */
const judgeOutputSchema = (() => {
    const score = z.number().min(0).max(1).describe('Score from 0 to 1 where 1 is perfect');
    const reasoning = z.string().describe('Explanation for the score');
    return z.object({ score, reasoning });
})();
21
/**
 * Judge that evaluates agent outputs using an LLM.
 *
 * A separate model is prompted to compare the actual output with the expected
 * output description and to return a structured `{ score, reasoning }` object,
 * with the score on a scale of 0-1.
 */
export class Judge {
    /** Effective configuration: built-in defaults overlaid with the caller's defined options. */
    _config;
    /**
     * @param config - Judge configuration. Options that are missing or explicitly
     *   `undefined` keep their defaults (`passThreshold`: 0.7, `temperature`: 0).
     */
    constructor(config) {
        // Drop undefined entries first: a plain spread would let `temperature: undefined`
        // erase the default of 0 and leave the temperature to the provider.
        const overrides = Object.fromEntries(Object.entries(config ?? {}).filter(([, value]) => value !== undefined));
        this._config = {
            passThreshold: 0.7,
            temperature: 0,
            ...overrides,
        };
    }
    /**
     * Build the judge prompt.
     *
     * The "Additional Context" section is emitted only when `test.context` is truthy.
     *
     * @param test - The test case (`input`, `expectedOutput`, optional `context`)
     * @param actualOutput - The actual output from the agent
     * @returns The prompt to send to the judge LLM
     */
    buildPrompt(test, actualOutput) {
        return `You are an expert evaluator assessing the quality of AI responses.

Your task is to evaluate how well the ACTUAL OUTPUT matches the EXPECTED OUTPUT description.

## Test Input
${test.input}

## Expected Output Description
${test.expectedOutput}

${test.context ? `## Additional Context\n${test.context}\n\n` : ''}## Actual Output
${actualOutput}

## Evaluation Instructions

1. Carefully read the expected output description and the actual output
2. Score the actual output on a scale of 0.0 to 1.0 where:
- 1.0 = Perfect match, fully satisfies the expected output
- 0.8-0.9 = Good match, minor issues or omissions
- 0.6-0.7 = Partial match, significant issues but some correct elements
- 0.4-0.5 = Poor match, mostly incorrect or incomplete
- 0.0-0.3 = Very poor match, completely wrong or irrelevant

3. Provide clear reasoning for your score

Respond with a structured object containing:
- score: number from 0 to 1
- reasoning: string explaining your evaluation`;
    }
    /**
     * Get the model instance based on provider.
     *
     * @returns Model instance for the Vercel AI SDK
     * @throws Error when the provider is not 'anthropic', 'openai', 'google' or 'gemini'
     */
    getModel() {
        const { provider, model } = this._config;
        switch (provider) {
            case 'anthropic': {
                return anthropic(model);
            }
            case 'openai': {
                return openai(model);
            }
            // 'gemini' is accepted as an alias for the Google provider.
            case 'google':
            case 'gemini': {
                return google(model);
            }
            default: {
                throw new Error(`Unsupported judge provider: ${provider}`);
            }
        }
    }
    /**
     * Evaluate a test case against the actual output.
     *
     * Never rejects: a failed model lookup or generation is returned as a failing
     * result whose reasoning carries the error message.
     *
     * @param test - The test case
     * @param actualOutput - The actual output from the agent
     * @returns The judge result with score, reasoning and pass flag
     */
    async evaluate(test, actualOutput) {
        const prompt = this.buildPrompt(test, actualOutput);
        try {
            const { object } = await generateObject({
                // Resolved inside the try so an unsupported provider is reported
                // as a failed result as well.
                model: this.getModel(),
                schema: judgeOutputSchema,
                messages: [
                    {
                        role: 'user',
                        content: prompt,
                    },
                ],
                temperature: this._config.temperature,
            });
            // `??` still covers an explicit `null` threshold.
            const passed = object.score >= (this._config.passThreshold ?? 0.7);
            return {
                score: object.score,
                reasoning: object.reasoning,
                passed,
            };
        }
        catch (error) {
            const errorMessage = error instanceof Error ? error.message : String(error);
            return {
                score: 0,
                reasoning: `Judge evaluation failed: ${errorMessage}`,
                passed: false,
            };
        }
    }
    /**
     * Create a judge function compatible with the TestRunner.
     *
     * @returns A function bound to this judge that resolves to `{ score, reasoning, passed }`
     */
    createEvaluator() {
        return async (test, actualOutput) => {
            const { score, reasoning, passed } = await this.evaluate(test, actualOutput);
            return { score, reasoning, passed };
        };
    }
    /**
     * Get the effective judge configuration (defaults merged with constructor options).
     */
    get config() {
        return this._config;
    }
}
156
/**
 * Factory for the LLM-based judge.
 *
 * @param config - Judge configuration, passed straight to the Judge constructor
 * @returns A new Judge instance
 *
 * @example
 * ```typescript
 * const judge = createJudge({
 *   provider: 'anthropic',
 *   model: 'claude-3-5-sonnet-20241022',
 *   passThreshold: 0.8
 * });
 *
 * const result = await judge.evaluate(test, output);
 * console.log(`Score: ${result.score}, Passed: ${result.passed}`);
 * ```
 */
export function createJudge(config) {
    const judge = new Judge(config);
    return judge;
}
177
/**
 * One-off judgment helper: creates a judge from `config` and evaluates a
 * single output with it.
 *
 * @param config - Judge configuration
 * @param test - Test case
 * @param actualOutput - Actual output to evaluate
 * @returns Judge result
 */
export async function evaluate(config, test, actualOutput) {
    return createJudge(config).evaluate(test, actualOutput);
}
189
+ //# sourceMappingURL=judge.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"judge.js","sourceRoot":"","sources":["../../src/judge.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAC9C,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EAAE,MAAM,EAAE,MAAM,gBAAgB,CAAC;AACxC,OAAO,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC;AACpC,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAkBxB;;GAEG;AACH,MAAM,iBAAiB,GAAG,CAAC,CAAC,MAAM,CAAC;IACjC,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,sCAAsC,CAAC;IAChF,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,2BAA2B,CAAC;CAC5D,CAAC,CAAC;AAOH;;;;;GAKG;AACH,MAAM,OAAO,KAAK;IACR,OAAO,CAAc;IAE7B,YAAY,MAAmB;QAC7B,IAAI,CAAC,OAAO,GAAG;YACb,aAAa,EAAE,GAAG;YAClB,WAAW,EAAE,CAAC;YACd,GAAG,MAAM;SACV,CAAC;IACJ,CAAC;IAED;;;;;;OAMG;IACK,WAAW,CAAC,IAAc,EAAE,YAAoB;QACtD,OAAO;;;;;EAKT,IAAI,CAAC,KAAK;;;EAGV,IAAI,CAAC,cAAc;;EAEnB,IAAI,CAAC,OAAO,CAAC,CAAC,CAAC,0BAA0B,IAAI,CAAC,OAAO,MAAM,CAAC,CAAC,CAAC,EAAE;EAChE,YAAY;;;;;;;;;;;;;;;;+CAgBiC,CAAC;IAC9C,CAAC;IAED;;;;OAIG;IACK,QAAQ;QACd,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,GAAG,IAAI,CAAC,OAAO,CAAC;QAEzC,QAAQ,QAAQ,EAAE,CAAC;YACjB,KAAK,WAAW,CAAC,CAAC,CAAC;gBACjB,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC;YAC1B,CAAC;YACD,KAAK,QAAQ,CAAC,CAAC,CAAC;gBACd,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC;YACvB,CAAC;YACD,KAAK,QAAQ,CAAC;YACd,KAAK,QAAQ,CAAC,CAAC,CAAC;gBACd,OAAO,MAAM,CAAC,KAAK,CAAC,CAAC;YACvB,CAAC;YACD,OAAO,CAAC,CAAC,CAAC;gBACR,MAAM,IAAI,KAAK,CAAC,+BAA+B,QAAQ,EAAE,CAAC,CAAC;YAC7D,CAAC;QACH,CAAC;IACH,CAAC;IAED;;;;;;OAMG;IACH,KAAK,CAAC,QAAQ,CAAC,IAAc,EAAE,YAAoB;QACjD,MAAM,MAAM,GAAG,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;QAEpD,IAAI,CAAC;YACH,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,cAAc,CAAC;gBACtC,KAAK,EAAE,IAAI,CAAC,QAAQ,EAA8D;gBAClF,MAAM,EAAE,iBAAiB;gBACzB,QAAQ,EAAE;oBACR;wBACE,IAAI,EAAE,MAAM;wBACZ,OAAO,EAAE,MAAM;qBAChB;iBACF;gBACD,WAAW,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW;aACtC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,aAAa,IAAI,GAAG,CAAC,CAAC;YAEnE,OAAO;gBACL,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,SAAS,EAAE,MAAM,CAAC,SAAS;gB
AC3B,MAAM;aACP,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,yDAAyD;YACzD,MAAM,YAAY,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC5E,OAAO;gBACL,KAAK,EAAE,CAAC;gBACR,SAAS,EAAE,4BAA4B,YAAY,EAAE;gBACrD,MAAM,EAAE,KAAK;aACd,CAAC;QACJ,CAAC;IACH,CAAC;IAED;;;;OAIG;IACH,eAAe;QAQb,OAAO,KAAK,EAAE,IAAI,EAAE,YAAY,EAAE,EAAE;YAClC,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;YACvD,OAAO;gBACL,KAAK,EAAE,MAAM,CAAC,KAAK;gBACnB,SAAS,EAAE,MAAM,CAAC,SAAS;gBAC3B,MAAM,EAAE,MAAM,CAAC,MAAM;aACtB,CAAC;QACJ,CAAC,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,OAAO,CAAC;IACtB,CAAC;CACF;AAED;;;;;;;;;;;;;;;;;GAiBG;AACH,MAAM,UAAU,WAAW,CAAC,MAAmB;IAC7C,OAAO,IAAI,KAAK,CAAC,MAAM,CAAC,CAAC;AAC3B,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,QAAQ,CAC5B,MAAmB,EACnB,IAAc,EACd,YAAoB;IAEpB,MAAM,KAAK,GAAG,WAAW,CAAC,MAAM,CAAC,CAAC;IAClC,OAAO,KAAK,CAAC,QAAQ,CAAC,IAAI,EAAE,YAAY,CAAC,CAAC;AAC5C,CAAC"}
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Genesys Agent Eval CLI launcher.
4
+ *
5
+ * Entry point for the evaluation harness. Spawns pi or genesys CLI
6
+ * processes to run tests and evaluates the results.
7
+ */
8
+ export {};
9
+ //# sourceMappingURL=launcher.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"launcher.d.ts","sourceRoot":"","sources":["../../src/launcher.ts"],"names":[],"mappings":";AACA;;;;;GAKG"}