@operor/testing 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,296 @@
1
+ import type { LLMProvider } from '@operor/llm';
2
+ import type { TestCase } from './types.js';
3
+
4
/**
 * Outcome of evaluating a single agent response against a test case.
 */
export interface EvaluationResult {
  /** True when the response (and any expected tool calls) met the pass criteria. */
  passed: boolean;
  /** Normalized quality score, nominally in [0, 1]. */
  score: number;
  /** Which evaluation strategy produced this result. */
  method: 'exact' | 'contains' | 'similarity' | 'llm_judge';
  /** Human-readable explanation of the score / pass decision. */
  reasoning: string;
  /** True when every expected tool was actually called by the agent. */
  toolsCorrect: boolean;
}
11
+
12
+ export class TestCaseEvaluator {
13
+ constructor(private llm?: LLMProvider) {}
14
+
15
+ async evaluate(
16
+ testCase: TestCase,
17
+ agentResponse: string,
18
+ toolsCalled: Array<{ name: string; params: any; result: any }>,
19
+ strategy?: 'exact' | 'contains' | 'similarity' | 'semantic'
20
+ ): Promise<EvaluationResult> {
21
+ // Validate tools first
22
+ const toolsCorrect = this.validateTools(testCase.expectedTools, toolsCalled);
23
+
24
+ // Choose evaluation strategy based on explicit strategy parameter
25
+ if (strategy === 'exact') {
26
+ return this.evaluateByExact(testCase, agentResponse, toolsCorrect);
27
+ }
28
+
29
+ if (strategy === 'contains') {
30
+ return this.evaluateByContains(testCase, agentResponse, toolsCorrect);
31
+ }
32
+
33
+ if (strategy === 'similarity') {
34
+ return this.evaluateBySimilarity(testCase, agentResponse, toolsCorrect);
35
+ }
36
+
37
+ if (strategy === 'semantic' && this.llm) {
38
+ if (testCase.expectedAnswer) {
39
+ return await this.evaluateByLLMComparison(testCase, agentResponse, toolsCorrect);
40
+ }
41
+ return await this.evaluateByLLMJudge(testCase, agentResponse, toolsCorrect);
42
+ }
43
+
44
+ // Default behavior (no strategy specified)
45
+ if (!this.llm) {
46
+ // No LLM: use string similarity
47
+ return this.evaluateBySimilarity(testCase, agentResponse, toolsCorrect);
48
+ }
49
+
50
+ if (testCase.expectedAnswer) {
51
+ // LLM + expected answer: use LLM comparison
52
+ return await this.evaluateByLLMComparison(testCase, agentResponse, toolsCorrect);
53
+ }
54
+
55
+ // LLM + no expected answer: use LLM standalone judge
56
+ return await this.evaluateByLLMJudge(testCase, agentResponse, toolsCorrect);
57
+ }
58
+
59
+ private validateTools(
60
+ expectedTools: string[] | undefined,
61
+ toolsCalled: Array<{ name: string; params: any; result: any }>
62
+ ): boolean {
63
+ if (!expectedTools || expectedTools.length === 0) {
64
+ return true; // No tools expected, always pass
65
+ }
66
+
67
+ const calledNames = new Set(toolsCalled.map((t) => t.name));
68
+ return expectedTools.every((tool) => calledNames.has(tool));
69
+ }
70
+
71
+ private evaluateByExact(
72
+ testCase: TestCase,
73
+ agentResponse: string,
74
+ toolsCorrect: boolean
75
+ ): EvaluationResult {
76
+ if (!testCase.expectedAnswer) {
77
+ return {
78
+ passed: toolsCorrect,
79
+ score: toolsCorrect ? 1 : 0,
80
+ method: 'exact',
81
+ reasoning: 'No expected answer provided, evaluated tools only',
82
+ toolsCorrect,
83
+ };
84
+ }
85
+
86
+ const matches = agentResponse.trim().toLowerCase() === testCase.expectedAnswer.trim().toLowerCase();
87
+ const passed = matches && toolsCorrect;
88
+
89
+ return {
90
+ passed,
91
+ score: matches ? 1 : 0,
92
+ method: 'exact',
93
+ reasoning: matches ? 'Exact match' : 'Response does not exactly match expected answer',
94
+ toolsCorrect,
95
+ };
96
+ }
97
+
98
+ private evaluateByContains(
99
+ testCase: TestCase,
100
+ agentResponse: string,
101
+ toolsCorrect: boolean
102
+ ): EvaluationResult {
103
+ if (!testCase.expectedAnswer) {
104
+ return {
105
+ passed: toolsCorrect,
106
+ score: toolsCorrect ? 1 : 0,
107
+ method: 'contains',
108
+ reasoning: 'No expected answer provided, evaluated tools only',
109
+ toolsCorrect,
110
+ };
111
+ }
112
+
113
+ const normalizeDashes = (s: string) => s.replace(/[\u2013\u2011]/g, '-');
114
+ const contains = normalizeDashes(agentResponse.toLowerCase()).includes(normalizeDashes(testCase.expectedAnswer.toLowerCase()));
115
+ const passed = contains && toolsCorrect;
116
+
117
+ return {
118
+ passed,
119
+ score: contains ? 1 : 0,
120
+ method: 'contains',
121
+ reasoning: contains
122
+ ? `Response contains expected text: "${testCase.expectedAnswer}"`
123
+ : `Response does not contain expected text: "${testCase.expectedAnswer}"`,
124
+ toolsCorrect,
125
+ };
126
+ }
127
+
128
+ private evaluateBySimilarity(
129
+ testCase: TestCase,
130
+ agentResponse: string,
131
+ toolsCorrect: boolean
132
+ ): EvaluationResult {
133
+ if (!testCase.expectedAnswer) {
134
+ // No expected answer and no LLM: can't evaluate, pass by default
135
+ return {
136
+ passed: toolsCorrect,
137
+ score: toolsCorrect ? 1 : 0,
138
+ method: 'similarity',
139
+ reasoning: 'No expected answer provided, evaluated tools only',
140
+ toolsCorrect,
141
+ };
142
+ }
143
+
144
+ const similarity = this.normalizedLevenshtein(
145
+ testCase.expectedAnswer.toLowerCase(),
146
+ agentResponse.toLowerCase()
147
+ );
148
+
149
+ const passed = similarity > 0.7 && toolsCorrect;
150
+
151
+ return {
152
+ passed,
153
+ score: similarity,
154
+ method: 'similarity',
155
+ reasoning: `String similarity: ${(similarity * 100).toFixed(1)}% (threshold: 70%)`,
156
+ toolsCorrect,
157
+ };
158
+ }
159
+
160
+ private async evaluateByLLMComparison(
161
+ testCase: TestCase,
162
+ agentResponse: string,
163
+ toolsCorrect: boolean
164
+ ): Promise<EvaluationResult> {
165
+ const prompt = `You are evaluating an AI agent's response to a customer question.
166
+
167
+ Question: ${testCase.question}
168
+ Expected Answer: ${testCase.expectedAnswer}
169
+ Actual Response: ${agentResponse}
170
+
171
+ Rate the actual response on a scale of 1-5:
172
+ 1 = Completely wrong or irrelevant
173
+ 2 = Partially correct but missing key information
174
+ 3 = Mostly correct with minor issues
175
+ 4 = Correct with good quality
176
+ 5 = Excellent, matches or exceeds expected answer
177
+
178
+ Respond with ONLY a JSON object in this format:
179
+ {"score": <1-5>, "reasoning": "<brief explanation>"}`;
180
+
181
+ const result = await this.llm!.complete(
182
+ [{ role: 'user', content: prompt }],
183
+ { temperature: 0, maxTokens: 200 }
184
+ );
185
+
186
+ let score = 3;
187
+ let reasoning = 'LLM evaluation completed';
188
+
189
+ try {
190
+ const parsed = JSON.parse(result.text);
191
+ score = parsed.score;
192
+ reasoning = parsed.reasoning;
193
+ } catch {
194
+ // Fallback: try to extract score from text
195
+ const match = result.text.match(/score["\s:]+(\d)/i);
196
+ if (match) {
197
+ score = parseInt(match[1], 10);
198
+ }
199
+ reasoning = result.text.substring(0, 200);
200
+ }
201
+
202
+ const normalizedScore = score / 5;
203
+ const passed = normalizedScore >= 0.6 && toolsCorrect;
204
+
205
+ return {
206
+ passed,
207
+ score: normalizedScore,
208
+ method: 'llm_judge',
209
+ reasoning: `LLM comparison (${score}/5): ${reasoning}`,
210
+ toolsCorrect,
211
+ };
212
+ }
213
+
214
+ private async evaluateByLLMJudge(
215
+ testCase: TestCase,
216
+ agentResponse: string,
217
+ toolsCorrect: boolean
218
+ ): Promise<EvaluationResult> {
219
+ const prompt = `You are evaluating an AI agent's response to a customer question.
220
+
221
+ Question: ${testCase.question}
222
+ Agent Response: ${agentResponse}
223
+
224
+ Rate the response quality on a scale of 1-5:
225
+ 1 = Unhelpful, incorrect, or inappropriate
226
+ 2 = Partially helpful but incomplete or unclear
227
+ 3 = Adequate, addresses the question reasonably
228
+ 4 = Good quality, helpful and accurate
229
+ 5 = Excellent, comprehensive and professional
230
+
231
+ Respond with ONLY a JSON object in this format:
232
+ {"score": <1-5>, "reasoning": "<brief explanation>"}`;
233
+
234
+ const result = await this.llm!.complete(
235
+ [{ role: 'user', content: prompt }],
236
+ { temperature: 0, maxTokens: 200 }
237
+ );
238
+
239
+ let score = 3;
240
+ let reasoning = 'LLM evaluation completed';
241
+
242
+ try {
243
+ const parsed = JSON.parse(result.text);
244
+ score = parsed.score;
245
+ reasoning = parsed.reasoning;
246
+ } catch {
247
+ // Fallback: try to extract score from text
248
+ const match = result.text.match(/score["\s:]+(\d)/i);
249
+ if (match) {
250
+ score = parseInt(match[1], 10);
251
+ }
252
+ reasoning = result.text.substring(0, 200);
253
+ }
254
+
255
+ const normalizedScore = score / 5;
256
+ const passed = normalizedScore >= 0.6 && toolsCorrect;
257
+
258
+ return {
259
+ passed,
260
+ score: normalizedScore,
261
+ method: 'llm_judge',
262
+ reasoning: `LLM standalone judge (${score}/5): ${reasoning}`,
263
+ toolsCorrect,
264
+ };
265
+ }
266
+
267
+ private normalizedLevenshtein(s1: string, s2: string): number {
268
+ const len1 = s1.length;
269
+ const len2 = s2.length;
270
+
271
+ if (len1 === 0) return len2 === 0 ? 1 : 0;
272
+ if (len2 === 0) return 0;
273
+
274
+ const matrix: number[][] = Array.from({ length: len1 + 1 }, () =>
275
+ Array(len2 + 1).fill(0)
276
+ );
277
+
278
+ for (let i = 0; i <= len1; i++) matrix[i][0] = i;
279
+ for (let j = 0; j <= len2; j++) matrix[0][j] = j;
280
+
281
+ for (let i = 1; i <= len1; i++) {
282
+ for (let j = 1; j <= len2; j++) {
283
+ const cost = s1[i - 1] === s2[j - 1] ? 0 : 1;
284
+ matrix[i][j] = Math.min(
285
+ matrix[i - 1][j] + 1,
286
+ matrix[i][j - 1] + 1,
287
+ matrix[i - 1][j - 1] + cost
288
+ );
289
+ }
290
+ }
291
+
292
+ const distance = matrix[len1][len2];
293
+ const maxLen = Math.max(len1, len2);
294
+ return 1 - distance / maxLen;
295
+ }
296
+ }
@@ -0,0 +1,151 @@
1
+ import { Operor } from '@operor/core';
2
+ import type { LLMProvider } from '@operor/llm';
3
+ import { MockProvider } from '@operor/provider-mock';
4
+ import type { TestCase, TestCaseResult, TestSuiteResult } from './types.js';
5
+ import { TestCaseEvaluator } from './TestCaseEvaluator.js';
6
+
7
/** Configuration for TestSuiteRunner. */
export interface TestSuiteRunnerConfig {
  /** The Operor instance whose agent the suite exercises. */
  agentOS: Operor;
  /** Optional LLM used for semantic / judge-based evaluation. */
  llm?: LLMProvider;
  /** Per-test-case timeout in milliseconds (defaults to 30000 in the runner). */
  timeout?: number;
  /** Evaluation strategy forwarded to TestCaseEvaluator.evaluate(). */
  strategy?: 'exact' | 'contains' | 'similarity' | 'semantic';
}
13
+
14
+ export class TestSuiteRunner {
15
+ private evaluator: TestCaseEvaluator;
16
+ private agentOS: Operor;
17
+ private timeout: number;
18
+ private strategy?: 'exact' | 'contains' | 'similarity' | 'semantic';
19
+
20
+ constructor(config: TestSuiteRunnerConfig) {
21
+ this.agentOS = config.agentOS;
22
+ this.evaluator = new TestCaseEvaluator(config.llm);
23
+ this.timeout = config.timeout || 30000;
24
+ this.strategy = config.strategy;
25
+ }
26
+
27
+ async runSuite(testCases: TestCase[]): Promise<TestSuiteResult> {
28
+ const results: TestCaseResult[] = [];
29
+ const startTime = Date.now();
30
+
31
+ for (const testCase of testCases) {
32
+ const result = await this.runTestCase(testCase);
33
+ results.push(result);
34
+ }
35
+
36
+ const totalDuration = Date.now() - startTime;
37
+ const passed = results.filter((r) => r.evaluation.passed).length;
38
+ const failed = results.length - passed;
39
+ const averageScore =
40
+ results.reduce((sum, r) => sum + r.evaluation.score, 0) / results.length;
41
+ const totalCost = results.reduce((sum, r) => sum + r.cost, 0);
42
+
43
+ // Group by tags
44
+ const byTag: Record<string, { total: number; passed: number; avgScore: number }> = {};
45
+ for (const result of results) {
46
+ const tags = result.testCase.tags || ['untagged'];
47
+ for (const tag of tags) {
48
+ if (!byTag[tag]) {
49
+ byTag[tag] = { total: 0, passed: 0, avgScore: 0 };
50
+ }
51
+ byTag[tag].total++;
52
+ if (result.evaluation.passed) {
53
+ byTag[tag].passed++;
54
+ }
55
+ byTag[tag].avgScore += result.evaluation.score;
56
+ }
57
+ }
58
+
59
+ // Calculate average scores per tag
60
+ for (const tag in byTag) {
61
+ byTag[tag].avgScore /= byTag[tag].total;
62
+ }
63
+
64
+ return {
65
+ total: results.length,
66
+ passed,
67
+ failed,
68
+ averageScore,
69
+ byTag,
70
+ results,
71
+ totalDuration,
72
+ totalCost,
73
+ };
74
+ }
75
+
76
+ private async runTestCase(testCase: TestCase): Promise<TestCaseResult> {
77
+ const startTime = Date.now();
78
+ let agentResponse = '';
79
+ let toolsCalled: Array<{ name: string; params: any; result: any }> = [];
80
+ let cost = 0;
81
+
82
+ try {
83
+ // Create a promise that resolves when we get a response
84
+ const responsePromise = new Promise<{
85
+ text: string;
86
+ toolCalls?: Array<{ name: string; params: any; result: any }>;
87
+ cost: number;
88
+ }>((resolve, reject) => {
89
+ const timeoutId = setTimeout(() => {
90
+ reject(new Error(`Test case ${testCase.id} timed out after ${this.timeout}ms`));
91
+ }, this.timeout);
92
+
93
+ // Listen for the response
94
+ this.agentOS.once('message:processed', (event: any) => {
95
+ clearTimeout(timeoutId);
96
+ resolve({
97
+ text: event.response.text,
98
+ toolCalls: event.response.toolCalls || [],
99
+ cost: event.cost || 0,
100
+ });
101
+ });
102
+
103
+ // Listen for errors
104
+ this.agentOS.once('error', (event: any) => {
105
+ clearTimeout(timeoutId);
106
+ reject(event.error);
107
+ });
108
+ });
109
+
110
+ // Get the mock provider
111
+ const mockProvider = Array.from((this.agentOS as any).providers.values()).find(
112
+ (p: any) => p.name === 'mock'
113
+ ) as MockProvider | undefined;
114
+
115
+ if (!mockProvider) {
116
+ throw new Error('MockProvider not found in Operor');
117
+ }
118
+
119
+ // Simulate incoming message
120
+ const testPhone = testCase.persona || 'test-user';
121
+ mockProvider.simulateIncomingMessage(testPhone, testCase.question);
122
+
123
+ // Wait for response
124
+ const response = await responsePromise;
125
+ agentResponse = response.text;
126
+ toolsCalled = response.toolCalls || [];
127
+ cost = response.cost;
128
+ } catch (error) {
129
+ agentResponse = `Error: ${error instanceof Error ? error.message : String(error)}`;
130
+ }
131
+
132
+ const duration = Date.now() - startTime;
133
+
134
+ // Evaluate the response
135
+ const evaluation = await this.evaluator.evaluate(
136
+ testCase,
137
+ agentResponse,
138
+ toolsCalled,
139
+ this.strategy
140
+ );
141
+
142
+ return {
143
+ testCase,
144
+ agentResponse,
145
+ toolsCalled,
146
+ evaluation,
147
+ duration,
148
+ cost,
149
+ };
150
+ }
151
+ }
@@ -0,0 +1,122 @@
1
+ import { describe, it, expect } from 'vitest';
2
+ import { CSVLoader } from '../CSVLoader.js';
3
+ import { resolve } from 'node:path';
4
+ import { fileURLToPath } from 'node:url';
5
+ import { dirname } from 'node:path';
6
+
7
// Resolve the fixtures directory relative to this test file (ESM has no
// built-in __dirname, so derive it from import.meta.url).
const __dirname = dirname(fileURLToPath(import.meta.url));
const fixturesDir = resolve(__dirname, '../../fixtures');

describe('CSVLoader', () => {
  describe('fromCSVString', () => {
    it('parses basic CSV', () => {
      const csv = `id,question,expected_answer,expected_tools,persona,tags
t1,Hello,Hi there,,friendly,greeting`;
      const cases = CSVLoader.fromCSVString(csv);
      expect(cases).toHaveLength(1);
      // The empty expected_tools column is omitted from the parsed object.
      expect(cases[0]).toEqual({
        id: 't1',
        question: 'Hello',
        expectedAnswer: 'Hi there',
        persona: 'friendly',
        tags: ['greeting'],
      });
    });

    it('handles quoted fields with commas', () => {
      const csv = `id,question,expected_answer,expected_tools,persona,tags
t1,"Hello, world",response,"toolA,toolB",,`;
      const cases = CSVLoader.fromCSVString(csv);
      expect(cases[0].question).toBe('Hello, world');
      // A quoted tool list is split on commas into individual tool names.
      expect(cases[0].expectedTools).toEqual(['toolA', 'toolB']);
    });

    it('strips UTF-8 BOM', () => {
      // A leading BOM must not end up glued to the first header name.
      const csv = `\uFEFFid,question,expected_answer,expected_tools,persona,tags
t1,Hello,Hi,,friendly,`;
      const cases = CSVLoader.fromCSVString(csv);
      expect(cases[0].id).toBe('t1');
    });

    it('skips empty optional fields', () => {
      // Note the trailing space in the tags column: whitespace-only fields
      // are treated as empty too.
      const csv = `id,question,expected_answer,expected_tools,persona,tags
t1,Hello,,,, `;
      const cases = CSVLoader.fromCSVString(csv);
      expect(cases[0]).toEqual({ id: 't1', question: 'Hello' });
    });

    it('throws on missing id', () => {
      const csv = `id,question
,Hello`;
      expect(() => CSVLoader.fromCSVString(csv)).toThrow('missing required field');
    });

    it('throws on missing question', () => {
      const csv = `id,question
t1,`;
      expect(() => CSVLoader.fromCSVString(csv)).toThrow('missing required field');
    });

    it('handles multiple tags', () => {
      // NOTE(review): this row has seven fields for six headers; presumably
      // the extra 'edge-case' column is dropped by the parser — the test
      // only asserts that 'greeting' survives. Confirm intended behavior.
      const csv = `id,question,expected_answer,expected_tools,persona,tags
t1,Hello,,,,greeting,edge-case`;
      const cases = CSVLoader.fromCSVString(csv);
      expect(cases[0].tags).toContain('greeting');
    });
  });

  describe('fromJSON', () => {
    it('parses JSON array', () => {
      const json = JSON.stringify([
        { id: 't1', question: 'Hello', expectedAnswer: 'Hi' },
      ]);
      const cases = CSVLoader.fromJSON(json);
      expect(cases).toHaveLength(1);
      expect(cases[0].expectedAnswer).toBe('Hi');
    });

    it('parses object with testCases key', () => {
      // Wrapper-object form: { testCases: [...] } is also accepted.
      const json = JSON.stringify({
        testCases: [{ id: 't1', question: 'Hello' }],
      });
      const cases = CSVLoader.fromJSON(json);
      expect(cases).toHaveLength(1);
    });

    it('throws on missing required fields', () => {
      // 'question' is required alongside 'id'.
      const json = JSON.stringify([{ id: 't1' }]);
      expect(() => CSVLoader.fromJSON(json)).toThrow('missing required field');
    });

    it('throws on invalid structure', () => {
      // An object without a testCases key is rejected outright.
      expect(() => CSVLoader.fromJSON('{"foo": "bar"}')).toThrow(
        'JSON must be an array'
      );
    });
  });

  describe('fromFile', () => {
    it('loads sample CSV fixture', async () => {
      // Relies on the checked-in fixtures/sample-tests.csv file.
      const cases = await CSVLoader.fromFile(
        resolve(fixturesDir, 'sample-tests.csv')
      );
      expect(cases.length).toBeGreaterThanOrEqual(5);
      expect(cases[0].id).toBe('greeting-1');
      expect(cases[0].question).toBe('Hello');
    });

    it('loads JSON file', async () => {
      // Round-trips through a temporary JSON file; deleted in finally so the
      // fixtures directory stays clean even on assertion failure.
      const tmpPath = resolve(fixturesDir, '_test-tmp.json');
      const { writeFile, unlink } = await import('node:fs/promises');
      const data = [{ id: 'j1', question: 'Hi from JSON' }];
      await writeFile(tmpPath, JSON.stringify(data));
      try {
        const cases = await CSVLoader.fromFile(tmpPath);
        expect(cases).toHaveLength(1);
        expect(cases[0].question).toBe('Hi from JSON');
      } finally {
        await unlink(tmpPath);
      }
    });
  });
});