@operor/testing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/API_VALIDATION.md +572 -0
- package/dist/index.d.ts +414 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1608 -0
- package/dist/index.js.map +1 -0
- package/fixtures/sample-tests.csv +10 -0
- package/package.json +31 -0
- package/src/CSVLoader.ts +83 -0
- package/src/ConversationEvaluator.ts +254 -0
- package/src/ConversationRunner.ts +267 -0
- package/src/CustomerSimulator.ts +106 -0
- package/src/MockShopifySkill.ts +336 -0
- package/src/SimulationRunner.ts +425 -0
- package/src/SkillTestHarness.ts +220 -0
- package/src/TestCaseEvaluator.ts +296 -0
- package/src/TestSuiteRunner.ts +151 -0
- package/src/__tests__/CSVLoader.test.ts +122 -0
- package/src/__tests__/ConversationEvaluator.test.ts +221 -0
- package/src/__tests__/ConversationRunner.test.ts +270 -0
- package/src/__tests__/CustomerSimulator.test.ts +160 -0
- package/src/__tests__/SimulationRunner.test.ts +281 -0
- package/src/__tests__/SkillTestHarness.test.ts +181 -0
- package/src/__tests__/scenarios.test.ts +71 -0
- package/src/index.ts +32 -0
- package/src/scenarios/edge-cases.ts +52 -0
- package/src/scenarios/general.ts +37 -0
- package/src/scenarios/index.ts +32 -0
- package/src/scenarios/order-tracking.ts +56 -0
- package/src/scenarios.ts +142 -0
- package/src/types.ts +133 -0
- package/src/utils.ts +6 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import type { LLMProvider } from '@operor/llm';
|
|
2
|
+
import type { TestCase } from './types.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Outcome of evaluating one agent response against a test case.
 */
export interface EvaluationResult {
  // True only when the answer check succeeded AND the expected tools were called.
  passed: boolean;
  // Normalized score in [0, 1]: 0/1 for exact/contains, a similarity ratio,
  // or LLM score divided by 5.
  score: number;
  // Which evaluation strategy produced this result (both LLM paths report 'llm_judge').
  method: 'exact' | 'contains' | 'similarity' | 'llm_judge';
  // Human-readable explanation of the verdict.
  reasoning: string;
  // Whether every tool listed in testCase.expectedTools was actually called.
  toolsCorrect: boolean;
}
|
|
11
|
+
|
|
12
|
+
export class TestCaseEvaluator {
|
|
13
|
+
constructor(private llm?: LLMProvider) {}
|
|
14
|
+
|
|
15
|
+
async evaluate(
|
|
16
|
+
testCase: TestCase,
|
|
17
|
+
agentResponse: string,
|
|
18
|
+
toolsCalled: Array<{ name: string; params: any; result: any }>,
|
|
19
|
+
strategy?: 'exact' | 'contains' | 'similarity' | 'semantic'
|
|
20
|
+
): Promise<EvaluationResult> {
|
|
21
|
+
// Validate tools first
|
|
22
|
+
const toolsCorrect = this.validateTools(testCase.expectedTools, toolsCalled);
|
|
23
|
+
|
|
24
|
+
// Choose evaluation strategy based on explicit strategy parameter
|
|
25
|
+
if (strategy === 'exact') {
|
|
26
|
+
return this.evaluateByExact(testCase, agentResponse, toolsCorrect);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
if (strategy === 'contains') {
|
|
30
|
+
return this.evaluateByContains(testCase, agentResponse, toolsCorrect);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
if (strategy === 'similarity') {
|
|
34
|
+
return this.evaluateBySimilarity(testCase, agentResponse, toolsCorrect);
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
if (strategy === 'semantic' && this.llm) {
|
|
38
|
+
if (testCase.expectedAnswer) {
|
|
39
|
+
return await this.evaluateByLLMComparison(testCase, agentResponse, toolsCorrect);
|
|
40
|
+
}
|
|
41
|
+
return await this.evaluateByLLMJudge(testCase, agentResponse, toolsCorrect);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Default behavior (no strategy specified)
|
|
45
|
+
if (!this.llm) {
|
|
46
|
+
// No LLM: use string similarity
|
|
47
|
+
return this.evaluateBySimilarity(testCase, agentResponse, toolsCorrect);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
if (testCase.expectedAnswer) {
|
|
51
|
+
// LLM + expected answer: use LLM comparison
|
|
52
|
+
return await this.evaluateByLLMComparison(testCase, agentResponse, toolsCorrect);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
// LLM + no expected answer: use LLM standalone judge
|
|
56
|
+
return await this.evaluateByLLMJudge(testCase, agentResponse, toolsCorrect);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
private validateTools(
|
|
60
|
+
expectedTools: string[] | undefined,
|
|
61
|
+
toolsCalled: Array<{ name: string; params: any; result: any }>
|
|
62
|
+
): boolean {
|
|
63
|
+
if (!expectedTools || expectedTools.length === 0) {
|
|
64
|
+
return true; // No tools expected, always pass
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
const calledNames = new Set(toolsCalled.map((t) => t.name));
|
|
68
|
+
return expectedTools.every((tool) => calledNames.has(tool));
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
private evaluateByExact(
|
|
72
|
+
testCase: TestCase,
|
|
73
|
+
agentResponse: string,
|
|
74
|
+
toolsCorrect: boolean
|
|
75
|
+
): EvaluationResult {
|
|
76
|
+
if (!testCase.expectedAnswer) {
|
|
77
|
+
return {
|
|
78
|
+
passed: toolsCorrect,
|
|
79
|
+
score: toolsCorrect ? 1 : 0,
|
|
80
|
+
method: 'exact',
|
|
81
|
+
reasoning: 'No expected answer provided, evaluated tools only',
|
|
82
|
+
toolsCorrect,
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
const matches = agentResponse.trim().toLowerCase() === testCase.expectedAnswer.trim().toLowerCase();
|
|
87
|
+
const passed = matches && toolsCorrect;
|
|
88
|
+
|
|
89
|
+
return {
|
|
90
|
+
passed,
|
|
91
|
+
score: matches ? 1 : 0,
|
|
92
|
+
method: 'exact',
|
|
93
|
+
reasoning: matches ? 'Exact match' : 'Response does not exactly match expected answer',
|
|
94
|
+
toolsCorrect,
|
|
95
|
+
};
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
private evaluateByContains(
|
|
99
|
+
testCase: TestCase,
|
|
100
|
+
agentResponse: string,
|
|
101
|
+
toolsCorrect: boolean
|
|
102
|
+
): EvaluationResult {
|
|
103
|
+
if (!testCase.expectedAnswer) {
|
|
104
|
+
return {
|
|
105
|
+
passed: toolsCorrect,
|
|
106
|
+
score: toolsCorrect ? 1 : 0,
|
|
107
|
+
method: 'contains',
|
|
108
|
+
reasoning: 'No expected answer provided, evaluated tools only',
|
|
109
|
+
toolsCorrect,
|
|
110
|
+
};
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
const normalizeDashes = (s: string) => s.replace(/[\u2013\u2011]/g, '-');
|
|
114
|
+
const contains = normalizeDashes(agentResponse.toLowerCase()).includes(normalizeDashes(testCase.expectedAnswer.toLowerCase()));
|
|
115
|
+
const passed = contains && toolsCorrect;
|
|
116
|
+
|
|
117
|
+
return {
|
|
118
|
+
passed,
|
|
119
|
+
score: contains ? 1 : 0,
|
|
120
|
+
method: 'contains',
|
|
121
|
+
reasoning: contains
|
|
122
|
+
? `Response contains expected text: "${testCase.expectedAnswer}"`
|
|
123
|
+
: `Response does not contain expected text: "${testCase.expectedAnswer}"`,
|
|
124
|
+
toolsCorrect,
|
|
125
|
+
};
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
private evaluateBySimilarity(
|
|
129
|
+
testCase: TestCase,
|
|
130
|
+
agentResponse: string,
|
|
131
|
+
toolsCorrect: boolean
|
|
132
|
+
): EvaluationResult {
|
|
133
|
+
if (!testCase.expectedAnswer) {
|
|
134
|
+
// No expected answer and no LLM: can't evaluate, pass by default
|
|
135
|
+
return {
|
|
136
|
+
passed: toolsCorrect,
|
|
137
|
+
score: toolsCorrect ? 1 : 0,
|
|
138
|
+
method: 'similarity',
|
|
139
|
+
reasoning: 'No expected answer provided, evaluated tools only',
|
|
140
|
+
toolsCorrect,
|
|
141
|
+
};
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
const similarity = this.normalizedLevenshtein(
|
|
145
|
+
testCase.expectedAnswer.toLowerCase(),
|
|
146
|
+
agentResponse.toLowerCase()
|
|
147
|
+
);
|
|
148
|
+
|
|
149
|
+
const passed = similarity > 0.7 && toolsCorrect;
|
|
150
|
+
|
|
151
|
+
return {
|
|
152
|
+
passed,
|
|
153
|
+
score: similarity,
|
|
154
|
+
method: 'similarity',
|
|
155
|
+
reasoning: `String similarity: ${(similarity * 100).toFixed(1)}% (threshold: 70%)`,
|
|
156
|
+
toolsCorrect,
|
|
157
|
+
};
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
private async evaluateByLLMComparison(
|
|
161
|
+
testCase: TestCase,
|
|
162
|
+
agentResponse: string,
|
|
163
|
+
toolsCorrect: boolean
|
|
164
|
+
): Promise<EvaluationResult> {
|
|
165
|
+
const prompt = `You are evaluating an AI agent's response to a customer question.
|
|
166
|
+
|
|
167
|
+
Question: ${testCase.question}
|
|
168
|
+
Expected Answer: ${testCase.expectedAnswer}
|
|
169
|
+
Actual Response: ${agentResponse}
|
|
170
|
+
|
|
171
|
+
Rate the actual response on a scale of 1-5:
|
|
172
|
+
1 = Completely wrong or irrelevant
|
|
173
|
+
2 = Partially correct but missing key information
|
|
174
|
+
3 = Mostly correct with minor issues
|
|
175
|
+
4 = Correct with good quality
|
|
176
|
+
5 = Excellent, matches or exceeds expected answer
|
|
177
|
+
|
|
178
|
+
Respond with ONLY a JSON object in this format:
|
|
179
|
+
{"score": <1-5>, "reasoning": "<brief explanation>"}`;
|
|
180
|
+
|
|
181
|
+
const result = await this.llm!.complete(
|
|
182
|
+
[{ role: 'user', content: prompt }],
|
|
183
|
+
{ temperature: 0, maxTokens: 200 }
|
|
184
|
+
);
|
|
185
|
+
|
|
186
|
+
let score = 3;
|
|
187
|
+
let reasoning = 'LLM evaluation completed';
|
|
188
|
+
|
|
189
|
+
try {
|
|
190
|
+
const parsed = JSON.parse(result.text);
|
|
191
|
+
score = parsed.score;
|
|
192
|
+
reasoning = parsed.reasoning;
|
|
193
|
+
} catch {
|
|
194
|
+
// Fallback: try to extract score from text
|
|
195
|
+
const match = result.text.match(/score["\s:]+(\d)/i);
|
|
196
|
+
if (match) {
|
|
197
|
+
score = parseInt(match[1], 10);
|
|
198
|
+
}
|
|
199
|
+
reasoning = result.text.substring(0, 200);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
const normalizedScore = score / 5;
|
|
203
|
+
const passed = normalizedScore >= 0.6 && toolsCorrect;
|
|
204
|
+
|
|
205
|
+
return {
|
|
206
|
+
passed,
|
|
207
|
+
score: normalizedScore,
|
|
208
|
+
method: 'llm_judge',
|
|
209
|
+
reasoning: `LLM comparison (${score}/5): ${reasoning}`,
|
|
210
|
+
toolsCorrect,
|
|
211
|
+
};
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
private async evaluateByLLMJudge(
|
|
215
|
+
testCase: TestCase,
|
|
216
|
+
agentResponse: string,
|
|
217
|
+
toolsCorrect: boolean
|
|
218
|
+
): Promise<EvaluationResult> {
|
|
219
|
+
const prompt = `You are evaluating an AI agent's response to a customer question.
|
|
220
|
+
|
|
221
|
+
Question: ${testCase.question}
|
|
222
|
+
Agent Response: ${agentResponse}
|
|
223
|
+
|
|
224
|
+
Rate the response quality on a scale of 1-5:
|
|
225
|
+
1 = Unhelpful, incorrect, or inappropriate
|
|
226
|
+
2 = Partially helpful but incomplete or unclear
|
|
227
|
+
3 = Adequate, addresses the question reasonably
|
|
228
|
+
4 = Good quality, helpful and accurate
|
|
229
|
+
5 = Excellent, comprehensive and professional
|
|
230
|
+
|
|
231
|
+
Respond with ONLY a JSON object in this format:
|
|
232
|
+
{"score": <1-5>, "reasoning": "<brief explanation>"}`;
|
|
233
|
+
|
|
234
|
+
const result = await this.llm!.complete(
|
|
235
|
+
[{ role: 'user', content: prompt }],
|
|
236
|
+
{ temperature: 0, maxTokens: 200 }
|
|
237
|
+
);
|
|
238
|
+
|
|
239
|
+
let score = 3;
|
|
240
|
+
let reasoning = 'LLM evaluation completed';
|
|
241
|
+
|
|
242
|
+
try {
|
|
243
|
+
const parsed = JSON.parse(result.text);
|
|
244
|
+
score = parsed.score;
|
|
245
|
+
reasoning = parsed.reasoning;
|
|
246
|
+
} catch {
|
|
247
|
+
// Fallback: try to extract score from text
|
|
248
|
+
const match = result.text.match(/score["\s:]+(\d)/i);
|
|
249
|
+
if (match) {
|
|
250
|
+
score = parseInt(match[1], 10);
|
|
251
|
+
}
|
|
252
|
+
reasoning = result.text.substring(0, 200);
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
const normalizedScore = score / 5;
|
|
256
|
+
const passed = normalizedScore >= 0.6 && toolsCorrect;
|
|
257
|
+
|
|
258
|
+
return {
|
|
259
|
+
passed,
|
|
260
|
+
score: normalizedScore,
|
|
261
|
+
method: 'llm_judge',
|
|
262
|
+
reasoning: `LLM standalone judge (${score}/5): ${reasoning}`,
|
|
263
|
+
toolsCorrect,
|
|
264
|
+
};
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
private normalizedLevenshtein(s1: string, s2: string): number {
|
|
268
|
+
const len1 = s1.length;
|
|
269
|
+
const len2 = s2.length;
|
|
270
|
+
|
|
271
|
+
if (len1 === 0) return len2 === 0 ? 1 : 0;
|
|
272
|
+
if (len2 === 0) return 0;
|
|
273
|
+
|
|
274
|
+
const matrix: number[][] = Array.from({ length: len1 + 1 }, () =>
|
|
275
|
+
Array(len2 + 1).fill(0)
|
|
276
|
+
);
|
|
277
|
+
|
|
278
|
+
for (let i = 0; i <= len1; i++) matrix[i][0] = i;
|
|
279
|
+
for (let j = 0; j <= len2; j++) matrix[0][j] = j;
|
|
280
|
+
|
|
281
|
+
for (let i = 1; i <= len1; i++) {
|
|
282
|
+
for (let j = 1; j <= len2; j++) {
|
|
283
|
+
const cost = s1[i - 1] === s2[j - 1] ? 0 : 1;
|
|
284
|
+
matrix[i][j] = Math.min(
|
|
285
|
+
matrix[i - 1][j] + 1,
|
|
286
|
+
matrix[i][j - 1] + 1,
|
|
287
|
+
matrix[i - 1][j - 1] + cost
|
|
288
|
+
);
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
const distance = matrix[len1][len2];
|
|
293
|
+
const maxLen = Math.max(len1, len2);
|
|
294
|
+
return 1 - distance / maxLen;
|
|
295
|
+
}
|
|
296
|
+
}
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import { Operor } from '@operor/core';
|
|
2
|
+
import type { LLMProvider } from '@operor/llm';
|
|
3
|
+
import { MockProvider } from '@operor/provider-mock';
|
|
4
|
+
import type { TestCase, TestCaseResult, TestSuiteResult } from './types.js';
|
|
5
|
+
import { TestCaseEvaluator } from './TestCaseEvaluator.js';
|
|
6
|
+
|
|
7
|
+
/**
 * Configuration for TestSuiteRunner.
 */
export interface TestSuiteRunnerConfig {
  // Operor instance under test; a provider named 'mock' must be registered,
  // otherwise each test case fails with 'MockProvider not found in Operor'.
  agentOS: Operor;
  // Optional LLM forwarded to TestCaseEvaluator for judge-based evaluation.
  llm?: LLMProvider;
  // Per-test-case timeout in milliseconds (defaults to 30000).
  timeout?: number;
  // Evaluation strategy forwarded to the evaluator; omit for automatic selection.
  strategy?: 'exact' | 'contains' | 'similarity' | 'semantic';
}
|
|
13
|
+
|
|
14
|
+
export class TestSuiteRunner {
|
|
15
|
+
private evaluator: TestCaseEvaluator;
|
|
16
|
+
private agentOS: Operor;
|
|
17
|
+
private timeout: number;
|
|
18
|
+
private strategy?: 'exact' | 'contains' | 'similarity' | 'semantic';
|
|
19
|
+
|
|
20
|
+
constructor(config: TestSuiteRunnerConfig) {
|
|
21
|
+
this.agentOS = config.agentOS;
|
|
22
|
+
this.evaluator = new TestCaseEvaluator(config.llm);
|
|
23
|
+
this.timeout = config.timeout || 30000;
|
|
24
|
+
this.strategy = config.strategy;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
async runSuite(testCases: TestCase[]): Promise<TestSuiteResult> {
|
|
28
|
+
const results: TestCaseResult[] = [];
|
|
29
|
+
const startTime = Date.now();
|
|
30
|
+
|
|
31
|
+
for (const testCase of testCases) {
|
|
32
|
+
const result = await this.runTestCase(testCase);
|
|
33
|
+
results.push(result);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
const totalDuration = Date.now() - startTime;
|
|
37
|
+
const passed = results.filter((r) => r.evaluation.passed).length;
|
|
38
|
+
const failed = results.length - passed;
|
|
39
|
+
const averageScore =
|
|
40
|
+
results.reduce((sum, r) => sum + r.evaluation.score, 0) / results.length;
|
|
41
|
+
const totalCost = results.reduce((sum, r) => sum + r.cost, 0);
|
|
42
|
+
|
|
43
|
+
// Group by tags
|
|
44
|
+
const byTag: Record<string, { total: number; passed: number; avgScore: number }> = {};
|
|
45
|
+
for (const result of results) {
|
|
46
|
+
const tags = result.testCase.tags || ['untagged'];
|
|
47
|
+
for (const tag of tags) {
|
|
48
|
+
if (!byTag[tag]) {
|
|
49
|
+
byTag[tag] = { total: 0, passed: 0, avgScore: 0 };
|
|
50
|
+
}
|
|
51
|
+
byTag[tag].total++;
|
|
52
|
+
if (result.evaluation.passed) {
|
|
53
|
+
byTag[tag].passed++;
|
|
54
|
+
}
|
|
55
|
+
byTag[tag].avgScore += result.evaluation.score;
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Calculate average scores per tag
|
|
60
|
+
for (const tag in byTag) {
|
|
61
|
+
byTag[tag].avgScore /= byTag[tag].total;
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
return {
|
|
65
|
+
total: results.length,
|
|
66
|
+
passed,
|
|
67
|
+
failed,
|
|
68
|
+
averageScore,
|
|
69
|
+
byTag,
|
|
70
|
+
results,
|
|
71
|
+
totalDuration,
|
|
72
|
+
totalCost,
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
private async runTestCase(testCase: TestCase): Promise<TestCaseResult> {
|
|
77
|
+
const startTime = Date.now();
|
|
78
|
+
let agentResponse = '';
|
|
79
|
+
let toolsCalled: Array<{ name: string; params: any; result: any }> = [];
|
|
80
|
+
let cost = 0;
|
|
81
|
+
|
|
82
|
+
try {
|
|
83
|
+
// Create a promise that resolves when we get a response
|
|
84
|
+
const responsePromise = new Promise<{
|
|
85
|
+
text: string;
|
|
86
|
+
toolCalls?: Array<{ name: string; params: any; result: any }>;
|
|
87
|
+
cost: number;
|
|
88
|
+
}>((resolve, reject) => {
|
|
89
|
+
const timeoutId = setTimeout(() => {
|
|
90
|
+
reject(new Error(`Test case ${testCase.id} timed out after ${this.timeout}ms`));
|
|
91
|
+
}, this.timeout);
|
|
92
|
+
|
|
93
|
+
// Listen for the response
|
|
94
|
+
this.agentOS.once('message:processed', (event: any) => {
|
|
95
|
+
clearTimeout(timeoutId);
|
|
96
|
+
resolve({
|
|
97
|
+
text: event.response.text,
|
|
98
|
+
toolCalls: event.response.toolCalls || [],
|
|
99
|
+
cost: event.cost || 0,
|
|
100
|
+
});
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
// Listen for errors
|
|
104
|
+
this.agentOS.once('error', (event: any) => {
|
|
105
|
+
clearTimeout(timeoutId);
|
|
106
|
+
reject(event.error);
|
|
107
|
+
});
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
// Get the mock provider
|
|
111
|
+
const mockProvider = Array.from((this.agentOS as any).providers.values()).find(
|
|
112
|
+
(p: any) => p.name === 'mock'
|
|
113
|
+
) as MockProvider | undefined;
|
|
114
|
+
|
|
115
|
+
if (!mockProvider) {
|
|
116
|
+
throw new Error('MockProvider not found in Operor');
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Simulate incoming message
|
|
120
|
+
const testPhone = testCase.persona || 'test-user';
|
|
121
|
+
mockProvider.simulateIncomingMessage(testPhone, testCase.question);
|
|
122
|
+
|
|
123
|
+
// Wait for response
|
|
124
|
+
const response = await responsePromise;
|
|
125
|
+
agentResponse = response.text;
|
|
126
|
+
toolsCalled = response.toolCalls || [];
|
|
127
|
+
cost = response.cost;
|
|
128
|
+
} catch (error) {
|
|
129
|
+
agentResponse = `Error: ${error instanceof Error ? error.message : String(error)}`;
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
const duration = Date.now() - startTime;
|
|
133
|
+
|
|
134
|
+
// Evaluate the response
|
|
135
|
+
const evaluation = await this.evaluator.evaluate(
|
|
136
|
+
testCase,
|
|
137
|
+
agentResponse,
|
|
138
|
+
toolsCalled,
|
|
139
|
+
this.strategy
|
|
140
|
+
);
|
|
141
|
+
|
|
142
|
+
return {
|
|
143
|
+
testCase,
|
|
144
|
+
agentResponse,
|
|
145
|
+
toolsCalled,
|
|
146
|
+
evaluation,
|
|
147
|
+
duration,
|
|
148
|
+
cost,
|
|
149
|
+
};
|
|
150
|
+
}
|
|
151
|
+
}
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { CSVLoader } from '../CSVLoader.js';
|
|
3
|
+
import { resolve } from 'node:path';
|
|
4
|
+
import { fileURLToPath } from 'node:url';
|
|
5
|
+
import { dirname } from 'node:path';
|
|
6
|
+
|
|
7
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
8
|
+
const fixturesDir = resolve(__dirname, '../../fixtures');
|
|
9
|
+
|
|
10
|
+
describe('CSVLoader', () => {
|
|
11
|
+
describe('fromCSVString', () => {
|
|
12
|
+
it('parses basic CSV', () => {
|
|
13
|
+
const csv = `id,question,expected_answer,expected_tools,persona,tags
|
|
14
|
+
t1,Hello,Hi there,,friendly,greeting`;
|
|
15
|
+
const cases = CSVLoader.fromCSVString(csv);
|
|
16
|
+
expect(cases).toHaveLength(1);
|
|
17
|
+
expect(cases[0]).toEqual({
|
|
18
|
+
id: 't1',
|
|
19
|
+
question: 'Hello',
|
|
20
|
+
expectedAnswer: 'Hi there',
|
|
21
|
+
persona: 'friendly',
|
|
22
|
+
tags: ['greeting'],
|
|
23
|
+
});
|
|
24
|
+
});
|
|
25
|
+
|
|
26
|
+
it('handles quoted fields with commas', () => {
|
|
27
|
+
const csv = `id,question,expected_answer,expected_tools,persona,tags
|
|
28
|
+
t1,"Hello, world",response,"toolA,toolB",,`;
|
|
29
|
+
const cases = CSVLoader.fromCSVString(csv);
|
|
30
|
+
expect(cases[0].question).toBe('Hello, world');
|
|
31
|
+
expect(cases[0].expectedTools).toEqual(['toolA', 'toolB']);
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
it('strips UTF-8 BOM', () => {
|
|
35
|
+
const csv = `\uFEFFid,question,expected_answer,expected_tools,persona,tags
|
|
36
|
+
t1,Hello,Hi,,friendly,`;
|
|
37
|
+
const cases = CSVLoader.fromCSVString(csv);
|
|
38
|
+
expect(cases[0].id).toBe('t1');
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
it('skips empty optional fields', () => {
|
|
42
|
+
const csv = `id,question,expected_answer,expected_tools,persona,tags
|
|
43
|
+
t1,Hello,,,, `;
|
|
44
|
+
const cases = CSVLoader.fromCSVString(csv);
|
|
45
|
+
expect(cases[0]).toEqual({ id: 't1', question: 'Hello' });
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
it('throws on missing id', () => {
|
|
49
|
+
const csv = `id,question
|
|
50
|
+
,Hello`;
|
|
51
|
+
expect(() => CSVLoader.fromCSVString(csv)).toThrow('missing required field');
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
it('throws on missing question', () => {
|
|
55
|
+
const csv = `id,question
|
|
56
|
+
t1,`;
|
|
57
|
+
expect(() => CSVLoader.fromCSVString(csv)).toThrow('missing required field');
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it('handles multiple tags', () => {
|
|
61
|
+
const csv = `id,question,expected_answer,expected_tools,persona,tags
|
|
62
|
+
t1,Hello,,,,greeting,edge-case`;
|
|
63
|
+
const cases = CSVLoader.fromCSVString(csv);
|
|
64
|
+
expect(cases[0].tags).toContain('greeting');
|
|
65
|
+
});
|
|
66
|
+
});
|
|
67
|
+
|
|
68
|
+
describe('fromJSON', () => {
|
|
69
|
+
it('parses JSON array', () => {
|
|
70
|
+
const json = JSON.stringify([
|
|
71
|
+
{ id: 't1', question: 'Hello', expectedAnswer: 'Hi' },
|
|
72
|
+
]);
|
|
73
|
+
const cases = CSVLoader.fromJSON(json);
|
|
74
|
+
expect(cases).toHaveLength(1);
|
|
75
|
+
expect(cases[0].expectedAnswer).toBe('Hi');
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
it('parses object with testCases key', () => {
|
|
79
|
+
const json = JSON.stringify({
|
|
80
|
+
testCases: [{ id: 't1', question: 'Hello' }],
|
|
81
|
+
});
|
|
82
|
+
const cases = CSVLoader.fromJSON(json);
|
|
83
|
+
expect(cases).toHaveLength(1);
|
|
84
|
+
});
|
|
85
|
+
|
|
86
|
+
it('throws on missing required fields', () => {
|
|
87
|
+
const json = JSON.stringify([{ id: 't1' }]);
|
|
88
|
+
expect(() => CSVLoader.fromJSON(json)).toThrow('missing required field');
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
it('throws on invalid structure', () => {
|
|
92
|
+
expect(() => CSVLoader.fromJSON('{"foo": "bar"}')).toThrow(
|
|
93
|
+
'JSON must be an array'
|
|
94
|
+
);
|
|
95
|
+
});
|
|
96
|
+
});
|
|
97
|
+
|
|
98
|
+
describe('fromFile', () => {
|
|
99
|
+
it('loads sample CSV fixture', async () => {
|
|
100
|
+
const cases = await CSVLoader.fromFile(
|
|
101
|
+
resolve(fixturesDir, 'sample-tests.csv')
|
|
102
|
+
);
|
|
103
|
+
expect(cases.length).toBeGreaterThanOrEqual(5);
|
|
104
|
+
expect(cases[0].id).toBe('greeting-1');
|
|
105
|
+
expect(cases[0].question).toBe('Hello');
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
it('loads JSON file', async () => {
|
|
109
|
+
const tmpPath = resolve(fixturesDir, '_test-tmp.json');
|
|
110
|
+
const { writeFile, unlink } = await import('node:fs/promises');
|
|
111
|
+
const data = [{ id: 'j1', question: 'Hi from JSON' }];
|
|
112
|
+
await writeFile(tmpPath, JSON.stringify(data));
|
|
113
|
+
try {
|
|
114
|
+
const cases = await CSVLoader.fromFile(tmpPath);
|
|
115
|
+
expect(cases).toHaveLength(1);
|
|
116
|
+
expect(cases[0].question).toBe('Hi from JSON');
|
|
117
|
+
} finally {
|
|
118
|
+
await unlink(tmpPath);
|
|
119
|
+
}
|
|
120
|
+
});
|
|
121
|
+
});
|
|
122
|
+
});
|