@operor/testing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/API_VALIDATION.md +572 -0
- package/dist/index.d.ts +414 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1608 -0
- package/dist/index.js.map +1 -0
- package/fixtures/sample-tests.csv +10 -0
- package/package.json +31 -0
- package/src/CSVLoader.ts +83 -0
- package/src/ConversationEvaluator.ts +254 -0
- package/src/ConversationRunner.ts +267 -0
- package/src/CustomerSimulator.ts +106 -0
- package/src/MockShopifySkill.ts +336 -0
- package/src/SimulationRunner.ts +425 -0
- package/src/SkillTestHarness.ts +220 -0
- package/src/TestCaseEvaluator.ts +296 -0
- package/src/TestSuiteRunner.ts +151 -0
- package/src/__tests__/CSVLoader.test.ts +122 -0
- package/src/__tests__/ConversationEvaluator.test.ts +221 -0
- package/src/__tests__/ConversationRunner.test.ts +270 -0
- package/src/__tests__/CustomerSimulator.test.ts +160 -0
- package/src/__tests__/SimulationRunner.test.ts +281 -0
- package/src/__tests__/SkillTestHarness.test.ts +181 -0
- package/src/__tests__/scenarios.test.ts +71 -0
- package/src/index.ts +32 -0
- package/src/scenarios/edge-cases.ts +52 -0
- package/src/scenarios/general.ts +37 -0
- package/src/scenarios/index.ts +32 -0
- package/src/scenarios/order-tracking.ts +56 -0
- package/src/scenarios.ts +142 -0
- package/src/types.ts +133 -0
- package/src/utils.ts +6 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,1608 @@
|
|
|
1
|
+
import { parse } from "csv-parse/sync";
|
|
2
|
+
import { readFile } from "node:fs/promises";
|
|
3
|
+
|
|
4
|
+
//#region src/CSVLoader.ts
|
|
5
|
+
/**
 * Loads test cases from CSV or JSON text and normalizes them into plain
 * objects of the shape
 * `{ id, question, expectedAnswer?, expectedTools?, persona?, tags?, metadata? }`.
 */
var CSVLoader = class CSVLoader {
	/**
	 * Read a test-case file from disk and parse it.
	 * Files whose name ends in ".json" (any letter case) are parsed as JSON;
	 * everything else is parsed as CSV.
	 * @param {string} path File to read (UTF-8).
	 * @returns {Promise<object[]>} Parsed test cases.
	 */
	static async fromFile(path) {
		const content = await readFile(path, "utf-8");
		if (path.toLowerCase().endsWith(".json")) return CSVLoader.fromJSON(content);
		return CSVLoader.fromCSVString(content);
	}
	/**
	 * Parse CSV text with a header row. Recognized columns: `id`, `question`
	 * (both required), `expected_answer`, `expected_tools`, `persona`, `tags`.
	 * `expected_tools` and `tags` are comma-separated lists inside one cell.
	 * @param {string} csv CSV content; a leading UTF-8 BOM is ignored.
	 * @returns {object[]} Parsed test cases.
	 * @throws {Error} When a row lacks `id` or `question` (rows are numbered from 1).
	 */
	static fromCSVString(csv) {
		// Split a comma-separated cell into trimmed, non-empty entries.
		const splitList = (cell) => cell.split(",").map((part) => part.trim()).filter(Boolean);
		const rows = parse(csv.replace(/^\uFEFF/, ""), {
			columns: true,
			skip_empty_lines: true,
			trim: true,
			relax_column_count: true
		});
		return rows.map((row, i) => {
			const id = row.id?.trim();
			const question = row.question?.trim();
			if (!id || !question) throw new Error(`Row ${i + 1}: missing required field(s) — id and question are required`);
			const testCase = {
				id,
				question
			};
			const expectedAnswer = row.expected_answer?.trim();
			if (expectedAnswer) testCase.expectedAnswer = expectedAnswer;
			if (row.expected_tools?.trim()) testCase.expectedTools = splitList(row.expected_tools);
			const persona = row.persona?.trim();
			if (persona) testCase.persona = persona;
			if (row.tags?.trim()) testCase.tags = splitList(row.tags);
			return testCase;
		});
	}
	/**
	 * Parse JSON text that is either an array of test cases or an object
	 * carrying the array under `testCases` or `tests`.
	 * @param {string} json JSON content.
	 * @returns {object[]} Parsed test cases (only truthy optional fields are copied).
	 * @throws {SyntaxError} When `json` is not valid JSON.
	 * @throws {Error} When no array is found or an item lacks `id`/`question`
	 *   (items are numbered from 0).
	 */
	static fromJSON(json) {
		const data = JSON.parse(json);
		// Optional chaining keeps `null` / primitive documents on the validation path below.
		const arr = Array.isArray(data) ? data : data?.testCases ?? data?.tests;
		if (!Array.isArray(arr)) throw new Error("JSON must be an array or contain a testCases/tests array");
		return arr.map((item, i) => {
			// Non-object entries (e.g. null) are reported like any other incomplete item.
			const usable = item !== null && typeof item === "object";
			if (!usable || !item.id || !item.question) throw new Error(`Item ${i}: missing required field(s) — id and question are required`);
			const testCase = {
				id: item.id,
				question: item.question
			};
			if (item.expectedAnswer) testCase.expectedAnswer = item.expectedAnswer;
			if (item.expectedTools) testCase.expectedTools = item.expectedTools;
			if (item.persona) testCase.persona = item.persona;
			if (item.tags) testCase.tags = item.tags;
			if (item.metadata) testCase.metadata = item.metadata;
			return testCase;
		});
	}
};
|
|
51
|
+
|
|
52
|
+
//#endregion
|
|
53
|
+
//#region src/TestCaseEvaluator.ts
|
|
54
|
+
/**
 * Scores one agent response against a test case.
 *
 * Strategies: "exact", "contains" and "similarity" are purely string based.
 * Any other strategy (including "semantic" or none) asks the LLM to grade the
 * response when an LLM client was supplied, and falls back to string
 * similarity otherwise.
 *
 * Every result has the shape `{ passed, score, method, reasoning, toolsCorrect }`
 * where `score` is in the range 0..1.
 */
var TestCaseEvaluator = class {
	/**
	 * @param {object} [llm] Optional client exposing `complete(messages, options)`
	 *   that resolves to an object with a `text` property.
	 */
	constructor(llm) {
		this.llm = llm;
	}
	/**
	 * Evaluate a response using the requested strategy.
	 * @param {object} testCase Test case (`question`, optional `expectedAnswer`, `expectedTools`).
	 * @param {string} agentResponse Text produced by the agent.
	 * @param {Array<{name: string}>} toolsCalled Tool calls observed during the run.
	 * @param {string} [strategy] "exact" | "contains" | "similarity" | "semantic".
	 * @returns {Promise<object>} Evaluation result.
	 */
	async evaluate(testCase, agentResponse, toolsCalled, strategy) {
		const toolsCorrect = this.validateTools(testCase.expectedTools, toolsCalled);
		if (strategy === "exact") return this.evaluateByExact(testCase, agentResponse, toolsCorrect);
		if (strategy === "contains") return this.evaluateByContains(testCase, agentResponse, toolsCorrect);
		// Without an LLM every remaining strategy degrades to string similarity.
		if (strategy === "similarity" || !this.llm) return this.evaluateBySimilarity(testCase, agentResponse, toolsCorrect);
		if (testCase.expectedAnswer) return this.evaluateByLLMComparison(testCase, agentResponse, toolsCorrect);
		return this.evaluateByLLMJudge(testCase, agentResponse, toolsCorrect);
	}
	/**
	 * Check that every expected tool name appears among the called tools.
	 * @param {string[]} [expectedTools] Names that must have been called.
	 * @param {Array<{name: string}>} [toolsCalled] Observed calls; a missing list counts as no calls.
	 * @returns {boolean} True when nothing is expected or all expected tools were called.
	 */
	validateTools(expectedTools, toolsCalled) {
		if (!expectedTools || expectedTools.length === 0) return true;
		const calledNames = new Set((toolsCalled ?? []).map((t) => t.name));
		return expectedTools.every((tool) => calledNames.has(tool));
	}
	/**
	 * Result used by the string strategies when the test case has no expected
	 * answer: only tool usage decides the outcome.
	 * @param {string} method Strategy name reported in the result.
	 * @param {boolean} toolsCorrect Outcome of `validateTools`.
	 * @returns {object} Evaluation result.
	 */
	toolsOnlyResult(method, toolsCorrect) {
		return {
			passed: toolsCorrect,
			score: toolsCorrect ? 1 : 0,
			method,
			reasoning: "No expected answer provided, evaluated tools only",
			toolsCorrect
		};
	}
	/** Case-insensitive, whitespace-trimmed equality against the expected answer. */
	evaluateByExact(testCase, agentResponse, toolsCorrect) {
		if (!testCase.expectedAnswer) return this.toolsOnlyResult("exact", toolsCorrect);
		const matches = agentResponse.trim().toLowerCase() === testCase.expectedAnswer.trim().toLowerCase();
		return {
			passed: matches && toolsCorrect,
			score: matches ? 1 : 0,
			method: "exact",
			reasoning: matches ? "Exact match" : "Response does not exactly match expected answer",
			toolsCorrect
		};
	}
	/** Case-insensitive substring check; en dashes and non-breaking hyphens count as "-". */
	evaluateByContains(testCase, agentResponse, toolsCorrect) {
		if (!testCase.expectedAnswer) return this.toolsOnlyResult("contains", toolsCorrect);
		const normalizeDashes = (s) => s.replace(/[\u2013\u2011]/g, "-");
		const contains = normalizeDashes(agentResponse.toLowerCase()).includes(normalizeDashes(testCase.expectedAnswer.toLowerCase()));
		return {
			passed: contains && toolsCorrect,
			score: contains ? 1 : 0,
			method: "contains",
			reasoning: contains ? `Response contains expected text: "${testCase.expectedAnswer}"` : `Response does not contain expected text: "${testCase.expectedAnswer}"`,
			toolsCorrect
		};
	}
	/** Normalized Levenshtein similarity; passes strictly above 0.7. */
	evaluateBySimilarity(testCase, agentResponse, toolsCorrect) {
		if (!testCase.expectedAnswer) return this.toolsOnlyResult("similarity", toolsCorrect);
		const similarity = this.normalizedLevenshtein(testCase.expectedAnswer.toLowerCase(), agentResponse.toLowerCase());
		return {
			passed: similarity > .7 && toolsCorrect,
			score: similarity,
			method: "similarity",
			reasoning: `String similarity: ${(similarity * 100).toFixed(1)}% (threshold: 70%)`,
			toolsCorrect
		};
	}
	/** Ask the LLM to grade the response against the expected answer (1-5). */
	async evaluateByLLMComparison(testCase, agentResponse, toolsCorrect) {
		const prompt = `You are evaluating an AI agent's response to a customer question.

Question: ${testCase.question}
Expected Answer: ${testCase.expectedAnswer}
Actual Response: ${agentResponse}

Rate the actual response on a scale of 1-5:
1 = Completely wrong or irrelevant
2 = Partially correct but missing key information
3 = Mostly correct with minor issues
4 = Correct with good quality
5 = Excellent, matches or exceeds expected answer

Respond with ONLY a JSON object in this format:
{"score": <1-5>, "reasoning": "<brief explanation>"}`;
		return this.gradeWithLLM(prompt, "LLM comparison", toolsCorrect);
	}
	/** Ask the LLM to grade the response on its own merits (1-5). */
	async evaluateByLLMJudge(testCase, agentResponse, toolsCorrect) {
		const prompt = `You are evaluating an AI agent's response to a customer question.

Question: ${testCase.question}
Agent Response: ${agentResponse}

Rate the response quality on a scale of 1-5:
1 = Unhelpful, incorrect, or inappropriate
2 = Partially helpful but incomplete or unclear
3 = Adequate, addresses the question reasonably
4 = Good quality, helpful and accurate
5 = Excellent, comprehensive and professional

Respond with ONLY a JSON object in this format:
{"score": <1-5>, "reasoning": "<brief explanation>"}`;
		return this.gradeWithLLM(prompt, "LLM standalone judge", toolsCorrect);
	}
	/**
	 * Send a grading prompt to the LLM and turn the reply into a result.
	 * A normalized score of 0.6 (3 out of 5) or more passes.
	 * @param {string} prompt Full grading prompt.
	 * @param {string} label Prefix used in the `reasoning` text.
	 * @param {boolean} toolsCorrect Outcome of `validateTools`.
	 * @returns {Promise<object>} Evaluation result.
	 */
	async gradeWithLLM(prompt, label, toolsCorrect) {
		const result = await this.llm.complete([{
			role: "user",
			content: prompt
		}], {
			temperature: 0,
			maxTokens: 200
		});
		const { score, reasoning } = this.parseGrade(result.text);
		const normalizedScore = score / 5;
		return {
			passed: normalizedScore >= .6 && toolsCorrect,
			score: normalizedScore,
			method: "llm_judge",
			reasoning: `${label} (${score}/5): ${reasoning}`,
			toolsCorrect
		};
	}
	/**
	 * Extract `{ score, reasoning }` from the LLM reply.
	 * Markdown code fences are stripped before parsing. A missing or
	 * non-numeric score keeps the neutral default of 3, and the score is
	 * clamped to 1..5 so the normalized score can never be NaN or exceed 1.
	 * @param {string} text Raw LLM reply.
	 * @returns {{score: number, reasoning: string}}
	 */
	parseGrade(text) {
		let score = 3;
		let reasoning = "LLM evaluation completed";
		const cleaned = text.replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
		try {
			const parsed = JSON.parse(cleaned);
			const candidate = typeof parsed?.score === "string" ? Number.parseFloat(parsed.score) : parsed?.score;
			if (typeof candidate === "number" && Number.isFinite(candidate)) score = candidate;
			if (typeof parsed?.reasoning === "string" && parsed.reasoning) reasoning = parsed.reasoning;
		} catch {
			// Not JSON: look for something like `score: 4` in free text.
			const match = text.match(/score["\s:]+(\d)/i);
			if (match) score = Number.parseInt(match[1], 10);
			reasoning = text.substring(0, 200);
		}
		return {
			score: Math.min(5, Math.max(1, score)),
			reasoning
		};
	}
	/**
	 * Similarity in 0..1 derived from the Levenshtein edit distance:
	 * `1 - distance / max(length)`. Two empty strings are identical (1).
	 * Only two rows of the distance table are kept in memory.
	 * @param {string} s1
	 * @param {string} s2
	 * @returns {number}
	 */
	normalizedLevenshtein(s1, s2) {
		const len1 = s1.length;
		const len2 = s2.length;
		if (len1 === 0) return len2 === 0 ? 1 : 0;
		if (len2 === 0) return 0;
		let previous = Array.from({ length: len2 + 1 }, (_, j) => j);
		for (let i = 1; i <= len1; i++) {
			const current = [i];
			for (let j = 1; j <= len2; j++) {
				const cost = s1[i - 1] === s2[j - 1] ? 0 : 1;
				current[j] = Math.min(previous[j] + 1, current[j - 1] + 1, previous[j - 1] + cost);
			}
			previous = current;
		}
		return 1 - previous[len2] / Math.max(len1, len2);
	}
};
|
|
228
|
+
|
|
229
|
+
//#endregion
|
|
230
|
+
//#region src/TestSuiteRunner.ts
|
|
231
|
+
/**
 * Runs test cases one at a time against an agent runtime by injecting each
 * question through the provider named "mock" and waiting for the runtime's
 * "message:processed" event, then scores the reply with a TestCaseEvaluator.
 */
var TestSuiteRunner = class {
	evaluator;
	agentOS;
	timeout;
	strategy;
	/**
	 * @param {object} config
	 * @param {object} config.agentOS Runtime exposing `once(event, listener)` and a `providers` map.
	 * @param {object} [config.llm] LLM client handed to the evaluator.
	 * @param {number} [config.timeout] Per-case timeout in ms (default 30000).
	 * @param {string} [config.strategy] Evaluation strategy passed to the evaluator.
	 */
	constructor(config) {
		this.agentOS = config.agentOS;
		this.evaluator = new TestCaseEvaluator(config.llm);
		this.timeout = config.timeout || 3e4;
		this.strategy = config.strategy;
	}
	/**
	 * Run all test cases sequentially and aggregate the outcome.
	 * @param {object[]} testCases
	 * @returns {Promise<object>} `{ total, passed, failed, averageScore, byTag, results, totalDuration, totalCost }`.
	 *   `averageScore` is 0 for an empty suite. Cases without tags (or with an
	 *   empty tag list) are counted under "untagged".
	 */
	async runSuite(testCases) {
		const results = [];
		const startTime = Date.now();
		for (const testCase of testCases) results.push(await this.runTestCase(testCase));
		const totalDuration = Date.now() - startTime;
		const passed = results.filter((r) => r.evaluation.passed).length;
		const failed = results.length - passed;
		const scoreSum = results.reduce((sum, r) => sum + r.evaluation.score, 0);
		// Guard the division so an empty suite reports 0 instead of NaN.
		const averageScore = results.length > 0 ? scoreSum / results.length : 0;
		const totalCost = results.reduce((sum, r) => sum + r.cost, 0);
		const byTag = {};
		for (const result of results) {
			const tags = result.testCase.tags?.length ? result.testCase.tags : ["untagged"];
			for (const tag of tags) {
				// Own-property check: a tag such as "constructor" must not hit Object.prototype.
				if (!Object.hasOwn(byTag, tag)) byTag[tag] = {
					total: 0,
					passed: 0,
					avgScore: 0
				};
				byTag[tag].total++;
				if (result.evaluation.passed) byTag[tag].passed++;
				byTag[tag].avgScore += result.evaluation.score;
			}
		}
		// avgScore held the running sum until now; turn it into the mean.
		for (const stats of Object.values(byTag)) stats.avgScore /= stats.total;
		return {
			total: results.length,
			passed,
			failed,
			averageScore,
			byTag,
			results,
			totalDuration,
			totalCost
		};
	}
	/**
	 * Run a single test case. Failures (timeout, runtime "error" event, missing
	 * mock provider) do not throw; they are reported as an `Error: …` response
	 * text which is then evaluated like any other response.
	 * @param {object} testCase
	 * @returns {Promise<object>} `{ testCase, agentResponse, toolsCalled, evaluation, duration, cost }`.
	 */
	async runTestCase(testCase) {
		const startTime = Date.now();
		let agentResponse = "";
		let toolsCalled = [];
		let cost = 0;
		let pending;
		try {
			// Resolve the provider first so a missing provider never leaves a timer behind.
			const mockProvider = Array.from(this.agentOS.providers.values()).find((p) => p.name === "mock");
			if (!mockProvider) throw new Error("MockProvider not found in Operor");
			pending = this.waitForResponse(testCase);
			const testPhone = testCase.persona || "test-user";
			mockProvider.simulateIncomingMessage(testPhone, testCase.question);
			const response = await pending.promise;
			agentResponse = response.text;
			toolsCalled = response.toolCalls || [];
			cost = response.cost;
		} catch (error) {
			agentResponse = `Error: ${error instanceof Error ? error.message : String(error)}`;
		} finally {
			// Always release the timer and listeners, whichever way the case ended.
			pending?.dispose();
		}
		const duration = Date.now() - startTime;
		const evaluation = await this.evaluator.evaluate(testCase, agentResponse, toolsCalled, this.strategy);
		return {
			testCase,
			agentResponse,
			toolsCalled,
			evaluation,
			duration,
			cost
		};
	}
	/**
	 * Subscribe to the runtime's next "message:processed" / "error" event with
	 * a timeout.
	 * @param {object} testCase Used for the timeout message.
	 * @returns {{promise: Promise<{text: string, toolCalls: Array, cost: number}>, dispose: Function}}
	 *   `dispose` clears the timer and removes both listeners; it is safe to
	 *   call more than once. Listener removal is skipped when the runtime
	 *   exposes neither `off` nor `removeListener`.
	 */
	waitForResponse(testCase) {
		let dispose = () => {};
		const promise = new Promise((resolve, reject) => {
			const onProcessed = (event) => {
				dispose();
				resolve({
					text: event.response.text,
					toolCalls: event.response.toolCalls || [],
					cost: event.cost || 0
				});
			};
			const onError = (event) => {
				dispose();
				reject(event.error);
			};
			const timeoutId = setTimeout(() => {
				dispose();
				reject(/* @__PURE__ */ new Error(`Test case ${testCase.id} timed out after ${this.timeout}ms`));
			}, this.timeout);
			dispose = () => {
				clearTimeout(timeoutId);
				const remove = this.agentOS.off ?? this.agentOS.removeListener;
				remove?.call(this.agentOS, "message:processed", onProcessed);
				remove?.call(this.agentOS, "error", onError);
			};
			this.agentOS.once("message:processed", onProcessed);
			this.agentOS.once("error", onError);
		});
		return {
			promise,
			dispose
		};
	}
};
|
|
326
|
+
|
|
327
|
+
//#endregion
|
|
328
|
+
//#region src/SkillTestHarness.ts
|
|
329
|
+
/**
 * Safety wrapper for Integration instances during testing.
 * Provides operation limits, dry-run mode, and audit logging.
 *
 * Every tool of the wrapped integration is replaced by a guarded version
 * that enforces, in this order: the operation limit, the write permission,
 * and the destructive permission. Tools not listed in any of the static
 * sets are classified as "write".
 */
var SkillTestHarness = class SkillTestHarness {
	name;
	inner;
	config;
	auditLog = [];
	operationCount = 0;
	tools;
	static READ_TOOLS = new Set([
		"get_order",
		"search_products",
		"salesforce_get_contact",
		"salesforce_get_cases",
		"stripe_get_customer"
	]);
	static WRITE_TOOLS = new Set([
		"create_discount",
		"salesforce_update_contact",
		"salesforce_create_case",
		"salesforce_add_note"
	]);
	static DESTRUCTIVE_TOOLS = new Set(["stripe_create_refund"]);
	/**
	 * @param {object} inner Integration to wrap (`name`, `tools`, `initialize()`, `isReady()`).
	 * @param {object} [config]
	 * @param {boolean} [config.allowWrites=false] Permit tools classified as "write".
	 * @param {boolean} [config.allowDestructive=false] Permit tools classified as "destructive".
	 * @param {number} [config.maxOperations=10] Maximum number of permitted tool calls.
	 * @param {number} [config.timeoutMs=30000] Per-call timeout.
	 * @param {boolean} [config.dryRun=false] Record calls without executing the inner tool.
	 */
	constructor(inner, config = {}) {
		this.inner = inner;
		this.name = inner.name;
		this.config = {
			allowWrites: config.allowWrites ?? false,
			allowDestructive: config.allowDestructive ?? false,
			maxOperations: config.maxOperations ?? 10,
			timeoutMs: config.timeoutMs ?? 3e4,
			dryRun: config.dryRun ?? false
		};
		this.tools = this.wrapTools(inner.tools);
	}
	async initialize() {
		return this.inner.initialize();
	}
	/** @deprecated Use initialize() instead. */
	async authenticate() {
		return this.inner.initialize();
	}
	isReady() {
		return this.inner.isReady();
	}
	/** @deprecated Use isReady() instead. */
	isAuthenticated() {
		return this.inner.isReady();
	}
	/**
	 * Build the guarded tool map: same keys and metadata as `innerTools`, with
	 * `execute` routed through the safety checks.
	 * @param {Record<string, {execute: Function}>} innerTools
	 * @returns {Record<string, object>}
	 */
	wrapTools(innerTools) {
		const wrapped = {};
		for (const [toolName, tool] of Object.entries(innerTools)) wrapped[toolName] = {
			...tool,
			execute: async (params) => this.runGuarded(toolName, tool, params)
		};
		return wrapped;
	}
	/**
	 * Execute one tool call under the harness rules and record it in the
	 * audit log. Blocked calls throw before being counted or logged.
	 * @param {string} toolName
	 * @param {{execute: Function}} tool Original (unwrapped) tool.
	 * @param {*} params Parameters forwarded to the tool.
	 * @returns {Promise<*>} The tool result, or a `{ dryRun, wouldExecute, params }` stub in dry-run mode.
	 * @throws {Error} When a limit or permission blocks the call, on timeout,
	 *   or when the tool itself fails (the original error is rethrown).
	 */
	async runGuarded(toolName, tool, params) {
		const startTime = Date.now();
		const classification = this.classifyTool(toolName);
		if (this.operationCount >= this.config.maxOperations) throw new Error(`SkillTestHarness: Max operations limit reached (${this.config.maxOperations})`);
		if (classification === "write" && !this.config.allowWrites) throw new Error(`SkillTestHarness: Write operation '${toolName}' blocked (allowWrites=false)`);
		if (classification === "destructive" && !this.config.allowDestructive) throw new Error(`SkillTestHarness: Destructive operation '${toolName}' blocked (allowDestructive=false)`);
		this.operationCount++;
		const record = (result) => {
			this.auditLog.push({
				name: toolName,
				params,
				result,
				timestamp: startTime,
				duration: Date.now() - startTime,
				classification
			});
		};
		if (this.config.dryRun) {
			const result = {
				dryRun: true,
				wouldExecute: toolName,
				params
			};
			record(result);
			return result;
		}
		let timeoutId;
		const timeoutPromise = new Promise((_, reject) => {
			timeoutId = setTimeout(() => {
				reject(/* @__PURE__ */ new Error(`SkillTestHarness: Operation '${toolName}' timed out after ${this.config.timeoutMs}ms`));
			}, this.config.timeoutMs);
		});
		try {
			const result = await Promise.race([tool.execute(params), timeoutPromise]);
			record(result);
			return result;
		} catch (error) {
			record({ error: error instanceof Error ? error.message : "Unknown error" });
			throw error;
		} finally {
			// Without this the pending timer keeps the event loop alive for up to timeoutMs.
			clearTimeout(timeoutId);
		}
	}
	/**
	 * Classify a tool as "read", "destructive" or "write".
	 * Unknown tools are treated as "write" so they are blocked by default.
	 * @param {string} toolName
	 * @returns {"read"|"write"|"destructive"}
	 */
	classifyTool(toolName) {
		if (SkillTestHarness.READ_TOOLS.has(toolName)) return "read";
		if (SkillTestHarness.DESTRUCTIVE_TOOLS.has(toolName)) return "destructive";
		return "write";
	}
	/**
	 * Get the audit log of all operations performed
	 */
	getAuditLog() {
		return [...this.auditLog];
	}
	/**
	 * Reset the audit log and operation counter
	 */
	resetAuditLog() {
		this.auditLog = [];
		this.operationCount = 0;
	}
	/**
	 * Get the current operation count
	 */
	getOperationCount() {
		return this.operationCount;
	}
};
|
|
466
|
+
|
|
467
|
+
//#endregion
|
|
468
|
+
//#region src/CustomerSimulator.ts
|
|
469
|
+
/** Behaviour descriptions for the built-in customer personas, keyed by persona name. */
const PERSONA_PROMPTS = {
	polite: "You are polite, patient, and use courteous language.",
	frustrated: "You are frustrated and impatient. You express dissatisfaction but remain civil.",
	confused: "You are confused and unsure. You ask clarifying questions and sometimes misunderstand.",
	terse: "You give very short, minimal responses. One sentence max.",
	verbose: "You are detailed and talkative. You provide lots of context and background."
};
/**
 * Compose the system prompt that tells the LLM how to play the customer.
 * @param {string} persona Built-in persona name, or any free-form style label.
 * @param {object} [context] Optional `scenario`, `maxTurns` and `currentTurn`
 *   (the turn number defaults to 1 when `maxTurns` is set without it).
 * @returns {string} Prompt ending with the required JSON reply format.
 */
function buildSystemPrompt(persona, context) {
	const personaLine = PERSONA_PROMPTS[persona] || `You are a customer with a ${persona} communication style.`;
	let scenarioLine = "";
	if (context?.scenario) scenarioLine = `\nScenario: ${context.scenario}`;
	let turnLine = "";
	if (context?.maxTurns) {
		const turnNumber = context.currentTurn ?? 1;
		turnLine = `\nThis conversation has a maximum of ${context.maxTurns} turns. You are on turn ${turnNumber}.`;
	}
	const lines = [
		"You are simulating a customer in a support conversation for testing purposes.",
		personaLine + scenarioLine + turnLine,
		"",
		"Rules:",
		"- Stay in character throughout the conversation.",
		"- Escalate naturally if your issue isn't being resolved.",
		"- Set shouldContinue to false when your issue is resolved or you have no more questions.",
		"- If the agent asks a question, answer it in character.",
		"",
		"Respond with ONLY valid JSON (no markdown, no code fences):",
		'{"message": "your response as the customer", "shouldContinue": true}'
	];
	return lines.join("\n");
}
|
|
489
|
+
/**
 * Convert conversation turns into chat messages for the LLM.
 * Turns spoken by the "customer" become "user" messages; every other role
 * becomes "assistant".
 * @param {Array<{role: string, message: string}>} history
 * @returns {Array<{role: string, content: string}>}
 */
function formatHistory(history) {
	const toChatMessage = (turn) => {
		const role = turn.role === "customer" ? "user" : "assistant";
		return {
			role,
			content: turn.message
		};
	};
	return history.map((turn) => toChatMessage(turn));
}
|
|
495
|
+
/**
 * Interpret the simulated customer's LLM reply.
 *
 * The expected reply is a JSON object `{"message": ..., "shouldContinue": ...}`,
 * optionally wrapped in markdown code fences. Anything else is treated as a
 * plain-text message and the conversation continues:
 * - text that is not valid JSON is used verbatim (trimmed);
 * - a bare JSON string is used as the message;
 * - other non-object JSON (numbers, arrays, null, ...) falls back to the raw text.
 *
 * @param {string} text Raw LLM output.
 * @returns {{message: string, shouldContinue: boolean}}
 */
function parseResponse(text) {
	const cleaned = text.replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
	const asPlainText = (message) => ({
		message,
		shouldContinue: true
	});
	let parsed;
	try {
		parsed = JSON.parse(cleaned);
	} catch {
		return asPlainText(text.trim());
	}
	if (typeof parsed === "string") return asPlainText(parsed);
	// Without this guard a reply such as `42` would yield an empty message and end the conversation.
	if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) return asPlainText(text.trim());
	return {
		message: String(parsed.message ?? ""),
		shouldContinue: Boolean(parsed.shouldContinue)
	};
}
|
|
510
|
+
/**
 * Produces the customer's side of a simulated support conversation, either
 * by replaying scripted responses or by asking an LLM to role-play.
 */
var CustomerSimulator = class {
	llm;
	/**
	 * @param {object} [options]
	 * @param {object} [options.llmProvider] Client with `complete(messages, options)`;
	 *   only needed when no scripted responses are supplied.
	 */
	constructor(options) {
		this.llm = options?.llmProvider;
	}
	/**
	 * Produce the next customer message.
	 *
	 * Scripted mode (non-empty `context.scriptedResponses`): returns the entry
	 * at `context.currentTurn`, or at the number of customer turns already in
	 * `history` when `currentTurn` is not given. `shouldContinue` is true only
	 * while entries remain after it; past the end, the last entry is repeated
	 * with `shouldContinue: false`.
	 *
	 * @param {string} persona Persona name used for the LLM prompt.
	 * @param {Array<{role: string, message: string}>} history Conversation so far.
	 * @param {object} [context] Scenario context.
	 * @returns {Promise<{message: string, shouldContinue: boolean}>}
	 * @throws {Error} In non-scripted mode when no LLM provider was configured.
	 */
	async generateMessage(persona, history, context) {
		if (context?.scriptedResponses?.length) {
			const script = context.scriptedResponses;
			const lastIndex = script.length - 1;
			const turnIndex = context.currentTurn ?? history.filter((entry) => entry.role === "customer").length;
			if (turnIndex < script.length) return {
				message: script[turnIndex],
				shouldContinue: turnIndex < lastIndex
			};
			return {
				message: script[lastIndex],
				shouldContinue: false
			};
		}
		if (!this.llm) throw new Error("CustomerSimulator requires an LLM provider for non-scripted mode");
		const systemMessage = {
			role: "system",
			content: buildSystemPrompt(persona, context)
		};
		const completion = await this.llm.complete([systemMessage, ...formatHistory(history)], {
			temperature: .7,
			maxTokens: 500
		});
		return parseResponse(completion.text);
	}
};
|
|
539
|
+
|
|
540
|
+
//#endregion
|
|
541
|
+
//#region src/ConversationEvaluator.ts
|
|
542
|
+
/**
 * Build the prompt that asks the LLM to grade a whole conversation.
 * @param {object} config
 * @param {string} config.scenario Scenario description.
 * @param {string} config.persona Customer persona.
 * @param {Array<{role: string, message: string}>} config.turns Conversation turns, in order.
 * @param {Array<{name: string, params: *, result: *}>} [config.toolsCalled]
 *   Tool calls made during the conversation; omitted or empty renders as "None".
 * @param {string[]} [config.expectedTools] Tools the agent was expected to call.
 * @param {string} [config.expectedOutcome] Expected end state.
 * @returns {string} Prompt requesting a JSON verdict with per-dimension scores.
 */
function buildEvaluationPrompt(config) {
	const { scenario, persona, turns, toolsCalled, expectedTools, expectedOutcome } = config;
	const conversationText = turns.map((t, i) => `Turn ${i + 1} [${t.role}]: ${t.message}`).join("\n");
	// One line per call: name(params) → result, with params and result serialized as JSON.
	const toolsText = toolsCalled?.length ? toolsCalled.map((tc) => `- ${tc.name}(${JSON.stringify(tc.params)}) → ${JSON.stringify(tc.result)}`).join("\n") : "None";
	return `You are evaluating a customer support conversation for testing purposes.

Scenario: ${scenario}
Customer Persona: ${persona}
Expected Tools: ${expectedTools?.length ? expectedTools.join(", ") : "Not specified"}
Expected Outcome: ${expectedOutcome || "Not specified"}

Conversation:
${conversationText}

Tools Called:
${toolsText}

Evaluate the conversation on these dimensions (score 1-5 for each):

1. Accuracy (1-5): Did the agent provide factually correct information based on tool results?
2. Tool Usage (1-5): Did the agent call the right tools at the right time?
3. Tone (1-5): Was the agent's tone appropriate, professional, and empathetic?
4. Resolution (1-5): Was the customer's issue resolved or properly addressed?

Overall assessment:
- "pass": All criteria met, customer satisfied (scores mostly 4-5)
- "partial": Some issues but acceptable (scores mostly 3-4)
- "fail": Significant problems (any score 1-2, or multiple scores below 3)

Respond with ONLY valid JSON (no markdown, no code fences):
{
"overall": "pass" | "fail" | "partial",
"scores": {
"accuracy": <integer 1-5>,
"toolUsage": <integer 1-5>,
"tone": <integer 1-5>,
"resolution": <integer 1-5>
},
"feedback": "<brief explanation of the evaluation>"
}`;
}
|
|
583
|
+
/**
 * Parse the LLM's conversation verdict into a normalized evaluation.
 *
 * Markdown code fences are stripped first. Unparseable replies yield a
 * failing evaluation with all scores at 1. For parseable replies:
 * - `overall` must be "pass", "partial" or "fail"; anything else becomes "fail";
 * - each score is rounded to an integer and clamped to 1..5; missing or
 *   non-numeric scores become 1;
 * - `feedback` is stringified, defaulting to "No feedback provided".
 *
 * @param {string} text Raw LLM output.
 * @returns {{overall: string, scores: {accuracy: number, toolUsage: number, tone: number, resolution: number}, feedback: string}}
 */
function parseEvaluationResponse(text) {
	const cleaned = text.replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
	let parsed;
	try {
		parsed = JSON.parse(cleaned);
	} catch {
		parsed = null;
	}
	if (parsed == null) return {
		overall: "fail",
		scores: {
			accuracy: 1,
			toolUsage: 1,
			tone: 1,
			resolution: 1
		},
		feedback: "Failed to parse evaluation response"
	};
	// Keep scores inside the 1-5 scale the prompt asks for, whatever the model returned.
	const toScore = (value) => {
		const rounded = Math.round(Number(value ?? 1));
		return Number.isFinite(rounded) ? Math.min(5, Math.max(1, rounded)) : 1;
	};
	const allowedVerdicts = ["pass", "partial", "fail"];
	return {
		overall: allowedVerdicts.includes(parsed.overall) ? parsed.overall : "fail",
		scores: {
			accuracy: toScore(parsed.scores?.accuracy),
			toolUsage: toScore(parsed.scores?.toolUsage),
			tone: toScore(parsed.scores?.tone),
			resolution: toScore(parsed.scores?.resolution)
		},
		feedback: String(parsed.feedback ?? "No feedback provided")
	};
}
|
|
610
|
+
/**
 * Check one success criterion against the conversation turns.
 *
 * Supported `criteria.type` values:
 * - "tool_called": some turn has a tool call named `value`;
 * - "response_contains": some agent turn contains `value` (case-insensitive);
 * - "intent_matched": some turn (any role) contains `value` (case-insensitive);
 * - "turns_under": the number of turns is strictly less than `value`;
 * - "custom": `value` is a synchronous function `(turns) => truthy/falsy`.
 *   A function that throws, or that returns a promise, fails the criterion
 *   (a promise is always truthy, so it can never be a meaningful verdict here).
 * Any other type fails with an "Unknown criteria type" message.
 *
 * @param {{type: string, value: *}} criteria
 * @param {Array<{role: string, message: string, toolCalls?: Array<{name: string}>}>} turns
 * @returns {{criteria: object, passed: boolean, details: string}}
 */
function evaluateCriteria(criteria, turns) {
	const { type, value } = criteria;
	const verdict = (passed, details) => ({
		criteria,
		passed,
		details
	});
	switch (type) {
		case "tool_called": {
			const toolName = String(value);
			const called = turns.some((turn) => Boolean(turn.toolCalls?.some((tc) => tc.name === toolName)));
			return verdict(called, called ? `Tool "${toolName}" was called` : `Tool "${toolName}" was not called`);
		}
		case "response_contains": {
			const searchText = String(value).toLowerCase();
			const found = turns.some((turn) => turn.role === "agent" && turn.message.toLowerCase().includes(searchText));
			return verdict(found, found ? `Agent response contains "${value}"` : `Agent response does not contain "${value}"`);
		}
		case "intent_matched": {
			const intentText = String(value).toLowerCase();
			const matched = turns.some((turn) => turn.message.toLowerCase().includes(intentText));
			return verdict(matched, matched ? `Intent "${value}" was matched` : `Intent "${value}" was not matched`);
		}
		case "turns_under": {
			const maxTurns = Number(value);
			const actualTurns = turns.length;
			const passed = actualTurns < maxTurns;
			return verdict(passed, passed ? `Conversation completed in ${actualTurns} turns (under ${maxTurns})` : `Conversation took ${actualTurns} turns (expected under ${maxTurns})`);
		}
		case "custom": {
			if (typeof value !== "function") return verdict(false, "Custom criteria value must be a function");
			try {
				const outcome = value(turns);
				if (typeof outcome?.then === "function") {
					// Swallow a possible later rejection; the criterion has already failed.
					Promise.resolve(outcome).catch(() => {});
					return verdict(false, "Custom criteria must be synchronous (function returned a promise)");
				}
				const passed = Boolean(outcome);
				return verdict(passed, passed ? "Custom criteria passed" : "Custom criteria failed");
			} catch (error) {
				return verdict(false, `Custom criteria error: ${error}`);
			}
		}
		default: return verdict(false, `Unknown criteria type: ${type}`);
	}
}
|
|
677
|
+
/**
 * Grades a finished conversation by combining rule-based success criteria
 * with an optional LLM verdict.
 */
var ConversationEvaluator = class {
	llm;
	/**
	 * @param {object} [options]
	 * @param {object} [options.llmProvider] Client with `complete(messages, options)`.
	 */
	constructor(options) {
		this.llm = options?.llmProvider;
	}
	/**
	 * Evaluate a conversation.
	 *
	 * With an LLM: the LLM verdict is returned with the criteria results
	 * attached, and a "pass" is downgraded to "partial" when any criterion
	 * failed. Without an LLM: the verdict comes from the criteria alone (all
	 * passed = "pass", some = "partial", none = "fail") with neutral scores
	 * of 3; having no criteria at all is a "fail" with scores of 1.
	 *
	 * @param {object} config Conversation data (`turns`, optional `successCriteria`,
	 *   plus the fields used to build the LLM prompt).
	 * @returns {Promise<object>} `{ overall, scores, feedback, criteriaResults }`.
	 */
	async evaluate(config) {
		const { turns, successCriteria } = config;
		let criteriaResults = [];
		if (successCriteria) criteriaResults = successCriteria.map((criterion) => evaluateCriteria(criterion, turns));
		if (this.llm) {
			const request = [{
				role: "user",
				content: buildEvaluationPrompt(config)
			}];
			const reply = await this.llm.complete(request, {
				temperature: 0,
				maxTokens: 1e3
			});
			const verdict = parseEvaluationResponse(reply.text);
			verdict.criteriaResults = criteriaResults;
			const anyCriterionFailed = criteriaResults.some((cr) => !cr.passed);
			if (anyCriterionFailed && verdict.overall === "pass") verdict.overall = "partial";
			return verdict;
		}
		const uniformScores = (level) => ({
			accuracy: level,
			toolUsage: level,
			tone: level,
			resolution: level
		});
		if (criteriaResults.length === 0) return {
			overall: "fail",
			scores: uniformScores(1),
			feedback: "No criteria specified",
			criteriaResults
		};
		const passedCount = criteriaResults.filter((cr) => cr.passed).length;
		let overall = "fail";
		if (passedCount === criteriaResults.length) overall = "pass";
		else if (passedCount > 0) overall = "partial";
		return {
			overall,
			scores: uniformScores(3),
			feedback: criteriaResults.map((cr) => cr.details).join("; "),
			criteriaResults
		};
	}
};
|
|
724
|
+
|
|
725
|
+
//#endregion
|
|
726
|
+
//#region src/utils.ts
|
|
727
|
+
/**
 * Format the current local time as a zero-padded 24-hour `HH:MM:SS` string.
 *
 * The fields are read and padded directly instead of going through
 * `toLocaleTimeString("en-US", { hour12: false })`: on engines that resolve
 * `hour12: false` to the `h24` hour cycle for en-US, that call renders the
 * midnight hour as `24:MM:SS`. Here the hour is always in the range 00-23.
 *
 * @returns {string} Current local time, e.g. `"09:05:42"`.
 */
function formatTimestamp() {
	const now = new Date();
	return [now.getHours(), now.getMinutes(), now.getSeconds()]
		.map((part) => String(part).padStart(2, "0"))
		.join(":");
}
|
|
733
|
+
|
|
734
|
+
//#endregion
|
|
735
|
+
//#region src/ConversationRunner.ts
|
|
736
|
+
/**
 * Drives multi-turn test conversations against an agent.
 *
 * Customer messages are injected through the provider named "mock" that is
 * registered on `agentOS`; replies are collected from the `message:processed`
 * event and the finished transcript is handed to the conversation evaluator.
 */
var ConversationRunner = class {
	agentOS;
	customerSimulator;
	conversationEvaluator;
	timeout;
	verbose;
	/**
	 * @param {object} config
	 * @param {object} config.agentOS Event emitter exposing a `providers` map.
	 * @param {object} config.customerSimulator Object exposing `generateMessage(persona, turns, context)`.
	 * @param {object} config.conversationEvaluator Object exposing `evaluate(config)`.
	 * @param {number} [config.timeout] Per-reply timeout in ms; falsy values fall back to 30000.
	 * @param {boolean} [config.verbose] Log every turn and the evaluation to the console.
	 */
	constructor(config) {
		this.agentOS = config.agentOS;
		this.customerSimulator = config.customerSimulator;
		this.conversationEvaluator = config.conversationEvaluator;
		this.timeout = config.timeout || 3e4;
		this.verbose = config.verbose ?? false;
	}
	/**
	 * Run one scenario to completion and evaluate it.
	 *
	 * Never rejects: any error (missing provider, reply timeout, agent error,
	 * evaluator failure) is converted into a failed result whose feedback is
	 * `Error: <message>`.
	 *
	 * @param {object} scenario Scenario definition (`id`, `name`, `description`,
	 *   `persona`, `maxTurns`, optional `scriptedResponses`, `expectedTools`,
	 *   `expectedOutcome`, `successCriteria`).
	 * @returns {Promise<object>} `{ scenario, passed, turns, evaluation, duration, cost }`.
	 */
	async runScenario(scenario) {
		const startTime = Date.now();
		const turns = [];
		const toolsCalled = [];
		let totalCost = 0;
		try {
			const mockProvider = this.getMockProvider();
			const customerId = `test-customer-${scenario.id}`;
			const initialMessage = scenario.scriptedResponses?.[0] || "Hello, I need help";
			if (this.verbose) {
				console.log(`\n=== Starting Scenario: ${scenario.name} ===`);
				console.log(`Persona: ${scenario.persona}`);
				console.log(`Max Turns: ${scenario.maxTurns}\n`);
			}
			let shouldContinue = true;
			let currentTurn = 0;
			// Simulator answer obtained by the continuation check at the end of the
			// previous turn. Reusing it keeps the simulator to one call per follow-up
			// turn and guarantees that the message sent is the one the simulator
			// produced together with its "keep going" decision.
			let lookahead = null;
			while (shouldContinue && currentTurn < scenario.maxTurns) {
				let customerMessage;
				if (currentTurn === 0) customerMessage = initialMessage;
				else if (lookahead) customerMessage = lookahead.message;
				else customerMessage = (await this.customerSimulator.generateMessage(scenario.persona, turns, {
					scenario: scenario.description,
					maxTurns: scenario.maxTurns,
					currentTurn,
					scriptedResponses: scenario.scriptedResponses
				})).message;
				lookahead = null;
				if (this.verbose) console.log(`[${formatTimestamp()}] Turn ${currentTurn + 1} [customer]: ${customerMessage}`);
				const agentResponse = await this.waitForAgentResponse(mockProvider, customerId, customerMessage);
				if (this.verbose) {
					console.log(`[${formatTimestamp()}] Turn ${currentTurn + 1} [agent]: ${agentResponse.text}`);
					if (agentResponse.toolCalls?.length) console.log(` Tools called: ${agentResponse.toolCalls.map((tc) => tc.name).join(", ")}`);
				}
				turns.push({
					role: "customer",
					message: customerMessage
				});
				turns.push({
					role: "agent",
					message: agentResponse.text,
					toolCalls: agentResponse.toolCalls
				});
				if (agentResponse.toolCalls) toolsCalled.push(...agentResponse.toolCalls);
				totalCost += agentResponse.cost || 0;
				currentTurn++;
				if (scenario.scriptedResponses) {
					// Scripted conversations end when the script runs out.
					shouldContinue = currentTurn < scenario.scriptedResponses.length;
				} else if (currentTurn < scenario.maxTurns) {
					// Unscripted conversations end when the simulated customer says so.
					lookahead = await this.customerSimulator.generateMessage(scenario.persona, turns, {
						scenario: scenario.description,
						maxTurns: scenario.maxTurns,
						currentTurn
					});
					shouldContinue = lookahead.shouldContinue;
					if (!shouldContinue && this.verbose) console.log("Customer satisfied, ending conversation.");
				}
			}
			if (this.verbose && currentTurn >= scenario.maxTurns) console.log(`\nReached max turns (${scenario.maxTurns})`);
			const evaluation = await this.conversationEvaluator.evaluate({
				scenario: scenario.description,
				persona: scenario.persona,
				turns,
				toolsCalled,
				expectedTools: scenario.expectedTools,
				expectedOutcome: scenario.expectedOutcome,
				successCriteria: scenario.successCriteria
			});
			if (this.verbose) {
				console.log(`\n=== Evaluation ===`);
				console.log(`Overall: ${evaluation.overall}`);
				console.log(`Scores: ${JSON.stringify(evaluation.scores)}`);
				console.log(`Feedback: ${evaluation.feedback}\n`);
			}
			return {
				scenario,
				passed: evaluation.overall === "pass",
				turns,
				evaluation,
				duration: Date.now() - startTime,
				cost: totalCost
			};
		} catch (error) {
			const duration = Date.now() - startTime;
			const errorMessage = error instanceof Error ? error.message : String(error);
			if (this.verbose) console.error(`\nError in scenario ${scenario.name}: ${errorMessage}\n`);
			// Keep whatever transcript and cost were accumulated before the failure.
			return {
				scenario,
				passed: false,
				turns,
				evaluation: {
					overall: "fail",
					scores: {
						accuracy: 1,
						toolUsage: 1,
						tone: 1,
						resolution: 1
					},
					feedback: `Error: ${errorMessage}`
				},
				duration,
				cost: totalCost
			};
		}
	}
	/**
	 * Run scenarios one after another, in the given order.
	 *
	 * @param {object[]} scenarios Scenario definitions.
	 * @returns {Promise<object[]>} One result per scenario, in input order.
	 */
	async runScenarios(scenarios) {
		const results = [];
		for (const scenario of scenarios) {
			const result = await this.runScenario(scenario);
			results.push(result);
		}
		return results;
	}
	/**
	 * Find the provider named "mock" among `agentOS.providers`.
	 *
	 * @returns {object} The mock provider.
	 * @throws {Error} When no provider with that name is registered.
	 */
	getMockProvider() {
		const mockProvider = Array.from(this.agentOS.providers.values()).find((p) => p.name === "mock");
		if (!mockProvider) throw new Error("MockProvider not found in Operor. Add it with agentOS.addProvider()");
		return mockProvider;
	}
	/**
	 * Inject a customer message and wait for the agent's reply.
	 *
	 * Resolves on the next `message:processed` event and rejects on the next
	 * `error` event, on timeout, or when injecting the message fails. Whatever
	 * the outcome, the timer and both listeners are released exactly once.
	 *
	 * @param {object} mockProvider Provider exposing `simulateIncomingMessage(customerId, message)`.
	 * @param {string} customerId Identifier passed to the provider.
	 * @param {string} message Customer message text.
	 * @returns {Promise<{text: string, toolCalls: object[], cost: number}>}
	 */
	async waitForAgentResponse(mockProvider, customerId, message) {
		return new Promise((resolve, reject) => {
			let settled = false;
			let timeoutId;
			// Single exit point: the first outcome wins and tears everything down.
			const settle = (finish) => {
				if (settled) return;
				settled = true;
				clearTimeout(timeoutId);
				this.agentOS.removeListener("message:processed", onProcessed);
				this.agentOS.removeListener("error", onError);
				finish();
			};
			const fail = (error) => settle(() => reject(error));
			const onProcessed = (event) => settle(() => {
				try {
					resolve({
						text: event.response.text,
						toolCalls: event.response.toolCalls || [],
						cost: event.cost || 0
					});
				} catch (error) {
					// A malformed event must reject instead of leaving the promise pending.
					reject(error);
				}
			});
			// Accept both `{ error }` payloads and a bare error value.
			const onError = (event) => fail(event?.error ?? event);
			timeoutId = setTimeout(() => fail(new Error(`Agent response timed out after ${this.timeout}ms`)), this.timeout);
			this.agentOS.once("message:processed", onProcessed);
			this.agentOS.once("error", onError);
			try {
				const dispatched = mockProvider.simulateIncomingMessage(customerId, message);
				// If the provider returns a promise, surface its rejection here too.
				if (typeof dispatched?.then === "function") dispatched.then(void 0, fail);
			} catch (error) {
				fail(error);
			}
		});
	}
};
|
|
901
|
+
|
|
902
|
+
//#endregion
|
|
903
|
+
//#region src/scenarios.ts
|
|
904
|
+
/**
 * Built-in e-commerce support scenarios.
 *
 * Every entry is fully scripted: `scriptedResponses` lists the customer's
 * messages in order, `expectedTools` names the tools the agent is expected to
 * call, and `maxTurns` caps the number of customer/agent exchanges.
 */
const ECOMMERCE_SCENARIOS = [
	// Late delivery where the customer pushes for compensation.
	{
		id: "delayed-order-compensation",
		name: "Delayed order with compensation",
		description: "Customer asks about a delayed order and expects compensation",
		persona: "Frustrated customer whose order #12345 was supposed to arrive 3 days ago",
		maxTurns: 6,
		expectedTools: [
			"get_order",
			"create_discount"
		],
		expectedOutcome: "Agent finds order, acknowledges delay, offers discount",
		scriptedResponses: [
			"Where is my order #12345? It was supposed to arrive 3 days ago!",
			"This is really frustrating. Can you do anything to make up for this?",
			"Okay, I appreciate the discount. Thank you."
		]
	},

	// Lookup of an order number that does not exist.
	{
		id: "order-not-found",
		name: "Order not found",
		description: "Customer asks about an order that does not exist in the system",
		persona: "Confused customer who may have the wrong order number",
		maxTurns: 4,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent attempts lookup, explains order not found, asks customer to verify",
		scriptedResponses: [
			"Can you check on order #99999 for me?",
			"Hmm, let me double check the number and get back to you."
		]
	},

	// Catalogue search with a follow-up on price.
	{
		id: "product-inquiry",
		name: "Product inquiry",
		description: "Customer searches for a product and asks about availability",
		persona: "Curious shopper looking for electronics",
		maxTurns: 4,
		expectedTools: [
			"search_products"
		],
		expectedOutcome: "Agent searches products and provides relevant results",
		scriptedResponses: [
			"Do you have any wireless headphones in stock?",
			"What about the price range? Anything under $200?",
			"Great, thanks for the info!"
		]
	},

	// Return of a defective item from a delivered order.
	{
		id: "return-request",
		name: "Return request",
		description: "Customer wants to return a recently delivered item",
		persona: "Polite customer who received a defective product from order #67890",
		maxTurns: 5,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent looks up order, acknowledges issue, explains return process",
		scriptedResponses: [
			"I received order #67890 but one of the items is defective. I would like to return it.",
			"Yes, the wireless mouse stopped working after one day.",
			"Okay, how do I send it back?"
		]
	},

	// Plain greeting; no tools expected.
	{
		id: "greeting",
		name: "Simple greeting",
		description: "Customer says hello and expects a friendly welcome",
		persona: "Friendly first-time visitor",
		maxTurns: 2,
		expectedTools: [],
		expectedOutcome: "Agent responds with a friendly greeting and offers help",
		scriptedResponses: [
			"Hello!"
		]
	},

	// Suspected double charge.
	{
		id: "billing-dispute",
		name: "Billing dispute",
		description: "Customer believes they were charged incorrectly",
		persona: "Concerned customer who noticed a double charge on order #12345",
		maxTurns: 5,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent looks up order, reviews charges, and addresses the billing concern",
		scriptedResponses: [
			"I think I was charged twice for order #12345. Can you check?",
			"My credit card shows two charges of $299.99 on the same day.",
			"Can you escalate this to someone who can issue a correction?"
		]
	},

	// Two unrelated requests in one conversation.
	{
		id: "multi-issue",
		name: "Multi-issue conversation",
		description: "Customer has multiple problems: a delayed order and a product question",
		persona: "Busy customer who wants to resolve everything in one conversation",
		maxTurns: 6,
		expectedTools: [
			"get_order",
			"search_products"
		],
		expectedOutcome: "Agent handles both issues sequentially without losing context",
		scriptedResponses: [
			"Two things: first, where is my order #12345?",
			"Okay thanks. Also, do you carry mechanical keyboards?",
			"Nice, I might order one. Can you also give me a discount for the late delivery?",
			"Sounds good, thanks for handling both issues."
		]
	},

	// Pre-purchase questions from a prospective buyer.
	{
		id: "lead-qualification",
		name: "Lead qualification",
		description: "Potential customer asking pre-purchase questions about products and shipping",
		persona: "Prospective buyer evaluating whether to make a purchase",
		maxTurns: 4,
		expectedTools: [
			"search_products"
		],
		expectedOutcome: "Agent answers product questions and encourages purchase",
		scriptedResponses: [
			"I am thinking about buying some electronics. What do you have?",
			"How fast is shipping usually?",
			"Do you offer any discounts for first-time buyers?"
		]
	},

	// Repeated complaints that escalate before settling.
	{
		id: "frustrated-escalation",
		name: "Frustrated customer escalation",
		description: "Angry customer escalates through multiple complaints",
		persona: "Very frustrated customer who has contacted support multiple times about order #12345",
		maxTurns: 6,
		expectedTools: [
			"get_order",
			"create_discount"
		],
		expectedOutcome: "Agent remains professional, empathizes, and offers concrete resolution",
		scriptedResponses: [
			"This is the THIRD time I am contacting you about order #12345. Still not here!",
			"I have been waiting over a week. This is completely unacceptable.",
			"I want a refund or serious compensation. A 5% coupon is insulting.",
			"Fine, that is more reasonable. But I expect the order to arrive this week."
		]
	},

	// Routine status check on an order that is fine.
	{
		id: "on-time-order-check",
		name: "On-time order status check",
		description: "Customer checks on an order that is on time or already delivered",
		persona: "Polite customer just checking in on order #67890",
		maxTurns: 2,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent confirms order status, no compensation needed",
		scriptedResponses: [
			"Hi, can I get an update on order #67890?",
			"Perfect, thanks!"
		]
	}
];
|
|
1036
|
+
|
|
1037
|
+
//#endregion
|
|
1038
|
+
//#region src/SimulationRunner.ts
|
|
1039
|
+
/**
 * Runs CSV test suites and conversation scenarios against an agent and
 * aggregates everything into a single report.
 */
var SimulationRunner = class {
	agentOS;
	config;
	llm;
	/**
	 * @param {object} options
	 * @param {object} options.agentOS Agent runtime handed to the suite and conversation runners.
	 * @param {object} options.config Simulation settings read by this class:
	 *   `testSuiteFiles`, `conversationScenarios`, `totalConversations`,
	 *   `pauseBetweenMs`, `timeout`.
	 * @param {object} [options.llm] Optional LLM provider (customer simulation,
	 *   evaluation and failure analysis).
	 */
	constructor(options) {
		this.agentOS = options.agentOS;
		this.config = options.config;
		this.llm = options.llm;
	}
	/**
	 * Execute the configured test suites and conversations.
	 *
	 * @param {(completed: number, total: number, result: object) => void} [onProgress]
	 *   Called after each conversation finishes.
	 * @returns {Promise<object>} The simulation report.
	 */
	async run(onProgress) {
		const startTime = Date.now();
		const testSuiteResults = [];
		const conversationResults = [];
		let totalCost = 0;
		if (this.config.testSuiteFiles?.length) {
			const suiteRunner = new TestSuiteRunner({
				agentOS: this.agentOS,
				llm: this.llm,
				timeout: this.config.timeout
			});
			for (const file of this.config.testSuiteFiles) {
				const testCases = await CSVLoader.fromFile(file);
				const result = await suiteRunner.runSuite(testCases);
				testSuiteResults.push(result);
				totalCost += result.totalCost;
			}
		}
		const scenarios = this.resolveScenarios();
		if (scenarios.length) {
			const conversationRunner = new ConversationRunner({
				agentOS: this.agentOS,
				customerSimulator: new CustomerSimulator({ llmProvider: this.llm }),
				conversationEvaluator: new ConversationEvaluator({ llmProvider: this.llm }),
				timeout: this.config.timeout
			});
			const schedule = this.buildSchedule(scenarios);
			const pauseMs = this.config.pauseBetweenMs ?? 500;
			for (let i = 0; i < schedule.length; i++) {
				const scenario = schedule[i];
				const timeoutMs = this.config.timeout || 6e4;
				const result = await this.runScenarioWithDeadline(conversationRunner, scenario, timeoutMs);
				conversationResults.push(result);
				totalCost += result.cost;
				if (onProgress) onProgress(i + 1, schedule.length, result);
				// No pause after the last conversation.
				if (i < schedule.length - 1 && pauseMs > 0) await new Promise((resolve) => setTimeout(resolve, pauseMs));
			}
		}
		const duration = Date.now() - startTime;
		const totalTests = testSuiteResults.reduce((sum, r) => sum + r.total, 0);
		const passedTests = testSuiteResults.reduce((sum, r) => sum + r.passed, 0);
		const failedTests = testSuiteResults.reduce((sum, r) => sum + r.failed, 0);
		const totalConversations = conversationResults.length;
		const passedConversations = conversationResults.filter((r) => r.passed).length;
		const failedConversations = totalConversations - passedConversations;
		const totalItems = totalTests + totalConversations;
		const passedItems = passedTests + passedConversations;
		const overallPassRate = totalItems > 0 ? passedItems / totalItems : 0;
		const averageScores = this.computeAverageScores(conversationResults);
		const scenarioBreakdown = this.computeScenarioBreakdown(conversationResults);
		const toolUsageStats = this.computeToolUsageStats(conversationResults);
		const failedResults = conversationResults.filter((r) => !r.passed);
		let commonFailurePatterns = [];
		let recommendations = [];
		if (failedResults.length > 0) {
			if (this.llm) {
				const analysis = await this.analyzeFailuresWithLLM(failedResults);
				commonFailurePatterns = analysis.patterns;
				recommendations = analysis.recommendations;
			} else {
				commonFailurePatterns = this.heuristicFailurePatterns(failedResults);
				recommendations = this.heuristicRecommendations(failedResults);
			}
		}
		return {
			timestamp: new Date(),
			duration,
			totalConversations,
			passed: passedConversations,
			failed: failedConversations,
			averageScores,
			scenarioBreakdown,
			toolUsageStats,
			commonFailurePatterns,
			recommendations,
			testSuiteResults,
			conversationResults,
			// An empty run (nothing executed) does not count as passed.
			overallPassed: failedTests === 0 && failedConversations === 0 && totalItems > 0,
			totalCost,
			summary: {
				totalTests,
				passedTests,
				failedTests,
				totalConversations,
				passedConversations,
				failedConversations,
				overallPassRate
			}
		};
	}
	/**
	 * Run one scenario, giving up after `timeoutMs`.
	 *
	 * The deadline timer is always cleared once the race is decided, so a
	 * conversation that finishes early does not leave a pending timer behind
	 * (which would otherwise keep the process alive until it fires).
	 *
	 * @param {object} conversationRunner Runner exposing `runScenario(scenario)`.
	 * @param {object} scenario Scenario to execute.
	 * @param {number} timeoutMs Deadline in milliseconds.
	 * @returns {Promise<object>} The scenario result, or a synthetic failed
	 *   result when the deadline passes or the runner rejects.
	 */
	async runScenarioWithDeadline(conversationRunner, scenario, timeoutMs) {
		let deadlineTimer;
		const deadline = new Promise((_, reject) => {
			deadlineTimer = setTimeout(() => reject(new Error(`Conversation timed out after ${timeoutMs}ms`)), timeoutMs);
		});
		try {
			return await Promise.race([conversationRunner.runScenario(scenario), deadline]);
		} catch (error) {
			return {
				scenario,
				passed: false,
				turns: [],
				evaluation: {
					overall: "fail",
					scores: {
						accuracy: 1,
						toolUsage: 1,
						tone: 1,
						resolution: 1
					},
					feedback: `Timeout or error: ${error instanceof Error ? error.message : String(error)}`
				},
				duration: timeoutMs,
				cost: 0
			};
		} finally {
			clearTimeout(deadlineTimer);
		}
	}
	/**
	 * Render a report as human-readable text.
	 *
	 * @param {object} report Report produced by `run()`.
	 * @returns {string} Multi-line text; sections without data are omitted.
	 */
	static formatReport(report) {
		const lines = [];
		lines.push("=== Simulation Report ===");
		lines.push(`Date: ${report.timestamp.toISOString()}`);
		lines.push(`Duration: ${(report.duration / 1e3).toFixed(1)}s`);
		lines.push(`Cost: $${report.totalCost.toFixed(4)}`);
		lines.push("");
		if (report.testSuiteResults.length) {
			lines.push("--- Test Suites ---");
			for (const suite of report.testSuiteResults) {
				lines.push(` ${suite.passed}/${suite.total} passed (avg score: ${suite.averageScore.toFixed(2)})`);
				for (const result of suite.results) {
					const status = result.evaluation.passed ? "PASS" : "FAIL";
					lines.push(` [${status}] ${result.testCase.id}: ${result.testCase.question}`);
				}
			}
			lines.push("");
		}
		if (report.scenarioBreakdown.length) {
			lines.push("--- Scenario Breakdown ---");
			for (const s of report.scenarioBreakdown) {
				const pct = (s.passRate * 100).toFixed(0);
				lines.push(` ${s.scenario}: ${s.runs} run(s), ${pct}% pass rate, avg score ${s.avgScore.toFixed(2)}`);
			}
			lines.push("");
		}
		const toolEntries = Object.entries(report.toolUsageStats);
		if (toolEntries.length) {
			lines.push("--- Tool Usage ---");
			// Most frequently called tools first.
			for (const [tool, count] of toolEntries.sort((a, b) => b[1] - a[1])) lines.push(` ${tool}: ${count} call(s)`);
			lines.push("");
		}
		const { averageScores } = report;
		if (report.totalConversations > 0) {
			lines.push("--- Average Scores ---");
			lines.push(` Accuracy: ${averageScores.accuracy.toFixed(2)}`);
			lines.push(` Tool Usage: ${averageScores.toolUsage.toFixed(2)}`);
			lines.push(` Tone: ${averageScores.tone.toFixed(2)}`);
			lines.push(` Resolution: ${averageScores.resolution.toFixed(2)}`);
			lines.push("");
		}
		if (report.commonFailurePatterns.length) {
			lines.push("--- Common Failure Patterns ---");
			for (const pattern of report.commonFailurePatterns) lines.push(` - ${pattern}`);
			lines.push("");
		}
		if (report.recommendations.length) {
			lines.push("--- Recommendations ---");
			for (const rec of report.recommendations) lines.push(` - ${rec}`);
			lines.push("");
		}
		const { summary } = report;
		lines.push("--- Summary ---");
		if (summary.totalTests > 0) lines.push(`Tests: ${summary.passedTests}/${summary.totalTests} passed`);
		lines.push(`Conversations: ${summary.passedConversations}/${summary.totalConversations} passed`);
		lines.push(`Overall pass rate: ${(summary.overallPassRate * 100).toFixed(1)}%`);
		lines.push(`Result: ${report.overallPassed ? "PASSED" : "FAILED"}`);
		return lines.join("\n");
	}
	/**
	 * Resolve `config.conversationScenarios` to a scenario list: nothing
	 * configured yields `[]`, the string "builtin" yields the bundled
	 * e-commerce scenarios, anything else is returned as given.
	 *
	 * @returns {object[]} Scenarios to run.
	 */
	resolveScenarios() {
		if (!this.config.conversationScenarios) return [];
		if (this.config.conversationScenarios === "builtin") return ECOMMERCE_SCENARIOS;
		return this.config.conversationScenarios;
	}
	/**
	 * Build the ordered list of conversations to run, cycling through the
	 * scenarios until `config.totalConversations` entries exist (defaults to
	 * one run per scenario).
	 *
	 * @param {object[]} scenarios Non-empty scenario list.
	 * @returns {object[]} Schedule of scenarios, repeats included.
	 */
	buildSchedule(scenarios) {
		const total = this.config.totalConversations ?? scenarios.length;
		const schedule = [];
		for (let i = 0; i < total; i++) schedule.push(scenarios[i % scenarios.length]);
		return schedule;
	}
	/**
	 * Average each score dimension over all conversation results.
	 *
	 * @param {object[]} results Conversation results.
	 * @returns {{accuracy: number, toolUsage: number, tone: number, resolution: number}}
	 *   Per-dimension means; all zeros for an empty input.
	 */
	computeAverageScores(results) {
		if (results.length === 0) return {
			accuracy: 0,
			toolUsage: 0,
			tone: 0,
			resolution: 0
		};
		const totals = results.reduce((acc, r) => ({
			accuracy: acc.accuracy + r.evaluation.scores.accuracy,
			toolUsage: acc.toolUsage + r.evaluation.scores.toolUsage,
			tone: acc.tone + r.evaluation.scores.tone,
			resolution: acc.resolution + r.evaluation.scores.resolution
		}), {
			accuracy: 0,
			toolUsage: 0,
			tone: 0,
			resolution: 0
		});
		const n = results.length;
		return {
			accuracy: totals.accuracy / n,
			toolUsage: totals.toolUsage / n,
			tone: totals.tone / n,
			resolution: totals.resolution / n
		};
	}
	/**
	 * Group results by scenario name and compute run count, pass rate and
	 * mean score (average of the four dimensions) for each group.
	 *
	 * @param {object[]} results Conversation results.
	 * @returns {{scenario: string, runs: number, passRate: number, avgScore: number}[]}
	 *   One entry per scenario name, in first-seen order.
	 */
	computeScenarioBreakdown(results) {
		const byScenario = new Map();
		for (const r of results) {
			const name = r.scenario.name;
			if (!byScenario.has(name)) byScenario.set(name, []);
			byScenario.get(name).push(r);
		}
		return Array.from(byScenario.entries()).map(([scenario, runs]) => {
			const passed = runs.filter((r) => r.passed).length;
			const scores = runs.map((r) => {
				const s = r.evaluation.scores;
				return (s.accuracy + s.toolUsage + s.tone + s.resolution) / 4;
			});
			const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
			return {
				scenario,
				runs: runs.length,
				passRate: passed / runs.length,
				avgScore
			};
		});
	}
	/**
	 * Count tool calls by tool name across every turn of every result.
	 *
	 * @param {object[]} results Conversation results.
	 * @returns {Object<string, number>} Call count per tool name.
	 */
	computeToolUsageStats(results) {
		const stats = {};
		for (const r of results) for (const turn of r.turns) if (turn.toolCalls) for (const tc of turn.toolCalls) stats[tc.name] = (stats[tc.name] || 0) + 1;
		return stats;
	}
	/**
	 * Ask the LLM to summarise failure patterns and recommendations.
	 *
	 * At most the first 10 failed results are included in the prompt. If the
	 * LLM call fails or its answer is not valid JSON, the heuristic analysis
	 * is used instead (deliberate best-effort fallback).
	 *
	 * @param {object[]} failedResults Failed conversation results.
	 * @returns {Promise<{patterns: string[], recommendations: string[]}>}
	 */
	async analyzeFailuresWithLLM(failedResults) {
		if (!this.llm) return {
			patterns: [],
			recommendations: []
		};
		const prompt = `Analyze these failed customer support conversation tests and identify patterns.

${failedResults.slice(0, 10).map((r) => {
			const turns = r.turns.map((t) => `[${t.role}]: ${t.message}`).join("\n");
			return `Scenario: ${r.scenario.name}\nFeedback: ${r.evaluation.feedback}\nConversation:\n${turns}`;
		}).join("\n\n---\n\n")}

Respond with ONLY valid JSON (no markdown, no code fences):
{
"patterns": ["pattern 1", "pattern 2"],
"recommendations": ["recommendation 1", "recommendation 2"]
}`;
		try {
			// Strip markdown code fences in case the model adds them anyway.
			const cleaned = (await this.llm.complete([{
				role: "user",
				content: prompt
			}], {
				temperature: 0,
				maxTokens: 1e3
			})).text.replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
			const parsed = JSON.parse(cleaned);
			return {
				patterns: Array.isArray(parsed.patterns) ? parsed.patterns.map(String) : [],
				recommendations: Array.isArray(parsed.recommendations) ? parsed.recommendations.map(String) : []
			};
		} catch {
			return {
				patterns: this.heuristicFailurePatterns(failedResults),
				recommendations: this.heuristicRecommendations(failedResults)
			};
		}
	}
	/**
	 * Derive failure patterns without an LLM: conversations with no tool
	 * calls, with a resolution score <= 2, or with a tone score <= 2.
	 *
	 * @param {object[]} failedResults Failed conversation results.
	 * @returns {string[]} At least one pattern description.
	 */
	heuristicFailurePatterns(failedResults) {
		const patterns = [];
		const noToolCalls = failedResults.filter((r) => r.turns.every((t) => !t.toolCalls?.length));
		if (noToolCalls.length > 0) patterns.push(`${noToolCalls.length} conversation(s) failed with no tool calls`);
		const lowResolution = failedResults.filter((r) => r.evaluation.scores.resolution <= 2);
		if (lowResolution.length > 0) patterns.push(`${lowResolution.length} conversation(s) had low resolution scores`);
		const lowTone = failedResults.filter((r) => r.evaluation.scores.tone <= 2);
		if (lowTone.length > 0) patterns.push(`${lowTone.length} conversation(s) had low tone scores`);
		if (patterns.length === 0) patterns.push(`${failedResults.length} conversation(s) failed evaluation criteria`);
		return patterns;
	}
	/**
	 * Derive recommendations without an LLM, based on missing tool calls.
	 *
	 * @param {object[]} failedResults Failed conversation results.
	 * @returns {string[]} At least one recommendation.
	 */
	heuristicRecommendations(failedResults) {
		const recs = [];
		if (failedResults.filter((r) => r.turns.every((t) => !t.toolCalls?.length)).length > 0) recs.push("Ensure agent is configured to use available tools for customer queries");
		const expectedButMissing = new Set();
		for (const r of failedResults) for (const tool of r.scenario.expectedTools || []) if (!r.turns.some((t) => t.toolCalls?.some((tc) => tc.name === tool))) expectedButMissing.add(tool);
		if (expectedButMissing.size > 0) recs.push(`Tools expected but not called: ${Array.from(expectedButMissing).join(", ")}`);
		if (recs.length === 0) recs.push("Review failed scenarios and adjust agent rules or prompts");
		return recs;
	}
};
|
|
1335
|
+
|
|
1336
|
+
//#endregion
|
|
1337
|
+
//#region src/MockShopifySkill.ts
|
|
1338
|
+
var MockShopifySkill = class {
|
|
1339
|
+
name = "shopify";
|
|
1340
|
+
ready = false;
|
|
1341
|
+
mockOrders = /* @__PURE__ */ new Map();
|
|
1342
|
+
mockProducts = [];
|
|
1343
|
+
mockDiscounts = [];
|
|
1344
|
+
nextPriceRuleId = 1e3;
|
|
1345
|
+
constructor() {
|
|
1346
|
+
this.seedMockData();
|
|
1347
|
+
}
|
|
1348
|
+
async initialize() {
|
|
1349
|
+
this.ready = true;
|
|
1350
|
+
console.log("ā
Mock Shopify initialized");
|
|
1351
|
+
}
|
|
1352
|
+
/** @deprecated Use initialize() instead. */
|
|
1353
|
+
async authenticate() {
|
|
1354
|
+
return this.initialize();
|
|
1355
|
+
}
|
|
1356
|
+
isReady() {
|
|
1357
|
+
return this.ready;
|
|
1358
|
+
}
|
|
1359
|
+
/** @deprecated Use isReady() instead. */
|
|
1360
|
+
isAuthenticated() {
|
|
1361
|
+
return this.ready;
|
|
1362
|
+
}
|
|
1363
|
+
/**
|
|
1364
|
+
* Reset all mock data to initial state (for testing)
|
|
1365
|
+
*/
|
|
1366
|
+
reset() {
|
|
1367
|
+
this.mockOrders.clear();
|
|
1368
|
+
this.mockProducts = [];
|
|
1369
|
+
this.mockDiscounts = [];
|
|
1370
|
+
this.nextPriceRuleId = 1e3;
|
|
1371
|
+
this.seedMockData();
|
|
1372
|
+
}
|
|
1373
|
+
/**
|
|
1374
|
+
* Seed custom test data (for testing)
|
|
1375
|
+
*/
|
|
1376
|
+
seedData(config) {
|
|
1377
|
+
if (config.orders) for (const order of config.orders) {
|
|
1378
|
+
const fullOrder = {
|
|
1379
|
+
id: order.id || String(Date.now()),
|
|
1380
|
+
name: order.name || `#${order.id || "1001"}`,
|
|
1381
|
+
status: order.status || "unfulfilled",
|
|
1382
|
+
financialStatus: order.financialStatus || "paid",
|
|
1383
|
+
createdAt: order.createdAt || (/* @__PURE__ */ new Date()).toISOString(),
|
|
1384
|
+
expectedDelivery: order.expectedDelivery || /* @__PURE__ */ new Date(),
|
|
1385
|
+
actualDelivery: order.actualDelivery,
|
|
1386
|
+
tracking: order.tracking,
|
|
1387
|
+
trackingUrl: order.trackingUrl,
|
|
1388
|
+
items: order.items || [],
|
|
1389
|
+
total: order.total || "0.00"
|
|
1390
|
+
};
|
|
1391
|
+
this.mockOrders.set(fullOrder.id, fullOrder);
|
|
1392
|
+
}
|
|
1393
|
+
if (config.products) for (const product of config.products) {
|
|
1394
|
+
const fullProduct = {
|
|
1395
|
+
id: product.id || Date.now(),
|
|
1396
|
+
title: product.title || "Product",
|
|
1397
|
+
vendor: product.vendor || "Mock Vendor",
|
|
1398
|
+
type: product.type || "General",
|
|
1399
|
+
price: product.price || "0.00",
|
|
1400
|
+
available: product.available !== void 0 ? product.available : true
|
|
1401
|
+
};
|
|
1402
|
+
this.mockProducts.push(fullProduct);
|
|
1403
|
+
}
|
|
1404
|
+
if (config.discounts) for (const discount of config.discounts) {
|
|
1405
|
+
const fullDiscount = {
|
|
1406
|
+
code: discount.code || "DISCOUNT",
|
|
1407
|
+
percent: discount.percent || 10,
|
|
1408
|
+
validDays: discount.validDays || 30,
|
|
1409
|
+
startsAt: discount.startsAt || (/* @__PURE__ */ new Date()).toISOString(),
|
|
1410
|
+
expiresAt: discount.expiresAt || new Date(Date.now() + 720 * 60 * 60 * 1e3).toISOString(),
|
|
1411
|
+
priceRuleId: discount.priceRuleId || this.nextPriceRuleId++,
|
|
1412
|
+
createdAt: discount.createdAt || /* @__PURE__ */ new Date()
|
|
1413
|
+
};
|
|
1414
|
+
this.mockDiscounts.push(fullDiscount);
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1417
|
+
seedMockData() {
|
|
1418
|
+
const twoDaysAgo = /* @__PURE__ */ new Date();
|
|
1419
|
+
twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
|
|
1420
|
+
this.mockOrders.set("12345", {
|
|
1421
|
+
id: "12345",
|
|
1422
|
+
name: "#1001",
|
|
1423
|
+
status: "in_transit",
|
|
1424
|
+
financialStatus: "paid",
|
|
1425
|
+
createdAt: (/* @__PURE__ */ new Date(Date.now() - 7200 * 60 * 1e3)).toISOString(),
|
|
1426
|
+
expectedDelivery: twoDaysAgo,
|
|
1427
|
+
tracking: "TRACK123456789",
|
|
1428
|
+
trackingUrl: "https://track.example.com/TRACK123456789",
|
|
1429
|
+
items: [{
|
|
1430
|
+
name: "Premium Headphones",
|
|
1431
|
+
quantity: 1,
|
|
1432
|
+
price: "299.99"
|
|
1433
|
+
}],
|
|
1434
|
+
total: "299.99"
|
|
1435
|
+
});
|
|
1436
|
+
this.mockOrders.set("67890", {
|
|
1437
|
+
id: "67890",
|
|
1438
|
+
name: "#1002",
|
|
1439
|
+
status: "delivered",
|
|
1440
|
+
financialStatus: "paid",
|
|
1441
|
+
createdAt: (/* @__PURE__ */ new Date(Date.now() - 14400 * 60 * 1e3)).toISOString(),
|
|
1442
|
+
expectedDelivery: /* @__PURE__ */ new Date(Date.now() - 7200 * 60 * 1e3),
|
|
1443
|
+
actualDelivery: /* @__PURE__ */ new Date(Date.now() - 4320 * 60 * 1e3),
|
|
1444
|
+
tracking: "TRACK987654321",
|
|
1445
|
+
trackingUrl: "https://track.example.com/TRACK987654321",
|
|
1446
|
+
items: [{
|
|
1447
|
+
name: "Wireless Mouse",
|
|
1448
|
+
quantity: 2,
|
|
1449
|
+
price: "49.99"
|
|
1450
|
+
}],
|
|
1451
|
+
total: "99.98"
|
|
1452
|
+
});
|
|
1453
|
+
this.mockProducts = [
|
|
1454
|
+
{
|
|
1455
|
+
id: 1001,
|
|
1456
|
+
title: "Premium Headphones",
|
|
1457
|
+
vendor: "AudioTech",
|
|
1458
|
+
type: "Electronics",
|
|
1459
|
+
price: "299.99",
|
|
1460
|
+
available: true
|
|
1461
|
+
},
|
|
1462
|
+
{
|
|
1463
|
+
id: 1002,
|
|
1464
|
+
title: "Wireless Mouse",
|
|
1465
|
+
vendor: "TechGear",
|
|
1466
|
+
type: "Electronics",
|
|
1467
|
+
price: "49.99",
|
|
1468
|
+
available: true
|
|
1469
|
+
},
|
|
1470
|
+
{
|
|
1471
|
+
id: 1003,
|
|
1472
|
+
title: "Mechanical Keyboard",
|
|
1473
|
+
vendor: "KeyMaster",
|
|
1474
|
+
type: "Electronics",
|
|
1475
|
+
price: "149.99",
|
|
1476
|
+
available: true
|
|
1477
|
+
},
|
|
1478
|
+
{
|
|
1479
|
+
id: 1004,
|
|
1480
|
+
title: "USB-C Cable",
|
|
1481
|
+
vendor: "CableCo",
|
|
1482
|
+
type: "Accessories",
|
|
1483
|
+
price: "19.99",
|
|
1484
|
+
available: false
|
|
1485
|
+
}
|
|
1486
|
+
];
|
|
1487
|
+
}
|
|
1488
|
+
/**
 * Tool registry of the mock: `get_order`, `create_discount` and
 * `search_products`. Each entry carries its `name`, a `description`, a
 * `parameters` schema and an async `execute(params)` that operates only on
 * the in-memory mock data held by this instance.
 */
tools = {
  get_order: {
    name: "get_order",
    description: "Get order details by order ID or order name (e.g., #1001)",
    parameters: { orderId: {
      type: "string",
      required: true
    } },
    /**
     * Look an order up by id or by name and report its delivery delay.
     *
     * @param params - `{ orderId }`; a leading "#" is ignored and numeric
     *   values are accepted
     * @returns `{ found: false, error }` when nothing matches, otherwise the
     *   order fields plus `isDelayed` / `delayDays`, measured from the
     *   expected delivery date to the current time
     */
    execute: async (params) => {
      // Coerce first: callers may hand over a number instead of a string.
      const orderIdentifier = String(params.orderId).replace("#", "");
      let order = this.mockOrders.get(orderIdentifier);
      if (!order) {
        // The Map is keyed by id, so a name such as "#1001" (or a seeded
        // non-string id) needs a scan over the stored orders.
        for (const candidate of this.mockOrders.values()) {
          const sameId = String(candidate.id) === orderIdentifier;
          const sameName = String(candidate.name).replace("#", "") === orderIdentifier;
          if (sameId || sameName) {
            order = candidate;
            break;
          }
        }
      }
      if (!order) return {
        found: false,
        error: `Order ${params.orderId} not found`
      };
      // Seeded data may carry the date as a string; normalise before use.
      const expectedDelivery = order.expectedDelivery instanceof Date ? order.expectedDelivery : new Date(order.expectedDelivery);
      const delayMs = Date.now() - expectedDelivery.getTime();
      const wholeDays = Math.floor(delayMs / (1e3 * 60 * 60 * 24));
      // An unparseable date yields NaN; report that as "no delay".
      const delayDays = Number.isFinite(wholeDays) ? wholeDays : 0;
      return {
        found: true,
        id: order.id,
        name: order.name,
        status: order.status,
        financialStatus: order.financialStatus,
        createdAt: order.createdAt,
        total: order.total,
        items: order.items,
        tracking: order.tracking || null,
        trackingUrl: order.trackingUrl || null,
        isDelayed: delayDays > 0,
        delayDays: Math.max(0, delayDays)
      };
    }
  },
  create_discount: {
    name: "create_discount",
    description: "Create a percentage discount code",
    parameters: {
      percent: {
        type: "number",
        required: true
      },
      validDays: {
        type: "number",
        required: true
      }
    },
    /**
     * Record a new discount in `this.mockDiscounts` and log it.
     *
     * @param params - `{ percent, validDays }`
     * @returns the created discount without its `createdAt` field; the code
     *   is `SORRY<percent>` and the price rule id comes from the instance
     *   counter, which is advanced by one
     */
    execute: async (params) => {
      const code = `SORRY${params.percent}`;
      const startsAt = (/* @__PURE__ */ new Date()).toISOString();
      const expiresAt = new Date(Date.now() + params.validDays * 24 * 60 * 60 * 1e3).toISOString();
      const priceRuleId = this.nextPriceRuleId++;
      const discount = {
        code,
        percent: params.percent,
        validDays: params.validDays,
        startsAt,
        expiresAt,
        priceRuleId,
        createdAt: /* @__PURE__ */ new Date()
      };
      this.mockDiscounts.push(discount);
      console.log(`\n💰 Created discount code: ${code}`);
      console.log(` ${params.percent}% off, valid for ${params.validDays} days\n`);
      return {
        code,
        percent: params.percent,
        validDays: params.validDays,
        startsAt,
        expiresAt,
        priceRuleId
      };
    }
  },
  search_products: {
    name: "search_products",
    description: "Search for products in the store",
    parameters: {
      query: {
        type: "string",
        required: true
      },
      limit: {
        type: "number",
        required: false
      }
    },
    /**
     * Case-insensitive substring search over product title, vendor and type.
     *
     * @param params - `{ query, limit }`; a missing or falsy `limit` means 10
     * @returns `{ found, products }` where `found` is the number of products
     *   returned after the limit is applied
     */
    execute: async (params) => {
      const limit = params.limit || 10;
      const queryLower = params.query.toLowerCase();
      const matched = this.mockProducts.filter((p) => p.title.toLowerCase().includes(queryLower) || p.vendor.toLowerCase().includes(queryLower) || p.type.toLowerCase().includes(queryLower)).slice(0, limit);
      return {
        found: matched.length,
        products: matched.map((p) => ({
          id: p.id,
          title: p.title,
          vendor: p.vendor,
          type: p.type,
          price: p.price,
          available: p.available
        }))
      };
    }
  }
};
|
|
1592
|
+
/**
|
|
1593
|
+
* Get all created discounts (for testing)
|
|
1594
|
+
*/
|
|
1595
|
+
getDiscounts() {
|
|
1596
|
+
return [...this.mockDiscounts];
|
|
1597
|
+
}
|
|
1598
|
+
/**
|
|
1599
|
+
* Get all orders (for testing)
|
|
1600
|
+
*/
|
|
1601
|
+
getOrders() {
|
|
1602
|
+
return Array.from(this.mockOrders.values());
|
|
1603
|
+
}
|
|
1604
|
+
};
|
|
1605
|
+
|
|
1606
|
+
//#endregion
|
|
1607
|
+
export { CSVLoader, ConversationEvaluator, ConversationRunner, CustomerSimulator, ECOMMERCE_SCENARIOS, MockShopifySkill, SimulationRunner, SkillTestHarness, TestCaseEvaluator, TestSuiteRunner, formatTimestamp };
|
|
1608
|
+
//# sourceMappingURL=index.js.map
|