@operor/testing 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,1608 @@
1
+ import { parse } from "csv-parse/sync";
2
+ import { readFile } from "node:fs/promises";
3
+
4
+ //#region src/CSVLoader.ts
5
/**
 * Loads test cases from CSV or JSON text and normalizes them into plain
 * `{ id, question, expectedAnswer?, expectedTools?, persona?, tags?, metadata? }`
 * objects. `id` and `question` are required; every other field is optional.
 */
var CSVLoader = class CSVLoader {
  /**
   * Read `path` as UTF-8 and parse it. Paths ending in ".json" (any letter
   * case) are parsed as JSON, everything else as CSV.
   *
   * @param {string} path - file to read
   * @returns {Promise<object[]>} normalized test cases
   */
  static async fromFile(path) {
    const content = await readFile(path, "utf-8");
    if (path.toLowerCase().endsWith(".json")) return CSVLoader.fromJSON(content);
    return CSVLoader.fromCSVString(content);
  }

  /**
   * Parse CSV text with a header row. Recognized columns: id, question,
   * expected_answer, expected_tools (comma-separated), persona, tags
   * (comma-separated).
   *
   * @param {string} csv - CSV document, optionally starting with a BOM
   * @returns {object[]} normalized test cases
   * @throws {Error} when a row lacks `id` or `question` (rows are numbered from 1)
   */
  static fromCSVString(csv) {
    // Split a comma-separated cell into trimmed, non-empty entries.
    const splitList = (cell) => cell.split(",").map((part) => part.trim()).filter(Boolean);
    const rows = parse(csv.replace(/^\uFEFF/, ""), {
      columns: true,
      skip_empty_lines: true,
      trim: true,
      relax_column_count: true
    });
    return rows.map((row, i) => {
      const id = row.id?.trim();
      const question = row.question?.trim();
      if (!id || !question) throw new Error(`Row ${i + 1}: missing required field(s) — id and question are required`);
      const testCase = { id, question };
      const expectedAnswer = row.expected_answer?.trim();
      if (expectedAnswer) testCase.expectedAnswer = expectedAnswer;
      if (row.expected_tools?.trim()) testCase.expectedTools = splitList(row.expected_tools);
      const persona = row.persona?.trim();
      if (persona) testCase.persona = persona;
      if (row.tags?.trim()) testCase.tags = splitList(row.tags);
      return testCase;
    });
  }

  /**
   * Parse JSON text that is either an array of test cases or an object with a
   * `testCases` or `tests` array.
   *
   * @param {string} json - JSON document, optionally starting with a BOM
   * @returns {object[]} normalized test cases
   * @throws {SyntaxError} when the text is not valid JSON
   * @throws {Error} when no test-case array is found, or an item lacks
   *   `id` or `question` (items are numbered from 0)
   */
  static fromJSON(json) {
    // Strip a leading BOM (as the CSV path does); JSON.parse rejects it.
    const data = JSON.parse(json.replace(/^\uFEFF/, ""));
    // `data` may legitimately be null or a primitive, hence the optional chaining.
    const arr = Array.isArray(data) ? data : data?.testCases ?? data?.tests;
    if (!Array.isArray(arr)) throw new Error("JSON must be an array or contain a testCases/tests array");
    return arr.map((item, i) => {
      // `item?.` turns null/primitive entries into the regular validation error.
      if (!item?.id || !item?.question) throw new Error(`Item ${i}: missing required field(s) — id and question are required`);
      const testCase = {
        id: item.id,
        question: item.question
      };
      for (const key of ["expectedAnswer", "expectedTools", "persona", "tags", "metadata"]) {
        if (item[key]) testCase[key] = item[key];
      }
      return testCase;
    });
  }
};
51
+
52
+ //#endregion
53
+ //#region src/TestCaseEvaluator.ts
54
/**
 * Scores one agent response against a test case.
 *
 * Strategies: "exact", "contains" and "similarity" are pure string checks.
 * Any other strategy (including "semantic" or none) uses the LLM when one was
 * supplied and falls back to string similarity otherwise.
 *
 * Every result has the shape `{ passed, score, method, reasoning, toolsCorrect }`
 * where `score` is in [0, 1].
 */
var TestCaseEvaluator = class {
  /**
   * @param {object} [llm] - optional provider exposing
   *   `complete(messages, options)` that resolves to `{ text }`
   */
  constructor(llm) {
    this.llm = llm;
  }

  /**
   * Evaluate a response with the requested strategy.
   *
   * @param {object} testCase - needs `question`; may have `expectedAnswer`, `expectedTools`
   * @param {string} agentResponse - text produced by the agent
   * @param {Array<{name: string}>} toolsCalled - tools the agent invoked
   * @param {string} [strategy] - "exact" | "contains" | "similarity" | "semantic"
   * @returns {Promise<object>} evaluation result
   */
  async evaluate(testCase, agentResponse, toolsCalled, strategy) {
    const toolsCorrect = this.validateTools(testCase.expectedTools, toolsCalled);
    if (strategy === "exact") return this.evaluateByExact(testCase, agentResponse, toolsCorrect);
    if (strategy === "contains") return this.evaluateByContains(testCase, agentResponse, toolsCorrect);
    // Similarity is both an explicit strategy and the fallback when no LLM exists.
    if (strategy === "similarity" || !this.llm) return this.evaluateBySimilarity(testCase, agentResponse, toolsCorrect);
    if (testCase.expectedAnswer) return await this.evaluateByLLMComparison(testCase, agentResponse, toolsCorrect);
    return await this.evaluateByLLMJudge(testCase, agentResponse, toolsCorrect);
  }

  /**
   * True when every expected tool name appears among the called tools.
   * No expectations means success; no recorded calls means nothing was called.
   */
  validateTools(expectedTools, toolsCalled) {
    if (!expectedTools || expectedTools.length === 0) return true;
    const calledNames = new Set((toolsCalled ?? []).map((t) => t.name));
    return expectedTools.every((tool) => calledNames.has(tool));
  }

  /** Result used by the string strategies when the test case has no expected answer. */
  toolsOnlyResult(method, toolsCorrect) {
    return {
      passed: toolsCorrect,
      score: toolsCorrect ? 1 : 0,
      method,
      reasoning: "No expected answer provided, evaluated tools only",
      toolsCorrect
    };
  }

  /** Case-insensitive, whitespace-trimmed equality with the expected answer. */
  evaluateByExact(testCase, agentResponse, toolsCorrect) {
    if (!testCase.expectedAnswer) return this.toolsOnlyResult("exact", toolsCorrect);
    const matches = agentResponse.trim().toLowerCase() === testCase.expectedAnswer.trim().toLowerCase();
    return {
      passed: matches && toolsCorrect,
      score: matches ? 1 : 0,
      method: "exact",
      reasoning: matches ? "Exact match" : "Response does not exactly match expected answer",
      toolsCorrect
    };
  }

  /** Case-insensitive substring check; Unicode dash variants compare equal to "-". */
  evaluateByContains(testCase, agentResponse, toolsCorrect) {
    if (!testCase.expectedAnswer) return this.toolsOnlyResult("contains", toolsCorrect);
    // U+2010..U+2015 (hyphen through horizontal bar, including en/em dash) and U+2212 (minus).
    const normalizeDashes = (s) => s.replace(/[\u2010-\u2015\u2212]/g, "-");
    const contains = normalizeDashes(agentResponse.toLowerCase()).includes(normalizeDashes(testCase.expectedAnswer.toLowerCase()));
    return {
      passed: contains && toolsCorrect,
      score: contains ? 1 : 0,
      method: "contains",
      reasoning: contains ? `Response contains expected text: "${testCase.expectedAnswer}"` : `Response does not contain expected text: "${testCase.expectedAnswer}"`,
      toolsCorrect
    };
  }

  /** Normalized Levenshtein similarity; passes strictly above 0.7. */
  evaluateBySimilarity(testCase, agentResponse, toolsCorrect) {
    if (!testCase.expectedAnswer) return this.toolsOnlyResult("similarity", toolsCorrect);
    const similarity = this.normalizedLevenshtein(testCase.expectedAnswer.toLowerCase(), agentResponse.toLowerCase());
    return {
      passed: similarity > .7 && toolsCorrect,
      score: similarity,
      method: "similarity",
      reasoning: `String similarity: ${(similarity * 100).toFixed(1)}% (threshold: 70%)`,
      toolsCorrect
    };
  }

  /** Ask the LLM to rate the response against the expected answer (1-5). */
  async evaluateByLLMComparison(testCase, agentResponse, toolsCorrect) {
    const prompt = [
      "You are evaluating an AI agent's response to a customer question.",
      "",
      `Question: ${testCase.question}`,
      `Expected Answer: ${testCase.expectedAnswer}`,
      `Actual Response: ${agentResponse}`,
      "",
      "Rate the actual response on a scale of 1-5:",
      "1 = Completely wrong or irrelevant",
      "2 = Partially correct but missing key information",
      "3 = Mostly correct with minor issues",
      "4 = Correct with good quality",
      "5 = Excellent, matches or exceeds expected answer",
      "",
      "Respond with ONLY a JSON object in this format:",
      "{\"score\": <1-5>, \"reasoning\": \"<brief explanation>\"}"
    ].join("\n");
    return this.scoreWithLLM(prompt, "LLM comparison", toolsCorrect);
  }

  /** Ask the LLM to rate the response on its own merits (1-5). */
  async evaluateByLLMJudge(testCase, agentResponse, toolsCorrect) {
    const prompt = [
      "You are evaluating an AI agent's response to a customer question.",
      "",
      `Question: ${testCase.question}`,
      `Agent Response: ${agentResponse}`,
      "",
      "Rate the response quality on a scale of 1-5:",
      "1 = Unhelpful, incorrect, or inappropriate",
      "2 = Partially helpful but incomplete or unclear",
      "3 = Adequate, addresses the question reasonably",
      "4 = Good quality, helpful and accurate",
      "5 = Excellent, comprehensive and professional",
      "",
      "Respond with ONLY a JSON object in this format:",
      "{\"score\": <1-5>, \"reasoning\": \"<brief explanation>\"}"
    ].join("\n");
    return this.scoreWithLLM(prompt, "LLM standalone judge", toolsCorrect);
  }

  /**
   * Send `prompt` to the LLM and turn its 1-5 rating into a result.
   * Passes when the rating is at least 3 (normalized score >= 0.6).
   */
  async scoreWithLLM(prompt, label, toolsCorrect) {
    const result = await this.llm.complete([{
      role: "user",
      content: prompt
    }], {
      temperature: 0,
      maxTokens: 200
    });
    const { score, reasoning } = this.parseLLMScore(String(result?.text ?? ""));
    const normalizedScore = score / 5;
    return {
      passed: normalizedScore >= .6 && toolsCorrect,
      score: normalizedScore,
      method: "llm_judge",
      reasoning: `${label} (${score}/5): ${reasoning}`,
      toolsCorrect
    };
  }

  /**
   * Extract `{ score, reasoning }` from the LLM reply. The score is always a
   * finite number clamped to 1..5; it defaults to 3 when the reply carries no
   * usable score, so a malformed reply can never yield a NaN result.
   */
  parseLLMScore(text) {
    let rawScore;
    let reasoning;
    try {
      const parsed = JSON.parse(text);
      rawScore = parsed.score;
      reasoning = parsed.reasoning;
    } catch {
      // Not (object-shaped) JSON: scrape a single digit following "score".
      const match = text.match(/score["\s:]+(\d)/i);
      if (match) rawScore = match[1];
      reasoning = text.substring(0, 200);
    }
    let numeric = NaN;
    if (typeof rawScore === "number") numeric = rawScore;
    else if (typeof rawScore === "string" && rawScore.trim() !== "") numeric = Number(rawScore);
    const score = Number.isFinite(numeric) ? Math.min(5, Math.max(1, numeric)) : 3;
    return {
      score,
      reasoning: typeof reasoning === "string" && reasoning !== "" ? reasoning : "LLM evaluation completed"
    };
  }

  /**
   * Similarity in [0, 1]: 1 minus the edit distance divided by the longer length.
   * Keeps only two rows of the distance table instead of the full matrix.
   */
  normalizedLevenshtein(s1, s2) {
    const len1 = s1.length;
    const len2 = s2.length;
    if (len1 === 0) return len2 === 0 ? 1 : 0;
    if (len2 === 0) return 0;
    let previous = Array.from({ length: len2 + 1 }, (_, j) => j);
    for (let i = 1; i <= len1; i++) {
      const current = [i];
      for (let j = 1; j <= len2; j++) {
        const cost = s1[i - 1] === s2[j - 1] ? 0 : 1;
        current[j] = Math.min(previous[j] + 1, current[j - 1] + 1, previous[j - 1] + cost);
      }
      previous = current;
    }
    return 1 - previous[len2] / Math.max(len1, len2);
  }
};
228
+
229
+ //#endregion
230
+ //#region src/TestSuiteRunner.ts
231
/**
 * Runs single-turn test cases against an agent through its "mock" provider
 * and aggregates the evaluations into a suite summary.
 */
var TestSuiteRunner = class {
  evaluator;
  agentOS;
  timeout;
  strategy;

  /**
   * @param {object} config
   * @param {object} config.agentOS - event emitter with a `providers` map; emits
   *   "message:processed" and "error"
   * @param {object} [config.llm] - optional LLM provider handed to the evaluator
   * @param {number} [config.timeout] - per-test timeout in ms (default 30000)
   * @param {string} [config.strategy] - evaluation strategy passed to the evaluator
   */
  constructor(config) {
    this.agentOS = config.agentOS;
    this.evaluator = new TestCaseEvaluator(config.llm);
    this.timeout = config.timeout || 3e4;
    this.strategy = config.strategy;
  }

  /**
   * Run the test cases one after another and summarize them.
   *
   * @param {object[]} testCases
   * @returns {Promise<object>} `{ total, passed, failed, averageScore, byTag,
   *   results, totalDuration, totalCost }`; `averageScore` is 0 for an empty suite
   */
  async runSuite(testCases) {
    const results = [];
    const startTime = Date.now();
    for (const testCase of testCases) {
      results.push(await this.runTestCase(testCase));
    }
    const totalDuration = Date.now() - startTime;
    const passed = results.filter((r) => r.evaluation.passed).length;
    const failed = results.length - passed;
    const scoreSum = results.reduce((sum, r) => sum + r.evaluation.score, 0);
    // Avoid 0/0 = NaN when no test cases were supplied.
    const averageScore = results.length > 0 ? scoreSum / results.length : 0;
    const totalCost = results.reduce((sum, r) => sum + r.cost, 0);

    // A Map keeps tag names such as "constructor" or "__proto__" from colliding
    // with Object.prototype members while the buckets are being filled.
    const buckets = new Map();
    for (const result of results) {
      const declared = result.testCase.tags;
      // An empty tag list counts as untagged so the case still shows up in byTag.
      const tags = declared?.length ? declared : ["untagged"];
      for (const tag of tags) {
        let bucket = buckets.get(tag);
        if (!bucket) {
          bucket = { total: 0, passed: 0, avgScore: 0 };
          buckets.set(tag, bucket);
        }
        bucket.total++;
        if (result.evaluation.passed) bucket.passed++;
        bucket.avgScore += result.evaluation.score;
      }
    }
    for (const bucket of buckets.values()) bucket.avgScore /= bucket.total;
    const byTag = Object.fromEntries(buckets);

    return {
      total: results.length,
      passed,
      failed,
      averageScore,
      byTag,
      results,
      totalDuration,
      totalCost
    };
  }

  /**
   * Send one question to the agent, wait for its reply and evaluate it.
   * A timeout, agent error or missing provider does not throw: it becomes an
   * `"Error: ..."` response that is then evaluated like any other.
   *
   * @param {object} testCase
   * @returns {Promise<object>} `{ testCase, agentResponse, toolsCalled,
   *   evaluation, duration, cost }`
   */
  async runTestCase(testCase) {
    const startTime = Date.now();
    let agentResponse = "";
    let toolsCalled = [];
    let cost = 0;
    try {
      const response = await this.sendAndAwaitResponse(testCase);
      agentResponse = response.text;
      toolsCalled = response.toolCalls || [];
      cost = response.cost;
    } catch (error) {
      agentResponse = `Error: ${error instanceof Error ? error.message : String(error)}`;
    }
    const duration = Date.now() - startTime;
    const evaluation = await this.evaluator.evaluate(testCase, agentResponse, toolsCalled, this.strategy);
    return {
      testCase,
      agentResponse,
      toolsCalled,
      evaluation,
      duration,
      cost
    };
  }

  /**
   * Simulate the incoming message and resolve with the next processed reply.
   * Both listeners and the timer are released however the wait ends, so
   * nothing accumulates on `agentOS` across test cases.
   */
  sendAndAwaitResponse(testCase) {
    // Look the provider up first: failing here must not leave listeners or a timer behind.
    const mockProvider = Array.from(this.agentOS.providers.values()).find((p) => p.name === "mock");
    if (!mockProvider) throw new Error("MockProvider not found in Operor");
    const testPhone = testCase.persona || "test-user";
    return new Promise((resolve, reject) => {
      const cleanup = () => {
        clearTimeout(timeoutId);
        this.agentOS.removeListener("message:processed", onProcessed);
        this.agentOS.removeListener("error", onError);
      };
      const onProcessed = (event) => {
        cleanup();
        resolve({
          text: event.response.text,
          toolCalls: event.response.toolCalls || [],
          cost: event.cost || 0
        });
      };
      const onError = (event) => {
        cleanup();
        reject(event.error);
      };
      const timeoutId = setTimeout(() => {
        cleanup();
        reject(new Error(`Test case ${testCase.id} timed out after ${this.timeout}ms`));
      }, this.timeout);
      this.agentOS.once("message:processed", onProcessed);
      this.agentOS.once("error", onError);
      try {
        mockProvider.simulateIncomingMessage(testPhone, testCase.question);
      } catch (error) {
        cleanup();
        reject(error);
      }
    });
  }
};
326
+
327
+ //#endregion
328
+ //#region src/SkillTestHarness.ts
329
/**
 * Safety wrapper for Integration instances during testing.
 * Provides operation limits, dry-run mode, and audit logging.
 *
 * Each wrapped tool is classified as "read", "write" or "destructive".
 * Tools not listed in any of the static sets are treated as "write", so an
 * unknown tool is blocked unless writes are explicitly allowed.
 */
var SkillTestHarness = class SkillTestHarness {
  name;
  inner;
  config;
  tools;
  auditLog = [];
  operationCount = 0;
  static READ_TOOLS = new Set([
    "get_order",
    "search_products",
    "salesforce_get_contact",
    "salesforce_get_cases",
    "stripe_get_customer"
  ]);
  static WRITE_TOOLS = new Set([
    "create_discount",
    "salesforce_update_contact",
    "salesforce_create_case",
    "salesforce_add_note"
  ]);
  static DESTRUCTIVE_TOOLS = new Set(["stripe_create_refund"]);

  /**
   * @param {object} inner - integration exposing `name`, `tools`,
   *   `initialize()` and `isReady()`
   * @param {object} [config]
   * @param {boolean} [config.allowWrites=false] - permit "write" tools
   * @param {boolean} [config.allowDestructive=false] - permit "destructive" tools
   * @param {number} [config.maxOperations=10] - executions allowed before further calls throw
   * @param {number} [config.timeoutMs=30000] - per-operation timeout
   * @param {boolean} [config.dryRun=false] - record the call without executing the tool
   */
  constructor(inner, config = {}) {
    this.inner = inner;
    this.name = inner.name;
    this.config = {
      allowWrites: config.allowWrites ?? false,
      allowDestructive: config.allowDestructive ?? false,
      maxOperations: config.maxOperations ?? 10,
      timeoutMs: config.timeoutMs ?? 3e4,
      dryRun: config.dryRun ?? false
    };
    this.tools = this.wrapTools(inner.tools);
  }

  async initialize() {
    return this.inner.initialize();
  }

  /** @deprecated Use initialize() instead. */
  async authenticate() {
    return this.inner.initialize();
  }

  isReady() {
    return this.inner.isReady();
  }

  /** @deprecated Use isReady() instead. */
  isAuthenticated() {
    return this.inner.isReady();
  }

  /**
   * Wrap every tool's `execute` with limit checks, permission checks, the
   * timeout race and audit logging. All other tool properties are copied as-is.
   *
   * Blocked calls (limit reached or permission denied) throw before they are
   * counted or logged.
   */
  wrapTools(innerTools) {
    const wrapped = {};
    for (const [toolName, tool] of Object.entries(innerTools ?? {})) wrapped[toolName] = {
      ...tool,
      execute: async (params) => {
        const startTime = Date.now();
        const classification = this.classifyTool(toolName);
        if (this.operationCount >= this.config.maxOperations) throw new Error(`SkillTestHarness: Max operations limit reached (${this.config.maxOperations})`);
        if (classification === "write" && !this.config.allowWrites) throw new Error(`SkillTestHarness: Write operation '${toolName}' blocked (allowWrites=false)`);
        if (classification === "destructive" && !this.config.allowDestructive) throw new Error(`SkillTestHarness: Destructive operation '${toolName}' blocked (allowDestructive=false)`);
        this.operationCount++;

        // Append one audit entry for this call; duration is measured at record time.
        const record = (result) => {
          this.auditLog.push({
            name: toolName,
            params,
            result,
            timestamp: startTime,
            duration: Date.now() - startTime,
            classification
          });
        };

        if (this.config.dryRun) {
          const result = {
            dryRun: true,
            wouldExecute: toolName,
            params
          };
          record(result);
          return result;
        }

        let timeoutId;
        const timeoutPromise = new Promise((_, reject) => {
          timeoutId = setTimeout(() => {
            reject(new Error(`SkillTestHarness: Operation '${toolName}' timed out after ${this.config.timeoutMs}ms`));
          }, this.config.timeoutMs);
        });
        try {
          const result = await Promise.race([tool.execute(params), timeoutPromise]);
          record(result);
          return result;
        } catch (error) {
          record({ error: error instanceof Error ? error.message : "Unknown error" });
          throw error;
        } finally {
          // Without this the timer stays armed for timeoutMs after every call,
          // keeping the event loop (and the test process) alive.
          clearTimeout(timeoutId);
        }
      }
    };
    return wrapped;
  }

  /** Classify a tool by name; anything unrecognized is conservatively a "write". */
  classifyTool(toolName) {
    if (SkillTestHarness.READ_TOOLS.has(toolName)) return "read";
    if (SkillTestHarness.DESTRUCTIVE_TOOLS.has(toolName)) return "destructive";
    return "write";
  }

  /**
   * Get the audit log of all operations performed
   * (a shallow copy; entries are shared with the internal log).
   */
  getAuditLog() {
    return [...this.auditLog];
  }

  /**
   * Reset the audit log and operation counter
   */
  resetAuditLog() {
    this.auditLog = [];
    this.operationCount = 0;
  }

  /**
   * Get the current operation count
   */
  getOperationCount() {
    return this.operationCount;
  }
};
466
+
467
+ //#endregion
468
+ //#region src/CustomerSimulator.ts
469
/** Built-in persona descriptions, keyed by persona name. */
const PERSONA_PROMPTS = {
  polite: "You are polite, patient, and use courteous language.",
  frustrated: "You are frustrated and impatient. You express dissatisfaction but remain civil.",
  confused: "You are confused and unsure. You ask clarifying questions and sometimes misunderstand.",
  terse: "You give very short, minimal responses. One sentence max.",
  verbose: "You are detailed and talkative. You provide lots of context and background."
};

/**
 * Build the system prompt that makes the LLM play a customer.
 *
 * @param {string} persona - a key of PERSONA_PROMPTS, or free text that is
 *   embedded as "a customer with a <persona> communication style"
 * @param {object} [context]
 * @param {string} [context.scenario] - appended as a "Scenario:" line when set
 * @param {number} [context.maxTurns] - when truthy, adds the turn-budget sentence
 * @param {number} [context.currentTurn] - turn reported in that sentence (default 1)
 * @returns {string} the system prompt
 */
function buildSystemPrompt(persona, context) {
  // Own-property lookup only: a persona named "constructor" or "toString"
  // must not resolve to an inherited Object.prototype member.
  const builtIn = Object.hasOwn(PERSONA_PROMPTS, persona) ? PERSONA_PROMPTS[persona] : "";
  const personaLine = builtIn || `You are a customer with a ${persona} communication style.`;
  const scenarioLine = context?.scenario ? `\nScenario: ${context.scenario}` : "";
  const turnsLine = context?.maxTurns ? `\nThis conversation has a maximum of ${context.maxTurns} turns. You are on turn ${context.currentTurn ?? 1}.` : "";
  return [
    "You are simulating a customer in a support conversation for testing purposes.",
    `${personaLine}${scenarioLine}${turnsLine}`,
    "",
    "Rules:",
    "- Stay in character throughout the conversation.",
    "- Escalate naturally if your issue isn't being resolved.",
    "- Set shouldContinue to false when your issue is resolved or you have no more questions.",
    "- If the agent asks a question, answer it in character.",
    "",
    "Respond with ONLY valid JSON (no markdown, no code fences):",
    "{\"message\": \"your response as the customer\", \"shouldContinue\": true}"
  ].join("\n");
}
489
/**
 * Convert conversation turns into chat messages for the LLM: customer turns
 * become "user" messages, every other role becomes "assistant".
 *
 * @param {Array<{role: string, message: string}>} history
 * @returns {Array<{role: string, content: string}>}
 */
function formatHistory(history) {
  const messages = [];
  for (const turn of history) {
    const role = turn.role === "customer" ? "user" : "assistant";
    messages.push({ role, content: turn.message });
  }
  return messages;
}
495
/**
 * Parse the simulated customer's reply into `{ message, shouldContinue }`.
 *
 * Markdown code fences are stripped before parsing. Handling by reply shape:
 * - JSON object: `message` is stringified (empty when absent);
 *   `shouldContinue` is coerced to boolean, with the strings "false", "0",
 *   "no" and "" counting as false.
 * - JSON string: used as the message; the conversation continues.
 * - anything else (invalid JSON, null, number, array): the raw text is the
 *   message and the conversation continues.
 *
 * @param {string} text - raw LLM output
 * @returns {{message: string, shouldContinue: boolean}}
 */
function parseResponse(text) {
  const raw = String(text ?? "");
  const fallback = () => ({
    message: raw.trim(),
    shouldContinue: true
  });
  const cleaned = raw.replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    return fallback();
  }
  // A bare JSON string is a usable message, not an empty one.
  if (typeof parsed === "string") return {
    message: parsed.trim(),
    shouldContinue: true
  };
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) return fallback();
  const flag = parsed.shouldContinue;
  // Boolean("false") is true, so string flags are interpreted by content.
  const shouldContinue = typeof flag === "string" ? !["false", "0", "no", ""].includes(flag.trim().toLowerCase()) : Boolean(flag);
  return {
    message: String(parsed.message ?? ""),
    shouldContinue
  };
}
510
/**
 * Produces the customer's side of a simulated conversation, either from a
 * fixed script or by asking an LLM to role-play the persona.
 */
var CustomerSimulator = class {
  llm;

  /**
   * @param {object} [options]
   * @param {object} [options.llmProvider] - needed only for non-scripted mode
   */
  constructor(options) {
    this.llm = options?.llmProvider;
  }

  /**
   * Produce the next customer message.
   *
   * Scripted mode (non-empty `context.scriptedResponses`): returns the entry
   * for `context.currentTurn`, or, when that is nullish, for the number of
   * customer turns already in `history`. Past the end of the script the last
   * entry is repeated with `shouldContinue: false`.
   *
   * Otherwise the LLM is asked, and its reply is parsed by `parseResponse`.
   *
   * @returns {Promise<{message: string, shouldContinue: boolean}>}
   * @throws {Error} in non-scripted mode when no LLM provider was configured
   */
  async generateMessage(persona, history, context) {
    const script = context?.scriptedResponses;
    if (script?.length) {
      let turn = context.currentTurn;
      if (turn === null || turn === undefined) {
        turn = history.filter((entry) => entry.role === "customer").length;
      }
      const lastIndex = script.length - 1;
      if (turn < script.length) {
        return { message: script[turn], shouldContinue: turn < lastIndex };
      }
      return { message: script[lastIndex], shouldContinue: false };
    }

    if (!this.llm) throw new Error("CustomerSimulator requires an LLM provider for non-scripted mode");

    const systemMessage = {
      role: "system",
      content: buildSystemPrompt(persona, context)
    };
    const messages = [systemMessage, ...formatHistory(history)];
    const completion = await this.llm.complete(messages, {
      temperature: .7,
      maxTokens: 500
    });
    return parseResponse(completion.text);
  }
};
539
+
540
+ //#endregion
541
+ //#region src/ConversationEvaluator.ts
542
/**
 * Build the LLM prompt that asks for a rubric-based evaluation of a whole
 * conversation (accuracy, tool usage, tone, resolution; each scored 1-5).
 *
 * @param {object} config
 * @param {string} config.scenario - scenario description
 * @param {string} config.persona - customer persona description
 * @param {Array<{role: string, message: string}>} config.turns - conversation so far
 * @param {Array<{name: string, params: *, result: *}>} config.toolsCalled - recorded tool calls
 * @param {string[]} [config.expectedTools] - shown as "Not specified" when empty
 * @param {string} [config.expectedOutcome] - shown as "Not specified" when empty
 * @returns {string} the prompt text
 */
function buildEvaluationPrompt(config) {
  const transcript = [];
  config.turns.forEach((turn, index) => {
    transcript.push(`Turn ${index + 1} [${turn.role}]: ${turn.message}`);
  });

  let toolsSection = "None";
  if (config.toolsCalled.length) {
    toolsSection = config.toolsCalled.map((call) => `- ${call.name}(${JSON.stringify(call.params)}) → ${JSON.stringify(call.result)}`).join("\n");
  }

  const expectedToolsText = config.expectedTools?.length ? config.expectedTools.join(", ") : "Not specified";
  const expectedOutcomeText = config.expectedOutcome || "Not specified";

  return [
    "You are evaluating a customer support conversation for testing purposes.",
    "",
    `Scenario: ${config.scenario}`,
    `Customer Persona: ${config.persona}`,
    `Expected Tools: ${expectedToolsText}`,
    `Expected Outcome: ${expectedOutcomeText}`,
    "",
    "Conversation:",
    transcript.join("\n"),
    "",
    "Tools Called:",
    toolsSection,
    "",
    "Evaluate the conversation on these dimensions (score 1-5 for each):",
    "",
    "1. Accuracy (1-5): Did the agent provide factually correct information based on tool results?",
    "2. Tool Usage (1-5): Did the agent call the right tools at the right time?",
    "3. Tone (1-5): Was the agent's tone appropriate, professional, and empathetic?",
    "4. Resolution (1-5): Was the customer's issue resolved or properly addressed?",
    "",
    "Overall assessment:",
    "- \"pass\": All criteria met, customer satisfied (scores mostly 4-5)",
    "- \"partial\": Some issues but acceptable (scores mostly 3-4)",
    "- \"fail\": Significant problems (any score 1-2, or multiple scores below 3)",
    "",
    "Respond with ONLY valid JSON (no markdown, no code fences):",
    "{",
    "\"overall\": \"pass\" | \"fail\" | \"partial\",",
    "\"scores\": {",
    "\"accuracy\": <integer 1-5>,",
    "\"toolUsage\": <integer 1-5>,",
    "\"tone\": <integer 1-5>,",
    "\"resolution\": <integer 1-5>",
    "},",
    "\"feedback\": \"<brief explanation of the evaluation>\"",
    "}"
  ].join("\n");
}
583
/**
 * Parse the evaluator LLM's reply into `{ overall, scores, feedback }`.
 *
 * Markdown code fences are stripped first. The result is always well-formed:
 * - `overall` is "pass", "partial" or "fail" (matched case-insensitively;
 *   anything else becomes "fail");
 * - each score is an integer clamped to 1..5; missing or non-numeric values
 *   become 1;
 * - a reply that is not a JSON object yields an all-1 "fail" result with the
 *   feedback "Failed to parse evaluation response".
 *
 * @param {string} text - raw LLM output
 * @returns {{overall: string, scores: object, feedback: string}}
 */
function parseEvaluationResponse(text) {
  const parseFailure = () => ({
    overall: "fail",
    scores: {
      accuracy: 1,
      toolUsage: 1,
      tone: 1,
      resolution: 1
    },
    feedback: "Failed to parse evaluation response"
  });
  const cleaned = String(text ?? "").replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
  let parsed;
  try {
    parsed = JSON.parse(cleaned);
  } catch {
    return parseFailure();
  }
  if (parsed === null || typeof parsed !== "object") return parseFailure();

  // Round, then force into the 1..5 rubric range; non-numeric input scores 1.
  const toScore = (value) => {
    const rounded = Math.round(Number(value ?? 1));
    return Number.isFinite(rounded) ? Math.min(5, Math.max(1, rounded)) : 1;
  };
  const verdict = String(parsed.overall ?? "").trim().toLowerCase();
  const overall = verdict === "pass" || verdict === "partial" || verdict === "fail" ? verdict : "fail";
  return {
    overall,
    scores: {
      accuracy: toScore(parsed.scores?.accuracy),
      toolUsage: toScore(parsed.scores?.toolUsage),
      tone: toScore(parsed.scores?.tone),
      resolution: toScore(parsed.scores?.resolution)
    },
    feedback: String(parsed.feedback ?? "No feedback provided")
  };
}
610
/**
 * Check one success criterion against the conversation turns.
 *
 * Supported `criteria.type` values:
 * - "tool_called": some turn has a tool call named `value`
 * - "response_contains": some agent turn contains `value` (case-insensitive)
 * - "intent_matched": some turn of any role contains `value` (case-insensitive)
 * - "turns_under": `turns.length` is strictly less than `Number(value)`; this
 *   counts every entry in `turns`, customer and agent alike
 * - "custom": `value(turns)` is called and its result coerced to boolean; the
 *   predicate must be synchronous
 *
 * @param {{type: string, value: *}} criteria
 * @param {Array<{role: string, message: string, toolCalls?: Array<{name: string}>}>} turns
 * @returns {{criteria: object, passed: boolean, details: string}}
 */
function evaluateCriteria(criteria, turns) {
  const { type, value } = criteria;
  const outcome = (passed, details) => ({ criteria, passed, details });
  switch (type) {
    case "tool_called": {
      const toolName = String(value);
      const called = turns.some((turn) => Boolean(turn.toolCalls?.some((tc) => tc.name === toolName)));
      return outcome(called, called ? `Tool "${toolName}" was called` : `Tool "${toolName}" was not called`);
    }
    case "response_contains": {
      const searchText = String(value).toLowerCase();
      const found = turns.some((turn) => turn.role === "agent" && turn.message.toLowerCase().includes(searchText));
      return outcome(found, found ? `Agent response contains "${value}"` : `Agent response does not contain "${value}"`);
    }
    case "intent_matched": {
      const intentText = String(value).toLowerCase();
      const matched = turns.some((turn) => turn.message.toLowerCase().includes(intentText));
      return outcome(matched, matched ? `Intent "${value}" was matched` : `Intent "${value}" was not matched`);
    }
    case "turns_under": {
      const maxTurns = Number(value);
      const actualTurns = turns.length;
      const passed = actualTurns < maxTurns;
      return outcome(passed, passed ? `Conversation completed in ${actualTurns} turns (under ${maxTurns})` : `Conversation took ${actualTurns} turns (expected under ${maxTurns})`);
    }
    case "custom": {
      if (typeof value !== "function") return outcome(false, "Custom criteria value must be a function");
      try {
        const verdict = value(turns);
        if (typeof verdict?.then === "function") {
          // A Promise is always truthy, so an async predicate would silently
          // pass. Fail it explicitly and swallow its eventual rejection so it
          // cannot surface as an unhandled rejection.
          Promise.resolve(verdict).catch(() => {});
          return outcome(false, "Custom criteria must return synchronously (received a Promise)");
        }
        const passed = Boolean(verdict);
        return outcome(passed, passed ? "Custom criteria passed" : "Custom criteria failed");
      } catch (error) {
        return outcome(false, `Custom criteria error: ${error}`);
      }
    }
    default:
      return outcome(false, `Unknown criteria type: ${type}`);
  }
}
677
/**
 * Evaluates a finished conversation by combining deterministic success
 * criteria with an optional LLM rubric evaluation.
 */
var ConversationEvaluator = class {
  llm;

  /**
   * @param {object} [options]
   * @param {object} [options.llmProvider] - when present, the LLM produces the
   *   scores and the overall verdict
   */
  constructor(options) {
    this.llm = options?.llmProvider;
  }

  /**
   * Evaluate a conversation.
   *
   * With an LLM: the parsed LLM evaluation is returned with the criteria
   * results attached; an LLM "pass" is downgraded to "partial" when any
   * criterion failed.
   *
   * Without an LLM: the verdict comes from the criteria alone (all passed →
   * "pass", some → "partial", none → "fail") with neutral scores of 3; with no
   * criteria at all the result is "fail" with scores of 1.
   *
   * @param {object} config - needs `turns`; may carry `successCriteria` plus
   *   the fields consumed by the evaluation prompt
   * @returns {Promise<object>} `{ overall, scores, feedback, criteriaResults }`
   */
  async evaluate(config) {
    const criteriaResults = [];
    if (config.successCriteria) {
      for (const criterion of config.successCriteria) {
        criteriaResults.push(evaluateCriteria(criterion, config.turns));
      }
    }

    if (this.llm) {
      const reply = await this.llm.complete([{
        role: "user",
        content: buildEvaluationPrompt(config)
      }], {
        temperature: 0,
        maxTokens: 1e3
      });
      const evaluation = parseEvaluationResponse(reply.text);
      evaluation.criteriaResults = criteriaResults;
      const anyCriterionFailed = criteriaResults.some((entry) => !entry.passed);
      if (anyCriterionFailed && evaluation.overall === "pass") evaluation.overall = "partial";
      return evaluation;
    }

    const uniformScores = (n) => ({
      accuracy: n,
      toolUsage: n,
      tone: n,
      resolution: n
    });

    if (criteriaResults.length === 0) {
      return {
        overall: "fail",
        scores: uniformScores(1),
        feedback: "No criteria specified",
        criteriaResults
      };
    }

    const passedCount = criteriaResults.filter((entry) => entry.passed).length;
    let overall = "fail";
    if (passedCount === criteriaResults.length) overall = "pass";
    else if (passedCount > 0) overall = "partial";
    return {
      overall,
      scores: uniformScores(3),
      feedback: criteriaResults.map((entry) => entry.details).join("; "),
      criteriaResults
    };
  }
};
724
+
725
+ //#endregion
726
+ //#region src/utils.ts
727
/**
 * Format a time as a 24-hour HH:MM:SS timestamp (local time zone).
 *
 * Uses `hourCycle: "h23"` rather than `hour12: false`: for "en-US" the latter
 * can select the 1-24 cycle and print midnight as "24:00:00".
 *
 * @param {Date} [date] - time to format; defaults to the current time
 * @returns {string} e.g. "00:05:09" or "13:07:42"
 */
function formatTimestamp(date = new Date()) {
  return date.toLocaleTimeString("en-US", {
    hourCycle: "h23",
    hour: "2-digit",
    minute: "2-digit",
    second: "2-digit"
  });
}
733
+
734
+ //#endregion
735
+ //#region src/ConversationRunner.ts
736
/**
 * Drives multi-turn conversations between a simulated customer and the agent
 * (through the agent's "mock" provider) and has each conversation evaluated.
 */
var ConversationRunner = class {
  agentOS;
  customerSimulator;
  conversationEvaluator;
  timeout;
  verbose;

  /**
   * @param {object} config
   * @param {object} config.agentOS - event emitter with a `providers` map; emits
   *   "message:processed" and "error"
   * @param {object} config.customerSimulator - exposes `generateMessage(persona, turns, context)`
   * @param {object} config.conversationEvaluator - exposes `evaluate(config)`
   * @param {number} [config.timeout] - per-reply timeout in ms (default 30000)
   * @param {boolean} [config.verbose=false] - log the conversation to the console
   */
  constructor(config) {
    this.agentOS = config.agentOS;
    this.customerSimulator = config.customerSimulator;
    this.conversationEvaluator = config.conversationEvaluator;
    this.timeout = config.timeout || 3e4;
    this.verbose = config.verbose ?? false;
  }

  /**
   * Run one scenario to completion and evaluate it.
   *
   * The first customer message is `scriptedResponses[0]`, or
   * "Hello, I need help" when there is no script. After each agent reply:
   * - scripted scenarios continue while script entries remain;
   * - otherwise the simulator is asked once for the next message, and that
   *   same reply's `shouldContinue` decides whether the message gets sent.
   * The loop also stops at `scenario.maxTurns`.
   *
   * Never throws: any error produces a failed result whose feedback carries
   * the error message.
   *
   * @param {object} scenario
   * @returns {Promise<object>} `{ scenario, passed, turns, evaluation, duration, cost }`
   */
  async runScenario(scenario) {
    const startTime = Date.now();
    const turns = [];
    const toolsCalled = [];
    let totalCost = 0;
    try {
      const mockProvider = this.getMockProvider();
      const customerId = `test-customer-${scenario.id}`;
      const scripted = scenario.scriptedResponses;
      if (this.verbose) {
        console.log(`\n=== Starting Scenario: ${scenario.name} ===`);
        console.log(`Persona: ${scenario.persona}`);
        console.log(`Max Turns: ${scenario.maxTurns}\n`);
      }
      let nextMessage = scripted?.[0] || "Hello, I need help";
      let shouldContinue = true;
      let currentTurn = 0;
      // Ask the simulator for the customer's next move at the current turn.
      const askSimulator = () => this.customerSimulator.generateMessage(scenario.persona, turns, {
        scenario: scenario.description,
        maxTurns: scenario.maxTurns,
        currentTurn,
        scriptedResponses: scripted
      });
      while (shouldContinue && currentTurn < scenario.maxTurns) {
        const customerMessage = nextMessage;
        if (this.verbose) console.log(`[${formatTimestamp()}] Turn ${currentTurn + 1} [customer]: ${customerMessage}`);
        const agentResponse = await this.waitForAgentResponse(mockProvider, customerId, customerMessage);
        if (this.verbose) {
          console.log(`[${formatTimestamp()}] Turn ${currentTurn + 1} [agent]: ${agentResponse.text}`);
          if (agentResponse.toolCalls?.length) console.log(`  Tools called: ${agentResponse.toolCalls.map((tc) => tc.name).join(", ")}`);
        }
        turns.push({
          role: "customer",
          message: customerMessage
        });
        turns.push({
          role: "agent",
          message: agentResponse.text,
          toolCalls: agentResponse.toolCalls
        });
        if (agentResponse.toolCalls) toolsCalled.push(...agentResponse.toolCalls);
        totalCost += agentResponse.cost || 0;
        currentTurn++;
        const turnsRemain = currentTurn < scenario.maxTurns;
        if (scripted) {
          shouldContinue = currentTurn < scripted.length;
          // Fetch the next scripted line only when it will actually be sent.
          if (shouldContinue && turnsRemain) nextMessage = (await askSimulator()).message;
        } else if (turnsRemain) {
          // One simulator call yields both the decision and the message it
          // belongs to; asking twice would double the LLM cost and pair the
          // decision with a different generated message.
          const next = await askSimulator();
          shouldContinue = next.shouldContinue;
          nextMessage = next.message;
          if (!shouldContinue && this.verbose) console.log("Customer satisfied, ending conversation.");
        }
      }
      if (this.verbose && currentTurn >= scenario.maxTurns) console.log(`\nReached max turns (${scenario.maxTurns})`);
      const evaluation = await this.conversationEvaluator.evaluate({
        scenario: scenario.description,
        persona: scenario.persona,
        turns,
        toolsCalled,
        expectedTools: scenario.expectedTools,
        expectedOutcome: scenario.expectedOutcome,
        successCriteria: scenario.successCriteria
      });
      if (this.verbose) {
        console.log(`\n=== Evaluation ===`);
        console.log(`Overall: ${evaluation.overall}`);
        console.log(`Scores: ${JSON.stringify(evaluation.scores)}`);
        console.log(`Feedback: ${evaluation.feedback}\n`);
      }
      const duration = Date.now() - startTime;
      return {
        scenario,
        passed: evaluation.overall === "pass",
        turns,
        evaluation,
        duration,
        cost: totalCost
      };
    } catch (error) {
      const duration = Date.now() - startTime;
      const errorMessage = error instanceof Error ? error.message : String(error);
      if (this.verbose) console.error(`\nError in scenario ${scenario.name}: ${errorMessage}\n`);
      return {
        scenario,
        passed: false,
        turns,
        evaluation: {
          overall: "fail",
          scores: {
            accuracy: 1,
            toolUsage: 1,
            tone: 1,
            resolution: 1
          },
          feedback: `Error: ${errorMessage}`
        },
        duration,
        cost: totalCost
      };
    }
  }

  /** Run scenarios one after another and collect their results in order. */
  async runScenarios(scenarios) {
    const results = [];
    for (const scenario of scenarios) {
      results.push(await this.runScenario(scenario));
    }
    return results;
  }

  /**
   * Find the provider named "mock" among `agentOS.providers`.
   * @throws {Error} when none is registered
   */
  getMockProvider() {
    const mockProvider = Array.from(this.agentOS.providers.values()).find((p) => p.name === "mock");
    if (!mockProvider) throw new Error("MockProvider not found in Operor. Add it with agentOS.addProvider()");
    return mockProvider;
  }

  /**
   * Simulate an incoming customer message and resolve with the next
   * "message:processed" event as `{ text, toolCalls, cost }`. Rejects on an
   * "error" event, on timeout, or when the provider throws synchronously.
   * Listeners and the timer are released in every case.
   */
  async waitForAgentResponse(mockProvider, customerId, message) {
    return new Promise((resolve, reject) => {
      let settled = false;
      // Run `action` at most once, after releasing the timer and both listeners.
      const settle = (action) => {
        if (settled) return;
        settled = true;
        clearTimeout(timeoutId);
        this.agentOS.removeListener("message:processed", onProcessed);
        this.agentOS.removeListener("error", onError);
        action();
      };
      const onProcessed = (event) => settle(() => resolve({
        text: event.response.text,
        toolCalls: event.response.toolCalls || [],
        cost: event.cost || 0
      }));
      const onError = (event) => settle(() => reject(event.error));
      const timeoutId = setTimeout(() => {
        settle(() => reject(new Error(`Agent response timed out after ${this.timeout}ms`)));
      }, this.timeout);
      this.agentOS.once("message:processed", onProcessed);
      this.agentOS.once("error", onError);
      try {
        mockProvider.simulateIncomingMessage(customerId, message);
      } catch (error) {
        // A throw here would reject the promise anyway, but would leave the
        // listeners and the timer behind.
        settle(() => reject(error));
      }
    });
  }
};
901
+
902
+ //#endregion
903
+ //#region src/scenarios.ts
904
/**
 * Built-in e-commerce customer-support scenarios.
 *
 * Every entry carries the same fields: `id`, `name`, `description`, `persona`,
 * `maxTurns`, `expectedTools`, `expectedOutcome` and `scriptedResponses`
 * (the customer messages, in the order they are listed).
 */
const ECOMMERCE_SCENARIOS = [
	// Order problems
	{
		id: "delayed-order-compensation",
		name: "Delayed order with compensation",
		description: "Customer asks about a delayed order and expects compensation",
		persona: "Frustrated customer whose order #12345 was supposed to arrive 3 days ago",
		maxTurns: 6,
		expectedTools: [
			"get_order",
			"create_discount"
		],
		expectedOutcome: "Agent finds order, acknowledges delay, offers discount",
		scriptedResponses: [
			"Where is my order #12345? It was supposed to arrive 3 days ago!",
			"This is really frustrating. Can you do anything to make up for this?",
			"Okay, I appreciate the discount. Thank you."
		]
	},
	{
		id: "order-not-found",
		name: "Order not found",
		description: "Customer asks about an order that does not exist in the system",
		persona: "Confused customer who may have the wrong order number",
		maxTurns: 4,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent attempts lookup, explains order not found, asks customer to verify",
		scriptedResponses: [
			"Can you check on order #99999 for me?",
			"Hmm, let me double check the number and get back to you."
		]
	},
	// Catalogue questions
	{
		id: "product-inquiry",
		name: "Product inquiry",
		description: "Customer searches for a product and asks about availability",
		persona: "Curious shopper looking for electronics",
		maxTurns: 4,
		expectedTools: [
			"search_products"
		],
		expectedOutcome: "Agent searches products and provides relevant results",
		scriptedResponses: [
			"Do you have any wireless headphones in stock?",
			"What about the price range? Anything under $200?",
			"Great, thanks for the info!"
		]
	},
	{
		id: "return-request",
		name: "Return request",
		description: "Customer wants to return a recently delivered item",
		persona: "Polite customer who received a defective product from order #67890",
		maxTurns: 5,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent looks up order, acknowledges issue, explains return process",
		scriptedResponses: [
			"I received order #67890 but one of the items is defective. I would like to return it.",
			"Yes, the wireless mouse stopped working after one day.",
			"Okay, how do I send it back?"
		]
	},
	// No tool expected
	{
		id: "greeting",
		name: "Simple greeting",
		description: "Customer says hello and expects a friendly welcome",
		persona: "Friendly first-time visitor",
		maxTurns: 2,
		expectedTools: [],
		expectedOutcome: "Agent responds with a friendly greeting and offers help",
		scriptedResponses: [
			"Hello!"
		]
	},
	{
		id: "billing-dispute",
		name: "Billing dispute",
		description: "Customer believes they were charged incorrectly",
		persona: "Concerned customer who noticed a double charge on order #12345",
		maxTurns: 5,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent looks up order, reviews charges, and addresses the billing concern",
		scriptedResponses: [
			"I think I was charged twice for order #12345. Can you check?",
			"My credit card shows two charges of $299.99 on the same day.",
			"Can you escalate this to someone who can issue a correction?"
		]
	},
	{
		id: "multi-issue",
		name: "Multi-issue conversation",
		description: "Customer has multiple problems: a delayed order and a product question",
		persona: "Busy customer who wants to resolve everything in one conversation",
		maxTurns: 6,
		expectedTools: [
			"get_order",
			"search_products"
		],
		expectedOutcome: "Agent handles both issues sequentially without losing context",
		scriptedResponses: [
			"Two things: first, where is my order #12345?",
			"Okay thanks. Also, do you carry mechanical keyboards?",
			"Nice, I might order one. Can you also give me a discount for the late delivery?",
			"Sounds good, thanks for handling both issues."
		]
	},
	{
		id: "lead-qualification",
		name: "Lead qualification",
		description: "Potential customer asking pre-purchase questions about products and shipping",
		persona: "Prospective buyer evaluating whether to make a purchase",
		maxTurns: 4,
		expectedTools: [
			"search_products"
		],
		expectedOutcome: "Agent answers product questions and encourages purchase",
		scriptedResponses: [
			"I am thinking about buying some electronics. What do you have?",
			"How fast is shipping usually?",
			"Do you offer any discounts for first-time buyers?"
		]
	},
	{
		id: "frustrated-escalation",
		name: "Frustrated customer escalation",
		description: "Angry customer escalates through multiple complaints",
		persona: "Very frustrated customer who has contacted support multiple times about order #12345",
		maxTurns: 6,
		expectedTools: [
			"get_order",
			"create_discount"
		],
		expectedOutcome: "Agent remains professional, empathizes, and offers concrete resolution",
		scriptedResponses: [
			"This is the THIRD time I am contacting you about order #12345. Still not here!",
			"I have been waiting over a week. This is completely unacceptable.",
			"I want a refund or serious compensation. A 5% coupon is insulting.",
			"Fine, that is more reasonable. But I expect the order to arrive this week."
		]
	},
	{
		id: "on-time-order-check",
		name: "On-time order status check",
		description: "Customer checks on an order that is on time or already delivered",
		persona: "Polite customer just checking in on order #67890",
		maxTurns: 2,
		expectedTools: [
			"get_order"
		],
		expectedOutcome: "Agent confirms order status, no compensation needed",
		scriptedResponses: [
			"Hi, can I get an update on order #67890?",
			"Perfect, thanks!"
		]
	}
];
1036
+
1037
+ //#endregion
1038
+ //#region src/SimulationRunner.ts
1039
/**
 * Orchestrates a full simulation: optional test-suite files followed by
 * scheduled multi-turn conversations, then aggregates everything into a report.
 *
 * Options: `agentOS` (the agent under test), `config` (what to run), and an
 * optional `llm` used for simulation, evaluation and failure analysis.
 */
var SimulationRunner = class {
	agentOS;
	config;
	llm;
	constructor(options) {
		this.agentOS = options.agentOS;
		this.config = options.config;
		this.llm = options.llm;
	}
	/**
	 * Execute the configured test suites and conversations.
	 *
	 * @param onProgress optional callback invoked after each conversation with
	 *   `(completedCount, totalCount, result)`
	 * @returns the simulation report; `overallPassed` is true only when nothing
	 *   failed and at least one test or conversation ran
	 */
	async run(onProgress) {
		const startTime = Date.now();
		const testSuiteResults = [];
		const conversationResults = [];
		let totalCost = 0;
		if (this.config.testSuiteFiles?.length) {
			const suiteRunner = new TestSuiteRunner({
				agentOS: this.agentOS,
				llm: this.llm,
				timeout: this.config.timeout
			});
			for (const file of this.config.testSuiteFiles) {
				const testCases = await CSVLoader.fromFile(file);
				const result = await suiteRunner.runSuite(testCases);
				testSuiteResults.push(result);
				totalCost += result.totalCost;
			}
		}
		const scenarios = this.resolveScenarios();
		if (scenarios.length) {
			const conversationRunner = new ConversationRunner({
				agentOS: this.agentOS,
				customerSimulator: new CustomerSimulator({ llmProvider: this.llm }),
				conversationEvaluator: new ConversationEvaluator({ llmProvider: this.llm }),
				timeout: this.config.timeout
			});
			const schedule = this.buildSchedule(scenarios);
			const pauseMs = this.config.pauseBetweenMs ?? 500;
			const timeoutMs = this.config.timeout || 6e4;
			for (let i = 0; i < schedule.length; i++) {
				const result = await this.runScenarioWithDeadline(conversationRunner, schedule[i], timeoutMs);
				conversationResults.push(result);
				totalCost += result.cost;
				if (onProgress) onProgress(i + 1, schedule.length, result);
				// Pause between conversations, but not after the last one.
				if (i < schedule.length - 1 && pauseMs > 0) await new Promise((resolve) => setTimeout(resolve, pauseMs));
			}
		}
		const duration = Date.now() - startTime;
		const totalTests = testSuiteResults.reduce((sum, r) => sum + r.total, 0);
		const passedTests = testSuiteResults.reduce((sum, r) => sum + r.passed, 0);
		const failedTests = testSuiteResults.reduce((sum, r) => sum + r.failed, 0);
		const totalConversations = conversationResults.length;
		const passedConversations = conversationResults.filter((r) => r.passed).length;
		const failedConversations = totalConversations - passedConversations;
		const totalItems = totalTests + totalConversations;
		const passedItems = passedTests + passedConversations;
		const overallPassRate = totalItems > 0 ? passedItems / totalItems : 0;
		const averageScores = this.computeAverageScores(conversationResults);
		const scenarioBreakdown = this.computeScenarioBreakdown(conversationResults);
		const toolUsageStats = this.computeToolUsageStats(conversationResults);
		const failedResults = conversationResults.filter((r) => !r.passed);
		let commonFailurePatterns = [];
		let recommendations = [];
		if (failedResults.length > 0) {
			if (this.llm) {
				const analysis = await this.analyzeFailuresWithLLM(failedResults);
				commonFailurePatterns = analysis.patterns;
				recommendations = analysis.recommendations;
			} else {
				commonFailurePatterns = this.heuristicFailurePatterns(failedResults);
				recommendations = this.heuristicRecommendations(failedResults);
			}
		}
		return {
			timestamp: new Date(),
			duration,
			totalConversations,
			passed: passedConversations,
			failed: failedConversations,
			averageScores,
			scenarioBreakdown,
			toolUsageStats,
			commonFailurePatterns,
			recommendations,
			testSuiteResults,
			conversationResults,
			overallPassed: failedTests === 0 && failedConversations === 0 && totalItems > 0,
			totalCost,
			summary: {
				totalTests,
				passedTests,
				failedTests,
				totalConversations,
				passedConversations,
				failedConversations,
				overallPassRate
			}
		};
	}
	/**
	 * Run one conversation, giving up after `timeoutMs`.
	 *
	 * A timeout or rejection is converted into a failing result (all scores 1,
	 * no turns, zero cost, `duration` set to `timeoutMs`). The deadline timer is
	 * always cleared so a finished conversation does not keep the event loop
	 * alive for the remainder of the timeout.
	 *
	 * Note: when the deadline wins, the underlying `runScenario` call is not
	 * cancelled; its eventual outcome is simply ignored.
	 *
	 * @param conversationRunner object exposing `runScenario(scenario)`
	 * @param scenario scenario to execute
	 * @param timeoutMs deadline in milliseconds
	 * @returns the conversation result, or a synthesized failing result
	 */
	async runScenarioWithDeadline(conversationRunner, scenario, timeoutMs) {
		let deadlineTimer;
		try {
			return await Promise.race([conversationRunner.runScenario(scenario), new Promise((_, reject) => {
				deadlineTimer = setTimeout(() => reject(new Error(`Conversation timed out after ${timeoutMs}ms`)), timeoutMs);
			})]);
		} catch (error) {
			return {
				scenario,
				passed: false,
				turns: [],
				evaluation: {
					overall: "fail",
					scores: {
						accuracy: 1,
						toolUsage: 1,
						tone: 1,
						resolution: 1
					},
					feedback: `Timeout or error: ${error instanceof Error ? error.message : String(error)}`
				},
				duration: timeoutMs,
				cost: 0
			};
		} finally {
			clearTimeout(deadlineTimer);
		}
	}
	/**
	 * Render a report produced by `run` as human-readable text.
	 *
	 * Sections with no content (test suites, scenario breakdown, tool usage,
	 * average scores, failure patterns, recommendations) are omitted.
	 *
	 * @param report report object as returned by `run`
	 * @returns newline-joined report text
	 */
	static formatReport(report) {
		const lines = [];
		lines.push("=== Simulation Report ===");
		lines.push(`Date: ${report.timestamp.toISOString()}`);
		lines.push(`Duration: ${(report.duration / 1e3).toFixed(1)}s`);
		lines.push(`Cost: $${report.totalCost.toFixed(4)}`);
		lines.push("");
		if (report.testSuiteResults.length) {
			lines.push("--- Test Suites ---");
			for (const suite of report.testSuiteResults) {
				lines.push(` ${suite.passed}/${suite.total} passed (avg score: ${suite.averageScore.toFixed(2)})`);
				for (const result of suite.results) {
					const status = result.evaluation.passed ? "PASS" : "FAIL";
					lines.push(` [${status}] ${result.testCase.id}: ${result.testCase.question}`);
				}
			}
			lines.push("");
		}
		if (report.scenarioBreakdown.length) {
			lines.push("--- Scenario Breakdown ---");
			for (const s of report.scenarioBreakdown) {
				const pct = (s.passRate * 100).toFixed(0);
				lines.push(` ${s.scenario}: ${s.runs} run(s), ${pct}% pass rate, avg score ${s.avgScore.toFixed(2)}`);
			}
			lines.push("");
		}
		const toolEntries = Object.entries(report.toolUsageStats);
		if (toolEntries.length) {
			lines.push("--- Tool Usage ---");
			// Most-used tools first; `toolEntries` is a fresh array, so sorting in place is safe.
			for (const [tool, count] of toolEntries.sort((a, b) => b[1] - a[1])) lines.push(` ${tool}: ${count} call(s)`);
			lines.push("");
		}
		const { averageScores } = report;
		if (report.totalConversations > 0) {
			lines.push("--- Average Scores ---");
			lines.push(` Accuracy: ${averageScores.accuracy.toFixed(2)}`);
			lines.push(` Tool Usage: ${averageScores.toolUsage.toFixed(2)}`);
			lines.push(` Tone: ${averageScores.tone.toFixed(2)}`);
			lines.push(` Resolution: ${averageScores.resolution.toFixed(2)}`);
			lines.push("");
		}
		if (report.commonFailurePatterns.length) {
			lines.push("--- Common Failure Patterns ---");
			for (const pattern of report.commonFailurePatterns) lines.push(` - ${pattern}`);
			lines.push("");
		}
		if (report.recommendations.length) {
			lines.push("--- Recommendations ---");
			for (const rec of report.recommendations) lines.push(` - ${rec}`);
			lines.push("");
		}
		const { summary } = report;
		lines.push("--- Summary ---");
		if (summary.totalTests > 0) lines.push(`Tests: ${summary.passedTests}/${summary.totalTests} passed`);
		lines.push(`Conversations: ${summary.passedConversations}/${summary.totalConversations} passed`);
		lines.push(`Overall pass rate: ${(summary.overallPassRate * 100).toFixed(1)}%`);
		lines.push(`Result: ${report.overallPassed ? "PASSED" : "FAILED"}`);
		return lines.join("\n");
	}
	/**
	 * Resolve `config.conversationScenarios` into a scenario array:
	 * unset yields `[]`, the string "builtin" yields `ECOMMERCE_SCENARIOS`,
	 * anything else is returned as given.
	 */
	resolveScenarios() {
		if (!this.config.conversationScenarios) return [];
		if (this.config.conversationScenarios === "builtin") return ECOMMERCE_SCENARIOS;
		return this.config.conversationScenarios;
	}
	/**
	 * Build the ordered list of conversations to run by cycling through
	 * `scenarios` until `config.totalConversations` entries exist
	 * (defaults to one run per scenario).
	 */
	buildSchedule(scenarios) {
		const total = this.config.totalConversations ?? scenarios.length;
		const schedule = [];
		for (let i = 0; i < total; i++) schedule.push(scenarios[i % scenarios.length]);
		return schedule;
	}
	/**
	 * Mean of each evaluation score across `results`; all zeros when empty.
	 */
	computeAverageScores(results) {
		if (results.length === 0) return {
			accuracy: 0,
			toolUsage: 0,
			tone: 0,
			resolution: 0
		};
		const totals = results.reduce((acc, r) => ({
			accuracy: acc.accuracy + r.evaluation.scores.accuracy,
			toolUsage: acc.toolUsage + r.evaluation.scores.toolUsage,
			tone: acc.tone + r.evaluation.scores.tone,
			resolution: acc.resolution + r.evaluation.scores.resolution
		}), {
			accuracy: 0,
			toolUsage: 0,
			tone: 0,
			resolution: 0
		});
		const n = results.length;
		return {
			accuracy: totals.accuracy / n,
			toolUsage: totals.toolUsage / n,
			tone: totals.tone / n,
			resolution: totals.resolution / n
		};
	}
	/**
	 * Group results by scenario name and report, per scenario, the run count,
	 * pass rate and the average of the four scores.
	 */
	computeScenarioBreakdown(results) {
		const byScenario = new Map();
		for (const r of results) {
			const name = r.scenario.name;
			if (!byScenario.has(name)) byScenario.set(name, []);
			byScenario.get(name).push(r);
		}
		return Array.from(byScenario.entries()).map(([scenario, runs]) => {
			const passed = runs.filter((r) => r.passed).length;
			const scores = runs.map((r) => {
				const s = r.evaluation.scores;
				return (s.accuracy + s.toolUsage + s.tone + s.resolution) / 4;
			});
			const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
			return {
				scenario,
				runs: runs.length,
				passRate: passed / runs.length,
				avgScore
			};
		});
	}
	/**
	 * Count tool calls by tool name across every turn of every result.
	 *
	 * Counting happens in a Map so that tool names which collide with
	 * `Object.prototype` members (e.g. "constructor", "toString") are tallied
	 * as numbers instead of being concatenated onto the inherited value.
	 *
	 * @returns plain object mapping tool name to call count
	 */
	computeToolUsageStats(results) {
		const counts = new Map();
		for (const r of results) for (const turn of r.turns) {
			if (!turn.toolCalls) continue;
			for (const tc of turn.toolCalls) counts.set(tc.name, (counts.get(tc.name) ?? 0) + 1);
		}
		return Object.fromEntries(counts);
	}
	/**
	 * Ask the LLM to summarise failure patterns and recommendations for up to
	 * the first ten failed conversations. Falls back to the heuristic analysis
	 * when the LLM call or JSON parsing fails; returns empty lists when no LLM
	 * is configured.
	 */
	async analyzeFailuresWithLLM(failedResults) {
		if (!this.llm) return {
			patterns: [],
			recommendations: []
		};
		const transcripts = failedResults.slice(0, 10).map((r) => {
			const turns = r.turns.map((t) => `[${t.role}]: ${t.message}`).join("\n");
			return `Scenario: ${r.scenario.name}\nFeedback: ${r.evaluation.feedback}\nConversation:\n${turns}`;
		}).join("\n\n---\n\n");
		const prompt = `Analyze these failed customer support conversation tests and identify patterns.

${transcripts}

Respond with ONLY valid JSON (no markdown, no code fences):
{
  "patterns": ["pattern 1", "pattern 2"],
  "recommendations": ["recommendation 1", "recommendation 2"]
}`;
		try {
			const completion = await this.llm.complete([{
				role: "user",
				content: prompt
			}], {
				temperature: 0,
				maxTokens: 1e3
			});
			// Models sometimes wrap the JSON in code fences despite the instruction.
			const cleaned = completion.text.replace(/```(?:json)?\s*/g, "").replace(/```/g, "").trim();
			const parsed = JSON.parse(cleaned);
			return {
				patterns: Array.isArray(parsed.patterns) ? parsed.patterns.map(String) : [],
				recommendations: Array.isArray(parsed.recommendations) ? parsed.recommendations.map(String) : []
			};
		} catch {
			// Deliberate best-effort: any LLM or parsing failure degrades to heuristics.
			return {
				patterns: this.heuristicFailurePatterns(failedResults),
				recommendations: this.heuristicRecommendations(failedResults)
			};
		}
	}
	/**
	 * Rule-based failure patterns: conversations without any tool call, and
	 * conversations whose resolution or tone score is 2 or lower.
	 */
	heuristicFailurePatterns(failedResults) {
		const patterns = [];
		const noToolCalls = failedResults.filter((r) => r.turns.every((t) => !t.toolCalls?.length));
		if (noToolCalls.length > 0) patterns.push(`${noToolCalls.length} conversation(s) failed with no tool calls`);
		const lowResolution = failedResults.filter((r) => r.evaluation.scores.resolution <= 2);
		if (lowResolution.length > 0) patterns.push(`${lowResolution.length} conversation(s) had low resolution scores`);
		const lowTone = failedResults.filter((r) => r.evaluation.scores.tone <= 2);
		if (lowTone.length > 0) patterns.push(`${lowTone.length} conversation(s) had low tone scores`);
		if (patterns.length === 0) patterns.push(`${failedResults.length} conversation(s) failed evaluation criteria`);
		return patterns;
	}
	/**
	 * Rule-based recommendations: flag conversations without tool calls and
	 * list every expected tool that a failed conversation never called.
	 */
	heuristicRecommendations(failedResults) {
		const recs = [];
		if (failedResults.some((r) => r.turns.every((t) => !t.toolCalls?.length))) recs.push("Ensure agent is configured to use available tools for customer queries");
		const expectedButMissing = new Set();
		for (const r of failedResults) for (const tool of r.scenario.expectedTools || []) {
			if (!r.turns.some((t) => t.toolCalls?.some((tc) => tc.name === tool))) expectedButMissing.add(tool);
		}
		if (expectedButMissing.size > 0) recs.push(`Tools expected but not called: ${Array.from(expectedButMissing).join(", ")}`);
		if (recs.length === 0) recs.push("Review failed scenarios and adjust agent rules or prompts");
		return recs;
	}
};
1335
+
1336
+ //#endregion
1337
+ //#region src/MockShopifySkill.ts
1338
/**
 * In-memory stand-in for a Shopify skill, exposing the tools `get_order`,
 * `create_discount` and `search_products` over seeded mock data.
 *
 * Two orders ("12345" in transit, "67890" delivered) and four products are
 * seeded on construction and again on `reset()`.
 */
var MockShopifySkill = class {
	name = "shopify";
	ready = false;
	mockOrders = new Map();
	mockProducts = [];
	mockDiscounts = [];
	nextPriceRuleId = 1e3;
	constructor() {
		this.seedMockData();
	}
	/** Mark the skill as ready. */
	async initialize() {
		this.ready = true;
		console.log("✅ Mock Shopify initialized");
	}
	/** @deprecated Use initialize() instead. */
	async authenticate() {
		return this.initialize();
	}
	/** @returns whether `initialize()` has been called */
	isReady() {
		return this.ready;
	}
	/** @deprecated Use isReady() instead. */
	isAuthenticated() {
		return this.ready;
	}
	/**
	 * Reset all mock data to initial state (for testing)
	 */
	reset() {
		this.mockOrders.clear();
		this.mockProducts = [];
		this.mockDiscounts = [];
		this.nextPriceRuleId = 1e3;
		this.seedMockData();
	}
	/**
	 * Seed custom test data (for testing)
	 *
	 * Every field of every entry is optional; missing fields get defaults.
	 * Orders and products without an `id` receive a generated id that is
	 * guaranteed not to collide with an existing one, so several id-less
	 * entries seeded in the same millisecond no longer overwrite each other.
	 *
	 * @param config object with optional `orders`, `products` and `discounts` arrays
	 */
	seedData(config) {
		if (config.orders) for (const order of config.orders) {
			const fullOrder = {
				id: order.id || this.generateOrderId(),
				name: order.name || `#${order.id || "1001"}`,
				status: order.status || "unfulfilled",
				financialStatus: order.financialStatus || "paid",
				createdAt: order.createdAt || new Date().toISOString(),
				expectedDelivery: order.expectedDelivery || new Date(),
				actualDelivery: order.actualDelivery,
				tracking: order.tracking,
				trackingUrl: order.trackingUrl,
				items: order.items || [],
				total: order.total || "0.00"
			};
			this.mockOrders.set(fullOrder.id, fullOrder);
		}
		if (config.products) for (const product of config.products) {
			const fullProduct = {
				id: product.id || this.generateProductId(),
				title: product.title || "Product",
				vendor: product.vendor || "Mock Vendor",
				type: product.type || "General",
				price: product.price || "0.00",
				available: product.available !== void 0 ? product.available : true
			};
			this.mockProducts.push(fullProduct);
		}
		if (config.discounts) for (const discount of config.discounts) {
			const validDays = discount.validDays || 30;
			const fullDiscount = {
				code: discount.code || "DISCOUNT",
				percent: discount.percent || 10,
				validDays,
				startsAt: discount.startsAt || new Date().toISOString(),
				// Default expiry follows validDays (30 days when validDays is not given).
				expiresAt: discount.expiresAt || new Date(Date.now() + validDays * 24 * 60 * 60 * 1e3).toISOString(),
				priceRuleId: discount.priceRuleId || this.nextPriceRuleId++,
				createdAt: discount.createdAt || new Date()
			};
			this.mockDiscounts.push(fullDiscount);
		}
	}
	/** Timestamp-based order id (string) not yet present in `mockOrders`. */
	generateOrderId() {
		let candidate = Date.now();
		while (this.mockOrders.has(String(candidate))) candidate++;
		return String(candidate);
	}
	/** Timestamp-based product id (number) not yet present in `mockProducts`. */
	generateProductId() {
		let candidate = Date.now();
		while (this.mockProducts.some((p) => p.id === candidate)) candidate++;
		return candidate;
	}
	/**
	 * Find an order whose `name`, with its "#" removed, equals `identifier`.
	 * @returns the first matching order, or `undefined`
	 */
	findOrderByName(identifier) {
		if (!identifier) return void 0;
		for (const order of this.mockOrders.values()) {
			if (String(order.name).replace("#", "") === identifier) return order;
		}
		return void 0;
	}
	/** Populate the default orders and products. */
	seedMockData() {
		const twoDaysAgo = new Date();
		twoDaysAgo.setDate(twoDaysAgo.getDate() - 2);
		this.mockOrders.set("12345", {
			id: "12345",
			name: "#1001",
			status: "in_transit",
			financialStatus: "paid",
			createdAt: new Date(Date.now() - 7200 * 60 * 1e3).toISOString(),
			expectedDelivery: twoDaysAgo,
			tracking: "TRACK123456789",
			trackingUrl: "https://track.example.com/TRACK123456789",
			items: [{
				name: "Premium Headphones",
				quantity: 1,
				price: "299.99"
			}],
			total: "299.99"
		});
		this.mockOrders.set("67890", {
			id: "67890",
			name: "#1002",
			status: "delivered",
			financialStatus: "paid",
			createdAt: new Date(Date.now() - 14400 * 60 * 1e3).toISOString(),
			expectedDelivery: new Date(Date.now() - 7200 * 60 * 1e3),
			actualDelivery: new Date(Date.now() - 4320 * 60 * 1e3),
			tracking: "TRACK987654321",
			trackingUrl: "https://track.example.com/TRACK987654321",
			items: [{
				name: "Wireless Mouse",
				quantity: 2,
				price: "49.99"
			}],
			total: "99.98"
		});
		this.mockProducts = [
			{
				id: 1001,
				title: "Premium Headphones",
				vendor: "AudioTech",
				type: "Electronics",
				price: "299.99",
				available: true
			},
			{
				id: 1002,
				title: "Wireless Mouse",
				vendor: "TechGear",
				type: "Electronics",
				price: "49.99",
				available: true
			},
			{
				id: 1003,
				title: "Mechanical Keyboard",
				vendor: "KeyMaster",
				type: "Electronics",
				price: "149.99",
				available: true
			},
			{
				id: 1004,
				title: "USB-C Cable",
				vendor: "CableCo",
				type: "Accessories",
				price: "19.99",
				available: false
			}
		];
	}
	tools = {
		get_order: {
			name: "get_order",
			description: "Get order details by order ID or order name (e.g., #1001)",
			parameters: { orderId: {
				type: "string",
				required: true
			} },
			/**
			 * Look an order up by id first, then by name, as the description
			 * promises. `orderId` may be a string or a number; the first "#"
			 * is ignored.
			 *
			 * Delay is measured from the expected delivery to the actual
			 * delivery when the order has one, otherwise to the current time,
			 * so a delivered order's delay no longer grows as time passes.
			 */
			execute: async (params) => {
				const orderIdentifier = String(params.orderId ?? "").trim().replace("#", "");
				const order = this.mockOrders.get(orderIdentifier) ?? this.findOrderByName(orderIdentifier);
				if (!order) return {
					found: false,
					error: `Order ${params.orderId} not found`
				};
				// seedData stores delivery dates as given, so accept Date objects and date strings alike.
				const expectedAt = new Date(order.expectedDelivery).getTime();
				const referenceAt = order.actualDelivery ? new Date(order.actualDelivery).getTime() : Date.now();
				const delayMs = referenceAt - expectedAt;
				// An unparseable date yields NaN; report that as "no delay" rather than NaN days.
				const delayDays = Number.isFinite(delayMs) ? Math.floor(delayMs / (1e3 * 60 * 60 * 24)) : 0;
				return {
					found: true,
					id: order.id,
					name: order.name,
					status: order.status,
					financialStatus: order.financialStatus,
					createdAt: order.createdAt,
					total: order.total,
					items: order.items,
					tracking: order.tracking || null,
					trackingUrl: order.trackingUrl || null,
					isDelayed: delayDays > 0,
					delayDays: Math.max(0, delayDays)
				};
			}
		},
		create_discount: {
			name: "create_discount",
			description: "Create a percentage discount code",
			parameters: {
				percent: {
					type: "number",
					required: true
				},
				validDays: {
					type: "number",
					required: true
				}
			},
			/**
			 * Record a discount coded `SORRY<percent>` that starts now and
			 * expires after `validDays` days, and return its details.
			 */
			execute: async (params) => {
				const code = `SORRY${params.percent}`;
				const startsAt = new Date().toISOString();
				const expiresAt = new Date(Date.now() + params.validDays * 24 * 60 * 60 * 1e3).toISOString();
				const priceRuleId = this.nextPriceRuleId++;
				const discount = {
					code,
					percent: params.percent,
					validDays: params.validDays,
					startsAt,
					expiresAt,
					priceRuleId,
					createdAt: new Date()
				};
				this.mockDiscounts.push(discount);
				console.log(`\n💰 Created discount code: ${code}`);
				console.log(` ${params.percent}% off, valid for ${params.validDays} days\n`);
				return {
					code,
					percent: params.percent,
					validDays: params.validDays,
					startsAt,
					expiresAt,
					priceRuleId
				};
			}
		},
		search_products: {
			name: "search_products",
			description: "Search for products in the store",
			parameters: {
				query: {
					type: "string",
					required: true
				},
				limit: {
					type: "number",
					required: false
				}
			},
			/**
			 * Case-insensitive substring match of `query` against product
			 * title, vendor and type; returns at most `limit` (default 10) hits.
			 */
			execute: async (params) => {
				const limit = params.limit || 10;
				const queryLower = params.query.toLowerCase();
				const matched = this.mockProducts.filter((p) => p.title.toLowerCase().includes(queryLower) || p.vendor.toLowerCase().includes(queryLower) || p.type.toLowerCase().includes(queryLower)).slice(0, limit);
				return {
					found: matched.length,
					products: matched.map((p) => ({
						id: p.id,
						title: p.title,
						vendor: p.vendor,
						type: p.type,
						price: p.price,
						available: p.available
					}))
				};
			}
		}
	};
	/**
	 * Get all created discounts (for testing)
	 */
	getDiscounts() {
		return [...this.mockDiscounts];
	}
	/**
	 * Get all orders (for testing)
	 */
	getOrders() {
		return Array.from(this.mockOrders.values());
	}
};
1605
+
1606
+ //#endregion
1607
+ export { CSVLoader, ConversationEvaluator, ConversationRunner, CustomerSimulator, ECOMMERCE_SCENARIOS, MockShopifySkill, SimulationRunner, SkillTestHarness, TestCaseEvaluator, TestSuiteRunner, formatTimestamp };
1608
+ //# sourceMappingURL=index.js.map