@operor/testing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/API_VALIDATION.md +572 -0
- package/dist/index.d.ts +414 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1608 -0
- package/dist/index.js.map +1 -0
- package/fixtures/sample-tests.csv +10 -0
- package/package.json +31 -0
- package/src/CSVLoader.ts +83 -0
- package/src/ConversationEvaluator.ts +254 -0
- package/src/ConversationRunner.ts +267 -0
- package/src/CustomerSimulator.ts +106 -0
- package/src/MockShopifySkill.ts +336 -0
- package/src/SimulationRunner.ts +425 -0
- package/src/SkillTestHarness.ts +220 -0
- package/src/TestCaseEvaluator.ts +296 -0
- package/src/TestSuiteRunner.ts +151 -0
- package/src/__tests__/CSVLoader.test.ts +122 -0
- package/src/__tests__/ConversationEvaluator.test.ts +221 -0
- package/src/__tests__/ConversationRunner.test.ts +270 -0
- package/src/__tests__/CustomerSimulator.test.ts +160 -0
- package/src/__tests__/SimulationRunner.test.ts +281 -0
- package/src/__tests__/SkillTestHarness.test.ts +181 -0
- package/src/__tests__/scenarios.test.ts +71 -0
- package/src/index.ts +32 -0
- package/src/scenarios/edge-cases.ts +52 -0
- package/src/scenarios/general.ts +37 -0
- package/src/scenarios/index.ts +32 -0
- package/src/scenarios/order-tracking.ts +56 -0
- package/src/scenarios.ts +142 -0
- package/src/types.ts +133 -0
- package/src/utils.ts +6 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { ConversationEvaluator } from '../ConversationEvaluator.js';
|
|
3
|
+
import type { ConversationTurn, ConversationSuccessCriteria } from '../types.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Shared four-turn fixture conversation: the customer asks about order #1234,
 * the agent answers (recording one `get_order` tool call on that reply),
 * the customer thanks the agent, and the agent signs off.
 */
const agentTurns: ConversationTurn[] = [
  {
    role: 'customer',
    message: 'Where is my order #1234?',
  },
  {
    role: 'agent',
    message: 'Let me look that up for you. Your order #1234 is currently in transit and should arrive by Friday.',
    // The only turn in the fixture that carries tool-call metadata.
    toolCalls: [
      {
        name: 'get_order',
        params: { orderId: '1234' },
        result: { status: 'in_transit' },
      },
    ],
  },
  {
    role: 'customer',
    message: 'Great, thanks!',
  },
  {
    role: 'agent',
    message: "You're welcome! Is there anything else I can help with?",
  },
];
|
|
15
|
+
|
|
16
|
+
describe('ConversationEvaluator', () => {
|
|
17
|
+
describe('criteria-based evaluation (no LLM)', () => {
|
|
18
|
+
// Evaluator shared by this suite; it is constructed without options
// (the suite title describes this as the no-LLM path).
const evaluator = new ConversationEvaluator();

// A `tool_called` criterion is satisfied when the named tool appears in `toolsCalled`.
it('passes tool_called criteria when tool was called', async () => {
  const toolsCalled = [
    { name: 'get_order', params: { orderId: '1234' }, result: { status: 'in_transit' } },
  ];

  const { overall, criteriaResults } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled,
    successCriteria: [{ type: 'tool_called', value: 'get_order' }],
  });

  expect(overall).toBe('pass');
  expect(criteriaResults![0].passed).toBe(true);
});
|
|
31
|
+
|
|
32
|
+
// Asking for a tool (`create_discount`) that is absent from the empty `toolsCalled`
// list must fail, and the criterion details must name the missing tool.
it('fails tool_called criteria when tool was not called', async () => {
  const { overall, criteriaResults } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'tool_called', value: 'create_discount' }],
  });

  expect(overall).toBe('fail');

  const [criterion] = criteriaResults!;
  expect(criterion.passed).toBe(false);
  expect(criterion.details).toContain('create_discount');
});
|
|
44
|
+
|
|
45
|
+
// The fixture's first agent reply contains the phrase "in transit", so a
// `response_contains` criterion for that phrase should pass.
// NOTE(review): the needle has the same casing as the fixture text, so the
// case-insensitivity mentioned in the title is not actually exercised here.
it('passes response_contains criteria (case-insensitive)', async () => {
  const { overall, criteriaResults } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'response_contains', value: 'in transit' }],
  });

  expect(overall).toBe('pass');
  expect(criteriaResults![0].passed).toBe(true);
});
|
|
56
|
+
|
|
57
|
+
// No turn in the fixture mentions "refund issued", so the criterion cannot be met.
it('fails response_contains when text not found', async () => {
  const { overall } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'response_contains', value: 'refund issued' }],
  });

  expect(overall).toBe('fail');
});
|
|
67
|
+
|
|
68
|
+
// The fixture holds 4 turns, which is below the limit of 10; the criterion
// details are expected to report the turn count.
it('passes turns_under criteria when under limit', async () => {
  const { overall, criteriaResults } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'turns_under', value: 10 }],
  });

  expect(overall).toBe('pass');
  expect(criteriaResults![0].details).toContain('4 turns');
});
|
|
79
|
+
|
|
80
|
+
// With a limit of 2, the 4-turn fixture conversation is too long.
it('fails turns_under criteria when over limit', async () => {
  const { overall } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'turns_under', value: 2 }],
  });

  expect(overall).toBe('fail');
});
|
|
90
|
+
|
|
91
|
+
// A `custom` criterion carries a predicate over the turns; here it is satisfied
// because the fixture's last agent message contains "welcome".
it('evaluates custom criteria with function', async () => {
  const isAgentWelcome = (turn: ConversationTurn) =>
    turn.role === 'agent' && turn.message.includes('welcome');
  const customFn = (turns: ConversationTurn[]) => turns.some(isAgentWelcome);

  const { overall } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'custom', value: customFn }],
  });

  expect(overall).toBe('pass');
});
|
|
104
|
+
|
|
105
|
+
// Two `tool_called` criteria: the first (`get_order`) is expected to pass and the
// second (`create_discount`) to fail, yielding an overall 'partial'.
// NOTE(review): `toolsCalled` is empty here, so the `get_order` pass presumably
// comes from the tool call recorded on the fixture's agent turn — confirm.
it('returns partial when some criteria pass and some fail', async () => {
  const { overall, criteriaResults } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [
      { type: 'tool_called', value: 'get_order' },
      { type: 'tool_called', value: 'create_discount' },
    ],
  });

  expect(overall).toBe('partial');

  const outcomes = criteriaResults!;
  expect(outcomes[0].passed).toBe(true);
  expect(outcomes[1].passed).toBe(false);
});
|
|
120
|
+
|
|
121
|
+
// With no success criteria and no LLM provider configured, the evaluator is
// expected to report a pass together with an explanatory feedback message.
// The assertion previously expected 'fail', contradicting the test title.
// NOTE(review): confirm against ConversationEvaluator's no-criteria branch.
it('returns pass with no criteria and no LLM', async () => {
  const result = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
  });

  expect(result.overall).toBe('pass');
  expect(result.feedback).toBe('No criteria specified');
});
|
|
131
|
+
|
|
132
|
+
// Three criteria of different kinds are supplied; every one must pass and so must the whole.
it('handles multiple criteria all passing', async () => {
  const { overall, criteriaResults } = await evaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [
      { type: 'tool_called', value: 'get_order' },
      { type: 'response_contains', value: 'transit' },
      { type: 'turns_under', value: 10 },
    ],
  });

  expect(overall).toBe('pass');

  const everyCriterionPassed = criteriaResults!.every((entry) => entry.passed);
  expect(everyCriterionPassed).toBe(true);
});
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
describe('LLM-based evaluation', () => {
|
|
150
|
+
// The stub LLM checks the request it receives (temperature 0, scenario name in
// the first message) and answers with a JSON verdict that the evaluator must
// surface as overall/scores/feedback.
it('calls LLM with temperature 0 and parses response', async () => {
  const cannedVerdict = {
    overall: 'pass',
    scores: { accuracy: 5, toolUsage: 4, tone: 5, resolution: 4 },
    feedback: 'Agent handled the order inquiry well.',
  };

  const stubLLM = {
    async complete(messages: any[], options: any) {
      expect(options.temperature).toBe(0);
      expect(messages[0].content).toContain('Order tracking');
      return { text: JSON.stringify(cannedVerdict) };
    },
  };

  const llmEvaluator = new ConversationEvaluator({ llmProvider: stubLLM as any });
  const toolsCalled = [
    { name: 'get_order', params: { orderId: '1234' }, result: { status: 'in_transit' } },
  ];

  const { overall, scores, feedback } = await llmEvaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled,
  });

  expect(overall).toBe('pass');
  expect(scores.accuracy).toBe(5);
  expect(scores.toolUsage).toBe(4);
  expect(feedback).toContain('order inquiry');
});
|
|
178
|
+
|
|
179
|
+
// Even when the LLM verdict is a perfect 'pass', an unmet success criterion
// is expected to lower the overall result to 'partial'.
it('downgrades LLM pass to partial when criteria fail', async () => {
  const perfectVerdict = JSON.stringify({
    overall: 'pass',
    scores: { accuracy: 5, toolUsage: 5, tone: 5, resolution: 5 },
    feedback: 'Perfect.',
  });
  const stubLLM = {
    async complete() {
      return { text: perfectVerdict };
    },
  };

  const llmEvaluator = new ConversationEvaluator({ llmProvider: stubLLM as any });
  const { overall, criteriaResults } = await llmEvaluator.evaluate({
    scenario: 'Order tracking',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
    successCriteria: [{ type: 'tool_called', value: 'nonexistent_tool' }],
  });

  expect(overall).toBe('partial');
  expect(criteriaResults![0].passed).toBe(false);
});
|
|
202
|
+
|
|
203
|
+
// An LLM reply that is not JSON must not crash the evaluator: the test expects
// a 'fail' result, an accuracy score of 1 and feedback mentioning the parse failure.
it('handles malformed LLM response gracefully', async () => {
  const stubLLM = {
    async complete() {
      return { text: 'not valid json at all' };
    },
  };

  const llmEvaluator = new ConversationEvaluator({ llmProvider: stubLLM as any });
  const { overall, scores, feedback } = await llmEvaluator.evaluate({
    scenario: 'Test',
    persona: 'polite',
    turns: agentTurns,
    toolsCalled: [],
  });

  expect(overall).toBe('fail');
  expect(scores.accuracy).toBe(1);
  expect(feedback).toContain('Failed to parse');
});
|
|
220
|
+
});
|
|
221
|
+
});
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
+
import { ConversationRunner } from '../ConversationRunner.js';
|
|
3
|
+
import { CustomerSimulator } from '../CustomerSimulator.js';
|
|
4
|
+
import { ConversationEvaluator } from '../ConversationEvaluator.js';
|
|
5
|
+
import type { ConversationScenario } from '../types.js';
|
|
6
|
+
import { EventEmitter } from 'node:events';
|
|
7
|
+
|
|
8
|
+
/**
 * Builds a minimal stand-in for an Operor instance: an EventEmitter that also
 * exposes a `providers` map holding a single provider registered as 'mock'.
 *
 * Each call to the provider's `simulateIncomingMessage` schedules, 5 ms later,
 * one 'message:processed' event with a fixed cost of 0.001. Replies are
 * consumed in order; the counter advances when the timer fires, not when the
 * message is submitted.
 *
 * @param responses Agent reply texts, one per simulated message. A missing or
 *   empty entry falls back to 'Default response'.
 * @param toolCalls Optional tool-call lists aligned by index with `responses`;
 *   a missing entry falls back to an empty list.
 * @returns The emitter itself, augmented with the `providers` map.
 */
function createMockOperor(responses: string[], toolCalls?: any[][]) {
  const bus = new EventEmitter();
  let nextReply = 0;

  // Emits the next canned reply and advances the reply counter.
  const emitNextReply = () => {
    const idx = nextReply;
    nextReply += 1;
    bus.emit('message:processed', {
      response: {
        text: responses[idx] || 'Default response',
        toolCalls: toolCalls?.[idx] || [],
      },
      cost: 0.001,
    });
  };

  const providers = new Map();
  providers.set('mock', {
    name: 'mock',
    simulateIncomingMessage: (_from: string, _text: string) => {
      // Defer the reply to simulate asynchronous agent processing.
      setTimeout(emitNextReply, 5);
    },
  });

  return Object.assign(bus, { providers });
}
|
|
35
|
+
|
|
36
|
+
describe('ConversationRunner', () => {
|
|
37
|
+
// Scenario reused by most tests in this suite: two scripted customer messages,
// a cap of three turns, and one criterion requiring the word "order" in a response.
const scriptedScenario: ConversationScenario = {
  id: 'order-tracking',
  name: 'Order Tracking',
  description: 'Customer asks about a delayed order',
  persona: 'polite',
  maxTurns: 3,
  scriptedResponses: ['Where is my order #1234?', 'When will it arrive?'],
  successCriteria: [{ type: 'response_contains', value: 'order' }],
};
|
|
51
|
+
|
|
52
|
+
// Full round trip: two scripted customer messages, each answered by the mock
// agent, must produce four alternating turns plus non-zero duration and cost.
it('runs a scripted scenario end-to-end', async () => {
  const agentReplies = [
    'Let me look up order #1234 for you. It is currently in transit.',
    'It should arrive by Friday.',
  ];
  const runner = new ConversationRunner({
    agentOS: createMockOperor(agentReplies) as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const outcome = await runner.runScenario(scriptedScenario);

  expect(outcome.scenario.id).toBe('order-tracking');
  expect(outcome.turns).toHaveLength(4); // 2 customer + 2 agent

  const [customerFirst, agentFirst, customerSecond, agentSecond] = outcome.turns;
  expect(customerFirst.role).toBe('customer');
  expect(customerFirst.message).toBe('Where is my order #1234?');
  expect(agentFirst.role).toBe('agent');
  expect(agentFirst.message).toContain('order #1234');
  expect(customerSecond.role).toBe('customer');
  expect(customerSecond.message).toBe('When will it arrive?');
  expect(agentSecond.role).toBe('agent');

  expect(outcome.duration).toBeGreaterThan(0);
  expect(outcome.cost).toBeGreaterThan(0);
});
|
|
78
|
+
|
|
79
|
+
// The first mock reply contains "order", which satisfies the scenario's
// `response_contains` criterion, so the run should be reported as passed.
it('evaluates success criteria from scenario', async () => {
  const runner = new ConversationRunner({
    agentOS: createMockOperor(['Your order is on its way!', 'It arrives Friday.']) as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const { passed, evaluation } = await runner.runScenario(scriptedScenario);

  expect(passed).toBe(true);
  expect(evaluation.overall).toBe('pass');
});
|
|
95
|
+
|
|
96
|
+
// The scenario requires a `get_order` tool call, but the mock agent is created
// without any tool calls, so the run must not pass.
it('fails when success criteria not met', async () => {
  const toolRequiredScenario: ConversationScenario = {
    ...scriptedScenario,
    successCriteria: [{ type: 'tool_called', value: 'get_order' }],
  };

  const runner = new ConversationRunner({
    agentOS: createMockOperor(['I can help with that.', 'Let me check.']) as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const { passed } = await runner.runScenario(toolRequiredScenario);

  expect(passed).toBe(false);
});
|
|
118
|
+
|
|
119
|
+
// Each mock reply carries its own tool call; the runner must attach them to
// the matching agent turns (indices 1 and 3 of the transcript).
it('tracks tool calls across turns', async () => {
  const replyTexts = ['Found your order.', 'Discount applied.'];
  const replyToolCalls = [
    [{ name: 'get_order', params: { id: '1234' }, result: { status: 'delayed' } }],
    [{ name: 'create_discount', params: { amount: 10 }, result: { code: 'SORRY10' } }],
  ];

  const runner = new ConversationRunner({
    agentOS: createMockOperor(replyTexts, replyToolCalls) as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const { turns } = await runner.runScenario(scriptedScenario);

  const firstReply = turns[1];
  expect(firstReply.toolCalls).toHaveLength(1);
  expect(firstReply.toolCalls![0].name).toBe('get_order');

  const secondReply = turns[3];
  expect(secondReply.toolCalls).toHaveLength(1);
  expect(secondReply.toolCalls![0].name).toBe('create_discount');
});
|
|
140
|
+
|
|
141
|
+
// Three scripted messages are available, but `maxTurns: 1` must stop the
// conversation after the first exchange.
it('respects maxTurns limit', async () => {
  const singleExchangeScenario: ConversationScenario = {
    id: 'limited',
    name: 'Limited Turns',
    description: 'Test max turns',
    persona: 'verbose',
    maxTurns: 1,
    scriptedResponses: ['Hello', 'More', 'Even more'],
  };

  const runner = new ConversationRunner({
    agentOS: createMockOperor(['Hi there!']) as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const { turns } = await runner.runScenario(singleExchangeScenario);

  // maxTurns=1 means only 1 exchange (1 customer + 1 agent = 2 turns)
  expect(turns).toHaveLength(2);
});
|
|
163
|
+
|
|
164
|
+
// The provider accepts messages but never emits a reply; with a 50 ms timeout
// the run must end as a failure whose feedback mentions the timeout.
it('handles agent timeout gracefully', async () => {
  const silentProvider = {
    name: 'mock',
    simulateIncomingMessage: () => {
      // Never emits response — simulates timeout
    },
  };
  const providers = new Map();
  providers.set('mock', silentProvider);
  const agentOS = Object.assign(new EventEmitter(), { providers });

  const runner = new ConversationRunner({
    agentOS: agentOS as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
    timeout: 50, // Very short timeout
  });

  const { passed, evaluation } = await runner.runScenario(scriptedScenario);

  expect(passed).toBe(false);
  expect(evaluation.feedback).toContain('timed out');
});
|
|
187
|
+
|
|
188
|
+
// Runs two scenarios through `runScenarios` and checks that one result per
// scenario comes back in input order. A single emitter/provider pair is shared
// by both scenarios; replies are numbered by a running counter.
it('runs multiple scenarios sequentially', async () => {
  const greetingScenario: ConversationScenario = {
    id: 'greeting',
    name: 'Simple Greeting',
    description: 'Customer says hello',
    persona: 'terse',
    maxTurns: 1,
    scriptedResponses: ['Hi'],
    successCriteria: [{ type: 'turns_under', value: 10 }],
  };

  let repliesSent = 0;
  const bus = new EventEmitter();
  const countingProvider = {
    name: 'mock',
    simulateIncomingMessage: () => {
      setTimeout(() => {
        repliesSent += 1;
        bus.emit('message:processed', {
          response: { text: `Response ${repliesSent}`, toolCalls: [] },
          cost: 0,
        });
      }, 5);
    },
  };
  const providers = new Map();
  providers.set('mock', countingProvider);
  const agentOS = Object.assign(bus, { providers });

  const runner = new ConversationRunner({
    agentOS: agentOS as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const results = await runner.runScenarios([scriptedScenario, greetingScenario]);

  expect(results).toHaveLength(2);
  const [first, second] = results;
  expect(first.scenario.id).toBe('order-tracking');
  expect(second.scenario.id).toBe('greeting');
});
|
|
229
|
+
|
|
230
|
+
// Instead of a reply, the provider makes the emitter fire an 'error' event
// 5 ms after each message; the run must fail and report the error text.
it('handles error events from Operor', async () => {
  const bus = new EventEmitter();
  const failingProvider = {
    name: 'mock',
    simulateIncomingMessage: () => {
      setTimeout(() => {
        bus.emit('error', { error: new Error('Agent processing failed') });
      }, 5);
    },
  };
  const providers = new Map();
  providers.set('mock', failingProvider);
  const agentOS = Object.assign(bus, { providers });

  const runner = new ConversationRunner({
    agentOS: agentOS as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const { passed, evaluation } = await runner.runScenario(scriptedScenario);

  expect(passed).toBe(false);
  expect(evaluation.feedback).toContain('Agent processing failed');
});
|
|
254
|
+
|
|
255
|
+
// The providers map is empty, so no 'mock' provider can be looked up.
// As asserted below, the problem surfaces as a failed result whose feedback
// names the missing provider, not as a rejected promise.
it('throws when MockProvider not found', async () => {
  const providers = new Map(); // Empty — no mock provider
  const agentOS = Object.assign(new EventEmitter(), { providers });

  const runner = new ConversationRunner({
    agentOS: agentOS as any,
    customerSimulator: new CustomerSimulator(),
    conversationEvaluator: new ConversationEvaluator(),
  });

  const { passed, evaluation } = await runner.runScenario(scriptedScenario);

  expect(passed).toBe(false);
  expect(evaluation.feedback).toContain('MockProvider not found');
});
|
|
270
|
+
});
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import { describe, it, expect } from 'vitest';
|
|
2
|
+
import { CustomerSimulator } from '../CustomerSimulator.js';
|
|
3
|
+
import type { ConversationTurn } from '../types.js';
|
|
4
|
+
|
|
5
|
+
describe('CustomerSimulator', () => {
|
|
6
|
+
describe('script mode', () => {
|
|
7
|
+
// Simulator shared by the script-mode tests; constructed without options.
const sim = new CustomerSimulator();

// Turn 0 of a three-line script yields the first line and signals that more follow.
it('returns first scripted response on turn 0', async () => {
  const script = ['Hi, where is my order?', 'Order #1234', 'Thanks!'];

  const { message, shouldContinue } = await sim.generateMessage('polite', [], {
    scriptedResponses: script,
    currentTurn: 0,
  });

  expect(message).toBe('Hi, where is my order?');
  expect(shouldContinue).toBe(true);
});
|
|
17
|
+
|
|
18
|
+
// An explicit `currentTurn` of 1 selects the second scripted line; one more
// line remains, so the conversation should continue.
it('returns subsequent responses based on currentTurn', async () => {
  const priorTurns: ConversationTurn[] = [
    { role: 'customer', message: 'Hi, where is my order?' },
    { role: 'agent', message: 'Can you give me your order number?' },
  ];
  const script = ['Hi, where is my order?', 'Order #1234', 'Thanks!'];

  const { message, shouldContinue } = await sim.generateMessage('polite', priorTurns, {
    scriptedResponses: script,
    currentTurn: 1,
  });

  expect(message).toBe('Order #1234');
  expect(shouldContinue).toBe(true);
});
|
|
30
|
+
|
|
31
|
+
// Turn 2 is the final entry of the three-line script, so the simulator should
// return it and signal that the conversation is over.
it('sets shouldContinue=false on last scripted response', async () => {
  const script = ['Hi, where is my order?', 'Order #1234', 'Thanks!'];

  const { message, shouldContinue } = await sim.generateMessage('polite', [], {
    scriptedResponses: script,
    currentTurn: 2,
  });

  expect(message).toBe('Thanks!');
  expect(shouldContinue).toBe(false);
});
|
|
39
|
+
|
|
40
|
+
// A turn index beyond the end of the script (5 for a two-line script) is
// expected to fall back to the last line and stop the conversation.
it('returns last response with shouldContinue=false when turns exceed scripts', async () => {
  const { message, shouldContinue } = await sim.generateMessage('polite', [], {
    scriptedResponses: ['Hello', 'Goodbye'],
    currentTurn: 5,
  });

  expect(message).toBe('Goodbye');
  expect(shouldContinue).toBe(false);
});
|
|
48
|
+
|
|
49
|
+
// No `currentTurn` is passed; the expected result implies the simulator works
// out the turn index from the supplied history.
it('infers turn from history when currentTurn not provided', async () => {
  const priorTurns: ConversationTurn[] = [
    { role: 'customer', message: 'Hi' },
    { role: 'agent', message: 'Hello!' },
  ];

  const { message, shouldContinue } = await sim.generateMessage('polite', priorTurns, {
    scriptedResponses: ['Hi', 'My order is late', 'Thanks'],
  });

  // 1 customer message in history → turn 1
  expect(message).toBe('My order is late');
  expect(shouldContinue).toBe(true);
});
|
|
61
|
+
|
|
62
|
+
// A one-line script: its only entry is both the first and the last response.
it('works with a single scripted response', async () => {
  const { message, shouldContinue } = await sim.generateMessage('terse', [], {
    scriptedResponses: ['Just checking'],
    currentTurn: 0,
  });

  expect(message).toBe('Just checking');
  expect(shouldContinue).toBe(false);
});
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
describe('LLM mode', () => {
|
|
73
|
+
// With neither an LLM provider nor a script there is no way to produce a
// message, so the returned promise must reject with a descriptive error.
it('throws when no LLM provider and no scripted responses', async () => {
  const bareSimulator = new CustomerSimulator();

  const attempt = bareSimulator.generateMessage('polite', []);

  await expect(attempt).rejects.toThrow('CustomerSimulator requires an LLM provider');
});
|
|
79
|
+
|
|
80
|
+
// The stub LLM inspects the request (system prompt first, temperature 0.7) and
// returns a JSON payload that the simulator must parse into message/shouldContinue.
it('calls LLM with correct messages and parses JSON response', async () => {
  const stubLLM = {
    async complete(messages: any[], options: any) {
      const [systemPrompt] = messages;
      // Verify system prompt is first
      expect(systemPrompt.role).toBe('system');
      expect(systemPrompt.content).toContain('simulating a customer');
      // Verify temperature
      expect(options.temperature).toBe(0.7);
      return {
        text: '{"message": "I need help with my order", "shouldContinue": true}',
      };
    },
  };

  const llmSimulator = new CustomerSimulator({ llmProvider: stubLLM as any });
  const { message, shouldContinue } = await llmSimulator.generateMessage('frustrated', [
    { role: 'agent', message: 'How can I help you today?' },
  ]);

  expect(message).toBe('I need help with my order');
  expect(shouldContinue).toBe(true);
});
|
|
101
|
+
|
|
102
|
+
// A plain-text LLM reply (not JSON) is expected to be used verbatim as the
// customer message, with the conversation continuing.
it('handles non-JSON LLM response gracefully', async () => {
  const stubLLM = {
    async complete() {
      return { text: 'I just want to know where my package is!' };
    },
  };

  const llmSimulator = new CustomerSimulator({ llmProvider: stubLLM as any });
  const { message, shouldContinue } = await llmSimulator.generateMessage('frustrated', []);

  expect(message).toBe('I just want to know where my package is!');
  expect(shouldContinue).toBe(true);
});
|
|
114
|
+
|
|
115
|
+
// Captures the messages handed to the LLM and checks that the system prompt
// mentions the scenario text, the turn cap and the current turn.
it('includes scenario context in system prompt', async () => {
  let sentToLLM: any[] = [];
  const recordingLLM = {
    async complete(messages: any[]) {
      sentToLLM = messages;
      return { text: '{"message": "test", "shouldContinue": false}' };
    },
  };

  const llmSimulator = new CustomerSimulator({ llmProvider: recordingLLM as any });
  await llmSimulator.generateMessage('polite', [], {
    scenario: 'Customer ordered headphones 5 days ago',
    maxTurns: 5,
    currentTurn: 2,
  });

  const systemPrompt = sentToLLM[0].content;
  expect(systemPrompt).toContain('Customer ordered headphones 5 days ago');
  expect(systemPrompt).toContain('maximum of 5 turns');
  expect(systemPrompt).toContain('turn 2');
});
|
|
136
|
+
|
|
137
|
+
// Customer turns must be forwarded to the LLM with role 'user' and agent turns
// with role 'assistant', in order, after the leading system message.
it('converts history turns to LLM messages', async () => {
  let sentToLLM: any[] = [];
  const recordingLLM = {
    async complete(messages: any[]) {
      sentToLLM = messages;
      return { text: '{"message": "ok", "shouldContinue": false}' };
    },
  };

  const llmSimulator = new CustomerSimulator({ llmProvider: recordingLLM as any });
  await llmSimulator.generateMessage('polite', [
    { role: 'customer', message: 'Hi there' },
    { role: 'agent', message: 'Hello! How can I help?' },
    { role: 'customer', message: 'I have a question' },
  ]);

  // system + 3 history messages
  expect(sentToLLM).toHaveLength(4);

  const history = sentToLLM.slice(1);
  expect(history[0]).toEqual({ role: 'user', content: 'Hi there' });
  expect(history[1]).toEqual({ role: 'assistant', content: 'Hello! How can I help?' });
  expect(history[2]).toEqual({ role: 'user', content: 'I have a question' });
});
|
|
159
|
+
});
|
|
160
|
+
});
|