@operor/testing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/API_VALIDATION.md +572 -0
- package/dist/index.d.ts +414 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1608 -0
- package/dist/index.js.map +1 -0
- package/fixtures/sample-tests.csv +10 -0
- package/package.json +31 -0
- package/src/CSVLoader.ts +83 -0
- package/src/ConversationEvaluator.ts +254 -0
- package/src/ConversationRunner.ts +267 -0
- package/src/CustomerSimulator.ts +106 -0
- package/src/MockShopifySkill.ts +336 -0
- package/src/SimulationRunner.ts +425 -0
- package/src/SkillTestHarness.ts +220 -0
- package/src/TestCaseEvaluator.ts +296 -0
- package/src/TestSuiteRunner.ts +151 -0
- package/src/__tests__/CSVLoader.test.ts +122 -0
- package/src/__tests__/ConversationEvaluator.test.ts +221 -0
- package/src/__tests__/ConversationRunner.test.ts +270 -0
- package/src/__tests__/CustomerSimulator.test.ts +160 -0
- package/src/__tests__/SimulationRunner.test.ts +281 -0
- package/src/__tests__/SkillTestHarness.test.ts +181 -0
- package/src/__tests__/scenarios.test.ts +71 -0
- package/src/index.ts +32 -0
- package/src/scenarios/edge-cases.ts +52 -0
- package/src/scenarios/general.ts +37 -0
- package/src/scenarios/index.ts +32 -0
- package/src/scenarios/order-tracking.ts +56 -0
- package/src/scenarios.ts +142 -0
- package/src/types.ts +133 -0
- package/src/utils.ts +6 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { ConversationScenario } from '../types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Scripted order-tracking scenario: the customer reports that order #12345 is
 * two days late and asks what can be done to make up for it.
 *
 * Declares two `tool_called` success criteria (`get_order` and
 * `create_discount`) and three scripted customer messages, which equals
 * `maxTurns`.
 */
export const delayedOrderScenario: ConversationScenario = {
  id: 'order-tracking-delayed',
  name: 'Delayed Order with Compensation',
  description: 'Customer asks about a delayed order and expects compensation offer',
  persona: 'polite',
  maxTurns: 3,
  expectedTools: ['get_order', 'create_discount'],
  expectedOutcome: 'Agent looks up order, identifies delay, and offers discount code',
  successCriteria: [
    { type: 'tool_called', value: 'get_order' },
    { type: 'tool_called', value: 'create_discount' },
  ],
  // One scripted customer message per turn, in order.
  scriptedResponses: [
    'Where is my order #12345? It was supposed to arrive two days ago.',
    'That is really frustrating. Is there anything you can do to make up for this?',
    'Thank you, I appreciate the discount code.',
  ],
};
|
|
21
|
+
|
|
22
|
+
/**
 * Scripted order-tracking scenario: the customer asks for the status of
 * order #67890, which is on time or already delivered, so no compensation
 * is expected.
 *
 * Success criteria: `get_order` must be called and the conversation must
 * satisfy `turns_under: 3`.
 */
export const onTimeOrderScenario: ConversationScenario = {
  id: 'order-tracking-ontime',
  name: 'On-time Order Status Check',
  description: 'Customer asks about an order that is on time or already delivered',
  persona: 'polite',
  maxTurns: 2,
  expectedTools: ['get_order'],
  expectedOutcome: 'Agent looks up order and confirms status without offering compensation',
  successCriteria: [
    { type: 'tool_called', value: 'get_order' },
    // NOTE(review): the bound (3) is above maxTurns (2); how the evaluator
    // counts turns is not visible here — confirm this criterion can fail.
    { type: 'turns_under', value: 3 },
  ],
  scriptedResponses: [
    'Can you check the status of order #67890?',
    'Great, thanks for the update!',
  ],
};
|
|
39
|
+
|
|
40
|
+
/**
 * Scripted order-tracking scenario: the customer asks about order #99999,
 * which does not exist.
 *
 * Success criteria: `get_order` must be called and a response must contain
 * the text `not found`.
 */
export const orderNotFoundScenario: ConversationScenario = {
  id: 'order-tracking-not-found',
  name: 'Order Not Found',
  description: 'Customer asks about an order that does not exist',
  persona: 'confused',
  maxTurns: 2,
  expectedTools: ['get_order'],
  expectedOutcome: 'Agent attempts lookup and explains order was not found',
  successCriteria: [
    { type: 'tool_called', value: 'get_order' },
    // NOTE(review): matching rules (case sensitivity, which agent message is
    // inspected) live in the evaluator, not here — confirm before rewording.
    { type: 'response_contains', value: 'not found' },
  ],
  scriptedResponses: [
    'Where is order #99999?',
    'Hmm, let me double check the order number and get back to you.',
  ],
};
|
package/src/scenarios.ts
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
import type { ConversationScenario } from './types.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Ten e-commerce conversation scenarios with free-text personas and scripted
 * customer messages.
 *
 * Each entry sets `expectedTools` and `expectedOutcome`; none declares
 * `successCriteria`. In every entry the number of scripted messages is at
 * most `maxTurns`.
 */
export const ECOMMERCE_SCENARIOS: ConversationScenario[] = [
  // Late order where the customer expects compensation.
  {
    id: 'delayed-order-compensation',
    name: 'Delayed order with compensation',
    description: 'Customer asks about a delayed order and expects compensation',
    persona: 'Frustrated customer whose order #12345 was supposed to arrive 3 days ago',
    maxTurns: 6,
    expectedTools: ['get_order', 'create_discount'],
    expectedOutcome: 'Agent finds order, acknowledges delay, offers discount',
    scriptedResponses: [
      'Where is my order #12345? It was supposed to arrive 3 days ago!',
      'This is really frustrating. Can you do anything to make up for this?',
      'Okay, I appreciate the discount. Thank you.',
    ],
  },
  // Lookup of an order number that does not exist.
  {
    id: 'order-not-found',
    name: 'Order not found',
    description: 'Customer asks about an order that does not exist in the system',
    persona: 'Confused customer who may have the wrong order number',
    maxTurns: 4,
    expectedTools: ['get_order'],
    expectedOutcome: 'Agent attempts lookup, explains order not found, asks customer to verify',
    scriptedResponses: [
      'Can you check on order #99999 for me?',
      'Hmm, let me double check the number and get back to you.',
    ],
  },
  // Product search and availability question.
  {
    id: 'product-inquiry',
    name: 'Product inquiry',
    description: 'Customer searches for a product and asks about availability',
    persona: 'Curious shopper looking for electronics',
    maxTurns: 4,
    expectedTools: ['search_products'],
    expectedOutcome: 'Agent searches products and provides relevant results',
    scriptedResponses: [
      'Do you have any wireless headphones in stock?',
      'What about the price range? Anything under $200?',
      'Great, thanks for the info!',
    ],
  },
  // Return of a defective item from a delivered order.
  {
    id: 'return-request',
    name: 'Return request',
    description: 'Customer wants to return a recently delivered item',
    persona: 'Polite customer who received a defective product from order #67890',
    maxTurns: 5,
    expectedTools: ['get_order'],
    expectedOutcome: 'Agent looks up order, acknowledges issue, explains return process',
    scriptedResponses: [
      'I received order #67890 but one of the items is defective. I would like to return it.',
      'Yes, the wireless mouse stopped working after one day.',
      'Okay, how do I send it back?',
    ],
  },
  // Plain greeting; no tool calls are expected.
  {
    id: 'greeting',
    name: 'Simple greeting',
    description: 'Customer says hello and expects a friendly welcome',
    persona: 'Friendly first-time visitor',
    maxTurns: 2,
    expectedTools: [],
    expectedOutcome: 'Agent responds with a friendly greeting and offers help',
    scriptedResponses: [
      'Hello!',
    ],
  },
  // Suspected double charge on an order.
  {
    id: 'billing-dispute',
    name: 'Billing dispute',
    description: 'Customer believes they were charged incorrectly',
    persona: 'Concerned customer who noticed a double charge on order #12345',
    maxTurns: 5,
    expectedTools: ['get_order'],
    expectedOutcome: 'Agent looks up order, reviews charges, and addresses the billing concern',
    scriptedResponses: [
      'I think I was charged twice for order #12345. Can you check?',
      'My credit card shows two charges of $299.99 on the same day.',
      'Can you escalate this to someone who can issue a correction?',
    ],
  },
  // Two unrelated issues handled in one conversation.
  // NOTE(review): the third scripted message asks for a discount, but
  // 'create_discount' is not in expectedTools — confirm this is intentional.
  {
    id: 'multi-issue',
    name: 'Multi-issue conversation',
    description: 'Customer has multiple problems: a delayed order and a product question',
    persona: 'Busy customer who wants to resolve everything in one conversation',
    maxTurns: 6,
    expectedTools: ['get_order', 'search_products'],
    expectedOutcome: 'Agent handles both issues sequentially without losing context',
    scriptedResponses: [
      'Two things: first, where is my order #12345?',
      'Okay thanks. Also, do you carry mechanical keyboards?',
      'Nice, I might order one. Can you also give me a discount for the late delivery?',
      'Sounds good, thanks for handling both issues.',
    ],
  },
  // Pre-purchase questions from a prospective buyer.
  {
    id: 'lead-qualification',
    name: 'Lead qualification',
    description: 'Potential customer asking pre-purchase questions about products and shipping',
    persona: 'Prospective buyer evaluating whether to make a purchase',
    maxTurns: 4,
    expectedTools: ['search_products'],
    expectedOutcome: 'Agent answers product questions and encourages purchase',
    scriptedResponses: [
      'I am thinking about buying some electronics. What do you have?',
      'How fast is shipping usually?',
      'Do you offer any discounts for first-time buyers?',
    ],
  },
  // Repeated complaints that escalate to a refund/compensation demand.
  {
    id: 'frustrated-escalation',
    name: 'Frustrated customer escalation',
    description: 'Angry customer escalates through multiple complaints',
    persona: 'Very frustrated customer who has contacted support multiple times about order #12345',
    maxTurns: 6,
    expectedTools: ['get_order', 'create_discount'],
    expectedOutcome: 'Agent remains professional, empathizes, and offers concrete resolution',
    scriptedResponses: [
      'This is the THIRD time I am contacting you about order #12345. Still not here!',
      'I have been waiting over a week. This is completely unacceptable.',
      'I want a refund or serious compensation. A 5% coupon is insulting.',
      'Fine, that is more reasonable. But I expect the order to arrive this week.',
    ],
  },
  // Status check on an order that needs no compensation.
  {
    id: 'on-time-order-check',
    name: 'On-time order status check',
    description: 'Customer checks on an order that is on time or already delivered',
    persona: 'Polite customer just checking in on order #67890',
    maxTurns: 2,
    expectedTools: ['get_order'],
    expectedOutcome: 'Agent confirms order status, no compensation needed',
    scriptedResponses: [
      'Hi, can I get an update on order #67890?',
      'Perfect, thanks!',
    ],
  },
];
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
/**
 * A single test case: an identifier and a question, plus optional
 * expectations and labels.
 */
export interface TestCase {
  /** Identifier of the test case. */
  id: string;
  /** The question text for this case. */
  question: string;
  /** Expected answer, when the case defines one. */
  expectedAnswer?: string;
  /** Names of the tools expected for this case. */
  expectedTools?: string[];
  /** Optional persona label. */
  persona?: string;
  /** Optional tags attached to the case. */
  tags?: string[];
  /** Free-form extra data; values are untyped (`any`), so narrow before use. */
  metadata?: Record<string, any>;
}
|
|
10
|
+
|
|
11
|
+
/**
 * Result recorded for one test case: the agent's response, the tool calls
 * made, and the evaluation verdict.
 */
export interface TestCaseResult {
  /** The test case this result belongs to. */
  testCase: TestCase;
  /** Response text produced by the agent. */
  agentResponse: string;
  /** Recorded tool calls; `params` and `result` are untyped (`any`). */
  toolsCalled: Array<{ name: string; params: any; result: any }>;
  /** Verdict for this case. */
  evaluation: {
    passed: boolean;
    /** NOTE(review): the score range is not defined here — confirm against the evaluator. */
    score: number;
    /** Evaluation method that produced the verdict. */
    method: 'exact' | 'contains' | 'similarity' | 'llm_judge';
    reasoning: string;
    /** NOTE(review): presumably whether the called tools matched the expected ones — confirm. */
    toolsCorrect: boolean;
  };
  /** NOTE(review): unit is not stated here — confirm against the runner. */
  duration: number;
  /** NOTE(review): unit/currency is not stated here — confirm against the runner. */
  cost: number;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Aggregate result of a test suite: counts, average score, a per-tag
 * breakdown, and the individual case results.
 */
export interface TestSuiteResult {
  total: number;
  passed: number;
  failed: number;
  averageScore: number;
  /** Per-tag totals, keyed by tag name. */
  byTag: Record<string, { total: number; passed: number; avgScore: number }>;
  /** Individual results, one per test case. */
  results: TestCaseResult[];
  /** NOTE(review): unit is not stated here — confirm against the runner. */
  totalDuration: number;
  /** NOTE(review): unit/currency is not stated here — confirm against the runner. */
  totalCost: number;
}
|
|
36
|
+
|
|
37
|
+
// --- Multi-turn conversation testing types ---
|
|
38
|
+
|
|
39
|
+
/** A simulated customer: a name, one of five communication styles, and optional context. */
export interface CustomerPersona {
  name: string;
  /** Communication style of the customer. */
  style: 'polite' | 'frustrated' | 'confused' | 'terse' | 'verbose';
  /** Optional free-text context for the persona. */
  context?: string;
}
|
|
44
|
+
|
|
45
|
+
/**
 * One success criterion for a conversation: a criterion `type` and the
 * `value` it is checked against.
 */
export interface ConversationSuccessCriteria {
  /** Kind of check to perform. */
  type: 'tool_called' | 'response_contains' | 'intent_matched' | 'turns_under' | 'custom';
  /**
   * Operand of the check: a string, a number, or a predicate over the
   * conversation history.
   * NOTE(review): which value shape belongs to which `type` is not enforced
   * by this type — confirm against the evaluator.
   */
  value: string | number | ((history: ConversationTurn[]) => boolean);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Definition of a multi-turn conversation scenario: identity, persona, a
 * turn limit, and optional expectations and scripted customer messages.
 */
export interface ConversationScenario {
  id: string;
  name: string;
  description: string;
  /** Persona as a plain string (a short label or a free-text description). */
  persona: string;
  /** Turn limit for the conversation. */
  maxTurns: number;
  /** Names of the tools expected during the conversation. */
  expectedTools?: string[];
  /** Free-text description of the expected outcome. */
  expectedOutcome?: string;
  /** Criteria the conversation is checked against. */
  successCriteria?: ConversationSuccessCriteria[];
  /** Scripted customer messages, in order. */
  scriptedResponses?: string[];
}
|
|
61
|
+
|
|
62
|
+
/** One message in a conversation, from either the customer or the agent. */
export interface ConversationTurn {
  /** Who produced the message. */
  role: 'customer' | 'agent';
  message: string;
  /** Tool calls attached to this turn; `params` and `result` are untyped (`any`). */
  toolCalls?: Array<{ name: string; params: any; result: any }>;
}
|
|
67
|
+
|
|
68
|
+
/** A customer simulator reply: the message text and a continuation flag. */
export interface CustomerSimulatorResponse {
  message: string;
  /** NOTE(review): presumably false ends the conversation — confirm against the runner. */
  shouldContinue: boolean;
}
|
|
72
|
+
|
|
73
|
+
/** Outcome of checking a single success criterion. */
export interface CriteriaResult {
  /** The criterion that was checked. */
  criteria: ConversationSuccessCriteria;
  passed: boolean;
  /** Text accompanying the verdict. */
  details: string;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Evaluation of a whole conversation: an overall verdict, four numeric
 * scores, textual feedback, and optional per-criterion results.
 */
export interface ConversationEvaluation {
  overall: 'pass' | 'fail' | 'partial';
  /** NOTE(review): score ranges are not defined here — confirm against the evaluator. */
  scores: { accuracy: number; toolUsage: number; tone: number; resolution: number };
  feedback: string;
  /** Per-criterion outcomes, when present. */
  criteriaResults?: CriteriaResult[];
}
|
|
85
|
+
|
|
86
|
+
/**
 * Result of one conversation scenario: the scenario, the recorded turns,
 * and the evaluation.
 */
export interface ConversationTestResult {
  /** The scenario this result belongs to. */
  scenario: ConversationScenario;
  passed: boolean;
  /** Recorded conversation turns. */
  turns: ConversationTurn[];
  evaluation: ConversationEvaluation;
  /** NOTE(review): unit is not stated here — confirm against the runner. */
  duration: number;
  /** NOTE(review): unit/currency is not stated here — confirm against the runner. */
  cost: number;
}
|
|
94
|
+
|
|
95
|
+
// --- Simulation testing types ---
|
|
96
|
+
|
|
97
|
+
/** Options for a simulation run; every field is optional. */
export interface SimulationConfig {
  /** Test suite files to include. */
  testSuiteFiles?: string[];
  /**
   * Either an explicit list of scenarios or the literal `'builtin'`.
   * NOTE(review): presumably `'builtin'` selects the bundled scenarios — confirm.
   */
  conversationScenarios?: ConversationScenario[] | 'builtin';
  totalConversations?: number;
  /** Pause between conversations, in milliseconds (per the `Ms` suffix). */
  pauseBetweenMs?: number;
  evaluationStrategy?: 'exact' | 'similarity' | 'llm_judge';
  integrationMode?: 'mock' | 'real' | 'dry-run';
  /** NOTE(review): presumably gates write operations against real integrations — confirm. */
  allowWrites?: boolean;
  /** NOTE(review): unit is not stated here — confirm against the runner. */
  timeout?: number;
  parallel?: boolean;
}
|
|
108
|
+
|
|
109
|
+
/**
 * Report for a simulation run: totals, averaged scores, per-scenario and
 * per-tool statistics, the underlying results, and a summary.
 */
export interface SimulationReport {
  timestamp: Date;
  /** NOTE(review): unit is not stated here — confirm against the runner. */
  duration: number;
  totalConversations: number;
  passed: number;
  failed: number;
  /** Averages of the four conversation evaluation scores. */
  averageScores: { accuracy: number; toolUsage: number; tone: number; resolution: number };
  /** One entry per scenario. */
  scenarioBreakdown: Array<{ scenario: string; runs: number; passRate: number; avgScore: number }>;
  /** Numeric statistic per tool, keyed by tool name (presumably a call count — confirm). */
  toolUsageStats: Record<string, number>;
  commonFailurePatterns: string[];
  recommendations: string[];
  /** Underlying test suite results. */
  testSuiteResults: TestSuiteResult[];
  /** Underlying conversation results. */
  conversationResults: ConversationTestResult[];
  overallPassed: boolean;
  /** NOTE(review): unit/currency is not stated here — confirm against the runner. */
  totalCost: number;
  /** Counts for tests and conversations, plus an overall pass rate. */
  summary: {
    totalTests: number;
    passedTests: number;
    failedTests: number;
    totalConversations: number;
    passedConversations: number;
    failedConversations: number;
    /** NOTE(review): scale (0..1 vs. percent) is not stated here — confirm. */
    overallPassRate: number;
  };
}
|
package/src/utils.ts
ADDED
package/tsconfig.json
ADDED