@operor/testing 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/API_VALIDATION.md +572 -0
- package/dist/index.d.ts +414 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +1608 -0
- package/dist/index.js.map +1 -0
- package/fixtures/sample-tests.csv +10 -0
- package/package.json +31 -0
- package/src/CSVLoader.ts +83 -0
- package/src/ConversationEvaluator.ts +254 -0
- package/src/ConversationRunner.ts +267 -0
- package/src/CustomerSimulator.ts +106 -0
- package/src/MockShopifySkill.ts +336 -0
- package/src/SimulationRunner.ts +425 -0
- package/src/SkillTestHarness.ts +220 -0
- package/src/TestCaseEvaluator.ts +296 -0
- package/src/TestSuiteRunner.ts +151 -0
- package/src/__tests__/CSVLoader.test.ts +122 -0
- package/src/__tests__/ConversationEvaluator.test.ts +221 -0
- package/src/__tests__/ConversationRunner.test.ts +270 -0
- package/src/__tests__/CustomerSimulator.test.ts +160 -0
- package/src/__tests__/SimulationRunner.test.ts +281 -0
- package/src/__tests__/SkillTestHarness.test.ts +181 -0
- package/src/__tests__/scenarios.test.ts +71 -0
- package/src/index.ts +32 -0
- package/src/scenarios/edge-cases.ts +52 -0
- package/src/scenarios/general.ts +37 -0
- package/src/scenarios/index.ts +32 -0
- package/src/scenarios/order-tracking.ts +56 -0
- package/src/scenarios.ts +142 -0
- package/src/types.ts +133 -0
- package/src/utils.ts +6 -0
- package/tsconfig.json +9 -0
- package/tsdown.config.ts +10 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
import type { Operor } from '@operor/core';
|
|
2
|
+
import type { LLMProvider } from '@operor/llm';
|
|
3
|
+
import type {
|
|
4
|
+
SimulationConfig,
|
|
5
|
+
SimulationReport,
|
|
6
|
+
TestSuiteResult,
|
|
7
|
+
ConversationTestResult,
|
|
8
|
+
ConversationScenario,
|
|
9
|
+
} from './types.js';
|
|
10
|
+
import { TestSuiteRunner } from './TestSuiteRunner.js';
|
|
11
|
+
import { ConversationRunner } from './ConversationRunner.js';
|
|
12
|
+
import { CustomerSimulator } from './CustomerSimulator.js';
|
|
13
|
+
import { ConversationEvaluator } from './ConversationEvaluator.js';
|
|
14
|
+
import { CSVLoader } from './CSVLoader.js';
|
|
15
|
+
import { ECOMMERCE_SCENARIOS } from './scenarios.js';
|
|
16
|
+
|
|
17
|
+
/**
 * Constructor options for {@link SimulationRunner}.
 */
export interface SimulationRunnerOptions {
  /** Agent runtime under test; handed to the test-suite and conversation runners. */
  agentOS: Operor;

  /** Simulation settings (test suite files, scenarios, timeouts, pacing). */
  config: SimulationConfig;

  /**
   * Optional LLM provider. It is forwarded to the test-suite runner, the
   * customer simulator and the conversation evaluator, and is used for failure
   * analysis; without it, failure analysis falls back to heuristics.
   */
  llm?: LLMProvider;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Runs a complete simulation against an agent and aggregates the outcome.
 *
 * A run consists of two optional phases: file-based test suites
 * (`config.testSuiteFiles`) and multi-turn conversation scenarios
 * (`config.conversationScenarios`). The results of both phases are combined
 * into a single {@link SimulationReport}.
 */
export class SimulationRunner {
  private agentOS: Operor;
  private config: SimulationConfig;
  private llm?: LLMProvider;

  constructor(options: SimulationRunnerOptions) {
    this.agentOS = options.agentOS;
    this.config = options.config;
    this.llm = options.llm;
  }

  /**
   * Executes all configured test suites and conversation scenarios.
   *
   * @param onProgress Optional callback invoked after every conversation with
   *   the number of completed conversations, the scheduled total, and the
   *   result of the conversation that just finished.
   * @returns The aggregated report. `overallPassed` is true only when at least
   *   one item ran and nothing failed.
   */
  async run(
    onProgress?: (completed: number, total: number, result: ConversationTestResult) => void
  ): Promise<SimulationReport> {
    const startTime = Date.now();
    const testSuiteResults: TestSuiteResult[] = [];
    const conversationResults: ConversationTestResult[] = [];
    let totalCost = 0;

    // 1. Run test suites from the configured files
    if (this.config.testSuiteFiles?.length) {
      const suiteRunner = new TestSuiteRunner({
        agentOS: this.agentOS,
        llm: this.llm,
        timeout: this.config.timeout,
      });

      for (const file of this.config.testSuiteFiles) {
        const testCases = await CSVLoader.fromFile(file);
        const result = await suiteRunner.runSuite(testCases);
        testSuiteResults.push(result);
        totalCost += result.totalCost;
      }
    }

    // 2. Run conversation scenarios
    const scenarios = this.resolveScenarios();
    if (scenarios.length) {
      const conversationRunner = new ConversationRunner({
        agentOS: this.agentOS,
        customerSimulator: new CustomerSimulator({ llmProvider: this.llm }),
        conversationEvaluator: new ConversationEvaluator({ llmProvider: this.llm }),
        timeout: this.config.timeout,
      });

      // Distribute totalConversations across scenarios round-robin
      const schedule = this.buildSchedule(scenarios);
      const pauseMs = this.config.pauseBetweenMs ?? 500;
      // `||` (not `??`) on purpose: a timeout of 0 also falls back to 60s.
      const timeoutMs = this.config.timeout || 60000;

      for (let i = 0; i < schedule.length; i++) {
        const result = await this.runScenarioWithTimeout(conversationRunner, schedule[i], timeoutMs);

        conversationResults.push(result);
        totalCost += result.cost;

        if (onProgress) {
          onProgress(i + 1, schedule.length, result);
        }

        // Pause between conversations (skip after last)
        if (i < schedule.length - 1 && pauseMs > 0) {
          await new Promise((resolve) => setTimeout(resolve, pauseMs));
        }
      }
    }

    const duration = Date.now() - startTime;

    // 3. Aggregate results
    const totalTests = testSuiteResults.reduce((sum, r) => sum + r.total, 0);
    const passedTests = testSuiteResults.reduce((sum, r) => sum + r.passed, 0);
    const failedTests = testSuiteResults.reduce((sum, r) => sum + r.failed, 0);
    const totalConversations = conversationResults.length;
    const passedConversations = conversationResults.filter((r) => r.passed).length;
    const failedConversations = totalConversations - passedConversations;

    const totalItems = totalTests + totalConversations;
    const passedItems = passedTests + passedConversations;
    const overallPassRate = totalItems > 0 ? passedItems / totalItems : 0;

    const averageScores = this.computeAverageScores(conversationResults);
    const scenarioBreakdown = this.computeScenarioBreakdown(conversationResults);
    const toolUsageStats = this.computeToolUsageStats(conversationResults);

    // 4. Failure analysis (LLM-backed when a provider is available)
    const failedResults = conversationResults.filter((r) => !r.passed);
    let commonFailurePatterns: string[] = [];
    let recommendations: string[] = [];

    if (failedResults.length > 0) {
      if (this.llm) {
        const analysis = await this.analyzeFailuresWithLLM(failedResults);
        commonFailurePatterns = analysis.patterns;
        recommendations = analysis.recommendations;
      } else {
        commonFailurePatterns = this.heuristicFailurePatterns(failedResults);
        recommendations = this.heuristicRecommendations(failedResults);
      }
    }

    return {
      timestamp: new Date(),
      duration,
      totalConversations,
      passed: passedConversations,
      failed: failedConversations,
      averageScores,
      scenarioBreakdown,
      toolUsageStats,
      commonFailurePatterns,
      recommendations,
      testSuiteResults,
      conversationResults,
      overallPassed: failedTests === 0 && failedConversations === 0 && totalItems > 0,
      totalCost,
      summary: {
        totalTests,
        passedTests,
        failedTests,
        totalConversations,
        passedConversations,
        failedConversations,
        overallPassRate,
      },
    };
  }

  /**
   * Renders a report as plain text, one section per block of results.
   * Sections with no data are omitted; the summary is always printed.
   *
   * @param report Report produced by {@link SimulationRunner.run}.
   * @returns The report as newline-joined text.
   */
  static formatReport(report: SimulationReport): string {
    const lines: string[] = [];

    lines.push('=== Simulation Report ===');
    lines.push(`Date: ${report.timestamp.toISOString()}`);
    lines.push(`Duration: ${(report.duration / 1000).toFixed(1)}s`);
    lines.push(`Cost: $${report.totalCost.toFixed(4)}`);
    lines.push('');

    // Test suite results
    if (report.testSuiteResults.length) {
      lines.push('--- Test Suites ---');
      for (const suite of report.testSuiteResults) {
        lines.push(` ${suite.passed}/${suite.total} passed (avg score: ${suite.averageScore.toFixed(2)})`);
        for (const result of suite.results) {
          const status = result.evaluation.passed ? 'PASS' : 'FAIL';
          lines.push(` [${status}] ${result.testCase.id}: ${result.testCase.question}`);
        }
      }
      lines.push('');
    }

    // Scenario breakdown
    if (report.scenarioBreakdown.length) {
      lines.push('--- Scenario Breakdown ---');
      for (const s of report.scenarioBreakdown) {
        const pct = (s.passRate * 100).toFixed(0);
        lines.push(` ${s.scenario}: ${s.runs} run(s), ${pct}% pass rate, avg score ${s.avgScore.toFixed(2)}`);
      }
      lines.push('');
    }

    // Tool usage, most frequently called first
    const toolEntries = Object.entries(report.toolUsageStats);
    if (toolEntries.length) {
      lines.push('--- Tool Usage ---');
      for (const [tool, count] of toolEntries.sort((a, b) => b[1] - a[1])) {
        lines.push(` ${tool}: ${count} call(s)`);
      }
      lines.push('');
    }

    // Average scores
    const { averageScores } = report;
    if (report.totalConversations > 0) {
      lines.push('--- Average Scores ---');
      lines.push(` Accuracy: ${averageScores.accuracy.toFixed(2)}`);
      lines.push(` Tool Usage: ${averageScores.toolUsage.toFixed(2)}`);
      lines.push(` Tone: ${averageScores.tone.toFixed(2)}`);
      lines.push(` Resolution: ${averageScores.resolution.toFixed(2)}`);
      lines.push('');
    }

    // Failure patterns
    if (report.commonFailurePatterns.length) {
      lines.push('--- Common Failure Patterns ---');
      for (const pattern of report.commonFailurePatterns) {
        lines.push(` - ${pattern}`);
      }
      lines.push('');
    }

    // Recommendations
    if (report.recommendations.length) {
      lines.push('--- Recommendations ---');
      for (const rec of report.recommendations) {
        lines.push(` - ${rec}`);
      }
      lines.push('');
    }

    // Summary
    const { summary } = report;
    lines.push('--- Summary ---');
    if (summary.totalTests > 0) {
      lines.push(`Tests: ${summary.passedTests}/${summary.totalTests} passed`);
    }
    lines.push(`Conversations: ${summary.passedConversations}/${summary.totalConversations} passed`);
    lines.push(`Overall pass rate: ${(summary.overallPassRate * 100).toFixed(1)}%`);
    lines.push(`Result: ${report.overallPassed ? 'PASSED' : 'FAILED'}`);

    return lines.join('\n');
  }

  /**
   * Runs one scenario, racing it against a timeout.
   *
   * Any rejection (timeout or runner error) is converted into a failed
   * {@link ConversationTestResult} so that a single bad conversation does not
   * abort the whole simulation. The timeout timer is always cleared, so a
   * finished conversation does not leave a pending timer behind.
   */
  private async runScenarioWithTimeout(
    conversationRunner: ConversationRunner,
    scenario: ConversationScenario,
    timeoutMs: number
  ): Promise<ConversationTestResult> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    try {
      return await Promise.race([
        conversationRunner.runScenario(scenario),
        new Promise<ConversationTestResult>((_, reject) => {
          timer = setTimeout(
            () => reject(new Error(`Conversation timed out after ${timeoutMs}ms`)),
            timeoutMs
          );
        }),
      ]);
    } catch (error: unknown) {
      // If timeout or error, return a failed result
      return {
        scenario,
        passed: false,
        turns: [],
        evaluation: {
          overall: 'fail',
          scores: { accuracy: 1, toolUsage: 1, tone: 1, resolution: 1 },
          feedback: `Timeout or error: ${error instanceof Error ? error.message : String(error)}`,
        },
        duration: timeoutMs,
        cost: 0,
      };
    } finally {
      clearTimeout(timer);
    }
  }

  /** Resolves `config.conversationScenarios` to a concrete scenario list. */
  private resolveScenarios(): ConversationScenario[] {
    if (!this.config.conversationScenarios) return [];
    if (this.config.conversationScenarios === 'builtin') return ECOMMERCE_SCENARIOS;
    return this.config.conversationScenarios;
  }

  /**
   * Builds the ordered list of conversations to run by cycling through the
   * scenarios round-robin until `config.totalConversations` entries exist
   * (defaults to one run per scenario).
   */
  private buildSchedule(scenarios: ConversationScenario[]): ConversationScenario[] {
    // Nothing to cycle through: never emit `undefined` schedule entries.
    if (scenarios.length === 0) return [];

    const total = this.config.totalConversations ?? scenarios.length;
    const schedule: ConversationScenario[] = [];
    for (let i = 0; i < total; i++) {
      schedule.push(scenarios[i % scenarios.length]);
    }
    return schedule;
  }

  /** Arithmetic mean of each score dimension; all zeros when there are no results. */
  private computeAverageScores(
    results: ConversationTestResult[]
  ): { accuracy: number; toolUsage: number; tone: number; resolution: number } {
    if (results.length === 0) {
      return { accuracy: 0, toolUsage: 0, tone: 0, resolution: 0 };
    }
    const totals = results.reduce(
      (acc, r) => ({
        accuracy: acc.accuracy + r.evaluation.scores.accuracy,
        toolUsage: acc.toolUsage + r.evaluation.scores.toolUsage,
        tone: acc.tone + r.evaluation.scores.tone,
        resolution: acc.resolution + r.evaluation.scores.resolution,
      }),
      { accuracy: 0, toolUsage: 0, tone: 0, resolution: 0 }
    );
    const n = results.length;
    return {
      accuracy: totals.accuracy / n,
      toolUsage: totals.toolUsage / n,
      tone: totals.tone / n,
      resolution: totals.resolution / n,
    };
  }

  /**
   * Groups results by scenario name and reports, per scenario, the number of
   * runs, the pass rate, and the mean of the four score dimensions.
   */
  private computeScenarioBreakdown(
    results: ConversationTestResult[]
  ): Array<{ scenario: string; runs: number; passRate: number; avgScore: number }> {
    const byScenario = new Map<string, ConversationTestResult[]>();
    for (const r of results) {
      const name = r.scenario.name;
      const bucket = byScenario.get(name);
      if (bucket) {
        bucket.push(r);
      } else {
        byScenario.set(name, [r]);
      }
    }
    return Array.from(byScenario.entries()).map(([scenario, runs]) => {
      const passed = runs.filter((r) => r.passed).length;
      const scores = runs.map((r) => {
        const s = r.evaluation.scores;
        return (s.accuracy + s.toolUsage + s.tone + s.resolution) / 4;
      });
      const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;
      return { scenario, runs: runs.length, passRate: passed / runs.length, avgScore };
    });
  }

  /**
   * Counts how many times each tool was called across all turns.
   *
   * Counting happens in a Map so that tool names that collide with
   * `Object.prototype` members (e.g. `constructor`) are counted correctly.
   */
  private computeToolUsageStats(results: ConversationTestResult[]): Record<string, number> {
    const counts = new Map<string, number>();
    for (const r of results) {
      for (const turn of r.turns) {
        for (const tc of turn.toolCalls ?? []) {
          counts.set(tc.name, (counts.get(tc.name) ?? 0) + 1);
        }
      }
    }
    return Object.fromEntries(counts);
  }

  /**
   * Asks the LLM to summarise failure patterns and recommendations for up to
   * the first 10 failed conversations. Falls back to the heuristic analysis
   * when the call fails or the response is not parseable JSON.
   */
  private async analyzeFailuresWithLLM(
    failedResults: ConversationTestResult[]
  ): Promise<{ patterns: string[]; recommendations: string[] }> {
    if (!this.llm) {
      return { patterns: [], recommendations: [] };
    }

    const summaries = failedResults.slice(0, 10).map((r) => {
      const turns = r.turns.map((t) => `[${t.role}]: ${t.message}`).join('\n');
      return `Scenario: ${r.scenario.name}\nFeedback: ${r.evaluation.feedback}\nConversation:\n${turns}`;
    });

    const prompt = [
      'Analyze these failed customer support conversation tests and identify patterns.',
      '',
      summaries.join('\n\n---\n\n'),
      '',
      'Respond with ONLY valid JSON (no markdown, no code fences):',
      '{',
      '"patterns": ["pattern 1", "pattern 2"],',
      '"recommendations": ["recommendation 1", "recommendation 2"]',
      '}',
    ].join('\n');

    try {
      const result = await this.llm.complete(
        [{ role: 'user', content: prompt }],
        { temperature: 0, maxTokens: 1000 }
      );
      // Models sometimes wrap the JSON in code fences despite the instruction.
      const cleaned = result.text.replace(/```(?:json)?\s*/g, '').replace(/```/g, '').trim();
      const parsed = JSON.parse(cleaned);
      return {
        patterns: Array.isArray(parsed.patterns) ? parsed.patterns.map(String) : [],
        recommendations: Array.isArray(parsed.recommendations) ? parsed.recommendations.map(String) : [],
      };
    } catch {
      // Best effort: any LLM or parse failure degrades to the heuristics.
      return {
        patterns: this.heuristicFailurePatterns(failedResults),
        recommendations: this.heuristicRecommendations(failedResults),
      };
    }
  }

  /** True when no turn of the conversation contains a tool call. */
  private static hasNoToolCalls(result: ConversationTestResult): boolean {
    return result.turns.every((t) => !t.toolCalls?.length);
  }

  /** Derives failure patterns from tool usage and low (<= 2) resolution/tone scores. */
  private heuristicFailurePatterns(failedResults: ConversationTestResult[]): string[] {
    const patterns: string[] = [];

    const noToolCalls = failedResults.filter(SimulationRunner.hasNoToolCalls);
    if (noToolCalls.length > 0) {
      patterns.push(`${noToolCalls.length} conversation(s) failed with no tool calls`);
    }

    const lowResolution = failedResults.filter((r) => r.evaluation.scores.resolution <= 2);
    if (lowResolution.length > 0) {
      patterns.push(`${lowResolution.length} conversation(s) had low resolution scores`);
    }

    const lowTone = failedResults.filter((r) => r.evaluation.scores.tone <= 2);
    if (lowTone.length > 0) {
      patterns.push(`${lowTone.length} conversation(s) had low tone scores`);
    }

    if (patterns.length === 0) {
      patterns.push(`${failedResults.length} conversation(s) failed evaluation criteria`);
    }

    return patterns;
  }

  /** Derives recommendations from missing tool usage in the failed conversations. */
  private heuristicRecommendations(failedResults: ConversationTestResult[]): string[] {
    const recs: string[] = [];

    if (failedResults.some(SimulationRunner.hasNoToolCalls)) {
      recs.push('Ensure agent is configured to use available tools for customer queries');
    }

    // Tools a scenario expected that were never called in that conversation.
    const expectedButMissing = new Set<string>();
    for (const r of failedResults) {
      for (const tool of r.scenario.expectedTools ?? []) {
        const used = r.turns.some((t) => t.toolCalls?.some((tc) => tc.name === tool));
        if (!used) expectedButMissing.add(tool);
      }
    }
    if (expectedButMissing.size > 0) {
      recs.push(`Tools expected but not called: ${Array.from(expectedButMissing).join(', ')}`);
    }

    if (recs.length === 0) {
      recs.push('Review failed scenarios and adjust agent rules or prompts');
    }

    return recs;
  }
}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
import type { Skill, Tool } from '@operor/core';
|
|
2
|
+
|
|
3
|
+
/**
 * Safety settings for {@link SkillTestHarness}. Every field is optional; the
 * harness constructor fills in the defaults noted below.
 */
export interface SkillTestHarnessConfig {
  /** Permit tools classified as `write`. Defaults to `false`. */
  allowWrites?: boolean;

  /** Permit tools classified as `destructive`. Defaults to `false`. */
  allowDestructive?: boolean;

  /** Maximum number of tool executions before further calls throw. Defaults to `10`. */
  maxOperations?: number;

  /** Per-call timeout in milliseconds. Defaults to `30000`. */
  timeoutMs?: number;

  /** When `true`, tools are not executed and a mock result is returned. Defaults to `false`. */
  dryRun?: boolean;
}
|
|
10
|
+
|
|
11
|
+
/**
 * One record in the harness audit log, describing a single tool invocation.
 */
export interface AuditLogEntry {
  /** Name of the tool that was invoked. */
  name: string;

  /** Parameters the tool was called with. */
  params: any;

  /** Value returned by the tool, the dry-run mock, or `{ error }` when the call failed. */
  result: any;

  /** `Date.now()` value captured when the invocation started. */
  timestamp: number;

  /** Elapsed time of the invocation in milliseconds. */
  duration: number;

  /** Safety class the harness assigned to the tool. */
  classification: 'read' | 'write' | 'destructive';
}
|
|
19
|
+
|
|
20
|
+
/**
 * Safety wrapper around a {@link Skill} for use during testing.
 *
 * Every tool of the wrapped skill is replaced by a guarded version that
 * enforces an operation limit, blocks write/destructive tools unless
 * explicitly allowed, supports a dry-run mode, applies a per-call timeout,
 * and records each executed call in an audit log.
 */
export class SkillTestHarness implements Skill {
  public readonly name: string;
  public tools: Record<string, Tool>;
  private inner: Skill;
  private config: Required<SkillTestHarnessConfig>;
  private auditLog: AuditLogEntry[] = [];
  private operationCount = 0;

  // Tool classification rules
  private static readonly READ_TOOLS = new Set([
    'get_order',
    'search_products',
    'salesforce_get_contact',
    'salesforce_get_cases',
    'stripe_get_customer',
  ]);

  private static readonly WRITE_TOOLS = new Set([
    'create_discount',
    'salesforce_update_contact',
    'salesforce_create_case',
    'salesforce_add_note',
  ]);

  private static readonly DESTRUCTIVE_TOOLS = new Set([
    'stripe_create_refund',
  ]);

  /**
   * @param inner Skill whose tools should be guarded.
   * @param config Safety settings; omitted fields use restrictive defaults
   *   (no writes, no destructive calls, 10 operations, 30s timeout, no dry-run).
   */
  constructor(inner: Skill, config: SkillTestHarnessConfig = {}) {
    this.inner = inner;
    this.name = inner.name;
    this.config = {
      allowWrites: config.allowWrites ?? false,
      allowDestructive: config.allowDestructive ?? false,
      maxOperations: config.maxOperations ?? 10,
      timeoutMs: config.timeoutMs ?? 30000,
      dryRun: config.dryRun ?? false,
    };

    // Wrap all tools with safety checks
    this.tools = this.wrapTools(inner.tools);
  }

  /** Delegates to the wrapped skill's `initialize()`. */
  async initialize(): Promise<void> {
    return this.inner.initialize();
  }

  /** @deprecated Use initialize() instead. */
  async authenticate(): Promise<void> {
    return this.inner.initialize();
  }

  /** Delegates to the wrapped skill's `isReady()`. */
  isReady(): boolean {
    return this.inner.isReady();
  }

  /** @deprecated Use isReady() instead. */
  isAuthenticated(): boolean {
    return this.inner.isReady();
  }

  /** Returns a copy of `innerTools` in which every `execute` is guarded. */
  private wrapTools(innerTools: Record<string, Tool>): Record<string, Tool> {
    const wrapped: Record<string, Tool> = {};

    for (const [toolName, tool] of Object.entries(innerTools)) {
      wrapped[toolName] = {
        ...tool,
        execute: async (params: any) => this.guardedExecute(toolName, tool, params),
      };
    }

    return wrapped;
  }

  /**
   * Runs one tool call through all safety checks.
   *
   * Calls rejected by a check (limit reached, write/destructive blocked) throw
   * without being counted or logged. Calls that pass the checks are counted
   * and logged, whether they succeed, fail, time out, or run in dry-run mode.
   *
   * @throws Error when a safety check blocks the call or the call times out;
   *   errors thrown by the tool itself are logged and rethrown unchanged.
   */
  private async guardedExecute(toolName: string, tool: Tool, params: any): Promise<any> {
    const startTime = Date.now();
    const classification = this.classifyTool(toolName);

    this.assertAllowed(toolName, classification);
    this.operationCount++;

    const record = (result: any): void => {
      this.auditLog.push({
        name: toolName,
        params,
        result,
        timestamp: startTime,
        duration: Date.now() - startTime,
        classification,
      });
    };

    // Dry-run mode: return mock result without executing
    if (this.config.dryRun) {
      const result = {
        dryRun: true,
        wouldExecute: toolName,
        params,
      };
      record(result);
      return result;
    }

    try {
      const result = await this.executeWithTimeout(toolName, tool, params);
      record(result);
      return result;
    } catch (error: unknown) {
      record({ error: error instanceof Error ? error.message : 'Unknown error' });
      throw error;
    }
  }

  /** Throws when the operation limit is reached or the classification is not permitted. */
  private assertAllowed(toolName: string, classification: 'read' | 'write' | 'destructive'): void {
    // Check operation limit
    if (this.operationCount >= this.config.maxOperations) {
      throw new Error(
        `SkillTestHarness: Max operations limit reached (${this.config.maxOperations})`
      );
    }

    // Check write permissions
    if (classification === 'write' && !this.config.allowWrites) {
      throw new Error(
        `SkillTestHarness: Write operation '${toolName}' blocked (allowWrites=false)`
      );
    }

    // Check destructive permissions
    if (classification === 'destructive' && !this.config.allowDestructive) {
      throw new Error(
        `SkillTestHarness: Destructive operation '${toolName}' blocked (allowDestructive=false)`
      );
    }
  }

  /**
   * Executes the tool, rejecting if it does not settle within `timeoutMs`.
   *
   * The timer is cleared in `finally` so that a finished (or synchronously
   * throwing) call neither keeps the event loop alive nor leaves a timeout
   * promise that later rejects with no handler attached.
   */
  private async executeWithTimeout(toolName: string, tool: Tool, params: any): Promise<any> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(() => {
        reject(new Error(`SkillTestHarness: Operation '${toolName}' timed out after ${this.config.timeoutMs}ms`));
      }, this.config.timeoutMs);
    });

    try {
      return await Promise.race([
        tool.execute(params),
        timeoutPromise,
      ]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** Maps a tool name to its safety class using the static name sets. */
  private classifyTool(toolName: string): 'read' | 'write' | 'destructive' {
    if (SkillTestHarness.READ_TOOLS.has(toolName)) {
      return 'read';
    }
    if (SkillTestHarness.DESTRUCTIVE_TOOLS.has(toolName)) {
      return 'destructive';
    }
    if (SkillTestHarness.WRITE_TOOLS.has(toolName)) {
      return 'write';
    }
    // Unknown tools default to 'write' (safe by default)
    return 'write';
  }

  /**
   * Get the audit log of all operations performed
   *
   * @returns A shallow copy, so callers cannot mutate the internal log array.
   */
  getAuditLog(): AuditLogEntry[] {
    return [...this.auditLog];
  }

  /**
   * Reset the audit log and operation counter
   */
  resetAuditLog(): void {
    this.auditLog = [];
    this.operationCount = 0;
  }

  /**
   * Get the current operation count
   */
  getOperationCount(): number {
    return this.operationCount;
  }
}
|