elsabro 2.3.0 → 3.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +668 -20
- package/bin/install.js +0 -0
- package/flows/development-flow.json +452 -0
- package/flows/quick-flow.json +118 -0
- package/package.json +3 -2
- package/references/SYSTEM_INDEX.md +379 -5
- package/references/agent-marketplace.md +2274 -0
- package/references/agent-protocol.md +1126 -0
- package/references/ai-code-suggestions.md +2413 -0
- package/references/checkpointing.md +595 -0
- package/references/collaboration-patterns.md +851 -0
- package/references/collaborative-sessions.md +1081 -0
- package/references/configuration-management.md +1810 -0
- package/references/cost-tracking.md +1095 -0
- package/references/enterprise-sso.md +2001 -0
- package/references/error-contracts-v2.md +968 -0
- package/references/event-driven.md +1031 -0
- package/references/flow-orchestration.md +940 -0
- package/references/flow-visualization.md +1557 -0
- package/references/ide-integrations.md +3513 -0
- package/references/interrupt-system.md +681 -0
- package/references/kubernetes-deployment.md +3099 -0
- package/references/memory-system.md +683 -0
- package/references/mobile-companion.md +3236 -0
- package/references/multi-llm-providers.md +2494 -0
- package/references/multi-project-memory.md +1182 -0
- package/references/observability.md +793 -0
- package/references/output-schemas.md +858 -0
- package/references/performance-profiler.md +955 -0
- package/references/plugin-system.md +1526 -0
- package/references/prompt-management.md +292 -0
- package/references/sandbox-execution.md +303 -0
- package/references/security-system.md +1253 -0
- package/references/streaming.md +696 -0
- package/references/testing-framework.md +1151 -0
- package/references/time-travel.md +802 -0
- package/references/tool-registry.md +886 -0
- package/references/voice-commands.md +3296 -0
- package/templates/agent-marketplace-config.json +220 -0
- package/templates/agent-protocol-config.json +136 -0
- package/templates/ai-suggestions-config.json +100 -0
- package/templates/checkpoint-state.json +61 -0
- package/templates/collaboration-config.json +157 -0
- package/templates/collaborative-sessions-config.json +153 -0
- package/templates/configuration-config.json +245 -0
- package/templates/cost-tracking-config.json +148 -0
- package/templates/enterprise-sso-config.json +438 -0
- package/templates/events-config.json +148 -0
- package/templates/flow-visualization-config.json +196 -0
- package/templates/ide-integrations-config.json +442 -0
- package/templates/kubernetes-config.json +764 -0
- package/templates/memory-state.json +84 -0
- package/templates/mobile-companion-config.json +600 -0
- package/templates/multi-llm-config.json +544 -0
- package/templates/multi-project-memory-config.json +145 -0
- package/templates/observability-config.json +109 -0
- package/templates/performance-profiler-config.json +125 -0
- package/templates/plugin-config.json +170 -0
- package/templates/prompt-management-config.json +86 -0
- package/templates/sandbox-config.json +185 -0
- package/templates/schemas-config.json +65 -0
- package/templates/security-config.json +120 -0
- package/templates/streaming-config.json +72 -0
- package/templates/testing-config.json +81 -0
- package/templates/timetravel-config.json +62 -0
- package/templates/tool-registry-config.json +109 -0
- package/templates/voice-commands-config.json +658 -0
|
@@ -0,0 +1,1151 @@
|
|
|
1
|
+
# Testing Framework (v3.4)
|
|
2
|
+
|
|
3
|
+
Framework de testing para agentes de IA con mocking, simulación, fixtures y aserciones de comportamiento.
|
|
4
|
+
|
|
5
|
+
## Arquitectura
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
┌─────────────────────────────────────────────────────────────────────────┐
|
|
9
|
+
│ TESTING FRAMEWORK │
|
|
10
|
+
├─────────────────────────────────────────────────────────────────────────┤
|
|
11
|
+
│ │
|
|
12
|
+
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
|
13
|
+
│ │ TEST RUNNER │ │
|
|
14
|
+
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
|
15
|
+
│ │ │ Unit │ │ Integration │ │ E2E │ │ │
|
|
16
|
+
│ │ │ Tests │ │ Tests │ │ Tests │ │ │
|
|
17
|
+
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
|
18
|
+
│ └─────────────────────────────────────────────────────────────────┘ │
|
|
19
|
+
│ │ │
|
|
20
|
+
│ ▼ │
|
|
21
|
+
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
|
22
|
+
│ │ MOCK PROVIDERS │ │
|
|
23
|
+
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
|
24
|
+
│ │ │ MockLLM │ │ MockTools │ │ MockEvents │ │ │
|
|
25
|
+
│ │ │ (responses) │ │ (behavior) │ │ (triggers) │ │ │
|
|
26
|
+
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
|
27
|
+
│ └─────────────────────────────────────────────────────────────────┘ │
|
|
28
|
+
│ │ │
|
|
29
|
+
│ ▼ │
|
|
30
|
+
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
|
31
|
+
│ │ SIMULATION ENGINE │ │
|
|
32
|
+
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
|
33
|
+
│ │ │ Scenarios │ │ Personas │ │ Environment │ │ │
|
|
34
|
+
│ │ │ (workflows) │ │ (users) │ │ (context) │ │ │
|
|
35
|
+
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
|
36
|
+
│ └─────────────────────────────────────────────────────────────────┘ │
|
|
37
|
+
│ │ │
|
|
38
|
+
│ ▼ │
|
|
39
|
+
│ ┌─────────────────────────────────────────────────────────────────┐ │
|
|
40
|
+
│ │ ASSERTIONS │ │
|
|
41
|
+
│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │
|
|
42
|
+
│ │ │ Behavioral │ │ Property │ │ Semantic │ │ │
|
|
43
|
+
│ │ │ (actions) │ │(invariants) │ │ (meaning) │ │ │
|
|
44
|
+
│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │
|
|
45
|
+
│ └─────────────────────────────────────────────────────────────────┘ │
|
|
46
|
+
│ │
|
|
47
|
+
└─────────────────────────────────────────────────────────────────────────┘
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## AgentTestRunner
|
|
53
|
+
|
|
54
|
+
### API Principal
|
|
55
|
+
|
|
56
|
+
```typescript
|
|
57
|
+
/**
 * Settings common to every test definition.
 */
interface TestCase {
  /** Name copied into the resulting `TestResult`. */
  name: string;
  /** Optional free-form description. */
  description?: string;
  /** Hook awaited before mocks are configured and the agent is created. */
  setup?: () => Promise<void>;
  /** Hook awaited after the test body, whether it passed, failed or errored. */
  teardown?: () => Promise<void>;
  /** Timeout in milliseconds; the runner uses its configured default when unset. */
  timeout?: number;
  /** Attempt budget consulted by the suite runner when a test ends as `'failed'`. */
  retries?: number;
  /** Optional labels attached to the test. */
  tags?: Array<string>;
}
|
|
66
|
+
|
|
67
|
+
/**
 * A test that runs one agent against one input and evaluates assertions
 * on the outcome.
 */
interface AgentTestCase extends TestCase {
  /** Agent identifier, or an inline configuration used to build the agent. */
  agent: string | AgentConfig;
  /** Input handed to the agent's `run` method. */
  input: AgentInput;
  /** Optional description of the expected behavior. */
  expected?: ExpectedBehavior;
  /** Mock configuration applied before the agent is created. */
  mocks?: MockConfig;
  /** Assertions evaluated, in order, against the agent result. */
  assertions: Array<Assertion>;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Outcome of running a single test.
 */
interface TestResult {
  /** Name of the test that produced this result. */
  name: string;
  /**
   * `'passed'` / `'failed'` reflect the assertion outcomes; `'error'` means
   * the run threw (including timeouts).
   */
  status: 'passed' | 'failed' | 'skipped' | 'error';
  /** Wall-clock duration of the run in milliseconds. */
  duration_ms: number;
  /** Results of the assertions that were evaluated. */
  assertions: Array<AssertionResult>;
  /** Error captured when `status` is `'error'`. */
  error?: Error;
  /** Log lines collected during the run. */
  logs?: Array<string>;
  /** Usage metrics derived from the agent result. */
  metrics?: TestMetrics;
}
|
|
84
|
+
|
|
85
|
+
/**
 * A named group of agent tests with optional lifecycle hooks.
 */
interface TestSuite {
  /** Suite name, copied into the suite result. */
  name: string;
  /** Tests executed sequentially, in array order. */
  tests: Array<AgentTestCase>;
  /** Awaited once before the first test. */
  beforeAll?: () => Promise<void>;
  /** Awaited once after the last test. */
  afterAll?: () => Promise<void>;
  /** Awaited before every test. */
  beforeEach?: () => Promise<void>;
  /** Awaited after every test. */
  afterEach?: () => Promise<void>;
}
|
|
93
|
+
|
|
94
|
+
/**
 * Runs agent tests and suites. Agents are created through `AgentFactory`,
 * optionally wired to mock LLM/tool providers, executed under a timeout, and
 * their results are checked by the `AssertionEngine`.
 */
class AgentTestRunner {
  private mockProvider: MockProvider;
  private assertionEngine: AssertionEngine;
  private config: TestRunnerConfig;
  /** True while a watch-triggered run is executing. */
  private watchRunActive = false;
  /** Set when a file change arrives during a watch-triggered run. */
  private watchRunQueued = false;

  constructor(config: TestRunnerConfig) {
    this.config = config;
    this.mockProvider = new MockProvider();
    this.assertionEngine = new AssertionEngine();
  }

  /**
   * Runs a single test.
   *
   * @param test - The test definition to execute.
   * @returns `'passed'`/`'failed'` depending on the assertions, or `'error'`
   *   when setup, agent creation, the agent run (including timeout) or an
   *   assertion throws.
   * @throws Whatever `test.teardown` throws; mocks are reset regardless.
   */
  async runTest(test: AgentTestCase): Promise<TestResult> {
    const startTime = Date.now();
    const logs: string[] = [];
    const assertionResults: AssertionResult[] = [];

    try {
      if (test.setup) await test.setup();

      if (test.mocks) {
        this.mockProvider.configure(test.mocks);
      }

      const agent = await this.createTestAgent(test.agent, test.mocks);

      const result = await this.runWithTimeout(
        agent.run(test.input),
        test.timeout || this.config.defaultTimeout
      );

      for (const assertion of test.assertions) {
        assertionResults.push(
          await this.assertionEngine.check(assertion, result, { input: test.input, agent })
        );
      }

      const allPassed = assertionResults.every(r => r.passed);

      return {
        name: test.name,
        status: allPassed ? 'passed' : 'failed',
        duration_ms: Date.now() - startTime,
        assertions: assertionResults,
        logs,
        metrics: this.collectMetrics(result)
      };
    } catch (error) {
      return {
        name: test.name,
        status: 'error',
        duration_ms: Date.now() - startTime,
        assertions: assertionResults,
        // Anything can be thrown; make sure consumers always receive an Error.
        error: error instanceof Error ? error : new Error(String(error)),
        logs
      };
    } finally {
      // Reset mocks even when teardown throws, so one broken teardown cannot
      // leak mock configuration into the next test.
      try {
        if (test.teardown) await test.teardown();
      } finally {
        this.mockProvider.reset();
      }
    }
  }

  /**
   * Runs every test of a suite sequentially, honoring the suite hooks.
   *
   * `test.retries` is treated as the total number of attempts: a test that
   * ends as `'failed'` is re-run while fewer than `retries` attempts were
   * made. Results with status `'error'` are not retried.
   *
   * @param suite - The suite to run.
   * @returns Per-test results plus an aggregated summary.
   */
  async runSuite(suite: TestSuite): Promise<SuiteResult> {
    const results: TestResult[] = [];

    if (suite.beforeAll) await suite.beforeAll();

    try {
      for (const test of suite.tests) {
        if (suite.beforeEach) await suite.beforeEach();

        let result = await this.runTest(test);
        let attempts = 1;

        while (result.status === 'failed' && attempts < (test.retries || 1)) {
          attempts++;
          result = await this.runTest(test);
        }

        results.push(result);

        if (suite.afterEach) await suite.afterEach();

        // A test that errored (threw or timed out) did not succeed either, so
        // it stops the suite just like a failed assertion does.
        const unsuccessful = result.status === 'failed' || result.status === 'error';
        if (this.config.stopOnFirstFailure && unsuccessful) {
          break;
        }
      }
    } finally {
      // Run afterAll even when a hook throws, so suite-level resources
      // acquired in beforeAll are still released.
      if (suite.afterAll) await suite.afterAll();
    }

    return {
      name: suite.name,
      results,
      summary: this.summarize(results)
    };
  }

  /**
   * Discovers and runs all suites matching `pattern`, one after another.
   *
   * @param pattern - Glob pattern; defaults to the test files under `config.testDir`.
   */
  async runAll(pattern?: string): Promise<SuiteResult[]> {
    const suites = await this.discoverTests(pattern);
    const results: SuiteResult[] = [];

    for (const suite of suites) {
      results.push(await this.runSuite(suite));
    }

    return results;
  }

  /**
   * Runs all tests once, then re-runs them whenever a `.test.ts` or
   * `.test.json` file under `config.testDir` changes.
   *
   * @param pattern - Glob pattern forwarded to `runAll`.
   */
  async watch(pattern?: string): Promise<void> {
    console.log('Watching for changes...');

    await this.runAll(pattern);

    // NOTE(review): this uses the event-emitter form of `fs.watch`, while
    // `loadTestFile` awaits `fs.readFile`; confirm which `fs` flavour the
    // surrounding module imports.
    const watcher = fs.watch(this.config.testDir, { recursive: true });

    watcher.on('change', (_eventType, filename) => {
      // `filename` may be a string or a Buffer, or be missing entirely.
      const changed = filename?.toString();
      if (!changed) return;
      if (!changed.endsWith('.test.ts') && !changed.endsWith('.test.json')) return;

      console.log(`\nFile changed: ${changed}`);
      void this.rerunFromWatch(pattern);
    });
  }

  /**
   * Re-runs the tests for the watcher without ever overlapping two runs:
   * changes arriving during a run are coalesced into one follow-up run.
   * Failures are logged instead of surfacing as unhandled rejections.
   */
  private async rerunFromWatch(pattern?: string): Promise<void> {
    if (this.watchRunActive) {
      this.watchRunQueued = true;
      return;
    }

    this.watchRunActive = true;
    try {
      do {
        this.watchRunQueued = false;
        try {
          await this.runAll(pattern);
        } catch (error) {
          console.error('Watch run failed:', error);
        }
      } while (this.watchRunQueued);
    } finally {
      this.watchRunActive = false;
    }
  }

  /**
   * Creates the agent under test and swaps in the mock LLM/tool providers
   * for whichever of them the test configured.
   */
  private async createTestAgent(
    agentConfig: string | AgentConfig,
    mocks?: MockConfig
  ): Promise<Agent> {
    const agent = typeof agentConfig === 'string'
      ? await AgentFactory.create(agentConfig)
      : await AgentFactory.createFromConfig(agentConfig);

    if (mocks?.llm) {
      agent.setLLMProvider(this.mockProvider.getLLM());
    }
    if (mocks?.tools) {
      agent.setToolProvider(this.mockProvider.getTools());
    }

    return agent;
  }

  /**
   * Settles with `promise`, or rejects with `Error('Test timeout')` after
   * `timeout` milliseconds, whichever happens first. The timer is always
   * cleared so a finished test leaves no pending timer behind.
   */
  private async runWithTimeout<T>(promise: Promise<T>, timeout: number): Promise<T> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const expiry = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error('Test timeout')), timeout);
    });

    try {
      return await Promise.race([promise, expiry]);
    } finally {
      clearTimeout(timer);
    }
  }

  /** Aggregates per-status counts and the summed duration of `results`. */
  private summarize(results: TestResult[]): TestSummary {
    return {
      total: results.length,
      passed: results.filter(r => r.status === 'passed').length,
      failed: results.filter(r => r.status === 'failed').length,
      skipped: results.filter(r => r.status === 'skipped').length,
      errors: results.filter(r => r.status === 'error').length,
      duration_ms: results.reduce((sum, r) => sum + r.duration_ms, 0)
    };
  }

  /** Extracts token, tool-call and cost figures, defaulting each to 0. */
  private collectMetrics(result: AgentResult): TestMetrics {
    return {
      tokens_used: result.usage?.total_tokens || 0,
      tool_calls: result.toolCalls?.length || 0,
      cost: result.cost?.total || 0
    };
  }

  /** Finds test files via glob and loads each one as a suite. */
  private async discoverTests(pattern?: string): Promise<TestSuite[]> {
    const testFiles = await glob(
      pattern || `${this.config.testDir}/**/*.test.{ts,json}`
    );

    const suites: TestSuite[] = [];

    for (const file of testFiles) {
      suites.push(await this.loadTestFile(file));
    }

    return suites;
  }

  /**
   * Loads one suite: `.json` files are parsed, anything else is imported
   * dynamically and its default export (or the module itself) is used.
   *
   * @throws Error when a JSON file does not contain a `tests` array.
   */
  private async loadTestFile(file: string): Promise<TestSuite> {
    if (file.endsWith('.json')) {
      const content = await fs.readFile(file, 'utf-8');
      const parsed: unknown = JSON.parse(content);
      // Fail here with the file name instead of later with an opaque
      // "not iterable" error inside runSuite.
      if (!AgentTestRunner.hasTestArray(parsed)) {
        throw new Error(`Invalid test suite file (missing "tests" array): ${file}`);
      }
      return parsed;
    }

    // NOTE(review): a path relative to the working directory is resolved by
    // import() relative to this module; confirm discovered paths are absolute.
    const module = await import(file);
    return module.default || module;
  }

  /** Minimal structural check for parsed JSON suites. */
  private static hasTestArray(value: unknown): value is TestSuite {
    return (
      typeof value === 'object' &&
      value !== null &&
      Array.isArray((value as { tests?: unknown }).tests)
    );
  }
}
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
---
|
|
316
|
+
|
|
317
|
+
## MockProvider
|
|
318
|
+
|
|
319
|
+
```typescript
|
|
320
|
+
/**
 * Configuration of the mock LLM.
 */
interface MockLLMConfig {
  /** Canned responses, tried in order; the first one that matches the prompt wins. */
  responses?: Array<MockResponse>;
  /** Text returned when no canned response matches. */
  defaultResponse?: string;
  /** Artificial latency in milliseconds applied to every completion. */
  delay?: number;
  /** Probability, compared against `Math.random()`, that a completion throws. */
  errorRate?: number;
  /** Fixed usage figures reported instead of the length-based estimate. */
  tokenUsage?: TokenUsage;
}
|
|
327
|
+
|
|
328
|
+
/**
 * One canned LLM response together with the rule that selects it.
 */
interface MockResponse {
  /**
   * Prompt matcher: a substring, a regular expression tested against the
   * prompt, or a predicate called with the prompt.
   */
  match: string | RegExp | ((input: string) => boolean);
  /** Response text, or a factory called without arguments to produce it. */
  response: string | (() => string);
  /** Tool calls returned alongside the text. */
  toolCalls?: Array<ToolCall>;
}
|
|
333
|
+
|
|
334
|
+
/**
 * Configuration of the mock tool provider.
 */
interface MockToolConfig {
  /** Mocked behavior keyed by tool name. */
  tools: Record<string, MockToolBehavior>;
}
|
|
337
|
+
|
|
338
|
+
/**
 * What a mocked tool does when it is called.
 */
interface MockToolBehavior {
  /** Value returned by the call when no error is configured. */
  response?: unknown;
  /** When set, the call throws an `Error` carrying this message. */
  error?: string;
  /** Artificial latency in milliseconds applied before anything else. */
  delay?: number;
  /** Callback invoked with the call parameters before the error check. */
  sideEffect?: (params: unknown) => void;
}
|
|
344
|
+
|
|
345
|
+
/**
 * Supplies mock LLM and tool providers driven by a configuration, and
 * records every call made through them.
 */
class MockProvider {
  private llmConfig: MockLLMConfig = {};
  private toolConfig: MockToolConfig = { tools: {} };
  private callHistory: CallRecord[] = [];

  /**
   * Installs the LLM and/or tool configuration.
   *
   * The configuration is copied (one level deep) so that later calls to
   * `mockLLMResponse` / `mockTool*` never modify the caller's objects.
   */
  configure(config: MockConfig): void {
    if (config.llm) {
      this.llmConfig = { ...config.llm };
      if (config.llm.responses) {
        this.llmConfig.responses = [...config.llm.responses];
      }
    }
    if (config.tools) {
      this.toolConfig = { ...config.tools, tools: { ...config.tools.tools } };
    }
  }

  /** Drops all configuration and the recorded call history. */
  reset(): void {
    this.llmConfig = {};
    this.toolConfig = { tools: {} };
    this.callHistory = [];
  }

  /**
   * Returns a mock LLM whose `complete` records the call, applies the
   * configured delay and error rate, and answers with the first matching
   * canned response (or the default one).
   */
  getLLM(): MockLLM {
    // Rough token estimate of four characters per token, rounded up so the
    // reported counts are whole numbers.
    const estimateTokens = (text: string): number => Math.ceil(text.length / 4);

    return {
      complete: async (prompt: string) => {
        this.callHistory.push({
          type: 'llm',
          input: prompt,
          timestamp: new Date()
        });

        if (this.llmConfig.delay) {
          await this.delay(this.llmConfig.delay);
        }

        if (this.llmConfig.errorRate && Math.random() < this.llmConfig.errorRate) {
          throw new Error('Mock LLM error');
        }

        const response = this.findMatchingResponse(prompt);
        const inputTokens = estimateTokens(prompt);
        const outputTokens = estimateTokens(response.text);

        return {
          content: response.text,
          toolCalls: response.toolCalls,
          usage: this.llmConfig.tokenUsage ?? {
            input_tokens: inputTokens,
            output_tokens: outputTokens,
            total_tokens: inputTokens + outputTokens
          }
        };
      }
    };
  }

  /**
   * Returns a mock tool provider whose `call` records the call and then
   * applies the configured behavior: delay, side effect, error, response.
   * Calling a tool without a configured behavior throws.
   */
  getTools(): MockToolProvider {
    return {
      call: async (name: string, params: unknown) => {
        this.callHistory.push({
          type: 'tool',
          name,
          input: params,
          timestamp: new Date()
        });

        const behavior = this.toolConfig.tools[name];

        if (!behavior) {
          throw new Error(`No mock configured for tool: ${name}`);
        }

        if (behavior.delay) {
          await this.delay(behavior.delay);
        }

        if (behavior.sideEffect) {
          behavior.sideEffect(params);
        }

        if (behavior.error) {
          throw new Error(behavior.error);
        }

        return behavior.response;
      }
    };
  }

  /** Returns a copy of the recorded calls, for use in assertions. */
  getCallHistory(): CallRecord[] {
    return [...this.callHistory];
  }

  /** Appends a canned LLM response for prompts matching `match`. */
  mockLLMResponse(match: string | RegExp, response: string): void {
    if (!this.llmConfig.responses) {
      this.llmConfig.responses = [];
    }
    this.llmConfig.responses.push({ match, response });
  }

  /** Registers (or replaces) the behavior of the tool `name`. */
  mockTool(name: string, behavior: MockToolBehavior): void {
    this.toolConfig.tools[name] = behavior;
  }

  /** Makes the tool `name` return `response`. */
  mockToolSuccess(name: string, response: unknown): void {
    this.toolConfig.tools[name] = { response };
  }

  /** Makes the tool `name` throw an error with message `error`. */
  mockToolError(name: string, error: string): void {
    this.toolConfig.tools[name] = { error };
  }

  /**
   * Picks the first canned response whose matcher accepts `prompt`, falling
   * back to `defaultResponse` and then to `'Mock response'`.
   */
  private findMatchingResponse(prompt: string): { text: string; toolCalls?: ToolCall[] } {
    for (const mock of this.llmConfig.responses || []) {
      let matches = false;

      if (typeof mock.match === 'string') {
        matches = prompt.includes(mock.match);
      } else if (mock.match instanceof RegExp) {
        // test() on a global/sticky regex resumes from lastIndex, which
        // would make identical prompts match only every other call.
        mock.match.lastIndex = 0;
        matches = mock.match.test(prompt);
        mock.match.lastIndex = 0;
      } else if (typeof mock.match === 'function') {
        matches = mock.match(prompt);
      }

      if (matches) {
        const text = typeof mock.response === 'function'
          ? mock.response()
          : mock.response;
        return { text, toolCalls: mock.toolCalls };
      }
    }

    // `??` keeps an intentionally empty default response.
    return {
      text: this.llmConfig.defaultResponse ?? 'Mock response'
    };
  }

  /** Resolves after `ms` milliseconds. */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
|
|
490
|
+
```
|
|
491
|
+
|
|
492
|
+
---
|
|
493
|
+
|
|
494
|
+
## SimulationEngine
|
|
495
|
+
|
|
496
|
+
```typescript
|
|
497
|
+
/**
 * A scripted interaction that the simulation engine can replay.
 */
interface Scenario {
  /** Unique name; the engine registers and looks scenarios up by it. */
  name: string;
  /** Human-readable description. */
  description: string;
  /** Steps executed in array order. */
  steps: Array<ScenarioStep>;
  /** Files and environment variables prepared before the first step. */
  environment?: EnvironmentConfig;
  /** Simulated user profile. */
  persona?: PersonaConfig;
}
|
|
504
|
+
|
|
505
|
+
/**
 * One step of a scenario.
 */
interface ScenarioStep {
  /** Kind of step to perform. */
  action: 'user_input' | 'wait' | 'trigger_event' | 'assert' | 'checkpoint';
  /**
   * Payload interpreted per action: the message string for `user_input`,
   * a delay in milliseconds for `wait`, an `{ event, payload }` object for
   * `trigger_event`, and an assertion for `assert`.
   */
  data?: unknown;
  /** Optional timeout for the step. */
  timeout?: number;
}
|
|
510
|
+
|
|
511
|
+
/**
 * Environment prepared for a simulation.
 */
interface EnvironmentConfig {
  /** File contents keyed by path; each file is created below a temporary directory. */
  files?: Record<string, string>;
  /** Environment variables merged into `process.env` for the duration of the run. */
  env?: Record<string, string>;
  /** Frozen time. */
  time?: string;
}
|
|
516
|
+
|
|
517
|
+
/**
 * Profile of the simulated user.
 */
interface PersonaConfig {
  /** Persona name. */
  name: string;
  /** Overall attitude of the simulated user. */
  behavior: 'cooperative' | 'adversarial' | 'confused' | 'expert';
  /** Optional writing style. */
  style?: 'verbose' | 'terse' | 'random';
}
|
|
522
|
+
|
|
523
|
+
/**
 * Replays registered scenarios against an agent inside a temporary
 * environment (scratch directory plus environment variables) and reports
 * per-step results.
 */
class SimulationEngine {
  private scenarios: Map<string, Scenario>;
  private currentSimulation: SimulationState | null = null;
  /** Engine used by `assert` steps. */
  private assertionEngine: AssertionEngine;
  /** Most recent `user_input` exchange; `assert` steps check its result. */
  private lastExchange: {
    input: { message: string };
    result: Awaited<ReturnType<Agent['run']>>;
  } | null = null;

  constructor() {
    this.scenarios = new Map();
    this.assertionEngine = new AssertionEngine();
  }

  /** Registers `scenario` under its name, replacing any previous one. */
  registerScenario(scenario: Scenario): void {
    this.scenarios.set(scenario.name, scenario);
  }

  /**
   * Runs the named scenario against `agent`.
   *
   * @param scenarioName - Name of a previously registered scenario.
   * @param agent - Agent that receives the `user_input` steps.
   * @param options - `stopOnFailure` ends the run at the first failed step.
   * @returns The step results and a pass/fail summary.
   * @throws Error when the scenario is unknown or the environment cannot be
   *   prepared.
   */
  async simulate(
    scenarioName: string,
    agent: Agent,
    options?: SimulationOptions
  ): Promise<SimulationResult> {
    const scenario = this.scenarios.get(scenarioName);
    if (!scenario) throw new Error(`Scenario not found: ${scenarioName}`);

    const env = await this.setupEnvironment(scenario.environment);

    const state: SimulationState = {
      scenario,
      agent,
      env,
      stepResults: [],
      startedAt: new Date()
    };
    this.currentSimulation = state;
    this.lastExchange = null;

    try {
      for (let i = 0; i < scenario.steps.length; i++) {
        const stepResult = await this.runStep(scenario.steps[i], i);
        state.stepResults.push(stepResult);

        if (!stepResult.success && options?.stopOnFailure) {
          break;
        }
      }

      return this.buildResult();
    } finally {
      // Clear the engine state even if removing the environment fails.
      try {
        await this.cleanup(env);
      } finally {
        this.currentSimulation = null;
        this.lastExchange = null;
      }
    }
  }

  /**
   * Derives one test case per `user_input` step of the named scenario. Each
   * generated case targets the `'default'` agent and asserts that the agent
   * responds without error.
   *
   * @throws Error when the scenario is unknown.
   */
  async generateTestCases(scenarioName: string): Promise<AgentTestCase[]> {
    const scenario = this.scenarios.get(scenarioName);
    if (!scenario) throw new Error(`Scenario not found: ${scenarioName}`);

    const testCases: AgentTestCase[] = [];

    scenario.steps.forEach((step, i) => {
      if (step.action !== 'user_input') return;

      testCases.push({
        name: `${scenario.name} - Step ${i + 1}`,
        description: `Test step: ${step.action}`,
        agent: 'default',
        input: { message: step.data as string },
        assertions: [
          { type: 'responds', timeout: 30000 },
          { type: 'no_error' }
        ]
      });
    });

    return testCases;
  }

  /**
   * Executes one step. Any exception is converted into a failed step result
   * rather than aborting the simulation.
   */
  private async runStep(step: ScenarioStep, index: number): Promise<StepResult> {
    const startTime = Date.now();

    try {
      switch (step.action) {
        case 'user_input':
          return await this.handleUserInput(step.data as string);

        case 'wait':
          await this.delay(step.data as number);
          // Report the time actually spent, like every other step does.
          return { success: true, duration_ms: Date.now() - startTime };

        case 'trigger_event': {
          const { event, payload } = step.data as { event: string; payload?: unknown };
          await EventBus.publish(event, payload);
          return { success: true, duration_ms: Date.now() - startTime };
        }

        case 'assert': {
          const outcome = await this.checkAssertion(step.data as Assertion);
          return {
            success: outcome.passed,
            duration_ms: Date.now() - startTime,
            assertion: outcome
          };
        }

        case 'checkpoint':
          return {
            success: true,
            duration_ms: Date.now() - startTime,
            checkpoint: this.captureState()
          };

        default:
          throw new Error(`Unknown step action: ${step.action}`);
      }
    } catch (error) {
      return {
        success: false,
        duration_ms: Date.now() - startTime,
        error: error instanceof Error ? error : new Error(String(error))
      };
    }
  }

  /** Sends `input` to the agent as a message and records the exchange. */
  private async handleUserInput(input: string): Promise<StepResult> {
    const simulation = this.currentSimulation;
    if (!simulation) throw new Error('No simulation running');

    const startTime = Date.now();
    const agentInput = { message: input };

    // Forget the previous exchange first: if this run throws, later assert
    // steps must not silently check a stale result.
    this.lastExchange = null;
    const result = await simulation.agent.run(agentInput);
    this.lastExchange = { input: agentInput, result };

    return {
      success: true,
      duration_ms: Date.now() - startTime,
      agentResult: result
    };
  }

  /**
   * Creates the scratch directory, writes the configured files into it and
   * applies the configured environment variables.
   *
   * If anything fails, the partially prepared environment is torn down
   * before the error is rethrown.
   *
   * @throws Error when a configured file path would land outside the
   *   scratch directory.
   */
  private async setupEnvironment(config?: EnvironmentConfig): Promise<SimulationEnv> {
    const env: SimulationEnv = {
      tempDir: await fs.mkdtemp(path.join(os.tmpdir(), 'elsabro-sim-')),
      originalEnv: { ...process.env }
    };

    try {
      for (const [filePath, content] of Object.entries(config?.files ?? {})) {
        const fullPath = path.join(env.tempDir, filePath);

        // Reject "../" style paths that would write outside the scratch dir.
        const relative = path.relative(env.tempDir, fullPath);
        if (relative === '..' || relative.startsWith(`..${path.sep}`) || path.isAbsolute(relative)) {
          throw new Error(`Mock file path resolves outside the simulation directory: ${filePath}`);
        }

        await fs.mkdir(path.dirname(fullPath), { recursive: true });
        await fs.writeFile(fullPath, content);
      }

      if (config?.env) {
        Object.assign(process.env, config.env);
      }

      // NOTE(review): `config.time` (frozen time) is not applied anywhere in
      // this class; confirm whether that is handled elsewhere.
    } catch (error) {
      await this.cleanup(env);
      throw error;
    }

    return env;
  }

  /**
   * Restores the environment variables captured in `env.originalEnv` and
   * removes the scratch directory.
   *
   * `process.env` is restored in place rather than reassigned, so the
   * process keeps its original environment object.
   */
  private async cleanup(env: SimulationEnv): Promise<void> {
    const original = env.originalEnv;

    for (const key of Object.keys(process.env)) {
      if (!Object.prototype.hasOwnProperty.call(original, key)) {
        delete process.env[key];
      }
    }
    for (const [key, value] of Object.entries(original)) {
      if (value !== undefined) {
        process.env[key] = value;
      }
    }

    await fs.rm(env.tempDir, { recursive: true, force: true });
  }

  /** Summarizes the step results of the running simulation. */
  private buildResult(): SimulationResult {
    if (!this.currentSimulation) throw new Error('No simulation');

    const { scenario, stepResults, startedAt } = this.currentSimulation;
    const passedSteps = stepResults.filter(r => r.success).length;

    return {
      scenario: scenario.name,
      success: passedSteps === stepResults.length,
      steps: stepResults,
      duration_ms: Date.now() - startedAt.getTime(),
      summary: {
        totalSteps: stepResults.length,
        passedSteps,
        failedSteps: stepResults.length - passedSteps
      }
    };
  }

  /** Captures a timestamped snapshot of memory and tasks for a checkpoint step. */
  private captureState(): SimulationCheckpoint {
    return {
      timestamp: new Date().toISOString(),
      memory: MemoryManager.export(),
      tasks: TaskList()
    };
  }

  /**
   * Checks `assertion` against the result of the most recent `user_input`
   * step.
   *
   * @throws Error when no simulation is running or no agent result exists yet.
   */
  private async checkAssertion(assertion: Assertion): Promise<AssertionResult> {
    const simulation = this.currentSimulation;
    if (!simulation) throw new Error('No simulation running');

    const exchange = this.lastExchange;
    if (!exchange) {
      throw new Error('Assert step has no agent result to check; run a user_input step first');
    }

    // NOTE(review): the context mirrors the `{ input, agent }` object that
    // AgentTestRunner passes to the assertion engine; confirm it matches
    // AssertionContext.
    return this.assertionEngine.check(assertion, exchange.result, {
      input: exchange.input,
      agent: simulation.agent
    });
  }

  /** Resolves after `ms` milliseconds. */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
|
|
734
|
+
```
|
|
735
|
+
|
|
736
|
+
---
|
|
737
|
+
|
|
738
|
+
## AssertionEngine
|
|
739
|
+
|
|
740
|
+
```typescript
|
|
741
|
+
/**
 * Names of the built-in assertion kinds.
 */
type AssertionType =
  /** Agent responded. */
  | 'responds'
  /** No errors occurred. */
  | 'no_error'
  /** Specific tool was called. */
  | 'tool_called'
  /** Tool was not called. */
  | 'tool_not_called'
  /** Output contains text. */
  | 'output_contains'
  /** Output matches regex. */
  | 'output_matches'
  /** Semantic similarity. */
  | 'semantic_match'
  /** Custom behavior check. */
  | 'behavior'
  /** Property-based test. */
  | 'property'
  /** Cost below threshold. */
  | 'cost_under'
  /** Time below threshold. */
  | 'time_under';
|
|
753
|
+
|
|
754
|
+
/**
 * A single check to perform on an agent result.
 */
interface Assertion {
  /** Which kind of check to run. */
  type: AssertionType;
  /**
   * Argument of the check; its expected shape depends on `type` (for
   * example a tool name, a text fragment or a numeric threshold).
   */
  value?: unknown;
  /** Optional message attached to the assertion. */
  message?: string;
  /** Optional timeout for the check. */
  timeout?: number;
}
|
|
760
|
+
|
|
761
|
+
/**
 * Outcome of evaluating one assertion.
 */
interface AssertionResult {
  /** The assertion this result belongs to. */
  assertion: Assertion;
  /** Whether the assertion held. */
  passed: boolean;
  /** Value observed on the agent result. */
  actual?: unknown;
  /** Value the assertion expected. */
  expected?: unknown;
  /** Human-readable explanation of the outcome. */
  message?: string;
}
|
|
768
|
+
|
|
769
|
+
/**
 * Evaluates assertions against the result of an agent run.
 *
 * Built-in assertion types are dispatched by `check`. Any other type is looked
 * up among the handlers added through `registerAssertion`.
 */
class AssertionEngine {
  private customAssertions: Map<string, AssertionHandler>;

  constructor() {
    this.customAssertions = new Map();
  }

  /**
   * Evaluate one assertion against an agent result.
   *
   * @param assertion Assertion to evaluate; `assertion.type` selects the check
   *   and `assertion.value` carries its argument.
   * @param result Agent result under test.
   * @param context Optional context, forwarded to `behavior` checks and to
   *   custom handlers.
   * @returns The outcome of the check.
   * @throws Error when `assertion.type` is neither built in nor registered
   *   (surfaced as a rejected promise because the method is async).
   */
  async check(
    assertion: Assertion,
    result: AgentResult,
    context?: AssertionContext
  ): Promise<AssertionResult> {
    switch (assertion.type) {
      case 'responds':
        return this.assertResponds(result);

      case 'no_error':
        return this.assertNoError(result);

      case 'tool_called':
        return this.assertToolCalled(result, assertion.value as string);

      case 'tool_not_called':
        return this.assertToolNotCalled(result, assertion.value as string);

      case 'output_contains':
        return this.assertOutputContains(result, assertion.value as string);

      case 'output_matches':
        return this.assertOutputMatches(result, assertion.value as string);

      case 'semantic_match':
        return await this.assertSemanticMatch(result, assertion.value as string);

      case 'behavior':
        return await this.assertBehavior(result, assertion.value as BehaviorSpec, context);

      case 'property':
        return this.assertProperty(result, assertion.value as PropertySpec);

      case 'cost_under':
        return this.assertCostUnder(result, assertion.value as number);

      case 'time_under':
        return this.assertTimeUnder(result, assertion.value as number);

      default: {
        // Braces give `handler` its own scope instead of leaking it into the
        // whole switch body.
        const handler = this.customAssertions.get(assertion.type);
        if (handler) {
          return handler(assertion, result, context);
        }
        throw new Error(`Unknown assertion type: ${assertion.type}`);
      }
    }
  }

  /**
   * Register a handler for a custom assertion type.
   *
   * Built-in types are matched first by `check`, so a handler registered under
   * a built-in name is never invoked. Registering the same type twice replaces
   * the earlier handler.
   */
  registerAssertion(type: string, handler: AssertionHandler): void {
    this.customAssertions.set(type, handler);
  }

  /**
   * Text form of the agent output used by the text-based checks.
   *
   * Only `null`/`undefined` collapse to the empty string; other falsy outputs
   * such as `0` or `false` keep their string form so they can still be matched.
   */
  private outputText(result: AgentResult): string {
    return String(result.output ?? '');
  }

  // Built-in assertions

  /** Passes when the agent produced any output other than `null`/`undefined`. */
  private assertResponds(result: AgentResult): AssertionResult {
    const passed = result.output !== undefined && result.output !== null;
    return {
      assertion: { type: 'responds' },
      passed,
      actual: result.output,
      message: passed ? 'Agent responded' : 'Agent did not respond'
    };
  }

  /** Passes when `result.error` is falsy. */
  private assertNoError(result: AgentResult): AssertionResult {
    const passed = !result.error;
    return {
      assertion: { type: 'no_error' },
      passed,
      actual: result.error,
      message: passed ? 'No error' : `Error: ${result.error?.message}`
    };
  }

  /** Passes when at least one recorded tool call has the given name. */
  private assertToolCalled(result: AgentResult, toolName: string): AssertionResult {
    const called = result.toolCalls?.some(tc => tc.name === toolName) ?? false;
    return {
      assertion: { type: 'tool_called', value: toolName },
      passed: called,
      actual: result.toolCalls?.map(tc => tc.name),
      expected: toolName,
      message: called ? `Tool ${toolName} was called` : `Tool ${toolName} was not called`
    };
  }

  /** Passes when no recorded tool call has the given name. */
  private assertToolNotCalled(result: AgentResult, toolName: string): AssertionResult {
    const called = result.toolCalls?.some(tc => tc.name === toolName) ?? false;
    return {
      assertion: { type: 'tool_not_called', value: toolName },
      passed: !called,
      actual: result.toolCalls?.map(tc => tc.name),
      message: !called ? `Tool ${toolName} was not called` : `Tool ${toolName} was called`
    };
  }

  /** Passes when the output text includes `text`; `actual` is the first 200 characters. */
  private assertOutputContains(result: AgentResult, text: string): AssertionResult {
    const output = this.outputText(result);
    const contains = output.includes(text);
    return {
      assertion: { type: 'output_contains', value: text },
      passed: contains,
      actual: output.slice(0, 200),
      expected: text,
      message: contains ? 'Output contains expected text' : 'Output does not contain expected text'
    };
  }

  /**
   * Passes when the output text matches the regular expression source `pattern`.
   *
   * A pattern that cannot be compiled yields a failed result whose message
   * names the problem, so one malformed assertion does not abort the caller.
   */
  private assertOutputMatches(result: AgentResult, pattern: string): AssertionResult {
    const output = this.outputText(result);

    let regex: RegExp;
    try {
      regex = new RegExp(pattern);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      return {
        assertion: { type: 'output_matches', value: pattern },
        passed: false,
        actual: output.slice(0, 200),
        expected: pattern,
        message: `Invalid pattern: ${reason}`
      };
    }

    const matches = regex.test(output);
    return {
      assertion: { type: 'output_matches', value: pattern },
      passed: matches,
      actual: output.slice(0, 200),
      expected: pattern,
      message: matches ? 'Output matches pattern' : 'Output does not match pattern'
    };
  }

  /**
   * Asks `quickLLMCheck` whether the output and `expected` mean the same thing.
   *
   * Only the first 500 characters of the output are sent. The reply counts as
   * agreement only when it starts with the word "yes" (ignoring case and
   * leading punctuation/whitespace); a plain substring search would also accept
   * replies such as "No, in my eyes they differ".
   */
  private async assertSemanticMatch(result: AgentResult, expected: string): Promise<AssertionResult> {
    const output = this.outputText(result);

    // Simple semantic check (could use embeddings in production)
    const prompt = `Compare these two texts for semantic similarity.
Text 1: "${output.slice(0, 500)}"
Text 2: "${expected}"

Are they semantically similar in meaning? Answer only "yes" or "no".`;

    const response = await this.quickLLMCheck(prompt);
    const similar = /^\W*yes\b/i.test(response);

    return {
      assertion: { type: 'semantic_match', value: expected },
      passed: similar,
      actual: output.slice(0, 200),
      expected,
      message: similar ? 'Semantically similar' : 'Not semantically similar'
    };
  }

  /**
   * Checks the result against a behavioral specification.
   *
   * Every violated rule is collected; the assertion passes only when none are
   * found, otherwise the message lists all violations separated by "; ".
   */
  private async assertBehavior(
    result: AgentResult,
    spec: BehaviorSpec,
    context?: AssertionContext
  ): Promise<AssertionResult> {
    const violations: string[] = [];

    if (spec.mustCallTools) {
      for (const tool of spec.mustCallTools) {
        if (!result.toolCalls?.some(tc => tc.name === tool)) {
          violations.push(`Did not call required tool: ${tool}`);
        }
      }
    }

    if (spec.mustNotCallTools) {
      for (const tool of spec.mustNotCallTools) {
        if (result.toolCalls?.some(tc => tc.name === tool)) {
          violations.push(`Called forbidden tool: ${tool}`);
        }
      }
    }

    if (spec.mustAsk && !result.askedUser) {
      violations.push('Did not ask user when expected');
    }

    if (spec.customCheck) {
      const customResult = await spec.customCheck(result, context);
      if (!customResult.passed) {
        // `||` on purpose: an empty reason falls back to the generic text.
        violations.push(customResult.reason || 'Custom check failed');
      }
    }

    const passed = violations.length === 0;
    return {
      assertion: { type: 'behavior', value: spec },
      passed,
      message: passed ? 'Behavior matches spec' : violations.join('; ')
    };
  }

  /**
   * Property-based check: runs the optional `invariant` and `postcondition`
   * predicates of `spec` against the result and reports every one that fails.
   */
  private assertProperty(result: AgentResult, spec: PropertySpec): AssertionResult {
    const violations: string[] = [];

    if (spec.invariant && !spec.invariant(result)) {
      violations.push('Invariant violated');
    }

    if (spec.postcondition && !spec.postcondition(result)) {
      violations.push('Postcondition not satisfied');
    }

    const passed = violations.length === 0;
    return {
      assertion: { type: 'property', value: spec },
      passed,
      message: passed ? 'Properties satisfied' : violations.join('; ')
    };
  }

  /**
   * Passes when the total cost is at most `maxCost` (the bound is inclusive).
   * A missing cost is treated as 0.
   */
  private assertCostUnder(result: AgentResult, maxCost: number): AssertionResult {
    const cost = result.cost?.total ?? 0;
    const passed = cost <= maxCost;
    return {
      assertion: { type: 'cost_under', value: maxCost },
      passed,
      actual: cost,
      expected: `<= ${maxCost}`,
      message: passed ? `Cost $${cost} is under $${maxCost}` : `Cost $${cost} exceeds $${maxCost}`
    };
  }

  /**
   * Passes when `duration_ms` is at most `maxTime` (the bound is inclusive).
   * A missing duration is treated as 0.
   */
  private assertTimeUnder(result: AgentResult, maxTime: number): AssertionResult {
    const time = result.duration_ms ?? 0;
    const passed = time <= maxTime;
    return {
      assertion: { type: 'time_under', value: maxTime },
      passed,
      actual: time,
      expected: `<= ${maxTime}ms`,
      message: passed ? `Time ${time}ms is under ${maxTime}ms` : `Time ${time}ms exceeds ${maxTime}ms`
    };
  }

  /**
   * Quick LLM check for semantic assertions.
   *
   * Placeholder: the prompt is ignored and the answer is always "yes".
   * In production, use a small/fast model.
   */
  private async quickLLMCheck(prompt: string): Promise<string> {
    return 'yes'; // Placeholder
  }
}
|
|
1026
|
+
```
|
|
1027
|
+
|
|
1028
|
+
---
|
|
1029
|
+
|
|
1030
|
+
## Test Fixtures
|
|
1031
|
+
|
|
1032
|
+
```typescript
|
|
1033
|
+
/**
 * Named registry of reusable test data.
 *
 * A handful of built-in fixtures are registered on construction; more can be
 * added directly, loaded from a file, or created as temporary files.
 */
class TestFixtures {
  private fixtures: Map<string, unknown>;

  constructor() {
    this.fixtures = new Map();
    this.loadBuiltinFixtures();
  }

  /**
   * Look up a fixture by name.
   *
   * @param name Name the fixture was registered under.
   * @returns The stored value, cast to `T` without any runtime validation.
   * @throws Error when nothing is registered under `name`.
   */
  get<T>(name: string): T {
    // Presence is tested with has() rather than truthiness so that fixtures
    // whose value is 0, '', false or null can still be retrieved.
    if (!this.fixtures.has(name)) throw new Error(`Fixture not found: ${name}`);
    return this.fixtures.get(name) as T;
  }

  /** Register a fixture, replacing any existing one with the same name. */
  register(name: string, value: unknown): void {
    this.fixtures.set(name, value);
  }

  /**
   * Read a file as UTF-8 text and register its content under `name`.
   *
   * Note: inside this method the `path` parameter shadows the `path` module.
   */
  async loadFile(name: string, path: string): Promise<void> {
    const content = await fs.readFile(path, 'utf-8');
    this.fixtures.set(name, content);
  }

  /**
   * Write `content` to a file called `name` inside a freshly created temporary
   * directory and register the resulting path under `file:<name>`.
   *
   * Nothing in this class removes the directory afterwards.
   *
   * @returns Path of the created file.
   */
  async createTempFile(name: string, content: string): Promise<string> {
    const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'fixture-'));
    const filePath = path.join(tempDir, name);
    await fs.writeFile(filePath, content);
    this.fixtures.set(`file:${name}`, filePath);
    return filePath;
  }

  /** Register the common fixtures that are available on every instance. */
  private loadBuiltinFixtures(): void {
    this.fixtures.set('emptyInput', { message: '' });
    this.fixtures.set('simpleTask', { message: 'Hello, world!' });
    this.fixtures.set('codeTask', {
      message: 'Write a function to add two numbers'
    });
    this.fixtures.set('errorInput', { message: null });
  }
}
|
|
1078
|
+
```
|
|
1079
|
+
|
|
1080
|
+
---
|
|
1081
|
+
|
|
1082
|
+
## Comandos
|
|
1083
|
+
|
|
1084
|
+
```bash
|
|
1085
|
+
/elsabro:test # Ejecutar todos los tests
|
|
1086
|
+
/elsabro:test run explore.test # Ejecutar test específico
|
|
1087
|
+
/elsabro:test watch # Modo watch
|
|
1088
|
+
/elsabro:test coverage # Reporte de cobertura
|
|
1089
|
+
/elsabro:test simulate scenario1 # Ejecutar simulación
|
|
1090
|
+
```
|
|
1091
|
+
|
|
1092
|
+
---
|
|
1093
|
+
|
|
1094
|
+
## Ejemplo de Test
|
|
1095
|
+
|
|
1096
|
+
```typescript
|
|
1097
|
+
// tests/explore-agent.test.ts

// Single test case: the Explore agent is asked for TypeScript files while the
// Glob tool is mocked to return two paths.
const findsTypeScriptFiles = {
  name: 'should find TypeScript files',
  agent: 'Explore',
  input: { message: 'Find all TypeScript files in src/' },
  mocks: {
    tools: {
      Glob: { response: ['src/index.ts', 'src/utils.ts'] },
    },
  },
  // The agent must answer without error, call Glob, and mention a .ts file.
  assertions: [
    { type: 'responds' },
    { type: 'no_error' },
    { type: 'tool_called', value: 'Glob' },
    { type: 'output_contains', value: '.ts' },
  ],
};

// The suite is the module's default export.
export default {
  name: 'Explore Agent Tests',
  tests: [findsTypeScriptFiles],
};
|
|
1119
|
+
```
|
|
1120
|
+
|
|
1121
|
+
---
|
|
1122
|
+
|
|
1123
|
+
## Configuración
|
|
1124
|
+
|
|
1125
|
+
```json
|
|
1126
|
+
{
|
|
1127
|
+
"testing": {
|
|
1128
|
+
"enabled": true,
|
|
1129
|
+
"testDir": ".elsabro/tests",
|
|
1130
|
+
"defaultTimeout": 30000,
|
|
1131
|
+
"stopOnFirstFailure": false,
|
|
1132
|
+
"retries": 1,
|
|
1133
|
+
"parallel": false,
|
|
1134
|
+
"coverage": {
|
|
1135
|
+
"enabled": true,
|
|
1136
|
+
"threshold": 80
|
|
1137
|
+
}
|
|
1138
|
+
}
|
|
1139
|
+
}
|
|
1140
|
+
```
|
|
1141
|
+
|
|
1142
|
+
---
|
|
1143
|
+
|
|
1144
|
+
## Changelog
|
|
1145
|
+
|
|
1146
|
+
- **v3.4.0**: Initial Testing Framework
|
|
1147
|
+
- AgentTestRunner with suites
|
|
1148
|
+
- MockProvider for LLM and tools
|
|
1149
|
+
- SimulationEngine for scenarios
|
|
1150
|
+
- AssertionEngine with 12+ assertion types
|
|
1151
|
+
- TestFixtures for reusable data
|