@memberjunction/testing-engine 0.0.1 → 2.118.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +403 -29
- package/dist/drivers/AgentEvalDriver.d.ts +197 -0
- package/dist/drivers/AgentEvalDriver.d.ts.map +1 -0
- package/dist/drivers/AgentEvalDriver.js +370 -0
- package/dist/drivers/AgentEvalDriver.js.map +1 -0
- package/dist/drivers/BaseTestDriver.d.ts +145 -0
- package/dist/drivers/BaseTestDriver.d.ts.map +1 -0
- package/dist/drivers/BaseTestDriver.js +266 -0
- package/dist/drivers/BaseTestDriver.js.map +1 -0
- package/dist/drivers/index.d.ts +6 -0
- package/dist/drivers/index.d.ts.map +1 -0
- package/dist/drivers/index.js +22 -0
- package/dist/drivers/index.js.map +1 -0
- package/dist/engine/TestEngine.d.ts +148 -0
- package/dist/engine/TestEngine.d.ts.map +1 -0
- package/dist/engine/TestEngine.js +490 -0
- package/dist/engine/TestEngine.js.map +1 -0
- package/dist/index.d.ts +20 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +42 -0
- package/dist/index.js.map +1 -0
- package/dist/oracles/ExactMatchOracle.d.ts +98 -0
- package/dist/oracles/ExactMatchOracle.d.ts.map +1 -0
- package/dist/oracles/ExactMatchOracle.js +355 -0
- package/dist/oracles/ExactMatchOracle.js.map +1 -0
- package/dist/oracles/IOracle.d.ts +47 -0
- package/dist/oracles/IOracle.d.ts.map +1 -0
- package/dist/oracles/IOracle.js +7 -0
- package/dist/oracles/IOracle.js.map +1 -0
- package/dist/oracles/LLMJudgeOracle.d.ts +65 -0
- package/dist/oracles/LLMJudgeOracle.d.ts.map +1 -0
- package/dist/oracles/LLMJudgeOracle.js +214 -0
- package/dist/oracles/LLMJudgeOracle.js.map +1 -0
- package/dist/oracles/SQLValidatorOracle.d.ts +78 -0
- package/dist/oracles/SQLValidatorOracle.d.ts.map +1 -0
- package/dist/oracles/SQLValidatorOracle.js +215 -0
- package/dist/oracles/SQLValidatorOracle.js.map +1 -0
- package/dist/oracles/SchemaValidatorOracle.d.ts +61 -0
- package/dist/oracles/SchemaValidatorOracle.d.ts.map +1 -0
- package/dist/oracles/SchemaValidatorOracle.js +193 -0
- package/dist/oracles/SchemaValidatorOracle.js.map +1 -0
- package/dist/oracles/TraceValidatorOracle.d.ts +41 -0
- package/dist/oracles/TraceValidatorOracle.d.ts.map +1 -0
- package/dist/oracles/TraceValidatorOracle.js +159 -0
- package/dist/oracles/TraceValidatorOracle.js.map +1 -0
- package/dist/oracles/index.d.ts +10 -0
- package/dist/oracles/index.d.ts.map +1 -0
- package/dist/oracles/index.js +26 -0
- package/dist/oracles/index.js.map +1 -0
- package/dist/types.d.ts +428 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/cost-calculator.d.ts +92 -0
- package/dist/utils/cost-calculator.d.ts.map +1 -0
- package/dist/utils/cost-calculator.js +137 -0
- package/dist/utils/cost-calculator.js.map +1 -0
- package/dist/utils/result-formatter.d.ts +98 -0
- package/dist/utils/result-formatter.d.ts.map +1 -0
- package/dist/utils/result-formatter.js +252 -0
- package/dist/utils/result-formatter.js.map +1 -0
- package/dist/utils/scoring.d.ts +64 -0
- package/dist/utils/scoring.d.ts.map +1 -0
- package/dist/utils/scoring.js +140 -0
- package/dist/utils/scoring.js.map +1 -0
- package/package.json +36 -7
package/README.md
CHANGED
|
@@ -1,45 +1,419 @@
|
|
|
1
|
-
#
|
|
1
|
+
# MemberJunction Testing Engine
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Core execution engine for the MemberJunction Testing Framework. Provides the foundational infrastructure for running tests, evaluating results, and managing test execution.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Architecture
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
The Testing Engine follows a driver-based architecture that allows for extensible test types:
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
```
|
|
10
|
+
TestEngine (Orchestration)
|
|
11
|
+
↓
|
|
12
|
+
BaseTestDriver (Abstract base class)
|
|
13
|
+
↓
|
|
14
|
+
Concrete Drivers:
|
|
15
|
+
- AgentEvalDriver
|
|
16
|
+
- WorkflowScenarioDriver (future)
|
|
17
|
+
- CodeGenTestDriver (future)
|
|
18
|
+
- IntegrationTestDriver (future)
|
|
19
|
+
```
|
|
10
20
|
|
|
11
|
-
|
|
12
|
-
1. Configure OIDC trusted publishing for the package name `@memberjunction/testing-engine`
|
|
13
|
-
2. Enable secure, token-less publishing from CI/CD workflows
|
|
14
|
-
3. Establish provenance for packages published under this name
|
|
21
|
+
## Core Components
|
|
15
22
|
|
|
16
|
-
|
|
23
|
+
### 1. Test Engine (`TestEngine`)
|
|
24
|
+
Main orchestration layer responsible for:
|
|
25
|
+
- Loading test definitions from database
|
|
26
|
+
- Instantiating appropriate test drivers via ClassFactory
|
|
27
|
+
- Managing test execution lifecycle
|
|
28
|
+
- Aggregating and storing results
|
|
29
|
+
- Tracking costs and performance metrics
|
|
17
30
|
|
|
18
|
-
|
|
31
|
+
### 2. Base Test Driver (`BaseTestDriver`)
|
|
32
|
+
Abstract base class that all test drivers extend:
|
|
33
|
+
- Defines execution contract
|
|
34
|
+
- Provides common utilities (scoring, validation, result formatting)
|
|
35
|
+
- Handles error management
|
|
36
|
+
- Manages test context
|
|
19
37
|
|
|
20
|
-
|
|
38
|
+
### 3. Oracle System
|
|
39
|
+
Pluggable evaluation components:
|
|
40
|
+
- **SchemaValidator** - Validates output structure
|
|
41
|
+
- **TraceValidator** - Checks execution traces for errors
|
|
42
|
+
- **LLMJudge** - Uses LLM to evaluate quality
|
|
43
|
+
- **ExactMatch** - Deterministic output comparison
|
|
44
|
+
- **SQLValidator** - Database state verification
|
|
21
45
|
|
|
22
|
-
|
|
46
|
+
### 4. Result Management
|
|
47
|
+
Structured result capture and storage:
|
|
48
|
+
- TestRun entity management
|
|
49
|
+
- Metric collection and aggregation
|
|
50
|
+
- Artifact storage coordination
|
|
51
|
+
- Cost tracking
|
|
23
52
|
|
|
24
|
-
|
|
25
|
-
2. Configure the trusted publisher (e.g., GitHub Actions)
|
|
26
|
-
3. Specify the repository and workflow that should be allowed to publish
|
|
27
|
-
4. Use the configured workflow to publish your actual package
|
|
53
|
+
## Class Structure
|
|
28
54
|
|
|
29
|
-
|
|
55
|
+
### TestEngine
|
|
30
56
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
57
|
+
```typescript
|
|
58
|
+
export class TestEngine {
|
|
59
|
+
/**
|
|
60
|
+
* Run a single test by ID
|
|
61
|
+
*/
|
|
62
|
+
async runTest(
|
|
63
|
+
testId: string,
|
|
64
|
+
contextUser: UserInfo,
|
|
65
|
+
options?: TestRunOptions
|
|
66
|
+
): Promise<TestRunResult>
|
|
36
67
|
|
|
37
|
-
|
|
68
|
+
/**
|
|
69
|
+
* Run an entire test suite
|
|
70
|
+
*/
|
|
71
|
+
async runSuite(
|
|
72
|
+
suiteId: string,
|
|
73
|
+
contextUser: UserInfo,
|
|
74
|
+
options?: SuiteRunOptions
|
|
75
|
+
): Promise<SuiteRunResult>
|
|
38
76
|
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
77
|
+
/**
|
|
78
|
+
* Validate test definition without executing
|
|
79
|
+
*/
|
|
80
|
+
async validateTest(testId: string): Promise<ValidationResult>
|
|
81
|
+
}
|
|
82
|
+
```
|
|
42
83
|
|
|
43
|
-
|
|
84
|
+
### BaseTestDriver
|
|
44
85
|
|
|
45
|
-
|
|
86
|
+
```typescript
|
|
87
|
+
export abstract class BaseTestDriver {
|
|
88
|
+
/**
|
|
89
|
+
* Execute the test and return results
|
|
90
|
+
* Must be implemented by concrete drivers
|
|
91
|
+
*/
|
|
92
|
+
abstract execute(
|
|
93
|
+
test: TestEntity,
|
|
94
|
+
contextUser: UserInfo,
|
|
95
|
+
options: ExecutionOptions
|
|
96
|
+
): Promise<DriverExecutionResult>
|
|
97
|
+
|
|
98
|
+
/**
|
|
99
|
+
* Validate test configuration
|
|
100
|
+
* Can be overridden by concrete drivers
|
|
101
|
+
*/
|
|
102
|
+
async validate(test: TestEntity): Promise<ValidationResult>
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Calculate overall score from oracle results
|
|
106
|
+
*/
|
|
107
|
+
protected calculateScore(
|
|
108
|
+
oracleResults: OracleResult[],
|
|
109
|
+
weights: ScoringWeights
|
|
110
|
+
): number
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Store execution results to database
|
|
114
|
+
*/
|
|
115
|
+
protected async storeResults(
|
|
116
|
+
testRun: TestRunEntity,
|
|
117
|
+
results: DriverExecutionResult
|
|
118
|
+
): Promise<void>
|
|
119
|
+
}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Oracle Interface
|
|
123
|
+
|
|
124
|
+
```typescript
|
|
125
|
+
export interface IOracle {
|
|
126
|
+
/**
|
|
127
|
+
* Unique identifier for this oracle type
|
|
128
|
+
*/
|
|
129
|
+
readonly type: string;
|
|
130
|
+
|
|
131
|
+
/**
|
|
132
|
+
* Execute the oracle evaluation
|
|
133
|
+
*/
|
|
134
|
+
evaluate(
|
|
135
|
+
input: OracleInput,
|
|
136
|
+
config: OracleConfig
|
|
137
|
+
): Promise<OracleResult>
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
export interface OracleResult {
|
|
141
|
+
oracleType: string;
|
|
142
|
+
passed: boolean;
|
|
143
|
+
score?: number;
|
|
144
|
+
message: string;
|
|
145
|
+
details?: any;
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Agent Eval Driver
|
|
150
|
+
|
|
151
|
+
The first concrete implementation:
|
|
152
|
+
|
|
153
|
+
```typescript
|
|
154
|
+
export class AgentEvalDriver extends BaseTestDriver {
|
|
155
|
+
async execute(
|
|
156
|
+
test: TestEntity,
|
|
157
|
+
contextUser: UserInfo,
|
|
158
|
+
options: ExecutionOptions
|
|
159
|
+
): Promise<DriverExecutionResult> {
|
|
160
|
+
// 1. Parse test configuration
|
|
161
|
+
const config = this.parseConfig(test);
|
|
162
|
+
|
|
163
|
+
// 2. Execute agent with test inputs
|
|
164
|
+
const agentRun = await this.executeAgent(config.inputs, contextUser);
|
|
165
|
+
|
|
166
|
+
// 3. Run oracles to evaluate output
|
|
167
|
+
const oracleResults = await this.runOracles(
|
|
168
|
+
agentRun,
|
|
169
|
+
config.oracles,
|
|
170
|
+
config.expectedOutcomes
|
|
171
|
+
);
|
|
172
|
+
|
|
173
|
+
// 4. Calculate score based on weights
|
|
174
|
+
const score = this.calculateScore(oracleResults, config.weights);
|
|
175
|
+
|
|
176
|
+
// 5. Return structured results
|
|
177
|
+
return {
|
|
178
|
+
targetType: 'Agent Run',
|
|
179
|
+
targetLogId: agentRun.ID,
|
|
180
|
+
status: this.determineStatus(oracleResults),
|
|
181
|
+
score,
|
|
182
|
+
oracleResults,
|
|
183
|
+
cost: this.calculateCost(agentRun),
|
|
184
|
+
duration: this.calculateDuration(agentRun)
|
|
185
|
+
};
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
private async runOracles(
|
|
189
|
+
agentRun: AIAgentRunEntity,
|
|
190
|
+
oracleConfigs: OracleConfig[],
|
|
191
|
+
expectedOutcomes: any
|
|
192
|
+
): Promise<OracleResult[]> {
|
|
193
|
+
const results: OracleResult[] = [];
|
|
194
|
+
|
|
195
|
+
for (const config of oracleConfigs) {
|
|
196
|
+
const oracle = this.getOracle(config.type);
|
|
197
|
+
const result = await oracle.evaluate({
|
|
198
|
+
agentRun,
|
|
199
|
+
expectedOutcomes
|
|
200
|
+
}, config);
|
|
201
|
+
results.push(result);
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
return results;
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Oracle Implementations
|
|
210
|
+
|
|
211
|
+
### Schema Validator
|
|
212
|
+
```typescript
|
|
213
|
+
export class SchemaValidatorOracle implements IOracle {
|
|
214
|
+
readonly type = 'schema-validate';
|
|
215
|
+
|
|
216
|
+
async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
|
|
217
|
+
const { agentRun } = input;
|
|
218
|
+
const schema = this.loadSchema(config.schema);
|
|
219
|
+
|
|
220
|
+
try {
|
|
221
|
+
schema.parse(agentRun.ResultPayload);
|
|
222
|
+
return {
|
|
223
|
+
oracleType: this.type,
|
|
224
|
+
passed: true,
|
|
225
|
+
score: 1.0,
|
|
226
|
+
message: 'Output matches schema'
|
|
227
|
+
};
|
|
228
|
+
} catch (error) {
|
|
229
|
+
return {
|
|
230
|
+
oracleType: this.type,
|
|
231
|
+
passed: false,
|
|
232
|
+
score: 0.0,
|
|
233
|
+
message: `Schema validation failed: ${error.message}`,
|
|
234
|
+
details: error.errors
|
|
235
|
+
};
|
|
236
|
+
}
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### Trace Validator
|
|
242
|
+
```typescript
|
|
243
|
+
export class TraceValidatorOracle implements IOracle {
|
|
244
|
+
readonly type = 'trace-no-errors';
|
|
245
|
+
|
|
246
|
+
async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
|
|
247
|
+
const { agentRun } = input;
|
|
248
|
+
|
|
249
|
+
// Load agent run steps
|
|
250
|
+
const rv = new RunView();
|
|
251
|
+
const stepsResult = await rv.RunView<AIAgentRunStepEntity>({
|
|
252
|
+
EntityName: 'MJ: AI Agent Run Steps',
|
|
253
|
+
ExtraFilter: `AgentRunID='${agentRun.ID}'`,
|
|
254
|
+
ResultType: 'entity_object'
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
if (!stepsResult.Success) {
|
|
258
|
+
throw new Error(`Failed to load agent run steps: ${stepsResult.ErrorMessage}`);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
const steps = stepsResult.Results || [];
|
|
262
|
+
const errorSteps = steps.filter(s => s.Status === 'Error' || s.Status === 'Failed');
|
|
263
|
+
|
|
264
|
+
if (errorSteps.length === 0) {
|
|
265
|
+
return {
|
|
266
|
+
oracleType: this.type,
|
|
267
|
+
passed: true,
|
|
268
|
+
score: 1.0,
|
|
269
|
+
message: `All ${steps.length} steps completed without errors`
|
|
270
|
+
};
|
|
271
|
+
} else {
|
|
272
|
+
return {
|
|
273
|
+
oracleType: this.type,
|
|
274
|
+
passed: false,
|
|
275
|
+
score: 0.0,
|
|
276
|
+
message: `${errorSteps.length} of ${steps.length} steps had errors`,
|
|
277
|
+
details: errorSteps.map(s => ({
|
|
278
|
+
stepId: s.ID,
|
|
279
|
+
sequence: s.Sequence,
|
|
280
|
+
error: s.Notes
|
|
281
|
+
}))
|
|
282
|
+
};
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
### LLM Judge
|
|
289
|
+
```typescript
|
|
290
|
+
export class LLMJudgeOracle implements IOracle {
|
|
291
|
+
readonly type = 'llm-judge';
|
|
292
|
+
|
|
293
|
+
async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
|
|
294
|
+
const { agentRun, expectedOutcomes } = input;
|
|
295
|
+
|
|
296
|
+
// Load rubric
|
|
297
|
+
const md = new Metadata();
|
|
298
|
+
const rubric = await md.GetEntityObject<TestRubricEntity>('Test Rubrics');
|
|
299
|
+
await rubric.Load(config.rubricId);
|
|
300
|
+
|
|
301
|
+
// Build evaluation prompt
|
|
302
|
+
const prompt = this.buildPrompt(rubric, agentRun, expectedOutcomes);
|
|
303
|
+
|
|
304
|
+
// Execute LLM evaluation
|
|
305
|
+
const judgmentResult = await this.executeJudgment(prompt);
|
|
306
|
+
|
|
307
|
+
// Parse structured response
|
|
308
|
+
const scores = this.parseScores(judgmentResult, rubric.Criteria);
|
|
309
|
+
|
|
310
|
+
// Calculate weighted score
|
|
311
|
+
const overallScore = this.calculateWeightedScore(scores, rubric.Criteria);
|
|
312
|
+
|
|
313
|
+
return {
|
|
314
|
+
oracleType: this.type,
|
|
315
|
+
passed: overallScore >= (config.threshold || 0.7),
|
|
316
|
+
score: overallScore,
|
|
317
|
+
message: `LLM judge score: ${(overallScore * 100).toFixed(1)}%`,
|
|
318
|
+
details: {
|
|
319
|
+
rubricId: rubric.ID,
|
|
320
|
+
rubricVersion: rubric.Version,
|
|
321
|
+
dimensionScores: scores,
|
|
322
|
+
judgmentNotes: judgmentResult.notes
|
|
323
|
+
}
|
|
324
|
+
};
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
## Usage Example
|
|
330
|
+
|
|
331
|
+
```typescript
|
|
332
|
+
import { TestEngine } from '@memberjunction/testing-engine';
|
|
333
|
+
import { getSystemUser } from '@memberjunction/core';
|
|
334
|
+
|
|
335
|
+
// Initialize engine
|
|
336
|
+
const engine = new TestEngine();
|
|
337
|
+
|
|
338
|
+
// Run a single test
|
|
339
|
+
const contextUser = getSystemUser();
|
|
340
|
+
const result = await engine.runTest('test-id-123', contextUser, {
|
|
341
|
+
dryRun: false,
|
|
342
|
+
environment: 'staging'
|
|
343
|
+
});
|
|
344
|
+
|
|
345
|
+
console.log(`Test ${result.status}: Score ${result.score}`);
|
|
346
|
+
console.log(`Cost: $${result.cost}, Duration: ${result.duration}s`);
|
|
347
|
+
|
|
348
|
+
// Run a suite
|
|
349
|
+
const suiteResult = await engine.runSuite('suite-id-456', contextUser, {
|
|
350
|
+
parallel: false,
|
|
351
|
+
failFast: true
|
|
352
|
+
});
|
|
353
|
+
|
|
354
|
+
console.log(`Suite completed: ${suiteResult.passedTests}/${suiteResult.totalTests} passed`);
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
## Extension Points
|
|
358
|
+
|
|
359
|
+
### Adding New Test Types
|
|
360
|
+
|
|
361
|
+
1. Create driver class extending `BaseTestDriver`:
|
|
362
|
+
```typescript
|
|
363
|
+
export class WorkflowScenarioDriver extends BaseTestDriver {
|
|
364
|
+
async execute(test: TestEntity, contextUser: UserInfo, options: ExecutionOptions) {
|
|
365
|
+
// Your workflow test logic
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
2. Register with ClassFactory:
|
|
371
|
+
```typescript
|
|
372
|
+
MJGlobal.Instance.ClassFactory.Register(
|
|
373
|
+
BaseTestDriver,
|
|
374
|
+
'WorkflowScenarioDriver',
|
|
375
|
+
WorkflowScenarioDriver
|
|
376
|
+
);
|
|
377
|
+
```
|
|
378
|
+
|
|
379
|
+
3. Create TestType record in database:
|
|
380
|
+
```sql
|
|
381
|
+
INSERT INTO TestType (Name, DriverClass, Status)
|
|
382
|
+
VALUES ('Workflow Scenario', 'WorkflowScenarioDriver', 'Active');
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
### Adding New Oracles
|
|
386
|
+
|
|
387
|
+
1. Implement `IOracle` interface:
|
|
388
|
+
```typescript
|
|
389
|
+
export class CustomOracle implements IOracle {
|
|
390
|
+
readonly type = 'custom-check';
|
|
391
|
+
|
|
392
|
+
async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
|
|
393
|
+
// Your evaluation logic
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
```
|
|
397
|
+
|
|
398
|
+
2. Register with engine:
|
|
399
|
+
```typescript
|
|
400
|
+
engine.registerOracle(new CustomOracle());
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
## Testing Best Practices
|
|
404
|
+
|
|
405
|
+
1. **Use appropriate oracles** - Combine deterministic checks (schema, trace) with semantic checks (LLM judge)
|
|
406
|
+
2. **Set realistic weights** - Balance different evaluation dimensions
|
|
407
|
+
3. **Track costs** - Monitor LLM evaluation costs
|
|
408
|
+
4. **Validate before executing** - Use `validateTest()` to catch configuration errors
|
|
409
|
+
5. **Leverage traces** - Link TestRun to target entities for debugging
|
|
410
|
+
|
|
411
|
+
## Future Enhancements
|
|
412
|
+
|
|
413
|
+
- [ ] Parallel test execution within suites
|
|
414
|
+
- [ ] Test retry logic with exponential backoff
|
|
415
|
+
- [ ] Result caching for expensive evaluations
|
|
416
|
+
- [ ] Composite oracles (AND/OR logic)
|
|
417
|
+
- [ ] Dataset-driven test execution (run same test with multiple inputs)
|
|
418
|
+
- [ ] Comparative evaluation (A/B testing different agent versions)
|
|
419
|
+
- [ ] Replay mode (re-evaluate existing agent runs)
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @fileoverview Test driver for AI Agent evaluation
|
|
3
|
+
* @module @memberjunction/testing-engine
|
|
4
|
+
*/
|
|
5
|
+
import { TestEntity } from '@memberjunction/core-entities';
|
|
6
|
+
import { BaseTestDriver } from './BaseTestDriver';
|
|
7
|
+
import { DriverExecutionContext, DriverExecutionResult, ValidationResult } from '../types';
|
|
8
|
+
/**
|
|
9
|
+
* Configuration for Agent Evaluation tests.
|
|
10
|
+
*/
|
|
11
|
+
export interface AgentEvalConfig {
|
|
12
|
+
/**
|
|
13
|
+
* Agent to test
|
|
14
|
+
*/
|
|
15
|
+
agentId: string;
|
|
16
|
+
/**
|
|
17
|
+
* Oracles to run for evaluation
|
|
18
|
+
*/
|
|
19
|
+
oracles: {
|
|
20
|
+
type: string;
|
|
21
|
+
config?: Record<string, unknown>;
|
|
22
|
+
weight?: number;
|
|
23
|
+
}[];
|
|
24
|
+
/**
|
|
25
|
+
* Scoring weights by oracle type
|
|
26
|
+
*/
|
|
27
|
+
scoringWeights?: Record<string, number>;
|
|
28
|
+
/**
|
|
29
|
+
* Maximum execution time in milliseconds
|
|
30
|
+
*/
|
|
31
|
+
maxExecutionTime?: number;
|
|
32
|
+
}
|
|
33
|
+
/**
|
|
34
|
+
* Input definition for Agent Evaluation tests.
|
|
35
|
+
*/
|
|
36
|
+
export interface AgentEvalInput {
|
|
37
|
+
/**
|
|
38
|
+
* User message to send to agent
|
|
39
|
+
*/
|
|
40
|
+
userMessage: string;
|
|
41
|
+
/**
|
|
42
|
+
* Optional conversation context
|
|
43
|
+
*/
|
|
44
|
+
conversationContext?: {
|
|
45
|
+
conversationId?: string;
|
|
46
|
+
priorMessages?: Array<{
|
|
47
|
+
role: 'user' | 'assistant';
|
|
48
|
+
content: string;
|
|
49
|
+
}>;
|
|
50
|
+
};
|
|
51
|
+
/**
|
|
52
|
+
* Optional agent execution parameters
|
|
53
|
+
*/
|
|
54
|
+
executionParams?: {
|
|
55
|
+
modelOverride?: string;
|
|
56
|
+
temperatureOverride?: number;
|
|
57
|
+
maxTokensOverride?: number;
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
/**
|
|
61
|
+
* Expected outcomes for Agent Evaluation tests.
|
|
62
|
+
*/
|
|
63
|
+
export interface AgentEvalExpectedOutcomes {
|
|
64
|
+
/**
|
|
65
|
+
* Expected response patterns (for regex matching)
|
|
66
|
+
*/
|
|
67
|
+
responsePatterns?: string[];
|
|
68
|
+
/**
|
|
69
|
+
* Expected entities mentioned in response
|
|
70
|
+
*/
|
|
71
|
+
expectedEntities?: string[];
|
|
72
|
+
/**
|
|
73
|
+
* Expected actions taken
|
|
74
|
+
*/
|
|
75
|
+
expectedActions?: string[];
|
|
76
|
+
/**
|
|
77
|
+
* Schema for response validation
|
|
78
|
+
*/
|
|
79
|
+
responseSchema?: Record<string, unknown>;
|
|
80
|
+
/**
|
|
81
|
+
* SQL queries to validate database state
|
|
82
|
+
*/
|
|
83
|
+
sqlValidations?: Array<{
|
|
84
|
+
query: string;
|
|
85
|
+
expectedResult: unknown;
|
|
86
|
+
}>;
|
|
87
|
+
/**
|
|
88
|
+
* Custom validation criteria for LLM judge
|
|
89
|
+
*/
|
|
90
|
+
judgeValidationCriteria?: string[];
|
|
91
|
+
}
|
|
92
|
+
/**
|
|
93
|
+
* Test driver for AI Agent evaluation.
|
|
94
|
+
*
|
|
95
|
+
* Executes an AI agent with test input, runs configured oracles,
|
|
96
|
+
* and creates a bidirectional link between TestRun and AgentRun.
|
|
97
|
+
*
|
|
98
|
+
* @example
|
|
99
|
+
* ```typescript
|
|
100
|
+
* // Configuration JSON in Test entity
|
|
101
|
+
* {
|
|
102
|
+
* "agentId": "agent-123",
|
|
103
|
+
* "oracles": [
|
|
104
|
+
* { "type": "trace-no-errors", "weight": 0.2 },
|
|
105
|
+
* { "type": "llm-judge", "weight": 0.5, "config": { "criteria": [...] } },
|
|
106
|
+
* { "type": "schema-validate", "weight": 0.3, "config": { "schema": {...} } }
|
|
107
|
+
* ],
|
|
108
|
+
* "scoringWeights": { "trace-no-errors": 0.2, "llm-judge": 0.5, "schema-validate": 0.3 }
|
|
109
|
+
* }
|
|
110
|
+
*
|
|
111
|
+
* // InputDefinition JSON in Test entity
|
|
112
|
+
* {
|
|
113
|
+
* "userMessage": "Create a report showing sales by region",
|
|
114
|
+
* "conversationContext": null,
|
|
115
|
+
* "executionParams": { "temperatureOverride": 0.3 }
|
|
116
|
+
* }
|
|
117
|
+
*
|
|
118
|
+
* // ExpectedOutcomes JSON in Test entity
|
|
119
|
+
* {
|
|
120
|
+
* "responsePatterns": ["sales.*region", "chart|graph"],
|
|
121
|
+
* "expectedEntities": ["Report", "Dashboard"],
|
|
122
|
+
* "responseSchema": { "type": "object", "properties": {...} },
|
|
123
|
+
* "judgeValidationCriteria": [
|
|
124
|
+
* "Response accurately answers the user's question",
|
|
125
|
+
* "Report includes proper data visualization",
|
|
126
|
+
* "Response is professional and clear"
|
|
127
|
+
* ]
|
|
128
|
+
* }
|
|
129
|
+
* ```
|
|
130
|
+
*/
|
|
131
|
+
export declare class AgentEvalDriver extends BaseTestDriver {
|
|
132
|
+
/**
|
|
133
|
+
* Execute agent evaluation test.
|
|
134
|
+
*
|
|
135
|
+
* Steps:
|
|
136
|
+
* 1. Parse configuration and input
|
|
137
|
+
* 2. Load and execute agent via AgentRunner
|
|
138
|
+
* 3. Create bidirectional link (TestRun ↔ AgentRun)
|
|
139
|
+
* 4. Run oracles to evaluate results
|
|
140
|
+
* 5. Calculate score and determine status
|
|
141
|
+
* 6. Return structured results
|
|
142
|
+
*
|
|
143
|
+
* @param context - Execution context
|
|
144
|
+
* @returns Execution result
|
|
145
|
+
*/
|
|
146
|
+
Execute(context: DriverExecutionContext): Promise<DriverExecutionResult>;
|
|
147
|
+
/**
|
|
148
|
+
* Validate agent evaluation test configuration.
|
|
149
|
+
*
|
|
150
|
+
* Checks:
|
|
151
|
+
* - Base validation (InputDefinition, ExpectedOutcomes, Configuration)
|
|
152
|
+
* - Agent ID is valid
|
|
153
|
+
* - At least one oracle is configured
|
|
154
|
+
* - Oracle types are registered
|
|
155
|
+
* - Scoring weights are valid
|
|
156
|
+
*
|
|
157
|
+
* @param test - Test entity to validate
|
|
158
|
+
* @returns Validation result
|
|
159
|
+
*/
|
|
160
|
+
Validate(test: TestEntity): Promise<ValidationResult>;
|
|
161
|
+
/**
|
|
162
|
+
* Load agent entity.
|
|
163
|
+
* @private
|
|
164
|
+
*/
|
|
165
|
+
private loadAgent;
|
|
166
|
+
/**
|
|
167
|
+
* Execute agent and return result.
|
|
168
|
+
* @private
|
|
169
|
+
*/
|
|
170
|
+
private executeAgent;
|
|
171
|
+
/**
|
|
172
|
+
* Create bidirectional link between TestRun and AgentRun.
|
|
173
|
+
* @private
|
|
174
|
+
*/
|
|
175
|
+
private linkTestRunToAgentRun;
|
|
176
|
+
/**
|
|
177
|
+
* Extract agent output from agent run.
|
|
178
|
+
* @private
|
|
179
|
+
*/
|
|
180
|
+
private extractAgentOutput;
|
|
181
|
+
/**
|
|
182
|
+
* Run configured oracles.
|
|
183
|
+
* @private
|
|
184
|
+
*/
|
|
185
|
+
private runOracles;
|
|
186
|
+
/**
|
|
187
|
+
* Calculate total cost from agent run.
|
|
188
|
+
* @private
|
|
189
|
+
*/
|
|
190
|
+
private calculateTotalCost;
|
|
191
|
+
/**
|
|
192
|
+
* Calculate duration in milliseconds from agent run.
|
|
193
|
+
* @private
|
|
194
|
+
*/
|
|
195
|
+
private calculateDurationMs;
|
|
196
|
+
}
|
|
197
|
+
//# sourceMappingURL=AgentEvalDriver.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"AgentEvalDriver.d.ts","sourceRoot":"","sources":["../../src/drivers/AgentEvalDriver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,EAAmC,UAAU,EAAiB,MAAM,+BAA+B,CAAC;AAG3G,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EACH,sBAAsB,EACtB,qBAAqB,EAGrB,gBAAgB,EAGnB,MAAM,UAAU,CAAC;AAElB;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,OAAO,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACjC,MAAM,CAAC,EAAE,MAAM,CAAC;KACnB,EAAE,CAAC;IAEJ;;OAEG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAExC;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B;;OAEG;IACH,WAAW,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACH,mBAAmB,CAAC,EAAE;QAClB,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,aAAa,CAAC,EAAE,KAAK,CAAC;YAClB,IAAI,EAAE,MAAM,GAAG,WAAW,CAAC;YAC3B,OAAO,EAAE,MAAM,CAAC;SACnB,CAAC,CAAC;KACN,CAAC;IAEF;;OAEG;IACH,eAAe,CAAC,EAAE;QACd,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,iBAAiB,CAAC,EAAE,MAAM,CAAC;KAC9B,CAAC;CACL;AAED;;GAEG;AACH,MAAM,WAAW,yBAAyB;IACtC;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE5B;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE5B;;OAEG;IACH,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAE3B;;OAEG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAEzC;;OAEG;IACH,cAAc,CAAC,EAAE,KAAK,CAAC;QACnB,KAAK,EAAE,MAAM,CAAC;QACd,cAAc,EAAE,OAAO,CAAC;KAC3B,CAAC,CAAC;IAEH;;OAEG;IACH,uBAAuB,CAAC,EAAE,MAAM,EAAE,CAAC;CACtC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AACH,qBACa,eAAgB,SAAQ,cAAc;IAC/C;;;;;;;;;;;;;OAaG;IACU,OAAO,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,qBAAqB,CAAC;IAwFrF;;;;;;;;;;;;OAYG;IACmB,QAAQ,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAgF3E;;;OAGG;YACW,SAAS;IAMvB;;;OAGG;YACW,YAAY;IA0E1B;;;OAGG;YACW,qBAAqB;IAanC;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAS1B;;;OAGG;YACW,UAAU;IAyDxB;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAI1B;;;OAGG;IACH,OAAO,CAAC,mBAAmB;CAQ9B"}
|