@memberjunction/testing-engine 0.0.1 → 2.119.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/README.md +403 -29
  2. package/dist/drivers/AgentEvalDriver.d.ts +197 -0
  3. package/dist/drivers/AgentEvalDriver.d.ts.map +1 -0
  4. package/dist/drivers/AgentEvalDriver.js +370 -0
  5. package/dist/drivers/AgentEvalDriver.js.map +1 -0
  6. package/dist/drivers/BaseTestDriver.d.ts +145 -0
  7. package/dist/drivers/BaseTestDriver.d.ts.map +1 -0
  8. package/dist/drivers/BaseTestDriver.js +266 -0
  9. package/dist/drivers/BaseTestDriver.js.map +1 -0
  10. package/dist/drivers/index.d.ts +6 -0
  11. package/dist/drivers/index.d.ts.map +1 -0
  12. package/dist/drivers/index.js +22 -0
  13. package/dist/drivers/index.js.map +1 -0
  14. package/dist/engine/TestEngine.d.ts +148 -0
  15. package/dist/engine/TestEngine.d.ts.map +1 -0
  16. package/dist/engine/TestEngine.js +490 -0
  17. package/dist/engine/TestEngine.js.map +1 -0
  18. package/dist/index.d.ts +20 -0
  19. package/dist/index.d.ts.map +1 -0
  20. package/dist/index.js +42 -0
  21. package/dist/index.js.map +1 -0
  22. package/dist/oracles/ExactMatchOracle.d.ts +98 -0
  23. package/dist/oracles/ExactMatchOracle.d.ts.map +1 -0
  24. package/dist/oracles/ExactMatchOracle.js +355 -0
  25. package/dist/oracles/ExactMatchOracle.js.map +1 -0
  26. package/dist/oracles/IOracle.d.ts +47 -0
  27. package/dist/oracles/IOracle.d.ts.map +1 -0
  28. package/dist/oracles/IOracle.js +7 -0
  29. package/dist/oracles/IOracle.js.map +1 -0
  30. package/dist/oracles/LLMJudgeOracle.d.ts +65 -0
  31. package/dist/oracles/LLMJudgeOracle.d.ts.map +1 -0
  32. package/dist/oracles/LLMJudgeOracle.js +214 -0
  33. package/dist/oracles/LLMJudgeOracle.js.map +1 -0
  34. package/dist/oracles/SQLValidatorOracle.d.ts +78 -0
  35. package/dist/oracles/SQLValidatorOracle.d.ts.map +1 -0
  36. package/dist/oracles/SQLValidatorOracle.js +215 -0
  37. package/dist/oracles/SQLValidatorOracle.js.map +1 -0
  38. package/dist/oracles/SchemaValidatorOracle.d.ts +61 -0
  39. package/dist/oracles/SchemaValidatorOracle.d.ts.map +1 -0
  40. package/dist/oracles/SchemaValidatorOracle.js +193 -0
  41. package/dist/oracles/SchemaValidatorOracle.js.map +1 -0
  42. package/dist/oracles/TraceValidatorOracle.d.ts +41 -0
  43. package/dist/oracles/TraceValidatorOracle.d.ts.map +1 -0
  44. package/dist/oracles/TraceValidatorOracle.js +159 -0
  45. package/dist/oracles/TraceValidatorOracle.js.map +1 -0
  46. package/dist/oracles/index.d.ts +10 -0
  47. package/dist/oracles/index.d.ts.map +1 -0
  48. package/dist/oracles/index.js +26 -0
  49. package/dist/oracles/index.js.map +1 -0
  50. package/dist/types.d.ts +428 -0
  51. package/dist/types.d.ts.map +1 -0
  52. package/dist/types.js +6 -0
  53. package/dist/types.js.map +1 -0
  54. package/dist/utils/cost-calculator.d.ts +92 -0
  55. package/dist/utils/cost-calculator.d.ts.map +1 -0
  56. package/dist/utils/cost-calculator.js +137 -0
  57. package/dist/utils/cost-calculator.js.map +1 -0
  58. package/dist/utils/result-formatter.d.ts +98 -0
  59. package/dist/utils/result-formatter.d.ts.map +1 -0
  60. package/dist/utils/result-formatter.js +252 -0
  61. package/dist/utils/result-formatter.js.map +1 -0
  62. package/dist/utils/scoring.d.ts +64 -0
  63. package/dist/utils/scoring.d.ts.map +1 -0
  64. package/dist/utils/scoring.js +140 -0
  65. package/dist/utils/scoring.js.map +1 -0
  66. package/package.json +36 -7
package/README.md CHANGED
@@ -1,45 +1,419 @@
1
- # @memberjunction/testing-engine
1
+ # MemberJunction Testing Engine
2
2
 
3
- ## ⚠️ IMPORTANT NOTICE ⚠️
3
+ Core execution engine for the MemberJunction Testing Framework. Provides the foundational infrastructure for running tests, evaluating results, and managing test execution.
4
4
 
5
- **This package is created solely for the purpose of setting up OIDC (OpenID Connect) trusted publishing with npm.**
5
+ ## Architecture
6
6
 
7
- This is **NOT** a functional package and contains **NO** code or functionality beyond the OIDC setup configuration.
7
+ The Testing Engine follows a driver-based architecture that allows for extensible test types:
8
8
 
9
- ## Purpose
9
+ ```
10
+ TestEngine (Orchestration)
11
+
12
+ BaseTestDriver (Abstract base class)
13
+
14
+ Concrete Drivers:
15
+ - AgentEvalDriver
16
+ - WorkflowScenarioDriver (future)
17
+ - CodeGenTestDriver (future)
18
+ - IntegrationTestDriver (future)
19
+ ```
10
20
 
11
- This package exists to:
12
- 1. Configure OIDC trusted publishing for the package name `@memberjunction/testing-engine`
13
- 2. Enable secure, token-less publishing from CI/CD workflows
14
- 3. Establish provenance for packages published under this name
21
+ ## Core Components
15
22
 
16
- ## What is OIDC Trusted Publishing?
23
+ ### 1. Test Engine (`TestEngine`)
24
+ Main orchestration layer responsible for:
25
+ - Loading test definitions from database
26
+ - Instantiating appropriate test drivers via ClassFactory
27
+ - Managing test execution lifecycle
28
+ - Aggregating and storing results
29
+ - Tracking costs and performance metrics
17
30
 
18
- OIDC trusted publishing allows package maintainers to publish packages directly from their CI/CD workflows without needing to manage npm access tokens. Instead, it uses OpenID Connect to establish trust between the CI/CD provider (like GitHub Actions) and npm.
31
+ ### 2. Base Test Driver (`BaseTestDriver`)
32
+ Abstract base class that all test drivers extend:
33
+ - Defines execution contract
34
+ - Provides common utilities (scoring, validation, result formatting)
35
+ - Handles error management
36
+ - Manages test context
19
37
 
20
- ## Setup Instructions
38
+ ### 3. Oracle System
39
+ Pluggable evaluation components:
40
+ - **SchemaValidator** - Validates output structure
41
+ - **TraceValidator** - Checks execution traces for errors
42
+ - **LLMJudge** - Uses LLM to evaluate quality
43
+ - **ExactMatcher** - Deterministic output comparison
44
+ - **SQLValidator** - Database state verification
21
45
 
22
- To properly configure OIDC trusted publishing for this package:
46
+ ### 4. Result Management
47
+ Structured result capture and storage:
48
+ - TestRun entity management
49
+ - Metric collection and aggregation
50
+ - Artifact storage coordination
51
+ - Cost tracking
23
52
 
24
- 1. Go to [npmjs.com](https://www.npmjs.com/) and navigate to your package settings
25
- 2. Configure the trusted publisher (e.g., GitHub Actions)
26
- 3. Specify the repository and workflow that should be allowed to publish
27
- 4. Use the configured workflow to publish your actual package
53
+ ## Class Structure
28
54
 
29
- ## DO NOT USE THIS PACKAGE
55
+ ### TestEngine
30
56
 
31
- This package is a placeholder for OIDC configuration only. It:
32
- - Contains no executable code
33
- - Provides no functionality
34
- - Should not be installed as a dependency
35
- - Exists only for administrative purposes
57
+ ```typescript
58
+ export class TestEngine {
59
+ /**
60
+ * Run a single test by ID
61
+ */
62
+ async runTest(
63
+ testId: string,
64
+ contextUser: UserInfo,
65
+ options?: TestRunOptions
66
+ ): Promise<TestRunResult>
36
67
 
37
- ## More Information
68
+ /**
69
+ * Run an entire test suite
70
+ */
71
+ async runSuite(
72
+ suiteId: string,
73
+ contextUser: UserInfo,
74
+ options?: SuiteRunOptions
75
+ ): Promise<SuiteRunResult>
38
76
 
39
- For more details about npm's trusted publishing feature, see:
40
- - [npm Trusted Publishing Documentation](https://docs.npmjs.com/generating-provenance-statements)
41
- - [GitHub Actions OIDC Documentation](https://docs.github.com/en/actions/deployment/security-hardening-your-deployments/about-security-hardening-with-openid-connect)
77
+ /**
78
+ * Validate test definition without executing
79
+ */
80
+ async validateTest(testId: string): Promise<ValidationResult>
81
+ }
82
+ ```
42
83
 
43
- ---
84
+ ### BaseTestDriver
44
85
 
45
- **Maintained for OIDC setup purposes only**
86
+ ```typescript
87
+ export abstract class BaseTestDriver {
88
+ /**
89
+ * Execute the test and return results
90
+ * Must be implemented by concrete drivers
91
+ */
92
+ abstract execute(
93
+ test: TestEntity,
94
+ contextUser: UserInfo,
95
+ options: ExecutionOptions
96
+ ): Promise<DriverExecutionResult>
97
+
98
+ /**
99
+ * Validate test configuration
100
+ * Can be overridden by concrete drivers
101
+ */
102
+ async validate(test: TestEntity): Promise<ValidationResult>
103
+
104
+ /**
105
+ * Calculate overall score from oracle results
106
+ */
107
+ protected calculateScore(
108
+ oracleResults: OracleResult[],
109
+ weights: ScoringWeights
110
+ ): number
111
+
112
+ /**
113
+ * Store execution results to database
114
+ */
115
+ protected async storeResults(
116
+ testRun: TestRunEntity,
117
+ results: DriverExecutionResult
118
+ ): Promise<void>
119
+ }
120
+ ```
121
+
122
+ ### Oracle Interface
123
+
124
+ ```typescript
125
+ export interface IOracle {
126
+ /**
127
+ * Unique identifier for this oracle type
128
+ */
129
+ readonly type: string;
130
+
131
+ /**
132
+ * Execute the oracle evaluation
133
+ */
134
+ evaluate(
135
+ input: OracleInput,
136
+ config: OracleConfig
137
+ ): Promise<OracleResult>
138
+ }
139
+
140
+ export interface OracleResult {
141
+ oracleType: string;
142
+ passed: boolean;
143
+ score?: number;
144
+ message: string;
145
+ details?: any;
146
+ }
147
+ ```
148
+
149
+ ## Agent Eval Driver
150
+
151
+ The first concrete implementation:
152
+
153
+ ```typescript
154
+ export class AgentEvalDriver extends BaseTestDriver {
155
+ async execute(
156
+ test: TestEntity,
157
+ contextUser: UserInfo,
158
+ options: ExecutionOptions
159
+ ): Promise<DriverExecutionResult> {
160
+ // 1. Parse test configuration
161
+ const config = this.parseConfig(test);
162
+
163
+ // 2. Execute agent with test inputs
164
+ const agentRun = await this.executeAgent(config.inputs, contextUser);
165
+
166
+ // 3. Run oracles to evaluate output
167
+ const oracleResults = await this.runOracles(
168
+ agentRun,
169
+ config.oracles,
170
+ config.expectedOutcomes
171
+ );
172
+
173
+ // 4. Calculate score based on weights
174
+ const score = this.calculateScore(oracleResults, config.weights);
175
+
176
+ // 5. Return structured results
177
+ return {
178
+ targetType: 'Agent Run',
179
+ targetLogId: agentRun.ID,
180
+ status: this.determineStatus(oracleResults),
181
+ score,
182
+ oracleResults,
183
+ cost: this.calculateCost(agentRun),
184
+ duration: this.calculateDuration(agentRun)
185
+ };
186
+ }
187
+
188
+ private async runOracles(
189
+ agentRun: AIAgentRunEntity,
190
+ oracleConfigs: OracleConfig[],
191
+ expectedOutcomes: any
192
+ ): Promise<OracleResult[]> {
193
+ const results: OracleResult[] = [];
194
+
195
+ for (const config of oracleConfigs) {
196
+ const oracle = this.getOracle(config.type);
197
+ const result = await oracle.evaluate({
198
+ agentRun,
199
+ expectedOutcomes
200
+ }, config);
201
+ results.push(result);
202
+ }
203
+
204
+ return results;
205
+ }
206
+ }
207
+ ```
208
+
209
+ ## Oracle Implementations
210
+
211
+ ### Schema Validator
212
+ ```typescript
213
+ export class SchemaValidatorOracle implements IOracle {
214
+ readonly type = 'schema-validate';
215
+
216
+ async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
217
+ const { agentRun } = input;
218
+ const schema = this.loadSchema(config.schema);
219
+
220
+ try {
221
+ schema.parse(agentRun.ResultPayload);
222
+ return {
223
+ oracleType: this.type,
224
+ passed: true,
225
+ score: 1.0,
226
+ message: 'Output matches schema'
227
+ };
228
+ } catch (error) {
229
+ return {
230
+ oracleType: this.type,
231
+ passed: false,
232
+ score: 0.0,
233
+ message: `Schema validation failed: ${error.message}`,
234
+ details: error.errors
235
+ };
236
+ }
237
+ }
238
+ }
239
+ ```
240
+
241
+ ### Trace Validator
242
+ ```typescript
243
+ export class TraceValidatorOracle implements IOracle {
244
+ readonly type = 'trace-no-errors';
245
+
246
+ async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
247
+ const { agentRun } = input;
248
+
249
+ // Load agent run steps
250
+ const rv = new RunView();
251
+ const stepsResult = await rv.RunView<AIAgentRunStepEntity>({
252
+ EntityName: 'MJ: AI Agent Run Steps',
253
+ ExtraFilter: `AgentRunID='${agentRun.ID}'`,
254
+ ResultType: 'entity_object'
255
+ });
256
+
257
+ if (!stepsResult.Success) {
258
+ throw new Error(`Failed to load agent run steps: ${stepsResult.ErrorMessage}`);
259
+ }
260
+
261
+ const steps = stepsResult.Results || [];
262
+ const errorSteps = steps.filter(s => s.Status === 'Error' || s.Status === 'Failed');
263
+
264
+ if (errorSteps.length === 0) {
265
+ return {
266
+ oracleType: this.type,
267
+ passed: true,
268
+ score: 1.0,
269
+ message: `All ${steps.length} steps completed without errors`
270
+ };
271
+ } else {
272
+ return {
273
+ oracleType: this.type,
274
+ passed: false,
275
+ score: 0.0,
276
+ message: `${errorSteps.length} of ${steps.length} steps had errors`,
277
+ details: errorSteps.map(s => ({
278
+ stepId: s.ID,
279
+ sequence: s.Sequence,
280
+ error: s.Notes
281
+ }))
282
+ };
283
+ }
284
+ }
285
+ }
286
+ ```
287
+
288
+ ### LLM Judge
289
+ ```typescript
290
+ export class LLMJudgeOracle implements IOracle {
291
+ readonly type = 'llm-judge';
292
+
293
+ async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
294
+ const { agentRun, expectedOutcomes } = input;
295
+
296
+ // Load rubric
297
+ const md = new Metadata();
298
+ const rubric = await md.GetEntityObject<TestRubricEntity>('Test Rubrics');
299
+ await rubric.Load(config.rubricId);
300
+
301
+ // Build evaluation prompt
302
+ const prompt = this.buildPrompt(rubric, agentRun, expectedOutcomes);
303
+
304
+ // Execute LLM evaluation
305
+ const judgmentResult = await this.executeJudgment(prompt);
306
+
307
+ // Parse structured response
308
+ const scores = this.parseScores(judgmentResult, rubric.Criteria);
309
+
310
+ // Calculate weighted score
311
+ const overallScore = this.calculateWeightedScore(scores, rubric.Criteria);
312
+
313
+ return {
314
+ oracleType: this.type,
315
+ passed: overallScore >= (config.threshold || 0.7),
316
+ score: overallScore,
317
+ message: `LLM judge score: ${(overallScore * 100).toFixed(1)}%`,
318
+ details: {
319
+ rubricId: rubric.ID,
320
+ rubricVersion: rubric.Version,
321
+ dimensionScores: scores,
322
+ judgmentNotes: judgmentResult.notes
323
+ }
324
+ };
325
+ }
326
+ }
327
+ ```
328
+
329
+ ## Usage Example
330
+
331
+ ```typescript
332
+ import { TestEngine } from '@memberjunction/testing-engine';
333
+ import { getSystemUser } from '@memberjunction/core';
334
+
335
+ // Initialize engine
336
+ const engine = new TestEngine();
337
+
338
+ // Run a single test
339
+ const contextUser = getSystemUser();
340
+ const result = await engine.runTest('test-id-123', contextUser, {
341
+ dryRun: false,
342
+ environment: 'staging'
343
+ });
344
+
345
+ console.log(`Test ${result.status}: Score ${result.score}`);
346
+ console.log(`Cost: $${result.cost}, Duration: ${result.duration}s`);
347
+
348
+ // Run a suite
349
+ const suiteResult = await engine.runSuite('suite-id-456', contextUser, {
350
+ parallel: false,
351
+ failFast: true
352
+ });
353
+
354
+ console.log(`Suite completed: ${suiteResult.passedTests}/${suiteResult.totalTests} passed`);
355
+ ```
356
+
357
+ ## Extension Points
358
+
359
+ ### Adding New Test Types
360
+
361
+ 1. Create driver class extending `BaseTestDriver`:
362
+ ```typescript
363
+ export class WorkflowScenarioDriver extends BaseTestDriver {
364
+ async execute(test: TestEntity, contextUser: UserInfo, options: ExecutionOptions) {
365
+ // Your workflow test logic
366
+ }
367
+ }
368
+ ```
369
+
370
+ 2. Register with ClassFactory:
371
+ ```typescript
372
+ MJGlobal.Instance.ClassFactory.Register(
373
+ BaseTestDriver,
374
+ 'WorkflowScenarioDriver',
375
+ WorkflowScenarioDriver
376
+ );
377
+ ```
378
+
379
+ 3. Create TestType record in database:
380
+ ```sql
381
+ INSERT INTO TestType (Name, DriverClass, Status)
382
+ VALUES ('Workflow Scenario', 'WorkflowScenarioDriver', 'Active');
383
+ ```
384
+
385
+ ### Adding New Oracles
386
+
387
+ 1. Implement `IOracle` interface:
388
+ ```typescript
389
+ export class CustomOracle implements IOracle {
390
+ readonly type = 'custom-check';
391
+
392
+ async evaluate(input: OracleInput, config: OracleConfig): Promise<OracleResult> {
393
+ // Your evaluation logic
394
+ }
395
+ }
396
+ ```
397
+
398
+ 2. Register with engine:
399
+ ```typescript
400
+ engine.registerOracle(new CustomOracle());
401
+ ```
402
+
403
+ ## Testing Best Practices
404
+
405
+ 1. **Use appropriate oracles** - Combine deterministic checks (schema, trace) with semantic checks (LLM judge)
406
+ 2. **Set realistic weights** - Balance different evaluation dimensions
407
+ 3. **Track costs** - Monitor LLM evaluation costs
408
+ 4. **Validate before execute** - Use `validateTest()` to catch config errors
409
+ 5. **Leverage traces** - Link TestRun to target entities for debugging
410
+
411
+ ## Future Enhancements
412
+
413
+ - [ ] Parallel test execution within suites
414
+ - [ ] Test retry logic with exponential backoff
415
+ - [ ] Result caching for expensive evaluations
416
+ - [ ] Composite oracles (AND/OR logic)
417
+ - [ ] Dataset-driven test execution (run same test with multiple inputs)
418
+ - [ ] Comparative evaluation (A/B testing different agent versions)
419
+ - [ ] Replay mode (re-evaluate existing agent runs)
package/dist/drivers/AgentEvalDriver.d.ts ADDED
@@ -0,0 +1,197 @@
1
+ /**
2
+ * @fileoverview Test driver for AI Agent evaluation
3
+ * @module @memberjunction/testing-engine
4
+ */
5
+ import { TestEntity } from '@memberjunction/core-entities';
6
+ import { BaseTestDriver } from './BaseTestDriver';
7
+ import { DriverExecutionContext, DriverExecutionResult, ValidationResult } from '../types';
8
+ /**
9
+ * Configuration for Agent Evaluation tests.
10
+ */
11
+ export interface AgentEvalConfig {
12
+ /**
13
+ * Agent to test
14
+ */
15
+ agentId: string;
16
+ /**
17
+ * Oracles to run for evaluation
18
+ */
19
+ oracles: {
20
+ type: string;
21
+ config?: Record<string, unknown>;
22
+ weight?: number;
23
+ }[];
24
+ /**
25
+ * Scoring weights by oracle type
26
+ */
27
+ scoringWeights?: Record<string, number>;
28
+ /**
29
+ * Maximum execution time in milliseconds
30
+ */
31
+ maxExecutionTime?: number;
32
+ }
33
+ /**
34
+ * Input definition for Agent Evaluation tests.
35
+ */
36
+ export interface AgentEvalInput {
37
+ /**
38
+ * User message to send to agent
39
+ */
40
+ userMessage: string;
41
+ /**
42
+ * Optional conversation context
43
+ */
44
+ conversationContext?: {
45
+ conversationId?: string;
46
+ priorMessages?: Array<{
47
+ role: 'user' | 'assistant';
48
+ content: string;
49
+ }>;
50
+ };
51
+ /**
52
+ * Optional agent execution parameters
53
+ */
54
+ executionParams?: {
55
+ modelOverride?: string;
56
+ temperatureOverride?: number;
57
+ maxTokensOverride?: number;
58
+ };
59
+ }
60
+ /**
61
+ * Expected outcomes for Agent Evaluation tests.
62
+ */
63
+ export interface AgentEvalExpectedOutcomes {
64
+ /**
65
+ * Expected response patterns (for regex matching)
66
+ */
67
+ responsePatterns?: string[];
68
+ /**
69
+ * Expected entities mentioned in response
70
+ */
71
+ expectedEntities?: string[];
72
+ /**
73
+ * Expected actions taken
74
+ */
75
+ expectedActions?: string[];
76
+ /**
77
+ * Schema for response validation
78
+ */
79
+ responseSchema?: Record<string, unknown>;
80
+ /**
81
+ * SQL queries to validate database state
82
+ */
83
+ sqlValidations?: Array<{
84
+ query: string;
85
+ expectedResult: unknown;
86
+ }>;
87
+ /**
88
+ * Custom validation criteria for LLM judge
89
+ */
90
+ judgeValidationCriteria?: string[];
91
+ }
92
+ /**
93
+ * Test driver for AI Agent evaluation.
94
+ *
95
+ * Executes an AI agent with test input, runs configured oracles,
96
+ * and creates bidirectional link between TestRun and AgentRun.
97
+ *
98
+ * @example
99
+ * ```typescript
100
+ * // Configuration JSON in Test entity
101
+ * {
102
+ * "agentId": "agent-123",
103
+ * "oracles": [
104
+ * { "type": "trace-no-errors", "weight": 0.2 },
105
+ * { "type": "llm-judge", "weight": 0.5, "config": { "criteria": [...] } },
106
+ * { "type": "schema-validate", "weight": 0.3, "config": { "schema": {...} } }
107
+ * ],
108
+ * "scoringWeights": { "trace-no-errors": 0.2, "llm-judge": 0.5, "schema-validate": 0.3 }
109
+ * }
110
+ *
111
+ * // InputDefinition JSON in Test entity
112
+ * {
113
+ * "userMessage": "Create a report showing sales by region",
114
+ * "conversationContext": null,
115
+ * "executionParams": { "temperatureOverride": 0.3 }
116
+ * }
117
+ *
118
+ * // ExpectedOutcomes JSON in Test entity
119
+ * {
120
+ * "responsePatterns": ["sales.*region", "chart|graph"],
121
+ * "expectedEntities": ["Report", "Dashboard"],
122
+ * "responseSchema": { "type": "object", "properties": {...} },
123
+ * "judgeValidationCriteria": [
124
+ * "Response accurately answers the user's question",
125
+ * "Report includes proper data visualization",
126
+ * "Response is professional and clear"
127
+ * ]
128
+ * }
129
+ * ```
130
+ */
131
+ export declare class AgentEvalDriver extends BaseTestDriver {
132
+ /**
133
+ * Execute agent evaluation test.
134
+ *
135
+ * Steps:
136
+ * 1. Parse configuration and input
137
+ * 2. Load and execute agent via AgentRunner
138
+ * 3. Create bidirectional link (TestRun ↔ AgentRun)
139
+ * 4. Run oracles to evaluate results
140
+ * 5. Calculate score and determine status
141
+ * 6. Return structured results
142
+ *
143
+ * @param context - Execution context
144
+ * @returns Execution result
145
+ */
146
+ Execute(context: DriverExecutionContext): Promise<DriverExecutionResult>;
147
+ /**
148
+ * Validate agent evaluation test configuration.
149
+ *
150
+ * Checks:
151
+ * - Base validation (InputDefinition, ExpectedOutcomes, Configuration)
152
+ * - Agent ID is valid
153
+ * - At least one oracle is configured
154
+ * - Oracle types are registered
155
+ * - Scoring weights are valid
156
+ *
157
+ * @param test - Test entity to validate
158
+ * @returns Validation result
159
+ */
160
+ Validate(test: TestEntity): Promise<ValidationResult>;
161
+ /**
162
+ * Load agent entity.
163
+ * @private
164
+ */
165
+ private loadAgent;
166
+ /**
167
+ * Execute agent and return result.
168
+ * @private
169
+ */
170
+ private executeAgent;
171
+ /**
172
+ * Create bidirectional link between TestRun and AgentRun.
173
+ * @private
174
+ */
175
+ private linkTestRunToAgentRun;
176
+ /**
177
+ * Extract agent output from agent run.
178
+ * @private
179
+ */
180
+ private extractAgentOutput;
181
+ /**
182
+ * Run configured oracles.
183
+ * @private
184
+ */
185
+ private runOracles;
186
+ /**
187
+ * Calculate total cost from agent run.
188
+ * @private
189
+ */
190
+ private calculateTotalCost;
191
+ /**
192
+ * Calculate duration in milliseconds from agent run.
193
+ * @private
194
+ */
195
+ private calculateDurationMs;
196
+ }
197
+ //# sourceMappingURL=AgentEvalDriver.d.ts.map
package/dist/drivers/AgentEvalDriver.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"AgentEvalDriver.d.ts","sourceRoot":"","sources":["../../src/drivers/AgentEvalDriver.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAIH,OAAO,EAAmC,UAAU,EAAiB,MAAM,+BAA+B,CAAC;AAG3G,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAClD,OAAO,EACH,sBAAsB,EACtB,qBAAqB,EAGrB,gBAAgB,EAGnB,MAAM,UAAU,CAAC;AAElB;;GAEG;AACH,MAAM,WAAW,eAAe;IAC5B;;OAEG;IACH,OAAO,EAAE,MAAM,CAAC;IAEhB;;OAEG;IACH,OAAO,EAAE;QACL,IAAI,EAAE,MAAM,CAAC;QACb,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QACjC,MAAM,CAAC,EAAE,MAAM,CAAC;KACnB,EAAE,CAAC;IAEJ;;OAEG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAExC;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC7B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC3B;;OAEG;IACH,WAAW,EAAE,MAAM,CAAC;IAEpB;;OAEG;IACH,mBAAmB,CAAC,EAAE;QAClB,cAAc,CAAC,EAAE,MAAM,CAAC;QACxB,aAAa,CAAC,EAAE,KAAK,CAAC;YAClB,IAAI,EAAE,MAAM,GAAG,WAAW,CAAC;YAC3B,OAAO,EAAE,MAAM,CAAC;SACnB,CAAC,CAAC;KACN,CAAC;IAEF;;OAEG;IACH,eAAe,CAAC,EAAE;QACd,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,mBAAmB,CAAC,EAAE,MAAM,CAAC;QAC7B,iBAAiB,CAAC,EAAE,MAAM,CAAC;KAC9B,CAAC;CACL;AAED;;GAEG;AACH,MAAM,WAAW,yBAAyB;IACtC;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE5B;;OAEG;IACH,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAE5B;;OAEG;IACH,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAE3B;;OAEG;IACH,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IAEzC;;OAEG;IACH,cAAc,CAAC,EAAE,KAAK,CAAC;QACnB,KAAK,EAAE,MAAM,CAAC;QACd,cAAc,EAAE,OAAO,CAAC;KAC3B,CAAC,CAAC;IAEH;;OAEG;IACH,uBAAuB,CAAC,EAAE,MAAM,EAAE,CAAC;CACtC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AACH,qBACa,eAAgB,SAAQ,cAAc;IAC/C;;;;;;;;;;;;;OAaG;IACU,OAAO,CAAC,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC,qBAAqB,CAAC;IAwFrF;;;;;;;;;;;;OAYG;IACmB,QAAQ,CAAC,IAAI,EAAE,UAAU,GAAG,OAAO,CAAC,gBAAgB,CAAC;IAgF3E;;;OAGG;YACW,SAAS;IAMvB;;;OAGG;YACW,YAAY;IA0E1B;;;OAGG;YACW,qBAAqB;IAanC;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAS1B;;;OAGG;YACW,UAAU;IAyDxB;;;OAGG;IACH,OAAO,CAAC,kBAAkB;IAI1B;;;OAGG;IACH,OAAO,CAAC,mBAAmB;CAQ9B"}