@artemiskit/core 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +48 -0
- package/dist/adapters/factory.d.ts +23 -0
- package/dist/adapters/factory.d.ts.map +1 -0
- package/dist/adapters/index.d.ts +7 -0
- package/dist/adapters/index.d.ts.map +1 -0
- package/dist/adapters/registry.d.ts +56 -0
- package/dist/adapters/registry.d.ts.map +1 -0
- package/dist/adapters/types.d.ts +151 -0
- package/dist/adapters/types.d.ts.map +1 -0
- package/dist/artifacts/index.d.ts +6 -0
- package/dist/artifacts/index.d.ts.map +1 -0
- package/dist/artifacts/manifest.d.ts +19 -0
- package/dist/artifacts/manifest.d.ts.map +1 -0
- package/dist/artifacts/types.d.ts +368 -0
- package/dist/artifacts/types.d.ts.map +1 -0
- package/dist/evaluators/contains.d.ts +10 -0
- package/dist/evaluators/contains.d.ts.map +1 -0
- package/dist/evaluators/exact.d.ts +10 -0
- package/dist/evaluators/exact.d.ts.map +1 -0
- package/dist/evaluators/fuzzy.d.ts +10 -0
- package/dist/evaluators/fuzzy.d.ts.map +1 -0
- package/dist/evaluators/index.d.ts +24 -0
- package/dist/evaluators/index.d.ts.map +1 -0
- package/dist/evaluators/json-schema.d.ts +11 -0
- package/dist/evaluators/json-schema.d.ts.map +1 -0
- package/dist/evaluators/llm-grader.d.ts +11 -0
- package/dist/evaluators/llm-grader.d.ts.map +1 -0
- package/dist/evaluators/regex.d.ts +10 -0
- package/dist/evaluators/regex.d.ts.map +1 -0
- package/dist/evaluators/types.d.ts +29 -0
- package/dist/evaluators/types.d.ts.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +26021 -0
- package/dist/provenance/environment.d.ts +12 -0
- package/dist/provenance/environment.d.ts.map +1 -0
- package/dist/provenance/git.d.ts +9 -0
- package/dist/provenance/git.d.ts.map +1 -0
- package/dist/provenance/index.d.ts +6 -0
- package/dist/provenance/index.d.ts.map +1 -0
- package/dist/redaction/index.d.ts +3 -0
- package/dist/redaction/index.d.ts.map +1 -0
- package/dist/redaction/redactor.d.ts +79 -0
- package/dist/redaction/redactor.d.ts.map +1 -0
- package/dist/redaction/types.d.ts +120 -0
- package/dist/redaction/types.d.ts.map +1 -0
- package/dist/runner/executor.d.ts +11 -0
- package/dist/runner/executor.d.ts.map +1 -0
- package/dist/runner/index.d.ts +7 -0
- package/dist/runner/index.d.ts.map +1 -0
- package/dist/runner/runner.d.ts +13 -0
- package/dist/runner/runner.d.ts.map +1 -0
- package/dist/runner/types.d.ts +57 -0
- package/dist/runner/types.d.ts.map +1 -0
- package/dist/scenario/index.d.ts +7 -0
- package/dist/scenario/index.d.ts.map +1 -0
- package/dist/scenario/parser.d.ts +17 -0
- package/dist/scenario/parser.d.ts.map +1 -0
- package/dist/scenario/schema.d.ts +945 -0
- package/dist/scenario/schema.d.ts.map +1 -0
- package/dist/scenario/variables.d.ts +19 -0
- package/dist/scenario/variables.d.ts.map +1 -0
- package/dist/storage/factory.d.ts +13 -0
- package/dist/storage/factory.d.ts.map +1 -0
- package/dist/storage/index.d.ts +8 -0
- package/dist/storage/index.d.ts.map +1 -0
- package/dist/storage/local.d.ts +20 -0
- package/dist/storage/local.d.ts.map +1 -0
- package/dist/storage/supabase.d.ts +21 -0
- package/dist/storage/supabase.d.ts.map +1 -0
- package/dist/storage/types.d.ts +86 -0
- package/dist/storage/types.d.ts.map +1 -0
- package/dist/utils/errors.d.ts +25 -0
- package/dist/utils/errors.d.ts.map +1 -0
- package/dist/utils/index.d.ts +6 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/logger.d.ts +21 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/package.json +56 -0
- package/src/adapters/factory.ts +75 -0
- package/src/adapters/index.ts +7 -0
- package/src/adapters/registry.ts +143 -0
- package/src/adapters/types.ts +184 -0
- package/src/artifacts/index.ts +6 -0
- package/src/artifacts/manifest.test.ts +206 -0
- package/src/artifacts/manifest.ts +136 -0
- package/src/artifacts/types.ts +426 -0
- package/src/evaluators/contains.test.ts +58 -0
- package/src/evaluators/contains.ts +41 -0
- package/src/evaluators/exact.test.ts +48 -0
- package/src/evaluators/exact.ts +33 -0
- package/src/evaluators/fuzzy.test.ts +50 -0
- package/src/evaluators/fuzzy.ts +39 -0
- package/src/evaluators/index.ts +53 -0
- package/src/evaluators/json-schema.ts +98 -0
- package/src/evaluators/llm-grader.ts +100 -0
- package/src/evaluators/regex.test.ts +73 -0
- package/src/evaluators/regex.ts +43 -0
- package/src/evaluators/types.ts +37 -0
- package/src/index.ts +31 -0
- package/src/provenance/environment.ts +18 -0
- package/src/provenance/git.ts +48 -0
- package/src/provenance/index.ts +6 -0
- package/src/redaction/index.ts +23 -0
- package/src/redaction/redactor.test.ts +258 -0
- package/src/redaction/redactor.ts +246 -0
- package/src/redaction/types.ts +135 -0
- package/src/runner/executor.ts +251 -0
- package/src/runner/index.ts +7 -0
- package/src/runner/runner.ts +153 -0
- package/src/runner/types.ts +60 -0
- package/src/scenario/index.ts +7 -0
- package/src/scenario/parser.test.ts +99 -0
- package/src/scenario/parser.ts +108 -0
- package/src/scenario/schema.ts +176 -0
- package/src/scenario/variables.test.ts +150 -0
- package/src/scenario/variables.ts +60 -0
- package/src/storage/factory.ts +52 -0
- package/src/storage/index.ts +8 -0
- package/src/storage/local.test.ts +165 -0
- package/src/storage/local.ts +194 -0
- package/src/storage/supabase.ts +151 -0
- package/src/storage/types.ts +98 -0
- package/src/utils/errors.ts +76 -0
- package/src/utils/index.ts +6 -0
- package/src/utils/logger.ts +59 -0
- package/tsconfig.json +13 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Test case executor
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { CaseRedactionInfo, CaseResult } from '../artifacts/types';
|
|
6
|
+
import { getEvaluator } from '../evaluators';
|
|
7
|
+
import { type RedactionConfig, Redactor } from '../redaction';
|
|
8
|
+
import type { TestCase } from '../scenario/schema';
|
|
9
|
+
import { mergeVariables, substituteVariables } from '../scenario/variables';
|
|
10
|
+
import type { ExecutorContext } from './types';
|
|
11
|
+
|
|
12
|
+
/**
 * Merge redaction configs with priority: CLI > case > scenario.
 *
 * The highest-priority config that decides `enabled` also decides which
 * configs may contribute the remaining fields:
 * - CLI sets `enabled` explicitly: fields fall back CLI -> case -> scenario.
 * - otherwise the case sets `enabled` explicitly: fields fall back case -> scenario
 *   (the CLI config is ignored entirely).
 * - otherwise the scenario has a truthy `enabled`: only scenario fields are used.
 * - otherwise redaction is disabled and built-in defaults are returned.
 *
 * @param scenarioConfig - Scenario-level redaction settings (lowest priority).
 * @param caseConfig - Case-level redaction settings.
 * @param cliConfig - CLI-level redaction settings (highest priority).
 * @returns A config in which `enabled`, `redactPrompts`, `redactResponses`,
 *   `redactMetadata` and `replacement` are always populated.
 */
function mergeRedactionConfig(
  scenarioConfig?: RedactionConfig,
  caseConfig?: RedactionConfig,
  cliConfig?: RedactionConfig
): RedactionConfig {
  // Configs allowed to contribute field values, highest priority first.
  let layers: Array<RedactionConfig | undefined>;
  let enabled: RedactionConfig['enabled'];

  if (cliConfig?.enabled !== undefined) {
    enabled = cliConfig.enabled;
    layers = [cliConfig, caseConfig, scenarioConfig];
  } else if (caseConfig?.enabled !== undefined) {
    enabled = caseConfig.enabled;
    layers = [caseConfig, scenarioConfig];
  } else if (scenarioConfig?.enabled) {
    enabled = scenarioConfig.enabled;
    layers = [scenarioConfig];
  } else {
    // Nothing enabled redaction: report the defaults (no `patterns` key).
    return {
      enabled: false,
      redactPrompts: true,
      redactResponses: true,
      redactMetadata: false,
      replacement: '[REDACTED]',
    };
  }

  // First non-nullish value for `key` across the contributing layers.
  const pick = <K extends keyof RedactionConfig>(key: K): RedactionConfig[K] | undefined => {
    for (const layer of layers) {
      const value = layer?.[key];
      if (value !== undefined && value !== null) {
        return value;
      }
    }
    return undefined;
  };

  return {
    enabled,
    patterns: pick('patterns'),
    redactPrompts: pick('redactPrompts') ?? true,
    redactResponses: pick('redactResponses') ?? true,
    redactMetadata: pick('redactMetadata') ?? false,
    replacement: pick('replacement') ?? '[REDACTED]',
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Execute a single test case, retrying failed attempts with exponential backoff.
 *
 * An attempt fails when `executeCaseAttempt` throws (for example on a timeout).
 * Between attempts the function waits 1s, 2s, 4s, ... (`2 ** attempt` seconds).
 * It never throws for a failing case: once every attempt has failed it resolves
 * with a failed `CaseResult` whose `error`/`reason` carry the last error message.
 *
 * @param testCase - The case to run.
 * @param context - Client, scenario and per-case execution settings.
 *   `context.retries` is the number of additional attempts after the first;
 *   missing, negative or non-finite values are treated as 0 so that at least
 *   one attempt always runs.
 * @returns The result of the first successful attempt, or a failed result.
 */
export async function executeCase(
  testCase: TestCase,
  context: ExecutorContext
): Promise<CaseResult> {
  const { timeout } = context;
  const requestedRetries = context.retries ?? 0;
  const retries = Number.isFinite(requestedRetries)
    ? Math.max(0, Math.floor(requestedRetries))
    : 0;
  const caseStartTime = Date.now();

  let lastError: Error | undefined;

  for (let attempt = 0; attempt <= retries; attempt++) {
    try {
      return await executeCaseAttempt(testCase, context, timeout);
    } catch (error: unknown) {
      // Anything can be thrown; normalise so `.message` is always meaningful.
      lastError = error instanceof Error ? error : new Error(String(error));
      if (attempt < retries) {
        // Wait before retry with exponential backoff
        await sleep(2 ** attempt * 1000);
      }
    }
  }

  // All attempts failed. Latency covers every attempt plus the backoff waits.
  const latencyMs = Date.now() - caseStartTime;
  const failureMessage = lastError?.message ?? 'Unknown error';

  return {
    id: testCase.id,
    name: testCase.name,
    ok: false,
    score: 0,
    matcherType: testCase.expected.type,
    reason: `Failed after ${retries + 1} attempts: ${failureMessage}`,
    latencyMs,
    tokens: { prompt: 0, completion: 0, total: 0 },
    prompt: testCase.prompt,
    response: '',
    expected: testCase.expected,
    tags: testCase.tags,
    error: failureMessage,
  };
}
|
|
125
|
+
|
|
126
|
+
/**
 * Run one attempt of a test case: build the prompt, call the model, evaluate
 * the response and apply redaction to what gets recorded.
 *
 * Evaluation always runs on the unredacted model response. The prompt stored
 * in the result is `testCase.prompt` as authored (before variable
 * substitution and without the scenario system prompt), optionally redacted.
 *
 * @param testCase - The case to run.
 * @param context - Supplies the model client, the scenario and CLI redaction.
 * @param timeout - Optional limit in milliseconds for the model call; a falsy
 *   value (undefined or 0) disables the limit.
 * @returns The evaluated (and possibly redacted) case result.
 * @throws Error `Timeout after <ms>ms` when the model call exceeds `timeout`;
 *   any error thrown by the client or the evaluator propagates unchanged.
 */
async function executeCaseAttempt(
  testCase: TestCase,
  context: ExecutorContext,
  timeout?: number
): Promise<CaseResult> {
  const { client, scenario, redaction: cliRedaction } = context;

  // Merge scenario-level and case-level variables (case overrides scenario)
  const variables = mergeVariables(scenario.variables, testCase.variables);

  // Apply variable substitution to prompt
  let prompt = substituteVariables(testCase.prompt, variables);

  // Build prompt with system prompt if present (only for plain string prompts)
  if (scenario.setup?.systemPrompt && typeof prompt === 'string') {
    const systemPrompt = substituteVariables(scenario.setup.systemPrompt, variables);
    prompt = [
      { role: 'system' as const, content: systemPrompt },
      { role: 'user' as const, content: prompt },
    ];
  }

  // Generate response with optional timeout
  const generatePromise = client.generate({
    prompt,
    model: testCase.model || scenario.model,
    temperature: scenario.temperature,
    maxTokens: scenario.maxTokens,
    seed: scenario.seed,
  });

  const result = timeout ? await withTimeout(generatePromise, timeout) : await generatePromise;

  // Evaluate response
  const evaluator = getEvaluator(testCase.expected.type);
  const evalResult = await evaluator.evaluate(result.text, testCase.expected, {
    client,
    testCase,
  });

  // Determine effective redaction config (CLI > case > scenario)
  const effectiveRedaction = mergeRedactionConfig(
    scenario.redaction,
    testCase.redaction,
    cliRedaction
  );

  // Apply redaction if enabled
  let finalPrompt: string | object = testCase.prompt;
  let finalResponse = result.text;
  let redactionInfo: CaseRedactionInfo | undefined;

  if (effectiveRedaction.enabled) {
    const redactor = new Redactor(effectiveRedaction);

    let promptRedacted = false;
    let responseRedacted = false;
    let totalRedactions = 0;

    // Redact prompt if configured
    if (effectiveRedaction.redactPrompts) {
      if (typeof finalPrompt === 'string') {
        const promptResult = redactor.redactPrompt(finalPrompt);
        finalPrompt = promptResult.text;
        promptRedacted = promptResult.wasRedacted;
        totalRedactions += promptResult.redactionCount;
      } else if (Array.isArray(finalPrompt)) {
        // Handle chat message array: redact each message with string content
        finalPrompt = finalPrompt.map((msg) => {
          if (typeof msg === 'object' && 'content' in msg && typeof msg.content === 'string') {
            const promptResult = redactor.redactPrompt(msg.content);
            if (promptResult.wasRedacted) {
              promptRedacted = true;
              totalRedactions += promptResult.redactionCount;
            }
            return { ...msg, content: promptResult.text };
          }
          return msg;
        });
      }
    }

    // Redact response if configured
    if (effectiveRedaction.redactResponses) {
      const responseResult = redactor.redactResponse(finalResponse);
      finalResponse = responseResult.text;
      responseRedacted = responseResult.wasRedacted;
      totalRedactions += responseResult.redactionCount;
    }

    redactionInfo = {
      redacted: promptRedacted || responseRedacted,
      promptRedacted,
      responseRedacted,
      redactionCount: totalRedactions,
    };
  }

  return {
    id: testCase.id,
    name: testCase.name,
    ok: evalResult.passed,
    score: evalResult.score,
    matcherType: testCase.expected.type,
    reason: evalResult.reason,
    latencyMs: result.latencyMs,
    tokens: result.tokens,
    prompt: finalPrompt,
    response: finalResponse,
    expected: testCase.expected,
    tags: testCase.tags,
    redaction: redactionInfo,
  };
}

/**
 * Race `promise` against a timer and always cancel the timer afterwards.
 *
 * Clearing the timer matters: a pending `setTimeout` keeps the Node.js event
 * loop alive, so an uncleared timer would delay process exit by up to `ms`
 * after every successful call.
 *
 * @param promise - The operation to bound.
 * @param ms - Time limit in milliseconds.
 * @returns The settled value of `promise` if it settles first.
 * @throws Error `Timeout after <ms>ms` if the timer fires first.
 */
function withTimeout<T>(promise: Promise<T>, ms: number): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  const expiry = new Promise<never>((_, reject) => {
    timer = setTimeout(() => reject(new Error(`Timeout after ${ms}ms`)), ms);
  });

  return Promise.race([promise, expiry]).finally(() => {
    if (timer !== undefined) {
      clearTimeout(timer);
    }
  });
}
|
|
248
|
+
|
|
249
|
+
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param ms - Delay passed to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scenario runner - main entry point for running test scenarios
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { createRunManifest } from '../artifacts/manifest';
|
|
6
|
+
import type { CaseResult, ManifestRedactionInfo } from '../artifacts/types';
|
|
7
|
+
import { Redactor } from '../redaction';
|
|
8
|
+
import { executeCase } from './executor';
|
|
9
|
+
import type { RunOptions, RunResult } from './types';
|
|
10
|
+
|
|
11
|
+
/**
 * Run a test scenario.
 *
 * Optionally filters the scenario's cases by tag, executes them sequentially
 * or in fixed-size batches, and assembles a run manifest from the results.
 *
 * @param options - Scenario, client and execution settings. `concurrency`
 *   is the batch size; values below 1 or NaN are treated as 1 so the run can
 *   neither hang nor silently execute nothing.
 * @returns The manifest, the per-case results and whether no case failed
 *   (`manifest.metrics.failed_cases === 0`).
 * @throws Error `No test cases to run after filtering` when nothing is left
 *   to execute.
 */
export async function runScenario(options: RunOptions): Promise<RunResult> {
  const {
    scenario,
    client,
    project = process.env.ARTEMIS_PROJECT || 'default',
    resolvedConfig,
    tags,
    concurrency = 1,
    timeout,
    retries,
    redaction,
    onCaseComplete,
    onProgress,
  } = options;

  // Filter cases by tags if specified (a case matches if it has any of the tags)
  let cases = scenario.cases;
  if (tags && tags.length > 0) {
    cases = cases.filter((c) => tags.some((tag) => c.tags.includes(tag)));
    onProgress?.(`Filtered to ${cases.length} cases by tags: ${tags.join(', ')}`);
  }

  if (cases.length === 0) {
    throw new Error('No test cases to run after filtering');
  }

  onProgress?.(`Running ${cases.length} test cases...`);

  const startTime = new Date();
  const results: CaseResult[] = [];

  // Per-case settings take precedence over the run-level ones.
  const runCase = (testCase: (typeof cases)[number]): Promise<CaseResult> =>
    executeCase(testCase, {
      client,
      scenario,
      timeout: testCase.timeout || timeout,
      retries: testCase.retries ?? retries,
      redaction,
    });

  // A batch size of 0, a negative number or NaN would make the chunking loop
  // spin forever or drop every case, so normalise it to a positive integer
  // no larger than the number of cases.
  const requestedBatch = Number.isNaN(concurrency) ? 1 : Math.floor(concurrency);
  const batchSize = Math.min(Math.max(1, requestedBatch), cases.length);

  if (batchSize === 1) {
    // Sequential execution: the callback index is the case's position.
    for (const [index, testCase] of cases.entries()) {
      const result = await runCase(testCase);
      results.push(result);
      onCaseComplete?.(result, index, cases.length);
    }
  } else {
    // Concurrent execution with limited parallelism: each batch must finish
    // completely before the next one starts.
    let completed = 0;

    for (const chunk of chunkArray(cases, batchSize)) {
      const chunkResults = await Promise.all(
        chunk.map(async (testCase) => {
          const result = await runCase(testCase);
          completed++;
          // Here the callback index reflects completion order, not position.
          onCaseComplete?.(result, completed - 1, cases.length);
          return result;
        })
      );
      // Promise.all preserves input order, so `results` stays in case order.
      results.push(...chunkResults);
    }
  }

  const endTime = new Date();

  // Calculate redaction metadata if any redaction occurred
  // NOTE(review): this uses the CLI config whenever one is provided and never
  // looks at case-level redaction, while executeCase merges CLI/case/scenario
  // per case — confirm the manifest summary is meant to be derived this way.
  let redactionInfo: ManifestRedactionInfo | undefined;
  const effectiveRedaction = redaction ?? scenario.redaction;

  if (effectiveRedaction?.enabled) {
    const redactor = new Redactor(effectiveRedaction);
    const promptsRedacted = results.filter((r) => r.redaction?.promptRedacted).length;
    const responsesRedacted = results.filter((r) => r.redaction?.responseRedacted).length;
    const totalRedactions = results.reduce((sum, r) => sum + (r.redaction?.redactionCount ?? 0), 0);

    redactionInfo = {
      enabled: true,
      patternsUsed: redactor.patternNames,
      replacement: redactor.replacement,
      summary: {
        promptsRedacted,
        responsesRedacted,
        totalRedactions,
      },
    };
  }

  // Create manifest
  const manifest = createRunManifest({
    project,
    config: {
      scenario: scenario.name,
      provider: client.provider,
      model: resolvedConfig?.model || scenario.model,
      temperature: resolvedConfig?.temperature ?? scenario.temperature,
      seed: scenario.seed,
    },
    resolvedConfig,
    cases: results,
    startTime,
    endTime,
    redaction: redactionInfo,
  });

  const success = manifest.metrics.failed_cases === 0;

  return {
    manifest,
    cases: results,
    success,
  };
}
|
|
132
|
+
|
|
133
|
+
/**
 * Run multiple scenarios one after another.
 *
 * Scenarios run strictly sequentially, in the order given. If one
 * `runScenario` call rejects, the rejection propagates and the remaining
 * scenarios are not started.
 *
 * @param optionsList - One `RunOptions` per scenario to run.
 * @returns The run results, in the same order as `optionsList`.
 */
export async function runScenarios(optionsList: RunOptions[]): Promise<RunResult[]> {
  const results: RunResult[] = [];

  for (const options of optionsList) {
    results.push(await runScenario(options));
  }

  return results;
}
|
|
146
|
+
|
|
147
|
+
/**
 * Split `array` into consecutive chunks of at most `size` elements.
 *
 * @param array - Items to split; not modified.
 * @param size - Maximum chunk length; must be at least 1.
 * @returns The chunks in original order; empty when `array` is empty.
 * @throws RangeError when `size` is NaN or below 1, since the stepping loop
 *   could otherwise never advance or would drop the input.
 */
function chunkArray<T>(array: T[], size: number): T[][] {
  if (Number.isNaN(size) || size < 1) {
    throw new RangeError(`Chunk size must be at least 1, received ${size}`);
  }

  const chunks: T[][] = [];
  for (let start = 0; start < array.length; start += size) {
    chunks.push(array.slice(start, start + size));
  }
  return chunks;
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Runner types and interfaces
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { ModelClient } from '../adapters/types';
|
|
6
|
+
import type { CaseResult, ResolvedConfig, RunManifest } from '../artifacts/types';
|
|
7
|
+
import type { RedactionConfig } from '../redaction/types';
|
|
8
|
+
import type { Scenario } from '../scenario/schema';
|
|
9
|
+
|
|
10
|
+
/**
 * Options for running a scenario
 */
export interface RunOptions {
  /** The scenario to run */
  scenario: Scenario;
  /** Model client to use */
  client: ModelClient;
  /** Project name for the manifest */
  project?: string;
  /** Resolved configuration with source tracking */
  resolvedConfig?: ResolvedConfig;
  /** Filter cases by tags */
  tags?: string[];
  /** Number of concurrent requests */
  concurrency?: number;
  /** Timeout per case in milliseconds */
  timeout?: number;
  /** Number of retries per case */
  retries?: number;
  /** Redaction configuration (CLI overrides scenario) */
  redaction?: RedactionConfig;
  /** Callback for each case result */
  onCaseComplete?: (result: CaseResult, index: number, total: number) => void;
  /** Callback for progress updates */
  onProgress?: (message: string) => void;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Result of a scenario run
 */
export interface RunResult {
  /** The generated manifest */
  manifest: RunManifest;
  /** Individual case results */
  cases: CaseResult[];
  /** Whether all cases passed */
  success: boolean;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Context passed to case executor
 */
export interface ExecutorContext {
  /** Model client used to generate the response; also handed to the evaluator */
  client: ModelClient;
  /** Scenario the case belongs to (source of variables, setup, model settings and redaction) */
  scenario: Scenario;
  /** Time limit for the model call in milliseconds; a falsy value disables it */
  timeout?: number;
  /** Number of additional attempts after a failed one (defaults to 0) */
  retries?: number;
  /** Redaction configuration for this execution */
  redaction?: RedactionConfig;
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tests for scenario parser
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { describe, expect, test } from 'bun:test';
|
|
6
|
+
import { parseScenarioString, validateScenario } from './parser';
|
|
7
|
+
|
|
8
|
+
// Covers YAML -> Scenario parsing, including schema validation failures.
describe('parseScenarioString', () => {
  test('parses valid YAML scenario', () => {
    const yaml = `
name: Test Scenario
description: A test scenario
version: "1.0"
provider: openai
model: gpt-4
cases:
  - id: test1
    prompt: "Hello"
    expected:
      type: exact
      value: "Hi"
`;
    const scenario = parseScenarioString(yaml);
    expect(scenario.name).toBe('Test Scenario');
    expect(scenario.cases.length).toBe(1);
    expect(scenario.cases[0].id).toBe('test1');
  });

  // The document below is well-formed YAML; it is rejected because `cases`
  // is a string rather than a list.
  test('throws on invalid YAML', () => {
    const yaml = `
name: Test
cases: not an array
`;
    expect(() => parseScenarioString(yaml)).toThrow();
  });

  test('throws on missing required fields', () => {
    const yaml = `
description: Missing name and cases
`;
    expect(() => parseScenarioString(yaml)).toThrow();
  });

  test('parses different expected types', () => {
    const yaml = `
name: Multi-matcher Test
cases:
  - id: regex-test
    prompt: "Test"
    expected:
      type: regex
      pattern: "^\\\\w+$"
  - id: fuzzy-test
    prompt: "Test"
    expected:
      type: fuzzy
      value: "test"
      threshold: 0.8
  - id: contains-test
    prompt: "Test"
    expected:
      type: contains
      values:
        - foo
        - bar
      mode: all
`;
    const scenario = parseScenarioString(yaml);
    expect(scenario.cases.length).toBe(3);
    expect(scenario.cases[0].expected.type).toBe('regex');
    expect(scenario.cases[1].expected.type).toBe('fuzzy');
    expect(scenario.cases[2].expected.type).toBe('contains');
  });
});
|
|
75
|
+
|
|
76
|
+
// Covers validation of already-parsed scenario objects.
describe('validateScenario', () => {
  test('validates correct scenario object', () => {
    const scenario = {
      name: 'Test',
      cases: [
        {
          id: 'test1',
          prompt: 'Hello',
          expected: { type: 'exact', value: 'Hi' },
        },
      ],
    };
    const validated = validateScenario(scenario);
    expect(validated.name).toBe('Test');
  });

  // An empty `cases` list is expected to be rejected.
  test('throws on invalid scenario', () => {
    const scenario = {
      name: 'Test',
      cases: [],
    };
    expect(() => validateScenario(scenario)).toThrow();
  });
});
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Scenario parser for YAML files
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { readFile } from 'node:fs/promises';
|
|
6
|
+
import { parse as parseYaml } from 'yaml';
|
|
7
|
+
import { ArtemisError } from '../utils/errors';
|
|
8
|
+
import { type Scenario, ScenarioSchema } from './schema';
|
|
9
|
+
|
|
10
|
+
/**
 * Expand environment variables in config values.
 * Supports ${VAR} and ${VAR:-default} syntax.
 *
 * Strings are expanded, arrays and plain objects are walked recursively
 * (producing new containers), and every other value is returned unchanged.
 * A variable that is unset or empty is replaced by its default, or by the
 * empty string when no default is given. Only the first `:-` separates the
 * name from the default, so the default itself may contain `:-`.
 *
 * @param obj - Any parsed config value.
 * @returns The value with all `${...}` references in strings expanded.
 */
function expandEnvVars(obj: unknown): unknown {
  if (typeof obj === 'string') {
    return obj.replace(/\$\{([^}]+)\}/g, (_match, expr: string) => {
      const separatorIndex = expr.indexOf(':-');
      const varName = separatorIndex === -1 ? expr : expr.slice(0, separatorIndex);
      const defaultValue = separatorIndex === -1 ? '' : expr.slice(separatorIndex + 2);
      // `||` on purpose: an empty variable also falls back to the default.
      return process.env[varName] || defaultValue;
    });
  }

  if (Array.isArray(obj)) {
    return obj.map(expandEnvVars);
  }

  if (obj && typeof obj === 'object') {
    const result: Record<string, unknown> = {};
    for (const [key, value] of Object.entries(obj)) {
      result[key] = expandEnvVars(value);
    }
    return result;
  }

  return obj;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Parse a scenario from a YAML file.
 *
 * @param filePath - Path of the YAML file; read as UTF-8.
 * @returns The validated scenario.
 * @throws ArtemisError `SCENARIO_READ_ERROR` (with the original error as
 *   `cause`) when reading fails; `ArtemisError`s raised while parsing the
 *   content are rethrown unchanged.
 */
export async function parseScenarioFile(filePath: string): Promise<Scenario> {
  try {
    const content = await readFile(filePath, 'utf-8');
    // The path is passed along so parse errors can name the offending file.
    return parseScenarioString(content, filePath);
  } catch (error) {
    if (error instanceof ArtemisError) {
      throw error;
    }
    throw new ArtemisError(`Failed to read scenario file: ${filePath}`, 'SCENARIO_READ_ERROR', {
      cause: error as Error,
    });
  }
}
|
|
53
|
+
|
|
54
|
+
/**
 * Parse a scenario from a YAML string.
 *
 * The YAML is parsed, `${VAR}` / `${VAR:-default}` references are expanded
 * from the environment, and the result is validated against `ScenarioSchema`.
 *
 * @param content - YAML text of the scenario.
 * @param source - Optional label (e.g. a file path) included in error messages.
 * @returns The validated scenario.
 * @throws ArtemisError `SCENARIO_VALIDATION_ERROR` listing every schema issue,
 *   or `SCENARIO_PARSE_ERROR` (with the original error as `cause`) for any
 *   other failure such as malformed YAML.
 */
export function parseScenarioString(content: string, source?: string): Scenario {
  try {
    const raw = parseYaml(content);

    // Expand environment variables before validation
    const expanded = expandEnvVars(raw);

    const result = ScenarioSchema.safeParse(expanded);

    if (!result.success) {
      // Issues on the document itself have an empty path; label them so the
      // line does not read " - : message".
      const issues = result.error.issues
        .map((issue) => ` - ${issue.path.join('.') || '(root)'}: ${issue.message}`)
        .join('\n');

      throw new ArtemisError(
        `Invalid scenario${source ? ` in ${source}` : ''}:\n${issues}`,
        'SCENARIO_VALIDATION_ERROR',
        { zodError: result.error }
      );
    }

    return result.data;
  } catch (error) {
    // Validation errors thrown above pass through untouched.
    if (error instanceof ArtemisError) {
      throw error;
    }
    throw new ArtemisError(
      `Failed to parse scenario YAML${source ? ` from ${source}` : ''}`,
      'SCENARIO_PARSE_ERROR',
      { cause: error as Error }
    );
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Validate a scenario object against `ScenarioSchema`.
 *
 * Unlike `parseScenarioString`, no YAML parsing or environment-variable
 * expansion takes place; the value is validated as given.
 *
 * @param scenario - Candidate scenario, typically already-parsed data.
 * @returns The validated scenario as produced by the schema.
 * @throws ArtemisError `SCENARIO_VALIDATION_ERROR` listing every schema issue.
 */
export function validateScenario(scenario: unknown): Scenario {
  const result = ScenarioSchema.safeParse(scenario);

  if (!result.success) {
    // Issues on the value itself have an empty path; label them so the line
    // does not read " - : message".
    const issues = result.error.issues
      .map((issue) => ` - ${issue.path.join('.') || '(root)'}: ${issue.message}`)
      .join('\n');

    throw new ArtemisError(`Invalid scenario:\n${issues}`, 'SCENARIO_VALIDATION_ERROR', {
      zodError: result.error,
    });
  }

  return result.data;
}
|