@artemiskit/cli 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +106 -0
- package/bin/artemis.ts +0 -0
- package/dist/index.js +70954 -35881
- package/dist/src/cli.d.ts.map +1 -1
- package/dist/src/commands/compare.d.ts.map +1 -1
- package/dist/src/commands/init.d.ts.map +1 -1
- package/dist/src/commands/redteam.d.ts.map +1 -1
- package/dist/src/commands/run.d.ts.map +1 -1
- package/dist/src/commands/stress.d.ts.map +1 -1
- package/dist/src/config/loader.d.ts +3 -1
- package/dist/src/config/loader.d.ts.map +1 -1
- package/dist/src/config/schema.d.ts +8 -0
- package/dist/src/config/schema.d.ts.map +1 -1
- package/dist/src/ui/index.d.ts +3 -1
- package/dist/src/ui/index.d.ts.map +1 -1
- package/dist/src/ui/panels.d.ts +21 -0
- package/dist/src/ui/panels.d.ts.map +1 -1
- package/dist/src/ui/prompts.d.ts +92 -0
- package/dist/src/ui/prompts.d.ts.map +1 -0
- package/dist/src/utils/adapter.d.ts.map +1 -1
- package/dist/src/utils/update-checker.d.ts +31 -0
- package/dist/src/utils/update-checker.d.ts.map +1 -0
- package/package.json +6 -6
- package/src/cli.ts +22 -1
- package/src/commands/compare.ts +25 -0
- package/src/commands/init.ts +221 -77
- package/src/commands/redteam.ts +63 -10
- package/src/commands/run.ts +542 -137
- package/src/commands/stress.ts +76 -3
- package/src/config/loader.ts +5 -2
- package/src/config/schema.ts +1 -0
- package/src/ui/index.ts +19 -0
- package/src/ui/panels.ts +153 -5
- package/src/ui/prompts.ts +749 -0
- package/src/utils/adapter.ts +8 -0
- package/src/utils/update-checker.ts +121 -0
package/src/commands/run.ts
CHANGED
|
@@ -2,23 +2,32 @@
|
|
|
2
2
|
* Run command - Execute test scenarios
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
+
import { basename } from 'node:path';
|
|
5
6
|
import {
|
|
6
7
|
type RedactionConfig,
|
|
8
|
+
type RunManifest,
|
|
7
9
|
createAdapter,
|
|
8
10
|
parseScenarioFile,
|
|
11
|
+
resolveScenarioPaths,
|
|
9
12
|
runScenario,
|
|
10
13
|
} from '@artemiskit/core';
|
|
11
14
|
import chalk from 'chalk';
|
|
12
15
|
import { Command } from 'commander';
|
|
13
16
|
import { loadConfig } from '../config/loader.js';
|
|
17
|
+
import type { ArtemisConfig } from '../config/schema.js';
|
|
14
18
|
import {
|
|
15
19
|
createSpinner,
|
|
16
20
|
formatDuration,
|
|
17
21
|
getProviderErrorContext,
|
|
18
22
|
icons,
|
|
23
|
+
isInteractive,
|
|
19
24
|
isTTY,
|
|
20
25
|
padText,
|
|
26
|
+
promptModel,
|
|
27
|
+
promptProvider,
|
|
28
|
+
promptScenarios,
|
|
21
29
|
renderError,
|
|
30
|
+
renderFailureReason,
|
|
22
31
|
renderProgressBar,
|
|
23
32
|
renderSummaryPanel,
|
|
24
33
|
} from '../ui/index.js';
|
|
@@ -42,21 +51,322 @@ interface RunOptions {
|
|
|
42
51
|
config?: string;
|
|
43
52
|
redact?: boolean;
|
|
44
53
|
redactPatterns?: string[];
|
|
54
|
+
parallel?: number;
|
|
55
|
+
interactive?: boolean;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/**
 * Outcome of running one scenario file, as collected by the sequential and
 * parallel runners in this module.
 */
interface ScenarioRunResult {
  /** Path of the scenario file that was run. */
  scenarioPath: string;
  /**
   * Name declared in the parsed scenario; the runners fall back to the file's
   * basename when the run threw before a result was produced.
   */
  scenarioName: string;
  /** `success` flag reported by the core runner, or `false` when the run threw. */
  success: boolean;
  /**
   * Manifest returned by the core runner. Failure records created in this
   * module store an empty object cast to `RunManifest`, so `metrics`,
   * `run_id`, etc. may be missing on failed entries.
   */
  manifest: RunManifest;
  /** Message of the error that aborted the run, when one was captured. */
  error?: string;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Execute one scenario file without writing anything to the console.
 *
 * Used by the parallel runner, where per-case output from several scenarios
 * would interleave. Provider and model are resolved from the CLI options, the
 * scenario file and the config file (in that argument order), an adapter is
 * created from the resulting configuration, and the scenario is handed to the
 * core runner without any progress callbacks.
 *
 * @param scenarioPath - Path of the scenario file to parse and run.
 * @param options - Parsed CLI options for the `run` command.
 * @param config - Loaded config file, or `null` when none was found.
 * @returns The scenario's path and name plus the core runner's `success`
 *   flag and manifest.
 */
async function runSingleScenarioQuiet(
  scenarioPath: string,
  options: RunOptions,
  config: ArtemisConfig | null
): Promise<ScenarioRunResult> {
  const scenario = await parseScenarioFile(scenarioPath);

  // Precedence and source tracking: CLI > Scenario > Config > Default
  const providerChoice = resolveProviderWithSource(
    options.provider,
    scenario.provider,
    config?.provider
  );
  const modelChoice = resolveModelWithSource(options.model, scenario.model, config?.model);

  const built = buildAdapterConfig({
    provider: providerChoice.provider,
    model: modelChoice.model,
    providerSource: providerChoice.source,
    modelSource: modelChoice.source,
    scenarioConfig: scenario.providerConfig,
    fileConfig: config,
  });
  const client = await createAdapter(built.adapterConfig);

  // Redaction is opt-in via the CLI flag; prompts and responses are covered,
  // metadata is not.
  const redaction: RedactionConfig | undefined = options.redact
    ? {
        enabled: true,
        patterns: options.redactPatterns,
        redactPrompts: true,
        redactResponses: true,
        redactMetadata: false,
        replacement: '[REDACTED]',
      }
    : undefined;

  // CLI values may arrive as strings, so they are normalised before use.
  const concurrency = Number.parseInt(String(options.concurrency)) || 1;
  const timeout = options.timeout ? Number.parseInt(String(options.timeout)) : undefined;
  const retries = options.retries ? Number.parseInt(String(options.retries)) : undefined;
  const project = config?.project || process.env.ARTEMIS_PROJECT || 'default';

  // No onCaseComplete / onProgress callbacks: this path stays silent.
  const { success, manifest } = await runScenario({
    scenario,
    client,
    project,
    resolvedConfig: built.resolvedConfig,
    tags: options.tags,
    concurrency,
    timeout,
    retries,
    redaction,
  });

  return {
    scenarioPath,
    scenarioName: scenario.name,
    success,
    manifest,
  };
}
|
|
134
|
+
|
|
135
|
+
/**
 * Run a single scenario with live console output (sequential execution path).
 *
 * When several scenarios are being run, a banner with the scenario name and
 * file is printed and the spinner is left alone. When exactly one scenario is
 * being run, the spinner reports the provider connection and the redaction
 * settings are echoed. Each completed case is printed as an aligned row, with
 * a progress bar on a TTY and a `[done/total]` counter otherwise.
 *
 * @param scenarioPath - Path of the scenario file to parse and run.
 * @param options - Parsed CLI options for the `run` command.
 * @param config - Loaded config file, or `null` when none was found.
 * @param spinner - Shared spinner; only driven when `isMultiScenario` is false.
 * @param isMultiScenario - Whether this run is part of a multi-scenario batch.
 * @returns The scenario's path and name plus the core runner's `success`
 *   flag and manifest.
 * @throws Re-throws any error from parsing, adapter creation or the core
 *   runner. If adapter setup fails while the spinner is active, the spinner
 *   is failed first so it does not keep animating over the caller's output.
 */
async function runSingleScenario(
  scenarioPath: string,
  options: RunOptions,
  config: ArtemisConfig | null,
  spinner: ReturnType<typeof createSpinner>,
  isMultiScenario: boolean
): Promise<ScenarioRunResult> {
  // Parse scenario
  const scenario = await parseScenarioFile(scenarioPath);

  if (isMultiScenario) {
    console.log();
    console.log(chalk.bold.cyan(`━━━ ${scenario.name} ━━━`));
    console.log(chalk.dim(`File: ${basename(scenarioPath)}`));
    console.log();
  }

  // Resolve provider and model with precedence and source tracking:
  // CLI > Scenario > Config > Default
  const { provider, source: providerSource } = resolveProviderWithSource(
    options.provider,
    scenario.provider,
    config?.provider
  );
  const { model, source: modelSource } = resolveModelWithSource(
    options.model,
    scenario.model,
    config?.model
  );

  if (!isMultiScenario) {
    spinner.start(`Connecting to ${provider}...`);
  }

  // Build adapter config with full precedence chain and source tracking,
  // then create the client.
  const connect = async () => {
    const { adapterConfig, resolvedConfig } = buildAdapterConfig({
      provider,
      model,
      providerSource,
      modelSource,
      scenarioConfig: scenario.providerConfig,
      fileConfig: config,
    });
    return { client: await createAdapter(adapterConfig), resolvedConfig };
  };

  let connection: Awaited<ReturnType<typeof connect>>;
  try {
    connection = await connect();
  } catch (error) {
    // The spinner was started above; resolve it before the error propagates
    // so it is not left spinning while the caller reports the failure.
    if (!isMultiScenario) {
      spinner.fail(`Failed to connect to ${provider}`);
    }
    throw error;
  }
  const { client, resolvedConfig } = connection;

  if (!isMultiScenario) {
    spinner.succeed(`Connected to ${provider}`);
    console.log();
    console.log(chalk.bold(`Running scenario: ${scenario.name}`));
    console.log();
  }

  // Build redaction config from CLI options
  let redaction: RedactionConfig | undefined;
  if (options.redact) {
    redaction = {
      enabled: true,
      patterns: options.redactPatterns,
      redactPrompts: true,
      redactResponses: true,
      redactMetadata: false,
      replacement: '[REDACTED]',
    };
    if (!isMultiScenario) {
      console.log(
        chalk.dim(
          `Redaction enabled${options.redactPatterns ? ` with patterns: ${options.redactPatterns.join(', ')}` : ' (default patterns)'}`
        )
      );
      console.log();
    }
  }

  // Track progress
  const totalCases = scenario.cases.length;
  let completedCases = 0;

  // Column widths for alignment. reduce() with a 0 seed stays finite when the
  // scenario has no cases (Math.max() of an empty list is -Infinity).
  const maxIdLength = scenario.cases.reduce((widest, c) => Math.max(widest, c.id.length), 0);
  const maxScoreLength = 6; // "(100%)"
  const maxDurationLength = 6; // "10.0s" or "999ms"

  // Run scenario using core runner
  const result = await runScenario({
    scenario,
    client,
    project: config?.project || process.env.ARTEMIS_PROJECT || 'default',
    resolvedConfig,
    tags: options.tags,
    concurrency: Number.parseInt(String(options.concurrency)) || 1,
    timeout: options.timeout ? Number.parseInt(String(options.timeout)) : undefined,
    retries: options.retries ? Number.parseInt(String(options.retries)) : undefined,
    redaction,
    onCaseComplete: (caseResult) => {
      completedCases++;

      const statusIcon = caseResult.ok ? icons.passed : icons.failed;
      const scoreStr = `(${(caseResult.score * 100).toFixed(0)}%)`;
      const durationStr = caseResult.latencyMs ? formatDuration(caseResult.latencyMs) : '';

      // Pad columns for alignment
      const paddedId = padText(caseResult.id, maxIdLength);
      const paddedScore = padText(scoreStr, maxScoreLength, 'right');
      const paddedDuration = padText(durationStr, maxDurationLength, 'right');

      // Show result - with progress bar in TTY, simple format in CI/CD
      if (isTTY) {
        const progressBar = renderProgressBar(completedCases, totalCases, { width: 15 });
        console.log(
          `${statusIcon} ${paddedId} ${chalk.dim(paddedScore)} ${chalk.dim(paddedDuration)} ${progressBar}`
        );
      } else {
        // CI/CD friendly output - no progress bar, just count
        console.log(
          `${statusIcon} ${paddedId} ${chalk.dim(paddedScore)} ${chalk.dim(paddedDuration)} [${completedCases}/${totalCases}]`
        );
      }

      // Failure details are only shown in verbose mode.
      if (!caseResult.ok && options.verbose && caseResult.reason) {
        console.log(
          renderFailureReason(caseResult.reason, { matcherType: caseResult.matcherType })
        );
      }
    },
    onProgress: (message) => {
      if (options.verbose) {
        console.log(chalk.dim(message));
      }
    },
  });

  return {
    scenarioPath,
    scenarioName: scenario.name,
    success: result.success,
    manifest: result.manifest,
  };
}
|
|
274
|
+
|
|
275
|
+
/**
 * Run scenarios in parallel with a concurrency limit.
 *
 * Scenarios are taken from the front of the list and run through the quiet
 * runner, with at most `parallelLimit` in flight at once. One progress line
 * is printed per finished scenario (progress bar on a TTY, `[done/total]`
 * counter otherwise).
 *
 * Each scenario contributes exactly one entry to the returned array:
 * - a scenario whose run throws is recorded as failed, with the file's
 *   basename as its name, an empty manifest and the error message;
 * - a scenario whose manifest cannot be saved keeps its name and manifest but
 *   is marked failed with the save error, so the overall run still reports a
 *   failure without counting the scenario twice.
 *
 * @param scenarioPaths - Scenario files to run.
 * @param options - Parsed CLI options; `options.save` controls persistence.
 * @param config - Loaded config file, or `null` when none was found.
 * @param parallelLimit - Maximum scenarios in flight; values below 1 (or NaN)
 *   are treated as 1.
 * @param storage - Storage used to persist manifests when saving is enabled.
 * @returns One result per scenario, in completion order.
 */
async function runScenariosInParallel(
  scenarioPaths: string[],
  options: RunOptions,
  config: ArtemisConfig | null,
  parallelLimit: number,
  storage: ReturnType<typeof createStorage>
): Promise<ScenarioRunResult[]> {
  const results: ScenarioRunResult[] = [];
  let completedCount = 0;
  const totalCount = scenarioPaths.length;

  // A limit below 1 would never start a task and the scheduling loop below
  // would spin forever, so fall back to one task at a time.
  const limit = parallelLimit >= 1 ? Math.floor(parallelLimit) : 1;

  // Create a queue of scenario paths
  const queue = [...scenarioPaths];
  const inProgress = new Set<Promise<void>>();

  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Progress display function
  const updateProgress = (scenarioName: string, success: boolean) => {
    completedCount++;
    const icon = success ? icons.passed : icons.failed;
    const status = success ? chalk.green('passed') : chalk.red('failed');

    if (isTTY) {
      const progressBar = renderProgressBar(completedCount, totalCount, { width: 20 });
      console.log(`${icon} ${scenarioName} ${status} ${progressBar}`);
    } else {
      console.log(`${icon} ${scenarioName} ${status} [${completedCount}/${totalCount}]`);
    }
  };

  // Process a single scenario: run, optionally save, then record once.
  const processScenario = async (path: string): Promise<void> => {
    let outcome: ScenarioRunResult;

    try {
      outcome = await runSingleScenarioQuiet(path, options, config);
    } catch (error) {
      outcome = {
        scenarioPath: path,
        scenarioName: basename(path),
        success: false,
        manifest: {} as RunManifest,
        error: describeError(error),
      };
    }

    // Save results if enabled. Handled separately from the run so a storage
    // error cannot add a second entry (and a second progress tick) for a
    // scenario that has already been recorded.
    if (options.save && outcome.manifest.run_id) {
      try {
        await storage.save(outcome.manifest);
      } catch (error) {
        outcome = {
          ...outcome,
          success: false,
          error: `Failed to save results: ${describeError(error)}`,
        };
      }
    }

    results.push(outcome);
    updateProgress(outcome.scenarioName, outcome.success);
  };

  // Run with concurrency limit
  while (queue.length > 0 || inProgress.size > 0) {
    // Start new tasks up to the limit
    while (queue.length > 0 && inProgress.size < limit) {
      const path = queue.shift();
      if (path === undefined) {
        break;
      }
      const task: Promise<void> = processScenario(path).finally(() => {
        inProgress.delete(task);
      });
      inProgress.add(task);
    }

    // Wait for at least one task to complete
    if (inProgress.size > 0) {
      await Promise.race(inProgress);
    }
  }

  return results;
}
|
|
46
350
|
|
|
47
351
|
export function runCommand(): Command {
|
|
48
352
|
const cmd = new Command('run');
|
|
49
353
|
|
|
50
354
|
cmd
|
|
51
|
-
.description(
|
|
52
|
-
|
|
355
|
+
.description(
|
|
356
|
+
'Run test scenarios against an LLM. Accepts a file path, directory, or glob pattern.'
|
|
357
|
+
)
|
|
358
|
+
.argument(
|
|
359
|
+
'[scenario]',
|
|
360
|
+
'Path to scenario file, directory, or glob pattern (e.g., scenarios/**/*.yaml)'
|
|
361
|
+
)
|
|
53
362
|
.option('-p, --provider <provider>', 'Provider to use (openai, azure-openai, vercel-ai)')
|
|
54
363
|
.option('-m, --model <model>', 'Model to use')
|
|
55
364
|
.option('-o, --output <dir>', 'Output directory for results')
|
|
56
365
|
.option('-v, --verbose', 'Verbose output')
|
|
57
366
|
.option('-t, --tags <tags...>', 'Filter test cases by tags')
|
|
58
367
|
.option('--save', 'Save results to storage', true)
|
|
59
|
-
.option('-c, --concurrency <number>', 'Number of concurrent test cases', '1')
|
|
368
|
+
.option('-c, --concurrency <number>', 'Number of concurrent test cases per scenario', '1')
|
|
369
|
+
.option('--parallel <number>', 'Number of scenarios to run in parallel (default: sequential)')
|
|
60
370
|
.option('--timeout <ms>', 'Timeout per test case in milliseconds')
|
|
61
371
|
.option('--retries <number>', 'Number of retries per test case')
|
|
62
372
|
.option('--config <path>', 'Path to config file')
|
|
@@ -65,7 +375,8 @@ export function runCommand(): Command {
|
|
|
65
375
|
'--redact-patterns <patterns...>',
|
|
66
376
|
'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
|
|
67
377
|
)
|
|
68
|
-
.
|
|
378
|
+
.option('-i, --interactive', 'Enable interactive mode for scenario/provider selection')
|
|
379
|
+
.action(async (scenarioPath: string | undefined, options: RunOptions) => {
|
|
69
380
|
const spinner = createSpinner('Loading configuration...');
|
|
70
381
|
spinner.start();
|
|
71
382
|
|
|
@@ -73,161 +384,255 @@ export function runCommand(): Command {
|
|
|
73
384
|
// Load config file if present
|
|
74
385
|
const config = await loadConfig(options.config);
|
|
75
386
|
if (config) {
|
|
76
|
-
spinner.succeed(`Loaded config from ${
|
|
387
|
+
spinner.succeed(`Loaded config from ${config._path}`);
|
|
77
388
|
} else {
|
|
78
389
|
spinner.info('No config file found, using defaults');
|
|
79
390
|
}
|
|
80
391
|
|
|
81
|
-
//
|
|
82
|
-
|
|
83
|
-
const scenario = await parseScenarioFile(scenarioPath);
|
|
84
|
-
spinner.succeed(`Loaded scenario: ${scenario.name}`);
|
|
85
|
-
|
|
86
|
-
// Resolve provider and model with precedence and source tracking:
|
|
87
|
-
// CLI > Scenario > Config > Default
|
|
88
|
-
const { provider, source: providerSource } = resolveProviderWithSource(
|
|
89
|
-
options.provider,
|
|
90
|
-
scenario.provider,
|
|
91
|
-
config?.provider
|
|
92
|
-
);
|
|
93
|
-
const { model, source: modelSource } = resolveModelWithSource(
|
|
94
|
-
options.model,
|
|
95
|
-
scenario.model,
|
|
96
|
-
config?.model
|
|
97
|
-
);
|
|
392
|
+
// Determine if we should use interactive mode
|
|
393
|
+
const useInteractive = options.interactive || (!scenarioPath && isInteractive());
|
|
98
394
|
|
|
99
|
-
//
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
model,
|
|
104
|
-
providerSource,
|
|
105
|
-
modelSource,
|
|
106
|
-
scenarioConfig: scenario.providerConfig,
|
|
107
|
-
fileConfig: config,
|
|
108
|
-
});
|
|
109
|
-
const client = await createAdapter(adapterConfig);
|
|
110
|
-
spinner.succeed(`Connected to ${provider}`);
|
|
395
|
+
// Interactive provider/model selection if requested
|
|
396
|
+
if (useInteractive && !options.provider) {
|
|
397
|
+
spinner.stop();
|
|
398
|
+
console.log(chalk.cyan('\n Interactive mode enabled\n'));
|
|
111
399
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
console.log();
|
|
400
|
+
const provider = await promptProvider('Select a provider:');
|
|
401
|
+
options.provider = provider;
|
|
115
402
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
403
|
+
const model = await promptModel(provider, 'Select a model:');
|
|
404
|
+
options.model = model;
|
|
405
|
+
|
|
406
|
+
console.log(''); // spacing
|
|
407
|
+
spinner.start('Discovering scenarios...');
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
// If no scenario path provided, try to find scenarios or prompt
|
|
411
|
+
let resolvedScenarioPath = scenarioPath;
|
|
412
|
+
if (!resolvedScenarioPath) {
|
|
413
|
+
// Try default scenarios directory
|
|
414
|
+
const defaultPath = config?.scenariosDir || './scenarios';
|
|
415
|
+
spinner.start(`Looking for scenarios in ${defaultPath}...`);
|
|
416
|
+
|
|
417
|
+
try {
|
|
418
|
+
const defaultScenarios = await resolveScenarioPaths(defaultPath);
|
|
419
|
+
if (defaultScenarios.length > 0) {
|
|
420
|
+
spinner.stop();
|
|
421
|
+
|
|
422
|
+
if (useInteractive) {
|
|
423
|
+
// Let user select which scenarios to run
|
|
424
|
+
const scenarioChoices = await Promise.all(
|
|
425
|
+
defaultScenarios.map(async (path) => {
|
|
426
|
+
try {
|
|
427
|
+
const scenario = await parseScenarioFile(path);
|
|
428
|
+
return { path, name: scenario.name || basename(path) };
|
|
429
|
+
} catch {
|
|
430
|
+
return { path, name: basename(path) };
|
|
431
|
+
}
|
|
432
|
+
})
|
|
433
|
+
);
|
|
434
|
+
|
|
435
|
+
const selectedPaths = await promptScenarios(
|
|
436
|
+
scenarioChoices,
|
|
437
|
+
'Select scenarios to run:'
|
|
438
|
+
);
|
|
439
|
+
|
|
440
|
+
if (selectedPaths.length === 0) {
|
|
441
|
+
console.log(chalk.yellow('\nNo scenarios selected. Exiting.'));
|
|
442
|
+
process.exit(0);
|
|
443
|
+
}
|
|
444
|
+
|
|
445
|
+
// Use the first selected scenario or create a temp pattern
|
|
446
|
+
resolvedScenarioPath =
|
|
447
|
+
selectedPaths.length === 1 ? selectedPaths[0] : `{${selectedPaths.join(',')}}`;
|
|
448
|
+
|
|
449
|
+
console.log(''); // spacing
|
|
450
|
+
spinner.start('Preparing scenarios...');
|
|
451
|
+
} else {
|
|
452
|
+
spinner.succeed(`Found ${defaultScenarios.length} scenarios in ${defaultPath}`);
|
|
453
|
+
resolvedScenarioPath = defaultPath;
|
|
454
|
+
}
|
|
455
|
+
} else {
|
|
456
|
+
spinner.fail(`No scenarios found in ${defaultPath}`);
|
|
457
|
+
console.log();
|
|
458
|
+
console.log(chalk.yellow('Please provide a scenario path:'));
|
|
459
|
+
console.log(chalk.dim(' artemiskit run <path-to-scenario.yaml>'));
|
|
460
|
+
console.log(chalk.dim(' artemiskit run scenarios/'));
|
|
461
|
+
console.log(chalk.dim(' artemiskit run "scenarios/**/*.yaml"'));
|
|
462
|
+
process.exit(1);
|
|
463
|
+
}
|
|
464
|
+
} catch {
|
|
465
|
+
spinner.fail('No scenario path provided');
|
|
466
|
+
console.log();
|
|
467
|
+
console.log(chalk.yellow('Usage: artemiskit run <scenario>'));
|
|
468
|
+
console.log(chalk.dim(' <scenario> can be a file, directory, or glob pattern'));
|
|
469
|
+
process.exit(1);
|
|
470
|
+
}
|
|
471
|
+
}
|
|
472
|
+
|
|
473
|
+
// Resolve scenario paths (handles files, directories, and globs)
|
|
474
|
+
spinner.start('Discovering scenarios...');
|
|
475
|
+
const scenarioPaths = await resolveScenarioPaths(resolvedScenarioPath);
|
|
476
|
+
|
|
477
|
+
if (scenarioPaths.length === 0) {
|
|
478
|
+
spinner.fail('No scenario files found');
|
|
132
479
|
console.log();
|
|
480
|
+
console.log(chalk.yellow(`No .yaml or .yml files found matching: ${scenarioPath}`));
|
|
481
|
+
console.log(chalk.dim('Make sure the path exists and contains valid scenario files.'));
|
|
482
|
+
process.exit(1);
|
|
133
483
|
}
|
|
134
484
|
|
|
135
|
-
|
|
136
|
-
const
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
const
|
|
170
|
-
|
|
171
|
-
|
|
485
|
+
const isMultiScenario = scenarioPaths.length > 1;
|
|
486
|
+
const parallelLimit = options.parallel ? Number.parseInt(String(options.parallel)) : 0;
|
|
487
|
+
const runInParallel = parallelLimit > 0 && isMultiScenario;
|
|
488
|
+
|
|
489
|
+
if (isMultiScenario) {
|
|
490
|
+
const modeStr = runInParallel
|
|
491
|
+
? chalk.cyan(`parallel (${parallelLimit} concurrent)`)
|
|
492
|
+
: chalk.dim('sequential');
|
|
493
|
+
spinner.succeed(`Found ${scenarioPaths.length} scenario files`);
|
|
494
|
+
console.log();
|
|
495
|
+
console.log(chalk.bold(`Running ${scenarioPaths.length} scenarios ${modeStr}...`));
|
|
496
|
+
console.log();
|
|
497
|
+
} else {
|
|
498
|
+
spinner.succeed('Loaded scenario file');
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
// Run all scenarios
|
|
502
|
+
const storage = createStorage({ fileConfig: config });
|
|
503
|
+
let results: ScenarioRunResult[];
|
|
504
|
+
|
|
505
|
+
if (runInParallel) {
|
|
506
|
+
// Parallel execution
|
|
507
|
+
results = await runScenariosInParallel(
|
|
508
|
+
scenarioPaths,
|
|
509
|
+
options,
|
|
510
|
+
config,
|
|
511
|
+
parallelLimit,
|
|
512
|
+
storage
|
|
513
|
+
);
|
|
514
|
+
} else {
|
|
515
|
+
// Sequential execution
|
|
516
|
+
results = [];
|
|
517
|
+
for (const path of scenarioPaths) {
|
|
518
|
+
try {
|
|
519
|
+
const result = await runSingleScenario(
|
|
520
|
+
path,
|
|
521
|
+
options,
|
|
522
|
+
config,
|
|
523
|
+
spinner,
|
|
524
|
+
isMultiScenario
|
|
172
525
|
);
|
|
173
|
-
|
|
174
|
-
|
|
526
|
+
results.push(result);
|
|
527
|
+
|
|
528
|
+
// Display per-scenario summary
|
|
529
|
+
const summaryData = {
|
|
530
|
+
passed: result.manifest.metrics.passed_cases,
|
|
531
|
+
failed: result.manifest.metrics.failed_cases,
|
|
532
|
+
skipped: 0,
|
|
533
|
+
successRate: result.manifest.metrics.success_rate * 100,
|
|
534
|
+
duration: result.manifest.duration_ms,
|
|
535
|
+
title: isMultiScenario ? result.scenarioName.toUpperCase() : 'TEST RESULTS',
|
|
536
|
+
};
|
|
537
|
+
console.log();
|
|
538
|
+
console.log(renderSummaryPanel(summaryData));
|
|
539
|
+
|
|
540
|
+
// Show additional metrics
|
|
541
|
+
console.log();
|
|
175
542
|
console.log(
|
|
176
|
-
|
|
543
|
+
chalk.dim(
|
|
544
|
+
`Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
|
|
545
|
+
)
|
|
177
546
|
);
|
|
178
|
-
}
|
|
179
547
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
548
|
+
// Show redaction info if enabled
|
|
549
|
+
if (result.manifest.redaction?.enabled) {
|
|
550
|
+
const r = result.manifest.redaction;
|
|
551
|
+
console.log(
|
|
552
|
+
chalk.dim(
|
|
553
|
+
`Redactions: ${r.summary.totalRedactions} (${r.summary.promptsRedacted} prompts, ${r.summary.responsesRedacted} responses)`
|
|
554
|
+
)
|
|
555
|
+
);
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
// Save results
|
|
559
|
+
if (options.save) {
|
|
560
|
+
const savedPath = await storage.save(result.manifest);
|
|
561
|
+
console.log(chalk.dim(`Saved: ${savedPath}`));
|
|
562
|
+
}
|
|
563
|
+
} catch (error) {
|
|
564
|
+
// Record failed scenario
|
|
565
|
+
console.log();
|
|
566
|
+
console.log(chalk.red(`${icons.failed} Failed to run: ${basename(path)}`));
|
|
567
|
+
if (options.verbose) {
|
|
568
|
+
console.log(chalk.dim((error as Error).message));
|
|
569
|
+
}
|
|
570
|
+
results.push({
|
|
571
|
+
scenarioPath: path,
|
|
572
|
+
scenarioName: basename(path),
|
|
573
|
+
success: false,
|
|
574
|
+
manifest: {} as RunManifest,
|
|
575
|
+
});
|
|
187
576
|
}
|
|
188
|
-
}
|
|
189
|
-
}
|
|
577
|
+
}
|
|
578
|
+
}
|
|
190
579
|
|
|
191
|
-
// Display summary
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
)
|
|
209
|
-
|
|
580
|
+
// Display aggregate summary for multiple scenarios
|
|
581
|
+
if (isMultiScenario) {
|
|
582
|
+
console.log();
|
|
583
|
+
console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
|
|
584
|
+
console.log();
|
|
585
|
+
|
|
586
|
+
const totalScenarios = results.length;
|
|
587
|
+
const passedScenarios = results.filter((r) => r.success).length;
|
|
588
|
+
const failedScenarios = totalScenarios - passedScenarios;
|
|
589
|
+
|
|
590
|
+
const totalCases = results.reduce(
|
|
591
|
+
(sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
|
|
592
|
+
0
|
|
593
|
+
);
|
|
594
|
+
const passedCases = results.reduce(
|
|
595
|
+
(sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
|
|
596
|
+
0
|
|
597
|
+
);
|
|
598
|
+
const failedCases = results.reduce(
|
|
599
|
+
(sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
|
|
600
|
+
0
|
|
601
|
+
);
|
|
602
|
+
const totalDuration = results.reduce((sum, r) => sum + (r.manifest.duration_ms || 0), 0);
|
|
210
603
|
|
|
211
|
-
// Show redaction info if enabled
|
|
212
|
-
if (result.manifest.redaction?.enabled) {
|
|
213
|
-
const r = result.manifest.redaction;
|
|
214
604
|
console.log(
|
|
215
|
-
chalk.dim(
|
|
216
|
-
`Redactions: ${r.summary.totalRedactions} (${r.summary.promptsRedacted} prompts, ${r.summary.responsesRedacted} responses)`
|
|
217
|
-
)
|
|
605
|
+
`Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
|
|
218
606
|
);
|
|
219
|
-
|
|
607
|
+
console.log(
|
|
608
|
+
`Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
|
|
609
|
+
);
|
|
610
|
+
console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
|
|
220
611
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
612
|
+
if (runInParallel) {
|
|
613
|
+
console.log(
|
|
614
|
+
`Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
|
|
615
|
+
);
|
|
616
|
+
}
|
|
617
|
+
console.log();
|
|
618
|
+
|
|
619
|
+
// List failed scenarios
|
|
620
|
+
const failedResults = results.filter((r) => !r.success);
|
|
621
|
+
if (failedResults.length > 0) {
|
|
622
|
+
console.log(chalk.red('Failed scenarios:'));
|
|
623
|
+
for (const result of failedResults) {
|
|
624
|
+
console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
|
|
625
|
+
if (result.error && options.verbose) {
|
|
626
|
+
console.log(chalk.dim(` ${result.error}`));
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
console.log();
|
|
630
|
+
}
|
|
227
631
|
}
|
|
228
632
|
|
|
229
|
-
// Exit with error if any
|
|
230
|
-
|
|
633
|
+
// Exit with error if any scenarios failed
|
|
634
|
+
const hasFailures = results.some((r) => !r.success);
|
|
635
|
+
if (hasFailures) {
|
|
231
636
|
process.exit(1);
|
|
232
637
|
}
|
|
233
638
|
} catch (error) {
|