@artemiskit/cli 0.1.8 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,23 +2,33 @@
2
2
  * Run command - Execute test scenarios
3
3
  */
4
4
 
5
+ import { basename } from 'node:path';
5
6
  import {
7
+ type BaselineStorageAdapter,
6
8
  type RedactionConfig,
9
+ type RunManifest,
7
10
  createAdapter,
8
11
  parseScenarioFile,
12
+ resolveScenarioPaths,
9
13
  runScenario,
10
14
  } from '@artemiskit/core';
11
15
  import chalk from 'chalk';
12
16
  import { Command } from 'commander';
13
17
  import { loadConfig } from '../config/loader.js';
18
+ import type { ArtemisConfig } from '../config/schema.js';
14
19
  import {
15
20
  createSpinner,
16
21
  formatDuration,
17
22
  getProviderErrorContext,
18
23
  icons,
24
+ isInteractive,
19
25
  isTTY,
20
26
  padText,
27
+ promptModel,
28
+ promptProvider,
29
+ promptScenarios,
21
30
  renderError,
31
+ renderFailureReason,
22
32
  renderProgressBar,
23
33
  renderSummaryPanel,
24
34
  } from '../ui/index.js';
@@ -42,21 +52,493 @@ interface RunOptions {
42
52
  config?: string;
43
53
  redact?: boolean;
44
54
  redactPatterns?: string[];
55
+ parallel?: number;
56
+ interactive?: boolean;
57
+ /** CI mode - machine-readable output, no colors/spinners */
58
+ ci?: boolean;
59
+ /** Summary format: json, text, or security */
60
+ summary?: 'json' | 'text' | 'security';
61
+ /** Compare against baseline and detect regression */
62
+ baseline?: boolean;
63
+ /** Regression threshold (0-1), default 0.05 (5%) */
64
+ threshold?: number;
65
+ }
66
+
67
/**
 * Outcome of executing a single scenario file.
 */
interface ScenarioRunResult {
  /** Path of the scenario file that was executed. */
  scenarioPath: string;
  /** Scenario name; callers fall back to the file's basename when the run throws. */
  scenarioName: string;
  /** Whether the scenario run reported success. */
  success: boolean;
  /** Manifest produced by the runner (an empty placeholder object when the run throws). */
  manifest: RunManifest;
  /** Error message captured when the scenario could not be run. */
  error?: string;
}
74
+
75
/**
 * The spinner operations used by the run command.
 *
 * Both the real spinner and the no-op object substituted in CI mode are
 * used through this shape.
 */
interface SpinnerLike {
  /** Begin spinning, optionally replacing the displayed text. */
  start(text?: string): void;
  /** Stop spinning without printing a final status. */
  stop(): void;
  /** Finish with a success status. */
  succeed(text?: string): void;
  /** Finish with a failure status. */
  fail(text?: string): void;
  /** Finish with an informational status. */
  info(text?: string): void;
}
85
+
86
/**
 * Machine-readable summary of a run, printed as JSON in CI mode.
 */
interface CISummary {
  /** True when no scenario failed. */
  success: boolean;
  /** Scenario-level counts. */
  scenarios: {
    total: number;
    passed: number;
    failed: number;
  };
  /** Test-case counts aggregated across every scenario. */
  cases: {
    total: number;
    passed: number;
    failed: number;
    /** Fraction of passed cases (passed / total), 0 when there are no cases. */
    successRate: number;
  };
  /** Summed duration of all scenario runs. */
  duration: {
    totalMs: number;
    /** Human-readable rendering of `totalMs`. */
    formatted: string;
  };
  /** One entry per scenario run. */
  runs: {
    runId: string;
    scenario: string;
    success: boolean;
    successRate: number;
    passedCases: number;
    failedCases: number;
    totalCases: number;
    durationMs: number;
  }[];
  /** Present only when a baseline comparison was performed. */
  baseline?: {
    compared: boolean;
    hasRegression: boolean;
    threshold: number;
    delta?: {
      successRate: number;
      latency: number;
      tokens: number;
    };
  };
}
127
+
128
/**
 * Summary shape emitted for `--summary security` (red team / security reporting).
 */
interface SecuritySummary {
  /** Overall risk bucket for the run. */
  overallRisk: 'low' | 'medium' | 'high' | 'critical';
  /** Fraction of passed cases, in the range 0-1. */
  successRate: number;
  /** Failure counts keyed by severity bucket. */
  vulnerabilities: Record<'critical' | 'high' | 'medium' | 'low', number>;
  /** Human-readable follow-up suggestions. */
  recommendations: string[];
}
142
+
143
/**
 * Check whether a storage adapter supports baselines.
 *
 * A value qualifies only when it is a non-null object on which every
 * baseline operation (`setBaseline`, `getBaseline`, `listBaselines`,
 * `compareToBaseline`) is callable. Checking callability, not just property
 * presence, keeps the type predicate honest: an object carrying a
 * non-function under one of these names would otherwise be narrowed to
 * `BaselineStorageAdapter` and fail at the call site.
 *
 * @param storage - Any value, typically the adapter returned by the storage factory.
 * @returns `true` when all four baseline methods are present and callable.
 */
function isBaselineStorage(storage: unknown): storage is BaselineStorageAdapter {
  if (typeof storage !== 'object' || storage === null) {
    return false;
  }

  // Property access walks the prototype chain, so class-based adapters
  // (methods on the prototype) are accepted just like plain objects.
  const candidate = storage as Record<string, unknown>;
  const requiredMethods = ['setBaseline', 'getBaseline', 'listBaselines', 'compareToBaseline'];
  return requiredMethods.every((method) => typeof candidate[method] === 'function');
}
156
+
157
/**
 * Aggregate per-scenario results into the CI summary structure.
 *
 * Missing manifest fields (for example on the empty placeholder manifest
 * recorded when a scenario throws) are counted as 0 / empty string.
 *
 * @param results - One entry per scenario that was attempted.
 * @returns Scenario counts, case counts, total duration and a per-run breakdown.
 */
function buildCISummary(results: ScenarioRunResult[]): CISummary {
  let passedScenarios = 0;
  let totalCases = 0;
  let passedCases = 0;
  let failedCases = 0;
  let totalDuration = 0;
  const runs: CISummary['runs'] = [];

  // Single pass: accumulate the totals and build the per-run rows together.
  for (const entry of results) {
    const metrics = entry.manifest.metrics;
    const runTotal = metrics?.total_cases || 0;
    const runPassed = metrics?.passed_cases || 0;
    const runFailed = metrics?.failed_cases || 0;
    const runDuration = entry.manifest.duration_ms || 0;

    if (entry.success) {
      passedScenarios += 1;
    }
    totalCases += runTotal;
    passedCases += runPassed;
    failedCases += runFailed;
    totalDuration += runDuration;

    runs.push({
      runId: entry.manifest.run_id || '',
      scenario: entry.scenarioName,
      success: entry.success,
      successRate: metrics?.success_rate || 0,
      passedCases: runPassed,
      failedCases: runFailed,
      totalCases: runTotal,
      durationMs: runDuration,
    });
  }

  const totalScenarios = results.length;
  const failedScenarios = totalScenarios - passedScenarios;
  // Avoid dividing by zero when no cases were executed.
  const successRate = totalCases > 0 ? passedCases / totalCases : 0;

  return {
    success: failedScenarios === 0,
    scenarios: {
      total: totalScenarios,
      passed: passedScenarios,
      failed: failedScenarios,
    },
    cases: {
      total: totalCases,
      passed: passedCases,
      failed: failedCases,
      successRate,
    },
    duration: {
      totalMs: totalDuration,
      formatted: formatDuration(totalDuration),
    },
    runs,
  };
}
199
+
200
/**
 * Build the security summary printed for `--summary security`.
 *
 * Risk is derived purely from the aggregate pass rate:
 * >= 95% low, >= 80% medium, >= 50% high, otherwise critical. All failed
 * cases are attributed to the bucket matching the overall risk (a
 * simplification; no per-case severity data is consulted).
 *
 * @param results - One entry per scenario that was attempted.
 * @returns Risk bucket, pass rate, failure counts per bucket and recommendations.
 */
function buildSecuritySummary(results: ScenarioRunResult[]): SecuritySummary {
  let totalCases = 0;
  let passedCases = 0;
  for (const entry of results) {
    totalCases += entry.manifest.metrics?.total_cases || 0;
    passedCases += entry.manifest.metrics?.passed_cases || 0;
  }
  const successRate = totalCases > 0 ? passedCases / totalCases : 0;

  // Map the pass rate onto a risk bucket, highest threshold first.
  const pickRisk = (): SecuritySummary['overallRisk'] => {
    if (successRate >= 0.95) return 'low';
    if (successRate >= 0.8) return 'medium';
    if (successRate >= 0.5) return 'high';
    return 'critical';
  };
  const overallRisk = pickRisk();

  // Every failure lands in the single bucket that matches the overall risk.
  const vulnerabilities = { critical: 0, high: 0, medium: 0, low: 0 };
  vulnerabilities[overallRisk] = totalCases - passedCases;

  let recommendations: string[];
  if (successRate < 1) {
    recommendations = [
      'Review failed test cases for potential issues',
      'Consider adding more comprehensive test coverage',
    ];
    if (successRate < 0.8) {
      recommendations.push('Investigate root causes of failures before deployment');
    }
  } else {
    recommendations = ['All tests passing - continue monitoring'];
  }

  return {
    overallRisk,
    successRate,
    vulnerabilities,
    recommendations,
  };
}
237
+
238
/**
 * Execute one scenario without printing anything from this function
 * (used for parallel execution, where the caller reports progress).
 *
 * @param scenarioPath - Path of the scenario file to parse and run.
 * @param options - Parsed CLI options (provider/model overrides, tags, redaction, limits).
 * @param config - Loaded config file, or `null` when none was found.
 * @returns The scenario path and name plus the runner's success flag and manifest.
 */
async function runSingleScenarioQuiet(
  scenarioPath: string,
  options: RunOptions,
  config: ArtemisConfig | null
): Promise<ScenarioRunResult> {
  const scenario = await parseScenarioFile(scenarioPath);

  // Provider and model are resolved with precedence and source tracking:
  // CLI > Scenario > Config > Default
  const providerChoice = resolveProviderWithSource(
    options.provider,
    scenario.provider,
    config?.provider
  );
  const modelChoice = resolveModelWithSource(options.model, scenario.model, config?.model);

  // Adapter config carries the full precedence chain and where each value came from.
  const built = buildAdapterConfig({
    provider: providerChoice.provider,
    model: modelChoice.model,
    providerSource: providerChoice.source,
    modelSource: modelChoice.source,
    scenarioConfig: scenario.providerConfig,
    fileConfig: config,
  });
  const client = await createAdapter(built.adapterConfig);

  // Redaction is only configured when requested on the command line.
  const redaction: RedactionConfig | undefined = options.redact
    ? {
        enabled: true,
        patterns: options.redactPatterns,
        redactPrompts: true,
        redactResponses: true,
        redactMetadata: false,
        replacement: '[REDACTED]',
      }
    : undefined;

  // CLI values may arrive as strings; normalise them through String() first.
  const toInt = (raw: unknown): number => Number.parseInt(String(raw));

  // No progress callbacks are registered in quiet mode.
  const outcome = await runScenario({
    scenario,
    client,
    project: config?.project || process.env.ARTEMIS_PROJECT || 'default',
    resolvedConfig: built.resolvedConfig,
    tags: options.tags,
    concurrency: toInt(options.concurrency) || 1,
    timeout: options.timeout ? toInt(options.timeout) : undefined,
    retries: options.retries ? toInt(options.retries) : undefined,
    redaction,
  });

  return {
    scenarioPath,
    scenarioName: scenario.name,
    success: outcome.success,
    manifest: outcome.manifest,
  };
}
306
+
307
/**
 * Execute one scenario with console output (used for sequential execution).
 *
 * When several scenarios are being run, a banner with the scenario name and
 * file is printed and the spinner is left untouched. For a single scenario
 * the spinner reports the provider connection and a "Running scenario"
 * heading is printed instead. One line is printed per completed case.
 *
 * @param scenarioPath - Path of the scenario file to parse and run.
 * @param options - Parsed CLI options (provider/model overrides, tags, redaction, limits, verbosity).
 * @param config - Loaded config file, or `null` when none was found.
 * @param spinner - Spinner used for connection status in single-scenario runs.
 * @param isMultiScenario - Whether this scenario is part of a multi-scenario run.
 * @returns The scenario path and name plus the runner's success flag and manifest.
 */
async function runSingleScenario(
  scenarioPath: string,
  options: RunOptions,
  config: ArtemisConfig | null,
  spinner: SpinnerLike,
  isMultiScenario: boolean
): Promise<ScenarioRunResult> {
  const scenario = await parseScenarioFile(scenarioPath);
  const isSoloRun = !isMultiScenario;

  if (isMultiScenario) {
    console.log();
    console.log(chalk.bold.cyan(`━━━ ${scenario.name} ━━━`));
    console.log(chalk.dim(`File: ${basename(scenarioPath)}`));
    console.log();
  }

  // Provider and model are resolved with precedence and source tracking:
  // CLI > Scenario > Config > Default
  const providerChoice = resolveProviderWithSource(
    options.provider,
    scenario.provider,
    config?.provider
  );
  const modelChoice = resolveModelWithSource(options.model, scenario.model, config?.model);
  const provider = providerChoice.provider;

  if (isSoloRun) {
    spinner.start(`Connecting to ${provider}...`);
  }
  // Adapter config carries the full precedence chain and where each value came from.
  const built = buildAdapterConfig({
    provider,
    model: modelChoice.model,
    providerSource: providerChoice.source,
    modelSource: modelChoice.source,
    scenarioConfig: scenario.providerConfig,
    fileConfig: config,
  });
  const client = await createAdapter(built.adapterConfig);
  if (isSoloRun) {
    spinner.succeed(`Connected to ${provider}`);
    console.log();
    console.log(chalk.bold(`Running scenario: ${scenario.name}`));
    console.log();
  }

  // Redaction is only configured when requested on the command line.
  let redaction: RedactionConfig | undefined;
  if (options.redact) {
    redaction = {
      enabled: true,
      patterns: options.redactPatterns,
      redactPrompts: true,
      redactResponses: true,
      redactMetadata: false,
      replacement: '[REDACTED]',
    };
    if (isSoloRun) {
      const patternNote = options.redactPatterns
        ? ` with patterns: ${options.redactPatterns.join(', ')}`
        : ' (default patterns)';
      console.log(chalk.dim(`Redaction enabled${patternNote}`));
      console.log();
    }
  }

  // Progress bookkeeping for the per-case output lines.
  const totalCases = scenario.cases.length;
  let completedCases = 0;

  // Column widths used to align the per-case output.
  const maxIdLength = scenario.cases.reduce(
    (widest, testCase) => Math.max(widest, testCase.id.length),
    Number.NEGATIVE_INFINITY
  );
  const maxScoreLength = 6; // "(100%)"
  const maxDurationLength = 6; // "10.0s" or "999ms"

  // CLI values may arrive as strings; normalise them through String() first.
  const toInt = (raw: unknown): number => Number.parseInt(String(raw));

  const outcome = await runScenario({
    scenario,
    client,
    project: config?.project || process.env.ARTEMIS_PROJECT || 'default',
    resolvedConfig: built.resolvedConfig,
    tags: options.tags,
    concurrency: toInt(options.concurrency) || 1,
    timeout: options.timeout ? toInt(options.timeout) : undefined,
    retries: options.retries ? toInt(options.retries) : undefined,
    redaction,
    onCaseComplete: (caseResult) => {
      completedCases += 1;

      const statusIcon = caseResult.ok ? icons.passed : icons.failed;
      const scoreStr = `(${(caseResult.score * 100).toFixed(0)}%)`;
      const durationStr = caseResult.latencyMs ? formatDuration(caseResult.latencyMs) : '';

      // Pad each column so consecutive lines align.
      const paddedId = padText(caseResult.id, maxIdLength);
      const paddedScore = padText(scoreStr, maxScoreLength, 'right');
      const paddedDuration = padText(durationStr, maxDurationLength, 'right');

      // A progress bar in a TTY; a plain "[done/total]" counter otherwise (CI/CD friendly).
      const progressSuffix = isTTY
        ? renderProgressBar(completedCases, totalCases, { width: 15 })
        : `[${completedCases}/${totalCases}]`;
      console.log(
        `${statusIcon} ${paddedId} ${chalk.dim(paddedScore)} ${chalk.dim(paddedDuration)} ${progressSuffix}`
      );

      // Failure details are shown only in verbose mode and only when a reason exists.
      const showReason = !caseResult.ok && options.verbose && caseResult.reason;
      if (showReason) {
        console.log(
          renderFailureReason(caseResult.reason, { matcherType: caseResult.matcherType })
        );
      }
    },
    onProgress: (message) => {
      if (!options.verbose) {
        return;
      }
      console.log(chalk.dim(message));
    },
  });

  return {
    scenarioPath,
    scenarioName: scenario.name,
    success: outcome.success,
    manifest: outcome.manifest,
  };
}
446
+
447
/**
 * Run scenarios in parallel with a concurrency limit.
 *
 * Each scenario is executed quietly and a single progress line is printed
 * when it finishes. Exactly one result is recorded per scenario path, and the
 * returned array follows the order of `scenarioPaths` regardless of which
 * scenario finishes first.
 *
 * Failure handling:
 * - If running a scenario throws, a failed result is recorded with the file's
 *   basename as its name, an empty placeholder manifest and the error message.
 * - If saving the manifest throws, the scenario's own result is kept (manifest
 *   and metrics intact) but marked as failed with the save error attached. It
 *   is not recorded a second time.
 *
 * @param scenarioPaths - Scenario files to execute.
 * @param options - Parsed CLI options; `options.save` controls persistence.
 * @param config - Loaded config file, or `null` when none was found.
 * @param parallelLimit - Maximum scenarios in flight; values below 1 (or NaN) are treated as 1.
 * @param storage - Storage adapter used to persist manifests when saving is enabled.
 * @returns One result per scenario path, in input order.
 */
async function runScenariosInParallel(
  scenarioPaths: string[],
  options: RunOptions,
  config: ArtemisConfig | null,
  parallelLimit: number,
  storage: ReturnType<typeof createStorage>
): Promise<ScenarioRunResult[]> {
  const totalCount = scenarioPaths.length;
  const results = new Array<ScenarioRunResult>(totalCount);
  let completedCount = 0;

  // A limit of 0, a negative number or NaN would never start any work;
  // clamp so at least one scenario is always in flight.
  const workerCount = Math.min(totalCount, Math.max(1, Math.floor(parallelLimit) || 1));

  // Thrown values are not guaranteed to be Error instances.
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Print one progress line per finished scenario.
  const updateProgress = (scenarioName: string, success: boolean) => {
    completedCount++;
    const icon = success ? icons.passed : icons.failed;
    const status = success ? chalk.green('passed') : chalk.red('failed');

    if (isTTY) {
      const progressBar = renderProgressBar(completedCount, totalCount, { width: 20 });
      console.log(`${icon} ${scenarioName} ${status} ${progressBar}`);
    } else {
      console.log(`${icon} ${scenarioName} ${status} [${completedCount}/${totalCount}]`);
    }
  };

  // Run (and optionally save) a single scenario; never rejects.
  const processScenario = async (path: string): Promise<ScenarioRunResult> => {
    let outcome: ScenarioRunResult;
    try {
      outcome = await runSingleScenarioQuiet(path, options, config);
    } catch (error) {
      return {
        scenarioPath: path,
        scenarioName: basename(path),
        success: false,
        manifest: {} as RunManifest,
        error: describeError(error),
      };
    }

    // Saving is handled separately from running so that a storage failure
    // cannot cause the same scenario to be recorded and counted twice.
    if (options.save && outcome.manifest.run_id) {
      try {
        await storage.save(outcome.manifest);
      } catch (error) {
        outcome = {
          ...outcome,
          success: false,
          error: `Failed to save results: ${describeError(error)}`,
        };
      }
    }
    return outcome;
  };

  // All workers pull from one shared iterator; each `next()` is synchronous,
  // so every path is claimed by exactly one worker.
  const pending = scenarioPaths.entries();
  const worker = async (): Promise<void> => {
    for (const [index, path] of pending) {
      const outcome = await processScenario(path);
      results[index] = outcome;
      updateProgress(outcome.scenarioName, outcome.success);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return results;
}
46
522
 
47
523
  export function runCommand(): Command {
48
524
  const cmd = new Command('run');
49
525
 
50
526
  cmd
51
- .description('Run test scenarios against an LLM')
52
- .argument('<scenario>', 'Path to scenario YAML file')
527
+ .description(
528
+ 'Run test scenarios against an LLM. Accepts a file path, directory, or glob pattern.'
529
+ )
530
+ .argument(
531
+ '[scenario]',
532
+ 'Path to scenario file, directory, or glob pattern (e.g., scenarios/**/*.yaml)'
533
+ )
53
534
  .option('-p, --provider <provider>', 'Provider to use (openai, azure-openai, vercel-ai)')
54
535
  .option('-m, --model <model>', 'Model to use')
55
536
  .option('-o, --output <dir>', 'Output directory for results')
56
537
  .option('-v, --verbose', 'Verbose output')
57
538
  .option('-t, --tags <tags...>', 'Filter test cases by tags')
58
539
  .option('--save', 'Save results to storage', true)
59
- .option('-c, --concurrency <number>', 'Number of concurrent test cases', '1')
540
+ .option('-c, --concurrency <number>', 'Number of concurrent test cases per scenario', '1')
541
+ .option('--parallel <number>', 'Number of scenarios to run in parallel (default: sequential)')
60
542
  .option('--timeout <ms>', 'Timeout per test case in milliseconds')
61
543
  .option('--retries <number>', 'Number of retries per test case')
62
544
  .option('--config <path>', 'Path to config file')
@@ -65,169 +547,409 @@ export function runCommand(): Command {
65
547
  '--redact-patterns <patterns...>',
66
548
  'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
67
549
  )
68
- .action(async (scenarioPath: string, options: RunOptions) => {
69
- const spinner = createSpinner('Loading configuration...');
70
- spinner.start();
550
+ .option('-i, --interactive', 'Enable interactive mode for scenario/provider selection')
551
+ .option('--ci', 'CI mode: machine-readable output, no colors/spinners, JSON summary')
552
+ .option(
553
+ '--summary <format>',
554
+ 'Summary output format: json, text, or security (implies --ci for json/security)',
555
+ 'text'
556
+ )
557
+ .option('--baseline', 'Compare against baseline and detect regression')
558
+ .option('--threshold <number>', 'Regression threshold (0-1), e.g., 0.05 for 5%', '0.05')
559
+ .action(async (scenarioPath: string | undefined, options: RunOptions) => {
560
+ // Determine CI mode: explicit flag, environment variable, or summary format that implies CI
561
+ const isCIMode =
562
+ options.ci ||
563
+ process.env.CI === 'true' ||
564
+ options.summary === 'json' ||
565
+ options.summary === 'security';
566
+
567
+ // In CI mode, use a no-op spinner
568
+ const spinner = isCIMode
569
+ ? {
570
+ start: () => {},
571
+ stop: () => {},
572
+ succeed: () => {},
573
+ fail: () => {},
574
+ info: () => {},
575
+ }
576
+ : createSpinner('Loading configuration...');
577
+
578
+ if (!isCIMode) {
579
+ spinner.start();
580
+ }
71
581
 
72
582
  try {
73
583
  // Load config file if present
74
584
  const config = await loadConfig(options.config);
75
- if (config) {
76
- spinner.succeed(`Loaded config from ${(config as { _path?: string })._path}`);
77
- } else {
78
- spinner.info('No config file found, using defaults');
585
+ if (!isCIMode) {
586
+ if (config) {
587
+ spinner.succeed(`Loaded config from ${config._path}`);
588
+ } else {
589
+ spinner.info('No config file found, using defaults');
590
+ }
79
591
  }
80
592
 
81
- // Parse scenario
82
- spinner.start('Loading scenario...');
83
- const scenario = await parseScenarioFile(scenarioPath);
84
- spinner.succeed(`Loaded scenario: ${scenario.name}`);
85
-
86
- // Resolve provider and model with precedence and source tracking:
87
- // CLI > Scenario > Config > Default
88
- const { provider, source: providerSource } = resolveProviderWithSource(
89
- options.provider,
90
- scenario.provider,
91
- config?.provider
92
- );
93
- const { model, source: modelSource } = resolveModelWithSource(
94
- options.model,
95
- scenario.model,
96
- config?.model
97
- );
593
+ // Determine if we should use interactive mode (never in CI mode)
594
+ const useInteractive =
595
+ !isCIMode && (options.interactive || (!scenarioPath && isInteractive()));
98
596
 
99
- // Build adapter config with full precedence chain and source tracking
100
- spinner.start(`Connecting to ${provider}...`);
101
- const { adapterConfig, resolvedConfig } = buildAdapterConfig({
102
- provider,
103
- model,
104
- providerSource,
105
- modelSource,
106
- scenarioConfig: scenario.providerConfig,
107
- fileConfig: config,
108
- });
109
- const client = await createAdapter(adapterConfig);
110
- spinner.succeed(`Connected to ${provider}`);
597
+ // Interactive provider/model selection if requested
598
+ if (useInteractive && !options.provider) {
599
+ spinner.stop();
600
+ console.log(chalk.cyan('\n Interactive mode enabled\n'));
111
601
 
112
- console.log();
113
- console.log(chalk.bold(`Running scenario: ${scenario.name}`));
114
- console.log();
602
+ const provider = await promptProvider('Select a provider:');
603
+ options.provider = provider;
115
604
 
116
- // Build redaction config from CLI options
117
- let redaction: RedactionConfig | undefined;
118
- if (options.redact) {
119
- redaction = {
120
- enabled: true,
121
- patterns: options.redactPatterns,
122
- redactPrompts: true,
123
- redactResponses: true,
124
- redactMetadata: false,
125
- replacement: '[REDACTED]',
126
- };
127
- console.log(
128
- chalk.dim(
129
- `Redaction enabled${options.redactPatterns ? ` with patterns: ${options.redactPatterns.join(', ')}` : ' (default patterns)'}`
130
- )
131
- );
605
+ const model = await promptModel(provider, 'Select a model:');
606
+ options.model = model;
607
+
608
+ console.log(''); // spacing
609
+ spinner.start('Discovering scenarios...');
610
+ }
611
+
612
+ // If no scenario path provided, try to find scenarios or prompt
613
+ let resolvedScenarioPath = scenarioPath;
614
+ if (!resolvedScenarioPath) {
615
+ // Try default scenarios directory
616
+ const defaultPath = config?.scenariosDir || './scenarios';
617
+ spinner.start(`Looking for scenarios in ${defaultPath}...`);
618
+
619
+ try {
620
+ const defaultScenarios = await resolveScenarioPaths(defaultPath);
621
+ if (defaultScenarios.length > 0) {
622
+ spinner.stop();
623
+
624
+ if (useInteractive) {
625
+ // Let user select which scenarios to run
626
+ const scenarioChoices = await Promise.all(
627
+ defaultScenarios.map(async (path) => {
628
+ try {
629
+ const scenario = await parseScenarioFile(path);
630
+ return { path, name: scenario.name || basename(path) };
631
+ } catch {
632
+ return { path, name: basename(path) };
633
+ }
634
+ })
635
+ );
636
+
637
+ const selectedPaths = await promptScenarios(
638
+ scenarioChoices,
639
+ 'Select scenarios to run:'
640
+ );
641
+
642
+ if (selectedPaths.length === 0) {
643
+ console.log(chalk.yellow('\nNo scenarios selected. Exiting.'));
644
+ process.exit(0);
645
+ }
646
+
647
+ // Use the first selected scenario or create a temp pattern
648
+ resolvedScenarioPath =
649
+ selectedPaths.length === 1 ? selectedPaths[0] : `{${selectedPaths.join(',')}}`;
650
+
651
+ console.log(''); // spacing
652
+ spinner.start('Preparing scenarios...');
653
+ } else {
654
+ spinner.succeed(`Found ${defaultScenarios.length} scenarios in ${defaultPath}`);
655
+ resolvedScenarioPath = defaultPath;
656
+ }
657
+ } else {
658
+ spinner.fail(`No scenarios found in ${defaultPath}`);
659
+ console.log();
660
+ console.log(chalk.yellow('Please provide a scenario path:'));
661
+ console.log(chalk.dim(' artemiskit run <path-to-scenario.yaml>'));
662
+ console.log(chalk.dim(' artemiskit run scenarios/'));
663
+ console.log(chalk.dim(' artemiskit run "scenarios/**/*.yaml"'));
664
+ process.exit(1);
665
+ }
666
+ } catch {
667
+ spinner.fail('No scenario path provided');
668
+ console.log();
669
+ console.log(chalk.yellow('Usage: artemiskit run <scenario>'));
670
+ console.log(chalk.dim(' <scenario> can be a file, directory, or glob pattern'));
671
+ process.exit(1);
672
+ }
673
+ }
674
+
675
+ // Resolve scenario paths (handles files, directories, and globs)
676
+ spinner.start('Discovering scenarios...');
677
+ const scenarioPaths = await resolveScenarioPaths(resolvedScenarioPath);
678
+
679
+ if (scenarioPaths.length === 0) {
680
+ spinner.fail('No scenario files found');
132
681
  console.log();
682
+ console.log(chalk.yellow(`No .yaml or .yml files found matching: ${scenarioPath}`));
683
+ console.log(chalk.dim('Make sure the path exists and contains valid scenario files.'));
684
+ process.exit(1);
133
685
  }
134
686
 
135
- // Track progress
136
- const totalCases = scenario.cases.length;
137
- let completedCases = 0;
138
-
139
- // Calculate max widths for alignment
140
- const maxIdLength = Math.max(...scenario.cases.map((c) => c.id.length));
141
- const maxScoreLength = 6; // "(100%)"
142
- const maxDurationLength = 6; // "10.0s" or "999ms"
143
-
144
- // Run scenario using core runner
145
- const result = await runScenario({
146
- scenario,
147
- client,
148
- project: config?.project || process.env.ARTEMIS_PROJECT || 'default',
149
- resolvedConfig,
150
- tags: options.tags,
151
- concurrency: Number.parseInt(String(options.concurrency)) || 1,
152
- timeout: options.timeout ? Number.parseInt(String(options.timeout)) : undefined,
153
- retries: options.retries ? Number.parseInt(String(options.retries)) : undefined,
154
- redaction,
155
- onCaseComplete: (caseResult) => {
156
- completedCases++;
157
-
158
- const statusIcon = caseResult.ok ? icons.passed : icons.failed;
159
- const scoreStr = `(${(caseResult.score * 100).toFixed(0)}%)`;
160
- const durationStr = caseResult.latencyMs ? formatDuration(caseResult.latencyMs) : '';
161
-
162
- // Pad columns for alignment
163
- const paddedId = padText(caseResult.id, maxIdLength);
164
- const paddedScore = padText(scoreStr, maxScoreLength, 'right');
165
- const paddedDuration = padText(durationStr, maxDurationLength, 'right');
166
-
167
- // Show result - with progress bar in TTY, simple format in CI/CD
168
- if (isTTY) {
169
- const progressBar = renderProgressBar(completedCases, totalCases, { width: 15 });
687
+ const isMultiScenario = scenarioPaths.length > 1;
688
+ const parallelLimit = options.parallel ? Number.parseInt(String(options.parallel)) : 0;
689
+ const runInParallel = parallelLimit > 0 && isMultiScenario;
690
+
691
+ if (isMultiScenario) {
692
+ const modeStr = runInParallel
693
+ ? chalk.cyan(`parallel (${parallelLimit} concurrent)`)
694
+ : chalk.dim('sequential');
695
+ spinner.succeed(`Found ${scenarioPaths.length} scenario files`);
696
+ console.log();
697
+ console.log(chalk.bold(`Running ${scenarioPaths.length} scenarios ${modeStr}...`));
698
+ console.log();
699
+ } else {
700
+ spinner.succeed('Loaded scenario file');
701
+ }
702
+
703
+ // Run all scenarios
704
+ const storage = createStorage({ fileConfig: config });
705
+ let results: ScenarioRunResult[];
706
+
707
+ if (runInParallel) {
708
+ // Parallel execution
709
+ results = await runScenariosInParallel(
710
+ scenarioPaths,
711
+ options,
712
+ config,
713
+ parallelLimit,
714
+ storage
715
+ );
716
+ } else {
717
+ // Sequential execution
718
+ results = [];
719
+ for (const path of scenarioPaths) {
720
+ try {
721
+ const result = await runSingleScenario(
722
+ path,
723
+ options,
724
+ config,
725
+ spinner,
726
+ isMultiScenario
727
+ );
728
+ results.push(result);
729
+
730
+ // Display per-scenario summary
731
+ const summaryData = {
732
+ passed: result.manifest.metrics.passed_cases,
733
+ failed: result.manifest.metrics.failed_cases,
734
+ skipped: 0,
735
+ successRate: result.manifest.metrics.success_rate * 100,
736
+ duration: result.manifest.duration_ms,
737
+ title: isMultiScenario ? result.scenarioName.toUpperCase() : 'TEST RESULTS',
738
+ };
739
+ console.log();
740
+ console.log(renderSummaryPanel(summaryData));
741
+
742
+ // Show additional metrics
743
+ console.log();
170
744
  console.log(
171
- `${statusIcon} ${paddedId} ${chalk.dim(paddedScore)} ${chalk.dim(paddedDuration)} ${progressBar}`
745
+ chalk.dim(
746
+ `Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
747
+ )
172
748
  );
173
- } else {
174
- // CI/CD friendly output - no progress bar, just count
749
+
750
+ // Show redaction info if enabled
751
+ if (result.manifest.redaction?.enabled) {
752
+ const r = result.manifest.redaction;
753
+ console.log(
754
+ chalk.dim(
755
+ `Redactions: ${r.summary.totalRedactions} (${r.summary.promptsRedacted} prompts, ${r.summary.responsesRedacted} responses)`
756
+ )
757
+ );
758
+ }
759
+
760
+ // Save results
761
+ if (options.save) {
762
+ const savedPath = await storage.save(result.manifest);
763
+ console.log(chalk.dim(`Saved: ${savedPath}`));
764
+ }
765
+ } catch (error) {
766
+ // Record failed scenario
767
+ console.log();
768
+ console.log(chalk.red(`${icons.failed} Failed to run: ${basename(path)}`));
769
+ if (options.verbose) {
770
+ console.log(chalk.dim((error as Error).message));
771
+ }
772
+ results.push({
773
+ scenarioPath: path,
774
+ scenarioName: basename(path),
775
+ success: false,
776
+ manifest: {} as RunManifest,
777
+ });
778
+ }
779
+ }
780
+ }
781
+
782
+ // Build CI summary (used for CI mode output and baseline comparison)
783
+ const ciSummary = buildCISummary(results);
784
+
785
+ // Baseline comparison (if enabled)
786
+ let baselineResult: {
787
+ hasRegression: boolean;
788
+ threshold: number;
789
+ delta?: { successRate: number; latency: number; tokens: number };
790
+ } | null = null;
791
+
792
+ if (options.baseline && results.length > 0) {
793
+ const regressionThreshold = Number.parseFloat(String(options.threshold)) || 0.05;
794
+
795
+ // Check each scenario against its baseline
796
+ for (const result of results) {
797
+ if (!result.manifest.run_id) continue;
798
+
799
+ if (isBaselineStorage(storage) && storage.compareToBaseline) {
800
+ try {
801
+ const comparison = await storage.compareToBaseline(
802
+ result.manifest.run_id,
803
+ regressionThreshold
804
+ );
805
+
806
+ if (comparison) {
807
+ baselineResult = {
808
+ hasRegression: comparison.hasRegression,
809
+ threshold: comparison.regressionThreshold,
810
+ delta: comparison.comparison.delta,
811
+ };
812
+
813
+ // Add baseline info to CI summary
814
+ ciSummary.baseline = {
815
+ compared: true,
816
+ hasRegression: comparison.hasRegression,
817
+ threshold: comparison.regressionThreshold,
818
+ delta: comparison.comparison.delta,
819
+ };
820
+
821
+ if (!isCIMode && comparison.hasRegression) {
822
+ console.log();
823
+ console.log(
824
+ `${icons.failed} ${chalk.red('Regression detected!')} for ${chalk.bold(result.scenarioName)}`
825
+ );
826
+ console.log(
827
+ chalk.dim(
828
+ ` Success rate dropped by ${Math.abs(comparison.comparison.delta.successRate * 100).toFixed(1)}% (threshold: ${regressionThreshold * 100}%)`
829
+ )
830
+ );
831
+ }
832
+ }
833
+ } catch {
834
+ // Baseline comparison failed, continue without it
835
+ }
836
+ }
837
+ }
838
+ }
839
+
840
+ // Handle CI mode output
841
+ if (isCIMode) {
842
+ if (options.summary === 'json') {
843
+ console.log(JSON.stringify(ciSummary, null, 2));
844
+ } else if (options.summary === 'security') {
845
+ const securitySummary = buildSecuritySummary(results);
846
+ console.log(JSON.stringify(securitySummary, null, 2));
847
+ } else {
848
+ // Default CI text output (minimal)
849
+ const totalCases = ciSummary.cases.total;
850
+ const passedCases = ciSummary.cases.passed;
851
+ const failedCases = ciSummary.cases.failed;
852
+ const successRate = (ciSummary.cases.successRate * 100).toFixed(1);
853
+
854
+ console.log(`ARTEMISKIT_RESULT=${ciSummary.success ? 'PASS' : 'FAIL'}`);
855
+ console.log(`ARTEMISKIT_SCENARIOS_TOTAL=${ciSummary.scenarios.total}`);
856
+ console.log(`ARTEMISKIT_SCENARIOS_PASSED=${ciSummary.scenarios.passed}`);
857
+ console.log(`ARTEMISKIT_SCENARIOS_FAILED=${ciSummary.scenarios.failed}`);
858
+ console.log(`ARTEMISKIT_CASES_TOTAL=${totalCases}`);
859
+ console.log(`ARTEMISKIT_CASES_PASSED=${passedCases}`);
860
+ console.log(`ARTEMISKIT_CASES_FAILED=${failedCases}`);
861
+ console.log(`ARTEMISKIT_SUCCESS_RATE=${successRate}`);
862
+ console.log(`ARTEMISKIT_DURATION_MS=${ciSummary.duration.totalMs}`);
863
+
864
+ if (baselineResult) {
865
+ console.log('ARTEMISKIT_BASELINE_COMPARED=true');
175
866
  console.log(
176
- `${statusIcon} ${paddedId} ${chalk.dim(paddedScore)} ${chalk.dim(paddedDuration)} [${completedCases}/${totalCases}]`
867
+ `ARTEMISKIT_REGRESSION=${baselineResult.hasRegression ? 'true' : 'false'}`
177
868
  );
869
+ if (baselineResult.delta) {
870
+ console.log(
871
+ `ARTEMISKIT_DELTA_SUCCESS_RATE=${(baselineResult.delta.successRate * 100).toFixed(2)}`
872
+ );
873
+ }
178
874
  }
179
875
 
180
- if (!caseResult.ok && options.verbose) {
181
- console.log(chalk.dim(` Reason: ${caseResult.reason}`));
876
+ // Also print run IDs for reference
877
+ for (const run of ciSummary.runs) {
878
+ if (run.runId) {
879
+ console.log(
880
+ `ARTEMISKIT_RUN_ID_${run.scenario.toUpperCase().replace(/[^A-Z0-9]/g, '_')}=${run.runId}`
881
+ );
882
+ }
182
883
  }
183
- },
184
- onProgress: (message) => {
185
- if (options.verbose) {
186
- console.log(chalk.dim(message));
884
+ }
885
+ } else {
886
+ // Display aggregate summary for multiple scenarios (non-CI mode)
887
+ if (isMultiScenario) {
888
+ console.log();
889
+ console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
890
+ console.log();
891
+
892
+ const totalScenarios = results.length;
893
+ const passedScenarios = results.filter((r) => r.success).length;
894
+ const failedScenarios = totalScenarios - passedScenarios;
895
+
896
+ const totalCases = results.reduce(
897
+ (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
898
+ 0
899
+ );
900
+ const passedCases = results.reduce(
901
+ (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
902
+ 0
903
+ );
904
+ const failedCases = results.reduce(
905
+ (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
906
+ 0
907
+ );
908
+ const totalDuration = results.reduce(
909
+ (sum, r) => sum + (r.manifest.duration_ms || 0),
910
+ 0
911
+ );
912
+
913
+ console.log(
914
+ `Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
915
+ );
916
+ console.log(
917
+ `Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
918
+ );
919
+ console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
920
+
921
+ if (runInParallel) {
922
+ console.log(
923
+ `Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
924
+ );
187
925
  }
188
- },
189
- });
926
+ console.log();
190
927
 
191
- // Display summary using enhanced panel
192
- console.log();
193
- const summaryData = {
194
- passed: result.manifest.metrics.passed_cases,
195
- failed: result.manifest.metrics.failed_cases,
196
- skipped: 0,
197
- successRate: result.manifest.metrics.success_rate * 100,
198
- duration: result.manifest.duration_ms,
199
- title: 'TEST RESULTS',
200
- };
201
- console.log(renderSummaryPanel(summaryData));
202
-
203
- // Show additional metrics
204
- console.log();
205
- console.log(
206
- chalk.dim(
207
- `Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
208
- )
209
- );
928
+ // List failed scenarios
929
+ const failedResults = results.filter((r) => !r.success);
930
+ if (failedResults.length > 0) {
931
+ console.log(chalk.red('Failed scenarios:'));
932
+ for (const result of failedResults) {
933
+ console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
934
+ if (result.error && options.verbose) {
935
+ console.log(chalk.dim(` ${result.error}`));
936
+ }
937
+ }
938
+ console.log();
939
+ }
940
+ }
210
941
 
211
- // Show redaction info if enabled
212
- if (result.manifest.redaction?.enabled) {
213
- const r = result.manifest.redaction;
214
- console.log(
215
- chalk.dim(
216
- `Redactions: ${r.summary.totalRedactions} (${r.summary.promptsRedacted} prompts, ${r.summary.responsesRedacted} responses)`
217
- )
218
- );
942
+ // Show baseline comparison result in non-CI mode
943
+ if (baselineResult && !baselineResult.hasRegression) {
944
+ console.log(`${icons.passed} ${chalk.green('No regression detected')}`);
945
+ }
219
946
  }
220
947
 
221
- // Save results
222
- if (options.save) {
223
- spinner.start('Saving results...');
224
- const storage = createStorage({ fileConfig: config });
225
- const path = await storage.save(result.manifest);
226
- spinner.succeed(`Results saved: ${path}`);
227
- }
948
+ // Exit with error if any scenarios failed or regression detected
949
+ const hasFailures = results.some((r) => !r.success);
950
+ const hasRegression = baselineResult?.hasRegression || false;
228
951
 
229
- // Exit with error if any tests failed
230
- if (!result.success) {
952
+ if (hasFailures || hasRegression) {
231
953
  process.exit(1);
232
954
  }
233
955
  } catch (error) {