@artemiskit/cli 0.1.8 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +139 -0
- package/bin/artemis.ts +0 -0
- package/dist/index.js +72343 -34002
- package/dist/src/cli.d.ts.map +1 -1
- package/dist/src/commands/baseline.d.ts +9 -0
- package/dist/src/commands/baseline.d.ts.map +1 -0
- package/dist/src/commands/compare.d.ts.map +1 -1
- package/dist/src/commands/init.d.ts.map +1 -1
- package/dist/src/commands/redteam.d.ts.map +1 -1
- package/dist/src/commands/run.d.ts.map +1 -1
- package/dist/src/commands/stress.d.ts.map +1 -1
- package/dist/src/config/loader.d.ts +3 -1
- package/dist/src/config/loader.d.ts.map +1 -1
- package/dist/src/config/schema.d.ts +16 -0
- package/dist/src/config/schema.d.ts.map +1 -1
- package/dist/src/ui/index.d.ts +3 -1
- package/dist/src/ui/index.d.ts.map +1 -1
- package/dist/src/ui/panels.d.ts +21 -0
- package/dist/src/ui/panels.d.ts.map +1 -1
- package/dist/src/ui/prompts.d.ts +92 -0
- package/dist/src/ui/prompts.d.ts.map +1 -0
- package/dist/src/utils/adapter.d.ts.map +1 -1
- package/package.json +6 -6
- package/src/cli.ts +2 -0
- package/src/commands/baseline.ts +473 -0
- package/src/commands/compare.ts +25 -0
- package/src/commands/init.ts +173 -69
- package/src/commands/redteam.ts +63 -10
- package/src/commands/run.ts +863 -141
- package/src/commands/stress.ts +76 -3
- package/src/config/loader.ts +5 -2
- package/src/config/schema.ts +4 -0
- package/src/ui/index.ts +19 -0
- package/src/ui/panels.ts +153 -5
- package/src/ui/prompts.ts +749 -0
- package/src/utils/adapter.ts +15 -0
package/src/commands/run.ts
CHANGED
|
@@ -2,23 +2,33 @@
|
|
|
2
2
|
* Run command - Execute test scenarios
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
|
+
import { basename } from 'node:path';
|
|
5
6
|
import {
|
|
7
|
+
type BaselineStorageAdapter,
|
|
6
8
|
type RedactionConfig,
|
|
9
|
+
type RunManifest,
|
|
7
10
|
createAdapter,
|
|
8
11
|
parseScenarioFile,
|
|
12
|
+
resolveScenarioPaths,
|
|
9
13
|
runScenario,
|
|
10
14
|
} from '@artemiskit/core';
|
|
11
15
|
import chalk from 'chalk';
|
|
12
16
|
import { Command } from 'commander';
|
|
13
17
|
import { loadConfig } from '../config/loader.js';
|
|
18
|
+
import type { ArtemisConfig } from '../config/schema.js';
|
|
14
19
|
import {
|
|
15
20
|
createSpinner,
|
|
16
21
|
formatDuration,
|
|
17
22
|
getProviderErrorContext,
|
|
18
23
|
icons,
|
|
24
|
+
isInteractive,
|
|
19
25
|
isTTY,
|
|
20
26
|
padText,
|
|
27
|
+
promptModel,
|
|
28
|
+
promptProvider,
|
|
29
|
+
promptScenarios,
|
|
21
30
|
renderError,
|
|
31
|
+
renderFailureReason,
|
|
22
32
|
renderProgressBar,
|
|
23
33
|
renderSummaryPanel,
|
|
24
34
|
} from '../ui/index.js';
|
|
@@ -42,21 +52,493 @@ interface RunOptions {
|
|
|
42
52
|
config?: string;
|
|
43
53
|
redact?: boolean;
|
|
44
54
|
redactPatterns?: string[];
|
|
55
|
+
parallel?: number;
|
|
56
|
+
interactive?: boolean;
|
|
57
|
+
/** CI mode - machine-readable output, no colors/spinners */
|
|
58
|
+
ci?: boolean;
|
|
59
|
+
/** Summary format: json, text, or security */
|
|
60
|
+
summary?: 'json' | 'text' | 'security';
|
|
61
|
+
/** Compare against baseline and detect regression */
|
|
62
|
+
baseline?: boolean;
|
|
63
|
+
/** Regression threshold (0-1), default 0.05 (5%) */
|
|
64
|
+
threshold?: number;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/**
 * Outcome of executing a single scenario file.
 */
interface ScenarioRunResult {
  /** Path of the scenario file that was executed. */
  scenarioPath: string;

  /** Scenario name; falls back to the file's basename when the run throws. */
  scenarioName: string;

  /** Whether the scenario run succeeded. */
  success: boolean;

  /**
   * Manifest produced by the runner. When a run throws, callers in this file
   * store an empty placeholder object here, so nested fields may be missing.
   */
  manifest: RunManifest;

  /** Error message captured when the scenario could not be run. */
  error?: string;
}
|
|
74
|
+
|
|
75
|
+
/**
 * The subset of spinner operations the run command relies on.
 *
 * Keeping this narrow lets CI mode substitute an object of no-op functions
 * for the real spinner.
 */
interface SpinnerLike {
  /** Begin spinning, optionally with a status message. */
  start(text?: string): void;

  /** Stop spinning without printing a final status. */
  stop(): void;

  /** Finish with a success status. */
  succeed(text?: string): void;

  /** Finish with a failure status. */
  fail(text?: string): void;

  /** Finish with an informational status. */
  info(text?: string): void;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Machine-readable summary of a whole run, intended for CI consumption.
 */
interface CISummary {
  /** True when no scenario failed. */
  success: boolean;

  /** Scenario-level counts. */
  scenarios: { total: number; passed: number; failed: number };

  /** Test-case counts aggregated over every scenario. */
  cases: {
    total: number;
    passed: number;
    failed: number;
    /** Passed cases divided by total cases (0 when there are no cases). */
    successRate: number;
  };

  /** Summed scenario durations, raw and human-readable. */
  duration: { totalMs: number; formatted: string };

  /** One entry per scenario run. */
  runs: {
    runId: string;
    scenario: string;
    success: boolean;
    successRate: number;
    passedCases: number;
    failedCases: number;
    totalCases: number;
    durationMs: number;
  }[];

  /** Baseline comparison details; only present when a comparison was requested. */
  baseline?: {
    compared: boolean;
    hasRegression: boolean;
    /** Regression threshold used for the comparison. */
    threshold: number;
    /** Differences relative to the baseline (units not visible here — TODO confirm). */
    delta?: { successRate: number; latency: number; tokens: number };
  };
}
|
|
127
|
+
|
|
128
|
+
/**
 * Summary shaped for security / red-team style reporting.
 */
interface SecuritySummary {
  /** Overall risk rating derived from the aggregate success rate. */
  overallRisk: 'low' | 'medium' | 'high' | 'critical';

  /** Passed cases divided by total cases. */
  successRate: number;

  /** Failure counts bucketed by severity. */
  vulnerabilities: Record<'critical' | 'high' | 'medium' | 'low', number>;

  /** Human-readable follow-up suggestions. */
  recommendations: string[];
}
|
|
142
|
+
|
|
143
|
+
/**
 * Type guard: does this storage adapter expose the baseline operations?
 *
 * Only the presence of the four baseline members is checked (via `in`), not
 * their types.
 */
function isBaselineStorage(storage: unknown): storage is BaselineStorageAdapter {
  if (typeof storage !== 'object' || storage === null) {
    return false;
  }

  const candidate: object = storage;
  const requiredMembers = ['setBaseline', 'getBaseline', 'listBaselines', 'compareToBaseline'];
  return requiredMembers.every((member) => member in candidate);
}
|
|
156
|
+
|
|
157
|
+
/**
 * Aggregate per-scenario results into the CI summary structure.
 *
 * Metrics missing from a manifest (for example the empty placeholder stored
 * for a scenario that threw) are counted as zero.
 *
 * @param results - Results of every scenario in the run.
 * @returns Scenario/case totals, summed duration and one entry per run.
 */
function buildCISummary(results: ScenarioRunResult[]): CISummary {
  const runs: CISummary['runs'] = [];
  let passedScenarios = 0;
  let totalCases = 0;
  let passedCases = 0;
  let failedCases = 0;
  let totalDuration = 0;

  for (const entry of results) {
    const metrics = entry.manifest.metrics;
    const caseTotal = metrics?.total_cases || 0;
    const casePassed = metrics?.passed_cases || 0;
    const caseFailed = metrics?.failed_cases || 0;
    const durationMs = entry.manifest.duration_ms || 0;

    if (entry.success) {
      passedScenarios += 1;
    }
    totalCases += caseTotal;
    passedCases += casePassed;
    failedCases += caseFailed;
    totalDuration += durationMs;

    runs.push({
      runId: entry.manifest.run_id || '',
      scenario: entry.scenarioName,
      success: entry.success,
      successRate: metrics?.success_rate || 0,
      passedCases: casePassed,
      failedCases: caseFailed,
      totalCases: caseTotal,
      durationMs,
    });
  }

  const totalScenarios = results.length;
  const failedScenarios = totalScenarios - passedScenarios;

  return {
    success: failedScenarios === 0,
    scenarios: {
      total: totalScenarios,
      passed: passedScenarios,
      failed: failedScenarios,
    },
    cases: {
      total: totalCases,
      passed: passedCases,
      failed: failedCases,
      // Guard against division by zero when nothing was executed.
      successRate: totalCases > 0 ? passedCases / totalCases : 0,
    },
    duration: {
      totalMs: totalDuration,
      formatted: formatDuration(totalDuration),
    },
    runs,
  };
}
|
|
199
|
+
|
|
200
|
+
/**
 * Derive the security-oriented summary used by `--summary security`.
 *
 * The risk rating comes from the aggregate case success rate:
 * >= 95% low, >= 80% medium, >= 50% high, otherwise critical. Per-failure
 * severity is not available here, so every failed case is attributed to the
 * bucket matching the overall rating.
 *
 * @param results - Results of every scenario in the run.
 * @returns Risk rating, success rate, severity buckets and recommendations.
 */
function buildSecuritySummary(results: ScenarioRunResult[]): SecuritySummary {
  let totalCases = 0;
  let passedCases = 0;
  for (const { manifest } of results) {
    totalCases += manifest.metrics?.total_cases || 0;
    passedCases += manifest.metrics?.passed_cases || 0;
  }
  const successRate = totalCases > 0 ? passedCases / totalCases : 0;

  // Ordered from strictest floor to loosest; first match wins.
  const riskTiers: Array<[number, SecuritySummary['overallRisk']]> = [
    [0.95, 'low'],
    [0.8, 'medium'],
    [0.5, 'high'],
  ];
  const matchedTier = riskTiers.find(([floor]) => successRate >= floor);
  const overallRisk: SecuritySummary['overallRisk'] = matchedTier ? matchedTier[1] : 'critical';

  // Simplified severity accounting: all failures land in the overall bucket.
  const vulnerabilities: SecuritySummary['vulnerabilities'] = {
    critical: 0,
    high: 0,
    medium: 0,
    low: 0,
  };
  vulnerabilities[overallRisk] = totalCases - passedCases;

  let recommendations: string[];
  if (successRate < 1) {
    recommendations = [
      'Review failed test cases for potential issues',
      'Consider adding more comprehensive test coverage',
    ];
    if (successRate < 0.8) {
      recommendations.push('Investigate root causes of failures before deployment');
    }
  } else {
    recommendations = ['All tests passing - continue monitoring'];
  }

  return { overallRisk, successRate, vulnerabilities, recommendations };
}
|
|
237
|
+
|
|
238
|
+
/**
 * Execute one scenario file without printing anything.
 *
 * Intended for parallel execution, where interleaved per-case output from
 * several scenarios would be unreadable. No progress callbacks are passed to
 * the runner.
 *
 * @param scenarioPath - Path of the scenario file to parse and run.
 * @param options - Parsed CLI options.
 * @param config - Loaded config file, or null when none was found.
 * @returns The scenario's name, success flag and run manifest.
 */
async function runSingleScenarioQuiet(
  scenarioPath: string,
  options: RunOptions,
  config: ArtemisConfig | null
): Promise<ScenarioRunResult> {
  const scenario = await parseScenarioFile(scenarioPath);

  // Provider/model precedence: CLI > Scenario > Config > Default
  const providerChoice = resolveProviderWithSource(
    options.provider,
    scenario.provider,
    config?.provider
  );
  const modelChoice = resolveModelWithSource(options.model, scenario.model, config?.model);

  // Adapter config carries the resolved values together with where each came from.
  const built = buildAdapterConfig({
    provider: providerChoice.provider,
    model: modelChoice.model,
    providerSource: providerChoice.source,
    modelSource: modelChoice.source,
    scenarioConfig: scenario.providerConfig,
    fileConfig: config,
  });
  const client = await createAdapter(built.adapterConfig);

  // Redaction is driven purely by CLI flags.
  const redaction: RedactionConfig | undefined = options.redact
    ? {
        enabled: true,
        patterns: options.redactPatterns,
        redactPrompts: true,
        redactResponses: true,
        redactMetadata: false,
        replacement: '[REDACTED]',
      }
    : undefined;

  const { success, manifest } = await runScenario({
    scenario,
    client,
    project: config?.project || process.env.ARTEMIS_PROJECT || 'default',
    resolvedConfig: built.resolvedConfig,
    tags: options.tags,
    concurrency: Number.parseInt(String(options.concurrency)) || 1,
    timeout: options.timeout ? Number.parseInt(String(options.timeout)) : undefined,
    retries: options.retries ? Number.parseInt(String(options.retries)) : undefined,
    redaction,
  });

  return {
    scenarioPath,
    scenarioName: scenario.name,
    success,
    manifest,
  };
}
|
|
306
|
+
|
|
307
|
+
/**
 * Execute one scenario file, printing a line for every completed case.
 *
 * Used for sequential execution. When the scenario is part of a multi-scenario
 * run, a header with the scenario name and file is printed instead of the
 * spinner-driven "Connecting/Connected/Running" messages and the redaction
 * notice.
 *
 * @param scenarioPath - Path of the scenario file to parse and run.
 * @param options - Parsed CLI options.
 * @param config - Loaded config file, or null when none was found.
 * @param spinner - Spinner used for connection status in single-scenario runs.
 * @param isMultiScenario - True when more than one scenario is being run.
 * @returns The scenario's name, success flag and run manifest.
 */
async function runSingleScenario(
  scenarioPath: string,
  options: RunOptions,
  config: ArtemisConfig | null,
  spinner: SpinnerLike,
  isMultiScenario: boolean
): Promise<ScenarioRunResult> {
  const scenario = await parseScenarioFile(scenarioPath);
  const isSoloRun = !isMultiScenario;

  if (isMultiScenario) {
    console.log();
    console.log(chalk.bold.cyan(`━━━ ${scenario.name} ━━━`));
    console.log(chalk.dim(`File: ${basename(scenarioPath)}`));
    console.log();
  }

  // Provider/model precedence: CLI > Scenario > Config > Default
  const providerChoice = resolveProviderWithSource(
    options.provider,
    scenario.provider,
    config?.provider
  );
  const modelChoice = resolveModelWithSource(options.model, scenario.model, config?.model);
  const provider = providerChoice.provider;

  if (isSoloRun) {
    spinner.start(`Connecting to ${provider}...`);
  }
  const built = buildAdapterConfig({
    provider,
    model: modelChoice.model,
    providerSource: providerChoice.source,
    modelSource: modelChoice.source,
    scenarioConfig: scenario.providerConfig,
    fileConfig: config,
  });
  const client = await createAdapter(built.adapterConfig);
  if (isSoloRun) {
    spinner.succeed(`Connected to ${provider}`);
    console.log();
    console.log(chalk.bold(`Running scenario: ${scenario.name}`));
    console.log();
  }

  // Redaction is driven purely by CLI flags.
  let redaction: RedactionConfig | undefined;
  if (options.redact) {
    redaction = {
      enabled: true,
      patterns: options.redactPatterns,
      redactPrompts: true,
      redactResponses: true,
      redactMetadata: false,
      replacement: '[REDACTED]',
    };
    if (isSoloRun) {
      const patternNote = options.redactPatterns
        ? ` with patterns: ${options.redactPatterns.join(', ')}`
        : ' (default patterns)';
      console.log(chalk.dim(`Redaction enabled${patternNote}`));
      console.log();
    }
  }

  // Progress bookkeeping for the per-case output lines.
  const totalCases = scenario.cases.length;
  let completedCases = 0;

  // Column widths so the per-case lines align.
  const idColumnWidth = Math.max(...scenario.cases.map(({ id }) => id.length));
  const scoreColumnWidth = 6; // "(100%)"
  const durationColumnWidth = 6; // "10.0s" or "999ms"

  const outcome = await runScenario({
    scenario,
    client,
    project: config?.project || process.env.ARTEMIS_PROJECT || 'default',
    resolvedConfig: built.resolvedConfig,
    tags: options.tags,
    concurrency: Number.parseInt(String(options.concurrency)) || 1,
    timeout: options.timeout ? Number.parseInt(String(options.timeout)) : undefined,
    retries: options.retries ? Number.parseInt(String(options.retries)) : undefined,
    redaction,
    onCaseComplete: (caseResult) => {
      completedCases += 1;

      const statusIcon = caseResult.ok ? icons.passed : icons.failed;
      const scoreText = `(${(caseResult.score * 100).toFixed(0)}%)`;
      const durationText = caseResult.latencyMs ? formatDuration(caseResult.latencyMs) : '';

      const idCell = padText(caseResult.id, idColumnWidth);
      const scoreCell = padText(scoreText, scoreColumnWidth, 'right');
      const durationCell = padText(durationText, durationColumnWidth, 'right');

      // Progress bar on a TTY; a plain "[done/total]" counter otherwise (CI/CD friendly).
      const progressSuffix = isTTY
        ? renderProgressBar(completedCases, totalCases, { width: 15 })
        : `[${completedCases}/${totalCases}]`;
      console.log(
        `${statusIcon} ${idCell} ${chalk.dim(scoreCell)} ${chalk.dim(durationCell)} ${progressSuffix}`
      );

      // Failure details are only shown in verbose mode and only when a reason exists.
      if (caseResult.ok || !options.verbose || !caseResult.reason) {
        return;
      }
      console.log(renderFailureReason(caseResult.reason, { matcherType: caseResult.matcherType }));
    },
    onProgress: (message) => {
      if (!options.verbose) {
        return;
      }
      console.log(chalk.dim(message));
    },
  });

  return {
    scenarioPath,
    scenarioName: scenario.name,
    success: outcome.success,
    manifest: outcome.manifest,
  };
}
|
|
446
|
+
|
|
447
|
+
/**
 * Run scenarios concurrently, keeping at most `parallelLimit` in flight.
 *
 * Each scenario is executed through `runSingleScenarioQuiet`, and one progress
 * line is printed as each scenario finishes. Exactly one result is recorded
 * per input path:
 *
 * - If the scenario throws, a failed result with an empty placeholder
 *   manifest and the error message is recorded.
 * - If the scenario runs but saving its manifest fails, the manifest is kept,
 *   the result is marked failed and `error` describes the save failure. A save
 *   failure therefore still fails the overall run, but the scenario is no
 *   longer recorded (and counted in the progress display) twice.
 *
 * @param scenarioPaths - Scenario files to execute.
 * @param options - Parsed CLI options; `options.save` controls persistence.
 * @param config - Loaded config file, or null when none was found.
 * @param parallelLimit - Maximum number of scenarios in flight. Values below 1
 *   (or NaN) are treated as 1 so the scheduler can always make progress.
 * @param storage - Storage adapter used to persist manifests.
 * @returns One result per input path, in the same order as `scenarioPaths`.
 */
async function runScenariosInParallel(
  scenarioPaths: string[],
  options: RunOptions,
  config: ArtemisConfig | null,
  parallelLimit: number,
  storage: ReturnType<typeof createStorage>
): Promise<ScenarioRunResult[]> {
  const totalCount = scenarioPaths.length;
  // Pre-sized so each worker can write its result at the scenario's input index.
  const results = new Array<ScenarioRunResult>(totalCount);
  let completedCount = 0;

  // A limit of 0, a negative number or NaN would otherwise start no work at all.
  const limit = parallelLimit >= 1 ? Math.floor(parallelLimit) : 1;

  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  // Print one line per finished scenario: progress bar on a TTY, counter otherwise.
  const updateProgress = (scenarioName: string, success: boolean): void => {
    completedCount++;
    const icon = success ? icons.passed : icons.failed;
    const status = success ? chalk.green('passed') : chalk.red('failed');

    if (isTTY) {
      const progressBar = renderProgressBar(completedCount, totalCount, { width: 20 });
      console.log(`${icon} ${scenarioName} ${status} ${progressBar}`);
    } else {
      console.log(`${icon} ${scenarioName} ${status} [${completedCount}/${totalCount}]`);
    }
  };

  // Run (and optionally save) one scenario; never rejects.
  const processScenario = async (path: string, index: number): Promise<void> => {
    let outcome: ScenarioRunResult;
    try {
      outcome = await runSingleScenarioQuiet(path, options, config);
    } catch (error) {
      outcome = {
        scenarioPath: path,
        scenarioName: basename(path),
        success: false,
        manifest: {} as RunManifest,
        error: describeError(error),
      };
    }

    // Persist separately so a storage failure cannot be mistaken for a run failure.
    if (outcome.error === undefined && options.save && outcome.manifest.run_id) {
      try {
        await storage.save(outcome.manifest);
      } catch (error) {
        outcome = {
          ...outcome,
          success: false,
          error: `Failed to save results: ${describeError(error)}`,
        };
      }
    }

    results[index] = outcome;
    updateProgress(outcome.scenarioName, outcome.success);
  };

  // All workers pull from one shared iterator, so each scenario is claimed
  // exactly once and a worker picks up the next one as soon as it is free.
  const pending = scenarioPaths.entries();
  const worker = async (): Promise<void> => {
    for (const [index, path] of pending) {
      await processScenario(path, index);
    }
  };

  const workerCount = Math.min(limit, totalCount);
  await Promise.all(Array.from({ length: workerCount }, () => worker()));

  return results;
}
|
|
46
522
|
|
|
47
523
|
export function runCommand(): Command {
|
|
48
524
|
const cmd = new Command('run');
|
|
49
525
|
|
|
50
526
|
cmd
|
|
51
|
-
.description(
|
|
52
|
-
|
|
527
|
+
.description(
|
|
528
|
+
'Run test scenarios against an LLM. Accepts a file path, directory, or glob pattern.'
|
|
529
|
+
)
|
|
530
|
+
.argument(
|
|
531
|
+
'[scenario]',
|
|
532
|
+
'Path to scenario file, directory, or glob pattern (e.g., scenarios/**/*.yaml)'
|
|
533
|
+
)
|
|
53
534
|
.option('-p, --provider <provider>', 'Provider to use (openai, azure-openai, vercel-ai)')
|
|
54
535
|
.option('-m, --model <model>', 'Model to use')
|
|
55
536
|
.option('-o, --output <dir>', 'Output directory for results')
|
|
56
537
|
.option('-v, --verbose', 'Verbose output')
|
|
57
538
|
.option('-t, --tags <tags...>', 'Filter test cases by tags')
|
|
58
539
|
.option('--save', 'Save results to storage', true)
|
|
59
|
-
.option('-c, --concurrency <number>', 'Number of concurrent test cases', '1')
|
|
540
|
+
.option('-c, --concurrency <number>', 'Number of concurrent test cases per scenario', '1')
|
|
541
|
+
.option('--parallel <number>', 'Number of scenarios to run in parallel (default: sequential)')
|
|
60
542
|
.option('--timeout <ms>', 'Timeout per test case in milliseconds')
|
|
61
543
|
.option('--retries <number>', 'Number of retries per test case')
|
|
62
544
|
.option('--config <path>', 'Path to config file')
|
|
@@ -65,169 +547,409 @@ export function runCommand(): Command {
|
|
|
65
547
|
'--redact-patterns <patterns...>',
|
|
66
548
|
'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
|
|
67
549
|
)
|
|
68
|
-
.
|
|
69
|
-
|
|
70
|
-
|
|
550
|
+
.option('-i, --interactive', 'Enable interactive mode for scenario/provider selection')
|
|
551
|
+
.option('--ci', 'CI mode: machine-readable output, no colors/spinners, JSON summary')
|
|
552
|
+
.option(
|
|
553
|
+
'--summary <format>',
|
|
554
|
+
'Summary output format: json, text, or security (implies --ci for json/security)',
|
|
555
|
+
'text'
|
|
556
|
+
)
|
|
557
|
+
.option('--baseline', 'Compare against baseline and detect regression')
|
|
558
|
+
.option('--threshold <number>', 'Regression threshold (0-1), e.g., 0.05 for 5%', '0.05')
|
|
559
|
+
.action(async (scenarioPath: string | undefined, options: RunOptions) => {
|
|
560
|
+
// Determine CI mode: explicit flag, environment variable, or summary format that implies CI
|
|
561
|
+
const isCIMode =
|
|
562
|
+
options.ci ||
|
|
563
|
+
process.env.CI === 'true' ||
|
|
564
|
+
options.summary === 'json' ||
|
|
565
|
+
options.summary === 'security';
|
|
566
|
+
|
|
567
|
+
// In CI mode, use a no-op spinner
|
|
568
|
+
const spinner = isCIMode
|
|
569
|
+
? {
|
|
570
|
+
start: () => {},
|
|
571
|
+
stop: () => {},
|
|
572
|
+
succeed: () => {},
|
|
573
|
+
fail: () => {},
|
|
574
|
+
info: () => {},
|
|
575
|
+
}
|
|
576
|
+
: createSpinner('Loading configuration...');
|
|
577
|
+
|
|
578
|
+
if (!isCIMode) {
|
|
579
|
+
spinner.start();
|
|
580
|
+
}
|
|
71
581
|
|
|
72
582
|
try {
|
|
73
583
|
// Load config file if present
|
|
74
584
|
const config = await loadConfig(options.config);
|
|
75
|
-
if (
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
585
|
+
if (!isCIMode) {
|
|
586
|
+
if (config) {
|
|
587
|
+
spinner.succeed(`Loaded config from ${config._path}`);
|
|
588
|
+
} else {
|
|
589
|
+
spinner.info('No config file found, using defaults');
|
|
590
|
+
}
|
|
79
591
|
}
|
|
80
592
|
|
|
81
|
-
//
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
spinner.succeed(`Loaded scenario: ${scenario.name}`);
|
|
85
|
-
|
|
86
|
-
// Resolve provider and model with precedence and source tracking:
|
|
87
|
-
// CLI > Scenario > Config > Default
|
|
88
|
-
const { provider, source: providerSource } = resolveProviderWithSource(
|
|
89
|
-
options.provider,
|
|
90
|
-
scenario.provider,
|
|
91
|
-
config?.provider
|
|
92
|
-
);
|
|
93
|
-
const { model, source: modelSource } = resolveModelWithSource(
|
|
94
|
-
options.model,
|
|
95
|
-
scenario.model,
|
|
96
|
-
config?.model
|
|
97
|
-
);
|
|
593
|
+
// Determine if we should use interactive mode (never in CI mode)
|
|
594
|
+
const useInteractive =
|
|
595
|
+
!isCIMode && (options.interactive || (!scenarioPath && isInteractive()));
|
|
98
596
|
|
|
99
|
-
//
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
model,
|
|
104
|
-
providerSource,
|
|
105
|
-
modelSource,
|
|
106
|
-
scenarioConfig: scenario.providerConfig,
|
|
107
|
-
fileConfig: config,
|
|
108
|
-
});
|
|
109
|
-
const client = await createAdapter(adapterConfig);
|
|
110
|
-
spinner.succeed(`Connected to ${provider}`);
|
|
597
|
+
// Interactive provider/model selection if requested
|
|
598
|
+
if (useInteractive && !options.provider) {
|
|
599
|
+
spinner.stop();
|
|
600
|
+
console.log(chalk.cyan('\n Interactive mode enabled\n'));
|
|
111
601
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
console.log();
|
|
602
|
+
const provider = await promptProvider('Select a provider:');
|
|
603
|
+
options.provider = provider;
|
|
115
604
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
605
|
+
const model = await promptModel(provider, 'Select a model:');
|
|
606
|
+
options.model = model;
|
|
607
|
+
|
|
608
|
+
console.log(''); // spacing
|
|
609
|
+
spinner.start('Discovering scenarios...');
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
// If no scenario path provided, try to find scenarios or prompt
|
|
613
|
+
let resolvedScenarioPath = scenarioPath;
|
|
614
|
+
if (!resolvedScenarioPath) {
|
|
615
|
+
// Try default scenarios directory
|
|
616
|
+
const defaultPath = config?.scenariosDir || './scenarios';
|
|
617
|
+
spinner.start(`Looking for scenarios in ${defaultPath}...`);
|
|
618
|
+
|
|
619
|
+
try {
|
|
620
|
+
const defaultScenarios = await resolveScenarioPaths(defaultPath);
|
|
621
|
+
if (defaultScenarios.length > 0) {
|
|
622
|
+
spinner.stop();
|
|
623
|
+
|
|
624
|
+
if (useInteractive) {
|
|
625
|
+
// Let user select which scenarios to run
|
|
626
|
+
const scenarioChoices = await Promise.all(
|
|
627
|
+
defaultScenarios.map(async (path) => {
|
|
628
|
+
try {
|
|
629
|
+
const scenario = await parseScenarioFile(path);
|
|
630
|
+
return { path, name: scenario.name || basename(path) };
|
|
631
|
+
} catch {
|
|
632
|
+
return { path, name: basename(path) };
|
|
633
|
+
}
|
|
634
|
+
})
|
|
635
|
+
);
|
|
636
|
+
|
|
637
|
+
const selectedPaths = await promptScenarios(
|
|
638
|
+
scenarioChoices,
|
|
639
|
+
'Select scenarios to run:'
|
|
640
|
+
);
|
|
641
|
+
|
|
642
|
+
if (selectedPaths.length === 0) {
|
|
643
|
+
console.log(chalk.yellow('\nNo scenarios selected. Exiting.'));
|
|
644
|
+
process.exit(0);
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
// Use the first selected scenario or create a temp pattern
|
|
648
|
+
resolvedScenarioPath =
|
|
649
|
+
selectedPaths.length === 1 ? selectedPaths[0] : `{${selectedPaths.join(',')}}`;
|
|
650
|
+
|
|
651
|
+
console.log(''); // spacing
|
|
652
|
+
spinner.start('Preparing scenarios...');
|
|
653
|
+
} else {
|
|
654
|
+
spinner.succeed(`Found ${defaultScenarios.length} scenarios in ${defaultPath}`);
|
|
655
|
+
resolvedScenarioPath = defaultPath;
|
|
656
|
+
}
|
|
657
|
+
} else {
|
|
658
|
+
spinner.fail(`No scenarios found in ${defaultPath}`);
|
|
659
|
+
console.log();
|
|
660
|
+
console.log(chalk.yellow('Please provide a scenario path:'));
|
|
661
|
+
console.log(chalk.dim(' artemiskit run <path-to-scenario.yaml>'));
|
|
662
|
+
console.log(chalk.dim(' artemiskit run scenarios/'));
|
|
663
|
+
console.log(chalk.dim(' artemiskit run "scenarios/**/*.yaml"'));
|
|
664
|
+
process.exit(1);
|
|
665
|
+
}
|
|
666
|
+
} catch {
|
|
667
|
+
spinner.fail('No scenario path provided');
|
|
668
|
+
console.log();
|
|
669
|
+
console.log(chalk.yellow('Usage: artemiskit run <scenario>'));
|
|
670
|
+
console.log(chalk.dim(' <scenario> can be a file, directory, or glob pattern'));
|
|
671
|
+
process.exit(1);
|
|
672
|
+
}
|
|
673
|
+
}
|
|
674
|
+
|
|
675
|
+
// Resolve scenario paths (handles files, directories, and globs)
|
|
676
|
+
spinner.start('Discovering scenarios...');
|
|
677
|
+
const scenarioPaths = await resolveScenarioPaths(resolvedScenarioPath);
|
|
678
|
+
|
|
679
|
+
if (scenarioPaths.length === 0) {
|
|
680
|
+
spinner.fail('No scenario files found');
|
|
132
681
|
console.log();
|
|
682
|
+
console.log(chalk.yellow(`No .yaml or .yml files found matching: ${scenarioPath}`));
|
|
683
|
+
console.log(chalk.dim('Make sure the path exists and contains valid scenario files.'));
|
|
684
|
+
process.exit(1);
|
|
133
685
|
}
|
|
134
686
|
|
|
135
|
-
|
|
136
|
-
const
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
const
|
|
687
|
+
const isMultiScenario = scenarioPaths.length > 1;
|
|
688
|
+
const parallelLimit = options.parallel ? Number.parseInt(String(options.parallel)) : 0;
|
|
689
|
+
const runInParallel = parallelLimit > 0 && isMultiScenario;
|
|
690
|
+
|
|
691
|
+
if (isMultiScenario) {
|
|
692
|
+
const modeStr = runInParallel
|
|
693
|
+
? chalk.cyan(`parallel (${parallelLimit} concurrent)`)
|
|
694
|
+
: chalk.dim('sequential');
|
|
695
|
+
spinner.succeed(`Found ${scenarioPaths.length} scenario files`);
|
|
696
|
+
console.log();
|
|
697
|
+
console.log(chalk.bold(`Running ${scenarioPaths.length} scenarios ${modeStr}...`));
|
|
698
|
+
console.log();
|
|
699
|
+
} else {
|
|
700
|
+
spinner.succeed('Loaded scenario file');
|
|
701
|
+
}
|
|
702
|
+
|
|
703
|
+
// Run all scenarios
|
|
704
|
+
const storage = createStorage({ fileConfig: config });
|
|
705
|
+
let results: ScenarioRunResult[];
|
|
706
|
+
|
|
707
|
+
if (runInParallel) {
|
|
708
|
+
// Parallel execution
|
|
709
|
+
results = await runScenariosInParallel(
|
|
710
|
+
scenarioPaths,
|
|
711
|
+
options,
|
|
712
|
+
config,
|
|
713
|
+
parallelLimit,
|
|
714
|
+
storage
|
|
715
|
+
);
|
|
716
|
+
} else {
|
|
717
|
+
// Sequential execution
|
|
718
|
+
results = [];
|
|
719
|
+
for (const path of scenarioPaths) {
|
|
720
|
+
try {
|
|
721
|
+
const result = await runSingleScenario(
|
|
722
|
+
path,
|
|
723
|
+
options,
|
|
724
|
+
config,
|
|
725
|
+
spinner,
|
|
726
|
+
isMultiScenario
|
|
727
|
+
);
|
|
728
|
+
results.push(result);
|
|
729
|
+
|
|
730
|
+
// Display per-scenario summary
|
|
731
|
+
const summaryData = {
|
|
732
|
+
passed: result.manifest.metrics.passed_cases,
|
|
733
|
+
failed: result.manifest.metrics.failed_cases,
|
|
734
|
+
skipped: 0,
|
|
735
|
+
successRate: result.manifest.metrics.success_rate * 100,
|
|
736
|
+
duration: result.manifest.duration_ms,
|
|
737
|
+
title: isMultiScenario ? result.scenarioName.toUpperCase() : 'TEST RESULTS',
|
|
738
|
+
};
|
|
739
|
+
console.log();
|
|
740
|
+
console.log(renderSummaryPanel(summaryData));
|
|
741
|
+
|
|
742
|
+
// Show additional metrics
|
|
743
|
+
console.log();
|
|
170
744
|
console.log(
|
|
171
|
-
|
|
745
|
+
chalk.dim(
|
|
746
|
+
`Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
|
|
747
|
+
)
|
|
172
748
|
);
|
|
173
|
-
|
|
174
|
-
//
|
|
749
|
+
|
|
750
|
+
// Show redaction info if enabled
|
|
751
|
+
if (result.manifest.redaction?.enabled) {
|
|
752
|
+
const r = result.manifest.redaction;
|
|
753
|
+
console.log(
|
|
754
|
+
chalk.dim(
|
|
755
|
+
`Redactions: ${r.summary.totalRedactions} (${r.summary.promptsRedacted} prompts, ${r.summary.responsesRedacted} responses)`
|
|
756
|
+
)
|
|
757
|
+
);
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
// Save results
|
|
761
|
+
if (options.save) {
|
|
762
|
+
const savedPath = await storage.save(result.manifest);
|
|
763
|
+
console.log(chalk.dim(`Saved: ${savedPath}`));
|
|
764
|
+
}
|
|
765
|
+
} catch (error) {
|
|
766
|
+
// Record failed scenario
|
|
767
|
+
console.log();
|
|
768
|
+
console.log(chalk.red(`${icons.failed} Failed to run: ${basename(path)}`));
|
|
769
|
+
if (options.verbose) {
|
|
770
|
+
console.log(chalk.dim((error as Error).message));
|
|
771
|
+
}
|
|
772
|
+
results.push({
|
|
773
|
+
scenarioPath: path,
|
|
774
|
+
scenarioName: basename(path),
|
|
775
|
+
success: false,
|
|
776
|
+
manifest: {} as RunManifest,
|
|
777
|
+
});
|
|
778
|
+
}
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
|
|
782
|
+
// Build CI summary (used for CI mode output and baseline comparison)
|
|
783
|
+
const ciSummary = buildCISummary(results);
|
|
784
|
+
|
|
785
|
+
// Baseline comparison (if enabled)
|
|
786
|
+
let baselineResult: {
|
|
787
|
+
hasRegression: boolean;
|
|
788
|
+
threshold: number;
|
|
789
|
+
delta?: { successRate: number; latency: number; tokens: number };
|
|
790
|
+
} | null = null;
|
|
791
|
+
|
|
792
|
+
if (options.baseline && results.length > 0) {
|
|
793
|
+
const regressionThreshold = Number.parseFloat(String(options.threshold)) || 0.05;
|
|
794
|
+
|
|
795
|
+
// Check each scenario against its baseline
|
|
796
|
+
for (const result of results) {
|
|
797
|
+
if (!result.manifest.run_id) continue;
|
|
798
|
+
|
|
799
|
+
if (isBaselineStorage(storage) && storage.compareToBaseline) {
|
|
800
|
+
try {
|
|
801
|
+
const comparison = await storage.compareToBaseline(
|
|
802
|
+
result.manifest.run_id,
|
|
803
|
+
regressionThreshold
|
|
804
|
+
);
|
|
805
|
+
|
|
806
|
+
if (comparison) {
|
|
807
|
+
baselineResult = {
|
|
808
|
+
hasRegression: comparison.hasRegression,
|
|
809
|
+
threshold: comparison.regressionThreshold,
|
|
810
|
+
delta: comparison.comparison.delta,
|
|
811
|
+
};
|
|
812
|
+
|
|
813
|
+
// Add baseline info to CI summary
|
|
814
|
+
ciSummary.baseline = {
|
|
815
|
+
compared: true,
|
|
816
|
+
hasRegression: comparison.hasRegression,
|
|
817
|
+
threshold: comparison.regressionThreshold,
|
|
818
|
+
delta: comparison.comparison.delta,
|
|
819
|
+
};
|
|
820
|
+
|
|
821
|
+
if (!isCIMode && comparison.hasRegression) {
|
|
822
|
+
console.log();
|
|
823
|
+
console.log(
|
|
824
|
+
`${icons.failed} ${chalk.red('Regression detected!')} for ${chalk.bold(result.scenarioName)}`
|
|
825
|
+
);
|
|
826
|
+
console.log(
|
|
827
|
+
chalk.dim(
|
|
828
|
+
` Success rate dropped by ${Math.abs(comparison.comparison.delta.successRate * 100).toFixed(1)}% (threshold: ${regressionThreshold * 100}%)`
|
|
829
|
+
)
|
|
830
|
+
);
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
} catch {
|
|
834
|
+
// Baseline comparison failed, continue without it
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
}
|
|
838
|
+
}
|
|
839
|
+
|
|
840
|
+
// Handle CI mode output
|
|
841
|
+
if (isCIMode) {
|
|
842
|
+
if (options.summary === 'json') {
|
|
843
|
+
console.log(JSON.stringify(ciSummary, null, 2));
|
|
844
|
+
} else if (options.summary === 'security') {
|
|
845
|
+
const securitySummary = buildSecuritySummary(results);
|
|
846
|
+
console.log(JSON.stringify(securitySummary, null, 2));
|
|
847
|
+
} else {
|
|
848
|
+
// Default CI text output (minimal)
|
|
849
|
+
const totalCases = ciSummary.cases.total;
|
|
850
|
+
const passedCases = ciSummary.cases.passed;
|
|
851
|
+
const failedCases = ciSummary.cases.failed;
|
|
852
|
+
const successRate = (ciSummary.cases.successRate * 100).toFixed(1);
|
|
853
|
+
|
|
854
|
+
console.log(`ARTEMISKIT_RESULT=${ciSummary.success ? 'PASS' : 'FAIL'}`);
|
|
855
|
+
console.log(`ARTEMISKIT_SCENARIOS_TOTAL=${ciSummary.scenarios.total}`);
|
|
856
|
+
console.log(`ARTEMISKIT_SCENARIOS_PASSED=${ciSummary.scenarios.passed}`);
|
|
857
|
+
console.log(`ARTEMISKIT_SCENARIOS_FAILED=${ciSummary.scenarios.failed}`);
|
|
858
|
+
console.log(`ARTEMISKIT_CASES_TOTAL=${totalCases}`);
|
|
859
|
+
console.log(`ARTEMISKIT_CASES_PASSED=${passedCases}`);
|
|
860
|
+
console.log(`ARTEMISKIT_CASES_FAILED=${failedCases}`);
|
|
861
|
+
console.log(`ARTEMISKIT_SUCCESS_RATE=${successRate}`);
|
|
862
|
+
console.log(`ARTEMISKIT_DURATION_MS=${ciSummary.duration.totalMs}`);
|
|
863
|
+
|
|
864
|
+
if (baselineResult) {
|
|
865
|
+
console.log('ARTEMISKIT_BASELINE_COMPARED=true');
|
|
175
866
|
console.log(
|
|
176
|
-
|
|
867
|
+
`ARTEMISKIT_REGRESSION=${baselineResult.hasRegression ? 'true' : 'false'}`
|
|
177
868
|
);
|
|
869
|
+
if (baselineResult.delta) {
|
|
870
|
+
console.log(
|
|
871
|
+
`ARTEMISKIT_DELTA_SUCCESS_RATE=${(baselineResult.delta.successRate * 100).toFixed(2)}`
|
|
872
|
+
);
|
|
873
|
+
}
|
|
178
874
|
}
|
|
179
875
|
|
|
180
|
-
|
|
181
|
-
|
|
876
|
+
// Also print run IDs for reference
|
|
877
|
+
for (const run of ciSummary.runs) {
|
|
878
|
+
if (run.runId) {
|
|
879
|
+
console.log(
|
|
880
|
+
`ARTEMISKIT_RUN_ID_${run.scenario.toUpperCase().replace(/[^A-Z0-9]/g, '_')}=${run.runId}`
|
|
881
|
+
);
|
|
882
|
+
}
|
|
182
883
|
}
|
|
183
|
-
}
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
884
|
+
}
|
|
885
|
+
} else {
|
|
886
|
+
// Display aggregate summary for multiple scenarios (non-CI mode)
|
|
887
|
+
if (isMultiScenario) {
|
|
888
|
+
console.log();
|
|
889
|
+
console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
|
|
890
|
+
console.log();
|
|
891
|
+
|
|
892
|
+
const totalScenarios = results.length;
|
|
893
|
+
const passedScenarios = results.filter((r) => r.success).length;
|
|
894
|
+
const failedScenarios = totalScenarios - passedScenarios;
|
|
895
|
+
|
|
896
|
+
const totalCases = results.reduce(
|
|
897
|
+
(sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
|
|
898
|
+
0
|
|
899
|
+
);
|
|
900
|
+
const passedCases = results.reduce(
|
|
901
|
+
(sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
|
|
902
|
+
0
|
|
903
|
+
);
|
|
904
|
+
const failedCases = results.reduce(
|
|
905
|
+
(sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
|
|
906
|
+
0
|
|
907
|
+
);
|
|
908
|
+
const totalDuration = results.reduce(
|
|
909
|
+
(sum, r) => sum + (r.manifest.duration_ms || 0),
|
|
910
|
+
0
|
|
911
|
+
);
|
|
912
|
+
|
|
913
|
+
console.log(
|
|
914
|
+
`Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
|
|
915
|
+
);
|
|
916
|
+
console.log(
|
|
917
|
+
`Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
|
|
918
|
+
);
|
|
919
|
+
console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
|
|
920
|
+
|
|
921
|
+
if (runInParallel) {
|
|
922
|
+
console.log(
|
|
923
|
+
`Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
|
|
924
|
+
);
|
|
187
925
|
}
|
|
188
|
-
|
|
189
|
-
});
|
|
926
|
+
console.log();
|
|
190
927
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
console.log();
|
|
205
|
-
console.log(
|
|
206
|
-
chalk.dim(
|
|
207
|
-
`Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
|
|
208
|
-
)
|
|
209
|
-
);
|
|
928
|
+
// List failed scenarios
|
|
929
|
+
const failedResults = results.filter((r) => !r.success);
|
|
930
|
+
if (failedResults.length > 0) {
|
|
931
|
+
console.log(chalk.red('Failed scenarios:'));
|
|
932
|
+
for (const result of failedResults) {
|
|
933
|
+
console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
|
|
934
|
+
if (result.error && options.verbose) {
|
|
935
|
+
console.log(chalk.dim(` ${result.error}`));
|
|
936
|
+
}
|
|
937
|
+
}
|
|
938
|
+
console.log();
|
|
939
|
+
}
|
|
940
|
+
}
|
|
210
941
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
chalk.dim(
|
|
216
|
-
`Redactions: ${r.summary.totalRedactions} (${r.summary.promptsRedacted} prompts, ${r.summary.responsesRedacted} responses)`
|
|
217
|
-
)
|
|
218
|
-
);
|
|
942
|
+
// Show baseline comparison result in non-CI mode
|
|
943
|
+
if (baselineResult && !baselineResult.hasRegression) {
|
|
944
|
+
console.log(`${icons.passed} ${chalk.green('No regression detected')}`);
|
|
945
|
+
}
|
|
219
946
|
}
|
|
220
947
|
|
|
221
|
-
//
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
const storage = createStorage({ fileConfig: config });
|
|
225
|
-
const path = await storage.save(result.manifest);
|
|
226
|
-
spinner.succeed(`Results saved: ${path}`);
|
|
227
|
-
}
|
|
948
|
+
// Exit with error if any scenarios failed or regression detected
|
|
949
|
+
const hasFailures = results.some((r) => !r.success);
|
|
950
|
+
const hasRegression = baselineResult?.hasRegression || false;
|
|
228
951
|
|
|
229
|
-
|
|
230
|
-
if (!result.success) {
|
|
952
|
+
if (hasFailures || hasRegression) {
|
|
231
953
|
process.exit(1);
|
|
232
954
|
}
|
|
233
955
|
} catch (error) {
|