@artemiskit/cli 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,15 +2,20 @@
2
2
  * Run command - Execute test scenarios
3
3
  */
4
4
 
5
+ import { mkdir, writeFile } from 'node:fs/promises';
5
6
  import { basename } from 'node:path';
7
+ import { join } from 'node:path';
6
8
  import {
9
+ type BaselineStorageAdapter,
7
10
  type RedactionConfig,
8
11
  type RunManifest,
9
12
  createAdapter,
13
+ formatCost,
10
14
  parseScenarioFile,
11
15
  resolveScenarioPaths,
12
16
  runScenario,
13
17
  } from '@artemiskit/core';
18
+ import { generateMarkdownReport } from '@artemiskit/reports';
14
19
  import chalk from 'chalk';
15
20
  import { Command } from 'commander';
16
21
  import { loadConfig } from '../config/loader.js';
@@ -53,6 +58,20 @@ interface RunOptions {
53
58
  redactPatterns?: string[];
54
59
  parallel?: number;
55
60
  interactive?: boolean;
61
+ /** CI mode - machine-readable output, no colors/spinners */
62
+ ci?: boolean;
63
+ /** Summary format: json, text, or security */
64
+ summary?: 'json' | 'text' | 'security';
65
+ /** Compare against baseline and detect regression */
66
+ baseline?: boolean;
67
+ /** Regression threshold (0-1), default 0.05 (5%) */
68
+ threshold?: number;
69
+ /** Budget limit in USD - fail if cost exceeds this */
70
+ budget?: number;
71
+ /** Export format: markdown */
72
+ export?: 'markdown';
73
+ /** Output directory for exports */
74
+ exportOutput?: string;
56
75
  }
57
76
 
58
77
  interface ScenarioRunResult {
@@ -63,6 +82,209 @@ interface ScenarioRunResult {
63
82
  error?: string;
64
83
  }
65
84
 
85
/**
 * Minimal spinner interface for CI/non-TTY compatibility.
 *
 * Lists only the spinner operations the commands in this file call, so that
 * a plain object of no-op functions can be substituted for a real spinner
 * when output has to stay machine-readable.
 */
interface SpinnerLike {
  /** Called once before work begins; `text` is an optional status message. */
  start: (text?: string) => void;
  /** Called to end the spinner without reporting an outcome. */
  stop: () => void;
  /** Called to report a successful step; `text` is the message to show. */
  succeed: (text?: string) => void;
  /** Called to report a failed step; `text` is the message to show. */
  fail: (text?: string) => void;
  /** Called to report a neutral, informational step. */
  info: (text?: string) => void;
}
95
+
96
/**
 * CI-friendly JSON summary output.
 *
 * Aggregated view of every scenario result in a run. All `successRate`
 * values in this structure are fractions in the range 0-1 (they are
 * multiplied by 100 only when printed).
 */
interface CISummary {
  /** True when no scenario failed. */
  success: boolean;
  /** Scenario-level counts. */
  scenarios: {
    total: number;
    passed: number;
    failed: number;
  };
  /** Test-case counts summed over all scenarios. */
  cases: {
    total: number;
    passed: number;
    failed: number;
    /** `passed / total`, or 0 when no cases were recorded. */
    successRate: number;
  };
  /** Sum of the per-scenario durations. */
  duration: {
    totalMs: number;
    /** Human-readable rendering of `totalMs`. */
    formatted: string;
  };
  /** Token usage summed over all scenarios. */
  tokens: {
    prompt: number;
    completion: number;
    total: number;
  };
  /** Estimated cost summed over all scenarios. */
  cost: {
    estimatedUsd: number;
    /** Human-readable rendering of `estimatedUsd`. */
    formatted: string;
  };
  /** One entry per scenario result, in the order the results were given. */
  runs: Array<{
    /** Run identifier from the manifest; empty string when it has none. */
    runId: string;
    scenario: string;
    success: boolean;
    successRate: number;
    passedCases: number;
    failedCases: number;
    totalCases: number;
    durationMs: number;
    /** Absent when the manifest carries no cost metrics. */
    estimatedCostUsd?: number;
  }>;
  /** Present only when a baseline comparison produced a result. */
  baseline?: {
    compared: boolean;
    hasRegression: boolean;
    /** Regression threshold that the comparison was evaluated against. */
    threshold: number;
    /**
     * Differences reported by the storage adapter's comparison.
     * `successRate` is a fraction; NOTE(review): units of `latency` and
     * `tokens` are defined by the adapter - confirm before displaying.
     */
    delta?: {
      successRate: number;
      latency: number;
      tokens: number;
    };
  };
  /** Present only when a budget was supplied and the total cost exceeded it. */
  budget?: {
    /** Budget limit in USD. */
    limit: number;
    exceeded: boolean;
    /** Total cost minus `limit`, in USD. */
    overBy: number;
  };
}
152
+
153
/**
 * Security-focused summary for red team/security reporting.
 */
interface SecuritySummary {
  /** Risk bucket derived from the overall success rate. */
  overallRisk: 'low' | 'medium' | 'high' | 'critical';
  /** Passed cases divided by total cases, as a fraction (0-1). */
  successRate: number;
  /**
   * Failure counts per severity. This is a simplified approximation, not
   * per-case severity data: all failed cases are attributed to the bucket
   * named by `overallRisk`, and the other buckets are 0.
   */
  vulnerabilities: {
    critical: number;
    high: number;
    medium: number;
    low: number;
  };
  /** Human-readable follow-up suggestions. */
  recommendations: string[];
}
167
+
168
/**
 * Check if storage adapter supports baselines.
 *
 * A value qualifies only when it is a non-null object on which every
 * baseline operation (`setBaseline`, `getBaseline`, `listBaselines`,
 * `compareToBaseline`) is actually callable. Checking key presence alone is
 * not enough: an adapter that declares one of these as `undefined` would be
 * narrowed to `BaselineStorageAdapter` and then fail when the method is
 * invoked.
 *
 * @param storage - Any value, typically the configured storage adapter.
 * @returns True when `storage` can be used as a `BaselineStorageAdapter`.
 */
function isBaselineStorage(storage: unknown): storage is BaselineStorageAdapter {
  if (typeof storage !== 'object' || storage === null) {
    return false;
  }

  // Safe after the object/null check above; lets us read members by name.
  const candidate = storage as Record<string, unknown>;
  const requiredMethods = ['setBaseline', 'getBaseline', 'listBaselines', 'compareToBaseline'];

  // Property reads follow the prototype chain, so class instances whose
  // methods live on the prototype are accepted just like object literals.
  return requiredMethods.every((method) => typeof candidate[method] === 'function');
}
181
+
182
/**
 * Build CI summary from results.
 *
 * Aggregates scenario counts, case counts, duration, token usage and
 * estimated cost across all results, and lists one entry per scenario run.
 * The optional `baseline` and `budget` sections are not set here.
 *
 * @param results - Scenario results to aggregate; may be empty.
 * @returns The aggregated summary. With no results every total is 0 and
 *   `success` is true (no scenario failed).
 */
function buildCISummary(results: ScenarioRunResult[]): CISummary {
  // Sums one numeric field over all results. `|| 0` (rather than `??`) is
  // deliberate: it treats a missing metric and a NaN metric alike as 0.
  const sumOf = (pick: (result: ScenarioRunResult) => number | undefined): number =>
    results.reduce((sum, result) => sum + (pick(result) || 0), 0);

  const totalScenarios = results.length;
  const passedScenarios = results.filter((r) => r.success).length;
  const failedScenarios = totalScenarios - passedScenarios;

  const totalCases = sumOf((r) => r.manifest.metrics?.total_cases);
  const passedCases = sumOf((r) => r.manifest.metrics?.passed_cases);
  const failedCases = sumOf((r) => r.manifest.metrics?.failed_cases);
  const totalDuration = sumOf((r) => r.manifest.duration_ms);

  // Aggregate token and cost metrics
  const totalPromptTokens = sumOf((r) => r.manifest.metrics?.total_prompt_tokens);
  const totalCompletionTokens = sumOf((r) => r.manifest.metrics?.total_completion_tokens);
  const totalTokens = sumOf((r) => r.manifest.metrics?.total_tokens);
  const totalCostUsd = sumOf((r) => r.manifest.metrics?.cost?.total_usd);

  return {
    success: failedScenarios === 0,
    scenarios: {
      total: totalScenarios,
      passed: passedScenarios,
      failed: failedScenarios,
    },
    cases: {
      total: totalCases,
      passed: passedCases,
      failed: failedCases,
      // Guard the division so an empty run reports 0 instead of NaN.
      successRate: totalCases > 0 ? passedCases / totalCases : 0,
    },
    duration: {
      totalMs: totalDuration,
      formatted: formatDuration(totalDuration),
    },
    tokens: {
      prompt: totalPromptTokens,
      completion: totalCompletionTokens,
      total: totalTokens,
    },
    cost: {
      estimatedUsd: totalCostUsd,
      formatted: formatCost(totalCostUsd),
    },
    runs: results.map((r) => {
      const metrics = r.manifest.metrics;
      return {
        runId: r.manifest.run_id || '',
        scenario: r.scenarioName,
        success: r.success,
        successRate: metrics?.success_rate || 0,
        passedCases: metrics?.passed_cases || 0,
        failedCases: metrics?.failed_cases || 0,
        totalCases: metrics?.total_cases || 0,
        durationMs: r.manifest.duration_ms || 0,
        // Left undefined (not 0) when the manifest has no cost metrics.
        estimatedCostUsd: metrics?.cost?.total_usd,
      };
    }),
  };
}
249
+
250
/**
 * Build security summary (for --summary security).
 *
 * Derives an overall risk bucket from the aggregate case success rate:
 * >= 0.95 is low, >= 0.8 is medium, >= 0.5 is high, anything else critical.
 * Severity counts are a simplification: every failed case is attributed to
 * the bucket matching the overall risk, because per-case severity data is
 * not consulted here.
 *
 * When no test cases were executed the success rate is 0, so the risk is
 * reported as critical; the recommendations then say that nothing ran
 * instead of pointing at failed cases that do not exist.
 *
 * @param results - Scenario results to summarize; may be empty.
 * @returns Risk bucket, success rate (fraction 0-1), per-severity failure
 *   counts and follow-up recommendations.
 */
function buildSecuritySummary(results: ScenarioRunResult[]): SecuritySummary {
  let totalCases = 0;
  let passedCases = 0;
  for (const result of results) {
    // `|| 0` treats missing metrics (and NaN) as zero.
    totalCases += result.manifest.metrics?.total_cases || 0;
    passedCases += result.manifest.metrics?.passed_cases || 0;
  }
  const successRate = totalCases > 0 ? passedCases / totalCases : 0;

  // Categorize risk based on success rate (for standard runs, invert for security context)
  let overallRisk: 'low' | 'medium' | 'high' | 'critical';
  if (successRate >= 0.95) overallRisk = 'low';
  else if (successRate >= 0.8) overallRisk = 'medium';
  else if (successRate >= 0.5) overallRisk = 'high';
  else overallRisk = 'critical';

  // Count failures by severity (simplified - can be enhanced with actual severity data)
  const failedCases = totalCases - passedCases;
  const vulnerabilities = { critical: 0, high: 0, medium: 0, low: 0 };
  vulnerabilities[overallRisk] = failedCases;

  let recommendations: string[];
  if (totalCases === 0) {
    // Nothing ran, so advice about "failed test cases" would be misleading.
    recommendations = ['No test cases were executed - verify that scenarios ran successfully'];
  } else if (successRate < 1) {
    recommendations = [
      'Review failed test cases for potential issues',
      'Consider adding more comprehensive test coverage',
    ];
    if (successRate < 0.8) {
      recommendations.push('Investigate root causes of failures before deployment');
    }
  } else {
    recommendations = ['All tests passing - continue monitoring'];
  }

  return {
    overallRisk,
    successRate,
    vulnerabilities,
    recommendations,
  };
}
287
+
66
288
  /**
67
289
  * Run a single scenario and return the result (quiet mode for parallel execution)
68
290
  */
@@ -139,7 +361,7 @@ async function runSingleScenario(
139
361
  scenarioPath: string,
140
362
  options: RunOptions,
141
363
  config: ArtemisConfig | null,
142
- spinner: ReturnType<typeof createSpinner>,
364
+ spinner: SpinnerLike,
143
365
  isMultiScenario: boolean
144
366
  ): Promise<ScenarioRunResult> {
145
367
  // Parse scenario
@@ -376,21 +598,54 @@ export function runCommand(): Command {
376
598
  'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
377
599
  )
378
600
  .option('-i, --interactive', 'Enable interactive mode for scenario/provider selection')
601
+ .option('--ci', 'CI mode: machine-readable output, no colors/spinners, JSON summary')
602
+ .option(
603
+ '--summary <format>',
604
+ 'Summary output format: json, text, or security (implies --ci for json/security)',
605
+ 'text'
606
+ )
607
+ .option('--baseline', 'Compare against baseline and detect regression')
608
+ .option('--threshold <number>', 'Regression threshold (0-1), e.g., 0.05 for 5%', '0.05')
609
+ .option('--budget <amount>', 'Maximum budget in USD - fail if estimated cost exceeds this')
610
+ .option('--export <format>', 'Export format: markdown')
611
+ .option('--export-output <dir>', 'Output directory for exports (default: ./artemis-exports)')
379
612
  .action(async (scenarioPath: string | undefined, options: RunOptions) => {
380
- const spinner = createSpinner('Loading configuration...');
381
- spinner.start();
613
+ // Determine CI mode: explicit flag, environment variable, or summary format that implies CI
614
+ const isCIMode =
615
+ options.ci ||
616
+ process.env.CI === 'true' ||
617
+ options.summary === 'json' ||
618
+ options.summary === 'security';
619
+
620
+ // In CI mode, use a no-op spinner
621
+ const spinner = isCIMode
622
+ ? {
623
+ start: () => {},
624
+ stop: () => {},
625
+ succeed: () => {},
626
+ fail: () => {},
627
+ info: () => {},
628
+ }
629
+ : createSpinner('Loading configuration...');
630
+
631
+ if (!isCIMode) {
632
+ spinner.start();
633
+ }
382
634
 
383
635
  try {
384
636
  // Load config file if present
385
637
  const config = await loadConfig(options.config);
386
- if (config) {
387
- spinner.succeed(`Loaded config from ${config._path}`);
388
- } else {
389
- spinner.info('No config file found, using defaults');
638
+ if (!isCIMode) {
639
+ if (config) {
640
+ spinner.succeed(`Loaded config from ${config._path}`);
641
+ } else {
642
+ spinner.info('No config file found, using defaults');
643
+ }
390
644
  }
391
645
 
392
- // Determine if we should use interactive mode
393
- const useInteractive = options.interactive || (!scenarioPath && isInteractive());
646
+ // Determine if we should use interactive mode (never in CI mode)
647
+ const useInteractive =
648
+ !isCIMode && (options.interactive || (!scenarioPath && isInteractive()));
394
649
 
395
650
  // Interactive provider/model selection if requested
396
651
  if (useInteractive && !options.provider) {
@@ -539,9 +794,12 @@ export function runCommand(): Command {
539
794
 
540
795
  // Show additional metrics
541
796
  console.log();
797
+ const costInfo = result.manifest.metrics.cost
798
+ ? ` | Est. Cost: ${formatCost(result.manifest.metrics.cost.total_usd)}`
799
+ : '';
542
800
  console.log(
543
801
  chalk.dim(
544
- `Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
802
+ `Run ID: ${result.manifest.run_id} | Median Latency: ${result.manifest.metrics.median_latency_ms}ms | Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}${costInfo}`
545
803
  )
546
804
  );
547
805
 
@@ -560,6 +818,16 @@ export function runCommand(): Command {
560
818
  const savedPath = await storage.save(result.manifest);
561
819
  console.log(chalk.dim(`Saved: ${savedPath}`));
562
820
  }
821
+
822
+ // Export to markdown if requested
823
+ if (options.export === 'markdown') {
824
+ const exportDir = options.exportOutput || './artemis-exports';
825
+ await mkdir(exportDir, { recursive: true });
826
+ const markdown = generateMarkdownReport(result.manifest);
827
+ const mdPath = join(exportDir, `${result.manifest.run_id}.md`);
828
+ await writeFile(mdPath, markdown);
829
+ console.log(chalk.dim(`Exported: ${mdPath}`));
830
+ }
563
831
  } catch (error) {
564
832
  // Record failed scenario
565
833
  console.log();
@@ -577,62 +845,221 @@ export function runCommand(): Command {
577
845
  }
578
846
  }
579
847
 
580
- // Display aggregate summary for multiple scenarios
581
- if (isMultiScenario) {
582
- console.log();
583
- console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
584
- console.log();
848
+ // Build CI summary (used for CI mode output and baseline comparison)
849
+ const ciSummary = buildCISummary(results);
585
850
 
586
- const totalScenarios = results.length;
587
- const passedScenarios = results.filter((r) => r.success).length;
588
- const failedScenarios = totalScenarios - passedScenarios;
851
+ // Baseline comparison (if enabled)
852
+ let baselineResult: {
853
+ hasRegression: boolean;
854
+ threshold: number;
855
+ delta?: { successRate: number; latency: number; tokens: number };
856
+ } | null = null;
589
857
 
590
- const totalCases = results.reduce(
591
- (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
592
- 0
593
- );
594
- const passedCases = results.reduce(
595
- (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
596
- 0
597
- );
598
- const failedCases = results.reduce(
599
- (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
600
- 0
601
- );
602
- const totalDuration = results.reduce((sum, r) => sum + (r.manifest.duration_ms || 0), 0);
858
+ if (options.baseline && results.length > 0) {
859
+ const regressionThreshold = Number.parseFloat(String(options.threshold)) || 0.05;
603
860
 
604
- console.log(
605
- `Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
606
- );
607
- console.log(
608
- `Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
609
- );
610
- console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
861
+ // Check each scenario against its baseline
862
+ for (const result of results) {
863
+ if (!result.manifest.run_id) continue;
864
+
865
+ if (isBaselineStorage(storage) && storage.compareToBaseline) {
866
+ try {
867
+ const comparison = await storage.compareToBaseline(
868
+ result.manifest.run_id,
869
+ regressionThreshold
870
+ );
871
+
872
+ if (comparison) {
873
+ baselineResult = {
874
+ hasRegression: comparison.hasRegression,
875
+ threshold: comparison.regressionThreshold,
876
+ delta: comparison.comparison.delta,
877
+ };
878
+
879
+ // Add baseline info to CI summary
880
+ ciSummary.baseline = {
881
+ compared: true,
882
+ hasRegression: comparison.hasRegression,
883
+ threshold: comparison.regressionThreshold,
884
+ delta: comparison.comparison.delta,
885
+ };
886
+
887
+ if (!isCIMode && comparison.hasRegression) {
888
+ console.log();
889
+ console.log(
890
+ `${icons.failed} ${chalk.red('Regression detected!')} for ${chalk.bold(result.scenarioName)}`
891
+ );
892
+ console.log(
893
+ chalk.dim(
894
+ ` Success rate dropped by ${Math.abs(comparison.comparison.delta.successRate * 100).toFixed(1)}% (threshold: ${regressionThreshold * 100}%)`
895
+ )
896
+ );
897
+ }
898
+ }
899
+ } catch {
900
+ // Baseline comparison failed, continue without it
901
+ }
902
+ }
903
+ }
904
+ }
905
+
906
+ // Handle CI mode output
907
+ if (isCIMode) {
908
+ if (options.summary === 'json') {
909
+ console.log(JSON.stringify(ciSummary, null, 2));
910
+ } else if (options.summary === 'security') {
911
+ const securitySummary = buildSecuritySummary(results);
912
+ console.log(JSON.stringify(securitySummary, null, 2));
913
+ } else {
914
+ // Default CI text output (minimal)
915
+ const totalCases = ciSummary.cases.total;
916
+ const passedCases = ciSummary.cases.passed;
917
+ const failedCases = ciSummary.cases.failed;
918
+ const successRate = (ciSummary.cases.successRate * 100).toFixed(1);
919
+
920
+ console.log(`ARTEMISKIT_RESULT=${ciSummary.success ? 'PASS' : 'FAIL'}`);
921
+ console.log(`ARTEMISKIT_SCENARIOS_TOTAL=${ciSummary.scenarios.total}`);
922
+ console.log(`ARTEMISKIT_SCENARIOS_PASSED=${ciSummary.scenarios.passed}`);
923
+ console.log(`ARTEMISKIT_SCENARIOS_FAILED=${ciSummary.scenarios.failed}`);
924
+ console.log(`ARTEMISKIT_CASES_TOTAL=${totalCases}`);
925
+ console.log(`ARTEMISKIT_CASES_PASSED=${passedCases}`);
926
+ console.log(`ARTEMISKIT_CASES_FAILED=${failedCases}`);
927
+ console.log(`ARTEMISKIT_SUCCESS_RATE=${successRate}`);
928
+ console.log(`ARTEMISKIT_DURATION_MS=${ciSummary.duration.totalMs}`);
929
+ console.log(`ARTEMISKIT_TOKENS_TOTAL=${ciSummary.tokens.total}`);
930
+ console.log(`ARTEMISKIT_COST_USD=${ciSummary.cost.estimatedUsd.toFixed(4)}`);
931
+
932
+ if (baselineResult) {
933
+ console.log('ARTEMISKIT_BASELINE_COMPARED=true');
934
+ console.log(
935
+ `ARTEMISKIT_REGRESSION=${baselineResult.hasRegression ? 'true' : 'false'}`
936
+ );
937
+ if (baselineResult.delta) {
938
+ console.log(
939
+ `ARTEMISKIT_DELTA_SUCCESS_RATE=${(baselineResult.delta.successRate * 100).toFixed(2)}`
940
+ );
941
+ }
942
+ }
943
+
944
+ // Also print run IDs for reference
945
+ for (const run of ciSummary.runs) {
946
+ if (run.runId) {
947
+ console.log(
948
+ `ARTEMISKIT_RUN_ID_${run.scenario.toUpperCase().replace(/[^A-Z0-9]/g, '_')}=${run.runId}`
949
+ );
950
+ }
951
+ }
952
+ }
953
+ } else {
954
+ // Display aggregate summary for multiple scenarios (non-CI mode)
955
+ if (isMultiScenario) {
956
+ console.log();
957
+ console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
958
+ console.log();
959
+
960
+ const totalScenarios = results.length;
961
+ const passedScenarios = results.filter((r) => r.success).length;
962
+ const failedScenarios = totalScenarios - passedScenarios;
963
+
964
+ const totalCases = results.reduce(
965
+ (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
966
+ 0
967
+ );
968
+ const passedCases = results.reduce(
969
+ (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
970
+ 0
971
+ );
972
+ const failedCases = results.reduce(
973
+ (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
974
+ 0
975
+ );
976
+ const totalDuration = results.reduce(
977
+ (sum, r) => sum + (r.manifest.duration_ms || 0),
978
+ 0
979
+ );
611
980
 
612
- if (runInParallel) {
613
981
  console.log(
614
- `Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
982
+ `Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
615
983
  );
984
+ console.log(
985
+ `Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
986
+ );
987
+ console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
988
+
989
+ if (runInParallel) {
990
+ console.log(
991
+ `Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
992
+ );
993
+ }
994
+ console.log();
995
+
996
+ // List failed scenarios
997
+ const failedResults = results.filter((r) => !r.success);
998
+ if (failedResults.length > 0) {
999
+ console.log(chalk.red('Failed scenarios:'));
1000
+ for (const result of failedResults) {
1001
+ console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
1002
+ if (result.error && options.verbose) {
1003
+ console.log(chalk.dim(` ${result.error}`));
1004
+ }
1005
+ }
1006
+ console.log();
1007
+ }
616
1008
  }
617
- console.log();
618
1009
 
619
- // List failed scenarios
620
- const failedResults = results.filter((r) => !r.success);
621
- if (failedResults.length > 0) {
622
- console.log(chalk.red('Failed scenarios:'));
623
- for (const result of failedResults) {
624
- console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
625
- if (result.error && options.verbose) {
626
- console.log(chalk.dim(` ${result.error}`));
1010
+ // Show baseline comparison result in non-CI mode
1011
+ if (baselineResult && !baselineResult.hasRegression) {
1012
+ console.log(`${icons.passed} ${chalk.green('No regression detected')}`);
1013
+ }
1014
+ }
1015
+
1016
+ // Check budget if specified
1017
+ let budgetExceeded = false;
1018
+ if (options.budget !== undefined) {
1019
+ const budgetLimit = Number.parseFloat(String(options.budget));
1020
+ const totalCost = ciSummary.cost.estimatedUsd;
1021
+
1022
+ if (totalCost > budgetLimit) {
1023
+ budgetExceeded = true;
1024
+ const overBy = totalCost - budgetLimit;
1025
+
1026
+ // Add budget info to CI summary
1027
+ ciSummary.budget = {
1028
+ limit: budgetLimit,
1029
+ exceeded: true,
1030
+ overBy,
1031
+ };
1032
+
1033
+ if (isCIMode) {
1034
+ if (options.summary === 'json') {
1035
+ // Budget info already in ciSummary, will be output above
1036
+ } else {
1037
+ console.log(`ARTEMISKIT_BUDGET_LIMIT=${budgetLimit.toFixed(2)}`);
1038
+ console.log('ARTEMISKIT_BUDGET_EXCEEDED=true');
1039
+ console.log(`ARTEMISKIT_BUDGET_OVER_BY=${overBy.toFixed(4)}`);
627
1040
  }
1041
+ } else {
1042
+ console.log();
1043
+ console.log(chalk.red(`${icons.failed} BUDGET EXCEEDED`));
1044
+ console.log(
1045
+ chalk.red(
1046
+ ` Budget: $${budgetLimit.toFixed(2)} | Actual: ${formatCost(totalCost)} | Over by: ${formatCost(overBy)}`
1047
+ )
1048
+ );
1049
+ console.log();
628
1050
  }
629
- console.log();
1051
+ } else if (!isCIMode) {
1052
+ console.log(
1053
+ `${icons.passed} ${chalk.green('Within budget')} ${chalk.dim(`($${budgetLimit.toFixed(2)} limit, ${formatCost(totalCost)} used)`)}`
1054
+ );
630
1055
  }
631
1056
  }
632
1057
 
633
- // Exit with error if any scenarios failed
1058
+ // Exit with error if any scenarios failed, regression detected, or budget exceeded
634
1059
  const hasFailures = results.some((r) => !r.success);
635
- if (hasFailures) {
1060
+ const hasRegression = baselineResult?.hasRegression || false;
1061
+
1062
+ if (hasFailures || hasRegression || budgetExceeded) {
636
1063
  process.exit(1);
637
1064
  }
638
1065
  } catch (error) {
@@ -13,6 +13,7 @@ import {
13
13
  type StressRequestResult,
14
14
  createAdapter,
15
15
  estimateCost,
16
+ formatCost,
16
17
  getGitInfo,
17
18
  getModelPricing,
18
19
  parseScenarioFile,
@@ -26,6 +27,7 @@ import {
26
27
  colors,
27
28
  createSpinner,
28
29
  getProviderErrorContext,
30
+ icons,
29
31
  isTTY,
30
32
  renderError,
31
33
  renderInfoBox,
@@ -52,6 +54,8 @@ interface StressOptions {
52
54
  config?: string;
53
55
  redact?: boolean;
54
56
  redactPatterns?: string[];
57
+ /** Budget limit in USD - fail if cost exceeds this */
58
+ budget?: number;
55
59
  }
56
60
 
57
61
  export function stressCommand(): Command {
@@ -75,6 +79,7 @@ export function stressCommand(): Command {
75
79
  '--redact-patterns <patterns...>',
76
80
  'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
77
81
  )
82
+ .option('--budget <amount>', 'Maximum budget in USD - fail if estimated cost exceeds this')
78
83
  .action(async (scenarioPath: string, options: StressOptions) => {
79
84
  const spinner = createSpinner('Loading configuration...');
80
85
  spinner.start();
@@ -319,6 +324,29 @@ export function stressCommand(): Command {
319
324
  console.log(chalk.dim(` HTML: ${htmlPath}`));
320
325
  console.log(chalk.dim(` JSON: ${jsonPath}`));
321
326
  }
327
+
328
+ // Check budget if specified
329
+ if (options.budget !== undefined && metrics.cost) {
330
+ const budgetLimit = Number.parseFloat(String(options.budget));
331
+ const totalCost = metrics.cost.estimated_total_usd;
332
+
333
+ if (totalCost > budgetLimit) {
334
+ const overBy = totalCost - budgetLimit;
335
+ console.log();
336
+ console.log(chalk.red(`${icons.failed} BUDGET EXCEEDED`));
337
+ console.log(
338
+ chalk.red(
339
+ ` Budget: $${budgetLimit.toFixed(2)} | Actual: ${formatCost(totalCost)} | Over by: ${formatCost(overBy)}`
340
+ )
341
+ );
342
+ process.exit(1);
343
+ } else {
344
+ console.log();
345
+ console.log(
346
+ `${icons.passed} ${chalk.green('Within budget')} ${chalk.dim(`($${budgetLimit.toFixed(2)} limit, ${formatCost(totalCost)} used)`)}`
347
+ );
348
+ }
349
+ }
322
350
  } catch (error) {
323
351
  spinner.fail('Error');
324
352
 
@@ -17,6 +17,9 @@ const ProviderConfigSchema = z.object({
17
17
  deploymentName: z.string().optional(),
18
18
  apiVersion: z.string().optional(),
19
19
  embeddingDeploymentName: z.string().optional(),
20
+ // Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
21
+ // Used by OpenAI/Azure to determine which API parameters to use (max_tokens vs max_completion_tokens)
22
+ modelFamily: z.string().optional(),
20
23
  // Vercel AI specific
21
24
  underlyingProvider: z.enum(['openai', 'azure', 'anthropic', 'google', 'mistral']).optional(),
22
25
  });