@artemiskit/cli 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@
4
4
 
5
5
  import { basename } from 'node:path';
6
6
  import {
7
+ type BaselineStorageAdapter,
7
8
  type RedactionConfig,
8
9
  type RunManifest,
9
10
  createAdapter,
@@ -53,6 +54,14 @@ interface RunOptions {
53
54
  redactPatterns?: string[];
54
55
  parallel?: number;
55
56
  interactive?: boolean;
57
+ /** CI mode - machine-readable output, no colors/spinners */
58
+ ci?: boolean;
59
+ /** Summary format: json, text, or security */
60
+ summary?: 'json' | 'text' | 'security';
61
+ /** Compare against baseline and detect regression */
62
+ baseline?: boolean;
63
+ /** Regression threshold (0-1), default 0.05 (5%) */
64
+ threshold?: number;
56
65
  }
57
66
 
58
67
  interface ScenarioRunResult {
@@ -63,6 +72,169 @@ interface ScenarioRunResult {
63
72
  error?: string;
64
73
  }
65
74
 
75
/**
 * The handful of spinner operations the run command depends on.
 *
 * Declared structurally so that a plain object of no-op functions can stand
 * in for a real spinner when output must stay machine-readable (CI / non-TTY).
 */
interface SpinnerLike {
  /** Start the spinner; `text`, when supplied, is the message to display. */
  start: (text?: string) => void;
  /** Stop the spinner without reporting an outcome. */
  stop: () => void;
  /** Finish with a success outcome, optionally replacing the message. */
  succeed: (text?: string) => void;
  /** Finish with a failure outcome, optionally replacing the message. */
  fail: (text?: string) => void;
  /** Finish with an informational outcome, optionally replacing the message. */
  info: (text?: string) => void;
}
85
+
86
/**
 * Machine-readable roll-up of a run, printed as JSON for CI consumers.
 */
interface CISummary {
  /** True only when no scenario failed. */
  success: boolean;

  /** Scenario-level counts. */
  scenarios: {
    total: number;
    passed: number;
    failed: number;
  };

  /** Test-case counts summed across every scenario. */
  cases: {
    total: number;
    passed: number;
    failed: number;
    /** `passed / total` as a fraction; 0 when no cases were counted. */
    successRate: number;
  };

  /** Run time summed across every scenario. */
  duration: {
    totalMs: number;
    /** Human-readable rendering of `totalMs`. */
    formatted: string;
  };

  /** One entry per scenario, in the order the results were produced. */
  runs: {
    /** Run identifier from the manifest; empty string when none was recorded. */
    runId: string;
    scenario: string;
    success: boolean;
    /** Success rate as reported by the scenario's own metrics. */
    successRate: number;
    passedCases: number;
    failedCases: number;
    totalCases: number;
    durationMs: number;
  }[];

  /** Set only when a baseline comparison actually produced a result. */
  baseline?: {
    compared: boolean;
    hasRegression: boolean;
    threshold: number;
    /**
     * Differences relative to the baseline. `successRate` is a fraction (the
     * CLI multiplies it by 100 for display); the units of `latency` and
     * `tokens` are defined by the storage adapter - TODO confirm.
     */
    delta?: {
      successRate: number;
      latency: number;
      tokens: number;
    };
  };
}
127
+
128
/**
 * Condensed risk report emitted for `--summary security`.
 */
interface SecuritySummary {
  /** Single risk rating for the whole run. */
  overallRisk: 'low' | 'medium' | 'high' | 'critical';
  /** Passed cases divided by total cases, as a fraction. */
  successRate: number;
  /** Failure counts, one bucket per risk rating. */
  vulnerabilities: Record<SecuritySummary['overallRisk'], number>;
  /** Human-readable follow-up actions. */
  recommendations: string[];
}
142
+
143
/**
 * Type guard: does `storage` expose the baseline operations?
 *
 * A value qualifies only when it is a non-null object on which `setBaseline`,
 * `getBaseline`, `listBaselines` and `compareToBaseline` are all callable.
 * Checking for callability (rather than mere property presence) keeps the
 * predicate honest: an object carrying one of these names with a non-function
 * value would otherwise be narrowed to `BaselineStorageAdapter` and fail later
 * at the call site.
 *
 * @param storage - Any value, typically the storage adapter in use.
 * @returns `true` when all four baseline methods are present and callable.
 */
function isBaselineStorage(storage: unknown): storage is BaselineStorageAdapter {
  if (typeof storage !== 'object' || storage === null) {
    return false;
  }

  // Safe after the runtime check above; property reads walk the prototype
  // chain, so class-based adapters with prototype methods are accepted.
  const candidate = storage as Record<string, unknown>;
  const requiredMethods = ['setBaseline', 'getBaseline', 'listBaselines', 'compareToBaseline'];

  return requiredMethods.every((method) => typeof candidate[method] === 'function');
}
156
+
157
/**
 * Collapse per-scenario results into a single CI summary.
 *
 * Counts and durations missing from a manifest are treated as 0, and a
 * missing run id becomes an empty string. The overall `success` flag is true
 * only when no scenario failed.
 *
 * @param results - One entry per executed scenario.
 * @returns Aggregate totals plus a per-scenario `runs` list in input order.
 */
function buildCISummary(results: ScenarioRunResult[]): CISummary {
  let passedScenarios = 0;
  let totalCases = 0;
  let passedCases = 0;
  let failedCases = 0;
  let totalDuration = 0;
  const runs: CISummary['runs'] = [];

  // Single pass: accumulate the totals and build the per-run rows together.
  for (const result of results) {
    const metrics = result.manifest.metrics;
    const runTotal = metrics?.total_cases || 0;
    const runPassed = metrics?.passed_cases || 0;
    const runFailed = metrics?.failed_cases || 0;
    const runDuration = result.manifest.duration_ms || 0;

    if (result.success) {
      passedScenarios += 1;
    }
    totalCases += runTotal;
    passedCases += runPassed;
    failedCases += runFailed;
    totalDuration += runDuration;

    runs.push({
      runId: result.manifest.run_id || '',
      scenario: result.scenarioName,
      success: result.success,
      successRate: metrics?.success_rate || 0,
      passedCases: runPassed,
      failedCases: runFailed,
      totalCases: runTotal,
      durationMs: runDuration,
    });
  }

  const totalScenarios = results.length;
  const failedScenarios = totalScenarios - passedScenarios;

  // Guard the division so an empty run reports a rate of 0 instead of NaN.
  const overallRate = totalCases > 0 ? passedCases / totalCases : 0;

  return {
    success: failedScenarios === 0,
    scenarios: {
      total: totalScenarios,
      passed: passedScenarios,
      failed: failedScenarios,
    },
    cases: {
      total: totalCases,
      passed: passedCases,
      failed: failedCases,
      successRate: overallRate,
    },
    duration: {
      totalMs: totalDuration,
      formatted: formatDuration(totalDuration),
    },
    runs,
  };
}
199
+
200
/**
 * Build the security summary printed for `--summary security`.
 *
 * The overall risk is derived purely from the aggregate pass rate:
 *   - >= 95%  -> low
 *   - >= 80%  -> medium
 *   - >= 50%  -> high
 *   - below   -> critical
 *
 * Per-case severity data is not available here, so every failed case is
 * attributed to the bucket matching the overall risk; the other buckets stay
 * at 0.
 *
 * When no test cases were executed the pass rate is reported as 0 (and the
 * risk therefore as `critical`, since nothing was verified), but the
 * recommendations say so explicitly instead of pointing at failures that do
 * not exist.
 *
 * @param results - One entry per executed scenario.
 * @returns Risk rating, pass rate, failure buckets and follow-up actions.
 */
function buildSecuritySummary(results: ScenarioRunResult[]): SecuritySummary {
  const totalCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.total_cases || 0), 0);
  const passedCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.passed_cases || 0), 0);
  const failedCases = totalCases - passedCases;
  const successRate = totalCases > 0 ? passedCases / totalCases : 0;

  let overallRisk: SecuritySummary['overallRisk'];
  if (successRate >= 0.95) overallRisk = 'low';
  else if (successRate >= 0.8) overallRisk = 'medium';
  else if (successRate >= 0.5) overallRisk = 'high';
  else overallRisk = 'critical';

  // Simplified attribution: all failures land in the overall-risk bucket.
  const vulnerabilities = { critical: 0, high: 0, medium: 0, low: 0 };
  vulnerabilities[overallRisk] = failedCases;

  let recommendations: string[];
  if (totalCases === 0) {
    // Nothing ran, so there are no failures to review.
    recommendations = ['No test cases were executed - verify scenario configuration'];
  } else if (successRate < 1) {
    recommendations = [
      'Review failed test cases for potential issues',
      'Consider adding more comprehensive test coverage',
    ];
    if (successRate < 0.8) {
      recommendations.push('Investigate root causes of failures before deployment');
    }
  } else {
    recommendations = ['All tests passing - continue monitoring'];
  }

  return {
    overallRisk,
    successRate,
    vulnerabilities,
    recommendations,
  };
}
237
+
66
238
  /**
67
239
  * Run a single scenario and return the result (quiet mode for parallel execution)
68
240
  */
@@ -139,7 +311,7 @@ async function runSingleScenario(
139
311
  scenarioPath: string,
140
312
  options: RunOptions,
141
313
  config: ArtemisConfig | null,
142
- spinner: ReturnType<typeof createSpinner>,
314
+ spinner: SpinnerLike,
143
315
  isMultiScenario: boolean
144
316
  ): Promise<ScenarioRunResult> {
145
317
  // Parse scenario
@@ -376,21 +548,51 @@ export function runCommand(): Command {
376
548
  'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
377
549
  )
378
550
  .option('-i, --interactive', 'Enable interactive mode for scenario/provider selection')
551
+ .option('--ci', 'CI mode: machine-readable output, no colors/spinners, JSON summary')
552
+ .option(
553
+ '--summary <format>',
554
+ 'Summary output format: json, text, or security (implies --ci for json/security)',
555
+ 'text'
556
+ )
557
+ .option('--baseline', 'Compare against baseline and detect regression')
558
+ .option('--threshold <number>', 'Regression threshold (0-1), e.g., 0.05 for 5%', '0.05')
379
559
  .action(async (scenarioPath: string | undefined, options: RunOptions) => {
380
- const spinner = createSpinner('Loading configuration...');
381
- spinner.start();
560
+ // Determine CI mode: explicit flag, environment variable, or summary format that implies CI
561
+ const isCIMode =
562
+ options.ci ||
563
+ process.env.CI === 'true' ||
564
+ options.summary === 'json' ||
565
+ options.summary === 'security';
566
+
567
+ // In CI mode, use a no-op spinner
568
+ const spinner = isCIMode
569
+ ? {
570
+ start: () => {},
571
+ stop: () => {},
572
+ succeed: () => {},
573
+ fail: () => {},
574
+ info: () => {},
575
+ }
576
+ : createSpinner('Loading configuration...');
577
+
578
+ if (!isCIMode) {
579
+ spinner.start();
580
+ }
382
581
 
383
582
  try {
384
583
  // Load config file if present
385
584
  const config = await loadConfig(options.config);
386
- if (config) {
387
- spinner.succeed(`Loaded config from ${config._path}`);
388
- } else {
389
- spinner.info('No config file found, using defaults');
585
+ if (!isCIMode) {
586
+ if (config) {
587
+ spinner.succeed(`Loaded config from ${config._path}`);
588
+ } else {
589
+ spinner.info('No config file found, using defaults');
590
+ }
390
591
  }
391
592
 
392
- // Determine if we should use interactive mode
393
- const useInteractive = options.interactive || (!scenarioPath && isInteractive());
593
+ // Determine if we should use interactive mode (never in CI mode)
594
+ const useInteractive =
595
+ !isCIMode && (options.interactive || (!scenarioPath && isInteractive()));
394
596
 
395
597
  // Interactive provider/model selection if requested
396
598
  if (useInteractive && !options.provider) {
@@ -577,62 +779,177 @@ export function runCommand(): Command {
577
779
  }
578
780
  }
579
781
 
580
- // Display aggregate summary for multiple scenarios
581
- if (isMultiScenario) {
582
- console.log();
583
- console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
584
- console.log();
782
+ // Build CI summary (used for CI mode output and baseline comparison)
783
+ const ciSummary = buildCISummary(results);
585
784
 
586
- const totalScenarios = results.length;
587
- const passedScenarios = results.filter((r) => r.success).length;
588
- const failedScenarios = totalScenarios - passedScenarios;
785
+ // Baseline comparison (if enabled)
786
+ let baselineResult: {
787
+ hasRegression: boolean;
788
+ threshold: number;
789
+ delta?: { successRate: number; latency: number; tokens: number };
790
+ } | null = null;
589
791
 
590
- const totalCases = results.reduce(
591
- (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
592
- 0
593
- );
594
- const passedCases = results.reduce(
595
- (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
596
- 0
597
- );
598
- const failedCases = results.reduce(
599
- (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
600
- 0
601
- );
602
- const totalDuration = results.reduce((sum, r) => sum + (r.manifest.duration_ms || 0), 0);
792
+ if (options.baseline && results.length > 0) {
793
+ const regressionThreshold = Number.parseFloat(String(options.threshold)) || 0.05;
603
794
 
604
- console.log(
605
- `Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
606
- );
607
- console.log(
608
- `Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
609
- );
610
- console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
795
+ // Check each scenario against its baseline
796
+ for (const result of results) {
797
+ if (!result.manifest.run_id) continue;
611
798
 
612
- if (runInParallel) {
613
- console.log(
614
- `Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
615
- );
799
+ if (isBaselineStorage(storage) && storage.compareToBaseline) {
800
+ try {
801
+ const comparison = await storage.compareToBaseline(
802
+ result.manifest.run_id,
803
+ regressionThreshold
804
+ );
805
+
806
+ if (comparison) {
807
+ baselineResult = {
808
+ hasRegression: comparison.hasRegression,
809
+ threshold: comparison.regressionThreshold,
810
+ delta: comparison.comparison.delta,
811
+ };
812
+
813
+ // Add baseline info to CI summary
814
+ ciSummary.baseline = {
815
+ compared: true,
816
+ hasRegression: comparison.hasRegression,
817
+ threshold: comparison.regressionThreshold,
818
+ delta: comparison.comparison.delta,
819
+ };
820
+
821
+ if (!isCIMode && comparison.hasRegression) {
822
+ console.log();
823
+ console.log(
824
+ `${icons.failed} ${chalk.red('Regression detected!')} for ${chalk.bold(result.scenarioName)}`
825
+ );
826
+ console.log(
827
+ chalk.dim(
828
+ ` Success rate dropped by ${Math.abs(comparison.comparison.delta.successRate * 100).toFixed(1)}% (threshold: ${regressionThreshold * 100}%)`
829
+ )
830
+ );
831
+ }
832
+ }
833
+ } catch {
834
+ // Baseline comparison failed, continue without it
835
+ }
836
+ }
616
837
  }
617
- console.log();
838
+ }
618
839
 
619
- // List failed scenarios
620
- const failedResults = results.filter((r) => !r.success);
621
- if (failedResults.length > 0) {
622
- console.log(chalk.red('Failed scenarios:'));
623
- for (const result of failedResults) {
624
- console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
625
- if (result.error && options.verbose) {
626
- console.log(chalk.dim(` ${result.error}`));
840
+ // Handle CI mode output
841
+ if (isCIMode) {
842
+ if (options.summary === 'json') {
843
+ console.log(JSON.stringify(ciSummary, null, 2));
844
+ } else if (options.summary === 'security') {
845
+ const securitySummary = buildSecuritySummary(results);
846
+ console.log(JSON.stringify(securitySummary, null, 2));
847
+ } else {
848
+ // Default CI text output (minimal)
849
+ const totalCases = ciSummary.cases.total;
850
+ const passedCases = ciSummary.cases.passed;
851
+ const failedCases = ciSummary.cases.failed;
852
+ const successRate = (ciSummary.cases.successRate * 100).toFixed(1);
853
+
854
+ console.log(`ARTEMISKIT_RESULT=${ciSummary.success ? 'PASS' : 'FAIL'}`);
855
+ console.log(`ARTEMISKIT_SCENARIOS_TOTAL=${ciSummary.scenarios.total}`);
856
+ console.log(`ARTEMISKIT_SCENARIOS_PASSED=${ciSummary.scenarios.passed}`);
857
+ console.log(`ARTEMISKIT_SCENARIOS_FAILED=${ciSummary.scenarios.failed}`);
858
+ console.log(`ARTEMISKIT_CASES_TOTAL=${totalCases}`);
859
+ console.log(`ARTEMISKIT_CASES_PASSED=${passedCases}`);
860
+ console.log(`ARTEMISKIT_CASES_FAILED=${failedCases}`);
861
+ console.log(`ARTEMISKIT_SUCCESS_RATE=${successRate}`);
862
+ console.log(`ARTEMISKIT_DURATION_MS=${ciSummary.duration.totalMs}`);
863
+
864
+ if (baselineResult) {
865
+ console.log('ARTEMISKIT_BASELINE_COMPARED=true');
866
+ console.log(
867
+ `ARTEMISKIT_REGRESSION=${baselineResult.hasRegression ? 'true' : 'false'}`
868
+ );
869
+ if (baselineResult.delta) {
870
+ console.log(
871
+ `ARTEMISKIT_DELTA_SUCCESS_RATE=${(baselineResult.delta.successRate * 100).toFixed(2)}`
872
+ );
627
873
  }
628
874
  }
875
+
876
+ // Also print run IDs for reference
877
+ for (const run of ciSummary.runs) {
878
+ if (run.runId) {
879
+ console.log(
880
+ `ARTEMISKIT_RUN_ID_${run.scenario.toUpperCase().replace(/[^A-Z0-9]/g, '_')}=${run.runId}`
881
+ );
882
+ }
883
+ }
884
+ }
885
+ } else {
886
+ // Display aggregate summary for multiple scenarios (non-CI mode)
887
+ if (isMultiScenario) {
888
+ console.log();
889
+ console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
890
+ console.log();
891
+
892
+ const totalScenarios = results.length;
893
+ const passedScenarios = results.filter((r) => r.success).length;
894
+ const failedScenarios = totalScenarios - passedScenarios;
895
+
896
+ const totalCases = results.reduce(
897
+ (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
898
+ 0
899
+ );
900
+ const passedCases = results.reduce(
901
+ (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
902
+ 0
903
+ );
904
+ const failedCases = results.reduce(
905
+ (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
906
+ 0
907
+ );
908
+ const totalDuration = results.reduce(
909
+ (sum, r) => sum + (r.manifest.duration_ms || 0),
910
+ 0
911
+ );
912
+
913
+ console.log(
914
+ `Scenarios: ${chalk.green(`${passedScenarios} passed`)} ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''} ${chalk.dim(`(${totalScenarios} total)`)}`
915
+ );
916
+ console.log(
917
+ `Test Cases: ${chalk.green(`${passedCases} passed`)} ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''} ${chalk.dim(`(${totalCases} total)`)}`
918
+ );
919
+ console.log(`Duration: ${chalk.dim(formatDuration(totalDuration))}`);
920
+
921
+ if (runInParallel) {
922
+ console.log(
923
+ `Mode: ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
924
+ );
925
+ }
629
926
  console.log();
927
+
928
+ // List failed scenarios
929
+ const failedResults = results.filter((r) => !r.success);
930
+ if (failedResults.length > 0) {
931
+ console.log(chalk.red('Failed scenarios:'));
932
+ for (const result of failedResults) {
933
+ console.log(chalk.red(` ${icons.failed} ${result.scenarioName}`));
934
+ if (result.error && options.verbose) {
935
+ console.log(chalk.dim(` ${result.error}`));
936
+ }
937
+ }
938
+ console.log();
939
+ }
940
+ }
941
+
942
+ // Show baseline comparison result in non-CI mode
943
+ if (baselineResult && !baselineResult.hasRegression) {
944
+ console.log(`${icons.passed} ${chalk.green('No regression detected')}`);
630
945
  }
631
946
  }
632
947
 
633
- // Exit with error if any scenarios failed
948
+ // Exit with error if any scenarios failed or regression detected
634
949
  const hasFailures = results.some((r) => !r.success);
635
- if (hasFailures) {
950
+ const hasRegression = baselineResult?.hasRegression || false;
951
+
952
+ if (hasFailures || hasRegression) {
636
953
  process.exit(1);
637
954
  }
638
955
  } catch (error) {
@@ -17,6 +17,9 @@ const ProviderConfigSchema = z.object({
17
17
  deploymentName: z.string().optional(),
18
18
  apiVersion: z.string().optional(),
19
19
  embeddingDeploymentName: z.string().optional(),
20
+ // Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
21
+ // Used by OpenAI/Azure to determine which API parameters to use (max_tokens vs max_completion_tokens)
22
+ modelFamily: z.string().optional(),
20
23
  // Vercel AI specific
21
24
  underlyingProvider: z.enum(['openai', 'azure', 'anthropic', 'google', 'mistral']).optional(),
22
25
  });
@@ -270,6 +270,12 @@ function buildAzureOpenAIConfig(options: ProviderBuildOptions): AdapterConfigRes
270
270
  { value: process.env.AZURE_OPENAI_EMBEDDING_DEPLOYMENT, source: 'env' }
271
271
  );
272
272
 
273
+ // Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
274
+ const resolvedModelFamily = resolveValueWithSource<string>(
275
+ { value: scenarioConfig?.modelFamily, source: 'scenario' },
276
+ { value: fileProviderConfig?.modelFamily, source: 'config' }
277
+ );
278
+
273
279
  const resolvedTimeout = resolveValueWithSource<number>(
274
280
  { value: scenarioConfig?.timeout, source: 'scenario' },
275
281
  { value: fileProviderConfig?.timeout, source: 'config' }
@@ -300,6 +306,7 @@ function buildAzureOpenAIConfig(options: ProviderBuildOptions): AdapterConfigRes
300
306
  timeout: resolvedTimeout.value,
301
307
  maxRetries: resolvedMaxRetries.value,
302
308
  embeddingDeploymentName: resolvedEmbeddingDeploymentName.value,
309
+ modelFamily: resolvedModelFamily.value,
303
310
  },
304
311
  resolvedConfig: {
305
312
  provider,