npm - @artemiskit/cli - Versions diffs - 0.2.0 → 0.2.3 - Mend

@artemiskit/cli 0.2.0 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/CHANGELOG.md +97 -0
package/dist/index.js +65256 -63756
package/dist/src/cli.d.ts.map +1 -1
package/dist/src/commands/baseline.d.ts +9 -0
package/dist/src/commands/baseline.d.ts.map +1 -0
package/dist/src/commands/history.d.ts.map +1 -1
package/dist/src/commands/redteam.d.ts.map +1 -1
package/dist/src/commands/run.d.ts.map +1 -1
package/dist/src/commands/stress.d.ts.map +1 -1
package/dist/src/config/schema.d.ts +8 -0
package/dist/src/config/schema.d.ts.map +1 -1
package/dist/src/utils/adapter.d.ts.map +1 -1
package/package.json +6 -6
package/src/cli.ts +2 -0
package/src/commands/baseline.ts +473 -0
package/src/commands/history.ts +58 -9
package/src/commands/redteam.ts +19 -1
package/src/commands/run.ts +479 -52
package/src/commands/stress.ts +28 -0
package/src/config/schema.ts +3 -0
package/src/utils/adapter.ts +7 -0

package/src/commands/run.ts CHANGED Viewed

@@ -2,15 +2,20 @@
  * Run command - Execute test scenarios
  */
+import { mkdir, writeFile } from 'node:fs/promises';
 import { basename } from 'node:path';
+import { join } from 'node:path';
 import {
+  type BaselineStorageAdapter,
   type RedactionConfig,
   type RunManifest,
   createAdapter,
+  formatCost,
   parseScenarioFile,
   resolveScenarioPaths,
   runScenario,
 } from '@artemiskit/core';
+import { generateMarkdownReport } from '@artemiskit/reports';
 import chalk from 'chalk';
 import { Command } from 'commander';
 import { loadConfig } from '../config/loader.js';
@@ -53,6 +58,20 @@ interface RunOptions {
   redactPatterns?: string[];
   parallel?: number;
   interactive?: boolean;
+  /** CI mode - machine-readable output, no colors/spinners */
+  ci?: boolean;
+  /** Summary format: json, text, or security */
+  summary?: 'json' | 'text' | 'security';
+  /** Compare against baseline and detect regression */
+  baseline?: boolean;
+  /** Regression threshold (0-1), default 0.05 (5%) */
+  threshold?: number;
+  /** Budget limit in USD - fail if cost exceeds this */
+  budget?: number;
+  /** Export format: markdown */
+  export?: 'markdown';
+  /** Output directory for exports */
+  exportOutput?: string;
 }
 interface ScenarioRunResult {
@@ -63,6 +82,209 @@ interface ScenarioRunResult {
   error?: string;
 }
+/**
+ * Minimal spinner interface for CI/non-TTY compatibility.
+ *
+ * Declares the five spinner methods (`start`, `stop`, `succeed`, `fail`,
+ * `info`), each returning nothing; every method except `stop` takes an
+ * optional text argument.
+ *
+ * `runSingleScenario` accepts this type for its `spinner` parameter. In CI
+ * mode the run command builds a plain object of no-op functions with this
+ * shape instead of calling `createSpinner`.
+ */
+interface SpinnerLike {
+  start: (text?: string) => void;
+  stop: () => void;
+  succeed: (text?: string) => void;
+  fail: (text?: string) => void;
+  info: (text?: string) => void;
+}
+/**
+ * CI-friendly JSON summary output.
+ *
+ * Built by `buildCISummary`. The run command prints it with `JSON.stringify`
+ * for `--summary json`, and the plain-text CI output reads its
+ * ARTEMISKIT_-prefixed values from the same object.
+ *
+ * Fields, as populated by `buildCISummary`:
+ * - `success`: true when no scenario failed.
+ * - `scenarios`: number of scenario results in total, the number whose
+ *   `success` flag is truthy, and the remainder.
+ * - `cases`: case counts summed from each manifest's metrics; `successRate`
+ *   is `passed / total`, or 0 when there are no cases.
+ * - `duration.totalMs`: sum of every manifest's `duration_ms` (a sum over
+ *   scenarios, not a wall-clock measurement); `formatted` is the
+ *   `formatDuration` rendering of that value.
+ * - `tokens`: prompt, completion and total token counts summed across
+ *   manifests.
+ * - `cost`: summed `metrics.cost.total_usd`, plus its `formatCost` rendering.
+ * - `runs`: one entry per scenario result; `runId` is an empty string when
+ *   the manifest has no run id, and `estimatedCostUsd` is undefined when the
+ *   manifest carries no cost metrics.
+ * - `baseline`: not set by `buildCISummary`; the run command attaches it when
+ *   a baseline comparison returns a result.
+ * - `budget`: not set by `buildCISummary`; in the run command code visible
+ *   here it is attached only when the summed cost exceeds the budget limit.
+ *
+ * NOTE(review): in the visible run command code `baseline` is reassigned for
+ * every scenario that is compared, so it reflects the last comparison only;
+ * confirm whether an earlier regression should be preserved.
+ *
+ * NOTE(review): in the visible run command code the JSON summary is printed
+ * before the budget check assigns `budget`, so the field does not appear to
+ * reach the `--summary json` output; confirm against the full file.
+ */
+interface CISummary {
+  success: boolean;
+  scenarios: {
+    total: number;
+    passed: number;
+    failed: number;
+  };
+  cases: {
+    total: number;
+    passed: number;
+    failed: number;
+    successRate: number;
+  };
+  duration: {
+    totalMs: number;
+    formatted: string;
+  };
+  tokens: {
+    prompt: number;
+    completion: number;
+    total: number;
+  };
+  cost: {
+    estimatedUsd: number;
+    formatted: string;
+  };
+  runs: Array<{
+    runId: string;
+    scenario: string;
+    success: boolean;
+    successRate: number;
+    passedCases: number;
+    failedCases: number;
+    totalCases: number;
+    durationMs: number;
+    estimatedCostUsd?: number;
+  }>;
+  baseline?: {
+    compared: boolean;
+    hasRegression: boolean;
+    threshold: number;
+    delta?: {
+      successRate: number;
+      latency: number;
+      tokens: number;
+    };
+  };
+  budget?: {
+    limit: number;
+    exceeded: boolean;
+    overBy: number;
+  };
+}
+/**
+ * Security-focused summary for red team/security reporting.
+ *
+ * Produced by `buildSecuritySummary` and printed as JSON by the run command
+ * for `--summary security`.
+ *
+ * - `overallRisk`: risk tier chosen from the overall pass rate.
+ * - `successRate`: passed cases divided by total cases (0 when there are
+ *   no cases).
+ * - `vulnerabilities`: as filled by `buildSecuritySummary`, the failed-case
+ *   count is placed in the bucket named by `overallRisk` and the other
+ *   buckets are 0; per-case severity is not consulted.
+ * - `recommendations`: human-readable advice strings.
+ */
+interface SecuritySummary {
+  overallRisk: 'low' | 'medium' | 'high' | 'critical';
+  successRate: number;
+  vulnerabilities: {
+    critical: number;
+    high: number;
+    medium: number;
+    low: number;
+  };
+  recommendations: string[];
+}
+/**
+ * Check if storage adapter supports baselines.
+ *
+ * Type guard that narrows an arbitrary value to `BaselineStorageAdapter`.
+ *
+ * @param storage - Any value; the run command passes its storage adapter.
+ * @returns true when `storage` is a non-null object on which the properties
+ *   `setBaseline`, `getBaseline`, `listBaselines` and `compareToBaseline`
+ *   are all present (checked with `in`, so inherited properties count).
+ *
+ * Only property presence is tested; the guard does not verify that the
+ * properties are functions.
+ */
+function isBaselineStorage(storage: unknown): storage is BaselineStorageAdapter {
+  return (
+    typeof storage === 'object' &&
+    storage !== null &&
+    'setBaseline' in storage &&
+    'getBaseline' in storage &&
+    'listBaselines' in storage &&
+    'compareToBaseline' in storage
+  );
+}
+/**
+ * Build CI summary from results.
+ *
+ * Aggregates scenario results into a `CISummary`. Any metric that is missing
+ * (or otherwise falsy) on a manifest is counted as 0.
+ *
+ * @param results - Scenario results to aggregate; may be empty.
+ * @returns Summary with scenario and case counts, summed duration, tokens and
+ *   cost, and a per-run breakdown. `success` is true when no result has a
+ *   falsy `success` flag, which includes the empty-input case. The case-level
+ *   `successRate` is passed cases divided by total cases, or 0 when there are
+ *   no cases. `baseline` and `budget` are left unset.
+ */
+function buildCISummary(results: ScenarioRunResult[]): CISummary {
+  const totalScenarios = results.length;
+  const passedScenarios = results.filter((r) => r.success).length;
+  const failedScenarios = totalScenarios - passedScenarios;
+  const totalCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.total_cases || 0), 0);
+  const passedCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.passed_cases || 0), 0);
+  const failedCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.failed_cases || 0), 0);
+  const totalDuration = results.reduce((sum, r) => sum + (r.manifest.duration_ms || 0), 0);
+  // Aggregate token and cost metrics
+  const totalPromptTokens = results.reduce(
+    (sum, r) => sum + (r.manifest.metrics?.total_prompt_tokens || 0),
+    0
+  );
+  const totalCompletionTokens = results.reduce(
+    (sum, r) => sum + (r.manifest.metrics?.total_completion_tokens || 0),
+    0
+  );
+  const totalTokens = results.reduce((sum, r) => sum + (r.manifest.metrics?.total_tokens || 0), 0);
+  const totalCostUsd = results.reduce(
+    (sum, r) => sum + (r.manifest.metrics?.cost?.total_usd || 0),
+    0
+  );
+  return {
+    success: failedScenarios === 0,
+    scenarios: {
+      total: totalScenarios,
+      passed: passedScenarios,
+      failed: failedScenarios,
+    },
+    cases: {
+      total: totalCases,
+      passed: passedCases,
+      failed: failedCases,
+      successRate: totalCases > 0 ? passedCases / totalCases : 0,
+    },
+    duration: {
+      totalMs: totalDuration,
+      formatted: formatDuration(totalDuration),
+    },
+    tokens: {
+      prompt: totalPromptTokens,
+      completion: totalCompletionTokens,
+      total: totalTokens,
+    },
+    cost: {
+      estimatedUsd: totalCostUsd,
+      formatted: formatCost(totalCostUsd),
+    },
+    runs: results.map((r) => ({
+      runId: r.manifest.run_id || '',
+      scenario: r.scenarioName,
+      success: r.success,
+      successRate: r.manifest.metrics?.success_rate || 0,
+      passedCases: r.manifest.metrics?.passed_cases || 0,
+      failedCases: r.manifest.metrics?.failed_cases || 0,
+      totalCases: r.manifest.metrics?.total_cases || 0,
+      durationMs: r.manifest.duration_ms || 0,
+      estimatedCostUsd: r.manifest.metrics?.cost?.total_usd,
+    })),
+  };
+}
+/**
+ * Build security summary (for --summary security).
+ *
+ * @param results - Scenario results whose manifest case counts are summed.
+ * @returns Summary where `overallRisk` follows the overall pass rate:
+ *   at least 0.95 is 'low', at least 0.8 is 'medium', at least 0.5 is 'high',
+ *   and anything lower is 'critical'. All failed cases (total minus passed)
+ *   are reported under the bucket matching `overallRisk`; the other buckets
+ *   are 0. `recommendations` holds a single all-passing message when the pass
+ *   rate is 1; otherwise it holds two general items, plus a third when the
+ *   pass rate is below 0.8.
+ *
+ * NOTE(review): with zero total cases the pass rate is 0, so an empty run is
+ * classified 'critical'; confirm this is intended.
+ */
+function buildSecuritySummary(results: ScenarioRunResult[]): SecuritySummary {
+  const totalCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.total_cases || 0), 0);
+  const passedCases = results.reduce((sum, r) => sum + (r.manifest.metrics?.passed_cases || 0), 0);
+  const successRate = totalCases > 0 ? passedCases / totalCases : 0;
+  // Categorize risk based on success rate (for standard runs, invert for security context)
+  let overallRisk: 'low' | 'medium' | 'high' | 'critical';
+  if (successRate >= 0.95) overallRisk = 'low';
+  else if (successRate >= 0.8) overallRisk = 'medium';
+  else if (successRate >= 0.5) overallRisk = 'high';
+  else overallRisk = 'critical';
+  // Count failures by severity (simplified - can be enhanced with actual severity data)
+  const failedCases = totalCases - passedCases;
+  return {
+    overallRisk,
+    successRate,
+    vulnerabilities: {
+      critical: overallRisk === 'critical' ? failedCases : 0,
+      high: overallRisk === 'high' ? failedCases : 0,
+      medium: overallRisk === 'medium' ? failedCases : 0,
+      low: overallRisk === 'low' ? failedCases : 0,
+    },
+    recommendations:
+      successRate < 1
+        ? [
+            'Review failed test cases for potential issues',
+            'Consider adding more comprehensive test coverage',
+            successRate < 0.8 ? 'Investigate root causes of failures before deployment' : '',
+          ].filter(Boolean)
+        : ['All tests passing - continue monitoring'],
+  };
+}
 /**
  * Run a single scenario and return the result (quiet mode for parallel execution)
  */
@@ -139,7 +361,7 @@ async function runSingleScenario(
   scenarioPath: string,
   options: RunOptions,
   config: ArtemisConfig | null,
-  spinner: ReturnType<typeof createSpinner>,
+  spinner: SpinnerLike,
   isMultiScenario: boolean
 ): Promise<ScenarioRunResult> {
   // Parse scenario
@@ -376,21 +598,54 @@ export function runCommand(): Command {
       'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
     )
     .option('-i, --interactive', 'Enable interactive mode for scenario/provider selection')
+    .option('--ci', 'CI mode: machine-readable output, no colors/spinners, JSON summary')
+    .option(
+      '--summary <format>',
+      'Summary output format: json, text, or security (implies --ci for json/security)',
+      'text'
+    )
+    .option('--baseline', 'Compare against baseline and detect regression')
+    .option('--threshold <number>', 'Regression threshold (0-1), e.g., 0.05 for 5%', '0.05')
+    .option('--budget <amount>', 'Maximum budget in USD - fail if estimated cost exceeds this')
+    .option('--export <format>', 'Export format: markdown')
+    .option('--export-output <dir>', 'Output directory for exports (default: ./artemis-exports)')
     .action(async (scenarioPath: string | undefined, options: RunOptions) => {
-      const spinner = createSpinner('Loading configuration...');
-      spinner.start();
+      // Determine CI mode: explicit flag, environment variable, or summary format that implies CI
+      const isCIMode =
+        options.ci ||
+        process.env.CI === 'true' ||
+        options.summary === 'json' ||
+        options.summary === 'security';
+      // In CI mode, use a no-op spinner
+      const spinner = isCIMode
+        ? {
+            start: () => {},
+            stop: () => {},
+            succeed: () => {},
+            fail: () => {},
+            info: () => {},
+          }
+        : createSpinner('Loading configuration...');
+      if (!isCIMode) {
+        spinner.start();
+      }
       try {
         // Load config file if present
         const config = await loadConfig(options.config);
-        if (config) {
-          spinner.succeed(`Loaded config from ${config._path}`);
-        } else {
-          spinner.info('No config file found, using defaults');
+        if (!isCIMode) {
+          if (config) {
+            spinner.succeed(`Loaded config from ${config._path}`);
+          } else {
+            spinner.info('No config file found, using defaults');
+          }
         }
-        // Determine if we should use interactive mode
-        const useInteractive = options.interactive || (!scenarioPath && isInteractive());
+        // Determine if we should use interactive mode (never in CI mode)
+        const useInteractive =
+          !isCIMode && (options.interactive || (!scenarioPath && isInteractive()));
         // Interactive provider/model selection if requested
         if (useInteractive && !options.provider) {
@@ -539,9 +794,12 @@ export function runCommand(): Command {
               // Show additional metrics
               console.log();
+              const costInfo = result.manifest.metrics.cost
+                ? `  |  Est. Cost: ${formatCost(result.manifest.metrics.cost.total_usd)}`
+                : '';
               console.log(
                 chalk.dim(
-                  `Run ID: ${result.manifest.run_id}  |  Median Latency: ${result.manifest.metrics.median_latency_ms}ms  |  Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}`
+                  `Run ID: ${result.manifest.run_id}  |  Median Latency: ${result.manifest.metrics.median_latency_ms}ms  |  Tokens: ${result.manifest.metrics.total_tokens.toLocaleString()}${costInfo}`
                 )
               );
@@ -560,6 +818,16 @@ export function runCommand(): Command {
                 const savedPath = await storage.save(result.manifest);
                 console.log(chalk.dim(`Saved: ${savedPath}`));
               }
+              // Export to markdown if requested
+              if (options.export === 'markdown') {
+                const exportDir = options.exportOutput || './artemis-exports';
+                await mkdir(exportDir, { recursive: true });
+                const markdown = generateMarkdownReport(result.manifest);
+                const mdPath = join(exportDir, `${result.manifest.run_id}.md`);
+                await writeFile(mdPath, markdown);
+                console.log(chalk.dim(`Exported: ${mdPath}`));
+              }
             } catch (error) {
               // Record failed scenario
               console.log();
@@ -577,62 +845,221 @@ export function runCommand(): Command {
           }
         }
-        // Display aggregate summary for multiple scenarios
-        if (isMultiScenario) {
-          console.log();
-          console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
-          console.log();
+        // Build CI summary (used for CI mode output and baseline comparison)
+        const ciSummary = buildCISummary(results);
-          const totalScenarios = results.length;
-          const passedScenarios = results.filter((r) => r.success).length;
-          const failedScenarios = totalScenarios - passedScenarios;
+        // Baseline comparison (if enabled)
+        let baselineResult: {
+          hasRegression: boolean;
+          threshold: number;
+          delta?: { successRate: number; latency: number; tokens: number };
+        } | null = null;
-          const totalCases = results.reduce(
-            (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
-            0
-          );
-          const passedCases = results.reduce(
-            (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
-            0
-          );
-          const failedCases = results.reduce(
-            (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
-            0
-          );
-          const totalDuration = results.reduce((sum, r) => sum + (r.manifest.duration_ms || 0), 0);
+        if (options.baseline && results.length > 0) {
+          const regressionThreshold = Number.parseFloat(String(options.threshold)) || 0.05;
-          console.log(
-            `Scenarios:  ${chalk.green(`${passedScenarios} passed`)}  ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''}  ${chalk.dim(`(${totalScenarios} total)`)}`
-          );
-          console.log(
-            `Test Cases: ${chalk.green(`${passedCases} passed`)}  ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''}  ${chalk.dim(`(${totalCases} total)`)}`
-          );
-          console.log(`Duration:   ${chalk.dim(formatDuration(totalDuration))}`);
+          // Check each scenario against its baseline
+          for (const result of results) {
+            if (!result.manifest.run_id) continue;
+            if (isBaselineStorage(storage) && storage.compareToBaseline) {
+              try {
+                const comparison = await storage.compareToBaseline(
+                  result.manifest.run_id,
+                  regressionThreshold
+                );
+                if (comparison) {
+                  baselineResult = {
+                    hasRegression: comparison.hasRegression,
+                    threshold: comparison.regressionThreshold,
+                    delta: comparison.comparison.delta,
+                  };
+                  // Add baseline info to CI summary
+                  ciSummary.baseline = {
+                    compared: true,
+                    hasRegression: comparison.hasRegression,
+                    threshold: comparison.regressionThreshold,
+                    delta: comparison.comparison.delta,
+                  };
+                  if (!isCIMode && comparison.hasRegression) {
+                    console.log();
+                    console.log(
+                      `${icons.failed} ${chalk.red('Regression detected!')} for ${chalk.bold(result.scenarioName)}`
+                    );
+                    console.log(
+                      chalk.dim(
+                        `  Success rate dropped by ${Math.abs(comparison.comparison.delta.successRate * 100).toFixed(1)}% (threshold: ${regressionThreshold * 100}%)`
+                      )
+                    );
+                  }
+                }
+              } catch {
+                // Baseline comparison failed, continue without it
+              }
+            }
+          }
+        }
+        // Handle CI mode output
+        if (isCIMode) {
+          if (options.summary === 'json') {
+            console.log(JSON.stringify(ciSummary, null, 2));
+          } else if (options.summary === 'security') {
+            const securitySummary = buildSecuritySummary(results);
+            console.log(JSON.stringify(securitySummary, null, 2));
+          } else {
+            // Default CI text output (minimal)
+            const totalCases = ciSummary.cases.total;
+            const passedCases = ciSummary.cases.passed;
+            const failedCases = ciSummary.cases.failed;
+            const successRate = (ciSummary.cases.successRate * 100).toFixed(1);
+            console.log(`ARTEMISKIT_RESULT=${ciSummary.success ? 'PASS' : 'FAIL'}`);
+            console.log(`ARTEMISKIT_SCENARIOS_TOTAL=${ciSummary.scenarios.total}`);
+            console.log(`ARTEMISKIT_SCENARIOS_PASSED=${ciSummary.scenarios.passed}`);
+            console.log(`ARTEMISKIT_SCENARIOS_FAILED=${ciSummary.scenarios.failed}`);
+            console.log(`ARTEMISKIT_CASES_TOTAL=${totalCases}`);
+            console.log(`ARTEMISKIT_CASES_PASSED=${passedCases}`);
+            console.log(`ARTEMISKIT_CASES_FAILED=${failedCases}`);
+            console.log(`ARTEMISKIT_SUCCESS_RATE=${successRate}`);
+            console.log(`ARTEMISKIT_DURATION_MS=${ciSummary.duration.totalMs}`);
+            console.log(`ARTEMISKIT_TOKENS_TOTAL=${ciSummary.tokens.total}`);
+            console.log(`ARTEMISKIT_COST_USD=${ciSummary.cost.estimatedUsd.toFixed(4)}`);
+            if (baselineResult) {
+              console.log('ARTEMISKIT_BASELINE_COMPARED=true');
+              console.log(
+                `ARTEMISKIT_REGRESSION=${baselineResult.hasRegression ? 'true' : 'false'}`
+              );
+              if (baselineResult.delta) {
+                console.log(
+                  `ARTEMISKIT_DELTA_SUCCESS_RATE=${(baselineResult.delta.successRate * 100).toFixed(2)}`
+                );
+              }
+            }
+            // Also print run IDs for reference
+            for (const run of ciSummary.runs) {
+              if (run.runId) {
+                console.log(
+                  `ARTEMISKIT_RUN_ID_${run.scenario.toUpperCase().replace(/[^A-Z0-9]/g, '_')}=${run.runId}`
+                );
+              }
+            }
+          }
+        } else {
+          // Display aggregate summary for multiple scenarios (non-CI mode)
+          if (isMultiScenario) {
+            console.log();
+            console.log(chalk.bold.cyan('━━━ AGGREGATE SUMMARY ━━━'));
+            console.log();
+            const totalScenarios = results.length;
+            const passedScenarios = results.filter((r) => r.success).length;
+            const failedScenarios = totalScenarios - passedScenarios;
+            const totalCases = results.reduce(
+              (sum, r) => sum + (r.manifest.metrics?.total_cases || 0),
+              0
+            );
+            const passedCases = results.reduce(
+              (sum, r) => sum + (r.manifest.metrics?.passed_cases || 0),
+              0
+            );
+            const failedCases = results.reduce(
+              (sum, r) => sum + (r.manifest.metrics?.failed_cases || 0),
+              0
+            );
+            const totalDuration = results.reduce(
+              (sum, r) => sum + (r.manifest.duration_ms || 0),
+              0
+            );
-          if (runInParallel) {
             console.log(
-              `Mode:       ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
+              `Scenarios:  ${chalk.green(`${passedScenarios} passed`)}  ${failedScenarios > 0 ? chalk.red(`${failedScenarios} failed`) : ''}  ${chalk.dim(`(${totalScenarios} total)`)}`
             );
+            console.log(
+              `Test Cases: ${chalk.green(`${passedCases} passed`)}  ${failedCases > 0 ? chalk.red(`${failedCases} failed`) : ''}  ${chalk.dim(`(${totalCases} total)`)}`
+            );
+            console.log(`Duration:   ${chalk.dim(formatDuration(totalDuration))}`);
+            if (runInParallel) {
+              console.log(
+                `Mode:       ${chalk.cyan('parallel')} ${chalk.dim(`(${parallelLimit} concurrent)`)}`
+              );
+            }
+            console.log();
+            // List failed scenarios
+            const failedResults = results.filter((r) => !r.success);
+            if (failedResults.length > 0) {
+              console.log(chalk.red('Failed scenarios:'));
+              for (const result of failedResults) {
+                console.log(chalk.red(`  ${icons.failed} ${result.scenarioName}`));
+                if (result.error && options.verbose) {
+                  console.log(chalk.dim(`      ${result.error}`));
+                }
+              }
+              console.log();
+            }
           }
-          console.log();
-          // List failed scenarios
-          const failedResults = results.filter((r) => !r.success);
-          if (failedResults.length > 0) {
-            console.log(chalk.red('Failed scenarios:'));
-            for (const result of failedResults) {
-              console.log(chalk.red(`  ${icons.failed} ${result.scenarioName}`));
-              if (result.error && options.verbose) {
-                console.log(chalk.dim(`      ${result.error}`));
+          // Show baseline comparison result in non-CI mode
+          if (baselineResult && !baselineResult.hasRegression) {
+            console.log(`${icons.passed} ${chalk.green('No regression detected')}`);
+          }
+        }
+        // Check budget if specified
+        let budgetExceeded = false;
+        if (options.budget !== undefined) {
+          const budgetLimit = Number.parseFloat(String(options.budget));
+          const totalCost = ciSummary.cost.estimatedUsd;
+          if (totalCost > budgetLimit) {
+            budgetExceeded = true;
+            const overBy = totalCost - budgetLimit;
+            // Add budget info to CI summary
+            ciSummary.budget = {
+              limit: budgetLimit,
+              exceeded: true,
+              overBy,
+            };
+            if (isCIMode) {
+              if (options.summary === 'json') {
+                // Budget info already in ciSummary, will be output above
+              } else {
+                console.log(`ARTEMISKIT_BUDGET_LIMIT=${budgetLimit.toFixed(2)}`);
+                console.log('ARTEMISKIT_BUDGET_EXCEEDED=true');
+                console.log(`ARTEMISKIT_BUDGET_OVER_BY=${overBy.toFixed(4)}`);
               }
+            } else {
+              console.log();
+              console.log(chalk.red(`${icons.failed} BUDGET EXCEEDED`));
+              console.log(
+                chalk.red(
+                  `   Budget: $${budgetLimit.toFixed(2)}  |  Actual: ${formatCost(totalCost)}  |  Over by: ${formatCost(overBy)}`
+                )
+              );
+              console.log();
             }
-            console.log();
+          } else if (!isCIMode) {
+            console.log(
+              `${icons.passed} ${chalk.green('Within budget')} ${chalk.dim(`($${budgetLimit.toFixed(2)} limit, ${formatCost(totalCost)} used)`)}`
+            );
           }
         }
-        // Exit with error if any scenarios failed
+        // Exit with error if any scenarios failed, regression detected, or budget exceeded
         const hasFailures = results.some((r) => !r.success);
-        if (hasFailures) {
+        const hasRegression = baselineResult?.hasRegression || false;
+        if (hasFailures || hasRegression || budgetExceeded) {
           process.exit(1);
         }
       } catch (error) {

package/src/commands/stress.ts CHANGED Viewed

@@ -13,6 +13,7 @@ import {
   type StressRequestResult,
   createAdapter,
   estimateCost,
+  formatCost,
   getGitInfo,
   getModelPricing,
   parseScenarioFile,
@@ -26,6 +27,7 @@ import {
   colors,
   createSpinner,
   getProviderErrorContext,
+  icons,
   isTTY,
   renderError,
   renderInfoBox,
@@ -52,6 +54,8 @@ interface StressOptions {
   config?: string;
   redact?: boolean;
   redactPatterns?: string[];
+  /** Budget limit in USD - fail if cost exceeds this */
+  budget?: number;
 }
 export function stressCommand(): Command {
@@ -75,6 +79,7 @@ export function stressCommand(): Command {
       '--redact-patterns <patterns...>',
       'Custom redaction patterns (regex or built-in: email, phone, credit_card, ssn, api_key)'
     )
+    .option('--budget <amount>', 'Maximum budget in USD - fail if estimated cost exceeds this')
     .action(async (scenarioPath: string, options: StressOptions) => {
       const spinner = createSpinner('Loading configuration...');
       spinner.start();
@@ -319,6 +324,29 @@ export function stressCommand(): Command {
           console.log(chalk.dim(`  HTML: ${htmlPath}`));
           console.log(chalk.dim(`  JSON: ${jsonPath}`));
         }
+        // Check budget if specified
+        if (options.budget !== undefined && metrics.cost) {
+          const budgetLimit = Number.parseFloat(String(options.budget));
+          const totalCost = metrics.cost.estimated_total_usd;
+          if (totalCost > budgetLimit) {
+            const overBy = totalCost - budgetLimit;
+            console.log();
+            console.log(chalk.red(`${icons.failed} BUDGET EXCEEDED`));
+            console.log(
+              chalk.red(
+                `   Budget: $${budgetLimit.toFixed(2)}  |  Actual: ${formatCost(totalCost)}  |  Over by: ${formatCost(overBy)}`
+              )
+            );
+            process.exit(1);
+          } else {
+            console.log();
+            console.log(
+              `${icons.passed} ${chalk.green('Within budget')} ${chalk.dim(`($${budgetLimit.toFixed(2)} limit, ${formatCost(totalCost)} used)`)}`
+            );
+          }
+        }
       } catch (error) {
         spinner.fail('Error');

package/src/config/schema.ts CHANGED Viewed

@@ -17,6 +17,9 @@ const ProviderConfigSchema = z.object({
   deploymentName: z.string().optional(),
   apiVersion: z.string().optional(),
   embeddingDeploymentName: z.string().optional(),
+  // Model family for parameter detection (e.g., 'gpt-5-mini' when deployment is '5-mini')
+  // Used by OpenAI/Azure to determine which API parameters to use (max_tokens vs max_completion_tokens)
+  modelFamily: z.string().optional(),
   // Vercel AI specific
   underlyingProvider: z.enum(['openai', 'azure', 'anthropic', 'google', 'mistral']).optional(),
 });