npm - ai-speedometer - Versions diffs - 1.2.5 → 1.3.2 - Mend

ai-speedometer 1.2.5 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/cli.js CHANGED Viewed

@@ -6,7 +6,6 @@ import { createOpenAICompatible } from '@ai-sdk/openai-compatible';
 import { createAnthropic } from '@ai-sdk/anthropic';
 import { streamText } from 'ai';  // Changed from streamText to generateText
 import { testPrompt } from './test-prompt.js';
-import { LLMBenchmark } from './benchmark-rest.js';
 import { getAllProviders, searchProviders, getModelsForProvider } from './models-dev.js';
 import {
   getAllAvailableProviders,
@@ -29,8 +28,64 @@ import {
 import 'dotenv/config';
 import Table from 'cli-table3';
-// Check for debug flag
-const debugMode = process.argv.includes('--debug');
+/**
+ * Parse the flags that follow the script path in `process.argv`.
+ *
+ * Recognised flags: `--debug`, `--bench <provider:model>`, `--api-key <key>`,
+ * `--ai-sdk`, `--formatted`, and `--help` / `-h`. Any other argument is ignored.
+ *
+ * A value-taking flag (`--bench`, `--api-key`) that is the last argument, or
+ * that is followed directly by another `--flag`, is reported on stderr and the
+ * process exits with code 1. Previously the next argument was consumed
+ * unconditionally, so a forgotten value either swallowed the following flag
+ * or left the option `undefined` and silently started interactive mode.
+ *
+ * @returns {{debug: boolean, bench: (string|null), apiKey: (string|null),
+ *   useAiSdk: boolean, formatted: boolean, help: boolean}} Parsed options;
+ *   `bench` and `apiKey` stay `null` when their flag is absent.
+ */
+function parseCliArgs() {
+  const args = process.argv.slice(2);
+  const parsed = {
+    debug: false,
+    bench: null,
+    apiKey: null,
+    useAiSdk: false,
+    formatted: false,
+    help: false
+  };
+  // Return the value that must follow the value-taking flag at `index`.
+  const readValue = (index) => {
+    const value = args[index + 1];
+    if (value === undefined || value.startsWith('--')) {
+      console.error(`Error: ${args[index]} requires a value`);
+      process.exit(1);
+    }
+    return value;
+  };
+  for (let i = 0; i < args.length; i++) {
+    const arg = args[i];
+    if (arg === '--debug') {
+      parsed.debug = true;
+    } else if (arg === '--bench') {
+      parsed.bench = readValue(i);
+      i++; // skip the value that was just consumed
+    } else if (arg === '--api-key') {
+      parsed.apiKey = readValue(i);
+      i++; // skip the value that was just consumed
+    } else if (arg === '--ai-sdk') {
+      parsed.useAiSdk = true;
+    } else if (arg === '--formatted') {
+      parsed.formatted = true;
+    } else if (arg === '--help' || arg === '-h') {
+      parsed.help = true;
+    }
+  }
+  return parsed;
+}
+/**
+ * Print the command-line usage text to stdout: a cyan title, then the
+ * "Usage", "Options" and "Examples" sections, each followed by a blank line.
+ */
+function showHelp() {
+  // Section headings are printed in yellow.
+  const printHeading = (label) => console.log(colorText(label, 'yellow'));
+  // Each row is a fixed-width left column followed by a dimmed description.
+  const printRow = ([left, note]) => console.log(left + colorText(note, 'dim'));
+  const printBlank = () => console.log('');
+  console.log(colorText('ai-speedometer - Benchmark AI models', 'cyan'));
+  printBlank();
+  printHeading('Usage:');
+  [
+    ['  ai-speedometer                                  ', '# Interactive mode'],
+    ['  ai-speedometer --bench <provider:model>         ', '# Headless benchmark']
+  ].forEach((row) => printRow(row));
+  printBlank();
+  printHeading('Options:');
+  [
+    ['  --bench <provider:model>    ', 'Run benchmark in headless mode'],
+    ['  --api-key <key>             ', 'Override API key (optional)'],
+    ['  --ai-sdk                    ', 'Use AI SDK instead of REST API'],
+    ['  --formatted                 ', 'Format JSON output for human readability'],
+    ['  --debug                     ', 'Enable debug logging'],
+    ['  --help, -h                  ', 'Show this help message']
+  ].forEach((row) => printRow(row));
+  printBlank();
+  printHeading('Examples:');
+  [
+    '  ai-speedometer --bench openai:gpt-4',
+    '  ai-speedometer --bench anthropic:claude-3-opus --api-key "sk-..."',
+    '  ai-speedometer --bench openai:gpt-4 --ai-sdk',
+    '  ai-speedometer --bench openai:gpt-4 --formatted'
+  ].forEach((example) => console.log(example));
+  printBlank();
+}
+const cliArgs = parseCliArgs();
+const debugMode = cliArgs.debug;
 let logFile = null;
 function log(message) {
@@ -125,7 +180,6 @@ function clearScreen() {
 function showHeader() {
   console.log(colorText('Ai-speedometer', 'cyan'));
   console.log(colorText('=============================', 'cyan'));
-  console.log(colorText('Note: opencode uses ai-sdk', 'dim'));
   console.log('');
 }
@@ -302,7 +356,6 @@ async function selectModelsCircular() {
       // Add header
       screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
       screenContent += colorText('=============================', 'cyan') + '\n';
-      screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
       screenContent += '\n';
       screenContent += colorText('Select Models for Benchmark', 'magenta') + '\n';
@@ -805,8 +858,9 @@ async function displayColorfulResults(results, method = 'AI SDK', models = []) {
   console.log(colorText('COMPREHENSIVE PERFORMANCE SUMMARY', 'yellow'));
   // Add note about method differences
-  console.log(colorText('Note: ', 'cyan') + colorText('Benchmark over REST API doesn\'t utilize streaming, so TTFT is 0. AI SDK utilizes streaming, but', 'dim'));
-  console.log(colorText('      ', 'cyan') + colorText('if the model is a thinking model, TTFT will be much higher because thinking tokens are not counted as first token.', 'dim'));
+  console.log(colorText('Note: ', 'cyan') + colorText('REST API with streaming now supports TTFT measurement. AI SDK also supports streaming.', 'dim'));
+  console.log(colorText('      ', 'cyan') + colorText('For thinking models, TTFT will be higher as thinking tokens are processed before output tokens.', 'dim'));
+  console.log(colorText('      ', 'cyan') + colorText('[est] markers indicate token counts were estimated (API did not provide usage metadata).', 'dim'));
   console.log('');
   const table = new Table({
@@ -833,15 +887,19 @@ async function displayColorfulResults(results, method = 'AI SDK', models = []) {
   // Add data rows (already ranked by sort order)
   sortedResults.forEach((result) => {
+    const outputTokenDisplay = result.tokenCount.toString() + (result.usedEstimateForOutput ? ' [est]' : '');
+    const promptTokenDisplay = result.promptTokens.toString() + (result.usedEstimateForInput ? ' [est]' : '');
+    const totalTokenDisplay = result.totalTokens.toString() + ((result.usedEstimateForInput || result.usedEstimateForOutput) ? ' [est]' : '');
     table.push([
       colorText(result.model, 'white'),
       colorText(result.provider, 'white'),
       colorText((result.totalTime / 1000).toFixed(2), 'green'),
       colorText((result.timeToFirstToken / 1000).toFixed(2), 'yellow'),
       colorText(result.tokensPerSecond.toFixed(1), 'magenta'),
-      colorText(result.tokenCount.toString(), 'blue'),
-      colorText(result.promptTokens.toString(), 'blue'),
-      colorText(result.totalTokens.toString(), 'bright')
+      colorText(outputTokenDisplay, result.usedEstimateForOutput ? 'yellow' : 'blue'),
+      colorText(promptTokenDisplay, result.usedEstimateForInput ? 'yellow' : 'blue'),
+      colorText(totalTokenDisplay, (result.usedEstimateForInput || result.usedEstimateForOutput) ? 'yellow' : 'bright')
     ]);
   });
@@ -1007,7 +1065,6 @@ async function addVerifiedProvider() {
     // Add header
     screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
     screenContent += colorText('=============================', 'cyan') + '\n';
-    screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
     screenContent += '\n';
     screenContent += colorText('Add Verified Provider', 'magenta') + '\n';
@@ -1223,7 +1280,6 @@ async function addCustomProviderCLI() {
     // Add header
     screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
     screenContent += colorText('=============================', 'cyan') + '\n';
-    screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
     screenContent += '\n';
     screenContent += colorText('Add Custom Provider', 'magenta') + '\n';
@@ -1486,7 +1542,6 @@ async function addCustomModelsMenu() {
     // Add header
     screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
     screenContent += colorText('=============================', 'cyan') + '\n';
-    screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
     screenContent += '\n';
     screenContent += colorText('Add Custom Models', 'magenta') + '\n';
@@ -1553,7 +1608,6 @@ async function addModelsToExistingProvider() {
     // Add header
     screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
     screenContent += colorText('=============================', 'cyan') + '\n';
-    screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
     screenContent += '\n';
     screenContent += colorText('Add Models to Existing Provider', 'magenta') + '\n';
@@ -1665,23 +1719,9 @@ async function addModelsToExistingProvider() {
   await question(colorText('\nPress Enter to continue...', 'yellow'));
 }
-// REST API benchmark function using direct API calls
-async function runRestApiBenchmark(models) {
-  if (models.length === 0) {
-    console.log(colorText('No models selected for benchmarking.', 'red'));
-    return;
-  }
-  clearScreen();
-  showHeader();
-  console.log(colorText('Running REST API Benchmark...', 'green'));
-  console.log(colorText(`Running ${models.length} models in parallel...`, 'cyan'));
-  console.log(colorText('Note: This uses direct REST API calls instead of AI SDK', 'dim'));
-  console.log('');
-  // Create a function to benchmark a single model using REST API
-  const benchmarkModelRest = async (model) => {
-    console.log(colorText(`Testing ${model.name} (${model.providerName}) via REST API...`, 'yellow'));
+// Silent benchmark helper (returns raw result without UI)
+async function benchmarkSingleModelRest(model) {
     try {
       // Validate required configuration
@@ -1693,14 +1733,24 @@ async function runRestApiBenchmark(models) {
         throw new Error(`Missing base URL for provider ${model.providerName}`);
       }
+      // Extract the actual model ID for API calls (moved before usage)
+      let actualModelId = model.name;
+      if (model.id && model.id.includes('_')) {
+        actualModelId = model.id.split('_')[1];
+      }
+      actualModelId = actualModelId.trim();
       const startTime = Date.now();
+      let firstTokenTime = null;
+      let streamedText = '';
+      let tokenCount = 0;
       // Use correct endpoint based on provider type
       let endpoint;
       if (model.providerType === 'anthropic') {
         endpoint = '/messages';
       } else if (model.providerType === 'google') {
-        endpoint = '/models/' + actualModelId + ':generateContent';
+        endpoint = '/models/' + actualModelId + ':streamGenerateContent';
       } else {
         endpoint = '/chat/completions';
       }
@@ -1709,17 +1759,7 @@ async function runRestApiBenchmark(models) {
       const baseUrl = model.providerConfig.baseUrl.replace(/\/$/, '');
       const url = `${baseUrl}${endpoint}`;
-      // Extract the actual model ID for API calls
-      let actualModelId = model.name;
-      if (model.id && model.id.includes('_')) {
-        // For models with provider prefix, extract the actual model ID
-        actualModelId = model.id.split('_')[1];
-        console.log(colorText(`  Using extracted model ID: ${actualModelId}`, 'cyan'));
-      }
-      // Trim any trailing spaces from model names
-      actualModelId = actualModelId.trim();
-      console.log(colorText(`  Using final model ID: "${actualModelId}"`, 'cyan'));
       const headers = {
         'Content-Type': 'application/json',
@@ -1731,7 +1771,6 @@ async function runRestApiBenchmark(models) {
         headers['x-api-key'] = model.providerConfig.apiKey;
         headers['anthropic-version'] = '2023-06-01';
       } else if (model.providerType === 'google') {
-        // Google uses different auth
         delete headers['Authorization'];
         headers['x-goog-api-key'] = model.providerConfig.apiKey;
       }
@@ -1742,14 +1781,15 @@ async function runRestApiBenchmark(models) {
           { role: 'user', content: testPrompt }
         ],
         max_tokens: 500,
-        temperature: 0.7
+        temperature: 0.7,
+        stream: true
       };
       // Adjust for provider-specific formats
       if (model.providerType === 'anthropic') {
         body.max_tokens = 500;
+        body.stream = true;
       } else if (model.providerType === 'google') {
-        // Google format is slightly different
         body.contents = [{ parts: [{ text: testPrompt }] }];
         body.generationConfig = {
           maxOutputTokens: 500,
@@ -1757,10 +1797,10 @@ async function runRestApiBenchmark(models) {
         };
         delete body.messages;
         delete body.max_tokens;
+        delete body.stream;
       }
-      console.log(colorText(`  Making request to: ${url}`, 'cyan'));
-      console.log(colorText(`  Using model: ${actualModelId}`, 'cyan'));
       const response = await fetch(url, {
         method: 'POST',
@@ -1768,56 +1808,154 @@ async function runRestApiBenchmark(models) {
         body: JSON.stringify(body)
       });
-      console.log(colorText(`  Response status: ${response.status}`, 'cyan'));
       if (!response.ok) {
         const errorText = await response.text();
-        console.log(colorText(`  Error: ${errorText.slice(0, 200)}...`, 'red'));
         throw new Error(`API request failed: ${response.status} ${response.statusText}`);
       }
-      const data = await response.json();
+      // Process streaming response
+      const reader = response.body.getReader();
+      const decoder = new TextDecoder();
+      let buffer = '';
+      let inputTokens = 0;
+      let outputTokens = 0;
+      let isFirstChunk = true;
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        // Capture TTFT on first chunk arrival (network level)
+        if (isFirstChunk && !firstTokenTime) {
+          firstTokenTime = Date.now();
+          isFirstChunk = false;
+          // Show live TTFT result (only in interactive mode, not headless)
+          const ttftSeconds = ((firstTokenTime - startTime) / 1000).toFixed(2);
+          if (!cliArgs.bench) {
+            console.log(colorText(`TTFT received at ${ttftSeconds}s for ${model.name}`, 'green'));
+          }
+        }
+        buffer += decoder.decode(value, { stream: true });
+        const lines = buffer.split('\n');
+        buffer = lines.pop() || '';
+        for (const line of lines) {
+          const trimmedLine = line.trim();
+          if (!trimmedLine) continue;
+          try {
+            if (model.providerType === 'anthropic') {
+              // Anthropic uses newline-delimited JSON with event types
+              if (trimmedLine.startsWith('data: ')) {
+                const jsonStr = trimmedLine.slice(6);
+                if (jsonStr === '[DONE]') break;
+                const chunk = JSON.parse(jsonStr);
+                if (chunk.type === 'content_block_delta' && chunk.delta?.text) {
+                  streamedText += chunk.delta.text;
+                } else if (chunk.type === 'message_start' && chunk.message?.usage) {
+                  inputTokens = chunk.message.usage.input_tokens || 0;
+                } else if (chunk.type === 'message_delta') {
+                  // Capture output tokens from message_delta
+                  if (chunk.usage?.output_tokens) {
+                    outputTokens = chunk.usage.output_tokens;
+                  }
+                  // Some implementations put input_tokens here too
+                  if (chunk.usage?.input_tokens && !inputTokens) {
+                    inputTokens = chunk.usage.input_tokens;
+                  }
+                }
+              } else if (trimmedLine.startsWith('event: ')) {
+                // Skip event lines (Anthropic SSE format uses separate event and data lines)
+                continue;
+              } else {
+                // Try parsing as raw JSON (some Anthropic-compatible APIs don't use SSE format)
+                const chunk = JSON.parse(trimmedLine);
+                if (chunk.type === 'content_block_delta' && chunk.delta?.text) {
+                  streamedText += chunk.delta.text;
+                } else if (chunk.type === 'message_start' && chunk.message?.usage) {
+                  inputTokens = chunk.message.usage.input_tokens || 0;
+                } else if (chunk.type === 'message_delta') {
+                  if (chunk.usage?.output_tokens) {
+                    outputTokens = chunk.usage.output_tokens;
+                  }
+                  if (chunk.usage?.input_tokens && !inputTokens) {
+                    inputTokens = chunk.usage.input_tokens;
+                  }
+                }
+              }
+            } else if (model.providerType === 'google') {
+              // Google streaming format
+              const chunk = JSON.parse(trimmedLine);
+              if (chunk.candidates?.[0]?.content?.parts?.[0]?.text) {
+                const text = chunk.candidates[0].content.parts[0].text;
+                streamedText += text;
+              }
+              if (chunk.usageMetadata?.promptTokenCount) {
+                inputTokens = chunk.usageMetadata.promptTokenCount;
+              }
+              if (chunk.usageMetadata?.candidatesTokenCount) {
+                outputTokens = chunk.usageMetadata.candidatesTokenCount;
+              }
+            } else {
+              // OpenAI-compatible SSE format
+              if (trimmedLine.startsWith('data: ')) {
+                const jsonStr = trimmedLine.slice(6);
+                if (jsonStr === '[DONE]') break;
+                const chunk = JSON.parse(jsonStr);
+                if (chunk.choices?.[0]?.delta?.content) {
+                  streamedText += chunk.choices[0].delta.content;
+                }
+                if (chunk.usage?.prompt_tokens) {
+                  inputTokens = chunk.usage.prompt_tokens;
+                }
+                if (chunk.usage?.completion_tokens) {
+                  outputTokens = chunk.usage.completion_tokens;
+                }
+              }
+            }
+          } catch (parseError) {
+            // Skip invalid JSON lines
+            continue;
+          }
+        }
+      }
       const endTime = Date.now();
       const totalTime = endTime - startTime;
+      const timeToFirstToken = firstTokenTime ? firstTokenTime - startTime : totalTime;
-      // Calculate tokens based on provider type
-      let inputTokens, outputTokens;
-      if (model.providerType === 'anthropic') {
-        inputTokens = data.usage?.input_tokens || Math.round(testPrompt.length / 4);
-        outputTokens = data.usage?.output_tokens || Math.round(data.content?.[0]?.text?.length / 4 || 0);
-      } else if (model.providerType === 'google') {
-        inputTokens = data.usageMetadata?.promptTokenCount || Math.round(testPrompt.length / 4);
-        outputTokens = data.usageMetadata?.candidatesTokenCount || Math.round(data.candidates?.[0]?.content?.parts?.[0]?.text?.length / 4 || 0);
-      } else {
-        inputTokens = data.usage?.prompt_tokens || Math.round(testPrompt.length / 4);
-        outputTokens = data.usage?.completion_tokens || Math.round(data.choices?.[0]?.message?.content?.length / 4 || 0);
-      }
-      const totalTokens = inputTokens + outputTokens;
+      // Calculate token counts - use provider's count if available, otherwise estimate
+      const usedEstimateForOutput = !outputTokens;
+      const usedEstimateForInput = !inputTokens;
+      const finalOutputTokens = outputTokens || Math.round(streamedText.length / 4);
+      const finalInputTokens = inputTokens || Math.round(testPrompt.length / 4);
+      const totalTokens = finalInputTokens + finalOutputTokens;
       const tokensPerSecond = totalTime > 0 ? (totalTokens / totalTime) * 1000 : 0;
-      console.log(colorText('Completed!', 'green'));
-      console.log(colorText(`  Total Time: ${(totalTime / 1000).toFixed(2)}s`, 'cyan'));
-      console.log(colorText(`  Tokens/Sec: ${tokensPerSecond.toFixed(1)}`, 'cyan'));
-      console.log(colorText(`  Input Tokens: ${inputTokens}`, 'cyan'));
-      console.log(colorText(`  Output Tokens: ${outputTokens}`, 'cyan'));
-      console.log(colorText(`  Total Tokens: ${totalTokens}`, 'cyan'));
       return {
         model: model.name,
         provider: model.providerName,
         totalTime: totalTime,
-        timeToFirstToken: 0, // REST API doesn't track TTFT
-        tokenCount: outputTokens,
+        timeToFirstToken: timeToFirstToken,
+        tokenCount: finalOutputTokens,
         tokensPerSecond: tokensPerSecond,
-        promptTokens: inputTokens,
+        promptTokens: finalInputTokens,
         totalTokens: totalTokens,
+        usedEstimateForOutput: usedEstimateForOutput,
+        usedEstimateForInput: usedEstimateForInput,
         success: true
       };
     } catch (error) {
-      console.log(colorText('Failed: ', 'red') + error.message);
       return {
         model: model.name,
         provider: model.providerName,
@@ -1831,17 +1969,49 @@ async function runRestApiBenchmark(models) {
         error: error.message
       };
     }
-  };
+}
+// REST API benchmark function using direct API calls (with UI)
+async function runRestApiBenchmark(models) {
+  if (models.length === 0) {
+    console.log(colorText('No models selected for benchmarking.', 'red'));
+    return;
+  }
-  // Run all benchmarks in parallel
+  clearScreen();
+  showHeader();
+  console.log(colorText('Running REST API Benchmark with Streaming...', 'green'));
+  console.log(colorText(`Running ${models.length} models in parallel...`, 'cyan'));
+  console.log(colorText('Note: This uses direct REST API calls with streaming support', 'dim'));
+  console.log('');
+  // Run all benchmarks in parallel with UI feedback
   console.log(colorText('Starting parallel REST API benchmark execution...', 'cyan'));
-  const promises = models.map(model => benchmarkModelRest(model));
+  // Start all benchmarks in parallel
+  const promises = models.map(model => {
+    console.log(colorText(`Testing ${model.name} (${model.providerName}) via REST API with streaming...`, 'yellow'));
+    return benchmarkSingleModelRest(model);
+  });
   const results = await Promise.all(promises);
+  // Show individual results after all complete
+  results.forEach((result, index) => {
+    if (result.success) {
+      console.log(colorText(`✓ ${result.model} (${result.provider}) completed!`, 'green'));
+      console.log(colorText(`  Total Time: ${(result.totalTime / 1000).toFixed(2)}s`, 'cyan'));
+      console.log(colorText(`  TTFT: ${(result.timeToFirstToken / 1000).toFixed(2)}s`, 'cyan'));
+      console.log(colorText(`  Tokens/Sec: ${result.tokensPerSecond.toFixed(1)}`, 'cyan'));
+    } else {
+      console.log(colorText(`✗ ${result.model} (${result.provider}) failed: `, 'red') + result.error);
+    }
+  });
   console.log('');
   console.log(colorText('All REST API benchmarks completed!', 'green'));
-  await displayColorfulResults(results, 'REST API', models);
+  await displayColorfulResults(results, 'REST API (Streaming)', models);
   // Add successful models to recent models list
   const successfulModels = results
@@ -1868,16 +2038,16 @@ async function runRestApiBenchmark(models) {
 async function showMainMenu() {
   const menuOptions = [
     { id: 1, text: 'Set Model', action: () => showModelMenu() },
-    { id: 2, text: 'Run Benchmark (AI SDK)', action: async () => {
+    { id: 2, text: 'Run Benchmark (REST API)', action: async () => {
       const selectedModels = await selectModelsCircular();
       if (selectedModels.length > 0) {
-        await runStreamingBenchmark(selectedModels);
+        await runRestApiBenchmark(selectedModels);
       }
     }},
-    { id: 3, text: 'Run Benchmark (REST API)', action: async () => {
+    { id: 3, text: 'Run Benchmark (AI SDK - Legacy)', action: async () => {
       const selectedModels = await selectModelsCircular();
       if (selectedModels.length > 0) {
-        await runRestApiBenchmark(selectedModels);
+        await runStreamingBenchmark(selectedModels);
       }
     }},
     { id: 4, text: 'Exit', action: () => {
@@ -1896,7 +2066,6 @@ async function showMainMenu() {
     // Add header
     screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
     screenContent += colorText('=============================', 'cyan') + '\n';
-    screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
     screenContent += '\n';
     screenContent += colorText('Main Menu:', 'cyan') + '\n';
@@ -1953,7 +2122,6 @@ async function showModelMenu() {
     // Add header
     screenContent += colorText('Ai-speedometer', 'cyan') + '\n';
     screenContent += colorText('=============================', 'cyan') + '\n';
-    screenContent += colorText('Note: opencode uses ai-sdk', 'dim') + '\n';
     screenContent += '\n';
     screenContent += colorText('Model Management:', 'cyan') + '\n';
@@ -2002,17 +2170,158 @@ process.on('SIGINT', () => {
   process.exit(0);
 });
+/**
+ * Headless benchmark mode: benchmark a single model without the interactive
+ * UI and print one JSON result object to stdout.
+ *
+ * Every path through this function ends the process: exit code 0 when the
+ * benchmark result reports success, exit code 1 for an invalid spec, an
+ * unknown provider or model, a missing API key, the unimplemented AI SDK
+ * path, a failed benchmark, or any thrown error.
+ *
+ * @param {string} benchSpec - Target in `provider:model` form. It is split at
+ *   the FIRST colon only, so a model name that itself contains colons is
+ *   kept whole instead of being truncated at its first colon.
+ * @param {?string} apiKey - API key override; when falsy the provider's
+ *   configured `apiKey` is used.
+ * @param {boolean} useAiSdk - Request the AI SDK path, which is not yet
+ *   implemented here and exits with code 1.
+ * @returns {Promise<void>}
+ */
+async function runHeadlessBenchmark(benchSpec, apiKey, useAiSdk) {
+  try {
+    // Parse provider:model format (split on the first colon only)
+    const separatorIndex = benchSpec.indexOf(':');
+    const providerSpec = separatorIndex === -1 ? '' : benchSpec.slice(0, separatorIndex);
+    const modelName = separatorIndex === -1 ? '' : benchSpec.slice(separatorIndex + 1);
+    if (!providerSpec || !modelName) {
+      console.error(colorText('Error: Invalid --bench format. Use: provider:model', 'red'));
+      console.error(colorText('Example: --bench zai-code-anth:glm-4.6', 'yellow'));
+      process.exit(1);
+    }
+    // Load all available providers
+    const config = await loadConfig();
+    // Find the provider (case-insensitive search)
+    const provider = config.providers.find(p =>
+      p.id?.toLowerCase() === providerSpec.toLowerCase() ||
+      p.name?.toLowerCase() === providerSpec.toLowerCase()
+    );
+    if (!provider) {
+      console.error(colorText(`Error: Provider '${providerSpec}' not found`, 'red'));
+      console.error(colorText('Available providers:', 'yellow'));
+      config.providers.forEach(p => {
+        console.error(colorText(`  - ${p.id || p.name}`, 'cyan'));
+      });
+      process.exit(1);
+    }
+    // A provider entry without a models array is treated as having no models,
+    // so the user gets the "Model not found" message instead of a TypeError.
+    const providerModels = provider.models ?? [];
+    // Find the model
+    // Model IDs are prefixed with provider name (e.g., "zai-code-anth_glm-4.6")
+    // So we need to check:
+    // 1. Full ID match: "zai-code-anth_glm-4.6"
+    // 2. ID without provider prefix: "glm-4.6"
+    // 3. Name match: "GLM-4.6-anth"
+    const searchLower = modelName.toLowerCase();
+    const model = providerModels.find(m => {
+      const modelIdLower = m.id?.toLowerCase() || '';
+      const modelNameLower = m.name?.toLowerCase() || '';
+      // Check full ID match
+      if (modelIdLower === searchLower) return true;
+      // Check ID without provider prefix (strip "provider_" prefix)
+      const idWithoutPrefix = modelIdLower.includes('_')
+        ? modelIdLower.split('_').slice(1).join('_')
+        : modelIdLower;
+      if (idWithoutPrefix === searchLower) return true;
+      // Check name match
+      if (modelNameLower === searchLower) return true;
+      return false;
+    });
+    if (!model) {
+      console.error(colorText(`Error: Model '${modelName}' not found in provider '${provider.name}'`, 'red'));
+      console.error(colorText('Available models:', 'yellow'));
+      providerModels.forEach(m => {
+        // Show both name and ID (without provider prefix) for clarity
+        const idWithoutPrefix = m.id?.includes('_')
+          ? m.id.split('_').slice(1).join('_')
+          : m.id;
+        console.error(colorText(`  - ${m.name} (id: ${idWithoutPrefix})`, 'cyan'));
+      });
+      process.exit(1);
+    }
+    // If API key provided via flag, use it; otherwise use existing config
+    const finalApiKey = apiKey || provider.apiKey;
+    if (!finalApiKey) {
+      console.error(colorText(`Error: No API key found for provider '${provider.name}'`, 'red'));
+      console.error(colorText('Please provide --api-key flag or configure the provider first', 'yellow'));
+      process.exit(1);
+    }
+    // Create model object with all required config
+    const modelConfig = {
+      ...model,
+      providerName: provider.name,
+      providerType: provider.type,
+      providerId: provider.id,
+      providerConfig: {
+        ...provider,
+        apiKey: finalApiKey,
+        baseUrl: provider.baseUrl || ''
+      },
+      selected: true
+    };
+    // Run benchmark silently and get results
+    let result;
+    if (useAiSdk) {
+      // TODO: Implement AI SDK silent benchmark
+      console.error(colorText('AI SDK headless mode not yet implemented', 'red'));
+      process.exit(1);
+    } else {
+      result = await benchmarkSingleModelRest(modelConfig);
+    }
+    // Output JSON to stdout
+    const jsonOutput = {
+      provider: provider.name,
+      providerId: provider.id,
+      model: model.name,
+      modelId: model.id,
+      method: useAiSdk ? 'ai-sdk' : 'rest-api',
+      success: result.success,
+      totalTime: result.totalTime,
+      totalTimeSeconds: result.totalTime / 1000,
+      timeToFirstToken: result.timeToFirstToken,
+      timeToFirstTokenSeconds: result.timeToFirstToken / 1000,
+      tokensPerSecond: result.tokensPerSecond,
+      outputTokens: result.tokenCount,
+      promptTokens: result.promptTokens,
+      totalTokens: result.totalTokens,
+      is_estimated: !!(result.usedEstimateForOutput || result.usedEstimateForInput),
+      error: result.error || null
+    };
+    // Single-line JSON by default; two-space indentation with --formatted
+    console.log(JSON.stringify(jsonOutput, null, cliArgs.formatted ? 2 : 0));
+    process.exit(result.success ? 0 : 1);
+  } catch (error) {
+    console.error(colorText('Error: ' + error.message, 'red'));
+    if (debugMode) {
+      console.error(error.stack);
+    }
+    process.exit(1);
+  }
+}
 // Start the CLI
-if (import.meta.url === `file://${process.argv[1]}` ||
-    process.argv.length === 2 ||
-    (process.argv.length === 3 && process.argv[2] === '--debug')) {
-  // Clean up recent models from main config and migrate to cache on startup
-  cleanupRecentModelsFromConfig().then(() => {
-    showMainMenu();
-  }).catch(() => {
-    showMainMenu();
-  });
+// Run the CLI only when this file is the program entry point.
+// `require` and `module` are CommonJS bindings; this file uses import/export,
+// so referencing them directly throws a ReferenceError under native ESM.
+// NOTE(review): presumably the published binary is a CommonJS bundle of this
+// file - confirm against the build setup. When the bindings are absent, fall
+// back to comparing this module's URL with the script path from argv.
+if (
+  typeof require !== 'undefined' && typeof module !== 'undefined'
+    ? require.main === module
+    : import.meta.url === `file://${process.argv[1]}`
+) {
+  // Check if help flag
+  if (cliArgs.help) {
+    showHelp();
+    process.exit(0);
+  }
+  // Check if headless benchmark mode
+  if (cliArgs.bench) {
+    // runHeadlessBenchmark handles its own errors and exits the process itself
+    runHeadlessBenchmark(cliArgs.bench, cliArgs.apiKey, cliArgs.useAiSdk);
+  } else {
+    // Interactive mode: show the menu whether or not the recent-models cleanup succeeds
+    cleanupRecentModelsFromConfig().then(() => {
+      showMainMenu();
+    }).catch(() => {
+      showMainMenu();
+    });
+  }
 }
 export { showMainMenu, listProviders, selectModelsCircular, runStreamingBenchmark, loadConfig, saveConfig };