npm - ollama-bench - Versions diffs - 1.1.0 → 1.2.0 - Mend

ollama-bench 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md CHANGED Viewed

@@ -1,22 +1,27 @@
 # Ollama-bench
-Minimal CLI tool to benchmark Ollama models with detailed phase analysis. Zero runtime dependencies.
+Minimal CLI tool to benchmark Ollama models with detailed phase-by-phase analysis — now with **time-to-first-token (TTFT)**, **reasoning/thinking** measurement, GPU/VRAM reporting, and a side-by-side ranking table.
 ## Features
-- Phase-by-phase performance breakdown
-- Precise timing measurements
-- Works with npm, pnpm, yarn, and bun
+- Phase-by-phase performance breakdown (load · prompt eval · generation)
+- **TTFT** (time to first token) — the metric that actually drives perceived latency
+- **Reasoning models**: auto-detects thinking-capable models and measures the thinking phase separately
+- Size / quantization / VRAM (GPU vs CPU) reporting via the live model state
+- Aligned **ranking table** when comparing multiple models
+- `--json` output for scripting and CI
+- Multi-run averaging, custom prompts, custom host
+- TTY-aware: colors/spinners on a terminal, clean plain text when piped (honors `NO_COLOR`)
 ## Quick Start
 ```bash
 # Run directly (no installation)
-npx ollama-bench qwen2.5:0.5b llama3.2:1b
+npx ollama-bench qwen3:0.6b llama3.2:1b
 # Or with other package managers
-bunx ollama-bench qwen2.5:0.5b
-pnpm dlx ollama-bench qwen2.5:0.5b
+bunx ollama-bench qwen3:0.6b
+pnpm dlx ollama-bench qwen3:0.6b
 ```
 ## Prerequisites
@@ -24,23 +29,78 @@ pnpm dlx ollama-bench qwen2.5:0.5b
 1. **Install Ollama** - [ollama.com/download](https://ollama.com/download)
 2. **Start Ollama server** - Run `ollama serve`
+## Usage
+```
+ollama-bench [options] <model> [model...]
+Options
+  --think[=high|medium|low]  Enable reasoning/thinking (auto-detected by default)
+  --no-think                 Disable thinking even for reasoning models
+  --prompt <text>            Custom benchmark prompt
+  --runs <n>                 Repeat each model n times and average (default: 1)
+  --host <url>               Ollama server URL (default: http://127.0.0.1:11434)
+  --json                     Emit machine-readable JSON instead of the report
+  --demo                     Render the UI with synthetic data (no server needed)
+  -v, --version              Print version
+  -h, --help                 Show this help
+```
+### Examples
+```bash
+# Compare two models
+ollama-bench qwen3:0.6b llama3.2:1b
+# Benchmark a reasoning model at high thinking effort, averaged over 3 runs
+ollama-bench --runs 3 --think=high deepseek-r1:1.5b
+# Custom prompt, JSON output for a script
+ollama-bench --prompt "Write a haiku about TCP" --json gemma3:1b > result.json
+# Preview the UI without an Ollama server
+ollama-bench --demo
+```
 ## Benchmark Phases
-Each benchmark measures three distinct phases:
+Each benchmark measures these phases (timings come straight from the Ollama server):
+**Model Loading** — time to load weights into memory. Hardware-dependent, very consistent.
-**Phase 1: Model Loading** (Loading weights into memory)
-- Time to load model from disk into RAM
-- Hardware-dependent, very consistent
+**Prompt Processing** — time to encode and process the input prompt. Fast, scales with prompt length.
-**Phase 2: Prompt Processing** (Encoding input)
-- Time to encode and process your input prompt
-- Fast, scales with prompt length
+**Thinking** *(reasoning models only)* — the model's streamed thinking text, measured separately from the visible answer. Automatically enabled for thinking-capable models such as `qwen3` and `deepseek-r1`. Ollama does not expose separate thinking token counts, so ollama-bench reports exact thinking characters and chars/sec instead of estimating tokens.
-**Phase 3: Response Generation** (Creating output)
-- Time to generate the actual response
-- Most important metric for user-facing performance
-- Varies with content complexity
+**Response Generation** — time to generate the output tokens. The most important metric for user-facing performance.
+Alongside the phases, ollama-bench reports **TTFT** (wall-clock time to the first streamed token) and the model's **size / quantization / VRAM** placement.
+## JSON output
+`--json` writes a single JSON object to **stdout** (all progress goes to stderr, so the stream stays parseable):
+```json
+{
+  "server": "0.12.0",
+  "prompt": "Explain the theory of relativity in simple terms.",
+  "results": [
+    {
+      "model": "qwen3:0.6b",
+      "ok": true,
+      "tokensPerSecond": 168.4,
+      "ttft": 0.51,
+      "thinking": true,
+      "thinkingTime": 1.13,
+      "thinkingChars": 640,
+      "thinkingCharsPerSecond": 568,
+      "loadTime": 0.42,
+      "generationTime": 1.9,
+      "totalTime": 2.4
+    }
+  ]
+}
+```
 ## Available Models
@@ -48,4 +108,4 @@ See [ollama.com/library](https://ollama.com/library) for all available models.
 ## License
-MIT
+MIT

package/dist/index.js CHANGED Viewed

@@ -1,9 +1,9 @@
 #!/usr/bin/env node
-import ollama from 'ollama';
+import { Ollama } from 'ollama';
 /**
  * Object containing ANSI color codes for text coloring.
  */
-const colors = {
+const codes = {
     reset: '\x1b[0m',
     green: '\x1b[32m',
     yellow: '\x1b[33m',
@@ -11,165 +11,620 @@ const colors = {
     cyan: '\x1b[36m',
     magenta: '\x1b[35m',
     blue: '\x1b[34m',
+    gray: '\x1b[90m',
+    bold: '\x1b[1m',
 };
 /**
- * Applies color to the given text.
+ * Whether the current stdout is an interactive terminal (controls spinners).
+ */
+const isTTY = process.stdout.isTTY === true;
+/**
+ * Whether ANSI colors should be emitted.
+ *
+ * Precedence, highest first:
+ *  1. `NO_COLOR` set to a non-empty value disables colors (an empty value is
+ *     treated as "not set", as the NO_COLOR convention specifies).
+ *  2. `TERM=dumb` disables colors.
+ *  3. `FORCE_COLOR`, when set, decides: the explicit "off" spellings `0` and
+ *     `false` disable colors, any other value enables them.
+ *  4. Otherwise colors follow whether stdout is a TTY.
+ */
+const useColor = (() => {
+    const noColor = process.env.NO_COLOR;
+    if (noColor !== undefined && noColor !== '')
+        return false;
+    if (process.env.TERM === 'dumb')
+        return false;
+    const force = process.env.FORCE_COLOR;
+    if (force !== undefined)
+        return force !== '0' && force !== 'false';
+    return isTTY;
+})();
+/**
+ * Applies color to the given text (no-op when colors are disabled).
  * @param text - The text to colorize.
  * @param color - The color to apply.
  * @returns The colorized text.
  */
 function colorize(text, color) {
-    return `${colors[color]}${text}${colors.reset}`;
+    return useColor ? `${codes[color]}${text}${codes.reset}` : text;
+}
+/* -------------------------------------------------------------------------- */
+/*  Formatting helpers                                                        */
+/* -------------------------------------------------------------------------- */
+/**
+ * Formats a duration in seconds into a compact human-readable string.
+ *
+ * - below one second: whole milliseconds, e.g. `250ms`
+ * - below one minute: seconds with two decimals, e.g. `12.50s`
+ * - otherwise: minutes and whole seconds, e.g. `1m30s`
+ *
+ * Rounding is applied before the unit is chosen, so a value that rounds up to
+ * the next unit is shown in that unit (`1.00s`, `1m0s`) instead of as the
+ * out-of-range `1000ms`, `60.00s` or `1m60s`.
+ *
+ * @param seconds - Duration in seconds.
+ * @returns The formatted duration, or an em dash for negative / non-finite input.
+ */
+function fmtDuration(seconds) {
+    if (!isFinite(seconds) || seconds < 0)
+        return '—';
+    if (seconds < 1) {
+        const ms = Math.round(seconds * 1000);
+        // 999.5ms and above rounds to a full second; fall through to the seconds form.
+        if (ms < 1000)
+            return `${ms}ms`;
+    }
+    if (seconds < 60) {
+        const fixed = seconds.toFixed(2);
+        // 59.995s and above rounds to "60.00"; fall through to the minutes form.
+        if (Number(fixed) < 60)
+            return `${fixed}s`;
+    }
+    // Round the total first so the seconds part can never display as 60.
+    const total = Math.round(seconds);
+    return `${Math.floor(total / 60)}m${total % 60}s`;
 }
 /**
- * Creates a loading animation for the console.
- * @param operation - The operation being performed.
- * @param model - The model being processed.
- * @returns An interval ID for the animation.
+ * Formats a byte count into a human-readable string (GB / MB / KB).
  */
-function createLoadingAnimation(operation, model) {
-    const frames = ['|', '/', '-', '\\'];
+function fmtBytes(bytes) {
+    if (!bytes || bytes <= 0)
+        return '—';
+    const units = ['B', 'KB', 'MB', 'GB', 'TB'];
+    let v = bytes;
     let i = 0;
-    let dots = 0;
-    return setInterval(() => {
-        const frame = frames[i];
-        const dotString = '.'.repeat(dots);
-        const operationText = colorize(`${operation} ${model}${dotString}`, 'blue');
-        process.stdout.write(`\r${frame} ${operationText}`.padEnd(50));
-        i = (i + 1) % frames.length;
-        dots = (dots + 1) % 4;
-    }, 100);
-}
-/**
- * Pulls a model from Ollama.
- * @param model - The name of the model to pull.
- */
-async function pullModel(model) {
-    console.log(colorize(`Initiating pull for ${model}...`, 'yellow'));
-    const loadingAnimation = createLoadingAnimation('Pulling', model);
+    while (v >= 1024 && i < units.length - 1) {
+        v /= 1024;
+        i++;
+    }
+    return `${v.toFixed(v < 10 && i > 0 ? 1 : 0)}${units[i]}`;
+}
+/**
+ * Renders a throughput figure as `<n.n> t/s` (one decimal place).
+ * Any value that is not finite and strictly positive is shown as an em dash.
+ */
+function fmtRate(rate) {
+    const usable = isFinite(rate) && rate > 0;
+    return usable ? `${rate.toFixed(1)} t/s` : '—';
+}
+/* -------------------------------------------------------------------------- */
+/*  Spinner                                                                   */
+/* -------------------------------------------------------------------------- */
+/**
+ * A minimal spinner that reports progress on stderr.
+ *
+ * When stderr is an interactive terminal the current text is redrawn every
+ * 80ms behind an animated frame. When it is not, `start()` prints the text
+ * once on its own line and nothing is animated, so piped/CI output stays clean.
+ */
+class Spinner {
+    /** True when stderr is an interactive terminal (evaluated on each access). */
+    static get tty() {
+        return process.stderr.isTTY === true;
+    }
+    /**
+     * @param text - Initial status text shown next to the spinner frame.
+     */
+    constructor(text) {
+        this.text = text;
+        this.frames = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];
+        this.timer = null;
+        this.i = 0;
+    }
+    /**
+     * Begins reporting progress.
+     * @returns This spinner, so construction and start can be chained.
+     */
+    start() {
+        // Progress goes to stderr so stdout stays clean for reports / --json.
+        if (!Spinner.tty) {
+            process.stderr.write(`${this.text}\n`);
+            return this;
+        }
+        // Already animating: creating a second interval would orphan the first
+        // one, which stop() could then never clear.
+        if (this.timer)
+            return this;
+        this.timer = setInterval(() => {
+            const frame = colorize(this.frames[this.i], 'cyan');
+            // "\x1b[K" erases the remainder of the line left over from a longer text.
+            process.stderr.write(`\r${frame} ${this.text}\x1b[K`);
+            this.i = (this.i + 1) % this.frames.length;
+        }, 80);
+        return this;
+    }
+    /**
+     * Replaces the status text; an animating spinner shows it on its next redraw.
+     * @param text - New status text.
+     */
+    update(text) {
+        this.text = text;
+    }
+    /**
+     * Stops the animation (if any) and, on a terminal, erases the spinner line.
+     */
+    stop() {
+        if (this.timer) {
+            clearInterval(this.timer);
+            this.timer = null;
+        }
+        if (Spinner.tty)
+            process.stderr.write('\r\x1b[K');
+    }
+}
+/** Prompt sent to each model unless --prompt supplies a different one. */
+const DEFAULT_PROMPT = 'Explain the theory of relativity in simple terms.';
+/** Version string printed by -v / --version. */
+const TOOL_VERSION = '1.2.0';
+/* -------------------------------------------------------------------------- */
+/*  Argument parsing                                                          */
+/* -------------------------------------------------------------------------- */
+/**
+ * Parses CLI arguments into structured options.
+ *
+ * Flags that take a value accept both `--flag value` and `--flag=value`.
+ * `--think=<value>` accepts `true`, `false`, `high`, `medium` or `low`
+ * (case-insensitive); `--think=false` disables thinking rather than being
+ * forwarded as a level. An unknown `--think` value, like any unknown option,
+ * prints an error to stderr and exits with status 1. Arguments that do not
+ * start with `-` are collected as model names.
+ *
+ * @param argv - Argument list without the node / script entries.
+ * @returns The options object; `host` is only set when --host was given and
+ *          `think` stays `undefined` unless a --think flag was given.
+ */
+function parseArgs(argv) {
+    const opts = {
+        models: [],
+        prompt: DEFAULT_PROMPT,
+        runs: 1,
+        think: undefined,
+        noThink: false,
+        json: false,
+        demo: false,
+        help: false,
+        version: false,
+    };
+    // Maps the text after "--think=" onto a boolean or a documented effort level.
+    const parseThink = (raw) => {
+        const value = raw.toLowerCase();
+        if (value === 'true')
+            return true;
+        if (value === 'false')
+            return false;
+        if (value === 'high' || value === 'medium' || value === 'low')
+            return value;
+        console.error(colorize(`Invalid value for --think: "${raw}" (expected high, medium or low)`, 'red'));
+        process.exit(1);
+    };
+    for (let i = 0; i < argv.length; i++) {
+        const arg = argv[i];
+        // Consumes and returns the following argument (undefined at the end of argv).
+        const next = () => argv[++i];
+        if (arg === '-h' || arg === '--help')
+            opts.help = true;
+        else if (arg === '-v' || arg === '--version')
+            opts.version = true;
+        else if (arg === '--json')
+            opts.json = true;
+        else if (arg === '--demo')
+            opts.demo = true;
+        else if (arg === '--no-think')
+            opts.noThink = true;
+        else if (arg === '--think')
+            opts.think = true;
+        else if (arg.startsWith('--think='))
+            opts.think = parseThink(arg.slice('--think='.length));
+        else if (arg === '--prompt')
+            opts.prompt = next() ?? opts.prompt;
+        else if (arg.startsWith('--prompt='))
+            opts.prompt = arg.slice('--prompt='.length);
+        else if (arg === '--runs')
+            // Non-numeric or non-positive values fall back to a single run.
+            opts.runs = Math.max(1, parseInt(next() ?? '1', 10) || 1);
+        else if (arg.startsWith('--runs='))
+            opts.runs = Math.max(1, parseInt(arg.slice('--runs='.length), 10) || 1);
+        else if (arg === '--host')
+            opts.host = next();
+        else if (arg.startsWith('--host='))
+            opts.host = arg.slice('--host='.length);
+        else if (arg.startsWith('-')) {
+            console.error(colorize(`Unknown option: ${arg}`, 'red'));
+            process.exit(1);
+        }
+        else
+            opts.models.push(arg);
+    }
+    return opts;
+}
+/**
+ * Prints CLI usage / help text.
+ *
+ * Writes one multi-line block to stdout via console.log: a title line followed
+ * by the USAGE, OPTIONS and EXAMPLES sections. Section titles are passed
+ * through colorize as 'bold' and option names as 'cyan'.
+ */
+function printHelp() {
+    // Shorthands so the template below stays readable.
+    const b = (t) => colorize(t, 'bold');
+    const c = (t) => colorize(t, 'cyan');
+    console.log(`
+${b('ollama-bench')} — benchmark Ollama models with phase-by-phase analysis
+${b('USAGE')}
+  ollama-bench [options] <model> [model...]
+${b('OPTIONS')}
+  ${c('--think[=high|medium|low]')}  Enable reasoning/thinking (auto-detected by default)
+  ${c('--no-think')}                 Disable thinking even for reasoning models
+  ${c('--prompt <text>')}            Custom benchmark prompt
+  ${c('--runs <n>')}                 Repeat each model n times and average (default: 1)
+  ${c('--host <url>')}               Ollama server URL (default: http://127.0.0.1:11434)
+  ${c('--json')}                     Emit machine-readable JSON instead of the report
+  ${c('--demo')}                     Render the UI with synthetic data (no server needed)
+  ${c('-v, --version')}              Print version
+  ${c('-h, --help')}                 Show this help
+${b('EXAMPLES')}
+  ollama-bench qwen3:0.6b llama3.2:1b
+  ollama-bench --runs 3 --think=high deepseek-r1:1.5b
+  ollama-bench --prompt "Write a haiku about TCP" --json gemma3:1b
+`);
+}
+/* -------------------------------------------------------------------------- */
+/*  Ollama interactions                                                       */
+/* -------------------------------------------------------------------------- */
+/**
+ * Verifies the Ollama server is reachable, returning its version.
+ * Exits with a friendly message when the server is unreachable.
+ */
+async function ensureServer(client) {
     try {
-        const start = performance.now();
-        const response = await ollama.pull({ model, stream: true });
-        for await (const part of response) {
-            if (part.status === 'success') {
-                clearInterval(loadingAnimation);
-                const end = performance.now();
-                const duration = (end - start) / 1000;
-                console.log(`\r${colorize(`Successfully pulled ${model} in ${duration.toFixed(2)} seconds`, 'green')}     `);
-                return;
+        const { version } = await client.version();
+        return version;
+    }
+    catch {
+        console.error(colorize('✗ Could not reach the Ollama server.', 'red'));
+        console.error(colorize('  Is it running?  Start it with:  ollama serve', 'gray'));
+        process.exit(1);
+    }
+}
+/**
+ * Looks up the capability list the server reports for a model
+ * (e.g. ['completion', 'thinking', 'tools']).
+ * Resolves to an empty array when the lookup fails (for instance because the
+ * model is not present locally) or when no capabilities are reported.
+ */
+async function modelCapabilities(client, model) {
+    let capabilities;
+    try {
+        ({ capabilities } = await client.show({ model }));
+    }
+    catch {
+        // Best-effort lookup: a failed show() simply means "no known capabilities".
+        capabilities = undefined;
+    }
+    return capabilities ?? [];
+}
+/**
+ * Pulls a model, rendering a live progress bar with percentage.
+ */
+async function pullModel(client, model) {
+    const spinner = new Spinner(colorize(`Pulling ${model}…`, 'blue')).start();
+    const start = performance.now();
+    try {
+        const stream = await client.pull({ model, stream: true });
+        for await (const part of stream) {
+            if (part.total && part.completed) {
+                const pct = Math.min(100, (part.completed / part.total) * 100);
+                const width = 24;
+                const filled = Math.round((pct / 100) * width);
+                const bar = '█'.repeat(filled) + '░'.repeat(width - filled);
+                spinner.update(`${colorize(`Pulling ${model}`, 'blue')} ${colorize(bar, 'cyan')} ` +
+                    `${pct.toFixed(0)}%  ${colorize(`${fmtBytes(part.completed)}/${fmtBytes(part.total)}`, 'gray')}`);
+            }
+            else {
+                spinner.update(colorize(`Pulling ${model} — ${part.status}…`, 'blue'));
             }
         }
+        spinner.stop();
+        const elapsed = (performance.now() - start) / 1000;
+        console.error(colorize(`✓ ${model} ready ${colorize(`(${fmtDuration(elapsed)})`, 'gray')}`, 'green'));
+        return true;
     }
     catch (error) {
-        clearInterval(loadingAnimation);
-        console.log(`\r${colorize(`Error pulling ${model}: ${error.message}`, 'red')}     `);
+        spinner.stop();
+        console.error(colorize(`✗ Failed to pull ${model}: ${error.message}`, 'red'));
+        return false;
+    }
+}
+/**
+ * Computes count / seconds as a finite number, so Infinity / NaN never reach
+ * the JSON output.
+ *
+ * @param count - Number of items (tokens or characters).
+ * @param seconds - Elapsed time in seconds.
+ * @returns The rate, or 0 when `seconds` is not positive or when either input
+ *          is missing / non-finite.
+ */
+function rate(count, seconds) {
+    const value = seconds > 0 ? count / seconds : 0;
+    // A NaN / undefined count (or an infinite one) would otherwise leak through.
+    return Number.isFinite(value) ? value : 0;
+}
+/**
+ * Runs a single streamed generation and captures timing samples.
+ *
+ * Phase durations and token counts are read from the final (`done`) chunk,
+ * where durations are in nanoseconds; they are returned in seconds. TTFT and
+ * the thinking wall time are measured locally with performance.now():
+ * TTFT runs from just before the request is issued to the first chunk that
+ * carries response or thinking text, and the thinking wall time spans the
+ * first to the last chunk that carries thinking text.
+ *
+ * @param client - Ollama client used to issue the generate request.
+ * @param model - Model name.
+ * @param prompt - Prompt text.
+ * @param think - Thinking setting forwarded to the server as-is.
+ * @returns One timing sample (all times in seconds).
+ * @throws Error when the stream ends without a `done` chunk.
+ */
+async function runOnce(client, model, prompt, think) {
+    const start = performance.now();
+    let firstTokenAt = 0;
+    let thinkStartAt = 0;
+    let thinkEndAt = 0;
+    let thinkingChars = 0;
+    let final;
+    const stream = await client.generate({
+        model,
+        prompt,
+        // Pass false explicitly so --no-think disables models whose default is to think.
+        think,
+        stream: true,
+    });
+    for await (const chunk of stream) {
+        const now = performance.now();
+        if (chunk.thinking) {
+            if (!thinkStartAt)
+                thinkStartAt = now;
+            thinkEndAt = now;
+            thinkingChars += chunk.thinking.length;
+        }
+        if ((chunk.response || chunk.thinking) && !firstTokenAt)
+            firstTokenAt = now;
+        if (chunk.done)
+            final = chunk;
     }
+    if (!final)
+        throw new Error('No response received from server');
+    // A metric may be absent from the final chunk (NOTE(review): the server
+    // appears to omit zero-valued fields, e.g. for a cached prompt — confirm).
+    // Treat a missing value as 0 so it cannot turn averages and JSON into NaN.
+    const secs = (ns) => (Number.isFinite(ns) ? ns / 1e9 : 0);
+    const count = (n) => (Number.isFinite(n) ? n : 0);
+    return {
+        loadTime: secs(final.load_duration),
+        promptEvalTime: secs(final.prompt_eval_duration),
+        promptEvalCount: count(final.prompt_eval_count),
+        generationTime: secs(final.eval_duration),
+        evalCount: count(final.eval_count),
+        totalTime: secs(final.total_duration),
+        ttft: firstTokenAt ? (firstTokenAt - start) / 1000 : 0,
+        thinkingChars,
+        thinkingWallTime: thinkStartAt ? (thinkEndAt - thinkStartAt) / 1000 : 0,
+    };
 }
 /**
- * Benchmarks a model's performance.
- * @param model - The name of the model to benchmark.
- * @returns A promise that resolves to the benchmark result.
+ * Benchmarks a model across one or more runs and aggregates the results.
  */
-async function benchmarkModel(model) {
-    const prompt = "Explain the theory of relativity in simple terms.";
-    console.log(colorize(`\nBenchmarking ${model}`, 'cyan'));
-    console.log(colorize('─'.repeat(50), 'cyan'));
-    const loadingAnimation = createLoadingAnimation('Running benchmark', model);
+async function benchmarkModel(client, model, opts) {
+    // Decide whether to enable thinking.
+    let think = opts.think;
+    if (opts.noThink)
+        think = false;
+    else if (think === undefined) {
+        const caps = await modelCapabilities(client, model);
+        think = caps.includes('thinking') ? true : false;
+    }
+    const label = think ? `${model} ${colorize('(thinking)', 'magenta')}` : model;
+    const spinner = new Spinner(colorize(`Benchmarking ${label}…`, 'blue')).start();
+    const samples = [];
     try {
-        const response = await ollama.generate({
-            model,
-            prompt,
-            stream: false,
-        });
-        clearInterval(loadingAnimation);
-        process.stdout.write('\r' + ' '.repeat(50) + '\r');
-        // Calculate phase timings
-        const loadTime = response.load_duration / 1e9;
-        const promptEvalTime = response.prompt_eval_duration / 1e9;
-        const generationTime = response.eval_duration / 1e9;
-        const totalTime = response.total_duration / 1e9;
-        const tokensPerSecond = response.eval_count / generationTime;
-        // Calculate percentages
-        const loadPercent = (loadTime / totalTime * 100).toFixed(1);
-        const promptPercent = (promptEvalTime / totalTime * 100).toFixed(1);
-        const genPercent = (generationTime / totalTime * 100).toFixed(1);
-        // Display phases
-        console.log(colorize('Phase 1: Model Loading (Loading weights into memory)', 'yellow'));
-        console.log(colorize(`  Time: ${loadTime.toFixed(2)}s (${loadPercent}% of total)`, 'yellow'));
-        console.log();
-        console.log(colorize('Phase 2: Prompt Processing (Encoding input)', 'yellow'));
-        console.log(colorize(`  Tokens: ${response.prompt_eval_count}`, 'yellow'));
-        console.log(colorize(`  Time: ${promptEvalTime.toFixed(2)}s (${promptPercent}% of total)`, 'yellow'));
-        console.log(colorize(`  Speed: ${(response.prompt_eval_count / promptEvalTime).toFixed(2)} tokens/s`, 'yellow'));
-        console.log();
-        console.log(colorize('Phase 3: Response Generation (Creating output)', 'yellow'));
-        console.log(colorize(`  Tokens: ${response.eval_count}`, 'yellow'));
-        console.log(colorize(`  Time: ${generationTime.toFixed(2)}s (${genPercent}% of total)`, 'yellow'));
-        console.log(colorize(`  Speed: ${tokensPerSecond.toFixed(2)} tokens/s`, 'yellow'));
-        console.log();
-        console.log(colorize('Summary', 'green'));
-        console.log(colorize(`  Total time: ${totalTime.toFixed(2)}s`, 'green'));
-        console.log(colorize(`  Generation speed: ${tokensPerSecond.toFixed(2)} tokens/s`, 'green'));
-        console.log();
-        return {
-            model,
-            tokensPerSecond,
-            loadTime,
-            promptEvalTime,
-            generationTime,
-            totalTime
-        };
+        for (let r = 0; r < opts.runs; r++) {
+            if (opts.runs > 1)
+                spinner.update(colorize(`Benchmarking ${label} — run ${r + 1}/${opts.runs}…`, 'blue'));
+            samples.push(await runOnce(client, model, opts.prompt, think));
+        }
+        spinner.stop();
     }
     catch (error) {
-        clearInterval(loadingAnimation);
-        process.stdout.write('\r' + ' '.repeat(50) + '\r');
-        console.log(colorize(`Error benchmarking ${model}: ${error.message}`, 'red'));
-        console.log();
+        spinner.stop();
         return {
             model,
-            tokensPerSecond: 0,
+            ok: false,
+            error: error.message,
+            runs: 0,
             loadTime: 0,
             promptEvalTime: 0,
+            promptEvalCount: 0,
+            promptTokensPerSecond: 0,
             generationTime: 0,
-            totalTime: 0
+            evalCount: 0,
+            tokensPerSecond: 0,
+            totalTime: 0,
+            ttft: 0,
+            thinking: false,
+            thinkingTime: 0,
+            thinkingChars: 0,
+            thinkingCharsPerSecond: 0,
         };
     }
+    // Average across runs.
+    const avg = (pick) => samples.reduce((a, s) => a + pick(s), 0) / samples.length;
+    const loadTime = avg((s) => s.loadTime);
+    const promptEvalTime = avg((s) => s.promptEvalTime);
+    const promptEvalCount = avg((s) => s.promptEvalCount);
+    const generationTime = avg((s) => s.generationTime);
+    const evalCount = avg((s) => s.evalCount);
+    const totalTime = avg((s) => s.totalTime);
+    const thinkingChars = avg((s) => s.thinkingChars);
+    const thinkingWallTime = avg((s) => s.thinkingWallTime);
+    // Pull resource usage for the (still-loaded) model.
+    let sizeBytes;
+    let sizeVramBytes;
+    let parameterSize;
+    let quantization;
+    try {
+        const { models } = await client.ps();
+        const live = models.find((m) => m.name === model || m.model === model);
+        if (live) {
+            sizeBytes = live.size;
+            sizeVramBytes = live.size_vram;
+            parameterSize = live.details?.parameter_size;
+            quantization = live.details?.quantization_level;
+        }
+    }
+    catch {
+        /* ps() is best-effort */
+    }
+    return {
+        model,
+        ok: true,
+        runs: samples.length,
+        loadTime,
+        promptEvalTime,
+        promptEvalCount,
+        promptTokensPerSecond: rate(promptEvalCount, promptEvalTime),
+        generationTime,
+        evalCount,
+        tokensPerSecond: rate(evalCount, generationTime),
+        totalTime,
+        ttft: avg((s) => s.ttft),
+        thinking: thinkingChars > 0,
+        thinkingTime: thinkingWallTime,
+        thinkingChars,
+        // Ollama does not expose a separate token count for thinking chunks, so report
+        // the exact streamed character rate instead of estimating tokens from chars.
+        thinkingCharsPerSecond: rate(thinkingChars, thinkingWallTime),
+        sizeBytes,
+        sizeVramBytes,
+        parameterSize,
+        quantization,
+    };
+}
+/* -------------------------------------------------------------------------- */
+/*  Rendering                                                                 */
+/* -------------------------------------------------------------------------- */
+/**
+ * Renders the detailed per-phase breakdown for a single result to stdout.
+ *
+ * Prints the model name (with an "avg of N runs" note when more than one run
+ * was averaged) and a rule. A failed result then shows only its error. A
+ * successful one shows, in order: an optional size / quantization / VRAM
+ * line, load, prompt eval, TTFT, thinking (only when the result is flagged as
+ * thinking), generation, and a final speed line.
+ *
+ * @param r - A benchmark result as produced by benchmarkModel / demoResults.
+ */
+function renderResult(r) {
+    const runsNote = r.runs > 1 ? colorize(`  (avg of ${r.runs} runs)`, 'gray') : '';
+    console.log(colorize(`\n${r.model}`, 'cyan') + runsNote);
+    console.log(colorize('─'.repeat(52), 'gray'));
+    if (!r.ok) {
+        console.log(colorize(`  ✗ ${r.error}`, 'red'));
+        return;
+    }
+    // Share of the total time, or an em dash when no total is available.
+    const pct = (t) => (r.totalTime > 0 ? `${((t / r.totalTime) * 100).toFixed(0)}%` : '—');
+    // The label is padded BEFORE it is colorized: padding an already colorized
+    // string would count the ANSI escape bytes as width and misalign the row.
+    const line = (label, value, note = '', labelColor) => {
+        const padded = label.padEnd(22);
+        console.log(`  ${labelColor ? colorize(padded, labelColor) : padded} ${colorize(value, 'bold')}  ${colorize(note, 'gray')}`);
+    };
+    if (r.sizeBytes || r.parameterSize) {
+        // Any VRAM usage is reported as GPU placement; none (or unknown) as CPU.
+        const where = r.sizeVramBytes && r.sizeVramBytes > 0 ? 'GPU' : 'CPU';
+        const detail = [
+            r.parameterSize,
+            r.quantization,
+            r.sizeBytes ? fmtBytes(r.sizeBytes) : undefined,
+            r.sizeVramBytes ? `${fmtBytes(r.sizeVramBytes)} VRAM · ${where}` : where,
+        ]
+            .filter(Boolean)
+            .join(' · ');
+        console.log('  ' + colorize(detail, 'gray'));
+        console.log();
+    }
+    line('Load', fmtDuration(r.loadTime), pct(r.loadTime) + ' of total');
+    // Counts are averages across runs, hence the rounding.
+    line('Prompt eval', fmtDuration(r.promptEvalTime), `${Math.round(r.promptEvalCount)} tok · ${fmtRate(r.promptTokensPerSecond)}`);
+    line('First token (TTFT)', fmtDuration(r.ttft));
+    if (r.thinking) {
+        line('Thinking', fmtDuration(r.thinkingTime), `${Math.round(r.thinkingChars)} chars · ${r.thinkingCharsPerSecond.toFixed(1)} chars/s`);
+    }
+    line('Generation', fmtDuration(r.generationTime), `${Math.round(r.evalCount)} tok · ${pct(r.generationTime)} of total`);
+    console.log();
+    line('Speed', colorize(fmtRate(r.tokensPerSecond), 'green'), 'total ' + fmtDuration(r.totalTime), 'green');
 }
 /**
- * The main function that orchestrates the model pulling and benchmarking process.
+ * Renders an aligned comparison table ranking models by generation speed.
+ */
+function renderTable(results) {
+    // Successful results only, fastest generation first.
+    const ranked = results
+        .filter((r) => r.ok)
+        .sort((a, b) => b.tokensPerSecond - a.tokensPerSecond);
+    if (ranked.length === 0)
+        return;
+    const headers = ['', 'Model', 'Params', 'Gen', 'Prompt', 'TTFT', 'Load', 'Total'];
+    // One array of cell strings per model, in the same column order as `headers`.
+    const table = ranked.map((r, rank) => [
+        rank === 0 ? '★' : `${rank + 1}`,
+        r.model + (r.thinking ? ' ◇' : ''),
+        r.parameterSize ?? '—',
+        fmtRate(r.tokensPerSecond),
+        fmtRate(r.promptTokensPerSecond),
+        fmtDuration(r.ttft),
+        fmtDuration(r.loadTime),
+        fmtDuration(r.totalTime),
+    ]);
+    // Each column is as wide as its header or its widest cell, whichever is larger.
+    const widths = headers.map((header, col) => table.reduce((widest, cells) => Math.max(widest, cells[col].length), header.length));
+    // Rank and model name are left-aligned; every numeric column is right-aligned.
+    const fmtRow = (cells) => cells
+        .map((cell, col) => (col < 2 ? cell.padEnd(widths[col]) : cell.padStart(widths[col])))
+        .join('  ');
+    console.log(colorize('\nRanking', 'magenta'));
+    console.log(colorize('═'.repeat(52), 'magenta'));
+    console.log(colorize(fmtRow(headers), 'bold'));
+    console.log(colorize(widths.map((w) => '─'.repeat(w)).join('  '), 'gray'));
+    table.forEach((cells, rank) => {
+        const text = fmtRow(cells);
+        // The winner's row is highlighted.
+        console.log(rank === 0 ? colorize(text, 'green') : text);
+    });
+    if (ranked.some((r) => r.thinking)) {
+        console.log(colorize('\n◇ reasoning model (thinking enabled)', 'gray'));
+    }
+    const fastest = ranked[0];
+    console.log(colorize(`\nFastest: ${fastest.model} at ${fmtRate(fastest.tokensPerSecond)}`, 'magenta'));
+}
+/* -------------------------------------------------------------------------- */
+/*  Demo data (for UI testing without a server)                              */
+/* -------------------------------------------------------------------------- */
+/**
+ * Produces synthetic benchmark results so the UI can be previewed/tested
+ * without a running Ollama server.
+ *
+ * Returns three fixed entries that together exercise the renderer's branches:
+ * a successful thinking model with VRAM in use, a successful non-thinking
+ * model with zero VRAM, and a failed model carrying only an error message
+ * (no size / parameter fields).
+ */
+function demoResults() {
+    return [
+        {
+            model: 'qwen3:0.6b',
+            ok: true,
+            runs: 1,
+            loadTime: 0.42,
+            promptEvalTime: 0.08,
+            promptEvalCount: 14,
+            promptTokensPerSecond: 175,
+            generationTime: 1.9,
+            evalCount: 320,
+            tokensPerSecond: 168.4,
+            totalTime: 2.4,
+            ttft: 0.51,
+            thinking: true,
+            thinkingTime: 1.13,
+            thinkingChars: 640,
+            thinkingCharsPerSecond: 568,
+            sizeBytes: 1.3e9,
+            sizeVramBytes: 1.3e9,
+            parameterSize: '0.6B',
+            quantization: 'Q4_K_M',
+        },
+        {
+            model: 'llama3.2:1b',
+            ok: true,
+            runs: 1,
+            loadTime: 0.6,
+            promptEvalTime: 0.05,
+            promptEvalCount: 12,
+            promptTokensPerSecond: 240,
+            generationTime: 2.4,
+            evalCount: 280,
+            tokensPerSecond: 116.7,
+            totalTime: 3.05,
+            ttft: 0.66,
+            thinking: false,
+            thinkingTime: 0,
+            thinkingChars: 0,
+            thinkingCharsPerSecond: 0,
+            sizeBytes: 1.9e9,
+            sizeVramBytes: 0,
+            parameterSize: '1.2B',
+            quantization: 'Q8_0',
+        },
+        {
+            model: 'gemma3:1b',
+            ok: false,
+            error: "model 'gemma3:1b' not found",
+            runs: 0,
+            loadTime: 0,
+            promptEvalTime: 0,
+            promptEvalCount: 0,
+            promptTokensPerSecond: 0,
+            generationTime: 0,
+            evalCount: 0,
+            tokensPerSecond: 0,
+            totalTime: 0,
+            ttft: 0,
+            thinking: false,
+            thinkingTime: 0,
+            thinkingChars: 0,
+            thinkingCharsPerSecond: 0,
+        },
+    ];
+}
+/* -------------------------------------------------------------------------- */
+/*  Main                                                                      */
+/* -------------------------------------------------------------------------- */
+/**
+ * Orchestrates argument parsing, model preparation, benchmarking and output.
  */
 export async function main() {
-    const models = process.argv.slice(2);
-    if (models.length === 0) {
-        console.log(colorize(`Error: No models provided. Please specify at least one model.`, 'red'));
+    const opts = parseArgs(process.argv.slice(2));
+    if (opts.help)
+        return printHelp();
+    if (opts.version) {
+        console.log(TOOL_VERSION);
+        return;
+    }
+    // Demo mode: render the UI from synthetic data, no server required.
+    if (opts.demo) {
+        const results = demoResults();
+        console.log(colorize('ollama-bench (demo)', 'cyan'));
+        console.log(colorize('═'.repeat(52), 'cyan'));
+        results.forEach(renderResult);
+        renderTable(results);
+        return;
+    }
+    if (opts.models.length === 0) {
+        console.error(colorize('Error: specify at least one model.\n', 'red'));
+        printHelp();
         process.exit(1);
     }
-    console.log(colorize(`Ollama Benchmark Script`, 'cyan'));
-    console.log(colorize('═'.repeat(50), 'cyan'));
-    // Pull models
-    console.log(colorize('\nPhase: Model Preparation', 'cyan'));
-    console.log(colorize('─'.repeat(50), 'cyan'));
-    for (const model of models) {
-        await pullModel(model);
-    }
-    // Benchmark models
-    console.log(colorize('\nPhase: Performance Testing', 'cyan'));
-    console.log(colorize('─'.repeat(50), 'cyan'));
+    const client = new Ollama(opts.host ? { host: opts.host } : undefined);
+    const serverVersion = await ensureServer(client);
+    if (!opts.json) {
+        console.log(colorize('ollama-bench', 'cyan') + colorize(`  ·  server v${serverVersion}`, 'gray'));
+        console.log(colorize('═'.repeat(52), 'cyan'));
+        console.log(colorize('\nPreparing models', 'cyan'));
+        console.log(colorize('─'.repeat(52), 'gray'));
+    }
+    for (const model of opts.models) {
+        await pullModel(client, model);
+    }
+    if (!opts.json) {
+        console.log(colorize('\nBenchmarking', 'cyan'));
+        console.log(colorize('─'.repeat(52), 'gray'));
+    }
     const results = [];
-    for (const model of models) {
-        const result = await benchmarkModel(model);
+    for (const model of opts.models) {
+        const result = await benchmarkModel(client, model, opts);
         results.push(result);
+        if (!opts.json)
+            renderResult(result);
     }
-    // Find the best performing model
-    const bestModel = results.reduce((best, current) => current.tokensPerSecond > best.tokensPerSecond ? current : best);
-    console.log(colorize('Final Results', 'magenta'));
-    console.log(colorize('═'.repeat(50), 'magenta'));
-    console.log(colorize(`Best performing model: ${bestModel.model}`, 'magenta'));
-    console.log(colorize(`Generation speed: ${bestModel.tokensPerSecond.toFixed(2)} tokens/s`, 'magenta'));
-    console.log(colorize(`Total time: ${bestModel.totalTime.toFixed(2)}s`, 'magenta'));
+    if (opts.json) {
+        console.log(JSON.stringify({ server: serverVersion, prompt: opts.prompt, results }, null, 2));
+    }
+    else if (results.filter((r) => r.ok).length > 1) {
+        renderTable(results);
+    }
+    // Non-zero exit if every model failed.
+    if (results.every((r) => !r.ok))
+        process.exit(1);
 }
 if (import.meta.url === import.meta.resolve(process.argv[1])) {
-    main().catch(error => {
+    main().catch((error) => {
         console.error('Error:', error);
         process.exit(1);
     });

package/package.json CHANGED Viewed

@@ -1,18 +1,23 @@
 {
     "name": "ollama-bench",
-    "version": "1.1.0",
-    "description": "Minimal CLI tool to benchmark Ollama models with detailed phase analysis. Zero runtime dependencies.",
+    "version": "1.2.0",
+    "description": "Minimal CLI tool to benchmark Ollama models — phase analysis, TTFT, reasoning/thinking measurement, and side-by-side ranking.",
     "main": "dist/index.js",
     "type": "module",
     "bin": {
         "ollama-bench": "./dist/index.js"
     },
+    "files": [
+        "dist",
+        "README.md",
+        "LICENSE"
+    ],
     "scripts": {
         "build": "tsc",
         "start": "node dist/index.js",
         "dev": "tsc && node dist/index.js"
     },
-    "keywords": ["ollama", "benchmark", "ai", "models", "cli", "performance", "llm", "testing"],
+    "keywords": ["ollama", "benchmark", "ai", "models", "cli", "performance", "llm", "testing", "ttft", "reasoning", "tokens-per-second"],
     "author": "dalist1",
     "license": "MIT",
     "repository": {
@@ -24,14 +29,14 @@
     },
     "homepage": "https://github.com/dalist1/ollama-bench#readme",
     "dependencies": {
-        "ollama": "latest"
+        "ollama": "^0.6.3"
     },
     "devDependencies": {
-        "@types/node": "^20.19.25",
+        "@types/node": "^20.19.41",
         "typescript": "^5.9.3"
     },
     "engines": {
-        "node": ">=14.0.0"
+        "node": ">=18.0.0"
     },
     "publishConfig": {
         "access": "public"

package/.idx/dev.nix DELETED Viewed

@@ -1,21 +0,0 @@
-{ pkgs }: {
-  channel = "unstable";
-  packages =
-    let
-      bunLatest = builtins.fetchurl {
-        url = "https://github.com/oven-sh/bun/releases/download/canary/bun-linux-x64.zip";
-      };
-    in
-    [
-      pkgs.nodejs_23
-      (pkgs.bun.overrideAttrs (oldAttrs: {
-        version = "canary";
-        src = bunLatest;
-      }))
-    ];
-  idx.extensions = [
-    "biomejs.biome"
-    "BeardedBear.beardedicons"
-    "BeardedBear.beardedtheme"
-  ];
-}

package/bun.lock DELETED Viewed

@@ -1,27 +0,0 @@
-{
-  "lockfileVersion": 1,
-  "configVersion": 1,
-  "workspaces": {
-    "": {
-      "name": "ollama-bench",
-      "dependencies": {
-        "ollama": "latest",
-      },
-      "devDependencies": {
-        "@types/node": "^20.19.25",
-        "typescript": "^5.9.3",
-      },
-    },
-  },
-  "packages": {
-    "@types/node": ["@types/node@20.19.25", "", { "dependencies": { "undici-types": "~6.21.0" } }, "sha512-ZsJzA5thDQMSQO788d7IocwwQbI8B5OPzmqNvpf3NY/+MHDAS759Wo0gd2WQeXYt5AAAQjzcrTVC6SKCuYgoCQ=="],
-    "ollama": ["ollama@0.6.3", "", { "dependencies": { "whatwg-fetch": "^3.6.20" } }, "sha512-KEWEhIqE5wtfzEIZbDCLH51VFZ6Z3ZSa6sIOg/E/tBV8S51flyqBOXi+bRxlOYKDf8i327zG9eSTb8IJxvm3Zg=="],
-    "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
-    "undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="],
-    "whatwg-fetch": ["whatwg-fetch@3.6.20", "", {}, "sha512-EqhiFU6daOA8kpjOWTL0olhVOF3i7OrFzSYiGsEMB8GcXS+RrzauAERX65xMeNWVqxA6HXH2m69Z9LaKKdisfg=="],
-  }
-}

package/src/index.ts DELETED Viewed

@@ -1,217 +0,0 @@
-#!/usr/bin/env node
-import ollama from 'ollama';
-/**
- * Represents the available color codes for text coloring.
- */
-type Color = 'reset' | 'green' | 'yellow' | 'red' | 'cyan' | 'magenta' | 'blue';
-/**
- * Object containing ANSI color codes for text coloring.
- */
-const colors: Record<Color, string> = {
-  reset: '\x1b[0m',
-  green: '\x1b[32m',
-  yellow: '\x1b[33m',
-  red: '\x1b[31m',
-  cyan: '\x1b[36m',
-  magenta: '\x1b[35m',
-  blue: '\x1b[34m',
-};
-/**
- * Applies color to the given text.
- * @param text - The text to colorize.
- * @param color - The color to apply.
- * @returns The colorized text.
- */
-function colorize(text: string, color: Color): string {
-  return `${colors[color]}${text}${colors.reset}`;
-}
-/**
- * Creates a loading animation for the console.
- * @param operation - The operation being performed.
- * @param model - The model being processed.
- * @returns An interval ID for the animation.
- */
-function createLoadingAnimation(operation: string, model: string): NodeJS.Timeout {
-  const frames: string[] = ['|', '/', '-', '\\'];
-  let i = 0;
-  let dots = 0;
-  return setInterval(() => {
-    const frame = frames[i];
-    const dotString = '.'.repeat(dots);
-    const operationText = colorize(`${operation} ${model}${dotString}`, 'blue');
-    process.stdout.write(`\r${frame} ${operationText}`.padEnd(50));
-    i = (i + 1) % frames.length;
-    dots = (dots + 1) % 4;
-  }, 100);
-}
-/**
- * Pulls a model from Ollama.
- * @param model - The name of the model to pull.
- */
-async function pullModel(model: string): Promise<void> {
-  console.log(colorize(`Initiating pull for ${model}...`, 'yellow'));
-  const loadingAnimation = createLoadingAnimation('Pulling', model);
-  try {
-    const start = performance.now();
-    const response = await ollama.pull({ model, stream: true });
-    for await (const part of response) {
-      if (part.status === 'success') {
-        clearInterval(loadingAnimation);
-        const end = performance.now();
-        const duration = (end - start) / 1000;
-        console.log(`\r${colorize(`Successfully pulled ${model} in ${duration.toFixed(2)} seconds`, 'green')}     `);
-        return;
-      }
-    }
-  } catch (error) {
-    clearInterval(loadingAnimation);
-    console.log(`\r${colorize(`Error pulling ${model}: ${(error as Error).message}`, 'red')}     `);
-  }
-}
-/**
- * Represents the result of a model benchmark.
- */
-interface BenchmarkResult {
-  model: string;
-  tokensPerSecond: number;
-  loadTime: number;
-  promptEvalTime: number;
-  generationTime: number;
-  totalTime: number;
-}
-/**
- * Benchmarks a model's performance.
- * @param model - The name of the model to benchmark.
- * @returns A promise that resolves to the benchmark result.
- */
-async function benchmarkModel(model: string): Promise<BenchmarkResult> {
-  const prompt = "Explain the theory of relativity in simple terms.";
-  console.log(colorize(`\nBenchmarking ${model}`, 'cyan'));
-  console.log(colorize('─'.repeat(50), 'cyan'));
-  const loadingAnimation = createLoadingAnimation('Running benchmark', model);
-  try {
-    const response = await ollama.generate({
-      model,
-      prompt,
-      stream: false,
-    });
-    clearInterval(loadingAnimation);
-    process.stdout.write('\r' + ' '.repeat(50) + '\r');
-    // Calculate phase timings
-    const loadTime = response.load_duration / 1e9;
-    const promptEvalTime = response.prompt_eval_duration / 1e9;
-    const generationTime = response.eval_duration / 1e9;
-    const totalTime = response.total_duration / 1e9;
-    const tokensPerSecond = response.eval_count / generationTime;
-    // Calculate percentages
-    const loadPercent = (loadTime / totalTime * 100).toFixed(1);
-    const promptPercent = (promptEvalTime / totalTime * 100).toFixed(1);
-    const genPercent = (generationTime / totalTime * 100).toFixed(1);
-    // Display phases
-    console.log(colorize('Phase 1: Model Loading (Loading weights into memory)', 'yellow'));
-    console.log(colorize(`  Time: ${loadTime.toFixed(2)}s (${loadPercent}% of total)`, 'yellow'));
-    console.log();
-    console.log(colorize('Phase 2: Prompt Processing (Encoding input)', 'yellow'));
-    console.log(colorize(`  Tokens: ${response.prompt_eval_count}`, 'yellow'));
-    console.log(colorize(`  Time: ${promptEvalTime.toFixed(2)}s (${promptPercent}% of total)`, 'yellow'));
-    console.log(colorize(`  Speed: ${(response.prompt_eval_count / promptEvalTime).toFixed(2)} tokens/s`, 'yellow'));
-    console.log();
-    console.log(colorize('Phase 3: Response Generation (Creating output)', 'yellow'));
-    console.log(colorize(`  Tokens: ${response.eval_count}`, 'yellow'));
-    console.log(colorize(`  Time: ${generationTime.toFixed(2)}s (${genPercent}% of total)`, 'yellow'));
-    console.log(colorize(`  Speed: ${tokensPerSecond.toFixed(2)} tokens/s`, 'yellow'));
-    console.log();
-    console.log(colorize('Summary', 'green'));
-    console.log(colorize(`  Total time: ${totalTime.toFixed(2)}s`, 'green'));
-    console.log(colorize(`  Generation speed: ${tokensPerSecond.toFixed(2)} tokens/s`, 'green'));
-    console.log();
-    return {
-      model,
-      tokensPerSecond,
-      loadTime,
-      promptEvalTime,
-      generationTime,
-      totalTime
-    };
-  } catch (error) {
-    clearInterval(loadingAnimation);
-    process.stdout.write('\r' + ' '.repeat(50) + '\r');
-    console.log(colorize(`Error benchmarking ${model}: ${(error as Error).message}`, 'red'));
-    console.log();
-    return {
-      model,
-      tokensPerSecond: 0,
-      loadTime: 0,
-      promptEvalTime: 0,
-      generationTime: 0,
-      totalTime: 0
-    };
-  }
-}
-/**
- * The main function that orchestrates the model pulling and benchmarking process.
- */
-export async function main(): Promise<void> {
-  const models = process.argv.slice(2);
-  if (models.length === 0) {
-    console.log(colorize(`Error: No models provided. Please specify at least one model.`, 'red'));
-    process.exit(1);
-  }
-  console.log(colorize(`Ollama Benchmark Script`, 'cyan'));
-  console.log(colorize('═'.repeat(50), 'cyan'));
-  // Pull models
-  console.log(colorize('\nPhase: Model Preparation', 'cyan'));
-  console.log(colorize('─'.repeat(50), 'cyan'));
-  for (const model of models) {
-    await pullModel(model);
-  }
-  // Benchmark models
-  console.log(colorize('\nPhase: Performance Testing', 'cyan'));
-  console.log(colorize('─'.repeat(50), 'cyan'));
-  const results: BenchmarkResult[] = [];
-  for (const model of models) {
-    const result = await benchmarkModel(model);
-    results.push(result);
-  }
-  // Find the best performing model
-  const bestModel = results.reduce((best, current) =>
-    current.tokensPerSecond > best.tokensPerSecond ? current : best
-  );
-  console.log(colorize('Final Results', 'magenta'));
-  console.log(colorize('═'.repeat(50), 'magenta'));
-  console.log(colorize(`Best performing model: ${bestModel.model}`, 'magenta'));
-  console.log(colorize(`Generation speed: ${bestModel.tokensPerSecond.toFixed(2)} tokens/s`, 'magenta'));
-  console.log(colorize(`Total time: ${bestModel.totalTime.toFixed(2)}s`, 'magenta'));
-}
-if (import.meta.url === import.meta.resolve(process.argv[1])) {
-  main().catch(error => {
-      console.error('Error:', error);
-      process.exit(1);
-  });
-}

package/tsconfig.json DELETED Viewed

@@ -1,15 +0,0 @@
-{
-  "compilerOptions": {
-    "target": "ES2020",
-    "module": "ES2020",
-    "moduleResolution": "node",
-    "outDir": "./dist",
-    "rootDir": "./src",
-    "strict": true,
-    "esModuleInterop": true,
-    "skipLibCheck": true,
-    "forceConsistentCasingInFileNames": true
-  },
-  "include": ["src/**/*"],
-  "exclude": ["node_modules", "dist"]
-}