llama_benchy-0.1.0-py3-none-any.whl

@@ -0,0 +1,24 @@
1
+ """
2
+ llama-benchy - llama-bench style benchmarking tool for all backends
3
+
4
+ This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
5
+ generating statistics similar to `llama-bench`.
6
+ """
7
+
8
+ from ._version import __version__
9
+
10
+ # Extract build number from the version string
11
+ # Version format is like: '0.1.dev34+g33f03d886.d20260105'
12
+ # We want to extract the git hash part: '33f03d886'
13
+ __build__ = "unknown"
14
+ if "+" in __version__:
15
+ try:
16
+ # Extract the part after the '+' and before the '.'
17
+ build_part = __version__.split("+")[1].split(".")[0]
18
+ # Remove the 'g' prefix if it exists
19
+ if build_part.startswith("g"):
20
+ __build__ = build_part[1:]
21
+ else:
22
+ __build__ = build_part
23
+ except (IndexError, AttributeError):
24
+ pass
@@ -0,0 +1,535 @@
1
+ """
2
+ Main entry point for the llama-benchy CLI.
3
+ """
4
+
5
+ import argparse
6
+ import os
7
+ import time
8
+ import uuid
9
+ import subprocess
10
+ import datetime
11
+ import numpy as np
12
+ from tabulate import tabulate
13
+ import aiohttp
14
+ import asyncio
15
+ import json
16
+ import codecs
17
+ import hashlib
18
+ from transformers import AutoTokenizer
19
+ import requests
20
+
21
+ # Build number is now imported from __init__.py
22
+ from . import __build__, __version__
23
+
24
+
25
+
26
+ def parse_arguments():
27
+ parser = argparse.ArgumentParser(description="LLM Benchmark Script")
28
+ parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
29
+ parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
30
+ parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
31
+ parser.add_argument("--served-model-name", type=str, default=None, help="Model name used in API calls (defaults to --model if not specified)")
32
+ parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer name (defaults to model name)")
33
+ parser.add_argument("--pp", type=int, nargs='+', required=False, default=[2048], help="List of prompt processing token counts - default: 2048")
34
+ parser.add_argument("--tg", type=int, nargs='+', required=False, default=[32], help="List of token generation counts - default: 32")
35
+ parser.add_argument("--depth", type=int, nargs='+', default=[0], help="List of context depths (previous conversation tokens) - default: 0")
36
+ parser.add_argument("--runs", type=int, default=3, help="Number of runs per test - default: 3")
37
+ parser.add_argument("--no-cache", action="store_true", help="Ensure unique requests to avoid prefix caching and send cache_prompt=false to the server")
38
+ parser.add_argument("--post-run-cmd", type=str, default=None, help="Command to execute after each test run")
39
+ parser.add_argument("--book-url", type=str, default="https://www.gutenberg.org/files/1661/1661-0.txt", help="URL of a book to use for text generation, defaults to Sherlock Holmes (https://www.gutenberg.org/files/1661/1661-0.txt)")
40
+ parser.add_argument("--latency-mode", type=str, default="api", choices=["api", "generation", "none"], help="Method to measure latency: 'api' (list models) - default, 'generation' (single token generation), or 'none' (skip latency measurement)")
41
+ parser.add_argument("--no-warmup", action="store_true", help="Skip warmup phase")
42
+ parser.add_argument("--adapt-prompt", action="store_true", default=True, help="Adapt prompt size based on warmup token usage delta (default: True)")
43
+ parser.add_argument("--no-adapt-prompt", action="store_false", dest="adapt_prompt", help="Disable prompt size adaptation")
44
+ parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable prefix caching performance measurement")
45
+ return parser.parse_args()
46
+
47
+
48
+ def get_tokenizer(model_name, tokenizer_name=None):
49
+ try:
50
+ name = tokenizer_name if tokenizer_name else model_name
51
+ return AutoTokenizer.from_pretrained(name)
52
+ except Exception as e:
53
+ print(f"Error loading tokenizer: {e}")
54
+ print("Falling back to 'gpt2' tokenizer as approximation.")
55
+ return AutoTokenizer.from_pretrained("gpt2")
56
+
57
+
58
+ def prepare_text_data(book_url, tokenizer):
59
+ try:
60
+ # Create cache directory if it doesn't exist
61
+ cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "llama-benchy")
62
+ os.makedirs(cache_dir, exist_ok=True)
63
+
64
+ # Generate hash of the URL for the filename
65
+ url_hash = hashlib.md5(book_url.encode()).hexdigest()
66
+ cache_file = os.path.join(cache_dir, f"{url_hash}.txt")
67
+
68
+ if os.path.exists(cache_file):
69
+ print(f"Loading text from cache: {cache_file}")
70
+ with open(cache_file, "r", encoding="utf-8") as f:
71
+ text = f.read()
72
+ else:
73
+ print(f"Downloading book from {book_url}...")
74
+ response = requests.get(book_url)
75
+ response.raise_for_status()
76
+ text = response.text
77
+ # Basic cleanup
78
+ start_idx = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
79
+ if start_idx != -1:
80
+ text = text[start_idx:]
81
+
82
+ # Save to cache
83
+ with open(cache_file, "w", encoding="utf-8") as f:
84
+ f.write(text)
85
+ print(f"Saved text to cache: {cache_file}")
86
+
87
+ return tokenizer.encode(text, add_special_tokens=False)
88
+ except Exception as e:
89
+ print(f"Error downloading book: {e}")
90
+ exit(1)
91
+
92
+
93
+ def generate_prompt(all_tokens, tokenizer, prompt_tokens, context_tokens=0, no_cache=False):
94
+ suffix = ""
95
+ suffix_len = 0
96
+ if no_cache:
97
+ suffix = f" {uuid.uuid4()}"
98
+ suffix_len = len(tokenizer.encode(suffix, add_special_tokens=False))
99
+
100
+ # Adjust prompt tokens to fetch from text
101
+ text_prompt_tokens = max(0, prompt_tokens - suffix_len)
102
+
103
+ # Create a pool of tokens large enough
104
+ total_needed = text_prompt_tokens + context_tokens
105
+
106
+ if len(all_tokens) < total_needed:
107
+ # Repeat tokens if not enough
108
+ all_tokens = all_tokens * (total_needed // len(all_tokens) + 2)
109
+
110
+ # Pick a random start position
111
+ max_start = len(all_tokens) - total_needed
112
+ start_idx = np.random.randint(0, max_start)
113
+
114
+ selected_tokens = all_tokens[start_idx : start_idx + total_needed]
115
+
116
+ context_text = tokenizer.decode(selected_tokens[:context_tokens]) if context_tokens > 0 else ""
117
+ prompt_text = tokenizer.decode(selected_tokens[context_tokens:])
118
+
119
+ if no_cache:
120
+ prompt_text += suffix
121
+
122
+ return context_text, prompt_text
123
+
124
+
125
+ async def measure_latency(session, base_url, api_key, mode="api", model_name=None):
126
+ if mode == "none":
127
+ print("Skipping latency measurement (assuming 0 ms).")
128
+ return 0
129
+
130
+ print(f"Measuring latency using mode: {mode}...")
131
+ latencies = []
132
+ headers = {"Authorization": f"Bearer {api_key}"}
133
+
134
+ for _ in range(3):
135
+ start = time.perf_counter()
136
+ try:
137
+ if mode == "api":
138
+ async with session.get(f"{base_url}/models", headers=headers) as response:
139
+ await response.read()
140
+ latencies.append(time.perf_counter() - start)
141
+ elif mode == "generation":
142
+ if not model_name:
143
+ raise ValueError("Model name required for generation latency mode")
144
+ payload = {
145
+ "model": model_name,
146
+ "messages": [{"role": "user", "content": "hello"}],
147
+ "max_tokens": 1,
148
+ "stream": True
149
+ }
150
+ async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
151
+ async for _ in response.content:
152
+ # record latency as soon as the first byte is received
153
+ latencies.append(time.perf_counter() - start)
154
+ break
155
+ # Drain the rest of the response to keep the connection alive
156
+ async for _ in response.content: pass
157
+ except Exception as e:
158
+ print(f"Error measuring latency: {e}")
159
+
160
+ if latencies:
161
+ avg_latency = np.mean(latencies)
162
+ print(f"Average latency ({mode}): {avg_latency*1000:.2f} ms")
163
+ return avg_latency
164
+ return 0
165
+
166
+
167
+ async def warmup(session, base_url, api_key, model, tokenizer=None):
168
+ print("Warming up...")
169
+ headers = {"Authorization": f"Bearer {api_key}"}
170
+ warmup_text = "Warmup " * 10
171
+
172
+ delta_user = 0
173
+ delta_context = 0
174
+
175
+ # 1. User only (No Context)
176
+ payload_user = {
177
+ "model": model,
178
+ "messages": [{"role": "user", "content": warmup_text}],
179
+ "max_tokens": 1
180
+ }
181
+
182
+ try:
183
+ async with session.post(f"{base_url}/chat/completions", json=payload_user, headers=headers) as response:
184
+ response_json = await response.json()
185
+ if tokenizer:
186
+ if 'usage' in response_json:
187
+ prompt_tokens = response_json['usage']['prompt_tokens']
188
+ local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
189
+ delta_user = prompt_tokens - local_tokens
190
+ print(f"Warmup (User only) complete. Delta: {delta_user} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
191
+ else:
192
+ print("Warmup (User only) complete (no usage stats found).")
193
+ else:
194
+ print("Warmup complete.")
195
+
196
+ if tokenizer:
197
+ # 2. System + Empty User (Context Only)
198
+ payload_sys_empty = {
199
+ "model": model,
200
+ "messages": [
201
+ {"role": "system", "content": warmup_text},
202
+ {"role": "user", "content": ""}
203
+ ],
204
+ "max_tokens": 1
205
+ }
206
+ async with session.post(f"{base_url}/chat/completions", json=payload_sys_empty, headers=headers) as response:
207
+ response_json = await response.json()
208
+ if 'usage' in response_json:
209
+ prompt_tokens = response_json['usage']['prompt_tokens']
210
+ local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
211
+ delta_context = prompt_tokens - local_tokens
212
+ print(f"Warmup (System+Empty) complete. Delta: {delta_context} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
213
+ else:
214
+ print("Warmup (System+Empty) complete (no usage stats found).")
215
+ delta_context = delta_user
216
+
217
+ except Exception as e:
218
+ print(f"Warmup failed: {e}")
219
+ return delta_user, delta_context
220
+
221
+
222
+ async def run_benchmark(session, base_url, api_key, model_name, context_text, prompt_text, expected_pp_tokens, tg, no_cache, latency, post_run_cmd):
223
+ messages = []
224
+ if context_text:
225
+ messages.append({"role": "system", "content": context_text})
226
+ messages.append({"role": "user", "content": prompt_text})
227
+
228
+ ttft = 0
229
+ e2e_ttft = 0
230
+ token_count = 0
231
+ first_token_time = 0
232
+ first_response_time = 0
233
+ prompt_usage_tokens = 0
234
+
235
+ result = {
236
+ "pp_speed": None,
237
+ "tg_speed": None,
238
+ "ttft": None,
239
+ "ttfr": None,
240
+ "est_ppt": None,
241
+ "e2e_ttft": None
242
+ }
243
+
244
+ try:
245
+ payload = {
246
+ "model": model_name,
247
+ "messages": messages,
248
+ "max_tokens": tg,
249
+ "stream": True,
250
+ "stream_options": {"include_usage": True},
251
+ # "temperature": 0,
252
+ # "seed": 42
253
+ }
254
+
255
+ if no_cache:
256
+ payload["cache_prompt"] = False
257
+
258
+ headers = {"Authorization": f"Bearer {api_key}"}
259
+
260
+ start_time = time.perf_counter()
261
+
262
+ async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
263
+ if response.status != 200:
264
+ error_text = await response.text()
265
+ print(f"Error: {response.status} - {error_text}")
266
+ return None
267
+
268
+ buffer = ""
269
+ decoder = codecs.getincrementaldecoder("utf-8")(errors='replace')
270
+ async for chunk_bytes in response.content:
271
+ chunk_time = time.perf_counter()
272
+ decoded_chunk = decoder.decode(chunk_bytes, final=False)
273
+ buffer += decoded_chunk
274
+
275
+ while "\n" in buffer:
276
+ line, buffer = buffer.split("\n", 1)
277
+ line = line.strip()
278
+ if not line or line == 'data: [DONE]':
279
+ continue
280
+
281
+ if line.startswith('data: '):
282
+ try:
283
+ chunk = json.loads(line[6:])
284
+ if 'usage' in chunk:
285
+ prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
286
+
287
+ if 'choices' in chunk and len(chunk['choices']) > 0:
288
+ if first_response_time == 0:
289
+ first_response_time = chunk_time
290
+
291
+ delta = chunk['choices'][0].get('delta', {})
292
+ content = delta.get('content')
293
+ reasoning_content = delta.get('reasoning_content')
294
+
295
+ if content or reasoning_content:
296
+ if token_count == 0:
297
+ first_token_time = chunk_time
298
+ e2e_ttft = first_token_time - start_time
299
+ ttft = e2e_ttft-latency
300
+ if ttft < 0:
301
+ ttft = 0
302
+
303
+ token_count += 1
304
+ except json.JSONDecodeError:
305
+ continue
306
+
307
+ end_time = time.perf_counter()
308
+
309
+ if token_count > 0:
310
+ # Calculate decode time (time for subsequent tokens)
311
+ # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
312
+ if token_count > 1:
313
+ decode_time = end_time - first_token_time
314
+ if decode_time > 0:
315
+ # Speed for the generated tokens (excluding the first one which is TTFT)
316
+ result["tg_speed"] = (token_count - 1) / decode_time
317
+ else:
318
+ # Fallback if time is too small
319
+ result["tg_speed"] = (token_count - 1) / 0.0001
320
+
321
+ # Use expected_pp_tokens for speed calculation
322
+ total_prompt_tokens = expected_pp_tokens
323
+
324
+ # Only use reported usage if it's close to expected (to handle tokenizer differences)
325
+ # but not if it's vastly different (which happens in prefix caching where usage includes cached tokens)
326
+ if prompt_usage_tokens > 0:
327
+ diff = abs(prompt_usage_tokens - expected_pp_tokens)
328
+ if diff < expected_pp_tokens * 0.2: # 20% tolerance
329
+ total_prompt_tokens = prompt_usage_tokens
330
+
331
+ # Calculate TTFR and Estimated Prompt Processing Time
332
+ ttfr = 0
333
+ est_ppt = 0
334
+ if first_response_time > 0:
335
+ ttfr = first_response_time - start_time
336
+ est_ppt = ttfr - latency
337
+ if est_ppt < 0: est_ppt = 0
338
+
339
+ if est_ppt > 0:
340
+ result["pp_speed"] = total_prompt_tokens / est_ppt
341
+ result["est_ppt"] = est_ppt
342
+
343
+ if ttfr > 0:
344
+ result["ttfr"] = ttfr
345
+
346
+ if ttft > 0:
347
+ result["ttft"] = ttft
348
+
349
+ if e2e_ttft > 0:
350
+ result["e2e_ttft"] = e2e_ttft
351
+
352
+ except Exception as e:
353
+ print(f"Error during run: {e}")
354
+ return None
355
+
356
+ if post_run_cmd:
357
+ try:
358
+ subprocess.run(post_run_cmd, shell=True, check=True)
359
+ except subprocess.CalledProcessError as e:
360
+ print(f"Post-run command failed: {e}")
361
+
362
+ return result
363
+
364
+
365
+ async def main_async():
366
+ args = parse_arguments()
367
+
368
+ if args.enable_prefix_caching and args.no_cache:
369
+ print("Error: --enable-prefix-caching and --no-cache are incompatible.")
370
+ return
371
+
372
+ build_number = __build__
373
+ version_number = __version__
374
+
375
+ current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
376
+ print(f"llama-benchy ({version_number}.{build_number})")
377
+ print(f"Date: {current_time}")
378
+ print(f"Benchmarking model: {args.model} at {args.base_url}")
379
+
380
+ served_model_name = args.served_model_name if args.served_model_name else args.model
381
+
382
+ tokenizer = get_tokenizer(args.model, args.tokenizer)
383
+ all_tokens = prepare_text_data(args.book_url, tokenizer)
384
+ print(f"Total tokens available in text corpus: {len(all_tokens)}")
385
+
386
+ # Use a large timeout for long-running benchmarks
387
+ timeout = aiohttp.ClientTimeout(total=3600)
388
+ connector = aiohttp.TCPConnector(limit=1, force_close=False, keepalive_timeout=600)
389
+ async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
390
+ delta_user = 0
391
+ delta_context = 0
392
+ should_warmup = not args.no_warmup
393
+ if args.adapt_prompt:
394
+ should_warmup = True
395
+
396
+ if should_warmup:
397
+ delta_user, delta_context = await warmup(session, args.base_url, args.api_key, served_model_name, tokenizer if args.adapt_prompt else None)
398
+
399
+ latency = await measure_latency(session, args.base_url, args.api_key, args.latency_mode, served_model_name)
400
+
401
+ results = []
402
+
403
+ for depth in args.depth:
404
+ for pp in args.pp:
405
+ for tg in args.tg:
406
+ print(f"Running test: pp={pp}, tg={tg}, depth={depth}")
407
+ pp_speeds = []
408
+ tg_speeds = []
409
+ ttft_values = []
410
+ ttfr_values = []
411
+ est_ppt_values = []
412
+ e2e_ttft_values = []
413
+
414
+ ctx_pp_speeds = []
415
+ ctx_tg_speeds = []
416
+ ctx_ttfr_values = []
417
+ ctx_est_ppt_values = []
418
+ ctx_e2e_ttft_values = []
419
+
420
+ for run in range(args.runs):
421
+ current_pp = pp
422
+ current_depth = depth
423
+ if args.adapt_prompt:
424
+ if depth == 0:
425
+ current_pp = max(1, pp - delta_user)
426
+ else:
427
+ current_depth = max(1, depth - delta_context)
428
+
429
+ context, prompt = generate_prompt(all_tokens, tokenizer, current_pp, current_depth, args.no_cache)
430
+
431
+ if args.enable_prefix_caching and depth > 0:
432
+ # Request 1: Context only
433
+ # We send context as system message, and empty prompt as user message.
434
+ # This establishes the prefix: [System: Context] [User: ""]
435
+ # Expected PP tokens = current_depth (context size)
436
+ print(f" Run {run+1}/{args.runs} (Context Load)...")
437
+ ctx_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, "", current_depth, tg, args.no_cache, latency, None)
438
+
439
+ if ctx_result:
440
+ if ctx_result["pp_speed"] is not None:
441
+ ctx_pp_speeds.append(ctx_result["pp_speed"])
442
+ if ctx_result["tg_speed"] is not None:
443
+ ctx_tg_speeds.append(ctx_result["tg_speed"])
444
+ if ctx_result["ttfr"] is not None:
445
+ ctx_ttfr_values.append(ctx_result["ttfr"])
446
+ if ctx_result["est_ppt"] is not None:
447
+ ctx_est_ppt_values.append(ctx_result["est_ppt"])
448
+ if ctx_result["e2e_ttft"] is not None:
449
+ ctx_e2e_ttft_values.append(ctx_result["e2e_ttft"])
450
+
451
+ # Request 2: Context + Prompt
452
+ # We send context as system message, and prompt as user message.
453
+ # The prefix [System: Context] should be cached.
454
+ # Expected PP tokens = current_pp (prompt size only)
455
+ print(f" Run {run+1}/{args.runs} (Inference)...")
456
+ run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, current_pp, tg, args.no_cache, latency, args.post_run_cmd)
457
+ else:
458
+ # Standard run
459
+ # Expected PP tokens = current_pp + current_depth
460
+ expected_tokens = current_pp + current_depth
461
+ run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, expected_tokens, tg, args.no_cache, latency, args.post_run_cmd)
462
+
463
+ if run_result:
464
+ if run_result["tg_speed"] is not None:
465
+ tg_speeds.append(run_result["tg_speed"])
466
+ if run_result["pp_speed"] is not None:
467
+ pp_speeds.append(run_result["pp_speed"])
468
+ if run_result["est_ppt"] is not None:
469
+ est_ppt_values.append(run_result["est_ppt"])
470
+ if run_result["ttfr"] is not None:
471
+ ttfr_values.append(run_result["ttfr"])
472
+ if run_result["ttft"] is not None:
473
+ ttft_values.append(run_result["ttft"])
474
+ if run_result["e2e_ttft"] is not None:
475
+ e2e_ttft_values.append(run_result["e2e_ttft"])
476
+
477
+ # Aggregate results
478
+ def format_result(values, multiplier=1.0):
479
+ if not values: return ""
480
+ mean = np.mean(values) * multiplier
481
+ std = np.std(values) * multiplier
482
+ return f"{mean:.2f} ± {std:.2f}"
483
+
484
+ # Context PP (if enabled)
485
+ if ctx_pp_speeds:
486
+ test_name = f"ctx_pp @ d{depth}"
487
+ results.append([
488
+ args.model,
489
+ test_name,
490
+ format_result(ctx_pp_speeds),
491
+ format_result(ctx_ttfr_values, 1000),
492
+ format_result(ctx_est_ppt_values, 1000),
493
+ format_result(ctx_e2e_ttft_values, 1000)
494
+ ])
495
+
496
+ # Context TG (if enabled)
497
+ if ctx_tg_speeds:
498
+ test_name = f"ctx_tg @ d{depth}"
499
+ results.append([args.model, test_name, format_result(ctx_tg_speeds), "", "", ""])
500
+
501
+ # Standard PP
502
+ if pp_speeds:
503
+ test_name = f"pp{pp}"
504
+ if depth > 0: test_name += f" @ d{depth}"
505
+ results.append([
506
+ args.model,
507
+ test_name,
508
+ format_result(pp_speeds),
509
+ format_result(ttfr_values, 1000),
510
+ format_result(est_ppt_values, 1000),
511
+ format_result(e2e_ttft_values, 1000)
512
+ ])
513
+
514
+ # Standard TG
515
+ if tg_speeds:
516
+ test_name = f"tg{tg}"
517
+ if depth > 0: test_name += f" @ d{depth}"
518
+ results.append([args.model, test_name, format_result(tg_speeds), "", "", ""])
519
+
520
+ print()
521
+ if not results:
522
+ print("No results collected. Check if the model is generating tokens.")
523
+ else:
524
+ print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
525
+ print(f"\nllama-benchy ({version_number}.{build_number})")
526
+ print(f"date: {current_time} | latency mode: {args.latency_mode}")
527
+
528
+
529
+ def main():
530
+ """Entry point for the CLI command."""
531
+ asyncio.run(main_async())
532
+
533
+
534
+ if __name__ == "__main__":
535
+ main()
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '0.1.0'
32
+ __version_tuple__ = version_tuple = (0, 1, 0)
33
+
34
+ __commit_id__ = commit_id = None
@@ -0,0 +1,249 @@
1
+ Metadata-Version: 2.4
2
+ Name: llama-benchy
3
+ Version: 0.1.0
4
+ Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
5
+ Project-URL: Homepage, https://github.com/eugr/llama-benchy
6
+ Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
7
+ Author: eugr
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Eugene Rakhmatulin
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE
30
+ Classifier: Intended Audience :: Science/Research
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.10
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
38
+ Requires-Python: >=3.10
39
+ Requires-Dist: aiohttp
40
+ Requires-Dist: asyncio
41
+ Requires-Dist: numpy
42
+ Requires-Dist: openai
43
+ Requires-Dist: requests
44
+ Requires-Dist: tabulate
45
+ Requires-Dist: transformers
46
+ Description-Content-Type: text/markdown
47
+
48
+ # llama-benchy - llama-bench style benchmarking tool for all backends
49
+
50
+ This script benchmarks OpenAI-compatible LLM endpoints, generating statistics similar to `llama-bench`.
51
+
52
+ ## Motivation
53
+
54
+ `llama-bench` is a CLI tool that is part of the very popular [llama.cpp](https://github.com/ggml-org/llama.cpp) inference engine. It is widely used in the LLM community to benchmark models and supports measurements at different context sizes.
55
+ However, it is only available for llama.cpp and cannot be used with other inference engines such as vLLM or SGLang.
56
+
57
+ Also, it performs its measurements through the C++ engine directly, which is not representative of the end-user experience and can differ from it quite a bit in practice.
58
+
59
+ vLLM has its own powerful benchmarking tool, and while it can be used with other inference engines, it has a few issues:
60
+
61
+ - It is tricky, and sometimes impossible, to calculate prompt processing speeds at different context lengths. You can use `vllm bench sweep serve`, but it only works well against vLLM with prefix caching disabled on the server. Even with random prompts it reuses the same prompt across multiple runs, which will hit the cache in `llama-server`, for instance, so you get very low median TTFT times and very high prompt processing speeds.
62
+ - The TTFT it reports is not measured to the first usable token but to the very first data chunk from the server, which may not contain any generated tokens in `/v1/chat/completions` mode.
63
+ - The random dataset is the only one that lets you specify an arbitrary number of tokens, but a randomly generated token sequence doesn't let you adequately measure speculative decoding/MTP.
64
+
65
+ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool that brings llama-bench-style measurements at different context lengths to arbitrary OpenAI-compatible endpoints.
66
+
67
+ ## Features
68
+
69
+ - Measures Prompt Processing (pp) and Token Generation (tg) speeds at different context depths.
70
+ - Can separately measure context prefill and prompt processing over an already cached context at different context depths.
71
+ - Reports Time To First Response (TTFR, first data chunk), Estimated Prompt Processing Time (est_ppt), and End-to-End TTFT.
72
+ - Supports configurable prompt length (`--pp`), generation length (`--tg`), and context depth (`--depth`).
73
+ - Can run multiple iterations (`--runs`) and report mean ± std.
74
+ - Uses HuggingFace tokenizers for accurate token counts.
75
+ - Downloads a book from Project Gutenberg to use as source text for prompts, which gives more realistic benchmarking of speculative decoding/MTP models (a short sketch of the prompt slicing follows this list).
76
+ - Supports executing a command after each run (e.g., to clear cache).
77
+ - Configurable latency measurement mode.
78
+
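To make the last two points concrete, here is a rough sketch of how the book text and the HuggingFace tokenizer are combined into prompts, loosely following the package's `generate_prompt`. The tokenizer name and sample text are placeholders, not what the tool uses at runtime (it loads the model's own tokenizer and the downloaded book).

```python
# Rough sketch of the prompt construction, loosely following generate_prompt:
# slice `depth + pp` tokens from a random offset in the text and decode them
# back to strings. Tokenizer and sample text below are placeholders.
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
book_text = "It is a capital mistake to theorize before one has data. " * 2000
all_tokens = tokenizer.encode(book_text, add_special_tokens=False)

pp, depth = 2048, 1024
start = np.random.randint(0, len(all_tokens) - (pp + depth))
window = all_tokens[start : start + pp + depth]

context_text = tokenizer.decode(window[:depth])  # sent as the system message
prompt_text = tokenizer.decode(window[depth:])   # sent as the user message
```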
79
+ ## Installation
80
+
81
+ Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
82
+
83
+ ### Option 1: Run without installation using `uvx`
84
+
85
+ ```bash
86
+ uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
87
+ ```
88
+
89
+ ### Option 2: Install into virtual environment
90
+
91
+ ```bash
92
+ # Clone the repository
93
+ git clone https://github.com/eugr/llama-benchy.git
94
+ cd llama-benchy
95
+
96
+ # Create virtual environment
97
+ uv venv
98
+
99
+ # Install with uv (installs into a virtual environment automatically)
100
+ uv pip install -e .
101
+ ```
102
+
103
+ To run, activate the environment first:
104
+
105
+ ```bash
106
+ source .venv/bin/activate
107
+ ```
108
+
109
+ Then execute the command:
110
+
111
+ ```bash
112
+ llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
113
+ ```
114
+
115
+
116
+ ### Option 3: Run without installing (`uv run`)
117
+
118
+ ```bash
119
+ # Clone the repository
120
+ git clone https://github.com/eugr/llama-benchy.git
121
+ cd llama-benchy
122
+
123
+ # Using uv run (creates a virtual environment if it doesn't exist and runs the command)
124
+ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
125
+ ```
126
+
127
+ ### Option 4: Install into the system environment
128
+
129
+ ```bash
130
+ uv pip install git+https://github.com/eugr/llama-benchy --system
131
+ ```
132
+
133
+ ## Usage
134
+
135
+ After installation, you can run the tool directly:
136
+
137
+ ```bash
138
+ llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME> --pp <PROMPT_TOKENS> --tg <GEN_TOKENS> [OPTIONS]
139
+ ```
140
+
141
+ Example:
142
+
143
+ ```bash
144
+ llama-benchy \
145
+ --base-url http://localhost:8000/v1 \
146
+ --model openai/gpt-oss-120b \
147
+ --depth 0 4096 8192 16384 32768 \
148
+ --latency-mode generation
149
+ ```
150
+
151
+ Output:
152
+
153
+
154
+ | model | test | t/s | ttfr (ms) | est_ppt (ms) | e2e_ttft (ms) |
155
+ |:--------------------|----------------:|----------------:|-------------------:|-------------------:|-------------------:|
156
+ | openai/gpt-oss-120b | pp2048 | 2019.02 ± 34.98 | 1054.64 ± 17.57 | 1014.66 ± 17.57 | 1115.41 ± 18.70 |
157
+ | openai/gpt-oss-120b | tg32 | 52.94 ± 1.01 | | | |
158
+ | openai/gpt-oss-120b | pp2048 @ d4096 | 1994.49 ± 77.97 | 3129.18 ± 120.27 | 3089.19 ± 120.27 | 3198.97 ± 122.24 |
159
+ | openai/gpt-oss-120b | tg32 @ d4096 | 46.69 ± 1.11 | | | |
160
+ | openai/gpt-oss-120b | pp2048 @ d8192 | 1751.68 ± 34.44 | 5892.61 ± 114.68 | 5852.63 ± 114.68 | 5971.27 ± 115.77 |
161
+ | openai/gpt-oss-120b | tg32 @ d8192 | 40.40 ± 1.19 | | | |
162
+ | openai/gpt-oss-120b | pp2048 @ d16384 | 1475.63 ± 31.41 | 12542.02 ± 265.86 | 12502.04 ± 265.86 | 12634.67 ± 269.43 |
163
+ | openai/gpt-oss-120b | tg32 @ d16384 | 33.86 ± 1.45 | | | |
164
+ | openai/gpt-oss-120b | pp2048 @ d32768 | 1131.86 ± 50.53 | 30869.90 ± 1410.15 | 30829.92 ± 1410.15 | 30992.96 ± 1417.33 |
165
+ | openai/gpt-oss-120b | tg32 @ d32768 | 25.34 ± 1.31 | | | |
166
+
167
+ llama-benchy (build: 75bc129)
168
+ date: 2026-01-02 17:11:19 | latency mode: generation
169
+
170
+ -------
171
+
172
+ It is recommended to use the "generation" latency mode to get prompt processing speeds closer to the real numbers, especially for shorter prompts.
173
+ By default, the script adapts the prompt size so that the token count processed by the server matches the specified value, regardless of the chat template applied. Use `--no-adapt-prompt` to disable this behavior.
174
+
175
+ Generally you don't need to disable prompt caching on the server, as the probability of cache hits is fairly small. If you do get cache hits, you can add `--no-cache`, which appends some random noise to each prompt (see the sketch below).
176
+
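For reference, the noise added by `--no-cache` is just a per-request UUID suffix (plus `cache_prompt=false` in the request body); a minimal sketch:

```python
# What --no-cache adds to each prompt, in essence: a unique suffix so that no
# two requests share an identical prefix. The sample prompt is a placeholder.
import uuid

prompt_text = "Some book excerpt used as the benchmark prompt..."
prompt_text += f" {uuid.uuid4()}"  # unique per request, defeats prefix caching
print(prompt_text[-50:])
```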
177
+ ### Arguments
178
+
179
+ - `--base-url`: OpenAI compatible endpoint URL (Required).
180
+ - `--api-key`: API Key (Default: "EMPTY").
181
+ - `--model`: Model name (Required).
182
+ - `--served-model-name`: Model name used in API calls (Defaults to --model if not specified).
183
+ - `--tokenizer`: HuggingFace tokenizer name (Defaults to model name).
184
+ - `--pp`: List of prompt processing token counts (Default: [2048]).
185
+ - `--tg`: List of token generation counts (Default: [32]).
186
+ - `--depth`: List of context depths (Default: [0]).
187
+ - `--runs`: Number of runs per test (Default: 3).
188
+ - `--no-cache`: Adds noise to requests to avoid prefix caching. Also sends `cache_prompt=false` to the server.
189
+ - `--post-run-cmd`: Command to execute after each test run.
190
+ - `--book-url`: URL of a book to use for text generation (Defaults to Sherlock Holmes).
191
+ - `--latency-mode`: Method to measure latency: 'api' (list-models call, default), 'generation' (single-token generation), or 'none' (skip latency measurement).
192
+ - `--no-warmup`: Skip warmup phase.
193
+ - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
194
+ - `--no-adapt-prompt`: Disable prompt size adaptation.
195
+ - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
196
+
197
+ ### Metrics
198
+
199
+ The script outputs a table with the following metrics. All time measurements are in milliseconds (ms).
200
+
201
+ #### Latency Adjustment
202
+ The script attempts to estimate network and server-overhead latency so that it can report approximate "server-side" processing times.
203
+ - **Latency**: Measured based on `--latency-mode`.
204
+ - `api`: Time to fetch `/models` (from sending the request to receiving the first byte of the response). Eliminates network latency only.
205
+ - `generation`: Time to generate 1 token (from sending the request to receiving the first byte of the response). Tries to eliminate both network latency and server overhead.
206
+ - `none`: Assumed to be 0.
207
+ - This measured latency is subtracted from `ttfr` to calculate `est_ppt`.
208
+
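As an illustration, the 'api' mode boils down to timing a few `/models` requests and averaging them. A minimal synchronous sketch (the tool itself does this asynchronously with aiohttp; the URL and key below are placeholders):

```python
# Minimal sketch of the 'api' latency mode: time a few GETs to /models and
# average them. base_url and the API key are placeholders.
import time
import requests

base_url = "http://localhost:8000/v1"
headers = {"Authorization": "Bearer EMPTY"}

samples = []
for _ in range(3):
    start = time.perf_counter()
    requests.get(f"{base_url}/models", headers=headers, timeout=10)
    samples.append(time.perf_counter() - start)

latency = sum(samples) / len(samples)
print(f"estimated latency: {latency * 1000:.2f} ms")
```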
209
+ #### Table Columns
210
+
211
+ - **`t/s` (Tokens per Second)**:
212
+ - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
213
+ - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
214
+
215
+ - **`ttfr (ms)` (Time To First Response)**:
216
+ - Calculation: `Time of First Response Chunk - Start Time`.
217
+ - Represents the raw time until the client receives *any* stream data from the server (including empty chunks or role definitions, but excluding the initial HTTP response headers). This includes network latency. The same measurement method is used by `vllm bench serve` to report TTFT.
218
+
219
+ - **`est_ppt (ms)` (Estimated Prompt Processing Time)**:
220
+ - Calculation: `TTFR - Estimated Latency`.
221
+ - Estimated time the server spent processing the prompt. Used for calculating Prompt Processing speed.
222
+
223
+ - **`e2e_ttft (ms)` (End-to-End Time To First Token)**:
224
+ - Calculation: `Time of First Content Token - Start Time`.
225
+ - The total time perceived by the client from sending the request to seeing the first generated content.
226
+
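To make the relationship between these columns concrete, here is a small worked example with made-up timestamps (values in seconds; the table reports milliseconds):

```python
# Worked example of the reported metrics, using made-up timestamps (seconds).
start_time = 0.000           # request sent
first_response_time = 0.950  # first stream chunk of any kind
first_token_time = 1.050     # first chunk containing actual content
end_time = 1.650             # last generated token received
latency = 0.040              # estimated via --latency-mode
prompt_tokens = 2048
generated_tokens = 32

ttfr = first_response_time - start_time                            # ttfr
est_ppt = max(ttfr - latency, 0)                                   # est_ppt
e2e_ttft = first_token_time - start_time                           # e2e_ttft
pp_speed = prompt_tokens / est_ppt                                 # pp t/s
tg_speed = (generated_tokens - 1) / (end_time - first_token_time)  # tg t/s

print(f"ttfr={ttfr * 1000:.0f} ms, est_ppt={est_ppt * 1000:.0f} ms, "
      f"e2e_ttft={e2e_ttft * 1000:.0f} ms, pp={pp_speed:.0f} t/s, tg={tg_speed:.1f} t/s")
```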
227
+ ### Prefix Caching Benchmarking
228
+
229
+ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs a two-step process for each run to measure the impact of prefix caching:
230
+
231
+ 1. **Context Load**: Sends the context tokens (as a system message) with an empty user message. This forces the server to process and cache the context.
232
+ - Reported as `ctx_pp @ d{depth}` (Context Prompt Processing) and `ctx_tg @ d{depth}`.
233
+ 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
234
+ - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.
235
+
236
+ By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
237
+
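For reference, the two requests differ only in the user message; the system message (the context) is identical, so the server can reuse its cache. A sketch of the two payloads, assuming an OpenAI-style `/chat/completions` endpoint (model name and text are placeholders):

```python
# Request shapes for the two-step prefix-caching measurement. The context and
# prompt strings are placeholders for text generated from the book corpus.
context_text = "<about `depth` tokens of book text>"
prompt_text = "<about `pp` tokens of book text>"

# Step 1: context load -- context as system message, empty user message.
context_load = {
    "model": "openai/gpt-oss-120b",
    "messages": [
        {"role": "system", "content": context_text},
        {"role": "user", "content": ""},
    ],
    "max_tokens": 32,
    "stream": True,
}

# Step 2: inference -- the same system prefix, now followed by the real prompt.
inference = {
    "model": "openai/gpt-oss-120b",
    "messages": [
        {"role": "system", "content": context_text},
        {"role": "user", "content": prompt_text},
    ],
    "max_tokens": 32,
    "stream": True,
}
```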
238
+ ### Example
239
+
240
+ ```bash
241
+ llama-benchy \
242
+ --base-url http://localhost:8000/v1 \
243
+ --model openai/gpt-oss-120b \
244
+ --pp 128 256 \
245
+ --tg 32 64 \
246
+ --depth 0 1024
247
+ ```
248
+
249
+ This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
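The sweep nests depth on the outside, then pp, then tg, so this example yields 2 × 2 × 2 = 8 test combinations:

```python
# Enumerate the combinations the example above will run (depth is the outer
# loop, then pp, then tg -- matching the order used by the tool).
from itertools import product

depths, pps, tgs = [0, 1024], [128, 256], [32, 64]
for depth, pp, tg in product(depths, pps, tgs):
    print(f"pp={pp}, tg={tg}, depth={depth}")
```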
@@ -0,0 +1,8 @@
1
+ llama_benchy/__init__.py,sha256=4DTHEmeJShcJJdb6rPhFqv4_zREEQRZGZ3h0ThA6YpU,798
2
+ llama_benchy/__main__.py,sha256=CZEf_36w5iut5RZjAZ4F894PAHF3hoxDWkT6lTFcr_I,24351
3
+ llama_benchy/_version.py,sha256=5jwwVncvCiTnhOedfkzzxmxsggwmTBORdFL_4wq0ZeY,704
4
+ llama_benchy-0.1.0.dist-info/METADATA,sha256=WEL0ASCMSmJA8QPXJzAYA0fAztA_6-D7xGjNdx3o4vY,12943
5
+ llama_benchy-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
6
+ llama_benchy-0.1.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
7
+ llama_benchy-0.1.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
8
+ llama_benchy-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ llama-benchy = llama_benchy.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eugene Rakhmatulin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.