llama-benchy 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama_benchy/results.py ADDED
@@ -0,0 +1,435 @@
1
+ import numpy as np
2
+ from tabulate import tabulate
3
+ from typing import List, Dict, Any, Optional
4
+ from dataclasses import dataclass, asdict
5
+ import json
6
+ import csv
7
+ import sys
8
+
9
+ from .client import RequestResult
10
+
11
+ @dataclass
12
+ class BenchmarkMetric:
13
+ mean: float
14
+ std: float
15
+ values: List[float]
16
+
17
+ @dataclass
18
+ class BenchmarkMetadata:
19
+ version: str
20
+ timestamp: str
21
+ latency_mode: str
22
+ latency_ms: float
23
+ model: str
24
+ prefix_caching_enabled: bool
25
+ max_concurrency: int
26
+
27
+ @dataclass
28
+ class BenchmarkRun:
29
+ concurrency: int
30
+ context_size: int
31
+ prompt_size: int
32
+ response_size: int
33
+ is_context_prefill_phase: bool
34
+
35
+ # Metrics (using BenchmarkMetric)
36
+ pp_throughput: Optional[BenchmarkMetric]
37
+ pp_req_throughput: Optional[BenchmarkMetric]
38
+ tg_throughput: Optional[BenchmarkMetric]
39
+ tg_req_throughput: Optional[BenchmarkMetric]
40
+ peak_throughput: Optional[BenchmarkMetric]
41
+ ttfr: Optional[BenchmarkMetric]
42
+ est_ppt: Optional[BenchmarkMetric]
43
+ e2e_ttft: Optional[BenchmarkMetric]
44
+
45
+ class BenchmarkResults:
46
+ def __init__(self):
47
+ self.runs: List[BenchmarkRun] = []
48
+ self.metadata: Optional[BenchmarkMetadata] = None
49
+ self.model_name: Optional[str] = None
50
+
51
+ def _calculate_metric(self, values: List[float], multiplier: float = 1.0) -> Optional[BenchmarkMetric]:
52
+ if not values:
53
+ return None
54
+ scaled_values = [v * multiplier for v in values]
55
+ return BenchmarkMetric(
56
+ mean=np.mean(values) * multiplier,
57
+ std=np.std(values) * multiplier,
58
+ values=scaled_values
59
+ )
60
+
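As a quick illustration of the scaling above (used later to report second-based timings in milliseconds): the multiplier is applied to the mean, the standard deviation, and every stored sample. A minimal sketch using only numpy, with made-up latency samples:

```python
import numpy as np

values = [0.120, 0.135, 0.128]   # e.g. TTFR samples in seconds
multiplier = 1000                # report in milliseconds

mean_ms = np.mean(values) * multiplier             # ≈ 127.7 ms
std_ms = np.std(values) * multiplier               # population std, ≈ 6.1 ms
scaled_values = [v * multiplier for v in values]   # [120.0, 135.0, 128.0]
```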
61
+ def _calculate_peak_throughput(self, all_timestamps: List[float], window: float = 1.0) -> float:
62
+ if not all_timestamps:
63
+ return 0.0
64
+
65
+ all_timestamps.sort()
66
+
67
+ # If total duration is less than the window, use actual duration to calculate rate
68
+ # This handles short bursts correctly where Peak would otherwise be < Mean
69
+ total_duration = all_timestamps[-1] - all_timestamps[0]
70
+ if total_duration < window and total_duration > 0:
71
+ return len(all_timestamps) / total_duration
72
+
73
+ max_tokens = 0
74
+
75
+ start_idx = 0
76
+ for end_idx, end_time in enumerate(all_timestamps):
77
+ # Window starts at end_time - window
78
+ while start_idx < end_idx and all_timestamps[start_idx] <= end_time - window:
79
+ start_idx += 1
80
+
81
+ # Count includes current token, so range is [start_idx, end_idx]
82
+ current_tokens = end_idx - start_idx + 1
83
+ if current_tokens > max_tokens:
84
+ max_tokens = current_tokens
85
+
86
+ return float(max_tokens) / window
87
+
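To make the two branches above concrete, here is a small standalone illustration with hypothetical timestamps (not taken from a real run):

```python
# Short burst: 4 tokens within 0.3 s. total_duration (0.3 s) is below the 1 s
# window, so the early-return branch reports 4 / 0.3 ≈ 13.3 t/s. Without it,
# the windowed count would give 4 / 1.0 = 4 t/s, i.e. a "peak" below the mean.
burst = [10.0, 10.1, 10.2, 10.3]

# Longer run: a token roughly every 20 ms for several seconds. The sliding
# window counts, for each token, how many tokens fall within the preceding
# second; the largest count divided by the window length is the peak rate
# (about 50 t/s here).
steady = [t * 0.02 for t in range(250)]
```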
88
+ def add(self,
89
+ model: str,
90
+ pp: int,
91
+ tg: int,
92
+ depth: int,
93
+ concurrency: int,
94
+ run_results: List[List[RequestResult]], # List of batches (one batch per run)
95
+ latency: float,
96
+ expected_pp_tokens: int,
97
+ is_context_phase: bool = False):
98
+
99
+ if self.model_name is None:
100
+ self.model_name = model
101
+
102
+ # Aggregators
103
+ agg_pp_speeds = []
104
+ agg_tg_speeds = []
105
+ agg_ttft_values = []
106
+ agg_ttfr_values = []
107
+ agg_est_ppt_values = []
108
+ agg_e2e_ttft_values = []
109
+
110
+ agg_batch_pp_throughputs = []
111
+ agg_batch_tg_throughputs = []
112
+ agg_peak_throughputs = []
113
+
114
+ for batch in run_results:
115
+ self._process_batch(
116
+ batch,
117
+ expected_pp_tokens,
118
+ latency,
119
+ agg_pp_speeds,
120
+ agg_tg_speeds,
121
+ agg_ttft_values,
122
+ agg_ttfr_values,
123
+ agg_est_ppt_values,
124
+ agg_e2e_ttft_values,
125
+ agg_batch_pp_throughputs,
126
+ agg_batch_tg_throughputs,
127
+ agg_peak_throughputs
128
+ )
129
+
130
+ # Calculate metrics for BenchmarkRun
131
+ run_metric_pp_throughput = self._calculate_metric(agg_batch_pp_throughputs if concurrency > 1 else agg_pp_speeds)
132
+ run_metric_pp_req_throughput = run_metric_pp_throughput if concurrency == 1 else self._calculate_metric(agg_pp_speeds)
133
+
134
+ run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
135
+ run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
136
+
137
+ run_metric_peak_throughput = self._calculate_metric(agg_peak_throughputs)
138
+
139
+ run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
140
+ run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
141
+ run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
142
+
143
+ self.runs.append(BenchmarkRun(
144
+ concurrency=concurrency,
145
+ context_size=depth,
146
+ prompt_size=pp, # Configured prompt size
147
+ response_size=tg,
148
+ is_context_prefill_phase=is_context_phase,
149
+ pp_throughput=run_metric_pp_throughput,
150
+ pp_req_throughput=run_metric_pp_req_throughput,
151
+ tg_throughput=run_metric_tg_throughput,
152
+ tg_req_throughput=run_metric_tg_req_throughput,
153
+ peak_throughput=run_metric_peak_throughput,
154
+ ttfr=run_metric_ttfr,
155
+ est_ppt=run_metric_est_ppt,
156
+ e2e_ttft=run_metric_e2e_ttft
157
+ ))
158
+
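The metric selection above means the headline throughput comes from batch-level aggregates once requests overlap, while the per-request column averages individual stream speeds; at concurrency 1 the two collapse into the same metric. A small illustrative sketch, with invented numbers:

```python
# Hypothetical decode speeds of 4 concurrent streams in one batch (t/s each).
per_request = [12.5, 12.0, 11.8, 12.2]

# Measured over the shared wall-clock span, the batch-level rate lands near the
# sum of the overlapping streams rather than their average (illustrative value).
batch_total = 47.0

# concurrency > 1:  tg_throughput     <- batch-level rates (one per run)
#                   tg_req_throughput <- per-request speeds across all runs
# concurrency == 1: both fields carry the same per-request metric.
```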
159
+ def _process_batch(self,
160
+ results: List[RequestResult],
161
+ expected_pp_tokens: int,
162
+ latency: float,
163
+ agg_pp_speeds: List[float],
164
+ agg_tg_speeds: List[float],
165
+ agg_ttft_values: List[float],
166
+ agg_ttfr_values: List[float],
167
+ agg_est_ppt_values: List[float],
168
+ agg_e2e_ttft_values: List[float],
169
+ agg_batch_pp_throughputs: List[float],
170
+ agg_batch_tg_throughputs: List[float],
171
+ agg_peak_throughputs: List[float]):
172
+
173
+ valid_results = [r for r in results if r and not r.error]
174
+ if not valid_results:
175
+ return
176
+
177
+ batch_prompt_tokens = 0
178
+ batch_gen_tokens = 0
179
+
180
+ start_times = []
181
+ end_times = []
182
+ first_token_times = []
183
+ last_token_times = []
184
+
185
+ # Collect all token timestamps for peak calculation
186
+ all_token_timestamps = []
187
+
188
+ for res in valid_results:
189
+ start_times.append(res.start_ts)
190
+ end_times.append(res.end_ts)
191
+ all_token_timestamps.extend(res.token_timestamps)
192
+
193
+ if res.token_timestamps:
194
+ last_token_times.append(res.token_timestamps[-1])
195
+ elif res.end_ts:
196
+ # Fallback if no timestamps recorded but request finished
197
+ last_token_times.append(res.end_ts)
198
+
199
+ # Use reported usage if available and reasonable, else expected
200
+ prompt_tokens = expected_pp_tokens
201
+ if res.prompt_tokens > 0:
202
+ diff = abs(res.prompt_tokens - expected_pp_tokens)
203
+ if diff < expected_pp_tokens * 0.2:
204
+ prompt_tokens = res.prompt_tokens
205
+
206
+ batch_prompt_tokens += prompt_tokens
207
+ batch_gen_tokens += res.total_tokens
208
+
209
+ # Metrics Calculation
210
+ ttft = 0.0
211
+ e2e_ttft = 0.0
212
+ ttfr = 0.0
213
+ est_ppt = 0.0
214
+
215
+ if res.first_response_ts:
216
+ ttfr = res.first_response_ts - res.start_ts
217
+ agg_ttfr_values.append(ttfr)
218
+
219
+ if res.first_token_ts:
220
+ first_token_times.append(res.first_token_ts)
221
+ e2e_ttft = res.first_token_ts - res.start_ts
222
+ ttft = max(0, e2e_ttft - latency)
223
+ est_ppt = max(0, ttfr - latency)
224
+
225
+ agg_e2e_ttft_values.append(e2e_ttft)
226
+ agg_ttft_values.append(ttft)
227
+ agg_est_ppt_values.append(est_ppt)
228
+
229
+ # Individual Speeds
230
+ if est_ppt > 0:
231
+ pp_speed = prompt_tokens / est_ppt
232
+ agg_pp_speeds.append(pp_speed)
233
+
234
+ if res.total_tokens > 1 and res.first_token_ts:
235
+ decode_time = res.end_ts - res.first_token_ts
236
+ if decode_time > 0:
237
+ tg_speed = (res.total_tokens - 1) / decode_time
238
+ agg_tg_speeds.append(tg_speed)
239
+
240
+ # Batch-Level Throughput
241
+ if start_times and end_times and first_token_times:
242
+ min_start = min(start_times)
243
+ max_end = max(end_times)
244
+
245
+ max_first_token = max(first_token_times)
246
+ pp_duration = max_first_token - min_start
247
+
248
+ if pp_duration > 0:
249
+ batch_pp_throughput = batch_prompt_tokens / pp_duration
250
+ agg_batch_pp_throughputs.append(batch_pp_throughput)
251
+
252
+ min_first_token = min(first_token_times)
253
+
254
+ # Use max(last_token_times) instead of max(end_times) to remove protocol overhead (headers, [DONE], etc)
255
+ # This makes the throughput metric purely about token generation speed.
256
+ max_last_token = max(last_token_times) if last_token_times else max_end
257
+ tg_duration = max_last_token - min_first_token
258
+
259
+ if tg_duration > 0:
260
+ if batch_gen_tokens > len(valid_results):
261
+ batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
262
+ agg_batch_tg_throughputs.append(batch_tg_throughput)
263
+
264
+ if all_token_timestamps:
265
+ peak = self._calculate_peak_throughput(all_token_timestamps)
266
+ agg_peak_throughputs.append(peak)
267
+
268
+
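A worked example of the two batch-level formulas above, with made-up timings for a batch of two requests (times in seconds):

```python
# Prefill: tokens from all prompts divided by the span from the earliest
# request start to the latest first token.
batch_prompt_tokens = 2048            # 1024 prompt tokens per request
pp_throughput = batch_prompt_tokens / (0.80 - 0.00)       # 2560.0 t/s

# Decode: generated tokens (minus one per request, whose timing belongs to
# prefill) divided by the span from the earliest first token to the latest
# last token.
batch_gen_tokens = 256                # 128 generated tokens per request
tg_throughput = (batch_gen_tokens - 2) / (4.60 - 0.60)    # 63.5 t/s
```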
269
+ def _generate_rows(self) -> List[Dict[str, Any]]:
270
+ rows = []
271
+ for run in self.runs:
272
+ c_suffix = ""
273
+ if self.metadata and self.metadata.max_concurrency > 1:
274
+ c_suffix = f" (c{run.concurrency})"
275
+
276
+ if run.is_context_prefill_phase:
277
+ # Context Phase Prompt Processing
278
+ if run.pp_throughput:
279
+ rows.append({
280
+ "model": self.model_name or "Unknown",
281
+ "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
282
+ "t_s": run.pp_throughput,
283
+ "t_s_req": run.pp_req_throughput,
284
+ "peak_ts": None,
285
+ "ttfr": run.ttfr,
286
+ "est_ppt": run.est_ppt,
287
+ "e2e_ttft": run.e2e_ttft
288
+ })
289
+
290
+ # Context Phase Token Generation
291
+ if run.tg_throughput:
292
+ rows.append({
293
+ "model": self.model_name or "Unknown",
294
+ "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
295
+ "t_s": run.tg_throughput,
296
+ "t_s_req": run.tg_req_throughput,
297
+ "peak_ts": run.peak_throughput,
298
+ "ttfr": None,
299
+ "est_ppt": None,
300
+ "e2e_ttft": None
301
+ })
302
+ else:
303
+ # Standard Phase
304
+ d_suffix = f" @ d{run.context_size}" if run.context_size > 0 else ""
305
+
306
+ # Prompt Processing
307
+ if run.pp_throughput:
308
+ rows.append({
309
+ "model": self.model_name or "Unknown",
310
+ "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
311
+ "t_s": run.pp_throughput,
312
+ "t_s_req": run.pp_req_throughput,
313
+ "peak_ts": None,
314
+ "ttfr": run.ttfr,
315
+ "est_ppt": run.est_ppt,
316
+ "e2e_ttft": run.e2e_ttft
317
+ })
318
+
319
+ # Token Generation
320
+ if run.tg_throughput:
321
+ rows.append({
322
+ "model": self.model_name or "Unknown",
323
+ "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
324
+ "t_s": run.tg_throughput,
325
+ "t_s_req": run.tg_req_throughput,
326
+ "peak_ts": run.peak_throughput,
327
+ "ttfr": None,
328
+ "est_ppt": None,
329
+ "e2e_ttft": None
330
+ })
331
+ return rows
332
+
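For orientation, the `test_name` strings built above come out like this (sizes and concurrency are illustrative):

```python
examples = [
    "pp512",                # prompt processing, depth 0, single request
    "pp512 @ d1024 (c4)",   # 512-token prompt on a 1024-token context, 4 concurrent requests
    "tg128 @ d1024 (c4)",   # the matching token-generation row
    "ctx_pp @ d1024 (c4)",  # context-load phase when prefix caching is measured
]
# The "(cN)" suffix only appears when the suite's max concurrency is above 1.
```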
333
+ def _generate_md_report(self, concurrency: int) -> str:
334
+ rows = self._generate_rows()
335
+ if not rows:
336
+ return "No results collected. Check if the model is generating tokens."
337
+
338
+ def fmt(metric: Optional[BenchmarkMetric]) -> str:
339
+ if metric is None:
340
+ return ""
341
+ return f"{metric.mean:.2f} ± {metric.std:.2f}"
342
+
343
+ data = [[
344
+ row["model"],
345
+ row["test_name"],
346
+ fmt(row["t_s"]),
347
+ fmt(row["t_s_req"]),
348
+ fmt(row["peak_ts"]),
349
+ fmt(row["ttfr"]),
350
+ fmt(row["est_ppt"]),
351
+ fmt(row["e2e_ttft"])
352
+ ] for row in rows]
353
+
354
+ ts_header = "t/s (total)" if concurrency > 1 else "t/s"
355
+ headers = ["model", "test", ts_header, "t/s (req)", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
356
+
357
+ if concurrency == 1:
358
+ data = [[
359
+ row["model"],
360
+ row["test_name"],
361
+ fmt(row["t_s"]),
362
+ fmt(row["peak_ts"]),
363
+ fmt(row["ttfr"]),
364
+ fmt(row["est_ppt"]),
365
+ fmt(row["e2e_ttft"])
366
+ ] for row in rows]
367
+ headers = ["model", "test", ts_header, "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
368
+
369
+ return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right", "right"))
370
+
371
+ def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
372
+ msg = ""
373
+ if filename:
374
+ msg += f"Saving results to {filename} in {format.upper()} format...\n"
375
+ else:
376
+ msg += f"Printing results in {format.upper()} format:\n"
377
+
378
+ print(f"{msg}\n")
379
+
380
+ if format == "md":
381
+ output = self._generate_md_report(concurrency)
382
+ if filename:
383
+ with open(filename, "w") as f:
384
+ f.write(output)
385
+ else:
386
+ print("\n" + output)
387
+
388
+ elif format == "json":
389
+ data = asdict(self.metadata) if self.metadata else {}
390
+ data["benchmarks"] = [asdict(run) for run in self.runs]
391
+
392
+ if filename:
393
+ with open(filename, "w") as f:
394
+ json.dump(data, f, indent=2)
395
+ else:
396
+ print(json.dumps(data, indent=2))
397
+
398
+ elif format == "csv":
399
+ rows = self._generate_rows()
400
+ csv_rows = []
401
+ headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "peak_ts_mean", "peak_ts_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
402
+
403
+ for r in rows:
404
+ row = {
405
+ "model": r["model"],
406
+ "test_name": r["test_name"],
407
+ "t_s_mean": r["t_s"].mean if r["t_s"] else None,
408
+ "t_s_std": r["t_s"].std if r["t_s"] else None,
409
+ "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
410
+ "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
411
+ "peak_ts_mean": r["peak_ts"].mean if r["peak_ts"] else None,
412
+ "peak_ts_std": r["peak_ts"].std if r["peak_ts"] else None,
413
+ "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
414
+ "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
415
+ "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
416
+ "est_ppt_std": r["est_ppt"].std if r["est_ppt"] else None,
417
+ "e2e_ttft_mean": r["e2e_ttft"].mean if r["e2e_ttft"] else None,
418
+ "e2e_ttft_std": r["e2e_ttft"].std if r["e2e_ttft"] else None,
419
+ }
420
+ csv_rows.append(row)
421
+
422
+ output_file = filename if filename else sys.stdout
423
+ is_file = isinstance(output_file, str)
424
+
425
+ if is_file:
426
+ with open(output_file, "w", newline="") as f:
427
+ writer = csv.DictWriter(f, fieldnames=headers)
428
+ writer.writeheader()
429
+ writer.writerows(csv_rows)
430
+ else:
431
+ writer = csv.DictWriter(sys.stdout, fieldnames=headers)
432
+ writer.writeheader()
433
+ writer.writerows(csv_rows)
434
+
435
+
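Pulling the pieces together, the class above is exercised roughly as follows. The module path and, in particular, the `RequestResult` keyword arguments are assumptions inferred from the attributes read in `_process_batch`; the client's real constructor is not shown in this diff.

```python
from llama_benchy.results import BenchmarkResults   # assumed module name
from llama_benchy.client import RequestResult       # constructor fields assumed below

results = BenchmarkResults()

# One request: 512 prompt tokens, 128 generated tokens streamed every 20 ms.
req = RequestResult(
    start_ts=0.0,
    first_response_ts=0.15,
    first_token_ts=0.20,
    end_ts=2.74,
    token_timestamps=[0.20 + 0.02 * i for i in range(128)],
    prompt_tokens=512,
    total_tokens=128,
    error=None,
)

# One configuration, one run, one batch containing that single request.
results.add(
    model="my-model", pp=512, tg=128, depth=0, concurrency=1,
    run_results=[[req]], latency=0.05, expected_pp_tokens=512,
)
results.save_report(None, "md", concurrency=1)   # prints a Markdown table to stdout
```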
llama_benchy/runner.py ADDED
@@ -0,0 +1,155 @@
1
+ import asyncio
2
+ import subprocess
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from typing import List
6
+ import aiohttp
7
+
8
+ from ._version import __version__
9
+ from .config import BenchmarkConfig
10
+ from .client import LLMClient
11
+ from .prompts import PromptGenerator
12
+ from .results import BenchmarkResults, BenchmarkMetadata
13
+
14
+ class BenchmarkRunner:
15
+ def __init__(self, config: BenchmarkConfig, client: LLMClient, prompt_generator: PromptGenerator):
16
+ self.config = config
17
+ self.client = client
18
+ self.prompt_gen = prompt_generator
19
+ self.results = BenchmarkResults()
20
+
21
+ # We need to track deltas from warmup to adapt prompts
22
+ self.delta_user = 0
23
+ self.delta_context = 0
24
+
25
+ async def run_suite(self):
26
+ # Initialize session
27
+ timeout = aiohttp.ClientTimeout(total=3600)
28
+ max_concurrency = max(self.config.concurrency_levels)
29
+ connector = aiohttp.TCPConnector(limit=max_concurrency + 5, force_close=False, keepalive_timeout=600)
30
+
31
+ async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
32
+ # Warmup
33
+ should_warmup = not self.config.no_warmup
34
+ if self.config.adapt_prompt:
35
+ should_warmup = True
36
+
37
+ if should_warmup:
38
+ tokenizer = self.prompt_gen.corpus.get_tokenizer() if self.config.adapt_prompt else None
39
+ self.delta_user, self.delta_context = await self.client.warmup(session, tokenizer)
40
+
41
+ # Measure latency
42
+ latency = await self.client.measure_latency(session, self.config.latency_mode)
43
+
44
+ # Main Loop
45
+ for depth in self.config.depths:
46
+ for pp in self.config.pp_counts:
47
+ for tg in self.config.tg_counts:
48
+ for concurrency in self.config.concurrency_levels:
49
+ print(f"Running test: pp={pp}, tg={tg}, depth={depth}, concurrency={concurrency}")
50
+
51
+ run_std_results = []
52
+ run_ctx_results = []
53
+ expected_pp = pp
54
+ expected_ctx = depth
55
+
56
+ for run in range(self.config.num_runs):
57
+
58
+ # Adapt prompt tokens
59
+ current_pp = pp
60
+ current_depth = depth
61
+ if self.config.adapt_prompt:
62
+ if depth == 0:
63
+ current_pp = max(1, pp - self.delta_user)
64
+ else:
65
+ current_depth = max(1, depth - self.delta_context)
66
+
67
+ expected_pp = current_pp
68
+ expected_ctx = current_depth
69
+
70
+ prompt_batch = self.prompt_gen.generate_batch(
71
+ concurrency,
72
+ current_pp,
73
+ current_depth,
74
+ self.config.no_cache
75
+ )
76
+
77
+ if self.config.enable_prefix_caching and depth > 0:
78
+ # Phase 1: Context Load
79
+ print(f" Run {run+1}/{self.config.num_runs} (Context Load, batch size {concurrency})...")
80
+ load_tasks = []
81
+ for i in range(concurrency):
82
+ context, _ = prompt_batch[i]
83
+ load_tasks.append(self.client.run_generation(
84
+ session,
85
+ context_text=context,
86
+ prompt_text="",
87
+ max_tokens=tg,
88
+ no_cache=self.config.no_cache
89
+ ))
90
+
91
+ load_results = await asyncio.gather(*load_tasks)
92
+ run_ctx_results.append(load_results)
93
+
94
+ # Phase 2: Inference
95
+ print(f" Run {run+1}/{self.config.num_runs} (Inference, batch size {concurrency})...")
96
+ inf_tasks = []
97
+ for i in range(concurrency):
98
+ context, prompt = prompt_batch[i]
99
+ inf_tasks.append(self.client.run_generation(
100
+ session,
101
+ context_text=context,
102
+ prompt_text=prompt,
103
+ max_tokens=tg,
104
+ no_cache=self.config.no_cache
105
+ ))
106
+
107
+ batch_results = await asyncio.gather(*inf_tasks)
108
+ run_std_results.append(batch_results)
109
+
110
+ else:
111
+ # Standard Run
112
+ print(f" Run {run+1}/{self.config.num_runs} (batch size {concurrency})...")
113
+ expected_tokens = current_pp + current_depth
114
+ batch_tasks = []
115
+ for i in range(concurrency):
116
+ context, prompt = prompt_batch[i]
117
+ batch_tasks.append(self.client.run_generation(
118
+ session,
119
+ context_text=context,
120
+ prompt_text=prompt,
121
+ max_tokens=tg,
122
+ no_cache=self.config.no_cache
123
+ ))
124
+
125
+ batch_results = await asyncio.gather(*batch_tasks)
126
+ run_std_results.append(batch_results)
127
+
128
+
129
+ # Post Run Command
130
+ if self.config.post_run_cmd:
131
+ try:
132
+ subprocess.run(self.config.post_run_cmd, shell=True, check=True)
133
+ except subprocess.CalledProcessError as e:
134
+ print(f"Post-run command failed: {e}")
135
+
136
+ # Aggregate and Record
137
+ if self.config.enable_prefix_caching and depth > 0:
138
+ self.results.add(self.config.model, pp, tg, depth, concurrency, run_ctx_results, latency, expected_ctx, is_context_phase=True)
139
+ self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp, is_context_phase=False)
140
+ else:
141
+ # Standard run expected tokens = pp + depth (usually depth=0 or concatenated)
142
+ # In the loop above: expected_tokens = current_pp + current_depth
143
+ self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp + expected_ctx, is_context_phase=False)
144
+
145
+ self.results.metadata = BenchmarkMetadata(
146
+ version=__version__,
147
+ timestamp=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ"),
148
+ latency_mode=self.config.latency_mode,
149
+ latency_ms=latency * 1000,
150
+ model=self.config.model,
151
+ prefix_caching_enabled=self.config.enable_prefix_caching,
152
+ max_concurrency=max(self.config.concurrency_levels) if self.config.concurrency_levels else 1
153
+ )
154
+
155
+ self.results.save_report(self.config.save_result, self.config.result_format, max(self.config.concurrency_levels) if self.config.concurrency_levels else 1)
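When the configured result format is `json`, the report written by that final `save_report` call has the shape below, derived from the `asdict` serialization in the results module; the values shown are illustrative only.

```python
report = {
    "version": "0.2.1",
    "timestamp": "2026-01-02 12:00:00Z",
    "latency_mode": "...",            # whatever --latency-mode was set to
    "latency_ms": 2.5,
    "model": "my-model",
    "prefix_caching_enabled": False,
    "max_concurrency": 4,
    "benchmarks": [
        {
            "concurrency": 4,
            "context_size": 0,
            "prompt_size": 512,
            "response_size": 128,
            "is_context_prefill_phase": False,
            # Each metric is {"mean": ..., "std": ..., "values": [...]} or None.
            "pp_throughput": {"mean": 2560.0, "std": 35.0, "values": [2525.0, 2595.0]},
            "tg_throughput": {"mean": 50.0, "std": 1.2, "values": [48.8, 51.2]},
            # pp_req_throughput, tg_req_throughput, peak_throughput, ttfr,
            # est_ppt and e2e_ttft follow the same structure.
        },
    ],
}
```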
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llama-benchy
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
5
5
  Project-URL: Homepage, https://github.com/eugr/llama-benchy
6
6
  Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
@@ -43,6 +43,12 @@ Requires-Dist: openai
43
43
  Requires-Dist: requests
44
44
  Requires-Dist: tabulate
45
45
  Requires-Dist: transformers
46
+ Provides-Extra: dev
47
+ Requires-Dist: fastapi; extra == 'dev'
48
+ Requires-Dist: pydantic; extra == 'dev'
49
+ Requires-Dist: pytest; extra == 'dev'
50
+ Requires-Dist: pytest-asyncio; extra == 'dev'
51
+ Requires-Dist: uvicorn; extra == 'dev'
46
52
  Description-Content-Type: text/markdown
47
53
 
48
54
  # llama-benchy - llama-bench style benchmarking tool for all backends
@@ -75,12 +81,12 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
75
81
  - Downloads a book from Project Gutenberg to use as source text for prompts to ensure better benchmarking of spec.decoding/MTP models.
76
82
  - Supports executing a command after each run (e.g., to clear cache).
77
83
  - Configurable latency measurement mode.
84
+ - Supports concurrent requests (`--concurrency`) to measure throughput under load.
85
+ - Can save results to file in Markdown, JSON, or CSV format.
78
86
 
79
87
  # Current Limitations
80
88
 
81
89
  - Evaluates against `/v1/chat/completions` endpoint only.
82
- - Doesn't measure throughput in concurrency mode (coming later).
83
- - Outputs results as a Markdown table only for now.
84
90
 
85
91
  ## Installation
86
92
 
@@ -215,6 +221,9 @@ Generally you don't need to disable prompt caching on the server, as a probabili
215
221
  - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
216
222
  - `--no-adapt-prompt`: Disable prompt size adaptation.
217
223
  - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
224
+ - `--concurrency`: List of concurrency levels, i.e. the number of concurrent requests per test (Default: [1]).
225
+ - `--save-result`: File to save results to.
226
+ - `--format`: Output format: 'md', 'json', 'csv' (Default: 'md').
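For example, a sweep over several concurrency levels that writes a JSON report could look like the following. The space-separated list syntax for `--concurrency` is inferred from the description above, and the endpoint/model flags from the earlier examples are omitted, so treat this as a sketch rather than a copy-paste command:

```bash
llama-benchy \
  --concurrency 1 2 4 \
  --save-result results.json \
  --format json
```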
218
227
 
219
228
  ### Metrics
220
229
 
@@ -230,6 +239,9 @@ The script attempts to estimate network or processing latency to provide "server
230
239
 
231
240
  #### Table Columns
232
241
 
242
+ - When `concurrency` > 1:
243
+ - **`t/s (total)`**: Total throughput across all concurrent requests.
244
+ - **`t/s (req)`**: Average throughput per individual request.
233
245
  - **`t/s` (Tokens per Second)**:
234
246
  - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
235
247
  - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
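A quick worked example of the two formulas above, with made-up numbers (times in seconds):

```python
# Prompt processing: 512 prompt tokens with est_ppt = 0.20 s -> 2560 t/s prefill.
pp_ts = 512 / 0.20

# Token generation: 128 tokens, first token at 0.20 s, last at 2.74 s
# -> (128 - 1) / (2.74 - 0.20) = 50 t/s decode.
tg_ts = (128 - 1) / (2.74 - 0.20)
```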
@@ -269,3 +281,31 @@ llama-benchy \
269
281
  ```
270
282
 
271
283
  This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
284
+
285
+ ## Development
286
+
287
+ ### Running Integration Tests
288
+
289
+ This repository includes a mock server and an integration test suite to verify `llama-benchy` logic without needing a real GPU server.
290
+
291
+ The mock server emulates:
292
+ - **Prompt Processing (PP):** ~1000 t/s (drift-corrected).
293
+ - **Token Generation (TG):** ~50 t/s.
294
+ - **Prefix Caching:** Emulates cache hits by skipping processing time for cached prefixes (system messages).
295
+ - **OpenAI API Compatibility:** Serves `/v1/chat/completions` and `/v1/models`.
296
+
297
+ To run the integration tests:
298
+
299
+ ```bash
300
+ # Install development dependencies
301
+ uv sync --all-extras --dev
302
+
303
+ # Run tests
304
+ uv run pytest tests/test_mock_integration.py
305
+ ```
306
+
307
+ This test will:
308
+ 1. Spin up the mock server on port 8001.
309
+ 2. Run `llama-benchy` against it.
310
+ 3. Parse the JSON output.
311
+ 4. Verify that throughputs match the emulated speeds (PP ~1000, TG ~50) and that caching effectively increases effective throughput.