llama-benchy 0.2.0-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
llama_benchy/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
+__version__ = version = '0.2.1'
+__version_tuple__ = version_tuple = (0, 2, 1)
 
 __commit_id__ = commit_id = None
llama_benchy/client.py CHANGED
@@ -4,7 +4,7 @@ import codecs
 import aiohttp
 import asyncio
 import numpy as np
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional, List, Dict, Any
 
 @dataclass
@@ -16,6 +16,7 @@ class RequestResult:
     prompt_tokens: int = 0
     total_tokens: int = 0
     error: Optional[str] = None
+    token_timestamps: List[float] = field(default_factory=list)
 
 class LLMClient:
     def __init__(self, base_url: str, api_key: str, model_name: str):
@@ -187,6 +188,7 @@ class LLMClient:
                     result.first_token_ts = chunk_time
 
                 result.total_tokens += 1
+                result.token_timestamps.append(chunk_time)
             except json.JSONDecodeError:
                 continue
 
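The new token_timestamps field is what feeds the peak-throughput metric added in results.py below: the streaming loop stamps every decoded chunk with its arrival time. A minimal sketch of the pattern, with illustrative names rather than llama-benchy's actual client internals:

import time
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class StreamStats:
    # dataclasses forbid mutable defaults, hence field(default_factory=list) --
    # the same reason the diff adds `field` to the import above
    token_timestamps: List[float] = field(default_factory=list)
    first_token_ts: Optional[float] = None
    total_tokens: int = 0

def record_chunk(stats: StreamStats) -> None:
    # Called once per decoded streaming chunk; timestamps stay in arrival order
    now = time.monotonic()
    if stats.first_token_ts is None:
        stats.first_token_ts = now
    stats.total_tokens += 1
    stats.token_timestamps.append(now)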
llama_benchy/results.py CHANGED
@@ -37,6 +37,7 @@ class BenchmarkRun:
     pp_req_throughput: Optional[BenchmarkMetric]
     tg_throughput: Optional[BenchmarkMetric]
     tg_req_throughput: Optional[BenchmarkMetric]
+    peak_throughput: Optional[BenchmarkMetric]
     ttfr: Optional[BenchmarkMetric]
     est_ppt: Optional[BenchmarkMetric]
     e2e_ttft: Optional[BenchmarkMetric]
@@ -57,6 +58,33 @@ class BenchmarkResults:
             values=scaled_values
         )
 
+    def _calculate_peak_throughput(self, all_timestamps: List[float], window: float = 1.0) -> float:
+        if not all_timestamps:
+            return 0.0
+
+        all_timestamps.sort()
+
+        # If total duration is less than the window, use actual duration to calculate rate
+        # This handles short bursts correctly where Peak would otherwise be < Mean
+        total_duration = all_timestamps[-1] - all_timestamps[0]
+        if total_duration < window and total_duration > 0:
+            return len(all_timestamps) / total_duration
+
+        max_tokens = 0
+
+        start_idx = 0
+        for end_idx, end_time in enumerate(all_timestamps):
+            # Window starts at end_time - window
+            while start_idx < end_idx and all_timestamps[start_idx] <= end_time - window:
+                start_idx += 1
+
+            # Count includes current token, so range is [start_idx, end_idx]
+            current_tokens = end_idx - start_idx + 1
+            if current_tokens > max_tokens:
+                max_tokens = current_tokens
+
+        return float(max_tokens) / window
+
     def add(self,
             model: str,
             pp: int,
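To see what the sliding window measures, here is a standalone restatement of the same logic run on toy timestamps (values are illustrative, not benchmark output). Six tokens arriving in the first half-second push the peak above the overall mean rate:

from typing import List

def peak_throughput(timestamps: List[float], window: float = 1.0) -> float:
    # Same sliding-window logic as the hunk above, restated for illustration
    if not timestamps:
        return 0.0
    ts = sorted(timestamps)
    total = ts[-1] - ts[0]
    if 0 < total < window:
        # Short burst: rate over the actual duration, so peak >= mean
        return len(ts) / total
    max_tokens, start = 0, 0
    for end, end_time in enumerate(ts):
        # Slide the window's left edge past timestamps at or before (end_time - window)
        while start < end and ts[start] <= end_time - window:
            start += 1
        max_tokens = max(max_tokens, end - start + 1)
    return max_tokens / window

# Six tokens in the first 0.5 s, then four slower tokens out to t = 2.0 s
ts = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 1.4, 1.8, 2.0]
print(peak_throughput(ts))           # 6.0 t/s: the densest 1 s window holds 6 tokens
print(len(ts) / (ts[-1] - ts[0]))    # 5.0 t/s mean over the whole run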
@@ -81,6 +109,7 @@
 
         agg_batch_pp_throughputs = []
         agg_batch_tg_throughputs = []
+        agg_peak_throughputs = []
 
         for batch in run_results:
             self._process_batch(
@@ -94,7 +123,8 @@
                 agg_est_ppt_values,
                 agg_e2e_ttft_values,
                 agg_batch_pp_throughputs,
-                agg_batch_tg_throughputs
+                agg_batch_tg_throughputs,
+                agg_peak_throughputs
             )
 
         # Calculate metrics for BenchmarkRun
@@ -104,6 +134,8 @@
         run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
         run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
 
+        run_metric_peak_throughput = self._calculate_metric(agg_peak_throughputs)
+
         run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
         run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
         run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
@@ -118,6 +150,7 @@
             pp_req_throughput=run_metric_pp_req_throughput,
             tg_throughput=run_metric_tg_throughput,
             tg_req_throughput=run_metric_tg_req_throughput,
+            peak_throughput=run_metric_peak_throughput,
             ttfr=run_metric_ttfr,
             est_ppt=run_metric_est_ppt,
             e2e_ttft=run_metric_e2e_ttft
@@ -134,7 +167,8 @@
                        agg_est_ppt_values: List[float],
                        agg_e2e_ttft_values: List[float],
                        agg_batch_pp_throughputs: List[float],
-                       agg_batch_tg_throughputs: List[float]):
+                       agg_batch_tg_throughputs: List[float],
+                       agg_peak_throughputs: List[float]):
 
         valid_results = [r for r in results if r and not r.error]
         if not valid_results:
@@ -146,10 +180,21 @@
         start_times = []
         end_times = []
         first_token_times = []
+        last_token_times = []
+
+        # Collect all token timestamps for peak calculation
+        all_token_timestamps = []
 
         for res in valid_results:
             start_times.append(res.start_ts)
             end_times.append(res.end_ts)
+            all_token_timestamps.extend(res.token_timestamps)
+
+            if res.token_timestamps:
+                last_token_times.append(res.token_timestamps[-1])
+            elif res.end_ts:
+                # Fallback if no timestamps recorded but request finished
+                last_token_times.append(res.end_ts)
 
         # Use reported usage if available and reasonable, else expected
         prompt_tokens = expected_pp_tokens
@@ -205,13 +250,21 @@
             agg_batch_pp_throughputs.append(batch_pp_throughput)
 
         min_first_token = min(first_token_times)
-        tg_duration = max_end - min_first_token
+
+        # Use max(last_token_times) instead of max(end_times) to remove protocol overhead (headers, [DONE], etc)
+        # This makes the throughput metric purely about token generation speed.
+        max_last_token = max(last_token_times) if last_token_times else max_end
+        tg_duration = max_last_token - min_first_token
 
         if tg_duration > 0:
             if batch_gen_tokens > len(valid_results):
                 batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
                 agg_batch_tg_throughputs.append(batch_tg_throughput)
 
+        if all_token_timestamps:
+            peak = self._calculate_peak_throughput(all_token_timestamps)
+            agg_peak_throughputs.append(peak)
+
 
     def _generate_rows(self) -> List[Dict[str, Any]]:
         rows = []
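The switch from max(end_times) to max(last_token_times) matters whenever the stream stays open after the final token. Toy numbers (illustrative only) show the size of the effect:

# Hypothetical single batch: 256 generated tokens
min_first_token = 0.0   # first token arrives at t = 0.0 s
max_last_token  = 3.2   # final token arrives at t = 3.2 s
max_end         = 3.5   # stream closes at t = 3.5 s ([DONE] marker, trailing frames)

tokens = 256
print(tokens / (max_end - min_first_token))         # ~73.1 t/s, deflated by protocol overhead
print(tokens / (max_last_token - min_first_token))  # 80.0 t/s, pure generation speed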
@@ -228,6 +281,7 @@
                 "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
                 "t_s": run.pp_throughput,
                 "t_s_req": run.pp_req_throughput,
+                "peak_ts": None,
                 "ttfr": run.ttfr,
                 "est_ppt": run.est_ppt,
                 "e2e_ttft": run.e2e_ttft
@@ -240,6 +294,7 @@
                 "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
                 "t_s": run.tg_throughput,
                 "t_s_req": run.tg_req_throughput,
+                "peak_ts": run.peak_throughput,
                 "ttfr": None,
                 "est_ppt": None,
                 "e2e_ttft": None
@@ -255,6 +310,7 @@
                 "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
                 "t_s": run.pp_throughput,
                 "t_s_req": run.pp_req_throughput,
+                "peak_ts": None,
                 "ttfr": run.ttfr,
                 "est_ppt": run.est_ppt,
                 "e2e_ttft": run.e2e_ttft
@@ -267,6 +323,7 @@
                 "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
                 "t_s": run.tg_throughput,
                 "t_s_req": run.tg_req_throughput,
+                "peak_ts": run.peak_throughput,
                 "ttfr": None,
                 "est_ppt": None,
                 "e2e_ttft": None
@@ -288,26 +345,28 @@
                 row["test_name"],
                 fmt(row["t_s"]),
                 fmt(row["t_s_req"]),
+                fmt(row["peak_ts"]),
                 fmt(row["ttfr"]),
                 fmt(row["est_ppt"]),
                 fmt(row["e2e_ttft"])
             ] for row in rows]
 
         ts_header = "t/s (total)" if concurrency > 1 else "t/s"
-        headers = ["model", "test", ts_header, "t/s (req)", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+        headers = ["model", "test", ts_header, "t/s (req)", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
 
         if concurrency == 1:
             data = [[
                 row["model"],
                 row["test_name"],
-                fmt(row["t_s"]),
+                fmt(row["t_s"]),
+                fmt(row["peak_ts"]),
                 fmt(row["ttfr"]),
                 fmt(row["est_ppt"]),
                 fmt(row["e2e_ttft"])
             ] for row in rows]
-            headers = ["model", "test", ts_header, "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+            headers = ["model", "test", ts_header, "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
 
-        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right"))
+        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right", "right"))
 
     def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
         msg = ""
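For reference, the report gains one more right-aligned column in both layouts. A quick sketch of the concurrency == 1 shape, with placeholder values rather than real measurements:

from tabulate import tabulate

headers = ["model", "test", "t/s", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
data = [["some-model", "tg128", "41.20", "44.00", "120.50", "118.30", "125.10"]]  # placeholders
print(tabulate(data, headers=headers, tablefmt="pipe",
               colalign=("left", "right", "right", "right", "right", "right", "right")))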
@@ -339,7 +398,7 @@
         elif format == "csv":
             rows = self._generate_rows()
             csv_rows = []
-            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
+            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "peak_ts_mean", "peak_ts_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
 
             for r in rows:
                 row = {
@@ -349,6 +408,8 @@
                     "t_s_std": r["t_s"].std if r["t_s"] else None,
                     "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
                     "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
+                    "peak_ts_mean": r["peak_ts"].mean if r["peak_ts"] else None,
+                    "peak_ts_std": r["peak_ts"].std if r["peak_ts"] else None,
                     "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
                     "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
                     "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
llama_benchy-0.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.2.0
+Version: 0.2.1
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Project-URL: Homepage, https://github.com/eugr/llama-benchy
 Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
llama_benchy-0.2.1.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
 llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
 llama_benchy/__main__.py,sha256=rk0Re1dehcJNIxIRsTRF_HCvcDEb20nMV05pYtG7FIw,1384
-llama_benchy/_version.py,sha256=Dg8AmJomLVpjKL6prJylOONZAPRtB86LOce7dorQS_A,704
-llama_benchy/client.py,sha256=dYFwlFJvr0aSThb6lN6coQt2KJy8tYb-BhDobniviV8,8362
+llama_benchy/_version.py,sha256=vYqoJTG51NOUmYyL0xt8asRK8vUT4lGAdal_EZ59mvw,704
+llama_benchy/client.py,sha256=SJ7jOT9q2hdjXIfEIaggTx6h2IlvE5ZAAyQzKbP1NLU,8516
 llama_benchy/config.py,sha256=FV4jyEHm2G-lU2wX1atq--lLW-53uZQRWrWc00Qrnwc,4462
 llama_benchy/corpus.py,sha256=b0RSkN8bpySiPEToH_XZR3hHKYz752BjsNqlE-78nPY,2404
 llama_benchy/prompts.py,sha256=AUgAOKK2QIBb9DcwhgIrRTGxIqXiFjD7D-Ek0A3mmEk,2090
-llama_benchy/results.py,sha256=jP2UUe5juHu5XDwgiS-7rCbPdbiU0XMn-DcqjVXiCNY,14453
+llama_benchy/results.py,sha256=x_3MlaZ959Mm8l-jqy_Jo73ajzp45FIus3jgs9ia_XM,17324
 llama_benchy/runner.py,sha256=PSycdp6nkgkWuW7DYsAJpw2PWBuQXgGEpVUci-r1dDo,8579
-llama_benchy-0.2.0.dist-info/METADATA,sha256=MTE8qthP3WoVJ-crPIUBuGN3rnqi_v1jEOOGL4Dda5A,15012
-llama_benchy-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llama_benchy-0.2.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
-llama_benchy-0.2.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
-llama_benchy-0.2.0.dist-info/RECORD,,
+llama_benchy-0.2.1.dist-info/METADATA,sha256=1zmNoa8cga70yfLREsXlhBNZyXFH0CKieFr6W-h0oyY,15012
+llama_benchy-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llama_benchy-0.2.1.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+llama_benchy-0.2.1.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+llama_benchy-0.2.1.dist-info/RECORD,,