llama-benchy 0.2.0-py3-none-any.whl → 0.2.1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
llama_benchy/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
+__version__ = version = '0.2.1'
+__version_tuple__ = version_tuple = (0, 2, 1)
 
 __commit_id__ = commit_id = None
llama_benchy/client.py CHANGED
@@ -4,7 +4,7 @@ import codecs
 import aiohttp
 import asyncio
 import numpy as np
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional, List, Dict, Any
 
 @dataclass
@@ -16,6 +16,7 @@ class RequestResult:
     prompt_tokens: int = 0
     total_tokens: int = 0
     error: Optional[str] = None
+    token_timestamps: List[float] = field(default_factory=list)
 
 class LLMClient:
     def __init__(self, base_url: str, api_key: str, model_name: str):
@@ -187,6 +188,7 @@ class LLMClient:
                     result.first_token_ts = chunk_time
 
                 result.total_tokens += 1
+                result.token_timestamps.append(chunk_time)
             except json.JSONDecodeError:
                 continue
 
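The new token_timestamps field is what feeds the peak-throughput metric added in results.py below: the streaming loop stamps every decoded chunk with its arrival time. A minimal sketch of the pattern, with illustrative names rather than llama-benchy's actual client internals:

import time
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class StreamStats:
    # dataclasses forbid mutable defaults, hence field(default_factory=list) --
    # the same reason the diff adds `field` to the import above
    token_timestamps: List[float] = field(default_factory=list)
    first_token_ts: Optional[float] = None
    total_tokens: int = 0

def record_chunk(stats: StreamStats) -> None:
    # Called once per decoded streaming chunk; timestamps stay in arrival order
    now = time.monotonic()
    if stats.first_token_ts is None:
        stats.first_token_ts = now
    stats.total_tokens += 1
    stats.token_timestamps.append(now)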
llama_benchy/results.py CHANGED
@@ -37,6 +37,7 @@ class BenchmarkRun:
     pp_req_throughput: Optional[BenchmarkMetric]
     tg_throughput: Optional[BenchmarkMetric]
     tg_req_throughput: Optional[BenchmarkMetric]
+    peak_throughput: Optional[BenchmarkMetric]
     ttfr: Optional[BenchmarkMetric]
     est_ppt: Optional[BenchmarkMetric]
     e2e_ttft: Optional[BenchmarkMetric]
@@ -57,6 +58,33 @@ class BenchmarkResults:
             values=scaled_values
         )
 
+    def _calculate_peak_throughput(self, all_timestamps: List[float], window: float = 1.0) -> float:
+        if not all_timestamps:
+            return 0.0
+
+        all_timestamps.sort()
+
+        # If total duration is less than the window, use actual duration to calculate rate
+        # This handles short bursts correctly where Peak would otherwise be < Mean
+        total_duration = all_timestamps[-1] - all_timestamps[0]
+        if total_duration < window and total_duration > 0:
+            return len(all_timestamps) / total_duration
+
+        max_tokens = 0
+
+        start_idx = 0
+        for end_idx, end_time in enumerate(all_timestamps):
+            # Window starts at end_time - window
+            while start_idx < end_idx and all_timestamps[start_idx] <= end_time - window:
+                start_idx += 1
+
+            # Count includes current token, so range is [start_idx, end_idx]
+            current_tokens = end_idx - start_idx + 1
+            if current_tokens > max_tokens:
+                max_tokens = current_tokens
+
+        return float(max_tokens) / window
+
     def add(self,
             model: str,
             pp: int,
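To see what the sliding window measures, here is a standalone restatement of the same logic run on toy timestamps (values are illustrative, not benchmark output). Six tokens arriving in the first half-second push the peak above the overall mean rate:

from typing import List

def peak_throughput(timestamps: List[float], window: float = 1.0) -> float:
    # Same sliding-window logic as the hunk above, restated for illustration
    if not timestamps:
        return 0.0
    ts = sorted(timestamps)
    total = ts[-1] - ts[0]
    if 0 < total < window:
        # Short burst: rate over the actual duration, so peak >= mean
        return len(ts) / total
    max_tokens, start = 0, 0
    for end, end_time in enumerate(ts):
        # Slide the window's left edge past timestamps at or before (end_time - window)
        while start < end and ts[start] <= end_time - window:
            start += 1
        max_tokens = max(max_tokens, end - start + 1)
    return max_tokens / window

# Six tokens in the first 0.5 s, then four slower tokens out to t = 2.0 s
ts = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0, 1.4, 1.8, 2.0]
print(peak_throughput(ts))           # 6.0 t/s: the densest 1 s window holds 6 tokens
print(len(ts) / (ts[-1] - ts[0]))    # 5.0 t/s mean over the whole run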
@@ -81,6 +109,7 @@
 
         agg_batch_pp_throughputs = []
         agg_batch_tg_throughputs = []
+        agg_peak_throughputs = []
 
         for batch in run_results:
             self._process_batch(
@@ -94,7 +123,8 @@
                 agg_est_ppt_values,
                 agg_e2e_ttft_values,
                 agg_batch_pp_throughputs,
-                agg_batch_tg_throughputs
+                agg_batch_tg_throughputs,
+                agg_peak_throughputs
             )
 
         # Calculate metrics for BenchmarkRun
@@ -104,6 +134,8 @@
         run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
         run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
 
+        run_metric_peak_throughput = self._calculate_metric(agg_peak_throughputs)
+
         run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
         run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
         run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
@@ -118,6 +150,7 @@
             pp_req_throughput=run_metric_pp_req_throughput,
             tg_throughput=run_metric_tg_throughput,
             tg_req_throughput=run_metric_tg_req_throughput,
+            peak_throughput=run_metric_peak_throughput,
             ttfr=run_metric_ttfr,
             est_ppt=run_metric_est_ppt,
             e2e_ttft=run_metric_e2e_ttft
@@ -134,7 +167,8 @@
                        agg_est_ppt_values: List[float],
                        agg_e2e_ttft_values: List[float],
                        agg_batch_pp_throughputs: List[float],
-                       agg_batch_tg_throughputs: List[float]):
+                       agg_batch_tg_throughputs: List[float],
+                       agg_peak_throughputs: List[float]):
 
         valid_results = [r for r in results if r and not r.error]
         if not valid_results:
@@ -146,10 +180,21 @@
         start_times = []
         end_times = []
         first_token_times = []
+        last_token_times = []
+
+        # Collect all token timestamps for peak calculation
+        all_token_timestamps = []
 
         for res in valid_results:
             start_times.append(res.start_ts)
             end_times.append(res.end_ts)
+            all_token_timestamps.extend(res.token_timestamps)
+
+            if res.token_timestamps:
+                last_token_times.append(res.token_timestamps[-1])
+            elif res.end_ts:
+                # Fallback if no timestamps recorded but request finished
+                last_token_times.append(res.end_ts)
 
         # Use reported usage if available and reasonable, else expected
         prompt_tokens = expected_pp_tokens
@@ -205,13 +250,21 @@
             agg_batch_pp_throughputs.append(batch_pp_throughput)
 
         min_first_token = min(first_token_times)
-        tg_duration = max_end - min_first_token
+
+        # Use max(last_token_times) instead of max(end_times) to remove protocol overhead (headers, [DONE], etc)
+        # This makes the throughput metric purely about token generation speed.
+        max_last_token = max(last_token_times) if last_token_times else max_end
+        tg_duration = max_last_token - min_first_token
 
         if tg_duration > 0:
             if batch_gen_tokens > len(valid_results):
                 batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
                 agg_batch_tg_throughputs.append(batch_tg_throughput)
 
+        if all_token_timestamps:
+            peak = self._calculate_peak_throughput(all_token_timestamps)
+            agg_peak_throughputs.append(peak)
+
 
     def _generate_rows(self) -> List[Dict[str, Any]]:
         rows = []
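The switch from max(end_times) to max(last_token_times) matters whenever the stream stays open after the final token. Toy numbers (illustrative only) show the size of the effect:

# Hypothetical single batch: 256 generated tokens
min_first_token = 0.0   # first token arrives at t = 0.0 s
max_last_token  = 3.2   # final token arrives at t = 3.2 s
max_end         = 3.5   # stream closes at t = 3.5 s ([DONE] marker, trailing frames)

tokens = 256
print(tokens / (max_end - min_first_token))         # ~73.1 t/s, deflated by protocol overhead
print(tokens / (max_last_token - min_first_token))  # 80.0 t/s, pure generation speed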
@@ -228,6 +281,7 @@
                 "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
                 "t_s": run.pp_throughput,
                 "t_s_req": run.pp_req_throughput,
+                "peak_ts": None,
                 "ttfr": run.ttfr,
                 "est_ppt": run.est_ppt,
                 "e2e_ttft": run.e2e_ttft
@@ -240,6 +294,7 @@
                 "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
                 "t_s": run.tg_throughput,
                 "t_s_req": run.tg_req_throughput,
+                "peak_ts": run.peak_throughput,
                 "ttfr": None,
                 "est_ppt": None,
                 "e2e_ttft": None
@@ -255,6 +310,7 @@
                 "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
                 "t_s": run.pp_throughput,
                 "t_s_req": run.pp_req_throughput,
+                "peak_ts": None,
                 "ttfr": run.ttfr,
                 "est_ppt": run.est_ppt,
                 "e2e_ttft": run.e2e_ttft
@@ -267,6 +323,7 @@
                 "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
                 "t_s": run.tg_throughput,
                 "t_s_req": run.tg_req_throughput,
+                "peak_ts": run.peak_throughput,
                 "ttfr": None,
                 "est_ppt": None,
                 "e2e_ttft": None
@@ -288,26 +345,28 @@
                 row["test_name"],
                 fmt(row["t_s"]),
                 fmt(row["t_s_req"]),
+                fmt(row["peak_ts"]),
                 fmt(row["ttfr"]),
                 fmt(row["est_ppt"]),
                 fmt(row["e2e_ttft"])
             ] for row in rows]
 
         ts_header = "t/s (total)" if concurrency > 1 else "t/s"
-        headers = ["model", "test", ts_header, "t/s (req)", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+        headers = ["model", "test", ts_header, "t/s (req)", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
 
         if concurrency == 1:
             data = [[
                 row["model"],
                 row["test_name"],
-                fmt(row["t_s"]),
+                fmt(row["t_s"]),
+                fmt(row["peak_ts"]),
                 fmt(row["ttfr"]),
                 fmt(row["est_ppt"]),
                 fmt(row["e2e_ttft"])
             ] for row in rows]
-            headers = ["model", "test", ts_header, "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+            headers = ["model", "test", ts_header, "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
 
-        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right"))
+        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right", "right"))
 
     def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
         msg = ""
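For reference, the report gains one more right-aligned column in both layouts. A quick sketch of the concurrency == 1 shape, with placeholder values rather than real measurements:

from tabulate import tabulate

headers = ["model", "test", "t/s", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
data = [["some-model", "tg128", "41.20", "44.00", "120.50", "118.30", "125.10"]]  # placeholders
print(tabulate(data, headers=headers, tablefmt="pipe",
               colalign=("left", "right", "right", "right", "right", "right", "right")))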
@@ -339,7 +398,7 @@
         elif format == "csv":
             rows = self._generate_rows()
             csv_rows = []
-            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
+            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "peak_ts_mean", "peak_ts_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
 
             for r in rows:
                 row = {
@@ -349,6 +408,8 @@
                     "t_s_std": r["t_s"].std if r["t_s"] else None,
                     "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
                     "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
+                    "peak_ts_mean": r["peak_ts"].mean if r["peak_ts"] else None,
+                    "peak_ts_std": r["peak_ts"].std if r["peak_ts"] else None,
                     "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
                     "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
                     "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
llama_benchy-0.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.2.0
+Version: 0.2.1
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Project-URL: Homepage, https://github.com/eugr/llama-benchy
 Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
llama_benchy-0.2.1.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
 llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
 llama_benchy/__main__.py,sha256=rk0Re1dehcJNIxIRsTRF_HCvcDEb20nMV05pYtG7FIw,1384
-llama_benchy/_version.py,sha256=Dg8AmJomLVpjKL6prJylOONZAPRtB86LOce7dorQS_A,704
-llama_benchy/client.py,sha256=dYFwlFJvr0aSThb6lN6coQt2KJy8tYb-BhDobniviV8,8362
+llama_benchy/_version.py,sha256=vYqoJTG51NOUmYyL0xt8asRK8vUT4lGAdal_EZ59mvw,704
+llama_benchy/client.py,sha256=SJ7jOT9q2hdjXIfEIaggTx6h2IlvE5ZAAyQzKbP1NLU,8516
 llama_benchy/config.py,sha256=FV4jyEHm2G-lU2wX1atq--lLW-53uZQRWrWc00Qrnwc,4462
 llama_benchy/corpus.py,sha256=b0RSkN8bpySiPEToH_XZR3hHKYz752BjsNqlE-78nPY,2404
 llama_benchy/prompts.py,sha256=AUgAOKK2QIBb9DcwhgIrRTGxIqXiFjD7D-Ek0A3mmEk,2090
-llama_benchy/results.py,sha256=jP2UUe5juHu5XDwgiS-7rCbPdbiU0XMn-DcqjVXiCNY,14453
+llama_benchy/results.py,sha256=x_3MlaZ959Mm8l-jqy_Jo73ajzp45FIus3jgs9ia_XM,17324
 llama_benchy/runner.py,sha256=PSycdp6nkgkWuW7DYsAJpw2PWBuQXgGEpVUci-r1dDo,8579
-llama_benchy-0.2.0.dist-info/METADATA,sha256=MTE8qthP3WoVJ-crPIUBuGN3rnqi_v1jEOOGL4Dda5A,15012
-llama_benchy-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llama_benchy-0.2.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
-llama_benchy-0.2.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
-llama_benchy-0.2.0.dist-info/RECORD,,
+llama_benchy-0.2.1.dist-info/METADATA,sha256=1zmNoa8cga70yfLREsXlhBNZyXFH0CKieFr6W-h0oyY,15012
+llama_benchy-0.2.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llama_benchy-0.2.1.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+llama_benchy-0.2.1.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+llama_benchy-0.2.1.dist-info/RECORD,,