llama-benchy 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/PKG-INFO +1 -1
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/_version.py +2 -2
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/client.py +3 -1
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/results.py +69 -8
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/.github/workflows/tests.yml +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/.gitignore +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/LICENSE +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/README.md +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/pyproject.toml +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/__init__.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/__main__.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/config.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/corpus.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/prompts.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/src/llama_benchy/runner.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/tests/__init__.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/tests/mock_server.py +0 -0
- {llama_benchy-0.2.0 → llama_benchy-0.2.1}/tests/test_mock_integration.py +0 -0
src/llama_benchy/_version.py

@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.2.0'
-__version_tuple__ = version_tuple = (0, 2, 0)
+__version__ = version = '0.2.1'
+__version_tuple__ = version_tuple = (0, 2, 1)
 
 __commit_id__ = commit_id = None
src/llama_benchy/client.py

@@ -4,7 +4,7 @@ import codecs
 import aiohttp
 import asyncio
 import numpy as np
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from typing import Optional, List, Dict, Any
 
 @dataclass
@@ -16,6 +16,7 @@ class RequestResult:
     prompt_tokens: int = 0
     total_tokens: int = 0
    error: Optional[str] = None
+    token_timestamps: List[float] = field(default_factory=list)
 
 class LLMClient:
     def __init__(self, base_url: str, api_key: str, model_name: str):
@@ -187,6 +188,7 @@ class LLMClient:
                             result.first_token_ts = chunk_time
 
                         result.total_tokens += 1
+                        result.token_timestamps.append(chunk_time)
                     except json.JSONDecodeError:
                         continue
 
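The only behavioural change in client.py is that the streaming loop now records the arrival time of every token chunk, not just the first one. A minimal sketch of the new bookkeeping, assuming a monotonic clock; `RequestResult` is trimmed to the fields visible in the hunks above, and `record_chunk` is a hypothetical stand-in for one iteration of the streaming loop:

```python
import time
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class RequestResult:
    # Only the fields shown in the diff; the real class carries more state.
    prompt_tokens: int = 0
    total_tokens: int = 0
    error: Optional[str] = None
    token_timestamps: List[float] = field(default_factory=list)


def record_chunk(result: RequestResult) -> None:
    """Hypothetical helper mirroring what the streaming loop now does per chunk."""
    chunk_time = time.monotonic()  # assumption: any monotonic timestamp source
    result.total_tokens += 1
    result.token_timestamps.append(chunk_time)
```

The new `field` import is what makes this work: dataclasses reject a plain mutable default such as `= []`, so `field(default_factory=list)` gives each result its own timestamp list.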
src/llama_benchy/results.py

@@ -37,6 +37,7 @@ class BenchmarkRun:
     pp_req_throughput: Optional[BenchmarkMetric]
     tg_throughput: Optional[BenchmarkMetric]
     tg_req_throughput: Optional[BenchmarkMetric]
+    peak_throughput: Optional[BenchmarkMetric]
     ttfr: Optional[BenchmarkMetric]
     est_ppt: Optional[BenchmarkMetric]
     e2e_ttft: Optional[BenchmarkMetric]
@@ -57,6 +58,33 @@ class BenchmarkResults:
             values=scaled_values
         )
 
+    def _calculate_peak_throughput(self, all_timestamps: List[float], window: float = 1.0) -> float:
+        if not all_timestamps:
+            return 0.0
+
+        all_timestamps.sort()
+
+        # If total duration is less than the window, use actual duration to calculate rate
+        # This handles short bursts correctly where Peak would otherwise be < Mean
+        total_duration = all_timestamps[-1] - all_timestamps[0]
+        if total_duration < window and total_duration > 0:
+            return len(all_timestamps) / total_duration
+
+        max_tokens = 0
+
+        start_idx = 0
+        for end_idx, end_time in enumerate(all_timestamps):
+            # Window starts at end_time - window
+            while start_idx < end_idx and all_timestamps[start_idx] <= end_time - window:
+                start_idx += 1
+
+            # Count includes current token, so range is [start_idx, end_idx]
+            current_tokens = end_idx - start_idx + 1
+            if current_tokens > max_tokens:
+                max_tokens = current_tokens
+
+        return float(max_tokens) / window
+
     def add(self,
             model: str,
             pp: int,
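Read on its own, the new helper is a two-pointer sliding window over sorted timestamps. The same logic as a standalone function, run on made-up timestamps, shows how the peak can sit well above the mean rate; the numbers below are purely illustrative:

```python
from typing import List


def calculate_peak_throughput(all_timestamps: List[float], window: float = 1.0) -> float:
    """Standalone copy of the sliding-window logic added in this release."""
    if not all_timestamps:
        return 0.0
    all_timestamps.sort()
    total_duration = all_timestamps[-1] - all_timestamps[0]
    if total_duration < window and total_duration > 0:
        # Short bursts: use the actual duration so the peak cannot fall below the mean.
        return len(all_timestamps) / total_duration
    max_tokens = 0
    start_idx = 0
    for end_idx, end_time in enumerate(all_timestamps):
        # Slide the window start forward until it is within `window` seconds of end_time.
        while start_idx < end_idx and all_timestamps[start_idx] <= end_time - window:
            start_idx += 1
        max_tokens = max(max_tokens, end_idx - start_idx + 1)
    return float(max_tokens) / window


# Hypothetical trace: a fast burst of 10 tokens followed by a slow tail of 4.
burst = [i * 0.05 for i in range(10)]      # 10 tokens within 0.45 s
tail = [1.0 + i * 0.5 for i in range(4)]   # 4 tokens spread over 1.5 s
print(calculate_peak_throughput(burst + tail))  # 10.0 t/s; the mean is only 14 / 2.5 = 5.6 t/s
```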
@@ -81,6 +109,7 @@ class BenchmarkResults:
 
         agg_batch_pp_throughputs = []
         agg_batch_tg_throughputs = []
+        agg_peak_throughputs = []
 
         for batch in run_results:
             self._process_batch(
@@ -94,7 +123,8 @@ class BenchmarkResults:
                 agg_est_ppt_values,
                 agg_e2e_ttft_values,
                 agg_batch_pp_throughputs,
-                agg_batch_tg_throughputs
+                agg_batch_tg_throughputs,
+                agg_peak_throughputs
             )
 
         # Calculate metrics for BenchmarkRun
@@ -104,6 +134,8 @@ class BenchmarkResults:
         run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
         run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
 
+        run_metric_peak_throughput = self._calculate_metric(agg_peak_throughputs)
+
         run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
         run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
         run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
@@ -118,6 +150,7 @@ class BenchmarkResults:
             pp_req_throughput=run_metric_pp_req_throughput,
             tg_throughput=run_metric_tg_throughput,
             tg_req_throughput=run_metric_tg_req_throughput,
+            peak_throughput=run_metric_peak_throughput,
             ttfr=run_metric_ttfr,
             est_ppt=run_metric_est_ppt,
             e2e_ttft=run_metric_e2e_ttft
@@ -134,7 +167,8 @@ class BenchmarkResults:
                        agg_est_ppt_values: List[float],
                        agg_e2e_ttft_values: List[float],
                        agg_batch_pp_throughputs: List[float],
-                       agg_batch_tg_throughputs: List[float]):
+                       agg_batch_tg_throughputs: List[float],
+                       agg_peak_throughputs: List[float]):
 
         valid_results = [r for r in results if r and not r.error]
         if not valid_results:
@@ -146,10 +180,21 @@ class BenchmarkResults:
         start_times = []
         end_times = []
         first_token_times = []
+        last_token_times = []
+
+        # Collect all token timestamps for peak calculation
+        all_token_timestamps = []
 
         for res in valid_results:
             start_times.append(res.start_ts)
             end_times.append(res.end_ts)
+            all_token_timestamps.extend(res.token_timestamps)
+
+            if res.token_timestamps:
+                last_token_times.append(res.token_timestamps[-1])
+            elif res.end_ts:
+                # Fallback if no timestamps recorded but request finished
+                last_token_times.append(res.end_ts)
 
         # Use reported usage if available and reasonable, else expected
         prompt_tokens = expected_pp_tokens
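Because `_process_batch` merges the timestamps of every request in a batch before computing the peak, the figure is an aggregate one: under concurrency it can exceed what any single request achieves on its own. A tiny illustration with hypothetical timestamps:

```python
# Two hypothetical concurrent requests, each emitting a token every 0.2 s (~5 t/s), interleaved.
req_a = [0.0, 0.2, 0.4, 0.6, 0.8]
req_b = [0.1, 0.3, 0.5, 0.7, 0.9]

# Merged the same way _process_batch now builds all_token_timestamps.
all_token_timestamps = req_a + req_b

# The merged trace spans only 0.9 s (< the 1 s window), so the helper divides by the
# actual duration: 10 tokens / 0.9 s ≈ 11.1 t/s across the batch.
```

The `end_ts` fallback keeps `last_token_times` populated even when a result carries no per-token timestamps, so the generation window computed in the next hunk stays well defined.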
@@ -205,13 +250,21 @@ class BenchmarkResults:
             agg_batch_pp_throughputs.append(batch_pp_throughput)
 
         min_first_token = min(first_token_times)
-
+
+        # Use max(last_token_times) instead of max(end_times) to remove protocol overhead (headers, [DONE], etc)
+        # This makes the throughput metric purely about token generation speed.
+        max_last_token = max(last_token_times) if last_token_times else max_end
+        tg_duration = max_last_token - min_first_token
 
         if tg_duration > 0:
             if batch_gen_tokens > len(valid_results):
                 batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
                 agg_batch_tg_throughputs.append(batch_tg_throughput)
 
+        if all_token_timestamps:
+            peak = self._calculate_peak_throughput(all_token_timestamps)
+            agg_peak_throughputs.append(peak)
+
 
     def _generate_rows(self) -> List[Dict[str, Any]]:
         rows = []
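The substantive part of this hunk is the denominator: generation time now ends at the last token timestamp rather than at the end of the HTTP exchange, so trailing protocol overhead no longer dilutes the rate. A worked example with hypothetical numbers, using the same formula as above (one token per request is subtracted, presumably because the interval starts at each request's first token, which therefore is not counted as generated within it):

```python
# Hypothetical batch: 4 concurrent requests, 101 generated tokens each.
num_valid_results = 4
batch_gen_tokens = 4 * 101                        # 404 tokens in total

min_first_token = 100.0                           # earliest first-token timestamp (s)
max_last_token = 104.0                            # latest per-token timestamp (s), excluding
                                                  # the later end_ts that covers "[DONE]" etc.
tg_duration = max_last_token - min_first_token    # 4.0 s of pure generation

batch_tg_throughput = (batch_gen_tokens - num_valid_results) / tg_duration
print(batch_tg_throughput)                        # (404 - 4) / 4.0 = 100.0 t/s
```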
@@ -228,6 +281,7 @@ class BenchmarkResults:
                 "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
                 "t_s": run.pp_throughput,
                 "t_s_req": run.pp_req_throughput,
+                "peak_ts": None,
                 "ttfr": run.ttfr,
                 "est_ppt": run.est_ppt,
                 "e2e_ttft": run.e2e_ttft
@@ -240,6 +294,7 @@ class BenchmarkResults:
                 "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
                 "t_s": run.tg_throughput,
                 "t_s_req": run.tg_req_throughput,
+                "peak_ts": run.peak_throughput,
                 "ttfr": None,
                 "est_ppt": None,
                 "e2e_ttft": None
@@ -255,6 +310,7 @@ class BenchmarkResults:
                 "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
                 "t_s": run.pp_throughput,
                 "t_s_req": run.pp_req_throughput,
+                "peak_ts": None,
                 "ttfr": run.ttfr,
                 "est_ppt": run.est_ppt,
                 "e2e_ttft": run.e2e_ttft
@@ -267,6 +323,7 @@ class BenchmarkResults:
                 "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
                 "t_s": run.tg_throughput,
                 "t_s_req": run.tg_req_throughput,
+                "peak_ts": run.peak_throughput,
                 "ttfr": None,
                 "est_ppt": None,
                 "e2e_ttft": None
@@ -288,26 +345,28 @@ class BenchmarkResults:
             row["test_name"],
             fmt(row["t_s"]),
             fmt(row["t_s_req"]),
+            fmt(row["peak_ts"]),
             fmt(row["ttfr"]),
             fmt(row["est_ppt"]),
             fmt(row["e2e_ttft"])
         ] for row in rows]
 
         ts_header = "t/s (total)" if concurrency > 1 else "t/s"
-        headers = ["model", "test", ts_header, "t/s (req)", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+        headers = ["model", "test", ts_header, "t/s (req)", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
 
         if concurrency == 1:
             data = [[
                 row["model"],
                 row["test_name"],
-                fmt(row["t_s"]),
+                fmt(row["t_s"]),
+                fmt(row["peak_ts"]),
                 fmt(row["ttfr"]),
                 fmt(row["est_ppt"]),
                 fmt(row["e2e_ttft"])
             ] for row in rows]
-            headers = ["model", "test", ts_header, "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+            headers = ["model", "test", ts_header, "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
 
-        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right"))
+        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right", "right"))
 
     def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
         msg = ""
@@ -339,7 +398,7 @@ class BenchmarkResults:
         elif format == "csv":
             rows = self._generate_rows()
             csv_rows = []
-            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
+            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "peak_ts_mean", "peak_ts_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
 
             for r in rows:
                 row = {
@@ -349,6 +408,8 @@ class BenchmarkResults:
                     "t_s_std": r["t_s"].std if r["t_s"] else None,
                     "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
                     "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
+                    "peak_ts_mean": r["peak_ts"].mean if r["peak_ts"] else None,
+                    "peak_ts_std": r["peak_ts"].std if r["peak_ts"] else None,
                     "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
                     "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
                     "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
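The CSV report gains a matching `peak_ts_mean`/`peak_ts_std` column pair. The diff shows the widened header list and the per-row dict but not the writer itself, so the following is only a sketch under the assumption that the rows are ultimately written with a standard `csv.DictWriter`; the values are invented for illustration:

```python
import csv

headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std",
           "peak_ts_mean", "peak_ts_std", "ttfr_mean", "ttfr_std",
           "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]

# One hypothetical row; None values come out as empty CSV cells.
csv_rows = [{
    "model": "example-model", "test_name": "tg128",
    "t_s_mean": 98.2, "t_s_std": 1.4,
    "t_s_req_mean": 98.2, "t_s_req_std": 1.4,
    "peak_ts_mean": 112.0, "peak_ts_std": 3.1,   # the two new columns in 0.2.1
    "ttfr_mean": None, "ttfr_std": None,
    "est_ppt_mean": None, "est_ppt_std": None,
    "e2e_ttft_mean": None, "e2e_ttft_std": None,
}]

with open("benchmark.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()
    writer.writerows(csv_rows)
```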