llama-benchy 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
- llama_benchy/__main__.py +26 -534
- llama_benchy/_version.py +2 -2
- llama_benchy/client.py +199 -0
- llama_benchy/config.py +76 -0
- llama_benchy/corpus.py +62 -0
- llama_benchy/prompts.py +54 -0
- llama_benchy/results.py +374 -0
- llama_benchy/runner.py +155 -0
- {llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/METADATA +43 -3
- llama_benchy-0.2.0.dist-info/RECORD +14 -0
- llama_benchy-0.1.2.dist-info/RECORD +0 -8
- {llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/WHEEL +0 -0
- {llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/entry_points.txt +0 -0
- {llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/licenses/LICENSE +0 -0
llama_benchy/results.py
ADDED
@@ -0,0 +1,374 @@
+import numpy as np
+from tabulate import tabulate
+from typing import List, Dict, Any, Optional
+from dataclasses import dataclass, asdict
+import json
+import csv
+import sys
+
+from .client import RequestResult
+
+@dataclass
+class BenchmarkMetric:
+    mean: float
+    std: float
+    values: List[float]
+
+@dataclass
+class BenchmarkMetadata:
+    version: str
+    timestamp: str
+    latency_mode: str
+    latency_ms: float
+    model: str
+    prefix_caching_enabled: bool
+    max_concurrency: int
+
+@dataclass
+class BenchmarkRun:
+    concurrency: int
+    context_size: int
+    prompt_size: int
+    response_size: int
+    is_context_prefill_phase: bool
+
+    # Metrics (using BenchmarkMetric)
+    pp_throughput: Optional[BenchmarkMetric]
+    pp_req_throughput: Optional[BenchmarkMetric]
+    tg_throughput: Optional[BenchmarkMetric]
+    tg_req_throughput: Optional[BenchmarkMetric]
+    ttfr: Optional[BenchmarkMetric]
+    est_ppt: Optional[BenchmarkMetric]
+    e2e_ttft: Optional[BenchmarkMetric]
+
+class BenchmarkResults:
+    def __init__(self):
+        self.runs: List[BenchmarkRun] = []
+        self.metadata: Optional[BenchmarkMetadata] = None
+        self.model_name: Optional[str] = None
+
+    def _calculate_metric(self, values: List[float], multiplier: float = 1.0) -> Optional[BenchmarkMetric]:
+        if not values:
+            return None
+        scaled_values = [v * multiplier for v in values]
+        return BenchmarkMetric(
+            mean=np.mean(values) * multiplier,
+            std=np.std(values) * multiplier,
+            values=scaled_values
+        )
+
+    def add(self,
+            model: str,
+            pp: int,
+            tg: int,
+            depth: int,
+            concurrency: int,
+            run_results: List[List[RequestResult]],  # List of batches (one batch per run)
+            latency: float,
+            expected_pp_tokens: int,
+            is_context_phase: bool = False):
+
+        if self.model_name is None:
+            self.model_name = model
+
+        # Aggregators
+        agg_pp_speeds = []
+        agg_tg_speeds = []
+        agg_ttft_values = []
+        agg_ttfr_values = []
+        agg_est_ppt_values = []
+        agg_e2e_ttft_values = []
+
+        agg_batch_pp_throughputs = []
+        agg_batch_tg_throughputs = []
+
+        for batch in run_results:
+            self._process_batch(
+                batch,
+                expected_pp_tokens,
+                latency,
+                agg_pp_speeds,
+                agg_tg_speeds,
+                agg_ttft_values,
+                agg_ttfr_values,
+                agg_est_ppt_values,
+                agg_e2e_ttft_values,
+                agg_batch_pp_throughputs,
+                agg_batch_tg_throughputs
+            )
+
+        # Calculate metrics for BenchmarkRun
+        run_metric_pp_throughput = self._calculate_metric(agg_batch_pp_throughputs if concurrency > 1 else agg_pp_speeds)
+        run_metric_pp_req_throughput = run_metric_pp_throughput if concurrency == 1 else self._calculate_metric(agg_pp_speeds)
+
+        run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
+        run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
+
+        run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
+        run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
+        run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
+
+        self.runs.append(BenchmarkRun(
+            concurrency=concurrency,
+            context_size=depth,
+            prompt_size=pp,  # Configured prompt size
+            response_size=tg,
+            is_context_prefill_phase=is_context_phase,
+            pp_throughput=run_metric_pp_throughput,
+            pp_req_throughput=run_metric_pp_req_throughput,
+            tg_throughput=run_metric_tg_throughput,
+            tg_req_throughput=run_metric_tg_req_throughput,
+            ttfr=run_metric_ttfr,
+            est_ppt=run_metric_est_ppt,
+            e2e_ttft=run_metric_e2e_ttft
+        ))
+
+    def _process_batch(self,
+                       results: List[RequestResult],
+                       expected_pp_tokens: int,
+                       latency: float,
+                       agg_pp_speeds: List[float],
+                       agg_tg_speeds: List[float],
+                       agg_ttft_values: List[float],
+                       agg_ttfr_values: List[float],
+                       agg_est_ppt_values: List[float],
+                       agg_e2e_ttft_values: List[float],
+                       agg_batch_pp_throughputs: List[float],
+                       agg_batch_tg_throughputs: List[float]):
+
+        valid_results = [r for r in results if r and not r.error]
+        if not valid_results:
+            return
+
+        batch_prompt_tokens = 0
+        batch_gen_tokens = 0
+
+        start_times = []
+        end_times = []
+        first_token_times = []
+
+        for res in valid_results:
+            start_times.append(res.start_ts)
+            end_times.append(res.end_ts)
+
+            # Use reported usage if available and reasonable, else expected
+            prompt_tokens = expected_pp_tokens
+            if res.prompt_tokens > 0:
+                diff = abs(res.prompt_tokens - expected_pp_tokens)
+                if diff < expected_pp_tokens * 0.2:
+                    prompt_tokens = res.prompt_tokens
+
+            batch_prompt_tokens += prompt_tokens
+            batch_gen_tokens += res.total_tokens
+
+            # Metrics Calculation
+            ttft = 0.0
+            e2e_ttft = 0.0
+            ttfr = 0.0
+            est_ppt = 0.0
+
+            if res.first_response_ts:
+                ttfr = res.first_response_ts - res.start_ts
+                agg_ttfr_values.append(ttfr)
+
+            if res.first_token_ts:
+                first_token_times.append(res.first_token_ts)
+                e2e_ttft = res.first_token_ts - res.start_ts
+                ttft = max(0, e2e_ttft - latency)
+                est_ppt = max(0, ttfr - latency)
+
+                agg_e2e_ttft_values.append(e2e_ttft)
+                agg_ttft_values.append(ttft)
+                agg_est_ppt_values.append(est_ppt)
+
+            # Individual Speeds
+            if est_ppt > 0:
+                pp_speed = prompt_tokens / est_ppt
+                agg_pp_speeds.append(pp_speed)
+
+            if res.total_tokens > 1 and res.first_token_ts:
+                decode_time = res.end_ts - res.first_token_ts
+                if decode_time > 0:
+                    tg_speed = (res.total_tokens - 1) / decode_time
+                    agg_tg_speeds.append(tg_speed)
+
+        # Batch-Level Throughput
+        if start_times and end_times and first_token_times:
+            min_start = min(start_times)
+            max_end = max(end_times)
+
+            max_first_token = max(first_token_times)
+            pp_duration = max_first_token - min_start
+
+            if pp_duration > 0:
+                batch_pp_throughput = batch_prompt_tokens / pp_duration
+                agg_batch_pp_throughputs.append(batch_pp_throughput)
+
+            min_first_token = min(first_token_times)
+            tg_duration = max_end - min_first_token
+
+            if tg_duration > 0:
+                if batch_gen_tokens > len(valid_results):
+                    batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
+                    agg_batch_tg_throughputs.append(batch_tg_throughput)
+
+
+    def _generate_rows(self) -> List[Dict[str, Any]]:
+        rows = []
+        for run in self.runs:
+            c_suffix = ""
+            if self.metadata and self.metadata.max_concurrency > 1:
+                c_suffix = f" (c{run.concurrency})"
+
+            if run.is_context_prefill_phase:
+                # Context Phase Prompt Processing
+                if run.pp_throughput:
+                    rows.append({
+                        "model": self.model_name or "Unknown",
+                        "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
+                        "t_s": run.pp_throughput,
+                        "t_s_req": run.pp_req_throughput,
+                        "ttfr": run.ttfr,
+                        "est_ppt": run.est_ppt,
+                        "e2e_ttft": run.e2e_ttft
+                    })
+
+                # Context Phase Token Generation
+                if run.tg_throughput:
+                    rows.append({
+                        "model": self.model_name or "Unknown",
+                        "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
+                        "t_s": run.tg_throughput,
+                        "t_s_req": run.tg_req_throughput,
+                        "ttfr": None,
+                        "est_ppt": None,
+                        "e2e_ttft": None
+                    })
+            else:
+                # Standard Phase
+                d_suffix = f" @ d{run.context_size}" if run.context_size > 0 else ""
+
+                # Prompt Processing
+                if run.pp_throughput:
+                    rows.append({
+                        "model": self.model_name or "Unknown",
+                        "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
+                        "t_s": run.pp_throughput,
+                        "t_s_req": run.pp_req_throughput,
+                        "ttfr": run.ttfr,
+                        "est_ppt": run.est_ppt,
+                        "e2e_ttft": run.e2e_ttft
+                    })
+
+                # Token Generation
+                if run.tg_throughput:
+                    rows.append({
+                        "model": self.model_name or "Unknown",
+                        "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
+                        "t_s": run.tg_throughput,
+                        "t_s_req": run.tg_req_throughput,
+                        "ttfr": None,
+                        "est_ppt": None,
+                        "e2e_ttft": None
+                    })
+        return rows
+
+    def _generate_md_report(self, concurrency: int) -> str:
+        rows = self._generate_rows()
+        if not rows:
+            return "No results collected. Check if the model is generating tokens."
+
+        def fmt(metric: Optional[BenchmarkMetric]) -> str:
+            if metric is None:
+                return ""
+            return f"{metric.mean:.2f} ± {metric.std:.2f}"
+
+        data = [[
+            row["model"],
+            row["test_name"],
+            fmt(row["t_s"]),
+            fmt(row["t_s_req"]),
+            fmt(row["ttfr"]),
+            fmt(row["est_ppt"]),
+            fmt(row["e2e_ttft"])
+        ] for row in rows]
+
+        ts_header = "t/s (total)" if concurrency > 1 else "t/s"
+        headers = ["model", "test", ts_header, "t/s (req)", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+
+        if concurrency == 1:
+            data = [[
+                row["model"],
+                row["test_name"],
+                fmt(row["t_s"]),
+                fmt(row["ttfr"]),
+                fmt(row["est_ppt"]),
+                fmt(row["e2e_ttft"])
+            ] for row in rows]
+            headers = ["model", "test", ts_header, "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+
+        return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right"))
+
+    def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
+        msg = ""
+        if filename:
+            msg += f"Saving results to {filename} in {format.upper()} format...\n"
+        else:
+            msg += f"Printing results in {format.upper()} format:\n"
+
+        print(f"{msg}\n")
+
+        if format == "md":
+            output = self._generate_md_report(concurrency)
+            if filename:
+                with open(filename, "w") as f:
+                    f.write(output)
+            else:
+                print("\n" + output)
+
+        elif format == "json":
+            data = asdict(self.metadata) if self.metadata else {}
+            data["benchmarks"] = [asdict(run) for run in self.runs]
+
+            if filename:
+                with open(filename, "w") as f:
+                    json.dump(data, f, indent=2)
+            else:
+                print(json.dumps(data, indent=2))
+
+        elif format == "csv":
+            rows = self._generate_rows()
+            csv_rows = []
+            headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
+
+            for r in rows:
+                row = {
+                    "model": r["model"],
+                    "test_name": r["test_name"],
+                    "t_s_mean": r["t_s"].mean if r["t_s"] else None,
+                    "t_s_std": r["t_s"].std if r["t_s"] else None,
+                    "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
+                    "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
+                    "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
+                    "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
+                    "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
+                    "est_ppt_std": r["est_ppt"].std if r["est_ppt"] else None,
+                    "e2e_ttft_mean": r["e2e_ttft"].mean if r["e2e_ttft"] else None,
+                    "e2e_ttft_std": r["e2e_ttft"].std if r["e2e_ttft"] else None,
+                }
+                csv_rows.append(row)
+
+            output_file = filename if filename else sys.stdout
+            is_file = isinstance(output_file, str)
+
+            if is_file:
+                with open(output_file, "w", newline="") as f:
+                    writer = csv.DictWriter(f, fieldnames=headers)
+                    writer.writeheader()
+                    writer.writerows(csv_rows)
+            else:
+                writer = csv.DictWriter(sys.stdout, fieldnames=headers)
+                writer.writeheader()
+                writer.writerows(csv_rows)
+
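The reporting pipeline reduces each list of raw samples to a mean/std pair via `_calculate_metric`, with the `1000` multiplier converting seconds to milliseconds for the latency metrics. A standalone sketch of that reduction (illustrative sample values, not from a real run):

```python
import numpy as np

def calculate_metric(values, multiplier=1.0):
    """Mirror of BenchmarkResults._calculate_metric: mean/std with unit scaling."""
    if not values:
        return None
    return {
        "mean": np.mean(values) * multiplier,  # e.g. seconds -> ms with multiplier=1000
        "std": np.std(values) * multiplier,    # population std (numpy's default, ddof=0)
        "values": [v * multiplier for v in values],
    }

# Three TTFR samples in seconds, reported the way _generate_md_report formats them
m = calculate_metric([0.210, 0.195, 0.205], multiplier=1000)
print(f"{m['mean']:.2f} ± {m['std']:.2f}")  # 203.33 ± 6.24
```
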
llama_benchy/runner.py
ADDED
@@ -0,0 +1,155 @@
+import asyncio
+import subprocess
+import time
+from datetime import datetime, timezone
+from typing import List
+import aiohttp
+
+from ._version import __version__
+from .config import BenchmarkConfig
+from .client import LLMClient
+from .prompts import PromptGenerator
+from .results import BenchmarkResults, BenchmarkMetadata
+
+class BenchmarkRunner:
+    def __init__(self, config: BenchmarkConfig, client: LLMClient, prompt_generator: PromptGenerator):
+        self.config = config
+        self.client = client
+        self.prompt_gen = prompt_generator
+        self.results = BenchmarkResults()
+
+        # We need to track deltas from warmup to adapt prompts
+        self.delta_user = 0
+        self.delta_context = 0
+
+    async def run_suite(self):
+        # Initialize session
+        timeout = aiohttp.ClientTimeout(total=3600)
+        max_concurrency = max(self.config.concurrency_levels)
+        connector = aiohttp.TCPConnector(limit=max_concurrency + 5, force_close=False, keepalive_timeout=600)
+
+        async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
+            # Warmup
+            should_warmup = not self.config.no_warmup
+            if self.config.adapt_prompt:
+                should_warmup = True
+
+            if should_warmup:
+                tokenizer = self.prompt_gen.corpus.get_tokenizer() if self.config.adapt_prompt else None
+                self.delta_user, self.delta_context = await self.client.warmup(session, tokenizer)
+
+            # Measure latency
+            latency = await self.client.measure_latency(session, self.config.latency_mode)
+
+            # Main Loop
+            for depth in self.config.depths:
+                for pp in self.config.pp_counts:
+                    for tg in self.config.tg_counts:
+                        for concurrency in self.config.concurrency_levels:
+                            print(f"Running test: pp={pp}, tg={tg}, depth={depth}, concurrency={concurrency}")
+
+                            run_std_results = []
+                            run_ctx_results = []
+                            expected_pp = pp
+                            expected_ctx = depth
+
+                            for run in range(self.config.num_runs):
+
+                                # Adapt prompt tokens
+                                current_pp = pp
+                                current_depth = depth
+                                if self.config.adapt_prompt:
+                                    if depth == 0:
+                                        current_pp = max(1, pp - self.delta_user)
+                                    else:
+                                        current_depth = max(1, depth - self.delta_context)
+
+                                expected_pp = current_pp
+                                expected_ctx = current_depth
+
+                                prompt_batch = self.prompt_gen.generate_batch(
+                                    concurrency,
+                                    current_pp,
+                                    current_depth,
+                                    self.config.no_cache
+                                )
+
+                                if self.config.enable_prefix_caching and depth > 0:
+                                    # Phase 1: Context Load
+                                    print(f" Run {run+1}/{self.config.num_runs} (Context Load, batch size {concurrency})...")
+                                    load_tasks = []
+                                    for i in range(concurrency):
+                                        context, _ = prompt_batch[i]
+                                        load_tasks.append(self.client.run_generation(
+                                            session,
+                                            context_text=context,
+                                            prompt_text="",
+                                            max_tokens=tg,
+                                            no_cache=self.config.no_cache
+                                        ))
+
+                                    load_results = await asyncio.gather(*load_tasks)
+                                    run_ctx_results.append(load_results)
+
+                                    # Phase 2: Inference
+                                    print(f" Run {run+1}/{self.config.num_runs} (Inference, batch size {concurrency})...")
+                                    inf_tasks = []
+                                    for i in range(concurrency):
+                                        context, prompt = prompt_batch[i]
+                                        inf_tasks.append(self.client.run_generation(
+                                            session,
+                                            context_text=context,
+                                            prompt_text=prompt,
+                                            max_tokens=tg,
+                                            no_cache=self.config.no_cache
+                                        ))
+
+                                    batch_results = await asyncio.gather(*inf_tasks)
+                                    run_std_results.append(batch_results)
+
+                                else:
+                                    # Standard Run
+                                    print(f" Run {run+1}/{self.config.num_runs} (batch size {concurrency})...")
+                                    expected_tokens = current_pp + current_depth
+                                    batch_tasks = []
+                                    for i in range(concurrency):
+                                        context, prompt = prompt_batch[i]
+                                        batch_tasks.append(self.client.run_generation(
+                                            session,
+                                            context_text=context,
+                                            prompt_text=prompt,
+                                            max_tokens=tg,
+                                            no_cache=self.config.no_cache
+                                        ))
+
+                                    batch_results = await asyncio.gather(*batch_tasks)
+                                    run_std_results.append(batch_results)
+
+
+                                # Post Run Command
+                                if self.config.post_run_cmd:
+                                    try:
+                                        subprocess.run(self.config.post_run_cmd, shell=True, check=True)
+                                    except subprocess.CalledProcessError as e:
+                                        print(f"Post-run command failed: {e}")
+
+                            # Aggregate and Record
+                            if self.config.enable_prefix_caching and depth > 0:
+                                self.results.add(self.config.model, pp, tg, depth, concurrency, run_ctx_results, latency, expected_ctx, is_context_phase=True)
+                                self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp, is_context_phase=False)
+                            else:
+                                # Standard run expected tokens = pp + depth (usually depth=0 or concatenated)
+                                # In the loop above: expected_tokens = current_pp + current_depth
+                                self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp + expected_ctx, is_context_phase=False)
+
+        self.results.metadata = BenchmarkMetadata(
+            version=__version__,
+            timestamp=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ"),
+            latency_mode=self.config.latency_mode,
+            latency_ms=latency * 1000,
+            model=self.config.model,
+            prefix_caching_enabled=self.config.enable_prefix_caching,
+            max_concurrency=max(self.config.concurrency_levels) if self.config.concurrency_levels else 1
+        )
+
+        self.results.save_report(self.config.save_result, self.config.result_format, max(self.config.concurrency_levels) if self.config.concurrency_levels else 1)
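The fan-out in `run_suite` is plain `asyncio.gather` over one `run_generation` coroutine per concurrent request, so every request in a batch shares a single wall-clock window for the batch-level throughput math in `results.py`. A self-contained sketch of that pattern, with a hypothetical `fake_request` standing in for `LLMClient.run_generation`:

```python
import asyncio
import random
import time

async def fake_request(i: int) -> dict:
    """Hypothetical stand-in for LLMClient.run_generation: returns per-request timing."""
    start = time.monotonic()
    await asyncio.sleep(random.uniform(0.05, 0.15))  # simulated server work
    return {"request": i, "elapsed": time.monotonic() - start}

async def run_batch(concurrency: int) -> list:
    # Same shape as the runner: one task per concurrent request,
    # gathered so the whole batch shares one wall-clock window.
    tasks = [fake_request(i) for i in range(concurrency)]
    return await asyncio.gather(*tasks)

print(asyncio.run(run_batch(4)))
```
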
{llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.2
+Version: 0.2.0
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Project-URL: Homepage, https://github.com/eugr/llama-benchy
 Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues

@@ -43,6 +43,12 @@ Requires-Dist: openai
 Requires-Dist: requests
 Requires-Dist: tabulate
 Requires-Dist: transformers
+Provides-Extra: dev
+Requires-Dist: fastapi; extra == 'dev'
+Requires-Dist: pydantic; extra == 'dev'
+Requires-Dist: pytest; extra == 'dev'
+Requires-Dist: pytest-asyncio; extra == 'dev'
+Requires-Dist: uvicorn; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # llama-benchy - llama-bench style benchmarking tool for all backends

@@ -75,12 +81,12 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Downloads a book from Project Gutenberg to use as source text for prompts to ensure better benchmarking of spec.decoding/MTP models.
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.
+- Supports concurrent requests (`--concurrency`) to measure throughput under load.
+- Can save results to file in Markdown, JSON, or CSV format.
 
 # Current Limitations
 
 - Evaluates against `/v1/chat/completions` endpoint only.
-- Doesn't measure throughput in concurrency mode (coming later).
-- Outputs results as a Markdown table only for now.
 
 ## Installation
 

@@ -215,6 +221,9 @@ Generally you don't need to disable prompt caching on the server, as a probabili
 - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
 - `--no-adapt-prompt`: Disable prompt size adaptation.
 - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
+- `--concurrency`: List of concurrency levels (number of concurrent requests per test) (Default: [1]).
+- `--save-result`: File to save results to.
+- `--format`: Output format: 'md', 'json', 'csv' (Default: 'md').
 
 ### Metrics
 
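A hedged invocation sketch combining the three new flags; endpoint/model flags are omitted (see the README's own usage examples), and passing multiple concurrency levels space-separated is an assumption about the CLI parser:

```bash
# Sweep three concurrency levels and save the report as JSON.
llama-benchy \
  --concurrency 1 4 8 \
  --format json \
  --save-result results.json
```
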
@@ -230,6 +239,9 @@ The script attempts to estimate network or processing latency to provide "server
 
 #### Table Columns
 
+- When `concurrency` > 1:
+  - **`t/s (total)`**: Total throughput across all concurrent requests.
+  - **`t/s (req)`**: Average throughput per individual request.
 - **`t/s` (Tokens per Second)**:
   - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
   - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.

@@ -269,3 +281,31 @@ llama-benchy \
 ```
 
 This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
+
+## Development
+
+### Running Integration Tests
+
+This repository includes a mock server and an integration test suite to verify `llama-benchy` logic without needing a real GPU server.
+
+The mock server emulates:
+- **Prompt Processing (PP):** ~1000 t/s drift-corrected.
+- **Token Generation (TG):** ~50 t/s.
+- **Prefix Caching:** Emulates cache hits by skipping processing time for cached prefixes (system messages).
+- **OpenAI API Compatibility**: Serves `/v1/chat/completions` and `/v1/models`.
+
+To run the integration tests:
+
+```bash
+# Install development dependencies
+uv sync --all-extras --dev
+
+# Run tests
+uv run pytest tests/test_mock_integration.py
+```
+
+This test will:
+1. Spin up the mock server on port 8001.
+2. Run `llama-benchy` against it.
+3. Parse the JSON output.
+4. Verify that throughputs match the emulated speeds (PP ~1000, TG ~50) and that caching effectively increases effective throughput.
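To make the two `t/s` formulas above concrete, a small worked example (numbers are illustrative, not measured):

```python
# Prefill: Total Prompt Tokens / est_ppt (est_ppt in seconds here)
prompt_tokens = 2048
est_ppt = 1.6
pp_speed = prompt_tokens / est_ppt  # 1280.0 t/s

# Decode: (Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)
gen_tokens = 256
first_token_ts, last_token_ts = 10.0, 15.1  # seconds
tg_speed = (gen_tokens - 1) / (last_token_ts - first_token_ts)  # 50.0 t/s

print(pp_speed, tg_speed)
```

Subtracting one token from the decode count matches the code in `results.py`: the first token's cost belongs to prefill, so only inter-token gaps count toward decode speed.
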
llama_benchy-0.2.0.dist-info/RECORD
ADDED

@@ -0,0 +1,14 @@
+llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
+llama_benchy/__main__.py,sha256=rk0Re1dehcJNIxIRsTRF_HCvcDEb20nMV05pYtG7FIw,1384
+llama_benchy/_version.py,sha256=Dg8AmJomLVpjKL6prJylOONZAPRtB86LOce7dorQS_A,704
+llama_benchy/client.py,sha256=dYFwlFJvr0aSThb6lN6coQt2KJy8tYb-BhDobniviV8,8362
+llama_benchy/config.py,sha256=FV4jyEHm2G-lU2wX1atq--lLW-53uZQRWrWc00Qrnwc,4462
+llama_benchy/corpus.py,sha256=b0RSkN8bpySiPEToH_XZR3hHKYz752BjsNqlE-78nPY,2404
+llama_benchy/prompts.py,sha256=AUgAOKK2QIBb9DcwhgIrRTGxIqXiFjD7D-Ek0A3mmEk,2090
+llama_benchy/results.py,sha256=jP2UUe5juHu5XDwgiS-7rCbPdbiU0XMn-DcqjVXiCNY,14453
+llama_benchy/runner.py,sha256=PSycdp6nkgkWuW7DYsAJpw2PWBuQXgGEpVUci-r1dDo,8579
+llama_benchy-0.2.0.dist-info/METADATA,sha256=MTE8qthP3WoVJ-crPIUBuGN3rnqi_v1jEOOGL4Dda5A,15012
+llama_benchy-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llama_benchy-0.2.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+llama_benchy-0.2.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+llama_benchy-0.2.0.dist-info/RECORD,,
llama_benchy-0.1.2.dist-info/RECORD
REMOVED

@@ -1,8 +0,0 @@
-llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
-llama_benchy/__main__.py,sha256=ArgfdkzjgVv-tdoRW0WXxKEGfdbFDzmH6h3w3lay5zI,25120
-llama_benchy/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
-llama_benchy-0.1.2.dist-info/METADATA,sha256=oiJHBXHW_74XnVoKPvALBVP5-sXibFPDtELiCcdQaFw,13439
-llama_benchy-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llama_benchy-0.1.2.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
-llama_benchy-0.1.2.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
-llama_benchy-0.1.2.dist-info/RECORD,,
{llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/WHEEL
File without changes

{llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/entry_points.txt
File without changes

{llama_benchy-0.1.2.dist-info → llama_benchy-0.2.0.dist-info}/licenses/LICENSE
File without changes