llama-benchy 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama_benchy/results.py ADDED
@@ -0,0 +1,435 @@
1
+ import numpy as np
2
+ from tabulate import tabulate
3
+ from typing import List, Dict, Any, Optional
4
+ from dataclasses import dataclass, asdict
5
+ import json
6
+ import csv
7
+ import sys
8
+
9
+ from .client import RequestResult
10
+
11
+ @dataclass
12
+ class BenchmarkMetric:
13
+ mean: float
14
+ std: float
15
+ values: List[float]
16
+
17
+ @dataclass
18
+ class BenchmarkMetadata:
19
+ version: str
20
+ timestamp: str
21
+ latency_mode: str
22
+ latency_ms: float
23
+ model: str
24
+ prefix_caching_enabled: bool
25
+ max_concurrency: int
26
+
27
+ @dataclass
28
+ class BenchmarkRun:
29
+ concurrency: int
30
+ context_size: int
31
+ prompt_size: int
32
+ response_size: int
33
+ is_context_prefill_phase: bool
34
+
35
+ # Metrics (using BenchmarkMetric)
36
+ pp_throughput: Optional[BenchmarkMetric]
37
+ pp_req_throughput: Optional[BenchmarkMetric]
38
+ tg_throughput: Optional[BenchmarkMetric]
39
+ tg_req_throughput: Optional[BenchmarkMetric]
40
+ peak_throughput: Optional[BenchmarkMetric]
41
+ ttfr: Optional[BenchmarkMetric]
42
+ est_ppt: Optional[BenchmarkMetric]
43
+ e2e_ttft: Optional[BenchmarkMetric]
44
+
45
+ class BenchmarkResults:
46
+ def __init__(self):
47
+ self.runs: List[BenchmarkRun] = []
48
+ self.metadata: Optional[BenchmarkMetadata] = None
49
+ self.model_name: Optional[str] = None
50
+
51
+ def _calculate_metric(self, values: List[float], multiplier: float = 1.0) -> Optional[BenchmarkMetric]:
52
+ if not values:
53
+ return None
54
+ scaled_values = [v * multiplier for v in values]
55
+ return BenchmarkMetric(
56
+ mean=np.mean(values) * multiplier,
57
+ std=np.std(values) * multiplier,
58
+ values=scaled_values
59
+ )
60
+
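As a quick illustration of the scaling above (used later to report second-based timings in milliseconds): the multiplier is applied to the mean, the standard deviation, and every stored sample. A minimal sketch using only numpy, with made-up latency samples:

```python
import numpy as np

values = [0.120, 0.135, 0.128]   # e.g. TTFR samples in seconds
multiplier = 1000                # report in milliseconds

mean_ms = np.mean(values) * multiplier             # ≈ 127.7 ms
std_ms = np.std(values) * multiplier               # population std, ≈ 6.1 ms
scaled_values = [v * multiplier for v in values]   # [120.0, 135.0, 128.0]
```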
61
+ def _calculate_peak_throughput(self, all_timestamps: List[float], window: float = 1.0) -> float:
62
+ if not all_timestamps:
63
+ return 0.0
64
+
65
+ all_timestamps.sort()
66
+
67
+ # If total duration is less than the window, use actual duration to calculate rate
68
+ # This handles short bursts correctly where Peak would otherwise be < Mean
69
+ total_duration = all_timestamps[-1] - all_timestamps[0]
70
+ if total_duration < window and total_duration > 0:
71
+ return len(all_timestamps) / total_duration
72
+
73
+ max_tokens = 0
74
+
75
+ start_idx = 0
76
+ for end_idx, end_time in enumerate(all_timestamps):
77
+ # Window starts at end_time - window
78
+ while start_idx < end_idx and all_timestamps[start_idx] <= end_time - window:
79
+ start_idx += 1
80
+
81
+ # Count includes current token, so range is [start_idx, end_idx]
82
+ current_tokens = end_idx - start_idx + 1
83
+ if current_tokens > max_tokens:
84
+ max_tokens = current_tokens
85
+
86
+ return float(max_tokens) / window
87
+
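To make the two branches above concrete, here is a small standalone illustration with hypothetical timestamps (not taken from a real run):

```python
# Short burst: 4 tokens within 0.3 s. total_duration (0.3 s) is below the 1 s
# window, so the early-return branch reports 4 / 0.3 ≈ 13.3 t/s. Without it,
# the windowed count would give 4 / 1.0 = 4 t/s, i.e. a "peak" below the mean.
burst = [10.0, 10.1, 10.2, 10.3]

# Longer run: a token roughly every 20 ms for several seconds. The sliding
# window counts, for each token, how many tokens fall within the preceding
# second; the largest count divided by the window length is the peak rate
# (about 50 t/s here).
steady = [t * 0.02 for t in range(250)]
```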
88
+ def add(self,
89
+ model: str,
90
+ pp: int,
91
+ tg: int,
92
+ depth: int,
93
+ concurrency: int,
94
+ run_results: List[List[RequestResult]], # List of batches (one batch per run)
95
+ latency: float,
96
+ expected_pp_tokens: int,
97
+ is_context_phase: bool = False):
98
+
99
+ if self.model_name is None:
100
+ self.model_name = model
101
+
102
+ # Aggregators
103
+ agg_pp_speeds = []
104
+ agg_tg_speeds = []
105
+ agg_ttft_values = []
106
+ agg_ttfr_values = []
107
+ agg_est_ppt_values = []
108
+ agg_e2e_ttft_values = []
109
+
110
+ agg_batch_pp_throughputs = []
111
+ agg_batch_tg_throughputs = []
112
+ agg_peak_throughputs = []
113
+
114
+ for batch in run_results:
115
+ self._process_batch(
116
+ batch,
117
+ expected_pp_tokens,
118
+ latency,
119
+ agg_pp_speeds,
120
+ agg_tg_speeds,
121
+ agg_ttft_values,
122
+ agg_ttfr_values,
123
+ agg_est_ppt_values,
124
+ agg_e2e_ttft_values,
125
+ agg_batch_pp_throughputs,
126
+ agg_batch_tg_throughputs,
127
+ agg_peak_throughputs
128
+ )
129
+
130
+ # Calculate metrics for BenchmarkRun
131
+ run_metric_pp_throughput = self._calculate_metric(agg_batch_pp_throughputs if concurrency > 1 else agg_pp_speeds)
132
+ run_metric_pp_req_throughput = run_metric_pp_throughput if concurrency == 1 else self._calculate_metric(agg_pp_speeds)
133
+
134
+ run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
135
+ run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
136
+
137
+ run_metric_peak_throughput = self._calculate_metric(agg_peak_throughputs)
138
+
139
+ run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
140
+ run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
141
+ run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
142
+
143
+ self.runs.append(BenchmarkRun(
144
+ concurrency=concurrency,
145
+ context_size=depth,
146
+ prompt_size=pp, # Configured prompt size
147
+ response_size=tg,
148
+ is_context_prefill_phase=is_context_phase,
149
+ pp_throughput=run_metric_pp_throughput,
150
+ pp_req_throughput=run_metric_pp_req_throughput,
151
+ tg_throughput=run_metric_tg_throughput,
152
+ tg_req_throughput=run_metric_tg_req_throughput,
153
+ peak_throughput=run_metric_peak_throughput,
154
+ ttfr=run_metric_ttfr,
155
+ est_ppt=run_metric_est_ppt,
156
+ e2e_ttft=run_metric_e2e_ttft
157
+ ))
158
+
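The metric selection above means the headline throughput comes from batch-level aggregates once requests overlap, while the per-request column averages individual stream speeds; at concurrency 1 the two collapse into the same metric. A small illustrative sketch, with invented numbers:

```python
# Hypothetical decode speeds of 4 concurrent streams in one batch (t/s each).
per_request = [12.5, 12.0, 11.8, 12.2]

# Measured over the shared wall-clock span, the batch-level rate lands near the
# sum of the overlapping streams rather than their average (illustrative value).
batch_total = 47.0

# concurrency > 1:  tg_throughput     <- batch-level rates (one per run)
#                   tg_req_throughput <- per-request speeds across all runs
# concurrency == 1: both fields carry the same per-request metric.
```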
159
+ def _process_batch(self,
160
+ results: List[RequestResult],
161
+ expected_pp_tokens: int,
162
+ latency: float,
163
+ agg_pp_speeds: List[float],
164
+ agg_tg_speeds: List[float],
165
+ agg_ttft_values: List[float],
166
+ agg_ttfr_values: List[float],
167
+ agg_est_ppt_values: List[float],
168
+ agg_e2e_ttft_values: List[float],
169
+ agg_batch_pp_throughputs: List[float],
170
+ agg_batch_tg_throughputs: List[float],
171
+ agg_peak_throughputs: List[float]):
172
+
173
+ valid_results = [r for r in results if r and not r.error]
174
+ if not valid_results:
175
+ return
176
+
177
+ batch_prompt_tokens = 0
178
+ batch_gen_tokens = 0
179
+
180
+ start_times = []
181
+ end_times = []
182
+ first_token_times = []
183
+ last_token_times = []
184
+
185
+ # Collect all token timestamps for peak calculation
186
+ all_token_timestamps = []
187
+
188
+ for res in valid_results:
189
+ start_times.append(res.start_ts)
190
+ end_times.append(res.end_ts)
191
+ all_token_timestamps.extend(res.token_timestamps)
192
+
193
+ if res.token_timestamps:
194
+ last_token_times.append(res.token_timestamps[-1])
195
+ elif res.end_ts:
196
+ # Fallback if no timestamps recorded but request finished
197
+ last_token_times.append(res.end_ts)
198
+
199
+ # Use reported usage if available and reasonable, else expected
200
+ prompt_tokens = expected_pp_tokens
201
+ if res.prompt_tokens > 0:
202
+ diff = abs(res.prompt_tokens - expected_pp_tokens)
203
+ if diff < expected_pp_tokens * 0.2:
204
+ prompt_tokens = res.prompt_tokens
205
+
206
+ batch_prompt_tokens += prompt_tokens
207
+ batch_gen_tokens += res.total_tokens
208
+
209
+ # Metrics Calculation
210
+ ttft = 0.0
211
+ e2e_ttft = 0.0
212
+ ttfr = 0.0
213
+ est_ppt = 0.0
214
+
215
+ if res.first_response_ts:
216
+ ttfr = res.first_response_ts - res.start_ts
217
+ agg_ttfr_values.append(ttfr)
218
+
219
+ if res.first_token_ts:
220
+ first_token_times.append(res.first_token_ts)
221
+ e2e_ttft = res.first_token_ts - res.start_ts
222
+ ttft = max(0, e2e_ttft - latency)
223
+ est_ppt = max(0, ttfr - latency)
224
+
225
+ agg_e2e_ttft_values.append(e2e_ttft)
226
+ agg_ttft_values.append(ttft)
227
+ agg_est_ppt_values.append(est_ppt)
228
+
229
+ # Individual Speeds
230
+ if est_ppt > 0:
231
+ pp_speed = prompt_tokens / est_ppt
232
+ agg_pp_speeds.append(pp_speed)
233
+
234
+ if res.total_tokens > 1 and res.first_token_ts:
235
+ decode_time = res.end_ts - res.first_token_ts
236
+ if decode_time > 0:
237
+ tg_speed = (res.total_tokens - 1) / decode_time
238
+ agg_tg_speeds.append(tg_speed)
239
+
240
+ # Batch-Level Throughput
241
+ if start_times and end_times and first_token_times:
242
+ min_start = min(start_times)
243
+ max_end = max(end_times)
244
+
245
+ max_first_token = max(first_token_times)
246
+ pp_duration = max_first_token - min_start
247
+
248
+ if pp_duration > 0:
249
+ batch_pp_throughput = batch_prompt_tokens / pp_duration
250
+ agg_batch_pp_throughputs.append(batch_pp_throughput)
251
+
252
+ min_first_token = min(first_token_times)
253
+
254
+ # Use max(last_token_times) instead of max(end_times) to remove protocol overhead (headers, [DONE], etc)
255
+ # This makes the throughput metric purely about token generation speed.
256
+ max_last_token = max(last_token_times) if last_token_times else max_end
257
+ tg_duration = max_last_token - min_first_token
258
+
259
+ if tg_duration > 0:
260
+ if batch_gen_tokens > len(valid_results):
261
+ batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
262
+ agg_batch_tg_throughputs.append(batch_tg_throughput)
263
+
264
+ if all_token_timestamps:
265
+ peak = self._calculate_peak_throughput(all_token_timestamps)
266
+ agg_peak_throughputs.append(peak)
267
+
268
+
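A worked example of the two batch-level formulas above, with made-up timings for a batch of two requests (times in seconds):

```python
# Prefill: tokens from all prompts divided by the span from the earliest
# request start to the latest first token.
batch_prompt_tokens = 2048            # 1024 prompt tokens per request
pp_throughput = batch_prompt_tokens / (0.80 - 0.00)       # 2560.0 t/s

# Decode: generated tokens (minus one per request, whose timing belongs to
# prefill) divided by the span from the earliest first token to the latest
# last token.
batch_gen_tokens = 256                # 128 generated tokens per request
tg_throughput = (batch_gen_tokens - 2) / (4.60 - 0.60)    # 63.5 t/s
```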
269
+ def _generate_rows(self) -> List[Dict[str, Any]]:
270
+ rows = []
271
+ for run in self.runs:
272
+ c_suffix = ""
273
+ if self.metadata and self.metadata.max_concurrency > 1:
274
+ c_suffix = f" (c{run.concurrency})"
275
+
276
+ if run.is_context_prefill_phase:
277
+ # Context Phase Prompt Processing
278
+ if run.pp_throughput:
279
+ rows.append({
280
+ "model": self.model_name or "Unknown",
281
+ "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
282
+ "t_s": run.pp_throughput,
283
+ "t_s_req": run.pp_req_throughput,
284
+ "peak_ts": None,
285
+ "ttfr": run.ttfr,
286
+ "est_ppt": run.est_ppt,
287
+ "e2e_ttft": run.e2e_ttft
288
+ })
289
+
290
+ # Context Phase Token Generation
291
+ if run.tg_throughput:
292
+ rows.append({
293
+ "model": self.model_name or "Unknown",
294
+ "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
295
+ "t_s": run.tg_throughput,
296
+ "t_s_req": run.tg_req_throughput,
297
+ "peak_ts": run.peak_throughput,
298
+ "ttfr": None,
299
+ "est_ppt": None,
300
+ "e2e_ttft": None
301
+ })
302
+ else:
303
+ # Standard Phase
304
+ d_suffix = f" @ d{run.context_size}" if run.context_size > 0 else ""
305
+
306
+ # Prompt Processing
307
+ if run.pp_throughput:
308
+ rows.append({
309
+ "model": self.model_name or "Unknown",
310
+ "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
311
+ "t_s": run.pp_throughput,
312
+ "t_s_req": run.pp_req_throughput,
313
+ "peak_ts": None,
314
+ "ttfr": run.ttfr,
315
+ "est_ppt": run.est_ppt,
316
+ "e2e_ttft": run.e2e_ttft
317
+ })
318
+
319
+ # Token Generation
320
+ if run.tg_throughput:
321
+ rows.append({
322
+ "model": self.model_name or "Unknown",
323
+ "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
324
+ "t_s": run.tg_throughput,
325
+ "t_s_req": run.tg_req_throughput,
326
+ "peak_ts": run.peak_throughput,
327
+ "ttfr": None,
328
+ "est_ppt": None,
329
+ "e2e_ttft": None
330
+ })
331
+ return rows
332
+
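For orientation, the `test_name` strings built above come out like this (sizes and concurrency are illustrative):

```python
examples = [
    "pp512",                # prompt processing, depth 0, single request
    "pp512 @ d1024 (c4)",   # 512-token prompt on a 1024-token context, 4 concurrent requests
    "tg128 @ d1024 (c4)",   # the matching token-generation row
    "ctx_pp @ d1024 (c4)",  # context-load phase when prefix caching is measured
]
# The "(cN)" suffix only appears when the suite's max concurrency is above 1.
```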
333
+ def _generate_md_report(self, concurrency: int) -> str:
334
+ rows = self._generate_rows()
335
+ if not rows:
336
+ return "No results collected. Check if the model is generating tokens."
337
+
338
+ def fmt(metric: Optional[BenchmarkMetric]) -> str:
339
+ if metric is None:
340
+ return ""
341
+ return f"{metric.mean:.2f} ± {metric.std:.2f}"
342
+
343
+ data = [[
344
+ row["model"],
345
+ row["test_name"],
346
+ fmt(row["t_s"]),
347
+ fmt(row["t_s_req"]),
348
+ fmt(row["peak_ts"]),
349
+ fmt(row["ttfr"]),
350
+ fmt(row["est_ppt"]),
351
+ fmt(row["e2e_ttft"])
352
+ ] for row in rows]
353
+
354
+ ts_header = "t/s (total)" if concurrency > 1 else "t/s"
355
+ headers = ["model", "test", ts_header, "t/s (req)", "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
356
+
357
+ if concurrency == 1:
358
+ data = [[
359
+ row["model"],
360
+ row["test_name"],
361
+ fmt(row["t_s"]),
362
+ fmt(row["peak_ts"]),
363
+ fmt(row["ttfr"]),
364
+ fmt(row["est_ppt"]),
365
+ fmt(row["e2e_ttft"])
366
+ ] for row in rows]
367
+ headers = ["model", "test", ts_header, "peak t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
368
+
369
+ return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right", "right"))
370
+
371
+ def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
372
+ msg = ""
373
+ if filename:
374
+ msg += f"Saving results to {filename} in {format.upper()} format...\n"
375
+ else:
376
+ msg += f"Printing results in {format.upper()} format:\n"
377
+
378
+ print(f"{msg}\n")
379
+
380
+ if format == "md":
381
+ output = self._generate_md_report(concurrency)
382
+ if filename:
383
+ with open(filename, "w") as f:
384
+ f.write(output)
385
+ else:
386
+ print("\n" + output)
387
+
388
+ elif format == "json":
389
+ data = asdict(self.metadata) if self.metadata else {}
390
+ data["benchmarks"] = [asdict(run) for run in self.runs]
391
+
392
+ if filename:
393
+ with open(filename, "w") as f:
394
+ json.dump(data, f, indent=2)
395
+ else:
396
+ print(json.dumps(data, indent=2))
397
+
398
+ elif format == "csv":
399
+ rows = self._generate_rows()
400
+ csv_rows = []
401
+ headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "peak_ts_mean", "peak_ts_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
402
+
403
+ for r in rows:
404
+ row = {
405
+ "model": r["model"],
406
+ "test_name": r["test_name"],
407
+ "t_s_mean": r["t_s"].mean if r["t_s"] else None,
408
+ "t_s_std": r["t_s"].std if r["t_s"] else None,
409
+ "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
410
+ "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
411
+ "peak_ts_mean": r["peak_ts"].mean if r["peak_ts"] else None,
412
+ "peak_ts_std": r["peak_ts"].std if r["peak_ts"] else None,
413
+ "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
414
+ "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
415
+ "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
416
+ "est_ppt_std": r["est_ppt"].std if r["est_ppt"] else None,
417
+ "e2e_ttft_mean": r["e2e_ttft"].mean if r["e2e_ttft"] else None,
418
+ "e2e_ttft_std": r["e2e_ttft"].std if r["e2e_ttft"] else None,
419
+ }
420
+ csv_rows.append(row)
421
+
422
+ output_file = filename if filename else sys.stdout
423
+ is_file = isinstance(output_file, str)
424
+
425
+ if is_file:
426
+ with open(output_file, "w", newline="") as f:
427
+ writer = csv.DictWriter(f, fieldnames=headers)
428
+ writer.writeheader()
429
+ writer.writerows(csv_rows)
430
+ else:
431
+ writer = csv.DictWriter(sys.stdout, fieldnames=headers)
432
+ writer.writeheader()
433
+ writer.writerows(csv_rows)
434
+
435
+
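Pulling the pieces together, the class above is exercised roughly as follows. The module path and, in particular, the `RequestResult` keyword arguments are assumptions inferred from the attributes read in `_process_batch`; the client's real constructor is not shown in this diff.

```python
from llama_benchy.results import BenchmarkResults   # assumed module name
from llama_benchy.client import RequestResult       # constructor fields assumed below

results = BenchmarkResults()

# One request: 512 prompt tokens, 128 generated tokens streamed every 20 ms.
req = RequestResult(
    start_ts=0.0,
    first_response_ts=0.15,
    first_token_ts=0.20,
    end_ts=2.74,
    token_timestamps=[0.20 + 0.02 * i for i in range(128)],
    prompt_tokens=512,
    total_tokens=128,
    error=None,
)

# One configuration, one run, one batch containing that single request.
results.add(
    model="my-model", pp=512, tg=128, depth=0, concurrency=1,
    run_results=[[req]], latency=0.05, expected_pp_tokens=512,
)
results.save_report(None, "md", concurrency=1)   # prints a Markdown table to stdout
```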
llama_benchy/runner.py ADDED
@@ -0,0 +1,155 @@
1
+ import asyncio
2
+ import subprocess
3
+ import time
4
+ from datetime import datetime, timezone
5
+ from typing import List
6
+ import aiohttp
7
+
8
+ from ._version import __version__
9
+ from .config import BenchmarkConfig
10
+ from .client import LLMClient
11
+ from .prompts import PromptGenerator
12
+ from .results import BenchmarkResults, BenchmarkMetadata
13
+
14
+ class BenchmarkRunner:
15
+ def __init__(self, config: BenchmarkConfig, client: LLMClient, prompt_generator: PromptGenerator):
16
+ self.config = config
17
+ self.client = client
18
+ self.prompt_gen = prompt_generator
19
+ self.results = BenchmarkResults()
20
+
21
+ # We need to track deltas from warmup to adapt prompts
22
+ self.delta_user = 0
23
+ self.delta_context = 0
24
+
25
+ async def run_suite(self):
26
+ # Initialize session
27
+ timeout = aiohttp.ClientTimeout(total=3600)
28
+ max_concurrency = max(self.config.concurrency_levels)
29
+ connector = aiohttp.TCPConnector(limit=max_concurrency + 5, force_close=False, keepalive_timeout=600)
30
+
31
+ async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
32
+ # Warmup
33
+ should_warmup = not self.config.no_warmup
34
+ if self.config.adapt_prompt:
35
+ should_warmup = True
36
+
37
+ if should_warmup:
38
+ tokenizer = self.prompt_gen.corpus.get_tokenizer() if self.config.adapt_prompt else None
39
+ self.delta_user, self.delta_context = await self.client.warmup(session, tokenizer)
40
+
41
+ # Measure latency
42
+ latency = await self.client.measure_latency(session, self.config.latency_mode)
43
+
44
+ # Main Loop
45
+ for depth in self.config.depths:
46
+ for pp in self.config.pp_counts:
47
+ for tg in self.config.tg_counts:
48
+ for concurrency in self.config.concurrency_levels:
49
+ print(f"Running test: pp={pp}, tg={tg}, depth={depth}, concurrency={concurrency}")
50
+
51
+ run_std_results = []
52
+ run_ctx_results = []
53
+ expected_pp = pp
54
+ expected_ctx = depth
55
+
56
+ for run in range(self.config.num_runs):
57
+
58
+ # Adapt prompt tokens
59
+ current_pp = pp
60
+ current_depth = depth
61
+ if self.config.adapt_prompt:
62
+ if depth == 0:
63
+ current_pp = max(1, pp - self.delta_user)
64
+ else:
65
+ current_depth = max(1, depth - self.delta_context)
66
+
67
+ expected_pp = current_pp
68
+ expected_ctx = current_depth
69
+
70
+ prompt_batch = self.prompt_gen.generate_batch(
71
+ concurrency,
72
+ current_pp,
73
+ current_depth,
74
+ self.config.no_cache
75
+ )
76
+
77
+ if self.config.enable_prefix_caching and depth > 0:
78
+ # Phase 1: Context Load
79
+ print(f" Run {run+1}/{self.config.num_runs} (Context Load, batch size {concurrency})...")
80
+ load_tasks = []
81
+ for i in range(concurrency):
82
+ context, _ = prompt_batch[i]
83
+ load_tasks.append(self.client.run_generation(
84
+ session,
85
+ context_text=context,
86
+ prompt_text="",
87
+ max_tokens=tg,
88
+ no_cache=self.config.no_cache
89
+ ))
90
+
91
+ load_results = await asyncio.gather(*load_tasks)
92
+ run_ctx_results.append(load_results)
93
+
94
+ # Phase 2: Inference
95
+ print(f" Run {run+1}/{self.config.num_runs} (Inference, batch size {concurrency})...")
96
+ inf_tasks = []
97
+ for i in range(concurrency):
98
+ context, prompt = prompt_batch[i]
99
+ inf_tasks.append(self.client.run_generation(
100
+ session,
101
+ context_text=context,
102
+ prompt_text=prompt,
103
+ max_tokens=tg,
104
+ no_cache=self.config.no_cache
105
+ ))
106
+
107
+ batch_results = await asyncio.gather(*inf_tasks)
108
+ run_std_results.append(batch_results)
109
+
110
+ else:
111
+ # Standard Run
112
+ print(f" Run {run+1}/{self.config.num_runs} (batch size {concurrency})...")
113
+ expected_tokens = current_pp + current_depth
114
+ batch_tasks = []
115
+ for i in range(concurrency):
116
+ context, prompt = prompt_batch[i]
117
+ batch_tasks.append(self.client.run_generation(
118
+ session,
119
+ context_text=context,
120
+ prompt_text=prompt,
121
+ max_tokens=tg,
122
+ no_cache=self.config.no_cache
123
+ ))
124
+
125
+ batch_results = await asyncio.gather(*batch_tasks)
126
+ run_std_results.append(batch_results)
127
+
128
+
129
+ # Post Run Command
130
+ if self.config.post_run_cmd:
131
+ try:
132
+ subprocess.run(self.config.post_run_cmd, shell=True, check=True)
133
+ except subprocess.CalledProcessError as e:
134
+ print(f"Post-run command failed: {e}")
135
+
136
+ # Aggregate and Record
137
+ if self.config.enable_prefix_caching and depth > 0:
138
+ self.results.add(self.config.model, pp, tg, depth, concurrency, run_ctx_results, latency, expected_ctx, is_context_phase=True)
139
+ self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp, is_context_phase=False)
140
+ else:
141
+ # Standard run expected tokens = pp + depth (usually depth=0 or concatenated)
142
+ # In the loop above: expected_tokens = current_pp + current_depth
143
+ self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp + expected_ctx, is_context_phase=False)
144
+
145
+ self.results.metadata = BenchmarkMetadata(
146
+ version=__version__,
147
+ timestamp=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ"),
148
+ latency_mode=self.config.latency_mode,
149
+ latency_ms=latency * 1000,
150
+ model=self.config.model,
151
+ prefix_caching_enabled=self.config.enable_prefix_caching,
152
+ max_concurrency=max(self.config.concurrency_levels) if self.config.concurrency_levels else 1
153
+ )
154
+
155
+ self.results.save_report(self.config.save_result, self.config.result_format, max(self.config.concurrency_levels) if self.config.concurrency_levels else 1)
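When the configured result format is `json`, the report written by that final `save_report` call has the shape below, derived from the `asdict` serialization in the results module; the values shown are illustrative only.

```python
report = {
    "version": "0.2.1",
    "timestamp": "2026-01-02 12:00:00Z",
    "latency_mode": "...",            # whatever --latency-mode was set to
    "latency_ms": 2.5,
    "model": "my-model",
    "prefix_caching_enabled": False,
    "max_concurrency": 4,
    "benchmarks": [
        {
            "concurrency": 4,
            "context_size": 0,
            "prompt_size": 512,
            "response_size": 128,
            "is_context_prefill_phase": False,
            # Each metric is {"mean": ..., "std": ..., "values": [...]} or None.
            "pp_throughput": {"mean": 2560.0, "std": 35.0, "values": [2525.0, 2595.0]},
            "tg_throughput": {"mean": 50.0, "std": 1.2, "values": [48.8, 51.2]},
            # pp_req_throughput, tg_req_throughput, peak_throughput, ttfr,
            # est_ppt and e2e_ttft follow the same structure.
        },
    ],
}
```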
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: llama-benchy
3
- Version: 0.1.2
3
+ Version: 0.2.1
4
4
  Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
5
5
  Project-URL: Homepage, https://github.com/eugr/llama-benchy
6
6
  Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
@@ -43,6 +43,12 @@ Requires-Dist: openai
43
43
  Requires-Dist: requests
44
44
  Requires-Dist: tabulate
45
45
  Requires-Dist: transformers
46
+ Provides-Extra: dev
47
+ Requires-Dist: fastapi; extra == 'dev'
48
+ Requires-Dist: pydantic; extra == 'dev'
49
+ Requires-Dist: pytest; extra == 'dev'
50
+ Requires-Dist: pytest-asyncio; extra == 'dev'
51
+ Requires-Dist: uvicorn; extra == 'dev'
46
52
  Description-Content-Type: text/markdown
47
53
 
48
54
  # llama-benchy - llama-bench style benchmarking tool for all backends
@@ -75,12 +81,12 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
75
81
  - Downloads a book from Project Gutenberg to use as source text for prompts to ensure better benchmarking of spec.decoding/MTP models.
76
82
  - Supports executing a command after each run (e.g., to clear cache).
77
83
  - Configurable latency measurement mode.
84
+ - Supports concurrent requests (`--concurrency`) to measure throughput under load.
85
+ - Can save results to file in Markdown, JSON, or CSV format.
78
86
 
79
87
  # Current Limitations
80
88
 
81
89
  - Evaluates against `/v1/chat/completions` endpoint only.
82
- - Doesn't measure throughput in concurrency mode (coming later).
83
- - Outputs results as a Markdown table only for now.
84
90
 
85
91
  ## Installation
86
92
 
@@ -215,6 +221,9 @@ Generally you don't need to disable prompt caching on the server, as a probabili
215
221
  - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
216
222
  - `--no-adapt-prompt`: Disable prompt size adaptation.
217
223
  - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
224
+ - `--concurrency`: List of concurrency levels, i.e. the number of concurrent requests per test (Default: [1]).
225
+ - `--save-result`: File to save results to.
226
+ - `--format`: Output format: 'md', 'json', 'csv' (Default: 'md').
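For example, a sweep over several concurrency levels that writes a JSON report could look like the following. The space-separated list syntax for `--concurrency` is inferred from the description above, and the endpoint/model flags from the earlier examples are omitted, so treat this as a sketch rather than a copy-paste command:

```bash
llama-benchy \
  --concurrency 1 2 4 \
  --save-result results.json \
  --format json
```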
218
227
 
219
228
  ### Metrics
220
229
 
@@ -230,6 +239,9 @@ The script attempts to estimate network or processing latency to provide "server
230
239
 
231
240
  #### Table Columns
232
241
 
242
+ - When `concurrency` > 1:
243
+ - **`t/s (total)`**: Total throughput across all concurrent requests.
244
+ - **`t/s (req)`**: Average throughput per individual request.
233
245
  - **`t/s` (Tokens per Second)**:
234
246
  - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
235
247
  - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
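A quick worked example of the two formulas above, with made-up numbers (times in seconds):

```python
# Prompt processing: 512 prompt tokens with est_ppt = 0.20 s -> 2560 t/s prefill.
pp_ts = 512 / 0.20

# Token generation: 128 tokens, first token at 0.20 s, last at 2.74 s
# -> (128 - 1) / (2.74 - 0.20) = 50 t/s decode.
tg_ts = (128 - 1) / (2.74 - 0.20)
```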
@@ -269,3 +281,31 @@ llama-benchy \
269
281
  ```
270
282
 
271
283
  This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
284
+
285
+ ## Development
286
+
287
+ ### Running Integration Tests
288
+
289
+ This repository includes a mock server and an integration test suite to verify `llama-benchy` logic without needing a real GPU server.
290
+
291
+ The mock server emulates:
292
+ - **Prompt Processing (PP):** ~1000 t/s (drift-corrected).
293
+ - **Token Generation (TG):** ~50 t/s.
294
+ - **Prefix Caching:** Emulates cache hits by skipping processing time for cached prefixes (system messages).
295
+ - **OpenAI API Compatibility:** Serves `/v1/chat/completions` and `/v1/models`.
296
+
297
+ To run the integration tests:
298
+
299
+ ```bash
300
+ # Install development dependencies
301
+ uv sync --all-extras --dev
302
+
303
+ # Run tests
304
+ uv run pytest tests/test_mock_integration.py
305
+ ```
306
+
307
+ This test will:
308
+ 1. Spin up the mock server on port 8001.
309
+ 2. Run `llama-benchy` against it.
310
+ 3. Parse the JSON output.
311
+ 4. Verify that throughputs match the emulated speeds (PP ~1000, TG ~50) and that caching effectively increases effective throughput.