llama-benchy 0.1.1-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama_benchy/results.py ADDED
@@ -0,0 +1,374 @@
+ import numpy as np
+ from tabulate import tabulate
+ from typing import List, Dict, Any, Optional
+ from dataclasses import dataclass, asdict
+ import json
+ import csv
+ import sys
+
+ from .client import RequestResult
+
+ @dataclass
+ class BenchmarkMetric:
+     mean: float
+     std: float
+     values: List[float]
+
+ @dataclass
+ class BenchmarkMetadata:
+     version: str
+     timestamp: str
+     latency_mode: str
+     latency_ms: float
+     model: str
+     prefix_caching_enabled: bool
+     max_concurrency: int
+
+ @dataclass
+ class BenchmarkRun:
+     concurrency: int
+     context_size: int
+     prompt_size: int
+     response_size: int
+     is_context_prefill_phase: bool
+
+     # Metrics (using BenchmarkMetric)
+     pp_throughput: Optional[BenchmarkMetric]
+     pp_req_throughput: Optional[BenchmarkMetric]
+     tg_throughput: Optional[BenchmarkMetric]
+     tg_req_throughput: Optional[BenchmarkMetric]
+     ttfr: Optional[BenchmarkMetric]
+     est_ppt: Optional[BenchmarkMetric]
+     e2e_ttft: Optional[BenchmarkMetric]
+
+ class BenchmarkResults:
+     def __init__(self):
+         self.runs: List[BenchmarkRun] = []
+         self.metadata: Optional[BenchmarkMetadata] = None
+         self.model_name: Optional[str] = None
+
+     def _calculate_metric(self, values: List[float], multiplier: float = 1.0) -> Optional[BenchmarkMetric]:
+         if not values:
+             return None
+         scaled_values = [v * multiplier for v in values]
+         return BenchmarkMetric(
+             mean=np.mean(values) * multiplier,
+             std=np.std(values) * multiplier,
+             values=scaled_values
+         )
+
+     def add(self,
+             model: str,
+             pp: int,
+             tg: int,
+             depth: int,
+             concurrency: int,
+             run_results: List[List[RequestResult]],  # List of batches (one batch per run)
+             latency: float,
+             expected_pp_tokens: int,
+             is_context_phase: bool = False):
+
+         if self.model_name is None:
+             self.model_name = model
+
+         # Aggregators
+         agg_pp_speeds = []
+         agg_tg_speeds = []
+         agg_ttft_values = []
+         agg_ttfr_values = []
+         agg_est_ppt_values = []
+         agg_e2e_ttft_values = []
+
+         agg_batch_pp_throughputs = []
+         agg_batch_tg_throughputs = []
+
+         for batch in run_results:
+             self._process_batch(
+                 batch,
+                 expected_pp_tokens,
+                 latency,
+                 agg_pp_speeds,
+                 agg_tg_speeds,
+                 agg_ttft_values,
+                 agg_ttfr_values,
+                 agg_est_ppt_values,
+                 agg_e2e_ttft_values,
+                 agg_batch_pp_throughputs,
+                 agg_batch_tg_throughputs
+             )
+
+         # Calculate metrics for BenchmarkRun
+         run_metric_pp_throughput = self._calculate_metric(agg_batch_pp_throughputs if concurrency > 1 else agg_pp_speeds)
+         run_metric_pp_req_throughput = run_metric_pp_throughput if concurrency == 1 else self._calculate_metric(agg_pp_speeds)
+
+         run_metric_tg_throughput = self._calculate_metric(agg_batch_tg_throughputs if concurrency > 1 else agg_tg_speeds)
+         run_metric_tg_req_throughput = run_metric_tg_throughput if concurrency == 1 else self._calculate_metric(agg_tg_speeds)
+
+         run_metric_ttfr = self._calculate_metric(agg_ttfr_values, 1000)
+         run_metric_est_ppt = self._calculate_metric(agg_est_ppt_values, 1000)
+         run_metric_e2e_ttft = self._calculate_metric(agg_e2e_ttft_values, 1000)
+
+         self.runs.append(BenchmarkRun(
+             concurrency=concurrency,
+             context_size=depth,
+             prompt_size=pp,  # Configured prompt size
+             response_size=tg,
+             is_context_prefill_phase=is_context_phase,
+             pp_throughput=run_metric_pp_throughput,
+             pp_req_throughput=run_metric_pp_req_throughput,
+             tg_throughput=run_metric_tg_throughput,
+             tg_req_throughput=run_metric_tg_req_throughput,
+             ttfr=run_metric_ttfr,
+             est_ppt=run_metric_est_ppt,
+             e2e_ttft=run_metric_e2e_ttft
+         ))
+
+     def _process_batch(self,
+                        results: List[RequestResult],
+                        expected_pp_tokens: int,
+                        latency: float,
+                        agg_pp_speeds: List[float],
+                        agg_tg_speeds: List[float],
+                        agg_ttft_values: List[float],
+                        agg_ttfr_values: List[float],
+                        agg_est_ppt_values: List[float],
+                        agg_e2e_ttft_values: List[float],
+                        agg_batch_pp_throughputs: List[float],
+                        agg_batch_tg_throughputs: List[float]):
+
+         valid_results = [r for r in results if r and not r.error]
+         if not valid_results:
+             return
+
+         batch_prompt_tokens = 0
+         batch_gen_tokens = 0
+
+         start_times = []
+         end_times = []
+         first_token_times = []
+
+         for res in valid_results:
+             start_times.append(res.start_ts)
+             end_times.append(res.end_ts)
+
+             # Use reported usage if available and reasonable, else expected
+             prompt_tokens = expected_pp_tokens
+             if res.prompt_tokens > 0:
+                 diff = abs(res.prompt_tokens - expected_pp_tokens)
+                 if diff < expected_pp_tokens * 0.2:
+                     prompt_tokens = res.prompt_tokens
+
+             batch_prompt_tokens += prompt_tokens
+             batch_gen_tokens += res.total_tokens
+
+             # Metrics Calculation
+             ttft = 0.0
+             e2e_ttft = 0.0
+             ttfr = 0.0
+             est_ppt = 0.0
+
+             if res.first_response_ts:
+                 ttfr = res.first_response_ts - res.start_ts
+                 agg_ttfr_values.append(ttfr)
+
+             if res.first_token_ts:
+                 first_token_times.append(res.first_token_ts)
+                 e2e_ttft = res.first_token_ts - res.start_ts
+                 ttft = max(0, e2e_ttft - latency)
+                 est_ppt = max(0, ttfr - latency)
+
+                 agg_e2e_ttft_values.append(e2e_ttft)
+                 agg_ttft_values.append(ttft)
+                 agg_est_ppt_values.append(est_ppt)
+
+             # Individual Speeds
+             if est_ppt > 0:
+                 pp_speed = prompt_tokens / est_ppt
+                 agg_pp_speeds.append(pp_speed)
+
+             if res.total_tokens > 1 and res.first_token_ts:
+                 decode_time = res.end_ts - res.first_token_ts
+                 if decode_time > 0:
+                     tg_speed = (res.total_tokens - 1) / decode_time
+                     agg_tg_speeds.append(tg_speed)
+
+         # Batch-Level Throughput
+         if start_times and end_times and first_token_times:
+             min_start = min(start_times)
+             max_end = max(end_times)
+
+             max_first_token = max(first_token_times)
+             pp_duration = max_first_token - min_start
+
+             if pp_duration > 0:
+                 batch_pp_throughput = batch_prompt_tokens / pp_duration
+                 agg_batch_pp_throughputs.append(batch_pp_throughput)
+
+             min_first_token = min(first_token_times)
+             tg_duration = max_end - min_first_token
+
+             if tg_duration > 0:
+                 if batch_gen_tokens > len(valid_results):
+                     batch_tg_throughput = (batch_gen_tokens - len(valid_results)) / tg_duration
+                     agg_batch_tg_throughputs.append(batch_tg_throughput)
+
+
+     def _generate_rows(self) -> List[Dict[str, Any]]:
+         rows = []
+         for run in self.runs:
+             c_suffix = ""
+             if self.metadata and self.metadata.max_concurrency > 1:
+                 c_suffix = f" (c{run.concurrency})"
+
+             if run.is_context_prefill_phase:
+                 # Context Phase Prompt Processing
+                 if run.pp_throughput:
+                     rows.append({
+                         "model": self.model_name or "Unknown",
+                         "test_name": f"ctx_pp @ d{run.context_size}{c_suffix}",
+                         "t_s": run.pp_throughput,
+                         "t_s_req": run.pp_req_throughput,
+                         "ttfr": run.ttfr,
+                         "est_ppt": run.est_ppt,
+                         "e2e_ttft": run.e2e_ttft
+                     })
+
+                 # Context Phase Token Generation
+                 if run.tg_throughput:
+                     rows.append({
+                         "model": self.model_name or "Unknown",
+                         "test_name": f"ctx_tg @ d{run.context_size}{c_suffix}",
+                         "t_s": run.tg_throughput,
+                         "t_s_req": run.tg_req_throughput,
+                         "ttfr": None,
+                         "est_ppt": None,
+                         "e2e_ttft": None
+                     })
+             else:
+                 # Standard Phase
+                 d_suffix = f" @ d{run.context_size}" if run.context_size > 0 else ""
+
+                 # Prompt Processing
+                 if run.pp_throughput:
+                     rows.append({
+                         "model": self.model_name or "Unknown",
+                         "test_name": f"pp{run.prompt_size}{d_suffix}{c_suffix}",
+                         "t_s": run.pp_throughput,
+                         "t_s_req": run.pp_req_throughput,
+                         "ttfr": run.ttfr,
+                         "est_ppt": run.est_ppt,
+                         "e2e_ttft": run.e2e_ttft
+                     })
+
+                 # Token Generation
+                 if run.tg_throughput:
+                     rows.append({
+                         "model": self.model_name or "Unknown",
+                         "test_name": f"tg{run.response_size}{d_suffix}{c_suffix}",
+                         "t_s": run.tg_throughput,
+                         "t_s_req": run.tg_req_throughput,
+                         "ttfr": None,
+                         "est_ppt": None,
+                         "e2e_ttft": None
+                     })
+         return rows
+
+     def _generate_md_report(self, concurrency: int) -> str:
+         rows = self._generate_rows()
+         if not rows:
+             return "No results collected. Check if the model is generating tokens."
+
+         def fmt(metric: Optional[BenchmarkMetric]) -> str:
+             if metric is None:
+                 return ""
+             return f"{metric.mean:.2f} ± {metric.std:.2f}"
+
+         data = [[
+             row["model"],
+             row["test_name"],
+             fmt(row["t_s"]),
+             fmt(row["t_s_req"]),
+             fmt(row["ttfr"]),
+             fmt(row["est_ppt"]),
+             fmt(row["e2e_ttft"])
+         ] for row in rows]
+
+         ts_header = "t/s (total)" if concurrency > 1 else "t/s"
+         headers = ["model", "test", ts_header, "t/s (req)", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+
+         if concurrency == 1:
+             data = [[
+                 row["model"],
+                 row["test_name"],
+                 fmt(row["t_s"]),
+                 fmt(row["ttfr"]),
+                 fmt(row["est_ppt"]),
+                 fmt(row["e2e_ttft"])
+             ] for row in rows]
+             headers = ["model", "test", ts_header, "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
+
+         return tabulate(data, headers=headers, tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right", "right") if concurrency > 1 else ("left", "right", "right", "right", "right", "right"))
+
+     def save_report(self, filename: Optional[str], format: str, concurrency: int = 1):
+         msg = ""
+         if filename:
+             msg += f"Saving results to {filename} in {format.upper()} format...\n"
+         else:
+             msg += f"Printing results in {format.upper()} format:\n"
+
+         print(f"{msg}\n")
+
+         if format == "md":
+             output = self._generate_md_report(concurrency)
+             if filename:
+                 with open(filename, "w") as f:
+                     f.write(output)
+             else:
+                 print("\n" + output)
+
+         elif format == "json":
+             data = asdict(self.metadata) if self.metadata else {}
+             data["benchmarks"] = [asdict(run) for run in self.runs]
+
+             if filename:
+                 with open(filename, "w") as f:
+                     json.dump(data, f, indent=2)
+             else:
+                 print(json.dumps(data, indent=2))
+
+         elif format == "csv":
+             rows = self._generate_rows()
+             csv_rows = []
+             headers = ["model", "test_name", "t_s_mean", "t_s_std", "t_s_req_mean", "t_s_req_std", "ttfr_mean", "ttfr_std", "est_ppt_mean", "est_ppt_std", "e2e_ttft_mean", "e2e_ttft_std"]
+
+             for r in rows:
+                 row = {
+                     "model": r["model"],
+                     "test_name": r["test_name"],
+                     "t_s_mean": r["t_s"].mean if r["t_s"] else None,
+                     "t_s_std": r["t_s"].std if r["t_s"] else None,
+                     "t_s_req_mean": r["t_s_req"].mean if r["t_s_req"] else None,
+                     "t_s_req_std": r["t_s_req"].std if r["t_s_req"] else None,
+                     "ttfr_mean": r["ttfr"].mean if r["ttfr"] else None,
+                     "ttfr_std": r["ttfr"].std if r["ttfr"] else None,
+                     "est_ppt_mean": r["est_ppt"].mean if r["est_ppt"] else None,
+                     "est_ppt_std": r["est_ppt"].std if r["est_ppt"] else None,
+                     "e2e_ttft_mean": r["e2e_ttft"].mean if r["e2e_ttft"] else None,
+                     "e2e_ttft_std": r["e2e_ttft"].std if r["e2e_ttft"] else None,
+                 }
+                 csv_rows.append(row)
+
+             output_file = filename if filename else sys.stdout
+             is_file = isinstance(output_file, str)
+
+             if is_file:
+                 with open(output_file, "w", newline="") as f:
+                     writer = csv.DictWriter(f, fieldnames=headers)
+                     writer.writeheader()
+                     writer.writerows(csv_rows)
+             else:
+                 writer = csv.DictWriter(sys.stdout, fieldnames=headers)
+                 writer.writeheader()
+                 writer.writerows(csv_rows)
+
+
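For orientation, here is a minimal usage sketch of the class above. It is illustrative only: `FakeResult` is a hypothetical stand-in that merely mimics the `RequestResult` attributes `_process_batch` reads (`start_ts`, `first_response_ts`, `first_token_ts`, `end_ts`, `prompt_tokens`, `total_tokens`, `error`), and every timing is invented.

```python
from dataclasses import dataclass

from llama_benchy.results import BenchmarkResults

@dataclass
class FakeResult:
    # Hypothetical stand-in for client.RequestResult; only the fields read by
    # BenchmarkResults._process_batch are present, with made-up timestamps.
    start_ts: float
    first_response_ts: float
    first_token_ts: float
    end_ts: float
    prompt_tokens: int
    total_tokens: int
    error: str = ""

results = BenchmarkResults()
# One run at concurrency 1: 512 prompt tokens prefilled in ~0.5 s,
# then 64 tokens decoded over 1.26 s.
batch = [FakeResult(start_ts=0.0, first_response_ts=0.52, first_token_ts=0.54,
                    end_ts=1.80, prompt_tokens=512, total_tokens=64)]
results.add(model="demo", pp=512, tg=64, depth=0, concurrency=1,
            run_results=[batch], latency=0.02, expected_pp_tokens=512)
results.save_report(None, "md", concurrency=1)  # prints a Markdown table to stdout
```

With these numbers, `est_ppt` works out to 0.50 s, so the pp row reports roughly 1024 t/s and the tg row roughly 63 / 1.26 = 50 t/s.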
llama_benchy/runner.py ADDED
@@ -0,0 +1,155 @@
+ import asyncio
+ import subprocess
+ import time
+ from datetime import datetime, timezone
+ from typing import List
+ import aiohttp
+
+ from ._version import __version__
+ from .config import BenchmarkConfig
+ from .client import LLMClient
+ from .prompts import PromptGenerator
+ from .results import BenchmarkResults, BenchmarkMetadata
+
+ class BenchmarkRunner:
+     def __init__(self, config: BenchmarkConfig, client: LLMClient, prompt_generator: PromptGenerator):
+         self.config = config
+         self.client = client
+         self.prompt_gen = prompt_generator
+         self.results = BenchmarkResults()
+
+         # We need to track deltas from warmup to adapt prompts
+         self.delta_user = 0
+         self.delta_context = 0
+
+     async def run_suite(self):
+         # Initialize session
+         timeout = aiohttp.ClientTimeout(total=3600)
+         max_concurrency = max(self.config.concurrency_levels)
+         connector = aiohttp.TCPConnector(limit=max_concurrency + 5, force_close=False, keepalive_timeout=600)
+
+         async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
+             # Warmup
+             should_warmup = not self.config.no_warmup
+             if self.config.adapt_prompt:
+                 should_warmup = True
+
+             if should_warmup:
+                 tokenizer = self.prompt_gen.corpus.get_tokenizer() if self.config.adapt_prompt else None
+                 self.delta_user, self.delta_context = await self.client.warmup(session, tokenizer)
+
+             # Measure latency
+             latency = await self.client.measure_latency(session, self.config.latency_mode)
+
+             # Main Loop
+             for depth in self.config.depths:
+                 for pp in self.config.pp_counts:
+                     for tg in self.config.tg_counts:
+                         for concurrency in self.config.concurrency_levels:
+                             print(f"Running test: pp={pp}, tg={tg}, depth={depth}, concurrency={concurrency}")
+
+                             run_std_results = []
+                             run_ctx_results = []
+                             expected_pp = pp
+                             expected_ctx = depth
+
+                             for run in range(self.config.num_runs):
+
+                                 # Adapt prompt tokens
+                                 current_pp = pp
+                                 current_depth = depth
+                                 if self.config.adapt_prompt:
+                                     if depth == 0:
+                                         current_pp = max(1, pp - self.delta_user)
+                                     else:
+                                         current_depth = max(1, depth - self.delta_context)
+
+                                 expected_pp = current_pp
+                                 expected_ctx = current_depth
+
+                                 prompt_batch = self.prompt_gen.generate_batch(
+                                     concurrency,
+                                     current_pp,
+                                     current_depth,
+                                     self.config.no_cache
+                                 )
+
+                                 if self.config.enable_prefix_caching and depth > 0:
+                                     # Phase 1: Context Load
+                                     print(f"  Run {run+1}/{self.config.num_runs} (Context Load, batch size {concurrency})...")
+                                     load_tasks = []
+                                     for i in range(concurrency):
+                                         context, _ = prompt_batch[i]
+                                         load_tasks.append(self.client.run_generation(
+                                             session,
+                                             context_text=context,
+                                             prompt_text="",
+                                             max_tokens=tg,
+                                             no_cache=self.config.no_cache
+                                         ))
+
+                                     load_results = await asyncio.gather(*load_tasks)
+                                     run_ctx_results.append(load_results)
+
+                                     # Phase 2: Inference
+                                     print(f"  Run {run+1}/{self.config.num_runs} (Inference, batch size {concurrency})...")
+                                     inf_tasks = []
+                                     for i in range(concurrency):
+                                         context, prompt = prompt_batch[i]
+                                         inf_tasks.append(self.client.run_generation(
+                                             session,
+                                             context_text=context,
+                                             prompt_text=prompt,
+                                             max_tokens=tg,
+                                             no_cache=self.config.no_cache
+                                         ))
+
+                                     batch_results = await asyncio.gather(*inf_tasks)
+                                     run_std_results.append(batch_results)
+
+                                 else:
+                                     # Standard Run
+                                     print(f"  Run {run+1}/{self.config.num_runs} (batch size {concurrency})...")
+                                     expected_tokens = current_pp + current_depth
+                                     batch_tasks = []
+                                     for i in range(concurrency):
+                                         context, prompt = prompt_batch[i]
+                                         batch_tasks.append(self.client.run_generation(
+                                             session,
+                                             context_text=context,
+                                             prompt_text=prompt,
+                                             max_tokens=tg,
+                                             no_cache=self.config.no_cache
+                                         ))
+
+                                     batch_results = await asyncio.gather(*batch_tasks)
+                                     run_std_results.append(batch_results)
+
+
+                                 # Post Run Command
+                                 if self.config.post_run_cmd:
+                                     try:
+                                         subprocess.run(self.config.post_run_cmd, shell=True, check=True)
+                                     except subprocess.CalledProcessError as e:
+                                         print(f"Post-run command failed: {e}")
+
+                             # Aggregate and Record
+                             if self.config.enable_prefix_caching and depth > 0:
+                                 self.results.add(self.config.model, pp, tg, depth, concurrency, run_ctx_results, latency, expected_ctx, is_context_phase=True)
+                                 self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp, is_context_phase=False)
+                             else:
+                                 # Standard run expected tokens = pp + depth (usually depth=0 or concatenated)
+                                 # In the loop above: expected_tokens = current_pp + current_depth
+                                 self.results.add(self.config.model, pp, tg, depth, concurrency, run_std_results, latency, expected_pp + expected_ctx, is_context_phase=False)
+
+         self.results.metadata = BenchmarkMetadata(
+             version=__version__,
+             timestamp=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%SZ"),
+             latency_mode=self.config.latency_mode,
+             latency_ms=latency * 1000,
+             model=self.config.model,
+             prefix_caching_enabled=self.config.enable_prefix_caching,
+             max_concurrency=max(self.config.concurrency_levels) if self.config.concurrency_levels else 1
+         )
+
+         self.results.save_report(self.config.save_result, self.config.result_format, max(self.config.concurrency_levels) if self.config.concurrency_levels else 1)
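One detail worth calling out in `run_suite`: each concurrency level fires one `run_generation` task per slot and waits for the whole batch with `asyncio.gather`, so a batch's wall time is bounded by its slowest request. The sketch below isolates just that scheduling pattern; `fake_generation` is a hypothetical stand-in with invented delays, not the package's `LLMClient.run_generation`.

```python
import asyncio
import time

async def fake_generation(prefill_s: float, decode_s: float):
    # Hypothetical request: sleep through a "prefill" phase, then a "decode" phase,
    # returning (start, first_token, end) timestamps.
    start = time.perf_counter()
    await asyncio.sleep(prefill_s)
    first_token = time.perf_counter()
    await asyncio.sleep(decode_s)
    return start, first_token, time.perf_counter()

async def run_batch(concurrency: int):
    # One task per concurrency slot, collected with asyncio.gather,
    # mirroring the batch launches in run_suite above.
    tasks = [fake_generation(0.5, 1.0) for _ in range(concurrency)]
    results = await asyncio.gather(*tasks)
    batch_start = min(r[0] for r in results)
    batch_end = max(r[2] for r in results)
    print(f"concurrency={concurrency}: batch wall time {batch_end - batch_start:.2f}s")

asyncio.run(run_batch(4))
```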
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llama-benchy
- Version: 0.1.1
+ Version: 0.2.0
  Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
  Project-URL: Homepage, https://github.com/eugr/llama-benchy
  Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
@@ -43,6 +43,12 @@ Requires-Dist: openai
  Requires-Dist: requests
  Requires-Dist: tabulate
  Requires-Dist: transformers
+ Provides-Extra: dev
+ Requires-Dist: fastapi; extra == 'dev'
+ Requires-Dist: pydantic; extra == 'dev'
+ Requires-Dist: pytest; extra == 'dev'
+ Requires-Dist: pytest-asyncio; extra == 'dev'
+ Requires-Dist: uvicorn; extra == 'dev'
  Description-Content-Type: text/markdown

  # llama-benchy - llama-bench style benchmarking tool for all backends
@@ -75,12 +81,12 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
  - Downloads a book from Project Gutenberg to use as source text for prompts to ensure better benchmarking of spec.decoding/MTP models.
  - Supports executing a command after each run (e.g., to clear cache).
  - Configurable latency measurement mode.
+ - Supports concurrent requests (`--concurrency`) to measure throughput under load.
+ - Can save results to file in Markdown, JSON, or CSV format.

  # Current Limitations

  - Evaluates against `/v1/chat/completions` endpoint only.
- - Doesn't measure throughput in concurrency mode (coming later).
- - Outputs results as a Markdown table only for now.

  ## Installation

@@ -215,6 +221,9 @@ Generally you don't need to disable prompt caching on the server, as a probabili
  - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
  - `--no-adapt-prompt`: Disable prompt size adaptation.
  - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
+ - `--concurrency`: List of concurrency levels (number of concurrent requests per test) (Default: [1]).
+ - `--save-result`: File to save results to.
+ - `--format`: Output format: 'md', 'json', 'csv' (Default: 'md').

  ### Metrics

@@ -230,6 +239,9 @@ The script attempts to estimate network or processing latency to provide "server

  #### Table Columns

+ - When `concurrency` > 1:
+   - **`t/s (total)`**: Total throughput across all concurrent requests.
+   - **`t/s (req)`**: Average throughput per individual request.
  - **`t/s` (Tokens per Second)**:
    - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
    - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
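A quick numeric check of the two formulas above, with invented numbers:

```python
# Illustrative values only: 512 prompt tokens with an est_ppt of 0.5 s,
# and 64 generated tokens with 1.26 s between the first and last token.
prompt_tokens, est_ppt = 512, 0.5
gen_tokens, decode_window = 64, 1.26

pp_speed = prompt_tokens / est_ppt            # 1024.0 t/s (prefill)
tg_speed = (gen_tokens - 1) / decode_window   # 50.0 t/s (decode)
print(f"pp: {pp_speed:.1f} t/s, tg: {tg_speed:.1f} t/s")
```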
@@ -269,3 +281,31 @@ llama-benchy \
  ```

  This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
+
+ ## Development
+
+ ### Running Integration Tests
+
+ This repository includes a mock server and an integration test suite to verify `llama-benchy` logic without needing a real GPU server.
+
+ The mock server emulates:
+ - **Prompt Processing (PP):** ~1000 t/s, drift-corrected.
+ - **Token Generation (TG):** ~50 t/s.
+ - **Prefix Caching:** Emulates cache hits by skipping processing time for cached prefixes (system messages).
+ - **OpenAI API Compatibility:** Serves `/v1/chat/completions` and `/v1/models`.
+
+ To run the integration tests:
+
+ ```bash
+ # Install development dependencies
+ uv sync --all-extras --dev
+
+ # Run tests
+ uv run pytest tests/test_mock_integration.py
+ ```
+
+ This test will:
+ 1. Spin up the mock server on port 8001.
+ 2. Run `llama-benchy` against it.
+ 3. Parse the JSON output.
+ 4. Verify that throughputs match the emulated speeds (PP ~1000, TG ~50) and that prefix caching increases effective throughput.
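The "drift-corrected" pacing described for the mock server can be approximated by sleeping against absolute deadlines rather than fixed intervals. The snippet below is a generic sketch of that idea, not the repository's actual mock server code; `stream_tokens` is a hypothetical helper.

```python
import asyncio
import time

async def stream_tokens(n_tokens: int, rate_tps: float = 50.0):
    # Emit tokens at ~rate_tps by targeting absolute deadlines, so small
    # scheduling delays do not accumulate into drift.
    t0 = time.perf_counter()
    for i in range(1, n_tokens + 1):
        deadline = t0 + i / rate_tps
        delay = deadline - time.perf_counter()
        if delay > 0:
            await asyncio.sleep(delay)
        yield f"tok{i}"

async def main():
    start = time.perf_counter()
    count = 0
    async for _ in stream_tokens(100):
        count += 1
    print(f"{count} tokens in {time.perf_counter() - start:.2f}s (~50 t/s)")

asyncio.run(main())
```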
@@ -0,0 +1,14 @@
+ llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
+ llama_benchy/__main__.py,sha256=rk0Re1dehcJNIxIRsTRF_HCvcDEb20nMV05pYtG7FIw,1384
+ llama_benchy/_version.py,sha256=Dg8AmJomLVpjKL6prJylOONZAPRtB86LOce7dorQS_A,704
+ llama_benchy/client.py,sha256=dYFwlFJvr0aSThb6lN6coQt2KJy8tYb-BhDobniviV8,8362
+ llama_benchy/config.py,sha256=FV4jyEHm2G-lU2wX1atq--lLW-53uZQRWrWc00Qrnwc,4462
+ llama_benchy/corpus.py,sha256=b0RSkN8bpySiPEToH_XZR3hHKYz752BjsNqlE-78nPY,2404
+ llama_benchy/prompts.py,sha256=AUgAOKK2QIBb9DcwhgIrRTGxIqXiFjD7D-Ek0A3mmEk,2090
+ llama_benchy/results.py,sha256=jP2UUe5juHu5XDwgiS-7rCbPdbiU0XMn-DcqjVXiCNY,14453
+ llama_benchy/runner.py,sha256=PSycdp6nkgkWuW7DYsAJpw2PWBuQXgGEpVUci-r1dDo,8579
+ llama_benchy-0.2.0.dist-info/METADATA,sha256=MTE8qthP3WoVJ-crPIUBuGN3rnqi_v1jEOOGL4Dda5A,15012
+ llama_benchy-0.2.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ llama_benchy-0.2.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+ llama_benchy-0.2.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+ llama_benchy-0.2.0.dist-info/RECORD,,
@@ -1,8 +0,0 @@
- llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
- llama_benchy/__main__.py,sha256=RZalKXmtAAKiCBenE1maVeyvly5fsGQanS5v3YLeDLs,24371
- llama_benchy/_version.py,sha256=m8HxkqoKGw_wAJtc4ZokpJKNLXqp4zwnNhbnfDtro7w,704
- llama_benchy-0.1.1.dist-info/METADATA,sha256=O6DTAZAJta_puufDXqbeFhhlTT-WaeBVoJSfDLOREDo,13439
- llama_benchy-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
- llama_benchy-0.1.1.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
- llama_benchy-0.1.1.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
- llama_benchy-0.1.1.dist-info/RECORD,,