llama_benchy-0.1.0-py3-none-any.whl

@@ -0,0 +1,24 @@
1
+ """
2
+ llama-benchy - llama-bench style benchmarking tool for all backends
3
+
4
+ This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
5
+ generating statistics similar to `llama-bench`.
6
+ """
7
+
8
+ from ._version import __version__
9
+
10
+ # Extract build number from the version string
11
+ # Version format is like: '0.1.dev34+g33f03d886.d20260105'
12
+ # We want to extract the git hash part: '33f03d886'
13
+ __build__ = "unknown"
14
+ if "+" in __version__:
15
+ try:
16
+ # Extract the part after the '+' and before the '.'
17
+ build_part = __version__.split("+")[1].split(".")[0]
18
+ # Remove the 'g' prefix if it exists
19
+ if build_part.startswith("g"):
20
+ __build__ = build_part[1:]
21
+ else:
22
+ __build__ = build_part
23
+ except (IndexError, AttributeError):
24
+ pass
@@ -0,0 +1,535 @@
1
+ """
2
+ Main entry point for the llama-benchy CLI.
3
+ """
4
+
5
+ import argparse
6
+ import os
7
+ import time
8
+ import uuid
9
+ import subprocess
10
+ import datetime
11
+ import numpy as np
12
+ from tabulate import tabulate
13
+ import aiohttp
14
+ import asyncio
15
+ import json
16
+ import codecs
17
+ import hashlib
18
+ from transformers import AutoTokenizer
19
+ import requests
20
+
21
+ # Build number is now imported from __init__.py
22
+ from . import __build__, __version__
23
+
24
+
25
+
26
+ def parse_arguments():
27
+ parser = argparse.ArgumentParser(description="LLM Benchmark Script")
28
+ parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
29
+ parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
30
+ parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
31
+ parser.add_argument("--served-model-name", type=str, default=None, help="Model name used in API calls (defaults to --model if not specified)")
32
+ parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer name (defaults to model name)")
33
+ parser.add_argument("--pp", type=int, nargs='+', required=False, default=[2048], help="List of prompt processing token counts - default: 2048")
34
+ parser.add_argument("--tg", type=int, nargs='+', required=False, default=[32], help="List of token generation counts - default: 32")
35
+ parser.add_argument("--depth", type=int, nargs='+', default=[0], help="List of context depths (previous conversation tokens) - default: 0")
36
+ parser.add_argument("--runs", type=int, default=3, help="Number of runs per test - default: 3")
37
+ parser.add_argument("--no-cache", action="store_true", help="Ensure unique requests to avoid prefix caching and send cache_prompt=false to the server")
38
+ parser.add_argument("--post-run-cmd", type=str, default=None, help="Command to execute after each test run")
39
+ parser.add_argument("--book-url", type=str, default="https://www.gutenberg.org/files/1661/1661-0.txt", help="URL of a book to use for text generation, defaults to Sherlock Holmes (https://www.gutenberg.org/files/1661/1661-0.txt)")
40
+ parser.add_argument("--latency-mode", type=str, default="api", choices=["api", "generation", "none"], help="Method to measure latency: 'api' (list models) - default, 'generation' (single token generation), or 'none' (skip latency measurement)")
41
+ parser.add_argument("--no-warmup", action="store_true", help="Skip warmup phase")
42
+ parser.add_argument("--adapt-prompt", action="store_true", default=True, help="Adapt prompt size based on warmup token usage delta (default: True)")
43
+ parser.add_argument("--no-adapt-prompt", action="store_false", dest="adapt_prompt", help="Disable prompt size adaptation")
44
+ parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable prefix caching performance measurement")
45
+ return parser.parse_args()
46
+
47
+
48
+ def get_tokenizer(model_name, tokenizer_name=None):
49
+ try:
50
+ name = tokenizer_name if tokenizer_name else model_name
51
+ return AutoTokenizer.from_pretrained(name)
52
+ except Exception as e:
53
+ print(f"Error loading tokenizer: {e}")
54
+ print("Falling back to 'gpt2' tokenizer as approximation.")
55
+ return AutoTokenizer.from_pretrained("gpt2")
56
+
57
+
58
+ def prepare_text_data(book_url, tokenizer):
59
+ try:
60
+ # Create cache directory if it doesn't exist
61
+ cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "llama-benchy")
62
+ os.makedirs(cache_dir, exist_ok=True)
63
+
64
+ # Generate hash of the URL for the filename
65
+ url_hash = hashlib.md5(book_url.encode()).hexdigest()
66
+ cache_file = os.path.join(cache_dir, f"{url_hash}.txt")
67
+
68
+ if os.path.exists(cache_file):
69
+ print(f"Loading text from cache: {cache_file}")
70
+ with open(cache_file, "r", encoding="utf-8") as f:
71
+ text = f.read()
72
+ else:
73
+ print(f"Downloading book from {book_url}...")
74
+ response = requests.get(book_url)
75
+ response.raise_for_status()
76
+ text = response.text
77
+ # Basic cleanup
78
+ start_idx = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
79
+ if start_idx != -1:
80
+ text = text[start_idx:]
81
+
82
+ # Save to cache
83
+ with open(cache_file, "w", encoding="utf-8") as f:
84
+ f.write(text)
85
+ print(f"Saved text to cache: {cache_file}")
86
+
87
+ return tokenizer.encode(text, add_special_tokens=False)
88
+ except Exception as e:
89
+ print(f"Error downloading book: {e}")
90
+ exit(1)
91
+
92
+
93
+ def generate_prompt(all_tokens, tokenizer, prompt_tokens, context_tokens=0, no_cache=False):
94
+ suffix = ""
95
+ suffix_len = 0
96
+ if no_cache:
97
+ suffix = f" {uuid.uuid4()}"
98
+ suffix_len = len(tokenizer.encode(suffix, add_special_tokens=False))
99
+
100
+ # Adjust prompt tokens to fetch from text
101
+ text_prompt_tokens = max(0, prompt_tokens - suffix_len)
102
+
103
+ # Create a pool of tokens large enough
104
+ total_needed = text_prompt_tokens + context_tokens
105
+
106
+ if len(all_tokens) < total_needed:
107
+ # Repeat tokens if not enough
108
+ all_tokens = all_tokens * (total_needed // len(all_tokens) + 2)
109
+
110
+ # Pick a random start position
111
+ max_start = len(all_tokens) - total_needed
112
+ start_idx = np.random.randint(0, max_start)
113
+
114
+ selected_tokens = all_tokens[start_idx : start_idx + total_needed]
115
+
116
+ context_text = tokenizer.decode(selected_tokens[:context_tokens]) if context_tokens > 0 else ""
117
+ prompt_text = tokenizer.decode(selected_tokens[context_tokens:])
118
+
119
+ if no_cache:
120
+ prompt_text += suffix
121
+
122
+ return context_text, prompt_text
123
+
124
+
125
+ async def measure_latency(session, base_url, api_key, mode="api", model_name=None):
126
+ if mode == "none":
127
+ print("Skipping latency measurement (assuming 0 ms).")
128
+ return 0
129
+
130
+ print(f"Measuring latency using mode: {mode}...")
131
+ latencies = []
132
+ headers = {"Authorization": f"Bearer {api_key}"}
133
+
134
+ for _ in range(3):
135
+ start = time.perf_counter()
136
+ try:
137
+ if mode == "api":
138
+ async with session.get(f"{base_url}/models", headers=headers) as response:
139
+ await response.read()
140
+ latencies.append(time.perf_counter() - start)
141
+ elif mode == "generation":
142
+ if not model_name:
143
+ raise ValueError("Model name required for generation latency mode")
144
+ payload = {
145
+ "model": model_name,
146
+ "messages": [{"role": "user", "content": "hello"}],
147
+ "max_tokens": 1,
148
+ "stream": True
149
+ }
150
+ async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
151
+ async for _ in response.content:
152
+ # record latency as soon as the first byte is received
153
+ latencies.append(time.perf_counter() - start)
154
+ break
155
+ # Drain the rest of the response to keep the connection alive
156
+ async for _ in response.content: pass
157
+ except Exception as e:
158
+ print(f"Error measuring latency: {e}")
159
+
160
+ if latencies:
161
+ avg_latency = np.mean(latencies)
162
+ print(f"Average latency ({mode}): {avg_latency*1000:.2f} ms")
163
+ return avg_latency
164
+ return 0
165
+
166
+
167
+ async def warmup(session, base_url, api_key, model, tokenizer=None):
168
+ print("Warming up...")
169
+ headers = {"Authorization": f"Bearer {api_key}"}
170
+ warmup_text = "Warmup " * 10
171
+
172
+ delta_user = 0
173
+ delta_context = 0
174
+
175
+ # 1. User only (No Context)
176
+ payload_user = {
177
+ "model": model,
178
+ "messages": [{"role": "user", "content": warmup_text}],
179
+ "max_tokens": 1
180
+ }
181
+
182
+ try:
183
+ async with session.post(f"{base_url}/chat/completions", json=payload_user, headers=headers) as response:
184
+ response_json = await response.json()
185
+ if tokenizer:
186
+ if 'usage' in response_json:
187
+ prompt_tokens = response_json['usage']['prompt_tokens']
188
+ local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
189
+ delta_user = prompt_tokens - local_tokens
190
+ print(f"Warmup (User only) complete. Delta: {delta_user} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
191
+ else:
192
+ print("Warmup (User only) complete (no usage stats found).")
193
+ else:
194
+ print("Warmup complete.")
195
+
196
+ if tokenizer:
197
+ # 2. System + Empty User (Context Only)
198
+ payload_sys_empty = {
199
+ "model": model,
200
+ "messages": [
201
+ {"role": "system", "content": warmup_text},
202
+ {"role": "user", "content": ""}
203
+ ],
204
+ "max_tokens": 1
205
+ }
206
+ async with session.post(f"{base_url}/chat/completions", json=payload_sys_empty, headers=headers) as response:
207
+ response_json = await response.json()
208
+ if 'usage' in response_json:
209
+ prompt_tokens = response_json['usage']['prompt_tokens']
210
+ local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
211
+ delta_context = prompt_tokens - local_tokens
212
+ print(f"Warmup (System+Empty) complete. Delta: {delta_context} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
213
+ else:
214
+ print("Warmup (System+Empty) complete (no usage stats found).")
215
+ delta_context = delta_user
216
+
217
+ except Exception as e:
218
+ print(f"Warmup failed: {e}")
219
+ return delta_user, delta_context
220
+
221
+
222
+ async def run_benchmark(session, base_url, api_key, model_name, context_text, prompt_text, expected_pp_tokens, tg, no_cache, latency, post_run_cmd):
223
+ messages = []
224
+ if context_text:
225
+ messages.append({"role": "system", "content": context_text})
226
+ messages.append({"role": "user", "content": prompt_text})
227
+
228
+ ttft = 0
229
+ e2e_ttft = 0
230
+ token_count = 0
231
+ first_token_time = 0
232
+ first_response_time = 0
233
+ prompt_usage_tokens = 0
234
+
235
+ result = {
236
+ "pp_speed": None,
237
+ "tg_speed": None,
238
+ "ttft": None,
239
+ "ttfr": None,
240
+ "est_ppt": None,
241
+ "e2e_ttft": None
242
+ }
243
+
244
+ try:
245
+ payload = {
246
+ "model": model_name,
247
+ "messages": messages,
248
+ "max_tokens": tg,
249
+ "stream": True,
250
+ "stream_options": {"include_usage": True},
251
+ # "temperature": 0,
252
+ # "seed": 42
253
+ }
254
+
255
+ if no_cache:
256
+ payload["cache_prompt"] = False
257
+
258
+ headers = {"Authorization": f"Bearer {api_key}"}
259
+
260
+ start_time = time.perf_counter()
261
+
262
+ async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
263
+ if response.status != 200:
264
+ error_text = await response.text()
265
+ print(f"Error: {response.status} - {error_text}")
266
+ return None
267
+
268
+ buffer = ""
269
+ decoder = codecs.getincrementaldecoder("utf-8")(errors='replace')
270
+ async for chunk_bytes in response.content:
271
+ chunk_time = time.perf_counter()
272
+ decoded_chunk = decoder.decode(chunk_bytes, final=False)
273
+ buffer += decoded_chunk
274
+
275
+ while "\n" in buffer:
276
+ line, buffer = buffer.split("\n", 1)
277
+ line = line.strip()
278
+ if not line or line == 'data: [DONE]':
279
+ continue
280
+
281
+ if line.startswith('data: '):
282
+ try:
283
+ chunk = json.loads(line[6:])
284
+ if 'usage' in chunk:
285
+ prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
286
+
287
+ if 'choices' in chunk and len(chunk['choices']) > 0:
288
+ if first_response_time == 0:
289
+ first_response_time = chunk_time
290
+
291
+ delta = chunk['choices'][0].get('delta', {})
292
+ content = delta.get('content')
293
+ reasoning_content = delta.get('reasoning_content')
294
+
295
+ if content or reasoning_content:
296
+ if token_count == 0:
297
+ first_token_time = chunk_time
298
+ e2e_ttft = first_token_time - start_time
299
+ ttft = e2e_ttft-latency
300
+ if ttft < 0:
301
+ ttft = 0
302
+
303
+ token_count += 1
304
+ except json.JSONDecodeError:
305
+ continue
306
+
307
+ end_time = time.perf_counter()
308
+
309
+ if token_count > 0:
310
+ # Calculate decode time (time for subsequent tokens)
311
+ # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
312
+ if token_count > 1:
313
+ decode_time = end_time - first_token_time
314
+ if decode_time > 0:
315
+ # Speed for the generated tokens (excluding the first one which is TTFT)
316
+ result["tg_speed"] = (token_count - 1) / decode_time
317
+ else:
318
+ # Fallback if time is too small
319
+ result["tg_speed"] = (token_count - 1) / 0.0001
320
+
321
+ # Use expected_pp_tokens for speed calculation
322
+ total_prompt_tokens = expected_pp_tokens
323
+
324
+ # Only use reported usage if it's close to expected (to handle tokenizer differences)
325
+ # but not if it's vastly different (which happens in prefix caching where usage includes cached tokens)
326
+ if prompt_usage_tokens > 0:
327
+ diff = abs(prompt_usage_tokens - expected_pp_tokens)
328
+ if diff < expected_pp_tokens * 0.2: # 20% tolerance
329
+ total_prompt_tokens = prompt_usage_tokens
330
+
331
+ # Calculate TTFR and Estimated Prompt Processing Time
332
+ ttfr = 0
333
+ est_ppt = 0
334
+ if first_response_time > 0:
335
+ ttfr = first_response_time - start_time
336
+ est_ppt = ttfr - latency
337
+ if est_ppt < 0: est_ppt = 0
338
+
339
+ if est_ppt > 0:
340
+ result["pp_speed"] = total_prompt_tokens / est_ppt
341
+ result["est_ppt"] = est_ppt
342
+
343
+ if ttfr > 0:
344
+ result["ttfr"] = ttfr
345
+
346
+ if ttft > 0:
347
+ result["ttft"] = ttft
348
+
349
+ if e2e_ttft > 0:
350
+ result["e2e_ttft"] = e2e_ttft
351
+
352
+ except Exception as e:
353
+ print(f"Error during run: {e}")
354
+ return None
355
+
356
+ if post_run_cmd:
357
+ try:
358
+ subprocess.run(post_run_cmd, shell=True, check=True)
359
+ except subprocess.CalledProcessError as e:
360
+ print(f"Post-run command failed: {e}")
361
+
362
+ return result
363
+
364
+
365
+ async def main_async():
366
+ args = parse_arguments()
367
+
368
+ if args.enable_prefix_caching and args.no_cache:
369
+ print("Error: --enable-prefix-caching and --no-cache are incompatible.")
370
+ return
371
+
372
+ build_number = __build__
373
+ version_number = __version__
374
+
375
+ current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
376
+ print(f"llama-benchy ({version_number}.{build_number})")
377
+ print(f"Date: {current_time}")
378
+ print(f"Benchmarking model: {args.model} at {args.base_url}")
379
+
380
+ served_model_name = args.served_model_name if args.served_model_name else args.model
381
+
382
+ tokenizer = get_tokenizer(args.model, args.tokenizer)
383
+ all_tokens = prepare_text_data(args.book_url, tokenizer)
384
+ print(f"Total tokens available in text corpus: {len(all_tokens)}")
385
+
386
+ # Use a large timeout for long-running benchmarks
387
+ timeout = aiohttp.ClientTimeout(total=3600)
388
+ connector = aiohttp.TCPConnector(limit=1, force_close=False, keepalive_timeout=600)
389
+ async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
390
+ delta_user = 0
391
+ delta_context = 0
392
+ should_warmup = not args.no_warmup
393
+ if args.adapt_prompt:
394
+ should_warmup = True
395
+
396
+ if should_warmup:
397
+ delta_user, delta_context = await warmup(session, args.base_url, args.api_key, served_model_name, tokenizer if args.adapt_prompt else None)
398
+
399
+ latency = await measure_latency(session, args.base_url, args.api_key, args.latency_mode, served_model_name)
400
+
401
+ results = []
402
+
403
+ for depth in args.depth:
404
+ for pp in args.pp:
405
+ for tg in args.tg:
406
+ print(f"Running test: pp={pp}, tg={tg}, depth={depth}")
407
+ pp_speeds = []
408
+ tg_speeds = []
409
+ ttft_values = []
410
+ ttfr_values = []
411
+ est_ppt_values = []
412
+ e2e_ttft_values = []
413
+
414
+ ctx_pp_speeds = []
415
+ ctx_tg_speeds = []
416
+ ctx_ttfr_values = []
417
+ ctx_est_ppt_values = []
418
+ ctx_e2e_ttft_values = []
419
+
420
+ for run in range(args.runs):
421
+ current_pp = pp
422
+ current_depth = depth
423
+ if args.adapt_prompt:
424
+ if depth == 0:
425
+ current_pp = max(1, pp - delta_user)
426
+ else:
427
+ current_depth = max(1, depth - delta_context)
428
+
429
+ context, prompt = generate_prompt(all_tokens, tokenizer, current_pp, current_depth, args.no_cache)
430
+
431
+ if args.enable_prefix_caching and depth > 0:
432
+ # Request 1: Context only
433
+ # We send context as system message, and empty prompt as user message.
434
+ # This establishes the prefix: [System: Context] [User: ""]
435
+ # Expected PP tokens = current_depth (context size)
436
+ print(f" Run {run+1}/{args.runs} (Context Load)...")
437
+ ctx_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, "", current_depth, tg, args.no_cache, latency, None)
438
+
439
+ if ctx_result:
440
+ if ctx_result["pp_speed"] is not None:
441
+ ctx_pp_speeds.append(ctx_result["pp_speed"])
442
+ if ctx_result["tg_speed"] is not None:
443
+ ctx_tg_speeds.append(ctx_result["tg_speed"])
444
+ if ctx_result["ttfr"] is not None:
445
+ ctx_ttfr_values.append(ctx_result["ttfr"])
446
+ if ctx_result["est_ppt"] is not None:
447
+ ctx_est_ppt_values.append(ctx_result["est_ppt"])
448
+ if ctx_result["e2e_ttft"] is not None:
449
+ ctx_e2e_ttft_values.append(ctx_result["e2e_ttft"])
450
+
451
+ # Request 2: Context + Prompt
452
+ # We send context as system message, and prompt as user message.
453
+ # The prefix [System: Context] should be cached.
454
+ # Expected PP tokens = current_pp (prompt size only)
455
+ print(f" Run {run+1}/{args.runs} (Inference)...")
456
+ run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, current_pp, tg, args.no_cache, latency, args.post_run_cmd)
457
+ else:
458
+ # Standard run
459
+ # Expected PP tokens = current_pp + current_depth
460
+ expected_tokens = current_pp + current_depth
461
+ run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, expected_tokens, tg, args.no_cache, latency, args.post_run_cmd)
462
+
463
+ if run_result:
464
+ if run_result["tg_speed"] is not None:
465
+ tg_speeds.append(run_result["tg_speed"])
466
+ if run_result["pp_speed"] is not None:
467
+ pp_speeds.append(run_result["pp_speed"])
468
+ if run_result["est_ppt"] is not None:
469
+ est_ppt_values.append(run_result["est_ppt"])
470
+ if run_result["ttfr"] is not None:
471
+ ttfr_values.append(run_result["ttfr"])
472
+ if run_result["ttft"] is not None:
473
+ ttft_values.append(run_result["ttft"])
474
+ if run_result["e2e_ttft"] is not None:
475
+ e2e_ttft_values.append(run_result["e2e_ttft"])
476
+
477
+ # Aggregate results
478
+ def format_result(values, multiplier=1.0):
479
+ if not values: return ""
480
+ mean = np.mean(values) * multiplier
481
+ std = np.std(values) * multiplier
482
+ return f"{mean:.2f} ± {std:.2f}"
483
+
484
+ # Context PP (if enabled)
485
+ if ctx_pp_speeds:
486
+ test_name = f"ctx_pp @ d{depth}"
487
+ results.append([
488
+ args.model,
489
+ test_name,
490
+ format_result(ctx_pp_speeds),
491
+ format_result(ctx_ttfr_values, 1000),
492
+ format_result(ctx_est_ppt_values, 1000),
493
+ format_result(ctx_e2e_ttft_values, 1000)
494
+ ])
495
+
496
+ # Context TG (if enabled)
497
+ if ctx_tg_speeds:
498
+ test_name = f"ctx_tg @ d{depth}"
499
+ results.append([args.model, test_name, format_result(ctx_tg_speeds), "", "", ""])
500
+
501
+ # Standard PP
502
+ if pp_speeds:
503
+ test_name = f"pp{pp}"
504
+ if depth > 0: test_name += f" @ d{depth}"
505
+ results.append([
506
+ args.model,
507
+ test_name,
508
+ format_result(pp_speeds),
509
+ format_result(ttfr_values, 1000),
510
+ format_result(est_ppt_values, 1000),
511
+ format_result(e2e_ttft_values, 1000)
512
+ ])
513
+
514
+ # Standard TG
515
+ if tg_speeds:
516
+ test_name = f"tg{tg}"
517
+ if depth > 0: test_name += f" @ d{depth}"
518
+ results.append([args.model, test_name, format_result(tg_speeds), "", "", ""])
519
+
520
+ print()
521
+ if not results:
522
+ print("No results collected. Check if the model is generating tokens.")
523
+ else:
524
+ print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
525
+ print(f"\nllama-benchy ({version_number}.{build_number})")
526
+ print(f"date: {current_time} | latency mode: {args.latency_mode}")
527
+
528
+
529
+ def main():
530
+ """Entry point for the CLI command."""
531
+ asyncio.run(main_async())
532
+
533
+
534
+ if __name__ == "__main__":
535
+ main()
@@ -0,0 +1,34 @@
1
+ # file generated by setuptools-scm
2
+ # don't change, don't track in version control
3
+
4
+ __all__ = [
5
+ "__version__",
6
+ "__version_tuple__",
7
+ "version",
8
+ "version_tuple",
9
+ "__commit_id__",
10
+ "commit_id",
11
+ ]
12
+
13
+ TYPE_CHECKING = False
14
+ if TYPE_CHECKING:
15
+ from typing import Tuple
16
+ from typing import Union
17
+
18
+ VERSION_TUPLE = Tuple[Union[int, str], ...]
19
+ COMMIT_ID = Union[str, None]
20
+ else:
21
+ VERSION_TUPLE = object
22
+ COMMIT_ID = object
23
+
24
+ version: str
25
+ __version__: str
26
+ __version_tuple__: VERSION_TUPLE
27
+ version_tuple: VERSION_TUPLE
28
+ commit_id: COMMIT_ID
29
+ __commit_id__: COMMIT_ID
30
+
31
+ __version__ = version = '0.1.0'
32
+ __version_tuple__ = version_tuple = (0, 1, 0)
33
+
34
+ __commit_id__ = commit_id = None
@@ -0,0 +1,249 @@
1
+ Metadata-Version: 2.4
2
+ Name: llama-benchy
3
+ Version: 0.1.0
4
+ Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
5
+ Project-URL: Homepage, https://github.com/eugr/llama-benchy
6
+ Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
7
+ Author: eugr
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Eugene Rakhmatulin
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE
30
+ Classifier: Intended Audience :: Science/Research
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.10
35
+ Classifier: Programming Language :: Python :: 3.11
36
+ Classifier: Programming Language :: Python :: 3.12
37
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
38
+ Requires-Python: >=3.10
39
+ Requires-Dist: aiohttp
40
+ Requires-Dist: asyncio
41
+ Requires-Dist: numpy
42
+ Requires-Dist: openai
43
+ Requires-Dist: requests
44
+ Requires-Dist: tabulate
45
+ Requires-Dist: transformers
46
+ Description-Content-Type: text/markdown
47
+
48
+ # llama-benchy - llama-bench style benchmarking tool for all backends
49
+
50
+ This script benchmarks OpenAI-compatible LLM endpoints, generating statistics similar to `llama-bench`.
51
+
52
+ ## Motivation
53
+
54
+ `llama-bench` is a CLI tool that is part of the very popular [llama.cpp](https://github.com/ggml-org/llama.cpp) inference engine. It is widely used in the LLM community to benchmark models and supports measurements at different context sizes.
55
+ However, it is only available for llama.cpp and cannot be used with other inference engines such as vLLM or SGLang.
56
+
57
+ Also, it performs its measurements through the C++ engine directly, which is not representative of the end-user experience and can differ from it quite a bit in practice.
58
+
59
+ vLLM has its own powerful benchmarking tool, and while it can be used with other inference engines, it has a few issues:
60
+
61
+ - It is tricky, and sometimes impossible, to calculate prompt processing speeds at different context lengths. You can use `vllm bench sweep serve`, but it only works well against vLLM with prefix caching disabled on the server. Even with random prompts it reuses the same prompt across multiple runs, which will hit the cache in `llama-server`, for instance, so you get very low median TTFT times and very high prompt processing speeds.
62
+ - The TTFT it reports is not measured to the first usable token but to the very first data chunk from the server, which may not contain any generated tokens in `/v1/chat/completions` mode.
63
+ - The random dataset is the only one that lets you specify an arbitrary number of tokens, but a randomly generated token sequence doesn't let you adequately measure speculative decoding/MTP.
64
+
65
+ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool that brings llama-bench-style measurements at different context lengths to arbitrary OpenAI-compatible endpoints.
66
+
67
+ ## Features
68
+
69
+ - Measures Prompt Processing (pp) and Token Generation (tg) speeds at different context depths.
70
+ - Can separately measure context prefill and prompt processing over an already cached context at different context depths.
71
+ - Reports Time To First Response (TTFR, first data chunk), Estimated Prompt Processing Time (est_ppt), and End-to-End TTFT.
72
+ - Supports configurable prompt length (`--pp`), generation length (`--tg`), and context depth (`--depth`).
73
+ - Can run multiple iterations (`--runs`) and report mean ± std.
74
+ - Uses HuggingFace tokenizers for accurate token counts.
75
+ - Downloads a book from Project Gutenberg to use as source text for prompts, which gives more realistic benchmarking of speculative decoding/MTP models (a short sketch of the prompt slicing follows this list).
76
+ - Supports executing a command after each run (e.g., to clear cache).
77
+ - Configurable latency measurement mode.
78
+
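To make the last two points concrete, here is a rough sketch of how the book text and the HuggingFace tokenizer are combined into prompts, loosely following the package's `generate_prompt`. The tokenizer name and sample text are placeholders, not what the tool uses at runtime (it loads the model's own tokenizer and the downloaded book).

```python
# Rough sketch of the prompt construction, loosely following generate_prompt:
# slice `depth + pp` tokens from a random offset in the text and decode them
# back to strings. Tokenizer and sample text below are placeholders.
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
book_text = "It is a capital mistake to theorize before one has data. " * 2000
all_tokens = tokenizer.encode(book_text, add_special_tokens=False)

pp, depth = 2048, 1024
start = np.random.randint(0, len(all_tokens) - (pp + depth))
window = all_tokens[start : start + pp + depth]

context_text = tokenizer.decode(window[:depth])  # sent as the system message
prompt_text = tokenizer.decode(window[depth:])   # sent as the user message
```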
79
+ ## Installation
80
+
81
+ Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
82
+
83
+ ### Option 1: Run without installation using `uvx`
84
+
85
+ ```bash
86
+ uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
87
+ ```
88
+
89
+ ### Option 2: Install into virtual environment
90
+
91
+ ```bash
92
+ # Clone the repository
93
+ git clone https://github.com/eugr/llama-benchy.git
94
+ cd llama-benchy
95
+
96
+ # Create virtual environment
97
+ uv venv
98
+
99
+ # Install with uv (installs into a virtual environment automatically)
100
+ uv pip install -e .
101
+ ```
102
+
103
+ To run, activate the environment first:
104
+
105
+ ```bash
106
+ source .venv/bin/activate
107
+ ```
108
+
109
+ Then execute the command:
110
+
111
+ ```bash
112
+ llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
113
+ ```
114
+
115
+
116
+ ### Option 3: Run without installing (`uv run`)
117
+
118
+ ```bash
119
+ # Clone the repository
120
+ git clone https://github.com/eugr/llama-benchy.git
121
+ cd llama-benchy
122
+
123
+ # Using uv run (creates a virtual environment if it doesn't exist and runs the command)
124
+ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
125
+ ```
126
+
127
+ ### Option 4: Install into the system environment
128
+
129
+ ```bash
130
+ uv pip install git+https://github.com/eugr/llama-benchy --system
131
+ ```
132
+
133
+ ## Usage
134
+
135
+ After installation, you can run the tool directly:
136
+
137
+ ```bash
138
+ llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME> --pp <PROMPT_TOKENS> --tg <GEN_TOKENS> [OPTIONS]
139
+ ```
140
+
141
+ Example:
142
+
143
+ ```bash
144
+ llama-benchy \
145
+ --base-url http://localhost:8000/v1 \
146
+ --model openai/gpt-oss-120b \
147
+ --depth 0 4096 8192 16384 32768 \
148
+ --latency-mode generation
149
+ ```
150
+
151
+ Output:
152
+
153
+
154
+ | model | test | t/s | ttfr (ms) | est_ppt (ms) | e2e_ttft (ms) |
155
+ |:--------------------|----------------:|----------------:|-------------------:|-------------------:|-------------------:|
156
+ | openai/gpt-oss-120b | pp2048 | 2019.02 ± 34.98 | 1054.64 ± 17.57 | 1014.66 ± 17.57 | 1115.41 ± 18.70 |
157
+ | openai/gpt-oss-120b | tg32 | 52.94 ± 1.01 | | | |
158
+ | openai/gpt-oss-120b | pp2048 @ d4096 | 1994.49 ± 77.97 | 3129.18 ± 120.27 | 3089.19 ± 120.27 | 3198.97 ± 122.24 |
159
+ | openai/gpt-oss-120b | tg32 @ d4096 | 46.69 ± 1.11 | | | |
160
+ | openai/gpt-oss-120b | pp2048 @ d8192 | 1751.68 ± 34.44 | 5892.61 ± 114.68 | 5852.63 ± 114.68 | 5971.27 ± 115.77 |
161
+ | openai/gpt-oss-120b | tg32 @ d8192 | 40.40 ± 1.19 | | | |
162
+ | openai/gpt-oss-120b | pp2048 @ d16384 | 1475.63 ± 31.41 | 12542.02 ± 265.86 | 12502.04 ± 265.86 | 12634.67 ± 269.43 |
163
+ | openai/gpt-oss-120b | tg32 @ d16384 | 33.86 ± 1.45 | | | |
164
+ | openai/gpt-oss-120b | pp2048 @ d32768 | 1131.86 ± 50.53 | 30869.90 ± 1410.15 | 30829.92 ± 1410.15 | 30992.96 ± 1417.33 |
165
+ | openai/gpt-oss-120b | tg32 @ d32768 | 25.34 ± 1.31 | | | |
166
+
167
+ llama-benchy (build: 75bc129)
168
+ date: 2026-01-02 17:11:19 | latency mode: generation
169
+
170
+ -------
171
+
172
+ It is recommended to use the "generation" latency mode to get prompt processing speeds closer to the real numbers, especially for shorter prompts.
173
+ By default, the script adapts the prompt size so that the token count processed by the server matches the specified value, regardless of the chat template applied. Use `--no-adapt-prompt` to disable this behavior.
174
+
175
+ Generally you don't need to disable prompt caching on the server, as the probability of cache hits is fairly small. If you do get cache hits, you can add `--no-cache`, which appends some random noise to each prompt (see the sketch below).
176
+
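For reference, the noise added by `--no-cache` is just a per-request UUID suffix (plus `cache_prompt=false` in the request body); a minimal sketch:

```python
# What --no-cache adds to each prompt, in essence: a unique suffix so that no
# two requests share an identical prefix. The sample prompt is a placeholder.
import uuid

prompt_text = "Some book excerpt used as the benchmark prompt..."
prompt_text += f" {uuid.uuid4()}"  # unique per request, defeats prefix caching
print(prompt_text[-50:])
```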
177
+ ### Arguments
178
+
179
+ - `--base-url`: OpenAI compatible endpoint URL (Required).
180
+ - `--api-key`: API Key (Default: "EMPTY").
181
+ - `--model`: Model name (Required).
182
+ - `--served-model-name`: Model name used in API calls (Defaults to --model if not specified).
183
+ - `--tokenizer`: HuggingFace tokenizer name (Defaults to model name).
184
+ - `--pp`: List of prompt processing token counts (Default: [2048]).
185
+ - `--tg`: List of token generation counts (Default: [32]).
186
+ - `--depth`: List of context depths (Default: [0]).
187
+ - `--runs`: Number of runs per test (Default: 3).
188
+ - `--no-cache`: Adds noise to requests to avoid prefix caching. Also sends `cache_prompt=false` to the server.
189
+ - `--post-run-cmd`: Command to execute after each test run.
190
+ - `--book-url`: URL of a book to use for text generation (Defaults to Sherlock Holmes).
191
+ - `--latency-mode`: Method to measure latency: 'api' (list-models call, default), 'generation' (single-token generation), or 'none' (skip latency measurement).
192
+ - `--no-warmup`: Skip warmup phase.
193
+ - `--adapt-prompt`: Adapt prompt size based on warmup token usage delta (Default: True).
194
+ - `--no-adapt-prompt`: Disable prompt size adaptation.
195
+ - `--enable-prefix-caching`: Enable prefix caching performance measurement. When enabled (and depth > 0), it performs a two-step benchmark: first loading the context (reported as `ctx_pp`), then running the prompt with the cached context.
196
+
197
+ ### Metrics
198
+
199
+ The script outputs a table with the following metrics. All time measurements are in milliseconds (ms).
200
+
201
+ #### Latency Adjustment
202
+ The script attempts to estimate network and server-overhead latency so that it can report approximate "server-side" processing times.
203
+ - **Latency**: Measured based on `--latency-mode`.
204
+ - `api`: Time to fetch `/models` (from sending the request to receiving the first byte of the response). Eliminates network latency only.
205
+ - `generation`: Time to generate 1 token (from sending the request to receiving the first byte of the response). Tries to eliminate both network latency and server overhead.
206
+ - `none`: Assumed to be 0.
207
+ - This measured latency is subtracted from `ttfr` to calculate `est_ppt`.
208
+
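As an illustration, the 'api' mode boils down to timing a few `/models` requests and averaging them. A minimal synchronous sketch (the tool itself does this asynchronously with aiohttp; the URL and key below are placeholders):

```python
# Minimal sketch of the 'api' latency mode: time a few GETs to /models and
# average them. base_url and the API key are placeholders.
import time
import requests

base_url = "http://localhost:8000/v1"
headers = {"Authorization": "Bearer EMPTY"}

samples = []
for _ in range(3):
    start = time.perf_counter()
    requests.get(f"{base_url}/models", headers=headers, timeout=10)
    samples.append(time.perf_counter() - start)

latency = sum(samples) / len(samples)
print(f"estimated latency: {latency * 1000:.2f} ms")
```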
209
+ #### Table Columns
210
+
211
+ - **`t/s` (Tokens per Second)**:
212
+ - **For Prompt Processing (pp)**: Calculated as `Total Prompt Tokens / est_ppt`. This represents the prefill speed.
213
+ - **For Token Generation (tg)**: Calculated as `(Total Generated Tokens - 1) / (Time of Last Token - Time of First Token)`. This represents the decode speed, excluding the first token latency.
214
+
215
+ - **`ttfr (ms)` (Time To First Response)**:
216
+ - Calculation: `Time of First Response Chunk - Start Time`.
217
+ - Represents the raw time until the client receives *any* stream data from the server (including empty chunks or role definitions, but excluding the initial HTTP response headers). This includes network latency. The same measurement method is used by `vllm bench serve` to report TTFT.
218
+
219
+ - **`est_ppt (ms)` (Estimated Prompt Processing Time)**:
220
+ - Calculation: `TTFR - Estimated Latency`.
221
+ - Estimated time the server spent processing the prompt. Used for calculating Prompt Processing speed.
222
+
223
+ - **`e2e_ttft (ms)` (End-to-End Time To First Token)**:
224
+ - Calculation: `Time of First Content Token - Start Time`.
225
+ - The total time perceived by the client from sending the request to seeing the first generated content.
226
+
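To make the relationship between these columns concrete, here is a small worked example with made-up timestamps (values in seconds; the table reports milliseconds):

```python
# Worked example of the reported metrics, using made-up timestamps (seconds).
start_time = 0.000           # request sent
first_response_time = 0.950  # first stream chunk of any kind
first_token_time = 1.050     # first chunk containing actual content
end_time = 1.650             # last generated token received
latency = 0.040              # estimated via --latency-mode
prompt_tokens = 2048
generated_tokens = 32

ttfr = first_response_time - start_time                            # ttfr
est_ppt = max(ttfr - latency, 0)                                   # est_ppt
e2e_ttft = first_token_time - start_time                           # e2e_ttft
pp_speed = prompt_tokens / est_ppt                                 # pp t/s
tg_speed = (generated_tokens - 1) / (end_time - first_token_time)  # tg t/s

print(f"ttfr={ttfr * 1000:.0f} ms, est_ppt={est_ppt * 1000:.0f} ms, "
      f"e2e_ttft={e2e_ttft * 1000:.0f} ms, pp={pp_speed:.0f} t/s, tg={tg_speed:.1f} t/s")
```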
227
+ ### Prefix Caching Benchmarking
228
+
229
+ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs a two-step process for each run to measure the impact of prefix caching:
230
+
231
+ 1. **Context Load**: Sends the context tokens (as a system message) with an empty user message. This forces the server to process and cache the context.
232
+ - Reported as `ctx_pp @ d{depth}` (Context Prompt Processing) and `ctx_tg @ d{depth}`.
233
+ 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
234
+ - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.
235
+
236
+ By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
237
+
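For reference, the two requests differ only in the user message; the system message (the context) is identical, so the server can reuse its cache. A sketch of the two payloads, assuming an OpenAI-style `/chat/completions` endpoint (model name and text are placeholders):

```python
# Request shapes for the two-step prefix-caching measurement. The context and
# prompt strings are placeholders for text generated from the book corpus.
context_text = "<about `depth` tokens of book text>"
prompt_text = "<about `pp` tokens of book text>"

# Step 1: context load -- context as system message, empty user message.
context_load = {
    "model": "openai/gpt-oss-120b",
    "messages": [
        {"role": "system", "content": context_text},
        {"role": "user", "content": ""},
    ],
    "max_tokens": 32,
    "stream": True,
}

# Step 2: inference -- the same system prefix, now followed by the real prompt.
inference = {
    "model": "openai/gpt-oss-120b",
    "messages": [
        {"role": "system", "content": context_text},
        {"role": "user", "content": prompt_text},
    ],
    "max_tokens": 32,
    "stream": True,
}
```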
238
+ ### Example
239
+
240
+ ```bash
241
+ llama-benchy \
242
+ --base-url http://localhost:8000/v1 \
243
+ --model openai/gpt-oss-120b \
244
+ --pp 128 256 \
245
+ --tg 32 64 \
246
+ --depth 0 1024
247
+ ```
248
+
249
+ This will run benchmarks for all combinations of pp (128, 256), tg (32, 64), and depth (0, 1024).
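The sweep nests depth on the outside, then pp, then tg, so this example yields 2 × 2 × 2 = 8 test combinations:

```python
# Enumerate the combinations the example above will run (depth is the outer
# loop, then pp, then tg -- matching the order used by the tool).
from itertools import product

depths, pps, tgs = [0, 1024], [128, 256], [32, 64]
for depth, pp, tg in product(depths, pps, tgs):
    print(f"pp={pp}, tg={tg}, depth={depth}")
```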
@@ -0,0 +1,8 @@
1
+ llama_benchy/__init__.py,sha256=4DTHEmeJShcJJdb6rPhFqv4_zREEQRZGZ3h0ThA6YpU,798
2
+ llama_benchy/__main__.py,sha256=CZEf_36w5iut5RZjAZ4F894PAHF3hoxDWkT6lTFcr_I,24351
3
+ llama_benchy/_version.py,sha256=5jwwVncvCiTnhOedfkzzxmxsggwmTBORdFL_4wq0ZeY,704
4
+ llama_benchy-0.1.0.dist-info/METADATA,sha256=WEL0ASCMSmJA8QPXJzAYA0fAztA_6-D7xGjNdx3o4vY,12943
5
+ llama_benchy-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
6
+ llama_benchy-0.1.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
7
+ llama_benchy-0.1.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
8
+ llama_benchy-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ llama-benchy = llama_benchy.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eugene Rakhmatulin
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.