llama-benchy 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as they were released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
llama_benchy/__main__.py CHANGED
@@ -2,534 +2,44 @@
 Main entry point for the llama-benchy CLI.
 """
 
-import argparse
-import os
-import time
-import uuid
-import subprocess
-import datetime
-import numpy as np
-from tabulate import tabulate
-import aiohttp
 import asyncio
-import json
-import codecs
-import hashlib
-from transformers import AutoTokenizer
-import requests
-
-# Build number is now imported from __init__.py
+import datetime
 from . import __version__
-
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(description="LLM Benchmark Script")
-    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
-    parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
-    parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
-    parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
-    parser.add_argument("--served-model-name", type=str, default=None, help="Model name used in API calls (defaults to --model if not specified)")
-    parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer name (defaults to model name)")
-    parser.add_argument("--pp", type=int, nargs='+', required=False, default=[2048], help="List of prompt processing token counts - default: 2048")
-    parser.add_argument("--tg", type=int, nargs='+', required=False, default=[32], help="List of token generation counts - default: 32")
-    parser.add_argument("--depth", type=int, nargs='+', default=[0], help="List of context depths (previous conversation tokens) - default: 0")
-    parser.add_argument("--runs", type=int, default=3, help="Number of runs per test - default: 3")
-    parser.add_argument("--no-cache", action="store_true", help="Ensure unique requests to avoid prefix caching and send cache_prompt=false to the server")
-    parser.add_argument("--post-run-cmd", type=str, default=None, help="Command to execute after each test run")
-    parser.add_argument("--book-url", type=str, default="https://www.gutenberg.org/files/1661/1661-0.txt", help="URL of a book to use for text generation, defaults to Sherlock Holmes (https://www.gutenberg.org/files/1661/1661-0.txt)")
-    parser.add_argument("--latency-mode", type=str, default="api", choices=["api", "generation", "none"], help="Method to measure latency: 'api' (list models) - default, 'generation' (single token generation), or 'none' (skip latency measurement)")
-    parser.add_argument("--no-warmup", action="store_true", help="Skip warmup phase")
-    parser.add_argument("--adapt-prompt", action="store_true", default=True, help="Adapt prompt size based on warmup token usage delta (default: True)")
-    parser.add_argument("--no-adapt-prompt", action="store_false", dest="adapt_prompt", help="Disable prompt size adaptation")
-    parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable prefix caching performance measurement")
-    return parser.parse_args()
-
-
-def get_tokenizer(model_name, tokenizer_name=None):
-    try:
-        name = tokenizer_name if tokenizer_name else model_name
-        return AutoTokenizer.from_pretrained(name)
-    except Exception as e:
-        print(f"Error loading tokenizer: {e}")
-        print("Falling back to 'gpt2' tokenizer as approximation.")
-        return AutoTokenizer.from_pretrained("gpt2")
-
-
-def prepare_text_data(book_url, tokenizer):
-    try:
-        # Create cache directory if it doesn't exist
-        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "llama-benchy")
-        os.makedirs(cache_dir, exist_ok=True)
-
-        # Generate hash of the URL for the filename
-        url_hash = hashlib.md5(book_url.encode()).hexdigest()
-        cache_file = os.path.join(cache_dir, f"{url_hash}.txt")
-
-        if os.path.exists(cache_file):
-            print(f"Loading text from cache: {cache_file}")
-            with open(cache_file, "r", encoding="utf-8") as f:
-                text = f.read()
-        else:
-            print(f"Downloading book from {book_url}...")
-            response = requests.get(book_url)
-            response.raise_for_status()
-            text = response.text
-            # Basic cleanup
-            start_idx = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
-            if start_idx != -1:
-                text = text[start_idx:]
-
-            # Save to cache
-            with open(cache_file, "w", encoding="utf-8") as f:
-                f.write(text)
-            print(f"Saved text to cache: {cache_file}")
-
-        return tokenizer.encode(text, add_special_tokens=False)
-    except Exception as e:
-        print(f"Error downloading book: {e}")
-        exit(1)
-
-
-def generate_prompt(all_tokens, tokenizer, prompt_tokens, context_tokens=0, no_cache=False):
-    suffix = ""
-    suffix_len = 0
-    if no_cache:
-        suffix = f" {uuid.uuid4()}"
-        suffix_len = len(tokenizer.encode(suffix, add_special_tokens=False))
-
-    # Adjust prompt tokens to fetch from text
-    text_prompt_tokens = max(0, prompt_tokens - suffix_len)
-
-    # Create a pool of tokens large enough
-    total_needed = text_prompt_tokens + context_tokens
-
-    if len(all_tokens) < total_needed:
-        # Repeat tokens if not enough
-        all_tokens = all_tokens * (total_needed // len(all_tokens) + 2)
-
-    # Pick a random start position
-    max_start = len(all_tokens) - total_needed
-    start_idx = np.random.randint(0, max_start)
-
-    selected_tokens = all_tokens[start_idx : start_idx + total_needed]
-
-    context_text = tokenizer.decode(selected_tokens[:context_tokens]) if context_tokens > 0 else ""
-    prompt_text = tokenizer.decode(selected_tokens[context_tokens:])
-
-    if no_cache:
-        prompt_text += suffix
-
-    return context_text, prompt_text
-
-
-async def measure_latency(session, base_url, api_key, mode="api", model_name=None):
-    if mode == "none":
-        print("Skipping latency measurement (assuming 0 ms).")
-        return 0
-
-    print(f"Measuring latency using mode: {mode}...")
-    latencies = []
-    headers = {"Authorization": f"Bearer {api_key}"}
-
-    for _ in range(3):
-        start = time.perf_counter()
-        try:
-            if mode == "api":
-                async with session.get(f"{base_url}/models", headers=headers) as response:
-                    await response.read()
-                    latencies.append(time.perf_counter() - start)
-            elif mode == "generation":
-                if not model_name:
-                    raise ValueError("Model name required for generation latency mode")
-                payload = {
-                    "model": model_name,
-                    "messages": [{"role": "user", "content": "hello"}],
-                    "max_tokens": 1,
-                    "stream": True
-                }
-                async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
-                    async for _ in response.content:
-                        # record latency as soon as the first byte is received
-                        latencies.append(time.perf_counter() - start)
-                        break
-                    # Drain the rest of the response to keep the connection alive
-                    async for _ in response.content: pass
-        except Exception as e:
-            print(f"Error measuring latency: {e}")
-
-    if latencies:
-        avg_latency = np.mean(latencies)
-        print(f"Average latency ({mode}): {avg_latency*1000:.2f} ms")
-        return avg_latency
-    return 0
-
-
-async def warmup(session, base_url, api_key, model, tokenizer=None):
-    print("Warming up...")
-    headers = {"Authorization": f"Bearer {api_key}"}
-    warmup_text = "Warmup " * 10
-
-    delta_user = 0
-    delta_context = 0
-
-    # 1. User only (No Context)
-    payload_user = {
-        "model": model,
-        "messages": [{"role": "user", "content": warmup_text}],
-        "max_tokens": 1
-    }
-
-    try:
-        async with session.post(f"{base_url}/chat/completions", json=payload_user, headers=headers) as response:
-            response_json = await response.json()
-            if tokenizer:
-                if 'usage' in response_json:
-                    prompt_tokens = response_json['usage']['prompt_tokens']
-                    local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
-                    delta_user = prompt_tokens - local_tokens
-                    print(f"Warmup (User only) complete. Delta: {delta_user} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
-                else:
-                    print("Warmup (User only) complete (no usage stats found).")
-            else:
-                print("Warmup complete.")
-
-        if tokenizer:
-            # 2. System + Empty User (Context Only)
-            payload_sys_empty = {
-                "model": model,
-                "messages": [
-                    {"role": "system", "content": warmup_text},
-                    {"role": "user", "content": ""}
-                ],
-                "max_tokens": 1
-            }
-            async with session.post(f"{base_url}/chat/completions", json=payload_sys_empty, headers=headers) as response:
-                response_json = await response.json()
-                if 'usage' in response_json:
-                    prompt_tokens = response_json['usage']['prompt_tokens']
-                    local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
-                    delta_context = prompt_tokens - local_tokens
-                    print(f"Warmup (System+Empty) complete. Delta: {delta_context} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
-                else:
-                    print("Warmup (System+Empty) complete (no usage stats found).")
-                    delta_context = delta_user
-
-    except Exception as e:
-        print(f"Warmup failed: {e}")
-    return delta_user, delta_context
-
-
-async def run_benchmark(session, base_url, api_key, model_name, context_text, prompt_text, expected_pp_tokens, tg, no_cache, latency, post_run_cmd):
-    messages = []
-    if context_text:
-        messages.append({"role": "system", "content": context_text})
-    messages.append({"role": "user", "content": prompt_text})
-
-    ttft = 0
-    e2e_ttft = 0
-    token_count = 0
-    first_token_time = 0
-    first_response_time = 0
-    prompt_usage_tokens = 0
-
-    result = {
-        "pp_speed": None,
-        "tg_speed": None,
-        "ttft": None,
-        "ttfr": None,
-        "est_ppt": None,
-        "e2e_ttft": None
-    }
-
-    try:
-        payload = {
-            "model": model_name,
-            "messages": messages,
-            "max_tokens": tg,
-            "stream": True,
-            "stream_options": {"include_usage": True},
-            # "temperature": 0,
-            # "seed": 42
-        }
-
-        if no_cache:
-            payload["cache_prompt"] = False
-
-        headers = {"Authorization": f"Bearer {api_key}"}
-
-        start_time = time.perf_counter()
-
-        async with session.post(f"{base_url}/chat/completions", json=payload, headers=headers) as response:
-            if response.status != 200:
-                error_text = await response.text()
-                print(f"Error: {response.status} - {error_text}")
-                return None
-
-            buffer = ""
-            decoder = codecs.getincrementaldecoder("utf-8")(errors='replace')
-            async for chunk_bytes in response.content:
-                chunk_time = time.perf_counter()
-                decoded_chunk = decoder.decode(chunk_bytes, final=False)
-                buffer += decoded_chunk
-
-                while "\n" in buffer:
-                    line, buffer = buffer.split("\n", 1)
-                    line = line.strip()
-                    if not line or line == 'data: [DONE]':
-                        continue
-
-                    if line.startswith('data: '):
-                        try:
-                            chunk = json.loads(line[6:])
-                            if 'usage' in chunk:
-                                prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
-
-                            if 'choices' in chunk and len(chunk['choices']) > 0:
-                                if first_response_time == 0:
-                                    first_response_time = chunk_time
-
-                                delta = chunk['choices'][0].get('delta', {})
-                                content = delta.get('content')
-                                reasoning_content = delta.get('reasoning_content')
-
-                                if content or reasoning_content:
-                                    if token_count == 0:
-                                        first_token_time = chunk_time
-                                        e2e_ttft = first_token_time - start_time
-                                        ttft = e2e_ttft - latency
-                                        if ttft < 0:
-                                            ttft = 0
-
-                                    token_count += 1
-                        except json.JSONDecodeError:
-                            continue

-        end_time = time.perf_counter()
-
-        if token_count > 0:
-            # Calculate decode time (time for subsequent tokens)
-            # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
-            if token_count > 1:
-                decode_time = end_time - first_token_time
-                if decode_time > 0:
-                    # Speed for the generated tokens (excluding the first one which is TTFT)
-                    result["tg_speed"] = (token_count - 1) / decode_time
-                else:
-                    # Fallback if time is too small
-                    result["tg_speed"] = (token_count - 1) / 0.0001
-
-            # Use expected_pp_tokens for speed calculation
-            total_prompt_tokens = expected_pp_tokens
-
-            # Only use reported usage if it's close to expected (to handle tokenizer differences)
-            # but not if it's vastly different (which happens in prefix caching where usage includes cached tokens)
-            if prompt_usage_tokens > 0:
-                diff = abs(prompt_usage_tokens - expected_pp_tokens)
-                if diff < expected_pp_tokens * 0.2: # 20% tolerance
-                    total_prompt_tokens = prompt_usage_tokens
-
-            # Calculate TTFR and Estimated Prompt Processing Time
-            ttfr = 0
-            est_ppt = 0
-            if first_response_time > 0:
-                ttfr = first_response_time - start_time
-                est_ppt = ttfr - latency
-                if est_ppt < 0: est_ppt = 0
-
-            if est_ppt > 0:
-                result["pp_speed"] = total_prompt_tokens / est_ppt
-                result["est_ppt"] = est_ppt
-
-            if ttfr > 0:
-                result["ttfr"] = ttfr
-
-            if ttft > 0:
-                result["ttft"] = ttft
-
-            if e2e_ttft > 0:
-                result["e2e_ttft"] = e2e_ttft
-
-    except Exception as e:
-        print(f"Error during run: {e}")
-        return None
-
-    if post_run_cmd:
-        try:
-            subprocess.run(post_run_cmd, shell=True, check=True)
-        except subprocess.CalledProcessError as e:
-            print(f"Post-run command failed: {e}")
-
-    return result
-
+from .config import BenchmarkConfig
+from .corpus import TokenizedCorpus
+from .prompts import PromptGenerator
+from .client import LLMClient
+from .runner import BenchmarkRunner
 
 async def main_async():
-    args = parse_arguments()
+    # 1. Parse Configuration
+    config = BenchmarkConfig.from_args()
 
-    if args.enable_prefix_caching and args.no_cache:
-        print("Error: --enable-prefix-caching and --no-cache are incompatible.")
-        return
-
-    version_number = __version__
-
+    # 2. Print Header
     current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f"llama-benchy ({version_number})")
+    print(f"llama-benchy ({__version__})")
     print(f"Date: {current_time}")
-    print(f"Benchmarking model: {args.model} at {args.base_url}")
-
-    served_model_name = args.served_model_name if args.served_model_name else args.model
+    print(f"Benchmarking model: {config.model} at {config.base_url}")
+    print(f"Concurrency levels: {config.concurrency_levels}")
 
-    tokenizer = get_tokenizer(args.model, args.tokenizer)
-    all_tokens = prepare_text_data(args.book_url, tokenizer)
-    print(f"Total tokens available in text corpus: {len(all_tokens)}")
+    # 3. Prepare Data
+    corpus = TokenizedCorpus(config.book_url, config.tokenizer, config.model)
+    print(f"Total tokens available in text corpus: {len(corpus)}")
 
-    # Use a large timeout for long-running benchmarks
-    timeout = aiohttp.ClientTimeout(total=3600)
-    connector = aiohttp.TCPConnector(limit=1, force_close=False, keepalive_timeout=600)
-    async with aiohttp.ClientSession(timeout=timeout, connector=connector, trust_env=True) as session:
-        delta_user = 0
-        delta_context = 0
-        should_warmup = not args.no_warmup
-        if args.adapt_prompt:
-            should_warmup = True
-
-        if should_warmup:
-            delta_user, delta_context = await warmup(session, args.base_url, args.api_key, served_model_name, tokenizer if args.adapt_prompt else None)
-
-        latency = await measure_latency(session, args.base_url, args.api_key, args.latency_mode, served_model_name)
-
-        results = []
-
-        for depth in args.depth:
-            for pp in args.pp:
-                for tg in args.tg:
-                    print(f"Running test: pp={pp}, tg={tg}, depth={depth}")
-                    pp_speeds = []
-                    tg_speeds = []
-                    ttft_values = []
-                    ttfr_values = []
-                    est_ppt_values = []
-                    e2e_ttft_values = []
-
-                    ctx_pp_speeds = []
-                    ctx_tg_speeds = []
-                    ctx_ttfr_values = []
-                    ctx_est_ppt_values = []
-                    ctx_e2e_ttft_values = []
-
-                    for run in range(args.runs):
-                        current_pp = pp
-                        current_depth = depth
-                        if args.adapt_prompt:
-                            if depth == 0:
-                                current_pp = max(1, pp - delta_user)
-                            else:
-                                current_depth = max(1, depth - delta_context)
-
-                        context, prompt = generate_prompt(all_tokens, tokenizer, current_pp, current_depth, args.no_cache)
-
-                        if args.enable_prefix_caching and depth > 0:
-                            # Request 1: Context only
-                            # We send context as system message, and empty prompt as user message.
-                            # This establishes the prefix: [System: Context] [User: ""]
-                            # Expected PP tokens = current_depth (context size)
-                            print(f" Run {run+1}/{args.runs} (Context Load)...")
-                            ctx_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, "", current_depth, tg, args.no_cache, latency, None)
-
-                            if ctx_result:
-                                if ctx_result["pp_speed"] is not None:
-                                    ctx_pp_speeds.append(ctx_result["pp_speed"])
-                                if ctx_result["tg_speed"] is not None:
-                                    ctx_tg_speeds.append(ctx_result["tg_speed"])
-                                if ctx_result["ttfr"] is not None:
-                                    ctx_ttfr_values.append(ctx_result["ttfr"])
-                                if ctx_result["est_ppt"] is not None:
-                                    ctx_est_ppt_values.append(ctx_result["est_ppt"])
-                                if ctx_result["e2e_ttft"] is not None:
-                                    ctx_e2e_ttft_values.append(ctx_result["e2e_ttft"])
-
-                            # Request 2: Context + Prompt
-                            # We send context as system message, and prompt as user message.
-                            # The prefix [System: Context] should be cached.
-                            # Expected PP tokens = current_pp (prompt size only)
-                            print(f" Run {run+1}/{args.runs} (Inference)...")
-                            run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, current_pp, tg, args.no_cache, latency, args.post_run_cmd)
-                        else:
-                            # Standard run
-                            # Expected PP tokens = current_pp + current_depth
-                            expected_tokens = current_pp + current_depth
-                            run_result = await run_benchmark(session, args.base_url, args.api_key, served_model_name, context, prompt, expected_tokens, tg, args.no_cache, latency, args.post_run_cmd)
-
-                        if run_result:
-                            if run_result["tg_speed"] is not None:
-                                tg_speeds.append(run_result["tg_speed"])
-                            if run_result["pp_speed"] is not None:
-                                pp_speeds.append(run_result["pp_speed"])
-                            if run_result["est_ppt"] is not None:
-                                est_ppt_values.append(run_result["est_ppt"])
-                            if run_result["ttfr"] is not None:
-                                ttfr_values.append(run_result["ttfr"])
-                            if run_result["ttft"] is not None:
-                                ttft_values.append(run_result["ttft"])
-                            if run_result["e2e_ttft"] is not None:
-                                e2e_ttft_values.append(run_result["e2e_ttft"])
-
-                    # Aggregate results
-                    def format_result(values, multiplier=1.0):
-                        if not values: return ""
-                        mean = np.mean(values) * multiplier
-                        std = np.std(values) * multiplier
-                        return f"{mean:.2f} ± {std:.2f}"
-
-                    # Context PP (if enabled)
-                    if ctx_pp_speeds:
-                        test_name = f"ctx_pp @ d{depth}"
-                        results.append([
-                            args.model,
-                            test_name,
-                            format_result(ctx_pp_speeds),
-                            format_result(ctx_ttfr_values, 1000),
-                            format_result(ctx_est_ppt_values, 1000),
-                            format_result(ctx_e2e_ttft_values, 1000)
-                        ])
-
-                    # Context TG (if enabled)
-                    if ctx_tg_speeds:
-                        test_name = f"ctx_tg @ d{depth}"
-                        results.append([args.model, test_name, format_result(ctx_tg_speeds), "", "", ""])
-
-                    # Standard PP
-                    if pp_speeds:
-                        test_name = f"pp{pp}"
-                        if depth > 0: test_name += f" @ d{depth}"
-                        results.append([
-                            args.model,
-                            test_name,
-                            format_result(pp_speeds),
-                            format_result(ttfr_values, 1000),
-                            format_result(est_ppt_values, 1000),
-                            format_result(e2e_ttft_values, 1000)
-                        ])
-
-                    # Standard TG
-                    if tg_speeds:
-                        test_name = f"tg{tg}"
-                        if depth > 0: test_name += f" @ d{depth}"
-                        results.append([args.model, test_name, format_result(tg_speeds), "", "", ""])
-
-        print()
-        if not results:
-            print("No results collected. Check if the model is generating tokens.")
-        else:
-            print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-            print(f"\nllama-benchy ({version_number})")
-            print(f"date: {current_time} | latency mode: {args.latency_mode}")
-
+    # 4. Initialize Components
+    prompt_gen = PromptGenerator(corpus)
+    client = LLMClient(config.base_url, config.api_key, config.served_model_name)
+    runner = BenchmarkRunner(config, client, prompt_gen)
+
+    # 5. Run Benchmark Suite
+    await runner.run_suite()
+
+    print(f"\nllama-benchy ({__version__})")
+    print(f"date: {current_time} | latency mode: {config.latency_mode}")
 
 def main():
     """Entry point for the CLI command."""
     asyncio.run(main_async())
 
-
 if __name__ == "__main__":
-    main()
+    main()
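The 0.2.0 entry point above delegates everything that 0.1.1 kept inline to new modules (.config, .corpus, .prompts, .client, .runner) whose sources are not part of this file's diff. For orientation only, the sketch below shows one plausible shape for BenchmarkConfig.from_args(), reconstructed from the argparse flags that the removed parse_arguments() defined. It is an assumption, not the actual 0.2.0 code, and the --concurrency flag is inferred solely from the new "Concurrency levels" print.

# Hypothetical sketch (not taken from the 0.2.0 wheel): a BenchmarkConfig that
# wraps the same CLI flags the removed parse_arguments() exposed in 0.1.1.
from __future__ import annotations

import argparse
from dataclasses import dataclass, field


@dataclass
class BenchmarkConfig:
    base_url: str
    api_key: str
    model: str
    served_model_name: str
    tokenizer: str | None
    book_url: str
    latency_mode: str
    pp: list[int] = field(default_factory=lambda: [2048])
    tg: list[int] = field(default_factory=lambda: [32])
    depth: list[int] = field(default_factory=lambda: [0])
    runs: int = 3
    # Assumed new in 0.2.0; only hinted at by the "Concurrency levels" print.
    concurrency_levels: list[int] = field(default_factory=lambda: [1])

    @classmethod
    def from_args(cls) -> "BenchmarkConfig":
        p = argparse.ArgumentParser(description="LLM Benchmark Script")
        p.add_argument("--base-url", required=True)
        p.add_argument("--api-key", default="EMPTY")
        p.add_argument("--model", required=True)
        p.add_argument("--served-model-name", default=None)
        p.add_argument("--tokenizer", default=None)
        p.add_argument("--book-url", default="https://www.gutenberg.org/files/1661/1661-0.txt")
        p.add_argument("--latency-mode", default="api", choices=["api", "generation", "none"])
        p.add_argument("--pp", type=int, nargs="+", default=[2048])
        p.add_argument("--tg", type=int, nargs="+", default=[32])
        p.add_argument("--depth", type=int, nargs="+", default=[0])
        p.add_argument("--runs", type=int, default=3)
        p.add_argument("--concurrency", type=int, nargs="+", default=[1])  # assumed flag
        a = p.parse_args()
        return cls(
            base_url=a.base_url,
            api_key=a.api_key,
            model=a.model,
            served_model_name=a.served_model_name or a.model,
            tokenizer=a.tokenizer,
            book_url=a.book_url,
            latency_mode=a.latency_mode,
            pp=a.pp,
            tg=a.tg,
            depth=a.depth,
            runs=a.runs,
            concurrency_levels=a.concurrency,
        )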
llama_benchy/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.2.0'
+__version_tuple__ = version_tuple = (0, 2, 0)
 
 __commit_id__ = commit_id = None
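The version bump is the only change in this file. A minimal sketch of how a consumer could read these constants at runtime, assuming only the __version__ re-export that __main__.py itself uses plus the version_tuple defined above:

# Minimal sketch: gate downstream tooling on the 0.2.0 refactor at runtime.
from llama_benchy import __version__          # re-exported; __main__.py imports it the same way
from llama_benchy._version import version_tuple

print(f"llama-benchy {__version__}")
if version_tuple >= (0, 2, 0):
    # 0.2.0 ships the modular config/corpus/prompts/client/runner layout shown above
    from llama_benchy.config import BenchmarkConfig  # noqa: F401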