llama-benchy 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama_benchy/client.py ADDED
@@ -0,0 +1,199 @@
+ import time
+ import json
+ import codecs
+ import aiohttp
+ import asyncio
+ import numpy as np
+ from dataclasses import dataclass
+ from typing import Optional, List, Dict, Any
+
+ @dataclass
+ class RequestResult:
+     start_ts: float = 0.0
+     end_ts: float = 0.0
+     first_token_ts: Optional[float] = None
+     first_response_ts: Optional[float] = None
+     prompt_tokens: int = 0
+     total_tokens: int = 0
+     error: Optional[str] = None
+
+ class LLMClient:
+     def __init__(self, base_url: str, api_key: str, model_name: str):
+         self.base_url = base_url
+         self.api_key = api_key
+         self.model_name = model_name
+         self.headers = {"Authorization": f"Bearer {api_key}"}
+
+     async def measure_latency(self, session: aiohttp.ClientSession, mode: str = "api") -> float:
+         if mode == "none":
+             print("Skipping latency measurement (assuming 0 ms).")
+             return 0
+
+         print(f"Measuring latency using mode: {mode}...")
+         latencies = []
+
+         for _ in range(3):
+             start = time.perf_counter()
+             try:
+                 if mode == "api":
+                     async with session.get(f"{self.base_url}/models", headers=self.headers) as response:
+                         await response.read()
+                         latencies.append(time.perf_counter() - start)
+                 elif mode == "generation":
+                     payload = {
+                         "model": self.model_name,
+                         "messages": [{"role": "user", "content": "hello"}],
+                         "max_tokens": 1,
+                         "stream": True
+                     }
+                     async with session.post(f"{self.base_url}/chat/completions", json=payload, headers=self.headers) as response:
+                         async for _ in response.content:
+                             latencies.append(time.perf_counter() - start)
+                             break
+                         async for _ in response.content: pass
+             except Exception as e:
+                 print(f"Error measuring latency: {e}")
+
+         if latencies:
+             avg_latency = np.mean(latencies)
+             print(f"Average latency ({mode}): {avg_latency*1000:.2f} ms")
+             return avg_latency
+         return 0
+
+     async def warmup(self, session: aiohttp.ClientSession, tokenizer=None):
+         print("Warming up...")
+         warmup_text = "Warmup " * 10
+
+         delta_user = 0
+         delta_context = 0
+
+         # 1. User only
+         payload_user = {
+             "model": self.model_name,
+             "messages": [{"role": "user", "content": warmup_text}],
+             "max_tokens": 1
+         }
+
+         try:
+             async with session.post(f"{self.base_url}/chat/completions", json=payload_user, headers=self.headers) as response:
+                 response_json = await response.json()
+                 if tokenizer:
+                     if 'usage' in response_json:
+                         prompt_tokens = response_json['usage']['prompt_tokens']
+                         local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
+                         delta_user = prompt_tokens - local_tokens
+                         print(f"Warmup (User only) complete. Delta: {delta_user} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
+                     else:
+                         print("Warmup (User only) complete (no usage stats found).")
+                 else:
+                     print("Warmup complete.")
+
+             if tokenizer:
+                 # 2. Context Only
+                 payload_sys_empty = {
+                     "model": self.model_name,
+                     "messages": [
+                         {"role": "system", "content": warmup_text},
+                         {"role": "user", "content": ""}
+                     ],
+                     "max_tokens": 1
+                 }
+                 async with session.post(f"{self.base_url}/chat/completions", json=payload_sys_empty, headers=self.headers) as response:
+                     response_json = await response.json()
+                     if 'usage' in response_json:
+                         prompt_tokens = response_json['usage']['prompt_tokens']
+                         local_tokens = len(tokenizer.encode(warmup_text, add_special_tokens=False))
+                         delta_context = prompt_tokens - local_tokens
+                         print(f"Warmup (System+Empty) complete. Delta: {delta_context} tokens (Server: {prompt_tokens}, Local: {local_tokens})")
+                     else:
+                         delta_context = delta_user
+         except Exception as e:
+             print(f"Warmup failed: {e}")
+         return delta_user, delta_context
+
+     async def run_generation(
+         self,
+         session: aiohttp.ClientSession,
+         context_text: str,
+         prompt_text: str,
+         max_tokens: int,
+         no_cache: bool
+     ) -> RequestResult:
+
+         messages = []
+         if context_text:
+             messages.append({"role": "system", "content": context_text})
+         messages.append({"role": "user", "content": prompt_text})
+
+         result = RequestResult()
+
+         try:
+             payload = {
+                 "model": self.model_name,
+                 "messages": messages,
+                 "max_tokens": max_tokens,
+                 "stream": True,
+                 "stream_options": {"include_usage": True},
+             }
+
+             if no_cache:
+                 payload["cache_prompt"] = False
+
+             result.start_ts = time.perf_counter()
+
+             async with session.post(f"{self.base_url}/chat/completions", json=payload, headers=self.headers) as response:
+                 if response.status != 200:
+                     error_text = await response.text()
+                     result.error = f"HTTP {response.status}: {error_text}"
+                     print(result.error)
+                     return result
+
+                 decoder = codecs.getincrementaldecoder("utf-8")(errors='replace')
+                 buffer = ""
+
+                 async for chunk_bytes in response.content:
+                     chunk_time = time.perf_counter()
+                     decoded_chunk = decoder.decode(chunk_bytes, final=False)
+                     buffer += decoded_chunk
+
+                     while "\n" in buffer:
+                         line, buffer = buffer.split("\n", 1)
+                         line = line.strip()
+                         if not line:
+                             continue
+
+                         if line == 'data: [DONE]' or line == 'data:[DONE]':
+                             continue
+
+                         if line.startswith('data:'):
+                             try:
+                                 json_str = line[5:].strip()
+                                 chunk = json.loads(json_str)
+
+                                 if 'usage' in chunk:
+                                     result.prompt_tokens = chunk['usage'].get('prompt_tokens', 0)
+
+                                 if 'choices' in chunk and len(chunk['choices']) > 0:
+                                     if result.first_response_ts is None:
+                                         result.first_response_ts = chunk_time
+
+                                     delta = chunk['choices'][0].get('delta', {})
+                                     content = delta.get('content')
+                                     reasoning_content = delta.get('reasoning_content')
+                                     reasoning = delta.get('reasoning')
+
+                                     if content or reasoning_content or reasoning:
+                                         if result.first_token_ts is None:
+                                             result.first_token_ts = chunk_time
+
+                                         result.total_tokens += 1
+                             except json.JSONDecodeError:
+                                 continue
+
+             result.end_ts = time.perf_counter()
+
+         except Exception as e:
+             print(f"Error during run: {e}")
+             result.error = str(e)
+
+         return result
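The client above speaks an OpenAI-compatible chat completions API over aiohttp and parses the SSE stream line by line. For orientation, a minimal hypothetical usage sketch (not part of the package diff); the endpoint URL and model name are placeholder assumptions.

# Hypothetical sketch, not part of the diff; URL and model name are assumptions.
import asyncio
import aiohttp

from llama_benchy.client import LLMClient


async def main():
    client = LLMClient(
        base_url="http://localhost:8000/v1",  # assumed OpenAI-compatible endpoint
        api_key="EMPTY",
        model_name="my-model",                # placeholder model name
    )
    async with aiohttp.ClientSession() as session:
        # Round-trip latency from the /models endpoint, then one streamed run.
        latency = await client.measure_latency(session, mode="api")
        result = await client.run_generation(
            session,
            context_text="",
            prompt_text="Hello, world",
            max_tokens=32,
            no_cache=True,
        )
    if result.error is None and result.first_token_ts is not None:
        ttft = result.first_token_ts - result.start_ts
        total = result.end_ts - result.start_ts
        print(f"latency: {latency*1000:.1f} ms, TTFT: {ttft*1000:.1f} ms, "
              f"{result.total_tokens} content chunks in {total:.2f} s")


asyncio.run(main())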
llama_benchy/config.py ADDED
@@ -0,0 +1,76 @@
+ from dataclasses import dataclass, field
+ from typing import List, Optional
+ import argparse
+ import os
+ from ._version import __version__
+
+ @dataclass
+ class BenchmarkConfig:
+     base_url: str
+     api_key: str
+     model: str
+     served_model_name: str
+     tokenizer: Optional[str]
+     pp_counts: List[int]
+     tg_counts: List[int]
+     depths: List[int]
+     num_runs: int
+     no_cache: bool
+     latency_mode: str
+     no_warmup: bool
+     adapt_prompt: bool
+     enable_prefix_caching: bool
+     book_url: str
+     post_run_cmd: Optional[str]
+     concurrency_levels: List[int]
+     save_result: Optional[str] = None
+     result_format: str = "md"
+
+     @classmethod
+     def from_args(cls):
+         parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+         parser.add_argument('--version', action='version', version=f'%(prog)s {__version__}')
+         parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
+         parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
+         parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
+         parser.add_argument("--served-model-name", type=str, default=None, help="Model name used in API calls (defaults to --model if not specified)")
+         parser.add_argument("--tokenizer", type=str, default=None, help="HuggingFace tokenizer name (defaults to model name)")
+         parser.add_argument("--pp", type=int, nargs='+', required=False, default=[2048], help="List of prompt processing token counts - default: 2048")
+         parser.add_argument("--tg", type=int, nargs='+', required=False, default=[32], help="List of token generation counts - default: 32")
+         parser.add_argument("--depth", type=int, nargs='+', default=[0], help="List of context depths (previous conversation tokens) - default: 0")
+         parser.add_argument("--runs", type=int, default=3, help="Number of runs per test - default: 3")
+         parser.add_argument("--no-cache", action="store_true", help="Ensure unique requests to avoid prefix caching and send cache_prompt=false to the server")
+         parser.add_argument("--post-run-cmd", type=str, default=None, help="Command to execute after each test run")
+         parser.add_argument("--book-url", type=str, default="https://www.gutenberg.org/files/1661/1661-0.txt", help="URL of a book to use for text generation, defaults to Sherlock Holmes")
+         parser.add_argument("--latency-mode", type=str, default="api", choices=["api", "generation", "none"], help="Method to measure latency: 'api' (list models) - default, 'generation' (single token generation), or 'none' (skip latency measurement)")
+         parser.add_argument("--no-warmup", action="store_true", help="Skip warmup phase")
+         parser.add_argument("--adapt-prompt", action="store_true", default=True, help="Adapt prompt size based on warmup token usage delta (default: True)")
+         parser.add_argument("--no-adapt-prompt", action="store_false", dest="adapt_prompt", help="Disable prompt size adaptation")
+         parser.add_argument("--enable-prefix-caching", action="store_true", help="Enable prefix caching performance measurement")
+         parser.add_argument("--concurrency", type=int, nargs='+', default=[1], help="List of concurrency levels (number of concurrent requests per test) - default: [1]")
+         parser.add_argument("--save-result", type=str, help="File to save results to")
+         parser.add_argument("--format", type=str, default="md", choices=["md", "json", "csv"], help="Output format")
+
+         args = parser.parse_args()
+
+         return cls(
+             base_url=args.base_url,
+             api_key=args.api_key,
+             model=args.model,
+             served_model_name=args.served_model_name if args.served_model_name else args.model,
+             tokenizer=args.tokenizer,
+             pp_counts=args.pp,
+             tg_counts=args.tg,
+             depths=args.depth,
+             num_runs=args.runs,
+             no_cache=args.no_cache,
+             latency_mode=args.latency_mode,
+             no_warmup=args.no_warmup,
+             adapt_prompt=args.adapt_prompt,
+             enable_prefix_caching=args.enable_prefix_caching,
+             book_url=args.book_url,
+             post_run_cmd=args.post_run_cmd,
+             concurrency_levels=args.concurrency,
+             save_result=args.save_result,
+             result_format=args.format
+         )
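BenchmarkConfig.from_args() reads everything from argparse, so the whole benchmark is configured from the command line. A hypothetical sketch of exercising the parser by supplying argv directly; the URL, model name, and output file are placeholder assumptions, and the prog name below is arbitrary (the console-script name is not shown in this diff).

# Hypothetical sketch; argv values are placeholders, not part of the diff.
import sys

from llama_benchy.config import BenchmarkConfig

sys.argv = [
    "llama-benchy",  # prog name placeholder
    "--base-url", "http://localhost:8000/v1",
    "--model", "my-model",
    "--pp", "512", "2048",
    "--tg", "64",
    "--depth", "0", "1024",
    "--concurrency", "1", "4",
    "--no-cache",
    "--save-result", "results.md",
    "--format", "md",
]

config = BenchmarkConfig.from_args()
print(config.pp_counts, config.tg_counts, config.depths, config.concurrency_levels)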
llama_benchy/corpus.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ import hashlib
+ import requests
+ from transformers import AutoTokenizer
+
+ class TokenizedCorpus:
+     def __init__(self, book_url: str, tokenizer_name: str, model_name: str):
+         self.book_url = book_url
+         self.tokenizer = self._get_tokenizer(model_name, tokenizer_name)
+         self.tokens = self._load_data()
+
+     def _get_tokenizer(self, model_name, tokenizer_name=None):
+         try:
+             name = tokenizer_name if tokenizer_name else model_name
+             return AutoTokenizer.from_pretrained(name)
+         except Exception as e:
+             print(f"Error loading tokenizer: {e}")
+             print("Falling back to 'gpt2' tokenizer as approximation.")
+             return AutoTokenizer.from_pretrained("gpt2")
+
+     def _load_data(self):
+         try:
+             # Create cache directory if it doesn't exist
+             cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "llama-benchy")
+             os.makedirs(cache_dir, exist_ok=True)
+
+             # Generate hash of the URL for the filename
+             url_hash = hashlib.md5(self.book_url.encode()).hexdigest()
+             cache_file = os.path.join(cache_dir, f"{url_hash}.txt")
+
+             if os.path.exists(cache_file):
+                 print(f"Loading text from cache: {cache_file}")
+                 with open(cache_file, "r", encoding="utf-8") as f:
+                     text = f.read()
+             else:
+                 print(f"Downloading book from {self.book_url}...")
+                 response = requests.get(self.book_url)
+                 response.raise_for_status()
+                 text = response.text
+                 # Basic cleanup
+                 start_idx = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
+                 if start_idx != -1:
+                     text = text[start_idx:]
+
+                 # Save to cache
+                 with open(cache_file, "w", encoding="utf-8") as f:
+                     f.write(text)
+                 print(f"Saved text to cache: {cache_file}")
+
+             return self.tokenizer.encode(text, add_special_tokens=False)
+         except Exception as e:
+             print(f"Error downloading or processing book: {e}")
+             exit(1)
+
+     def get_tokenizer(self):
+         return self.tokenizer
+
+     def get_tokens(self):
+         return self.tokens
+
+     def __len__(self):
+         return len(self.tokens)
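TokenizedCorpus downloads the source text once, caches it under ~/.cache/llama-benchy keyed by an MD5 of the URL, and exposes the full token list plus the tokenizer. A hypothetical usage sketch (the tokenizer/model name is a placeholder; any HuggingFace tokenizer id would do):

# Hypothetical sketch; tokenizer choice is an assumption.
from llama_benchy.corpus import TokenizedCorpus

corpus = TokenizedCorpus(
    book_url="https://www.gutenberg.org/files/1661/1661-0.txt",
    tokenizer_name=None,   # None falls back to the model name below
    model_name="gpt2",     # placeholder tokenizer id
)
print(f"Corpus holds {len(corpus)} tokens")
tokenizer = corpus.get_tokenizer()
print(tokenizer.decode(corpus.get_tokens()[:32]))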
@@ -0,0 +1,54 @@
+ import uuid
+ import numpy as np
+ from typing import Tuple, List
+
+ from .corpus import TokenizedCorpus
+
+ class PromptGenerator:
+     def __init__(self, corpus: TokenizedCorpus):
+         self.corpus = corpus
+         self.tokenizer = corpus.get_tokenizer()
+         self.all_tokens = corpus.get_tokens()
+
+     def generate(self, prompt_tokens: int, context_tokens: int = 0, no_cache: bool = False) -> Tuple[str, str]:
+         """
+         Generates a single (context, prompt) pair.
+         """
+         suffix = ""
+         suffix_len = 0
+         if no_cache:
+             suffix = f" {uuid.uuid4()}"
+             suffix_len = len(self.tokenizer.encode(suffix, add_special_tokens=False))
+
+         # Adjust prompt tokens to fetch from text
+         text_prompt_tokens = max(0, prompt_tokens - suffix_len)
+
+         # Create a pool of tokens large enough
+         total_needed = text_prompt_tokens + context_tokens
+
+         # Create a local reference to tokens to potentially extend
+         current_tokens = self.all_tokens
+
+         if len(current_tokens) < total_needed:
+             # Repeat tokens if not enough
+             current_tokens = current_tokens * (total_needed // len(current_tokens) + 2)
+
+         # Pick a random start position
+         max_start = len(current_tokens) - total_needed
+         start_idx = np.random.randint(0, max_start)
+
+         selected_tokens = current_tokens[start_idx : start_idx + total_needed]
+
+         context_text = self.tokenizer.decode(selected_tokens[:context_tokens]) if context_tokens > 0 else ""
+         prompt_text = self.tokenizer.decode(selected_tokens[context_tokens:])
+
+         if no_cache:
+             prompt_text += suffix
+
+         return context_text, prompt_text
+
+     def generate_batch(self, batch_size: int, prompt_tokens: int, context_tokens: int = 0, no_cache: bool = False) -> List[Tuple[str, str]]:
+         """
+         Generates a batch of (context, prompt) pairs.
+         """
+         return [self.generate(prompt_tokens, context_tokens, no_cache) for _ in range(batch_size)]
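PromptGenerator slices random windows out of the corpus token pool and, with no_cache, appends a UUID suffix so repeated requests cannot hit a prefix cache. A hypothetical usage sketch; the hunk above omits the file name, so the PromptGenerator module path in the import is an assumption.

# Hypothetical sketch; the PromptGenerator module path is an assumption,
# since the file name is not shown in this diff.
from llama_benchy.corpus import TokenizedCorpus
from llama_benchy.prompts import PromptGenerator  # assumed module path

corpus = TokenizedCorpus(
    book_url="https://www.gutenberg.org/files/1661/1661-0.txt",
    tokenizer_name=None,
    model_name="gpt2",  # placeholder tokenizer id
)
generator = PromptGenerator(corpus)

# One 2048-token prompt with a 1024-token "previous conversation" context,
# made cache-unfriendly by the UUID suffix.
context_text, prompt_text = generator.generate(
    prompt_tokens=2048, context_tokens=1024, no_cache=True
)

# Four independent pairs, e.g. for a concurrency-4 run.
batch = generator.generate_batch(
    batch_size=4, prompt_tokens=512, context_tokens=0, no_cache=True
)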