llama-benchy 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_benchy/__init__.py +0 -16
- llama_benchy/__main__.py +26 -8
- llama_benchy/_version.py +2 -2
- {llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/METADATA +25 -3
- llama_benchy-0.1.2.dist-info/RECORD +8 -0
- llama_benchy-0.1.0.dist-info/RECORD +0 -8
- {llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/WHEEL +0 -0
- {llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/entry_points.txt +0 -0
- {llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/licenses/LICENSE +0 -0
llama_benchy/__init__.py
CHANGED
@@ -6,19 +6,3 @@ generating statistics similar to `llama-bench`.
 """
 
 from ._version import __version__
-
-# Extract build number from the version string
-# Version format is like: '0.1.dev34+g33f03d886.d20260105'
-# We want to extract the git hash part: '33f03d886'
-__build__ = "unknown"
-if "+" in __version__:
-    try:
-        # Extract the part after the '+' and before the '.'
-        build_part = __version__.split("+")[1].split(".")[0]
-        # Remove the 'g' prefix if it exists
-        if build_part.startswith("g"):
-            __build__ = build_part[1:]
-        else:
-            __build__ = build_part
-    except (IndexError, AttributeError):
-        pass
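For reference, the logic removed here derived a short git-hash build identifier from a setuptools-scm style version string. A minimal standalone sketch of the same parsing, using the sample version from the removed comments (not the packaged module itself):

```python
# Sketch of the build-hash extraction dropped in 0.1.2.
version = "0.1.dev34+g33f03d886.d20260105"  # sample from the removed comment

build = "unknown"
if "+" in version:
    try:
        # Local-version segment after '+', up to the first '.', e.g. 'g33f03d886'
        build_part = version.split("+")[1].split(".")[0]
        # setuptools-scm prefixes the commit hash with 'g'; strip it
        build = build_part[1:] if build_part.startswith("g") else build_part
    except (IndexError, AttributeError):
        pass

print(build)  # -> 33f03d886
```

In 0.1.2 this is gone along with `__build__`, and `__main__.py` reports only `__version__`.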
llama_benchy/__main__.py
CHANGED
@@ -19,12 +19,13 @@ from transformers import AutoTokenizer
 import requests
 
 # Build number is now imported from __init__.py
-from . import
+from . import __version__
 
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
     parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
     parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
     parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
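The new `--version` flag relies on argparse's built-in `version` action, which prints the formatted string and exits. A minimal standalone sketch of the same pattern (program name and version value hard-coded here for illustration):

```python
import argparse

__version__ = "0.1.2"  # stand-in; the package pulls this from llama_benchy._version

parser = argparse.ArgumentParser(prog="llama-benchy", description="LLM Benchmark Script")
# action="version" prints the string below and exits with status 0
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

parser.parse_args(["--version"])  # prints "llama-benchy 0.1.2" and exits
```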
@@ -240,6 +241,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
         "est_ppt": None,
         "e2e_ttft": None
     }
+
+    # DEBUG: Buffer to store first few lines of raw response
+    debug_lines = []
 
     try:
         payload = {
@@ -275,12 +279,22 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
                 while "\n" in buffer:
                     line, buffer = buffer.split("\n", 1)
                     line = line.strip()
-                    if not line
+                    if not line:
+                        continue
+
+                    # Capture first 5 lines for debugging if needed
+                    if len(debug_lines) < 5:
+                        debug_lines.append(line)
+
+                    if line == 'data: [DONE]' or line == 'data:[DONE]':
                         continue
 
-                    if line.startswith('data:
+                    if line.startswith('data:'):
                         try:
-
+                            # Strip 'data:' and potential whitespace
+                            json_str = line[5:].strip()
+                            chunk = json.loads(json_str)
+
                             if 'usage' in chunk:
                                 prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
 
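The reworked stream loop parses OpenAI-style SSE lines more defensively: it tolerates `data:` with or without a trailing space, skips `[DONE]` markers, and keeps a small sample of raw lines for the zero-token warning added later. A minimal sketch of that parsing over a canned stream (the payloads are made up for illustration):

```python
import json

# Hypothetical raw lines, as they might arrive from a streaming /v1/chat/completions call
raw_lines = [
    'data: {"choices": [{"delta": {"content": "Hel"}}]}',
    'data:{"choices": [{"delta": {"reasoning": "thinking..."}}]}',  # no space after 'data:'
    '',
    'data: [DONE]',
]

debug_lines = []
for line in (l.strip() for l in raw_lines):
    if not line:
        continue
    if len(debug_lines) < 5:  # keep a few raw lines for debugging
        debug_lines.append(line)
    if line in ('data: [DONE]', 'data:[DONE]'):
        continue
    if line.startswith('data:'):
        chunk = json.loads(line[5:].strip())  # drop the 'data:' prefix and any whitespace
        delta = chunk['choices'][0].get('delta', {})
        text = delta.get('content') or delta.get('reasoning_content') or delta.get('reasoning')
        print(text)  # -> "Hel", then "thinking..."
```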
@@ -291,8 +305,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
                             delta = chunk['choices'][0].get('delta', {})
                             content = delta.get('content')
                             reasoning_content = delta.get('reasoning_content')
+                            reasoning = delta.get('reasoning')
 
-                            if content or reasoning_content:
+                            if content or reasoning_content or reasoning:
                                 if token_count == 0:
                                     first_token_time = chunk_time
                                     e2e_ttft = first_token_time - start_time
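The first streamed delta drives the TTFT figures; a tiny sketch of that timing logic in isolation (the loop over text pieces stands in for the real chunk handling):

```python
import time

start_time = time.perf_counter()
token_count = 0
e2e_ttft = None

for piece in ["Hel", "lo", " world"]:  # stand-ins for content/reasoning_content/reasoning deltas
    chunk_time = time.perf_counter()
    if piece:
        if token_count == 0:
            first_token_time = chunk_time
            e2e_ttft = first_token_time - start_time  # end-to-end time to first token
        token_count += 1

print(f"e2e_ttft: {e2e_ttft * 1000:.3f} ms over {token_count} chunks")
```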
@@ -306,6 +321,10 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
 
         end_time = time.perf_counter()
 
+        # DEBUG: Print warning if no tokens were collected
+        if token_count == 0:
+            print(f"\n[Warning] Run generated 0 tokens. Raw response sample: {debug_lines}")
+
         if token_count > 0:
             # Calculate decode time (time for subsequent tokens)
             # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
@@ -369,11 +388,10 @@ async def main_async():
         print("Error: --enable-prefix-caching and --no-cache are incompatible.")
         return
 
-    build_number = __build__
     version_number = __version__
 
     current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f"llama-benchy ({version_number}
+    print(f"llama-benchy ({version_number})")
     print(f"Date: {current_time}")
     print(f"Benchmarking model: {args.model} at {args.base_url}")
 
@@ -522,7 +540,7 @@
         print("No results collected. Check if the model is generating tokens.")
     else:
         print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-        print(f"\nllama-benchy ({version_number}
+        print(f"\nllama-benchy ({version_number})")
         print(f"date: {current_time} | latency mode: {args.latency_mode}")
 
 
llama_benchy/_version.py
CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.0'
-__version_tuple__ = version_tuple = (0, 1, 0)
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
 
 __commit_id__ = commit_id = None
{llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.0
+Version: 0.1.2
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Project-URL: Homepage, https://github.com/eugr/llama-benchy
 Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
@@ -76,12 +76,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.
 
+# Current Limitations
+
+- Evaluates against `/v1/chat/completions` endpoint only.
+- Doesn't measure throughput in concurrency mode (coming later).
+- Outputs results as a Markdown table only for now.
+
 ## Installation
 
-
+Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/
 
 ### Option 1: Run without installation using `uvx`
 
+Run the release version from PyPI:
+
+```bash
+uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+```
+
+Run the latest version from the main branch:
+
 ```bash
 uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 ```
@@ -126,6 +140,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 
 ### Option 3: Install into system path
 
+Release version from PyPI:
+
+```bash
+uv pip install -U llama-benchy
+```
+
+Current version from the main branch:
+
 ```bash
 uv pip install git+https://github.com/eugr/llama-benchy --system
 ```
@@ -233,7 +255,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
    - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.
 
-
+In this case, the `pp` and `tg` speeds show the actual prompt processing / token generation speeds for a follow-up prompt with the context pre-filled.
 
 ### Example
 
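The cache-warm-then-infer flow described in the README maps onto two ordinary chat-completion requests. A rough sketch of that request pattern against any OpenAI-compatible endpoint (the URL, model name, and payload values are illustrative, not the tool's actual code):

```python
import requests

BASE_URL = "http://localhost:8000/v1"   # illustrative endpoint
MODEL = "my-model"                       # illustrative model name
context = "<long shared context>"        # the depth-d prefix to be cached
prompt = "Summarize the context above."

def chat(messages, max_tokens):
    resp = requests.post(
        f"{BASE_URL}/chat/completions",
        headers={"Authorization": "Bearer EMPTY"},
        json={"model": MODEL, "messages": messages, "max_tokens": max_tokens},
        timeout=600,
    )
    resp.raise_for_status()
    return resp.json()

# 1. Cache warm-up: send only the shared context so the server can prefill and cache it
chat([{"role": "system", "content": context}], max_tokens=1)

# 2. Inference: same context plus the real prompt; a caching server should reuse the prefix,
#    so the measured pp/tg speeds reflect the follow-up request with the context pre-filled
chat(
    [{"role": "system", "content": context},
     {"role": "user", "content": prompt}],
    max_tokens=128,
)
```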
llama_benchy-0.1.2.dist-info/RECORD
ADDED
@@ -0,0 +1,8 @@
+llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
+llama_benchy/__main__.py,sha256=ArgfdkzjgVv-tdoRW0WXxKEGfdbFDzmH6h3w3lay5zI,25120
+llama_benchy/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
+llama_benchy-0.1.2.dist-info/METADATA,sha256=oiJHBXHW_74XnVoKPvALBVP5-sXibFPDtELiCcdQaFw,13439
+llama_benchy-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llama_benchy-0.1.2.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+llama_benchy-0.1.2.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+llama_benchy-0.1.2.dist-info/RECORD,,
llama_benchy-0.1.0.dist-info/RECORD
REMOVED
@@ -1,8 +0,0 @@
-llama_benchy/__init__.py,sha256=4DTHEmeJShcJJdb6rPhFqv4_zREEQRZGZ3h0ThA6YpU,798
-llama_benchy/__main__.py,sha256=CZEf_36w5iut5RZjAZ4F894PAHF3hoxDWkT6lTFcr_I,24351
-llama_benchy/_version.py,sha256=5jwwVncvCiTnhOedfkzzxmxsggwmTBORdFL_4wq0ZeY,704
-llama_benchy-0.1.0.dist-info/METADATA,sha256=WEL0ASCMSmJA8QPXJzAYA0fAztA_6-D7xGjNdx3o4vY,12943
-llama_benchy-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llama_benchy-0.1.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
-llama_benchy-0.1.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
-llama_benchy-0.1.0.dist-info/RECORD,,
{llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/WHEEL
File without changes

{llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/entry_points.txt
File without changes

{llama_benchy-0.1.0.dist-info → llama_benchy-0.1.2.dist-info}/licenses/LICENSE
File without changes