llama-benchy 0.1.0.tar.gz → 0.1.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/PKG-INFO +25 -3
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/README.md +24 -2
- llama_benchy-0.1.1/src/llama_benchy/__init__.py +8 -0
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/src/llama_benchy/__main__.py +4 -4
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/src/llama_benchy/_version.py +2 -2
- llama_benchy-0.1.0/src/llama_benchy/__init__.py +0 -24
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/.gitignore +0 -0
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/LICENSE +0 -0
- {llama_benchy-0.1.0 → llama_benchy-0.1.1}/pyproject.toml +0 -0
{llama_benchy-0.1.0 → llama_benchy-0.1.1}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.0
+Version: 0.1.1
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Author: eugr
 License: MIT License
@@ -74,12 +74,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.

+# Current Limitations
+
+- Evaluates against `/v1/chat/completions` endpoint only.
+- Doesn't measure throughput in concurrency mode (coming later).
+- Outputs results as a Markdown table only for now.
+
 ## Installation

-
+Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/

 ### Option 1: Run without installation using `uvx`

+Run the release version from PyPI:
+
+```bash
+uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+```
+
+Run the latest version from the main branch:
+
 ```bash
 uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 ```
@@ -124,6 +138,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>

 ### Option 3: Install into system path

+Release version from PyPI:
+
+```bash
+uv pip install -U llama-benchy
+```
+
+Current version from the main branch:
+
 ```bash
 uv pip install git+https://github.com/eugr/llama-benchy --system
 ```
@@ -231,7 +253,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
 - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.

-
+In this case, `pp` and `tg` speeds will show an actual prompt processing / token generation speeds for a follow up prompt with a context pre-filled.

 ### Example

````
{llama_benchy-0.1.0 → llama_benchy-0.1.1}/README.md

````diff
@@ -29,12 +29,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.

+# Current Limitations
+
+- Evaluates against `/v1/chat/completions` endpoint only.
+- Doesn't measure throughput in concurrency mode (coming later).
+- Outputs results as a Markdown table only for now.
+
 ## Installation

-
+Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/

 ### Option 1: Run without installation using `uvx`

+Run the release version from PyPI:
+
+```bash
+uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+```
+
+Run the latest version from the main branch:
+
 ```bash
 uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 ```
@@ -79,6 +93,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>

 ### Option 3: Install into system path

+Release version from PyPI:
+
+```bash
+uv pip install -U llama-benchy
+```
+
+Current version from the main branch:
+
 ```bash
 uv pip install git+https://github.com/eugr/llama-benchy --system
 ```
@@ -186,7 +208,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
 - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.

-
+In this case, `pp` and `tg` speeds will show an actual prompt processing / token generation speeds for a follow up prompt with a context pre-filled.

 ### Example

````
{llama_benchy-0.1.0 → llama_benchy-0.1.1}/src/llama_benchy/__main__.py

````diff
@@ -19,12 +19,13 @@ from transformers import AutoTokenizer
 import requests

 # Build number is now imported from __init__.py
-from . import __version__, __build__
+from . import __version__



 def parse_arguments():
     parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
     parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
     parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
     parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
@@ -369,11 +370,10 @@ async def main_async():
         print("Error: --enable-prefix-caching and --no-cache are incompatible.")
         return

-    build_number = __build__
     version_number = __version__

     current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f"llama-benchy ({version_number}
+    print(f"llama-benchy ({version_number})")
     print(f"Date: {current_time}")
     print(f"Benchmarking model: {args.model} at {args.base_url}")

@@ -522,7 +522,7 @@ async def main_async():
         print("No results collected. Check if the model is generating tokens.")
     else:
         print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-        print(f"\nllama-benchy ({version_number}
+        print(f"\nllama-benchy ({version_number})")
         print(f"date: {current_time} | latency mode: {args.latency_mode}")


````
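The `--version` flag added above uses argparse's built-in `version` action, which prints the formatted string and exits before required arguments such as `--base-url` and `--model` are enforced. A minimal sketch of that behavior, assuming the console script is invoked as `llama-benchy`:

```python
import argparse

# Minimal reproduction of the --version flag added in 0.1.1. The prog name
# "llama-benchy" is an assumption; it depends on how the script is invoked.
__version__ = "0.1.1"

parser = argparse.ArgumentParser(prog="llama-benchy", description="LLM Benchmark Script")
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")

# Prints "llama-benchy 0.1.1" and raises SystemExit, without requiring
# --base-url or --model to be present.
parser.parse_args(["--version"])
```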
{llama_benchy-0.1.0 → llama_benchy-0.1.1}/src/llama_benchy/_version.py

````diff
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID

-__version__ = version = '0.1.0'
-__version_tuple__ = version_tuple = (0, 1, 0)
+__version__ = version = '0.1.1'
+__version_tuple__ = version_tuple = (0, 1, 1)

 __commit_id__ = commit_id = None
````
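Since `__main__.py` now imports the version with `from . import __version__`, the bumped value from `_version.py` should also be visible on the package itself. A quick sanity check, assuming 0.1.1 is installed in the current environment:

```python
# Assumes llama-benchy 0.1.1 is installed; __version__ is re-exported at package level,
# which is what the `from . import __version__` line in __main__.py relies on.
import llama_benchy

print(llama_benchy.__version__)             # expected: 0.1.1
assert llama_benchy.__version__ == "0.1.1"
```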
llama_benchy-0.1.0/src/llama_benchy/__init__.py

````diff
@@ -1,24 +0,0 @@
-"""
-llama-benchy - llama-bench style benchmarking tool for all backends
-
-This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
-generating statistics similar to `llama-bench`.
-"""
-
-from ._version import __version__
-
-# Extract build number from the version string
-# Version format is like: '0.1.dev34+g33f03d886.d20260105'
-# We want to extract the git hash part: '33f03d886'
-__build__ = "unknown"
-if "+" in __version__:
-    try:
-        # Extract the part after the '+' and before the '.'
-        build_part = __version__.split("+")[1].split(".")[0]
-        # Remove the 'g' prefix if it exists
-        if build_part.startswith("g"):
-            __build__ = build_part[1:]
-        else:
-            __build__ = build_part
-    except (IndexError, AttributeError):
-        pass
````
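For reference, the build-number extraction removed above can be run standalone. The sketch below lifts the deleted logic into a function; the sample version string is the one quoted in the removed comments, and the 0.1.1 release string yields "unknown" because it has no `+g<hash>` local segment:

```python
# Standalone copy of the build-number extraction that 0.1.1 removes from __init__.py.
def extract_build(version: str) -> str:
    """Return the git hash embedded in a dev version string, or "unknown"."""
    build = "unknown"
    if "+" in version:
        try:
            # Take the part after '+' and before the first '.' (e.g. 'g33f03d886').
            build_part = version.split("+")[1].split(".")[0]
            # Drop the leading 'g' that marks a git hash.
            build = build_part[1:] if build_part.startswith("g") else build_part
        except (IndexError, AttributeError):
            pass
    return build

print(extract_build("0.1.dev34+g33f03d886.d20260105"))  # -> 33f03d886
print(extract_build("0.1.1"))                           # -> unknown
```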
Files without changes: {llama_benchy-0.1.0 → llama_benchy-0.1.1}/.gitignore, LICENSE, pyproject.toml.