llama-benchy 0.1.0__tar.gz → 0.1.1__tar.gz

This diff shows the contents of publicly released package versions as they appear in their public registries and is provided for informational purposes only.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llama-benchy
- Version: 0.1.0
+ Version: 0.1.1
  Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
  Author: eugr
  License: MIT License
@@ -74,12 +74,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
  - Supports executing a command after each run (e.g., to clear cache).
  - Configurable latency measurement mode.

+ ## Current Limitations
+
+ - Evaluates against the `/v1/chat/completions` endpoint only.
+ - Doesn't measure throughput in concurrency mode (coming later).
+ - Outputs results as a Markdown table only for now.
+
  ## Installation

- Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
+ Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/

  ### Option 1: Run without installation using `uvx`

+ Run the release version from PyPI:
+
+ ```bash
+ uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+ ```
+
+ Run the latest version from the main branch:
+
  ```bash
  uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
  ```
@@ -124,6 +138,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>

  ### Option 3: Install into system path

+ Release version from PyPI:
+
+ ```bash
+ uv pip install -U llama-benchy
+ ```
+
+ Current version from the main branch:
+
  ```bash
  uv pip install git+https://github.com/eugr/llama-benchy --system
  ```
@@ -231,7 +253,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
  2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
     - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.

- By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
+ In this case, the `pp` and `tg` speeds show the actual prompt processing / token generation speeds for a follow-up prompt with the context pre-filled.

  ### Example

@@ -29,12 +29,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
  - Supports executing a command after each run (e.g., to clear cache).
  - Configurable latency measurement mode.

+ ## Current Limitations
+
+ - Evaluates against the `/v1/chat/completions` endpoint only.
+ - Doesn't measure throughput in concurrency mode (coming later).
+ - Outputs results as a Markdown table only for now.
+
  ## Installation

- Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
+ Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/

  ### Option 1: Run without installation using `uvx`

+ Run the release version from PyPI:
+
+ ```bash
+ uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+ ```
+
+ Run the latest version from the main branch:
+
  ```bash
  uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
  ```
@@ -79,6 +93,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>

  ### Option 3: Install into system path

+ Release version from PyPI:
+
+ ```bash
+ uv pip install -U llama-benchy
+ ```
+
+ Current version from the main branch:
+
  ```bash
  uv pip install git+https://github.com/eugr/llama-benchy --system
  ```
@@ -186,7 +208,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
  2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
     - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.

- By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
+ In this case, the `pp` and `tg` speeds show the actual prompt processing / token generation speeds for a follow-up prompt with the context pre-filled.

  ### Example

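The two-step flow described in that README change can be pictured with a minimal sketch. This is not the tool's actual implementation: the base URL, model name, and context size below are placeholder assumptions, and timing is simplified to the whole request rather than the ttfr/ppt/ttft breakdown the tool reports.

```python
# Minimal sketch of the two-step prefix-caching measurement described above.
# Assumptions: a local OpenAI-compatible server, a placeholder model name, and a
# crude end-to-end timer instead of the tool's detailed latency metrics.
import time
import requests

BASE_URL = "http://localhost:8000/v1"  # assumed endpoint
MODEL = "my-model"                     # assumed model name
CONTEXT = "lorem ipsum " * 2000        # stand-in for a d-token shared prefix


def timed_request(prompt: str) -> float:
    """Send a chat completion sharing the same system-message prefix and time it."""
    start = time.perf_counter()
    resp = requests.post(
        f"{BASE_URL}/chat/completions",
        json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": CONTEXT},
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 64,
        },
        timeout=600,
    )
    resp.raise_for_status()
    return time.perf_counter() - start


# Step 1: Context Load -- the server processes the long prefix and caches it.
load_s = timed_request("Reply with OK.")
# Step 2: Inference -- with prefix caching, prompt processing should be much faster.
infer_s = timed_request("Summarize the text above in one sentence.")
print(f"context load: {load_s:.2f}s, cached inference: {infer_s:.2f}s")
```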
@@ -0,0 +1,8 @@
+ """
+ llama-benchy - llama-bench style benchmarking tool for all backends
+
+ This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
+ generating statistics similar to `llama-bench`.
+ """
+
+ from ._version import __version__
@@ -19,12 +19,13 @@ from transformers import AutoTokenizer
  import requests

  # Build number is now imported from __init__.py
- from . import __build__, __version__
+ from . import __version__



  def parse_arguments():
      parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+     parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
      parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
      parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
      parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
@@ -369,11 +370,10 @@ async def main_async():
          print("Error: --enable-prefix-caching and --no-cache are incompatible.")
          return

-     build_number = __build__
      version_number = __version__

      current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-     print(f"llama-benchy ({version_number}.{build_number})")
+     print(f"llama-benchy ({version_number})")
      print(f"Date: {current_time}")
      print(f"Benchmarking model: {args.model} at {args.base_url}")

@@ -522,7 +522,7 @@ async def main_async():
          print("No results collected. Check if the model is generating tokens.")
      else:
          print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-         print(f"\nllama-benchy ({version_number}.{build_number})")
+         print(f"\nllama-benchy ({version_number})")
          print(f"date: {current_time} | latency mode: {args.latency_mode}")


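The `tablefmt="pipe"` argument in the context line above is what produces the Markdown table noted in the limitations section. A small sketch with made-up row values, reusing the same headers and alignment:

```python
# Sketch of the Markdown ("pipe") table layout produced by tabulate.
# The row values are made up purely to show the output format.
from tabulate import tabulate

rows = [
    ["my-model", "pp512", "1234.56", "412.3", "410.5", "415.0"],
    ["my-model", "tg128", "58.42", "17.1", "0.00", "17.1"],
]
print(tabulate(
    rows,
    headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"],
    tablefmt="pipe",
    colalign=("left", "right", "right", "right", "right", "right"),
))
```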
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.0'
- __version_tuple__ = version_tuple = (0, 1, 0)
+ __version__ = version = '0.1.1'
+ __version_tuple__ = version_tuple = (0, 1, 1)

  __commit_id__ = commit_id = None
@@ -1,24 +0,0 @@
- """
- llama-benchy - llama-bench style benchmarking tool for all backends
-
- This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
- generating statistics similar to `llama-bench`.
- """
-
- from ._version import __version__
-
- # Extract build number from the version string
- # Version format is like: '0.1.dev34+g33f03d886.d20260105'
- # We want to extract the git hash part: '33f03d886'
- __build__ = "unknown"
- if "+" in __version__:
-     try:
-         # Extract the part after the '+' and before the '.'
-         build_part = __version__.split("+")[1].split(".")[0]
-         # Remove the 'g' prefix if it exists
-         if build_part.startswith("g"):
-             __build__ = build_part[1:]
-         else:
-             __build__ = build_part
-     except (IndexError, AttributeError):
-         pass