llama-benchy 0.1.0-py3-none-any.whl → 0.1.1-py3-none-any.whl

llama_benchy/__init__.py CHANGED
@@ -6,19 +6,3 @@ generating statistics similar to `llama-bench`.
 """
 
 from ._version import __version__
-
-# Extract build number from the version string
-# Version format is like: '0.1.dev34+g33f03d886.d20260105'
-# We want to extract the git hash part: '33f03d886'
-__build__ = "unknown"
-if "+" in __version__:
-    try:
-        # Extract the part after the '+' and before the '.'
-        build_part = __version__.split("+")[1].split(".")[0]
-        # Remove the 'g' prefix if it exists
-        if build_part.startswith("g"):
-            __build__ = build_part[1:]
-        else:
-            __build__ = build_part
-    except (IndexError, AttributeError):
-        pass
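The release drops the `__build__` attribute and relies on `__version__` alone. For anyone who still wants the git hash out of a setuptools-scm style dev version, here is a minimal sketch of the same extraction using the `packaging` library; the helper name is hypothetical and not part of llama-benchy.

```python
from packaging.version import Version

def git_build_from_version(version_str: str) -> str:
    """Extract the git hash from a setuptools-scm local version,
    e.g. '0.1.dev34+g33f03d886.d20260105' -> '33f03d886'."""
    local = Version(version_str).local  # 'g33f03d886.d20260105' or None
    if not local:
        return "unknown"
    first = local.split(".")[0]
    return first[1:] if first.startswith("g") else first

print(git_build_from_version("0.1.dev34+g33f03d886.d20260105"))  # 33f03d886
print(git_build_from_version("0.1.1"))                            # unknown
```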
llama_benchy/__main__.py CHANGED
@@ -19,12 +19,13 @@ from transformers import AutoTokenizer
 import requests
 
 # Build number is now imported from __init__.py
-from . import __build__, __version__
+from . import __version__
 
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
     parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
     parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
     parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
@@ -369,11 +370,10 @@ async def main_async():
         print("Error: --enable-prefix-caching and --no-cache are incompatible.")
         return
 
-    build_number = __build__
     version_number = __version__
 
     current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f"llama-benchy ({version_number}.{build_number})")
+    print(f"llama-benchy ({version_number})")
     print(f"Date: {current_time}")
     print(f"Benchmarking model: {args.model} at {args.base_url}")
 
@@ -522,7 +522,7 @@ async def main_async():
         print("No results collected. Check if the model is generating tokens.")
     else:
         print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-        print(f"\nllama-benchy ({version_number}.{build_number})")
+        print(f"\nllama-benchy ({version_number})")
     print(f"date: {current_time} | latency mode: {args.latency_mode}")
 
 
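The new `--version` flag uses argparse's built-in `version` action, which prints the formatted string and exits before required arguments are enforced. A minimal standalone sketch of the pattern; the program name and version string below are stand-ins, not imports from the package:

```python
import argparse

__version__ = "0.1.1"  # stand-in for `from . import __version__`

parser = argparse.ArgumentParser(prog="llama-benchy", description="LLM Benchmark Script")
# The "version" action prints the string and exits with status 0,
# so `--version` works even though --base-url and --model are required.
parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")

parser.parse_args(["--version"])  # prints "llama-benchy 0.1.1" and exits
```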
llama_benchy/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.0'
-__version_tuple__ = version_tuple = (0, 1, 0)
+__version__ = version = '0.1.1'
+__version_tuple__ = version_tuple = (0, 1, 1)
 
 __commit_id__ = commit_id = None
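`_version.py` appears to be a generated version file (setuptools-scm / hatch-vcs style) and is the single place the version string changes between releases. A minimal sketch of reading the installed version without importing the package, assuming the distribution name `llama-benchy`:

```python
from importlib.metadata import PackageNotFoundError, version

# Query the installed distribution's metadata; fall back when the package
# isn't installed (e.g. running from an unbuilt source checkout).
try:
    dist_version = version("llama-benchy")
except PackageNotFoundError:
    dist_version = "unknown"

print(dist_version)  # e.g. '0.1.1'
# Equivalent attribute exposed by the package itself:
# from llama_benchy import __version__
```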
{llama_benchy-0.1.0.dist-info → llama_benchy-0.1.1.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.0
+Version: 0.1.1
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Project-URL: Homepage, https://github.com/eugr/llama-benchy
 Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
@@ -76,12 +76,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.
 
+## Current Limitations
+
+- Evaluates against the `/v1/chat/completions` endpoint only.
+- Doesn't measure throughput in concurrency mode (coming later).
+- Outputs results as a Markdown table only for now.
+
 ## Installation
 
-Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
+Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/
 
 ### Option 1: Run without installation using `uvx`
 
+Run the release version from PyPI:
+
+```bash
+uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+```
+
+Run the latest version from the main branch:
+
 ```bash
 uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 ```
@@ -126,6 +140,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 
 ### Option 3: Install into system path
 
+Release version from PyPI:
+
+```bash
+uv pip install -U llama-benchy
+```
+
+Current version from the main branch:
+
 ```bash
 uv pip install git+https://github.com/eugr/llama-benchy --system
 ```
@@ -233,7 +255,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
    - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.
 
-By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
+In this case, the `pp` and `tg` speeds show the actual prompt processing and token generation speeds for a follow-up prompt whose context is already pre-filled.
 
 ### Example
 
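To make the prefix-caching comparison concrete, here is a rough, self-contained sketch of the two-step pattern described above: the same system message is sent twice to an OpenAI-compatible `/v1/chat/completions` endpoint, and the second request should benefit from the cached prefix. The base URL, model name, and whole-request timing are placeholders for illustration, not llama-benchy's implementation (which reports ttfr, est_ppt, and e2e_ttft separately):

```python
import time
import requests

BASE_URL = "http://localhost:8000/v1"          # placeholder endpoint
MODEL = "my-model"                             # placeholder model name
CONTEXT = "lorem ipsum dolor sit amet " * 500  # long shared prefix used as the system message

def timed_completion(prompt: str) -> float:
    """Send one chat completion with the shared context and return latency in seconds."""
    start = time.perf_counter()
    resp = requests.post(
        f"{BASE_URL}/chat/completions",
        headers={"Authorization": "Bearer EMPTY"},
        json={
            "model": MODEL,
            "messages": [
                {"role": "system", "content": CONTEXT},  # identical prefix both times
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 32,
        },
        timeout=300,
    )
    resp.raise_for_status()
    return time.perf_counter() - start

cold = timed_completion("Warm up.")                 # context load: prefix processed from scratch
warm = timed_completion("Answer the real prompt.")  # inference: prefix should be served from cache
print(f"cold: {cold:.2f}s, warm: {warm:.2f}s")      # warm should be noticeably faster with prefix caching
```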
llama_benchy-0.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
+llama_benchy/__main__.py,sha256=RZalKXmtAAKiCBenE1maVeyvly5fsGQanS5v3YLeDLs,24371
+llama_benchy/_version.py,sha256=m8HxkqoKGw_wAJtc4ZokpJKNLXqp4zwnNhbnfDtro7w,704
+llama_benchy-0.1.1.dist-info/METADATA,sha256=O6DTAZAJta_puufDXqbeFhhlTT-WaeBVoJSfDLOREDo,13439
+llama_benchy-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llama_benchy-0.1.1.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+llama_benchy-0.1.1.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+llama_benchy-0.1.1.dist-info/RECORD,,
llama_benchy-0.1.0.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-llama_benchy/__init__.py,sha256=4DTHEmeJShcJJdb6rPhFqv4_zREEQRZGZ3h0ThA6YpU,798
-llama_benchy/__main__.py,sha256=CZEf_36w5iut5RZjAZ4F894PAHF3hoxDWkT6lTFcr_I,24351
-llama_benchy/_version.py,sha256=5jwwVncvCiTnhOedfkzzxmxsggwmTBORdFL_4wq0ZeY,704
-llama_benchy-0.1.0.dist-info/METADATA,sha256=WEL0ASCMSmJA8QPXJzAYA0fAztA_6-D7xGjNdx3o4vY,12943
-llama_benchy-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llama_benchy-0.1.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
-llama_benchy-0.1.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
-llama_benchy-0.1.0.dist-info/RECORD,,
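For reference, each RECORD entry has the form `path,sha256=<digest>,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with the `=` padding stripped (PEP 376 / PEP 427). A small sketch for verifying an entry against an unpacked wheel; the path and expected digest are taken from the 0.1.1 RECORD above:

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """RECORD-style digest: urlsafe base64 of the SHA-256 hash, '=' padding stripped."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Example check against a file extracted from the 0.1.1 wheel:
expected = "m8HxkqoKGw_wAJtc4ZokpJKNLXqp4zwnNhbnfDtro7w"
assert record_digest("llama_benchy/_version.py") == expected
```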