llama-benchy 0.1.0__tar.gz → 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: llama-benchy
- Version: 0.1.0
+ Version: 0.1.2
  Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
  Author: eugr
  License: MIT License
@@ -74,12 +74,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
  - Supports executing a command after each run (e.g., to clear cache).
  - Configurable latency measurement mode.

+ # Current Limitations
+
+ - Evaluates against `/v1/chat/completions` endpoint only.
+ - Doesn't measure throughput in concurrency mode (coming later).
+ - Outputs results as a Markdown table only for now.
+
  ## Installation

- Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
+ Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/

  ### Option 1: Run without installation using `uvx`

+ Run the release version from PyPI:
+
+ ```bash
+ uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+ ```
+
+ Run the latest version from the main branch:
+
  ```bash
  uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
  ```
@@ -124,6 +138,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>

  ### Option 3: Install into system path

+ Release version from PyPI:
+
+ ```bash
+ uv pip install -U llama-benchy
+ ```
+
+ Current version from the main branch:
+
  ```bash
  uv pip install git+https://github.com/eugr/llama-benchy --system
  ```
@@ -231,7 +253,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
  2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
  - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.

- By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
+ In this case, the `pp` and `tg` speeds show the actual prompt processing and token generation speeds for a follow-up prompt with the context pre-filled.

  ### Example

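Read as a flow, the two steps above boil down to two chat-completion calls that share the same prefix. The sketch below illustrates that flow against a generic OpenAI-compatible server; it is not llama-benchy's own code, and `BASE_URL`, `MODEL`, `CONTEXT`, and `PROMPT` are placeholders.

```python
# Minimal sketch of the two-step prefix-caching flow described above,
# not the tool's implementation.
import requests

BASE_URL = "http://localhost:8000/v1"  # assumed OpenAI-compatible endpoint
MODEL = "my-model"                     # placeholder model name
CONTEXT = "<long shared prefix>"       # the cached context (system message)
PROMPT = "<actual benchmark prompt>"   # the follow-up prompt (user message)

def chat(messages, max_tokens):
    resp = requests.post(
        f"{BASE_URL}/chat/completions",
        json={"model": MODEL, "messages": messages, "max_tokens": max_tokens},
    )
    resp.raise_for_status()
    return resp.json()

# Step 1 - Context Load: send only the context so the server fills its prefix cache.
chat([{"role": "system", "content": CONTEXT}], max_tokens=1)

# Step 2 - Inference: resend the same context plus the prompt; the prefix should be
# reused, and pp/tg are reported for this call.
chat(
    [{"role": "system", "content": CONTEXT}, {"role": "user", "content": PROMPT}],
    max_tokens=128,
)
```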
@@ -29,12 +29,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
  - Supports executing a command after each run (e.g., to clear cache).
  - Configurable latency measurement mode.

+ # Current Limitations
+
+ - Evaluates against `/v1/chat/completions` endpoint only.
+ - Doesn't measure throughput in concurrency mode (coming later).
+ - Outputs results as a Markdown table only for now.
+
  ## Installation

- Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
+ Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/

  ### Option 1: Run without installation using `uvx`

+ Run the release version from PyPI:
+
+ ```bash
+ uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+ ```
+
+ Run the latest version from the main branch:
+
  ```bash
  uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
  ```
@@ -79,6 +93,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>

  ### Option 3: Install into system path

+ Release version from PyPI:
+
+ ```bash
+ uv pip install -U llama-benchy
+ ```
+
+ Current version from the main branch:
+
  ```bash
  uv pip install git+https://github.com/eugr/llama-benchy --system
  ```
@@ -186,7 +208,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
  2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
  - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.

- By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
+ In this case, the `pp` and `tg` speeds show the actual prompt processing and token generation speeds for a follow-up prompt with the context pre-filled.

  ### Example

@@ -0,0 +1,8 @@
+ """
+ llama-benchy - llama-bench style benchmarking tool for all backends
+
+ This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
+ generating statistics similar to `llama-bench`.
+ """
+
+ from ._version import __version__
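With this change the package's public surface is just the version re-export. A quick, hedged way to confirm what it exposes once the package is installed:

```python
# Assumes llama-benchy 0.1.2 is installed; __version__ comes from the generated
# _version module re-exported in __init__.py above.
import llama_benchy

print(llama_benchy.__version__)  # -> "0.1.2"
```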
@@ -19,12 +19,13 @@ from transformers import AutoTokenizer
  import requests

  # Build number is now imported from __init__.py
- from . import __build__, __version__
+ from . import __version__



  def parse_arguments():
  parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+ parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
  parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
  parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
  parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
@@ -240,6 +241,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
  "est_ppt": None,
  "e2e_ttft": None
  }
+
+ # DEBUG: Buffer to store first few lines of raw response
+ debug_lines = []

  try:
  payload = {
@@ -275,12 +279,22 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
  while "\n" in buffer:
  line, buffer = buffer.split("\n", 1)
  line = line.strip()
- if not line or line == 'data: [DONE]':
+ if not line:
+ continue
+
+ # Capture first 5 lines for debugging if needed
+ if len(debug_lines) < 5:
+ debug_lines.append(line)
+
+ if line == 'data: [DONE]' or line == 'data:[DONE]':
  continue

- if line.startswith('data: '):
+ if line.startswith('data:'):
  try:
- chunk = json.loads(line[6:])
+ # Strip 'data:' and potential whitespace
+ json_str = line[5:].strip()
+ chunk = json.loads(json_str)
+
  if 'usage' in chunk:
  prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)

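The hunk above relaxes the `'data: '` prefix check to `'data:'` plus a `strip()`, so chunks parse whether or not the server puts a space after the colon. A self-contained sketch of that tolerant SSE line handling, separate from the tool's streaming loop:

```python
import json

def parse_sse_line(line: str):
    """Return the decoded JSON chunk for a 'data:' line, or None to skip it."""
    line = line.strip()
    if not line or line in ("data: [DONE]", "data:[DONE]"):
        return None
    if line.startswith("data:"):
        # len("data:") == 5; strip() tolerates an optional space after the colon
        return json.loads(line[5:].strip())
    return None

print(parse_sse_line('data: {"choices": []}'))  # with a space -> {'choices': []}
print(parse_sse_line('data:{"choices": []}'))   # without one  -> {'choices': []}
print(parse_sse_line('data: [DONE]'))           # terminator   -> None
```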
@@ -291,8 +305,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
  delta = chunk['choices'][0].get('delta', {})
  content = delta.get('content')
  reasoning_content = delta.get('reasoning_content')
+ reasoning = delta.get('reasoning')

- if content or reasoning_content:
+ if content or reasoning_content or reasoning:
  if token_count == 0:
  first_token_time = chunk_time
  e2e_ttft = first_token_time - start_time
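The added `reasoning` field covers backends that stream reasoning text under that key rather than `reasoning_content`. A small illustration of the delta shapes the updated check now counts as generated output (which key a given server emits is backend-dependent):

```python
# Example streamed delta payloads; any non-empty field below now starts/advances
# the token timing in the loop shown above.
deltas = [
    {"content": "Hello"},                # plain completion text
    {"reasoning_content": "Thinking"},   # reasoning under 'reasoning_content'
    {"reasoning": "Thinking"},           # reasoning under 'reasoning' (newly handled)
]

for delta in deltas:
    counted = bool(
        delta.get("content") or delta.get("reasoning_content") or delta.get("reasoning")
    )
    print(counted)  # True for all three
```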
@@ -306,6 +321,10 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr

  end_time = time.perf_counter()

+ # DEBUG: Print warning if no tokens were collected
+ if token_count == 0:
+ print(f"\n[Warning] Run generated 0 tokens. Raw response sample: {debug_lines}")
+
  if token_count > 0:
  # Calculate decode time (time for subsequent tokens)
  # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
@@ -369,11 +388,10 @@ async def main_async():
  print("Error: --enable-prefix-caching and --no-cache are incompatible.")
  return

- build_number = __build__
  version_number = __version__

  current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
- print(f"llama-benchy ({version_number}.{build_number})")
+ print(f"llama-benchy ({version_number})")
  print(f"Date: {current_time}")
  print(f"Benchmarking model: {args.model} at {args.base_url}")

@@ -522,7 +540,7 @@ async def main_async():
  print("No results collected. Check if the model is generating tokens.")
  else:
  print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
- print(f"\nllama-benchy ({version_number}.{build_number})")
+ print(f"\nllama-benchy ({version_number})")
  print(f"date: {current_time} | latency mode: {args.latency_mode}")


@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
  commit_id: COMMIT_ID
  __commit_id__: COMMIT_ID

- __version__ = version = '0.1.0'
- __version_tuple__ = version_tuple = (0, 1, 0)
+ __version__ = version = '0.1.2'
+ __version_tuple__ = version_tuple = (0, 1, 2)

  __commit_id__ = commit_id = None
@@ -1,24 +0,0 @@
- """
- llama-benchy - llama-bench style benchmarking tool for all backends
-
- This package provides a benchmarking tool for OpenAI-compatible LLM endpoints,
- generating statistics similar to `llama-bench`.
- """
-
- from ._version import __version__
-
- # Extract build number from the version string
- # Version format is like: '0.1.dev34+g33f03d886.d20260105'
- # We want to extract the git hash part: '33f03d886'
- __build__ = "unknown"
- if "+" in __version__:
- try:
- # Extract the part after the '+' and before the '.'
- build_part = __version__.split("+")[1].split(".")[0]
- # Remove the 'g' prefix if it exists
- if build_part.startswith("g"):
- __build__ = build_part[1:]
- else:
- __build__ = build_part
- except (IndexError, AttributeError):
- pass
File without changes
File without changes