llama-benchy 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llama_benchy/__init__.py CHANGED
@@ -6,19 +6,3 @@ generating statistics similar to `llama-bench`.
 """
 
 from ._version import __version__
-
-# Extract build number from the version string
-# Version format is like: '0.1.dev34+g33f03d886.d20260105'
-# We want to extract the git hash part: '33f03d886'
-__build__ = "unknown"
-if "+" in __version__:
-    try:
-        # Extract the part after the '+' and before the '.'
-        build_part = __version__.split("+")[1].split(".")[0]
-        # Remove the 'g' prefix if it exists
-        if build_part.startswith("g"):
-            __build__ = build_part[1:]
-        else:
-            __build__ = build_part
-    except (IndexError, AttributeError):
-        pass
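For readers unfamiliar with the removed logic: it parsed the local-version segment that setuptools-scm appends to development builds (e.g. `0.1.dev34+g33f03d886.d20260105`). A standalone sketch of the same idea, with a hypothetical helper name that is not part of llama-benchy:

```python
# Hedged sketch of the removed build-hash extraction; `extract_build_hash`
# is a hypothetical name, not an llama-benchy API.
def extract_build_hash(version: str) -> str:
    """Return the git hash from a setuptools-scm local version, or 'unknown'."""
    if "+" not in version:
        return "unknown"
    try:
        local_part = version.split("+")[1].split(".")[0]  # e.g. 'g33f03d886'
        return local_part[1:] if local_part.startswith("g") else local_part
    except (IndexError, AttributeError):
        return "unknown"

print(extract_build_hash("0.1.dev34+g33f03d886.d20260105"))  # 33f03d886
print(extract_build_hash("0.1.2"))                           # unknown
```

Release builds such as `0.1.2` carry no local segment, which is presumably why `__build__` was dropped in favour of printing only `__version__`.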
llama_benchy/__main__.py CHANGED
@@ -19,12 +19,13 @@ from transformers import AutoTokenizer
 import requests
 
 # Build number is now imported from __init__.py
-from . import __build__, __version__
+from . import __version__
 
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="LLM Benchmark Script")
+    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
     parser.add_argument("--base-url", type=str, required=True, help="OpenAI compatible endpoint URL")
     parser.add_argument("--api-key", type=str, default="EMPTY", help="API Key for the endpoint")
     parser.add_argument("--model", type=str, required=True, help="Model name to use for benchmarking")
@@ -240,6 +241,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
         "est_ppt": None,
         "e2e_ttft": None
     }
+
+    # DEBUG: Buffer to store first few lines of raw response
+    debug_lines = []
 
     try:
         payload = {
@@ -275,12 +279,22 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
             while "\n" in buffer:
                 line, buffer = buffer.split("\n", 1)
                 line = line.strip()
-                if not line or line == 'data: [DONE]':
+                if not line:
+                    continue
+
+                # Capture first 5 lines for debugging if needed
+                if len(debug_lines) < 5:
+                    debug_lines.append(line)
+
+                if line == 'data: [DONE]' or line == 'data:[DONE]':
                     continue
 
-                if line.startswith('data: '):
+                if line.startswith('data:'):
                     try:
-                        chunk = json.loads(line[6:])
+                        # Strip 'data:' and potential whitespace
+                        json_str = line[5:].strip()
+                        chunk = json.loads(json_str)
+
                         if 'usage' in chunk:
                             prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
 
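The hunk above relaxes the SSE parsing so that `data:` lines are accepted with or without a space after the colon, and both `data: [DONE]` and `data:[DONE]` terminate cleanly. A minimal standalone sketch of that parsing (not the tool's actual function):

```python
import json

def parse_sse_data_line(line: str):
    """Decode one SSE 'data:' line into a dict, or return None.

    Accepts 'data:' with or without a trailing space and treats '[DONE]'
    as an end-of-stream marker, mirroring the relaxed handling above.
    """
    line = line.strip()
    if not line.startswith("data:"):
        return None
    payload = line[5:].strip()  # drop the 'data:' prefix and any whitespace
    if payload == "[DONE]":
        return None
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return None

print(parse_sse_data_line('data: {"choices": []}'))  # {'choices': []}
print(parse_sse_data_line('data:{"choices": []}'))   # {'choices': []}
print(parse_sse_data_line("data: [DONE]"))           # None
```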
@@ -291,8 +305,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
                             delta = chunk['choices'][0].get('delta', {})
                             content = delta.get('content')
                             reasoning_content = delta.get('reasoning_content')
+                            reasoning = delta.get('reasoning')
 
-                            if content or reasoning_content:
+                            if content or reasoning_content or reasoning:
                                 if token_count == 0:
                                     first_token_time = chunk_time
                                     e2e_ttft = first_token_time - start_time
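Some OpenAI-compatible servers stream reasoning tokens under `reasoning_content` or `reasoning` instead of `content`; the change above counts any of them as output. A hedged sketch of the same lookup, with a hypothetical helper name:

```python
def delta_text(delta: dict):
    """Return the first populated text field of a streaming delta, if any."""
    return delta.get("content") or delta.get("reasoning_content") or delta.get("reasoning")

print(delta_text({"content": "Hello"}))          # Hello
print(delta_text({"reasoning": "thinking..."}))  # thinking...
print(delta_text({}))                            # None
```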
@@ -306,6 +321,10 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
 
         end_time = time.perf_counter()
 
+        # DEBUG: Print warning if no tokens were collected
+        if token_count == 0:
+            print(f"\n[Warning] Run generated 0 tokens. Raw response sample: {debug_lines}")
+
         if token_count > 0:
             # Calculate decode time (time for subsequent tokens)
             # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
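The comments above refer to a decode-time calculation that this diff does not show in full. As a rough worked sketch (assumed, not copied from llama-benchy): generation speed is typically computed over the tokens after the first one, so a single-token run leaves no inter-token interval to divide by.

```python
# Worked example with made-up timestamps; the real script's formula may differ.
token_count = 128
first_token_time = 10.20  # perf_counter() when the first token arrived (s)
end_time = 11.47          # perf_counter() after the last token (s)

decode_time = end_time - first_token_time        # time spent on tokens 2..N
if token_count > 1 and decode_time > 0:
    tokens_per_second = (token_count - 1) / decode_time
    print(f"{tokens_per_second:.2f} t/s")        # 100.00 t/s
```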
@@ -369,11 +388,10 @@ async def main_async():
         print("Error: --enable-prefix-caching and --no-cache are incompatible.")
         return
 
-    build_number = __build__
     version_number = __version__
 
     current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    print(f"llama-benchy ({version_number}.{build_number})")
+    print(f"llama-benchy ({version_number})")
     print(f"Date: {current_time}")
     print(f"Benchmarking model: {args.model} at {args.base_url}")
 
@@ -522,7 +540,7 @@ async def main_async():
         print("No results collected. Check if the model is generating tokens.")
     else:
         print(tabulate(results, headers=["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"], tablefmt="pipe", colalign=("left", "right", "right", "right", "right", "right")))
-    print(f"\nllama-benchy ({version_number}.{build_number})")
+    print(f"\nllama-benchy ({version_number})")
     print(f"date: {current_time} | latency mode: {args.latency_mode}")
 
 
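The `tablefmt="pipe"` call in the context above is what produces the Markdown table mentioned in the README. A toy reproduction using only the `tabulate` package, with purely illustrative numbers:

```python
from tabulate import tabulate

# Toy row in the same column order as above; the numbers are illustrative only.
results = [["my-model", "pp512", "1543.21", "182.40", "0.33", "195.10"]]
headers = ["model", "test", "t/s", "ttfr (ms)", "est_ppt (ms)", "e2e_ttft (ms)"]
print(tabulate(results, headers=headers, tablefmt="pipe",
               colalign=("left", "right", "right", "right", "right", "right")))
```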
llama_benchy/_version.py CHANGED
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.0'
-__version_tuple__ = version_tuple = (0, 1, 0)
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
 
 __commit_id__ = commit_id = None
llama_benchy-0.1.0.dist-info/METADATA → llama_benchy-0.1.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.0
+Version: 0.1.2
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Project-URL: Homepage, https://github.com/eugr/llama-benchy
 Project-URL: Bug Tracker, https://github.com/eugr/llama-benchy/issues
@@ -76,12 +76,26 @@ As of January 2nd, 2026, I wasn't able to find any existing benchmarking tool th
 - Supports executing a command after each run (e.g., to clear cache).
 - Configurable latency measurement mode.
 
+# Current Limitations
+
+- Evaluates against `/v1/chat/completions` endpoint only.
+- Doesn't measure throughput in concurrency mode (coming later).
+- Outputs results as a Markdown table only for now.
+
 ## Installation
 
-Install `uv` first: https://docs.astral.sh/uv/getting-started/installation/
+Using `uv` is recommended. You can install `uv` here: https://docs.astral.sh/uv/getting-started/installation/
 
 ### Option 1: Run without installation using `uvx`
 
+Run the release version from PyPI:
+
+```bash
+uvx llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
+```
+
+Run the latest version from the main branch:
+
 ```bash
 uvx --from git+https://github.com/eugr/llama-benchy llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 ```
@@ -126,6 +140,14 @@ uv run llama-benchy --base-url <ENDPOINT_URL> --model <MODEL_NAME>
 
 ### Option 3: Install into system path
 
+Release version from PyPI:
+
+```bash
+uv pip install -U llama-benchy
+```
+
+Current version from the main branch:
+
 ```bash
 uv pip install git+https://github.com/eugr/llama-benchy --system
 ```
@@ -233,7 +255,7 @@ When `--enable-prefix-caching` is used (with `--depth` > 0), the script performs
 2. **Inference**: Sends the same context (system message) followed by the actual prompt (user message). The server should reuse the cached context.
    - Reported as standard `pp{tokens} @ d{depth}` and `tg{tokens} @ d{depth}`.
 
-By comparing the `pp` speed of the Inference step with a non-cached run (or the Context Load step), you can observe the speedup provided by prefix caching.
+In this case, the `pp` and `tg` speeds show the actual prompt-processing and token-generation speeds for a follow-up prompt with the context pre-filled.
 
 ### Example
 
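To make the two-phase flow described above concrete, here is an illustrative sketch of what the request payloads could look like against a standard `/v1/chat/completions` endpoint. The exact payloads llama-benchy sends are not visible in this diff; the model name, placeholder strings, and `max_tokens` choices below are assumptions.

```python
context_text = "<long shared context>"   # placeholder
prompt_text = "<benchmark prompt>"       # placeholder

# Phase 1 - context load: ship the shared context so the server can cache its prefix.
context_load_payload = {
    "model": "my-model",
    "messages": [{"role": "system", "content": context_text}],
    "max_tokens": 1,
}

# Phase 2 - inference: the same system message plus the real prompt. A server with
# prefix caching reuses the cached prefix, so pp mostly reflects the new prompt tokens.
inference_payload = {
    "model": "my-model",
    "messages": [
        {"role": "system", "content": context_text},
        {"role": "user", "content": prompt_text},
    ],
    "max_tokens": 256,
}
```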
llama_benchy-0.1.2.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+llama_benchy/__init__.py,sha256=D2TacJCNiAvfxHovv86Cm1kkFfmwgj_Z6QPoWdjJFhs,239
+llama_benchy/__main__.py,sha256=ArgfdkzjgVv-tdoRW0WXxKEGfdbFDzmH6h3w3lay5zI,25120
+llama_benchy/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
+llama_benchy-0.1.2.dist-info/METADATA,sha256=oiJHBXHW_74XnVoKPvALBVP5-sXibFPDtELiCcdQaFw,13439
+llama_benchy-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+llama_benchy-0.1.2.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
+llama_benchy-0.1.2.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
+llama_benchy-0.1.2.dist-info/RECORD,,
llama_benchy-0.1.0.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-llama_benchy/__init__.py,sha256=4DTHEmeJShcJJdb6rPhFqv4_zREEQRZGZ3h0ThA6YpU,798
-llama_benchy/__main__.py,sha256=CZEf_36w5iut5RZjAZ4F894PAHF3hoxDWkT6lTFcr_I,24351
-llama_benchy/_version.py,sha256=5jwwVncvCiTnhOedfkzzxmxsggwmTBORdFL_4wq0ZeY,704
-llama_benchy-0.1.0.dist-info/METADATA,sha256=WEL0ASCMSmJA8QPXJzAYA0fAztA_6-D7xGjNdx3o4vY,12943
-llama_benchy-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-llama_benchy-0.1.0.dist-info/entry_points.txt,sha256=ZWci87MxOyQtH4tBsuxiLxxnZW7Z-pGiUtmObnXeOv0,60
-llama_benchy-0.1.0.dist-info/licenses/LICENSE,sha256=K71ff-hxnl3muDdvJ3-fbbf5uVgv2dNkzJQXj4G20nk,1075
-llama_benchy-0.1.0.dist-info/RECORD,,