llama-benchy 0.1.1.tar.gz → 0.1.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-benchy
-Version: 0.1.1
+Version: 0.1.2
 Summary: llama-bench style benchmarking tool for all OpenAI-compatible LLM endpoints
 Author: eugr
 License: MIT License
@@ -241,6 +241,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
         "est_ppt": None,
         "e2e_ttft": None
     }
+
+    # DEBUG: Buffer to store first few lines of raw response
+    debug_lines = []
 
     try:
         payload = {
@@ -276,12 +279,22 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
                 while "\n" in buffer:
                     line, buffer = buffer.split("\n", 1)
                     line = line.strip()
-                    if not line or line == 'data: [DONE]':
+                    if not line:
+                        continue
+
+                    # Capture first 5 lines for debugging if needed
+                    if len(debug_lines) < 5:
+                        debug_lines.append(line)
+
+                    if line == 'data: [DONE]' or line == 'data:[DONE]':
                         continue
 
-                    if line.startswith('data: '):
+                    if line.startswith('data:'):
                         try:
-                            chunk = json.loads(line[6:])
+                            # Strip 'data:' and potential whitespace
+                            json_str = line[5:].strip()
+                            chunk = json.loads(json_str)
+
                             if 'usage' in chunk:
                                 prompt_usage_tokens = chunk['usage'].get('prompt_tokens', 0)
 
@@ -292,8 +305,9 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
                             delta = chunk['choices'][0].get('delta', {})
                             content = delta.get('content')
                             reasoning_content = delta.get('reasoning_content')
+                            reasoning = delta.get('reasoning')
 
-                            if content or reasoning_content:
+                            if content or reasoning_content or reasoning:
                                 if token_count == 0:
                                     first_token_time = chunk_time
                                     e2e_ttft = first_token_time - start_time
@@ -307,6 +321,10 @@ async def run_benchmark(session, base_url, api_key, model_name, context_text, pr
 
             end_time = time.perf_counter()
 
+            # DEBUG: Print warning if no tokens were collected
+            if token_count == 0:
+                print(f"\n[Warning] Run generated 0 tokens. Raw response sample: {debug_lines}")
+
             if token_count > 0:
                 # Calculate decode time (time for subsequent tokens)
                 # If only 1 token, decode_time is effectively 0, so we can't calculate inter-token speed
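
Note on the streaming hunks above: the new parsing accepts an SSE "data:" prefix with or without a trailing space, recognizes both "data: [DONE]" and "data:[DONE]" as the end-of-stream marker, and counts a delta that only carries a "reasoning" field toward the token total. The sketch below is a minimal, self-contained illustration of that line handling, not llama-benchy's own code; parse_sse_line and delta_has_token are hypothetical names.

import json

def parse_sse_line(line: str):
    """Decode one SSE line into a JSON chunk, or return None for lines to skip."""
    line = line.strip()
    if not line:
        return None
    # End-of-stream marker, with or without a space after "data:"
    if line in ('data: [DONE]', 'data:[DONE]'):
        return None
    if not line.startswith('data:'):
        return None
    try:
        # Strip the "data:" prefix and any surrounding whitespace before decoding
        return json.loads(line[5:].strip())
    except json.JSONDecodeError:
        return None

def delta_has_token(chunk: dict) -> bool:
    """True if the first choice's delta carries content, reasoning_content, or reasoning."""
    choices = chunk.get('choices') or []
    if not choices:
        return False
    delta = choices[0].get('delta', {})
    return bool(delta.get('content') or delta.get('reasoning_content') or delta.get('reasoning'))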
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
 commit_id: COMMIT_ID
 __commit_id__: COMMIT_ID
 
-__version__ = version = '0.1.1'
-__version_tuple__ = version_tuple = (0, 1, 1)
+__version__ = version = '0.1.2'
+__version_tuple__ = version_tuple = (0, 1, 2)
 
 __commit_id__ = commit_id = None