@blockrun/clawrouter 0.12.45 → 0.12.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +80 -54
- package/dist/cli.js.map +1 -1
- package/dist/index.js +80 -54
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/scripts/benchmark.py +100 -178
package/package.json
CHANGED
package/scripts/benchmark.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
BlockRun Model Performance Benchmark
|
|
4
|
-
Measures
|
|
5
|
-
Uses
|
|
3
|
+
BlockRun Model Performance Benchmark v3
|
|
4
|
+
Measures end-to-end latency via non-streaming requests.
|
|
5
|
+
Uses real token counts from API usage response.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import time
|
|
@@ -14,47 +14,27 @@ from openai import OpenAI
|
|
|
14
14
|
|
|
15
15
|
MODELS = [
|
|
16
16
|
# OpenAI
|
|
17
|
-
"openai/gpt-5.4",
|
|
18
|
-
"openai/gpt-5.
|
|
19
|
-
"openai/gpt-5.
|
|
20
|
-
"openai/gpt-5
|
|
21
|
-
"openai/gpt-
|
|
22
|
-
"openai/gpt-
|
|
23
|
-
"openai/
|
|
24
|
-
"openai/
|
|
25
|
-
"openai/gpt-4.1",
|
|
26
|
-
"openai/gpt-4.1-mini",
|
|
27
|
-
"openai/gpt-4.1-nano",
|
|
28
|
-
"openai/gpt-4o",
|
|
29
|
-
"openai/gpt-4o-mini",
|
|
30
|
-
"openai/o3",
|
|
31
|
-
"openai/o3-mini",
|
|
32
|
-
"openai/o4-mini",
|
|
33
|
-
"openai/o1",
|
|
34
|
-
"openai/o1-mini",
|
|
17
|
+
"openai/gpt-5.4", "openai/gpt-5.4-pro",
|
|
18
|
+
"openai/gpt-5.3", "openai/gpt-5.3-codex",
|
|
19
|
+
"openai/gpt-5.2", "openai/gpt-5.2-pro",
|
|
20
|
+
"openai/gpt-5-mini", "openai/gpt-5-nano",
|
|
21
|
+
"openai/gpt-4.1", "openai/gpt-4.1-mini", "openai/gpt-4.1-nano",
|
|
22
|
+
"openai/gpt-4o", "openai/gpt-4o-mini",
|
|
23
|
+
"openai/o3", "openai/o3-mini", "openai/o4-mini",
|
|
24
|
+
"openai/o1", "openai/o1-mini",
|
|
35
25
|
# Anthropic
|
|
36
|
-
"anthropic/claude-sonnet-4.6",
|
|
37
|
-
"anthropic/claude-opus-4.6",
|
|
38
|
-
"anthropic/claude-haiku-4.5",
|
|
26
|
+
"anthropic/claude-sonnet-4.6", "anthropic/claude-opus-4.6", "anthropic/claude-haiku-4.5",
|
|
39
27
|
# Google
|
|
40
|
-
"google/gemini-3.1-pro",
|
|
41
|
-
"google/gemini-
|
|
42
|
-
"google/gemini-3-flash-preview",
|
|
43
|
-
"google/gemini-2.5-pro",
|
|
44
|
-
"google/gemini-2.5-flash",
|
|
45
|
-
"google/gemini-2.5-flash-lite",
|
|
28
|
+
"google/gemini-3.1-pro", "google/gemini-3-pro-preview", "google/gemini-3-flash-preview",
|
|
29
|
+
"google/gemini-2.5-pro", "google/gemini-2.5-flash", "google/gemini-2.5-flash-lite",
|
|
46
30
|
# DeepSeek
|
|
47
|
-
"deepseek/deepseek-chat",
|
|
48
|
-
"deepseek/deepseek-reasoner",
|
|
31
|
+
"deepseek/deepseek-chat", "deepseek/deepseek-reasoner",
|
|
49
32
|
# Moonshot
|
|
50
33
|
"moonshot/kimi-k2.5",
|
|
51
34
|
# xAI
|
|
52
|
-
"xai/grok-3",
|
|
53
|
-
"xai/grok-
|
|
54
|
-
"xai/grok-4-fast-reasoning",
|
|
55
|
-
"xai/grok-4-fast-non-reasoning",
|
|
56
|
-
"xai/grok-4-1-fast-reasoning",
|
|
57
|
-
"xai/grok-4-1-fast-non-reasoning",
|
|
35
|
+
"xai/grok-3", "xai/grok-3-mini",
|
|
36
|
+
"xai/grok-4-fast-reasoning", "xai/grok-4-fast-non-reasoning",
|
|
37
|
+
"xai/grok-4-1-fast-reasoning", "xai/grok-4-1-fast-non-reasoning",
|
|
58
38
|
"xai/grok-4-0709",
|
|
59
39
|
# MiniMax
|
|
60
40
|
"minimax/minimax-m2.5",
|
|
@@ -62,236 +42,178 @@ MODELS = [
|
|
|
62
42
|
"nvidia/gpt-oss-120b",
|
|
63
43
|
]
|
|
64
44
|
|
|
65
|
-
# Varied prompts to bypass dedup cache — each run uses a different prompt
|
|
66
45
|
PROMPTS = [
|
|
67
46
|
"Write a Python function that checks if a string is a valid IPv4 address. Include edge cases and a docstring.",
|
|
68
47
|
"Write a Python function that finds the longest common subsequence of two strings. Include type hints and examples.",
|
|
69
48
|
"Write a Python function that implements a simple LRU cache using OrderedDict. Include usage examples.",
|
|
70
|
-
"Write a Python function that converts a Roman numeral string to an integer. Handle all standard cases.",
|
|
71
|
-
"Write a Python function that flattens a nested list of arbitrary depth. Include type hints.",
|
|
72
49
|
]
|
|
73
50
|
|
|
74
51
|
NUM_REQUESTS = 2
|
|
75
52
|
MAX_TOKENS = 256
|
|
76
53
|
|
|
77
54
|
|
|
78
|
-
def estimate_tokens(text: str) -> int:
|
|
79
|
-
"""Rough token estimate: ~4 chars per token for English/code."""
|
|
80
|
-
return max(1, len(text) // 4)
|
|
81
|
-
|
|
82
|
-
|
|
83
55
|
def benchmark_model(client: OpenAI, model: str) -> list:
|
|
84
|
-
"""Benchmark a single model with streaming."""
|
|
85
56
|
results = []
|
|
86
|
-
|
|
87
57
|
for i in range(NUM_REQUESTS):
|
|
88
58
|
prompt = PROMPTS[i % len(PROMPTS)]
|
|
89
|
-
|
|
90
59
|
try:
|
|
91
60
|
start = time.perf_counter()
|
|
92
|
-
|
|
93
|
-
full_response = ""
|
|
94
|
-
first_chunk_time = None
|
|
95
|
-
last_chunk_time = None
|
|
96
|
-
chunk_count = 0
|
|
97
|
-
|
|
98
|
-
stream = client.chat.completions.create(
|
|
61
|
+
resp = client.chat.completions.create(
|
|
99
62
|
model=model,
|
|
100
63
|
messages=[{"role": "user", "content": prompt}],
|
|
101
64
|
max_tokens=MAX_TOKENS,
|
|
102
|
-
stream=
|
|
65
|
+
stream=False,
|
|
103
66
|
temperature=0.7,
|
|
104
67
|
)
|
|
68
|
+
latency = (time.perf_counter() - start) * 1000
|
|
69
|
+
|
|
70
|
+
content = resp.choices[0].message.content or ""
|
|
71
|
+
finish = resp.choices[0].finish_reason
|
|
72
|
+
usage = resp.usage
|
|
73
|
+
input_tokens = usage.prompt_tokens if usage else 0
|
|
74
|
+
output_tokens = usage.completion_tokens if usage else 0
|
|
75
|
+
total_tokens = usage.total_tokens if usage else 0
|
|
105
76
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
if chunk.choices and chunk.choices[0].delta.content:
|
|
109
|
-
content = chunk.choices[0].delta.content
|
|
110
|
-
if ttft is None:
|
|
111
|
-
ttft = (now - start) * 1000 # ms
|
|
112
|
-
first_chunk_time = now
|
|
113
|
-
full_response += content
|
|
114
|
-
last_chunk_time = now
|
|
115
|
-
chunk_count += 1
|
|
116
|
-
|
|
117
|
-
end = time.perf_counter()
|
|
118
|
-
total_time = (end - start) * 1000 # ms
|
|
119
|
-
|
|
120
|
-
# Estimate output tokens
|
|
121
|
-
output_tokens = estimate_tokens(full_response)
|
|
122
|
-
|
|
123
|
-
# Tokens per second (generation phase: from first token to last token)
|
|
124
|
-
if first_chunk_time and last_chunk_time and last_chunk_time > first_chunk_time:
|
|
125
|
-
gen_time_s = last_chunk_time - first_chunk_time
|
|
126
|
-
tps = output_tokens / gen_time_s if gen_time_s > 0.001 else 0
|
|
127
|
-
elif output_tokens > 0 and ttft is not None:
|
|
128
|
-
# Single chunk — use total time minus TTFT
|
|
129
|
-
gen_time_s = (end - start) - (ttft / 1000)
|
|
130
|
-
tps = output_tokens / gen_time_s if gen_time_s > 0.001 else 0
|
|
131
|
-
else:
|
|
132
|
-
tps = 0
|
|
133
|
-
|
|
134
|
-
# Inter-token latency (approximate from chunks)
|
|
135
|
-
if first_chunk_time and last_chunk_time and chunk_count > 1:
|
|
136
|
-
avg_itl = ((last_chunk_time - first_chunk_time) / (chunk_count - 1)) * 1000
|
|
137
|
-
else:
|
|
138
|
-
avg_itl = None
|
|
77
|
+
# Tokens per second (output tokens / latency)
|
|
78
|
+
tps = (output_tokens / (latency / 1000)) if latency > 0 and output_tokens > 0 else 0
|
|
139
79
|
|
|
140
80
|
results.append({
|
|
141
81
|
"request": i + 1,
|
|
142
|
-
"
|
|
143
|
-
"
|
|
82
|
+
"latency_ms": round(latency, 0),
|
|
83
|
+
"input_tokens": input_tokens,
|
|
144
84
|
"output_tokens": output_tokens,
|
|
145
|
-
"
|
|
146
|
-
"chunks": chunk_count,
|
|
85
|
+
"total_tokens": total_tokens,
|
|
147
86
|
"tokens_per_sec": round(tps, 1),
|
|
148
|
-
"
|
|
87
|
+
"output_chars": len(content),
|
|
88
|
+
"finish_reason": finish,
|
|
149
89
|
"status": "success",
|
|
150
90
|
})
|
|
151
|
-
|
|
152
|
-
ttft_s = f"{ttft:.0f}" if ttft is not None else "N/A"
|
|
153
|
-
print(f" Run {i+1}/{NUM_REQUESTS}: TTFT={ttft_s}ms, {tps:.0f} tok/s, ~{output_tokens} tokens ({len(full_response)} chars), {total_time:.0f}ms")
|
|
91
|
+
print(f" Run {i+1}: {latency:.0f}ms, {output_tokens} out tokens, {tps:.0f} tok/s, finish={finish}")
|
|
154
92
|
|
|
155
93
|
except Exception as e:
|
|
156
|
-
error_msg = str(e)[:
|
|
157
|
-
results.append({
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
"
|
|
161
|
-
})
|
|
162
|
-
print(f" Run {i+1}/{NUM_REQUESTS}: ERROR - {error_msg[:150]}")
|
|
163
|
-
# If rate limited or out of funds, wait longer
|
|
164
|
-
if "429" in error_msg or "rate" in error_msg.lower() or "402" in error_msg:
|
|
165
|
-
print(" >> Rate limited or payment issue, waiting 30s...")
|
|
94
|
+
error_msg = str(e)[:200]
|
|
95
|
+
results.append({"request": i + 1, "status": "error", "error": error_msg})
|
|
96
|
+
print(f" Run {i+1}: ERROR - {error_msg[:120]}")
|
|
97
|
+
if "429" in error_msg or "rate" in error_msg.lower():
|
|
98
|
+
print(" >> Rate limited, waiting 30s...")
|
|
166
99
|
time.sleep(30)
|
|
167
100
|
|
|
168
101
|
time.sleep(3)
|
|
169
|
-
|
|
170
102
|
return results
|
|
171
103
|
|
|
172
104
|
|
|
173
105
|
def aggregate(results: list) -> dict:
|
|
174
|
-
"""Aggregate results across runs."""
|
|
175
106
|
successes = [r for r in results if r["status"] == "success"]
|
|
176
107
|
if not successes:
|
|
177
108
|
return {"error_rate": 1.0, "runs": len(results)}
|
|
178
109
|
|
|
179
|
-
|
|
180
|
-
itl_vals = [r["avg_itl_ms"] for r in successes if r.get("avg_itl_ms") is not None]
|
|
110
|
+
latencies = [r["latency_ms"] for r in successes]
|
|
181
111
|
tps_vals = [r["tokens_per_sec"] for r in successes if r["tokens_per_sec"] > 0]
|
|
112
|
+
out_tokens = [r["output_tokens"] for r in successes]
|
|
182
113
|
|
|
183
|
-
|
|
114
|
+
return {
|
|
184
115
|
"runs": len(results),
|
|
185
116
|
"successes": len(successes),
|
|
186
117
|
"error_rate": round(1 - len(successes) / len(results), 2),
|
|
187
|
-
"
|
|
188
|
-
"
|
|
118
|
+
"avg_latency_ms": round(sum(latencies) / len(latencies), 0),
|
|
119
|
+
"min_latency_ms": round(min(latencies), 0),
|
|
120
|
+
"max_latency_ms": round(max(latencies), 0),
|
|
121
|
+
"avg_tokens_per_sec": round(sum(tps_vals) / len(tps_vals), 1) if tps_vals else 0,
|
|
122
|
+
"avg_output_tokens": round(sum(out_tokens) / len(out_tokens), 0),
|
|
189
123
|
}
|
|
190
|
-
if tps_vals:
|
|
191
|
-
agg["avg_tokens_per_sec"] = round(sum(tps_vals) / len(tps_vals), 1)
|
|
192
|
-
agg["max_tokens_per_sec"] = round(max(tps_vals), 1)
|
|
193
|
-
if ttft_vals:
|
|
194
|
-
agg["avg_ttft_ms"] = round(sum(ttft_vals) / len(ttft_vals), 0)
|
|
195
|
-
agg["p50_ttft_ms"] = round(sorted(ttft_vals)[len(ttft_vals) // 2], 0)
|
|
196
|
-
if itl_vals:
|
|
197
|
-
agg["avg_itl_ms"] = round(sum(itl_vals) / len(itl_vals), 1)
|
|
198
124
|
|
|
199
|
-
|
|
125
|
+
|
|
126
|
+
# Model pricing (USD per 1M tokens) — for cost calculation
|
|
127
|
+
PRICING = {
|
|
128
|
+
"openai/gpt-5.4": (2.5, 15), "openai/gpt-5.4-pro": (2.5, 15),
|
|
129
|
+
"openai/gpt-5.3": (2.5, 10), "openai/gpt-5.3-codex": (2.5, 10),
|
|
130
|
+
"openai/gpt-5.2": (2.5, 10), "openai/gpt-5.2-pro": (2.5, 10),
|
|
131
|
+
"openai/gpt-5-mini": (1.1, 4.4), "openai/gpt-5-nano": (0.5, 2),
|
|
132
|
+
"openai/gpt-4.1": (2, 8), "openai/gpt-4.1-mini": (0.4, 1.6), "openai/gpt-4.1-nano": (0.1, 0.4),
|
|
133
|
+
"openai/gpt-4o": (2.5, 10), "openai/gpt-4o-mini": (0.15, 0.6),
|
|
134
|
+
"openai/o3": (2, 8), "openai/o3-mini": (1.1, 4.4), "openai/o4-mini": (1.1, 4.4),
|
|
135
|
+
"openai/o1": (15, 60), "openai/o1-mini": (1.1, 4.4),
|
|
136
|
+
"anthropic/claude-sonnet-4.6": (3, 15), "anthropic/claude-opus-4.6": (15, 75),
|
|
137
|
+
"anthropic/claude-haiku-4.5": (0.8, 4),
|
|
138
|
+
"google/gemini-3.1-pro": (1.25, 10), "google/gemini-3-pro-preview": (1.25, 10),
|
|
139
|
+
"google/gemini-3-flash-preview": (0.15, 0.6),
|
|
140
|
+
"google/gemini-2.5-pro": (1.25, 10), "google/gemini-2.5-flash": (0.15, 0.6),
|
|
141
|
+
"google/gemini-2.5-flash-lite": (0.1, 0.4),
|
|
142
|
+
"deepseek/deepseek-chat": (0.27, 1.1), "deepseek/deepseek-reasoner": (0.55, 2.19),
|
|
143
|
+
"moonshot/kimi-k2.5": (0.6, 3),
|
|
144
|
+
"xai/grok-3": (3, 15), "xai/grok-3-mini": (0.3, 0.5),
|
|
145
|
+
"xai/grok-4-fast-reasoning": (0.2, 0.5), "xai/grok-4-fast-non-reasoning": (0.2, 0.5),
|
|
146
|
+
"xai/grok-4-1-fast-reasoning": (0.2, 0.5), "xai/grok-4-1-fast-non-reasoning": (0.2, 0.5),
|
|
147
|
+
"xai/grok-4-0709": (0.2, 1.5),
|
|
148
|
+
"minimax/minimax-m2.5": (0.3, 1.1),
|
|
149
|
+
"nvidia/gpt-oss-120b": (0, 0),
|
|
150
|
+
}
|
|
200
151
|
|
|
201
152
|
|
|
202
153
|
def main():
|
|
203
|
-
client = OpenAI(
|
|
204
|
-
api_key="x402",
|
|
205
|
-
base_url="http://localhost:18789/v1",
|
|
206
|
-
)
|
|
154
|
+
client = OpenAI(api_key="x402", base_url="http://localhost:18789/v1")
|
|
207
155
|
|
|
208
|
-
print("
|
|
156
|
+
print("BlockRun Model Performance Benchmark v3")
|
|
157
|
+
print("=" * 60)
|
|
209
158
|
try:
|
|
210
159
|
models = client.models.list()
|
|
211
160
|
print(f"Connected. {len(models.data)} models available.")
|
|
212
161
|
except Exception as e:
|
|
213
|
-
print(f"
|
|
162
|
+
print(f"Connection failed: {e}")
|
|
214
163
|
sys.exit(1)
|
|
215
164
|
|
|
216
|
-
print(f"
|
|
217
|
-
print(f"Prompts varied per request to bypass dedup cache\n")
|
|
165
|
+
print(f"Config: {NUM_REQUESTS} requests/model, {MAX_TOKENS} max tokens, non-streaming\n")
|
|
218
166
|
|
|
219
167
|
all_results = {}
|
|
220
168
|
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
|
221
169
|
|
|
222
|
-
models_to_test = MODELS
|
|
223
|
-
if len(sys.argv) > 1:
|
|
224
|
-
models_to_test = sys.argv[1:]
|
|
225
|
-
|
|
170
|
+
models_to_test = MODELS if len(sys.argv) <= 1 else sys.argv[1:]
|
|
226
171
|
total = len(models_to_test)
|
|
172
|
+
|
|
227
173
|
for idx, model in enumerate(models_to_test, 1):
|
|
228
174
|
print(f"\n[{idx}/{total}] {model}")
|
|
229
|
-
print("-" * 50)
|
|
230
|
-
|
|
231
175
|
results = benchmark_model(client, model)
|
|
232
176
|
agg = aggregate(results)
|
|
233
177
|
all_results[model] = {"raw": results, "summary": agg}
|
|
234
178
|
|
|
235
179
|
if agg.get("successes", 0) > 0:
|
|
236
|
-
|
|
237
|
-
tps_str = f"{agg.get('avg_tokens_per_sec', 0)} tok/s, " if "avg_tokens_per_sec" in agg else ""
|
|
238
|
-
print(f" >> {ttft_str}{tps_str}{agg['avg_total_ms']:.0f}ms avg total")
|
|
180
|
+
print(f" >> {agg['avg_latency_ms']:.0f}ms avg, {agg.get('avg_tokens_per_sec', 0)} tok/s, ~{agg['avg_output_tokens']} tokens")
|
|
239
181
|
else:
|
|
240
182
|
print(f" >> ALL FAILED")
|
|
241
183
|
|
|
242
184
|
# Save incrementally
|
|
243
185
|
output = {
|
|
244
186
|
"benchmark": "BlockRun Model Performance",
|
|
245
|
-
"version": "
|
|
187
|
+
"version": "3.0",
|
|
246
188
|
"timestamp": timestamp,
|
|
247
|
-
"config": {
|
|
248
|
-
"num_requests": NUM_REQUESTS,
|
|
249
|
-
"max_tokens": MAX_TOKENS,
|
|
250
|
-
"prompts": PROMPTS,
|
|
251
|
-
"token_estimation": "~4 chars per token",
|
|
252
|
-
"endpoint": "localhost:18789 (ClawRouter → blockrun.ai x402)",
|
|
253
|
-
},
|
|
189
|
+
"config": {"num_requests": NUM_REQUESTS, "max_tokens": MAX_TOKENS, "mode": "non-streaming"},
|
|
254
190
|
"results": all_results,
|
|
255
191
|
}
|
|
256
|
-
|
|
257
|
-
with open(outfile, "w") as f:
|
|
192
|
+
with open(os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json"), "w") as f:
|
|
258
193
|
json.dump(output, f, indent=2)
|
|
259
194
|
|
|
260
|
-
#
|
|
261
|
-
print("\n" + "=" *
|
|
195
|
+
# === LEADERBOARD ===
|
|
196
|
+
print("\n" + "=" * 100)
|
|
262
197
|
print("BLOCKRUN MODEL PERFORMANCE LEADERBOARD")
|
|
263
|
-
print(f"
|
|
264
|
-
print("=" *
|
|
265
|
-
|
|
266
|
-
ranked = []
|
|
267
|
-
for model, data in all_results.items():
|
|
268
|
-
s = data["summary"]
|
|
269
|
-
if s.get("avg_tokens_per_sec", 0) > 0:
|
|
270
|
-
ranked.append((model, s))
|
|
198
|
+
print(f"Date: {timestamp} | Mode: non-streaming | Max tokens: {MAX_TOKENS}")
|
|
199
|
+
print("=" * 100)
|
|
271
200
|
|
|
272
|
-
ranked
|
|
201
|
+
ranked = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("successes", 0) > 0]
|
|
202
|
+
ranked.sort(key=lambda x: x[1]["avg_latency_ms"])
|
|
273
203
|
|
|
274
|
-
print(f"\n{'#':<4} {'Model':<40} {'
|
|
275
|
-
print("-" *
|
|
204
|
+
print(f"\n{'#':<4} {'Model':<40} {'Latency':<12} {'Tok/s':<9} {'Out Tok':<9} {'$/1M in':<9} {'$/1M out':<9}")
|
|
205
|
+
print("-" * 92)
|
|
276
206
|
for i, (model, s) in enumerate(ranked, 1):
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
print(f"{i:<4} {model:<40} {
|
|
280
|
-
|
|
281
|
-
# Models with issues
|
|
282
|
-
zero_tps = [(m, d["summary"]) for m, d in all_results.items()
|
|
283
|
-
if d["summary"].get("avg_tokens_per_sec", 0) == 0 and d["summary"].get("successes", 0) > 0]
|
|
284
|
-
if zero_tps:
|
|
285
|
-
print(f"\nModels with 0 tok/s (single-chunk or non-streaming response):")
|
|
286
|
-
for model, s in zero_tps:
|
|
287
|
-
ttft = f"{s.get('avg_ttft_ms', 'N/A')}ms"
|
|
288
|
-
print(f" {model}: TTFT={ttft}, ~{s.get('avg_output_tokens', 0)} tokens, {s['avg_total_ms']:.0f}ms total")
|
|
207
|
+
p = PRICING.get(model, (0, 0))
|
|
208
|
+
tps = s.get("avg_tokens_per_sec", 0)
|
|
209
|
+
print(f"{i:<4} {model:<40} {s['avg_latency_ms']:<12.0f} {tps:<9.1f} {s['avg_output_tokens']:<9.0f} ${p[0]:<8} ${p[1]:<8}")
|
|
289
210
|
|
|
211
|
+
# Errors
|
|
290
212
|
errors = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("error_rate", 0) > 0]
|
|
291
213
|
if errors:
|
|
292
|
-
print(f"\
|
|
214
|
+
print(f"\nErrors:")
|
|
293
215
|
for model, s in errors:
|
|
294
|
-
print(f" {model}: {s.get('error_rate', 1
|
|
216
|
+
print(f" {model}: {s.get('error_rate', 1)*100:.0f}% failures")
|
|
295
217
|
|
|
296
218
|
print(f"\nResults saved to benchmark-results.json")
|
|
297
219
|
|