@blockrun/clawrouter 0.12.44 → 0.12.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js +10 -16
- package/dist/cli.js.map +1 -1
- package/dist/index.js +10 -16
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/scripts/benchmark.py +300 -0
package/scripts/benchmark.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
BlockRun Model Performance Benchmark v2
|
|
4
|
+
Measures TTFT, tokens/s, and total latency for all BlockRun models.
|
|
5
|
+
Uses varied prompts to bypass response dedup cache.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from openai import OpenAI
|
|
14
|
+
|
|
15
|
+
# Model IDs to benchmark, grouped by provider; all requests are routed
# through the local ClawRouter proxy.
MODELS = [
    # OpenAI
    "openai/gpt-5.4",
    "openai/gpt-5.4-pro",
    "openai/gpt-5.3",
    "openai/gpt-5.3-codex",
    "openai/gpt-5.2",
    "openai/gpt-5.2-pro",
    "openai/gpt-5-mini",
    "openai/gpt-5-nano",
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1-nano",
    "openai/gpt-4o",
    "openai/gpt-4o-mini",
    "openai/o3",
    "openai/o3-mini",
    "openai/o4-mini",
    "openai/o1",
    "openai/o1-mini",
    # Anthropic
    "anthropic/claude-sonnet-4.6",
    "anthropic/claude-opus-4.6",
    "anthropic/claude-haiku-4.5",
    # Google
    "google/gemini-3.1-pro",
    "google/gemini-3-pro-preview",
    "google/gemini-3-flash-preview",
    "google/gemini-2.5-pro",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-flash-lite",
    # DeepSeek
    "deepseek/deepseek-chat",
    "deepseek/deepseek-reasoner",
    # Moonshot
    "moonshot/kimi-k2.5",
    # xAI
    "xai/grok-3",
    "xai/grok-3-mini",
    "xai/grok-4-fast-reasoning",
    "xai/grok-4-fast-non-reasoning",
    "xai/grok-4-1-fast-reasoning",
    "xai/grok-4-1-fast-non-reasoning",
    "xai/grok-4-0709",
    # MiniMax
    "minimax/minimax-m2.5",
    # NVIDIA
    "nvidia/gpt-oss-120b",
]

# Varied prompts to bypass dedup cache — each run uses a different prompt
# (benchmark_model rotates through them with `i % len(PROMPTS)`).
PROMPTS = [
    "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases and a docstring.",
    "Write a Python function that finds the longest common subsequence of two strings. Include type hints and examples.",
    "Write a Python function that implements a simple LRU cache using OrderedDict. Include usage examples.",
    "Write a Python function that converts a Roman numeral string to an integer. Handle all standard cases.",
    "Write a Python function that flattens a nested list of arbitrary depth. Include type hints.",
]

# Number of benchmark requests issued per model.
NUM_REQUESTS = 2
# Completion-length cap passed as max_tokens on every request.
MAX_TOKENS = 256
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def estimate_tokens(text: str) -> int:
    """Rough token estimate: ~4 chars per token for English/code."""
    # Floor-divide the character count by four, never reporting below one.
    approx = len(text) // 4
    return approx if approx >= 1 else 1
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def benchmark_model(client: OpenAI, model: str) -> list:
    """Benchmark a single model with streaming.

    Issues NUM_REQUESTS streaming chat completions (rotating through
    PROMPTS) and returns one result dict per request: either a "success"
    record with TTFT / tokens-per-second / latency figures, or an "error"
    record carrying the truncated exception text.
    """
    run_records = []

    for run_idx in range(NUM_REQUESTS):
        prompt = PROMPTS[run_idx % len(PROMPTS)]

        try:
            t_start = time.perf_counter()
            ttft = None          # time-to-first-token, in milliseconds
            text = ""            # accumulated completion text
            t_first = None       # perf_counter at first content chunk
            t_last = None        # perf_counter at last content chunk
            n_chunks = 0         # content-bearing chunks seen

            stream = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_TOKENS,
                stream=True,
                temperature=0.7,
            )

            for chunk in stream:
                t_now = time.perf_counter()
                if chunk.choices and chunk.choices[0].delta.content:
                    piece = chunk.choices[0].delta.content
                    if ttft is None:
                        # First content chunk fixes the TTFT measurement.
                        ttft = (t_now - t_start) * 1000  # ms
                        t_first = t_now
                    text += piece
                    t_last = t_now
                    n_chunks += 1

            t_end = time.perf_counter()
            total_time = (t_end - t_start) * 1000  # ms

            # Estimate output tokens (~4 chars/token heuristic).
            output_tokens = estimate_tokens(text)

            # Tokens per second over the generation phase (first -> last token).
            if t_first and t_last and t_last > t_first:
                gen_s = t_last - t_first
                tps = output_tokens / gen_s if gen_s > 0.001 else 0
            elif output_tokens > 0 and ttft is not None:
                # Whole reply arrived in one chunk — fall back to total minus TTFT.
                gen_s = (t_end - t_start) - (ttft / 1000)
                tps = output_tokens / gen_s if gen_s > 0.001 else 0
            else:
                tps = 0

            # Approximate inter-token latency from the spacing of chunks.
            avg_itl = None
            if t_first and t_last and n_chunks > 1:
                avg_itl = ((t_last - t_first) / (n_chunks - 1)) * 1000

            run_records.append({
                "request": run_idx + 1,
                "ttft_ms": round(ttft, 1) if ttft is not None else None,
                "total_ms": round(total_time, 1),
                "output_tokens": output_tokens,
                "output_chars": len(text),
                "chunks": n_chunks,
                "tokens_per_sec": round(tps, 1),
                "avg_itl_ms": round(avg_itl, 1) if avg_itl is not None else None,
                "status": "success",
            })

            ttft_s = f"{ttft:.0f}" if ttft is not None else "N/A"
            print(f" Run {run_idx+1}/{NUM_REQUESTS}: TTFT={ttft_s}ms, {tps:.0f} tok/s, ~{output_tokens} tokens ({len(text)} chars), {total_time:.0f}ms")

        except Exception as e:
            error_msg = str(e)[:300]
            run_records.append({
                "request": run_idx + 1,
                "status": "error",
                "error": error_msg,
            })
            print(f" Run {run_idx+1}/{NUM_REQUESTS}: ERROR - {error_msg[:150]}")
            # If rate limited or out of funds, wait longer before retrying.
            lowered = error_msg.lower()
            if "429" in error_msg or "rate" in lowered or "402" in error_msg:
                print(" >> Rate limited or payment issue, waiting 30s...")
                time.sleep(30)

        # Brief pause between requests to avoid hammering the router.
        time.sleep(3)

    return run_records
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def aggregate(results: list) -> dict:
    """Aggregate per-request benchmark records into one summary dict.

    Records with status "success" feed the averages; TTFT / ITL / tok/s
    stats only appear when at least one run produced a usable value.
    """
    ok = [r for r in results if r["status"] == "success"]
    if not ok:
        # Every run failed — nothing measurable to average.
        return {"error_rate": 1.0, "runs": len(results)}

    n_ok = len(ok)
    ttft_vals = [r["ttft_ms"] for r in ok if r.get("ttft_ms") is not None]
    itl_vals = [r["avg_itl_ms"] for r in ok if r.get("avg_itl_ms") is not None]
    tps_vals = [r["tokens_per_sec"] for r in ok if r["tokens_per_sec"] > 0]

    summary = {
        "runs": len(results),
        "successes": n_ok,
        "error_rate": round(1 - n_ok / len(results), 2),
        "avg_output_tokens": round(sum(r["output_tokens"] for r in ok) / n_ok, 0),
        "avg_total_ms": round(sum(r["total_ms"] for r in ok) / n_ok, 0),
    }

    if tps_vals:
        summary["avg_tokens_per_sec"] = round(sum(tps_vals) / len(tps_vals), 1)
        summary["max_tokens_per_sec"] = round(max(tps_vals), 1)
    if ttft_vals:
        summary["avg_ttft_ms"] = round(sum(ttft_vals) / len(ttft_vals), 0)
        # Upper median: element at index len//2 of the sorted values.
        summary["p50_ttft_ms"] = round(sorted(ttft_vals)[len(ttft_vals) // 2], 0)
    if itl_vals:
        summary["avg_itl_ms"] = round(sum(itl_vals) / len(itl_vals), 1)

    return summary
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def main():
    """Benchmark every model through ClawRouter and print a leaderboard.

    Connects to the local ClawRouter endpoint, runs the streaming
    benchmark per model (CLI args override the MODELS list), saves
    results to benchmark-results.json after each model, then prints a
    tokens/sec-ranked leaderboard plus lists of problematic models.
    """
    # The router handles x402 payment itself, so the API key is a placeholder.
    client = OpenAI(
        api_key="x402",
        base_url="http://localhost:18789/v1",
    )

    print("Testing connection to ClawRouter...")
    try:
        models = client.models.list()
        print(f"Connected. {len(models.data)} models available.")
    except Exception as e:
        # No router reachable — nothing to benchmark.
        print(f"Failed to connect: {e}")
        sys.exit(1)

    print(f"Benchmark: {NUM_REQUESTS} requests per model, {MAX_TOKENS} max tokens")
    print(f"Prompts varied per request to bypass dedup cache\n")

    all_results = {}
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Optional CLI override: benchmark only the model IDs given as arguments.
    models_to_test = MODELS
    if len(sys.argv) > 1:
        models_to_test = sys.argv[1:]

    total = len(models_to_test)
    for idx, model in enumerate(models_to_test, 1):
        print(f"\n[{idx}/{total}] {model}")
        print("-" * 50)

        results = benchmark_model(client, model)
        agg = aggregate(results)
        all_results[model] = {"raw": results, "summary": agg}

        if agg.get("successes", 0) > 0:
            ttft_str = f"TTFT={agg.get('avg_ttft_ms', 'N/A')}ms, " if "avg_ttft_ms" in agg else ""
            tps_str = f"{agg.get('avg_tokens_per_sec', 0)} tok/s, " if "avg_tokens_per_sec" in agg else ""
            print(f" >> {ttft_str}{tps_str}{agg['avg_total_ms']:.0f}ms avg total")
        else:
            print(f" >> ALL FAILED")

        # Save incrementally so partial results survive an interrupted run.
        output = {
            "benchmark": "BlockRun Model Performance",
            "version": "1.0",
            "timestamp": timestamp,
            "config": {
                "num_requests": NUM_REQUESTS,
                "max_tokens": MAX_TOKENS,
                "prompts": PROMPTS,
                "token_estimation": "~4 chars per token",
                "endpoint": "localhost:18789 (ClawRouter → blockrun.ai x402)",
            },
            "results": all_results,
        }
        # NOTE(review): writes one level above this script's directory —
        # presumably the package root; confirm against the package layout.
        outfile = os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json")
        with open(outfile, "w") as f:
            json.dump(output, f, indent=2)

    # Leaderboard
    print("\n" + "=" * 90)
    print("BLOCKRUN MODEL PERFORMANCE LEADERBOARD")
    print(f"Tested: {timestamp}")
    print("=" * 90)

    # Rank only models that produced a measurable generation rate.
    ranked = []
    for model, data in all_results.items():
        s = data["summary"]
        if s.get("avg_tokens_per_sec", 0) > 0:
            ranked.append((model, s))

    ranked.sort(key=lambda x: x[1]["avg_tokens_per_sec"], reverse=True)

    print(f"\n{'#':<4} {'Model':<40} {'TTFT':<9} {'Tok/s':<9} {'ITL':<9} {'Total':<9} {'~Tokens':<8}")
    print("-" * 88)
    for i, (model, s) in enumerate(ranked, 1):
        ttft = f"{s['avg_ttft_ms']:.0f}ms" if "avg_ttft_ms" in s else "N/A"
        itl = f"{s['avg_itl_ms']:.1f}ms" if "avg_itl_ms" in s else "N/A"
        print(f"{i:<4} {model:<40} {ttft:<9} {s['avg_tokens_per_sec']:<9.1f} {itl:<9} {s['avg_total_ms']:<9.0f} {s['avg_output_tokens']:<8.0f}")

    # Models with issues: requests succeeded but tok/s was unmeasurable
    # (reply arrived as a single chunk / non-streaming response).
    zero_tps = [(m, d["summary"]) for m, d in all_results.items()
                if d["summary"].get("avg_tokens_per_sec", 0) == 0 and d["summary"].get("successes", 0) > 0]
    if zero_tps:
        print(f"\nModels with 0 tok/s (single-chunk or non-streaming response):")
        for model, s in zero_tps:
            ttft = f"{s.get('avg_ttft_ms', 'N/A')}ms"
            print(f" {model}: TTFT={ttft}, ~{s.get('avg_output_tokens', 0)} tokens, {s['avg_total_ms']:.0f}ms total")

    # Models where at least one request errored.
    errors = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("error_rate", 0) > 0]
    if errors:
        print(f"\nModels with errors:")
        for model, s in errors:
            print(f" {model}: {s.get('error_rate', 1.0)*100:.0f}% error rate")

    print(f"\nResults saved to benchmark-results.json")
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# Run the benchmark only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|