@blockrun/clawrouter 0.12.44 → 0.12.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/cli.js +86 -66
- package/dist/cli.js.map +1 -1
- package/dist/index.js +86 -66
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
- package/scripts/benchmark.py +222 -0
package/scripts/benchmark.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
BlockRun Model Performance Benchmark v3
|
|
4
|
+
Measures end-to-end latency via non-streaming requests.
|
|
5
|
+
Uses real token counts from API usage response.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
import sys
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from openai import OpenAI
|
|
14
|
+
|
|
15
|
+
# Model IDs to benchmark, grouped by provider. Format: "<provider>/<model-id>",
# matching the routing scheme the local BlockRun proxy expects.
MODELS = [
    # OpenAI
    "openai/gpt-5.4", "openai/gpt-5.4-pro",
    "openai/gpt-5.3", "openai/gpt-5.3-codex",
    "openai/gpt-5.2", "openai/gpt-5.2-pro",
    "openai/gpt-5-mini", "openai/gpt-5-nano",
    "openai/gpt-4.1", "openai/gpt-4.1-mini", "openai/gpt-4.1-nano",
    "openai/gpt-4o", "openai/gpt-4o-mini",
    "openai/o3", "openai/o3-mini", "openai/o4-mini",
    "openai/o1", "openai/o1-mini",
    # Anthropic
    "anthropic/claude-sonnet-4.6", "anthropic/claude-opus-4.6", "anthropic/claude-haiku-4.5",
    # Google
    "google/gemini-3.1-pro", "google/gemini-3-pro-preview", "google/gemini-3-flash-preview",
    "google/gemini-2.5-pro", "google/gemini-2.5-flash", "google/gemini-2.5-flash-lite",
    # DeepSeek
    "deepseek/deepseek-chat", "deepseek/deepseek-reasoner",
    # Moonshot
    "moonshot/kimi-k2.5",
    # xAI
    "xai/grok-3", "xai/grok-3-mini",
    "xai/grok-4-fast-reasoning", "xai/grok-4-fast-non-reasoning",
    "xai/grok-4-1-fast-reasoning", "xai/grok-4-1-fast-non-reasoning",
    "xai/grok-4-0709",
    # MiniMax
    "minimax/minimax-m2.5",
    # NVIDIA
    "nvidia/gpt-oss-120b",
]
|
|
44
|
+
|
|
45
|
+
# Coding prompts of roughly comparable size and difficulty; benchmark_model()
# cycles through them so repeated requests are not served from a prompt cache.
PROMPTS = [
    "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases and a docstring.",
    "Write a Python function that finds the longest common subsequence of two strings. Include type hints and examples.",
    "Write a Python function that implements a simple LRU cache using OrderedDict. Include usage examples.",
]

# Requests issued per model.
NUM_REQUESTS = 2
# Completion-token cap per request (keeps runs fast and costs bounded).
MAX_TOKENS = 256
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def benchmark_model(client: OpenAI, model: str) -> list:
    """Issue NUM_REQUESTS non-streaming completions against *model* and record stats.

    Each successful run yields a dict of latency/token metrics (token counts come
    from the API's usage field); each failure yields a dict with the truncated
    error text. Sleeps 3s after every request to pace the proxy, plus 30s extra
    when the error looks like a rate limit.
    """
    records = []
    for run_idx in range(NUM_REQUESTS):
        question = PROMPTS[run_idx % len(PROMPTS)]
        try:
            t0 = time.perf_counter()
            response = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": question}],
                max_tokens=MAX_TOKENS,
                stream=False,
                temperature=0.7,
            )
            elapsed_ms = (time.perf_counter() - t0) * 1000

            choice = response.choices[0]
            text = choice.message.content or ""
            finish = choice.finish_reason
            usage = response.usage
            if usage:
                in_tok = usage.prompt_tokens
                out_tok = usage.completion_tokens
                tot_tok = usage.total_tokens
            else:
                in_tok = out_tok = tot_tok = 0

            # Throughput = output tokens per wall-clock second; 0 when either
            # factor is unusable.
            if elapsed_ms > 0 and out_tok > 0:
                throughput = out_tok / (elapsed_ms / 1000)
            else:
                throughput = 0

            records.append({
                "request": run_idx + 1,
                "latency_ms": round(elapsed_ms, 0),
                "input_tokens": in_tok,
                "output_tokens": out_tok,
                "total_tokens": tot_tok,
                "tokens_per_sec": round(throughput, 1),
                "output_chars": len(text),
                "finish_reason": finish,
                "status": "success",
            })
            print(f" Run {run_idx+1}: {elapsed_ms:.0f}ms, {out_tok} out tokens, {throughput:.0f} tok/s, finish={finish}")

        except Exception as exc:
            message = str(exc)[:200]
            records.append({"request": run_idx + 1, "status": "error", "error": message})
            print(f" Run {run_idx+1}: ERROR - {message[:120]}")
            # Back off hard on anything that smells like rate limiting.
            if "429" in message or "rate" in message.lower():
                print(" >> Rate limited, waiting 30s...")
                time.sleep(30)

        time.sleep(3)
    return records
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def aggregate(results: list) -> dict:
    """Summarize per-request benchmark records into a single stats dict.

    Args:
        results: dicts produced by benchmark_model(); each has a "status" key
            and, on success, latency/token fields.

    Returns:
        A dict with run counts, error rate, min/avg/max latency, average
        tokens/sec (over runs that reported a positive rate), and average
        output tokens. When every run failed, only "runs", "successes" and
        "error_rate" are present.
    """
    successes = [r for r in results if r["status"] == "success"]
    if not successes:
        # Include "successes": 0 so the all-failed summary carries the same
        # counter keys callers read on the success path.
        return {"error_rate": 1.0, "runs": len(results), "successes": 0}

    latencies = [r["latency_ms"] for r in successes]
    # Only positive rates: a 0 means the run produced no output tokens and
    # would artificially drag the average down.
    tps_vals = [r["tokens_per_sec"] for r in successes if r["tokens_per_sec"] > 0]
    out_tokens = [r["output_tokens"] for r in successes]

    return {
        "runs": len(results),
        "successes": len(successes),
        "error_rate": round(1 - len(successes) / len(results), 2),
        "avg_latency_ms": round(sum(latencies) / len(latencies), 0),
        "min_latency_ms": round(min(latencies), 0),
        "max_latency_ms": round(max(latencies), 0),
        "avg_tokens_per_sec": round(sum(tps_vals) / len(tps_vals), 1) if tps_vals else 0,
        "avg_output_tokens": round(sum(out_tokens) / len(out_tokens), 0),
    }
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Model pricing (USD per 1M tokens) — for cost calculation
# Each value is a (input_price, output_price) pair; models missing from this
# table fall back to (0, 0) in the leaderboard.
# NOTE(review): prices are hard-coded snapshots — verify against current
# provider price lists before relying on the cost columns.
PRICING = {
    "openai/gpt-5.4": (2.5, 15), "openai/gpt-5.4-pro": (2.5, 15),
    "openai/gpt-5.3": (2.5, 10), "openai/gpt-5.3-codex": (2.5, 10),
    "openai/gpt-5.2": (2.5, 10), "openai/gpt-5.2-pro": (2.5, 10),
    "openai/gpt-5-mini": (1.1, 4.4), "openai/gpt-5-nano": (0.5, 2),
    "openai/gpt-4.1": (2, 8), "openai/gpt-4.1-mini": (0.4, 1.6), "openai/gpt-4.1-nano": (0.1, 0.4),
    "openai/gpt-4o": (2.5, 10), "openai/gpt-4o-mini": (0.15, 0.6),
    "openai/o3": (2, 8), "openai/o3-mini": (1.1, 4.4), "openai/o4-mini": (1.1, 4.4),
    "openai/o1": (15, 60), "openai/o1-mini": (1.1, 4.4),
    "anthropic/claude-sonnet-4.6": (3, 15), "anthropic/claude-opus-4.6": (15, 75),
    "anthropic/claude-haiku-4.5": (0.8, 4),
    "google/gemini-3.1-pro": (1.25, 10), "google/gemini-3-pro-preview": (1.25, 10),
    "google/gemini-3-flash-preview": (0.15, 0.6),
    "google/gemini-2.5-pro": (1.25, 10), "google/gemini-2.5-flash": (0.15, 0.6),
    "google/gemini-2.5-flash-lite": (0.1, 0.4),
    "deepseek/deepseek-chat": (0.27, 1.1), "deepseek/deepseek-reasoner": (0.55, 2.19),
    "moonshot/kimi-k2.5": (0.6, 3),
    "xai/grok-3": (3, 15), "xai/grok-3-mini": (0.3, 0.5),
    "xai/grok-4-fast-reasoning": (0.2, 0.5), "xai/grok-4-fast-non-reasoning": (0.2, 0.5),
    "xai/grok-4-1-fast-reasoning": (0.2, 0.5), "xai/grok-4-1-fast-non-reasoning": (0.2, 0.5),
    "xai/grok-4-0709": (0.2, 1.5),
    "minimax/minimax-m2.5": (0.3, 1.1),
    "nvidia/gpt-oss-120b": (0, 0),
}
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def main():
    """Benchmark every configured model through the local BlockRun proxy.

    Usage: benchmark.py [model ...] — with no arguments, all of MODELS is run.
    Results are re-written to benchmark-results.json after each model so an
    interrupted run still leaves usable data; a latency-sorted leaderboard is
    printed at the end. Exits with status 1 if the proxy is unreachable.
    """
    # "x402" is a placeholder API key; authentication is handled by the proxy.
    client = OpenAI(api_key="x402", base_url="http://localhost:18789/v1")

    print("BlockRun Model Performance Benchmark v3")
    print("=" * 60)
    # Connectivity probe before spending time on benchmarks.
    try:
        models = client.models.list()
        print(f"Connected. {len(models.data)} models available.")
    except Exception as e:
        print(f"Connection failed: {e}")
        sys.exit(1)

    print(f"Config: {NUM_REQUESTS} requests/model, {MAX_TOKENS} max tokens, non-streaming\n")

    all_results = {}
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # CLI args, when given, narrow the run to specific model IDs.
    models_to_test = MODELS if len(sys.argv) <= 1 else sys.argv[1:]
    total = len(models_to_test)

    for idx, model in enumerate(models_to_test, 1):
        print(f"\n[{idx}/{total}] {model}")
        results = benchmark_model(client, model)
        agg = aggregate(results)
        all_results[model] = {"raw": results, "summary": agg}

        if agg.get("successes", 0) > 0:
            print(f" >> {agg['avg_latency_ms']:.0f}ms avg, {agg.get('avg_tokens_per_sec', 0)} tok/s, ~{agg['avg_output_tokens']} tokens")
        else:
            print(" >> ALL FAILED")

        # Save incrementally so completed models survive an interruption.
        output = {
            "benchmark": "BlockRun Model Performance",
            "version": "3.0",
            "timestamp": timestamp,
            "config": {"num_requests": NUM_REQUESTS, "max_tokens": MAX_TOKENS, "mode": "non-streaming"},
            "results": all_results,
        }
        # Explicit encoding keeps the JSON file UTF-8 regardless of platform
        # default (model output may contain non-ASCII text).
        with open(os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json"), "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)

    # === LEADERBOARD ===
    print("\n" + "=" * 100)
    print("BLOCKRUN MODEL PERFORMANCE LEADERBOARD")
    print(f"Date: {timestamp} | Mode: non-streaming | Max tokens: {MAX_TOKENS}")
    print("=" * 100)

    # Rank only models with at least one successful run, fastest first.
    ranked = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("successes", 0) > 0]
    ranked.sort(key=lambda x: x[1]["avg_latency_ms"])

    print(f"\n{'#':<4} {'Model':<40} {'Latency':<12} {'Tok/s':<9} {'Out Tok':<9} {'$/1M in':<9} {'$/1M out':<9}")
    print("-" * 92)
    for i, (model, s) in enumerate(ranked, 1):
        p = PRICING.get(model, (0, 0))
        tps = s.get("avg_tokens_per_sec", 0)
        print(f"{i:<4} {model:<40} {s['avg_latency_ms']:<12.0f} {tps:<9.1f} {s['avg_output_tokens']:<9.0f} ${p[0]:<8} ${p[1]:<8}")

    # Errors
    errors = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("error_rate", 0) > 0]
    if errors:
        print("\nErrors:")
        for model, s in errors:
            print(f" {model}: {s.get('error_rate', 1)*100:.0f}% failures")

    print("\nResults saved to benchmark-results.json")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
if __name__ == "__main__":
|
|
222
|
+
main()
|