@blockrun/clawrouter 0.12.44 → 0.12.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/clawrouter",
3
- "version": "0.12.44",
3
+ "version": "0.12.45",
4
4
  "description": "Smart LLM router — save 92% on inference costs. 41+ models, one wallet, x402 micropayments.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -0,0 +1,300 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ BlockRun Model Performance Benchmark v2
4
+ Measures TTFT, tokens/s, and total latency for all BlockRun models.
5
+ Uses varied prompts to bypass response dedup cache.
6
+ """
7
+
8
+ import time
9
+ import json
10
+ import os
11
+ import sys
12
+ from datetime import datetime, timezone
13
+ from openai import OpenAI
14
+
15
# Model identifiers benchmarked by default, grouped by provider prefix.
MODELS = [
    # --- openai ---
    "openai/gpt-5.4",
    "openai/gpt-5.4-pro",
    "openai/gpt-5.3",
    "openai/gpt-5.3-codex",
    "openai/gpt-5.2",
    "openai/gpt-5.2-pro",
    "openai/gpt-5-mini",
    "openai/gpt-5-nano",
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1-nano",
    "openai/gpt-4o",
    "openai/gpt-4o-mini",
    "openai/o3",
    "openai/o3-mini",
    "openai/o4-mini",
    "openai/o1",
    "openai/o1-mini",
    # --- anthropic ---
    "anthropic/claude-sonnet-4.6",
    "anthropic/claude-opus-4.6",
    "anthropic/claude-haiku-4.5",
    # --- google ---
    "google/gemini-3.1-pro",
    "google/gemini-3-pro-preview",
    "google/gemini-3-flash-preview",
    "google/gemini-2.5-pro",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-flash-lite",
    # --- deepseek ---
    "deepseek/deepseek-chat",
    "deepseek/deepseek-reasoner",
    # --- moonshot ---
    "moonshot/kimi-k2.5",
    # --- xai ---
    "xai/grok-3",
    "xai/grok-3-mini",
    "xai/grok-4-fast-reasoning",
    "xai/grok-4-fast-non-reasoning",
    "xai/grok-4-1-fast-reasoning",
    "xai/grok-4-1-fast-non-reasoning",
    "xai/grok-4-0709",
    # --- minimax ---
    "minimax/minimax-m2.5",
    # --- nvidia ---
    "nvidia/gpt-oss-120b",
]

# Prompt pool. Request number i uses PROMPTS[i % len(PROMPTS)], so consecutive
# requests to the same model never repeat a prompt until the pool wraps; this
# is what keeps the response dedup cache from answering them.
PROMPTS = [
    "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases and a docstring.",
    "Write a Python function that finds the longest common subsequence of two strings. Include type hints and examples.",
    "Write a Python function that implements a simple LRU cache using OrderedDict. Include usage examples.",
    "Write a Python function that converts a Roman numeral string to an integer. Handle all standard cases.",
    "Write a Python function that flattens a nested list of arbitrary depth. Include type hints.",
]

# Requests issued per model.
NUM_REQUESTS = 2
# Value passed as max_tokens on every request.
MAX_TOKENS = 256
76
+
77
+
78
def estimate_tokens(text: str) -> int:
    """Return a rough token count for *text* (~4 characters per token).

    Empty text yields 0, so a response that produced no content is not
    reported as one generated token. Any non-empty text counts as at
    least one token.
    """
    if not text:
        return 0
    return max(1, len(text) // 4)
81
+
82
+
83
def benchmark_model(client: OpenAI, model: str) -> list:
    """Benchmark a single model with streaming requests.

    Issues ``NUM_REQUESTS`` streaming chat completions to *model*, cycling
    through ``PROMPTS``, and records timing metrics for each request.

    Args:
        client: OpenAI-compatible client used to issue the requests.
        model: Model identifier passed through to the API.

    Returns:
        One dict per request. Successful runs carry ``status == "success"``
        together with ``ttft_ms``, ``total_ms``, ``output_tokens``,
        ``output_chars``, ``chunks``, ``tokens_per_sec`` and ``avg_itl_ms``.
        Failed runs carry ``status == "error"`` and an ``error`` message
        truncated to 300 characters.
    """
    results = []

    for i in range(NUM_REQUESTS):
        prompt = PROMPTS[i % len(PROMPTS)]

        try:
            start = time.perf_counter()
            first_chunk_time = None
            last_chunk_time = None
            # Collect pieces and join once, so string building stays out of
            # the timed loop as much as possible.
            pieces = []

            stream = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_TOKENS,
                stream=True,
                temperature=0.7,
            )

            for chunk in stream:
                now = time.perf_counter()
                # Only chunks that actually carry text are counted.
                if chunk.choices and chunk.choices[0].delta.content:
                    if first_chunk_time is None:
                        first_chunk_time = now
                    last_chunk_time = now
                    pieces.append(chunk.choices[0].delta.content)

            end = time.perf_counter()
            total_time = (end - start) * 1000  # ms

            full_response = "".join(pieces)
            chunk_count = len(pieces)
            # Time to first token, in ms; None when no content arrived.
            ttft = None
            if first_chunk_time is not None:
                ttft = (first_chunk_time - start) * 1000

            # Estimate output tokens
            output_tokens = estimate_tokens(full_response)

            # Tokens per second over the generation phase (first content
            # chunk to last content chunk).
            if first_chunk_time is not None and last_chunk_time > first_chunk_time:
                gen_time_s = last_chunk_time - first_chunk_time
            elif output_tokens > 0 and ttft is not None:
                # Single chunk — use total time minus TTFT
                gen_time_s = (end - start) - (ttft / 1000)
            else:
                gen_time_s = 0.0
            # Intervals of 1 ms or less are too short to give a meaningful rate.
            tps = output_tokens / gen_time_s if gen_time_s > 0.001 else 0

            # Inter-token latency, approximated as the mean gap between
            # content chunks.
            if chunk_count > 1:
                avg_itl = ((last_chunk_time - first_chunk_time) / (chunk_count - 1)) * 1000
            else:
                avg_itl = None

            results.append({
                "request": i + 1,
                "ttft_ms": round(ttft, 1) if ttft is not None else None,
                "total_ms": round(total_time, 1),
                "output_tokens": output_tokens,
                "output_chars": len(full_response),
                "chunks": chunk_count,
                "tokens_per_sec": round(tps, 1),
                "avg_itl_ms": round(avg_itl, 1) if avg_itl is not None else None,
                "status": "success",
            })

            ttft_s = f"{ttft:.0f}" if ttft is not None else "N/A"
            print(f" Run {i+1}/{NUM_REQUESTS}: TTFT={ttft_s}ms, {tps:.0f} tok/s, ~{output_tokens} tokens ({len(full_response)} chars), {total_time:.0f}ms")

        except Exception as e:
            # Deliberately broad: one failing request must not abort the
            # whole benchmark; the failure is recorded and reported instead.
            full_msg = str(e)
            error_msg = full_msg[:300]
            results.append({
                "request": i + 1,
                "status": "error",
                "error": error_msg,
            })
            print(f" Run {i+1}/{NUM_REQUESTS}: ERROR - {error_msg[:150]}")
            # If rate limited or out of funds, wait longer. Check the HTTP
            # status when the exception exposes one, and scan the full
            # (untruncated) message so a code past character 300 is not missed.
            status_code = getattr(e, "status_code", None)
            if (
                status_code in (402, 429)
                or "429" in full_msg
                or "rate" in full_msg.lower()
                or "402" in full_msg
            ):
                print(" >> Rate limited or payment issue, waiting 30s...")
                time.sleep(30)

        # Pause between requests.
        time.sleep(3)

    return results
171
+
172
+
173
def aggregate(results: list) -> dict:
    """Aggregate per-request results into a summary for one model.

    Args:
        results: Per-request dicts, each with a ``status`` key. Entries whose
            status is ``"success"`` must also carry ``output_tokens``,
            ``total_ms`` and ``tokens_per_sec``; ``ttft_ms`` and
            ``avg_itl_ms`` may be missing or ``None``.

    Returns:
        A dict that always contains ``runs``, ``successes`` and
        ``error_rate``. When at least one request succeeded it also contains
        ``avg_output_tokens`` and ``avg_total_ms``, plus whichever of the
        throughput (``avg_tokens_per_sec``, ``max_tokens_per_sec``), TTFT
        (``avg_ttft_ms``, ``p50_ttft_ms``) and ``avg_itl_ms`` entries have
        data behind them.
    """
    from statistics import median

    successes = [r for r in results if r["status"] == "success"]
    if not successes:
        return {"error_rate": 1.0, "runs": len(results), "successes": 0}

    ok_count = len(successes)
    ttft_vals = [r["ttft_ms"] for r in successes if r.get("ttft_ms") is not None]
    itl_vals = [r["avg_itl_ms"] for r in successes if r.get("avg_itl_ms") is not None]
    # Zero throughput means "could not be measured"; keep it out of the averages.
    tps_vals = [r["tokens_per_sec"] for r in successes if r["tokens_per_sec"] > 0]

    agg = {
        "runs": len(results),
        "successes": ok_count,
        "error_rate": round(1 - ok_count / len(results), 2),
        "avg_output_tokens": round(sum(r["output_tokens"] for r in successes) / ok_count, 0),
        "avg_total_ms": round(sum(r["total_ms"] for r in successes) / ok_count, 0),
    }
    if tps_vals:
        agg["avg_tokens_per_sec"] = round(sum(tps_vals) / len(tps_vals), 1)
        agg["max_tokens_per_sec"] = round(max(tps_vals), 1)
    if ttft_vals:
        agg["avg_ttft_ms"] = round(sum(ttft_vals) / len(ttft_vals), 0)
        # True median: for an even number of samples this averages the two
        # middle values instead of reporting the larger one.
        agg["p50_ttft_ms"] = round(median(ttft_vals), 0)
    if itl_vals:
        agg["avg_itl_ms"] = round(sum(itl_vals) / len(itl_vals), 1)

    return agg
200
+
201
+
202
def _print_leaderboard(all_results: dict, timestamp: str) -> None:
    """Print the ranked throughput table and the lists of problem models.

    Args:
        all_results: Mapping of model name to a dict whose ``"summary"``
            entry is the aggregate produced for that model.
        timestamp: Benchmark start time, shown in the header.
    """
    print("\n" + "=" * 90)
    print("BLOCKRUN MODEL PERFORMANCE LEADERBOARD")
    print(f"Tested: {timestamp}")
    print("=" * 90)

    summaries = [(model, data["summary"]) for model, data in all_results.items()]

    # Only models with a measurable throughput are ranked, fastest first.
    ranked = sorted(
        ((m, s) for m, s in summaries if s.get("avg_tokens_per_sec", 0) > 0),
        key=lambda item: item[1]["avg_tokens_per_sec"],
        reverse=True,
    )

    print(f"\n{'#':<4} {'Model':<40} {'TTFT':<9} {'Tok/s':<9} {'ITL':<9} {'Total':<9} {'~Tokens':<8}")
    print("-" * 88)
    for i, (model, s) in enumerate(ranked, 1):
        ttft = f"{s['avg_ttft_ms']:.0f}ms" if "avg_ttft_ms" in s else "N/A"
        itl = f"{s['avg_itl_ms']:.1f}ms" if "avg_itl_ms" in s else "N/A"
        print(f"{i:<4} {model:<40} {ttft:<9} {s['avg_tokens_per_sec']:<9.1f} {itl:<9} {s['avg_total_ms']:<9.0f} {s['avg_output_tokens']:<8.0f}")

    # Models that answered but for which no throughput could be computed.
    zero_tps = [
        (m, s) for m, s in summaries
        if s.get("avg_tokens_per_sec", 0) == 0 and s.get("successes", 0) > 0
    ]
    if zero_tps:
        print("\nModels with 0 tok/s (single-chunk or non-streaming response):")
        for model, s in zero_tps:
            # Append the unit only when there is a value, so a missing TTFT
            # reads "N/A" rather than "N/Ams".
            ttft = f"{s['avg_ttft_ms']:.0f}ms" if "avg_ttft_ms" in s else "N/A"
            print(f" {model}: TTFT={ttft}, ~{s.get('avg_output_tokens', 0)} tokens, {s['avg_total_ms']:.0f}ms total")

    errors = [(m, s) for m, s in summaries if s.get("error_rate", 0) > 0]
    if errors:
        print("\nModels with errors:")
        for model, s in errors:
            print(f" {model}: {s.get('error_rate', 1.0)*100:.0f}% error rate")


def main():
    """Run the benchmark and write the results to ``benchmark-results.json``.

    Benchmarks every model named on the command line, or all of ``MODELS``
    when no arguments are given. The results file, located one directory
    above this script, is rewritten after each model so that partial results
    survive an interrupted run. Exits with status 1 if the local router
    cannot be reached.
    """
    client = OpenAI(
        api_key="x402",
        base_url="http://localhost:18789/v1",
    )

    print("Testing connection to ClawRouter...")
    try:
        models = client.models.list()
        print(f"Connected. {len(models.data)} models available.")
    except Exception as e:
        print(f"Failed to connect: {e}")
        sys.exit(1)

    print(f"Benchmark: {NUM_REQUESTS} requests per model, {MAX_TOKENS} max tokens")
    print("Prompts varied per request to bypass dedup cache\n")

    all_results = {}
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # Command-line arguments, when present, replace the default model list.
    models_to_test = sys.argv[1:] if len(sys.argv) > 1 else MODELS

    # The output path and envelope do not change between models; the envelope
    # holds a reference to all_results, so each dump sees the latest entries.
    outfile = os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json")
    output = {
        "benchmark": "BlockRun Model Performance",
        "version": "1.0",
        "timestamp": timestamp,
        "config": {
            "num_requests": NUM_REQUESTS,
            "max_tokens": MAX_TOKENS,
            "prompts": PROMPTS,
            "token_estimation": "~4 chars per token",
            "endpoint": "localhost:18789 (ClawRouter → blockrun.ai x402)",
        },
        "results": all_results,
    }

    total = len(models_to_test)
    for idx, model in enumerate(models_to_test, 1):
        print(f"\n[{idx}/{total}] {model}")
        print("-" * 50)

        results = benchmark_model(client, model)
        agg = aggregate(results)
        all_results[model] = {"raw": results, "summary": agg}

        if agg.get("successes", 0) > 0:
            ttft_str = f"TTFT={agg['avg_ttft_ms']}ms, " if "avg_ttft_ms" in agg else ""
            tps_str = f"{agg['avg_tokens_per_sec']} tok/s, " if "avg_tokens_per_sec" in agg else ""
            print(f" >> {ttft_str}{tps_str}{agg['avg_total_ms']:.0f}ms avg total")
        else:
            print(" >> ALL FAILED")

        # Save incrementally
        with open(outfile, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)

    _print_leaderboard(all_results, timestamp)

    print("\nResults saved to benchmark-results.json")


if __name__ == "__main__":
    main()