@blockrun/clawrouter 0.12.44 → 0.12.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/clawrouter",
3
- "version": "0.12.44",
3
+ "version": "0.12.46",
4
4
  "description": "Smart LLM router — save 92% on inference costs. 41+ models, one wallet, x402 micropayments.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -0,0 +1,222 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ BlockRun Model Performance Benchmark v3
4
+ Measures end-to-end latency via non-streaming requests.
5
+ Uses real token counts from API usage response.
6
+ """
7
+
8
+ import time
9
+ import json
10
+ import os
11
+ import sys
12
+ from datetime import datetime, timezone
13
+ from openai import OpenAI
14
+
15
# Model identifiers to benchmark, grouped by provider.
# Each entry is "<provider>/<model-id>" as accepted by the router's
# OpenAI-compatible chat-completions endpoint (see main()).
MODELS = [
    # OpenAI
    "openai/gpt-5.4", "openai/gpt-5.4-pro",
    "openai/gpt-5.3", "openai/gpt-5.3-codex",
    "openai/gpt-5.2", "openai/gpt-5.2-pro",
    "openai/gpt-5-mini", "openai/gpt-5-nano",
    "openai/gpt-4.1", "openai/gpt-4.1-mini", "openai/gpt-4.1-nano",
    "openai/gpt-4o", "openai/gpt-4o-mini",
    "openai/o3", "openai/o3-mini", "openai/o4-mini",
    "openai/o1", "openai/o1-mini",
    # Anthropic
    "anthropic/claude-sonnet-4.6", "anthropic/claude-opus-4.6", "anthropic/claude-haiku-4.5",
    # Google
    "google/gemini-3.1-pro", "google/gemini-3-pro-preview", "google/gemini-3-flash-preview",
    "google/gemini-2.5-pro", "google/gemini-2.5-flash", "google/gemini-2.5-flash-lite",
    # DeepSeek
    "deepseek/deepseek-chat", "deepseek/deepseek-reasoner",
    # Moonshot
    "moonshot/kimi-k2.5",
    # xAI
    "xai/grok-3", "xai/grok-3-mini",
    "xai/grok-4-fast-reasoning", "xai/grok-4-fast-non-reasoning",
    "xai/grok-4-1-fast-reasoning", "xai/grok-4-1-fast-non-reasoning",
    "xai/grok-4-0709",
    # MiniMax
    "minimax/minimax-m2.5",
    # NVIDIA
    "nvidia/gpt-oss-120b",
]
44
+
45
# Code-generation prompts cycled round-robin across requests (see
# benchmark_model), so each run gets a similarly-sized but non-identical
# prompt.
PROMPTS = [
    "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases and a docstring.",
    "Write a Python function that finds the longest common subsequence of two strings. Include type hints and examples.",
    "Write a Python function that implements a simple LRU cache using OrderedDict. Include usage examples.",
]

NUM_REQUESTS = 2   # requests issued per model
MAX_TOKENS = 256   # completion-token cap sent with every request
53
+
54
+
55
def benchmark_model(client: OpenAI, model: str) -> list:
    """Run NUM_REQUESTS non-streaming chat completions against *model*.

    Returns a list of per-request dicts: a "success" record carrying
    latency and token metrics taken from the API's usage response, or an
    "error" record with a truncated error message.
    """
    results = []
    for i in range(NUM_REQUESTS):
        # Cycle through the fixed prompt set.
        prompt = PROMPTS[i % len(PROMPTS)]
        try:
            start = time.perf_counter()
            resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=MAX_TOKENS,
                stream=False,
                temperature=0.7,
            )
            # End-to-end wall-clock latency in milliseconds (includes
            # network round-trip, since the request is non-streaming).
            latency = (time.perf_counter() - start) * 1000

            content = resp.choices[0].message.content or ""
            finish = resp.choices[0].finish_reason
            usage = resp.usage
            # usage may be None depending on provider; fall back to zeros.
            input_tokens = usage.prompt_tokens if usage else 0
            output_tokens = usage.completion_tokens if usage else 0
            total_tokens = usage.total_tokens if usage else 0

            # Tokens per second (output tokens / latency)
            tps = (output_tokens / (latency / 1000)) if latency > 0 and output_tokens > 0 else 0

            results.append({
                "request": i + 1,
                "latency_ms": round(latency, 0),
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "total_tokens": total_tokens,
                "tokens_per_sec": round(tps, 1),
                "output_chars": len(content),
                "finish_reason": finish,
                "status": "success",
            })
            print(f" Run {i+1}: {latency:.0f}ms, {output_tokens} out tokens, {tps:.0f} tok/s, finish={finish}")

        except Exception as e:
            error_msg = str(e)[:200]
            results.append({"request": i + 1, "status": "error", "error": error_msg})
            print(f" Run {i+1}: ERROR - {error_msg[:120]}")
            # Back off on rate limiting; the failed request is NOT retried.
            if "429" in error_msg or "rate" in error_msg.lower():
                print(" >> Rate limited, waiting 30s...")
                time.sleep(30)

        # Pause between requests to avoid hammering the router.
        time.sleep(3)
    return results
103
+
104
+
105
def aggregate(results: list) -> dict:
    """Collapse per-request benchmark records into one summary dict.

    Only records whose status is "success" contribute latency/throughput
    statistics; when every run failed, the summary carries just the run
    count and an error rate of 1.0.
    """
    ok = [entry for entry in results if entry["status"] == "success"]
    total_runs = len(results)
    if not ok:
        return {"error_rate": 1.0, "runs": total_runs}

    lat_ms = [entry["latency_ms"] for entry in ok]
    # Zero tok/s values (no output tokens) are excluded from the average.
    throughput = [entry["tokens_per_sec"] for entry in ok if entry["tokens_per_sec"] > 0]
    produced = [entry["output_tokens"] for entry in ok]

    summary = {
        "runs": total_runs,
        "successes": len(ok),
        "error_rate": round(1 - len(ok) / total_runs, 2),
        "avg_latency_ms": round(sum(lat_ms) / len(lat_ms), 0),
        "min_latency_ms": round(min(lat_ms), 0),
        "max_latency_ms": round(max(lat_ms), 0),
        "avg_tokens_per_sec": round(sum(throughput) / len(throughput), 1) if throughput else 0,
        "avg_output_tokens": round(sum(produced) / len(produced), 0),
    }
    return summary
124
+
125
+
126
# Model pricing (USD per 1M tokens) — for cost calculation
# Each value is a (input_price, output_price) tuple; models missing from
# this table are shown as (0, 0) in the leaderboard (see main()).
PRICING = {
    "openai/gpt-5.4": (2.5, 15), "openai/gpt-5.4-pro": (2.5, 15),
    "openai/gpt-5.3": (2.5, 10), "openai/gpt-5.3-codex": (2.5, 10),
    "openai/gpt-5.2": (2.5, 10), "openai/gpt-5.2-pro": (2.5, 10),
    "openai/gpt-5-mini": (1.1, 4.4), "openai/gpt-5-nano": (0.5, 2),
    "openai/gpt-4.1": (2, 8), "openai/gpt-4.1-mini": (0.4, 1.6), "openai/gpt-4.1-nano": (0.1, 0.4),
    "openai/gpt-4o": (2.5, 10), "openai/gpt-4o-mini": (0.15, 0.6),
    "openai/o3": (2, 8), "openai/o3-mini": (1.1, 4.4), "openai/o4-mini": (1.1, 4.4),
    "openai/o1": (15, 60), "openai/o1-mini": (1.1, 4.4),
    "anthropic/claude-sonnet-4.6": (3, 15), "anthropic/claude-opus-4.6": (15, 75),
    "anthropic/claude-haiku-4.5": (0.8, 4),
    "google/gemini-3.1-pro": (1.25, 10), "google/gemini-3-pro-preview": (1.25, 10),
    "google/gemini-3-flash-preview": (0.15, 0.6),
    "google/gemini-2.5-pro": (1.25, 10), "google/gemini-2.5-flash": (0.15, 0.6),
    "google/gemini-2.5-flash-lite": (0.1, 0.4),
    "deepseek/deepseek-chat": (0.27, 1.1), "deepseek/deepseek-reasoner": (0.55, 2.19),
    "moonshot/kimi-k2.5": (0.6, 3),
    "xai/grok-3": (3, 15), "xai/grok-3-mini": (0.3, 0.5),
    "xai/grok-4-fast-reasoning": (0.2, 0.5), "xai/grok-4-fast-non-reasoning": (0.2, 0.5),
    "xai/grok-4-1-fast-reasoning": (0.2, 0.5), "xai/grok-4-1-fast-non-reasoning": (0.2, 0.5),
    "xai/grok-4-0709": (0.2, 1.5),
    "minimax/minimax-m2.5": (0.3, 1.1),
    "nvidia/gpt-oss-120b": (0, 0),
}
151
+
152
+
153
def main():
    """Benchmark every model through the local router and print a leaderboard.

    Connects to the OpenAI-compatible endpoint on localhost:18789, runs
    benchmark_model() for each model (all of MODELS, or the models given as
    CLI arguments), rewrites benchmark-results.json after every model, and
    finally prints a latency-ranked leaderboard with pricing columns.
    """
    # NOTE(review): "x402" appears to be a placeholder accepted by the local
    # router rather than a real API key — confirm against router docs.
    client = OpenAI(api_key="x402", base_url="http://localhost:18789/v1")

    print("BlockRun Model Performance Benchmark v3")
    print("=" * 60)
    # Fail fast if the router is unreachable.
    try:
        models = client.models.list()
        print(f"Connected. {len(models.data)} models available.")
    except Exception as e:
        print(f"Connection failed: {e}")
        sys.exit(1)

    print(f"Config: {NUM_REQUESTS} requests/model, {MAX_TOKENS} max tokens, non-streaming\n")

    all_results = {}
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # CLI arguments (if any) override the default model list.
    models_to_test = MODELS if len(sys.argv) <= 1 else sys.argv[1:]
    total = len(models_to_test)

    for idx, model in enumerate(models_to_test, 1):
        print(f"\n[{idx}/{total}] {model}")
        results = benchmark_model(client, model)
        agg = aggregate(results)
        all_results[model] = {"raw": results, "summary": agg}

        if agg.get("successes", 0) > 0:
            print(f" >> {agg['avg_latency_ms']:.0f}ms avg, {agg.get('avg_tokens_per_sec', 0)} tok/s, ~{agg['avg_output_tokens']} tokens")
        else:
            print(f" >> ALL FAILED")

        # Save incrementally
        # (the file is rewritten after each model so a crash mid-run keeps
        # all results gathered so far)
        output = {
            "benchmark": "BlockRun Model Performance",
            "version": "3.0",
            "timestamp": timestamp,
            "config": {"num_requests": NUM_REQUESTS, "max_tokens": MAX_TOKENS, "mode": "non-streaming"},
            "results": all_results,
        }
        # Written one directory above this script's location.
        with open(os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json"), "w") as f:
            json.dump(output, f, indent=2)

    # === LEADERBOARD ===
    print("\n" + "=" * 100)
    print("BLOCKRUN MODEL PERFORMANCE LEADERBOARD")
    print(f"Date: {timestamp} | Mode: non-streaming | Max tokens: {MAX_TOKENS}")
    print("=" * 100)

    # Rank models with at least one successful run by average latency (asc).
    ranked = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("successes", 0) > 0]
    ranked.sort(key=lambda x: x[1]["avg_latency_ms"])

    print(f"\n{'#':<4} {'Model':<40} {'Latency':<12} {'Tok/s':<9} {'Out Tok':<9} {'$/1M in':<9} {'$/1M out':<9}")
    print("-" * 92)
    for i, (model, s) in enumerate(ranked, 1):
        # (input, output) USD per 1M tokens; (0, 0) when the model is
        # missing from the PRICING table.
        p = PRICING.get(model, (0, 0))
        tps = s.get("avg_tokens_per_sec", 0)
        print(f"{i:<4} {model:<40} {s['avg_latency_ms']:<12.0f} {tps:<9.1f} {s['avg_output_tokens']:<9.0f} ${p[0]:<8} ${p[1]:<8}")

    # Errors
    errors = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("error_rate", 0) > 0]
    if errors:
        print(f"\nErrors:")
        for model, s in errors:
            print(f" {model}: {s.get('error_rate', 1)*100:.0f}% failures")

    print(f"\nResults saved to benchmark-results.json")
219
+
220
+
221
# Script entry point — run the full benchmark when executed directly.
if __name__ == "__main__":
    main()