@blockrun/clawrouter 0.12.45 → 0.12.46

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/clawrouter",
3
- "version": "0.12.45",
3
+ "version": "0.12.46",
4
4
  "description": "Smart LLM router — save 92% on inference costs. 41+ models, one wallet, x402 micropayments.",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -1,8 +1,8 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- BlockRun Model Performance Benchmark v2
4
- Measures TTFT, tokens/s, and total latency for all BlockRun models.
5
- Uses varied prompts to bypass response dedup cache.
3
+ BlockRun Model Performance Benchmark v3
4
+ Measures end-to-end latency via non-streaming requests.
5
+ Uses real token counts from API usage response.
6
6
  """
7
7
 
8
8
  import time
@@ -14,47 +14,27 @@ from openai import OpenAI
14
14
 
15
15
  MODELS = [
16
16
  # OpenAI
17
- "openai/gpt-5.4",
18
- "openai/gpt-5.4-pro",
19
- "openai/gpt-5.3",
20
- "openai/gpt-5.3-codex",
21
- "openai/gpt-5.2",
22
- "openai/gpt-5.2-pro",
23
- "openai/gpt-5-mini",
24
- "openai/gpt-5-nano",
25
- "openai/gpt-4.1",
26
- "openai/gpt-4.1-mini",
27
- "openai/gpt-4.1-nano",
28
- "openai/gpt-4o",
29
- "openai/gpt-4o-mini",
30
- "openai/o3",
31
- "openai/o3-mini",
32
- "openai/o4-mini",
33
- "openai/o1",
34
- "openai/o1-mini",
17
+ "openai/gpt-5.4", "openai/gpt-5.4-pro",
18
+ "openai/gpt-5.3", "openai/gpt-5.3-codex",
19
+ "openai/gpt-5.2", "openai/gpt-5.2-pro",
20
+ "openai/gpt-5-mini", "openai/gpt-5-nano",
21
+ "openai/gpt-4.1", "openai/gpt-4.1-mini", "openai/gpt-4.1-nano",
22
+ "openai/gpt-4o", "openai/gpt-4o-mini",
23
+ "openai/o3", "openai/o3-mini", "openai/o4-mini",
24
+ "openai/o1", "openai/o1-mini",
35
25
  # Anthropic
36
- "anthropic/claude-sonnet-4.6",
37
- "anthropic/claude-opus-4.6",
38
- "anthropic/claude-haiku-4.5",
26
+ "anthropic/claude-sonnet-4.6", "anthropic/claude-opus-4.6", "anthropic/claude-haiku-4.5",
39
27
  # Google
40
- "google/gemini-3.1-pro",
41
- "google/gemini-3-pro-preview",
42
- "google/gemini-3-flash-preview",
43
- "google/gemini-2.5-pro",
44
- "google/gemini-2.5-flash",
45
- "google/gemini-2.5-flash-lite",
28
+ "google/gemini-3.1-pro", "google/gemini-3-pro-preview", "google/gemini-3-flash-preview",
29
+ "google/gemini-2.5-pro", "google/gemini-2.5-flash", "google/gemini-2.5-flash-lite",
46
30
  # DeepSeek
47
- "deepseek/deepseek-chat",
48
- "deepseek/deepseek-reasoner",
31
+ "deepseek/deepseek-chat", "deepseek/deepseek-reasoner",
49
32
  # Moonshot
50
33
  "moonshot/kimi-k2.5",
51
34
  # xAI
52
- "xai/grok-3",
53
- "xai/grok-3-mini",
54
- "xai/grok-4-fast-reasoning",
55
- "xai/grok-4-fast-non-reasoning",
56
- "xai/grok-4-1-fast-reasoning",
57
- "xai/grok-4-1-fast-non-reasoning",
35
+ "xai/grok-3", "xai/grok-3-mini",
36
+ "xai/grok-4-fast-reasoning", "xai/grok-4-fast-non-reasoning",
37
+ "xai/grok-4-1-fast-reasoning", "xai/grok-4-1-fast-non-reasoning",
58
38
  "xai/grok-4-0709",
59
39
  # MiniMax
60
40
  "minimax/minimax-m2.5",
@@ -62,236 +42,178 @@ MODELS = [
62
42
  "nvidia/gpt-oss-120b",
63
43
  ]
64
44
 
65
- # Varied prompts to bypass dedup cache — each run uses a different prompt
66
45
  PROMPTS = [
67
46
  "Write a Python function that checks if a string is a valid IPv4 address. Include edge cases and a docstring.",
68
47
  "Write a Python function that finds the longest common subsequence of two strings. Include type hints and examples.",
69
48
  "Write a Python function that implements a simple LRU cache using OrderedDict. Include usage examples.",
70
- "Write a Python function that converts a Roman numeral string to an integer. Handle all standard cases.",
71
- "Write a Python function that flattens a nested list of arbitrary depth. Include type hints.",
72
49
  ]
73
50
 
74
51
  NUM_REQUESTS = 2
75
52
  MAX_TOKENS = 256
76
53
 
77
54
 
78
- def estimate_tokens(text: str) -> int:
79
- """Rough token estimate: ~4 chars per token for English/code."""
80
- return max(1, len(text) // 4)
81
-
82
-
83
55
  def benchmark_model(client: OpenAI, model: str) -> list:
84
- """Benchmark a single model with streaming."""
85
56
  results = []
86
-
87
57
  for i in range(NUM_REQUESTS):
88
58
  prompt = PROMPTS[i % len(PROMPTS)]
89
-
90
59
  try:
91
60
  start = time.perf_counter()
92
- ttft = None
93
- full_response = ""
94
- first_chunk_time = None
95
- last_chunk_time = None
96
- chunk_count = 0
97
-
98
- stream = client.chat.completions.create(
61
+ resp = client.chat.completions.create(
99
62
  model=model,
100
63
  messages=[{"role": "user", "content": prompt}],
101
64
  max_tokens=MAX_TOKENS,
102
- stream=True,
65
+ stream=False,
103
66
  temperature=0.7,
104
67
  )
68
+ latency = (time.perf_counter() - start) * 1000
69
+
70
+ content = resp.choices[0].message.content or ""
71
+ finish = resp.choices[0].finish_reason
72
+ usage = resp.usage
73
+ input_tokens = usage.prompt_tokens if usage else 0
74
+ output_tokens = usage.completion_tokens if usage else 0
75
+ total_tokens = usage.total_tokens if usage else 0
105
76
 
106
- for chunk in stream:
107
- now = time.perf_counter()
108
- if chunk.choices and chunk.choices[0].delta.content:
109
- content = chunk.choices[0].delta.content
110
- if ttft is None:
111
- ttft = (now - start) * 1000 # ms
112
- first_chunk_time = now
113
- full_response += content
114
- last_chunk_time = now
115
- chunk_count += 1
116
-
117
- end = time.perf_counter()
118
- total_time = (end - start) * 1000 # ms
119
-
120
- # Estimate output tokens
121
- output_tokens = estimate_tokens(full_response)
122
-
123
- # Tokens per second (generation phase: from first token to last token)
124
- if first_chunk_time and last_chunk_time and last_chunk_time > first_chunk_time:
125
- gen_time_s = last_chunk_time - first_chunk_time
126
- tps = output_tokens / gen_time_s if gen_time_s > 0.001 else 0
127
- elif output_tokens > 0 and ttft is not None:
128
- # Single chunk — use total time minus TTFT
129
- gen_time_s = (end - start) - (ttft / 1000)
130
- tps = output_tokens / gen_time_s if gen_time_s > 0.001 else 0
131
- else:
132
- tps = 0
133
-
134
- # Inter-token latency (approximate from chunks)
135
- if first_chunk_time and last_chunk_time and chunk_count > 1:
136
- avg_itl = ((last_chunk_time - first_chunk_time) / (chunk_count - 1)) * 1000
137
- else:
138
- avg_itl = None
77
+ # Tokens per second (output tokens / latency)
78
+ tps = (output_tokens / (latency / 1000)) if latency > 0 and output_tokens > 0 else 0
139
79
 
140
80
  results.append({
141
81
  "request": i + 1,
142
- "ttft_ms": round(ttft, 1) if ttft is not None else None,
143
- "total_ms": round(total_time, 1),
82
+ "latency_ms": round(latency, 0),
83
+ "input_tokens": input_tokens,
144
84
  "output_tokens": output_tokens,
145
- "output_chars": len(full_response),
146
- "chunks": chunk_count,
85
+ "total_tokens": total_tokens,
147
86
  "tokens_per_sec": round(tps, 1),
148
- "avg_itl_ms": round(avg_itl, 1) if avg_itl is not None else None,
87
+ "output_chars": len(content),
88
+ "finish_reason": finish,
149
89
  "status": "success",
150
90
  })
151
-
152
- ttft_s = f"{ttft:.0f}" if ttft is not None else "N/A"
153
- print(f" Run {i+1}/{NUM_REQUESTS}: TTFT={ttft_s}ms, {tps:.0f} tok/s, ~{output_tokens} tokens ({len(full_response)} chars), {total_time:.0f}ms")
91
+ print(f" Run {i+1}: {latency:.0f}ms, {output_tokens} out tokens, {tps:.0f} tok/s, finish={finish}")
154
92
 
155
93
  except Exception as e:
156
- error_msg = str(e)[:300]
157
- results.append({
158
- "request": i + 1,
159
- "status": "error",
160
- "error": error_msg,
161
- })
162
- print(f" Run {i+1}/{NUM_REQUESTS}: ERROR - {error_msg[:150]}")
163
- # If rate limited or out of funds, wait longer
164
- if "429" in error_msg or "rate" in error_msg.lower() or "402" in error_msg:
165
- print(" >> Rate limited or payment issue, waiting 30s...")
94
+ error_msg = str(e)[:200]
95
+ results.append({"request": i + 1, "status": "error", "error": error_msg})
96
+ print(f" Run {i+1}: ERROR - {error_msg[:120]}")
97
+ if "429" in error_msg or "rate" in error_msg.lower():
98
+ print(" >> Rate limited, waiting 30s...")
166
99
  time.sleep(30)
167
100
 
168
101
  time.sleep(3)
169
-
170
102
  return results
171
103
 
172
104
 
173
105
  def aggregate(results: list) -> dict:
174
- """Aggregate results across runs."""
175
106
  successes = [r for r in results if r["status"] == "success"]
176
107
  if not successes:
177
108
  return {"error_rate": 1.0, "runs": len(results)}
178
109
 
179
- ttft_vals = [r["ttft_ms"] for r in successes if r.get("ttft_ms") is not None]
180
- itl_vals = [r["avg_itl_ms"] for r in successes if r.get("avg_itl_ms") is not None]
110
+ latencies = [r["latency_ms"] for r in successes]
181
111
  tps_vals = [r["tokens_per_sec"] for r in successes if r["tokens_per_sec"] > 0]
112
+ out_tokens = [r["output_tokens"] for r in successes]
182
113
 
183
- agg = {
114
+ return {
184
115
  "runs": len(results),
185
116
  "successes": len(successes),
186
117
  "error_rate": round(1 - len(successes) / len(results), 2),
187
- "avg_output_tokens": round(sum(r["output_tokens"] for r in successes) / len(successes), 0),
188
- "avg_total_ms": round(sum(r["total_ms"] for r in successes) / len(successes), 0),
118
+ "avg_latency_ms": round(sum(latencies) / len(latencies), 0),
119
+ "min_latency_ms": round(min(latencies), 0),
120
+ "max_latency_ms": round(max(latencies), 0),
121
+ "avg_tokens_per_sec": round(sum(tps_vals) / len(tps_vals), 1) if tps_vals else 0,
122
+ "avg_output_tokens": round(sum(out_tokens) / len(out_tokens), 0),
189
123
  }
190
- if tps_vals:
191
- agg["avg_tokens_per_sec"] = round(sum(tps_vals) / len(tps_vals), 1)
192
- agg["max_tokens_per_sec"] = round(max(tps_vals), 1)
193
- if ttft_vals:
194
- agg["avg_ttft_ms"] = round(sum(ttft_vals) / len(ttft_vals), 0)
195
- agg["p50_ttft_ms"] = round(sorted(ttft_vals)[len(ttft_vals) // 2], 0)
196
- if itl_vals:
197
- agg["avg_itl_ms"] = round(sum(itl_vals) / len(itl_vals), 1)
198
124
 
199
- return agg
125
+
126
+ # Model pricing (USD per 1M tokens) — for cost calculation
127
+ PRICING = {
128
+ "openai/gpt-5.4": (2.5, 15), "openai/gpt-5.4-pro": (2.5, 15),
129
+ "openai/gpt-5.3": (2.5, 10), "openai/gpt-5.3-codex": (2.5, 10),
130
+ "openai/gpt-5.2": (2.5, 10), "openai/gpt-5.2-pro": (2.5, 10),
131
+ "openai/gpt-5-mini": (1.1, 4.4), "openai/gpt-5-nano": (0.5, 2),
132
+ "openai/gpt-4.1": (2, 8), "openai/gpt-4.1-mini": (0.4, 1.6), "openai/gpt-4.1-nano": (0.1, 0.4),
133
+ "openai/gpt-4o": (2.5, 10), "openai/gpt-4o-mini": (0.15, 0.6),
134
+ "openai/o3": (2, 8), "openai/o3-mini": (1.1, 4.4), "openai/o4-mini": (1.1, 4.4),
135
+ "openai/o1": (15, 60), "openai/o1-mini": (1.1, 4.4),
136
+ "anthropic/claude-sonnet-4.6": (3, 15), "anthropic/claude-opus-4.6": (15, 75),
137
+ "anthropic/claude-haiku-4.5": (0.8, 4),
138
+ "google/gemini-3.1-pro": (1.25, 10), "google/gemini-3-pro-preview": (1.25, 10),
139
+ "google/gemini-3-flash-preview": (0.15, 0.6),
140
+ "google/gemini-2.5-pro": (1.25, 10), "google/gemini-2.5-flash": (0.15, 0.6),
141
+ "google/gemini-2.5-flash-lite": (0.1, 0.4),
142
+ "deepseek/deepseek-chat": (0.27, 1.1), "deepseek/deepseek-reasoner": (0.55, 2.19),
143
+ "moonshot/kimi-k2.5": (0.6, 3),
144
+ "xai/grok-3": (3, 15), "xai/grok-3-mini": (0.3, 0.5),
145
+ "xai/grok-4-fast-reasoning": (0.2, 0.5), "xai/grok-4-fast-non-reasoning": (0.2, 0.5),
146
+ "xai/grok-4-1-fast-reasoning": (0.2, 0.5), "xai/grok-4-1-fast-non-reasoning": (0.2, 0.5),
147
+ "xai/grok-4-0709": (0.2, 1.5),
148
+ "minimax/minimax-m2.5": (0.3, 1.1),
149
+ "nvidia/gpt-oss-120b": (0, 0),
150
+ }
200
151
 
201
152
 
202
153
  def main():
203
- client = OpenAI(
204
- api_key="x402",
205
- base_url="http://localhost:18789/v1",
206
- )
154
+ client = OpenAI(api_key="x402", base_url="http://localhost:18789/v1")
207
155
 
208
- print("Testing connection to ClawRouter...")
156
+ print("BlockRun Model Performance Benchmark v3")
157
+ print("=" * 60)
209
158
  try:
210
159
  models = client.models.list()
211
160
  print(f"Connected. {len(models.data)} models available.")
212
161
  except Exception as e:
213
- print(f"Failed to connect: {e}")
162
+ print(f"Connection failed: {e}")
214
163
  sys.exit(1)
215
164
 
216
- print(f"Benchmark: {NUM_REQUESTS} requests per model, {MAX_TOKENS} max tokens")
217
- print(f"Prompts varied per request to bypass dedup cache\n")
165
+ print(f"Config: {NUM_REQUESTS} requests/model, {MAX_TOKENS} max tokens, non-streaming\n")
218
166
 
219
167
  all_results = {}
220
168
  timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
221
169
 
222
- models_to_test = MODELS
223
- if len(sys.argv) > 1:
224
- models_to_test = sys.argv[1:]
225
-
170
+ models_to_test = MODELS if len(sys.argv) <= 1 else sys.argv[1:]
226
171
  total = len(models_to_test)
172
+
227
173
  for idx, model in enumerate(models_to_test, 1):
228
174
  print(f"\n[{idx}/{total}] {model}")
229
- print("-" * 50)
230
-
231
175
  results = benchmark_model(client, model)
232
176
  agg = aggregate(results)
233
177
  all_results[model] = {"raw": results, "summary": agg}
234
178
 
235
179
  if agg.get("successes", 0) > 0:
236
- ttft_str = f"TTFT={agg.get('avg_ttft_ms', 'N/A')}ms, " if "avg_ttft_ms" in agg else ""
237
- tps_str = f"{agg.get('avg_tokens_per_sec', 0)} tok/s, " if "avg_tokens_per_sec" in agg else ""
238
- print(f" >> {ttft_str}{tps_str}{agg['avg_total_ms']:.0f}ms avg total")
180
+ print(f" >> {agg['avg_latency_ms']:.0f}ms avg, {agg.get('avg_tokens_per_sec', 0)} tok/s, ~{agg['avg_output_tokens']} tokens")
239
181
  else:
240
182
  print(f" >> ALL FAILED")
241
183
 
242
184
  # Save incrementally
243
185
  output = {
244
186
  "benchmark": "BlockRun Model Performance",
245
- "version": "1.0",
187
+ "version": "3.0",
246
188
  "timestamp": timestamp,
247
- "config": {
248
- "num_requests": NUM_REQUESTS,
249
- "max_tokens": MAX_TOKENS,
250
- "prompts": PROMPTS,
251
- "token_estimation": "~4 chars per token",
252
- "endpoint": "localhost:18789 (ClawRouter → blockrun.ai x402)",
253
- },
189
+ "config": {"num_requests": NUM_REQUESTS, "max_tokens": MAX_TOKENS, "mode": "non-streaming"},
254
190
  "results": all_results,
255
191
  }
256
- outfile = os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json")
257
- with open(outfile, "w") as f:
192
+ with open(os.path.join(os.path.dirname(__file__), "..", "benchmark-results.json"), "w") as f:
258
193
  json.dump(output, f, indent=2)
259
194
 
260
- # Leaderboard
261
- print("\n" + "=" * 90)
195
+ # === LEADERBOARD ===
196
+ print("\n" + "=" * 100)
262
197
  print("BLOCKRUN MODEL PERFORMANCE LEADERBOARD")
263
- print(f"Tested: {timestamp}")
264
- print("=" * 90)
265
-
266
- ranked = []
267
- for model, data in all_results.items():
268
- s = data["summary"]
269
- if s.get("avg_tokens_per_sec", 0) > 0:
270
- ranked.append((model, s))
198
+ print(f"Date: {timestamp} | Mode: non-streaming | Max tokens: {MAX_TOKENS}")
199
+ print("=" * 100)
271
200
 
272
- ranked.sort(key=lambda x: x[1]["avg_tokens_per_sec"], reverse=True)
201
+ ranked = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("successes", 0) > 0]
202
+ ranked.sort(key=lambda x: x[1]["avg_latency_ms"])
273
203
 
274
- print(f"\n{'#':<4} {'Model':<40} {'TTFT':<9} {'Tok/s':<9} {'ITL':<9} {'Total':<9} {'~Tokens':<8}")
275
- print("-" * 88)
204
+ print(f"\n{'#':<4} {'Model':<40} {'Latency':<12} {'Tok/s':<9} {'Out Tok':<9} {'$/1M in':<9} {'$/1M out':<9}")
205
+ print("-" * 92)
276
206
  for i, (model, s) in enumerate(ranked, 1):
277
- ttft = f"{s['avg_ttft_ms']:.0f}ms" if "avg_ttft_ms" in s else "N/A"
278
- itl = f"{s['avg_itl_ms']:.1f}ms" if "avg_itl_ms" in s else "N/A"
279
- print(f"{i:<4} {model:<40} {ttft:<9} {s['avg_tokens_per_sec']:<9.1f} {itl:<9} {s['avg_total_ms']:<9.0f} {s['avg_output_tokens']:<8.0f}")
280
-
281
- # Models with issues
282
- zero_tps = [(m, d["summary"]) for m, d in all_results.items()
283
- if d["summary"].get("avg_tokens_per_sec", 0) == 0 and d["summary"].get("successes", 0) > 0]
284
- if zero_tps:
285
- print(f"\nModels with 0 tok/s (single-chunk or non-streaming response):")
286
- for model, s in zero_tps:
287
- ttft = f"{s.get('avg_ttft_ms', 'N/A')}ms"
288
- print(f" {model}: TTFT={ttft}, ~{s.get('avg_output_tokens', 0)} tokens, {s['avg_total_ms']:.0f}ms total")
207
+ p = PRICING.get(model, (0, 0))
208
+ tps = s.get("avg_tokens_per_sec", 0)
209
+ print(f"{i:<4} {model:<40} {s['avg_latency_ms']:<12.0f} {tps:<9.1f} {s['avg_output_tokens']:<9.0f} ${p[0]:<8} ${p[1]:<8}")
289
210
 
211
+ # Errors
290
212
  errors = [(m, d["summary"]) for m, d in all_results.items() if d["summary"].get("error_rate", 0) > 0]
291
213
  if errors:
292
- print(f"\nModels with errors:")
214
+ print(f"\nErrors:")
293
215
  for model, s in errors:
294
- print(f" {model}: {s.get('error_rate', 1.0)*100:.0f}% error rate")
216
+ print(f" {model}: {s.get('error_rate', 1)*100:.0f}% failures")
295
217
 
296
218
  print(f"\nResults saved to benchmark-results.json")
297
219