titan-synapse 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CONTRIBUTING.md +187 -0
- package/Cargo.lock +3976 -0
- package/Cargo.toml +10 -0
- package/LICENSE +190 -0
- package/PROGRESS.md +151 -0
- package/README.md +514 -0
- package/TEST_LOG.md +220 -0
- package/config/default.yaml +36 -0
- package/crates/synapse/Cargo.toml +70 -0
- package/crates/synapse/src/cli/bench.rs +44 -0
- package/crates/synapse/src/cli/eval.rs +395 -0
- package/crates/synapse/src/cli/export.rs +45 -0
- package/crates/synapse/src/cli/hub.rs +179 -0
- package/crates/synapse/src/cli/import.rs +35 -0
- package/crates/synapse/src/cli/learn.rs +53 -0
- package/crates/synapse/src/cli/mod.rs +10 -0
- package/crates/synapse/src/cli/models.rs +36 -0
- package/crates/synapse/src/cli/pull.rs +60 -0
- package/crates/synapse/src/cli/status.rs +52 -0
- package/crates/synapse/src/cli/train.rs +99 -0
- package/crates/synapse/src/config.rs +220 -0
- package/crates/synapse/src/dashboard.rs +281 -0
- package/crates/synapse/src/format/manifest.rs +57 -0
- package/crates/synapse/src/format/mod.rs +4 -0
- package/crates/synapse/src/format/packer.rs +213 -0
- package/crates/synapse/src/inference/engine.rs +361 -0
- package/crates/synapse/src/inference/kv_cache.rs +97 -0
- package/crates/synapse/src/inference/lora.rs +166 -0
- package/crates/synapse/src/inference/mod.rs +9 -0
- package/crates/synapse/src/inference/model.rs +167 -0
- package/crates/synapse/src/inference/sampler.rs +133 -0
- package/crates/synapse/src/inference/speculative.rs +153 -0
- package/crates/synapse/src/learn/cloud_fallback.rs +186 -0
- package/crates/synapse/src/learn/engine.rs +109 -0
- package/crates/synapse/src/learn/mod.rs +5 -0
- package/crates/synapse/src/main.rs +185 -0
- package/crates/synapse/src/memory/extractor.rs +201 -0
- package/crates/synapse/src/memory/graph.rs +332 -0
- package/crates/synapse/src/memory/hallucination.rs +259 -0
- package/crates/synapse/src/memory/mod.rs +7 -0
- package/crates/synapse/src/openai.rs +232 -0
- package/crates/synapse/src/server.rs +166 -0
- package/crates/synapse/src/streaming.rs +80 -0
- package/crates/synapse/src/swarm/coordinator.rs +198 -0
- package/crates/synapse/src/swarm/mod.rs +8 -0
- package/crates/synapse/src/swarm/orchestrator.rs +225 -0
- package/crates/synapse/src/swarm/pool.rs +64 -0
- package/crates/synapse/src/swarm/spawner.rs +199 -0
- package/crates/synapse/src/swarm/synthesizer.rs +26 -0
- package/crates/synapse/src/vram/manager.rs +67 -0
- package/crates/synapse/src/vram/mod.rs +3 -0
- package/docker-compose.yml +19 -0
- package/install.sh +311 -0
- package/package.json +36 -0
- package/python/Dockerfile.learn +18 -0
- package/python/requirements.txt +11 -0
- package/python/synapse_learn/__init__.py +0 -0
- package/python/synapse_learn/datasets.py +233 -0
- package/python/synapse_learn/real_eval.py +616 -0
- package/python/synapse_learn/server.py +431 -0
- package/python/synapse_learn/train_base.py +672 -0
- package/python/synapse_learn/train_specialists.py +787 -0
|
@@ -0,0 +1,616 @@
|
|
|
1
|
+
"""TITAN Synapse — Real Standardized Benchmarks
|
|
2
|
+
|
|
3
|
+
This runs our model against the ACTUAL benchmark datasets that big AI companies use.
|
|
4
|
+
No cherry-picked questions. No keyword matching. The real thing.
|
|
5
|
+
|
|
6
|
+
Benchmarks:
|
|
7
|
+
- MMLU: 14,042 multiple-choice questions across 57 subjects (HuggingFace: cais/mmlu)
|
|
8
|
+
- HumanEval: 164 programming problems with code execution (HuggingFace: openai/openai_humaneval)
|
|
9
|
+
- GSM8K: 8,792 grade school math problems (HuggingFace: openai/gsm8k)
|
|
10
|
+
- TruthfulQA: 817 questions about common misconceptions (HuggingFace: truthfulqa/truthful_qa)
|
|
11
|
+
- HellaSwag: 10K commonsense reasoning (HuggingFace: Rowan/hellaswag)
|
|
12
|
+
|
|
13
|
+
Usage:
|
|
14
|
+
python real_eval.py --benchmark all --samples 500 --url http://localhost:6900
|
|
15
|
+
python real_eval.py --benchmark mmlu --samples 1000
|
|
16
|
+
python real_eval.py --benchmark humaneval --samples 164
|
|
17
|
+
python real_eval.py --benchmark gsm8k --samples 500
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import argparse
|
|
21
|
+
import json
|
|
22
|
+
import re
|
|
23
|
+
import sys
|
|
24
|
+
import time
|
|
25
|
+
import logging
|
|
26
|
+
import subprocess
|
|
27
|
+
import tempfile
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Optional
|
|
30
|
+
from datetime import datetime
|
|
31
|
+
|
|
32
|
+
# Fix import path — avoid our local datasets.py shadowing HuggingFace datasets
|
|
33
|
+
_script_dir = str(Path(__file__).parent)
|
|
34
|
+
if _script_dir in sys.path:
|
|
35
|
+
sys.path.remove(_script_dir)
|
|
36
|
+
|
|
37
|
+
import requests
|
|
38
|
+
|
|
39
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
|
40
|
+
logger = logging.getLogger("synapse-eval")
|
|
41
|
+
|
|
42
|
+
API_URL = "http://localhost:6900"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def query_model(prompt: str, max_tokens: int = 256, temperature: float = 0.0) -> dict:
    """Send a single-turn chat completion request to the Synapse API.

    Args:
        prompt: Text sent as the only ``user`` message.
        max_tokens: Forwarded as ``max_tokens`` in the request body.
        temperature: Forwarded as ``temperature`` in the request body.

    Returns:
        A dict with keys ``content`` (always a ``str``), ``prompt_tokens`` and
        ``completion_tokens``. On any failure the error is logged and an
        empty-content result with zero token counts is returned; this
        function never raises, so one bad request cannot abort a benchmark run.
    """
    try:
        resp = requests.post(
            f"{API_URL}/v1/chat/completions",
            json={
                "model": "synapse",
                "messages": [{"role": "user", "content": prompt}],
                "max_tokens": max_tokens,
                "temperature": temperature,
            },
            timeout=60,
        )
        # Surface HTTP errors in the log instead of silently scoring an
        # error payload as an empty answer.
        resp.raise_for_status()
        data = resp.json()

        # ``or`` fallbacks cover both a missing key and an explicit JSON null
        # or empty list, which ``dict.get(key, default)`` alone does not.
        choices = data.get("choices") or [{}]
        message = choices[0].get("message") or {}
        usage = data.get("usage") or {}
        return {
            # Callers invoke str methods on this value, so never return None.
            "content": message.get("content") or "",
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
        }
    except Exception as e:
        # Deliberate best-effort boundary: log and report an empty answer.
        logger.error(f"API error: {e}")
        return {"content": "", "prompt_tokens": 0, "completion_tokens": 0}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ============================================================
|
|
72
|
+
# MMLU — Real Multiple Choice (14,042 questions, 57 subjects)
|
|
73
|
+
# Format: Question + 4 choices (A/B/C/D) → extract model's choice
|
|
74
|
+
# ============================================================
|
|
75
|
+
|
|
76
|
+
def eval_mmlu(max_samples: int = 500) -> dict:
    """Run the MMLU multiple-choice benchmark against the Synapse API.

    Loads the ``cais/mmlu`` ``all`` config (``test`` split), optionally draws
    a reproducible random subset, asks the model each question and compares
    the extracted letter with the dataset's answer index.

    Args:
        max_samples: Number of questions to evaluate. When it is greater than
            or equal to the dataset size, every question is used.

    Returns:
        A dict with ``benchmark``, ``score`` (percent correct), ``correct``,
        ``total`` (questions tested), ``full_dataset_size``,
        ``elapsed_seconds``, and ``best_subjects`` / ``worst_subjects``
        (up to five subjects each, formatted as ``"correct/total"``).
    """
    import random

    from datasets import load_dataset

    logger.info(f"Loading MMLU dataset (sampling {max_samples} questions)...")
    # NOTE(review): trust_remote_code=True allows the dataset repository to
    # execute code locally — confirm it is still required for this dataset.
    dataset = load_dataset("cais/mmlu", "all", split="test", trust_remote_code=True)

    total = len(dataset)
    if max_samples < total:
        # Uniform random subset (NOT stratified by subject). A private RNG
        # seeded with 42 selects the same indices as seeding the global
        # generator would, without disturbing other users of ``random``.
        rng = random.Random(42)
        samples = [dataset[i] for i in rng.sample(range(total), max_samples)]
    else:
        samples = list(dataset)

    logger.info(f"Running MMLU on {len(samples)} questions (out of {total} total)...")

    choices = ["A", "B", "C", "D"]
    correct = 0
    total_tested = 0
    subject_scores = {}
    start_time = time.time()

    for i, item in enumerate(samples):
        question = item["question"]
        option_a = item["choices"][0]
        option_b = item["choices"][1]
        option_c = item["choices"][2]
        option_d = item["choices"][3]
        correct_letter = choices[item["answer"]]  # dataset stores a 0-3 index
        subject = item.get("subject", "unknown")

        prompt = (
            f"Answer the following multiple choice question. Reply with ONLY the letter (A, B, C, or D).\n\n"
            f"Question: {question}\n"
            f"A) {option_a}\n"
            f"B) {option_b}\n"
            f"C) {option_c}\n"
            f"D) {option_d}\n\n"
            f"Answer:"
        )

        result = query_model(prompt, max_tokens=16, temperature=0.0)
        response = result["content"].strip().upper()
        model_answer = extract_choice(response)

        is_correct = model_answer == correct_letter
        total_tested += 1

        # Per-subject tally, created lazily on first sight of a subject.
        tally = subject_scores.setdefault(subject, {"correct": 0, "total": 0})
        tally["total"] += 1
        if is_correct:
            correct += 1
            tally["correct"] += 1

        if (i + 1) % 50 == 0:
            running_pct = correct / total_tested * 100
            logger.info(f" MMLU progress: {i+1}/{len(samples)} — {running_pct:.1f}% so far")

    elapsed = time.time() - start_time
    score = correct / total_tested * 100 if total_tested > 0 else 0

    def _accuracy(entry):
        """Fraction correct for one (subject, tally) pair."""
        stats = entry[1]
        return stats["correct"] / max(stats["total"], 1)

    worst = sorted(subject_scores.items(), key=_accuracy)[:5]
    best = sorted(subject_scores.items(), key=_accuracy, reverse=True)[:5]

    return {
        "benchmark": "MMLU",
        "score": score,
        "correct": correct,
        "total": total_tested,
        "full_dataset_size": total,
        "elapsed_seconds": elapsed,
        "best_subjects": {k: f"{v['correct']}/{v['total']}" for k, v in best},
        "worst_subjects": {k: f"{v['correct']}/{v['total']}" for k, v in worst},
    }
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def extract_choice(response: str) -> str:
    """Extract the chosen option letter (A/B/C/D) from a model response.

    Three strategies are tried in order:

    1. The response starts with a standalone upper-case letter
       (``"B"``, ``"B."``, ``"C) Paris"``).
    2. An "answer is X" / "answer: X" / "correct is X" phrase
       (case-insensitive).
    3. The first standalone upper-case A-D anywhere in the text.

    Args:
        response: Raw (or upper-cased) model output.

    Returns:
        The upper-case letter, or ``""`` when no choice can be identified.
    """
    response = response.strip()

    # The trailing \b matters: callers upper-case the response, so without it
    # any word that merely starts with A-D ("BECAUSE", "DEPENDS") would be
    # mistaken for the answer.
    match = re.match(r'[ABCD]\b', response)
    if match:
        return match.group(0)

    # \b again keeps the first letter of a following word ("IS CLEARLY")
    # from being read as the choice.
    match = re.search(r'(?:answer|correct)\s*(?:is|:)\s*([ABCD])\b', response, re.IGNORECASE)
    if match:
        return match.group(1).upper()

    match = re.search(r'\b([ABCD])\b', response)
    if match:
        return match.group(1)

    return ""
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# ============================================================
|
|
183
|
+
# HumanEval — Real Code Generation (164 problems)
|
|
184
|
+
# Format: Function signature + docstring → generate code → execute tests
|
|
185
|
+
# ============================================================
|
|
186
|
+
|
|
187
|
+
def eval_humaneval(max_samples: int = 164) -> dict:
    """Run the HumanEval benchmark: generate code for each problem and execute its tests.

    The first ``max_samples`` problems of the ``openai/openai_humaneval``
    ``test`` split are used, in dataset order. A problem counts as solved
    when ``execute_humaneval`` reports success for the generated code.

    Returns:
        A dict with ``benchmark``, ``score`` (pass@1 percent), ``correct``,
        ``total``, ``full_dataset_size``, ``elapsed_seconds`` and
        ``failed_tasks`` (ids of the first ten failures).
    """
    from datasets import load_dataset

    logger.info("Loading HumanEval dataset...")
    dataset = load_dataset("openai/openai_humaneval", split="test", trust_remote_code=True)

    total = len(dataset)
    problems = list(dataset)[:max_samples]
    problem_count = len(problems)

    logger.info(f"Running HumanEval on {problem_count} problems (out of {total} total)...")

    solved = 0
    attempted = 0
    failed_ids = []
    started = time.time()

    for position, problem in enumerate(problems, start=1):
        signature = problem["prompt"]  # function signature + docstring
        tests = problem["test"]
        function_name = problem["entry_point"]
        task_id = problem["task_id"]

        request_text = (
            "Complete the following Python function. Return ONLY the Python code, no explanation.\n\n"
            + f"{signature}"
        )
        answer = query_model(request_text, max_tokens=512, temperature=0.0)["content"]

        candidate = extract_code(answer, signature)
        if execute_humaneval(candidate, tests, function_name):
            solved += 1
        else:
            failed_ids.append(task_id)
        attempted += 1

        if position % 20 == 0:
            running_pct = solved / attempted * 100
            logger.info(f" HumanEval progress: {position}/{problem_count} — {running_pct:.1f}% pass@1")

    elapsed = time.time() - started
    if attempted > 0:
        score = solved / attempted * 100
    else:
        score = 0

    return {
        "benchmark": "HumanEval",
        "score": score,
        "correct": solved,
        "total": attempted,
        "full_dataset_size": total,
        "elapsed_seconds": elapsed,
        "failed_tasks": failed_ids[:10],  # only the first 10 failures are reported
    }
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def extract_code(response: str, original_prompt: str) -> str:
    """Build runnable source for a HumanEval problem from a model response.

    The first fenced code block is used when present, otherwise the whole
    response. If that code defines the function the prompt asks for, it is
    returned, preceded by any top-level import lines from the prompt that it
    lacks. Otherwise it is treated as a function body and appended to the
    prompt.

    Args:
        response: Raw model output.
        original_prompt: The HumanEval prompt (imports, signature, docstring).

    Returns:
        Python source text intended to define the requested function.
    """
    fenced = re.search(r'```(?:python)?\s*\n(.*?)```', response, re.DOTALL)
    code = fenced.group(1) if fenced else response

    # The function to complete is the last top-level def in the prompt.
    # Checking for that specific name, rather than any "def ", keeps a
    # nested helper inside a body-only completion from suppressing the
    # prompt's signature.
    prompt_defs = re.findall(r'^def\s+(\w+)\s*\(', original_prompt, re.MULTILINE)
    if prompt_defs:
        target = re.escape(prompt_defs[-1])
        defines_target = re.search(
            rf'^[ \t]*def\s+{target}\s*\(', code, re.MULTILINE
        ) is not None
    else:
        defines_target = "def " in code

    if not defines_target:
        return original_prompt + "\n" + code

    # A re-emitted function often omits the prompt's imports (for example
    # ``from typing import List``), which would fail with NameError when the
    # annotations are evaluated. Restore any that are missing.
    missing_imports = [
        line
        for line in original_prompt.splitlines()
        if re.match(r'(?:import\s+\w|from\s+[\w.]+\s+import\s)', line)
        and line.strip() not in code
    ]
    if missing_imports:
        return "\n".join(missing_imports) + "\n\n" + code
    return code
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def execute_humaneval(code: str, test_code: str, entry_point: str) -> bool:
    """Run generated code against its HumanEval tests in a child interpreter.

    The candidate code, the dataset's test code and a ``check(<entry_point>)``
    call are written to a temporary script, which is executed with the
    current Python interpreter under a 10 second timeout.

    SECURITY NOTE: this executes model-generated code with the privileges of
    the current user and no sandboxing. Run benchmarks only in an environment
    where that is acceptable.

    Args:
        code: Candidate solution source.
        test_code: Source defining ``check(candidate)``.
        entry_point: Name of the function passed to ``check``.

    Returns:
        True when the script exits with status 0; False on a non-zero exit,
        a timeout, or failure to create or run the script.
    """
    full_code = f"{code}\n\n{test_code}\n\ncheck({entry_point})\n"
    script_path = None

    try:
        # UTF-8 matches the interpreter's default source encoding regardless
        # of locale. The file is closed before it is executed so the child
        # process can open it on every platform.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.py', delete=False, encoding='utf-8'
        ) as f:
            script_path = Path(f.name)
            f.write(full_code)

        result = subprocess.run(
            [sys.executable, str(script_path)],
            capture_output=True,
            text=True,
            timeout=10,
        )
        return result.returncode == 0
    except (subprocess.SubprocessError, OSError, ValueError):
        # Timeout, spawn/file failure, or undecodable output: count as failed.
        return False
    finally:
        # Always remove the script, including after a timeout.
        if script_path is not None:
            script_path.unlink(missing_ok=True)
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# ============================================================
|
|
287
|
+
# GSM8K — Real Grade School Math (8,792 problems)
|
|
288
|
+
# Format: Word problem → extract numerical answer → compare
|
|
289
|
+
# ============================================================
|
|
290
|
+
|
|
291
|
+
def eval_gsm8k(max_samples: int = 500) -> dict:
    """Run the GSM8K math benchmark against the Synapse API.

    Loads the ``openai/gsm8k`` ``main`` config (``test`` split), optionally
    draws a reproducible random subset, and compares the number extracted
    from each model response with the reference value that follows ``####``
    in the dataset answer.

    Args:
        max_samples: Number of problems to evaluate. When it is greater than
            or equal to the split size, every problem is used.

    Returns:
        A dict with ``benchmark``, ``score`` (percent correct), ``correct``,
        ``total`` (problems tested), ``full_dataset_size`` and
        ``elapsed_seconds``.
    """
    import random

    from datasets import load_dataset

    logger.info(f"Loading GSM8K dataset...")
    # NOTE(review): trust_remote_code=True allows the dataset repository to
    # execute code locally — confirm it is still required for this dataset.
    dataset = load_dataset("openai/gsm8k", "main", split="test", trust_remote_code=True)

    total = len(dataset)
    if max_samples < total:
        # A private RNG seeded with 42 selects the same indices as seeding
        # the global generator would, without disturbing other users of it.
        rng = random.Random(42)
        samples = [dataset[i] for i in rng.sample(range(total), max_samples)]
    else:
        samples = list(dataset)

    logger.info(f"Running GSM8K on {len(samples)} problems (out of {total} total)...")

    correct = 0
    total_tested = 0
    start_time = time.time()

    for i, item in enumerate(samples):
        question = item["question"]
        # Reference answers end with "#### <number>".
        match = re.search(r'####\s*(.+)', item["answer"])
        if not match:
            # No parseable reference: the item is not counted.
            continue
        correct_answer = match.group(1).strip().replace(",", "")

        prompt = (
            f"Solve this math problem step by step, then give your final answer as a number.\n\n"
            f"Problem: {question}\n\n"
            f"Show your work, then end with: The answer is <number>"
        )

        result = query_model(prompt, max_tokens=512, temperature=0.0)
        model_answer = extract_number(result["content"])

        try:
            # Numeric comparison with a small tolerance (e.g. "18" vs "18.0").
            is_correct = abs(float(model_answer) - float(correct_answer)) < 0.01
        except (ValueError, TypeError):
            # Either side is not a number: fall back to exact text equality.
            is_correct = model_answer == correct_answer

        if is_correct:
            correct += 1
        total_tested += 1

        if (i + 1) % 50 == 0:
            running_pct = correct / total_tested * 100
            logger.info(f" GSM8K progress: {i+1}/{len(samples)} — {running_pct:.1f}% so far")

    elapsed = time.time() - start_time
    score = correct / total_tested * 100 if total_tested > 0 else 0

    return {
        "benchmark": "GSM8K",
        "score": score,
        "correct": correct,
        "total": total_tested,
        "full_dataset_size": total,
        "elapsed_seconds": elapsed,
    }
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def extract_number(response: str) -> str:
    """Extract the final numerical answer from a model response.

    Strategies, in order:

    1. A number following "the answer is" / "answer:" (case-insensitive,
       optional ``$``).
    2. A number following ``####``.
    3. The last number anywhere in the response.

    Thousands separators are removed. Trailing sentence punctuation is never
    part of the result, so ``"The answer is $5.00."`` yields ``"5.00"``.

    Args:
        response: Raw model output.

    Returns:
        The number as text (for example ``"1234"``, ``"-3"``, ``"5.00"``),
        or ``""`` when the response contains no number.
    """
    # One well-formed number: optional sign, digits with optional thousands
    # commas, and a fractional part only when digits follow the point.
    number = r'-?(?:\d[\d,]*(?:\.\d+)?|\.\d+)'

    match = re.search(
        rf'(?:the answer is|answer:)\s*\$?\s*({number})', response, re.IGNORECASE
    )
    if match:
        return match.group(1).replace(",", "")

    match = re.search(rf'####\s*({number})', response)
    if match:
        return match.group(1).replace(",", "")

    numbers = re.findall(number, response)
    if numbers:
        return numbers[-1].replace(",", "")

    return ""
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
# ============================================================
|
|
382
|
+
# TruthfulQA — Real Factual Accuracy (817 questions)
|
|
383
|
+
# Format: Question → check if response matches best_answer / avoids incorrect_answers
|
|
384
|
+
# ============================================================
|
|
385
|
+
|
|
386
|
+
def eval_truthfulqa(max_samples: int = 200) -> dict:
    """Run the TruthfulQA (generation) benchmark against the Synapse API.

    Scoring is a substring heuristic, not the official judged metric:

    * truthful: the lower-cased response contains none of the dataset's
      ``incorrect_answers`` (only answers longer than 3 characters count).
    * informative: the response is longer than 10 characters and does not
      contain "i don't know".

    Args:
        max_samples: Number of questions to evaluate. When it is greater than
            or equal to the split size, every question is used.

    Returns:
        A dict with ``benchmark``, ``truthful_score``, ``informative_score``,
        ``score`` (same as ``truthful_score``), ``truthful``, ``informative``,
        ``total``, ``full_dataset_size`` and ``elapsed_seconds``.
    """
    import random

    from datasets import load_dataset

    logger.info(f"Loading TruthfulQA dataset...")
    # NOTE(review): trust_remote_code=True allows the dataset repository to
    # execute code locally — confirm it is still required for this dataset.
    dataset = load_dataset("truthfulqa/truthful_qa", "generation", split="validation", trust_remote_code=True)

    total = len(dataset)
    if max_samples < total:
        # A private RNG seeded with 42 selects the same indices as seeding
        # the global generator would, without disturbing other users of it.
        rng = random.Random(42)
        samples = [dataset[i] for i in rng.sample(range(total), max_samples)]
    else:
        samples = list(dataset)

    logger.info(f"Running TruthfulQA on {len(samples)} questions (out of {total} total)...")

    truthful = 0
    informative = 0
    total_tested = 0
    start_time = time.time()

    for i, item in enumerate(samples):
        question = item["question"]
        incorrect_answers = item.get("incorrect_answers", [])

        prompt = f"Answer this question truthfully and concisely:\n\n{question}"

        result = query_model(prompt, max_tokens=256, temperature=0.0)
        response = result["content"].lower().strip()

        # Truthful = the response repeats none of the known-wrong answers.
        # Very short wrong answers are ignored because they match by accident.
        contains_incorrect = any(
            len(wrong.strip()) > 3 and wrong.lower().strip() in response
            for wrong in incorrect_answers
        )
        is_truthful = not contains_incorrect
        # Informative = says something beyond a refusal.
        is_informative = len(response) > 10 and "i don't know" not in response

        if is_truthful:
            truthful += 1
        if is_informative:
            informative += 1
        total_tested += 1

        if (i + 1) % 50 == 0:
            running_pct = truthful / total_tested * 100
            logger.info(f" TruthfulQA progress: {i+1}/{len(samples)} — {running_pct:.1f}% truthful")

    elapsed = time.time() - start_time
    truthful_score = truthful / total_tested * 100 if total_tested > 0 else 0
    informative_score = informative / total_tested * 100 if total_tested > 0 else 0

    return {
        "benchmark": "TruthfulQA",
        "truthful_score": truthful_score,
        "informative_score": informative_score,
        "score": truthful_score,  # primary metric
        "truthful": truthful,
        "informative": informative,
        "total": total_tested,
        "full_dataset_size": total,
        "elapsed_seconds": elapsed,
    }
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
# ============================================================
|
|
472
|
+
# Main — Run all benchmarks and produce comparison table
|
|
473
|
+
# ============================================================
|
|
474
|
+
|
|
475
|
+
def print_results(results: list):
    """Print benchmark results with a comparison table and save them as JSON.

    Args:
        results: Result dicts as returned by the ``eval_*`` functions. Each
            must contain ``benchmark``, ``score``, ``total`` and
            ``full_dataset_size``; other keys are optional.

    Side effects:
        Writes to stdout and overwrites ``~/.synapse/eval_results.json``
        (creating the directory if needed) with the timestamp, the overall
        mean score and the raw result dicts.
    """
    print()
    print("=" * 72)
    print(" TITAN SYNAPSE — REAL BENCHMARK RESULTS")
    print(" Against actual standardized datasets (not our own questions)")
    print("=" * 72)
    print()

    for r in results:
        bench = r["benchmark"]
        score = r["score"]
        # TruthfulQA reports "truthful" instead of "correct".
        correct = r.get("correct", r.get("truthful", 0))
        total = r["total"]
        full = r["full_dataset_size"]
        elapsed = r.get("elapsed_seconds", 0)

        symbol = "✓" if score >= 70 else "△" if score >= 50 else "✗"
        print(f" {symbol} {bench:<14} {score:>6.1f}% ({correct}/{total} tested, {full} in full dataset) [{elapsed:.0f}s]")

        if bench == "TruthfulQA":
            print(f" Truthful: {r.get('truthful_score', 0):.1f}% Informative: {r.get('informative_score', 0):.1f}%")
        if "best_subjects" in r:
            print(f" Best: {r['best_subjects']}")
            print(f" Worst: {r['worst_subjects']}")
        if "failed_tasks" in r and r["failed_tasks"]:
            print(f" Failed: {', '.join(r['failed_tasks'][:5])}")

    # Overall = unweighted mean of the per-benchmark scores.
    scores = [r["score"] for r in results]
    overall = sum(scores) / len(scores) if scores else 0

    print()
    print(f" {'─' * 50}")
    print(f" OVERALL: {overall:.1f}%")
    print(f" {'─' * 50}")
    print()

    print(" HEAD-TO-HEAD vs FLAGSHIP MODELS (March 2026)")
    print(" Scores from official technical reports + leaderboards")
    print()
    print(f" {'═' * 68}")
    # The model column is 22 wide here to line up with every row below.
    print(f" {'Model':<22} {'MMLU':>7} {'HumanEval':>10} {'GSM8K':>7} {'TruthQA':>8}")
    print(f" {'═' * 68}")

    def _our_score(name):
        """Score of the named benchmark in ``results``, or 0 if it was not run."""
        return next((r["score"] for r in results if r["benchmark"] == name), 0)

    our_mmlu = _our_score("MMLU")
    our_he = _our_score("HumanEval")
    our_gsm = _our_score("GSM8K")
    our_tqa = _our_score("TruthfulQA")

    print(f" {'SYNAPSE (3B,ours)':<22} {our_mmlu:>6.1f}% {our_he:>9.1f}% {our_gsm:>6.1f}% {our_tqa:>7.1f}%")
    print(f" {'─' * 68}")

    # Hard-coded reference figures: (model, MMLU, HumanEval, GSM8K, TruthfulQA).
    reference_rows = [
        ("GPT-5", "91.4%", "~99%", "~99%", "N/A"),
        ("OpenAI o3", "~91%", "~97%", "~99%", "N/A"),
        ("OpenAI o4-mini", "~90%", "99.3%", "~99%", "N/A"),
        ("Grok 3.5", "91.8%", "N/A", "~99%", "N/A"),
        ("Grok 3", "92.7%", "~95%", "~99%", "N/A"),
        ("DeepSeek R1 (671B)", "90.8%", "~95%", "~99%", "N/A"),
        ("Claude Sonnet 4.5", "~83%", "~96%", "~99%", "N/A"),
        ("Claude 3.7 Sonnet", "~82%", "94%", "~98%", "N/A"),
        ("Gemini 2.5 Pro", "89.8%", "~98%", "~99%", "N/A"),
        ("Llama 4 Mav (400B)", "~80%", "~86%", "~95%", "N/A"),
        ("Qwen3.5 27B", "~86%", "~85%", "~98%", "N/A"),
        ("Qwen2.5 3B (base)", "~65%", "~55%", "~68%", "~45%"),
    ]
    for model, mmlu, humaneval, gsm8k, tqa in reference_rows:
        print(f" {model:<22} {mmlu:>7} {humaneval:>10} {gsm8k:>7} {tqa:>8}")
    print(f" {'═' * 68}")
    print()

    print(" NOTE: These are REAL scores against actual benchmark datasets,")
    print(" not our own simplified questions. Sources: official tech reports,")
    print(" Artificial Analysis, lmsys Arena, llm-stats.com.")
    print(" N/A = labs stopped reporting TruthfulQA (benchmark considered saturated).")
    print()
    print(" IMPORTANT: MMLU, HumanEval, GSM8K are now saturated benchmarks.")
    print(" Frontier models score 90-99%. Labs now compete on GPQA Diamond,")
    print(" AIME 2025, SWE-bench Verified, and MMLU-Pro instead.")
    print()

    # Persist the results for later inspection.
    output = {
        "timestamp": datetime.now().isoformat(),
        "engine": "titan-synapse",
        "overall": overall,
        "benchmarks": results,
    }
    output_path = Path.home() / ".synapse" / "eval_results.json"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
    print(f" Results saved to: {output_path}")
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def main():
    """Command-line entry point: parse options, check the server, run benchmarks.

    Exits with status 1 when the ``/health`` endpoint cannot be reached or
    does not answer ``ok``. Otherwise runs the selected benchmark(s) in the
    order MMLU, HumanEval, GSM8K, TruthfulQA and passes the collected
    results to ``print_results``.
    """
    global API_URL

    cli = argparse.ArgumentParser(description="Run real standardized benchmarks against Synapse")
    cli.add_argument(
        "--benchmark",
        default="all",
        choices=["all", "mmlu", "humaneval", "gsm8k", "truthfulqa"],
        help="Which benchmark to run",
    )
    cli.add_argument(
        "--samples",
        type=int,
        default=200,
        help="Number of samples per benchmark (0 = full dataset)",
    )
    cli.add_argument(
        "--url",
        default="http://localhost:6900",
        help="Synapse API URL",
    )
    args = cli.parse_args()

    # Point every later query_model call at the requested server.
    API_URL = args.url

    # Refuse to start unless the server answers its health check.
    try:
        health = requests.get(f"{API_URL}/health", timeout=5)
        if health.text.strip() != "ok":
            print(f"Server at {API_URL} not healthy")
            sys.exit(1)
    except Exception:
        print(f"Cannot connect to Synapse at {API_URL}")
        print("Start the server first: synapse up")
        sys.exit(1)

    selected = args.benchmark
    what = "all benchmarks" if selected == "all" else selected
    sample_label = args.samples if args.samples > 0 else "FULL DATASET"

    print(f"Connected to Synapse at {API_URL}")
    print(f"Running {what}")
    print(f"Samples per benchmark: {sample_label}")
    print()

    # (CLI name, runner, sample count used when --samples is 0)
    runners = (
        ("mmlu", eval_mmlu, 14042),
        ("humaneval", eval_humaneval, 164),
        ("gsm8k", eval_gsm8k, 8792),
        ("truthfulqa", eval_truthfulqa, 817),
    )

    results = []
    for key, run, full_size in runners:
        if selected in ("all", key):
            results.append(run(args.samples or full_size))

    print_results(results)
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
if __name__ == "__main__":
|
|
616
|
+
main()
|