titan-synapse 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -66
- package/package.json +1 -1
- package/paper/synapse_architecture.md +542 -0
- package/python/synapse_learn/bench_merged.py +380 -0
- package/python/synapse_learn/merge_model.py +136 -21
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Full benchmark suite for Synapse-3B merged model.
|
|
3
|
+
|
|
4
|
+
Runs standard evaluations that match published benchmarks for Qwen2-3B class models:
|
|
5
|
+
- GSM8K (math reasoning) — full test set or configurable N
|
|
6
|
+
- HumanEval (code generation) — full 164 problems
|
|
7
|
+
- MMLU (general knowledge) — 5-shot, standard benchmark
|
|
8
|
+
- Speed test — tok/s on RTX 5090
|
|
9
|
+
|
|
10
|
+
Apples-to-apples comparison against published Qwen2-3B scores.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
import torch
|
|
16
|
+
import json
|
|
17
|
+
import time
|
|
18
|
+
import re
|
|
19
|
+
import traceback
|
|
20
|
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
21
|
+
|
|
22
|
+
MODEL_DIR = os.path.expanduser("~/.synapse/merged/synapse-3b")
|
|
23
|
+
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
24
|
+
print(f"[INFO] Device: {DEVICE}")
|
|
25
|
+
if DEVICE == "cuda":
|
|
26
|
+
print(f"[INFO] GPU: {torch.cuda.get_device_name(0)}")
|
|
27
|
+
print(f"[INFO] VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
|
|
28
|
+
|
|
29
|
+
print(f"\nLoading Synapse-3B from {MODEL_DIR}...")
|
|
30
|
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
|
|
31
|
+
model = AutoModelForCausalLM.from_pretrained(
|
|
32
|
+
MODEL_DIR,
|
|
33
|
+
torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
|
|
34
|
+
device_map=DEVICE,
|
|
35
|
+
)
|
|
36
|
+
print(f"Model loaded: {type(model).__name__} on {DEVICE}")
|
|
37
|
+
print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e9:.2f}B")
|
|
38
|
+
|
|
39
|
+
results = {}
|
|
40
|
+
|
|
41
|
+
# ============================================================
|
|
42
|
+
# 1. GSM8K — Math Reasoning (standard: 8-shot CoT)
|
|
43
|
+
# Published Qwen2-3B: ~50-55% on GSM8K
|
|
44
|
+
# ============================================================
|
|
45
|
+
print("\n" + "="*60)
|
|
46
|
+
print("BENCHMARK 1: GSM8K (Math Reasoning)")
|
|
47
|
+
print("="*60)
|
|
48
|
+
|
|
49
|
+
from datasets import load_dataset
|
|
50
|
+
|
|
51
|
+
gsm = load_dataset("openai/gsm8k", "main", split="test")
|
|
52
|
+
GSM_N = int(os.environ.get("GSM_N", len(gsm))) # default: full test set (1319)
|
|
53
|
+
print(f"Running {GSM_N} / {len(gsm)} problems")
|
|
54
|
+
|
|
55
|
+
# 8-shot examples (standard for GSM8K benchmark)
|
|
56
|
+
FEW_SHOT_EXAMPLES = """Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
|
|
57
|
+
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. #### 6
|
|
58
|
+
|
|
59
|
+
Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
|
|
60
|
+
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. #### 5
|
|
61
|
+
|
|
62
|
+
Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
|
|
63
|
+
A: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. #### 39
|
|
64
|
+
|
|
65
|
+
Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
|
|
66
|
+
A: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. #### 8
|
|
67
|
+
|
|
68
|
+
Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
|
|
69
|
+
A: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. #### 9
|
|
70
|
+
|
|
71
|
+
Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
|
|
72
|
+
A: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 4 * 5 = 20 computers were added. 9 + 20 = 29. #### 29
|
|
73
|
+
|
|
74
|
+
Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?
|
|
75
|
+
A: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33. #### 33
|
|
76
|
+
|
|
77
|
+
Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
|
|
78
|
+
A: Olivia had 23 dollars. 5 bagels for 3 dollars each = 5 * 3 = 15 dollars. 23 - 15 = 8. #### 8"""
|
|
79
|
+
|
|
80
|
+
def extract_gsm_answer(text):
    """Extract the model's final numeric answer from a GSM8K-style response.

    The answer is the number following the last ``####`` marker; if no marker
    is present, the last number in the text is used instead. Returns the
    number as a normalized string (no thousands separators, no trailing
    ``.0``), or ``""`` when the text contains no number at all.
    """
    # With a "Q: ... A: ..." few-shot prompt the model often keeps going and
    # invents further Q/A pairs. Only the text before the first follow-up
    # question belongs to the answer being graded.
    answer_region = text.split("\nQ:", 1)[0]

    # A number is: optional sign, digits with optional thousands commas, and
    # an optional decimal part. Requiring a digit after the dot keeps a
    # sentence-ending period ("#### 6.") out of the capture.
    number = r'-?\d[\d,]*(?:\.\d+)?'

    marked = re.findall(r'####\s*\$?\s*(' + number + r')', answer_region)
    if marked:
        raw = marked[-1]
    else:
        # Fallback: last number in the text
        loose = re.findall(number, answer_region)
        if not loose:
            return ""
        raw = loose[-1]

    value = raw.replace(",", "")
    # Gold answers are compared as strings, so "18.0" must collapse to "18".
    if "." in value:
        value = value.rstrip("0").rstrip(".")
    return value
|
|
88
|
+
|
|
89
|
+
correct = 0
|
|
90
|
+
for i in range(GSM_N):
|
|
91
|
+
q = gsm[i]["question"]
|
|
92
|
+
gold = gsm[i]["answer"].split("####")[-1].strip().replace(",", "")
|
|
93
|
+
|
|
94
|
+
prompt = f"{FEW_SHOT_EXAMPLES}\n\nQ: {q}\nA:"
|
|
95
|
+
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(DEVICE)
|
|
96
|
+
with torch.no_grad():
|
|
97
|
+
out = model.generate(**inputs, max_new_tokens=256, temperature=0.0, do_sample=False)
|
|
98
|
+
response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
|
99
|
+
pred = extract_gsm_answer(response)
|
|
100
|
+
|
|
101
|
+
if pred == gold:
|
|
102
|
+
correct += 1
|
|
103
|
+
|
|
104
|
+
if (i + 1) % 50 == 0 or i == GSM_N - 1:
|
|
105
|
+
print(f" GSM8K: {i+1}/{GSM_N} done — {correct}/{i+1} correct ({correct/(i+1)*100:.1f}%)")
|
|
106
|
+
|
|
107
|
+
gsm_score = correct / GSM_N * 100
|
|
108
|
+
results["GSM8K"] = {"score": round(gsm_score, 1), "correct": correct, "total": GSM_N}
|
|
109
|
+
print(f"\nGSM8K Final: {gsm_score:.1f}% ({correct}/{GSM_N})")
|
|
110
|
+
|
|
111
|
+
# ============================================================
|
|
112
|
+
# 2. HumanEval — Code Generation (pass@1)
|
|
113
|
+
# Published Qwen2-3B: ~30-40% HumanEval
|
|
114
|
+
# ============================================================
|
|
115
|
+
print("\n" + "="*60)
|
|
116
|
+
print("BENCHMARK 2: HumanEval (Code Generation, pass@1)")
|
|
117
|
+
print("="*60)
|
|
118
|
+
|
|
119
|
+
try:
|
|
120
|
+
he = load_dataset("openai/openai_humaneval", split="test")
|
|
121
|
+
HE_N = int(os.environ.get("HE_N", len(he))) # default: full 164 problems
|
|
122
|
+
print(f"Running {HE_N} / {len(he)} problems")
|
|
123
|
+
|
|
124
|
+
code_correct = 0
|
|
125
|
+
code_errors = 0
|
|
126
|
+
for i in range(HE_N):
|
|
127
|
+
prompt = he[i]["prompt"]
|
|
128
|
+
test_code = he[i]["test"]
|
|
129
|
+
entry_point = he[i]["entry_point"]
|
|
130
|
+
|
|
131
|
+
messages = [{"role": "user", "content": f"Complete this Python function. Return ONLY the function body, no explanation:\n\n{prompt}"}]
|
|
132
|
+
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
133
|
+
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048).to(DEVICE)
|
|
134
|
+
|
|
135
|
+
with torch.no_grad():
|
|
136
|
+
out = model.generate(**inputs, max_new_tokens=512, temperature=0.0, do_sample=False)
|
|
137
|
+
response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
|
138
|
+
|
|
139
|
+
# Extract code
|
|
140
|
+
if "```python" in response:
|
|
141
|
+
code = response.split("```python")[1].split("```")[0]
|
|
142
|
+
elif "```" in response:
|
|
143
|
+
parts = response.split("```")
|
|
144
|
+
code = parts[1] if len(parts) > 1 else response
|
|
145
|
+
else:
|
|
146
|
+
code = response
|
|
147
|
+
|
|
148
|
+
# Assemble the candidate program: the dataset prompt (signature + docstring)
# followed by the model's completion.
full_code = prompt + code
try:
    exec_globals = {}
    # HumanEval's `test` field only *defines* check(candidate); it never
    # calls it. Without the explicit check(<entry_point>) call below, any
    # completion that merely parses would be counted as a pass.
    # NOTE(review): this exec()s model-generated code in-process with no
    # sandbox or timeout; an infinite loop in a completion will hang the run.
    exec(full_code + "\n" + test_code + f"\ncheck({entry_point})\n", exec_globals)
    code_correct += 1
except Exception:
    # Any failure (syntax error, runtime error, failed assertion) is a miss.
    code_errors += 1
|
|
155
|
+
|
|
156
|
+
if (i + 1) % 20 == 0 or i == HE_N - 1:
|
|
157
|
+
print(f" HumanEval: {i+1}/{HE_N} done — {code_correct}/{i+1} pass ({code_correct/(i+1)*100:.1f}%)")
|
|
158
|
+
|
|
159
|
+
he_score = code_correct / HE_N * 100
|
|
160
|
+
results["HumanEval"] = {"score": round(he_score, 1), "correct": code_correct, "total": HE_N}
|
|
161
|
+
print(f"\nHumanEval Final: {he_score:.1f}% ({code_correct}/{HE_N})")
|
|
162
|
+
except Exception as e:
|
|
163
|
+
print(f"HumanEval skipped: {e}")
|
|
164
|
+
traceback.print_exc()
|
|
165
|
+
|
|
166
|
+
# ============================================================
|
|
167
|
+
# 3. MMLU — General Knowledge (5-shot)
|
|
168
|
+
# Published Qwen2-3B: ~53-55% on MMLU
|
|
169
|
+
# ============================================================
|
|
170
|
+
print("\n" + "="*60)
|
|
171
|
+
print("BENCHMARK 3: MMLU (General Knowledge, 5-shot)")
|
|
172
|
+
print("="*60)
|
|
173
|
+
|
|
174
|
+
try:
|
|
175
|
+
# Use cais/mmlu which has all subjects
|
|
176
|
+
mmlu = load_dataset("cais/mmlu", "all", split="test")
|
|
177
|
+
mmlu_dev = load_dataset("cais/mmlu", "all", split="dev")
|
|
178
|
+
MMLU_N = int(os.environ.get("MMLU_N", len(mmlu)))
|
|
179
|
+
print(f"Running {MMLU_N} / {len(mmlu)} problems")
|
|
180
|
+
|
|
181
|
+
CHOICES = ["A", "B", "C", "D"]
|
|
182
|
+
|
|
183
|
+
def format_mmlu_question(item, few_shot_items=None):
    """Build the MMLU prompt for ``item``, preceded by up to five solved examples.

    The prompt opens with a subject header, lists each few-shot example with
    its lettered choices and answer letter, and ends with the target question
    followed by a bare ``Answer:`` for the model to complete.
    """
    def render_question(entry):
        # Question line followed by one lettered line per choice.
        lines = [f"Question: {entry['question']}\n"]
        lines.extend(
            f"{CHOICES[idx]}. {option}\n"
            for idx, option in enumerate(entry["choices"])
        )
        return "".join(lines)

    topic = item.get("subject", "general knowledge").replace("_", " ")
    pieces = [f"The following are multiple choice questions about {topic}.\n\n"]

    for shot in (few_shot_items or [])[:5]:
        pieces.append(render_question(shot))
        pieces.append(f"Answer: {CHOICES[shot['answer']]}\n\n")

    pieces.append(render_question(item))
    pieces.append("Answer:")
    return "".join(pieces)
|
|
200
|
+
|
|
201
|
+
# Group dev set by subject for few-shot
|
|
202
|
+
dev_by_subject = {}
|
|
203
|
+
for item in mmlu_dev:
|
|
204
|
+
subj = item.get("subject", "unknown")
|
|
205
|
+
if subj not in dev_by_subject:
|
|
206
|
+
dev_by_subject[subj] = []
|
|
207
|
+
dev_by_subject[subj].append(item)
|
|
208
|
+
|
|
209
|
+
mmlu_correct = 0
|
|
210
|
+
subject_results = {}
|
|
211
|
+
|
|
212
|
+
for i in range(MMLU_N):
|
|
213
|
+
item = mmlu[i]
|
|
214
|
+
subj = item.get("subject", "unknown")
|
|
215
|
+
few_shot = dev_by_subject.get(subj, [])[:5]
|
|
216
|
+
|
|
217
|
+
prompt = format_mmlu_question(item, few_shot)
|
|
218
|
+
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(DEVICE)
|
|
219
|
+
|
|
220
|
+
with torch.no_grad():
|
|
221
|
+
out = model.generate(**inputs, max_new_tokens=1, do_sample=False)
|
|
222
|
+
response = tokenizer.decode(out[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True).strip()
|
|
223
|
+
|
|
224
|
+
# Extract first letter answer
|
|
225
|
+
pred_letter = response[0].upper() if response and response[0].upper() in CHOICES else ""
|
|
226
|
+
gold_letter = CHOICES[item["answer"]]
|
|
227
|
+
|
|
228
|
+
is_correct = pred_letter == gold_letter
|
|
229
|
+
if is_correct:
|
|
230
|
+
mmlu_correct += 1
|
|
231
|
+
|
|
232
|
+
if subj not in subject_results:
|
|
233
|
+
subject_results[subj] = {"correct": 0, "total": 0}
|
|
234
|
+
subject_results[subj]["total"] += 1
|
|
235
|
+
if is_correct:
|
|
236
|
+
subject_results[subj]["correct"] += 1
|
|
237
|
+
|
|
238
|
+
if (i + 1) % 500 == 0 or i == MMLU_N - 1:
|
|
239
|
+
print(f" MMLU: {i+1}/{MMLU_N} done — {mmlu_correct}/{i+1} correct ({mmlu_correct/(i+1)*100:.1f}%)")
|
|
240
|
+
|
|
241
|
+
mmlu_score = mmlu_correct / MMLU_N * 100
|
|
242
|
+
results["MMLU"] = {"score": round(mmlu_score, 1), "correct": mmlu_correct, "total": MMLU_N}
|
|
243
|
+
|
|
244
|
+
# Top and bottom subjects
|
|
245
|
+
subject_scores = {}
|
|
246
|
+
for subj, data in subject_results.items():
|
|
247
|
+
if data["total"] >= 5:
|
|
248
|
+
subject_scores[subj] = data["correct"] / data["total"] * 100
|
|
249
|
+
top_subjects = sorted(subject_scores.items(), key=lambda x: -x[1])[:5]
|
|
250
|
+
bottom_subjects = sorted(subject_scores.items(), key=lambda x: x[1])[:5]
|
|
251
|
+
|
|
252
|
+
results["MMLU_top_subjects"] = {s: round(v, 1) for s, v in top_subjects}
|
|
253
|
+
results["MMLU_bottom_subjects"] = {s: round(v, 1) for s, v in bottom_subjects}
|
|
254
|
+
|
|
255
|
+
print(f"\nMMLU Final: {mmlu_score:.1f}% ({mmlu_correct}/{MMLU_N})")
|
|
256
|
+
print(f"Top subjects: {top_subjects[:3]}")
|
|
257
|
+
print(f"Bottom subjects: {bottom_subjects[:3]}")
|
|
258
|
+
|
|
259
|
+
except Exception as e:
|
|
260
|
+
print(f"MMLU skipped: {e}")
|
|
261
|
+
traceback.print_exc()
|
|
262
|
+
|
|
263
|
+
# ============================================================
|
|
264
|
+
# 4. Speed Benchmark
|
|
265
|
+
# ============================================================
|
|
266
|
+
print("\n" + "="*60)
|
|
267
|
+
print("BENCHMARK 4: Inference Speed")
|
|
268
|
+
print("="*60)
|
|
269
|
+
|
|
270
|
+
messages = [{"role": "user", "content": "Write a detailed explanation of how neural networks learn through backpropagation."}]
|
|
271
|
+
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
272
|
+
inputs = tokenizer(text, return_tensors="pt").to(DEVICE)
|
|
273
|
+
|
|
274
|
+
# Warmup (3 runs)
|
|
275
|
+
print("Warming up...")
|
|
276
|
+
for _ in range(3):
|
|
277
|
+
with torch.no_grad():
|
|
278
|
+
model.generate(**inputs, max_new_tokens=32, do_sample=False)
|
|
279
|
+
|
|
280
|
+
# Actual speed tests
|
|
281
|
+
speeds = []
|
|
282
|
+
for run in range(5):
|
|
283
|
+
if DEVICE == "cuda":
|
|
284
|
+
torch.cuda.synchronize()
|
|
285
|
+
start = time.time()
|
|
286
|
+
with torch.no_grad():
|
|
287
|
+
out = model.generate(**inputs, max_new_tokens=256, temperature=0.7, do_sample=True)
|
|
288
|
+
if DEVICE == "cuda":
|
|
289
|
+
torch.cuda.synchronize()
|
|
290
|
+
elapsed = time.time() - start
|
|
291
|
+
tokens = out.shape[1] - inputs["input_ids"].shape[1]
|
|
292
|
+
tok_s = tokens / elapsed
|
|
293
|
+
speeds.append(tok_s)
|
|
294
|
+
print(f" Run {run+1}: {tok_s:.1f} tok/s ({tokens} tokens in {elapsed:.2f}s)")
|
|
295
|
+
|
|
296
|
+
avg_speed = sum(speeds) / len(speeds)
|
|
297
|
+
max_speed = max(speeds)
|
|
298
|
+
min_speed = min(speeds)
|
|
299
|
+
|
|
300
|
+
results["speed"] = {
|
|
301
|
+
"avg_tok_s": round(avg_speed, 1),
|
|
302
|
+
"max_tok_s": round(max_speed, 1),
|
|
303
|
+
"min_tok_s": round(min_speed, 1),
|
|
304
|
+
"device": DEVICE,
|
|
305
|
+
"gpu": torch.cuda.get_device_name(0) if DEVICE == "cuda" else "N/A",
|
|
306
|
+
"dtype": "bfloat16" if DEVICE == "cuda" else "float32",
|
|
307
|
+
}
|
|
308
|
+
print(f"\nSpeed: avg {avg_speed:.1f} tok/s (min {min_speed:.1f}, max {max_speed:.1f})")
|
|
309
|
+
|
|
310
|
+
# ============================================================
|
|
311
|
+
# 5. TTFT (Time to First Token)
|
|
312
|
+
# ============================================================
|
|
313
|
+
print("\n" + "="*60)
|
|
314
|
+
print("BENCHMARK 5: Time to First Token (TTFT)")
|
|
315
|
+
print("="*60)
|
|
316
|
+
|
|
317
|
+
ttft_times = []
|
|
318
|
+
for run in range(10):
|
|
319
|
+
if DEVICE == "cuda":
|
|
320
|
+
torch.cuda.synchronize()
|
|
321
|
+
start = time.time()
|
|
322
|
+
with torch.no_grad():
|
|
323
|
+
out = model.generate(**inputs, max_new_tokens=1, do_sample=False)
|
|
324
|
+
if DEVICE == "cuda":
|
|
325
|
+
torch.cuda.synchronize()
|
|
326
|
+
ttft = (time.time() - start) * 1000 # ms
|
|
327
|
+
ttft_times.append(ttft)
|
|
328
|
+
|
|
329
|
+
avg_ttft = sum(ttft_times) / len(ttft_times)
|
|
330
|
+
p50_ttft = sorted(ttft_times)[5]
|
|
331
|
+
p99_ttft = sorted(ttft_times)[9]
|
|
332
|
+
|
|
333
|
+
results["ttft"] = {
|
|
334
|
+
"avg_ms": round(avg_ttft, 1),
|
|
335
|
+
"p50_ms": round(p50_ttft, 1),
|
|
336
|
+
"p99_ms": round(p99_ttft, 1),
|
|
337
|
+
}
|
|
338
|
+
print(f"TTFT: avg {avg_ttft:.1f}ms, p50 {p50_ttft:.1f}ms, p99 {p99_ttft:.1f}ms")
|
|
339
|
+
|
|
340
|
+
# ============================================================
|
|
341
|
+
# VRAM Usage
|
|
342
|
+
# ============================================================
|
|
343
|
+
if DEVICE == "cuda":
|
|
344
|
+
vram_used = torch.cuda.max_memory_allocated() / 1e9
|
|
345
|
+
vram_total = torch.cuda.get_device_properties(0).total_memory / 1e9
|
|
346
|
+
results["vram"] = {
|
|
347
|
+
"used_gb": round(vram_used, 2),
|
|
348
|
+
"total_gb": round(vram_total, 2),
|
|
349
|
+
"utilization_pct": round(vram_used / vram_total * 100, 1),
|
|
350
|
+
}
|
|
351
|
+
print(f"\nVRAM: {vram_used:.2f} GB / {vram_total:.2f} GB ({vram_used/vram_total*100:.1f}%)")
|
|
352
|
+
|
|
353
|
+
# ============================================================
|
|
354
|
+
# Summary
|
|
355
|
+
# ============================================================
|
|
356
|
+
print("\n" + "="*60)
|
|
357
|
+
print("FINAL RESULTS — Synapse-3B (TIES Merged)")
|
|
358
|
+
print("="*60)
|
|
359
|
+
|
|
360
|
+
# Reference scores for Qwen2-3B (published)
|
|
361
|
+
print(f"\n{'Benchmark':<20} {'Synapse-3B':>12} {'Qwen2-3B (ref)':>15}")
|
|
362
|
+
print("-" * 50)
|
|
363
|
+
if "GSM8K" in results:
|
|
364
|
+
print(f"{'GSM8K':<20} {results['GSM8K']['score']:>11.1f}% {'~54%':>15}")
|
|
365
|
+
if "HumanEval" in results:
|
|
366
|
+
print(f"{'HumanEval':<20} {results['HumanEval']['score']:>11.1f}% {'~36%':>15}")
|
|
367
|
+
if "MMLU" in results:
|
|
368
|
+
print(f"{'MMLU (5-shot)':<20} {results['MMLU']['score']:>11.1f}% {'~53%':>15}")
|
|
369
|
+
if "speed" in results:
|
|
370
|
+
print(f"{'Tok/s (avg)':<20} {results['speed']['avg_tok_s']:>11.1f} {'N/A':>15}")
|
|
371
|
+
if "ttft" in results:
|
|
372
|
+
print(f"{'TTFT (avg)':<20} {results['ttft']['avg_ms']:>10.1f}ms {'N/A':>15}")
|
|
373
|
+
|
|
374
|
+
print(json.dumps(results, indent=2))
|
|
375
|
+
|
|
376
|
+
# Save
|
|
377
|
+
out_path = "/tmp/synapse-bench-results.json"
|
|
378
|
+
with open(out_path, "w") as f:
|
|
379
|
+
json.dump(results, f, indent=2)
|
|
380
|
+
print(f"\nResults saved to {out_path}")
|
|
@@ -311,51 +311,166 @@ tags:
|
|
|
311
311
|
- specialist-swarm
|
|
312
312
|
- continuous-learning
|
|
313
313
|
- merged-model
|
|
314
|
+
- mamba
|
|
315
|
+
- xlstm
|
|
316
|
+
- mixture-of-experts
|
|
317
|
+
- fast-weights
|
|
318
|
+
- brain-inspired
|
|
319
|
+
- rust
|
|
320
|
+
- local-inference
|
|
314
321
|
base_model: {BASE_MODEL}
|
|
315
322
|
model_type: qwen2
|
|
323
|
+
pipeline_tag: text-generation
|
|
324
|
+
datasets:
|
|
325
|
+
- gsm8k
|
|
326
|
+
- openwebmath
|
|
327
|
+
- microsoft/orca-math-word-problems-200k
|
|
328
|
+
- sahil2801/CodeAlpaca-20k
|
|
329
|
+
- nickrosh/Evol-Instruct-Code-80k-v1
|
|
330
|
+
- iamtarun/python_code_instructions_18k_alpaca
|
|
331
|
+
- Open-Orca/SlimOrca
|
|
332
|
+
- yahma/alpaca-cleaned
|
|
316
333
|
---
|
|
317
334
|
|
|
318
|
-
# Synapse-3B
|
|
335
|
+
# Synapse-3B
|
|
319
336
|
|
|
320
|
-
**
|
|
337
|
+
**Small models that think together. And learn.**
|
|
338
|
+
|
|
339
|
+
Synapse-3B is a merged specialist model created by [TITAN Synapse](https://github.com/Djtony707/titan-synapse) — an open-source Rust inference engine that runs a swarm of tiny specialist models that collaborate and learn continuously on your GPU.
|
|
340
|
+
|
|
341
|
+
This model combines **4 specialist LoRA adapters** (math, code, general, coordinator) trained on curated datasets, then merged into a single model using **TIES merging** (Trim, Elect Sign, Merge) for minimal interference between specializations.
|
|
342
|
+
|
|
343
|
+
## Key Features
|
|
344
|
+
|
|
345
|
+
- **4 specialist domains** merged into one model without catastrophic forgetting
|
|
346
|
+
- **TIES merging** — trims small deltas, elects signs by majority vote, merges only agreeing directions
|
|
347
|
+
- **Based on Qwen2.5-3B-Instruct** — strong Apache 2.0 base with multilingual support
|
|
348
|
+
- **Part of the Synapse ecosystem** — designed for the brain-inspired Synapse Architecture (Mamba + xLSTM + Sparse MoE + Fast Weights)
|
|
321
349
|
|
|
322
350
|
## How This Model Was Made
|
|
323
351
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
352
|
+
```
|
|
353
|
+
Base Model: Qwen/Qwen2.5-3B-Instruct (Apache 2.0)
|
|
354
|
+
|
|
|
355
|
+
+---> QLoRA (rank 64) ---> Math Specialist (GSM8K + OpenWebMath + Orca-Math, 50k samples)
|
|
356
|
+
+---> QLoRA (rank 64) ---> Code Specialist (CodeAlpaca + Evol-Instruct + Python-18k, 50k samples)
|
|
357
|
+
+---> QLoRA (rank 64) ---> General Specialist (SlimOrca + Alpaca-Cleaned, 50k samples)
|
|
358
|
+
+---> QLoRA (rank 32) ---> Coordinator (Synthetic routing, 5k samples)
|
|
359
|
+
|
|
|
360
|
+
+---> TIES Merge (trim 80%, sign election, agreement merge)
|
|
361
|
+
|
|
|
362
|
+
= Synapse-3B
|
|
363
|
+
```
|
|
329
364
|
|
|
330
|
-
|
|
365
|
+
### Specialist Details
|
|
331
366
|
|
|
332
|
-
| Specialist |
|
|
333
|
-
|
|
334
|
-
|
|
|
335
|
-
|
|
|
336
|
-
|
|
|
337
|
-
|
|
|
367
|
+
| Specialist | Datasets | Samples | LoRA Rank | Focus |
|
|
368
|
+
|:---|:---|:---:|:---:|:---|
|
|
369
|
+
| **Math** | GSM8K, OpenWebMath, Orca-Math | 50,000 | 64 | Mathematical reasoning, step-by-step problem solving |
|
|
370
|
+
| **Code** | CodeAlpaca-20k, Evol-Instruct-Code-80k, Python-18k | 50,000 | 64 | Code generation, debugging, Python expertise |
|
|
371
|
+
| **General** | SlimOrca, Alpaca-Cleaned | 50,000 | 64 | General knowledge, instruction following, reasoning |
|
|
372
|
+
| **Coordinator** | Synthetic routing examples | 5,000 | 32 | Task analysis, specialist routing, swarm coordination |
|
|
373
|
+
|
|
374
|
+
### Merge Method: TIES
|
|
375
|
+
|
|
376
|
+
[TIES (Trim, Elect Sign, Merge)](https://arxiv.org/abs/2306.01708) is used to combine adapters with minimal interference:
|
|
377
|
+
|
|
378
|
+
1. **Trim** — Remove small-magnitude deltas (keep top 20% per parameter)
|
|
379
|
+
2. **Elect Sign** — For each parameter, take a majority vote on the sign direction across all specialists
|
|
380
|
+
3. **Merge** — Only average deltas that agree with the elected sign
|
|
381
|
+
|
|
382
|
+
This produces cleaner merges than simple averaging, preserving each specialist's strengths.
|
|
338
383
|
|
|
339
384
|
## Usage
|
|
340
385
|
|
|
386
|
+
### With Transformers
|
|
387
|
+
|
|
341
388
|
```python
|
|
342
389
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
343
390
|
|
|
344
|
-
model = AutoModelForCausalLM.from_pretrained("
|
|
345
|
-
tokenizer = AutoTokenizer.from_pretrained("
|
|
391
|
+
model = AutoModelForCausalLM.from_pretrained("djtony707/synapse-3b")
|
|
392
|
+
tokenizer = AutoTokenizer.from_pretrained("djtony707/synapse-3b")
|
|
393
|
+
|
|
394
|
+
messages = [{{"role": "user", "content": "Solve: If a train travels 120km in 2 hours, what is its speed in m/s?"}}]
|
|
395
|
+
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
|
396
|
+
inputs = tokenizer(text, return_tensors="pt").to(model.device)
|
|
397
|
+
outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.7)
|
|
398
|
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
|
346
399
|
```
|
|
347
400
|
|
|
348
|
-
|
|
401
|
+
### With TITAN Synapse Engine (Rust, local inference)
|
|
402
|
+
|
|
349
403
|
```bash
|
|
404
|
+
# Install
|
|
405
|
+
curl -sSL https://raw.githubusercontent.com/Djtony707/titan-synapse/main/install.sh | bash
|
|
406
|
+
|
|
407
|
+
# Pull and run
|
|
350
408
|
synapse pull synapse-3b
|
|
351
409
|
synapse up
|
|
410
|
+
|
|
411
|
+
# OpenAI-compatible API on localhost:6900
|
|
412
|
+
curl http://localhost:6900/v1/chat/completions \\
|
|
413
|
+
-d '{{"model":"synapse-3b","messages":[{{"role":"user","content":"Hello!"}}]}}'
|
|
414
|
+
```
|
|
415
|
+
|
|
416
|
+
## The Synapse Architecture (v1.0 Target)
|
|
417
|
+
|
|
418
|
+
Synapse-3B is the foundation for the **Synapse Architecture** — a brain-inspired modular model that replaces monolithic transformers:
|
|
419
|
+
|
|
420
|
+
```
|
|
421
|
+
THALAMUS (Mamba Router, O(n))
|
|
422
|
+
|
|
|
423
|
+
+--------------+--------------+
|
|
424
|
+
| | |
|
|
425
|
+
xLSTM Lang Sparse MoE Fast-Weight
|
|
426
|
+
Module Expert Pool Memory
|
|
427
|
+
O(n) top-k of 8+ Learn during
|
|
428
|
+
syntax, specialists inference,
|
|
429
|
+
grammar activate no backprop
|
|
430
|
+
```
|
|
431
|
+
|
|
432
|
+
- **No O(n^2) attention** — Mamba (state-space) + xLSTM (recurrent)
|
|
433
|
+
- **Sparse activation** — only 2-3 of 8+ modules fire per token
|
|
434
|
+
- **Fast-weight memory** — learn new facts in ONE forward pass
|
|
435
|
+
- **Full observability** — every routing decision is transparent, no black box
|
|
436
|
+
|
|
437
|
+
## Training Details
|
|
438
|
+
|
|
439
|
+
- **Hardware**: NVIDIA RTX 5090 (32GB VRAM)
|
|
440
|
+
- **Training framework**: QLoRA via TRL SFTTrainer
|
|
441
|
+
- **Quantization**: 4-bit NF4 (for training efficiency)
|
|
442
|
+
- **Learning rate**: 2e-4 with cosine scheduler
|
|
443
|
+
- **Epochs**: 3 per specialist
|
|
444
|
+
- **Batch size**: 2 (gradient accumulation 8, effective batch 16)
|
|
445
|
+
- **Max sequence length**: 2048 tokens
|
|
446
|
+
- **Training time**: ~2 hours per specialist on RTX 5090
|
|
447
|
+
- **Merge method**: TIES (trim ratio 0.8)
|
|
448
|
+
- **Created**: {datetime.now().strftime("%B %d, %Y")}
|
|
449
|
+
|
|
450
|
+
## Limitations
|
|
451
|
+
|
|
452
|
+
- This is a 3B parameter model — it won't match 70B+ models on complex reasoning
|
|
453
|
+
- Trained on English-focused datasets; multilingual performance inherited from Qwen base
|
|
454
|
+
- The coordinator specialist is trained on synthetic routing data; real-world routing improves with use
|
|
455
|
+
- Best used as part of the TITAN Synapse swarm (multiple specialists collaborating)
|
|
456
|
+
|
|
457
|
+
## Citation
|
|
458
|
+
|
|
459
|
+
```bibtex
|
|
460
|
+
@misc{{synapse3b2026,
|
|
461
|
+
title={{Synapse-3B: A Merged Specialist Model for the TITAN Synapse Engine}},
|
|
462
|
+
author={{Tony Elliott}},
|
|
463
|
+
year={{2026}},
|
|
464
|
+
url={{https://huggingface.co/djtony707/synapse-3b}},
|
|
465
|
+
note={{Created with TITAN Synapse — https://github.com/Djtony707/titan-synapse}}
|
|
466
|
+
}}
|
|
352
467
|
```
|
|
353
468
|
|
|
354
469
|
## License
|
|
355
470
|
|
|
356
|
-
Apache 2.0
|
|
471
|
+
Apache 2.0 — use it for anything.
|
|
357
472
|
|
|
358
|
-
Built by [Tony Elliott](https://github.com/Djtony707) with TITAN Synapse.
|
|
473
|
+
Built by [Tony Elliott](https://github.com/Djtony707) with [TITAN Synapse](https://github.com/Djtony707/titan-synapse).
|
|
359
474
|
"""
|
|
360
475
|
(output_dir / "README.md").write_text(card)
|
|
361
476
|
|
|
@@ -505,7 +620,7 @@ def export_gguf_native(model_dir: Path, output_path: Path):
|
|
|
505
620
|
return output_path
|
|
506
621
|
|
|
507
622
|
|
|
508
|
-
def push_to_hub(model_dir: Path, repo_name: str = "
|
|
623
|
+
def push_to_hub(model_dir: Path, repo_name: str = "djtony707/synapse-3b"):
|
|
509
624
|
"""Push merged model to HuggingFace Hub."""
|
|
510
625
|
from huggingface_hub import HfApi
|
|
511
626
|
|
|
@@ -532,7 +647,7 @@ def main():
|
|
|
532
647
|
help="GGUF quantization type (e.g., Q4_K_M, Q5_K_M, Q8_0)")
|
|
533
648
|
parser.add_argument("--push", action="store_true",
|
|
534
649
|
help="Push to HuggingFace Hub after merge")
|
|
535
|
-
parser.add_argument("--repo", default="
|
|
650
|
+
parser.add_argument("--repo", default="djtony707/synapse-3b",
|
|
536
651
|
help="HuggingFace repo name for push")
|
|
537
652
|
args = parser.parse_args()
|
|
538
653
|
|