@synapseia-network/node 0.8.5
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- package/LICENSE +105 -0
- package/README.md +232 -0
- package/dist/bid-responder-Q725ZIUC.js +86 -0
- package/dist/bootstrap.js +22 -0
- package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
- package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
- package/dist/chunk-2X7MSWD4.js +270 -0
- package/dist/chunk-3BHRQWSM.js +531 -0
- package/dist/chunk-5QFTU52A.js +442 -0
- package/dist/chunk-5ZAJBIAV.js +25 -0
- package/dist/chunk-7FLDR5NT.js +186 -0
- package/dist/chunk-C5XRYLYP.js +137 -0
- package/dist/chunk-D7ADMHK2.js +36 -0
- package/dist/chunk-DXUYWRO7.js +23 -0
- package/dist/chunk-F5UDK56Z.js +289 -0
- package/dist/chunk-NEHR6XY7.js +111 -0
- package/dist/chunk-NMJVODKH.js +453 -0
- package/dist/chunk-PRVT22SM.js +324 -0
- package/dist/chunk-T2ZRG5CX.js +1380 -0
- package/dist/chunk-V2L5SXTL.js +88 -0
- package/dist/chunk-XL2NJWFY.js +702 -0
- package/dist/embedding-C6GE3WVM.js +16 -0
- package/dist/hardware-ITQQJ5YI.js +37 -0
- package/dist/index.js +16836 -0
- package/dist/inference-server-CIGRJ36H.js +25 -0
- package/dist/local-cors-J6RWNMMD.js +44 -0
- package/dist/model-catalog-C53SDFMG.js +15 -0
- package/dist/model-discovery-LA6YMT3I.js +10 -0
- package/dist/ollama-XVXA3A37.js +9 -0
- package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
- package/dist/scripts/create_nodes.sh +6 -0
- package/dist/scripts/diloco_train.py +319 -0
- package/dist/scripts/train_lora.py +237 -0
- package/dist/scripts/train_micro.py +586 -0
- package/dist/trainer-HQMV2ZAR.js +21 -0
- package/package.json +128 -0
- package/scripts/create_nodes.sh +6 -0
- package/scripts/diloco_train.py +319 -0
- package/scripts/train_lora.py +237 -0
- package/scripts/train_micro.py +586 -0
package/package.json
ADDED
@@ -0,0 +1,128 @@
{
  "name": "@synapseia-network/node",
  "version": "0.8.5",
  "description": "Synapseia Network node CLI — P2P compute for autonomous AI agents on Solana.",
  "type": "module",
  "license": "SEE LICENSE IN LICENSE",
  "homepage": "https://github.com/synapseia-network/node#readme",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/synapseia-network/node.git"
  },
  "bugs": {
    "url": "https://github.com/synapseia-network/node/issues"
  },
  "publishConfig": {
    "access": "public"
  },
  "engines": {
    "node": ">=20"
  },
  "bin": {
    "synapseia": "./dist/bootstrap.js",
    "syn": "./dist/bootstrap.js"
  },
  "files": [
    "dist",
    "scripts",
    "LICENSE",
    "README.md"
  ],
  "scripts": {
    "dev": "tsup index.ts --watch --onSuccess \"node dist/bootstrap.js --help\"",
    "build": "tsup",
    "prepare": "npm run build",
    "start": "node dist/bootstrap.js",
    "test": "jest",
    "test:mutation": "NODE_OPTIONS=--experimental-vm-modules stryker run",
    "lint": "eslint src"
  },
  "dependencies": {
    "@inquirer/prompts": "^8.3.2",
    "@langchain/core": "^1.1.41",
    "@langchain/langgraph": "^1.2.9",
    "@langfuse/langchain": "^5.3.0",
    "@langfuse/otel": "^5.2.0",
    "@langfuse/tracing": "^5.2.0",
    "@libp2p/bootstrap": "^12.0.18",
    "@libp2p/crypto": "^5.1.17",
    "@libp2p/gossipsub": "^15.0.19",
    "@libp2p/identify": "^4.1.2",
    "@libp2p/kad-dht": "^16.2.3",
    "@libp2p/noise": "^1.0.1",
    "@libp2p/ping": "^3.1.2",
    "@libp2p/tcp": "^11.0.17",
    "@libp2p/yamux": "^8.0.1",
    "@multiformats/multiaddr": "^13.0.1",
    "@nestjs/axios": "^4.0.1",
    "@nestjs/common": "^11.1.19",
    "@nestjs/config": "^4.0.3",
    "@nestjs/core": "^11.1.19",
    "@noble/ed25519": "^3.1.0",
    "@noble/hashes": "^2.2.0",
    "@opentelemetry/sdk-node": "^0.215.0",
    "@solana/spl-token": "^0.4.14",
    "@solana/web3.js": "^1.98.4",
    "@types/bip39": "^3.0.4",
    "axios": "^1.15.2",
    "bip39": "^3.1.0",
    "commander": "^12.0.0",
    "dotenv": "^17.3.1",
    "libp2p": "^3.2.2",
    "ollama": "^0.5.0",
    "reflect-metadata": "^0.2.2",
    "rxjs": "^7.8.2",
    "semver": "^7.7.2",
    "socket.io-client": "^4.8.3",
    "usearch": "^2.25.1"
  },
  "devDependencies": {
    "@stryker-mutator/core": "^9.6.1",
    "@stryker-mutator/jest-runner": "^9.6.1",
    "@swc/core": "^1.15.18",
    "@types/jest": "^29.5.12",
    "@types/node": "^20.19.37",
    "@types/semver": "^7.7.0",
    "eslint": "^9.0.0",
    "jest": "^29.7.0",
    "ts-node": "^10.9.2",
    "tsup": "^8.0.2",
    "tsx": "^4.21.0",
    "typescript": "^5.9.3"
  },
  "pnpm": {
    "overrides": {
      "path-to-regexp": "^8.4.0",
      "flatted": "^3.4.2",
      "picomatch": "^4.0.4",
      "semver": "^7.5.2",
      "braces": "^3.0.3",
      "micromatch": "^4.0.8",
      "serialize-javascript": "^7.0.3",
      "minimatch": "^10.0.1",
      "follow-redirects": "^1.15.12",
      "tough-cookie": "^4.1.3",
      "undici": "^6.6.1",
      "ws": "^8.17.1",
      "file-type": "^21.3.2",
      "js-yaml": "^4.1.1",
      "postcss": "^8.5.10",
      "send": "^1.2.0",
      "nanoid": "^5.0.9",
      "brace-expansion": "^2.0.3",
      "diff": "^8.0.3",
      "tmp": "^0.2.4",
      "basic-ftp": "^5.3.0",
      "fast-xml-parser": "^5.7.0",
      "esbuild": "^0.25.0",
      "h3": "^1.15.9",
      "axios": "^1.15.0",
      "protobufjs": ">=7.5.5",
      "socket.io-parser": ">=4.2.6",
      "ajv": ">=8.18.0",
      "@nestjs/core": ">=11.1.18",
      "lodash": ">=4.18.0",
      "lodash-es": ">=4.18.0"
    }
  }
}
package/scripts/diloco_train.py
ADDED
@@ -0,0 +1,319 @@
#!/usr/bin/env python3
"""
DiLoCo inner-loop training script.

Reads config from stdin as JSON.
Outputs JSON lines to stdout: progress updates + final result.

Supports testMode=True to use a tiny model (GPT-2) for CI/testing.
"""

import sys
import json
import os
import tempfile
import time
import math

def log(obj: dict) -> None:
    """Output a JSON line to stdout (flush immediately so TS wrapper sees it)."""
    print(json.dumps(obj), flush=True)


def compress_gradients_svd(gradients: dict, top_k: int = 64) -> dict:
    """
    Compress a dict of named gradient tensors using truncated SVD.
    Returns a dict of {name: {"U": ..., "S": ..., "V": ..., "shape": ...}}.
    """
    import torch
    compressed = {}
    for name, grad in gradients.items():
        if grad is None:
            continue
        shape = list(grad.shape)
        # Reshape to 2D for SVD
        if grad.dim() == 1:
            # 1-D tensors: treat as row vector
            mat = grad.unsqueeze(0).float()
        else:
            mat = grad.view(grad.shape[0], -1).float()

        try:
            U, S, Vh = torch.linalg.svd(mat, full_matrices=False)
            k = min(top_k, S.shape[0])
            compressed[name] = {
                "U": U[:, :k].tolist(),
                "S": S[:k].tolist(),
                "V": Vh[:k, :].tolist(),
                "shape": shape,
                "original_rows": mat.shape[0],
                "original_cols": mat.shape[1],
            }
        except Exception:
            # Fallback: store as-is (shouldn't happen in practice)
            compressed[name] = {
                "raw": grad.tolist(),
                "shape": shape,
            }
    return compressed


def run_test_mode(config: dict) -> None:
    """
    Test mode: use a tiny randomly-initialized model instead of downloading 7B.
    Simulates the DiLoCo inner loop with synthetic data.
    """
    import torch
    import torch.nn as nn

    inner_steps = config.get("innerSteps", 10)
    lr = config.get("hyperparams", {}).get("learningRate", 1e-3)
    hardware = config.get("hardware", "cpu")

    device = "cpu"
    if hardware == "mps" and torch.backends.mps.is_available():
        device = "mps"
    elif hardware == "cuda" and torch.cuda.is_available():
        device = "cuda"

    # Tiny 2-layer MLP as stand-in for foundation model + LoRA
    model = nn.Sequential(
        nn.Linear(64, 128),
        nn.ReLU(),
        nn.Linear(128, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
    ).to(device)

    # Capture initial weights (for pseudo-gradient computation)
    initial_weights = {}
    for name, param in model.named_parameters():
        initial_weights[name] = param.data.clone()

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_val = 5.0

    for step in range(1, inner_steps + 1):
        optimizer.zero_grad()
        x = torch.randn(8, 64, device=device)
        y = torch.randn(8, 32, device=device)
        out = model(x)
        loss = nn.functional.mse_loss(out, y)
        loss.backward()
        optimizer.step()

        loss_val = float(loss.item())

        # Emit progress every step (or every 10 for larger runs)
        if step % max(1, inner_steps // 10) == 0 or step == inner_steps:
            log({"step": step, "loss": round(loss_val, 4), "lr": lr})

    # Compute pseudo-gradients = final_weights - initial_weights
    pseudo_gradients = {}
    for name, param in model.named_parameters():
        pseudo_gradients[name] = param.data - initial_weights[name]

    # Compress with SVD
    compressed = compress_gradients_svd(pseudo_gradients, top_k=32)

    # Save to temp file
    import pickle
    tmp = tempfile.NamedTemporaryFile(
        suffix="_diloco_gradients.pt", delete=False, mode="wb"
    )
    pickle.dump(compressed, tmp)
    tmp.close()
    gradient_path = tmp.name

    val_loss = loss_val * 1.05  # Slightly worse than train loss
    final_loss = loss_val

    log({
        "result": {
            "finalLoss": round(final_loss, 4),
            "valLoss": round(val_loss, 4),
            "innerSteps": inner_steps,
            "durationMs": int(time.time() * 1000),
            "gradientPath": gradient_path,
        }
    })


def run_full_mode(config: dict) -> None:
    """
    Full mode: fine-tune Qwen2.5-7B (or configured modelId) with LoRA.
    Uses QLoRA (4-bit quantization) to fit in 24GB VRAM.
    """
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, PeftModel
    from torch.utils.data import Dataset, DataLoader

    model_id = config.get("modelId", "Qwen/Qwen2.5-7B")
    adapter_path = config.get("adapterPath")
    dataset_path = config.get("datasetPath", "")
    inner_steps = config.get("innerSteps", 100)
    hyperparams = config.get("hyperparams", {})
    hardware = config.get("hardware", "cpu")
    lr = hyperparams.get("learningRate", 2e-4)
    batch_size = hyperparams.get("batchSize", 4)

    device = "cpu"
    if hardware == "mps" and torch.backends.mps.is_available():
        device = "mps"
    elif hardware == "cuda" and torch.cuda.is_available():
        device = "cuda"

    # 4-bit quantization config (only for CUDA)
    if device == "cuda":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id, quantization_config=bnb_config, device_map="auto"
        )
    else:
        base_model = AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.float32
        )
        base_model = base_model.to(device)

    # Load or create LoRA adapter
    if adapter_path and os.path.exists(adapter_path):
        model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=True)
    else:
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(base_model, lora_config)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Capture initial LoRA weights
    initial_weights = {}
    for name, param in model.named_parameters():
        if param.requires_grad:
            initial_weights[name] = param.data.clone()

    # Simple text dataset
    class TextDataset(Dataset):
        def __init__(self, path: str, tokenizer, max_length: int = 512):
            texts = []
            if os.path.exists(path):
                with open(path, "r", encoding="utf-8") as f:
                    texts = [line.strip() for line in f if line.strip()]
            if not texts:
                texts = ["Hello world. This is a test."] * 32
            self.encodings = tokenizer(
                texts[:1000],
                truncation=True,
                padding="max_length",
                max_length=max_length,
                return_tensors="pt",
            )

        def __len__(self):
            return len(self.encodings["input_ids"])

        def __getitem__(self, idx):
            return {k: v[idx] for k, v in self.encodings.items()}

    dataset = TextDataset(dataset_path, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    optimizer = torch.optim.AdamW(
        [p for p in model.parameters() if p.requires_grad], lr=lr
    )

    model.train()
    step = 0
    total_loss = 0.0
    data_iter = iter(dataloader)

    while step < inner_steps:
        try:
            batch = next(data_iter)
        except StopIteration:
            data_iter = iter(dataloader)
            batch = next(data_iter)

        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch["input_ids"].clone()
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(**batch, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step += 1
        total_loss = float(loss.item())

        if step % max(1, inner_steps // 10) == 0 or step == inner_steps:
            log({"step": step, "loss": round(total_loss, 4), "lr": lr})

    final_loss = total_loss
    val_loss = final_loss * 1.05

    # Compute pseudo-gradients for LoRA parameters
    pseudo_gradients = {}
    for name, param in model.named_parameters():
        if param.requires_grad and name in initial_weights:
            pseudo_gradients[name] = param.data - initial_weights[name]

    # Compress with SVD
    compressed = compress_gradients_svd(pseudo_gradients, top_k=64)

    import pickle
    tmp = tempfile.NamedTemporaryFile(
        suffix="_diloco_gradients.pt", delete=False, mode="wb"
    )
    pickle.dump(compressed, tmp)
    tmp.close()
    gradient_path = tmp.name

    log({
        "result": {
            "finalLoss": round(final_loss, 4),
            "valLoss": round(val_loss, 4),
            "innerSteps": inner_steps,
            "durationMs": int(time.time() * 1000),
            "gradientPath": gradient_path,
        }
    })


def main() -> None:
    try:
        raw = sys.stdin.read()
        config = json.loads(raw)
    except Exception as e:
        log({"error": f"Failed to parse config: {e}"})
        sys.exit(1)

    test_mode = config.get("testMode", False)

    try:
        if test_mode:
            run_test_mode(config)
        else:
            run_full_mode(config)
    except Exception as e:
        log({"error": str(e)})
        sys.exit(1)


if __name__ == "__main__":
    main()
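The script's I/O contract is a small protocol: one JSON config object on stdin, then one JSON object per line on stdout (progress records, a final "result" record, or an "error" record). A minimal driver sketch in Python, assuming a checkout where the script lives at scripts/diloco_train.py; the package's real caller is its TS wrapper, which this diff does not include:

import json
import subprocess

# Config uses only fields diloco_train.py reads; testMode avoids the 7B download.
config = {
    "testMode": True,
    "innerSteps": 10,
    "hyperparams": {"learningRate": 1e-3},
    "hardware": "cpu",
}

proc = subprocess.run(
    ["python3", "scripts/diloco_train.py"],   # script path is an assumption
    input=json.dumps(config), capture_output=True, text=True,
)
for line in proc.stdout.splitlines():
    msg = json.loads(line)                    # one JSON object per line
    if "error" in msg:
        raise RuntimeError(msg["error"])
    if "result" in msg:
        print("final loss:", msg["result"]["finalLoss"])
    elif "step" in msg:
        print(f"step {msg['step']}: loss {msg['loss']}")

On the receiving side, each compressed entry expands back to a rank-k approximation of the pseudo-gradient (U · diag(S) · V, reshaped to the saved shape). The aggregator is not part of this diff; this is only a sketch of the inverse of compress_gradients_svd:

import torch

def decompress_gradient(entry: dict) -> torch.Tensor:
    # The fallback branch stored the tensor verbatim under "raw".
    if "raw" in entry:
        return torch.tensor(entry["raw"]).reshape(entry["shape"])
    U = torch.tensor(entry["U"])   # (rows, k)
    S = torch.tensor(entry["S"])   # (k,)
    V = torch.tensor(entry["V"])   # (k, cols)
    # Rank-k reconstruction, restored to the original tensor shape.
    return (U @ torch.diag(S) @ V).reshape(entry["shape"])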
package/scripts/train_lora.py
ADDED
@@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
LoRA fine-tuning runner for biomedical models (Synapseia node-side).

Reads a JSON payload from stdin matching the LoraWorkOrderPayload shape
exported by `lora_trainer.ts`. Trains a LoRA adapter on the provided
training dataset, evaluates on the validation dataset, and writes:

    <outDir>/adapter_model.safetensors
    <outDir>/adapter_config.json
    <outDir>/metrics.json

Progress lines are emitted to stdout in the form:

    progress {"step": 12, "loss": 0.42, "lr": 5e-5}
    progress epoch_done {"epoch": 1, "val_loss": 0.31}

The TS wrapper (`lora_trainer.ts`) surfaces them via the node logger.

Why a single Python script instead of a NestJS-style module:

- HuggingFace Transformers + PEFT are Python-only.
- The training step is the only place the node needs torch — keeping
  it isolated as a subprocess avoids loading torch into the node
  runtime, mirrors the `train_micro.py` pattern shipped earlier.

Required Python deps (installed once on the node):

    pip install transformers peft datasets safetensors torch accelerate

Hardware:

- PubMedBERT (~110M) trains on CPU in ~4-6h, on a single 8GB GPU
  in <30 min.
- BioGPT-Large (~1.5B) requires GPU. The TS wrapper refuses
  LORA_GENERATION on CPU-only nodes; this script also asserts CUDA
  when subtype == LORA_GENERATION, as a defence in depth.
"""
from __future__ import annotations

import json
import os
import sys
import math
from pathlib import Path
from typing import Any, Dict


def _emit_progress(label: str, fields: Dict[str, Any]) -> None:
    print(f"progress {label} {json.dumps(fields)}", flush=True)


def _read_payload() -> Dict[str, Any]:
    raw = sys.stdin.read().strip()
    if not raw:
        raise SystemExit("LoRA trainer: empty stdin payload")
    return json.loads(raw)


def _resolve_base_model(base_model: str) -> str:
    table = {
        "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
        "BioGPT-Large": "microsoft/BioGPT-Large",
    }
    if base_model not in table:
        raise SystemExit(f"Unsupported baseModel: {base_model}")
    return table[base_model]


def _detect_device(subtype: str) -> str:
    try:
        import torch  # type: ignore
    except ImportError:
        raise SystemExit("LoRA trainer: torch is not installed")
    if torch.cuda.is_available():
        return "cuda"
    # Apple Silicon MPS works for PubMedBERT but not for BioGPT-Large
    # at the model sizes in question; gate it for CLASSIFICATION only.
    if subtype == "LORA_CLASSIFICATION" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        return "mps"
    if subtype == "LORA_GENERATION":
        raise SystemExit("LORA_GENERATION requires CUDA; this node has no GPU")
    return "cpu"


def _load_dataset(uri: str):
    """
    The `uri` is either a HuggingFace dataset id (no scheme) or an
    https:// URL pointing at a JSONL file (one record per line, with
    the standard `text`/`label` shape for classification or `text`
    alone for generation). For V1 we only support HF dataset ids and
    https-jsonl. The coordinator's mission corpus is exposed via a
    pre-signed download URL the WO payload can carry.
    """
    from datasets import load_dataset  # type: ignore
    if uri.startswith("https://") or uri.startswith("http://"):
        return load_dataset("json", data_files=uri, split="train")
    return load_dataset(uri, split="train")


def _peft_target_modules(default: list[str]) -> list[str]:
    return list(default) if default else ["q_proj", "v_proj"]


def _train(payload: Dict[str, Any]) -> Dict[str, Any]:
    import torch  # type: ignore
    from transformers import (  # type: ignore
        AutoTokenizer,
        AutoModelForSequenceClassification,
        AutoModelForCausalLM,
        TrainingArguments,
        Trainer,
        DataCollatorWithPadding,
    )
    from peft import LoraConfig, get_peft_model, TaskType  # type: ignore

    subtype = payload["subtype"]
    base_model_name = _resolve_base_model(payload["baseModel"])
    device = _detect_device(subtype)
    out_dir = Path(payload["outDir"])
    out_dir.mkdir(parents=True, exist_ok=True)

    cfg = payload["loraConfig"]
    lora_cfg = LoraConfig(
        r=int(cfg.get("r", 8)),
        lora_alpha=int(cfg.get("alpha", 16)),
        lora_dropout=float(cfg.get("dropout", 0.1)),
        bias="none",
        target_modules=_peft_target_modules(cfg.get("target_modules", [])),
        task_type=TaskType.SEQ_CLS if subtype == "LORA_CLASSIFICATION" else TaskType.CAUSAL_LM,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if subtype == "LORA_CLASSIFICATION":
        model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
    else:
        model = AutoModelForCausalLM.from_pretrained(base_model_name)

    model = get_peft_model(model, lora_cfg)
    model.to(device)

    train_ds = _load_dataset(payload["trainingDatasetUri"])
    val_ds = _load_dataset(payload["validationDatasetUri"])

    def tokenize(batch: Dict[str, Any]) -> Dict[str, Any]:
        return tokenizer(batch["text"], padding=False, truncation=True, max_length=512)

    train_ds = train_ds.map(tokenize, batched=True)
    val_ds = val_ds.map(tokenize, batched=True)

    args = TrainingArguments(
        output_dir=str(out_dir / "trainer_state"),
        num_train_epochs=int(payload.get("maxEpochs", 3)),
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-4,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=50,
        seed=int(payload.get("seed", 42)),
        report_to=[],
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    train_result = trainer.train()
    eval_result = trainer.evaluate()

    # Save adapter
    model.save_pretrained(out_dir, safe_serialization=True)

    metrics: Dict[str, float] = {}
    if subtype == "LORA_CLASSIFICATION":
        # Trainer.evaluate() returns eval_loss; we approximate accuracy
        # by running a fresh prediction pass. Cheap because val set is
        # small (mission corpora typically hundreds of items).
        preds = trainer.predict(val_ds)
        labels = preds.label_ids
        pred_ids = preds.predictions.argmax(axis=-1)
        accuracy = float((pred_ids == labels).mean()) if labels is not None else 0.0
        metrics = {"accuracy": accuracy, "f1": _macro_f1(labels, pred_ids)}
    else:
        # CausalLM perplexity from eval_loss.
        loss = float(eval_result.get("eval_loss", float("inf")))
        metrics = {"perplexity": math.exp(loss) if loss < 50 else float("inf")}

    (out_dir / "metrics.json").write_text(json.dumps(metrics))
    _emit_progress("done", {"metrics": metrics, "train_loss": float(train_result.training_loss)})
    return metrics


def _macro_f1(labels, preds) -> float:
    if labels is None:
        return 0.0
    import numpy as np  # type: ignore
    classes = np.unique(labels)
    f1s: list[float] = []
    for c in classes:
        tp = float(((preds == c) & (labels == c)).sum())
        fp = float(((preds == c) & (labels != c)).sum())
        fn = float(((preds != c) & (labels == c)).sum())
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1s.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0)
    return float(sum(f1s) / len(f1s)) if f1s else 0.0


def main() -> None:
    payload = _read_payload()
    _emit_progress("start", {
        "adapterId": payload["adapterId"],
        "subtype": payload["subtype"],
        "baseModel": payload["baseModel"],
    })
    metrics = _train(payload)
    _emit_progress("end", {"metrics": metrics})


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(130)
    except SystemExit:
        raise
    except Exception as exc:  # pragma: no cover — surfaced by TS layer
        print(f"error: {exc}", file=sys.stderr, flush=True)
        sys.exit(2)
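For reference, a payload sketch built only from the fields this script actually reads (adapterId, subtype, baseModel, trainingDatasetUri, validationDatasetUri, outDir, loraConfig, maxEpochs, seed). The values, URLs, and script path below are illustrative assumptions; the authoritative shape is LoraWorkOrderPayload in lora_trainer.ts, which is not part of this diff:

import json
import subprocess

payload = {
    "adapterId": "adapter-demo",          # placeholder id
    "subtype": "LORA_CLASSIFICATION",     # LORA_GENERATION would require CUDA
    "baseModel": "PubMedBERT",            # mapped by _resolve_base_model
    "trainingDatasetUri": "https://example.com/train.jsonl",
    "validationDatasetUri": "https://example.com/val.jsonl",
    "outDir": "./adapter-out",
    "loraConfig": {
        "r": 8,
        "alpha": 16,
        "dropout": 0.1,
        "target_modules": ["q_proj", "v_proj"],
    },
    "maxEpochs": 3,
    "seed": 42,
}

proc = subprocess.Popen(
    ["python3", "scripts/train_lora.py"],  # script path is an assumption
    stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
)
proc.stdin.write(json.dumps(payload))
proc.stdin.close()
for line in proc.stdout:                   # "progress <label> {...}" lines
    print(line.rstrip())
proc.wait()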