@synapseia-network/node 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +105 -0
- package/README.md +232 -0
- package/dist/bid-responder-Q725ZIUC.js +86 -0
- package/dist/bootstrap.js +22 -0
- package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
- package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
- package/dist/chunk-2X7MSWD4.js +270 -0
- package/dist/chunk-3BHRQWSM.js +531 -0
- package/dist/chunk-5QFTU52A.js +442 -0
- package/dist/chunk-5ZAJBIAV.js +25 -0
- package/dist/chunk-7FLDR5NT.js +186 -0
- package/dist/chunk-C5XRYLYP.js +137 -0
- package/dist/chunk-D7ADMHK2.js +36 -0
- package/dist/chunk-DXUYWRO7.js +23 -0
- package/dist/chunk-F5UDK56Z.js +289 -0
- package/dist/chunk-NEHR6XY7.js +111 -0
- package/dist/chunk-NMJVODKH.js +453 -0
- package/dist/chunk-PRVT22SM.js +324 -0
- package/dist/chunk-T2ZRG5CX.js +1380 -0
- package/dist/chunk-V2L5SXTL.js +88 -0
- package/dist/chunk-XL2NJWFY.js +702 -0
- package/dist/embedding-C6GE3WVM.js +16 -0
- package/dist/hardware-ITQQJ5YI.js +37 -0
- package/dist/index.js +16836 -0
- package/dist/inference-server-CIGRJ36H.js +25 -0
- package/dist/local-cors-J6RWNMMD.js +44 -0
- package/dist/model-catalog-C53SDFMG.js +15 -0
- package/dist/model-discovery-LA6YMT3I.js +10 -0
- package/dist/ollama-XVXA3A37.js +9 -0
- package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
- package/dist/scripts/create_nodes.sh +6 -0
- package/dist/scripts/diloco_train.py +319 -0
- package/dist/scripts/train_lora.py +237 -0
- package/dist/scripts/train_micro.py +586 -0
- package/dist/trainer-HQMV2ZAR.js +21 -0
- package/package.json +128 -0
- package/scripts/create_nodes.sh +6 -0
- package/scripts/diloco_train.py +319 -0
- package/scripts/train_lora.py +237 -0
- package/scripts/train_micro.py +586 -0
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""
+LoRA fine-tuning runner for biomedical models (Synapseia node-side).
+
+Reads a JSON payload from stdin matching the LoraWorkOrderPayload shape
+exported by `lora_trainer.ts`. Trains a LoRA adapter on the provided
+training dataset, evaluates on the validation dataset, and writes:
+
+    <outDir>/adapter_model.safetensors
+    <outDir>/adapter_config.json
+    <outDir>/metrics.json
+
+Progress lines are emitted to stdout in the form `progress <label> <json>`:
+
+    progress step {"step": 12, "loss": 0.42, "lr": 5e-5}
+    progress epoch_done {"epoch": 1, "val_loss": 0.31}
+
+The TS wrapper (`lora_trainer.ts`) surfaces them via the node logger.
+
+Why a single Python script instead of a NestJS-style module:
+
+- HuggingFace Transformers + PEFT are Python-only.
+- The training step is the only place the node needs torch; keeping it
+  isolated as a subprocess avoids loading torch into the node runtime
+  and mirrors the `train_micro.py` pattern shipped earlier.
+
+Required Python deps (installed once on the node):
+
+    pip install transformers peft datasets safetensors torch accelerate
+
+Hardware:
+
+- PubMedBERT (~110M params) trains on CPU in roughly 4-6 h, or in under
+  30 min on a single 8 GB GPU.
+- BioGPT-Large (~1.5B params) requires a GPU. The TS wrapper refuses
+  LORA_GENERATION on CPU-only nodes; this script also asserts CUDA when
+  subtype == LORA_GENERATION, as defence in depth.
+"""
+from __future__ import annotations
+
+import json
+import math
+import sys
+from pathlib import Path
+from typing import Any, Dict
+
+
+def _emit_progress(label: str, fields: Dict[str, Any]) -> None:
+    print(f"progress {label} {json.dumps(fields)}", flush=True)
+
+
+def _read_payload() -> Dict[str, Any]:
+    raw = sys.stdin.read().strip()
+    if not raw:
+        raise SystemExit("LoRA trainer: empty stdin payload")
+    return json.loads(raw)
+
+
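+# An illustrative stdin payload, inferred from the keys this script reads
+# below (the authoritative LoraWorkOrderPayload shape lives in
+# lora_trainer.ts; all concrete values here are hypothetical):
+#
+#   {
+#     "adapterId": "adapter-001",
+#     "subtype": "LORA_CLASSIFICATION",
+#     "baseModel": "PubMedBERT",
+#     "trainingDatasetUri": "https://example.org/train.jsonl",
+#     "validationDatasetUri": "https://example.org/val.jsonl",
+#     "outDir": "./out/adapter-001",
+#     "maxEpochs": 3,
+#     "seed": 42,
+#     "loraConfig": {"r": 8, "alpha": 16, "dropout": 0.1, "target_modules": []}
+#   }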
+def _resolve_base_model(base_model: str) -> str:
+    table = {
+        "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
+        "BioGPT-Large": "microsoft/BioGPT-Large",
+    }
+    if base_model not in table:
+        raise SystemExit(f"Unsupported baseModel: {base_model}")
+    return table[base_model]
+
+
+def _detect_device(subtype: str) -> str:
+    try:
+        import torch  # type: ignore
+    except ImportError:
+        raise SystemExit("LoRA trainer: torch is not installed")
+    if torch.cuda.is_available():
+        return "cuda"
+    # Apple Silicon MPS works for PubMedBERT but not for BioGPT-Large at
+    # the model sizes in question; gate it to CLASSIFICATION jobs only.
+    if subtype == "LORA_CLASSIFICATION" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
+        return "mps"
+    if subtype == "LORA_GENERATION":
+        raise SystemExit("LORA_GENERATION requires CUDA; this node has no GPU")
+    return "cpu"
+
+
+def _load_dataset(uri: str):
+    """
+    The `uri` is either a HuggingFace dataset id (no scheme) or an
+    https:// URL pointing at a JSONL file (one record per line, with
+    the standard `text`/`label` shape for classification or `text`
+    alone for generation). For V1 we support only HF dataset ids and
+    https-JSONL. The coordinator's mission corpus is exposed via a
+    pre-signed download URL that the work-order payload can carry.
+    """
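+    # Illustrative JSONL records (shapes from the docstring above; the
+    # values are hypothetical):
+    #   {"text": "TP53 is frequently mutated in ...", "label": 1}   # classification
+    #   {"text": "Aspirin irreversibly inhibits COX-1 ..."}         # generation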
+    from datasets import load_dataset  # type: ignore
+    if uri.startswith("https://") or uri.startswith("http://"):
+        return load_dataset("json", data_files=uri, split="train")
+    return load_dataset(uri, split="train")
+
+
+def _peft_target_modules(default: list[str], base_model_name: str) -> list[str]:
+    if default:
+        return list(default)
+    # Default attention-projection names differ by architecture: BERT-style
+    # encoders (PubMedBERT) call them "query"/"value", while BioGPT uses
+    # the decoder-style "q_proj"/"v_proj".
+    if "PubMedBERT" in base_model_name:
+        return ["query", "value"]
+    return ["q_proj", "v_proj"]
+
+
+def _train(payload: Dict[str, Any]) -> Dict[str, Any]:
+    import torch  # type: ignore
+    from transformers import (  # type: ignore
+        AutoTokenizer,
+        AutoModelForSequenceClassification,
+        AutoModelForCausalLM,
+        TrainingArguments,
+        Trainer,
+        DataCollatorWithPadding,
+        DataCollatorForLanguageModeling,
+    )
+    from peft import LoraConfig, get_peft_model, TaskType  # type: ignore
+
+    subtype = payload["subtype"]
+    base_model_name = _resolve_base_model(payload["baseModel"])
+    device = _detect_device(subtype)
+    out_dir = Path(payload["outDir"])
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    cfg = payload["loraConfig"]
+    lora_cfg = LoraConfig(
+        r=int(cfg.get("r", 8)),
+        lora_alpha=int(cfg.get("alpha", 16)),
+        lora_dropout=float(cfg.get("dropout", 0.1)),
+        bias="none",
+        target_modules=_peft_target_modules(cfg.get("target_modules", []), base_model_name),
+        task_type=TaskType.SEQ_CLS if subtype == "LORA_CLASSIFICATION" else TaskType.CAUSAL_LM,
+    )
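+    # Note: PEFT scales the adapter update by lora_alpha / r, so the r=8,
+    # alpha=16 defaults above give an effective scaling factor of 2.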
+
+    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    if subtype == "LORA_CLASSIFICATION":
+        model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
+    else:
+        model = AutoModelForCausalLM.from_pretrained(base_model_name)
+
+    model = get_peft_model(model, lora_cfg)
+    model.to(device)
+
+    train_ds = _load_dataset(payload["trainingDatasetUri"])
+    val_ds = _load_dataset(payload["validationDatasetUri"])
+
+    def tokenize(batch: Dict[str, Any]) -> Dict[str, Any]:
+        return tokenizer(batch["text"], padding=False, truncation=True, max_length=512)
+
+    train_ds = train_ds.map(tokenize, batched=True)
+    val_ds = val_ds.map(tokenize, batched=True)
+
+    if subtype == "LORA_CLASSIFICATION":
+        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+    else:
+        # Causal-LM training needs label ids: with mlm=False the collator
+        # copies input_ids into labels so the model can return a loss.
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+    args = TrainingArguments(
+        output_dir=str(out_dir / "trainer_state"),
+        num_train_epochs=int(payload.get("maxEpochs", 3)),
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        learning_rate=2e-4,
+        eval_strategy="epoch",
+        save_strategy="no",
+        logging_steps=50,
+        seed=int(payload.get("seed", 42)),
+        report_to=[],
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=args,
+        train_dataset=train_ds,
+        eval_dataset=val_ds,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+    train_result = trainer.train()
+    eval_result = trainer.evaluate()
+
+    # Save adapter (PEFT writes adapter_model.safetensors and adapter_config.json).
+    model.save_pretrained(out_dir, safe_serialization=True)
+
+    metrics: Dict[str, float] = {}
+    if subtype == "LORA_CLASSIFICATION":
+        # Trainer.evaluate() only returns eval_loss; accuracy and macro-F1
+        # come from a fresh prediction pass. Cheap, because the validation
+        # set is small (mission corpora are typically hundreds of items).
+        preds = trainer.predict(val_ds)
+        labels = preds.label_ids
+        pred_ids = preds.predictions.argmax(axis=-1)
+        accuracy = float((pred_ids == labels).mean()) if labels is not None else 0.0
+        metrics = {"accuracy": accuracy, "f1": _macro_f1(labels, pred_ids)}
+    else:
+        # Causal-LM perplexity from eval_loss (capped to avoid overflow).
+        loss = float(eval_result.get("eval_loss", float("inf")))
+        metrics = {"perplexity": math.exp(loss) if loss < 50 else float("inf")}
+
+    (out_dir / "metrics.json").write_text(json.dumps(metrics))
+    _emit_progress("done", {"metrics": metrics, "train_loss": float(train_result.training_loss)})
+    return metrics
+
+
+def _macro_f1(labels, preds) -> float:
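+    # Worked example (illustrative): labels=[0,0,1,1], preds=[0,1,1,1]
+    #   class 0: tp=1, fp=0, fn=1 -> precision=1.0, recall=0.5, f1=2/3
+    #   class 1: tp=2, fp=1, fn=0 -> precision=2/3, recall=1.0, f1=0.8
+    #   macro-F1 = (2/3 + 0.8) / 2 ~ 0.733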
+    if labels is None:
+        return 0.0
+    import numpy as np  # type: ignore
+    classes = np.unique(labels)
+    f1s: list[float] = []
+    for c in classes:
+        tp = float(((preds == c) & (labels == c)).sum())
+        fp = float(((preds == c) & (labels != c)).sum())
+        fn = float(((preds != c) & (labels == c)).sum())
+        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        f1s.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0)
+    return float(sum(f1s) / len(f1s)) if f1s else 0.0
+
+
+def main() -> None:
+    payload = _read_payload()
+    _emit_progress("start", {
+        "adapterId": payload["adapterId"],
+        "subtype": payload["subtype"],
+        "baseModel": payload["baseModel"],
+    })
+    metrics = _train(payload)
+    _emit_progress("end", {"metrics": metrics})
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.exit(130)
+    except SystemExit:
+        raise
+    except Exception as exc:  # pragma: no cover (surfaced by TS layer)
+        print(f"error: {exc}", file=sys.stderr, flush=True)
+        sys.exit(2)
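
For reference, a minimal sketch of driving scripts/train_lora.py by hand, the
way the lora_trainer.ts wrapper does: write one JSON document to stdin, then
read `progress <label> <json>` lines from stdout. Only the key names and the
progress-line format come from the script itself; every concrete value below
is illustrative:

    import json
    import subprocess

    payload = {
        "adapterId": "adapter-001",  # hypothetical values throughout
        "subtype": "LORA_CLASSIFICATION",
        "baseModel": "PubMedBERT",
        "trainingDatasetUri": "https://example.org/train.jsonl",
        "validationDatasetUri": "https://example.org/val.jsonl",
        "outDir": "./out/adapter-001",
        "maxEpochs": 1,
        "seed": 42,
        "loraConfig": {"r": 8, "alpha": 16, "dropout": 0.1, "target_modules": []},
    }

    proc = subprocess.Popen(
        ["python3", "scripts/train_lora.py"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        text=True,
        bufsize=1,
    )
    proc.stdin.write(json.dumps(payload))
    proc.stdin.close()

    # Each progress line is "progress <label> <json-fields>".
    for line in proc.stdout:
        if line.startswith("progress "):
            label, _, fields = line[len("progress "):].partition(" ")
            print(label, json.loads(fields))
    proc.wait()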