@synapseia-network/node 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +105 -0
  2. package/README.md +232 -0
  3. package/dist/bid-responder-Q725ZIUC.js +86 -0
  4. package/dist/bootstrap.js +22 -0
  5. package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
  6. package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
  7. package/dist/chunk-2X7MSWD4.js +270 -0
  8. package/dist/chunk-3BHRQWSM.js +531 -0
  9. package/dist/chunk-5QFTU52A.js +442 -0
  10. package/dist/chunk-5ZAJBIAV.js +25 -0
  11. package/dist/chunk-7FLDR5NT.js +186 -0
  12. package/dist/chunk-C5XRYLYP.js +137 -0
  13. package/dist/chunk-D7ADMHK2.js +36 -0
  14. package/dist/chunk-DXUYWRO7.js +23 -0
  15. package/dist/chunk-F5UDK56Z.js +289 -0
  16. package/dist/chunk-NEHR6XY7.js +111 -0
  17. package/dist/chunk-NMJVODKH.js +453 -0
  18. package/dist/chunk-PRVT22SM.js +324 -0
  19. package/dist/chunk-T2ZRG5CX.js +1380 -0
  20. package/dist/chunk-V2L5SXTL.js +88 -0
  21. package/dist/chunk-XL2NJWFY.js +702 -0
  22. package/dist/embedding-C6GE3WVM.js +16 -0
  23. package/dist/hardware-ITQQJ5YI.js +37 -0
  24. package/dist/index.js +16836 -0
  25. package/dist/inference-server-CIGRJ36H.js +25 -0
  26. package/dist/local-cors-J6RWNMMD.js +44 -0
  27. package/dist/model-catalog-C53SDFMG.js +15 -0
  28. package/dist/model-discovery-LA6YMT3I.js +10 -0
  29. package/dist/ollama-XVXA3A37.js +9 -0
  30. package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
  31. package/dist/scripts/create_nodes.sh +6 -0
  32. package/dist/scripts/diloco_train.py +319 -0
  33. package/dist/scripts/train_lora.py +237 -0
  34. package/dist/scripts/train_micro.py +586 -0
  35. package/dist/trainer-HQMV2ZAR.js +21 -0
  36. package/package.json +128 -0
  37. package/scripts/create_nodes.sh +6 -0
  38. package/scripts/diloco_train.py +319 -0
  39. package/scripts/train_lora.py +237 -0
  40. package/scripts/train_micro.py +586 -0
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LoRA fine-tuning runner for biomedical models (Synapseia node-side).
4
+
5
+ Reads a JSON payload from stdin matching the LoraWorkOrderPayload shape
6
+ exported by `lora_trainer.ts`. Trains a LoRA adapter on the provided
7
+ training dataset, evaluates on the validation dataset, and writes:
8
+
9
+ <outDir>/adapter_model.safetensors
10
+ <outDir>/adapter_config.json
11
+ <outDir>/metrics.json
12
+
13
+ Progress lines are emitted to stdout in the form:
14
+
15
+ progress {"step": 12, "loss": 0.42, "lr": 5e-5}
16
+ progress epoch_done {"epoch": 1, "val_loss": 0.31}
17
+
18
+ The TS wrapper (`lora_trainer.ts`) surfaces them via the node logger.
19
+
20
+ Why a single Python script instead of a NestJS-style module:
21
+
22
+ - HuggingFace Transformers + PEFT are Python-only.
23
+ - The training step is the only place the node needs torch — keeping
24
+ it isolated as a subprocess avoids loading torch into the node
25
+ runtime, mirrors the `train_micro.py` pattern shipped earlier.
26
+
27
+ Required Python deps (installed once on the node):
28
+
29
+ pip install transformers peft datasets safetensors torch accelerate
30
+
31
+ Hardware:
32
+
33
+ - PubMedBERT (~110M) trains on CPU in ~4-6h, on a single 8GB GPU
34
+ in <30 min.
35
+ - BioGPT-Large (~1.5B) requires GPU. The TS wrapper refuses
36
+ LORA_GENERATION on CPU-only nodes; this script also asserts CUDA
37
+ when subtype == LORA_GENERATION, as a defence in depth.
38
+ """
39
+ from __future__ import annotations
40
+
41
+ import json
42
+ import os
43
+ import sys
44
+ import math
45
+ from pathlib import Path
46
+ from typing import Any, Dict
47
+
48
+
49
+ def _emit_progress(label: str, fields: Dict[str, Any]) -> None:
50
+ print(f"progress {label} {json.dumps(fields)}", flush=True)
51
+
52
+
53
+ def _read_payload() -> Dict[str, Any]:
54
+ raw = sys.stdin.read().strip()
55
+ if not raw:
56
+ raise SystemExit("LoRA trainer: empty stdin payload")
57
+ return json.loads(raw)
58
+
59
+
60
+ def _resolve_base_model(base_model: str) -> str:
61
+ table = {
62
+ "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
63
+ "BioGPT-Large": "microsoft/BioGPT-Large",
64
+ }
65
+ if base_model not in table:
66
+ raise SystemExit(f"Unsupported baseModel: {base_model}")
67
+ return table[base_model]
68
+
69
+
70
+ def _detect_device(subtype: str) -> str:
71
+ try:
72
+ import torch # type: ignore
73
+ except ImportError:
74
+ raise SystemExit("LoRA trainer: torch is not installed")
75
+ if torch.cuda.is_available():
76
+ return "cuda"
77
+ # Apple Silicon MPS works for PubMedBERT but not for BioGPT-Large
78
+ # at the model sizes in question; gate it for CLASSIFICATION only.
79
+ if subtype == "LORA_CLASSIFICATION" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
80
+ return "mps"
81
+ if subtype == "LORA_GENERATION":
82
+ raise SystemExit("LORA_GENERATION requires CUDA; this node has no GPU")
83
+ return "cpu"
84
+
85
+
86
def _load_dataset(uri: str):
    """Load a dataset split from either a HuggingFace dataset id or a URL.

    A `uri` with an http(s) scheme is treated as a JSONL file (one record
    per line: `text`/`label` for classification, `text` alone for
    generation); anything else is passed through as a HF dataset id.
    V1 supports only these two shapes — the coordinator's mission corpus
    arrives as a pre-signed download URL inside the WO payload.
    """
    from datasets import load_dataset  # type: ignore

    if uri.startswith(("https://", "http://")):
        return load_dataset("json", data_files=uri, split="train")
    return load_dataset(uri, split="train")
99
+
100
+
101
+ def _peft_target_modules(default: list[str]) -> list[str]:
102
+ return list(default) if default else ["q_proj", "v_proj"]
103
+
104
+
105
def _train(payload: Dict[str, Any]) -> Dict[str, Any]:
    """Run one LoRA fine-tuning job described by the work-order payload.

    Trains a PEFT LoRA adapter on `trainingDatasetUri`, evaluates on
    `validationDatasetUri`, writes the adapter (safetensors) plus
    `metrics.json` into `payload["outDir"]`, emits a final `progress done`
    line, and returns the metrics dict:
      - LORA_CLASSIFICATION -> {"accuracy": ..., "f1": ...}
      - otherwise (causal LM) -> {"perplexity": ...}

    Raises SystemExit (via the helpers) on unsupported base model,
    missing torch, or a GPU-less node asked for LORA_GENERATION.
    """
    # torch itself is not referenced below, but importing it up front
    # fails fast with a clear traceback if the dependency is missing.
    import torch  # type: ignore
    from transformers import (  # type: ignore
        AutoTokenizer,
        AutoModelForSequenceClassification,
        AutoModelForCausalLM,
        TrainingArguments,
        Trainer,
        DataCollatorWithPadding,
    )
    from peft import LoraConfig, get_peft_model, TaskType  # type: ignore

    subtype = payload["subtype"]
    base_model_name = _resolve_base_model(payload["baseModel"])
    device = _detect_device(subtype)
    out_dir = Path(payload["outDir"])
    out_dir.mkdir(parents=True, exist_ok=True)

    # LoRA hyperparameters come from the work order, with conservative
    # defaults (r=8, alpha=16, dropout=0.1) when the field is absent.
    cfg = payload["loraConfig"]
    lora_cfg = LoraConfig(
        r=int(cfg.get("r", 8)),
        lora_alpha=int(cfg.get("alpha", 16)),
        lora_dropout=float(cfg.get("dropout", 0.1)),
        bias="none",
        target_modules=_peft_target_modules(cfg.get("target_modules", [])),
        task_type=TaskType.SEQ_CLS if subtype == "LORA_CLASSIFICATION" else TaskType.CAUSAL_LM,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    # Causal-LM tokenizers (e.g. BioGPT) often ship without a pad token;
    # reuse EOS so DataCollatorWithPadding can batch.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    if subtype == "LORA_CLASSIFICATION":
        # NOTE(review): num_labels is hard-coded to 2 — assumes binary
        # classification missions only; confirm against the WO schema.
        model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
    else:
        model = AutoModelForCausalLM.from_pretrained(base_model_name)

    model = get_peft_model(model, lora_cfg)
    model.to(device)

    train_ds = _load_dataset(payload["trainingDatasetUri"])
    val_ds = _load_dataset(payload["validationDatasetUri"])

    # Tokenize lazily per batch; padding is deferred to the collator so
    # each batch pads only to its own longest sequence.
    def tokenize(batch: Dict[str, Any]) -> Dict[str, Any]:
        return tokenizer(batch["text"], padding=False, truncation=True, max_length=512)

    train_ds = train_ds.map(tokenize, batched=True)
    val_ds = val_ds.map(tokenize, batched=True)

    # NOTE(review): `eval_strategy` is the post-4.41 transformers name
    # (older releases spell it `evaluation_strategy`) — confirm the
    # pinned transformers version on nodes.
    args = TrainingArguments(
        output_dir=str(out_dir / "trainer_state"),
        num_train_epochs=int(payload.get("maxEpochs", 3)),
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-4,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=50,
        seed=int(payload.get("seed", 42)),
        report_to=[],  # disable wandb/tensorboard reporters on nodes
    )

    # NOTE(review): per-step `progress {...}` lines promised by the
    # module docstring are not emitted here — only the final "done" line
    # below; verify whether the TS wrapper relies on step progress.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    )
    train_result = trainer.train()
    eval_result = trainer.evaluate()

    # Save adapter (adapter_model.safetensors + adapter_config.json).
    model.save_pretrained(out_dir, safe_serialization=True)

    metrics: Dict[str, float] = {}
    if subtype == "LORA_CLASSIFICATION":
        # Trainer.evaluate() returns eval_loss; we approximate accuracy
        # by running a fresh prediction pass. Cheap because val set is
        # small (mission corpora typically hundreds of items).
        preds = trainer.predict(val_ds)
        labels = preds.label_ids
        pred_ids = preds.predictions.argmax(axis=-1)
        accuracy = float((pred_ids == labels).mean()) if labels is not None else 0.0
        metrics = {"accuracy": accuracy, "f1": _macro_f1(labels, pred_ids)}
    else:
        # CausalLM perplexity from eval_loss. Cap at loss < 50 to avoid
        # math.exp overflow; anything larger is reported as inf.
        loss = float(eval_result.get("eval_loss", float("inf")))
        metrics = {"perplexity": math.exp(loss) if loss < 50 else float("inf")}

    (out_dir / "metrics.json").write_text(json.dumps(metrics))
    _emit_progress("done", {"metrics": metrics, "train_loss": float(train_result.training_loss)})
    return metrics
199
+
200
+
201
+ def _macro_f1(labels, preds) -> float:
202
+ if labels is None:
203
+ return 0.0
204
+ import numpy as np # type: ignore
205
+ classes = np.unique(labels)
206
+ f1s: list[float] = []
207
+ for c in classes:
208
+ tp = float(((preds == c) & (labels == c)).sum())
209
+ fp = float(((preds == c) & (labels != c)).sum())
210
+ fn = float(((preds != c) & (labels == c)).sum())
211
+ prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
212
+ rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
213
+ f1s.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0)
214
+ return float(sum(f1s) / len(f1s)) if f1s else 0.0
215
+
216
+
217
def main() -> None:
    """Entry point: read the work order from stdin, train, report metrics."""
    payload = _read_payload()
    start_fields = {
        "adapterId": payload["adapterId"],
        "subtype": payload["subtype"],
        "baseModel": payload["baseModel"],
    }
    _emit_progress("start", start_fields)
    metrics = _train(payload)
    _emit_progress("end", {"metrics": metrics})
226
+
227
+
228
if __name__ == "__main__":
    # Exit codes: 130 on Ctrl-C (shell SIGINT convention), 2 on any
    # unexpected failure; SystemExit passes through with its own status.
    try:
        main()
    except KeyboardInterrupt:
        sys.exit(130)
    except SystemExit:
        raise
    except Exception as exc:  # pragma: no cover — surfaced by TS layer
        sys.stderr.write(f"error: {exc}\n")
        sys.stderr.flush()
        sys.exit(2)