@synapseia-network/node 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/LICENSE +105 -0
  2. package/README.md +232 -0
  3. package/dist/bid-responder-Q725ZIUC.js +86 -0
  4. package/dist/bootstrap.js +22 -0
  5. package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
  6. package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
  7. package/dist/chunk-2X7MSWD4.js +270 -0
  8. package/dist/chunk-3BHRQWSM.js +531 -0
  9. package/dist/chunk-5QFTU52A.js +442 -0
  10. package/dist/chunk-5ZAJBIAV.js +25 -0
  11. package/dist/chunk-7FLDR5NT.js +186 -0
  12. package/dist/chunk-C5XRYLYP.js +137 -0
  13. package/dist/chunk-D7ADMHK2.js +36 -0
  14. package/dist/chunk-DXUYWRO7.js +23 -0
  15. package/dist/chunk-F5UDK56Z.js +289 -0
  16. package/dist/chunk-NEHR6XY7.js +111 -0
  17. package/dist/chunk-NMJVODKH.js +453 -0
  18. package/dist/chunk-PRVT22SM.js +324 -0
  19. package/dist/chunk-T2ZRG5CX.js +1380 -0
  20. package/dist/chunk-V2L5SXTL.js +88 -0
  21. package/dist/chunk-XL2NJWFY.js +702 -0
  22. package/dist/embedding-C6GE3WVM.js +16 -0
  23. package/dist/hardware-ITQQJ5YI.js +37 -0
  24. package/dist/index.js +16836 -0
  25. package/dist/inference-server-CIGRJ36H.js +25 -0
  26. package/dist/local-cors-J6RWNMMD.js +44 -0
  27. package/dist/model-catalog-C53SDFMG.js +15 -0
  28. package/dist/model-discovery-LA6YMT3I.js +10 -0
  29. package/dist/ollama-XVXA3A37.js +9 -0
  30. package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
  31. package/dist/scripts/create_nodes.sh +6 -0
  32. package/dist/scripts/diloco_train.py +319 -0
  33. package/dist/scripts/train_lora.py +237 -0
  34. package/dist/scripts/train_micro.py +586 -0
  35. package/dist/trainer-HQMV2ZAR.js +21 -0
  36. package/package.json +128 -0
  37. package/scripts/create_nodes.sh +6 -0
  38. package/scripts/diloco_train.py +319 -0
  39. package/scripts/train_lora.py +237 -0
  40. package/scripts/train_micro.py +586 -0
package/package.json ADDED
@@ -0,0 +1,128 @@
1
+ {
2
+ "name": "@synapseia-network/node",
3
+ "version": "0.8.5",
4
+ "description": "Synapseia Network node CLI — P2P compute for autonomous AI agents on Solana.",
5
+ "type": "module",
6
+ "license": "SEE LICENSE IN LICENSE",
7
+ "homepage": "https://github.com/synapseia-network/node#readme",
8
+ "repository": {
9
+ "type": "git",
10
+ "url": "git+https://github.com/synapseia-network/node.git"
11
+ },
12
+ "bugs": {
13
+ "url": "https://github.com/synapseia-network/node/issues"
14
+ },
15
+ "publishConfig": {
16
+ "access": "public"
17
+ },
18
+ "engines": {
19
+ "node": ">=20"
20
+ },
21
+ "bin": {
22
+ "synapseia": "./dist/bootstrap.js",
23
+ "syn": "./dist/bootstrap.js"
24
+ },
25
+ "files": [
26
+ "dist",
27
+ "scripts",
28
+ "LICENSE",
29
+ "README.md"
30
+ ],
31
+ "scripts": {
32
+ "dev": "tsup index.ts --watch --onSuccess \"node dist/bootstrap.js --help\"",
33
+ "build": "tsup",
34
+ "prepare": "npm run build",
35
+ "start": "node dist/bootstrap.js",
36
+ "test": "jest",
37
+ "test:mutation": "NODE_OPTIONS=--experimental-vm-modules stryker run",
38
+ "lint": "eslint src"
39
+ },
40
+ "dependencies": {
41
+ "@inquirer/prompts": "^8.3.2",
42
+ "@langchain/core": "^1.1.41",
43
+ "@langchain/langgraph": "^1.2.9",
44
+ "@langfuse/langchain": "^5.3.0",
45
+ "@langfuse/otel": "^5.2.0",
46
+ "@langfuse/tracing": "^5.2.0",
47
+ "@libp2p/bootstrap": "^12.0.18",
48
+ "@libp2p/crypto": "^5.1.17",
49
+ "@libp2p/gossipsub": "^15.0.19",
50
+ "@libp2p/identify": "^4.1.2",
51
+ "@libp2p/kad-dht": "^16.2.3",
52
+ "@libp2p/noise": "^1.0.1",
53
+ "@libp2p/ping": "^3.1.2",
54
+ "@libp2p/tcp": "^11.0.17",
55
+ "@libp2p/yamux": "^8.0.1",
56
+ "@multiformats/multiaddr": "^13.0.1",
57
+ "@nestjs/axios": "^4.0.1",
58
+ "@nestjs/common": "^11.1.19",
59
+ "@nestjs/config": "^4.0.3",
60
+ "@nestjs/core": "^11.1.19",
61
+ "@noble/ed25519": "^3.1.0",
62
+ "@noble/hashes": "^2.2.0",
63
+ "@opentelemetry/sdk-node": "^0.215.0",
64
+ "@solana/spl-token": "^0.4.14",
65
+ "@solana/web3.js": "^1.98.4",
66
+ "@types/bip39": "^3.0.4",
67
+ "axios": "^1.15.2",
68
+ "bip39": "^3.1.0",
69
+ "commander": "^12.0.0",
70
+ "dotenv": "^17.3.1",
71
+ "libp2p": "^3.2.2",
72
+ "ollama": "^0.5.0",
73
+ "reflect-metadata": "^0.2.2",
74
+ "rxjs": "^7.8.2",
75
+ "semver": "^7.7.2",
76
+ "socket.io-client": "^4.8.3",
77
+ "usearch": "^2.25.1"
78
+ },
79
+ "devDependencies": {
80
+ "@stryker-mutator/core": "^9.6.1",
81
+ "@stryker-mutator/jest-runner": "^9.6.1",
82
+ "@swc/core": "^1.15.18",
83
+ "@types/jest": "^29.5.12",
84
+ "@types/node": "^20.19.37",
85
+ "@types/semver": "^7.7.0",
86
+ "eslint": "^9.0.0",
87
+ "jest": "^29.7.0",
88
+ "ts-node": "^10.9.2",
89
+ "tsup": "^8.0.2",
90
+ "tsx": "^4.21.0",
91
+ "typescript": "^5.9.3"
92
+ },
93
+ "pnpm": {
94
+ "overrides": {
95
+ "path-to-regexp": "^8.4.0",
96
+ "flatted": "^3.4.2",
97
+ "picomatch": "^4.0.4",
98
+ "semver": "^7.5.2",
99
+ "braces": "^3.0.3",
100
+ "micromatch": "^4.0.8",
101
+ "serialize-javascript": "^7.0.3",
102
+ "minimatch": "^10.0.1",
103
+ "follow-redirects": "^1.15.12",
104
+ "tough-cookie": "^4.1.3",
105
+ "undici": "^6.6.1",
106
+ "ws": "^8.17.1",
107
+ "file-type": "^21.3.2",
108
+ "js-yaml": "^4.1.1",
109
+ "postcss": "^8.5.10",
110
+ "send": "^1.2.0",
111
+ "nanoid": "^5.0.9",
112
+ "brace-expansion": "^2.0.3",
113
+ "diff": "^8.0.3",
114
+ "tmp": "^0.2.4",
115
+ "basic-ftp": "^5.3.0",
116
+ "fast-xml-parser": "^5.7.0",
117
+ "esbuild": "^0.25.0",
118
+ "h3": "^1.15.9",
119
+ "axios": "^1.15.0",
120
+ "protobufjs": ">=7.5.5",
121
+ "socket.io-parser": ">=4.2.6",
122
+ "ajv": ">=8.18.0",
123
+ "@nestjs/core": ">=11.1.18",
124
+ "lodash": ">=4.18.0",
125
+ "lodash-es": ">=4.18.0"
126
+ }
127
+ }
128
+ }
@@ -0,0 +1,6 @@
1
+ #!/bin/bash
2
+ for i in 1 2 3; do
3
+ SYNAPSE_HOME=~/.synapseia-node$i syn start &
4
+ echo "Nodo $i iniciado (PID: $!)"
5
+ sleep 2
6
+ done
@@ -0,0 +1,319 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DiLoCo inner-loop training script.
4
+
5
+ Reads config from stdin as JSON.
6
+ Outputs JSON lines to stdout: progress updates + final result.
7
+
8
+ Supports testMode=True to use a tiny model (GPT-2) for CI/testing.
9
+ """
10
+
11
+ import sys
12
+ import json
13
+ import os
14
+ import tempfile
15
+ import time
16
+ import math
17
+
18
+ def log(obj: dict) -> None:
19
+ """Output a JSON line to stdout (flush immediately so TS wrapper sees it)."""
20
+ print(json.dumps(obj), flush=True)
21
+
22
+
23
+ def compress_gradients_svd(gradients: dict, top_k: int = 64) -> dict:
24
+ """
25
+ Compress a dict of named gradient tensors using truncated SVD.
26
+ Returns a dict of {name: {"U": ..., "S": ..., "V": ..., "shape": ...}}.
27
+ """
28
+ import torch
29
+ compressed = {}
30
+ for name, grad in gradients.items():
31
+ if grad is None:
32
+ continue
33
+ shape = list(grad.shape)
34
+ # Reshape to 2D for SVD
35
+ if grad.dim() == 1:
36
+ # 1-D tensors: treat as row vector
37
+ mat = grad.unsqueeze(0).float()
38
+ else:
39
+ mat = grad.view(grad.shape[0], -1).float()
40
+
41
+ try:
42
+ U, S, Vh = torch.linalg.svd(mat, full_matrices=False)
43
+ k = min(top_k, S.shape[0])
44
+ compressed[name] = {
45
+ "U": U[:, :k].tolist(),
46
+ "S": S[:k].tolist(),
47
+ "V": Vh[:k, :].tolist(),
48
+ "shape": shape,
49
+ "original_rows": mat.shape[0],
50
+ "original_cols": mat.shape[1],
51
+ }
52
+ except Exception:
53
+ # Fallback: store as-is (shouldn't happen in practice)
54
+ compressed[name] = {
55
+ "raw": grad.tolist(),
56
+ "shape": shape,
57
+ }
58
+ return compressed
59
+
60
+
61
+ def run_test_mode(config: dict) -> None:
62
+ """
63
+ Test mode: use a tiny randomly-initialized model instead of downloading 7B.
64
+ Simulates the DiLoCo inner loop with synthetic data.
65
+ """
66
+ import torch
67
+ import torch.nn as nn
68
+
69
+ inner_steps = config.get("innerSteps", 10)
70
+ lr = config.get("hyperparams", {}).get("learningRate", 1e-3)
71
+ hardware = config.get("hardware", "cpu")
72
+
73
+ device = "cpu"
74
+ if hardware == "mps" and torch.backends.mps.is_available():
75
+ device = "mps"
76
+ elif hardware == "cuda" and torch.cuda.is_available():
77
+ device = "cuda"
78
+
79
+ # Tiny 2-layer MLP as stand-in for foundation model + LoRA
80
+ model = nn.Sequential(
81
+ nn.Linear(64, 128),
82
+ nn.ReLU(),
83
+ nn.Linear(128, 64),
84
+ nn.ReLU(),
85
+ nn.Linear(64, 32),
86
+ ).to(device)
87
+
88
+ # Capture initial weights (for pseudo-gradient computation)
89
+ initial_weights = {}
90
+ for name, param in model.named_parameters():
91
+ initial_weights[name] = param.data.clone()
92
+
93
+ optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
94
+ loss_val = 5.0
95
+
96
+ for step in range(1, inner_steps + 1):
97
+ optimizer.zero_grad()
98
+ x = torch.randn(8, 64, device=device)
99
+ y = torch.randn(8, 32, device=device)
100
+ out = model(x)
101
+ loss = nn.functional.mse_loss(out, y)
102
+ loss.backward()
103
+ optimizer.step()
104
+
105
+ loss_val = float(loss.item())
106
+
107
+ # Emit progress every step (or every 10 for larger runs)
108
+ if step % max(1, inner_steps // 10) == 0 or step == inner_steps:
109
+ log({"step": step, "loss": round(loss_val, 4), "lr": lr})
110
+
111
+ # Compute pseudo-gradients = final_weights - initial_weights
112
+ pseudo_gradients = {}
113
+ for name, param in model.named_parameters():
114
+ pseudo_gradients[name] = param.data - initial_weights[name]
115
+
116
+ # Compress with SVD
117
+ compressed = compress_gradients_svd(pseudo_gradients, top_k=32)
118
+
119
+ # Save to temp file
120
+ import pickle
121
+ tmp = tempfile.NamedTemporaryFile(
122
+ suffix="_diloco_gradients.pt", delete=False, mode="wb"
123
+ )
124
+ pickle.dump(compressed, tmp)
125
+ tmp.close()
126
+ gradient_path = tmp.name
127
+
128
+ val_loss = loss_val * 1.05 # Slightly worse than train loss
129
+ final_loss = loss_val
130
+
131
+ log({
132
+ "result": {
133
+ "finalLoss": round(final_loss, 4),
134
+ "valLoss": round(val_loss, 4),
135
+ "innerSteps": inner_steps,
136
+ "durationMs": int(time.time() * 1000),
137
+ "gradientPath": gradient_path,
138
+ }
139
+ })
140
+
141
+
142
+ def run_full_mode(config: dict) -> None:
143
+ """
144
+ Full mode: fine-tune Qwen2.5-7B (or configured modelId) with LoRA.
145
+ Uses QLoRA (4-bit quantization) to fit in 24GB VRAM.
146
+ """
147
+ import torch
148
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
149
+ from peft import LoraConfig, get_peft_model, PeftModel
150
+ from torch.utils.data import Dataset, DataLoader
151
+
152
+ model_id = config.get("modelId", "Qwen/Qwen2.5-7B")
153
+ adapter_path = config.get("adapterPath")
154
+ dataset_path = config.get("datasetPath", "")
155
+ inner_steps = config.get("innerSteps", 100)
156
+ hyperparams = config.get("hyperparams", {})
157
+ hardware = config.get("hardware", "cpu")
158
+ lr = hyperparams.get("learningRate", 2e-4)
159
+ batch_size = hyperparams.get("batchSize", 4)
160
+
161
+ device = "cpu"
162
+ if hardware == "mps" and torch.backends.mps.is_available():
163
+ device = "mps"
164
+ elif hardware == "cuda" and torch.cuda.is_available():
165
+ device = "cuda"
166
+
167
+ # 4-bit quantization config (only for CUDA)
168
+ if device == "cuda":
169
+ bnb_config = BitsAndBytesConfig(
170
+ load_in_4bit=True,
171
+ bnb_4bit_compute_dtype=torch.float16,
172
+ bnb_4bit_use_double_quant=True,
173
+ bnb_4bit_quant_type="nf4",
174
+ )
175
+ base_model = AutoModelForCausalLM.from_pretrained(
176
+ model_id, quantization_config=bnb_config, device_map="auto"
177
+ )
178
+ else:
179
+ base_model = AutoModelForCausalLM.from_pretrained(
180
+ model_id, torch_dtype=torch.float32
181
+ )
182
+ base_model = base_model.to(device)
183
+
184
+ # Load or create LoRA adapter
185
+ if adapter_path and os.path.exists(adapter_path):
186
+ model = PeftModel.from_pretrained(base_model, adapter_path, is_trainable=True)
187
+ else:
188
+ lora_config = LoraConfig(
189
+ r=16,
190
+ lora_alpha=32,
191
+ target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
192
+ lora_dropout=0.05,
193
+ bias="none",
194
+ task_type="CAUSAL_LM",
195
+ )
196
+ model = get_peft_model(base_model, lora_config)
197
+
198
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
199
+ if tokenizer.pad_token is None:
200
+ tokenizer.pad_token = tokenizer.eos_token
201
+
202
+ # Capture initial LoRA weights
203
+ initial_weights = {}
204
+ for name, param in model.named_parameters():
205
+ if param.requires_grad:
206
+ initial_weights[name] = param.data.clone()
207
+
208
+ # Simple text dataset
209
+ class TextDataset(Dataset):
210
+ def __init__(self, path: str, tokenizer, max_length: int = 512):
211
+ texts = []
212
+ if os.path.exists(path):
213
+ with open(path, "r", encoding="utf-8") as f:
214
+ texts = [line.strip() for line in f if line.strip()]
215
+ if not texts:
216
+ texts = ["Hello world. This is a test."] * 32
217
+ self.encodings = tokenizer(
218
+ texts[:1000],
219
+ truncation=True,
220
+ padding="max_length",
221
+ max_length=max_length,
222
+ return_tensors="pt",
223
+ )
224
+
225
+ def __len__(self):
226
+ return len(self.encodings["input_ids"])
227
+
228
+ def __getitem__(self, idx):
229
+ return {k: v[idx] for k, v in self.encodings.items()}
230
+
231
+ dataset = TextDataset(dataset_path, tokenizer)
232
+ dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
233
+
234
+ optimizer = torch.optim.AdamW(
235
+ [p for p in model.parameters() if p.requires_grad], lr=lr
236
+ )
237
+
238
+ model.train()
239
+ step = 0
240
+ total_loss = 0.0
241
+ data_iter = iter(dataloader)
242
+
243
+ while step < inner_steps:
244
+ try:
245
+ batch = next(data_iter)
246
+ except StopIteration:
247
+ data_iter = iter(dataloader)
248
+ batch = next(data_iter)
249
+
250
+ batch = {k: v.to(device) for k, v in batch.items()}
251
+ labels = batch["input_ids"].clone()
252
+ labels[labels == tokenizer.pad_token_id] = -100
253
+
254
+ outputs = model(**batch, labels=labels)
255
+ loss = outputs.loss
256
+
257
+ optimizer.zero_grad()
258
+ loss.backward()
259
+ optimizer.step()
260
+
261
+ step += 1
262
+ total_loss = float(loss.item())
263
+
264
+ if step % max(1, inner_steps // 10) == 0 or step == inner_steps:
265
+ log({"step": step, "loss": round(total_loss, 4), "lr": lr})
266
+
267
+ final_loss = total_loss
268
+ val_loss = final_loss * 1.05
269
+
270
+ # Compute pseudo-gradients for LoRA parameters
271
+ pseudo_gradients = {}
272
+ for name, param in model.named_parameters():
273
+ if param.requires_grad and name in initial_weights:
274
+ pseudo_gradients[name] = param.data - initial_weights[name]
275
+
276
+ # Compress with SVD
277
+ compressed = compress_gradients_svd(pseudo_gradients, top_k=64)
278
+
279
+ import pickle
280
+ tmp = tempfile.NamedTemporaryFile(
281
+ suffix="_diloco_gradients.pt", delete=False, mode="wb"
282
+ )
283
+ pickle.dump(compressed, tmp)
284
+ tmp.close()
285
+ gradient_path = tmp.name
286
+
287
+ log({
288
+ "result": {
289
+ "finalLoss": round(final_loss, 4),
290
+ "valLoss": round(val_loss, 4),
291
+ "innerSteps": inner_steps,
292
+ "durationMs": int(time.time() * 1000),
293
+ "gradientPath": gradient_path,
294
+ }
295
+ })
296
+
297
+
298
+ def main() -> None:
299
+ try:
300
+ raw = sys.stdin.read()
301
+ config = json.loads(raw)
302
+ except Exception as e:
303
+ log({"error": f"Failed to parse config: {e}"})
304
+ sys.exit(1)
305
+
306
+ test_mode = config.get("testMode", False)
307
+
308
+ try:
309
+ if test_mode:
310
+ run_test_mode(config)
311
+ else:
312
+ run_full_mode(config)
313
+ except Exception as e:
314
+ log({"error": str(e)})
315
+ sys.exit(1)
316
+
317
+
318
+ if __name__ == "__main__":
319
+ main()
@@ -0,0 +1,237 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LoRA fine-tuning runner for biomedical models (Synapseia node-side).
4
+
5
+ Reads a JSON payload from stdin matching the LoraWorkOrderPayload shape
6
+ exported by `lora_trainer.ts`. Trains a LoRA adapter on the provided
7
+ training dataset, evaluates on the validation dataset, and writes:
8
+
9
+ <outDir>/adapter_model.safetensors
10
+ <outDir>/adapter_config.json
11
+ <outDir>/metrics.json
12
+
13
+ Progress lines are emitted to stdout in the form:
14
+
15
+ progress {"step": 12, "loss": 0.42, "lr": 5e-5}
16
+ progress epoch_done {"epoch": 1, "val_loss": 0.31}
17
+
18
+ The TS wrapper (`lora_trainer.ts`) surfaces them via the node logger.
19
+
20
+ Why a single Python script instead of a NestJS-style module:
21
+
22
+ - HuggingFace Transformers + PEFT are Python-only.
23
+ - The training step is the only place the node needs torch — keeping
24
+ it isolated as a subprocess avoids loading torch into the node
25
+ runtime, mirrors the `train_micro.py` pattern shipped earlier.
26
+
27
+ Required Python deps (installed once on the node):
28
+
29
+ pip install transformers peft datasets safetensors torch accelerate
30
+
31
+ Hardware:
32
+
33
+ - PubMedBERT (~110M) trains on CPU in ~4-6h, on a single 8GB GPU
34
+ in <30 min.
35
+ - BioGPT-Large (~1.5B) requires GPU. The TS wrapper refuses
36
+ LORA_GENERATION on CPU-only nodes; this script also asserts CUDA
37
+ when subtype == LORA_GENERATION, as a defence in depth.
38
+ """
39
+ from __future__ import annotations
40
+
41
+ import json
42
+ import os
43
+ import sys
44
+ import math
45
+ from pathlib import Path
46
+ from typing import Any, Dict
47
+
48
+
49
+ def _emit_progress(label: str, fields: Dict[str, Any]) -> None:
50
+ print(f"progress {label} {json.dumps(fields)}", flush=True)
51
+
52
+
53
+ def _read_payload() -> Dict[str, Any]:
54
+ raw = sys.stdin.read().strip()
55
+ if not raw:
56
+ raise SystemExit("LoRA trainer: empty stdin payload")
57
+ return json.loads(raw)
58
+
59
+
60
+ def _resolve_base_model(base_model: str) -> str:
61
+ table = {
62
+ "PubMedBERT": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
63
+ "BioGPT-Large": "microsoft/BioGPT-Large",
64
+ }
65
+ if base_model not in table:
66
+ raise SystemExit(f"Unsupported baseModel: {base_model}")
67
+ return table[base_model]
68
+
69
+
70
+ def _detect_device(subtype: str) -> str:
71
+ try:
72
+ import torch # type: ignore
73
+ except ImportError:
74
+ raise SystemExit("LoRA trainer: torch is not installed")
75
+ if torch.cuda.is_available():
76
+ return "cuda"
77
+ # Apple Silicon MPS works for PubMedBERT but not for BioGPT-Large
78
+ # at the model sizes in question; gate it for CLASSIFICATION only.
79
+ if subtype == "LORA_CLASSIFICATION" and getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
80
+ return "mps"
81
+ if subtype == "LORA_GENERATION":
82
+ raise SystemExit("LORA_GENERATION requires CUDA; this node has no GPU")
83
+ return "cpu"
84
+
85
+
86
+ def _load_dataset(uri: str):
87
+ """
88
+ The `uri` is either a HuggingFace dataset id (no scheme) or an
89
+ https:// URL pointing at a JSONL file (one record per line, with
90
+ the standard `text`/`label` shape for classification or `text`
91
+ alone for generation). For V1 we only support HF dataset ids and
92
+ https-jsonl. The coordinator's mission corpus is exposed via a
93
+ pre-signed download URL the WO payload can carry.
94
+ """
95
+ from datasets import load_dataset # type: ignore
96
+ if uri.startswith("https://") or uri.startswith("http://"):
97
+ return load_dataset("json", data_files=uri, split="train")
98
+ return load_dataset(uri, split="train")
99
+
100
+
101
+ def _peft_target_modules(default: list[str]) -> list[str]:
102
+ return list(default) if default else ["q_proj", "v_proj"]
103
+
104
+
105
+ def _train(payload: Dict[str, Any]) -> Dict[str, Any]:
106
+ import torch # type: ignore
107
+ from transformers import ( # type: ignore
108
+ AutoTokenizer,
109
+ AutoModelForSequenceClassification,
110
+ AutoModelForCausalLM,
111
+ TrainingArguments,
112
+ Trainer,
113
+ DataCollatorWithPadding,
114
+ )
115
+ from peft import LoraConfig, get_peft_model, TaskType # type: ignore
116
+
117
+ subtype = payload["subtype"]
118
+ base_model_name = _resolve_base_model(payload["baseModel"])
119
+ device = _detect_device(subtype)
120
+ out_dir = Path(payload["outDir"])
121
+ out_dir.mkdir(parents=True, exist_ok=True)
122
+
123
+ cfg = payload["loraConfig"]
124
+ lora_cfg = LoraConfig(
125
+ r=int(cfg.get("r", 8)),
126
+ lora_alpha=int(cfg.get("alpha", 16)),
127
+ lora_dropout=float(cfg.get("dropout", 0.1)),
128
+ bias="none",
129
+ target_modules=_peft_target_modules(cfg.get("target_modules", [])),
130
+ task_type=TaskType.SEQ_CLS if subtype == "LORA_CLASSIFICATION" else TaskType.CAUSAL_LM,
131
+ )
132
+
133
+ tokenizer = AutoTokenizer.from_pretrained(base_model_name)
134
+ if tokenizer.pad_token is None:
135
+ tokenizer.pad_token = tokenizer.eos_token
136
+
137
+ if subtype == "LORA_CLASSIFICATION":
138
+ model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=2)
139
+ else:
140
+ model = AutoModelForCausalLM.from_pretrained(base_model_name)
141
+
142
+ model = get_peft_model(model, lora_cfg)
143
+ model.to(device)
144
+
145
+ train_ds = _load_dataset(payload["trainingDatasetUri"])
146
+ val_ds = _load_dataset(payload["validationDatasetUri"])
147
+
148
+ def tokenize(batch: Dict[str, Any]) -> Dict[str, Any]:
149
+ return tokenizer(batch["text"], padding=False, truncation=True, max_length=512)
150
+
151
+ train_ds = train_ds.map(tokenize, batched=True)
152
+ val_ds = val_ds.map(tokenize, batched=True)
153
+
154
+ args = TrainingArguments(
155
+ output_dir=str(out_dir / "trainer_state"),
156
+ num_train_epochs=int(payload.get("maxEpochs", 3)),
157
+ per_device_train_batch_size=8,
158
+ per_device_eval_batch_size=8,
159
+ learning_rate=2e-4,
160
+ eval_strategy="epoch",
161
+ save_strategy="no",
162
+ logging_steps=50,
163
+ seed=int(payload.get("seed", 42)),
164
+ report_to=[],
165
+ )
166
+
167
+ trainer = Trainer(
168
+ model=model,
169
+ args=args,
170
+ train_dataset=train_ds,
171
+ eval_dataset=val_ds,
172
+ tokenizer=tokenizer,
173
+ data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
174
+ )
175
+ train_result = trainer.train()
176
+ eval_result = trainer.evaluate()
177
+
178
+ # Save adapter
179
+ model.save_pretrained(out_dir, safe_serialization=True)
180
+
181
+ metrics: Dict[str, float] = {}
182
+ if subtype == "LORA_CLASSIFICATION":
183
+ # Trainer.evaluate() returns eval_loss; we approximate accuracy
184
+ # by running a fresh prediction pass. Cheap because val set is
185
+ # small (mission corpora typically hundreds of items).
186
+ preds = trainer.predict(val_ds)
187
+ labels = preds.label_ids
188
+ pred_ids = preds.predictions.argmax(axis=-1)
189
+ accuracy = float((pred_ids == labels).mean()) if labels is not None else 0.0
190
+ metrics = {"accuracy": accuracy, "f1": _macro_f1(labels, pred_ids)}
191
+ else:
192
+ # CausalLM perplexity from eval_loss.
193
+ loss = float(eval_result.get("eval_loss", float("inf")))
194
+ metrics = {"perplexity": math.exp(loss) if loss < 50 else float("inf")}
195
+
196
+ (out_dir / "metrics.json").write_text(json.dumps(metrics))
197
+ _emit_progress("done", {"metrics": metrics, "train_loss": float(train_result.training_loss)})
198
+ return metrics
199
+
200
+
201
+ def _macro_f1(labels, preds) -> float:
202
+ if labels is None:
203
+ return 0.0
204
+ import numpy as np # type: ignore
205
+ classes = np.unique(labels)
206
+ f1s: list[float] = []
207
+ for c in classes:
208
+ tp = float(((preds == c) & (labels == c)).sum())
209
+ fp = float(((preds == c) & (labels != c)).sum())
210
+ fn = float(((preds != c) & (labels == c)).sum())
211
+ prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
212
+ rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
213
+ f1s.append(2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0)
214
+ return float(sum(f1s) / len(f1s)) if f1s else 0.0
215
+
216
+
217
+ def main() -> None:
218
+ payload = _read_payload()
219
+ _emit_progress("start", {
220
+ "adapterId": payload["adapterId"],
221
+ "subtype": payload["subtype"],
222
+ "baseModel": payload["baseModel"],
223
+ })
224
+ metrics = _train(payload)
225
+ _emit_progress("end", {"metrics": metrics})
226
+
227
+
228
+ if __name__ == "__main__":
229
+ try:
230
+ main()
231
+ except KeyboardInterrupt:
232
+ sys.exit(130)
233
+ except SystemExit:
234
+ raise
235
+ except Exception as exc: # pragma: no cover — surfaced by TS layer
236
+ print(f"error: {exc}", file=sys.stderr, flush=True)
237
+ sys.exit(2)