@synapseia-network/node 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +105 -0
- package/README.md +232 -0
- package/dist/bid-responder-Q725ZIUC.js +86 -0
- package/dist/bootstrap.js +22 -0
- package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
- package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
- package/dist/chunk-2X7MSWD4.js +270 -0
- package/dist/chunk-3BHRQWSM.js +531 -0
- package/dist/chunk-5QFTU52A.js +442 -0
- package/dist/chunk-5ZAJBIAV.js +25 -0
- package/dist/chunk-7FLDR5NT.js +186 -0
- package/dist/chunk-C5XRYLYP.js +137 -0
- package/dist/chunk-D7ADMHK2.js +36 -0
- package/dist/chunk-DXUYWRO7.js +23 -0
- package/dist/chunk-F5UDK56Z.js +289 -0
- package/dist/chunk-NEHR6XY7.js +111 -0
- package/dist/chunk-NMJVODKH.js +453 -0
- package/dist/chunk-PRVT22SM.js +324 -0
- package/dist/chunk-T2ZRG5CX.js +1380 -0
- package/dist/chunk-V2L5SXTL.js +88 -0
- package/dist/chunk-XL2NJWFY.js +702 -0
- package/dist/embedding-C6GE3WVM.js +16 -0
- package/dist/hardware-ITQQJ5YI.js +37 -0
- package/dist/index.js +16836 -0
- package/dist/inference-server-CIGRJ36H.js +25 -0
- package/dist/local-cors-J6RWNMMD.js +44 -0
- package/dist/model-catalog-C53SDFMG.js +15 -0
- package/dist/model-discovery-LA6YMT3I.js +10 -0
- package/dist/ollama-XVXA3A37.js +9 -0
- package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
- package/dist/scripts/create_nodes.sh +6 -0
- package/dist/scripts/diloco_train.py +319 -0
- package/dist/scripts/train_lora.py +237 -0
- package/dist/scripts/train_micro.py +586 -0
- package/dist/trainer-HQMV2ZAR.js +21 -0
- package/package.json +128 -0
- package/scripts/create_nodes.sh +6 -0
- package/scripts/diloco_train.py +319 -0
- package/scripts/train_lora.py +237 -0
- package/scripts/train_micro.py +586 -0
@@ -0,0 +1,586 @@
#!/usr/bin/env python3
"""
Micro-transformer training script for SynapseIA
Trains a tiny transformer (120K params) with configurable hyperparameters

Input: JSON via stdin with hyperparameters and dataset path
Output: JSON lines to stdout (progress updates + final result)
"""

import json
import os
import sys
import time
import math
import random
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional

# Cap BLAS/MKL/OpenMP threadpools BEFORE importing torch. On arm64 under Docker
# `import torch` peaks at 600-800 MB RSS because MKL/OpenMP pre-allocate one
# thread per core; capping to 1 shaves ~100-300 MB and avoids cgroup OOM-kills
# when the host is tight on RAM (node + ollama + pubmedbert + coordinator).
# These env vars MUST be set before torch is imported — too late afterwards.
for _var in ('OMP_NUM_THREADS', 'MKL_NUM_THREADS', 'OPENBLAS_NUM_THREADS',
             'NUMEXPR_NUM_THREADS', 'VECLIB_MAXIMUM_THREADS'):
    os.environ.setdefault(_var, '1')

# Pre-import heartbeat: proves Python itself started. If the next run fails
# WITHOUT this line appearing in stdout, Python never booted (bad interpreter,
# missing script, cgroup killed the fork). If this appears but the post-import
# heartbeat further down does NOT, the OOM happened during `import torch`.
print(json.dumps({"stage": "pre-import", "pid": os.getpid()}), flush=True)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader


class CharTokenizer:
    """Simple character-level tokenizer"""

    def __init__(self, text: str):
        self.chars = sorted(list(set(text)))
        self.vocab_size = len(self.chars)
        self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}
        self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)}

    def encode(self, text: str) -> List[int]:
        return [self.char_to_idx.get(ch, 0) for ch in text]

    def decode(self, indices: List[int]) -> str:
        return ''.join([self.idx_to_char.get(i, '') for i in indices])
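
# A minimal usage sketch of CharTokenizer (illustration only, not part of the
# shipped logic): characters unseen at construction time encode to index 0,
# and decode drops unknown indices.
#
#     >>> tok = CharTokenizer("abc")
#     >>> tok.encode("cab")
#     [2, 0, 1]
#     >>> tok.decode([2, 0, 1])
#     'cab'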


class TextDataset(Dataset):
    """Dataset for language modeling"""

    def __init__(self, data: str, tokenizer: CharTokenizer, seq_length: int = 128):
        self.data = tokenizer.encode(data)
        self.seq_length = seq_length

    def __len__(self) -> int:
        return max(0, len(self.data) - self.seq_length - 1)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        x = torch.tensor(self.data[idx:idx + self.seq_length], dtype=torch.long)
        y = torch.tensor(self.data[idx + 1:idx + self.seq_length + 1], dtype=torch.long)
        return x, y
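
# The next-token setup, sketched (illustration only): with seq_length=4 over
# the text "hello world", item 0 is x = encode("hell") and y = encode("ello");
# the target is the input shifted one character right, the standard
# language-modeling objective.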


class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization"""

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return norm * self.weight
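
# RMSNorm computes y = x / sqrt(mean(x_i^2) + eps) * weight; unlike LayerNorm
# it subtracts no mean and learns no bias, which makes it the cheaper of the
# two options behind the `normalization` hyperparameter.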


class Head(nn.Module):
    """Single attention head"""

    def __init__(self, head_size: int, n_embd: int, dropout: float = 0.1):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Attention scores
        wei = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)  # (B, T, T)
        # Causal mask via an explicit boolean tril. The earlier pair
        # `wei = torch.tril(wei)` + `masked_fill(wei == 0, -inf)` also masked
        # legitimate scores that happened to be exactly 0.0 in the lower
        # triangle; a boolean mask hides only future positions.
        mask = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
        wei = wei.masked_fill(~mask, float('-inf'))
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, head_size)
        return out


class MultiHeadAttention(nn.Module):
    """Multiple attention heads in parallel"""

    def __init__(self, num_heads: int, head_size: int, n_embd: int, dropout: float = 0.1):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, n_embd, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out


class FeedForward(nn.Module):
    """Feed-forward layer"""

    def __init__(self, n_embd: int, dropout: float = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)


class Block(nn.Module):
    """Transformer block"""

    def __init__(self, n_embd: int, num_heads: int, dropout: float = 0.1, normalization: str = 'layernorm'):
        super().__init__()
        head_size = n_embd // num_heads
        self.sa = MultiHeadAttention(num_heads, head_size, n_embd, dropout)
        self.ffwd = FeedForward(n_embd, dropout)

        if normalization == 'rmsnorm':
            self.ln1 = RMSNorm(n_embd)
            self.ln2 = RMSNorm(n_embd)
        else:
            self.ln1 = nn.LayerNorm(n_embd)
            self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x
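
# Note on Block: residuals use the pre-norm arrangement, x + sublayer(norm(x)),
# for both the attention and feed-forward paths; the rmsnorm and layernorm
# settings differ only in which norm module fills ln1/ln2.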


class MicroTransformer(nn.Module):
    """Micro transformer for language modeling (~120K params)"""

    def __init__(
        self,
        vocab_size: int,
        n_embd: int = 128,
        num_layers: int = 4,
        num_heads: int = 4,
        dropout: float = 0.1,
        normalization: str = 'layernorm',
        init_scheme: str = 'xavier',
    ):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(512, n_embd)  # Max 512 positions
        self.blocks = nn.Sequential(*[
            Block(n_embd, num_heads, dropout, normalization)
            for _ in range(num_layers)
        ])
        self.ln_f = RMSNorm(n_embd) if normalization == 'rmsnorm' else nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.dropout = nn.Dropout(dropout)

        # Apply initialization scheme
        self._apply_init(init_scheme)

    def _apply_init(self, scheme: str):
        """Apply weight initialization scheme"""
        for name, p in self.named_parameters():
            if 'weight' in name and p.dim() >= 2:
                if scheme == 'xavier':
                    nn.init.xavier_uniform_(p)
                elif scheme == 'kaiming':
                    nn.init.kaiming_uniform_(p, nonlinearity='relu')
                elif scheme == 'normal':
                    nn.init.normal_(p, mean=0, std=0.02)
            elif 'bias' in name:
                nn.init.zeros_(p)

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        B, T = idx.shape

        tok_emb = self.token_embedding(idx)  # (B, T, n_embd)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))  # (T, n_embd)
        x = self.dropout(tok_emb + pos_emb)  # (B, T, n_embd)
        x = self.blocks(x)  # (B, T, n_embd)
        x = self.ln_f(x)  # (B, T, n_embd)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        return logits

    def count_parameters(self) -> int:
        return sum(p.numel() for p in self.parameters())
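
# A minimal shape-level smoke test (illustration only; vocab_size=60 is an
# arbitrary stand-in for a real corpus):
#
#     >>> m = MicroTransformer(vocab_size=60)
#     >>> m(torch.zeros((2, 16), dtype=torch.long)).shape
#     torch.Size([2, 16, 60])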


def load_data(data_path: str) -> str:
    """Load text data from file"""
    path = Path(data_path)
    if not path.exists():
        # Create sample data if file doesn't exist
        sample_text = """Astrophysics is the branch of astronomy that employs the principles of physics and chemistry to explain the nature of celestial objects. Stars, galaxies, planets, and other objects in the universe emit radiation across the electromagnetic spectrum. Astronomers use telescopes to detect and analyze this radiation, learning about the composition, structure, and evolution of cosmic objects.

The study of stars involves understanding their life cycles, from formation in nebulae through main sequence evolution to their final states as white dwarfs, neutron stars, or black holes. Stellar nucleosynthesis creates the chemical elements that make up planets and life.

Galaxies are vast collections of stars, gas, dust, and dark matter bound together by gravity. The Milky Way is a barred spiral galaxy containing billions of stars. At the centers of most large galaxies lie supermassive black holes millions to billions of times the mass of our Sun.

Cosmology studies the origin and evolution of the universe as a whole. The Big Bang theory describes the universe expanding from an extremely hot, dense state approximately 13.8 billion years ago. Dark matter and dark energy remain mysterious components that dominate the mass-energy content of the universe.

Observational astrophysics spans the entire electromagnetic spectrum from radio waves to gamma rays. Space telescopes like Hubble and James Webb observe infrared, visible, and ultraviolet light above Earth's atmosphere. Radio telescopes detect cold gas and energetic processes in galaxies.

Computational astrophysics uses numerical simulations to model complex systems like galaxy formation, stellar interiors, and accretion disks around black holes. Machine learning increasingly helps analyze vast datasets from sky surveys."""
        return sample_text

    return path.read_text(encoding='utf-8')


def split_data(text: str, train_ratio: float = 0.9) -> Tuple[str, str]:
    """Split text into train and validation sets"""
    n = len(text)
    train_size = int(n * train_ratio)
    return text[:train_size], text[train_size:]


def get_activation(name: str):
    """Get activation function by name.

    Note: main() reads the `activation` hyperparameter but never calls this
    helper, and FeedForward hardcodes nn.GELU(), so it is currently unused.
    """
    activations = {
        'gelu': nn.GELU(),
        'relu': nn.ReLU(),
        'silu': nn.SiLU(),
    }
    return activations.get(name, nn.GELU())


def train_step(model: nn.Module, batch: Tuple[torch.Tensor, torch.Tensor], optimizer: torch.optim.Optimizer, device: str) -> float:
    """Single training step"""
    model.train()
    x, y = batch
    x, y = x.to(device), y.to(device)

    logits = model(x)
    B, T, C = logits.shape
    loss = nn.functional.cross_entropy(logits.view(B * T, C), y.view(B * T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()


# Sentinel value emitted in `valLoss` when the validation loader yields zero
# batches (empty val set, or batch_size larger than val_dataset). 1e30 is
# JSON-safe (json.dumps emits a literal `1e+30`, not the non-standard
# `Infinity` token that Node's JSON.parse rejects) and, for any practical
# baseline `currentBestLoss`, compares as "definitely worse" so the Node-side
# `improved = valLoss < currentBestLoss` check stays correct. The Node
# parser ALSO inspects the `valLossEvalFailed` boolean and short-circuits
# the `improved` comparison there — the sentinel is the belt, the boolean
# is the suspenders.
#
# TS counterpart: TRAINER_EVAL_FAILED_SENTINEL in
# packages/node/src/modules/model/trainer.ts — both must stay numerically
# identical. Grep both names if you change the value.
EVAL_FAILED_SENTINEL = 1e30
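
# Why 1e30 rather than float('inf'): a quick REPL sketch (illustration only,
# not part of the shipped logic):
#
#     >>> json.dumps(1e30)            # a plain JSON number
#     '1e+30'
#     >>> json.dumps(float('inf'))    # non-standard token; Node's JSON.parse rejects it
#     'Infinity'
#     >>> json.dumps(float('inf'), allow_nan=False)
#     Traceback (most recent call last):
#     ...
#     ValueError: Out of range float values are not JSON compliant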


def evaluate(model: nn.Module, dataloader: DataLoader, device: str, deadline: Optional[float] = None) -> float:
    """Evaluate model on validation set. Honours ``deadline``, a wall-clock
    timestamp compared against ``time.time()``.

    Returns ``EVAL_FAILED_SENTINEL`` (1e30) when no batches were consumed —
    either because the dataloader is empty (val set smaller than batch_size)
    or the deadline expired before the first batch. Returning 0.0 here was
    a silent quality bug: the Node side initialises ``bestLoss=Infinity``
    and computes ``improved = valLoss < bestLoss`` — a 0.0 always wins,
    which had nodes claiming improvement on no-eval runs and getting paid
    rewards for them. The sentinel preserves the comparison invariant without
    breaking the JSON contract (see the EVAL_FAILED_SENTINEL comment above).
    """
    model.eval()
    total_loss = 0.0
    count = 0

    with torch.no_grad():
        for batch in dataloader:
            x, y = batch
            x, y = x.to(device), y.to(device)

            logits = model(x)
            B, T, C = logits.shape
            loss = nn.functional.cross_entropy(logits.view(B * T, C), y.view(B * T))

            total_loss += loss.item()
            count += 1

            # Deadline check AFTER processing the batch + guarded by `count > 0`
            # so at least one batch always completes even when the deadline is
            # already past at entry (slow batch loader, tight reserve, etc).
            # Top-of-loop break was the root cause of "0 batches before
            # deadline" sentinels under shared-CPU contention.
            if deadline is not None and time.time() > deadline and count > 0:
                break

    if count == 0:
        return EVAL_FAILED_SENTINEL
    return total_loss / count


def main():
    """Main training function"""
    # Read hyperparameters from stdin
    try:
        input_data = sys.stdin.read()
        config = json.loads(input_data)
    except json.JSONDecodeError as e:
        print(json.dumps({"error": f"Invalid JSON input: {str(e)}"}), file=sys.stderr)
        sys.exit(1)
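
    # Example invocation (hypothetical values; any key may be omitted and the
    # defaults below apply):
    #
    #   echo '{"learningRate": 0.0005, "batchSize": 16, "maxTrainSeconds": 60}' \
    #     | python3 train_micro.py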

    # Extract hyperparameters with defaults
    learning_rate = config.get('learningRate', 0.001)
    batch_size = config.get('batchSize', 32)
    hidden_dim = config.get('hiddenDim', 128)
    num_layers = config.get('numLayers', 4)
    num_heads = config.get('numHeads', 4)
    activation_name = config.get('activation', 'gelu')
    normalization = config.get('normalization', 'layernorm')
    init_scheme = config.get('initScheme', 'xavier')
    warmup_steps = config.get('warmupSteps', 100)
    weight_decay = config.get('weightDecay', 0.01)
    max_train_seconds = config.get('maxTrainSeconds', 120)
    data_path = config.get('dataPath', './data/astro-sample.txt')
    hardware = config.get('hardware', 'cpu')

    # Set device
    device = 'cuda' if hardware == 'gpu' and torch.cuda.is_available() else 'cpu'

    # CPU-constrained nodes (Docker) share cores with Ollama at 400%+. Cap torch
    # threads so autograd workers don't thrash against inference: halve the
    # thread count with a floor of 1, since single-threaded operation avoids
    # the scheduler stall that blocks stdout prints entirely.
    if device == 'cpu':
        try:
            torch.set_num_threads(max(1, (torch.get_num_threads() or 1) // 2))
        except Exception:
            pass

    # Emit a startup heartbeat BEFORE loading data/building the model so the
    # Node parent can confirm the spawn reached Python code (diagnoses "no
    # output for 10 minutes" timeouts caused by slow imports or data loading).
    print(json.dumps({
        "started": True,
        "device": device,
        "hardware": hardware,
        "torchThreads": torch.get_num_threads(),
        "maxTrainSeconds": max_train_seconds,
    }), flush=True)

    # Load and prepare data
    text = load_data(data_path)
    train_text, val_text = split_data(text)

    # Create tokenizer
    tokenizer = CharTokenizer(text)
    vocab_size = tokenizer.vocab_size

    # Create datasets
    seq_length = 128
    train_dataset = TextDataset(train_text, tokenizer, seq_length)
    val_dataset = TextDataset(val_text, tokenizer, seq_length)

    # Guard: the dataset must yield enough samples to train on, i.e. the
    # exported domain corpus must be long enough to slice at least one window.
    MIN_TRAIN_SAMPLES = 2
    if len(train_dataset) < MIN_TRAIN_SAMPLES:
        # Delete cached file so it gets re-downloaded on the next run
        from pathlib import Path as _Path
        try:
            _Path(data_path).unlink(missing_ok=True)
        except Exception:
            pass
        err_msg = (
            f"Corpus too short for domain training: only {len(text)} chars in '{data_path}'. "
            f"Need at least {seq_length + 1} chars to produce even one training sample. "
            f"The coordinator must export a valid corpus before dispatching training work orders."
        )
        print(json.dumps({"error": err_msg}), flush=True)
        sys.exit(1)

    # Clamp batch_size to available samples (avoids empty-batch edge cases with small corpora)
    effective_batch_size = min(batch_size, len(train_dataset))
    if effective_batch_size != batch_size:
        print(json.dumps({"warning": f"batch_size clamped from {batch_size} to {effective_batch_size} (only {len(train_dataset)} training samples)"}), flush=True)

    train_loader = DataLoader(train_dataset, batch_size=effective_batch_size, shuffle=True)

    # Val set must produce at least one batch. With the previous
    # `batch_size=max(1, min(effective_batch_size, len(val_dataset)))`,
    # a val_dataset of length 0 silently produced an empty DataLoader and
    # `evaluate()` returned 0.0 — Node treated the run as "improved" against
    # its Infinity baseline and paid out rewards for a no-eval submission.
    # Cap val batch size at len(val_dataset) so even a tiny val set yields
    # one batch; if val_dataset is empty, log to stderr and the eval call
    # below will return EVAL_FAILED_SENTINEL via the empty-loader branch.
    val_eval_failed = False
    val_eval_failure_reason: Optional[str] = None
    if len(val_dataset) == 0:
        val_eval_failed = True
        val_eval_failure_reason = (
            f"val set empty: 0 samples (corpus_chars={len(text)}, train_ratio=0.9, "
            f"seq_length={seq_length}, effective_batch_size={effective_batch_size}). "
            "Domain corpus is too small to carve out a usable validation split."
        )
        print(f"[train_micro] WARNING: {val_eval_failure_reason}", file=sys.stderr, flush=True)
        val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
    else:
        val_batch_size = max(1, min(effective_batch_size, len(val_dataset)))
        val_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)

    # Create model
    model = MicroTransformer(
        vocab_size=vocab_size,
        n_embd=hidden_dim,
        num_layers=num_layers,
        num_heads=num_heads,
        dropout=0.1,
        normalization=normalization,
        init_scheme=init_scheme,
    ).to(device)

    param_count = model.count_parameters()

    # Setup optimizer with weight decay
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Training loop with time limit.
    #
    # Reserve a minimum eval budget upfront so training can't starve the final
    # evaluation pass. Without the reserve, training consumes the entire
    # max_train_seconds and `evaluate()` only gets a best-effort 10s reprieve
    # — under shared-CPU contention (Docker + Ollama + coordinator on the same
    # host) that's not enough to load even one val batch and the run reports
    # `valLossEvalFailed=true` despite training fine. Live-repro on node-1
    # (linux container) and node-kike (Mac native) before this fix.
    #
    # Floor of 30s OR 10% of total budget, whichever is larger:
    #   - 60s budget   → 30s eval / 30s train
    #   - 600s budget  → 60s eval / 540s train
    #   - 1800s budget → 180s eval / 1620s train
    # Training also gets a hard floor of 50% of budget so a tiny budget still
    # gets reasonable training. `final_deadline` is `start + max_train_seconds`
    # so we don't compound clock drift via `now + 10`.
    MIN_EVAL_BUDGET_SEC = 30.0
    eval_budget_sec = max(MIN_EVAL_BUDGET_SEC, max_train_seconds * 0.10)
    training_budget_sec = max(max_train_seconds - eval_budget_sec, max_train_seconds * 0.5)
    start_time = time.time()
    deadline = start_time + training_budget_sec
    final_deadline = start_time + max_train_seconds
    step = 0
    best_val_loss = float('inf')
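
    # Sanity check of the split at the budgets quoted above (illustration
    # only; reproduces the comment's numbers):
    #
    #     >>> [(b, max(30.0, b * 0.10), max(b - max(30.0, b * 0.10), b * 0.5))
    #     ...  for b in (60, 600, 1800)]
    #     [(60, 30.0, 30.0), (600, 60.0, 540.0), (1800, 180.0, 1620.0)]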

    try:
        while time.time() < deadline:
            for batch in train_loader:
                if time.time() >= deadline:
                    break

                # Learning rate warmup
                if step < warmup_steps:
                    lr = learning_rate * (step + 1) / warmup_steps
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

                # Training step
                loss = train_step(model, batch, optimizer, device)
                step += 1

                # Log every step for the first 20 (proof-of-life on slow CPU
                # nodes where step 10 might not be reached in 60s), then every
                # 10th step to keep stdout noise down.
                if step <= 20 or step % 10 == 0:
                    current_lr = optimizer.param_groups[0]['lr']
                    progress = {
                        "step": step,
                        "loss": round(loss, 4),
                        "lr": round(current_lr, 6),
                    }
                    print(json.dumps(progress), flush=True)

            # Validation at end of each epoch — honour the deadline so a slow
            # val pass doesn't blow past max_train_seconds by minutes.
            val_loss = evaluate(model, val_loader, device, deadline=deadline)
            best_val_loss = min(best_val_loss, val_loss)

            if time.time() >= deadline:
                break

    except KeyboardInterrupt:
        pass

    # Final evaluation — time-capped against `final_deadline` (set above as
    # `start_time + max_train_seconds`). The reserved eval budget guarantees
    # we have at least max(30s, 10% of total) here, and the `count > 0` guard
    # in `evaluate()` ensures at least one batch always processes even under
    # tight margins.
    eval_start = time.time()
    print(
        json.dumps({
            "stage": "eval-start",
            "deadline_in_s": round(final_deadline - eval_start, 2),
            "val_batches": len(val_loader),
            "train_batches": len(train_loader),
        }),
        file=sys.stderr,
        flush=True,
    )
    final_train_loss = evaluate(model, train_loader, device, deadline=final_deadline)
    final_val_loss = evaluate(model, val_loader, device, deadline=final_deadline)
    eval_elapsed = time.time() - eval_start
    print(
        json.dumps({
            "stage": "eval-done",
            "elapsed_s": round(eval_elapsed, 2),
            "final_train_loss": round(final_train_loss, 4),
            "final_val_loss": round(final_val_loss, 4),
        }),
        file=sys.stderr,
        flush=True,
    )
    duration_ms = int((time.time() - start_time) * 1000)

    # Detect "no batches consumed" — either val_dataset was empty (set above)
    # or the deadline expired before the first val batch on a non-empty set.
    # In both cases `evaluate()` returned EVAL_FAILED_SENTINEL. Propagate
    # `valLossEvalFailed=true` so the Node parser can short-circuit the
    # `improved` comparison instead of trusting the sentinel directly.
    if final_val_loss >= EVAL_FAILED_SENTINEL:
        val_eval_failed = True
        if val_eval_failure_reason is None:
            val_eval_failure_reason = (
                f"final eval consumed 0 batches before deadline "
                f"(val_dataset_size={len(val_dataset)}, effective_batch_size={effective_batch_size})"
            )
            print(f"[train_micro] WARNING: {val_eval_failure_reason}", file=sys.stderr, flush=True)

    # Use json.dumps with allow_nan=False to fail loudly if any future code
    # path lets a NaN/Inf reach this point — Node JSON.parse can't handle
    # `NaN`/`Infinity` tokens and we'd lose the result silently otherwise.
    result_payload = {
        "result": {
            "finalLoss": round(final_train_loss, 4),
            "valLoss": round(final_val_loss, 4),
            "steps": step,
            "durationMs": duration_ms,
            "params": param_count,
            "vocabSize": vocab_size,
            "valLossEvalFailed": val_eval_failed,
            "valLossEvalFailureReason": val_eval_failure_reason,
        }
    }
    print(json.dumps(result_payload, allow_nan=False), flush=True)
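
    # A representative final stdout line (all values illustrative only; the
    # keys match result_payload above):
    #   {"result": {"finalLoss": 1.9321, "valLoss": 2.0144, "steps": 740,
    #    "durationMs": 90012, "params": 212801, "vocabSize": 61,
    #    "valLossEvalFailed": false, "valLossEvalFailureReason": null}}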


if __name__ == '__main__':
    main()