@synapseia-network/node 0.8.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/LICENSE +105 -0
  2. package/README.md +232 -0
  3. package/dist/bid-responder-Q725ZIUC.js +86 -0
  4. package/dist/bootstrap.js +22 -0
  5. package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
  6. package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
  7. package/dist/chunk-2X7MSWD4.js +270 -0
  8. package/dist/chunk-3BHRQWSM.js +531 -0
  9. package/dist/chunk-5QFTU52A.js +442 -0
  10. package/dist/chunk-5ZAJBIAV.js +25 -0
  11. package/dist/chunk-7FLDR5NT.js +186 -0
  12. package/dist/chunk-C5XRYLYP.js +137 -0
  13. package/dist/chunk-D7ADMHK2.js +36 -0
  14. package/dist/chunk-DXUYWRO7.js +23 -0
  15. package/dist/chunk-F5UDK56Z.js +289 -0
  16. package/dist/chunk-NEHR6XY7.js +111 -0
  17. package/dist/chunk-NMJVODKH.js +453 -0
  18. package/dist/chunk-PRVT22SM.js +324 -0
  19. package/dist/chunk-T2ZRG5CX.js +1380 -0
  20. package/dist/chunk-V2L5SXTL.js +88 -0
  21. package/dist/chunk-XL2NJWFY.js +702 -0
  22. package/dist/embedding-C6GE3WVM.js +16 -0
  23. package/dist/hardware-ITQQJ5YI.js +37 -0
  24. package/dist/index.js +16836 -0
  25. package/dist/inference-server-CIGRJ36H.js +25 -0
  26. package/dist/local-cors-J6RWNMMD.js +44 -0
  27. package/dist/model-catalog-C53SDFMG.js +15 -0
  28. package/dist/model-discovery-LA6YMT3I.js +10 -0
  29. package/dist/ollama-XVXA3A37.js +9 -0
  30. package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
  31. package/dist/scripts/create_nodes.sh +6 -0
  32. package/dist/scripts/diloco_train.py +319 -0
  33. package/dist/scripts/train_lora.py +237 -0
  34. package/dist/scripts/train_micro.py +586 -0
  35. package/dist/trainer-HQMV2ZAR.js +21 -0
  36. package/package.json +128 -0
  37. package/scripts/create_nodes.sh +6 -0
  38. package/scripts/diloco_train.py +319 -0
  39. package/scripts/train_lora.py +237 -0
  40. package/scripts/train_micro.py +586 -0
@@ -0,0 +1,586 @@
+ #!/usr/bin/env python3
+ """
+ Micro-transformer training script for SynapseIA
+ Trains a tiny transformer (120K params) with configurable hyperparameters
+
+ Input: JSON via stdin with hyperparameters and dataset path
+ Output: JSON lines to stdout (progress updates + final result)
+ """
+
+ import json
+ import os
+ import sys
+ import time
+ import math
+ import random
+ from pathlib import Path
+ from typing import List, Dict, Any, Tuple, Optional
+
+ # Cap BLAS/MKL/OpenMP threadpools BEFORE importing torch. On arm64 under Docker
+ # `import torch` peaks at 600-800 MB RSS because MKL/OpenMP pre-allocate one
+ # thread per core; capping to 1 shaves ~100-300 MB and avoids cgroup OOM-kills
+ # when the host is tight on RAM (node + ollama + pubmedbert + coordinator).
+ # These env vars MUST be set before torch is imported — too late afterwards.
+ for _var in ('OMP_NUM_THREADS', 'MKL_NUM_THREADS', 'OPENBLAS_NUM_THREADS',
+              'NUMEXPR_NUM_THREADS', 'VECLIB_MAXIMUM_THREADS'):
+     os.environ.setdefault(_var, '1')
+
+ # Pre-import heartbeat: proves Python itself started. If the next run fails
+ # WITHOUT this line appearing in stdout, Python never booted (bad interpreter,
+ # missing script, cgroup killed the fork). If this appears but the post-import
+ # heartbeat further down does NOT, the OOM happened during `import torch`.
+ print(json.dumps({"stage": "pre-import", "pid": os.getpid()}), flush=True)
+
+ import torch
+ import torch.nn as nn
+ from torch.utils.data import Dataset, DataLoader
+
+
+ class CharTokenizer:
+     """Simple character-level tokenizer"""
+
+     def __init__(self, text: str):
+         self.chars = sorted(list(set(text)))
+         self.vocab_size = len(self.chars)
+         self.char_to_idx = {ch: i for i, ch in enumerate(self.chars)}
+         self.idx_to_char = {i: ch for i, ch in enumerate(self.chars)}
+
+     def encode(self, text: str) -> List[int]:
+         return [self.char_to_idx.get(ch, 0) for ch in text]
+
+     def decode(self, indices: List[int]) -> str:
+         return ''.join([self.idx_to_char.get(i, '') for i in indices])
+
+
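A quick illustration of the tokenizer's behaviour (editorial example, not package code): the vocabulary comes from the constructor text itself, and `encode` maps unseen characters to index 0, so round-trips are only lossless over the original alphabet.

    tok = CharTokenizer("abcab")
    print(tok.vocab_size)                  # 3, vocabulary is ['a', 'b', 'c']
    print(tok.encode("cab"))               # [2, 0, 1]
    print(tok.decode(tok.encode("cab")))   # 'cab'
    print(tok.decode(tok.encode("z")))     # 'a', unknown 'z' fell back to index 0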
+ class TextDataset(Dataset):
+     """Dataset for language modeling"""
+
+     def __init__(self, data: str, tokenizer: CharTokenizer, seq_length: int = 128):
+         self.data = tokenizer.encode(data)
+         self.seq_length = seq_length
+
+     def __len__(self) -> int:
+         return max(0, len(self.data) - self.seq_length - 1)
+
+     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
+         x = torch.tensor(self.data[idx:idx + self.seq_length], dtype=torch.long)
+         y = torch.tensor(self.data[idx + 1:idx + self.seq_length + 1], dtype=torch.long)
+         return x, y
+
+
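Each item is a next-character prediction pair: `y` is `x` shifted one position right (editorial sketch, not package code):

    tok = CharTokenizer("hello world")
    ds = TextDataset("hello world", tok, seq_length=4)
    print(len(ds))                  # 6 = max(0, 11 - 4 - 1)
    x, y = ds[0]
    print(tok.decode(x.tolist()))   # 'hell'
    print(tok.decode(y.tolist()))   # 'ello', targets are the next characters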
+ class RMSNorm(nn.Module):
+     """Root Mean Square Layer Normalization"""
+
+     def __init__(self, dim: int, eps: float = 1e-6):
+         super().__init__()
+         self.eps = eps
+         self.weight = nn.Parameter(torch.ones(dim))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         norm = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+         return norm * self.weight
+
+
+ class Head(nn.Module):
+     """Single attention head"""
+
+     def __init__(self, head_size: int, n_embd: int, dropout: float = 0.1):
+         super().__init__()
+         self.key = nn.Linear(n_embd, head_size, bias=False)
+         self.query = nn.Linear(n_embd, head_size, bias=False)
+         self.value = nn.Linear(n_embd, head_size, bias=False)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         B, T, C = x.shape
+         k = self.key(x)    # (B, T, head_size)
+         q = self.query(x)  # (B, T, head_size)
+
+         # Attention scores with an explicit causal mask. Masking on a
+         # lower-triangular boolean (rather than tril-ing the scores and
+         # then masking zeros) avoids wrongly masking legitimate attention
+         # scores that happen to be exactly 0.
+         wei = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)  # (B, T, T)
+         causal = torch.tril(torch.ones(T, T, dtype=torch.bool, device=x.device))
+         wei = wei.masked_fill(~causal, float('-inf'))
+         wei = torch.softmax(wei, dim=-1)
+         wei = self.dropout(wei)
+
+         v = self.value(x)  # (B, T, head_size)
+         out = wei @ v      # (B, T, head_size)
+         return out
+
+
+ class MultiHeadAttention(nn.Module):
+     """Multiple attention heads in parallel"""
+
+     def __init__(self, num_heads: int, head_size: int, n_embd: int, dropout: float = 0.1):
+         super().__init__()
+         self.heads = nn.ModuleList([Head(head_size, n_embd, dropout) for _ in range(num_heads)])
+         self.proj = nn.Linear(n_embd, n_embd)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         out = torch.cat([h(x) for h in self.heads], dim=-1)
+         out = self.proj(out)
+         out = self.dropout(out)
+         return out
+
+
+ class FeedForward(nn.Module):
+     """Feed-forward layer"""
+
+     def __init__(self, n_embd: int, dropout: float = 0.1):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(n_embd, 4 * n_embd),
+             nn.GELU(),
+             nn.Linear(4 * n_embd, n_embd),
+             nn.Dropout(dropout),
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.net(x)
+
+
+ class Block(nn.Module):
+     """Transformer block"""
+
+     def __init__(self, n_embd: int, num_heads: int, dropout: float = 0.1, normalization: str = 'layernorm'):
+         super().__init__()
+         head_size = n_embd // num_heads
+         self.sa = MultiHeadAttention(num_heads, head_size, n_embd, dropout)
+         self.ffwd = FeedForward(n_embd, dropout)
+
+         if normalization == 'rmsnorm':
+             self.ln1 = RMSNorm(n_embd)
+             self.ln2 = RMSNorm(n_embd)
+         else:
+             self.ln1 = nn.LayerNorm(n_embd)
+             self.ln2 = nn.LayerNorm(n_embd)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = x + self.sa(self.ln1(x))
+         x = x + self.ffwd(self.ln2(x))
+         return x
+
+
+ class MicroTransformer(nn.Module):
+     """Micro transformer for language modeling (~120K params)"""
+
+     def __init__(
+         self,
+         vocab_size: int,
+         n_embd: int = 128,
+         num_layers: int = 4,
+         num_heads: int = 4,
+         dropout: float = 0.1,
+         normalization: str = 'layernorm',
+         init_scheme: str = 'xavier',
+     ):
+         super().__init__()
+         self.token_embedding = nn.Embedding(vocab_size, n_embd)
+         self.position_embedding = nn.Embedding(512, n_embd)  # Max 512 positions
+         self.blocks = nn.Sequential(*[
+             Block(n_embd, num_heads, dropout, normalization)
+             for _ in range(num_layers)
+         ])
+         self.ln_f = RMSNorm(n_embd) if normalization == 'rmsnorm' else nn.LayerNorm(n_embd)
+         self.lm_head = nn.Linear(n_embd, vocab_size)
+         self.dropout = nn.Dropout(dropout)
+
+         # Apply initialization scheme
+         self._apply_init(init_scheme)
+
+     def _apply_init(self, scheme: str):
+         """Apply weight initialization scheme"""
+         for name, p in self.named_parameters():
+             if 'weight' in name and p.dim() >= 2:
+                 if scheme == 'xavier':
+                     nn.init.xavier_uniform_(p)
+                 elif scheme == 'kaiming':
+                     nn.init.kaiming_uniform_(p, nonlinearity='relu')
+                 elif scheme == 'normal':
+                     nn.init.normal_(p, mean=0, std=0.02)
+             elif 'bias' in name:
+                 nn.init.zeros_(p)
+
+     def forward(self, idx: torch.Tensor) -> torch.Tensor:
+         B, T = idx.shape
+
+         tok_emb = self.token_embedding(idx)  # (B, T, n_embd)
+         pos_emb = self.position_embedding(torch.arange(T, device=idx.device))  # (T, n_embd)
+         x = self.dropout(tok_emb + pos_emb)  # (B, T, n_embd)
+         x = self.blocks(x)        # (B, T, n_embd)
+         x = self.ln_f(x)          # (B, T, n_embd)
+         logits = self.lm_head(x)  # (B, T, vocab_size)
+
+         return logits
+
+     def count_parameters(self) -> int:
+         return sum(p.numel() for p in self.parameters())
+
+
+ def load_data(data_path: str) -> str:
+     """Load text data from file"""
+     path = Path(data_path)
+     if not path.exists():
+         # Create sample data if file doesn't exist
+         sample_text = """Astrophysics is the branch of astronomy that employs the principles of physics and chemistry to explain the nature of celestial objects. Stars, galaxies, planets, and other objects in the universe emit radiation across the electromagnetic spectrum. Astronomers use telescopes to detect and analyze this radiation, learning about the composition, structure, and evolution of cosmic objects.
+
+ The study of stars involves understanding their life cycles, from formation in nebulae through main sequence evolution to their final states as white dwarfs, neutron stars, or black holes. Stellar nucleosynthesis creates the chemical elements that make up planets and life.
+
+ Galaxies are vast collections of stars, gas, dust, and dark matter bound together by gravity. The Milky Way is a barred spiral galaxy containing billions of stars. At the centers of most large galaxies lie supermassive black holes millions to billions of times the mass of our Sun.
+
+ Cosmology studies the origin and evolution of the universe as a whole. The Big Bang theory describes the universe expanding from an extremely hot, dense state approximately 13.8 billion years ago. Dark matter and dark energy remain mysterious components that dominate the mass-energy content of the universe.
+
+ Observational astrophysics spans the entire electromagnetic spectrum from radio waves to gamma rays. Space telescopes like Hubble and James Webb observe infrared, visible, and ultraviolet light above Earth's atmosphere. Radio telescopes detect cold gas and energetic processes in galaxies.
+
+ Computational astrophysics uses numerical simulations to model complex systems like galaxy formation, stellar interiors, and accretion disks around black holes. Machine learning increasingly helps analyze vast datasets from sky surveys."""
+         return sample_text
+
+     return path.read_text(encoding='utf-8')
+
+
+ def split_data(text: str, train_ratio: float = 0.9) -> Tuple[str, str]:
+     """Split text into train and validation sets"""
+     n = len(text)
+     train_size = int(n * train_ratio)
+     return text[:train_size], text[train_size:]
+
+
+ def get_activation(name: str):
+     """Get activation function by name.
+
+     Note: currently unused. FeedForward hardcodes nn.GELU(), so the
+     'activation' hyperparameter read in main() is a no-op for now.
+     """
+     activations = {
+         'gelu': nn.GELU(),
+         'relu': nn.ReLU(),
+         'silu': nn.SiLU(),
+     }
+     return activations.get(name, nn.GELU())
+
+
+ def train_step(model: nn.Module, batch: Tuple[torch.Tensor, torch.Tensor], optimizer: torch.optim.Optimizer, device: str) -> float:
+     """Single training step"""
+     model.train()
+     x, y = batch
+     x, y = x.to(device), y.to(device)
+
+     logits = model(x)
+     B, T, C = logits.shape
+     loss = nn.functional.cross_entropy(logits.view(B * T, C), y.view(B * T))
+
+     optimizer.zero_grad()
+     loss.backward()
+     optimizer.step()
+
+     return loss.item()
+
+
+ # Sentinel value emitted in `valLoss` when the validation loader yields zero
+ # batches (empty val set, or batch_size larger than val_dataset). 1e30 is
+ # JSON-safe (json.dumps emits a literal `1e+30`, not the non-standard
+ # `Infinity` token that Node's JSON.parse rejects) and, for any practical
+ # baseline `currentBestLoss`, compares as "definitely worse" so the Node-side
+ # `improved = valLoss < currentBestLoss` check stays correct. The Node
+ # parser ALSO inspects the `valLossEvalFailed` boolean and short-circuits
+ # the `improved` comparison there — the sentinel is the belt, the boolean
+ # is the suspenders.
+ #
+ # TS counterpart: TRAINER_EVAL_FAILED_SENTINEL in
+ # packages/node/src/modules/model/trainer.ts — both must stay numerically
+ # identical. Grep both names if you change the value.
+ EVAL_FAILED_SENTINEL = 1e30
+
+
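The JSON-safety claim in the comment above is straightforward to verify (editorial snippet; `current_best_loss` stands in for the Node-side baseline):

    import json

    print(json.dumps(1e30))              # '1e+30', standard JSON that Node can parse
    try:
        json.dumps(float('inf'), allow_nan=False)
    except ValueError as e:
        print(e)                         # infinities fail loudly instead of emitting 'Infinity'

    current_best_loss = 2.71             # any realistic baseline
    print(1e30 < current_best_loss)      # False, so the sentinel never reads as an improvement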
+ def evaluate(model: nn.Module, dataloader: DataLoader, device: str, deadline: Optional[float] = None) -> float:
+     """Evaluate model on validation set. Honours ``deadline``, a wall-clock
+     ``time.time()`` timestamp (the same clock ``main`` uses to compute its
+     deadlines).
+
+     Returns ``EVAL_FAILED_SENTINEL`` (1e30) when no batches were consumed —
+     either because the dataloader is empty (val set smaller than batch_size)
+     or the deadline expired before the first batch. Returning 0.0 here was
+     a silent quality bug: the Node side initialises ``bestLoss=Infinity``
+     and computes ``improved = valLoss < bestLoss`` — a 0.0 always wins,
+     which had nodes claiming improvement on no-eval runs and getting paid
+     rewards for them. The sentinel preserves the comparison invariant without
+     breaking the JSON contract (see the EVAL_FAILED_SENTINEL comment above).
+     """
+     model.eval()
+     total_loss = 0
+     count = 0
+
+     with torch.no_grad():
+         for batch in dataloader:
+             x, y = batch
+             x, y = x.to(device), y.to(device)
+
+             logits = model(x)
+             B, T, C = logits.shape
+             loss = nn.functional.cross_entropy(logits.view(B * T, C), y.view(B * T))
+
+             total_loss += loss.item()
+             count += 1
+
+             # Deadline check AFTER processing the batch, so at least one
+             # batch always completes even when the deadline is already past
+             # at entry (slow batch loader, tight reserve, etc). A
+             # top-of-loop break was the root cause of "0 batches before
+             # deadline" sentinels under shared-CPU contention. The
+             # `count > 0` guard is belt-and-suspenders: count was just
+             # incremented, so it always holds here.
+             if deadline is not None and time.time() > deadline and count > 0:
+                 break
+
+     if count == 0:
+         return EVAL_FAILED_SENTINEL
+     return total_loss / count
+
+
+ def main():
+     """Main training function"""
+     # Read hyperparameters from stdin
+     try:
+         input_data = sys.stdin.read()
+         config = json.loads(input_data)
+     except json.JSONDecodeError as e:
+         print(json.dumps({"error": f"Invalid JSON input: {str(e)}"}), file=sys.stderr)
+         sys.exit(1)
+
+     # Extract hyperparameters with defaults
+     learning_rate = config.get('learningRate', 0.001)
+     batch_size = config.get('batchSize', 32)
+     hidden_dim = config.get('hiddenDim', 128)
+     num_layers = config.get('numLayers', 4)
+     num_heads = config.get('numHeads', 4)
+     activation_name = config.get('activation', 'gelu')
+     normalization = config.get('normalization', 'layernorm')
+     init_scheme = config.get('initScheme', 'xavier')
+     warmup_steps = config.get('warmupSteps', 100)
+     weight_decay = config.get('weightDecay', 0.01)
+     max_train_seconds = config.get('maxTrainSeconds', 120)
+     data_path = config.get('dataPath', './data/astro-sample.txt')
+     hardware = config.get('hardware', 'cpu')
+
+     # Set device
+     device = 'cuda' if hardware == 'gpu' and torch.cuda.is_available() else 'cpu'
+
+     # CPU-constrained nodes (Docker) share cores with Ollama at 400%+. Halve
+     # torch's thread count (floor of 1) so autograd workers don't thrash
+     # against inference; unchecked, that scheduler stall can block stdout
+     # prints entirely.
+     if device == 'cpu':
+         try:
+             torch.set_num_threads(max(1, (torch.get_num_threads() or 1) // 2))
+         except Exception:
+             pass
+
+     # Emit a startup heartbeat BEFORE loading data/building the model so the
+     # Node parent can confirm the spawn reached Python code (diagnoses "no
+     # output for 10 minutes" timeouts caused by slow imports or data loading).
+     print(json.dumps({
+         "started": True,
+         "device": device,
+         "hardware": hardware,
+         "torchThreads": torch.get_num_threads(),
+         "maxTrainSeconds": max_train_seconds,
+     }), flush=True)
+
+     # Load and prepare data
+     text = load_data(data_path)
+     train_text, val_text = split_data(text)
+
+     # Create tokenizer
+     tokenizer = CharTokenizer(text)
+     vocab_size = tokenizer.vocab_size
+
+     # Create datasets
+     seq_length = 128
+     train_dataset = TextDataset(train_text, tokenizer, seq_length)
+     val_dataset = TextDataset(val_text, tokenizer, seq_length)
+
+     # Guard: dataset must have enough samples to train (domain corpus must be correct)
+     MIN_TRAIN_SAMPLES = 2
+     if len(train_dataset) < MIN_TRAIN_SAMPLES:
+         # Delete cached file so it gets re-downloaded on the next run
+         try:
+             Path(data_path).unlink(missing_ok=True)
+         except Exception:
+             pass
+         err_msg = (
+             f"Corpus too short for domain training: only {len(text)} chars in '{data_path}'. "
+             f"Need at least {seq_length + 2} chars to produce even one training sample. "
+             f"The coordinator must export a valid corpus before dispatching training work orders."
+         )
+         print(json.dumps({"error": err_msg}), flush=True)
+         sys.exit(1)
+
+     # Clamp batch_size to available samples (avoids empty-batch edge cases with small corpora)
+     effective_batch_size = min(batch_size, len(train_dataset))
+     if effective_batch_size != batch_size:
+         print(json.dumps({"warning": f"batch_size clamped from {batch_size} to {effective_batch_size} (only {len(train_dataset)} training samples)"}), flush=True)
+
+     train_loader = DataLoader(train_dataset, batch_size=effective_batch_size, shuffle=True)
+
+     # Val set must produce at least one batch. With the previous
+     # `batch_size=max(1, min(effective_batch_size, len(val_dataset)))`,
+     # a val_dataset of length 0 silently produced an empty DataLoader and
+     # `evaluate()` returned 0.0 — Node treated the run as "improved" against
+     # its Infinity baseline and paid out rewards for a no-eval submission.
+     # Cap val batch size at len(val_dataset) so even a tiny val set yields
+     # one batch; if val_dataset is empty, log to stderr and the eval call
+     # below will return EVAL_FAILED_SENTINEL via the empty-loader branch.
+     val_eval_failed = False
+     val_eval_failure_reason: Optional[str] = None
+     if len(val_dataset) == 0:
+         val_eval_failed = True
+         val_eval_failure_reason = (
+             f"val set empty: 0 samples (corpus_chars={len(text)}, train_ratio=0.9, "
+             f"seq_length={seq_length}, effective_batch_size={effective_batch_size}). "
+             "Domain corpus is too small to carve out a usable validation split."
+         )
+         print(f"[train_micro] WARNING: {val_eval_failure_reason}", file=sys.stderr, flush=True)
+         val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
+     else:
+         val_batch_size = max(1, min(effective_batch_size, len(val_dataset)))
+         val_loader = DataLoader(val_dataset, batch_size=val_batch_size, shuffle=False)
+
+     # Create model
+     model = MicroTransformer(
+         vocab_size=vocab_size,
+         n_embd=hidden_dim,
+         num_layers=num_layers,
+         num_heads=num_heads,
+         dropout=0.1,
+         normalization=normalization,
+         init_scheme=init_scheme,
+     ).to(device)
+
+     param_count = model.count_parameters()
+
+     # Setup optimizer with weight decay
+     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+
+     # Training loop with time limit.
+     #
+     # Reserve a minimum eval budget upfront so training can't starve the final
+     # evaluation pass. Without the reserve, training consumes the entire
+     # max_train_seconds and `evaluate()` only gets a best-effort 10s reprieve
+     # — under shared-CPU contention (Docker + Ollama + coordinator on the same
+     # host) that's not enough to load even one val batch and the run reports
+     # `valLossEvalFailed=true` despite training fine. Live-repro on node-1
+     # (linux container) and node-kike (Mac native) before this fix.
+     #
+     # Floor of 30s OR 10% of total budget, whichever is larger:
+     #   -   60s budget →  30s eval /   30s train
+     #   -  600s budget →  60s eval /  540s train
+     #   - 1800s budget → 180s eval / 1620s train
+     # Training also gets a hard floor of 50% of budget so a tiny budget still
+     # gets reasonable training. `final_deadline` is `start + max_train_seconds`
+     # so we don't compound clock drift via `now + 10`.
+     MIN_EVAL_BUDGET_SEC = 30.0
+     eval_budget_sec = max(MIN_EVAL_BUDGET_SEC, max_train_seconds * 0.10)
+     training_budget_sec = max(max_train_seconds - eval_budget_sec, max_train_seconds * 0.5)
+     start_time = time.time()
+     deadline = start_time + training_budget_sec
+     final_deadline = start_time + max_train_seconds
+     step = 0
+     best_val_loss = float('inf')
+
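The budget split tabulated in the comment above can be sanity-checked with a few lines (editorial snippet, not package code):

    def budgets(max_train_seconds: float):
        eval_b = max(30.0, max_train_seconds * 0.10)
        train_b = max(max_train_seconds - eval_b, max_train_seconds * 0.5)
        return train_b, eval_b

    print(budgets(60))    # (30.0, 30.0)
    print(budgets(600))   # (540.0, 60.0)
    print(budgets(1800))  # (1620.0, 180.0)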
+     try:
+         while time.time() < deadline:
+             for batch in train_loader:
+                 if time.time() >= deadline:
+                     break
+
+                 # Learning rate warmup
+                 if step < warmup_steps:
+                     lr = learning_rate * (step + 1) / warmup_steps
+                     for param_group in optimizer.param_groups:
+                         param_group['lr'] = lr
+
+                 # Training step
+                 loss = train_step(model, batch, optimizer, device)
+                 step += 1
+
+                 # Log every step for the first 20 (proof-of-life on slow CPU
+                 # nodes where step 10 might not be reached in 60s), then every
+                 # 10 to keep stdout noise down.
+                 if step <= 20 or step % 10 == 0:
+                     current_lr = optimizer.param_groups[0]['lr']
+                     progress = {
+                         "step": step,
+                         "loss": round(loss, 4),
+                         "lr": round(current_lr, 6),
+                     }
+                     print(json.dumps(progress), flush=True)
+
+             # Validation at end of each epoch — honour the deadline so a slow
+             # val pass doesn't blow past max_train_seconds by minutes.
+             val_loss = evaluate(model, val_loader, device, deadline=deadline)
+             best_val_loss = min(best_val_loss, val_loss)
+
+             if time.time() >= deadline:
+                 break
+
+     except KeyboardInterrupt:
+         pass
+
+     # Final evaluation — time-capped against `final_deadline` (set above as
+     # `start_time + max_train_seconds`). The reserved eval budget guarantees
+     # we have at least max(30s, 10% of total) here, and the after-the-batch
+     # deadline check in `evaluate()` ensures at least one batch always
+     # processes even under tight margins.
+     eval_start = time.time()
+     print(
+         json.dumps({
+             "stage": "eval-start",
+             "deadline_in_s": round(final_deadline - eval_start, 2),
+             "val_batches": len(val_loader),
+             "train_batches": len(train_loader),
+         }),
+         file=sys.stderr,
+         flush=True,
+     )
+     final_train_loss = evaluate(model, train_loader, device, deadline=final_deadline)
+     final_val_loss = evaluate(model, val_loader, device, deadline=final_deadline)
+     eval_elapsed = time.time() - eval_start
+     print(
+         json.dumps({
+             "stage": "eval-done",
+             "elapsed_s": round(eval_elapsed, 2),
+             "final_train_loss": round(final_train_loss, 4),
+             "final_val_loss": round(final_val_loss, 4),
+         }),
+         file=sys.stderr,
+         flush=True,
+     )
+     duration_ms = int((time.time() - start_time) * 1000)
+
+     # Detect "no batches consumed" — either val_dataset was empty (set above)
+     # or the deadline expired before the first val batch on a non-empty set.
+     # In both cases `evaluate()` returned EVAL_FAILED_SENTINEL. Propagate
+     # `valLossEvalFailed=true` so the Node parser can short-circuit the
+     # `improved` comparison instead of trusting the sentinel directly.
+     if final_val_loss >= EVAL_FAILED_SENTINEL:
+         val_eval_failed = True
+         if val_eval_failure_reason is None:
+             val_eval_failure_reason = (
+                 f"final eval consumed 0 batches before deadline "
+                 f"(val_dataset_size={len(val_dataset)}, effective_batch_size={effective_batch_size})"
+             )
+         print(f"[train_micro] WARNING: {val_eval_failure_reason}", file=sys.stderr, flush=True)
+
+     # Use json.dumps with allow_nan=False to fail loudly if any future code
+     # path lets a NaN/Inf reach this point — Node JSON.parse can't handle
+     # `NaN`/`Infinity` tokens and we'd lose the result silently otherwise.
+     result_payload = {
+         "result": {
+             "finalLoss": round(final_train_loss, 4),
+             "valLoss": round(final_val_loss, 4),
+             "steps": step,
+             "durationMs": duration_ms,
+             "params": param_count,
+             "vocabSize": vocab_size,
+             "valLossEvalFailed": val_eval_failed,
+             "valLossEvalFailureReason": val_eval_failure_reason,
+         }
+     }
+     print(json.dumps(result_payload, allow_nan=False), flush=True)
+
+
+ if __name__ == '__main__':
+     main()
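The consuming side of this contract lives in packages/node/src/modules/model/trainer.ts. A Python paraphrase of the checks the comments above describe (a sketch under those assumptions; the real parser is TypeScript and its exact shape is not shown in this diff):

    import json

    TRAINER_EVAL_FAILED_SENTINEL = 1e30   # must stay numerically identical to EVAL_FAILED_SENTINEL

    def run_improved(line: str, current_best_loss: float) -> bool:
        """True only when the final result genuinely beats the baseline."""
        msg = json.loads(line)
        if "result" not in msg:
            return False                  # progress/heartbeat line, not the final payload
        result = msg["result"]
        if result.get("valLossEvalFailed"):
            return False                  # the boolean short-circuit (the "suspenders")
        if result["valLoss"] >= TRAINER_EVAL_FAILED_SENTINEL:
            return False                  # the sentinel comparison (the "belt")
        return result["valLoss"] < current_best_loss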
@@ -0,0 +1,21 @@
+ import { fileURLToPath as __synFup } from "url";import { dirname as __synDn } from "path";const __filename = __synFup(import.meta.url);const __dirname = __synDn(__filename);
+ import {
+   TRAINER_EVAL_FAILED_SENTINEL,
+   TRAINING_MEM_FLOOR_MB,
+   TrainerHelper,
+   calculateImprovement,
+   isPyTorchAvailable,
+   trainMicroModel,
+   validateTrainingConfig
+ } from "./chunk-5QFTU52A.js";
+ import "./chunk-V2L5SXTL.js";
+ import "./chunk-D7ADMHK2.js";
+ export {
+   TRAINER_EVAL_FAILED_SENTINEL,
+   TRAINING_MEM_FLOOR_MB,
+   TrainerHelper,
+   calculateImprovement,
+   isPyTorchAvailable,
+   trainMicroModel,
+   validateTrainingConfig
+ };