nexaroa-0.0.111-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/runner.py ADDED
@@ -0,0 +1,3223 @@
+ """
+ NeuroShard Node Runner
+ 
+ This is the main entry point for running a NeuroShard node.
+ The node participates in:
+ 1. Training NeuroLLM (our own model, trained from scratch by the network)
+ 2. Inference (generating text using the collective model)
+ 3. Earning NEURO tokens through Proof of Neural Work
+ 
+ TRULY DECENTRALIZED:
+ - No fixed model phases
+ - Model size grows with network capacity
+ - Each node contributes based on available memory
+ - More memory = more layers = more NEURO rewards
+ """
+ 
+ import argparse
+ import uvicorn
+ import threading
+ import torch  # Imported early for API endpoints
+ import time
+ import requests
+ import logging
+ import logging.handlers  # For RotatingFileHandler
+ import sys
+ import os
+ import socket
+ import uuid
+ import hashlib
+ import math
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ from pydantic import BaseModel
+ from typing import Optional, List
+ 
+ from neuroshard.core.model.dynamic import DynamicNeuroNode, create_dynamic_node
+ from neuroshard.core.model.tokenizer import get_neuro_tokenizer
+ from neuroshard.core.network.p2p import P2PManager
+ 
+ # Swarm Architecture Imports (Phase 4)
+ try:
+     from neuroshard.core.swarm.factory import (
+         SwarmEnabledDynamicNode,
+         SwarmNodeConfig,
+         create_swarm_node,
+         create_swarm_node_with_p2p,
+     )
+     SWARM_AVAILABLE = True
+ except ImportError:
+     SWARM_AVAILABLE = False
+ from neuroshard.core.economics.constants import (
+     is_valid_stake_amount,
+     is_valid_stake_duration,
+     VALIDATOR_BASE_STAKE,
+     get_dynamic_validator_stake,
+     get_validator_stake_info,
+ )
+ from neuroshard.ui.app import STATE, templates
+ from neuroshard.utils.serialization import deserialize_tensor, serialize_tensor
+ from neuroshard.grpc_server import start_grpc_background
+ from neuroshard.version import __version__
+ 
+ # Safe print for Windows frozen GUI mode (where stdout may be None)
+ _original_print = print
+ 
+ def _safe_print(*args, **kwargs):
+     """Print that works even when stdout is None (Windows GUI)."""
+     try:
+         if sys.stdout is not None:
+             _original_print(*args, **kwargs)
+     except (AttributeError, OSError, ValueError):
+         pass  # Silently ignore - logging will capture it
+ 
+ # Override print globally in this module
+ print = _safe_print
+ 
+ # Global shutdown flag for clean exit from GUI
+ _SHUTDOWN_REQUESTED = threading.Event()
+ _UVICORN_SERVER = None  # Global reference to uvicorn server for shutdown
+ 
+ def request_shutdown():
+     """Request graceful shutdown of the node. Called from GUI when stopping."""
+     global _UVICORN_SERVER, NEURO_NODE, P2P
+     logger.info("[NODE] Shutdown requested...")
+     _SHUTDOWN_REQUESTED.set()
+ 
+     # Stop gRPC server first (releases port)
+     try:
+         from neuroshard.grpc_server import stop_grpc
+         stop_grpc(timeout=3.0)
+     except Exception as e:
+         logger.error(f"[NODE] gRPC shutdown error: {e}")
+ 
+     # Stop the node first (sets is_running = False)
+     if NEURO_NODE:
+         try:
+             logger.info("[NODE] Stopping node...")
+             # Get base node for SwarmEnabledDynamicNode
+             base = getattr(NEURO_NODE, 'base_node', NEURO_NODE)
+             if hasattr(base, 'stop'):
+                 base.stop()
+             if hasattr(NEURO_NODE, 'stop') and NEURO_NODE != base:
+                 NEURO_NODE.stop()
+         except Exception as e:
+             logger.error(f"[NODE] Node stop error: {e}")
+ 
+     # Stop swarm components if enabled
+     if NEURO_NODE and hasattr(NEURO_NODE, 'stop_swarm_sync'):
+         try:
+             logger.info("[NODE] Stopping swarm components...")
+             NEURO_NODE.stop_swarm_sync()
+             logger.info("[NODE] Swarm components stopped.")
+         except Exception as e:
+             logger.error(f"[NODE] Swarm shutdown error: {e}")
+ 
+     # Save checkpoint before shutting down
+     if NEURO_NODE:
+         try:
+             logger.info("[NODE] Saving checkpoint before shutdown...")
+             # Force synchronous save during shutdown to ensure it completes
+             NEURO_NODE._save_checkpoint(async_save=False)
+             logger.info("[NODE] Checkpoint saved.")
+         except Exception as e:
+             logger.error(f"[NODE] Failed to save checkpoint: {e}")
+ 
+     # Wait for any ongoing async saves to complete
+     try:
+         from neuroshard.core.model.dynamic import DynamicNeuroNode
+         # Try to acquire the lock (will wait if async save in progress)
+         if DynamicNeuroNode._checkpoint_save_lock.acquire(timeout=30):
+             DynamicNeuroNode._checkpoint_save_lock.release()
+             logger.info("[NODE] All checkpoint saves completed.")
+     except Exception as e:
+         logger.warning(f"[NODE] Could not wait for checkpoint save: {e}")
+ 
+     # CRITICAL: Free memory by deleting model and data
+     try:
+         logger.info("[NODE] Freeing memory...")
+ 
+         # Clear genesis data
+         if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+             NEURO_NODE.genesis_loader.loaded_shards.clear()
+             NEURO_NODE.genesis_loader._prefetch_ready.clear()
+             NEURO_NODE.genesis_loader.current_dataset = None
+ 
+         # Get base node (for SwarmEnabledDynamicNode) or use directly
+         base = getattr(NEURO_NODE, 'base_node', NEURO_NODE)
+ 
+         # Delete optimizer (holds 2x model params in memory for Adam)
+         if hasattr(base, 'optimizer') and base.optimizer is not None:
+             del base.optimizer
+ 
+         # Delete model
+         if hasattr(base, 'model') and base.model is not None:
+             del base.model
+ 
+         # Force garbage collection
+         import gc
+         gc.collect()
+ 
+         # Clear GPU cache if applicable
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+             logger.info("[NODE] Cleared CUDA cache")
+         elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+             torch.mps.empty_cache()
+             logger.info("[NODE] Cleared MPS cache")
+ 
+         logger.info("[NODE] Memory freed")
+     except Exception as e:
+         logger.error(f"[NODE] Memory cleanup error: {e}")
+ 
+     # Stop P2P manager (stops background threads)
+     if P2P:
+         try:
+             P2P.stop()
+         except Exception as e:
+             logger.error(f"[NODE] P2P stop error: {e}")
+ 
+     # Stop uvicorn server
+     if _UVICORN_SERVER:
+         logger.info("[NODE] Stopping HTTP server...")
+         _UVICORN_SERVER.should_exit = True
+ 
+     # FORCE EXIT: Always force exit after 3 seconds regardless
+     # This handles nohup, daemon, and any other run mode
+     def force_exit():
+         import time as t_module
+         import os
+         import signal
+         t_module.sleep(3.0)
+         logger.warning("[NODE] Force exiting (server didn't stop gracefully)...")
+         # Try SIGTERM first (graceful)
+         try:
+             os.kill(os.getpid(), signal.SIGTERM)
+         except Exception:
+             pass
+         t_module.sleep(0.5)
+         # If still running, force kill
+         logger.warning("[NODE] Forcing process termination...")
+         os._exit(0)  # Force exit without cleanup
+ 
+     # Use non-daemon thread to ensure force_exit runs to completion
+     force_thread = threading.Thread(target=force_exit, daemon=False)
+     force_thread.start()
+     logger.info("[NODE] Force exit scheduled in 3 seconds...")
+ 
+     # Reset globals so next run starts fresh
+     NEURO_NODE = None
+     P2P = None
+     _UVICORN_SERVER = None
+ 
+ # Configure Logging - ensure all loggers use our format
+ # Clear any existing handlers first to prevent duplicates
+ root_logger = logging.getLogger()
+ root_logger.handlers = []  # Clear existing handlers
+ root_logger.setLevel(logging.INFO)
+ 
+ # --- In-memory log buffer for dashboard ---
+ from collections import deque
+ from datetime import datetime
+ 
+ # Circular buffer to store recent logs (max 500 entries)
+ _LOG_BUFFER = deque(maxlen=500)
+ _LOG_BUFFER_LOCK = threading.Lock()
+ 
+ class MemoryLogHandler(logging.Handler):
+     """Custom handler that stores logs in memory for dashboard API."""
+ 
+     # Auto-incrementing log ID for reliable polling
+     _log_id_counter = 0
+ 
+     def emit(self, record):
+         try:
+             msg = self.format(record)
+             # Store both display timestamp and epoch for sorting
+             epoch_ms = int(record.created * 1000)
+             timestamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S')
+ 
+             # Determine log type for filtering
+             log_type = 'info'
+             msg_lower = msg.lower()
+             if 'neuro' in msg_lower and ('earned' in msg_lower or 'reward' in msg_lower or '+' in msg):
+                 log_type = 'neuro'
+             elif 'error' in msg_lower or record.levelno >= logging.ERROR:
+                 log_type = 'error'
+             elif 'training' in msg_lower or 'diloco' in msg_lower or 'gradient' in msg_lower or 'batch' in msg_lower:
+                 log_type = 'training'
+             elif record.levelno >= logging.WARNING:
+                 log_type = 'warning'
+ 
+             with _LOG_BUFFER_LOCK:
+                 MemoryLogHandler._log_id_counter += 1
+                 _LOG_BUFFER.append({
+                     'id': MemoryLogHandler._log_id_counter,
+                     'epoch': epoch_ms,
+                     'timestamp': timestamp,
+                     'message': msg,
+                     'type': log_type,
+                     'level': record.levelname,
+                 })
+         except Exception:
+             pass  # Never fail logging
+ 
+ # Windows GUI apps (frozen) may have None stdout/stderr
+ # Create a safe handler that won't crash
+ def _create_safe_handler():
+     """Create a logging handler that works even when stdout is None (Windows GUI)."""
+     # Check if stdout is usable
+     if sys.stdout is not None and hasattr(sys.stdout, 'write'):
+         try:
+             # Test if it's actually writable
+             sys.stdout.write('')
+             sys.stdout.flush()
+             return logging.StreamHandler(sys.stdout)
+         except (AttributeError, OSError, ValueError):
+             pass
+ 
+     # Fallback: log to file in .neuroshard directory
+     log_dir = os.path.join(os.path.expanduser("~"), ".neuroshard")
+     os.makedirs(log_dir, exist_ok=True)
+     log_file = os.path.join(log_dir, "node.log")
+ 
+     # Rotate logs - keep last 5MB
+     return logging.handlers.RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=2, encoding='utf-8')
+ 
+ handler = _create_safe_handler()
+ handler.setFormatter(logging.Formatter('[NODE] %(message)s'))
+ root_logger.addHandler(handler)
+ 
+ # Add memory handler for dashboard logs API
+ memory_handler = MemoryLogHandler()
+ memory_handler.setFormatter(logging.Formatter('%(message)s'))
+ memory_handler.setLevel(logging.INFO)
+ root_logger.addHandler(memory_handler)
+ 
+ # Also configure neuroshard module loggers explicitly
+ for module in ['neuroshard.core.p2p', 'neuroshard.core.ledger', 'neuroshard.core.dynamic_model',
+                'neuroshard.core.distributed_training', 'neuroshard.core.dht_service']:
+     module_logger = logging.getLogger(module)
+     module_logger.setLevel(logging.INFO)
+     module_logger.propagate = True  # Propagate to root logger
+ 
+ # Create logger for this module
+ logger = logging.getLogger(__name__)
+ 
+ # --- Main API App ---
+ node_app = FastAPI(title="NeuroShard Node", version=__version__)
+ # Serve dashboard at root
+ from fastapi.responses import HTMLResponse
+ 
+ @node_app.get("/", response_class=HTMLResponse)
+ async def serve_dashboard(request: Request):
+     """Serve the main dashboard at root."""
+     return templates.TemplateResponse("index.html", {"request": request})
+ 
+ # Shared State
+ NEURO_NODE: Optional[DynamicNeuroNode] = None
+ P2P: Optional[P2PManager] = None
+ SESSION_TIMESTAMPS = {}
+ 
+ def get_app():
+     return node_app
+ 
+ 
+ class InferenceRequest(BaseModel):
+     tensor_data: str
+     request_id: str
+     session_id: Optional[str] = None
+     sender_reputation: float = 100.0
+ 
+ 
+ class TextGenerationRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 50
+     temperature: float = 1.0
+     top_k: int = 50
+     top_p: float = 0.9
+ 
+ 
+ class TrainingDataRequest(BaseModel):
+     text: str
+     apply_dp: bool = True  # Apply differential privacy
+ 
+ 
+ # ==================== INFERENCE ENDPOINTS ====================
+ 
+ @node_app.post("/generate_text")
+ async def generate_text(req: TextGenerationRequest):
+     """
+     Generate text using NeuroLLM.
+ 
+     Note: Early in the network's life, output will be mostly random.
+     As more users train the model, quality will improve!
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     try:
+         generated = NEURO_NODE.generate(
+             prompt=req.prompt,
+             max_new_tokens=req.max_new_tokens,
+             temperature=req.temperature,
+         )
+ 
+         STATE["processed_count"] = STATE.get("processed_count", 0) + 1
+         STATE["token_count"] = NEURO_NODE.total_tokens_processed
+ 
+         return {
+             "text": generated,
+             "my_layers": NEURO_NODE.my_layer_ids,
+             "total_training_rounds": NEURO_NODE.total_training_rounds,
+             "note": "Quality improves as more users train the model!"
+         }
+ 
+     except Exception as e:
+         return {"error": str(e)}
+ 
+ 
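For reference, calling this endpoint is a single request. A minimal client sketch (the port is a placeholder for whatever the runner listens on; note the handler forwards only prompt, max_new_tokens, and temperature to the model, so the schema's top_k/top_p are accepted but unused here):

import requests

NODE_URL = "http://127.0.0.1:8000"  # placeholder; use your node's port

resp = requests.post(f"{NODE_URL}/generate_text", json={
    "prompt": "The future of decentralized AI is",
    "max_new_tokens": 50,
    "temperature": 0.8,
}).json()

print(resp.get("text"))                   # generated continuation
print(resp.get("total_training_rounds"))  # rough proxy for model maturity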
+ @node_app.post("/forward")
+ async def forward(req: InferenceRequest):
+     """Forward pass for distributed inference pipeline."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     try:
+         STATE["processed_count"] = STATE.get("processed_count", 0) + 1
+ 
+         if req.session_id:
+             SESSION_TIMESTAMPS[req.session_id] = time.time()
+ 
+         # Deserialize input
+         input_tensor = deserialize_tensor(req.tensor_data)
+ 
+         # Forward through NeuroLLM
+         output = NEURO_NODE.forward(input_tensor, session_id=req.session_id)
+ 
+         # Update token count
+         STATE["token_count"] = NEURO_NODE.total_tokens_processed
+ 
+         # Return result (NeuroLLM is always a full model, no pipeline needed)
+         return {"result": serialize_tensor(output)}
+ 
+     except Exception as e:
+         return {"error": str(e)}
+ 
+ 
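A sketch of how one pipeline hop could call this endpoint, reusing the package's own serialization helpers (peer address and tensor shape are placeholders; the wire format is whatever serialize_tensor/deserialize_tensor in neuroshard.utils.serialization implement):

import uuid

import requests
import torch

from neuroshard.utils.serialization import serialize_tensor, deserialize_tensor

PEER_URL = "http://peer.example:8000"  # placeholder peer address
hidden = torch.randn(1, 16, 512)       # placeholder activation tensor

resp = requests.post(f"{PEER_URL}/forward", json={
    "tensor_data": serialize_tensor(hidden),
    "request_id": uuid.uuid4().hex,
    "session_id": "demo-session",
}).json()

if "result" in resp:
    output = deserialize_tensor(resp["result"])  # input for the next hop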
409
+ # ==================== TRAINING ENDPOINTS ====================
410
+
411
+ @node_app.post("/contribute_data")
412
+ async def contribute_training_data(req: TrainingDataRequest):
413
+ """
414
+ Contribute training data to help train NeuroLLM.
415
+
416
+ Your data is processed locally with differential privacy.
417
+ You earn NEURO tokens for contributing!
418
+ """
419
+ if not NEURO_NODE:
420
+ raise HTTPException(status_code=503, detail="Node not ready")
421
+
422
+ if not NEURO_NODE.enable_training:
423
+ raise HTTPException(status_code=400, detail="Training not enabled on this node")
424
+
425
+ try:
426
+ tokens_added = NEURO_NODE.contribute_training_data(req.text, apply_dp=req.apply_dp)
427
+
428
+ data_stats = NEURO_NODE.data_manager.get_stats() if NEURO_NODE.data_manager else {}
429
+
430
+ return {
431
+ "success": True,
432
+ "message": "Data added to training buffer",
433
+ "tokens_added": tokens_added or 0,
434
+ "buffer_size": data_stats.get("buffer_size", 0),
435
+ "total_tokens": data_stats.get("total_tokens", 0),
436
+ }
437
+
438
+ except Exception as e:
439
+ return {"error": str(e)}
440
+
441
+
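A minimal contribution call against a node started with training enabled (URL is a placeholder):

import requests

NODE_URL = "http://127.0.0.1:8000"  # placeholder

resp = requests.post(f"{NODE_URL}/contribute_data", json={
    "text": "Example text for the local training buffer.",
    "apply_dp": True,  # keep differential privacy enabled
}).json()

print(resp.get("tokens_added"), "tokens added,",
      resp.get("buffer_size"), "items in buffer")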
+ @node_app.post("/train_step")
+ async def trigger_train_step():
+     """Manually trigger a training step (for testing)."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     loss = NEURO_NODE.train_step()
+ 
+     if loss is None:
+         return {"success": False, "message": "Not enough training data in buffer"}
+ 
+     return {
+         "success": True,
+         "loss": loss,
+         "total_training_rounds": NEURO_NODE.total_training_rounds
+     }
+ 
+ 
+ @node_app.get("/training_status")
+ async def get_training_status():
+     """Get current training status."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     # Sanitize loss for JSON
+     current_loss = NEURO_NODE.current_loss
+     if math.isinf(current_loss) or math.isnan(current_loss):
+         current_loss = None
+ 
+     return {
+         "training_enabled": NEURO_NODE.enable_training,
+         "total_training_rounds": NEURO_NODE.total_training_rounds,
+         "current_loss": current_loss,
+         "training_contributions": NEURO_NODE.training_contribution_count,
+         "data_buffer": NEURO_NODE.data_manager.get_stats() if NEURO_NODE.data_manager else None,
+         "my_layers": NEURO_NODE.my_layer_ids,
+     }
+ 
+ 
+ @node_app.get("/api/training/global")
+ async def get_global_training_status():
+     """
+     Get GLOBAL training verification status.
+ 
+     This endpoint answers the question: "Is the distributed training ACTUALLY working?"
+ 
+     Key metrics:
+     - training_verified: True if we can confirm the model is improving
+     - is_converging: True if the network appears to be converging
+     - hash_agreement_rate: % of nodes with the same model hash (should be 100%)
+     - global_avg_loss: Average loss across all network nodes
+     - sync_success_rate: % of gradient syncs that succeeded
+ 
+     If hash_agreement_rate < 100%, nodes have diverged and training is NOT coordinated!
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     # Get global status from swarm-enabled node
+     if hasattr(NEURO_NODE, 'get_global_training_status'):
+         global_status = NEURO_NODE.get_global_training_status()
+     else:
+         # Fallback for non-swarm nodes
+         global_status = {
+             "error": "Node does not support global training tracking",
+             "training_verified": False,
+             "is_converging": False,
+         }
+ 
+     # Add local context (sanitize float values for JSON)
+     current_loss = NEURO_NODE.current_loss
+     if math.isinf(current_loss) or math.isnan(current_loss):
+         current_loss = None
+ 
+     # Get model hash from global tracker if available
+     model_hash = ""
+     if hasattr(NEURO_NODE, '_global_tracker') and NEURO_NODE._global_tracker:
+         local_status = NEURO_NODE._global_tracker.get_local_status()
+         model_hash = local_status.get('model_hash', '')
+ 
+     global_status["local"] = {
+         "node_id": NEURO_NODE.node_id[:16],
+         "training_rounds": NEURO_NODE.total_training_rounds,
+         "current_loss": current_loss,
+         "is_training": NEURO_NODE.enable_training,
+         "model_hash": model_hash,
+     }
+ 
+     # Add DiLoCo status if available
+     if hasattr(NEURO_NODE, 'get_diloco_progress'):
+         global_status["diloco"] = NEURO_NODE.get_diloco_progress()
+ 
+     return global_status
+ 
+ 
+ @node_app.get("/api/training/verify")
+ async def verify_training():
+     """
+     Quick verification endpoint - answers: "Is training working?"
+ 
+     Returns a simple yes/no with explanation.
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     if not NEURO_NODE.enable_training:
+         return {
+             "is_working": False,
+             "reason": "Training is not enabled on this node",
+             "action": "Start the node with --train flag",
+         }
+ 
+     # Check if we have enough training data
+     if NEURO_NODE.total_training_rounds < 10:
+         return {
+             "is_working": "insufficient_data",
+             "reason": f"Only {NEURO_NODE.total_training_rounds} training steps completed",
+             "action": "Wait for more training steps (need 10+ for verification)",
+         }
+ 
+     # Get global status
+     if hasattr(NEURO_NODE, 'get_global_training_status'):
+         global_status = NEURO_NODE.get_global_training_status()
+ 
+         is_working = global_status.get("training_verified", False)
+         is_converging = global_status.get("is_converging", False)
+         hash_agreement = global_status.get("hash_agreement_rate", 0)
+ 
+         if is_working and is_converging:
+             return {
+                 "is_working": True,
+                 "reason": "Training verified! Loss is decreasing and network is converging.",
+                 "metrics": {
+                     "loss_trend": global_status.get("loss_trend", "unknown"),
+                     "hash_agreement": f"{hash_agreement*100:.1f}%",
+                     "global_loss": global_status.get("global_avg_loss", 0),
+                 },
+             }
+         elif not is_converging and hash_agreement < 0.5:
+             return {
+                 "is_working": False,
+                 "reason": f"Network NOT converging! Only {hash_agreement*100:.1f}% hash agreement.",
+                 "action": "Nodes have diverged. Check gradient sync is working.",
+             }
+         else:
+             return {
+                 "is_working": "partial",
+                 "reason": "Training running but not yet verified as improving.",
+                 "action": "Continue training - need more data for verification.",
+             }
+ 
+     # Fallback: check if loss is decreasing
+     loss = NEURO_NODE.current_loss
+     if loss < 1.0:
+         return {
+             "is_working": True,
+             "reason": f"Loss is {loss:.4f} which is reasonable for training.",
+         }
+     else:
+         return {
+             "is_working": "unknown",
+             "reason": "Cannot verify without global tracker.",
+             "action": "Check loss values in logs - should be decreasing.",
+         }
+ 
+ 
+ @node_app.get("/api/training/history")
+ async def get_local_training_history():
+     """
+     Get LOCAL loss history to verify model is improving.
+ 
+     Returns loss checkpoints recorded during training.
+     Use this to see if YOUR node's training is working.
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     result = {
+         "total_steps": NEURO_NODE.total_training_rounds,
+         "current_loss": NEURO_NODE.current_loss if NEURO_NODE.current_loss != float('inf') else None,
+         "loss_checkpoints": [],
+         "loss_trend": "unknown",
+         "improvement_percent": 0.0,
+         "training_verified": False,
+         "analysis": {},
+     }
+ 
+     # Get loss checkpoints from global tracker
+     if hasattr(NEURO_NODE, '_global_tracker') and NEURO_NODE._global_tracker:
+         tracker = NEURO_NODE._global_tracker
+ 
+         # Get loss checkpoints (list of (step, loss) tuples)
+         checkpoints = getattr(tracker, '_loss_checkpoints', [])
+         result["loss_checkpoints"] = [
+             {"step": step, "loss": round(loss, 4)}
+             for step, loss in checkpoints
+         ]
+ 
+         # Analyze trend
+         if len(checkpoints) >= 5:
+             losses = [loss for _, loss in checkpoints]
+ 
+             # Compare first 20% to last 20%
+             n = len(losses)
+             first_n = max(1, n // 5)
+             first_avg = sum(losses[:first_n]) / first_n
+             last_avg = sum(losses[-first_n:]) / first_n
+ 
+             if first_avg > 0:
+                 improvement = (first_avg - last_avg) / first_avg * 100
+                 result["improvement_percent"] = round(improvement, 2)
+ 
+                 if improvement > 10:
+                     result["loss_trend"] = "improving_strongly"
+                     result["training_verified"] = True
+                 elif improvement > 2:
+                     result["loss_trend"] = "improving"
+                     result["training_verified"] = True
+                 elif improvement > -2:
+                     result["loss_trend"] = "stable"
+                     result["training_verified"] = n > 20  # Stable after many steps = converged
+                 elif improvement > -10:
+                     result["loss_trend"] = "degrading_slightly"
+                 else:
+                     result["loss_trend"] = "degrading"
+ 
+             result["analysis"] = {
+                 "data_points": n,
+                 "first_avg_loss": round(first_avg, 4),
+                 "last_avg_loss": round(last_avg, 4),
+                 "min_loss_seen": round(min(losses), 4),
+                 "max_loss_seen": round(max(losses), 4),
+                 "expected_initial_loss": "~10-11 (random init for 50k vocab)",
+                 "good_loss_range": "< 4.0 (perplexity < 55)",
+                 "great_loss_range": "< 2.5 (perplexity < 12)",
+             }
+     else:
+         result["analysis"]["note"] = "Global tracker not initialized - restart node to enable"
+ 
+     return result
+ 
+ 
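The trend analysis above compares the mean of the first 20% of checkpointed losses against the mean of the last 20%. The same arithmetic on invented numbers, for illustration:

# Ten hypothetical loss checkpoints (values made up for the example).
losses = [10.8, 9.9, 8.7, 7.5, 6.2, 5.4, 4.9, 4.6, 4.4, 4.3]

n = len(losses)           # 10
first_n = max(1, n // 5)  # 2

first_avg = sum(losses[:first_n]) / first_n  # (10.8 + 9.9) / 2 = 10.35
last_avg = sum(losses[-first_n:]) / first_n  # (4.4 + 4.3) / 2  = 4.35

improvement = (first_avg - last_avg) / first_avg * 100  # ~57.97%
# improvement > 10  =>  loss_trend = "improving_strongly", training_verified = True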
+ # ==================== STATS & PONW ENDPOINTS ====================
+ 
+ @node_app.get("/api/stats")
+ async def get_api_stats():
+     """Endpoint for GUI to fetch local node stats."""
+     import math
+     import asyncio
+     import os
+ 
+     # Yield to event loop to ensure responsiveness
+     await asyncio.sleep(0)
+ 
+     # Get actual system resource usage
+     system_stats = {}
+     try:
+         import psutil
+         # CPU usage (system-wide percentage)
+         system_stats["cpu_percent"] = psutil.cpu_percent(interval=None)  # Non-blocking
+ 
+         # Memory usage (system-wide)
+         mem = psutil.virtual_memory()
+         system_stats["ram_used_gb"] = round(mem.used / (1024**3), 2)
+         system_stats["ram_total_gb"] = round(mem.total / (1024**3), 2)
+         system_stats["ram_percent"] = mem.percent
+ 
+         # Process-specific memory
+         process = psutil.Process(os.getpid())
+         system_stats["process_ram_mb"] = round(process.memory_info().rss / (1024**2), 1)
+     except Exception:
+         pass
+ 
+     # Start with basic stats from STATE
+     stats = {
+         "peer_count": STATE.get("peer_count", 0),
+         "processed_count": STATE.get("processed_count", 0),
+         "training_status": STATE.get("training_status", "idle"),
+         # Actual system resource usage
+         "system": system_stats,
+         # Resource throttle info
+         "throttle": {
+             "cpu_ratio": STATE.get("throttle_cpu_ratio", 1.0),
+             "ram_ratio": STATE.get("throttle_ram_ratio", 1.0),
+             "effective": STATE.get("throttle_effective", 1.0),
+             "interval_seconds": STATE.get("throttle_interval", 2.0),
+             "max_steps_per_min": STATE.get("throttle_max_steps", 30),
+         },
+     }
+ 
+     if NEURO_NODE:
+         # Run get_stats in executor to not block event loop
+         loop = asyncio.get_event_loop()
+         node_stats = await loop.run_in_executor(None, NEURO_NODE.get_stats)
+ 
+         # Handle infinity values (not JSON serializable)
+         current_loss = node_stats.get("current_loss", float('inf'))
+         if math.isinf(current_loss) or math.isnan(current_loss):
+             current_loss = None  # Use None for JSON compatibility
+ 
+         # Determine role string for display
+         has_embedding = node_stats.get("has_embedding", False)
+         has_lm_head = node_stats.get("has_lm_head", False)
+         if has_embedding and has_lm_head:
+             role = "Full Node (Driver + Validator)"
+         elif has_embedding:
+             role = "Driver"
+         elif has_lm_head:
+             role = "Validator"
+         else:
+             role = "Worker"
+ 
+         stats.update({
+             # My contribution
+             "my_layers": node_stats.get("my_layers", []),
+             "my_params_m": node_stats.get("my_params", 0) / 1e6,
+             "has_embedding": has_embedding,
+             "has_lm_head": has_lm_head,
+             "role": role,
+             "available_memory_mb": node_stats.get("available_memory_mb", 0),
+             "reward_multiplier": node_stats.get("reward_multiplier", 1.0),
+ 
+             # Network stats
+             "network_layers": node_stats.get("network_layers", 0),
+             "network_params_m": node_stats.get("network_params", 0) / 1e6,
+             "network_nodes": node_stats.get("network_nodes", 0),
+             "contribution_ratio": node_stats.get("contribution_ratio", 0),
+ 
+             # Training stats - use CUMULATIVE values from NEURO_NODE, not delta from STATE
+             "training_enabled": NEURO_NODE.enable_training,
+             "training_rounds": node_stats.get("total_training_rounds", 0),
+             "token_count": node_stats.get("total_tokens_processed", 0),  # Cumulative tokens
+             "current_loss": current_loss,
+             "data_buffer_size": node_stats.get("data_buffer_size", 0),
+ 
+             # Data shard stats (if Driver)
+             "shard_stats": node_stats.get("shard_stats", {}),
+ 
+             # Device info
+             "device": NEURO_NODE.device,
+ 
+             # Instance info (for multi-node support)
+             "instance_id": getattr(NEURO_NODE, 'instance_id', None),
+         })
+ 
+         # Add DiLoCo progress
+         diloco = NEURO_NODE.get_diloco_progress()
+         if diloco.get("enabled", False):
+             stats["diloco"] = {
+                 "inner_step": diloco.get("inner_step_count", 0),
+                 "inner_total": diloco.get("inner_steps_total", 500),
+                 "progress": diloco.get("progress", 0.0),
+                 "outer_step": diloco.get("outer_step_count", 0),
+             }
+     else:
+         # Node not ready yet
+         stats["token_count"] = 0
+         stats["training_rounds"] = 0
+ 
+     # Add version
+     stats["version"] = __version__
+ 
+     # Add current config settings (for UI sliders)
+     stats["config"] = {
+         "cpu_threads": STATE.get("config_cpu_threads"),
+         "memory_mb": STATE.get("config_memory_mb"),
+         "storage_mb": STATE.get("config_storage_mb", 100),  # Default 100MB
+     }
+ 
+     return stats
+ 
+ 
+ @node_app.get("/api/node/architecture")
+ async def get_node_architecture():
+     """
+     Get this node's current architecture.
+ 
+     Used by other nodes to query network architecture when rejoining.
+     This enables smart architecture reconciliation across the network.
+     """
+     if not NEURO_NODE or not NEURO_NODE.model:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     arch = NEURO_NODE.model.architecture
+ 
+     return {
+         "hidden_dim": arch.hidden_dim,
+         "intermediate_dim": arch.intermediate_dim,
+         "num_layers": arch.num_layers,
+         "num_heads": arch.num_heads,
+         "num_kv_heads": arch.num_kv_heads,
+         "estimated_params": arch.estimate_params(),
+         "estimated_memory_mb": arch.estimate_memory_mb(),
+         "architecture_version": getattr(NEURO_NODE.layer_pool, 'architecture_version', 1),
+     }
+ 
+ 
+ @node_app.get("/api/market")
+ async def get_market_stats():
+     """
+     Get real-time inference market statistics.
+ 
+     Returns current price, supply, demand, utilization.
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     return P2P.ledger.get_inference_market_stats()
+ 
+ 
+ @node_app.post("/api/market/register")
+ async def register_inference_capacity(
+     tokens_per_second: int,
+     min_price: float = 0.0
+ ):
+     """
+     Register this node's inference capacity with the market.
+ 
+     Nodes should call this when idle/available to serve inference.
+     Call withdraw endpoint when switching to training.
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     P2P.ledger.register_inference_capacity(
+         tokens_per_second=tokens_per_second,
+         min_price=min_price
+     )
+ 
+     return {"status": "registered", "tokens_per_second": tokens_per_second, "min_price": min_price}
+ 
+ 
+ @node_app.post("/api/market/withdraw")
+ async def withdraw_inference_capacity():
+     """
+     Withdraw this node from inference market.
+ 
+     Call this when switching to training.
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     P2P.ledger.withdraw_inference_capacity()
+ 
+     return {"status": "withdrawn"}
+ 
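The intended lifecycle for these two endpoints, sketched as client calls (URL and numbers are placeholders; note that register_inference_capacity declares plain scalars, so FastAPI reads them as query parameters rather than a JSON body):

import requests

NODE_URL = "http://127.0.0.1:8000"  # placeholder

# Advertise capacity while the node is idle.
requests.post(f"{NODE_URL}/api/market/register",
              params={"tokens_per_second": 20, "min_price": 0.001})

# Watch current price/supply/demand while serving.
print(requests.get(f"{NODE_URL}/api/market").json())

# Withdraw before switching back to training.
requests.post(f"{NODE_URL}/api/market/withdraw")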
+ 
+ # ==================== DISTRIBUTED INFERENCE MARKETPLACE ====================
+ 
+ class MarketplaceSubmitRequest(BaseModel):
+     """User submits inference request to marketplace."""
+     prompt: str
+     max_tokens: int = 100
+     max_price: float = 1.0
+     driver_node_id: Optional[str] = None  # Optional: specify driver, else round-robin
+ 
+ 
+ class DriverPromptRequest(BaseModel):
+     """User sends encrypted prompt directly to driver."""
+     encrypted_prompt: str
+     user_id: str
+ 
+ 
+ @node_app.post("/api/market/submit")
+ async def submit_marketplace_request(req: MarketplaceSubmitRequest):
+     """
+     Submit inference request to marketplace (USER API).
+ 
+     Flow:
+     1. User submits request with prompt
+     2. Marketplace locks price, assigns driver
+     3. User sends encrypted prompt to driver
+     4. Driver processes, returns result
+ 
+     Returns:
+         request_id, locked_price, driver_node_id
+     """
+     if not NEURO_NODE or not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     if not hasattr(P2P.ledger, 'inference_market'):
+         raise HTTPException(status_code=503, detail="Marketplace not available")
+ 
+     market = P2P.ledger.inference_market
+ 
+     # Choose driver (round-robin if not specified)
+     driver_node_id = req.driver_node_id
+ 
+     if not driver_node_id:
+         # Find a driver node from layer pool
+         if NEURO_NODE.layer_pool:
+             route = NEURO_NODE.layer_pool.get_pipeline_route()
+             if route and len(route) > 0:
+                 # First layer should be embedding (driver)
+                 driver_node_id = route[0][1].split(':')[0] if ':' in route[0][1] else NEURO_NODE.node_id
+             else:
+                 # Fallback to this node if it's a driver
+                 if NEURO_NODE.model.has_embedding:
+                     driver_node_id = NEURO_NODE.node_id
+                 else:
+                     raise HTTPException(status_code=503, detail="No driver nodes available")
+         else:
+             # Single node mode
+             driver_node_id = NEURO_NODE.node_id
+ 
+     # Sign request with node's ECDSA key (authorizes payment)
+     from neuroshard.core.crypto.ecdsa import sign_message
+     signature_payload = f"{NEURO_NODE.node_id}:{driver_node_id}:{req.max_tokens}:{req.max_price}"
+     user_signature = sign_message(signature_payload, NEURO_NODE.node_token)
+ 
+     # Submit to marketplace (without prompt - privacy!)
+     success, request_id, locked_price = market.submit_request(
+         user_id=NEURO_NODE.node_id,  # For testing, use node ID as user ID
+         driver_node_id=driver_node_id,
+         tokens_requested=req.max_tokens,
+         max_price=req.max_price,
+         user_signature=user_signature,
+         priority=0
+     )
+ 
+     if not success:
+         raise HTTPException(status_code=400, detail="Request rejected (price too high or market full)")
+ 
+     # Encrypt prompt for driver
+     from neuroshard.core.network.encrypted_channel import PromptEncryption
+     encrypted_prompt = PromptEncryption.encrypt_prompt(req.prompt, request_id)
+ 
+     # If we are the driver, add to our own queue
+     if driver_node_id == NEURO_NODE.node_id and hasattr(NEURO_NODE, 'prompt_queue'):
+         from neuroshard.core.network.encrypted_channel import EncryptedPrompt
+         import time
+ 
+         NEURO_NODE.prompt_queue.add_prompt(EncryptedPrompt(
+             request_id=request_id,
+             encrypted_data=encrypted_prompt,
+             timestamp=time.time(),
+             user_id=NEURO_NODE.node_id
+         ))
+         logger.info(f"[API] ✓ Added encrypted prompt to local driver queue")
+ 
+     return {
+         "request_id": request_id,
+         "locked_price": locked_price,
+         "driver_node_id": driver_node_id,
+         "encrypted_prompt": encrypted_prompt,  # User should send this to driver
+         "instructions": f"POST encrypted_prompt to /api/driver/prompt/{request_id} on driver node"
+     }
+ 
+ 
+ @node_app.post("/api/driver/prompt/{request_id}")
+ async def submit_encrypted_prompt(request_id: str, req: DriverPromptRequest):
+     """
+     User sends encrypted prompt to driver node (PRIVACY CHANNEL).
+ 
+     This endpoint is called on the DRIVER node, not the marketplace.
+     Prompt is encrypted - only driver can decrypt it.
+     """
+     if not NEURO_NODE or not NEURO_NODE.model.has_embedding:
+         raise HTTPException(status_code=403, detail="This node is not a driver")
+ 
+     if not hasattr(NEURO_NODE, 'prompt_queue'):
+         raise HTTPException(status_code=503, detail="Driver not initialized")
+ 
+     # Add to prompt queue
+     from neuroshard.core.network.encrypted_channel import EncryptedPrompt
+     import time
+ 
+     prompt = EncryptedPrompt(
+         request_id=request_id,
+         encrypted_data=req.encrypted_prompt,
+         timestamp=time.time(),
+         user_id=req.user_id
+     )
+ 
+     success = NEURO_NODE.prompt_queue.add_prompt(prompt)
+ 
+     if not success:
+         raise HTTPException(status_code=503, detail="Prompt queue full")
+ 
+     return {
+         "status": "success",
+         "message": f"Encrypted prompt queued for request {request_id[:8]}...",
+         "queue_position": len(NEURO_NODE.prompt_queue.prompts)
+     }
+ 
+ 
+ @node_app.get("/api/market/request/{request_id}")
+ async def get_request_status(request_id: str):
+     """
+     Get status of inference request.
+ 
+     Returns:
+         status, progress, eta, result (if completed)
+     """
+     if not NEURO_NODE or not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     if not hasattr(P2P.ledger, 'inference_market'):
+         raise HTTPException(status_code=503, detail="Marketplace not available")
+ 
+     market = P2P.ledger.inference_market
+     request = market.get_request(request_id)
+ 
+     if not request:
+         raise HTTPException(status_code=404, detail="Request not found")
+ 
+     # Get result from marketplace storage
+     result_text = market.get_result(request_id)
+ 
+     return {
+         "request_id": request_id,
+         "status": request.status,
+         "locked_price": request.locked_price,
+         "tokens_requested": request.tokens_requested,
+         "driver_node_id": request.driver_node_id,
+         "pipeline_session_id": request.pipeline_session_id,
+         "result": result_text,
+         "completed": request.status == "completed" and result_text is not None
+     }
+ 
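Putting the three marketplace endpoints together, a user-side flow might look like the sketch below. It assumes a single-node setup where the submitting node is also the driver; in the multi-node case the driver's address would have to be resolved from driver_node_id via the P2P/DHT layer, which is outside this file:

import time

import requests

NODE_URL = "http://127.0.0.1:8000"  # placeholder

# 1. Submit: locks a price and assigns a driver (the plaintext prompt
#    is never stored by the marketplace).
sub = requests.post(f"{NODE_URL}/api/market/submit", json={
    "prompt": "Explain Proof of Neural Work.",
    "max_tokens": 100,
    "max_price": 1.0,
}).json()

# 2. Deliver the encrypted prompt to the driver (redundant when the
#    submitting node is itself the driver, since it already self-queues).
requests.post(f"{NODE_URL}/api/driver/prompt/{sub['request_id']}",
              json={"encrypted_prompt": sub["encrypted_prompt"],
                    "user_id": "my-user-id"})

# 3. Poll until completed.
while True:
    status = requests.get(
        f"{NODE_URL}/api/market/request/{sub['request_id']}").json()
    if status["completed"]:
        print(status["result"])
        break
    time.sleep(1.0)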
+ 
+ @node_app.get("/api/ponw")
+ async def get_ponw_proof():
+     """
+     Get Proof of Neural Work for this node.
+ 
+     This proves the node actually contributed compute for training/inference.
+     Used for NEURO token rewards.
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     return NEURO_NODE.get_ponw_proof()
+ 
+ 
+ @node_app.get("/api/neuro")
+ async def get_neuro_balance():
+     """
+     Get NEURO token balance and account info for this node.
+ 
+     Returns:
+     - balance: Current spendable balance
+     - total_earned: Lifetime earnings from PoNW
+     - total_spent: Lifetime spending
+     - stake: Currently staked amount
+     - stake_multiplier: Reward multiplier from staking
+     """
+     # Use local reference to avoid race condition during shutdown
+     p2p = P2P
+     if not p2p or not p2p.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     ledger = p2p.ledger
+ 
+     try:
+         # Use NEUROLedger API (no fallbacks)
+         account_info = ledger.get_account_info()
+         burn_stats = ledger.get_burn_stats()
+ 
+         # Get node IDs
+         wallet_id = ledger.node_id
+         node_id = p2p.node_id
+ 
+         return {
+             "balance": round(account_info.get("balance", 0.0), 6),
+             "total_earned": round(account_info.get("total_earned", 0.0), 6),
+             "total_spent": round(account_info.get("total_spent", 0.0), 6),
+             "stake": round(account_info.get("stake", 0.0), 2),
+             "stake_multiplier": round(account_info.get("stake_multiplier", 1.0), 2),
+             "proof_count": account_info.get("proof_count", 0),
+             "wallet_id": wallet_id,
+             "node_id": node_id,
+             "network": {
+                 "total_burned": round(burn_stats.get("total_burned", 0.0), 6),
+                 "circulating_supply": round(burn_stats.get("circulating_supply", 0.0), 6),
+                 "burn_rate": "5%"
+             }
+         }
+     except Exception as e:
+         # Handle shutdown race condition gracefully
+         raise HTTPException(status_code=503, detail=f"Service shutting down: {e}")
+ 
+ 
+ # ==================== STAKING ENDPOINTS ====================
+ 
+ class StakeRequest(BaseModel):
+     amount: float
+     duration_days: int = 30
+ 
+ 
+ @node_app.post("/api/stake")
+ async def stake_neuro(req: StakeRequest):
+     """
+     Stake NEURO tokens for reward multiplier.
+ 
+     Staking provides:
+     - 10% bonus per 1000 NEURO staked (diminishing returns)
+     - Tokens locked for specified duration
+     - 100+ NEURO stake unlocks Validator role (computes real cross-entropy loss)
+ 
+     Example: Stake 2000 NEURO for 30 days = ~1.16x multiplier on all rewards
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     # Validate using centralized economics
+     is_valid, error = is_valid_stake_amount(req.amount)
+     if not is_valid:
+         raise HTTPException(status_code=400, detail=error)
+ 
+     is_valid, error = is_valid_stake_duration(req.duration_days)
+     if not is_valid:
+         raise HTTPException(status_code=400, detail=error)
+ 
+     success, message = P2P.ledger.stake(req.amount, req.duration_days)
+ 
+     if success:
+         account = P2P.ledger.get_account_info()
+         new_stake = account.get("stake", 0.0)
+ 
+         # Get dynamic validator stake requirement based on network size
+         num_validators = 0
+         if NEURO_NODE and hasattr(NEURO_NODE, 'layer_pool') and NEURO_NODE.layer_pool:
+             last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
+             num_validators = len(NEURO_NODE.layer_pool.layer_assignments.get(last_layer, []))
+ 
+         required_stake = get_dynamic_validator_stake(num_validators)
+ 
+         # Check if we should upgrade to Validator (no restart needed!)
+         validator_upgraded = False
+         if new_stake >= required_stake and NEURO_NODE:
+             # Check if not already a validator
+             if hasattr(NEURO_NODE, 'model') and NEURO_NODE.model and not NEURO_NODE.model.has_lm_head:
+                 # Upgrade the model to have LM head
+                 if NEURO_NODE.model.initialize_lm_head():
+                     validator_upgraded = True
+                     logger.info(f"Node upgraded to VALIDATOR! Now computing real cross-entropy loss.")
+ 
+         response = {
+             "success": True,
+             "message": message,
+             "new_stake": new_stake,
+             "new_multiplier": account.get("stake_multiplier", 1.0),
+             "locked_until": account.get("stake_locked_until", 0.0),
+             "validator_stake_required": required_stake,
+             "num_validators": num_validators,
+         }
+ 
+         if validator_upgraded:
+             response["validator_upgrade"] = True
+             response["message"] += " Upgraded to VALIDATOR! Now computing real training loss."
+         elif new_stake < required_stake:
+             response["validator_progress"] = f"{new_stake:.0f}/{required_stake:.0f} NEURO ({new_stake/required_stake*100:.1f}%)"
+ 
+         return response
+     else:
+         raise HTTPException(status_code=400, detail=message)
+ 
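A staking call and the response fields worth checking (URL and amounts are placeholders; the exact diminishing-returns curve lives in neuroshard.core.economics.constants, so the ~1.16x figure is just the docstring's own example):

import requests

NODE_URL = "http://127.0.0.1:8000"  # placeholder

resp = requests.post(f"{NODE_URL}/api/stake",
                     json={"amount": 2000, "duration_days": 30}).json()

print(resp["new_multiplier"])          # e.g. ~1.16 for 2000 NEURO
print(resp.get("validator_upgrade"))   # True if the LM head was just enabled
print(resp.get("validator_progress"))  # shown while below the requirement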
+ 
+ @node_app.post("/api/unstake")
+ async def unstake_neuro():
+     """
+     Unstake NEURO tokens (if lock period expired).
+ 
+     Returns staked tokens to balance.
+     Note: If remaining stake drops below validator requirement, node is demoted to Worker.
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     success, amount, message = P2P.ledger.unstake()
+ 
+     if success:
+         # Check if we need to demote from Validator
+         validator_demoted = False
+         account = P2P.ledger.get_account_info()
+         remaining_stake = account.get("stake", 0.0)
+ 
+         # Get current network size for dynamic stake calculation
+         num_validators = 0
+         if NEURO_NODE and hasattr(NEURO_NODE, 'layer_pool') and NEURO_NODE.layer_pool:
+             last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
+             num_validators = len(NEURO_NODE.layer_pool.layer_assignments.get(last_layer, []))
+ 
+         required_stake = get_dynamic_validator_stake(num_validators)
+ 
+         # Check if we were a validator and now don't qualify
+         if NEURO_NODE and hasattr(NEURO_NODE, 'model') and NEURO_NODE.model:
+             if NEURO_NODE.model.has_lm_head and remaining_stake < required_stake:
+                 # Demote from validator
+                 if NEURO_NODE.model.disable_lm_head():
+                     validator_demoted = True
+                     # Also update layer pool
+                     if NEURO_NODE.layer_pool:
+                         NEURO_NODE.layer_pool.demote_from_validator(NEURO_NODE.node_id)
+                     logger.warning(f"Node demoted from Validator: stake {remaining_stake:.0f} < {required_stake:.0f} required")
+ 
+         response = {
+             "success": True,
+             "message": message,
+             "amount_unstaked": amount,
+             "remaining_stake": remaining_stake,
+         }
+ 
+         if validator_demoted:
+             response["validator_demoted"] = True
+             response["message"] += f" WARNING: Demoted from Validator (need {required_stake:.0f} NEURO, have {remaining_stake:.0f})"
+ 
+         return response
+     else:
+         raise HTTPException(status_code=400, detail=message)
+ 
+ 
+ @node_app.get("/api/stake/info")
+ async def get_stake_info():
+     """Get current staking information with dynamic validator requirements."""
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     account = P2P.ledger.get_account_info()
+ 
+     # Get current network size for dynamic stake calculation
+     num_validators = 0
+     if NEURO_NODE and hasattr(NEURO_NODE, 'layer_pool') and NEURO_NODE.layer_pool:
+         last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
+         num_validators = len(NEURO_NODE.layer_pool.layer_assignments.get(last_layer, []))
+ 
+     return {
+         "stake": account.get("stake", 0.0),
+         "stake_multiplier": account.get("stake_multiplier", 1.0),
+         "stake_locked_until": account.get("stake_locked_until", 0.0),
+         "balance": account.get("balance", 0.0),
+         "staking_info": {
+             "bonus_per_1000": "10% (diminishing)",
+             "min_lock_days": 1,
+             "max_lock_days": 365,
+             "validator_stake": get_validator_stake_info(num_validators),
+         }
+     }
+ 
+ 
+ class ThrottleUpdateRequest(BaseModel):
+     cpu_threads: Optional[int] = None
+     memory_mb: Optional[int] = None
+     storage_mb: Optional[int] = None
+ 
+ 
+ @node_app.post("/api/throttle")
+ async def update_throttle(req: ThrottleUpdateRequest):
+     """
+     Update training throttle settings while node is running.
+ 
+     This allows the GUI to change CPU/RAM/Storage limits without restarting.
+     Changes take effect within 5 seconds.
+     """
+     updated = {}
+ 
+     if req.cpu_threads is not None:
+         STATE["config_cpu_threads"] = req.cpu_threads
+         updated["cpu_threads"] = req.cpu_threads
+ 
+     if req.memory_mb is not None:
+         STATE["config_memory_mb"] = req.memory_mb
+         updated["memory_mb"] = req.memory_mb
+ 
+     if req.storage_mb is not None:
+         STATE["config_storage_mb"] = req.storage_mb
+         updated["storage_mb"] = req.storage_mb
+         # Update genesis loader if it exists
+         if NEURO_NODE and hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+             NEURO_NODE.genesis_loader.max_storage_mb = req.storage_mb
+             NEURO_NODE.genesis_loader.max_shards = max(1, int(req.storage_mb / 10))
+             logger.info(f"[NODE] Updated storage limit: {req.storage_mb}MB ({NEURO_NODE.genesis_loader.max_shards} shards)")
+ 
+     return {
+         "success": True,
+         "updated": updated,
+         "message": "Settings updated. Changes take effect within 5 seconds.",
+         "current_throttle": {
+             "cpu_ratio": STATE.get("throttle_cpu_ratio", 1.0),
+             "ram_ratio": STATE.get("throttle_ram_ratio", 1.0),
+             "effective": STATE.get("throttle_effective", 1.0),
+         }
+     }
+ 
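Scripted updates work the same way the GUI's sliders do; a sketch with placeholder values:

import requests

NODE_URL = "http://127.0.0.1:8000"  # placeholder

resp = requests.post(f"{NODE_URL}/api/throttle", json={
    "cpu_threads": 4,
    "memory_mb": 2048,
    "storage_mb": 200,  # also resizes the genesis shard cache (~10 MB/shard)
}).json()
print(resp["message"])  # settings apply within ~5 seconds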
+ 
+ @node_app.get("/api/validator/info")
+ async def get_validator_info():
+     """
+     Get validator eligibility and status.
+ 
+     Validators require:
+     - Minimum 100 NEURO staked
+     - LM Head layer assignment (last layer)
+ 
+     Validators earn:
+     - 30% bonus on rewards (up from 20%)
+     - 0.001 NEURO per proof validated
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+ 
+     validator_info = P2P.ledger.get_validator_info()
+ 
+     # Add role info from node
+     if NEURO_NODE:
+         validator_info["has_lm_head"] = NEURO_NODE.model.has_lm_head if NEURO_NODE.model else False
+         validator_info["is_active_validator"] = (
+             validator_info["is_eligible_validator"] and
+             validator_info.get("has_lm_head", False)
+         )
+ 
+     return validator_info
+ 
+ 
+ # ==================== SWARM ENDPOINTS ====================
+ 
+ @node_app.get("/api/swarm")
+ async def get_swarm_status():
+     """
+     Get Swarm architecture status.
+ 
+     Returns buffer fill rates, heartbeat peers, routing stats.
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     # Get swarm status from node
+     swarm_status = NEURO_NODE.get_swarm_status()
+ 
+     return swarm_status
+ 
+ 
+ @node_app.get("/api/diloco")
+ async def get_diloco_progress():
+     """
+     Get DiLoCo training progress.
+ 
+     Returns inner step count, sync progress, outer step count.
+     """
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     return NEURO_NODE.get_diloco_progress()
+ 
+ 
+ @node_app.get("/api/model_info")
+ async def get_model_info():
+     """Get information about the NeuroLLM model."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     stats = NEURO_NODE.get_stats()
+ 
+     # Get architecture info
+     arch_info = {}
+     if NEURO_NODE.layer_pool and NEURO_NODE.layer_pool.current_architecture:
+         arch = NEURO_NODE.layer_pool.current_architecture
+         arch_info = {
+             "hidden_dim": arch.hidden_dim,
+             "num_layers": arch.num_layers,
+             "num_heads": arch.num_heads,
+             "vocab_size": arch.vocab_size,
+             "architecture_version": NEURO_NODE.layer_pool.architecture_version,
+             "total_params": arch.estimate_params(),
+         }
+ 
+     # Sanitize loss for JSON
+     model_loss = NEURO_NODE.current_loss
+     if math.isinf(model_loss) or math.isnan(model_loss):
+         model_loss = None
+ 
+     return {
+         "model_name": "NeuroLLM",
+         "description": "The People's Language Model - trained from scratch by the network",
+         "architecture": arch_info,  # NEW: Show current architecture
+         "my_layers": stats.get("my_layers", []),
+         "my_params": stats.get("my_params", 0),
+         "network_layers": stats.get("network_layers", 0),
+         "network_nodes": stats.get("network_nodes", 0),
+         "total_training_rounds": NEURO_NODE.total_training_rounds,
+         "current_loss": model_loss,
+         "note": "This model is trained collaboratively. Quality improves as more users contribute!"
+     }
+ 
+ 
+ @node_app.get("/api/network")
+ async def get_network_info():
+     """Get network capacity and layer distribution."""
+     if not NEURO_NODE or not NEURO_NODE.layer_pool:
+         raise HTTPException(status_code=503, detail="Node not ready")
+ 
+     capacity = NEURO_NODE.layer_pool.get_network_capacity()
+ 
+     return {
+         "total_nodes": capacity.total_nodes,
+         "total_memory_mb": capacity.total_memory_mb,
+         "max_possible_layers": capacity.max_layers,
+         "current_layers": capacity.assigned_layers,
+         "layer_coverage": capacity.layer_coverage,
+         "my_contribution": NEURO_NODE.model.get_my_contribution() if NEURO_NODE.model else {},
+     }
+ 
+ 
+ @node_app.get("/api/logs")
+ async def get_logs(since_id: Optional[int] = None, limit: int = 100):
+     """
+     Get recent logs from the node.
+ 
+     Args:
+         since_id: Return logs with ID greater than this (for polling).
+                   Use 0 or omit to get all available logs on initial load.
+         limit: Maximum number of logs to return (default 100)
+ 
+     Returns:
+         List of log entries with id, epoch, timestamp, message, type, and level
+     """
+     with _LOG_BUFFER_LOCK:
+         logs = list(_LOG_BUFFER)
+ 
+     # If since_id is provided, filter to only logs with ID > since_id
+     if since_id is not None and since_id > 0:
+         logs = [log for log in logs if log.get('id', 0) > since_id]
+ 
+     # Limit results (take most recent)
+     if len(logs) > limit:
+         logs = logs[-limit:]
+ 
+     # Get the latest log ID for next poll
+     latest_id = logs[-1]['id'] if logs else (since_id or 0)
+ 
+     return {
+         "logs": logs,
+         "total": len(_LOG_BUFFER),
+         "latest_id": latest_id,  # Client should use this for next poll
+     }
+ 
1479
+
1480
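The `since_id`/`latest_id` pair makes this endpoint cheap to poll: feed each response's `latest_id` back in as the next call's `since_id`. A minimal polling client, as a sketch (the base URL and poll interval are assumptions):

```python
import time
import requests

BASE = "http://localhost:8000"  # assumed node address

def follow_logs(poll_interval: float = 2.0):
    """Tail the node's log buffer via incremental polling."""
    since_id = 0  # 0 fetches everything available on the first call
    while True:
        data = requests.get(f"{BASE}/api/logs",
                            params={"since_id": since_id, "limit": 100}).json()
        for entry in data["logs"]:
            print(entry.get("timestamp"), entry.get("level"), entry.get("message"))
        since_id = data["latest_id"]  # resume point for the next poll
        time.sleep(poll_interval)
```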
+ @node_app.post("/api/shutdown")
+ async def shutdown_node():
+     """
+     Gracefully shut down the node.
+
+     Saves checkpoint and stops all components cleanly.
+     """
+     logger.info("[NODE] Shutdown requested via API")
+
+     # Use a background thread for shutdown (more reliable than asyncio.create_task)
+     def do_shutdown():
+         import time
+         time.sleep(0.5)  # Brief delay to allow response to be sent
+         request_shutdown()
+
+     shutdown_thread = threading.Thread(target=do_shutdown, daemon=False)
+     shutdown_thread.start()
+
+     return {
+         "status": "shutting_down",
+         "message": "Node will shutdown in 0.5 seconds. Checkpoint will be saved."
+     }
+
+
+ @node_app.get("/api/checkpoint/info")
+ async def get_checkpoint_info():
+     """Get checkpoint info for P2P sync."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     return NEURO_NODE.get_checkpoint_info()
+
+
+ @node_app.get("/api/checkpoint/download")
+ async def download_checkpoint():
+     """Download checkpoint (for P2P sync via HTTP fallback)."""
+     import io
+     import zlib
+     from fastapi.responses import Response
+
+     if not NEURO_NODE or not NEURO_NODE.model:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     try:
+         # Serialize checkpoint for my layers only
+         buffer = io.BytesIO()
+
+         # Collect layer state dicts
+         layer_states = {
+             layer_id: layer.state_dict()
+             for layer_id, layer in NEURO_NODE.model.my_layers.items()
+         }
+
+         checkpoint = {
+             "layer_ids": NEURO_NODE.my_layer_ids,
+             "layers": layer_states,
+             "has_embedding": NEURO_NODE.model.has_embedding,
+             "has_lm_head": NEURO_NODE.model.has_lm_head,
+             "version": NEURO_NODE.total_training_rounds,
+         }
+
+         if NEURO_NODE.model.embedding:
+             checkpoint["embedding"] = NEURO_NODE.model.embedding.state_dict()
+         if NEURO_NODE.model.lm_head:
+             checkpoint["lm_head"] = NEURO_NODE.model.lm_head.state_dict()
+         if NEURO_NODE.model.final_norm:
+             checkpoint["final_norm"] = NEURO_NODE.model.final_norm.state_dict()
+
+         torch.save(checkpoint, buffer)
+
+         # Compress
+         raw_data = buffer.getvalue()
+         compressed = zlib.compress(raw_data, level=6)
+
+         return Response(
+             content=compressed,
+             media_type="application/octet-stream",
+             headers={
+                 "X-Checkpoint-Version": str(checkpoint["version"]),
+                 "X-Layer-IDs": ",".join(map(str, NEURO_NODE.my_layer_ids)),
+                 "X-Original-Size": str(len(raw_data)),
+             }
+         )
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
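A peer consuming this endpoint reverses the two steps above: zlib-decompress the body, then `torch.load` the buffer. A fetch-and-load sketch (the peer URL is an assumption; `map_location` is the caller's choice):

```python
import io
import zlib

import requests
import torch

def fetch_peer_checkpoint(peer_url: str) -> dict:
    """Download and deserialize a peer's layer checkpoint (sketch)."""
    resp = requests.get(f"{peer_url}/api/checkpoint/download", timeout=30)
    resp.raise_for_status()
    checkpoint = torch.load(io.BytesIO(zlib.decompress(resp.content)),
                            map_location="cpu")
    # The headers expose metadata without deserializing the payload
    assert checkpoint["version"] == int(resp.headers["X-Checkpoint-Version"])
    return checkpoint
```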
+ # ==================== API v1 ENDPOINTS (SDK Compatible) ====================
+
+ class InferenceV1Request(BaseModel):
+     """Inference request matching SDK expectations."""
+     prompt: str
+     max_tokens: int = 100
+     temperature: float = 1.0
+     top_p: float = 1.0
+     top_k: int = 50
+     stop: List[str] = []
+     stream: bool = False
+
+
+ class SendNEURORequest(BaseModel):
+     """Send NEURO request."""
+     to: str
+     amount: float
+     memo: str = ""
+
+
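For reference, request bodies these two models would accept look like the following (values are purely illustrative):

```python
# Hypothetical payloads for the request models above
inference_body = {
    "prompt": "Explain swarm training in one sentence.",
    "max_tokens": 64,
    "temperature": 0.8,
    "stream": False,  # set True for the SSE streaming branch
}
send_body = {"to": "<recipient-node-id>", "amount": 1.5, "memo": "test transfer"}
```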
+ @node_app.get("/api/v1/status")
+ async def get_status_v1():
+     """
+     Get current node status (SDK compatible).
+
+     Returns status in format expected by NeuroNode SDK.
+     """
+     import math
+     import psutil
+     import os
+
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     # Get node stats
+     stats = NEURO_NODE.get_stats()
+
+     # Determine role
+     has_embedding = stats.get("has_embedding", False)
+     has_lm_head = stats.get("has_lm_head", False)
+     if has_embedding and has_lm_head:
+         role = "full"
+     elif has_embedding:
+         role = "driver"
+     elif has_lm_head:
+         role = "validator"
+     else:
+         role = "worker"
+
+     # Get system resources
+     try:
+         mem = psutil.virtual_memory()
+         process = psutil.Process(os.getpid())
+         gpu_used = 0
+         gpu_total = 0
+
+         if torch.cuda.is_available():
+             gpu_used = torch.cuda.memory_allocated()
+             gpu_total = torch.cuda.get_device_properties(0).total_memory
+
+         resources = {
+             "gpu_memory_used": gpu_used,
+             "gpu_memory_total": gpu_total,
+             "cpu_percent": psutil.cpu_percent(),
+             "ram_used": mem.used,
+             "ram_total": mem.total,
+         }
+     except Exception:
+         resources = {}
+
+     # Handle infinity loss
+     loss = stats.get("current_loss", 0.0)
+     if math.isinf(loss) or math.isnan(loss):
+         loss = 0.0
+
+     return {
+         "node_id": NEURO_NODE.node_id,
+         "version": __version__,
+         "uptime_seconds": int(time.time() - getattr(NEURO_NODE, '_start_time', time.time())),
+         "status": STATE.get("training_status", "running"),
+         "role": role,
+         "layers": stats.get("my_layers", []),
+         "peer_count": STATE.get("peer_count", 0),
+         "has_embedding": has_embedding,
+         "has_lm_head": has_lm_head,
+         "training": {
+             "enabled": NEURO_NODE.enable_training,
+             "epoch": 0,  # Not tracked separately
+             "step": stats.get("total_training_rounds", 0),
+             "loss": loss,
+         },
+         "resources": resources,
+     }
+
+
+ @node_app.get("/api/v1/metrics")
+ async def get_metrics_v1():
+     """
+     Get performance metrics (SDK compatible).
+     """
+     import math
+     from datetime import datetime
+
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     stats = NEURO_NODE.get_stats()
+
+     # Get balance info for rewards
+     earned_total = 0.0
+     pending = 0.0
+     if P2P and P2P.ledger:
+         account = P2P.ledger.get_account_info()
+         earned_total = account.get("total_earned", 0.0)
+         pending = 0.0  # Could track pending proofs
+
+     return {
+         "timestamp": datetime.now().isoformat(),
+         "inference": {
+             "requests_total": STATE.get("processed_count", 0),
+             "requests_per_minute": 0.0,  # Would need tracking
+             "avg_latency_ms": 0.0,
+             "p99_latency_ms": 0.0,
+             "tokens_generated": stats.get("total_tokens_processed", 0),
+         },
+         "training": {
+             "steps_total": stats.get("total_training_rounds", 0),
+             "steps_per_hour": 0.0,
+             "gradients_submitted": 0,
+             "gradients_accepted": 0,
+         },
+         "network": {
+             "bytes_sent": 0,
+             "bytes_received": 0,
+             "active_connections": STATE.get("peer_count", 0),
+             "rpc_calls": 0,
+             "peer_count": STATE.get("peer_count", 0),
+         },
+         "rewards": {
+             "earned_today": 0.0,  # Would need daily tracking
+             "earned_total": earned_total,
+             "pending": pending,
+         },
+     }
+
+
+ @node_app.get("/api/v1/health")
+ async def health_check_v1():
+     """Health check endpoint (SDK compatible)."""
+     checks = {
+         "node": "ok" if NEURO_NODE else "error",
+         "network": "ok" if P2P else "error",
+         "model": "ok" if NEURO_NODE and NEURO_NODE.model else "error",
+     }
+
+     # Check GPU
+     try:
+         if torch.cuda.is_available():
+             checks["gpu"] = "ok"
+         else:
+             checks["gpu"] = "cpu_only"
+     except Exception:
+         checks["gpu"] = "unknown"
+
+     healthy = all(v == "ok" for k, v in checks.items() if k != "gpu")
+
+     return {
+         "healthy": healthy,
+         "checks": checks,
+     }
+
+
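Together, `/api/v1/health` and `/api/v1/status` give an SDK client enough to gate traffic to a node. A minimal readiness probe, as a sketch (base URL assumed; note that `gpu` is deliberately excluded from the health verdict above, since `cpu_only` is a valid state):

```python
import requests

BASE = "http://localhost:8000"  # assumed node address

def node_is_ready() -> bool:
    """Return True when the node reports healthy and exposes a role."""
    try:
        health = requests.get(f"{BASE}/api/v1/health", timeout=5).json()
        if not health["healthy"]:
            return False
        status = requests.get(f"{BASE}/api/v1/status", timeout=5).json()
        return status["role"] in ("full", "driver", "validator", "worker")
    except requests.RequestException:
        return False
```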
+ @node_app.post("/api/v1/inference")
+ async def inference_v1(req: InferenceV1Request):
+     """
+     Run inference (SDK compatible).
+
+     Supports both streaming and non-streaming modes.
+     """
+     from fastapi.responses import StreamingResponse
+     import uuid
+
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     start_time = time.time()
+     request_id = f"inf_{uuid.uuid4().hex[:12]}"
+
+     if req.stream:
+         # Streaming response
+         async def generate_stream():
+             try:
+                 # Generate the full completion, then emit it in
+                 # whitespace-delimited chunks (pseudo-streaming)
+                 text = NEURO_NODE.generate(
+                     prompt=req.prompt,
+                     max_new_tokens=req.max_tokens,
+                     temperature=req.temperature,
+                 )
+
+                 # Emit chunks as SSE events
+                 tokens = text.split()
+                 for i, token in enumerate(tokens):
+                     yield f"data: {json.dumps({'token': token + ' ', 'index': i})}\n\n"
+
+                 # Final message
+                 yield f"data: {json.dumps({'token': '[DONE]', 'finish_reason': 'stop'})}\n\n"
+
+             except Exception as e:
+                 yield f"data: {json.dumps({'error': str(e)})}\n\n"
+
+         return StreamingResponse(
+             generate_stream(),
+             media_type="text/event-stream",
+         )
+
+     # Non-streaming response
+     try:
+         text = NEURO_NODE.generate(
+             prompt=req.prompt,
+             max_new_tokens=req.max_tokens,
+             temperature=req.temperature,
+         )
+
+         end_time = time.time()
+         inference_ms = (end_time - start_time) * 1000
+
+         # Count tokens (simple whitespace approximation)
+         prompt_tokens = len(req.prompt.split())
+         completion_tokens = len(text.split())
+
+         STATE["processed_count"] = STATE.get("processed_count", 0) + 1
+
+         return {
+             "id": request_id,
+             "text": text,
+             "tokens_generated": completion_tokens,
+             "finish_reason": "stop",
+             "usage": {
+                 "prompt_tokens": prompt_tokens,
+                 "completion_tokens": completion_tokens,
+                 "total_tokens": prompt_tokens + completion_tokens,
+             },
+             "cost": {
+                 "amount": completion_tokens * 0.000001,  # Approximate
+                 "currency": "NEURO",
+             },
+             "timing": {
+                 "queue_ms": 0,
+                 "inference_ms": inference_ms,
+                 "total_ms": inference_ms,
+             },
+         }
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+
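The streaming branch emits Server-Sent Events, one `data:` line per chunk, terminated by a `[DONE]` token. A consuming client might look like this (sketch; the URL is an assumption, and the framing follows the generator above):

```python
import json
import requests

BASE = "http://localhost:8000"  # assumed node address

def stream_completion(prompt: str, max_tokens: int = 64):
    body = {"prompt": prompt, "max_tokens": max_tokens, "stream": True}
    with requests.post(f"{BASE}/api/v1/inference", json=body, stream=True) as resp:
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data: "):
                continue  # skip blank SSE separators
            event = json.loads(line[len("data: "):])
            if event.get("token") == "[DONE]" or "error" in event:
                break
            yield event["token"]

# print("".join(stream_completion("Hello")))
```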
+ @node_app.get("/api/v1/wallet/balance")
+ async def get_wallet_balance_v1():
+     """Get wallet balance (SDK compatible)."""
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+
+     account = P2P.ledger.get_account_info()
+
+     return {
+         "address": P2P.ledger.node_id,
+         "balances": {
+             "available": account.get("balance", 0.0),
+             "staked": account.get("stake", 0.0),
+             "pending": 0.0,
+             "total": account.get("balance", 0.0) + account.get("stake", 0.0),
+         },
+         "staking": {
+             "amount": account.get("stake", 0.0),
+             "duration_days": 30,
+             "multiplier": account.get("stake_multiplier", 1.0),
+         },
+     }
+
+
+ @node_app.post("/api/v1/wallet/send")
+ async def send_neuro_v1(req: SendNEURORequest):
+     """Send NEURO tokens (SDK compatible)."""
+     from datetime import datetime
+
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+
+     success, message, tx = P2P.ledger.transfer(req.to, req.amount, req.memo)
+
+     if not success:
+         raise HTTPException(status_code=400, detail=message)
+
+     return {
+         "transaction_id": tx.tx_id if tx else "",
+         "from": P2P.ledger.node_id,
+         "to": req.to,
+         "amount": req.amount,
+         "fee": tx.fee if tx else 0.0,
+         "memo": req.memo,
+         "status": "confirmed",
+         "timestamp": datetime.now().isoformat(),
+     }
+
+
+ @node_app.get("/api/v1/wallet/transactions")
+ async def get_transactions_v1(limit: int = 10, offset: int = 0, type: Optional[str] = None):
+     """Get transaction history (SDK compatible)."""
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+
+     # Get recent proofs as transactions
+     import sqlite3
+     transactions = []
+
+     try:
+         with sqlite3.connect(P2P.ledger.db_path) as conn:
+             query = """
+                 SELECT signature, node_id, proof_type, timestamp, reward_amount
+                 FROM proof_history
+                 WHERE node_id = ?
+                 ORDER BY timestamp DESC
+                 LIMIT ? OFFSET ?
+             """
+             rows = conn.execute(query, (P2P.ledger.node_id, limit, offset)).fetchall()
+
+             for sig, node_id, ptype, ts, reward in rows:
+                 transactions.append({
+                     "id": sig[:16] if sig else "",
+                     "type": "reward",
+                     "amount": reward,
+                     "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(ts)),
+                     "details": {
+                         "proof_type": ptype,
+                     },
+                 })
+     except Exception:
+         pass
+
+     return {
+         "transactions": transactions,
+         "total": len(transactions),
+         "limit": limit,
+         "offset": offset,
+     }
+
+
+ @node_app.post("/api/v1/wallet/stake")
+ async def stake_neuro_v1(req: StakeRequest):
+     """Stake NEURO tokens (SDK compatible)."""
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+
+     success, message = P2P.ledger.stake(req.amount, req.duration_days)
+
+     if not success:
+         raise HTTPException(status_code=400, detail=message)
+
+     account = P2P.ledger.get_account_info()
+
+     from datetime import date, timedelta
+     start = date.today()
+     unlock = start + timedelta(days=req.duration_days)
+
+     return {
+         "success": True,
+         "stake": {
+             "amount": req.amount,
+             "duration_days": req.duration_days,
+             "start_date": start.isoformat(),
+             "unlock_date": unlock.isoformat(),
+             "multiplier": account.get("stake_multiplier", 1.0),
+         },
+         "new_balance": {
+             "available": account.get("balance", 0.0),
+             "staked": account.get("stake", 0.0),
+         },
+     }
+
+
+ @node_app.post("/api/v1/wallet/unstake")
+ async def unstake_neuro_v1(amount: Optional[float] = None):
+     """Request unstaking (SDK compatible).
+
+     NOTE: `amount` is accepted for SDK compatibility but ignored;
+     ledger.unstake() releases the full staked amount.
+     """
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+
+     success, unstaked_amount, message = P2P.ledger.unstake()
+
+     if not success:
+         raise HTTPException(status_code=400, detail=message)
+
+     from datetime import date, timedelta
+     available = date.today() + timedelta(days=7)
+
+     return {
+         "success": True,
+         "unstake": {
+             "amount": unstaked_amount,
+             "cooldown_days": 7,
+             "available_date": available.isoformat(),
+         },
+     }
+
+
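The stake/unstake pair implies a simple lifecycle: lock tokens for a duration to earn a multiplier, then later release the full stake into a 7-day cooldown. A client-side sketch (URL and amounts are assumptions; `StakeRequest` is presumed to carry `amount` and `duration_days`, matching the handler above):

```python
import requests

BASE = "http://localhost:8000"  # assumed node address

# Stake 100 NEURO for 30 days (illustrative values)
stake = requests.post(f"{BASE}/api/v1/wallet/stake",
                      json={"amount": 100.0, "duration_days": 30}).json()
print("unlocks", stake["stake"]["unlock_date"],
      "multiplier", stake["stake"]["multiplier"])

# Later: release the full stake; funds clear after the cooldown
unstake = requests.post(f"{BASE}/api/v1/wallet/unstake").json()
print("released", unstake["unstake"]["amount"],
      "available", unstake["unstake"]["available_date"])
```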
+ @node_app.get("/api/v1/wallet/rewards")
+ async def get_rewards_v1(start_date: Optional[str] = None, end_date: Optional[str] = None):
+     """Get reward history (SDK compatible)."""
+     if not P2P or not P2P.ledger:
+         raise HTTPException(status_code=503, detail="Ledger not available")
+
+     account = P2P.ledger.get_account_info()
+
+     return {
+         "total": account.get("total_earned", 0.0),
+         "by_day": [],  # Would need daily tracking
+         "by_type": {
+             "uptime": 0.0,
+             "inference": 0.0,
+             "training": 0.0,
+         },
+     }
+
+
+ @node_app.get("/api/v1/peers")
+ async def get_peers_v1():
+     """List connected peers (SDK compatible)."""
+     if not P2P:
+         raise HTTPException(status_code=503, detail="P2P not available")
+
+     peers = []
+     for peer_url, peer_info in P2P.known_peers.items():
+         # Parse peer info (guard: peer_info may not be a dict)
+         peer_id = peer_info.get("id", peer_url) if isinstance(peer_info, dict) else peer_url
+         role = "worker"
+         layers = []
+
+         if isinstance(peer_info, dict):
+             if peer_info.get("has_embedding"):
+                 role = "driver"
+             elif peer_info.get("has_lm_head"):
+                 role = "validator"
+             layers = peer_info.get("layers", [])
+
+         peers.append({
+             "id": peer_id,
+             "address": peer_url,
+             "role": role,
+             "layers": layers,
+             "latency_ms": 0.0,
+             "connected_since": None,
+         })
+
+     return {
+         "peers": peers,
+         "total": len(peers),
+     }
+
+
+ @node_app.get("/api/v1/layers")
+ async def get_layers_v1():
+     """List assigned layers (SDK compatible)."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     layers = []
+     for layer_id in NEURO_NODE.my_layer_ids:
+         layer_type = "transformer"
+         if layer_id == 0 and NEURO_NODE.model.has_embedding:
+             layer_type = "embedding"
+
+         layers.append({
+             "index": layer_id,
+             "type": layer_type,
+             "memory_mb": 0,  # Would need per-layer tracking
+             "status": "active",
+         })
+
+     # Add LM head if present
+     if NEURO_NODE.model.has_lm_head:
+         layers.append({
+             "index": max(NEURO_NODE.my_layer_ids) + 1 if NEURO_NODE.my_layer_ids else 0,
+             "type": "lm_head",
+             "memory_mb": 0,
+             "status": "active",
+         })
+
+     return {
+         "layers": layers,
+         "total_layers": len(NEURO_NODE.my_layer_ids),
+         "my_layer_count": len(NEURO_NODE.my_layer_ids),
+     }
+
+
+ @node_app.get("/api/v1/config")
+ async def get_config_v1():
+     """Get node configuration (SDK compatible)."""
+     if not NEURO_NODE:
+         raise HTTPException(status_code=503, detail="Node not ready")
+
+     port = STATE.get("port", 8000)
+
+     return {
+         "node_id": NEURO_NODE.node_id,
+         "port": port,
+         "grpc_port": port + 1000,
+         "tracker_url": "https://neuroshard.com/api/tracker",
+         "training": {
+             "enabled": NEURO_NODE.enable_training,
+             "batch_size": 8,
+             "learning_rate": 0.0001,
+             "diloco_steps": STATE.get("diloco_inner_steps", 500),
+         },
+         "resources": {
+             "max_memory_mb": STATE.get("config_memory_mb"),
+             "cpu_threads": STATE.get("config_cpu_threads"),
+         },
+     }
+
+
+ @node_app.patch("/api/v1/config")
+ async def update_config_v1(updates: dict):
+     """Update node configuration (SDK compatible)."""
+     updated = []
+
+     if "training" in updates:
+         training = updates["training"]
+         if "batch_size" in training:
+             updated.append("training.batch_size")
+         if "diloco_steps" in training:
+             STATE["diloco_inner_steps"] = training["diloco_steps"]
+             updated.append("training.diloco_steps")
+
+     if "resources" in updates:
+         resources = updates["resources"]
+         if "max_memory_mb" in resources:
+             STATE["config_memory_mb"] = resources["max_memory_mb"]
+             updated.append("resources.max_memory_mb")
+         if "cpu_threads" in resources:
+             STATE["config_cpu_threads"] = resources["cpu_threads"]
+             updated.append("resources.cpu_threads")
+
+     return {
+         "success": True,
+         "updated": updated,
+         "restart_required": False,
+     }
+
+
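Because the training loop re-reads `STATE["config_memory_mb"]` and `STATE["config_cpu_threads"]` on every throttle recalculation (see `calculate_throttle` inside `run_node` below), a PATCH like the following takes effect within seconds, with no restart (sketch; values are illustrative):

```python
import requests

BASE = "http://localhost:8000"  # assumed node address

resp = requests.patch(f"{BASE}/api/v1/config", json={
    "training": {"diloco_steps": 250},    # sync the outer step more often
    "resources": {"max_memory_mb": 4096,  # throttle hints, not hard caps
                  "cpu_threads": 4},
})
print(resp.json())  # e.g. {'success': True, 'updated': [...], 'restart_required': False}
```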
+ # ==================== UTILITY FUNCTIONS ====================
+
+ def get_public_ip():
+     """Attempt to get the public IP address of this node."""
+     try:
+         services = [
+             'https://api.ipify.org',
+             'https://ifconfig.me/ip',
+             'https://icanhazip.com'
+         ]
+         for service in services:
+             try:
+                 return requests.get(service, timeout=3).text.strip()
+             except Exception:
+                 continue
+     except Exception:
+         pass
+     return None
+
+
+ def get_local_ip():
+     """Best-effort LAN IP discovery via a connected UDP socket (no packets are sent)."""
+     s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+     try:
+         s.connect(('10.255.255.255', 1))
+         IP = s.getsockname()[0]
+     except Exception:
+         IP = '127.0.0.1'
+     finally:
+         s.close()
+     return IP
+
+
+ def run_node(
+     port: int,
+     tracker: str = "https://neuroshard.com/api/tracker",
+     node_token: Optional[str] = None,
+     announce_ip: Optional[str] = None,
+     announce_port: Optional[int] = None,
+     enable_training: bool = True,
+     available_memory_mb: Optional[float] = None,
+     max_storage_mb: float = 100.0,
+     max_cpu_threads: Optional[int] = None,
+     diloco_inner_steps: int = 500,
+     device: str = "auto",
+ ):
+     """
+     Start a NeuroShard node.
+
+     TRULY DECENTRALIZED:
+     - No fixed phases or model sizes
+     - Node contributes based on available memory
+     - More memory = more layers = more NEURO rewards
+
+     MULTI-NODE SUPPORT:
+     - Same token on multiple machines/ports is now supported
+     - Each instance gets a unique network identity (for layers)
+     - Earnings accumulate to the same NEURO wallet
+
+     Args:
+         port: HTTP port
+         tracker: Tracker URL for peer discovery
+         node_token: Authentication token
+         announce_ip: Public IP to announce (auto-detected if omitted)
+         announce_port: Public port to announce (defaults to `port`)
+         enable_training: Whether to participate in training
+         available_memory_mb: Override memory detection (for testing)
+         max_storage_mb: Maximum disk space for training data shards
+         max_cpu_threads: Maximum CPU threads to use for training
+         diloco_inner_steps: Inner optimizer steps between DiLoCo syncs
+         device: Compute device ("auto", "cpu", "cuda", or "mps")
+     """
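+     # Example invocation (illustrative; these values are assumptions, not defaults):
+     #   run_node(port=8000, node_token="<token>", enable_training=True,
+     #            available_memory_mb=4096, max_cpu_threads=4)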
+     global NEURO_NODE, P2P
+
+     # CRITICAL: Clear shutdown flag from previous run (for GUI restart support)
+     _SHUTDOWN_REQUESTED.clear()
+
+     # Reset STATE for fresh start (important for GUI restart)
+     STATE.clear()
+     STATE.update({
+         "shard_range": "Unknown",
+         "peer_count": 0,
+         "processed_count": 0,
+         "training_updates": 0,
+         "token_count": 0,
+         "training_batches": 0,
+         "assigned_layers": [],
+         "has_embedding": False,
+         "has_lm_head": False,
+     })
+
+     logger.info(f"Starting NeuroShard Node {__version__} on Port {port}")
+
+     # Multi-node detection and info
+     from neuroshard.utils.hardware import get_instance_id, get_machine_id
+     instance_id = get_instance_id(port)
+     machine_id = get_machine_id()
+
+     logger.info(f"Machine ID: {machine_id}")
+     logger.info(f"Instance ID: {instance_id} (machine:port unique)")
+
+     if node_token:
+         wallet_id = hashlib.sha256(node_token.encode()).hexdigest()[:16]
+         logger.info(f"Wallet ID: {wallet_id}... (NEURO earnings go here)")
+         logger.info("=" * 50)
+         logger.info("MULTI-NODE INFO:")
+         logger.info(" Same token on multiple machines? Each gets unique assignment")
+         logger.info("=" * 50)
+     logger.info(f"Dashboard available at http://localhost:{port}/")
+     logger.info(f"Max training data storage: {max_storage_mb}MB")
+
+     # Thread configuration
+     # Note: For GUI mode, this is already set in gui_runner.py wrapper
+     # For CLI mode, we do our best here (may fail if torch already initialized)
+     if max_cpu_threads:
+         logger.info(f"Limiting CPU threads to: {max_cpu_threads}")
+
+         # Set environment variables (best effort: they only help libraries
+         # that have not initialized their thread pools yet)
+         import os
+         os.environ['OMP_NUM_THREADS'] = str(max_cpu_threads)
+         os.environ['MKL_NUM_THREADS'] = str(max_cpu_threads)
+         os.environ['OPENBLAS_NUM_THREADS'] = str(max_cpu_threads)
+
+         # Try to set PyTorch threads (may fail if already set)
+         try:
+             torch.set_num_threads(max_cpu_threads)
+             torch.set_num_interop_threads(max(1, max_cpu_threads // 2))
+         except RuntimeError:
+             # Already configured (likely by GUI wrapper or torch initialized)
+             pass
+
+     # Lower process priority (to not hog system resources)
+     try:
+         if sys.platform == 'win32':
+             # Windows: Use SetPriorityClass
+             import ctypes
+             kernel32 = ctypes.windll.kernel32
+             BELOW_NORMAL_PRIORITY_CLASS = 0x00004000
+             kernel32.SetPriorityClass(kernel32.GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS)
+             logger.info("Process priority lowered (Windows BELOW_NORMAL)")
+         elif hasattr(os, 'nice'):
+             # Unix/Mac: Use nice
+             os.nice(10)
+             logger.info("Process priority lowered (nice=10)")
+     except Exception:
+         pass
+
+     if node_token:
+         logger.info(f"Authenticated with Token: {node_token[:8]}...")
+
+     # FULLY DECENTRALIZED INITIALIZATION ORDER:
+     # 1. Set up networking FIRST (so DHT is available for layer discovery)
+     # 2. Initialize P2P BEFORE creating the node
+     # 3. Create node WITH P2P connected (uses DHT for network discovery)
+     # This ensures layer assignment can use DHT to detect existing nodes!
+
+     token_for_id = node_token or str(uuid.uuid4())
+
+     # 1. Set up networking FIRST
+     from neuroshard.core.network.nat import NATTraverser
+     nat = NATTraverser()
+
+     ip_addr = announce_ip or nat.discover_public_ip() or get_public_ip() or get_local_ip()
+
+     # UPnP mapping
+     nat.attempt_upnp_mapping(port, "TCP", "NeuroShard HTTP")
+     nat.attempt_upnp_mapping(port + 1000, "TCP", "NeuroShard gRPC")
+
+     final_announce_port = announce_port or port
+     logger.info(f"Announcing as: {ip_addr}:{final_announce_port}")
+
+     my_url = f"http://{ip_addr}:{final_announce_port}"
+
+     # 2. Initialize P2P BEFORE creating the node
+     # Use temporary shard_range "0-0" - will be updated after layer assignment
+     # This allows DHT to be available for network discovery during layer assignment!
+     P2P = P2PManager(my_url, "0-0", tracker, node_token=node_token)
+     P2P.state_ref = STATE
+
+     # CRITICAL: Synchronously fetch peers and populate routing table BEFORE node creation!
+     # The background thread might not have run yet, so we do it explicitly here.
+     logger.info("DHT bootstrapping... (discovering existing nodes)")
+     import time
+     import hashlib as hashlib_module  # Avoid shadowing issues
+
+     try:
+         import requests
+         from urllib.parse import urlparse
+         from neuroshard.core.network.dht import Node
+
+         # Fetch ALL peers from tracker
+         resp = requests.get(f"{tracker}/peers", params={"limit": 100}, timeout=5)
+         if resp.status_code == 200:
+             peers = resp.json()
+             peer_count = 0
+             for p in peers:
+                 if p.get("url") != my_url:
+                     P2P.known_peers[p["url"]] = p
+                     # Add to DHT routing table so layer lookups can find them!
+                     if P2P.routing_table:
+                         try:
+                             p_parsed = urlparse(p["url"])
+                             p_ip = p_parsed.hostname
+                             p_port = p_parsed.port or 80
+                             p_id = int(hashlib_module.sha1(p["url"].encode()).hexdigest(), 16)
+                             P2P.routing_table.add_contact(Node(p_id, p_ip, p_port))
+                             peer_count += 1
+                         except Exception:
+                             pass
+             if peer_count > 0:
+                 logger.info(f"DHT: Added {peer_count} peers to routing table")
+     except Exception as e:
+         logger.debug(f"Peer discovery failed: {e}")
+
+     # Additional wait to let DHT stabilize
+     time.sleep(1)
+
+     logger.info(f"Initializing NeuroShard Node (training={enable_training}, DiLoCo steps={diloco_inner_steps})...")
+
+     # 3. Create swarm config
+     swarm_config = SwarmNodeConfig(
+         diloco_inner_steps=diloco_inner_steps,
+     )
+
+     # 4. Create node WITH P2P already available
+     # This allows layer assignment to use DHT for network discovery!
+     NEURO_NODE = create_swarm_node_with_p2p(
+         node_token=token_for_id,
+         port=port,
+         tracker_url=tracker,
+         config=swarm_config,
+         available_memory_mb=available_memory_mb,
+         enable_training=enable_training,
+         max_storage_mb=max_storage_mb,
+         max_cpu_threads=max_cpu_threads,
+         device=device,
+         p2p_manager=P2P,  # Pass P2P so DHT is available during layer assignment!
+     )
+
+     STATE["diloco_inner_steps"] = diloco_inner_steps
+
+     logger.info(f"NeuroLLM loaded: {NEURO_NODE.model.get_num_params() / 1e6:.1f}M parameters")
+     logger.info(f"Assigned layers: {NEURO_NODE.my_layer_ids}")
+     logger.info(f"Embedding: {NEURO_NODE.model.has_embedding}, LM Head: {NEURO_NODE.model.has_lm_head}")
+     logger.info(f"DiLoCo: inner_steps={diloco_inner_steps}")
+
+     # EARLY NETWORK WARNING
+     num_layers = len(NEURO_NODE.my_layer_ids)
+     if num_layers > 50:
+         logger.warning("⚠️ EARLY NETWORK NOTICE ⚠️")
+         logger.warning(f"You're holding {num_layers} layers because the network is small.")
+         logger.warning("This is TEMPORARY - as more nodes join, the model will be sharded.")
+
+     # Show initial memory usage
+     try:
+         import psutil
+         process = psutil.Process()
+         process_mem_mb = process.memory_info().rss / (1024 * 1024)
+         logger.info(f"Current memory usage: {process_mem_mb:.0f}MB / {available_memory_mb or '?'}MB allocated")
+     except Exception:
+         pass
+
+     # 5. Update P2P shard_range with actual assigned layers
+     layer_ids = NEURO_NODE.my_layer_ids
+     if layer_ids:
+         start_layer = min(layer_ids)
+         end_layer = max(layer_ids)
+         shard_range = f"{start_layer}-{end_layer}"
+     else:
+         shard_range = "0-0"
+     P2P.shard_range = shard_range
+     P2P.start_layer = start_layer if layer_ids else 0
+     P2P.end_layer = end_layer if layer_ids else 0
+     STATE["shard_range"] = shard_range
+     logger.info(f"P2P shard_range: {shard_range} (layers {layer_ids})")
+
+     # Set node role info for PoNW reward calculation
+     STATE["assigned_layers"] = NEURO_NODE.my_layer_ids
+     STATE["has_embedding"] = NEURO_NODE.model.has_embedding
+     STATE["has_lm_head"] = NEURO_NODE.model.has_lm_head
+     STATE["current_loss"] = NEURO_NODE.current_loss if NEURO_NODE.current_loss != float('inf') else None
+
+     logger.info("Connected to P2P network for distributed training")
+
+     # 6. Set up ROLE VERIFICATION to prevent fake Validator/Driver claims
+     # This is CRITICAL for security - nodes can't claim roles they don't have
+     def verify_node_role(node_id: str, claimed_embed: bool, claimed_head: bool):
+         """
+         Verify that a node actually holds the layers it claims.
+
+         Uses THREE sources for verification (defense in depth):
+         1. Local layer_pool (authoritative for nodes we know)
+         2. Heartbeat/peer stats from the swarm router (for remote nodes)
+         3. Conservative fallback (accept the proof, strip role bonuses)
+
+         Returns: (is_valid, actual_has_embedding, actual_has_lm_head)
+         """
+         import json
+         import hashlib
+
+         # 1. LOCAL VERIFICATION (fastest, most authoritative)
+         if NEURO_NODE.layer_pool:
+             layer_0_holders = [a.node_id for a in NEURO_NODE.layer_pool.get_layer_holders(0)]
+             last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
+             last_layer_holders = [a.node_id for a in NEURO_NODE.layer_pool.get_layer_holders(last_layer)]
+
+             # Check if we know this node locally
+             all_known_nodes = set(layer_0_holders + last_layer_holders)
+             for assignments in NEURO_NODE.layer_pool.layer_assignments.values():
+                 for a in assignments:
+                     all_known_nodes.add(a.node_id)
+
+             if node_id in all_known_nodes:
+                 # We know this node - verify against local data
+                 actual_embed = node_id in layer_0_holders
+                 actual_head = node_id in last_layer_holders
+
+                 is_valid = True
+                 if claimed_head and not actual_head:
+                     is_valid = False
+                 if claimed_embed and not actual_embed:
+                     is_valid = False
+
+                 return is_valid, actual_embed, actual_head
+
+         # 2. HEARTBEAT/PEER_STATS VERIFICATION (from swarm router)
+         # Heartbeats contain node_id AND layer_range - this is the best source for remote nodes!
+         # Note: swarm_components contains SwarmComponents (router, buffers, etc.)
+         if hasattr(NEURO_NODE, 'swarm_components') and NEURO_NODE.swarm_components and hasattr(NEURO_NODE.swarm_components, 'swarm_router'):
+             router = NEURO_NODE.swarm_components.swarm_router
+             if hasattr(router, 'peer_stats') and node_id in router.peer_stats:
+                 peer = router.peer_stats[node_id]
+                 layer_range = peer.layer_range  # (start, end) tuple
+
+                 # Get last layer from our layer pool
+                 last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1) if NEURO_NODE.layer_pool else 0
+
+                 # Driver = holds layer 0
+                 actual_embed = layer_range[0] == 0
+                 # Validator = holds last layer
+                 actual_head = last_layer in range(layer_range[0], layer_range[1])
+
+                 is_valid = True
+                 if claimed_head and not actual_head:
+                     is_valid = False
+                 if claimed_embed and not actual_embed:
+                     is_valid = False
+
+                 logger.debug(f"Role verification via heartbeat: {node_id[:16]}... "
+                              f"layers={layer_range}, embed={actual_embed}, head={actual_head}")
+                 return is_valid, actual_embed, actual_head
+
+         # 3. FALLBACK: For unknown nodes, use CONSERVATIVE verification
+         # NOTE: DHT stores IP:port not node_id, so we can't verify roles via DHT alone
+         # If we can't verify, we have two options:
+         #   a) REJECT all unknown claims (secure but might reject valid proofs)
+         #   b) ACCEPT but cap rewards (economic security)
+         #
+         # We use option (b) - the proof is ACCEPTED but role bonuses are NOT applied
+         # This is handled in _calculate_reward by checking verified roles
+
+         # For now, if we can't verify, return "claims not verified"
+         # The reward calculation should treat unverified claims as false
+         logger.debug(f"Role verification: Node {node_id[:16]}... not in local pool, claims unverifiable")
+
+         # Return: valid=True (don't reject), but actual roles = False (no bonus)
+         # This allows the proof through but without Validator/Driver bonuses
+         return True, False, False
+
+     P2P.ledger.set_role_verifier(verify_node_role)
+     logger.info("Role verification enabled - fake Validator/Driver claims are rejected or stripped of role bonuses")
+
+     # Set model interface for training work verification
+     P2P.ledger.set_model_interface(NEURO_NODE)
+
+     # 7. Start Swarm components
+     if hasattr(NEURO_NODE, 'start_swarm_sync'):
+         logger.info("[SWARM] Starting swarm components...")
+         NEURO_NODE.start_swarm_sync()
+         logger.info("[SWARM] Swarm components started")
+
+     # 8. Start gRPC server
+     start_grpc_background(port, NEURO_NODE, P2P, None)
+
+     # 9. Background tasks loop (continuous, with adaptive sleep)
+     def background_tasks():
+         # CONTINUOUS TRAINING with USER-DEFINED THROTTLING
+         # Respects the user's CPU AND RAM limits so the node can run in the
+         # background without hogging resources.
+         # Settings are re-read each iteration so changes take effect immediately!
+
+         import psutil
+
+         # Store initial limits (can be updated via API)
+         STATE["config_cpu_threads"] = max_cpu_threads
+         STATE["config_memory_mb"] = available_memory_mb
+         STATE["config_storage_mb"] = max_storage_mb
+
+         total_cpu_cores = psutil.cpu_count() or 4
+         total_ram_mb = psutil.virtual_memory().total / (1024 * 1024)
+         last_throttle_log = 0
+
+         def calculate_throttle():
+             """Calculate throttle settings from current config (allows live updates)."""
+             # Read current config (can be updated via API while running)
+             user_cpu_limit = STATE.get("config_cpu_threads") or total_cpu_cores
+             user_ram_limit = STATE.get("config_memory_mb") or (total_ram_mb * 0.7)
+
+             cpu_ratio = min(1.0, user_cpu_limit / total_cpu_cores)
+             ram_ratio = min(1.0, user_ram_limit / total_ram_mb)
+             resource_ratio = min(cpu_ratio, ram_ratio)
+
+             # GPU nodes can train much faster without lagging the system
+             is_gpu = NEURO_NODE.device in ["cuda", "mps"] if NEURO_NODE else False
+
+             # Log device status occasionally to debug "why is it slow?"
+             # Use time.time() directly to avoid closure issues with 'now'
+             current_time = time.time()
+             if current_time - last_throttle_log >= 60:
+                 current_device = NEURO_NODE.device if NEURO_NODE else 'None'
+                 logger.debug(f"[NODE] Device: {current_device} (is_gpu={is_gpu})")
+
+             base_interval = 0.01 if is_gpu else 2.0
+
+             interval = max(base_interval, base_interval / max(0.1, resource_ratio))
+             # Allow much higher steps per minute on GPU
+             base_max_steps = 600 if is_gpu else 30
+             max_steps = max(5, int(base_max_steps * resource_ratio))
+
+             # Store for API access
+             STATE["throttle_cpu_ratio"] = cpu_ratio
+             STATE["throttle_ram_ratio"] = ram_ratio
+             STATE["throttle_effective"] = resource_ratio
+             STATE["throttle_interval"] = interval
+             STATE["throttle_max_steps"] = max_steps
+
+             return interval, max_steps, resource_ratio
+
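+         # Worked example (illustrative numbers): with 4 of 8 CPU threads allowed
+         # and no RAM cap, resource_ratio = 0.5. On CPU that yields
+         # interval = max(2.0, 2.0 / 0.5) = 4.0s between steps and
+         # max_steps = max(5, int(30 * 0.5)) = 15 steps/min.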
+         # Initial calculation and log
+         min_interval_between_steps, max_steps_per_minute, resource_ratio = calculate_throttle()
+         logger.info(f"[NODE] Training throttle: effective={resource_ratio*100:.0f}%, "
+                     f"interval={min_interval_between_steps:.1f}s, max={max_steps_per_minute} steps/min")
+
+         last_train_complete = 0
+         # BUGFIX: Initialize to current values (may be >0 if loaded from checkpoint)
+         last_tokens = NEURO_NODE.total_tokens_processed if NEURO_NODE else 0
+         last_training_rounds = NEURO_NODE.total_training_rounds if NEURO_NODE else 0
+         training_in_progress = False
+         consecutive_data_not_ready = 0
+         steps_this_minute = 0
+         training_step_count = 0  # Track total steps for logging
+         minute_start = time.time()
+         last_memory_report = 0  # For periodic memory usage logging
+         last_training_heartbeat = 0  # For periodic training loop status
+
+         while not _SHUTDOWN_REQUESTED.is_set():
+             now = time.time()
+
+             # Reset per-minute counter
+             if now - minute_start >= 60:
+                 steps_this_minute = 0
+                 minute_start = now
+
+             # RE-CALCULATE THROTTLE periodically (allows live config changes)
+             # Only recalculate every 5 seconds to avoid overhead
+             if now - last_throttle_log >= 5:
+                 new_interval, new_max_steps, new_ratio = calculate_throttle()
+                 # Log only if changed significantly
+                 if abs(new_ratio - resource_ratio) > 0.05:
+                     logger.info(f"[NODE] Throttle updated: {new_ratio*100:.0f}% "
+                                 f"(interval={new_interval:.1f}s, max={new_max_steps}/min)")
+                 min_interval_between_steps = new_interval
+                 max_steps_per_minute = new_max_steps
+                 resource_ratio = new_ratio
+                 last_throttle_log = now
+
+             # Update peer count
+             STATE["peer_count"] = len(P2P.known_peers)
+
+             # TRAINING LOOP HEARTBEAT (every 30 seconds) - confirms loop is running
+             if now - last_training_heartbeat >= 30:
+                 last_training_heartbeat = now
+                 data_status = "unknown"
+                 if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+                     try:
+                         loader = NEURO_NODE.genesis_loader
+                         loaded = len(loader.loaded_shards)
+                         prefetch = len(loader._prefetch_ready)
+                         data_status = f"loaded={loaded},prefetch={prefetch}"
+                     except Exception:
+                         data_status = "error"
+                 logger.debug(f"[NODE] Training loop alive: status={STATE.get('training_status', '?')}, "
+                              f"steps={training_step_count}, data={data_status}")
+
+             # PERIODIC MEMORY REPORT (every 60 seconds)
+             if now - last_memory_report >= 60:
+                 try:
+                     import os
+                     process = psutil.Process(os.getpid())
+                     process_mem_mb = process.memory_info().rss / (1024 * 1024)
+                     memory_limit = STATE.get("config_memory_mb") or available_memory_mb
+                     system_mem = psutil.virtual_memory()
+
+                     logger.info(f"[NODE] Memory: process={process_mem_mb:.0f}MB / {memory_limit or '?'}MB limit, "
+                                 f"system={system_mem.percent:.0f}% ({system_mem.used/(1024**3):.1f}GB / {system_mem.total/(1024**3):.1f}GB)")
+
+                     # Show Genesis data loader stats if training
+                     if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+                         loader = NEURO_NODE.genesis_loader
+                         stats = loader.get_stats()
+                         num_loaded = stats.get('loaded_shards', 0)
+                         num_prefetched = stats.get('prefetch_ready', 0)
+                         shard_id = stats.get('current_shard_id', '?')
+                         shard_progress = stats.get('shard_progress_pct', 0)
+                         loss_avg = stats.get('loss_avg', 0)
+
+                         logger.info(f"[NODE] Genesis: shard {shard_id} ({shard_progress:.0f}% done), "
+                                     f"{num_loaded} loaded + {num_prefetched} prefetched")
+
+                         # Show loss plateau status if loss is tracked
+                         if loss_avg > 0:
+                             loss_var = stats.get('loss_variance', 0)
+                             steps_shard = stats.get('steps_on_current_shard', 0)
+                             min_steps = 100  # Minimum steps before plateau can trigger rotation
+
+                             # Plateau = low variance + low loss + enough steps
+                             is_plateau = loss_var < 0.02 and loss_avg < 0.05 and steps_shard >= min_steps
+                             if is_plateau:
+                                 plateau_status = "will_rotate"
+                             elif loss_var < 0.02 and loss_avg < 0.05:
+                                 plateau_status = f"plateau (need {min_steps - steps_shard} more steps)"
+                             else:
+                                 plateau_status = "learning"
+
+                             logger.info(f"[NODE] Training: loss_avg={loss_avg:.4f}, variance={loss_var:.6f}, "
+                                         f"steps_on_shard={steps_shard}, status={plateau_status}")
+
+                     last_memory_report = now
+                 except Exception:
+                     pass
+
+             # Update token count and training batches from node
+             current_tokens = NEURO_NODE.total_tokens_processed
+             current_training = NEURO_NODE.total_training_rounds
+
+             # Add DELTA to STATE counters (for PoNW proof calculation)
+             # NOTE: last_tokens/last_training_rounds are initialized to current values
+             # at startup to handle checkpoint loading correctly
+             STATE["token_count"] = STATE.get("token_count", 0) + (current_tokens - last_tokens)
+             STATE["training_batches"] = STATE.get("training_batches", 0) + (current_training - last_training_rounds)
+
+             last_tokens = current_tokens
+             last_training_rounds = current_training
+
+             # Store totals for display
+             STATE["total_tokens_processed"] = current_tokens
+             STATE["total_training_rounds"] = current_training
+
+             # Update model hash for PoNW proofs
+             # IMPORTANT: Must use same hash algorithm as SwarmEnabledDynamicNode._get_model_hash()
+             # to ensure proofs verify correctly
+             if NEURO_NODE.model:
+                 if hasattr(NEURO_NODE, '_get_model_hash'):
+                     # Use the swarm node's hash method for consistency
+                     STATE["model_hash"] = NEURO_NODE._get_model_hash()
+                 else:
+                     # Fallback: compute architecture-based hash (same logic as factory.py)
+                     hasher = hashlib.sha256()
+                     arch_str = f"{NEURO_NODE.model.hidden_dim}:{len(NEURO_NODE.my_layer_ids)}:{getattr(NEURO_NODE.model, 'num_heads', 0)}"
+                     hasher.update(arch_str.encode())
+                     for name, param in sorted(NEURO_NODE.model.named_parameters()):
+                         hasher.update(f"{name}:{list(param.shape)}".encode())
+                     STATE["model_hash"] = hasher.hexdigest()[:16]
+
+             # Session cleanup
+             to_remove = [sid for sid, ts in SESSION_TIMESTAMPS.items() if now - ts > 300]
+             for sid in to_remove:
+                 del SESSION_TIMESTAMPS[sid]
+
+             # Marketplace cleanup (every 60 seconds)
+             if int(now) % 60 == 0:
+                 market = P2P.ledger.inference_market
+                 # Cleanup stale claims
+                 stale = market.cleanup_stale_claims()
+                 if stale > 0:
+                     logger.info(f"[MARKET] Cleaned up {stale} stale claims")
+                 # Cleanup old results
+                 market.cleanup_old_results()
+
+             # VALIDATOR ELIGIBILITY CHECK
+             # Ensure validators still meet stake requirements when tier changes
+             if NEURO_NODE and NEURO_NODE.layer_pool:
+                 def get_node_stake(node_id: str) -> float:
+                     """Get stake for a node (checks local ledger)."""
+                     if node_id == NEURO_NODE.node_id:
+                         return P2P.ledger.get_account_info().get("stake", 0.0)
+                     # For remote nodes, we'd need to query their stake
+                     # For now, assume they meet requirements (trust but verify via gossip)
+                     return float('inf')
+
+                 # Check if any validators need demotion
+                 demoted = NEURO_NODE.layer_pool.validate_all_validators(get_node_stake)
+
+                 # If we were demoted, disable our LM head
+                 if NEURO_NODE.node_id in demoted and NEURO_NODE.model:
+                     NEURO_NODE.model.disable_lm_head()
+                     logger.warning("[NODE] Self-demoted from Validator due to stake tier change")
+
+             # CONTINUOUS TRAINING with smart throttling:
+             # 1. Training must be enabled
+             # 2. NEURO_NODE must exist
+             # 3. No training currently in progress
+             # 4. Minimum interval since last step (for system responsiveness)
+             # 5. Haven't exceeded max steps per minute (optional throttle)
+             should_train = (
+                 enable_training and
+                 not training_in_progress and
+                 (now - last_train_complete) >= min_interval_between_steps and
+                 steps_this_minute < max_steps_per_minute
+             )
+
+             if should_train:
+                 # MEMORY WARNING: Log if over limit (rate-limited to once per 60s)
+                 # Note: This is informational only - we don't skip training because
+                 # the --memory flag is a HINT for layer calculation, not a hard cap
+                 try:
+                     import os
+                     process = psutil.Process(os.getpid())
+                     process_mem_mb = process.memory_info().rss / (1024 * 1024)
+                     memory_limit = STATE.get("config_memory_mb") or available_memory_mb
+
+                     # Rate-limit warning to once per 60 seconds
+                     last_mem_warning = STATE.get("_last_mem_warning", 0)
+                     if memory_limit and process_mem_mb > memory_limit * 1.2 and (now - last_mem_warning) >= 60:
+                         STATE["_last_mem_warning"] = now
+                         system_mem = psutil.virtual_memory()
+                         logger.info(f"[NODE] Memory note: process={process_mem_mb:.0f}MB (limit={memory_limit}MB is a hint, not cap)")
+                         logger.info(f"[NODE] System has {system_mem.available / (1024**3):.1f}GB available - training continues normally")
+
+                         # Only clear caches if system memory is actually low (>80% used)
+                         if system_mem.percent > 80:
+                             logger.warning(f"[NODE] System memory high ({system_mem.percent}%), clearing caches...")
+                             if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+                                 loader = NEURO_NODE.genesis_loader
+                                 current_shard = loader.assigned_shard_ids[loader.current_shard_idx % len(loader.assigned_shard_ids)] if loader.assigned_shard_ids else None
+                                 shards_to_remove = [sid for sid in loader.loaded_shards.keys() if sid != current_shard]
+                                 for sid in shards_to_remove:
+                                     del loader.loaded_shards[sid]
+                                 loader._prefetch_ready.clear()
+                             import gc
+                             gc.collect()
+                             if NEURO_NODE.device == "cuda":
+                                 torch.cuda.empty_cache()
+                             elif NEURO_NODE.device == "mps":
+                                 torch.mps.empty_cache()
+                 except Exception:
+                     pass
+
+                 # Check if data is ready (non-blocking)
+                 data_ready = False
+                 if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+                     try:
+                         # Use timeout to prevent lock contention from blocking training loop
+                         data_ready = NEURO_NODE.genesis_loader.is_data_ready()
+                     except Exception as e:
+                         logger.warning(f"[GENESIS] is_data_ready() error: {e}")
+                         data_ready = False
+
+                     # Show shard download status periodically
+                     if not data_ready and consecutive_data_not_ready % 5 == 0:
+                         try:
+                             stats = NEURO_NODE.genesis_loader.get_stats()
+                             logger.info(f"[GENESIS] Status: assigned={stats.get('assigned_shards', 0)} shards, "
+                                         f"loaded={stats.get('loaded_shards', 0)}, "
+                                         f"prefetching={stats.get('prefetch_in_progress', 0)}")
+                         except Exception:
+                             pass
+                     elif data_ready and training_step_count == 0:
+                         logger.info("[GENESIS] Data ready! Starting first training step...")
+                         training_step_count = 1  # Prevent repeat message
+                 else:
+                     # No genesis loader yet - first training step will create it
+                     data_ready = True
+
+                 if data_ready or consecutive_data_not_ready > 3:
+                     training_in_progress = True
+                     step_start = time.time()
+
+                     STATE["training_status"] = "running"
+
+                     # Debug: Log why we're training (before resetting the counter)
+                     if not data_ready:
+                         logger.debug(f"[NODE] Forcing training step after {consecutive_data_not_ready} waits")
+                     consecutive_data_not_ready = 0
+
+                     try:
+                         loss = NEURO_NODE.train_step()
+                         step_duration = time.time() - step_start
+
+                         if loss is not None:
+                             steps_this_minute += 1
+                             training_step_count += 1
+
+                             # Get LR from DiLoCo trainer if available
+                             lr_info = ""
+                             # Note: swarm_components contains SwarmComponents (DiLoCo, etc.)
+                             if hasattr(NEURO_NODE, 'swarm_components') and NEURO_NODE.swarm_components:
+                                 diloco = getattr(NEURO_NODE.swarm_components, 'diloco_trainer', None)
+                                 if diloco:
+                                     current_lr = diloco.get_current_lr()
+                                     lr_info = f", lr={current_lr:.2e}"
+
+                             # Log every step with timing info
+                             logger.info(f"[NODE] Training step #{NEURO_NODE.total_training_rounds}: "
+                                         f"loss={loss:.4f}{lr_info} ({step_duration:.1f}s)")
+                             STATE["training_status"] = "idle"
+                             STATE["last_loss"] = loss
+                             STATE["current_loss"] = loss  # For gossip proof creation
+                         else:
+                             # train_step returned None - log why
+                             logger.info(f"[NODE] Training step returned None (took {step_duration:.1f}s)")
+                             STATE["training_status"] = "waiting_for_data"
+
+                     except RuntimeError as e:
+                         error_msg = str(e).lower()
+                         if "not ready" in error_msg:
+                             if consecutive_data_not_ready == 0:
+                                 logger.info("[NODE] Waiting for Genesis data to download...")
+                                 if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+                                     stats = NEURO_NODE.genesis_loader.get_stats()
+                                     logger.info(f"[GENESIS] Downloading shard... "
+                                                 f"(assigned: {stats.get('assigned_shards', '?')}, "
+                                                 f"loaded: {stats.get('loaded_shards', 0)}, "
+                                                 f"prefetching: {stats.get('prefetch_in_progress', 0)})")
+                             STATE["training_status"] = "loading_data"
+                             consecutive_data_not_ready += 1
+                         elif "genesis loader init failed" in error_msg or "manifest" in error_msg:
+                             # Genesis loader initialization error - show details
+                             logger.error(f"[GENESIS] ERROR: {e}")
+                             STATE["training_status"] = "genesis_error"
+                             # Don't spam - wait before retrying
+                             time.sleep(10)
+                         else:
+                             logger.error(f"[NODE] Training error: {e}")
+                             STATE["training_status"] = "error"
+                     except Exception as e:
+                         logger.error(f"[NODE] Training error: {e}")
+                         STATE["training_status"] = "error"
+
+                     training_in_progress = False
+                     last_train_complete = time.time()
+                 else:
+                     consecutive_data_not_ready += 1
+                     if consecutive_data_not_ready == 1:
+                         logger.info("[NODE] Waiting for training data to load...")
+
+             # Heartbeat for layers (only every 10 seconds to reduce overhead)
+             if int(now) % 10 == 0 and NEURO_NODE.layer_pool:
+                 NEURO_NODE.layer_pool.heartbeat(NEURO_NODE.node_id, NEURO_NODE.my_layer_ids)
+
+             # Cleanup stale layer assignments (every 60 seconds)
+             if int(now) % 60 == 0 and NEURO_NODE.layer_pool:
+                 removed = NEURO_NODE.layer_pool.cleanup_stale_assignments()
+                 if removed > 0:
+                     logger.info(f"[LAYER_POOL] Cleaned up {removed} stale layer assignments")
+
+             # TOKENIZER AUTO-REFRESH: Check for vocab updates every 10 minutes
+             # Synced with MANIFEST_REFRESH_INTERVAL (600s) in GenesisDataLoader
+             # This ensures model embedding expands when tokenizer grows
+             if int(now) % 600 == 0:  # Every 10 minutes (matches data loader refresh)
+                 try:
+                     if hasattr(NEURO_NODE, '_load_learned_tokenizer'):
+                         old_vocab = NEURO_NODE.tokenizer.current_vocab_size if NEURO_NODE.tokenizer else 0
+                         NEURO_NODE._load_learned_tokenizer()
+                         new_vocab = NEURO_NODE.tokenizer.current_vocab_size if NEURO_NODE.tokenizer else 0
+                         if new_vocab > old_vocab:
+                             logger.info(f"[TOKENIZER] Vocab updated: {old_vocab:,} → {new_vocab:,} tokens")
+                 except Exception as e:
+                     logger.debug(f"[TOKENIZER] Refresh check failed: {e}")
+
+             # RESOURCE-AWARE SLEEP: Adjust based on system load
+             # This ensures we're a good citizen when running in the background
+             try:
+                 current_cpu = psutil.cpu_percent(interval=None)  # Non-blocking
+                 current_mem = psutil.virtual_memory().percent
+
+                 # If system is under heavy load (not from us), back off
+                 if current_cpu > 90 or current_mem > 90:
+                     time.sleep(5)  # Back off significantly if system is stressed
+                     continue
+
+                 # Dynamic sleep based on activity and user's CPU setting
+                 if training_in_progress:
+                     time.sleep(0.1)  # Fast loop during active training
+                 else:
+                     # Check if data is likely ready (quick check without blocking)
+                     likely_data_ready = False
+                     if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
+                         try:
+                             loader = NEURO_NODE.genesis_loader
+                             # Quick non-locking check - just look at dict sizes
+                             likely_data_ready = bool(loader._prefetch_ready or loader.loaded_shards or loader.current_dataset is not None)
+                         except Exception:
+                             pass
+
+                     if likely_data_ready:
+                         # Data might be ready - use shorter interval
+                         time.sleep(min_interval_between_steps * 0.5)
+                     else:
+                         time.sleep(1)  # Slower loop when idle/waiting
+             except Exception:
+                 time.sleep(1)  # Fallback if psutil fails
+
+     threading.Thread(target=background_tasks, daemon=True).start()
+
+ # DRIVER WORKER LOOP: Poll marketplace AND process requests
2925
+ def driver_worker_loop():
2926
+ """
2927
+ PRODUCTION-READY Driver Worker Loop
2928
+
2929
+ 1. Polls marketplace for pending requests
2930
+ 2. Claims requests assigned to this driver
2931
+ 3. Waits for encrypted prompt from user
2932
+ 4. Processes inference through distributed pipeline
2933
+ 5. Submits PoNW proof for rewards
2934
+ """
2935
+ import time
2936
+
2937
+ # Check if this node is a driver
2938
+ is_driver = NEURO_NODE and NEURO_NODE.model.has_embedding
2939
+
2940
+ if not is_driver:
2941
+ logger.info("[DRIVER] Not a driver node - skipping marketplace worker loop")
2942
+ return
2943
+
2944
+ logger.info("[DRIVER] Starting PRODUCTION marketplace worker loop...")
2945
+ logger.info(f"[DRIVER] Will poll for requests assigned to: {NEURO_NODE.node_id[:16]}...")
2946
+
2947
+ # Import encrypted prompt handling
2948
+ from neuroshard.core.network.encrypted_channel import PromptEncryption, PromptQueue
2949
+
2950
+ prompt_queue = PromptQueue()
2951
+
2952
+ # Store in node for API access
2953
+ NEURO_NODE.prompt_queue = prompt_queue
2954
+
2955
+ last_claim_attempt = 0
2956
+ processing_requests = {} # request_id -> asyncio.Task
2957
+
2958
+         def process_request(request_id: str):
+             """Process a single inference request using existing distributed inference."""
+             try:
+                 # Get marketplace request for parameters
+                 market = P2P.ledger.inference_market
+
+                 market_request = market.get_request(request_id)
+                 if not market_request:
+                     logger.warning(f"[DRIVER] ✗ Request {request_id[:8]}... not found in marketplace")
+                     return
+
+                 # Get encrypted prompt
+                 encrypted_prompt = prompt_queue.get_prompt(request_id)
+
+                 if not encrypted_prompt:
+                     logger.warning(f"[DRIVER] ✗ No prompt found for {request_id[:8]}...")
+                     return
+
+                 # Decrypt prompt
+                 try:
+                     prompt_text = PromptEncryption.decrypt_prompt(
+                         encrypted_prompt.encrypted_data,
+                         request_id
+                     )
+                     logger.info(f"[DRIVER] ✓ Decrypted prompt: '{prompt_text[:50]}...'")
+                 except Exception as e:
+                     logger.error(f"[DRIVER] ✗ Failed to decrypt prompt: {e}")
+                     return
+
+                 # Process using EXISTING distributed inference
+                 try:
+                     output = NEURO_NODE.generate(
+                         prompt=prompt_text,
+                         max_new_tokens=market_request.tokens_requested,
+                         temperature=0.8
+                     )
+
+                     logger.info(f"[DRIVER] ✓ Generated: '{output[:100]}...'")
+                     logger.info(f"[DRIVER] ✓ Request {request_id[:8]}... completed")
+                     processing_requests[request_id] = "completed"
+
+                     # Store result in marketplace
+                     market.store_result(request_id, output)
+
+                 except Exception as e:
+                     logger.error(f"[DRIVER] ✗ Generation failed: {e}")
+                     import traceback
+                     traceback.print_exc()
+                     processing_requests[request_id] = "failed"
+
+             except Exception as e:
+                 logger.error(f"[DRIVER] ✗ Error processing {request_id[:8]}...: {e}")
+                 import traceback
+                 logger.error(traceback.format_exc())
+                 processing_requests[request_id] = "failed"
+
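+         # Assumption: market.store_result() is what makes the generated output
+         # retrievable by the requesting user through the marketplace;
+         # process_request() itself never returns the output directly.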
+         while not _SHUTDOWN_REQUESTED.is_set():
+             now = time.time()
+
+             # STEP 1: Poll marketplace for new requests (every 5 seconds)
+             if now - last_claim_attempt >= 5:
+                 try:
+                     market = P2P.ledger.inference_market
+
+                     # Try to claim a request
+                     request = market.claim_request(NEURO_NODE.node_id)
+
+                     if request:
+                         logger.info(f"[DRIVER] ✓ Claimed request {request.request_id[:8]}... "
+                                     f"({request.tokens_requested} tokens @ {request.locked_price:.6f} NEURO/1M)")
+
+                         # Start pipeline session
+                         market.start_pipeline_session(
+                             request_id=request.request_id,
+                             session_id=request.pipeline_session_id,
+                             driver_node_id=NEURO_NODE.node_id
+                         )
+
+                         # Check if we already have the prompt
+                         if prompt_queue.has_prompt(request.request_id):
+                             logger.info("[DRIVER] ✓ Prompt already received, processing immediately")
+                             # Process immediately
+                             process_request(request.request_id)
+                         else:
+                             logger.info("[DRIVER] Waiting for encrypted prompt from user...")
+                             logger.info(f"[DRIVER] User should POST to /api/driver/prompt/{request.request_id[:8]}...")
+                             processing_requests[request.request_id] = None  # Mark as waiting
+
+                 except Exception as e:
+                     if "not found" not in str(e).lower():
+                         logger.error(f"[DRIVER] Marketplace poll error: {e}")
+
+                 last_claim_attempt = now
+
+             # STEP 2: Check for prompts that arrived for waiting requests
+             for request_id in list(processing_requests.keys()):
+                 if processing_requests[request_id] is None:  # Waiting for prompt
+                     if prompt_queue.has_prompt(request_id):
+                         logger.info(f"[DRIVER] ✓ Prompt received for {request_id[:8]}..., starting processing")
+                         # Mark as processing BEFORE the call: process_request() runs
+                         # synchronously and sets the terminal "completed"/"failed"
+                         # state itself, which must not be overwritten afterwards.
+                         processing_requests[request_id] = "processing"
+                         # Process (uses existing distributed inference)
+                         process_request(request_id)
+
+             # STEP 3: Cleanup finished requests (both terminal states, so failed
+             # entries do not accumulate forever)
+             for request_id in list(processing_requests.keys()):
+                 if processing_requests[request_id] in ("completed", "failed"):
+                     del processing_requests[request_id]
+
+             # STEP 4: Cleanup old prompts
+             prompt_queue.cleanup_old_prompts()
+
+             time.sleep(1)  # Fast loop for responsiveness
+
+     # Start driver worker loop if this is a driver node
+     if NEURO_NODE and NEURO_NODE.model.has_embedding:
+         threading.Thread(target=driver_worker_loop, daemon=True).start()
+
+     # 6. Run HTTP Server
+     logger.info("=" * 50)
+     logger.info("NeuroShard Node Ready!")
+     logger.info(f"  Device: {NEURO_NODE.device.upper()}")
+     logger.info(f"  My Layers: {NEURO_NODE.my_layer_ids}")
+     logger.info(f"  My Params: {NEURO_NODE.model.get_num_params() / 1e6:.1f}M")
+     logger.info(f"  Embedding: {NEURO_NODE.model.has_embedding}")
+     logger.info(f"  LM Head: {NEURO_NODE.model.has_lm_head}")
+     logger.info(f"  Training: {'Enabled' if enable_training else 'Disabled'}")
+     logger.info(f"  DiLoCo: sync every {diloco_inner_steps} steps")
+     logger.info("=" * 50)
+     logger.info("TRULY DECENTRALIZED: Model grows with network capacity!")
+     logger.info("=" * 50)
+
+     # Custom log config: disable access logs and customize startup messages
+     # Handle Windows GUI mode where stdout may be None
+     if sys.stdout is not None and hasattr(sys.stdout, 'write'):
+         log_config = {
+             "version": 1,
+             "disable_existing_loggers": False,
+             "formatters": {
+                 "default": {"format": "[NODE] %(message)s"},
+             },
+             "handlers": {
+                 "default": {
+                     "formatter": "default",
+                     "class": "logging.StreamHandler",
+                     "stream": "ext://sys.stdout",
+                 },
+             },
+             "loggers": {
+                 # Suppress uvicorn's default startup messages (including "Press CTRL+C")
+                 "uvicorn": {"handlers": ["default"], "level": "WARNING", "propagate": False},
+                 "uvicorn.error": {"handlers": ["default"], "level": "WARNING", "propagate": False},
+                 "uvicorn.access": {"handlers": [], "level": "CRITICAL", "propagate": False},
+             },
+         }
+     else:
+         # Fallback to file logging when stdout is unavailable (Windows frozen GUI)
+         log_dir = os.path.join(os.path.expanduser("~"), ".neuroshard")
+         os.makedirs(log_dir, exist_ok=True)  # Ensure the log directory exists
+         log_file = os.path.join(log_dir, "uvicorn.log")
+         log_config = {
+             "version": 1,
+             "disable_existing_loggers": False,
+             "formatters": {
+                 "default": {"format": "[NODE] %(message)s"},
+             },
+             "handlers": {
+                 "default": {
+                     "formatter": "default",
+                     "class": "logging.handlers.RotatingFileHandler",
+                     "filename": log_file,
+                     "maxBytes": 5 * 1024 * 1024,
+                     "backupCount": 2,
+                     "encoding": "utf-8",
+                 },
+             },
+             "loggers": {
+                 # Suppress uvicorn's default startup messages
+                 "uvicorn": {"handlers": ["default"], "level": "WARNING", "propagate": False},
+                 "uvicorn.error": {"handlers": ["default"], "level": "WARNING", "propagate": False},
+                 "uvicorn.access": {"handlers": [], "level": "CRITICAL", "propagate": False},
+             },
+         }
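+     # Note: uvicorn accepts a dict for log_config and applies it via
+     # logging.config.dictConfig() (schema version 1), so the same structure
+     # works for both the stdout and rotating-file variants above.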
+
+     # Use Server object so we can stop it from outside (GUI shutdown)
+     global _UVICORN_SERVER
+     config = uvicorn.Config(node_app, host="0.0.0.0", port=port, log_config=log_config)
+     _UVICORN_SERVER = uvicorn.Server(config)
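+     # A stop can be requested from another thread by setting
+     # _UVICORN_SERVER.should_exit = True; run() returns once uvicorn notices the
+     # flag on its next loop tick (presumably what the GUI shutdown path does).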
+
+     # Print our own clean startup message (without "Press CTRL+C")
+     logger.info(f"[NODE] HTTP server started on port {port}")
+
+     _UVICORN_SERVER.run()
+
+
+ def main():
+     import signal
+     import atexit
+
+     # Register signal handlers for graceful shutdown
+     def _signal_handler(signum, frame):
+         logger.info(f"[NODE] Received signal {signum}, initiating graceful shutdown...")
+         request_shutdown()
+         sys.exit(0)
+
+     # Handle Ctrl+C (SIGINT) and SIGTERM
+     signal.signal(signal.SIGINT, _signal_handler)
+     signal.signal(signal.SIGTERM, _signal_handler)
+
+     # Also register an atexit handler as a backup
+     atexit.register(lambda: request_shutdown() if NEURO_NODE else None)
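+     # Note: sys.exit(0) in the handler raises SystemExit, which also fires the
+     # atexit hook above, so request_shutdown() may run twice; it is assumed to
+     # be idempotent (presumably it just sets the _SHUTDOWN_REQUESTED event).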
+
+     parser = argparse.ArgumentParser(description="NeuroShard Node Runner - Truly Decentralized LLM")
+     parser.add_argument("--port", type=int, default=8000)
+     parser.add_argument("--tracker", type=str, default="https://neuroshard.com/api/tracker")
+     parser.add_argument("--token", type=str, default=None,
+                         help="Node token OR 12-word mnemonic phrase for wallet access")
+     parser.add_argument("--announce-ip", type=str, default=None, help="Force IP address to announce")
+     parser.add_argument("--announce-port", type=int, default=None, help="Force port to announce")
+     parser.add_argument("--no-training", action="store_true", help="Disable training (inference only)")
+     parser.add_argument("--memory", type=int, default=None,
+                         help="Override detected memory (MB) - for testing")
+     parser.add_argument("--max-storage", type=int, default=100,
+                         help="Max disk space for training data (MB)")
+     parser.add_argument("--cpu-threads", type=int, default=None,
+                         help="Max CPU threads to use")
+     parser.add_argument("--diloco-steps", type=int, default=500,
+                         help="DiLoCo inner steps before gradient sync (default: 500)")
+
+     args = parser.parse_args()
+
+     # Handle mnemonic input: if the token is 12 words, convert it to a token
+     node_token = args.token
+     if node_token:
+         words = node_token.strip().split()
+         if len(words) == 12:
+             # It's a BIP39 mnemonic - derive the token from it
+             try:
+                 from mnemonic import Mnemonic
+                 mnemo = Mnemonic("english")
+                 if mnemo.check(node_token):
+                     # Convert mnemonic to deterministic token
+                     seed = mnemo.to_seed(node_token, passphrase="")
+                     node_token = seed[:32].hex()  # Use first 32 bytes as token
+                     logger.info("✅ Wallet recovered from mnemonic")
+                 else:
+                     logger.warning("⚠️ Invalid mnemonic phrase - treating as raw token")
+             except ImportError:
+                 logger.warning("⚠️ 'mnemonic' package not installed - treating as raw token")
+             except Exception as e:
+                 logger.warning(f"⚠️ Mnemonic error: {e} - treating as raw token")
+
+     run_node(
+         port=args.port,
+         tracker=args.tracker,
+         node_token=node_token,
+         announce_ip=args.announce_ip,
+         announce_port=args.announce_port,
+         enable_training=not args.no_training,
+         available_memory_mb=args.memory,
+         max_storage_mb=args.max_storage,
+         max_cpu_threads=args.cpu_threads,
+         diloco_inner_steps=args.diloco_steps,
+     )
+
+
+ if __name__ == "__main__":
+     main()