nexaroa-0.0.111-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/runner.py
ADDED
|
@@ -0,0 +1,3223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
NeuroShard Node Runner
|
|
3
|
+
|
|
4
|
+
This is the main entry point for running a NeuroShard node.
|
|
5
|
+
The node participates in:
|
|
6
|
+
1. Training NeuroLLM (our own model, trained from scratch by the network)
|
|
7
|
+
2. Inference (generating text using the collective model)
|
|
8
|
+
3. Earning NEURO tokens through Proof of Neural Work
|
|
9
|
+
|
|
10
|
+
TRULY DECENTRALIZED:
|
|
11
|
+
- No fixed model phases
|
|
12
|
+
- Model size grows with network capacity
|
|
13
|
+
- Each node contributes based on available memory
|
|
14
|
+
- More memory = more layers = more NEURO rewards
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import uvicorn
|
|
19
|
+
import threading
|
|
20
|
+
import torch # Imported early for API endpoints
|
|
21
|
+
import time
|
|
22
|
+
import requests
|
|
23
|
+
import logging
|
|
24
|
+
import logging.handlers # For RotatingFileHandler
|
|
25
|
+
import sys
|
|
26
|
+
import os
|
|
27
|
+
import socket
|
|
28
|
+
import uuid
|
|
29
|
+
import hashlib
|
|
30
|
+
import math
|
|
31
|
+
from fastapi import FastAPI, Request, HTTPException
|
|
32
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
33
|
+
from fastapi.responses import JSONResponse
|
|
34
|
+
from pydantic import BaseModel
|
|
35
|
+
from typing import Optional, List
|
|
36
|
+
|
|
37
|
+
from neuroshard.core.model.dynamic import DynamicNeuroNode, create_dynamic_node
|
|
38
|
+
from neuroshard.core.model.tokenizer import get_neuro_tokenizer
|
|
39
|
+
from neuroshard.core.network.p2p import P2PManager
|
|
40
|
+
|
|
41
|
+
# Swarm Architecture Imports (Phase 4)
|
|
42
|
+
try:
|
|
43
|
+
from neuroshard.core.swarm.factory import (
|
|
44
|
+
SwarmEnabledDynamicNode,
|
|
45
|
+
SwarmNodeConfig,
|
|
46
|
+
create_swarm_node,
|
|
47
|
+
create_swarm_node_with_p2p,
|
|
48
|
+
)
|
|
49
|
+
SWARM_AVAILABLE = True
|
|
50
|
+
except ImportError:
|
|
51
|
+
SWARM_AVAILABLE = False
|
|
52
|
+
from neuroshard.core.economics.constants import (
|
|
53
|
+
is_valid_stake_amount,
|
|
54
|
+
is_valid_stake_duration,
|
|
55
|
+
VALIDATOR_BASE_STAKE,
|
|
56
|
+
get_dynamic_validator_stake,
|
|
57
|
+
get_validator_stake_info,
|
|
58
|
+
)
|
|
59
|
+
from neuroshard.ui.app import STATE, templates
|
|
60
|
+
from neuroshard.utils.serialization import deserialize_tensor, serialize_tensor
|
|
61
|
+
from neuroshard.grpc_server import start_grpc_background
|
|
62
|
+
from neuroshard.version import __version__
|
|
63
|
+
|
|
64
|
+
# Safe print for Windows frozen GUI mode (where stdout may be None)
|
|
65
|
+
_original_print = print
|
|
66
|
+
|
|
67
|
+
def _safe_print(*args, **kwargs):
|
|
68
|
+
"""Print that works even when stdout is None (Windows GUI)."""
|
|
69
|
+
try:
|
|
70
|
+
if sys.stdout is not None:
|
|
71
|
+
_original_print(*args, **kwargs)
|
|
72
|
+
except (AttributeError, OSError, ValueError):
|
|
73
|
+
pass # Silently ignore - logging will capture it
|
|
74
|
+
|
|
75
|
+
# Override print globally in this module
|
|
76
|
+
print = _safe_print
|
|
77
|
+
|
|
78
|
+
# Global shutdown flag for clean exit from GUI
|
|
79
|
+
_SHUTDOWN_REQUESTED = threading.Event()
|
|
80
|
+
_UVICORN_SERVER = None # Global reference to uvicorn server for shutdown
|
|
81
|
+
|
|
82
|
+
def request_shutdown():
|
|
83
|
+
"""Request graceful shutdown of the node. Called from GUI when stopping."""
|
|
84
|
+
global _UVICORN_SERVER, NEURO_NODE, P2P
|
|
85
|
+
logger.info("[NODE] Shutdown requested...")
|
|
86
|
+
_SHUTDOWN_REQUESTED.set()
|
|
87
|
+
|
|
88
|
+
# Stop gRPC server first (releases port)
|
|
89
|
+
try:
|
|
90
|
+
from neuroshard.grpc_server import stop_grpc
|
|
91
|
+
stop_grpc(timeout=3.0)
|
|
92
|
+
except Exception as e:
|
|
93
|
+
logger.error(f"[NODE] gRPC shutdown error: {e}")
|
|
94
|
+
|
|
95
|
+
# Stop the node first (sets is_running = False)
|
|
96
|
+
if NEURO_NODE:
|
|
97
|
+
try:
|
|
98
|
+
logger.info("[NODE] Stopping node...")
|
|
99
|
+
# Get base node for SwarmEnabledDynamicNode
|
|
100
|
+
base = getattr(NEURO_NODE, 'base_node', NEURO_NODE)
|
|
101
|
+
if hasattr(base, 'stop'):
|
|
102
|
+
base.stop()
|
|
103
|
+
if hasattr(NEURO_NODE, 'stop') and NEURO_NODE != base:
|
|
104
|
+
NEURO_NODE.stop()
|
|
105
|
+
except Exception as e:
|
|
106
|
+
logger.error(f"[NODE] Node stop error: {e}")
|
|
107
|
+
|
|
108
|
+
# Stop swarm components if enabled
|
|
109
|
+
if NEURO_NODE and hasattr(NEURO_NODE, 'stop_swarm_sync'):
|
|
110
|
+
try:
|
|
111
|
+
logger.info("[NODE] Stopping swarm components...")
|
|
112
|
+
NEURO_NODE.stop_swarm_sync()
|
|
113
|
+
logger.info("[NODE] Swarm components stopped.")
|
|
114
|
+
except Exception as e:
|
|
115
|
+
logger.error(f"[NODE] Swarm shutdown error: {e}")
|
|
116
|
+
|
|
117
|
+
# Save checkpoint before shutting down
|
|
118
|
+
if NEURO_NODE:
|
|
119
|
+
try:
|
|
120
|
+
logger.info("[NODE] Saving checkpoint before shutdown...")
|
|
121
|
+
# Force synchronous save during shutdown to ensure it completes
|
|
122
|
+
NEURO_NODE._save_checkpoint(async_save=False)
|
|
123
|
+
logger.info("[NODE] Checkpoint saved.")
|
|
124
|
+
except Exception as e:
|
|
125
|
+
logger.error(f"[NODE] Failed to save checkpoint: {e}")
|
|
126
|
+
|
|
127
|
+
# Wait for any ongoing async saves to complete
|
|
128
|
+
try:
|
|
129
|
+
from neuroshard.core.model.dynamic import DynamicNeuroNode
|
|
130
|
+
# Try to acquire the lock (will wait if async save in progress)
|
|
131
|
+
if DynamicNeuroNode._checkpoint_save_lock.acquire(timeout=30):
|
|
132
|
+
DynamicNeuroNode._checkpoint_save_lock.release()
|
|
133
|
+
logger.info("[NODE] All checkpoint saves completed.")
|
|
134
|
+
except Exception as e:
|
|
135
|
+
logger.warning(f"[NODE] Could not wait for checkpoint save: {e}")
|
|
136
|
+
|
|
137
|
+
# CRITICAL: Free memory by deleting model and data
|
|
138
|
+
try:
|
|
139
|
+
logger.info("[NODE] Freeing memory...")
|
|
140
|
+
|
|
141
|
+
# Clear genesis data
|
|
142
|
+
if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
|
|
143
|
+
NEURO_NODE.genesis_loader.loaded_shards.clear()
|
|
144
|
+
NEURO_NODE.genesis_loader._prefetch_ready.clear()
|
|
145
|
+
NEURO_NODE.genesis_loader.current_dataset = None
|
|
146
|
+
|
|
147
|
+
# Get base node (for SwarmEnabledDynamicNode) or use directly
|
|
148
|
+
base = getattr(NEURO_NODE, 'base_node', NEURO_NODE)
|
|
149
|
+
|
|
150
|
+
# Delete optimizer (holds 2x model params in memory for Adam)
|
|
151
|
+
if hasattr(base, 'optimizer') and base.optimizer is not None:
|
|
152
|
+
del base.optimizer
|
|
153
|
+
|
|
154
|
+
# Delete model
|
|
155
|
+
if hasattr(base, 'model') and base.model is not None:
|
|
156
|
+
del base.model
|
|
157
|
+
|
|
158
|
+
# Force garbage collection
|
|
159
|
+
import gc
|
|
160
|
+
gc.collect()
|
|
161
|
+
|
|
162
|
+
# Clear GPU cache if applicable
|
|
163
|
+
if torch.cuda.is_available():
|
|
164
|
+
torch.cuda.empty_cache()
|
|
165
|
+
logger.info("[NODE] Cleared CUDA cache")
|
|
166
|
+
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
|
|
167
|
+
torch.mps.empty_cache()
|
|
168
|
+
logger.info("[NODE] Cleared MPS cache")
|
|
169
|
+
|
|
170
|
+
logger.info("[NODE] Memory freed")
|
|
171
|
+
except Exception as e:
|
|
172
|
+
logger.error(f"[NODE] Memory cleanup error: {e}")
|
|
173
|
+
|
|
174
|
+
# Stop P2P manager (stops background threads)
|
|
175
|
+
if P2P:
|
|
176
|
+
try:
|
|
177
|
+
P2P.stop()
|
|
178
|
+
except Exception as e:
|
|
179
|
+
logger.error(f"[NODE] P2P stop error: {e}")
|
|
180
|
+
|
|
181
|
+
# Stop uvicorn server
|
|
182
|
+
if _UVICORN_SERVER:
|
|
183
|
+
logger.info("[NODE] Stopping HTTP server...")
|
|
184
|
+
_UVICORN_SERVER.should_exit = True
|
|
185
|
+
|
|
186
|
+
# FORCE EXIT: Always force exit after 3 seconds regardless
|
|
187
|
+
# This handles nohup, daemon, and any other run mode
|
|
188
|
+
def force_exit():
|
|
189
|
+
import time as t_module
|
|
190
|
+
import os
|
|
191
|
+
import signal
|
|
192
|
+
t_module.sleep(3.0)
|
|
193
|
+
logger.warning("[NODE] Force exiting (server didn't stop gracefully)...")
|
|
194
|
+
# Try SIGTERM first (graceful)
|
|
195
|
+
try:
|
|
196
|
+
os.kill(os.getpid(), signal.SIGTERM)
|
|
197
|
+
except Exception:
|
|
198
|
+
pass
|
|
199
|
+
t_module.sleep(0.5)
|
|
200
|
+
# If still running, force kill
|
|
201
|
+
logger.warning("[NODE] Forcing process termination...")
|
|
202
|
+
os._exit(0) # Force exit without cleanup
|
|
203
|
+
|
|
204
|
+
# Use non-daemon thread to ensure force_exit runs to completion
|
|
205
|
+
force_thread = threading.Thread(target=force_exit, daemon=False)
|
|
206
|
+
force_thread.start()
|
|
207
|
+
logger.info("[NODE] Force exit scheduled in 3 seconds...")
|
|
208
|
+
|
|
209
|
+
# Reset globals so next run starts fresh
|
|
210
|
+
NEURO_NODE = None
|
|
211
|
+
P2P = None
|
|
212
|
+
_UVICORN_SERVER = None
|
|
213
|
+
|
|
214
|
+
# Configure Logging - ensure all loggers use our format
|
|
215
|
+
# Clear any existing handlers first to prevent duplicates
|
|
216
|
+
root_logger = logging.getLogger()
|
|
217
|
+
root_logger.handlers = [] # Clear existing handlers
|
|
218
|
+
root_logger.setLevel(logging.INFO)
|
|
219
|
+
|
|
220
|
+
# --- In-memory log buffer for dashboard ---
|
|
221
|
+
from collections import deque
|
|
222
|
+
from datetime import datetime
|
|
223
|
+
|
|
224
|
+
# Circular buffer to store recent logs (max 500 entries)
|
|
225
|
+
_LOG_BUFFER = deque(maxlen=500)
|
|
226
|
+
_LOG_BUFFER_LOCK = threading.Lock()
|
|
227
|
+
|
|
228
|
+
class MemoryLogHandler(logging.Handler):
|
|
229
|
+
"""Custom handler that stores logs in memory for dashboard API."""
|
|
230
|
+
|
|
231
|
+
# Auto-incrementing log ID for reliable polling
|
|
232
|
+
_log_id_counter = 0
|
|
233
|
+
|
|
234
|
+
def emit(self, record):
|
|
235
|
+
try:
|
|
236
|
+
msg = self.format(record)
|
|
237
|
+
# Store both display timestamp and epoch for sorting
|
|
238
|
+
epoch_ms = int(record.created * 1000)
|
|
239
|
+
timestamp = datetime.fromtimestamp(record.created).strftime('%H:%M:%S')
|
|
240
|
+
|
|
241
|
+
# Determine log type for filtering
|
|
242
|
+
log_type = 'info'
|
|
243
|
+
msg_lower = msg.lower()
|
|
244
|
+
if 'neuro' in msg_lower and ('earned' in msg_lower or 'reward' in msg_lower or '+' in msg):
|
|
245
|
+
log_type = 'neuro'
|
|
246
|
+
elif 'error' in msg_lower or record.levelno >= logging.ERROR:
|
|
247
|
+
log_type = 'error'
|
|
248
|
+
elif 'training' in msg_lower or 'diloco' in msg_lower or 'gradient' in msg_lower or 'batch' in msg_lower:
|
|
249
|
+
log_type = 'training'
|
|
250
|
+
elif record.levelno >= logging.WARNING:
|
|
251
|
+
log_type = 'warning'
|
|
252
|
+
|
|
253
|
+
with _LOG_BUFFER_LOCK:
|
|
254
|
+
MemoryLogHandler._log_id_counter += 1
|
|
255
|
+
_LOG_BUFFER.append({
|
|
256
|
+
'id': MemoryLogHandler._log_id_counter,
|
|
257
|
+
'epoch': epoch_ms,
|
|
258
|
+
'timestamp': timestamp,
|
|
259
|
+
'message': msg,
|
|
260
|
+
'type': log_type,
|
|
261
|
+
'level': record.levelname,
|
|
262
|
+
})
|
|
263
|
+
except Exception:
|
|
264
|
+
pass # Never fail logging
|
|
265
|
+
|
|
266
|
+
# Windows GUI apps (frozen) may have None stdout/stderr
|
|
267
|
+
# Create a safe handler that won't crash
|
|
268
|
+
def _create_safe_handler():
|
|
269
|
+
"""Create a logging handler that works even when stdout is None (Windows GUI)."""
|
|
270
|
+
# Check if stdout is usable
|
|
271
|
+
if sys.stdout is not None and hasattr(sys.stdout, 'write'):
|
|
272
|
+
try:
|
|
273
|
+
# Test if it's actually writable
|
|
274
|
+
sys.stdout.write('')
|
|
275
|
+
sys.stdout.flush()
|
|
276
|
+
return logging.StreamHandler(sys.stdout)
|
|
277
|
+
except (AttributeError, OSError, ValueError):
|
|
278
|
+
pass
|
|
279
|
+
|
|
280
|
+
# Fallback: log to file in .neuroshard directory
|
|
281
|
+
log_dir = os.path.join(os.path.expanduser("~"), ".neuroshard")
|
|
282
|
+
os.makedirs(log_dir, exist_ok=True)
|
|
283
|
+
log_file = os.path.join(log_dir, "node.log")
|
|
284
|
+
|
|
285
|
+
# Rotate logs - keep last 5MB
|
|
286
|
+
return logging.handlers.RotatingFileHandler(log_file, maxBytes=5*1024*1024, backupCount=2, encoding='utf-8')
|
|
287
|
+
|
|
288
|
+
handler = _create_safe_handler()
|
|
289
|
+
handler.setFormatter(logging.Formatter('[NODE] %(message)s'))
|
|
290
|
+
root_logger.addHandler(handler)
|
|
291
|
+
|
|
292
|
+
# Add memory handler for dashboard logs API
|
|
293
|
+
memory_handler = MemoryLogHandler()
|
|
294
|
+
memory_handler.setFormatter(logging.Formatter('%(message)s'))
|
|
295
|
+
memory_handler.setLevel(logging.INFO)
|
|
296
|
+
root_logger.addHandler(memory_handler)
|
|
297
|
+
|
|
298
|
+
# Also configure neuroshard module loggers explicitly
|
|
299
|
+
for module in ['neuroshard.core.p2p', 'neuroshard.core.ledger', 'neuroshard.core.dynamic_model',
|
|
300
|
+
'neuroshard.core.distributed_training', 'neuroshard.core.dht_service']:
|
|
301
|
+
module_logger = logging.getLogger(module)
|
|
302
|
+
module_logger.setLevel(logging.INFO)
|
|
303
|
+
module_logger.propagate = True # Propagate to root logger
|
|
304
|
+
|
|
305
|
+
# Create logger for this module
|
|
306
|
+
logger = logging.getLogger(__name__)
|
|
307
|
+
|
|
308
|
+
# --- Main API App ---
|
|
309
|
+
node_app = FastAPI(title="NeuroShard Node", version=__version__)
|
|
310
|
+
# Serve dashboard at root
|
|
311
|
+
from fastapi.responses import HTMLResponse
|
|
312
|
+
|
|
313
|
+
@node_app.get("/", response_class=HTMLResponse)
|
|
314
|
+
async def serve_dashboard(request: Request):
|
|
315
|
+
"""Serve the main dashboard at root."""
|
|
316
|
+
return templates.TemplateResponse("index.html", {"request": request})
|
|
317
|
+
|
|
318
|
+
# Shared State
|
|
319
|
+
NEURO_NODE: Optional[DynamicNeuroNode] = None
|
|
320
|
+
P2P: Optional[P2PManager] = None
|
|
321
|
+
SESSION_TIMESTAMPS = {}
|
|
322
|
+
|
|
323
|
+
def get_app():
|
|
324
|
+
return node_app
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
class InferenceRequest(BaseModel):
|
|
328
|
+
tensor_data: str
|
|
329
|
+
request_id: str
|
|
330
|
+
session_id: Optional[str] = None
|
|
331
|
+
sender_reputation: float = 100.0
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
class TextGenerationRequest(BaseModel):
|
|
335
|
+
prompt: str
|
|
336
|
+
max_new_tokens: int = 50
|
|
337
|
+
temperature: float = 1.0
|
|
338
|
+
top_k: int = 50
|
|
339
|
+
top_p: float = 0.9
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
class TrainingDataRequest(BaseModel):
|
|
343
|
+
text: str
|
|
344
|
+
apply_dp: bool = True # Apply differential privacy
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
# ==================== INFERENCE ENDPOINTS ====================
|
|
348
|
+
|
|
349
|
+
@node_app.post("/generate_text")
|
|
350
|
+
async def generate_text(req: TextGenerationRequest):
|
|
351
|
+
"""
|
|
352
|
+
Generate text using NeuroLLM.
|
|
353
|
+
|
|
354
|
+
Note: Early in the network's life, output will be mostly random.
|
|
355
|
+
As more users train the model, quality will improve!
|
|
356
|
+
"""
|
|
357
|
+
if not NEURO_NODE:
|
|
358
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
359
|
+
|
|
360
|
+
try:
|
|
361
|
+
generated = NEURO_NODE.generate(
|
|
362
|
+
prompt=req.prompt,
|
|
363
|
+
max_new_tokens=req.max_new_tokens,
|
|
364
|
+
temperature=req.temperature,
|
|
365
|
+
)
|
|
366
|
+
|
|
367
|
+
STATE["processed_count"] = STATE.get("processed_count", 0) + 1
|
|
368
|
+
STATE["token_count"] = NEURO_NODE.total_tokens_processed
|
|
369
|
+
|
|
370
|
+
return {
|
|
371
|
+
"text": generated,
|
|
372
|
+
"my_layers": NEURO_NODE.my_layer_ids,
|
|
373
|
+
"total_training_rounds": NEURO_NODE.total_training_rounds,
|
|
374
|
+
"note": "Quality improves as more users train the model!"
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
except Exception as e:
|
|
378
|
+
return {"error": str(e)}
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
@node_app.post("/forward")
|
|
382
|
+
async def forward(req: InferenceRequest):
|
|
383
|
+
"""Forward pass for distributed inference pipeline."""
|
|
384
|
+
if not NEURO_NODE:
|
|
385
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
386
|
+
|
|
387
|
+
try:
|
|
388
|
+
STATE["processed_count"] = STATE.get("processed_count", 0) + 1
|
|
389
|
+
|
|
390
|
+
if req.session_id:
|
|
391
|
+
SESSION_TIMESTAMPS[req.session_id] = time.time()
|
|
392
|
+
|
|
393
|
+
# Deserialize input
|
|
394
|
+
input_tensor = deserialize_tensor(req.tensor_data)
|
|
395
|
+
|
|
396
|
+
# Forward through NeuroLLM
|
|
397
|
+
output = NEURO_NODE.forward(input_tensor, session_id=req.session_id)
|
|
398
|
+
|
|
399
|
+
# Update token count
|
|
400
|
+
STATE["token_count"] = NEURO_NODE.total_tokens_processed
|
|
401
|
+
|
|
402
|
+
# Return result (NeuroLLM is always a full model, no pipeline needed)
|
|
403
|
+
return {"result": serialize_tensor(output)}
|
|
404
|
+
|
|
405
|
+
except Exception as e:
|
|
406
|
+
return {"error": str(e)}
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
# ==================== TRAINING ENDPOINTS ====================
|
|
410
|
+
|
|
411
|
+
@node_app.post("/contribute_data")
|
|
412
|
+
async def contribute_training_data(req: TrainingDataRequest):
|
|
413
|
+
"""
|
|
414
|
+
Contribute training data to help train NeuroLLM.
|
|
415
|
+
|
|
416
|
+
Your data is processed locally with differential privacy.
|
|
417
|
+
You earn NEURO tokens for contributing!
|
|
418
|
+
"""
|
|
419
|
+
if not NEURO_NODE:
|
|
420
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
421
|
+
|
|
422
|
+
if not NEURO_NODE.enable_training:
|
|
423
|
+
raise HTTPException(status_code=400, detail="Training not enabled on this node")
|
|
424
|
+
|
|
425
|
+
try:
|
|
426
|
+
tokens_added = NEURO_NODE.contribute_training_data(req.text, apply_dp=req.apply_dp)
|
|
427
|
+
|
|
428
|
+
data_stats = NEURO_NODE.data_manager.get_stats() if NEURO_NODE.data_manager else {}
|
|
429
|
+
|
|
430
|
+
return {
|
|
431
|
+
"success": True,
|
|
432
|
+
"message": "Data added to training buffer",
|
|
433
|
+
"tokens_added": tokens_added or 0,
|
|
434
|
+
"buffer_size": data_stats.get("buffer_size", 0),
|
|
435
|
+
"total_tokens": data_stats.get("total_tokens", 0),
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
except Exception as e:
|
|
439
|
+
return {"error": str(e)}
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
@node_app.post("/train_step")
|
|
443
|
+
async def trigger_train_step():
|
|
444
|
+
"""Manually trigger a training step (for testing)."""
|
|
445
|
+
if not NEURO_NODE:
|
|
446
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
447
|
+
|
|
448
|
+
loss = NEURO_NODE.train_step()
|
|
449
|
+
|
|
450
|
+
if loss is None:
|
|
451
|
+
return {"success": False, "message": "Not enough training data in buffer"}
|
|
452
|
+
|
|
453
|
+
return {
|
|
454
|
+
"success": True,
|
|
455
|
+
"loss": loss,
|
|
456
|
+
"total_training_rounds": NEURO_NODE.total_training_rounds
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
|
|
460
|
+
@node_app.get("/training_status")
|
|
461
|
+
async def get_training_status():
|
|
462
|
+
"""Get current training status."""
|
|
463
|
+
if not NEURO_NODE:
|
|
464
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
465
|
+
|
|
466
|
+
# Sanitize loss for JSON
|
|
467
|
+
current_loss = NEURO_NODE.current_loss
|
|
468
|
+
if math.isinf(current_loss) or math.isnan(current_loss):
|
|
469
|
+
current_loss = None
|
|
470
|
+
|
|
471
|
+
return {
|
|
472
|
+
"training_enabled": NEURO_NODE.enable_training,
|
|
473
|
+
"total_training_rounds": NEURO_NODE.total_training_rounds,
|
|
474
|
+
"current_loss": current_loss,
|
|
475
|
+
"training_contributions": NEURO_NODE.training_contribution_count,
|
|
476
|
+
"data_buffer": NEURO_NODE.data_manager.get_stats() if NEURO_NODE.data_manager else None,
|
|
477
|
+
"my_layers": NEURO_NODE.my_layer_ids,
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
@node_app.get("/api/training/global")
|
|
482
|
+
async def get_global_training_status():
|
|
483
|
+
"""
|
|
484
|
+
Get GLOBAL training verification status.
|
|
485
|
+
|
|
486
|
+
This endpoint answers the question: "Is the distributed training ACTUALLY working?"
|
|
487
|
+
|
|
488
|
+
Key metrics:
|
|
489
|
+
- training_verified: True if we can confirm the model is improving
|
|
490
|
+
- is_converging: True if the network appears to be converging
|
|
491
|
+
- hash_agreement_rate: % of nodes with the same model hash (should be 100%)
|
|
492
|
+
- global_avg_loss: Average loss across all network nodes
|
|
493
|
+
- sync_success_rate: % of gradient syncs that succeeded
|
|
494
|
+
|
|
495
|
+
If hash_agreement_rate < 100%, nodes have diverged and training is NOT coordinated!
|
|
496
|
+
"""
|
|
497
|
+
if not NEURO_NODE:
|
|
498
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
499
|
+
|
|
500
|
+
# Get global status from swarm-enabled node
|
|
501
|
+
if hasattr(NEURO_NODE, 'get_global_training_status'):
|
|
502
|
+
global_status = NEURO_NODE.get_global_training_status()
|
|
503
|
+
else:
|
|
504
|
+
# Fallback for non-swarm nodes
|
|
505
|
+
global_status = {
|
|
506
|
+
"error": "Node does not support global training tracking",
|
|
507
|
+
"training_verified": False,
|
|
508
|
+
"is_converging": False,
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
# Add local context (sanitize float values for JSON)
|
|
512
|
+
current_loss = NEURO_NODE.current_loss
|
|
513
|
+
if math.isinf(current_loss) or math.isnan(current_loss):
|
|
514
|
+
current_loss = None
|
|
515
|
+
|
|
516
|
+
# Get model hash from global tracker if available
|
|
517
|
+
model_hash = ""
|
|
518
|
+
if hasattr(NEURO_NODE, '_global_tracker') and NEURO_NODE._global_tracker:
|
|
519
|
+
local_status = NEURO_NODE._global_tracker.get_local_status()
|
|
520
|
+
model_hash = local_status.get('model_hash', '')
|
|
521
|
+
|
|
522
|
+
global_status["local"] = {
|
|
523
|
+
"node_id": NEURO_NODE.node_id[:16],
|
|
524
|
+
"training_rounds": NEURO_NODE.total_training_rounds,
|
|
525
|
+
"current_loss": current_loss,
|
|
526
|
+
"is_training": NEURO_NODE.enable_training,
|
|
527
|
+
"model_hash": model_hash,
|
|
528
|
+
}
|
|
529
|
+
|
|
530
|
+
# Add DiLoCo status if available
|
|
531
|
+
if hasattr(NEURO_NODE, 'get_diloco_progress'):
|
|
532
|
+
global_status["diloco"] = NEURO_NODE.get_diloco_progress()
|
|
533
|
+
|
|
534
|
+
return global_status
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
@node_app.get("/api/training/verify")
|
|
538
|
+
async def verify_training():
|
|
539
|
+
"""
|
|
540
|
+
Quick verification endpoint - answers: "Is training working?"
|
|
541
|
+
|
|
542
|
+
Returns a simple yes/no with explanation.
|
|
543
|
+
"""
|
|
544
|
+
if not NEURO_NODE:
|
|
545
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
546
|
+
|
|
547
|
+
if not NEURO_NODE.enable_training:
|
|
548
|
+
return {
|
|
549
|
+
"is_working": False,
|
|
550
|
+
"reason": "Training is not enabled on this node",
|
|
551
|
+
"action": "Start the node with --train flag",
|
|
552
|
+
}
|
|
553
|
+
|
|
554
|
+
# Check if we have enough training data
|
|
555
|
+
if NEURO_NODE.total_training_rounds < 10:
|
|
556
|
+
return {
|
|
557
|
+
"is_working": "insufficient_data",
|
|
558
|
+
"reason": f"Only {NEURO_NODE.total_training_rounds} training steps completed",
|
|
559
|
+
"action": "Wait for more training steps (need 10+ for verification)",
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
# Get global status
|
|
563
|
+
if hasattr(NEURO_NODE, 'get_global_training_status'):
|
|
564
|
+
global_status = NEURO_NODE.get_global_training_status()
|
|
565
|
+
|
|
566
|
+
is_working = global_status.get("training_verified", False)
|
|
567
|
+
is_converging = global_status.get("is_converging", False)
|
|
568
|
+
hash_agreement = global_status.get("hash_agreement_rate", 0)
|
|
569
|
+
|
|
570
|
+
if is_working and is_converging:
|
|
571
|
+
return {
|
|
572
|
+
"is_working": True,
|
|
573
|
+
"reason": "Training verified! Loss is decreasing and network is converging.",
|
|
574
|
+
"metrics": {
|
|
575
|
+
"loss_trend": global_status.get("loss_trend", "unknown"),
|
|
576
|
+
"hash_agreement": f"{hash_agreement*100:.1f}%",
|
|
577
|
+
"global_loss": global_status.get("global_avg_loss", 0),
|
|
578
|
+
},
|
|
579
|
+
}
|
|
580
|
+
elif not is_converging and hash_agreement < 0.5:
|
|
581
|
+
return {
|
|
582
|
+
"is_working": False,
|
|
583
|
+
"reason": f"Network NOT converging! Only {hash_agreement*100:.1f}% hash agreement.",
|
|
584
|
+
"action": "Nodes have diverged. Check gradient sync is working.",
|
|
585
|
+
}
|
|
586
|
+
else:
|
|
587
|
+
return {
|
|
588
|
+
"is_working": "partial",
|
|
589
|
+
"reason": "Training running but not yet verified as improving.",
|
|
590
|
+
"action": "Continue training - need more data for verification.",
|
|
591
|
+
}
|
|
592
|
+
|
|
593
|
+
# Fallback: check if loss is decreasing
|
|
594
|
+
loss = NEURO_NODE.current_loss
|
|
595
|
+
if loss < 1.0:
|
|
596
|
+
return {
|
|
597
|
+
"is_working": True,
|
|
598
|
+
"reason": f"Loss is {loss:.4f} which is reasonable for training.",
|
|
599
|
+
}
|
|
600
|
+
else:
|
|
601
|
+
return {
|
|
602
|
+
"is_working": "unknown",
|
|
603
|
+
"reason": "Cannot verify without global tracker.",
|
|
604
|
+
"action": "Check loss values in logs - should be decreasing.",
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
@node_app.get("/api/training/history")
|
|
609
|
+
async def get_local_training_history():
|
|
610
|
+
"""
|
|
611
|
+
Get LOCAL loss history to verify model is improving.
|
|
612
|
+
|
|
613
|
+
Returns loss checkpoints recorded during training.
|
|
614
|
+
Use this to see if YOUR node's training is working.
|
|
615
|
+
"""
|
|
616
|
+
if not NEURO_NODE:
|
|
617
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
618
|
+
|
|
619
|
+
result = {
|
|
620
|
+
"total_steps": NEURO_NODE.total_training_rounds,
|
|
621
|
+
"current_loss": NEURO_NODE.current_loss if NEURO_NODE.current_loss != float('inf') else None,
|
|
622
|
+
"loss_checkpoints": [],
|
|
623
|
+
"loss_trend": "unknown",
|
|
624
|
+
"improvement_percent": 0.0,
|
|
625
|
+
"training_verified": False,
|
|
626
|
+
"analysis": {},
|
|
627
|
+
}
|
|
628
|
+
|
|
629
|
+
# Get loss checkpoints from global tracker
|
|
630
|
+
if hasattr(NEURO_NODE, '_global_tracker') and NEURO_NODE._global_tracker:
|
|
631
|
+
tracker = NEURO_NODE._global_tracker
|
|
632
|
+
|
|
633
|
+
# Get loss checkpoints (list of (step, loss) tuples)
|
|
634
|
+
checkpoints = getattr(tracker, '_loss_checkpoints', [])
|
|
635
|
+
result["loss_checkpoints"] = [
|
|
636
|
+
{"step": step, "loss": round(loss, 4)}
|
|
637
|
+
for step, loss in checkpoints
|
|
638
|
+
]
|
|
639
|
+
|
|
640
|
+
# Analyze trend
|
|
641
|
+
if len(checkpoints) >= 5:
|
|
642
|
+
losses = [loss for _, loss in checkpoints]
|
|
643
|
+
|
|
644
|
+
# Compare first 20% to last 20%
|
|
645
|
+
n = len(losses)
|
|
646
|
+
first_n = max(1, n // 5)
|
|
647
|
+
first_avg = sum(losses[:first_n]) / first_n
|
|
648
|
+
last_avg = sum(losses[-first_n:]) / first_n
|
|
649
|
+
|
|
650
|
+
if first_avg > 0:
|
|
651
|
+
improvement = (first_avg - last_avg) / first_avg * 100
|
|
652
|
+
result["improvement_percent"] = round(improvement, 2)
|
|
653
|
+
|
|
654
|
+
if improvement > 10:
|
|
655
|
+
result["loss_trend"] = "improving_strongly"
|
|
656
|
+
result["training_verified"] = True
|
|
657
|
+
elif improvement > 2:
|
|
658
|
+
result["loss_trend"] = "improving"
|
|
659
|
+
result["training_verified"] = True
|
|
660
|
+
elif improvement > -2:
|
|
661
|
+
result["loss_trend"] = "stable"
|
|
662
|
+
result["training_verified"] = n > 20 # Stable after many steps = converged
|
|
663
|
+
elif improvement > -10:
|
|
664
|
+
result["loss_trend"] = "degrading_slightly"
|
|
665
|
+
else:
|
|
666
|
+
result["loss_trend"] = "degrading"
|
|
667
|
+
|
|
668
|
+
result["analysis"] = {
|
|
669
|
+
"data_points": n,
|
|
670
|
+
"first_avg_loss": round(first_avg, 4),
|
|
671
|
+
"last_avg_loss": round(last_avg, 4),
|
|
672
|
+
"min_loss_seen": round(min(losses), 4),
|
|
673
|
+
"max_loss_seen": round(max(losses), 4),
|
|
674
|
+
"expected_initial_loss": "~10-11 (random init for 50k vocab)",
|
|
675
|
+
"good_loss_range": "< 4.0 (perplexity < 55)",
|
|
676
|
+
"great_loss_range": "< 2.5 (perplexity < 12)",
|
|
677
|
+
}
|
|
678
|
+
else:
|
|
679
|
+
result["analysis"]["note"] = "Global tracker not initialized - restart node to enable"
|
|
680
|
+
|
|
681
|
+
return result
|
|
682
|
+
|
|
683
|
+
|
|
684
|
+
# ==================== STATS & PONW ENDPOINTS ====================
|
|
685
|
+
|
|
686
|
+
@node_app.get("/api/stats")
|
|
687
|
+
async def get_api_stats():
|
|
688
|
+
"""Endpoint for GUI to fetch local node stats."""
|
|
689
|
+
import math
|
|
690
|
+
import asyncio
|
|
691
|
+
import os
|
|
692
|
+
|
|
693
|
+
# Yield to event loop to ensure responsiveness
|
|
694
|
+
await asyncio.sleep(0)
|
|
695
|
+
|
|
696
|
+
# Get actual system resource usage
|
|
697
|
+
system_stats = {}
|
|
698
|
+
try:
|
|
699
|
+
import psutil
|
|
700
|
+
# CPU usage (system-wide percentage)
|
|
701
|
+
system_stats["cpu_percent"] = psutil.cpu_percent(interval=None) # Non-blocking
|
|
702
|
+
|
|
703
|
+
# Memory usage (system-wide)
|
|
704
|
+
mem = psutil.virtual_memory()
|
|
705
|
+
system_stats["ram_used_gb"] = round(mem.used / (1024**3), 2)
|
|
706
|
+
system_stats["ram_total_gb"] = round(mem.total / (1024**3), 2)
|
|
707
|
+
system_stats["ram_percent"] = mem.percent
|
|
708
|
+
|
|
709
|
+
# Process-specific memory
|
|
710
|
+
process = psutil.Process(os.getpid())
|
|
711
|
+
system_stats["process_ram_mb"] = round(process.memory_info().rss / (1024**2), 1)
|
|
712
|
+
except Exception:
|
|
713
|
+
pass
|
|
714
|
+
|
|
715
|
+
# Start with basic stats from STATE
|
|
716
|
+
stats = {
|
|
717
|
+
"peer_count": STATE.get("peer_count", 0),
|
|
718
|
+
"processed_count": STATE.get("processed_count", 0),
|
|
719
|
+
"training_status": STATE.get("training_status", "idle"),
|
|
720
|
+
# Actual system resource usage
|
|
721
|
+
"system": system_stats,
|
|
722
|
+
# Resource throttle info
|
|
723
|
+
"throttle": {
|
|
724
|
+
"cpu_ratio": STATE.get("throttle_cpu_ratio", 1.0),
|
|
725
|
+
"ram_ratio": STATE.get("throttle_ram_ratio", 1.0),
|
|
726
|
+
"effective": STATE.get("throttle_effective", 1.0),
|
|
727
|
+
"interval_seconds": STATE.get("throttle_interval", 2.0),
|
|
728
|
+
"max_steps_per_min": STATE.get("throttle_max_steps", 30),
|
|
729
|
+
},
|
|
730
|
+
}
|
|
731
|
+
|
|
732
|
+
if NEURO_NODE:
|
|
733
|
+
# Run get_stats in executor to not block event loop
|
|
734
|
+
loop = asyncio.get_event_loop()
|
|
735
|
+
node_stats = await loop.run_in_executor(None, NEURO_NODE.get_stats)
|
|
736
|
+
|
|
737
|
+
# Handle infinity values (not JSON serializable)
|
|
738
|
+
current_loss = node_stats.get("current_loss", float('inf'))
|
|
739
|
+
if math.isinf(current_loss) or math.isnan(current_loss):
|
|
740
|
+
current_loss = None # Use None for JSON compatibility
|
|
741
|
+
|
|
742
|
+
# Determine role string for display
|
|
743
|
+
has_embedding = node_stats.get("has_embedding", False)
|
|
744
|
+
has_lm_head = node_stats.get("has_lm_head", False)
|
|
745
|
+
if has_embedding and has_lm_head:
|
|
746
|
+
role = "Full Node (Driver + Validator)"
|
|
747
|
+
elif has_embedding:
|
|
748
|
+
role = "Driver"
|
|
749
|
+
elif has_lm_head:
|
|
750
|
+
role = "Validator"
|
|
751
|
+
else:
|
|
752
|
+
role = "Worker"
|
|
753
|
+
|
|
754
|
+
stats.update({
|
|
755
|
+
# My contribution
|
|
756
|
+
"my_layers": node_stats.get("my_layers", []),
|
|
757
|
+
"my_params_m": node_stats.get("my_params", 0) / 1e6,
|
|
758
|
+
"has_embedding": has_embedding,
|
|
759
|
+
"has_lm_head": has_lm_head,
|
|
760
|
+
"role": role,
|
|
761
|
+
"available_memory_mb": node_stats.get("available_memory_mb", 0),
|
|
762
|
+
"reward_multiplier": node_stats.get("reward_multiplier", 1.0),
|
|
763
|
+
|
|
764
|
+
# Network stats
|
|
765
|
+
"network_layers": node_stats.get("network_layers", 0),
|
|
766
|
+
"network_params_m": node_stats.get("network_params", 0) / 1e6,
|
|
767
|
+
"network_nodes": node_stats.get("network_nodes", 0),
|
|
768
|
+
"contribution_ratio": node_stats.get("contribution_ratio", 0),
|
|
769
|
+
|
|
770
|
+
# Training stats - use CUMULATIVE values from NEURO_NODE, not delta from STATE
|
|
771
|
+
"training_enabled": NEURO_NODE.enable_training,
|
|
772
|
+
"training_rounds": node_stats.get("total_training_rounds", 0),
|
|
773
|
+
"token_count": node_stats.get("total_tokens_processed", 0), # Cumulative tokens
|
|
774
|
+
"current_loss": current_loss,
|
|
775
|
+
"data_buffer_size": node_stats.get("data_buffer_size", 0),
|
|
776
|
+
|
|
777
|
+
# Data shard stats (if Driver)
|
|
778
|
+
"shard_stats": node_stats.get("shard_stats", {}),
|
|
779
|
+
|
|
780
|
+
# Device info
|
|
781
|
+
"device": NEURO_NODE.device,
|
|
782
|
+
|
|
783
|
+
# Instance info (for multi-node support)
|
|
784
|
+
"instance_id": getattr(NEURO_NODE, 'instance_id', None),
|
|
785
|
+
})
|
|
786
|
+
|
|
787
|
+
# Add DiLoCo progress
|
|
788
|
+
diloco = NEURO_NODE.get_diloco_progress()
|
|
789
|
+
if diloco.get("enabled", False):
|
|
790
|
+
stats["diloco"] = {
|
|
791
|
+
"inner_step": diloco.get("inner_step_count", 0),
|
|
792
|
+
"inner_total": diloco.get("inner_steps_total", 500),
|
|
793
|
+
"progress": diloco.get("progress", 0.0),
|
|
794
|
+
"outer_step": diloco.get("outer_step_count", 0),
|
|
795
|
+
}
|
|
796
|
+
else:
|
|
797
|
+
# Node not ready yet
|
|
798
|
+
stats["token_count"] = 0
|
|
799
|
+
stats["training_rounds"] = 0
|
|
800
|
+
|
|
801
|
+
# Add version
|
|
802
|
+
stats["version"] = __version__
|
|
803
|
+
|
|
804
|
+
# Add current config settings (for UI sliders)
|
|
805
|
+
stats["config"] = {
|
|
806
|
+
"cpu_threads": STATE.get("config_cpu_threads"),
|
|
807
|
+
"memory_mb": STATE.get("config_memory_mb"),
|
|
808
|
+
"storage_mb": STATE.get("config_storage_mb", 100), # Default 100MB
|
|
809
|
+
}
|
|
810
|
+
|
|
811
|
+
return stats
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
@node_app.get("/api/node/architecture")
|
|
815
|
+
async def get_node_architecture():
|
|
816
|
+
"""
|
|
817
|
+
Get this node's current architecture.
|
|
818
|
+
|
|
819
|
+
Used by other nodes to query network architecture when rejoining.
|
|
820
|
+
This enables smart architecture reconciliation across the network.
|
|
821
|
+
"""
|
|
822
|
+
if not NEURO_NODE or not NEURO_NODE.model:
|
|
823
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
824
|
+
|
|
825
|
+
arch = NEURO_NODE.model.architecture
|
|
826
|
+
|
|
827
|
+
return {
|
|
828
|
+
"hidden_dim": arch.hidden_dim,
|
|
829
|
+
"intermediate_dim": arch.intermediate_dim,
|
|
830
|
+
"num_layers": arch.num_layers,
|
|
831
|
+
"num_heads": arch.num_heads,
|
|
832
|
+
"num_kv_heads": arch.num_kv_heads,
|
|
833
|
+
"estimated_params": arch.estimate_params(),
|
|
834
|
+
"estimated_memory_mb": arch.estimate_memory_mb(),
|
|
835
|
+
"architecture_version": getattr(NEURO_NODE.layer_pool, 'architecture_version', 1),
|
|
836
|
+
}
|
|
837
|
+
|
|
838
|
+
|
|
839
|
+
@node_app.get("/api/market")
|
|
840
|
+
async def get_market_stats():
|
|
841
|
+
"""
|
|
842
|
+
Get real-time inference market statistics.
|
|
843
|
+
|
|
844
|
+
Returns current price, supply, demand, utilization.
|
|
845
|
+
"""
|
|
846
|
+
if not P2P or not P2P.ledger:
|
|
847
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
848
|
+
|
|
849
|
+
return P2P.ledger.get_inference_market_stats()
|
|
850
|
+
|
|
851
|
+
|
|
852
|
+
@node_app.post("/api/market/register")
|
|
853
|
+
async def register_inference_capacity(
|
|
854
|
+
tokens_per_second: int,
|
|
855
|
+
min_price: float = 0.0
|
|
856
|
+
):
|
|
857
|
+
"""
|
|
858
|
+
Register this node's inference capacity with the market.
|
|
859
|
+
|
|
860
|
+
Nodes should call this when idle/available to serve inference.
|
|
861
|
+
Call withdraw endpoint when switching to training.
|
|
862
|
+
"""
|
|
863
|
+
if not P2P or not P2P.ledger:
|
|
864
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
865
|
+
|
|
866
|
+
P2P.ledger.register_inference_capacity(
|
|
867
|
+
tokens_per_second=tokens_per_second,
|
|
868
|
+
min_price=min_price
|
|
869
|
+
)
|
|
870
|
+
|
|
871
|
+
return {"status": "registered", "tokens_per_second": tokens_per_second, "min_price": min_price}
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
@node_app.post("/api/market/withdraw")
|
|
875
|
+
async def withdraw_inference_capacity():
|
|
876
|
+
"""
|
|
877
|
+
Withdraw this node from inference market.
|
|
878
|
+
|
|
879
|
+
Call this when switching to training.
|
|
880
|
+
"""
|
|
881
|
+
if not P2P or not P2P.ledger:
|
|
882
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
883
|
+
|
|
884
|
+
P2P.ledger.withdraw_inference_capacity()
|
|
885
|
+
|
|
886
|
+
return {"status": "withdrawn"}
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
# ==================== DISTRIBUTED INFERENCE MARKETPLACE ====================
|
|
890
|
+
|
|
891
|
+
class MarketplaceSubmitRequest(BaseModel):
|
|
892
|
+
"""User submits inference request to marketplace."""
|
|
893
|
+
prompt: str
|
|
894
|
+
max_tokens: int = 100
|
|
895
|
+
max_price: float = 1.0
|
|
896
|
+
driver_node_id: Optional[str] = None # Optional: specify driver, else round-robin
|
|
897
|
+
|
|
898
|
+
|
|
899
|
+
class DriverPromptRequest(BaseModel):
|
|
900
|
+
"""User sends encrypted prompt directly to driver."""
|
|
901
|
+
encrypted_prompt: str
|
|
902
|
+
user_id: str
|
|
903
|
+
|
|
904
|
+
|
|
905
|
+
@node_app.post("/api/market/submit")
|
|
906
|
+
async def submit_marketplace_request(req: MarketplaceSubmitRequest):
|
|
907
|
+
"""
|
|
908
|
+
Submit inference request to marketplace (USER API).
|
|
909
|
+
|
|
910
|
+
Flow:
|
|
911
|
+
1. User submits request with prompt
|
|
912
|
+
2. Marketplace locks price, assigns driver
|
|
913
|
+
3. User sends encrypted prompt to driver
|
|
914
|
+
4. Driver processes, returns result
|
|
915
|
+
|
|
916
|
+
Returns:
|
|
917
|
+
request_id, locked_price, driver_node_id
|
|
918
|
+
"""
|
|
919
|
+
if not NEURO_NODE or not P2P or not P2P.ledger:
|
|
920
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
921
|
+
|
|
922
|
+
if not hasattr(P2P.ledger, 'inference_market'):
|
|
923
|
+
raise HTTPException(status_code=503, detail="Marketplace not available")
|
|
924
|
+
|
|
925
|
+
market = P2P.ledger.inference_market
|
|
926
|
+
|
|
927
|
+
# Choose driver (round-robin if not specified)
|
|
928
|
+
driver_node_id = req.driver_node_id
|
|
929
|
+
|
|
930
|
+
if not driver_node_id:
|
|
931
|
+
# Find a driver node from layer pool
|
|
932
|
+
if NEURO_NODE.layer_pool:
|
|
933
|
+
route = NEURO_NODE.layer_pool.get_pipeline_route()
|
|
934
|
+
if route and len(route) > 0:
|
|
935
|
+
# First layer should be embedding (driver)
|
|
936
|
+
driver_node_id = route[0][1].split(':')[0] if ':' in route[0][1] else NEURO_NODE.node_id
|
|
937
|
+
else:
|
|
938
|
+
# Fallback to this node if it's a driver
|
|
939
|
+
if NEURO_NODE.model.has_embedding:
|
|
940
|
+
driver_node_id = NEURO_NODE.node_id
|
|
941
|
+
else:
|
|
942
|
+
raise HTTPException(status_code=503, detail="No driver nodes available")
|
|
943
|
+
else:
|
|
944
|
+
# Single node mode
|
|
945
|
+
driver_node_id = NEURO_NODE.node_id
|
|
946
|
+
|
|
947
|
+
# Sign request with node's ECDSA key (authorizes payment)
|
|
948
|
+
from neuroshard.core.crypto.ecdsa import sign_message
|
|
949
|
+
signature_payload = f"{NEURO_NODE.node_id}:{driver_node_id}:{req.max_tokens}:{req.max_price}"
|
|
950
|
+
user_signature = sign_message(signature_payload, NEURO_NODE.node_token)
|
|
951
|
+
|
|
952
|
+
# Submit to marketplace (without prompt - privacy!)
|
|
953
|
+
success, request_id, locked_price = market.submit_request(
|
|
954
|
+
user_id=NEURO_NODE.node_id, # For testing, use node ID as user ID
|
|
955
|
+
driver_node_id=driver_node_id,
|
|
956
|
+
tokens_requested=req.max_tokens,
|
|
957
|
+
max_price=req.max_price,
|
|
958
|
+
user_signature=user_signature,
|
|
959
|
+
priority=0
|
|
960
|
+
)
|
|
961
|
+
|
|
962
|
+
if not success:
|
|
963
|
+
raise HTTPException(status_code=400, detail="Request rejected (price too high or market full)")
|
|
964
|
+
|
|
965
|
+
# Encrypt prompt for driver
|
|
966
|
+
from neuroshard.core.network.encrypted_channel import PromptEncryption
|
|
967
|
+
encrypted_prompt = PromptEncryption.encrypt_prompt(req.prompt, request_id)
|
|
968
|
+
|
|
969
|
+
# If we are the driver, add to our own queue
|
|
970
|
+
if driver_node_id == NEURO_NODE.node_id and hasattr(NEURO_NODE, 'prompt_queue'):
|
|
971
|
+
from neuroshard.core.network.encrypted_channel import EncryptedPrompt
|
|
972
|
+
import time
|
|
973
|
+
|
|
974
|
+
NEURO_NODE.prompt_queue.add_prompt(EncryptedPrompt(
|
|
975
|
+
request_id=request_id,
|
|
976
|
+
encrypted_data=encrypted_prompt,
|
|
977
|
+
timestamp=time.time(),
|
|
978
|
+
user_id=NEURO_NODE.node_id
|
|
979
|
+
))
|
|
980
|
+
logger.info(f"[API] ✓ Added encrypted prompt to local driver queue")
|
|
981
|
+
|
|
982
|
+
return {
|
|
983
|
+
"request_id": request_id,
|
|
984
|
+
"locked_price": locked_price,
|
|
985
|
+
"driver_node_id": driver_node_id,
|
|
986
|
+
"encrypted_prompt": encrypted_prompt, # User should send this to driver
|
|
987
|
+
"instructions": f"POST encrypted_prompt to /api/driver/prompt/{request_id} on driver node"
|
|
988
|
+
}
|
|
989
|
+
|
|
990
|
+
|
|
991
|
+
@node_app.post("/api/driver/prompt/{request_id}")
|
|
992
|
+
async def submit_encrypted_prompt(request_id: str, req: DriverPromptRequest):
|
|
993
|
+
"""
|
|
994
|
+
User sends encrypted prompt to driver node (PRIVACY CHANNEL).
|
|
995
|
+
|
|
996
|
+
This endpoint is called on the DRIVER node, not the marketplace.
|
|
997
|
+
Prompt is encrypted - only driver can decrypt it.
|
|
998
|
+
"""
|
|
999
|
+
if not NEURO_NODE or not NEURO_NODE.model.has_embedding:
|
|
1000
|
+
raise HTTPException(status_code=403, detail="This node is not a driver")
|
|
1001
|
+
|
|
1002
|
+
if not hasattr(NEURO_NODE, 'prompt_queue'):
|
|
1003
|
+
raise HTTPException(status_code=503, detail="Driver not initialized")
|
|
1004
|
+
|
|
1005
|
+
# Add to prompt queue
|
|
1006
|
+
from neuroshard.core.network.encrypted_channel import EncryptedPrompt
|
|
1007
|
+
import time
|
|
1008
|
+
|
|
1009
|
+
prompt = EncryptedPrompt(
|
|
1010
|
+
request_id=request_id,
|
|
1011
|
+
encrypted_data=req.encrypted_prompt,
|
|
1012
|
+
timestamp=time.time(),
|
|
1013
|
+
user_id=req.user_id
|
|
1014
|
+
)
|
|
1015
|
+
|
|
1016
|
+
success = NEURO_NODE.prompt_queue.add_prompt(prompt)
|
|
1017
|
+
|
|
1018
|
+
if not success:
|
|
1019
|
+
raise HTTPException(status_code=503, detail="Prompt queue full")
|
|
1020
|
+
|
|
1021
|
+
return {
|
|
1022
|
+
"status": "success",
|
|
1023
|
+
"message": f"Encrypted prompt queued for request {request_id[:8]}...",
|
|
1024
|
+
"queue_position": len(NEURO_NODE.prompt_queue.prompts)
|
|
1025
|
+
}
|
|
1026
|
+
|
|
1027
|
+
|
|
1028
|
+
@node_app.get("/api/market/request/{request_id}")
|
|
1029
|
+
async def get_request_status(request_id: str):
|
|
1030
|
+
"""
|
|
1031
|
+
Get status of inference request.
|
|
1032
|
+
|
|
1033
|
+
Returns:
|
|
1034
|
+
status, progress, eta, result (if completed)
|
|
1035
|
+
"""
|
|
1036
|
+
if not NEURO_NODE or not P2P or not P2P.ledger:
|
|
1037
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1038
|
+
|
|
1039
|
+
if not hasattr(P2P.ledger, 'inference_market'):
|
|
1040
|
+
raise HTTPException(status_code=503, detail="Marketplace not available")
|
|
1041
|
+
|
|
1042
|
+
market = P2P.ledger.inference_market
|
|
1043
|
+
request = market.get_request(request_id)
|
|
1044
|
+
|
|
1045
|
+
if not request:
|
|
1046
|
+
raise HTTPException(status_code=404, detail="Request not found")
|
|
1047
|
+
|
|
1048
|
+
# Get result from marketplace storage
|
|
1049
|
+
result_text = market.get_result(request_id)
|
|
1050
|
+
|
|
1051
|
+
return {
|
|
1052
|
+
"request_id": request_id,
|
|
1053
|
+
"status": request.status,
|
|
1054
|
+
"locked_price": request.locked_price,
|
|
1055
|
+
"tokens_requested": request.tokens_requested,
|
|
1056
|
+
"driver_node_id": request.driver_node_id,
|
|
1057
|
+
"pipeline_session_id": request.pipeline_session_id,
|
|
1058
|
+
"result": result_text,
|
|
1059
|
+
"completed": request.status == "completed" and result_text is not None
|
|
1060
|
+
}
|
|
1061
|
+
|
|
1062
|
+
|
|
1063
|
+
@node_app.get("/api/ponw")
|
|
1064
|
+
async def get_ponw_proof():
|
|
1065
|
+
"""
|
|
1066
|
+
Get Proof of Neural Work for this node.
|
|
1067
|
+
|
|
1068
|
+
This proves the node actually contributed compute for training/inference.
|
|
1069
|
+
Used for NEURO token rewards.
|
|
1070
|
+
"""
|
|
1071
|
+
if not NEURO_NODE:
|
|
1072
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1073
|
+
|
|
1074
|
+
return NEURO_NODE.get_ponw_proof()
|
|
1075
|
+
|
|
1076
|
+
|
|
1077
|
+
@node_app.get("/api/neuro")
|
|
1078
|
+
async def get_neuro_balance():
|
|
1079
|
+
"""
|
|
1080
|
+
Get NEURO token balance and account info for this node.
|
|
1081
|
+
|
|
1082
|
+
Returns:
|
|
1083
|
+
- balance: Current spendable balance
|
|
1084
|
+
- total_earned: Lifetime earnings from PoNW
|
|
1085
|
+
- total_spent: Lifetime spending
|
|
1086
|
+
- stake: Currently staked amount
|
|
1087
|
+
- stake_multiplier: Reward multiplier from staking
|
|
1088
|
+
"""
|
|
1089
|
+
# Use local reference to avoid race condition during shutdown
|
|
1090
|
+
p2p = P2P
|
|
1091
|
+
if not p2p or not p2p.ledger:
|
|
1092
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1093
|
+
|
|
1094
|
+
ledger = p2p.ledger
|
|
1095
|
+
|
|
1096
|
+
try:
|
|
1097
|
+
# Use NEUROLedger API (no fallbacks)
|
|
1098
|
+
account_info = ledger.get_account_info()
|
|
1099
|
+
burn_stats = ledger.get_burn_stats()
|
|
1100
|
+
|
|
1101
|
+
# Get node IDs
|
|
1102
|
+
wallet_id = ledger.node_id
|
|
1103
|
+
node_id = p2p.node_id
|
|
1104
|
+
|
|
1105
|
+
return {
|
|
1106
|
+
"balance": round(account_info.get("balance", 0.0), 6),
|
|
1107
|
+
"total_earned": round(account_info.get("total_earned", 0.0), 6),
|
|
1108
|
+
"total_spent": round(account_info.get("total_spent", 0.0), 6),
|
|
1109
|
+
"stake": round(account_info.get("stake", 0.0), 2),
|
|
1110
|
+
"stake_multiplier": round(account_info.get("stake_multiplier", 1.0), 2),
|
|
1111
|
+
"proof_count": account_info.get("proof_count", 0),
|
|
1112
|
+
"wallet_id": wallet_id,
|
|
1113
|
+
"node_id": node_id,
|
|
1114
|
+
"network": {
|
|
1115
|
+
"total_burned": round(burn_stats.get("total_burned", 0.0), 6),
|
|
1116
|
+
"circulating_supply": round(burn_stats.get("circulating_supply", 0.0), 6),
|
|
1117
|
+
"burn_rate": "5%"
|
|
1118
|
+
}
|
|
1119
|
+
}
|
|
1120
|
+
except Exception as e:
|
|
1121
|
+
# Handle shutdown race condition gracefully
|
|
1122
|
+
raise HTTPException(status_code=503, detail=f"Service shutting down: {e}")
|
|
1123
|
+
|
|
1124
|
+
|
|
1125
|
+
# ==================== STAKING ENDPOINTS ====================
|
|
1126
|
+
|
|
1127
|
+
class StakeRequest(BaseModel):
|
|
1128
|
+
amount: float
|
|
1129
|
+
duration_days: int = 30
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
@node_app.post("/api/stake")
|
|
1133
|
+
async def stake_neuro(req: StakeRequest):
|
|
1134
|
+
"""
|
|
1135
|
+
Stake NEURO tokens for reward multiplier.
|
|
1136
|
+
|
|
1137
|
+
Staking provides:
|
|
1138
|
+
- 10% bonus per 1000 NEURO staked (diminishing returns)
|
|
1139
|
+
- Tokens locked for specified duration
|
|
1140
|
+
- 100+ NEURO stake unlocks Validator role (computes real cross-entropy loss)
|
|
1141
|
+
|
|
1142
|
+
Example: Stake 2000 NEURO for 30 days = ~1.16x multiplier on all rewards
|
|
1143
|
+
"""
|
|
1144
|
+
if not P2P or not P2P.ledger:
|
|
1145
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1146
|
+
|
|
1147
|
+
# Validate using centralized economics
|
|
1148
|
+
is_valid, error = is_valid_stake_amount(req.amount)
|
|
1149
|
+
if not is_valid:
|
|
1150
|
+
raise HTTPException(status_code=400, detail=error)
|
|
1151
|
+
|
|
1152
|
+
is_valid, error = is_valid_stake_duration(req.duration_days)
|
|
1153
|
+
if not is_valid:
|
|
1154
|
+
raise HTTPException(status_code=400, detail=error)
|
|
1155
|
+
|
|
1156
|
+
success, message = P2P.ledger.stake(req.amount, req.duration_days)
|
|
1157
|
+
|
|
1158
|
+
if success:
|
|
1159
|
+
account = P2P.ledger.get_account_info()
|
|
1160
|
+
new_stake = account.get("stake", 0.0)
|
|
1161
|
+
|
|
1162
|
+
# Get dynamic validator stake requirement based on network size
|
|
1163
|
+
num_validators = 0
|
|
1164
|
+
if NEURO_NODE and hasattr(NEURO_NODE, 'layer_pool') and NEURO_NODE.layer_pool:
|
|
1165
|
+
last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
|
|
1166
|
+
num_validators = len(NEURO_NODE.layer_pool.layer_assignments.get(last_layer, []))
|
|
1167
|
+
|
|
1168
|
+
required_stake = get_dynamic_validator_stake(num_validators)
|
|
1169
|
+
|
|
1170
|
+
# Check if we should upgrade to Validator (no restart needed!)
|
|
1171
|
+
validator_upgraded = False
|
|
1172
|
+
if new_stake >= required_stake and NEURO_NODE:
|
|
1173
|
+
# Check if not already a validator
|
|
1174
|
+
if hasattr(NEURO_NODE, 'model') and NEURO_NODE.model and not NEURO_NODE.model.has_lm_head:
|
|
1175
|
+
# Upgrade the model to have LM head
|
|
1176
|
+
if NEURO_NODE.model.initialize_lm_head():
|
|
1177
|
+
validator_upgraded = True
|
|
1178
|
+
logger.info(f"Node upgraded to VALIDATOR! Now computing real cross-entropy loss.")
|
|
1179
|
+
|
|
1180
|
+
response = {
|
|
1181
|
+
"success": True,
|
|
1182
|
+
"message": message,
|
|
1183
|
+
"new_stake": new_stake,
|
|
1184
|
+
"new_multiplier": account.get("stake_multiplier", 1.0),
|
|
1185
|
+
"locked_until": account.get("stake_locked_until", 0.0),
|
|
1186
|
+
"validator_stake_required": required_stake,
|
|
1187
|
+
"num_validators": num_validators,
|
|
1188
|
+
}
|
|
1189
|
+
|
|
1190
|
+
if validator_upgraded:
|
|
1191
|
+
response["validator_upgrade"] = True
|
|
1192
|
+
response["message"] += " Upgraded to VALIDATOR! Now computing real training loss."
|
|
1193
|
+
elif new_stake < required_stake:
|
|
1194
|
+
response["validator_progress"] = f"{new_stake:.0f}/{required_stake:.0f} NEURO ({new_stake/required_stake*100:.1f}%)"
|
|
1195
|
+
|
|
1196
|
+
return response
|
|
1197
|
+
else:
|
|
1198
|
+
raise HTTPException(status_code=400, detail=message)
|
|
1199
|
+
|
|
1200
|
+
|
|
1201
|
+
@node_app.post("/api/unstake")
|
|
1202
|
+
async def unstake_neuro():
|
|
1203
|
+
"""
|
|
1204
|
+
Unstake NEURO tokens (if lock period expired).
|
|
1205
|
+
|
|
1206
|
+
Returns staked tokens to balance.
|
|
1207
|
+
Note: If remaining stake drops below validator requirement, node is demoted to Worker.
|
|
1208
|
+
"""
|
|
1209
|
+
if not P2P or not P2P.ledger:
|
|
1210
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1211
|
+
|
|
1212
|
+
success, amount, message = P2P.ledger.unstake()
|
|
1213
|
+
|
|
1214
|
+
if success:
|
|
1215
|
+
# Check if we need to demote from Validator
|
|
1216
|
+
validator_demoted = False
|
|
1217
|
+
account = P2P.ledger.get_account_info()
|
|
1218
|
+
remaining_stake = account.get("stake", 0.0)
|
|
1219
|
+
|
|
1220
|
+
# Get current network size for dynamic stake calculation
|
|
1221
|
+
num_validators = 0
|
|
1222
|
+
if NEURO_NODE and hasattr(NEURO_NODE, 'layer_pool') and NEURO_NODE.layer_pool:
|
|
1223
|
+
last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
|
|
1224
|
+
num_validators = len(NEURO_NODE.layer_pool.layer_assignments.get(last_layer, []))
|
|
1225
|
+
|
|
1226
|
+
required_stake = get_dynamic_validator_stake(num_validators)
|
|
1227
|
+
|
|
1228
|
+
# Check if we were a validator and now don't qualify
|
|
1229
|
+
if NEURO_NODE and hasattr(NEURO_NODE, 'model') and NEURO_NODE.model:
|
|
1230
|
+
if NEURO_NODE.model.has_lm_head and remaining_stake < required_stake:
|
|
1231
|
+
# Demote from validator
|
|
1232
|
+
if NEURO_NODE.model.disable_lm_head():
|
|
1233
|
+
validator_demoted = True
|
|
1234
|
+
# Also update layer pool
|
|
1235
|
+
if NEURO_NODE.layer_pool:
|
|
1236
|
+
NEURO_NODE.layer_pool.demote_from_validator(NEURO_NODE.node_id)
|
|
1237
|
+
logger.warning(f"Node demoted from Validator: stake {remaining_stake:.0f} < {required_stake:.0f} required")
|
|
1238
|
+
|
|
1239
|
+
response = {
|
|
1240
|
+
"success": True,
|
|
1241
|
+
"message": message,
|
|
1242
|
+
"amount_unstaked": amount,
|
|
1243
|
+
"remaining_stake": remaining_stake,
|
|
1244
|
+
}
|
|
1245
|
+
|
|
1246
|
+
if validator_demoted:
|
|
1247
|
+
response["validator_demoted"] = True
|
|
1248
|
+
response["message"] += f" WARNING: Demoted from Validator (need {required_stake:.0f} NEURO, have {remaining_stake:.0f})"
|
|
1249
|
+
|
|
1250
|
+
return response
|
|
1251
|
+
else:
|
|
1252
|
+
raise HTTPException(status_code=400, detail=message)
|
|
1253
|
+
|
|
1254
|
+
|
|
1255
|
+
@node_app.get("/api/stake/info")
|
|
1256
|
+
async def get_stake_info():
|
|
1257
|
+
"""Get current staking information with dynamic validator requirements."""
|
|
1258
|
+
if not P2P or not P2P.ledger:
|
|
1259
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1260
|
+
|
|
1261
|
+
account = P2P.ledger.get_account_info()
|
|
1262
|
+
|
|
1263
|
+
# Get current network size for dynamic stake calculation
|
|
1264
|
+
num_validators = 0
|
|
1265
|
+
if NEURO_NODE and hasattr(NEURO_NODE, 'layer_pool') and NEURO_NODE.layer_pool:
|
|
1266
|
+
last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
|
|
1267
|
+
num_validators = len(NEURO_NODE.layer_pool.layer_assignments.get(last_layer, []))
|
|
1268
|
+
|
|
1269
|
+
return {
|
|
1270
|
+
"stake": account.get("stake", 0.0),
|
|
1271
|
+
"stake_multiplier": account.get("stake_multiplier", 1.0),
|
|
1272
|
+
"stake_locked_until": account.get("stake_locked_until", 0.0),
|
|
1273
|
+
"balance": account.get("balance", 0.0),
|
|
1274
|
+
"staking_info": {
|
|
1275
|
+
"bonus_per_1000": "10% (diminishing)",
|
|
1276
|
+
"min_lock_days": 1,
|
|
1277
|
+
"max_lock_days": 365,
|
|
1278
|
+
"validator_stake": get_validator_stake_info(num_validators),
|
|
1279
|
+
}
|
|
1280
|
+
}
|
|
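Taken together, /api/stake, /api/unstake and /api/stake/info form the staking surface of the local node API. The sketch below is illustrative only and not part of the package; it assumes a node listening on http://localhost:8000 and relies on the response fields visible in this diff (stake, balance, staking_info.validator_stake, amount_unstaked).

import requests

BASE = "http://localhost:8000"  # assumed local node address

def show_stake_status():
    # /api/stake/info returns the current stake, balance and the dynamic validator requirement
    info = requests.get(f"{BASE}/api/stake/info", timeout=5).json()
    print("staked:", info["stake"], "balance:", info["balance"])
    print("validator requirement:", info["staking_info"]["validator_stake"])

def unstake_if_unlocked():
    # /api/unstake answers 400 with a reason while the lock period is still active
    resp = requests.post(f"{BASE}/api/unstake", timeout=5)
    if resp.status_code == 200:
        data = resp.json()
        print("unstaked", data["amount_unstaked"], "remaining stake", data["remaining_stake"])
        if data.get("validator_demoted"):
            print("note: node dropped below the validator requirement")
    else:
        print("unstake rejected:", resp.json().get("detail"))

if __name__ == "__main__":
    show_stake_status()
    unstake_if_unlocked()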
1281
|
+
|
|
1282
|
+
|
|
1283
|
+
class ThrottleUpdateRequest(BaseModel):
|
|
1284
|
+
cpu_threads: Optional[int] = None
|
|
1285
|
+
memory_mb: Optional[int] = None
|
|
1286
|
+
storage_mb: Optional[int] = None
|
|
1287
|
+
|
|
1288
|
+
|
|
1289
|
+
@node_app.post("/api/throttle")
|
|
1290
|
+
async def update_throttle(req: ThrottleUpdateRequest):
|
|
1291
|
+
"""
|
|
1292
|
+
Update training throttle settings while node is running.
|
|
1293
|
+
|
|
1294
|
+
This allows the GUI to change CPU/RAM/Storage limits without restarting.
|
|
1295
|
+
Changes take effect within 5 seconds.
|
|
1296
|
+
"""
|
|
1297
|
+
updated = {}
|
|
1298
|
+
|
|
1299
|
+
if req.cpu_threads is not None:
|
|
1300
|
+
STATE["config_cpu_threads"] = req.cpu_threads
|
|
1301
|
+
updated["cpu_threads"] = req.cpu_threads
|
|
1302
|
+
|
|
1303
|
+
if req.memory_mb is not None:
|
|
1304
|
+
STATE["config_memory_mb"] = req.memory_mb
|
|
1305
|
+
updated["memory_mb"] = req.memory_mb
|
|
1306
|
+
|
|
1307
|
+
if req.storage_mb is not None:
|
|
1308
|
+
STATE["config_storage_mb"] = req.storage_mb
|
|
1309
|
+
updated["storage_mb"] = req.storage_mb
|
|
1310
|
+
# Update genesis loader if it exists
|
|
1311
|
+
if NEURO_NODE and hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
|
|
1312
|
+
NEURO_NODE.genesis_loader.max_storage_mb = req.storage_mb
|
|
1313
|
+
NEURO_NODE.genesis_loader.max_shards = max(1, int(req.storage_mb / 10))
|
|
1314
|
+
logger.info(f"[NODE] Updated storage limit: {req.storage_mb}MB ({NEURO_NODE.genesis_loader.max_shards} shards)")
|
|
1315
|
+
|
|
1316
|
+
return {
|
|
1317
|
+
"success": True,
|
|
1318
|
+
"updated": updated,
|
|
1319
|
+
"message": "Settings updated. Changes take effect within 5 seconds.",
|
|
1320
|
+
"current_throttle": {
|
|
1321
|
+
"cpu_ratio": STATE.get("throttle_cpu_ratio", 1.0),
|
|
1322
|
+
"ram_ratio": STATE.get("throttle_ram_ratio", 1.0),
|
|
1323
|
+
"effective": STATE.get("throttle_effective", 1.0),
|
|
1324
|
+
}
|
|
1325
|
+
}
|
|
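Because the training loop re-reads these STATE keys on every iteration, a GUI or script can retune resource limits through /api/throttle without restarting the node. A minimal sketch, assuming a local node address; the field names come from the request model and response above:

import requests

# Limit the node to 2 CPU threads and 2 GB of RAM while the machine is busy with other work.
resp = requests.post(
    "http://localhost:8000/api/throttle",
    json={"cpu_threads": 2, "memory_mb": 2048},
    timeout=5,
)
body = resp.json()
print(body["message"])  # "Settings updated. Changes take effect within 5 seconds."
print("effective throttle:", body["current_throttle"]["effective"])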
1326
|
+
|
|
1327
|
+
|
|
1328
|
+
@node_app.get("/api/validator/info")
|
|
1329
|
+
async def get_validator_info():
|
|
1330
|
+
"""
|
|
1331
|
+
Get validator eligibility and status.
|
|
1332
|
+
|
|
1333
|
+
Validators require:
|
|
1334
|
+
- Minimum validator stake (dynamic; see /api/stake/info for the current requirement)
|
|
1335
|
+
- LM Head layer assignment (last layer)
|
|
1336
|
+
|
|
1337
|
+
Validators earn:
|
|
1338
|
+
- 30% bonus on rewards (up from 20%)
|
|
1339
|
+
- 0.001 NEURO per proof validated
|
|
1340
|
+
"""
|
|
1341
|
+
if not P2P or not P2P.ledger:
|
|
1342
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1343
|
+
|
|
1344
|
+
validator_info = P2P.ledger.get_validator_info()
|
|
1345
|
+
|
|
1346
|
+
# Add role info from node
|
|
1347
|
+
if NEURO_NODE:
|
|
1348
|
+
validator_info["has_lm_head"] = NEURO_NODE.model.has_lm_head if NEURO_NODE.model else False
|
|
1349
|
+
validator_info["is_active_validator"] = (
|
|
1350
|
+
validator_info["is_eligible_validator"] and
|
|
1351
|
+
validator_info.get("has_lm_head", False)
|
|
1352
|
+
)
|
|
1353
|
+
|
|
1354
|
+
return validator_info
|
|
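The is_active_validator flag above is simply ledger eligibility combined with actually holding the LM head. A small client-side check, again assuming a local node and the keys shown in this endpoint:

import requests

info = requests.get("http://localhost:8000/api/validator/info", timeout=5).json()
if info.get("is_active_validator"):
    print("Node is validating (holds the LM head and meets the stake requirement)")
elif info.get("is_eligible_validator"):
    print("Stake is sufficient, but this node does not hold the LM head yet")
else:
    print("Not a validator; see /api/stake/info for the current requirement")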
1355
|
+
|
|
1356
|
+
|
|
1357
|
+
# ==================== SWARM ENDPOINTS ====================
|
|
1358
|
+
|
|
1359
|
+
@node_app.get("/api/swarm")
|
|
1360
|
+
async def get_swarm_status():
|
|
1361
|
+
"""
|
|
1362
|
+
Get Swarm architecture status.
|
|
1363
|
+
|
|
1364
|
+
Returns buffer fill rates, heartbeat peers, routing stats.
|
|
1365
|
+
"""
|
|
1366
|
+
if not NEURO_NODE:
|
|
1367
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1368
|
+
|
|
1369
|
+
# Get swarm status from node
|
|
1370
|
+
swarm_status = NEURO_NODE.get_swarm_status()
|
|
1371
|
+
|
|
1372
|
+
return swarm_status
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
@node_app.get("/api/diloco")
|
|
1376
|
+
async def get_diloco_progress():
|
|
1377
|
+
"""
|
|
1378
|
+
Get DiLoCo training progress.
|
|
1379
|
+
|
|
1380
|
+
Returns inner step count, sync progress, outer step count.
|
|
1381
|
+
"""
|
|
1382
|
+
if not NEURO_NODE:
|
|
1383
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1384
|
+
|
|
1385
|
+
return NEURO_NODE.get_diloco_progress()
|
|
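/api/swarm and /api/diloco are meant for dashboards that poll the node periodically. A hedged polling sketch (assumed address and a 10 s interval; the exact keys are whatever get_swarm_status() and get_diloco_progress() return, so the sketch just prints the payloads):

import time
import requests

BASE = "http://localhost:8000"

while True:
    swarm = requests.get(f"{BASE}/api/swarm", timeout=5).json()
    diloco = requests.get(f"{BASE}/api/diloco", timeout=5).json()
    print("swarm:", swarm)
    print("diloco:", diloco)
    time.sleep(10)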
1386
|
+
|
|
1387
|
+
|
|
1388
|
+
@node_app.get("/api/model_info")
|
|
1389
|
+
async def get_model_info():
|
|
1390
|
+
"""Get information about the NeuroLLM model."""
|
|
1391
|
+
if not NEURO_NODE:
|
|
1392
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1393
|
+
|
|
1394
|
+
stats = NEURO_NODE.get_stats()
|
|
1395
|
+
|
|
1396
|
+
# Get architecture info
|
|
1397
|
+
arch_info = {}
|
|
1398
|
+
if NEURO_NODE.layer_pool and NEURO_NODE.layer_pool.current_architecture:
|
|
1399
|
+
arch = NEURO_NODE.layer_pool.current_architecture
|
|
1400
|
+
arch_info = {
|
|
1401
|
+
"hidden_dim": arch.hidden_dim,
|
|
1402
|
+
"num_layers": arch.num_layers,
|
|
1403
|
+
"num_heads": arch.num_heads,
|
|
1404
|
+
"vocab_size": arch.vocab_size,
|
|
1405
|
+
"architecture_version": NEURO_NODE.layer_pool.architecture_version,
|
|
1406
|
+
"total_params": arch.estimate_params(),
|
|
1407
|
+
}
|
|
1408
|
+
|
|
1409
|
+
# Sanitize loss for JSON
|
|
1410
|
+
model_loss = NEURO_NODE.current_loss
|
|
1411
|
+
if math.isinf(model_loss) or math.isnan(model_loss):
|
|
1412
|
+
model_loss = None
|
|
1413
|
+
|
|
1414
|
+
return {
|
|
1415
|
+
"model_name": "NeuroLLM",
|
|
1416
|
+
"description": "The People's Language Model - trained from scratch by the network",
|
|
1417
|
+
"architecture": arch_info, # NEW: Show current architecture
|
|
1418
|
+
"my_layers": stats.get("my_layers", []),
|
|
1419
|
+
"my_params": stats.get("my_params", 0),
|
|
1420
|
+
"network_layers": stats.get("network_layers", 0),
|
|
1421
|
+
"network_nodes": stats.get("network_nodes", 0),
|
|
1422
|
+
"total_training_rounds": NEURO_NODE.total_training_rounds,
|
|
1423
|
+
"current_loss": model_loss,
|
|
1424
|
+
"note": "This model is trained collaboratively. Quality improves as more users contribute!"
|
|
1425
|
+
}
|
|
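The architecture block above mirrors the layer pool's current consensus architecture, so a client can read it to report network-wide model size alongside the layers this node holds. Illustrative only, assuming a local node:

import requests

info = requests.get("http://localhost:8000/api/model_info", timeout=5).json()
arch = info.get("architecture") or {}
if arch:
    print(f"NeuroLLM v{arch['architecture_version']}: {arch['num_layers']} layers, "
          f"hidden_dim={arch['hidden_dim']}, ~{arch['total_params'] / 1e6:.0f}M params network-wide")
print("this node holds layers:", info["my_layers"])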
1426
|
+
|
|
1427
|
+
|
|
1428
|
+
@node_app.get("/api/network")
|
|
1429
|
+
async def get_network_info():
|
|
1430
|
+
"""Get network capacity and layer distribution."""
|
|
1431
|
+
if not NEURO_NODE or not NEURO_NODE.layer_pool:
|
|
1432
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1433
|
+
|
|
1434
|
+
capacity = NEURO_NODE.layer_pool.get_network_capacity()
|
|
1435
|
+
|
|
1436
|
+
return {
|
|
1437
|
+
"total_nodes": capacity.total_nodes,
|
|
1438
|
+
"total_memory_mb": capacity.total_memory_mb,
|
|
1439
|
+
"max_possible_layers": capacity.max_layers,
|
|
1440
|
+
"current_layers": capacity.assigned_layers,
|
|
1441
|
+
"layer_coverage": capacity.layer_coverage,
|
|
1442
|
+
"my_contribution": NEURO_NODE.model.get_my_contribution() if NEURO_NODE.model else {},
|
|
1443
|
+
}
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
@node_app.get("/api/logs")
|
|
1447
|
+
async def get_logs(since_id: Optional[int] = None, limit: int = 100):
|
|
1448
|
+
"""
|
|
1449
|
+
Get recent logs from the node.
|
|
1450
|
+
|
|
1451
|
+
Args:
|
|
1452
|
+
since_id: Return logs with ID greater than this (for polling).
|
|
1453
|
+
Use 0 or omit to get all available logs on initial load.
|
|
1454
|
+
limit: Maximum number of logs to return (default 100)
|
|
1455
|
+
|
|
1456
|
+
Returns:
|
|
1457
|
+
List of log entries with id, epoch, timestamp, message, type, and level
|
|
1458
|
+
"""
|
|
1459
|
+
with _LOG_BUFFER_LOCK:
|
|
1460
|
+
logs = list(_LOG_BUFFER)
|
|
1461
|
+
|
|
1462
|
+
# If since_id is provided, filter to only logs with ID > since_id
|
|
1463
|
+
if since_id is not None and since_id > 0:
|
|
1464
|
+
logs = [log for log in logs if log.get('id', 0) > since_id]
|
|
1465
|
+
|
|
1466
|
+
# Limit results (take most recent)
|
|
1467
|
+
if len(logs) > limit:
|
|
1468
|
+
logs = logs[-limit:]
|
|
1469
|
+
|
|
1470
|
+
# Get the latest log ID for next poll
|
|
1471
|
+
latest_id = logs[-1]['id'] if logs else (since_id or 0)
|
|
1472
|
+
|
|
1473
|
+
return {
|
|
1474
|
+
"logs": logs,
|
|
1475
|
+
"total": len(_LOG_BUFFER),
|
|
1476
|
+
"latest_id": latest_id, # Client should use this for next poll
|
|
1477
|
+
}
|
|
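The since_id/latest_id pair implements incremental log polling: the client stores latest_id from each response and passes it as since_id on the next call, so only new entries are transferred. A minimal consumer (assumed local address; entry keys as documented above):

import time
import requests

BASE = "http://localhost:8000"
last_id = 0  # 0 (or omitting since_id) returns the whole buffer on the first call

while True:
    resp = requests.get(f"{BASE}/api/logs", params={"since_id": last_id, "limit": 100}, timeout=5).json()
    for entry in resp["logs"]:
        print(f"[{entry.get('level', 'INFO')}] {entry.get('message', '')}")
    last_id = resp["latest_id"]
    time.sleep(2)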
1478
|
+
|
|
1479
|
+
|
|
1480
|
+
@node_app.post("/api/shutdown")
|
|
1481
|
+
async def shutdown_node():
|
|
1482
|
+
"""
|
|
1483
|
+
Gracefully shut down the node.
|
|
1484
|
+
|
|
1485
|
+
Saves checkpoint and stops all components cleanly.
|
|
1486
|
+
"""
|
|
1487
|
+
logger.info("[NODE] Shutdown requested via API")
|
|
1488
|
+
|
|
1489
|
+
# Use a background thread for shutdown (more reliable than asyncio.create_task)
|
|
1490
|
+
def do_shutdown():
|
|
1491
|
+
import time
|
|
1492
|
+
time.sleep(0.5) # Brief delay to allow response to be sent
|
|
1493
|
+
request_shutdown()
|
|
1494
|
+
|
|
1495
|
+
shutdown_thread = threading.Thread(target=do_shutdown, daemon=False)
|
|
1496
|
+
shutdown_thread.start()
|
|
1497
|
+
|
|
1498
|
+
return {
|
|
1499
|
+
"status": "shutting_down",
|
|
1500
|
+
"message": "Node will shutdown in 0.5 seconds. Checkpoint will be saved."
|
|
1501
|
+
}
|
|
1502
|
+
|
|
1503
|
+
|
|
1504
|
+
@node_app.get("/api/checkpoint/info")
|
|
1505
|
+
async def get_checkpoint_info():
|
|
1506
|
+
"""Get checkpoint info for P2P sync."""
|
|
1507
|
+
if not NEURO_NODE:
|
|
1508
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1509
|
+
|
|
1510
|
+
return NEURO_NODE.get_checkpoint_info()
|
|
1511
|
+
|
|
1512
|
+
|
|
1513
|
+
@node_app.get("/api/checkpoint/download")
|
|
1514
|
+
async def download_checkpoint():
|
|
1515
|
+
"""Download checkpoint (for P2P sync via HTTP fallback)."""
|
|
1516
|
+
import io
|
|
1517
|
+
import zlib
|
|
1518
|
+
from fastapi.responses import Response
|
|
1519
|
+
|
|
1520
|
+
if not NEURO_NODE or not NEURO_NODE.model:
|
|
1521
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1522
|
+
|
|
1523
|
+
try:
|
|
1524
|
+
# Serialize checkpoint for my layers only
|
|
1525
|
+
buffer = io.BytesIO()
|
|
1526
|
+
|
|
1527
|
+
# Collect layer state dicts
|
|
1528
|
+
layer_states = {
|
|
1529
|
+
layer_id: layer.state_dict()
|
|
1530
|
+
for layer_id, layer in NEURO_NODE.model.my_layers.items()
|
|
1531
|
+
}
|
|
1532
|
+
|
|
1533
|
+
checkpoint = {
|
|
1534
|
+
"layer_ids": NEURO_NODE.my_layer_ids,
|
|
1535
|
+
"layers": layer_states,
|
|
1536
|
+
"has_embedding": NEURO_NODE.model.has_embedding,
|
|
1537
|
+
"has_lm_head": NEURO_NODE.model.has_lm_head,
|
|
1538
|
+
"version": NEURO_NODE.total_training_rounds,
|
|
1539
|
+
}
|
|
1540
|
+
|
|
1541
|
+
if NEURO_NODE.model.embedding:
|
|
1542
|
+
checkpoint["embedding"] = NEURO_NODE.model.embedding.state_dict()
|
|
1543
|
+
if NEURO_NODE.model.lm_head:
|
|
1544
|
+
checkpoint["lm_head"] = NEURO_NODE.model.lm_head.state_dict()
|
|
1545
|
+
if NEURO_NODE.model.final_norm:
|
|
1546
|
+
checkpoint["final_norm"] = NEURO_NODE.model.final_norm.state_dict()
|
|
1547
|
+
|
|
1548
|
+
torch.save(checkpoint, buffer)
|
|
1549
|
+
|
|
1550
|
+
# Compress
|
|
1551
|
+
raw_data = buffer.getvalue()
|
|
1552
|
+
compressed = zlib.compress(raw_data, level=6)
|
|
1553
|
+
|
|
1554
|
+
return Response(
|
|
1555
|
+
content=compressed,
|
|
1556
|
+
media_type="application/octet-stream",
|
|
1557
|
+
headers={
|
|
1558
|
+
"X-Checkpoint-Version": str(checkpoint["version"]),
|
|
1559
|
+
"X-Layer-IDs": ",".join(map(str, NEURO_NODE.my_layer_ids)),
|
|
1560
|
+
"X-Original-Size": str(len(raw_data)),
|
|
1561
|
+
}
|
|
1562
|
+
)
|
|
1563
|
+
|
|
1564
|
+
except Exception as e:
|
|
1565
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
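The checkpoint download endpoint returns a zlib-compressed torch payload plus metadata headers, so the HTTP-fallback sync path on a peer is essentially the inverse of the code above. A client-side sketch under that assumption (peer address is a placeholder):

import io
import zlib
import requests
import torch

resp = requests.get("http://peer-host:8000/api/checkpoint/download", timeout=60)
resp.raise_for_status()

raw = zlib.decompress(resp.content)                       # undo the zlib compression
checkpoint = torch.load(io.BytesIO(raw), map_location="cpu")

print("peer layers:", resp.headers.get("X-Layer-IDs"))
print("checkpoint version:", resp.headers.get("X-Checkpoint-Version"))
print("layer state dicts for:", list(checkpoint["layers"].keys()))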
1566
|
+
|
|
1567
|
+
|
|
1568
|
+
# ==================== API v1 ENDPOINTS (SDK Compatible) ====================
|
|
1569
|
+
|
|
1570
|
+
class InferenceV1Request(BaseModel):
|
|
1571
|
+
"""Inference request matching SDK expectations."""
|
|
1572
|
+
prompt: str
|
|
1573
|
+
max_tokens: int = 100
|
|
1574
|
+
temperature: float = 1.0
|
|
1575
|
+
top_p: float = 1.0
|
|
1576
|
+
top_k: int = 50
|
|
1577
|
+
stop: List[str] = []
|
|
1578
|
+
stream: bool = False
|
|
1579
|
+
|
|
1580
|
+
|
|
1581
|
+
class SendNEURORequest(BaseModel):
|
|
1582
|
+
"""Send NEURO request."""
|
|
1583
|
+
to: str
|
|
1584
|
+
amount: float
|
|
1585
|
+
memo: str = ""
|
|
1586
|
+
|
|
1587
|
+
|
|
1588
|
+
@node_app.get("/api/v1/status")
|
|
1589
|
+
async def get_status_v1():
|
|
1590
|
+
"""
|
|
1591
|
+
Get current node status (SDK compatible).
|
|
1592
|
+
|
|
1593
|
+
Returns status in format expected by NeuroNode SDK.
|
|
1594
|
+
"""
|
|
1595
|
+
import math
|
|
1596
|
+
import psutil
|
|
1597
|
+
import os
|
|
1598
|
+
|
|
1599
|
+
if not NEURO_NODE:
|
|
1600
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1601
|
+
|
|
1602
|
+
# Get node stats
|
|
1603
|
+
stats = NEURO_NODE.get_stats()
|
|
1604
|
+
|
|
1605
|
+
# Determine role
|
|
1606
|
+
has_embedding = stats.get("has_embedding", False)
|
|
1607
|
+
has_lm_head = stats.get("has_lm_head", False)
|
|
1608
|
+
if has_embedding and has_lm_head:
|
|
1609
|
+
role = "full"
|
|
1610
|
+
elif has_embedding:
|
|
1611
|
+
role = "driver"
|
|
1612
|
+
elif has_lm_head:
|
|
1613
|
+
role = "validator"
|
|
1614
|
+
else:
|
|
1615
|
+
role = "worker"
|
|
1616
|
+
|
|
1617
|
+
# Get system resources
|
|
1618
|
+
try:
|
|
1619
|
+
mem = psutil.virtual_memory()
|
|
1620
|
+
process = psutil.Process(os.getpid())
|
|
1621
|
+
gpu_used = 0
|
|
1622
|
+
gpu_total = 0
|
|
1623
|
+
|
|
1624
|
+
if torch.cuda.is_available():
|
|
1625
|
+
gpu_used = torch.cuda.memory_allocated()
|
|
1626
|
+
gpu_total = torch.cuda.get_device_properties(0).total_memory
|
|
1627
|
+
|
|
1628
|
+
resources = {
|
|
1629
|
+
"gpu_memory_used": gpu_used,
|
|
1630
|
+
"gpu_memory_total": gpu_total,
|
|
1631
|
+
"cpu_percent": psutil.cpu_percent(),
|
|
1632
|
+
"ram_used": mem.used,
|
|
1633
|
+
"ram_total": mem.total,
|
|
1634
|
+
}
|
|
1635
|
+
except Exception:
|
|
1636
|
+
resources = {}
|
|
1637
|
+
|
|
1638
|
+
# Handle infinity loss
|
|
1639
|
+
loss = stats.get("current_loss", 0.0)
|
|
1640
|
+
if math.isinf(loss) or math.isnan(loss):
|
|
1641
|
+
loss = 0.0
|
|
1642
|
+
|
|
1643
|
+
return {
|
|
1644
|
+
"node_id": NEURO_NODE.node_id,
|
|
1645
|
+
"version": __version__,
|
|
1646
|
+
"uptime_seconds": int(time.time() - getattr(NEURO_NODE, '_start_time', time.time())),
|
|
1647
|
+
"status": STATE.get("training_status", "running"),
|
|
1648
|
+
"role": role,
|
|
1649
|
+
"layers": stats.get("my_layers", []),
|
|
1650
|
+
"peer_count": STATE.get("peer_count", 0),
|
|
1651
|
+
"has_embedding": has_embedding,
|
|
1652
|
+
"has_lm_head": has_lm_head,
|
|
1653
|
+
"training": {
|
|
1654
|
+
"enabled": NEURO_NODE.enable_training,
|
|
1655
|
+
"epoch": 0, # Not tracked separately
|
|
1656
|
+
"step": stats.get("total_training_rounds", 0),
|
|
1657
|
+
"loss": loss,
|
|
1658
|
+
},
|
|
1659
|
+
"resources": resources,
|
|
1660
|
+
}
|
|
1661
|
+
|
|
1662
|
+
|
|
1663
|
+
@node_app.get("/api/v1/metrics")
|
|
1664
|
+
async def get_metrics_v1():
|
|
1665
|
+
"""
|
|
1666
|
+
Get performance metrics (SDK compatible).
|
|
1667
|
+
"""
|
|
1668
|
+
import math
|
|
1669
|
+
from datetime import datetime
|
|
1670
|
+
|
|
1671
|
+
if not NEURO_NODE:
|
|
1672
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1673
|
+
|
|
1674
|
+
stats = NEURO_NODE.get_stats()
|
|
1675
|
+
|
|
1676
|
+
# Get balance info for rewards
|
|
1677
|
+
earned_total = 0.0
|
|
1678
|
+
pending = 0.0
|
|
1679
|
+
if P2P and P2P.ledger:
|
|
1680
|
+
account = P2P.ledger.get_account_info()
|
|
1681
|
+
earned_total = account.get("total_earned", 0.0)
|
|
1682
|
+
pending = 0.0 # Could track pending proofs
|
|
1683
|
+
|
|
1684
|
+
return {
|
|
1685
|
+
"timestamp": datetime.now().isoformat(),
|
|
1686
|
+
"inference": {
|
|
1687
|
+
"requests_total": STATE.get("processed_count", 0),
|
|
1688
|
+
"requests_per_minute": 0.0, # Would need tracking
|
|
1689
|
+
"avg_latency_ms": 0.0,
|
|
1690
|
+
"p99_latency_ms": 0.0,
|
|
1691
|
+
"tokens_generated": stats.get("total_tokens_processed", 0),
|
|
1692
|
+
},
|
|
1693
|
+
"training": {
|
|
1694
|
+
"steps_total": stats.get("total_training_rounds", 0),
|
|
1695
|
+
"steps_per_hour": 0.0,
|
|
1696
|
+
"gradients_submitted": 0,
|
|
1697
|
+
"gradients_accepted": 0,
|
|
1698
|
+
},
|
|
1699
|
+
"network": {
|
|
1700
|
+
"bytes_sent": 0,
|
|
1701
|
+
"bytes_received": 0,
|
|
1702
|
+
"active_connections": STATE.get("peer_count", 0),
|
|
1703
|
+
"rpc_calls": 0,
|
|
1704
|
+
"peer_count": STATE.get("peer_count", 0),
|
|
1705
|
+
},
|
|
1706
|
+
"rewards": {
|
|
1707
|
+
"earned_today": 0.0, # Would need daily tracking
|
|
1708
|
+
"earned_total": earned_total,
|
|
1709
|
+
"pending": pending,
|
|
1710
|
+
},
|
|
1711
|
+
}
|
|
1712
|
+
|
|
1713
|
+
|
|
1714
|
+
@node_app.get("/api/v1/health")
|
|
1715
|
+
async def health_check_v1():
|
|
1716
|
+
"""Health check endpoint (SDK compatible)."""
|
|
1717
|
+
checks = {
|
|
1718
|
+
"node": "ok" if NEURO_NODE else "error",
|
|
1719
|
+
"network": "ok" if P2P else "error",
|
|
1720
|
+
"model": "ok" if NEURO_NODE and NEURO_NODE.model else "error",
|
|
1721
|
+
}
|
|
1722
|
+
|
|
1723
|
+
# Check GPU
|
|
1724
|
+
try:
|
|
1725
|
+
if torch.cuda.is_available():
|
|
1726
|
+
checks["gpu"] = "ok"
|
|
1727
|
+
else:
|
|
1728
|
+
checks["gpu"] = "cpu_only"
|
|
1729
|
+
except Exception:
|
|
1730
|
+
checks["gpu"] = "unknown"
|
|
1731
|
+
|
|
1732
|
+
healthy = all(v == "ok" for k, v in checks.items() if k != "gpu")
|
|
1733
|
+
|
|
1734
|
+
return {
|
|
1735
|
+
"healthy": healthy,
|
|
1736
|
+
"checks": checks,
|
|
1737
|
+
}
|
|
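A deployment script can treat this endpoint as a readiness probe; note that "cpu_only" on the gpu check deliberately does not fail the probe. Sketch, assuming a local node:

import sys
import requests

try:
    health = requests.get("http://localhost:8000/api/v1/health", timeout=3).json()
except requests.RequestException:
    sys.exit("node unreachable")

if not health["healthy"]:
    failing = [name for name, state in health["checks"].items() if state == "error"]
    sys.exit(f"node unhealthy: {failing}")
print("node healthy; gpu:", health["checks"].get("gpu"))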
1738
|
+
|
|
1739
|
+
|
|
1740
|
+
@node_app.post("/api/v1/inference")
|
|
1741
|
+
async def inference_v1(req: InferenceV1Request):
|
|
1742
|
+
"""
|
|
1743
|
+
Run inference (SDK compatible).
|
|
1744
|
+
|
|
1745
|
+
Supports both streaming and non-streaming modes.
|
|
1746
|
+
"""
|
|
1747
|
+
from fastapi.responses import StreamingResponse
|
|
1748
|
+
import uuid
|
|
1749
|
+
|
|
1750
|
+
if not NEURO_NODE:
|
|
1751
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
1752
|
+
|
|
1753
|
+
start_time = time.time()
|
|
1754
|
+
request_id = f"inf_{uuid.uuid4().hex[:12]}"
|
|
1755
|
+
|
|
1756
|
+
if req.stream:
|
|
1757
|
+
# Streaming response
|
|
1758
|
+
async def generate_stream():
|
|
1759
|
+
try:
|
|
1760
|
+
# Generate the full completion first; it is streamed below token-by-token (whitespace split)
|
|
1761
|
+
text = NEURO_NODE.generate(
|
|
1762
|
+
prompt=req.prompt,
|
|
1763
|
+
max_new_tokens=req.max_tokens,
|
|
1764
|
+
temperature=req.temperature,
|
|
1765
|
+
)
|
|
1766
|
+
|
|
1767
|
+
# Emit tokens
|
|
1768
|
+
tokens = text.split()
|
|
1769
|
+
for i, token in enumerate(tokens):
|
|
1770
|
+
yield f"data: {json.dumps({'token': token + ' ', 'index': i})}\n\n"
|
|
1771
|
+
|
|
1772
|
+
# Final message
|
|
1773
|
+
yield f"data: {json.dumps({'token': '[DONE]', 'finish_reason': 'stop'})}\n\n"
|
|
1774
|
+
|
|
1775
|
+
except Exception as e:
|
|
1776
|
+
yield f"data: {json.dumps({'error': str(e)})}\n\n"
|
|
1777
|
+
|
|
1778
|
+
return StreamingResponse(
|
|
1779
|
+
generate_stream(),
|
|
1780
|
+
media_type="text/event-stream",
|
|
1781
|
+
)
|
|
1782
|
+
|
|
1783
|
+
# Non-streaming response
|
|
1784
|
+
try:
|
|
1785
|
+
text = NEURO_NODE.generate(
|
|
1786
|
+
prompt=req.prompt,
|
|
1787
|
+
max_new_tokens=req.max_tokens,
|
|
1788
|
+
temperature=req.temperature,
|
|
1789
|
+
)
|
|
1790
|
+
|
|
1791
|
+
end_time = time.time()
|
|
1792
|
+
inference_ms = (end_time - start_time) * 1000
|
|
1793
|
+
|
|
1794
|
+
# Count tokens (simple approximation)
|
|
1795
|
+
prompt_tokens = len(req.prompt.split())
|
|
1796
|
+
completion_tokens = len(text.split())
|
|
1797
|
+
|
|
1798
|
+
STATE["processed_count"] = STATE.get("processed_count", 0) + 1
|
|
1799
|
+
|
|
1800
|
+
return {
|
|
1801
|
+
"id": request_id,
|
|
1802
|
+
"text": text,
|
|
1803
|
+
"tokens_generated": completion_tokens,
|
|
1804
|
+
"finish_reason": "stop",
|
|
1805
|
+
"usage": {
|
|
1806
|
+
"prompt_tokens": prompt_tokens,
|
|
1807
|
+
"completion_tokens": completion_tokens,
|
|
1808
|
+
"total_tokens": prompt_tokens + completion_tokens,
|
|
1809
|
+
},
|
|
1810
|
+
"cost": {
|
|
1811
|
+
"amount": completion_tokens * 0.000001, # Approximate
|
|
1812
|
+
"currency": "NEURO",
|
|
1813
|
+
},
|
|
1814
|
+
"timing": {
|
|
1815
|
+
"queue_ms": 0,
|
|
1816
|
+
"inference_ms": inference_ms,
|
|
1817
|
+
"total_ms": inference_ms,
|
|
1818
|
+
},
|
|
1819
|
+
}
|
|
1820
|
+
|
|
1821
|
+
except Exception as e:
|
|
1822
|
+
raise HTTPException(status_code=500, detail=str(e))
|
|
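The streaming branch emits Server-Sent-Events-style "data:" lines, one JSON object per token, terminated by a "[DONE]" token. A minimal consumer using requests' line iterator, with an assumed local address and no SSE library:

import json
import requests

resp = requests.post(
    "http://localhost:8000/api/v1/inference",
    json={"prompt": "Hello", "max_tokens": 32, "stream": True},
    stream=True,
    timeout=120,
)

for line in resp.iter_lines(decode_unicode=True):
    if not line or not line.startswith("data: "):
        continue
    event = json.loads(line[len("data: "):])
    if event.get("token") == "[DONE]" or "error" in event:
        break
    print(event["token"], end="", flush=True)
print()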
1823
|
+
|
|
1824
|
+
|
|
1825
|
+
@node_app.get("/api/v1/wallet/balance")
|
|
1826
|
+
async def get_wallet_balance_v1():
|
|
1827
|
+
"""Get wallet balance (SDK compatible)."""
|
|
1828
|
+
if not P2P or not P2P.ledger:
|
|
1829
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1830
|
+
|
|
1831
|
+
account = P2P.ledger.get_account_info()
|
|
1832
|
+
|
|
1833
|
+
return {
|
|
1834
|
+
"address": P2P.ledger.node_id,
|
|
1835
|
+
"balances": {
|
|
1836
|
+
"available": account.get("balance", 0.0),
|
|
1837
|
+
"staked": account.get("stake", 0.0),
|
|
1838
|
+
"pending": 0.0,
|
|
1839
|
+
"total": account.get("balance", 0.0) + account.get("stake", 0.0),
|
|
1840
|
+
},
|
|
1841
|
+
"staking": {
|
|
1842
|
+
"amount": account.get("stake", 0.0),
|
|
1843
|
+
"duration_days": 30,
|
|
1844
|
+
"multiplier": account.get("stake_multiplier", 1.0),
|
|
1845
|
+
},
|
|
1846
|
+
}
|
|
1847
|
+
|
|
1848
|
+
|
|
1849
|
+
@node_app.post("/api/v1/wallet/send")
|
|
1850
|
+
async def send_neuro_v1(req: SendNEURORequest):
|
|
1851
|
+
"""Send NEURO tokens (SDK compatible)."""
|
|
1852
|
+
if not P2P or not P2P.ledger:
|
|
1853
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1854
|
+
|
|
1855
|
+
success, message, tx = P2P.ledger.transfer(req.to, req.amount, req.memo)
|
|
1856
|
+
|
|
1857
|
+
if not success:
|
|
1858
|
+
raise HTTPException(status_code=400, detail=message)
|
|
1859
|
+
|
|
1860
|
+
return {
|
|
1861
|
+
"transaction_id": tx.tx_id if tx else "",
|
|
1862
|
+
"from": P2P.ledger.node_id,
|
|
1863
|
+
"to": req.to,
|
|
1864
|
+
"amount": req.amount,
|
|
1865
|
+
"fee": tx.fee if tx else 0.0,
|
|
1866
|
+
"memo": req.memo,
|
|
1867
|
+
"status": "confirmed",
|
|
1868
|
+
"timestamp": datetime.now().isoformat() if 'datetime' in dir() else time.strftime("%Y-%m-%dT%H:%M:%SZ"),
|
|
1869
|
+
}
|
|
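Wallet operations follow the same request/response pattern; the SDK presumably wraps these, but they can be exercised directly. Sketch with an assumed local address and a placeholder recipient id:

import requests

BASE = "http://localhost:8000"

balance = requests.get(f"{BASE}/api/v1/wallet/balance", timeout=5).json()
print("available:", balance["balances"]["available"], "staked:", balance["balances"]["staked"])

resp = requests.post(
    f"{BASE}/api/v1/wallet/send",
    json={"to": "<recipient-node-id>", "amount": 1.5, "memo": "thanks"},
    timeout=10,
)
if resp.status_code == 200:
    tx = resp.json()
    print("sent", tx["amount"], "NEURO, fee", tx["fee"], "tx", tx["transaction_id"])
else:
    print("transfer rejected:", resp.json().get("detail"))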
1870
|
+
|
|
1871
|
+
|
|
1872
|
+
@node_app.get("/api/v1/wallet/transactions")
|
|
1873
|
+
async def get_transactions_v1(limit: int = 10, offset: int = 0, type: Optional[str] = None):
|
|
1874
|
+
"""Get transaction history (SDK compatible)."""
|
|
1875
|
+
if not P2P or not P2P.ledger:
|
|
1876
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1877
|
+
|
|
1878
|
+
# Get recent proofs as transactions
|
|
1879
|
+
import sqlite3
|
|
1880
|
+
transactions = []
|
|
1881
|
+
|
|
1882
|
+
try:
|
|
1883
|
+
with sqlite3.connect(P2P.ledger.db_path) as conn:
|
|
1884
|
+
query = """
|
|
1885
|
+
SELECT signature, node_id, proof_type, timestamp, reward_amount
|
|
1886
|
+
FROM proof_history
|
|
1887
|
+
WHERE node_id = ?
|
|
1888
|
+
ORDER BY timestamp DESC
|
|
1889
|
+
LIMIT ? OFFSET ?
|
|
1890
|
+
"""
|
|
1891
|
+
rows = conn.execute(query, (P2P.ledger.node_id, limit, offset)).fetchall()
|
|
1892
|
+
|
|
1893
|
+
for sig, node_id, ptype, ts, reward in rows:
|
|
1894
|
+
transactions.append({
|
|
1895
|
+
"id": sig[:16] if sig else "",
|
|
1896
|
+
"type": "reward",
|
|
1897
|
+
"amount": reward,
|
|
1898
|
+
"timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(ts)),
|
|
1899
|
+
"details": {
|
|
1900
|
+
"proof_type": ptype,
|
|
1901
|
+
},
|
|
1902
|
+
})
|
|
1903
|
+
except Exception:
|
|
1904
|
+
pass
|
|
1905
|
+
|
|
1906
|
+
return {
|
|
1907
|
+
"transactions": transactions,
|
|
1908
|
+
"total": len(transactions),
|
|
1909
|
+
"limit": limit,
|
|
1910
|
+
"offset": offset,
|
|
1911
|
+
}
|
|
1912
|
+
|
|
1913
|
+
|
|
1914
|
+
@node_app.post("/api/v1/wallet/stake")
|
|
1915
|
+
async def stake_neuro_v1(req: StakeRequest):
|
|
1916
|
+
"""Stake NEURO tokens (SDK compatible)."""
|
|
1917
|
+
if not P2P or not P2P.ledger:
|
|
1918
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1919
|
+
|
|
1920
|
+
success, message = P2P.ledger.stake(req.amount, req.duration_days)
|
|
1921
|
+
|
|
1922
|
+
if not success:
|
|
1923
|
+
raise HTTPException(status_code=400, detail=message)
|
|
1924
|
+
|
|
1925
|
+
account = P2P.ledger.get_account_info()
|
|
1926
|
+
|
|
1927
|
+
from datetime import date, timedelta
|
|
1928
|
+
start = date.today()
|
|
1929
|
+
unlock = start + timedelta(days=req.duration_days)
|
|
1930
|
+
|
|
1931
|
+
return {
|
|
1932
|
+
"success": True,
|
|
1933
|
+
"stake": {
|
|
1934
|
+
"amount": req.amount,
|
|
1935
|
+
"duration_days": req.duration_days,
|
|
1936
|
+
"start_date": start.isoformat(),
|
|
1937
|
+
"unlock_date": unlock.isoformat(),
|
|
1938
|
+
"multiplier": account.get("stake_multiplier", 1.0),
|
|
1939
|
+
},
|
|
1940
|
+
"new_balance": {
|
|
1941
|
+
"available": account.get("balance", 0.0),
|
|
1942
|
+
"staked": account.get("stake", 0.0),
|
|
1943
|
+
},
|
|
1944
|
+
}
|
|
1945
|
+
|
|
1946
|
+
|
|
1947
|
+
@node_app.post("/api/v1/wallet/unstake")
|
|
1948
|
+
async def unstake_neuro_v1(amount: Optional[float] = None):
|
|
1949
|
+
"""Request unstaking (SDK compatible)."""
|
|
1950
|
+
if not P2P or not P2P.ledger:
|
|
1951
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1952
|
+
|
|
1953
|
+
success, unstaked_amount, message = P2P.ledger.unstake()
|
|
1954
|
+
|
|
1955
|
+
if not success:
|
|
1956
|
+
raise HTTPException(status_code=400, detail=message)
|
|
1957
|
+
|
|
1958
|
+
from datetime import date, timedelta
|
|
1959
|
+
available = date.today() + timedelta(days=7)
|
|
1960
|
+
|
|
1961
|
+
return {
|
|
1962
|
+
"success": True,
|
|
1963
|
+
"unstake": {
|
|
1964
|
+
"amount": unstaked_amount,
|
|
1965
|
+
"cooldown_days": 7,
|
|
1966
|
+
"available_date": available.isoformat(),
|
|
1967
|
+
},
|
|
1968
|
+
}
|
|
1969
|
+
|
|
1970
|
+
|
|
1971
|
+
@node_app.get("/api/v1/wallet/rewards")
|
|
1972
|
+
async def get_rewards_v1(start_date: Optional[str] = None, end_date: Optional[str] = None):
|
|
1973
|
+
"""Get reward history (SDK compatible)."""
|
|
1974
|
+
if not P2P or not P2P.ledger:
|
|
1975
|
+
raise HTTPException(status_code=503, detail="Ledger not available")
|
|
1976
|
+
|
|
1977
|
+
account = P2P.ledger.get_account_info()
|
|
1978
|
+
|
|
1979
|
+
return {
|
|
1980
|
+
"total": account.get("total_earned", 0.0),
|
|
1981
|
+
"by_day": [], # Would need daily tracking
|
|
1982
|
+
"by_type": {
|
|
1983
|
+
"uptime": 0.0,
|
|
1984
|
+
"inference": 0.0,
|
|
1985
|
+
"training": 0.0,
|
|
1986
|
+
},
|
|
1987
|
+
}
|
|
1988
|
+
|
|
1989
|
+
|
|
1990
|
+
@node_app.get("/api/v1/peers")
|
|
1991
|
+
async def get_peers_v1():
|
|
1992
|
+
"""List connected peers (SDK compatible)."""
|
|
1993
|
+
if not P2P:
|
|
1994
|
+
raise HTTPException(status_code=503, detail="P2P not available")
|
|
1995
|
+
|
|
1996
|
+
peers = []
|
|
1997
|
+
for peer_url, peer_info in P2P.known_peers.items():
|
|
1998
|
+
# Parse peer info
|
|
1999
|
+
peer_id = peer_info.get("id", peer_url)
|
|
2000
|
+
role = "worker"
|
|
2001
|
+
layers = []
|
|
2002
|
+
|
|
2003
|
+
if isinstance(peer_info, dict):
|
|
2004
|
+
if peer_info.get("has_embedding"):
|
|
2005
|
+
role = "driver"
|
|
2006
|
+
elif peer_info.get("has_lm_head"):
|
|
2007
|
+
role = "validator"
|
|
2008
|
+
layers = peer_info.get("layers", [])
|
|
2009
|
+
|
|
2010
|
+
peers.append({
|
|
2011
|
+
"id": peer_id,
|
|
2012
|
+
"address": peer_url,
|
|
2013
|
+
"role": role,
|
|
2014
|
+
"layers": layers,
|
|
2015
|
+
"latency_ms": 0.0,
|
|
2016
|
+
"connected_since": None,
|
|
2017
|
+
})
|
|
2018
|
+
|
|
2019
|
+
return {
|
|
2020
|
+
"peers": peers,
|
|
2021
|
+
"total": len(peers),
|
|
2022
|
+
}
|
|
2023
|
+
|
|
2024
|
+
|
|
2025
|
+
@node_app.get("/api/v1/layers")
|
|
2026
|
+
async def get_layers_v1():
|
|
2027
|
+
"""List assigned layers (SDK compatible)."""
|
|
2028
|
+
if not NEURO_NODE:
|
|
2029
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
2030
|
+
|
|
2031
|
+
layers = []
|
|
2032
|
+
for layer_id in NEURO_NODE.my_layer_ids:
|
|
2033
|
+
layer_type = "transformer"
|
|
2034
|
+
if layer_id == 0 and NEURO_NODE.model.has_embedding:
|
|
2035
|
+
layer_type = "embedding"
|
|
2036
|
+
|
|
2037
|
+
layers.append({
|
|
2038
|
+
"index": layer_id,
|
|
2039
|
+
"type": layer_type,
|
|
2040
|
+
"memory_mb": 0, # Would need per-layer tracking
|
|
2041
|
+
"status": "active",
|
|
2042
|
+
})
|
|
2043
|
+
|
|
2044
|
+
# Add LM head if present
|
|
2045
|
+
if NEURO_NODE.model.has_lm_head:
|
|
2046
|
+
layers.append({
|
|
2047
|
+
"index": max(NEURO_NODE.my_layer_ids) + 1 if NEURO_NODE.my_layer_ids else 0,
|
|
2048
|
+
"type": "lm_head",
|
|
2049
|
+
"memory_mb": 0,
|
|
2050
|
+
"status": "active",
|
|
2051
|
+
})
|
|
2052
|
+
|
|
2053
|
+
return {
|
|
2054
|
+
"layers": layers,
|
|
2055
|
+
"total_layers": len(NEURO_NODE.my_layer_ids),
|
|
2056
|
+
"my_layer_count": len(NEURO_NODE.my_layer_ids),
|
|
2057
|
+
}
|
|
2058
|
+
|
|
2059
|
+
|
|
2060
|
+
@node_app.get("/api/v1/config")
|
|
2061
|
+
async def get_config_v1():
|
|
2062
|
+
"""Get node configuration (SDK compatible)."""
|
|
2063
|
+
if not NEURO_NODE:
|
|
2064
|
+
raise HTTPException(status_code=503, detail="Node not ready")
|
|
2065
|
+
|
|
2066
|
+
port = STATE.get("port", 8000)
|
|
2067
|
+
|
|
2068
|
+
return {
|
|
2069
|
+
"node_id": NEURO_NODE.node_id,
|
|
2070
|
+
"port": port,
|
|
2071
|
+
"grpc_port": port + 1000,
|
|
2072
|
+
"tracker_url": "https://neuroshard.com/api/tracker",
|
|
2073
|
+
"training": {
|
|
2074
|
+
"enabled": NEURO_NODE.enable_training,
|
|
2075
|
+
"batch_size": 8,
|
|
2076
|
+
"learning_rate": 0.0001,
|
|
2077
|
+
"diloco_steps": STATE.get("diloco_inner_steps", 500),
|
|
2078
|
+
},
|
|
2079
|
+
"resources": {
|
|
2080
|
+
"max_memory_mb": STATE.get("config_memory_mb"),
|
|
2081
|
+
"cpu_threads": STATE.get("config_cpu_threads"),
|
|
2082
|
+
},
|
|
2083
|
+
}
|
|
2084
|
+
|
|
2085
|
+
|
|
2086
|
+
@node_app.patch("/api/v1/config")
|
|
2087
|
+
async def update_config_v1(updates: dict):
|
|
2088
|
+
"""Update node configuration (SDK compatible)."""
|
|
2089
|
+
updated = []
|
|
2090
|
+
|
|
2091
|
+
if "training" in updates:
|
|
2092
|
+
training = updates["training"]
|
|
2093
|
+
if "batch_size" in training:
|
|
2094
|
+
updated.append("training.batch_size")
|
|
2095
|
+
if "diloco_steps" in training:
|
|
2096
|
+
STATE["diloco_inner_steps"] = training["diloco_steps"]
|
|
2097
|
+
updated.append("training.diloco_steps")
|
|
2098
|
+
|
|
2099
|
+
if "resources" in updates:
|
|
2100
|
+
resources = updates["resources"]
|
|
2101
|
+
if "max_memory_mb" in resources:
|
|
2102
|
+
STATE["config_memory_mb"] = resources["max_memory_mb"]
|
|
2103
|
+
updated.append("resources.max_memory_mb")
|
|
2104
|
+
if "cpu_threads" in resources:
|
|
2105
|
+
STATE["config_cpu_threads"] = resources["cpu_threads"]
|
|
2106
|
+
updated.append("resources.cpu_threads")
|
|
2107
|
+
|
|
2108
|
+
return {
|
|
2109
|
+
"success": True,
|
|
2110
|
+
"updated": updated,
|
|
2111
|
+
"restart_required": False,
|
|
2112
|
+
}
|
|
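GET and PATCH on /api/v1/config give SDK clients a restart-free way to adjust the same limits as /api/throttle. A short sketch, assuming a local node; only the keys handled above have any effect:

import requests

BASE = "http://localhost:8000"

cfg = requests.get(f"{BASE}/api/v1/config", timeout=5).json()
print("current cpu_threads:", cfg["resources"]["cpu_threads"])

resp = requests.patch(
    f"{BASE}/api/v1/config",
    json={"resources": {"cpu_threads": 4}, "training": {"diloco_steps": 250}},
    timeout=5,
)
print("updated fields:", resp.json()["updated"])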
2113
|
+
|
|
2114
|
+
|
|
2115
|
+
# ==================== UTILITY FUNCTIONS ====================
|
|
2116
|
+
|
|
2117
|
+
def get_public_ip():
|
|
2118
|
+
"""Attempt to get the public IP address of this node."""
|
|
2119
|
+
try:
|
|
2120
|
+
services = [
|
|
2121
|
+
'https://api.ipify.org',
|
|
2122
|
+
'https://ifconfig.me/ip',
|
|
2123
|
+
'https://icanhazip.com'
|
|
2124
|
+
]
|
|
2125
|
+
for service in services:
|
|
2126
|
+
try:
|
|
2127
|
+
return requests.get(service, timeout=3).text.strip()
|
|
2128
|
+
except Exception:
|
|
2129
|
+
continue
|
|
2130
|
+
except Exception:
|
|
2131
|
+
pass
|
|
2132
|
+
return None
|
|
2133
|
+
|
|
2134
|
+
|
|
2135
|
+
def get_local_ip():
"""Best-effort LAN IP: connect a UDP socket to an unroutable address and read the socket's own source address (no packets are actually sent)."""
|
|
2136
|
+
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
|
2137
|
+
try:
|
|
2138
|
+
s.connect(('10.255.255.255', 1))
|
|
2139
|
+
IP = s.getsockname()[0]
|
|
2140
|
+
except Exception:
|
|
2141
|
+
IP = '127.0.0.1'
|
|
2142
|
+
finally:
|
|
2143
|
+
s.close()
|
|
2144
|
+
return IP
|
|
2145
|
+
|
|
2146
|
+
|
|
2147
|
+
def run_node(
|
|
2148
|
+
port: int,
|
|
2149
|
+
tracker: str = "https://neuroshard.com/api/tracker",
|
|
2150
|
+
node_token: Optional[str] = None,
|
|
2151
|
+
announce_ip: Optional[str] = None,
|
|
2152
|
+
announce_port: Optional[int] = None,
|
|
2153
|
+
enable_training: bool = True,
|
|
2154
|
+
available_memory_mb: Optional[float] = None,
|
|
2155
|
+
max_storage_mb: float = 100.0,
|
|
2156
|
+
max_cpu_threads: Optional[int] = None,
|
|
2157
|
+
diloco_inner_steps: int = 500,
|
|
2158
|
+
device: str = "auto",
|
|
2159
|
+
):
|
|
2160
|
+
"""
|
|
2161
|
+
Start a NeuroShard node.
|
|
2162
|
+
|
|
2163
|
+
TRULY DECENTRALIZED:
|
|
2164
|
+
- No fixed phases or model sizes
|
|
2165
|
+
- Node contributes based on available memory
|
|
2166
|
+
- More memory = more layers = more NEURO rewards
|
|
2167
|
+
|
|
2168
|
+
MULTI-NODE SUPPORT:
|
|
2169
|
+
- Same token on multiple machines/ports is now supported
|
|
2170
|
+
- Each instance gets unique network identity (for layers)
|
|
2171
|
+
- Earnings accumulate to the same NEURO wallet
|
|
2172
|
+
|
|
2173
|
+
Args:
|
|
2174
|
+
port: HTTP port
|
|
2175
|
+
tracker: Tracker URL for peer discovery
|
|
2176
|
+
node_token: Authentication token
|
|
2177
|
+
enable_training: Whether to participate in training
|
|
2178
|
+
available_memory_mb: Override memory detection (for testing)
|
|
2179
|
+
max_storage_mb: Maximum disk space for training data shards
|
|
2180
|
+
max_cpu_threads: Maximum CPU threads to use for training
|
|
2181
|
+
"""
|
|
2182
|
+
global NEURO_NODE, P2P
|
|
2183
|
+
|
|
2184
|
+
# CRITICAL: Clear shutdown flag from previous run (for GUI restart support)
|
|
2185
|
+
_SHUTDOWN_REQUESTED.clear()
|
|
2186
|
+
|
|
2187
|
+
# Reset STATE for fresh start (important for GUI restart)
|
|
2188
|
+
STATE.clear()
|
|
2189
|
+
STATE.update({
|
|
2190
|
+
"shard_range": "Unknown",
|
|
2191
|
+
"peer_count": 0,
|
|
2192
|
+
"processed_count": 0,
|
|
2193
|
+
"training_updates": 0,
|
|
2194
|
+
"token_count": 0,
|
|
2195
|
+
"training_batches": 0,
|
|
2196
|
+
"assigned_layers": [],
|
|
2197
|
+
"has_embedding": False,
|
|
2198
|
+
"has_lm_head": False,
|
|
2199
|
+
})
|
|
2200
|
+
|
|
2201
|
+
logger.info(f"Starting NeuroShard Node {__version__} on Port {port}")
|
|
2202
|
+
|
|
2203
|
+
# Multi-node detection and info
|
|
2204
|
+
from neuroshard.utils.hardware import get_instance_id, get_machine_id
|
|
2205
|
+
instance_id = get_instance_id(port)
|
|
2206
|
+
machine_id = get_machine_id()
|
|
2207
|
+
|
|
2208
|
+
logger.info(f"Machine ID: {machine_id}")
|
|
2209
|
+
logger.info(f"Instance ID: {instance_id} (machine:port unique)")
|
|
2210
|
+
|
|
2211
|
+
if node_token:
|
|
2212
|
+
wallet_id = hashlib.sha256(node_token.encode()).hexdigest()[:16]
|
|
2213
|
+
logger.info(f"Wallet ID: {wallet_id}... (NEURO earnings go here)")
|
|
2214
|
+
logger.info("=" * 50)
|
|
2215
|
+
logger.info("MULTI-NODE INFO:")
|
|
2216
|
+
logger.info(" Same token on multiple machines? Each gets unique assignment")
|
|
2217
|
+
logger.info("=" * 50)
|
|
2218
|
+
logger.info(f"Dashboard available at http://localhost:{port}/")
|
|
2219
|
+
logger.info(f"Max training data storage: {max_storage_mb}MB")
|
|
2220
|
+
|
|
2221
|
+
# Thread configuration
|
|
2222
|
+
# Note: For GUI mode, this is already set in gui_runner.py wrapper
|
|
2223
|
+
# For CLI mode, we do our best here (may fail if torch already initialized)
|
|
2224
|
+
if max_cpu_threads:
|
|
2225
|
+
logger.info(f"Limiting CPU threads to: {max_cpu_threads}")
|
|
2226
|
+
|
|
2227
|
+
# Set environment variables (these always work)
|
|
2228
|
+
import os
|
|
2229
|
+
os.environ['OMP_NUM_THREADS'] = str(max_cpu_threads)
|
|
2230
|
+
os.environ['MKL_NUM_THREADS'] = str(max_cpu_threads)
|
|
2231
|
+
os.environ['OPENBLAS_NUM_THREADS'] = str(max_cpu_threads)
|
|
2232
|
+
|
|
2233
|
+
# Try to set PyTorch threads (may fail if already set)
|
|
2234
|
+
try:
|
|
2235
|
+
torch.set_num_threads(max_cpu_threads)
|
|
2236
|
+
torch.set_num_interop_threads(max(1, max_cpu_threads // 2))
|
|
2237
|
+
except RuntimeError:
|
|
2238
|
+
# Already configured (likely by GUI wrapper or torch initialized)
|
|
2239
|
+
pass
|
|
2240
|
+
|
|
2241
|
+
# Lower process priority (to not hog system resources)
|
|
2242
|
+
try:
|
|
2243
|
+
if sys.platform == 'win32':
|
|
2244
|
+
# Windows: Use SetPriorityClass
|
|
2245
|
+
import ctypes
|
|
2246
|
+
kernel32 = ctypes.windll.kernel32
|
|
2247
|
+
BELOW_NORMAL_PRIORITY_CLASS = 0x00004000
|
|
2248
|
+
kernel32.SetPriorityClass(kernel32.GetCurrentProcess(), BELOW_NORMAL_PRIORITY_CLASS)
|
|
2249
|
+
logger.info("Process priority lowered (Windows BELOW_NORMAL)")
|
|
2250
|
+
elif hasattr(os, 'nice'):
|
|
2251
|
+
# Unix/Mac: Use nice
|
|
2252
|
+
os.nice(10)
|
|
2253
|
+
logger.info("Process priority lowered (nice=10)")
|
|
2254
|
+
except Exception:
|
|
2255
|
+
pass
|
|
2256
|
+
|
|
2257
|
+
if node_token:
|
|
2258
|
+
logger.info(f"Authenticated with Token: {node_token[:8]}...")
|
|
2259
|
+
|
|
2260
|
+
# FULLY DECENTRALIZED INITIALIZATION ORDER:
|
|
2261
|
+
# 1. Setup networking FIRST (so DHT is available for layer discovery)
|
|
2262
|
+
# 2. Initialize P2P BEFORE creating the node
|
|
2263
|
+
# 3. Create node WITH P2P connected (uses DHT for network discovery)
|
|
2264
|
+
# This ensures layer assignment can use DHT to detect existing nodes!
|
|
2265
|
+
|
|
2266
|
+
token_for_id = node_token or str(uuid.uuid4())
|
|
2267
|
+
|
|
2268
|
+
# 1. Setup networking FIRST
|
|
2269
|
+
from neuroshard.core.network.nat import NATTraverser
|
|
2270
|
+
nat = NATTraverser()
|
|
2271
|
+
|
|
2272
|
+
ip_addr = announce_ip or nat.discover_public_ip() or get_public_ip() or get_local_ip()
|
|
2273
|
+
|
|
2274
|
+
# UPnP mapping
|
|
2275
|
+
nat.attempt_upnp_mapping(port, "TCP", "NeuroShard HTTP")
|
|
2276
|
+
nat.attempt_upnp_mapping(port + 1000, "TCP", "NeuroShard gRPC")
|
|
2277
|
+
|
|
2278
|
+
final_announce_port = announce_port or port
|
|
2279
|
+
logger.info(f"Announcing as: {ip_addr}:{final_announce_port}")
|
|
2280
|
+
|
|
2281
|
+
my_url = f"http://{ip_addr}:{final_announce_port}"
|
|
2282
|
+
|
|
2283
|
+
# 2. Initialize P2P BEFORE creating the node
|
|
2284
|
+
# Use temporary shard_range "0-0" - will be updated after layer assignment
|
|
2285
|
+
# This allows DHT to be available for network discovery during layer assignment!
|
|
2286
|
+
P2P = P2PManager(my_url, "0-0", tracker, node_token=node_token)
|
|
2287
|
+
P2P.state_ref = STATE
|
|
2288
|
+
|
|
2289
|
+
# CRITICAL: Synchronously fetch peers and populate routing table BEFORE node creation!
|
|
2290
|
+
# The background thread might not have run yet, so we do it explicitly here.
|
|
2291
|
+
logger.info("DHT bootstrapping... (discovering existing nodes)")
|
|
2292
|
+
import time
|
|
2293
|
+
import hashlib as hashlib_module # Avoid shadowing issues
|
|
2294
|
+
|
|
2295
|
+
try:
|
|
2296
|
+
import requests
|
|
2297
|
+
from urllib.parse import urlparse
|
|
2298
|
+
from neuroshard.core.network.dht import Node
|
|
2299
|
+
|
|
2300
|
+
# Fetch ALL peers from tracker
|
|
2301
|
+
resp = requests.get(f"{tracker}/peers", params={"limit": 100}, timeout=5)
|
|
2302
|
+
if resp.status_code == 200:
|
|
2303
|
+
peers = resp.json()
|
|
2304
|
+
peer_count = 0
|
|
2305
|
+
for p in peers:
|
|
2306
|
+
if p.get("url") != my_url:
|
|
2307
|
+
P2P.known_peers[p["url"]] = p
|
|
2308
|
+
# Add to DHT routing table so layer lookups can find them!
|
|
2309
|
+
if P2P.routing_table:
|
|
2310
|
+
try:
|
|
2311
|
+
p_parsed = urlparse(p["url"])
|
|
2312
|
+
p_ip = p_parsed.hostname
|
|
2313
|
+
p_port = p_parsed.port or 80
|
|
2314
|
+
p_id = int(hashlib_module.sha1(f"{p['url']}".encode()).hexdigest(), 16)
|
|
2315
|
+
P2P.routing_table.add_contact(Node(p_id, p_ip, p_port))
|
|
2316
|
+
peer_count += 1
|
|
2317
|
+
except Exception:
|
|
2318
|
+
pass
|
|
2319
|
+
if peer_count > 0:
|
|
2320
|
+
logger.info(f"DHT: Added {peer_count} peers to routing table")
|
|
2321
|
+
except Exception as e:
|
|
2322
|
+
logger.debug(f"Peer discovery failed: {e}")
|
|
2323
|
+
|
|
2324
|
+
# Additional wait to let DHT stabilize
|
|
2325
|
+
time.sleep(1)
|
|
2326
|
+
|
|
2327
|
+
logger.info(f"Initializing NeuroShard Node (training={enable_training}, DiLoCo steps={diloco_inner_steps})...")
|
|
2328
|
+
|
|
2329
|
+
# 3. Create swarm config
|
|
2330
|
+
swarm_config = SwarmNodeConfig(
|
|
2331
|
+
diloco_inner_steps=diloco_inner_steps,
|
|
2332
|
+
)
|
|
2333
|
+
|
|
2334
|
+
# 4. Create node WITH P2P already available
|
|
2335
|
+
# This allows layer assignment to use DHT for network discovery!
|
|
2336
|
+
NEURO_NODE = create_swarm_node_with_p2p(
|
|
2337
|
+
node_token=token_for_id,
|
|
2338
|
+
port=port,
|
|
2339
|
+
tracker_url=tracker,
|
|
2340
|
+
config=swarm_config,
|
|
2341
|
+
available_memory_mb=available_memory_mb,
|
|
2342
|
+
enable_training=enable_training,
|
|
2343
|
+
max_storage_mb=max_storage_mb,
|
|
2344
|
+
max_cpu_threads=max_cpu_threads,
|
|
2345
|
+
device=device,
|
|
2346
|
+
p2p_manager=P2P, # Pass P2P so DHT is available during layer assignment!
|
|
2347
|
+
)
|
|
2348
|
+
|
|
2349
|
+
STATE["diloco_inner_steps"] = diloco_inner_steps
|
|
2350
|
+
|
|
2351
|
+
logger.info(f"NeuroLLM loaded: {NEURO_NODE.model.get_num_params() / 1e6:.1f}M parameters")
|
|
2352
|
+
logger.info(f"Assigned layers: {NEURO_NODE.my_layer_ids}")
|
|
2353
|
+
logger.info(f"Embedding: {NEURO_NODE.model.has_embedding}, LM Head: {NEURO_NODE.model.has_lm_head}")
|
|
2354
|
+
logger.info(f"DiLoCo: inner_steps={diloco_inner_steps}")
|
|
2355
|
+
|
|
2356
|
+
# EARLY NETWORK WARNING
|
|
2357
|
+
num_layers = len(NEURO_NODE.my_layer_ids)
|
|
2358
|
+
if num_layers > 50:
|
|
2359
|
+
logger.warning("⚠️ EARLY NETWORK NOTICE ⚠️")
|
|
2360
|
+
logger.warning(f"You're holding {num_layers} layers because the network is small.")
|
|
2361
|
+
logger.warning("This is TEMPORARY - as more nodes join, the model will be sharded.")
|
|
2362
|
+
|
|
2363
|
+
# Show initial memory usage
|
|
2364
|
+
try:
|
|
2365
|
+
import psutil
|
|
2366
|
+
process = psutil.Process()
|
|
2367
|
+
process_mem_mb = process.memory_info().rss / (1024 * 1024)
|
|
2368
|
+
logger.info(f"Current memory usage: {process_mem_mb:.0f}MB / {available_memory_mb or '?'}MB allocated")
|
|
2369
|
+
except Exception:
|
|
2370
|
+
pass
|
|
2371
|
+
|
|
2372
|
+
# 5. Update P2P shard_range with actual assigned layers
|
|
2373
|
+
layer_ids = NEURO_NODE.my_layer_ids
|
|
2374
|
+
if layer_ids:
|
|
2375
|
+
start_layer = min(layer_ids)
|
|
2376
|
+
end_layer = max(layer_ids)
|
|
2377
|
+
shard_range = f"{start_layer}-{end_layer}"
|
|
2378
|
+
else:
|
|
2379
|
+
shard_range = "0-0"
|
|
2380
|
+
P2P.shard_range = shard_range
|
|
2381
|
+
P2P.start_layer = start_layer if layer_ids else 0
|
|
2382
|
+
P2P.end_layer = end_layer if layer_ids else 0
|
|
2383
|
+
STATE["shard_range"] = shard_range
|
|
2384
|
+
logger.info(f"P2P shard_range: {shard_range} (layers {layer_ids})")
|
|
2385
|
+
|
|
2386
|
+
# Set node role info for PoNW reward calculation
|
|
2387
|
+
STATE["assigned_layers"] = NEURO_NODE.my_layer_ids
|
|
2388
|
+
STATE["has_embedding"] = NEURO_NODE.model.has_embedding
|
|
2389
|
+
STATE["has_lm_head"] = NEURO_NODE.model.has_lm_head
|
|
2390
|
+
STATE["current_loss"] = NEURO_NODE.current_loss if NEURO_NODE.current_loss != float('inf') else None
|
|
2391
|
+
|
|
2392
|
+
logger.info(f"Connected to P2P network for distributed training")
|
|
2393
|
+
|
|
2394
|
+
# 6. Set up ROLE VERIFICATION to prevent fake Validator/Driver claims
|
|
2395
|
+
# This is CRITICAL for security - nodes can't claim roles they don't have
|
|
2396
|
+
def verify_node_role(node_id: str, claimed_embed: bool, claimed_head: bool):
|
|
2397
|
+
"""
|
|
2398
|
+
Verify that a node actually holds the layers it claims.
|
|
2399
|
+
|
|
2400
|
+
Uses THREE sources for verification (defense in depth):
|
|
2401
|
+
1. Local layer_pool (authoritative for nodes we know)
|
|
2402
|
+
2. Swarm heartbeat peer stats (for remote nodes we don't have in the local pool)
|
|
2403
|
+
3. Conservative fallback (accept the proof, but grant no role bonuses)
|
|
2404
|
+
|
|
2405
|
+
Returns: (is_valid, actual_has_embedding, actual_has_lm_head)
|
|
2406
|
+
"""
|
|
2407
|
+
import json
|
|
2408
|
+
import hashlib
|
|
2409
|
+
|
|
2410
|
+
# 1. LOCAL VERIFICATION (fastest, most authoritative)
|
|
2411
|
+
if NEURO_NODE.layer_pool:
|
|
2412
|
+
layer_0_holders = [a.node_id for a in NEURO_NODE.layer_pool.get_layer_holders(0)]
|
|
2413
|
+
last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1)
|
|
2414
|
+
last_layer_holders = [a.node_id for a in NEURO_NODE.layer_pool.get_layer_holders(last_layer)]
|
|
2415
|
+
|
|
2416
|
+
# Check if we know this node locally
|
|
2417
|
+
all_known_nodes = set(layer_0_holders + last_layer_holders)
|
|
2418
|
+
for assignments in NEURO_NODE.layer_pool.layer_assignments.values():
|
|
2419
|
+
for a in assignments:
|
|
2420
|
+
all_known_nodes.add(a.node_id)
|
|
2421
|
+
|
|
2422
|
+
if node_id in all_known_nodes:
|
|
2423
|
+
# We know this node - verify against local data
|
|
2424
|
+
actual_embed = node_id in layer_0_holders
|
|
2425
|
+
actual_head = node_id in last_layer_holders
|
|
2426
|
+
|
|
2427
|
+
is_valid = True
|
|
2428
|
+
if claimed_head and not actual_head:
|
|
2429
|
+
is_valid = False
|
|
2430
|
+
if claimed_embed and not actual_embed:
|
|
2431
|
+
is_valid = False
|
|
2432
|
+
|
|
2433
|
+
return is_valid, actual_embed, actual_head
|
|
2434
|
+
|
|
2435
|
+
# 2. HEARTBEAT/PEER_STATS VERIFICATION (from swarm router)
|
|
2436
|
+
# Heartbeats contain node_id AND layer_range - this is the best source for remote nodes!
|
|
2437
|
+
# Note: swarm_components contains SwarmComponents (router, buffers, etc.)
|
|
2438
|
+
if hasattr(NEURO_NODE, 'swarm_components') and NEURO_NODE.swarm_components and hasattr(NEURO_NODE.swarm_components, 'swarm_router'):
|
|
2439
|
+
router = NEURO_NODE.swarm_components.swarm_router
|
|
2440
|
+
if hasattr(router, 'peer_stats') and node_id in router.peer_stats:
|
|
2441
|
+
peer = router.peer_stats[node_id]
|
|
2442
|
+
layer_range = peer.layer_range # (start, end) tuple
|
|
2443
|
+
|
|
2444
|
+
# Get last layer from our layer pool
|
|
2445
|
+
last_layer = max(0, NEURO_NODE.layer_pool.current_num_layers - 1) if NEURO_NODE.layer_pool else 0
|
|
2446
|
+
|
|
2447
|
+
# Driver = holds layer 0
|
|
2448
|
+
actual_embed = layer_range[0] == 0
|
|
2449
|
+
# Validator = holds last layer
|
|
2450
|
+
actual_head = last_layer in range(layer_range[0], layer_range[1])
|
|
2451
|
+
|
|
2452
|
+
is_valid = True
|
|
2453
|
+
if claimed_head and not actual_head:
|
|
2454
|
+
is_valid = False
|
|
2455
|
+
if claimed_embed and not actual_embed:
|
|
2456
|
+
is_valid = False
|
|
2457
|
+
|
|
2458
|
+
logger.debug(f"Role verification via heartbeat: {node_id[:16]}... "
|
|
2459
|
+
f"layers={layer_range}, embed={actual_embed}, head={actual_head}")
|
|
2460
|
+
return is_valid, actual_embed, actual_head
|
|
2461
|
+
|
|
2462
|
+
# 3. FALLBACK: For unknown nodes, use CONSERVATIVE verification
|
|
2463
|
+
# NOTE: DHT stores IP:port not node_id, so we can't verify roles via DHT alone
|
|
2464
|
+
# If we can't verify, we have two options:
|
|
2465
|
+
# a) REJECT all unknown claims (secure but might reject valid proofs)
|
|
2466
|
+
# b) ACCEPT but cap rewards (economic security)
|
|
2467
|
+
#
|
|
2468
|
+
# We use option (b) - the proof is ACCEPTED but role bonuses are NOT applied
|
|
2469
|
+
# This is handled in _calculate_reward by checking verified roles
|
|
2470
|
+
|
|
2471
|
+
# For now, if we can't verify, return "claims not verified"
|
|
2472
|
+
# The reward calculation should treat unverified claims as false
|
|
2473
|
+
logger.debug(f"Role verification: Node {node_id[:16]}... not in local pool, claims unverifiable")
|
|
2474
|
+
|
|
2475
|
+
# Return: valid=True (don't reject), but actual roles = False (no bonus)
|
|
2476
|
+
# This allows the proof through but without Validator/Driver bonuses
|
|
2477
|
+
return True, False, False
|
|
2478
|
+
|
|
2479
|
+
P2P.ledger.set_role_verifier(verify_node_role)
|
|
2480
|
+
logger.info("Role verification enabled - fake Validator/Driver claims will be REJECTED")
|
|
2481
|
+
|
|
2482
|
+
# Set model interface for training work verification
|
|
2483
|
+
P2P.ledger.set_model_interface(NEURO_NODE)
|
|
2484
|
+
|
|
2485
|
+
# 7. Start Swarm components
|
|
2486
|
+
if hasattr(NEURO_NODE, 'start_swarm_sync'):
|
|
2487
|
+
logger.info("[SWARM] Starting swarm components...")
|
|
2488
|
+
NEURO_NODE.start_swarm_sync()
|
|
2489
|
+
logger.info("[SWARM] Swarm components started")
|
|
2490
|
+
|
|
2491
|
+
# 8. Start gRPC Server
|
|
2492
|
+
start_grpc_background(port, NEURO_NODE, P2P, None)
|
|
2493
|
+
|
|
2494
|
+
# 9. Background tasks (runs every 1 second)
|
|
2495
|
+
def background_tasks():
|
|
2496
|
+
# CONTINUOUS TRAINING with USER-DEFINED THROTTLING
|
|
2497
|
+
# Respects user's CPU AND RAM limits to allow background operation without hogging resources
|
|
2498
|
+
# Settings are re-read each iteration so changes take effect immediately!
|
|
2499
|
+
|
|
2500
|
+
import psutil
|
|
2501
|
+
|
|
2502
|
+
# Store initial limits (can be updated via API)
|
|
2503
|
+
STATE["config_cpu_threads"] = max_cpu_threads
|
|
2504
|
+
STATE["config_memory_mb"] = available_memory_mb
|
|
2505
|
+
STATE["config_storage_mb"] = max_storage_mb
|
|
2506
|
+
|
|
2507
|
+
total_cpu_cores = psutil.cpu_count() or 4
|
|
2508
|
+
total_ram_mb = psutil.virtual_memory().total / (1024 * 1024)
|
|
2509
|
+
last_throttle_log = 0
|
|
2510
|
+
|
|
2511
|
+
def calculate_throttle():
|
|
2512
|
+
"""Calculate throttle settings from current config (allows live updates)."""
|
|
2513
|
+
# Read current config (can be updated via API while running)
|
|
2514
|
+
user_cpu_limit = STATE.get("config_cpu_threads") or total_cpu_cores
|
|
2515
|
+
user_ram_limit = STATE.get("config_memory_mb") or (total_ram_mb * 0.7)
|
|
2516
|
+
|
|
2517
|
+
cpu_ratio = min(1.0, user_cpu_limit / total_cpu_cores)
|
|
2518
|
+
ram_ratio = min(1.0, user_ram_limit / total_ram_mb)
|
|
2519
|
+
resource_ratio = min(cpu_ratio, ram_ratio)
|
|
2520
|
+
|
|
2521
|
+
# GPU nodes can train much faster without lagging the system
|
|
2522
|
+
is_gpu = NEURO_NODE.device in ["cuda", "mps"] if NEURO_NODE else False
|
|
2523
|
+
|
|
2524
|
+
# Log device status occasionally to debug "why is it slow?"
|
|
2525
|
+
# Use time.time() directly to avoid closure issues with 'now'
|
|
2526
|
+
current_time = time.time()
|
|
2527
|
+
if current_time - last_throttle_log >= 60:
|
|
2528
|
+
current_device = NEURO_NODE.device if NEURO_NODE else 'None'
|
|
2529
|
+
logger.debug(f"[NODE] Device: {current_device} (is_gpu={is_gpu})")
|
|
2530
|
+
|
|
2531
|
+
base_interval = 0.01 if is_gpu else 2.0
|
|
2532
|
+
|
|
2533
|
+
interval = max(base_interval, base_interval / max(0.1, resource_ratio))
|
|
2534
|
+
# Allow much higher steps per minute on GPU
|
|
2535
|
+
base_max_steps = 600 if is_gpu else 30
|
|
2536
|
+
max_steps = max(5, int(base_max_steps * resource_ratio))
|
|
2537
|
+
|
|
2538
|
+
# Store for API access
|
|
2539
|
+
STATE["throttle_cpu_ratio"] = cpu_ratio
|
|
2540
|
+
STATE["throttle_ram_ratio"] = ram_ratio
|
|
2541
|
+
STATE["throttle_effective"] = resource_ratio
|
|
2542
|
+
STATE["throttle_interval"] = interval
|
|
2543
|
+
STATE["throttle_max_steps"] = max_steps
|
|
2544
|
+
|
|
2545
|
+
return interval, max_steps, resource_ratio
|
|
2546
|
+
|
|
2547
|
+
# Initial calculation and log
|
|
2548
|
+
min_interval_between_steps, max_steps_per_minute, resource_ratio = calculate_throttle()
|
|
2549
|
+
logger.info(f"[NODE] Training throttle: effective={resource_ratio*100:.0f}%, "
|
|
2550
|
+
f"interval={min_interval_between_steps:.1f}s, max={max_steps_per_minute} steps/min")
|
|
2551
|
+
|
|
2552
|
+
last_train_complete = 0
|
|
2553
|
+
# BUGFIX: Initialize to current values (may be >0 if loaded from checkpoint)
|
|
2554
|
+
last_tokens = NEURO_NODE.total_tokens_processed if NEURO_NODE else 0
|
|
2555
|
+
        last_training_rounds = NEURO_NODE.total_training_rounds if NEURO_NODE else 0
        training_in_progress = False
        consecutive_data_not_ready = 0
        steps_this_minute = 0
        training_step_count = 0 # Track total steps for logging
        minute_start = time.time()
        last_memory_report = 0 # For periodic memory usage logging
        last_training_heartbeat = 0 # For periodic training loop status

        while not _SHUTDOWN_REQUESTED.is_set():
            now = time.time()

            # Reset per-minute counter
            if now - minute_start >= 60:
                steps_this_minute = 0
                minute_start = now

            # RE-CALCULATE THROTTLE periodically (allows live config changes)
            # Only recalculate every 5 seconds to avoid overhead
            if now - last_throttle_log >= 5:
                new_interval, new_max_steps, new_ratio = calculate_throttle()
                # Log only if changed significantly
                if abs(new_ratio - resource_ratio) > 0.05:
                    logger.info(f"[NODE] Throttle updated: {new_ratio*100:.0f}% "
                                f"(interval={new_interval:.1f}s, max={new_max_steps}/min)")
                min_interval_between_steps = new_interval
                max_steps_per_minute = new_max_steps
                resource_ratio = new_ratio
                last_throttle_log = now

            # Update peer count
            STATE["peer_count"] = len(P2P.known_peers)

            # TRAINING LOOP HEARTBEAT (every 30 seconds) - confirms loop is running
            if now - last_training_heartbeat >= 30:
                last_training_heartbeat = now
                data_status = "unknown"
                if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
                    try:
                        loader = NEURO_NODE.genesis_loader
                        loaded = len(loader.loaded_shards)
                        prefetch = len(loader._prefetch_ready)
                        data_status = f"loaded={loaded},prefetch={prefetch}"
                    except Exception:
                        data_status = "error"
                logger.debug(f"[NODE] Training loop alive: status={STATE.get('training_status', '?')}, "
                             f"steps={training_step_count}, data={data_status}")

            # PERIODIC MEMORY REPORT (every 60 seconds)
            if now - last_memory_report >= 60:
                try:
                    import os
                    process = psutil.Process(os.getpid())
                    process_mem_mb = process.memory_info().rss / (1024 * 1024)
                    memory_limit = STATE.get("config_memory_mb") or available_memory_mb
                    system_mem = psutil.virtual_memory()

                    logger.info(f"[NODE] Memory: process={process_mem_mb:.0f}MB / {memory_limit or '?'}MB limit, "
                                f"system={system_mem.percent:.0f}% ({system_mem.used/(1024**3):.1f}GB / {system_mem.total/(1024**3):.1f}GB)")

                    # Show Genesis data loader stats if training
                    if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
                        loader = NEURO_NODE.genesis_loader
                        stats = loader.get_stats()
                        num_loaded = stats.get('loaded_shards', 0)
                        num_prefetched = stats.get('prefetch_ready', 0)
                        shard_id = stats.get('current_shard_id', '?')
                        shard_progress = stats.get('shard_progress_pct', 0)
                        loss_avg = stats.get('loss_avg', 0)

                        logger.info(f"[NODE] Genesis: shard {shard_id} ({shard_progress:.0f}% done), "
                                    f"{num_loaded} loaded + {num_prefetched} prefetched")

                        # Show loss plateau status if loss is tracked
                        if loss_avg > 0:
                            loss_var = stats.get('loss_variance', 0)
                            steps_shard = stats.get('steps_on_current_shard', 0)
                            min_steps = 100 # Minimum steps before plateau can trigger rotation

                            # Plateau = low variance + low loss + enough steps
                            is_plateau = loss_var < 0.02 and loss_avg < 0.05 and steps_shard >= min_steps
                            if is_plateau:
                                plateau_status = "will_rotate"
                            elif loss_var < 0.02 and loss_avg < 0.05:
                                plateau_status = f"plateau (need {min_steps - steps_shard} more steps)"
                            else:
                                plateau_status = "learning"

                            logger.info(f"[NODE] Training: loss_avg={loss_avg:.4f}, variance={loss_var:.6f}, "
                                        f"steps_on_shard={steps_shard}, status={plateau_status}")

                    last_memory_report = now
                except Exception:
                    pass

            # Update token count and training batches from node
            current_tokens = NEURO_NODE.total_tokens_processed
            current_training = NEURO_NODE.total_training_rounds

            # Add DELTA to STATE counters (for PoNW proof calculation)
            # NOTE: last_tokens/last_training_rounds are initialized to current values
            # at startup to handle checkpoint loading correctly
            STATE["token_count"] = STATE.get("token_count", 0) + (current_tokens - last_tokens)
            STATE["training_batches"] = STATE.get("training_batches", 0) + (current_training - last_training_rounds)

            last_tokens = current_tokens
            last_training_rounds = current_training

            # Store totals for display
            STATE["total_tokens_processed"] = current_tokens
            STATE["total_training_rounds"] = current_training

            # Update model hash for PoNW proofs
            # IMPORTANT: Must use same hash algorithm as SwarmEnabledDynamicNode._get_model_hash()
            # to ensure proofs verify correctly
            if NEURO_NODE.model:
                if hasattr(NEURO_NODE, '_get_model_hash'):
                    # Use the swarm node's hash method for consistency
                    STATE["model_hash"] = NEURO_NODE._get_model_hash()
                else:
                    # Fallback: compute architecture-based hash (same logic as factory.py)
                    hasher = hashlib.sha256()
                    arch_str = f"{NEURO_NODE.model.hidden_dim}:{len(NEURO_NODE.my_layer_ids)}:{getattr(NEURO_NODE.model, 'num_heads', 0)}"
                    hasher.update(arch_str.encode())
                    for name, param in sorted(NEURO_NODE.model.named_parameters()):
                        hasher.update(f"{name}:{list(param.shape)}".encode())
                    STATE["model_hash"] = hasher.hexdigest()[:16]

            # Session cleanup
            to_remove = [sid for sid, ts in SESSION_TIMESTAMPS.items() if now - ts > 300]
            for sid in to_remove:
                del SESSION_TIMESTAMPS[sid]

            # Marketplace cleanup (every 60 seconds)
            if int(now) % 60 == 0:
                market = P2P.ledger.inference_market
                # Cleanup stale claims
                stale = market.cleanup_stale_claims()
                if stale > 0:
                    logger.info(f"[MARKET] Cleaned up {stale} stale claims")
                # Cleanup old results
                market.cleanup_old_results()

            # VALIDATOR ELIGIBILITY CHECK
            # Ensure validators still meet stake requirements when tier changes
            if NEURO_NODE and NEURO_NODE.layer_pool:
                def get_node_stake(node_id: str) -> float:
                    """Get stake for a node (checks local ledger)."""
                    if node_id == NEURO_NODE.node_id:
                        return P2P.ledger.get_account_info().get("stake", 0.0)
                    # For remote nodes, we'd need to query their stake
                    # For now, assume they meet requirements (trust but verify via gossip)
                    return float('inf')

                # Check if any validators need demotion
                demoted = NEURO_NODE.layer_pool.validate_all_validators(get_node_stake)

                # If we were demoted, disable our LM head
                if NEURO_NODE.node_id in demoted and NEURO_NODE.model:
                    NEURO_NODE.model.disable_lm_head()
                    logger.warning("[NODE] Self-demoted from Validator due to stake tier change")

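            # NOTE: get_node_stake() above returns float('inf') for remote peers, so in
            # practice only the local node can be demoted by this check; remote validators
            # are taken on trust here (see the comment inside get_node_stake).
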
            # CONTINUOUS TRAINING with smart throttling:
            # 1. Training must be enabled
            # 2. NEURO_NODE must exist
            # 3. No training currently in progress
            # 4. Minimum interval since last step (for system responsiveness)
            # 5. Haven't exceeded max steps per minute (optional throttle)
            should_train = (
                enable_training and
                not training_in_progress and
                (now - last_train_complete) >= min_interval_between_steps and
                steps_this_minute < max_steps_per_minute
            )

            if should_train:
                # MEMORY WARNING: Log if over limit (rate-limited to once per 60s)
                # Note: This is informational only - we don't skip training because
                # the --memory flag is a HINT for layer calculation, not a hard cap
                try:
                    import os
                    process = psutil.Process(os.getpid())
                    process_mem_mb = process.memory_info().rss / (1024 * 1024)
                    memory_limit = STATE.get("config_memory_mb") or available_memory_mb

                    # Rate-limit warning to once per 60 seconds
                    last_mem_warning = STATE.get("_last_mem_warning", 0)
                    if memory_limit and process_mem_mb > memory_limit * 1.2 and (now - last_mem_warning) >= 60:
                        STATE["_last_mem_warning"] = now
                        system_mem = psutil.virtual_memory()
                        logger.info(f"[NODE] Memory note: process={process_mem_mb:.0f}MB (limit={memory_limit}MB is a hint, not cap)")
                        logger.info(f"[NODE] System has {system_mem.available / (1024**3):.1f}GB available - training continues normally")

                        # Only clear caches if system memory is actually low (>80% used)
                        if system_mem.percent > 80:
                            logger.warning(f"[NODE] System memory high ({system_mem.percent}%), clearing caches...")
                            if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
                                loader = NEURO_NODE.genesis_loader
                                current_shard = loader.assigned_shard_ids[loader.current_shard_idx % len(loader.assigned_shard_ids)] if loader.assigned_shard_ids else None
                                shards_to_remove = [sid for sid in loader.loaded_shards.keys() if sid != current_shard]
                                for sid in shards_to_remove:
                                    del loader.loaded_shards[sid]
                                loader._prefetch_ready.clear()
                            import gc
                            gc.collect()
                            if NEURO_NODE.device == "cuda":
                                torch.cuda.empty_cache()
                            elif NEURO_NODE.device == "mps":
                                torch.mps.empty_cache()
                except Exception:
                    pass

                # Check if data is ready (non-blocking)
                data_ready = False
                if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
                    try:
                        # Use timeout to prevent lock contention from blocking training loop
                        data_ready = NEURO_NODE.genesis_loader.is_data_ready()
                    except Exception as e:
                        logger.warning(f"[GENESIS] is_data_ready() error: {e}")
                        data_ready = False

                    # Show shard download status periodically
                    if not data_ready and consecutive_data_not_ready % 5 == 0:
                        try:
                            stats = NEURO_NODE.genesis_loader.get_stats()
                            logger.info(f"[GENESIS] Status: assigned={stats.get('assigned_shards', 0)} shards, "
                                        f"loaded={stats.get('loaded_shards', 0)}, "
                                        f"prefetching={stats.get('prefetch_in_progress', 0)}")
                        except Exception:
                            pass
                    elif data_ready and training_step_count == 0:
                        logger.info(f"[GENESIS] Data ready! Starting first training step...")
                        training_step_count = 1 # Prevent repeat message
                else:
                    # No genesis loader yet - first training step will create it
                    data_ready = True

                if data_ready or consecutive_data_not_ready > 3:
                    training_in_progress = True
                    consecutive_data_not_ready = 0
                    step_start = time.time()

                    STATE["training_status"] = "running"

                    # Debug: Log why we're training
                    if not data_ready:
                        logger.debug(f"[NODE] Forcing training step after {consecutive_data_not_ready} waits")

                    try:
                        loss = NEURO_NODE.train_step()
                        step_duration = time.time() - step_start

                        if loss is not None:
                            steps_this_minute += 1
                            training_step_count += 1

                            # Get LR from DiLoCo trainer if available
                            lr_info = ""
                            # Note: swarm_components contains SwarmComponents (DiLoCo, etc.)
                            if hasattr(NEURO_NODE, 'swarm_components') and NEURO_NODE.swarm_components:
                                diloco = getattr(NEURO_NODE.swarm_components, 'diloco_trainer', None)
                                if diloco:
                                    current_lr = diloco.get_current_lr()
                                    lr_info = f", lr={current_lr:.2e}"

                            # Log every step with timing info
                            logger.info(f"[NODE] Training step #{NEURO_NODE.total_training_rounds}: "
                                        f"loss={loss:.4f}{lr_info} ({step_duration:.1f}s)")
                            STATE["training_status"] = "idle"
                            STATE["last_loss"] = loss
                            STATE["current_loss"] = loss # For gossip proof creation
                        else:
                            # train_step returned None - log why
                            logger.info(f"[NODE] Training step returned None (took {step_duration:.1f}s)")
                            STATE["training_status"] = "waiting_for_data"

                    except RuntimeError as e:
                        error_msg = str(e).lower()
                        if "not ready" in error_msg:
                            if consecutive_data_not_ready == 0:
                                logger.info(f"[NODE] Waiting for Genesis data to download...")
                                if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
                                    stats = NEURO_NODE.genesis_loader.get_stats()
                                    logger.info(f"[GENESIS] Downloading shard... "
                                                f"(assigned: {stats.get('assigned_shards', '?')}, "
                                                f"loaded: {stats.get('loaded_shards', 0)}, "
                                                f"prefetching: {stats.get('prefetch_in_progress', 0)})")
                            STATE["training_status"] = "loading_data"
                            consecutive_data_not_ready += 1
                        elif "genesis loader init failed" in error_msg or "manifest" in error_msg:
                            # Genesis loader initialization error - show details
                            logger.error(f"[GENESIS] ERROR: {e}")
                            STATE["training_status"] = "genesis_error"
                            # Don't spam - wait before retrying
                            time.sleep(10)
                        else:
                            logger.error(f"[NODE] Training error: {e}")
                            STATE["training_status"] = "error"
                    except Exception as e:
                        logger.error(f"[NODE] Training error: {e}")
                        STATE["training_status"] = "error"

                    training_in_progress = False
                    last_train_complete = time.time()
                else:
                    consecutive_data_not_ready += 1
                    if consecutive_data_not_ready == 1:
                        logger.info(f"[NODE] Waiting for training data to load...")

            # Heartbeat for layers (only every 10 seconds to reduce overhead)
            if int(now) % 10 == 0 and NEURO_NODE.layer_pool:
                NEURO_NODE.layer_pool.heartbeat(NEURO_NODE.node_id, NEURO_NODE.my_layer_ids)

                # Cleanup stale layer assignments (every 60 seconds)
                if int(now) % 60 == 0:
                    removed = NEURO_NODE.layer_pool.cleanup_stale_assignments()
                    if removed > 0:
                        logger.info(f"[LAYER_POOL] Cleaned up {removed} stale layer assignments")

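            # NOTE: the int(now) % N == 0 checks in this loop (layer heartbeat, stale-assignment
            # cleanup, and the tokenizer refresh below) only fire when an iteration happens to
            # land on a matching second, so they are approximate schedules rather than exact timers.
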
            # TOKENIZER AUTO-REFRESH: Check for vocab updates every 10 minutes
            # Synced with MANIFEST_REFRESH_INTERVAL (600s) in GenesisDataLoader
            # This ensures model embedding expands when tokenizer grows
            if int(now) % 600 == 0: # Every 10 minutes (matches data loader refresh)
                try:
                    if hasattr(NEURO_NODE, '_load_learned_tokenizer'):
                        old_vocab = NEURO_NODE.tokenizer.current_vocab_size if NEURO_NODE.tokenizer else 0
                        NEURO_NODE._load_learned_tokenizer()
                        new_vocab = NEURO_NODE.tokenizer.current_vocab_size if NEURO_NODE.tokenizer else 0
                        if new_vocab > old_vocab:
                            logger.info(f"[TOKENIZER] Vocab updated: {old_vocab:,} → {new_vocab:,} tokens")
                except Exception as e:
                    logger.debug(f"[TOKENIZER] Refresh check failed: {e}")

            # RESOURCE-AWARE SLEEP: Adjust based on system load
            # This ensures we're a good citizen when running in the background
            try:
                current_cpu = psutil.cpu_percent(interval=None) # Non-blocking
                current_mem = psutil.virtual_memory().percent

                # If system is under heavy load (not from us), back off
                if current_cpu > 90 or current_mem > 90:
                    time.sleep(5) # Back off significantly if system is stressed
                    continue

                # Dynamic sleep based on activity and user's CPU setting
                if training_in_progress:
                    time.sleep(0.1) # Fast loop during active training
                else:
                    # Check if data is likely ready (quick check without blocking)
                    likely_data_ready = False
                    if hasattr(NEURO_NODE, 'genesis_loader') and NEURO_NODE.genesis_loader:
                        try:
                            loader = NEURO_NODE.genesis_loader
                            # Quick non-locking check - just look at dict sizes
                            likely_data_ready = bool(loader._prefetch_ready or loader.loaded_shards or loader.current_dataset is not None)
                        except Exception:
                            pass

                    if likely_data_ready:
                        # Data might be ready - use shorter interval
                        time.sleep(min_interval_between_steps * 0.5)
                    else:
                        time.sleep(1) # Slower loop when idle/waiting
            except:
                time.sleep(1) # Fallback if psutil fails

    threading.Thread(target=background_tasks, daemon=True).start()

    # DRIVER WORKER LOOP: Poll marketplace AND process requests
    def driver_worker_loop():
        """
        PRODUCTION-READY Driver Worker Loop

        1. Polls marketplace for pending requests
        2. Claims requests assigned to this driver
        3. Waits for encrypted prompt from user
        4. Processes inference through distributed pipeline
        5. Submits PoNW proof for rewards
        """
        import time

        # Check if this node is a driver
        is_driver = NEURO_NODE and NEURO_NODE.model.has_embedding

        if not is_driver:
            logger.info("[DRIVER] Not a driver node - skipping marketplace worker loop")
            return

        logger.info("[DRIVER] Starting PRODUCTION marketplace worker loop...")
        logger.info(f"[DRIVER] Will poll for requests assigned to: {NEURO_NODE.node_id[:16]}...")

        # Import encrypted prompt handling
        from neuroshard.core.network.encrypted_channel import PromptEncryption, PromptQueue

        prompt_queue = PromptQueue()

        # Store in node for API access
        NEURO_NODE.prompt_queue = prompt_queue

        last_claim_attempt = 0
        processing_requests = {} # request_id -> asyncio.Task

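        # Expected flow, as inferred from the handlers below (not a separately documented
        # contract): a user submits a marketplace request, this driver claims it, the user
        # POSTs the encrypted prompt to /api/driver/prompt/<request_id>, and process_request()
        # decrypts it, runs NEURO_NODE.generate(), and stores the output via market.store_result().
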
        def process_request(request_id: str):
            """Process a single inference request using existing distributed inference."""
            try:
                # Get marketplace request for parameters
                market = P2P.ledger.inference_market

                market_request = market.get_request(request_id)
                if not market_request:
                    logger.warning(f"[DRIVER] ✗ Request {request_id[:8]}... not found in marketplace")
                    return

                # Get encrypted prompt
                encrypted_prompt = prompt_queue.get_prompt(request_id)

                if not encrypted_prompt:
                    logger.warning(f"[DRIVER] ✗ No prompt found for {request_id[:8]}...")
                    return

                # Decrypt prompt
                try:
                    prompt_text = PromptEncryption.decrypt_prompt(
                        encrypted_prompt.encrypted_data,
                        request_id
                    )
                    logger.info(f"[DRIVER] ✓ Decrypted prompt: '{prompt_text[:50]}...'")
                except Exception as e:
                    logger.error(f"[DRIVER] ✗ Failed to decrypt prompt: {e}")
                    return

                # Process using EXISTING distributed inference
                try:
                    output = NEURO_NODE.generate(
                        prompt=prompt_text,
                        max_new_tokens=market_request.tokens_requested,
                        temperature=0.8
                    )

                    logger.info(f"[DRIVER] ✓ Generated: '{output[:100]}...'")
                    logger.info(f"[DRIVER] ✓ Request {request_id[:8]}... completed")
                    processing_requests[request_id] = "completed"

                    # Store result in marketplace
                    market.store_result(request_id, output)

                except Exception as e:
                    logger.error(f"[DRIVER] ✗ Generation failed: {e}")
                    import traceback
                    traceback.print_exc()
                    processing_requests[request_id] = "failed"

            except Exception as e:
                logger.error(f"[DRIVER] ✗ Error processing {request_id[:8]}...: {e}")
                import traceback
                logger.error(traceback.format_exc())
                processing_requests[request_id] = "failed"

        while not _SHUTDOWN_REQUESTED.is_set():
            now = time.time()

            # STEP 1: Poll marketplace for new requests (every 5 seconds)
            if now - last_claim_attempt >= 5:
                try:
                    market = P2P.ledger.inference_market

                    # Try to claim a request
                    request = market.claim_request(NEURO_NODE.node_id)

                    if request:
                        logger.info(f"[DRIVER] ✓ Claimed request {request.request_id[:8]}... "
                                    f"({request.tokens_requested} tokens @ {request.locked_price:.6f} NEURO/1M)")

                        # Start pipeline session
                        market.start_pipeline_session(
                            request_id=request.request_id,
                            session_id=request.pipeline_session_id,
                            driver_node_id=NEURO_NODE.node_id
                        )

                        # Check if we already have the prompt
                        if prompt_queue.has_prompt(request.request_id):
                            logger.info(f"[DRIVER] ✓ Prompt already received, processing immediately")
                            # Process immediately
                            process_request(request.request_id)
                        else:
                            logger.info(f"[DRIVER] Waiting for encrypted prompt from user...")
                            logger.info(f"[DRIVER] User should POST to /api/driver/prompt/{request.request_id[:8]}...")
                            processing_requests[request.request_id] = None # Mark as waiting

                except Exception as e:
                    if "not found" not in str(e).lower():
                        logger.error(f"[DRIVER] Marketplace poll error: {e}")

                last_claim_attempt = now

            # STEP 2: Check for prompts that arrived for waiting requests
            for request_id in list(processing_requests.keys()):
                if processing_requests[request_id] is None: # Waiting for prompt
                    if prompt_queue.has_prompt(request_id):
                        logger.info(f"[DRIVER] ✓ Prompt received for {request_id[:8]}..., starting processing")
                        # Process (uses existing distributed inference)
                        process_request(request_id)
                        processing_requests[request_id] = "processing" # Mark as processing

            # STEP 3: Cleanup finished requests
            for request_id in list(processing_requests.keys()):
                if processing_requests[request_id] == "completed":
                    del processing_requests[request_id]

            # STEP 4: Cleanup old prompts
            prompt_queue.cleanup_old_prompts()

            time.sleep(1) # Fast loop for responsiveness

    # Start driver worker loop if this is a driver node
    if NEURO_NODE and NEURO_NODE.model.has_embedding:
        threading.Thread(target=driver_worker_loop, daemon=True).start()

    # 6. Run HTTP Server
    logger.info("=" * 50)
    logger.info("NeuroShard Node Ready!")
    logger.info(f" Device: {NEURO_NODE.device.upper()}")
    logger.info(f" My Layers: {NEURO_NODE.my_layer_ids}")
    logger.info(f" My Params: {NEURO_NODE.model.get_num_params() / 1e6:.1f}M")
    logger.info(f" Embedding: {NEURO_NODE.model.has_embedding}")
    logger.info(f" LM Head: {NEURO_NODE.model.has_lm_head}")
    logger.info(f" Training: {'Enabled' if enable_training else 'Disabled'}")
    logger.info(f" DiLoCo: sync every {diloco_inner_steps} steps")
    logger.info("=" * 50)
    logger.info("TRULY DECENTRALIZED: Model grows with network capacity!")
    logger.info("=" * 50)

    # Custom log config: disable access logs and customize startup messages
    # Handle Windows GUI mode where stdout may be None
    if sys.stdout is not None and hasattr(sys.stdout, 'write'):
        log_config = {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {"format": "[NODE] %(message)s"},
            },
            "handlers": {
                "default": {
                    "formatter": "default",
                    "class": "logging.StreamHandler",
                    "stream": "ext://sys.stdout",
                },
            },
            "loggers": {
                # Suppress uvicorn's default startup messages (including "Press CTRL+C")
                "uvicorn": {"handlers": ["default"], "level": "WARNING", "propagate": False},
                "uvicorn.error": {"handlers": ["default"], "level": "WARNING", "propagate": False},
                "uvicorn.access": {"handlers": [], "level": "CRITICAL", "propagate": False},
            },
        }
    else:
        # Fallback to file logging when stdout is unavailable (Windows frozen GUI)
        log_dir = os.path.join(os.path.expanduser("~"), ".neuroshard")
        log_file = os.path.join(log_dir, "uvicorn.log")
        log_config = {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {
                "default": {"format": "[NODE] %(message)s"},
            },
            "handlers": {
                "default": {
                    "formatter": "default",
                    "class": "logging.handlers.RotatingFileHandler",
                    "filename": log_file,
                    "maxBytes": 5*1024*1024,
                    "backupCount": 2,
                    "encoding": "utf-8",
                },
            },
            "loggers": {
                # Suppress uvicorn's default startup messages
                "uvicorn": {"handlers": ["default"], "level": "WARNING", "propagate": False},
                "uvicorn.error": {"handlers": ["default"], "level": "WARNING", "propagate": False},
                "uvicorn.access": {"handlers": [], "level": "CRITICAL", "propagate": False},
            },
        }

    # Use Server object so we can stop it from outside (GUI shutdown)
    global _UVICORN_SERVER
    config = uvicorn.Config(node_app, host="0.0.0.0", port=port, log_config=log_config)
    _UVICORN_SERVER = uvicorn.Server(config)

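    # (uvicorn.Server exposes a should_exit flag; setting it from another thread makes
    # run() below return, which is presumably what the external/GUI shutdown relies on.)
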
    # Print our own clean startup message (without "Press CTRL+C")
    logger.info(f"[NODE] HTTP server started on port {port}")

    _UVICORN_SERVER.run()


def main():
    import signal
    import atexit

    # Register signal handlers for graceful shutdown
    def _signal_handler(signum, frame):
        logger.info(f"[NODE] Received signal {signum}, initiating graceful shutdown...")
        request_shutdown()
        sys.exit(0)

    # Handle Ctrl+C (SIGINT) and SIGTERM
    signal.signal(signal.SIGINT, _signal_handler)
    signal.signal(signal.SIGTERM, _signal_handler)

    # Also register atexit handler as backup
    atexit.register(lambda: request_shutdown() if NEURO_NODE else None)

    parser = argparse.ArgumentParser(description="NeuroShard Node Runner - Truly Decentralized LLM")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--tracker", type=str, default="https://neuroshard.com/api/tracker")
    parser.add_argument("--token", type=str, default=None,
                        help="Node Token OR 12-word mnemonic phrase for wallet access")
    parser.add_argument("--announce-ip", type=str, default=None, help="Force IP address to announce")
    parser.add_argument("--announce-port", type=int, default=None, help="Force port to announce")
    parser.add_argument("--no-training", action="store_true", help="Disable training (inference only)")
    parser.add_argument("--memory", type=int, default=None,
                        help="Override detected memory (MB) - for testing")
    parser.add_argument("--max-storage", type=int, default=100,
                        help="Max disk space for training data (MB)")
    parser.add_argument("--cpu-threads", type=int, default=None,
                        help="Max CPU threads to use")
    parser.add_argument("--diloco-steps", type=int, default=500,
                        help="DiLoCo inner steps before gradient sync (default: 500)")

    args = parser.parse_args()

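    # Example invocation (illustrative; the module path and values below are assumptions,
    # not taken from this file):
    #   python -m neuroshard.runner --port 8000 --token <node-token-or-12-word-mnemonic> \
    #       --memory 8192 --cpu-threads 4 --diloco-steps 500
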
    # Handle mnemonic input: If token is 12 words, convert to token
    node_token = args.token
    if node_token:
        words = node_token.strip().split()
        if len(words) == 12:
            # It's a BIP39 mnemonic - derive token from it
            try:
                from mnemonic import Mnemonic
                mnemo = Mnemonic("english")
                if mnemo.check(node_token):
                    # Convert mnemonic to deterministic token
                    seed = mnemo.to_seed(node_token, passphrase="")
                    node_token = seed[:32].hex() # Use first 32 bytes as token
                    logger.info("✅ Wallet recovered from mnemonic")
                else:
                    logger.warning("⚠️ Invalid mnemonic phrase - treating as raw token")
            except ImportError:
                logger.warning("⚠️ 'mnemonic' package not installed - treating as raw token")
            except Exception as e:
                logger.warning(f"⚠️ Mnemonic error: {e} - treating as raw token")

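    # BIP39 seed derivation is deterministic for a fixed passphrase, so the same 12-word
    # phrase always yields the same 32-byte token above and hence the same wallet identity.
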
    run_node(
        port=args.port,
        tracker=args.tracker,
        node_token=node_token,
        announce_ip=args.announce_ip,
        announce_port=args.announce_port,
        enable_training=not args.no_training,
        available_memory_mb=args.memory,
        max_storage_mb=args.max_storage,
        max_cpu_threads=args.cpu_threads,
        diloco_inner_steps=args.diloco_steps,
    )


if __name__ == "__main__":
    main()