nexaroa 0.0.111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
@@ -0,0 +1,895 @@
+ """
+ gRPC Server for NeuroShard Node
+
+ Handles:
+ 1. Inference requests (forward pass through NeuroLLM)
+ 2. Training gradient exchange
+ 3. DHT operations for peer discovery
+ 4. PoNW proof verification
+ 5. Layer-specific forward (for distributed inference)
+ """
+
+ import grpc
+ from concurrent import futures
+ import torch
+ import time
+ import threading
+ import random
+ from typing import Optional, Union
+ import logging
+
+ # Import generated protobuf code
+ from protos import neuroshard_pb2
+ from protos import neuroshard_pb2_grpc
+
+ from neuroshard.utils.serialization import deserialize_tensor, serialize_tensor
+ from neuroshard.core.network.p2p import P2PManager
+ from neuroshard.core.network.dht_service import DHTServiceMixin
+
+ # Swarm Service Mixin (Phase 4)
+ try:
+     from neuroshard.core.swarm.service import SwarmServiceMixin
+     SWARM_SERVICE_AVAILABLE = True
+ except ImportError:
+     SWARM_SERVICE_AVAILABLE = False
+     SwarmServiceMixin = object  # Fallback
+
+ logger = logging.getLogger(__name__)
+
+ # Global state
+ GRPC_SERVER = None
+
+
+ class NeuroShardServiceServicer(DHTServiceMixin, neuroshard_pb2_grpc.NeuroShardServiceServicer):
+     """
+     gRPC service for NeuroShard nodes.
+
+     Supports DynamicNeuroNode with layer-based routing.
+     Each node holds specific layers and can process inference requests for those layers.
+
+     Swarm Architecture:
+     - SwarmForward: Async activation forwarding with failover
+     - GetSwarmStatus: Buffer fill rates, capacity info
+     - UpdatePeerCapacity: TCP fallback for heartbeat updates
+     """
+
+     def __init__(self, model, p2p: P2PManager, swap_controller=None):
+         """
+         Args:
+             model: DynamicNeuroNode or SwarmEnabledDynamicNode instance
+             p2p: P2P manager for peer discovery
+             swap_controller: Deprecated, kept for compatibility
+         """
+         self.model = model
+         self.p2p = p2p
+         self.swap_controller = swap_controller
+
+         # Detect if this is a DynamicNeuroNode
+         self.is_dynamic_node = hasattr(model, 'my_layer_ids') and hasattr(model, 'layer_pool')
+         # The checkpoint/shard RPCs below gate on this flag; nodes exposing
+         # get_checkpoint_info() are treated as full NeuroNodes.
+         self.is_neuro_node = hasattr(model, 'get_checkpoint_info')
+
+         # Rate limiter for gradient gossip (per-node).
+         # Allows at most one gossip per node per 60 seconds (DiLoCo sync is every ~8-10 min).
+         self._gossip_rate_limit: dict = {}  # node_id -> last_gossip_time
+         self._gossip_rate_limit_seconds = 60  # Min seconds between gossips from the same node
+         self._gossip_rate_limit_lock = threading.Lock()
+
+         # Initialize DHT Mixin if available
+         if p2p.routing_table:
+             DHTServiceMixin.__init__(self, p2p.routing_table, p2p.dht_storage, ledger=p2p.ledger)
+         else:
+             from neuroshard.core.network.dht import RoutingTable, Node
+             dummy_rt = RoutingTable(Node(0, "0.0.0.0", 0))
+             DHTServiceMixin.__init__(self, dummy_rt, {}, ledger=p2p.ledger)
+
+         logger.info(f"gRPC Servicer initialized (DynamicNode={self.is_dynamic_node})")
+
+     def UnaryInference(self, request, context):
+         """Handle inference request."""
+         try:
+             # Deserialize input
+             input_str = request.tensor_data.decode('utf-8')
+             input_tensor = deserialize_tensor(input_str)
+
+             # Forward pass through DynamicNeuroNode
+             output = self.model.forward(input_tensor, session_id=request.session_id)
+             # Return result directly
+             return neuroshard_pb2.InferenceResponse(
+                 success=True,
+                 tensor_data=serialize_tensor(output).encode('utf-8')
+             )
+
+         except Exception as e:
+             logger.error(f"Inference error: {e}")
+             return neuroshard_pb2.InferenceResponse(success=False, error_message=str(e))
+
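For orientation, this is how a client drives the RPC above through the generated stub. A minimal sketch, assuming a local node whose HTTP port is 8000 (gRPC listens on HTTP port + 1000, per serve_grpc below); the input shape is made up, while InferenceRequest and its fields are taken from the handlers in this file:

    import grpc
    import torch

    from protos import neuroshard_pb2, neuroshard_pb2_grpc
    from neuroshard.utils.serialization import serialize_tensor, deserialize_tensor

    channel = grpc.insecure_channel("127.0.0.1:9000")  # HTTP port 8000 -> gRPC port 9000
    stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

    request = neuroshard_pb2.InferenceRequest(
        session_id="demo-session",
        tensor_data=serialize_tensor(torch.randn(1, 16)).encode("utf-8"),
    )
    response = stub.UnaryInference(request)
    if response.success:
        output = deserialize_tensor(response.tensor_data.decode("utf-8"))
        print(output.shape)
    else:
        print("inference failed:", response.error_message)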
+     def GetWeights(self, request, context):
+         """Return model weights (for gradient sync)."""
+         try:
+             # Get weights for my layers only
+             layer_weights = {}
+             for layer_id, layer in self.model.model.my_layers.items():
+                 layer_weights[f"layer_{layer_id}"] = layer.state_dict()
+
+             if self.model.model.embedding:
+                 layer_weights["embedding"] = self.model.model.embedding.state_dict()
+             if self.model.model.lm_head:
+                 layer_weights["lm_head"] = self.model.model.lm_head.state_dict()
+
+             data = serialize_tensor(layer_weights, use_quantization=False).encode('utf-8')
+             return neuroshard_pb2.WeightResponse(weights_data=data)
+         except Exception as e:
+             logger.error(f"GetWeights error: {e}")
+             return neuroshard_pb2.WeightResponse(weights_data=b"")
+
+     def GetTrainingStatus(self, request, context):
+         """Get training status."""
+         stats = self.model.get_stats()
+
+         return neuroshard_pb2.TrainingStatusResponse(
+             training_enabled=self.model.enable_training,
+             total_rounds=stats.get("total_training_rounds", 0),
+             current_loss=stats.get("current_loss", float('inf')),
+             tokens_processed=stats.get("total_tokens_processed", 0),
+         )
+
+     def GetPoNWProof(self, request, context):
+         """Get Proof of Neural Work."""
+         proof = self.model.get_ponw_proof()
+
+         return neuroshard_pb2.PoNWProofResponse(
+             node_id=proof.get("node_id", ""),
+             timestamp=proof.get("timestamp", 0),
+             tokens_processed=proof.get("tokens_processed", 0),
+             training_rounds=proof.get("training_rounds", 0),
+             signature=proof.get("signature", ""),
+         )
+
+     # ==================== LAYER-SPECIFIC FORWARD ====================
+
+     def LayerForward(self, request, context):
+         """
+         Forward hidden states through specific layers on this node.
+
+         This enables distributed inference:
+         1. Request comes with hidden states
+         2. We process through our assigned layers
+         3. Return processed hidden states
+         """
+         try:
+             # Deserialize input
+             input_str = request.tensor_data.decode('utf-8')
+             hidden_states = deserialize_tensor(input_str)
+
+             # Check if we have the requested layers
+             requested_layers = list(request.layer_ids)
+             my_layers = set(self.model.my_layer_ids)
+
+             if not all(l in my_layers for l in requested_layers):
+                 missing = set(requested_layers) - my_layers
+                 return neuroshard_pb2.LayerForwardResponse(
+                     success=False,
+                     error_message=f"Missing layers: {missing}"
+                 )
+
+             # Forward through requested layers
+             output = self.model.model.forward_my_layers(
+                 hidden_states,
+                 start_layer=min(requested_layers),
+                 end_layer=max(requested_layers) + 1
+             )
+
+             return neuroshard_pb2.LayerForwardResponse(
+                 success=True,
+                 tensor_data=serialize_tensor(output).encode('utf-8')
+             )
+
+         except Exception as e:
+             logger.error(f"LayerForward error: {e}")
+             return neuroshard_pb2.LayerForwardResponse(
+                 success=False,
+                 error_message=str(e)
+             )
+
+     def GetNodeInfo(self, request, context):
+         """Get information about this node's capabilities."""
+         stats = self.model.get_stats()
+
+         return neuroshard_pb2.NodeInfoResponse(
+             node_id=self.model.node_id,
+             layer_ids=self.model.my_layer_ids,
+             has_embedding=self.model.model.has_embedding if self.model.model else False,
+             has_lm_head=self.model.model.has_lm_head if self.model.model else False,
+             available_memory_mb=int(self.model.available_memory_mb),
+             total_params=stats.get("my_params", 0),
+         )
+
+     # ==================== DISTRIBUTED TRAINING RPCs ====================
+
+     def GossipGradient(self, request, context):
+         """
+         Receive gradient contribution from a peer.
+
+         This is the core of distributed training:
+         1. Peer computes gradients locally
+         2. Peer broadcasts via this RPC
+         3. We aggregate and apply updates
+
+         Rate limited to prevent spam attacks (1 gossip per node per 60s).
+         """
+         try:
+             # Check if training is enabled
+             if not self.model.enable_training:
+                 return neuroshard_pb2.GossipGradientResponse(
+                     accepted=False,
+                     reason="Training not enabled on this node"
+                 )
+
+             # RATE LIMITING: Prevent spam from malicious nodes
+             node_id = request.node_id
+             now = time.time()
+
+             with self._gossip_rate_limit_lock:
+                 last_gossip = self._gossip_rate_limit.get(node_id, 0)
+                 time_since_last = now - last_gossip
+
+                 if time_since_last < self._gossip_rate_limit_seconds:
+                     logger.warning(f"Rate limited gradient from {node_id[:8]}... "
+                                    f"(only {time_since_last:.1f}s since last, need {self._gossip_rate_limit_seconds}s)")
+                     return neuroshard_pb2.GossipGradientResponse(
+                         accepted=False,
+                         reason=f"Rate limited: wait {self._gossip_rate_limit_seconds - time_since_last:.0f}s"
+                     )
+
+                 # Update last gossip time
+                 self._gossip_rate_limit[node_id] = now
+
+                 # Clean up old entries (older than 10 minutes)
+                 stale_nodes = [nid for nid, ts in self._gossip_rate_limit.items() if now - ts > 600]
+                 for nid in stale_nodes:
+                     del self._gossip_rate_limit[nid]
+
+             # Convert protobuf to GradientContribution
+             from neuroshard.core.training.distributed import GradientContribution
+
+             contribution = GradientContribution(
+                 node_id=request.node_id,
+                 round_id=request.round_id,
+                 layer_gradients=dict(request.layer_gradients),  # Convert MapContainer to dict
+                 batch_size=request.batch_size,
+                 loss=request.loss,
+                 timestamp=request.timestamp,
+                 signature=request.signature
+             )
+
+             # Submit to NeuroNode for processing
+             success = self.model.receive_peer_gradients(contribution)
+
+             if success:
+                 logger.info(f"Received gradient from peer {request.node_id[:8]}... "
+                             f"(round={request.round_id}, batch={request.batch_size}, loss={request.loss:.4f})")
+
+             return neuroshard_pb2.GossipGradientResponse(
+                 accepted=success,
+                 reason="" if success else "Failed to process gradient",
+                 current_round=self.model.current_training_round
+             )
+
+         except Exception as e:
+             logger.error(f"GossipGradient error: {e}")
+             return neuroshard_pb2.GossipGradientResponse(
+                 accepted=False,
+                 reason=str(e)
+             )
+
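The sending side of this exchange is symmetric. A minimal sketch of broadcasting a local contribution, assuming the request message is named GossipGradientRequest with the fields read above, and that layer_gradients maps layer names to serialized gradient blobs (the node ID, payload, and signature below are placeholders):

    import time

    import grpc

    from protos import neuroshard_pb2, neuroshard_pb2_grpc

    channel = grpc.insecure_channel("127.0.0.1:9000")  # assumed peer address
    stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

    response = stub.GossipGradient(neuroshard_pb2.GossipGradientRequest(
        node_id="local-node-id",           # placeholder node ID
        round_id=42,
        layer_gradients={"layer_0": b""},  # placeholder serialized gradients
        batch_size=8,
        loss=2.17,
        timestamp=time.time(),
        signature="",                      # signed by the sender in production
    ))
    if not response.accepted:
        # The rate limiter answers with the remaining wait time; back off, don't hot-retry.
        print("gossip rejected:", response.reason)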
+     def GetCheckpointInfo(self, request, context):
+         """Get checkpoint info without downloading the full checkpoint."""
+         if not self.is_neuro_node:
+             return neuroshard_pb2.GetCheckpointInfoResponse()
+
+         try:
+             info = self.model.get_checkpoint_info()
+
+             return neuroshard_pb2.GetCheckpointInfoResponse(
+                 version=info.get("version", 0),
+                 model_hash=info.get("model_hash", ""),
+                 phase=info.get("phase", "bootstrap"),
+                 params=info.get("params", 0),
+                 loss=info.get("loss", float('inf'))
+             )
+
+         except Exception as e:
+             logger.error(f"GetCheckpointInfo error: {e}")
+             return neuroshard_pb2.GetCheckpointInfoResponse()
+
+     def GetCheckpoint(self, request, context):
+         """Download full checkpoint from this node."""
+         if not self.is_neuro_node:
+             return neuroshard_pb2.GetCheckpointResponse(
+                 success=False,
+                 error_message="Node does not support checkpoint sync"
+             )
+
+         try:
+             import io
+             import zlib
+
+             # Get model checkpoint
+             if not self.model.model:
+                 return neuroshard_pb2.GetCheckpointResponse(
+                     success=False,
+                     error_message="Model not loaded"
+                 )
+
+             # Serialize checkpoint
+             buffer = io.BytesIO()
+             checkpoint = {
+                 "model_state_dict": self.model.model.state_dict(),
+                 "config": {
+                     "phase": self.model.phase,
+                     "hidden_dim": self.model.model.config.hidden_dim,
+                     "num_layers": self.model.model.config.num_layers,
+                     "vocab_size": self.model.model.config.vocab_size,
+                 },
+                 "version": self.model.total_training_rounds,
+                 "model_hash": self.model._get_model_hash(),
+             }
+             torch.save(checkpoint, buffer)
+
+             # Compress
+             raw_data = buffer.getvalue()
+             compressed = zlib.compress(raw_data, level=6)
+
+             logger.info(f"Serving checkpoint: version={checkpoint['version']}, "
+                         f"size={len(compressed)/1024:.1f}KB (compressed from {len(raw_data)/1024:.1f}KB)")
+
+             return neuroshard_pb2.GetCheckpointResponse(
+                 success=True,
+                 version=checkpoint["version"],
+                 model_hash=checkpoint["model_hash"],
+                 phase=self.model.phase,
+                 checkpoint_data=compressed,
+                 total_size=len(compressed)
+             )
+
+         except Exception as e:
+             logger.error(f"GetCheckpoint error: {e}")
+             return neuroshard_pb2.GetCheckpointResponse(
+                 success=False,
+                 error_message=str(e)
+             )
+
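The consumer side reverses the framing above: fetch, zlib-decompress, then torch.load. A minimal sketch, assuming an argument-less request message named GetCheckpointRequest; the response fields and checkpoint keys come from the handler above:

    import io
    import zlib

    import grpc
    import torch

    from protos import neuroshard_pb2, neuroshard_pb2_grpc

    channel = grpc.insecure_channel("127.0.0.1:9000")  # assumed peer address
    stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

    response = stub.GetCheckpoint(neuroshard_pb2.GetCheckpointRequest())
    if response.success:
        raw = zlib.decompress(response.checkpoint_data)
        checkpoint = torch.load(io.BytesIO(raw), map_location="cpu")
        print("version:", response.version, "hash:", response.model_hash[:16])
        print("layers:", checkpoint["config"]["num_layers"])
    else:
        print("checkpoint fetch failed:", response.error_message)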
+     # ==================== PIPELINE PARALLELISM RPCs ====================
+
+     def PipelineForward(self, request, context):
+         """
+         PRODUCTION-READY Pipeline Forward with:
+         - Secure activation transfer (differential privacy + encryption)
+         - PoNW proof submission for marketplace rewards
+         - Full error handling and logging
+
+         Used for distributed inference: Driver → Workers → Validator
+         """
+         if not hasattr(self.model, 'forward_pipeline') and not hasattr(self.model, 'forward'):
+             return neuroshard_pb2.PipelineForwardResponse(
+                 success=False,
+                 error_message="Node does not support forward pass"
+             )
+
+         try:
+             import numpy as np
+             import hashlib
+
+             logger.info(f"[WORKER/VALIDATOR] Received pipeline forward for request {request.request_id[:8]}...")
+
+             # STEP 1: Validate checksum of received activations
+             received_checksum = hashlib.sha256(request.hidden_states).hexdigest()
+             logger.info(f"[SECURITY] Received activations checksum: {received_checksum[:16]}...")
+
+             # Deserialize hidden states (detect dtype from shape):
+             # 2D = input_ids (int64), 3D = hidden_states (float32)
+             if len(request.hidden_shape) == 2:
+                 dtype = np.int64
+                 logger.info("[WORKER] Detected input_ids (2D shape), deserializing as int64")
+             else:
+                 dtype = np.float32
+
+             hidden_states = torch.from_numpy(
+                 np.frombuffer(request.hidden_states, dtype=dtype).copy()  # .copy() makes it writable
+             ).reshape(list(request.hidden_shape))
+
+             # CRITICAL: Move tensors to the same device as the model!
+             # EC2 (CPU) sends tensors, Jetson (CUDA) needs them on GPU
+             model_device = getattr(self.model, 'device', 'cpu')
+             if hasattr(self.model, 'model') and hasattr(self.model.model, 'device'):
+                 model_device = self.model.model.device
+             hidden_states = hidden_states.to(model_device)
+
+             logger.info(f"[WORKER/VALIDATOR] Loaded activations: {hidden_states.shape} {hidden_states.dtype} on {hidden_states.device}")
+
+             # STEP 2: Process through our layers
+             # Deserialize attention mask if provided
+             attention_mask = None
+             if request.attention_mask:
+                 attention_mask = torch.from_numpy(
+                     np.frombuffer(request.attention_mask, dtype=np.float32).copy()
+                 ).to(model_device)
+
+             # Training labels (if provided)
+             training_labels = None
+             if request.training_labels:
+                 training_labels = torch.from_numpy(
+                     np.frombuffer(request.training_labels, dtype=np.int64).copy()
+                 ).to(model_device)
+
+             # Forward through our layers
+             if hasattr(self.model, 'forward_pipeline'):
+                 output, new_kv = self.model.forward_pipeline(
+                     hidden_states=hidden_states,
+                     attention_mask=attention_mask,
+                     training_labels=training_labels,
+                     session_id=request.session_id,
+                     sender_url=request.sender_url,
+                     use_cache=request.use_cache,
+                 )
+
+                 is_final = self.model.model.has_lm_head if hasattr(self.model, 'model') else False
+             else:
+                 # Legacy fallback
+                 output = self.model.forward(hidden_states)
+                 new_kv = None
+                 is_final = True
+
+             logger.info(f"[WORKER/VALIDATOR] Processed through layers: output shape {output.shape}, is_final={is_final}")
+
+             # STEP 3: Submit PoNW proof for this work (earn NEURO!)
+             if hasattr(self.model, 'ledger') and self.model.ledger and request.request_id:
+                 try:
+                     from neuroshard.core.economics.ledger import PoNWProof, sign_proof
+                     import uuid
+
+                     # Count tokens processed
+                     tokens_processed = output.shape[1] if len(output.shape) > 1 else output.shape[0]
+
+                     # Determine our role
+                     has_embedding = self.model.model.has_embedding if hasattr(self.model, 'model') else False
+                     has_lm_head = self.model.model.has_lm_head if hasattr(self.model, 'model') else False
+                     layers_held = len(self.model.my_layer_ids) if hasattr(self.model, 'my_layer_ids') else 1
+
+                     proof = PoNWProof(
+                         node_id=self.model.node_id if hasattr(self.model, 'node_id') else "unknown",
+                         proof_type="inference",
+                         timestamp=time.time(),
+                         nonce=str(uuid.uuid4()),
+                         tokens_processed=tokens_processed,
+                         request_id=request.request_id,
+                         has_embedding=has_embedding,
+                         has_lm_head=has_lm_head,
+                         layers_held=layers_held
+                     )
+
+                     # Sign and submit
+                     signed_proof = sign_proof(proof, self.model.node_token if hasattr(self.model, 'node_token') else "")
+                     success, reward, msg = self.model.ledger.process_proof(signed_proof)
+
+                     if success:
+                         role = "VALIDATOR" if has_lm_head else ("DRIVER" if has_embedding else "WORKER")
+                         pct = "15%" if (has_lm_head or has_embedding) else "70%"
+                         logger.info(f"[{role}] ✅ Proof submitted, earned {reward:.6f} NEURO ({pct} of pool)")
+                     else:
+                         logger.warning(f"[WORKER/VALIDATOR] ⚠️ Proof rejected: {msg}")
+
+                 except Exception as e:
+                     logger.error(f"[WORKER/VALIDATOR] Failed to submit proof: {e}")
+
+             # STEP 4: Serialize output and calculate checksum
+             output_bytes = output.detach().cpu().numpy().tobytes()
+             output_shape = list(output.shape)
+
+             # Calculate checksum for integrity verification
+             output_checksum = hashlib.sha256(output_bytes).hexdigest()
+             logger.info(f"[SECURITY] Sending output with checksum: {output_checksum[:16]}...")
+
+             response = neuroshard_pb2.PipelineForwardResponse(
+                 request_id=request.request_id,
+                 success=True,
+                 hidden_states=output_bytes,
+                 hidden_shape=output_shape,
+                 is_final=is_final,
+             )
+
+             # If final (validator), return logits
+             if is_final:
+                 response.logits = output_bytes
+                 # Repeated proto fields cannot be assigned directly; extend instead.
+                 response.logits_shape.extend(output_shape)
+
+                 if hasattr(self.model, 'current_loss'):
+                     response.loss = self.model.current_loss
+
+                 logger.info("[VALIDATOR] ✅ Final output generated, returning logits")
+
+             return response
+
+         except Exception as e:
+             logger.error(f"[WORKER/VALIDATOR] Pipeline error: {e}")
+             import traceback
+             traceback.print_exc()
+             return neuroshard_pb2.PipelineForwardResponse(
+                 request_id=request.request_id,
+                 success=False,
+                 error_message=str(e)
+             )
+
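On the wire, this RPC moves raw little-endian bytes plus a shape rather than serialize_tensor strings. A minimal driver-side sketch, assuming the request message is named PipelineForwardRequest with the fields read above; sending a 2D int64 tensor exercises the input_ids path that the dtype detection describes:

    import hashlib
    import uuid

    import grpc
    import numpy as np
    import torch

    from protos import neuroshard_pb2, neuroshard_pb2_grpc

    channel = grpc.insecure_channel("127.0.0.1:9000")  # assumed worker address
    stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

    input_ids = torch.randint(0, 1000, (1, 32), dtype=torch.int64)  # 2D -> int64 path
    payload = input_ids.numpy().tobytes()
    print("sending checksum:", hashlib.sha256(payload).hexdigest()[:16])

    response = stub.PipelineForward(neuroshard_pb2.PipelineForwardRequest(
        request_id=uuid.uuid4().hex,
        session_id="demo-session",
        hidden_states=payload,
        hidden_shape=list(input_ids.shape),
        use_cache=False,
    ))
    if response.success:
        out = np.frombuffer(response.hidden_states, dtype=np.float32)
        print("is_final:", response.is_final, "shape:", list(response.hidden_shape))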
+     def PipelineBackward(self, request, context):
+         """
+         Backward pass: propagate gradients back to previous node.
+         """
+         if not hasattr(self.model, 'backward_pipeline'):
+             return neuroshard_pb2.PipelineBackwardResponse(
+                 success=False,
+                 error_message="Node does not support backward pipeline"
+             )
+
+         try:
+             import numpy as np
+
+             # Deserialize gradients
+             grad_output = torch.from_numpy(
+                 np.frombuffer(request.grad_output, dtype=np.float32).copy()
+             ).reshape(list(request.grad_shape))
+
+             # CRITICAL: Move gradients to the same device as the model!
+             model_device = getattr(self.model, 'device', 'cpu')
+             if hasattr(self.model, 'model') and hasattr(self.model.model, 'device'):
+                 model_device = self.model.model.device
+             grad_output = grad_output.to(model_device)
+
+             # Run backward pipeline
+             self.model.backward_pipeline(
+                 grad_output=grad_output,
+                 session_id=request.session_id
+             )
+
+             return neuroshard_pb2.PipelineBackwardResponse(success=True)
+
+         except Exception as e:
+             logger.error(f"PipelineBackward error: {e}")
+             return neuroshard_pb2.PipelineBackwardResponse(
+                 success=False,
+                 error_message=str(e)
+             )
+
+     def GetShardChunk(self, request, context):
+         """
+         Serve a chunk of a data shard to a peer (Data Swarm).
+         """
+         if not hasattr(self.model, 'swarm') or not self.model.swarm:
+             return neuroshard_pb2.GetShardChunkResponse(
+                 success=False,
+                 error_message="Swarm not initialized on this node"
+             )
+
+         chunk_data = self.model.swarm.serve_chunk(request.shard_id, request.chunk_index)
+
+         if chunk_data:
+             return neuroshard_pb2.GetShardChunkResponse(
+                 success=True,
+                 data=chunk_data
+             )
+         else:
+             return neuroshard_pb2.GetShardChunkResponse(
+                 success=False,
+                 error_message="Chunk not found"
+             )
+
+     def GetShardInfo(self, request, context):
+         """Get shard information from this node."""
+         if not self.is_neuro_node:
+             return neuroshard_pb2.GetShardInfoResponse()
+
+         try:
+             if hasattr(self.model, 'get_shard_info'):
+                 info = self.model.get_shard_info()
+             else:
+                 # Regular NeuroNode - full model
+                 info = {
+                     "shard_id": 0,
+                     "total_shards": 1,
+                     "start_layer": 0,
+                     "end_layer": self.model.model.config.num_layers if self.model.model else 12,
+                     "has_embedding": True,
+                     "has_lm_head": True,
+                     "version": self.model.total_training_rounds,
+                     "model_hash": self.model._get_model_hash() if hasattr(self.model, '_get_model_hash') else "",
+                 }
+
+             return neuroshard_pb2.GetShardInfoResponse(
+                 shard_id=info.get("shard_id", 0),
+                 total_shards=info.get("total_shards", 1),
+                 start_layer=info.get("start_layer", 0),
+                 end_layer=info.get("end_layer", 12),
+                 has_embedding=info.get("has_embedding", True),
+                 has_lm_head=info.get("has_lm_head", True),
+                 version=info.get("version", 0),
+                 model_hash=info.get("model_hash", ""),
+                 available_memory_mb=info.get("available_memory_mb", 0),
+                 current_load=info.get("current_load", 0),
+             )
+
+         except Exception as e:
+             logger.error(f"GetShardInfo error: {e}")
+             return neuroshard_pb2.GetShardInfoResponse()
+
+     def _call_peer(self, peer_url, original_req, output_tensor):
+         """Call a peer node (legacy pipeline relay)."""
+         from urllib.parse import urlparse
+         from neuroshard.core.network.connection_pool import get_channel
+
+         parsed = urlparse(peer_url)
+         peer_ip = parsed.hostname
+         peer_http_port = parsed.port or (443 if parsed.scheme == 'https' else 80)
+         peer_grpc_addr = f"{peer_ip}:{peer_http_port + 1000}"
+
+         channel = get_channel(peer_grpc_addr)
+         stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)
+
+         fwd_req = neuroshard_pb2.InferenceRequest(
+             session_id=original_req.session_id,
+             request_id=original_req.request_id,
+             tensor_data=serialize_tensor(output_tensor).encode('utf-8'),
+             draft_tokens=original_req.draft_tokens,
+             sender_reputation=original_req.sender_reputation,
+             source_layer=getattr(self.model, 'end', 0)
+         )
+
+         return stub.UnaryInference(fwd_req)
+
+     def _perform_audit(self, primary_peer, original_req, output_tensor):
+         """Audit a peer by comparing with redundant peer."""
+         redundant_peer = self.p2p.get_redundant_hop(getattr(self.model, 'end', 0), primary_peer)
+         if not redundant_peer:
+             return
+
+         logger.debug(f"AUDITING: Checking {primary_peer} against {redundant_peer}...")
+
+         try:
+             res_redundant = self._call_peer(redundant_peer, original_req, output_tensor)
+
+             if res_redundant.success:
+                 logger.debug(f"AUDIT: Redundant peer {redundant_peer} responded successfully.")
+             else:
+                 logger.warning(f"AUDIT: Redundant peer {redundant_peer} failed: {res_redundant.error_message}")
+         except Exception as e:
+             logger.warning(f"Audit error: {e}")
+
+     # ==================== SWARM RPCs (Phase 4) ====================
+
+     def SwarmForward(self, request, context):
+         """
+         Swarm-style activation forwarding with async buffering.
+
+         This is the async-first activation forwarding for the swarm architecture.
+         Activations are queued in the inbound buffer and processed by ComputeEngine.
+
+         Unlike PipelineForward (sync), this returns immediately after queueing.
+         """
+         try:
+             import numpy as np
+             from neuroshard.core.swarm.buffers import ActivationPacket, ActivationPriority
+
+             # Deserialize activation (.copy() makes the read-only buffer writable)
+             hidden_states = torch.from_numpy(
+                 np.frombuffer(request.hidden_states, dtype=np.float32).copy()
+             ).reshape(list(request.hidden_shape))
+
+             # Create activation packet
+             packet = ActivationPacket(
+                 priority=request.priority or ActivationPriority.INFERENCE_NORMAL,
+                 timestamp=time.time(),
+                 session_id=request.session_id,
+                 micro_batch_id=request.micro_batch_id,
+                 tensor_data=hidden_states,
+                 source_node=request.source_node,
+                 target_layer=request.target_layer,
+                 is_backward=request.is_backward,
+                 requires_grad=request.requires_grad,
+             )
+
+             # Queue in inbound buffer (non-blocking).
+             # Note: swarm_components contains SwarmComponents (router, buffers, etc.);
+             # not to be confused with swarm (DataSwarm for P2P downloads).
+             inbound = getattr(self.model, 'swarm_components', None)
+             inbound = inbound.inbound_buffer if inbound else None
+             if inbound:
+                 success = inbound.put_nowait(packet)
+                 if success:
+                     return neuroshard_pb2.SwarmForwardResponse(
+                         success=True,
+                         queued=True,
+                         queue_position=len(inbound),
+                         buffer_fill_rate=inbound.fill_rate,
+                     )
+                 else:
+                     # Buffer full - backpressure
+                     return neuroshard_pb2.SwarmForwardResponse(
+                         success=False,
+                         error_message="Inbound buffer full (backpressure)",
+                         buffer_fill_rate=1.0,
+                     )
+             else:
+                 return neuroshard_pb2.SwarmForwardResponse(
+                     success=False,
+                     error_message="Inbound buffer not initialized"
+                 )
+
+         except Exception as e:
+             logger.error(f"SwarmForward error: {e}")
+             return neuroshard_pb2.SwarmForwardResponse(
+                 success=False,
+                 error_message=str(e)
+             )
+
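Because the handler only queues the packet, the caller owns backpressure handling. A minimal retry sketch, assuming a request message named SwarmForwardRequest with the fields read above and a caller-chosen exponential backoff; the activation and node ID are placeholders:

    import time

    import grpc
    import numpy as np

    from protos import neuroshard_pb2, neuroshard_pb2_grpc

    channel = grpc.insecure_channel("127.0.0.1:9000")  # assumed peer address
    stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

    activation = np.zeros((1, 32, 512), dtype=np.float32)  # placeholder activation
    request = neuroshard_pb2.SwarmForwardRequest(
        session_id="demo-session",
        micro_batch_id=0,
        hidden_states=activation.tobytes(),
        hidden_shape=list(activation.shape),
        source_node="local-node-id",  # placeholder
        target_layer=4,
    )

    for attempt in range(5):
        response = stub.SwarmForward(request)
        if response.success:
            print("queued at", response.queue_position, "fill", response.buffer_fill_rate)
            break
        # Buffer full (fill_rate reported as 1.0): back off before retrying.
        time.sleep(0.1 * 2 ** attempt)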
+     def GetSwarmStatus(self, request, context):
+         """
+         Get swarm node status: buffer fill rates, capacity, etc.
+
+         Used by peers to check node health before routing.
+         """
+         try:
+             status = self.model.get_swarm_status()
+
+             # Extract key metrics
+             inbound_fill = 0.0
+             outbound_fill = 0.0
+             inbound_depth = 0
+             outbound_depth = 0
+
+             if "inbound_buffer" in status:
+                 inbound_fill = status["inbound_buffer"].get("fill_rate", 0.0)
+                 inbound_depth = status["inbound_buffer"].get("queue_size", 0)
+             if "outbound_buffer" in status:
+                 outbound_fill = status["outbound_buffer"].get("fill_rate", 0.0)
+                 outbound_depth = status["outbound_buffer"].get("queue_size", 0)
+
+             # Get layer range
+             layer_start = min(self.model.my_layer_ids) if self.model.my_layer_ids else 0
+             layer_end = max(self.model.my_layer_ids) + 1 if self.model.my_layer_ids else 0
+
+             return neuroshard_pb2.SwarmStatusResponse(
+                 node_id=self.model.node_id,
+                 layer_start=layer_start,
+                 layer_end=layer_end,
+                 inbound_fill_rate=inbound_fill,
+                 outbound_fill_rate=outbound_fill,
+                 inbound_queue_depth=inbound_depth,
+                 outbound_queue_depth=outbound_depth,
+                 is_accepting_activations=inbound_fill < 0.95,
+             )
+
+         except Exception as e:
+             logger.error(f"GetSwarmStatus error: {e}")
+             return neuroshard_pb2.SwarmStatusResponse(error_message=str(e))
+
+     def UpdatePeerCapacity(self, request, context):
+         """
+         Receive capacity update from peer (TCP fallback for UDP heartbeat).
+
+         Used when UDP heartbeats fail (firewalls, etc.).
+         """
+         try:
+             # Update router with peer info.
+             # Note: swarm_components contains SwarmComponents (router, buffers, etc.)
+             router = getattr(self.model, 'swarm_components', None)
+             router = router.swarm_router if router else None
+             if router:
+                 from neuroshard.core.swarm.heartbeat import CapacityBitmask
+
+                 capacity = CapacityBitmask(
+                     node_id=request.node_id,
+                     timestamp=time.time(),
+                     available_memory_mb=request.available_memory_mb,
+                     queue_depth=request.queue_depth,
+                     layer_range=(request.layer_start, request.layer_end),
+                     gpu_utilization=request.gpu_utilization,
+                     network_saturation=request.network_saturation,
+                     is_training=request.is_training,
+                     is_accepting_inference=request.is_accepting_inference,
+                     is_accepting_activations=request.is_accepting_activations,
+                     grpc_addr=request.grpc_addr,
+                 )
+
+                 router.update_peer_from_heartbeat(capacity)
+
+                 return neuroshard_pb2.UpdatePeerCapacityResponse(accepted=True)
+
+             return neuroshard_pb2.UpdatePeerCapacityResponse(accepted=False)
+
+         except Exception as e:
+             logger.error(f"UpdatePeerCapacity error: {e}")
+             return neuroshard_pb2.UpdatePeerCapacityResponse(accepted=False)
+
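Together these two RPCs give routers a TCP-only health path when UDP heartbeats are filtered: poll GetSwarmStatus before dispatching, push UpdatePeerCapacity when heartbeats cannot get through. A minimal pre-routing check, assuming an empty request message named GetSwarmStatusRequest:

    import grpc

    from protos import neuroshard_pb2, neuroshard_pb2_grpc

    def peer_can_accept(addr: str) -> bool:
        """True when the peer's inbound buffer has headroom (fill rate < 0.95)."""
        channel = grpc.insecure_channel(addr)
        stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)
        try:
            status = stub.GetSwarmStatus(neuroshard_pb2.GetSwarmStatusRequest(), timeout=2.0)
        except grpc.RpcError:
            return False  # unreachable peers are treated as full
        return status.is_accepting_activations

    print(peer_can_accept("127.0.0.1:9000"))  # assumed peer address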
+     def GetDiLoCoStatus(self, request, context):
+         """
+         Get DiLoCo training status.
+
+         Returns inner step count, sync progress, etc.
+         """
+         try:
+             progress = self.model.get_diloco_progress()
+
+             return neuroshard_pb2.DiLoCoStatusResponse(
+                 enabled=progress.get("enabled", False),
+                 inner_step_count=progress.get("inner_step_count", 0),
+                 inner_steps_total=progress.get("inner_steps_total", 500),
+                 progress=progress.get("progress", 0.0),
+                 outer_step_count=progress.get("outer_step_count", 0),
+                 should_sync=progress.get("should_sync", False),
+             )
+
+         except Exception as e:
+             logger.error(f"GetDiLoCoStatus error: {e}")
+             return neuroshard_pb2.DiLoCoStatusResponse(enabled=False)
+
+
+ def serve_grpc(port: int, model, p2p: P2PManager, swap_controller=None):
+     """Start the gRPC server."""
+     global GRPC_SERVER
+
+     # Server options for P2P network - be lenient with keepalive pings.
+     # This is critical for decentralized networks where nodes ping frequently.
+     # IMPORTANT: Increase message size for activation tensors in pipeline training!
+     # Activation size = batch_size * seq_len * hidden_dim * 4 bytes.
+     # For batch=4, seq=512, hidden=512: ~4MB, but we need headroom.
+     MAX_MESSAGE_SIZE = 64 * 1024 * 1024  # 64MB for large batches/sequences
+     options = [
+         ('grpc.keepalive_time_ms', 10000),  # Send keepalive every 10s
+         ('grpc.keepalive_timeout_ms', 5000),  # 5s timeout for response
+         ('grpc.keepalive_permit_without_calls', True),  # Allow pings without active RPCs
+         ('grpc.http2.min_recv_ping_interval_without_data_ms', 5000),  # Accept pings every 5s
+         ('grpc.http2.max_ping_strikes', 0),  # Don't penalize frequent pings
+         ('grpc.max_receive_message_length', MAX_MESSAGE_SIZE),  # For receiving activations
+         ('grpc.max_send_message_length', MAX_MESSAGE_SIZE),  # For sending responses
+     ]
+
+     server = grpc.server(futures.ThreadPoolExecutor(max_workers=10), options=options)
+     neuroshard_pb2_grpc.add_NeuroShardServiceServicer_to_server(
+         NeuroShardServiceServicer(model, p2p, swap_controller), server
+     )
+
+     grpc_port = port + 1000
+     server.add_insecure_port(f'[::]:{grpc_port}')
+     server.start()
+     GRPC_SERVER = server
+
+     logger.info(f"gRPC Server started on port {grpc_port}")
+
+     try:
+         # Wait until server is stopped externally via stop_grpc()
+         server.wait_for_termination()
+     except KeyboardInterrupt:
+         server.stop(0)
+     finally:
+         GRPC_SERVER = None
+         logger.info(f"gRPC Server on port {grpc_port} terminated")
+
+
+ def start_grpc_background(port: int, model, p2p: P2PManager, swap_controller=None):
+     """Start gRPC server in background thread."""
+     t = threading.Thread(target=serve_grpc, args=(port, model, p2p, swap_controller), daemon=True)
+     t.start()
+
+
+ def stop_grpc(timeout: float = 5.0):
+     """Stop the gRPC server gracefully."""
+     global GRPC_SERVER
+     if GRPC_SERVER is not None:
+         logger.info("Stopping gRPC server...")
+         try:
+             # stop() returns an event that is set when shutdown is complete
+             event = GRPC_SERVER.stop(grace=timeout)
+             event.wait(timeout=timeout)
+             logger.info("gRPC server stopped")
+         except Exception as e:
+             logger.warning(f"Error stopping gRPC server: {e}")
+         finally:
+             GRPC_SERVER = None
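Taken together, the three module-level functions define the server lifecycle a runner would use. A minimal sketch, assuming model and p2p objects constructed elsewhere in the package:

    from neuroshard.grpc_server import start_grpc_background, stop_grpc

    def run_node(model, p2p):
        """model: DynamicNeuroNode, p2p: P2PManager (built elsewhere in the package)."""
        start_grpc_background(port=8000, model=model, p2p=p2p)  # gRPC binds to [::]:9000
        try:
            ...  # run the node's HTTP server / main loop here
        finally:
            stop_grpc(timeout=5.0)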