nexaroa-0.0.111-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/core/swarm/service.py
@@ -0,0 +1,640 @@
+ """
+ Swarm Service Mixin for gRPC Integration
+
+ Integrates Phase 1 swarm components with the existing gRPC server:
+ - SwarmRouter for multipath routing with failover
+ - ActivationBuffer for async compute decoupling
+ - ComputeEngine for "Don't Stop" soft overflow handling
+ - SwarmHeartbeat for capacity advertisement
+
+ This mixin is designed to be added to the existing NeuroShardServiceServicer.
+ """
+
+ import asyncio
+ import time
+ import logging
+ import threading
+ import numpy as np
+ from typing import Dict, Optional, Tuple, Any
+ from dataclasses import dataclass, field
+ from concurrent.futures import ThreadPoolExecutor
+
+ import torch
+
+ from neuroshard.core.swarm.router import SwarmRouter, PeerCandidate
+ from neuroshard.core.swarm.buffers import (
+     ActivationBuffer,
+     OutboundBuffer,
+     ActivationPacket,
+ )
+ from neuroshard.core.swarm.heartbeat import SwarmHeartbeatService, CapacityBitmask
+ from neuroshard.core.swarm.compute import ComputeEngine, StepOutcome, ComputeStats
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class SwarmNodeState:
+     """State container for swarm node operation."""
+     node_id: str
+     layer_range: Tuple[int, int]
+     grpc_addr: str
+
+     # Capacity
+     available_memory_mb: int = 0
+     gpu_utilization: float = 0.0
+
+     # Status
+     is_training: bool = False
+     is_accepting_inference: bool = True
+     is_accepting_activations: bool = True
+
+     # Timestamps
+     last_updated: float = field(default_factory=time.time)
+
+
+ class SwarmServiceMixin:
+     """
+     Mixin class that adds Swarm functionality to NeuroShardServiceServicer.
+
+     Provides:
+     - Async activation handling via ActivationBuffer
+     - Fault-tolerant routing via SwarmRouter
+     - Capacity broadcasting via SwarmHeartbeatService
+     - Decoupled compute via ComputeEngine
+
+     Usage:
+         class NeuroShardServiceServicer(SwarmServiceMixin, DHTServiceMixin, ...):
+             def __init__(self, ...):
+                 SwarmServiceMixin.__init__(self, model, layer_pool)
+                 ...
+     """
+
+     # Configuration
+     INBOUND_BUFFER_SIZE = 100
+     OUTBOUND_BUFFER_SIZE = 50
+     HEARTBEAT_PORT = 9999
+
+     def __init_swarm__(
+         self,
+         model,
+         layer_pool,
+         p2p_manager=None,
+         enable_heartbeat: bool = True,
+         enable_compute_engine: bool = True,
+     ):
+         """
+         Initialize swarm components.
+
+         Args:
+             model: DynamicNeuroNode or DynamicNeuroLLM instance
+             layer_pool: DynamicLayerPool for peer discovery
+             p2p_manager: P2PManager for network communication
+             enable_heartbeat: Whether to start heartbeat service
+             enable_compute_engine: Whether to start compute engine
+         """
+         self.swarm_model = model
+         self.layer_pool = layer_pool
+         self.p2p = p2p_manager
+
+         # Extract node info
+         self.swarm_node_id = getattr(model, 'node_id', 'unknown')
+         self.swarm_layer_ids = getattr(model, 'my_layer_ids', [])
+
+         # Initialize buffers
+         self.inbound_buffer = ActivationBuffer(max_size=self.INBOUND_BUFFER_SIZE)
+         self.outbound_buffer = OutboundBuffer(
+             max_size=self.OUTBOUND_BUFFER_SIZE,
+             soft_overflow_threshold=0.9,
+             hard_overflow_threshold=0.99
+         )
+
+         # Initialize router (uses DHT/layer_pool for peer discovery)
+         dht = getattr(p2p_manager, 'routing_table', None) if p2p_manager else None
+         self.swarm_router = SwarmRouter(dht_protocol=dht, layer_pool=layer_pool)
+
+         # Initialize heartbeat service
+         self.heartbeat_service = None
+         if enable_heartbeat:
+             self.heartbeat_service = SwarmHeartbeatService(
+                 node_id=self.swarm_node_id,
+                 udp_port=self.HEARTBEAT_PORT
+             )
+
+         # Initialize compute engine (runs async)
+         self.compute_engine = None
+         self._compute_thread = None
+         self._async_loop = None
+
+         # State tracking
+         self.swarm_state = SwarmNodeState(
+             node_id=self.swarm_node_id,
+             layer_range=self._get_layer_range(),
+             grpc_addr=getattr(model, 'grpc_addr', ''),
+         )
+
+         # Statistics
+         self.swarm_stats = {
+             'activations_received': 0,
+             'activations_sent': 0,
+             'failovers': 0,
+             'soft_overflows': 0,
+             'hard_overflows': 0,
+         }
+
+         logger.info(f"SwarmServiceMixin initialized for node {self.swarm_node_id}")
+         logger.info(f"  Layers: {self.swarm_layer_ids}")
+         logger.info(f"  Inbound buffer: {self.INBOUND_BUFFER_SIZE}")
+         logger.info(f"  Outbound buffer: {self.OUTBOUND_BUFFER_SIZE}")
+
+     def _get_layer_range(self) -> Tuple[int, int]:
+         """Get the layer range for this node."""
+         if not self.swarm_layer_ids:
+             return (0, 0)
+         return (min(self.swarm_layer_ids), max(self.swarm_layer_ids) + 1)
+
+     # ==================== LIFECYCLE ====================
+
+     def start_swarm_services(self):
+         """Start all swarm background services."""
+         # Start heartbeat
+         if self.heartbeat_service:
+             self._update_heartbeat_capacity()
+             self.heartbeat_service.start()
+             logger.info("Swarm heartbeat service started")
+
+         # Start compute engine in background thread with async loop
+         self._start_compute_engine()
+
+         # Start outbound send loop
+         self._start_outbound_sender()
+
+         logger.info("All swarm services started")
+
+     def stop_swarm_services(self):
+         """Stop all swarm background services."""
+         # Stop compute engine
+         if self.compute_engine:
+             self.compute_engine.running = False
+
+         # Stop heartbeat
+         if self.heartbeat_service:
+             self.heartbeat_service.stop()
+
+         # Stop async loop
+         if self._async_loop:
+             self._async_loop.call_soon_threadsafe(self._async_loop.stop)
+
+         logger.info("All swarm services stopped")
+
+     def _start_compute_engine(self):
+         """Start the compute engine in a background thread."""
+         if not hasattr(self, 'swarm_model') or self.swarm_model is None:
+             logger.warning("No model available for compute engine")
+             return
+
+         # Create compute engine
+         self.compute_engine = ComputeEngine(
+             model=self.swarm_model,
+             inbound=self.inbound_buffer,
+             outbound=self.outbound_buffer,
+             num_micro_batches=4
+         )
+
+         def run_async_loop():
+             """Run asyncio event loop in thread."""
+             self._async_loop = asyncio.new_event_loop()
+             asyncio.set_event_loop(self._async_loop)
+
+             try:
+                 self._async_loop.run_until_complete(self.compute_engine.run())
+             except Exception as e:
+                 logger.error(f"Compute engine error: {e}")
+             finally:
+                 self._async_loop.close()
+
+         self._compute_thread = threading.Thread(
+             target=run_async_loop,
+             daemon=True,
+             name="SwarmComputeEngine"
+         )
+         self._compute_thread.start()
+         logger.info("Compute engine started in background thread")
+
+     def _start_outbound_sender(self):
+         """Start the outbound buffer send loop."""
+         def send_loop():
+             """Continuously send packets from outbound buffer."""
+             loop = asyncio.new_event_loop()
+             asyncio.set_event_loop(loop)
+
+             async def _send():
+                 while True:
+                     try:
+                         packet = await self.outbound_buffer.get()
+                         if packet is None:
+                             await asyncio.sleep(0.01)
+                             continue
+
+                         # Send via swarm router with failover
+                         try:
+                             await self.swarm_router.send_with_failover(
+                                 tensor=packet.tensor_data,
+                                 target_layer=packet.target_layer,
+                                 session_id=packet.session_id
+                             )
+                             self.swarm_stats['activations_sent'] += 1
+                         except Exception as e:
+                             logger.warning(f"Failed to send activation: {e}")
+                             self.swarm_stats['failovers'] += 1
+                     except Exception as e:
+                         logger.error(f"Outbound send error: {e}")
+                         await asyncio.sleep(0.1)
+
+             try:
+                 loop.run_until_complete(_send())
+             except Exception as e:
+                 logger.error(f"Outbound sender error: {e}")
+             finally:
+                 loop.close()
+
+         threading.Thread(
+             target=send_loop,
+             daemon=True,
+             name="SwarmOutboundSender"
+         ).start()
+         logger.info("Outbound sender started")
+
+     def _update_heartbeat_capacity(self):
+         """Update heartbeat capacity from current state."""
+         if not self.heartbeat_service:
+             return
+
+         # Get memory info
+         available_mb = 0
+         gpu_util = 0.0
+
+         try:
+             if torch.cuda.is_available():
+                 free, total = torch.cuda.mem_get_info()
+                 available_mb = free // (1024 * 1024)
+                 # GPU utilization would need nvidia-smi or pynvml
+             elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                 # MPS doesn't have direct memory query
+                 available_mb = 1024  # Estimate
+             else:
+                 import psutil
+                 mem = psutil.virtual_memory()
+                 available_mb = mem.available // (1024 * 1024)
+         except Exception as e:
+             logger.debug(f"Could not get memory info: {e}")
+
+         self.swarm_state.available_memory_mb = available_mb
+         self.swarm_state.gpu_utilization = gpu_util
+
+         # Update heartbeat
+         self.heartbeat_service.update_local_capacity(
+             available_memory_mb=available_mb,
+             queue_depth=len(self.inbound_buffer),
+             layer_range=self.swarm_state.layer_range,
+             gpu_utilization=gpu_util,
+             is_training=self.swarm_state.is_training,
+             is_accepting_activations=self.swarm_state.is_accepting_activations,
+         )
+
+     # ==================== gRPC HANDLERS ====================
+
+     def SwarmForward(self, request, context):
+         """
+         Handle incoming activation packets for swarm routing.
+
+         This is the primary RPC for the async pipeline:
+         1. Deserialize activation tensor
+         2. Push to inbound ActivationBuffer
+         3. ComputeEngine processes asynchronously
+         4. Return immediately (non-blocking)
+
+         This replaces the synchronous PipelineForward for swarm mode.
+         """
+         try:
+             import numpy as np
+             from protos import neuroshard_pb2
+
+             # Deserialize activation
+             hidden_states = torch.from_numpy(
+                 np.frombuffer(request.hidden_states, dtype=np.float32)
+             ).reshape(list(request.hidden_shape))
+
+             # Create activation packet
+             packet = ActivationPacket(
+                 priority=request.priority if hasattr(request, 'priority') else 10,
+                 session_id=request.session_id,
+                 micro_batch_id=request.micro_batch_id if hasattr(request, 'micro_batch_id') else 0,
+                 tensor_data=hidden_states,
+                 source_node=request.sender_url if hasattr(request, 'sender_url') else '',
+                 target_layer=request.target_layer if hasattr(request, 'target_layer') else 0,
+                 is_backward=request.is_backward if hasattr(request, 'is_backward') else False,
+             )
+
+             # Non-blocking put to inbound buffer
+             success = self.inbound_buffer.put_nowait(packet)
+
+             if success:
+                 self.swarm_stats['activations_received'] += 1
+                 return neuroshard_pb2.SwarmForwardResponse(
+                     success=True,
+                     request_id=request.request_id,
+                     buffer_depth=len(self.inbound_buffer),
+                 )
+             else:
+                 # Buffer full - backpressure signal
+                 return neuroshard_pb2.SwarmForwardResponse(
+                     success=False,
+                     request_id=request.request_id,
+                     error_message="Inbound buffer full - backpressure",
+                     buffer_depth=len(self.inbound_buffer),
+                 )
+
+         except Exception as e:
+             logger.error(f"SwarmForward error: {e}")
+             from protos import neuroshard_pb2
+             return neuroshard_pb2.SwarmForwardResponse(
+                 success=False,
+                 request_id=getattr(request, 'request_id', ''),
+                 error_message=str(e),
+             )
+
+     def GetSwarmStatus(self, request, context):
+         """
+         Get swarm node status including buffer fill rates and capacity.
+
+         Used by peers for routing decisions.
+         """
+         try:
+             from protos import neuroshard_pb2
+
+             # Get buffer stats
+             inbound_stats = self.inbound_buffer.get_stats()
+             outbound_stats = self.outbound_buffer.get_stats()
+
+             # Get compute stats if available
+             compute_stats = {}
+             if self.compute_engine:
+                 compute_stats = self.compute_engine.get_stats()
+
+             return neuroshard_pb2.SwarmStatusResponse(
+                 node_id=self.swarm_node_id,
+                 layer_start=self.swarm_state.layer_range[0],
+                 layer_end=self.swarm_state.layer_range[1],
+
+                 # Buffer status
+                 inbound_fill_rate=inbound_stats.get('fill_rate', 0.0),
+                 outbound_fill_rate=outbound_stats.get('fill_rate', 0.0),
+                 inbound_queue_depth=inbound_stats.get('queue_size', 0),
+                 outbound_queue_depth=outbound_stats.get('queue_size', 0),
+
+                 # Capacity
+                 available_memory_mb=self.swarm_state.available_memory_mb,
+                 gpu_utilization=self.swarm_state.gpu_utilization,
+
+                 # Status
+                 is_training=self.swarm_state.is_training,
+                 is_accepting_activations=self.swarm_state.is_accepting_activations,
+
+                 # Compute stats
+                 total_steps=compute_stats.get('total_steps', 0),
+                 local_only_rate=compute_stats.get('local_only_rate', 0.0),
+             )
+
+         except Exception as e:
+             logger.error(f"GetSwarmStatus error: {e}")
+             from protos import neuroshard_pb2
+             return neuroshard_pb2.SwarmStatusResponse(
+                 node_id=self.swarm_node_id,
+                 error_message=str(e),
+             )
+
+     def UpdatePeerCapacity(self, request, context):
+         """
+         Receive peer capacity update (alternative to UDP heartbeat).
+
+         Used when UDP heartbeat is blocked by firewalls.
+         """
+         try:
+             from protos import neuroshard_pb2
+
+             # Update router's peer cache
+             peer = PeerCandidate(
+                 node_id=request.node_id,
+                 grpc_addr=request.grpc_addr,
+                 layer_range=(request.layer_start, request.layer_end),
+                 latency_ms=0.0,  # Will be measured on first send
+                 queue_depth=request.queue_depth,
+                 last_heartbeat=time.time(),
+             )
+
+             self.swarm_router.update_peer_from_heartbeat(peer)
+
+             return neuroshard_pb2.UpdatePeerCapacityResponse(
+                 success=True,
+             )
+
+         except Exception as e:
+             logger.error(f"UpdatePeerCapacity error: {e}")
+             from protos import neuroshard_pb2
+             return neuroshard_pb2.UpdatePeerCapacityResponse(
+                 success=False,
+                 error_message=str(e),
+             )
+
+     # ==================== INTEGRATION WITH EXISTING PIPELINE ====================
+
+     def handle_pipeline_forward_swarm(self, request, context):
+         """
+         Adapter for existing PipelineForward to use swarm buffers.
+
+         Converts synchronous PipelineForward to async buffer-based processing.
+         Can be called from PipelineForward() to enable swarm mode.
+         """
+         import numpy as np
+         from protos import neuroshard_pb2
+
+         try:
+             # Deserialize hidden states
+             hidden_states = torch.from_numpy(
+                 np.frombuffer(request.hidden_states, dtype=np.float32)
+             ).reshape(list(request.hidden_shape))
+
+             # Determine priority
+             priority = 10  # Default: normal inference
+             if hasattr(request, 'is_training') and request.is_training:
+                 priority = 20  # Training forward
+
+             # Create packet
+             packet = ActivationPacket(
+                 priority=priority,
+                 session_id=request.session_id,
+                 micro_batch_id=0,
+                 tensor_data=hidden_states,
+                 source_node=request.sender_url if hasattr(request, 'sender_url') else '',
+                 target_layer=request.target_shard if hasattr(request, 'target_shard') else 0,
+                 is_backward=False,
+             )
+
+             # Check buffer pressure before accepting
+             if self.inbound_buffer.is_backpressured:
+                 return neuroshard_pb2.PipelineForwardResponse(
+                     request_id=request.request_id,
+                     success=False,
+                     error_message="Node backpressured - try another peer",
+                 )
+
+             # Non-blocking put
+             if not self.inbound_buffer.put_nowait(packet):
+                 return neuroshard_pb2.PipelineForwardResponse(
+                     request_id=request.request_id,
+                     success=False,
+                     error_message="Buffer full",
+                 )
+
+             self.swarm_stats['activations_received'] += 1
+
+             # In async mode, we return immediately
+             # The compute engine will process and send to next peer
+             return neuroshard_pb2.PipelineForwardResponse(
+                 request_id=request.request_id,
+                 success=True,
+                 # hidden_states not returned - async processing
+             )
+
+         except Exception as e:
+             logger.error(f"handle_pipeline_forward_swarm error: {e}")
+             return neuroshard_pb2.PipelineForwardResponse(
+                 request_id=request.request_id,
+                 success=False,
+                 error_message=str(e),
+             )
+
+     # ==================== UTILITIES ====================
+
+     def get_swarm_stats(self) -> Dict[str, Any]:
+         """Get comprehensive swarm statistics."""
+         stats = dict(self.swarm_stats)
+
+         # Add buffer stats
+         stats['inbound'] = self.inbound_buffer.get_stats()
+         stats['outbound'] = self.outbound_buffer.get_stats()
+
+         # Add compute stats
+         if self.compute_engine:
+             stats['compute'] = self.compute_engine.get_stats()
+
+         # Add router stats
+         stats['router'] = {
+             'cached_peers': len(self.swarm_router.peer_stats),
+             'total_sends': self.swarm_router.total_sends,
+             'successful_sends': self.swarm_router.successful_sends,
+             'failovers': self.swarm_router.failover_count,
+         }
+
+         # Add heartbeat stats
+         if self.heartbeat_service:
+             stats['heartbeat'] = {
+                 'known_peers': len(self.heartbeat_service.peer_capacities),
+                 'broadcast_count': self.heartbeat_service.broadcast_count,
+             }
+
+         return stats
+
+     def set_training_mode(self, enabled: bool):
+         """Set training mode (affects prioritization)."""
+         self.swarm_state.is_training = enabled
+         self._update_heartbeat_capacity()
+
+     def set_accepting_activations(self, enabled: bool):
+         """Set whether this node accepts new activations."""
+         self.swarm_state.is_accepting_activations = enabled
+         self._update_heartbeat_capacity()
+
+
+ # ==================== PROTO EXTENSION ====================
+ # These message types need to be added to neuroshard.proto
+
+ SWARM_PROTO_EXTENSION = """
+ // --- Swarm Routing Messages ---
+
+ message SwarmForwardRequest {
+   string session_id = 1;
+   string request_id = 2;
+
+   // Activation data
+   bytes hidden_states = 3;
+   repeated int64 hidden_shape = 4;
+
+   // Routing
+   int32 target_layer = 5;
+   string sender_url = 6;
+
+   // Priority (0=highest)
+   int32 priority = 7;
+   int32 micro_batch_id = 8;
+   bool is_backward = 9;
+
+   // Training metadata
+   bool requires_grad = 10;
+   bytes grad_output = 11;
+ }
+
+ message SwarmForwardResponse {
+   string request_id = 1;
+   bool success = 2;
+   string error_message = 3;
+   int32 buffer_depth = 4;  // For backpressure signaling
+ }
+
+ message SwarmStatusRequest {
+   // Empty - just get status
+ }
+
+ message SwarmStatusResponse {
+   string node_id = 1;
+   int32 layer_start = 2;
+   int32 layer_end = 3;
+
+   // Buffer status
+   float inbound_fill_rate = 4;
+   float outbound_fill_rate = 5;
+   int32 inbound_queue_depth = 6;
+   int32 outbound_queue_depth = 7;
+
+   // Capacity
+   int32 available_memory_mb = 8;
+   float gpu_utilization = 9;
+
+   // Status
+   bool is_training = 10;
+   bool is_accepting_activations = 11;
+
+   // Compute stats
+   int64 total_steps = 12;
+   float local_only_rate = 13;
+
+   string error_message = 14;
+ }
+
+ message UpdatePeerCapacityRequest {
+   string node_id = 1;
+   string grpc_addr = 2;
+   int32 layer_start = 3;
+   int32 layer_end = 4;
+   int32 queue_depth = 5;
+   int32 available_memory_mb = 6;
+   float gpu_utilization = 7;
+ }
+
+ message UpdatePeerCapacityResponse {
+   bool success = 1;
+   string error_message = 2;
+ }
+ """
+
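Editor's note between hunks: the class docstring above sketches how the mixin is meant to be composed into the gRPC servicer. The following is a minimal, hypothetical wiring sketch, not code from the package; it assumes a model object exposing node_id, my_layer_ids, and grpc_addr (the attributes the getattr calls above look for) and a layer_pool supplied by the host application. DemoServicer and serve() are illustrative names.

    # Hypothetical wiring sketch, not part of the released package.
    import time

    from neuroshard.core.swarm.service import SwarmServiceMixin

    class DemoServicer(SwarmServiceMixin):
        """Toy servicer that composes only the swarm behavior."""

        def __init__(self, model, layer_pool):
            # Sets up buffers, router, heartbeat service, and compute-engine slots.
            self.__init_swarm__(model, layer_pool, p2p_manager=None)

    def serve(model, layer_pool):
        servicer = DemoServicer(model, layer_pool)
        servicer.start_swarm_services()  # heartbeat + compute engine + outbound sender
        try:
            while True:
                time.sleep(30)
                print(servicer.get_swarm_stats())  # buffer/router/heartbeat counters
        finally:
            servicer.stop_swarm_services()

Note also that SWARM_PROTO_EXTENSION is a plain Python string: the SwarmForward, GetSwarmStatus, and UpdatePeerCapacity handlers can only construct these response types once the messages are merged into protos/neuroshard.proto and the stubs are regenerated with grpcio-tools, for example (output paths here are an assumption):

    python -m grpc_tools.protoc -I. --python_out=. --grpc_python_out=. protos/neuroshard.proto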
neuroshard/core/training/__init__.py
@@ -0,0 +1,29 @@
+ # neuroshard/core/training/__init__.py
+ """
+ Training coordination components for NeuroShard.
+
+ - distributed: TrainingCoordinator, FederatedDataManager, GenesisDataLoader
+ - production: GradientCompressor (for DiLoCo gradient exchange)
+ - global_tracker: GlobalTrainingTracker (for training verification)
+ """
+
+ __all__ = [
+     'TrainingCoordinator',
+     'FederatedDataManager',
+     'GenesisDataLoader',
+     'GradientCompressor',
+     'GlobalTrainingTracker',
+ ]
+
+ def __getattr__(name):
+     """Lazy loading of submodules."""
+     if name in ('TrainingCoordinator', 'FederatedDataManager', 'GenesisDataLoader', 'GradientContribution'):
+         from neuroshard.core.training import distributed
+         return getattr(distributed, name)
+     elif name in ('GradientCompressor', 'CompressionConfig', 'CompressionMethod'):
+         from neuroshard.core.training import production
+         return getattr(production, name)
+     elif name in ('GlobalTrainingTracker', 'TrainingSnapshot', 'GlobalTrainingStats'):
+         from neuroshard.core.training import global_tracker
+         return getattr(global_tracker, name)
+     raise AttributeError(f"module 'neuroshard.core.training' has no attribute '{name}'")
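Editor's note: the module-level __getattr__ in this hunk is the PEP 562 lazy-import hook. Importing neuroshard.core.training stays cheap because the distributed, production, and global_tracker submodules are imported only when one of their names is first accessed. A table-driven sketch of the same pattern, illustrative only and not code from the package:

    # Illustrative PEP 562 sketch, not package code.
    import importlib

    _LAZY = {
        'TrainingCoordinator': 'neuroshard.core.training.distributed',
        'GradientCompressor': 'neuroshard.core.training.production',
        'GlobalTrainingTracker': 'neuroshard.core.training.global_tracker',
    }

    def __getattr__(name):
        # Called only when normal attribute lookup on the module fails,
        # so each submodule is imported on first use, not at package import.
        if name in _LAZY:
            return getattr(importlib.import_module(_LAZY[name]), name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With this hook in place, "from neuroshard.core.training import TrainingCoordinator" triggers __getattr__('TrainingCoordinator') and imports the distributed submodule only at that point.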