nexaroa-0.0.111-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
@@ -0,0 +1,640 @@
+"""
+Swarm Service Mixin for gRPC Integration
+
+Integrates Phase 1 swarm components with the existing gRPC server:
+- SwarmRouter for multipath routing with failover
+- ActivationBuffer for async compute decoupling
+- ComputeEngine for "Don't Stop" soft overflow handling
+- SwarmHeartbeat for capacity advertisement
+
+This mixin is designed to be added to the existing NeuroShardServiceServicer.
+"""
+
+import asyncio
+import time
+import logging
+import threading
+import numpy as np
+from typing import Dict, Optional, Tuple, Any
+from dataclasses import dataclass, field
+from concurrent.futures import ThreadPoolExecutor
+
+import torch
+
+from neuroshard.core.swarm.router import SwarmRouter, PeerCandidate
+from neuroshard.core.swarm.buffers import (
+    ActivationBuffer,
+    OutboundBuffer,
+    ActivationPacket,
+)
+from neuroshard.core.swarm.heartbeat import SwarmHeartbeatService, CapacityBitmask
+from neuroshard.core.swarm.compute import ComputeEngine, StepOutcome, ComputeStats
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SwarmNodeState:
+    """State container for swarm node operation."""
+    node_id: str
+    layer_range: Tuple[int, int]
+    grpc_addr: str
+
+    # Capacity
+    available_memory_mb: int = 0
+    gpu_utilization: float = 0.0
+
+    # Status
+    is_training: bool = False
+    is_accepting_inference: bool = True
+    is_accepting_activations: bool = True
+
+    # Timestamps
+    last_updated: float = field(default_factory=time.time)
+
+
+class SwarmServiceMixin:
+    """
+    Mixin class that adds Swarm functionality to NeuroShardServiceServicer.
+
+    Provides:
+    - Async activation handling via ActivationBuffer
+    - Fault-tolerant routing via SwarmRouter
+    - Capacity broadcasting via SwarmHeartbeatService
+    - Decoupled compute via ComputeEngine
+
+    Usage:
+        class NeuroShardServiceServicer(SwarmServiceMixin, DHTServiceMixin, ...):
+            def __init__(self, ...):
+                SwarmServiceMixin.__init_swarm__(self, model, layer_pool)
+                ...
+    """
+
+    # Configuration
+    INBOUND_BUFFER_SIZE = 100
+    OUTBOUND_BUFFER_SIZE = 50
+    HEARTBEAT_PORT = 9999
+
+    def __init_swarm__(
+        self,
+        model,
+        layer_pool,
+        p2p_manager=None,
+        enable_heartbeat: bool = True,
+        enable_compute_engine: bool = True,
+    ):
+        """
+        Initialize swarm components.
+
+        Args:
+            model: DynamicNeuroNode or DynamicNeuroLLM instance
+            layer_pool: DynamicLayerPool for peer discovery
+            p2p_manager: P2PManager for network communication
+            enable_heartbeat: Whether to start heartbeat service
+            enable_compute_engine: Whether to start compute engine
+        """
+        self.swarm_model = model
+        self.layer_pool = layer_pool
+        self.p2p = p2p_manager
+
+        # Extract node info
+        self.swarm_node_id = getattr(model, 'node_id', 'unknown')
+        self.swarm_layer_ids = getattr(model, 'my_layer_ids', [])
+
+        # Initialize buffers
+        self.inbound_buffer = ActivationBuffer(max_size=self.INBOUND_BUFFER_SIZE)
+        self.outbound_buffer = OutboundBuffer(
+            max_size=self.OUTBOUND_BUFFER_SIZE,
+            soft_overflow_threshold=0.9,
+            hard_overflow_threshold=0.99
+        )
+
+        # Initialize router (uses DHT/layer_pool for peer discovery)
+        dht = getattr(p2p_manager, 'routing_table', None) if p2p_manager else None
+        self.swarm_router = SwarmRouter(dht_protocol=dht, layer_pool=layer_pool)
+
+        # Initialize heartbeat service
+        self.heartbeat_service = None
+        if enable_heartbeat:
+            self.heartbeat_service = SwarmHeartbeatService(
+                node_id=self.swarm_node_id,
+                udp_port=self.HEARTBEAT_PORT
+            )
+
+        # Initialize compute engine (runs async)
+        self.compute_engine = None
+        self._compute_thread = None
+        self._async_loop = None
+
+        # State tracking
+        self.swarm_state = SwarmNodeState(
+            node_id=self.swarm_node_id,
+            layer_range=self._get_layer_range(),
+            grpc_addr=getattr(model, 'grpc_addr', ''),
+        )
+
+        # Statistics
+        self.swarm_stats = {
+            'activations_received': 0,
+            'activations_sent': 0,
+            'failovers': 0,
+            'soft_overflows': 0,
+            'hard_overflows': 0,
+        }
+
+        logger.info(f"SwarmServiceMixin initialized for node {self.swarm_node_id}")
+        logger.info(f"  Layers: {self.swarm_layer_ids}")
+        logger.info(f"  Inbound buffer: {self.INBOUND_BUFFER_SIZE}")
+        logger.info(f"  Outbound buffer: {self.OUTBOUND_BUFFER_SIZE}")
+
+    def _get_layer_range(self) -> Tuple[int, int]:
+        """Get the layer range for this node."""
+        if not self.swarm_layer_ids:
+            return (0, 0)
+        return (min(self.swarm_layer_ids), max(self.swarm_layer_ids) + 1)
+
+    # ==================== LIFECYCLE ====================
+
+    def start_swarm_services(self):
+        """Start all swarm background services."""
+        # Start heartbeat
+        if self.heartbeat_service:
+            self._update_heartbeat_capacity()
+            self.heartbeat_service.start()
+            logger.info("Swarm heartbeat service started")
+
+        # Start compute engine in background thread with async loop
+        self._start_compute_engine()
+
+        # Start outbound send loop
+        self._start_outbound_sender()
+
+        logger.info("All swarm services started")
+
+    def stop_swarm_services(self):
+        """Stop all swarm background services."""
+        # Stop compute engine
+        if self.compute_engine:
+            self.compute_engine.running = False
+
+        # Stop heartbeat
+        if self.heartbeat_service:
+            self.heartbeat_service.stop()
+
+        # Stop async loop
+        if self._async_loop:
+            self._async_loop.call_soon_threadsafe(self._async_loop.stop)
+
+        logger.info("All swarm services stopped")
+
+    def _start_compute_engine(self):
+        """Start the compute engine in a background thread."""
+        if not hasattr(self, 'swarm_model') or self.swarm_model is None:
+            logger.warning("No model available for compute engine")
+            return
+
+        # Create compute engine
+        self.compute_engine = ComputeEngine(
+            model=self.swarm_model,
+            inbound=self.inbound_buffer,
+            outbound=self.outbound_buffer,
+            num_micro_batches=4
+        )
+
+        def run_async_loop():
+            """Run asyncio event loop in thread."""
+            self._async_loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(self._async_loop)
+
+            try:
+                self._async_loop.run_until_complete(self.compute_engine.run())
+            except Exception as e:
+                logger.error(f"Compute engine error: {e}")
+            finally:
+                self._async_loop.close()
+
+        self._compute_thread = threading.Thread(
+            target=run_async_loop,
+            daemon=True,
+            name="SwarmComputeEngine"
+        )
+        self._compute_thread.start()
+        logger.info("Compute engine started in background thread")
+
+    def _start_outbound_sender(self):
+        """Start the outbound buffer send loop."""
+        def send_loop():
+            """Continuously send packets from outbound buffer."""
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+            async def _send():
+                while True:
+                    try:
+                        packet = await self.outbound_buffer.get()
+                        if packet is None:
+                            await asyncio.sleep(0.01)
+                            continue
+
+                        # Send via swarm router with failover
+                        try:
+                            await self.swarm_router.send_with_failover(
+                                tensor=packet.tensor_data,
+                                target_layer=packet.target_layer,
+                                session_id=packet.session_id
+                            )
+                            self.swarm_stats['activations_sent'] += 1
+                        except Exception as e:
+                            logger.warning(f"Failed to send activation: {e}")
+                            self.swarm_stats['failovers'] += 1
+                    except Exception as e:
+                        logger.error(f"Outbound send error: {e}")
+                        await asyncio.sleep(0.1)
+
+            try:
+                loop.run_until_complete(_send())
+            except Exception as e:
+                logger.error(f"Outbound sender error: {e}")
+            finally:
+                loop.close()
+
+        threading.Thread(
+            target=send_loop,
+            daemon=True,
+            name="SwarmOutboundSender"
+        ).start()
+        logger.info("Outbound sender started")
+
+    def _update_heartbeat_capacity(self):
+        """Update heartbeat capacity from current state."""
+        if not self.heartbeat_service:
+            return
+
+        # Get memory info
+        available_mb = 0
+        gpu_util = 0.0
+
+        try:
+            if torch.cuda.is_available():
+                free, total = torch.cuda.mem_get_info()
+                available_mb = free // (1024 * 1024)
+                # GPU utilization would need nvidia-smi or pynvml
+            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                # MPS doesn't have direct memory query
+                available_mb = 1024  # Estimate
+            else:
+                import psutil
+                mem = psutil.virtual_memory()
+                available_mb = mem.available // (1024 * 1024)
+        except Exception as e:
+            logger.debug(f"Could not get memory info: {e}")
+
+        self.swarm_state.available_memory_mb = available_mb
+        self.swarm_state.gpu_utilization = gpu_util
+
+        # Update heartbeat
+        self.heartbeat_service.update_local_capacity(
+            available_memory_mb=available_mb,
+            queue_depth=len(self.inbound_buffer),
+            layer_range=self.swarm_state.layer_range,
+            gpu_utilization=gpu_util,
+            is_training=self.swarm_state.is_training,
+            is_accepting_activations=self.swarm_state.is_accepting_activations,
+        )
+
+    # ==================== gRPC HANDLERS ====================
+
+    def SwarmForward(self, request, context):
+        """
+        Handle incoming activation packets for swarm routing.
+
+        This is the primary RPC for the async pipeline:
+        1. Deserialize activation tensor
+        2. Push to inbound ActivationBuffer
+        3. ComputeEngine processes asynchronously
+        4. Return immediately (non-blocking)
+
+        This replaces the synchronous PipelineForward for swarm mode.
+        """
+        try:
+            import numpy as np
+            from protos import neuroshard_pb2
+
+            # Deserialize activation
+            hidden_states = torch.from_numpy(
+                np.frombuffer(request.hidden_states, dtype=np.float32)
+            ).reshape(list(request.hidden_shape))
+
+            # Create activation packet
+            packet = ActivationPacket(
+                priority=request.priority if hasattr(request, 'priority') else 10,
+                session_id=request.session_id,
+                micro_batch_id=request.micro_batch_id if hasattr(request, 'micro_batch_id') else 0,
+                tensor_data=hidden_states,
+                source_node=request.sender_url if hasattr(request, 'sender_url') else '',
+                target_layer=request.target_layer if hasattr(request, 'target_layer') else 0,
+                is_backward=request.is_backward if hasattr(request, 'is_backward') else False,
+            )
+
+            # Non-blocking put to inbound buffer
+            success = self.inbound_buffer.put_nowait(packet)
+
+            if success:
+                self.swarm_stats['activations_received'] += 1
+                return neuroshard_pb2.SwarmForwardResponse(
+                    success=True,
+                    request_id=request.request_id,
+                    buffer_depth=len(self.inbound_buffer),
+                )
+            else:
+                # Buffer full - backpressure signal
+                return neuroshard_pb2.SwarmForwardResponse(
+                    success=False,
+                    request_id=request.request_id,
+                    error_message="Inbound buffer full - backpressure",
+                    buffer_depth=len(self.inbound_buffer),
+                )
+
+        except Exception as e:
+            logger.error(f"SwarmForward error: {e}")
+            from protos import neuroshard_pb2
+            return neuroshard_pb2.SwarmForwardResponse(
+                success=False,
+                request_id=getattr(request, 'request_id', ''),
+                error_message=str(e),
+            )
+
+    def GetSwarmStatus(self, request, context):
+        """
+        Get swarm node status including buffer fill rates and capacity.
+
+        Used by peers for routing decisions.
+        """
+        try:
+            from protos import neuroshard_pb2
+
+            # Get buffer stats
+            inbound_stats = self.inbound_buffer.get_stats()
+            outbound_stats = self.outbound_buffer.get_stats()
+
+            # Get compute stats if available
+            compute_stats = {}
+            if self.compute_engine:
+                compute_stats = self.compute_engine.get_stats()
+
+            return neuroshard_pb2.SwarmStatusResponse(
+                node_id=self.swarm_node_id,
+                layer_start=self.swarm_state.layer_range[0],
+                layer_end=self.swarm_state.layer_range[1],
+
+                # Buffer status
+                inbound_fill_rate=inbound_stats.get('fill_rate', 0.0),
+                outbound_fill_rate=outbound_stats.get('fill_rate', 0.0),
+                inbound_queue_depth=inbound_stats.get('queue_size', 0),
+                outbound_queue_depth=outbound_stats.get('queue_size', 0),
+
+                # Capacity
+                available_memory_mb=self.swarm_state.available_memory_mb,
+                gpu_utilization=self.swarm_state.gpu_utilization,
+
+                # Status
+                is_training=self.swarm_state.is_training,
+                is_accepting_activations=self.swarm_state.is_accepting_activations,
+
+                # Compute stats
+                total_steps=compute_stats.get('total_steps', 0),
+                local_only_rate=compute_stats.get('local_only_rate', 0.0),
+            )
+
+        except Exception as e:
+            logger.error(f"GetSwarmStatus error: {e}")
+            from protos import neuroshard_pb2
+            return neuroshard_pb2.SwarmStatusResponse(
+                node_id=self.swarm_node_id,
+                error_message=str(e),
+            )
+
+    def UpdatePeerCapacity(self, request, context):
+        """
+        Receive peer capacity update (alternative to UDP heartbeat).
+
+        Used when UDP heartbeat is blocked by firewalls.
+        """
+        try:
+            from protos import neuroshard_pb2
+
+            # Update router's peer cache
+            peer = PeerCandidate(
+                node_id=request.node_id,
+                grpc_addr=request.grpc_addr,
+                layer_range=(request.layer_start, request.layer_end),
+                latency_ms=0.0,  # Will be measured on first send
+                queue_depth=request.queue_depth,
+                last_heartbeat=time.time(),
+            )
+
+            self.swarm_router.update_peer_from_heartbeat(peer)
+
+            return neuroshard_pb2.UpdatePeerCapacityResponse(
+                success=True,
+            )
+
+        except Exception as e:
+            logger.error(f"UpdatePeerCapacity error: {e}")
+            from protos import neuroshard_pb2
+            return neuroshard_pb2.UpdatePeerCapacityResponse(
+                success=False,
+                error_message=str(e),
+            )
+
+    # ==================== INTEGRATION WITH EXISTING PIPELINE ====================
+
+    def handle_pipeline_forward_swarm(self, request, context):
+        """
+        Adapter for existing PipelineForward to use swarm buffers.
+
+        Converts synchronous PipelineForward to async buffer-based processing.
+        Can be called from PipelineForward() to enable swarm mode.
+        """
+        import numpy as np
+        from protos import neuroshard_pb2
+
+        try:
+            # Deserialize hidden states
+            hidden_states = torch.from_numpy(
+                np.frombuffer(request.hidden_states, dtype=np.float32)
+            ).reshape(list(request.hidden_shape))
+
+            # Determine priority
+            priority = 10  # Default: normal inference
+            if hasattr(request, 'is_training') and request.is_training:
+                priority = 20  # Training forward
+
+            # Create packet
+            packet = ActivationPacket(
+                priority=priority,
+                session_id=request.session_id,
+                micro_batch_id=0,
+                tensor_data=hidden_states,
+                source_node=request.sender_url if hasattr(request, 'sender_url') else '',
+                target_layer=request.target_shard if hasattr(request, 'target_shard') else 0,
+                is_backward=False,
+            )
+
+            # Check buffer pressure before accepting
+            if self.inbound_buffer.is_backpressured:
+                return neuroshard_pb2.PipelineForwardResponse(
+                    request_id=request.request_id,
+                    success=False,
+                    error_message="Node backpressured - try another peer",
+                )
+
+            # Non-blocking put
+            if not self.inbound_buffer.put_nowait(packet):
+                return neuroshard_pb2.PipelineForwardResponse(
+                    request_id=request.request_id,
+                    success=False,
+                    error_message="Buffer full",
+                )
+
+            self.swarm_stats['activations_received'] += 1
+
+            # In async mode, we return immediately
+            # The compute engine will process and send to next peer
+            return neuroshard_pb2.PipelineForwardResponse(
+                request_id=request.request_id,
+                success=True,
+                # hidden_states not returned - async processing
+            )
+
+        except Exception as e:
+            logger.error(f"handle_pipeline_forward_swarm error: {e}")
+            return neuroshard_pb2.PipelineForwardResponse(
+                request_id=request.request_id,
+                success=False,
+                error_message=str(e),
+            )
+
+    # ==================== UTILITIES ====================
+
+    def get_swarm_stats(self) -> Dict[str, Any]:
+        """Get comprehensive swarm statistics."""
+        stats = dict(self.swarm_stats)
+
+        # Add buffer stats
+        stats['inbound'] = self.inbound_buffer.get_stats()
+        stats['outbound'] = self.outbound_buffer.get_stats()
+
+        # Add compute stats
+        if self.compute_engine:
+            stats['compute'] = self.compute_engine.get_stats()
+
+        # Add router stats
+        stats['router'] = {
+            'cached_peers': len(self.swarm_router.peer_stats),
+            'total_sends': self.swarm_router.total_sends,
+            'successful_sends': self.swarm_router.successful_sends,
+            'failovers': self.swarm_router.failover_count,
+        }
+
+        # Add heartbeat stats
+        if self.heartbeat_service:
+            stats['heartbeat'] = {
+                'known_peers': len(self.heartbeat_service.peer_capacities),
+                'broadcast_count': self.heartbeat_service.broadcast_count,
+            }
+
+        return stats
+
+    def set_training_mode(self, enabled: bool):
+        """Set training mode (affects prioritization)."""
+        self.swarm_state.is_training = enabled
+        self._update_heartbeat_capacity()
+
+    def set_accepting_activations(self, enabled: bool):
+        """Set whether this node accepts new activations."""
+        self.swarm_state.is_accepting_activations = enabled
+        self._update_heartbeat_capacity()
+
+
+# ==================== PROTO EXTENSION ====================
+# These message types need to be added to neuroshard.proto
+
+SWARM_PROTO_EXTENSION = """
+// --- Swarm Routing Messages ---
+
+message SwarmForwardRequest {
+  string session_id = 1;
+  string request_id = 2;
+
+  // Activation data
+  bytes hidden_states = 3;
+  repeated int64 hidden_shape = 4;
+
+  // Routing
+  int32 target_layer = 5;
+  string sender_url = 6;
+
+  // Priority (0=highest)
+  int32 priority = 7;
+  int32 micro_batch_id = 8;
+  bool is_backward = 9;
+
+  // Training metadata
+  bool requires_grad = 10;
+  bytes grad_output = 11;
+}
+
+message SwarmForwardResponse {
+  string request_id = 1;
+  bool success = 2;
+  string error_message = 3;
+  int32 buffer_depth = 4;  // For backpressure signaling
+}
+
+message SwarmStatusRequest {
+  // Empty - just get status
+}
+
+message SwarmStatusResponse {
+  string node_id = 1;
+  int32 layer_start = 2;
+  int32 layer_end = 3;
+
+  // Buffer status
+  float inbound_fill_rate = 4;
+  float outbound_fill_rate = 5;
+  int32 inbound_queue_depth = 6;
+  int32 outbound_queue_depth = 7;
+
+  // Capacity
+  int32 available_memory_mb = 8;
+  float gpu_utilization = 9;
+
+  // Status
+  bool is_training = 10;
+  bool is_accepting_activations = 11;
+
+  // Compute stats
+  int64 total_steps = 12;
+  float local_only_rate = 13;
+
+  string error_message = 14;
+}
+
+message UpdatePeerCapacityRequest {
+  string node_id = 1;
+  string grpc_addr = 2;
+  int32 layer_start = 3;
+  int32 layer_end = 4;
+  int32 queue_depth = 5;
+  int32 available_memory_mb = 6;
+  float gpu_utilization = 7;
+}
+
+message UpdatePeerCapacityResponse {
+  bool success = 1;
+  string error_message = 2;
+}
+"""
+
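The class docstring's Usage note refers to `SwarmServiceMixin.__init__`, but the initializer the file actually defines is `__init_swarm__`. The sketch below shows how the two halves fit together under that reading: a host servicer calls `__init_swarm__` and `start_swarm_services()`, and a caller serializes an activation the way `SwarmForward` decodes it (raw float32 bytes plus an explicit shape list). Only the `SwarmForwardRequest` field layout comes from SWARM_PROTO_EXTENSION above; the servicer subclass and the helper `make_swarm_forward_request` are illustrative assumptions, not part of the package, and they presuppose that the proto extension has been compiled into `protos/neuroshard_pb2`.

    # Hedged sketch; SwarmServicer and make_swarm_forward_request are hypothetical.
    import numpy as np
    import torch

    from protos import neuroshard_pb2
    from neuroshard.core.swarm.service import SwarmServiceMixin


    class SwarmServicer(SwarmServiceMixin):
        """Hypothetical host servicer that wires in swarm handling."""

        def __init__(self, model, layer_pool, p2p_manager=None):
            self.__init_swarm__(model, layer_pool, p2p_manager)
            self.start_swarm_services()


    def make_swarm_forward_request(tensor: torch.Tensor, session_id: str, target_layer: int):
        # Serialize float32 activations exactly the way SwarmForward decodes them:
        # np.frombuffer(request.hidden_states) followed by reshape(request.hidden_shape).
        arr = tensor.detach().cpu().to(torch.float32).numpy()
        return neuroshard_pb2.SwarmForwardRequest(
            session_id=session_id,
            request_id=f"{session_id}-mb0",
            hidden_states=arr.tobytes(),
            hidden_shape=list(arr.shape),
            target_layer=target_layer,
            priority=10,  # 10 = normal inference in this file's convention
            micro_batch_id=0,
        )

A `SwarmForwardResponse` with `success=False` and `buffer_depth` populated is the backpressure signal: the caller is expected to retry against another peer rather than block, which is the same behavior SwarmRouter's failover path provides for outbound packets.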
@@ -0,0 +1,29 @@
+# neuroshard/core/training/__init__.py
+"""
+Training coordination components for NeuroShard.
+
+- distributed: TrainingCoordinator, FederatedDataManager, GenesisDataLoader
+- production: GradientCompressor (for DiLoCo gradient exchange)
+- global_tracker: GlobalTrainingTracker (for training verification)
+"""
+
+__all__ = [
+    'TrainingCoordinator',
+    'FederatedDataManager',
+    'GenesisDataLoader',
+    'GradientCompressor',
+    'GlobalTrainingTracker',
+]
+
+def __getattr__(name):
+    """Lazy loading of submodules."""
+    if name in ('TrainingCoordinator', 'FederatedDataManager', 'GenesisDataLoader', 'GradientContribution'):
+        from neuroshard.core.training import distributed
+        return getattr(distributed, name)
+    elif name in ('GradientCompressor', 'CompressionConfig', 'CompressionMethod'):
+        from neuroshard.core.training import production
+        return getattr(production, name)
+    elif name in ('GlobalTrainingTracker', 'TrainingSnapshot', 'GlobalTrainingStats'):
+        from neuroshard.core.training import global_tracker
+        return getattr(global_tracker, name)
+    raise AttributeError(f"module 'neuroshard.core.training' has no attribute '{name}'")
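The module-level `__getattr__` is PEP 562 lazy loading: importing `neuroshard.core.training` stays cheap, and each submodule is imported only when one of its names is first accessed. A minimal sketch of the effect, assuming the submodules import cleanly:

    # Nothing under neuroshard.core.training is imported until first attribute access.
    import neuroshard.core.training as training

    Compressor = training.GradientCompressor     # triggers import of .production here
    Tracker = training.GlobalTrainingTracker     # triggers import of .global_tracker here

Note that names such as `GradientContribution`, `CompressionConfig`, and `TrainingSnapshot` resolve through `__getattr__` even though they are absent from `__all__`, so `from neuroshard.core.training import *` exports a narrower set than attribute access allows.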