nexaroa 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/grpc_server.py
@@ -0,0 +1,895 @@
"""
gRPC Server for NeuroShard Node

Handles:
1. Inference requests (forward pass through NeuroLLM)
2. Training gradient exchange
3. DHT operations for peer discovery
4. PoNW proof verification
5. Layer-specific forward (for distributed inference)
"""

import grpc
from concurrent import futures
import torch
import time
import threading
import random
from typing import Optional, Union
import logging

# Import generated protobuf code
from protos import neuroshard_pb2
from protos import neuroshard_pb2_grpc

from neuroshard.utils.serialization import deserialize_tensor, serialize_tensor
from neuroshard.core.network.p2p import P2PManager
from neuroshard.core.network.dht_service import DHTServiceMixin

# Swarm Service Mixin (Phase 4)
try:
    from neuroshard.core.swarm.service import SwarmServiceMixin
    SWARM_SERVICE_AVAILABLE = True
except ImportError:
    SWARM_SERVICE_AVAILABLE = False
    SwarmServiceMixin = object  # Fallback

logger = logging.getLogger(__name__)

# Global state
GRPC_SERVER = None


class NeuroShardServiceServicer(DHTServiceMixin, neuroshard_pb2_grpc.NeuroShardServiceServicer):
    """
    gRPC service for NeuroShard nodes.

    Supports DynamicNeuroNode with layer-based routing.
    Each node holds specific layers and can process inference requests for those layers.

    Swarm Architecture:
    - SwarmForward: Async activation forwarding with failover
    - GetSwarmStatus: Buffer fill rates, capacity info
    - UpdatePeerCapacity: TCP fallback for heartbeat updates
    """

    def __init__(self, model, p2p: P2PManager, swap_controller=None):
        """
        Args:
            model: DynamicNeuroNode or SwarmEnabledDynamicNode instance
            p2p: P2P manager for peer discovery
            swap_controller: Deprecated, kept for compatibility
        """
        self.model = model
        self.p2p = p2p
        self.swap_controller = swap_controller

        # Detect if this is a DynamicNeuroNode
        self.is_dynamic_node = hasattr(model, 'my_layer_ids') and hasattr(model, 'layer_pool')

        # Rate limiter for gradient gossip (per-node)
        # Allows max 1 gossip per node per 60 seconds (DiLoCo sync is every ~8-10 min)
        self._gossip_rate_limit: dict = {}  # node_id -> last_gossip_time
        self._gossip_rate_limit_seconds = 60  # Min seconds between gossips from same node
        self._gossip_rate_limit_lock = threading.Lock()

        # Initialize DHT Mixin if available
        if p2p.routing_table:
            DHTServiceMixin.__init__(self, p2p.routing_table, p2p.dht_storage, ledger=p2p.ledger)
        else:
            from neuroshard.core.network.dht import RoutingTable, Node
            dummy_rt = RoutingTable(Node(0, "0.0.0.0", 0))
            DHTServiceMixin.__init__(self, dummy_rt, {}, ledger=p2p.ledger)

        logger.info(f"gRPC Servicer initialized (DynamicNode={self.is_dynamic_node})")

    def UnaryInference(self, request, context):
        """Handle inference request."""
        try:
            # Deserialize input
            input_str = request.tensor_data.decode('utf-8')
            input_tensor = deserialize_tensor(input_str)

            # Forward pass through DynamicNeuroNode
            output = self.model.forward(input_tensor, session_id=request.session_id)
            # Return result directly
            return neuroshard_pb2.InferenceResponse(
                success=True,
                tensor_data=serialize_tensor(output).encode('utf-8')
            )

        except Exception as e:
            logger.error(f"Inference error: {e}")
            return neuroshard_pb2.InferenceResponse(success=False, error_message=str(e))

    def GetWeights(self, request, context):
        """Return model weights (for gradient sync)."""
        try:
            # Get weights for my layers only
            layer_weights = {}
            for layer_id, layer in self.model.model.my_layers.items():
                layer_weights[f"layer_{layer_id}"] = layer.state_dict()

            if self.model.model.embedding:
                layer_weights["embedding"] = self.model.model.embedding.state_dict()
            if self.model.model.lm_head:
                layer_weights["lm_head"] = self.model.model.lm_head.state_dict()

            data = serialize_tensor(layer_weights, use_quantization=False).encode('utf-8')
            return neuroshard_pb2.WeightResponse(weights_data=data)
        except Exception as e:
            logger.error(f"GetWeights error: {e}")
            return neuroshard_pb2.WeightResponse(weights_data=b"")

    def GetTrainingStatus(self, request, context):
        """Get training status."""
        stats = self.model.get_stats()

        return neuroshard_pb2.TrainingStatusResponse(
            training_enabled=self.model.enable_training,
            total_rounds=stats.get("total_training_rounds", 0),
            current_loss=stats.get("current_loss", float('inf')),
            tokens_processed=stats.get("total_tokens_processed", 0),
        )

    def GetPoNWProof(self, request, context):
        """Get Proof of Neural Work."""
        proof = self.model.get_ponw_proof()

        return neuroshard_pb2.PoNWProofResponse(
            node_id=proof.get("node_id", ""),
            timestamp=proof.get("timestamp", 0),
            tokens_processed=proof.get("tokens_processed", 0),
            training_rounds=proof.get("training_rounds", 0),
            signature=proof.get("signature", ""),
        )

    # ==================== LAYER-SPECIFIC FORWARD ====================

    def LayerForward(self, request, context):
        """
        Forward hidden states through specific layers on this node.

        This enables distributed inference:
        1. Request comes with hidden states
        2. We process through our assigned layers
        3. Return processed hidden states
        """
        try:
            # Deserialize input
            input_str = request.tensor_data.decode('utf-8')
            hidden_states = deserialize_tensor(input_str)

            # Check if we have the requested layers
            requested_layers = list(request.layer_ids)
            my_layers = set(self.model.my_layer_ids)

            if not all(l in my_layers for l in requested_layers):
                missing = set(requested_layers) - my_layers
                return neuroshard_pb2.LayerForwardResponse(
                    success=False,
                    error_message=f"Missing layers: {missing}"
                )

            # Forward through requested layers
            output = self.model.model.forward_my_layers(
                hidden_states,
                start_layer=min(requested_layers),
                end_layer=max(requested_layers) + 1
            )

            return neuroshard_pb2.LayerForwardResponse(
                success=True,
                tensor_data=serialize_tensor(output).encode('utf-8')
            )

        except Exception as e:
            logger.error(f"LayerForward error: {e}")
            return neuroshard_pb2.LayerForwardResponse(
                success=False,
                error_message=str(e)
            )

    def GetNodeInfo(self, request, context):
        """Get information about this node's capabilities."""
        stats = self.model.get_stats()

        return neuroshard_pb2.NodeInfoResponse(
            node_id=self.model.node_id,
            layer_ids=self.model.my_layer_ids,
            has_embedding=self.model.model.has_embedding if self.model.model else False,
            has_lm_head=self.model.model.has_lm_head if self.model.model else False,
            available_memory_mb=int(self.model.available_memory_mb),
            total_params=stats.get("my_params", 0),
        )

    # ==================== DISTRIBUTED TRAINING RPCs ====================

    def GossipGradient(self, request, context):
        """
        Receive gradient contribution from a peer.

        This is the core of distributed training:
        1. Peer computes gradients locally
        2. Peer broadcasts via this RPC
        3. We aggregate and apply updates

        Rate limited to prevent spam attacks (1 gossip per node per 60s).
        """

        try:
            # Check if training is enabled
            if not self.model.enable_training:
                return neuroshard_pb2.GossipGradientResponse(
                    accepted=False,
                    reason="Training not enabled on this node"
                )

            # RATE LIMITING: Prevent spam from malicious nodes
            node_id = request.node_id
            now = time.time()

            with self._gossip_rate_limit_lock:
                last_gossip = self._gossip_rate_limit.get(node_id, 0)
                time_since_last = now - last_gossip

                if time_since_last < self._gossip_rate_limit_seconds:
                    logger.warning(f"Rate limited gradient from {node_id[:8]}... "
                                   f"(only {time_since_last:.1f}s since last, need {self._gossip_rate_limit_seconds}s)")
                    return neuroshard_pb2.GossipGradientResponse(
                        accepted=False,
                        reason=f"Rate limited: wait {self._gossip_rate_limit_seconds - time_since_last:.0f}s"
                    )

                # Update last gossip time
                self._gossip_rate_limit[node_id] = now

                # Clean up old entries (older than 10 minutes)
                stale_nodes = [nid for nid, ts in self._gossip_rate_limit.items() if now - ts > 600]
                for nid in stale_nodes:
                    del self._gossip_rate_limit[nid]

            # Convert protobuf to GradientContribution
            from neuroshard.core.training.distributed import GradientContribution

            contribution = GradientContribution(
                node_id=request.node_id,
                round_id=request.round_id,
                layer_gradients=dict(request.layer_gradients),  # Convert MapContainer to dict
                batch_size=request.batch_size,
                loss=request.loss,
                timestamp=request.timestamp,
                signature=request.signature
            )

            # Submit to NeuroNode for processing
            success = self.model.receive_peer_gradients(contribution)

            if success:
                logger.info(f"Received gradient from peer {request.node_id[:8]}... "
                            f"(round={request.round_id}, batch={request.batch_size}, loss={request.loss:.4f})")

            return neuroshard_pb2.GossipGradientResponse(
                accepted=success,
                reason="" if success else "Failed to process gradient",
                current_round=self.model.current_training_round
            )

        except Exception as e:
            logger.error(f"GossipGradient error: {e}")
            return neuroshard_pb2.GossipGradientResponse(
                accepted=False,
                reason=str(e)
            )

    def GetCheckpointInfo(self, request, context):
        """Get checkpoint info without downloading the full checkpoint."""
        if not self.is_neuro_node:
            return neuroshard_pb2.GetCheckpointInfoResponse()

        try:
            info = self.model.get_checkpoint_info()

            return neuroshard_pb2.GetCheckpointInfoResponse(
                version=info.get("version", 0),
                model_hash=info.get("model_hash", ""),
                phase=info.get("phase", "bootstrap"),
                params=info.get("params", 0),
                loss=info.get("loss", float('inf'))
            )

        except Exception as e:
            logger.error(f"GetCheckpointInfo error: {e}")
            return neuroshard_pb2.GetCheckpointInfoResponse()

    def GetCheckpoint(self, request, context):
        """Download full checkpoint from this node."""
        if not self.is_neuro_node:
            return neuroshard_pb2.GetCheckpointResponse(
                success=False,
                error_message="Node does not support checkpoint sync"
            )

        try:
            import io
            import zlib

            # Get model checkpoint
            if not self.model.model:
                return neuroshard_pb2.GetCheckpointResponse(
                    success=False,
                    error_message="Model not loaded"
                )

            # Serialize checkpoint
            buffer = io.BytesIO()
            checkpoint = {
                "model_state_dict": self.model.model.state_dict(),
                "config": {
                    "phase": self.model.phase,
                    "hidden_dim": self.model.model.config.hidden_dim,
                    "num_layers": self.model.model.config.num_layers,
                    "vocab_size": self.model.model.config.vocab_size,
                },
                "version": self.model.total_training_rounds,
                "model_hash": self.model._get_model_hash(),
            }
            torch.save(checkpoint, buffer)

            # Compress
            raw_data = buffer.getvalue()
            compressed = zlib.compress(raw_data, level=6)

            logger.info(f"Serving checkpoint: version={checkpoint['version']}, "
                        f"size={len(compressed)/1024:.1f}KB (compressed from {len(raw_data)/1024:.1f}KB)")

            return neuroshard_pb2.GetCheckpointResponse(
                success=True,
                version=checkpoint["version"],
                model_hash=checkpoint["model_hash"],
                phase=self.model.phase,
                checkpoint_data=compressed,
                total_size=len(compressed)
            )

        except Exception as e:
            logger.error(f"GetCheckpoint error: {e}")
            return neuroshard_pb2.GetCheckpointResponse(
                success=False,
                error_message=str(e)
            )

    # ==================== PIPELINE PARALLELISM RPCs ====================

    def PipelineForward(self, request, context):
        """
        PRODUCTION-READY Pipeline Forward with:
        - Secure activation transfer (differential privacy + encryption)
        - PoNW proof submission for marketplace rewards
        - Full error handling and logging

        Used for distributed inference: Driver → Workers → Validator
        """
        if not hasattr(self.model, 'forward_pipeline') and not hasattr(self.model, 'forward'):
            return neuroshard_pb2.PipelineForwardResponse(
                success=False,
                error_message="Node does not support forward pass"
            )

        try:
            import numpy as np
            import hashlib

            logger.info(f"[WORKER/VALIDATOR] Received pipeline forward for request {request.request_id[:8]}...")

            # STEP 1: Validate checksum of received activations
            received_checksum = hashlib.sha256(request.hidden_states).hexdigest()
            logger.info(f"[SECURITY] Received activations checksum: {received_checksum[:16]}...")

            # Deserialize hidden states (detect dtype from shape)
            # 2D = input_ids (int64), 3D = hidden_states (float32)
            if len(request.hidden_shape) == 2:
                dtype = np.int64
                logger.info(f"[WORKER] Detected input_ids (2D shape), deserializing as int64")
            else:
                dtype = np.float32

            hidden_states = torch.from_numpy(
                np.frombuffer(request.hidden_states, dtype=dtype).copy()  # .copy() makes it writable
            ).reshape(list(request.hidden_shape))

            # CRITICAL: Move tensors to the same device as the model!
            # EC2 (CPU) sends tensors, Jetson (CUDA) needs them on GPU
            model_device = getattr(self.model, 'device', 'cpu')
            if hasattr(self.model, 'model') and hasattr(self.model.model, 'device'):
                model_device = self.model.model.device
            hidden_states = hidden_states.to(model_device)

            logger.info(f"[WORKER/VALIDATOR] Loaded activations: {hidden_states.shape} {hidden_states.dtype} on {hidden_states.device}")

            # STEP 2: Process through our layers
            # Deserialize attention mask if provided
            attention_mask = None
            if request.attention_mask:
                attention_mask = torch.from_numpy(
                    np.frombuffer(request.attention_mask, dtype=np.float32).copy()
                ).to(model_device)

            # Training Labels (if provided)
            training_labels = None
            if request.training_labels:
                training_labels = torch.from_numpy(
                    np.frombuffer(request.training_labels, dtype=np.int64).copy()
                ).to(model_device)

            # Forward through our layers
            if hasattr(self.model, 'forward_pipeline'):
                output, new_kv = self.model.forward_pipeline(
                    hidden_states=hidden_states,
                    attention_mask=attention_mask,
                    training_labels=training_labels,
                    session_id=request.session_id,
                    sender_url=request.sender_url,
                    use_cache=request.use_cache,
                )

                is_final = self.model.model.has_lm_head if hasattr(self.model, 'model') else False
            else:
                # Legacy fallback
                output = self.model.forward(hidden_states)
                new_kv = None
                is_final = True

            logger.info(f"[WORKER/VALIDATOR] Processed through layers: output shape {output.shape}, is_final={is_final}")

            # STEP 3: Submit PoNW proof for this work (earn NEURO!)
            if hasattr(self.model, 'ledger') and self.model.ledger and request.request_id:
                try:
                    from neuroshard.core.economics.ledger import PoNWProof, sign_proof
                    import uuid

                    # Count tokens processed
                    tokens_processed = output.shape[1] if len(output.shape) > 1 else output.shape[0]

                    # Determine our role
                    has_embedding = self.model.model.has_embedding if hasattr(self.model, 'model') else False
                    has_lm_head = self.model.model.has_lm_head if hasattr(self.model, 'model') else False
                    layers_held = len(self.model.my_layer_ids) if hasattr(self.model, 'my_layer_ids') else 1

                    proof = PoNWProof(
                        node_id=self.model.node_id if hasattr(self.model, 'node_id') else "unknown",
                        proof_type="inference",
                        timestamp=time.time(),
                        nonce=str(uuid.uuid4()),
                        tokens_processed=tokens_processed,
                        request_id=request.request_id,
                        has_embedding=has_embedding,
                        has_lm_head=has_lm_head,
                        layers_held=layers_held
                    )

                    # Sign and submit
                    signed_proof = sign_proof(proof, self.model.node_token if hasattr(self.model, 'node_token') else "")
                    success, reward, msg = self.model.ledger.process_proof(signed_proof)

                    if success:
                        role = "VALIDATOR" if has_lm_head else ("DRIVER" if has_embedding else "WORKER")
                        pct = "15%" if (has_lm_head or has_embedding) else "70%"
                        logger.info(f"[{role}] ✅ Proof submitted, earned {reward:.6f} NEURO ({pct} of pool)")
                    else:
                        logger.warning(f"[WORKER/VALIDATOR] ⚠️ Proof rejected: {msg}")

                except Exception as e:
                    logger.error(f"[WORKER/VALIDATOR] Failed to submit proof: {e}")

            # STEP 4: Serialize output and calculate checksum
            output_bytes = output.detach().cpu().numpy().tobytes()
            output_shape = list(output.shape)

            # Calculate checksum for integrity verification
            output_checksum = hashlib.sha256(output_bytes).hexdigest()
            logger.info(f"[SECURITY] Sending output with checksum: {output_checksum[:16]}...")

            response = neuroshard_pb2.PipelineForwardResponse(
                request_id=request.request_id,
                success=True,
                hidden_states=output_bytes,
                hidden_shape=output_shape,
                is_final=is_final,
            )

            # If final (validator), return logits
            if is_final:
                response.logits = output_bytes
                response.logits_shape = output_shape

                if hasattr(self.model, 'current_loss'):
                    response.loss = self.model.current_loss

                logger.info(f"[VALIDATOR] ✅ Final output generated, returning logits")

            return response

        except Exception as e:
            logger.error(f"[WORKER/VALIDATOR] Pipeline error: {e}")
            import traceback
            traceback.print_exc()
            return neuroshard_pb2.PipelineForwardResponse(
                request_id=request.request_id,
                success=False,
                error_message=str(e)
            )

    def PipelineBackward(self, request, context):
        """
        Backward pass: propagate gradients back to previous node.
        """
        if not hasattr(self.model, 'backward_pipeline'):
            return neuroshard_pb2.PipelineBackwardResponse(
                success=False,
                error_message="Node does not support backward pipeline"
            )

        try:
            import numpy as np

            # Deserialize gradients
            grad_output = torch.from_numpy(
                np.frombuffer(request.grad_output, dtype=np.float32).copy()
            ).reshape(list(request.grad_shape))

            # CRITICAL: Move gradients to the same device as the model!
            model_device = getattr(self.model, 'device', 'cpu')
            if hasattr(self.model, 'model') and hasattr(self.model.model, 'device'):
                model_device = self.model.model.device
            grad_output = grad_output.to(model_device)

            # Run backward pipeline
            self.model.backward_pipeline(
                grad_output=grad_output,
                session_id=request.session_id
            )

            return neuroshard_pb2.PipelineBackwardResponse(success=True)

        except Exception as e:
            logger.error(f"PipelineBackward error: {e}")
            return neuroshard_pb2.PipelineBackwardResponse(
                success=False,
                error_message=str(e)
            )

    def GetShardChunk(self, request, context):
        """
        Serve a chunk of a data shard to a peer (Data Swarm).
        """
        if not hasattr(self.model, 'swarm') or not self.model.swarm:
            return neuroshard_pb2.GetShardChunkResponse(
                success=False,
                error_message="Swarm not initialized on this node"
            )

        chunk_data = self.model.swarm.serve_chunk(request.shard_id, request.chunk_index)

        if chunk_data:
            return neuroshard_pb2.GetShardChunkResponse(
                success=True,
                data=chunk_data
            )
        else:
            return neuroshard_pb2.GetShardChunkResponse(
                success=False,
                error_message="Chunk not found"
            )

    def GetShardInfo(self, request, context):
        """Get shard information from this node."""
        if not self.is_neuro_node:
            return neuroshard_pb2.GetShardInfoResponse()

        try:
            if hasattr(self.model, 'get_shard_info'):
                info = self.model.get_shard_info()
            else:
                # Regular NeuroNode - full model
                info = {
                    "shard_id": 0,
                    "total_shards": 1,
                    "start_layer": 0,
                    "end_layer": self.model.model.config.num_layers if self.model.model else 12,
                    "has_embedding": True,
                    "has_lm_head": True,
                    "version": self.model.total_training_rounds,
                    "model_hash": self.model._get_model_hash() if hasattr(self.model, '_get_model_hash') else "",
                }

            return neuroshard_pb2.GetShardInfoResponse(
                shard_id=info.get("shard_id", 0),
                total_shards=info.get("total_shards", 1),
                start_layer=info.get("start_layer", 0),
                end_layer=info.get("end_layer", 12),
                has_embedding=info.get("has_embedding", True),
                has_lm_head=info.get("has_lm_head", True),
                version=info.get("version", 0),
                model_hash=info.get("model_hash", ""),
                available_memory_mb=info.get("available_memory_mb", 0),
                current_load=info.get("current_load", 0),
            )

        except Exception as e:
            logger.error(f"GetShardInfo error: {e}")
            return neuroshard_pb2.GetShardInfoResponse()

    def _call_peer(self, peer_url, original_req, output_tensor):
        """Call a peer node (legacy pipeline relay)."""
        from urllib.parse import urlparse
        from neuroshard.core.network.connection_pool import get_channel

        parsed = urlparse(peer_url)
        peer_ip = parsed.hostname
        peer_http_port = parsed.port or (443 if parsed.scheme == 'https' else 80)
        peer_grpc_addr = f"{peer_ip}:{peer_http_port + 1000}"

        channel = get_channel(peer_grpc_addr)
        stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

        fwd_req = neuroshard_pb2.InferenceRequest(
            session_id=original_req.session_id,
            request_id=original_req.request_id,
            tensor_data=serialize_tensor(output_tensor).encode('utf-8'),
            draft_tokens=original_req.draft_tokens,
            sender_reputation=original_req.sender_reputation,
            source_layer=getattr(self.model, 'end', 0)
        )

        return stub.UnaryInference(fwd_req)

    def _perform_audit(self, primary_peer, original_req, output_tensor):
        """Audit a peer by comparing with redundant peer."""
        redundant_peer = self.p2p.get_redundant_hop(getattr(self.model, 'end', 0), primary_peer)
        if not redundant_peer:
            return

        logger.debug(f"AUDITING: Checking {primary_peer} against {redundant_peer}...")

        try:
            res_redundant = self._call_peer(redundant_peer, original_req, output_tensor)

            if res_redundant.success:
                logger.debug(f"AUDIT: Redundant peer {redundant_peer} responded successfully.")
            else:
                logger.warning(f"AUDIT: Redundant peer {redundant_peer} failed: {res_redundant.error_message}")
        except Exception as e:
            logger.warning(f"Audit error: {e}")

    # ==================== SWARM RPCs (Phase 4) ====================

    def SwarmForward(self, request, context):
        """
        Swarm-style activation forwarding with async buffering.

        This is the async-first activation forwarding for the swarm architecture.
        Activations are queued in the inbound buffer and processed by ComputeEngine.

        Unlike PipelineForward (sync), this returns immediately after queueing.
        """
        try:
            import numpy as np
            from neuroshard.core.swarm.buffers import ActivationPacket, ActivationPriority

            # Deserialize activation
            hidden_states = torch.from_numpy(
                np.frombuffer(request.hidden_states, dtype=np.float32)
            ).reshape(list(request.hidden_shape))

            # Create activation packet
            packet = ActivationPacket(
                priority=request.priority or ActivationPriority.INFERENCE_NORMAL,
                timestamp=time.time(),
                session_id=request.session_id,
                micro_batch_id=request.micro_batch_id,
                tensor_data=hidden_states,
                source_node=request.source_node,
                target_layer=request.target_layer,
                is_backward=request.is_backward,
                requires_grad=request.requires_grad,
            )

            # Queue in inbound buffer (non-blocking)
            # Note: swarm_components contains SwarmComponents (router, buffers, etc.)
            # Not to be confused with swarm (DataSwarm for P2P downloads)
            inbound = getattr(self.model, 'swarm_components', None)
            inbound = inbound.inbound_buffer if inbound else None
            if inbound:
                success = inbound.put_nowait(packet)
                if success:
                    return neuroshard_pb2.SwarmForwardResponse(
                        success=True,
                        queued=True,
                        queue_position=len(inbound),
                        buffer_fill_rate=inbound.fill_rate,
                    )
                else:
                    # Buffer full - backpressure
                    return neuroshard_pb2.SwarmForwardResponse(
                        success=False,
                        error_message="Inbound buffer full (backpressure)",
                        buffer_fill_rate=1.0,
                    )
            else:
                return neuroshard_pb2.SwarmForwardResponse(
                    success=False,
                    error_message="Inbound buffer not initialized"
                )

        except Exception as e:
            logger.error(f"SwarmForward error: {e}")
            return neuroshard_pb2.SwarmForwardResponse(
                success=False,
                error_message=str(e)
            )

    def GetSwarmStatus(self, request, context):
        """
        Get swarm node status: buffer fill rates, capacity, etc.

        Used by peers to check node health before routing.
        """
        try:
            status = self.model.get_swarm_status()

            # Extract key metrics
            inbound_fill = 0.0
            outbound_fill = 0.0
            inbound_depth = 0
            outbound_depth = 0

            if "inbound_buffer" in status:
                inbound_fill = status["inbound_buffer"].get("fill_rate", 0.0)
                inbound_depth = status["inbound_buffer"].get("queue_size", 0)
            if "outbound_buffer" in status:
                outbound_fill = status["outbound_buffer"].get("fill_rate", 0.0)
                outbound_depth = status["outbound_buffer"].get("queue_size", 0)

            # Get layer range
            layer_start = min(self.model.my_layer_ids) if self.model.my_layer_ids else 0
            layer_end = max(self.model.my_layer_ids) + 1 if self.model.my_layer_ids else 0

            return neuroshard_pb2.SwarmStatusResponse(
                node_id=self.model.node_id,
                layer_start=layer_start,
                layer_end=layer_end,
                inbound_fill_rate=inbound_fill,
                outbound_fill_rate=outbound_fill,
                inbound_queue_depth=inbound_depth,
                outbound_queue_depth=outbound_depth,
                is_accepting_activations=inbound_fill < 0.95,
            )

        except Exception as e:
            logger.error(f"GetSwarmStatus error: {e}")
            return neuroshard_pb2.SwarmStatusResponse(error_message=str(e))

    def UpdatePeerCapacity(self, request, context):
        """
        Receive capacity update from peer (TCP fallback for UDP heartbeat).

        Used when UDP heartbeats fail (firewalls, etc).
        """
        try:
            # Update router with peer info
            # Note: swarm_components contains SwarmComponents (router, buffers, etc.)
            router = getattr(self.model, 'swarm_components', None)
            router = router.swarm_router if router else None
            if router:
                from neuroshard.core.swarm.heartbeat import CapacityBitmask

                capacity = CapacityBitmask(
                    node_id=request.node_id,
                    timestamp=time.time(),
                    available_memory_mb=request.available_memory_mb,
                    queue_depth=request.queue_depth,
                    layer_range=(request.layer_start, request.layer_end),
                    gpu_utilization=request.gpu_utilization,
                    network_saturation=request.network_saturation,
                    is_training=request.is_training,
                    is_accepting_inference=request.is_accepting_inference,
                    is_accepting_activations=request.is_accepting_activations,
                    grpc_addr=request.grpc_addr,
                )

                router.update_peer_from_heartbeat(capacity)

                return neuroshard_pb2.UpdatePeerCapacityResponse(accepted=True)

            return neuroshard_pb2.UpdatePeerCapacityResponse(accepted=False)

        except Exception as e:
            logger.error(f"UpdatePeerCapacity error: {e}")
            return neuroshard_pb2.UpdatePeerCapacityResponse(accepted=False)

    def GetDiLoCoStatus(self, request, context):
        """
        Get DiLoCo training status.

        Returns inner step count, sync progress, etc.
        """
        try:
            progress = self.model.get_diloco_progress()

            return neuroshard_pb2.DiLoCoStatusResponse(
                enabled=progress.get("enabled", False),
                inner_step_count=progress.get("inner_step_count", 0),
                inner_steps_total=progress.get("inner_steps_total", 500),
                progress=progress.get("progress", 0.0),
                outer_step_count=progress.get("outer_step_count", 0),
                should_sync=progress.get("should_sync", False),
            )

        except Exception as e:
            logger.error(f"GetDiLoCoStatus error: {e}")
            return neuroshard_pb2.DiLoCoStatusResponse(enabled=False)

def serve_grpc(port: int, model, p2p: P2PManager, swap_controller=None):
    """Start the gRPC server."""
    global GRPC_SERVER

    # Server options for P2P network - be lenient with keepalive pings
    # This is critical for decentralized networks where nodes ping frequently
    # IMPORTANT: Increase message size for activation tensors in pipeline training!
    # Activation size = batch_size * seq_len * hidden_dim * 4 bytes
    # For batch=4, seq=512, hidden=512: ~4MB, but we need headroom
    MAX_MESSAGE_SIZE = 64 * 1024 * 1024  # 64MB for large batches/sequences
    options = [
        ('grpc.keepalive_time_ms', 10000),  # Send keepalive every 10s
        ('grpc.keepalive_timeout_ms', 5000),  # 5s timeout for response
        ('grpc.keepalive_permit_without_calls', True),  # Allow pings without active RPCs
        ('grpc.http2.min_recv_ping_interval_without_data_ms', 5000),  # Accept pings every 5s
        ('grpc.http2.max_ping_strikes', 0),  # Don't penalize frequent pings
        ('grpc.max_receive_message_length', MAX_MESSAGE_SIZE),  # For receiving activations
        ('grpc.max_send_message_length', MAX_MESSAGE_SIZE),  # For sending responses
    ]

    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10), options=options)
    neuroshard_pb2_grpc.add_NeuroShardServiceServicer_to_server(
        NeuroShardServiceServicer(model, p2p, swap_controller), server
    )

    grpc_port = port + 1000
    server.add_insecure_port(f'[::]:{grpc_port}')
    server.start()
    GRPC_SERVER = server

    logger.info(f"gRPC Server started on port {grpc_port}")

    try:
        # Wait until server is stopped externally via stop_grpc()
        server.wait_for_termination()
    except KeyboardInterrupt:
        server.stop(0)
    finally:
        GRPC_SERVER = None
        logger.info(f"gRPC Server on port {grpc_port} terminated")


def start_grpc_background(port: int, model, p2p: P2PManager, swap_controller=None):
    """Start gRPC server in background thread."""
    t = threading.Thread(target=serve_grpc, args=(port, model, p2p, swap_controller), daemon=True)
    t.start()


def stop_grpc(timeout: float = 5.0):
    """Stop the gRPC server gracefully."""
    global GRPC_SERVER
    if GRPC_SERVER is not None:
        logger.info("Stopping gRPC server...")
        try:
            # stop() returns an event that is set when shutdown is complete
            event = GRPC_SERVER.stop(grace=timeout)
            event.wait(timeout=timeout)
            logger.info("gRPC server stopped")
        except Exception as e:
            logger.warning(f"Error stopping gRPC server: {e}")
        finally:
            GRPC_SERVER = None