nexaroa 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
protos/neuroshard.proto
ADDED
|
@@ -0,0 +1,651 @@
|
|
|
1
|
+
// NeuroShard wire protocol.
//
// Defines the gRPC surface for the NeuroShard P2P network: inference
// routing, proof/transaction/stake gossip, distributed-training gradient
// exchange, pipeline and tensor parallelism, DHT lookups, and swarm
// routing/capacity signaling.
syntax = "proto3";

package neuroshard;
|
|
4
|
+
|
|
5
|
+
// Service definition for Node-to-Node and Client-to-Node communication.
service NeuroShardService {
  // Streaming inference: Client/Node sends a stream of requests,
  // receives a stream of tokens/updates.
  rpc StreamInference (stream InferenceRequest) returns (stream InferenceResponse);

  // Simple unary call for single-step inference (legacy/fallback).
  rpc UnaryInference (InferenceRequest) returns (InferenceResponse);

  // Gossip weights for training.
  rpc GetWeights (WeightRequest) returns (WeightResponse);

  // Gossip Proof of Uptime (and other proof types; see GossipProofRequest.proof_type).
  rpc GossipProof (GossipProofRequest) returns (GossipProofResponse);

  // Gossip a ledger transaction.
  rpc GossipTransaction (GossipTransactionRequest) returns (GossipTransactionResponse);

  // Gossip a stake update.
  rpc GossipStake (GossipStakeRequest) returns (GossipStakeResponse);

  // Request proof validation from validators.
  rpc RequestProofValidation (ProofValidationRequest) returns (ProofValidationResponse);

  // Gossip a validation vote.
  rpc GossipValidationVote (ValidationVoteRequest) returns (ValidationVoteResponse);

  // --- Distributed Training RPCs ---

  // Gossip gradients for distributed training.
  rpc GossipGradient (GossipGradientRequest) returns (GossipGradientResponse);

  // Request a checkpoint from a peer.
  rpc GetCheckpoint (GetCheckpointRequest) returns (GetCheckpointResponse);

  // Get checkpoint info (version, hash) without downloading.
  rpc GetCheckpointInfo (GetCheckpointInfoRequest) returns (GetCheckpointInfoResponse);

  // --- Pipeline Parallelism RPCs ---

  // Forward hidden states through this node's layers.
  rpc PipelineForward (PipelineForwardRequest) returns (PipelineForwardResponse);

  // Backward pass: propagate gradients back to the previous node.
  rpc PipelineBackward (PipelineBackwardRequest) returns (PipelineBackwardResponse);

  // Get shard info from this node.
  rpc GetShardInfo (GetShardInfoRequest) returns (GetShardInfoResponse);

  // --- Data Swarm RPCs (P2P Dataset) ---

  // Request a chunk of a data shard.
  rpc GetShardChunk (GetShardChunkRequest) returns (GetShardChunkResponse);

  // --- DHT RPCs ---
  rpc DHTPing (DHTPingRequest) returns (DHTPingResponse);
  rpc DHTStore (DHTStoreRequest) returns (DHTStoreResponse);
  rpc DHTFindNode (DHTFindNodeRequest) returns (DHTFindNodeResponse);
  rpc DHTFindValue (DHTFindValueRequest) returns (DHTFindValueResponse);

  // --- Phase 4: Tensor Parallelism RPCs ---

  // Exchange tensor chunks during ring all-reduce.
  rpc TensorExchange (TensorExchangeRequest) returns (TensorExchangeResponse);

  // Send partial results for async aggregation.
  rpc SendPartialResult (PartialResultRequest) returns (PartialResultResponse);

  // Announce tensor shard availability.
  rpc AnnounceTensorShard (AnnounceShardRequest) returns (AnnounceShardResponse);

  // Find peer shards for all-reduce coordination.
  rpc FindTensorShardPeers (FindShardPeersRequest) returns (FindShardPeersResponse);

  // --- Phase 4: Model Registry RPCs ---

  // List available models in the network.
  rpc ListModels (ListModelsRequest) returns (ListModelsResponse);

  // Get status of a specific model.
  rpc GetModelStatus (GetModelStatusRequest) returns (GetModelStatusResponse);

  // --- Swarm Routing RPCs (Phase 2) ---

  // Async activation forward (non-blocking, buffers locally).
  rpc SwarmForward (SwarmForwardRequest) returns (SwarmForwardResponse);

  // Get swarm node status (buffer fill rates, capacity).
  rpc GetSwarmStatus (SwarmStatusRequest) returns (SwarmStatusResponse);

  // Update peer capacity (TCP fallback for UDP heartbeat).
  rpc UpdatePeerCapacity (UpdatePeerCapacityRequest) returns (UpdatePeerCapacityResponse);
}
|
|
97
|
+
|
|
98
|
+
// One inference step routed between nodes (used by both StreamInference
// and UnaryInference).
message InferenceRequest {
  // Conversation/session identifier; groups requests that share state.
  string session_id = 1;
  // Unique ID for this request; echoed in InferenceResponse.request_id.
  string request_id = 2;

  // Serialized tensor data (compressed/quantized).
  bytes tensor_data = 3;

  // Speculative decoding: draft token IDs proposed for verification.
  repeated int32 draft_tokens = 4;

  // Sender's reputation score, as reported by the sender.
  float sender_reputation = 5;

  // Routing metadata: layer index the sender produced this tensor at.
  int32 source_layer = 6;
}

// Result of a single inference step.
message InferenceResponse {
  // Mirrors InferenceRequest.request_id.
  string request_id = 1;

  // Status: success flag plus human-readable error on failure.
  bool success = 2;
  string error_message = 3;

  // Output data.
  bytes tensor_data = 4; // Serialized logits or activations

  // Speculative decoding results.
  bool is_speculative = 5;
  // Number of draft tokens accepted as valid.
  int32 valid_count = 6;
  // Next token ID chosen after verification.
  int32 next_token = 7;
}
|
|
129
|
+
|
|
130
|
+
// Request a peer's weights for a given shard range (weight gossip).
message WeightRequest {
  // Identifies which slice of the model is wanted; format defined by
  // the caller. NOTE(review): presumably "start-end" layer range — confirm.
  string shard_range = 1;
}

// Weights returned for a WeightRequest.
message WeightResponse {
  bytes weights_data = 1; // Serialized state_dict
}
|
|
137
|
+
|
|
138
|
+
// A node's signed work proof, gossiped across the network.
// proof_type selects which of the claim fields are meaningful.
message GossipProofRequest {
  string node_id = 1;
  // NOTE(review): double timestamp — presumably Unix epoch seconds; confirm.
  double timestamp = 2;
  double uptime = 3;            // uptime_seconds
  string signature = 4;
  int64 token_count = 5;        // tokens_processed (inference)
  int32 training_batches = 6;   // training_batches (training)
  int32 layers_held = 7;        // Number of layers this node holds
  bool has_embedding = 8;       // Is this node a Driver (has embedding layer)?
  bool has_lm_head = 9;         // Is this node a Validator (has LM head)?
  string proof_type = 10;       // "UPTIME", "INFERENCE", "TRAINING", "DATA"
  string nonce = 11;            // Unique nonce for replay prevention
  string public_key = 12;       // ECDSA public key for trustless verification
  int32 data_samples = 13;      // Data samples processed (for canonical_payload)
  string model_hash = 14;       // Model hash (for canonical_payload)
  string request_id = 15;       // Request ID for inference proofs (for canonical_payload)
  double current_loss = 16;     // Current training loss (for aggregation on website)
}

// Ack for a gossiped proof.
message GossipProofResponse {
  bool accepted = 1;
}
|
|
160
|
+
|
|
161
|
+
// A signed ledger transaction, gossiped peer-to-peer.
message GossipTransactionRequest {
  string sender_id = 1;
  string recipient_id = 2;
  // Transfer amount. NOTE(review): floating-point money is lossy; value
  // semantics defined by the ledger implementation — confirm there.
  double amount = 3;
  double timestamp = 4;
  // Sender's signature over the transaction.
  string signature = 5;
  // Transaction hash used for deduplication/identification.
  string tx_hash = 6;
}

// Ack for a gossiped transaction; reason is set when rejected.
message GossipTransactionResponse {
  bool accepted = 1;
  string reason = 2;
}
|
|
174
|
+
|
|
175
|
+
// Stake gossip - sync stakes across the P2P network.
message GossipStakeRequest {
  string node_id = 1;       // Node that staked (SHA256(public_key)[:32])
  double amount = 2;        // Total staked amount
  double locked_until = 3;  // Lock expiry timestamp
  double timestamp = 4;     // When this stake update occurred
  string signature = 5;     // ECDSA signature for verification
  string public_key = 6;    // Compressed public key (hex) - REQUIRED for verification
}

// Ack for a gossiped stake update; reason is set when rejected.
message GossipStakeResponse {
  bool accepted = 1;
  string reason = 2;
}
|
|
189
|
+
|
|
190
|
+
// Proof Validation Request - ask validators to validate a proof.
// Carries the claimed metrics so the validator can re-check them.
message ProofValidationRequest {
  string proof_signature = 1;  // Signature of the proof to validate
  string submitter_id = 2;     // Node that submitted the proof
  double timestamp = 3;        // Proof timestamp
  double uptime_seconds = 4;   // Claimed uptime
  int64 tokens_processed = 5;  // Claimed tokens
  int32 training_batches = 6;  // Claimed training batches
  int32 layers_held = 7;       // Claimed layers
  bool has_embedding = 8;      // Is Driver
  bool has_lm_head = 9;        // Is Validator
  string proof_type = 10;      // UPTIME, INFERENCE, TRAINING, DATA
  string nonce = 11;           // Unique nonce
}

// Whether a validator accepted the validation request (not the verdict itself;
// votes arrive via GossipValidationVote).
message ProofValidationResponse {
  bool accepted = 1;           // Whether validation request was accepted
  string reason = 2;           // Reason if not accepted
  string validator_id = 3;     // Validator that will process
  double validator_stake = 4;  // Validator's stake
}
|
|
211
|
+
|
|
212
|
+
// Validation Vote - a validator's stake-weighted vote on a proof.
message ValidationVoteRequest {
  string proof_signature = 1;  // Proof being voted on
  string validator_id = 2;     // Validator casting vote
  double validator_stake = 3;  // Validator's stake (for weighting)
  bool vote = 4;               // true = valid, false = invalid
  string details = 5;          // Optional validation details
  double timestamp = 6;        // When vote was cast
  string signature = 7;        // Validator's signature on vote
}

// Ack for a vote, plus the receiver's current consensus tally.
message ValidationVoteResponse {
  bool accepted = 1;
  string reason = 2;
  double total_valid_stake = 3;    // Current valid stake tally
  double total_invalid_stake = 4;  // Current invalid stake tally
  bool consensus_reached = 5;      // Whether consensus has been reached
  bool consensus_result = 6;       // The consensus result (if reached)
}
|
|
231
|
+
|
|
232
|
+
// --- Distributed Training Messages ---

// Compressed per-layer gradients for one training round, gossiped with a
// TTL so peers can forward a bounded number of hops.
message GossipGradientRequest {
  string node_id = 1;     // Sender node ID
  int32 round_id = 2;     // Training round ID
  string model_hash = 3;  // Model hash for consistency check
  double timestamp = 4;

  int32 batch_size = 5;   // Batch size used for this gradient
  double loss = 6;        // Training loss

  // Compressed gradients per layer, keyed by layer name.
  map<string, bytes> layer_gradients = 7;

  string signature = 8;   // Proof signature
  int32 ttl = 9;          // Time-to-live for forwarding
}

// Ack for gossiped gradients; current_round lets a lagging sender resync.
message GossipGradientResponse {
  bool accepted = 1;
  string reason = 2;
  int32 current_round = 3;  // Receiver's current round (for sync)
}
|
|
255
|
+
|
|
256
|
+
// Request a model checkpoint from a peer. Both filters are optional;
// an empty request asks for the peer's current checkpoint.
message GetCheckpointRequest {
  string model_hash = 1;  // Optional: specific checkpoint hash
  int32 min_version = 2;  // Optional: minimum version number
}

// Checkpoint payload. NOTE(review): checkpoint_data is a single bytes
// field — large checkpoints may exceed gRPC message-size limits; confirm
// chunking is handled elsewhere.
message GetCheckpointResponse {
  bool success = 1;
  string error_message = 2;

  int32 version = 3;       // Checkpoint version (training round)
  string model_hash = 4;   // Model hash
  string phase = 5;        // Model phase (bootstrap, early, etc.)

  bytes checkpoint_data = 6;  // Serialized checkpoint (compressed)
  int64 total_size = 7;       // Total size in bytes
}

// Empty - just get current info.
message GetCheckpointInfoRequest {
}

// Checkpoint metadata without the payload.
message GetCheckpointInfoResponse {
  int32 version = 1;      // Current training round
  string model_hash = 2;  // Current model hash
  string phase = 3;       // Model phase
  int64 params = 4;       // Number of parameters
  double loss = 5;        // Current loss
}
|
|
284
|
+
|
|
285
|
+
// --- Pipeline Parallelism Messages ---

// Forward-pass hop: hidden states handed from one pipeline shard to the next.
message PipelineForwardRequest {
  string session_id = 1;
  string request_id = 2;

  // Hidden states from the previous node.
  bytes hidden_states = 3;          // Serialized tensor [batch, seq, hidden]
  repeated int64 hidden_shape = 4;  // Shape of hidden states

  // Attention mask (optional).
  bytes attention_mask = 5;

  // Position IDs.
  bytes position_ids = 6;

  // KV cache (optional, for incremental decoding).
  repeated bytes past_key_values = 7;
  bool use_cache = 8;

  // Shard routing info.
  int32 source_shard = 9;   // Shard that sent this
  int32 target_shard = 10;  // Shard to process (this node)

  // Training data (only sent by Driver).
  bytes training_labels = 11;  // Serialized labels [batch, seq]

  // Backward pass routing.
  string sender_url = 12;  // URL to send gradients back to
}

// Result of one pipeline forward hop. Logits/loss are only populated by
// the final shard (is_final = true).
message PipelineForwardResponse {
  string request_id = 1;
  bool success = 2;
  string error_message = 3;

  // Output hidden states.
  bytes hidden_states = 4;
  repeated int64 hidden_shape = 5;

  // Updated KV cache.
  repeated bytes past_key_values = 6;

  // If this is the final shard, include logits.
  bool is_final = 7;
  bytes logits = 8;  // Only if is_final
  repeated int64 logits_shape = 9;

  // Training feedback.
  double loss = 10;  // Returned by Validator
}
|
|
336
|
+
|
|
337
|
+
// Backward-pass hop: gradients flowing to the previous pipeline shard.
message PipelineBackwardRequest {
  string session_id = 1;
  string request_id = 2;

  // Gradients w.r.t. the OUTPUT of the previous layer.
  bytes grad_output = 3;  // Serialized tensor
  repeated int64 grad_shape = 4;

  int32 target_shard = 5;  // Shard we are sending back TO
}

// Ack for a backward hop.
message PipelineBackwardResponse {
  bool success = 1;
  string error_message = 2;
}
|
|
352
|
+
|
|
353
|
+
// Empty - just get current shard info.
message GetShardInfoRequest {
}

// A node's pipeline-shard assignment, model version, and capacity.
message GetShardInfoResponse {
  int32 shard_id = 1;
  int32 total_shards = 2;
  // Layer range held by this node. NOTE(review): whether end_layer is
  // inclusive or exclusive is not visible here — confirm against server code.
  int32 start_layer = 3;
  int32 end_layer = 4;
  bool has_embedding = 5;
  bool has_lm_head = 6;

  int32 version = 7;
  string model_hash = 8;

  // Capacity.
  float available_memory_mb = 9;
  float current_load = 10;
}
|
|
372
|
+
|
|
373
|
+
// --- Data Swarm Messages ---

// Request one chunk of a P2P dataset shard.
message GetShardChunkRequest {
  int32 shard_id = 1;      // Which shard (e.g., 42)
  int32 chunk_index = 2;   // Which 1MB chunk
  string requester_id = 3; // Who is asking
}

// One chunk of a dataset shard, plus shard totals so the requester can
// plan remaining downloads.
message GetShardChunkResponse {
  bool success = 1;
  bytes data = 2;           // The chunk data
  string error_message = 3;
  int64 total_size = 4;     // Total size of shard
  int32 total_chunks = 5;   // Total chunks in shard
}
|
|
388
|
+
|
|
389
|
+
// --- DHT Messages ---

// Contact info for a DHT node.
message DHTNodeInfo {
  bytes node_id = 1;  // 20-byte ID (160 bits)
  string ip = 2;
  int32 port = 3;
}

// Liveness probe; also lets the responder learn the sender's contact info.
message DHTPingRequest {
  DHTNodeInfo sender = 1;
}

message DHTPingResponse {
  DHTNodeInfo responder = 1;
}

// Store a key/value pair on the receiving node.
message DHTStoreRequest {
  DHTNodeInfo sender = 1;
  bytes key = 2;
  string value = 3;  // For now, simple string (e.g., "ip:port")
}

message DHTStoreResponse {
  DHTNodeInfo responder = 1;
  bool success = 2;
}

// Ask for the K closest known nodes to target_id.
message DHTFindNodeRequest {
  DHTNodeInfo sender = 1;
  bytes target_id = 2;
}

message DHTFindNodeResponse {
  DHTNodeInfo responder = 1;
  repeated DHTNodeInfo nodes = 2;  // K closest nodes
}

// Look up a value by key; falls back to returning closer nodes.
message DHTFindValueRequest {
  DHTNodeInfo sender = 1;
  bytes key = 2;
}

message DHTFindValueResponse {
  DHTNodeInfo responder = 1;
  string value = 2;                // If found
  repeated DHTNodeInfo nodes = 3;  // If not found (K closest nodes)
  bool found = 4;
}
|
|
437
|
+
|
|
438
|
+
// --- Phase 4: Tensor Parallelism Messages ---

// Tensor exchange for all-reduce operations: one chunk transfer in one
// step of a ring all-reduce.
message TensorExchangeRequest {
  string operation_id = 1;   // Unique operation ID
  int32 layer_id = 2;        // Layer being processed
  int32 step = 3;            // Ring all-reduce step
  int32 chunk_idx = 4;       // Chunk index in ring

  int32 sender_shard_id = 5; // Sender's tensor shard ID
  int32 total_shards = 6;    // Total tensor shards

  bytes tensor_data = 7;           // Serialized tensor chunk
  repeated int64 tensor_shape = 8; // Original tensor shape
  string dtype = 9;                // Tensor dtype (float32, float16, etc.)

  string reduce_op = 10;  // Reduction operation (sum, mean, max, min)
}

// Ack for a tensor exchange; may carry a return chunk for bidirectional
// exchange.
message TensorExchangeResponse {
  string operation_id = 1;
  bool success = 2;
  string error_message = 3;

  bytes tensor_data = 4;  // Response tensor (for bidirectional exchange)
  repeated int64 tensor_shape = 5;
}
|
|
465
|
+
|
|
466
|
+
// Partial result aggregation (for async all-reduce): one shard's
// contribution, pushed to an aggregator.
message PartialResultRequest {
  string session_id = 1;
  string operation_id = 2;
  int32 layer_id = 3;

  int32 sender_shard_id = 4;
  int32 total_shards = 5;

  bytes partial_tensor = 6;
  repeated int64 tensor_shape = 7;
  string dtype = 8;

  bool is_final = 9;  // True if this is the final partial
}

// Ack for a partial; once every shard has reported, the combined tensor
// is returned inline.
message PartialResultResponse {
  bool accepted = 1;
  string error_message = 2;

  // If all partials received, return combined result.
  bool all_received = 3;
  bytes combined_tensor = 4;
  repeated int64 tensor_shape = 5;
}
|
|
491
|
+
|
|
492
|
+
// Model registry messages.

// Static description of a model known to the network registry.
message ModelInfo {
  string model_id = 1;
  string name = 2;
  string family = 3;   // gpt2, llama, mistral, mixtral
  string version = 4;

  // Architecture dimensions.
  int32 num_layers = 5;
  int32 hidden_dim = 6;
  int32 num_heads = 7;
  int32 vocab_size = 8;

  // Memory footprint.
  float total_size_mb = 9;
  float layer_size_mb = 10;

  // Supported parallelism strategies.
  bool supports_tensor_parallel = 11;
  bool supports_pipeline_parallel = 12;

  float required_stake = 13;  // Minimum NEURO stake to serve
  bool approved = 14;         // Community approved
}

// Filterable listing query for the model registry.
message ListModelsRequest {
  bool approved_only = 1;
  string family_filter = 2;  // Optional family filter
}

message ListModelsResponse {
  repeated ModelInfo models = 1;
}

// Live network-wide serving status for one model.
message ModelNetworkStatus {
  string model_id = 1;
  int32 total_nodes = 2;
  bool is_fully_covered = 3;
  bool is_inference_ready = 4;
  float avg_latency_ms = 5;
  float total_stake = 6;
  map<int32, int32> layer_coverage = 7;  // layer_id -> node_count
}

message GetModelStatusRequest {
  string model_id = 1;
}

message GetModelStatusResponse {
  ModelNetworkStatus status = 1;
}
|
|
540
|
+
|
|
541
|
+
// Tensor shard announcement: advertises that a node serves one tensor
// shard of one layer, with contact details and capacity.
message TensorShardAnnouncement {
  string model_id = 1;
  int32 layer_id = 2;
  int32 shard_id = 3;
  int32 total_shards = 4;

  // Contact endpoints.
  string node_url = 5;
  string grpc_addr = 6;

  // Capacity signals for peer selection.
  float available_memory_mb = 7;
  float current_load = 8;  // 0-1 load factor
}

message AnnounceShardRequest {
  TensorShardAnnouncement shard = 1;
}

message AnnounceShardResponse {
  bool accepted = 1;
  string error_message = 2;
}

// Find tensor shard peers for all-reduce coordination.
message FindShardPeersRequest {
  string model_id = 1;
  int32 layer_id = 2;
  int32 total_shards = 3;
  int32 exclude_shard_id = 4;  // Our own shard ID to exclude
}

message FindShardPeersResponse {
  repeated TensorShardAnnouncement peers = 1;
}
|
|
575
|
+
|
|
576
|
+
// --- Swarm Routing Messages (Phase 2) ---

// Async activation transfer between swarm nodes. Used for both forward
// activations and (when is_backward is set) backward gradients.
message SwarmForwardRequest {
  string session_id = 1;
  string request_id = 2;

  // Activation data.
  bytes hidden_states = 3;
  repeated int64 hidden_shape = 4;

  // Routing.
  int32 target_layer = 5;
  string sender_url = 6;

  // Priority (0 = highest).
  int32 priority = 7;
  int32 micro_batch_id = 8;
  bool is_backward = 9;

  // Training metadata.
  bool requires_grad = 10;
  bytes grad_output = 11;
  repeated int64 grad_shape = 12;
}

// Ack for a swarm transfer; buffer_depth lets the sender apply backpressure.
message SwarmForwardResponse {
  string request_id = 1;
  bool success = 2;
  string error_message = 3;
  int32 buffer_depth = 4;  // For backpressure signaling
}
|
|
607
|
+
|
|
608
|
+
// Empty - just get status.
message SwarmStatusRequest {
}

// Snapshot of a swarm node: layer range, buffer pressure, capacity,
// and compute progress.
message SwarmStatusResponse {
  string node_id = 1;
  int32 layer_start = 2;
  int32 layer_end = 3;

  // Buffer status (fill rates presumably 0-1; confirm against buffers.py).
  float inbound_fill_rate = 4;
  float outbound_fill_rate = 5;
  int32 inbound_queue_depth = 6;
  int32 outbound_queue_depth = 7;

  // Capacity.
  int32 available_memory_mb = 8;
  float gpu_utilization = 9;

  // Status flags.
  bool is_training = 10;
  bool is_accepting_activations = 11;

  // Compute stats.
  int64 total_steps = 12;
  float local_only_rate = 13;

  string error_message = 14;
}
|
|
637
|
+
|
|
638
|
+
// Push a node's current capacity to a peer over TCP/gRPC (fallback for
// the UDP heartbeat path).
message UpdatePeerCapacityRequest {
  string node_id = 1;
  string grpc_addr = 2;
  int32 layer_start = 3;
  int32 layer_end = 4;
  int32 queue_depth = 5;
  int32 available_memory_mb = 6;
  float gpu_utilization = 7;
}

message UpdatePeerCapacityResponse {
  bool success = 1;
  string error_message = 2;
}
|