nexaroa-0.0.111-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
protos/neuroshard.proto
@@ -0,0 +1,651 @@
+ syntax = "proto3";
+
+ package neuroshard;
+
+ // Service definition for Node-to-Node and Client-to-Node communication
+ service NeuroShardService {
+   // Streaming inference: Client/Node sends a stream of requests, receives a stream of tokens/updates
+   rpc StreamInference (stream InferenceRequest) returns (stream InferenceResponse);
+
+   // Simple unary call for single-step (legacy/fallback)
+   rpc UnaryInference (InferenceRequest) returns (InferenceResponse);
+
+   // Gossip weights for training
+   rpc GetWeights (WeightRequest) returns (WeightResponse);
+
+   // Gossip Proof of Uptime
+   rpc GossipProof (GossipProofRequest) returns (GossipProofResponse);
+
+   // Gossip Transaction
+   rpc GossipTransaction (GossipTransactionRequest) returns (GossipTransactionResponse);
+
+   // Gossip Stake Update
+   rpc GossipStake (GossipStakeRequest) returns (GossipStakeResponse);
+
+   // Request Proof Validation from Validators
+   rpc RequestProofValidation (ProofValidationRequest) returns (ProofValidationResponse);
+
+   // Gossip Validation Vote
+   rpc GossipValidationVote (ValidationVoteRequest) returns (ValidationVoteResponse);
+
+   // --- Distributed Training RPCs ---
+
+   // Gossip gradients for distributed training
+   rpc GossipGradient (GossipGradientRequest) returns (GossipGradientResponse);
+
+   // Request checkpoint from peer
+   rpc GetCheckpoint (GetCheckpointRequest) returns (GetCheckpointResponse);
+
+   // Get checkpoint info (version, hash) without downloading
+   rpc GetCheckpointInfo (GetCheckpointInfoRequest) returns (GetCheckpointInfoResponse);
+
+   // --- Pipeline Parallelism RPCs ---
+
+   // Forward hidden states through this node's layers
+   rpc PipelineForward (PipelineForwardRequest) returns (PipelineForwardResponse);
+
+   // Backward pass: propagate gradients back to previous node
+   rpc PipelineBackward (PipelineBackwardRequest) returns (PipelineBackwardResponse);
+
+   // Get shard info from this node
+   rpc GetShardInfo (GetShardInfoRequest) returns (GetShardInfoResponse);
+
+   // --- Data Swarm RPCs (P2P Dataset) ---
+
+   // Request a chunk of a data shard
+   rpc GetShardChunk (GetShardChunkRequest) returns (GetShardChunkResponse);
+
+   // --- DHT RPCs ---
+   rpc DHTPing (DHTPingRequest) returns (DHTPingResponse);
+   rpc DHTStore (DHTStoreRequest) returns (DHTStoreResponse);
+   rpc DHTFindNode (DHTFindNodeRequest) returns (DHTFindNodeResponse);
+   rpc DHTFindValue (DHTFindValueRequest) returns (DHTFindValueResponse);
+
+   // --- Phase 4: Tensor Parallelism RPCs ---
+
+   // Exchange tensor chunks during ring all-reduce
+   rpc TensorExchange (TensorExchangeRequest) returns (TensorExchangeResponse);
+
+   // Send partial results for async aggregation
+   rpc SendPartialResult (PartialResultRequest) returns (PartialResultResponse);
+
+   // Announce tensor shard availability
+   rpc AnnounceTensorShard (AnnounceShardRequest) returns (AnnounceShardResponse);
+
+   // Find peer shards for all-reduce coordination
+   rpc FindTensorShardPeers (FindShardPeersRequest) returns (FindShardPeersResponse);
+
+   // --- Phase 4: Model Registry RPCs ---
+
+   // List available models in network
+   rpc ListModels (ListModelsRequest) returns (ListModelsResponse);
+
+   // Get status of a specific model
+   rpc GetModelStatus (GetModelStatusRequest) returns (GetModelStatusResponse);
+
+   // --- Swarm Routing RPCs (Phase 2) ---
+
+   // Async activation forward (non-blocking, buffers locally)
+   rpc SwarmForward (SwarmForwardRequest) returns (SwarmForwardResponse);
+
+   // Get swarm node status (buffer fill rates, capacity)
+   rpc GetSwarmStatus (SwarmStatusRequest) returns (SwarmStatusResponse);
+
+   // Update peer capacity (TCP fallback for UDP heartbeat)
+   rpc UpdatePeerCapacity (UpdatePeerCapacityRequest) returns (UpdatePeerCapacityResponse);
+ }
+
+ message InferenceRequest {
+   string session_id = 1;
+   string request_id = 2;
+
+   // Serialized tensor data (compressed/quantized)
+   bytes tensor_data = 3;
+
+   // Speculative decoding
+   repeated int32 draft_tokens = 4;
+
+   float sender_reputation = 5;
+
+   // Routing metadata
+   int32 source_layer = 6;
+ }
+
+ message InferenceResponse {
+   string request_id = 1;
+
+   // Status codes
+   bool success = 2;
+   string error_message = 3;
+
+   // Output data
+   bytes tensor_data = 4; // Serialized logits or activations
+
+   // Speculative decoding results
+   bool is_speculative = 5;
+   int32 valid_count = 6;
+   int32 next_token = 7;
+ }
+
+ message WeightRequest {
+   string shard_range = 1;
+ }
+
+ message WeightResponse {
+   bytes weights_data = 1; // Serialized state_dict
+ }
+
+ message GossipProofRequest {
+   string node_id = 1;
+   double timestamp = 2;
+   double uptime = 3; // uptime_seconds
+   string signature = 4;
+   int64 token_count = 5; // tokens_processed (inference)
+   int32 training_batches = 6; // training_batches (training)
+   int32 layers_held = 7; // Number of layers this node holds
+   bool has_embedding = 8; // Is this node a Driver (has embedding layer)?
+   bool has_lm_head = 9; // Is this node a Validator (has LM head)?
+   string proof_type = 10; // "UPTIME", "INFERENCE", "TRAINING", "DATA"
+   string nonce = 11; // Unique nonce for replay prevention
+   string public_key = 12; // ECDSA public key for trustless verification
+   int32 data_samples = 13; // Data samples processed (for canonical_payload)
+   string model_hash = 14; // Model hash (for canonical_payload)
+   string request_id = 15; // Request ID for inference proofs (for canonical_payload)
+   double current_loss = 16; // Current training loss (for aggregation on website)
+ }
+
+ message GossipProofResponse {
+   bool accepted = 1;
+ }
+
+ message GossipTransactionRequest {
+   string sender_id = 1;
+   string recipient_id = 2;
+   double amount = 3;
+   double timestamp = 4;
+   string signature = 5;
+   string tx_hash = 6;
+ }
+
+ message GossipTransactionResponse {
+   bool accepted = 1;
+   string reason = 2;
+ }
+
+ // Stake gossip - sync stakes across P2P network
+ message GossipStakeRequest {
+   string node_id = 1; // Node that staked (SHA256(public_key)[:32])
+   double amount = 2; // Total staked amount
+   double locked_until = 3; // Lock expiry timestamp
+   double timestamp = 4; // When this stake update occurred
+   string signature = 5; // ECDSA signature for verification
+   string public_key = 6; // Compressed public key (hex) - REQUIRED for verification
+ }
+
+ message GossipStakeResponse {
+   bool accepted = 1;
+   string reason = 2;
+ }
+
+ // Proof Validation Request - Ask validators to validate a proof
+ message ProofValidationRequest {
+   string proof_signature = 1; // Signature of the proof to validate
+   string submitter_id = 2; // Node that submitted the proof
+   double timestamp = 3; // Proof timestamp
+   double uptime_seconds = 4; // Claimed uptime
+   int64 tokens_processed = 5; // Claimed tokens
+   int32 training_batches = 6; // Claimed training batches
+   int32 layers_held = 7; // Claimed layers
+   bool has_embedding = 8; // Is Driver
+   bool has_lm_head = 9; // Is Validator
+   string proof_type = 10; // UPTIME, INFERENCE, TRAINING, DATA
+   string nonce = 11; // Unique nonce
+ }
+
+ message ProofValidationResponse {
+   bool accepted = 1; // Whether validation request was accepted
+   string reason = 2; // Reason if not accepted
+   string validator_id = 3; // Validator that will process
+   double validator_stake = 4; // Validator's stake
+ }
+
+ // Validation Vote - Validator's vote on a proof
+ message ValidationVoteRequest {
+   string proof_signature = 1; // Proof being voted on
+   string validator_id = 2; // Validator casting vote
+   double validator_stake = 3; // Validator's stake (for weighting)
+   bool vote = 4; // true = valid, false = invalid
+   string details = 5; // Optional validation details
+   double timestamp = 6; // When vote was cast
+   string signature = 7; // Validator's signature on vote
+ }
+
+ message ValidationVoteResponse {
+   bool accepted = 1;
+   string reason = 2;
+   double total_valid_stake = 3; // Current valid stake tally
+   double total_invalid_stake = 4; // Current invalid stake tally
+   bool consensus_reached = 5; // Whether consensus has been reached
+   bool consensus_result = 6; // The consensus result (if reached)
+ }
+
+ // --- Distributed Training Messages ---
+
+ message GossipGradientRequest {
+   string node_id = 1; // Sender node ID
+   int32 round_id = 2; // Training round ID
+   string model_hash = 3; // Model hash for consistency check
+   double timestamp = 4;
+
+   int32 batch_size = 5; // Batch size used for this gradient
+   double loss = 6; // Training loss
+
+   // Compressed gradients per layer
+   map<string, bytes> layer_gradients = 7;
+
+   string signature = 8; // Proof signature
+   int32 ttl = 9; // Time-to-live for forwarding
+ }
+
+ message GossipGradientResponse {
+   bool accepted = 1;
+   string reason = 2;
+   int32 current_round = 3; // Receiver's current round (for sync)
+ }
+
+ message GetCheckpointRequest {
+   string model_hash = 1; // Optional: specific checkpoint hash
+   int32 min_version = 2; // Optional: minimum version number
+ }
+
+ message GetCheckpointResponse {
+   bool success = 1;
+   string error_message = 2;
+
+   int32 version = 3; // Checkpoint version (training round)
+   string model_hash = 4; // Model hash
+   string phase = 5; // Model phase (bootstrap, early, etc.)
+
+   bytes checkpoint_data = 6; // Serialized checkpoint (compressed)
+   int64 total_size = 7; // Total size in bytes
+ }
+
+ message GetCheckpointInfoRequest {
+   // Empty - just get current info
+ }
+
+ message GetCheckpointInfoResponse {
+   int32 version = 1; // Current training round
+   string model_hash = 2; // Current model hash
+   string phase = 3; // Model phase
+   int64 params = 4; // Number of parameters
+   double loss = 5; // Current loss
+ }
+
+ // --- Pipeline Parallelism Messages ---
+
+ message PipelineForwardRequest {
+   string session_id = 1;
+   string request_id = 2;
+
+   // Hidden states from previous node
+   bytes hidden_states = 3; // Serialized tensor [batch, seq, hidden]
+   repeated int64 hidden_shape = 4; // Shape of hidden states
+
+   // Attention mask (optional)
+   bytes attention_mask = 5;
+
+   // Position IDs
+   bytes position_ids = 6;
+
+   // KV cache (optional, for incremental decoding)
+   repeated bytes past_key_values = 7;
+   bool use_cache = 8;
+
+   // Shard info
+   int32 source_shard = 9; // Shard that sent this
+   int32 target_shard = 10; // Shard to process (this node)
+
+   // Training data (only sent by Driver)
+   bytes training_labels = 11; // Serialized labels [batch, seq]
+
+   // Backward pass routing
+   string sender_url = 12; // URL to send gradients back to
+ }
+
+ message PipelineForwardResponse {
+   string request_id = 1;
+   bool success = 2;
+   string error_message = 3;
+
+   // Output hidden states
+   bytes hidden_states = 4;
+   repeated int64 hidden_shape = 5;
+
+   // Updated KV cache
+   repeated bytes past_key_values = 6;
+
+   // If this is the final shard, include logits
+   bool is_final = 7;
+   bytes logits = 8; // Only if is_final
+   repeated int64 logits_shape = 9;
+
+   // Training feedback
+   double loss = 10; // Returned by Validator
+ }
+
+ message PipelineBackwardRequest {
+   string session_id = 1;
+   string request_id = 2;
+
+   // Gradients w.r.t the OUTPUT of the previous layer
+   bytes grad_output = 3; // Serialized tensor
+   repeated int64 grad_shape = 4;
+
+   int32 target_shard = 5; // Shard we are sending back TO
+ }
+
+ message PipelineBackwardResponse {
+   bool success = 1;
+   string error_message = 2;
+ }
+
+ message GetShardInfoRequest {
+   // Empty - just get current shard info
+ }
+
+ message GetShardInfoResponse {
+   int32 shard_id = 1;
+   int32 total_shards = 2;
+   int32 start_layer = 3;
+   int32 end_layer = 4;
+   bool has_embedding = 5;
+   bool has_lm_head = 6;
+
+   int32 version = 7;
+   string model_hash = 8;
+
+   // Capacity
+   float available_memory_mb = 9;
+   float current_load = 10;
+ }
+
+ // --- Data Swarm Messages ---
+
+ message GetShardChunkRequest {
+   int32 shard_id = 1; // Which shard (e.g., 42)
+   int32 chunk_index = 2; // Which 1MB chunk
+   string requester_id = 3; // Who is asking
+ }
+
+ message GetShardChunkResponse {
+   bool success = 1;
+   bytes data = 2; // The chunk data
+   string error_message = 3;
+   int64 total_size = 4; // Total size of shard
+   int32 total_chunks = 5; // Total chunks in shard
+ }
+
+ // --- DHT Messages ---
+
+ message DHTNodeInfo {
+   bytes node_id = 1; // 20-byte ID (160 bits)
+   string ip = 2;
+   int32 port = 3;
+ }
+
+ message DHTPingRequest {
+   DHTNodeInfo sender = 1;
+ }
+
+ message DHTPingResponse {
+   DHTNodeInfo responder = 1;
+ }
+
+ message DHTStoreRequest {
+   DHTNodeInfo sender = 1;
+   bytes key = 2;
+   string value = 3; // For now, simple string (e.g., "ip:port")
+ }
+
+ message DHTStoreResponse {
+   DHTNodeInfo responder = 1;
+   bool success = 2;
+ }
+
+ message DHTFindNodeRequest {
+   DHTNodeInfo sender = 1;
+   bytes target_id = 2;
+ }
+
+ message DHTFindNodeResponse {
+   DHTNodeInfo responder = 1;
+   repeated DHTNodeInfo nodes = 2; // K closest nodes
+ }
+
+ message DHTFindValueRequest {
+   DHTNodeInfo sender = 1;
+   bytes key = 2;
+ }
+
+ message DHTFindValueResponse {
+   DHTNodeInfo responder = 1;
+   string value = 2; // If found
+   repeated DHTNodeInfo nodes = 3; // If not found (K closest nodes)
+   bool found = 4;
+ }
+
+ // --- Phase 4: Tensor Parallelism Messages ---
+
+ // Tensor exchange for all-reduce operations
+ message TensorExchangeRequest {
+   string operation_id = 1; // Unique operation ID
+   int32 layer_id = 2; // Layer being processed
+   int32 step = 3; // Ring all-reduce step
+   int32 chunk_idx = 4; // Chunk index in ring
+
+   int32 sender_shard_id = 5; // Sender's tensor shard ID
+   int32 total_shards = 6; // Total tensor shards
+
+   bytes tensor_data = 7; // Serialized tensor chunk
+   repeated int64 tensor_shape = 8; // Original tensor shape
+   string dtype = 9; // Tensor dtype (float32, float16, etc.)
+
+   string reduce_op = 10; // Reduction operation (sum, mean, max, min)
+ }
+
+ message TensorExchangeResponse {
+   string operation_id = 1;
+   bool success = 2;
+   string error_message = 3;
+
+   bytes tensor_data = 4; // Response tensor (for bidirectional exchange)
+   repeated int64 tensor_shape = 5;
+ }
+
+ // Partial result aggregation (for async all-reduce)
+ message PartialResultRequest {
+   string session_id = 1;
+   string operation_id = 2;
+   int32 layer_id = 3;
+
+   int32 sender_shard_id = 4;
+   int32 total_shards = 5;
+
+   bytes partial_tensor = 6;
+   repeated int64 tensor_shape = 7;
+   string dtype = 8;
+
+   bool is_final = 9; // True if this is the final partial
+ }
+
+ message PartialResultResponse {
+   bool accepted = 1;
+   string error_message = 2;
+
+   // If all partials received, return combined result
+   bool all_received = 3;
+   bytes combined_tensor = 4;
+   repeated int64 tensor_shape = 5;
+ }
+
+ // Model registry messages
+ message ModelInfo {
+   string model_id = 1;
+   string name = 2;
+   string family = 3; // gpt2, llama, mistral, mixtral
+   string version = 4;
+
+   int32 num_layers = 5;
+   int32 hidden_dim = 6;
+   int32 num_heads = 7;
+   int32 vocab_size = 8;
+
+   float total_size_mb = 9;
+   float layer_size_mb = 10;
+
+   bool supports_tensor_parallel = 11;
+   bool supports_pipeline_parallel = 12;
+
+   float required_stake = 13; // Minimum NEURO stake to serve
+   bool approved = 14; // Community approved
+ }
+
+ message ListModelsRequest {
+   bool approved_only = 1;
+   string family_filter = 2; // Optional family filter
+ }
+
+ message ListModelsResponse {
+   repeated ModelInfo models = 1;
+ }
+
+ message ModelNetworkStatus {
+   string model_id = 1;
+   int32 total_nodes = 2;
+   bool is_fully_covered = 3;
+   bool is_inference_ready = 4;
+   float avg_latency_ms = 5;
+   float total_stake = 6;
+   map<int32, int32> layer_coverage = 7; // layer_id -> node_count
+ }
+
+ message GetModelStatusRequest {
+   string model_id = 1;
+ }
+
+ message GetModelStatusResponse {
+   ModelNetworkStatus status = 1;
+ }
+
+ // Tensor shard announcement
+ message TensorShardAnnouncement {
+   string model_id = 1;
+   int32 layer_id = 2;
+   int32 shard_id = 3;
+   int32 total_shards = 4;
+
+   string node_url = 5;
+   string grpc_addr = 6;
+
+   float available_memory_mb = 7;
+   float current_load = 8; // 0-1 load factor
+ }
+
+ message AnnounceShardRequest {
+   TensorShardAnnouncement shard = 1;
+ }
+
+ message AnnounceShardResponse {
+   bool accepted = 1;
+   string error_message = 2;
+ }
+
+ // Find tensor shard peers for all-reduce
+ message FindShardPeersRequest {
+   string model_id = 1;
+   int32 layer_id = 2;
+   int32 total_shards = 3;
+   int32 exclude_shard_id = 4; // Our own shard ID to exclude
+ }
+
+ message FindShardPeersResponse {
+   repeated TensorShardAnnouncement peers = 1;
+ }
+
+ // --- Swarm Routing Messages (Phase 2) ---
+
+ message SwarmForwardRequest {
+   string session_id = 1;
+   string request_id = 2;
+
+   // Activation data
+   bytes hidden_states = 3;
+   repeated int64 hidden_shape = 4;
+
+   // Routing
+   int32 target_layer = 5;
+   string sender_url = 6;
+
+   // Priority (0=highest)
+   int32 priority = 7;
+   int32 micro_batch_id = 8;
+   bool is_backward = 9;
+
+   // Training metadata
+   bool requires_grad = 10;
+   bytes grad_output = 11;
+   repeated int64 grad_shape = 12;
+ }
+
+ message SwarmForwardResponse {
+   string request_id = 1;
+   bool success = 2;
+   string error_message = 3;
+   int32 buffer_depth = 4; // For backpressure signaling
+ }
+
+ message SwarmStatusRequest {
+   // Empty - just get status
+ }
+
+ message SwarmStatusResponse {
+   string node_id = 1;
+   int32 layer_start = 2;
+   int32 layer_end = 3;
+
+   // Buffer status
+   float inbound_fill_rate = 4;
+   float outbound_fill_rate = 5;
+   int32 inbound_queue_depth = 6;
+   int32 outbound_queue_depth = 7;
+
+   // Capacity
+   int32 available_memory_mb = 8;
+   float gpu_utilization = 9;
+
+   // Status
+   bool is_training = 10;
+   bool is_accepting_activations = 11;
+
+   // Compute stats
+   int64 total_steps = 12;
+   float local_only_rate = 13;
+
+   string error_message = 14;
+ }
+
+ message UpdatePeerCapacityRequest {
+   string node_id = 1;
+   string grpc_addr = 2;
+   int32 layer_start = 3;
+   int32 layer_end = 4;
+   int32 queue_depth = 5;
+   int32 available_memory_mb = 6;
+   float gpu_utilization = 7;
+ }
+
+ message UpdatePeerCapacityResponse {
+   bool success = 1;
+   string error_message = 2;
+ }
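
The proto above is the full node-to-node API surface, and the wheel ships pre-generated Python stubs for it (protos/neuroshard_pb2.py and protos/neuroshard_pb2_grpc.py in the file list). Below is a minimal client sketch against the UnaryInference RPC, not the package's documented usage: it assumes stubs importable as shown (e.g. regenerated with `python -m grpc_tools.protoc -I protos --python_out=. --grpc_python_out=. protos/neuroshard.proto`) and a hypothetical node address, since this diff does not show how nodes advertise their gRPC endpoints. The stub and message names follow standard protoc codegen for this service definition.

    # Sketch only. Assumptions: stubs generated from protos/neuroshard.proto as above,
    # and a node listening at NODE_ADDR (placeholder; real addresses come from the network).
    import uuid

    import grpc

    import neuroshard_pb2
    import neuroshard_pb2_grpc

    NODE_ADDR = "127.0.0.1:50051"  # hypothetical endpoint

    def unary_inference(tensor_bytes: bytes) -> neuroshard_pb2.InferenceResponse:
        """Send one single-step inference request (the proto's legacy/fallback path)."""
        with grpc.insecure_channel(NODE_ADDR) as channel:
            stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)
            request = neuroshard_pb2.InferenceRequest(
                session_id=str(uuid.uuid4()),
                request_id=str(uuid.uuid4()),
                tensor_data=tensor_bytes,  # serialized/compressed tensor; wire format is node-defined
            )
            return stub.UnaryInference(request)

    if __name__ == "__main__":
        response = unary_inference(b"")  # empty payload, only to illustrate the call shape
        print(response.success, response.next_token, response.error_message)

Per the proto comments, UnaryInference is the legacy/fallback path; streaming traffic would go through StreamInference, which follows the same stub pattern with request/response iterators.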
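
GetCheckpointInfo takes an empty request and returns only metadata (version, model_hash, phase, params, loss), so it is the cheapest way to probe a peer's training progress before deciding whether to pull a full checkpoint via GetCheckpoint. A sketch under the same assumptions as above:

    # Sketch: poll a peer's checkpoint metadata without downloading any weights.
    import grpc

    import neuroshard_pb2
    import neuroshard_pb2_grpc

    def checkpoint_info(addr: str) -> None:
        """Print GetCheckpointInfoResponse fields: version, model_hash, phase, params, loss."""
        with grpc.insecure_channel(addr) as channel:
            stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)
            info = stub.GetCheckpointInfo(neuroshard_pb2.GetCheckpointInfoRequest())
            print(f"round={info.version} hash={info.model_hash} phase={info.phase} "
                  f"params={info.params} loss={info.loss:.4f}")

    checkpoint_info("127.0.0.1:50051")  # hypothetical endpoint, as above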