nexaroa 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
|
@@ -0,0 +1,643 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Activation Buffer System - Decoupling Compute from Communication
|
|
3
|
+
|
|
4
|
+
Implements priority queues for incoming and outgoing activations with:
|
|
5
|
+
- Priority-based scheduling (inference > training forward > backward)
|
|
6
|
+
- Soft overflow handling ("Don't Stop" logic)
|
|
7
|
+
- Metrics for buffer fill rate and wait times
|
|
8
|
+
|
|
9
|
+
Key Directive: "The GPU must never wait for network packets.
|
|
10
|
+
Forward pass can run 100 steps ahead of backward if necessary."
|
|
11
|
+
|
|
12
|
+
Metric: Buffer Fill Rate
|
|
13
|
+
- Empty (< 10%) = Starved (BAD - GPU idle)
|
|
14
|
+
- Full (> 90%) = Backpressured (triggers soft overflow)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import heapq
|
|
19
|
+
import logging
|
|
20
|
+
import threading
|
|
21
|
+
import time
|
|
22
|
+
import torch
|
|
23
|
+
from dataclasses import dataclass, field
|
|
24
|
+
from enum import IntEnum
|
|
25
|
+
from typing import Any, Dict, List, Optional, Callable, TYPE_CHECKING
|
|
26
|
+
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from neuroshard.core.swarm.router import SwarmRouter
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ActivationPriority(IntEnum):
    """
    Scheduling classes for activation packets.

    The numeric value doubles as the sort key: the smaller the number, the
    earlier the packet is served. From most to least urgent:

    - INFERENCE_URGENT: paid inference with a user waiting
    - INFERENCE_NORMAL: standard inference requests
    - TRAINING_FORWARD: training forward passes (need timely processing)
    - TRAINING_BACKWARD: training backward passes (can be slightly delayed)
    - GRADIENT_SYNC: gradient gossip (can wait longest)
    """

    # Values are spaced by 10, which leaves gaps between the existing levels.
    INFERENCE_URGENT = 0
    INFERENCE_NORMAL = 10
    TRAINING_FORWARD = 20
    TRAINING_BACKWARD = 30
    GRADIENT_SYNC = 40
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass(order=True)
class ActivationPacket:
    """
    One schedulable unit of work for the compute engine.

    Only ``priority`` and ``sequence_num`` take part in comparisons, so a heap
    of packets pops the smallest priority value first and, within one
    priority, the smallest sequence number (FIFO). Every other field is
    declared with ``compare=False`` and is ignored by ``==`` and ``<``.
    """

    # --- ordering key -----------------------------------------------------
    priority: int
    sequence_num: int = 0  # tie-breaker giving stable FIFO order per priority
    timestamp: float = field(default_factory=time.time, compare=False)

    # --- payload ----------------------------------------------------------
    session_id: str = field(default="", compare=False)
    micro_batch_id: int = field(default=0, compare=False)
    tensor_data: Optional[torch.Tensor] = field(default=None, compare=False)

    # --- routing ----------------------------------------------------------
    source_node: str = field(default="", compare=False)
    target_layer: int = field(default=0, compare=False)
    is_backward: bool = field(default=False, compare=False)

    # --- training metadata --------------------------------------------------
    requires_grad: bool = field(default=False, compare=False)
    grad_output: Optional[torch.Tensor] = field(default=None, compare=False)

    # --- tracking -----------------------------------------------------------
    created_at: float = field(default_factory=time.time, compare=False)

    def wait_time_ms(self) -> float:
        """Return the packet's age in milliseconds, measured from ``created_at``."""
        elapsed_seconds = time.time() - self.created_at
        return elapsed_seconds * 1000

    @classmethod
    def create_forward(
        cls,
        session_id: str,
        micro_batch_id: int,
        tensor: torch.Tensor,
        source_node: str,
        target_layer: int,
        is_inference: bool = False,
        urgent: bool = False,
    ) -> 'ActivationPacket':
        """
        Build a forward-pass packet.

        Training packets get TRAINING_FORWARD priority and ``requires_grad``
        set; inference packets get INFERENCE_URGENT or INFERENCE_NORMAL
        depending on ``urgent`` and do not require gradients. ``urgent`` is
        ignored for training packets.
        """
        if not is_inference:
            level = ActivationPriority.TRAINING_FORWARD
        elif urgent:
            level = ActivationPriority.INFERENCE_URGENT
        else:
            level = ActivationPriority.INFERENCE_NORMAL

        return cls(
            priority=level,
            session_id=session_id,
            micro_batch_id=micro_batch_id,
            tensor_data=tensor,
            source_node=source_node,
            target_layer=target_layer,
            is_backward=False,
            requires_grad=not is_inference,
        )

    @classmethod
    def create_backward(
        cls,
        session_id: str,
        micro_batch_id: int,
        grad_tensor: torch.Tensor,
        source_node: str,
        target_layer: int,
    ) -> 'ActivationPacket':
        """
        Build a backward-pass packet.

        The gradient tensor is stored both as ``tensor_data`` and as
        ``grad_output``; the packet is flagged ``is_backward`` and
        ``requires_grad`` and carries TRAINING_BACKWARD priority.
        """
        return cls(
            priority=ActivationPriority.TRAINING_BACKWARD,
            session_id=session_id,
            micro_batch_id=micro_batch_id,
            tensor_data=grad_tensor,
            source_node=source_node,
            target_layer=target_layer,
            is_backward=True,
            requires_grad=True,
            grad_output=grad_tensor,
        )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
class ActivationBuffer:
    """
    Bounded priority queue for incoming activations.

    Decouples network I/O from GPU computation:
    - Network receivers push packets with ``put``
    - The compute thread pops the next-priority packet with ``get``

    All queue state is mutated under a reentrant lock. Two events mirror the
    queue state ("not empty" / "not full") so blocked callers can sleep
    outside the lock; blocked callers always re-check the queue under the
    lock and go back to waiting if another thread got there first.

    Metrics:
    - fill_rate: 0.0 = empty (starved), 1.0 = full (backpressured)
    - avg_wait_time_ms: mean time between a packet's ``timestamp`` and its pop
    """

    # Default buffer size (100 packets ≈ 100 micro-batches)
    DEFAULT_MAX_SIZE = 100

    # Starvation/backpressure thresholds, as fractions of max_size
    STARVED_THRESHOLD = 0.1  # fill_rate < 10% = starved
    BACKPRESSURE_THRESHOLD = 0.9  # fill_rate >= 90% = backpressured

    def __init__(self, max_size: int = DEFAULT_MAX_SIZE):
        """
        Initialize activation buffer.

        Args:
            max_size: Maximum number of packets in buffer
        """
        self.max_size = max_size
        self._queue: List['ActivationPacket'] = []  # heap ordered by packet comparison
        self._lock = threading.RLock()  # Reentrant: get_stats() reads fill_rate while holding it
        self._event_not_empty = threading.Event()
        self._event_not_full = threading.Event()
        self._event_not_full.set()  # Initially not full

        # Sequence counter for ordering packets with same priority
        self._sequence_counter = 0

        # Metrics
        self.packets_in = 0
        self.packets_out = 0
        self.packets_dropped = 0
        self.total_wait_time = 0.0  # seconds, summed over popped packets

        # Cumulative count of accepted packets per priority value
        self._priority_counts: Dict[int, int] = {}

    @staticmethod
    def _time_left(deadline: Optional[float]) -> Optional[float]:
        """Seconds until ``deadline`` (monotonic clock), or None for no deadline."""
        if deadline is None:
            return None
        return deadline - time.monotonic()

    @property
    def fill_rate(self) -> float:
        """Current queue length divided by max_size (0.0 when max_size <= 0)."""
        with self._lock:
            return len(self._queue) / self.max_size if self.max_size > 0 else 0.0

    @property
    def is_starved(self) -> bool:
        """True if buffer is starving (fill rate below STARVED_THRESHOLD)."""
        return self.fill_rate < self.STARVED_THRESHOLD

    @property
    def is_backpressured(self) -> bool:
        """True if buffer is backpressured (fill rate at or above BACKPRESSURE_THRESHOLD)."""
        return self.fill_rate >= self.BACKPRESSURE_THRESHOLD

    def put(self, packet: 'ActivationPacket', timeout: Optional[float] = None) -> bool:
        """
        Add activation to buffer.

        The packet's ``sequence_num`` is overwritten with the buffer's own
        counter so that equal-priority packets pop in insertion order.

        Args:
            packet: Activation packet to add
            timeout: Max time to wait if buffer full (None = blocking, 0 = non-blocking)

        Returns:
            True if added successfully, False if the buffer was still full
            when the timeout expired (or can never accept packets because
            max_size <= 0)
        """
        if self.max_size <= 0:
            # A zero-capacity buffer can never accept a packet; fail fast
            # instead of blocking forever.
            return False

        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            with self._lock:
                if len(self._queue) < self.max_size:
                    # Add sequence number for stable ordering
                    self._sequence_counter += 1
                    packet.sequence_num = self._sequence_counter

                    heapq.heappush(self._queue, packet)
                    self.packets_in += 1

                    # Track priority distribution
                    self._priority_counts[packet.priority] = (
                        self._priority_counts.get(packet.priority, 0) + 1
                    )

                    # Update events
                    self._event_not_empty.set()
                    if len(self._queue) >= self.max_size:
                        self._event_not_full.clear()

                    return True

                # Still full (another producer took the slot, or the event
                # was stale). Make the event reflect that so wait() blocks.
                self._event_not_full.clear()

            remaining = self._time_left(deadline)
            if remaining is not None and remaining <= 0:
                return False
            if not self._event_not_full.wait(remaining):
                return False

    def put_nowait(self, packet: 'ActivationPacket') -> bool:
        """
        Non-blocking put - returns False immediately if full.

        Used by soft overflow logic.
        """
        return self.put(packet, timeout=0)

    def get(self, timeout: Optional[float] = None) -> Optional['ActivationPacket']:
        """
        Get highest-priority activation.

        Args:
            timeout: Max time to wait if buffer empty (None = blocking, 0 = non-blocking)

        Returns:
            Highest priority packet, or None if the buffer was still empty
            when the timeout expired
        """
        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            with self._lock:
                if self._queue:
                    packet = heapq.heappop(self._queue)

                    # Track metrics
                    wait_time = time.time() - packet.timestamp
                    self.total_wait_time += wait_time
                    self.packets_out += 1

                    # Update events
                    self._event_not_full.set()
                    if not self._queue:
                        self._event_not_empty.clear()

                    return packet

                # Still empty (the packet was taken or cleared by another
                # thread). Make the event reflect that so wait() blocks.
                self._event_not_empty.clear()

            remaining = self._time_left(deadline)
            if remaining is not None and remaining <= 0:
                return None
            if not self._event_not_empty.wait(remaining):
                return None

    def get_nowait(self) -> Optional['ActivationPacket']:
        """
        Non-blocking get - returns None immediately if empty.

        Used by compute engine for non-blocking iteration.
        """
        return self.get(timeout=0)

    def peek(self) -> Optional['ActivationPacket']:
        """Return the next packet without removing it, or None if empty."""
        with self._lock:
            if self._queue:
                return self._queue[0]
            return None

    def clear(self) -> int:
        """Drop every queued packet; returns how many were dropped."""
        with self._lock:
            count = len(self._queue)
            self._queue.clear()
            self.packets_dropped += count
            self._event_not_full.set()
            self._event_not_empty.clear()
            return count

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of buffer metrics as a plain dict."""
        with self._lock:
            # max(1, ...) avoids division by zero before the first pop
            avg_wait = (
                self.total_wait_time / max(1, self.packets_out) * 1000
            )  # Convert to ms

            return {
                "fill_rate": self.fill_rate,
                "queue_size": len(self._queue),
                "max_size": self.max_size,
                "packets_in": self.packets_in,
                "packets_out": self.packets_out,
                "packets_dropped": self.packets_dropped,
                "avg_wait_time_ms": avg_wait,
                "is_starved": self.is_starved,
                "is_backpressured": self.is_backpressured,
                "priority_breakdown": dict(self._priority_counts),
            }
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
class OutboundBuffer:
    """
    Buffer for outgoing activations with async network send.

    Supports the "Soft Overflow" mechanism:
    - Producers can probe the fill level (``check_pressure``/``put_nowait``)
      without blocking
    - Enables DiLoCo-style local-only steps during congestion

    Backed by an ``asyncio.Queue``; a background task started with
    ``start_send_loop`` drains it through the swarm router and keeps
    send/failure/retry counters.
    """

    DEFAULT_MAX_SIZE = 50

    # Default soft overflow thresholds (fractions of max_size)
    DEFAULT_SOFT_LIMIT = 0.9  # 90% - start warning
    DEFAULT_HARD_LIMIT = 0.99  # 99% - critical

    def __init__(
        self,
        max_size: int = DEFAULT_MAX_SIZE,
        soft_overflow_threshold: float = DEFAULT_SOFT_LIMIT,
        hard_overflow_threshold: float = DEFAULT_HARD_LIMIT,
    ):
        """
        Initialize outbound buffer.

        Args:
            max_size: Maximum number of pending outbound packets
            soft_overflow_threshold: Fill rate to trigger soft overflow (0-1)
            hard_overflow_threshold: Fill rate to trigger hard overflow (0-1)
        """
        self.max_size = max_size
        self.soft_limit = soft_overflow_threshold
        self.hard_limit = hard_overflow_threshold
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=max_size)

        # Track send status
        self.send_in_progress = 0
        self.packets_sent = 0
        self.packets_failed = 0
        self.send_retries = 0

        # Soft overflow tracking (reported in get_stats; not updated by this class)
        self.soft_overflow_count = 0
        self.hard_overflow_count = 0

        # Send loop state
        self._send_task: Optional[asyncio.Task] = None
        self._running = False

    @property
    def fill_rate(self) -> float:
        """Current queue size divided by max_size (0.0 when max_size <= 0)."""
        return self._queue.qsize() / self.max_size if self.max_size > 0 else 0.0

    @property
    def is_soft_overflow(self) -> bool:
        """True if buffer is in soft overflow (>= soft_limit)."""
        return self.fill_rate >= self.soft_limit

    @property
    def is_hard_overflow(self) -> bool:
        """True if buffer is in hard overflow (>= hard_limit)."""
        return self.fill_rate >= self.hard_limit

    def check_pressure(self) -> str:
        """
        Check current backpressure level.

        Returns:
            "ok" - normal operation
            "soft_overflow" - buffer almost full, consider local accumulation
            "hard_overflow" - buffer critical, must discard
        """
        # Hard overflow is checked first because it implies soft overflow.
        if self.is_hard_overflow:
            return "hard_overflow"
        if self.is_soft_overflow:
            return "soft_overflow"
        return "ok"

    async def put(self, packet: 'ActivationPacket', timeout: Optional[float] = None):
        """
        Queue packet for sending.

        Args:
            packet: Packet to send
            timeout: Max time to wait if full (None = blocking)

        Raises:
            asyncio.TimeoutError: If timeout expires while buffer full
            asyncio.QueueFull: If full and timeout=0
        """
        if timeout == 0:
            self._queue.put_nowait(packet)
        elif timeout is not None:
            await asyncio.wait_for(self._queue.put(packet), timeout=timeout)
        else:
            await self._queue.put(packet)

    def put_nowait(self, packet: 'ActivationPacket') -> bool:
        """
        Non-blocking put - returns False if full.

        Used for soft overflow checking.
        """
        try:
            self._queue.put_nowait(packet)
        except asyncio.QueueFull:
            return False
        return True

    async def get(self) -> 'ActivationPacket':
        """Get next packet to send (blocks if empty)."""
        return await self._queue.get()

    def start_send_loop(self, swarm_router: 'SwarmRouter'):
        """
        Start the background send loop.

        Must be called while an event loop is running, because it schedules
        the loop with ``asyncio.create_task``. A no-op while a send task is
        still alive; a finished task (e.g. one that was cancelled from
        outside) is replaced so the loop can be restarted.
        """
        if self._send_task is not None and not self._send_task.done():
            return
        self._running = True
        self._send_task = asyncio.create_task(self._send_loop(swarm_router))
        logger.info("OutboundBuffer send loop started")

    async def stop_send_loop(self):
        """Stop the background send loop and wait for its task to finish."""
        self._running = False
        # Detach the handle first so the buffer is restartable even if
        # awaiting the task raises.
        task, self._send_task = self._send_task, None
        if task is None:
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass
        logger.info("OutboundBuffer send loop stopped")

    async def _send_loop(self, swarm_router: 'SwarmRouter'):
        """
        Continuously send packets to peers via swarm router.

        Delivery goes through ``swarm_router.send_with_failover``; the result
        object's ``success``, ``error`` and ``attempts`` fields drive the
        counters.
        """
        while self._running:
            try:
                # Wake up at least once a second so a cleared _running flag
                # is noticed even when the queue stays empty.
                packet = await asyncio.wait_for(self._queue.get(), timeout=1.0)
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

            self.send_in_progress += 1

            try:
                result = await swarm_router.send_with_failover(
                    tensor=packet.tensor_data,
                    target_layer=packet.target_layer,
                    session_id=packet.session_id,
                    metadata={
                        "source_node": packet.source_node,
                        "micro_batch_id": packet.micro_batch_id,
                        "priority": packet.priority,
                        "is_backward": packet.is_backward,
                    }
                )

                if result.success:
                    self.packets_sent += 1
                else:
                    self.packets_failed += 1
                    logger.warning(f"Failed to send packet: {result.error}")

                # Track retries (failover attempts - 1)
                if result.attempts > 1:
                    self.send_retries += result.attempts - 1

            except Exception as e:
                # Best effort: a failed send is counted and logged, and the
                # loop moves on to the next packet.
                self.packets_failed += 1
                logger.error(f"Send error: {e}")
            finally:
                self.send_in_progress -= 1
                # Always balance the get() so flush()/join() can complete.
                self._queue.task_done()

    async def flush(self, timeout: float = 30.0):
        """Wait until every queued packet has been processed, or log a warning on timeout."""
        try:
            await asyncio.wait_for(self._queue.join(), timeout=timeout)
        except asyncio.TimeoutError:
            logger.warning(f"Flush timeout after {timeout}s, {self._queue.qsize()} packets remaining")

    def get_stats(self) -> Dict[str, Any]:
        """Get outbound buffer statistics."""
        return {
            "fill_rate": self.fill_rate,
            "queue_size": self._queue.qsize(),
            "max_size": self.max_size,
            "send_in_progress": self.send_in_progress,
            "packets_sent": self.packets_sent,
            "packets_failed": self.packets_failed,
            "send_retries": self.send_retries,
            "soft_overflow_count": self.soft_overflow_count,
            "hard_overflow_count": self.hard_overflow_count,
            "is_soft_overflow": self.is_soft_overflow,
            "is_hard_overflow": self.is_hard_overflow,
            "pressure": self.check_pressure(),
        }
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
class BufferMetricsCollector:
    """
    Collects and aggregates metrics from activation buffers.

    A background task snapshots ``get_stats()`` of the inbound and outbound
    buffers every ``collect_interval`` seconds, keeps a bounded history, and
    logs a warning when the inbound buffer is starved or the outbound buffer
    is in soft overflow.
    """

    def __init__(
        self,
        inbound: 'ActivationBuffer',
        outbound: 'OutboundBuffer',
        collect_interval: float = 5.0,
    ):
        """
        Args:
            inbound: Buffer whose stats are recorded under the "inbound" key
            outbound: Buffer whose stats are recorded under the "outbound" key
            collect_interval: Seconds between two snapshots
        """
        self.inbound = inbound
        self.outbound = outbound
        self.collect_interval = collect_interval

        # Historical metrics
        self.history: List[Dict[str, Any]] = []
        self.max_history = 720  # 1 hour at 5s intervals

        self._running = False
        self._task: Optional[asyncio.Task] = None

    async def start(self):
        """
        Start metrics collection.

        Idempotent: while a collector task is still alive, another call does
        nothing, so a repeated start cannot orphan a running task.
        """
        if self._task is not None and not self._task.done():
            return
        self._running = True
        self._task = asyncio.create_task(self._collect_loop())

    async def stop(self):
        """Stop metrics collection and wait for the collector task to finish."""
        self._running = False
        # Detach the handle first so start() works again afterwards.
        task, self._task = self._task, None
        if task is None:
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def _collect_loop(self):
        """Periodically collect buffer metrics."""
        while self._running:
            try:
                await asyncio.sleep(self.collect_interval)

                snapshot = {
                    "timestamp": time.time(),
                    "inbound": self.inbound.get_stats(),
                    "outbound": self.outbound.get_stats(),
                }

                self.history.append(snapshot)

                # Trim history to the newest max_history entries. Deleting
                # the computed excess also works when max_history is 0
                # (a [-0:] slice would keep the whole list).
                excess = len(self.history) - self.max_history
                if excess > 0:
                    del self.history[:excess]

                # Log warnings
                if self.inbound.is_starved:
                    logger.warning(
                        f"Inbound buffer STARVED: {self.inbound.fill_rate:.1%} full"
                    )
                if self.outbound.is_soft_overflow:
                    logger.warning(
                        f"Outbound buffer OVERFLOW: {self.outbound.fill_rate:.1%} full"
                    )

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep collecting: one failed snapshot must not end the loop.
                logger.error(f"Metrics collection error: {e}")

    def get_summary(self) -> Dict[str, Any]:
        """
        Get summary statistics.

        Returns:
            With no history yet: ``{"inbound": ..., "outbound": ...}`` holding
            the current stats. Otherwise a dict with "current" stats,
            "averages" of both fill rates over the history, and
            "history_length".
        """
        if not self.history:
            return {
                "inbound": self.inbound.get_stats(),
                "outbound": self.outbound.get_stats(),
            }

        # Calculate averages over history
        samples = len(self.history)
        avg_inbound_fill = sum(h["inbound"]["fill_rate"] for h in self.history) / samples
        avg_outbound_fill = sum(h["outbound"]["fill_rate"] for h in self.history) / samples

        return {
            "current": {
                "inbound": self.inbound.get_stats(),
                "outbound": self.outbound.get_stats(),
            },
            "averages": {
                "inbound_fill_rate": avg_inbound_fill,
                "outbound_fill_rate": avg_outbound_fill,
            },
            "history_length": samples,
        }
|
|
643
|
+
|