nexaroa 0.0.111__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
@@ -0,0 +1,643 @@
1
+ """
2
+ Activation Buffer System - Decoupling Compute from Communication
3
+
4
+ Implements priority queues for incoming and outgoing activations with:
5
+ - Priority-based scheduling (inference > training forward > backward)
6
+ - Soft overflow handling ("Don't Stop" logic)
7
+ - Metrics for buffer fill rate and wait times
8
+
9
+ Key Directive: "The GPU must never wait for network packets.
10
+ Forward pass can run 100 steps ahead of backward if necessary."
11
+
12
+ Metric: Buffer Fill Rate
13
+ - Empty (< 10%) = Starved (BAD - GPU idle)
14
+ - Full (> 90%) = Backpressured (triggers soft overflow)
15
+ """
16
+
17
+ import asyncio
18
+ import heapq
19
+ import logging
20
+ import threading
21
+ import time
22
+ import torch
23
+ from dataclasses import dataclass, field
24
+ from enum import IntEnum
25
+ from typing import Any, Dict, List, Optional, Callable, TYPE_CHECKING
26
+
27
+ if TYPE_CHECKING:
28
+ from neuroshard.core.swarm.router import SwarmRouter
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
class ActivationPriority(IntEnum):
    """
    Scheduling classes for activation packets.

    The integer value doubles as the sort key: the smaller the value, the
    earlier a packet of that class is handed to the compute engine.

    Ordering, from first-served to last-served:
        INFERENCE_URGENT  - paid inference with a user waiting
        INFERENCE_NORMAL  - standard inference requests
        TRAINING_FORWARD  - training forward passes
        TRAINING_BACKWARD - training backward passes (may be delayed)
        GRADIENT_SYNC     - gradient gossip
    """

    # Inference traffic
    INFERENCE_URGENT = 0
    INFERENCE_NORMAL = 10

    # Training traffic
    TRAINING_FORWARD = 20
    TRAINING_BACKWARD = 30

    # Background gradient exchange
    GRADIENT_SYNC = 40
51
+
52
+
53
@dataclass(order=True)
class ActivationPacket:
    """
    One schedulable unit of work for the compute engine.

    Only ``priority`` and ``sequence_num`` take part in comparisons, so
    packets sort by (priority, sequence_num): the smaller priority value
    comes first and, within one priority, the smaller sequence number
    comes first. Every other field is excluded from ordering and equality.
    """

    # --- ordering key -----------------------------------------------------
    priority: int
    sequence_num: int = 0  # tie-breaker within one priority level

    # Defaults to time.time() at construction
    timestamp: float = field(default_factory=time.time, compare=False)

    # --- payload ----------------------------------------------------------
    session_id: str = field(default="", compare=False)
    micro_batch_id: int = field(default=0, compare=False)
    tensor_data: Optional[torch.Tensor] = field(default=None, compare=False)

    # --- routing ----------------------------------------------------------
    source_node: str = field(default="", compare=False)
    target_layer: int = field(default=0, compare=False)
    is_backward: bool = field(default=False, compare=False)

    # --- training metadata --------------------------------------------------
    requires_grad: bool = field(default=False, compare=False)
    grad_output: Optional[torch.Tensor] = field(default=None, compare=False)

    # --- tracking -----------------------------------------------------------
    # Defaults to time.time() at construction; read by wait_time_ms()
    created_at: float = field(default_factory=time.time, compare=False)

    def wait_time_ms(self) -> float:
        """Return the milliseconds elapsed since ``created_at``."""
        elapsed_seconds = time.time() - self.created_at
        return elapsed_seconds * 1000

    @classmethod
    def create_forward(
        cls,
        session_id: str,
        micro_batch_id: int,
        tensor: torch.Tensor,
        source_node: str,
        target_layer: int,
        is_inference: bool = False,
        urgent: bool = False,
    ) -> 'ActivationPacket':
        """
        Build a forward-pass packet.

        Training packets get TRAINING_FORWARD priority and require grad.
        Inference packets get INFERENCE_URGENT when ``urgent`` is set,
        otherwise INFERENCE_NORMAL, and do not require grad. ``urgent`` has
        no effect on training packets.
        """
        if not is_inference:
            level = ActivationPriority.TRAINING_FORWARD
        elif urgent:
            level = ActivationPriority.INFERENCE_URGENT
        else:
            level = ActivationPriority.INFERENCE_NORMAL

        return cls(
            priority=level,
            session_id=session_id,
            micro_batch_id=micro_batch_id,
            tensor_data=tensor,
            source_node=source_node,
            target_layer=target_layer,
            is_backward=False,
            requires_grad=not is_inference,
        )

    @classmethod
    def create_backward(
        cls,
        session_id: str,
        micro_batch_id: int,
        grad_tensor: torch.Tensor,
        source_node: str,
        target_layer: int,
    ) -> 'ActivationPacket':
        """
        Build a backward-pass packet.

        The gradient tensor is stored both as ``tensor_data`` and as
        ``grad_output``; the packet gets TRAINING_BACKWARD priority.
        """
        return cls(
            priority=ActivationPriority.TRAINING_BACKWARD,
            is_backward=True,
            requires_grad=True,
            session_id=session_id,
            micro_batch_id=micro_batch_id,
            source_node=source_node,
            target_layer=target_layer,
            tensor_data=grad_tensor,
            grad_output=grad_tensor,
        )
135
+
136
+
137
class ActivationBuffer:
    """
    Bounded priority queue for incoming activation packets.

    Decouples network I/O from GPU computation:
    - the network receiver pushes packets with put()/put_nowait()
    - the compute side pops the next packet with get()/get_nowait()

    Packets are kept in a heap, so get() always returns the smallest packet
    according to the packets' own ordering.

    All queue state is guarded by a reentrant lock. Two events mirror the
    queue state so that blocking callers can sleep:
    - _event_not_full is set while there is room for another packet
    - _event_not_empty is set while at least one packet is queued

    Metrics:
    - fill_rate: queue length divided by max_size
    - avg_wait_time_ms (via get_stats): mean of dequeue time minus
      packet.timestamp
    """

    # Default buffer size, in packets
    DEFAULT_MAX_SIZE = 100

    # Starvation/backpressure thresholds (fractions of max_size)
    STARVED_THRESHOLD = 0.1  # below this fill rate the buffer is "starved"
    BACKPRESSURE_THRESHOLD = 0.9  # at/above this fill rate it is "backpressured"

    def __init__(self, max_size: int = DEFAULT_MAX_SIZE):
        """
        Initialize activation buffer.

        Args:
            max_size: Maximum number of packets in buffer
        """
        self.max_size = max_size
        self._queue: List[ActivationPacket] = []
        self._lock = threading.RLock()  # Reentrant: get_stats() reads properties that lock again
        self._event_not_empty = threading.Event()
        self._event_not_full = threading.Event()
        self._event_not_full.set()  # Initially not full

        # Sequence counter for ordering packets with same priority
        self._sequence_counter = 0

        # Metrics
        self.packets_in = 0
        self.packets_out = 0
        self.packets_dropped = 0
        self.total_wait_time = 0.0

        # Cumulative count of accepted packets per priority value
        self._priority_counts: Dict[int, int] = {}

    @property
    def fill_rate(self) -> float:
        """Current fill rate: queue length / max_size (0.0 if max_size <= 0)."""
        with self._lock:
            return len(self._queue) / self.max_size if self.max_size > 0 else 0.0

    @property
    def is_starved(self) -> bool:
        """True if fill_rate is below STARVED_THRESHOLD."""
        return self.fill_rate < self.STARVED_THRESHOLD

    @property
    def is_backpressured(self) -> bool:
        """True if fill_rate is at or above BACKPRESSURE_THRESHOLD."""
        return self.fill_rate >= self.BACKPRESSURE_THRESHOLD

    @staticmethod
    def _wait_until(event: threading.Event, deadline: Optional[float]) -> bool:
        """
        Sleep until ``event`` is set or ``deadline`` (monotonic seconds) passes.

        Returns:
            True if the caller should re-check the queue, False if time is up.
        """
        if deadline is None:
            event.wait()
            return True
        remaining = deadline - time.monotonic()
        return remaining > 0 and event.wait(remaining)

    def put(self, packet: ActivationPacket, timeout: Optional[float] = None) -> bool:
        """
        Add activation to buffer.

        The packet's ``sequence_num`` is overwritten with a monotonically
        increasing counter so that equal-priority packets leave in arrival
        order.

        Args:
            packet: Activation packet to add
            timeout: Max time to wait if buffer full (None = blocking, 0 = non-blocking)

        Returns:
            True if added successfully, False if no slot became free before
            the timeout expired (or the buffer has no capacity at all)
        """
        if self.max_size <= 0:
            # A zero-capacity buffer can never accept a packet; waiting
            # would block forever.
            return False

        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            with self._lock:
                if len(self._queue) < self.max_size:
                    # Add sequence number for stable ordering
                    self._sequence_counter += 1
                    packet.sequence_num = self._sequence_counter

                    heapq.heappush(self._queue, packet)
                    self.packets_in += 1

                    # Track priority distribution
                    self._priority_counts[packet.priority] = (
                        self._priority_counts.get(packet.priority, 0) + 1
                    )

                    # Update events
                    self._event_not_empty.set()
                    if len(self._queue) >= self.max_size:
                        self._event_not_full.clear()

                    return True

                # Still full: another producer may have taken the slot that
                # woke us. Re-arm the event so the wait below really blocks.
                self._event_not_full.clear()

            # Retry until the deadline instead of failing on a lost race.
            if not self._wait_until(self._event_not_full, deadline):
                return False

    def put_nowait(self, packet: ActivationPacket) -> bool:
        """
        Non-blocking put - returns False immediately if full.

        Used by soft overflow logic.
        """
        return self.put(packet, timeout=0)

    def get(self, timeout: Optional[float] = None) -> Optional[ActivationPacket]:
        """
        Get highest-priority activation.

        Args:
            timeout: Max time to wait if buffer empty (None = blocking, 0 = non-blocking)

        Returns:
            Highest priority packet, or None if nothing arrived before the
            timeout expired
        """
        deadline = None if timeout is None else time.monotonic() + timeout

        while True:
            with self._lock:
                if self._queue:
                    packet = heapq.heappop(self._queue)

                    # Track metrics
                    wait_time = time.time() - packet.timestamp
                    self.total_wait_time += wait_time
                    self.packets_out += 1

                    # Update events
                    self._event_not_full.set()
                    if not self._queue:
                        self._event_not_empty.clear()

                    return packet

                # Still empty: the packet that woke us was taken (or cleared)
                # by someone else. Re-arm the event before waiting again.
                self._event_not_empty.clear()

            # Retry until the deadline instead of failing on a lost race.
            if not self._wait_until(self._event_not_empty, deadline):
                return None

    def get_nowait(self) -> Optional[ActivationPacket]:
        """
        Non-blocking get - returns None immediately if empty.

        Used by compute engine for non-blocking iteration.
        """
        return self.get(timeout=0)

    def peek(self) -> Optional[ActivationPacket]:
        """Return the next packet without removing it, or None if empty."""
        with self._lock:
            if self._queue:
                return self._queue[0]
            return None

    def clear(self) -> int:
        """Drop every queued packet; returns how many were dropped."""
        with self._lock:
            count = len(self._queue)
            self._queue.clear()
            self.packets_dropped += count
            self._event_not_full.set()
            self._event_not_empty.clear()
            return count

    def get_stats(self) -> Dict[str, Any]:
        """Get buffer statistics as a plain dict."""
        with self._lock:
            avg_wait = (
                self.total_wait_time / max(1, self.packets_out) * 1000
            )  # Convert to ms

            return {
                "fill_rate": self.fill_rate,
                "queue_size": len(self._queue),
                "max_size": self.max_size,
                "packets_in": self.packets_in,
                "packets_out": self.packets_out,
                "packets_dropped": self.packets_dropped,
                "avg_wait_time_ms": avg_wait,
                "is_starved": self.is_starved,
                "is_backpressured": self.is_backpressured,
                "priority_breakdown": dict(self._priority_counts),
            }
339
+
340
+
341
class OutboundBuffer:
    """
    Bounded asyncio queue of packets waiting to be sent to peers.

    Exposes the "Soft Overflow" signals: callers can query the fill level
    (fill_rate / check_pressure) or attempt a non-blocking put without ever
    waiting on the network, and react to congestion themselves.

    A background task started with start_send_loop() drains the queue by
    handing each packet to a swarm router.
    """

    DEFAULT_MAX_SIZE = 50

    # Default overflow thresholds, as fractions of max_size
    DEFAULT_SOFT_LIMIT = 0.9
    DEFAULT_HARD_LIMIT = 0.99

    def __init__(
        self,
        max_size: int = DEFAULT_MAX_SIZE,
        soft_overflow_threshold: float = DEFAULT_SOFT_LIMIT,
        hard_overflow_threshold: float = DEFAULT_HARD_LIMIT,
    ):
        """
        Initialize outbound buffer.

        Args:
            max_size: Maximum number of pending outbound packets
            soft_overflow_threshold: Fill rate at which soft overflow is reported (0-1)
            hard_overflow_threshold: Fill rate at which hard overflow is reported (0-1)
        """
        self.max_size = max_size
        self.soft_limit = soft_overflow_threshold
        self.hard_limit = hard_overflow_threshold
        self._queue: asyncio.Queue = asyncio.Queue(maxsize=max_size)

        # Send loop state
        self._running = False
        self._send_task: Optional[asyncio.Task] = None

        # Send counters
        self.send_in_progress = 0
        self.packets_sent = 0
        self.packets_failed = 0
        self.send_retries = 0

        # Overflow counters (not modified by this class itself)
        self.soft_overflow_count = 0
        self.hard_overflow_count = 0

    @property
    def fill_rate(self) -> float:
        """Queue size divided by max_size; 0.0 when max_size is not positive."""
        if self.max_size > 0:
            return self._queue.qsize() / self.max_size
        return 0.0

    @property
    def is_soft_overflow(self) -> bool:
        """True once fill_rate has reached soft_limit."""
        return self.fill_rate >= self.soft_limit

    @property
    def is_hard_overflow(self) -> bool:
        """True once fill_rate has reached hard_limit."""
        return self.fill_rate >= self.hard_limit

    def check_pressure(self) -> str:
        """
        Classify the current backpressure level.

        Returns:
            "hard_overflow" - fill rate at or above hard_limit
            "soft_overflow" - fill rate at or above soft_limit
            "ok" - anything below soft_limit
        """
        if self.is_hard_overflow:
            return "hard_overflow"
        if self.is_soft_overflow:
            return "soft_overflow"
        return "ok"

    async def put(self, packet: ActivationPacket, timeout: Optional[float] = None):
        """
        Queue packet for sending.

        Args:
            packet: Packet to send
            timeout: Max time to wait if full (None = wait indefinitely,
                0 = do not wait)

        Raises:
            asyncio.TimeoutError: If timeout expires while buffer full
            asyncio.QueueFull: If full and timeout=0
        """
        if timeout is None:
            await self._queue.put(packet)
            return
        if timeout == 0:
            self._queue.put_nowait(packet)
            return
        await asyncio.wait_for(self._queue.put(packet), timeout=timeout)

    def put_nowait(self, packet: ActivationPacket) -> bool:
        """
        Try to queue a packet without waiting.

        Returns:
            True if the packet was queued, False if the queue was full.
        """
        try:
            self._queue.put_nowait(packet)
        except asyncio.QueueFull:
            return False
        else:
            return True

    async def get(self) -> ActivationPacket:
        """Return the next queued packet, waiting until one is available."""
        next_packet = await self._queue.get()
        return next_packet

    def start_send_loop(self, swarm_router: 'SwarmRouter'):
        """Launch the background send task; a no-op if one already exists."""
        if self._send_task is None:
            self._running = True
            self._send_task = asyncio.create_task(self._send_loop(swarm_router))
            logger.info("OutboundBuffer send loop started")

    async def stop_send_loop(self):
        """Cancel the background send task (if any) and wait for it to end."""
        self._running = False
        if self._send_task:
            self._send_task.cancel()
            try:
                await self._send_task
            except asyncio.CancelledError:
                pass
            self._send_task = None
        logger.info("OutboundBuffer send loop stopped")

    async def _send_loop(self, swarm_router: 'SwarmRouter'):
        """
        Drain the queue, passing each packet to the swarm router.

        Polls the queue with a 1 second timeout so that a cleared
        ``_running`` flag is noticed even when no packets arrive.
        """
        while self._running:
            try:
                packet = await asyncio.wait_for(self._queue.get(), timeout=1.0)
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

            await self._deliver(swarm_router, packet)

    async def _deliver(self, swarm_router: 'SwarmRouter', packet: ActivationPacket):
        """Send one dequeued packet through the router and update counters."""
        self.send_in_progress += 1

        try:
            outcome = await swarm_router.send_with_failover(
                tensor=packet.tensor_data,
                target_layer=packet.target_layer,
                session_id=packet.session_id,
                metadata={
                    "source_node": packet.source_node,
                    "micro_batch_id": packet.micro_batch_id,
                    "priority": packet.priority,
                    "is_backward": packet.is_backward,
                },
            )

            if not outcome.success:
                self.packets_failed += 1
                logger.warning(f"Failed to send packet: {outcome.error}")
            else:
                self.packets_sent += 1

            # Every attempt beyond the first counts as a retry
            if outcome.attempts > 1:
                self.send_retries += outcome.attempts - 1

        except Exception as exc:
            self.packets_failed += 1
            logger.error(f"Send error: {exc}")
        finally:
            # Always balance the counter and mark the queue item finished so
            # that flush() (queue.join) can complete.
            self.send_in_progress -= 1
            self._queue.task_done()

    async def flush(self, timeout: float = 30.0):
        """Wait up to ``timeout`` seconds for every queued packet to be processed."""
        try:
            await asyncio.wait_for(self._queue.join(), timeout=timeout)
        except asyncio.TimeoutError:
            logger.warning(f"Flush timeout after {timeout}s, {self._queue.qsize()} packets remaining")

    def get_stats(self) -> Dict[str, Any]:
        """Return a snapshot of outbound buffer statistics."""
        snapshot: Dict[str, Any] = {
            "fill_rate": self.fill_rate,
            "queue_size": self._queue.qsize(),
            "max_size": self.max_size,
            "send_in_progress": self.send_in_progress,
            "packets_sent": self.packets_sent,
            "packets_failed": self.packets_failed,
            "send_retries": self.send_retries,
            "soft_overflow_count": self.soft_overflow_count,
            "hard_overflow_count": self.hard_overflow_count,
            "is_soft_overflow": self.is_soft_overflow,
            "is_hard_overflow": self.is_hard_overflow,
            "pressure": self.check_pressure(),
        }
        return snapshot
546
+
547
+
548
class BufferMetricsCollector:
    """
    Collects and aggregates metrics from activation buffers.

    A background task snapshots ``get_stats()`` of the inbound and outbound
    buffers every ``collect_interval`` seconds and keeps the most recent
    ``max_history`` snapshots in ``history``.
    """

    def __init__(
        self,
        inbound: ActivationBuffer,
        outbound: OutboundBuffer,
        collect_interval: float = 5.0,
    ):
        """
        Args:
            inbound: Buffer whose get_stats()/is_starved/fill_rate are sampled
            outbound: Buffer whose get_stats()/is_soft_overflow/fill_rate are sampled
            collect_interval: Seconds to sleep between snapshots
        """
        self.inbound = inbound
        self.outbound = outbound
        self.collect_interval = collect_interval

        # Historical metrics
        self.history: List[Dict[str, Any]] = []
        self.max_history = 720  # 1 hour at the default 5s interval

        self._running = False
        self._task: Optional[asyncio.Task] = None

    async def start(self):
        """
        Start metrics collection.

        Calling start() while a collection task is already active is a
        no-op, so a second call cannot orphan the first task.
        """
        if self._task is not None and not self._task.done():
            return
        self._running = True
        self._task = asyncio.create_task(self._collect_loop())

    async def stop(self):
        """
        Stop metrics collection and wait for the task to finish.

        The task reference is cleared so the collector can be started again.
        """
        self._running = False
        task, self._task = self._task, None
        if task is None:
            return
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def _collect_loop(self):
        """Periodically collect buffer metrics until stopped or cancelled."""
        while self._running:
            try:
                await asyncio.sleep(self.collect_interval)

                snapshot = {
                    "timestamp": time.time(),
                    "inbound": self.inbound.get_stats(),
                    "outbound": self.outbound.get_stats(),
                }

                self.history.append(snapshot)

                # Trim history to the newest max_history entries
                if len(self.history) > self.max_history:
                    self.history = self.history[-self.max_history:]

                # Log warnings
                if self.inbound.is_starved:
                    logger.warning(
                        f"Inbound buffer STARVED: {self.inbound.fill_rate:.1%} full"
                    )
                if self.outbound.is_soft_overflow:
                    logger.warning(
                        f"Outbound buffer OVERFLOW: {self.outbound.fill_rate:.1%} full"
                    )

            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep collecting: one failed snapshot must not end the loop
                logger.error(f"Metrics collection error: {e}")

    def get_summary(self) -> Dict[str, Any]:
        """
        Get summary statistics.

        Returns:
            With no history yet: ``{"inbound": ..., "outbound": ...}`` holding
            the current stats only. Otherwise a dict with ``"current"``
            stats, ``"averages"`` of the fill rates over ``history``, and
            ``"history_length"``.
        """
        if not self.history:
            return {
                "inbound": self.inbound.get_stats(),
                "outbound": self.outbound.get_stats(),
            }

        # Calculate averages over history
        samples = len(self.history)
        avg_inbound_fill = sum(h["inbound"]["fill_rate"] for h in self.history) / samples
        avg_outbound_fill = sum(h["outbound"]["fill_rate"] for h in self.history) / samples

        return {
            "current": {
                "inbound": self.inbound.get_stats(),
                "outbound": self.outbound.get_stats(),
            },
            "averages": {
                "inbound_fill_rate": avg_inbound_fill,
                "outbound_fill_rate": avg_outbound_fill,
            },
            "history_length": samples,
        }
643
+