nexaroa-0.0.111-py3-none-any.whl
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/core/swarm/logger.py

@@ -0,0 +1,487 @@
"""
Swarm Logger - Structured logging for NeuroShard Swarm Architecture

This module provides:
- Structured JSON logging for metrics and events
- Periodic summary statistics (reduce log spam)
- Role-specific log prefixes (DRIVER, WORKER, VALIDATOR)
- Log level filtering by component
- Training/Inference/Swarm log separation

Usage:
    from neuroshard.core.swarm import SwarmLogger, LogCategory

    logger = SwarmLogger("my_node")
    logger.log_training_step(round=100, loss=0.5, tokens=1000)
    logger.log_diloco_sync(inner_steps=500, outer_step=1)
"""

import json
import logging
import time
import threading
from dataclasses import dataclass, field, asdict
from typing import Dict, List, Optional, Any
from enum import Enum
from collections import defaultdict
from datetime import datetime


class LogCategory(Enum):
    """Log categories for filtering and routing."""
    SWARM = "swarm"
    TRAINING = "training"
    INFERENCE = "inference"
    DILOCO = "diloco"
    HEARTBEAT = "heartbeat"
    ROUTING = "routing"
    CHECKPOINT = "checkpoint"
    SYSTEM = "system"


class NodeRole(Enum):
    """Node roles for log prefixes."""
    DRIVER = "DRIVER"
    WORKER = "WORKER"
    VALIDATOR = "VALIDATOR"
    FULL = "FULL"  # Has both embedding and LM head


@dataclass
class LogStats:
    """Accumulated statistics for periodic summaries."""
    training_steps: int = 0
    total_loss: float = 0.0
    tokens_processed: int = 0
    diloco_syncs: int = 0
    activations_sent: int = 0
    activations_received: int = 0
    activations_dropped: int = 0
    local_only_steps: int = 0
    heartbeats_sent: int = 0
    heartbeats_received: int = 0
    peer_updates: int = 0
    checkpoints_saved: int = 0

    def reset(self):
        """Reset all stats."""
        self.training_steps = 0
        self.total_loss = 0.0
        self.tokens_processed = 0
        self.diloco_syncs = 0
        self.activations_sent = 0
        self.activations_received = 0
        self.activations_dropped = 0
        self.local_only_steps = 0
        self.heartbeats_sent = 0
        self.heartbeats_received = 0
        self.peer_updates = 0
        self.checkpoints_saved = 0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return asdict(self)


class SwarmLogger:
    """
    Structured logger for NeuroShard Swarm Architecture.

    Features:
    - Role-specific prefixes (DRIVER, WORKER, VALIDATOR)
    - JSON-structured event logging
    - Periodic summary statistics
    - Configurable log levels per category
    """

    # Summary interval in seconds
    SUMMARY_INTERVAL = 60

    def __init__(
        self,
        node_id: str,
        role: Optional[NodeRole] = None,
        log_level: int = logging.INFO,
        enable_json: bool = False,
        summary_interval: int = 60,
    ):
        """
        Initialize SwarmLogger.

        Args:
            node_id: Node identifier (first 8 chars used in prefix)
            role: Node role (DRIVER, WORKER, VALIDATOR, FULL)
            log_level: Base log level
            enable_json: Enable JSON-structured logging
            summary_interval: Seconds between periodic summaries
        """
        self.node_id = node_id
        self.node_id_short = node_id[:8] if node_id else "unknown"
        self.role = role or NodeRole.WORKER
        self.enable_json = enable_json
        self.summary_interval = summary_interval

        # Create logger
        self.logger = logging.getLogger(f"neuroshard.swarm.{self.node_id_short}")
        self.logger.setLevel(log_level)

        # Category-specific log levels
        self.category_levels: Dict[LogCategory, int] = defaultdict(lambda: log_level)

        # Accumulated stats for summaries
        self.stats = LogStats()
        self._stats_lock = threading.Lock()

        # Last summary time
        self.last_summary_time = time.time()

        # Start summary thread
        self._running = True
        self._summary_thread = threading.Thread(target=self._summary_loop, daemon=True)
        self._summary_thread.start()

    def stop(self):
        """Stop the logger and summary thread."""
        self._running = False

    def set_role(self, has_embedding: bool, has_lm_head: bool):
        """Set role based on layer assignment."""
        if has_embedding and has_lm_head:
            self.role = NodeRole.FULL
        elif has_embedding:
            self.role = NodeRole.DRIVER
        elif has_lm_head:
            self.role = NodeRole.VALIDATOR
        else:
            self.role = NodeRole.WORKER

    def set_category_level(self, category: LogCategory, level: int):
        """Set log level for a specific category."""
        self.category_levels[category] = level

    def _get_prefix(self) -> str:
        """Get log prefix based on role."""
        return f"[{self.role.value}:{self.node_id_short}]"

    def _format_message(
        self,
        category: LogCategory,
        message: str,
        data: Optional[Dict[str, Any]] = None
    ) -> str:
        """Format log message."""
        prefix = self._get_prefix()

        if self.enable_json and data:
            # JSON structured logging
            log_entry = {
                "timestamp": datetime.utcnow().isoformat(),
                "node_id": self.node_id_short,
                "role": self.role.value,
                "category": category.value,
                "message": message,
                **data
            }
            return json.dumps(log_entry)
        elif data:
            # Human-readable with data
            data_str = ", ".join(f"{k}={v}" for k, v in data.items())
            return f"{prefix} [{category.value}] {message} ({data_str})"
        else:
            return f"{prefix} [{category.value}] {message}"

    def _log(
        self,
        level: int,
        category: LogCategory,
        message: str,
        data: Optional[Dict[str, Any]] = None
    ):
        """Internal log method."""
        # Check category-specific level
        if level < self.category_levels[category]:
            return

        formatted = self._format_message(category, message, data)
        self.logger.log(level, formatted)

    # ==================== TRAINING LOGS ====================

    def log_training_step(
        self,
        round: int,
        loss: float,
        tokens: int = 0,
        duration_ms: float = 0,
    ):
        """Log a training step completion."""
        with self._stats_lock:
            self.stats.training_steps += 1
            self.stats.total_loss += loss
            self.stats.tokens_processed += tokens

        self._log(
            logging.INFO,
            LogCategory.TRAINING,
            f"Step #{round} complete",
            {"loss": f"{loss:.4f}", "tokens": tokens, "duration_ms": f"{duration_ms:.1f}"}
        )

    def log_training_waiting(self, reason: str = "data"):
        """Log training waiting state."""
        self._log(
            logging.DEBUG,
            LogCategory.TRAINING,
            f"Waiting for {reason}",
        )

    # ==================== DiLoCo LOGS ====================

    def log_diloco_progress(self, inner_step: int, inner_total: int):
        """Log DiLoCo inner step progress (only on milestones)."""
        # Only log at 10%, 25%, 50%, 75%, 90%, 100%
        progress = inner_step / inner_total
        milestones = [0.1, 0.25, 0.5, 0.75, 0.9, 1.0]

        for milestone in milestones:
            if abs(progress - milestone) < 0.01:
                self._log(
                    logging.INFO,
                    LogCategory.DILOCO,
                    f"Inner progress: {int(progress * 100)}%",
                    {"inner_step": inner_step, "inner_total": inner_total}
                )
                break

    def log_diloco_sync(self, inner_steps: int, outer_step: int, duration_ms: float = 0):
        """Log DiLoCo outer sync completion."""
        with self._stats_lock:
            self.stats.diloco_syncs += 1

        self._log(
            logging.INFO,
            LogCategory.DILOCO,
            f"Outer sync #{outer_step} complete",
            {"inner_steps": inner_steps, "duration_ms": f"{duration_ms:.1f}"}
        )

    # ==================== SWARM LOGS ====================

    def log_activation_sent(self, target_layer: int, target_node: str):
        """Log activation sent to peer."""
        with self._stats_lock:
            self.stats.activations_sent += 1

        self._log(
            logging.DEBUG,
            LogCategory.ROUTING,
            f"Sent activation to layer {target_layer}",
            {"target_node": target_node[:8]}
        )

    def log_activation_received(self, source_node: str, layer: int):
        """Log activation received from peer."""
        with self._stats_lock:
            self.stats.activations_received += 1

        self._log(
            logging.DEBUG,
            LogCategory.ROUTING,
            f"Received activation for layer {layer}",
            {"source_node": source_node[:8]}
        )

    def log_soft_overflow(self, step: int, buffer_fill: float):
        """Log soft overflow (local accumulation)."""
        with self._stats_lock:
            self.stats.local_only_steps += 1

        self._log(
            logging.WARNING,
            LogCategory.SWARM,
            f"Soft overflow at step {step}",
            {"buffer_fill": f"{buffer_fill:.1%}"}
        )

    def log_hard_overflow(self, step: int, buffer_fill: float):
        """Log hard overflow (dropped step)."""
        with self._stats_lock:
            self.stats.activations_dropped += 1

        self._log(
            logging.ERROR,
            LogCategory.SWARM,
            f"Hard overflow at step {step} - step dropped",
            {"buffer_fill": f"{buffer_fill:.1%}"}
        )

    def log_failover(self, from_node: str, to_node: str, reason: str):
        """Log routing failover."""
        self._log(
            logging.WARNING,
            LogCategory.ROUTING,
            f"Failover from {from_node[:8]} to {to_node[:8]}",
            {"reason": reason}
        )

    # ==================== HEARTBEAT LOGS ====================

    def log_heartbeat_sent(self, peers: int):
        """Log heartbeat broadcast."""
        with self._stats_lock:
            self.stats.heartbeats_sent += 1

        self._log(
            logging.DEBUG,
            LogCategory.HEARTBEAT,
            f"Heartbeat sent to {peers} peers",
        )

    def log_heartbeat_received(self, from_node: str):
        """Log heartbeat received."""
        with self._stats_lock:
            self.stats.heartbeats_received += 1

        self._log(
            logging.DEBUG,
            LogCategory.HEARTBEAT,
            f"Heartbeat from {from_node[:8]}",
        )

    def log_peer_update(self, node_id: str, capacity: int):
        """Log peer capacity update."""
        with self._stats_lock:
            self.stats.peer_updates += 1

        self._log(
            logging.DEBUG,
            LogCategory.HEARTBEAT,
            f"Peer {node_id[:8]} capacity updated",
            {"available_mb": capacity}
        )

    # ==================== CHECKPOINT LOGS ====================

    def log_checkpoint_saved(self, path: str, size_mb: float):
        """Log checkpoint saved."""
        with self._stats_lock:
            self.stats.checkpoints_saved += 1

        self._log(
            logging.INFO,
            LogCategory.CHECKPOINT,
            "Checkpoint saved",
            {"path": path, "size_mb": f"{size_mb:.1f}"}
        )

    def log_checkpoint_restored(self, path: str, round: int):
        """Log checkpoint restored."""
        self._log(
            logging.INFO,
            LogCategory.CHECKPOINT,
            f"Checkpoint restored from round {round}",
            {"path": path}
        )

    # ==================== SUMMARY LOGS ====================

    def _summary_loop(self):
        """Background thread for periodic summaries."""
        while self._running:
            time.sleep(self.summary_interval)
            self.log_summary()

    def log_summary(self):
        """Log periodic summary of accumulated stats."""
        with self._stats_lock:
            stats = self.stats.to_dict()

            # Calculate averages
            avg_loss = (
                stats["total_loss"] / stats["training_steps"]
                if stats["training_steps"] > 0
                else 0.0
            )

            # Build summary message
            summary_parts = []

            if stats["training_steps"] > 0:
                summary_parts.append(
                    f"Training: {stats['training_steps']} steps, "
                    f"avg_loss={avg_loss:.4f}, "
                    f"tokens={stats['tokens_processed']}"
                )

            if stats["diloco_syncs"] > 0:
                summary_parts.append(f"DiLoCo: {stats['diloco_syncs']} syncs")

            if stats["activations_sent"] > 0 or stats["activations_received"] > 0:
                summary_parts.append(
                    f"Routing: sent={stats['activations_sent']}, "
                    f"recv={stats['activations_received']}, "
                    f"dropped={stats['activations_dropped']}"
                )

            if stats["local_only_steps"] > 0:
                summary_parts.append(f"Overflow: {stats['local_only_steps']} local-only steps")

            if stats["heartbeats_sent"] > 0:
                summary_parts.append(
                    f"Heartbeat: sent={stats['heartbeats_sent']}, "
                    f"recv={stats['heartbeats_received']}"
                )

            if stats["checkpoints_saved"] > 0:
                summary_parts.append(f"Checkpoints: {stats['checkpoints_saved']} saved")

            # Log summary if there's anything to report
            if summary_parts:
                self._log(
                    logging.INFO,
                    LogCategory.SYSTEM,
                    "[SUMMARY] " + " | ".join(summary_parts),
                    {"interval_seconds": self.summary_interval}
                )

            # Reset stats
            self.stats.reset()

    # ==================== CONVENIENCE METHODS ====================

    def info(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
        """Log info message."""
        self._log(logging.INFO, category, message, data if data else None)

    def warning(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
        """Log warning message."""
        self._log(logging.WARNING, category, message, data if data else None)

    def error(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
        """Log error message."""
        self._log(logging.ERROR, category, message, data if data else None)

    def debug(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
        """Log debug message."""
        self._log(logging.DEBUG, category, message, data if data else None)


# Global logger instance (can be initialized later)
_swarm_logger: Optional[SwarmLogger] = None


def get_swarm_logger() -> Optional[SwarmLogger]:
    """Get the global swarm logger instance."""
    return _swarm_logger


def init_swarm_logger(
    node_id: str,
    role: Optional[NodeRole] = None,
    **kwargs
) -> SwarmLogger:
    """Initialize the global swarm logger."""
    global _swarm_logger
    _swarm_logger = SwarmLogger(node_id, role, **kwargs)
    return _swarm_logger
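
For context, the module's own docstring sketches how this logger is meant to be used. Below is a minimal, self-contained usage sketch based only on the API visible in the diff above; the node ID, role, metric values, checkpoint path, and the direct `neuroshard.core.swarm.logger` import path (rather than a re-export from the package `__init__`) are illustrative assumptions, not documented behavior.

import logging

# Assumption: importing directly from the module shown in this diff.
from neuroshard.core.swarm.logger import (
    LogCategory,
    NodeRole,
    get_swarm_logger,
    init_swarm_logger,
)

# SwarmLogger wraps a stdlib logger named "neuroshard.swarm.<id>",
# so a handler must be configured for anything to be emitted.
logging.basicConfig(level=logging.DEBUG)

# Initialize the process-wide logger (illustrative node ID and settings).
swarm_logger = init_swarm_logger(
    "a1b2c3d4e5f6",       # only the first 8 characters appear in the prefix
    role=NodeRole.DRIVER,
    enable_json=False,    # True switches to JSON-structured records
    summary_interval=30,  # seconds between [SUMMARY] lines
)

# Per-category filtering: silence routine heartbeat chatter.
swarm_logger.set_category_level(LogCategory.HEARTBEAT, logging.WARNING)

# Typical events during a training round (values are made up).
swarm_logger.log_training_step(round=100, loss=0.5, tokens=1000, duration_ms=42.0)
swarm_logger.log_diloco_sync(inner_steps=500, outer_step=1, duration_ms=1200.0)
swarm_logger.log_checkpoint_saved("/tmp/ckpt_round_100.pt", size_mb=512.0)

# Elsewhere in the same process, the global instance can be retrieved.
logger = get_swarm_logger()
if logger is not None:
    logger.info("node ready", category=LogCategory.SYSTEM, layers="0-11")

swarm_logger.stop()  # stops the background summary thread on shutdown

With `enable_json=False` the events above are emitted as human-readable lines such as `[DRIVER:a1b2c3d4] [training] Step #100 complete (loss=0.5000, tokens=1000, duration_ms=42.0)`, while the accumulated counters are rolled up into a single `[SUMMARY]` line every `summary_interval` seconds.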