nexaroa-0.0.111-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/core/swarm/logger.py
@@ -0,0 +1,487 @@
+ """
+ Swarm Logger - Structured logging for NeuroShard Swarm Architecture
+
+ This module provides:
+ - Structured JSON logging for metrics and events
+ - Periodic summary statistics (reduce log spam)
+ - Role-specific log prefixes (DRIVER, WORKER, VALIDATOR)
+ - Log level filtering by component
+ - Training/Inference/Swarm log separation
+
+ Usage:
+     from neuroshard.core.swarm import SwarmLogger, LogCategory
+
+     logger = SwarmLogger("my_node")
+     logger.log_training_step(round=100, loss=0.5, tokens=1000)
+     logger.log_diloco_sync(inner_steps=500, outer_step=1)
+ """
+
+ import json
+ import logging
+ import time
+ import threading
+ from dataclasses import dataclass, field, asdict
+ from typing import Dict, List, Optional, Any
+ from enum import Enum
+ from collections import defaultdict
+ from datetime import datetime
+
+
+ class LogCategory(Enum):
+     """Log categories for filtering and routing."""
+     SWARM = "swarm"
+     TRAINING = "training"
+     INFERENCE = "inference"
+     DILOCO = "diloco"
+     HEARTBEAT = "heartbeat"
+     ROUTING = "routing"
+     CHECKPOINT = "checkpoint"
+     SYSTEM = "system"
+
+
+ class NodeRole(Enum):
+     """Node roles for log prefixes."""
+     DRIVER = "DRIVER"
+     WORKER = "WORKER"
+     VALIDATOR = "VALIDATOR"
+     FULL = "FULL"  # Has both embedding and LM head
+
+
+ @dataclass
+ class LogStats:
+     """Accumulated statistics for periodic summaries."""
+     # Plain counters, not thread-safe on their own; SwarmLogger mutates them
+     # only while holding its _stats_lock.
+     training_steps: int = 0
+     total_loss: float = 0.0
+     tokens_processed: int = 0
+     diloco_syncs: int = 0
+     activations_sent: int = 0
+     activations_received: int = 0
+     activations_dropped: int = 0
+     local_only_steps: int = 0
+     heartbeats_sent: int = 0
+     heartbeats_received: int = 0
+     peer_updates: int = 0
+     checkpoints_saved: int = 0
+
+     def reset(self):
+         """Reset all stats."""
+         self.training_steps = 0
+         self.total_loss = 0.0
+         self.tokens_processed = 0
+         self.diloco_syncs = 0
+         self.activations_sent = 0
+         self.activations_received = 0
+         self.activations_dropped = 0
+         self.local_only_steps = 0
+         self.heartbeats_sent = 0
+         self.heartbeats_received = 0
+         self.peer_updates = 0
+         self.checkpoints_saved = 0
+
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary."""
+         return asdict(self)
+
+
+ class SwarmLogger:
+     """
+     Structured logger for NeuroShard Swarm Architecture.
+
+     Features:
+     - Role-specific prefixes (DRIVER, WORKER, VALIDATOR)
+     - JSON-structured event logging
+     - Periodic summary statistics
+     - Configurable log levels per category
+     """
+
+     # Summary interval in seconds
+     SUMMARY_INTERVAL = 60
+
+     def __init__(
+         self,
+         node_id: str,
+         role: Optional[NodeRole] = None,
+         log_level: int = logging.INFO,
+         enable_json: bool = False,
+         summary_interval: int = 60,
+     ):
+         """
+         Initialize SwarmLogger.
+
+         Args:
+             node_id: Node identifier (first 8 chars used in prefix)
+             role: Node role (DRIVER, WORKER, VALIDATOR, FULL)
+             log_level: Base log level
+             enable_json: Enable JSON-structured logging
+             summary_interval: Seconds between periodic summaries
+         """
+         self.node_id = node_id
+         self.node_id_short = node_id[:8] if node_id else "unknown"
+         self.role = role or NodeRole.WORKER
+         self.enable_json = enable_json
+         self.summary_interval = summary_interval
+
+         # Create logger
+         self.logger = logging.getLogger(f"neuroshard.swarm.{self.node_id_short}")
+         self.logger.setLevel(log_level)
+
+         # Category-specific log levels
+         self.category_levels: Dict[LogCategory, int] = defaultdict(lambda: log_level)
+
+         # Accumulated stats for summaries
+         self.stats = LogStats()
+         self._stats_lock = threading.Lock()
+
+         # Last summary time
+         self.last_summary_time = time.time()
+
+         # Start summary thread
+         self._running = True
+         self._summary_thread = threading.Thread(target=self._summary_loop, daemon=True)
+         self._summary_thread.start()
+
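+     # NOTE: The summary thread is a daemon and re-checks _running only after
+     # each sleep, so stop() can take up to summary_interval seconds to take
+     # effect; it does not join the thread.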
+     def stop(self):
+         """Stop the logger and summary thread."""
+         self._running = False
+
+     def set_role(self, has_embedding: bool, has_lm_head: bool):
+         """Set role based on layer assignment."""
+         if has_embedding and has_lm_head:
+             self.role = NodeRole.FULL
+         elif has_embedding:
+             self.role = NodeRole.DRIVER
+         elif has_lm_head:
+             self.role = NodeRole.VALIDATOR
+         else:
+             self.role = NodeRole.WORKER
+
+     def set_category_level(self, category: LogCategory, level: int):
+         """Set log level for a specific category."""
+         self.category_levels[category] = level
+
+     def _get_prefix(self) -> str:
+         """Get log prefix based on role."""
+         return f"[{self.role.value}:{self.node_id_short}]"
+
+     def _format_message(
+         self,
+         category: LogCategory,
+         message: str,
+         data: Optional[Dict[str, Any]] = None
+     ) -> str:
+         """Format log message."""
+         prefix = self._get_prefix()
+
+         if self.enable_json and data:
+             # JSON structured logging (the role prefix is carried in the
+             # "role" field rather than prepended to the message)
+             log_entry = {
+                 "timestamp": datetime.utcnow().isoformat(),
+                 "node_id": self.node_id_short,
+                 "role": self.role.value,
+                 "category": category.value,
+                 "message": message,
+                 **data
+             }
+             return json.dumps(log_entry)
+         elif data:
+             # Human-readable with data
+             data_str = ", ".join(f"{k}={v}" for k, v in data.items())
+             return f"{prefix} [{category.value}] {message} ({data_str})"
+         else:
+             return f"{prefix} [{category.value}] {message}"
+
+     def _log(
+         self,
+         level: int,
+         category: LogCategory,
+         message: str,
+         data: Optional[Dict[str, Any]] = None
+     ):
+         """Internal log method."""
+         # Check category-specific level first; the underlying logging.Logger
+         # level still applies inside self.logger.log()
+         if level < self.category_levels[category]:
+             return
+
+         formatted = self._format_message(category, message, data)
+         self.logger.log(level, formatted)
+
+     # ==================== TRAINING LOGS ====================
+
+     def log_training_step(
+         self,
+         round: int,
+         loss: float,
+         tokens: int = 0,
+         duration_ms: float = 0,
+     ):
+         """Log a training step completion."""
+         with self._stats_lock:
+             self.stats.training_steps += 1
+             self.stats.total_loss += loss
+             self.stats.tokens_processed += tokens
+
+         self._log(
+             logging.INFO,
+             LogCategory.TRAINING,
+             f"Step #{round} complete",
+             {"loss": f"{loss:.4f}", "tokens": tokens, "duration_ms": f"{duration_ms:.1f}"}
+         )
+
+     def log_training_waiting(self, reason: str = "data"):
+         """Log training waiting state."""
+         self._log(
+             logging.DEBUG,
+             LogCategory.TRAINING,
+             f"Waiting for {reason}",
+         )
+
+     # ==================== DiLoCo LOGS ====================
+
+     def log_diloco_progress(self, inner_step: int, inner_total: int):
+         """Log DiLoCo inner step progress (only on milestones)."""
+         # Only log at 10%, 25%, 50%, 75%, 90%, 100%
+         progress = inner_step / inner_total
+         milestones = [0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
+
+         # A milestone fires only when some inner step lands within 1% of it,
+         # so for small inner_total values most milestones are skipped.
+         for milestone in milestones:
+             if abs(progress - milestone) < 0.01:
+                 self._log(
+                     logging.INFO,
+                     LogCategory.DILOCO,
+                     f"Inner progress: {int(progress * 100)}%",
+                     {"inner_step": inner_step, "inner_total": inner_total}
+                 )
+                 break
+
+     def log_diloco_sync(self, inner_steps: int, outer_step: int, duration_ms: float = 0):
+         """Log DiLoCo outer sync completion."""
+         with self._stats_lock:
+             self.stats.diloco_syncs += 1
+
+         self._log(
+             logging.INFO,
+             LogCategory.DILOCO,
+             f"Outer sync #{outer_step} complete",
+             {"inner_steps": inner_steps, "duration_ms": f"{duration_ms:.1f}"}
+         )
+
+     # ==================== SWARM LOGS ====================
+
+     def log_activation_sent(self, target_layer: int, target_node: str):
+         """Log activation sent to peer."""
+         with self._stats_lock:
+             self.stats.activations_sent += 1
+
+         self._log(
+             logging.DEBUG,
+             LogCategory.ROUTING,
+             f"Sent activation to layer {target_layer}",
+             {"target_node": target_node[:8]}
+         )
+
+     def log_activation_received(self, source_node: str, layer: int):
+         """Log activation received from peer."""
+         with self._stats_lock:
+             self.stats.activations_received += 1
+
+         self._log(
+             logging.DEBUG,
+             LogCategory.ROUTING,
+             f"Received activation for layer {layer}",
+             {"source_node": source_node[:8]}
+         )
+
+     def log_soft_overflow(self, step: int, buffer_fill: float):
+         """Log soft overflow (local accumulation)."""
+         with self._stats_lock:
+             self.stats.local_only_steps += 1
+
+         self._log(
+             logging.WARNING,
+             LogCategory.SWARM,
+             f"Soft overflow at step {step}",
+             {"buffer_fill": f"{buffer_fill:.1%}"}
+         )
+
+     def log_hard_overflow(self, step: int, buffer_fill: float):
+         """Log hard overflow (dropped step)."""
+         with self._stats_lock:
+             self.stats.activations_dropped += 1
+
+         self._log(
+             logging.ERROR,
+             LogCategory.SWARM,
+             f"Hard overflow at step {step} - step dropped",
+             {"buffer_fill": f"{buffer_fill:.1%}"}
+         )
+
+     def log_failover(self, from_node: str, to_node: str, reason: str):
+         """Log routing failover."""
+         self._log(
+             logging.WARNING,
+             LogCategory.ROUTING,
+             f"Failover from {from_node[:8]} to {to_node[:8]}",
+             {"reason": reason}
+         )
+
+     # ==================== HEARTBEAT LOGS ====================
+
+     def log_heartbeat_sent(self, peers: int):
+         """Log heartbeat broadcast."""
+         with self._stats_lock:
+             self.stats.heartbeats_sent += 1
+
+         self._log(
+             logging.DEBUG,
+             LogCategory.HEARTBEAT,
+             f"Heartbeat sent to {peers} peers",
+         )
+
+     def log_heartbeat_received(self, from_node: str):
+         """Log heartbeat received."""
+         with self._stats_lock:
+             self.stats.heartbeats_received += 1
+
+         self._log(
+             logging.DEBUG,
+             LogCategory.HEARTBEAT,
+             f"Heartbeat from {from_node[:8]}",
+         )
+
+     def log_peer_update(self, node_id: str, capacity: int):
+         """Log peer capacity update."""
+         with self._stats_lock:
+             self.stats.peer_updates += 1
+
+         self._log(
+             logging.DEBUG,
+             LogCategory.HEARTBEAT,
+             f"Peer {node_id[:8]} capacity updated",
+             {"available_mb": capacity}
+         )
+
+     # ==================== CHECKPOINT LOGS ====================
+
+     def log_checkpoint_saved(self, path: str, size_mb: float):
+         """Log checkpoint saved."""
+         with self._stats_lock:
+             self.stats.checkpoints_saved += 1
+
+         self._log(
+             logging.INFO,
+             LogCategory.CHECKPOINT,
+             "Checkpoint saved",
+             {"path": path, "size_mb": f"{size_mb:.1f}"}
+         )
+
+     def log_checkpoint_restored(self, path: str, round: int):
+         """Log checkpoint restored."""
+         self._log(
+             logging.INFO,
+             LogCategory.CHECKPOINT,
+             f"Checkpoint restored from round {round}",
+             {"path": path}
+         )
+
+     # ==================== SUMMARY LOGS ====================
+
+     def _summary_loop(self):
+         """Background thread for periodic summaries."""
+         while self._running:
+             time.sleep(self.summary_interval)
+             self.log_summary()
+
+     def log_summary(self):
+         """Log periodic summary of accumulated stats."""
+         with self._stats_lock:
+             stats = self.stats.to_dict()
+
+             # Calculate averages
+             avg_loss = (
+                 stats["total_loss"] / stats["training_steps"]
+                 if stats["training_steps"] > 0
+                 else 0.0
+             )
+
+             # Build summary message
+             summary_parts = []
+
+             if stats["training_steps"] > 0:
+                 summary_parts.append(
+                     f"Training: {stats['training_steps']} steps, "
+                     f"avg_loss={avg_loss:.4f}, "
+                     f"tokens={stats['tokens_processed']}"
+                 )
+
+             if stats["diloco_syncs"] > 0:
+                 summary_parts.append(f"DiLoCo: {stats['diloco_syncs']} syncs")
+
+             if stats["activations_sent"] > 0 or stats["activations_received"] > 0:
+                 summary_parts.append(
+                     f"Routing: sent={stats['activations_sent']}, "
+                     f"recv={stats['activations_received']}, "
+                     f"dropped={stats['activations_dropped']}"
+                 )
+
+             if stats["local_only_steps"] > 0:
+                 summary_parts.append(f"Overflow: {stats['local_only_steps']} local-only steps")
+
+             if stats["heartbeats_sent"] > 0:
+                 summary_parts.append(
+                     f"Heartbeat: sent={stats['heartbeats_sent']}, "
+                     f"recv={stats['heartbeats_received']}"
+                 )
+
+             if stats["checkpoints_saved"] > 0:
+                 summary_parts.append(f"Checkpoints: {stats['checkpoints_saved']} saved")
+
+             # Log summary if there's anything to report
+             if summary_parts:
+                 self._log(
+                     logging.INFO,
+                     LogCategory.SYSTEM,
+                     "[SUMMARY] " + " | ".join(summary_parts),
+                     {"interval_seconds": self.summary_interval}
+                 )
+
+             # Reset stats
+             self.stats.reset()
+
+     # ==================== CONVENIENCE METHODS ====================
+
+     def info(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
+         """Log info message."""
+         self._log(logging.INFO, category, message, data if data else None)
+
+     def warning(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
+         """Log warning message."""
+         self._log(logging.WARNING, category, message, data if data else None)
+
+     def error(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
+         """Log error message."""
+         self._log(logging.ERROR, category, message, data if data else None)
+
+     def debug(self, message: str, category: LogCategory = LogCategory.SYSTEM, **data):
+         """Log debug message."""
+         self._log(logging.DEBUG, category, message, data if data else None)
+
+
+ # Global logger instance (can be initialized later)
+ _swarm_logger: Optional[SwarmLogger] = None
+
+
+ def get_swarm_logger() -> Optional[SwarmLogger]:
+     """Get the global swarm logger instance."""
+     return _swarm_logger
+
+
+ def init_swarm_logger(
+     node_id: str,
+     role: Optional[NodeRole] = None,
+     **kwargs
+ ) -> SwarmLogger:
+     """Initialize the global swarm logger."""
+     global _swarm_logger
+     _swarm_logger = SwarmLogger(node_id, role, **kwargs)
+     return _swarm_logger
+
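
A minimal usage sketch for the module above, assuming only what this file itself defines (SwarmLogger, NodeRole, LogCategory, init_swarm_logger in neuroshard/core/swarm/logger.py); the node ID and metric values are illustrative:

    import logging

    from neuroshard.core.swarm.logger import (
        LogCategory,
        NodeRole,
        init_swarm_logger,
    )

    # The root handler must allow DEBUG records through for the debug/heartbeat logs.
    logging.basicConfig(level=logging.DEBUG)

    # Extra kwargs are forwarded to SwarmLogger.__init__ (log_level, enable_json, ...).
    logger = init_swarm_logger(
        "a1b2c3d4e5f6",
        role=NodeRole.DRIVER,
        log_level=logging.DEBUG,
        summary_interval=30,
    )

    logger.log_training_step(round=100, loss=0.5, tokens=1000, duration_ms=12.3)
    logger.log_diloco_sync(inner_steps=500, outer_step=1, duration_ms=85.0)
    logger.debug("peer probe", category=LogCategory.HEARTBEAT, peer="deadbeef")

    logger.stop()  # the daemon summary thread exits after its current sleep

With enable_json=True, events that carry data are emitted as one JSON object per line, with timestamp, node_id, role, category, and message keys plus the event's own fields.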