nexaroa 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
@@ -0,0 +1,929 @@
import requests
import random
import threading
import time
import hashlib
import json
import logging
import math
import os
import sqlite3
from typing import Dict, List, Optional, Any
from urllib.parse import urlparse

# DHT Imports
try:
    from neuroshard.core.network.dht import Node, RoutingTable
    from neuroshard.core.network.dht_protocol import DHTProtocol
    DHT_AVAILABLE = True
except ImportError:
    DHT_AVAILABLE = False

# Ledger Imports
try:
    from neuroshard.core.economics.ledger import NEUROLedger, ProofType, PoNWProof
    LEDGER_AVAILABLE = True
except ImportError as e:
    LEDGER_AVAILABLE = False
    print(f"[LEDGER IMPORT ERROR] {e}")

logger = logging.getLogger(__name__)


class P2PManager:
    def __init__(self, my_url: str, shard_range: str, tracker_url: str = "http://localhost:3000", node_token: Optional[str] = None):
        self.my_url = my_url
        self.shard_range = shard_range
        self.tracker_url = tracker_url
        self.node_token = node_token
        self.known_peers: Dict[str, dict] = {}  # url -> info
        self.running = True
        self._stop_event = threading.Event()  # For interruptible sleeps

        # Parse the local shard range
        try:
            self.start_layer, self.end_layer = map(int, shard_range.split("-"))
        except Exception:
            self.start_layer, self.end_layer = 0, 0

        # Metrics
        self.current_tps = 0.0
        self.current_latency = 0.0

        # Reference to global state (injected by the runner)
        self.state_ref = {}

        # --- DHT & Decentralization Init ---
        self.dht = None
        self.routing_table = None
        self.ledger = None

        # The node ID will be set from the ledger crypto (ECDSA-derived).
        # For the DHT we need an integer ID, so we derive it from the token.
        if self.node_token:
            # Use the first 20 bytes of SHA256(token) as the DHT node ID (160-bit)
            self.node_id = int(hashlib.sha256(self.node_token.encode()).hexdigest()[:40], 16)
        else:
            # Fall back to a random ID
            self.node_id = int(hashlib.sha1(f"{my_url}{time.time()}".encode()).hexdigest(), 16)

        # The ledger node_id (32 hex chars from the ECDSA public key) will differ,
        # but is deterministically linked to the same token.
        self.ledger_node_id = None  # Set after ledger init

        if DHT_AVAILABLE:
            try:
                parsed = urlparse(my_url)
                ip = parsed.hostname or 'localhost'
                port = parsed.port or (443 if parsed.scheme == 'https' else 80)

                self.local_node = Node(self.node_id, ip, port)
                self.routing_table = RoutingTable(self.local_node)
                self.dht = DHTProtocol(self.local_node, self.routing_table, port)
                # Expose the internal storage for gRPC inspection
                self.dht_storage = self.dht.storage
                logger.info(f"DHT Initialized: {self.local_node}")
            except Exception as e:
                logger.error(f"Failed to init DHT: {e}")
                self.dht_storage = {}  # Fallback

        if not hasattr(self, 'dht_storage'):
            self.dht_storage = {}

        if LEDGER_AVAILABLE:
            try:
                # Check for an explicit path from the environment (Docker/production)
                ledger_db_path = os.getenv("LEDGER_DB_PATH")

                if not ledger_db_path:
                    # Fall back to the ~/.neuroshard/ directory for local development
                    neuroshard_dir = os.path.join(os.path.expanduser("~"), ".neuroshard")
                    os.makedirs(neuroshard_dir, exist_ok=True)
                    ledger_db_path = os.path.join(neuroshard_dir, f"ledger_{self.node_id}.db")
                else:
                    # Ensure the directory exists for an explicit path
                    os.makedirs(os.path.dirname(ledger_db_path), exist_ok=True)

                logger.info(f"Ledger DB path: {ledger_db_path}")

                self.ledger = NEUROLedger(
                    db_path=ledger_db_path,
                    node_token=self.node_token
                )
                # Get the ECDSA-derived node_id from the ledger
                self.ledger_node_id = self.ledger.node_id
                logger.info(f"NEUROLedger Initialized with ECDSA node_id: {self.ledger_node_id[:16]}...")

                # Bootstrap the balance from the DHT for existing wallets.
                # Fully trustless via ECDSA signature verification + Byzantine consensus.
                self._bootstrap_balance_from_dht()
            except Exception as e:
                logger.error(f"Failed to init Ledger: {e}")
                self.ledger = None  # Ensure an explicit None on failure
        else:
            logger.info("Ledger Manager NOT available (dependencies missing or import failed)")

        # Reference to NeuroNode (set later via set_neuro_node)
        self.neuro_node = None

        # Start background tasks
        threading.Thread(target=self._announce_loop, daemon=True).start()
        threading.Thread(target=self._gossip_loop, daemon=True).start()
        if self.ledger:
            threading.Thread(target=self._sync_stakes_loop, daemon=True).start()

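    # A minimal sketch of the two ID spaces set up above (the token value is
    # hypothetical; only the derivation itself is taken from the code):
    #   token  = "my-node-token"
    #   dht_id = int(hashlib.sha256(token.encode()).hexdigest()[:40], 16)
    #   # 40 hex chars = 20 bytes = 160 bits, matching Kademlia's XOR keyspace.
    #   # The ledger's node_id comes from the ECDSA keypair derived from the
    #   # same token, so the two IDs differ but remain linked to one wallet.
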
    def set_neuro_node(self, neuro_node):
        """Set reference to NeuroNode for checkpoint announcements."""
        self.neuro_node = neuro_node

    def get_swarm_status(self) -> Dict[str, Any]:
        """
        Get swarm-related status from the connected node.

        Returns:
            Dict with swarm status including buffer fill rates, DiLoCo progress, etc.
        """
        if not self.neuro_node:
            return {"swarm_enabled": False, "error": "Node not connected"}

        # Check if the node has swarm capabilities
        if hasattr(self.neuro_node, 'get_swarm_status'):
            return self.neuro_node.get_swarm_status()
        else:
            return {"swarm_enabled": False}

    def get_diloco_progress(self) -> Dict[str, Any]:
        """
        Get DiLoCo training progress from the connected node.

        Returns:
            Dict with inner step count, sync progress, etc.
        """
        if not self.neuro_node:
            return {"enabled": False, "error": "Node not connected"}

        if hasattr(self.neuro_node, 'get_diloco_progress'):
            return self.neuro_node.get_diloco_progress()
        else:
            return {"enabled": False}

    def get_network_health(self) -> Dict[str, Any]:
        """
        Get overall network health metrics.

        Returns:
            Dict with peer count, average latency, routing stats, etc.
        """
        health = {
            "peer_count": len(self.known_peers),
            "avg_latency_ms": self.current_latency * 1000 if self.current_latency else 0,
            "current_tps": self.current_tps,
            "dht_available": self.dht is not None,
            "ledger_available": self.ledger is not None,
        }

        # Add swarm stats if available
        swarm_status = self.get_swarm_status()
        if swarm_status.get("swarm_enabled", False):
            health["swarm_enabled"] = True
            if "router" in swarm_status:
                health["swarm_peers"] = swarm_status["router"].get("peer_count", 0)
            if "heartbeat" in swarm_status:
                health["heartbeat_peers"] = swarm_status["heartbeat"].get("peer_count", 0)
        else:
            health["swarm_enabled"] = False

        return health

    def stop(self):
        """Stop the P2P manager and all background threads."""
        logger.info("Stopping P2P manager...")
        self.running = False

        # Signal the stop event (for threads that check it)
        if hasattr(self, '_stop_event'):
            self._stop_event.set()

        # Close the DHT if available
        if self.dht:
            try:
                # The DHT has no stop method, but we can clear its state
                self.dht.storage.clear()
            except Exception:
                pass

        # Clear known peers
        self.known_peers.clear()

        logger.info("P2P manager stopped")

    def update_metrics(self, tps: float, latency: float):
        self.current_tps = tps
        self.current_latency = latency

    def _store_proof_in_dht(self, proof: 'PoNWProof', reward: float):
        """
        Store a proof in the DHT for decentralized balance sync.

        This enables new nodes to bootstrap their balance from the DHT
        without relying on a central API.

        Args:
            proof: The PoNW proof to store
            reward: The reward amount credited for this proof
        """
        if not self.dht:
            return  # DHT not available

        try:
            from neuroshard.core.network.dht_proof_store import DHTProofStore, DHTProofRecord

            # Create the DHT proof store (lazy init)
            if not hasattr(self, '_dht_proof_store'):
                self._dht_proof_store = DHTProofStore(self.dht)

            # Create a proof record with ALL fields needed for verification.
            # CRITICAL: must include nonce, model_hash, and public_key for ECDSA verification.
            proof_record = DHTProofRecord(
                node_id=proof.node_id,
                timestamp=proof.timestamp,
                proof_type=proof.proof_type.value if hasattr(proof.proof_type, 'value') else str(proof.proof_type),
                nonce=proof.nonce,  # 🔒 Required for canonical_payload
                reward=reward,
                signature=proof.signature,
                public_key=self.ledger.crypto.public_key_hex if self.ledger and self.ledger.crypto else "",  # 🔒 Required for verification
                uptime_seconds=proof.uptime_seconds,
                tokens_processed=proof.tokens_processed,
                training_batches=proof.training_batches,
                data_samples=proof.data_samples,
                model_hash=proof.model_hash,  # 🔒 Required for canonical_payload
                layers_held=proof.layers_held,
                has_embedding=proof.has_embedding,
                has_lm_head=proof.has_lm_head
            )

            # Get the wallet_id (first 16 chars of node_id)
            wallet_id = proof.node_id[:16]

            # Store in the DHT (async in the background so we don't block)
            threading.Thread(
                target=self._dht_proof_store.store_proof_in_dht,
                args=(wallet_id, proof_record),
                daemon=True
            ).start()

        except Exception as e:
            logger.debug(f"DHT proof storage error (non-fatal): {e}")

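    # Keying sketch for the record above: proofs are grouped per wallet via
    #   wallet_id = proof.node_id[:16]   # first 16 hex chars of the ECDSA node_id
    # How DHTProofStore maps that wallet_id onto a DHT key is internal to
    # dht_proof_store and is not shown in this file.
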
    def _bootstrap_balance_from_dht(self):
        """
        Bootstrap the balance from the DHT (production-ready trustless system).

        Called on startup to sync historical earnings when running the same
        wallet on a new machine.

        SECURITY ARCHITECTURE (production-grade):
        ┌─────────────────────────────────────────────────────────────┐
        │ DHT RETRIEVAL (FULLY TRUSTLESS)                             │
        │ 1. Query the DHT for historical proofs                      │
        │ 2. Verify the ECDSA signature on EACH proof                 │
        │ 3. Cross-validate with 3+ independent DHT nodes             │
        │ 4. Require Byzantine consensus                              │
        │ 5. Credit only cryptographically verified proofs            │
        │ ✅ Fully decentralized                                      │
        │ ✅ Fully trustless                                          │
        │ ✅ Byzantine-resistant                                      │
        │ ✅ Production-ready                                         │
        └─────────────────────────────────────────────────────────────┘

        No fallbacks:
        - If the DHT has no proofs → start from 0 (new wallet)
        - If the network is too small → proofs are stored once >= 3 nodes exist
        - No trusted servers required

        Similar to: Bitcoin SPV, Ethereum light clients.

        Why no API fallback:
        - It would require trusting a central server
        - It defeats the purpose of decentralization
        - It opens security vulnerabilities
        - It is not needed - the local DB persists your own proofs
        """
        if not self.ledger or not self.node_token:
            return

        try:
            # Get the wallet_id (first 16 chars of the ECDSA node_id)
            wallet_id = self.ledger.node_id[:16]

            # If we already have a balance, skip the bootstrap
            current_balance = self.ledger.get_balance()
            if current_balance > 0:
                logger.info(f"Local balance found: {current_balance:.4f} NEURO (skipping bootstrap)")
                return

            # ===================================================================
            # PHASE 1: DHT RETRIEVAL (TRUSTLESS)
            # ===================================================================
            dht_success = False
            if self.dht:
                try:
                    from neuroshard.core.network.dht_proof_store import DHTProofStore

                    # Get the network size for adaptive behavior
                    all_nodes = self.dht.routing_table.get_all_nodes() if self.dht else []
                    network_size = len(all_nodes) + 1  # +1 for self

                    logger.info(f"[DHT BOOTSTRAP] Querying DHT for wallet {wallet_id}... (network: {network_size} nodes)")

                    dht_store = DHTProofStore(self.dht)

                    # Retrieve proofs from the DHT with signature verification
                    verified_proofs, metadata = dht_store.retrieve_proofs_from_dht(
                        wallet_id=wallet_id,
                        max_proofs=100,
                        verify_signatures=True  # 🔒 TRUSTLESS
                    )

                    if verified_proofs:
                        logger.info(f"[DHT BOOTSTRAP] Found {len(verified_proofs)} verified proofs in DHT "
                                    f"(total_reward={metadata.get('total_reward', 0):.6f} NEURO)")

                        # Cross-validate with multiple DHT nodes for Byzantine resistance.
                        # Uses adaptive validation (works even with 2-node networks).
                        consensus, validation_data = dht_store.cross_validate_proofs(
                            wallet_id=wallet_id,
                            desired_validators=3  # Adapts to the actual network size
                        )

                        if consensus:
                            validators_count = validation_data.get('validators_queried', 0)
                            network_size = validation_data.get('network_size', 1)

                            logger.info(f"[DHT BOOTSTRAP] ✅ Cross-validation PASSED "
                                        f"({validators_count} validators, network={network_size} nodes)")

                            # Credit the verified proofs to the local ledger
                            total_credited = 0.0
                            for proof_record in verified_proofs:
                                # Each proof is ECDSA-verified, so it is safe to credit
                                total_credited += proof_record.reward

                            # Update the local ledger with the DHT data
                            with self.ledger.lock:
                                with sqlite3.connect(self.ledger.db_path) as conn:
                                    conn.execute("""
                                        INSERT OR REPLACE INTO balances
                                        (node_id, balance, total_earned, total_spent, proof_count, last_proof_time)
                                        VALUES (?, ?, ?, 0.0, ?, ?)
                                    """, (
                                        self.ledger.node_id,
                                        total_credited,
                                        total_credited,
                                        len(verified_proofs),
                                        time.time()
                                    ))
                                    conn.commit()

                            logger.info(f"[DHT BOOTSTRAP] ✅ Synced from DHT: {total_credited:.6f} NEURO")
                            logger.info(f"[DHT BOOTSTRAP] {len(verified_proofs)} proofs verified via ECDSA signatures")
                            logger.info(f"[DHT BOOTSTRAP] Network: {network_size} nodes, {validators_count} validators confirmed")
                            dht_success = True
                            return  # Success!

                        else:
                            logger.warning("[DHT BOOTSTRAP] ⚠️ Cross-validation FAILED - nodes disagree")
                            logger.warning(f"[DHT BOOTSTRAP] Validation data: {validation_data}")
                            # Fall through to a zero-balance start
                    else:
                        logger.info("[DHT BOOTSTRAP] No proofs found in DHT (new wallet or network still syncing)")
                        # Fall through to a zero-balance start

                except Exception as e:
                    logger.warning(f"[DHT BOOTSTRAP] DHT retrieval failed: {e}")
                    # Fall through to a zero-balance start

            # ===================================================================
            # NO API FALLBACK - PRODUCTION-READY TRUSTLESS SYSTEM
            # ===================================================================
            # If the DHT has no proofs, we start from zero and earn naturally.
            # This is the CORRECT behavior for a decentralized system.
            #
            # Why no API fallback:
            # 1. An API would be a trusted party (defeats the trustless design)
            # 2. It creates a centralization point
            # 3. It opens an attack vector (a malicious API could inflate balances)
            #
            # Edge case handling:
            # - New wallet: balance = 0 (correct)
            # - Existing wallet but empty DHT: proofs are in the local DB and will
            #   propagate to the DHT as we earn. Other machines bootstrap once the
            #   DHT has enough replicas (3+ nodes needed).
            #
            # This is how Bitcoin/Ethereum work - fully decentralized.
            # ===================================================================

            if not dht_success:
                logger.info(f"[BOOTSTRAP] No proofs found in DHT for wallet {wallet_id[:8]}...")
                logger.info("[BOOTSTRAP] Starting with zero balance - will earn via PoNW")
                logger.info("[BOOTSTRAP] Future earnings will be stored in DHT for other machines")
                logger.info("[BALANCE] New wallet - starting from 0 NEURO. Start earning!")

        except Exception as e:
            logger.warning(f"[BOOTSTRAP] Error during DHT bootstrap: {e}")
            logger.info("[BOOTSTRAP] Starting with zero balance - future earnings via P2P")

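    # A minimal consensus sketch. The real check lives in
    # dht_store.cross_validate_proofs; this hypothetical helper only
    # illustrates "a majority of queried validators agree on the total":
    #
    #   from collections import Counter
    #
    #   def majority_agree(validator_totals: List[float]) -> bool:
    #       buckets = Counter(round(t, 6) for t in validator_totals)
    #       _, votes = buckets.most_common(1)[0]
    #       return 2 * votes > len(validator_totals)
    #
    #   majority_agree([1.5, 1.5, 1.5])  # True  (3/3 agree)
    #   majority_agree([1.5, 1.5, 9.9])  # True  (2/3 agree, one Byzantine)
    #   majority_agree([1.5, 3.0, 9.9])  # False (no majority)
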
    def _sync_stakes_loop(self):
        """
        P2P stake gossip loop.

        Periodically broadcasts our stake to peers so they can:
        1. Verify our PoNW claims have correct multipliers
        2. Maintain a network-wide view of stakes
        """
        while self.running:
            # Interruptible sleep - wakes up immediately on stop()
            if self._stop_event.wait(timeout=300):
                break  # Stop event was set

            if not self.ledger:
                continue

            try:
                # Get our current stake
                account_info = self.ledger.get_account_info()
                stake = account_info.get("stake", 0.0)
                stake_locked_until = account_info.get("stake_locked_until", 0.0)

                if stake <= 0:
                    continue  # Nothing to gossip

                # Gossip our stake to peers
                peers = list(self.known_peers.keys())
                if self.routing_table:
                    for n in self.routing_table.get_all_nodes():
                        peers.append(f"http://{n.ip}:{n.port}")

                if not peers:
                    continue

                # DYNAMIC FANOUT: scale with network size.
                # Formula: 2*sqrt(N) + 3, capped at 50 for stake gossip.
                # Stakes are important for security - they need higher coverage.
                fanout = min(int(2 * math.sqrt(len(peers)) + 3), 50)
                targets = random.sample(peers, min(len(peers), fanout))
                logger.info(f"Stake gossip: Broadcasting {stake:.2f} NEURO to {len(targets)} peers")

                for target in targets:
                    threading.Thread(
                        target=self._send_stake_to_peer,
                        args=(target, stake, stake_locked_until),
                        daemon=True
                    ).start()

            except Exception as e:
                logger.error(f"Stake gossip error: {e}")

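    # Worked numbers for the stake-gossip fanout above, min(2*sqrt(N) + 3, 50):
    #   N = 4     -> 7
    #   N = 100   -> 23
    #   N = 553   -> 50  (cap reached: 2*sqrt(553) + 3 ≈ 50.0)
    #   N = 10000 -> 50  (capped)
    # Coverage grows sub-linearly, so traffic stays bounded in large networks,
    # while networks smaller than the fanout are contacted in full
    # (random.sample takes min(len(peers), fanout)).
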
    def _send_stake_to_peer(self, target_url: str, amount: float, locked_until: float):
        """Send a stake update to a peer via gRPC."""
        from protos import neuroshard_pb2
        from protos import neuroshard_pb2_grpc
        from neuroshard.core.network.connection_pool import get_channel

        try:
            parsed = urlparse(target_url)
            ip = parsed.hostname
            port = (parsed.port or 80) + 1000  # gRPC port

            channel = get_channel(f"{ip}:{port}")
            stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

            # Create the stake gossip request using the ECDSA node_id
            ledger_node_id = self.ledger.node_id
            payload = f"{ledger_node_id}:{amount}:{locked_until}"

            # SECURITY: include the public key so peers can verify our
            # signature without prior knowledge of this node.
            public_key_hex = self.ledger.crypto.get_public_key_hex()

            req = neuroshard_pb2.GossipStakeRequest(
                node_id=ledger_node_id,
                amount=amount,
                locked_until=locked_until,
                timestamp=time.time(),
                signature=self.ledger._sign(payload),
                public_key=public_key_hex  # Required for verification
            )

            stub.GossipStake(req, timeout=3.0)
            logger.debug(f"Stake gossip sent to {ip}:{port}")
        except Exception as e:
            logger.debug(f"Stake gossip to {target_url} failed: {e}")

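    # Port convention used by every gRPC call in this module:
    #   grpc_port = (parsed.port or 80) + 1000
    # e.g. a peer announced as http://198.51.100.7:8000 (hypothetical address)
    # is dialed at 198.51.100.7:9000 for GossipStake/GossipProof/GossipTransaction.
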
    def _announce_loop(self):
        # Immediate announce on startup (verbose the first time)
        self._announce_once(verbose=True)
        while self.running:
            # Re-announce every 60 seconds (DHT entries have a ~5 min TTL).
            # This is frequent enough for peer discovery without being spammy.
            if self._stop_event.wait(timeout=60):
                break
            self._announce_once(verbose=False)  # Silent re-announce

    def broadcast_transaction(self, recipient_id: str, amount: float, signature: str, tx_hash: str):
        """Broadcast a transaction to the P2P network."""
        threading.Thread(target=self._gossip_transaction, args=(recipient_id, amount, signature, tx_hash), daemon=True).start()

    def _gossip_transaction(self, recipient_id: str, amount: float, signature: str, tx_hash: str):
        """Gossip a transaction to peers."""
        from protos import neuroshard_pb2
        from protos import neuroshard_pb2_grpc
        from neuroshard.core.network.connection_pool import get_channel

        # 1. Gather peers
        peers = list(self.known_peers.keys())
        if self.routing_table:
            for n in self.routing_table.get_all_nodes():
                peers.append(f"http://{n.ip}:{n.port}")

        if not peers:
            return

        # 2. Gossip to a random subset (epidemic propagation).
        # DYNAMIC FANOUT: 2*sqrt(N) + 3, capped at 50 for transactions.
        # Transactions need high coverage for consistency.
        fanout = min(int(2 * math.sqrt(len(peers)) + 3), 50)
        targets = random.sample(peers, min(len(peers), fanout))

        req = neuroshard_pb2.GossipTransactionRequest(
            sender_id=str(self.node_id),
            recipient_id=recipient_id,
            amount=amount,
            timestamp=time.time(),
            signature=signature,
            tx_hash=tx_hash
        )

        logger.info(f"Broadcasting TX {tx_hash[:8]} to {len(targets)} peers...")

        for target_url in targets:
            try:
                parsed = urlparse(target_url)
                ip = parsed.hostname
                port = (parsed.port or 80) + 1000

                channel = get_channel(f"{ip}:{port}")
                stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

                stub.GossipTransaction(req, timeout=3.0)
            except Exception:
                pass  # Gossip is best-effort

    def _gossip_loop(self):
        """Periodically create a Proof of Neural Work and gossip it to peers."""
        try:
            from protos import neuroshard_pb2
            from protos import neuroshard_pb2_grpc
            from neuroshard.core.network.connection_pool import get_channel
        except Exception as e:
            logger.error(f"[PoNW] Failed to import protos: {e}")
            return

        logger.info("[PoNW] Gossip loop started (will generate first proof in 60s)")

        while self.running:
            # Interruptible sleep - wakes up immediately on stop()
            if self._stop_event.wait(timeout=60):
                break
            if not self.ledger:
                logger.info("[NODE] PoNW: No ledger available, skipping proof generation")
                continue

            try:
                # Get metrics from state
                tokens_processed = self.state_ref.get("token_count", 0)
                training_batches = self.state_ref.get("training_batches", 0)

                # Get pending inference request IDs (only paid inference earns rewards)
                pending_request_ids = self.state_ref.get("pending_inference_requests", [])

                # Reset the counters after the snapshot
                self.state_ref["token_count"] = 0
                self.state_ref["training_batches"] = 0
                self.state_ref["pending_inference_requests"] = []

                # Determine the proof type based on activity.
                # IMPORTANT: inference proofs REQUIRE a request_id (paid request).
                # Tokens processed without a request_id earn no inference rewards.
                if training_batches > 0:
                    proof_type = ProofType.TRAINING
                    # Note: tokens processed during training don't count as inference
                    tokens_processed = 0  # Don't double-count training tokens
                elif tokens_processed > 0 and pending_request_ids:
                    # Only create an inference proof if we have actual paid requests
                    proof_type = ProofType.INFERENCE
                else:
                    # Default to uptime - unpaid inference earns no rewards
                    proof_type = ProofType.UPTIME
                    tokens_processed = 0  # Unpaid tokens don't count

                # Get node info for role multipliers
                layers_held = len(self.state_ref.get("assigned_layers", []))
                has_embedding = self.state_ref.get("has_embedding", False)
                has_lm_head = self.state_ref.get("has_lm_head", False)
                model_hash = self.state_ref.get("model_hash", "")
                current_loss = self.state_ref.get("current_loss", None)

                # Sanitize the loss (must be a valid float for storage)
                if current_loss is not None:
                    if math.isinf(current_loss) or math.isnan(current_loss):
                        current_loss = None

                # Create the PoNW proof using the new NEUROLedger API
                proof = self.ledger.create_proof(
                    proof_type=proof_type,
                    uptime_seconds=60,
                    tokens_processed=tokens_processed,
                    training_batches=training_batches,
                    layers_held=layers_held,
                    has_embedding=has_embedding,
                    has_lm_head=has_lm_head,
                    model_hash=model_hash,
                    current_loss=current_loss
                )

                # Process the proof locally (credit ourselves)
                success, reward, msg = self.ledger.process_proof(proof)

                if success:
                    if proof_type == ProofType.TRAINING:
                        logger.info(f"[NODE] Earned {reward:.6f} NEURO (training, {training_batches} batches in last 60s)")
                    elif proof_type == ProofType.INFERENCE:
                        logger.info(f"[NODE] Earned {reward:.6f} NEURO (inference, {tokens_processed} tokens in last 60s)")
                    else:
                        logger.info(f"[NODE] Earned {reward:.6f} NEURO (uptime, 60s)")

                    # 🔥 NEW: Store the proof in the DHT for decentralized balance sync
                    self._store_proof_in_dht(proof, reward)
                else:
                    logger.info(f"[NODE] ❌ PoNW rejected: {msg}")

                # Gossip to random peers
                peers = list(self.known_peers.keys())
                if self.routing_table:
                    for n in self.routing_table.get_all_nodes():
                        peers.append(f"http://{n.ip}:{n.port}")

                if not peers:
                    logger.info("PoNW: Solo mining (no peers to gossip)")
                else:
                    # DYNAMIC FANOUT: sqrt(N) + 3, capped at 30 for PoNW proofs.
                    # Proofs need reasonable coverage for DHT consistency.
                    fanout = min(int(math.sqrt(len(peers)) + 3), 30)
                    targets = random.sample(peers, min(len(peers), fanout))
                    logger.info(f"PoNW: Gossiping to {len(targets)} peers")

                    for target in targets:
                        # daemon=True so in-flight gossip never blocks shutdown
                        threading.Thread(target=self._send_proof_to_peer, args=(target, proof), daemon=True).start()

            except Exception as e:
                logger.error(f"PoNW gossip error: {e}")

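    # The proof-type decision inside the loop above, summarized:
    #   training_batches > 0                     -> TRAINING  (tokens zeroed)
    #   tokens > 0 and pending paid request_ids  -> INFERENCE
    #   anything else                            -> UPTIME    (tokens zeroed)
    # Exactly one proof type is claimed per 60 s window, so the same work is
    # never double-counted across reward categories.
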
    def _send_proof_to_peer(self, target_url: str, proof: 'PoNWProof'):
        """Send a PoNW proof to a peer via gRPC."""
        from protos import neuroshard_pb2
        from protos import neuroshard_pb2_grpc
        from neuroshard.core.network.connection_pool import get_channel

        try:
            parsed = urlparse(target_url)
            ip = parsed.hostname
            # gRPC port = HTTP port + 1000
            port = (parsed.port or 80) + 1000

            channel = get_channel(f"{ip}:{port}")
            stub = neuroshard_pb2_grpc.NeuroShardServiceStub(channel)

            # Send the FULL proof data for proper verification.
            # CRITICAL: include the public key for trustless verification.
            # CRITICAL: include data_samples, model_hash, and request_id so the
            # canonical_payload matches on the receiving side.
            req = neuroshard_pb2.GossipProofRequest(
                node_id=proof.node_id,
                timestamp=proof.timestamp,
                uptime=proof.uptime_seconds,
                signature=proof.signature,
                token_count=proof.tokens_processed,
                training_batches=proof.training_batches,
                layers_held=proof.layers_held,
                has_embedding=proof.has_embedding,
                has_lm_head=proof.has_lm_head,
                proof_type=proof.proof_type,
                nonce=proof.nonce,
                public_key=self.ledger.crypto.get_public_key_hex(),
                data_samples=proof.data_samples,
                model_hash=proof.model_hash,
                request_id=proof.request_id or "",
                current_loss=proof.current_loss if proof.current_loss is not None else 0.0
            )

            stub.GossipProof(req, timeout=3.0)
        except Exception:
            pass  # Gossip is best-effort

    def _sync_with_new_peer(self, peer_url: str):
        """
        Sync state with a newly discovered peer.

        IMPORTANT: historical proofs CANNOT be replayed because of the
        PROOF_FRESHNESS_WINDOW (5 minutes). This is BY DESIGN - it prevents
        nodes from fabricating work while running solo.

        How balance sync works (like Bitcoin):
        ┌─────────────────────────────────────────────────────────────────┐
        │ LOCAL BALANCE   = All proofs I generated (witnessed by me)      │
        │ NETWORK BALANCE = Proofs gossiped within 5 min (witnessed by N) │
        │                                                                 │
        │ If you run SOLO, only the LOCAL balance increases.              │
        │ The NETWORK balance only grows when peers witness your work.    │
        │                                                                 │
        │ This is the SECURITY MODEL: no free NEURO from fabricated work! │
        └─────────────────────────────────────────────────────────────────┘

        What we DO sync:
        1. Current peer list (for gossip)
        2. DHT routing table (for lookups)
        3. Training state (for DiLoCo)

        What we DON'T sync (by design):
        - Historical proofs older than 5 minutes (prevents fraud)
        """
        # Log the connection for transparency
        logger.info(f"[SYNC] Connected to new peer: {peer_url}")

        # NOTE: historical proof replay was removed because:
        # 1. PROOF_FRESHNESS_WINDOW = 300s (5 min) - old proofs are rejected
        # 2. This is correct security behavior (like Bitcoin confirmations)
        # 3. Solo-earned NEURO is LOCAL only - it needs witnesses to be NETWORK-confirmed

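    # Freshness sketch (assumed semantics of PROOF_FRESHNESS_WINDOW; the
    # actual check lives in the ledger's proof processing, not here):
    #   PROOF_FRESHNESS_WINDOW = 300.0  # seconds
    #   is_fresh = (time.time() - proof.timestamp) <= PROOF_FRESHNESS_WINDOW
    # A proof replayed to a new peer more than five minutes after creation
    # fails this check, which is why solo history cannot be back-filled.
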
    def _announce_once(self, verbose: bool = True):
        # 1. DHT announce (primary).
        # Announces all layers we hold so peers can find us for pipeline routing.
        if self.dht:
            try:
                num_layers = self.end_layer - self.start_layer + 1
                success_count = 0

                # Announce ALL layers we hold so peers can find us for any layer.
                # This is critical for distributed-training pipeline routing!
                for layer_id in range(self.start_layer, self.end_layer + 1):
                    try:
                        self.dht.announce(f"layer_{layer_id}")
                        success_count += 1
                    except Exception:
                        pass

                # Log a summary (only on the first announce or if verbose)
                if verbose and num_layers > 0:
                    logger.info(f"DHT Announce: {success_count}/{num_layers} layers announced (layers {self.start_layer}-{self.end_layer})")

                # Also announce checkpoint info for distributed-training sync
                if hasattr(self, 'neuro_node') and self.neuro_node:
                    checkpoint_info = self.neuro_node.get_checkpoint_info()
                    self.dht.announce(f"checkpoint_v{checkpoint_info['version']}")
            except Exception as e:
                logger.debug(f"DHT Announce error: {e}")

        # 2. Legacy tracker announce (fallback)
        try:
            parsed = urlparse(self.my_url)
            ip = parsed.hostname
            port = parsed.port or (443 if parsed.scheme == 'https' else 80)

            requests.post(f"{self.tracker_url}/announce", json={
                "ip": ip,
                "port": port,
                "shard_range": self.shard_range,
                "tps": self.current_tps,
                "latency": self.current_latency,
                "node_token": self.node_token
            }, timeout=2)

            # Fetch peers for bootstrap.
            # Only done if the peer list is empty or low.
            if not self.known_peers or len(self.known_peers) < 5:
                # First, get peers with a matching shard range (for inference routing)
                resp = requests.get(f"{self.tracker_url}/peers", params={"shard_range": self.shard_range}, timeout=2)
                if resp.status_code == 200:
                    new_peers = resp.json()
                    for p in new_peers:
                        if p["url"] != self.my_url:
                            self.known_peers[p["url"]] = p

                # Also fetch ALL peers for gossip (ledger sync needs all nodes, not just matching shards)
                resp_all = requests.get(f"{self.tracker_url}/peers", params={"limit": 100}, timeout=2)
                if resp_all.status_code == 200:
                    all_peers = resp_all.json()
                    for p in all_peers:
                        if p["url"] != self.my_url and p["url"] not in self.known_peers:
                            self.known_peers[p["url"]] = p
                            # Bootstrap the DHT
                            if self.routing_table:
                                try:
                                    p_parsed = urlparse(p["url"])
                                    p_ip = p_parsed.hostname
                                    p_port = p_parsed.port or 80
                                    # Deterministic ID for stability in dev
                                    p_id = int(hashlib.sha1(f"{p['url']}".encode()).hexdigest(), 16)
                                    self.routing_table.add_contact(Node(p_id, p_ip, p_port))
                                except Exception:
                                    pass

                            # Log the new peer connection (proof replay removed - see _sync_with_new_peer)
                            self._sync_with_new_peer(p["url"])
        except Exception:
            pass

    def get_next_hop(self, current_end_layer: int, session_id: Optional[str] = None) -> Optional[str]:
        """Find a peer that starts where we end."""
        candidates = []

        # Strategy 1: DHT lookup (scalable)
        if self.dht:
            key_str = f"layer_{current_end_layer}"
            key = int(hashlib.sha1(key_str.encode()).hexdigest(), 16)

            # Use an iterative lookup
            val = self.dht.lookup_value(key)
            if val:
                try:
                    # Try parsing as a list of peers
                    dht_candidates = json.loads(val)
                    if isinstance(dht_candidates, list):
                        for c in dht_candidates:
                            # The DHT stores "ip:port"; we need a full URL
                            if not c.startswith("http"):
                                candidates.append(f"http://{c}")
                            else:
                                candidates.append(c)
                    else:
                        # Legacy single value
                        if not isinstance(dht_candidates, str):
                            dht_candidates = str(dht_candidates)
                        if not dht_candidates.startswith("http"):
                            candidates.append(f"http://{dht_candidates}")
                        else:
                            candidates.append(dht_candidates)
                except Exception:
                    # Simple string fallback
                    if not val.startswith("http"):
                        candidates.append(f"http://{val}")
                    else:
                        candidates.append(val)

        # Strategy 2: local cache (fallback).
        # Check whether the target layer is WITHIN the peer's range (not just at its start).
        for url, info in self.known_peers.items():
            try:
                r = info.get("shard_range", "0-0")
                start, end = map(int, r.split("-"))
                # The peer can handle the layer if it falls within their range
                if start <= current_end_layer <= end:
                    candidates.append(url)
            except Exception:
                continue

        if not candidates:
            return None

        if session_id:
            # Sticky routing
            candidates.sort()
            hash_val = int(hashlib.sha256(session_id.encode()).hexdigest(), 16)
            return candidates[hash_val % len(candidates)]

        return random.choice(candidates)

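    # Sticky-routing sketch: with the candidate list sorted, every node that
    # computes the route maps a given session to the same peer (the session ID
    # below is hypothetical):
    #   candidates = ["http://a:80", "http://b:80", "http://c:80"]
    #   h = int(hashlib.sha256("session-42".encode()).hexdigest(), 16)
    #   next_hop = candidates[h % len(candidates)]  # deterministic choice
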
    def get_redundant_hop(self, current_end_layer: int, primary_hop: str) -> Optional[str]:
        candidates = []
        for url, info in self.known_peers.items():
            try:
                r = info.get("shard_range", "0-0")
                start, end = map(int, r.split("-"))
                # The peer can handle the layer if it falls within their range
                if start <= current_end_layer <= end and url != primary_hop:
                    candidates.append(url)
            except Exception:
                continue

        if not candidates:
            return None
        return random.choice(candidates)

    def get_sync_peers(self) -> List[str]:
        candidates = []
        for url, info in self.known_peers.items():
            if info.get("shard_range") == self.shard_range:
                candidates.append(url)
        return candidates

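# Minimal construction sketch (all values hypothetical; in practice the runner
# instantiates this class and injects state_ref and the NeuroNode reference):
#   mgr = P2PManager(
#       my_url="http://203.0.113.5:8000",
#       shard_range="0-3",
#       tracker_url="http://tracker.example:3000",
#       node_token="wallet-token",
#   )
#   mgr.update_metrics(tps=12.5, latency=0.08)
#   hop = mgr.get_next_hop(current_end_layer=4, session_id="session-42")
#   mgr.stop()  # sets the stop event; background loops exit promptly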