nexaroa-0.0.111-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. neuroshard/__init__.py +93 -0
  2. neuroshard/__main__.py +4 -0
  3. neuroshard/cli.py +466 -0
  4. neuroshard/core/__init__.py +92 -0
  5. neuroshard/core/consensus/verifier.py +252 -0
  6. neuroshard/core/crypto/__init__.py +20 -0
  7. neuroshard/core/crypto/ecdsa.py +392 -0
  8. neuroshard/core/economics/__init__.py +52 -0
  9. neuroshard/core/economics/constants.py +387 -0
  10. neuroshard/core/economics/ledger.py +2111 -0
  11. neuroshard/core/economics/market.py +975 -0
  12. neuroshard/core/economics/wallet.py +168 -0
  13. neuroshard/core/governance/__init__.py +74 -0
  14. neuroshard/core/governance/proposal.py +561 -0
  15. neuroshard/core/governance/registry.py +545 -0
  16. neuroshard/core/governance/versioning.py +332 -0
  17. neuroshard/core/governance/voting.py +453 -0
  18. neuroshard/core/model/__init__.py +30 -0
  19. neuroshard/core/model/dynamic.py +4186 -0
  20. neuroshard/core/model/llm.py +905 -0
  21. neuroshard/core/model/registry.py +164 -0
  22. neuroshard/core/model/scaler.py +387 -0
  23. neuroshard/core/model/tokenizer.py +568 -0
  24. neuroshard/core/network/__init__.py +56 -0
  25. neuroshard/core/network/connection_pool.py +72 -0
  26. neuroshard/core/network/dht.py +130 -0
  27. neuroshard/core/network/dht_plan.py +55 -0
  28. neuroshard/core/network/dht_proof_store.py +516 -0
  29. neuroshard/core/network/dht_protocol.py +261 -0
  30. neuroshard/core/network/dht_service.py +506 -0
  31. neuroshard/core/network/encrypted_channel.py +141 -0
  32. neuroshard/core/network/nat.py +201 -0
  33. neuroshard/core/network/nat_traversal.py +695 -0
  34. neuroshard/core/network/p2p.py +929 -0
  35. neuroshard/core/network/p2p_data.py +150 -0
  36. neuroshard/core/swarm/__init__.py +106 -0
  37. neuroshard/core/swarm/aggregation.py +729 -0
  38. neuroshard/core/swarm/buffers.py +643 -0
  39. neuroshard/core/swarm/checkpoint.py +709 -0
  40. neuroshard/core/swarm/compute.py +624 -0
  41. neuroshard/core/swarm/diloco.py +844 -0
  42. neuroshard/core/swarm/factory.py +1288 -0
  43. neuroshard/core/swarm/heartbeat.py +669 -0
  44. neuroshard/core/swarm/logger.py +487 -0
  45. neuroshard/core/swarm/router.py +658 -0
  46. neuroshard/core/swarm/service.py +640 -0
  47. neuroshard/core/training/__init__.py +29 -0
  48. neuroshard/core/training/checkpoint.py +600 -0
  49. neuroshard/core/training/distributed.py +1602 -0
  50. neuroshard/core/training/global_tracker.py +617 -0
  51. neuroshard/core/training/production.py +276 -0
  52. neuroshard/governance_cli.py +729 -0
  53. neuroshard/grpc_server.py +895 -0
  54. neuroshard/runner.py +3223 -0
  55. neuroshard/sdk/__init__.py +92 -0
  56. neuroshard/sdk/client.py +990 -0
  57. neuroshard/sdk/errors.py +101 -0
  58. neuroshard/sdk/types.py +282 -0
  59. neuroshard/tracker/__init__.py +0 -0
  60. neuroshard/tracker/server.py +864 -0
  61. neuroshard/ui/__init__.py +0 -0
  62. neuroshard/ui/app.py +102 -0
  63. neuroshard/ui/templates/index.html +1052 -0
  64. neuroshard/utils/__init__.py +0 -0
  65. neuroshard/utils/autostart.py +81 -0
  66. neuroshard/utils/hardware.py +121 -0
  67. neuroshard/utils/serialization.py +90 -0
  68. neuroshard/version.py +1 -0
  69. nexaroa-0.0.111.dist-info/METADATA +283 -0
  70. nexaroa-0.0.111.dist-info/RECORD +78 -0
  71. nexaroa-0.0.111.dist-info/WHEEL +5 -0
  72. nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
  73. nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
  74. nexaroa-0.0.111.dist-info/top_level.txt +2 -0
  75. protos/__init__.py +0 -0
  76. protos/neuroshard.proto +651 -0
  77. protos/neuroshard_pb2.py +160 -0
  78. protos/neuroshard_pb2_grpc.py +1298 -0
neuroshard/tracker/server.py
@@ -0,0 +1,864 @@
+
+ from fastapi import FastAPI, HTTPException, Body, Request, Query
+ from pydantic import BaseModel
+ from typing import Dict, List, Optional
+ import time
+ import uvicorn
+ import os
+ import asyncio
+ import logging
+
+ # Database Abstraction
+ class DatabaseManager:
+     def __init__(self):
+         self.pool = None
+         self.db_url = os.getenv("DATABASE_URL", "sqlite://tracker.db")
+         self.is_postgres = self.db_url.startswith("postgres")
+         self._sqlite_path = self.db_url.replace("sqlite://", "")
+
+     async def connect(self):
+         if self.is_postgres:
+             import asyncpg
+             # Wait for DB to be ready
+             for i in range(5):
+                 try:
+                     self.pool = await asyncpg.create_pool(self.db_url)
+                     break
+                 except Exception as e:
+                     print(f"Waiting for DB... {e}")
+                     await asyncio.sleep(2)
+             if not self.pool:
+                 raise Exception("Could not connect to Postgres")
+
+             await self.init_postgres()
+         else:
+             self.init_sqlite()
+
+     async def close(self):
+         if self.pool:
+             await self.pool.close()
+
+     def init_sqlite(self):
+         import sqlite3
+         with sqlite3.connect(self._sqlite_path) as conn:
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS peers (
+                     url TEXT PRIMARY KEY,
+                     ip TEXT,
+                     port INTEGER,
+                     shard_range TEXT,
+                     shard_start INTEGER,
+                     shard_end INTEGER,
+                     is_entry BOOLEAN,
+                     is_exit BOOLEAN,
+                     last_seen REAL,
+                     tps REAL,
+                     latency REAL,
+                     node_token TEXT
+                 )
+             """)
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS stakes (
+                     url TEXT PRIMARY KEY,
+                     amount REAL,
+                     slashed BOOLEAN DEFAULT 0
+                 )
+             """)
+             # Phase 4: Tensor shard tracking
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS tensor_shards (
+                     id TEXT PRIMARY KEY,
+                     model_id TEXT,
+                     layer_id INTEGER,
+                     shard_id INTEGER,
+                     total_shards INTEGER,
+                     node_url TEXT,
+                     grpc_addr TEXT,
+                     available_memory_mb REAL,
+                     current_load REAL,
+                     last_seen REAL,
+                     node_token TEXT
+                 )
+             """)
+             # Phase 4: Model registry
+             conn.execute("""
+                 CREATE TABLE IF NOT EXISTS models (
+                     model_id TEXT PRIMARY KEY,
+                     name TEXT,
+                     family TEXT,
+                     num_layers INTEGER,
+                     hidden_dim INTEGER,
+                     total_size_mb REAL,
+                     required_stake REAL,
+                     approved BOOLEAN DEFAULT 1,
+                     proposer_token TEXT,
+                     created_at REAL
+                 )
+             """)
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_start ON peers(shard_start)")
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_range ON peers(shard_range)")
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_last_seen ON peers(last_seen)")
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_tensor_model_layer ON tensor_shards(model_id, layer_id)")
+             conn.execute("CREATE INDEX IF NOT EXISTS idx_tensor_last_seen ON tensor_shards(last_seen)")
+
+     async def init_postgres(self):
+         async with self.pool.acquire() as conn:
+             await conn.execute("""
+                 CREATE TABLE IF NOT EXISTS peers (
+                     url TEXT PRIMARY KEY,
+                     ip TEXT,
+                     port INTEGER,
+                     shard_range TEXT,
+                     shard_start INTEGER,
+                     shard_end INTEGER,
+                     is_entry BOOLEAN,
+                     is_exit BOOLEAN,
+                     last_seen DOUBLE PRECISION,
+                     tps DOUBLE PRECISION,
+                     latency DOUBLE PRECISION,
+                     node_token TEXT
+                 )
+             """)
+             await conn.execute("""
+                 CREATE TABLE IF NOT EXISTS stakes (
+                     url TEXT PRIMARY KEY,
+                     amount DOUBLE PRECISION,
+                     slashed BOOLEAN DEFAULT FALSE
+                 )
+             """)
+             await conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_start ON peers(shard_start)")
+             await conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_range ON peers(shard_range)")
+             await conn.execute("CREATE INDEX IF NOT EXISTS idx_last_seen ON peers(last_seen)")
+
+     async def upsert_peer(self, url, ip, port, shard_range, start, end, is_entry, is_exit, now, tps, latency, node_token):
+         if self.is_postgres:
+             query = """
+                 INSERT INTO peers
+                 (url, ip, port, shard_range, shard_start, shard_end, is_entry, is_exit, last_seen, tps, latency, node_token)
+                 VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
+                 ON CONFLICT (url) DO UPDATE SET
+                     ip = EXCLUDED.ip,
+                     port = EXCLUDED.port,
+                     shard_range = EXCLUDED.shard_range,
+                     shard_start = EXCLUDED.shard_start,
+                     shard_end = EXCLUDED.shard_end,
+                     is_entry = EXCLUDED.is_entry,
+                     is_exit = EXCLUDED.is_exit,
+                     last_seen = EXCLUDED.last_seen,
+                     tps = EXCLUDED.tps,
+                     latency = EXCLUDED.latency,
+                     node_token = EXCLUDED.node_token
+             """
+             await self.pool.execute(query, url, ip, port, shard_range, start, end, is_entry, is_exit, now, tps, latency, node_token)
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 conn.execute("""
+                     INSERT OR REPLACE INTO peers
+                     (url, ip, port, shard_range, shard_start, shard_end, is_entry, is_exit, last_seen, tps, latency, node_token)
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                 """, (url, ip, port, shard_range, start, end, is_entry, is_exit, now, tps, latency, node_token))
+
+     async def get_slashed_status(self, url):
+         if self.is_postgres:
+             row = await self.pool.fetchrow("SELECT slashed FROM stakes WHERE url = $1", url)
+             return row['slashed'] if row else False
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 cursor = conn.execute("SELECT slashed FROM stakes WHERE url = ?", (url,))
+                 row = cursor.fetchone()
+                 return row[0] if row else False
+
+     async def grant_initial_stake(self, url):
+         if self.is_postgres:
+             await self.pool.execute("INSERT INTO stakes (url, amount) VALUES ($1, 1000.0) ON CONFLICT DO NOTHING", url)
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 conn.execute("INSERT OR IGNORE INTO stakes (url, amount) VALUES (?, 1000.0)", (url,))
+
+     async def get_stats(self, now):
+         if self.is_postgres:
+             count = await self.pool.fetchval("SELECT COUNT(*) FROM peers WHERE last_seen > $1", now - 60)
+             row = await self.pool.fetchrow("SELECT SUM(tps) as tps, AVG(latency) as lat FROM peers WHERE last_seen > $1", now - 60)
+             return count, row['tps'] or 0, row['lat'] or 0
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 count = conn.execute("SELECT COUNT(*) FROM peers WHERE last_seen > ?", (now - 60,)).fetchone()[0]
+                 stats = conn.execute("SELECT SUM(tps), AVG(latency) FROM peers WHERE last_seen > ?", (now - 60,)).fetchone()
+                 return count, stats[0] or 0, stats[1] or 0
+
+     async def get_stake(self, url):
+         if self.is_postgres:
+             return await self.pool.fetchval("SELECT amount FROM stakes WHERE url = $1", url)
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 res = conn.execute("SELECT amount FROM stakes WHERE url = ?", (url,)).fetchone()
+                 return res[0] if res else 0.0
+
+     async def get_all_stakes(self):
+         if self.is_postgres:
+             rows = await self.pool.fetch("SELECT url, amount, slashed FROM stakes")
+             return [dict(r) for r in rows]
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 cursor = conn.execute("SELECT url, amount, slashed FROM stakes")
+                 return [{"url": r[0], "amount": r[1], "slashed": bool(r[2])} for r in cursor]
+
+     async def check_active(self, token, now):
+         if self.is_postgres:
+             return await self.pool.fetchval("SELECT 1 FROM peers WHERE node_token = $1 AND last_seen > $2", token, now - 60)
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 return conn.execute("SELECT 1 FROM peers WHERE node_token = ? AND last_seen > ?", (token, now - 60)).fetchone()
+
+     async def get_peers(self, layer_needed, shard_range, limit, now):
+         if self.is_postgres:
+             query = "SELECT url, shard_range, last_seen, tps, latency, node_token FROM peers WHERE last_seen > $1"
+             params = [now - 60]
+
+             idx = 2
+             if layer_needed is not None:
+                 query += f" AND shard_start <= ${idx} AND shard_end > ${idx+1}"
+                 params.extend([layer_needed, layer_needed])
+                 idx += 2
+
+             if shard_range is not None:
+                 query += f" AND shard_range = ${idx}"
+                 params.append(shard_range)
+                 idx += 1
+
+             query += f" ORDER BY RANDOM() LIMIT ${idx}"
+             params.append(limit)
+
+             rows = await self.pool.fetch(query, *params)
+             return [dict(row) for row in rows]
+         else:
+             import sqlite3
+             query = "SELECT url, shard_range, last_seen, tps, latency, node_token FROM peers WHERE last_seen > ?"
+             params = [now - 60]
+
+             if layer_needed is not None:
+                 query += " AND shard_start <= ? AND shard_end > ?"
+                 params.extend([layer_needed, layer_needed])
+
+             if shard_range is not None:
+                 query += " AND shard_range = ?"
+                 params.append(shard_range)
+
+             query += " ORDER BY RANDOM() LIMIT ?"
+             params.append(limit)
+
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 cursor = conn.execute(query, params)
+                 return [
+                     {"url": r[0], "shard_range": r[1], "last_seen": r[2], "tps": r[3], "latency": r[4], "node_token": r[5]}
+                     for r in cursor
+                 ]
+
+     async def get_active_tokens(self, limit, offset, now):
+         if self.is_postgres:
+             tokens = await self.pool.fetch(
+                 "SELECT node_token FROM peers WHERE last_seen > $1 AND node_token IS NOT NULL ORDER BY last_seen DESC LIMIT $2 OFFSET $3",
+                 now - 60, limit, offset
+             )
+             total = await self.pool.fetchval(
+                 "SELECT COUNT(*) FROM peers WHERE last_seen > $1 AND node_token IS NOT NULL",
+                 now - 60
+             )
+             return [r['node_token'] for r in tokens], total
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 cursor = conn.execute(
+                     "SELECT node_token FROM peers WHERE last_seen > ? AND node_token IS NOT NULL ORDER BY last_seen DESC LIMIT ? OFFSET ?",
+                     (now - 60, limit, offset)
+                 )
+                 tokens = [row[0] for row in cursor.fetchall()]
+                 total = conn.execute(
+                     "SELECT COUNT(*) FROM peers WHERE last_seen > ? AND node_token IS NOT NULL",
+                     (now - 60,)
+                 ).fetchone()[0]
+                 return tokens, total
+
+     async def add_stake(self, url, amount):
+         if self.is_postgres:
+             await self.pool.execute("""
+                 INSERT INTO stakes (url, amount) VALUES ($1, $2)
+                 ON CONFLICT(url) DO UPDATE SET amount = stakes.amount + $3
+             """, url, amount, amount)
+             return await self.pool.fetchval("SELECT amount FROM stakes WHERE url = $1", url)
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 conn.execute("INSERT INTO stakes (url, amount) VALUES (?, ?) ON CONFLICT(url) DO UPDATE SET amount = amount + ?", (url, amount, amount))
+                 return conn.execute("SELECT amount FROM stakes WHERE url = ?", (url,)).fetchone()[0]
+
+     async def slash_node(self, url):
+         if self.is_postgres:
+             async with self.pool.acquire() as conn:
+                 await conn.execute("UPDATE stakes SET amount = 0, slashed = TRUE WHERE url = $1", url)
+                 await conn.execute("DELETE FROM peers WHERE url = $1", url)
+         else:
+             import sqlite3
+             with sqlite3.connect(self._sqlite_path) as conn:
+                 conn.execute("UPDATE stakes SET amount = 0, slashed = 1 WHERE url = ?", (url,))
+                 conn.execute("DELETE FROM peers WHERE url = ?", (url,))
+
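The class above is a thin two-backend abstraction: any postgres:// DATABASE_URL goes through an asyncpg pool with $n placeholders, anything else falls back to a per-call SQLite connection. A minimal sketch of driving it outside FastAPI, assuming it runs in the same module as the class (peer values are illustrative):

    import asyncio, os, time

    os.environ.setdefault("DATABASE_URL", "sqlite://tracker.db")  # SQLite fallback

    async def demo():
        mgr = DatabaseManager()
        await mgr.connect()  # creates tables and indexes on first run
        now = time.time()
        # register a hypothetical peer serving layers 0-4 (end stored exclusive)
        await mgr.upsert_peer("http://10.0.0.5:8000", "10.0.0.5", 8000, "0-4",
                              0, 5, True, False, now, 12.5, 80.0, "demo-token")
        count, tps, lat = await mgr.get_stats(now)
        print(count, tps, lat)
        await mgr.close()

    asyncio.run(demo())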
+ app = FastAPI(title="NeuroShard Tracker")
+ db = DatabaseManager()
+
+ @app.on_event("startup")
+ async def startup():
+     await db.connect()
+
+ @app.on_event("shutdown")
+ async def shutdown():
+     await db.close()
+
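The hooks use @app.on_event, which newer FastAPI releases deprecate in favor of a lifespan context manager. An equivalent sketch, were the module ported (this is not what the wheel ships):

    from contextlib import asynccontextmanager
    from fastapi import FastAPI

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        await db.connect()   # runs at startup
        yield
        await db.close()     # runs at shutdown

    # app = FastAPI(title="NeuroShard Tracker", lifespan=lifespan)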
+ class NodeAnnouncement(BaseModel):
+     ip: str
+     port: int
+     shard_range: str  # e.g. "0-4"
+     is_entry: bool = False
+     is_exit: bool = False
+     tps: float = 0.0
+     latency: float = 0.0
+     node_token: Optional[str] = None
+
+ class PeerInfo(BaseModel):
+     url: str
+     shard_range: str
+     last_seen: float
+     tps: float
+     latency: float
+     node_token: Optional[str] = None
+
+ @app.post("/announce")
+ async def announce(node: NodeAnnouncement, request: Request):
+     client_ip = node.ip
+     url = f"http://{client_ip}:{node.port}"
+
+     # Parse shard range - supports both "0-11" format and "dynamic-57-layers" format
+     try:
+         if node.shard_range.startswith("dynamic-"):
+             # Dynamic mode: "dynamic-57-layers" means node has layers 0 to 56 (57 layers total)
+             # Parse the number of layers
+             parts = node.shard_range.split("-")
+             if len(parts) >= 2:
+                 num_layers = int(parts[1])
+                 start = 0  # Dynamic nodes always start at layer 0 (embedding)
+                 end = num_layers  # End is exclusive, so 57 layers = 0-56
+             else:
+                 start, end = 0, 1
+         else:
+             # Traditional format: "0-11" means layers 0 to 11
+             start, end = map(int, node.shard_range.split("-"))
+             end = end + 1  # Make end exclusive for consistent querying
+     except Exception as e:
+         logging.warning(f"Failed to parse shard_range '{node.shard_range}': {e}")
+         start, end = 0, 1  # Default to at least layer 0
+
+     now = time.time()
+
+     # Check if slashed
+     if await db.get_slashed_status(url):
+         raise HTTPException(status_code=403, detail="Node is banned (slashed).")
+
+     # UPSERT Peer
+     await db.upsert_peer(url, client_ip, node.port, node.shard_range, start, end, node.is_entry, node.is_exit, now, node.tps, node.latency, node.node_token)
+
+     # REMOVED: Grant initial free stake for PoC if new
+     # This was a development feature - nodes should earn or stake their own NEURO
+     # await db.grant_initial_stake(url)
+
+     # Get stats
+     count, tps, latency = await db.get_stats(now)
+     stake = await db.get_stake(url)
+
+     return {"status": "registered", "peer_count": count, "stake": stake}
+
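For illustration (not part of the wheel), a node could register itself against this endpoint as below; the IP, port, and token are hypothetical, and the tracker address assumes the default uvicorn binding on port 3000 shown at the bottom of the file:

    import requests

    TRACKER = "http://localhost:3000"  # assumed tracker address
    payload = {
        "ip": "203.0.113.7",       # hypothetical public IP
        "port": 8000,
        "shard_range": "0-11",     # or "dynamic-57-layers"
        "is_entry": True,
        "is_exit": False,
        "tps": 42.0,
        "latency": 35.0,
        "node_token": "abc123",
    }
    resp = requests.post(f"{TRACKER}/announce", json=payload, timeout=5)
    print(resp.json())  # {"status": "registered", "peer_count": ..., "stake": ...}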
+ @app.get("/check_active")
+ async def check_active(token: str):
+     now = time.time()
+     if await db.check_active(token, now):
+         return {"active": True}
+     raise HTTPException(status_code=404, detail="No active node found with this token")
+
+ @app.get("/peers")
+ async def get_peers(
+     layer_needed: Optional[int] = None,
+     shard_range: Optional[str] = None,
+     limit: int = 50
+ ):
+     now = time.time()
+     peers = await db.get_peers(layer_needed, shard_range, limit, now)
+     return peers
+
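A client-side sketch of querying the peer list for nodes whose shard range covers a given layer (parameter names match the endpoint signature above; the address is assumed):

    import requests

    # find up to 5 active peers whose shard range covers layer 3
    peers = requests.get(
        "http://localhost:3000/peers",
        params={"layer_needed": 3, "limit": 5},
        timeout=5,
    ).json()
    for p in peers:
        print(p["url"], p["shard_range"], p["tps"])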
+ @app.get("/active_tokens")
+ async def get_active_tokens(limit: int = 100, offset: int = 0):
+     """Return a paginated list of active node tokens for reward distribution."""
+     now = time.time()
+     tokens, total = await db.get_active_tokens(limit, offset, now)
+
+     return {
+         "tokens": tokens,
+         "total": total,
+         "page_size": limit,
+         "offset": offset
+     }
+
+ @app.get("/stats")
+ async def get_stats_endpoint():
+     now = time.time()
+     count, tps, lat = await db.get_stats(now)
+
+     return {
+         "active_nodes": count,
+         "model_size": "142B",
+         "total_tps": int(tps),
+         "avg_latency": f"{int(lat)}ms"
+     }
+
+ # --- Staking & Slashing Endpoints ---
+
+ @app.get("/stakes")
+ async def get_all_stakes():
+     """Get list of all stakes."""
+     return await db.get_all_stakes()
+
+ @app.post("/stake")
+ async def add_stake_endpoint(url: str = Body(...), amount: float = Body(...)):
+     # Check slashed
+     if await db.get_slashed_status(url):
+         return {"error": "Node slashed"}
+
+     new_stake = await db.add_stake(url, amount)
+     return {"new_stake": new_stake}
+
+ @app.post("/slash")
+ async def slash_node_endpoint(url: str = Body(...), reason: str = Body(...)):
+     print(f"SLASHING NODE {url} for {reason}")
+     await db.slash_node(url)
+     return {"status": "slashed", "url": url}
+
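Because both parameters of /stake are declared with Body(...), FastAPI expects them embedded as keys of one JSON object. An illustrative call (address and values assumed):

    import requests

    resp = requests.post(
        "http://localhost:3000/stake",
        json={"url": "http://203.0.113.7:8000", "amount": 250.0},
        timeout=5,
    )
    print(resp.json())  # {"new_stake": ...} or {"error": "Node slashed"}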
+ # --- Phase 4: Tensor Shard Endpoints ---
+
+ class TensorShardAnnouncement(BaseModel):
+     model_id: str
+     layer_id: int
+     shard_id: int
+     total_shards: int
+     grpc_addr: str
+     available_memory_mb: float = 0.0
+     current_load: float = 0.0
+     node_token: Optional[str] = None
+
+ @app.post("/tensor_shards/announce")
+ async def announce_tensor_shard(shard: TensorShardAnnouncement, request: Request):
+     """Announce a tensor shard availability."""
+     now = time.time()
+
+     # Generate unique ID
+     shard_id_key = f"{shard.model_id}:{shard.layer_id}:{shard.shard_id}:{shard.total_shards}"
+
+     # Get client URL from request or grpc_addr
+     client_ip = request.client.host if request.client else "unknown"
+     node_url = f"http://{client_ip}:8000"  # Assume standard port
+
+     import sqlite3
+     with sqlite3.connect(db._sqlite_path) as conn:
+         conn.execute("""
+             INSERT OR REPLACE INTO tensor_shards
+             (id, model_id, layer_id, shard_id, total_shards, node_url, grpc_addr,
+              available_memory_mb, current_load, last_seen, node_token)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+         """, (shard_id_key, shard.model_id, shard.layer_id, shard.shard_id,
+               shard.total_shards, node_url, shard.grpc_addr,
+               shard.available_memory_mb, shard.current_load, now, shard.node_token))
+
+     return {"status": "registered", "shard_key": shard_id_key}
+
+ @app.get("/tensor_shards")
+ async def get_tensor_shards(
+     model_id: str,
+     layer_id: int,
+     total_shards: int
+ ):
+     """Get all tensor shards for a specific layer."""
+     now = time.time()
+
+     import sqlite3
+     with sqlite3.connect(db._sqlite_path) as conn:
+         cursor = conn.execute("""
+             SELECT shard_id, grpc_addr, available_memory_mb, current_load, last_seen, node_url
+             FROM tensor_shards
+             WHERE model_id = ? AND layer_id = ? AND total_shards = ? AND last_seen > ?
+             ORDER BY shard_id
+         """, (model_id, layer_id, total_shards, now - 120))  # 2 minute timeout
+
+         shards = []
+         for row in cursor:
+             shards.append({
+                 "shard_id": row[0],
+                 "grpc_addr": row[1],
+                 "available_memory_mb": row[2],
+                 "current_load": row[3],
+                 "last_seen": row[4],
+                 "node_url": row[5]
+             })
+
+     return {"shards": shards, "total_found": len(shards)}
+
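Note that these Phase 4 endpoints open SQLite via db._sqlite_path directly, bypassing the Postgres branch of DatabaseManager. For illustration, a node holding shard 0 of 4 for layer 2 might announce like this (addresses and model id hypothetical):

    import requests

    requests.post(
        "http://localhost:3000/tensor_shards/announce",
        json={
            "model_id": "neurollm-142b",   # hypothetical model id
            "layer_id": 2,
            "shard_id": 0,
            "total_shards": 4,
            "grpc_addr": "203.0.113.7:50051",
            "available_memory_mb": 8192.0,
            "current_load": 0.2,
            "node_token": "abc123",
        },
        timeout=5,
    )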
+ @app.get("/tensor_shards/coverage")
+ async def get_tensor_shard_coverage(model_id: str):
+     """Get tensor shard coverage for a model."""
+     now = time.time()
+
+     import sqlite3
+     with sqlite3.connect(db._sqlite_path) as conn:
+         # Get all active shards for this model
+         cursor = conn.execute("""
+             SELECT layer_id, shard_id, total_shards, COUNT(*) as node_count
+             FROM tensor_shards
+             WHERE model_id = ? AND last_seen > ?
+             GROUP BY layer_id, shard_id, total_shards
+         """, (model_id, now - 120))
+
+         coverage = {}
+         for row in cursor:
+             layer_id, shard_id, total_shards, node_count = row
+             key = f"layer_{layer_id}"
+             if key not in coverage:
+                 coverage[key] = {"layer_id": layer_id, "shards": {}}
+             coverage[key]["shards"][shard_id] = {
+                 "shard_id": shard_id,
+                 "total_shards": total_shards,
+                 "node_count": node_count
+             }
+
+     # Check completeness
+     complete_layers = []
+     incomplete_layers = []
+
+     for layer_key, layer_info in coverage.items():
+         shards = layer_info["shards"]
+         if shards:
+             total = list(shards.values())[0]["total_shards"]
+             if len(shards) == total and all(s["node_count"] > 0 for s in shards.values()):
+                 complete_layers.append(layer_info["layer_id"])
+             else:
+                 incomplete_layers.append(layer_info["layer_id"])
+
+     return {
+         "model_id": model_id,
+         "coverage": coverage,
+         "complete_layers": complete_layers,
+         "incomplete_layers": incomplete_layers,
+         "is_inference_ready": len(incomplete_layers) == 0 and len(complete_layers) > 0
+     }
+
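A layer counts as complete when every shard index is present on at least one live node, and is_inference_ready requires all reported layers complete. An illustrative readiness poll against this endpoint (tracker address assumed):

    import requests, time

    def wait_until_ready(model_id: str, tracker: str = "http://localhost:3000"):
        # poll the coverage endpoint until every layer reports complete
        while True:
            cov = requests.get(f"{tracker}/tensor_shards/coverage",
                               params={"model_id": model_id}, timeout=5).json()
            if cov["is_inference_ready"]:
                return cov["complete_layers"]
            time.sleep(10)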
+ # --- Phase 4: Model Registry Endpoints ---
+
+ class ModelRegistration(BaseModel):
+     model_id: str
+     name: str
+     family: str
+     num_layers: int
+     hidden_dim: int
+     total_size_mb: float
+     required_stake: float = 0.0
+     node_token: Optional[str] = None
+
+ @app.post("/models/register")
+ async def register_model(model: ModelRegistration):
+     """Register a new model (requires stake for custom models)."""
+     now = time.time()
+
+     import sqlite3
+     with sqlite3.connect(db._sqlite_path) as conn:
+         # Check if already exists
+         existing = conn.execute(
+             "SELECT 1 FROM models WHERE model_id = ?",
+             (model.model_id,)
+         ).fetchone()
+
+         if existing:
+             return {"status": "exists", "model_id": model.model_id}
+
+         # Only NeuroLLM is supported - this is a decentralized network for our model
+         approved = model.family == "neurollm"
+
+         conn.execute("""
+             INSERT INTO models
+             (model_id, name, family, num_layers, hidden_dim, total_size_mb,
+              required_stake, approved, proposer_token, created_at)
+             VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+         """, (model.model_id, model.name, model.family, model.num_layers,
+               model.hidden_dim, model.total_size_mb, model.required_stake,
+               approved, model.node_token, now))
+
+     return {"status": "registered", "model_id": model.model_id, "approved": approved}
+
+ @app.get("/models")
+ async def list_models(approved_only: bool = True, family: Optional[str] = None):
+     """List available models."""
+     import sqlite3
+     with sqlite3.connect(db._sqlite_path) as conn:
+         query = "SELECT * FROM models WHERE 1=1"
+         params = []
+
+         if approved_only:
+             query += " AND approved = 1"
+
+         if family:
+             query += " AND family = ?"
+             params.append(family)
+
+         query += " ORDER BY total_size_mb"
+
+         cursor = conn.execute(query, params)
+         models = []
+         for row in cursor:
+             models.append({
+                 "model_id": row[0],
+                 "name": row[1],
+                 "family": row[2],
+                 "num_layers": row[3],
+                 "hidden_dim": row[4],
+                 "total_size_mb": row[5],
+                 "required_stake": row[6],
+                 "approved": bool(row[7])
+             })
+
+     return {"models": models}
+
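Only the neurollm family is auto-approved, so an illustrative registration might look like this (every value below is hypothetical, including the model id):

    import requests

    resp = requests.post(
        "http://localhost:3000/models/register",
        json={
            "model_id": "neurollm-142b",
            "name": "NeuroLLM 142B",
            "family": "neurollm",   # any other family is stored but not approved
            "num_layers": 57,
            "hidden_dim": 8192,
            "total_size_mb": 284000.0,
        },
        timeout=5,
    )
    print(resp.json())  # {"status": "registered", ..., "approved": true}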
+ @app.get("/models/{model_id}/status")
+ async def get_model_status(model_id: str):
+     """Get status of a specific model in the network."""
+     now = time.time()
+
+     import sqlite3
+     with sqlite3.connect(db._sqlite_path) as conn:
+         # Get model info
+         model_row = conn.execute(
+             "SELECT * FROM models WHERE model_id = ?",
+             (model_id,)
+         ).fetchone()
+
+         if not model_row:
+             raise HTTPException(status_code=404, detail="Model not found")
+
+         # Get pipeline coverage (from peers)
+         peers = await db.get_peers(layer_needed=None, shard_range=None, limit=1000, now=now)
+
+         # Filter peers serving this model (for now, assume all peers serve neurollm)
+         # In production, peers would announce which model they serve
+
+         layer_coverage = {}
+         for peer in peers:
+             shard_range = peer.get("shard_range", "0-0")
+             try:
+                 start, end = map(int, shard_range.split("-"))
+                 for layer in range(start, end + 1):
+                     if layer not in layer_coverage:
+                         layer_coverage[layer] = 0
+                     layer_coverage[layer] += 1
+             except:
+                 continue
+
+         # Get tensor shard coverage
+         tensor_cursor = conn.execute("""
+             SELECT layer_id, COUNT(DISTINCT shard_id) as shard_count,
+                    MAX(total_shards) as total_shards
+             FROM tensor_shards
+             WHERE model_id = ? AND last_seen > ?
+             GROUP BY layer_id
+         """, (model_id, now - 120))
+
+         tensor_coverage = {}
+         for row in tensor_cursor:
+             tensor_coverage[row[0]] = {
+                 "shards_available": row[1],
+                 "total_shards": row[2],
+                 "complete": row[1] == row[2]
+             }
+
+     return {
+         "model_id": model_id,
+         "name": model_row[1],
+         "family": model_row[2],
+         "num_layers": model_row[3],
+         "pipeline_coverage": layer_coverage,
+         "tensor_coverage": tensor_coverage,
+         "total_nodes": len(peers),
+         "is_fully_covered": all(
+             layer_coverage.get(i, 0) > 0
+             for i in range(model_row[3])
+         )
+     }
+
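The bare except above silently skips peers whose shard_range uses the dynamic format ("dynamic-57-layers"), since int() fails on "dynamic". A hedged helper mirroring the parsing rules that /announce applies, were the two code paths unified:

    def parse_shard_range(shard_range: str) -> tuple:
        # mirrors /announce: returns (start, end_exclusive); falls back to (0, 1)
        try:
            if shard_range.startswith("dynamic-"):
                return 0, int(shard_range.split("-")[1])
            start, end = map(int, shard_range.split("-"))
            return start, end + 1
        except (ValueError, IndexError):
            return 0, 1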
+ @app.get("/layer_coverage")
+ async def get_layer_coverage():
+     """
+     Get the current distribution of nodes across transformer layers.
+     Used by DynamicShardManager to determine optimal layer allocation.
+     """
+     now = time.time()
+
+     # Get all active peers
+     peers = await db.get_peers(layer_needed=None, shard_range=None, limit=1000, now=now)
+
+     # Count nodes per layer (GPT-2 has 12 layers: 0-11)
+     TOTAL_LAYERS = 12
+     layer_coverage = {i: {"layer_id": i, "node_count": 0, "nodes": []} for i in range(TOTAL_LAYERS)}
+
+     for peer in peers:
+         shard_range = peer.get("shard_range", "0-0")
+         try:
+             start, end = map(int, shard_range.split("-"))
+             # Each node covers layers from start to end (inclusive based on convention)
+             for layer in range(start, min(end + 1, TOTAL_LAYERS)):
+                 layer_coverage[layer]["node_count"] += 1
+                 layer_coverage[layer]["nodes"].append(peer.get("url", ""))
+         except:
+             continue
+
+     # Calculate statistics
+     node_counts = [lc["node_count"] for lc in layer_coverage.values()]
+     total_nodes = len(peers)
+     avg_coverage = sum(node_counts) / TOTAL_LAYERS if TOTAL_LAYERS > 0 else 0
+     min_coverage = min(node_counts) if node_counts else 0
+     max_coverage = max(node_counts) if node_counts else 0
+
+     # Find underserved layers (below average)
+     underserved = [
+         layer_id for layer_id, lc in layer_coverage.items()
+         if lc["node_count"] < avg_coverage
+     ]
+
+     # Find critical layers (entry=0, exit=11)
+     entry_coverage = layer_coverage[0]["node_count"]
+     exit_coverage = layer_coverage[TOTAL_LAYERS - 1]["node_count"]
+
+     return {
+         "total_layers": TOTAL_LAYERS,
+         "total_active_nodes": total_nodes,
+         "layer_coverage": [
+             {
+                 "layer_id": lc["layer_id"],
+                 "node_count": lc["node_count"],
+                 # Don't include full node list in response for privacy/size
+             }
+             for lc in layer_coverage.values()
+         ],
+         "statistics": {
+             "avg_nodes_per_layer": round(avg_coverage, 2),
+             "min_coverage": min_coverage,
+             "max_coverage": max_coverage,
+             "underserved_layers": underserved,
+             "entry_layer_nodes": entry_coverage,
+             "exit_layer_nodes": exit_coverage
+         },
+         "health": {
+             "has_full_coverage": min_coverage > 0,
+             "is_balanced": (max_coverage - min_coverage) <= avg_coverage if avg_coverage > 0 else True,
+             "entry_healthy": entry_coverage >= 1,
+             "exit_healthy": exit_coverage >= 1
+         }
+     }
+
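A joining node could use this report to pick the least-served layer, roughly as below; the field names come from the response above, but the selection policy itself is illustrative, not the package's DynamicShardManager:

    import requests

    report = requests.get("http://localhost:3000/layer_coverage", timeout=5).json()
    coverage = report["layer_coverage"]
    # serve the layer with the fewest nodes covering it
    target = min(coverage, key=lambda lc: lc["node_count"])
    print("serving layer", target["layer_id"])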
+
+ @app.get("/network_architecture")
+ async def get_network_architecture():
+     """
+     Get the current network-wide architecture.
+
+     This endpoint helps nodes rejoin the network with the correct architecture.
+     The architecture is determined by querying active nodes and finding consensus.
+
+     Returns the architecture used by the majority of active nodes.
+     """
+     import requests
+     import asyncio
+     from concurrent.futures import ThreadPoolExecutor
+
+     now = time.time()
+
+     # Get active peers
+     peers = await db.get_peers(layer_needed=None, shard_range=None, limit=50, now=now)
+
+     if not peers:
+         return {
+             "status": "no_peers",
+             "message": "No active peers - network is bootstrapping",
+             "hidden_dim": None,
+         }
+
+     # Query architecture from active peers (using requests in thread pool)
+     def query_peer_arch(peer_url: str):
+         """Query a single peer for its architecture (blocking)."""
+         try:
+             response = requests.get(f"{peer_url}/api/node/architecture", timeout=3.0)
+             if response.status_code == 200:
+                 arch_data = response.json()
+                 if arch_data.get("hidden_dim"):
+                     return arch_data
+         except Exception:
+             pass
+         return None
+
+     # Run queries in thread pool to not block the event loop
+     architectures = []
+     loop = asyncio.get_event_loop()
+     with ThreadPoolExecutor(max_workers=5) as executor:
+         tasks = []
+         for peer in peers[:10]:  # Sample up to 10 peers
+             peer_url = peer.get("url", "")
+             if peer_url:
+                 tasks.append(loop.run_in_executor(executor, query_peer_arch, peer_url))
+
+         if tasks:
+             results = await asyncio.gather(*tasks, return_exceptions=True)
+             architectures = [r for r in results if r and not isinstance(r, Exception)]
+
+     if not architectures:
+         return {
+             "status": "unavailable",
+             "message": "Could not query peer architectures",
+             "hidden_dim": None,
+         }
+
+     # Find consensus (majority architecture)
+     # Group by (hidden_dim, num_heads, num_kv_heads) tuple
+     arch_counts = {}
+     for arch in architectures:
+         key = (arch["hidden_dim"], arch["num_heads"], arch["num_kv_heads"])
+         if key not in arch_counts:
+             arch_counts[key] = {"count": 0, "data": arch}
+         arch_counts[key]["count"] += 1
+
+     # Find the most common architecture
+     consensus = max(arch_counts.values(), key=lambda x: x["count"])
+     consensus_arch = consensus["data"]
+
+     return {
+         "status": "ok",
+         "hidden_dim": consensus_arch["hidden_dim"],
+         "intermediate_dim": consensus_arch.get("intermediate_dim"),
+         "num_layers": consensus_arch.get("num_layers"),
+         "num_heads": consensus_arch["num_heads"],
+         "num_kv_heads": consensus_arch["num_kv_heads"],
+         "estimated_params": consensus_arch.get("estimated_params"),
+         "consensus_peers": consensus["count"],
+         "total_peers_sampled": len(architectures),
+     }
+
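On rejoin, a node can fetch the consensus architecture before allocating weights; a minimal client sketch (tracker address assumed):

    import requests

    arch = requests.get("http://localhost:3000/network_architecture", timeout=10).json()
    if arch["status"] == "ok":
        print(f"rebuilding with hidden_dim={arch['hidden_dim']}, "
              f"heads={arch['num_heads']} (agreed by {arch['consensus_peers']} peers)")
    else:
        print("bootstrapping:", arch.get("message"))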
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=3000)
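Run directly, the tracker binds 0.0.0.0:3000 and defaults to the SQLite file tracker.db; pointing DATABASE_URL at a postgres:// URL switches the peers/stakes tables to asyncpg. A sketch of hosting it programmatically, with the import path taken from the file list above (the environment variable must be set before the module-level DatabaseManager() is constructed):

    import os
    os.environ.setdefault("DATABASE_URL", "sqlite://tracker.db")  # or a postgres:// URL

    import uvicorn
    from neuroshard.tracker.server import app  # module path per the file list

    uvicorn.run(app, host="0.0.0.0", port=3000)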