nexaroa 0.0.111__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuroshard/__init__.py +93 -0
- neuroshard/__main__.py +4 -0
- neuroshard/cli.py +466 -0
- neuroshard/core/__init__.py +92 -0
- neuroshard/core/consensus/verifier.py +252 -0
- neuroshard/core/crypto/__init__.py +20 -0
- neuroshard/core/crypto/ecdsa.py +392 -0
- neuroshard/core/economics/__init__.py +52 -0
- neuroshard/core/economics/constants.py +387 -0
- neuroshard/core/economics/ledger.py +2111 -0
- neuroshard/core/economics/market.py +975 -0
- neuroshard/core/economics/wallet.py +168 -0
- neuroshard/core/governance/__init__.py +74 -0
- neuroshard/core/governance/proposal.py +561 -0
- neuroshard/core/governance/registry.py +545 -0
- neuroshard/core/governance/versioning.py +332 -0
- neuroshard/core/governance/voting.py +453 -0
- neuroshard/core/model/__init__.py +30 -0
- neuroshard/core/model/dynamic.py +4186 -0
- neuroshard/core/model/llm.py +905 -0
- neuroshard/core/model/registry.py +164 -0
- neuroshard/core/model/scaler.py +387 -0
- neuroshard/core/model/tokenizer.py +568 -0
- neuroshard/core/network/__init__.py +56 -0
- neuroshard/core/network/connection_pool.py +72 -0
- neuroshard/core/network/dht.py +130 -0
- neuroshard/core/network/dht_plan.py +55 -0
- neuroshard/core/network/dht_proof_store.py +516 -0
- neuroshard/core/network/dht_protocol.py +261 -0
- neuroshard/core/network/dht_service.py +506 -0
- neuroshard/core/network/encrypted_channel.py +141 -0
- neuroshard/core/network/nat.py +201 -0
- neuroshard/core/network/nat_traversal.py +695 -0
- neuroshard/core/network/p2p.py +929 -0
- neuroshard/core/network/p2p_data.py +150 -0
- neuroshard/core/swarm/__init__.py +106 -0
- neuroshard/core/swarm/aggregation.py +729 -0
- neuroshard/core/swarm/buffers.py +643 -0
- neuroshard/core/swarm/checkpoint.py +709 -0
- neuroshard/core/swarm/compute.py +624 -0
- neuroshard/core/swarm/diloco.py +844 -0
- neuroshard/core/swarm/factory.py +1288 -0
- neuroshard/core/swarm/heartbeat.py +669 -0
- neuroshard/core/swarm/logger.py +487 -0
- neuroshard/core/swarm/router.py +658 -0
- neuroshard/core/swarm/service.py +640 -0
- neuroshard/core/training/__init__.py +29 -0
- neuroshard/core/training/checkpoint.py +600 -0
- neuroshard/core/training/distributed.py +1602 -0
- neuroshard/core/training/global_tracker.py +617 -0
- neuroshard/core/training/production.py +276 -0
- neuroshard/governance_cli.py +729 -0
- neuroshard/grpc_server.py +895 -0
- neuroshard/runner.py +3223 -0
- neuroshard/sdk/__init__.py +92 -0
- neuroshard/sdk/client.py +990 -0
- neuroshard/sdk/errors.py +101 -0
- neuroshard/sdk/types.py +282 -0
- neuroshard/tracker/__init__.py +0 -0
- neuroshard/tracker/server.py +864 -0
- neuroshard/ui/__init__.py +0 -0
- neuroshard/ui/app.py +102 -0
- neuroshard/ui/templates/index.html +1052 -0
- neuroshard/utils/__init__.py +0 -0
- neuroshard/utils/autostart.py +81 -0
- neuroshard/utils/hardware.py +121 -0
- neuroshard/utils/serialization.py +90 -0
- neuroshard/version.py +1 -0
- nexaroa-0.0.111.dist-info/METADATA +283 -0
- nexaroa-0.0.111.dist-info/RECORD +78 -0
- nexaroa-0.0.111.dist-info/WHEEL +5 -0
- nexaroa-0.0.111.dist-info/entry_points.txt +4 -0
- nexaroa-0.0.111.dist-info/licenses/LICENSE +190 -0
- nexaroa-0.0.111.dist-info/top_level.txt +2 -0
- protos/__init__.py +0 -0
- protos/neuroshard.proto +651 -0
- protos/neuroshard_pb2.py +160 -0
- protos/neuroshard_pb2_grpc.py +1298 -0
|
@@ -0,0 +1,864 @@
|
|
|
1
|
+
|
|
2
|
+
from fastapi import FastAPI, HTTPException, Body, Request, Query
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
import time
|
|
6
|
+
import uvicorn
|
|
7
|
+
import os
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
# Database Abstraction
|
|
12
|
+
class DatabaseManager:
    """Persistence layer for the tracker, with two interchangeable backends.

    Postgres (via an asyncpg connection pool) is used when DATABASE_URL
    starts with "postgres"; otherwise a local SQLite file is used.  Every
    public method branches on ``self.is_postgres``.

    NOTE(review): the SQLite branch opens a blocking ``sqlite3`` connection
    inside ``async`` methods, which stalls the event loop under load —
    consider ``loop.run_in_executor`` or an async driver.
    NOTE(review): ``init_postgres`` creates only peers/stakes; the Phase-4
    ``tensor_shards``/``models`` tables exist only in the SQLite schema.
    """

    def __init__(self) -> None:
        # asyncpg pool; stays None when running on the SQLite backend.
        self.pool = None
        self.db_url = os.getenv("DATABASE_URL", "sqlite://tracker.db")
        # Matches both "postgres://" and "postgresql://" URLs.
        self.is_postgres = self.db_url.startswith("postgres")
        # Fallback URL format is "sqlite://<path>"; strip the scheme.
        self._sqlite_path = self.db_url.replace("sqlite://", "")

    async def connect(self):
        """Connect to the configured backend and ensure the schema exists."""
        if self.is_postgres:
            import asyncpg
            # Wait for DB to be ready: up to 5 attempts, 2s apart (covers a
            # Postgres container that is still starting up).
            for i in range(5):
                try:
                    self.pool = await asyncpg.create_pool(self.db_url)
                    break
                except Exception as e:
                    print(f"Waiting for DB... {e}")
                    await asyncio.sleep(2)
            if not self.pool:
                raise Exception("Could not connect to Postgres")

            await self.init_postgres()
        else:
            self.init_sqlite()

    async def close(self):
        """Close the asyncpg pool (no-op on the SQLite backend)."""
        if self.pool:
            await self.pool.close()

    def init_sqlite(self):
        """Create the SQLite schema: peers, stakes, tensor_shards, models."""
        import sqlite3
        with sqlite3.connect(self._sqlite_path) as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS peers (
                    url TEXT PRIMARY KEY,
                    ip TEXT,
                    port INTEGER,
                    shard_range TEXT,
                    shard_start INTEGER,
                    shard_end INTEGER,
                    is_entry BOOLEAN,
                    is_exit BOOLEAN,
                    last_seen REAL,
                    tps REAL,
                    latency REAL,
                    node_token TEXT
                )
            """)
            conn.execute("""
                CREATE TABLE IF NOT EXISTS stakes (
                    url TEXT PRIMARY KEY,
                    amount REAL,
                    slashed BOOLEAN DEFAULT 0
                )
            """)
            # Phase 4: Tensor shard tracking
            conn.execute("""
                CREATE TABLE IF NOT EXISTS tensor_shards (
                    id TEXT PRIMARY KEY,
                    model_id TEXT,
                    layer_id INTEGER,
                    shard_id INTEGER,
                    total_shards INTEGER,
                    node_url TEXT,
                    grpc_addr TEXT,
                    available_memory_mb REAL,
                    current_load REAL,
                    last_seen REAL,
                    node_token TEXT
                )
            """)
            # Phase 4: Model registry
            conn.execute("""
                CREATE TABLE IF NOT EXISTS models (
                    model_id TEXT PRIMARY KEY,
                    name TEXT,
                    family TEXT,
                    num_layers INTEGER,
                    hidden_dim INTEGER,
                    total_size_mb REAL,
                    required_stake REAL,
                    approved BOOLEAN DEFAULT 1,
                    proposer_token TEXT,
                    created_at REAL
                )
            """)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_start ON peers(shard_start)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_range ON peers(shard_range)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_last_seen ON peers(last_seen)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_tensor_model_layer ON tensor_shards(model_id, layer_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_tensor_last_seen ON tensor_shards(last_seen)")

    async def init_postgres(self):
        """Create the Postgres schema (peers and stakes only — see class note)."""
        async with self.pool.acquire() as conn:
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS peers (
                    url TEXT PRIMARY KEY,
                    ip TEXT,
                    port INTEGER,
                    shard_range TEXT,
                    shard_start INTEGER,
                    shard_end INTEGER,
                    is_entry BOOLEAN,
                    is_exit BOOLEAN,
                    last_seen DOUBLE PRECISION,
                    tps DOUBLE PRECISION,
                    latency DOUBLE PRECISION,
                    node_token TEXT
                )
            """)
            await conn.execute("""
                CREATE TABLE IF NOT EXISTS stakes (
                    url TEXT PRIMARY KEY,
                    amount DOUBLE PRECISION,
                    slashed BOOLEAN DEFAULT FALSE
                )
            """)
            await conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_start ON peers(shard_start)")
            await conn.execute("CREATE INDEX IF NOT EXISTS idx_shard_range ON peers(shard_range)")
            await conn.execute("CREATE INDEX IF NOT EXISTS idx_last_seen ON peers(last_seen)")

    async def upsert_peer(self, url, ip, port, shard_range, start, end, is_entry, is_exit, now, tps, latency, node_token):
        """Insert or fully refresh a peer row, keyed by url."""
        if self.is_postgres:
            query = """
                INSERT INTO peers
                (url, ip, port, shard_range, shard_start, shard_end, is_entry, is_exit, last_seen, tps, latency, node_token)
                VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)
                ON CONFLICT (url) DO UPDATE SET
                    ip = EXCLUDED.ip,
                    port = EXCLUDED.port,
                    shard_range = EXCLUDED.shard_range,
                    shard_start = EXCLUDED.shard_start,
                    shard_end = EXCLUDED.shard_end,
                    is_entry = EXCLUDED.is_entry,
                    is_exit = EXCLUDED.is_exit,
                    last_seen = EXCLUDED.last_seen,
                    tps = EXCLUDED.tps,
                    latency = EXCLUDED.latency,
                    node_token = EXCLUDED.node_token
            """
            await self.pool.execute(query, url, ip, port, shard_range, start, end, is_entry, is_exit, now, tps, latency, node_token)
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                # INSERT OR REPLACE overwrites the whole row, matching the
                # column-by-column DO UPDATE above.
                conn.execute("""
                    INSERT OR REPLACE INTO peers
                    (url, ip, port, shard_range, shard_start, shard_end, is_entry, is_exit, last_seen, tps, latency, node_token)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """, (url, ip, port, shard_range, start, end, is_entry, is_exit, now, tps, latency, node_token))

    async def get_slashed_status(self, url):
        """Return truthy if the node at ``url`` has been slashed (banned)."""
        if self.is_postgres:
            row = await self.pool.fetchrow("SELECT slashed FROM stakes WHERE url = $1", url)
            return row['slashed'] if row else False
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                cursor = conn.execute("SELECT slashed FROM stakes WHERE url = ?", (url,))
                row = cursor.fetchone()
                # SQLite stores BOOLEAN as 0/1; callers only test truthiness.
                return row[0] if row else False

    async def grant_initial_stake(self, url):
        """Seed a brand-new node with 1000.0 stake (dev/PoC helper).

        Currently unused — the call site in /announce is commented out.
        """
        if self.is_postgres:
            await self.pool.execute("INSERT INTO stakes (url, amount) VALUES ($1, 1000.0) ON CONFLICT DO NOTHING", url)
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                conn.execute("INSERT OR IGNORE INTO stakes (url, amount) VALUES (?, 1000.0)", (url,))

    async def get_stats(self, now):
        """Return (active_peer_count, total_tps, avg_latency) over peers
        seen within the last 60 seconds of ``now``."""
        if self.is_postgres:
            count = await self.pool.fetchval("SELECT COUNT(*) FROM peers WHERE last_seen > $1", now - 60)
            row = await self.pool.fetchrow("SELECT SUM(tps) as tps, AVG(latency) as lat FROM peers WHERE last_seen > $1", now - 60)
            # SUM/AVG are NULL when no rows match; coalesce to 0.
            return count, row['tps'] or 0, row['lat'] or 0
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                count = conn.execute("SELECT COUNT(*) FROM peers WHERE last_seen > ?", (now - 60,)).fetchone()[0]
                stats = conn.execute("SELECT SUM(tps), AVG(latency) FROM peers WHERE last_seen > ?", (now - 60,)).fetchone()
                return count, stats[0] or 0, stats[1] or 0

    async def get_stake(self, url):
        """Return the stake amount for ``url``.

        NOTE(review): backend asymmetry — Postgres returns None for an
        unknown url (fetchval of no row), SQLite returns 0.0.
        """
        if self.is_postgres:
            return await self.pool.fetchval("SELECT amount FROM stakes WHERE url = $1", url)
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                res = conn.execute("SELECT amount FROM stakes WHERE url = ?", (url,)).fetchone()
                return res[0] if res else 0.0

    async def get_all_stakes(self):
        """Return every stake row as dicts: {url, amount, slashed}."""
        if self.is_postgres:
            rows = await self.pool.fetch("SELECT url, amount, slashed FROM stakes")
            return [dict(r) for r in rows]
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                cursor = conn.execute("SELECT url, amount, slashed FROM stakes")
                return [{"url": r[0], "amount": r[1], "slashed": bool(r[2])} for r in cursor]

    async def check_active(self, token, now):
        """Return truthy if a peer with this node_token was seen within 60s."""
        if self.is_postgres:
            return await self.pool.fetchval("SELECT 1 FROM peers WHERE node_token = $1 AND last_seen > $2", token, now - 60)
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                return conn.execute("SELECT 1 FROM peers WHERE node_token = ? AND last_seen > ?", (token, now - 60)).fetchone()

    async def get_peers(self, layer_needed, shard_range, limit, now):
        """Return up to ``limit`` random active peers as dicts.

        Optional filters: ``layer_needed`` selects peers whose
        [shard_start, shard_end) span covers that layer; ``shard_range``
        matches the raw announced range string.  "Active" = seen in the
        last 60 seconds.  Ordering is randomized for crude load-balancing.
        """
        if self.is_postgres:
            query = "SELECT url, shard_range, last_seen, tps, latency, node_token FROM peers WHERE last_seen > $1"
            params = [now - 60]

            # asyncpg uses numbered placeholders; track the next index.
            idx = 2
            if layer_needed is not None:
                query += f" AND shard_start <= ${idx} AND shard_end > ${idx+1}"
                params.extend([layer_needed, layer_needed])
                idx += 2

            if shard_range is not None:
                query += f" AND shard_range = ${idx}"
                params.append(shard_range)
                idx += 1

            query += f" ORDER BY RANDOM() LIMIT ${idx}"
            params.append(limit)

            rows = await self.pool.fetch(query, *params)
            return [dict(row) for row in rows]
        else:
            import sqlite3
            query = "SELECT url, shard_range, last_seen, tps, latency, node_token FROM peers WHERE last_seen > ?"
            params = [now - 60]

            if layer_needed is not None:
                query += " AND shard_start <= ? AND shard_end > ?"
                params.extend([layer_needed, layer_needed])

            if shard_range is not None:
                query += " AND shard_range = ?"
                params.append(shard_range)

            query += " ORDER BY RANDOM() LIMIT ?"
            params.append(limit)

            with sqlite3.connect(self._sqlite_path) as conn:
                cursor = conn.execute(query, params)
                return [
                    {"url": r[0], "shard_range": r[1], "last_seen": r[2], "tps": r[3], "latency": r[4], "node_token": r[5]}
                    for r in cursor
                ]

    async def get_active_tokens(self, limit, offset, now):
        """Return (tokens_page, total_count) of node tokens active in the
        last 60s, newest first — used for reward distribution paging."""
        if self.is_postgres:
            tokens = await self.pool.fetch(
                "SELECT node_token FROM peers WHERE last_seen > $1 AND node_token IS NOT NULL ORDER BY last_seen DESC LIMIT $2 OFFSET $3",
                now - 60, limit, offset
            )
            total = await self.pool.fetchval(
                "SELECT COUNT(*) FROM peers WHERE last_seen > $1 AND node_token IS NOT NULL",
                now - 60
            )
            return [r['node_token'] for r in tokens], total
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                cursor = conn.execute(
                    "SELECT node_token FROM peers WHERE last_seen > ? AND node_token IS NOT NULL ORDER BY last_seen DESC LIMIT ? OFFSET ?",
                    (now - 60, limit, offset)
                )
                tokens = [row[0] for row in cursor.fetchall()]
                total = conn.execute(
                    "SELECT COUNT(*) FROM peers WHERE last_seen > ? AND node_token IS NOT NULL",
                    (now - 60,)
                ).fetchone()[0]
                return tokens, total

    async def add_stake(self, url, amount):
        """Add ``amount`` to the stake of ``url`` (upsert) and return the
        resulting balance."""
        if self.is_postgres:
            await self.pool.execute("""
                INSERT INTO stakes (url, amount) VALUES ($1, $2)
                ON CONFLICT(url) DO UPDATE SET amount = stakes.amount + $3
            """, url, amount, amount)
            return await self.pool.fetchval("SELECT amount FROM stakes WHERE url = $1", url)
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                # In SQLite's DO UPDATE, the unqualified "amount" refers to
                # the existing row's value.
                conn.execute("INSERT INTO stakes (url, amount) VALUES (?, ?) ON CONFLICT(url) DO UPDATE SET amount = amount + ?", (url, amount, amount))
                return conn.execute("SELECT amount FROM stakes WHERE url = ?", (url,)).fetchone()[0]

    async def slash_node(self, url):
        """Zero the node's stake, mark it slashed, and drop its peer row."""
        if self.is_postgres:
            async with self.pool.acquire() as conn:
                await conn.execute("UPDATE stakes SET amount = 0, slashed = TRUE WHERE url = $1", url)
                await conn.execute("DELETE FROM peers WHERE url = $1", url)
        else:
            import sqlite3
            with sqlite3.connect(self._sqlite_path) as conn:
                conn.execute("UPDATE stakes SET amount = 0, slashed = 1 WHERE url = ?", (url,))
                conn.execute("DELETE FROM peers WHERE url = ?", (url,))
|
|
312
|
+
|
|
313
|
+
# Module-level FastAPI application and the single shared database handle
# used by every endpoint below.
app = FastAPI(title="NeuroShard Tracker")
db = DatabaseManager()
|
|
315
|
+
|
|
316
|
+
@app.on_event("startup")
|
|
317
|
+
async def startup():
|
|
318
|
+
await db.connect()
|
|
319
|
+
|
|
320
|
+
@app.on_event("shutdown")
|
|
321
|
+
async def shutdown():
|
|
322
|
+
await db.close()
|
|
323
|
+
|
|
324
|
+
class NodeAnnouncement(BaseModel):
    """Payload a node POSTs to /announce to (re)register itself."""
    ip: str
    port: int
    shard_range: str  # e.g. "0-4", or "dynamic-<N>-layers" (see /announce parsing)
    is_entry: bool = False
    is_exit: bool = False
    tps: float = 0.0
    latency: float = 0.0
    node_token: Optional[str] = None  # identity token used by /check_active and /active_tokens
|
|
333
|
+
|
|
334
|
+
class PeerInfo(BaseModel):
    """Schema of a peer record as served by /peers.

    NOTE(review): not referenced by the endpoints visible in this module
    (they return plain dicts); presumably kept for SDK/client typing.
    """
    url: str
    shard_range: str
    last_seen: float
    tps: float
    latency: float
    node_token: Optional[str] = None
|
|
341
|
+
|
|
342
|
+
@app.post("/announce")
|
|
343
|
+
async def announce(node: NodeAnnouncement, request: Request):
|
|
344
|
+
client_ip = node.ip
|
|
345
|
+
url = f"http://{client_ip}:{node.port}"
|
|
346
|
+
|
|
347
|
+
# Parse shard range - supports both "0-11" format and "dynamic-57-layers" format
|
|
348
|
+
try:
|
|
349
|
+
if node.shard_range.startswith("dynamic-"):
|
|
350
|
+
# Dynamic mode: "dynamic-57-layers" means node has layers 0 to 56 (57 layers total)
|
|
351
|
+
# Parse the number of layers
|
|
352
|
+
parts = node.shard_range.split("-")
|
|
353
|
+
if len(parts) >= 2:
|
|
354
|
+
num_layers = int(parts[1])
|
|
355
|
+
start = 0 # Dynamic nodes always start at layer 0 (embedding)
|
|
356
|
+
end = num_layers # End is exclusive, so 57 layers = 0-56
|
|
357
|
+
else:
|
|
358
|
+
start, end = 0, 1
|
|
359
|
+
else:
|
|
360
|
+
# Traditional format: "0-11" means layers 0 to 11
|
|
361
|
+
start, end = map(int, node.shard_range.split("-"))
|
|
362
|
+
end = end + 1 # Make end exclusive for consistent querying
|
|
363
|
+
except Exception as e:
|
|
364
|
+
logging.warning(f"Failed to parse shard_range '{node.shard_range}': {e}")
|
|
365
|
+
start, end = 0, 1 # Default to at least layer 0
|
|
366
|
+
|
|
367
|
+
now = time.time()
|
|
368
|
+
|
|
369
|
+
# Check if slashed
|
|
370
|
+
if await db.get_slashed_status(url):
|
|
371
|
+
raise HTTPException(status_code=403, detail="Node is banned (slashed).")
|
|
372
|
+
|
|
373
|
+
# UPSERT Peer
|
|
374
|
+
await db.upsert_peer(url, client_ip, node.port, node.shard_range, start, end, node.is_entry, node.is_exit, now, node.tps, node.latency, node.node_token)
|
|
375
|
+
|
|
376
|
+
# REMOVED: Grant initial free stake for PoC if new
|
|
377
|
+
# This was a development feature - nodes should earn or stake their own NEURO
|
|
378
|
+
# await db.grant_initial_stake(url)
|
|
379
|
+
|
|
380
|
+
# Get stats
|
|
381
|
+
count, tps, latency = await db.get_stats(now)
|
|
382
|
+
stake = await db.get_stake(url)
|
|
383
|
+
|
|
384
|
+
return {"status": "registered", "peer_count": count, "stake": stake}
|
|
385
|
+
|
|
386
|
+
@app.get("/check_active")
|
|
387
|
+
async def check_active(token: str):
|
|
388
|
+
now = time.time()
|
|
389
|
+
if await db.check_active(token, now):
|
|
390
|
+
return {"active": True}
|
|
391
|
+
raise HTTPException(status_code=404, detail="No active node found with this token")
|
|
392
|
+
|
|
393
|
+
@app.get("/peers")
|
|
394
|
+
async def get_peers(
|
|
395
|
+
layer_needed: Optional[int] = None,
|
|
396
|
+
shard_range: Optional[str] = None,
|
|
397
|
+
limit: int = 50
|
|
398
|
+
):
|
|
399
|
+
now = time.time()
|
|
400
|
+
peers = await db.get_peers(layer_needed, shard_range, limit, now)
|
|
401
|
+
return peers
|
|
402
|
+
|
|
403
|
+
@app.get("/active_tokens")
|
|
404
|
+
async def get_active_tokens(limit: int = 100, offset: int = 0):
|
|
405
|
+
"""Return a paginated list of active node tokens for reward distribution."""
|
|
406
|
+
now = time.time()
|
|
407
|
+
tokens, total = await db.get_active_tokens(limit, offset, now)
|
|
408
|
+
|
|
409
|
+
return {
|
|
410
|
+
"tokens": tokens,
|
|
411
|
+
"total": total,
|
|
412
|
+
"page_size": limit,
|
|
413
|
+
"offset": offset
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
@app.get("/stats")
|
|
417
|
+
async def get_stats_endpoint():
|
|
418
|
+
now = time.time()
|
|
419
|
+
count, tps, lat = await db.get_stats(now)
|
|
420
|
+
|
|
421
|
+
return {
|
|
422
|
+
"active_nodes": count,
|
|
423
|
+
"model_size": "142B",
|
|
424
|
+
"total_tps": int(tps),
|
|
425
|
+
"avg_latency": f"{int(lat)}ms"
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
# --- Staking & Slashing Endpoints ---
|
|
429
|
+
|
|
430
|
+
@app.get("/stakes")
|
|
431
|
+
async def get_all_stakes():
|
|
432
|
+
"""Get list of all stakes."""
|
|
433
|
+
return await db.get_all_stakes()
|
|
434
|
+
|
|
435
|
+
@app.post("/stake")
|
|
436
|
+
async def add_stake_endpoint(url: str = Body(...), amount: float = Body(...)):
|
|
437
|
+
# Check slashed
|
|
438
|
+
if await db.get_slashed_status(url):
|
|
439
|
+
return {"error": "Node slashed"}
|
|
440
|
+
|
|
441
|
+
new_stake = await db.add_stake(url, amount)
|
|
442
|
+
return {"new_stake": new_stake}
|
|
443
|
+
|
|
444
|
+
@app.post("/slash")
|
|
445
|
+
async def slash_node_endpoint(url: str = Body(...), reason: str = Body(...)):
|
|
446
|
+
print(f"SLASHING NODE {url} for {reason}")
|
|
447
|
+
await db.slash_node(url)
|
|
448
|
+
return {"status": "slashed", "url": url}
|
|
449
|
+
|
|
450
|
+
# --- Phase 4: Tensor Shard Endpoints ---
|
|
451
|
+
|
|
452
|
+
class TensorShardAnnouncement(BaseModel):
    """Payload for /tensor_shards/announce: one shard of one layer."""
    model_id: str
    layer_id: int
    shard_id: int       # index of this shard within the layer
    total_shards: int   # how many shards the layer is split into
    grpc_addr: str      # address where the shard is served
    available_memory_mb: float = 0.0
    current_load: float = 0.0
    node_token: Optional[str] = None
|
|
461
|
+
|
|
462
|
+
@app.post("/tensor_shards/announce")
|
|
463
|
+
async def announce_tensor_shard(shard: TensorShardAnnouncement, request: Request):
|
|
464
|
+
"""Announce a tensor shard availability."""
|
|
465
|
+
now = time.time()
|
|
466
|
+
|
|
467
|
+
# Generate unique ID
|
|
468
|
+
shard_id_key = f"{shard.model_id}:{shard.layer_id}:{shard.shard_id}:{shard.total_shards}"
|
|
469
|
+
|
|
470
|
+
# Get client URL from request or grpc_addr
|
|
471
|
+
client_ip = request.client.host if request.client else "unknown"
|
|
472
|
+
node_url = f"http://{client_ip}:8000" # Assume standard port
|
|
473
|
+
|
|
474
|
+
import sqlite3
|
|
475
|
+
with sqlite3.connect(db._sqlite_path) as conn:
|
|
476
|
+
conn.execute("""
|
|
477
|
+
INSERT OR REPLACE INTO tensor_shards
|
|
478
|
+
(id, model_id, layer_id, shard_id, total_shards, node_url, grpc_addr,
|
|
479
|
+
available_memory_mb, current_load, last_seen, node_token)
|
|
480
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
481
|
+
""", (shard_id_key, shard.model_id, shard.layer_id, shard.shard_id,
|
|
482
|
+
shard.total_shards, node_url, shard.grpc_addr,
|
|
483
|
+
shard.available_memory_mb, shard.current_load, now, shard.node_token))
|
|
484
|
+
|
|
485
|
+
return {"status": "registered", "shard_key": shard_id_key}
|
|
486
|
+
|
|
487
|
+
@app.get("/tensor_shards")
|
|
488
|
+
async def get_tensor_shards(
|
|
489
|
+
model_id: str,
|
|
490
|
+
layer_id: int,
|
|
491
|
+
total_shards: int
|
|
492
|
+
):
|
|
493
|
+
"""Get all tensor shards for a specific layer."""
|
|
494
|
+
now = time.time()
|
|
495
|
+
|
|
496
|
+
import sqlite3
|
|
497
|
+
with sqlite3.connect(db._sqlite_path) as conn:
|
|
498
|
+
cursor = conn.execute("""
|
|
499
|
+
SELECT shard_id, grpc_addr, available_memory_mb, current_load, last_seen, node_url
|
|
500
|
+
FROM tensor_shards
|
|
501
|
+
WHERE model_id = ? AND layer_id = ? AND total_shards = ? AND last_seen > ?
|
|
502
|
+
ORDER BY shard_id
|
|
503
|
+
""", (model_id, layer_id, total_shards, now - 120)) # 2 minute timeout
|
|
504
|
+
|
|
505
|
+
shards = []
|
|
506
|
+
for row in cursor:
|
|
507
|
+
shards.append({
|
|
508
|
+
"shard_id": row[0],
|
|
509
|
+
"grpc_addr": row[1],
|
|
510
|
+
"available_memory_mb": row[2],
|
|
511
|
+
"current_load": row[3],
|
|
512
|
+
"last_seen": row[4],
|
|
513
|
+
"node_url": row[5]
|
|
514
|
+
})
|
|
515
|
+
|
|
516
|
+
return {"shards": shards, "total_found": len(shards)}
|
|
517
|
+
|
|
518
|
+
@app.get("/tensor_shards/coverage")
|
|
519
|
+
async def get_tensor_shard_coverage(model_id: str):
|
|
520
|
+
"""Get tensor shard coverage for a model."""
|
|
521
|
+
now = time.time()
|
|
522
|
+
|
|
523
|
+
import sqlite3
|
|
524
|
+
with sqlite3.connect(db._sqlite_path) as conn:
|
|
525
|
+
# Get all active shards for this model
|
|
526
|
+
cursor = conn.execute("""
|
|
527
|
+
SELECT layer_id, shard_id, total_shards, COUNT(*) as node_count
|
|
528
|
+
FROM tensor_shards
|
|
529
|
+
WHERE model_id = ? AND last_seen > ?
|
|
530
|
+
GROUP BY layer_id, shard_id, total_shards
|
|
531
|
+
""", (model_id, now - 120))
|
|
532
|
+
|
|
533
|
+
coverage = {}
|
|
534
|
+
for row in cursor:
|
|
535
|
+
layer_id, shard_id, total_shards, node_count = row
|
|
536
|
+
key = f"layer_{layer_id}"
|
|
537
|
+
if key not in coverage:
|
|
538
|
+
coverage[key] = {"layer_id": layer_id, "shards": {}}
|
|
539
|
+
coverage[key]["shards"][shard_id] = {
|
|
540
|
+
"shard_id": shard_id,
|
|
541
|
+
"total_shards": total_shards,
|
|
542
|
+
"node_count": node_count
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
# Check completeness
|
|
546
|
+
complete_layers = []
|
|
547
|
+
incomplete_layers = []
|
|
548
|
+
|
|
549
|
+
for layer_key, layer_info in coverage.items():
|
|
550
|
+
shards = layer_info["shards"]
|
|
551
|
+
if shards:
|
|
552
|
+
total = list(shards.values())[0]["total_shards"]
|
|
553
|
+
if len(shards) == total and all(s["node_count"] > 0 for s in shards.values()):
|
|
554
|
+
complete_layers.append(layer_info["layer_id"])
|
|
555
|
+
else:
|
|
556
|
+
incomplete_layers.append(layer_info["layer_id"])
|
|
557
|
+
|
|
558
|
+
return {
|
|
559
|
+
"model_id": model_id,
|
|
560
|
+
"coverage": coverage,
|
|
561
|
+
"complete_layers": complete_layers,
|
|
562
|
+
"incomplete_layers": incomplete_layers,
|
|
563
|
+
"is_inference_ready": len(incomplete_layers) == 0 and len(complete_layers) > 0
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
# --- Phase 4: Model Registry Endpoints ---
|
|
567
|
+
|
|
568
|
+
class ModelRegistration(BaseModel):
    """Payload for /models/register."""
    model_id: str
    name: str
    family: str  # only "neurollm" is auto-approved (see register_model)
    num_layers: int
    hidden_dim: int
    total_size_mb: float
    required_stake: float = 0.0
    node_token: Optional[str] = None  # recorded as proposer_token
|
|
577
|
+
|
|
578
|
+
@app.post("/models/register")
|
|
579
|
+
async def register_model(model: ModelRegistration):
|
|
580
|
+
"""Register a new model (requires stake for custom models)."""
|
|
581
|
+
now = time.time()
|
|
582
|
+
|
|
583
|
+
import sqlite3
|
|
584
|
+
with sqlite3.connect(db._sqlite_path) as conn:
|
|
585
|
+
# Check if already exists
|
|
586
|
+
existing = conn.execute(
|
|
587
|
+
"SELECT 1 FROM models WHERE model_id = ?",
|
|
588
|
+
(model.model_id,)
|
|
589
|
+
).fetchone()
|
|
590
|
+
|
|
591
|
+
if existing:
|
|
592
|
+
return {"status": "exists", "model_id": model.model_id}
|
|
593
|
+
|
|
594
|
+
# Only NeuroLLM is supported - this is a decentralized network for our model
|
|
595
|
+
approved = model.family == "neurollm"
|
|
596
|
+
|
|
597
|
+
conn.execute("""
|
|
598
|
+
INSERT INTO models
|
|
599
|
+
(model_id, name, family, num_layers, hidden_dim, total_size_mb,
|
|
600
|
+
required_stake, approved, proposer_token, created_at)
|
|
601
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
602
|
+
""", (model.model_id, model.name, model.family, model.num_layers,
|
|
603
|
+
model.hidden_dim, model.total_size_mb, model.required_stake,
|
|
604
|
+
approved, model.node_token, now))
|
|
605
|
+
|
|
606
|
+
return {"status": "registered", "model_id": model.model_id, "approved": approved}
|
|
607
|
+
|
|
608
|
+
@app.get("/models")
|
|
609
|
+
async def list_models(approved_only: bool = True, family: Optional[str] = None):
|
|
610
|
+
"""List available models."""
|
|
611
|
+
import sqlite3
|
|
612
|
+
with sqlite3.connect(db._sqlite_path) as conn:
|
|
613
|
+
query = "SELECT * FROM models WHERE 1=1"
|
|
614
|
+
params = []
|
|
615
|
+
|
|
616
|
+
if approved_only:
|
|
617
|
+
query += " AND approved = 1"
|
|
618
|
+
|
|
619
|
+
if family:
|
|
620
|
+
query += " AND family = ?"
|
|
621
|
+
params.append(family)
|
|
622
|
+
|
|
623
|
+
query += " ORDER BY total_size_mb"
|
|
624
|
+
|
|
625
|
+
cursor = conn.execute(query, params)
|
|
626
|
+
models = []
|
|
627
|
+
for row in cursor:
|
|
628
|
+
models.append({
|
|
629
|
+
"model_id": row[0],
|
|
630
|
+
"name": row[1],
|
|
631
|
+
"family": row[2],
|
|
632
|
+
"num_layers": row[3],
|
|
633
|
+
"hidden_dim": row[4],
|
|
634
|
+
"total_size_mb": row[5],
|
|
635
|
+
"required_stake": row[6],
|
|
636
|
+
"approved": bool(row[7])
|
|
637
|
+
})
|
|
638
|
+
|
|
639
|
+
return {"models": models}
|
|
640
|
+
|
|
641
|
+
@app.get("/models/{model_id}/status")
|
|
642
|
+
async def get_model_status(model_id: str):
|
|
643
|
+
"""Get status of a specific model in the network."""
|
|
644
|
+
now = time.time()
|
|
645
|
+
|
|
646
|
+
import sqlite3
|
|
647
|
+
with sqlite3.connect(db._sqlite_path) as conn:
|
|
648
|
+
# Get model info
|
|
649
|
+
model_row = conn.execute(
|
|
650
|
+
"SELECT * FROM models WHERE model_id = ?",
|
|
651
|
+
(model_id,)
|
|
652
|
+
).fetchone()
|
|
653
|
+
|
|
654
|
+
if not model_row:
|
|
655
|
+
raise HTTPException(status_code=404, detail="Model not found")
|
|
656
|
+
|
|
657
|
+
# Get pipeline coverage (from peers)
|
|
658
|
+
peers = await db.get_peers(layer_needed=None, shard_range=None, limit=1000, now=now)
|
|
659
|
+
|
|
660
|
+
# Filter peers serving this model (for now, assume all peers serve neurollm)
|
|
661
|
+
# In production, peers would announce which model they serve
|
|
662
|
+
|
|
663
|
+
layer_coverage = {}
|
|
664
|
+
for peer in peers:
|
|
665
|
+
shard_range = peer.get("shard_range", "0-0")
|
|
666
|
+
try:
|
|
667
|
+
start, end = map(int, shard_range.split("-"))
|
|
668
|
+
for layer in range(start, end + 1):
|
|
669
|
+
if layer not in layer_coverage:
|
|
670
|
+
layer_coverage[layer] = 0
|
|
671
|
+
layer_coverage[layer] += 1
|
|
672
|
+
except:
|
|
673
|
+
continue
|
|
674
|
+
|
|
675
|
+
# Get tensor shard coverage
|
|
676
|
+
tensor_cursor = conn.execute("""
|
|
677
|
+
SELECT layer_id, COUNT(DISTINCT shard_id) as shard_count,
|
|
678
|
+
MAX(total_shards) as total_shards
|
|
679
|
+
FROM tensor_shards
|
|
680
|
+
WHERE model_id = ? AND last_seen > ?
|
|
681
|
+
GROUP BY layer_id
|
|
682
|
+
""", (model_id, now - 120))
|
|
683
|
+
|
|
684
|
+
tensor_coverage = {}
|
|
685
|
+
for row in tensor_cursor:
|
|
686
|
+
tensor_coverage[row[0]] = {
|
|
687
|
+
"shards_available": row[1],
|
|
688
|
+
"total_shards": row[2],
|
|
689
|
+
"complete": row[1] == row[2]
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
return {
|
|
693
|
+
"model_id": model_id,
|
|
694
|
+
"name": model_row[1],
|
|
695
|
+
"family": model_row[2],
|
|
696
|
+
"num_layers": model_row[3],
|
|
697
|
+
"pipeline_coverage": layer_coverage,
|
|
698
|
+
"tensor_coverage": tensor_coverage,
|
|
699
|
+
"total_nodes": len(peers),
|
|
700
|
+
"is_fully_covered": all(
|
|
701
|
+
layer_coverage.get(i, 0) > 0
|
|
702
|
+
for i in range(model_row[3])
|
|
703
|
+
)
|
|
704
|
+
}
|
|
705
|
+
|
|
706
|
+
@app.get("/layer_coverage")
|
|
707
|
+
async def get_layer_coverage():
|
|
708
|
+
"""
|
|
709
|
+
Get the current distribution of nodes across transformer layers.
|
|
710
|
+
Used by DynamicShardManager to determine optimal layer allocation.
|
|
711
|
+
"""
|
|
712
|
+
now = time.time()
|
|
713
|
+
|
|
714
|
+
# Get all active peers
|
|
715
|
+
peers = await db.get_peers(layer_needed=None, shard_range=None, limit=1000, now=now)
|
|
716
|
+
|
|
717
|
+
# Count nodes per layer (GPT-2 has 12 layers: 0-11)
|
|
718
|
+
TOTAL_LAYERS = 12
|
|
719
|
+
layer_coverage = {i: {"layer_id": i, "node_count": 0, "nodes": []} for i in range(TOTAL_LAYERS)}
|
|
720
|
+
|
|
721
|
+
for peer in peers:
|
|
722
|
+
shard_range = peer.get("shard_range", "0-0")
|
|
723
|
+
try:
|
|
724
|
+
start, end = map(int, shard_range.split("-"))
|
|
725
|
+
# Each node covers layers from start to end (inclusive based on convention)
|
|
726
|
+
for layer in range(start, min(end + 1, TOTAL_LAYERS)):
|
|
727
|
+
layer_coverage[layer]["node_count"] += 1
|
|
728
|
+
layer_coverage[layer]["nodes"].append(peer.get("url", ""))
|
|
729
|
+
except:
|
|
730
|
+
continue
|
|
731
|
+
|
|
732
|
+
# Calculate statistics
|
|
733
|
+
node_counts = [lc["node_count"] for lc in layer_coverage.values()]
|
|
734
|
+
total_nodes = len(peers)
|
|
735
|
+
avg_coverage = sum(node_counts) / TOTAL_LAYERS if TOTAL_LAYERS > 0 else 0
|
|
736
|
+
min_coverage = min(node_counts) if node_counts else 0
|
|
737
|
+
max_coverage = max(node_counts) if node_counts else 0
|
|
738
|
+
|
|
739
|
+
# Find underserved layers (below average)
|
|
740
|
+
underserved = [
|
|
741
|
+
layer_id for layer_id, lc in layer_coverage.items()
|
|
742
|
+
if lc["node_count"] < avg_coverage
|
|
743
|
+
]
|
|
744
|
+
|
|
745
|
+
# Find critical layers (entry=0, exit=11)
|
|
746
|
+
entry_coverage = layer_coverage[0]["node_count"]
|
|
747
|
+
exit_coverage = layer_coverage[TOTAL_LAYERS - 1]["node_count"]
|
|
748
|
+
|
|
749
|
+
return {
|
|
750
|
+
"total_layers": TOTAL_LAYERS,
|
|
751
|
+
"total_active_nodes": total_nodes,
|
|
752
|
+
"layer_coverage": [
|
|
753
|
+
{
|
|
754
|
+
"layer_id": lc["layer_id"],
|
|
755
|
+
"node_count": lc["node_count"],
|
|
756
|
+
# Don't include full node list in response for privacy/size
|
|
757
|
+
}
|
|
758
|
+
for lc in layer_coverage.values()
|
|
759
|
+
],
|
|
760
|
+
"statistics": {
|
|
761
|
+
"avg_nodes_per_layer": round(avg_coverage, 2),
|
|
762
|
+
"min_coverage": min_coverage,
|
|
763
|
+
"max_coverage": max_coverage,
|
|
764
|
+
"underserved_layers": underserved,
|
|
765
|
+
"entry_layer_nodes": entry_coverage,
|
|
766
|
+
"exit_layer_nodes": exit_coverage
|
|
767
|
+
},
|
|
768
|
+
"health": {
|
|
769
|
+
"has_full_coverage": min_coverage > 0,
|
|
770
|
+
"is_balanced": (max_coverage - min_coverage) <= avg_coverage if avg_coverage > 0 else True,
|
|
771
|
+
"entry_healthy": entry_coverage >= 1,
|
|
772
|
+
"exit_healthy": exit_coverage >= 1
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
@app.get("/network_architecture")
|
|
778
|
+
async def get_network_architecture():
|
|
779
|
+
"""
|
|
780
|
+
Get the current network-wide architecture.
|
|
781
|
+
|
|
782
|
+
This endpoint helps nodes rejoin the network with the correct architecture.
|
|
783
|
+
The architecture is determined by querying active nodes and finding consensus.
|
|
784
|
+
|
|
785
|
+
Returns the architecture used by the majority of active nodes.
|
|
786
|
+
"""
|
|
787
|
+
import requests
|
|
788
|
+
import asyncio
|
|
789
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
790
|
+
|
|
791
|
+
now = time.time()
|
|
792
|
+
|
|
793
|
+
# Get active peers
|
|
794
|
+
peers = await db.get_peers(layer_needed=None, shard_range=None, limit=50, now=now)
|
|
795
|
+
|
|
796
|
+
if not peers:
|
|
797
|
+
return {
|
|
798
|
+
"status": "no_peers",
|
|
799
|
+
"message": "No active peers - network is bootstrapping",
|
|
800
|
+
"hidden_dim": None,
|
|
801
|
+
}
|
|
802
|
+
|
|
803
|
+
# Query architecture from active peers (using requests in thread pool)
|
|
804
|
+
def query_peer_arch(peer_url: str):
|
|
805
|
+
"""Query a single peer for its architecture (blocking)."""
|
|
806
|
+
try:
|
|
807
|
+
response = requests.get(f"{peer_url}/api/node/architecture", timeout=3.0)
|
|
808
|
+
if response.status_code == 200:
|
|
809
|
+
arch_data = response.json()
|
|
810
|
+
if arch_data.get("hidden_dim"):
|
|
811
|
+
return arch_data
|
|
812
|
+
except Exception:
|
|
813
|
+
pass
|
|
814
|
+
return None
|
|
815
|
+
|
|
816
|
+
# Run queries in thread pool to not block the event loop
|
|
817
|
+
architectures = []
|
|
818
|
+
loop = asyncio.get_event_loop()
|
|
819
|
+
with ThreadPoolExecutor(max_workers=5) as executor:
|
|
820
|
+
tasks = []
|
|
821
|
+
for peer in peers[:10]: # Sample up to 10 peers
|
|
822
|
+
peer_url = peer.get("url", "")
|
|
823
|
+
if peer_url:
|
|
824
|
+
tasks.append(loop.run_in_executor(executor, query_peer_arch, peer_url))
|
|
825
|
+
|
|
826
|
+
if tasks:
|
|
827
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
828
|
+
architectures = [r for r in results if r and not isinstance(r, Exception)]
|
|
829
|
+
|
|
830
|
+
if not architectures:
|
|
831
|
+
return {
|
|
832
|
+
"status": "unavailable",
|
|
833
|
+
"message": "Could not query peer architectures",
|
|
834
|
+
"hidden_dim": None,
|
|
835
|
+
}
|
|
836
|
+
|
|
837
|
+
# Find consensus (majority architecture)
|
|
838
|
+
# Group by (hidden_dim, num_heads, num_kv_heads) tuple
|
|
839
|
+
arch_counts = {}
|
|
840
|
+
for arch in architectures:
|
|
841
|
+
key = (arch["hidden_dim"], arch["num_heads"], arch["num_kv_heads"])
|
|
842
|
+
if key not in arch_counts:
|
|
843
|
+
arch_counts[key] = {"count": 0, "data": arch}
|
|
844
|
+
arch_counts[key]["count"] += 1
|
|
845
|
+
|
|
846
|
+
# Find the most common architecture
|
|
847
|
+
consensus = max(arch_counts.values(), key=lambda x: x["count"])
|
|
848
|
+
consensus_arch = consensus["data"]
|
|
849
|
+
|
|
850
|
+
return {
|
|
851
|
+
"status": "ok",
|
|
852
|
+
"hidden_dim": consensus_arch["hidden_dim"],
|
|
853
|
+
"intermediate_dim": consensus_arch.get("intermediate_dim"),
|
|
854
|
+
"num_layers": consensus_arch.get("num_layers"),
|
|
855
|
+
"num_heads": consensus_arch["num_heads"],
|
|
856
|
+
"num_kv_heads": consensus_arch["num_kv_heads"],
|
|
857
|
+
"estimated_params": consensus_arch.get("estimated_params"),
|
|
858
|
+
"consensus_peers": consensus["count"],
|
|
859
|
+
"total_peers_sampled": len(architectures),
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
if __name__ == "__main__":
|
|
864
|
+
uvicorn.run(app, host="0.0.0.0", port=3000)
|