wolverine-ai 3.5.0 → 3.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +5 -0
- package/package.json +1 -1
- package/server/config/settings.json +18 -1
- package/server/lib/gpu-fleet.js +313 -0
- package/server/routes/fleet.js +167 -0
- package/server/routes/inference.js +329 -0
- package/src/agent/agent-engine.js +113 -4
- package/src/brain/brain.js +1 -1
- package/src/brain/embedder.js +1 -1
- package/src/brain/function-map.js +15 -1
- package/src/core/ai-client.js +22 -1
- package/src/core/error-parser.js +2 -2
- package/src/core/models.js +8 -1
- package/src/core/runner.js +29 -3
- package/src/dashboard/server.js +2 -2
- package/src/logger/pricing.js +8 -0
- package/src/logger/token-tracker.js +47 -5
- package/src/monitor/perf-monitor.js +1 -1
- package/src/notifications/notifier.js +1 -1
- package/src/platform/telemetry.js +2 -1
- package/src/security/injection-detector.js +1 -1
package/.env.example
CHANGED
|
@@ -6,6 +6,11 @@
|
|
|
6
6
|
# Your OpenAI API key (required)
|
|
7
7
|
OPENAI_API_KEY=
|
|
8
8
|
ANTHROPIC_API_KEY=
|
|
9
|
+
|
|
10
|
+
# ── Wolverine Inference (self-hosted models) ─────────────────────
|
|
11
|
+
# Get your API key at wolverinenode.xyz — $1 = 100 credits
|
|
12
|
+
# Set provider to "wolverine" in server/config/settings.json
|
|
13
|
+
WOLVERINE_API_KEY=
|
|
9
14
|
# ── Dashboard Admin Key (make your own) ──────────────────────────────────────────
|
|
10
15
|
# Required for the agent command interface on the dashboard.
|
|
11
16
|
# Generate: node -e "console.log(require('crypto').randomBytes(32).toString('hex'))"
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "wolverine-ai",
|
|
3
|
-
"version": "3.
|
|
3
|
+
"version": "3.6.1",
|
|
4
4
|
"description": "Self-healing Node.js server framework powered by AI. Catches crashes, diagnoses errors, generates fixes, verifies, and restarts — automatically.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
"env": "development"
|
|
6
6
|
},
|
|
7
7
|
|
|
8
|
-
"provider": "
|
|
8
|
+
"provider": "wolverine",
|
|
9
9
|
|
|
10
10
|
"openai_settings": {
|
|
11
11
|
"reasoning": "gpt-5.4-mini",
|
|
@@ -43,6 +43,18 @@
|
|
|
43
43
|
"embedding": "text-embedding-3-small"
|
|
44
44
|
},
|
|
45
45
|
|
|
46
|
+
"wolverine_settings": {
|
|
47
|
+
"reasoning": "wolverine-test-1",
|
|
48
|
+
"coding": "wolverine-test-1",
|
|
49
|
+
"chat": "wolverine-test-1",
|
|
50
|
+
"tool": "wolverine-test-1",
|
|
51
|
+
"classifier": "wolverine-test-1",
|
|
52
|
+
"audit": "wolverine-test-1",
|
|
53
|
+
"compacting": "wolverine-test-1",
|
|
54
|
+
"research": "wolverine-test-1",
|
|
55
|
+
"embedding": "text-embedding-3-small"
|
|
56
|
+
},
|
|
57
|
+
|
|
46
58
|
"server": {
|
|
47
59
|
"port": 3000,
|
|
48
60
|
"maxRetries": 3,
|
|
@@ -84,6 +96,11 @@
|
|
|
84
96
|
"intervalMs": 300000
|
|
85
97
|
},
|
|
86
98
|
|
|
99
|
+
"platform": {
|
|
100
|
+
"apiKey": "",
|
|
101
|
+
"cors": ["http://localhost:3000"]
|
|
102
|
+
},
|
|
103
|
+
|
|
87
104
|
"dashboard": {},
|
|
88
105
|
|
|
89
106
|
"cors": {
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
const https = require("https");
|
|
2
|
+
const http = require("http");
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* GPU Fleet Manager — controls Vast.ai GPU instances for inference.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - Start/stop individual GPUs via Vast API
|
|
9
|
+
* - Health monitoring and auto-discovery
|
|
10
|
+
* - Round-robin routing across active GPUs
|
|
11
|
+
* - Auto-scale: start burst GPUs when queue grows, stop when idle
|
|
12
|
+
* - Cold start tracking (~5s per GPU)
|
|
13
|
+
*
|
|
14
|
+
* Each GPU instance runs llama.cpp with --api-key for security.
|
|
15
|
+
* Only the EC2 backend has the internal keys.
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
const VAST_API = "https://cloud.vast.ai/api/v0";
|
|
19
|
+
const VAST_KEY = process.env.VAST_API_KEY || "";
|
|
20
|
+
const POLL_INTERVAL_MS = 30000; // health check every 30s
|
|
21
|
+
const IDLE_STOP_MS = parseInt(process.env.GPU_IDLE_STOP_MS, 10) || 300000; // 5 min idle → stop
|
|
22
|
+
const SCALE_UP_QUEUE = parseInt(process.env.GPU_SCALE_UP_QUEUE, 10) || 3; // start burst GPU when 3+ queued
|
|
23
|
+
|
|
24
|
+
class GpuFleet {
  /**
   * GPU Fleet Manager — controls Vast.ai GPU instances for inference.
   * Tracks a registry of GPU workers, health-checks them, hands them out
   * round-robin, and starts/stops burst instances via the Vast.ai API.
   *
   * @param {object} [config] - Reserved for future options; currently unused.
   */
  constructor(config = {}) {
    // GPU registry: instanceId → { host, port, key, status, lastUsed, lastHealth, model, ... }
    this.gpus = new Map();
    this._roundRobinIndex = 0;
    this._pollTimer = null;
    this._scaleTimer = null;
    this._requestQueue = [];
    this._activeRequests = 0;
  }

  /**
   * Register a GPU instance in the fleet.
   *
   * @param {string|number} instanceId - Fleet-local id (stored as a string).
   * @param {object} opts
   * @param {string} opts.host - Hostname/IP of the llama.cpp server.
   * @param {string|number} opts.port - Port (coerced with parseInt).
   * @param {string} opts.key - Internal API key for the GPU's server.
   * @param {string} [opts.model="wolverine-test-1"]
   * @param {string} [opts.role="general"]
   * @param {boolean} [opts.autoStop=true] - Eligible for idle shutdown.
   * @returns {GpuFleet} this, for chaining.
   */
  register(instanceId, { host, port, key, model = "wolverine-test-1", role = "general", autoStop = true }) {
    this.gpus.set(String(instanceId), {
      instanceId: String(instanceId),
      host,
      port: parseInt(port, 10),
      key,
      model,
      role,
      autoStop,
      status: "unknown", // unknown | starting | healthy | unhealthy | stopped
      lastUsed: 0,
      lastHealth: null,
      coldStartMs: null,
    });
    return this;
  }

  /**
   * Load the primary GPU's config from environment variables
   * (WOLVERINE_INFERENCE_URL / WOLVERINE_GPU_KEY). Best-effort: a bad URL
   * is logged, not thrown.
   *
   * @returns {GpuFleet} this, for chaining.
   */
  loadFromEnv() {
    const url = process.env.WOLVERINE_INFERENCE_URL;
    const key = process.env.WOLVERINE_GPU_KEY;
    if (url && key) {
      try {
        const parsed = new URL(url);
        const instanceId = process.env.WOLVERINE_GPU_INSTANCE_ID || "primary";
        this.register(instanceId, {
          host: parsed.hostname,
          // Fall back to the protocol's standard port when the URL omits one
          // (the original hard-coded 80 even for https:// URLs).
          port: parseInt(parsed.port, 10) || (parsed.protocol === "https:" ? 443 : 80),
          key,
          role: "primary",
          autoStop: false, // primary stays on
        });
      } catch (err) {
        // Don't crash startup on a malformed URL, but don't hide it either.
        console.log("[GPU Fleet] Invalid WOLVERINE_INFERENCE_URL:", err.message);
      }
    }
    return this;
  }

  /**
   * Load GPU config from the gpu_fleet table, creating it on first run.
   * Best-effort: DB errors are logged and the fleet stays usable.
   *
   * @param {object} pool - pg-style pool with a query(sql, params) method.
   * @returns {Promise<GpuFleet>} this, for chaining.
   */
  async loadFromDb(pool) {
    try {
      // Check if gpu_fleet table exists
      const exists = await pool.query(
        "SELECT 1 FROM information_schema.tables WHERE table_name = 'gpu_fleet' LIMIT 1"
      );
      if (exists.rows.length === 0) {
        await pool.query(`
          CREATE TABLE gpu_fleet (
            instance_id TEXT PRIMARY KEY,
            vast_id TEXT,
            host TEXT NOT NULL,
            port INTEGER NOT NULL DEFAULT 8080,
            internal_key TEXT NOT NULL,
            model TEXT DEFAULT 'wolverine-test-1',
            role TEXT DEFAULT 'general',
            auto_stop BOOLEAN DEFAULT true,
            status TEXT DEFAULT 'stopped',
            gpu_name TEXT,
            created_at TIMESTAMPTZ DEFAULT NOW()
          )
        `);
      }
      const { rows } = await pool.query("SELECT * FROM gpu_fleet");
      for (const r of rows) {
        this.register(r.instance_id, {
          host: r.host, port: r.port, key: r.internal_key,
          model: r.model, role: r.role, autoStop: r.auto_stop,
        });
        const gpu = this.gpus.get(String(r.instance_id));
        if (gpu) {
          // Restore persisted state that register() doesn't know about.
          gpu.status = r.status;
          gpu.vastId = r.vast_id;
          gpu.gpuName = r.gpu_name;
        }
      }
    } catch (err) {
      console.log("[GPU Fleet] DB load failed:", err.message);
    }
    return this;
  }

  /**
   * Start periodic health polling (every POLL_INTERVAL_MS), plus one
   * immediate check. Idempotent.
   *
   * @returns {GpuFleet} this, for chaining.
   */
  startPolling() {
    if (this._pollTimer) return;
    this._pollTimer = setInterval(() => this._healthCheck(), POLL_INTERVAL_MS);
    this._healthCheck(); // immediate first check
    return this;
  }

  /** Stop periodic health polling. Safe to call when not polling. */
  stopPolling() {
    if (this._pollTimer) { clearInterval(this._pollTimer); this._pollTimer = null; }
  }

  /**
   * Get a healthy GPU for inference (round-robin) and mark it used.
   *
   * @returns {{host: string, port: number, key: string, instanceId: string, model: string}|null}
   *   null when no GPU is currently healthy.
   */
  getAvailable() {
    const healthy = Array.from(this.gpus.values()).filter(g => g.status === "healthy");
    if (healthy.length === 0) return null;
    this._roundRobinIndex = (this._roundRobinIndex + 1) % healthy.length;
    const gpu = healthy[this._roundRobinIndex];
    gpu.lastUsed = Date.now();
    return { host: gpu.host, port: gpu.port, key: gpu.key, instanceId: gpu.instanceId, model: gpu.model };
  }

  /**
   * Start a stopped GPU instance via the Vast API, then poll its /v1/models
   * endpoint until it answers (max ~60s: 120 probes × 500ms).
   *
   * @param {string|number} instanceId
   * @returns {Promise<object>} the GPU record, with coldStartMs populated.
   * @throws {Error} if the GPU is unknown, the Vast call fails, or it never
   *   becomes healthy within the window.
   */
  async startGpu(instanceId) {
    const gpu = this.gpus.get(String(instanceId));
    if (!gpu) throw new Error(`GPU ${instanceId} not registered`);
    if (gpu.status === "healthy" || gpu.status === "starting") return gpu;

    gpu.status = "starting";
    gpu.coldStartMs = null;
    const startTime = Date.now();

    const vastId = gpu.vastId || instanceId;
    try {
      await this._vastApi("PUT", `/instances/${vastId}/`, { state: "running" });

      // Poll until healthy (max 60s)
      for (let i = 0; i < 120; i++) {
        await new Promise(r => setTimeout(r, 500));
        try {
          const res = await this._httpGet(gpu.host, gpu.port, "/v1/models", gpu.key);
          // FIX: use the same readiness predicate as _healthCheck(). The
          // original only accepted "gemma" here, so a GPU serving a
          // wolverine-named model could never complete a cold start even
          // though routine health checks would have accepted it.
          if (this._modelListOk(res)) {
            gpu.status = "healthy";
            gpu.coldStartMs = Date.now() - startTime;
            gpu.lastHealth = Date.now();
            console.log(`[GPU Fleet] ${instanceId} started in ${gpu.coldStartMs}ms`);
            return gpu;
          }
        } catch {}
      }
      gpu.status = "unhealthy";
      throw new Error(`GPU ${instanceId} failed to start within 60s`);
    } catch (err) {
      gpu.status = "unhealthy";
      throw err;
    }
  }

  /**
   * Stop a GPU instance via the Vast API. Best-effort: failures are logged
   * and the in-memory status is left unchanged.
   *
   * @param {string|number} instanceId
   * @returns {Promise<object>} the GPU record.
   * @throws {Error} only if the GPU is not registered.
   */
  async stopGpu(instanceId) {
    const gpu = this.gpus.get(String(instanceId));
    if (!gpu) throw new Error(`GPU ${instanceId} not registered`);

    const vastId = gpu.vastId || instanceId;
    try {
      await this._vastApi("PUT", `/instances/${vastId}/`, { state: "stopped" });
      gpu.status = "stopped";
      console.log(`[GPU Fleet] ${instanceId} stopped`);
    } catch (err) {
      console.log(`[GPU Fleet] Stop failed for ${instanceId}:`, err.message);
    }
    return gpu;
  }

  /**
   * Auto-scale: start one stopped burst GPU when the queue reaches
   * SCALE_UP_QUEUE, and stop healthy burst GPUs idle longer than IDLE_STOP_MS.
   *
   * @param {number} queueLength - Current inference queue depth.
   */
  async autoScale(queueLength) {
    // Scale up: start a stopped GPU if queue is long
    if (queueLength >= SCALE_UP_QUEUE) {
      const stopped = Array.from(this.gpus.values()).find(g => g.status === "stopped" && g.autoStop);
      if (stopped) {
        console.log(`[GPU Fleet] Queue at ${queueLength}, starting burst GPU ${stopped.instanceId}`);
        try { await this.startGpu(stopped.instanceId); } catch (e) { console.log("[GPU Fleet] Scale-up failed:", e.message); }
      }
    }

    // Scale down: stop idle burst GPUs
    const now = Date.now();
    for (const gpu of this.gpus.values()) {
      if (gpu.autoStop && gpu.status === "healthy" && gpu.lastUsed > 0 && (now - gpu.lastUsed) > IDLE_STOP_MS) {
        console.log(`[GPU Fleet] ${gpu.instanceId} idle for ${Math.round((now - gpu.lastUsed) / 1000)}s, stopping`);
        try { await this.stopGpu(gpu.instanceId); } catch {}
      }
    }
  }

  /**
   * Get fleet status for dashboard/API: per-GPU snapshots plus counts.
   *
   * @returns {{total: number, healthy: number, stopped: number, starting: number, gpus: object[]}}
   */
  getStatus() {
    const gpus = Array.from(this.gpus.values()).map(g => ({
      instanceId: g.instanceId,
      vastId: g.vastId,
      gpuName: g.gpuName,
      host: g.host,
      port: g.port,
      model: g.model,
      role: g.role,
      status: g.status,
      autoStop: g.autoStop,
      lastUsed: g.lastUsed ? new Date(g.lastUsed).toISOString() : null,
      lastHealth: g.lastHealth ? new Date(g.lastHealth).toISOString() : null,
      coldStartMs: g.coldStartMs,
    }));
    return {
      total: gpus.length,
      healthy: gpus.filter(g => g.status === "healthy").length,
      stopped: gpus.filter(g => g.status === "stopped").length,
      starting: gpus.filter(g => g.status === "starting").length,
      gpus,
    };
  }

  // ── Private ──

  /**
   * Shared readiness predicate for a /v1/models response body: the server is
   * considered up when the raw body mentions a known model family.
   */
  _modelListOk(res) {
    return Boolean(res) && (res.includes("gemma") || res.includes("wolverine"));
  }

  /** Probe every non-stopped, non-starting GPU and update its status. */
  async _healthCheck() {
    for (const gpu of this.gpus.values()) {
      if (gpu.status === "stopped" || gpu.status === "starting") continue;
      try {
        const res = await this._httpGet(gpu.host, gpu.port, "/v1/models", gpu.key);
        gpu.status = this._modelListOk(res) ? "healthy" : "unhealthy";
        if (gpu.status === "healthy") gpu.lastHealth = Date.now();
      } catch {
        gpu.status = "unhealthy";
      }
    }
  }

  /**
   * Minimal Vast.ai REST call (15s timeout). Resolves with parsed JSON, or
   * { raw } when the body isn't JSON; rejects on network error/timeout.
   */
  _vastApi(method, path, body) {
    return new Promise((resolve, reject) => {
      const bodyStr = body ? JSON.stringify(body) : null;
      const req = https.request({
        hostname: "cloud.vast.ai",
        path: `/api/v0${path}`,
        method,
        timeout: 15000,
        headers: {
          "Authorization": `Bearer ${VAST_KEY}`,
          "Content-Type": "application/json",
          ...(bodyStr ? { "Content-Length": Buffer.byteLength(bodyStr) } : {}),
        },
      }, (res) => {
        let data = "";
        res.on("data", c => { data += c; });
        res.on("end", () => {
          try { resolve(JSON.parse(data)); } catch { resolve({ raw: data }); }
        });
      });
      req.on("error", reject);
      req.on("timeout", () => { req.destroy(); reject(new Error("Vast API timeout")); });
      if (bodyStr) req.write(bodyStr);
      req.end();
    });
  }

  /**
   * Plain HTTP GET against a GPU worker (5s timeout). Resolves with the raw
   * response body string; rejects on network error/timeout.
   */
  _httpGet(host, port, path, key) {
    return new Promise((resolve, reject) => {
      const req = http.request({
        hostname: host, port, path, method: "GET", timeout: 5000,
        headers: key ? { "Authorization": `Bearer ${key}` } : {},
      }, (res) => {
        let data = "";
        res.on("data", c => { data += c; });
        res.on("end", () => resolve(data));
      });
      req.on("error", reject);
      req.on("timeout", () => { req.destroy(); reject(new Error("timeout")); });
      req.end();
    });
  }
}
|
|
312
|
+
|
|
313
|
+
module.exports = { GpuFleet };
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* GPU Fleet Management API — admin routes for controlling inference GPUs.
|
|
3
|
+
*
|
|
4
|
+
* Endpoints:
|
|
5
|
+
* GET /status — fleet overview (all GPUs, health, queue)
|
|
6
|
+
* POST /start/:id — start a stopped GPU
|
|
7
|
+
* POST /stop/:id — stop a running GPU
|
|
8
|
+
* POST /register — add a new GPU to the fleet
|
|
9
|
+
* POST /remove/:id — remove a GPU from the fleet
|
|
10
|
+
* POST /scale — trigger auto-scale check
|
|
11
|
+
* GET /benchmark/:id — run inference benchmark on a GPU
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
/**
 * Fastify plugin registering the GPU fleet admin endpoints.
 * The fleet instance is attached to fastify as `fastify.gpuFleet` by index.js.
 *
 * @param {object} fastify - Fastify instance.
 */
async function routes(fastify) {
  const { pool } = require("../lib/db");
  const crypto = require("crypto");

  // Fleet instance is attached to fastify by index.js
  function getFleet() {
    return fastify.gpuFleet;
  }

  /**
   * Admin auth preHandler. Fails CLOSED:
   * - If no platform.apiKey is configured, every request is rejected. The
   *   original compared `token !== settings.platform?.apiKey`, which evaluated
   *   `undefined !== undefined` → false when the key was unset — i.e. an
   *   unconfigured key silently DISABLED auth on every fleet endpoint.
   * - Tokens are compared with crypto.timingSafeEqual to avoid leaking the
   *   key via timing differences.
   */
  async function requireAdmin(request, reply) {
    const settings = require("../config/settings.json");
    const expected = settings.platform?.apiKey;
    const token = request.headers.authorization?.replace("Bearer ", "") || request.headers["x-api-key"];
    if (!expected || !token) {
      return reply.code(401).send({ error: "Admin access required" });
    }
    const a = Buffer.from(String(token));
    const b = Buffer.from(String(expected));
    if (a.length !== b.length || !crypto.timingSafeEqual(a, b)) {
      return reply.code(401).send({ error: "Admin access required" });
    }
  }

  // GET /status — fleet overview
  fastify.get("/status", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    return fleet.getStatus();
  });

  // POST /start/:id — start a GPU
  fastify.post("/start/:id", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    const { id } = request.params;
    try {
      const gpu = await fleet.startGpu(id);
      // Update DB (best-effort; in-memory state is authoritative)
      await pool.query("UPDATE gpu_fleet SET status = 'healthy' WHERE instance_id = $1", [id]).catch(() => {});
      return { status: "started", instanceId: id, coldStartMs: gpu.coldStartMs };
    } catch (err) {
      return reply.code(500).send({ error: err.message });
    }
  });

  // POST /stop/:id — stop a GPU
  fastify.post("/stop/:id", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    const { id } = request.params;
    try {
      await fleet.stopGpu(id);
      await pool.query("UPDATE gpu_fleet SET status = 'stopped' WHERE instance_id = $1", [id]).catch(() => {});
      return { status: "stopped", instanceId: id };
    } catch (err) {
      return reply.code(500).send({ error: err.message });
    }
  });

  // POST /register — add a GPU to the fleet (upserts the DB row)
  fastify.post("/register", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    const { instanceId, vastId, host, port, key, model, role, gpuName, autoStop } = request.body || {};
    if (!instanceId || !host || !key) {
      return reply.code(400).send({ error: "instanceId, host, and key required" });
    }

    fleet.register(instanceId, { host, port: port || 8080, key, model, role, autoStop: autoStop !== false });
    // FIX: the registry keys GPUs by String(instanceId); the original looked
    // up with the raw value, so a numeric id from the JSON body missed the
    // just-registered entry and vastId/gpuName were dropped.
    const gpu = fleet.gpus.get(String(instanceId));
    if (vastId) gpu.vastId = vastId;
    if (gpuName) gpu.gpuName = gpuName;

    // Save to DB
    await pool.query(
      `INSERT INTO gpu_fleet (instance_id, vast_id, host, port, internal_key, model, role, auto_stop, gpu_name)
       VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
       ON CONFLICT (instance_id) DO UPDATE SET
         host = $3, port = $4, internal_key = $5, model = $6, role = $7, auto_stop = $8, gpu_name = $9, vast_id = $2`,
      [instanceId, vastId || null, host, port || 8080, key, model || "wolverine-test-1", role || "general", autoStop !== false, gpuName || null]
    );

    return { registered: instanceId, fleet: fleet.getStatus() };
  });

  // POST /remove/:id — remove a GPU from the fleet
  fastify.post("/remove/:id", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    const { id } = request.params;
    fleet.gpus.delete(id);
    await pool.query("DELETE FROM gpu_fleet WHERE instance_id = $1", [id]).catch(() => {});
    return { removed: id };
  });

  // POST /scale — trigger auto-scale
  fastify.post("/scale", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    const queueLength = request.body?.queueLength || 0;
    await fleet.autoScale(queueLength);
    return fleet.getStatus();
  });

  // GET /benchmark/:id — run three short prompts through the GPU and report
  // latency and tokens/sec per prompt plus the average throughput.
  fastify.get("/benchmark/:id", { preHandler: requireAdmin }, async (request, reply) => {
    const fleet = getFleet();
    const gpu = fleet.gpus.get(request.params.id);
    if (!gpu || gpu.status !== "healthy") {
      return reply.code(400).send({ error: "GPU not available" });
    }

    const http = require("http");
    const results = [];

    for (const prompt of ["2+2?", "Write isPrime in JS.", "Explain TCP in 1 sentence."]) {
      const start = Date.now();
      try {
        const body = JSON.stringify({
          model: gpu.model,
          messages: [{ role: "user", content: prompt }],
          max_tokens: 50, temperature: 0,
        });
        const res = await new Promise((resolve, reject) => {
          const req = http.request({
            hostname: gpu.host, port: gpu.port, path: "/v1/chat/completions",
            method: "POST", timeout: 30000,
            headers: { "Content-Type": "application/json", "Authorization": `Bearer ${gpu.key}`, "Content-Length": Buffer.byteLength(body) },
          }, (res) => {
            let data = "";
            res.on("data", c => { data += c; });
            res.on("end", () => { try { resolve(JSON.parse(data)); } catch { resolve(null); } });
          });
          req.on("error", reject);
          // FIX: the original set timeout: 30000 but never handled the
          // "timeout" event, so a hung GPU left this promise pending forever.
          req.on("timeout", () => { req.destroy(); reject(new Error("benchmark timeout")); });
          req.write(body);
          req.end();
        });

        const elapsed = Date.now() - start;
        const usage = res?.usage || {};
        const tokOut = usage.completion_tokens || 0;
        results.push({
          prompt: prompt.slice(0, 30),
          latencyMs: elapsed,
          tokensOut: tokOut,
          tokPerSec: tokOut > 0 ? Math.round(tokOut / (elapsed / 1000)) : 0,
          response: res?.choices?.[0]?.message?.content?.slice(0, 60),
        });
      } catch (err) {
        results.push({ prompt: prompt.slice(0, 30), error: err.message });
      }
    }

    const avgTokPerSec = results.filter(r => r.tokPerSec).reduce((s, r) => s + r.tokPerSec, 0) / Math.max(results.filter(r => r.tokPerSec).length, 1);

    return {
      instanceId: request.params.id,
      gpu: gpu.gpuName,
      model: gpu.model,
      results,
      avgTokPerSec: Math.round(avgTokPerSec),
    };
  });
}
|
|
166
|
+
|
|
167
|
+
module.exports = routes;
|