wolverine-ai 1.6.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "wolverine-ai",
3
- "version": "1.6.0",
3
+ "version": "1.6.1",
4
4
  "description": "Self-healing Node.js server framework powered by AI. Catches crashes, diagnoses errors, generates fixes, verifies, and restarts — automatically.",
5
5
  "main": "src/index.js",
6
6
  "bin": {
@@ -24,7 +24,7 @@
24
24
  },
25
25
 
26
26
  "cluster": {
27
- "mode": "auto",
27
+ "enabled": false,
28
28
  "workers": 0
29
29
  },
30
30
 
@@ -48,7 +48,7 @@
48
48
  },
49
49
 
50
50
  "errorMonitor": {
51
- "defaultThreshold": 3,
51
+ "defaultThreshold": 1,
52
52
  "windowMs": 30000,
53
53
  "cooldownMs": 60000
54
54
  },
package/server/index.js CHANGED
@@ -1,61 +1,85 @@
1
- const fastify = require("fastify")({ logger: false });
1
+ const cluster = require("cluster");
2
+ const os = require("os");
2
3
  const PORT = process.env.PORT || 3000;
3
4
 
4
- // Routes
5
- fastify.register(require("./routes/health"), { prefix: "/health" });
6
- fastify.register(require("./routes/api"), { prefix: "/api" });
7
- fastify.register(require("./routes/time"), { prefix: "/time" });
8
-
9
- // Root
10
- fastify.get("/", async () => ({
11
- name: "Wolverine Server",
12
- version: "1.0.0",
13
- status: "running",
14
- uptime: process.uptime(),
15
- }));
16
-
17
- // 404
18
- fastify.setNotFoundHandler((req, reply) => {
19
- reply.code(404).send({ error: "Not found", path: req.url });
20
- });
21
-
22
- // Error handler reports to Wolverine parent via IPC for auto-healing
23
- fastify.setErrorHandler((err, req, reply) => {
24
- console.error(`[ERROR] ${err.message}`);
25
- reply.code(500).send({ error: err.message });
26
-
27
- // Report to Wolverine via IPC (if running under wolverine)
28
- if (typeof process.send === "function") {
29
- try {
30
- // Extract file/line from stack trace
31
- let file = null, line = null;
32
- if (err.stack) {
33
- const frames = err.stack.split("\n");
34
- for (const frame of frames) {
35
- const m = frame.match(/\(([^)]+):(\d+):(\d+)\)/) || frame.match(/at\s+([^\s(]+):(\d+):(\d+)/);
36
- if (m && !m[1].includes("node_modules") && !m[1].includes("node:")) {
37
- file = m[1]; line = parseInt(m[2], 10); break;
5
+ // Cluster mode: master forks workers, workers run the server.
6
+ // Wolverine sets WOLVERINE_RECOMMENDED_WORKERS based on system detection.
7
+ // Set cluster.enabled=true in settings.json or WOLVERINE_CLUSTER=true to enable.
8
+ const clusterEnabled = process.env.WOLVERINE_CLUSTER === "true";
9
+ const workerCount = parseInt(process.env.WOLVERINE_RECOMMENDED_WORKERS, 10) || os.cpus().length;
10
+
11
+ if (clusterEnabled && cluster.isPrimary && workerCount > 1) {
12
+ console.log(`[CLUSTER] Primary ${process.pid} forking ${workerCount} workers`);
13
+ for (let i = 0; i < workerCount; i++) cluster.fork();
14
+
15
+ cluster.on("exit", (worker, code) => {
16
+ if (code !== 0) {
17
+ console.log(`[CLUSTER] Worker ${worker.process.pid} died (code ${code}), respawning...`);
18
+ cluster.fork();
19
+ }
20
+ });
21
+ } else {
22
+ // Single worker or cluster worker — run the server
23
+ const fastify = require("fastify")({ logger: false });
24
+
25
+ // Routes
26
+ fastify.register(require("./routes/health"), { prefix: "/health" });
27
+ fastify.register(require("./routes/api"), { prefix: "/api" });
28
+ fastify.register(require("./routes/time"), { prefix: "/time" });
29
+
30
+ // Root
31
+ fastify.get("/", async () => ({
32
+ name: "Wolverine Server",
33
+ version: "1.0.0",
34
+ status: "running",
35
+ uptime: process.uptime(),
36
+ pid: process.pid,
37
+ worker: cluster.isWorker ? cluster.worker.id : "primary",
38
+ }));
39
+
40
+ // 404
41
+ fastify.setNotFoundHandler((req, reply) => {
42
+ reply.code(404).send({ error: "Not found", path: req.url });
43
+ });
44
+
45
+ // Error handler — reports to Wolverine parent via IPC for auto-healing
46
+ fastify.setErrorHandler((err, req, reply) => {
47
+ console.error(`[ERROR] ${err.message}`);
48
+ reply.code(500).send({ error: err.message });
49
+
50
+ // Report to Wolverine via IPC (if running under wolverine)
51
+ if (typeof process.send === "function") {
52
+ try {
53
+ let file = null, line = null;
54
+ if (err.stack) {
55
+ const frames = err.stack.split("\n");
56
+ for (const frame of frames) {
57
+ const m = frame.match(/\(([^)]+):(\d+):(\d+)\)/) || frame.match(/at\s+([^\s(]+):(\d+):(\d+)/);
58
+ if (m && !m[1].includes("node_modules") && !m[1].includes("node:")) {
59
+ file = m[1]; line = parseInt(m[2], 10); break;
60
+ }
38
61
  }
39
62
  }
40
- }
41
- process.send({
42
- type: "route_error",
43
- path: req.url,
44
- method: req.method,
45
- statusCode: 500,
46
- message: err.message,
47
- stack: err.stack,
48
- file,
49
- line,
50
- timestamp: Date.now(),
51
- });
52
- } catch (_) { /* IPC send failed — non-fatal */ }
53
- }
54
- });
55
-
56
- fastify.listen({ port: PORT, host: "0.0.0.0" }, (err) => {
57
- if (err) { console.error(err); process.exit(1); }
58
- console.log(`Server running on http://localhost:${PORT}`);
59
- console.log(`Health: http://localhost:${PORT}/health`);
60
- console.log(`API: http://localhost:${PORT}/api`);
61
- });
63
+ process.send({
64
+ type: "route_error",
65
+ path: req.url,
66
+ method: req.method,
67
+ statusCode: 500,
68
+ message: err.message,
69
+ stack: err.stack,
70
+ file,
71
+ line,
72
+ timestamp: Date.now(),
73
+ });
74
+ } catch (_) { /* IPC send failed — non-fatal */ }
75
+ }
76
+ });
77
+
78
+ fastify.listen({ port: PORT, host: "0.0.0.0", reusePort: clusterEnabled }, (err) => {
79
+ if (err) { console.error(err); process.exit(1); }
80
+ const label = cluster.isWorker ? ` (worker ${cluster.worker.id})` : "";
81
+ console.log(`Server running on http://localhost:${PORT}${label}`);
82
+ console.log(`Health: http://localhost:${PORT}/health`);
83
+ console.log(`API: http://localhost:${PORT}/api`);
84
+ });
85
+ }
@@ -95,7 +95,7 @@ class WolverineRunner {
95
95
 
96
96
  // Error monitor — detects caught 500 errors without process crash
97
97
  this.errorMonitor = new ErrorMonitor({
98
- threshold: parseInt(process.env.WOLVERINE_ERROR_THRESHOLD, 10) || 3,
98
+ threshold: parseInt(process.env.WOLVERINE_ERROR_THRESHOLD, 10) || 1,
99
99
  windowMs: parseInt(process.env.WOLVERINE_ERROR_WINDOW_MS, 10) || 30000,
100
100
  cooldownMs: parseInt(process.env.WOLVERINE_ERROR_COOLDOWN_MS, 10) || 60000,
101
101
  logger: this.logger,
@@ -236,11 +236,11 @@ class WolverineRunner {
236
236
 
237
237
  oldChild.removeAllListeners("exit");
238
238
  oldChild.once("exit", onExit);
239
- oldChild.kill("SIGTERM");
239
+ this._killProcessTree(oldChild.pid, "SIGTERM");
240
240
 
241
241
  // Force kill if it doesn't exit in 3s
242
242
  setTimeout(() => {
243
- try { oldChild.kill("SIGKILL"); } catch {}
243
+ this._killProcessTree(oldChild.pid, "SIGKILL");
244
244
  onExit();
245
245
  }, 3000);
246
246
  } else {
@@ -278,13 +278,14 @@ class WolverineRunner {
278
278
 
279
279
  this.logger.info(EVENT_TYPES.PROCESS_STOP, "Wolverine stopped (graceful shutdown)");
280
280
 
281
- // Kill child — remove exit listener first so it doesn't trigger heal
281
+ // Kill child + all its descendants — remove exit listener first so it doesn't trigger heal
282
282
  if (this.child) {
283
+ const pid = this.child.pid;
283
284
  this.child.removeAllListeners("exit");
284
- this.child.kill("SIGTERM");
285
+ this._killProcessTree(pid, "SIGTERM");
285
286
  // Force kill after 3s if it doesn't respond
286
287
  setTimeout(() => {
287
- try { if (this.child) this.child.kill("SIGKILL"); } catch {}
288
+ this._killProcessTree(pid, "SIGKILL");
288
289
  }, 3000);
289
290
  this.child = null;
290
291
  }
@@ -304,9 +305,15 @@ class WolverineRunner {
304
305
  // Spawn with --require error-hook.js for IPC error reporting
305
306
  // The error hook auto-patches Fastify/Express to report caught 500s
306
307
  const errorHookPath = path.join(__dirname, "error-hook.js");
308
+ const sysInfo = require("./system-info").detect();
307
309
  this.child = spawn("node", ["--require", errorHookPath, this.scriptPath], {
308
310
  cwd: this.cwd,
309
- env: { ...process.env },
311
+ env: {
312
+ ...process.env,
313
+ // Tell the user's server how many workers to fork (if it uses clustering)
314
+ WOLVERINE_RECOMMENDED_WORKERS: String(sysInfo.recommended?.workers || 1),
315
+ WOLVERINE_MANAGED: "1", // Signal that wolverine is managing this process
316
+ },
310
317
  stdio: ["inherit", "inherit", "pipe", "ipc"],
311
318
  });
312
319
 
@@ -347,8 +354,9 @@ class WolverineRunner {
347
354
 
348
355
  // Kill the hung process — remove exit listener to prevent double-heal
349
356
  if (this.child) {
357
+ const pid = this.child.pid;
350
358
  this.child.removeAllListeners("exit");
351
- this.child.kill("SIGKILL");
359
+ this._killProcessTree(pid, "SIGKILL");
352
360
  this.child = null;
353
361
  }
354
362
 
@@ -587,6 +595,34 @@ class WolverineRunner {
587
595
  }
588
596
  }
589
597
 
598
+ /**
599
+ * Kill a process and all its children (process tree kill).
600
+ * Handles servers that fork workers internally — prevents orphaned processes.
601
+ */
602
+ _killProcessTree(pid, signal = "SIGTERM") {
603
+ if (!pid) return;
604
+ try {
605
+ if (process.platform === "win32") {
606
+ // taskkill /T kills the process tree
607
+ execSync(`taskkill /PID ${pid} /T /F`, { timeout: 3000, stdio: "ignore" });
608
+ } else {
609
+ // Kill the process group (negative PID)
610
+ try { process.kill(-pid, signal); } catch {}
611
+ // Also kill individual PID in case it's not a group leader
612
+ try { process.kill(pid, signal); } catch {}
613
+ // Find and kill children via pgrep
614
+ try {
615
+ const children = execSync(`pgrep -P ${pid} 2>/dev/null`, { encoding: "utf-8", timeout: 3000 }).trim();
616
+ if (children) {
617
+ for (const cpid of children.split("\n").map(p => parseInt(p, 10)).filter(Boolean)) {
618
+ try { process.kill(cpid, signal); } catch {}
619
+ }
620
+ }
621
+ } catch { /* no children or pgrep not available */ }
622
+ }
623
+ } catch { /* process already dead */ }
624
+ }
625
+
590
626
  _ensurePortFree() {
591
627
  const port = parseInt(process.env.PORT, 10) || 3000;
592
628
  try {
package/PLATFORM.md DELETED
@@ -1,450 +0,0 @@
1
- # Wolverine Platform — Multi-Server Analytics & Management
2
-
3
- ## Overview
4
-
5
- The Wolverine Platform aggregates data from hundreds/thousands of wolverine server instances into a single backend + frontend dashboard. Each wolverine instance runs independently and broadcasts lightweight telemetry to the platform.
6
-
7
- ```
8
- ┌──────────────┐ ┌──────────────┐ ┌──────────────┐
9
- │ Wolverine #1 │ │ Wolverine #2 │ │ Wolverine #3 │ ... (N instances)
10
- │ server:3000 │ │ server:4000 │ │ server:5000 │
11
- │ dash:3001 │ │ dash:4001 │ │ dash:5001 │
12
- └──────┬───────┘ └──────┬───────┘ └──────┬───────┘
13
- │ │ │
14
- │ heartbeat │ heartbeat │ heartbeat
15
- │ (every 60s) │ (every 60s) │ (every 60s)
16
- ▼ ▼ ▼
17
- ┌─────────────────────────────────────────────────┐
18
- │ Wolverine Platform Backend │
19
- │ │
20
- │ POST /api/v1/heartbeat ← receive telemetry │
21
- │ GET /api/v1/servers ← list all instances │
22
- │ GET /api/v1/servers/:id ← single instance │
23
- │ GET /api/v1/analytics ← aggregated stats │
24
- │ GET /api/v1/alerts ← active alerts │
25
- │ WS /ws/live ← real-time stream │
26
- │ │
27
- │ Database: PostgreSQL (time-series optimized) │
28
- │ Cache: Redis (live state, pub/sub) │
29
- │ Queue: Bull/BullMQ (alert processing) │
30
- └─────────────────────────────────────────────────┘
31
-
32
-
33
- ┌─────────────────────────────────────────────────┐
34
- │ Wolverine Platform Frontend │
35
- │ │
36
- │ Fleet overview — all servers at a glance │
37
- │ Per-server deep dive — events, repairs, usage │
38
- │ Cost analytics — tokens, USD, by model │
39
- │ Alert management — acknowledge, escalate │
40
- │ Uptime history — SLA tracking over time │
41
- └─────────────────────────────────────────────────┘
42
- ```
43
-
44
- ---
45
-
46
- ## Telemetry Protocol
47
-
48
- ### Heartbeat Payload
49
-
50
- Each wolverine instance sends a heartbeat every **60 seconds** (configurable). This is the only outbound traffic — minimal network impact.
51
-
52
- ```http
53
- POST /api/v1/heartbeat
54
- Authorization: Bearer <PLATFORM_API_KEY>
55
- Content-Type: application/json
56
-
57
- {
58
- "instanceId": "wlv_a1b2c3d4",
59
- "version": "0.1.0",
60
- "timestamp": 1775073247574,
61
-
62
- "server": {
63
- "name": "my-api",
64
- "port": 3000,
65
- "uptime": 86400,
66
- "status": "healthy",
67
- "pid": 12345
68
- },
69
-
70
- "process": {
71
- "memoryMB": 128,
72
- "cpuPercent": 12,
73
- "peakMemoryMB": 256
74
- },
75
-
76
- "routes": {
77
- "total": 8,
78
- "healthy": 8,
79
- "unhealthy": 0,
80
- "slowest": { "path": "/api/search", "avgMs": 450 }
81
- },
82
-
83
- "repairs": {
84
- "total": 3,
85
- "successes": 2,
86
- "failures": 1,
87
- "lastRepair": {
88
- "error": "TypeError: Cannot read property 'id' of undefined",
89
- "resolution": "Added null check before accessing user.id",
90
- "tokens": 1820,
91
- "cost": 0.0045,
92
- "mode": "fast",
93
- "timestamp": 1775073200000
94
- }
95
- },
96
-
97
- "usage": {
98
- "totalTokens": 45000,
99
- "totalCost": 0.12,
100
- "totalCalls": 85,
101
- "byCategory": {
102
- "heal": { "tokens": 12000, "cost": 0.04, "calls": 5 },
103
- "chat": { "tokens": 25000, "cost": 0.05, "calls": 60 },
104
- "classify": { "tokens": 3000, "cost": 0.001, "calls": 15 },
105
- "develop": { "tokens": 5000, "cost": 0.03, "calls": 5 }
106
- },
107
- "byModel": {
108
- "gpt-5.4-mini": { "tokens": 30000, "cost": 0.06, "calls": 40 },
109
- "gpt-4o-mini": { "tokens": 15000, "cost": 0.02, "calls": 45 }
110
- },
111
- "byTool": {
112
- "call_endpoint": { "tokens": 5000, "cost": 0.01, "calls": 20 },
113
- "search_brain": { "tokens": 2000, "cost": 0.005, "calls": 10 }
114
- }
115
- },
116
-
117
- "brain": {
118
- "totalMemories": 45,
119
- "namespaces": { "docs": 23, "functions": 12, "errors": 5, "fixes": 3, "learnings": 2 }
120
- },
121
-
122
- "backups": {
123
- "total": 8,
124
- "stable": 3,
125
- "verified": 2,
126
- "unstable": 3
127
- },
128
-
129
- "alerts": [
130
- {
131
- "type": "memory_leak",
132
- "message": "Memory growing: +50MB over 10 samples",
133
- "severity": "warn",
134
- "timestamp": 1775073100000
135
- }
136
- ]
137
- }
138
- ```
139
-
140
- ### Design Principles
141
-
142
- - **Infrequent**: 1 heartbeat per 60 seconds = 1440/day per instance
143
- - **Small**: ~2KB per payload, gzipped < 500 bytes
144
- - **Idempotent**: same heartbeat can be sent twice safely (upsert by instanceId + timestamp)
145
- - **Offline-resilient**: if platform is down, wolverine queues heartbeats and replays on reconnect
146
- - **No PII**: never send secrets, user data, or source code in heartbeats
147
-
148
- ---
149
-
150
- ## Platform Backend Architecture
151
-
152
- ### Database Schema (PostgreSQL)
153
-
154
- ```sql
155
- -- Servers — one row per wolverine instance
156
- CREATE TABLE servers (
157
- id TEXT PRIMARY KEY, -- "wlv_a1b2c3d4"
158
- name TEXT NOT NULL,
159
- version TEXT,
160
- first_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(),
161
- last_heartbeat TIMESTAMPTZ NOT NULL,
162
- status TEXT NOT NULL DEFAULT 'unknown', -- healthy, degraded, down, unknown
163
- config JSONB -- port, models, etc.
164
- );
165
-
166
- -- Time-series heartbeats — partitioned by day for scale
167
- CREATE TABLE heartbeats (
168
- id BIGSERIAL,
169
- server_id TEXT NOT NULL REFERENCES servers(id),
170
- timestamp TIMESTAMPTZ NOT NULL,
171
- uptime INTEGER,
172
- memory_mb INTEGER,
173
- cpu_percent INTEGER,
174
- routes_total INTEGER,
175
- routes_healthy INTEGER,
176
- routes_unhealthy INTEGER,
177
- tokens_total INTEGER,
178
- cost_total NUMERIC(10,6),
179
- repairs_total INTEGER,
180
- repairs_successes INTEGER,
181
- payload JSONB -- full heartbeat for deep queries
182
- ) PARTITION BY RANGE (timestamp);
183
-
184
- -- Create daily partitions automatically (pg_partman or manual)
185
- -- This allows dropping old data by partition instead of DELETE
186
-
187
- -- Repairs — detailed log of every fix
188
- CREATE TABLE repairs (
189
- id BIGSERIAL PRIMARY KEY,
190
- server_id TEXT NOT NULL REFERENCES servers(id),
191
- timestamp TIMESTAMPTZ NOT NULL,
192
- error TEXT,
193
- resolution TEXT,
194
- success BOOLEAN,
195
- mode TEXT, -- fast, agent, sub-agents
196
- model TEXT,
197
- tokens INTEGER,
198
- cost NUMERIC(10,6),
199
- iteration INTEGER,
200
- duration_ms INTEGER
201
- );
202
-
203
- -- Alerts — active and historical
204
- CREATE TABLE alerts (
205
- id BIGSERIAL PRIMARY KEY,
206
- server_id TEXT NOT NULL REFERENCES servers(id),
207
- type TEXT NOT NULL, -- memory_leak, route_down, crash_loop, etc.
208
- message TEXT,
209
- severity TEXT, -- info, warn, error, critical
210
- created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
211
- acknowledged_at TIMESTAMPTZ,
212
- resolved_at TIMESTAMPTZ,
213
- acknowledged_by TEXT
214
- );
215
-
216
- -- Usage aggregates — hourly rollups for fast analytics
217
- CREATE TABLE usage_hourly (
218
- server_id TEXT NOT NULL REFERENCES servers(id),
219
- hour TIMESTAMPTZ NOT NULL,
220
- tokens_total INTEGER DEFAULT 0,
221
- cost_total NUMERIC(10,6) DEFAULT 0,
222
- calls_total INTEGER DEFAULT 0,
223
- tokens_by_category JSONB,
224
- PRIMARY KEY (server_id, hour)
225
- );
226
-
227
- -- Indexes for common queries
228
- CREATE INDEX idx_heartbeats_server_time ON heartbeats (server_id, timestamp DESC);
229
- CREATE INDEX idx_repairs_server_time ON repairs (server_id, timestamp DESC);
230
- CREATE INDEX idx_alerts_active ON alerts (server_id) WHERE resolved_at IS NULL;
231
- CREATE INDEX idx_servers_status ON servers (status);
232
- ```
233
-
234
- ### API Endpoints
235
-
236
- ```
237
- Authentication: Bearer token (PLATFORM_API_KEY)
238
-
239
- POST /api/v1/heartbeat ← Receive heartbeat from wolverine instance
240
- → Upsert server, insert heartbeat, process alerts
241
- → Returns: { received: true, serverTime: "..." }
242
-
243
- GET /api/v1/servers ← List all instances
244
- → Query: ?status=healthy&sort=last_heartbeat&limit=50&offset=0
245
- → Returns: { servers: [...], total: 150, page: 1 }
246
-
247
- GET /api/v1/servers/:id ← Single instance detail
248
- → Returns: full server state + recent heartbeats + repairs + alerts
249
-
250
- GET /api/v1/servers/:id/heartbeats ← Heartbeat history
251
- → Query: ?from=2026-04-01&to=2026-04-02&interval=5m
252
- → Returns: time-series data for charting
253
-
254
- GET /api/v1/servers/:id/repairs ← Repair history for one server
255
- → Query: ?limit=50&success=true
256
- → Returns: { repairs: [...], stats: { total, successes, avgTokens } }
257
-
258
- GET /api/v1/analytics ← Fleet-wide aggregates
259
- → Query: ?period=24h or ?from=...&to=...
260
- → Returns: {
261
- totalServers, activeServers, totalRepairs, successRate,
262
- totalTokens, totalCost, tokensByCategory, costByModel,
263
- uptimePercent, avgResponseTime
264
- }
265
-
266
- GET /api/v1/analytics/cost ← Cost breakdown
267
- → Query: ?period=7d&groupBy=server|model|category
268
- → Returns: cost time-series + breakdown
269
-
270
- GET /api/v1/alerts ← Active alerts across fleet
271
- → Query: ?severity=critical&acknowledged=false
272
- → Returns: { alerts: [...], total: 5 }
273
-
274
- PATCH /api/v1/alerts/:id ← Acknowledge/resolve alert
275
- → Body: { action: "acknowledge" | "resolve", by: "admin@..." }
276
-
277
- WS /ws/live ← Real-time WebSocket stream
278
- → Streams: heartbeats, alerts, repairs as they arrive
279
- → Subscribe: { subscribe: ["heartbeat", "alert", "repair"] }
280
- → Filter: { servers: ["wlv_a1b2c3d4"] }
281
- ```
282
-
283
- ### Scaling Strategy
284
-
285
- ```
286
- 10 servers: Single PostgreSQL, single Node.js backend
287
- 100 servers: PostgreSQL with connection pooling (pgBouncer), Redis cache
288
- 1,000 servers: Partitioned heartbeats table, read replicas, queue workers
289
- 10,000 servers: TimescaleDB for time-series, horizontal API scaling, Kafka for ingestion
290
- 100,000+: Sharded by server_id, dedicated ingestion pipeline, ClickHouse for analytics
291
- ```
292
-
293
- **Key scaling decisions:**
294
- - Heartbeats are **append-only** — no updates, only inserts → perfect for time-series DBs
295
- - Hourly rollups in `usage_hourly` prevent expensive full-table scans for analytics
296
- - Partitioned by day → drop old data by partition (instant, no vacuum)
297
- - Redis caches the "current state" of each server (latest heartbeat) → fast fleet overview
298
- - WebSocket uses Redis pub/sub → horizontal scaling of frontend connections
299
- - Alert processing is async via job queue → doesn't block heartbeat ingestion
300
-
301
- ### Redis Structure
302
-
303
- ```
304
- wolverine:server:{id}:state ← Latest heartbeat (JSON, TTL 5min)
305
- wolverine:server:{id}:uptime ← Uptime counter (INCR every heartbeat)
306
- wolverine:servers:active ← Sorted set (score = last_heartbeat timestamp)
307
- wolverine:alerts:active ← Set of active alert IDs
308
- wolverine:stats:fleet ← Cached fleet-wide aggregates (TTL 30s)
309
- wolverine:pubsub:heartbeats ← Pub/sub channel for real-time streaming
310
- wolverine:pubsub:alerts ← Pub/sub channel for alert notifications
311
- ```
312
-
313
- ---
314
-
315
- ## Platform Frontend
316
-
317
- ### Pages
318
-
319
- **1. Fleet Overview**
320
- - Grid/list of all server instances
321
- - Color-coded status: green (healthy), yellow (degraded), red (down), gray (unknown)
322
- - Sortable by: status, uptime, memory, cost, last repair
323
- - Search/filter by name, status, tags
324
- - Fleet-wide stats bar: total servers, active, repairs today, cost today
325
-
326
- **2. Server Detail**
327
- - Real-time stats: memory, CPU, uptime, routes
328
- - Event timeline (same as local dashboard but from platform data)
329
- - Repair history with resolution details + token cost
330
- - Usage chart: tokens over time, cost over time
331
- - Route health table with response time trends
332
- - Backup status
333
- - Brain stats
334
-
335
- **3. Analytics**
336
- - Fleet-wide token usage over time (by day/hour)
337
- - Cost breakdown: by server, by model, by category
338
- - Repair success rate over time
339
- - Mean time to repair (MTTR) trend
340
- - Most expensive servers / most repaired servers
341
- - Uptime SLA tracking (99.9% target)
342
- - Response time percentiles across fleet
343
-
344
- **4. Alerts**
345
- - Active alerts sorted by severity
346
- - Acknowledge / resolve workflow
347
- - Alert history with resolution notes
348
- - Alert rules configuration (memory threshold, crash count, response time)
349
-
350
- **5. Cost Management**
351
- - Total spend by period (day/week/month)
352
- - Per-server cost ranking
353
- - Per-model cost ranking
354
- - Projected monthly cost based on current usage
355
- - Budget alerts (notify when approaching limit)
356
-
357
- ### Tech Stack Recommendation
358
-
359
- ```
360
- Frontend: Next.js + Tailwind + Recharts (or Tremor for dashboard components)
361
- Backend: Node.js + Express + PostgreSQL + Redis + BullMQ
362
- Auth: NextAuth.js or Clerk (team management)
363
- Hosting: Vercel (frontend) + Railway/Fly.io (backend) + Supabase (PostgreSQL)
364
- WebSocket: Socket.io or native WS through the backend
365
- ```
366
-
367
- ---
368
-
369
- ## Wolverine Client Integration
370
-
371
- ### New env variables for the wolverine instance:
372
-
373
- ```env
374
- # Platform telemetry (optional — wolverine works fine without it)
375
- WOLVERINE_PLATFORM_URL=https://api.wolverine.dev
376
- WOLVERINE_PLATFORM_KEY=wlvk_your_api_key_here
377
- WOLVERINE_INSTANCE_NAME=my-api-prod
378
- WOLVERINE_HEARTBEAT_INTERVAL_MS=60000
379
- ```
380
-
381
- ### Telemetry module to build in wolverine:
382
-
383
- ```
384
- src/platform/
385
- ├── telemetry.js ← Collects heartbeat data from all subsystems
386
- ├── heartbeat.js ← Sends heartbeat to platform on interval
387
- └── queue.js ← Queues heartbeats when platform is unreachable
388
- ```
389
-
390
- **telemetry.js** gathers data from:
391
- - `processMonitor.getMetrics()` → memory, CPU
392
- - `routeProber.getMetrics()` → route health
393
- - `tokenTracker.getAnalytics()` → usage
394
- - `repairHistory.getStats()` → repairs
395
- - `backupManager.getStats()` → backups
396
- - `brain.getStats()` → brain
397
- - `notifier` → active alerts
398
-
399
- **heartbeat.js** sends it:
400
- - HTTP POST to platform every 60s
401
- - Gzip compressed
402
- - Timeout: 5s (don't block if platform is slow)
403
- - On failure: queue locally, retry with exponential backoff
404
- - On reconnect: replay queued heartbeats
405
-
406
- **queue.js** handles offline resilience:
407
- - Append to `.wolverine/heartbeat-queue.jsonl` when platform unreachable
408
- - On next successful heartbeat, drain the queue (oldest first)
409
- - Max queue size: 1440 entries (24 hours of heartbeats)
410
- - After 24h, drop oldest entries (stale data isn't useful)
411
-
412
- ---
413
-
414
- ## Security Considerations
415
-
416
- - **Platform API key** per instance — revokable, rotatable
417
- - **Secret redactor** runs on heartbeat payload before sending (no env values leak)
418
- - **No source code** in heartbeats — only metrics, error messages (redacted), and stats
419
- - **TLS only** — platform endpoint must be HTTPS
420
- - **Rate limiting** on platform ingestion — max 1 heartbeat/second per instance
421
- - **Tenant isolation** — multi-tenant platform must scope data by organization
422
- - **Audit log** — track who acknowledged/resolved alerts
423
-
424
- ---
425
-
426
- ## Implementation Priority
427
-
428
- ### Phase 1: Core (1-2 weeks)
429
- 1. Platform backend: heartbeat ingestion + server listing + basic API
430
- 2. Wolverine telemetry module: collect + send heartbeats
431
- 3. Frontend: fleet overview + server detail page
432
- 4. PostgreSQL schema + Redis caching
433
-
434
- ### Phase 2: Analytics (1 week)
435
- 1. Hourly usage rollups
436
- 2. Cost analytics page
437
- 3. Repair history aggregation
438
- 4. Uptime tracking
439
-
440
- ### Phase 3: Alerting (1 week)
441
- 1. Alert rules engine
442
- 2. Acknowledge/resolve workflow
443
- 3. Email/Slack/webhook notifications
444
- 4. Alert history
445
-
446
- ### Phase 4: Scale (ongoing)
447
- 1. TimescaleDB migration for heartbeats
448
- 2. Horizontal API scaling
449
- 3. WebSocket real-time streaming
450
- 4. Team management + RBAC
@@ -1,70 +0,0 @@
1
- # Wolverine Server Best Practices
2
-
3
- Rules for building secure, scalable, well-structured servers. Wolverine's agent follows these when building or editing server code.
4
-
5
- ## Structure
6
-
7
- ```
8
- server/
9
- ├── index.js Entry point — app setup, middleware, route mounting, listen
10
- ├── routes/ Route modules — one file per resource
11
- │ ├── health.js Health check endpoint (always required)
12
- │ └── api.js API routes
13
- ├── middleware/ Custom middleware (auth, validation, logging)
14
- ├── models/ Data models / database schemas
15
- ├── services/ Business logic (keep routes thin)
16
- ├── config/ Configuration files
17
- └── utils/ Shared utilities
18
- ```
19
-
20
- ## Rules
21
-
22
- ### Ports
23
- - **Development**: use port 3000 (standard, no admin required, firewall-friendly)
24
- - **Production**: use port 443 (HTTPS) or 80 (HTTP) behind a reverse proxy (nginx/caddy)
25
- - **Never** use random high ports in production — they bypass firewalls and confuse load balancers
26
- - **Always** use HTTPS in production — terminate TLS at the reverse proxy, not in Node
27
- - Dashboard runs on port+1 automatically (3001 in dev, not exposed in prod)
28
-
29
- ### Security
30
- - Never expose secrets in responses — use env vars, never hardcode
31
- - Validate ALL input — Fastify has built-in JSON schema validation
32
- - Use HTTPS in production — reverse proxy (nginx/caddy) handles TLS
33
- - Rate limit public endpoints
34
- - Sanitize user input before database queries — use the SQL skill
35
- - Never return stack traces in production error responses
36
- - Use the sqlGuard() middleware on all routes that accept user input
37
-
38
- ### Scalability
39
- - Keep routes thin — business logic goes in services/
40
- - Use async/await, never block the event loop
41
- - Add a /health endpoint that returns status + uptime + memory
42
- - Use environment variables for all configuration
43
- - Structure for horizontal scaling — no in-memory session state
44
-
45
- ### Error Handling
46
- - Always have a global error handler middleware
47
- - Log errors with context (timestamp, request path, user)
48
- - Return consistent error response format: { error: "message" }
49
- - Never swallow errors silently
50
- - Use try/catch in async route handlers
51
-
52
- ### Code Quality
53
- - One route file per resource (users.js, orders.js, etc.)
54
- Export a Fastify plugin function from each route file (mounted in index.js via fastify.register)
55
- - Mount routes in index.js with clear prefixes
56
- - Use middleware for cross-cutting concerns (auth, logging)
57
- - Keep index.js under 50 lines — it's just wiring
58
-
59
- ### Database
60
- - Use connection pooling
61
- - Handle connection errors gracefully
62
- - Use migrations for schema changes
63
- - Never use string concatenation for queries — use parameterized queries
64
- - Close connections on process exit
65
-
66
- ### Monitoring
67
- - /health endpoint is mandatory
68
- - Log request duration for slow endpoint detection
69
- - Use structured logging (JSON format)
70
- - Track error rates per endpoint
package/TELEMETRY.md DELETED
@@ -1,108 +0,0 @@
1
- # Wolverine Telemetry
2
-
3
- Connect your Wolverine instance to a platform backend for fleet-wide monitoring, uptime tracking, and cost analytics.
4
-
5
- ## Setup
6
-
7
- ### 1. Deploy your platform backend
8
-
9
- See [PLATFORM.md](PLATFORM.md) for the full backend spec — database schema, API endpoints, scaling strategy.
10
-
11
- Your backend needs to implement:
12
- - `POST /api/v1/heartbeat` — receive heartbeat payloads
13
- - `GET /api/v1/servers` — list connected instances
14
- - Standard Bearer token auth
15
-
16
- ### 2. Configure your Wolverine instance
17
-
18
- Add to `.env.local`:
19
-
20
- ```env
21
- WOLVERINE_PLATFORM_URL=https://your-platform.com
22
- WOLVERINE_PLATFORM_KEY=your_api_key_here
23
- ```
24
-
25
- That's it. Wolverine starts sending heartbeats every 60 seconds.
26
-
27
- ### Optional settings
28
-
29
- ```env
30
- # Human-readable name (defaults to folder name)
31
- WOLVERINE_INSTANCE_NAME=my-api-prod
32
-
33
- # Heartbeat interval in ms (default: 60000 = 1 minute)
34
- WOLVERINE_HEARTBEAT_INTERVAL_MS=60000
35
- ```
36
-
37
- ### 3. Verify
38
-
39
- On startup you'll see:
40
-
41
- ```
42
- 📡 Platform: https://your-platform.com (every 60s)
43
- 📡 Instance: wlv_a8f3e9b1c4d7
44
- ```
45
-
46
- If the platform is unreachable, heartbeats queue locally in `.wolverine/heartbeat-queue.jsonl` and drain automatically when connectivity returns.
47
-
48
- ---
49
-
50
- ## Heartbeat Payload
51
-
52
- Each heartbeat is ~2KB JSON, sent every 60 seconds:
53
-
54
- ```json
55
- {
56
- "instanceId": "wlv_a8f3e9b1c4d7",
57
- "version": "0.1.0",
58
- "timestamp": 1775073247574,
59
- "server": {
60
- "name": "my-api",
61
- "port": 3000,
62
- "uptime": 86400,
63
- "status": "healthy",
64
- "pid": 12345
65
- },
66
- "process": {
67
- "memoryMB": 128,
68
- "cpuPercent": 12,
69
- "peakMemoryMB": 256
70
- },
71
- "routes": {
72
- "total": 8,
73
- "healthy": 8,
74
- "unhealthy": 0
75
- },
76
- "repairs": {
77
- "total": 3,
78
- "successes": 2,
79
- "failures": 1,
80
- "lastRepair": { "error": "...", "resolution": "...", "tokens": 1820, "cost": 0.0045 }
81
- },
82
- "usage": {
83
- "totalTokens": 45000,
84
- "totalCost": 0.12,
85
- "totalCalls": 85,
86
- "byCategory": { "heal": {...}, "chat": {...}, "develop": {...} }
87
- },
88
- "brain": { "totalMemories": 45 },
89
- "backups": { "total": 8, "stable": 3 }
90
- }
91
- ```
92
-
93
- ## Design
94
-
95
- - **Opt-in**: disabled unless `WOLVERINE_PLATFORM_URL` and `WOLVERINE_PLATFORM_KEY` are set
96
- - **Lightweight**: 1 request per 60s, ~2KB payload
97
- - **Offline-resilient**: queues locally when platform is down, replays on reconnect (max 24h / 1440 entries)
98
- - **Secure**: secrets redacted before sending, HTTPS supported, Bearer token auth
99
- - **No source code**: only metrics, redacted error messages, and stats
100
-
101
- ## Files
102
-
103
- ```
104
- src/platform/
105
- ├── telemetry.js — Collects metrics from all subsystems into heartbeat payload
106
- ├── heartbeat.js — Sends heartbeats on interval, handles failures
107
- └── queue.js — Offline queue with replay on reconnect
108
- ```