web-agent-bridge 2.7.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,894 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
+ *
6
+ * Turns WAB from a single-server Agent OS into a distributed fleet.
7
+ *
8
+ * Architecture:
9
+ * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
+ * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
+ * │ (this node) │────▶│ (remote) │ │ (remote) │
12
+ * │ │────▶│ │ │ │
13
+ * └──────────────┘ └──────────┘ └──────────┘
14
+ * │ ▲ ▲
15
+ * │ │ │
16
+ * └───────────────────┴─────────────────┘
17
+ * heartbeat / task results
18
+ *
19
+ * Components:
20
+ * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
+ * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
+ * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
+ *
24
+ * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
+ * Persistence: SQLite tables for durability across restarts
26
+ * Consistency: Leader-based (coordinator is source of truth)
27
+ */
28
+
29
+ const crypto = require('crypto');
30
+ const http = require('http');
31
+ const https = require('https');
32
+ const { URL } = require('url');
33
+ const { db } = require('../models/db');
34
+ const { bus } = require('../runtime/event-bus');
35
+
36
+ // ─── Schema ──────────────────────────────────────────────────────────
37
+
38
+ db.exec(`
39
+ CREATE TABLE IF NOT EXISTS cluster_nodes (
40
+ id TEXT PRIMARY KEY,
41
+ name TEXT NOT NULL,
42
+ endpoint TEXT NOT NULL,
43
+ region TEXT DEFAULT 'default',
44
+ zone TEXT DEFAULT 'a',
45
+ role TEXT DEFAULT 'worker',
46
+ status TEXT DEFAULT 'joining',
47
+ capacity_total INTEGER DEFAULT 20,
48
+ capacity_used INTEGER DEFAULT 0,
49
+ tags TEXT DEFAULT '[]',
50
+ hardware TEXT DEFAULT '{}',
51
+ version TEXT,
52
+ secret_hash TEXT,
53
+ last_heartbeat TEXT DEFAULT (datetime('now')),
54
+ registered_at TEXT DEFAULT (datetime('now')),
55
+ updated_at TEXT DEFAULT (datetime('now'))
56
+ );
57
+
58
+ CREATE TABLE IF NOT EXISTS cluster_tasks (
59
+ id TEXT PRIMARY KEY,
60
+ external_id TEXT,
61
+ node_id TEXT,
62
+ task_type TEXT NOT NULL,
63
+ objective TEXT,
64
+ payload TEXT DEFAULT '{}',
65
+ priority INTEGER DEFAULT 50,
66
+ status TEXT DEFAULT 'pending',
67
+ result TEXT,
68
+ error TEXT,
69
+ attempts INTEGER DEFAULT 0,
70
+ max_attempts INTEGER DEFAULT 3,
71
+ affinity_tags TEXT DEFAULT '[]',
72
+ affinity_region TEXT,
73
+ timeout_ms INTEGER DEFAULT 60000,
74
+ submitted_at TEXT DEFAULT (datetime('now')),
75
+ assigned_at TEXT,
76
+ started_at TEXT,
77
+ completed_at TEXT
78
+ );
79
+
80
+ CREATE TABLE IF NOT EXISTS cluster_events (
81
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
82
+ event_type TEXT NOT NULL,
83
+ node_id TEXT,
84
+ task_id TEXT,
85
+ data TEXT DEFAULT '{}',
86
+ created_at TEXT DEFAULT (datetime('now'))
87
+ );
88
+
89
+ CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
90
+ CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
91
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
92
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
93
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
94
+ CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
95
+ CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
96
+ `);
97
+
98
+ // ─── Prepared Statements ─────────────────────────────────────────────
99
+
100
// All SQL is compiled once at module load (better-sqlite3 style prepared
// statements). Named parameters use @name; positional use ?.
const stmts = {
  // Nodes
  insertNode: db.prepare(`
    INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
    VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
  `),
  updateNode: db.prepare(`
    UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
      capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
    WHERE id=@id
  `),
  setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
  // Heartbeat also force-reactivates the node (status='active').
  heartbeatNode: db.prepare(`
    UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
    WHERE id=@id
  `),
  getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
  getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
  listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
  // Active nodes come back least-loaded first — schedulers rely on this order.
  listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
  listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
  deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
  // Parameter is the staleness threshold in seconds.
  getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),

  // Tasks
  insertTask: db.prepare(`
    INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
    VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
  `),
  // Assignment counts as an attempt (attempts is bumped here, not on start).
  assignTask: db.prepare(`
    UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
    WHERE id=@id
  `),
  startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
  completeTask: db.prepare(`
    UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
  `),
  failTask: db.prepare(`
    UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
  `),
  requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
  getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
  getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
  getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
  getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
  getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
  listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
  // Tasks stuck in assigned/running longer than ? seconds (measured from assignment).
  getStuckTasks: db.prepare(`
    SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
    AND assigned_at < datetime('now', '-' || ? || ' seconds')
  `),
  countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
  incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
  // MAX(0, …) guards against double-decrement driving the counter negative.
  decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),

  // Events
  insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
  getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
  getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
};
160
+
161
+ // ═══════════════════════════════════════════════════════════════════════════
162
+ // TASK DISTRIBUTOR
163
+ // ═══════════════════════════════════════════════════════════════════════════
164
+
165
+ /**
166
+ * Routes tasks to worker nodes based on capacity, affinity, and load balancing.
167
+ *
168
+ * Strategies:
169
+ * - least-loaded: Pick the node with the most free capacity
170
+ * - affinity: Match task tags to node tags
171
+ * - region: Prefer nodes in the same region as the task
172
+ * - round-robin: Distribute evenly across all active nodes
173
+ */
174
/**
 * Routes tasks to worker nodes based on capacity, affinity, and load balancing.
 *
 * Strategies (applied in order, each narrowing the candidate set):
 *   - region:       prefer nodes whose region matches the task's affinity region
 *   - affinity:     prefer nodes sharing at least one affinity tag with the task
 *   - least-loaded: final tie-break — lowest capacity_used/capacity_total ratio
 */
class TaskDistributor {
  constructor() {
    this._roundRobinIndex = 0; // reserved for a future round-robin strategy
    this._lastStrategy = null; // strategy chosen by the most recent _selectNode() call
    this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
  }

  /**
   * Submit a task for distributed execution and attempt an immediate assignment.
   *
   * @param {object} task - { id?, externalId?, type?, objective?, params?,
   *   priority?, affinityTags?, affinityRegion?, timeout?, maxAttempts? }
   * @returns {{taskId: string, status: string}}
   */
  submit(task) {
    const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
    const entry = {
      id,
      external_id: task.externalId || null,
      task_type: task.type || 'general',
      objective: task.objective || '',
      payload: JSON.stringify(task.params || {}),
      // Bug fix: use ?? instead of || so an explicit priority of 0 (lowest)
      // is honored rather than silently replaced with the default 50.
      priority: task.priority ?? 50,
      status: 'pending',
      affinity_tags: JSON.stringify(task.affinityTags || []),
      affinity_region: task.affinityRegion || null,
      timeout_ms: task.timeout || 60000,
      max_attempts: task.maxAttempts || 3,
    };
    stmts.insertTask.run(entry);

    bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
    this._stats.distributed++;

    // Try immediate assignment; on failure the periodic processPending()
    // sweep will retry the task later.
    this._tryAssign(id);

    return { taskId: id, status: 'pending' };
  }

  /**
   * Try to assign a pending task to a worker node.
   * @returns {boolean} true when a node accepted the assignment
   */
  _tryAssign(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task || task.status !== 'pending') return false;

    const node = this._selectNode(task);
    if (!node) {
      this._stats.noCapacity++;
      return false;
    }

    stmts.assignTask.run({ id: taskId, node_id: node.id });
    stmts.incrementNodeLoad.run(node.id);

    logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
    bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });

    // Push notification to worker (fire-and-forget; worker can also pull).
    this._notifyWorker(node, taskId, task);

    return true;
  }

  /**
   * Select the best node for a task. Affinity filters only narrow the pool
   * when at least one node matches; otherwise they are ignored.
   * Side effect: records the winning strategy in this._lastStrategy.
   */
  _selectNode(task) {
    let candidates = stmts.listActiveNodes.all();
    if (candidates.length === 0) return null;

    // Filter by free capacity.
    candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
    if (candidates.length === 0) return null;

    const affinityTags = safeParse(task.affinity_tags, []);
    const affinityRegion = task.affinity_region;

    // Bug fix: strategy is tracked in a local and assigned once at the end.
    // Previously this._lastStrategy was never reset between calls, so a
    // stale 'region'/'affinity' value from an earlier task leaked into the
    // task.assigned events of later least-loaded assignments.
    let strategy = 'least-loaded';

    // Strategy 1: Region affinity
    if (affinityRegion) {
      const regionNodes = candidates.filter(n => n.region === affinityRegion);
      if (regionNodes.length > 0) {
        candidates = regionNodes;
        strategy = 'region';
      }
    }

    // Strategy 2: Tag affinity (any shared tag qualifies)
    if (affinityTags.length > 0) {
      const tagged = candidates.filter(n => {
        const nodeTags = safeParse(n.tags, []);
        return affinityTags.some(t => nodeTags.includes(t));
      });
      if (tagged.length > 0) {
        candidates = tagged;
        strategy = 'affinity';
      }
    }

    // Strategy 3: Least-loaded — sort survivors by load ratio.
    candidates.sort((a, b) => {
      const loadA = a.capacity_used / a.capacity_total;
      const loadB = b.capacity_used / b.capacity_total;
      return loadA - loadB;
    });

    this._lastStrategy = strategy;
    return candidates[0];
  }

  /**
   * Push a task notification to a worker node. Best-effort: all failures
   * (connect errors, timeouts) are swallowed — the worker's pull loop is
   * the reliable delivery path.
   */
  _notifyWorker(node, taskId, task) {
    const payload = JSON.stringify({
      type: 'task.assigned',
      taskId,
      taskType: task.task_type,
      objective: task.objective,
      params: safeParse(task.payload, {}),
      priority: task.priority,
      timeout: task.timeout_ms,
    });

    const url = new URL('/wab-worker/tasks/notify', node.endpoint);
    const mod = url.protocol === 'https:' ? https : http;

    const req = mod.request(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
      timeout: 5000,
    });
    // Bug fix: the timeout option only emits an event — without this handler
    // Node never aborts the request and slow workers pin sockets open.
    req.on('timeout', () => req.destroy());
    // Bug fix: drain the response so the socket is released back to the pool.
    req.on('response', (res) => res.resume());
    req.on('error', () => { /* best-effort push */ });
    req.write(payload);
    req.end();
  }

  /**
   * Reassign tasks from a dead node to other nodes. Tasks that already
   * exhausted max_attempts are failed permanently instead.
   * @returns {number} count of tasks successfully reassigned
   */
  reassignFromNode(nodeId) {
    const tasks = stmts.getTasksByNode.all(nodeId);
    let reassigned = 0;

    for (const task of tasks) {
      if (task.attempts >= task.max_attempts) {
        // Bug fix: release the capacity slot here too; previously only the
        // requeue branch decremented, leaving capacity_used inflated if the
        // node later re-registers.
        stmts.decrementNodeLoad.run(nodeId);
        stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
        logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
        bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
        continue;
      }

      stmts.decrementNodeLoad.run(nodeId);
      stmts.requeueTask.run(task.id);
      logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });

      // Try to assign to another node right away.
      if (this._tryAssign(task.id)) {
        reassigned++;
        this._stats.reassigned++;
      }
    }

    return reassigned;
  }

  /**
   * Sweep pending tasks and try to assign each — called periodically
   * by the orchestrator's health-check timer.
   * @returns {number} count of tasks assigned this sweep
   */
  processPending() {
    const pending = stmts.getPendingTasks.all(50);
    let assigned = 0;
    for (const task of pending) {
      if (this._tryAssign(task.id)) assigned++;
    }
    return assigned;
  }

  /**
   * Worker pulls tasks for execution. Returns up to min(limit, free capacity)
   * pending tasks whose affinity constraints this node satisfies.
   *
   * @param {string} nodeId
   * @param {number} [limit=5]
   * @returns {Array<object>} task descriptors ready for execution
   */
  pullTasks(nodeId, limit = 5) {
    const node = stmts.getNode.get(nodeId);
    if (!node || node.status !== 'active') return [];

    const available = node.capacity_total - node.capacity_used;
    if (available <= 0) return [];

    const count = Math.min(limit, available);
    const pending = stmts.getPendingTasks.all(count);
    const assigned = [];

    for (const task of pending) {
      // Skip tasks whose region affinity this node cannot satisfy.
      const affinityRegion = task.affinity_region;
      if (affinityRegion && node.region !== affinityRegion) continue;

      // Skip tasks requiring tags this node does not carry.
      const affinityTags = safeParse(task.affinity_tags, []);
      const nodeTags = safeParse(node.tags, []);
      if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;

      stmts.assignTask.run({ id: task.id, node_id: nodeId });
      stmts.incrementNodeLoad.run(nodeId);
      logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });

      assigned.push({
        taskId: task.id,
        type: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });
    }

    return assigned;
  }

  /** Snapshot of distribution counters (copy — safe for callers to mutate). */
  getStats() { return { ...this._stats }; }
}
390
+
391
+ // ═══════════════════════════════════════════════════════════════════════════
392
+ // CLUSTER ORCHESTRATOR
393
+ // ═══════════════════════════════════════════════════════════════════════════
394
+
395
+ /**
396
+ * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
397
+ *
398
+ * Responsibilities:
399
+ * - Node registration and authentication
400
+ * - Health monitoring via heartbeats
401
+ * - Dead node detection and task failover
402
+ * - Load rebalancing across the cluster
403
+ * - Cluster topology and status reporting
404
+ * - Drain and cordon operations
405
+ */
406
/**
 * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
 *
 * Responsibilities:
 *  - Node registration and authentication
 *  - Health monitoring via heartbeats
 *  - Dead node detection and task failover
 *  - Load rebalancing across the cluster
 *  - Cluster topology and status reporting
 *  - Drain and cordon operations
 */
class ClusterOrchestrator {
  constructor(distributor) {
    this._distributor = distributor;
    this._heartbeatThresholdSec = 90; // Node considered dead after 90s without a heartbeat
    this._checkInterval = null;
    this._rebalanceInterval = null;
    this._started = false;
  }

  // ─── Lifecycle ──────────────────────────────────────────────────────

  /**
   * Start the orchestrator — begins periodic health checks, stuck-task
   * recovery, pending-task distribution and rebalancing. Idempotent.
   */
  start() {
    if (this._started) return;
    this._started = true;

    // Health check + recovery + distribution sweep every 30s.
    this._checkInterval = setInterval(() => {
      this._healthCheck();
      this._recoverStuckTasks();
      this._distributor.processPending();
    }, 30_000);
    // unref() so the timers never keep the process alive on their own.
    if (this._checkInterval.unref) this._checkInterval.unref();

    // Rebalance every 5 min.
    this._rebalanceInterval = setInterval(() => {
      this._rebalance();
    }, 300_000);
    if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();

    bus.emit('cluster.started', { timestamp: Date.now() });
  }

  /** Stop the orchestrator and cancel all periodic work. Idempotent. */
  stop() {
    if (!this._started) return;
    this._started = false;
    if (this._checkInterval) clearInterval(this._checkInterval);
    if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
    bus.emit('cluster.stopped', { timestamp: Date.now() });
  }

  // ─── Node Management ───────────────────────────────────────────────

  /**
   * Register a worker node to join the cluster. Re-registration (same
   * endpoint) updates the existing row and reactivates the node.
   *
   * @param {object} config - { name, endpoint, region?, zone?, role?,
   *   capacity?, tags?, hardware?, version?, secret? }
   * @returns {{nodeId: string, status: string, rejoined?: boolean, secret?: string}}
   * @throws {Error} when name or endpoint is missing
   */
  registerNode(config) {
    if (!config.name || !config.endpoint) {
      throw new Error('Node name and endpoint required');
    }

    // Check for an existing node with the same endpoint.
    const existing = stmts.getNodeByEndpoint.get(config.endpoint);
    if (existing) {
      // Re-register: update metadata and reactivate.
      stmts.updateNode.run({
        id: existing.id,
        name: config.name,
        endpoint: config.endpoint,
        region: config.region || existing.region,
        zone: config.zone || existing.zone,
        capacity_total: config.capacity || existing.capacity_total,
        tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
        hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
        version: config.version || existing.version,
      });
      stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
      logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
      bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
      return { nodeId: existing.id, status: 'active', rejoined: true };
    }

    const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
    // Only the hash is persisted, so when we auto-generate a secret the
    // plaintext must be returned to the caller exactly once.
    // Bug fix: previously the generated secret was hashed and then
    // discarded (`secret: config.secret ? undefined : undefined`), so a
    // worker registered without a secret could never learn it.
    const secret = config.secret || crypto.randomBytes(32).toString('hex');
    const secretHash = crypto.createHash('sha256').update(secret).digest('hex');

    stmts.insertNode.run({
      id: nodeId,
      name: config.name,
      endpoint: config.endpoint,
      region: config.region || 'default',
      zone: config.zone || 'a',
      role: config.role || 'worker',
      status: 'active',
      capacity_total: config.capacity || 20,
      tags: JSON.stringify(config.tags || []),
      hardware: JSON.stringify(config.hardware || {}),
      version: config.version || null,
      secret_hash: secretHash,
    });

    logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
    bus.emit('cluster.node.joined', { nodeId, name: config.name });

    // Expose the secret only when we generated it ourselves.
    return { nodeId, status: 'active', secret: config.secret ? undefined : secret };
  }

  /**
   * Remove a node from the cluster, reassigning its in-flight tasks first.
   * @returns {{nodeId: string, reassigned: number}|null} null if unknown node
   */
  deregisterNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    // Reassign tasks before removing the row.
    const reassigned = this._distributor.reassignFromNode(nodeId);
    stmts.deleteNode.run(nodeId);

    logEvent('node.deregistered', nodeId, null, { reassigned });
    bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });

    return { nodeId, reassigned };
  }

  /**
   * Process a heartbeat from a worker node: refresh liveness, sync load,
   * and optionally update the hardware/capacity profile.
   * @returns {{nodeId, status, pendingTasks}|null} null if unknown node
   */
  heartbeat(nodeId, data = {}) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.heartbeatNode.run({
      id: nodeId,
      // != null keeps a reported 0 (idle worker) instead of the stale value.
      capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
    });

    // Update hardware profile if provided.
    if (data.hardware) {
      stmts.updateNode.run({
        id: nodeId,
        name: node.name,
        endpoint: node.endpoint,
        region: node.region,
        zone: node.zone,
        capacity_total: data.capacityTotal || node.capacity_total,
        tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
        hardware: JSON.stringify(data.hardware),
        version: data.version || node.version,
      });
    }

    return {
      nodeId,
      status: 'active',
      // Hint so the worker knows whether a pull is worthwhile.
      pendingTasks: stmts.getPendingTasks.all(1).length > 0,
    };
  }

  /**
   * Drain a node — stop assigning new tasks, let running tasks finish.
   * @returns {{nodeId, status, activeTasks}|null} null if unknown node
   */
  drainNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
    logEvent('node.draining', nodeId, null, {});
    bus.emit('cluster.node.draining', { nodeId, name: node.name });

    return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
  }

  /**
   * Cordon a node — prevent scheduling but keep running tasks.
   * @returns {{nodeId, status}|null} null if unknown node
   */
  cordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
    logEvent('node.cordoned', nodeId, null, {});
    bus.emit('cluster.node.cordoned', { nodeId, name: node.name });

    return { nodeId, status: 'cordoned' };
  }

  /**
   * Uncordon a node — allow scheduling again.
   * @returns {{nodeId, status}|null} null if unknown node
   */
  uncordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
    logEvent('node.uncordoned', nodeId, null, {});

    return { nodeId, status: 'active' };
  }

  /**
   * Get node details with tags/hardware parsed and an active-task count.
   * @returns {object|null} null if unknown node
   */
  getNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;
    node.tags = safeParse(node.tags, []);
    node.hardware = safeParse(node.hardware, {});
    node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
    return node;
  }

  /**
   * List cluster nodes, optionally filtered by region or active status.
   * @param {{region?: string, active?: boolean}} [filter]
   */
  listNodes(filter = {}) {
    let nodes;
    if (filter.region) {
      nodes = stmts.listNodesByRegion.all(filter.region);
    } else if (filter.active) {
      nodes = stmts.listActiveNodes.all();
    } else {
      nodes = stmts.listNodes.all();
    }
    return nodes.map(n => ({
      ...n,
      tags: safeParse(n.tags, []),
      hardware: safeParse(n.hardware, {}),
    }));
  }

  // ─── Task Reporting ─────────────────────────────────────────────────

  /**
   * Worker reports a task has started executing.
   * @returns {{taskId, status}|null} null if unknown task
   */
  reportTaskStarted(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    stmts.startTask.run(taskId);
    logEvent('task.started', task.node_id, taskId, {});
    bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
    return { taskId, status: 'running' };
  }

  /**
   * Worker reports a task completed; stores the result and frees capacity.
   * @returns {{taskId, status}|null} null if unknown task
   */
  reportTaskCompleted(taskId, result) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
    bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });

    return { taskId, status: 'completed' };
  }

  /**
   * Worker reports a task failed. Requeues (and tries another node) while
   * attempts remain; otherwise marks the task permanently failed.
   * @returns {{taskId, status, attempt?}|null} null if unknown task
   */
  reportTaskFailed(taskId, error) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    // Retry if attempts remaining.
    if (task.attempts < task.max_attempts) {
      stmts.requeueTask.run(taskId);
      logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
      bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });

      // Try to assign to a (possibly different) node immediately.
      this._distributor._tryAssign(taskId);

      return { taskId, status: 'retrying', attempt: task.attempts };
    }

    // Max attempts reached.
    stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
    logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
    bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });

    return { taskId, status: 'failed' };
  }

  /**
   * Get task details with JSON columns parsed.
   * @returns {object|null} null if unknown task
   */
  getTask(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    task.payload = safeParse(task.payload, {});
    task.affinity_tags = safeParse(task.affinity_tags, []);
    task.result = safeParse(task.result, null);
    return task;
  }

  /**
   * List tasks with optional status/node filter.
   * @param {{status?: string, nodeId?: string, limit?: number}} [filter]
   */
  listTasks(filter = {}) {
    let tasks;
    if (filter.status) {
      tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
    } else if (filter.nodeId) {
      tasks = stmts.getTasksByNode.all(filter.nodeId);
    } else {
      tasks = stmts.listTasks.all(filter.limit || 50);
    }
    return tasks.map(t => ({
      ...t,
      payload: safeParse(t.payload, {}),
      affinity_tags: safeParse(t.affinity_tags, []),
      result: safeParse(t.result, null),
    }));
  }

  // ─── Cluster Topology ───────────────────────────────────────────────

  /**
   * Get full cluster status: node counts, capacity totals, task counts
   * by status, per-region breakdown and distributor counters.
   */
  getClusterStatus() {
    const nodes = stmts.listNodes.all();
    const taskCounts = {};
    for (const row of stmts.countByStatus.all()) {
      taskCounts[row.status] = row.count;
    }

    const activeNodes = nodes.filter(n => n.status === 'active');
    const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
    const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);

    // Group by region; capacity figures count active nodes only.
    const regions = {};
    for (const node of nodes) {
      if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
      regions[node.region].nodes++;
      if (node.status === 'active') {
        regions[node.region].active++;
        regions[node.region].capacity += node.capacity_total;
        regions[node.region].used += node.capacity_used;
      }
    }

    return {
      coordinator: { started: this._started },
      nodes: {
        total: nodes.length,
        active: activeNodes.length,
        draining: nodes.filter(n => n.status === 'draining').length,
        cordoned: nodes.filter(n => n.status === 'cordoned').length,
        dead: nodes.filter(n => n.status === 'dead').length,
      },
      capacity: {
        total: totalCapacity,
        used: usedCapacity,
        available: totalCapacity - usedCapacity,
        utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
      },
      tasks: taskCounts,
      regions,
      distributor: this._distributor.getStats(),
    };
  }

  /**
   * Get the cluster events log (newest first), optionally for one node.
   */
  getEvents(limit = 100, nodeId = null) {
    if (nodeId) {
      return stmts.getEventsByNode.all(nodeId, limit).map(e => ({
        ...e,
        data: safeParse(e.data, {}),
      }));
    }
    return stmts.getEvents.all(limit).map(e => ({
      ...e,
      data: safeParse(e.data, {}),
    }));
  }

  // ─── Internal Operations ────────────────────────────────────────────

  /**
   * Check for nodes that missed their heartbeat window, mark them dead,
   * and fail over their tasks.
   */
  _healthCheck() {
    const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);

    for (const node of staleNodes) {
      stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
      logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
      bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });

      // Failover: reassign all tasks from the dead node.
      const reassigned = this._distributor.reassignFromNode(node.id);
      logEvent('node.failover', node.id, null, { reassigned });
      bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
    }
  }

  /**
   * Recover tasks that have been assigned/running too long (stuck).
   * A task is only considered stuck once it has exceeded BOTH the global
   * 5-minute floor and its own timeout_ms budget.
   */
  _recoverStuckTasks() {
    const stuckTasks = stmts.getStuckTasks.all(300); // 5 min global floor
    const now = Date.now();

    for (const task of stuckTasks) {
      // Bug fix: honor per-task timeouts longer than the 5-minute floor —
      // previously a task with e.g. timeout_ms=600000 was requeued while
      // still legitimately running, causing duplicate execution.
      // assigned_at is SQLite UTC "YYYY-MM-DD HH:MM:SS".
      const assignedMs = Date.parse(`${task.assigned_at.replace(' ', 'T')}Z`);
      if (Number.isFinite(assignedMs) && now - assignedMs < task.timeout_ms) continue;

      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        logEvent('task.stuck_failed', task.node_id, task.id, {});
      } else {
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        stmts.requeueTask.run(task.id);
        this._distributor._tryAssign(task.id);
        logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
      }
    }
  }

  /**
   * Rebalance tasks across nodes when load is skewed: move up to two
   * not-yet-running ('assigned') tasks from each overloaded node to an
   * underloaded one. Running tasks are never moved.
   */
  _rebalance() {
    const nodes = stmts.listActiveNodes.all();
    if (nodes.length < 2) return;

    const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
    const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
    const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);

    if (overloaded.length === 0 || underloaded.length === 0) return;

    let moved = 0;
    for (const over of overloaded) {
      const tasks = stmts.getTasksByNode.all(over.id);
      // Move up to 2 tasks from overloaded to underloaded.
      const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);

      for (const task of toMove) {
        const target = underloaded.find(n => n.capacity_used < n.capacity_total);
        if (!target) break;

        stmts.decrementNodeLoad.run(over.id);
        stmts.assignTask.run({ id: task.id, node_id: target.id });
        stmts.incrementNodeLoad.run(target.id);
        // Keep the in-memory snapshot consistent for subsequent picks.
        target.capacity_used++;
        moved++;

        logEvent('task.rebalanced', target.id, task.id, { from: over.id });
        this._distributor._notifyWorker(target, task.id, task);
      }
    }

    if (moved > 0) {
      bus.emit('cluster.rebalanced', { tasksMoved: moved });
    }
  }
}
867
+
868
+ // ═══════════════════════════════════════════════════════════════════════════
869
+ // HELPERS
870
+ // ═══════════════════════════════════════════════════════════════════════════
871
+
872
/**
 * Leniently decode a JSON column value.
 *
 * null/undefined and unparseable input yield `fallback`; values that are
 * already objects (e.g. a row decoded upstream) pass through unchanged.
 *
 * @param {*} str - JSON text, an already-decoded object, or null/undefined
 * @param {*} fallback - value to return when decoding is impossible
 * @returns {*} the decoded value, the object itself, or `fallback`
 */
function safeParse(str, fallback) {
  if (str === null || str === undefined) return fallback;
  if (typeof str === 'object') return str;
  let decoded;
  try {
    decoded = JSON.parse(str);
  } catch {
    decoded = fallback;
  }
  return decoded;
}
877
+
878
/**
 * Append a row to the cluster_events audit table.
 *
 * Insert failures are deliberately swallowed: audit logging must never
 * break the cluster operation that triggered it.
 *
 * @param {string} type - event type, e.g. 'task.assigned'
 * @param {?string} nodeId - related node id, if any
 * @param {?string} taskId - related task id, if any
 * @param {?object} data - extra context, JSON-serialized into the row
 */
function logEvent(type, nodeId, taskId, data) {
  const row = {
    event_type: type,
    node_id: nodeId || null,
    task_id: taskId || null,
    data: JSON.stringify(data || {}),
  };
  try {
    stmts.insertEvent.run(row);
  } catch {
    // best-effort logging
  }
}
888
+
889
+ // ─── Singleton ───────────────────────────────────────────────────────
890
+
891
// Module-level singletons: one distributor and one orchestrator per process.
// The classes are exported too so tests can construct isolated instances.
const distributor = new TaskDistributor();
const cluster = new ClusterOrchestrator(distributor);

module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };