web-agent-bridge 3.0.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202) hide show
  1. package/LICENSE +72 -21
  2. package/README.ar.md +1286 -1073
  3. package/README.md +1764 -1535
  4. package/bin/agent-runner.js +474 -474
  5. package/bin/cli.js +237 -138
  6. package/bin/wab.js +80 -80
  7. package/examples/bidi-agent.js +119 -119
  8. package/examples/cross-site-agent.js +91 -91
  9. package/examples/mcp-agent.js +94 -94
  10. package/examples/next-app-router/README.md +44 -44
  11. package/examples/puppeteer-agent.js +108 -108
  12. package/examples/saas-dashboard/README.md +55 -55
  13. package/examples/shopify-hydrogen/README.md +74 -74
  14. package/examples/vision-agent.js +171 -171
  15. package/examples/wordpress-elementor/README.md +77 -77
  16. package/package.json +17 -3
  17. package/public/.well-known/agent-tools.json +180 -180
  18. package/public/.well-known/ai-assets.json +59 -59
  19. package/public/.well-known/ai-plugin.json +28 -0
  20. package/public/.well-known/security.txt +8 -0
  21. package/public/agent-workspace.html +349 -347
  22. package/public/ai.html +198 -196
  23. package/public/api.html +413 -0
  24. package/public/browser.html +486 -484
  25. package/public/commander-dashboard.html +243 -243
  26. package/public/cookies.html +210 -208
  27. package/public/css/agent-workspace.css +1713 -1713
  28. package/public/css/premium.css +317 -317
  29. package/public/css/styles.css +1235 -1235
  30. package/public/dashboard.html +706 -704
  31. package/public/demo.html +1770 -1
  32. package/public/dns.html +507 -0
  33. package/public/docs.html +587 -585
  34. package/public/feed.xml +89 -89
  35. package/public/growth.html +463 -0
  36. package/public/index.html +341 -9
  37. package/public/integrations.html +556 -0
  38. package/public/js/agent-workspace.js +1740 -1740
  39. package/public/js/auth-nav.js +31 -31
  40. package/public/js/auth-redirect.js +12 -12
  41. package/public/js/cookie-consent.js +56 -56
  42. package/public/js/wab-demo-page.js +721 -721
  43. package/public/js/ws-client.js +74 -74
  44. package/public/llms-full.txt +360 -309
  45. package/public/llms.txt +125 -86
  46. package/public/login.html +85 -83
  47. package/public/mesh-dashboard.html +328 -328
  48. package/public/openapi.json +580 -580
  49. package/public/phone-shield.html +281 -0
  50. package/public/premium-dashboard.html +2489 -2487
  51. package/public/premium.html +793 -791
  52. package/public/privacy.html +297 -295
  53. package/public/register.html +105 -103
  54. package/public/robots.txt +87 -87
  55. package/public/script/wab-consent.d.ts +36 -36
  56. package/public/script/wab-consent.js +104 -104
  57. package/public/script/wab-schema.js +131 -131
  58. package/public/script/wab.d.ts +108 -108
  59. package/public/script/wab.min.js +580 -580
  60. package/public/security.txt +8 -0
  61. package/public/terms.html +256 -254
  62. package/script/ai-agent-bridge.js +1754 -1754
  63. package/sdk/README.md +99 -99
  64. package/sdk/agent-mesh.js +449 -449
  65. package/sdk/commander.js +262 -262
  66. package/sdk/index.d.ts +464 -464
  67. package/sdk/index.js +18 -1
  68. package/sdk/multi-agent.js +318 -318
  69. package/sdk/package.json +12 -1
  70. package/sdk/safety-shield.js +219 -0
  71. package/sdk/schema-discovery.js +83 -83
  72. package/server/adapters/index.js +520 -520
  73. package/server/config/plans.js +367 -367
  74. package/server/config/secrets.js +102 -102
  75. package/server/control-plane/index.js +301 -301
  76. package/server/data-plane/index.js +354 -354
  77. package/server/index.js +175 -19
  78. package/server/llm/index.js +404 -404
  79. package/server/middleware/adminAuth.js +35 -35
  80. package/server/middleware/auth.js +50 -50
  81. package/server/middleware/featureGate.js +88 -88
  82. package/server/middleware/rateLimits.js +100 -100
  83. package/server/middleware/sensitiveAction.js +157 -0
  84. package/server/migrations/001_add_analytics_indexes.sql +7 -7
  85. package/server/migrations/002_premium_features.sql +418 -418
  86. package/server/migrations/003_ads_integer_cents.sql +33 -33
  87. package/server/migrations/004_agent_os.sql +158 -158
  88. package/server/migrations/005_marketplace_metering.sql +126 -126
  89. package/server/models/adapters/index.js +33 -33
  90. package/server/models/adapters/mysql.js +183 -183
  91. package/server/models/adapters/postgresql.js +172 -172
  92. package/server/models/adapters/sqlite.js +7 -7
  93. package/server/models/db.js +681 -681
  94. package/server/observability/failure-analysis.js +337 -337
  95. package/server/observability/index.js +394 -394
  96. package/server/protocol/capabilities.js +223 -223
  97. package/server/protocol/index.js +243 -243
  98. package/server/protocol/schema.js +584 -584
  99. package/server/registry/certification.js +271 -271
  100. package/server/registry/index.js +326 -326
  101. package/server/routes/admin-premium.js +671 -671
  102. package/server/routes/admin.js +261 -261
  103. package/server/routes/ads.js +130 -130
  104. package/server/routes/agent-workspace.js +540 -378
  105. package/server/routes/api.js +150 -150
  106. package/server/routes/auth.js +71 -71
  107. package/server/routes/billing.js +45 -45
  108. package/server/routes/commander.js +316 -316
  109. package/server/routes/demo-showcase.js +332 -0
  110. package/server/routes/demo-store.js +154 -0
  111. package/server/routes/discovery.js +417 -406
  112. package/server/routes/gateway.js +173 -0
  113. package/server/routes/license.js +251 -240
  114. package/server/routes/mesh.js +469 -469
  115. package/server/routes/noscript.js +543 -543
  116. package/server/routes/premium-v2.js +686 -686
  117. package/server/routes/premium.js +724 -724
  118. package/server/routes/runtime.js +2148 -2147
  119. package/server/routes/sovereign.js +465 -385
  120. package/server/routes/universal.js +200 -177
  121. package/server/routes/wab-api.js +850 -491
  122. package/server/runtime/container-worker.js +111 -111
  123. package/server/runtime/container.js +448 -448
  124. package/server/runtime/distributed-worker.js +362 -362
  125. package/server/runtime/event-bus.js +210 -210
  126. package/server/runtime/index.js +253 -253
  127. package/server/runtime/queue.js +599 -599
  128. package/server/runtime/replay.js +666 -666
  129. package/server/runtime/sandbox.js +266 -266
  130. package/server/runtime/scheduler.js +534 -534
  131. package/server/runtime/session-engine.js +293 -293
  132. package/server/runtime/state-manager.js +188 -188
  133. package/server/security/cross-site-redactor.js +196 -0
  134. package/server/security/dry-run.js +180 -0
  135. package/server/security/human-gate-rate-limit.js +147 -0
  136. package/server/security/human-gate-transports.js +178 -0
  137. package/server/security/human-gate.js +281 -0
  138. package/server/security/index.js +368 -368
  139. package/server/security/intent-engine.js +245 -0
  140. package/server/security/reward-guard.js +171 -0
  141. package/server/security/rollback-store.js +239 -0
  142. package/server/security/token-scope.js +404 -0
  143. package/server/security/url-policy.js +139 -0
  144. package/server/services/agent-chat.js +506 -506
  145. package/server/services/agent-learning.js +601 -575
  146. package/server/services/agent-memory.js +625 -625
  147. package/server/services/agent-mesh.js +555 -539
  148. package/server/services/agent-symphony.js +717 -717
  149. package/server/services/agent-tasks.js +1807 -1807
  150. package/server/services/api-key-engine.js +292 -0
  151. package/server/services/cluster.js +894 -894
  152. package/server/services/commander.js +738 -738
  153. package/server/services/edge-compute.js +440 -440
  154. package/server/services/email.js +204 -204
  155. package/server/services/hosted-runtime.js +205 -205
  156. package/server/services/lfd.js +635 -616
  157. package/server/services/local-ai.js +389 -389
  158. package/server/services/marketplace.js +270 -270
  159. package/server/services/metering.js +182 -182
  160. package/server/services/modules/affiliate-intelligence.js +93 -0
  161. package/server/services/modules/agent-firewall.js +90 -0
  162. package/server/services/modules/bounty.js +89 -0
  163. package/server/services/modules/collective-bargaining.js +92 -0
  164. package/server/services/modules/dark-pattern.js +66 -0
  165. package/server/services/modules/gov-intelligence.js +45 -0
  166. package/server/services/modules/neural.js +55 -0
  167. package/server/services/modules/notary.js +49 -0
  168. package/server/services/modules/price-time-machine.js +86 -0
  169. package/server/services/modules/protocol.js +104 -0
  170. package/server/services/negotiation.js +439 -439
  171. package/server/services/plugins.js +771 -771
  172. package/server/services/premium.js +1 -1
  173. package/server/services/price-intelligence.js +566 -565
  174. package/server/services/price-shield.js +1137 -1137
  175. package/server/services/reputation.js +465 -465
  176. package/server/services/search-engine.js +357 -357
  177. package/server/services/security.js +513 -513
  178. package/server/services/self-healing.js +843 -843
  179. package/server/services/sovereign-shield.js +542 -0
  180. package/server/services/stripe.js +192 -192
  181. package/server/services/swarm.js +788 -788
  182. package/server/services/universal-scraper.js +662 -661
  183. package/server/services/verification.js +481 -481
  184. package/server/services/vision.js +1163 -1163
  185. package/server/utils/cache.js +125 -125
  186. package/server/utils/migrate.js +81 -81
  187. package/server/utils/safe-fetch.js +228 -0
  188. package/server/utils/secureFields.js +50 -50
  189. package/server/ws.js +161 -161
  190. package/templates/artisan-marketplace.yaml +104 -104
  191. package/templates/book-price-scout.yaml +98 -98
  192. package/templates/electronics-price-tracker.yaml +108 -108
  193. package/templates/flight-deal-hunter.yaml +113 -113
  194. package/templates/freelancer-direct.yaml +116 -116
  195. package/templates/grocery-price-compare.yaml +93 -93
  196. package/templates/hotel-direct-booking.yaml +113 -113
  197. package/templates/local-services.yaml +98 -98
  198. package/templates/olive-oil-tunisia.yaml +88 -88
  199. package/templates/organic-farm-fresh.yaml +101 -101
  200. package/templates/restaurant-direct.yaml +97 -97
  201. package/server/services/fairness-engine.js +0 -409
  202. package/server/services/fairness.js +0 -420
@@ -1,894 +1,894 @@
1
- 'use strict';
2
-
3
- /**
4
- * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
- *
6
- * Turns WAB from a single-server Agent OS into a distributed fleet.
7
- *
8
- * Architecture:
9
- * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
- * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
- * │ (this node) │────▶│ (remote) │ │ (remote) │
12
- * │ │────▶│ │ │ │
13
- * └──────────────┘ └──────────┘ └──────────┘
14
- * │ ▲ ▲
15
- * │ │ │
16
- * └───────────────────┴─────────────────┘
17
- * heartbeat / task results
18
- *
19
- * Components:
20
- * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
- * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
- * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
- *
24
- * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
- * Persistence: SQLite tables for durability across restarts
26
- * Consistency: Leader-based (coordinator is source of truth)
27
- */
28
-
29
- const crypto = require('crypto');
30
- const http = require('http');
31
- const https = require('https');
32
- const { URL } = require('url');
33
- const { db } = require('../models/db');
34
- const { bus } = require('../runtime/event-bus');
35
-
36
- // ─── Schema ──────────────────────────────────────────────────────────
37
-
38
- db.exec(`
39
- CREATE TABLE IF NOT EXISTS cluster_nodes (
40
- id TEXT PRIMARY KEY,
41
- name TEXT NOT NULL,
42
- endpoint TEXT NOT NULL,
43
- region TEXT DEFAULT 'default',
44
- zone TEXT DEFAULT 'a',
45
- role TEXT DEFAULT 'worker',
46
- status TEXT DEFAULT 'joining',
47
- capacity_total INTEGER DEFAULT 20,
48
- capacity_used INTEGER DEFAULT 0,
49
- tags TEXT DEFAULT '[]',
50
- hardware TEXT DEFAULT '{}',
51
- version TEXT,
52
- secret_hash TEXT,
53
- last_heartbeat TEXT DEFAULT (datetime('now')),
54
- registered_at TEXT DEFAULT (datetime('now')),
55
- updated_at TEXT DEFAULT (datetime('now'))
56
- );
57
-
58
- CREATE TABLE IF NOT EXISTS cluster_tasks (
59
- id TEXT PRIMARY KEY,
60
- external_id TEXT,
61
- node_id TEXT,
62
- task_type TEXT NOT NULL,
63
- objective TEXT,
64
- payload TEXT DEFAULT '{}',
65
- priority INTEGER DEFAULT 50,
66
- status TEXT DEFAULT 'pending',
67
- result TEXT,
68
- error TEXT,
69
- attempts INTEGER DEFAULT 0,
70
- max_attempts INTEGER DEFAULT 3,
71
- affinity_tags TEXT DEFAULT '[]',
72
- affinity_region TEXT,
73
- timeout_ms INTEGER DEFAULT 60000,
74
- submitted_at TEXT DEFAULT (datetime('now')),
75
- assigned_at TEXT,
76
- started_at TEXT,
77
- completed_at TEXT
78
- );
79
-
80
- CREATE TABLE IF NOT EXISTS cluster_events (
81
- id INTEGER PRIMARY KEY AUTOINCREMENT,
82
- event_type TEXT NOT NULL,
83
- node_id TEXT,
84
- task_id TEXT,
85
- data TEXT DEFAULT '{}',
86
- created_at TEXT DEFAULT (datetime('now'))
87
- );
88
-
89
- CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
90
- CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
91
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
92
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
93
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
94
- CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
95
- CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
96
- `);
97
-
98
- // ─── Prepared Statements ─────────────────────────────────────────────
99
-
100
- const stmts = {
101
- // Nodes
102
- insertNode: db.prepare(`
103
- INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
104
- VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
105
- `),
106
- updateNode: db.prepare(`
107
- UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
108
- capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
109
- WHERE id=@id
110
- `),
111
- setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
112
- heartbeatNode: db.prepare(`
113
- UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
114
- WHERE id=@id
115
- `),
116
- getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
117
- getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
118
- listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
119
- listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
120
- listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
121
- deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
122
- getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),
123
-
124
- // Tasks
125
- insertTask: db.prepare(`
126
- INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
127
- VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
128
- `),
129
- assignTask: db.prepare(`
130
- UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
131
- WHERE id=@id
132
- `),
133
- startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
134
- completeTask: db.prepare(`
135
- UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
136
- `),
137
- failTask: db.prepare(`
138
- UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
139
- `),
140
- requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
141
- getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
142
- getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
143
- getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
144
- getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
145
- getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
146
- listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
147
- getStuckTasks: db.prepare(`
148
- SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
149
- AND assigned_at < datetime('now', '-' || ? || ' seconds')
150
- `),
151
- countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
152
- incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
153
- decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),
154
-
155
- // Events
156
- insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
157
- getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
158
- getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
159
- };
160
-
161
- // ═══════════════════════════════════════════════════════════════════════════
162
- // TASK DISTRIBUTOR
163
- // ═══════════════════════════════════════════════════════════════════════════
164
-
165
- /**
166
- * Routes tasks to worker nodes based on capacity, affinity, and load balancing.
167
- *
168
- * Strategies:
169
- * - least-loaded: Pick the node with the most free capacity
170
- * - affinity: Match task tags to node tags
171
- * - region: Prefer nodes in the same region as the task
172
- * - round-robin: Distribute evenly across all active nodes
173
- */
174
- class TaskDistributor {
175
- constructor() {
176
- this._roundRobinIndex = 0;
177
- this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
178
- }
179
-
180
- /**
181
- * Submit a task for distributed execution
182
- */
183
- submit(task) {
184
- const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
185
- const entry = {
186
- id,
187
- external_id: task.externalId || null,
188
- task_type: task.type || 'general',
189
- objective: task.objective || '',
190
- payload: JSON.stringify(task.params || {}),
191
- priority: task.priority || 50,
192
- status: 'pending',
193
- affinity_tags: JSON.stringify(task.affinityTags || []),
194
- affinity_region: task.affinityRegion || null,
195
- timeout_ms: task.timeout || 60000,
196
- max_attempts: task.maxAttempts || 3,
197
- };
198
- stmts.insertTask.run(entry);
199
-
200
- bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
201
- this._stats.distributed++;
202
-
203
- // Try immediate assignment
204
- this._tryAssign(id);
205
-
206
- return { taskId: id, status: 'pending' };
207
- }
208
-
209
- /**
210
- * Try to assign a task to a worker node
211
- */
212
- _tryAssign(taskId) {
213
- const task = stmts.getTask.get(taskId);
214
- if (!task || task.status !== 'pending') return false;
215
-
216
- const node = this._selectNode(task);
217
- if (!node) {
218
- this._stats.noCapacity++;
219
- return false;
220
- }
221
-
222
- stmts.assignTask.run({ id: taskId, node_id: node.id });
223
- stmts.incrementNodeLoad.run(node.id);
224
-
225
- logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
226
- bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });
227
-
228
- // Push notification to worker (fire-and-forget)
229
- this._notifyWorker(node, taskId, task);
230
-
231
- return true;
232
- }
233
-
234
- /**
235
- * Select the best node for a task
236
- */
237
- _selectNode(task) {
238
- let candidates = stmts.listActiveNodes.all();
239
- if (candidates.length === 0) return null;
240
-
241
- // Filter by capacity
242
- candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
243
- if (candidates.length === 0) return null;
244
-
245
- const affinityTags = safeParse(task.affinity_tags, []);
246
- const affinityRegion = task.affinity_region;
247
-
248
- // Strategy 1: Region affinity
249
- if (affinityRegion) {
250
- const regionNodes = candidates.filter(n => n.region === affinityRegion);
251
- if (regionNodes.length > 0) {
252
- candidates = regionNodes;
253
- this._lastStrategy = 'region';
254
- }
255
- }
256
-
257
- // Strategy 2: Tag affinity
258
- if (affinityTags.length > 0) {
259
- const tagged = candidates.filter(n => {
260
- const nodeTags = safeParse(n.tags, []);
261
- return affinityTags.some(t => nodeTags.includes(t));
262
- });
263
- if (tagged.length > 0) {
264
- candidates = tagged;
265
- this._lastStrategy = 'affinity';
266
- }
267
- }
268
-
269
- // Strategy 3: Least-loaded
270
- candidates.sort((a, b) => {
271
- const loadA = a.capacity_used / a.capacity_total;
272
- const loadB = b.capacity_used / b.capacity_total;
273
- return loadA - loadB;
274
- });
275
-
276
- this._lastStrategy = this._lastStrategy || 'least-loaded';
277
- return candidates[0];
278
- }
279
-
280
- /**
281
- * Push task notification to a worker node
282
- */
283
- _notifyWorker(node, taskId, task) {
284
- const payload = JSON.stringify({
285
- type: 'task.assigned',
286
- taskId,
287
- taskType: task.task_type,
288
- objective: task.objective,
289
- params: safeParse(task.payload, {}),
290
- priority: task.priority,
291
- timeout: task.timeout_ms,
292
- });
293
-
294
- const url = new URL('/wab-worker/tasks/notify', node.endpoint);
295
- const mod = url.protocol === 'https:' ? https : http;
296
-
297
- const req = mod.request(url, {
298
- method: 'POST',
299
- headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
300
- timeout: 5000,
301
- });
302
- req.on('error', () => { /* best-effort push */ });
303
- req.write(payload);
304
- req.end();
305
- }
306
-
307
- /**
308
- * Reassign tasks from a dead node to other nodes
309
- */
310
- reassignFromNode(nodeId) {
311
- const tasks = stmts.getTasksByNode.all(nodeId);
312
- let reassigned = 0;
313
-
314
- for (const task of tasks) {
315
- if (task.attempts >= task.max_attempts) {
316
- stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
317
- logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
318
- bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
319
- continue;
320
- }
321
-
322
- stmts.decrementNodeLoad.run(nodeId);
323
- stmts.requeueTask.run(task.id);
324
- logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });
325
-
326
- // Try to assign to another node
327
- if (this._tryAssign(task.id)) {
328
- reassigned++;
329
- this._stats.reassigned++;
330
- }
331
- }
332
-
333
- return reassigned;
334
- }
335
-
336
- /**
337
- * Process pending tasks — called periodically
338
- */
339
- processPending() {
340
- const pending = stmts.getPendingTasks.all(50);
341
- let assigned = 0;
342
- for (const task of pending) {
343
- if (this._tryAssign(task.id)) assigned++;
344
- }
345
- return assigned;
346
- }
347
-
348
- /**
349
- * Worker pulls tasks for execution
350
- */
351
- pullTasks(nodeId, limit = 5) {
352
- const node = stmts.getNode.get(nodeId);
353
- if (!node || node.status !== 'active') return [];
354
-
355
- const available = node.capacity_total - node.capacity_used;
356
- if (available <= 0) return [];
357
-
358
- const count = Math.min(limit, available);
359
- const pending = stmts.getPendingTasks.all(count);
360
- const assigned = [];
361
-
362
- for (const task of pending) {
363
- // Check affinity
364
- const affinityRegion = task.affinity_region;
365
- if (affinityRegion && node.region !== affinityRegion) continue;
366
-
367
- const affinityTags = safeParse(task.affinity_tags, []);
368
- const nodeTags = safeParse(node.tags, []);
369
- if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;
370
-
371
- stmts.assignTask.run({ id: task.id, node_id: nodeId });
372
- stmts.incrementNodeLoad.run(nodeId);
373
- logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });
374
-
375
- assigned.push({
376
- taskId: task.id,
377
- type: task.task_type,
378
- objective: task.objective,
379
- params: safeParse(task.payload, {}),
380
- priority: task.priority,
381
- timeout: task.timeout_ms,
382
- });
383
- }
384
-
385
- return assigned;
386
- }
387
-
388
- getStats() { return { ...this._stats }; }
389
- }
390
-
391
- // ═══════════════════════════════════════════════════════════════════════════
392
- // CLUSTER ORCHESTRATOR
393
- // ═══════════════════════════════════════════════════════════════════════════
394
-
395
- /**
396
- * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
397
- *
398
- * Responsibilities:
399
- * - Node registration and authentication
400
- * - Health monitoring via heartbeats
401
- * - Dead node detection and task failover
402
- * - Load rebalancing across the cluster
403
- * - Cluster topology and status reporting
404
- * - Drain and cordon operations
405
- */
406
- class ClusterOrchestrator {
407
- constructor(distributor) {
408
- this._distributor = distributor;
409
- this._heartbeatThresholdSec = 90; // Node considered dead after 90s no heartbeat
410
- this._checkInterval = null;
411
- this._rebalanceInterval = null;
412
- this._started = false;
413
- }
414
-
415
- // ─── Lifecycle ──────────────────────────────────────────────────────
416
-
417
- /**
418
- * Start the orchestrator — begins periodic health checks and task processing
419
- */
420
- start() {
421
- if (this._started) return;
422
- this._started = true;
423
-
424
- // Health check every 30s
425
- this._checkInterval = setInterval(() => {
426
- this._healthCheck();
427
- this._recoverStuckTasks();
428
- this._distributor.processPending();
429
- }, 30_000);
430
- if (this._checkInterval.unref) this._checkInterval.unref();
431
-
432
- // Rebalance every 5 min
433
- this._rebalanceInterval = setInterval(() => {
434
- this._rebalance();
435
- }, 300_000);
436
- if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();
437
-
438
- bus.emit('cluster.started', { timestamp: Date.now() });
439
- }
440
-
441
- /**
442
- * Stop the orchestrator
443
- */
444
- stop() {
445
- if (!this._started) return;
446
- this._started = false;
447
- if (this._checkInterval) clearInterval(this._checkInterval);
448
- if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
449
- bus.emit('cluster.stopped', { timestamp: Date.now() });
450
- }
451
-
452
- // ─── Node Management ───────────────────────────────────────────────
453
-
454
- /**
455
- * Register a worker node to join the cluster
456
- */
457
- registerNode(config) {
458
- if (!config.name || !config.endpoint) {
459
- throw new Error('Node name and endpoint required');
460
- }
461
-
462
- // Check for existing node with same endpoint
463
- const existing = stmts.getNodeByEndpoint.get(config.endpoint);
464
- if (existing) {
465
- // Re-register: update and reactivate
466
- stmts.updateNode.run({
467
- id: existing.id,
468
- name: config.name,
469
- endpoint: config.endpoint,
470
- region: config.region || existing.region,
471
- zone: config.zone || existing.zone,
472
- capacity_total: config.capacity || existing.capacity_total,
473
- tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
474
- hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
475
- version: config.version || existing.version,
476
- });
477
- stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
478
- logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
479
- bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
480
- return { nodeId: existing.id, status: 'active', rejoined: true };
481
- }
482
-
483
- const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
484
- const secretHash = crypto.createHash('sha256')
485
- .update(config.secret || crypto.randomBytes(32).toString('hex'))
486
- .digest('hex');
487
-
488
- stmts.insertNode.run({
489
- id: nodeId,
490
- name: config.name,
491
- endpoint: config.endpoint,
492
- region: config.region || 'default',
493
- zone: config.zone || 'a',
494
- role: config.role || 'worker',
495
- status: 'active',
496
- capacity_total: config.capacity || 20,
497
- tags: JSON.stringify(config.tags || []),
498
- hardware: JSON.stringify(config.hardware || {}),
499
- version: config.version || null,
500
- secret_hash: secretHash,
501
- });
502
-
503
- logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
504
- bus.emit('cluster.node.joined', { nodeId, name: config.name });
505
-
506
- return { nodeId, status: 'active', secret: config.secret ? undefined : undefined };
507
- }
508
-
509
- /**
510
- * Remove a node from the cluster
511
- */
512
- deregisterNode(nodeId) {
513
- const node = stmts.getNode.get(nodeId);
514
- if (!node) return null;
515
-
516
- // Reassign tasks before removing
517
- const reassigned = this._distributor.reassignFromNode(nodeId);
518
- stmts.deleteNode.run(nodeId);
519
-
520
- logEvent('node.deregistered', nodeId, null, { reassigned });
521
- bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });
522
-
523
- return { nodeId, reassigned };
524
- }
525
-
526
- /**
527
- * Process heartbeat from a worker node
528
- */
529
- heartbeat(nodeId, data = {}) {
530
- const node = stmts.getNode.get(nodeId);
531
- if (!node) return null;
532
-
533
- stmts.heartbeatNode.run({
534
- id: nodeId,
535
- capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
536
- });
537
-
538
- // Update hardware profile if provided
539
- if (data.hardware) {
540
- stmts.updateNode.run({
541
- id: nodeId,
542
- name: node.name,
543
- endpoint: node.endpoint,
544
- region: node.region,
545
- zone: node.zone,
546
- capacity_total: data.capacityTotal || node.capacity_total,
547
- tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
548
- hardware: JSON.stringify(data.hardware),
549
- version: data.version || node.version,
550
- });
551
- }
552
-
553
- return {
554
- nodeId,
555
- status: 'active',
556
- pendingTasks: stmts.getPendingTasks.all(1).length > 0,
557
- };
558
- }
559
-
560
- /**
561
- * Drain a node — stop assigning new tasks, wait for running tasks to finish
562
- */
563
- drainNode(nodeId) {
564
- const node = stmts.getNode.get(nodeId);
565
- if (!node) return null;
566
-
567
- stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
568
- logEvent('node.draining', nodeId, null, {});
569
- bus.emit('cluster.node.draining', { nodeId, name: node.name });
570
-
571
- return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
572
- }
573
-
574
- /**
575
- * Cordon a node — prevent scheduling but keep running tasks
576
- */
577
- cordonNode(nodeId) {
578
- const node = stmts.getNode.get(nodeId);
579
- if (!node) return null;
580
-
581
- stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
582
- logEvent('node.cordoned', nodeId, null, {});
583
- bus.emit('cluster.node.cordoned', { nodeId, name: node.name });
584
-
585
- return { nodeId, status: 'cordoned' };
586
- }
587
-
588
- /**
589
- * Uncordon a node — allow scheduling again
590
- */
591
- uncordonNode(nodeId) {
592
- const node = stmts.getNode.get(nodeId);
593
- if (!node) return null;
594
-
595
- stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
596
- logEvent('node.uncordoned', nodeId, null, {});
597
-
598
- return { nodeId, status: 'active' };
599
- }
600
-
601
- /**
602
- * Get node details
603
- */
604
- getNode(nodeId) {
605
- const node = stmts.getNode.get(nodeId);
606
- if (!node) return null;
607
- node.tags = safeParse(node.tags, []);
608
- node.hardware = safeParse(node.hardware, {});
609
- node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
610
- return node;
611
- }
612
-
613
- /**
614
- * List all cluster nodes
615
- */
616
- listNodes(filter = {}) {
617
- let nodes;
618
- if (filter.region) {
619
- nodes = stmts.listNodesByRegion.all(filter.region);
620
- } else if (filter.active) {
621
- nodes = stmts.listActiveNodes.all();
622
- } else {
623
- nodes = stmts.listNodes.all();
624
- }
625
- return nodes.map(n => ({
626
- ...n,
627
- tags: safeParse(n.tags, []),
628
- hardware: safeParse(n.hardware, {}),
629
- }));
630
- }
631
-
632
- // ─── Task Reporting ─────────────────────────────────────────────────
633
-
634
- /**
635
- * Worker reports task started
636
- */
637
- reportTaskStarted(taskId) {
638
- const task = stmts.getTask.get(taskId);
639
- if (!task) return null;
640
- stmts.startTask.run(taskId);
641
- logEvent('task.started', task.node_id, taskId, {});
642
- bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
643
- return { taskId, status: 'running' };
644
- }
645
-
646
- /**
647
- * Worker reports task completed
648
- */
649
- reportTaskCompleted(taskId, result) {
650
- const task = stmts.getTask.get(taskId);
651
- if (!task) return null;
652
-
653
- stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
654
- if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
655
-
656
- logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
657
- bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });
658
-
659
- return { taskId, status: 'completed' };
660
- }
661
-
662
- /**
663
- * Worker reports task failed
664
- */
665
- reportTaskFailed(taskId, error) {
666
- const task = stmts.getTask.get(taskId);
667
- if (!task) return null;
668
-
669
- if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
670
-
671
- // Retry if attempts remaining
672
- if (task.attempts < task.max_attempts) {
673
- stmts.requeueTask.run(taskId);
674
- logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
675
- bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });
676
-
677
- // Try to assign to a different node
678
- this._distributor._tryAssign(taskId);
679
-
680
- return { taskId, status: 'retrying', attempt: task.attempts };
681
- }
682
-
683
- // Max attempts reached
684
- stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
685
- logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
686
- bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });
687
-
688
- return { taskId, status: 'failed' };
689
- }
690
-
691
- /**
692
- * Get task details
693
- */
694
- getTask(taskId) {
695
- const task = stmts.getTask.get(taskId);
696
- if (!task) return null;
697
- task.payload = safeParse(task.payload, {});
698
- task.affinity_tags = safeParse(task.affinity_tags, []);
699
- task.result = safeParse(task.result, null);
700
- return task;
701
- }
702
-
703
- /**
704
- * List tasks with optional status filter
705
- */
706
- listTasks(filter = {}) {
707
- let tasks;
708
- if (filter.status) {
709
- tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
710
- } else if (filter.nodeId) {
711
- tasks = stmts.getTasksByNode.all(filter.nodeId);
712
- } else {
713
- tasks = stmts.listTasks.all(filter.limit || 50);
714
- }
715
- return tasks.map(t => ({
716
- ...t,
717
- payload: safeParse(t.payload, {}),
718
- affinity_tags: safeParse(t.affinity_tags, []),
719
- result: safeParse(t.result, null),
720
- }));
721
- }
722
-
723
- // ─── Cluster Topology ───────────────────────────────────────────────
724
-
725
- /**
726
- * Get full cluster status
727
- */
728
- getClusterStatus() {
729
- const nodes = stmts.listNodes.all();
730
- const taskCounts = {};
731
- for (const row of stmts.countByStatus.all()) {
732
- taskCounts[row.status] = row.count;
733
- }
734
-
735
- const activeNodes = nodes.filter(n => n.status === 'active');
736
- const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
737
- const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);
738
-
739
- // Group by region
740
- const regions = {};
741
- for (const node of nodes) {
742
- if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
743
- regions[node.region].nodes++;
744
- if (node.status === 'active') {
745
- regions[node.region].active++;
746
- regions[node.region].capacity += node.capacity_total;
747
- regions[node.region].used += node.capacity_used;
748
- }
749
- }
750
-
751
- return {
752
- coordinator: { started: this._started },
753
- nodes: {
754
- total: nodes.length,
755
- active: activeNodes.length,
756
- draining: nodes.filter(n => n.status === 'draining').length,
757
- cordoned: nodes.filter(n => n.status === 'cordoned').length,
758
- dead: nodes.filter(n => n.status === 'dead').length,
759
- },
760
- capacity: {
761
- total: totalCapacity,
762
- used: usedCapacity,
763
- available: totalCapacity - usedCapacity,
764
- utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
765
- },
766
- tasks: taskCounts,
767
- regions,
768
- distributor: this._distributor.getStats(),
769
- };
770
- }
771
-
772
- /**
773
- * Get cluster events log
774
- */
775
- getEvents(limit = 100, nodeId = null) {
776
- if (nodeId) {
777
- return stmts.getEventsByNode.all(nodeId, limit).map(e => ({
778
- ...e,
779
- data: safeParse(e.data, {}),
780
- }));
781
- }
782
- return stmts.getEvents.all(limit).map(e => ({
783
- ...e,
784
- data: safeParse(e.data, {}),
785
- }));
786
- }
787
-
788
- // ─── Internal Operations ────────────────────────────────────────────
789
-
790
- /**
791
- * Check for dead nodes and failover their tasks
792
- */
793
- _healthCheck() {
794
- const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);
795
-
796
- for (const node of staleNodes) {
797
- stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
798
- logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
799
- bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });
800
-
801
- // Failover: reassign all tasks from dead node
802
- const reassigned = this._distributor.reassignFromNode(node.id);
803
- logEvent('node.failover', node.id, null, { reassigned });
804
- bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
805
- }
806
- }
807
-
808
- /**
809
- * Recover tasks that have been assigned/running too long (stuck)
810
- */
811
- _recoverStuckTasks() {
812
- const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold
813
-
814
- for (const task of stuckTasks) {
815
- if (task.attempts >= task.max_attempts) {
816
- stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
817
- if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
818
- logEvent('task.stuck_failed', task.node_id, task.id, {});
819
- } else {
820
- if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
821
- stmts.requeueTask.run(task.id);
822
- this._distributor._tryAssign(task.id);
823
- logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
824
- }
825
- }
826
- }
827
-
828
- /**
829
- * Rebalance tasks across nodes when load is skewed
830
- */
831
- _rebalance() {
832
- const nodes = stmts.listActiveNodes.all();
833
- if (nodes.length < 2) return;
834
-
835
- const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
836
- const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
837
- const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);
838
-
839
- if (overloaded.length === 0 || underloaded.length === 0) return;
840
-
841
- let moved = 0;
842
- for (const over of overloaded) {
843
- const tasks = stmts.getTasksByNode.all(over.id);
844
- // Move up to 2 tasks from overloaded to underloaded
845
- const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);
846
-
847
- for (const task of toMove) {
848
- const target = underloaded.find(n => n.capacity_used < n.capacity_total);
849
- if (!target) break;
850
-
851
- stmts.decrementNodeLoad.run(over.id);
852
- stmts.assignTask.run({ id: task.id, node_id: target.id });
853
- stmts.incrementNodeLoad.run(target.id);
854
- target.capacity_used++;
855
- moved++;
856
-
857
- logEvent('task.rebalanced', target.id, task.id, { from: over.id });
858
- this._distributor._notifyWorker(target, task.id, task);
859
- }
860
- }
861
-
862
- if (moved > 0) {
863
- bus.emit('cluster.rebalanced', { tasksMoved: moved });
864
- }
865
- }
866
- }
867
-
868
- // ═══════════════════════════════════════════════════════════════════════════
869
- // HELPERS
870
- // ═══════════════════════════════════════════════════════════════════════════
871
-
872
- function safeParse(str, fallback) {
873
- if (str == null) return fallback;
874
- if (typeof str === 'object') return str;
875
- try { return JSON.parse(str); } catch { return fallback; }
876
- }
877
-
878
- function logEvent(type, nodeId, taskId, data) {
879
- try {
880
- stmts.insertEvent.run({
881
- event_type: type,
882
- node_id: nodeId || null,
883
- task_id: taskId || null,
884
- data: JSON.stringify(data || {}),
885
- });
886
- } catch { /* best-effort logging */ }
887
- }
888
-
889
- // ─── Singleton ───────────────────────────────────────────────────────
890
-
891
- const distributor = new TaskDistributor();
892
- const cluster = new ClusterOrchestrator(distributor);
893
-
894
- module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };
1
+ 'use strict';
2
+
3
+ /**
4
+ * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
+ *
6
+ * Turns WAB from a single-server Agent OS into a distributed fleet.
7
+ *
8
+ * Architecture:
9
+ * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
+ * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
+ * │ (this node) │────▶│ (remote) │ │ (remote) │
12
+ * │ │────▶│ │ │ │
13
+ * └──────────────┘ └──────────┘ └──────────┘
14
+ * │ ▲ ▲
15
+ * │ │ │
16
+ * └───────────────────┴─────────────────┘
17
+ * heartbeat / task results
18
+ *
19
+ * Components:
20
+ * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
+ * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
+ * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
+ *
24
+ * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
+ * Persistence: SQLite tables for durability across restarts
26
+ * Consistency: Leader-based (coordinator is source of truth)
27
+ */
28
+
29
+ const crypto = require('crypto');
30
+ const http = require('http');
31
+ const https = require('https');
32
+ const { URL } = require('url');
33
+ const { db } = require('../models/db');
34
+ const { bus } = require('../runtime/event-bus');
35
+
36
+ // ─── Schema ──────────────────────────────────────────────────────────
37
+
38
+ db.exec(`
39
+ CREATE TABLE IF NOT EXISTS cluster_nodes (
40
+ id TEXT PRIMARY KEY,
41
+ name TEXT NOT NULL,
42
+ endpoint TEXT NOT NULL,
43
+ region TEXT DEFAULT 'default',
44
+ zone TEXT DEFAULT 'a',
45
+ role TEXT DEFAULT 'worker',
46
+ status TEXT DEFAULT 'joining',
47
+ capacity_total INTEGER DEFAULT 20,
48
+ capacity_used INTEGER DEFAULT 0,
49
+ tags TEXT DEFAULT '[]',
50
+ hardware TEXT DEFAULT '{}',
51
+ version TEXT,
52
+ secret_hash TEXT,
53
+ last_heartbeat TEXT DEFAULT (datetime('now')),
54
+ registered_at TEXT DEFAULT (datetime('now')),
55
+ updated_at TEXT DEFAULT (datetime('now'))
56
+ );
57
+
58
+ CREATE TABLE IF NOT EXISTS cluster_tasks (
59
+ id TEXT PRIMARY KEY,
60
+ external_id TEXT,
61
+ node_id TEXT,
62
+ task_type TEXT NOT NULL,
63
+ objective TEXT,
64
+ payload TEXT DEFAULT '{}',
65
+ priority INTEGER DEFAULT 50,
66
+ status TEXT DEFAULT 'pending',
67
+ result TEXT,
68
+ error TEXT,
69
+ attempts INTEGER DEFAULT 0,
70
+ max_attempts INTEGER DEFAULT 3,
71
+ affinity_tags TEXT DEFAULT '[]',
72
+ affinity_region TEXT,
73
+ timeout_ms INTEGER DEFAULT 60000,
74
+ submitted_at TEXT DEFAULT (datetime('now')),
75
+ assigned_at TEXT,
76
+ started_at TEXT,
77
+ completed_at TEXT
78
+ );
79
+
80
+ CREATE TABLE IF NOT EXISTS cluster_events (
81
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
82
+ event_type TEXT NOT NULL,
83
+ node_id TEXT,
84
+ task_id TEXT,
85
+ data TEXT DEFAULT '{}',
86
+ created_at TEXT DEFAULT (datetime('now'))
87
+ );
88
+
89
+ CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
90
+ CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
91
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
92
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
93
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
94
+ CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
95
+ CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
96
+ `);
97
+
98
+ // ─── Prepared Statements ─────────────────────────────────────────────
99
+
100
+ const stmts = {
101
+ // Nodes
102
+ insertNode: db.prepare(`
103
+ INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
104
+ VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
105
+ `),
106
+ updateNode: db.prepare(`
107
+ UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
108
+ capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
109
+ WHERE id=@id
110
+ `),
111
+ setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
112
+ heartbeatNode: db.prepare(`
113
+ UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
114
+ WHERE id=@id
115
+ `),
116
+ getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
117
+ getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
118
+ listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
119
+ listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
120
+ listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
121
+ deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
122
+ getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),
123
+
124
+ // Tasks
125
+ insertTask: db.prepare(`
126
+ INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
127
+ VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
128
+ `),
129
+ assignTask: db.prepare(`
130
+ UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
131
+ WHERE id=@id
132
+ `),
133
+ startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
134
+ completeTask: db.prepare(`
135
+ UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
136
+ `),
137
+ failTask: db.prepare(`
138
+ UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
139
+ `),
140
+ requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
141
+ getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
142
+ getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
143
+ getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
144
+ getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
145
+ getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
146
+ listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
147
+ getStuckTasks: db.prepare(`
148
+ SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
149
+ AND assigned_at < datetime('now', '-' || ? || ' seconds')
150
+ `),
151
+ countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
152
+ incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
153
+ decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),
154
+
155
+ // Events
156
+ insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
157
+ getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
158
+ getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
159
+ };
160
+
161
+ // ═══════════════════════════════════════════════════════════════════════════
162
+ // TASK DISTRIBUTOR
163
+ // ═══════════════════════════════════════════════════════════════════════════
164
+
165
+ /**
166
+ * Routes tasks to worker nodes based on capacity, affinity, and load balancing.
167
+ *
168
+ * Strategies:
169
+ * - least-loaded: Pick the node with the most free capacity
170
+ * - affinity: Match task tags to node tags
171
+ * - region: Prefer nodes in the same region as the task
172
+ * - round-robin: Distribute evenly across all active nodes
173
+ */
174
+ class TaskDistributor {
175
+ constructor() {
176
+ this._roundRobinIndex = 0;
177
+ this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
178
+ }
179
+
180
+ /**
181
+ * Submit a task for distributed execution
182
+ */
183
+ submit(task) {
184
+ const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
185
+ const entry = {
186
+ id,
187
+ external_id: task.externalId || null,
188
+ task_type: task.type || 'general',
189
+ objective: task.objective || '',
190
+ payload: JSON.stringify(task.params || {}),
191
+ priority: task.priority || 50,
192
+ status: 'pending',
193
+ affinity_tags: JSON.stringify(task.affinityTags || []),
194
+ affinity_region: task.affinityRegion || null,
195
+ timeout_ms: task.timeout || 60000,
196
+ max_attempts: task.maxAttempts || 3,
197
+ };
198
+ stmts.insertTask.run(entry);
199
+
200
+ bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
201
+ this._stats.distributed++;
202
+
203
+ // Try immediate assignment
204
+ this._tryAssign(id);
205
+
206
+ return { taskId: id, status: 'pending' };
207
+ }
208
+
209
+ /**
210
+ * Try to assign a task to a worker node
211
+ */
212
+ _tryAssign(taskId) {
213
+ const task = stmts.getTask.get(taskId);
214
+ if (!task || task.status !== 'pending') return false;
215
+
216
+ const node = this._selectNode(task);
217
+ if (!node) {
218
+ this._stats.noCapacity++;
219
+ return false;
220
+ }
221
+
222
+ stmts.assignTask.run({ id: taskId, node_id: node.id });
223
+ stmts.incrementNodeLoad.run(node.id);
224
+
225
+ logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
226
+ bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });
227
+
228
+ // Push notification to worker (fire-and-forget)
229
+ this._notifyWorker(node, taskId, task);
230
+
231
+ return true;
232
+ }
233
+
234
+ /**
235
+ * Select the best node for a task
236
+ */
237
+ _selectNode(task) {
238
+ let candidates = stmts.listActiveNodes.all();
239
+ if (candidates.length === 0) return null;
240
+
241
+ // Filter by capacity
242
+ candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
243
+ if (candidates.length === 0) return null;
244
+
245
+ const affinityTags = safeParse(task.affinity_tags, []);
246
+ const affinityRegion = task.affinity_region;
247
+
248
+ // Strategy 1: Region affinity
249
+ if (affinityRegion) {
250
+ const regionNodes = candidates.filter(n => n.region === affinityRegion);
251
+ if (regionNodes.length > 0) {
252
+ candidates = regionNodes;
253
+ this._lastStrategy = 'region';
254
+ }
255
+ }
256
+
257
+ // Strategy 2: Tag affinity
258
+ if (affinityTags.length > 0) {
259
+ const tagged = candidates.filter(n => {
260
+ const nodeTags = safeParse(n.tags, []);
261
+ return affinityTags.some(t => nodeTags.includes(t));
262
+ });
263
+ if (tagged.length > 0) {
264
+ candidates = tagged;
265
+ this._lastStrategy = 'affinity';
266
+ }
267
+ }
268
+
269
+ // Strategy 3: Least-loaded
270
+ candidates.sort((a, b) => {
271
+ const loadA = a.capacity_used / a.capacity_total;
272
+ const loadB = b.capacity_used / b.capacity_total;
273
+ return loadA - loadB;
274
+ });
275
+
276
+ this._lastStrategy = this._lastStrategy || 'least-loaded';
277
+ return candidates[0];
278
+ }
279
+
280
+ /**
281
+ * Push task notification to a worker node
282
+ */
283
+ _notifyWorker(node, taskId, task) {
284
+ const payload = JSON.stringify({
285
+ type: 'task.assigned',
286
+ taskId,
287
+ taskType: task.task_type,
288
+ objective: task.objective,
289
+ params: safeParse(task.payload, {}),
290
+ priority: task.priority,
291
+ timeout: task.timeout_ms,
292
+ });
293
+
294
+ const url = new URL('/wab-worker/tasks/notify', node.endpoint);
295
+ const mod = url.protocol === 'https:' ? https : http;
296
+
297
+ const req = mod.request(url, {
298
+ method: 'POST',
299
+ headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
300
+ timeout: 5000,
301
+ });
302
+ req.on('error', () => { /* best-effort push */ });
303
+ req.write(payload);
304
+ req.end();
305
+ }
306
+
307
+ /**
308
+ * Reassign tasks from a dead node to other nodes
309
+ */
310
+ reassignFromNode(nodeId) {
311
+ const tasks = stmts.getTasksByNode.all(nodeId);
312
+ let reassigned = 0;
313
+
314
+ for (const task of tasks) {
315
+ if (task.attempts >= task.max_attempts) {
316
+ stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
317
+ logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
318
+ bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
319
+ continue;
320
+ }
321
+
322
+ stmts.decrementNodeLoad.run(nodeId);
323
+ stmts.requeueTask.run(task.id);
324
+ logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });
325
+
326
+ // Try to assign to another node
327
+ if (this._tryAssign(task.id)) {
328
+ reassigned++;
329
+ this._stats.reassigned++;
330
+ }
331
+ }
332
+
333
+ return reassigned;
334
+ }
335
+
336
+ /**
337
+ * Process pending tasks — called periodically
338
+ */
339
+ processPending() {
340
+ const pending = stmts.getPendingTasks.all(50);
341
+ let assigned = 0;
342
+ for (const task of pending) {
343
+ if (this._tryAssign(task.id)) assigned++;
344
+ }
345
+ return assigned;
346
+ }
347
+
348
+ /**
349
+ * Worker pulls tasks for execution
350
+ */
351
+ pullTasks(nodeId, limit = 5) {
352
+ const node = stmts.getNode.get(nodeId);
353
+ if (!node || node.status !== 'active') return [];
354
+
355
+ const available = node.capacity_total - node.capacity_used;
356
+ if (available <= 0) return [];
357
+
358
+ const count = Math.min(limit, available);
359
+ const pending = stmts.getPendingTasks.all(count);
360
+ const assigned = [];
361
+
362
+ for (const task of pending) {
363
+ // Check affinity
364
+ const affinityRegion = task.affinity_region;
365
+ if (affinityRegion && node.region !== affinityRegion) continue;
366
+
367
+ const affinityTags = safeParse(task.affinity_tags, []);
368
+ const nodeTags = safeParse(node.tags, []);
369
+ if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;
370
+
371
+ stmts.assignTask.run({ id: task.id, node_id: nodeId });
372
+ stmts.incrementNodeLoad.run(nodeId);
373
+ logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });
374
+
375
+ assigned.push({
376
+ taskId: task.id,
377
+ type: task.task_type,
378
+ objective: task.objective,
379
+ params: safeParse(task.payload, {}),
380
+ priority: task.priority,
381
+ timeout: task.timeout_ms,
382
+ });
383
+ }
384
+
385
+ return assigned;
386
+ }
387
+
388
+ getStats() { return { ...this._stats }; }
389
+ }
390
+
391
+ // ═══════════════════════════════════════════════════════════════════════════
392
+ // CLUSTER ORCHESTRATOR
393
+ // ═══════════════════════════════════════════════════════════════════════════
394
+
395
+ /**
396
+ * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
397
+ *
398
+ * Responsibilities:
399
+ * - Node registration and authentication
400
+ * - Health monitoring via heartbeats
401
+ * - Dead node detection and task failover
402
+ * - Load rebalancing across the cluster
403
+ * - Cluster topology and status reporting
404
+ * - Drain and cordon operations
405
+ */
406
+ class ClusterOrchestrator {
407
+ constructor(distributor) {
408
+ this._distributor = distributor;
409
+ this._heartbeatThresholdSec = 90; // Node considered dead after 90s no heartbeat
410
+ this._checkInterval = null;
411
+ this._rebalanceInterval = null;
412
+ this._started = false;
413
+ }
414
+
415
+ // ─── Lifecycle ──────────────────────────────────────────────────────
416
+
417
+ /**
418
+ * Start the orchestrator — begins periodic health checks and task processing
419
+ */
420
+ start() {
421
+ if (this._started) return;
422
+ this._started = true;
423
+
424
+ // Health check every 30s
425
+ this._checkInterval = setInterval(() => {
426
+ this._healthCheck();
427
+ this._recoverStuckTasks();
428
+ this._distributor.processPending();
429
+ }, 30_000);
430
+ if (this._checkInterval.unref) this._checkInterval.unref();
431
+
432
+ // Rebalance every 5 min
433
+ this._rebalanceInterval = setInterval(() => {
434
+ this._rebalance();
435
+ }, 300_000);
436
+ if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();
437
+
438
+ bus.emit('cluster.started', { timestamp: Date.now() });
439
+ }
440
+
441
+ /**
442
+ * Stop the orchestrator
443
+ */
444
+ stop() {
445
+ if (!this._started) return;
446
+ this._started = false;
447
+ if (this._checkInterval) clearInterval(this._checkInterval);
448
+ if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
449
+ bus.emit('cluster.stopped', { timestamp: Date.now() });
450
+ }
451
+
452
+ // ─── Node Management ───────────────────────────────────────────────
453
+
454
+ /**
455
+ * Register a worker node to join the cluster
456
+ */
457
+ registerNode(config) {
458
+ if (!config.name || !config.endpoint) {
459
+ throw new Error('Node name and endpoint required');
460
+ }
461
+
462
+ // Check for existing node with same endpoint
463
+ const existing = stmts.getNodeByEndpoint.get(config.endpoint);
464
+ if (existing) {
465
+ // Re-register: update and reactivate
466
+ stmts.updateNode.run({
467
+ id: existing.id,
468
+ name: config.name,
469
+ endpoint: config.endpoint,
470
+ region: config.region || existing.region,
471
+ zone: config.zone || existing.zone,
472
+ capacity_total: config.capacity || existing.capacity_total,
473
+ tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
474
+ hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
475
+ version: config.version || existing.version,
476
+ });
477
+ stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
478
+ logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
479
+ bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
480
+ return { nodeId: existing.id, status: 'active', rejoined: true };
481
+ }
482
+
483
+ const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
484
+ const secretHash = crypto.createHash('sha256')
485
+ .update(config.secret || crypto.randomBytes(32).toString('hex'))
486
+ .digest('hex');
487
+
488
+ stmts.insertNode.run({
489
+ id: nodeId,
490
+ name: config.name,
491
+ endpoint: config.endpoint,
492
+ region: config.region || 'default',
493
+ zone: config.zone || 'a',
494
+ role: config.role || 'worker',
495
+ status: 'active',
496
+ capacity_total: config.capacity || 20,
497
+ tags: JSON.stringify(config.tags || []),
498
+ hardware: JSON.stringify(config.hardware || {}),
499
+ version: config.version || null,
500
+ secret_hash: secretHash,
501
+ });
502
+
503
+ logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
504
+ bus.emit('cluster.node.joined', { nodeId, name: config.name });
505
+
506
+ return { nodeId, status: 'active', secret: config.secret ? undefined : undefined };
507
+ }
508
+
509
+ /**
510
+ * Remove a node from the cluster
511
+ */
512
+ deregisterNode(nodeId) {
513
+ const node = stmts.getNode.get(nodeId);
514
+ if (!node) return null;
515
+
516
+ // Reassign tasks before removing
517
+ const reassigned = this._distributor.reassignFromNode(nodeId);
518
+ stmts.deleteNode.run(nodeId);
519
+
520
+ logEvent('node.deregistered', nodeId, null, { reassigned });
521
+ bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });
522
+
523
+ return { nodeId, reassigned };
524
+ }
525
+
526
+ /**
527
+ * Process heartbeat from a worker node
528
+ */
529
+ heartbeat(nodeId, data = {}) {
530
+ const node = stmts.getNode.get(nodeId);
531
+ if (!node) return null;
532
+
533
+ stmts.heartbeatNode.run({
534
+ id: nodeId,
535
+ capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
536
+ });
537
+
538
+ // Update hardware profile if provided
539
+ if (data.hardware) {
540
+ stmts.updateNode.run({
541
+ id: nodeId,
542
+ name: node.name,
543
+ endpoint: node.endpoint,
544
+ region: node.region,
545
+ zone: node.zone,
546
+ capacity_total: data.capacityTotal || node.capacity_total,
547
+ tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
548
+ hardware: JSON.stringify(data.hardware),
549
+ version: data.version || node.version,
550
+ });
551
+ }
552
+
553
+ return {
554
+ nodeId,
555
+ status: 'active',
556
+ pendingTasks: stmts.getPendingTasks.all(1).length > 0,
557
+ };
558
+ }
559
+
560
+ /**
561
+ * Drain a node — stop assigning new tasks, wait for running tasks to finish
562
+ */
563
+ drainNode(nodeId) {
564
+ const node = stmts.getNode.get(nodeId);
565
+ if (!node) return null;
566
+
567
+ stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
568
+ logEvent('node.draining', nodeId, null, {});
569
+ bus.emit('cluster.node.draining', { nodeId, name: node.name });
570
+
571
+ return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
572
+ }
573
+
574
+ /**
575
+ * Cordon a node — prevent scheduling but keep running tasks
576
+ */
577
+ cordonNode(nodeId) {
578
+ const node = stmts.getNode.get(nodeId);
579
+ if (!node) return null;
580
+
581
+ stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
582
+ logEvent('node.cordoned', nodeId, null, {});
583
+ bus.emit('cluster.node.cordoned', { nodeId, name: node.name });
584
+
585
+ return { nodeId, status: 'cordoned' };
586
+ }
587
+
588
+ /**
589
+ * Uncordon a node — allow scheduling again
590
+ */
591
+ uncordonNode(nodeId) {
592
+ const node = stmts.getNode.get(nodeId);
593
+ if (!node) return null;
594
+
595
+ stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
596
+ logEvent('node.uncordoned', nodeId, null, {});
597
+
598
+ return { nodeId, status: 'active' };
599
+ }
600
+
601
+ /**
602
+ * Get node details
603
+ */
604
+ getNode(nodeId) {
605
+ const node = stmts.getNode.get(nodeId);
606
+ if (!node) return null;
607
+ node.tags = safeParse(node.tags, []);
608
+ node.hardware = safeParse(node.hardware, {});
609
+ node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
610
+ return node;
611
+ }
612
+
613
+ /**
614
+ * List all cluster nodes
615
+ */
616
+ listNodes(filter = {}) {
617
+ let nodes;
618
+ if (filter.region) {
619
+ nodes = stmts.listNodesByRegion.all(filter.region);
620
+ } else if (filter.active) {
621
+ nodes = stmts.listActiveNodes.all();
622
+ } else {
623
+ nodes = stmts.listNodes.all();
624
+ }
625
+ return nodes.map(n => ({
626
+ ...n,
627
+ tags: safeParse(n.tags, []),
628
+ hardware: safeParse(n.hardware, {}),
629
+ }));
630
+ }
631
+
632
+ // ─── Task Reporting ─────────────────────────────────────────────────
633
+
634
+ /**
635
+ * Worker reports task started
636
+ */
637
+ reportTaskStarted(taskId) {
638
+ const task = stmts.getTask.get(taskId);
639
+ if (!task) return null;
640
+ stmts.startTask.run(taskId);
641
+ logEvent('task.started', task.node_id, taskId, {});
642
+ bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
643
+ return { taskId, status: 'running' };
644
+ }
645
+
646
+ /**
647
+ * Worker reports task completed
648
+ */
649
+ reportTaskCompleted(taskId, result) {
650
+ const task = stmts.getTask.get(taskId);
651
+ if (!task) return null;
652
+
653
+ stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
654
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
655
+
656
+ logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
657
+ bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });
658
+
659
+ return { taskId, status: 'completed' };
660
+ }
661
+
662
+ /**
663
+ * Worker reports task failed
664
+ */
665
+ reportTaskFailed(taskId, error) {
666
+ const task = stmts.getTask.get(taskId);
667
+ if (!task) return null;
668
+
669
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
670
+
671
+ // Retry if attempts remaining
672
+ if (task.attempts < task.max_attempts) {
673
+ stmts.requeueTask.run(taskId);
674
+ logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
675
+ bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });
676
+
677
+ // Try to assign to a different node
678
+ this._distributor._tryAssign(taskId);
679
+
680
+ return { taskId, status: 'retrying', attempt: task.attempts };
681
+ }
682
+
683
+ // Max attempts reached
684
+ stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
685
+ logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
686
+ bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });
687
+
688
+ return { taskId, status: 'failed' };
689
+ }
690
+
691
+ /**
692
+ * Get task details
693
+ */
694
+ getTask(taskId) {
695
+ const task = stmts.getTask.get(taskId);
696
+ if (!task) return null;
697
+ task.payload = safeParse(task.payload, {});
698
+ task.affinity_tags = safeParse(task.affinity_tags, []);
699
+ task.result = safeParse(task.result, null);
700
+ return task;
701
+ }
702
+
703
+ /**
704
+ * List tasks with optional status filter
705
+ */
706
+ listTasks(filter = {}) {
707
+ let tasks;
708
+ if (filter.status) {
709
+ tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
710
+ } else if (filter.nodeId) {
711
+ tasks = stmts.getTasksByNode.all(filter.nodeId);
712
+ } else {
713
+ tasks = stmts.listTasks.all(filter.limit || 50);
714
+ }
715
+ return tasks.map(t => ({
716
+ ...t,
717
+ payload: safeParse(t.payload, {}),
718
+ affinity_tags: safeParse(t.affinity_tags, []),
719
+ result: safeParse(t.result, null),
720
+ }));
721
+ }
722
+
723
+ // ─── Cluster Topology ───────────────────────────────────────────────
724
+
725
+ /**
726
+ * Get full cluster status
727
+ */
728
+ getClusterStatus() {
729
+ const nodes = stmts.listNodes.all();
730
+ const taskCounts = {};
731
+ for (const row of stmts.countByStatus.all()) {
732
+ taskCounts[row.status] = row.count;
733
+ }
734
+
735
+ const activeNodes = nodes.filter(n => n.status === 'active');
736
+ const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
737
+ const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);
738
+
739
+ // Group by region
740
+ const regions = {};
741
+ for (const node of nodes) {
742
+ if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
743
+ regions[node.region].nodes++;
744
+ if (node.status === 'active') {
745
+ regions[node.region].active++;
746
+ regions[node.region].capacity += node.capacity_total;
747
+ regions[node.region].used += node.capacity_used;
748
+ }
749
+ }
750
+
751
+ return {
752
+ coordinator: { started: this._started },
753
+ nodes: {
754
+ total: nodes.length,
755
+ active: activeNodes.length,
756
+ draining: nodes.filter(n => n.status === 'draining').length,
757
+ cordoned: nodes.filter(n => n.status === 'cordoned').length,
758
+ dead: nodes.filter(n => n.status === 'dead').length,
759
+ },
760
+ capacity: {
761
+ total: totalCapacity,
762
+ used: usedCapacity,
763
+ available: totalCapacity - usedCapacity,
764
+ utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
765
+ },
766
+ tasks: taskCounts,
767
+ regions,
768
+ distributor: this._distributor.getStats(),
769
+ };
770
+ }
771
+
772
+ /**
773
+ * Get cluster events log
774
+ */
775
+ getEvents(limit = 100, nodeId = null) {
776
+ if (nodeId) {
777
+ return stmts.getEventsByNode.all(nodeId, limit).map(e => ({
778
+ ...e,
779
+ data: safeParse(e.data, {}),
780
+ }));
781
+ }
782
+ return stmts.getEvents.all(limit).map(e => ({
783
+ ...e,
784
+ data: safeParse(e.data, {}),
785
+ }));
786
+ }
787
+
788
+ // ─── Internal Operations ────────────────────────────────────────────
789
+
790
+ /**
791
+ * Check for dead nodes and failover their tasks
792
+ */
793
+ _healthCheck() {
794
+ const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);
795
+
796
+ for (const node of staleNodes) {
797
+ stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
798
+ logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
799
+ bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });
800
+
801
+ // Failover: reassign all tasks from dead node
802
+ const reassigned = this._distributor.reassignFromNode(node.id);
803
+ logEvent('node.failover', node.id, null, { reassigned });
804
+ bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
805
+ }
806
+ }
807
+
808
+ /**
809
+ * Recover tasks that have been assigned/running too long (stuck)
810
+ */
811
+ _recoverStuckTasks() {
812
+ const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold
813
+
814
+ for (const task of stuckTasks) {
815
+ if (task.attempts >= task.max_attempts) {
816
+ stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
817
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
818
+ logEvent('task.stuck_failed', task.node_id, task.id, {});
819
+ } else {
820
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
821
+ stmts.requeueTask.run(task.id);
822
+ this._distributor._tryAssign(task.id);
823
+ logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
824
+ }
825
+ }
826
+ }
827
+
828
  /**
   * Rebalance tasks across nodes when load is skewed.
   *
   * Heuristic: compute mean utilization across active nodes, treat nodes above
   * 1.5x the mean (with more than 2 tasks) as overloaded and nodes below 0.5x
   * the mean as underloaded, then move up to 2 'assigned' (not yet running)
   * tasks per overloaded node onto underloaded nodes with spare capacity.
   * Emits 'cluster.rebalanced' if anything moved.
   */
  _rebalance() {
    const nodes = stmts.listActiveNodes.all();
    if (nodes.length < 2) return; // nothing to balance against

    // NOTE(review): assumes capacity_total > 0 for every active node — a zero
    // capacity would make these ratios Infinity/NaN; confirm registration and
    // heartbeat paths guarantee a positive capacity_total.
    const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
    const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
    const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);

    if (overloaded.length === 0 || underloaded.length === 0) return;

    let moved = 0;
    for (const over of overloaded) {
      const tasks = stmts.getTasksByNode.all(over.id);
      // Move up to 2 tasks from overloaded to underloaded; only 'assigned'
      // tasks are moved (running tasks stay where they are).
      const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);

      for (const task of toMove) {
        const target = underloaded.find(n => n.capacity_used < n.capacity_total);
        if (!target) break; // no underloaded node has room left

        // Order matters: free the old slot, reassign, then claim the new slot.
        stmts.decrementNodeLoad.run(over.id);
        stmts.assignTask.run({ id: task.id, node_id: target.id });
        stmts.incrementNodeLoad.run(target.id);
        target.capacity_used++; // keep the in-memory copy consistent with the DB
        moved++;

        logEvent('task.rebalanced', target.id, task.id, { from: over.id });
        this._distributor._notifyWorker(target, task.id, task);
      }
    }

    if (moved > 0) {
      bus.emit('cluster.rebalanced', { tasksMoved: moved });
    }
  }
866
+ }
867
+
868
+ // ═══════════════════════════════════════════════════════════════════════════
869
+ // HELPERS
870
+ // ═══════════════════════════════════════════════════════════════════════════
871
+
872
/**
 * Parse a JSON string, returning a fallback on null/undefined or bad JSON.
 * Values that are already objects (e.g. pre-parsed rows) pass straight through.
 * @param {string|object|null|undefined} str - JSON text or an already-parsed value.
 * @param {*} fallback - Returned when str is nullish or unparseable.
 * @returns {*} Parsed value, the original object, or the fallback.
 */
function safeParse(str, fallback) {
  if (str === null || str === undefined) return fallback;
  if (typeof str === 'object') return str;
  try {
    return JSON.parse(str);
  } catch {
    return fallback;
  }
}
877
+
878
/**
 * Append an entry to the cluster event log. Failures are swallowed on
 * purpose — logging must never break the operation that triggered it.
 * @param {string} type - Event type, e.g. 'node.dead'.
 * @param {string|null} nodeId - Related node, if any.
 * @param {string|null} taskId - Related task, if any.
 * @param {object} data - Extra payload, stored as JSON.
 */
function logEvent(type, nodeId, taskId, data) {
  const row = {
    event_type: type,
    node_id: nodeId || null,
    task_id: taskId || null,
    data: JSON.stringify(data || {}),
  };
  try {
    stmts.insertEvent.run(row);
  } catch {
    // best-effort logging
  }
}
888
+
889
// ─── Singleton ───────────────────────────────────────────────────────

// Module-level singletons: one TaskDistributor wired into one
// ClusterOrchestrator, shared by everything that requires this module.
const distributor = new TaskDistributor();
const cluster = new ClusterOrchestrator(distributor);

// The concrete classes are exported alongside the singletons.
module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };