web-agent-bridge 3.2.0 → 3.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/LICENSE +72 -72
  2. package/README.ar.md +1286 -1152
  3. package/README.md +1764 -1635
  4. package/bin/agent-runner.js +474 -474
  5. package/bin/cli.js +237 -138
  6. package/bin/wab.js +80 -80
  7. package/examples/bidi-agent.js +119 -119
  8. package/examples/cross-site-agent.js +91 -91
  9. package/examples/mcp-agent.js +94 -94
  10. package/examples/next-app-router/README.md +44 -44
  11. package/examples/puppeteer-agent.js +108 -108
  12. package/examples/saas-dashboard/README.md +55 -55
  13. package/examples/shopify-hydrogen/README.md +74 -74
  14. package/examples/vision-agent.js +171 -171
  15. package/examples/wordpress-elementor/README.md +77 -77
  16. package/package.json +16 -3
  17. package/public/.well-known/agent-tools.json +180 -180
  18. package/public/.well-known/ai-assets.json +59 -59
  19. package/public/.well-known/security.txt +8 -0
  20. package/public/agent-workspace.html +349 -349
  21. package/public/ai.html +198 -198
  22. package/public/api.html +413 -412
  23. package/public/browser.html +486 -486
  24. package/public/commander-dashboard.html +243 -243
  25. package/public/cookies.html +210 -210
  26. package/public/css/agent-workspace.css +1713 -1713
  27. package/public/css/premium.css +317 -317
  28. package/public/css/styles.css +1235 -1235
  29. package/public/dashboard.html +706 -706
  30. package/public/dns.html +507 -0
  31. package/public/docs.html +587 -587
  32. package/public/feed.xml +89 -89
  33. package/public/growth.html +463 -463
  34. package/public/index.html +1070 -982
  35. package/public/integrations.html +556 -0
  36. package/public/js/agent-workspace.js +1740 -1740
  37. package/public/js/auth-nav.js +31 -31
  38. package/public/js/auth-redirect.js +12 -12
  39. package/public/js/cookie-consent.js +56 -56
  40. package/public/js/wab-demo-page.js +721 -721
  41. package/public/js/ws-client.js +74 -74
  42. package/public/llms-full.txt +360 -360
  43. package/public/llms.txt +125 -125
  44. package/public/login.html +85 -85
  45. package/public/mesh-dashboard.html +328 -328
  46. package/public/openapi.json +580 -580
  47. package/public/phone-shield.html +281 -0
  48. package/public/premium-dashboard.html +2489 -2489
  49. package/public/premium.html +793 -793
  50. package/public/privacy.html +297 -297
  51. package/public/register.html +105 -105
  52. package/public/robots.txt +87 -87
  53. package/public/script/wab-consent.d.ts +36 -36
  54. package/public/script/wab-consent.js +104 -104
  55. package/public/script/wab-schema.js +131 -131
  56. package/public/script/wab.d.ts +108 -108
  57. package/public/script/wab.min.js +580 -580
  58. package/public/security.txt +8 -0
  59. package/public/terms.html +256 -256
  60. package/script/ai-agent-bridge.js +1754 -1754
  61. package/sdk/README.md +99 -99
  62. package/sdk/agent-mesh.js +449 -449
  63. package/sdk/commander.js +262 -262
  64. package/sdk/index.d.ts +464 -464
  65. package/sdk/index.js +12 -1
  66. package/sdk/multi-agent.js +318 -318
  67. package/sdk/package.json +1 -1
  68. package/sdk/safety-shield.js +219 -0
  69. package/sdk/schema-discovery.js +83 -83
  70. package/server/adapters/index.js +520 -520
  71. package/server/config/plans.js +367 -367
  72. package/server/config/secrets.js +102 -102
  73. package/server/control-plane/index.js +301 -301
  74. package/server/data-plane/index.js +354 -354
  75. package/server/index.js +531 -427
  76. package/server/llm/index.js +404 -404
  77. package/server/middleware/adminAuth.js +35 -35
  78. package/server/middleware/auth.js +50 -50
  79. package/server/middleware/featureGate.js +88 -88
  80. package/server/middleware/rateLimits.js +100 -100
  81. package/server/middleware/sensitiveAction.js +157 -0
  82. package/server/migrations/001_add_analytics_indexes.sql +7 -7
  83. package/server/migrations/002_premium_features.sql +418 -418
  84. package/server/migrations/003_ads_integer_cents.sql +33 -33
  85. package/server/migrations/004_agent_os.sql +158 -158
  86. package/server/migrations/005_marketplace_metering.sql +126 -126
  87. package/server/models/adapters/index.js +33 -33
  88. package/server/models/adapters/mysql.js +183 -183
  89. package/server/models/adapters/postgresql.js +172 -172
  90. package/server/models/adapters/sqlite.js +7 -7
  91. package/server/models/db.js +681 -681
  92. package/server/observability/failure-analysis.js +337 -337
  93. package/server/observability/index.js +394 -394
  94. package/server/protocol/capabilities.js +223 -223
  95. package/server/protocol/index.js +243 -243
  96. package/server/protocol/schema.js +584 -584
  97. package/server/registry/certification.js +271 -271
  98. package/server/registry/index.js +326 -326
  99. package/server/routes/admin-premium.js +671 -671
  100. package/server/routes/admin.js +261 -261
  101. package/server/routes/ads.js +130 -130
  102. package/server/routes/agent-workspace.js +540 -540
  103. package/server/routes/api.js +150 -150
  104. package/server/routes/auth.js +71 -71
  105. package/server/routes/billing.js +45 -45
  106. package/server/routes/commander.js +316 -316
  107. package/server/routes/demo-showcase.js +332 -332
  108. package/server/routes/demo-store.js +154 -0
  109. package/server/routes/discovery.js +417 -417
  110. package/server/routes/gateway.js +173 -157
  111. package/server/routes/license.js +251 -240
  112. package/server/routes/mesh.js +469 -469
  113. package/server/routes/noscript.js +543 -543
  114. package/server/routes/premium-v2.js +686 -686
  115. package/server/routes/premium.js +724 -724
  116. package/server/routes/runtime.js +2148 -2147
  117. package/server/routes/sovereign.js +465 -385
  118. package/server/routes/universal.js +200 -185
  119. package/server/routes/wab-api.js +850 -501
  120. package/server/runtime/container-worker.js +111 -111
  121. package/server/runtime/container.js +448 -448
  122. package/server/runtime/distributed-worker.js +362 -362
  123. package/server/runtime/event-bus.js +210 -210
  124. package/server/runtime/index.js +253 -253
  125. package/server/runtime/queue.js +599 -599
  126. package/server/runtime/replay.js +666 -666
  127. package/server/runtime/sandbox.js +266 -266
  128. package/server/runtime/scheduler.js +534 -534
  129. package/server/runtime/session-engine.js +293 -293
  130. package/server/runtime/state-manager.js +188 -188
  131. package/server/security/cross-site-redactor.js +196 -0
  132. package/server/security/dry-run.js +180 -0
  133. package/server/security/human-gate-rate-limit.js +147 -0
  134. package/server/security/human-gate-transports.js +178 -0
  135. package/server/security/human-gate.js +281 -0
  136. package/server/security/index.js +368 -368
  137. package/server/security/intent-engine.js +245 -0
  138. package/server/security/reward-guard.js +171 -0
  139. package/server/security/rollback-store.js +239 -0
  140. package/server/security/token-scope.js +404 -0
  141. package/server/security/url-policy.js +139 -0
  142. package/server/services/agent-chat.js +506 -506
  143. package/server/services/agent-learning.js +601 -575
  144. package/server/services/agent-memory.js +625 -625
  145. package/server/services/agent-mesh.js +555 -539
  146. package/server/services/agent-symphony.js +717 -717
  147. package/server/services/agent-tasks.js +1807 -1807
  148. package/server/services/api-key-engine.js +292 -261
  149. package/server/services/cluster.js +894 -894
  150. package/server/services/commander.js +738 -738
  151. package/server/services/edge-compute.js +440 -440
  152. package/server/services/email.js +204 -204
  153. package/server/services/hosted-runtime.js +205 -205
  154. package/server/services/lfd.js +635 -635
  155. package/server/services/local-ai.js +389 -389
  156. package/server/services/marketplace.js +270 -270
  157. package/server/services/metering.js +182 -182
  158. package/server/services/modules/affiliate-intelligence.js +93 -93
  159. package/server/services/modules/agent-firewall.js +90 -90
  160. package/server/services/modules/bounty.js +89 -89
  161. package/server/services/modules/collective-bargaining.js +92 -92
  162. package/server/services/modules/dark-pattern.js +66 -66
  163. package/server/services/modules/gov-intelligence.js +45 -45
  164. package/server/services/modules/neural.js +55 -55
  165. package/server/services/modules/notary.js +49 -49
  166. package/server/services/modules/price-time-machine.js +86 -86
  167. package/server/services/modules/protocol.js +104 -104
  168. package/server/services/negotiation.js +439 -439
  169. package/server/services/plugins.js +771 -771
  170. package/server/services/price-intelligence.js +566 -566
  171. package/server/services/price-shield.js +1137 -1137
  172. package/server/services/reputation.js +465 -465
  173. package/server/services/search-engine.js +357 -357
  174. package/server/services/security.js +513 -513
  175. package/server/services/self-healing.js +843 -843
  176. package/server/services/sovereign-shield.js +542 -0
  177. package/server/services/stripe.js +192 -192
  178. package/server/services/swarm.js +788 -788
  179. package/server/services/universal-scraper.js +662 -661
  180. package/server/services/verification.js +481 -481
  181. package/server/services/vision.js +1163 -1163
  182. package/server/utils/cache.js +125 -125
  183. package/server/utils/migrate.js +81 -81
  184. package/server/utils/safe-fetch.js +228 -0
  185. package/server/utils/secureFields.js +50 -50
  186. package/server/ws.js +161 -161
  187. package/templates/artisan-marketplace.yaml +104 -104
  188. package/templates/book-price-scout.yaml +98 -98
  189. package/templates/electronics-price-tracker.yaml +108 -108
  190. package/templates/flight-deal-hunter.yaml +113 -113
  191. package/templates/freelancer-direct.yaml +116 -116
  192. package/templates/grocery-price-compare.yaml +93 -93
  193. package/templates/hotel-direct-booking.yaml +113 -113
  194. package/templates/local-services.yaml +98 -98
  195. package/templates/olive-oil-tunisia.yaml +88 -88
  196. package/templates/organic-farm-fresh.yaml +101 -101
  197. package/templates/restaurant-direct.yaml +97 -97
  198. package/public/score.html +0 -263
  199. package/server/migrations/006_growth_suite.sql +0 -138
  200. package/server/routes/growth.js +0 -962
  201. package/server/services/fairness-engine.js +0 -409
  202. package/server/services/fairness.js +0 -420
@@ -1,894 +1,894 @@
1
- 'use strict';
2
-
3
- /**
4
- * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
- *
6
- * Turns WAB from a single-server Agent OS into a distributed fleet.
7
- *
8
- * Architecture:
9
- * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
- * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
- * │ (this node) │────▶│ (remote) │ │ (remote) │
12
- * │ │────▶│ │ │ │
13
- * └──────────────┘ └──────────┘ └──────────┘
14
- * │ ▲ ▲
15
- * │ │ │
16
- * └───────────────────┴─────────────────┘
17
- * heartbeat / task results
18
- *
19
- * Components:
20
- * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
- * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
- * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
- *
24
- * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
- * Persistence: SQLite tables for durability across restarts
26
- * Consistency: Leader-based (coordinator is source of truth)
27
- */
28
-
29
- const crypto = require('crypto');
30
- const http = require('http');
31
- const https = require('https');
32
- const { URL } = require('url');
33
- const { db } = require('../models/db');
34
- const { bus } = require('../runtime/event-bus');
35
-
36
- // ─── Schema ──────────────────────────────────────────────────────────
37
-
38
- db.exec(`
39
- CREATE TABLE IF NOT EXISTS cluster_nodes (
40
- id TEXT PRIMARY KEY,
41
- name TEXT NOT NULL,
42
- endpoint TEXT NOT NULL,
43
- region TEXT DEFAULT 'default',
44
- zone TEXT DEFAULT 'a',
45
- role TEXT DEFAULT 'worker',
46
- status TEXT DEFAULT 'joining',
47
- capacity_total INTEGER DEFAULT 20,
48
- capacity_used INTEGER DEFAULT 0,
49
- tags TEXT DEFAULT '[]',
50
- hardware TEXT DEFAULT '{}',
51
- version TEXT,
52
- secret_hash TEXT,
53
- last_heartbeat TEXT DEFAULT (datetime('now')),
54
- registered_at TEXT DEFAULT (datetime('now')),
55
- updated_at TEXT DEFAULT (datetime('now'))
56
- );
57
-
58
- CREATE TABLE IF NOT EXISTS cluster_tasks (
59
- id TEXT PRIMARY KEY,
60
- external_id TEXT,
61
- node_id TEXT,
62
- task_type TEXT NOT NULL,
63
- objective TEXT,
64
- payload TEXT DEFAULT '{}',
65
- priority INTEGER DEFAULT 50,
66
- status TEXT DEFAULT 'pending',
67
- result TEXT,
68
- error TEXT,
69
- attempts INTEGER DEFAULT 0,
70
- max_attempts INTEGER DEFAULT 3,
71
- affinity_tags TEXT DEFAULT '[]',
72
- affinity_region TEXT,
73
- timeout_ms INTEGER DEFAULT 60000,
74
- submitted_at TEXT DEFAULT (datetime('now')),
75
- assigned_at TEXT,
76
- started_at TEXT,
77
- completed_at TEXT
78
- );
79
-
80
- CREATE TABLE IF NOT EXISTS cluster_events (
81
- id INTEGER PRIMARY KEY AUTOINCREMENT,
82
- event_type TEXT NOT NULL,
83
- node_id TEXT,
84
- task_id TEXT,
85
- data TEXT DEFAULT '{}',
86
- created_at TEXT DEFAULT (datetime('now'))
87
- );
88
-
89
- CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
90
- CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
91
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
92
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
93
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
94
- CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
95
- CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
96
- `);
97
-
98
// ─── Prepared Statements ─────────────────────────────────────────────
// All DB access goes through this map of pre-compiled statements.
// Named-parameter statements (@x) are .run() with an object; positional
// (?) statements take plain arguments. JSON columns (tags/hardware/
// payload/affinity_tags/data) are stored as serialized strings and
// decoded by callers via safeParse().

const stmts = {
  // Nodes
  insertNode: db.prepare(`
    INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
    VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
  `),
  updateNode: db.prepare(`
    UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
    capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
    WHERE id=@id
  `),
  setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
  // Heartbeat also force-reactivates the node (status='active').
  heartbeatNode: db.prepare(`
    UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
    WHERE id=@id
  `),
  getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
  getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
  listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
  // Active nodes come back least-loaded first — the distributor relies on this order.
  listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
  listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
  deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
  // Nodes still 'active' whose last heartbeat is older than ? seconds.
  getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),

  // Tasks
  insertTask: db.prepare(`
    INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
    VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
  `),
  // Assignment also counts as an attempt.
  assignTask: db.prepare(`
    UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
    WHERE id=@id
  `),
  startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
  completeTask: db.prepare(`
    UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
  `),
  failTask: db.prepare(`
    UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
  `),
  requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
  getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
  getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
  getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
  getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
  getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
  listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
  // In-flight tasks assigned longer than ? seconds ago — candidates for recovery.
  getStuckTasks: db.prepare(`
    SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
    AND assigned_at < datetime('now', '-' || ? || ' seconds')
  `),
  countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
  incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
  // MAX(0, ...) keeps the counter from going negative on double-decrement.
  decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),

  // Events
  insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
  getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
  getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
};
160
-
161
// ═══════════════════════════════════════════════════════════════════════════
// TASK DISTRIBUTOR
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Routes tasks to worker nodes based on capacity, affinity, and load balancing.
 *
 * Strategies:
 * - least-loaded: Pick the node with the most free capacity
 * - affinity: Match task tags to node tags
 * - region: Prefer nodes in the same region as the task
 * - round-robin: Distribute evenly across all active nodes
 */
class TaskDistributor {
  constructor() {
    this._roundRobinIndex = 0;
    // Strategy chosen by the most recent _selectNode() call; used only for
    // event logging.
    this._lastStrategy = null;
    this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
  }

  /**
   * Submit a task for distributed execution.
   * Persists it as 'pending' and immediately attempts an assignment.
   *
   * @param {object} task - { id?, externalId?, type?, objective?, params?,
   *   priority?, affinityTags?, affinityRegion?, timeout?, maxAttempts? }
   * @returns {{taskId: string, status: 'pending'}}
   */
  submit(task) {
    const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
    const entry = {
      id,
      external_id: task.externalId || null,
      task_type: task.type || 'general',
      objective: task.objective || '',
      payload: JSON.stringify(task.params || {}),
      priority: task.priority || 50,
      status: 'pending',
      affinity_tags: JSON.stringify(task.affinityTags || []),
      affinity_region: task.affinityRegion || null,
      timeout_ms: task.timeout || 60000,
      max_attempts: task.maxAttempts || 3,
    };
    stmts.insertTask.run(entry);

    bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
    this._stats.distributed++;

    // Try immediate assignment; if no capacity, periodic processPending()
    // will pick the task up later.
    this._tryAssign(id);

    return { taskId: id, status: 'pending' };
  }

  /**
   * Try to assign a pending task to a worker node.
   * @returns {boolean} true if the task was assigned
   */
  _tryAssign(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task || task.status !== 'pending') return false;

    const node = this._selectNode(task);
    if (!node) {
      this._stats.noCapacity++;
      return false;
    }

    stmts.assignTask.run({ id: taskId, node_id: node.id });
    stmts.incrementNodeLoad.run(node.id);

    logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
    bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });

    // Push notification to worker (fire-and-forget)
    this._notifyWorker(node, taskId, task);

    return true;
  }

  /**
   * Select the best node for a task: region affinity, then tag affinity,
   * then least-loaded among the remaining candidates.
   * @returns {object|null} node row, or null when no node has free capacity
   */
  _selectNode(task) {
    // BUGFIX: reset per call. Previously the value persisted across calls,
    // so a plain least-loaded pick could be logged with a stale
    // 'region'/'affinity' label from an earlier task.
    this._lastStrategy = null;

    let candidates = stmts.listActiveNodes.all();
    if (candidates.length === 0) return null;

    // Filter by capacity
    candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
    if (candidates.length === 0) return null;

    const affinityTags = safeParse(task.affinity_tags, []);
    const affinityRegion = task.affinity_region;

    // Strategy 1: Region affinity (soft — falls through if no node matches)
    if (affinityRegion) {
      const regionNodes = candidates.filter(n => n.region === affinityRegion);
      if (regionNodes.length > 0) {
        candidates = regionNodes;
        this._lastStrategy = 'region';
      }
    }

    // Strategy 2: Tag affinity (soft — any shared tag qualifies)
    if (affinityTags.length > 0) {
      const tagged = candidates.filter(n => {
        const nodeTags = safeParse(n.tags, []);
        return affinityTags.some(t => nodeTags.includes(t));
      });
      if (tagged.length > 0) {
        candidates = tagged;
        this._lastStrategy = 'affinity';
      }
    }

    // Strategy 3: Least-loaded (by utilization ratio, not absolute count)
    candidates.sort((a, b) => {
      const loadA = a.capacity_used / a.capacity_total;
      const loadB = b.capacity_used / b.capacity_total;
      return loadA - loadB;
    });

    this._lastStrategy = this._lastStrategy || 'least-loaded';
    return candidates[0];
  }

  /**
   * Push task notification to a worker node (best-effort HTTP POST; errors
   * are swallowed because workers also pull via pullTasks()).
   */
  _notifyWorker(node, taskId, task) {
    const payload = JSON.stringify({
      type: 'task.assigned',
      taskId,
      taskType: task.task_type,
      objective: task.objective,
      params: safeParse(task.payload, {}),
      priority: task.priority,
      timeout: task.timeout_ms,
    });

    const url = new URL('/wab-worker/tasks/notify', node.endpoint);
    const mod = url.protocol === 'https:' ? https : http;

    const req = mod.request(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
      timeout: 5000,
    });
    req.on('error', () => { /* best-effort push */ });
    req.write(payload);
    req.end();
  }

  /**
   * Reassign tasks from a dead node to other nodes.
   * Tasks out of attempts are failed permanently; the rest are requeued and
   * immediately offered to the remaining fleet.
   *
   * @param {string} nodeId - id of the dead/removed node
   * @returns {number} number of tasks successfully reassigned
   */
  reassignFromNode(nodeId) {
    const tasks = stmts.getTasksByNode.all(nodeId);
    let reassigned = 0;

    for (const task of tasks) {
      // BUGFIX: free the node's slot for EVERY in-flight task, including the
      // ones failed below. Previously max-attempt failures skipped the
      // decrement, leaving capacity_used inflated if the node re-registered.
      stmts.decrementNodeLoad.run(nodeId);

      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
        logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
        bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
        continue;
      }

      stmts.requeueTask.run(task.id);
      logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });

      // Try to assign to another node
      if (this._tryAssign(task.id)) {
        reassigned++;
        this._stats.reassigned++;
      }
    }

    return reassigned;
  }

  /**
   * Process pending tasks — called periodically by the orchestrator.
   * @returns {number} number of tasks assigned this pass
   */
  processPending() {
    const pending = stmts.getPendingTasks.all(50);
    let assigned = 0;
    for (const task of pending) {
      if (this._tryAssign(task.id)) assigned++;
    }
    return assigned;
  }

  /**
   * Worker pulls tasks for execution (pull side of the hybrid push/pull
   * model). Respects the node's free capacity and task affinity constraints.
   *
   * @param {string} nodeId - requesting worker
   * @param {number} [limit=5] - max tasks to hand out
   * @returns {Array<object>} task descriptors for the worker to run
   */
  pullTasks(nodeId, limit = 5) {
    const node = stmts.getNode.get(nodeId);
    if (!node || node.status !== 'active') return [];

    const available = node.capacity_total - node.capacity_used;
    if (available <= 0) return [];

    const count = Math.min(limit, available);
    const pending = stmts.getPendingTasks.all(count);
    const assigned = [];

    for (const task of pending) {
      // Skip tasks whose affinity this node cannot satisfy; they stay
      // pending for other nodes.
      const affinityRegion = task.affinity_region;
      if (affinityRegion && node.region !== affinityRegion) continue;

      const affinityTags = safeParse(task.affinity_tags, []);
      const nodeTags = safeParse(node.tags, []);
      if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;

      stmts.assignTask.run({ id: task.id, node_id: nodeId });
      stmts.incrementNodeLoad.run(nodeId);
      logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });

      assigned.push({
        taskId: task.id,
        type: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });
    }

    return assigned;
  }

  /** @returns {{distributed: number, reassigned: number, noCapacity: number}} copy of counters */
  getStats() { return { ...this._stats }; }
}
390
-
391
- // ═══════════════════════════════════════════════════════════════════════════
392
- // CLUSTER ORCHESTRATOR
393
- // ═══════════════════════════════════════════════════════════════════════════
394
-
395
- /**
396
- * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
397
- *
398
- * Responsibilities:
399
- * - Node registration and authentication
400
- * - Health monitoring via heartbeats
401
- * - Dead node detection and task failover
402
- * - Load rebalancing across the cluster
403
- * - Cluster topology and status reporting
404
- * - Drain and cordon operations
405
- */
406
- class ClusterOrchestrator {
407
- constructor(distributor) {
408
- this._distributor = distributor;
409
- this._heartbeatThresholdSec = 90; // Node considered dead after 90s no heartbeat
410
- this._checkInterval = null;
411
- this._rebalanceInterval = null;
412
- this._started = false;
413
- }
414
-
415
- // ─── Lifecycle ──────────────────────────────────────────────────────
416
-
417
- /**
418
- * Start the orchestrator — begins periodic health checks and task processing
419
- */
420
- start() {
421
- if (this._started) return;
422
- this._started = true;
423
-
424
- // Health check every 30s
425
- this._checkInterval = setInterval(() => {
426
- this._healthCheck();
427
- this._recoverStuckTasks();
428
- this._distributor.processPending();
429
- }, 30_000);
430
- if (this._checkInterval.unref) this._checkInterval.unref();
431
-
432
- // Rebalance every 5 min
433
- this._rebalanceInterval = setInterval(() => {
434
- this._rebalance();
435
- }, 300_000);
436
- if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();
437
-
438
- bus.emit('cluster.started', { timestamp: Date.now() });
439
- }
440
-
441
- /**
442
- * Stop the orchestrator
443
- */
444
- stop() {
445
- if (!this._started) return;
446
- this._started = false;
447
- if (this._checkInterval) clearInterval(this._checkInterval);
448
- if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
449
- bus.emit('cluster.stopped', { timestamp: Date.now() });
450
- }
451
-
452
- // ─── Node Management ───────────────────────────────────────────────
453
-
454
- /**
455
- * Register a worker node to join the cluster
456
- */
457
- registerNode(config) {
458
- if (!config.name || !config.endpoint) {
459
- throw new Error('Node name and endpoint required');
460
- }
461
-
462
- // Check for existing node with same endpoint
463
- const existing = stmts.getNodeByEndpoint.get(config.endpoint);
464
- if (existing) {
465
- // Re-register: update and reactivate
466
- stmts.updateNode.run({
467
- id: existing.id,
468
- name: config.name,
469
- endpoint: config.endpoint,
470
- region: config.region || existing.region,
471
- zone: config.zone || existing.zone,
472
- capacity_total: config.capacity || existing.capacity_total,
473
- tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
474
- hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
475
- version: config.version || existing.version,
476
- });
477
- stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
478
- logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
479
- bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
480
- return { nodeId: existing.id, status: 'active', rejoined: true };
481
- }
482
-
483
- const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
484
- const secretHash = crypto.createHash('sha256')
485
- .update(config.secret || crypto.randomBytes(32).toString('hex'))
486
- .digest('hex');
487
-
488
- stmts.insertNode.run({
489
- id: nodeId,
490
- name: config.name,
491
- endpoint: config.endpoint,
492
- region: config.region || 'default',
493
- zone: config.zone || 'a',
494
- role: config.role || 'worker',
495
- status: 'active',
496
- capacity_total: config.capacity || 20,
497
- tags: JSON.stringify(config.tags || []),
498
- hardware: JSON.stringify(config.hardware || {}),
499
- version: config.version || null,
500
- secret_hash: secretHash,
501
- });
502
-
503
- logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
504
- bus.emit('cluster.node.joined', { nodeId, name: config.name });
505
-
506
- return { nodeId, status: 'active', secret: config.secret ? undefined : undefined };
507
- }
508
-
509
- /**
510
- * Remove a node from the cluster
511
- */
512
- deregisterNode(nodeId) {
513
- const node = stmts.getNode.get(nodeId);
514
- if (!node) return null;
515
-
516
- // Reassign tasks before removing
517
- const reassigned = this._distributor.reassignFromNode(nodeId);
518
- stmts.deleteNode.run(nodeId);
519
-
520
- logEvent('node.deregistered', nodeId, null, { reassigned });
521
- bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });
522
-
523
- return { nodeId, reassigned };
524
- }
525
-
526
- /**
527
- * Process heartbeat from a worker node
528
- */
529
- heartbeat(nodeId, data = {}) {
530
- const node = stmts.getNode.get(nodeId);
531
- if (!node) return null;
532
-
533
- stmts.heartbeatNode.run({
534
- id: nodeId,
535
- capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
536
- });
537
-
538
- // Update hardware profile if provided
539
- if (data.hardware) {
540
- stmts.updateNode.run({
541
- id: nodeId,
542
- name: node.name,
543
- endpoint: node.endpoint,
544
- region: node.region,
545
- zone: node.zone,
546
- capacity_total: data.capacityTotal || node.capacity_total,
547
- tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
548
- hardware: JSON.stringify(data.hardware),
549
- version: data.version || node.version,
550
- });
551
- }
552
-
553
- return {
554
- nodeId,
555
- status: 'active',
556
- pendingTasks: stmts.getPendingTasks.all(1).length > 0,
557
- };
558
- }
559
-
560
- /**
561
- * Drain a node — stop assigning new tasks, wait for running tasks to finish
562
- */
563
- drainNode(nodeId) {
564
- const node = stmts.getNode.get(nodeId);
565
- if (!node) return null;
566
-
567
- stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
568
- logEvent('node.draining', nodeId, null, {});
569
- bus.emit('cluster.node.draining', { nodeId, name: node.name });
570
-
571
- return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
572
- }
573
-
574
- /**
575
- * Cordon a node — prevent scheduling but keep running tasks
576
- */
577
- cordonNode(nodeId) {
578
- const node = stmts.getNode.get(nodeId);
579
- if (!node) return null;
580
-
581
- stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
582
- logEvent('node.cordoned', nodeId, null, {});
583
- bus.emit('cluster.node.cordoned', { nodeId, name: node.name });
584
-
585
- return { nodeId, status: 'cordoned' };
586
- }
587
-
588
- /**
589
- * Uncordon a node — allow scheduling again
590
- */
591
- uncordonNode(nodeId) {
592
- const node = stmts.getNode.get(nodeId);
593
- if (!node) return null;
594
-
595
- stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
596
- logEvent('node.uncordoned', nodeId, null, {});
597
-
598
- return { nodeId, status: 'active' };
599
- }
600
-
601
- /**
602
- * Get node details
603
- */
604
- getNode(nodeId) {
605
- const node = stmts.getNode.get(nodeId);
606
- if (!node) return null;
607
- node.tags = safeParse(node.tags, []);
608
- node.hardware = safeParse(node.hardware, {});
609
- node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
610
- return node;
611
- }
612
-
613
- /**
614
- * List all cluster nodes
615
- */
616
- listNodes(filter = {}) {
617
- let nodes;
618
- if (filter.region) {
619
- nodes = stmts.listNodesByRegion.all(filter.region);
620
- } else if (filter.active) {
621
- nodes = stmts.listActiveNodes.all();
622
- } else {
623
- nodes = stmts.listNodes.all();
624
- }
625
- return nodes.map(n => ({
626
- ...n,
627
- tags: safeParse(n.tags, []),
628
- hardware: safeParse(n.hardware, {}),
629
- }));
630
- }
631
-
632
- // ─── Task Reporting ─────────────────────────────────────────────────
633
-
634
- /**
635
- * Worker reports task started
636
- */
637
- reportTaskStarted(taskId) {
638
- const task = stmts.getTask.get(taskId);
639
- if (!task) return null;
640
- stmts.startTask.run(taskId);
641
- logEvent('task.started', task.node_id, taskId, {});
642
- bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
643
- return { taskId, status: 'running' };
644
- }
645
-
646
- /**
647
- * Worker reports task completed
648
- */
649
- reportTaskCompleted(taskId, result) {
650
- const task = stmts.getTask.get(taskId);
651
- if (!task) return null;
652
-
653
- stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
654
- if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
655
-
656
- logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
657
- bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });
658
-
659
- return { taskId, status: 'completed' };
660
- }
661
-
662
  /**
   * Worker reports task failed. Frees the node's capacity slot first, then
   * either requeues the task (attempts remaining) or marks it permanently
   * failed.
   *
   * NOTE(review): `attempt` in the retry event/response is the pre-retry
   * counter — assignTask increments `attempts` only when the task is
   * re-placed. Also `_tryAssign` may re-select the same node; selection is
   * load/affinity based, not exclusion based.
   *
   * @param {string} taskId
   * @param {string|object} error - stored verbatim when a string, else JSON.
   * @returns {object|null} status summary, or null for an unknown task.
   */
  reportTaskFailed(taskId, error) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    // Retry if attempts remaining
    if (task.attempts < task.max_attempts) {
      stmts.requeueTask.run(taskId);
      logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
      bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });

      // Try to assign to a different node
      this._distributor._tryAssign(taskId);

      return { taskId, status: 'retrying', attempt: task.attempts };
    }

    // Max attempts reached
    stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
    logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
    bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });

    return { taskId, status: 'failed' };
  }
690
-
691
- /**
692
- * Get task details
693
- */
694
- getTask(taskId) {
695
- const task = stmts.getTask.get(taskId);
696
- if (!task) return null;
697
- task.payload = safeParse(task.payload, {});
698
- task.affinity_tags = safeParse(task.affinity_tags, []);
699
- task.result = safeParse(task.result, null);
700
- return task;
701
- }
702
-
703
- /**
704
- * List tasks with optional status filter
705
- */
706
- listTasks(filter = {}) {
707
- let tasks;
708
- if (filter.status) {
709
- tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
710
- } else if (filter.nodeId) {
711
- tasks = stmts.getTasksByNode.all(filter.nodeId);
712
- } else {
713
- tasks = stmts.listTasks.all(filter.limit || 50);
714
- }
715
- return tasks.map(t => ({
716
- ...t,
717
- payload: safeParse(t.payload, {}),
718
- affinity_tags: safeParse(t.affinity_tags, []),
719
- result: safeParse(t.result, null),
720
- }));
721
- }
722
-
723
- // ─── Cluster Topology ───────────────────────────────────────────────
724
-
725
  /**
   * Get full cluster status: node counts by state, aggregate capacity over
   * active nodes, per-region breakdown, task counts by status, and the
   * distributor's counters.
   */
  getClusterStatus() {
    const nodes = stmts.listNodes.all();
    const taskCounts = {};
    for (const row of stmts.countByStatus.all()) {
      taskCounts[row.status] = row.count;
    }

    // Capacity figures aggregate active nodes only.
    const activeNodes = nodes.filter(n => n.status === 'active');
    const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
    const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);

    // Group by region; node counts include every state, but capacity is
    // added only for active nodes (matching the cluster-wide totals above).
    const regions = {};
    for (const node of nodes) {
      if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
      regions[node.region].nodes++;
      if (node.status === 'active') {
        regions[node.region].active++;
        regions[node.region].capacity += node.capacity_total;
        regions[node.region].used += node.capacity_used;
      }
    }

    return {
      coordinator: { started: this._started },
      nodes: {
        total: nodes.length,
        active: activeNodes.length,
        draining: nodes.filter(n => n.status === 'draining').length,
        cordoned: nodes.filter(n => n.status === 'cordoned').length,
        dead: nodes.filter(n => n.status === 'dead').length,
      },
      capacity: {
        total: totalCapacity,
        used: usedCapacity,
        available: totalCapacity - usedCapacity,
        // Integer percent, guarded against divide-by-zero on an empty cluster.
        utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
      },
      tasks: taskCounts,
      regions,
      distributor: this._distributor.getStats(),
    };
  }
771
-
772
- /**
773
- * Get cluster events log
774
- */
775
- getEvents(limit = 100, nodeId = null) {
776
- if (nodeId) {
777
- return stmts.getEventsByNode.all(nodeId, limit).map(e => ({
778
- ...e,
779
- data: safeParse(e.data, {}),
780
- }));
781
- }
782
- return stmts.getEvents.all(limit).map(e => ({
783
- ...e,
784
- data: safeParse(e.data, {}),
785
- }));
786
- }
787
-
788
- // ─── Internal Operations ────────────────────────────────────────────
789
-
790
  /**
   * Detect nodes whose last heartbeat is older than the configured threshold
   * (`_heartbeatThresholdSec`), mark them dead, and fail over their tasks
   * through the distributor. Runs on the 30s interval started by start().
   */
  _healthCheck() {
    const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);

    for (const node of staleNodes) {
      stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
      logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
      bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });

      // Failover: reassign all tasks from dead node
      const reassigned = this._distributor.reassignFromNode(node.id);
      logEvent('node.failover', node.id, null, { reassigned });
      bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
    }
  }
807
-
808
  /**
   * Recover tasks that have sat in 'assigned'/'running' longer than 5 minutes
   * (per getStuckTasks, measured from assigned_at): fail those out of
   * attempts, otherwise release the node's slot, requeue, and retry placement.
   */
  _recoverStuckTasks() {
    const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold

    for (const task of stuckTasks) {
      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        logEvent('task.stuck_failed', task.node_id, task.id, {});
      } else {
        // Release the capacity slot before requeueing so placement sees
        // accurate headroom (the same node may legitimately be re-picked).
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        stmts.requeueTask.run(task.id);
        this._distributor._tryAssign(task.id);
        logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
      }
    }
  }
827
-
828
  /**
   * Rebalance tasks across nodes when load is skewed.
   *
   * A node is "overloaded" when its load ratio exceeds 1.5× the cluster
   * average and it carries more than 2 tasks; "underloaded" below 0.5× the
   * average. Up to two still-'assigned' (not yet running) tasks per
   * overloaded node are moved to an underloaded node with free capacity.
   *
   * NOTE(review): assignTask increments `attempts`, so each rebalance move
   * consumes one of the task's retry attempts — confirm this is intended.
   */
  _rebalance() {
    const nodes = stmts.listActiveNodes.all();
    if (nodes.length < 2) return;

    const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
    const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
    const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);

    if (overloaded.length === 0 || underloaded.length === 0) return;

    let moved = 0;
    for (const over of overloaded) {
      const tasks = stmts.getTasksByNode.all(over.id);
      // Move up to 2 tasks from overloaded to underloaded
      const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);

      for (const task of toMove) {
        // target.capacity_used is bumped locally below so later moves in this
        // sweep see up-to-date headroom without re-querying.
        const target = underloaded.find(n => n.capacity_used < n.capacity_total);
        if (!target) break;

        stmts.decrementNodeLoad.run(over.id);
        stmts.assignTask.run({ id: task.id, node_id: target.id });
        stmts.incrementNodeLoad.run(target.id);
        target.capacity_used++;
        moved++;

        logEvent('task.rebalanced', target.id, task.id, { from: over.id });
        this._distributor._notifyWorker(target, task.id, task);
      }
    }

    if (moved > 0) {
      bus.emit('cluster.rebalanced', { tasksMoved: moved });
    }
  }
866
- }
867
-
868
- // ═══════════════════════════════════════════════════════════════════════════
869
- // HELPERS
870
- // ═══════════════════════════════════════════════════════════════════════════
871
-
872
/**
 * Defensive JSON decode: returns `fallback` for null/undefined or unparsable
 * input, and passes already-decoded objects straight through unchanged.
 */
function safeParse(str, fallback) {
  if (str == null) return fallback;
  if (typeof str === 'object') return str;
  let decoded;
  try {
    decoded = JSON.parse(str);
  } catch {
    decoded = fallback;
  }
  return decoded;
}
877
-
878
/**
 * Append an entry to the cluster_events audit table. Failures are swallowed
 * deliberately: event logging must never break the operation being logged.
 *
 * @param {string} type - event_type value, e.g. 'task.assigned'.
 * @param {string|null} nodeId - related node, if any.
 * @param {string|null} taskId - related task, if any.
 * @param {object} data - JSON-serialized into the `data` column.
 */
function logEvent(type, nodeId, taskId, data) {
  try {
    stmts.insertEvent.run({
      event_type: type,
      node_id: nodeId || null,
      task_id: taskId || null,
      data: JSON.stringify(data || {}),
    });
  } catch { /* best-effort logging */ }
}
888
-
889
- // ─── Singleton ───────────────────────────────────────────────────────
890
-
891
// Module-level singletons: one distributor wired into one orchestrator.
// The classes are exported too so callers/tests can build isolated instances.
const distributor = new TaskDistributor();
const cluster = new ClusterOrchestrator(distributor);

module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };
1
+ 'use strict';
2
+
3
+ /**
4
+ * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
+ *
6
+ * Turns WAB from a single-server Agent OS into a distributed fleet.
7
+ *
8
+ * Architecture:
9
+ * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
+ * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
+ * │ (this node) │────▶│ (remote) │ │ (remote) │
12
+ * │ │────▶│ │ │ │
13
+ * └──────────────┘ └──────────┘ └──────────┘
14
+ * │ ▲ ▲
15
+ * │ │ │
16
+ * └───────────────────┴─────────────────┘
17
+ * heartbeat / task results
18
+ *
19
+ * Components:
20
+ * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
+ * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
+ * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
+ *
24
+ * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
+ * Persistence: SQLite tables for durability across restarts
26
+ * Consistency: Leader-based (coordinator is source of truth)
27
+ */
28
+
29
+ const crypto = require('crypto');
30
+ const http = require('http');
31
+ const https = require('https');
32
+ const { URL } = require('url');
33
+ const { db } = require('../models/db');
34
+ const { bus } = require('../runtime/event-bus');
35
+
36
+ // ─── Schema ──────────────────────────────────────────────────────────
37
+
38
+ db.exec(`
39
+ CREATE TABLE IF NOT EXISTS cluster_nodes (
40
+ id TEXT PRIMARY KEY,
41
+ name TEXT NOT NULL,
42
+ endpoint TEXT NOT NULL,
43
+ region TEXT DEFAULT 'default',
44
+ zone TEXT DEFAULT 'a',
45
+ role TEXT DEFAULT 'worker',
46
+ status TEXT DEFAULT 'joining',
47
+ capacity_total INTEGER DEFAULT 20,
48
+ capacity_used INTEGER DEFAULT 0,
49
+ tags TEXT DEFAULT '[]',
50
+ hardware TEXT DEFAULT '{}',
51
+ version TEXT,
52
+ secret_hash TEXT,
53
+ last_heartbeat TEXT DEFAULT (datetime('now')),
54
+ registered_at TEXT DEFAULT (datetime('now')),
55
+ updated_at TEXT DEFAULT (datetime('now'))
56
+ );
57
+
58
+ CREATE TABLE IF NOT EXISTS cluster_tasks (
59
+ id TEXT PRIMARY KEY,
60
+ external_id TEXT,
61
+ node_id TEXT,
62
+ task_type TEXT NOT NULL,
63
+ objective TEXT,
64
+ payload TEXT DEFAULT '{}',
65
+ priority INTEGER DEFAULT 50,
66
+ status TEXT DEFAULT 'pending',
67
+ result TEXT,
68
+ error TEXT,
69
+ attempts INTEGER DEFAULT 0,
70
+ max_attempts INTEGER DEFAULT 3,
71
+ affinity_tags TEXT DEFAULT '[]',
72
+ affinity_region TEXT,
73
+ timeout_ms INTEGER DEFAULT 60000,
74
+ submitted_at TEXT DEFAULT (datetime('now')),
75
+ assigned_at TEXT,
76
+ started_at TEXT,
77
+ completed_at TEXT
78
+ );
79
+
80
+ CREATE TABLE IF NOT EXISTS cluster_events (
81
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
82
+ event_type TEXT NOT NULL,
83
+ node_id TEXT,
84
+ task_id TEXT,
85
+ data TEXT DEFAULT '{}',
86
+ created_at TEXT DEFAULT (datetime('now'))
87
+ );
88
+
89
+ CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
90
+ CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
91
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
92
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
93
+ CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
94
+ CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
95
+ CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
96
+ `);
97
+
98
+ // ─── Prepared Statements ─────────────────────────────────────────────
99
+
100
// Statements are prepared once at module load. Named parameters (@x) take an
// object argument; positional (?) parameters take scalars in order.
const stmts = {
  // Nodes
  insertNode: db.prepare(`
    INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
    VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
  `),
  updateNode: db.prepare(`
    UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
      capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
    WHERE id=@id
  `),
  setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
  // NOTE: heartbeat unconditionally flips status back to 'active'.
  heartbeatNode: db.prepare(`
    UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
    WHERE id=@id
  `),
  getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
  getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
  listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
  listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
  listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
  deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
  // ? = staleness threshold in seconds.
  getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),

  // Tasks
  insertTask: db.prepare(`
    INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
    VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
  `),
  // Assignment is also where the attempt counter is consumed.
  assignTask: db.prepare(`
    UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
    WHERE id=@id
  `),
  startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
  completeTask: db.prepare(`
    UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
  `),
  failTask: db.prepare(`
    UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
  `),
  requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
  getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
  getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
  getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
  getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
  getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
  listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
  // ? = stuck threshold in seconds, measured from assigned_at.
  getStuckTasks: db.prepare(`
    SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
    AND assigned_at < datetime('now', '-' || ? || ' seconds')
  `),
  countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
  incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
  // MAX(0, ...) guards against double-decrements driving the counter negative.
  decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),

  // Events
  insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
  getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
  getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
};
160
+
161
+ // ═══════════════════════════════════════════════════════════════════════════
162
+ // TASK DISTRIBUTOR
163
+ // ═══════════════════════════════════════════════════════════════════════════
164
+
165
/**
 * Routes tasks to worker nodes based on capacity, affinity, and load.
 *
 * Strategies (applied in order; each narrows the candidate set only when it
 * produces at least one match):
 *  - region:       prefer nodes whose region matches the task's affinity_region
 *  - affinity:     prefer nodes sharing at least one affinity tag with the task
 *  - least-loaded: final ordering — lowest capacity_used/capacity_total wins
 *
 * Workers receive tasks via push (`_notifyWorker`) or by polling (`pullTasks`).
 */
class TaskDistributor {
  constructor() {
    this._roundRobinIndex = 0; // reserved for a future round-robin strategy
    this._lastStrategy = null; // strategy label of the most recent selection
    this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
  }

  /**
   * Submit a task for distributed execution.
   *
   * @param {object} task - { id?, externalId?, type?, objective?, params?,
   *   priority?, affinityTags?, affinityRegion?, timeout?, maxAttempts? }
   * @returns {{taskId: string, status: 'pending'}} id of the persisted row;
   *   status reflects the row as inserted (assignment may follow immediately).
   */
  submit(task) {
    const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
    const entry = {
      id,
      external_id: task.externalId || null,
      task_type: task.type || 'general',
      objective: task.objective || '',
      payload: JSON.stringify(task.params || {}),
      priority: task.priority || 50,
      status: 'pending',
      affinity_tags: JSON.stringify(task.affinityTags || []),
      affinity_region: task.affinityRegion || null,
      timeout_ms: task.timeout || 60000,
      max_attempts: task.maxAttempts || 3,
    };
    stmts.insertTask.run(entry);

    bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
    this._stats.distributed++;

    // Try immediate assignment; if no node has capacity, the orchestrator's
    // periodic processPending() sweep picks the task up later.
    this._tryAssign(id);

    return { taskId: id, status: 'pending' };
  }

  /**
   * Try to assign a pending task to a worker node.
   * @returns {boolean} true when a node was selected and the task assigned.
   */
  _tryAssign(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task || task.status !== 'pending') return false;

    const node = this._selectNode(task);
    if (!node) {
      this._stats.noCapacity++;
      return false;
    }

    stmts.assignTask.run({ id: taskId, node_id: node.id });
    stmts.incrementNodeLoad.run(node.id);

    logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
    bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });

    // Push notification to worker (fire-and-forget)
    this._notifyWorker(node, taskId, task);

    return true;
  }

  /**
   * Select the best node for a task (see class doc for strategy order).
   * @returns {object|null} a cluster_nodes row, or null when no active node
   *   has free capacity.
   */
  _selectNode(task) {
    // FIX: reset the label per selection. Previously it persisted across
    // calls, so a plain least-loaded pick could be logged with a stale
    // 'region'/'affinity' strategy left over from an earlier task.
    this._lastStrategy = null;

    let candidates = stmts.listActiveNodes.all();
    if (candidates.length === 0) return null;

    // Filter by capacity
    candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
    if (candidates.length === 0) return null;

    const affinityTags = safeParse(task.affinity_tags, []);
    const affinityRegion = task.affinity_region;

    // Strategy 1: Region affinity (soft — ignored when no node matches)
    if (affinityRegion) {
      const regionNodes = candidates.filter(n => n.region === affinityRegion);
      if (regionNodes.length > 0) {
        candidates = regionNodes;
        this._lastStrategy = 'region';
      }
    }

    // Strategy 2: Tag affinity (soft — ignored when no node matches)
    if (affinityTags.length > 0) {
      const tagged = candidates.filter(n => {
        const nodeTags = safeParse(n.tags, []);
        return affinityTags.some(t => nodeTags.includes(t));
      });
      if (tagged.length > 0) {
        candidates = tagged;
        this._lastStrategy = 'affinity';
      }
    }

    // Strategy 3: Least-loaded among the remaining candidates
    candidates.sort((a, b) => {
      const loadA = a.capacity_used / a.capacity_total;
      const loadB = b.capacity_used / b.capacity_total;
      return loadA - loadB;
    });

    this._lastStrategy = this._lastStrategy || 'least-loaded';
    return candidates[0];
  }

  /**
   * Push a task notification to a worker node. Best-effort: errors and
   * timeouts are ignored — the worker can still pull the task later.
   */
  _notifyWorker(node, taskId, task) {
    const payload = JSON.stringify({
      type: 'task.assigned',
      taskId,
      taskType: task.task_type,
      objective: task.objective,
      params: safeParse(task.payload, {}),
      priority: task.priority,
      timeout: task.timeout_ms,
    });

    const url = new URL('/wab-worker/tasks/notify', node.endpoint);
    const mod = url.protocol === 'https:' ? https : http;

    const req = mod.request(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
      timeout: 5000,
    });
    req.on('error', () => { /* best-effort push */ });
    // FIX: the `timeout` option only emits a 'timeout' event; without a
    // handler the socket to a hung worker stays open indefinitely.
    req.on('timeout', () => req.destroy());
    req.write(payload);
    req.end();
  }

  /**
   * Reassign tasks from a dead/removed node: fail those that are out of
   * attempts, requeue the rest and try to place them immediately.
   * @returns {number} count of tasks successfully reassigned.
   */
  reassignFromNode(nodeId) {
    const tasks = stmts.getTasksByNode.all(nodeId);
    let reassigned = 0;

    for (const task of tasks) {
      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
        logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
        bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
        continue;
      }

      stmts.decrementNodeLoad.run(nodeId);
      stmts.requeueTask.run(task.id);
      logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });

      // Try to assign to another node
      if (this._tryAssign(task.id)) {
        reassigned++;
        this._stats.reassigned++;
      }
    }

    return reassigned;
  }

  /**
   * Sweep the pending queue and attempt assignment — called periodically by
   * the orchestrator's health-check interval.
   * @returns {number} tasks assigned during this sweep.
   */
  processPending() {
    const pending = stmts.getPendingTasks.all(50);
    let assigned = 0;
    for (const task of pending) {
      if (this._tryAssign(task.id)) assigned++;
    }
    return assigned;
  }

  /**
   * Pull-based assignment: an active worker polls for up to `limit` tasks
   * that fit its free capacity and satisfy region/tag affinity.
   * @returns {Array<object>} task descriptors for the worker to execute.
   */
  pullTasks(nodeId, limit = 5) {
    const node = stmts.getNode.get(nodeId);
    if (!node || node.status !== 'active') return [];

    const available = node.capacity_total - node.capacity_used;
    if (available <= 0) return [];

    const count = Math.min(limit, available);
    const pending = stmts.getPendingTasks.all(count);
    const assigned = [];

    for (const task of pending) {
      // Skip tasks whose affinity this node cannot satisfy
      const affinityRegion = task.affinity_region;
      if (affinityRegion && node.region !== affinityRegion) continue;

      const affinityTags = safeParse(task.affinity_tags, []);
      const nodeTags = safeParse(node.tags, []);
      if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;

      stmts.assignTask.run({ id: task.id, node_id: nodeId });
      stmts.incrementNodeLoad.run(nodeId);
      logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });

      assigned.push({
        taskId: task.id,
        type: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });
    }

    return assigned;
  }

  /** @returns {{distributed:number, reassigned:number, noCapacity:number}} copy of counters. */
  getStats() { return { ...this._stats }; }
}
390
+
391
+ // ═══════════════════════════════════════════════════════════════════════════
392
+ // CLUSTER ORCHESTRATOR
393
+ // ═══════════════════════════════════════════════════════════════════════════
394
+
395
+ /**
396
+ * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
397
+ *
398
+ * Responsibilities:
399
+ * - Node registration and authentication
400
+ * - Health monitoring via heartbeats
401
+ * - Dead node detection and task failover
402
+ * - Load rebalancing across the cluster
403
+ * - Cluster topology and status reporting
404
+ * - Drain and cordon operations
405
+ */
406
class ClusterOrchestrator {
  /**
   * @param {TaskDistributor} distributor - used for task (re)assignment on
   *   failover, deregistration, and the periodic pending-queue sweep.
   */
  constructor(distributor) {
    this._distributor = distributor;
    this._heartbeatThresholdSec = 90; // Node considered dead after 90s no heartbeat
    this._checkInterval = null;       // 30s health/stuck/pending timer (set in start())
    this._rebalanceInterval = null;   // 5 min rebalance timer (set in start())
    this._started = false;
  }
414
+
415
+ // ─── Lifecycle ──────────────────────────────────────────────────────
416
+
417
+ /**
418
+ * Start the orchestrator — begins periodic health checks and task processing
419
+ */
420
+ start() {
421
+ if (this._started) return;
422
+ this._started = true;
423
+
424
+ // Health check every 30s
425
+ this._checkInterval = setInterval(() => {
426
+ this._healthCheck();
427
+ this._recoverStuckTasks();
428
+ this._distributor.processPending();
429
+ }, 30_000);
430
+ if (this._checkInterval.unref) this._checkInterval.unref();
431
+
432
+ // Rebalance every 5 min
433
+ this._rebalanceInterval = setInterval(() => {
434
+ this._rebalance();
435
+ }, 300_000);
436
+ if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();
437
+
438
+ bus.emit('cluster.started', { timestamp: Date.now() });
439
+ }
440
+
441
+ /**
442
+ * Stop the orchestrator
443
+ */
444
+ stop() {
445
+ if (!this._started) return;
446
+ this._started = false;
447
+ if (this._checkInterval) clearInterval(this._checkInterval);
448
+ if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
449
+ bus.emit('cluster.stopped', { timestamp: Date.now() });
450
+ }
451
+
452
+ // ─── Node Management ───────────────────────────────────────────────
453
+
454
+ /**
455
+ * Register a worker node to join the cluster
456
+ */
457
+ registerNode(config) {
458
+ if (!config.name || !config.endpoint) {
459
+ throw new Error('Node name and endpoint required');
460
+ }
461
+
462
+ // Check for existing node with same endpoint
463
+ const existing = stmts.getNodeByEndpoint.get(config.endpoint);
464
+ if (existing) {
465
+ // Re-register: update and reactivate
466
+ stmts.updateNode.run({
467
+ id: existing.id,
468
+ name: config.name,
469
+ endpoint: config.endpoint,
470
+ region: config.region || existing.region,
471
+ zone: config.zone || existing.zone,
472
+ capacity_total: config.capacity || existing.capacity_total,
473
+ tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
474
+ hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
475
+ version: config.version || existing.version,
476
+ });
477
+ stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
478
+ logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
479
+ bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
480
+ return { nodeId: existing.id, status: 'active', rejoined: true };
481
+ }
482
+
483
+ const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
484
+ const secretHash = crypto.createHash('sha256')
485
+ .update(config.secret || crypto.randomBytes(32).toString('hex'))
486
+ .digest('hex');
487
+
488
+ stmts.insertNode.run({
489
+ id: nodeId,
490
+ name: config.name,
491
+ endpoint: config.endpoint,
492
+ region: config.region || 'default',
493
+ zone: config.zone || 'a',
494
+ role: config.role || 'worker',
495
+ status: 'active',
496
+ capacity_total: config.capacity || 20,
497
+ tags: JSON.stringify(config.tags || []),
498
+ hardware: JSON.stringify(config.hardware || {}),
499
+ version: config.version || null,
500
+ secret_hash: secretHash,
501
+ });
502
+
503
+ logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
504
+ bus.emit('cluster.node.joined', { nodeId, name: config.name });
505
+
506
+ return { nodeId, status: 'active', secret: config.secret ? undefined : undefined };
507
+ }
508
+
509
+ /**
510
+ * Remove a node from the cluster
511
+ */
512
+ deregisterNode(nodeId) {
513
+ const node = stmts.getNode.get(nodeId);
514
+ if (!node) return null;
515
+
516
+ // Reassign tasks before removing
517
+ const reassigned = this._distributor.reassignFromNode(nodeId);
518
+ stmts.deleteNode.run(nodeId);
519
+
520
+ logEvent('node.deregistered', nodeId, null, { reassigned });
521
+ bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });
522
+
523
+ return { nodeId, reassigned };
524
+ }
525
+
526
  /**
   * Process a heartbeat from a worker node: refresh last_heartbeat, sync the
   * node's self-reported load, and optionally refresh its hardware profile.
   *
   * NOTE(review): the heartbeatNode statement sets status='active'
   * unconditionally, so a heartbeat re-activates a draining/cordoned/dead
   * node — confirm drain/cordon are meant to survive worker heartbeats.
   *
   * @param {string} nodeId
   * @param {object} [data] - { capacityUsed?, capacityTotal?, tags?,
   *   hardware?, version? }
   * @returns {object|null} ack with a pendingTasks hint, or null if unknown.
   */
  heartbeat(nodeId, data = {}) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.heartbeatNode.run({
      id: nodeId,
      // Trust the worker's self-reported load when present (0 is valid).
      capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
    });

    // Update hardware profile if provided
    if (data.hardware) {
      stmts.updateNode.run({
        id: nodeId,
        name: node.name,
        endpoint: node.endpoint,
        region: node.region,
        zone: node.zone,
        capacity_total: data.capacityTotal || node.capacity_total,
        tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
        hardware: JSON.stringify(data.hardware),
        version: data.version || node.version,
      });
    }

    return {
      nodeId,
      status: 'active',
      // Hint so the worker knows whether pulling for work is worthwhile.
      pendingTasks: stmts.getPendingTasks.all(1).length > 0,
    };
  }
559
+
560
+ /**
561
+ * Drain a node — stop assigning new tasks, wait for running tasks to finish
562
+ */
563
+ drainNode(nodeId) {
564
+ const node = stmts.getNode.get(nodeId);
565
+ if (!node) return null;
566
+
567
+ stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
568
+ logEvent('node.draining', nodeId, null, {});
569
+ bus.emit('cluster.node.draining', { nodeId, name: node.name });
570
+
571
+ return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
572
+ }
573
+
574
+ /**
575
+ * Cordon a node — prevent scheduling but keep running tasks
576
+ */
577
+ cordonNode(nodeId) {
578
+ const node = stmts.getNode.get(nodeId);
579
+ if (!node) return null;
580
+
581
+ stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
582
+ logEvent('node.cordoned', nodeId, null, {});
583
+ bus.emit('cluster.node.cordoned', { nodeId, name: node.name });
584
+
585
+ return { nodeId, status: 'cordoned' };
586
+ }
587
+
588
+ /**
589
+ * Uncordon a node — allow scheduling again
590
+ */
591
+ uncordonNode(nodeId) {
592
+ const node = stmts.getNode.get(nodeId);
593
+ if (!node) return null;
594
+
595
+ stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
596
+ logEvent('node.uncordoned', nodeId, null, {});
597
+
598
+ return { nodeId, status: 'active' };
599
+ }
600
+
601
+ /**
602
+ * Get node details
603
+ */
604
+ getNode(nodeId) {
605
+ const node = stmts.getNode.get(nodeId);
606
+ if (!node) return null;
607
+ node.tags = safeParse(node.tags, []);
608
+ node.hardware = safeParse(node.hardware, {});
609
+ node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
610
+ return node;
611
+ }
612
+
613
+ /**
614
+ * List all cluster nodes
615
+ */
616
+ listNodes(filter = {}) {
617
+ let nodes;
618
+ if (filter.region) {
619
+ nodes = stmts.listNodesByRegion.all(filter.region);
620
+ } else if (filter.active) {
621
+ nodes = stmts.listActiveNodes.all();
622
+ } else {
623
+ nodes = stmts.listNodes.all();
624
+ }
625
+ return nodes.map(n => ({
626
+ ...n,
627
+ tags: safeParse(n.tags, []),
628
+ hardware: safeParse(n.hardware, {}),
629
+ }));
630
+ }
631
+
632
+ // ─── Task Reporting ─────────────────────────────────────────────────
633
+
634
+ /**
635
+ * Worker reports task started
636
+ */
637
+ reportTaskStarted(taskId) {
638
+ const task = stmts.getTask.get(taskId);
639
+ if (!task) return null;
640
+ stmts.startTask.run(taskId);
641
+ logEvent('task.started', task.node_id, taskId, {});
642
+ bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
643
+ return { taskId, status: 'running' };
644
+ }
645
+
646
+ /**
647
+ * Worker reports task completed
648
+ */
649
+ reportTaskCompleted(taskId, result) {
650
+ const task = stmts.getTask.get(taskId);
651
+ if (!task) return null;
652
+
653
+ stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
654
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
655
+
656
+ logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
657
+ bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });
658
+
659
+ return { taskId, status: 'completed' };
660
+ }
661
+
662
+ /**
663
+ * Worker reports task failed
664
+ */
665
+ reportTaskFailed(taskId, error) {
666
+ const task = stmts.getTask.get(taskId);
667
+ if (!task) return null;
668
+
669
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
670
+
671
+ // Retry if attempts remaining
672
+ if (task.attempts < task.max_attempts) {
673
+ stmts.requeueTask.run(taskId);
674
+ logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
675
+ bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });
676
+
677
+ // Try to assign to a different node
678
+ this._distributor._tryAssign(taskId);
679
+
680
+ return { taskId, status: 'retrying', attempt: task.attempts };
681
+ }
682
+
683
+ // Max attempts reached
684
+ stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
685
+ logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
686
+ bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });
687
+
688
+ return { taskId, status: 'failed' };
689
+ }
690
+
691
+ /**
692
+ * Get task details
693
+ */
694
+ getTask(taskId) {
695
+ const task = stmts.getTask.get(taskId);
696
+ if (!task) return null;
697
+ task.payload = safeParse(task.payload, {});
698
+ task.affinity_tags = safeParse(task.affinity_tags, []);
699
+ task.result = safeParse(task.result, null);
700
+ return task;
701
+ }
702
+
703
+ /**
704
+ * List tasks with optional status filter
705
+ */
706
+ listTasks(filter = {}) {
707
+ let tasks;
708
+ if (filter.status) {
709
+ tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
710
+ } else if (filter.nodeId) {
711
+ tasks = stmts.getTasksByNode.all(filter.nodeId);
712
+ } else {
713
+ tasks = stmts.listTasks.all(filter.limit || 50);
714
+ }
715
+ return tasks.map(t => ({
716
+ ...t,
717
+ payload: safeParse(t.payload, {}),
718
+ affinity_tags: safeParse(t.affinity_tags, []),
719
+ result: safeParse(t.result, null),
720
+ }));
721
+ }
722
+
723
+ // ─── Cluster Topology ───────────────────────────────────────────────
724
+
725
+ /**
726
+ * Get full cluster status
727
+ */
728
+ getClusterStatus() {
729
+ const nodes = stmts.listNodes.all();
730
+ const taskCounts = {};
731
+ for (const row of stmts.countByStatus.all()) {
732
+ taskCounts[row.status] = row.count;
733
+ }
734
+
735
+ const activeNodes = nodes.filter(n => n.status === 'active');
736
+ const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
737
+ const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);
738
+
739
+ // Group by region
740
+ const regions = {};
741
+ for (const node of nodes) {
742
+ if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
743
+ regions[node.region].nodes++;
744
+ if (node.status === 'active') {
745
+ regions[node.region].active++;
746
+ regions[node.region].capacity += node.capacity_total;
747
+ regions[node.region].used += node.capacity_used;
748
+ }
749
+ }
750
+
751
+ return {
752
+ coordinator: { started: this._started },
753
+ nodes: {
754
+ total: nodes.length,
755
+ active: activeNodes.length,
756
+ draining: nodes.filter(n => n.status === 'draining').length,
757
+ cordoned: nodes.filter(n => n.status === 'cordoned').length,
758
+ dead: nodes.filter(n => n.status === 'dead').length,
759
+ },
760
+ capacity: {
761
+ total: totalCapacity,
762
+ used: usedCapacity,
763
+ available: totalCapacity - usedCapacity,
764
+ utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
765
+ },
766
+ tasks: taskCounts,
767
+ regions,
768
+ distributor: this._distributor.getStats(),
769
+ };
770
+ }
771
+
772
+ /**
773
+ * Get cluster events log
774
+ */
775
+ getEvents(limit = 100, nodeId = null) {
776
+ if (nodeId) {
777
+ return stmts.getEventsByNode.all(nodeId, limit).map(e => ({
778
+ ...e,
779
+ data: safeParse(e.data, {}),
780
+ }));
781
+ }
782
+ return stmts.getEvents.all(limit).map(e => ({
783
+ ...e,
784
+ data: safeParse(e.data, {}),
785
+ }));
786
+ }
787
+
788
+ // ─── Internal Operations ────────────────────────────────────────────
789
+
790
+ /**
791
+ * Check for dead nodes and failover their tasks
792
+ */
793
+ _healthCheck() {
794
+ const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);
795
+
796
+ for (const node of staleNodes) {
797
+ stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
798
+ logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
799
+ bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });
800
+
801
+ // Failover: reassign all tasks from dead node
802
+ const reassigned = this._distributor.reassignFromNode(node.id);
803
+ logEvent('node.failover', node.id, null, { reassigned });
804
+ bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
805
+ }
806
+ }
807
+
808
+ /**
809
+ * Recover tasks that have been assigned/running too long (stuck)
810
+ */
811
+ _recoverStuckTasks() {
812
+ const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold
813
+
814
+ for (const task of stuckTasks) {
815
+ if (task.attempts >= task.max_attempts) {
816
+ stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
817
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
818
+ logEvent('task.stuck_failed', task.node_id, task.id, {});
819
+ } else {
820
+ if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
821
+ stmts.requeueTask.run(task.id);
822
+ this._distributor._tryAssign(task.id);
823
+ logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
824
+ }
825
+ }
826
+ }
827
+
828
+ /**
829
+ * Rebalance tasks across nodes when load is skewed
830
+ */
831
+ _rebalance() {
832
+ const nodes = stmts.listActiveNodes.all();
833
+ if (nodes.length < 2) return;
834
+
835
+ const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
836
+ const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
837
+ const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);
838
+
839
+ if (overloaded.length === 0 || underloaded.length === 0) return;
840
+
841
+ let moved = 0;
842
+ for (const over of overloaded) {
843
+ const tasks = stmts.getTasksByNode.all(over.id);
844
+ // Move up to 2 tasks from overloaded to underloaded
845
+ const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);
846
+
847
+ for (const task of toMove) {
848
+ const target = underloaded.find(n => n.capacity_used < n.capacity_total);
849
+ if (!target) break;
850
+
851
+ stmts.decrementNodeLoad.run(over.id);
852
+ stmts.assignTask.run({ id: task.id, node_id: target.id });
853
+ stmts.incrementNodeLoad.run(target.id);
854
+ target.capacity_used++;
855
+ moved++;
856
+
857
+ logEvent('task.rebalanced', target.id, task.id, { from: over.id });
858
+ this._distributor._notifyWorker(target, task.id, task);
859
+ }
860
+ }
861
+
862
+ if (moved > 0) {
863
+ bus.emit('cluster.rebalanced', { tasksMoved: moved });
864
+ }
865
+ }
866
+ }
867
+
868
+ // ═══════════════════════════════════════════════════════════════════════════
869
+ // HELPERS
870
+ // ═══════════════════════════════════════════════════════════════════════════
871
+
872
/**
 * Parse a JSON string defensively.
 *
 * Returns `fallback` for null/undefined input or malformed JSON; values
 * that are already objects (e.g. pre-parsed rows) pass through as-is.
 * @param {string|object|null|undefined} str - Raw JSON text or an already-parsed value.
 * @param {*} fallback - Value to return when parsing is impossible.
 * @returns {*} Parsed value, the original object, or the fallback.
 */
function safeParse(str, fallback) {
  if (str == null) {
    return fallback;
  }
  if (typeof str === 'object') {
    return str;
  }
  try {
    return JSON.parse(str);
  } catch {
    return fallback;
  }
}
877
+
878
/**
 * Append a row to the cluster event log. Failures are swallowed on
 * purpose: event logging is best-effort and must never break the
 * operation that triggered it.
 * @param {string} type - Event type, e.g. 'node.dead'.
 * @param {string|null} nodeId - Related node, if any.
 * @param {string|null} taskId - Related task, if any.
 * @param {object} data - Extra payload, JSON-serialized into the row.
 */
function logEvent(type, nodeId, taskId, data) {
  const row = {
    event_type: type,
    node_id: nodeId || null,
    task_id: taskId || null,
    data: JSON.stringify(data || {}),
  };
  try {
    stmts.insertEvent.run(row);
  } catch {
    // best-effort logging
  }
}
888
+
889
// ─── Singleton ───────────────────────────────────────────────────────

// Module-level shared instances: one TaskDistributor wired into one
// ClusterOrchestrator. The classes are exported alongside the singletons
// so callers can construct isolated instances (e.g. in tests).
const distributor = new TaskDistributor();
const cluster = new ClusterOrchestrator(distributor);

module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };