web-agent-bridge 3.4.0 → 3.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (315)
  1. package/LICENSE +84 -84
  2. package/README.ar.md +1565 -1304
  3. package/README.md +171 -298
  4. package/bin/agent-runner.js +474 -474
  5. package/bin/cli.js +237 -237
  6. package/bin/wab-init.js +244 -223
  7. package/bin/wab.js +80 -80
  8. package/examples/azure-dns-wab.js +83 -83
  9. package/examples/bidi-agent.js +119 -119
  10. package/examples/cloudflare-wab-dns.js +121 -121
  11. package/examples/cpanel-wab-dns.js +114 -114
  12. package/examples/cross-site-agent.js +91 -91
  13. package/examples/dns-discovery-agent.js +166 -166
  14. package/examples/gcp-dns-wab.js +76 -76
  15. package/examples/governance-agent.js +169 -169
  16. package/examples/mcp-agent.js +94 -94
  17. package/examples/next-app-router/README.md +44 -44
  18. package/examples/plesk-wab-dns.js +103 -103
  19. package/examples/puppeteer-agent.js +108 -108
  20. package/examples/route53-wab-dns.js +144 -144
  21. package/examples/saas-dashboard/README.md +55 -55
  22. package/examples/safe-mode-agent.js +96 -96
  23. package/examples/self-discovery.js +106 -0
  24. package/examples/shopify-hydrogen/README.md +74 -74
  25. package/examples/vision-agent.js +171 -171
  26. package/examples/wab-sign.js +74 -74
  27. package/examples/wab-verify.js +60 -60
  28. package/examples/wordpress-elementor/README.md +77 -77
  29. package/package.json +93 -93
  30. package/public/.well-known/agent-tools.json +180 -180
  31. package/public/.well-known/ai-assets.json +59 -59
  32. package/public/.well-known/security.txt +8 -8
  33. package/public/.well-known/wab.json +28 -28
  34. package/public/activate.html +448 -368
  35. package/public/adopt.html +236 -0
  36. package/public/adoption-metrics.html +188 -188
  37. package/public/agent-workspace.html +359 -349
  38. package/public/ai.html +198 -198
  39. package/public/api.html +397 -413
  40. package/public/atp.html +171 -0
  41. package/public/azure-dns-integration.html +289 -289
  42. package/public/browser.html +486 -486
  43. package/public/cloudflare-integration.html +380 -380
  44. package/public/commander-dashboard.html +243 -243
  45. package/public/cookies.html +210 -210
  46. package/public/cpanel-integration.html +398 -398
  47. package/public/css/agent-workspace.css +1713 -1713
  48. package/public/css/premium.css +317 -317
  49. package/public/css/styles.css +1401 -1263
  50. package/public/dashboard-shieldlink.html +295 -0
  51. package/public/dashboard.html +711 -707
  52. package/public/dns.html +436 -436
  53. package/public/docs.html +588 -588
  54. package/public/enterprise-mesh.ar.html +80 -0
  55. package/public/enterprise-mesh.html +81 -0
  56. package/public/feed.xml +89 -89
  57. package/public/gcp-dns-integration.html +318 -318
  58. package/public/governance.ar.html +70 -0
  59. package/public/governance.html +69 -0
  60. package/public/growth.html +465 -465
  61. package/public/index.html +1372 -1266
  62. package/public/integrations.html +556 -556
  63. package/public/js/activate.js +449 -145
  64. package/public/js/agent-workspace.js +1740 -1740
  65. package/public/js/auth-nav.js +117 -65
  66. package/public/js/auth-redirect.js +12 -12
  67. package/public/js/cookie-consent.js +56 -56
  68. package/public/js/dns.js +438 -438
  69. package/public/js/wab-demo-page.js +721 -721
  70. package/public/js/ws-client.js +74 -74
  71. package/public/l-preview.html +242 -0
  72. package/public/llms-full.txt +360 -360
  73. package/public/llms.txt +125 -125
  74. package/public/login.html +85 -85
  75. package/public/mesh-dashboard.html +328 -328
  76. package/public/milestones.html +346 -0
  77. package/public/one-click.html +779 -0
  78. package/public/openapi.json +669 -669
  79. package/public/partners.ar.html +145 -0
  80. package/public/partners.html +143 -0
  81. package/public/phone-shield.html +281 -281
  82. package/public/plesk-integration.html +375 -375
  83. package/public/premium-dashboard.html +2489 -2489
  84. package/public/premium.html +793 -793
  85. package/public/privacy.html +297 -297
  86. package/public/provider-onboarding.html +172 -172
  87. package/public/provider-sandbox.html +134 -134
  88. package/public/providers.html +359 -359
  89. package/public/refusals.html +172 -0
  90. package/public/register.html +105 -105
  91. package/public/registrar-integrations.html +141 -141
  92. package/public/ring4.html +292 -0
  93. package/public/robots.txt +99 -99
  94. package/public/route53-integration.html +531 -531
  95. package/public/score.html +263 -0
  96. package/public/script/wab-consent.d.ts +36 -36
  97. package/public/script/wab-consent.js +104 -104
  98. package/public/script/wab-schema.js +131 -131
  99. package/public/script/wab.d.ts +108 -108
  100. package/public/script/wab.min.js +580 -580
  101. package/public/security.txt +8 -8
  102. package/public/shieldlink.html +244 -0
  103. package/public/shieldqr.html +231 -231
  104. package/public/sitemap.xml +13 -1
  105. package/public/terms.html +256 -256
  106. package/public/trust-graph-api.ar.html +92 -0
  107. package/public/trust-graph-api.html +91 -0
  108. package/public/wab-features.html +560 -0
  109. package/public/wab-trust.html +200 -200
  110. package/public/wab-truth.html +375 -0
  111. package/public/wab-vs-protocols.html +210 -210
  112. package/public/whitepaper.html +449 -449
  113. package/script/ai-agent-bridge.js +1754 -1754
  114. package/sdk/README.md +99 -99
  115. package/sdk/agent-mesh.js +449 -449
  116. package/sdk/atp.js +103 -0
  117. package/sdk/auto-discovery.js +301 -288
  118. package/sdk/commander.js +262 -262
  119. package/sdk/governance.js +262 -262
  120. package/sdk/index.d.ts +464 -464
  121. package/sdk/index.js +653 -649
  122. package/sdk/multi-agent.js +318 -318
  123. package/sdk/safe-mode.js +221 -221
  124. package/sdk/safety-shield.js +219 -219
  125. package/sdk/schema-discovery.js +83 -83
  126. package/server/adapters/index.js +520 -520
  127. package/server/config/plans.js +412 -367
  128. package/server/config/secrets.js +102 -102
  129. package/server/control-plane/index.js +301 -301
  130. package/server/data-plane/index.js +354 -354
  131. package/server/index.js +793 -670
  132. package/server/llm/index.js +404 -404
  133. package/server/middleware/adminAuth.js +35 -35
  134. package/server/middleware/api-tier.js +170 -0
  135. package/server/middleware/auth.js +50 -50
  136. package/server/middleware/featureGate.js +88 -88
  137. package/server/middleware/rateLimits.js +100 -100
  138. package/server/middleware/sensitiveAction.js +157 -157
  139. package/server/middleware/wab-trust.js +141 -0
  140. package/server/migrations/001_add_analytics_indexes.sql +7 -7
  141. package/server/migrations/002_premium_features.sql +418 -418
  142. package/server/migrations/003_ads_integer_cents.sql +33 -33
  143. package/server/migrations/004_agent_os.sql +158 -158
  144. package/server/migrations/005_marketplace_metering.sql +126 -126
  145. package/server/migrations/006_growth_suite.sql +138 -0
  146. package/server/migrations/007_governance.sql +106 -106
  147. package/server/migrations/008_plans.sql +144 -144
  148. package/server/migrations/009_shieldqr.sql +30 -30
  149. package/server/migrations/010_extended_trust.sql +33 -33
  150. package/server/migrations/011_outreach.sql +47 -0
  151. package/server/migrations/012_shieldlink.sql +116 -0
  152. package/server/migrations/013_ct_monitor.sql +13 -0
  153. package/server/migrations/014_wab_advanced_features.sql +128 -0
  154. package/server/migrations/015_wab_truth_layer.sql +101 -0
  155. package/server/migrations/016_ring4_external_trust.sql +84 -0
  156. package/server/migrations/017_ring4_extensions.sql +69 -0
  157. package/server/migrations/018_commercial_foundations.sql +167 -0
  158. package/server/migrations/019_unify_tier_constraints.sql +133 -0
  159. package/server/migrations/020_agent_transaction_primitive.sql +119 -0
  160. package/server/models/adapters/index.js +33 -33
  161. package/server/models/adapters/mysql.js +183 -183
  162. package/server/models/adapters/postgresql.js +172 -172
  163. package/server/models/adapters/sqlite.js +7 -7
  164. package/server/models/db.js +740 -740
  165. package/server/observability/failure-analysis.js +337 -337
  166. package/server/observability/index.js +394 -394
  167. package/server/protocol/capabilities.js +223 -223
  168. package/server/protocol/index.js +243 -243
  169. package/server/protocol/schema.js +584 -584
  170. package/server/registry/certification.js +271 -271
  171. package/server/registry/index.js +326 -326
  172. package/server/routes/activate.js +478 -0
  173. package/server/routes/admin-outreach.js +239 -0
  174. package/server/routes/admin-plans.js +76 -76
  175. package/server/routes/admin-premium.js +674 -673
  176. package/server/routes/admin-shieldlink.js +137 -0
  177. package/server/routes/admin-shieldqr.js +90 -90
  178. package/server/routes/admin-trust-monitor.js +139 -83
  179. package/server/routes/admin.js +550 -549
  180. package/server/routes/adopt.js +61 -0
  181. package/server/routes/ads.js +130 -130
  182. package/server/routes/agent-workspace.js +540 -540
  183. package/server/routes/api-keys.js +127 -0
  184. package/server/routes/api.js +150 -150
  185. package/server/routes/auth.js +71 -71
  186. package/server/routes/billing.js +57 -57
  187. package/server/routes/commander.js +316 -316
  188. package/server/routes/customer-shieldlink.js +133 -0
  189. package/server/routes/demo-showcase.js +332 -332
  190. package/server/routes/demo-store.js +154 -154
  191. package/server/routes/diagnose.js +373 -0
  192. package/server/routes/discovery.js +2348 -2348
  193. package/server/routes/enterprise-mesh.js +170 -0
  194. package/server/routes/gateway.js +173 -173
  195. package/server/routes/governance-saas.js +203 -0
  196. package/server/routes/governance.js +208 -208
  197. package/server/routes/growth.js +1048 -0
  198. package/server/routes/intent.js +328 -0
  199. package/server/routes/license.js +251 -251
  200. package/server/routes/mesh.js +469 -469
  201. package/server/routes/noscript.js +543 -543
  202. package/server/routes/partners.js +201 -0
  203. package/server/routes/plans.js +33 -33
  204. package/server/routes/premium-v2.js +686 -686
  205. package/server/routes/premium.js +724 -724
  206. package/server/routes/providers.js +650 -650
  207. package/server/routes/reputation.js +411 -0
  208. package/server/routes/ring4.js +885 -0
  209. package/server/routes/runtime.js +2148 -2148
  210. package/server/routes/shieldlink.js +70 -0
  211. package/server/routes/shieldqr.js +88 -88
  212. package/server/routes/sovereign.js +465 -465
  213. package/server/routes/transactions.js +233 -0
  214. package/server/routes/truth-layer.js +670 -0
  215. package/server/routes/universal.js +200 -200
  216. package/server/routes/unsubscribe.js +51 -0
  217. package/server/routes/wab-api.js +850 -850
  218. package/server/routes/wab-cache.js +282 -0
  219. package/server/runtime/container-worker.js +111 -111
  220. package/server/runtime/container.js +448 -448
  221. package/server/runtime/distributed-worker.js +362 -362
  222. package/server/runtime/event-bus.js +210 -210
  223. package/server/runtime/index.js +253 -253
  224. package/server/runtime/queue.js +599 -599
  225. package/server/runtime/replay.js +666 -666
  226. package/server/runtime/sandbox.js +266 -266
  227. package/server/runtime/scheduler.js +534 -534
  228. package/server/runtime/session-engine.js +293 -293
  229. package/server/runtime/state-manager.js +188 -188
  230. package/server/secrets/wab-signing-key.pem +3 -0
  231. package/server/secrets/wab-signing-pub.pem +3 -0
  232. package/server/security/cross-site-redactor.js +196 -196
  233. package/server/security/dry-run.js +180 -180
  234. package/server/security/human-gate-rate-limit.js +147 -147
  235. package/server/security/human-gate-transports.js +178 -178
  236. package/server/security/human-gate.js +281 -281
  237. package/server/security/index.js +368 -368
  238. package/server/security/intent-engine.js +245 -245
  239. package/server/security/reward-guard.js +171 -171
  240. package/server/security/rollback-store.js +239 -239
  241. package/server/security/token-scope.js +404 -404
  242. package/server/security/url-policy.js +139 -139
  243. package/server/services/adoption-agent.js +182 -0
  244. package/server/services/agent-chat.js +506 -506
  245. package/server/services/agent-learning.js +601 -601
  246. package/server/services/agent-memory.js +625 -625
  247. package/server/services/agent-mesh.js +555 -555
  248. package/server/services/agent-symphony.js +717 -717
  249. package/server/services/agent-tasks.js +1807 -1807
  250. package/server/services/api-key-engine.js +292 -292
  251. package/server/services/cluster.js +894 -894
  252. package/server/services/commander.js +738 -738
  253. package/server/services/edge-compute.js +440 -440
  254. package/server/services/email.js +233 -233
  255. package/server/services/fairness-engine.js +409 -0
  256. package/server/services/fairness.js +420 -0
  257. package/server/services/governance.js +466 -466
  258. package/server/services/hosted-runtime.js +205 -205
  259. package/server/services/lfd.js +635 -635
  260. package/server/services/local-ai.js +389 -389
  261. package/server/services/marketplace.js +270 -270
  262. package/server/services/metering.js +182 -182
  263. package/server/services/modules/affiliate-intelligence.js +93 -93
  264. package/server/services/modules/agent-firewall.js +90 -90
  265. package/server/services/modules/bounty.js +89 -89
  266. package/server/services/modules/collective-bargaining.js +92 -92
  267. package/server/services/modules/dark-pattern.js +66 -66
  268. package/server/services/modules/gov-intelligence.js +45 -45
  269. package/server/services/modules/neural.js +55 -55
  270. package/server/services/modules/notary.js +49 -49
  271. package/server/services/modules/price-time-machine.js +86 -86
  272. package/server/services/modules/protocol.js +104 -104
  273. package/server/services/negotiation.js +439 -439
  274. package/server/services/outreach-agent.js +312 -0
  275. package/server/services/plans.js +214 -214
  276. package/server/services/plugins.js +771 -771
  277. package/server/services/price-intelligence.js +566 -566
  278. package/server/services/price-shield.js +1137 -1137
  279. package/server/services/provider-clients.js +740 -740
  280. package/server/services/reputation.js +465 -465
  281. package/server/services/search-engine.js +357 -357
  282. package/server/services/security.js +513 -513
  283. package/server/services/self-healing.js +843 -843
  284. package/server/services/shieldlink.js +492 -0
  285. package/server/services/shieldqr.js +322 -322
  286. package/server/services/sovereign-shield.js +542 -542
  287. package/server/services/ssl-ct-monitor.js +224 -0
  288. package/server/services/ssl-inspector.js +42 -42
  289. package/server/services/ssl-monitor.js +167 -167
  290. package/server/services/stripe.js +206 -205
  291. package/server/services/swarm.js +788 -788
  292. package/server/services/transactions.js +525 -0
  293. package/server/services/universal-scraper.js +662 -662
  294. package/server/services/verification.js +481 -481
  295. package/server/services/vision.js +1163 -1163
  296. package/server/services/wab-crypto.js +178 -178
  297. package/server/utils/cache.js +125 -125
  298. package/server/utils/migrate.js +81 -81
  299. package/server/utils/safe-fetch.js +228 -228
  300. package/server/utils/secureFields.js +50 -50
  301. package/server/ws.js +161 -161
  302. package/templates/artisan-marketplace.yaml +104 -104
  303. package/templates/book-price-scout.yaml +98 -98
  304. package/templates/electronics-price-tracker.yaml +108 -108
  305. package/templates/flight-deal-hunter.yaml +113 -113
  306. package/templates/freelancer-direct.yaml +116 -116
  307. package/templates/grocery-price-compare.yaml +93 -93
  308. package/templates/hotel-direct-booking.yaml +113 -113
  309. package/templates/local-services.yaml +98 -98
  310. package/templates/olive-oil-tunisia.yaml +88 -88
  311. package/templates/organic-farm-fresh.yaml +101 -101
  312. package/templates/restaurant-direct.yaml +97 -97
  313. package/templates/ring4/banking-sovereign.yaml +55 -0
  314. package/templates/ring4/ecommerce-sovereign.yaml +58 -0
  315. package/templates/ring4/healthcare-sovereign.yaml +60 -0
@@ -1,894 +1,894 @@
1
- 'use strict';
2
-
3
- /**
4
- * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
- *
6
- * Turns WAB from a single-server Agent OS into a distributed fleet.
7
- *
8
- * Architecture:
9
- * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
- * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
- * │ (this node) │────▶│ (remote) │ │ (remote) │
12
- * │ │────▶│ │ │ │
13
- * └──────────────┘ └──────────┘ └──────────┘
14
- * │ ▲ ▲
15
- * │ │ │
16
- * └───────────────────┴─────────────────┘
17
- * heartbeat / task results
18
- *
19
- * Components:
20
- * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
- * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
- * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
- *
24
- * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
- * Persistence: SQLite tables for durability across restarts
26
- * Consistency: Leader-based (coordinator is source of truth)
27
- */
28
-
29
- const crypto = require('crypto');
30
- const http = require('http');
31
- const https = require('https');
32
- const { URL } = require('url');
33
- const { db } = require('../models/db');
34
- const { bus } = require('../runtime/event-bus');
35
-
36
- // ─── Schema ──────────────────────────────────────────────────────────
37
-
38
// Schema bootstrap, executed as a top-level statement when this module loads.
// Every statement uses IF NOT EXISTS, so re-running it against an existing
// database is a no-op. It creates:
//   - cluster_nodes  : one row per node (endpoint, region/zone, role, status,
//                      capacity_total / capacity_used, heartbeat timestamps)
//   - cluster_tasks  : one row per submitted task (type, payload, priority,
//                      status, attempts / max_attempts, affinity, timestamps)
//   - cluster_events : append-only event rows keyed by an AUTOINCREMENT id
// plus indexes on the status / region / node / priority / event_type columns.
// Timestamps are TEXT columns defaulting to datetime('now'); tags, hardware,
// payload, affinity_tags and data are TEXT columns whose defaults are the
// JSON-looking literals '[]' / '{}'.
- db.exec(`
39
- CREATE TABLE IF NOT EXISTS cluster_nodes (
40
- id TEXT PRIMARY KEY,
41
- name TEXT NOT NULL,
42
- endpoint TEXT NOT NULL,
43
- region TEXT DEFAULT 'default',
44
- zone TEXT DEFAULT 'a',
45
- role TEXT DEFAULT 'worker',
46
- status TEXT DEFAULT 'joining',
47
- capacity_total INTEGER DEFAULT 20,
48
- capacity_used INTEGER DEFAULT 0,
49
- tags TEXT DEFAULT '[]',
50
- hardware TEXT DEFAULT '{}',
51
- version TEXT,
52
- secret_hash TEXT,
53
- last_heartbeat TEXT DEFAULT (datetime('now')),
54
- registered_at TEXT DEFAULT (datetime('now')),
55
- updated_at TEXT DEFAULT (datetime('now'))
56
- );
57
-
58
- CREATE TABLE IF NOT EXISTS cluster_tasks (
59
- id TEXT PRIMARY KEY,
60
- external_id TEXT,
61
- node_id TEXT,
62
- task_type TEXT NOT NULL,
63
- objective TEXT,
64
- payload TEXT DEFAULT '{}',
65
- priority INTEGER DEFAULT 50,
66
- status TEXT DEFAULT 'pending',
67
- result TEXT,
68
- error TEXT,
69
- attempts INTEGER DEFAULT 0,
70
- max_attempts INTEGER DEFAULT 3,
71
- affinity_tags TEXT DEFAULT '[]',
72
- affinity_region TEXT,
73
- timeout_ms INTEGER DEFAULT 60000,
74
- submitted_at TEXT DEFAULT (datetime('now')),
75
- assigned_at TEXT,
76
- started_at TEXT,
77
- completed_at TEXT
78
- );
79
-
80
- CREATE TABLE IF NOT EXISTS cluster_events (
81
- id INTEGER PRIMARY KEY AUTOINCREMENT,
82
- event_type TEXT NOT NULL,
83
- node_id TEXT,
84
- task_id TEXT,
85
- data TEXT DEFAULT '{}',
86
- created_at TEXT DEFAULT (datetime('now'))
87
- );
88
-
89
- CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
90
- CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
91
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
92
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
93
- CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
94
- CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
95
- CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
96
- `);
97
-
98
- // ─── Prepared Statements ─────────────────────────────────────────────
99
-
100
// Prepared statements, created once when this module loads and shared by the
// classes below. Statements written with @name placeholders are called with a
// single object; statements written with ? placeholders are called with
// positional arguments in the order the ? marks appear.
- const stmts = {
101
- // Nodes
102
- insertNode: db.prepare(`
103
- INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
104
- VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
105
- `),
106
- updateNode: db.prepare(`
107
- UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
108
- capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
109
- WHERE id=@id
110
- `),
111
- setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
112
// Overwrites capacity_used with the supplied value and stamps last_heartbeat.
// NOTE(review): this also sets status='active' unconditionally, whatever the
// node's previous status was — confirm callers guard nodes that are meant to
// stay in some other state.
- heartbeatNode: db.prepare(`
113
- UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
114
- WHERE id=@id
115
- `),
116
- getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
117
- getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
118
- listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
119
- listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
120
- listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
121
- deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
122
// The single ? is an age in seconds: returns active nodes whose
// last_heartbeat is older than that many seconds.
- getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),
123
-
124
- // Tasks
125
- insertTask: db.prepare(`
126
- INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
127
- VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
128
- `),
129
// Besides recording the node and the 'assigned' status, every assignment
// increments the task's attempts counter.
- assignTask: db.prepare(`
130
- UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
131
- WHERE id=@id
132
- `),
133
- startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
134
- completeTask: db.prepare(`
135
- UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
136
- `),
137
- failTask: db.prepare(`
138
- UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
139
- `),
140
// Returns a task to 'pending' and clears node_id / assigned_at; attempts and
// started_at are left as they were.
- requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
141
- getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
142
- getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
143
- getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
144
- getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
145
- getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
146
- listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
147
// The single ? is an age in seconds measured from assigned_at (not
// started_at); the task's own timeout_ms column is not consulted here.
- getStuckTasks: db.prepare(`
148
- SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
149
- AND assigned_at < datetime('now', '-' || ? || ' seconds')
150
- `),
151
- countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
152
- incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
153
// Clamped with MAX(0, ...) so capacity_used never goes negative.
- decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),
154
-
155
- // Events
156
- insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
157
- getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
158
- getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
159
- };
160
-
161
- // ═══════════════════════════════════════════════════════════════════════════
162
- // TASK DISTRIBUTOR
163
- // ═══════════════════════════════════════════════════════════════════════════
164
-
165
/**
 * Routes cluster tasks to worker nodes based on capacity, affinity and load.
 *
 * Push path (`submit` / `processPending` / `reassignFromNode`): the
 * distributor picks a node itself. Active nodes are narrowed in this order:
 *   1. capacity     — nodes with no free slot are dropped
 *   2. region       — if the task names a region and some node there has
 *                     room, only those nodes are kept
 *   3. tag affinity — if some remaining node shares a tag with the task, only
 *                     those nodes are kept
 *   4. least-loaded — the node with the lowest used/total ratio wins
 * Region and tag affinity are soft preferences here: when nothing matches,
 * the unfiltered candidates are used.
 *
 * Pull path (`pullTasks`): a worker asks for work. Region and tag affinity
 * are hard filters there — a task that names a region or tags is only handed
 * to a matching node.
 *
 * No method of this class implements round-robin selection.
 */
class TaskDistributor {
  constructor() {
    // Not read by any method of this class; kept so the instance shape does
    // not change for code that may inspect it.
    this._roundRobinIndex = 0;
    // Strategy used by the most recent successful _selectNode call.
    this._lastStrategy = null;
    this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
  }

  /**
   * Submit a task for distributed execution.
   *
   * The task row is inserted as 'pending', a `cluster.task.submitted` event is
   * emitted, and one immediate assignment attempt is made.
   *
   * @param {object} task
   * @param {string} [task.id] - Task id; a random `ct_<hex>` id is generated when omitted.
   * @param {string} [task.externalId]
   * @param {string} [task.type='general']
   * @param {string} [task.objective='']
   * @param {object} [task.params={}] - Stored as JSON in the payload column.
   * @param {number} [task.priority=50] - Higher runs first; an explicit 0 is honoured.
   * @param {string[]} [task.affinityTags=[]]
   * @param {string} [task.affinityRegion]
   * @param {number} [task.timeout=60000] - Milliseconds.
   * @param {number} [task.maxAttempts=3]
   * @returns {{taskId: string, status: string}} `status` is always 'pending',
   *   even when the immediate assignment attempt succeeded.
   */
  submit(task) {
    const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
    const entry = {
      id,
      external_id: task.externalId || null,
      task_type: task.type || 'general',
      objective: task.objective || '',
      payload: JSON.stringify(task.params || {}),
      // `??` rather than `||`: 0 is a legitimate (lowest) priority.
      priority: task.priority ?? 50,
      status: 'pending',
      affinity_tags: JSON.stringify(task.affinityTags || []),
      affinity_region: task.affinityRegion || null,
      timeout_ms: task.timeout || 60000,
      max_attempts: task.maxAttempts || 3,
    };
    stmts.insertTask.run(entry);

    bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
    this._stats.distributed++;

    // Try immediate assignment; if no node has room the task stays pending
    // until processPending() or a worker pull picks it up.
    this._tryAssign(id);

    return { taskId: id, status: 'pending' };
  }

  /**
   * Try to assign one pending task to a worker node.
   *
   * @param {string} taskId
   * @returns {boolean} true when the task was assigned; false when the task is
   *   missing, not pending, or no node has free capacity.
   */
  _tryAssign(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task || task.status !== 'pending') return false;

    const node = this._selectNode(task);
    if (!node) {
      this._stats.noCapacity++;
      return false;
    }

    stmts.assignTask.run({ id: taskId, node_id: node.id });
    stmts.incrementNodeLoad.run(node.id);

    logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
    bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });

    // Push notification to worker (fire-and-forget, never throws).
    this._notifyWorker(node, taskId, task);

    return true;
  }

  /**
   * Select the best node for a task.
   *
   * On success `this._lastStrategy` is set to the strategy that decided this
   * particular call: 'affinity' if the tag filter narrowed the candidates,
   * else 'region' if the region filter did, else 'least-loaded'.
   *
   * @param {object} task - Row from cluster_tasks (reads affinity_tags, affinity_region).
   * @returns {object|null} The chosen cluster_nodes row, or null when no
   *   active node has free capacity.
   */
  _selectNode(task) {
    let candidates = stmts.listActiveNodes
      .all()
      .filter((n) => n.capacity_used < n.capacity_total);
    if (candidates.length === 0) return null;

    const affinityTags = safeParse(task.affinity_tags, []);
    const affinityRegion = task.affinity_region;

    // Tracked per call so a strategy from an earlier task cannot leak into
    // this task's 'task.assigned' log entry.
    let strategy = 'least-loaded';

    // Strategy 1: Region affinity
    if (affinityRegion) {
      const regionNodes = candidates.filter((n) => n.region === affinityRegion);
      if (regionNodes.length > 0) {
        candidates = regionNodes;
        strategy = 'region';
      }
    }

    // Strategy 2: Tag affinity
    if (affinityTags.length > 0) {
      const tagged = candidates.filter((n) => {
        const nodeTags = safeParse(n.tags, []);
        return affinityTags.some((t) => nodeTags.includes(t));
      });
      if (tagged.length > 0) {
        candidates = tagged;
        strategy = 'affinity';
      }
    }

    // Strategy 3: Least-loaded. capacity_total > capacity_used holds for
    // every candidate, so the ratio is well defined.
    candidates.sort((a, b) => {
      const loadA = a.capacity_used / a.capacity_total;
      const loadB = b.capacity_used / b.capacity_total;
      return loadA - loadB;
    });

    this._lastStrategy = strategy;
    return candidates[0];
  }

  /**
   * Push a task notification to a worker node (best effort).
   *
   * POSTs a JSON description of the task to `/wab-worker/tasks/notify` on the
   * node's endpoint. Any failure — malformed endpoint, unsupported protocol,
   * connection error, timeout — is deliberately ignored: by the time this is
   * called the assignment is already recorded, and this method must not make
   * the caller fail.
   *
   * @param {object} node - cluster_nodes row (reads `endpoint`).
   * @param {string} taskId
   * @param {object} task - cluster_tasks row.
   * @returns {void}
   */
  _notifyWorker(node, taskId, task) {
    try {
      const payload = JSON.stringify({
        type: 'task.assigned',
        taskId,
        taskType: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });

      const url = new URL('/wab-worker/tasks/notify', node.endpoint);
      const mod = url.protocol === 'https:' ? https : http;

      const req = mod.request(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
        timeout: 5000,
      });
      req.on('error', () => { /* best-effort push */ });
      // The `timeout` option only emits 'timeout'; the request has to be
      // destroyed explicitly or the idle socket stays open.
      req.on('timeout', () => req.destroy());
      req.write(payload);
      req.end();
    } catch {
      /* best-effort push: an unusable endpoint must not fail the assignment */
    }
  }

  /**
   * Reassign the in-flight tasks of a dead node.
   *
   * Every task still 'assigned' or 'running' on the node releases one load
   * slot on it. Tasks that have used all their attempts are marked failed;
   * the rest are requeued and one immediate reassignment is attempted.
   *
   * @param {string} nodeId
   * @returns {number} How many tasks were immediately reassigned to a node.
   */
  reassignFromNode(nodeId) {
    const tasks = stmts.getTasksByNode.all(nodeId);
    let reassigned = 0;

    for (const task of tasks) {
      // The slot is freed whether the task is retried or given up on.
      stmts.decrementNodeLoad.run(nodeId);

      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
        logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
        bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
        continue;
      }

      stmts.requeueTask.run(task.id);
      logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });

      // Try to assign to another node
      if (this._tryAssign(task.id)) {
        reassigned++;
        this._stats.reassigned++;
      }
    }

    return reassigned;
  }

  /**
   * Process pending tasks — called periodically.
   *
   * Looks at up to 50 pending tasks (highest priority first) and tries to
   * assign each one.
   *
   * @returns {number} How many tasks were assigned.
   */
  processPending() {
    const pending = stmts.getPendingTasks.all(50);
    let assigned = 0;
    for (const task of pending) {
      if (this._tryAssign(task.id)) assigned++;
    }
    return assigned;
  }

  /**
   * Worker pulls tasks for execution.
   *
   * Hands out at most `min(limit, free capacity)` pending tasks whose region
   * and tag affinity (if any) match the node. Each handed-out task is marked
   * assigned and counted against the node's load.
   *
   * @param {string} nodeId
   * @param {number} [limit=5] - Maximum tasks to return; non-positive or
   *   non-numeric values yield an empty result.
   * @returns {Array<{taskId: string, type: string, objective: string, params: *, priority: number, timeout: number}>}
   */
  pullTasks(nodeId, limit = 5) {
    const node = stmts.getNode.get(nodeId);
    if (!node || node.status !== 'active') return [];

    const available = node.capacity_total - node.capacity_used;
    if (available <= 0) return [];

    // A negative LIMIT means "no limit" in SQLite, so the caller-supplied
    // value is validated before it can reach the query.
    const requested = Math.floor(Number(limit));
    if (!Number.isFinite(requested) || requested <= 0) return [];
    const count = Math.min(requested, available);

    // Scan further than `count` rows: tasks at the head of the queue that this
    // node cannot take must not hide eligible tasks behind them.
    const scanLimit = Math.max(count, 50);
    const pending = stmts.getPendingTasks.all(scanLimit);
    const nodeTags = safeParse(node.tags, []);
    const assigned = [];

    for (const task of pending) {
      if (assigned.length >= count) break;

      // Check affinity
      const affinityRegion = task.affinity_region;
      if (affinityRegion && node.region !== affinityRegion) continue;

      const affinityTags = safeParse(task.affinity_tags, []);
      if (affinityTags.length > 0 && !affinityTags.some((t) => nodeTags.includes(t))) continue;

      stmts.assignTask.run({ id: task.id, node_id: nodeId });
      stmts.incrementNodeLoad.run(nodeId);
      logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });

      assigned.push({
        taskId: task.id,
        type: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });
    }

    return assigned;
  }

  /**
   * @returns {{distributed: number, reassigned: number, noCapacity: number}}
   *   A copy of the counters; mutating it does not affect the distributor.
   */
  getStats() { return { ...this._stats }; }
}
390
-
391
- // ═══════════════════════════════════════════════════════════════════════════
392
- // CLUSTER ORCHESTRATOR
393
- // ═══════════════════════════════════════════════════════════════════════════
394
-
395
/**
 * Fleet management for the cluster: node lifecycle, heartbeat-based health
 * monitoring, failover of tasks from dead nodes, periodic rebalancing, task
 * status reporting and cluster-wide status/event queries.
 *
 * All state lives in the `cluster_*` tables accessed through the prepared
 * statements in `stmts`; task placement is delegated to the distributor
 * passed to the constructor.
 */
class ClusterOrchestrator {
  /**
   * @param {object} distributor Task distributor used for (re)assignment;
   *   must provide `processPending`, `reassignFromNode`, `_tryAssign`,
   *   `_notifyWorker` and `getStats`.
   * @param {object} [options]
   * @param {number} [options.heartbeatThresholdSec=90] Seconds without a
   *   heartbeat after which an active node is considered dead.
   */
  constructor(distributor, options = {}) {
    this._distributor = distributor;
    this._heartbeatThresholdSec = options.heartbeatThresholdSec ?? 90;
    this._checkInterval = null;
    this._rebalanceInterval = null;
    this._started = false;
  }

  // ─── Lifecycle ──────────────────────────────────────────────────────

  /**
   * Start the orchestrator: a 30 s timer runs health check, stuck-task
   * recovery and pending-task processing; a 5 min timer runs rebalancing.
   * Calling it while already started is a no-op.
   */
  start() {
    if (this._started) return;
    this._started = true;

    this._checkInterval = setInterval(() => {
      // Each step is isolated so a failure in one does not skip the others
      // or escape the timer callback as an uncaught exception.
      this._runGuarded('healthCheck', () => this._healthCheck());
      this._runGuarded('recoverStuckTasks', () => this._recoverStuckTasks());
      this._runGuarded('processPending', () => this._distributor.processPending());
    }, 30_000);
    // unref so the timer alone does not keep the process alive.
    if (this._checkInterval.unref) this._checkInterval.unref();

    this._rebalanceInterval = setInterval(() => {
      this._runGuarded('rebalance', () => this._rebalance());
    }, 300_000);
    if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();

    bus.emit('cluster.started', { timestamp: Date.now() });
  }

  /**
   * Stop the orchestrator and clear both timers. No-op when not started.
   */
  stop() {
    if (!this._started) return;
    this._started = false;
    if (this._checkInterval) clearInterval(this._checkInterval);
    if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
    this._checkInterval = null;
    this._rebalanceInterval = null;
    bus.emit('cluster.stopped', { timestamp: Date.now() });
  }

  /**
   * Run one periodic step, reporting (not propagating) any exception as a
   * `cluster.error` bus event.
   */
  _runGuarded(step, fn) {
    try {
      fn();
    } catch (err) {
      try {
        bus.emit('cluster.error', { step, error: err instanceof Error ? err.message : String(err) });
      } catch { /* reporting is best-effort */ }
    }
  }

  // ─── Node Management ───────────────────────────────────────────────

  /**
   * Register a worker node, or re-activate the node already known under the
   * same endpoint.
   *
   * @param {object} config Requires `name` and `endpoint`; optional `region`,
   *   `zone`, `role`, `capacity`, `tags`, `hardware`, `version`, `secret`.
   * @returns {{nodeId: string, status: string, rejoined?: boolean, secret?: string}}
   *   For a new node without a caller-supplied secret, `secret` carries the
   *   generated secret (only its SHA-256 hash is stored).
   * @throws {Error} When `name` or `endpoint` is missing.
   */
  registerNode(config) {
    if (!config || !config.name || !config.endpoint) {
      throw new Error('Node name and endpoint required');
    }

    const existing = stmts.getNodeByEndpoint.get(config.endpoint);
    if (existing) {
      stmts.updateNode.run({
        id: existing.id,
        name: config.name,
        endpoint: config.endpoint,
        region: config.region || existing.region,
        zone: config.zone || existing.zone,
        capacity_total: config.capacity ?? existing.capacity_total,
        tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
        hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
        version: config.version || existing.version,
      });
      // Refresh last_heartbeat as well: with a stale timestamp the next
      // health check would immediately mark the rejoined node dead again.
      stmts.heartbeatNode.run({ id: existing.id, capacity_used: existing.capacity_used });
      stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
      logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
      bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
      return { nodeId: existing.id, status: 'active', rejoined: true };
    }

    const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
    // Generate a secret only when the caller did not bring one, and keep it
    // so it can be returned; otherwise nobody would ever learn it.
    const generatedSecret = config.secret ? null : crypto.randomBytes(32).toString('hex');
    const secretHash = crypto.createHash('sha256')
      .update(config.secret || generatedSecret)
      .digest('hex');

    stmts.insertNode.run({
      id: nodeId,
      name: config.name,
      endpoint: config.endpoint,
      region: config.region || 'default',
      zone: config.zone || 'a',
      role: config.role || 'worker',
      status: 'active',
      capacity_total: config.capacity ?? 20,
      tags: JSON.stringify(config.tags || []),
      hardware: JSON.stringify(config.hardware || {}),
      version: config.version || null,
      secret_hash: secretHash,
    });

    logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
    bus.emit('cluster.node.joined', { nodeId, name: config.name });

    return { nodeId, status: 'active', secret: generatedSecret ?? undefined };
  }

  /**
   * Remove a node from the cluster after moving its tasks elsewhere.
   *
   * @returns {{nodeId: string, reassigned: number}|null} null when unknown.
   */
  deregisterNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    // Take the node out of the active set first; otherwise the distributor
    // could pick this very node again while reassigning its tasks.
    stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
    const reassigned = this._distributor.reassignFromNode(nodeId);
    stmts.deleteNode.run(nodeId);

    logEvent('node.deregistered', nodeId, null, { reassigned });
    bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });

    return { nodeId, reassigned };
  }

  /**
   * Process a heartbeat from a worker node.
   *
   * @param {string} nodeId
   * @param {object} [data] Optional `capacityUsed`, `capacityTotal`, `tags`,
   *   `hardware`, `version` reported by the worker.
   * @returns {{nodeId: string, status: string, pendingTasks: boolean}|null}
   *   null when the node is unknown. `status` is the node's status after the
   *   heartbeat.
   */
  heartbeat(nodeId, data = {}) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    const previousStatus = node.status;
    stmts.heartbeatNode.run({
      id: nodeId,
      capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
    });

    // The heartbeat statement forces status='active'. That is right for a
    // node coming back from 'dead', but an operator-set drain/cordon must
    // survive heartbeats, so restore it.
    const keepStatus = previousStatus === 'draining' || previousStatus === 'cordoned';
    if (keepStatus) {
      stmts.setNodeStatus.run({ id: nodeId, status: previousStatus });
    }

    // Apply any profile change the worker reported, not only hardware.
    if (data.hardware || data.tags || data.version || data.capacityTotal) {
      stmts.updateNode.run({
        id: nodeId,
        name: node.name,
        endpoint: node.endpoint,
        region: node.region,
        zone: node.zone,
        capacity_total: data.capacityTotal || node.capacity_total,
        tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
        hardware: JSON.stringify(data.hardware || safeParse(node.hardware, {})),
        version: data.version || node.version,
      });
    }

    return {
      nodeId,
      status: keepStatus ? previousStatus : 'active',
      pendingTasks: stmts.getPendingTasks.all(1).length > 0,
    };
  }

  /**
   * Drain a node: mark it `draining` so it leaves the active set while its
   * current tasks finish.
   *
   * @returns {{nodeId: string, status: string, activeTasks: number}|null}
   */
  drainNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
    logEvent('node.draining', nodeId, null, {});
    bus.emit('cluster.node.draining', { nodeId, name: node.name });

    return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
  }

  /**
   * Cordon a node: mark it `cordoned` so it leaves the active set; running
   * tasks are left untouched.
   *
   * @returns {{nodeId: string, status: string}|null}
   */
  cordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
    logEvent('node.cordoned', nodeId, null, {});
    bus.emit('cluster.node.cordoned', { nodeId, name: node.name });

    return { nodeId, status: 'cordoned' };
  }

  /**
   * Uncordon a node: mark it `active` again.
   *
   * @returns {{nodeId: string, status: string}|null}
   */
  uncordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
    logEvent('node.uncordoned', nodeId, null, {});
    // Mirror cordonNode so bus subscribers see both transitions.
    bus.emit('cluster.node.uncordoned', { nodeId, name: node.name });

    return { nodeId, status: 'active' };
  }

  /**
   * Get one node with `tags`/`hardware` parsed and an `activeTasks` count.
   *
   * @returns {object|null}
   */
  getNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;
    node.tags = safeParse(node.tags, []);
    node.hardware = safeParse(node.hardware, {});
    node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
    return node;
  }

  /**
   * List nodes with `tags`/`hardware` parsed.
   *
   * @param {object} [filter] `region` selects the by-region statement,
   *   otherwise `active` selects active nodes, otherwise all nodes.
   * @returns {object[]}
   */
  listNodes(filter = {}) {
    let rows;
    if (filter.region) {
      rows = stmts.listNodesByRegion.all(filter.region);
    } else if (filter.active) {
      rows = stmts.listActiveNodes.all();
    } else {
      rows = stmts.listNodes.all();
    }
    return rows.map(n => ({
      ...n,
      tags: safeParse(n.tags, []),
      hardware: safeParse(n.hardware, {}),
    }));
  }

  // ─── Task Reporting ─────────────────────────────────────────────────

  /** True once a task has reached a final state and must not change again. */
  static _isTerminal(task) {
    return task.status === 'completed' || task.status === 'failed';
  }

  /**
   * Turn whatever a worker reported as an error into a storable string.
   * Strings pass through, Errors contribute their message, other values are
   * JSON-encoded, and a missing error becomes a fixed placeholder.
   */
  static _errorText(error) {
    if (typeof error === 'string') return error;
    if (error instanceof Error) return error.message || error.name;
    if (error == null) return 'Unknown error';
    try {
      const encoded = JSON.stringify(error);
      return encoded === undefined ? String(error) : encoded;
    } catch {
      return String(error);
    }
  }

  /**
   * Worker reports that a task started running.
   *
   * @returns {{taskId: string, status: string}|null} null when unknown; for
   *   an already finished task the stored status is returned unchanged.
   */
  reportTaskStarted(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    if (ClusterOrchestrator._isTerminal(task)) return { taskId, status: task.status };

    stmts.startTask.run(taskId);
    logEvent('task.started', task.node_id, taskId, {});
    bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
    return { taskId, status: 'running' };
  }

  /**
   * Worker reports that a task completed.
   *
   * @param {string} taskId
   * @param {*} [result] Stored JSON-encoded (`{}` when falsy).
   * @returns {{taskId: string, status: string}|null} null when unknown; a
   *   duplicate report for a finished task changes nothing.
   */
  reportTaskCompleted(taskId, result) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    // A repeated report would otherwise decrement the node load twice.
    if (ClusterOrchestrator._isTerminal(task)) return { taskId, status: task.status };

    stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
    bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });

    return { taskId, status: 'completed' };
  }

  /**
   * Worker reports that a task failed. The task is requeued while attempts
   * remain, otherwise it is marked failed.
   *
   * @param {string} taskId
   * @param {*} [error] String, Error or any JSON-encodable value.
   * @returns {{taskId: string, status: string, attempt?: number}|null}
   */
  reportTaskFailed(taskId, error) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    // Never requeue or re-fail a task that already reached a final state.
    if (ClusterOrchestrator._isTerminal(task)) return { taskId, status: task.status };

    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    if (task.attempts < task.max_attempts) {
      stmts.requeueTask.run(taskId);
      logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
      bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });

      // Immediate retry; when no node is free the periodic pass picks it up.
      this._distributor._tryAssign(taskId);

      return { taskId, status: 'retrying', attempt: task.attempts };
    }

    stmts.failTask.run({ id: taskId, error: ClusterOrchestrator._errorText(error) });
    logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
    bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });

    return { taskId, status: 'failed' };
  }

  /**
   * Get one task with `payload`, `affinity_tags` and `result` parsed.
   *
   * @returns {object|null}
   */
  getTask(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    task.payload = safeParse(task.payload, {});
    task.affinity_tags = safeParse(task.affinity_tags, []);
    task.result = safeParse(task.result, null);
    return task;
  }

  /**
   * List tasks with JSON columns parsed.
   *
   * @param {object} [filter] `status` (with optional `limit`, default 50)
   *   takes precedence over `nodeId`; with neither, the most recent `limit`
   *   tasks are returned.
   * @returns {object[]}
   */
  listTasks(filter = {}) {
    let rows;
    if (filter.status) {
      rows = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
    } else if (filter.nodeId) {
      rows = stmts.getTasksByNode.all(filter.nodeId);
    } else {
      rows = stmts.listTasks.all(filter.limit || 50);
    }
    return rows.map(t => ({
      ...t,
      payload: safeParse(t.payload, {}),
      affinity_tags: safeParse(t.affinity_tags, []),
      result: safeParse(t.result, null),
    }));
  }

  // ─── Cluster Topology ───────────────────────────────────────────────

  /**
   * Summarise the cluster: node counts by status, capacity of active nodes,
   * task counts by status, a per-region breakdown and distributor stats.
   *
   * @returns {object}
   */
  getClusterStatus() {
    const nodes = stmts.listNodes.all();
    const taskCounts = {};
    for (const row of stmts.countByStatus.all()) {
      taskCounts[row.status] = row.count;
    }

    const activeNodes = nodes.filter(n => n.status === 'active');
    const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
    const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);
    const countWithStatus = status => nodes.filter(n => n.status === status).length;

    // Per-region totals; capacity figures only count active nodes.
    const regions = {};
    for (const node of nodes) {
      if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
      const bucket = regions[node.region];
      bucket.nodes++;
      if (node.status === 'active') {
        bucket.active++;
        bucket.capacity += node.capacity_total;
        bucket.used += node.capacity_used;
      }
    }

    return {
      coordinator: { started: this._started },
      nodes: {
        total: nodes.length,
        active: activeNodes.length,
        draining: countWithStatus('draining'),
        cordoned: countWithStatus('cordoned'),
        dead: countWithStatus('dead'),
      },
      capacity: {
        total: totalCapacity,
        used: usedCapacity,
        available: totalCapacity - usedCapacity,
        utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
      },
      tasks: taskCounts,
      regions,
      distributor: this._distributor.getStats(),
    };
  }

  /**
   * Read the cluster event log, newest first, with `data` parsed.
   *
   * @param {number} [limit=100]
   * @param {string|null} [nodeId=null] Restrict to one node when given.
   * @returns {object[]}
   */
  getEvents(limit = 100, nodeId = null) {
    const rows = nodeId
      ? stmts.getEventsByNode.all(nodeId, limit)
      : stmts.getEvents.all(limit);
    return rows.map(e => ({
      ...e,
      data: safeParse(e.data, {}),
    }));
  }

  // ─── Internal Operations ────────────────────────────────────────────

  /**
   * Mark nodes whose heartbeat is older than the threshold as dead and fail
   * over their tasks through the distributor.
   */
  _healthCheck() {
    const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);

    for (const node of staleNodes) {
      // Status changes first so the dead node is no longer a candidate when
      // its tasks are reassigned.
      stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
      logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
      bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });

      const reassigned = this._distributor.reassignFromNode(node.id);
      logEvent('node.failover', node.id, null, { reassigned });
      bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
    }
  }

  /**
   * Recover tasks that have been assigned/running for more than 5 minutes:
   * fail them when attempts are exhausted, otherwise requeue and retry.
   */
  _recoverStuckTasks() {
    const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold

    for (const task of stuckTasks) {
      if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
        logEvent('task.stuck_failed', task.node_id, task.id, {});
      } else {
        stmts.requeueTask.run(task.id);
        this._distributor._tryAssign(task.id);
        logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
      }
    }
  }

  /**
   * Move up to two not-yet-started tasks from each overloaded node (load
   * above 1.5x average and more than 2 tasks) to an underloaded node (load
   * below 0.5x average) that still has free capacity.
   */
  _rebalance() {
    // Zero-capacity nodes cannot take work and would turn the load ratio
    // into NaN/Infinity, so leave them out of the calculation.
    const nodes = stmts.listActiveNodes.all().filter(n => n.capacity_total > 0);
    if (nodes.length < 2) return;

    const loadOf = n => n.capacity_used / n.capacity_total;
    const avgLoad = nodes.reduce((s, n) => s + loadOf(n), 0) / nodes.length;
    const overloaded = nodes.filter(n => loadOf(n) > avgLoad * 1.5 && n.capacity_used > 2);
    const underloaded = nodes.filter(n => loadOf(n) < avgLoad * 0.5);

    if (overloaded.length === 0 || underloaded.length === 0) return;

    let moved = 0;
    for (const over of overloaded) {
      const tasks = stmts.getTasksByNode.all(over.id);
      // Only tasks that have not started yet are safe to move.
      const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);

      for (const task of toMove) {
        const target = underloaded.find(n => n.capacity_used < n.capacity_total);
        if (!target) break;

        stmts.decrementNodeLoad.run(over.id);
        // NOTE(review): the assignTask statement also increments the task's
        // attempt counter, so a rebalance move consumes one retry — confirm
        // whether that is intended.
        stmts.assignTask.run({ id: task.id, node_id: target.id });
        stmts.incrementNodeLoad.run(target.id);
        target.capacity_used++; // keep the local snapshot in step with the DB
        moved++;

        logEvent('task.rebalanced', target.id, task.id, { from: over.id });
        this._distributor._notifyWorker(target, task.id, task);
      }
    }

    if (moved > 0) {
      bus.emit('cluster.rebalanced', { tasksMoved: moved });
    }
  }
}
867
-
868
- // ═══════════════════════════════════════════════════════════════════════════
869
- // HELPERS
870
- // ═══════════════════════════════════════════════════════════════════════════
871
-
872
/**
 * Parse a JSON-encoded column value, returning `fallback` whenever the value
 * cannot be used.
 *
 * @param {*} str Raw value: a JSON string, an already-parsed object, or
 *   null/undefined.
 * @param {*} fallback Value returned when `str` is missing, is not valid
 *   JSON, parses to `null`, or parses to a non-array while `fallback` is an
 *   array.
 * @returns {*} The parsed value, `str` itself when it is already an object,
 *   or `fallback`.
 */
function safeParse(str, fallback) {
  if (str == null) return fallback;
  if (typeof str === 'object') return str;

  let parsed;
  try {
    parsed = JSON.parse(str);
  } catch {
    return fallback;
  }

  // 'null' is valid JSON, but callers immediately use `.length`/`.includes`
  // on the result, so treat it like a missing value.
  if (parsed == null) return fallback;
  // Same for a non-array where the caller asked for a list.
  if (Array.isArray(fallback) && !Array.isArray(parsed)) return fallback;
  return parsed;
}
877
-
878
/**
 * Append a row to the cluster event log. Logging is best-effort: any failure
 * (serialisation or database) is swallowed so it can never break the
 * operation being logged.
 *
 * @param {string} type Event type, e.g. 'task.assigned'.
 * @param {string|null} nodeId Related node, stored as NULL when falsy.
 * @param {string|null} taskId Related task, stored as NULL when falsy.
 * @param {object} [data] Extra details, stored JSON-encoded (`{}` when falsy).
 */
function logEvent(type, nodeId, taskId, data) {
  // Error objects have no enumerable fields and would be stored as `{}`;
  // keep at least their name and message.
  const keepErrors = (key, value) => (
    value instanceof Error ? { name: value.name, message: value.message } : value
  );

  try {
    stmts.insertEvent.run({
      event_type: type,
      node_id: nodeId || null,
      task_id: taskId || null,
      data: JSON.stringify(data || {}, keepErrors),
    });
  } catch { /* best-effort logging */ }
}
888
-
889
- // ─── Singleton ───────────────────────────────────────────────────────
890
-
891
- const distributor = new TaskDistributor();
892
- const cluster = new ClusterOrchestrator(distributor);
893
-
894
- module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };
1
+ 'use strict';
2
+
3
+ /**
4
+ * WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
5
+ *
6
+ * Turns WAB from a single-server Agent OS into a distributed fleet.
7
+ *
8
+ * Architecture:
9
+ * ┌──────────────┐ ┌──────────┐ ┌──────────┐
10
+ * │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
11
+ * │ (this node) │────▶│ (remote) │ │ (remote) │
12
+ * │ │────▶│ │ │ │
13
+ * └──────────────┘ └──────────┘ └──────────┘
14
+ * │ ▲ ▲
15
+ * │ │ │
16
+ * └───────────────────┴─────────────────┘
17
+ * heartbeat / task results
18
+ *
19
+ * Components:
20
+ * 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
21
+ * 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
22
+ * 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
23
+ *
24
+ * Communication: HTTP/JSON between nodes (pull-based + push notifications)
25
+ * Persistence: SQLite tables for durability across restarts
26
+ * Consistency: Leader-based (coordinator is source of truth)
27
+ */
28
+
29
+ const crypto = require('crypto');
30
+ const http = require('http');
31
+ const https = require('https');
32
+ const { URL } = require('url');
33
+ const { db } = require('../models/db');
34
+ const { bus } = require('../runtime/event-bus');
35
+
36
+ // ─── Schema ──────────────────────────────────────────────────────────
37
+
38
// Create the cluster tables and their indexes when the module loads. Every
// statement uses IF NOT EXISTS, so running this against an existing database
// is a no-op.
//   cluster_nodes  — one row per registered node (capacity, status, heartbeat)
//   cluster_tasks  — one row per submitted task and its lifecycle timestamps
//   cluster_events — append-only event log
db.exec(`
  CREATE TABLE IF NOT EXISTS cluster_nodes (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    endpoint TEXT NOT NULL,
    region TEXT DEFAULT 'default',
    zone TEXT DEFAULT 'a',
    role TEXT DEFAULT 'worker',
    status TEXT DEFAULT 'joining',
    capacity_total INTEGER DEFAULT 20,
    capacity_used INTEGER DEFAULT 0,
    tags TEXT DEFAULT '[]',
    hardware TEXT DEFAULT '{}',
    version TEXT,
    secret_hash TEXT,
    last_heartbeat TEXT DEFAULT (datetime('now')),
    registered_at TEXT DEFAULT (datetime('now')),
    updated_at TEXT DEFAULT (datetime('now'))
  );

  CREATE TABLE IF NOT EXISTS cluster_tasks (
    id TEXT PRIMARY KEY,
    external_id TEXT,
    node_id TEXT,
    task_type TEXT NOT NULL,
    objective TEXT,
    payload TEXT DEFAULT '{}',
    priority INTEGER DEFAULT 50,
    status TEXT DEFAULT 'pending',
    result TEXT,
    error TEXT,
    attempts INTEGER DEFAULT 0,
    max_attempts INTEGER DEFAULT 3,
    affinity_tags TEXT DEFAULT '[]',
    affinity_region TEXT,
    timeout_ms INTEGER DEFAULT 60000,
    submitted_at TEXT DEFAULT (datetime('now')),
    assigned_at TEXT,
    started_at TEXT,
    completed_at TEXT
  );

  CREATE TABLE IF NOT EXISTS cluster_events (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    event_type TEXT NOT NULL,
    node_id TEXT,
    task_id TEXT,
    data TEXT DEFAULT '{}',
    created_at TEXT DEFAULT (datetime('now'))
  );

  CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
  CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
  CREATE INDEX IF NOT EXISTS idx_cluster_nodes_endpoint ON cluster_nodes(endpoint);
  CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
  CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
  CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
  CREATE INDEX IF NOT EXISTS idx_cluster_tasks_external ON cluster_tasks(external_id);
  CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
  CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
`);
97
+
98
+ // ─── Prepared Statements ─────────────────────────────────────────────
99
+
100
// Prepared statements for every cluster query, compiled once at module load.
// Statements written with @name placeholders take a single object argument;
// those written with ? take positional arguments in the order shown.
const stmts = {
  // Nodes
  insertNode: db.prepare(`
    INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
    VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
  `),
  updateNode: db.prepare(`
    UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
    capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
    WHERE id=@id
  `),
  setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
  // A heartbeat brings a node back to 'active' (e.g. from 'dead' or
  // 'joining'), but must not undo an operator-set 'draining'/'cordoned'
  // state, so those two statuses are left as they are.
  heartbeatNode: db.prepare(`
    UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used,
    status=CASE WHEN status IN ('draining','cordoned') THEN status ELSE 'active' END,
    updated_at=datetime('now')
    WHERE id=@id
  `),
  getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
  getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
  listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
  // Active nodes, least-used first.
  listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
  listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
  deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
  // Parameter: age threshold in seconds.
  getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),

  // Tasks
  insertTask: db.prepare(`
    INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
    VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
  `),
  // Every assignment counts as one attempt.
  assignTask: db.prepare(`
    UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
    WHERE id=@id
  `),
  startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
  completeTask: db.prepare(`
    UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
  `),
  failTask: db.prepare(`
    UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
  `),
  requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
  getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
  getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
  // Highest priority first, oldest first within a priority.
  getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
  // Only tasks still occupying the node (assigned or running).
  getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
  getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
  listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
  // Parameter: age threshold in seconds, measured from assigned_at.
  getStuckTasks: db.prepare(`
    SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
    AND assigned_at < datetime('now', '-' || ? || ' seconds')
  `),
  countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
  incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
  // Clamped so the counter never drops below zero.
  decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),

  // Events
  insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
  getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
  getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
};
160
+
161
+ // ═══════════════════════════════════════════════════════════════════════════
162
+ // TASK DISTRIBUTOR
163
+ // ═══════════════════════════════════════════════════════════════════════════
164
+
165
/**
 * Routes tasks to worker nodes based on capacity, affinity and load.
 *
 * Node selection (see `_selectNode`) narrows the active nodes that still have
 * free capacity in three steps:
 * - region:       prefer nodes in the task's `affinity_region`, if any match
 * - affinity:     prefer nodes sharing at least one of the task's tags
 * - least-loaded: among what is left, pick the lowest used/total ratio
 *
 * Workers can also pull work themselves through `pullTasks`.
 * `_roundRobinIndex` is initialised but not read by any method of this class.
 */
class TaskDistributor {
  constructor() {
    this._roundRobinIndex = 0;
    // Label of the strategy that decided the most recent `_selectNode` call.
    this._lastStrategy = null;
    this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
  }

  /**
   * Submit a task for distributed execution and try to place it at once.
   *
   * @param {object} task Optional fields: `id`, `externalId`, `type`,
   *   `objective`, `params`, `priority`, `affinityTags`, `affinityRegion`,
   *   `timeout`, `maxAttempts`.
   * @returns {{taskId: string, status: string}} The task id; `status` is
   *   always reported as 'pending', even when the immediate assignment
   *   succeeded.
   */
  submit(task) {
    const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
    const entry = {
      id,
      external_id: task.externalId || null,
      task_type: task.type || 'general',
      objective: task.objective || '',
      payload: JSON.stringify(task.params || {}),
      // `??` rather than `||`: an explicit priority of 0 must stay 0.
      priority: task.priority ?? 50,
      status: 'pending',
      affinity_tags: JSON.stringify(task.affinityTags || []),
      affinity_region: task.affinityRegion || null,
      timeout_ms: task.timeout || 60000,
      max_attempts: task.maxAttempts || 3,
    };
    stmts.insertTask.run(entry);

    bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
    this._stats.distributed++;

    this._tryAssign(id);

    return { taskId: id, status: 'pending' };
  }

  /**
   * Try to assign one pending task to a worker node.
   *
   * @param {string} taskId
   * @returns {boolean} true when the task was assigned; false when it does
   *   not exist, is not pending, or no node has free capacity.
   */
  _tryAssign(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task || task.status !== 'pending') return false;

    const node = this._selectNode(task);
    if (!node) {
      this._stats.noCapacity++;
      return false;
    }

    stmts.assignTask.run({ id: taskId, node_id: node.id });
    stmts.incrementNodeLoad.run(node.id);

    logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
    bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });

    // Push notification to the worker (fire-and-forget).
    this._notifyWorker(node, taskId, task);

    return true;
  }

  /**
   * Select the best node for a task and record the deciding strategy in
   * `_lastStrategy`.
   *
   * Region and tag affinity are preferences: when no candidate matches, the
   * unfiltered candidate list is kept.
   *
   * @param {object} task Task row (`affinity_tags`, `affinity_region`).
   * @returns {object|null} Chosen node row, or null when no active node has
   *   free capacity.
   */
  _selectNode(task) {
    // Reset first; otherwise the label of an earlier call would leak into
    // this one and be logged for an unrelated assignment.
    this._lastStrategy = null;

    let candidates = stmts.listActiveNodes.all().filter(n => n.capacity_used < n.capacity_total);
    if (candidates.length === 0) return null;

    let strategy = 'least-loaded';

    const affinityRegion = task.affinity_region;
    if (affinityRegion) {
      const inRegion = candidates.filter(n => n.region === affinityRegion);
      if (inRegion.length > 0) {
        candidates = inRegion;
        strategy = 'region';
      }
    }

    const affinityTags = safeParse(task.affinity_tags, []);
    if (affinityTags.length > 0) {
      const tagged = candidates.filter(n => {
        const nodeTags = safeParse(n.tags, []);
        return affinityTags.some(t => nodeTags.includes(t));
      });
      if (tagged.length > 0) {
        candidates = tagged;
        strategy = 'affinity';
      }
    }

    // Lowest relative load wins. capacity_total is > 0 here because every
    // candidate passed the free-capacity filter above.
    candidates.sort((a, b) => (a.capacity_used / a.capacity_total) - (b.capacity_used / b.capacity_total));

    this._lastStrategy = strategy;
    return candidates[0];
  }

  /**
   * Push a task notification to a worker node via
   * `POST <endpoint>/wab-worker/tasks/notify`. Strictly best-effort: no
   * failure (bad endpoint, connection error, timeout) reaches the caller.
   *
   * @param {object} node Node row; `endpoint` is the base URL.
   * @param {string} taskId
   * @param {object} task Task row.
   */
  _notifyWorker(node, taskId, task) {
    try {
      const url = new URL('/wab-worker/tasks/notify', node.endpoint);
      const transport = url.protocol === 'https:' ? https : http;
      const payload = JSON.stringify({
        type: 'task.assigned',
        taskId,
        taskType: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });

      const req = transport.request(url, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
        timeout: 5000,
      }, (res) => {
        // The body is not needed, but it must be drained so the socket is released.
        res.resume();
      });
      // The `timeout` option only emits an event; the request has to be
      // torn down explicitly.
      req.on('timeout', () => req.destroy());
      req.on('error', () => { /* best-effort push */ });
      req.end(payload);
    } catch {
      /* best-effort push: an unusable endpoint must not break assignment */
    }
  }

  /**
   * Move the assigned/running tasks of a node elsewhere, e.g. after the node
   * died. Tasks without attempts left are marked failed; the rest are
   * requeued and, where possible, assigned immediately.
   *
   * @param {string} nodeId
   * @returns {number} Number of tasks that were assigned to another node
   *   right away (requeued tasks that found no node are not counted).
   */
  reassignFromNode(nodeId) {
    let reassigned = 0;

    for (const task of stmts.getTasksByNode.all(nodeId)) {
      // The task leaves this node on either path, so release its slot first.
      stmts.decrementNodeLoad.run(nodeId);

      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
        logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
        bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
        continue;
      }

      stmts.requeueTask.run(task.id);
      logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });

      if (this._tryAssign(task.id)) {
        reassigned++;
        this._stats.reassigned++;
      }
    }

    return reassigned;
  }

  /**
   * Walk the pending queue once and assign what can be placed. Intended to
   * be called periodically.
   *
   * @param {number} [limit=50] Maximum number of pending tasks to examine.
   * @returns {number} How many tasks were assigned.
   */
  processPending(limit = 50) {
    let assigned = 0;
    for (const task of stmts.getPendingTasks.all(limit)) {
      if (this._tryAssign(task.id)) assigned++;
    }
    return assigned;
  }

  /**
   * Let a worker pull tasks for execution. Unlike `_selectNode`, affinity is
   * a hard requirement here: a task with a region or tags is only handed to
   * a matching node.
   *
   * @param {string} nodeId
   * @param {number} [limit=5] Maximum number of tasks the worker wants.
   * @returns {Array<object>} Tasks now assigned to the node; empty when the
   *   node is unknown, not active, full, or nothing eligible was found.
   */
  pullTasks(nodeId, limit = 5) {
    const node = stmts.getNode.get(nodeId);
    if (!node || node.status !== 'active') return [];

    const wanted = Math.min(limit, node.capacity_total - node.capacity_used);
    // Also rejects NaN and negative limits; a negative value would otherwise
    // reach SQL LIMIT, where it means "no limit".
    if (!(wanted > 0)) return [];

    // Look past the first `wanted` rows: the head of the queue may be pinned
    // to other regions/tags while eligible tasks wait behind it.
    const scanWindow = Math.max(wanted * 5, 50);
    const nodeTags = safeParse(node.tags, []);
    const pulled = [];

    for (const task of stmts.getPendingTasks.all(scanWindow)) {
      if (pulled.length >= wanted) break;

      if (task.affinity_region && task.affinity_region !== node.region) continue;

      const requiredTags = safeParse(task.affinity_tags, []);
      if (requiredTags.length > 0 && !requiredTags.some(t => nodeTags.includes(t))) continue;

      stmts.assignTask.run({ id: task.id, node_id: nodeId });
      stmts.incrementNodeLoad.run(nodeId);
      logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });

      pulled.push({
        taskId: task.id,
        type: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });
    }

    return pulled;
  }

  /** @returns {object} A copy of the distributor counters. */
  getStats() { return { ...this._stats }; }
}
390
+
391
+ // ═══════════════════════════════════════════════════════════════════════════
392
+ // CLUSTER ORCHESTRATOR
393
+ // ═══════════════════════════════════════════════════════════════════════════
394
+
395
/**
 * Fleet management — node lifecycle, health monitoring, failover and rebalancing.
 *
 * Responsibilities:
 *   - Node registration (a SHA-256 hash of the node secret is stored)
 *   - Health monitoring via heartbeats
 *   - Dead node detection and task failover
 *   - Load rebalancing across the cluster
 *   - Cluster topology and status reporting
 *   - Drain and cordon operations
 *
 * Persistence goes through the `stmts` statements and notifications through
 * the `bus` emitter, both defined outside this class.
 */
class ClusterOrchestrator {
  /**
   * @param {object} distributor - Task distributor. This class calls its
   *   `processPending()`, `reassignFromNode(nodeId)`, `getStats()`,
   *   `_tryAssign(taskId)` and `_notifyWorker(node, taskId, task)`.
   * @param {object} [options] - Optional tuning knobs.
   * @param {number} [options.heartbeatThresholdSec=90] - Value passed to the
   *   stale-node query; a node is considered dead after this long without a heartbeat.
   * @param {number} [options.stuckThresholdSec=300] - Value passed to the stuck-task query.
   * @param {number} [options.checkIntervalMs=30000] - Period of the health/recovery tick.
   * @param {number} [options.rebalanceIntervalMs=300000] - Period of the rebalance tick.
   */
  constructor(distributor, options = {}) {
    const opts = options || {};
    this._distributor = distributor;
    this._heartbeatThresholdSec = opts.heartbeatThresholdSec ?? 90;
    this._stuckThresholdSec = opts.stuckThresholdSec ?? 300;
    this._checkIntervalMs = opts.checkIntervalMs ?? 30_000;
    this._rebalanceIntervalMs = opts.rebalanceIntervalMs ?? 300_000;
    this._checkInterval = null;
    this._rebalanceInterval = null;
    this._started = false;
  }

  // ─── Lifecycle ──────────────────────────────────────────────────────

  /**
   * Start the orchestrator — begins periodic health checks, stuck-task
   * recovery, pending-task processing and rebalancing. Calling it again while
   * started is a no-op.
   */
  start() {
    if (this._started) return;
    this._started = true;

    this._checkInterval = setInterval(() => {
      this._healthCheck();
      this._recoverStuckTasks();
      this._distributor.processPending();
    }, this._checkIntervalMs);
    // unref so these timers alone do not keep the process alive.
    if (this._checkInterval.unref) this._checkInterval.unref();

    this._rebalanceInterval = setInterval(() => {
      this._rebalance();
    }, this._rebalanceIntervalMs);
    if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();

    bus.emit('cluster.started', { timestamp: Date.now() });
  }

  /**
   * Stop the orchestrator and release both timers. No-op when not started.
   */
  stop() {
    if (!this._started) return;
    this._started = false;

    if (this._checkInterval) clearInterval(this._checkInterval);
    if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
    // Drop the stale handles so a later start() begins from a clean state.
    this._checkInterval = null;
    this._rebalanceInterval = null;

    bus.emit('cluster.stopped', { timestamp: Date.now() });
  }

  // ─── Node Management ───────────────────────────────────────────────

  /**
   * Register a worker node to join the cluster.
   *
   * If a node with the same endpoint already exists it is updated and set
   * back to `active` instead of creating a second record.
   *
   * @param {object} config - Node description.
   * @param {string} config.name - Required.
   * @param {string} config.endpoint - Required; identifies an existing node on re-registration.
   * @param {string} [config.region='default']
   * @param {string} [config.zone='a']
   * @param {string} [config.role='worker']
   * @param {number} [config.capacity=20]
   * @param {Array} [config.tags]
   * @param {object} [config.hardware]
   * @param {string} [config.version]
   * @param {string} [config.secret] - Node secret; when omitted one is generated.
   * @returns {{nodeId: string, status: string, rejoined?: boolean, secret?: string}}
   *   `secret` is only set when it was generated here — it is the caller's one
   *   chance to learn it, because only its hash is stored.
   * @throws {Error} When `name` or `endpoint` is missing.
   */
  registerNode(config) {
    if (!config || !config.name || !config.endpoint) {
      throw new Error('Node name and endpoint required');
    }

    const existing = stmts.getNodeByEndpoint.get(config.endpoint);
    if (existing) {
      // NOTE(review): re-registration is not checked against the stored
      // secret hash, so anyone who knows the endpoint can reactivate and
      // reconfigure the node — confirm whether callers authenticate earlier.
      stmts.updateNode.run({
        id: existing.id,
        name: config.name,
        endpoint: config.endpoint,
        region: config.region || existing.region,
        zone: config.zone || existing.zone,
        capacity_total: config.capacity || existing.capacity_total,
        tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
        hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
        version: config.version || existing.version,
      });
      stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
      logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
      bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
      return { nodeId: existing.id, status: 'active', rejoined: true };
    }

    const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
    // Keep hold of a generated secret: previously it was hashed and thrown
    // away, leaving the node with credentials nobody knew.
    const generatedSecret = config.secret ? undefined : crypto.randomBytes(32).toString('hex');
    const secretHash = crypto.createHash('sha256')
      .update(config.secret || generatedSecret)
      .digest('hex');

    stmts.insertNode.run({
      id: nodeId,
      name: config.name,
      endpoint: config.endpoint,
      region: config.region || 'default',
      zone: config.zone || 'a',
      role: config.role || 'worker',
      status: 'active',
      capacity_total: config.capacity || 20,
      tags: JSON.stringify(config.tags || []),
      hardware: JSON.stringify(config.hardware || {}),
      version: config.version || null,
      secret_hash: secretHash,
    });

    logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
    bus.emit('cluster.node.joined', { nodeId, name: config.name });

    return { nodeId, status: 'active', secret: generatedSecret };
  }

  /**
   * Remove a node from the cluster, handing its tasks back to the distributor first.
   *
   * @param {string} nodeId
   * @returns {{nodeId: string, reassigned: number}|null} `null` when the node is unknown.
   */
  deregisterNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    // Reassign before deleting so the tasks are not left pointing at a missing node.
    const reassigned = this._distributor.reassignFromNode(nodeId);
    stmts.deleteNode.run(nodeId);

    logEvent('node.deregistered', nodeId, null, { reassigned });
    bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });

    return { nodeId, reassigned };
  }

  /**
   * Process a heartbeat from a worker node.
   *
   * @param {string} nodeId
   * @param {object} [data] - Optional report: `capacityUsed`, and — only
   *   applied when `hardware` is present — `hardware`, `capacityTotal`, `tags`, `version`.
   * @returns {{nodeId: string, status: string, pendingTasks: boolean}|null}
   *   `null` when the node is unknown. `pendingTasks` tells the worker whether
   *   at least one task is waiting.
   */
  heartbeat(nodeId, data = {}) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    const report = data || {};

    stmts.heartbeatNode.run({
      id: nodeId,
      capacity_used: report.capacityUsed ?? node.capacity_used,
    });

    // Profile fields are refreshed only when a hardware profile is sent.
    if (report.hardware) {
      stmts.updateNode.run({
        id: nodeId,
        name: node.name,
        endpoint: node.endpoint,
        region: node.region,
        zone: node.zone,
        capacity_total: report.capacityTotal || node.capacity_total,
        tags: JSON.stringify(report.tags || safeParse(node.tags, [])),
        hardware: JSON.stringify(report.hardware),
        version: report.version || node.version,
      });
    }

    return {
      nodeId,
      status: 'active',
      pendingTasks: stmts.getPendingTasks.all(1).length > 0,
    };
  }

  /**
   * Drain a node — mark it `draining` so it stops receiving new tasks while
   * running tasks finish.
   *
   * @param {string} nodeId
   * @returns {{nodeId: string, status: string, activeTasks: number}|null} `null` when the node is unknown.
   */
  drainNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
    logEvent('node.draining', nodeId, null, {});
    bus.emit('cluster.node.draining', { nodeId, name: node.name });

    return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
  }

  /**
   * Cordon a node — mark it `cordoned` to prevent scheduling while keeping running tasks.
   *
   * @param {string} nodeId
   * @returns {{nodeId: string, status: string}|null} `null` when the node is unknown.
   */
  cordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
    logEvent('node.cordoned', nodeId, null, {});
    bus.emit('cluster.node.cordoned', { nodeId, name: node.name });

    return { nodeId, status: 'cordoned' };
  }

  /**
   * Uncordon a node — set it back to `active` so it can be scheduled again.
   *
   * @param {string} nodeId
   * @returns {{nodeId: string, status: string}|null} `null` when the node is unknown.
   */
  uncordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
    logEvent('node.uncordoned', nodeId, null, {});
    // Mirror cordonNode/drainNode so bus subscribers see the reverse transition too.
    bus.emit('cluster.node.uncordoned', { nodeId, name: node.name });

    return { nodeId, status: 'active' };
  }

  /**
   * Get node details with `tags`/`hardware` decoded and an `activeTasks` count.
   *
   * @param {string} nodeId
   * @returns {object|null} `null` when the node is unknown.
   */
  getNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    node.tags = safeParse(node.tags, []);
    node.hardware = safeParse(node.hardware, {});
    node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
    return node;
  }

  /**
   * List cluster nodes with `tags`/`hardware` decoded.
   *
   * @param {object} [filter] - `region` selects one region; otherwise a truthy
   *   `active` selects active nodes; otherwise all nodes are listed.
   * @returns {object[]}
   */
  listNodes(filter = {}) {
    const criteria = filter || {};

    let nodes;
    if (criteria.region) {
      nodes = stmts.listNodesByRegion.all(criteria.region);
    } else if (criteria.active) {
      nodes = stmts.listActiveNodes.all();
    } else {
      nodes = stmts.listNodes.all();
    }

    return nodes.map((n) => ({
      ...n,
      tags: safeParse(n.tags, []),
      hardware: safeParse(n.hardware, {}),
    }));
  }

  // ─── Task Reporting ─────────────────────────────────────────────────

  /**
   * Worker reports that a task started.
   *
   * @param {string} taskId
   * @returns {{taskId: string, status: string}|null} `null` when the task is unknown.
   */
  reportTaskStarted(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    stmts.startTask.run(taskId);
    logEvent('task.started', task.node_id, taskId, {});
    bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
    return { taskId, status: 'running' };
  }

  /**
   * Worker reports that a task completed.
   *
   * @param {string} taskId
   * @param {*} [result] - Stored as JSON; a falsy result is stored as `{}`.
   * @returns {{taskId: string, status: string}|null} `null` when the task is unknown.
   */
  reportTaskCompleted(taskId, result) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    // A task that was already finished has already given its slot back;
    // releasing it again would under-count the node's load.
    const alreadyFinished = this._isFinished(task);

    stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
    if (task.node_id && !alreadyFinished) stmts.decrementNodeLoad.run(task.node_id);

    logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
    bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });

    return { taskId, status: 'completed' };
  }

  /**
   * Worker reports that a task failed. The task is requeued while attempts
   * remain, otherwise it is marked failed.
   *
   * @param {string} taskId
   * @param {string|Error|*} [error] - Failure description.
   * @returns {{taskId: string, status: string, attempt?: number}|null}
   *   `null` when the task is unknown. For a task that is already finished the
   *   report is ignored and the task's current status is returned.
   */
  reportTaskFailed(taskId, error) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    // A late/duplicate failure report must not requeue a finished task or
    // release its node slot a second time.
    if (this._isFinished(task)) return { taskId, status: task.status };

    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    // Error instances serialize to "{}", so log their message instead.
    const loggable = error instanceof Error ? error.message : error;

    if (task.attempts < task.max_attempts) {
      stmts.requeueTask.run(taskId);
      logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error: loggable });
      bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });

      // Try to place it again right away rather than waiting for the next tick.
      this._distributor._tryAssign(taskId);

      return { taskId, status: 'retrying', attempt: task.attempts };
    }

    // Max attempts reached
    stmts.failTask.run({ id: taskId, error: this._describeError(error) });
    logEvent('task.failed', task.node_id, taskId, { error: loggable, attempts: task.attempts });
    bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });

    return { taskId, status: 'failed' };
  }

  /**
   * Get task details with `payload`, `affinity_tags` and `result` decoded.
   *
   * @param {string} taskId
   * @returns {object|null} `null` when the task is unknown.
   */
  getTask(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    task.payload = safeParse(task.payload, {});
    task.affinity_tags = safeParse(task.affinity_tags, []);
    task.result = safeParse(task.result, null);
    return task;
  }

  /**
   * List tasks with their JSON columns decoded.
   *
   * @param {object} [filter] - `status` (with optional `limit`, default 50)
   *   takes precedence; otherwise `nodeId`; otherwise the latest `limit` tasks.
   * @returns {object[]}
   */
  listTasks(filter = {}) {
    const criteria = filter || {};

    let tasks;
    if (criteria.status) {
      tasks = stmts.getTasksByStatus.all(criteria.status, criteria.limit || 50);
    } else if (criteria.nodeId) {
      tasks = stmts.getTasksByNode.all(criteria.nodeId);
    } else {
      tasks = stmts.listTasks.all(criteria.limit || 50);
    }

    return tasks.map((t) => ({
      ...t,
      payload: safeParse(t.payload, {}),
      affinity_tags: safeParse(t.affinity_tags, []),
      result: safeParse(t.result, null),
    }));
  }

  // ─── Cluster Topology ───────────────────────────────────────────────

  /**
   * Get full cluster status: node counts by status, capacity of the active
   * nodes, task counts by status, per-region totals and distributor stats.
   *
   * @returns {{coordinator: object, nodes: object, capacity: object, tasks: object, regions: object, distributor: object}}
   */
  getClusterStatus() {
    const nodes = stmts.listNodes.all();

    const taskCounts = Object.fromEntries(
      stmts.countByStatus.all().map((row) => [row.status, row.count]),
    );

    // Region names originate from node registration. Accumulating in a Map
    // keeps a name such as "__proto__" from resolving to Object.prototype and
    // polluting it.
    const regionTotals = new Map();
    const statusTotals = new Map();
    let totalCapacity = 0;
    let usedCapacity = 0;

    for (const node of nodes) {
      statusTotals.set(node.status, (statusTotals.get(node.status) || 0) + 1);

      const regionKey = String(node.region);
      let region = regionTotals.get(regionKey);
      if (!region) {
        region = { nodes: 0, active: 0, capacity: 0, used: 0 };
        regionTotals.set(regionKey, region);
      }
      region.nodes++;

      // Only active nodes contribute capacity.
      if (node.status === 'active') {
        region.active++;
        region.capacity += node.capacity_total;
        region.used += node.capacity_used;
        totalCapacity += node.capacity_total;
        usedCapacity += node.capacity_used;
      }
    }

    return {
      coordinator: { started: this._started },
      nodes: {
        total: nodes.length,
        active: statusTotals.get('active') || 0,
        draining: statusTotals.get('draining') || 0,
        cordoned: statusTotals.get('cordoned') || 0,
        dead: statusTotals.get('dead') || 0,
      },
      capacity: {
        total: totalCapacity,
        used: usedCapacity,
        available: totalCapacity - usedCapacity,
        utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
      },
      tasks: taskCounts,
      regions: Object.fromEntries(regionTotals),
      distributor: this._distributor.getStats(),
    };
  }

  /**
   * Get the cluster event log with each event's `data` decoded.
   *
   * @param {number} [limit=100]
   * @param {string|null} [nodeId=null] - When set, only that node's events.
   * @returns {object[]}
   */
  getEvents(limit = 100, nodeId = null) {
    const rows = nodeId
      ? stmts.getEventsByNode.all(nodeId, limit)
      : stmts.getEvents.all(limit);

    return rows.map((e) => ({
      ...e,
      data: safeParse(e.data, {}),
    }));
  }

  // ─── Internal Operations ────────────────────────────────────────────

  /**
   * Mark nodes returned by the stale-node query as dead and fail their tasks over.
   */
  _healthCheck() {
    const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);

    for (const node of staleNodes) {
      stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
      logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
      bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });

      // Failover: hand every task of the dead node back to the distributor.
      const reassigned = this._distributor.reassignFromNode(node.id);
      logEvent('node.failover', node.id, null, { reassigned });
      bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
    }
  }

  /**
   * Recover tasks returned by the stuck-task query: fail those out of
   * attempts, requeue and re-offer the rest.
   */
  _recoverStuckTasks() {
    const stuckTasks = stmts.getStuckTasks.all(this._stuckThresholdSec);

    for (const task of stuckTasks) {
      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        logEvent('task.stuck_failed', task.node_id, task.id, {});
      } else {
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        stmts.requeueTask.run(task.id);
        this._distributor._tryAssign(task.id);
        logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
      }
    }
  }

  /**
   * Rebalance tasks across active nodes when load is skewed.
   *
   * A node is overloaded above 1.5× the average load ratio (and more than 2
   * used slots) and underloaded below 0.5×. Up to two `assigned` tasks per
   * overloaded node are moved to an underloaded node that has free capacity
   * and satisfies the task's affinity.
   */
  _rebalance() {
    // A node without capacity cannot take work and would turn the load ratio
    // into NaN/Infinity, which silently disabled rebalancing for everyone.
    const nodes = stmts.listActiveNodes.all().filter((n) => n.capacity_total > 0);
    if (nodes.length < 2) return;

    const loadOf = (n) => n.capacity_used / n.capacity_total;
    const avgLoad = nodes.reduce((sum, n) => sum + loadOf(n), 0) / nodes.length;
    const overloaded = nodes.filter((n) => loadOf(n) > avgLoad * 1.5 && n.capacity_used > 2);
    const underloaded = nodes.filter((n) => loadOf(n) < avgLoad * 0.5);

    if (overloaded.length === 0 || underloaded.length === 0) return;

    const MAX_MOVES_PER_NODE = 2;
    let moved = 0;

    for (const over of overloaded) {
      const candidates = stmts.getTasksByNode.all(over.id).filter((t) => t.status === 'assigned');
      let movedFromNode = 0;

      for (const task of candidates) {
        if (movedFromNode >= MAX_MOVES_PER_NODE) break;

        // Same affinity rules pullTasks applies: never move a task to a node
        // it would not have been allowed to run on in the first place.
        const target = underloaded.find(
          (n) => n.capacity_used < n.capacity_total && this._matchesAffinity(task, n),
        );
        if (!target) continue;

        stmts.decrementNodeLoad.run(over.id);
        stmts.assignTask.run({ id: task.id, node_id: target.id });
        stmts.incrementNodeLoad.run(target.id);
        // Keep the in-memory row in step so the capacity check above stays accurate.
        target.capacity_used++;
        movedFromNode++;
        moved++;

        logEvent('task.rebalanced', target.id, task.id, { from: over.id });
        this._distributor._notifyWorker(target, task.id, task);
      }
    }

    if (moved > 0) {
      bus.emit('cluster.rebalanced', { tasksMoved: moved });
    }
  }

  /**
   * Whether a task row is already in a terminal state.
   * NOTE(review): assumes the stored status strings are 'completed' and
   * 'failed' (the values this class reports) — confirm against the task
   * statements. If they differ this check is simply never true.
   */
  _isFinished(task) {
    return task.status === 'completed' || task.status === 'failed';
  }

  /**
   * Turn any failure value into a non-empty string suitable for storage.
   */
  _describeError(error) {
    if (typeof error === 'string') return error;
    if (error instanceof Error) return error.message;
    if (error == null) return 'Unknown error';
    try {
      // JSON.stringify yields undefined for functions/symbols.
      return JSON.stringify(error) ?? String(error);
    } catch {
      return String(error);
    }
  }

  /**
   * Whether `node` satisfies the region/tag affinity of `task`.
   */
  _matchesAffinity(task, node) {
    if (task.affinity_region && node.region !== task.affinity_region) return false;

    const wanted = safeParse(task.affinity_tags, []);
    if (!Array.isArray(wanted) || wanted.length === 0) return true;

    const nodeTags = safeParse(node.tags, []);
    return Array.isArray(nodeTags) && wanted.some((tag) => nodeTags.includes(tag));
  }
}
867
+
868
+ // ═══════════════════════════════════════════════════════════════════════════
869
+ // HELPERS
870
+ // ═══════════════════════════════════════════════════════════════════════════
871
+
872
/**
 * Decode a JSON column without ever throwing.
 *
 * @param {*} str - JSON text, an already-decoded object/array, or null/undefined.
 * @param {*} fallback - Returned when `str` is null/undefined, is not valid
 *   JSON, or decodes to `null`.
 * @returns {*} The decoded value, `str` itself when it is already an object, or `fallback`.
 */
function safeParse(str, fallback) {
  if (str == null) return fallback;
  if (typeof str === 'object') return str;
  try {
    const parsed = JSON.parse(str);
    // The text "null" carries no more information than a missing value; hand
    // back the fallback so callers relying on an array/object default do not
    // crash on `.length` or property access.
    return parsed === null ? fallback : parsed;
  } catch {
    return fallback;
  }
}
877
+
878
/**
 * Append an entry to the cluster event log. Best-effort: any failure while
 * serializing or inserting is swallowed so logging never interrupts the caller.
 *
 * @param {string} type - Event type, stored as `event_type`.
 * @param {string} [nodeId] - Related node; a falsy value is stored as null.
 * @param {string} [taskId] - Related task; a falsy value is stored as null.
 * @param {object} [data] - Extra details, stored as JSON (`{}` when falsy).
 */
function logEvent(type, nodeId, taskId, data) {
  try {
    // Serialization stays inside the try so an unserializable payload is
    // dropped just like an insert failure.
    const serialized = JSON.stringify(data ? data : {});
    const entry = {
      event_type: type,
      node_id: nodeId ? nodeId : null,
      task_id: taskId ? taskId : null,
      data: serialized,
    };
    stmts.insertEvent.run(entry);
  } catch {
    /* best-effort logging */
  }
}
888
+
889
+ // ─── Singleton ───────────────────────────────────────────────────────
890
+
891
// One TaskDistributor instance, handed to one ClusterOrchestrator instance.
const distributor = new TaskDistributor();
const cluster = new ClusterOrchestrator(distributor);

// Export both instances alongside both classes.
module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };