web-agent-bridge 3.2.0 → 3.3.0
This diff represents the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
- package/LICENSE +72 -72
- package/README.ar.md +1286 -1152
- package/README.md +1764 -1635
- package/bin/agent-runner.js +474 -474
- package/bin/cli.js +237 -138
- package/bin/wab.js +80 -80
- package/examples/bidi-agent.js +119 -119
- package/examples/cross-site-agent.js +91 -91
- package/examples/mcp-agent.js +94 -94
- package/examples/next-app-router/README.md +44 -44
- package/examples/puppeteer-agent.js +108 -108
- package/examples/saas-dashboard/README.md +55 -55
- package/examples/shopify-hydrogen/README.md +74 -74
- package/examples/vision-agent.js +171 -171
- package/examples/wordpress-elementor/README.md +77 -77
- package/package.json +16 -3
- package/public/.well-known/agent-tools.json +180 -180
- package/public/.well-known/ai-assets.json +59 -59
- package/public/.well-known/security.txt +8 -0
- package/public/agent-workspace.html +349 -349
- package/public/ai.html +198 -198
- package/public/api.html +413 -412
- package/public/browser.html +486 -486
- package/public/commander-dashboard.html +243 -243
- package/public/cookies.html +210 -210
- package/public/css/agent-workspace.css +1713 -1713
- package/public/css/premium.css +317 -317
- package/public/css/styles.css +1235 -1235
- package/public/dashboard.html +706 -706
- package/public/dns.html +507 -0
- package/public/docs.html +587 -587
- package/public/feed.xml +89 -89
- package/public/growth.html +463 -463
- package/public/index.html +1070 -982
- package/public/integrations.html +556 -0
- package/public/js/agent-workspace.js +1740 -1740
- package/public/js/auth-nav.js +31 -31
- package/public/js/auth-redirect.js +12 -12
- package/public/js/cookie-consent.js +56 -56
- package/public/js/wab-demo-page.js +721 -721
- package/public/js/ws-client.js +74 -74
- package/public/llms-full.txt +360 -360
- package/public/llms.txt +125 -125
- package/public/login.html +85 -85
- package/public/mesh-dashboard.html +328 -328
- package/public/openapi.json +580 -580
- package/public/phone-shield.html +281 -0
- package/public/premium-dashboard.html +2489 -2489
- package/public/premium.html +793 -793
- package/public/privacy.html +297 -297
- package/public/register.html +105 -105
- package/public/robots.txt +87 -87
- package/public/script/wab-consent.d.ts +36 -36
- package/public/script/wab-consent.js +104 -104
- package/public/script/wab-schema.js +131 -131
- package/public/script/wab.d.ts +108 -108
- package/public/script/wab.min.js +580 -580
- package/public/security.txt +8 -0
- package/public/terms.html +256 -256
- package/script/ai-agent-bridge.js +1754 -1754
- package/sdk/README.md +99 -99
- package/sdk/agent-mesh.js +449 -449
- package/sdk/commander.js +262 -262
- package/sdk/index.d.ts +464 -464
- package/sdk/index.js +12 -1
- package/sdk/multi-agent.js +318 -318
- package/sdk/package.json +1 -1
- package/sdk/safety-shield.js +219 -0
- package/sdk/schema-discovery.js +83 -83
- package/server/adapters/index.js +520 -520
- package/server/config/plans.js +367 -367
- package/server/config/secrets.js +102 -102
- package/server/control-plane/index.js +301 -301
- package/server/data-plane/index.js +354 -354
- package/server/index.js +531 -427
- package/server/llm/index.js +404 -404
- package/server/middleware/adminAuth.js +35 -35
- package/server/middleware/auth.js +50 -50
- package/server/middleware/featureGate.js +88 -88
- package/server/middleware/rateLimits.js +100 -100
- package/server/middleware/sensitiveAction.js +157 -0
- package/server/migrations/001_add_analytics_indexes.sql +7 -7
- package/server/migrations/002_premium_features.sql +418 -418
- package/server/migrations/003_ads_integer_cents.sql +33 -33
- package/server/migrations/004_agent_os.sql +158 -158
- package/server/migrations/005_marketplace_metering.sql +126 -126
- package/server/models/adapters/index.js +33 -33
- package/server/models/adapters/mysql.js +183 -183
- package/server/models/adapters/postgresql.js +172 -172
- package/server/models/adapters/sqlite.js +7 -7
- package/server/models/db.js +681 -681
- package/server/observability/failure-analysis.js +337 -337
- package/server/observability/index.js +394 -394
- package/server/protocol/capabilities.js +223 -223
- package/server/protocol/index.js +243 -243
- package/server/protocol/schema.js +584 -584
- package/server/registry/certification.js +271 -271
- package/server/registry/index.js +326 -326
- package/server/routes/admin-premium.js +671 -671
- package/server/routes/admin.js +261 -261
- package/server/routes/ads.js +130 -130
- package/server/routes/agent-workspace.js +540 -540
- package/server/routes/api.js +150 -150
- package/server/routes/auth.js +71 -71
- package/server/routes/billing.js +45 -45
- package/server/routes/commander.js +316 -316
- package/server/routes/demo-showcase.js +332 -332
- package/server/routes/demo-store.js +154 -0
- package/server/routes/discovery.js +417 -417
- package/server/routes/gateway.js +173 -157
- package/server/routes/license.js +251 -240
- package/server/routes/mesh.js +469 -469
- package/server/routes/noscript.js +543 -543
- package/server/routes/premium-v2.js +686 -686
- package/server/routes/premium.js +724 -724
- package/server/routes/runtime.js +2148 -2147
- package/server/routes/sovereign.js +465 -385
- package/server/routes/universal.js +200 -185
- package/server/routes/wab-api.js +850 -501
- package/server/runtime/container-worker.js +111 -111
- package/server/runtime/container.js +448 -448
- package/server/runtime/distributed-worker.js +362 -362
- package/server/runtime/event-bus.js +210 -210
- package/server/runtime/index.js +253 -253
- package/server/runtime/queue.js +599 -599
- package/server/runtime/replay.js +666 -666
- package/server/runtime/sandbox.js +266 -266
- package/server/runtime/scheduler.js +534 -534
- package/server/runtime/session-engine.js +293 -293
- package/server/runtime/state-manager.js +188 -188
- package/server/security/cross-site-redactor.js +196 -0
- package/server/security/dry-run.js +180 -0
- package/server/security/human-gate-rate-limit.js +147 -0
- package/server/security/human-gate-transports.js +178 -0
- package/server/security/human-gate.js +281 -0
- package/server/security/index.js +368 -368
- package/server/security/intent-engine.js +245 -0
- package/server/security/reward-guard.js +171 -0
- package/server/security/rollback-store.js +239 -0
- package/server/security/token-scope.js +404 -0
- package/server/security/url-policy.js +139 -0
- package/server/services/agent-chat.js +506 -506
- package/server/services/agent-learning.js +601 -575
- package/server/services/agent-memory.js +625 -625
- package/server/services/agent-mesh.js +555 -539
- package/server/services/agent-symphony.js +717 -717
- package/server/services/agent-tasks.js +1807 -1807
- package/server/services/api-key-engine.js +292 -261
- package/server/services/cluster.js +894 -894
- package/server/services/commander.js +738 -738
- package/server/services/edge-compute.js +440 -440
- package/server/services/email.js +204 -204
- package/server/services/hosted-runtime.js +205 -205
- package/server/services/lfd.js +635 -635
- package/server/services/local-ai.js +389 -389
- package/server/services/marketplace.js +270 -270
- package/server/services/metering.js +182 -182
- package/server/services/modules/affiliate-intelligence.js +93 -93
- package/server/services/modules/agent-firewall.js +90 -90
- package/server/services/modules/bounty.js +89 -89
- package/server/services/modules/collective-bargaining.js +92 -92
- package/server/services/modules/dark-pattern.js +66 -66
- package/server/services/modules/gov-intelligence.js +45 -45
- package/server/services/modules/neural.js +55 -55
- package/server/services/modules/notary.js +49 -49
- package/server/services/modules/price-time-machine.js +86 -86
- package/server/services/modules/protocol.js +104 -104
- package/server/services/negotiation.js +439 -439
- package/server/services/plugins.js +771 -771
- package/server/services/price-intelligence.js +566 -566
- package/server/services/price-shield.js +1137 -1137
- package/server/services/reputation.js +465 -465
- package/server/services/search-engine.js +357 -357
- package/server/services/security.js +513 -513
- package/server/services/self-healing.js +843 -843
- package/server/services/sovereign-shield.js +542 -0
- package/server/services/stripe.js +192 -192
- package/server/services/swarm.js +788 -788
- package/server/services/universal-scraper.js +662 -661
- package/server/services/verification.js +481 -481
- package/server/services/vision.js +1163 -1163
- package/server/utils/cache.js +125 -125
- package/server/utils/migrate.js +81 -81
- package/server/utils/safe-fetch.js +228 -0
- package/server/utils/secureFields.js +50 -50
- package/server/ws.js +161 -161
- package/templates/artisan-marketplace.yaml +104 -104
- package/templates/book-price-scout.yaml +98 -98
- package/templates/electronics-price-tracker.yaml +108 -108
- package/templates/flight-deal-hunter.yaml +113 -113
- package/templates/freelancer-direct.yaml +116 -116
- package/templates/grocery-price-compare.yaml +93 -93
- package/templates/hotel-direct-booking.yaml +113 -113
- package/templates/local-services.yaml +98 -98
- package/templates/olive-oil-tunisia.yaml +88 -88
- package/templates/organic-farm-fresh.yaml +101 -101
- package/templates/restaurant-direct.yaml +97 -97
- package/public/score.html +0 -263
- package/server/migrations/006_growth_suite.sql +0 -138
- package/server/routes/growth.js +0 -962
- package/server/services/fairness-engine.js +0 -409
- package/server/services/fairness.js +0 -420
|
@@ -1,894 +1,894 @@
|
|
|
1
|
-
'use strict';
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
|
|
5
|
-
*
|
|
6
|
-
* Turns WAB from a single-server Agent OS into a distributed fleet.
|
|
7
|
-
*
|
|
8
|
-
* Architecture:
|
|
9
|
-
* ┌──────────────┐ ┌──────────┐ ┌──────────┐
|
|
10
|
-
* │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
|
|
11
|
-
* │ (this node) │────▶│ (remote) │ │ (remote) │
|
|
12
|
-
* │ │────▶│ │ │ │
|
|
13
|
-
* └──────────────┘ └──────────┘ └──────────┘
|
|
14
|
-
* │ ▲ ▲
|
|
15
|
-
* │ │ │
|
|
16
|
-
* └───────────────────┴─────────────────┘
|
|
17
|
-
* heartbeat / task results
|
|
18
|
-
*
|
|
19
|
-
* Components:
|
|
20
|
-
* 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
|
|
21
|
-
* 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
|
|
22
|
-
* 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
|
|
23
|
-
*
|
|
24
|
-
* Communication: HTTP/JSON between nodes (pull-based + push notifications)
|
|
25
|
-
* Persistence: SQLite tables for durability across restarts
|
|
26
|
-
* Consistency: Leader-based (coordinator is source of truth)
|
|
27
|
-
*/
|
|
28
|
-
|
|
29
|
-
const crypto = require('crypto');
|
|
30
|
-
const http = require('http');
|
|
31
|
-
const https = require('https');
|
|
32
|
-
const { URL } = require('url');
|
|
33
|
-
const { db } = require('../models/db');
|
|
34
|
-
const { bus } = require('../runtime/event-bus');
|
|
35
|
-
|
|
36
|
-
// ─── Schema ──────────────────────────────────────────────────────────
|
|
37
|
-
|
|
38
|
-
db.exec(`
|
|
39
|
-
CREATE TABLE IF NOT EXISTS cluster_nodes (
|
|
40
|
-
id TEXT PRIMARY KEY,
|
|
41
|
-
name TEXT NOT NULL,
|
|
42
|
-
endpoint TEXT NOT NULL,
|
|
43
|
-
region TEXT DEFAULT 'default',
|
|
44
|
-
zone TEXT DEFAULT 'a',
|
|
45
|
-
role TEXT DEFAULT 'worker',
|
|
46
|
-
status TEXT DEFAULT 'joining',
|
|
47
|
-
capacity_total INTEGER DEFAULT 20,
|
|
48
|
-
capacity_used INTEGER DEFAULT 0,
|
|
49
|
-
tags TEXT DEFAULT '[]',
|
|
50
|
-
hardware TEXT DEFAULT '{}',
|
|
51
|
-
version TEXT,
|
|
52
|
-
secret_hash TEXT,
|
|
53
|
-
last_heartbeat TEXT DEFAULT (datetime('now')),
|
|
54
|
-
registered_at TEXT DEFAULT (datetime('now')),
|
|
55
|
-
updated_at TEXT DEFAULT (datetime('now'))
|
|
56
|
-
);
|
|
57
|
-
|
|
58
|
-
CREATE TABLE IF NOT EXISTS cluster_tasks (
|
|
59
|
-
id TEXT PRIMARY KEY,
|
|
60
|
-
external_id TEXT,
|
|
61
|
-
node_id TEXT,
|
|
62
|
-
task_type TEXT NOT NULL,
|
|
63
|
-
objective TEXT,
|
|
64
|
-
payload TEXT DEFAULT '{}',
|
|
65
|
-
priority INTEGER DEFAULT 50,
|
|
66
|
-
status TEXT DEFAULT 'pending',
|
|
67
|
-
result TEXT,
|
|
68
|
-
error TEXT,
|
|
69
|
-
attempts INTEGER DEFAULT 0,
|
|
70
|
-
max_attempts INTEGER DEFAULT 3,
|
|
71
|
-
affinity_tags TEXT DEFAULT '[]',
|
|
72
|
-
affinity_region TEXT,
|
|
73
|
-
timeout_ms INTEGER DEFAULT 60000,
|
|
74
|
-
submitted_at TEXT DEFAULT (datetime('now')),
|
|
75
|
-
assigned_at TEXT,
|
|
76
|
-
started_at TEXT,
|
|
77
|
-
completed_at TEXT
|
|
78
|
-
);
|
|
79
|
-
|
|
80
|
-
CREATE TABLE IF NOT EXISTS cluster_events (
|
|
81
|
-
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
82
|
-
event_type TEXT NOT NULL,
|
|
83
|
-
node_id TEXT,
|
|
84
|
-
task_id TEXT,
|
|
85
|
-
data TEXT DEFAULT '{}',
|
|
86
|
-
created_at TEXT DEFAULT (datetime('now'))
|
|
87
|
-
);
|
|
88
|
-
|
|
89
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
|
|
90
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
|
|
91
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
|
|
92
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
|
|
93
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
|
|
94
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
|
|
95
|
-
CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
|
|
96
|
-
`);
|
|
97
|
-
|
|
98
|
-
// ─── Prepared Statements ─────────────────────────────────────────────
|
|
99
|
-
|
|
100
|
-
const stmts = {
|
|
101
|
-
// Nodes
|
|
102
|
-
insertNode: db.prepare(`
|
|
103
|
-
INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
|
|
104
|
-
VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
|
|
105
|
-
`),
|
|
106
|
-
updateNode: db.prepare(`
|
|
107
|
-
UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
|
|
108
|
-
capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
|
|
109
|
-
WHERE id=@id
|
|
110
|
-
`),
|
|
111
|
-
setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
|
|
112
|
-
heartbeatNode: db.prepare(`
|
|
113
|
-
UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
|
|
114
|
-
WHERE id=@id
|
|
115
|
-
`),
|
|
116
|
-
getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
|
|
117
|
-
getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
|
|
118
|
-
listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
|
|
119
|
-
listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
|
|
120
|
-
listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
|
|
121
|
-
deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
|
|
122
|
-
getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),
|
|
123
|
-
|
|
124
|
-
// Tasks
|
|
125
|
-
insertTask: db.prepare(`
|
|
126
|
-
INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
|
|
127
|
-
VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
|
|
128
|
-
`),
|
|
129
|
-
assignTask: db.prepare(`
|
|
130
|
-
UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
|
|
131
|
-
WHERE id=@id
|
|
132
|
-
`),
|
|
133
|
-
startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
|
|
134
|
-
completeTask: db.prepare(`
|
|
135
|
-
UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
|
|
136
|
-
`),
|
|
137
|
-
failTask: db.prepare(`
|
|
138
|
-
UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
|
|
139
|
-
`),
|
|
140
|
-
requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
|
|
141
|
-
getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
|
|
142
|
-
getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
|
|
143
|
-
getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
|
|
144
|
-
getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
|
|
145
|
-
getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
|
|
146
|
-
listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
|
|
147
|
-
getStuckTasks: db.prepare(`
|
|
148
|
-
SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
|
|
149
|
-
AND assigned_at < datetime('now', '-' || ? || ' seconds')
|
|
150
|
-
`),
|
|
151
|
-
countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
|
|
152
|
-
incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
|
|
153
|
-
decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),
|
|
154
|
-
|
|
155
|
-
// Events
|
|
156
|
-
insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
|
|
157
|
-
getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
|
|
158
|
-
getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
|
|
159
|
-
};
|
|
160
|
-
|
|
161
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
162
|
-
// TASK DISTRIBUTOR
|
|
163
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
164
|
-
|
|
165
|
-
/**
|
|
166
|
-
* Routes tasks to worker nodes based on capacity, affinity, and load balancing.
|
|
167
|
-
*
|
|
168
|
-
* Strategies:
|
|
169
|
-
* - least-loaded: Pick the node with the most free capacity
|
|
170
|
-
* - affinity: Match task tags to node tags
|
|
171
|
-
* - region: Prefer nodes in the same region as the task
|
|
172
|
-
* - round-robin: Distribute evenly across all active nodes
|
|
173
|
-
*/
|
|
174
|
-
class TaskDistributor {
|
|
175
|
-
constructor() {
|
|
176
|
-
this._roundRobinIndex = 0;
|
|
177
|
-
this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
/**
|
|
181
|
-
* Submit a task for distributed execution
|
|
182
|
-
*/
|
|
183
|
-
submit(task) {
|
|
184
|
-
const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
|
|
185
|
-
const entry = {
|
|
186
|
-
id,
|
|
187
|
-
external_id: task.externalId || null,
|
|
188
|
-
task_type: task.type || 'general',
|
|
189
|
-
objective: task.objective || '',
|
|
190
|
-
payload: JSON.stringify(task.params || {}),
|
|
191
|
-
priority: task.priority || 50,
|
|
192
|
-
status: 'pending',
|
|
193
|
-
affinity_tags: JSON.stringify(task.affinityTags || []),
|
|
194
|
-
affinity_region: task.affinityRegion || null,
|
|
195
|
-
timeout_ms: task.timeout || 60000,
|
|
196
|
-
max_attempts: task.maxAttempts || 3,
|
|
197
|
-
};
|
|
198
|
-
stmts.insertTask.run(entry);
|
|
199
|
-
|
|
200
|
-
bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
|
|
201
|
-
this._stats.distributed++;
|
|
202
|
-
|
|
203
|
-
// Try immediate assignment
|
|
204
|
-
this._tryAssign(id);
|
|
205
|
-
|
|
206
|
-
return { taskId: id, status: 'pending' };
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
/**
|
|
210
|
-
* Try to assign a task to a worker node
|
|
211
|
-
*/
|
|
212
|
-
_tryAssign(taskId) {
|
|
213
|
-
const task = stmts.getTask.get(taskId);
|
|
214
|
-
if (!task || task.status !== 'pending') return false;
|
|
215
|
-
|
|
216
|
-
const node = this._selectNode(task);
|
|
217
|
-
if (!node) {
|
|
218
|
-
this._stats.noCapacity++;
|
|
219
|
-
return false;
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
stmts.assignTask.run({ id: taskId, node_id: node.id });
|
|
223
|
-
stmts.incrementNodeLoad.run(node.id);
|
|
224
|
-
|
|
225
|
-
logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
|
|
226
|
-
bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });
|
|
227
|
-
|
|
228
|
-
// Push notification to worker (fire-and-forget)
|
|
229
|
-
this._notifyWorker(node, taskId, task);
|
|
230
|
-
|
|
231
|
-
return true;
|
|
232
|
-
}
|
|
233
|
-
|
|
234
|
-
/**
|
|
235
|
-
* Select the best node for a task
|
|
236
|
-
*/
|
|
237
|
-
_selectNode(task) {
|
|
238
|
-
let candidates = stmts.listActiveNodes.all();
|
|
239
|
-
if (candidates.length === 0) return null;
|
|
240
|
-
|
|
241
|
-
// Filter by capacity
|
|
242
|
-
candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
|
|
243
|
-
if (candidates.length === 0) return null;
|
|
244
|
-
|
|
245
|
-
const affinityTags = safeParse(task.affinity_tags, []);
|
|
246
|
-
const affinityRegion = task.affinity_region;
|
|
247
|
-
|
|
248
|
-
// Strategy 1: Region affinity
|
|
249
|
-
if (affinityRegion) {
|
|
250
|
-
const regionNodes = candidates.filter(n => n.region === affinityRegion);
|
|
251
|
-
if (regionNodes.length > 0) {
|
|
252
|
-
candidates = regionNodes;
|
|
253
|
-
this._lastStrategy = 'region';
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
// Strategy 2: Tag affinity
|
|
258
|
-
if (affinityTags.length > 0) {
|
|
259
|
-
const tagged = candidates.filter(n => {
|
|
260
|
-
const nodeTags = safeParse(n.tags, []);
|
|
261
|
-
return affinityTags.some(t => nodeTags.includes(t));
|
|
262
|
-
});
|
|
263
|
-
if (tagged.length > 0) {
|
|
264
|
-
candidates = tagged;
|
|
265
|
-
this._lastStrategy = 'affinity';
|
|
266
|
-
}
|
|
267
|
-
}
|
|
268
|
-
|
|
269
|
-
// Strategy 3: Least-loaded
|
|
270
|
-
candidates.sort((a, b) => {
|
|
271
|
-
const loadA = a.capacity_used / a.capacity_total;
|
|
272
|
-
const loadB = b.capacity_used / b.capacity_total;
|
|
273
|
-
return loadA - loadB;
|
|
274
|
-
});
|
|
275
|
-
|
|
276
|
-
this._lastStrategy = this._lastStrategy || 'least-loaded';
|
|
277
|
-
return candidates[0];
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
/**
|
|
281
|
-
* Push task notification to a worker node
|
|
282
|
-
*/
|
|
283
|
-
_notifyWorker(node, taskId, task) {
|
|
284
|
-
const payload = JSON.stringify({
|
|
285
|
-
type: 'task.assigned',
|
|
286
|
-
taskId,
|
|
287
|
-
taskType: task.task_type,
|
|
288
|
-
objective: task.objective,
|
|
289
|
-
params: safeParse(task.payload, {}),
|
|
290
|
-
priority: task.priority,
|
|
291
|
-
timeout: task.timeout_ms,
|
|
292
|
-
});
|
|
293
|
-
|
|
294
|
-
const url = new URL('/wab-worker/tasks/notify', node.endpoint);
|
|
295
|
-
const mod = url.protocol === 'https:' ? https : http;
|
|
296
|
-
|
|
297
|
-
const req = mod.request(url, {
|
|
298
|
-
method: 'POST',
|
|
299
|
-
headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
|
|
300
|
-
timeout: 5000,
|
|
301
|
-
});
|
|
302
|
-
req.on('error', () => { /* best-effort push */ });
|
|
303
|
-
req.write(payload);
|
|
304
|
-
req.end();
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
/**
|
|
308
|
-
* Reassign tasks from a dead node to other nodes
|
|
309
|
-
*/
|
|
310
|
-
reassignFromNode(nodeId) {
|
|
311
|
-
const tasks = stmts.getTasksByNode.all(nodeId);
|
|
312
|
-
let reassigned = 0;
|
|
313
|
-
|
|
314
|
-
for (const task of tasks) {
|
|
315
|
-
if (task.attempts >= task.max_attempts) {
|
|
316
|
-
stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
|
|
317
|
-
logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
|
|
318
|
-
bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
|
|
319
|
-
continue;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
stmts.decrementNodeLoad.run(nodeId);
|
|
323
|
-
stmts.requeueTask.run(task.id);
|
|
324
|
-
logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });
|
|
325
|
-
|
|
326
|
-
// Try to assign to another node
|
|
327
|
-
if (this._tryAssign(task.id)) {
|
|
328
|
-
reassigned++;
|
|
329
|
-
this._stats.reassigned++;
|
|
330
|
-
}
|
|
331
|
-
}
|
|
332
|
-
|
|
333
|
-
return reassigned;
|
|
334
|
-
}
|
|
335
|
-
|
|
336
|
-
/**
|
|
337
|
-
* Process pending tasks — called periodically
|
|
338
|
-
*/
|
|
339
|
-
processPending() {
|
|
340
|
-
const pending = stmts.getPendingTasks.all(50);
|
|
341
|
-
let assigned = 0;
|
|
342
|
-
for (const task of pending) {
|
|
343
|
-
if (this._tryAssign(task.id)) assigned++;
|
|
344
|
-
}
|
|
345
|
-
return assigned;
|
|
346
|
-
}
|
|
347
|
-
|
|
348
|
-
/**
|
|
349
|
-
* Worker pulls tasks for execution
|
|
350
|
-
*/
|
|
351
|
-
pullTasks(nodeId, limit = 5) {
|
|
352
|
-
const node = stmts.getNode.get(nodeId);
|
|
353
|
-
if (!node || node.status !== 'active') return [];
|
|
354
|
-
|
|
355
|
-
const available = node.capacity_total - node.capacity_used;
|
|
356
|
-
if (available <= 0) return [];
|
|
357
|
-
|
|
358
|
-
const count = Math.min(limit, available);
|
|
359
|
-
const pending = stmts.getPendingTasks.all(count);
|
|
360
|
-
const assigned = [];
|
|
361
|
-
|
|
362
|
-
for (const task of pending) {
|
|
363
|
-
// Check affinity
|
|
364
|
-
const affinityRegion = task.affinity_region;
|
|
365
|
-
if (affinityRegion && node.region !== affinityRegion) continue;
|
|
366
|
-
|
|
367
|
-
const affinityTags = safeParse(task.affinity_tags, []);
|
|
368
|
-
const nodeTags = safeParse(node.tags, []);
|
|
369
|
-
if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;
|
|
370
|
-
|
|
371
|
-
stmts.assignTask.run({ id: task.id, node_id: nodeId });
|
|
372
|
-
stmts.incrementNodeLoad.run(nodeId);
|
|
373
|
-
logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });
|
|
374
|
-
|
|
375
|
-
assigned.push({
|
|
376
|
-
taskId: task.id,
|
|
377
|
-
type: task.task_type,
|
|
378
|
-
objective: task.objective,
|
|
379
|
-
params: safeParse(task.payload, {}),
|
|
380
|
-
priority: task.priority,
|
|
381
|
-
timeout: task.timeout_ms,
|
|
382
|
-
});
|
|
383
|
-
}
|
|
384
|
-
|
|
385
|
-
return assigned;
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
getStats() { return { ...this._stats }; }
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
392
|
-
// CLUSTER ORCHESTRATOR
|
|
393
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
394
|
-
|
|
395
|
-
/**
|
|
396
|
-
* Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
|
|
397
|
-
*
|
|
398
|
-
* Responsibilities:
|
|
399
|
-
* - Node registration and authentication
|
|
400
|
-
* - Health monitoring via heartbeats
|
|
401
|
-
* - Dead node detection and task failover
|
|
402
|
-
* - Load rebalancing across the cluster
|
|
403
|
-
* - Cluster topology and status reporting
|
|
404
|
-
* - Drain and cordon operations
|
|
405
|
-
*/
|
|
406
|
-
class ClusterOrchestrator {
|
|
407
|
-
constructor(distributor) {
|
|
408
|
-
this._distributor = distributor;
|
|
409
|
-
this._heartbeatThresholdSec = 90; // Node considered dead after 90s no heartbeat
|
|
410
|
-
this._checkInterval = null;
|
|
411
|
-
this._rebalanceInterval = null;
|
|
412
|
-
this._started = false;
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
// ─── Lifecycle ──────────────────────────────────────────────────────
|
|
416
|
-
|
|
417
|
-
/**
|
|
418
|
-
* Start the orchestrator — begins periodic health checks and task processing
|
|
419
|
-
*/
|
|
420
|
-
start() {
|
|
421
|
-
if (this._started) return;
|
|
422
|
-
this._started = true;
|
|
423
|
-
|
|
424
|
-
// Health check every 30s
|
|
425
|
-
this._checkInterval = setInterval(() => {
|
|
426
|
-
this._healthCheck();
|
|
427
|
-
this._recoverStuckTasks();
|
|
428
|
-
this._distributor.processPending();
|
|
429
|
-
}, 30_000);
|
|
430
|
-
if (this._checkInterval.unref) this._checkInterval.unref();
|
|
431
|
-
|
|
432
|
-
// Rebalance every 5 min
|
|
433
|
-
this._rebalanceInterval = setInterval(() => {
|
|
434
|
-
this._rebalance();
|
|
435
|
-
}, 300_000);
|
|
436
|
-
if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();
|
|
437
|
-
|
|
438
|
-
bus.emit('cluster.started', { timestamp: Date.now() });
|
|
439
|
-
}
|
|
440
|
-
|
|
441
|
-
/**
|
|
442
|
-
* Stop the orchestrator
|
|
443
|
-
*/
|
|
444
|
-
stop() {
|
|
445
|
-
if (!this._started) return;
|
|
446
|
-
this._started = false;
|
|
447
|
-
if (this._checkInterval) clearInterval(this._checkInterval);
|
|
448
|
-
if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
|
|
449
|
-
bus.emit('cluster.stopped', { timestamp: Date.now() });
|
|
450
|
-
}
|
|
451
|
-
|
|
452
|
-
// ─── Node Management ───────────────────────────────────────────────
|
|
453
|
-
|
|
454
|
-
/**
|
|
455
|
-
* Register a worker node to join the cluster
|
|
456
|
-
*/
|
|
457
|
-
registerNode(config) {
|
|
458
|
-
if (!config.name || !config.endpoint) {
|
|
459
|
-
throw new Error('Node name and endpoint required');
|
|
460
|
-
}
|
|
461
|
-
|
|
462
|
-
// Check for existing node with same endpoint
|
|
463
|
-
const existing = stmts.getNodeByEndpoint.get(config.endpoint);
|
|
464
|
-
if (existing) {
|
|
465
|
-
// Re-register: update and reactivate
|
|
466
|
-
stmts.updateNode.run({
|
|
467
|
-
id: existing.id,
|
|
468
|
-
name: config.name,
|
|
469
|
-
endpoint: config.endpoint,
|
|
470
|
-
region: config.region || existing.region,
|
|
471
|
-
zone: config.zone || existing.zone,
|
|
472
|
-
capacity_total: config.capacity || existing.capacity_total,
|
|
473
|
-
tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
|
|
474
|
-
hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
|
|
475
|
-
version: config.version || existing.version,
|
|
476
|
-
});
|
|
477
|
-
stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
|
|
478
|
-
logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
|
|
479
|
-
bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
|
|
480
|
-
return { nodeId: existing.id, status: 'active', rejoined: true };
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
|
|
484
|
-
const secretHash = crypto.createHash('sha256')
|
|
485
|
-
.update(config.secret || crypto.randomBytes(32).toString('hex'))
|
|
486
|
-
.digest('hex');
|
|
487
|
-
|
|
488
|
-
stmts.insertNode.run({
|
|
489
|
-
id: nodeId,
|
|
490
|
-
name: config.name,
|
|
491
|
-
endpoint: config.endpoint,
|
|
492
|
-
region: config.region || 'default',
|
|
493
|
-
zone: config.zone || 'a',
|
|
494
|
-
role: config.role || 'worker',
|
|
495
|
-
status: 'active',
|
|
496
|
-
capacity_total: config.capacity || 20,
|
|
497
|
-
tags: JSON.stringify(config.tags || []),
|
|
498
|
-
hardware: JSON.stringify(config.hardware || {}),
|
|
499
|
-
version: config.version || null,
|
|
500
|
-
secret_hash: secretHash,
|
|
501
|
-
});
|
|
502
|
-
|
|
503
|
-
logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
|
|
504
|
-
bus.emit('cluster.node.joined', { nodeId, name: config.name });
|
|
505
|
-
|
|
506
|
-
return { nodeId, status: 'active', secret: config.secret ? undefined : undefined };
|
|
507
|
-
}
|
|
508
|
-
|
|
509
|
-
/**
|
|
510
|
-
* Remove a node from the cluster
|
|
511
|
-
*/
|
|
512
|
-
deregisterNode(nodeId) {
|
|
513
|
-
const node = stmts.getNode.get(nodeId);
|
|
514
|
-
if (!node) return null;
|
|
515
|
-
|
|
516
|
-
// Reassign tasks before removing
|
|
517
|
-
const reassigned = this._distributor.reassignFromNode(nodeId);
|
|
518
|
-
stmts.deleteNode.run(nodeId);
|
|
519
|
-
|
|
520
|
-
logEvent('node.deregistered', nodeId, null, { reassigned });
|
|
521
|
-
bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });
|
|
522
|
-
|
|
523
|
-
return { nodeId, reassigned };
|
|
524
|
-
}
|
|
525
|
-
|
|
526
|
-
/**
|
|
527
|
-
* Process heartbeat from a worker node
|
|
528
|
-
*/
|
|
529
|
-
heartbeat(nodeId, data = {}) {
|
|
530
|
-
const node = stmts.getNode.get(nodeId);
|
|
531
|
-
if (!node) return null;
|
|
532
|
-
|
|
533
|
-
stmts.heartbeatNode.run({
|
|
534
|
-
id: nodeId,
|
|
535
|
-
capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
|
|
536
|
-
});
|
|
537
|
-
|
|
538
|
-
// Update hardware profile if provided
|
|
539
|
-
if (data.hardware) {
|
|
540
|
-
stmts.updateNode.run({
|
|
541
|
-
id: nodeId,
|
|
542
|
-
name: node.name,
|
|
543
|
-
endpoint: node.endpoint,
|
|
544
|
-
region: node.region,
|
|
545
|
-
zone: node.zone,
|
|
546
|
-
capacity_total: data.capacityTotal || node.capacity_total,
|
|
547
|
-
tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
|
|
548
|
-
hardware: JSON.stringify(data.hardware),
|
|
549
|
-
version: data.version || node.version,
|
|
550
|
-
});
|
|
551
|
-
}
|
|
552
|
-
|
|
553
|
-
return {
|
|
554
|
-
nodeId,
|
|
555
|
-
status: 'active',
|
|
556
|
-
pendingTasks: stmts.getPendingTasks.all(1).length > 0,
|
|
557
|
-
};
|
|
558
|
-
}
|
|
559
|
-
|
|
560
|
-
/**
|
|
561
|
-
* Drain a node — stop assigning new tasks, wait for running tasks to finish
|
|
562
|
-
*/
|
|
563
|
-
drainNode(nodeId) {
|
|
564
|
-
const node = stmts.getNode.get(nodeId);
|
|
565
|
-
if (!node) return null;
|
|
566
|
-
|
|
567
|
-
stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
|
|
568
|
-
logEvent('node.draining', nodeId, null, {});
|
|
569
|
-
bus.emit('cluster.node.draining', { nodeId, name: node.name });
|
|
570
|
-
|
|
571
|
-
return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
/**
|
|
575
|
-
* Cordon a node — prevent scheduling but keep running tasks
|
|
576
|
-
*/
|
|
577
|
-
cordonNode(nodeId) {
|
|
578
|
-
const node = stmts.getNode.get(nodeId);
|
|
579
|
-
if (!node) return null;
|
|
580
|
-
|
|
581
|
-
stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
|
|
582
|
-
logEvent('node.cordoned', nodeId, null, {});
|
|
583
|
-
bus.emit('cluster.node.cordoned', { nodeId, name: node.name });
|
|
584
|
-
|
|
585
|
-
return { nodeId, status: 'cordoned' };
|
|
586
|
-
}
|
|
587
|
-
|
|
588
|
-
/**
|
|
589
|
-
* Uncordon a node — allow scheduling again
|
|
590
|
-
*/
|
|
591
|
-
uncordonNode(nodeId) {
|
|
592
|
-
const node = stmts.getNode.get(nodeId);
|
|
593
|
-
if (!node) return null;
|
|
594
|
-
|
|
595
|
-
stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
|
|
596
|
-
logEvent('node.uncordoned', nodeId, null, {});
|
|
597
|
-
|
|
598
|
-
return { nodeId, status: 'active' };
|
|
599
|
-
}
|
|
600
|
-
|
|
601
|
-
/**
|
|
602
|
-
* Get node details
|
|
603
|
-
*/
|
|
604
|
-
getNode(nodeId) {
|
|
605
|
-
const node = stmts.getNode.get(nodeId);
|
|
606
|
-
if (!node) return null;
|
|
607
|
-
node.tags = safeParse(node.tags, []);
|
|
608
|
-
node.hardware = safeParse(node.hardware, {});
|
|
609
|
-
node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
|
|
610
|
-
return node;
|
|
611
|
-
}
|
|
612
|
-
|
|
613
|
-
/**
|
|
614
|
-
* List all cluster nodes
|
|
615
|
-
*/
|
|
616
|
-
listNodes(filter = {}) {
|
|
617
|
-
let nodes;
|
|
618
|
-
if (filter.region) {
|
|
619
|
-
nodes = stmts.listNodesByRegion.all(filter.region);
|
|
620
|
-
} else if (filter.active) {
|
|
621
|
-
nodes = stmts.listActiveNodes.all();
|
|
622
|
-
} else {
|
|
623
|
-
nodes = stmts.listNodes.all();
|
|
624
|
-
}
|
|
625
|
-
return nodes.map(n => ({
|
|
626
|
-
...n,
|
|
627
|
-
tags: safeParse(n.tags, []),
|
|
628
|
-
hardware: safeParse(n.hardware, {}),
|
|
629
|
-
}));
|
|
630
|
-
}
|
|
631
|
-
|
|
632
|
-
// ─── Task Reporting ─────────────────────────────────────────────────
|
|
633
|
-
|
|
634
|
-
/**
|
|
635
|
-
* Worker reports task started
|
|
636
|
-
*/
|
|
637
|
-
reportTaskStarted(taskId) {
|
|
638
|
-
const task = stmts.getTask.get(taskId);
|
|
639
|
-
if (!task) return null;
|
|
640
|
-
stmts.startTask.run(taskId);
|
|
641
|
-
logEvent('task.started', task.node_id, taskId, {});
|
|
642
|
-
bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
|
|
643
|
-
return { taskId, status: 'running' };
|
|
644
|
-
}
|
|
645
|
-
|
|
646
|
-
/**
|
|
647
|
-
* Worker reports task completed
|
|
648
|
-
*/
|
|
649
|
-
reportTaskCompleted(taskId, result) {
|
|
650
|
-
const task = stmts.getTask.get(taskId);
|
|
651
|
-
if (!task) return null;
|
|
652
|
-
|
|
653
|
-
stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
|
|
654
|
-
if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
|
|
655
|
-
|
|
656
|
-
logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
|
|
657
|
-
bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });
|
|
658
|
-
|
|
659
|
-
return { taskId, status: 'completed' };
|
|
660
|
-
}
|
|
661
|
-
|
|
662
|
-
/**
|
|
663
|
-
* Worker reports task failed
|
|
664
|
-
*/
|
|
665
|
-
reportTaskFailed(taskId, error) {
|
|
666
|
-
const task = stmts.getTask.get(taskId);
|
|
667
|
-
if (!task) return null;
|
|
668
|
-
|
|
669
|
-
if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
|
|
670
|
-
|
|
671
|
-
// Retry if attempts remaining
|
|
672
|
-
if (task.attempts < task.max_attempts) {
|
|
673
|
-
stmts.requeueTask.run(taskId);
|
|
674
|
-
logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
|
|
675
|
-
bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });
|
|
676
|
-
|
|
677
|
-
// Try to assign to a different node
|
|
678
|
-
this._distributor._tryAssign(taskId);
|
|
679
|
-
|
|
680
|
-
return { taskId, status: 'retrying', attempt: task.attempts };
|
|
681
|
-
}
|
|
682
|
-
|
|
683
|
-
// Max attempts reached
|
|
684
|
-
stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
|
|
685
|
-
logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
|
|
686
|
-
bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });
|
|
687
|
-
|
|
688
|
-
return { taskId, status: 'failed' };
|
|
689
|
-
}
|
|
690
|
-
|
|
691
|
-
/**
|
|
692
|
-
* Get task details
|
|
693
|
-
*/
|
|
694
|
-
getTask(taskId) {
|
|
695
|
-
const task = stmts.getTask.get(taskId);
|
|
696
|
-
if (!task) return null;
|
|
697
|
-
task.payload = safeParse(task.payload, {});
|
|
698
|
-
task.affinity_tags = safeParse(task.affinity_tags, []);
|
|
699
|
-
task.result = safeParse(task.result, null);
|
|
700
|
-
return task;
|
|
701
|
-
}
|
|
702
|
-
|
|
703
|
-
/**
|
|
704
|
-
* List tasks with optional status filter
|
|
705
|
-
*/
|
|
706
|
-
listTasks(filter = {}) {
|
|
707
|
-
let tasks;
|
|
708
|
-
if (filter.status) {
|
|
709
|
-
tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
|
|
710
|
-
} else if (filter.nodeId) {
|
|
711
|
-
tasks = stmts.getTasksByNode.all(filter.nodeId);
|
|
712
|
-
} else {
|
|
713
|
-
tasks = stmts.listTasks.all(filter.limit || 50);
|
|
714
|
-
}
|
|
715
|
-
return tasks.map(t => ({
|
|
716
|
-
...t,
|
|
717
|
-
payload: safeParse(t.payload, {}),
|
|
718
|
-
affinity_tags: safeParse(t.affinity_tags, []),
|
|
719
|
-
result: safeParse(t.result, null),
|
|
720
|
-
}));
|
|
721
|
-
}
|
|
722
|
-
|
|
723
|
-
// ─── Cluster Topology ───────────────────────────────────────────────
|
|
724
|
-
|
|
725
|
-
/**
|
|
726
|
-
* Get full cluster status
|
|
727
|
-
*/
|
|
728
|
-
getClusterStatus() {
|
|
729
|
-
const nodes = stmts.listNodes.all();
|
|
730
|
-
const taskCounts = {};
|
|
731
|
-
for (const row of stmts.countByStatus.all()) {
|
|
732
|
-
taskCounts[row.status] = row.count;
|
|
733
|
-
}
|
|
734
|
-
|
|
735
|
-
const activeNodes = nodes.filter(n => n.status === 'active');
|
|
736
|
-
const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
|
|
737
|
-
const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);
|
|
738
|
-
|
|
739
|
-
// Group by region
|
|
740
|
-
const regions = {};
|
|
741
|
-
for (const node of nodes) {
|
|
742
|
-
if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
|
|
743
|
-
regions[node.region].nodes++;
|
|
744
|
-
if (node.status === 'active') {
|
|
745
|
-
regions[node.region].active++;
|
|
746
|
-
regions[node.region].capacity += node.capacity_total;
|
|
747
|
-
regions[node.region].used += node.capacity_used;
|
|
748
|
-
}
|
|
749
|
-
}
|
|
750
|
-
|
|
751
|
-
return {
|
|
752
|
-
coordinator: { started: this._started },
|
|
753
|
-
nodes: {
|
|
754
|
-
total: nodes.length,
|
|
755
|
-
active: activeNodes.length,
|
|
756
|
-
draining: nodes.filter(n => n.status === 'draining').length,
|
|
757
|
-
cordoned: nodes.filter(n => n.status === 'cordoned').length,
|
|
758
|
-
dead: nodes.filter(n => n.status === 'dead').length,
|
|
759
|
-
},
|
|
760
|
-
capacity: {
|
|
761
|
-
total: totalCapacity,
|
|
762
|
-
used: usedCapacity,
|
|
763
|
-
available: totalCapacity - usedCapacity,
|
|
764
|
-
utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
|
|
765
|
-
},
|
|
766
|
-
tasks: taskCounts,
|
|
767
|
-
regions,
|
|
768
|
-
distributor: this._distributor.getStats(),
|
|
769
|
-
};
|
|
770
|
-
}
|
|
771
|
-
|
|
772
|
-
/**
|
|
773
|
-
* Get cluster events log
|
|
774
|
-
*/
|
|
775
|
-
getEvents(limit = 100, nodeId = null) {
|
|
776
|
-
if (nodeId) {
|
|
777
|
-
return stmts.getEventsByNode.all(nodeId, limit).map(e => ({
|
|
778
|
-
...e,
|
|
779
|
-
data: safeParse(e.data, {}),
|
|
780
|
-
}));
|
|
781
|
-
}
|
|
782
|
-
return stmts.getEvents.all(limit).map(e => ({
|
|
783
|
-
...e,
|
|
784
|
-
data: safeParse(e.data, {}),
|
|
785
|
-
}));
|
|
786
|
-
}
|
|
787
|
-
|
|
788
|
-
// ─── Internal Operations ────────────────────────────────────────────
|
|
789
|
-
|
|
790
|
-
/**
|
|
791
|
-
* Check for dead nodes and failover their tasks
|
|
792
|
-
*/
|
|
793
|
-
_healthCheck() {
|
|
794
|
-
const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);
|
|
795
|
-
|
|
796
|
-
for (const node of staleNodes) {
|
|
797
|
-
stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
|
|
798
|
-
logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
|
|
799
|
-
bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });
|
|
800
|
-
|
|
801
|
-
// Failover: reassign all tasks from dead node
|
|
802
|
-
const reassigned = this._distributor.reassignFromNode(node.id);
|
|
803
|
-
logEvent('node.failover', node.id, null, { reassigned });
|
|
804
|
-
bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
|
|
805
|
-
}
|
|
806
|
-
}
|
|
807
|
-
|
|
808
|
-
/**
|
|
809
|
-
* Recover tasks that have been assigned/running too long (stuck)
|
|
810
|
-
*/
|
|
811
|
-
_recoverStuckTasks() {
|
|
812
|
-
const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold
|
|
813
|
-
|
|
814
|
-
for (const task of stuckTasks) {
|
|
815
|
-
if (task.attempts >= task.max_attempts) {
|
|
816
|
-
stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
|
|
817
|
-
if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
|
|
818
|
-
logEvent('task.stuck_failed', task.node_id, task.id, {});
|
|
819
|
-
} else {
|
|
820
|
-
if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
|
|
821
|
-
stmts.requeueTask.run(task.id);
|
|
822
|
-
this._distributor._tryAssign(task.id);
|
|
823
|
-
logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
|
|
824
|
-
}
|
|
825
|
-
}
|
|
826
|
-
}
|
|
827
|
-
|
|
828
|
-
/**
|
|
829
|
-
* Rebalance tasks across nodes when load is skewed
|
|
830
|
-
*/
|
|
831
|
-
_rebalance() {
|
|
832
|
-
const nodes = stmts.listActiveNodes.all();
|
|
833
|
-
if (nodes.length < 2) return;
|
|
834
|
-
|
|
835
|
-
const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
|
|
836
|
-
const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
|
|
837
|
-
const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);
|
|
838
|
-
|
|
839
|
-
if (overloaded.length === 0 || underloaded.length === 0) return;
|
|
840
|
-
|
|
841
|
-
let moved = 0;
|
|
842
|
-
for (const over of overloaded) {
|
|
843
|
-
const tasks = stmts.getTasksByNode.all(over.id);
|
|
844
|
-
// Move up to 2 tasks from overloaded to underloaded
|
|
845
|
-
const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);
|
|
846
|
-
|
|
847
|
-
for (const task of toMove) {
|
|
848
|
-
const target = underloaded.find(n => n.capacity_used < n.capacity_total);
|
|
849
|
-
if (!target) break;
|
|
850
|
-
|
|
851
|
-
stmts.decrementNodeLoad.run(over.id);
|
|
852
|
-
stmts.assignTask.run({ id: task.id, node_id: target.id });
|
|
853
|
-
stmts.incrementNodeLoad.run(target.id);
|
|
854
|
-
target.capacity_used++;
|
|
855
|
-
moved++;
|
|
856
|
-
|
|
857
|
-
logEvent('task.rebalanced', target.id, task.id, { from: over.id });
|
|
858
|
-
this._distributor._notifyWorker(target, task.id, task);
|
|
859
|
-
}
|
|
860
|
-
}
|
|
861
|
-
|
|
862
|
-
if (moved > 0) {
|
|
863
|
-
bus.emit('cluster.rebalanced', { tasksMoved: moved });
|
|
864
|
-
}
|
|
865
|
-
}
|
|
866
|
-
}
|
|
867
|
-
|
|
868
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
869
|
-
// HELPERS
|
|
870
|
-
// ═══════════════════════════════════════════════════════════════════════════
|
|
871
|
-
|
|
872
|
-
function safeParse(str, fallback) {
|
|
873
|
-
if (str == null) return fallback;
|
|
874
|
-
if (typeof str === 'object') return str;
|
|
875
|
-
try { return JSON.parse(str); } catch { return fallback; }
|
|
876
|
-
}
|
|
877
|
-
|
|
878
|
-
function logEvent(type, nodeId, taskId, data) {
|
|
879
|
-
try {
|
|
880
|
-
stmts.insertEvent.run({
|
|
881
|
-
event_type: type,
|
|
882
|
-
node_id: nodeId || null,
|
|
883
|
-
task_id: taskId || null,
|
|
884
|
-
data: JSON.stringify(data || {}),
|
|
885
|
-
});
|
|
886
|
-
} catch { /* best-effort logging */ }
|
|
887
|
-
}
|
|
888
|
-
|
|
889
|
-
// ─── Singleton ───────────────────────────────────────────────────────
|
|
890
|
-
|
|
891
|
-
const distributor = new TaskDistributor();
|
|
892
|
-
const cluster = new ClusterOrchestrator(distributor);
|
|
893
|
-
|
|
894
|
-
module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* WAB Cluster — Distributed Execution, Worker Nodes & Cluster Orchestration
|
|
5
|
+
*
|
|
6
|
+
* Turns WAB from a single-server Agent OS into a distributed fleet.
|
|
7
|
+
*
|
|
8
|
+
* Architecture:
|
|
9
|
+
* ┌──────────────┐ ┌──────────┐ ┌──────────┐
|
|
10
|
+
* │ Coordinator │────▶│ Worker-1 │ │ Worker-2 │
|
|
11
|
+
* │ (this node) │────▶│ (remote) │ │ (remote) │
|
|
12
|
+
* │ │────▶│ │ │ │
|
|
13
|
+
* └──────────────┘ └──────────┘ └──────────┘
|
|
14
|
+
* │ ▲ ▲
|
|
15
|
+
* │ │ │
|
|
16
|
+
* └───────────────────┴─────────────────┘
|
|
17
|
+
* heartbeat / task results
|
|
18
|
+
*
|
|
19
|
+
* Components:
|
|
20
|
+
* 1. WorkerNode — A remote execution node that connects, heartbeats, runs tasks
|
|
21
|
+
* 2. TaskDistributor — Routes tasks to workers based on capacity/affinity/load
|
|
22
|
+
* 3. ClusterOrchestrator — Fleet management, auto-scaling, failover, rebalancing
|
|
23
|
+
*
|
|
24
|
+
* Communication: HTTP/JSON between nodes (pull-based + push notifications)
|
|
25
|
+
* Persistence: SQLite tables for durability across restarts
|
|
26
|
+
* Consistency: Leader-based (coordinator is source of truth)
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
const crypto = require('crypto');
|
|
30
|
+
const http = require('http');
|
|
31
|
+
const https = require('https');
|
|
32
|
+
const { URL } = require('url');
|
|
33
|
+
const { db } = require('../models/db');
|
|
34
|
+
const { bus } = require('../runtime/event-bus');
|
|
35
|
+
|
|
36
|
+
// ─── Schema ──────────────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
db.exec(`
|
|
39
|
+
CREATE TABLE IF NOT EXISTS cluster_nodes (
|
|
40
|
+
id TEXT PRIMARY KEY,
|
|
41
|
+
name TEXT NOT NULL,
|
|
42
|
+
endpoint TEXT NOT NULL,
|
|
43
|
+
region TEXT DEFAULT 'default',
|
|
44
|
+
zone TEXT DEFAULT 'a',
|
|
45
|
+
role TEXT DEFAULT 'worker',
|
|
46
|
+
status TEXT DEFAULT 'joining',
|
|
47
|
+
capacity_total INTEGER DEFAULT 20,
|
|
48
|
+
capacity_used INTEGER DEFAULT 0,
|
|
49
|
+
tags TEXT DEFAULT '[]',
|
|
50
|
+
hardware TEXT DEFAULT '{}',
|
|
51
|
+
version TEXT,
|
|
52
|
+
secret_hash TEXT,
|
|
53
|
+
last_heartbeat TEXT DEFAULT (datetime('now')),
|
|
54
|
+
registered_at TEXT DEFAULT (datetime('now')),
|
|
55
|
+
updated_at TEXT DEFAULT (datetime('now'))
|
|
56
|
+
);
|
|
57
|
+
|
|
58
|
+
CREATE TABLE IF NOT EXISTS cluster_tasks (
|
|
59
|
+
id TEXT PRIMARY KEY,
|
|
60
|
+
external_id TEXT,
|
|
61
|
+
node_id TEXT,
|
|
62
|
+
task_type TEXT NOT NULL,
|
|
63
|
+
objective TEXT,
|
|
64
|
+
payload TEXT DEFAULT '{}',
|
|
65
|
+
priority INTEGER DEFAULT 50,
|
|
66
|
+
status TEXT DEFAULT 'pending',
|
|
67
|
+
result TEXT,
|
|
68
|
+
error TEXT,
|
|
69
|
+
attempts INTEGER DEFAULT 0,
|
|
70
|
+
max_attempts INTEGER DEFAULT 3,
|
|
71
|
+
affinity_tags TEXT DEFAULT '[]',
|
|
72
|
+
affinity_region TEXT,
|
|
73
|
+
timeout_ms INTEGER DEFAULT 60000,
|
|
74
|
+
submitted_at TEXT DEFAULT (datetime('now')),
|
|
75
|
+
assigned_at TEXT,
|
|
76
|
+
started_at TEXT,
|
|
77
|
+
completed_at TEXT
|
|
78
|
+
);
|
|
79
|
+
|
|
80
|
+
CREATE TABLE IF NOT EXISTS cluster_events (
|
|
81
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
82
|
+
event_type TEXT NOT NULL,
|
|
83
|
+
node_id TEXT,
|
|
84
|
+
task_id TEXT,
|
|
85
|
+
data TEXT DEFAULT '{}',
|
|
86
|
+
created_at TEXT DEFAULT (datetime('now'))
|
|
87
|
+
);
|
|
88
|
+
|
|
89
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_status ON cluster_nodes(status);
|
|
90
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_nodes_region ON cluster_nodes(region);
|
|
91
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_tasks_status ON cluster_tasks(status);
|
|
92
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_tasks_node ON cluster_tasks(node_id);
|
|
93
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_tasks_priority ON cluster_tasks(priority DESC);
|
|
94
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_events_type ON cluster_events(event_type);
|
|
95
|
+
CREATE INDEX IF NOT EXISTS idx_cluster_events_node ON cluster_events(node_id);
|
|
96
|
+
`);
|
|
97
|
+
|
|
98
|
+
// ─── Prepared Statements ─────────────────────────────────────────────
|
|
99
|
+
|
|
100
|
+
const stmts = {
|
|
101
|
+
// Nodes
|
|
102
|
+
insertNode: db.prepare(`
|
|
103
|
+
INSERT INTO cluster_nodes (id, name, endpoint, region, zone, role, status, capacity_total, tags, hardware, version, secret_hash)
|
|
104
|
+
VALUES (@id, @name, @endpoint, @region, @zone, @role, @status, @capacity_total, @tags, @hardware, @version, @secret_hash)
|
|
105
|
+
`),
|
|
106
|
+
updateNode: db.prepare(`
|
|
107
|
+
UPDATE cluster_nodes SET name=@name, endpoint=@endpoint, region=@region, zone=@zone,
|
|
108
|
+
capacity_total=@capacity_total, tags=@tags, hardware=@hardware, version=@version, updated_at=datetime('now')
|
|
109
|
+
WHERE id=@id
|
|
110
|
+
`),
|
|
111
|
+
setNodeStatus: db.prepare(`UPDATE cluster_nodes SET status=@status, updated_at=datetime('now') WHERE id=@id`),
|
|
112
|
+
heartbeatNode: db.prepare(`
|
|
113
|
+
UPDATE cluster_nodes SET last_heartbeat=datetime('now'), capacity_used=@capacity_used, status='active', updated_at=datetime('now')
|
|
114
|
+
WHERE id=@id
|
|
115
|
+
`),
|
|
116
|
+
getNode: db.prepare(`SELECT * FROM cluster_nodes WHERE id=?`),
|
|
117
|
+
getNodeByEndpoint: db.prepare(`SELECT * FROM cluster_nodes WHERE endpoint=?`),
|
|
118
|
+
listNodes: db.prepare(`SELECT * FROM cluster_nodes ORDER BY registered_at DESC`),
|
|
119
|
+
listActiveNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' ORDER BY capacity_used ASC`),
|
|
120
|
+
listNodesByRegion: db.prepare(`SELECT * FROM cluster_nodes WHERE region=? AND status='active' ORDER BY capacity_used ASC`),
|
|
121
|
+
deleteNode: db.prepare(`DELETE FROM cluster_nodes WHERE id=?`),
|
|
122
|
+
getStaleNodes: db.prepare(`SELECT * FROM cluster_nodes WHERE status='active' AND last_heartbeat < datetime('now', '-' || ? || ' seconds')`),
|
|
123
|
+
|
|
124
|
+
// Tasks
|
|
125
|
+
insertTask: db.prepare(`
|
|
126
|
+
INSERT INTO cluster_tasks (id, external_id, task_type, objective, payload, priority, status, affinity_tags, affinity_region, timeout_ms, max_attempts)
|
|
127
|
+
VALUES (@id, @external_id, @task_type, @objective, @payload, @priority, @status, @affinity_tags, @affinity_region, @timeout_ms, @max_attempts)
|
|
128
|
+
`),
|
|
129
|
+
assignTask: db.prepare(`
|
|
130
|
+
UPDATE cluster_tasks SET node_id=@node_id, status='assigned', assigned_at=datetime('now'), attempts=attempts+1
|
|
131
|
+
WHERE id=@id
|
|
132
|
+
`),
|
|
133
|
+
startTask: db.prepare(`UPDATE cluster_tasks SET status='running', started_at=datetime('now') WHERE id=?`),
|
|
134
|
+
completeTask: db.prepare(`
|
|
135
|
+
UPDATE cluster_tasks SET status='completed', result=@result, completed_at=datetime('now') WHERE id=@id
|
|
136
|
+
`),
|
|
137
|
+
failTask: db.prepare(`
|
|
138
|
+
UPDATE cluster_tasks SET status='failed', error=@error, completed_at=datetime('now') WHERE id=@id
|
|
139
|
+
`),
|
|
140
|
+
requeueTask: db.prepare(`UPDATE cluster_tasks SET status='pending', node_id=NULL, assigned_at=NULL WHERE id=?`),
|
|
141
|
+
getTask: db.prepare(`SELECT * FROM cluster_tasks WHERE id=?`),
|
|
142
|
+
getTaskByExternal: db.prepare(`SELECT * FROM cluster_tasks WHERE external_id=?`),
|
|
143
|
+
getPendingTasks: db.prepare(`SELECT * FROM cluster_tasks WHERE status='pending' ORDER BY priority DESC, submitted_at ASC LIMIT ?`),
|
|
144
|
+
getTasksByNode: db.prepare(`SELECT * FROM cluster_tasks WHERE node_id=? AND status IN ('assigned','running') ORDER BY priority DESC`),
|
|
145
|
+
getTasksByStatus: db.prepare(`SELECT * FROM cluster_tasks WHERE status=? ORDER BY submitted_at DESC LIMIT ?`),
|
|
146
|
+
listTasks: db.prepare(`SELECT * FROM cluster_tasks ORDER BY submitted_at DESC LIMIT ?`),
|
|
147
|
+
getStuckTasks: db.prepare(`
|
|
148
|
+
SELECT * FROM cluster_tasks WHERE status IN ('assigned','running')
|
|
149
|
+
AND assigned_at < datetime('now', '-' || ? || ' seconds')
|
|
150
|
+
`),
|
|
151
|
+
countByStatus: db.prepare(`SELECT status, COUNT(*) as count FROM cluster_tasks GROUP BY status`),
|
|
152
|
+
incrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = capacity_used + 1 WHERE id=?`),
|
|
153
|
+
decrementNodeLoad: db.prepare(`UPDATE cluster_nodes SET capacity_used = MAX(0, capacity_used - 1) WHERE id=?`),
|
|
154
|
+
|
|
155
|
+
// Events
|
|
156
|
+
insertEvent: db.prepare(`INSERT INTO cluster_events (event_type, node_id, task_id, data) VALUES (@event_type, @node_id, @task_id, @data)`),
|
|
157
|
+
getEvents: db.prepare(`SELECT * FROM cluster_events ORDER BY id DESC LIMIT ?`),
|
|
158
|
+
getEventsByNode: db.prepare(`SELECT * FROM cluster_events WHERE node_id=? ORDER BY id DESC LIMIT ?`),
|
|
159
|
+
};
|
|
160
|
+
|
|
// ═══════════════════════════════════════════════════════════════════════════
// TASK DISTRIBUTOR
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Routes tasks to worker nodes based on capacity, affinity, and load balancing.
 *
 * Strategies:
 * - least-loaded: Pick the node with the most free capacity
 * - affinity: Match task tags to node tags
 * - region: Prefer nodes in the same region as the task
 * - round-robin: Distribute evenly across all active nodes (declared but not
 *   yet applied by _selectNode)
 */
class TaskDistributor {
  constructor() {
    this._roundRobinIndex = 0; // reserved for the round-robin strategy
    this._stats = { distributed: 0, reassigned: 0, noCapacity: 0 };
  }

  /**
   * Submit a task for distributed execution
   */
  submit(task) {
    const id = task.id || `ct_${crypto.randomBytes(12).toString('hex')}`;
    const entry = {
      id,
      external_id: task.externalId || null,
      task_type: task.type || 'general',
      objective: task.objective || '',
      payload: JSON.stringify(task.params || {}),
      priority: task.priority || 50,
      status: 'pending',
      affinity_tags: JSON.stringify(task.affinityTags || []),
      affinity_region: task.affinityRegion || null,
      timeout_ms: task.timeout || 60000,
      max_attempts: task.maxAttempts || 3,
    };
    stmts.insertTask.run(entry);

    bus.emit('cluster.task.submitted', { taskId: id, type: entry.task_type, priority: entry.priority });
    this._stats.distributed++;

    // Try immediate assignment and reflect the outcome in the receipt
    const assigned = this._tryAssign(id);

    return { taskId: id, status: assigned ? 'assigned' : 'pending' };
  }
  /**
   * Try to assign a task to a worker node
   */
  _tryAssign(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task || task.status !== 'pending') return false;

    const node = this._selectNode(task);
    if (!node) {
      this._stats.noCapacity++;
      return false;
    }

    stmts.assignTask.run({ id: taskId, node_id: node.id });
    stmts.incrementNodeLoad.run(node.id);

    logEvent('task.assigned', node.id, taskId, { strategy: this._lastStrategy });
    bus.emit('cluster.task.assigned', { taskId, nodeId: node.id });

    // Push notification to worker (fire-and-forget)
    this._notifyWorker(node, taskId, task);

    return true;
  }
  /**
   * Select the best node for a task
   */
  _selectNode(task) {
    this._lastStrategy = null; // reset so a stale value from a prior call is not logged

    let candidates = stmts.listActiveNodes.all();
    if (candidates.length === 0) return null;

    // Filter by capacity
    candidates = candidates.filter(n => n.capacity_used < n.capacity_total);
    if (candidates.length === 0) return null;

    const affinityTags = safeParse(task.affinity_tags, []);
    const affinityRegion = task.affinity_region;

    // Strategy 1: Region affinity
    if (affinityRegion) {
      const regionNodes = candidates.filter(n => n.region === affinityRegion);
      if (regionNodes.length > 0) {
        candidates = regionNodes;
        this._lastStrategy = 'region';
      }
    }

    // Strategy 2: Tag affinity
    if (affinityTags.length > 0) {
      const tagged = candidates.filter(n => {
        const nodeTags = safeParse(n.tags, []);
        return affinityTags.some(t => nodeTags.includes(t));
      });
      if (tagged.length > 0) {
        candidates = tagged;
        this._lastStrategy = 'affinity';
      }
    }

    // Strategy 3: Least-loaded (also acts as tie-breaker within the filtered set)
    candidates.sort((a, b) => {
      const loadA = a.capacity_used / a.capacity_total;
      const loadB = b.capacity_used / b.capacity_total;
      return loadA - loadB;
    });

    this._lastStrategy = this._lastStrategy || 'least-loaded';
    return candidates[0];
  }
  /**
   * Push task notification to a worker node
   */
  _notifyWorker(node, taskId, task) {
    const payload = JSON.stringify({
      type: 'task.assigned',
      taskId,
      taskType: task.task_type,
      objective: task.objective,
      params: safeParse(task.payload, {}),
      priority: task.priority,
      timeout: task.timeout_ms,
    });

    const url = new URL('/wab-worker/tasks/notify', node.endpoint);
    const mod = url.protocol === 'https:' ? https : http;

    const req = mod.request(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json', 'Content-Length': Buffer.byteLength(payload) },
      timeout: 5000,
    });
    req.on('error', () => { /* best-effort push */ });
    req.on('timeout', () => req.destroy()); // the timeout option alone does not abort the request
    req.write(payload);
    req.end();
  }
  /**
   * Reassign tasks from a dead node to other nodes
   */
  reassignFromNode(nodeId) {
    const tasks = stmts.getTasksByNode.all(nodeId);
    let reassigned = 0;

    for (const task of tasks) {
      if (task.attempts >= task.max_attempts) {
        stmts.decrementNodeLoad.run(nodeId); // release the slot even on terminal failure
        stmts.failTask.run({ id: task.id, error: 'Node died, max attempts reached' });
        logEvent('task.failed', nodeId, task.id, { reason: 'node_death' });
        bus.emit('cluster.task.failed', { taskId: task.id, reason: 'node_death' });
        continue;
      }

      stmts.decrementNodeLoad.run(nodeId);
      stmts.requeueTask.run(task.id);
      logEvent('task.requeued', nodeId, task.id, { attempt: task.attempts });

      // Try to assign to another node
      if (this._tryAssign(task.id)) {
        reassigned++;
        this._stats.reassigned++;
      }
    }

    return reassigned;
  }
  /**
   * Process pending tasks — called periodically
   */
  processPending() {
    const pending = stmts.getPendingTasks.all(50);
    let assigned = 0;
    for (const task of pending) {
      if (this._tryAssign(task.id)) assigned++;
    }
    return assigned;
  }

  /**
   * Worker pulls tasks for execution
   */
  pullTasks(nodeId, limit = 5) {
    const node = stmts.getNode.get(nodeId);
    if (!node || node.status !== 'active') return [];

    const available = node.capacity_total - node.capacity_used;
    if (available <= 0) return [];

    const count = Math.min(limit, available);
    const pending = stmts.getPendingTasks.all(count);
    const assigned = [];

    for (const task of pending) {
      // Check affinity
      const affinityRegion = task.affinity_region;
      if (affinityRegion && node.region !== affinityRegion) continue;

      const affinityTags = safeParse(task.affinity_tags, []);
      const nodeTags = safeParse(node.tags, []);
      if (affinityTags.length > 0 && !affinityTags.some(t => nodeTags.includes(t))) continue;

      stmts.assignTask.run({ id: task.id, node_id: nodeId });
      stmts.incrementNodeLoad.run(nodeId);
      logEvent('task.assigned', nodeId, task.id, { strategy: 'pull' });

      assigned.push({
        taskId: task.id,
        type: task.task_type,
        objective: task.objective,
        params: safeParse(task.payload, {}),
        priority: task.priority,
        timeout: task.timeout_ms,
      });
    }

    return assigned;
  }

  getStats() { return { ...this._stats }; }
}
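
// Usage sketch (illustrative comment, not executed): submitting work through
// the distributor and pulling it from a worker. `distributor` is the singleton
// created at the bottom of this file; the node id is hypothetical and would
// normally come from ClusterOrchestrator.registerNode() below.
//
//   const receipt = distributor.submit({
//     type: 'scrape',
//     objective: 'Fetch pricing page',
//     params: { url: 'https://example.com/pricing' },
//     affinityTags: ['headless'],
//     priority: 80,
//   });
//   // => { taskId: 'ct_<hex>', status: 'assigned' | 'pending' }
//
//   // A worker with spare capacity can also poll for work:
//   const batch = distributor.pullTasks('node_abc123', 5);
//   // => [{ taskId, type, objective, params, priority, timeout }, ...]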
// ═══════════════════════════════════════════════════════════════════════════
// CLUSTER ORCHESTRATOR
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Fleet management — lifecycle, health, auto-scaling, failover, rebalancing.
 *
 * Responsibilities:
 * - Node registration and authentication
 * - Health monitoring via heartbeats
 * - Dead node detection and task failover
 * - Load rebalancing across the cluster
 * - Cluster topology and status reporting
 * - Drain and cordon operations
 */
class ClusterOrchestrator {
  constructor(distributor) {
    this._distributor = distributor;
    this._heartbeatThresholdSec = 90; // node considered dead after 90s without a heartbeat
    this._checkInterval = null;
    this._rebalanceInterval = null;
    this._started = false;
  }
  // ─── Lifecycle ──────────────────────────────────────────────────────

  /**
   * Start the orchestrator — begins periodic health checks and task processing
   */
  start() {
    if (this._started) return;
    this._started = true;

    // Health check every 30s
    this._checkInterval = setInterval(() => {
      this._healthCheck();
      this._recoverStuckTasks();
      this._distributor.processPending();
    }, 30_000);
    if (this._checkInterval.unref) this._checkInterval.unref();

    // Rebalance every 5 min
    this._rebalanceInterval = setInterval(() => {
      this._rebalance();
    }, 300_000);
    if (this._rebalanceInterval.unref) this._rebalanceInterval.unref();

    bus.emit('cluster.started', { timestamp: Date.now() });
  }

  /**
   * Stop the orchestrator
   */
  stop() {
    if (!this._started) return;
    this._started = false;
    if (this._checkInterval) clearInterval(this._checkInterval);
    if (this._rebalanceInterval) clearInterval(this._rebalanceInterval);
    bus.emit('cluster.stopped', { timestamp: Date.now() });
  }
  // ─── Node Management ───────────────────────────────────────────────

  /**
   * Register a worker node to join the cluster
   */
  registerNode(config) {
    if (!config.name || !config.endpoint) {
      throw new Error('Node name and endpoint required');
    }

    // Check for existing node with same endpoint
    const existing = stmts.getNodeByEndpoint.get(config.endpoint);
    if (existing) {
      // Re-register: update and reactivate
      stmts.updateNode.run({
        id: existing.id,
        name: config.name,
        endpoint: config.endpoint,
        region: config.region || existing.region,
        zone: config.zone || existing.zone,
        capacity_total: config.capacity || existing.capacity_total,
        tags: JSON.stringify(config.tags || safeParse(existing.tags, [])),
        hardware: JSON.stringify(config.hardware || safeParse(existing.hardware, {})),
        version: config.version || existing.version,
      });
      stmts.setNodeStatus.run({ id: existing.id, status: 'active' });
      logEvent('node.re-registered', existing.id, null, { endpoint: config.endpoint });
      bus.emit('cluster.node.joined', { nodeId: existing.id, name: config.name, rejoined: true });
      return { nodeId: existing.id, status: 'active', rejoined: true };
    }

    const nodeId = `node_${crypto.randomBytes(12).toString('hex')}`;
    // Keep the raw secret so a server-generated one can be returned to the
    // caller; only the hash is persisted.
    const rawSecret = config.secret || crypto.randomBytes(32).toString('hex');
    const secretHash = crypto.createHash('sha256').update(rawSecret).digest('hex');

    stmts.insertNode.run({
      id: nodeId,
      name: config.name,
      endpoint: config.endpoint,
      region: config.region || 'default',
      zone: config.zone || 'a',
      role: config.role || 'worker',
      status: 'active',
      capacity_total: config.capacity || 20,
      tags: JSON.stringify(config.tags || []),
      hardware: JSON.stringify(config.hardware || {}),
      version: config.version || null,
      secret_hash: secretHash,
    });

    logEvent('node.registered', nodeId, null, { name: config.name, endpoint: config.endpoint, region: config.region });
    bus.emit('cluster.node.joined', { nodeId, name: config.name });

    // Return a generated secret exactly once; callers that supplied their own get nothing back
    return { nodeId, status: 'active', secret: config.secret ? undefined : rawSecret };
  }
  /**
   * Remove a node from the cluster
   */
  deregisterNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    // Reassign tasks before removing
    const reassigned = this._distributor.reassignFromNode(nodeId);
    stmts.deleteNode.run(nodeId);

    logEvent('node.deregistered', nodeId, null, { reassigned });
    bus.emit('cluster.node.left', { nodeId, name: node.name, tasksReassigned: reassigned });

    return { nodeId, reassigned };
  }

  /**
   * Process heartbeat from a worker node
   */
  heartbeat(nodeId, data = {}) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.heartbeatNode.run({
      id: nodeId,
      capacity_used: data.capacityUsed != null ? data.capacityUsed : node.capacity_used,
    });

    // Update hardware profile if provided
    if (data.hardware) {
      stmts.updateNode.run({
        id: nodeId,
        name: node.name,
        endpoint: node.endpoint,
        region: node.region,
        zone: node.zone,
        capacity_total: data.capacityTotal || node.capacity_total,
        tags: JSON.stringify(data.tags || safeParse(node.tags, [])),
        hardware: JSON.stringify(data.hardware),
        version: data.version || node.version,
      });
    }

    return {
      nodeId,
      status: 'active',
      pendingTasks: stmts.getPendingTasks.all(1).length > 0,
    };
  }
  /**
   * Drain a node — stop assigning new tasks, wait for running tasks to finish
   */
  drainNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'draining' });
    logEvent('node.draining', nodeId, null, {});
    bus.emit('cluster.node.draining', { nodeId, name: node.name });

    return { nodeId, status: 'draining', activeTasks: stmts.getTasksByNode.all(nodeId).length };
  }

  /**
   * Cordon a node — prevent scheduling but keep running tasks
   */
  cordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'cordoned' });
    logEvent('node.cordoned', nodeId, null, {});
    bus.emit('cluster.node.cordoned', { nodeId, name: node.name });

    return { nodeId, status: 'cordoned' };
  }

  /**
   * Uncordon a node — allow scheduling again
   */
  uncordonNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;

    stmts.setNodeStatus.run({ id: nodeId, status: 'active' });
    logEvent('node.uncordoned', nodeId, null, {});
    bus.emit('cluster.node.uncordoned', { nodeId, name: node.name }); // emitted for symmetry with drain/cordon

    return { nodeId, status: 'active' };
  }
  /**
   * Get node details
   */
  getNode(nodeId) {
    const node = stmts.getNode.get(nodeId);
    if (!node) return null;
    node.tags = safeParse(node.tags, []);
    node.hardware = safeParse(node.hardware, {});
    node.activeTasks = stmts.getTasksByNode.all(nodeId).length;
    return node;
  }

  /**
   * List all cluster nodes
   */
  listNodes(filter = {}) {
    let nodes;
    if (filter.region) {
      nodes = stmts.listNodesByRegion.all(filter.region);
    } else if (filter.active) {
      nodes = stmts.listActiveNodes.all();
    } else {
      nodes = stmts.listNodes.all();
    }
    return nodes.map(n => ({
      ...n,
      tags: safeParse(n.tags, []),
      hardware: safeParse(n.hardware, {}),
    }));
  }
  // ─── Task Reporting ─────────────────────────────────────────────────

  /**
   * Worker reports task started
   */
  reportTaskStarted(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    stmts.startTask.run(taskId);
    logEvent('task.started', task.node_id, taskId, {});
    bus.emit('cluster.task.started', { taskId, nodeId: task.node_id });
    return { taskId, status: 'running' };
  }

  /**
   * Worker reports task completed
   */
  reportTaskCompleted(taskId, result) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    stmts.completeTask.run({ id: taskId, result: JSON.stringify(result || {}) });
    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    logEvent('task.completed', task.node_id, taskId, { hasResult: !!result });
    bus.emit('cluster.task.completed', { taskId, nodeId: task.node_id, result });

    return { taskId, status: 'completed' };
  }

  /**
   * Worker reports task failed
   */
  reportTaskFailed(taskId, error) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;

    if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);

    // Retry if attempts remaining
    if (task.attempts < task.max_attempts) {
      stmts.requeueTask.run(taskId);
      logEvent('task.retrying', task.node_id, taskId, { attempt: task.attempts, error });
      bus.emit('cluster.task.retrying', { taskId, attempt: task.attempts });

      // Try to assign to a different node
      this._distributor._tryAssign(taskId);

      return { taskId, status: 'retrying', attempt: task.attempts };
    }

    // Max attempts reached
    stmts.failTask.run({ id: taskId, error: typeof error === 'string' ? error : JSON.stringify(error) });
    logEvent('task.failed', task.node_id, taskId, { error, attempts: task.attempts });
    bus.emit('cluster.task.failed', { taskId, error, nodeId: task.node_id });

    return { taskId, status: 'failed' };
  }

  /**
   * Get task details
   */
  getTask(taskId) {
    const task = stmts.getTask.get(taskId);
    if (!task) return null;
    task.payload = safeParse(task.payload, {});
    task.affinity_tags = safeParse(task.affinity_tags, []);
    task.result = safeParse(task.result, null);
    return task;
  }

  /**
   * List tasks with optional status filter
   */
  listTasks(filter = {}) {
    let tasks;
    if (filter.status) {
      tasks = stmts.getTasksByStatus.all(filter.status, filter.limit || 50);
    } else if (filter.nodeId) {
      tasks = stmts.getTasksByNode.all(filter.nodeId);
    } else {
      tasks = stmts.listTasks.all(filter.limit || 50);
    }
    return tasks.map(t => ({
      ...t,
      payload: safeParse(t.payload, {}),
      affinity_tags: safeParse(t.affinity_tags, []),
      result: safeParse(t.result, null),
    }));
  }
  // ─── Cluster Topology ───────────────────────────────────────────────

  /**
   * Get full cluster status
   */
  getClusterStatus() {
    const nodes = stmts.listNodes.all();
    const taskCounts = {};
    for (const row of stmts.countByStatus.all()) {
      taskCounts[row.status] = row.count;
    }

    const activeNodes = nodes.filter(n => n.status === 'active');
    const totalCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_total, 0);
    const usedCapacity = activeNodes.reduce((sum, n) => sum + n.capacity_used, 0);

    // Group by region
    const regions = {};
    for (const node of nodes) {
      if (!regions[node.region]) regions[node.region] = { nodes: 0, active: 0, capacity: 0, used: 0 };
      regions[node.region].nodes++;
      if (node.status === 'active') {
        regions[node.region].active++;
        regions[node.region].capacity += node.capacity_total;
        regions[node.region].used += node.capacity_used;
      }
    }

    return {
      coordinator: { started: this._started },
      nodes: {
        total: nodes.length,
        active: activeNodes.length,
        draining: nodes.filter(n => n.status === 'draining').length,
        cordoned: nodes.filter(n => n.status === 'cordoned').length,
        dead: nodes.filter(n => n.status === 'dead').length,
      },
      capacity: {
        total: totalCapacity,
        used: usedCapacity,
        available: totalCapacity - usedCapacity,
        utilization: totalCapacity > 0 ? Math.round((usedCapacity / totalCapacity) * 100) : 0,
      },
      tasks: taskCounts,
      regions,
      distributor: this._distributor.getStats(),
    };
  }

  /**
   * Get cluster events log
   */
  getEvents(limit = 100, nodeId = null) {
    const rows = nodeId
      ? stmts.getEventsByNode.all(nodeId, limit)
      : stmts.getEvents.all(limit);
    return rows.map(e => ({ ...e, data: safeParse(e.data, {}) }));
  }
  // ─── Internal Operations ────────────────────────────────────────────

  /**
   * Check for dead nodes and failover their tasks
   */
  _healthCheck() {
    const staleNodes = stmts.getStaleNodes.all(this._heartbeatThresholdSec);

    for (const node of staleNodes) {
      stmts.setNodeStatus.run({ id: node.id, status: 'dead' });
      logEvent('node.dead', node.id, null, { lastHeartbeat: node.last_heartbeat });
      bus.emit('cluster.node.dead', { nodeId: node.id, name: node.name });

      // Failover: reassign all tasks from dead node
      const reassigned = this._distributor.reassignFromNode(node.id);
      logEvent('node.failover', node.id, null, { reassigned });
      bus.emit('cluster.node.failover', { nodeId: node.id, tasksReassigned: reassigned });
    }
  }

  /**
   * Recover tasks that have been assigned/running too long (stuck)
   */
  _recoverStuckTasks() {
    const stuckTasks = stmts.getStuckTasks.all(300); // 5 min stuck threshold

    for (const task of stuckTasks) {
      if (task.attempts >= task.max_attempts) {
        stmts.failTask.run({ id: task.id, error: 'Task stuck, max attempts reached' });
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        logEvent('task.stuck_failed', task.node_id, task.id, {});
      } else {
        if (task.node_id) stmts.decrementNodeLoad.run(task.node_id);
        stmts.requeueTask.run(task.id);
        this._distributor._tryAssign(task.id);
        logEvent('task.stuck_requeued', task.node_id, task.id, { attempt: task.attempts });
      }
    }
  }

  /**
   * Rebalance tasks across nodes when load is skewed
   */
  _rebalance() {
    const nodes = stmts.listActiveNodes.all();
    if (nodes.length < 2) return;

    const avgLoad = nodes.reduce((s, n) => s + (n.capacity_used / n.capacity_total), 0) / nodes.length;
    const overloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) > avgLoad * 1.5 && n.capacity_used > 2);
    const underloaded = nodes.filter(n => (n.capacity_used / n.capacity_total) < avgLoad * 0.5);

    if (overloaded.length === 0 || underloaded.length === 0) return;

    let moved = 0;
    for (const over of overloaded) {
      const tasks = stmts.getTasksByNode.all(over.id);
      // Move up to 2 tasks per overloaded node; only tasks that have not
      // started yet are safe to move. Note that assignTask also increments
      // `attempts`, so rebalance moves count toward max_attempts.
      const toMove = tasks.filter(t => t.status === 'assigned').slice(0, 2);

      for (const task of toMove) {
        const target = underloaded.find(n => n.capacity_used < n.capacity_total);
        if (!target) break;

        stmts.decrementNodeLoad.run(over.id);
        stmts.assignTask.run({ id: task.id, node_id: target.id });
        stmts.incrementNodeLoad.run(target.id);
        target.capacity_used++;
        moved++;

        logEvent('task.rebalanced', target.id, task.id, { from: over.id });
        this._distributor._notifyWorker(target, task.id, task);
      }
    }

    if (moved > 0) {
      bus.emit('cluster.rebalanced', { tasksMoved: moved });
    }
  }
}
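
// Lifecycle sketch (illustrative comment, not executed): a node joining,
// heartbeating, and being drained. All values are hypothetical; `capacity`
// is the number of concurrent task slots.
//
//   cluster.start();
//   const { nodeId, secret } = cluster.registerNode({
//     name: 'worker-eu-1',
//     endpoint: 'https://worker-eu-1.internal:8443',
//     region: 'eu-west',
//     capacity: 10,
//     tags: ['headless', 'gpu'],
//   });
//   setInterval(() => cluster.heartbeat(nodeId, { capacityUsed: 3 }), 30_000);
//   // Later, before maintenance:
//   cluster.drainNode(nodeId);      // stop new assignments
//   cluster.deregisterNode(nodeId); // reassign remaining tasks and remove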
// ═══════════════════════════════════════════════════════════════════════════
// HELPERS
// ═══════════════════════════════════════════════════════════════════════════

function safeParse(str, fallback) {
  if (str == null) return fallback;
  if (typeof str === 'object') return str;
  try { return JSON.parse(str); } catch { return fallback; }
}

function logEvent(type, nodeId, taskId, data) {
  try {
    stmts.insertEvent.run({
      event_type: type,
      node_id: nodeId || null,
      task_id: taskId || null,
      data: JSON.stringify(data || {}),
    });
  } catch { /* best-effort logging */ }
}
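
// safeParse is deliberately forgiving: columns may hold JSON text,
// already-parsed objects, or NULL. For example (illustrative):
//
//   safeParse('["gpu"]', [])   // => ['gpu']
//   safeParse(null, [])        // => []
//   safeParse('not json', {})  // => {}
//   safeParse({ a: 1 }, {})    // => { a: 1 } (passed through)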
// ─── Singleton ───────────────────────────────────────────────────────

const distributor = new TaskDistributor();
const cluster = new ClusterOrchestrator(distributor);

module.exports = { cluster, distributor, ClusterOrchestrator, TaskDistributor };